""" scraper.py - Telethon user client. Handles: - Listening for new file messages in watched channels - Listening for messages with inline download buttons (bot-dispatched files) - Backfilling recent channel history on startup (batched via tdl) - Downloading files safely (size guard, flood wait) """ import asyncio import logging import time from pathlib import Path from tqdm import tqdm from telethon import TelegramClient, events from telethon.errors import FloodWaitError, ChannelPrivateError, UsernameNotOccupiedError from telethon.tl.types import ( MessageMediaDocument, DocumentAttributeFilename, InputDocumentFileLocation, ) from config import ( ALLOWED_EXTENSIONS, BACKFILL_LIMIT, MAX_FILE_SIZE, TEMP_DIR, WATCHED_CHANNELS, TDL_AMOUNT, ) from core.bot_downloader import handle_bot_download_message, has_download_button, extract_password from utils.cache import is_seen, mark_seen from core.processor import process_file from core.notifier import notify from core.tdl_downloader import ( BatchEntry, download_batch_with_tdl, download_single_with_tdl, is_tdl_available, ) from tui import events as bus log = logging.getLogger(__name__) # ─── Helpers ────────────────────────────────────────────────────────────────── def get_filename(msg) -> str | None: """Extract the filename from a document message, if any.""" if not isinstance(msg.media, MessageMediaDocument): return None doc = msg.media.document for attr in doc.attributes: if isinstance(attr, DocumentAttributeFilename): return attr.file_name mime = getattr(doc, "mime_type", "") or "" ext_map = { "application/x-rar-compressed": ".rar", "application/vnd.rar": ".rar", "application/zip": ".zip", "application/x-7z-compressed": ".7z", "text/plain": ".txt", } return f"{msg.id}{ext_map.get(mime, '.bin')}" def get_filesize(msg) -> int: """Return document size in bytes, or 0 if not a document.""" if not isinstance(msg.media, MessageMediaDocument): return 0 return msg.media.document.size or 0 def is_processable(filename: str, size: int) -> tuple[bool, str]: """Check whether a file should be downloaded. Returns (ok, reason).""" suffix = Path(filename).suffix.lower() if suffix not in ALLOWED_EXTENSIONS: return False, f"extension {suffix!r} not in allowlist" if size > MAX_FILE_SIZE: mb = size / (1024 * 1024) return False, f"too large ({mb:.1f} MB > {MAX_FILE_SIZE // (1024 * 1024)} MB limit)" return True, "" def _make_dest(msg, filename: str) -> Path: """Resolve the destination path, avoiding name collisions.""" TEMP_DIR.mkdir(exist_ok=True) dest = TEMP_DIR / filename if dest.exists(): dest = TEMP_DIR / f"{msg.id}_{filename}" return dest # ─── Telethon fallback download ─────────────────────────────────────────────── async def _telethon_download(client: TelegramClient, msg, dest: Path, filename: str, size: int, batch_id: str | None = None) -> bool: """Download a single file via Telethon. Returns True on success.""" _bid = batch_id or f"telethon_{int(time.monotonic_ns())}" if batch_id is None: # Standalone call (not already queued by tdl path) - post queued event bus.post(bus.EvDownloadQueued( batch_id=_bid, filename=filename, size_mb=round(size / (1024 * 1024), 2), source="telethon", password=None, )) bus.post(bus.EvDownloadStarted(batch_id=_bid, filename=filename)) try: with tqdm( total=size, unit="B", unit_scale=True, unit_divisor=1024, desc=filename[:40], colour="cyan", ) as pbar: async def progress(current, total): pbar.n = current pbar.refresh() doc = msg.media.document location = InputDocumentFileLocation( id=doc.id, access_hash=doc.access_hash, file_reference=doc.file_reference, thumb_size="", ) await client.download_file( location, file=dest, part_size_kb=512, progress_callback=progress, ) bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon")) return True except FloodWaitError as e: log.warning(f" Flood wait: sleeping {e.seconds}s...") await asyncio.sleep(e.seconds) await client.download_media(msg, file=dest) bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon")) return True except Exception as e: log.error(f" Telethon download failed for {filename}: {e}") bus.post(bus.EvDownloadFailed(batch_id=_bid, filename=filename, reason=str(e))) return False # ─── Single-message pipeline (live handler + bot_downloader) ────────────────── async def handle_message( client: TelegramClient, bot: TelegramClient, msg, source_name: str, patterns, password: str | None = None, ) -> None: """Download and process a single file message.""" filename = get_filename(msg) if not filename: log.warning(" handle_message: could not extract filename, skipping.") return size = get_filesize(msg) ok, reason = is_processable(filename, size) if not ok: log.warning(f" handle_message: skipping '{filename}' - {reason}") return doc_id = msg.media.document.id if is_seen(doc_id): log.info(f" Skipping {filename} - already processed.") return dest = _make_dest(msg, filename) log.info(f"↓ Downloading: {filename} ({size / 1024:.1f} KB) from {source_name}") # tdl single → Telethon fallback downloaded = await download_single_with_tdl(msg, dest) if is_tdl_available() else False if not downloaded: if is_tdl_available(): log.warning(" [tdl] failed - falling back to Telethon") downloaded = await _telethon_download(client, msg, dest, filename, size) if not downloaded: log.error(f" All download attempts failed for {filename}") return hits = process_file(dest, patterns, password=password) mark_seen(doc_id) if hits: await notify(bot, hits, source_name, filename) else: log.info(f" No hits in {filename}") # ─── Batch pipeline (backfill only) ─────────────────────────────────────────── async def _process_batch( client: TelegramClient, bot: TelegramClient, batch: list[tuple], # list of (msg, source_name, password) patterns, ) -> int: """ Download up to TDL_AMOUNT messages in one tdl invocation, then process each. Falls back to Telethon per-file for anything tdl missed. Returns the number of files successfully processed. """ if not batch: return 0 # Build BatchEntry list entries: list[BatchEntry] = [] for msg, source_name, password in batch: filename = get_filename(msg) if not filename: continue entries.append(BatchEntry( msg=msg, filename=filename, dest=_make_dest(msg, filename), doc_id=msg.media.document.id, source_name=source_name, password=password, )) names = ", ".join(e.filename for e in entries) log.info(f"[Batch] {len(entries)} file(s): {names}") # One tdl call for the whole batch results = await download_batch_with_tdl(entries) processed = 0 for entry in entries: tdl_ok = results.get(entry.doc_id, False) if not tdl_ok: # Per-file Telethon fallback log.info(f" [Batch] Telethon fallback: {entry.filename}") size = get_filesize(entry.msg) tdl_ok = await _telethon_download(client, entry.msg, entry.dest, entry.filename, size) if not tdl_ok: log.error(f" [Batch] All attempts failed: {entry.filename}") continue hits = process_file(entry.dest, patterns, password=entry.password) mark_seen(entry.doc_id) if hits: await notify(bot, hits, entry.source_name, entry.filename) else: log.info(f" No hits in {entry.filename}") processed += 1 return processed # ─── Backfill ───────────────────────────────────────────────────────────────── async def backfill_channel( client: TelegramClient, bot: TelegramClient, channel: str | int, patterns, limit: int, ) -> None: """Scan the last `limit` messages of a channel for file attachments.""" log.info(f"[Backfill] Scanning history: {channel} (last {limit} messages)") total = 0 batch: list[tuple] = [] # (msg, source_name, password) last_password: str | None = None # carry password across adjacent messages async def flush_batch(): nonlocal total if batch: total += await _process_batch(client, bot, batch, patterns) batch.clear() try: async for msg in client.iter_messages(channel, limit=limit): source_name = str(channel) # Extract password from this message if present, and remember it. # iter_messages goes newest→oldest, so a password post that appears # above the files in the channel will arrive AFTER them here. # We therefore carry last_password in both directions: # - apply it to file messages that have no inline password # - update it whenever we see a fresh password, so subsequent # (older) file messages in the same batch pick it up too. msg_password = extract_password(msg) if msg_password: last_password = msg_password password = msg_password or last_password if msg.media and isinstance(msg.media, MessageMediaDocument): filename = get_filename(msg) size = get_filesize(msg) if not filename: continue ok, reason = is_processable(filename, size) if not ok: log.warning(f" [Backfill] Skipping '{filename}' - {reason}") continue if is_seen(msg.media.document.id): log.info(f" [Backfill] Already seen: {filename}") continue if is_tdl_available(): batch.append((msg, source_name, password)) if len(batch) >= TDL_AMOUNT: await flush_batch() else: # No tdl - fall straight through to single handle_message await handle_message(client, bot, msg, source_name, patterns, password=password) total += 1 await asyncio.sleep(0.5) elif msg.buttons and has_download_button(msg): # Bot-button messages can't be batched - handle individually await flush_batch() # flush any pending batch first await handle_bot_download_message(client, bot, msg, source_name, patterns, password=password) total += 1 await asyncio.sleep(1.5) # Flush whatever's left await flush_batch() except (ChannelPrivateError, UsernameNotOccupiedError) as e: log.error(f"[Backfill] Cannot access {channel}: {e}") except Exception as e: log.error(f"[Backfill] Error scanning {channel}: {e}") log.info(f"[Backfill] Done: {channel} - {total} file(s) processed") async def backfill_all( client: TelegramClient, bot: TelegramClient, patterns, ) -> None: """Backfill all watched channels sequentially.""" if BACKFILL_LIMIT <= 0: log.info("[Backfill] Disabled (BACKFILL_LIMIT=0)") return log.info(f"[Backfill] Starting for {len(WATCHED_CHANNELS)} channel(s)...") for ch in WATCHED_CHANNELS: await backfill_channel(client, bot, ch, patterns, BACKFILL_LIMIT) log.info("[Backfill] Complete.") # ─── Entity cache warmup ────────────────────────────────────────────────────── async def warm_entity_cache(client: TelegramClient) -> None: """ Fetches your dialog list so Telethon caches all entity mappings. Required before using raw numeric IDs. """ log.info("Warming entity cache (fetching dialogs)...") async for _ in client.iter_dialogs(): pass log.info("Entity cache ready.") # ─── Live listener ──────────────────────────────────────────────────────────── def register_handlers( client: TelegramClient, bot: TelegramClient, patterns, ) -> None: """Register the NewMessage event handler for all watched channels.""" # Per-channel password cache for the live handler. # Channels often post a text message with the password separately from # the file message. We remember the last seen password per channel so # that the file message that follows (or precedes by seconds) picks it up. _channel_passwords: dict[int, str] = {} @client.on(events.NewMessage(chats=WATCHED_CHANNELS)) async def on_new_message(event): msg = event.message try: source = event.chat.username or str(event.chat_id) except Exception: source = str(event.chat_id) chat_id = event.chat_id log.info(f"[Live] New message in {source}") # Update cache if this message carries a password msg_password = extract_password(msg) if msg_password: _channel_passwords[chat_id] = msg_password log.debug(f"[Live] Password cached for {source}: '{msg_password}'") password = msg_password or _channel_passwords.get(chat_id) if msg.media and isinstance(msg.media, MessageMediaDocument): await handle_message(client, bot, msg, source, patterns, password=password) elif msg.buttons and has_download_button(msg): await handle_bot_download_message(client, bot, msg, source, patterns, password=password)