- Core Telegram monitoring pipeline (scraper, processor, notifier, downloaders)
- Textual TUI frontend with thread-safe event bus
- SQLite persistence, severity scoring, dedup cache
- Fixed ULP parser: handles https:// truncation, port+path URLs, semicolon separator
- Test suite: 88 tests across scorer, cache, database, processor
411 lines
15 KiB
Python
411 lines
15 KiB
Python
"""
|
|
scraper.py — Telethon user client.
|
|
|
|
Handles:
|
|
- Listening for new file messages in watched channels
|
|
- Listening for messages with inline download buttons (bot-dispatched files)
|
|
- Backfilling recent channel history on startup (batched via tdl)
|
|
- Downloading files safely (size guard, flood wait)
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from tqdm import tqdm
|
|
from telethon import TelegramClient, events
|
|
from telethon.errors import FloodWaitError, ChannelPrivateError, UsernameNotOccupiedError
|
|
from telethon.tl.types import (
|
|
MessageMediaDocument,
|
|
DocumentAttributeFilename,
|
|
InputDocumentFileLocation,
|
|
)
|
|
|
|
from config import (
|
|
ALLOWED_EXTENSIONS,
|
|
BACKFILL_LIMIT,
|
|
MAX_FILE_SIZE,
|
|
TEMP_DIR,
|
|
WATCHED_CHANNELS,
|
|
TDL_AMOUNT,
|
|
)
|
|
from core.bot_downloader import handle_bot_download_message, has_download_button, extract_password
|
|
from utils.cache import is_seen, mark_seen
|
|
from core.processor import process_file
|
|
from core.notifier import notify
|
|
from core.tdl_downloader import (
|
|
BatchEntry,
|
|
download_batch_with_tdl,
|
|
download_single_with_tdl,
|
|
is_tdl_available,
|
|
)
|
|
from tui import events as bus
|
|
|
|
# Module-level logger, named after this module; used by every function below.
log = logging.getLogger(__name__)
|
|
|
|
|
|
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
|
|
|
def get_filename(msg) -> str | None:
    """Return a filename for a document message.

    Returns ``None`` when ``msg.media`` is not a ``MessageMediaDocument``.
    Otherwise returns the ``file_name`` of the first
    ``DocumentAttributeFilename`` attribute; if the document has no such
    attribute, a name is synthesised from the message id plus an extension
    looked up from the document's MIME type (``.bin`` when unknown).
    """
    if not isinstance(msg.media, MessageMediaDocument):
        return None

    document = msg.media.document

    # First filename attribute wins, exactly as a linear scan would.
    name_attr = next(
        (a for a in document.attributes if isinstance(a, DocumentAttributeFilename)),
        None,
    )
    if name_attr is not None:
        return name_attr.file_name

    # No explicit name: derive an extension from the MIME type.
    extension_by_mime = {
        "application/x-rar-compressed": ".rar",
        "application/vnd.rar": ".rar",
        "application/zip": ".zip",
        "application/x-7z-compressed": ".7z",
        "text/plain": ".txt",
    }
    mime_type = getattr(document, "mime_type", "") or ""
    extension = extension_by_mime.get(mime_type, ".bin")
    return f"{msg.id}{extension}"
|
|
|
|
|
|
def get_filesize(msg) -> int:
    """Return the attached document's size in bytes.

    Yields ``0`` when the message carries no document or when the document
    reports no size.
    """
    if isinstance(msg.media, MessageMediaDocument):
        return msg.media.document.size or 0
    return 0
|
|
|
|
|
|
def is_processable(filename: str, size: int) -> tuple[bool, str]:
    """Decide whether a file is worth downloading.

    Args:
        filename: Name of the file; only its (lower-cased) suffix is checked.
        size: File size in bytes.

    Returns:
        ``(True, "")`` when the suffix is in ``ALLOWED_EXTENSIONS`` and the
        size does not exceed ``MAX_FILE_SIZE``; otherwise ``(False, reason)``
        with a human-readable explanation.
    """
    suffix = Path(filename).suffix.lower()
    if suffix not in ALLOWED_EXTENSIONS:
        return False, f"extension {suffix!r} not in allowlist"

    if size <= MAX_FILE_SIZE:
        return True, ""

    # Over the limit: report both figures in megabytes.
    mb = size / (1024 * 1024)
    return False, f"too large ({mb:.1f} MB > {MAX_FILE_SIZE // (1024 * 1024)} MB limit)"
|
|
|
|
|
|
def _make_dest(msg, filename: str) -> Path:
    """Resolve the download destination inside ``TEMP_DIR``.

    The filename originates from the remote message, so it is treated as
    untrusted: only its final path component is used. This prevents names
    such as ``../../x`` or ``/abs/path`` from resolving outside ``TEMP_DIR``
    (joining a ``Path`` with an absolute path discards the left-hand side).

    Args:
        msg: The message the file belongs to; ``msg.id`` is used to
            disambiguate names and as a fallback name.
        filename: Sender-supplied (or synthesised) file name.

    Returns:
        ``TEMP_DIR / <name>``, or ``TEMP_DIR / "<msg.id>_<name>"`` when a file
        with the plain name already exists on disk.
    """
    # parents=True so a nested TEMP_DIR does not fail on first run.
    TEMP_DIR.mkdir(parents=True, exist_ok=True)

    # Path.name keeps ".." as a component name, so reject it explicitly.
    safe_name = Path(filename).name
    if safe_name in ("", ".", ".."):
        safe_name = str(msg.id)

    dest = TEMP_DIR / safe_name
    if dest.exists():
        dest = TEMP_DIR / f"{msg.id}_{safe_name}"
    return dest
|
|
|
|
|
|
# ─── Telethon fallback download ───────────────────────────────────────────────
|
|
|
|
async def _telethon_download(client: TelegramClient, msg, dest: Path, filename: str, size: int, batch_id: str | None = None) -> bool:
    """Download a single file via Telethon. Returns True on success.

    Posts download lifecycle events on the TUI event bus: a "queued" event
    (only when no ``batch_id`` was supplied by the caller), then "started",
    and finally either "done" or "failed".

    Args:
        client: Telethon user client used for the download.
        msg: Message whose ``media.document`` is downloaded.
        dest: Local path the file is written to.
        filename: Display name used in events, logs and the progress bar.
        size: Expected size in bytes (progress bar total / queued event).
        batch_id: Identifier of an already-queued download; when ``None`` a
            fresh identifier is generated and a queued event is posted.

    Returns:
        ``True`` if the file was downloaded, ``False`` on any failure.
        The function never raises for ordinary download errors, including a
        failure of the retry performed after a flood wait.
    """
    _bid = batch_id or f"telethon_{time.monotonic_ns()}"
    if batch_id is None:
        # Standalone call (not already queued by tdl path) — post queued event
        bus.post(bus.EvDownloadQueued(
            batch_id=_bid, filename=filename,
            size_mb=round(size / (1024 * 1024), 2),
            source="telethon", password=None,
        ))
    bus.post(bus.EvDownloadStarted(batch_id=_bid, filename=filename))

    # The outer try turns *any* failure into a "failed" event + False.
    # The flood-wait retry lives in an inner try so that an exception raised
    # by the retry itself is still caught by the outer handler (an exception
    # raised inside an ``except`` clause is not handled by sibling clauses).
    try:
        try:
            with tqdm(
                total=size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                desc=filename[:40],
                colour="cyan",
            ) as pbar:
                async def progress(current, total):
                    # Telethon reports absolute byte counts; mirror them.
                    pbar.n = current
                    pbar.refresh()

                doc = msg.media.document
                location = InputDocumentFileLocation(
                    id=doc.id,
                    access_hash=doc.access_hash,
                    file_reference=doc.file_reference,
                    thumb_size="",
                )
                await client.download_file(
                    location,
                    file=dest,
                    part_size_kb=512,
                    progress_callback=progress,
                )
        except FloodWaitError as e:
            # Rate limited: wait the requested time, then retry once.
            log.warning(f" Flood wait: sleeping {e.seconds}s...")
            await asyncio.sleep(e.seconds)
            await client.download_media(msg, file=dest)

        bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon"))
        return True
    except Exception as e:
        log.error(f" Telethon download failed for {filename}: {e}")
        bus.post(bus.EvDownloadFailed(batch_id=_bid, filename=filename, reason=str(e)))
        return False
|
|
|
|
|
|
# ─── Single-message pipeline (live handler + bot_downloader) ──────────────────
|
|
|
|
async def handle_message(
    client: TelegramClient,
    bot: TelegramClient,
    msg,
    source_name: str,
    patterns,
    password: str | None = None,
) -> None:
    """Run the full single-file pipeline for one message.

    Steps: derive the filename, apply the extension/size filter, skip
    documents already marked as seen, download (tdl first when available,
    Telethon otherwise or as fallback), scan the file with ``process_file``,
    mark the document as seen and send a notification if there were hits.

    Args:
        client: Telethon user client used for the fallback download.
        bot: Client handed to ``notify`` for sending hit notifications.
        msg: The message carrying the document.
        source_name: Label of the originating channel, used in logs/alerts.
        patterns: Passed through to ``process_file``.
        password: Optional password passed through to ``process_file``.
    """
    filename = get_filename(msg)
    if not filename:
        log.warning(" handle_message: could not extract filename, skipping.")
        return

    size = get_filesize(msg)
    accepted, reason = is_processable(filename, size)
    if not accepted:
        log.warning(f" handle_message: skipping '{filename}' — {reason}")
        return

    doc_id = msg.media.document.id
    if is_seen(doc_id):
        log.info(f" Skipping {filename} — already processed.")
        return

    dest = _make_dest(msg, filename)
    log.info(f"↓ Downloading: {filename} ({size / 1024:.1f} KB) from {source_name}")

    # tdl single → Telethon fallback
    if is_tdl_available():
        fetched = await download_single_with_tdl(msg, dest)
    else:
        fetched = False

    if not fetched:
        if is_tdl_available():
            log.warning(" [tdl] failed — falling back to Telethon")
        fetched = await _telethon_download(client, msg, dest, filename, size)

    if not fetched:
        log.error(f" All download attempts failed for {filename}")
        return

    hits = process_file(dest, patterns, password=password)
    mark_seen(doc_id)

    if not hits:
        log.info(f" No hits in {filename}")
        return
    await notify(bot, hits, source_name, filename)
|
|
|
|
|
|
# ─── Batch pipeline (backfill only) ───────────────────────────────────────────
|
|
|
|
async def _process_batch(
    client: TelegramClient,
    bot: TelegramClient,
    batch: list[tuple],  # list of (msg, source_name, password)
    patterns,
) -> int:
    """
    Download a batch of messages in one tdl invocation, then process each.

    Falls back to a per-file Telethon download for anything tdl did not
    report as successful.

    Args:
        client: Telethon user client used for fallback downloads.
        bot: Client handed to ``notify`` for hit notifications.
        batch: ``(msg, source_name, password)`` tuples to download.
        patterns: Passed through to ``process_file``.

    Returns:
        The number of files successfully downloaded and processed.
    """
    if not batch:
        return 0

    # Build BatchEntry list
    entries: list[BatchEntry] = []
    # Destinations already handed out in THIS batch. _make_dest only checks
    # the disk, and nothing has been downloaded yet, so two messages with the
    # same filename would otherwise share one path and overwrite each other.
    claimed: set[Path] = set()
    for msg, source_name, password in batch:
        filename = get_filename(msg)
        if not filename:
            continue
        dest = _make_dest(msg, filename)
        if dest in claimed:
            dest = dest.with_name(f"{msg.id}_{dest.name}")
        claimed.add(dest)
        entries.append(BatchEntry(
            msg=msg,
            filename=filename,
            dest=dest,
            doc_id=msg.media.document.id,
            source_name=source_name,
            password=password,
        ))

    if not entries:
        # Every message lacked a usable filename — nothing to hand to tdl.
        return 0

    names = ", ".join(e.filename for e in entries)
    log.info(f"[Batch] {len(entries)} file(s): {names}")

    # One tdl call for the whole batch
    results = await download_batch_with_tdl(entries)

    processed = 0
    for entry in entries:
        downloaded = results.get(entry.doc_id, False)

        if not downloaded:
            # Per-file Telethon fallback
            log.info(f" [Batch] Telethon fallback: {entry.filename}")
            size = get_filesize(entry.msg)
            downloaded = await _telethon_download(client, entry.msg, entry.dest, entry.filename, size)

        if not downloaded:
            log.error(f" [Batch] All attempts failed: {entry.filename}")
            continue

        hits = process_file(entry.dest, patterns, password=entry.password)
        mark_seen(entry.doc_id)

        if hits:
            await notify(bot, hits, entry.source_name, entry.filename)
        else:
            log.info(f" No hits in {entry.filename}")

        processed += 1

    return processed
|
|
|
|
|
|
# ─── Backfill ─────────────────────────────────────────────────────────────────
|
|
|
|
async def backfill_channel(
    client: TelegramClient,
    bot: TelegramClient,
    channel: str | int,
    patterns,
    limit: int,
) -> None:
    """Scan the last `limit` messages of a channel for file attachments.

    Document messages that pass the extension/size filter and have not been
    seen are batched for tdl (flushed every ``TDL_AMOUNT`` entries) or, when
    tdl is unavailable, handled one by one. Messages with a download button
    are handled individually after flushing any pending batch.

    Access errors and unexpected errors are logged, not raised.

    Args:
        client: Telethon user client used to read history and download.
        bot: Client handed on for hit notifications.
        channel: Channel username or numeric id.
        patterns: Passed through to the processing functions.
        limit: Maximum number of recent messages to scan.
    """
    log.info(f"[Backfill] Scanning history: {channel} (last {limit} messages)")
    total = 0
    batch: list[tuple] = []  # (msg, source_name, password)
    last_password: str | None = None  # carry password across adjacent messages
    source_name = str(channel)  # loop-invariant label for logs/notifications

    async def flush_batch():
        # Process and empty the pending batch, adding to the running total.
        nonlocal total
        if batch:
            total += await _process_batch(client, bot, batch, patterns)
            batch.clear()

    try:
        async for msg in client.iter_messages(channel, limit=limit):
            # iter_messages goes newest→oldest, so a password post that sits
            # ABOVE its files in the channel arrives here AFTER those files.
            # Passwords are therefore propagated in both directions:
            #   - forward: remembered in last_password and applied to later
            #     (older) file messages that carry no password of their own;
            #   - backward: written into file messages still waiting in the
            #     pending batch that were queued without any password.
            # Files already flushed/processed cannot be updated retroactively.
            msg_password = extract_password(msg)
            if msg_password:
                last_password = msg_password
                batch[:] = [(m, s, p or msg_password) for m, s, p in batch]

            password = msg_password or last_password

            if msg.media and isinstance(msg.media, MessageMediaDocument):
                filename = get_filename(msg)
                size = get_filesize(msg)

                if not filename:
                    continue

                ok, reason = is_processable(filename, size)
                if not ok:
                    log.warning(f" [Backfill] Skipping '{filename}' — {reason}")
                    continue

                if is_seen(msg.media.document.id):
                    log.info(f" [Backfill] Already seen: {filename}")
                    continue

                if is_tdl_available():
                    batch.append((msg, source_name, password))
                    if len(batch) >= TDL_AMOUNT:
                        await flush_batch()
                else:
                    # No tdl — fall straight through to single handle_message
                    await handle_message(client, bot, msg, source_name, patterns, password=password)
                    total += 1
                    await asyncio.sleep(0.5)  # pause between individual downloads

            elif msg.buttons and has_download_button(msg):
                # Bot-button messages can't be batched — handle individually
                await flush_batch()  # flush any pending batch first
                await handle_bot_download_message(client, bot, msg, source_name, patterns, password=password)
                total += 1
                await asyncio.sleep(1.5)  # pause between bot interactions

        # Flush whatever's left
        await flush_batch()

    except (ChannelPrivateError, UsernameNotOccupiedError) as e:
        log.error(f"[Backfill] Cannot access {channel}: {e}")
    except Exception as e:
        # Best-effort: keep backfilling other channels, but keep the traceback.
        log.exception(f"[Backfill] Error scanning {channel}: {e}")

    log.info(f"[Backfill] Done: {channel} — {total} file(s) processed")
|
|
|
|
|
|
async def backfill_all(
    client: TelegramClient,
    bot: TelegramClient,
    patterns,
) -> None:
    """Backfill every channel in ``WATCHED_CHANNELS``, one after another.

    Does nothing (apart from logging) when ``BACKFILL_LIMIT`` is zero or
    negative. Each channel is scanned with ``BACKFILL_LIMIT`` as its limit.
    """
    backfill_enabled = BACKFILL_LIMIT > 0
    if not backfill_enabled:
        log.info("[Backfill] Disabled (BACKFILL_LIMIT=0)")
        return

    channel_count = len(WATCHED_CHANNELS)
    log.info(f"[Backfill] Starting for {channel_count} channel(s)...")

    # Sequential on purpose: each channel finishes before the next starts.
    for watched in WATCHED_CHANNELS:
        await backfill_channel(client, bot, watched, patterns, BACKFILL_LIMIT)

    log.info("[Backfill] Complete.")
|
|
|
|
|
|
# ─── Entity cache warmup ──────────────────────────────────────────────────────
|
|
|
|
async def warm_entity_cache(client: TelegramClient) -> None:
    """
    Iterate the account's full dialog list and discard the results.

    The iteration is done only for its effect of letting Telethon cache the
    entity mappings, which is required before using raw numeric IDs.
    """
    log.info("Warming entity cache (fetching dialogs)...")
    dialogs = client.iter_dialogs()
    async for _dialog in dialogs:
        # Nothing to do per dialog; exhausting the iterator is the point.
        continue
    log.info("Entity cache ready.")
|
|
|
|
|
|
# ─── Live listener ────────────────────────────────────────────────────────────
|
|
|
|
def register_handlers(
    client: TelegramClient,
    bot: TelegramClient,
    patterns,
) -> None:
    """Attach the live ``NewMessage`` handler for all watched channels.

    Args:
        client: Telethon user client the handler is registered on.
        bot: Client handed on for hit notifications.
        patterns: Passed through to the processing functions.
    """

    # Last password seen per chat id. Channels often post the password as a
    # separate text message, so the most recent one is remembered and applied
    # to file messages in the same chat that carry none of their own.
    password_by_chat: dict[int, str] = {}

    @client.on(events.NewMessage(chats=WATCHED_CHANNELS))
    async def on_new_message(event):
        msg = event.message

        # Prefer the channel username as label; fall back to the numeric id
        # whenever the chat object or its username cannot be read.
        try:
            source = event.chat.username or str(event.chat_id)
        except Exception:
            source = str(event.chat_id)

        chat_id = event.chat_id
        log.info(f"[Live] New message in {source}")

        msg_password = extract_password(msg)
        if msg_password:
            # Fresh password: remember it for this chat and use it right away.
            password_by_chat[chat_id] = msg_password
            log.debug(f"[Live] Password cached for {source}: '{msg_password}'")
            password = msg_password
        else:
            password = password_by_chat.get(chat_id)

        is_document = msg.media and isinstance(msg.media, MessageMediaDocument)
        if is_document:
            await handle_message(client, bot, msg, source, patterns, password=password)
        elif msg.buttons and has_download_button(msg):
            await handle_bot_download_message(client, bot, msg, source, patterns, password=password)
|