Initial commit: ULPgrammer

- Core Telegram monitoring pipeline (scraper, processor, notifier, downloaders)
- Textual TUI frontend with thread-safe event bus
- SQLite persistence, severity scoring, dedup cache
- Fixed ULP parser: handles https:// truncation, port+path URLs, semicolon separator
- Test suite: 88 tests across scorer, cache, database, processor
This commit is contained in:
2026-04-02 01:58:49 -03:00
commit 48f486ac97
41 changed files with 5270 additions and 0 deletions

410
core/scraper.py Normal file
View File

@@ -0,0 +1,410 @@
"""
scraper.py — Telethon user client.
Handles:
- Listening for new file messages in watched channels
- Listening for messages with inline download buttons (bot-dispatched files)
- Backfilling recent channel history on startup (batched via tdl)
- Downloading files safely (size guard, flood wait)
"""
import asyncio
import logging
import time
from pathlib import Path
from tqdm import tqdm
from telethon import TelegramClient, events
from telethon.errors import FloodWaitError, ChannelPrivateError, UsernameNotOccupiedError
from telethon.tl.types import (
MessageMediaDocument,
DocumentAttributeFilename,
InputDocumentFileLocation,
)
from config import (
ALLOWED_EXTENSIONS,
BACKFILL_LIMIT,
MAX_FILE_SIZE,
TEMP_DIR,
WATCHED_CHANNELS,
TDL_AMOUNT,
)
from core.bot_downloader import handle_bot_download_message, has_download_button, extract_password
from utils.cache import is_seen, mark_seen
from core.processor import process_file
from core.notifier import notify
from core.tdl_downloader import (
BatchEntry,
download_batch_with_tdl,
download_single_with_tdl,
is_tdl_available,
)
from tui import events as bus
log = logging.getLogger(__name__)
# ─── Helpers ──────────────────────────────────────────────────────────────────
def get_filename(msg) -> str | None:
"""Extract the filename from a document message, if any."""
if not isinstance(msg.media, MessageMediaDocument):
return None
doc = msg.media.document
for attr in doc.attributes:
if isinstance(attr, DocumentAttributeFilename):
return attr.file_name
mime = getattr(doc, "mime_type", "") or ""
ext_map = {
"application/x-rar-compressed": ".rar",
"application/vnd.rar": ".rar",
"application/zip": ".zip",
"application/x-7z-compressed": ".7z",
"text/plain": ".txt",
}
return f"{msg.id}{ext_map.get(mime, '.bin')}"
def get_filesize(msg) -> int:
"""Return document size in bytes, or 0 if not a document."""
if not isinstance(msg.media, MessageMediaDocument):
return 0
return msg.media.document.size or 0
def is_processable(filename: str, size: int) -> tuple[bool, str]:
"""Check whether a file should be downloaded. Returns (ok, reason)."""
suffix = Path(filename).suffix.lower()
if suffix not in ALLOWED_EXTENSIONS:
return False, f"extension {suffix!r} not in allowlist"
if size > MAX_FILE_SIZE:
mb = size / (1024 * 1024)
return False, f"too large ({mb:.1f} MB > {MAX_FILE_SIZE // (1024 * 1024)} MB limit)"
return True, ""
def _make_dest(msg, filename: str) -> Path:
"""Resolve the destination path, avoiding name collisions."""
TEMP_DIR.mkdir(exist_ok=True)
dest = TEMP_DIR / filename
if dest.exists():
dest = TEMP_DIR / f"{msg.id}_{filename}"
return dest
# ─── Telethon fallback download ───────────────────────────────────────────────
async def _telethon_download(client: TelegramClient, msg, dest: Path, filename: str, size: int, batch_id: str | None = None) -> bool:
"""Download a single file via Telethon. Returns True on success."""
_bid = batch_id or f"telethon_{int(time.monotonic_ns())}"
if batch_id is None:
# Standalone call (not already queued by tdl path) — post queued event
bus.post(bus.EvDownloadQueued(
batch_id=_bid, filename=filename,
size_mb=round(size / (1024 * 1024), 2),
source="telethon", password=None,
))
bus.post(bus.EvDownloadStarted(batch_id=_bid, filename=filename))
try:
with tqdm(
total=size,
unit="B",
unit_scale=True,
unit_divisor=1024,
desc=filename[:40],
colour="cyan",
) as pbar:
async def progress(current, total):
pbar.n = current
pbar.refresh()
doc = msg.media.document
location = InputDocumentFileLocation(
id=doc.id,
access_hash=doc.access_hash,
file_reference=doc.file_reference,
thumb_size="",
)
await client.download_file(
location,
file=dest,
part_size_kb=512,
progress_callback=progress,
)
bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon"))
return True
except FloodWaitError as e:
log.warning(f" Flood wait: sleeping {e.seconds}s...")
await asyncio.sleep(e.seconds)
await client.download_media(msg, file=dest)
bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon"))
return True
except Exception as e:
log.error(f" Telethon download failed for {filename}: {e}")
bus.post(bus.EvDownloadFailed(batch_id=_bid, filename=filename, reason=str(e)))
return False
# ─── Single-message pipeline (live handler + bot_downloader) ──────────────────
async def handle_message(
client: TelegramClient,
bot: TelegramClient,
msg,
source_name: str,
patterns,
password: str | None = None,
) -> None:
"""Download and process a single file message."""
filename = get_filename(msg)
if not filename:
log.warning(" handle_message: could not extract filename, skipping.")
return
size = get_filesize(msg)
ok, reason = is_processable(filename, size)
if not ok:
log.warning(f" handle_message: skipping '{filename}'{reason}")
return
doc_id = msg.media.document.id
if is_seen(doc_id):
log.info(f" Skipping {filename} — already processed.")
return
dest = _make_dest(msg, filename)
log.info(f"↓ Downloading: {filename} ({size / 1024:.1f} KB) from {source_name}")
# tdl single → Telethon fallback
downloaded = await download_single_with_tdl(msg, dest) if is_tdl_available() else False
if not downloaded:
if is_tdl_available():
log.warning(" [tdl] failed — falling back to Telethon")
downloaded = await _telethon_download(client, msg, dest, filename, size)
if not downloaded:
log.error(f" All download attempts failed for {filename}")
return
hits = process_file(dest, patterns, password=password)
mark_seen(doc_id)
if hits:
await notify(bot, hits, source_name, filename)
else:
log.info(f" No hits in {filename}")
# ─── Batch pipeline (backfill only) ───────────────────────────────────────────
async def _process_batch(
client: TelegramClient,
bot: TelegramClient,
batch: list[tuple], # list of (msg, source_name, password)
patterns,
) -> int:
"""
Download up to TDL_AMOUNT messages in one tdl invocation, then process
each. Falls back to Telethon per-file for anything tdl missed.
Returns the number of files successfully processed.
"""
if not batch:
return 0
# Build BatchEntry list
entries: list[BatchEntry] = []
for msg, source_name, password in batch:
filename = get_filename(msg)
if not filename:
continue
entries.append(BatchEntry(
msg=msg,
filename=filename,
dest=_make_dest(msg, filename),
doc_id=msg.media.document.id,
source_name=source_name,
password=password,
))
names = ", ".join(e.filename for e in entries)
log.info(f"[Batch] {len(entries)} file(s): {names}")
# One tdl call for the whole batch
results = await download_batch_with_tdl(entries)
processed = 0
for entry in entries:
tdl_ok = results.get(entry.doc_id, False)
if not tdl_ok:
# Per-file Telethon fallback
log.info(f" [Batch] Telethon fallback: {entry.filename}")
size = get_filesize(entry.msg)
tdl_ok = await _telethon_download(client, entry.msg, entry.dest, entry.filename, size)
if not tdl_ok:
log.error(f" [Batch] All attempts failed: {entry.filename}")
continue
hits = process_file(entry.dest, patterns, password=entry.password)
mark_seen(entry.doc_id)
if hits:
await notify(bot, hits, entry.source_name, entry.filename)
else:
log.info(f" No hits in {entry.filename}")
processed += 1
return processed
# ─── Backfill ─────────────────────────────────────────────────────────────────
async def backfill_channel(
client: TelegramClient,
bot: TelegramClient,
channel: str | int,
patterns,
limit: int,
) -> None:
"""Scan the last `limit` messages of a channel for file attachments."""
log.info(f"[Backfill] Scanning history: {channel} (last {limit} messages)")
total = 0
batch: list[tuple] = [] # (msg, source_name, password)
last_password: str | None = None # carry password across adjacent messages
async def flush_batch():
nonlocal total
if batch:
total += await _process_batch(client, bot, batch, patterns)
batch.clear()
try:
async for msg in client.iter_messages(channel, limit=limit):
source_name = str(channel)
# Extract password from this message if present, and remember it.
# iter_messages goes newest→oldest, so a password post that appears
# above the files in the channel will arrive AFTER them here.
# We therefore carry last_password in both directions:
# - apply it to file messages that have no inline password
# - update it whenever we see a fresh password, so subsequent
# (older) file messages in the same batch pick it up too.
msg_password = extract_password(msg)
if msg_password:
last_password = msg_password
password = msg_password or last_password
if msg.media and isinstance(msg.media, MessageMediaDocument):
filename = get_filename(msg)
size = get_filesize(msg)
if not filename:
continue
ok, reason = is_processable(filename, size)
if not ok:
log.warning(f" [Backfill] Skipping '{filename}'{reason}")
continue
if is_seen(msg.media.document.id):
log.info(f" [Backfill] Already seen: {filename}")
continue
if is_tdl_available():
batch.append((msg, source_name, password))
if len(batch) >= TDL_AMOUNT:
await flush_batch()
else:
# No tdl — fall straight through to single handle_message
await handle_message(client, bot, msg, source_name, patterns, password=password)
total += 1
await asyncio.sleep(0.5)
elif msg.buttons and has_download_button(msg):
# Bot-button messages can't be batched — handle individually
await flush_batch() # flush any pending batch first
await handle_bot_download_message(client, bot, msg, source_name, patterns, password=password)
total += 1
await asyncio.sleep(1.5)
# Flush whatever's left
await flush_batch()
except (ChannelPrivateError, UsernameNotOccupiedError) as e:
log.error(f"[Backfill] Cannot access {channel}: {e}")
except Exception as e:
log.error(f"[Backfill] Error scanning {channel}: {e}")
log.info(f"[Backfill] Done: {channel}{total} file(s) processed")
async def backfill_all(
client: TelegramClient,
bot: TelegramClient,
patterns,
) -> None:
"""Backfill all watched channels sequentially."""
if BACKFILL_LIMIT <= 0:
log.info("[Backfill] Disabled (BACKFILL_LIMIT=0)")
return
log.info(f"[Backfill] Starting for {len(WATCHED_CHANNELS)} channel(s)...")
for ch in WATCHED_CHANNELS:
await backfill_channel(client, bot, ch, patterns, BACKFILL_LIMIT)
log.info("[Backfill] Complete.")
# ─── Entity cache warmup ──────────────────────────────────────────────────────
async def warm_entity_cache(client: TelegramClient) -> None:
"""
Fetches your dialog list so Telethon caches all entity mappings.
Required before using raw numeric IDs.
"""
log.info("Warming entity cache (fetching dialogs)...")
async for _ in client.iter_dialogs():
pass
log.info("Entity cache ready.")
# ─── Live listener ────────────────────────────────────────────────────────────
def register_handlers(
client: TelegramClient,
bot: TelegramClient,
patterns,
) -> None:
"""Register the NewMessage event handler for all watched channels."""
# Per-channel password cache for the live handler.
# Channels often post a text message with the password separately from
# the file message. We remember the last seen password per channel so
# that the file message that follows (or precedes by seconds) picks it up.
_channel_passwords: dict[int, str] = {}
@client.on(events.NewMessage(chats=WATCHED_CHANNELS))
async def on_new_message(event):
msg = event.message
try:
source = event.chat.username or str(event.chat_id)
except Exception:
source = str(event.chat_id)
chat_id = event.chat_id
log.info(f"[Live] New message in {source}")
# Update cache if this message carries a password
msg_password = extract_password(msg)
if msg_password:
_channel_passwords[chat_id] = msg_password
log.debug(f"[Live] Password cached for {source}: '{msg_password}'")
password = msg_password or _channel_passwords.get(chat_id)
if msg.media and isinstance(msg.media, MessageMediaDocument):
await handle_message(client, bot, msg, source, patterns, password=password)
elif msg.buttons and has_download_button(msg):
await handle_bot_download_message(client, bot, msg, source, patterns, password=password)