Initial commit: ULPgrammer
- Core Telegram monitoring pipeline (scraper, processor, notifier, downloaders) - Textual TUI frontend with thread-safe event bus - SQLite persistence, severity scoring, dedup cache - Fixed ULP parser: handles https:// truncation, port+path URLs, semicolon separator - Test suite: 88 tests across scorer, cache, database, processor
This commit is contained in:
410
core/scraper.py
Normal file
410
core/scraper.py
Normal file
@@ -0,0 +1,410 @@
|
||||
"""
|
||||
scraper.py — Telethon user client.
|
||||
|
||||
Handles:
|
||||
- Listening for new file messages in watched channels
|
||||
- Listening for messages with inline download buttons (bot-dispatched files)
|
||||
- Backfilling recent channel history on startup (batched via tdl)
|
||||
- Downloading files safely (size guard, flood wait)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from tqdm import tqdm
|
||||
from telethon import TelegramClient, events
|
||||
from telethon.errors import FloodWaitError, ChannelPrivateError, UsernameNotOccupiedError
|
||||
from telethon.tl.types import (
|
||||
MessageMediaDocument,
|
||||
DocumentAttributeFilename,
|
||||
InputDocumentFileLocation,
|
||||
)
|
||||
|
||||
from config import (
|
||||
ALLOWED_EXTENSIONS,
|
||||
BACKFILL_LIMIT,
|
||||
MAX_FILE_SIZE,
|
||||
TEMP_DIR,
|
||||
WATCHED_CHANNELS,
|
||||
TDL_AMOUNT,
|
||||
)
|
||||
from core.bot_downloader import handle_bot_download_message, has_download_button, extract_password
|
||||
from utils.cache import is_seen, mark_seen
|
||||
from core.processor import process_file
|
||||
from core.notifier import notify
|
||||
from core.tdl_downloader import (
|
||||
BatchEntry,
|
||||
download_batch_with_tdl,
|
||||
download_single_with_tdl,
|
||||
is_tdl_available,
|
||||
)
|
||||
from tui import events as bus
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_filename(msg) -> str | None:
|
||||
"""Extract the filename from a document message, if any."""
|
||||
if not isinstance(msg.media, MessageMediaDocument):
|
||||
return None
|
||||
doc = msg.media.document
|
||||
for attr in doc.attributes:
|
||||
if isinstance(attr, DocumentAttributeFilename):
|
||||
return attr.file_name
|
||||
mime = getattr(doc, "mime_type", "") or ""
|
||||
ext_map = {
|
||||
"application/x-rar-compressed": ".rar",
|
||||
"application/vnd.rar": ".rar",
|
||||
"application/zip": ".zip",
|
||||
"application/x-7z-compressed": ".7z",
|
||||
"text/plain": ".txt",
|
||||
}
|
||||
return f"{msg.id}{ext_map.get(mime, '.bin')}"
|
||||
|
||||
|
||||
def get_filesize(msg) -> int:
|
||||
"""Return document size in bytes, or 0 if not a document."""
|
||||
if not isinstance(msg.media, MessageMediaDocument):
|
||||
return 0
|
||||
return msg.media.document.size or 0
|
||||
|
||||
|
||||
def is_processable(filename: str, size: int) -> tuple[bool, str]:
|
||||
"""Check whether a file should be downloaded. Returns (ok, reason)."""
|
||||
suffix = Path(filename).suffix.lower()
|
||||
if suffix not in ALLOWED_EXTENSIONS:
|
||||
return False, f"extension {suffix!r} not in allowlist"
|
||||
if size > MAX_FILE_SIZE:
|
||||
mb = size / (1024 * 1024)
|
||||
return False, f"too large ({mb:.1f} MB > {MAX_FILE_SIZE // (1024 * 1024)} MB limit)"
|
||||
return True, ""
|
||||
|
||||
|
||||
def _make_dest(msg, filename: str) -> Path:
|
||||
"""Resolve the destination path, avoiding name collisions."""
|
||||
TEMP_DIR.mkdir(exist_ok=True)
|
||||
dest = TEMP_DIR / filename
|
||||
if dest.exists():
|
||||
dest = TEMP_DIR / f"{msg.id}_{filename}"
|
||||
return dest
|
||||
|
||||
|
||||
# ─── Telethon fallback download ───────────────────────────────────────────────
|
||||
|
||||
async def _telethon_download(client: TelegramClient, msg, dest: Path, filename: str, size: int, batch_id: str | None = None) -> bool:
|
||||
"""Download a single file via Telethon. Returns True on success."""
|
||||
_bid = batch_id or f"telethon_{int(time.monotonic_ns())}"
|
||||
if batch_id is None:
|
||||
# Standalone call (not already queued by tdl path) — post queued event
|
||||
bus.post(bus.EvDownloadQueued(
|
||||
batch_id=_bid, filename=filename,
|
||||
size_mb=round(size / (1024 * 1024), 2),
|
||||
source="telethon", password=None,
|
||||
))
|
||||
bus.post(bus.EvDownloadStarted(batch_id=_bid, filename=filename))
|
||||
try:
|
||||
with tqdm(
|
||||
total=size,
|
||||
unit="B",
|
||||
unit_scale=True,
|
||||
unit_divisor=1024,
|
||||
desc=filename[:40],
|
||||
colour="cyan",
|
||||
) as pbar:
|
||||
async def progress(current, total):
|
||||
pbar.n = current
|
||||
pbar.refresh()
|
||||
|
||||
doc = msg.media.document
|
||||
location = InputDocumentFileLocation(
|
||||
id=doc.id,
|
||||
access_hash=doc.access_hash,
|
||||
file_reference=doc.file_reference,
|
||||
thumb_size="",
|
||||
)
|
||||
await client.download_file(
|
||||
location,
|
||||
file=dest,
|
||||
part_size_kb=512,
|
||||
progress_callback=progress,
|
||||
)
|
||||
bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon"))
|
||||
return True
|
||||
except FloodWaitError as e:
|
||||
log.warning(f" Flood wait: sleeping {e.seconds}s...")
|
||||
await asyncio.sleep(e.seconds)
|
||||
await client.download_media(msg, file=dest)
|
||||
bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon"))
|
||||
return True
|
||||
except Exception as e:
|
||||
log.error(f" Telethon download failed for {filename}: {e}")
|
||||
bus.post(bus.EvDownloadFailed(batch_id=_bid, filename=filename, reason=str(e)))
|
||||
return False
|
||||
|
||||
|
||||
# ─── Single-message pipeline (live handler + bot_downloader) ──────────────────
|
||||
|
||||
async def handle_message(
|
||||
client: TelegramClient,
|
||||
bot: TelegramClient,
|
||||
msg,
|
||||
source_name: str,
|
||||
patterns,
|
||||
password: str | None = None,
|
||||
) -> None:
|
||||
"""Download and process a single file message."""
|
||||
filename = get_filename(msg)
|
||||
if not filename:
|
||||
log.warning(" handle_message: could not extract filename, skipping.")
|
||||
return
|
||||
|
||||
size = get_filesize(msg)
|
||||
ok, reason = is_processable(filename, size)
|
||||
if not ok:
|
||||
log.warning(f" handle_message: skipping '{filename}' — {reason}")
|
||||
return
|
||||
|
||||
doc_id = msg.media.document.id
|
||||
if is_seen(doc_id):
|
||||
log.info(f" Skipping {filename} — already processed.")
|
||||
return
|
||||
|
||||
dest = _make_dest(msg, filename)
|
||||
log.info(f"↓ Downloading: {filename} ({size / 1024:.1f} KB) from {source_name}")
|
||||
|
||||
# tdl single → Telethon fallback
|
||||
downloaded = await download_single_with_tdl(msg, dest) if is_tdl_available() else False
|
||||
if not downloaded:
|
||||
if is_tdl_available():
|
||||
log.warning(" [tdl] failed — falling back to Telethon")
|
||||
downloaded = await _telethon_download(client, msg, dest, filename, size)
|
||||
|
||||
if not downloaded:
|
||||
log.error(f" All download attempts failed for {filename}")
|
||||
return
|
||||
|
||||
hits = process_file(dest, patterns, password=password)
|
||||
mark_seen(doc_id)
|
||||
|
||||
if hits:
|
||||
await notify(bot, hits, source_name, filename)
|
||||
else:
|
||||
log.info(f" No hits in {filename}")
|
||||
|
||||
|
||||
# ─── Batch pipeline (backfill only) ───────────────────────────────────────────
|
||||
|
||||
async def _process_batch(
|
||||
client: TelegramClient,
|
||||
bot: TelegramClient,
|
||||
batch: list[tuple], # list of (msg, source_name, password)
|
||||
patterns,
|
||||
) -> int:
|
||||
"""
|
||||
Download up to TDL_AMOUNT messages in one tdl invocation, then process
|
||||
each. Falls back to Telethon per-file for anything tdl missed.
|
||||
Returns the number of files successfully processed.
|
||||
"""
|
||||
if not batch:
|
||||
return 0
|
||||
|
||||
# Build BatchEntry list
|
||||
entries: list[BatchEntry] = []
|
||||
for msg, source_name, password in batch:
|
||||
filename = get_filename(msg)
|
||||
if not filename:
|
||||
continue
|
||||
entries.append(BatchEntry(
|
||||
msg=msg,
|
||||
filename=filename,
|
||||
dest=_make_dest(msg, filename),
|
||||
doc_id=msg.media.document.id,
|
||||
source_name=source_name,
|
||||
password=password,
|
||||
))
|
||||
|
||||
names = ", ".join(e.filename for e in entries)
|
||||
log.info(f"[Batch] {len(entries)} file(s): {names}")
|
||||
|
||||
# One tdl call for the whole batch
|
||||
results = await download_batch_with_tdl(entries)
|
||||
|
||||
processed = 0
|
||||
for entry in entries:
|
||||
tdl_ok = results.get(entry.doc_id, False)
|
||||
|
||||
if not tdl_ok:
|
||||
# Per-file Telethon fallback
|
||||
log.info(f" [Batch] Telethon fallback: {entry.filename}")
|
||||
size = get_filesize(entry.msg)
|
||||
tdl_ok = await _telethon_download(client, entry.msg, entry.dest, entry.filename, size)
|
||||
|
||||
if not tdl_ok:
|
||||
log.error(f" [Batch] All attempts failed: {entry.filename}")
|
||||
continue
|
||||
|
||||
hits = process_file(entry.dest, patterns, password=entry.password)
|
||||
mark_seen(entry.doc_id)
|
||||
|
||||
if hits:
|
||||
await notify(bot, hits, entry.source_name, entry.filename)
|
||||
else:
|
||||
log.info(f" No hits in {entry.filename}")
|
||||
|
||||
processed += 1
|
||||
|
||||
return processed
|
||||
|
||||
|
||||
# ─── Backfill ─────────────────────────────────────────────────────────────────
|
||||
|
||||
async def backfill_channel(
|
||||
client: TelegramClient,
|
||||
bot: TelegramClient,
|
||||
channel: str | int,
|
||||
patterns,
|
||||
limit: int,
|
||||
) -> None:
|
||||
"""Scan the last `limit` messages of a channel for file attachments."""
|
||||
log.info(f"[Backfill] Scanning history: {channel} (last {limit} messages)")
|
||||
total = 0
|
||||
batch: list[tuple] = [] # (msg, source_name, password)
|
||||
last_password: str | None = None # carry password across adjacent messages
|
||||
|
||||
async def flush_batch():
|
||||
nonlocal total
|
||||
if batch:
|
||||
total += await _process_batch(client, bot, batch, patterns)
|
||||
batch.clear()
|
||||
|
||||
try:
|
||||
async for msg in client.iter_messages(channel, limit=limit):
|
||||
source_name = str(channel)
|
||||
|
||||
# Extract password from this message if present, and remember it.
|
||||
# iter_messages goes newest→oldest, so a password post that appears
|
||||
# above the files in the channel will arrive AFTER them here.
|
||||
# We therefore carry last_password in both directions:
|
||||
# - apply it to file messages that have no inline password
|
||||
# - update it whenever we see a fresh password, so subsequent
|
||||
# (older) file messages in the same batch pick it up too.
|
||||
msg_password = extract_password(msg)
|
||||
if msg_password:
|
||||
last_password = msg_password
|
||||
|
||||
password = msg_password or last_password
|
||||
|
||||
if msg.media and isinstance(msg.media, MessageMediaDocument):
|
||||
filename = get_filename(msg)
|
||||
size = get_filesize(msg)
|
||||
|
||||
if not filename:
|
||||
continue
|
||||
|
||||
ok, reason = is_processable(filename, size)
|
||||
if not ok:
|
||||
log.warning(f" [Backfill] Skipping '{filename}' — {reason}")
|
||||
continue
|
||||
|
||||
if is_seen(msg.media.document.id):
|
||||
log.info(f" [Backfill] Already seen: {filename}")
|
||||
continue
|
||||
|
||||
if is_tdl_available():
|
||||
batch.append((msg, source_name, password))
|
||||
if len(batch) >= TDL_AMOUNT:
|
||||
await flush_batch()
|
||||
else:
|
||||
# No tdl — fall straight through to single handle_message
|
||||
await handle_message(client, bot, msg, source_name, patterns, password=password)
|
||||
total += 1
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
elif msg.buttons and has_download_button(msg):
|
||||
# Bot-button messages can't be batched — handle individually
|
||||
await flush_batch() # flush any pending batch first
|
||||
await handle_bot_download_message(client, bot, msg, source_name, patterns, password=password)
|
||||
total += 1
|
||||
await asyncio.sleep(1.5)
|
||||
|
||||
# Flush whatever's left
|
||||
await flush_batch()
|
||||
|
||||
except (ChannelPrivateError, UsernameNotOccupiedError) as e:
|
||||
log.error(f"[Backfill] Cannot access {channel}: {e}")
|
||||
except Exception as e:
|
||||
log.error(f"[Backfill] Error scanning {channel}: {e}")
|
||||
|
||||
log.info(f"[Backfill] Done: {channel} — {total} file(s) processed")
|
||||
|
||||
|
||||
async def backfill_all(
|
||||
client: TelegramClient,
|
||||
bot: TelegramClient,
|
||||
patterns,
|
||||
) -> None:
|
||||
"""Backfill all watched channels sequentially."""
|
||||
if BACKFILL_LIMIT <= 0:
|
||||
log.info("[Backfill] Disabled (BACKFILL_LIMIT=0)")
|
||||
return
|
||||
log.info(f"[Backfill] Starting for {len(WATCHED_CHANNELS)} channel(s)...")
|
||||
for ch in WATCHED_CHANNELS:
|
||||
await backfill_channel(client, bot, ch, patterns, BACKFILL_LIMIT)
|
||||
log.info("[Backfill] Complete.")
|
||||
|
||||
|
||||
# ─── Entity cache warmup ──────────────────────────────────────────────────────
|
||||
|
||||
async def warm_entity_cache(client: TelegramClient) -> None:
|
||||
"""
|
||||
Fetches your dialog list so Telethon caches all entity mappings.
|
||||
Required before using raw numeric IDs.
|
||||
"""
|
||||
log.info("Warming entity cache (fetching dialogs)...")
|
||||
async for _ in client.iter_dialogs():
|
||||
pass
|
||||
log.info("Entity cache ready.")
|
||||
|
||||
|
||||
# ─── Live listener ────────────────────────────────────────────────────────────
|
||||
|
||||
def register_handlers(
|
||||
client: TelegramClient,
|
||||
bot: TelegramClient,
|
||||
patterns,
|
||||
) -> None:
|
||||
"""Register the NewMessage event handler for all watched channels."""
|
||||
|
||||
# Per-channel password cache for the live handler.
|
||||
# Channels often post a text message with the password separately from
|
||||
# the file message. We remember the last seen password per channel so
|
||||
# that the file message that follows (or precedes by seconds) picks it up.
|
||||
_channel_passwords: dict[int, str] = {}
|
||||
|
||||
@client.on(events.NewMessage(chats=WATCHED_CHANNELS))
|
||||
async def on_new_message(event):
|
||||
msg = event.message
|
||||
try:
|
||||
source = event.chat.username or str(event.chat_id)
|
||||
except Exception:
|
||||
source = str(event.chat_id)
|
||||
|
||||
chat_id = event.chat_id
|
||||
log.info(f"[Live] New message in {source}")
|
||||
|
||||
# Update cache if this message carries a password
|
||||
msg_password = extract_password(msg)
|
||||
if msg_password:
|
||||
_channel_passwords[chat_id] = msg_password
|
||||
log.debug(f"[Live] Password cached for {source}: '{msg_password}'")
|
||||
|
||||
password = msg_password or _channel_passwords.get(chat_id)
|
||||
|
||||
if msg.media and isinstance(msg.media, MessageMediaDocument):
|
||||
await handle_message(client, bot, msg, source, patterns, password=password)
|
||||
elif msg.buttons and has_download_button(msg):
|
||||
await handle_bot_download_message(client, bot, msg, source, patterns, password=password)
|
||||
Reference in New Issue
Block a user