Initial commit: ULPgrammer

- Core Telegram monitoring pipeline (scraper, processor, notifier, downloaders)
- Textual TUI frontend with thread-safe event bus
- SQLite persistence, severity scoring, dedup cache
- Fixed ULP parser: handles https:// truncation, port+path URLs, semicolon separator
- Test suite: 88 tests across scorer, cache, database, processor
This commit is contained in:
2026-04-02 01:58:49 -03:00
commit 48f486ac97
41 changed files with 5270 additions and 0 deletions

1
core/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""core — Telegram I/O pipeline (scraper, downloader, processor, notifier)."""

68
core/bot_downloader.md Normal file
View File

@@ -0,0 +1,68 @@
# core/bot_downloader.py
Handles "click to download" inline button flows. Some Telegram channels post files via a bot behind a button rather than directly attaching them.
## Public API
```python
from core.bot_downloader import (
handle_bot_download_message,
has_download_button,
extract_password,
)
```
### `handle_bot_download_message(client, bot, msg, source_name, patterns, password=None)`
**async.** Full pipeline:
1. Detect download button
2. Click it (URL button → `/start payload` to the bot; callback button → `.click()`)
3. Wait up to `BOT_REPLY_TIMEOUT` seconds for the bot to send a file back
4. Hand each file response to `core.scraper.handle_message()`
### `has_download_button(msg) -> bool`
Returns `True` if the message contains a recognisable download button.
Checked in live handler and backfill before calling this module.
### `extract_password(msg) -> str | None`
Scans message text for `Pass: ...` / `Password: ...` / `Contraseña: ...` patterns.
Returns the extracted password string, or `None`.
---
## Button detection
Recognised button text keywords (case-insensitive):
```
DOWNLOAD, DESCARGAR, GET FILE, GET PACK, ⬇, 📥
```
---
## URL button flow (most common)
```
Button URL: https://t.me/SomeBot?start=ABC123
→ parse bot username + payload
→ client.send_message(bot_entity, "/start ABC123")
→ poll get_messages(bot_entity, limit=3) every 1s for BOT_REPLY_TIMEOUT seconds
→ return file messages found
```
## Callback button flow (fallback)
```
btn.click()
→ sleep 2s
→ get_messages(sender, limit=5)
→ return file messages found
```
---
## Constants
| Name | Value | Description |
|------|-------|-------------|
| `BOT_REPLY_TIMEOUT` | `10` | Seconds to wait for bot file reply |
| `DOWNLOAD_BUTTON_KEYWORDS` | see above | Button text triggers |
| `PASSWORD_PATTERN` | regex | Matches `Pass[word]: value` in message text |

161
core/bot_downloader.py Normal file
View File

@@ -0,0 +1,161 @@
"""
bot_downloader.py — Handles "click to download" inline button flows.
Some Telegram channels post messages with a DOWNLOAD button that triggers
a bot to send you the actual file. This module simulates that click and
captures the bot's file response.
"""
import asyncio
import re
import logging
from telethon import TelegramClient
from telethon.tl.types import MessageMediaDocument, KeyboardButtonUrl
from telethon.errors import FloodWaitError
log = logging.getLogger(__name__)
DOWNLOAD_BUTTON_KEYWORDS = ["DOWNLOAD", "DESCARGAR", "GET FILE", "GET PACK", "", "📥"]
BOT_REPLY_TIMEOUT = 10
PASSWORD_PATTERN = re.compile(
r"(?:Pass|Password|Contraseña|Contrasena|Clave)[\s]*:[\s]*(.+)$",
re.IGNORECASE | re.MULTILINE
)
# ─── Password extraction ──────────────────────────────────────────────────────
def extract_password(msg) -> str | None:
if not msg.text:
return None
match = PASSWORD_PATTERN.search(msg.text)
if match:
pwd = match.group(1).strip()
# Strip markdown formatting characters
pwd = pwd.strip("*`_~")
log.info(f" Found password in message: '{pwd}'")
return pwd
return None
# ─── Button detection ─────────────────────────────────────────────────────────
def find_download_button(msg):
"""
Scans a message's inline keyboard for a download-like button.
Returns the button object or None.
"""
if not msg.buttons:
return None
for row in msg.buttons:
for btn in row:
if any(kw in btn.text.upper() for kw in DOWNLOAD_BUTTON_KEYWORDS):
return btn
return None
def has_download_button(msg) -> bool:
return find_download_button(msg) is not None
# ─── Click + wait flow ────────────────────────────────────────────────────────
async def click_download_button(client: TelegramClient, msg) -> list:
"""
Clicks the download button on a message, then waits for the bot to reply
with a file. Returns a list of response messages containing documents.
"""
btn = find_download_button(msg)
if not btn:
return []
log.info(f" Clicking button: '{btn.text}'")
# ── URL button (most common) ───────────────────────────────────────────
if isinstance(btn.button, KeyboardButtonUrl):
url = btn.button.url # e.g. https://t.me/SomeBot?start=ABC123
match = re.search(r"t\.me/([A-Za-z0-9_]+)\?start=(.+)", url)
if not match:
log.warning(f" Unrecognised URL format: {url}")
return []
bot_username, payload = match.group(1), match.group(2)
log.info(f" → Messaging @{bot_username} with /start {payload}")
try:
bot_entity = await client.get_entity(bot_username)
await client.send_message(bot_entity, f"/start {payload}")
except Exception as e:
log.error(f" Failed to message bot: {e}")
return []
# Poll for reply
log.info(f" Waiting up to {BOT_REPLY_TIMEOUT}s for bot reply...")
for _ in range(BOT_REPLY_TIMEOUT):
await asyncio.sleep(1)
try:
recent = await client.get_messages(bot_entity, limit=3)
files = [m for m in recent if m.media and isinstance(m.media, MessageMediaDocument)]
if files:
log.info(f" ✓ Got file from bot.")
return files
except Exception as e:
log.warning(f" Poll error: {e}")
break
log.warning(f" Bot did not reply within {BOT_REPLY_TIMEOUT}s.")
return []
# ── Callback button (less common) ─────────────────────────────────────
else:
try:
await btn.click()
await asyncio.sleep(2)
except Exception as e:
log.error(f" Callback click failed: {e}")
return []
try:
sender = await msg.get_sender()
recent = await client.get_messages(sender, limit=5)
return [m for m in recent if m.media and isinstance(m.media, MessageMediaDocument)]
except Exception as e:
log.warning(f" Fallback poll failed: {e}")
return []
# ─── Main entry point ─────────────────────────────────────────────────────────
async def handle_bot_download_message(
client: TelegramClient,
bot: TelegramClient,
msg,
source_name: str,
patterns,
password: str | None = None,
) -> None:
"""
Full pipeline for a message with a download button:
1. Detect download button
2. Click it
3. Wait for bot to send back a file
4. Hand off to the normal handle_message() flow
"""
if not has_download_button(msg):
return
log.info(f"[BotDL] Download button detected in {source_name}")
responses = await click_download_button(client, msg)
if not responses:
log.warning(f"[BotDL] No file received for message in {source_name}.")
return
from core.scraper import handle_message
for resp in responses:
log.info(f" [BotDL] Response media type: {type(resp.media).__name__}, attrs: {getattr(resp.media.document, 'attributes', []) if hasattr(resp.media, 'document') else 'none'}")
await handle_message(client, bot, resp, f"{source_name}[bot]", patterns, password=password)

67
core/notifier.md Normal file
View File

@@ -0,0 +1,67 @@
# core/notifier.py
Scores hits, deduplicates, persists to disk and DB, sends Telegram alerts.
## Public API
```python
from core.notifier import notify, send_status
```
### `notify(bot, hits: list[str], source: str, filename: str)`
**async.** Full notification pipeline:
1. `score_hits(hits)``list[ScoredHit]`
2. Deduplicate via SHA-256 hashes (`data/dedup.json`)
3. `insert_hits()` into SQLite for new + dupes (flagged accordingly)
4. `write_hits()` → append to `data/hits.txt`
5. `write_hits_csv()` → append to `data/hits.csv`
6. `send_alert()` → Telegram message for CRITICAL/HIGH/MEDIUM only
7. Post `EvHit` events onto the TUI bus for each new hit
### `send_status(bot, message: str)`
**async.** Sends a plain Markdown message to `config.NOTIFY_CHAT_ID`. Used for startup/status notifications.
---
## Internal functions
| Function | Description |
|----------|-------------|
| `deduplicate(hits)` | Returns `(new_hits, dupe_hits)`; updates `data/dedup.json` |
| `write_hits(scored_hits, source)` | Appends grouped human-readable block to `data/hits.txt` |
| `write_hits_csv(scored_hits, source, filename)` | Appends rows to `data/hits.csv`; writes header on first call |
| `send_alert(bot, scored_hits, source, filename)` | Sends Telegram message grouped by severity; skips if all LOW |
---
## Output files
| File | Format | Notes |
|------|--------|-------|
| `data/hits.txt` | Plain text, grouped by severity | Human-readable, append-only |
| `data/hits.csv` | CSV with header | Columns: `timestamp, severity, score, url, username, password, reasons, source, filename` |
| `data/dedup.json` | JSON array of SHA-256 hex strings | Hashes of `line.strip().lower()` |
---
## Alert behaviour
- CRITICAL / HIGH / MEDIUM → Telegram alert sent immediately
- LOW → stored in DB + files, **no** Telegram alert
- Duplicates → stored in DB with `seen_before=1`, no alert, no file write
## Telegram alert format
```
🚨 Credential hit(s) detected
📁 `filename`
📢 `source`
🕐 `timestamp`
Summary: 🔴 N 🟠 N 🟡 N 🟢 N
🔴 CRITICAL (N)
`url:user:pass`
↳ reason | reason
... (up to 10 per severity; remainder counted)
```

248
core/notifier.py Normal file
View File

@@ -0,0 +1,248 @@
"""
notifier.py — Persists hits to disk and sends Telegram bot alerts.
Includes:
- Severity scoring via scorer.py
- Deduplication: same credential never written or alerted twice
- SQLite storage via database.py
- hits.txt kept as a human-readable backup
- Telegram alerts grouped by severity
"""
import logging
import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path
from telethon import TelegramClient
import csv
from config import HITS_FILE, NOTIFY_CHAT_ID
from utils.scorer import score_hits, summarize, CRITICAL, HIGH, MEDIUM, LOW, SEVERITY_EMOJI
from utils.database import insert_hits
from tui import events as bus
HITS_CSV = HITS_FILE.with_suffix(".csv")
log = logging.getLogger(__name__)
MAX_PREVIEW = 10 # hits to show per severity group in alert
DEDUP_FILE = Path("./data/dedup.json")
# Only alert immediately for these severities — LOW hits are silent
ALERT_SEVERITIES = {CRITICAL, HIGH, MEDIUM}
# ─── Deduplication ────────────────────────────────────────────────────────────
def _hash(line: str) -> str:
return hashlib.sha256(line.strip().lower().encode()).hexdigest()
def _load_seen_hashes() -> set:
if not DEDUP_FILE.exists():
return set()
try:
with open(DEDUP_FILE, "r") as f:
return set(json.load(f))
except Exception:
return set()
def _save_seen_hashes(seen: set) -> None:
try:
with open(DEDUP_FILE, "w") as f:
json.dump(list(seen), f)
except Exception as e:
log.warning(f"Could not save dedup file: {e}")
def deduplicate(hits: list) -> tuple[list, list]:
"""
Accepts a list of ScoredHit objects.
Returns (new_hits, dupe_hits).
"""
seen = _load_seen_hashes()
new_hits = []
dupe_hits = []
new_hashes = set()
for h in hits:
digest = _hash(h.raw)
if digest in seen:
dupe_hits.append(h)
else:
new_hits.append(h)
new_hashes.add(digest)
if new_hashes:
seen.update(new_hashes)
_save_seen_hashes(seen)
log.info(
f" Dedup: {len(hits)} raw hit(s) → "
f"{len(new_hits)} new, {len(dupe_hits)} duplicate(s)"
)
return new_hits, dupe_hits
# ─── Helpers ─────────────────────────────────────────────────────────────────
def _timestamp() -> str:
return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
# ─── Output ──────────────────────────────────────────────────────────────────
def write_hits(scored_hits: list, source: str) -> None:
"""Append new hits to hits.txt grouped by severity."""
HITS_FILE.parent.mkdir(parents=True, exist_ok=True)
summary = summarize(scored_hits)
with open(HITS_FILE, "a", encoding="utf-8") as f:
f.write(f"\n{'='*60}\n")
f.write(f"Source : {source}\n")
f.write(f"Time : {_timestamp()}\n")
f.write(f"Hits : {len(scored_hits)} ")
f.write(f"(CRITICAL={summary[CRITICAL]} HIGH={summary[HIGH]} ")
f.write(f"MEDIUM={summary[MEDIUM]} LOW={summary[LOW]})\n")
f.write(f"{'='*60}\n")
for severity in [CRITICAL, HIGH, MEDIUM, LOW]:
group = [h for h in scored_hits if h.severity == severity]
if not group:
continue
emoji = SEVERITY_EMOJI[severity]
f.write(f"\n{emoji} {severity} ({len(group)})\n")
for h in group:
f.write(f" {h.raw}\n")
f.write(f"{' | '.join(h.reasons)}\n")
log.info(f" Wrote {len(scored_hits)} hit(s) to {HITS_FILE}")
def write_hits_csv(scored_hits: list, source: str, filename: str) -> None:
"""Append new hits to hits.csv — one row per hit, easy to import."""
HITS_CSV.parent.mkdir(parents=True, exist_ok=True)
write_header = not HITS_CSV.exists()
timestamp = _timestamp()
with open(HITS_CSV, "a", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
if write_header:
writer.writerow([
"timestamp", "severity", "score", "url", "username",
"password", "reasons", "source", "filename",
])
for h in scored_hits:
writer.writerow([
timestamp, h.severity, h.score,
h.url or "", h.username or "", h.password or "",
" | ".join(h.reasons), source, filename,
])
log.info(f" Wrote {len(scored_hits)} hit(s) to {HITS_CSV}")
async def send_alert(
bot: TelegramClient,
scored_hits: list,
source: str,
filename: str,
) -> None:
"""
Send a Telegram alert grouped by severity.
Only includes CRITICAL, HIGH, MEDIUM — LOW hits are omitted from alerts.
"""
summary = summarize(scored_hits)
alertable = [h for h in scored_hits if h.severity in ALERT_SEVERITIES]
if not alertable:
log.info(" No alertable hits (all LOW) — skipping Telegram notification.")
return
lines = [
f"🚨 *Credential hit(s) detected*",
f"",
f"📁 `{filename}`",
f"📢 `{source}`",
f"🕐 `{_timestamp()}`",
f"",
f"*Summary:*",
f"🔴 CRITICAL: `{summary[CRITICAL]}` "
f"🟠 HIGH: `{summary[HIGH]}` "
f"🟡 MEDIUM: `{summary[MEDIUM]}` "
f"🟢 LOW: `{summary[LOW]}`",
]
for severity in [CRITICAL, HIGH, MEDIUM]:
group = [h for h in scored_hits if h.severity == severity]
if not group:
continue
emoji = SEVERITY_EMOJI[severity]
lines.append(f"\n{emoji} *{severity}* ({len(group)})")
for h in group[:MAX_PREVIEW]:
safe = h.raw.replace("`", "'")
lines.append(f"`{safe}`")
lines.append(f"_↳ {' | '.join(h.reasons)}_")
if len(group) > MAX_PREVIEW:
lines.append(f"_...and {len(group) - MAX_PREVIEW} more_")
try:
await bot.send_message(NOTIFY_CHAT_ID, "\n".join(lines), parse_mode="markdown")
except Exception as e:
log.error(f"Failed to send Telegram alert: {e}")
# ─── Main entry point ────────────────────────────────────────────────────────
async def notify(bot: TelegramClient, hits: list[str], source: str, filename: str) -> None:
"""
Full notification pipeline:
1. Score all hits
2. Deduplicate
3. Insert all hits into SQLite (new + dupes, flagged accordingly)
4. Write new hits to hits.txt
5. Send Telegram alert for new alertable hits only
"""
if not hits:
return
# Score first
scored = score_hits(hits)
log.info(f" Scored {len(scored)} hit(s) — {summarize(scored)}")
# Deduplicate
new_hits, dupe_hits = deduplicate(scored)
# Always insert into DB
if new_hits:
insert_hits(new_hits, source, filename, seen_before=False)
if dupe_hits:
insert_hits(dupe_hits, source, filename, seen_before=True)
if not new_hits:
log.info(" All hits already seen before — no alert sent.")
return
# Push hits to TUI
for h in new_hits:
bus.post(bus.EvHit(
severity=h.severity,
raw=h.raw,
source=source,
filename=filename,
reasons=h.reasons,
))
write_hits(new_hits, source)
write_hits_csv(new_hits, source, filename)
await send_alert(bot, new_hits, source, filename)
async def send_status(bot: TelegramClient, message: str) -> None:
"""Send a plain status/info message to the notify chat."""
try:
await bot.send_message(NOTIFY_CHAT_ID, message, parse_mode="markdown")
except Exception as e:
log.error(f"Failed to send status message: {e}")

69
core/processor.md Normal file
View File

@@ -0,0 +1,69 @@
# core/processor.py
Archive extraction and hit searching. No Telegram deps, no async.
## Public API
```python
from core.processor import compile_patterns, process_file
```
### `compile_patterns(keywords: list[str]) -> list[re.Pattern]`
Compiles a list of keyword strings into case-insensitive regex patterns.
Call once at startup; pass the result everywhere patterns are needed.
```python
patterns = compile_patterns(config.TARGET_KEYWORDS)
```
### `process_file(filepath: Path, patterns, password=None) -> list[str]`
Full pipeline: unpack → search each `.txt` → recurse into nested archives → clean up everything.
Returns list of matching raw lines (hits). Deletes the original file and all extracted contents on completion.
```python
hits = process_file(Path("data/tmp/combo.zip"), patterns, password="infected")
```
---
## Internal functions
| Function | Signature | Description |
|----------|-----------|-------------|
| `search_file` | `(filepath, patterns) -> list[str]` | Stream-reads `.txt` line by line; ignores encoding errors |
| `unpack` | `(filepath, extra_password) -> (files, extract_dir\|None)` | Dispatches to correct extractor; plain `.txt` returned as-is |
| `extract_zip` | `(filepath, dest, extra_password)` | Tries no password first, then `ARCHIVE_PASSWORDS` list |
| `extract_7z` | `(filepath, dest, extra_password)` | Requires `py7zr`; skips if not installed |
| `extract_rar` | `(filepath, dest, extra_password)` | Requires `rarfile` + `unrar` binary |
| `_try_passwords` | `(extract_fn, passwords)` | Iterates password list, stops on first success |
---
## Supported formats
| Extension | Library | Notes |
|-----------|---------|-------|
| `.txt` | built-in | Stream-read, no load into memory |
| `.zip` | `zipfile` | stdlib |
| `.7z` | `py7zr` | optional; skipped if not installed |
| `.rar` | `rarfile` | optional; requires `unrar` system binary |
Nested archives are recursed **one level** only.
---
## Password order
1. `extra_password` (from message/channel carry-forward) — tried first
2. `config.ARCHIVE_PASSWORDS` — tried in order
---
## Cleanup guarantee
`process_file` always deletes:
- Extracted individual files
- Extract subdirectory
- Original downloaded file
Even if no hits are found.

233
core/processor.py Normal file
View File

@@ -0,0 +1,233 @@
"""
processor.py — Archive extraction and hit searching logic.
Supports: .txt, .zip, .7z, .rar
Stream-processes files line by line — safe for large combo lists.
"""
import rarfile
rarfile.UNRAR_TOOL = "unrar"
import re
import zipfile
import logging
import shutil
from pathlib import Path
try:
import py7zr
HAS_7Z = True
except ImportError:
HAS_7Z = False
try:
import rarfile
HAS_RAR = True
except ImportError:
HAS_RAR = False
from config import ARCHIVE_PASSWORDS
log = logging.getLogger(__name__)
# ─── Searching ───────────────────────────────────────────────────────────────
def compile_patterns(keywords: list[str]) -> list[re.Pattern]:
return [re.compile(kw, re.IGNORECASE) for kw in keywords]
def search_file(filepath: Path, patterns: list[re.Pattern]) -> list[str]:
"""
Stream-reads a text file line by line and returns lines matching any pattern.
Ignores encoding errors — combo files are often messy.
"""
hits: list[str] = []
try:
with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
for line in f:
stripped = line.strip()
if stripped and any(p.search(stripped) for p in patterns):
hits.append(stripped)
except Exception as e:
log.warning(f"Could not read {filepath.name}: {e}")
return hits
# ─── Extraction ──────────────────────────────────────────────────────────────
def _try_passwords(extract_fn, passwords: list[bytes]) -> bool:
"""Try a list of passwords against an extract function. Returns True on success."""
for pwd in passwords:
try:
extract_fn(pwd)
return True
except Exception:
continue
return False
def extract_zip(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
passwords = ARCHIVE_PASSWORDS.copy()
if extra_password:
passwords.insert(0, extra_password.encode())
extracted: list[Path] = []
try:
with zipfile.ZipFile(filepath) as zf:
def try_extract(pwd: bytes):
zf.extractall(dest, pwd=pwd or None)
try:
zf.extractall(dest)
except RuntimeError:
log.info(f" ZIP is password-protected, trying common passwords...")
if not _try_passwords(try_extract, ARCHIVE_PASSWORDS):
log.warning(f" Could not unlock {filepath.name} — skipping.")
return []
extracted = [p for p in dest.rglob("*") if p.is_file()]
except zipfile.BadZipFile:
log.warning(f" {filepath.name} is not a valid ZIP.")
except Exception as e:
log.warning(f" ZIP extraction error on {filepath.name}: {e}")
return extracted
def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
if not HAS_7Z:
log.warning("py7zr not installed — skipping .7z file.")
return []
extracted: list[Path] = []
passwords = ARCHIVE_PASSWORDS.copy()
if extra_password:
passwords.insert(0, extra_password.encode())
try:
# Try without password first
try:
with py7zr.SevenZipFile(filepath, mode="r") as z:
z.extractall(dest)
except py7zr.exceptions.PasswordRequired:
log.info(f" 7z is password-protected, trying common passwords...")
success = False
for pwd in ARCHIVE_PASSWORDS:
try:
with py7zr.SevenZipFile(filepath, mode="r", password=pwd.decode()) as z:
z.extractall(dest)
success = True
break
except Exception:
continue
if not success:
log.warning(f" Could not unlock {filepath.name} — skipping.")
return []
extracted = [p for p in dest.rglob("*") if p.is_file()]
except Exception as e:
log.warning(f" 7z extraction error on {filepath.name}: {e}")
return extracted
def extract_rar(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
if not HAS_RAR:
log.warning("rarfile not installed — skipping .rar file.")
return []
passwords = ARCHIVE_PASSWORDS.copy()
if extra_password:
passwords.insert(0, extra_password.encode())
extracted: list[Path] = []
try:
with rarfile.RarFile(filepath) as rf:
def try_extract(pwd: bytes):
rf.extractall(dest, pwd=pwd.decode() if pwd else None)
try:
rf.extractall(dest)
except rarfile.BadRarFile:
log.warning(f" {filepath.name} is not a valid RAR.")
return []
except Exception:
log.info(f" RAR may be password-protected, trying common passwords...")
if not _try_passwords(try_extract, ARCHIVE_PASSWORDS):
log.warning(f" Could not unlock {filepath.name} — skipping.")
return []
extracted = [p for p in dest.rglob("*") if p.is_file()]
except Exception as e:
log.warning(f" RAR extraction error on {filepath.name}: {e}")
return extracted
def unpack(filepath: Path, extra_password: str | None = None) -> tuple[list[Path], Path | None]:
"""
Unpacks an archive into a sibling directory.
Returns (list of extracted files, extract_dir or None).
If it's not an archive, returns ([filepath], None).
"""
suffix = filepath.suffix.lower()
extract_dir = filepath.parent / filepath.stem
if suffix == ".zip":
extract_dir.mkdir(exist_ok=True)
files = extract_zip(filepath, extract_dir, extra_password)
return files, extract_dir
elif suffix == ".7z":
extract_dir.mkdir(exist_ok=True)
files = extract_7z(filepath, extract_dir, extra_password)
return files, extract_dir
elif suffix == ".rar":
extract_dir.mkdir(exist_ok=True)
files = extract_rar(filepath, extract_dir, extra_password)
return files, extract_dir
else:
# Plain file — return as-is, no extract dir to clean up
return [filepath], None
# ─── Main entry point ────────────────────────────────────────────────────────
def process_file(filepath: Path, patterns, password: str | None = None) -> list[str]:
"""
Full pipeline: unpack → search each file → clean up everything.
Returns list of matching lines (hits).
"""
log.info(f" Processing: {filepath.name}")
all_hits: list[str] = []
files, extract_dir = unpack(filepath, extra_password=password)
for f in files:
if f.suffix.lower() == ".txt":
hits = search_file(f, patterns)
if hits:
log.info(f"{len(hits)} hit(s) in {f.name}")
all_hits.extend(hits)
# Nested archives — recurse one level
elif f.suffix.lower() in {".zip", ".7z", ".rar"} and f != filepath:
log.info(f" → Nested archive: {f.name}")
nested_hits = process_file(f, patterns)
all_hits.extend(nested_hits)
continue # process_file already cleaned up f
# Clean up extracted file
try:
f.unlink(missing_ok=True)
except Exception:
pass
# Clean up extract dir
if extract_dir and extract_dir.exists():
shutil.rmtree(extract_dir, ignore_errors=True)
# Clean up original download
try:
filepath.unlink(missing_ok=True)
except Exception:
pass
return all_hits

65
core/scraper.md Normal file
View File

@@ -0,0 +1,65 @@
# core/scraper.py
Telethon user-client layer. Handles live listening, backfill, and the single-message download pipeline.
## Public API
```python
from core.scraper import handle_message, backfill_all, register_handlers, warm_entity_cache
```
### `handle_message(client, bot, msg, source_name, patterns, password=None)`
**async.** Full pipeline for one document message:
1. Extract filename + size, check allowlist + size guard
2. Check `utils.cache` — skip if already seen
3. Try `tdl` download → Telethon fallback
4. `core.processor.process_file()` → hits
5. `core.notifier.notify()` if hits found
6. `utils.cache.mark_seen()`
Called by: live handler, `bot_downloader`, backfill fallback path.
### `backfill_all(client, bot, patterns)`
**async.** Iterates `config.WATCHED_CHANNELS`, calls `backfill_channel()` for each.
No-op if `config.BACKFILL_LIMIT == 0`.
### `register_handlers(client, bot, patterns)`
Registers a `NewMessage` Telethon event handler on `config.WATCHED_CHANNELS`.
Used in **CLI mode only** (`--no-tui`). The TUI manages its own handler via `_make_handler()` in `tui/app.py`.
### `warm_entity_cache(client)`
**async.** Iterates `client.iter_dialogs()` so Telethon caches entity mappings.
Must be called before using raw numeric channel IDs.
---
## Internal functions
| Function | Description |
|----------|-------------|
| `get_filename(msg)` | Extracts filename from `MessageMediaDocument`; falls back to `{msg_id}{ext}` from MIME |
| `get_filesize(msg)` | Returns document size in bytes |
| `is_processable(filename, size)` | Checks extension allowlist + size limit; returns `(bool, reason)` |
| `_make_dest(msg, filename)` | Resolves temp path, handles collision with `{msg_id}_{filename}` |
| `_telethon_download(client, msg, dest, ...)` | Telethon fallback with tqdm progress + flood-wait handling. Posts `EvDownload*` bus events |
| `backfill_channel(client, bot, channel, patterns, limit)` | Scans history with password carry-forward; batches via tdl |
| `_process_batch(client, bot, batch, patterns)` | One tdl invocation for up to `TDL_AMOUNT` messages; per-file Telethon fallback |
---
## Password carry-forward (backfill)
Channels often post the archive password as a separate text message.
`backfill_channel` iterates newest→oldest, carrying `last_password` so both older and newer file messages in the same scan pick it up.
---
## Download strategy
```
is_tdl_available()?
yes → download_single_with_tdl() / download_batch_with_tdl()
↓ failed?
_telethon_download()
no → _telethon_download() directly
```

410
core/scraper.py Normal file
View File

@@ -0,0 +1,410 @@
"""
scraper.py — Telethon user client.
Handles:
- Listening for new file messages in watched channels
- Listening for messages with inline download buttons (bot-dispatched files)
- Backfilling recent channel history on startup (batched via tdl)
- Downloading files safely (size guard, flood wait)
"""
import asyncio
import logging
import time
from pathlib import Path
from tqdm import tqdm
from telethon import TelegramClient, events
from telethon.errors import FloodWaitError, ChannelPrivateError, UsernameNotOccupiedError
from telethon.tl.types import (
MessageMediaDocument,
DocumentAttributeFilename,
InputDocumentFileLocation,
)
from config import (
ALLOWED_EXTENSIONS,
BACKFILL_LIMIT,
MAX_FILE_SIZE,
TEMP_DIR,
WATCHED_CHANNELS,
TDL_AMOUNT,
)
from core.bot_downloader import handle_bot_download_message, has_download_button, extract_password
from utils.cache import is_seen, mark_seen
from core.processor import process_file
from core.notifier import notify
from core.tdl_downloader import (
BatchEntry,
download_batch_with_tdl,
download_single_with_tdl,
is_tdl_available,
)
from tui import events as bus
log = logging.getLogger(__name__)
# ─── Helpers ──────────────────────────────────────────────────────────────────
def get_filename(msg) -> str | None:
"""Extract the filename from a document message, if any."""
if not isinstance(msg.media, MessageMediaDocument):
return None
doc = msg.media.document
for attr in doc.attributes:
if isinstance(attr, DocumentAttributeFilename):
return attr.file_name
mime = getattr(doc, "mime_type", "") or ""
ext_map = {
"application/x-rar-compressed": ".rar",
"application/vnd.rar": ".rar",
"application/zip": ".zip",
"application/x-7z-compressed": ".7z",
"text/plain": ".txt",
}
return f"{msg.id}{ext_map.get(mime, '.bin')}"
def get_filesize(msg) -> int:
"""Return document size in bytes, or 0 if not a document."""
if not isinstance(msg.media, MessageMediaDocument):
return 0
return msg.media.document.size or 0
def is_processable(filename: str, size: int) -> tuple[bool, str]:
"""Check whether a file should be downloaded. Returns (ok, reason)."""
suffix = Path(filename).suffix.lower()
if suffix not in ALLOWED_EXTENSIONS:
return False, f"extension {suffix!r} not in allowlist"
if size > MAX_FILE_SIZE:
mb = size / (1024 * 1024)
return False, f"too large ({mb:.1f} MB > {MAX_FILE_SIZE // (1024 * 1024)} MB limit)"
return True, ""
def _make_dest(msg, filename: str) -> Path:
"""Resolve the destination path, avoiding name collisions."""
TEMP_DIR.mkdir(exist_ok=True)
dest = TEMP_DIR / filename
if dest.exists():
dest = TEMP_DIR / f"{msg.id}_{filename}"
return dest
# ─── Telethon fallback download ───────────────────────────────────────────────
async def _telethon_download(client: TelegramClient, msg, dest: Path, filename: str, size: int, batch_id: str | None = None) -> bool:
"""Download a single file via Telethon. Returns True on success."""
_bid = batch_id or f"telethon_{int(time.monotonic_ns())}"
if batch_id is None:
# Standalone call (not already queued by tdl path) — post queued event
bus.post(bus.EvDownloadQueued(
batch_id=_bid, filename=filename,
size_mb=round(size / (1024 * 1024), 2),
source="telethon", password=None,
))
bus.post(bus.EvDownloadStarted(batch_id=_bid, filename=filename))
try:
with tqdm(
total=size,
unit="B",
unit_scale=True,
unit_divisor=1024,
desc=filename[:40],
colour="cyan",
) as pbar:
async def progress(current, total):
pbar.n = current
pbar.refresh()
doc = msg.media.document
location = InputDocumentFileLocation(
id=doc.id,
access_hash=doc.access_hash,
file_reference=doc.file_reference,
thumb_size="",
)
await client.download_file(
location,
file=dest,
part_size_kb=512,
progress_callback=progress,
)
bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon"))
return True
except FloodWaitError as e:
log.warning(f" Flood wait: sleeping {e.seconds}s...")
await asyncio.sleep(e.seconds)
await client.download_media(msg, file=dest)
bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon"))
return True
except Exception as e:
log.error(f" Telethon download failed for {filename}: {e}")
bus.post(bus.EvDownloadFailed(batch_id=_bid, filename=filename, reason=str(e)))
return False
# ─── Single-message pipeline (live handler + bot_downloader) ──────────────────
async def handle_message(
client: TelegramClient,
bot: TelegramClient,
msg,
source_name: str,
patterns,
password: str | None = None,
) -> None:
"""Download and process a single file message."""
filename = get_filename(msg)
if not filename:
log.warning(" handle_message: could not extract filename, skipping.")
return
size = get_filesize(msg)
ok, reason = is_processable(filename, size)
if not ok:
log.warning(f" handle_message: skipping '{filename}'{reason}")
return
doc_id = msg.media.document.id
if is_seen(doc_id):
log.info(f" Skipping {filename} — already processed.")
return
dest = _make_dest(msg, filename)
log.info(f"↓ Downloading: {filename} ({size / 1024:.1f} KB) from {source_name}")
# tdl single → Telethon fallback
downloaded = await download_single_with_tdl(msg, dest) if is_tdl_available() else False
if not downloaded:
if is_tdl_available():
log.warning(" [tdl] failed — falling back to Telethon")
downloaded = await _telethon_download(client, msg, dest, filename, size)
if not downloaded:
log.error(f" All download attempts failed for {filename}")
return
hits = process_file(dest, patterns, password=password)
mark_seen(doc_id)
if hits:
await notify(bot, hits, source_name, filename)
else:
log.info(f" No hits in {filename}")
# ─── Batch pipeline (backfill only) ───────────────────────────────────────────
async def _process_batch(
client: TelegramClient,
bot: TelegramClient,
batch: list[tuple], # list of (msg, source_name, password)
patterns,
) -> int:
"""
Download up to TDL_AMOUNT messages in one tdl invocation, then process
each. Falls back to Telethon per-file for anything tdl missed.
Returns the number of files successfully processed.
"""
if not batch:
return 0
# Build BatchEntry list
entries: list[BatchEntry] = []
for msg, source_name, password in batch:
filename = get_filename(msg)
if not filename:
continue
entries.append(BatchEntry(
msg=msg,
filename=filename,
dest=_make_dest(msg, filename),
doc_id=msg.media.document.id,
source_name=source_name,
password=password,
))
names = ", ".join(e.filename for e in entries)
log.info(f"[Batch] {len(entries)} file(s): {names}")
# One tdl call for the whole batch
results = await download_batch_with_tdl(entries)
processed = 0
for entry in entries:
tdl_ok = results.get(entry.doc_id, False)
if not tdl_ok:
# Per-file Telethon fallback
log.info(f" [Batch] Telethon fallback: {entry.filename}")
size = get_filesize(entry.msg)
tdl_ok = await _telethon_download(client, entry.msg, entry.dest, entry.filename, size)
if not tdl_ok:
log.error(f" [Batch] All attempts failed: {entry.filename}")
continue
hits = process_file(entry.dest, patterns, password=entry.password)
mark_seen(entry.doc_id)
if hits:
await notify(bot, hits, entry.source_name, entry.filename)
else:
log.info(f" No hits in {entry.filename}")
processed += 1
return processed
# ─── Backfill ─────────────────────────────────────────────────────────────────
async def backfill_channel(
client: TelegramClient,
bot: TelegramClient,
channel: str | int,
patterns,
limit: int,
) -> None:
"""Scan the last `limit` messages of a channel for file attachments."""
log.info(f"[Backfill] Scanning history: {channel} (last {limit} messages)")
total = 0
batch: list[tuple] = [] # (msg, source_name, password)
last_password: str | None = None # carry password across adjacent messages
async def flush_batch():
nonlocal total
if batch:
total += await _process_batch(client, bot, batch, patterns)
batch.clear()
try:
async for msg in client.iter_messages(channel, limit=limit):
source_name = str(channel)
# Extract password from this message if present, and remember it.
# iter_messages goes newest→oldest, so a password post that appears
# above the files in the channel will arrive AFTER them here.
# We therefore carry last_password in both directions:
# - apply it to file messages that have no inline password
# - update it whenever we see a fresh password, so subsequent
# (older) file messages in the same batch pick it up too.
msg_password = extract_password(msg)
if msg_password:
last_password = msg_password
password = msg_password or last_password
if msg.media and isinstance(msg.media, MessageMediaDocument):
filename = get_filename(msg)
size = get_filesize(msg)
if not filename:
continue
ok, reason = is_processable(filename, size)
if not ok:
log.warning(f" [Backfill] Skipping '{filename}'{reason}")
continue
if is_seen(msg.media.document.id):
log.info(f" [Backfill] Already seen: {filename}")
continue
if is_tdl_available():
batch.append((msg, source_name, password))
if len(batch) >= TDL_AMOUNT:
await flush_batch()
else:
# No tdl — fall straight through to single handle_message
await handle_message(client, bot, msg, source_name, patterns, password=password)
total += 1
await asyncio.sleep(0.5)
elif msg.buttons and has_download_button(msg):
# Bot-button messages can't be batched — handle individually
await flush_batch() # flush any pending batch first
await handle_bot_download_message(client, bot, msg, source_name, patterns, password=password)
total += 1
await asyncio.sleep(1.5)
# Flush whatever's left
await flush_batch()
except (ChannelPrivateError, UsernameNotOccupiedError) as e:
log.error(f"[Backfill] Cannot access {channel}: {e}")
except Exception as e:
log.error(f"[Backfill] Error scanning {channel}: {e}")
log.info(f"[Backfill] Done: {channel}{total} file(s) processed")
async def backfill_all(
client: TelegramClient,
bot: TelegramClient,
patterns,
) -> None:
"""Backfill all watched channels sequentially."""
if BACKFILL_LIMIT <= 0:
log.info("[Backfill] Disabled (BACKFILL_LIMIT=0)")
return
log.info(f"[Backfill] Starting for {len(WATCHED_CHANNELS)} channel(s)...")
for ch in WATCHED_CHANNELS:
await backfill_channel(client, bot, ch, patterns, BACKFILL_LIMIT)
log.info("[Backfill] Complete.")
# ─── Entity cache warmup ──────────────────────────────────────────────────────
async def warm_entity_cache(client: TelegramClient) -> None:
"""
Fetches your dialog list so Telethon caches all entity mappings.
Required before using raw numeric IDs.
"""
log.info("Warming entity cache (fetching dialogs)...")
async for _ in client.iter_dialogs():
pass
log.info("Entity cache ready.")
# ─── Live listener ────────────────────────────────────────────────────────────
def register_handlers(
client: TelegramClient,
bot: TelegramClient,
patterns,
) -> None:
"""Register the NewMessage event handler for all watched channels."""
# Per-channel password cache for the live handler.
# Channels often post a text message with the password separately from
# the file message. We remember the last seen password per channel so
# that the file message that follows (or precedes by seconds) picks it up.
_channel_passwords: dict[int, str] = {}
@client.on(events.NewMessage(chats=WATCHED_CHANNELS))
async def on_new_message(event):
msg = event.message
try:
source = event.chat.username or str(event.chat_id)
except Exception:
source = str(event.chat_id)
chat_id = event.chat_id
log.info(f"[Live] New message in {source}")
# Update cache if this message carries a password
msg_password = extract_password(msg)
if msg_password:
_channel_passwords[chat_id] = msg_password
log.debug(f"[Live] Password cached for {source}: '{msg_password}'")
password = msg_password or _channel_passwords.get(chat_id)
if msg.media and isinstance(msg.media, MessageMediaDocument):
await handle_message(client, bot, msg, source, patterns, password=password)
elif msg.buttons and has_download_button(msg):
await handle_bot_download_message(client, bot, msg, source, patterns, password=password)

70
core/tdl_downloader.md Normal file
View File

@@ -0,0 +1,70 @@
# core/tdl_downloader.py
Fast file downloads via `tdl` (Go MTProto). Falls back gracefully if tdl is not installed.
## Public API
```python
from core.tdl_downloader import (
is_tdl_available,
download_single_with_tdl,
download_batch_with_tdl,
BatchEntry,
)
```
### `is_tdl_available() -> bool`
Returns `True` if `tdl` binary is on PATH.
### `download_single_with_tdl(msg, dest: Path) -> bool`
**async.** Downloads one message's document. Returns `True` on success.
Used by the live handler and `bot_downloader`.
### `download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]`
**async.** Downloads up to `TDL_AMOUNT` messages in a single `tdl dl` invocation.
Returns `{doc_id: True|False}``False` means Telethon fallback needed.
---
## BatchEntry dataclass
```python
@dataclass
class BatchEntry:
msg: object # Telethon Message
filename: str
dest: Path # final destination path in TEMP_DIR
doc_id: int # msg.media.document.id
source_name: str
password: str | None
```
---
## TUI output pipeline
In TUI mode (`bus.tui_active == True`), `_run_tdl` pipes stdout+stderr and relays lines as `EvTdlOutput` events in real time.
**Reads raw 256-byte chunks** (not line-by-line) and splits on `\r` and `\n`, because tdl uses `\r` to overwrite its progress bar in place.
In CLI mode: subprocess inherits the terminal, progress bars render natively.
---
## Staging directory isolation
Each batch/single download gets a unique `data/tmp/_tdl_{monotonic_ns}/` staging dir.
After `tdl` exits, files are matched by name (with fuzzy stem fallback for `filenamify()` mangling) and moved to final `dest`. Staging dir is removed regardless of outcome.
`--template '{{ filenamify .FileName }}'` — tdl uses the original Telegram filename, not its default `DialogID_MessageID_filename` format.
---
## Config knobs (`config.py`)
| Setting | Default | Description |
|---------|---------|-------------|
| `TDL_NAMESPACE` | `"default"` | `-n` flag; `None` omits it |
| `TDL_THREADS` | `8` | `-t` chunk workers per file |
| `TDL_PERFILE` | `4` | `-l` concurrent files per invocation |
| `TDL_AMOUNT` | `4` | Max messages per batch |
| `TDL_TAKEOUT` | `False` | `--takeout` session flag |

363
core/tdl_downloader.py Normal file
View File

@@ -0,0 +1,363 @@
"""
tdl_downloader.py — Fast file downloads via tdl (Go MTProto implementation).
Install: https://github.com/iyear/tdl
curl -sSL https://raw.githubusercontent.com/iyear/tdl/main/scripts/install.sh | bash
First-time setup — log in once:
tdl login # saves to namespace "default"
tdl login -n myns # saves to a named namespace
Relevant config.py knobs:
TDL_NAMESPACE str|None Session namespace (default "default"; None omits -n)
TDL_THREADS int Chunk workers per file (-t, default 4)
TDL_PERFILE int Concurrent files (-l, default 4)
TDL_AMOUNT int Messages per tdl batch (default 4)
TDL_TAKEOUT bool Use takeout session (--takeout)
Flag reference:
Global (BEFORE subcommand): -n --ns, -t --threads, -l --limit
dl-specific: -u --url, -d --dir, --template, --continue, --takeout
Download isolation strategy:
Each batch gets its own staging subdirectory (TEMP_DIR/<batch_id>/) so that
concurrent downloads and homoglyph filename collisions can never cause tdl's
internal .tmp → final rename to fail. Files are moved to TEMP_DIR after
the batch completes and the staging dir is removed.
"""
import asyncio
import logging
import shutil
import time
from dataclasses import dataclass
from pathlib import Path
from config import TDL_NAMESPACE, TDL_THREADS, TDL_PERFILE, TDL_TAKEOUT, TEMP_DIR
from tui import events as bus
log = logging.getLogger(__name__)
# ─── Availability ─────────────────────────────────────────────────────────────
def is_tdl_available() -> bool:
return shutil.which("tdl") is not None
# ─── Message → URL ────────────────────────────────────────────────────────────
def _build_message_url(msg) -> str:
"""
Build a t.me/c/<channel_id>/<msg_id> link from a Telethon Message.
Works for public and private channels alike.
"""
peer = msg.peer_id
if hasattr(peer, "channel_id"):
return f"https://t.me/c/{peer.channel_id}/{msg.id}"
elif hasattr(peer, "chat_id"):
return f"https://t.me/c/{peer.chat_id}/{msg.id}"
elif hasattr(peer, "user_id"):
return f"https://t.me/c/{peer.user_id}/{msg.id}"
raise ValueError(f"Cannot build message URL from peer: {peer!r}")
# ─── Command builder ──────────────────────────────────────────────────────────
def _build_cmd(urls: list[str], staging_dir: Path) -> list[str]:
"""
Build the full tdl dl command.
Global flags (-n, -t, -l) MUST precede the subcommand.
staging_dir is always an absolute path to a fresh per-batch directory,
so tdl's internal .tmp → final rename can never collide with an existing
file of the same name.
--template '{{ filenamify .FileName }}' keeps just the original filename
(no DialogID_MessageID_ prefix).
--continue is kept so interrupted downloads resume rather than restart.
--skip-same is intentionally omitted — deduplication is handled upstream
by is_seen(), and --skip-same can cause the .tmp rename to fail when a
same-named file already exists in the directory.
"""
global_flags: list[str] = []
if TDL_NAMESPACE:
global_flags += ["-n", str(TDL_NAMESPACE)]
global_flags += ["-t", str(TDL_THREADS), "-l", str(TDL_PERFILE)]
url_flags: list[str] = []
for url in urls:
url_flags += ["-u", url]
dl_flags = [
"-d", str(staging_dir),
"--template", "{{ filenamify .FileName }}",
"--continue",
]
if TDL_TAKEOUT:
dl_flags.append("--takeout")
return ["tdl", *global_flags, "dl", *url_flags, *dl_flags]
# ─── Runner ───────────────────────────────────────────────────────────────────
# ANSI escape stripper — tdl emits colour codes even when not a TTY
import re as _re
_ANSI_RE = _re.compile(r"\x1b\[[0-9;]*[mGKHFJA-Z]|\x1b=|\x1b>|\x1b\[\?[0-9]+[hl]")
def _strip_ansi(text: str) -> str:
return _ANSI_RE.sub("", text)
async def _run_tdl(cmd: list[str], label: str) -> bool:
"""
Spawn tdl and handle output based on whether the TUI is running:
- TUI mode: pipe stdout+stderr, read raw chunks (NOT line-by-line),
split on both \\r and \\n, strip ANSI, post non-empty
segments immediately as EvTdlOutput.
tdl uses \\r to overwrite its progress bar in place, so
async-for-line on the stream would block until EOF.
Chunk-reading + manual split delivers progress live.
- CLI mode: inherit the terminal so tdl's progress bars render natively.
Returns True on exit code 0, False otherwise.
"""
log.debug(f"[tdl] cmd: {' '.join(cmd)}")
try:
if bus.tui_active:
proc = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
async def _relay(stream):
buf = ""
while True:
chunk = await stream.read(256)
if not chunk:
break
buf += chunk.decode(errors="replace")
# Split on both \r and \n; process all complete segments
parts = _re.split(r"[\r\n]", buf)
# Last element may be an incomplete segment — keep in buffer
buf = parts[-1]
for part in parts[:-1]:
clean = _strip_ansi(part).strip()
if clean:
bus.post(bus.EvTdlOutput(line=clean))
# Flush any remaining buffer content
if buf:
clean = _strip_ansi(buf).strip()
if clean:
bus.post(bus.EvTdlOutput(line=clean))
await asyncio.gather(_relay(proc.stdout), _relay(proc.stderr))
await proc.wait()
else:
proc = await asyncio.create_subprocess_exec(*cmd)
await proc.wait()
if proc.returncode == 0:
log.info(f"[tdl] ✓ {label}")
return True
else:
log.error(f"[tdl] ✗ exit {proc.returncode}{label}")
return False
except FileNotFoundError:
log.error("[tdl] binary not found at runtime")
return False
except Exception as e:
log.error(f"[tdl] Unexpected error: {e}")
return False
# ─── Staging dir helpers ──────────────────────────────────────────────────────
def _make_staging_dir() -> Path:
"""Create a unique staging subdirectory under TEMP_DIR for one batch."""
staging = TEMP_DIR.resolve() / f"_tdl_{int(time.monotonic_ns())}"
staging.mkdir(parents=True, exist_ok=True)
return staging
def _find_in_staging(staging: Path, expected_name: str) -> Path | None:
"""
Locate a downloaded file in the staging dir by matching its name.
filenamify() can munge characters (strips @, collapses unicode, etc.)
so we do a normalised stem comparison as a fallback.
"""
# Exact match first
exact = staging / expected_name
if exact.exists():
return exact
expected_stem = Path(expected_name).stem.lower().lstrip("@").replace(" ", "")
expected_suffix = Path(expected_name).suffix.lower()
for candidate in staging.iterdir():
if not candidate.is_file():
continue
if candidate.suffix.lower() != expected_suffix:
continue
cand_stem = candidate.stem.lower().lstrip("@").replace(" ", "")
if cand_stem == expected_stem:
return candidate
return None
def _move_from_staging(staging: Path, expected_name: str, final_dest: Path) -> bool:
"""
Find the file in staging, move it to final_dest, return True on success.
"""
found = _find_in_staging(staging, expected_name)
if not found:
log.warning(f"[tdl] Not found in staging: '{expected_name}' (staging: {staging})")
return False
try:
found.rename(final_dest)
log.debug(f"[tdl] Moved: {found.name}{final_dest}")
return True
except Exception as e:
log.error(f"[tdl] Move failed {found}{final_dest}: {e}")
return False
def _cleanup_staging(staging: Path) -> None:
try:
shutil.rmtree(staging, ignore_errors=True)
except Exception:
pass
# ─── Public API ───────────────────────────────────────────────────────────────
@dataclass
class BatchEntry:
"""Carries everything needed to process one file after a batch download."""
msg: object # Telethon Message
filename: str
dest: Path
doc_id: int
source_name: str
password: str | None
async def download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]:
"""
Download a batch of messages in a single tdl invocation.
Each batch gets its own staging subdirectory so filenames can never
collide with existing files in TEMP_DIR. After tdl exits, files are
moved from staging to their final dest paths.
Returns dict mapping doc_id → True (ready at entry.dest) / False (fallback needed).
"""
if not entries:
return {}
if not is_tdl_available():
log.warning("[tdl] not available — all entries need Telethon fallback")
return {e.doc_id: False for e in entries}
urls: list[str] = []
for entry in entries:
try:
urls.append(_build_message_url(entry.msg))
except ValueError as exc:
log.error(f"[tdl] Skipping {entry.filename}: {exc}")
urls.append("")
valid_entries = [(e, u) for e, u in zip(entries, urls) if u]
if not valid_entries:
return {e.doc_id: False for e in entries}
batch_id = f"batch_{int(time.monotonic_ns())}"
names = ", ".join(e.filename for e, _ in valid_entries)
log.info(f"[tdl] Batch ({len(valid_entries)} files): {names}")
# Notify TUI: all files in this batch are queued
for entry, _ in valid_entries:
size_mb = (entry.msg.media.document.size or 0) / (1024 * 1024)
bus.post(bus.EvDownloadQueued(
batch_id=batch_id,
filename=entry.filename,
size_mb=round(size_mb, 2),
source=entry.source_name,
password=entry.password,
))
staging = _make_staging_dir()
cmd = _build_cmd([u for _, u in valid_entries], staging)
# Signal batch started
for entry, _ in valid_entries:
bus.post(bus.EvDownloadStarted(batch_id=batch_id, filename=entry.filename))
tdl_ok = await _run_tdl(cmd, f"batch of {len(valid_entries)}")
results: dict[int, bool] = {}
for entry in entries:
if not any(e.doc_id == entry.doc_id for e, _ in valid_entries):
results[entry.doc_id] = False
continue
if tdl_ok:
moved = _move_from_staging(staging, entry.filename, entry.dest)
results[entry.doc_id] = moved
if moved:
bus.post(bus.EvDownloadDone(batch_id=batch_id, filename=entry.filename, via="tdl"))
else:
log.warning(f"[tdl] Fallback needed: {entry.filename}")
bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=entry.filename, reason="staging mismatch"))
else:
results[entry.doc_id] = False
bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=entry.filename, reason="tdl exit error"))
_cleanup_staging(staging)
return results
async def download_single_with_tdl(msg, dest: Path) -> bool:
"""
Download a single message with tdl. Used by the live handler and
bot_downloader where batching doesn't apply.
"""
if not is_tdl_available():
log.warning("[tdl] not available — falling back to Telethon")
return False
try:
url = _build_message_url(msg)
except ValueError as e:
log.error(f"[tdl] Cannot build URL: {e}")
return False
batch_id = f"single_{int(time.monotonic_ns())}"
size_mb = (msg.media.document.size or 0) / (1024 * 1024) if hasattr(msg, "media") and msg.media else 0
bus.post(bus.EvDownloadQueued(
batch_id=batch_id, filename=dest.name,
size_mb=round(size_mb, 2), source="live", password=None,
))
bus.post(bus.EvDownloadStarted(batch_id=batch_id, filename=dest.name))
staging = _make_staging_dir()
cmd = _build_cmd([url], staging)
log.info(f"[tdl] Single: {dest.name} ({url})")
tdl_ok = await _run_tdl(cmd, dest.name)
if tdl_ok:
result = _move_from_staging(staging, dest.name, dest)
else:
result = False
_cleanup_staging(staging)
if result:
bus.post(bus.EvDownloadDone(batch_id=batch_id, filename=dest.name, via="tdl"))
else:
bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=dest.name, reason="tdl failed"))
return result