Initial commit: ULPgrammer
- Core Telegram monitoring pipeline (scraper, processor, notifier, downloaders) - Textual TUI frontend with thread-safe event bus - SQLite persistence, severity scoring, dedup cache - Fixed ULP parser: handles https:// truncation, port+path URLs, semicolon separator - Test suite: 88 tests across scorer, cache, database, processor
This commit is contained in:
1
core/__init__.py
Normal file
1
core/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""core — Telegram I/O pipeline (scraper, downloader, processor, notifier)."""
|
||||
68
core/bot_downloader.md
Normal file
68
core/bot_downloader.md
Normal file
@@ -0,0 +1,68 @@
|
||||
# core/bot_downloader.py
|
||||
|
||||
Handles "click to download" inline button flows. Some Telegram channels post files via a bot behind a button rather than directly attaching them.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from core.bot_downloader import (
|
||||
handle_bot_download_message,
|
||||
has_download_button,
|
||||
extract_password,
|
||||
)
|
||||
```
|
||||
|
||||
### `handle_bot_download_message(client, bot, msg, source_name, patterns, password=None)`
|
||||
**async.** Full pipeline:
|
||||
1. Detect download button
|
||||
2. Click it (URL button → `/start payload` to the bot; callback button → `.click()`)
|
||||
3. Wait up to `BOT_REPLY_TIMEOUT` seconds for the bot to send a file back
|
||||
4. Hand each file response to `core.scraper.handle_message()`
|
||||
|
||||
### `has_download_button(msg) -> bool`
|
||||
Returns `True` if the message contains a recognisable download button.
|
||||
Checked in live handler and backfill before calling this module.
|
||||
|
||||
### `extract_password(msg) -> str | None`
|
||||
Scans message text for `Pass: ...` / `Password: ...` / `Contraseña: ...` patterns.
|
||||
Returns the extracted password string, or `None`.
|
||||
|
||||
---
|
||||
|
||||
## Button detection
|
||||
|
||||
Recognised button text keywords (case-insensitive):
|
||||
```
|
||||
DOWNLOAD, DESCARGAR, GET FILE, GET PACK, ⬇, 📥
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## URL button flow (most common)
|
||||
|
||||
```
|
||||
Button URL: https://t.me/SomeBot?start=ABC123
|
||||
→ parse bot username + payload
|
||||
→ client.send_message(bot_entity, "/start ABC123")
|
||||
→ poll get_messages(bot_entity, limit=3) every 1s for BOT_REPLY_TIMEOUT seconds
|
||||
→ return file messages found
|
||||
```
|
||||
|
||||
## Callback button flow (fallback)
|
||||
|
||||
```
|
||||
btn.click()
|
||||
→ sleep 2s
|
||||
→ get_messages(sender, limit=5)
|
||||
→ return file messages found
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Constants
|
||||
|
||||
| Name | Value | Description |
|
||||
|------|-------|-------------|
|
||||
| `BOT_REPLY_TIMEOUT` | `10` | Seconds to wait for bot file reply |
|
||||
| `DOWNLOAD_BUTTON_KEYWORDS` | see above | Button text triggers |
|
||||
| `PASSWORD_PATTERN` | regex | Matches `Pass[word]: value` in message text |
|
||||
161
core/bot_downloader.py
Normal file
161
core/bot_downloader.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""
|
||||
bot_downloader.py — Handles "click to download" inline button flows.
|
||||
|
||||
Some Telegram channels post messages with a DOWNLOAD button that triggers
|
||||
a bot to send you the actual file. This module simulates that click and
|
||||
captures the bot's file response.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import logging
|
||||
|
||||
from telethon import TelegramClient
|
||||
from telethon.tl.types import MessageMediaDocument, KeyboardButtonUrl
|
||||
from telethon.errors import FloodWaitError
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
DOWNLOAD_BUTTON_KEYWORDS = ["DOWNLOAD", "DESCARGAR", "GET FILE", "GET PACK", "⬇", "📥"]
|
||||
BOT_REPLY_TIMEOUT = 10
|
||||
|
||||
PASSWORD_PATTERN = re.compile(
|
||||
r"(?:Pass|Password|Contraseña|Contrasena|Clave)[\s]*:[\s]*(.+)$",
|
||||
re.IGNORECASE | re.MULTILINE
|
||||
)
|
||||
|
||||
|
||||
# ─── Password extraction ──────────────────────────────────────────────────────
|
||||
|
||||
def extract_password(msg) -> str | None:
|
||||
if not msg.text:
|
||||
return None
|
||||
match = PASSWORD_PATTERN.search(msg.text)
|
||||
if match:
|
||||
pwd = match.group(1).strip()
|
||||
# Strip markdown formatting characters
|
||||
pwd = pwd.strip("*`_~")
|
||||
log.info(f" Found password in message: '{pwd}'")
|
||||
return pwd
|
||||
return None
|
||||
|
||||
|
||||
# ─── Button detection ─────────────────────────────────────────────────────────
|
||||
|
||||
def find_download_button(msg):
|
||||
"""
|
||||
Scans a message's inline keyboard for a download-like button.
|
||||
Returns the button object or None.
|
||||
"""
|
||||
if not msg.buttons:
|
||||
return None
|
||||
for row in msg.buttons:
|
||||
for btn in row:
|
||||
if any(kw in btn.text.upper() for kw in DOWNLOAD_BUTTON_KEYWORDS):
|
||||
return btn
|
||||
return None
|
||||
|
||||
|
||||
def has_download_button(msg) -> bool:
|
||||
return find_download_button(msg) is not None
|
||||
|
||||
|
||||
# ─── Click + wait flow ────────────────────────────────────────────────────────
|
||||
|
||||
async def click_download_button(client: TelegramClient, msg) -> list:
|
||||
"""
|
||||
Clicks the download button on a message, then waits for the bot to reply
|
||||
with a file. Returns a list of response messages containing documents.
|
||||
"""
|
||||
btn = find_download_button(msg)
|
||||
if not btn:
|
||||
return []
|
||||
|
||||
log.info(f" Clicking button: '{btn.text}'")
|
||||
|
||||
# ── URL button (most common) ───────────────────────────────────────────
|
||||
if isinstance(btn.button, KeyboardButtonUrl):
|
||||
url = btn.button.url # e.g. https://t.me/SomeBot?start=ABC123
|
||||
|
||||
match = re.search(r"t\.me/([A-Za-z0-9_]+)\?start=(.+)", url)
|
||||
if not match:
|
||||
log.warning(f" Unrecognised URL format: {url}")
|
||||
return []
|
||||
|
||||
bot_username, payload = match.group(1), match.group(2)
|
||||
log.info(f" → Messaging @{bot_username} with /start {payload}")
|
||||
|
||||
try:
|
||||
bot_entity = await client.get_entity(bot_username)
|
||||
await client.send_message(bot_entity, f"/start {payload}")
|
||||
except Exception as e:
|
||||
log.error(f" Failed to message bot: {e}")
|
||||
return []
|
||||
|
||||
# Poll for reply
|
||||
log.info(f" Waiting up to {BOT_REPLY_TIMEOUT}s for bot reply...")
|
||||
for _ in range(BOT_REPLY_TIMEOUT):
|
||||
await asyncio.sleep(1)
|
||||
try:
|
||||
recent = await client.get_messages(bot_entity, limit=3)
|
||||
files = [m for m in recent if m.media and isinstance(m.media, MessageMediaDocument)]
|
||||
if files:
|
||||
log.info(f" ✓ Got file from bot.")
|
||||
return files
|
||||
except Exception as e:
|
||||
log.warning(f" Poll error: {e}")
|
||||
break
|
||||
|
||||
log.warning(f" Bot did not reply within {BOT_REPLY_TIMEOUT}s.")
|
||||
return []
|
||||
|
||||
# ── Callback button (less common) ─────────────────────────────────────
|
||||
else:
|
||||
try:
|
||||
await btn.click()
|
||||
await asyncio.sleep(2)
|
||||
except Exception as e:
|
||||
log.error(f" Callback click failed: {e}")
|
||||
return []
|
||||
|
||||
try:
|
||||
sender = await msg.get_sender()
|
||||
recent = await client.get_messages(sender, limit=5)
|
||||
return [m for m in recent if m.media and isinstance(m.media, MessageMediaDocument)]
|
||||
except Exception as e:
|
||||
log.warning(f" Fallback poll failed: {e}")
|
||||
return []
|
||||
|
||||
|
||||
# ─── Main entry point ─────────────────────────────────────────────────────────
|
||||
|
||||
async def handle_bot_download_message(
|
||||
client: TelegramClient,
|
||||
bot: TelegramClient,
|
||||
msg,
|
||||
source_name: str,
|
||||
patterns,
|
||||
password: str | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Full pipeline for a message with a download button:
|
||||
1. Detect download button
|
||||
2. Click it
|
||||
3. Wait for bot to send back a file
|
||||
4. Hand off to the normal handle_message() flow
|
||||
"""
|
||||
if not has_download_button(msg):
|
||||
return
|
||||
|
||||
log.info(f"[BotDL] Download button detected in {source_name}")
|
||||
|
||||
responses = await click_download_button(client, msg)
|
||||
|
||||
if not responses:
|
||||
log.warning(f"[BotDL] No file received for message in {source_name}.")
|
||||
return
|
||||
|
||||
from core.scraper import handle_message
|
||||
for resp in responses:
|
||||
log.info(f" [BotDL] Response media type: {type(resp.media).__name__}, attrs: {getattr(resp.media.document, 'attributes', []) if hasattr(resp.media, 'document') else 'none'}")
|
||||
await handle_message(client, bot, resp, f"{source_name}[bot]", patterns, password=password)
|
||||
67
core/notifier.md
Normal file
67
core/notifier.md
Normal file
@@ -0,0 +1,67 @@
|
||||
# core/notifier.py
|
||||
|
||||
Scores hits, deduplicates, persists to disk and DB, sends Telegram alerts.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from core.notifier import notify, send_status
|
||||
```
|
||||
|
||||
### `notify(bot, hits: list[str], source: str, filename: str)`
|
||||
**async.** Full notification pipeline:
|
||||
1. `score_hits(hits)` → `list[ScoredHit]`
|
||||
2. Deduplicate via SHA-256 hashes (`data/dedup.json`)
|
||||
3. `insert_hits()` into SQLite for new + dupes (flagged accordingly)
|
||||
4. `write_hits()` → append to `data/hits.txt`
|
||||
5. `write_hits_csv()` → append to `data/hits.csv`
|
||||
6. `send_alert()` → Telegram message for CRITICAL/HIGH/MEDIUM only
|
||||
7. Post `EvHit` events onto the TUI bus for each new hit
|
||||
|
||||
### `send_status(bot, message: str)`
|
||||
**async.** Sends a plain Markdown message to `config.NOTIFY_CHAT_ID`. Used for startup/status notifications.
|
||||
|
||||
---
|
||||
|
||||
## Internal functions
|
||||
|
||||
| Function | Description |
|
||||
|----------|-------------|
|
||||
| `deduplicate(hits)` | Returns `(new_hits, dupe_hits)`; updates `data/dedup.json` |
|
||||
| `write_hits(scored_hits, source)` | Appends grouped human-readable block to `data/hits.txt` |
|
||||
| `write_hits_csv(scored_hits, source, filename)` | Appends rows to `data/hits.csv`; writes header on first call |
|
||||
| `send_alert(bot, scored_hits, source, filename)` | Sends Telegram message grouped by severity; skips if all LOW |
|
||||
|
||||
---
|
||||
|
||||
## Output files
|
||||
|
||||
| File | Format | Notes |
|
||||
|------|--------|-------|
|
||||
| `data/hits.txt` | Plain text, grouped by severity | Human-readable, append-only |
|
||||
| `data/hits.csv` | CSV with header | Columns: `timestamp, severity, score, url, username, password, reasons, source, filename` |
|
||||
| `data/dedup.json` | JSON array of SHA-256 hex strings | Hashes of `line.strip().lower()` |
|
||||
|
||||
---
|
||||
|
||||
## Alert behaviour
|
||||
|
||||
- CRITICAL / HIGH / MEDIUM → Telegram alert sent immediately
|
||||
- LOW → stored in DB + files, **no** Telegram alert
|
||||
- Duplicates → stored in DB with `seen_before=1`, no alert, no file write
|
||||
|
||||
## Telegram alert format
|
||||
|
||||
```
|
||||
🚨 Credential hit(s) detected
|
||||
📁 `filename`
|
||||
📢 `source`
|
||||
🕐 `timestamp`
|
||||
|
||||
Summary: 🔴 N 🟠 N 🟡 N 🟢 N
|
||||
|
||||
🔴 CRITICAL (N)
|
||||
`url:user:pass`
|
||||
↳ reason | reason
|
||||
... (up to 10 per severity; remainder counted)
|
||||
```
|
||||
248
core/notifier.py
Normal file
248
core/notifier.py
Normal file
@@ -0,0 +1,248 @@
|
||||
"""
|
||||
notifier.py — Persists hits to disk and sends Telegram bot alerts.
|
||||
|
||||
Includes:
|
||||
- Severity scoring via scorer.py
|
||||
- Deduplication: same credential never written or alerted twice
|
||||
- SQLite storage via database.py
|
||||
- hits.txt kept as a human-readable backup
|
||||
- Telegram alerts grouped by severity
|
||||
"""
|
||||
|
||||
import logging
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from telethon import TelegramClient
|
||||
|
||||
import csv
|
||||
|
||||
from config import HITS_FILE, NOTIFY_CHAT_ID
|
||||
from utils.scorer import score_hits, summarize, CRITICAL, HIGH, MEDIUM, LOW, SEVERITY_EMOJI
|
||||
from utils.database import insert_hits
|
||||
from tui import events as bus
|
||||
|
||||
HITS_CSV = HITS_FILE.with_suffix(".csv")
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
MAX_PREVIEW = 10 # hits to show per severity group in alert
|
||||
DEDUP_FILE = Path("./data/dedup.json")
|
||||
|
||||
# Only alert immediately for these severities — LOW hits are silent
|
||||
ALERT_SEVERITIES = {CRITICAL, HIGH, MEDIUM}
|
||||
|
||||
|
||||
# ─── Deduplication ────────────────────────────────────────────────────────────
|
||||
|
||||
def _hash(line: str) -> str:
|
||||
return hashlib.sha256(line.strip().lower().encode()).hexdigest()
|
||||
|
||||
|
||||
def _load_seen_hashes() -> set:
|
||||
if not DEDUP_FILE.exists():
|
||||
return set()
|
||||
try:
|
||||
with open(DEDUP_FILE, "r") as f:
|
||||
return set(json.load(f))
|
||||
except Exception:
|
||||
return set()
|
||||
|
||||
|
||||
def _save_seen_hashes(seen: set) -> None:
|
||||
try:
|
||||
with open(DEDUP_FILE, "w") as f:
|
||||
json.dump(list(seen), f)
|
||||
except Exception as e:
|
||||
log.warning(f"Could not save dedup file: {e}")
|
||||
|
||||
|
||||
def deduplicate(hits: list) -> tuple[list, list]:
|
||||
"""
|
||||
Accepts a list of ScoredHit objects.
|
||||
Returns (new_hits, dupe_hits).
|
||||
"""
|
||||
seen = _load_seen_hashes()
|
||||
new_hits = []
|
||||
dupe_hits = []
|
||||
new_hashes = set()
|
||||
|
||||
for h in hits:
|
||||
digest = _hash(h.raw)
|
||||
if digest in seen:
|
||||
dupe_hits.append(h)
|
||||
else:
|
||||
new_hits.append(h)
|
||||
new_hashes.add(digest)
|
||||
|
||||
if new_hashes:
|
||||
seen.update(new_hashes)
|
||||
_save_seen_hashes(seen)
|
||||
|
||||
log.info(
|
||||
f" Dedup: {len(hits)} raw hit(s) → "
|
||||
f"{len(new_hits)} new, {len(dupe_hits)} duplicate(s)"
|
||||
)
|
||||
return new_hits, dupe_hits
|
||||
|
||||
|
||||
# ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def _timestamp() -> str:
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
|
||||
|
||||
|
||||
# ─── Output ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def write_hits(scored_hits: list, source: str) -> None:
|
||||
"""Append new hits to hits.txt grouped by severity."""
|
||||
HITS_FILE.parent.mkdir(parents=True, exist_ok=True)
|
||||
summary = summarize(scored_hits)
|
||||
|
||||
with open(HITS_FILE, "a", encoding="utf-8") as f:
|
||||
f.write(f"\n{'='*60}\n")
|
||||
f.write(f"Source : {source}\n")
|
||||
f.write(f"Time : {_timestamp()}\n")
|
||||
f.write(f"Hits : {len(scored_hits)} ")
|
||||
f.write(f"(CRITICAL={summary[CRITICAL]} HIGH={summary[HIGH]} ")
|
||||
f.write(f"MEDIUM={summary[MEDIUM]} LOW={summary[LOW]})\n")
|
||||
f.write(f"{'='*60}\n")
|
||||
|
||||
for severity in [CRITICAL, HIGH, MEDIUM, LOW]:
|
||||
group = [h for h in scored_hits if h.severity == severity]
|
||||
if not group:
|
||||
continue
|
||||
emoji = SEVERITY_EMOJI[severity]
|
||||
f.write(f"\n{emoji} {severity} ({len(group)})\n")
|
||||
for h in group:
|
||||
f.write(f" {h.raw}\n")
|
||||
f.write(f" → {' | '.join(h.reasons)}\n")
|
||||
|
||||
log.info(f" Wrote {len(scored_hits)} hit(s) to {HITS_FILE}")
|
||||
|
||||
|
||||
def write_hits_csv(scored_hits: list, source: str, filename: str) -> None:
|
||||
"""Append new hits to hits.csv — one row per hit, easy to import."""
|
||||
HITS_CSV.parent.mkdir(parents=True, exist_ok=True)
|
||||
write_header = not HITS_CSV.exists()
|
||||
timestamp = _timestamp()
|
||||
with open(HITS_CSV, "a", newline="", encoding="utf-8") as f:
|
||||
writer = csv.writer(f)
|
||||
if write_header:
|
||||
writer.writerow([
|
||||
"timestamp", "severity", "score", "url", "username",
|
||||
"password", "reasons", "source", "filename",
|
||||
])
|
||||
for h in scored_hits:
|
||||
writer.writerow([
|
||||
timestamp, h.severity, h.score,
|
||||
h.url or "", h.username or "", h.password or "",
|
||||
" | ".join(h.reasons), source, filename,
|
||||
])
|
||||
log.info(f" Wrote {len(scored_hits)} hit(s) to {HITS_CSV}")
|
||||
|
||||
|
||||
async def send_alert(
|
||||
bot: TelegramClient,
|
||||
scored_hits: list,
|
||||
source: str,
|
||||
filename: str,
|
||||
) -> None:
|
||||
"""
|
||||
Send a Telegram alert grouped by severity.
|
||||
Only includes CRITICAL, HIGH, MEDIUM — LOW hits are omitted from alerts.
|
||||
"""
|
||||
summary = summarize(scored_hits)
|
||||
alertable = [h for h in scored_hits if h.severity in ALERT_SEVERITIES]
|
||||
|
||||
if not alertable:
|
||||
log.info(" No alertable hits (all LOW) — skipping Telegram notification.")
|
||||
return
|
||||
|
||||
lines = [
|
||||
f"🚨 *Credential hit(s) detected*",
|
||||
f"",
|
||||
f"📁 `{filename}`",
|
||||
f"📢 `{source}`",
|
||||
f"🕐 `{_timestamp()}`",
|
||||
f"",
|
||||
f"*Summary:*",
|
||||
f"🔴 CRITICAL: `{summary[CRITICAL]}` "
|
||||
f"🟠 HIGH: `{summary[HIGH]}` "
|
||||
f"🟡 MEDIUM: `{summary[MEDIUM]}` "
|
||||
f"🟢 LOW: `{summary[LOW]}`",
|
||||
]
|
||||
|
||||
for severity in [CRITICAL, HIGH, MEDIUM]:
|
||||
group = [h for h in scored_hits if h.severity == severity]
|
||||
if not group:
|
||||
continue
|
||||
emoji = SEVERITY_EMOJI[severity]
|
||||
lines.append(f"\n{emoji} *{severity}* ({len(group)})")
|
||||
for h in group[:MAX_PREVIEW]:
|
||||
safe = h.raw.replace("`", "'")
|
||||
lines.append(f"`{safe}`")
|
||||
lines.append(f"_↳ {' | '.join(h.reasons)}_")
|
||||
if len(group) > MAX_PREVIEW:
|
||||
lines.append(f"_...and {len(group) - MAX_PREVIEW} more_")
|
||||
|
||||
try:
|
||||
await bot.send_message(NOTIFY_CHAT_ID, "\n".join(lines), parse_mode="markdown")
|
||||
except Exception as e:
|
||||
log.error(f"Failed to send Telegram alert: {e}")
|
||||
|
||||
|
||||
# ─── Main entry point ────────────────────────────────────────────────────────
|
||||
|
||||
async def notify(bot: TelegramClient, hits: list[str], source: str, filename: str) -> None:
|
||||
"""
|
||||
Full notification pipeline:
|
||||
1. Score all hits
|
||||
2. Deduplicate
|
||||
3. Insert all hits into SQLite (new + dupes, flagged accordingly)
|
||||
4. Write new hits to hits.txt
|
||||
5. Send Telegram alert for new alertable hits only
|
||||
"""
|
||||
if not hits:
|
||||
return
|
||||
|
||||
# Score first
|
||||
scored = score_hits(hits)
|
||||
log.info(f" Scored {len(scored)} hit(s) — {summarize(scored)}")
|
||||
|
||||
# Deduplicate
|
||||
new_hits, dupe_hits = deduplicate(scored)
|
||||
|
||||
# Always insert into DB
|
||||
if new_hits:
|
||||
insert_hits(new_hits, source, filename, seen_before=False)
|
||||
if dupe_hits:
|
||||
insert_hits(dupe_hits, source, filename, seen_before=True)
|
||||
|
||||
if not new_hits:
|
||||
log.info(" All hits already seen before — no alert sent.")
|
||||
return
|
||||
|
||||
# Push hits to TUI
|
||||
for h in new_hits:
|
||||
bus.post(bus.EvHit(
|
||||
severity=h.severity,
|
||||
raw=h.raw,
|
||||
source=source,
|
||||
filename=filename,
|
||||
reasons=h.reasons,
|
||||
))
|
||||
|
||||
write_hits(new_hits, source)
|
||||
write_hits_csv(new_hits, source, filename)
|
||||
await send_alert(bot, new_hits, source, filename)
|
||||
|
||||
|
||||
async def send_status(bot: TelegramClient, message: str) -> None:
|
||||
"""Send a plain status/info message to the notify chat."""
|
||||
try:
|
||||
await bot.send_message(NOTIFY_CHAT_ID, message, parse_mode="markdown")
|
||||
except Exception as e:
|
||||
log.error(f"Failed to send status message: {e}")
|
||||
69
core/processor.md
Normal file
69
core/processor.md
Normal file
@@ -0,0 +1,69 @@
|
||||
# core/processor.py
|
||||
|
||||
Archive extraction and hit searching. No Telegram deps, no async.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from core.processor import compile_patterns, process_file
|
||||
```
|
||||
|
||||
### `compile_patterns(keywords: list[str]) -> list[re.Pattern]`
|
||||
Compiles a list of keyword strings into case-insensitive regex patterns.
|
||||
Call once at startup; pass the result everywhere patterns are needed.
|
||||
|
||||
```python
|
||||
patterns = compile_patterns(config.TARGET_KEYWORDS)
|
||||
```
|
||||
|
||||
### `process_file(filepath: Path, patterns, password=None) -> list[str]`
|
||||
Full pipeline: unpack → search each `.txt` → recurse into nested archives → clean up everything.
|
||||
Returns list of matching raw lines (hits). Deletes the original file and all extracted contents on completion.
|
||||
|
||||
```python
|
||||
hits = process_file(Path("data/tmp/combo.zip"), patterns, password="infected")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Internal functions
|
||||
|
||||
| Function | Signature | Description |
|
||||
|----------|-----------|-------------|
|
||||
| `search_file` | `(filepath, patterns) -> list[str]` | Stream-reads `.txt` line by line; ignores encoding errors |
|
||||
| `unpack` | `(filepath, extra_password) -> (files, extract_dir\|None)` | Dispatches to correct extractor; plain `.txt` returned as-is |
|
||||
| `extract_zip` | `(filepath, dest, extra_password)` | Tries no password first, then `ARCHIVE_PASSWORDS` list |
|
||||
| `extract_7z` | `(filepath, dest, extra_password)` | Requires `py7zr`; skips if not installed |
|
||||
| `extract_rar` | `(filepath, dest, extra_password)` | Requires `rarfile` + `unrar` binary |
|
||||
| `_try_passwords` | `(extract_fn, passwords)` | Iterates password list, stops on first success |
|
||||
|
||||
---
|
||||
|
||||
## Supported formats
|
||||
|
||||
| Extension | Library | Notes |
|
||||
|-----------|---------|-------|
|
||||
| `.txt` | built-in | Stream-read, no load into memory |
|
||||
| `.zip` | `zipfile` | stdlib |
|
||||
| `.7z` | `py7zr` | optional; skipped if not installed |
|
||||
| `.rar` | `rarfile` | optional; requires `unrar` system binary |
|
||||
|
||||
Nested archives are recursed **one level** only.
|
||||
|
||||
---
|
||||
|
||||
## Password order
|
||||
|
||||
1. `extra_password` (from message/channel carry-forward) — tried first
|
||||
2. `config.ARCHIVE_PASSWORDS` — tried in order
|
||||
|
||||
---
|
||||
|
||||
## Cleanup guarantee
|
||||
|
||||
`process_file` always deletes:
|
||||
- Extracted individual files
|
||||
- Extract subdirectory
|
||||
- Original downloaded file
|
||||
|
||||
Even if no hits are found.
|
||||
233
core/processor.py
Normal file
233
core/processor.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""
|
||||
processor.py — Archive extraction and hit searching logic.
|
||||
|
||||
Supports: .txt, .zip, .7z, .rar
|
||||
Stream-processes files line by line — safe for large combo lists.
|
||||
"""
|
||||
|
||||
import rarfile
|
||||
rarfile.UNRAR_TOOL = "unrar"
|
||||
|
||||
import re
|
||||
import zipfile
|
||||
import logging
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import py7zr
|
||||
HAS_7Z = True
|
||||
except ImportError:
|
||||
HAS_7Z = False
|
||||
|
||||
try:
|
||||
import rarfile
|
||||
HAS_RAR = True
|
||||
except ImportError:
|
||||
HAS_RAR = False
|
||||
|
||||
from config import ARCHIVE_PASSWORDS
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ─── Searching ───────────────────────────────────────────────────────────────
|
||||
|
||||
def compile_patterns(keywords: list[str]) -> list[re.Pattern]:
|
||||
return [re.compile(kw, re.IGNORECASE) for kw in keywords]
|
||||
|
||||
|
||||
def search_file(filepath: Path, patterns: list[re.Pattern]) -> list[str]:
|
||||
"""
|
||||
Stream-reads a text file line by line and returns lines matching any pattern.
|
||||
Ignores encoding errors — combo files are often messy.
|
||||
"""
|
||||
hits: list[str] = []
|
||||
try:
|
||||
with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
|
||||
for line in f:
|
||||
stripped = line.strip()
|
||||
if stripped and any(p.search(stripped) for p in patterns):
|
||||
hits.append(stripped)
|
||||
except Exception as e:
|
||||
log.warning(f"Could not read {filepath.name}: {e}")
|
||||
return hits
|
||||
|
||||
|
||||
# ─── Extraction ──────────────────────────────────────────────────────────────
|
||||
|
||||
def _try_passwords(extract_fn, passwords: list[bytes]) -> bool:
|
||||
"""Try a list of passwords against an extract function. Returns True on success."""
|
||||
for pwd in passwords:
|
||||
try:
|
||||
extract_fn(pwd)
|
||||
return True
|
||||
except Exception:
|
||||
continue
|
||||
return False
|
||||
|
||||
|
||||
def extract_zip(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
|
||||
passwords = ARCHIVE_PASSWORDS.copy()
|
||||
if extra_password:
|
||||
passwords.insert(0, extra_password.encode())
|
||||
extracted: list[Path] = []
|
||||
try:
|
||||
with zipfile.ZipFile(filepath) as zf:
|
||||
def try_extract(pwd: bytes):
|
||||
zf.extractall(dest, pwd=pwd or None)
|
||||
|
||||
try:
|
||||
zf.extractall(dest)
|
||||
except RuntimeError:
|
||||
log.info(f" ZIP is password-protected, trying common passwords...")
|
||||
if not _try_passwords(try_extract, ARCHIVE_PASSWORDS):
|
||||
log.warning(f" Could not unlock {filepath.name} — skipping.")
|
||||
return []
|
||||
|
||||
extracted = [p for p in dest.rglob("*") if p.is_file()]
|
||||
except zipfile.BadZipFile:
|
||||
log.warning(f" {filepath.name} is not a valid ZIP.")
|
||||
except Exception as e:
|
||||
log.warning(f" ZIP extraction error on {filepath.name}: {e}")
|
||||
return extracted
|
||||
|
||||
|
||||
def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
|
||||
if not HAS_7Z:
|
||||
log.warning("py7zr not installed — skipping .7z file.")
|
||||
return []
|
||||
extracted: list[Path] = []
|
||||
passwords = ARCHIVE_PASSWORDS.copy()
|
||||
if extra_password:
|
||||
passwords.insert(0, extra_password.encode())
|
||||
|
||||
try:
|
||||
# Try without password first
|
||||
try:
|
||||
with py7zr.SevenZipFile(filepath, mode="r") as z:
|
||||
z.extractall(dest)
|
||||
except py7zr.exceptions.PasswordRequired:
|
||||
log.info(f" 7z is password-protected, trying common passwords...")
|
||||
success = False
|
||||
for pwd in ARCHIVE_PASSWORDS:
|
||||
try:
|
||||
with py7zr.SevenZipFile(filepath, mode="r", password=pwd.decode()) as z:
|
||||
z.extractall(dest)
|
||||
success = True
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
if not success:
|
||||
log.warning(f" Could not unlock {filepath.name} — skipping.")
|
||||
return []
|
||||
|
||||
extracted = [p for p in dest.rglob("*") if p.is_file()]
|
||||
except Exception as e:
|
||||
log.warning(f" 7z extraction error on {filepath.name}: {e}")
|
||||
return extracted
|
||||
|
||||
|
||||
def extract_rar(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
|
||||
if not HAS_RAR:
|
||||
log.warning("rarfile not installed — skipping .rar file.")
|
||||
return []
|
||||
|
||||
passwords = ARCHIVE_PASSWORDS.copy()
|
||||
if extra_password:
|
||||
passwords.insert(0, extra_password.encode())
|
||||
extracted: list[Path] = []
|
||||
try:
|
||||
with rarfile.RarFile(filepath) as rf:
|
||||
def try_extract(pwd: bytes):
|
||||
rf.extractall(dest, pwd=pwd.decode() if pwd else None)
|
||||
|
||||
try:
|
||||
rf.extractall(dest)
|
||||
except rarfile.BadRarFile:
|
||||
log.warning(f" {filepath.name} is not a valid RAR.")
|
||||
return []
|
||||
except Exception:
|
||||
log.info(f" RAR may be password-protected, trying common passwords...")
|
||||
if not _try_passwords(try_extract, ARCHIVE_PASSWORDS):
|
||||
log.warning(f" Could not unlock {filepath.name} — skipping.")
|
||||
return []
|
||||
|
||||
extracted = [p for p in dest.rglob("*") if p.is_file()]
|
||||
except Exception as e:
|
||||
log.warning(f" RAR extraction error on {filepath.name}: {e}")
|
||||
return extracted
|
||||
|
||||
|
||||
def unpack(filepath: Path, extra_password: str | None = None) -> tuple[list[Path], Path | None]:
|
||||
"""
|
||||
Unpacks an archive into a sibling directory.
|
||||
Returns (list of extracted files, extract_dir or None).
|
||||
If it's not an archive, returns ([filepath], None).
|
||||
"""
|
||||
suffix = filepath.suffix.lower()
|
||||
extract_dir = filepath.parent / filepath.stem
|
||||
|
||||
if suffix == ".zip":
|
||||
extract_dir.mkdir(exist_ok=True)
|
||||
files = extract_zip(filepath, extract_dir, extra_password)
|
||||
return files, extract_dir
|
||||
|
||||
elif suffix == ".7z":
|
||||
extract_dir.mkdir(exist_ok=True)
|
||||
files = extract_7z(filepath, extract_dir, extra_password)
|
||||
return files, extract_dir
|
||||
|
||||
elif suffix == ".rar":
|
||||
extract_dir.mkdir(exist_ok=True)
|
||||
files = extract_rar(filepath, extract_dir, extra_password)
|
||||
return files, extract_dir
|
||||
|
||||
else:
|
||||
# Plain file — return as-is, no extract dir to clean up
|
||||
return [filepath], None
|
||||
|
||||
|
||||
# ─── Main entry point ────────────────────────────────────────────────────────
|
||||
|
||||
def process_file(filepath: Path, patterns, password: str | None = None) -> list[str]:
|
||||
"""
|
||||
Full pipeline: unpack → search each file → clean up everything.
|
||||
Returns list of matching lines (hits).
|
||||
"""
|
||||
log.info(f" Processing: {filepath.name}")
|
||||
all_hits: list[str] = []
|
||||
|
||||
files, extract_dir = unpack(filepath, extra_password=password)
|
||||
|
||||
for f in files:
|
||||
if f.suffix.lower() == ".txt":
|
||||
hits = search_file(f, patterns)
|
||||
if hits:
|
||||
log.info(f" ✓ {len(hits)} hit(s) in {f.name}")
|
||||
all_hits.extend(hits)
|
||||
|
||||
# Nested archives — recurse one level
|
||||
elif f.suffix.lower() in {".zip", ".7z", ".rar"} and f != filepath:
|
||||
log.info(f" → Nested archive: {f.name}")
|
||||
nested_hits = process_file(f, patterns)
|
||||
all_hits.extend(nested_hits)
|
||||
continue # process_file already cleaned up f
|
||||
|
||||
# Clean up extracted file
|
||||
try:
|
||||
f.unlink(missing_ok=True)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Clean up extract dir
|
||||
if extract_dir and extract_dir.exists():
|
||||
shutil.rmtree(extract_dir, ignore_errors=True)
|
||||
|
||||
# Clean up original download
|
||||
try:
|
||||
filepath.unlink(missing_ok=True)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return all_hits
|
||||
65
core/scraper.md
Normal file
65
core/scraper.md
Normal file
@@ -0,0 +1,65 @@
|
||||
# core/scraper.py
|
||||
|
||||
Telethon user-client layer. Handles live listening, backfill, and the single-message download pipeline.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from core.scraper import handle_message, backfill_all, register_handlers, warm_entity_cache
|
||||
```
|
||||
|
||||
### `handle_message(client, bot, msg, source_name, patterns, password=None)`
|
||||
**async.** Full pipeline for one document message:
|
||||
1. Extract filename + size, check allowlist + size guard
|
||||
2. Check `utils.cache` — skip if already seen
|
||||
3. Try `tdl` download → Telethon fallback
|
||||
4. `core.processor.process_file()` → hits
|
||||
5. `core.notifier.notify()` if hits found
|
||||
6. `utils.cache.mark_seen()`
|
||||
|
||||
Called by: live handler, `bot_downloader`, backfill fallback path.
|
||||
|
||||
### `backfill_all(client, bot, patterns)`
|
||||
**async.** Iterates `config.WATCHED_CHANNELS`, calls `backfill_channel()` for each.
|
||||
No-op if `config.BACKFILL_LIMIT == 0`.
|
||||
|
||||
### `register_handlers(client, bot, patterns)`
|
||||
Registers a `NewMessage` Telethon event handler on `config.WATCHED_CHANNELS`.
|
||||
Used in **CLI mode only** (`--no-tui`). The TUI manages its own handler via `_make_handler()` in `tui/app.py`.
|
||||
|
||||
### `warm_entity_cache(client)`
|
||||
**async.** Iterates `client.iter_dialogs()` so Telethon caches entity mappings.
|
||||
Must be called before using raw numeric channel IDs.
|
||||
|
||||
---
|
||||
|
||||
## Internal functions
|
||||
|
||||
| Function | Description |
|
||||
|----------|-------------|
|
||||
| `get_filename(msg)` | Extracts filename from `MessageMediaDocument`; falls back to `{msg_id}{ext}` from MIME |
|
||||
| `get_filesize(msg)` | Returns document size in bytes |
|
||||
| `is_processable(filename, size)` | Checks extension allowlist + size limit; returns `(bool, reason)` |
|
||||
| `_make_dest(msg, filename)` | Resolves temp path, handles collision with `{msg_id}_{filename}` |
|
||||
| `_telethon_download(client, msg, dest, ...)` | Telethon fallback with tqdm progress + flood-wait handling. Posts `EvDownload*` bus events |
|
||||
| `backfill_channel(client, bot, channel, patterns, limit)` | Scans history with password carry-forward; batches via tdl |
|
||||
| `_process_batch(client, bot, batch, patterns)` | One tdl invocation for up to `TDL_AMOUNT` messages; per-file Telethon fallback |
|
||||
|
||||
---
|
||||
|
||||
## Password carry-forward (backfill)
|
||||
|
||||
Channels often post the archive password as a separate text message.
|
||||
`backfill_channel` iterates newest→oldest, carrying `last_password` so both older and newer file messages in the same scan pick it up.
|
||||
|
||||
---
|
||||
|
||||
## Download strategy
|
||||
|
||||
```
|
||||
is_tdl_available()?
|
||||
yes → download_single_with_tdl() / download_batch_with_tdl()
|
||||
↓ failed?
|
||||
_telethon_download()
|
||||
no → _telethon_download() directly
|
||||
```
|
||||
410
core/scraper.py
Normal file
410
core/scraper.py
Normal file
@@ -0,0 +1,410 @@
|
||||
"""
|
||||
scraper.py — Telethon user client.
|
||||
|
||||
Handles:
|
||||
- Listening for new file messages in watched channels
|
||||
- Listening for messages with inline download buttons (bot-dispatched files)
|
||||
- Backfilling recent channel history on startup (batched via tdl)
|
||||
- Downloading files safely (size guard, flood wait)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from tqdm import tqdm
|
||||
from telethon import TelegramClient, events
|
||||
from telethon.errors import FloodWaitError, ChannelPrivateError, UsernameNotOccupiedError
|
||||
from telethon.tl.types import (
|
||||
MessageMediaDocument,
|
||||
DocumentAttributeFilename,
|
||||
InputDocumentFileLocation,
|
||||
)
|
||||
|
||||
from config import (
|
||||
ALLOWED_EXTENSIONS,
|
||||
BACKFILL_LIMIT,
|
||||
MAX_FILE_SIZE,
|
||||
TEMP_DIR,
|
||||
WATCHED_CHANNELS,
|
||||
TDL_AMOUNT,
|
||||
)
|
||||
from core.bot_downloader import handle_bot_download_message, has_download_button, extract_password
|
||||
from utils.cache import is_seen, mark_seen
|
||||
from core.processor import process_file
|
||||
from core.notifier import notify
|
||||
from core.tdl_downloader import (
|
||||
BatchEntry,
|
||||
download_batch_with_tdl,
|
||||
download_single_with_tdl,
|
||||
is_tdl_available,
|
||||
)
|
||||
from tui import events as bus
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_filename(msg) -> str | None:
|
||||
"""Extract the filename from a document message, if any."""
|
||||
if not isinstance(msg.media, MessageMediaDocument):
|
||||
return None
|
||||
doc = msg.media.document
|
||||
for attr in doc.attributes:
|
||||
if isinstance(attr, DocumentAttributeFilename):
|
||||
return attr.file_name
|
||||
mime = getattr(doc, "mime_type", "") or ""
|
||||
ext_map = {
|
||||
"application/x-rar-compressed": ".rar",
|
||||
"application/vnd.rar": ".rar",
|
||||
"application/zip": ".zip",
|
||||
"application/x-7z-compressed": ".7z",
|
||||
"text/plain": ".txt",
|
||||
}
|
||||
return f"{msg.id}{ext_map.get(mime, '.bin')}"
|
||||
|
||||
|
||||
def get_filesize(msg) -> int:
|
||||
"""Return document size in bytes, or 0 if not a document."""
|
||||
if not isinstance(msg.media, MessageMediaDocument):
|
||||
return 0
|
||||
return msg.media.document.size or 0
|
||||
|
||||
|
||||
def is_processable(filename: str, size: int) -> tuple[bool, str]:
|
||||
"""Check whether a file should be downloaded. Returns (ok, reason)."""
|
||||
suffix = Path(filename).suffix.lower()
|
||||
if suffix not in ALLOWED_EXTENSIONS:
|
||||
return False, f"extension {suffix!r} not in allowlist"
|
||||
if size > MAX_FILE_SIZE:
|
||||
mb = size / (1024 * 1024)
|
||||
return False, f"too large ({mb:.1f} MB > {MAX_FILE_SIZE // (1024 * 1024)} MB limit)"
|
||||
return True, ""
|
||||
|
||||
|
||||
def _make_dest(msg, filename: str) -> Path:
|
||||
"""Resolve the destination path, avoiding name collisions."""
|
||||
TEMP_DIR.mkdir(exist_ok=True)
|
||||
dest = TEMP_DIR / filename
|
||||
if dest.exists():
|
||||
dest = TEMP_DIR / f"{msg.id}_{filename}"
|
||||
return dest
|
||||
|
||||
|
||||
# ─── Telethon fallback download ───────────────────────────────────────────────
|
||||
|
||||
async def _telethon_download(client: TelegramClient, msg, dest: Path, filename: str, size: int, batch_id: str | None = None) -> bool:
|
||||
"""Download a single file via Telethon. Returns True on success."""
|
||||
_bid = batch_id or f"telethon_{int(time.monotonic_ns())}"
|
||||
if batch_id is None:
|
||||
# Standalone call (not already queued by tdl path) — post queued event
|
||||
bus.post(bus.EvDownloadQueued(
|
||||
batch_id=_bid, filename=filename,
|
||||
size_mb=round(size / (1024 * 1024), 2),
|
||||
source="telethon", password=None,
|
||||
))
|
||||
bus.post(bus.EvDownloadStarted(batch_id=_bid, filename=filename))
|
||||
try:
|
||||
with tqdm(
|
||||
total=size,
|
||||
unit="B",
|
||||
unit_scale=True,
|
||||
unit_divisor=1024,
|
||||
desc=filename[:40],
|
||||
colour="cyan",
|
||||
) as pbar:
|
||||
async def progress(current, total):
|
||||
pbar.n = current
|
||||
pbar.refresh()
|
||||
|
||||
doc = msg.media.document
|
||||
location = InputDocumentFileLocation(
|
||||
id=doc.id,
|
||||
access_hash=doc.access_hash,
|
||||
file_reference=doc.file_reference,
|
||||
thumb_size="",
|
||||
)
|
||||
await client.download_file(
|
||||
location,
|
||||
file=dest,
|
||||
part_size_kb=512,
|
||||
progress_callback=progress,
|
||||
)
|
||||
bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon"))
|
||||
return True
|
||||
except FloodWaitError as e:
|
||||
log.warning(f" Flood wait: sleeping {e.seconds}s...")
|
||||
await asyncio.sleep(e.seconds)
|
||||
await client.download_media(msg, file=dest)
|
||||
bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon"))
|
||||
return True
|
||||
except Exception as e:
|
||||
log.error(f" Telethon download failed for {filename}: {e}")
|
||||
bus.post(bus.EvDownloadFailed(batch_id=_bid, filename=filename, reason=str(e)))
|
||||
return False
|
||||
|
||||
|
||||
# ─── Single-message pipeline (live handler + bot_downloader) ──────────────────
|
||||
|
||||
async def handle_message(
|
||||
client: TelegramClient,
|
||||
bot: TelegramClient,
|
||||
msg,
|
||||
source_name: str,
|
||||
patterns,
|
||||
password: str | None = None,
|
||||
) -> None:
|
||||
"""Download and process a single file message."""
|
||||
filename = get_filename(msg)
|
||||
if not filename:
|
||||
log.warning(" handle_message: could not extract filename, skipping.")
|
||||
return
|
||||
|
||||
size = get_filesize(msg)
|
||||
ok, reason = is_processable(filename, size)
|
||||
if not ok:
|
||||
log.warning(f" handle_message: skipping '{filename}' — {reason}")
|
||||
return
|
||||
|
||||
doc_id = msg.media.document.id
|
||||
if is_seen(doc_id):
|
||||
log.info(f" Skipping {filename} — already processed.")
|
||||
return
|
||||
|
||||
dest = _make_dest(msg, filename)
|
||||
log.info(f"↓ Downloading: {filename} ({size / 1024:.1f} KB) from {source_name}")
|
||||
|
||||
# tdl single → Telethon fallback
|
||||
downloaded = await download_single_with_tdl(msg, dest) if is_tdl_available() else False
|
||||
if not downloaded:
|
||||
if is_tdl_available():
|
||||
log.warning(" [tdl] failed — falling back to Telethon")
|
||||
downloaded = await _telethon_download(client, msg, dest, filename, size)
|
||||
|
||||
if not downloaded:
|
||||
log.error(f" All download attempts failed for {filename}")
|
||||
return
|
||||
|
||||
hits = process_file(dest, patterns, password=password)
|
||||
mark_seen(doc_id)
|
||||
|
||||
if hits:
|
||||
await notify(bot, hits, source_name, filename)
|
||||
else:
|
||||
log.info(f" No hits in {filename}")
|
||||
|
||||
|
||||
# ─── Batch pipeline (backfill only) ───────────────────────────────────────────
|
||||
|
||||
async def _process_batch(
|
||||
client: TelegramClient,
|
||||
bot: TelegramClient,
|
||||
batch: list[tuple], # list of (msg, source_name, password)
|
||||
patterns,
|
||||
) -> int:
|
||||
"""
|
||||
Download up to TDL_AMOUNT messages in one tdl invocation, then process
|
||||
each. Falls back to Telethon per-file for anything tdl missed.
|
||||
Returns the number of files successfully processed.
|
||||
"""
|
||||
if not batch:
|
||||
return 0
|
||||
|
||||
# Build BatchEntry list
|
||||
entries: list[BatchEntry] = []
|
||||
for msg, source_name, password in batch:
|
||||
filename = get_filename(msg)
|
||||
if not filename:
|
||||
continue
|
||||
entries.append(BatchEntry(
|
||||
msg=msg,
|
||||
filename=filename,
|
||||
dest=_make_dest(msg, filename),
|
||||
doc_id=msg.media.document.id,
|
||||
source_name=source_name,
|
||||
password=password,
|
||||
))
|
||||
|
||||
names = ", ".join(e.filename for e in entries)
|
||||
log.info(f"[Batch] {len(entries)} file(s): {names}")
|
||||
|
||||
# One tdl call for the whole batch
|
||||
results = await download_batch_with_tdl(entries)
|
||||
|
||||
processed = 0
|
||||
for entry in entries:
|
||||
tdl_ok = results.get(entry.doc_id, False)
|
||||
|
||||
if not tdl_ok:
|
||||
# Per-file Telethon fallback
|
||||
log.info(f" [Batch] Telethon fallback: {entry.filename}")
|
||||
size = get_filesize(entry.msg)
|
||||
tdl_ok = await _telethon_download(client, entry.msg, entry.dest, entry.filename, size)
|
||||
|
||||
if not tdl_ok:
|
||||
log.error(f" [Batch] All attempts failed: {entry.filename}")
|
||||
continue
|
||||
|
||||
hits = process_file(entry.dest, patterns, password=entry.password)
|
||||
mark_seen(entry.doc_id)
|
||||
|
||||
if hits:
|
||||
await notify(bot, hits, entry.source_name, entry.filename)
|
||||
else:
|
||||
log.info(f" No hits in {entry.filename}")
|
||||
|
||||
processed += 1
|
||||
|
||||
return processed
|
||||
|
||||
|
||||
# ─── Backfill ─────────────────────────────────────────────────────────────────
|
||||
|
||||
async def backfill_channel(
|
||||
client: TelegramClient,
|
||||
bot: TelegramClient,
|
||||
channel: str | int,
|
||||
patterns,
|
||||
limit: int,
|
||||
) -> None:
|
||||
"""Scan the last `limit` messages of a channel for file attachments."""
|
||||
log.info(f"[Backfill] Scanning history: {channel} (last {limit} messages)")
|
||||
total = 0
|
||||
batch: list[tuple] = [] # (msg, source_name, password)
|
||||
last_password: str | None = None # carry password across adjacent messages
|
||||
|
||||
async def flush_batch():
|
||||
nonlocal total
|
||||
if batch:
|
||||
total += await _process_batch(client, bot, batch, patterns)
|
||||
batch.clear()
|
||||
|
||||
try:
|
||||
async for msg in client.iter_messages(channel, limit=limit):
|
||||
source_name = str(channel)
|
||||
|
||||
# Extract password from this message if present, and remember it.
|
||||
# iter_messages goes newest→oldest, so a password post that appears
|
||||
# above the files in the channel will arrive AFTER them here.
|
||||
# We therefore carry last_password in both directions:
|
||||
# - apply it to file messages that have no inline password
|
||||
# - update it whenever we see a fresh password, so subsequent
|
||||
# (older) file messages in the same batch pick it up too.
|
||||
msg_password = extract_password(msg)
|
||||
if msg_password:
|
||||
last_password = msg_password
|
||||
|
||||
password = msg_password or last_password
|
||||
|
||||
if msg.media and isinstance(msg.media, MessageMediaDocument):
|
||||
filename = get_filename(msg)
|
||||
size = get_filesize(msg)
|
||||
|
||||
if not filename:
|
||||
continue
|
||||
|
||||
ok, reason = is_processable(filename, size)
|
||||
if not ok:
|
||||
log.warning(f" [Backfill] Skipping '{filename}' — {reason}")
|
||||
continue
|
||||
|
||||
if is_seen(msg.media.document.id):
|
||||
log.info(f" [Backfill] Already seen: {filename}")
|
||||
continue
|
||||
|
||||
if is_tdl_available():
|
||||
batch.append((msg, source_name, password))
|
||||
if len(batch) >= TDL_AMOUNT:
|
||||
await flush_batch()
|
||||
else:
|
||||
# No tdl — fall straight through to single handle_message
|
||||
await handle_message(client, bot, msg, source_name, patterns, password=password)
|
||||
total += 1
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
elif msg.buttons and has_download_button(msg):
|
||||
# Bot-button messages can't be batched — handle individually
|
||||
await flush_batch() # flush any pending batch first
|
||||
await handle_bot_download_message(client, bot, msg, source_name, patterns, password=password)
|
||||
total += 1
|
||||
await asyncio.sleep(1.5)
|
||||
|
||||
# Flush whatever's left
|
||||
await flush_batch()
|
||||
|
||||
except (ChannelPrivateError, UsernameNotOccupiedError) as e:
|
||||
log.error(f"[Backfill] Cannot access {channel}: {e}")
|
||||
except Exception as e:
|
||||
log.error(f"[Backfill] Error scanning {channel}: {e}")
|
||||
|
||||
log.info(f"[Backfill] Done: {channel} — {total} file(s) processed")
|
||||
|
||||
|
||||
async def backfill_all(
|
||||
client: TelegramClient,
|
||||
bot: TelegramClient,
|
||||
patterns,
|
||||
) -> None:
|
||||
"""Backfill all watched channels sequentially."""
|
||||
if BACKFILL_LIMIT <= 0:
|
||||
log.info("[Backfill] Disabled (BACKFILL_LIMIT=0)")
|
||||
return
|
||||
log.info(f"[Backfill] Starting for {len(WATCHED_CHANNELS)} channel(s)...")
|
||||
for ch in WATCHED_CHANNELS:
|
||||
await backfill_channel(client, bot, ch, patterns, BACKFILL_LIMIT)
|
||||
log.info("[Backfill] Complete.")
|
||||
|
||||
|
||||
# ─── Entity cache warmup ──────────────────────────────────────────────────────
|
||||
|
||||
async def warm_entity_cache(client: TelegramClient) -> None:
|
||||
"""
|
||||
Fetches your dialog list so Telethon caches all entity mappings.
|
||||
Required before using raw numeric IDs.
|
||||
"""
|
||||
log.info("Warming entity cache (fetching dialogs)...")
|
||||
async for _ in client.iter_dialogs():
|
||||
pass
|
||||
log.info("Entity cache ready.")
|
||||
|
||||
|
||||
# ─── Live listener ────────────────────────────────────────────────────────────
|
||||
|
||||
def register_handlers(
|
||||
client: TelegramClient,
|
||||
bot: TelegramClient,
|
||||
patterns,
|
||||
) -> None:
|
||||
"""Register the NewMessage event handler for all watched channels."""
|
||||
|
||||
# Per-channel password cache for the live handler.
|
||||
# Channels often post a text message with the password separately from
|
||||
# the file message. We remember the last seen password per channel so
|
||||
# that the file message that follows (or precedes by seconds) picks it up.
|
||||
_channel_passwords: dict[int, str] = {}
|
||||
|
||||
@client.on(events.NewMessage(chats=WATCHED_CHANNELS))
|
||||
async def on_new_message(event):
|
||||
msg = event.message
|
||||
try:
|
||||
source = event.chat.username or str(event.chat_id)
|
||||
except Exception:
|
||||
source = str(event.chat_id)
|
||||
|
||||
chat_id = event.chat_id
|
||||
log.info(f"[Live] New message in {source}")
|
||||
|
||||
# Update cache if this message carries a password
|
||||
msg_password = extract_password(msg)
|
||||
if msg_password:
|
||||
_channel_passwords[chat_id] = msg_password
|
||||
log.debug(f"[Live] Password cached for {source}: '{msg_password}'")
|
||||
|
||||
password = msg_password or _channel_passwords.get(chat_id)
|
||||
|
||||
if msg.media and isinstance(msg.media, MessageMediaDocument):
|
||||
await handle_message(client, bot, msg, source, patterns, password=password)
|
||||
elif msg.buttons and has_download_button(msg):
|
||||
await handle_bot_download_message(client, bot, msg, source, patterns, password=password)
|
||||
70
core/tdl_downloader.md
Normal file
70
core/tdl_downloader.md
Normal file
@@ -0,0 +1,70 @@
|
||||
# core/tdl_downloader.py
|
||||
|
||||
Fast file downloads via `tdl` (Go MTProto). Falls back gracefully if tdl is not installed.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from core.tdl_downloader import (
|
||||
is_tdl_available,
|
||||
download_single_with_tdl,
|
||||
download_batch_with_tdl,
|
||||
BatchEntry,
|
||||
)
|
||||
```
|
||||
|
||||
### `is_tdl_available() -> bool`
|
||||
Returns `True` if `tdl` binary is on PATH.
|
||||
|
||||
### `download_single_with_tdl(msg, dest: Path) -> bool`
|
||||
**async.** Downloads one message's document. Returns `True` on success.
|
||||
Used by the live handler and `bot_downloader`.
|
||||
|
||||
### `download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]`
|
||||
**async.** Downloads up to `TDL_AMOUNT` messages in a single `tdl dl` invocation.
|
||||
Returns `{doc_id: True|False}` — `False` means Telethon fallback needed.
|
||||
|
||||
---
|
||||
|
||||
## BatchEntry dataclass
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class BatchEntry:
|
||||
msg: object # Telethon Message
|
||||
filename: str
|
||||
dest: Path # final destination path in TEMP_DIR
|
||||
doc_id: int # msg.media.document.id
|
||||
source_name: str
|
||||
password: str | None
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## TUI output pipeline
|
||||
|
||||
In TUI mode (`bus.tui_active == True`), `_run_tdl` pipes stdout+stderr and relays lines as `EvTdlOutput` events in real time.
|
||||
**Reads raw 256-byte chunks** (not line-by-line) and splits on `\r` and `\n`, because tdl uses `\r` to overwrite its progress bar in place.
|
||||
|
||||
In CLI mode: subprocess inherits the terminal, progress bars render natively.
|
||||
|
||||
---
|
||||
|
||||
## Staging directory isolation
|
||||
|
||||
Each batch/single download gets a unique `data/tmp/_tdl_{monotonic_ns}/` staging dir.
|
||||
After `tdl` exits, files are matched by name (with fuzzy stem fallback for `filenamify()` mangling) and moved to final `dest`. Staging dir is removed regardless of outcome.
|
||||
|
||||
`--template '{{ filenamify .FileName }}'` — tdl uses the original Telegram filename, not its default `DialogID_MessageID_filename` format.
|
||||
|
||||
---
|
||||
|
||||
## Config knobs (`config.py`)
|
||||
|
||||
| Setting | Default | Description |
|
||||
|---------|---------|-------------|
|
||||
| `TDL_NAMESPACE` | `"default"` | `-n` flag; `None` omits it |
|
||||
| `TDL_THREADS` | `8` | `-t` chunk workers per file |
|
||||
| `TDL_PERFILE` | `4` | `-l` concurrent files per invocation |
|
||||
| `TDL_AMOUNT` | `4` | Max messages per batch |
|
||||
| `TDL_TAKEOUT` | `False` | `--takeout` session flag |
|
||||
363
core/tdl_downloader.py
Normal file
363
core/tdl_downloader.py
Normal file
@@ -0,0 +1,363 @@
|
||||
"""
|
||||
tdl_downloader.py — Fast file downloads via tdl (Go MTProto implementation).
|
||||
|
||||
Install: https://github.com/iyear/tdl
|
||||
curl -sSL https://raw.githubusercontent.com/iyear/tdl/main/scripts/install.sh | bash
|
||||
|
||||
First-time setup — log in once:
|
||||
tdl login # saves to namespace "default"
|
||||
tdl login -n myns # saves to a named namespace
|
||||
|
||||
Relevant config.py knobs:
|
||||
TDL_NAMESPACE str|None Session namespace (default "default"; None omits -n)
|
||||
TDL_THREADS int Chunk workers per file (-t, default 4)
|
||||
TDL_PERFILE int Concurrent files (-l, default 4)
|
||||
TDL_AMOUNT int Messages per tdl batch (default 4)
|
||||
TDL_TAKEOUT bool Use takeout session (--takeout)
|
||||
|
||||
Flag reference:
|
||||
Global (BEFORE subcommand): -n --ns, -t --threads, -l --limit
|
||||
dl-specific: -u --url, -d --dir, --template, --continue, --takeout
|
||||
|
||||
Download isolation strategy:
|
||||
Each batch gets its own staging subdirectory (TEMP_DIR/<batch_id>/) so that
|
||||
concurrent downloads and homoglyph filename collisions can never cause tdl's
|
||||
internal .tmp → final rename to fail. Files are moved to TEMP_DIR after
|
||||
the batch completes and the staging dir is removed.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import shutil
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from config import TDL_NAMESPACE, TDL_THREADS, TDL_PERFILE, TDL_TAKEOUT, TEMP_DIR
|
||||
from tui import events as bus
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ─── Availability ─────────────────────────────────────────────────────────────
|
||||
|
||||
def is_tdl_available() -> bool:
|
||||
return shutil.which("tdl") is not None
|
||||
|
||||
|
||||
# ─── Message → URL ────────────────────────────────────────────────────────────
|
||||
|
||||
def _build_message_url(msg) -> str:
|
||||
"""
|
||||
Build a t.me/c/<channel_id>/<msg_id> link from a Telethon Message.
|
||||
Works for public and private channels alike.
|
||||
"""
|
||||
peer = msg.peer_id
|
||||
if hasattr(peer, "channel_id"):
|
||||
return f"https://t.me/c/{peer.channel_id}/{msg.id}"
|
||||
elif hasattr(peer, "chat_id"):
|
||||
return f"https://t.me/c/{peer.chat_id}/{msg.id}"
|
||||
elif hasattr(peer, "user_id"):
|
||||
return f"https://t.me/c/{peer.user_id}/{msg.id}"
|
||||
raise ValueError(f"Cannot build message URL from peer: {peer!r}")
|
||||
|
||||
|
||||
# ─── Command builder ──────────────────────────────────────────────────────────
|
||||
|
||||
def _build_cmd(urls: list[str], staging_dir: Path) -> list[str]:
|
||||
"""
|
||||
Build the full tdl dl command.
|
||||
|
||||
Global flags (-n, -t, -l) MUST precede the subcommand.
|
||||
staging_dir is always an absolute path to a fresh per-batch directory,
|
||||
so tdl's internal .tmp → final rename can never collide with an existing
|
||||
file of the same name.
|
||||
|
||||
--template '{{ filenamify .FileName }}' keeps just the original filename
|
||||
(no DialogID_MessageID_ prefix).
|
||||
|
||||
--continue is kept so interrupted downloads resume rather than restart.
|
||||
--skip-same is intentionally omitted — deduplication is handled upstream
|
||||
by is_seen(), and --skip-same can cause the .tmp rename to fail when a
|
||||
same-named file already exists in the directory.
|
||||
"""
|
||||
global_flags: list[str] = []
|
||||
if TDL_NAMESPACE:
|
||||
global_flags += ["-n", str(TDL_NAMESPACE)]
|
||||
global_flags += ["-t", str(TDL_THREADS), "-l", str(TDL_PERFILE)]
|
||||
|
||||
url_flags: list[str] = []
|
||||
for url in urls:
|
||||
url_flags += ["-u", url]
|
||||
|
||||
dl_flags = [
|
||||
"-d", str(staging_dir),
|
||||
"--template", "{{ filenamify .FileName }}",
|
||||
"--continue",
|
||||
]
|
||||
if TDL_TAKEOUT:
|
||||
dl_flags.append("--takeout")
|
||||
|
||||
return ["tdl", *global_flags, "dl", *url_flags, *dl_flags]
|
||||
|
||||
|
||||
# ─── Runner ───────────────────────────────────────────────────────────────────
|
||||
|
||||
# ANSI escape stripper — tdl emits colour codes even when not a TTY
|
||||
import re as _re
|
||||
_ANSI_RE = _re.compile(r"\x1b\[[0-9;]*[mGKHFJA-Z]|\x1b=|\x1b>|\x1b\[\?[0-9]+[hl]")
|
||||
|
||||
def _strip_ansi(text: str) -> str:
|
||||
return _ANSI_RE.sub("", text)
|
||||
|
||||
|
||||
async def _run_tdl(cmd: list[str], label: str) -> bool:
|
||||
"""
|
||||
Spawn tdl and handle output based on whether the TUI is running:
|
||||
- TUI mode: pipe stdout+stderr, read raw chunks (NOT line-by-line),
|
||||
split on both \\r and \\n, strip ANSI, post non-empty
|
||||
segments immediately as EvTdlOutput.
|
||||
tdl uses \\r to overwrite its progress bar in place, so
|
||||
async-for-line on the stream would block until EOF.
|
||||
Chunk-reading + manual split delivers progress live.
|
||||
- CLI mode: inherit the terminal so tdl's progress bars render natively.
|
||||
Returns True on exit code 0, False otherwise.
|
||||
"""
|
||||
log.debug(f"[tdl] cmd: {' '.join(cmd)}")
|
||||
try:
|
||||
if bus.tui_active:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
|
||||
async def _relay(stream):
|
||||
buf = ""
|
||||
while True:
|
||||
chunk = await stream.read(256)
|
||||
if not chunk:
|
||||
break
|
||||
buf += chunk.decode(errors="replace")
|
||||
# Split on both \r and \n; process all complete segments
|
||||
parts = _re.split(r"[\r\n]", buf)
|
||||
# Last element may be an incomplete segment — keep in buffer
|
||||
buf = parts[-1]
|
||||
for part in parts[:-1]:
|
||||
clean = _strip_ansi(part).strip()
|
||||
if clean:
|
||||
bus.post(bus.EvTdlOutput(line=clean))
|
||||
# Flush any remaining buffer content
|
||||
if buf:
|
||||
clean = _strip_ansi(buf).strip()
|
||||
if clean:
|
||||
bus.post(bus.EvTdlOutput(line=clean))
|
||||
|
||||
await asyncio.gather(_relay(proc.stdout), _relay(proc.stderr))
|
||||
await proc.wait()
|
||||
else:
|
||||
proc = await asyncio.create_subprocess_exec(*cmd)
|
||||
await proc.wait()
|
||||
|
||||
if proc.returncode == 0:
|
||||
log.info(f"[tdl] ✓ {label}")
|
||||
return True
|
||||
else:
|
||||
log.error(f"[tdl] ✗ exit {proc.returncode} — {label}")
|
||||
return False
|
||||
except FileNotFoundError:
|
||||
log.error("[tdl] binary not found at runtime")
|
||||
return False
|
||||
except Exception as e:
|
||||
log.error(f"[tdl] Unexpected error: {e}")
|
||||
return False
|
||||
|
||||
|
||||
# ─── Staging dir helpers ──────────────────────────────────────────────────────
|
||||
|
||||
def _make_staging_dir() -> Path:
|
||||
"""Create a unique staging subdirectory under TEMP_DIR for one batch."""
|
||||
staging = TEMP_DIR.resolve() / f"_tdl_{int(time.monotonic_ns())}"
|
||||
staging.mkdir(parents=True, exist_ok=True)
|
||||
return staging
|
||||
|
||||
|
||||
def _find_in_staging(staging: Path, expected_name: str) -> Path | None:
|
||||
"""
|
||||
Locate a downloaded file in the staging dir by matching its name.
|
||||
filenamify() can munge characters (strips @, collapses unicode, etc.)
|
||||
so we do a normalised stem comparison as a fallback.
|
||||
"""
|
||||
# Exact match first
|
||||
exact = staging / expected_name
|
||||
if exact.exists():
|
||||
return exact
|
||||
|
||||
expected_stem = Path(expected_name).stem.lower().lstrip("@").replace(" ", "")
|
||||
expected_suffix = Path(expected_name).suffix.lower()
|
||||
|
||||
for candidate in staging.iterdir():
|
||||
if not candidate.is_file():
|
||||
continue
|
||||
if candidate.suffix.lower() != expected_suffix:
|
||||
continue
|
||||
cand_stem = candidate.stem.lower().lstrip("@").replace(" ", "")
|
||||
if cand_stem == expected_stem:
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _move_from_staging(staging: Path, expected_name: str, final_dest: Path) -> bool:
|
||||
"""
|
||||
Find the file in staging, move it to final_dest, return True on success.
|
||||
"""
|
||||
found = _find_in_staging(staging, expected_name)
|
||||
if not found:
|
||||
log.warning(f"[tdl] Not found in staging: '{expected_name}' (staging: {staging})")
|
||||
return False
|
||||
|
||||
try:
|
||||
found.rename(final_dest)
|
||||
log.debug(f"[tdl] Moved: {found.name} → {final_dest}")
|
||||
return True
|
||||
except Exception as e:
|
||||
log.error(f"[tdl] Move failed {found} → {final_dest}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def _cleanup_staging(staging: Path) -> None:
|
||||
try:
|
||||
shutil.rmtree(staging, ignore_errors=True)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
# ─── Public API ───────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
class BatchEntry:
|
||||
"""Carries everything needed to process one file after a batch download."""
|
||||
msg: object # Telethon Message
|
||||
filename: str
|
||||
dest: Path
|
||||
doc_id: int
|
||||
source_name: str
|
||||
password: str | None
|
||||
|
||||
|
||||
async def download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]:
|
||||
"""
|
||||
Download a batch of messages in a single tdl invocation.
|
||||
|
||||
Each batch gets its own staging subdirectory so filenames can never
|
||||
collide with existing files in TEMP_DIR. After tdl exits, files are
|
||||
moved from staging to their final dest paths.
|
||||
|
||||
Returns dict mapping doc_id → True (ready at entry.dest) / False (fallback needed).
|
||||
"""
|
||||
if not entries:
|
||||
return {}
|
||||
|
||||
if not is_tdl_available():
|
||||
log.warning("[tdl] not available — all entries need Telethon fallback")
|
||||
return {e.doc_id: False for e in entries}
|
||||
|
||||
urls: list[str] = []
|
||||
for entry in entries:
|
||||
try:
|
||||
urls.append(_build_message_url(entry.msg))
|
||||
except ValueError as exc:
|
||||
log.error(f"[tdl] Skipping {entry.filename}: {exc}")
|
||||
urls.append("")
|
||||
|
||||
valid_entries = [(e, u) for e, u in zip(entries, urls) if u]
|
||||
if not valid_entries:
|
||||
return {e.doc_id: False for e in entries}
|
||||
|
||||
batch_id = f"batch_{int(time.monotonic_ns())}"
|
||||
names = ", ".join(e.filename for e, _ in valid_entries)
|
||||
log.info(f"[tdl] Batch ({len(valid_entries)} files): {names}")
|
||||
|
||||
# Notify TUI: all files in this batch are queued
|
||||
for entry, _ in valid_entries:
|
||||
size_mb = (entry.msg.media.document.size or 0) / (1024 * 1024)
|
||||
bus.post(bus.EvDownloadQueued(
|
||||
batch_id=batch_id,
|
||||
filename=entry.filename,
|
||||
size_mb=round(size_mb, 2),
|
||||
source=entry.source_name,
|
||||
password=entry.password,
|
||||
))
|
||||
|
||||
staging = _make_staging_dir()
|
||||
cmd = _build_cmd([u for _, u in valid_entries], staging)
|
||||
|
||||
# Signal batch started
|
||||
for entry, _ in valid_entries:
|
||||
bus.post(bus.EvDownloadStarted(batch_id=batch_id, filename=entry.filename))
|
||||
|
||||
tdl_ok = await _run_tdl(cmd, f"batch of {len(valid_entries)}")
|
||||
|
||||
results: dict[int, bool] = {}
|
||||
for entry in entries:
|
||||
if not any(e.doc_id == entry.doc_id for e, _ in valid_entries):
|
||||
results[entry.doc_id] = False
|
||||
continue
|
||||
|
||||
if tdl_ok:
|
||||
moved = _move_from_staging(staging, entry.filename, entry.dest)
|
||||
results[entry.doc_id] = moved
|
||||
if moved:
|
||||
bus.post(bus.EvDownloadDone(batch_id=batch_id, filename=entry.filename, via="tdl"))
|
||||
else:
|
||||
log.warning(f"[tdl] Fallback needed: {entry.filename}")
|
||||
bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=entry.filename, reason="staging mismatch"))
|
||||
else:
|
||||
results[entry.doc_id] = False
|
||||
bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=entry.filename, reason="tdl exit error"))
|
||||
|
||||
_cleanup_staging(staging)
|
||||
return results
|
||||
|
||||
|
||||
async def download_single_with_tdl(msg, dest: Path) -> bool:
|
||||
"""
|
||||
Download a single message with tdl. Used by the live handler and
|
||||
bot_downloader where batching doesn't apply.
|
||||
"""
|
||||
if not is_tdl_available():
|
||||
log.warning("[tdl] not available — falling back to Telethon")
|
||||
return False
|
||||
|
||||
try:
|
||||
url = _build_message_url(msg)
|
||||
except ValueError as e:
|
||||
log.error(f"[tdl] Cannot build URL: {e}")
|
||||
return False
|
||||
|
||||
batch_id = f"single_{int(time.monotonic_ns())}"
|
||||
size_mb = (msg.media.document.size or 0) / (1024 * 1024) if hasattr(msg, "media") and msg.media else 0
|
||||
bus.post(bus.EvDownloadQueued(
|
||||
batch_id=batch_id, filename=dest.name,
|
||||
size_mb=round(size_mb, 2), source="live", password=None,
|
||||
))
|
||||
bus.post(bus.EvDownloadStarted(batch_id=batch_id, filename=dest.name))
|
||||
|
||||
staging = _make_staging_dir()
|
||||
cmd = _build_cmd([url], staging)
|
||||
log.info(f"[tdl] Single: {dest.name} ({url})")
|
||||
tdl_ok = await _run_tdl(cmd, dest.name)
|
||||
|
||||
if tdl_ok:
|
||||
result = _move_from_staging(staging, dest.name, dest)
|
||||
else:
|
||||
result = False
|
||||
|
||||
_cleanup_staging(staging)
|
||||
|
||||
if result:
|
||||
bus.post(bus.EvDownloadDone(batch_id=batch_id, filename=dest.name, via="tdl"))
|
||||
else:
|
||||
bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=dest.name, reason="tdl failed"))
|
||||
return result
|
||||
Reference in New Issue
Block a user