Rename to stealergram, add pyproject.toml, purge em-dashes
- Rename project to stealergram throughout
- Add pyproject.toml (replaces requirements.txt split, folds pytest.ini)
- Replace all em-dashes with hyphens across all source files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1 +1 @@
|
||||
"""core — Telegram I/O pipeline (scraper, downloader, processor, notifier)."""
|
||||
"""core - Telegram I/O pipeline (scraper, downloader, processor, notifier)."""
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
bot_downloader.py — Handles "click to download" inline button flows.
|
||||
bot_downloader.py - Handles "click to download" inline button flows.
|
||||
|
||||
Some Telegram channels post messages with a DOWNLOAD button that triggers
|
||||
a bot to send you the actual file. This module simulates that click and
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
notifier.py — Persists hits to disk and sends Telegram bot alerts.
|
||||
notifier.py - Persists hits to disk and sends Telegram bot alerts.
|
||||
|
||||
Includes:
|
||||
- Severity scoring via scorer.py
|
||||
@@ -31,7 +31,7 @@ log = logging.getLogger(__name__)
|
||||
MAX_PREVIEW = 10 # hits to show per severity group in alert
|
||||
DEDUP_FILE = Path("./data/dedup.json")
|
||||
|
||||
# Only alert immediately for these severities — LOW hits are silent
|
||||
# Only alert immediately for these severities - LOW hits are silent
|
||||
ALERT_SEVERITIES = {CRITICAL, HIGH, MEDIUM}
|
||||
|
||||
|
||||
@@ -124,7 +124,7 @@ def write_hits(scored_hits: list, source: str) -> None:
|
||||
|
||||
|
||||
def write_hits_csv(scored_hits: list, source: str, filename: str) -> None:
|
||||
"""Append new hits to hits.csv — one row per hit, easy to import."""
|
||||
"""Append new hits to hits.csv - one row per hit, easy to import."""
|
||||
HITS_CSV.parent.mkdir(parents=True, exist_ok=True)
|
||||
write_header = not HITS_CSV.exists()
|
||||
timestamp = _timestamp()
|
||||
@@ -152,13 +152,13 @@ async def send_alert(
|
||||
) -> None:
|
||||
"""
|
||||
Send a Telegram alert grouped by severity.
|
||||
Only includes CRITICAL, HIGH, MEDIUM — LOW hits are omitted from alerts.
|
||||
Only includes CRITICAL, HIGH, MEDIUM - LOW hits are omitted from alerts.
|
||||
"""
|
||||
summary = summarize(scored_hits)
|
||||
alertable = [h for h in scored_hits if h.severity in ALERT_SEVERITIES]
|
||||
|
||||
if not alertable:
|
||||
log.info(" No alertable hits (all LOW) — skipping Telegram notification.")
|
||||
log.info(" No alertable hits (all LOW) - skipping Telegram notification.")
|
||||
return
|
||||
|
||||
lines = [
|
||||
@@ -210,7 +210,7 @@ async def notify(bot: TelegramClient, hits: list[str], source: str, filename: st
|
||||
|
||||
# Score first
|
||||
scored = score_hits(hits)
|
||||
log.info(f" Scored {len(scored)} hit(s) — {summarize(scored)}")
|
||||
log.info(f" Scored {len(scored)} hit(s) - {summarize(scored)}")
|
||||
|
||||
# Deduplicate
|
||||
new_hits, dupe_hits = deduplicate(scored)
|
||||
@@ -222,7 +222,7 @@ async def notify(bot: TelegramClient, hits: list[str], source: str, filename: st
|
||||
insert_hits(dupe_hits, source, filename, seen_before=True)
|
||||
|
||||
if not new_hits:
|
||||
log.info(" All hits already seen before — no alert sent.")
|
||||
log.info(" All hits already seen before - no alert sent.")
|
||||
return
|
||||
|
||||
# Push hits to TUI
|
||||
|
||||
@@ -54,8 +54,8 @@ Nested archives are recursed **one level** only.
|
||||
|
||||
## Password order
|
||||
|
||||
1. `extra_password` (from message/channel carry-forward) — tried first
|
||||
2. `config.ARCHIVE_PASSWORDS` — tried in order
|
||||
1. `extra_password` (from message/channel carry-forward) - tried first
|
||||
2. `config.ARCHIVE_PASSWORDS` - tried in order
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
"""
|
||||
processor.py — Archive extraction and hit searching logic.
|
||||
processor.py - Archive extraction and hit searching logic.
|
||||
|
||||
Supports: .txt, .zip, .7z, .rar
|
||||
Stream-processes files line by line — safe for large combo lists.
|
||||
Stream-processes files line by line - safe for large combo lists.
|
||||
"""
|
||||
|
||||
import rarfile
|
||||
@@ -40,7 +40,7 @@ def compile_patterns(keywords: list[str]) -> list[re.Pattern]:
|
||||
def search_file(filepath: Path, patterns: list[re.Pattern]) -> list[str]:
|
||||
"""
|
||||
Stream-reads a text file line by line and returns lines matching any pattern.
|
||||
Ignores encoding errors — combo files are often messy.
|
||||
Ignores encoding errors - combo files are often messy.
|
||||
"""
|
||||
hits: list[str] = []
|
||||
try:
|
||||
@@ -82,7 +82,7 @@ def extract_zip(filepath: Path, dest: Path, extra_password: str | None = None) -
|
||||
except RuntimeError:
|
||||
log.info(f" ZIP is password-protected, trying common passwords...")
|
||||
if not _try_passwords(try_extract, ARCHIVE_PASSWORDS):
|
||||
log.warning(f" Could not unlock {filepath.name} — skipping.")
|
||||
log.warning(f" Could not unlock {filepath.name} - skipping.")
|
||||
return []
|
||||
|
||||
extracted = [p for p in dest.rglob("*") if p.is_file()]
|
||||
@@ -95,7 +95,7 @@ def extract_zip(filepath: Path, dest: Path, extra_password: str | None = None) -
|
||||
|
||||
def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
|
||||
if not HAS_7Z:
|
||||
log.warning("py7zr not installed — skipping .7z file.")
|
||||
log.warning("py7zr not installed - skipping .7z file.")
|
||||
return []
|
||||
extracted: list[Path] = []
|
||||
passwords = ARCHIVE_PASSWORDS.copy()
|
||||
@@ -119,7 +119,7 @@ def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) ->
|
||||
except Exception:
|
||||
continue
|
||||
if not success:
|
||||
log.warning(f" Could not unlock {filepath.name} — skipping.")
|
||||
log.warning(f" Could not unlock {filepath.name} - skipping.")
|
||||
return []
|
||||
|
||||
extracted = [p for p in dest.rglob("*") if p.is_file()]
|
||||
@@ -130,7 +130,7 @@ def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) ->
|
||||
|
||||
def extract_rar(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
|
||||
if not HAS_RAR:
|
||||
log.warning("rarfile not installed — skipping .rar file.")
|
||||
log.warning("rarfile not installed - skipping .rar file.")
|
||||
return []
|
||||
|
||||
passwords = ARCHIVE_PASSWORDS.copy()
|
||||
@@ -150,7 +150,7 @@ def extract_rar(filepath: Path, dest: Path, extra_password: str | None = None) -
|
||||
except Exception:
|
||||
log.info(f" RAR may be password-protected, trying common passwords...")
|
||||
if not _try_passwords(try_extract, ARCHIVE_PASSWORDS):
|
||||
log.warning(f" Could not unlock {filepath.name} — skipping.")
|
||||
log.warning(f" Could not unlock {filepath.name} - skipping.")
|
||||
return []
|
||||
|
||||
extracted = [p for p in dest.rglob("*") if p.is_file()]
|
||||
@@ -184,7 +184,7 @@ def unpack(filepath: Path, extra_password: str | None = None) -> tuple[list[Path
|
||||
return files, extract_dir
|
||||
|
||||
else:
|
||||
# Plain file — return as-is, no extract dir to clean up
|
||||
# Plain file - return as-is, no extract dir to clean up
|
||||
return [filepath], None
|
||||
|
||||
|
||||
@@ -207,7 +207,7 @@ def process_file(filepath: Path, patterns, password: str | None = None) -> list[
|
||||
log.info(f" ✓ {len(hits)} hit(s) in {f.name}")
|
||||
all_hits.extend(hits)
|
||||
|
||||
# Nested archives — recurse one level
|
||||
# Nested archives - recurse one level
|
||||
elif f.suffix.lower() in {".zip", ".7z", ".rar"} and f != filepath:
|
||||
log.info(f" → Nested archive: {f.name}")
|
||||
nested_hits = process_file(f, patterns)
|
||||
|
||||
@@ -11,7 +11,7 @@ from core.scraper import handle_message, backfill_all, register_handlers, warm_e
|
||||
### `handle_message(client, bot, msg, source_name, patterns, password=None)`
|
||||
**async.** Full pipeline for one document message:
|
||||
1. Extract filename + size, check allowlist + size guard
|
||||
2. Check `utils.cache` — skip if already seen
|
||||
2. Check `utils.cache` - skip if already seen
|
||||
3. Try `tdl` download → Telethon fallback
|
||||
4. `core.processor.process_file()` → hits
|
||||
5. `core.notifier.notify()` if hits found
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
scraper.py — Telethon user client.
|
||||
scraper.py - Telethon user client.
|
||||
|
||||
Handles:
|
||||
- Listening for new file messages in watched channels
|
||||
@@ -99,7 +99,7 @@ async def _telethon_download(client: TelegramClient, msg, dest: Path, filename:
|
||||
"""Download a single file via Telethon. Returns True on success."""
|
||||
_bid = batch_id or f"telethon_{int(time.monotonic_ns())}"
|
||||
if batch_id is None:
|
||||
# Standalone call (not already queued by tdl path) — post queued event
|
||||
# Standalone call (not already queued by tdl path) - post queued event
|
||||
bus.post(bus.EvDownloadQueued(
|
||||
batch_id=_bid, filename=filename,
|
||||
size_mb=round(size / (1024 * 1024), 2),
|
||||
@@ -165,12 +165,12 @@ async def handle_message(
|
||||
size = get_filesize(msg)
|
||||
ok, reason = is_processable(filename, size)
|
||||
if not ok:
|
||||
log.warning(f" handle_message: skipping '{filename}' — {reason}")
|
||||
log.warning(f" handle_message: skipping '{filename}' - {reason}")
|
||||
return
|
||||
|
||||
doc_id = msg.media.document.id
|
||||
if is_seen(doc_id):
|
||||
log.info(f" Skipping {filename} — already processed.")
|
||||
log.info(f" Skipping {filename} - already processed.")
|
||||
return
|
||||
|
||||
dest = _make_dest(msg, filename)
|
||||
@@ -180,7 +180,7 @@ async def handle_message(
|
||||
downloaded = await download_single_with_tdl(msg, dest) if is_tdl_available() else False
|
||||
if not downloaded:
|
||||
if is_tdl_available():
|
||||
log.warning(" [tdl] failed — falling back to Telethon")
|
||||
log.warning(" [tdl] failed - falling back to Telethon")
|
||||
downloaded = await _telethon_download(client, msg, dest, filename, size)
|
||||
|
||||
if not downloaded:
|
||||
@@ -307,7 +307,7 @@ async def backfill_channel(
|
||||
|
||||
ok, reason = is_processable(filename, size)
|
||||
if not ok:
|
||||
log.warning(f" [Backfill] Skipping '{filename}' — {reason}")
|
||||
log.warning(f" [Backfill] Skipping '{filename}' - {reason}")
|
||||
continue
|
||||
|
||||
if is_seen(msg.media.document.id):
|
||||
@@ -319,13 +319,13 @@ async def backfill_channel(
|
||||
if len(batch) >= TDL_AMOUNT:
|
||||
await flush_batch()
|
||||
else:
|
||||
# No tdl — fall straight through to single handle_message
|
||||
# No tdl - fall straight through to single handle_message
|
||||
await handle_message(client, bot, msg, source_name, patterns, password=password)
|
||||
total += 1
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
elif msg.buttons and has_download_button(msg):
|
||||
# Bot-button messages can't be batched — handle individually
|
||||
# Bot-button messages can't be batched - handle individually
|
||||
await flush_batch() # flush any pending batch first
|
||||
await handle_bot_download_message(client, bot, msg, source_name, patterns, password=password)
|
||||
total += 1
|
||||
@@ -339,7 +339,7 @@ async def backfill_channel(
|
||||
except Exception as e:
|
||||
log.error(f"[Backfill] Error scanning {channel}: {e}")
|
||||
|
||||
log.info(f"[Backfill] Done: {channel} — {total} file(s) processed")
|
||||
log.info(f"[Backfill] Done: {channel} - {total} file(s) processed")
|
||||
|
||||
|
||||
async def backfill_all(
|
||||
|
||||
@@ -22,7 +22,7 @@ Used by the live handler and `bot_downloader`.
|
||||
|
||||
### `download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]`
|
||||
**async.** Downloads up to `TDL_AMOUNT` messages in a single `tdl dl` invocation.
|
||||
Returns `{doc_id: True|False}` — `False` means Telethon fallback needed.
|
||||
Returns `{doc_id: True|False}` - `False` means Telethon fallback needed.
|
||||
|
||||
---
|
||||
|
||||
@@ -55,7 +55,7 @@ In CLI mode: subprocess inherits the terminal, progress bars render natively.
|
||||
Each batch/single download gets a unique `data/tmp/_tdl_{monotonic_ns}/` staging dir.
|
||||
After `tdl` exits, files are matched by name (with fuzzy stem fallback for `filenamify()` mangling) and moved to final `dest`. Staging dir is removed regardless of outcome.
|
||||
|
||||
`--template '{{ filenamify .FileName }}'` — tdl uses the original Telegram filename, not its default `DialogID_MessageID_filename` format.
|
||||
`--template '{{ filenamify .FileName }}'` - tdl uses the original Telegram filename, not its default `DialogID_MessageID_filename` format.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
"""
|
||||
tdl_downloader.py — Fast file downloads via tdl (Go MTProto implementation).
|
||||
tdl_downloader.py - Fast file downloads via tdl (Go MTProto implementation).
|
||||
|
||||
Install: https://github.com/iyear/tdl
|
||||
curl -sSL https://raw.githubusercontent.com/iyear/tdl/main/scripts/install.sh | bash
|
||||
|
||||
First-time setup — log in once:
|
||||
First-time setup - log in once:
|
||||
tdl login # saves to namespace "default"
|
||||
tdl login -n myns # saves to a named namespace
|
||||
|
||||
@@ -77,7 +77,7 @@ def _build_cmd(urls: list[str], staging_dir: Path) -> list[str]:
|
||||
(no DialogID_MessageID_ prefix).
|
||||
|
||||
--continue is kept so interrupted downloads resume rather than restart.
|
||||
--skip-same is intentionally omitted — deduplication is handled upstream
|
||||
--skip-same is intentionally omitted - deduplication is handled upstream
|
||||
by is_seen(), and --skip-same can cause the .tmp rename to fail when a
|
||||
same-named file already exists in the directory.
|
||||
"""
|
||||
@@ -103,7 +103,7 @@ def _build_cmd(urls: list[str], staging_dir: Path) -> list[str]:
|
||||
|
||||
# ─── Runner ───────────────────────────────────────────────────────────────────
|
||||
|
||||
# ANSI escape stripper — tdl emits colour codes even when not a TTY
|
||||
# ANSI escape stripper - tdl emits colour codes even when not a TTY
|
||||
import re as _re
|
||||
_ANSI_RE = _re.compile(r"\x1b\[[0-9;]*[mGKHFJA-Z]|\x1b=|\x1b>|\x1b\[\?[0-9]+[hl]")
|
||||
|
||||
@@ -141,7 +141,7 @@ async def _run_tdl(cmd: list[str], label: str) -> bool:
|
||||
buf += chunk.decode(errors="replace")
|
||||
# Split on both \r and \n; process all complete segments
|
||||
parts = _re.split(r"[\r\n]", buf)
|
||||
# Last element may be an incomplete segment — keep in buffer
|
||||
# Last element may be an incomplete segment - keep in buffer
|
||||
buf = parts[-1]
|
||||
for part in parts[:-1]:
|
||||
clean = _strip_ansi(part).strip()
|
||||
@@ -163,7 +163,7 @@ async def _run_tdl(cmd: list[str], label: str) -> bool:
|
||||
log.info(f"[tdl] ✓ {label}")
|
||||
return True
|
||||
else:
|
||||
log.error(f"[tdl] ✗ exit {proc.returncode} — {label}")
|
||||
log.error(f"[tdl] ✗ exit {proc.returncode} - {label}")
|
||||
return False
|
||||
except FileNotFoundError:
|
||||
log.error("[tdl] binary not found at runtime")
|
||||
@@ -260,7 +260,7 @@ async def download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]:
|
||||
return {}
|
||||
|
||||
if not is_tdl_available():
|
||||
log.warning("[tdl] not available — all entries need Telethon fallback")
|
||||
log.warning("[tdl] not available - all entries need Telethon fallback")
|
||||
return {e.doc_id: False for e in entries}
|
||||
|
||||
urls: list[str] = []
|
||||
@@ -327,7 +327,7 @@ async def download_single_with_tdl(msg, dest: Path) -> bool:
|
||||
bot_downloader where batching doesn't apply.
|
||||
"""
|
||||
if not is_tdl_available():
|
||||
log.warning("[tdl] not available — falling back to Telethon")
|
||||
log.warning("[tdl] not available - falling back to Telethon")
|
||||
return False
|
||||
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user