Rename to stealergram, add pyproject.toml, purge em-dashes

- Rename project to stealergram throughout
- Add pyproject.toml (replaces the split requirements.txt files and folds in pytest.ini)
- Replace all em-dashes with hyphens across all source files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-19 10:06:30 -04:00
parent 4c104cddd2
commit 741e6bb0d3
46 changed files with 244 additions and 191 deletions

View File

@@ -1 +1 @@
"""core Telegram I/O pipeline (scraper, downloader, processor, notifier)."""
"""core - Telegram I/O pipeline (scraper, downloader, processor, notifier)."""

View File

@@ -1,5 +1,5 @@
"""
bot_downloader.py — Handles "click to download" inline button flows.
bot_downloader.py - Handles "click to download" inline button flows.
Some Telegram channels post messages with a DOWNLOAD button that triggers
a bot to send you the actual file. This module simulates that click and

View File

@@ -1,5 +1,5 @@
"""
notifier.py — Persists hits to disk and sends Telegram bot alerts.
notifier.py - Persists hits to disk and sends Telegram bot alerts.
Includes:
- Severity scoring via scorer.py
@@ -31,7 +31,7 @@ log = logging.getLogger(__name__)
MAX_PREVIEW = 10 # hits to show per severity group in alert
DEDUP_FILE = Path("./data/dedup.json")
# Only alert immediately for these severities — LOW hits are silent
# Only alert immediately for these severities - LOW hits are silent
ALERT_SEVERITIES = {CRITICAL, HIGH, MEDIUM}
@@ -124,7 +124,7 @@ def write_hits(scored_hits: list, source: str) -> None:
def write_hits_csv(scored_hits: list, source: str, filename: str) -> None:
"""Append new hits to hits.csv one row per hit, easy to import."""
"""Append new hits to hits.csv - one row per hit, easy to import."""
HITS_CSV.parent.mkdir(parents=True, exist_ok=True)
write_header = not HITS_CSV.exists()
timestamp = _timestamp()
@@ -152,13 +152,13 @@ async def send_alert(
) -> None:
"""
Send a Telegram alert grouped by severity.
Only includes CRITICAL, HIGH, MEDIUM — LOW hits are omitted from alerts.
Only includes CRITICAL, HIGH, MEDIUM - LOW hits are omitted from alerts.
"""
summary = summarize(scored_hits)
alertable = [h for h in scored_hits if h.severity in ALERT_SEVERITIES]
if not alertable:
log.info(" No alertable hits (all LOW) skipping Telegram notification.")
log.info(" No alertable hits (all LOW) - skipping Telegram notification.")
return
lines = [
@@ -210,7 +210,7 @@ async def notify(bot: TelegramClient, hits: list[str], source: str, filename: st
# Score first
scored = score_hits(hits)
log.info(f" Scored {len(scored)} hit(s) {summarize(scored)}")
log.info(f" Scored {len(scored)} hit(s) - {summarize(scored)}")
# Deduplicate
new_hits, dupe_hits = deduplicate(scored)
@@ -222,7 +222,7 @@ async def notify(bot: TelegramClient, hits: list[str], source: str, filename: st
insert_hits(dupe_hits, source, filename, seen_before=True)
if not new_hits:
log.info(" All hits already seen before no alert sent.")
log.info(" All hits already seen before - no alert sent.")
return
# Push hits to TUI

View File

@@ -54,8 +54,8 @@ Nested archives are recursed **one level** only.
## Password order
1. `extra_password` (from message/channel carry-forward) — tried first
2. `config.ARCHIVE_PASSWORDS` — tried in order
1. `extra_password` (from message/channel carry-forward) - tried first
2. `config.ARCHIVE_PASSWORDS` - tried in order
---

View File

@@ -1,8 +1,8 @@
"""
processor.py — Archive extraction and hit searching logic.
processor.py - Archive extraction and hit searching logic.
Supports: .txt, .zip, .7z, .rar
Stream-processes files line by line — safe for large combo lists.
Stream-processes files line by line - safe for large combo lists.
"""
import rarfile
@@ -40,7 +40,7 @@ def compile_patterns(keywords: list[str]) -> list[re.Pattern]:
def search_file(filepath: Path, patterns: list[re.Pattern]) -> list[str]:
"""
Stream-reads a text file line by line and returns lines matching any pattern.
Ignores encoding errors — combo files are often messy.
Ignores encoding errors - combo files are often messy.
"""
hits: list[str] = []
try:
@@ -82,7 +82,7 @@ def extract_zip(filepath: Path, dest: Path, extra_password: str | None = None) -
except RuntimeError:
log.info(f" ZIP is password-protected, trying common passwords...")
if not _try_passwords(try_extract, ARCHIVE_PASSWORDS):
log.warning(f" Could not unlock {filepath.name} skipping.")
log.warning(f" Could not unlock {filepath.name} - skipping.")
return []
extracted = [p for p in dest.rglob("*") if p.is_file()]
@@ -95,7 +95,7 @@ def extract_zip(filepath: Path, dest: Path, extra_password: str | None = None) -
def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
if not HAS_7Z:
log.warning("py7zr not installed skipping .7z file.")
log.warning("py7zr not installed - skipping .7z file.")
return []
extracted: list[Path] = []
passwords = ARCHIVE_PASSWORDS.copy()
@@ -119,7 +119,7 @@ def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) ->
except Exception:
continue
if not success:
log.warning(f" Could not unlock {filepath.name} skipping.")
log.warning(f" Could not unlock {filepath.name} - skipping.")
return []
extracted = [p for p in dest.rglob("*") if p.is_file()]
@@ -130,7 +130,7 @@ def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) ->
def extract_rar(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
if not HAS_RAR:
log.warning("rarfile not installed skipping .rar file.")
log.warning("rarfile not installed - skipping .rar file.")
return []
passwords = ARCHIVE_PASSWORDS.copy()
@@ -150,7 +150,7 @@ def extract_rar(filepath: Path, dest: Path, extra_password: str | None = None) -
except Exception:
log.info(f" RAR may be password-protected, trying common passwords...")
if not _try_passwords(try_extract, ARCHIVE_PASSWORDS):
log.warning(f" Could not unlock {filepath.name} skipping.")
log.warning(f" Could not unlock {filepath.name} - skipping.")
return []
extracted = [p for p in dest.rglob("*") if p.is_file()]
@@ -184,7 +184,7 @@ def unpack(filepath: Path, extra_password: str | None = None) -> tuple[list[Path
return files, extract_dir
else:
# Plain file — return as-is, no extract dir to clean up
# Plain file - return as-is, no extract dir to clean up
return [filepath], None
@@ -207,7 +207,7 @@ def process_file(filepath: Path, patterns, password: str | None = None) -> list[
log.info(f"{len(hits)} hit(s) in {f.name}")
all_hits.extend(hits)
# Nested archives — recurse one level
# Nested archives - recurse one level
elif f.suffix.lower() in {".zip", ".7z", ".rar"} and f != filepath:
log.info(f" → Nested archive: {f.name}")
nested_hits = process_file(f, patterns)

View File

@@ -11,7 +11,7 @@ from core.scraper import handle_message, backfill_all, register_handlers, warm_e
### `handle_message(client, bot, msg, source_name, patterns, password=None)`
**async.** Full pipeline for one document message:
1. Extract filename + size, check allowlist + size guard
2. Check `utils.cache` — skip if already seen
2. Check `utils.cache` - skip if already seen
3. Try `tdl` download → Telethon fallback
4. `core.processor.process_file()` → hits
5. `core.notifier.notify()` if hits found

View File

@@ -1,5 +1,5 @@
"""
scraper.py — Telethon user client.
scraper.py - Telethon user client.
Handles:
- Listening for new file messages in watched channels
@@ -99,7 +99,7 @@ async def _telethon_download(client: TelegramClient, msg, dest: Path, filename:
"""Download a single file via Telethon. Returns True on success."""
_bid = batch_id or f"telethon_{int(time.monotonic_ns())}"
if batch_id is None:
# Standalone call (not already queued by tdl path) — post queued event
# Standalone call (not already queued by tdl path) - post queued event
bus.post(bus.EvDownloadQueued(
batch_id=_bid, filename=filename,
size_mb=round(size / (1024 * 1024), 2),
@@ -165,12 +165,12 @@ async def handle_message(
size = get_filesize(msg)
ok, reason = is_processable(filename, size)
if not ok:
log.warning(f" handle_message: skipping '{filename}' {reason}")
log.warning(f" handle_message: skipping '{filename}' - {reason}")
return
doc_id = msg.media.document.id
if is_seen(doc_id):
log.info(f" Skipping {filename} already processed.")
log.info(f" Skipping {filename} - already processed.")
return
dest = _make_dest(msg, filename)
@@ -180,7 +180,7 @@ async def handle_message(
downloaded = await download_single_with_tdl(msg, dest) if is_tdl_available() else False
if not downloaded:
if is_tdl_available():
log.warning(" [tdl] failed falling back to Telethon")
log.warning(" [tdl] failed - falling back to Telethon")
downloaded = await _telethon_download(client, msg, dest, filename, size)
if not downloaded:
@@ -307,7 +307,7 @@ async def backfill_channel(
ok, reason = is_processable(filename, size)
if not ok:
log.warning(f" [Backfill] Skipping '{filename}' {reason}")
log.warning(f" [Backfill] Skipping '{filename}' - {reason}")
continue
if is_seen(msg.media.document.id):
@@ -319,13 +319,13 @@ async def backfill_channel(
if len(batch) >= TDL_AMOUNT:
await flush_batch()
else:
# No tdl — fall straight through to single handle_message
# No tdl - fall straight through to single handle_message
await handle_message(client, bot, msg, source_name, patterns, password=password)
total += 1
await asyncio.sleep(0.5)
elif msg.buttons and has_download_button(msg):
# Bot-button messages can't be batched — handle individually
# Bot-button messages can't be batched - handle individually
await flush_batch() # flush any pending batch first
await handle_bot_download_message(client, bot, msg, source_name, patterns, password=password)
total += 1
@@ -339,7 +339,7 @@ async def backfill_channel(
except Exception as e:
log.error(f"[Backfill] Error scanning {channel}: {e}")
log.info(f"[Backfill] Done: {channel} {total} file(s) processed")
log.info(f"[Backfill] Done: {channel} - {total} file(s) processed")
async def backfill_all(

View File

@@ -22,7 +22,7 @@ Used by the live handler and `bot_downloader`.
### `download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]`
**async.** Downloads up to `TDL_AMOUNT` messages in a single `tdl dl` invocation.
Returns `{doc_id: True|False}` — `False` means Telethon fallback needed.
Returns `{doc_id: True|False}` - `False` means Telethon fallback needed.
---
@@ -55,7 +55,7 @@ In CLI mode: subprocess inherits the terminal, progress bars render natively.
Each batch/single download gets a unique `data/tmp/_tdl_{monotonic_ns}/` staging dir.
After `tdl` exits, files are matched by name (with fuzzy stem fallback for `filenamify()` mangling) and moved to final `dest`. Staging dir is removed regardless of outcome.
`--template '{{ filenamify .FileName }}'` — tdl uses the original Telegram filename, not its default `DialogID_MessageID_filename` format.
`--template '{{ filenamify .FileName }}'` - tdl uses the original Telegram filename, not its default `DialogID_MessageID_filename` format.
---

View File

@@ -1,10 +1,10 @@
"""
tdl_downloader.py — Fast file downloads via tdl (Go MTProto implementation).
tdl_downloader.py - Fast file downloads via tdl (Go MTProto implementation).
Install: https://github.com/iyear/tdl
curl -sSL https://raw.githubusercontent.com/iyear/tdl/main/scripts/install.sh | bash
First-time setup — log in once:
First-time setup - log in once:
tdl login # saves to namespace "default"
tdl login -n myns # saves to a named namespace
@@ -77,7 +77,7 @@ def _build_cmd(urls: list[str], staging_dir: Path) -> list[str]:
(no DialogID_MessageID_ prefix).
--continue is kept so interrupted downloads resume rather than restart.
--skip-same is intentionally omitted — deduplication is handled upstream
--skip-same is intentionally omitted - deduplication is handled upstream
by is_seen(), and --skip-same can cause the .tmp rename to fail when a
same-named file already exists in the directory.
"""
@@ -103,7 +103,7 @@ def _build_cmd(urls: list[str], staging_dir: Path) -> list[str]:
# ─── Runner ───────────────────────────────────────────────────────────────────
# ANSI escape stripper — tdl emits colour codes even when not a TTY
# ANSI escape stripper - tdl emits colour codes even when not a TTY
import re as _re
_ANSI_RE = _re.compile(r"\x1b\[[0-9;]*[mGKHFJA-Z]|\x1b=|\x1b>|\x1b\[\?[0-9]+[hl]")
@@ -141,7 +141,7 @@ async def _run_tdl(cmd: list[str], label: str) -> bool:
buf += chunk.decode(errors="replace")
# Split on both \r and \n; process all complete segments
parts = _re.split(r"[\r\n]", buf)
# Last element may be an incomplete segment — keep in buffer
# Last element may be an incomplete segment - keep in buffer
buf = parts[-1]
for part in parts[:-1]:
clean = _strip_ansi(part).strip()
@@ -163,7 +163,7 @@ async def _run_tdl(cmd: list[str], label: str) -> bool:
log.info(f"[tdl] ✓ {label}")
return True
else:
log.error(f"[tdl] ✗ exit {proc.returncode} {label}")
log.error(f"[tdl] ✗ exit {proc.returncode} - {label}")
return False
except FileNotFoundError:
log.error("[tdl] binary not found at runtime")
@@ -260,7 +260,7 @@ async def download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]:
return {}
if not is_tdl_available():
log.warning("[tdl] not available all entries need Telethon fallback")
log.warning("[tdl] not available - all entries need Telethon fallback")
return {e.doc_id: False for e in entries}
urls: list[str] = []
@@ -327,7 +327,7 @@ async def download_single_with_tdl(msg, dest: Path) -> bool:
bot_downloader where batching doesn't apply.
"""
if not is_tdl_available():
log.warning("[tdl] not available falling back to Telethon")
log.warning("[tdl] not available - falling back to Telethon")
return False
try: