diff --git a/.env.example b/.env.example index 6b949bf..058063b 100644 --- a/.env.example +++ b/.env.example @@ -15,8 +15,8 @@ NOTIFY_CHAT_ID=987654321 # ─── Session name (just a filename, no extension needed) ──────────────────── SESSION_NAME=monitor_session -# ─── tdl (fast Go downloader) — optional but strongly recommended ─────────── +# ─── tdl (fast Go downloader) - optional but strongly recommended ─────────── # Install: https://github.com/iyear/tdl # After installing, run once: tdl login -n -# SESSION_NAME above is shared between Telethon and tdl — no double login needed. +# SESSION_NAME above is shared between Telethon and tdl - no double login needed. # If tdl is not on PATH the bot falls back to Telethon automatically. diff --git a/CLAUDE.md b/CLAUDE.md index 022ff7b..2d7b4f1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -5,7 +5,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Development workflow After every code change: -1. Run `pytest` — all tests must pass at 100%. +1. Run `pytest` - all tests must pass at 100%. 2. If 100% pass: present the change to the user, then commit. 3. If any test fails: fix the bug and re-run before showing anything to the user. @@ -20,7 +20,7 @@ pytest -v # verbose pytest tests/test_scorer.py # single file ``` -Tests cover `utils/scorer`, `utils/cache`, `utils/database`, and `core/processor`. They are fully isolated — no `.env` required, no real DB or cache files touched. The `patched_keywords` fixture in `conftest.py` replaces `TARGET_KEYWORDS` with known test patterns; it must patch both `config.TARGET_KEYWORDS` and `scorer.TARGET_KEYWORDS` (the local `from config import` binding). +Tests cover `utils/scorer`, `utils/cache`, `utils/database`, and `core/processor`. They are fully isolated - no `.env` required, no real DB or cache files touched. 
The `patched_keywords` fixture in `conftest.py` replaces `TARGET_KEYWORDS` with known test patterns; patching `config.TARGET_KEYWORDS` is sufficient because `scorer` reads it at call time via `import config as _config`, and the fixture also rebuilds `scorer.EMPLOYEE_DOMAINS`. ## Running the monitor @@ -66,15 +66,15 @@ Telegram channel message with file attachment The TUI and Telegram bot run in separate threads with different event loops: -- **Main thread**: Textual's event loop — runs `MonitorApp`, drains the event bus every 100ms via `_drain_bus()` -- **Bot thread**: own `asyncio` event loop — runs `_bot_main()` with both `user_client` and `bot_client` +- **Main thread**: Textual's event loop - runs `MonitorApp`, drains the event bus every 100ms via `_drain_bus()` +- **Bot thread**: own `asyncio` event loop - runs `_bot_main()` with both `user_client` and `bot_client` - **Cross-thread communication**: bot → TUI via `bus.post()` (`queue.Queue.put_nowait`, always safe); TUI → bot via `loop.call_soon_threadsafe()` (e.g., to signal channel list changes) ### Module responsibilities | Module | Role | |--------|------| -| `config.py` | All settings — edit keywords, channels, paths, tdl tuning here | +| `config.py` | All settings - edit keywords, channels, paths, tdl tuning here | | `core/scraper.py` | Live listener + backfill orchestration; registers Telethon `NewMessage` handlers | | `core/tdl_downloader.py` | Wraps `tdl` subprocess for fast downloads; falls back to Telethon | | `core/bot_downloader.py` | Handles inline button click flow where files come via bot reply | @@ -127,4 +127,4 @@ tail -f data/logs/monitor.log | `r` | Refresh stats | | `q` / `Escape` | Quit / back | -Runtime keyword and channel changes are **not** persisted — copy them to `config.py` to survive restarts. +Runtime keyword and channel changes are **not** persisted - copy them to `config.py` to survive restarts. 
diff --git a/QUICK_REF.md b/QUICK_REF.md index d9bb89b..12f48a9 100644 --- a/QUICK_REF.md +++ b/QUICK_REF.md @@ -1,4 +1,4 @@ -# ULP Monitor — Quick Reference +# ULP Monitor - Quick Reference > For Claude Code: read the per-file `.md` alongside each `.py` before editing. > Full docs in `README.md`. @@ -10,7 +10,7 @@ ``` ulp_monitor/ ├── main.py Entry point (--no-tui flag for CLI mode) -├── config.py All settings — edit this for keywords, channels, paths +├── config.py All settings - edit this for keywords, channels, paths │ ├── core/ Telegram I/O pipeline (all async, Telethon-dependent) │ ├── scraper.py Live listener + backfill orchestration @@ -24,11 +24,11 @@ ulp_monitor/ │ ├── cache.py Seen file-ID dedup (data/cache.json) │ └── database.py SQLite read/write (data/hits.db) │ -├── tui/ Textual TUI — runs in main thread +├── tui/ Textual TUI - runs in main thread │ ├── app.py MonitorApp + all screens + bot thread launcher │ └── events.py Thread-safe queue.Queue event bus │ -└── data/ Runtime output — gitignored +└── data/ Runtime output - gitignored ├── hits.db ├── hits.txt ├── hits.csv @@ -126,7 +126,7 @@ cross-thread communication | MEDIUM | 20 | Client-facing URL (app, booking, helpdesk…) | | LOW | 10 | Org domain appears anywhere in line | -`@`-keyword rule: pattern requires literal `@` before domain — `user@gmail.com` on a URL containing `myorg.cl` does **not** trigger CRITICAL. +`@`-keyword rule: pattern requires literal `@` before domain - `user@gmail.com` on a URL containing `myorg.cl` does **not** trigger CRITICAL. 
--- diff --git a/README.md b/README.md index c7d0493..f0510ff 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ ulp_monitor/ │ ├── processor.py Archive extraction + line-by-line search │ └── notifier.py hits.txt / hits.csv writer + bot alerts │ -├── utils/ Pure logic — no Telegram dependencies +├── utils/ Pure logic - no Telegram dependencies │ ├── scorer.py Hit severity scoring │ ├── cache.py Seen-file deduplication │ └── database.py SQLite persistence layer @@ -75,11 +75,11 @@ cp .env.example .env Open `config.py` and set: -- **`TARGET_KEYWORDS`** — your org's domains and email patterns. +- **`TARGET_KEYWORDS`** - your org's domains and email patterns. Keywords with `@` (e.g. `r"@myorg\.cl"`) are **employee email domains** → CRITICAL. Keywords without `@` are plain domain matches → LOW baseline. -- **`WATCHED_CHANNELS`** — channel usernames or numeric IDs -- **`BACKFILL_LIMIT`** — past messages to scan per channel on startup +- **`WATCHED_CHANNELS`** - channel usernames or numeric IDs +- **`BACKFILL_LIMIT`** - past messages to scan per channel on startup ### 5. Install dependencies @@ -97,7 +97,7 @@ curl -sSL https://raw.githubusercontent.com/iyear/tdl/main/scripts/install.sh | tdl login -n monitor_session ``` -### 6. First run — complete Telegram auth +### 6. First run - complete Telegram auth ```bash python main.py --no-tui @@ -130,9 +130,9 @@ python main.py --no-tui # plain CLI | File | Description | |------|-------------| -| `data/hits.db` | SQLite — all hits with scores, severity, dedup flag | +| `data/hits.db` | SQLite - all hits with scores, severity, dedup flag | | `data/hits.txt` | Human-readable grouped log | -| `data/hits.csv` | CSV — easy to pull into Excel / pandas | +| `data/hits.csv` | CSV - easy to pull into Excel / pandas | | `data/logs/monitor.log` | Full run log | Telegram alerts fire for CRITICAL / HIGH / MEDIUM only. LOW is stored silently. @@ -141,6 +141,6 @@ Telegram alerts fire for CRITICAL / HIGH / MEDIUM only. 
LOW is stored silently. ## Notes -- **Session files are sensitive** — equivalent to a logged-in account. Gitignored, never share. -- **Flood limits** — `FloodWaitError` is handled automatically. -- **Private channels** — your user account must already be a member. +- **Session files are sensitive** - equivalent to a logged-in account. Gitignored, never share. +- **Flood limits** - `FloodWaitError` is handled automatically. +- **Private channels** - your user account must already be a member. diff --git a/config.py b/config.py index ecb2e7b..c9b0371 100644 --- a/config.py +++ b/config.py @@ -1,5 +1,5 @@ """ -config.py — Loads and validates all settings from .env +config.py - Loads and validates all settings from .env """ import json @@ -29,30 +29,35 @@ RUNTIME_CONFIG_PATH = Path("./data/runtime_config.json") # Add your org's domains, email patterns, IP ranges, known usernames, etc. # All patterns are case-insensitive regex. _DEFAULT_KEYWORDS: list[str] = [ - r"sanatorioaleman\.cl", - r"@sanatorioaleman\.cl", + #r"sanatorioaleman\.cl", + #r"@sanatorioaleman\.cl", + #r"@hites\.cl", + #r"hites\.com", # r"192\.168\.10\.", # internal IP range example # r"specificuser", # known internal usernames + r"onion\.global", + r"@onion\.global", ] # Use usernames (without @) or numeric channel IDs (-100xxxxxxxxxx) _DEFAULT_CHANNELS: list[str | int] = [ #-1002230225603, - "cloudxlog", - #-1001967030016, # daisycloud - #"berserklogs", # berserklogs - #"BorwitaFreeLogs", # borwita - -1002748707556, # darkcloud - -1001684073398, # BHF Cloud - -1003163621939, # Wich Love from R - -1003611713618, # Khazan Cloud - -1003328682684, # LogsPlanet - -1003204260194, # JDP - -1002828367761, # HesoyamCloud - -1003513974925, # Slurm Logs - -1003599300787, # Arhont Corp - -1002582513379, # OnlyLogs - -1002788333372, # Ickis Cloud + #"cloudxlog", + ##-1001967030016, # daisycloud + ##"berserklogs", # berserklogs + ##"BorwitaFreeLogs", # borwita + #-1002748707556, # darkcloud + #-1001684073398, # BHF 
Cloud + #-1003163621939, # Wich Love from R + #-1003611713618, # Khazan Cloud + #-1003328682684, # LogsPlanet + #-1003204260194, # JDP + #-1002828367761, # HesoyamCloud + #-1003513974925, # Slurm Logs + #-1003599300787, # Arhont Corp + #-1002582513379, # OnlyLogs + #-1002788333372, # Ickis Cloud + -1002643355608, # Cloud URL #-1001234567890, # private channel by ID ] @@ -149,5 +154,5 @@ TDL_PERFILE = 4 TDL_AMOUNT = 4 # Whether to use a Telegram takeout session for downloads (lower flood limits). -# Takeout sessions are rate-limited differently — good for bulk backfill. +# Takeout sessions are rate-limited differently - good for bulk backfill. TDL_TAKEOUT = True diff --git a/core/__init__.py b/core/__init__.py index e85ef1c..0f94146 100644 --- a/core/__init__.py +++ b/core/__init__.py @@ -1 +1 @@ -"""core — Telegram I/O pipeline (scraper, downloader, processor, notifier).""" +"""core - Telegram I/O pipeline (scraper, downloader, processor, notifier).""" diff --git a/core/bot_downloader.py b/core/bot_downloader.py index b991765..ab3b8dd 100644 --- a/core/bot_downloader.py +++ b/core/bot_downloader.py @@ -1,5 +1,5 @@ """ -bot_downloader.py — Handles "click to download" inline button flows. +bot_downloader.py - Handles "click to download" inline button flows. Some Telegram channels post messages with a DOWNLOAD button that triggers a bot to send you the actual file. This module simulates that click and diff --git a/core/notifier.py b/core/notifier.py index 710d1ef..cc679d0 100644 --- a/core/notifier.py +++ b/core/notifier.py @@ -1,5 +1,5 @@ """ -notifier.py — Persists hits to disk and sends Telegram bot alerts. +notifier.py - Persists hits to disk and sends Telegram bot alerts. 
Includes: - Severity scoring via scorer.py @@ -31,7 +31,7 @@ log = logging.getLogger(__name__) MAX_PREVIEW = 10 # hits to show per severity group in alert DEDUP_FILE = Path("./data/dedup.json") -# Only alert immediately for these severities — LOW hits are silent +# Only alert immediately for these severities - LOW hits are silent ALERT_SEVERITIES = {CRITICAL, HIGH, MEDIUM} @@ -124,7 +124,7 @@ def write_hits(scored_hits: list, source: str) -> None: def write_hits_csv(scored_hits: list, source: str, filename: str) -> None: - """Append new hits to hits.csv — one row per hit, easy to import.""" + """Append new hits to hits.csv - one row per hit, easy to import.""" HITS_CSV.parent.mkdir(parents=True, exist_ok=True) write_header = not HITS_CSV.exists() timestamp = _timestamp() @@ -152,13 +152,13 @@ async def send_alert( ) -> None: """ Send a Telegram alert grouped by severity. - Only includes CRITICAL, HIGH, MEDIUM — LOW hits are omitted from alerts. + Only includes CRITICAL, HIGH, MEDIUM - LOW hits are omitted from alerts. 
""" summary = summarize(scored_hits) alertable = [h for h in scored_hits if h.severity in ALERT_SEVERITIES] if not alertable: - log.info(" No alertable hits (all LOW) — skipping Telegram notification.") + log.info(" No alertable hits (all LOW) - skipping Telegram notification.") return lines = [ @@ -210,7 +210,7 @@ async def notify(bot: TelegramClient, hits: list[str], source: str, filename: st # Score first scored = score_hits(hits) - log.info(f" Scored {len(scored)} hit(s) — {summarize(scored)}") + log.info(f" Scored {len(scored)} hit(s) - {summarize(scored)}") # Deduplicate new_hits, dupe_hits = deduplicate(scored) @@ -222,7 +222,7 @@ async def notify(bot: TelegramClient, hits: list[str], source: str, filename: st insert_hits(dupe_hits, source, filename, seen_before=True) if not new_hits: - log.info(" All hits already seen before — no alert sent.") + log.info(" All hits already seen before - no alert sent.") return # Push hits to TUI diff --git a/core/processor.md b/core/processor.md index 29c4e87..7384a5a 100644 --- a/core/processor.md +++ b/core/processor.md @@ -54,8 +54,8 @@ Nested archives are recursed **one level** only. ## Password order -1. `extra_password` (from message/channel carry-forward) — tried first -2. `config.ARCHIVE_PASSWORDS` — tried in order +1. `extra_password` (from message/channel carry-forward) - tried first +2. `config.ARCHIVE_PASSWORDS` - tried in order --- diff --git a/core/processor.py b/core/processor.py index 4f844dc..9b303ee 100644 --- a/core/processor.py +++ b/core/processor.py @@ -1,8 +1,8 @@ """ -processor.py — Archive extraction and hit searching logic. +processor.py - Archive extraction and hit searching logic. Supports: .txt, .zip, .7z, .rar -Stream-processes files line by line — safe for large combo lists. +Stream-processes files line by line - safe for large combo lists. 
""" import rarfile @@ -40,7 +40,7 @@ def compile_patterns(keywords: list[str]) -> list[re.Pattern]: def search_file(filepath: Path, patterns: list[re.Pattern]) -> list[str]: """ Stream-reads a text file line by line and returns lines matching any pattern. - Ignores encoding errors — combo files are often messy. + Ignores encoding errors - combo files are often messy. """ hits: list[str] = [] try: @@ -82,7 +82,7 @@ def extract_zip(filepath: Path, dest: Path, extra_password: str | None = None) - except RuntimeError: log.info(f" ZIP is password-protected, trying common passwords...") if not _try_passwords(try_extract, ARCHIVE_PASSWORDS): - log.warning(f" Could not unlock {filepath.name} — skipping.") + log.warning(f" Could not unlock {filepath.name} - skipping.") return [] extracted = [p for p in dest.rglob("*") if p.is_file()] @@ -95,7 +95,7 @@ def extract_zip(filepath: Path, dest: Path, extra_password: str | None = None) - def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]: if not HAS_7Z: - log.warning("py7zr not installed — skipping .7z file.") + log.warning("py7zr not installed - skipping .7z file.") return [] extracted: list[Path] = [] passwords = ARCHIVE_PASSWORDS.copy() @@ -119,7 +119,7 @@ def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) -> except Exception: continue if not success: - log.warning(f" Could not unlock {filepath.name} — skipping.") + log.warning(f" Could not unlock {filepath.name} - skipping.") return [] extracted = [p for p in dest.rglob("*") if p.is_file()] @@ -130,7 +130,7 @@ def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) -> def extract_rar(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]: if not HAS_RAR: - log.warning("rarfile not installed — skipping .rar file.") + log.warning("rarfile not installed - skipping .rar file.") return [] passwords = ARCHIVE_PASSWORDS.copy() @@ -150,7 +150,7 @@ def extract_rar(filepath: 
Path, dest: Path, extra_password: str | None = None) - except Exception: log.info(f" RAR may be password-protected, trying common passwords...") if not _try_passwords(try_extract, ARCHIVE_PASSWORDS): - log.warning(f" Could not unlock {filepath.name} — skipping.") + log.warning(f" Could not unlock {filepath.name} - skipping.") return [] extracted = [p for p in dest.rglob("*") if p.is_file()] @@ -184,7 +184,7 @@ def unpack(filepath: Path, extra_password: str | None = None) -> tuple[list[Path return files, extract_dir else: - # Plain file — return as-is, no extract dir to clean up + # Plain file - return as-is, no extract dir to clean up return [filepath], None @@ -207,7 +207,7 @@ def process_file(filepath: Path, patterns, password: str | None = None) -> list[ log.info(f" ✓ {len(hits)} hit(s) in {f.name}") all_hits.extend(hits) - # Nested archives — recurse one level + # Nested archives - recurse one level elif f.suffix.lower() in {".zip", ".7z", ".rar"} and f != filepath: log.info(f" → Nested archive: {f.name}") nested_hits = process_file(f, patterns) diff --git a/core/scraper.md b/core/scraper.md index 9ade2be..5406635 100644 --- a/core/scraper.md +++ b/core/scraper.md @@ -11,7 +11,7 @@ from core.scraper import handle_message, backfill_all, register_handlers, warm_e ### `handle_message(client, bot, msg, source_name, patterns, password=None)` **async.** Full pipeline for one document message: 1. Extract filename + size, check allowlist + size guard -2. Check `utils.cache` — skip if already seen +2. Check `utils.cache` - skip if already seen 3. Try `tdl` download → Telethon fallback 4. `core.processor.process_file()` → hits 5. `core.notifier.notify()` if hits found diff --git a/core/scraper.py b/core/scraper.py index e95821b..f76f642 100644 --- a/core/scraper.py +++ b/core/scraper.py @@ -1,5 +1,5 @@ """ -scraper.py — Telethon user client. +scraper.py - Telethon user client. 
Handles: - Listening for new file messages in watched channels @@ -99,7 +99,7 @@ async def _telethon_download(client: TelegramClient, msg, dest: Path, filename: """Download a single file via Telethon. Returns True on success.""" _bid = batch_id or f"telethon_{int(time.monotonic_ns())}" if batch_id is None: - # Standalone call (not already queued by tdl path) — post queued event + # Standalone call (not already queued by tdl path) - post queued event bus.post(bus.EvDownloadQueued( batch_id=_bid, filename=filename, size_mb=round(size / (1024 * 1024), 2), @@ -165,12 +165,12 @@ async def handle_message( size = get_filesize(msg) ok, reason = is_processable(filename, size) if not ok: - log.warning(f" handle_message: skipping '{filename}' — {reason}") + log.warning(f" handle_message: skipping '{filename}' - {reason}") return doc_id = msg.media.document.id if is_seen(doc_id): - log.info(f" Skipping {filename} — already processed.") + log.info(f" Skipping {filename} - already processed.") return dest = _make_dest(msg, filename) @@ -180,7 +180,7 @@ async def handle_message( downloaded = await download_single_with_tdl(msg, dest) if is_tdl_available() else False if not downloaded: if is_tdl_available(): - log.warning(" [tdl] failed — falling back to Telethon") + log.warning(" [tdl] failed - falling back to Telethon") downloaded = await _telethon_download(client, msg, dest, filename, size) if not downloaded: @@ -307,7 +307,7 @@ async def backfill_channel( ok, reason = is_processable(filename, size) if not ok: - log.warning(f" [Backfill] Skipping '{filename}' — {reason}") + log.warning(f" [Backfill] Skipping '{filename}' - {reason}") continue if is_seen(msg.media.document.id): @@ -319,13 +319,13 @@ async def backfill_channel( if len(batch) >= TDL_AMOUNT: await flush_batch() else: - # No tdl — fall straight through to single handle_message + # No tdl - fall straight through to single handle_message await handle_message(client, bot, msg, source_name, patterns, password=password) 
total += 1 await asyncio.sleep(0.5) elif msg.buttons and has_download_button(msg): - # Bot-button messages can't be batched — handle individually + # Bot-button messages can't be batched - handle individually await flush_batch() # flush any pending batch first await handle_bot_download_message(client, bot, msg, source_name, patterns, password=password) total += 1 @@ -339,7 +339,7 @@ async def backfill_channel( except Exception as e: log.error(f"[Backfill] Error scanning {channel}: {e}") - log.info(f"[Backfill] Done: {channel} — {total} file(s) processed") + log.info(f"[Backfill] Done: {channel} - {total} file(s) processed") async def backfill_all( diff --git a/core/tdl_downloader.md b/core/tdl_downloader.md index 74efc5b..71a1925 100644 --- a/core/tdl_downloader.md +++ b/core/tdl_downloader.md @@ -22,7 +22,7 @@ Used by the live handler and `bot_downloader`. ### `download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]` **async.** Downloads up to `TDL_AMOUNT` messages in a single `tdl dl` invocation. -Returns `{doc_id: True|False}` — `False` means Telethon fallback needed. +Returns `{doc_id: True|False}` - `False` means Telethon fallback needed. --- @@ -55,7 +55,7 @@ In CLI mode: subprocess inherits the terminal, progress bars render natively. Each batch/single download gets a unique `data/tmp/_tdl_{monotonic_ns}/` staging dir. After `tdl` exits, files are matched by name (with fuzzy stem fallback for `filenamify()` mangling) and moved to final `dest`. Staging dir is removed regardless of outcome. -`--template '{{ filenamify .FileName }}'` — tdl uses the original Telegram filename, not its default `DialogID_MessageID_filename` format. +`--template '{{ filenamify .FileName }}'` - tdl uses the original Telegram filename, not its default `DialogID_MessageID_filename` format. 
--- diff --git a/core/tdl_downloader.py b/core/tdl_downloader.py index eea963f..199f041 100644 --- a/core/tdl_downloader.py +++ b/core/tdl_downloader.py @@ -1,10 +1,10 @@ """ -tdl_downloader.py — Fast file downloads via tdl (Go MTProto implementation). +tdl_downloader.py - Fast file downloads via tdl (Go MTProto implementation). Install: https://github.com/iyear/tdl curl -sSL https://raw.githubusercontent.com/iyear/tdl/main/scripts/install.sh | bash -First-time setup — log in once: +First-time setup - log in once: tdl login # saves to namespace "default" tdl login -n myns # saves to a named namespace @@ -77,7 +77,7 @@ def _build_cmd(urls: list[str], staging_dir: Path) -> list[str]: (no DialogID_MessageID_ prefix). --continue is kept so interrupted downloads resume rather than restart. - --skip-same is intentionally omitted — deduplication is handled upstream + --skip-same is intentionally omitted - deduplication is handled upstream by is_seen(), and --skip-same can cause the .tmp rename to fail when a same-named file already exists in the directory. 
""" @@ -103,7 +103,7 @@ def _build_cmd(urls: list[str], staging_dir: Path) -> list[str]: # ─── Runner ─────────────────────────────────────────────────────────────────── -# ANSI escape stripper — tdl emits colour codes even when not a TTY +# ANSI escape stripper - tdl emits colour codes even when not a TTY import re as _re _ANSI_RE = _re.compile(r"\x1b\[[0-9;]*[mGKHFJA-Z]|\x1b=|\x1b>|\x1b\[\?[0-9]+[hl]") @@ -141,7 +141,7 @@ async def _run_tdl(cmd: list[str], label: str) -> bool: buf += chunk.decode(errors="replace") # Split on both \r and \n; process all complete segments parts = _re.split(r"[\r\n]", buf) - # Last element may be an incomplete segment — keep in buffer + # Last element may be an incomplete segment - keep in buffer buf = parts[-1] for part in parts[:-1]: clean = _strip_ansi(part).strip() @@ -163,7 +163,7 @@ async def _run_tdl(cmd: list[str], label: str) -> bool: log.info(f"[tdl] ✓ {label}") return True else: - log.error(f"[tdl] ✗ exit {proc.returncode} — {label}") + log.error(f"[tdl] ✗ exit {proc.returncode} - {label}") return False except FileNotFoundError: log.error("[tdl] binary not found at runtime") @@ -260,7 +260,7 @@ async def download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]: return {} if not is_tdl_available(): - log.warning("[tdl] not available — all entries need Telethon fallback") + log.warning("[tdl] not available - all entries need Telethon fallback") return {e.doc_id: False for e in entries} urls: list[str] = [] @@ -327,7 +327,7 @@ async def download_single_with_tdl(msg, dest: Path) -> bool: bot_downloader where batching doesn't apply. """ if not is_tdl_available(): - log.warning("[tdl] not available — falling back to Telethon") + log.warning("[tdl] not available - falling back to Telethon") return False try: diff --git a/main.py b/main.py index f05c809..531921e 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,5 @@ """ -main.py — Entry point for the ULP credential monitor. 
+main.py - Entry point for the ULP credential monitor. Usage: python main.py # TUI mode (default) @@ -55,7 +55,7 @@ def _start_web_thread(host: str, port: int) -> threading.Thread: # ─── Plain CLI mode ─────────────────────────────────────────────────────────── async def _cli_main(): - """Original asyncio main — runs without the TUI.""" + """Original asyncio main - runs without the TUI.""" logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) from telethon import TelegramClient @@ -64,7 +64,7 @@ async def _cli_main(): from core.scraper import backfill_all, register_handlers, warm_entity_cache log.info("=" * 60) - log.info(" ULP Credential Monitor — CLI mode") + log.info(" ULP Credential Monitor - CLI mode") log.info("=" * 60) patterns = compile_patterns(config.TARGET_KEYWORDS) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d119a96 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,46 @@ +[build-system] +requires = ["setuptools>=68"] +build-backend = "setuptools.build_meta" + +[project] +name = "stealergram" +version = "0.1.0" +description = "Telegram channel monitor - downloads, extracts, scores, and alerts on credential leaks" +requires-python = ">=3.11" +dependencies = [ + # Telegram + "telethon", + "tgcrypto", + # TUI + "textual", + # Config + "python-dotenv", + # Progress bars (CLI mode) + "tqdm", + # Archive extraction + "py7zr", + "rarfile", +] + +[project.optional-dependencies] +web = [ + "fastapi", + "uvicorn[standard]", + "jinja2", + "python-multipart", + "bcrypt", + "python-jose[cryptography]", +] +dev = [ + "pytest", +] + +[project.scripts] +stealergram = "main:main" + +[tool.setuptools.packages.find] +where = ["."] +exclude = ["tests*", "data*", "logs*", "tmp*"] + +[tool.pytest.ini_options] +testpaths = ["tests"] diff --git a/requirements.txt b/requirements.txt index 33ea811..5fe1ad7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ tqdm py7zr rarfile -# Web frontend (optional — 
only needed with --web) +# Web frontend (optional - only needed with --web) fastapi uvicorn[standard] jinja2 diff --git a/tests/conftest.py b/tests/conftest.py index 8fa3c72..ad84030 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,7 @@ os.environ.setdefault("API_HASH", "dummy_hash_for_tests") os.environ.setdefault("BOT_TOKEN", "0:dummy_bot_token") os.environ.setdefault("NOTIFY_CHAT_ID", "99999") -# Web frontend test defaults — set once here so all web test files see the same values. +# Web frontend test defaults - set once here so all web test files see the same values. os.environ.setdefault("WEB_SECRET_KEY", "test-secret-key-for-pytest") os.environ.setdefault("WEB_ADMIN_USER", "superadmin") os.environ.setdefault("WEB_ADMIN_PASS", "superpass") @@ -17,8 +17,8 @@ import config import utils.scorer as scorer # Two test keywords: -# @testcorp\.com — employee email domain (triggers CRITICAL) -# testcorp\.com — plain domain match (triggers LOW baseline) +# @testcorp\.com - employee email domain (triggers CRITICAL) +# testcorp\.com - plain domain match (triggers LOW baseline) TEST_KEYWORDS = [r"@testcorp\.com", r"testcorp\.com"] @@ -29,7 +29,7 @@ def patched_keywords(monkeypatch): scorer's module-level globals so scoring logic uses known test patterns. scorer.py now reads _config.TARGET_KEYWORDS at call time via `import config as _config`, - so patching config.TARGET_KEYWORDS is sufficient — no direct scorer patch needed. + so patching config.TARGET_KEYWORDS is sufficient - no direct scorer patch needed. """ monkeypatch.setattr(config, "TARGET_KEYWORDS", TEST_KEYWORDS) monkeypatch.setattr(scorer, "EMPLOYEE_DOMAINS", scorer._build_employee_domains()) diff --git a/tests/test_cache.py b/tests/test_cache.py index 301b2d9..a6b2f32 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -1,5 +1,5 @@ """ -Tests for utils/cache.py — file-ID deduplication cache. +Tests for utils/cache.py - file-ID deduplication cache. 
Each test gets an isolated cache file via the `isolated_cache` fixture so tests never touch data/cache.json. diff --git a/tests/test_database.py b/tests/test_database.py index 10bb543..7873b52 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -1,5 +1,5 @@ """ -Tests for utils/database.py — SQLite persistence layer. +Tests for utils/database.py - SQLite persistence layer. Each test gets an isolated in-memory-equivalent DB via the `isolated_db` fixture so tests never touch data/hits.db. @@ -112,7 +112,7 @@ def test_by_severity_returns_correct_severity(): def test_by_severity_excludes_duplicates(): - """seen_before=1 rows must be invisible to by_severity — they are stored for stats only.""" + """seen_before=1 rows must be invisible to by_severity - they are stored for stats only.""" hit = make_hit(severity=HIGH, url="intranet.testcorp.com") db_module.insert_hits([hit], source="c", filename="f.txt", seen_before=True) assert db_module.by_severity(HIGH) == [] diff --git a/tests/test_events.py b/tests/test_events.py index 0340876..51ae1af 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -1,5 +1,5 @@ """ -Tests for tui/events.py — subscribe/unsubscribe broadcast, signal_channel_changed. +Tests for tui/events.py - subscribe/unsubscribe broadcast, signal_channel_changed. """ import queue diff --git a/tests/test_processor.py b/tests/test_processor.py index 108586c..a0cf219 100644 --- a/tests/test_processor.py +++ b/tests/test_processor.py @@ -1,5 +1,5 @@ """ -Tests for core/processor.py — archive extraction and line-by-line search. +Tests for core/processor.py - archive extraction and line-by-line search. No Telegram deps, no async. Tests create real archive fixtures in tmp_path so process_file's cleanup guarantee can be verified against actual disk state. 
@@ -60,7 +60,7 @@ class TestSearchFile: assert search_file(f, patterns) == ["testcorp.com|user|pass"] def test_handles_encoding_errors_gracefully(self, tmp_path, patterns): - """Combo files are often messy — invalid bytes must not crash the search.""" + """Combo files are often messy - invalid bytes must not crash the search.""" f = tmp_path / "combo.txt" f.write_bytes( b"testcorp.com|user1|pass\n" @@ -81,7 +81,7 @@ class TestSearchFile: assert len(hits) == 2 -# ─── process_file — plain .txt ──────────────────────────────────────────────── +# ─── process_file - plain .txt ──────────────────────────────────────────────── class TestProcessFilePlainText: def test_returns_hits(self, tmp_path, patterns): @@ -104,7 +104,7 @@ class TestProcessFilePlainText: assert not f.exists() -# ─── process_file — .zip extraction ────────────────────────────────────────── +# ─── process_file - .zip extraction ────────────────────────────────────────── class TestProcessFileZip: def _make_zip(self, tmp_path: Path, content: str, filename="content.txt") -> Path: @@ -155,7 +155,7 @@ class TestProcessFileZip: assert len(hits) == 2 -# ─── process_file — nested archives ────────────────────────────────────────── +# ─── process_file - nested archives ────────────────────────────────────────── class TestProcessFileNested: def test_nested_zip_is_recursed(self, tmp_path, patterns): @@ -177,7 +177,7 @@ class TestProcessFileNested: assert not (tmp_path / "outer").exists() -# ─── process_file — password-protected .7z ─────────────────────────────────── +# ─── process_file - password-protected .7z ─────────────────────────────────── class TestProcessFile7zPassword: def test_unlocks_with_correct_password(self, tmp_path, patterns, monkeypatch): @@ -218,6 +218,6 @@ class TestProcessFile7zPassword: z.write(txt, "content.txt") txt.unlink() - # No hits — archive could not be opened + # No hits - archive could not be opened hits = process_file(szf, patterns) assert hits == [] diff --git 
a/tests/test_scorer.py b/tests/test_scorer.py index 54d0912..66e9239 100644 --- a/tests/test_scorer.py +++ b/tests/test_scorer.py @@ -1,10 +1,10 @@ """ -Tests for utils/scorer.py — severity scoring and ULP line parsing. +Tests for utils/scorer.py - severity scoring and ULP line parsing. All tests use the `patched_keywords` fixture (see conftest.py) which replaces TARGET_KEYWORDS with two entries: - @testcorp.com — employee email domain (CRITICAL trigger) - testcorp.com — plain domain match (LOW baseline) + @testcorp.com - employee email domain (CRITICAL trigger) + testcorp.com - plain domain match (LOW baseline) """ import pytest @@ -50,7 +50,7 @@ class TestULPParsingRealWorld: @pytest.mark.parametrize("line,exp_url,exp_user,exp_pass", [ # ── Protocol + port + path, colon separator ────────────────────────── - # Port is digits followed by '/' — must be consumed as part of the URL. + # Port is digits followed by '/' - must be consumed as part of the URL. ( "http://portal.fakehosp.example.com:88/:55512309-1:hunter2", "http://portal.fakehosp.example.com:88/", "55512309-1", "hunter2", @@ -91,7 +91,7 @@ class TestULPParsingRealWorld: "jdoe@fakehosp.example.com", "Passw0rd!", ), - # ── Pipe separator (unambiguous — port stays in URL) ────────────────── + # ── Pipe separator (unambiguous - port stays in URL) ────────────────── ( "http://portal.fakehosp.example.com:88/|22.987.654-3|florida88", "http://portal.fakehosp.example.com:88/", "22.987.654-3", "florida88", @@ -113,7 +113,7 @@ class TestULPParsingRealWorld: "portal.fakehosp.example.com:88/", "22.987.654-3", "florida88", ), - # ── No protocol, no port — plain colon separators ──────────────────── + # ── No protocol, no port - plain colon separators ──────────────────── ( "booking.fakehosp.example.com:66778899-7:correcthorse", "booking.fakehosp.example.com", "66778899-7", "correcthorse", @@ -234,7 +234,7 @@ class TestWeakPasswordFlags: assert any("Common password" in r for r in hit.reasons) def 
test_weak_password_does_not_escalate_severity(self, patched_keywords): - """Weak password flags are informational — they must not change severity.""" + """Weak password flags are informational - they must not change severity.""" hit = score_hit("testcorp.com|user|abc") assert hit.severity == LOW diff --git a/tests/test_web_auth.py b/tests/test_web_auth.py index 8194682..609fdbb 100644 --- a/tests/test_web_auth.py +++ b/tests/test_web_auth.py @@ -1,5 +1,5 @@ """ -Tests for web/auth.py — JWT token lifecycle, bcrypt helpers. +Tests for web/auth.py - JWT token lifecycle, bcrypt helpers. """ import pytest diff --git a/tests/test_web_db.py b/tests/test_web_db.py index 19aba26..3062b96 100644 --- a/tests/test_web_db.py +++ b/tests/test_web_db.py @@ -1,5 +1,5 @@ """ -Tests for web/db.py — user store and refresh token management. +Tests for web/db.py - user store and refresh token management. """ import pytest diff --git a/tui/__init__.py b/tui/__init__.py index 58754af..ab5b6f7 100644 --- a/tui/__init__.py +++ b/tui/__init__.py @@ -1 +1 @@ -"""tui — Textual TUI frontend and event bus.""" +"""tui - Textual TUI frontend and event bus.""" diff --git a/tui/app.md b/tui/app.md index bb79ded..d3e7c0c 100644 --- a/tui/app.md +++ b/tui/app.md @@ -34,8 +34,8 @@ MonitorApp (App) ### Threading model - **Bot backend** → `threading.Thread(daemon=True)` with its own `asyncio.new_event_loop()` - Runs `_bot_main()` — Telethon is completely isolated from Textual's loop. -- **TUI drain** → `set_interval(0.1, _drain_bus)` — polls `queue.Queue` every 100ms on Textual's loop. + Runs `_bot_main()` - Telethon is completely isolated from Textual's loop. +- **TUI drain** → `set_interval(0.1, _drain_bus)` - polls `queue.Queue` every 100ms on Textual's loop. ### Key methods @@ -105,7 +105,7 @@ Changes apply immediately (handler re-registered). 
Not persisted to `config.py` - Validates regex before adding - On change: rebuilds `utils.scorer.EMPLOYEE_DOMAINS` and `ORG_DOMAINS` - Bot handler recompiles patterns on the next incoming message automatically -- **Changes are in-memory only** — copy to `config.py` to persist +- **Changes are in-memory only** - copy to `config.py` to persist --- diff --git a/tui/app.py b/tui/app.py index 0413862..feb845c 100644 --- a/tui/app.py +++ b/tui/app.py @@ -1,5 +1,5 @@ """ -tui.py — Textual TUI for the ULP credential monitor. +tui.py - Textual TUI for the ULP credential monitor. Layout (main screen): ┌──────────────────────────────────┬──────────────────────────────────┐ @@ -14,13 +14,13 @@ Layout (main screen): └─────────────────────────────────────────────────────────────────────┘ Additional screens (push/pop via keybindings): - • SearchScreen — full-text search across hits DB [s] - • HitsDBScreen — paginated recent / severity viewer [h] - • KeywordsScreen — live-edit TARGET_KEYWORDS regex list [k] + • SearchScreen - full-text search across hits DB [s] + • HitsDBScreen - paginated recent / severity viewer [h] + • KeywordsScreen - live-edit TARGET_KEYWORDS regex list [k] Architecture: - The entire bot backend runs as a Textual Worker (asyncio task inside the - TUI event loop — no threading needed). + TUI event loop - no threading needed). - A second Worker runs _bus_consumer(), reading events from tui_events.queue and dispatching to the right panel. - Channel add/remove from the UI immediately re-registers Telethon handlers @@ -29,7 +29,7 @@ Architecture: into the download panel's RichLog. - StatsPanel polls database.stats() every 10 s via set_interval(). - Keyword changes are applied in-memory immediately (scorer caches rebuilt); - NOT auto-persisted to config.py — a notice banner reminds the user. + NOT auto-persisted to config.py - a notice banner reminds the user. 
- Live patterns are recompiled from config.TARGET_KEYWORDS on every message so keyword changes take effect without a handler restart. """ @@ -88,7 +88,7 @@ def _now() -> str: class DownloadPanel(Vertical): """ - Left panel — two sub-logs stacked vertically: + Left panel - two sub-logs stacked vertically: • top: tdl raw output (stripped ANSI), scrolling • bottom: our own structured status entries """ @@ -158,7 +158,7 @@ class DownloadPanel(Vertical): # ─── Hits panel ─────────────────────────────────────────────────────────────── class HitsPanel(Vertical): - """Right panel — scrollable color-coded hit log with live counter badge.""" + """Right panel - scrollable color-coded hit log with live counter badge.""" hit_count: reactive[int] = reactive(0) @@ -208,7 +208,7 @@ class HitsPanel(Vertical): class StatsPanel(Horizontal): """ - Slim bar — shows live DB stats, refreshed every 10 s. + Slim bar - shows live DB stats, refreshed every 10 s. Also refreshed immediately whenever a new hit arrives. 
""" @@ -233,14 +233,14 @@ class StatsPanel(Horizontal): def compose(self) -> ComposeResult: yield Static("📊 DB Stats", id="stat-label") - yield Static("🔴 —", classes="stat-critical", id="stat-critical") - yield Static("🟠 —", classes="stat-high", id="stat-high") - yield Static("🟡 —", classes="stat-medium", id="stat-medium") - yield Static("🟢 —", classes="stat-low", id="stat-low") - yield Static("total: —", id="stat-total") - yield Static("unique: —", id="stat-unique") - yield Static("dupes: —", id="stat-dupes") - yield Static("sources: —", id="stat-sources") + yield Static("🔴 - ", classes="stat-critical", id="stat-critical") + yield Static("🟠 - ", classes="stat-high", id="stat-high") + yield Static("🟡 - ", classes="stat-medium", id="stat-medium") + yield Static("🟢 - ", classes="stat-low", id="stat-low") + yield Static("total: - ", id="stat-total") + yield Static("unique: - ", id="stat-unique") + yield Static("dupes: - ", id="stat-dupes") + yield Static("sources: - ", id="stat-sources") def on_mount(self) -> None: self.set_interval(10, self.refresh_stats) @@ -266,7 +266,7 @@ class StatsPanel(Horizontal): class ChannelPanel(Vertical): """ - Bottom panel — live-editable channel list. + Bottom panel - live-editable channel list. Changes are applied immediately (Telethon handlers are re-registered). To make them permanent, edit config.py's WATCHED_CHANNELS manually. 
@@ -314,7 +314,7 @@ class ChannelPanel(Vertical): def compose(self) -> ComposeResult: yield Label( - "📡 Channels — changes apply immediately | edit config.py to persist", + "📡 Channels - changes apply immediately | edit config.py to persist", classes="panel-title", ) with Horizontal(classes="controls"): @@ -524,7 +524,7 @@ class HitsDBScreen(Screen): status, ) self.query_one("#db-status", Label).update( - f" {len(rows)} row(s) — {label}" + f" {len(rows)} row(s) - {label}" ) def _load_recent(self) -> None: @@ -560,7 +560,7 @@ class KeywordsScreen(Screen): • scorer's domain caches are rebuilt • The bot handler recompiles patterns on the next message automatically - Changes are NOT written back to config.py — a notice banner says so. + Changes are NOT written back to config.py - a notice banner says so. """ BINDINGS = [Binding("escape", "dismiss", "Back")] @@ -601,7 +601,7 @@ class KeywordsScreen(Screen): yield Header() yield Label("🔑 Keyword / Pattern Editor", classes="screen-title") yield Label( - "⚠ Changes are in-memory only — copy patterns to config.py to persist across restarts.", + "⚠ Changes are in-memory only - copy patterns to config.py to persist across restarts.", classes="notice", ) with Horizontal(id="kw-controls"): @@ -671,7 +671,7 @@ class KeywordsScreen(Screen): except Exception as e: log.warning(f"Could not rebuild scorer caches: {e}") bus.post(bus.EvStatus( - f"Keywords updated — {len(config.TARGET_KEYWORDS)} pattern(s) active" + f"Keywords updated - {len(config.TARGET_KEYWORDS)} pattern(s) active" )) def action_dismiss(self) -> None: @@ -721,7 +721,7 @@ class MonitorApp(App): # The bot backend runs in its own thread with its own asyncio event # loop, completely isolated from Textual. 
Telethon spawns background # tasks via asyncio.ensure_future() and calls connect() which returns - # only after its receiver loop is scheduled — both of these deadlock + # only after its receiver loop is scheduled - both of these deadlock # inside Textual's managed loop. Running in a dedicated thread # sidesteps all of that. # @@ -767,7 +767,7 @@ class MonitorApp(App): """ Called every 100 ms by set_interval(). Drains all pending events from the thread-safe queue and dispatches them to the right widget. - Runs on Textual's event loop — safe to call widget methods directly. + Runs on Textual's event loop - safe to call widget methods directly. """ q = bus.get_bus() if q is None: @@ -854,7 +854,7 @@ class MonitorApp(App): async def _bot_main(self) -> None: """ - Full bot backend — runs inside the bot thread's own event loop. + Full bot backend - runs inside the bot thread's own event loop. Telethon is free to schedule background tasks without interfering with Textual's loop. """ @@ -870,7 +870,7 @@ class MonitorApp(App): patterns = compile_patterns(config.TARGET_KEYWORDS) bus.post(bus.EvStatus( - f"Starting — {len(config.WATCHED_CHANNELS)} channel(s), " + f"Starting - {len(config.WATCHED_CHANNELS)} channel(s), " f"{len(patterns)} pattern(s)" )) @@ -894,9 +894,9 @@ class MonitorApp(App): await user_client.connect() log.info("[bot] user_client connected, checking auth...") if not await user_client.is_user_authorized(): - log.error("[bot] user_client not authorized — run: python main.py --no-tui") + log.error("[bot] user_client not authorized - run: python main.py --no-tui") bus.post(bus.EvStatus( - "Not authorized — run --no-tui once to complete login", + "Not authorized - run --no-tui once to complete login", level="error", )) return @@ -962,7 +962,7 @@ class MonitorApp(App): log.info(f"[bot] Handler registered for {len(channels)} channel(s)") bus.post(bus.EvStatus(f"Watching {len(channels)} channel(s)")) - # Channel-change event — lives on this (bot) loop. 
+ # Channel-change event - lives on this (bot) loop. # Textual signals it thread-safely via _signal_channel_changed(). _ch_changed = asyncio.Event() self._bot_loop_channel_event = _ch_changed @@ -971,7 +971,7 @@ class MonitorApp(App): bus.post(bus.EvStatus("Live listener active")) await backfill_all(user_client, bot_client, patterns) - bus.post(bus.EvStatus("Backfill complete — monitoring live")) + bus.post(bus.EvStatus("Backfill complete - monitoring live")) async def _watch_channels(): while True: @@ -1009,7 +1009,7 @@ class MonitorApp(App): # ─── Entry point ────────────────────────────────────────────────────────────── def run_tui() -> None: - # Do NOT call bus.init_bus() here — the Queue must be created inside + # Do NOT call bus.init_bus() here - the Queue must be created inside # Textual's event loop (see MonitorApp.on_mount). Calling it here # would bind the Queue to the outer loop which is discarded when # App.run() creates a new one. diff --git a/tui/events.md b/tui/events.md index 9bcadeb..f313fde 100644 --- a/tui/events.md +++ b/tui/events.md @@ -14,11 +14,11 @@ from tui.events import set_bot_context, signal_channel_changed ``` ### `init_bus() -> queue.Queue` -Creates the `queue.Queue`. Called inside `MonitorApp.on_mount()` — **must run on Textual's event loop**, not before `App.run()`. +Creates the `queue.Queue`. Called inside `MonitorApp.on_mount()` - **must run on Textual's event loop**, not before `App.run()`. ### `post(event: Any) -> None` Fire-and-forget from any thread. Delivers to the TUI queue **and** all subscriber queues. -Uses `queue.Queue.put_nowait()` — never blocks. +Uses `queue.Queue.put_nowait()` - never blocks. ### `get_bus() -> queue.Queue | None` Returns the TUI queue for `_drain_bus()` to consume. diff --git a/tui/events.py b/tui/events.py index 5b61f07..025b078 100644 --- a/tui/events.py +++ b/tui/events.py @@ -1,5 +1,5 @@ """ -tui_events.py — Thread-safe event bus between the bot backend and the TUI. 
+tui_events.py - Thread-safe event bus between the bot backend and the TUI. The bot backend runs in a dedicated thread with its own asyncio event loop (completely isolated from Textual's loop). Events are posted via a standard @@ -18,7 +18,7 @@ import threading from dataclasses import dataclass, field from typing import Any -# Thread-safe queue — works across the bot thread and Textual's thread. +# Thread-safe queue - works across the bot thread and Textual's thread. _queue: queue.Queue | None = None _queue_lock = threading.Lock() diff --git a/utils/__init__.py b/utils/__init__.py index 8c6b899..7b9df02 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -1 +1 @@ -"""utils — pure logic modules with no Telegram dependencies.""" +"""utils - pure logic modules with no Telegram dependencies.""" diff --git a/utils/cache.md b/utils/cache.md index 91ebaec..08511de 100644 --- a/utils/cache.md +++ b/utils/cache.md @@ -11,7 +11,7 @@ from utils.cache import is_seen, mark_seen ### `is_seen(file_id: int) -> bool` Returns `True` if this document ID has been processed before. -Loads from disk on every call (safe for multi-process, slightly slow for hot loops — not an issue given download cadence). +Loads from disk on every call (safe for multi-process, slightly slow for hot loops - not an issue given download cadence). ### `mark_seen(file_id: int) -> None` Adds `file_id` to the cache and persists to disk. @@ -21,12 +21,12 @@ Adds `file_id` to the cache and persists to disk. ## Storage - **File:** `data/cache.json` -- **Format:** JSON array of integers — `[123456789, 987654321, ...]` -- **No expiry** — grows indefinitely. Safe to delete to re-process all files. +- **Format:** JSON array of integers - `[123456789, 987654321, ...]` +- **No expiry** - grows indefinitely. Safe to delete to re-process all files. 
--- ## Notes -- `is_seen` + `mark_seen` are called in `core/scraper.py` after a successful download+process cycle, not before — so a file that fails mid-process will be retried on next run. +- `is_seen` + `mark_seen` are called in `core/scraper.py` after a successful download+process cycle, not before - so a file that fails mid-process will be retried on next run. - Not thread-safe (load/modify/save is not atomic). Acceptable because downloads are sequential within the bot loop. diff --git a/utils/cache.py b/utils/cache.py index 8182eeb..8332e40 100644 --- a/utils/cache.py +++ b/utils/cache.py @@ -1,5 +1,5 @@ """ -cache.py — Tracks already-processed file IDs to avoid redownloading. +cache.py - Tracks already-processed file IDs to avoid redownloading. Persists to a simple JSON file on disk. """ diff --git a/utils/database.md b/utils/database.md index 92909f8..e9d195c 100644 --- a/utils/database.md +++ b/utils/database.md @@ -85,5 +85,5 @@ Indexes: `url`, `username`, `source`, `timestamp`, `severity`. ## Notes - Each query opens and closes its own connection via the `_connect()` context manager. -- `conn.row_factory = sqlite3.Row` — rows support both index and column-name access. +- `conn.row_factory = sqlite3.Row` - rows support both index and column-name access. - Transactions: commit on success, rollback on exception. diff --git a/utils/database.py b/utils/database.py index 6745fce..abccd22 100644 --- a/utils/database.py +++ b/utils/database.py @@ -1,5 +1,5 @@ """ -database.py — SQLite storage for credential hits. +database.py - SQLite storage for credential hits. Schema: hits table: diff --git a/utils/scorer.md b/utils/scorer.md index 9d9e59c..907e2e6 100644 --- a/utils/scorer.md +++ b/utils/scorer.md @@ -51,7 +51,7 @@ Check 6 (no severity change): flags weak passwords ≤6 chars or common strings. ## Employee domain matching Keywords in `config.TARGET_KEYWORDS` containing `@` become employee patterns. 
-Pattern: `@(?:[^a-zA-Z0-9.\-]|$)` — requires literal `@` before the domain. +Pattern: `@(?:[^a-zA-Z0-9.\-]|$)` - requires literal `@` before the domain. **`user@gmail.com` on a URL containing `myorg.cl` does NOT trigger CRITICAL.** Keywords without `@` go only to `ORG_DOMAINS` (LOW baseline). @@ -64,11 +64,11 @@ Separators: `:` `;` `,` `|` `\t` (any of these between the three fields). The URL field handles two common stealer-log complications: -1. **`://` not treated as separator** — the optional scheme prefix `(?:https?|ftp)://` is consumed before the character-class match, so `https://` never gets split at the colon. +1. **`://` not treated as separator** - the optional scheme prefix `(?:https?|ftp)://` is consumed before the character-class match, so `https://` never gets split at the colon. -2. **Port + path consumed into the URL** — the optional group `(?::\d+/[^\s:;,|\t]*)` absorbs `:port/path` when the port is pure digits immediately followed by `/`. This correctly handles `http://host:8085/path/:user:pass` but intentionally skips patterns like `:24145487-8` (RUT number — hyphen after digits, no `/`). +2. **Port + path consumed into the URL** - the optional group `(?::\d+/[^\s:;,|\t]*)` absorbs `:port/path` when the port is pure digits immediately followed by `/`. This correctly handles `http://host:8085/path/:user:pass` but intentionally skips patterns like `:24145487-8` (RUT number - hyphen after digits, no `/`). -**Known limitation:** A bare port with no path (e.g. `https://host:8080:user:pass`) will mis-parse `8080` as the username. This is not observed in practice — stealer logs always include at least a trailing `/`. +**Known limitation:** A bare port with no path (e.g. `https://host:8080:user:pass`) will mis-parse `8080` as the username. This is not observed in practice - stealer logs always include at least a trailing `/`. 
--- @@ -79,7 +79,7 @@ The URL field handles two common stealer-log complications: | `EMPLOYEE_DOMAINS` | `list[tuple[str, Pattern]]` | `(domain_str, anchored_pattern)` for `@`-keywords | | `ORG_DOMAINS` | `list[Pattern]` | Plain domain patterns for all keywords | -scorer uses `import config as _config` (not `from config import TARGET_KEYWORDS`), so patching `config.TARGET_KEYWORDS` at runtime is sufficient — `_build_*` reads the live module attribute. +scorer uses `import config as _config` (not `from config import TARGET_KEYWORDS`), so patching `config.TARGET_KEYWORDS` at runtime is sufficient - `_build_*` reads the live module attribute. To rebuild after editing `config.TARGET_KEYWORDS` at runtime: ```python diff --git a/utils/scorer.py b/utils/scorer.py index 079bb44..54a8967 100644 --- a/utils/scorer.py +++ b/utils/scorer.py @@ -1,24 +1,24 @@ """ -scorer.py — Severity scoring for credential hits. +scorer.py - Severity scoring for credential hits. Scoring logic (highest match wins): - CRITICAL — Employee credentials (internal email domain) + CRITICAL - Employee credentials (internal email domain) e.g. jdoe@yourclinic.cl:password - — Admin/privileged service URLs + - Admin/privileged service URLs e.g. admin., vpn., ssh., rdp., gitlab., jira. - HIGH — Internal-facing services + HIGH - Internal-facing services e.g. intranet., erp., crm., portal., citrix. - — Password manager or SSO hits - — Any credential where username looks like an employee email + - Password manager or SSO hits + - Any credential where username looks like an employee email - MEDIUM — Client-facing portals + MEDIUM - Client-facing portals e.g. app., patient., client., booking. 
- — Domain match on a non-privileged service + - Domain match on a non-privileged service - LOW — Generic domain keyword match - — No URL parsed, just a raw domain mention + LOW - Generic domain keyword match + - No URL parsed, just a raw domain mention Each scored hit gets a dict with: - severity: CRITICAL / HIGH / MEDIUM / LOW diff --git a/web/app.py b/web/app.py index f2e5406..03cbdfb 100644 --- a/web/app.py +++ b/web/app.py @@ -1,5 +1,5 @@ """ -web/app.py — FastAPI application factory. +web/app.py - FastAPI application factory. Usage: from web.app import create_app diff --git a/web/auth.py b/web/auth.py index 4fe206b..3260b8a 100644 --- a/web/auth.py +++ b/web/auth.py @@ -1,9 +1,9 @@ """ -web/auth.py — JWT signing/verification and bcrypt password helpers. +web/auth.py - JWT signing/verification and bcrypt password helpers. Tokens: - access — HS256, 15 min TTL, payload: {sub, role, type:"access"} - refresh — HS256, 7 day TTL, payload: {sub, jti, type:"refresh"} + access - HS256, 15 min TTL, payload: {sub, role, type:"access"} + refresh - HS256, 7 day TTL, payload: {sub, jti, type:"refresh"} Both tokens live in httpOnly SameSite=Strict cookies. The `type` claim prevents an access token being used as a refresh token. diff --git a/web/db.py b/web/db.py index f1ffceb..c58877e 100644 --- a/web/db.py +++ b/web/db.py @@ -1,9 +1,9 @@ """ -web/db.py — SQLite user store for the web frontend. +web/db.py - SQLite user store for the web frontend. Tables: - users — credentials + role + active flag - refresh_tokens — JTI-indexed refresh token revocation list + users - credentials + role + active flag + refresh_tokens - JTI-indexed refresh token revocation list Bootstrap: on first init, creates a superadmin from WEB_ADMIN_USER / WEB_ADMIN_PASS env vars (required only on first run if the DB doesn't exist yet). 
@@ -63,7 +63,9 @@ def init_db() -> None: admin_pass = os.environ.get("WEB_ADMIN_PASS") if not admin_pass: raise RuntimeError( - "WEB_ADMIN_PASS env var is required on first run to create the superadmin." + "WEB_ADMIN_PASS env var is required on first run to bootstrap the superadmin. " + "Add WEB_ADMIN_PASS= (and optionally WEB_ADMIN_USER=) " + "to your .env file, then restart." ) conn.execute( "INSERT INTO users (id, username, password_hash, role, created_at) VALUES (?,?,?,?,?)", diff --git a/web/dependencies.py b/web/dependencies.py index 4cd41b9..bc76209 100644 --- a/web/dependencies.py +++ b/web/dependencies.py @@ -1,5 +1,5 @@ """ -web/dependencies.py — FastAPI dependency functions. +web/dependencies.py - FastAPI dependency functions. get_current_user: reads the access_token cookie, decodes + validates it, loads the user row from web.db. Raises 401 if anything fails. diff --git a/web/models.py b/web/models.py index bb23b70..e287689 100644 --- a/web/models.py +++ b/web/models.py @@ -1,5 +1,5 @@ """ -web/models.py — Pydantic request/response schemas. +web/models.py - Pydantic request/response schemas. """ import re diff --git a/web/routes/auth.py b/web/routes/auth.py index 131bbc4..03bb1b8 100644 --- a/web/routes/auth.py +++ b/web/routes/auth.py @@ -1,9 +1,9 @@ """ -web/routes/auth.py — Login, logout, token refresh. +web/routes/auth.py - Login, logout, token refresh. 
-POST /login — form submit; sets access_token + refresh_token cookies -POST /logout — revokes refresh token, clears cookies -POST /refresh — exchanges refresh_token cookie for a new access_token +POST /login - form submit; sets access_token + refresh_token cookies +POST /logout - revokes refresh token, clears cookies +POST /refresh - exchanges refresh_token cookie for a new access_token """ from fastapi import APIRouter, Form, HTTPException, Request, Response, status diff --git a/web/routes/config_routes.py b/web/routes/config_routes.py index 800a10e..e369372 100644 --- a/web/routes/config_routes.py +++ b/web/routes/config_routes.py @@ -1,5 +1,5 @@ """ -web/routes/config_routes.py — Keyword groups and channel list management. +web/routes/config_routes.py - Keyword groups and channel list management. GET /config/keywords → render groups editor PUT /config/keywords → validate + save groups, reload scorer diff --git a/web/routes/dashboard.py b/web/routes/dashboard.py index 47dd0da..caeaeec 100644 --- a/web/routes/dashboard.py +++ b/web/routes/dashboard.py @@ -1,5 +1,5 @@ """ -web/routes/dashboard.py — Dashboard views and SSE live stream. +web/routes/dashboard.py - Dashboard views and SSE live stream. GET / → redirect to /dashboard GET /dashboard → overview: all groups, stats, live hit feed diff --git a/web/routes/users.py b/web/routes/users.py index 44240c8..6da2711 100644 --- a/web/routes/users.py +++ b/web/routes/users.py @@ -1,5 +1,5 @@ """ -web/routes/users.py — User CRUD (superadmin only). +web/routes/users.py - User CRUD (superadmin only). GET /users → list all users POST /users → create a new user