From 48f486ac97acdb6ea963cf392d036fad23058d50 Mon Sep 17 00:00:00 2001 From: anti Date: Thu, 2 Apr 2026 01:58:49 -0300 Subject: [PATCH] Initial commit: ULPgrammer - Core Telegram monitoring pipeline (scraper, processor, notifier, downloaders) - Textual TUI frontend with thread-safe event bus - SQLite persistence, severity scoring, dedup cache - Fixed ULP parser: handles https:// truncation, port+path URLs, semicolon separator - Test suite: 88 tests across scorer, cache, database, processor --- .claudeignore | 25 + .env.example | 22 + .gitignore | 28 ++ QUICK_REF.md | 182 +++++++ README.md | 146 ++++++ config.py | 100 ++++ core/__init__.py | 1 + core/bot_downloader.md | 68 +++ core/bot_downloader.py | 161 +++++++ core/notifier.md | 67 +++ core/notifier.py | 248 ++++++++++ core/processor.md | 69 +++ core/processor.py | 233 +++++++++ core/scraper.md | 65 +++ core/scraper.py | 410 ++++++++++++++++ core/tdl_downloader.md | 70 +++ core/tdl_downloader.py | 363 ++++++++++++++ data/.gitkeep | 0 logs/monitor.log | 54 +++ main.py | 142 ++++++ pytest.ini | 2 + requirements-dev.txt | 1 + requirements.txt | 16 + tests/__init__.py | 0 tests/conftest.py | 31 ++ tests/test_cache.py | 55 +++ tests/test_database.py | 188 ++++++++ tests/test_processor.py | 223 +++++++++ tests/test_scorer.py | 282 +++++++++++ tui/__init__.py | 1 + tui/app.md | 130 +++++ tui/app.py | 1016 +++++++++++++++++++++++++++++++++++++++ tui/events.md | 66 +++ tui/events.py | 114 +++++ utils/__init__.py | 1 + utils/cache.md | 32 ++ utils/cache.py | 38 ++ utils/database.md | 89 ++++ utils/database.py | 171 +++++++ utils/scorer.md | 87 ++++ utils/scorer.py | 273 +++++++++++ 41 files changed, 5270 insertions(+) create mode 100644 .claudeignore create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 QUICK_REF.md create mode 100644 README.md create mode 100644 config.py create mode 100644 core/__init__.py create mode 100644 core/bot_downloader.md create mode 100644 core/bot_downloader.py 
create mode 100644 core/notifier.md create mode 100644 core/notifier.py create mode 100644 core/processor.md create mode 100644 core/processor.py create mode 100644 core/scraper.md create mode 100644 core/scraper.py create mode 100644 core/tdl_downloader.md create mode 100644 core/tdl_downloader.py create mode 100644 data/.gitkeep create mode 100644 logs/monitor.log create mode 100644 main.py create mode 100644 pytest.ini create mode 100644 requirements-dev.txt create mode 100644 requirements.txt create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/test_cache.py create mode 100644 tests/test_database.py create mode 100644 tests/test_processor.py create mode 100644 tests/test_scorer.py create mode 100644 tui/__init__.py create mode 100644 tui/app.md create mode 100644 tui/app.py create mode 100644 tui/events.md create mode 100644 tui/events.py create mode 100644 utils/__init__.py create mode 100644 utils/cache.md create mode 100644 utils/cache.py create mode 100644 utils/database.md create mode 100644 utils/database.py create mode 100644 utils/scorer.md create mode 100644 utils/scorer.py diff --git a/.claudeignore b/.claudeignore new file mode 100644 index 0000000..a99e0af --- /dev/null +++ b/.claudeignore @@ -0,0 +1,25 @@ +# Sessions +*.session +*.session-journal +bot_session* + +# Data — keep the folder, ignore contents +data/hits.db +data/hits.txt +data/hits.csv +data/dedup.json +data/cache.json +data/tmp/ +data/logs/ +!data/.gitkeep + +# Env +.env + +# Python +__pycache__/ +*.pyc +*.pyo +.venv/ +venv/ + diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..6b949bf --- /dev/null +++ b/.env.example @@ -0,0 +1,22 @@ +# ─── Telegram API credentials ────────────────────────────────────────────── +# Get these from https://my.telegram.org → API development tools +API_ID=12345678 +API_HASH=your_api_hash_here + +# ─── Bot credentials ──────────────────────────────────────────────────────── +# Create a 
bot via @BotFather and paste the token here +BOT_TOKEN=123456789:ABCdefGHIjklMNOpqrSTUvwxYZ + +# ─── Alert destination ────────────────────────────────────────────────────── +# Chat ID to send hit notifications to (your personal ID or a group) +# Tip: message @userinfobot on Telegram to get your ID +NOTIFY_CHAT_ID=987654321 + +# ─── Session name (just a filename, no extension needed) ──────────────────── +SESSION_NAME=monitor_session + +# ─── tdl (fast Go downloader) — optional but strongly recommended ─────────── +# Install: https://github.com/iyear/tdl +# After installing, run once: tdl login -n monitor_session +# SESSION_NAME above is shared between Telethon and tdl — no double login needed. +# If tdl is not on PATH the bot falls back to Telethon automatically. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..79805e2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,28 @@ +# Sessions +*.session +*.session-journal +bot_session* + +# Data — keep the folder, ignore contents +data/hits.db +data/hits.txt +data/hits.csv +data/dedup.json +data/cache.json +data/tmp/ +data/logs/ +!data/.gitkeep + +# Env +.env + +# Python +__pycache__/ +*.pyc +*.pyo +.venv/ +venv/ + +# Claude things +CLAUDE.md +.claude/* diff --git a/QUICK_REF.md b/QUICK_REF.md new file mode 100644 index 0000000..d9bb89b --- /dev/null +++ b/QUICK_REF.md @@ -0,0 +1,182 @@ +# ULP Monitor — Quick Reference + +> For Claude Code: read the per-file `.md` alongside each `.py` before editing. +> Full docs in `README.md`. 
+ +--- + +## Project layout + +``` +ulp_monitor/ +├── main.py Entry point (--no-tui flag for CLI mode) +├── config.py All settings — edit this for keywords, channels, paths +│ +├── core/ Telegram I/O pipeline (all async, Telethon-dependent) +│ ├── scraper.py Live listener + backfill orchestration +│ ├── tdl_downloader.py tdl subprocess wrapper + Telethon fallback +│ ├── bot_downloader.py Inline "DOWNLOAD" button click flow +│ ├── processor.py Archive extraction (.zip/.7z/.rar) + line search +│ └── notifier.py Scoring → dedup → DB → hits.txt/csv → Telegram alert +│ +├── utils/ Pure logic, no Telegram deps, no async +│ ├── scorer.py Severity scoring (CRITICAL/HIGH/MEDIUM/LOW) +│ ├── cache.py Seen file-ID dedup (data/cache.json) +│ └── database.py SQLite read/write (data/hits.db) +│ +├── tui/ Textual TUI — runs in main thread +│ ├── app.py MonitorApp + all screens + bot thread launcher +│ └── events.py Thread-safe queue.Queue event bus +│ +└── data/ Runtime output — gitignored + ├── hits.db + ├── hits.txt + ├── hits.csv + ├── cache.json + ├── dedup.json + └── logs/monitor.log +``` + +--- + +## Data flow + +``` +Telegram channel + └─ new message with file / download button + │ + ├─ core/scraper.py detects + guards (size, extension, dedup) + │ + ├─ core/tdl_downloader.py downloads via tdl (batched) + │ └─ core/scraper.py Telethon fallback if tdl fails + │ + ├─ core/bot_downloader.py handles inline button → bot reply flow + │ + ├─ core/processor.py extracts archive → searches .txt line by line + │ + └─ core/notifier.py scores → deduplicates → persists → alerts + ├─ utils/scorer.py + ├─ utils/database.py + └─ tui/events.py posts EvHit to TUI +``` + +--- + +## Threading architecture + +``` +main thread (Textual's event loop) + ├─ MonitorApp.on_mount() + │ ├─ bus.init_bus() creates queue.Queue on THIS loop + │ ├─ threading.Thread → _run_bot_thread() + │ └─ set_interval(0.1, _drain_bus) + │ + ├─ _drain_bus() [every 100ms] + │ └─ queue.Queue.get_nowait() → dispatch to widgets 
+ │ + └─ Textual widgets, screens, keybindings + +bot thread (own asyncio event loop) + └─ _bot_main() + ├─ bot_client.connect() + sign_in() + ├─ user_client.connect() + is_user_authorized() + ├─ warm_entity_cache() + ├─ _make_handler() → NewMessage handler registered + ├─ backfill_all() + └─ run_until_disconnected() + _watch_channels() [gathered] + +cross-thread communication + bot → TUI: bus.post(event) [queue.Queue.put_nowait, always safe] + TUI → bot: loop.call_soon_threadsafe() [asyncio.Event.set for channel changes] +``` + +--- + +## Config quick reference (`config.py`) + +| Setting | Type | Description | +|---------|------|-------------| +| `API_ID` | int | From my.telegram.org | +| `API_HASH` | str | From my.telegram.org | +| `BOT_TOKEN` | str | From @BotFather | +| `NOTIFY_CHAT_ID` | int | Your Telegram user/group ID | +| `SESSION_NAME` | str | Session file name (default: `monitor_session`) | +| `TARGET_KEYWORDS` | list[str] | Regex patterns. `@`-prefixed → employee email (CRITICAL). 
Plain → domain match (LOW) | +| `WATCHED_CHANNELS` | list[str\|int] | Usernames or `-100xxxxxxxxxx` IDs | +| `BACKFILL_LIMIT` | int | Messages to scan per channel on startup (0 = off) | +| `ALLOWED_EXTENSIONS` | set | `.txt .zip .7z .rar` | +| `MAX_FILE_SIZE` | int | Bytes (default 4 GB) | +| `ARCHIVE_PASSWORDS` | list[bytes] | Tried in order on locked archives | +| `TDL_NAMESPACE` | str\|None | Namespace that was passed to `tdl login -n` (`None` omits `-n`) | +| `TDL_THREADS` | int | Chunk workers per file (`-t`) | +| `TDL_PERFILE` | int | Concurrent files per tdl call (`-l`) | +| `TDL_AMOUNT` | int | Messages per batch | +| `TEMP_DIR` | Path | `data/tmp` | +| `HITS_FILE` | Path | `data/hits.txt` | +| `LOG_FILE` | Path | `data/logs/monitor.log` | + +--- + +## Severity scoring summary + +| Severity | Score | Triggers | +|----------|-------|----------| +| CRITICAL | 40 | Employee email (`@myorg.cl` in username) · Privileged service URL (admin, vpn, rdp, gitlab…) | +| HIGH | 30 | Internal service URL (intranet, erp, sso, owa…) | +| MEDIUM | 20 | Client-facing URL (app, booking, helpdesk…) | +| LOW | 10 | Org domain appears anywhere in line | + +`@`-keyword rule: pattern requires literal `@` before domain — `user@gmail.com` on a URL containing `myorg.cl` does **not** trigger CRITICAL. 
+ +--- + +## TUI keybindings + +| Key | Action | Screen | +|-----|--------|--------| +| `s` | Search hits DB | → SearchScreen | +| `h` | Browse hits by severity | → HitsDBScreen | +| `k` | Edit keyword patterns live | → KeywordsScreen | +| `c` | Clear download + hits logs | main | +| `r` | Force-refresh stats bar | main | +| `q` / `ctrl+c` | Quit | any | +| `Escape` | Back to main | sub-screens | +| `1`/`2`/`3`/`4` | Filter CRITICAL/HIGH/MEDIUM/LOW | HitsDBScreen | +| `r` | Load recent 50 | HitsDBScreen | + +--- + +## Per-file reference docs + +| File | Reference | +|------|-----------| +| `utils/scorer.py` | `utils/scorer.md` | +| `utils/cache.py` | `utils/cache.md` | +| `utils/database.py` | `utils/database.md` | +| `core/scraper.py` | `core/scraper.md` | +| `core/processor.py` | `core/processor.md` | +| `core/notifier.py` | `core/notifier.md` | +| `core/tdl_downloader.py` | `core/tdl_downloader.md` | +| `core/bot_downloader.py` | `core/bot_downloader.md` | +| `tui/app.py` | `tui/app.md` | +| `tui/events.py` | `tui/events.md` | + +--- + +## Common tasks + +**Add a new keyword at runtime:** open the TUI → press `k` → add pattern → active immediately. Copy to `config.TARGET_KEYWORDS` to persist. + +**Add a channel at runtime:** type username or numeric ID in the Channels panel → ➕ Add. Handler re-registers immediately. Edit `config.WATCHED_CHANNELS` to persist. + +**Query hits from CLI:** +```bash +sqlite3 data/hits.db "SELECT severity, username, url FROM hits WHERE seen_before=0 ORDER BY score DESC LIMIT 20" +``` + +**Re-process all files** (wipe cache): +```bash +rm data/cache.json data/dedup.json +``` + +**Check what's happening:** `tail -f data/logs/monitor.log` diff --git a/README.md b/README.md new file mode 100644 index 0000000..c7d0493 --- /dev/null +++ b/README.md @@ -0,0 +1,146 @@ +# ULP Credential Monitor + +A Telegram-based credential exposure monitor for threat intelligence teams. 
+Watches channels for combo/stealer log files and alerts you when your +organization's credentials appear in them. + +--- + +## How it works + +``` +User session (Telethon) + └─ watches N channels + └─ detects file attachments (.txt, .zip, .7z, .rar) + └─ downloads → extracts → searches line by line + └─ hit? → writes to data/ + sends bot alert + └─ no hit? → deletes file, moves on +``` + +--- + +## Project structure + +``` +ulp_monitor/ +├── main.py Entry point +├── config.py All settings (keywords, channels, paths) +│ +├── core/ Telegram I/O pipeline +│ ├── scraper.py Live listener + backfill +│ ├── tdl_downloader.py Fast downloads via tdl (Go MTProto) +│ ├── bot_downloader.py Inline button / bot-dispatched file flows +│ ├── processor.py Archive extraction + line-by-line search +│ └── notifier.py hits.txt / hits.csv writer + bot alerts +│ +├── utils/ Pure logic — no Telegram dependencies +│ ├── scorer.py Hit severity scoring +│ ├── cache.py Seen-file deduplication +│ └── database.py SQLite persistence layer +│ +├── tui/ Textual TUI frontend +│ ├── app.py MonitorApp + all Screen classes +│ └── events.py Thread-safe event bus (bot thread → TUI) +│ +└── data/ Runtime-generated (gitignored) + ├── hits.db SQLite database + ├── hits.txt Human-readable hit log + ├── hits.csv CSV hit log (importable into Excel / pandas) + ├── dedup.json Deduplication hashes + ├── cache.json Seen file-ID cache + └── logs/monitor.log +``` + +--- + +## Setup + +### 1. Get Telegram API credentials +- Go to https://my.telegram.org → *API development tools* +- Create an app → note your `api_id` and `api_hash` + +### 2. Create a bot +- Message [@BotFather](https://t.me/BotFather) → `/newbot` +- Start a chat with your new bot before running + +### 3. Get your chat ID +- Message [@userinfobot](https://t.me/userinfobot) + +### 4. 
Configure + +```bash +cp .env.example .env +# fill in API_ID, API_HASH, BOT_TOKEN, NOTIFY_CHAT_ID +``` + +Open `config.py` and set: + +- **`TARGET_KEYWORDS`** — your org's domains and email patterns. + Keywords with `@` (e.g. `r"@myorg\.cl"`) are **employee email domains** → CRITICAL. + Keywords without `@` are plain domain matches → LOW baseline. +- **`WATCHED_CHANNELS`** — channel usernames or numeric IDs +- **`BACKFILL_LIMIT`** — past messages to scan per channel on startup + +### 5. Install dependencies + +```bash +pip install -r requirements.txt +# rarfile needs the unrar binary: +# Ubuntu/Debian: sudo apt install unrar +# macOS: brew install rar +``` + +### 5a. Install tdl (strongly recommended) + +```bash +curl -sSL https://raw.githubusercontent.com/iyear/tdl/main/scripts/install.sh | bash +tdl login -n monitor_session +``` + +### 6. First run — complete Telegram auth + +```bash +python main.py --no-tui +# follow the phone + 2FA prompts once +``` + +### 7. Run + +```bash +python main.py # TUI mode (recommended) +python main.py --no-tui # plain CLI +``` + +--- + +## TUI keybindings + +| Key | Action | +|-----|--------| +| `s` | Search hits database | +| `h` | Browse hits by severity | +| `k` | Edit keyword patterns live | +| `c` | Clear logs | +| `r` | Refresh stats | +| `q` | Quit | + +--- + +## Output + +| File | Description | +|------|-------------| +| `data/hits.db` | SQLite — all hits with scores, severity, dedup flag | +| `data/hits.txt` | Human-readable grouped log | +| `data/hits.csv` | CSV — easy to pull into Excel / pandas | +| `data/logs/monitor.log` | Full run log | + +Telegram alerts fire for CRITICAL / HIGH / MEDIUM only. LOW is stored silently. + +--- + +## Notes + +- **Session files are sensitive** — equivalent to a logged-in account. Gitignored, never share. +- **Flood limits** — `FloodWaitError` is handled automatically. +- **Private channels** — your user account must already be a member. 
"""
config.py — Loads credentials from .env and defines all static settings.

Importing this module has side effects: it calls ``load_dotenv()`` and then
reads the required variables from the environment. A missing ``API_ID``,
``API_HASH``, ``BOT_TOKEN`` or ``NOTIFY_CHAT_ID`` raises ``KeyError``; a
non-integer ``API_ID`` / ``NOTIFY_CHAT_ID`` raises ``ValueError``. No other
validation is performed.
"""

import os
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()

# ─── Timeouts ────────────────────────────────────────────────────────────────
# Seconds to wait for a bot to answer a download-button click.
BOT_REPLY_TIMEOUT = 10

# ─── Telegram credentials ────────────────────────────────────────────────────
API_ID = int(os.environ["API_ID"])
API_HASH = os.environ["API_HASH"]
BOT_TOKEN = os.environ["BOT_TOKEN"]
NOTIFY_CHAT_ID = int(os.environ["NOTIFY_CHAT_ID"])
SESSION_NAME = os.getenv("SESSION_NAME", "monitor_session")

# ─── Target keywords ─────────────────────────────────────────────────────────
# Add your org's domains, email patterns, IP ranges, known usernames, etc.
# All patterns are case-insensitive regex.
TARGET_KEYWORDS: list[str] = [
    r"sanatorioaleman\.cl",
    r"@sanatorioaleman\.cl",
    # r"192\.168\.10\.",  # internal IP range example
    # r"specificuser",  # known internal usernames
]

# ─── Channels to watch ───────────────────────────────────────────────────────
# Use usernames (without @) or numeric channel IDs (-100xxxxxxxxxx)
WATCHED_CHANNELS: list[str | int] = [
    #-1002230225603,
    "cloudxlog",
    #-1001967030016,  # daisycloud
    #"berserklogs",  # berserklogs
    #"BorwitaFreeLogs",  # borwita
    -1002748707556,  # darkcloud
    -1001684073398,  # BHF Cloud
    -1003163621939,  # Wich Love from R
    -1003611713618,  # Khazan Cloud
    -1003328682684,  # LogsPlanet
    -1003204260194,  # JDP
    -1002828367761,  # HesoyamCloud
    -1003513974925,  # Slurm Logs
    -1003599300787,  # Arhont Corp
    -1002582513379,  # OnlyLogs
    -1002788333372,  # Ickis Cloud
    #-1001234567890,  # private channel by ID
]

# ─── File handling ───────────────────────────────────────────────────────────
# All runtime output lives under data/, which is what the project docs describe
# and what .gitignore excludes. Writing hits or logs anywhere else leaves
# harvested credentials un-ignored and easy to commit by accident.
TEMP_DIR = Path("./data/tmp")
HITS_FILE = Path("./data/hits.txt")
LOG_FILE = Path("./data/logs/monitor.log")

# Extensions to download and process
ALLOWED_EXTENSIONS = {".txt", ".zip", ".7z", ".rar"}

# Max file size to download (bytes): 4 GB, the Telegram Premium upload limit.
# Larger files are skipped to avoid abuse of your session.
MAX_FILE_SIZE = 4 * 1024 * 1024 * 1024

# ─── Archive passwords to try ────────────────────────────────────────────────
# Tried in list order; the empty entry covers archives with a blank password.
ARCHIVE_PASSWORDS: list[bytes] = [
    b"1234",
    b"0000",
    b"infected",
    b"telegram",
    b"password",
    b"12345",
    b"",
    b"Borwita",
    b"@WichLoveFromR",
]

# ─── Backfill settings ───────────────────────────────────────────────────────
# How many historical messages to scan per channel on startup (0 = skip backfill)
BACKFILL_LIMIT = 500

# ─── tdl downloader settings ─────────────────────────────────────────────────
# Namespace tdl was logged into. Run `tdl login` with no -n flag → namespace
# is "default". Run `tdl login -n foo` → namespace is "foo".
# Set to None to omit -n entirely (tdl will use "default" anyway).
# NOTE(review): .env.example and README tell users to log in with the
# SESSION_NAME namespace, but this value is hard-coded — confirm which is meant.
TDL_NAMESPACE: str | None = "ulpmon"

# Parallel chunk workers per file (-t / --threads global flag)
TDL_THREADS = 8

# Max concurrent files per tdl invocation (-l / --limit global flag)
TDL_PERFILE = 4

# Max messages to batch into a single tdl invocation during backfill.
# tdl handles the parallelism internally via -l and -t.
TDL_AMOUNT = 4

# Whether to use a Telegram takeout session for downloads (lower flood limits).
# Takeout sessions are rate-limited differently — good for bulk backfill.
TDL_TAKEOUT = True
Some Telegram channels post files via a bot behind a button rather than directly attaching them. + +## Public API + +```python +from core.bot_downloader import ( + handle_bot_download_message, + has_download_button, + extract_password, +) +``` + +### `handle_bot_download_message(client, bot, msg, source_name, patterns, password=None)` +**async.** Full pipeline: +1. Detect download button +2. Click it (URL button → `/start payload` to the bot; callback button → `.click()`) +3. Wait up to `BOT_REPLY_TIMEOUT` seconds for the bot to send a file back +4. Hand each file response to `core.scraper.handle_message()` + +### `has_download_button(msg) -> bool` +Returns `True` if the message contains a recognisable download button. +Checked in live handler and backfill before calling this module. + +### `extract_password(msg) -> str | None` +Scans message text for `Pass: ...` / `Password: ...` / `Contraseña: ...` patterns. +Returns the extracted password string, or `None`. + +--- + +## Button detection + +Recognised button text keywords (case-insensitive): +``` +DOWNLOAD, DESCARGAR, GET FILE, GET PACK, ⬇, 📥 +``` + +--- + +## URL button flow (most common) + +``` +Button URL: https://t.me/SomeBot?start=ABC123 + → parse bot username + payload + → client.send_message(bot_entity, "/start ABC123") + → poll get_messages(bot_entity, limit=3) every 1s for BOT_REPLY_TIMEOUT seconds + → return file messages found +``` + +## Callback button flow (fallback) + +``` +btn.click() +→ sleep 2s +→ get_messages(sender, limit=5) +→ return file messages found +``` + +--- + +## Constants + +| Name | Value | Description | +|------|-------|-------------| +| `BOT_REPLY_TIMEOUT` | `10` | Seconds to wait for bot file reply | +| `DOWNLOAD_BUTTON_KEYWORDS` | see above | Button text triggers | +| `PASSWORD_PATTERN` | regex | Matches `Pass[word]: value` in message text | diff --git a/core/bot_downloader.py b/core/bot_downloader.py new file mode 100644 index 0000000..b991765 --- /dev/null +++ 
b/core/bot_downloader.py @@ -0,0 +1,161 @@ +""" +bot_downloader.py — Handles "click to download" inline button flows. + +Some Telegram channels post messages with a DOWNLOAD button that triggers +a bot to send you the actual file. This module simulates that click and +captures the bot's file response. +""" + +import asyncio +import re +import logging + +from telethon import TelegramClient +from telethon.tl.types import MessageMediaDocument, KeyboardButtonUrl +from telethon.errors import FloodWaitError + +log = logging.getLogger(__name__) + +DOWNLOAD_BUTTON_KEYWORDS = ["DOWNLOAD", "DESCARGAR", "GET FILE", "GET PACK", "⬇", "📥"] +BOT_REPLY_TIMEOUT = 10 + +PASSWORD_PATTERN = re.compile( + r"(?:Pass|Password|Contraseña|Contrasena|Clave)[\s]*:[\s]*(.+)$", + re.IGNORECASE | re.MULTILINE +) + + +# ─── Password extraction ────────────────────────────────────────────────────── + +def extract_password(msg) -> str | None: + if not msg.text: + return None + match = PASSWORD_PATTERN.search(msg.text) + if match: + pwd = match.group(1).strip() + # Strip markdown formatting characters + pwd = pwd.strip("*`_~") + log.info(f" Found password in message: '{pwd}'") + return pwd + return None + + +# ─── Button detection ───────────────────────────────────────────────────────── + +def find_download_button(msg): + """ + Scans a message's inline keyboard for a download-like button. + Returns the button object or None. + """ + if not msg.buttons: + return None + for row in msg.buttons: + for btn in row: + if any(kw in btn.text.upper() for kw in DOWNLOAD_BUTTON_KEYWORDS): + return btn + return None + + +def has_download_button(msg) -> bool: + return find_download_button(msg) is not None + + +# ─── Click + wait flow ──────────────────────────────────────────────────────── + +async def click_download_button(client: TelegramClient, msg) -> list: + """ + Clicks the download button on a message, then waits for the bot to reply + with a file. 
async def click_download_button(client: TelegramClient, msg) -> list:
    """
    Click the download button on ``msg`` and collect the file messages sent in reply.

    URL buttons (e.g. ``https://t.me/SomeBot?start=ABC123``) are handled by
    sending ``/start ABC123`` to the bot and polling its chat once per second,
    up to ``BOT_REPLY_TIMEOUT`` times. Only messages newer than our own
    ``/start`` are considered, so a document left in the chat by an earlier
    download is never mistaken for the reply to this click.

    Any other button is clicked via ``btn.click()``; after a 2-second pause the
    five most recent messages from the original sender are inspected.

    Returns the messages whose media is a ``MessageMediaDocument`` — an empty
    list when there is no button, the click fails, or nothing arrives in time.
    """
    btn = find_download_button(msg)
    if not btn:
        return []

    log.info(f" Clicking button: '{btn.text}'")

    # ── URL button (most common) ───────────────────────────────────────────
    if isinstance(btn.button, KeyboardButtonUrl):
        url = btn.button.url  # e.g. https://t.me/SomeBot?start=ABC123

        match = re.search(r"t\.me/([A-Za-z0-9_]+)\?start=(.+)", url)
        if not match:
            log.warning(f" Unrecognised URL format: {url}")
            return []

        bot_username, payload = match.group(1), match.group(2)
        log.info(f" → Messaging @{bot_username} with /start {payload}")

        try:
            bot_entity = await client.get_entity(bot_username)
            # Keep the sent message: its id is the lower bound for the reply.
            sent = await client.send_message(bot_entity, f"/start {payload}")
        except Exception as e:
            log.error(f" Failed to message bot: {e}")
            return []

        # Poll for reply
        log.info(f" Waiting up to {BOT_REPLY_TIMEOUT}s for bot reply...")
        for _ in range(BOT_REPLY_TIMEOUT):
            await asyncio.sleep(1)
            try:
                # min_id excludes our /start and everything older than it.
                recent = await client.get_messages(bot_entity, limit=3, min_id=sent.id)
            except Exception as e:
                log.warning(f" Poll error: {e}")
                break
            files = [m for m in recent if isinstance(m.media, MessageMediaDocument)]
            if files:
                log.info(f" ✓ Got file from bot.")
                return files

        log.warning(f" Bot did not reply within {BOT_REPLY_TIMEOUT}s.")
        return []

    # ── Callback button (less common) ─────────────────────────────────────
    try:
        await btn.click()
        await asyncio.sleep(2)
    except Exception as e:
        log.error(f" Callback click failed: {e}")
        return []

    try:
        sender = await msg.get_sender()
        # NOTE(review): this window is not bounded by message id, so documents
        # posted before the click can be picked up too — confirm whether the
        # reply always lands in the same chat as ``msg`` before adding min_id.
        recent = await client.get_messages(sender, limit=5)
        return [m for m in recent if isinstance(m.media, MessageMediaDocument)]
    except Exception as e:
        log.warning(f" Fallback poll failed: {e}")
        return []
async def handle_bot_download_message(
    client: TelegramClient,
    bot: TelegramClient,
    msg,
    source_name: str,
    patterns,
    password: str | None = None,
) -> None:
    """
    Run the download-button pipeline for one message.

    Does nothing when ``msg`` carries no download button. Otherwise the button
    is clicked through ``click_download_button`` and every file message that
    comes back is forwarded to ``core.scraper.handle_message`` with the source
    name suffixed by ``[bot]`` and the given ``password`` passed through.
    """
    if not has_download_button(msg):
        return

    log.info(f"[BotDL] Download button detected in {source_name}")

    file_messages = await click_download_button(client, msg)
    if not file_messages:
        log.warning(f"[BotDL] No file received for message in {source_name}.")
        return

    # Imported here rather than at module level, as in the original.
    from core.scraper import handle_message

    for reply in file_messages:
        media = reply.media
        if hasattr(media, 'document'):
            attrs = getattr(media.document, 'attributes', [])
        else:
            attrs = 'none'
        log.info(f" [BotDL] Response media type: {type(media).__name__}, attrs: {attrs}")
        await handle_message(client, bot, reply, f"{source_name}[bot]", patterns, password=password)
+ +--- + +## Internal functions + +| Function | Description | +|----------|-------------| +| `deduplicate(hits)` | Returns `(new_hits, dupe_hits)`; updates `data/dedup.json` | +| `write_hits(scored_hits, source)` | Appends grouped human-readable block to `data/hits.txt` | +| `write_hits_csv(scored_hits, source, filename)` | Appends rows to `data/hits.csv`; writes header on first call | +| `send_alert(bot, scored_hits, source, filename)` | Sends Telegram message grouped by severity; skips if all LOW | + +--- + +## Output files + +| File | Format | Notes | +|------|--------|-------| +| `data/hits.txt` | Plain text, grouped by severity | Human-readable, append-only | +| `data/hits.csv` | CSV with header | Columns: `timestamp, severity, score, url, username, password, reasons, source, filename` | +| `data/dedup.json` | JSON array of SHA-256 hex strings | Hashes of `line.strip().lower()` | + +--- + +## Alert behaviour + +- CRITICAL / HIGH / MEDIUM → Telegram alert sent immediately +- LOW → stored in DB + files, **no** Telegram alert +- Duplicates → stored in DB with `seen_before=1`, no alert, no file write + +## Telegram alert format + +``` +🚨 Credential hit(s) detected +📁 `filename` +📢 `source` +🕐 `timestamp` + +Summary: 🔴 N 🟠 N 🟡 N 🟢 N + +🔴 CRITICAL (N) +`url:user:pass` +↳ reason | reason +... (up to 10 per severity; remainder counted) +``` diff --git a/core/notifier.py b/core/notifier.py new file mode 100644 index 0000000..710d1ef --- /dev/null +++ b/core/notifier.py @@ -0,0 +1,248 @@ +""" +notifier.py — Persists hits to disk and sends Telegram bot alerts. 
log = logging.getLogger(__name__)

# JSON array of SHA-256 hex digests, one per credential line already reported.
DEDUP_FILE = Path("./data/dedup.json")


# ─── Deduplication ────────────────────────────────────────────────────────────

def _hash(line: str) -> str:
    """Return the SHA-256 hex digest of ``line`` after stripping and lower-casing it."""
    return hashlib.sha256(line.strip().lower().encode()).hexdigest()


def _load_seen_hashes() -> set:
    """
    Load the set of already-seen digests from ``DEDUP_FILE``.

    A missing file is the normal first-run case and yields an empty set
    silently. An unreadable or malformed file also yields an empty set, but is
    logged, because every stored credential will then be treated as new.
    """
    try:
        with open(DEDUP_FILE, "r", encoding="utf-8") as f:
            return set(json.load(f))
    except FileNotFoundError:
        return set()
    except (OSError, ValueError, TypeError) as e:
        log.warning(f"Could not read dedup file, starting empty: {e}")
        return set()


def _save_seen_hashes(seen: set) -> None:
    """
    Persist ``seen`` to ``DEDUP_FILE``; failures are logged, never raised.

    The data is written to a sibling temp file and then renamed over the real
    one, so an interrupted write cannot leave a truncated dedup file behind.
    The parent directory is created when it does not exist yet.
    """
    tmp_path = DEDUP_FILE.with_name(DEDUP_FILE.name + ".tmp")
    try:
        DEDUP_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(list(seen), f)
        tmp_path.replace(DEDUP_FILE)
    except (OSError, TypeError, ValueError) as e:
        log.warning(f"Could not save dedup file: {e}")


def deduplicate(hits: list) -> tuple[list, list]:
    """
    Split scored hits into never-seen and already-seen credentials.

    Accepts a list of objects exposing a ``raw`` string (ScoredHit objects).
    A hit counts as a duplicate when the digest of its ``raw`` line is already
    stored in ``DEDUP_FILE`` *or* occurred earlier in this same call, so a
    line repeated inside one file is reported only once.

    Returns ``(new_hits, dupe_hits)``, each in input order. The dedup file is
    rewritten only when at least one new digest was added.
    """
    seen = _load_seen_hashes()
    known_before = len(seen)
    new_hits = []
    dupe_hits = []

    for h in hits:
        digest = _hash(h.raw)
        if digest in seen:
            dupe_hits.append(h)
        else:
            # Record immediately so later repeats in this batch are caught.
            seen.add(digest)
            new_hits.append(h)

    if len(seen) != known_before:
        _save_seen_hashes(seen)

    log.info(
        f" Dedup: {len(hits)} raw hit(s) → "
        f"{len(new_hits)} new, {len(dupe_hits)} duplicate(s)"
    )
    return new_hits, dupe_hits
# ─── Helpers ─────────────────────────────────────────────────────────────────

def _timestamp() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DD HH:MM:SS UTC``."""
    now_utc = datetime.now(tz=timezone.utc)
    return f"{now_utc:%Y-%m-%d %H:%M:%S} UTC"


# ─── Output ──────────────────────────────────────────────────────────────────

def write_hits(scored_hits: list, source: str) -> None:
    """
    Append one report block for ``scored_hits`` to ``HITS_FILE``.

    The block starts with a header (source, timestamp, per-severity counts
    from ``summarize``) followed by one section per non-empty severity, in
    CRITICAL → LOW order, listing each hit's raw line and its reasons.
    The parent directory of ``HITS_FILE`` is created if needed.
    """
    HITS_FILE.parent.mkdir(parents=True, exist_ok=True)
    counts = summarize(scored_hits)
    rule = "=" * 60

    with HITS_FILE.open("a", encoding="utf-8") as out:
        out.write(f"\n{rule}\n")
        out.write(f"Source : {source}\n")
        out.write(f"Time : {_timestamp()}\n")
        out.write(
            f"Hits : {len(scored_hits)} "
            f"(CRITICAL={counts[CRITICAL]} HIGH={counts[HIGH]} "
            f"MEDIUM={counts[MEDIUM]} LOW={counts[LOW]})\n"
        )
        out.write(f"{rule}\n")

        for level in (CRITICAL, HIGH, MEDIUM, LOW):
            members = [hit for hit in scored_hits if hit.severity == level]
            if members:
                out.write(f"\n{SEVERITY_EMOJI[level]} {level} ({len(members)})\n")
                for hit in members:
                    out.write(f" {hit.raw}\n")
                    out.write(f" → {' | '.join(hit.reasons)}\n")

    log.info(f" Wrote {len(scored_hits)} hit(s) to {HITS_FILE}")
if write_header: + writer.writerow([ + "timestamp", "severity", "score", "url", "username", + "password", "reasons", "source", "filename", + ]) + for h in scored_hits: + writer.writerow([ + timestamp, h.severity, h.score, + h.url or "", h.username or "", h.password or "", + " | ".join(h.reasons), source, filename, + ]) + log.info(f" Wrote {len(scored_hits)} hit(s) to {HITS_CSV}") + + +async def send_alert( + bot: TelegramClient, + scored_hits: list, + source: str, + filename: str, +) -> None: + """ + Send a Telegram alert grouped by severity. + Only includes CRITICAL, HIGH, MEDIUM — LOW hits are omitted from alerts. + """ + summary = summarize(scored_hits) + alertable = [h for h in scored_hits if h.severity in ALERT_SEVERITIES] + + if not alertable: + log.info(" No alertable hits (all LOW) — skipping Telegram notification.") + return + + lines = [ + f"🚨 *Credential hit(s) detected*", + f"", + f"📁 `{filename}`", + f"📢 `{source}`", + f"🕐 `{_timestamp()}`", + f"", + f"*Summary:*", + f"🔴 CRITICAL: `{summary[CRITICAL]}` " + f"🟠 HIGH: `{summary[HIGH]}` " + f"🟡 MEDIUM: `{summary[MEDIUM]}` " + f"🟢 LOW: `{summary[LOW]}`", + ] + + for severity in [CRITICAL, HIGH, MEDIUM]: + group = [h for h in scored_hits if h.severity == severity] + if not group: + continue + emoji = SEVERITY_EMOJI[severity] + lines.append(f"\n{emoji} *{severity}* ({len(group)})") + for h in group[:MAX_PREVIEW]: + safe = h.raw.replace("`", "'") + lines.append(f"`{safe}`") + lines.append(f"_↳ {' | '.join(h.reasons)}_") + if len(group) > MAX_PREVIEW: + lines.append(f"_...and {len(group) - MAX_PREVIEW} more_") + + try: + await bot.send_message(NOTIFY_CHAT_ID, "\n".join(lines), parse_mode="markdown") + except Exception as e: + log.error(f"Failed to send Telegram alert: {e}") + + +# ─── Main entry point ──────────────────────────────────────────────────────── + +async def notify(bot: TelegramClient, hits: list[str], source: str, filename: str) -> None: + """ + Full notification pipeline: + 1. Score all hits + 2. 
Deduplicate + 3. Insert all hits into SQLite (new + dupes, flagged accordingly) + 4. Write new hits to hits.txt + 5. Send Telegram alert for new alertable hits only + """ + if not hits: + return + + # Score first + scored = score_hits(hits) + log.info(f" Scored {len(scored)} hit(s) — {summarize(scored)}") + + # Deduplicate + new_hits, dupe_hits = deduplicate(scored) + + # Always insert into DB + if new_hits: + insert_hits(new_hits, source, filename, seen_before=False) + if dupe_hits: + insert_hits(dupe_hits, source, filename, seen_before=True) + + if not new_hits: + log.info(" All hits already seen before — no alert sent.") + return + + # Push hits to TUI + for h in new_hits: + bus.post(bus.EvHit( + severity=h.severity, + raw=h.raw, + source=source, + filename=filename, + reasons=h.reasons, + )) + + write_hits(new_hits, source) + write_hits_csv(new_hits, source, filename) + await send_alert(bot, new_hits, source, filename) + + +async def send_status(bot: TelegramClient, message: str) -> None: + """Send a plain status/info message to the notify chat.""" + try: + await bot.send_message(NOTIFY_CHAT_ID, message, parse_mode="markdown") + except Exception as e: + log.error(f"Failed to send status message: {e}") diff --git a/core/processor.md b/core/processor.md new file mode 100644 index 0000000..29c4e87 --- /dev/null +++ b/core/processor.md @@ -0,0 +1,69 @@ +# core/processor.py + +Archive extraction and hit searching. No Telegram deps, no async. + +## Public API + +```python +from core.processor import compile_patterns, process_file +``` + +### `compile_patterns(keywords: list[str]) -> list[re.Pattern]` +Compiles a list of keyword strings into case-insensitive regex patterns. +Call once at startup; pass the result everywhere patterns are needed. 
+ +```python +patterns = compile_patterns(config.TARGET_KEYWORDS) +``` + +### `process_file(filepath: Path, patterns, password=None) -> list[str]` +Full pipeline: unpack → search each `.txt` → recurse into nested archives → clean up everything. +Returns list of matching raw lines (hits). Deletes the original file and all extracted contents on completion. + +```python +hits = process_file(Path("data/tmp/combo.zip"), patterns, password="infected") +``` + +--- + +## Internal functions + +| Function | Signature | Description | +|----------|-----------|-------------| +| `search_file` | `(filepath, patterns) -> list[str]` | Stream-reads `.txt` line by line; ignores encoding errors | +| `unpack` | `(filepath, extra_password) -> (files, extract_dir\|None)` | Dispatches to correct extractor; plain `.txt` returned as-is | +| `extract_zip` | `(filepath, dest, extra_password)` | Tries no password first, then `ARCHIVE_PASSWORDS` list | +| `extract_7z` | `(filepath, dest, extra_password)` | Requires `py7zr`; skips if not installed | +| `extract_rar` | `(filepath, dest, extra_password)` | Requires `rarfile` + `unrar` binary | +| `_try_passwords` | `(extract_fn, passwords)` | Iterates password list, stops on first success | + +--- + +## Supported formats + +| Extension | Library | Notes | +|-----------|---------|-------| +| `.txt` | built-in | Stream-read, no load into memory | +| `.zip` | `zipfile` | stdlib | +| `.7z` | `py7zr` | optional; skipped if not installed | +| `.rar` | `rarfile` | optional; requires `unrar` system binary | + +Nested archives are recursed **one level** only. + +--- + +## Password order + +1. `extra_password` (from message/channel carry-forward) — tried first +2. `config.ARCHIVE_PASSWORDS` — tried in order + +--- + +## Cleanup guarantee + +`process_file` always deletes: +- Extracted individual files +- Extract subdirectory +- Original downloaded file + +Even if no hits are found. 
diff --git a/core/processor.py b/core/processor.py new file mode 100644 index 0000000..4f844dc --- /dev/null +++ b/core/processor.py @@ -0,0 +1,233 @@ +""" +processor.py — Archive extraction and hit searching logic. + +Supports: .txt, .zip, .7z, .rar +Stream-processes files line by line — safe for large combo lists. +""" + +import rarfile +rarfile.UNRAR_TOOL = "unrar" + +import re +import zipfile +import logging +import shutil +from pathlib import Path + +try: + import py7zr + HAS_7Z = True +except ImportError: + HAS_7Z = False + +try: + import rarfile + HAS_RAR = True +except ImportError: + HAS_RAR = False + +from config import ARCHIVE_PASSWORDS + +log = logging.getLogger(__name__) + + +# ─── Searching ─────────────────────────────────────────────────────────────── + +def compile_patterns(keywords: list[str]) -> list[re.Pattern]: + return [re.compile(kw, re.IGNORECASE) for kw in keywords] + + +def search_file(filepath: Path, patterns: list[re.Pattern]) -> list[str]: + """ + Stream-reads a text file line by line and returns lines matching any pattern. + Ignores encoding errors — combo files are often messy. + """ + hits: list[str] = [] + try: + with open(filepath, "r", encoding="utf-8", errors="ignore") as f: + for line in f: + stripped = line.strip() + if stripped and any(p.search(stripped) for p in patterns): + hits.append(stripped) + except Exception as e: + log.warning(f"Could not read {filepath.name}: {e}") + return hits + + +# ─── Extraction ────────────────────────────────────────────────────────────── + +def _try_passwords(extract_fn, passwords: list[bytes]) -> bool: + """Try a list of passwords against an extract function. 
Returns True on success.""" + for pwd in passwords: + try: + extract_fn(pwd) + return True + except Exception: + continue + return False + + +def extract_zip(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]: + passwords = ARCHIVE_PASSWORDS.copy() + if extra_password: + passwords.insert(0, extra_password.encode()) + extracted: list[Path] = [] + try: + with zipfile.ZipFile(filepath) as zf: + def try_extract(pwd: bytes): + zf.extractall(dest, pwd=pwd or None) + + try: + zf.extractall(dest) + except RuntimeError: + log.info(f" ZIP is password-protected, trying common passwords...") + if not _try_passwords(try_extract, ARCHIVE_PASSWORDS): + log.warning(f" Could not unlock {filepath.name} — skipping.") + return [] + + extracted = [p for p in dest.rglob("*") if p.is_file()] + except zipfile.BadZipFile: + log.warning(f" {filepath.name} is not a valid ZIP.") + except Exception as e: + log.warning(f" ZIP extraction error on {filepath.name}: {e}") + return extracted + + +def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]: + if not HAS_7Z: + log.warning("py7zr not installed — skipping .7z file.") + return [] + extracted: list[Path] = [] + passwords = ARCHIVE_PASSWORDS.copy() + if extra_password: + passwords.insert(0, extra_password.encode()) + + try: + # Try without password first + try: + with py7zr.SevenZipFile(filepath, mode="r") as z: + z.extractall(dest) + except py7zr.exceptions.PasswordRequired: + log.info(f" 7z is password-protected, trying common passwords...") + success = False + for pwd in ARCHIVE_PASSWORDS: + try: + with py7zr.SevenZipFile(filepath, mode="r", password=pwd.decode()) as z: + z.extractall(dest) + success = True + break + except Exception: + continue + if not success: + log.warning(f" Could not unlock {filepath.name} — skipping.") + return [] + + extracted = [p for p in dest.rglob("*") if p.is_file()] + except Exception as e: + log.warning(f" 7z extraction error on {filepath.name}: 
{e}") + return extracted + + +def extract_rar(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]: + if not HAS_RAR: + log.warning("rarfile not installed — skipping .rar file.") + return [] + + passwords = ARCHIVE_PASSWORDS.copy() + if extra_password: + passwords.insert(0, extra_password.encode()) + extracted: list[Path] = [] + try: + with rarfile.RarFile(filepath) as rf: + def try_extract(pwd: bytes): + rf.extractall(dest, pwd=pwd.decode() if pwd else None) + + try: + rf.extractall(dest) + except rarfile.BadRarFile: + log.warning(f" {filepath.name} is not a valid RAR.") + return [] + except Exception: + log.info(f" RAR may be password-protected, trying common passwords...") + if not _try_passwords(try_extract, ARCHIVE_PASSWORDS): + log.warning(f" Could not unlock {filepath.name} — skipping.") + return [] + + extracted = [p for p in dest.rglob("*") if p.is_file()] + except Exception as e: + log.warning(f" RAR extraction error on {filepath.name}: {e}") + return extracted + + +def unpack(filepath: Path, extra_password: str | None = None) -> tuple[list[Path], Path | None]: + """ + Unpacks an archive into a sibling directory. + Returns (list of extracted files, extract_dir or None). + If it's not an archive, returns ([filepath], None). 
+ """ + suffix = filepath.suffix.lower() + extract_dir = filepath.parent / filepath.stem + + if suffix == ".zip": + extract_dir.mkdir(exist_ok=True) + files = extract_zip(filepath, extract_dir, extra_password) + return files, extract_dir + + elif suffix == ".7z": + extract_dir.mkdir(exist_ok=True) + files = extract_7z(filepath, extract_dir, extra_password) + return files, extract_dir + + elif suffix == ".rar": + extract_dir.mkdir(exist_ok=True) + files = extract_rar(filepath, extract_dir, extra_password) + return files, extract_dir + + else: + # Plain file — return as-is, no extract dir to clean up + return [filepath], None + + +# ─── Main entry point ──────────────────────────────────────────────────────── + +def process_file(filepath: Path, patterns, password: str | None = None) -> list[str]: + """ + Full pipeline: unpack → search each file → clean up everything. + Returns list of matching lines (hits). + """ + log.info(f" Processing: {filepath.name}") + all_hits: list[str] = [] + + files, extract_dir = unpack(filepath, extra_password=password) + + for f in files: + if f.suffix.lower() == ".txt": + hits = search_file(f, patterns) + if hits: + log.info(f" ✓ {len(hits)} hit(s) in {f.name}") + all_hits.extend(hits) + + # Nested archives — recurse one level + elif f.suffix.lower() in {".zip", ".7z", ".rar"} and f != filepath: + log.info(f" → Nested archive: {f.name}") + nested_hits = process_file(f, patterns) + all_hits.extend(nested_hits) + continue # process_file already cleaned up f + + # Clean up extracted file + try: + f.unlink(missing_ok=True) + except Exception: + pass + + # Clean up extract dir + if extract_dir and extract_dir.exists(): + shutil.rmtree(extract_dir, ignore_errors=True) + + # Clean up original download + try: + filepath.unlink(missing_ok=True) + except Exception: + pass + + return all_hits diff --git a/core/scraper.md b/core/scraper.md new file mode 100644 index 0000000..9ade2be --- /dev/null +++ b/core/scraper.md @@ -0,0 +1,65 @@ +# 
core/scraper.py + +Telethon user-client layer. Handles live listening, backfill, and the single-message download pipeline. + +## Public API + +```python +from core.scraper import handle_message, backfill_all, register_handlers, warm_entity_cache +``` + +### `handle_message(client, bot, msg, source_name, patterns, password=None)` +**async.** Full pipeline for one document message: +1. Extract filename + size, check allowlist + size guard +2. Check `utils.cache` — skip if already seen +3. Try `tdl` download → Telethon fallback +4. `core.processor.process_file()` → hits +5. `core.notifier.notify()` if hits found +6. `utils.cache.mark_seen()` + +Called by: live handler, `bot_downloader`, backfill fallback path. + +### `backfill_all(client, bot, patterns)` +**async.** Iterates `config.WATCHED_CHANNELS`, calls `backfill_channel()` for each. +No-op if `config.BACKFILL_LIMIT == 0`. + +### `register_handlers(client, bot, patterns)` +Registers a `NewMessage` Telethon event handler on `config.WATCHED_CHANNELS`. +Used in **CLI mode only** (`--no-tui`). The TUI manages its own handler via `_make_handler()` in `tui/app.py`. + +### `warm_entity_cache(client)` +**async.** Iterates `client.iter_dialogs()` so Telethon caches entity mappings. +Must be called before using raw numeric channel IDs. + +--- + +## Internal functions + +| Function | Description | +|----------|-------------| +| `get_filename(msg)` | Extracts filename from `MessageMediaDocument`; falls back to `{msg_id}{ext}` from MIME | +| `get_filesize(msg)` | Returns document size in bytes | +| `is_processable(filename, size)` | Checks extension allowlist + size limit; returns `(bool, reason)` | +| `_make_dest(msg, filename)` | Resolves temp path, handles collision with `{msg_id}_{filename}` | +| `_telethon_download(client, msg, dest, ...)` | Telethon fallback with tqdm progress + flood-wait handling. 
Posts `EvDownload*` bus events | +| `backfill_channel(client, bot, channel, patterns, limit)` | Scans history with password carry-forward; batches via tdl | +| `_process_batch(client, bot, batch, patterns)` | One tdl invocation for up to `TDL_AMOUNT` messages; per-file Telethon fallback | + +--- + +## Password carry-forward (backfill) + +Channels often post the archive password as a separate text message. +`backfill_channel` iterates newest→oldest, carrying `last_password` so both older and newer file messages in the same scan pick it up. + +--- + +## Download strategy + +``` +is_tdl_available()? + yes → download_single_with_tdl() / download_batch_with_tdl() + ↓ failed? + _telethon_download() + no → _telethon_download() directly +``` diff --git a/core/scraper.py b/core/scraper.py new file mode 100644 index 0000000..e95821b --- /dev/null +++ b/core/scraper.py @@ -0,0 +1,410 @@ +""" +scraper.py — Telethon user client. + +Handles: + - Listening for new file messages in watched channels + - Listening for messages with inline download buttons (bot-dispatched files) + - Backfilling recent channel history on startup (batched via tdl) + - Downloading files safely (size guard, flood wait) +""" + +import asyncio +import logging +import time +from pathlib import Path + +from tqdm import tqdm +from telethon import TelegramClient, events +from telethon.errors import FloodWaitError, ChannelPrivateError, UsernameNotOccupiedError +from telethon.tl.types import ( + MessageMediaDocument, + DocumentAttributeFilename, + InputDocumentFileLocation, +) + +from config import ( + ALLOWED_EXTENSIONS, + BACKFILL_LIMIT, + MAX_FILE_SIZE, + TEMP_DIR, + WATCHED_CHANNELS, + TDL_AMOUNT, +) +from core.bot_downloader import handle_bot_download_message, has_download_button, extract_password +from utils.cache import is_seen, mark_seen +from core.processor import process_file +from core.notifier import notify +from core.tdl_downloader import ( + BatchEntry, + download_batch_with_tdl, + 
download_single_with_tdl, + is_tdl_available, +) +from tui import events as bus + +log = logging.getLogger(__name__) + + +# ─── Helpers ────────────────────────────────────────────────────────────────── + +def get_filename(msg) -> str | None: + """Extract the filename from a document message, if any.""" + if not isinstance(msg.media, MessageMediaDocument): + return None + doc = msg.media.document + for attr in doc.attributes: + if isinstance(attr, DocumentAttributeFilename): + return attr.file_name + mime = getattr(doc, "mime_type", "") or "" + ext_map = { + "application/x-rar-compressed": ".rar", + "application/vnd.rar": ".rar", + "application/zip": ".zip", + "application/x-7z-compressed": ".7z", + "text/plain": ".txt", + } + return f"{msg.id}{ext_map.get(mime, '.bin')}" + + +def get_filesize(msg) -> int: + """Return document size in bytes, or 0 if not a document.""" + if not isinstance(msg.media, MessageMediaDocument): + return 0 + return msg.media.document.size or 0 + + +def is_processable(filename: str, size: int) -> tuple[bool, str]: + """Check whether a file should be downloaded. Returns (ok, reason).""" + suffix = Path(filename).suffix.lower() + if suffix not in ALLOWED_EXTENSIONS: + return False, f"extension {suffix!r} not in allowlist" + if size > MAX_FILE_SIZE: + mb = size / (1024 * 1024) + return False, f"too large ({mb:.1f} MB > {MAX_FILE_SIZE // (1024 * 1024)} MB limit)" + return True, "" + + +def _make_dest(msg, filename: str) -> Path: + """Resolve the destination path, avoiding name collisions.""" + TEMP_DIR.mkdir(exist_ok=True) + dest = TEMP_DIR / filename + if dest.exists(): + dest = TEMP_DIR / f"{msg.id}_{filename}" + return dest + + +# ─── Telethon fallback download ─────────────────────────────────────────────── + +async def _telethon_download(client: TelegramClient, msg, dest: Path, filename: str, size: int, batch_id: str | None = None) -> bool: + """Download a single file via Telethon. 
Returns True on success.""" + _bid = batch_id or f"telethon_{int(time.monotonic_ns())}" + if batch_id is None: + # Standalone call (not already queued by tdl path) — post queued event + bus.post(bus.EvDownloadQueued( + batch_id=_bid, filename=filename, + size_mb=round(size / (1024 * 1024), 2), + source="telethon", password=None, + )) + bus.post(bus.EvDownloadStarted(batch_id=_bid, filename=filename)) + try: + with tqdm( + total=size, + unit="B", + unit_scale=True, + unit_divisor=1024, + desc=filename[:40], + colour="cyan", + ) as pbar: + async def progress(current, total): + pbar.n = current + pbar.refresh() + + doc = msg.media.document + location = InputDocumentFileLocation( + id=doc.id, + access_hash=doc.access_hash, + file_reference=doc.file_reference, + thumb_size="", + ) + await client.download_file( + location, + file=dest, + part_size_kb=512, + progress_callback=progress, + ) + bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon")) + return True + except FloodWaitError as e: + log.warning(f" Flood wait: sleeping {e.seconds}s...") + await asyncio.sleep(e.seconds) + await client.download_media(msg, file=dest) + bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon")) + return True + except Exception as e: + log.error(f" Telethon download failed for {filename}: {e}") + bus.post(bus.EvDownloadFailed(batch_id=_bid, filename=filename, reason=str(e))) + return False + + +# ─── Single-message pipeline (live handler + bot_downloader) ────────────────── + +async def handle_message( + client: TelegramClient, + bot: TelegramClient, + msg, + source_name: str, + patterns, + password: str | None = None, +) -> None: + """Download and process a single file message.""" + filename = get_filename(msg) + if not filename: + log.warning(" handle_message: could not extract filename, skipping.") + return + + size = get_filesize(msg) + ok, reason = is_processable(filename, size) + if not ok: + log.warning(f" handle_message: skipping 
'{filename}' — {reason}") + return + + doc_id = msg.media.document.id + if is_seen(doc_id): + log.info(f" Skipping {filename} — already processed.") + return + + dest = _make_dest(msg, filename) + log.info(f"↓ Downloading: {filename} ({size / 1024:.1f} KB) from {source_name}") + + # tdl single → Telethon fallback + downloaded = await download_single_with_tdl(msg, dest) if is_tdl_available() else False + if not downloaded: + if is_tdl_available(): + log.warning(" [tdl] failed — falling back to Telethon") + downloaded = await _telethon_download(client, msg, dest, filename, size) + + if not downloaded: + log.error(f" All download attempts failed for {filename}") + return + + hits = process_file(dest, patterns, password=password) + mark_seen(doc_id) + + if hits: + await notify(bot, hits, source_name, filename) + else: + log.info(f" No hits in {filename}") + + +# ─── Batch pipeline (backfill only) ─────────────────────────────────────────── + +async def _process_batch( + client: TelegramClient, + bot: TelegramClient, + batch: list[tuple], # list of (msg, source_name, password) + patterns, +) -> int: + """ + Download up to TDL_AMOUNT messages in one tdl invocation, then process + each. Falls back to Telethon per-file for anything tdl missed. + Returns the number of files successfully processed. 
+ """ + if not batch: + return 0 + + # Build BatchEntry list + entries: list[BatchEntry] = [] + for msg, source_name, password in batch: + filename = get_filename(msg) + if not filename: + continue + entries.append(BatchEntry( + msg=msg, + filename=filename, + dest=_make_dest(msg, filename), + doc_id=msg.media.document.id, + source_name=source_name, + password=password, + )) + + names = ", ".join(e.filename for e in entries) + log.info(f"[Batch] {len(entries)} file(s): {names}") + + # One tdl call for the whole batch + results = await download_batch_with_tdl(entries) + + processed = 0 + for entry in entries: + tdl_ok = results.get(entry.doc_id, False) + + if not tdl_ok: + # Per-file Telethon fallback + log.info(f" [Batch] Telethon fallback: {entry.filename}") + size = get_filesize(entry.msg) + tdl_ok = await _telethon_download(client, entry.msg, entry.dest, entry.filename, size) + + if not tdl_ok: + log.error(f" [Batch] All attempts failed: {entry.filename}") + continue + + hits = process_file(entry.dest, patterns, password=entry.password) + mark_seen(entry.doc_id) + + if hits: + await notify(bot, hits, entry.source_name, entry.filename) + else: + log.info(f" No hits in {entry.filename}") + + processed += 1 + + return processed + + +# ─── Backfill ───────────────────────────────────────────────────────────────── + +async def backfill_channel( + client: TelegramClient, + bot: TelegramClient, + channel: str | int, + patterns, + limit: int, +) -> None: + """Scan the last `limit` messages of a channel for file attachments.""" + log.info(f"[Backfill] Scanning history: {channel} (last {limit} messages)") + total = 0 + batch: list[tuple] = [] # (msg, source_name, password) + last_password: str | None = None # carry password across adjacent messages + + async def flush_batch(): + nonlocal total + if batch: + total += await _process_batch(client, bot, batch, patterns) + batch.clear() + + try: + async for msg in client.iter_messages(channel, limit=limit): + source_name = 
str(channel) + + # Extract password from this message if present, and remember it. + # iter_messages goes newest→oldest, so a password post that appears + # above the files in the channel will arrive AFTER them here. + # We therefore carry last_password in both directions: + # - apply it to file messages that have no inline password + # - update it whenever we see a fresh password, so subsequent + # (older) file messages in the same batch pick it up too. + msg_password = extract_password(msg) + if msg_password: + last_password = msg_password + + password = msg_password or last_password + + if msg.media and isinstance(msg.media, MessageMediaDocument): + filename = get_filename(msg) + size = get_filesize(msg) + + if not filename: + continue + + ok, reason = is_processable(filename, size) + if not ok: + log.warning(f" [Backfill] Skipping '{filename}' — {reason}") + continue + + if is_seen(msg.media.document.id): + log.info(f" [Backfill] Already seen: {filename}") + continue + + if is_tdl_available(): + batch.append((msg, source_name, password)) + if len(batch) >= TDL_AMOUNT: + await flush_batch() + else: + # No tdl — fall straight through to single handle_message + await handle_message(client, bot, msg, source_name, patterns, password=password) + total += 1 + await asyncio.sleep(0.5) + + elif msg.buttons and has_download_button(msg): + # Bot-button messages can't be batched — handle individually + await flush_batch() # flush any pending batch first + await handle_bot_download_message(client, bot, msg, source_name, patterns, password=password) + total += 1 + await asyncio.sleep(1.5) + + # Flush whatever's left + await flush_batch() + + except (ChannelPrivateError, UsernameNotOccupiedError) as e: + log.error(f"[Backfill] Cannot access {channel}: {e}") + except Exception as e: + log.error(f"[Backfill] Error scanning {channel}: {e}") + + log.info(f"[Backfill] Done: {channel} — {total} file(s) processed") + + +async def backfill_all( + client: TelegramClient, + bot: 
TelegramClient, + patterns, +) -> None: + """Backfill all watched channels sequentially.""" + if BACKFILL_LIMIT <= 0: + log.info("[Backfill] Disabled (BACKFILL_LIMIT=0)") + return + log.info(f"[Backfill] Starting for {len(WATCHED_CHANNELS)} channel(s)...") + for ch in WATCHED_CHANNELS: + await backfill_channel(client, bot, ch, patterns, BACKFILL_LIMIT) + log.info("[Backfill] Complete.") + + +# ─── Entity cache warmup ────────────────────────────────────────────────────── + +async def warm_entity_cache(client: TelegramClient) -> None: + """ + Fetches your dialog list so Telethon caches all entity mappings. + Required before using raw numeric IDs. + """ + log.info("Warming entity cache (fetching dialogs)...") + async for _ in client.iter_dialogs(): + pass + log.info("Entity cache ready.") + + +# ─── Live listener ──────────────────────────────────────────────────────────── + +def register_handlers( + client: TelegramClient, + bot: TelegramClient, + patterns, +) -> None: + """Register the NewMessage event handler for all watched channels.""" + + # Per-channel password cache for the live handler. + # Channels often post a text message with the password separately from + # the file message. We remember the last seen password per channel so + # that the file message that follows (or precedes by seconds) picks it up. 
+ _channel_passwords: dict[int, str] = {} + + @client.on(events.NewMessage(chats=WATCHED_CHANNELS)) + async def on_new_message(event): + msg = event.message + try: + source = event.chat.username or str(event.chat_id) + except Exception: + source = str(event.chat_id) + + chat_id = event.chat_id + log.info(f"[Live] New message in {source}") + + # Update cache if this message carries a password + msg_password = extract_password(msg) + if msg_password: + _channel_passwords[chat_id] = msg_password + log.debug(f"[Live] Password cached for {source}: '{msg_password}'") + + password = msg_password or _channel_passwords.get(chat_id) + + if msg.media and isinstance(msg.media, MessageMediaDocument): + await handle_message(client, bot, msg, source, patterns, password=password) + elif msg.buttons and has_download_button(msg): + await handle_bot_download_message(client, bot, msg, source, patterns, password=password) diff --git a/core/tdl_downloader.md b/core/tdl_downloader.md new file mode 100644 index 0000000..74efc5b --- /dev/null +++ b/core/tdl_downloader.md @@ -0,0 +1,70 @@ +# core/tdl_downloader.py + +Fast file downloads via `tdl` (Go MTProto). Falls back gracefully if tdl is not installed. + +## Public API + +```python +from core.tdl_downloader import ( + is_tdl_available, + download_single_with_tdl, + download_batch_with_tdl, + BatchEntry, +) +``` + +### `is_tdl_available() -> bool` +Returns `True` if `tdl` binary is on PATH. + +### `download_single_with_tdl(msg, dest: Path) -> bool` +**async.** Downloads one message's document. Returns `True` on success. +Used by the live handler and `bot_downloader`. + +### `download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]` +**async.** Downloads up to `TDL_AMOUNT` messages in a single `tdl dl` invocation. +Returns `{doc_id: True|False}` — `False` means Telethon fallback needed. 
+ +--- + +## BatchEntry dataclass + +```python +@dataclass +class BatchEntry: + msg: object # Telethon Message + filename: str + dest: Path # final destination path in TEMP_DIR + doc_id: int # msg.media.document.id + source_name: str + password: str | None +``` + +--- + +## TUI output pipeline + +In TUI mode (`bus.tui_active == True`), `_run_tdl` pipes stdout+stderr and relays lines as `EvTdlOutput` events in real time. +**Reads raw 256-byte chunks** (not line-by-line) and splits on `\r` and `\n`, because tdl uses `\r` to overwrite its progress bar in place. + +In CLI mode: subprocess inherits the terminal, progress bars render natively. + +--- + +## Staging directory isolation + +Each batch/single download gets a unique `data/tmp/_tdl_{monotonic_ns}/` staging dir. +After `tdl` exits, files are matched by name (with fuzzy stem fallback for `filenamify()` mangling) and moved to final `dest`. Staging dir is removed regardless of outcome. + +`--template '{{ filenamify .FileName }}'` — tdl uses the original Telegram filename, not its default `DialogID_MessageID_filename` format. + +--- + +## Config knobs (`config.py`) + +| Setting | Default | Description | +|---------|---------|-------------| +| `TDL_NAMESPACE` | `"default"` | `-n` flag; `None` omits it | +| `TDL_THREADS` | `8` | `-t` chunk workers per file | +| `TDL_PERFILE` | `4` | `-l` concurrent files per invocation | +| `TDL_AMOUNT` | `4` | Max messages per batch | +| `TDL_TAKEOUT` | `False` | `--takeout` session flag | diff --git a/core/tdl_downloader.py b/core/tdl_downloader.py new file mode 100644 index 0000000..eea963f --- /dev/null +++ b/core/tdl_downloader.py @@ -0,0 +1,363 @@ +""" +tdl_downloader.py — Fast file downloads via tdl (Go MTProto implementation). 
+ +Install: https://github.com/iyear/tdl + curl -sSL https://raw.githubusercontent.com/iyear/tdl/main/scripts/install.sh | bash + +First-time setup — log in once: + tdl login # saves to namespace "default" + tdl login -n myns # saves to a named namespace + +Relevant config.py knobs: + TDL_NAMESPACE str|None Session namespace (default "default"; None omits -n) + TDL_THREADS int Chunk workers per file (-t, default 4) + TDL_PERFILE int Concurrent files (-l, default 4) + TDL_AMOUNT int Messages per tdl batch (default 4) + TDL_TAKEOUT bool Use takeout session (--takeout) + +Flag reference: + Global (BEFORE subcommand): -n --ns, -t --threads, -l --limit + dl-specific: -u --url, -d --dir, --template, --continue, --takeout + +Download isolation strategy: + Each batch gets its own staging subdirectory (TEMP_DIR//) so that + concurrent downloads and homoglyph filename collisions can never cause tdl's + internal .tmp → final rename to fail. Files are moved to TEMP_DIR after + the batch completes and the staging dir is removed. +""" + +import asyncio +import logging +import shutil +import time +from dataclasses import dataclass +from pathlib import Path + +from config import TDL_NAMESPACE, TDL_THREADS, TDL_PERFILE, TDL_TAKEOUT, TEMP_DIR +from tui import events as bus + +log = logging.getLogger(__name__) + + +# ─── Availability ───────────────────────────────────────────────────────────── + +def is_tdl_available() -> bool: + return shutil.which("tdl") is not None + + +# ─── Message → URL ──────────────────────────────────────────────────────────── + +def _build_message_url(msg) -> str: + """ + Build a t.me/c// link from a Telethon Message. + Works for public and private channels alike. 
+ """ + peer = msg.peer_id + if hasattr(peer, "channel_id"): + return f"https://t.me/c/{peer.channel_id}/{msg.id}" + elif hasattr(peer, "chat_id"): + return f"https://t.me/c/{peer.chat_id}/{msg.id}" + elif hasattr(peer, "user_id"): + return f"https://t.me/c/{peer.user_id}/{msg.id}" + raise ValueError(f"Cannot build message URL from peer: {peer!r}") + + +# ─── Command builder ────────────────────────────────────────────────────────── + +def _build_cmd(urls: list[str], staging_dir: Path) -> list[str]: + """ + Build the full tdl dl command. + + Global flags (-n, -t, -l) MUST precede the subcommand. + staging_dir is always an absolute path to a fresh per-batch directory, + so tdl's internal .tmp → final rename can never collide with an existing + file of the same name. + + --template '{{ filenamify .FileName }}' keeps just the original filename + (no DialogID_MessageID_ prefix). + + --continue is kept so interrupted downloads resume rather than restart. + --skip-same is intentionally omitted — deduplication is handled upstream + by is_seen(), and --skip-same can cause the .tmp rename to fail when a + same-named file already exists in the directory. 
+ """ + global_flags: list[str] = [] + if TDL_NAMESPACE: + global_flags += ["-n", str(TDL_NAMESPACE)] + global_flags += ["-t", str(TDL_THREADS), "-l", str(TDL_PERFILE)] + + url_flags: list[str] = [] + for url in urls: + url_flags += ["-u", url] + + dl_flags = [ + "-d", str(staging_dir), + "--template", "{{ filenamify .FileName }}", + "--continue", + ] + if TDL_TAKEOUT: + dl_flags.append("--takeout") + + return ["tdl", *global_flags, "dl", *url_flags, *dl_flags] + + +# ─── Runner ─────────────────────────────────────────────────────────────────── + +# ANSI escape stripper — tdl emits colour codes even when not a TTY +import re as _re +_ANSI_RE = _re.compile(r"\x1b\[[0-9;]*[mGKHFJA-Z]|\x1b=|\x1b>|\x1b\[\?[0-9]+[hl]") + +def _strip_ansi(text: str) -> str: + return _ANSI_RE.sub("", text) + + +async def _run_tdl(cmd: list[str], label: str) -> bool: + """ + Spawn tdl and handle output based on whether the TUI is running: + - TUI mode: pipe stdout+stderr, read raw chunks (NOT line-by-line), + split on both \\r and \\n, strip ANSI, post non-empty + segments immediately as EvTdlOutput. + tdl uses \\r to overwrite its progress bar in place, so + async-for-line on the stream would block until EOF. + Chunk-reading + manual split delivers progress live. + - CLI mode: inherit the terminal so tdl's progress bars render natively. + Returns True on exit code 0, False otherwise. 
async def _run_tdl(cmd: list[str], label: str) -> bool:
    """
    Spawn tdl and handle its output depending on whether the TUI is running.

    - TUI mode: pipe stdout+stderr, read raw chunks (NOT line-by-line), split
      on both \\r and \\n, strip ANSI, and post each non-empty segment as an
      EvTdlOutput. tdl rewrites its progress bar with \\r, so a line-based
      reader would stall until EOF.
    - CLI mode: inherit the terminal so progress bars render natively.

    Returns True on exit code 0, False otherwise (including spawn failures).
    """
    import codecs  # local import: only the TUI relay needs it

    log.debug(f"[tdl] cmd: {' '.join(cmd)}")
    try:
        if bus.tui_active:
            proc = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )

            async def _relay(stream) -> None:
                # Incremental decoding keeps a multi-byte UTF-8 character that
                # straddles two 256-byte chunks intact instead of turning both
                # halves into replacement characters.
                decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
                pending = ""

                def _emit(segment: str) -> None:
                    clean = _strip_ansi(segment).strip()
                    if clean:
                        bus.post(bus.EvTdlOutput(line=clean))

                while True:
                    chunk = await stream.read(256)
                    if not chunk:
                        break
                    pending += decoder.decode(chunk)
                    # Everything before the last separator is complete; the
                    # tail may still be growing and stays buffered.
                    *complete, pending = _re.split(r"[\r\n]", pending)
                    for segment in complete:
                        _emit(segment)

                # Flush whatever the decoder and the buffer still hold.
                pending += decoder.decode(b"", final=True)
                if pending:
                    _emit(pending)

            await asyncio.gather(_relay(proc.stdout), _relay(proc.stderr))
            await proc.wait()
        else:
            proc = await asyncio.create_subprocess_exec(*cmd)
            await proc.wait()

        if proc.returncode == 0:
            log.info(f"[tdl] ✓ {label}")
            return True
        log.error(f"[tdl] ✗ exit {proc.returncode} — {label}")
        return False
    except FileNotFoundError:
        log.error("[tdl] binary not found at runtime")
        return False
    except Exception as e:
        # Best-effort boundary: callers treat False as "use the fallback".
        log.error(f"[tdl] Unexpected error: {e}")
        return False


# ─── Staging dir helpers ──────────────────────────────────────────────────────

def _make_staging_dir() -> Path:
    """
    Create and return a unique staging subdirectory under TEMP_DIR.

    mkdtemp creates the directory atomically and never reuses an existing
    one, so two batches cannot end up sharing a staging dir. The monotonic
    timestamp stays in the name for easier debugging.
    """
    import tempfile  # local import: only needed here

    base = TEMP_DIR.resolve()
    base.mkdir(parents=True, exist_ok=True)
    return Path(tempfile.mkdtemp(prefix=f"_tdl_{time.monotonic_ns()}_", dir=base))
+ """ + # Exact match first + exact = staging / expected_name + if exact.exists(): + return exact + + expected_stem = Path(expected_name).stem.lower().lstrip("@").replace(" ", "") + expected_suffix = Path(expected_name).suffix.lower() + + for candidate in staging.iterdir(): + if not candidate.is_file(): + continue + if candidate.suffix.lower() != expected_suffix: + continue + cand_stem = candidate.stem.lower().lstrip("@").replace(" ", "") + if cand_stem == expected_stem: + return candidate + + return None + + +def _move_from_staging(staging: Path, expected_name: str, final_dest: Path) -> bool: + """ + Find the file in staging, move it to final_dest, return True on success. + """ + found = _find_in_staging(staging, expected_name) + if not found: + log.warning(f"[tdl] Not found in staging: '{expected_name}' (staging: {staging})") + return False + + try: + found.rename(final_dest) + log.debug(f"[tdl] Moved: {found.name} → {final_dest}") + return True + except Exception as e: + log.error(f"[tdl] Move failed {found} → {final_dest}: {e}") + return False + + +def _cleanup_staging(staging: Path) -> None: + try: + shutil.rmtree(staging, ignore_errors=True) + except Exception: + pass + + +# ─── Public API ─────────────────────────────────────────────────────────────── + +@dataclass +class BatchEntry: + """Carries everything needed to process one file after a batch download.""" + msg: object # Telethon Message + filename: str + dest: Path + doc_id: int + source_name: str + password: str | None + + +async def download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]: + """ + Download a batch of messages in a single tdl invocation. + + Each batch gets its own staging subdirectory so filenames can never + collide with existing files in TEMP_DIR. After tdl exits, files are + moved from staging to their final dest paths. + + Returns dict mapping doc_id → True (ready at entry.dest) / False (fallback needed). 
+ """ + if not entries: + return {} + + if not is_tdl_available(): + log.warning("[tdl] not available — all entries need Telethon fallback") + return {e.doc_id: False for e in entries} + + urls: list[str] = [] + for entry in entries: + try: + urls.append(_build_message_url(entry.msg)) + except ValueError as exc: + log.error(f"[tdl] Skipping {entry.filename}: {exc}") + urls.append("") + + valid_entries = [(e, u) for e, u in zip(entries, urls) if u] + if not valid_entries: + return {e.doc_id: False for e in entries} + + batch_id = f"batch_{int(time.monotonic_ns())}" + names = ", ".join(e.filename for e, _ in valid_entries) + log.info(f"[tdl] Batch ({len(valid_entries)} files): {names}") + + # Notify TUI: all files in this batch are queued + for entry, _ in valid_entries: + size_mb = (entry.msg.media.document.size or 0) / (1024 * 1024) + bus.post(bus.EvDownloadQueued( + batch_id=batch_id, + filename=entry.filename, + size_mb=round(size_mb, 2), + source=entry.source_name, + password=entry.password, + )) + + staging = _make_staging_dir() + cmd = _build_cmd([u for _, u in valid_entries], staging) + + # Signal batch started + for entry, _ in valid_entries: + bus.post(bus.EvDownloadStarted(batch_id=batch_id, filename=entry.filename)) + + tdl_ok = await _run_tdl(cmd, f"batch of {len(valid_entries)}") + + results: dict[int, bool] = {} + for entry in entries: + if not any(e.doc_id == entry.doc_id for e, _ in valid_entries): + results[entry.doc_id] = False + continue + + if tdl_ok: + moved = _move_from_staging(staging, entry.filename, entry.dest) + results[entry.doc_id] = moved + if moved: + bus.post(bus.EvDownloadDone(batch_id=batch_id, filename=entry.filename, via="tdl")) + else: + log.warning(f"[tdl] Fallback needed: {entry.filename}") + bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=entry.filename, reason="staging mismatch")) + else: + results[entry.doc_id] = False + bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=entry.filename, reason="tdl exit 
error")) + + _cleanup_staging(staging) + return results + + +async def download_single_with_tdl(msg, dest: Path) -> bool: + """ + Download a single message with tdl. Used by the live handler and + bot_downloader where batching doesn't apply. + """ + if not is_tdl_available(): + log.warning("[tdl] not available — falling back to Telethon") + return False + + try: + url = _build_message_url(msg) + except ValueError as e: + log.error(f"[tdl] Cannot build URL: {e}") + return False + + batch_id = f"single_{int(time.monotonic_ns())}" + size_mb = (msg.media.document.size or 0) / (1024 * 1024) if hasattr(msg, "media") and msg.media else 0 + bus.post(bus.EvDownloadQueued( + batch_id=batch_id, filename=dest.name, + size_mb=round(size_mb, 2), source="live", password=None, + )) + bus.post(bus.EvDownloadStarted(batch_id=batch_id, filename=dest.name)) + + staging = _make_staging_dir() + cmd = _build_cmd([url], staging) + log.info(f"[tdl] Single: {dest.name} ({url})") + tdl_ok = await _run_tdl(cmd, dest.name) + + if tdl_ok: + result = _move_from_staging(staging, dest.name, dest) + else: + result = False + + _cleanup_staging(staging) + + if result: + bus.post(bus.EvDownloadDone(batch_id=batch_id, filename=dest.name, via="tdl")) + else: + bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=dest.name, reason="tdl failed")) + return result diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/logs/monitor.log b/logs/monitor.log new file mode 100644 index 0000000..0580406 --- /dev/null +++ b/logs/monitor.log @@ -0,0 +1,54 @@ +2026-04-02 00:45:48,909 [INFO] utils.database: Database ready: data/hits.db +2026-04-02 00:45:49,119 [INFO] telethon.crypto.aes: libssl detected, it will be used for encryption +2026-04-02 00:45:49,156 [INFO] utils.database: Database ready: data/hits.db +2026-04-02 00:45:49,159 [INFO] tui.app: [bot] Connecting bot_client... 
+2026-04-02 00:45:49,159 [INFO] telethon.network.mtprotosender: Connecting to 149.154.175.59:443/TcpFull... +2026-04-02 00:45:49,203 [INFO] tui.app: [bus] EvStatus: Starting — 12 channel(s), 2 pattern(s) +2026-04-02 00:45:49,281 [INFO] telethon.network.mtprotosender: Connection to 149.154.175.59:443/TcpFull complete! +2026-04-02 00:45:49,900 [INFO] tui.app: [bot] bot_client connected, authorizing... +2026-04-02 00:45:49,901 [INFO] tui.app: [bot] bot_client ready +2026-04-02 00:45:49,901 [INFO] tui.app: [bot] Connecting user_client... +2026-04-02 00:45:49,901 [INFO] telethon.network.mtprotosender: Connecting to 149.154.175.59:443/TcpFull... +2026-04-02 00:45:49,908 [INFO] __main__: Cleaning up tmp/... +2026-04-02 00:54:16,429 [INFO] utils.database: Database ready: data/hits.db +2026-04-02 00:54:16,638 [INFO] telethon.crypto.aes: libssl detected, it will be used for encryption +2026-04-02 00:54:16,666 [ERROR] tui.app: [bot-thread] Unhandled exception: cannot import name 'HITS_CSV' from 'config' (/home/anti/Tools/sj/telegrammer/config.py) +Traceback (most recent call last): + File "/home/anti/Tools/sj/telegrammer/tui/app.py", line 848, in _run_bot_thread + loop.run_until_complete(self._bot_main()) + ~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^ + File "/usr/lib64/python3.14/asyncio/base_events.py", line 719, in run_until_complete + return future.result() + ~~~~~~~~~~~~~^^ + File "/home/anti/Tools/sj/telegrammer/tui/app.py", line 865, in _bot_main + from core.notifier import send_status + File "/home/anti/Tools/sj/telegrammer/core/notifier.py", line 22, in + from config import HITS_FILE, HITS_CSV, NOTIFY_CHAT_ID +ImportError: cannot import name 'HITS_CSV' from 'config' (/home/anti/Tools/sj/telegrammer/config.py) +2026-04-02 00:54:16,716 [INFO] tui.app: [bus] EvStatus: Bot thread crashed: cannot import name 'HITS_CSV' from 'config' (/home/anti/Tools/sj/telegrammer/config.py) +2026-04-02 00:54:22,624 [INFO] __main__: Cleaning up tmp/... 
+2026-04-02 00:54:34,773 [INFO] utils.database: Database ready: data/hits.db +2026-04-02 00:54:34,983 [INFO] telethon.crypto.aes: libssl detected, it will be used for encryption +2026-04-02 00:54:35,015 [INFO] utils.database: Database ready: data/hits.db +2026-04-02 00:54:35,015 [INFO] tui.app: [bot] Connecting bot_client... +2026-04-02 00:54:35,015 [INFO] telethon.network.mtprotosender: Connecting to 149.154.175.59:443/TcpFull... +2026-04-02 00:54:35,063 [INFO] tui.app: [bus] EvStatus: Starting — 12 channel(s), 2 pattern(s) +2026-04-02 00:54:35,120 [INFO] telethon.network.mtprotosender: Connection to 149.154.175.59:443/TcpFull complete! +2026-04-02 00:54:35,698 [INFO] tui.app: [bot] bot_client connected, authorizing... +2026-04-02 00:54:35,698 [INFO] tui.app: [bot] bot_client ready +2026-04-02 00:54:35,698 [INFO] tui.app: [bot] Connecting user_client... +2026-04-02 00:54:35,698 [INFO] telethon.network.mtprotosender: Connecting to 149.154.175.59:443/TcpFull... +2026-04-02 00:54:35,810 [INFO] telethon.network.mtprotosender: Connection to 149.154.175.59:443/TcpFull complete! +2026-04-02 00:54:36,420 [INFO] tui.app: [bot] user_client connected, checking auth... +2026-04-02 00:54:36,420 [INFO] tui.app: [bot] user_client ready +2026-04-02 00:54:36,563 [INFO] tui.app: [bus] EvStatus: Connected as 4n (@clp_c) +2026-04-02 00:54:36,653 [INFO] core.scraper: Warming entity cache (fetching dialogs)... +2026-04-02 00:54:38,437 [INFO] core.scraper: Entity cache ready. +2026-04-02 00:54:38,437 [INFO] tui.app: [bot] Handler registered for 12 channel(s) +2026-04-02 00:54:38,437 [INFO] core.scraper: [Backfill] Starting for 12 channel(s)... 
+2026-04-02 00:54:38,437 [INFO] core.scraper: [Backfill] Scanning history: cloudxlog (last 500 messages) +2026-04-02 00:54:38,463 [INFO] tui.app: [bus] EvStatus: Watching 12 channel(s) +2026-04-02 00:54:38,463 [INFO] tui.app: [bus] EvStatus: Live listener active +2026-04-02 00:54:38,585 [INFO] core.scraper: [Batch] 4 file(s): @cloud t13.txt, @cloud t12.txt, @cloud t11.txt, @cloud t10.txt +2026-04-02 00:54:38,585 [INFO] core.tdl_downloader: [tdl] Batch (4 files): @cloud t13.txt, @cloud t12.txt, @cloud t11.txt, @cloud t10.txt +2026-04-02 00:54:40,248 [INFO] __main__: Cleaning up tmp/... diff --git a/main.py b/main.py new file mode 100644 index 0000000..04393e9 --- /dev/null +++ b/main.py @@ -0,0 +1,142 @@ +""" +main.py — Entry point for the ULP credential monitor. + +Usage: + python main.py # TUI mode (default, requires textual) + python main.py --no-tui # Plain CLI mode + +First run will prompt for your Telegram phone number and 2FA code +to create a session file. Subsequent runs are fully automatic. 
import asyncio
import logging
import sys
import shutil
import argparse

import config
from utils.database import init_db


# ─── Logging setup ────────────────────────────────────────────────────────────

config.LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
config.TEMP_DIR.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=[
        logging.FileHandler(config.LOG_FILE, encoding="utf-8"),
    ],
)
log = logging.getLogger(__name__)

init_db()


# ─── Plain CLI mode ───────────────────────────────────────────────────────────

async def _cli_main():
    """
    Run the monitor without the TUI.

    Starts the bot and user clients, sends a start-up status message, registers
    the live handlers, runs the backfill, then blocks until the user client
    disconnects. Both clients are disconnected on the way out.
    """
    # Mirror log records to stdout; the file handler is configured above.
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    from telethon import TelegramClient
    from core.processor import compile_patterns
    from core.notifier import send_status
    from core.scraper import backfill_all, register_handlers, warm_entity_cache

    log.info("=" * 60)
    log.info(" ULP Credential Monitor — CLI mode")
    log.info("=" * 60)

    patterns = compile_patterns(config.TARGET_KEYWORDS)
    log.info(f"Loaded {len(patterns)} keyword pattern(s)")
    log.info(f"Watching {len(config.WATCHED_CHANNELS)} channel(s)")

    user_client = TelegramClient(
        config.SESSION_NAME, config.API_ID, config.API_HASH,
        connection_retries=5, auto_reconnect=True, request_retries=5,
    )
    bot_client = TelegramClient(
        "bot_session", config.API_ID, config.API_HASH,
    )

    # Start each client explicitly instead of `async with user_client,
    # bot_client`: entering the context calls start() with no arguments, which
    # would prompt interactively for the bot on an unauthorised session before
    # the bot token is ever supplied.
    try:
        await bot_client.start(bot_token=config.BOT_TOKEN)
        log.info("Bot client connected.")

        await user_client.start()
        me = await user_client.get_me()
        log.info(f"User client connected as: {me.first_name} (@{me.username})")

        await send_status(
            bot_client,
            f"✅ *Monitor started*\n"
            f"User: `{me.first_name}`\n"
            f"Channels: `{len(config.WATCHED_CHANNELS)}`\n"
            f"Patterns: `{len(patterns)}`\n"
            f"Backfill: `{config.BACKFILL_LIMIT} msg/channel`",
        )

        await warm_entity_cache(user_client)
        register_handlers(user_client, bot_client, patterns)
        log.info("Live listener registered.")

        await backfill_all(user_client, bot_client, patterns)

        log.info("Listening for new messages... (Ctrl+C to stop)")
        await user_client.run_until_disconnected()
    finally:
        # NOTE(review): assumes disconnect() is safe on a client that never
        # connected — confirm against the Telethon version in use.
        await user_client.disconnect()
        await bot_client.disconnect()

    log.info("Monitor stopped.")
def _reset_temp_dir() -> None:
    """Empty config.TEMP_DIR by removing and recreating it, if it exists."""
    log.info("Cleaning up tmp/...")
    if config.TEMP_DIR.exists():
        shutil.rmtree(config.TEMP_DIR, ignore_errors=True)
        # rmtree ignores errors, so the directory may still be there;
        # exist_ok avoids raising FileExistsError from a cleanup path.
        config.TEMP_DIR.mkdir(parents=True, exist_ok=True)


def main():
    """
    Parse the command line and run the monitor.

    ``--no-tui`` runs the plain asyncio CLI; otherwise the Textual TUI is
    started. The temp directory is reset on exit in both modes.
    """
    parser = argparse.ArgumentParser(description="ULP Credential Monitor")
    parser.add_argument(
        "--no-tui",
        action="store_true",
        help="Run in plain CLI mode (no Textual TUI)",
    )
    args = parser.parse_args()

    if args.no_tui:
        try:
            asyncio.run(_cli_main())
        except KeyboardInterrupt:
            log.info("Interrupted by user.")
        finally:
            _reset_temp_dir()
            log.info("Done.")
        return

    try:
        from tui.app import run_tui
    except ImportError as exc:
        # Only a missing Textual package gets the install hint; any other
        # import failure propagates with its real traceback.
        missing = (getattr(exc, "name", None) or "").split(".")[0]
        if missing != "textual":
            raise
        print(
            "⚠ Textual is not installed. Install it with:\n"
            " pip install textual\n"
            "Or run in plain CLI mode:\n"
            " python main.py --no-tui",
            file=sys.stderr,
        )
        sys.exit(1)

    try:
        run_tui()
    except KeyboardInterrupt:
        pass
    finally:
        _reset_temp_dir()


if __name__ == "__main__":
    main()
+os.environ.setdefault("API_ID", "12345") +os.environ.setdefault("API_HASH", "dummy_hash_for_tests") +os.environ.setdefault("BOT_TOKEN", "0:dummy_bot_token") +os.environ.setdefault("NOTIFY_CHAT_ID", "99999") + +import pytest +import config +import utils.scorer as scorer + +# Two test keywords: +# @testcorp\.com — employee email domain (triggers CRITICAL) +# testcorp\.com — plain domain match (triggers LOW baseline) +TEST_KEYWORDS = [r"@testcorp\.com", r"testcorp\.com"] + + +@pytest.fixture +def patched_keywords(monkeypatch): + """ + Override TARGET_KEYWORDS for the duration of a test and rebuild the + scorer's module-level globals so scoring logic uses known test patterns. + """ + monkeypatch.setattr(config, "TARGET_KEYWORDS", TEST_KEYWORDS) + # scorer.py uses `from config import TARGET_KEYWORDS` — a local binding that + # doesn't update when config.TARGET_KEYWORDS is patched. Patch it directly. + monkeypatch.setattr(scorer, "TARGET_KEYWORDS", TEST_KEYWORDS) + monkeypatch.setattr(scorer, "EMPLOYEE_DOMAINS", scorer._build_employee_domains()) + monkeypatch.setattr(scorer, "ORG_DOMAINS", scorer._build_org_domains()) diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..301b2d9 --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,55 @@ +""" +Tests for utils/cache.py — file-ID deduplication cache. + +Each test gets an isolated cache file via the `isolated_cache` fixture +so tests never touch data/cache.json. 
+""" + +import pytest +import utils.cache as cache_module + + +@pytest.fixture(autouse=True) +def isolated_cache(tmp_path, monkeypatch): + monkeypatch.setattr(cache_module, "CACHE_FILE", tmp_path / "cache.json") + + +def test_unseen_id_returns_false(): + assert cache_module.is_seen(12345) is False + + +def test_mark_seen_makes_id_seen(): + cache_module.mark_seen(12345) + assert cache_module.is_seen(12345) is True + + +def test_multiple_ids_stored_independently(): + cache_module.mark_seen(1) + cache_module.mark_seen(2) + cache_module.mark_seen(3) + assert cache_module.is_seen(1) + assert cache_module.is_seen(2) + assert cache_module.is_seen(3) + assert not cache_module.is_seen(4) + + +def test_persists_to_disk_between_calls(): + """ + is_seen() and mark_seen() each load from disk independently. + This verifies the persist-on-write / load-on-read contract + (simulating what happens across separate function calls in the bot loop). + """ + cache_module.mark_seen(999) + assert cache_module.is_seen(999) is True + + +def test_missing_cache_file_handled_gracefully(tmp_path, monkeypatch): + monkeypatch.setattr(cache_module, "CACHE_FILE", tmp_path / "nonexistent.json") + assert cache_module.is_seen(42) is False + + +def test_mark_seen_is_idempotent(): + cache_module.mark_seen(7) + cache_module.mark_seen(7) + cache_module.mark_seen(7) + assert cache_module.is_seen(7) is True diff --git a/tests/test_database.py b/tests/test_database.py new file mode 100644 index 0000000..10bb543 --- /dev/null +++ b/tests/test_database.py @@ -0,0 +1,188 @@ +""" +Tests for utils/database.py — SQLite persistence layer. + +Each test gets an isolated in-memory-equivalent DB via the `isolated_db` +fixture so tests never touch data/hits.db. 
+""" + +import pytest +import utils.database as db_module +from utils.scorer import ScoredHit, CRITICAL, HIGH, MEDIUM, LOW + + +def make_hit(severity=LOW, url="testcorp.com", username="user", password="pass", raw=None): + """Build a minimal ScoredHit for insertion tests.""" + scores = {CRITICAL: 40, HIGH: 30, MEDIUM: 20, LOW: 10} + return ScoredHit( + raw=raw or f"{url}|{username}|{password}", + severity=severity, + score=scores[severity], + reasons=["Test reason"], + url=url, + username=username, + password=password, + ) + + +@pytest.fixture(autouse=True) +def isolated_db(tmp_path, monkeypatch): + monkeypatch.setattr(db_module, "DB_FILE", tmp_path / "test_hits.db") + db_module.init_db() + + +# ─── init_db ───────────────────────────────────────────────────────────────── + +def test_init_db_is_idempotent(): + db_module.init_db() + db_module.init_db() # must not raise + + +# ─── insert_hits ────────────────────────────────────────────────────────────── + +def test_insert_returns_correct_row_count(): + hits = [make_hit(), make_hit(severity=CRITICAL)] + count = db_module.insert_hits(hits, source="testchan", filename="combo.txt") + assert count == 2 + + +def test_insert_stores_all_fields(): + hit = make_hit(severity=HIGH, url="intranet.testcorp.com", username="jdoe", password="s3cr3t") + db_module.insert_hits([hit], source="mychan", filename="creds.zip") + rows = db_module.search("jdoe") + assert len(rows) == 1 + row = rows[0] + assert row["url"] == "intranet.testcorp.com" + assert row["username"] == "jdoe" + assert row["password"] == "s3cr3t" + assert row["severity"] == HIGH + assert row["score"] == 30 + assert row["source"] == "mychan" + assert row["filename"] == "creds.zip" + assert row["seen_before"] == 0 + + +def test_insert_seen_before_flag(): + hit = make_hit() + db_module.insert_hits([hit], source="chan", filename="f.txt", seen_before=True) + rows = db_module.search("testcorp") + assert rows[0]["seen_before"] == 1 + + +# ─── search 
─────────────────────────────────────────────────────────────────── + +def test_search_finds_by_username(): + db_module.insert_hits([make_hit(username="jdoe@testcorp.com")], source="c", filename="f.txt") + results = db_module.search("jdoe") + assert len(results) == 1 + assert results[0]["username"] == "jdoe@testcorp.com" + + +def test_search_finds_by_url(): + db_module.insert_hits([make_hit(url="admin.testcorp.com")], source="c", filename="f.txt") + results = db_module.search("admin.testcorp") + assert len(results) == 1 + + +def test_search_finds_by_raw(): + db_module.insert_hits([make_hit(raw="raw_unique_token_xyz")], source="c", filename="f.txt") + results = db_module.search("unique_token") + assert len(results) == 1 + + +def test_search_returns_empty_for_no_match(): + db_module.insert_hits([make_hit()], source="c", filename="f.txt") + assert db_module.search("zzznomatch_xyz") == [] + + +def test_search_sorted_by_score_descending(): + db_module.insert_hits([make_hit(severity=LOW)], source="c", filename="f.txt") + db_module.insert_hits([make_hit(severity=CRITICAL, url="admin.testcorp.com")], source="c", filename="f.txt") + results = db_module.search("testcorp") + assert results[0]["score"] >= results[-1]["score"] + + +# ─── by_severity ────────────────────────────────────────────────────────────── + +def test_by_severity_returns_correct_severity(): + db_module.insert_hits([make_hit(severity=CRITICAL, url="admin.testcorp.com")], source="c", filename="f.txt") + db_module.insert_hits([make_hit(severity=LOW)], source="c", filename="f.txt") + results = db_module.by_severity(CRITICAL) + assert len(results) == 1 + assert results[0]["severity"] == CRITICAL + + +def test_by_severity_excludes_duplicates(): + """seen_before=1 rows must be invisible to by_severity — they are stored for stats only.""" + hit = make_hit(severity=HIGH, url="intranet.testcorp.com") + db_module.insert_hits([hit], source="c", filename="f.txt", seen_before=True) + assert db_module.by_severity(HIGH) 
== [] + + +def test_by_severity_returns_empty_when_none(): + assert db_module.by_severity(CRITICAL) == [] + + +# ─── stats ─────────────────────────────────────────────────────────────────── + +def test_stats_counts_by_severity(): + db_module.insert_hits([make_hit(severity=CRITICAL, url="admin.testcorp.com")], source="c", filename="f.txt") + db_module.insert_hits([make_hit(severity=HIGH, url="intranet.testcorp.com")], source="c", filename="f.txt") + db_module.insert_hits([make_hit(severity=MEDIUM, url="app.testcorp.com")], source="c", filename="f.txt") + db_module.insert_hits([make_hit(severity=LOW)], source="c", filename="f.txt") + s = db_module.stats() + assert s["critical"] == 1 + assert s["high"] == 1 + assert s["medium"] == 1 + assert s["low"] == 1 + assert s["total"] == 4 + assert s["unique"] == 4 + assert s["duplicates"] == 0 + + +def test_stats_separates_duplicates(): + hit = make_hit() + db_module.insert_hits([hit], source="c", filename="f.txt", seen_before=False) + db_module.insert_hits([hit], source="c", filename="f.txt", seen_before=True) + s = db_module.stats() + assert s["total"] == 2 + assert s["unique"] == 1 + assert s["duplicates"] == 1 + + +def test_stats_severity_counts_exclude_duplicates(): + hit = make_hit(severity=CRITICAL, url="admin.testcorp.com") + db_module.insert_hits([hit], source="c", filename="f.txt", seen_before=False) + db_module.insert_hits([hit], source="c", filename="f.txt", seen_before=True) + s = db_module.stats() + assert s["critical"] == 1 # only the unique one + + +def test_stats_empty_db(): + s = db_module.stats() + assert s["total"] == 0 + assert s["unique"] == 0 + assert s["top_source"] is None + + +def test_stats_top_source(): + db_module.insert_hits([make_hit()], source="channelA", filename="f.txt") + db_module.insert_hits([make_hit()], source="channelA", filename="f.txt") + db_module.insert_hits([make_hit()], source="channelB", filename="f.txt") + s = db_module.stats() + assert s["top_source"]["source"] == "channelA" + 
+ +# ─── recent ─────────────────────────────────────────────────────────────────── + +def test_recent_respects_limit(): + for i in range(5): + db_module.insert_hits([make_hit(raw=f"testcorp.com|user{i}|pass")], source="c", filename="f.txt") + rows = db_module.recent(limit=3) + assert len(rows) == 3 + + +def test_recent_returns_all_when_under_limit(): + db_module.insert_hits([make_hit()], source="c", filename="f.txt") + db_module.insert_hits([make_hit()], source="c", filename="f.txt") + rows = db_module.recent(limit=50) + assert len(rows) == 2 diff --git a/tests/test_processor.py b/tests/test_processor.py new file mode 100644 index 0000000..108586c --- /dev/null +++ b/tests/test_processor.py @@ -0,0 +1,223 @@ +""" +Tests for core/processor.py — archive extraction and line-by-line search. + +No Telegram deps, no async. Tests create real archive fixtures in tmp_path +so process_file's cleanup guarantee can be verified against actual disk state. +""" + +import zipfile +import pytest +from pathlib import Path + +from core.processor import compile_patterns, search_file, process_file + + +@pytest.fixture +def patterns(): + return compile_patterns([r"testcorp\.com"]) + + +# ─── compile_patterns ───────────────────────────────────────────────────────── + +class TestCompilePatterns: + def test_returns_case_insensitive_patterns(self): + pats = compile_patterns([r"hello"]) + assert pats[0].search("HELLO") is not None + assert pats[0].search("Hello") is not None + + def test_multiple_patterns(self): + pats = compile_patterns([r"alpha", r"beta"]) + assert len(pats) == 2 + assert pats[0].search("alpha_line") + assert pats[1].search("beta_line") + + def test_empty_list(self): + assert compile_patterns([]) == [] + + +# ─── search_file ────────────────────────────────────────────────────────────── + +class TestSearchFile: + def test_returns_matching_lines(self, tmp_path, patterns): + f = tmp_path / "combo.txt" + f.write_text("testcorp.com|user|pass\nothersite.com|user|pass\n") + 
assert search_file(f, patterns) == ["testcorp.com|user|pass"] + + def test_returns_empty_when_no_match(self, tmp_path, patterns): + f = tmp_path / "combo.txt" + f.write_text("nomatch.com|user|pass\nanother.net|x|y\n") + assert search_file(f, patterns) == [] + + def test_strips_whitespace_from_returned_lines(self, tmp_path, patterns): + f = tmp_path / "combo.txt" + f.write_text(" testcorp.com|user|pass \n") + hits = search_file(f, patterns) + assert hits[0] == "testcorp.com|user|pass" + + def test_skips_blank_lines(self, tmp_path, patterns): + f = tmp_path / "combo.txt" + f.write_text("\n\ntestcorp.com|user|pass\n\n") + assert search_file(f, patterns) == ["testcorp.com|user|pass"] + + def test_handles_encoding_errors_gracefully(self, tmp_path, patterns): + """Combo files are often messy — invalid bytes must not crash the search.""" + f = tmp_path / "combo.txt" + f.write_bytes( + b"testcorp.com|user1|pass\n" + b"\xff\xfe invalid bytes here\n" + b"testcorp.com|user2|pass\n" + ) + hits = search_file(f, patterns) + assert len(hits) == 2 + + def test_multiple_matching_lines_all_returned(self, tmp_path, patterns): + f = tmp_path / "combo.txt" + f.write_text( + "testcorp.com|alice|pass1\n" + "nomatch.com|bob|pass2\n" + "testcorp.com|carol|pass3\n" + ) + hits = search_file(f, patterns) + assert len(hits) == 2 + + +# ─── process_file — plain .txt ──────────────────────────────────────────────── + +class TestProcessFilePlainText: + def test_returns_hits(self, tmp_path, patterns): + f = tmp_path / "combo.txt" + f.write_text("testcorp.com|user|pass\nnomatch.com|x|y\n") + hits = process_file(f, patterns) + assert hits == ["testcorp.com|user|pass"] + + def test_deletes_file_after_processing(self, tmp_path, patterns): + f = tmp_path / "combo.txt" + f.write_text("testcorp.com|user|pass\n") + process_file(f, patterns) + assert not f.exists() + + def test_deletes_file_even_with_no_hits(self, tmp_path, patterns): + f = tmp_path / "combo.txt" + f.write_text("nomatch.com|x|y\n") + hits 
= process_file(f, patterns) + assert hits == [] + assert not f.exists() + + +# ─── process_file — .zip extraction ────────────────────────────────────────── + +class TestProcessFileZip: + def _make_zip(self, tmp_path: Path, content: str, filename="content.txt") -> Path: + txt = tmp_path / filename + txt.write_text(content) + zf = tmp_path / "combo.zip" + with zipfile.ZipFile(zf, "w") as z: + z.write(txt, filename) + txt.unlink() + return zf + + def test_extracts_and_returns_hits(self, tmp_path, patterns): + zf = self._make_zip(tmp_path, "testcorp.com|user|pass\nnomatch.com|x|y\n") + hits = process_file(zf, patterns) + assert hits == ["testcorp.com|user|pass"] + + def test_deletes_zip_after_processing(self, tmp_path, patterns): + zf = self._make_zip(tmp_path, "testcorp.com|user|pass\n") + process_file(zf, patterns) + assert not zf.exists() + + def test_deletes_extract_dir_after_processing(self, tmp_path, patterns): + zf = self._make_zip(tmp_path, "testcorp.com|user|pass\n") + extract_dir = tmp_path / "combo" # sibling dir named after zip stem + process_file(zf, patterns) + assert not extract_dir.exists() + + def test_no_hits_still_cleans_up(self, tmp_path, patterns): + zf = self._make_zip(tmp_path, "nomatch.com|x|y\n") + extract_dir = tmp_path / "combo" + process_file(zf, patterns) + assert not zf.exists() + assert not extract_dir.exists() + + def test_zip_with_multiple_txt_files(self, tmp_path, patterns): + txt1 = tmp_path / "a.txt" + txt1.write_text("testcorp.com|alice|pass\n") + txt2 = tmp_path / "b.txt" + txt2.write_text("testcorp.com|bob|pass\n") + zf = tmp_path / "combo.zip" + with zipfile.ZipFile(zf, "w") as z: + z.write(txt1, "a.txt") + z.write(txt2, "b.txt") + txt1.unlink() + txt2.unlink() + + hits = process_file(zf, patterns) + assert len(hits) == 2 + + +# ─── process_file — nested archives ────────────────────────────────────────── + +class TestProcessFileNested: + def test_nested_zip_is_recursed(self, tmp_path, patterns): + inner_txt = tmp_path / 
"inner.txt" + inner_txt.write_text("testcorp.com|user|pass\n") + inner_zip = tmp_path / "inner.zip" + with zipfile.ZipFile(inner_zip, "w") as z: + z.write(inner_txt, "inner.txt") + inner_txt.unlink() + + outer_zip = tmp_path / "outer.zip" + with zipfile.ZipFile(outer_zip, "w") as z: + z.write(inner_zip, "inner.zip") + inner_zip.unlink() + + hits = process_file(outer_zip, patterns) + assert hits == ["testcorp.com|user|pass"] + assert not outer_zip.exists() + assert not (tmp_path / "outer").exists() + + +# ─── process_file — password-protected .7z ─────────────────────────────────── + +class TestProcessFile7zPassword: + def test_unlocks_with_correct_password(self, tmp_path, patterns, monkeypatch): + try: + import py7zr + except ImportError: + pytest.skip("py7zr not installed") + + import core.processor as proc_module + + # Isolate to a single known password so the test doesn't depend on config + monkeypatch.setattr(proc_module, "ARCHIVE_PASSWORDS", [b"secretpwd"]) + + txt = tmp_path / "content.txt" + txt.write_text("testcorp.com|user|pass\n") + szf = tmp_path / "combo.7z" + with py7zr.SevenZipFile(szf, "w", password="secretpwd") as z: + z.write(txt, "content.txt") + txt.unlink() + + hits = process_file(szf, patterns) + assert hits == ["testcorp.com|user|pass"] + assert not szf.exists() + + def test_skips_when_no_password_matches(self, tmp_path, patterns, monkeypatch): + try: + import py7zr + except ImportError: + pytest.skip("py7zr not installed") + + import core.processor as proc_module + monkeypatch.setattr(proc_module, "ARCHIVE_PASSWORDS", [b"wrongpwd"]) + + txt = tmp_path / "content.txt" + txt.write_text("testcorp.com|user|pass\n") + szf = tmp_path / "combo.7z" + with py7zr.SevenZipFile(szf, "w", password="correctpwd") as z: + z.write(txt, "content.txt") + txt.unlink() + + # No hits — archive could not be opened + hits = process_file(szf, patterns) + assert hits == [] diff --git a/tests/test_scorer.py b/tests/test_scorer.py new file mode 100644 index 
0000000..54d0912 --- /dev/null +++ b/tests/test_scorer.py @@ -0,0 +1,282 @@ +""" +Tests for utils/scorer.py — severity scoring and ULP line parsing. + +Most tests use the `patched_keywords` fixture (see conftest.py) which +replaces TARGET_KEYWORDS with two entries: + @testcorp.com — employee email domain (CRITICAL trigger) + testcorp.com — plain domain match (LOW baseline) +""" + +import pytest +from utils.scorer import score_hit, score_hits, summarize, CRITICAL, HIGH, MEDIUM, LOW + + +# ─── ULP line parsing ───────────────────────────────────────────────────────── + +class TestULPParsing: + def test_parses_pipe_separated_fields(self, patched_keywords): + hit = score_hit("site.com|jdoe@testcorp.com|pass123") + assert hit.url == "site.com" + assert hit.username == "jdoe@testcorp.com" + assert hit.password == "pass123" + + def test_parses_colon_separated_fields(self, patched_keywords): + # 'site.com' has no colon, so url field captures it cleanly + hit = score_hit("site.com:jdoe@testcorp.com:pass123") + assert hit.url == "site.com" + assert hit.username == "jdoe@testcorp.com" + assert hit.password == "pass123" + + def test_malformed_line_yields_none_fields(self, patched_keywords): + hit = score_hit("justaplaindomainmatch_testcorp.com") + assert hit.url is None + assert hit.username is None + assert hit.password is None + + def test_raw_field_preserved_exactly(self, patched_keywords): + line = "site.com|jdoe@testcorp.com|pass123" + hit = score_hit(line) + assert hit.raw == line + + +# ─── Real-world ULP format coverage ────────────────────────────────────────── + +class TestULPParsingRealWorld: + """ + Parametrized against real stealer-log lines. + Only field extraction is asserted (url/username/password), not severity, + so no patched_keywords fixture is needed.
+ """ + + @pytest.mark.parametrize("line,exp_url,exp_user,exp_pass", [ + # ── Protocol + port + path, colon separator ────────────────────────── + # Port is digits followed by '/' — must be consumed as part of the URL. + ( + "http://portal.fakehosp.example.com:88/:55512309-1:hunter2", + "http://portal.fakehosp.example.com:88/", "55512309-1", "hunter2", + ), + ( + "http://portal.fakehosp.example.com:8085/app/booking/:3:letmein", + "http://portal.fakehosp.example.com:8085/app/booking/", "3", "letmein", + ), + ( + "https://portal.fakehosp.example.com:81/app/FrmResetPassword.aspx:30219876-K:Spr!ng22@", + "https://portal.fakehosp.example.com:81/app/FrmResetPassword.aspx", + "30219876-K", "Spr!ng22@", + ), + + # ── Protocol + no port, ID-style username looks like port but has hyphen ── + # ':\d+-' must NOT be consumed as a port (no '/' after the digits). + ( + "https://booking.fakehosp.example.com:40293817-6:Summ3r99..", + "https://booking.fakehosp.example.com", "40293817-6", "Summ3r99..", + ), + ( + "https://booking.fakehosp.example.com/:40293817-6:Summ3r99..", + "https://booking.fakehosp.example.com/", "40293817-6", "Summ3r99..", + ), + + # ── Protocol + email username directly after host (no trailing slash) ─ + ( + "https://booking.fakehosp.example.com:carlos.gomez@gmail.com:Qwerty99", + "https://booking.fakehosp.example.com", "carlos.gomez@gmail.com", "Qwerty99", + ), + ( + "https://accounts.saas-vendor.example.com/signin:jdoe@fakehosp.example.com:W1nter20", + "https://accounts.saas-vendor.example.com/signin", "jdoe@fakehosp.example.com", "W1nter20", + ), + ( + "https://login.sso-provider.example.com/common/oauth2/authorize:jdoe@fakehosp.example.com:Passw0rd!", + "https://login.sso-provider.example.com/common/oauth2/authorize", + "jdoe@fakehosp.example.com", "Passw0rd!", + ), + + # ── Pipe separator (unambiguous — port stays in URL) ────────────────── + ( + "http://portal.fakehosp.example.com:88/|22.987.654-3|florida88", + "http://portal.fakehosp.example.com:88/", 
"22.987.654-3", "florida88", + ), + ( + "https://booking.fakehosp.example.com/|77341209-0|Ninja42", + "https://booking.fakehosp.example.com/", "77341209-0", "Ninja42", + ), + + # ── Mixed separators: pipe after URL, colon between user/password ───── + ( + "http://portal.fakehosp.example.com:8085/app/booking/|Z:wd1980wd", + "http://portal.fakehosp.example.com:8085/app/booking/", "Z", "wd1980wd", + ), + + # ── No protocol, port in URL ───────────────────────────────────────── + ( + "portal.fakehosp.example.com:88/:22.987.654-3:florida88", + "portal.fakehosp.example.com:88/", "22.987.654-3", "florida88", + ), + + # ── No protocol, no port — plain colon separators ──────────────────── + ( + "booking.fakehosp.example.com:66778899-7:correcthorse", + "booking.fakehosp.example.com", "66778899-7", "correcthorse", + ), + ( + "booking.fakehosp.example.com/:smithjohnathan:Bb881955", + "booking.fakehosp.example.com/", "smithjohnathan", "Bb881955", + ), + + # ── Password with special characters ───────────────────────────────── + ( + "https://booking.fakehosp.example.com/:11223344-5:dragonball99*", + "https://booking.fakehosp.example.com/", "11223344-5", "dragonball99*", + ), + ( + "https://booking.fakehosp.example.com/:9988776-65:abc.456#", + "https://booking.fakehosp.example.com/", "9988776-65", "abc.456#", + ), + + # ── Semicolon separator ─────────────────────────────────────────────── + ( + "booking.fakehosp.example.com;smithjohnathan;Bb881955", + "booking.fakehosp.example.com", "smithjohnathan", "Bb881955", + ), + ]) + def test_real_world_ulp_parsing(self, line, exp_url, exp_user, exp_pass): + hit = score_hit(line) + assert hit.url == exp_url, f"URL mismatch for: {line!r}" + assert hit.username == exp_user, f"Username mismatch for: {line!r}" + assert hit.password == exp_pass, f"Password mismatch for: {line!r}" + + +# ─── Severity classification ────────────────────────────────────────────────── + +class TestSeverityClassification: + def 
test_employee_email_in_username_is_critical(self, patched_keywords): + hit = score_hit("site.com|jdoe@testcorp.com|pass123") + assert hit.severity == CRITICAL + + def test_gmail_on_org_url_is_not_critical(self, patched_keywords): + """ + Core documented footgun: org domain appears in the URL, but the + credential username is a gmail address. Must NOT be CRITICAL. + The employee-domain pattern requires a literal '@' before the domain, + so 'testcorp.com' in the URL field never triggers it. + """ + hit = score_hit("testcorp.com|user@gmail.com|pass123") + assert hit.severity != CRITICAL + + def test_critical_service_subdomain_is_critical(self, patched_keywords): + hit = score_hit("admin.testcorp.com|user|pass123") + assert hit.severity == CRITICAL + + def test_vpn_subdomain_is_critical(self, patched_keywords): + hit = score_hit("vpn.testcorp.com|user|pass123") + assert hit.severity == CRITICAL + + def test_gitlab_subdomain_is_critical(self, patched_keywords): + hit = score_hit("gitlab.testcorp.com|user|pass123") + assert hit.severity == CRITICAL + + def test_intranet_subdomain_is_high(self, patched_keywords): + hit = score_hit("intranet.testcorp.com|user|pass123") + assert hit.severity == HIGH + + def test_sso_subdomain_is_high(self, patched_keywords): + hit = score_hit("sso.testcorp.com|user|pass123") + assert hit.severity == HIGH + + def test_app_subdomain_is_medium(self, patched_keywords): + hit = score_hit("app.testcorp.com|user|pass123") + assert hit.severity == MEDIUM + + def test_booking_subdomain_is_medium(self, patched_keywords): + hit = score_hit("booking.testcorp.com|user|pass123") + assert hit.severity == MEDIUM + + def test_plain_domain_match_is_low(self, patched_keywords): + hit = score_hit("testcorp.com|user|pass123") + assert hit.severity == LOW + + def test_employee_email_beats_high_service(self, patched_keywords): + """Employee email domain must win over a HIGH service classification.""" + hit = 
score_hit("intranet.testcorp.com|jdoe@testcorp.com|pass") + assert hit.severity == CRITICAL + + def test_employee_email_beats_medium_service(self, patched_keywords): + hit = score_hit("app.testcorp.com|jdoe@testcorp.com|pass") + assert hit.severity == CRITICAL + + def test_multiple_checks_accumulate_reasons(self, patched_keywords): + """A line matching both employee email and a critical service URL collects both reasons.""" + hit = score_hit("admin.testcorp.com|jdoe@testcorp.com|pass") + assert hit.severity == CRITICAL + assert len(hit.reasons) >= 2 + + def test_score_matches_severity(self, patched_keywords): + from utils.scorer import SEVERITY_SCORES + for line, expected_severity in [ + ("admin.testcorp.com|user|pass", CRITICAL), + ("intranet.testcorp.com|user|pass", HIGH), + ("app.testcorp.com|user|pass", MEDIUM), + ("testcorp.com|user|pass", LOW), + ]: + hit = score_hit(line) + assert hit.score == SEVERITY_SCORES[expected_severity] + + +# ─── Weak password flags ────────────────────────────────────────────────────── + +class TestWeakPasswordFlags: + def test_short_password_adds_reason(self, patched_keywords): + hit = score_hit("testcorp.com|user|abc") + assert any("Weak password" in r for r in hit.reasons) + + def test_common_password_adds_reason(self, patched_keywords): + hit = score_hit("testcorp.com|user|password") + assert any("Common password" in r for r in hit.reasons) + + def test_weak_password_does_not_escalate_severity(self, patched_keywords): + """Weak password flags are informational — they must not change severity.""" + hit = score_hit("testcorp.com|user|abc") + assert hit.severity == LOW + + def test_strong_password_adds_no_warning(self, patched_keywords): + hit = score_hit("testcorp.com|user|Xk9#mP2qLrTv") + assert not any("password" in r.lower() for r in hit.reasons if "Employee" not in r and "domain" not in r.lower() and "service" not in r.lower()) + + +# ─── score_hits and summarize ───────────────────────────────────────────────── + +class 
TestScoreHitsAndSummarize: + def test_score_hits_sorted_descending(self, patched_keywords): + lines = [ + "testcorp.com|user|pass", # LOW + "admin.testcorp.com|user|pass", # CRITICAL + "intranet.testcorp.com|user|pass", # HIGH + "app.testcorp.com|user|pass", # MEDIUM + ] + hits = score_hits(lines) + scores = [h.score for h in hits] + assert scores == sorted(scores, reverse=True) + + def test_summarize_counts_each_severity(self, patched_keywords): + lines = [ + "admin.testcorp.com|user|pass", # CRITICAL + "intranet.testcorp.com|user|pass", # HIGH + "app.testcorp.com|user|pass", # MEDIUM + "testcorp.com|user|pass", # LOW + ] + summary = summarize(score_hits(lines)) + assert summary[CRITICAL] == 1 + assert summary[HIGH] == 1 + assert summary[MEDIUM] == 1 + assert summary[LOW] == 1 + + def test_summarize_zero_for_absent_severities(self, patched_keywords): + hits = score_hits(["testcorp.com|user|pass"]) # LOW only + summary = summarize(hits) + assert summary[CRITICAL] == 0 + assert summary[HIGH] == 0 + assert summary[MEDIUM] == 0 + assert summary[LOW] == 1 + + def test_score_hits_empty_list(self, patched_keywords): + assert score_hits([]) == [] diff --git a/tui/__init__.py b/tui/__init__.py new file mode 100644 index 0000000..58754af --- /dev/null +++ b/tui/__init__.py @@ -0,0 +1 @@ +"""tui — Textual TUI frontend and event bus.""" diff --git a/tui/app.md b/tui/app.md new file mode 100644 index 0000000..bb79ded --- /dev/null +++ b/tui/app.md @@ -0,0 +1,130 @@ +# tui/app.py + +Textual TUI frontend. Entry point: `run_tui()`. 
+ +## Entry point + +```python +from tui.app import run_tui +run_tui() # called by main.py +``` + +--- + +## Screen hierarchy + +``` +MonitorApp (App) +├── [default screen] +│ ├── Header +│ ├── #top-row (Horizontal) +│ │ ├── DownloadPanel #dl-panel +│ │ └── HitsPanel #hits-panel +│ ├── StatsPanel #stats-panel +│ ├── ChannelPanel #ch-panel +│ └── Footer +├── SearchScreen (push/pop via 's') +├── HitsDBScreen (push/pop via 'h') +└── KeywordsScreen (push/pop via 'k') +``` + +--- + +## MonitorApp + +### Threading model +- **Bot backend** → `threading.Thread(daemon=True)` with its own `asyncio.new_event_loop()` + Runs `_bot_main()` — Telethon is completely isolated from Textual's loop. +- **TUI drain** → `set_interval(0.1, _drain_bus)` — polls `queue.Queue` every 100ms on Textual's loop. + +### Key methods + +| Method | Description | +|--------|-------------| +| `on_mount()` | Calls `bus.init_bus()`, starts bot thread, sets drain interval | +| `_drain_bus()` | Drains all pending events from `queue.Queue`, dispatches to widgets | +| `_run_bot_thread()` | Thread entry: creates event loop, runs `_bot_main()` | +| `_bot_main()` | Async bot backend: connect, auth, backfill, live handler loop | +| `_signal_channel_changed()` | Thread-safely sets the bot loop's `asyncio.Event` via `call_soon_threadsafe` | + +### Keybindings + +| Key | Action | +|-----|--------| +| `s` | Push `SearchScreen` | +| `h` | Push `HitsDBScreen` | +| `k` | Push `KeywordsScreen` | +| `c` | Clear download + hits logs | +| `r` | Force-refresh stats bar | +| `q` / `ctrl+c` | Quit | + +--- + +## Widgets + +### DownloadPanel +Left panel. 
Two `RichLog` widgets separated by a dashed line: +- **top** (`#tdl-out`): raw tdl output lines (ANSI stripped) +- **bottom** (`#dl-log`): structured download status entries + +Methods: `tdl_line(line)`, `queued(filename, size_mb, source, password)`, `status(filename, state, via)`, `clear_logs()` + +States for `status()`: `queued` · `downloading` · `done_tdl` · `done_tel` · `failed` + +### HitsPanel +Right panel. Single `RichLog` with color-coded hit entries. +Reactive `hit_count` updates the panel title badge automatically. + +Methods: `add_hit(severity, raw, source, filename, reasons)`, `clear_log()` + +### StatsPanel +Slim horizontal bar. Polls `utils.database.stats()` every 10s via `set_interval`. +Also refreshed immediately on each `EvHit` event. + +### ChannelPanel +Bottom panel. `ListView` + `Input` + buttons. +Add/remove posts `EvChannelAdded` / `EvChannelRemoved` onto the bus. +Changes apply immediately (handler re-registered). Not persisted to `config.py` automatically. + +--- + +## Screens + +### SearchScreen (`s`) +- Text input → queries `utils.database.search(keyword)` +- Results in a `DataTable` with columns: Sev, Time, URL, Username, Password, Source, File +- Submit with `↵` or Search button; `Escape` to dismiss + +### HitsDBScreen (`h`) +- Toolbar buttons + number keys filter by severity +- `r` → recent 50, `1`→CRITICAL, `2`→HIGH, `3`→MEDIUM, `4`→LOW +- Calls `utils.database.recent()` / `by_severity()` + +### KeywordsScreen (`k`) +- Live-edit `config.TARGET_KEYWORDS` +- Validates regex before adding +- On change: rebuilds `utils.scorer.EMPLOYEE_DOMAINS` and `ORG_DOMAINS` +- Bot handler recompiles patterns on the next incoming message automatically +- **Changes are in-memory only** — copy to `config.py` to persist + +--- + +## Bot auth flow (`_bot_main`) + +``` +await bot_client.connect() +await bot_client.is_user_authorized()? → sign_in(bot_token=...) +await user_client.connect() +await user_client.is_user_authorized()? 
→ log error + return (must run --no-tui first) +warm_entity_cache() +_make_handler(channels) ← registers NewMessage handler +backfill_all() +run_until_disconnected() ┐ +_watch_channels() ┘ gathered +``` + +Channel-change signal path: +``` +ChannelPanel button → EvChannel* on bus → _drain_bus → _signal_channel_changed() + → call_soon_threadsafe(asyncio.Event.set) → _watch_channels() wakes → _make_handler() +``` diff --git a/tui/app.py b/tui/app.py new file mode 100644 index 0000000..0413862 --- /dev/null +++ b/tui/app.py @@ -0,0 +1,1016 @@ +""" +tui/app.py — Textual TUI for the ULP credential monitor. + +Layout (main screen): + ┌──────────────────────────────────┬──────────────────────────────────┐ + │ 📥 Downloads │ 🎯 Hits [N] │ + │ (live tdl output + status log) │ (color-coded hit log) │ + ├──────────────────────────────────┴──────────────────────────────────┤ + │ 📊 Stats bar (live DB counters, auto-refresh every 10 s) │ + ├─────────────────────────────────────────────────────────────────────┤ + │ 📡 Channels (add / remove entries; applied immediately) │ + └─────────────────────────────────────────────────────────────────────┘ + │ Footer (keybindings) │ + └─────────────────────────────────────────────────────────────────────┘ + +Additional screens (push/pop via keybindings): + • SearchScreen — full-text search across hits DB [s] + • HitsDBScreen — recent-50 / per-severity viewer [h] + • KeywordsScreen — live-edit TARGET_KEYWORDS regex list [k] + +Architecture: + - The entire bot backend runs in a daemon thread with its own asyncio event + loop, so Telethon stays isolated from Textual's event loop. + - The TUI polls the event bus (tui/events.py) every 100 ms via _drain_bus(), + dispatching each pending event to the right panel. + - Channel add/remove from the UI immediately re-registers Telethon handlers + via asyncio.Event signalling into the bot thread. + - tdl output is piped (not terminal-inherited) and relayed via EvTdlOutput + into the download panel's RichLog.
+ - StatsPanel polls database.stats() every 10 s via set_interval(). + - Keyword changes are applied in-memory immediately (scorer caches rebuilt); + NOT auto-persisted to config.py — a notice banner reminds the user. + - Live patterns are recompiled from config.TARGET_KEYWORDS on every message + so keyword changes take effect without a handler restart. +""" + +import asyncio +import logging +import queue +import shutil +import threading +from datetime import datetime, timezone + +from textual.app import App, ComposeResult, Screen +from textual.binding import Binding +from textual.containers import Horizontal, Vertical +from textual.widgets import ( + Footer, Header, Label, Input, Button, + ListView, ListItem, RichLog, DataTable, Static, +) +from textual.reactive import reactive + +from . import events as bus +from config import WATCHED_CHANNELS, SESSION_NAME +import config + +log = logging.getLogger(__name__) + +# ─── Colour maps ────────────────────────────────────────────────────────────── + +SEV_COLOUR = { + "CRITICAL": "bold red", + "HIGH": "bold orange1", + "MEDIUM": "bold yellow", + "LOW": "bold green", +} +SEV_EMOJI = { + "CRITICAL": "🔴", "HIGH": "🟠", "MEDIUM": "🟡", "LOW": "🟢", +} +DL_COLOUR = { + "queued": "dim white", + "downloading": "bold cyan", + "done_tdl": "bold green", + "done_tel": "green", + "failed": "bold red", +} +DL_ICON = { + "queued": "⏳", "downloading": "⬇️ ", + "done_tdl": "✅", "done_tel": "✅", "failed": "❌", +} + + +def _now() -> str: + return datetime.now(timezone.utc).strftime("%H:%M:%S") + + +# ─── Download panel ─────────────────────────────────────────────────────────── + +class DownloadPanel(Vertical): + """ + Left panel — two sub-logs stacked vertically: + • top: tdl raw output (stripped ANSI), scrolling + • bottom: our own structured status entries + """ + + DEFAULT_CSS = """ + DownloadPanel { + border: solid $accent; + height: 100%; + width: 1fr; + } + DownloadPanel Label.panel-title { + background: $accent; + color: $text; + 
padding: 0 1; + width: 100%; + } + DownloadPanel Label.sub-title { + background: $surface; + color: $text-muted; + padding: 0 1; + width: 100%; + } + DownloadPanel RichLog { + padding: 0 1; + } + #tdl-out { + height: 1fr; + border-bottom: dashed $accent-darken-2; + } + #dl-log { + height: 1fr; + } + """ + + def compose(self) -> ComposeResult: + yield Label("📥 Downloads", classes="panel-title") + yield Label(" tdl output", classes="sub-title") + yield RichLog(highlight=False, markup=False, wrap=True, id="tdl-out") + yield Label(" status", classes="sub-title") + yield RichLog(highlight=True, markup=True, wrap=True, id="dl-log") + + def tdl_line(self, line: str) -> None: + self.query_one("#tdl-out", RichLog).write(line) + + def queued(self, filename: str, size_mb: float, source: str, + password: str | None) -> None: + pw = f" 🔑 [dim]{password}[/dim]" if password else "" + self.query_one("#dl-log", RichLog).write( + f"[{DL_COLOUR['queued']}]{DL_ICON['queued']} {_now()} " + f"{filename}[/{DL_COLOUR['queued']}]" + f" [dim]{size_mb:.1f} MB {source}[/dim]{pw}" + ) + + def status(self, filename: str, state: str, via: str = "") -> None: + colour = DL_COLOUR.get(state, "white") + icon = DL_ICON.get(state, "•") + suffix = f" [dim]via {via}[/dim]" if via else "" + self.query_one("#dl-log", RichLog).write( + f" [dim]↳[/dim] [{colour}]{icon} {filename}[/{colour}]{suffix}" + ) + + def clear_logs(self) -> None: + self.query_one("#tdl-out", RichLog).clear() + self.query_one("#dl-log", RichLog).clear() + + +# ─── Hits panel ─────────────────────────────────────────────────────────────── + +class HitsPanel(Vertical): + """Right panel — scrollable color-coded hit log with live counter badge.""" + + hit_count: reactive[int] = reactive(0) + + DEFAULT_CSS = """ + HitsPanel { + border: solid $error; + height: 100%; + width: 1fr; + } + HitsPanel Label.panel-title { + background: $error; + color: $text; + padding: 0 1; + width: 100%; + } + HitsPanel RichLog { + height: 1fr; + padding: 0 1; + 
} + """ + + def compose(self) -> ComposeResult: + yield Label("🎯 Hits", classes="panel-title") + yield RichLog(highlight=True, markup=True, wrap=True, id="hits-log") + + def watch_hit_count(self, count: int) -> None: + self.query_one(".panel-title", Label).update(f"🎯 Hits [{count}]") + + def add_hit(self, severity: str, raw: str, source: str, + filename: str, reasons: list[str]) -> None: + colour = SEV_COLOUR.get(severity, "white") + emoji = SEV_EMOJI.get(severity, "⚪") + self.query_one("#hits-log", RichLog).write( + f"{emoji} [{colour}]{severity}[/{colour}] [dim]{_now()}[/dim]\n" + f" [bold]{raw}[/bold]\n" + f" [dim]↳ {' | '.join(reasons)}[/dim]\n" + f" [dim]📁 {filename} 📢 {source}[/dim]" + ) + self.hit_count += 1 + + def clear_log(self) -> None: + self.query_one("#hits-log", RichLog).clear() + self.hit_count = 0 + + +# ─── Stats panel ────────────────────────────────────────────────────────────── + +class StatsPanel(Horizontal): + """ + Slim bar — shows live DB stats, refreshed every 10 s. + Also refreshed immediately whenever a new hit arrives. 
+ """ + + DEFAULT_CSS = """ + StatsPanel { + border: solid $primary-darken-2; + height: 3; + width: 100%; + padding: 0 1; + background: $surface; + } + StatsPanel Static { + width: 1fr; + content-align: center middle; + color: $text-muted; + } + StatsPanel Static.stat-critical { color: red; } + StatsPanel Static.stat-high { color: orange; } + StatsPanel Static.stat-medium { color: yellow; } + StatsPanel Static.stat-low { color: green; } + """ + + def compose(self) -> ComposeResult: + yield Static("📊 DB Stats", id="stat-label") + yield Static("🔴 —", classes="stat-critical", id="stat-critical") + yield Static("🟠 —", classes="stat-high", id="stat-high") + yield Static("🟡 —", classes="stat-medium", id="stat-medium") + yield Static("🟢 —", classes="stat-low", id="stat-low") + yield Static("total: —", id="stat-total") + yield Static("unique: —", id="stat-unique") + yield Static("dupes: —", id="stat-dupes") + yield Static("sources: —", id="stat-sources") + + def on_mount(self) -> None: + self.set_interval(10, self.refresh_stats) + self.refresh_stats() + + def refresh_stats(self) -> None: + try: + from utils.database import stats + s = stats() + self.query_one("#stat-critical", Static).update(f"🔴 {s['critical']}") + self.query_one("#stat-high", Static).update(f"🟠 {s['high']}") + self.query_one("#stat-medium", Static).update(f"🟡 {s['medium']}") + self.query_one("#stat-low", Static).update(f"🟢 {s['low']}") + self.query_one("#stat-total", Static).update(f"total: {s['total']}") + self.query_one("#stat-unique", Static).update(f"unique: {s['unique']}") + self.query_one("#stat-dupes", Static).update(f"dupes: {s['duplicates']}") + self.query_one("#stat-sources", Static).update(f"sources: {s['sources']}") + except Exception: + pass # DB not ready yet on first paint + + +# ─── Channel panel ──────────────────────────────────────────────────────────── + +class ChannelPanel(Vertical): + """ + Bottom panel — live-editable channel list. 
+ + Changes are applied immediately (Telethon handlers are re-registered). + To make them permanent, edit config.py's WATCHED_CHANNELS manually. + """ + + DEFAULT_CSS = """ + ChannelPanel { + border: solid $warning; + height: 14; + width: 100%; + } + ChannelPanel Label.panel-title { + background: $warning; + color: $text; + padding: 0 1; + width: 100%; + } + ChannelPanel Horizontal.controls { + height: 3; + padding: 0 1; + } + ChannelPanel Horizontal.controls Input { + width: 1fr; + } + ChannelPanel Horizontal.controls Button { + width: auto; + margin-left: 1; + } + ChannelPanel Horizontal.list-row { + height: 1fr; + } + ChannelPanel Horizontal.list-row ListView { + width: 1fr; + height: 100%; + } + ChannelPanel Horizontal.list-row Button { + width: 14; + margin: 0 1; + } + """ + + def __init__(self, initial_channels: list, **kwargs): + super().__init__(**kwargs) + self._channels: list[str | int] = list(initial_channels) + + def compose(self) -> ComposeResult: + yield Label( + "📡 Channels — changes apply immediately | edit config.py to persist", + classes="panel-title", + ) + with Horizontal(classes="controls"): + yield Input(placeholder="channel username or -100xxxxxxxxxx", id="ch-input") + yield Button("➕ Add", id="ch-add", variant="success") + with Horizontal(classes="list-row"): + yield ListView(id="ch-list") + yield Button("🗑 Remove", id="ch-remove", variant="error") + + def on_mount(self) -> None: + self._refresh_list() + + def _refresh_list(self) -> None: + lv = self.query_one("#ch-list", ListView) + lv.clear() + for ch in self._channels: + lv.append(ListItem(Label(str(ch)))) + + def on_button_pressed(self, event: Button.Pressed) -> None: + if event.button.id == "ch-add": + inp = self.query_one("#ch-input", Input) + raw = inp.value.strip() + if not raw: + return + channel: str | int = int(raw) if raw.lstrip("-").isdigit() else raw + if channel not in self._channels: + self._channels.append(channel) + self._refresh_list() + 
bus.post(bus.EvChannelAdded(channel=channel)) + self.app.notify(f"Added: {channel}", severity="information") + inp.value = "" + + elif event.button.id == "ch-remove": + lv = self.query_one("#ch-list", ListView) + idx = lv.index + if idx is None or not (0 <= idx < len(self._channels)): + self.app.notify("Select a channel first", severity="warning") + return + removed = self._channels.pop(idx) + self._refresh_list() + bus.post(bus.EvChannelRemoved(channel=removed)) + self.app.notify(f"Removed: {removed}", severity="warning") + + @property + def channels(self) -> list[str | int]: + return list(self._channels) + + +# ─── Search screen ──────────────────────────────────────────────────────────── + +class SearchScreen(Screen): + """Full-text search across the hits database (url, username, raw line).""" + + BINDINGS = [Binding("escape", "dismiss", "Back")] + + DEFAULT_CSS = """ + SearchScreen { background: $background; } + SearchScreen Label.screen-title { + background: $primary; + color: $text; + padding: 0 1; + width: 100%; + } + SearchScreen #search-bar { + height: 3; + padding: 0 1; + } + SearchScreen #search-bar Input { width: 1fr; } + SearchScreen #search-bar Button { width: 14; margin-left: 1; } + SearchScreen #result-count { padding: 0 1; color: $text-muted; } + SearchScreen #results-table { height: 1fr; margin: 0 1 1 1; } + """ + + def compose(self) -> ComposeResult: + yield Header() + yield Label("🔍 Search Hits Database", classes="screen-title") + with Horizontal(id="search-bar"): + yield Input(placeholder="keyword, domain, username, IP…", id="search-input") + yield Button("Search", id="search-btn", variant="primary") + yield Label("Enter a keyword and press Search or ↵", id="result-count") + yield DataTable(id="results-table", zebra_stripes=True, cursor_type="row") + yield Footer() + + def on_mount(self) -> None: + t = self.query_one("#results-table", DataTable) + t.add_columns("Sev", "Time", "URL", "Username", "Password", "Source", "File") + 
self.query_one("#search-input", Input).focus() + + def on_button_pressed(self, event: Button.Pressed) -> None: + if event.button.id == "search-btn": + self._run_search() + + def on_input_submitted(self, event: Input.Submitted) -> None: + if event.input.id == "search-input": + self._run_search() + + def _run_search(self) -> None: + kw = self.query_one("#search-input", Input).value.strip() + if not kw: + return + try: + from utils.database import search + rows = search(kw) + except Exception as e: + self.app.notify(f"Search error: {e}", severity="error") + return + + t = self.query_one("#results-table", DataTable) + t.clear() + for row in rows: + emoji = SEV_EMOJI.get(row["severity"], "⚪") + t.add_row( + f"{emoji} {row['severity']}", + row["timestamp"], + (row["url"] or "")[:45], + (row["username"] or "")[:30], + (row["password"] or "")[:20], + (row["source"] or "")[:20], + (row["filename"] or "")[:25], + ) + self.query_one("#result-count", Label).update( + f" {len(rows)} result(s) for '{kw}'" + ) + + def action_dismiss(self) -> None: + self.app.pop_screen() + + +# ─── Hits DB viewer screen ──────────────────────────────────────────────────── + +class HitsDBScreen(Screen): + """ + Table viewer for DB hits (most recent 50, or one severity at a time). + Toolbar buttons + number-key bindings filter by severity.
+ """ + + BINDINGS = [ + Binding("escape", "dismiss", "Back"), + Binding("r", "load_recent", "Recent 50"), + Binding("1", "filter_critical", "CRITICAL"), + Binding("2", "filter_high", "HIGH"), + Binding("3", "filter_medium", "MEDIUM"), + Binding("4", "filter_low", "LOW"), + ] + + DEFAULT_CSS = """ + HitsDBScreen { background: $background; } + HitsDBScreen Label.screen-title { + background: $error; + color: $text; + padding: 0 1; + width: 100%; + } + HitsDBScreen #toolbar { + height: 3; + padding: 0 1; + background: $surface; + } + HitsDBScreen #toolbar Button { margin-right: 1; width: auto; } + HitsDBScreen #db-status { padding: 0 1; color: $text-muted; } + HitsDBScreen #hits-db-table { height: 1fr; margin: 0 1 1 1; } + """ + + def compose(self) -> ComposeResult: + yield Header() + yield Label("📋 Hits Database Viewer", classes="screen-title") + with Horizontal(id="toolbar"): + yield Button("Recent 50", id="btn-recent", variant="default") + yield Button("🔴 CRITICAL", id="btn-critical", variant="error") + yield Button("🟠 HIGH", id="btn-high", variant="warning") + yield Button("🟡 MEDIUM", id="btn-medium", variant="default") + yield Button("🟢 LOW", id="btn-low", variant="success") + yield Label("", id="db-status") + yield DataTable(id="hits-db-table", zebra_stripes=True, cursor_type="row") + yield Footer() + + def on_mount(self) -> None: + t = self.query_one("#hits-db-table", DataTable) + t.add_columns("ID", "Sev", "Timestamp", "URL", "Username", "Source", "Status") + self._load_recent() + + def on_button_pressed(self, event: Button.Pressed) -> None: + dispatch = { + "btn-recent": self._load_recent, + "btn-critical": lambda: self._load_severity("CRITICAL"), + "btn-high": lambda: self._load_severity("HIGH"), + "btn-medium": lambda: self._load_severity("MEDIUM"), + "btn-low": lambda: self._load_severity("LOW"), + } + fn = dispatch.get(event.button.id) + if fn: + fn() + + def _populate(self, rows, label: str) -> None: + t = self.query_one("#hits-db-table", DataTable) + 
t.clear() + for row in rows: + emoji = SEV_EMOJI.get(row["severity"], "⚪") + status = "dup" if row["seen_before"] else "new" + t.add_row( + str(row["id"]), + f"{emoji} {row['severity']}", + row["timestamp"], + (row["url"] or "")[:45], + (row["username"] or "")[:30], + (row["source"] or "")[:20], + status, + ) + self.query_one("#db-status", Label).update( + f" {len(rows)} row(s) — {label}" + ) + + def _load_recent(self) -> None: + try: + from utils.database import recent + self._populate(recent(50), "most recent 50") + except Exception as e: + self.app.notify(f"DB error: {e}", severity="error") + + def _load_severity(self, sev: str) -> None: + try: + from utils.database import by_severity + self._populate(by_severity(sev), f"severity = {sev} (unique only)") + except Exception as e: + self.app.notify(f"DB error: {e}", severity="error") + + def action_dismiss(self) : self.app.pop_screen() + def action_load_recent(self) : self._load_recent() + def action_filter_critical(self): self._load_severity("CRITICAL") + def action_filter_high(self) : self._load_severity("HIGH") + def action_filter_medium(self) : self._load_severity("MEDIUM") + def action_filter_low(self) : self._load_severity("LOW") + + +# ─── Keywords screen ────────────────────────────────────────────────────────── + +class KeywordsScreen(Screen): + """ + Live-edit TARGET_KEYWORDS regex patterns. + + Additions / removals apply immediately: + • config.TARGET_KEYWORDS is mutated in place + • scorer's domain caches are rebuilt + • The bot handler recompiles patterns on the next message automatically + + Changes are NOT written back to config.py — a notice banner says so. 
+ """ + + BINDINGS = [Binding("escape", "dismiss", "Back")] + + DEFAULT_CSS = """ + KeywordsScreen { background: $background; } + KeywordsScreen Label.screen-title { + background: $success; + color: $text; + padding: 0 1; + width: 100%; + } + KeywordsScreen Label.notice { + background: $warning; + color: $text; + padding: 0 1; + width: 100%; + } + KeywordsScreen #kw-controls { + height: 3; + padding: 0 1; + } + KeywordsScreen #kw-controls Input { width: 1fr; } + KeywordsScreen #kw-controls Button { width: auto; margin-left: 1; } + KeywordsScreen #kw-list-row { + height: 1fr; + padding: 0 1; + } + KeywordsScreen #kw-list { + width: 1fr; + height: 100%; + border: solid $primary; + } + KeywordsScreen #kw-list-row Button { width: 16; margin-left: 1; } + """ + + def compose(self) -> ComposeResult: + yield Header() + yield Label("🔑 Keyword / Pattern Editor", classes="screen-title") + yield Label( + "⚠ Changes are in-memory only — copy patterns to config.py to persist across restarts.", + classes="notice", + ) + with Horizontal(id="kw-controls"): + yield Input( + placeholder="regex e.g. 
@myorg\\.com or 192\\.168\\.10\\.", + id="kw-input", + ) + yield Button("➕ Add", id="kw-add", variant="success") + with Horizontal(id="kw-list-row"): + yield ListView(id="kw-list") + yield Button("🗑 Remove", id="kw-remove", variant="error") + yield Footer() + + def on_mount(self) -> None: + self._refresh_list() + self.query_one("#kw-input", Input).focus() + + def _refresh_list(self) -> None: + lv = self.query_one("#kw-list", ListView) + lv.clear() + for kw in config.TARGET_KEYWORDS: + lv.append(ListItem(Label(kw))) + + def on_button_pressed(self, event: Button.Pressed) -> None: + if event.button.id == "kw-add": + inp = self.query_one("#kw-input", Input) + raw = inp.value.strip() + if not raw: + return + import re + try: + re.compile(raw, re.IGNORECASE) + except re.error as e: + self.app.notify(f"Invalid regex: {e}", severity="error") + return + if raw not in config.TARGET_KEYWORDS: + config.TARGET_KEYWORDS.append(raw) + self._rebuild_scorer() + self._refresh_list() + self.app.notify(f"Pattern added: {raw}", severity="information") + inp.value = "" + + elif event.button.id == "kw-remove": + lv = self.query_one("#kw-list", ListView) + idx = lv.index + if idx is None or not (0 <= idx < len(config.TARGET_KEYWORDS)): + self.app.notify("Select a pattern first", severity="warning") + return + removed = config.TARGET_KEYWORDS.pop(idx) + self._rebuild_scorer() + self._refresh_list() + self.app.notify(f"Pattern removed: {removed}", severity="warning") + + def on_input_submitted(self, event: Input.Submitted) -> None: + if event.input.id == "kw-input": + # Simulate Add button press + self.on_button_pressed( + Button.Pressed(self.query_one("#kw-add", Button)) + ) + + def _rebuild_scorer(self) -> None: + """Rebuild scorer's cached domain patterns after a keyword change.""" + try: + import utils.scorer as scorer # scorer lives in the utils package + scorer.EMPLOYEE_DOMAINS = scorer._build_employee_domains() + scorer.ORG_DOMAINS = scorer._build_org_domains() + except Exception as e: + log.warning(f"Could not rebuild scorer caches: 
{e}") + bus.post(bus.EvStatus( + f"Keywords updated — {len(config.TARGET_KEYWORDS)} pattern(s) active" + )) + + def action_dismiss(self) -> None: + self.app.pop_screen() + + +# ─── Main application ───────────────────────────────────────────────────────── + +class MonitorApp(App): + + CSS = """ + Screen { layout: vertical; } + #top-row { layout: horizontal; height: 1fr; } + """ + + BINDINGS = [ + Binding("q", "quit", "Quit", priority=True), + Binding("ctrl+c", "quit", "Quit", priority=True), + Binding("s", "push_search", "Search DB"), + Binding("h", "push_hits_db", "Hits DB"), + Binding("k", "push_keywords", "Keywords"), + Binding("c", "clear_logs", "Clear Logs"), + Binding("r", "refresh_stats", "Refresh Stats"), + ] + + TITLE = "ULP Credential Monitor" + SUB_TITLE = f"session: {SESSION_NAME}" + + def __init__(self): + super().__init__() + self._live_channels: list[str | int] = list(WATCHED_CHANNELS) + # Set by _drain_bus (Textual loop), read by _bot_main (bot loop) + # via call_soon_threadsafe so the asyncio.Event is set on the right loop. + self._bot_loop_channel_event: asyncio.Event | None = None + self._bot_loop: asyncio.AbstractEventLoop | None = None + + def compose(self) -> ComposeResult: + yield Header() + with Horizontal(id="top-row"): + yield DownloadPanel(id="dl-panel") + yield HitsPanel(id="hits-panel") + yield StatsPanel(id="stats-panel") + yield ChannelPanel(initial_channels=WATCHED_CHANNELS, id="ch-panel") + yield Footer() + + def on_mount(self) -> None: + # The bot backend runs in its own thread with its own asyncio event + # loop, completely isolated from Textual. Telethon spawns background + # tasks via asyncio.ensure_future() and calls connect() which returns + # only after its receiver loop is scheduled — both of these deadlock + # inside Textual's managed loop. Running in a dedicated thread + # sidesteps all of that. + # + # Communication uses a thread-safe queue.Queue (see tui_events.py). + # The TUI polls it every 100 ms via set_interval(). 
+ bus.init_bus() + self._bot_thread = threading.Thread( + target=self._run_bot_thread, + name="bot-thread", + daemon=True, + ) + self._bot_thread.start() + # Poll the thread-safe queue and dispatch to widgets + self.set_interval(0.1, self._drain_bus) + + # ── Screen navigation ───────────────────────────────────────────────────── + + def action_push_search(self) : self.push_screen(SearchScreen()) + def action_push_hits_db(self) : self.push_screen(HitsDBScreen()) + def action_push_keywords(self) : self.push_screen(KeywordsScreen()) + + def action_clear_logs(self) -> None: + self.query_one("#dl-panel", DownloadPanel).clear_logs() + self.query_one("#hits-panel", HitsPanel).clear_log() + self.notify("Logs cleared", severity="information") + + def action_refresh_stats(self) -> None: + self.query_one("#stats-panel", StatsPanel).refresh_stats() + self.notify("Stats refreshed", severity="information") + + # ── Event bus consumer ──────────────────────────────────────────────────── + + def _signal_channel_changed(self) -> None: + """Thread-safely set the channel-change event on the bot loop.""" + ev = self._bot_loop_channel_event + loop = self._bot_loop + if ev is not None and loop is not None and loop.is_running(): + loop.call_soon_threadsafe(ev.set) + + # ── Bus drain (runs on Textual's loop via set_interval) ────────────────── + + def _drain_bus(self) -> None: + """ + Called every 100 ms by set_interval(). Drains all pending events + from the thread-safe queue and dispatches them to the right widget. + Runs on Textual's event loop — safe to call widget methods directly. 
+ """ + q = bus.get_bus() + if q is None: + return + + try: + dl = self.query_one("#dl-panel", DownloadPanel) + hit = self.query_one("#hits-panel", HitsPanel) + stats = self.query_one("#stats-panel", StatsPanel) + except Exception: + return # widgets not mounted yet + + # Drain everything currently in the queue in one pass + while True: + try: + ev = q.get_nowait() + except queue.Empty: + break + + try: + if isinstance(ev, bus.EvTdlOutput): + dl.tdl_line(ev.line) + + elif isinstance(ev, bus.EvDownloadQueued): + dl.queued(ev.filename, ev.size_mb, ev.source, ev.password) + + elif isinstance(ev, bus.EvDownloadStarted): + dl.status(ev.filename, "downloading") + + elif isinstance(ev, bus.EvDownloadDone): + dl.status(ev.filename, + "done_tdl" if ev.via == "tdl" else "done_tel", + via=ev.via) + + elif isinstance(ev, bus.EvDownloadFailed): + dl.status(ev.filename, "failed") + + elif isinstance(ev, bus.EvHit): + hit.add_hit(ev.severity, ev.raw, ev.source, ev.filename, ev.reasons) + stats.refresh_stats() + + elif isinstance(ev, bus.EvChannelAdded): + if ev.channel not in self._live_channels: + self._live_channels.append(ev.channel) + self._signal_channel_changed() + + elif isinstance(ev, bus.EvChannelRemoved): + self._live_channels = [ + c for c in self._live_channels if c != ev.channel + ] + self._signal_channel_changed() + + elif isinstance(ev, bus.EvStatus): + log.info(f"[bus] EvStatus: {ev.text}") + severity = {"error": "error", "warning": "warning"}.get( + ev.level, "information" + ) + self.notify(ev.text, severity=severity) + + else: + log.warning(f"[bus] Unknown event type: {type(ev)}") + + except Exception as e: + log.error(f"[bus] Dispatch error for {type(ev).__name__}: {e}", exc_info=True) + + # ── Bot thread ──────────────────────────────────────────────────────────── + + def _run_bot_thread(self) -> None: + """ + Entry point for the bot background thread. + Creates a brand-new asyncio event loop for Telethon to use, + completely isolated from Textual's loop. 
+ """ + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + self._bot_loop = loop + try: + loop.run_until_complete(self._bot_main()) + except Exception as e: + log.error(f"[bot-thread] Unhandled exception: {e}", exc_info=True) + bus.post(bus.EvStatus(f"Bot thread crashed: {e}", level="error")) + finally: + loop.close() + + async def _bot_main(self) -> None: + """ + Full bot backend — runs inside the bot thread's own event loop. + Telethon is free to schedule background tasks without interfering + with Textual's loop. + """ + import shutil as _shutil + from telethon import TelegramClient + from telethon import events as tl_events + from core.processor import compile_patterns + from core.notifier import send_status + from core.scraper import backfill_all, warm_entity_cache + from utils.database import init_db + + init_db() + patterns = compile_patterns(config.TARGET_KEYWORDS) + + bus.post(bus.EvStatus( + f"Starting — {len(config.WATCHED_CHANNELS)} channel(s), " + f"{len(patterns)} pattern(s)" + )) + + user_client = TelegramClient( + config.SESSION_NAME, config.API_ID, config.API_HASH, + connection_retries=5, auto_reconnect=True, request_retries=5, + ) + bot_client = TelegramClient( + "bot_session", config.API_ID, config.API_HASH, + ) + + try: + log.info("[bot] Connecting bot_client...") + await bot_client.connect() + log.info("[bot] bot_client connected, authorizing...") + if not await bot_client.is_user_authorized(): + await bot_client.sign_in(bot_token=config.BOT_TOKEN) + log.info("[bot] bot_client ready") + + log.info("[bot] Connecting user_client...") + await user_client.connect() + log.info("[bot] user_client connected, checking auth...") + if not await user_client.is_user_authorized(): + log.error("[bot] user_client not authorized — run: python main.py --no-tui") + bus.post(bus.EvStatus( + "Not authorized — run --no-tui once to complete login", + level="error", + )) + return + log.info("[bot] user_client ready") + + try: + me = await 
user_client.get_me() + bus.post(bus.EvStatus(f"Connected as {me.first_name} (@{me.username})")) + await send_status( + bot_client, + f"✅ *Monitor started* (TUI)\n" + f"User: `{me.first_name}`\n" + f"Channels: `{len(config.WATCHED_CHANNELS)}`\n" + f"Patterns: `{len(patterns)}`", + ) + + await warm_entity_cache(user_client) + + _current_handler = [None] + + def _make_handler(channels): + if _current_handler[0] is not None: + user_client.remove_event_handler(_current_handler[0]) + + from core.bot_downloader import ( + handle_bot_download_message, + has_download_button, + extract_password, + ) + from core.scraper import handle_message + from telethon.tl.types import MessageMediaDocument + + _channel_passwords: dict[int, str] = {} + + @user_client.on(tl_events.NewMessage(chats=channels)) + async def _handler(event): + msg = event.message + try: + source = event.chat.username or str(event.chat_id) + except Exception: + source = str(event.chat_id) + + chat_id = event.chat_id + msg_pw = extract_password(msg) + if msg_pw: + _channel_passwords[chat_id] = msg_pw + password = msg_pw or _channel_passwords.get(chat_id) + + live_patterns = compile_patterns(config.TARGET_KEYWORDS) + + if msg.media and isinstance(msg.media, MessageMediaDocument): + await handle_message( + user_client, bot_client, msg, + source, live_patterns, password=password, + ) + elif msg.buttons and has_download_button(msg): + await handle_bot_download_message( + user_client, bot_client, msg, + source, live_patterns, password=password, + ) + + _current_handler[0] = _handler + log.info(f"[bot] Handler registered for {len(channels)} channel(s)") + bus.post(bus.EvStatus(f"Watching {len(channels)} channel(s)")) + + # Channel-change event — lives on this (bot) loop. + # Textual signals it thread-safely via _signal_channel_changed(). 
+ _ch_changed = asyncio.Event() + self._bot_loop_channel_event = _ch_changed + + _make_handler(list(self._live_channels)) + bus.post(bus.EvStatus("Live listener active")) + + await backfill_all(user_client, bot_client, patterns) + bus.post(bus.EvStatus("Backfill complete — monitoring live")) + + async def _watch_channels(): + while True: + await _ch_changed.wait() + _ch_changed.clear() + new_channels = list(self._live_channels) + log.info(f"[bot] Channel list changed → {new_channels}") + _make_handler(new_channels) + + await asyncio.gather( + user_client.run_until_disconnected(), + _watch_channels(), + ) + + except Exception as e: + bus.post(bus.EvStatus(f"Bot error: {e}", level="error")) + log.error("[bot] Bot main crashed", exc_info=True) + finally: + log.info("[bot] Disconnecting clients...") + await user_client.disconnect() + await bot_client.disconnect() + + except Exception as e: + bus.post(bus.EvStatus(f"Bot connect error: {e}", level="error")) + log.error("[bot] Connection failed", exc_info=True) + finally: + if config.TEMP_DIR.exists(): + _shutil.rmtree(config.TEMP_DIR, ignore_errors=True) + config.TEMP_DIR.mkdir(exist_ok=True) + + def action_quit(self) -> None: + self.exit() + + +# ─── Entry point ────────────────────────────────────────────────────────────── + +def run_tui() -> None: + # Do NOT call bus.init_bus() here — the Queue must be created inside + # Textual's event loop (see MonitorApp.on_mount). Calling it here + # would bind the Queue to the outer loop which is discarded when + # App.run() creates a new one. + MonitorApp().run() diff --git a/tui/events.md b/tui/events.md new file mode 100644 index 0000000..674117e --- /dev/null +++ b/tui/events.md @@ -0,0 +1,66 @@ +# tui/events.py + +Thread-safe event bus between the bot backend thread and the Textual TUI. +The bot thread calls `post()`. The TUI drains the queue every 100ms via `_drain_bus()`. 
+ +## Public API + +```python +from tui import events as bus # from core/ and tui/app.py +from tui.events import post, init_bus, get_bus, tui_active +``` + +### `init_bus() -> queue.Queue` +Creates the `queue.Queue`. Called inside `MonitorApp.on_mount()` — **must run on Textual's event loop**, not before `App.run()`. + +### `post(event: Any) -> None` +Fire-and-forget from any thread. Silently drops if bus not initialised. +Uses `queue.Queue.put_nowait()` — never blocks. + +### `get_bus() -> queue.Queue | None` +Returns the queue for the TUI consumer to drain. + +### `tui_active: bool` +Set to `True` by `init_bus()`. Checked by `core/tdl_downloader.py` to decide whether to pipe tdl output or inherit the terminal. + +--- + +## Event types + +| Class | Fields | Posted by | Consumed by | +|-------|--------|-----------|-------------| +| `EvDownloadQueued` | `batch_id, filename, size_mb, source, password` | `tdl_downloader`, `scraper` | `DownloadPanel.queued()` | +| `EvDownloadStarted` | `batch_id, filename` | `tdl_downloader`, `scraper` | `DownloadPanel.status("downloading")` | +| `EvDownloadDone` | `batch_id, filename, via` | `tdl_downloader`, `scraper` | `DownloadPanel.status("done_tdl"\|"done_tel")` | +| `EvDownloadFailed` | `batch_id, filename, reason` | `tdl_downloader`, `scraper` | `DownloadPanel.status("failed")` | +| `EvTdlOutput` | `line` | `tdl_downloader._relay()` | `DownloadPanel.tdl_line()` | +| `EvHit` | `severity, raw, source, filename, reasons` | `notifier.notify()` | `HitsPanel.add_hit()` + `StatsPanel.refresh_stats()` | +| `EvChannelAdded` | `channel` | `ChannelPanel.on_button_pressed()` | `_drain_bus` → `_signal_channel_changed()` | +| `EvChannelRemoved` | `channel` | `ChannelPanel.on_button_pressed()` | `_drain_bus` → `_signal_channel_changed()` | +| `EvStatus` | `text, level` | everywhere | `MonitorApp.notify()` toast | + +`level` on `EvStatus`: `"info"` (default) · `"warning"` · `"error"` + +--- + +## Threading model + +``` +Bot thread (own asyncio 
loop) + └─ bus.post(event) ← queue.Queue.put_nowait() [thread-safe] + ↓ + queue.Queue + ↓ +Textual thread (Textual's loop) + └─ _drain_bus() [set_interval 100ms] + └─ q.get_nowait() loop + └─ dispatch to widgets [safe, same thread as Textual] +``` + +Channel changes flow the other way: +``` +_drain_bus sees EvChannelAdded/Removed + → _signal_channel_changed() + → loop.call_soon_threadsafe(asyncio.Event.set) + → bot thread's _watch_channels() wakes +``` diff --git a/tui/events.py b/tui/events.py new file mode 100644 index 0000000..ff0cd27 --- /dev/null +++ b/tui/events.py @@ -0,0 +1,114 @@ +""" +tui/events.py — Thread-safe event bus between the bot backend and the TUI. + +The bot backend runs in a dedicated thread with its own asyncio event loop +(completely isolated from Textual's loop). Events are posted via a standard +queue.Queue (thread-safe), and the TUI consumer polls it from Textual's loop +every 100 ms via set_interval(), draining it with get_nowait(). + +post() is safe to call from any thread or any asyncio loop. +""" + +import queue +import threading +from dataclasses import dataclass, field +from typing import Any + +# Thread-safe queue — works across the bot thread and Textual's thread. +_queue: queue.Queue | None = None +_queue_lock = threading.Lock() + +# Set to True when the TUI is running so tdl pipes output instead of +# writing directly to the terminal. +tui_active: bool = False + + +def init_bus() -> queue.Queue: + """Call once from MonitorApp.on_mount() to create the queue.""" + global _queue, tui_active + _queue = queue.Queue() + tui_active = True + return _queue + + +def get_bus() -> queue.Queue | None: + return _queue + + +def post(event: Any) -> None: + """Fire-and-forget from any thread. 
Silently drops if bus not up.""" + if _queue is not None: + try: + _queue.put_nowait(event) + except queue.Full: + pass + + +# ─── Event types ────────────────────────────────────────────────────────────── + +@dataclass +class EvDownloadQueued: + """A file has been accepted and is waiting for tdl.""" + batch_id: str + filename: str + size_mb: float + source: str + password: str | None + + +@dataclass +class EvDownloadStarted: + """tdl has begun transferring this file.""" + batch_id: str + filename: str + + +@dataclass +class EvDownloadDone: + """File fully downloaded (tdl or Telethon fallback).""" + batch_id: str + filename: str + via: str # "tdl" | "telethon" + + +@dataclass +class EvDownloadFailed: + """All download attempts failed.""" + batch_id: str + filename: str + reason: str + + +@dataclass +class EvTdlOutput: + """A line of output from tdl's stdout/stderr (TUI mode only).""" + line: str + + +@dataclass +class EvHit: + """A scored credential hit to display in the hits panel.""" + severity: str + raw: str + source: str + filename: str + reasons: list[str] = field(default_factory=list) + + +@dataclass +class EvChannelAdded: + """A channel was added to the live watch list.""" + channel: str | int + + +@dataclass +class EvChannelRemoved: + """A channel was removed from the live watch list.""" + channel: str | int + + +@dataclass +class EvStatus: + """Generic one-line status message (startup, errors, etc.).""" + text: str + level: str = "info" # "info" | "warning" | "error" diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..8c6b899 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1 @@ +"""utils — pure logic modules with no Telegram dependencies.""" diff --git a/utils/cache.md b/utils/cache.md new file mode 100644 index 0000000..91ebaec --- /dev/null +++ b/utils/cache.md @@ -0,0 +1,32 @@ +# utils/cache.py + +Tracks already-processed Telegram document IDs to avoid redownloading. 
+Persists to `data/cache.json` as a JSON array of integers. + +## Public API + +```python +from utils.cache import is_seen, mark_seen +``` + +### `is_seen(file_id: int) -> bool` +Returns `True` if this document ID has been processed before. +Loads from disk on every call (safe for multi-process, slightly slow for hot loops — not an issue given download cadence). + +### `mark_seen(file_id: int) -> None` +Adds `file_id` to the cache and persists to disk. + +--- + +## Storage + +- **File:** `data/cache.json` +- **Format:** JSON array of integers — `[123456789, 987654321, ...]` +- **No expiry** — grows indefinitely. Safe to delete to re-process all files. + +--- + +## Notes + +- `is_seen` + `mark_seen` are called in `core/scraper.py` after a successful download+process cycle, not before — so a file that fails mid-process will be retried on next run. +- Not thread-safe (load/modify/save is not atomic). Acceptable because downloads are sequential within the bot loop. diff --git a/utils/cache.py b/utils/cache.py new file mode 100644 index 0000000..8182eeb --- /dev/null +++ b/utils/cache.py @@ -0,0 +1,38 @@ +""" +cache.py — Tracks already-processed file IDs to avoid redownloading. +Persists to a simple JSON file on disk. 
+""" + +import json +import logging +from pathlib import Path + +log = logging.getLogger(__name__) + +CACHE_FILE = Path("./data/cache.json") + + +def _load() -> set: + if not CACHE_FILE.exists(): + return set() + try: + with open(CACHE_FILE, "r") as f: + return set(json.load(f)) + except Exception: + return set() + + +def _save(seen: set) -> None: + with open(CACHE_FILE, "w") as f: + json.dump(list(seen), f) + + +def is_seen(file_id: int) -> bool: + return file_id in _load() + + +def mark_seen(file_id: int) -> None: + seen = _load() + seen.add(file_id) + _save(seen) + log.debug(f" Cached file ID {file_id}") diff --git a/utils/database.md b/utils/database.md new file mode 100644 index 0000000..92909f8 --- /dev/null +++ b/utils/database.md @@ -0,0 +1,89 @@ +# utils/database.py + +SQLite persistence layer for credential hits. +DB file: `data/hits.db` + +## Public API + +```python +from utils.database import init_db, insert_hits, search, recent, by_severity, stats +``` + +### Setup + +#### `init_db() -> None` +Creates `hits` table and indexes if they don't exist. Call once on startup. +Safe to call multiple times (idempotent). + +--- + +### Writing + +#### `insert_hits(scored_hits, source, filename, seen_before=False) -> int` +Inserts a list of `ScoredHit` objects. Returns row count inserted. + +```python +insert_hits(new_hits, source="channelname", filename="combo.zip") +insert_hits(dupe_hits, source="channelname", filename="combo.zip", seen_before=True) +``` + +--- + +### Querying + +#### `search(keyword: str) -> list[sqlite3.Row]` +Full-text search across `url`, `username`, `raw`. Returns rows sorted by score DESC, timestamp DESC. + +#### `recent(limit: int = 50) -> list[sqlite3.Row]` +Most recent hits, newest first. + +#### `by_severity(severity: str) -> list[sqlite3.Row]` +All unique (non-duplicate) hits at a given severity, newest first. 
+`severity` must be one of: `"CRITICAL"`, `"HIGH"`, `"MEDIUM"`, `"LOW"` + +#### `stats() -> dict` +Returns summary counters: +```python +{ + "total": int, # all rows + "unique": int, # seen_before=0 + "duplicates": int, # seen_before=1 + "critical": int, # unique CRITICAL + "high": int, + "medium": int, + "low": int, + "sources": int, # distinct source channels + "top_source": {"source": str, "cnt": int} | None, +} +``` + +--- + +## Schema + +```sql +hits ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT, + username TEXT, + password TEXT, + raw TEXT NOT NULL, -- full original credential line + source TEXT, -- channel username or ID + filename TEXT, -- downloaded file name + timestamp TEXT NOT NULL, -- "YYYY-MM-DD HH:MM:SS UTC" + severity TEXT NOT NULL, -- CRITICAL/HIGH/MEDIUM/LOW + score INTEGER NOT NULL, -- 40/30/20/10 + reasons TEXT, -- pipe-separated reason strings + seen_before INTEGER NOT NULL -- 0=new, 1=duplicate +) +``` + +Indexes: `url`, `username`, `source`, `timestamp`, `severity`. + +--- + +## Notes + +- Each query opens and closes its own connection via the `_connect()` context manager. +- `conn.row_factory = sqlite3.Row` — rows support both index and column-name access. +- Transactions: commit on success, rollback on exception. diff --git a/utils/database.py b/utils/database.py new file mode 100644 index 0000000..589acb7 --- /dev/null +++ b/utils/database.py @@ -0,0 +1,171 @@ +""" +database.py — SQLite storage for credential hits. 
+ +Schema: + hits table: + - id auto-increment primary key + - url the target URL from the credential line + - username extracted username/email + - password extracted password + - raw the full original line + - source channel/bot it came from + - filename the file it was found in + - timestamp UTC time of discovery + - severity CRITICAL / HIGH / MEDIUM / LOW + - score numeric score (higher = worse) + - reasons pipe-separated list of scoring reasons + - seen_before whether this was a duplicate (for stats) +""" + +import sqlite3 +import logging +from datetime import datetime, timezone +from pathlib import Path +from contextlib import contextmanager + +log = logging.getLogger(__name__) + +DB_FILE = Path("./data/hits.db") + + +# ─── Setup ──────────────────────────────────────────────────────────────────── + +@contextmanager +def _connect(): + conn = sqlite3.connect(DB_FILE) + conn.row_factory = sqlite3.Row + try: + yield conn + conn.commit() + except Exception: + conn.rollback() + raise + finally: + conn.close() + + +def init_db() -> None: + """Create tables if they don't exist yet.""" + with _connect() as conn: + conn.execute(""" + CREATE TABLE IF NOT EXISTS hits ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT, + username TEXT, + password TEXT, + raw TEXT NOT NULL, + source TEXT, + filename TEXT, + timestamp TEXT NOT NULL, + severity TEXT NOT NULL DEFAULT 'LOW', + score INTEGER NOT NULL DEFAULT 10, + reasons TEXT, + seen_before INTEGER NOT NULL DEFAULT 0 + ) + """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_url ON hits(url)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_username ON hits(username)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_source ON hits(source)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_timestamp ON hits(timestamp)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_severity ON hits(severity)") + log.info(f"Database ready: {DB_FILE}") + + +# ─── Writing ───────────────────────────────────────────────────────────────── + +def 
insert_hits( + scored_hits: list, + source: str, + filename: str, + seen_before: bool = False, +) -> int: + """ + Insert a list of ScoredHit objects into the database. + Returns the number of rows inserted. + """ + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC") + rows = [] + for h in scored_hits: + rows.append(( + h.url, + h.username, + h.password, + h.raw, + source, + filename, + timestamp, + h.severity, + h.score, + " | ".join(h.reasons), + 1 if seen_before else 0, + )) + + with _connect() as conn: + conn.executemany(""" + INSERT INTO hits + (url, username, password, raw, source, filename, timestamp, + severity, score, reasons, seen_before) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, rows) + + log.info(f" DB: inserted {len(rows)} row(s) from {filename}") + return len(rows) + + +# ─── Querying ───────────────────────────────────────────────────────────────── + +def search(keyword: str) -> list[sqlite3.Row]: + """Search hits by keyword across url, username, raw fields.""" + with _connect() as conn: + return conn.execute(""" + SELECT * FROM hits + WHERE url LIKE ? OR username LIKE ? OR raw LIKE ? + ORDER BY score DESC, timestamp DESC + """, (f"%{keyword}%",) * 3).fetchall() + + +def recent(limit: int = 50) -> list[sqlite3.Row]: + """Return the most recent hits.""" + with _connect() as conn: + return conn.execute(""" + SELECT * FROM hits + ORDER BY timestamp DESC + LIMIT ? + """, (limit,)).fetchall() + + +def by_severity(severity: str) -> list[sqlite3.Row]: + """Return all hits of a given severity level.""" + with _connect() as conn: + return conn.execute(""" + SELECT * FROM hits + WHERE severity = ? 
AND seen_before = 0 + ORDER BY timestamp DESC + """, (severity,)).fetchall() + + +def stats() -> dict: + """Return summary statistics.""" + with _connect() as conn: + total = conn.execute("SELECT COUNT(*) FROM hits").fetchone()[0] + unique = conn.execute("SELECT COUNT(*) FROM hits WHERE seen_before=0").fetchone()[0] + critical = conn.execute("SELECT COUNT(*) FROM hits WHERE severity='CRITICAL' AND seen_before=0").fetchone()[0] + high = conn.execute("SELECT COUNT(*) FROM hits WHERE severity='HIGH' AND seen_before=0").fetchone()[0] + medium = conn.execute("SELECT COUNT(*) FROM hits WHERE severity='MEDIUM' AND seen_before=0").fetchone()[0] + low = conn.execute("SELECT COUNT(*) FROM hits WHERE severity='LOW' AND seen_before=0").fetchone()[0] + sources = conn.execute("SELECT COUNT(DISTINCT source) FROM hits").fetchone()[0] + top_source = conn.execute(""" + SELECT source, COUNT(*) as cnt FROM hits + GROUP BY source ORDER BY cnt DESC LIMIT 1 + """).fetchone() + return { + "total": total, + "unique": unique, + "duplicates": total - unique, + "critical": critical, + "high": high, + "medium": medium, + "low": low, + "sources": sources, + "top_source": dict(top_source) if top_source else None, + } diff --git a/utils/scorer.md b/utils/scorer.md new file mode 100644 index 0000000..50df937 --- /dev/null +++ b/utils/scorer.md @@ -0,0 +1,87 @@ +# utils/scorer.py + +Severity scoring for credential hits. No Telegram deps. Pure logic. + +## Public API + +```python +from utils.scorer import score_hit, score_hits, summarize, ScoredHit +from utils.scorer import CRITICAL, HIGH, MEDIUM, LOW, SEVERITY_EMOJI, SEVERITY_SCORES +``` + +### `score_hit(line: str) -> ScoredHit` +Score a single raw credential line. Parses ULP format (`url:user:pass`), runs all checks, returns a `ScoredHit`. + +### `score_hits(lines: list[str]) -> list[ScoredHit]` +Score a list of lines. Returns sorted descending by score. 
+ +### `summarize(scored: list[ScoredHit]) -> dict` +Returns `{CRITICAL: n, HIGH: n, MEDIUM: n, LOW: n}`. + +--- + +## ScoredHit dataclass + +| Field | Type | Description | +|-------|------|-------------| +| `raw` | str | Original credential line | +| `severity` | str | CRITICAL / HIGH / MEDIUM / LOW | +| `score` | int | 40 / 30 / 20 / 10 | +| `reasons` | list[str] | Human-readable match reasons | +| `url` | str\|None | Parsed URL field | +| `username` | str\|None | Parsed username/email field | +| `password` | str\|None | Parsed password field | +| `.emoji` | property | 🔴🟠🟡🟢 | + +--- + +## Scoring rules (highest match wins) + +| Severity | Triggers | +|----------|----------| +| CRITICAL | Employee email domain after `@` in username/line · Privileged service URL (admin, vpn, ssh, rdp, gitlab, jira…) | +| HIGH | Internal service URL (intranet, erp, crm, sso, owa, sharepoint…) | +| MEDIUM | Client-facing URL (app, patient, booking, helpdesk…) | +| LOW | Org domain appears anywhere in line (baseline) | + +Check 6 (no severity change): flags weak passwords ≤6 chars or common strings. + +--- + +## Employee domain matching + +Keywords in `config.TARGET_KEYWORDS` containing `@` become employee patterns. +Pattern: `@<domain>(?:[^a-zA-Z0-9.\-]|$)` — requires literal `@` before the domain. +**`user@gmail.com` on a URL containing `myorg.cl` does NOT trigger CRITICAL.** + +Keywords without `@` go only to `ORG_DOMAINS` (LOW baseline). + +--- + +## ULP line parser (`ULP_PATTERN`) + +Separators: `:` `;` `,` `|` `\t` (any of these between the three fields). + +The URL field handles two common stealer-log complications: + +1. **`://` not treated as separator** — the optional scheme prefix `(?:https?|ftp)://` is consumed before the character-class match, so `https://` never gets split at the colon. + +2. **Port + path consumed into the URL** — the optional group `(?::\d+/[^\s:;,|\t]*)` absorbs `:port/path` when the port is pure digits immediately followed by `/`. 
This correctly handles `http://host:8085/path/:user:pass` but intentionally skips patterns like `:24145487-8` (RUT number — hyphen after digits, no `/`). + +**Known limitation:** A bare port with no path (e.g. `https://host:8080:user:pass`) will mis-parse `8080` as the username. This is not observed in practice — stealer logs always include at least a trailing `/`. + +--- + +## Module-level globals (rebuilt on import + via KeywordsScreen) + +| Name | Type | Description | +|------|------|-------------| +| `EMPLOYEE_DOMAINS` | `list[tuple[str, Pattern]]` | `(domain_str, anchored_pattern)` for `@`-keywords | +| `ORG_DOMAINS` | `list[Pattern]` | Plain domain patterns for all keywords | + +To rebuild after editing `config.TARGET_KEYWORDS` at runtime: +```python +import utils.scorer as scorer +scorer.EMPLOYEE_DOMAINS = scorer._build_employee_domains() +scorer.ORG_DOMAINS = scorer._build_org_domains() +``` diff --git a/utils/scorer.py b/utils/scorer.py new file mode 100644 index 0000000..9f1a3a8 --- /dev/null +++ b/utils/scorer.py @@ -0,0 +1,273 @@ +""" +scorer.py — Severity scoring for credential hits. + +Scoring logic (highest match wins): + + CRITICAL — Employee credentials (internal email domain) + e.g. jdoe@yourclinic.cl:password + — Admin/privileged service URLs + e.g. admin., vpn., ssh., rdp., gitlab., jira. + + HIGH — Internal-facing services + e.g. intranet., erp., crm., portal., citrix. + — Password manager or SSO hits + — Any credential where username looks like an employee email + + MEDIUM — Client-facing portals + e.g. app., patient., client., booking. 
+ — Domain match on a non-privileged service + + LOW — Generic domain keyword match + — No URL parsed, just a raw domain mention + +Each scored hit gets a dict with: + - severity: CRITICAL / HIGH / MEDIUM / LOW + - score: int (higher = worse) + - reasons: list of human-readable reasons + - raw: original line +""" + +import re +import logging +from dataclasses import dataclass, field +from config import TARGET_KEYWORDS + +log = logging.getLogger(__name__) + + +# ─── Severity levels ───────────────────────────────────────────────────────── + +CRITICAL = "CRITICAL" +HIGH = "HIGH" +MEDIUM = "MEDIUM" +LOW = "LOW" + +SEVERITY_SCORES = { + CRITICAL: 40, + HIGH: 30, + MEDIUM: 20, + LOW: 10, +} + +SEVERITY_EMOJI = { + CRITICAL: "🔴", + HIGH: "🟠", + MEDIUM: "🟡", + LOW: "🟢", +} + + +# ─── Pattern banks ─────────────────────────────────────────────────────────── + +# Subdomains/services that indicate privileged access +CRITICAL_SERVICES = re.compile( + r"(?:^|https?://|\.)" + r"(admin|vpn|ssh|rdp|ftp|sftp|gitlab|github|bitbucket|jenkins|" + r"jira|confluence|grafana|kibana|sentry|vault|bastion|jump|" + r"firewall|router|switch|proxy|ldap|ad\.|activedirectory|" + r"exchange|mail\.)", + re.IGNORECASE +) + +HIGH_SERVICES = re.compile( + r"(?:^|https?://|\.)" + r"(intranet|erp|crm|portal|citrix|workspace|webmail|owa|" + r"sharepoint|teams|slack|zoom|meet|sso|login|auth|oauth|" + r"accounts?|dashboard|internal|corp|staff|hr|payroll|" + r"finance|accounting)", + re.IGNORECASE +) + +MEDIUM_SERVICES = re.compile( + r"(?:^|https?://|\.)" + r"(app|patient|client|customer|booking|appointment|" + r"reserva|cita|paciente|user|member|registro|signup|" + r"support|helpdesk|ticket)", + re.IGNORECASE +) + +# Looks like a corporate email (user@domain) +EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9._%+\-]+@([a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})") + +# ULP line parser +# Separator set: colon, semicolon, comma, pipe, tab. 
# URL field: optional scheme (http/https/ftp) consumed first so '://' is never
# mistaken for a separator; then an optional port group ':\d+/' absorbs port+path
# (port is digits immediately followed by '/') so 'http://host:88/path:user:pass'
# yields url='http://host:88/path', not url='http'.
# The named groups (url / username / password) are read back by score_hit()
# via m.group(...), so the group names must stay in sync with that function.
ULP_PATTERN = re.compile(
    r"^(?P<url>"
    r"(?:(?:https?|ftp)://)?[^\s:;,|\t]+"   # optional scheme + host/path
    r"(?::\d+/[^\s:;,|\t]*)?"               # optional :port/path (port = digits then /)
    r")"
    r"(?:[:;,|\t])"
    r"(?P<username>[^\s:;,|\t]+)"
    r"(?:[:;,|\t])"
    r"(?P<password>.+)$"
)

# Passwords that score_hit() flags as "common" (compared case-insensitively).
_COMMON_PASSWORDS = frozenset(
    {"123456", "password", "qwerty", "111111", "admin", "letmein"}
)


# ─── Derived from config ──────────────────────────────────────────────────────

def _kw_to_domain(kw: str) -> str:
    """
    Strip regex syntax from a keyword to get a plain domain string.

    Removes every '@', turns the escaped dot '\\.' into '.', strips leading and
    trailing '^' / '$' anchors, then drops any leading dots.
    """
    return kw.replace(r"@", "").replace(r"\.", ".").strip("^$").lstrip(".")


def _build_employee_domains() -> list[tuple[str, re.Pattern]]:
    """
    Build the employee email patterns from TARGET_KEYWORDS.

    Only keywords that contain '@' are treated as employee email domains.
    Each pattern is anchored at a literal '@' and must be followed by a
    character that cannot continue a domain (or by end of string), so a URL
    that merely contains the org domain never causes a false CRITICAL on an
    unrelated email like @gmail.com.

    Returns a list of (domain_str, compiled_pattern) tuples.
    """
    patterns = []
    for kw in TARGET_KEYWORDS:
        if "@" not in kw:
            continue
        domain = _kw_to_domain(kw)
        if not domain:
            # Keyword was nothing but regex syntax; nothing to match on.
            continue
        pat = re.compile(
            r"@" + re.escape(domain) + r"(?:[^a-zA-Z0-9.\-]|$)",
            re.IGNORECASE,
        )
        patterns.append((domain, pat))
    return patterns


EMPLOYEE_DOMAINS = _build_employee_domains()


def _build_org_domains() -> list[re.Pattern]:
    """
    Build plain, case-insensitive domain patterns from every keyword.

    These back the LOW baseline check: the org domain appearing anywhere
    in the line. Keywords that reduce to an empty string are skipped.
    """
    domains = (_kw_to_domain(kw) for kw in TARGET_KEYWORDS)
    return [re.compile(re.escape(d), re.IGNORECASE) for d in domains if d]


ORG_DOMAINS = _build_org_domains()


# ─── Scoring logic ────────────────────────────────────────────────────────────

@dataclass
class ScoredHit:
    """One scored credential line, plus its parsed ULP fields when available."""

    raw: str                                # the stripped input line
    severity: str                           # CRITICAL / HIGH / MEDIUM / LOW
    score: int                              # SEVERITY_SCORES[severity]
    reasons: list[str] = field(default_factory=list)  # human-readable reasons
    url: str | None = None                  # None when ULP_PATTERN did not match
    username: str | None = None
    password: str | None = None

    @property
    def emoji(self) -> str:
        """Return the emoji for this severity, or a white circle if unknown."""
        return SEVERITY_EMOJI.get(self.severity, "⚪")

    def __str__(self) -> str:
        return f"{self.emoji} [{self.severity}] {self.raw}"


def score_hit(line: str) -> ScoredHit:
    """
    Score a single credential line.

    The line is stripped, parsed with ULP_PATTERN when possible, and run
    through the checks below; the highest severity that fired wins and LOW
    is the default.

    Returns a ScoredHit with severity, score, reasons and the parsed
    url / username / password (each None when the line did not parse).
    """
    line = line.strip()
    reasons = []
    scores = []

    # Parse ULP fields if possible
    url = username = password = None
    m = ULP_PATTERN.match(line)
    if m:
        url = m.group("url")
        username = m.group("username")
        password = m.group("password")

    # ── Check 1: Employee email domain in username or line ───────────────
    # Try the parsed username first, then fall back to the full line.
    # Either way the pattern requires a literal '@' before the domain, so a
    # URL containing the org domain never triggers CRITICAL on its own.
    for domain_str, pat in EMPLOYEE_DOMAINS:
        if pat.search(username or "") or pat.search(line):
            scores.append(CRITICAL)
            reasons.append(f"Employee email domain: {domain_str}")
            break

    # ── Checks 2-4: URL service class (privileged / internal / client) ───
    if url and CRITICAL_SERVICES.search(url):
        scores.append(CRITICAL)
        reasons.append(f"Critical service URL: {url}")

    if url and HIGH_SERVICES.search(url):
        scores.append(HIGH)
        reasons.append(f"High-value internal service: {url}")

    if url and MEDIUM_SERVICES.search(url):
        scores.append(MEDIUM)
        reasons.append(f"Client-facing service: {url}")

    # ── Check 5: Generic org domain match (baseline) ─────────────────────
    # NOTE(review): the baseline reason is recorded only when no stronger
    # check has fired — confirm this nesting against tests/test_scorer.py.
    for pattern in ORG_DOMAINS:
        if pattern.search(line):
            if not scores:
                scores.append(LOW)
                reasons.append("Org domain match in line")
            break

    # ── Check 6: Weak password flag (adds reasons, never changes severity) ─
    if password:
        if len(password) <= 6:
            reasons.append(f"⚠ Weak password ({len(password)} chars)")
        if password.lower() in _COMMON_PASSWORDS:
            reasons.append(f"⚠ Common password: {password}")

    # ── Resolve final severity: first level, worst to mildest, that fired ─
    final_severity = next(
        (level for level in (CRITICAL, HIGH, MEDIUM, LOW) if level in scores),
        LOW,
    )

    if not reasons:
        reasons.append("Pattern match")

    return ScoredHit(
        raw=line,
        severity=final_severity,
        score=SEVERITY_SCORES[final_severity],
        reasons=reasons,
        url=url,
        username=username,
        password=password,
    )


def score_hits(lines: list[str]) -> list[ScoredHit]:
    """
    Score a list of credential lines.

    Returns the hits sorted by score descending; hits with equal scores keep
    their input order (the sort is stable).
    """
    return sorted(
        (score_hit(line) for line in lines),
        key=lambda h: h.score,
        reverse=True,
    )


def summarize(scored: list[ScoredHit]) -> dict:
    """
    Count hits by severity level.

    Returns a dict with all four levels always present (zero when absent).
    """
    summary = {CRITICAL: 0, HIGH: 0, MEDIUM: 0, LOW: 0}
    for h in scored:
        summary[h.severity] += 1
    return summary