Initial commit: ULPgrammer
- Core Telegram monitoring pipeline (scraper, processor, notifier, downloaders) - Textual TUI frontend with thread-safe event bus - SQLite persistence, severity scoring, dedup cache - Fixed ULP parser: handles https:// truncation, port+path URLs, semicolon separator - Test suite: 88 tests across scorer, cache, database, processor
This commit is contained in:
25
.claudeignore
Normal file
25
.claudeignore
Normal file
@@ -0,0 +1,25 @@
|
||||
# Sessions
|
||||
*.session
|
||||
*.session-journal
|
||||
bot_session*
|
||||
|
||||
# Data — keep the folder, ignore contents
|
||||
data/hits.db
|
||||
data/hits.txt
|
||||
data/hits.csv
|
||||
data/dedup.json
|
||||
data/cache.json
|
||||
data/tmp/
|
||||
data/logs/
|
||||
!data/.gitkeep
|
||||
|
||||
# Env
|
||||
.env
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
.venv/
|
||||
venv/
|
||||
|
||||
22
.env.example
Normal file
22
.env.example
Normal file
@@ -0,0 +1,22 @@
|
||||
# ─── Telegram API credentials ──────────────────────────────────────────────
|
||||
# Get these from https://my.telegram.org → API development tools
|
||||
API_ID=12345678
|
||||
API_HASH=your_api_hash_here
|
||||
|
||||
# ─── Bot credentials ────────────────────────────────────────────────────────
|
||||
# Create a bot via @BotFather and paste the token here
|
||||
BOT_TOKEN=123456789:ABCdefGHIjklMNOpqrSTUvwxYZ
|
||||
|
||||
# ─── Alert destination ──────────────────────────────────────────────────────
|
||||
# Chat ID to send hit notifications to (your personal ID or a group)
|
||||
# Tip: message @userinfobot on Telegram to get your ID
|
||||
NOTIFY_CHAT_ID=987654321
|
||||
|
||||
# ─── Session name (just a filename, no extension needed) ────────────────────
|
||||
SESSION_NAME=monitor_session
|
||||
|
||||
# ─── tdl (fast Go downloader) — optional but strongly recommended ───────────
|
||||
# Install: https://github.com/iyear/tdl
|
||||
# After installing, run once: tdl login -n <SESSION_NAME>
|
||||
# SESSION_NAME above is shared between Telethon and tdl — no double login needed.
|
||||
# If tdl is not on PATH the bot falls back to Telethon automatically.
|
||||
28
.gitignore
vendored
Normal file
28
.gitignore
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
# Sessions
|
||||
*.session
|
||||
*.session-journal
|
||||
bot_session*
|
||||
|
||||
# Data — keep the folder, ignore contents
|
||||
data/hits.db
|
||||
data/hits.txt
|
||||
data/hits.csv
|
||||
data/dedup.json
|
||||
data/cache.json
|
||||
data/tmp/
|
||||
data/logs/
|
||||
!data/.gitkeep
|
||||
|
||||
# Env
|
||||
.env
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
.venv/
|
||||
venv/
|
||||
|
||||
# Claude things
|
||||
CLAUDE.md
|
||||
.claude/*
|
||||
182
QUICK_REF.md
Normal file
182
QUICK_REF.md
Normal file
@@ -0,0 +1,182 @@
|
||||
# ULP Monitor — Quick Reference
|
||||
|
||||
> For Claude Code: read the per-file `.md` alongside each `.py` before editing.
|
||||
> Full docs in `README.md`.
|
||||
|
||||
---
|
||||
|
||||
## Project layout
|
||||
|
||||
```
|
||||
ulp_monitor/
|
||||
├── main.py Entry point (--no-tui flag for CLI mode)
|
||||
├── config.py All settings — edit this for keywords, channels, paths
|
||||
│
|
||||
├── core/ Telegram I/O pipeline (all async, Telethon-dependent)
|
||||
│ ├── scraper.py Live listener + backfill orchestration
|
||||
│ ├── tdl_downloader.py tdl subprocess wrapper + Telethon fallback
|
||||
│ ├── bot_downloader.py Inline "DOWNLOAD" button click flow
|
||||
│ ├── processor.py Archive extraction (.zip/.7z/.rar) + line search
|
||||
│ └── notifier.py Scoring → dedup → DB → hits.txt/csv → Telegram alert
|
||||
│
|
||||
├── utils/ Pure logic, no Telegram deps, no async
|
||||
│ ├── scorer.py Severity scoring (CRITICAL/HIGH/MEDIUM/LOW)
|
||||
│ ├── cache.py Seen file-ID dedup (data/cache.json)
|
||||
│ └── database.py SQLite read/write (data/hits.db)
|
||||
│
|
||||
├── tui/ Textual TUI — runs in main thread
|
||||
│ ├── app.py MonitorApp + all screens + bot thread launcher
|
||||
│ └── events.py Thread-safe queue.Queue event bus
|
||||
│
|
||||
└── data/ Runtime output — gitignored
|
||||
├── hits.db
|
||||
├── hits.txt
|
||||
├── hits.csv
|
||||
├── cache.json
|
||||
├── dedup.json
|
||||
└── logs/monitor.log
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Data flow
|
||||
|
||||
```
|
||||
Telegram channel
|
||||
└─ new message with file / download button
|
||||
│
|
||||
├─ core/scraper.py detects + guards (size, extension, dedup)
|
||||
│
|
||||
├─ core/tdl_downloader.py downloads via tdl (batched)
|
||||
│ └─ core/scraper.py Telethon fallback if tdl fails
|
||||
│
|
||||
├─ core/bot_downloader.py handles inline button → bot reply flow
|
||||
│
|
||||
├─ core/processor.py extracts archive → searches .txt line by line
|
||||
│
|
||||
└─ core/notifier.py scores → deduplicates → persists → alerts
|
||||
├─ utils/scorer.py
|
||||
├─ utils/database.py
|
||||
└─ tui/events.py posts EvHit to TUI
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Threading architecture
|
||||
|
||||
```
|
||||
main thread (Textual's event loop)
|
||||
├─ MonitorApp.on_mount()
|
||||
│ ├─ bus.init_bus() creates queue.Queue on THIS loop
|
||||
│ ├─ threading.Thread → _run_bot_thread()
|
||||
│ └─ set_interval(0.1, _drain_bus)
|
||||
│
|
||||
├─ _drain_bus() [every 100ms]
|
||||
│ └─ queue.Queue.get_nowait() → dispatch to widgets
|
||||
│
|
||||
└─ Textual widgets, screens, keybindings
|
||||
|
||||
bot thread (own asyncio event loop)
|
||||
└─ _bot_main()
|
||||
├─ bot_client.connect() + sign_in()
|
||||
├─ user_client.connect() + is_user_authorized()
|
||||
├─ warm_entity_cache()
|
||||
├─ _make_handler() → NewMessage handler registered
|
||||
├─ backfill_all()
|
||||
└─ run_until_disconnected() + _watch_channels() [gathered]
|
||||
|
||||
cross-thread communication
|
||||
bot → TUI: bus.post(event) [queue.Queue.put_nowait, always safe]
|
||||
TUI → bot: loop.call_soon_threadsafe() [asyncio.Event.set for channel changes]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Config quick reference (`config.py`)
|
||||
|
||||
| Setting | Type | Description |
|
||||
|---------|------|-------------|
|
||||
| `API_ID` | int | From my.telegram.org |
|
||||
| `API_HASH` | str | From my.telegram.org |
|
||||
| `BOT_TOKEN` | str | From @BotFather |
|
||||
| `NOTIFY_CHAT_ID` | int | Your Telegram user/group ID |
|
||||
| `SESSION_NAME` | str | Session file name (default: `monitor_session`) |
|
||||
| `TARGET_KEYWORDS` | list[str] | Regex patterns. `@`-prefixed → employee email (CRITICAL). Plain → domain match (LOW) |
|
||||
| `WATCHED_CHANNELS` | list[str\|int] | Usernames or `-100xxxxxxxxxx` IDs |
|
||||
| `BACKFILL_LIMIT` | int | Messages to scan per channel on startup (0 = off) |
|
||||
| `ALLOWED_EXTENSIONS` | set | `.txt .zip .7z .rar` |
|
||||
| `MAX_FILE_SIZE` | int | Bytes (default 4 GB) |
|
||||
| `ARCHIVE_PASSWORDS` | list[bytes] | Tried in order on locked archives |
|
||||
| `TDL_NAMESPACE` | str\|None | `tdl login -n <name>` namespace |
|
||||
| `TDL_THREADS` | int | Chunk workers per file (`-t`) |
|
||||
| `TDL_PERFILE` | int | Concurrent files per tdl call (`-l`) |
|
||||
| `TDL_AMOUNT` | int | Messages per batch |
|
||||
| `TEMP_DIR` | Path | `data/tmp` |
|
||||
| `HITS_FILE` | Path | `data/hits.txt` |
|
||||
| `LOG_FILE` | Path | `data/logs/monitor.log` |
|
||||
|
||||
---
|
||||
|
||||
## Severity scoring summary
|
||||
|
||||
| Severity | Score | Triggers |
|
||||
|----------|-------|----------|
|
||||
| CRITICAL | 40 | Employee email (`@myorg.cl` in username) · Privileged service URL (admin, vpn, rdp, gitlab…) |
|
||||
| HIGH | 30 | Internal service URL (intranet, erp, sso, owa…) |
|
||||
| MEDIUM | 20 | Client-facing URL (app, booking, helpdesk…) |
|
||||
| LOW | 10 | Org domain appears anywhere in line |
|
||||
|
||||
`@`-keyword rule: pattern requires literal `@` before domain — `user@gmail.com` on a URL containing `myorg.cl` does **not** trigger CRITICAL.
|
||||
|
||||
---
|
||||
|
||||
## TUI keybindings
|
||||
|
||||
| Key | Action | Screen |
|
||||
|-----|--------|--------|
|
||||
| `s` | Search hits DB | → SearchScreen |
|
||||
| `h` | Browse hits by severity | → HitsDBScreen |
|
||||
| `k` | Edit keyword patterns live | → KeywordsScreen |
|
||||
| `c` | Clear download + hits logs | main |
|
||||
| `r` | Force-refresh stats bar | main |
|
||||
| `q` / `ctrl+c` | Quit | any |
|
||||
| `Escape` | Back to main | sub-screens |
|
||||
| `1`/`2`/`3`/`4` | Filter CRITICAL/HIGH/MEDIUM/LOW | HitsDBScreen |
|
||||
| `r` | Load recent 50 | HitsDBScreen |
|
||||
|
||||
---
|
||||
|
||||
## Per-file reference docs
|
||||
|
||||
| File | Reference |
|
||||
|------|-----------|
|
||||
| `utils/scorer.py` | `utils/scorer.md` |
|
||||
| `utils/cache.py` | `utils/cache.md` |
|
||||
| `utils/database.py` | `utils/database.md` |
|
||||
| `core/scraper.py` | `core/scraper.md` |
|
||||
| `core/processor.py` | `core/processor.md` |
|
||||
| `core/notifier.py` | `core/notifier.md` |
|
||||
| `core/tdl_downloader.py` | `core/tdl_downloader.md` |
|
||||
| `core/bot_downloader.py` | `core/bot_downloader.md` |
|
||||
| `tui/app.py` | `tui/app.md` |
|
||||
| `tui/events.py` | `tui/events.md` |
|
||||
|
||||
---
|
||||
|
||||
## Common tasks
|
||||
|
||||
**Add a new keyword at runtime:** open the TUI → press `k` → add pattern → active immediately. Copy to `config.TARGET_KEYWORDS` to persist.
|
||||
|
||||
**Add a channel at runtime:** type username or numeric ID in the Channels panel → ➕ Add. Handler re-registers immediately. Edit `config.WATCHED_CHANNELS` to persist.
|
||||
|
||||
**Query hits from CLI:**
|
||||
```bash
|
||||
sqlite3 data/hits.db "SELECT severity, username, url FROM hits WHERE seen_before=0 ORDER BY score DESC LIMIT 20"
|
||||
```
|
||||
|
||||
**Re-process all files** (wipe cache):
|
||||
```bash
|
||||
rm data/cache.json data/dedup.json
|
||||
```
|
||||
|
||||
**Check what's happening:** `tail -f data/logs/monitor.log`
|
||||
146
README.md
Normal file
146
README.md
Normal file
@@ -0,0 +1,146 @@
|
||||
# ULP Credential Monitor
|
||||
|
||||
A Telegram-based credential exposure monitor for threat intelligence teams.
|
||||
Watches channels for combo/stealer log files and alerts you when your
|
||||
organization's credentials appear in them.
|
||||
|
||||
---
|
||||
|
||||
## How it works
|
||||
|
||||
```
|
||||
User session (Telethon)
|
||||
└─ watches N channels
|
||||
└─ detects file attachments (.txt, .zip, .7z, .rar)
|
||||
└─ downloads → extracts → searches line by line
|
||||
└─ hit? → writes to data/ + sends bot alert
|
||||
└─ no hit? → deletes file, moves on
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Project structure
|
||||
|
||||
```
|
||||
ulp_monitor/
|
||||
├── main.py Entry point
|
||||
├── config.py All settings (keywords, channels, paths)
|
||||
│
|
||||
├── core/ Telegram I/O pipeline
|
||||
│ ├── scraper.py Live listener + backfill
|
||||
│ ├── tdl_downloader.py Fast downloads via tdl (Go MTProto)
|
||||
│ ├── bot_downloader.py Inline button / bot-dispatched file flows
|
||||
│ ├── processor.py Archive extraction + line-by-line search
|
||||
│ └── notifier.py hits.txt / hits.csv writer + bot alerts
|
||||
│
|
||||
├── utils/ Pure logic — no Telegram dependencies
|
||||
│ ├── scorer.py Hit severity scoring
|
||||
│ ├── cache.py Seen-file deduplication
|
||||
│ └── database.py SQLite persistence layer
|
||||
│
|
||||
├── tui/ Textual TUI frontend
|
||||
│ ├── app.py MonitorApp + all Screen classes
|
||||
│ └── events.py Thread-safe event bus (bot thread → TUI)
|
||||
│
|
||||
└── data/ Runtime-generated (gitignored)
|
||||
├── hits.db SQLite database
|
||||
├── hits.txt Human-readable hit log
|
||||
├── hits.csv CSV hit log (importable into Excel / pandas)
|
||||
├── dedup.json Deduplication hashes
|
||||
├── cache.json Seen file-ID cache
|
||||
└── logs/monitor.log
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Setup
|
||||
|
||||
### 1. Get Telegram API credentials
|
||||
- Go to https://my.telegram.org → *API development tools*
|
||||
- Create an app → note your `api_id` and `api_hash`
|
||||
|
||||
### 2. Create a bot
|
||||
- Message [@BotFather](https://t.me/BotFather) → `/newbot`
|
||||
- Start a chat with your new bot before running
|
||||
|
||||
### 3. Get your chat ID
|
||||
- Message [@userinfobot](https://t.me/userinfobot)
|
||||
|
||||
### 4. Configure
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
# fill in API_ID, API_HASH, BOT_TOKEN, NOTIFY_CHAT_ID
|
||||
```
|
||||
|
||||
Open `config.py` and set:
|
||||
|
||||
- **`TARGET_KEYWORDS`** — your org's domains and email patterns.
|
||||
Keywords with `@` (e.g. `r"@myorg\.cl"`) are **employee email domains** → CRITICAL.
|
||||
Keywords without `@` are plain domain matches → LOW baseline.
|
||||
- **`WATCHED_CHANNELS`** — channel usernames or numeric IDs
|
||||
- **`BACKFILL_LIMIT`** — past messages to scan per channel on startup
|
||||
|
||||
### 5. Install dependencies
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
# rarfile needs the unrar binary:
|
||||
# Ubuntu/Debian: sudo apt install unrar
|
||||
# macOS: brew install rar
|
||||
```
|
||||
|
||||
### 5a. Install tdl (strongly recommended)
|
||||
|
||||
```bash
|
||||
curl -sSL https://raw.githubusercontent.com/iyear/tdl/main/scripts/install.sh | bash
|
||||
tdl login -n monitor_session
|
||||
```
|
||||
|
||||
### 6. First run — complete Telegram auth
|
||||
|
||||
```bash
|
||||
python main.py --no-tui
|
||||
# follow the phone + 2FA prompts once
|
||||
```
|
||||
|
||||
### 7. Run
|
||||
|
||||
```bash
|
||||
python main.py # TUI mode (recommended)
|
||||
python main.py --no-tui # plain CLI
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## TUI keybindings
|
||||
|
||||
| Key | Action |
|
||||
|-----|--------|
|
||||
| `s` | Search hits database |
|
||||
| `h` | Browse hits by severity |
|
||||
| `k` | Edit keyword patterns live |
|
||||
| `c` | Clear logs |
|
||||
| `r` | Refresh stats |
|
||||
| `q` | Quit |
|
||||
|
||||
---
|
||||
|
||||
## Output
|
||||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `data/hits.db` | SQLite — all hits with scores, severity, dedup flag |
|
||||
| `data/hits.txt` | Human-readable grouped log |
|
||||
| `data/hits.csv` | CSV — easy to pull into Excel / pandas |
|
||||
| `data/logs/monitor.log` | Full run log |
|
||||
|
||||
Telegram alerts fire for CRITICAL / HIGH / MEDIUM only. LOW is stored silently.
|
||||
|
||||
---
|
||||
|
||||
## Notes
|
||||
|
||||
- **Session files are sensitive** — equivalent to a logged-in account. Gitignored, never share.
|
||||
- **Flood limits** — `FloodWaitError` is handled automatically.
|
||||
- **Private channels** — your user account must already be a member.
|
||||
100
config.py
Normal file
100
config.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""
|
||||
config.py — Loads Telegram credentials from .env and defines all other settings (keywords, channels, paths, tdl options)
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# -- Timeouts --
|
||||
BOT_REPLY_TIMEOUT = 10
|
||||
|
||||
# ─── Telegram credentials ────────────────────────────────────────────────────
|
||||
API_ID = int(os.environ["API_ID"])
|
||||
API_HASH = os.environ["API_HASH"]
|
||||
BOT_TOKEN = os.environ["BOT_TOKEN"]
|
||||
NOTIFY_CHAT_ID = int(os.environ["NOTIFY_CHAT_ID"])
|
||||
SESSION_NAME = os.getenv("SESSION_NAME", "monitor_session")
|
||||
|
||||
# ─── Target keywords ─────────────────────────────────────────────────────────
|
||||
# Add your org's domains, email patterns, IP ranges, known usernames, etc.
|
||||
# All patterns are case-insensitive regex.
|
||||
TARGET_KEYWORDS: list[str] = [
|
||||
r"sanatorioaleman\.cl",
|
||||
r"@sanatorioaleman\.cl",
|
||||
# r"192\.168\.10\.", # internal IP range example
|
||||
# r"specificuser", # known internal usernames
|
||||
]
|
||||
|
||||
# ─── Channels to watch ───────────────────────────────────────────────────────
|
||||
# Use usernames (without @) or numeric channel IDs (-100xxxxxxxxxx)
|
||||
WATCHED_CHANNELS: list[str | int] = [
|
||||
#-1002230225603,
|
||||
"cloudxlog",
|
||||
#-1001967030016, # daisycloud
|
||||
#"berserklogs", # berserklogs
|
||||
#"BorwitaFreeLogs", # borwita
|
||||
-1002748707556, # darkcloud
|
||||
-1001684073398, # BHF Cloud
|
||||
-1003163621939, # Wich Love from R
|
||||
-1003611713618, # Khazan Cloud
|
||||
-1003328682684, # LogsPlanet
|
||||
-1003204260194, # JDP
|
||||
-1002828367761, # HesoyamCloud
|
||||
-1003513974925, # Slurm Logs
|
||||
-1003599300787, # Arhont Corp
|
||||
-1002582513379, # OnlyLogs
|
||||
-1002788333372, # Ickis Cloud
|
||||
#-1001234567890, # private channel by ID
|
||||
]
|
||||
|
||||
# ─── File handling ───────────────────────────────────────────────────────────
|
||||
# Runtime artefacts live under data/, matching the documented project layout
# and the data/* entries in .gitignore, so downloaded dumps, hit files and logs
# never end up in an un-ignored location at the repository root.
TEMP_DIR = Path("./data/tmp")
HITS_FILE = Path("./data/hits.txt")
LOG_FILE = Path("./data/logs/monitor.log")
|
||||
|
||||
# Extensions to download and process
|
||||
ALLOWED_EXTENSIONS = {".txt", ".zip", ".7z", ".rar"}
|
||||
|
||||
# Max file size to download (bytes). Default: 4 GB.
|
||||
# Very large files are skipped to avoid abuse of your session.
|
||||
MAX_FILE_SIZE = 4 * 1024 * 1024 * 1024 # 4 GB (Telegram Premium max)
|
||||
|
||||
# ─── Archive passwords to try ────────────────────────────────────────────────
|
||||
ARCHIVE_PASSWORDS: list[bytes] = [
|
||||
b"1234",
|
||||
b"0000",
|
||||
b"infected",
|
||||
b"telegram",
|
||||
b"password",
|
||||
b"12345",
|
||||
b"",
|
||||
b"Borwita",
|
||||
b"@WichLoveFromR",
|
||||
]
|
||||
|
||||
# ─── Backfill settings ───────────────────────────────────────────────────────
|
||||
# How many historical messages to scan per channel on startup (0 = skip backfill)
|
||||
BACKFILL_LIMIT = 500
|
||||
|
||||
# ─── tdl downloader settings ─────────────────────────────────────────────────
|
||||
# Namespace tdl was logged into. Run `tdl login` with no -n flag → namespace
|
||||
# is "default". Run `tdl login -n foo` → namespace is "foo".
|
||||
# Set to None to omit -n entirely (tdl will use "default" anyway).
|
||||
TDL_NAMESPACE: str | None = "ulpmon"
|
||||
|
||||
# Parallel chunk workers per file (-t / --threads global flag)
|
||||
TDL_THREADS = 8
|
||||
|
||||
# Max concurrent files per tdl invocation (-l / --limit global flag)
|
||||
TDL_PERFILE = 4
|
||||
|
||||
# Max messages to batch into a single tdl invocation during backfill.
|
||||
# tdl handles the parallelism internally via -l and -t.
|
||||
TDL_AMOUNT = 4
|
||||
|
||||
# Whether to use a Telegram takeout session for downloads (lower flood limits).
|
||||
# Takeout sessions are rate-limited differently — good for bulk backfill.
|
||||
TDL_TAKEOUT = True
|
||||
1
core/__init__.py
Normal file
1
core/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""core — Telegram I/O pipeline (scraper, downloader, processor, notifier)."""
|
||||
68
core/bot_downloader.md
Normal file
68
core/bot_downloader.md
Normal file
@@ -0,0 +1,68 @@
|
||||
# core/bot_downloader.py
|
||||
|
||||
Handles "click to download" inline button flows. Some Telegram channels post files via a bot behind a button rather than directly attaching them.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from core.bot_downloader import (
|
||||
handle_bot_download_message,
|
||||
has_download_button,
|
||||
extract_password,
|
||||
)
|
||||
```
|
||||
|
||||
### `handle_bot_download_message(client, bot, msg, source_name, patterns, password=None)`
|
||||
**async.** Full pipeline:
|
||||
1. Detect download button
|
||||
2. Click it (URL button → `/start payload` to the bot; callback button → `.click()`)
|
||||
3. Wait up to `BOT_REPLY_TIMEOUT` seconds for the bot to send a file back
|
||||
4. Hand each file response to `core.scraper.handle_message()`
|
||||
|
||||
### `has_download_button(msg) -> bool`
|
||||
Returns `True` if the message contains a recognisable download button.
|
||||
Checked in live handler and backfill before calling this module.
|
||||
|
||||
### `extract_password(msg) -> str | None`
|
||||
Scans message text for `Pass: ...` / `Password: ...` / `Contraseña: ...` / `Clave: ...` patterns (case-insensitive).
|
||||
Returns the extracted password string, or `None`.
|
||||
|
||||
---
|
||||
|
||||
## Button detection
|
||||
|
||||
Recognised button text keywords (case-insensitive):
|
||||
```
|
||||
DOWNLOAD, DESCARGAR, GET FILE, GET PACK, ⬇, 📥
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## URL button flow (most common)
|
||||
|
||||
```
|
||||
Button URL: https://t.me/SomeBot?start=ABC123
|
||||
→ parse bot username + payload
|
||||
→ client.send_message(bot_entity, "/start ABC123")
|
||||
→ poll get_messages(bot_entity, limit=3) every 1s for BOT_REPLY_TIMEOUT seconds
|
||||
→ return file messages found
|
||||
```
|
||||
|
||||
## Callback button flow (fallback)
|
||||
|
||||
```
|
||||
btn.click()
|
||||
→ sleep 2s
|
||||
→ get_messages(sender, limit=5)
|
||||
→ return file messages found
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Constants
|
||||
|
||||
| Name | Value | Description |
|
||||
|------|-------|-------------|
|
||||
| `BOT_REPLY_TIMEOUT` | `10` | Seconds to wait for bot file reply |
|
||||
| `DOWNLOAD_BUTTON_KEYWORDS` | see above | Button text triggers |
|
||||
| `PASSWORD_PATTERN` | regex | Matches `Pass[word]: value` in message text |
|
||||
161
core/bot_downloader.py
Normal file
161
core/bot_downloader.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""
|
||||
bot_downloader.py — Handles "click to download" inline button flows.
|
||||
|
||||
Some Telegram channels post messages with a DOWNLOAD button that triggers
|
||||
a bot to send you the actual file. This module simulates that click and
|
||||
captures the bot's file response.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import logging
|
||||
|
||||
from telethon import TelegramClient
|
||||
from telethon.tl.types import MessageMediaDocument, KeyboardButtonUrl
|
||||
from telethon.errors import FloodWaitError
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Substrings that mark an inline button as a download trigger; they are compared
# against the upper-cased button text in find_download_button().
DOWNLOAD_BUTTON_KEYWORDS = ["DOWNLOAD", "DESCARGAR", "GET FILE", "GET PACK", "⬇", "📥"]
# Seconds to wait for the bot's file reply; click_download_button() also uses it
# as the number of one-second polls.
# NOTE(review): config.py defines BOT_REPLY_TIMEOUT = 10 as well, but this module
# does not import it — confirm which definition is meant to be authoritative.
BOT_REPLY_TIMEOUT = 10

# Captures whatever follows a "Pass:", "Password:", "Contraseña:", "Contrasena:"
# or "Clave:" label up to the end of that line (MULTILINE), case-insensitively.
PASSWORD_PATTERN = re.compile(
    r"(?:Pass|Password|Contraseña|Contrasena|Clave)[\s]*:[\s]*(.+)$",
    re.IGNORECASE | re.MULTILINE
)
|
||||
|
||||
|
||||
# ─── Password extraction ──────────────────────────────────────────────────────
|
||||
|
||||
def extract_password(msg) -> str | None:
    """
    Extract an archive password announced in a message's text.

    Searches ``msg.text`` with PASSWORD_PATTERN and takes what follows the
    first matching label on the same line. Surrounding whitespace and
    Markdown markers (asterisk, backtick, underscore, tilde) are removed
    from both ends.

    Returns the cleaned password, or None when the message has no text, no
    label matches, or nothing is left after cleaning.
    """
    if not msg.text:
        return None

    match = PASSWORD_PATTERN.search(msg.text)
    if not match:
        return None

    # Whitespace and markdown markers can be interleaved at either end
    # (e.g. "**Pass:** 123" captures "** 123"), so alternate both strips
    # until the value stops changing instead of stripping each kind once.
    pwd = match.group(1)
    while True:
        cleaned = pwd.strip().strip("*`_~")
        if cleaned == pwd:
            break
        pwd = cleaned

    # A capture made only of markers/whitespace is not a password.
    if not pwd:
        return None

    log.info(f" Found password in message: '{pwd}'")
    return pwd
|
||||
|
||||
|
||||
# ─── Button detection ─────────────────────────────────────────────────────────
|
||||
|
||||
def find_download_button(msg):
    """
    Return the first inline button of ``msg`` that looks like a download button.

    Buttons are visited row by row; a button qualifies when its upper-cased
    text contains any entry of DOWNLOAD_BUTTON_KEYWORDS. Returns None when the
    message has no buttons or none of them qualifies.
    """
    rows = msg.buttons
    if not rows:
        return None

    # Flatten the keyboard while keeping row-major order.
    for button in (candidate for row in rows for candidate in row):
        label = button.text.upper()
        for keyword in DOWNLOAD_BUTTON_KEYWORDS:
            if keyword in label:
                return button
    return None
|
||||
|
||||
|
||||
def has_download_button(msg) -> bool:
    """Tell whether find_download_button() finds a download button on ``msg``."""
    button = find_download_button(msg)
    return button is not None
|
||||
|
||||
|
||||
# ─── Click + wait flow ────────────────────────────────────────────────────────
|
||||
|
||||
async def click_download_button(client: TelegramClient, msg) -> list:
    """
    Trigger the download button on ``msg`` and collect the bot's file reply.

    Two kinds of button are handled:

    * URL button (``https://t.me/<bot>?start=<payload>``): sends
      ``/start <payload>`` to that bot, then checks the chat with the bot once
      per second, up to BOT_REPLY_TIMEOUT times. Only messages newer than
      the ``/start`` message just sent are accepted.
    * Any other button: clicks it, waits 2 seconds, then inspects the five most
      recent messages of the original message's sender.

    Returns the messages whose media is a MessageMediaDocument. Returns an
    empty list when there is no download button, the URL has an unexpected
    format, a Telegram call fails (the failure is logged), or no file arrives.
    """
    btn = find_download_button(msg)
    if not btn:
        return []

    log.info(f" Clicking button: '{btn.text}'")

    # ── URL button (most common) ───────────────────────────────────────────
    if isinstance(btn.button, KeyboardButtonUrl):
        url = btn.button.url  # e.g. https://t.me/SomeBot?start=ABC123

        match = re.search(r"t\.me/([A-Za-z0-9_]+)\?start=(.+)", url)
        if not match:
            log.warning(f" Unrecognised URL format: {url}")
            return []

        bot_username, payload = match.group(1), match.group(2)
        log.info(f" → Messaging @{bot_username} with /start {payload}")

        try:
            bot_entity = await client.get_entity(bot_username)
            # Keep the sent message: its id marks the point after which a
            # message in this chat can be a reply to THIS request.
            sent = await client.send_message(bot_entity, f"/start {payload}")
        except Exception as e:
            log.error(f" Failed to message bot: {e}")
            return []

        # Poll for reply
        log.info(f" Waiting up to {BOT_REPLY_TIMEOUT}s for bot reply...")
        for _ in range(BOT_REPLY_TIMEOUT):
            await asyncio.sleep(1)
            try:
                recent = await client.get_messages(bot_entity, limit=3)
                # Ignore anything that was already in the chat before our
                # /start: without this, a file delivered for an earlier
                # request is picked up again as if it were the new reply.
                files = [
                    m for m in recent
                    if m.id > sent.id and isinstance(m.media, MessageMediaDocument)
                ]
                if files:
                    log.info(f" ✓ Got file from bot.")
                    return files
            except Exception as e:
                log.warning(f" Poll error: {e}")
                break

        log.warning(f" Bot did not reply within {BOT_REPLY_TIMEOUT}s.")
        return []

    # ── Callback button (less common) ─────────────────────────────────────
    try:
        await btn.click()
        # Fixed grace period for the bot to react to the callback.
        await asyncio.sleep(2)
    except Exception as e:
        log.error(f" Callback click failed: {e}")
        return []

    try:
        sender = await msg.get_sender()
        # NOTE(review): unlike the URL flow there is no "sent" message to
        # compare against here, so older documents among the last five
        # messages are returned as well — confirm whether that is acceptable.
        recent = await client.get_messages(sender, limit=5)
        return [m for m in recent if m.media and isinstance(m.media, MessageMediaDocument)]
    except Exception as e:
        log.warning(f" Fallback poll failed: {e}")
        return []
|
||||
|
||||
|
||||
# ─── Main entry point ─────────────────────────────────────────────────────────
|
||||
|
||||
async def handle_bot_download_message(
    client: TelegramClient,
    bot: TelegramClient,
    msg,
    source_name: str,
    patterns,
    password: str | None = None,
) -> None:
    """
    Run the button-download flow for ``msg`` and process every file received.

    Does nothing when ``msg`` carries no download button. Otherwise the button
    is clicked via click_download_button(); each returned file message is then
    passed to core.scraper.handle_message() together with ``bot``,
    ``patterns`` and ``password``, using ``"<source_name>[bot]"`` as the
    source label. When no file comes back, a warning is logged and the
    function returns.
    """
    if not has_download_button(msg):
        return

    log.info(f"[BotDL] Download button detected in {source_name}")

    bot_files = await click_download_button(client, msg)
    if not bot_files:
        log.warning(f"[BotDL] No file received for message in {source_name}.")
        return

    # Imported at call time rather than at module import time.
    # NOTE(review): presumably avoids a circular import with core.scraper — confirm.
    from core.scraper import handle_message

    tagged_source = f"{source_name}[bot]"
    for resp in bot_files:
        log.info(f" [BotDL] Response media type: {type(resp.media).__name__}, attrs: {getattr(resp.media.document, 'attributes', []) if hasattr(resp.media, 'document') else 'none'}")
        await handle_message(client, bot, resp, tagged_source, patterns, password=password)
|
||||
67
core/notifier.md
Normal file
67
core/notifier.md
Normal file
@@ -0,0 +1,67 @@
|
||||
# core/notifier.py
|
||||
|
||||
Scores hits, deduplicates, persists to disk and DB, sends Telegram alerts.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from core.notifier import notify, send_status
|
||||
```
|
||||
|
||||
### `notify(bot, hits: list[str], source: str, filename: str)`
|
||||
**async.** Full notification pipeline:
|
||||
1. `score_hits(hits)` → `list[ScoredHit]`
|
||||
2. Deduplicate via SHA-256 hashes (`data/dedup.json`)
|
||||
3. `insert_hits()` into SQLite for new + dupes (flagged accordingly)
|
||||
4. `write_hits()` → append to `data/hits.txt`
|
||||
5. `write_hits_csv()` → append to `data/hits.csv`
|
||||
6. `send_alert()` → Telegram message for CRITICAL/HIGH/MEDIUM only
|
||||
7. Post `EvHit` events onto the TUI bus for each new hit
|
||||
|
||||
### `send_status(bot, message: str)`
|
||||
**async.** Sends a plain Markdown message to `config.NOTIFY_CHAT_ID`. Used for startup/status notifications.
|
||||
|
||||
---
|
||||
|
||||
## Internal functions
|
||||
|
||||
| Function | Description |
|
||||
|----------|-------------|
|
||||
| `deduplicate(hits)` | Returns `(new_hits, dupe_hits)`; updates `data/dedup.json` |
|
||||
| `write_hits(scored_hits, source)` | Appends grouped human-readable block to `data/hits.txt` |
|
||||
| `write_hits_csv(scored_hits, source, filename)` | Appends rows to `data/hits.csv`; writes header on first call |
|
||||
| `send_alert(bot, scored_hits, source, filename)` | Sends Telegram message grouped by severity; skips if all LOW |
|
||||
|
||||
---
|
||||
|
||||
## Output files
|
||||
|
||||
| File | Format | Notes |
|
||||
|------|--------|-------|
|
||||
| `data/hits.txt` | Plain text, grouped by severity | Human-readable, append-only |
|
||||
| `data/hits.csv` | CSV with header | Columns: `timestamp, severity, score, url, username, password, reasons, source, filename` |
|
||||
| `data/dedup.json` | JSON array of SHA-256 hex strings | Hashes of `line.strip().lower()` |
|
||||
|
||||
---
|
||||
|
||||
## Alert behaviour
|
||||
|
||||
- CRITICAL / HIGH / MEDIUM → Telegram alert sent immediately
|
||||
- LOW → stored in DB + files, **no** Telegram alert
|
||||
- Duplicates → stored in DB with `seen_before=1`, no alert, no file write
|
||||
|
||||
## Telegram alert format
|
||||
|
||||
```
|
||||
🚨 Credential hit(s) detected
|
||||
📁 `filename`
|
||||
📢 `source`
|
||||
🕐 `timestamp`
|
||||
|
||||
Summary: 🔴 N 🟠 N 🟡 N 🟢 N
|
||||
|
||||
🔴 CRITICAL (N)
|
||||
`url:user:pass`
|
||||
↳ reason | reason
|
||||
... (up to 10 per severity; remainder counted)
|
||||
```
|
||||
248
core/notifier.py
Normal file
248
core/notifier.py
Normal file
@@ -0,0 +1,248 @@
|
||||
"""
|
||||
notifier.py — Persists hits to disk and sends Telegram bot alerts.
|
||||
|
||||
Includes:
|
||||
- Severity scoring via scorer.py
|
||||
- Deduplication: same credential never written or alerted twice
|
||||
- SQLite storage via database.py
|
||||
- hits.txt kept as a human-readable backup
|
||||
- Telegram alerts grouped by severity
|
||||
"""
|
||||
|
||||
import logging
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from telethon import TelegramClient
|
||||
|
||||
import csv
|
||||
|
||||
from config import HITS_FILE, NOTIFY_CHAT_ID
|
||||
from utils.scorer import score_hits, summarize, CRITICAL, HIGH, MEDIUM, LOW, SEVERITY_EMOJI
|
||||
from utils.database import insert_hits
|
||||
from tui import events as bus
|
||||
|
||||
# CSV twin of the hits log: same directory and stem as HITS_FILE, ".csv" suffix.
HITS_CSV = HITS_FILE.with_suffix(".csv")

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

MAX_PREVIEW = 10 # hits to show per severity group in alert
# File holding the hashes of hit lines that were already seen; it is read and
# rewritten by the dedup helpers below.
# NOTE(review): hard-coded under ./data, whereas HITS_CSV follows
# config.HITS_FILE — confirm both are meant to live in the same directory.
DEDUP_FILE = Path("./data/dedup.json")

# Only alert immediately for these severities — LOW hits are silent
ALERT_SEVERITIES = {CRITICAL, HIGH, MEDIUM}
|
||||
|
||||
|
||||
# ─── Deduplication ────────────────────────────────────────────────────────────
|
||||
|
||||
def _hash(line: str) -> str:
    """Return the SHA-256 hex digest of ``line`` after trimming and lower-casing it."""
    normalised = line.strip().lower()
    hasher = hashlib.sha256()
    hasher.update(normalised.encode())
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def _load_seen_hashes() -> set:
    """
    Load the set of already-seen hit hashes from DEDUP_FILE.

    Returns an empty set when the file does not exist yet. Any other failure
    (unreadable file, invalid JSON, unexpected JSON shape) also yields an
    empty set, but is logged as a warning: the caller will treat every hit as
    new, and the history may be overwritten on the next save.
    """
    try:
        with open(DEDUP_FILE, "r", encoding="utf-8") as f:
            return set(json.load(f))
    except FileNotFoundError:
        # First run: nothing has been recorded yet.
        return set()
    except Exception as e:
        # Best-effort by design, but never silent — losing the dedup history
        # means already-reported credentials will be reported again.
        log.warning(f"Could not read dedup file {DEDUP_FILE}: {e} — starting with an empty set")
        return set()
|
||||
|
||||
|
||||
def _save_seen_hashes(seen: set) -> None:
    """
    Persist ``seen`` to DEDUP_FILE as a JSON array.

    The data is written to a sibling ``<name>.tmp`` file first and then moved
    over DEDUP_FILE, so an interrupted write cannot leave a truncated dedup
    file behind. The parent directory is created when missing. Failures are
    logged as a warning and not raised.
    """
    tmp_path = DEDUP_FILE.with_name(DEDUP_FILE.name + ".tmp")
    try:
        DEDUP_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(list(seen), f)
        # Swap the finished file into place in a single rename.
        tmp_path.replace(DEDUP_FILE)
    except Exception as e:
        log.warning(f"Could not save dedup file: {e}")
        # Do not leave a partial temp file next to the real one.
        try:
            tmp_path.unlink()
        except OSError:
            pass
|
||||
|
||||
|
||||
def deduplicate(hits: list) -> tuple[list, list]:
    """
    Split scored hits into never-seen-before hits and duplicates.

    Accepts a list of ScoredHit objects (only the ``raw`` attribute is read).
    Returns (new_hits, dupe_hits).

    A hit is a duplicate when the hash of its raw line is already stored in
    the dedup file OR when an identical line appeared earlier in this same
    batch, so one call never reports the same credential as new twice.
    Newly seen hashes are persisted before returning.
    """
    seen = _load_seen_hashes()
    new_hits = []
    dupe_hits = []
    new_hashes = set()

    for h in hits:
        digest = _hash(h.raw)
        # Check the current batch as well as the stored history.
        if digest in seen or digest in new_hashes:
            dupe_hits.append(h)
        else:
            new_hits.append(h)
            new_hashes.add(digest)

    if new_hashes:
        seen.update(new_hashes)
        _save_seen_hashes(seen)

    log.info(
        f" Dedup: {len(hits)} raw hit(s) → "
        f"{len(new_hits)} new, {len(dupe_hits)} duplicate(s)"
    )
    return new_hits, dupe_hits
|
||||
|
||||
|
||||
# ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def _timestamp() -> str:
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
|
||||
|
||||
|
||||
# ─── Output ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def write_hits(scored_hits: list, source: str) -> None:
    """Append one report section for *scored_hits* to hits.txt.

    The section starts with a header (source, timestamp, per-severity
    counts) followed by the hits grouped from CRITICAL down to LOW; each
    hit is written as its raw line plus a line listing its reasons.
    """
    HITS_FILE.parent.mkdir(parents=True, exist_ok=True)
    counts = summarize(scored_hits)
    rule = "=" * 60

    with open(HITS_FILE, "a", encoding="utf-8") as out:
        header = (
            f"\n{rule}\n"
            f"Source : {source}\n"
            f"Time : {_timestamp()}\n"
            f"Hits : {len(scored_hits)} "
            f"(CRITICAL={counts[CRITICAL]} HIGH={counts[HIGH]} "
            f"MEDIUM={counts[MEDIUM]} LOW={counts[LOW]})\n"
            f"{rule}\n"
        )
        out.write(header)

        for level in (CRITICAL, HIGH, MEDIUM, LOW):
            members = [hit for hit in scored_hits if hit.severity == level]
            if members:
                marker = SEVERITY_EMOJI[level]
                out.write(f"\n{marker} {level} ({len(members)})\n")
                for hit in members:
                    out.write(f" {hit.raw}\n")
                    out.write(f" → {' | '.join(hit.reasons)}\n")

    log.info(f" Wrote {len(scored_hits)} hit(s) to {HITS_FILE}")
|
||||
|
||||
|
||||
def write_hits_csv(scored_hits: list, source: str, filename: str) -> None:
    """Append *scored_hits* to hits.csv, one row per hit.

    The header row is written only when the CSV file does not exist yet.
    All rows of one call share the same timestamp; missing url, username
    or password values are written as empty strings.
    """
    HITS_CSV.parent.mkdir(parents=True, exist_ok=True)
    needs_header = not HITS_CSV.exists()
    stamp = _timestamp()
    columns = [
        "timestamp", "severity", "score", "url", "username",
        "password", "reasons", "source", "filename",
    ]

    with open(HITS_CSV, "a", newline="", encoding="utf-8") as fh:
        out = csv.writer(fh)
        if needs_header:
            out.writerow(columns)
        out.writerows(
            [
                stamp, hit.severity, hit.score,
                hit.url or "", hit.username or "", hit.password or "",
                " | ".join(hit.reasons), source, filename,
            ]
            for hit in scored_hits
        )

    log.info(f" Wrote {len(scored_hits)} hit(s) to {HITS_CSV}")
|
||||
|
||||
|
||||
async def send_alert(
    bot: TelegramClient,
    scored_hits: list,
    source: str,
    filename: str,
) -> None:
    """
    Send a Telegram alert grouped by severity.

    Only includes CRITICAL, HIGH, MEDIUM — LOW hits are omitted from the
    listing (they still appear in the summary counts). At most MAX_PREVIEW
    hits are listed per severity; the remainder is reported as a count.

    Formatting uses the delimiters Telethon's "markdown" parse mode
    recognises: ``**bold**``, ``__italic__`` and backtick inline code.
    Send failures are logged and never raised.
    """
    summary = summarize(scored_hits)
    alertable = [h for h in scored_hits if h.severity in ALERT_SEVERITIES]

    if not alertable:
        log.info(" No alertable hits (all LOW) — skipping Telegram notification.")
        return

    def _code(text) -> str:
        # A backtick inside an inline-code span would terminate it early.
        return str(text).replace("`", "'")

    lines = [
        "🚨 **Credential hit(s) detected**",
        "",
        f"📁 `{_code(filename)}`",
        f"📢 `{_code(source)}`",
        f"🕐 `{_timestamp()}`",
        "",
        "**Summary:**",
        f"🔴 CRITICAL: `{summary[CRITICAL]}` "
        f"🟠 HIGH: `{summary[HIGH]}` "
        f"🟡 MEDIUM: `{summary[MEDIUM]}` "
        f"🟢 LOW: `{summary[LOW]}`",
    ]

    for severity in [CRITICAL, HIGH, MEDIUM]:
        group = [h for h in alertable if h.severity == severity]
        if not group:
            continue
        emoji = SEVERITY_EMOJI[severity]
        lines.append(f"\n{emoji} **{severity}** ({len(group)})")
        for h in group[:MAX_PREVIEW]:
            lines.append(f"`{_code(h.raw)}`")
            lines.append(f"__↳ {' | '.join(h.reasons)}__")
        if len(group) > MAX_PREVIEW:
            lines.append(f"__...and {len(group) - MAX_PREVIEW} more__")

    try:
        await bot.send_message(NOTIFY_CHAT_ID, "\n".join(lines), parse_mode="markdown")
    except Exception as e:
        log.error(f"Failed to send Telegram alert: {e}")
|
||||
|
||||
|
||||
# ─── Main entry point ────────────────────────────────────────────────────────
|
||||
|
||||
async def notify(bot: TelegramClient, hits: list[str], source: str, filename: str) -> None:
    """
    Run the full notification pipeline for the raw hit lines of one file.

    Steps, in order:
    1. Score all hits
    2. Deduplicate
    3. Insert every hit into the database (new and duplicate, flagged
       through ``seen_before``)
    4. Stop here when nothing is new
    5. Post each new hit to the TUI event bus
    6. Append new hits to hits.txt and hits.csv
    7. Send the Telegram alert for the new hits

    Does nothing when *hits* is empty.
    """
    if not hits:
        return

    scored = score_hits(hits)
    log.info(f" Scored {len(scored)} hit(s) — {summarize(scored)}")

    fresh, repeats = deduplicate(scored)

    # Both groups are recorded in the DB; only the flag differs.
    for bucket, already_seen in ((fresh, False), (repeats, True)):
        if bucket:
            insert_hits(bucket, source, filename, seen_before=already_seen)

    if not fresh:
        log.info(" All hits already seen before — no alert sent.")
        return

    for hit in fresh:
        event = bus.EvHit(
            severity=hit.severity,
            raw=hit.raw,
            source=source,
            filename=filename,
            reasons=hit.reasons,
        )
        bus.post(event)

    write_hits(fresh, source)
    write_hits_csv(fresh, source, filename)
    await send_alert(bot, fresh, source, filename)
|
||||
|
||||
|
||||
async def send_status(bot: TelegramClient, message: str) -> None:
    """Send *message* to the notify chat using markdown parse mode.

    A failed send is logged as an error and not raised.
    """
    try:
        await bot.send_message(NOTIFY_CHAT_ID, message, parse_mode="markdown")
    except Exception as err:
        log.error(f"Failed to send status message: {err}")
|
||||
69
core/processor.md
Normal file
69
core/processor.md
Normal file
@@ -0,0 +1,69 @@
|
||||
# core/processor.py
|
||||
|
||||
Archive extraction and hit searching. No Telegram deps, no async.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from core.processor import compile_patterns, process_file
|
||||
```
|
||||
|
||||
### `compile_patterns(keywords: list[str]) -> list[re.Pattern]`
|
||||
Compiles a list of keyword strings into case-insensitive regex patterns.
|
||||
Call once at startup; pass the result everywhere patterns are needed.
|
||||
|
||||
```python
|
||||
patterns = compile_patterns(config.TARGET_KEYWORDS)
|
||||
```
|
||||
|
||||
### `process_file(filepath: Path, patterns, password=None) -> list[str]`
|
||||
Full pipeline: unpack → search each `.txt` → recurse into nested archives → clean up everything.
|
||||
Returns list of matching raw lines (hits). Deletes the original file and all extracted contents on completion.
|
||||
|
||||
```python
|
||||
hits = process_file(Path("data/tmp/combo.zip"), patterns, password="infected")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Internal functions
|
||||
|
||||
| Function | Signature | Description |
|
||||
|----------|-----------|-------------|
|
||||
| `search_file` | `(filepath, patterns) -> list[str]` | Stream-reads `.txt` line by line; ignores encoding errors |
|
||||
| `unpack` | `(filepath, extra_password) -> (files, extract_dir\|None)` | Dispatches to correct extractor; plain `.txt` returned as-is |
|
||||
| `extract_zip` | `(filepath, dest, extra_password)` | Tries no password first, then `ARCHIVE_PASSWORDS` list |
|
||||
| `extract_7z` | `(filepath, dest, extra_password)` | Requires `py7zr`; skips if not installed |
|
||||
| `extract_rar` | `(filepath, dest, extra_password)` | Requires `rarfile` + `unrar` binary |
|
||||
| `_try_passwords` | `(extract_fn, passwords)` | Iterates password list, stops on first success |
|
||||
|
||||
---
|
||||
|
||||
## Supported formats
|
||||
|
||||
| Extension | Library | Notes |
|
||||
|-----------|---------|-------|
|
||||
| `.txt` | built-in | Stream-read, no load into memory |
|
||||
| `.zip` | `zipfile` | stdlib |
|
||||
| `.7z` | `py7zr` | optional; skipped if not installed |
|
||||
| `.rar` | `rarfile` | optional; requires `unrar` system binary |
|
||||
|
||||
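Nested archives are unpacked and searched **recursively**: `process_file` calls itself on every archive it finds inside an archive, with no explicit depth limit.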
|
||||
|
||||
---
|
||||
|
||||
## Password order
|
||||
|
||||
1. `extra_password` (from message/channel carry-forward) — tried first
|
||||
2. `config.ARCHIVE_PASSWORDS` — tried in order
|
||||
|
||||
---
|
||||
|
||||
## Cleanup guarantee
|
||||
|
||||
`process_file` always deletes:
|
||||
- Extracted individual files
|
||||
- Extract subdirectory
|
||||
- Original downloaded file
|
||||
|
||||
Even if no hits are found.
|
||||
233
core/processor.py
Normal file
233
core/processor.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""
|
||||
processor.py — Archive extraction and hit searching logic.
|
||||
|
||||
Supports: .txt, .zip, .7z, .rar
|
||||
Stream-processes files line by line — safe for large combo lists.
|
||||
"""
|
||||
|
||||
import logging
import re
import shutil
import zipfile
from pathlib import Path

# Optional archive back-ends. Each one is imported defensively so the module
# still loads when the library is missing; the matching HAS_* flag tells the
# extractors to skip that format.
try:
    import py7zr
    HAS_7Z = True
except ImportError:
    HAS_7Z = False

try:
    import rarfile
    # Point rarfile at the external "unrar" executable (looked up on PATH).
    rarfile.UNRAR_TOOL = "unrar"
    HAS_RAR = True
except ImportError:
    HAS_RAR = False
|
||||
|
||||
from config import ARCHIVE_PASSWORDS
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ─── Searching ───────────────────────────────────────────────────────────────
|
||||
|
||||
def compile_patterns(keywords: list[str]) -> list[re.Pattern]:
    """Compile each keyword as a case-insensitive regular expression, in order."""
    compiled: list[re.Pattern] = []
    for keyword in keywords:
        compiled.append(re.compile(keyword, flags=re.IGNORECASE))
    return compiled
|
||||
|
||||
|
||||
def search_file(filepath: Path, patterns: list[re.Pattern]) -> list[str]:
    """
    Return the stripped, non-empty lines of a text file that match any pattern.

    The file is read line by line as UTF-8 with undecodable bytes dropped.
    If the file cannot be read, a warning is logged and whatever was
    collected so far is returned.
    """
    matches: list[str] = []
    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as handle:
            for raw_line in handle:
                text = raw_line.strip()
                if not text:
                    continue
                for pattern in patterns:
                    if pattern.search(text):
                        matches.append(text)
                        break  # one match is enough for this line
    except Exception as e:
        log.warning(f"Could not read {filepath.name}: {e}")
    return matches
|
||||
|
||||
|
||||
# ─── Extraction ──────────────────────────────────────────────────────────────
|
||||
|
||||
def _try_passwords(extract_fn, passwords: list[bytes]) -> bool:
|
||||
"""Try a list of passwords against an extract function. Returns True on success."""
|
||||
for pwd in passwords:
|
||||
try:
|
||||
extract_fn(pwd)
|
||||
return True
|
||||
except Exception:
|
||||
continue
|
||||
return False
|
||||
|
||||
|
||||
def extract_zip(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
    """
    Extract a ZIP archive into *dest* and return the extracted files.

    Extraction is attempted without a password first. If that fails with
    RuntimeError (treated here as "password-protected"), *extra_password*
    is tried first, followed by every entry of ARCHIVE_PASSWORDS.

    Returns an empty list when the archive is invalid, cannot be unlocked,
    or extraction fails; problems are logged, never raised.
    """
    # Caller-supplied password takes priority over the configured list.
    passwords = list(ARCHIVE_PASSWORDS)
    if extra_password:
        passwords.insert(0, extra_password.encode())

    extracted: list[Path] = []
    try:
        with zipfile.ZipFile(filepath) as zf:
            def try_extract(pwd: bytes):
                zf.extractall(dest, pwd=pwd or None)

            try:
                zf.extractall(dest)
            except RuntimeError:
                log.info(f" ZIP is password-protected, trying common passwords...")
                if not _try_passwords(try_extract, passwords):
                    log.warning(f" Could not unlock {filepath.name} — skipping.")
                    return []

        extracted = [p for p in dest.rglob("*") if p.is_file()]
    except zipfile.BadZipFile:
        log.warning(f" {filepath.name} is not a valid ZIP.")
    except Exception as e:
        log.warning(f" ZIP extraction error on {filepath.name}: {e}")
    return extracted
|
||||
|
||||
|
||||
def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
    """
    Extract a 7z archive into *dest* and return the extracted files.

    Requires py7zr; when it is not installed the file is skipped. Extraction
    is attempted without a password first. On ``PasswordRequired``,
    *extra_password* is tried first, followed by every entry of
    ARCHIVE_PASSWORDS.

    Returns an empty list when py7zr is missing, the archive cannot be
    unlocked, or extraction fails; problems are logged, never raised.
    """
    if not HAS_7Z:
        log.warning("py7zr not installed — skipping .7z file.")
        return []

    extracted: list[Path] = []
    # Caller-supplied password takes priority over the configured list.
    passwords = list(ARCHIVE_PASSWORDS)
    if extra_password:
        passwords.insert(0, extra_password.encode())

    def try_extract(pwd: bytes):
        # The archive is reopened for every attempt with the candidate password.
        with py7zr.SevenZipFile(filepath, mode="r", password=pwd.decode()) as z:
            z.extractall(dest)

    try:
        # Try without password first
        try:
            with py7zr.SevenZipFile(filepath, mode="r") as z:
                z.extractall(dest)
        except py7zr.exceptions.PasswordRequired:
            log.info(f" 7z is password-protected, trying common passwords...")
            if not _try_passwords(try_extract, passwords):
                log.warning(f" Could not unlock {filepath.name} — skipping.")
                return []

        extracted = [p for p in dest.rglob("*") if p.is_file()]
    except Exception as e:
        log.warning(f" 7z extraction error on {filepath.name}: {e}")
    return extracted
|
||||
|
||||
|
||||
def extract_rar(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
    """
    Extract a RAR archive into *dest* and return the extracted files.

    Requires the rarfile package; when it is not installed the file is
    skipped. Extraction is attempted without a password first. Any failure
    other than ``BadRarFile`` is treated as "may be password-protected":
    *extra_password* is tried first, followed by every entry of
    ARCHIVE_PASSWORDS.

    Returns an empty list when rarfile is missing, the archive is invalid
    or cannot be unlocked, or extraction fails; problems are logged, never
    raised.
    """
    if not HAS_RAR:
        log.warning("rarfile not installed — skipping .rar file.")
        return []

    # Caller-supplied password takes priority over the configured list.
    passwords = list(ARCHIVE_PASSWORDS)
    if extra_password:
        passwords.insert(0, extra_password.encode())

    extracted: list[Path] = []
    try:
        with rarfile.RarFile(filepath) as rf:
            def try_extract(pwd: bytes):
                rf.extractall(dest, pwd=pwd.decode() if pwd else None)

            try:
                rf.extractall(dest)
            except rarfile.BadRarFile:
                log.warning(f" {filepath.name} is not a valid RAR.")
                return []
            except Exception:
                log.info(f" RAR may be password-protected, trying common passwords...")
                if not _try_passwords(try_extract, passwords):
                    log.warning(f" Could not unlock {filepath.name} — skipping.")
                    return []

        extracted = [p for p in dest.rglob("*") if p.is_file()]
    except Exception as e:
        log.warning(f" RAR extraction error on {filepath.name}: {e}")
    return extracted
|
||||
|
||||
|
||||
def unpack(filepath: Path, extra_password: str | None = None) -> tuple[list[Path], Path | None]:
|
||||
"""
|
||||
Unpacks an archive into a sibling directory.
|
||||
Returns (list of extracted files, extract_dir or None).
|
||||
If it's not an archive, returns ([filepath], None).
|
||||
"""
|
||||
suffix = filepath.suffix.lower()
|
||||
extract_dir = filepath.parent / filepath.stem
|
||||
|
||||
if suffix == ".zip":
|
||||
extract_dir.mkdir(exist_ok=True)
|
||||
files = extract_zip(filepath, extract_dir, extra_password)
|
||||
return files, extract_dir
|
||||
|
||||
elif suffix == ".7z":
|
||||
extract_dir.mkdir(exist_ok=True)
|
||||
files = extract_7z(filepath, extract_dir, extra_password)
|
||||
return files, extract_dir
|
||||
|
||||
elif suffix == ".rar":
|
||||
extract_dir.mkdir(exist_ok=True)
|
||||
files = extract_rar(filepath, extract_dir, extra_password)
|
||||
return files, extract_dir
|
||||
|
||||
else:
|
||||
# Plain file — return as-is, no extract dir to clean up
|
||||
return [filepath], None
|
||||
|
||||
|
||||
# ─── Main entry point ────────────────────────────────────────────────────────
|
||||
|
||||
def process_file(filepath: Path, patterns, password: str | None = None) -> list[str]:
    """
    Full pipeline: unpack → search each file → clean up everything.

    ``.txt`` files are searched with *patterns*. Archives found inside an
    archive are processed by a recursive call, which receives the same
    *password* so a carried-forward password also unlocks inner archives.
    Other file types are ignored.

    Cleanup (extracted files, the extract directory and *filepath* itself)
    runs in a ``finally`` block, so it happens even when searching raises.

    Returns list of matching lines (hits).
    """
    log.info(f" Processing: {filepath.name}")
    all_hits: list[str] = []
    extract_dir: Path | None = None

    try:
        files, extract_dir = unpack(filepath, extra_password=password)

        for f in files:
            suffix = f.suffix.lower()
            if suffix == ".txt":
                hits = search_file(f, patterns)
                if hits:
                    log.info(f" ✓ {len(hits)} hit(s) in {f.name}")
                    all_hits.extend(hits)

            # Nested archives — handled recursively
            elif suffix in {".zip", ".7z", ".rar"} and f != filepath:
                log.info(f" → Nested archive: {f.name}")
                all_hits.extend(process_file(f, patterns, password=password))
                continue  # the recursive call already removed f

            # Clean up extracted file (best effort)
            try:
                f.unlink(missing_ok=True)
            except OSError:
                pass
    finally:
        # Clean up extract dir
        if extract_dir and extract_dir.exists():
            shutil.rmtree(extract_dir, ignore_errors=True)

        # Clean up original download (best effort)
        try:
            filepath.unlink(missing_ok=True)
        except OSError:
            pass

    return all_hits
|
||||
65
core/scraper.md
Normal file
65
core/scraper.md
Normal file
@@ -0,0 +1,65 @@
|
||||
# core/scraper.py
|
||||
|
||||
Telethon user-client layer. Handles live listening, backfill, and the single-message download pipeline.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from core.scraper import handle_message, backfill_all, register_handlers, warm_entity_cache
|
||||
```
|
||||
|
||||
### `handle_message(client, bot, msg, source_name, patterns, password=None)`
|
||||
**async.** Full pipeline for one document message:
|
||||
1. Extract filename + size, check allowlist + size guard
|
||||
2. Check `utils.cache` — skip if already seen
|
||||
3. Try `tdl` download → Telethon fallback
|
||||
4. `core.processor.process_file()` → hits
|
||||
5. `core.notifier.notify()` if hits found
|
||||
6. `utils.cache.mark_seen()`
|
||||
|
||||
Called by: live handler, `bot_downloader`, backfill fallback path.
|
||||
|
||||
### `backfill_all(client, bot, patterns)`
|
||||
**async.** Iterates `config.WATCHED_CHANNELS`, calls `backfill_channel()` for each.
|
||||
No-op if `config.BACKFILL_LIMIT == 0`.
|
||||
|
||||
### `register_handlers(client, bot, patterns)`
|
||||
Registers a `NewMessage` Telethon event handler on `config.WATCHED_CHANNELS`.
|
||||
Used in **CLI mode only** (`--no-tui`). The TUI manages its own handler via `_make_handler()` in `tui/app.py`.
|
||||
|
||||
### `warm_entity_cache(client)`
|
||||
**async.** Iterates `client.iter_dialogs()` so Telethon caches entity mappings.
|
||||
Must be called before using raw numeric channel IDs.
|
||||
|
||||
---
|
||||
|
||||
## Internal functions
|
||||
|
||||
| Function | Description |
|
||||
|----------|-------------|
|
||||
| `get_filename(msg)` | Extracts filename from `MessageMediaDocument`; falls back to `{msg_id}{ext}` from MIME |
|
||||
| `get_filesize(msg)` | Returns document size in bytes |
|
||||
| `is_processable(filename, size)` | Checks extension allowlist + size limit; returns `(bool, reason)` |
|
||||
| `_make_dest(msg, filename)` | Resolves temp path, handles collision with `{msg_id}_{filename}` |
|
||||
| `_telethon_download(client, msg, dest, ...)` | Telethon fallback with tqdm progress + flood-wait handling. Posts `EvDownload*` bus events |
|
||||
| `backfill_channel(client, bot, channel, patterns, limit)` | Scans history with password carry-forward; batches via tdl |
|
||||
| `_process_batch(client, bot, batch, patterns)` | One tdl invocation for up to `TDL_AMOUNT` messages; per-file Telethon fallback |
|
||||
|
||||
---
|
||||
|
||||
## Password carry-forward (backfill)
|
||||
|
||||
Channels often post the archive password as a separate text message.
|
||||
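`backfill_channel` iterates newest→oldest and remembers the most recent password it has seen in `last_password`. File messages reached later in the scan (i.e. older posts) that carry no password of their own pick it up; file messages reached before the password message are queued without it.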
|
||||
|
||||
---
|
||||
|
||||
## Download strategy
|
||||
|
||||
```
|
||||
is_tdl_available()?
|
||||
yes → download_single_with_tdl() / download_batch_with_tdl()
|
||||
↓ failed?
|
||||
_telethon_download()
|
||||
no → _telethon_download() directly
|
||||
```
|
||||
410
core/scraper.py
Normal file
410
core/scraper.py
Normal file
@@ -0,0 +1,410 @@
|
||||
"""
|
||||
scraper.py — Telethon user client.
|
||||
|
||||
Handles:
|
||||
- Listening for new file messages in watched channels
|
||||
- Listening for messages with inline download buttons (bot-dispatched files)
|
||||
- Backfilling recent channel history on startup (batched via tdl)
|
||||
- Downloading files safely (size guard, flood wait)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from tqdm import tqdm
|
||||
from telethon import TelegramClient, events
|
||||
from telethon.errors import FloodWaitError, ChannelPrivateError, UsernameNotOccupiedError
|
||||
from telethon.tl.types import (
|
||||
MessageMediaDocument,
|
||||
DocumentAttributeFilename,
|
||||
InputDocumentFileLocation,
|
||||
)
|
||||
|
||||
from config import (
|
||||
ALLOWED_EXTENSIONS,
|
||||
BACKFILL_LIMIT,
|
||||
MAX_FILE_SIZE,
|
||||
TEMP_DIR,
|
||||
WATCHED_CHANNELS,
|
||||
TDL_AMOUNT,
|
||||
)
|
||||
from core.bot_downloader import handle_bot_download_message, has_download_button, extract_password
|
||||
from utils.cache import is_seen, mark_seen
|
||||
from core.processor import process_file
|
||||
from core.notifier import notify
|
||||
from core.tdl_downloader import (
|
||||
BatchEntry,
|
||||
download_batch_with_tdl,
|
||||
download_single_with_tdl,
|
||||
is_tdl_available,
|
||||
)
|
||||
from tui import events as bus
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_filename(msg) -> str | None:
    """Return a filename for a document message, or None for anything else.

    The document's first filename attribute wins. Without one, a name is
    built from the message id plus an extension derived from the MIME type
    (``.bin`` for unknown types).
    """
    media = msg.media
    if not isinstance(media, MessageMediaDocument):
        return None

    document = media.document
    filename_attrs = [
        attr for attr in document.attributes
        if isinstance(attr, DocumentAttributeFilename)
    ]
    if filename_attrs:
        return filename_attrs[0].file_name

    mime_type = getattr(document, "mime_type", "") or ""
    extension_by_mime = {
        "application/x-rar-compressed": ".rar",
        "application/vnd.rar": ".rar",
        "application/zip": ".zip",
        "application/x-7z-compressed": ".7z",
        "text/plain": ".txt",
    }
    extension = extension_by_mime.get(mime_type, ".bin")
    return f"{msg.id}{extension}"
|
||||
|
||||
|
||||
def get_filesize(msg) -> int:
    """Return the document size in bytes; 0 when the message is not a document or the size is unset."""
    if isinstance(msg.media, MessageMediaDocument):
        return msg.media.document.size or 0
    return 0
|
||||
|
||||
|
||||
def is_processable(filename: str, size: int) -> tuple[bool, str]:
    """Decide whether a file should be downloaded.

    Returns ``(True, "")`` when the extension is in ALLOWED_EXTENSIONS and
    *size* does not exceed MAX_FILE_SIZE; otherwise ``(False, reason)``.
    """
    extension = Path(filename).suffix.lower()
    if extension not in ALLOWED_EXTENSIONS:
        return False, f"extension {extension!r} not in allowlist"

    if size <= MAX_FILE_SIZE:
        return True, ""

    mib = 1024 * 1024
    size_mb = size / mib
    return False, f"too large ({size_mb:.1f} MB > {MAX_FILE_SIZE // mib} MB limit)"
|
||||
|
||||
|
||||
def _make_dest(msg, filename: str) -> Path:
    """Resolve the download path inside TEMP_DIR, avoiding name collisions.

    *filename* originates from the remote message, so only its final path
    component is used: directory separators, ``..`` or an absolute path
    cannot place the download outside TEMP_DIR. If nothing usable remains,
    ``{msg.id}.bin`` is used instead.

    When a file with that name already exists, the message id is prefixed
    to the name.
    """
    TEMP_DIR.mkdir(parents=True, exist_ok=True)

    # Normalise Windows separators first so "a\\b.txt" is also reduced to "b.txt".
    safe_name = Path(filename.replace("\\", "/")).name
    if safe_name in ("", ".", ".."):
        safe_name = f"{msg.id}.bin"

    dest = TEMP_DIR / safe_name
    if dest.exists():
        dest = TEMP_DIR / f"{msg.id}_{safe_name}"
    return dest
|
||||
|
||||
|
||||
# ─── Telethon fallback download ───────────────────────────────────────────────
|
||||
|
||||
async def _telethon_download(client: TelegramClient, msg, dest: Path, filename: str, size: int, batch_id: str | None = None) -> bool:
    """Download a single file via Telethon. Returns True on success.

    Progress is shown with a tqdm bar and reported on the TUI event bus:
    ``EvDownloadQueued`` (only when no *batch_id* was supplied),
    ``EvDownloadStarted``, then ``EvDownloadDone`` or ``EvDownloadFailed``.

    On ``FloodWaitError`` the coroutine sleeps for the requested number of
    seconds and retries once with ``client.download_media``. Every failure,
    including a failed retry, is logged, reported as ``EvDownloadFailed``
    and results in False rather than an exception.
    """
    _bid = batch_id or f"telethon_{int(time.monotonic_ns())}"
    if batch_id is None:
        # Standalone call (not already queued by tdl path) — post queued event
        bus.post(bus.EvDownloadQueued(
            batch_id=_bid, filename=filename,
            size_mb=round(size / (1024 * 1024), 2),
            source="telethon", password=None,
        ))
    bus.post(bus.EvDownloadStarted(batch_id=_bid, filename=filename))

    def _report_failure(err: Exception) -> bool:
        log.error(f" Telethon download failed for {filename}: {err}")
        bus.post(bus.EvDownloadFailed(batch_id=_bid, filename=filename, reason=str(err)))
        return False

    try:
        with tqdm(
            total=size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc=filename[:40],
            colour="cyan",
        ) as pbar:
            async def progress(current, total):
                pbar.n = current
                pbar.refresh()

            doc = msg.media.document
            location = InputDocumentFileLocation(
                id=doc.id,
                access_hash=doc.access_hash,
                file_reference=doc.file_reference,
                thumb_size="",
            )
            await client.download_file(
                location,
                file=dest,
                part_size_kb=512,
                progress_callback=progress,
            )
        bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon"))
        return True
    except FloodWaitError as e:
        log.warning(f" Flood wait: sleeping {e.seconds}s...")
        await asyncio.sleep(e.seconds)
        # The retry can fail too; keep the "returns a bool" contract.
        try:
            await client.download_media(msg, file=dest)
        except Exception as retry_err:
            return _report_failure(retry_err)
        bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon"))
        return True
    except Exception as e:
        return _report_failure(e)
|
||||
|
||||
|
||||
# ─── Single-message pipeline (live handler + bot_downloader) ──────────────────
|
||||
|
||||
async def handle_message(
    client: TelegramClient,
    bot: TelegramClient,
    msg,
    source_name: str,
    patterns,
    password: str | None = None,
) -> None:
    """Download and process a single file message.

    The message is skipped when no filename can be derived, when the file
    fails the extension/size check, or when its document id is already
    marked as seen. Otherwise the file is downloaded (tdl when available,
    Telethon as fallback), processed, marked as seen, and any hits are
    passed to ``notify``.
    """
    filename = get_filename(msg)
    if not filename:
        log.warning(" handle_message: could not extract filename, skipping.")
        return

    size = get_filesize(msg)
    accepted, why_not = is_processable(filename, size)
    if not accepted:
        log.warning(f" handle_message: skipping '{filename}' — {why_not}")
        return

    doc_id = msg.media.document.id
    if is_seen(doc_id):
        log.info(f" Skipping {filename} — already processed.")
        return

    dest = _make_dest(msg, filename)
    log.info(f"↓ Downloading: {filename} ({size / 1024:.1f} KB) from {source_name}")

    # tdl single → Telethon fallback
    got_file = False
    if is_tdl_available():
        got_file = await download_single_with_tdl(msg, dest)
    if not got_file:
        if is_tdl_available():
            log.warning(" [tdl] failed — falling back to Telethon")
        got_file = await _telethon_download(client, msg, dest, filename, size)

    if not got_file:
        log.error(f" All download attempts failed for {filename}")
        return

    found = process_file(dest, patterns, password=password)
    mark_seen(doc_id)

    if not found:
        log.info(f" No hits in {filename}")
        return
    await notify(bot, found, source_name, filename)
|
||||
|
||||
|
||||
# ─── Batch pipeline (backfill only) ───────────────────────────────────────────
|
||||
|
||||
async def _process_batch(
    client: TelegramClient,
    bot: TelegramClient,
    batch: list[tuple],  # list of (msg, source_name, password)
    patterns,
) -> int:
    """
    Download a batch of messages with one tdl invocation, then process each.

    Messages without a derivable filename are dropped. Any file tdl did not
    report as downloaded is retried individually through Telethon. Each
    downloaded file is processed, marked as seen, and its hits (if any)
    are passed to ``notify``.

    Returns the number of files successfully processed.
    """
    if not batch:
        return 0

    # Build BatchEntry list
    entries: list[BatchEntry] = []
    for message, channel_name, archive_password in batch:
        name = get_filename(message)
        if name:
            entries.append(BatchEntry(
                msg=message,
                filename=name,
                dest=_make_dest(message, name),
                doc_id=message.media.document.id,
                source_name=channel_name,
                password=archive_password,
            ))

    log.info(f"[Batch] {len(entries)} file(s): {', '.join(e.filename for e in entries)}")

    # One tdl call for the whole batch
    outcome = await download_batch_with_tdl(entries)

    done = 0
    for item in entries:
        have_file = outcome.get(item.doc_id, False)

        if not have_file:
            # Per-file Telethon fallback
            log.info(f" [Batch] Telethon fallback: {item.filename}")
            have_file = await _telethon_download(
                client, item.msg, item.dest, item.filename, get_filesize(item.msg)
            )

        if not have_file:
            log.error(f" [Batch] All attempts failed: {item.filename}")
            continue

        found = process_file(item.dest, patterns, password=item.password)
        mark_seen(item.doc_id)

        if found:
            await notify(bot, found, item.source_name, item.filename)
        else:
            log.info(f" No hits in {item.filename}")

        done += 1

    return done
|
||||
|
||||
|
||||
# ─── Backfill ─────────────────────────────────────────────────────────────────
|
||||
|
||||
async def backfill_channel(
    client: TelegramClient,
    bot: TelegramClient,
    channel: str | int,
    patterns,
    limit: int,
) -> None:
    """Scan the last `limit` messages of a channel for file attachments.

    Document messages that pass the extension/size check and are not yet
    marked as seen are queued into batches of TDL_AMOUNT when tdl is
    available, or handled one by one through ``handle_message`` otherwise.
    Messages with a download button are handled individually after any
    pending batch has been flushed.

    Access errors and any other exception raised during the scan are
    logged; they are not propagated to the caller.
    """
    log.info(f"[Backfill] Scanning history: {channel} (last {limit} messages)")
    total = 0
    batch: list[tuple] = []  # (msg, source_name, password)
    last_password: str | None = None  # carry password across adjacent messages

    async def flush_batch():
        # Process everything queued so far and add the processed count to the total.
        nonlocal total
        if batch:
            total += await _process_batch(client, bot, batch, patterns)
            batch.clear()

    try:
        async for msg in client.iter_messages(channel, limit=limit):
            source_name = str(channel)

            # Extract password from this message if present, and remember it.
            # iter_messages goes newest→oldest, so a password post that appears
            # above the files in the channel will arrive AFTER them here.
            # last_password is only applied going forward in this loop:
            # - file messages without their own password use the most
            #   recently seen one
            # - it is replaced whenever a fresh password is seen, so
            #   subsequent (older) file messages pick it up.
            # NOTE(review): files already queued before the password message
            # is reached keep whatever password they were queued with —
            # confirm this matches how channels order password posts.
            msg_password = extract_password(msg)
            if msg_password:
                last_password = msg_password

            password = msg_password or last_password

            if msg.media and isinstance(msg.media, MessageMediaDocument):
                filename = get_filename(msg)
                size = get_filesize(msg)

                if not filename:
                    continue

                ok, reason = is_processable(filename, size)
                if not ok:
                    log.warning(f" [Backfill] Skipping '{filename}' — {reason}")
                    continue

                if is_seen(msg.media.document.id):
                    log.info(f" [Backfill] Already seen: {filename}")
                    continue

                if is_tdl_available():
                    batch.append((msg, source_name, password))
                    if len(batch) >= TDL_AMOUNT:
                        await flush_batch()
                else:
                    # No tdl — fall straight through to single handle_message
                    await handle_message(client, bot, msg, source_name, patterns, password=password)
                    # Counted whether or not handle_message ended up processing the file.
                    total += 1
                    await asyncio.sleep(0.5)

            elif msg.buttons and has_download_button(msg):
                # Bot-button messages can't be batched — handle individually
                await flush_batch()  # flush any pending batch first
                await handle_bot_download_message(client, bot, msg, source_name, patterns, password=password)
                total += 1
                await asyncio.sleep(1.5)

        # Flush whatever's left
        await flush_batch()

    except (ChannelPrivateError, UsernameNotOccupiedError) as e:
        log.error(f"[Backfill] Cannot access {channel}: {e}")
    except Exception as e:
        log.error(f"[Backfill] Error scanning {channel}: {e}")

    log.info(f"[Backfill] Done: {channel} — {total} file(s) processed")
|
||||
|
||||
|
||||
async def backfill_all(client: TelegramClient, bot: TelegramClient, patterns) -> None:
    """
    Run the history backfill over every entry of WATCHED_CHANNELS, one at a time.

    When BACKFILL_LIMIT is zero or negative nothing is scanned and only a
    notice is logged. Otherwise each channel is handed to backfill_channel()
    in turn, awaiting one before starting the next.
    """
    backfill_enabled = BACKFILL_LIMIT > 0
    if backfill_enabled:
        log.info(f"[Backfill] Starting for {len(WATCHED_CHANNELS)} channel(s)...")
        for channel in WATCHED_CHANNELS:
            await backfill_channel(client, bot, channel, patterns, BACKFILL_LIMIT)
        log.info("[Backfill] Complete.")
    else:
        log.info("[Backfill] Disabled (BACKFILL_LIMIT=0)")
|
||||
|
||||
|
||||
# ─── Entity cache warmup ──────────────────────────────────────────────────────
|
||||
|
||||
async def warm_entity_cache(client: TelegramClient) -> None:
    """
    Iterate the account's dialog list once, discarding every item.

    The loop exists purely for its side effect: fetching the dialogs lets
    Telethon cache the entity mappings, which is required before raw numeric
    IDs can be used.
    """
    log.info("Warming entity cache (fetching dialogs)...")
    dialogs = client.iter_dialogs()
    async for _dialog in dialogs:
        continue
    log.info("Entity cache ready.")
|
||||
|
||||
|
||||
# ─── Live listener ────────────────────────────────────────────────────────────
|
||||
|
||||
def register_handlers(client: TelegramClient, bot: TelegramClient, patterns) -> None:
    """
    Attach the live NewMessage handler for WATCHED_CHANNELS to *client*.

    The handler dispatches document messages to handle_message() and
    download-button messages to handle_bot_download_message(), passing along
    the best password known for the chat.
    """

    # Last password seen in each chat, keyed by chat id. A password is often
    # posted as its own text message, so it is remembered here and reused for
    # later messages of the same chat that carry no inline password.
    last_password_by_chat: dict[int, str] = {}

    async def on_new_message(event):
        message = event.message

        # Prefer the chat's username for log output; fall back to the numeric
        # id when the chat object is unavailable or has no username.
        try:
            source = event.chat.username or str(event.chat_id)
        except Exception:
            source = str(event.chat_id)

        chat_id = event.chat_id
        log.info(f"[Live] New message in {source}")

        # Remember a password carried by this very message.
        inline_password = extract_password(message)
        if inline_password:
            last_password_by_chat[chat_id] = inline_password
            log.debug(f"[Live] Password cached for {source}: '{inline_password}'")

        password = inline_password or last_password_by_chat.get(chat_id)

        if message.media and isinstance(message.media, MessageMediaDocument):
            await handle_message(client, bot, message, source, patterns, password=password)
            return

        if message.buttons and has_download_button(message):
            await handle_bot_download_message(client, bot, message, source, patterns, password=password)

    client.add_event_handler(on_new_message, events.NewMessage(chats=WATCHED_CHANNELS))
|
||||
70
core/tdl_downloader.md
Normal file
70
core/tdl_downloader.md
Normal file
@@ -0,0 +1,70 @@
|
||||
# core/tdl_downloader.py
|
||||
|
||||
Fast file downloads via `tdl` (Go MTProto). Falls back gracefully if tdl is not installed.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from core.tdl_downloader import (
|
||||
is_tdl_available,
|
||||
download_single_with_tdl,
|
||||
download_batch_with_tdl,
|
||||
BatchEntry,
|
||||
)
|
||||
```
|
||||
|
||||
### `is_tdl_available() -> bool`
|
||||
Returns `True` if `tdl` binary is on PATH.
|
||||
|
||||
### `download_single_with_tdl(msg, dest: Path) -> bool`
|
||||
**async.** Downloads one message's document. Returns `True` on success.
|
||||
Used by the live handler and `bot_downloader`.
|
||||
|
||||
### `download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]`
|
||||
**async.** Downloads up to `TDL_AMOUNT` messages in a single `tdl dl` invocation.
|
||||
Returns `{doc_id: True|False}` — `False` means Telethon fallback needed.
|
||||
|
||||
---
|
||||
|
||||
## BatchEntry dataclass
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class BatchEntry:
|
||||
msg: object # Telethon Message
|
||||
filename: str
|
||||
dest: Path # final destination path in TEMP_DIR
|
||||
doc_id: int # msg.media.document.id
|
||||
source_name: str
|
||||
password: str | None
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## TUI output pipeline
|
||||
|
||||
In TUI mode (`bus.tui_active == True`), `_run_tdl` pipes stdout+stderr and relays lines as `EvTdlOutput` events in real time.
|
||||
**Reads raw 256-byte chunks** (not line-by-line) and splits on `\r` and `\n`, because tdl uses `\r` to overwrite its progress bar in place.
|
||||
|
||||
In CLI mode: subprocess inherits the terminal, progress bars render natively.
|
||||
|
||||
---
|
||||
|
||||
## Staging directory isolation
|
||||
|
||||
Each batch/single download gets a unique `data/tmp/_tdl_{monotonic_ns}/` staging dir.
|
||||
After `tdl` exits, files are matched by name (with fuzzy stem fallback for `filenamify()` mangling) and moved to final `dest`. Staging dir is removed regardless of outcome.
|
||||
|
||||
`--template '{{ filenamify .FileName }}'` — tdl uses the original Telegram filename, not its default `DialogID_MessageID_filename` format.
|
||||
|
||||
---
|
||||
|
||||
## Config knobs (`config.py`)
|
||||
|
||||
| Setting | Default | Description |
|
||||
|---------|---------|-------------|
|
||||
| `TDL_NAMESPACE` | `"default"` | `-n` flag; `None` omits it |
|
||||
| `TDL_THREADS` | `8` | `-t` chunk workers per file |
|
||||
| `TDL_PERFILE` | `4` | `-l` concurrent files per invocation |
|
||||
| `TDL_AMOUNT` | `4` | Max messages per batch |
|
||||
| `TDL_TAKEOUT` | `False` | `--takeout` session flag |
|
||||
363
core/tdl_downloader.py
Normal file
363
core/tdl_downloader.py
Normal file
@@ -0,0 +1,363 @@
|
||||
"""
|
||||
tdl_downloader.py — Fast file downloads via tdl (Go MTProto implementation).
|
||||
|
||||
Install: https://github.com/iyear/tdl
|
||||
curl -sSL https://raw.githubusercontent.com/iyear/tdl/main/scripts/install.sh | bash
|
||||
|
||||
First-time setup — log in once:
|
||||
tdl login # saves to namespace "default"
|
||||
tdl login -n myns # saves to a named namespace
|
||||
|
||||
Relevant config.py knobs:
|
||||
TDL_NAMESPACE str|None Session namespace (default "default"; None omits -n)
|
||||
TDL_THREADS int Chunk workers per file (-t, default 4)
|
||||
TDL_PERFILE int Concurrent files (-l, default 4)
|
||||
TDL_AMOUNT int Messages per tdl batch (default 4)
|
||||
TDL_TAKEOUT bool Use takeout session (--takeout)
|
||||
|
||||
Flag reference:
|
||||
Global (BEFORE subcommand): -n --ns, -t --threads, -l --limit
|
||||
dl-specific: -u --url, -d --dir, --template, --continue, --takeout
|
||||
|
||||
Download isolation strategy:
|
||||
Each batch gets its own staging subdirectory (TEMP_DIR/_tdl_<monotonic_ns>/) so that
|
||||
concurrent downloads and homoglyph filename collisions can never cause tdl's
|
||||
internal .tmp → final rename to fail. Files are moved to TEMP_DIR after
|
||||
the batch completes and the staging dir is removed.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import shutil
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from config import TDL_NAMESPACE, TDL_THREADS, TDL_PERFILE, TDL_TAKEOUT, TEMP_DIR
|
||||
from tui import events as bus
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ─── Availability ─────────────────────────────────────────────────────────────
|
||||
|
||||
def is_tdl_available() -> bool:
    """Return True when an executable named ``tdl`` can be found on PATH."""
    tdl_path = shutil.which("tdl")
    return tdl_path is not None
|
||||
|
||||
|
||||
# ─── Message → URL ────────────────────────────────────────────────────────────
|
||||
|
||||
def _build_message_url(msg) -> str:
    """
    Return a ``https://t.me/c/<peer id>/<message id>`` link for *msg*.

    The peer id is taken from the first of ``channel_id``, ``chat_id`` or
    ``user_id`` found on ``msg.peer_id``.

    NOTE(review): the ``t.me/c/`` form is normally associated with
    channels/supergroups — confirm tdl resolves chat and user ids given this way.

    Raises:
        ValueError: if the peer exposes none of the three id attributes.
    """
    peer = msg.peer_id
    for id_attr in ("channel_id", "chat_id", "user_id"):
        if hasattr(peer, id_attr):
            return f"https://t.me/c/{getattr(peer, id_attr)}/{msg.id}"
    raise ValueError(f"Cannot build message URL from peer: {peer!r}")
|
||||
|
||||
|
||||
# ─── Command builder ──────────────────────────────────────────────────────────
|
||||
|
||||
def _build_cmd(urls: list[str], staging_dir: Path) -> list[str]:
    """
    Assemble the argv list for one ``tdl dl`` invocation.

    Layout: ``tdl <global flags> dl <-u url>... <dl flags>``.

    - Global flags (-n, -t, -l) are placed before the subcommand, as tdl
      requires. -n is only emitted when TDL_NAMESPACE is set.
    - Every URL gets its own ``-u`` flag.
    - ``-d`` points at *staging_dir*, a fresh per-batch directory, so tdl's
      internal .tmp → final rename cannot collide with an existing file.
    - ``--template '{{ filenamify .FileName }}'`` keeps only the original
      filename (no DialogID_MessageID_ prefix).
    - ``--continue`` lets interrupted downloads resume.
    - ``--skip-same`` is deliberately not passed: deduplication happens
      upstream via is_seen(), and the flag can make the .tmp rename fail when
      a same-named file already exists.
    - ``--takeout`` is appended when TDL_TAKEOUT is enabled.
    """
    cmd: list[str] = ["tdl"]

    # Global flags first.
    if TDL_NAMESPACE:
        cmd.extend(("-n", str(TDL_NAMESPACE)))
    cmd.extend(("-t", str(TDL_THREADS), "-l", str(TDL_PERFILE)))

    # Subcommand followed by one -u per message link.
    cmd.append("dl")
    for url in urls:
        cmd.extend(("-u", url))

    # dl-specific flags.
    cmd.extend((
        "-d", str(staging_dir),
        "--template", "{{ filenamify .FileName }}",
        "--continue",
    ))
    if TDL_TAKEOUT:
        cmd.append("--takeout")

    return cmd
|
||||
|
||||
|
||||
# ─── Runner ───────────────────────────────────────────────────────────────────
|
||||
|
||||
# tdl emits colour/cursor escape codes even when its output is not a TTY,
# so they are removed before lines are relayed.
import re as _re

_ANSI_RE = _re.compile(r"\x1b\[[0-9;]*[mGKHFJA-Z]|\x1b=|\x1b>|\x1b\[\?[0-9]+[hl]")


def _strip_ansi(text: str) -> str:
    """Return *text* with every escape sequence matched by _ANSI_RE removed."""
    stripped = _ANSI_RE.sub("", text)
    return stripped
|
||||
|
||||
|
||||
async def _run_tdl(cmd: list[str], label: str) -> bool:
    """
    Spawn tdl and handle its output depending on whether the TUI is running.

    - TUI mode: stdout and stderr are piped and read in raw 256-byte chunks
                (NOT line-by-line). The text is split on both \\r and \\n,
                stripped of ANSI codes, and every non-empty segment is posted
                immediately as EvTdlOutput. tdl uses \\r to overwrite its
                progress bar in place, so reading by line would stall until
                a newline arrives.
                Bytes are decoded with an incremental UTF-8 decoder so that a
                multi-byte character split across two chunks is not turned
                into replacement characters.
    - CLI mode: the subprocess inherits the terminal so tdl's progress bars
                render natively.

    Args:
        cmd: full argv list, as produced by _build_cmd().
        label: short description used in the success/failure log line.

    Returns:
        True on exit code 0, False on any other exit code, when the binary
        cannot be found, or on any other unexpected error (which is logged).
    """
    import codecs  # local import: only the TUI relay path needs it

    log.debug(f"[tdl] cmd: {' '.join(cmd)}")
    try:
        if bus.tui_active:
            proc = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )

            async def _relay(stream):
                # One decoder per stream: it carries incomplete UTF-8
                # sequences over to the next chunk.
                decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
                buf = ""
                while True:
                    chunk = await stream.read(256)
                    if not chunk:
                        break
                    buf += decoder.decode(chunk)
                    # Split on both \r and \n; process all complete segments
                    parts = _re.split(r"[\r\n]", buf)
                    # Last element may be an incomplete segment — keep in buffer
                    buf = parts[-1]
                    for part in parts[:-1]:
                        clean = _strip_ansi(part).strip()
                        if clean:
                            bus.post(bus.EvTdlOutput(line=clean))
                # Flush bytes still held by the decoder, then the text buffer
                buf += decoder.decode(b"", final=True)
                if buf:
                    clean = _strip_ansi(buf).strip()
                    if clean:
                        bus.post(bus.EvTdlOutput(line=clean))

            # Drain both pipes concurrently so neither can fill up and block tdl
            await asyncio.gather(_relay(proc.stdout), _relay(proc.stderr))
            await proc.wait()
        else:
            proc = await asyncio.create_subprocess_exec(*cmd)
            await proc.wait()

        if proc.returncode == 0:
            log.info(f"[tdl] ✓ {label}")
            return True
        else:
            log.error(f"[tdl] ✗ exit {proc.returncode} — {label}")
            return False
    except FileNotFoundError:
        log.error("[tdl] binary not found at runtime")
        return False
    except Exception as e:
        log.error(f"[tdl] Unexpected error: {e}")
        return False
|
||||
|
||||
|
||||
# ─── Staging dir helpers ──────────────────────────────────────────────────────
|
||||
|
||||
def _make_staging_dir() -> Path:
    """
    Create and return a unique staging subdirectory under TEMP_DIR.

    The directory is named ``_tdl_<monotonic_ns>``. Creation uses
    ``exist_ok=False`` and retries with a fresh timestamp on collision, so two
    downloads that start within the clock's resolution can never end up
    sharing (and later deleting) each other's staging directory.
    """
    base = TEMP_DIR.resolve()
    while True:
        staging = base / f"_tdl_{time.monotonic_ns()}"
        try:
            staging.mkdir(parents=True, exist_ok=False)
        except FileExistsError:
            # Name already taken by a concurrent download — try the next tick.
            continue
        return staging
|
||||
|
||||
|
||||
def _find_in_staging(staging: Path, expected_name: str) -> Path | None:
|
||||
"""
|
||||
Locate a downloaded file in the staging dir by matching its name.
|
||||
filenamify() can munge characters (strips @, collapses unicode, etc.)
|
||||
so we do a normalised stem comparison as a fallback.
|
||||
"""
|
||||
# Exact match first
|
||||
exact = staging / expected_name
|
||||
if exact.exists():
|
||||
return exact
|
||||
|
||||
expected_stem = Path(expected_name).stem.lower().lstrip("@").replace(" ", "")
|
||||
expected_suffix = Path(expected_name).suffix.lower()
|
||||
|
||||
for candidate in staging.iterdir():
|
||||
if not candidate.is_file():
|
||||
continue
|
||||
if candidate.suffix.lower() != expected_suffix:
|
||||
continue
|
||||
cand_stem = candidate.stem.lower().lstrip("@").replace(" ", "")
|
||||
if cand_stem == expected_stem:
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _move_from_staging(staging: Path, expected_name: str, final_dest: Path) -> bool:
    """
    Rename the staged file matching *expected_name* to *final_dest*.

    Returns True when the file was found and renamed; False (after logging)
    when it is not in staging or the rename fails.
    """
    source_path = _find_in_staging(staging, expected_name)
    if source_path is None:
        log.warning(f"[tdl] Not found in staging: '{expected_name}' (staging: {staging})")
        return False

    try:
        source_path.rename(final_dest)
    except Exception as e:
        log.error(f"[tdl] Move failed {source_path} → {final_dest}: {e}")
        return False

    log.debug(f"[tdl] Moved: {source_path.name} → {final_dest}")
    return True
|
||||
|
||||
|
||||
def _cleanup_staging(staging: Path) -> None:
    """Remove the staging directory tree, swallowing any Exception on the way."""
    try:
        shutil.rmtree(staging, ignore_errors=True)
    except Exception:
        return
|
||||
|
||||
|
||||
# ─── Public API ───────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
class BatchEntry:
|
||||
"""Carries everything needed to process one file after a batch download."""
|
||||
msg: object # Telethon Message
|
||||
filename: str
|
||||
dest: Path
|
||||
doc_id: int
|
||||
source_name: str
|
||||
password: str | None
|
||||
|
||||
|
||||
async def download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]:
    """
    Download a batch of messages in a single tdl invocation.

    Each batch gets its own staging subdirectory so filenames can never
    collide with existing files in TEMP_DIR. After tdl exits, files are
    moved from staging to their final dest paths. The staging directory is
    removed in a ``finally`` block, so it is cleaned up even when the
    download is cancelled or an unexpected error escapes.

    Args:
        entries: files to download; entries whose message cannot be turned
            into a URL are skipped and reported as False.

    Returns:
        Dict mapping doc_id → True (file ready at entry.dest) or False
        (fallback needed). Empty dict for an empty *entries* list.
    """
    if not entries:
        return {}

    if not is_tdl_available():
        log.warning("[tdl] not available — all entries need Telethon fallback")
        return {e.doc_id: False for e in entries}

    # Pair every entry with its message URL; drop the ones we cannot link to.
    valid_entries: list[tuple[BatchEntry, str]] = []
    for entry in entries:
        try:
            valid_entries.append((entry, _build_message_url(entry.msg)))
        except ValueError as exc:
            log.error(f"[tdl] Skipping {entry.filename}: {exc}")

    # Every entry starts out as "fallback needed"; successes overwrite below.
    results: dict[int, bool] = {e.doc_id: False for e in entries}
    if not valid_entries:
        return results

    batch_id = f"batch_{int(time.monotonic_ns())}"
    names = ", ".join(e.filename for e, _ in valid_entries)
    log.info(f"[tdl] Batch ({len(valid_entries)} files): {names}")

    # Notify TUI: all files in this batch are queued
    for entry, _ in valid_entries:
        size_mb = (entry.msg.media.document.size or 0) / (1024 * 1024)
        bus.post(bus.EvDownloadQueued(
            batch_id=batch_id,
            filename=entry.filename,
            size_mb=round(size_mb, 2),
            source=entry.source_name,
            password=entry.password,
        ))

    staging = _make_staging_dir()
    try:
        cmd = _build_cmd([u for _, u in valid_entries], staging)

        # Signal batch started
        for entry, _ in valid_entries:
            bus.post(bus.EvDownloadStarted(batch_id=batch_id, filename=entry.filename))

        tdl_ok = await _run_tdl(cmd, f"batch of {len(valid_entries)}")

        for entry, _ in valid_entries:
            if tdl_ok:
                moved = _move_from_staging(staging, entry.filename, entry.dest)
                results[entry.doc_id] = moved
                if moved:
                    bus.post(bus.EvDownloadDone(batch_id=batch_id, filename=entry.filename, via="tdl"))
                else:
                    log.warning(f"[tdl] Fallback needed: {entry.filename}")
                    bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=entry.filename, reason="staging mismatch"))
            else:
                results[entry.doc_id] = False
                bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=entry.filename, reason="tdl exit error"))
    finally:
        _cleanup_staging(staging)

    return results
|
||||
|
||||
|
||||
async def download_single_with_tdl(msg, dest: Path) -> bool:
    """
    Download a single message with tdl. Used by the live handler and
    bot_downloader where batching doesn't apply.

    The file is downloaded into a private staging directory and then moved
    to *dest*; the staging directory is always removed, even when the
    download is cancelled or an unexpected error escapes.

    Args:
        msg: message whose document should be downloaded.
        dest: final path of the file; ``dest.name`` is also the name the
            file is looked up by in the staging directory.

    Returns:
        True when the file is ready at *dest*; False when tdl is missing, no
        URL can be built, tdl fails, or the file cannot be found/moved.
    """
    if not is_tdl_available():
        log.warning("[tdl] not available — falling back to Telethon")
        return False

    try:
        url = _build_message_url(msg)
    except ValueError as e:
        log.error(f"[tdl] Cannot build URL: {e}")
        return False

    batch_id = f"single_{int(time.monotonic_ns())}"

    # Size is informational only (TUI display); tolerate messages whose media
    # has no document attribute instead of failing the whole download.
    document = getattr(getattr(msg, "media", None), "document", None)
    size_mb = (getattr(document, "size", 0) or 0) / (1024 * 1024)

    bus.post(bus.EvDownloadQueued(
        batch_id=batch_id, filename=dest.name,
        size_mb=round(size_mb, 2), source="live", password=None,
    ))
    bus.post(bus.EvDownloadStarted(batch_id=batch_id, filename=dest.name))

    staging = _make_staging_dir()
    try:
        cmd = _build_cmd([url], staging)
        log.info(f"[tdl] Single: {dest.name} ({url})")
        tdl_ok = await _run_tdl(cmd, dest.name)

        if tdl_ok:
            result = _move_from_staging(staging, dest.name, dest)
        else:
            result = False
    finally:
        _cleanup_staging(staging)

    if result:
        bus.post(bus.EvDownloadDone(batch_id=batch_id, filename=dest.name, via="tdl"))
    else:
        bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=dest.name, reason="tdl failed"))
    return result
|
||||
0
data/.gitkeep
Normal file
0
data/.gitkeep
Normal file
54
logs/monitor.log
Normal file
54
logs/monitor.log
Normal file
@@ -0,0 +1,54 @@
|
||||
2026-04-02 00:45:48,909 [INFO] utils.database: Database ready: data/hits.db
|
||||
2026-04-02 00:45:49,119 [INFO] telethon.crypto.aes: libssl detected, it will be used for encryption
|
||||
2026-04-02 00:45:49,156 [INFO] utils.database: Database ready: data/hits.db
|
||||
2026-04-02 00:45:49,159 [INFO] tui.app: [bot] Connecting bot_client...
|
||||
2026-04-02 00:45:49,159 [INFO] telethon.network.mtprotosender: Connecting to 149.154.175.59:443/TcpFull...
|
||||
2026-04-02 00:45:49,203 [INFO] tui.app: [bus] EvStatus: Starting — 12 channel(s), 2 pattern(s)
|
||||
2026-04-02 00:45:49,281 [INFO] telethon.network.mtprotosender: Connection to 149.154.175.59:443/TcpFull complete!
|
||||
2026-04-02 00:45:49,900 [INFO] tui.app: [bot] bot_client connected, authorizing...
|
||||
2026-04-02 00:45:49,901 [INFO] tui.app: [bot] bot_client ready
|
||||
2026-04-02 00:45:49,901 [INFO] tui.app: [bot] Connecting user_client...
|
||||
2026-04-02 00:45:49,901 [INFO] telethon.network.mtprotosender: Connecting to 149.154.175.59:443/TcpFull...
|
||||
2026-04-02 00:45:49,908 [INFO] __main__: Cleaning up tmp/...
|
||||
2026-04-02 00:54:16,429 [INFO] utils.database: Database ready: data/hits.db
|
||||
2026-04-02 00:54:16,638 [INFO] telethon.crypto.aes: libssl detected, it will be used for encryption
|
||||
2026-04-02 00:54:16,666 [ERROR] tui.app: [bot-thread] Unhandled exception: cannot import name 'HITS_CSV' from 'config' (/home/anti/Tools/sj/telegrammer/config.py)
|
||||
Traceback (most recent call last):
|
||||
File "/home/anti/Tools/sj/telegrammer/tui/app.py", line 848, in _run_bot_thread
|
||||
loop.run_until_complete(self._bot_main())
|
||||
~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
|
||||
File "/usr/lib64/python3.14/asyncio/base_events.py", line 719, in run_until_complete
|
||||
return future.result()
|
||||
~~~~~~~~~~~~~^^
|
||||
File "/home/anti/Tools/sj/telegrammer/tui/app.py", line 865, in _bot_main
|
||||
from core.notifier import send_status
|
||||
File "/home/anti/Tools/sj/telegrammer/core/notifier.py", line 22, in <module>
|
||||
from config import HITS_FILE, HITS_CSV, NOTIFY_CHAT_ID
|
||||
ImportError: cannot import name 'HITS_CSV' from 'config' (/home/anti/Tools/sj/telegrammer/config.py)
|
||||
2026-04-02 00:54:16,716 [INFO] tui.app: [bus] EvStatus: Bot thread crashed: cannot import name 'HITS_CSV' from 'config' (/home/anti/Tools/sj/telegrammer/config.py)
|
||||
2026-04-02 00:54:22,624 [INFO] __main__: Cleaning up tmp/...
|
||||
2026-04-02 00:54:34,773 [INFO] utils.database: Database ready: data/hits.db
|
||||
2026-04-02 00:54:34,983 [INFO] telethon.crypto.aes: libssl detected, it will be used for encryption
|
||||
2026-04-02 00:54:35,015 [INFO] utils.database: Database ready: data/hits.db
|
||||
2026-04-02 00:54:35,015 [INFO] tui.app: [bot] Connecting bot_client...
|
||||
2026-04-02 00:54:35,015 [INFO] telethon.network.mtprotosender: Connecting to 149.154.175.59:443/TcpFull...
|
||||
2026-04-02 00:54:35,063 [INFO] tui.app: [bus] EvStatus: Starting — 12 channel(s), 2 pattern(s)
|
||||
2026-04-02 00:54:35,120 [INFO] telethon.network.mtprotosender: Connection to 149.154.175.59:443/TcpFull complete!
|
||||
2026-04-02 00:54:35,698 [INFO] tui.app: [bot] bot_client connected, authorizing...
|
||||
2026-04-02 00:54:35,698 [INFO] tui.app: [bot] bot_client ready
|
||||
2026-04-02 00:54:35,698 [INFO] tui.app: [bot] Connecting user_client...
|
||||
2026-04-02 00:54:35,698 [INFO] telethon.network.mtprotosender: Connecting to 149.154.175.59:443/TcpFull...
|
||||
2026-04-02 00:54:35,810 [INFO] telethon.network.mtprotosender: Connection to 149.154.175.59:443/TcpFull complete!
|
||||
2026-04-02 00:54:36,420 [INFO] tui.app: [bot] user_client connected, checking auth...
|
||||
2026-04-02 00:54:36,420 [INFO] tui.app: [bot] user_client ready
|
||||
2026-04-02 00:54:36,563 [INFO] tui.app: [bus] EvStatus: Connected as 4n (@clp_c)
|
||||
2026-04-02 00:54:36,653 [INFO] core.scraper: Warming entity cache (fetching dialogs)...
|
||||
2026-04-02 00:54:38,437 [INFO] core.scraper: Entity cache ready.
|
||||
2026-04-02 00:54:38,437 [INFO] tui.app: [bot] Handler registered for 12 channel(s)
|
||||
2026-04-02 00:54:38,437 [INFO] core.scraper: [Backfill] Starting for 12 channel(s)...
|
||||
2026-04-02 00:54:38,437 [INFO] core.scraper: [Backfill] Scanning history: cloudxlog (last 500 messages)
|
||||
2026-04-02 00:54:38,463 [INFO] tui.app: [bus] EvStatus: Watching 12 channel(s)
|
||||
2026-04-02 00:54:38,463 [INFO] tui.app: [bus] EvStatus: Live listener active
|
||||
2026-04-02 00:54:38,585 [INFO] core.scraper: [Batch] 4 file(s): @cloud t13.txt, @cloud t12.txt, @cloud t11.txt, @cloud t10.txt
|
||||
2026-04-02 00:54:38,585 [INFO] core.tdl_downloader: [tdl] Batch (4 files): @cloud t13.txt, @cloud t12.txt, @cloud t11.txt, @cloud t10.txt
|
||||
2026-04-02 00:54:40,248 [INFO] __main__: Cleaning up tmp/...
|
||||
142
main.py
Normal file
142
main.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""
|
||||
main.py — Entry point for the ULP credential monitor.
|
||||
|
||||
Usage:
|
||||
python main.py # TUI mode (default, requires textual)
|
||||
python main.py --no-tui # Plain CLI mode
|
||||
|
||||
First run will prompt for your Telegram phone number and 2FA code
|
||||
to create a session file. Subsequent runs are fully automatic.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import sys
|
||||
import shutil
|
||||
import argparse
|
||||
|
||||
import config
|
||||
from utils.database import init_db
|
||||
|
||||
|
||||
# ─── Logging setup ────────────────────────────────────────────────────────────
|
||||
|
||||
config.LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
|
||||
config.TEMP_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler(config.LOG_FILE, encoding="utf-8"),
|
||||
],
|
||||
)
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
init_db()
|
||||
|
||||
|
||||
# ─── Plain CLI mode ───────────────────────────────────────────────────────────
|
||||
|
||||
async def _cli_main():
    """
    Original asyncio main — runs without the TUI.

    Mirrors log output to stdout, starts the bot and user clients, sends a
    start-up status message, registers the live listener, runs the backfill
    and then blocks until the user client disconnects.

    The clients are started explicitly instead of via ``async with``:
    entering a TelegramClient context calls ``start()`` without arguments,
    which on an unauthorised bot session would prompt interactively instead
    of using ``config.BOT_TOKEN``. Both clients are disconnected in a
    ``finally`` block.
    """
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    from telethon import TelegramClient
    from core.processor import compile_patterns
    from core.notifier import send_status
    from core.scraper import backfill_all, register_handlers, warm_entity_cache

    log.info("=" * 60)
    log.info(" ULP Credential Monitor — CLI mode")
    log.info("=" * 60)

    patterns = compile_patterns(config.TARGET_KEYWORDS)
    log.info(f"Loaded {len(patterns)} keyword pattern(s)")
    log.info(f"Watching {len(config.WATCHED_CHANNELS)} channel(s)")

    user_client = TelegramClient(
        config.SESSION_NAME, config.API_ID, config.API_HASH,
        connection_retries=5, auto_reconnect=True, request_retries=5,
    )
    bot_client = TelegramClient(
        "bot_session", config.API_ID, config.API_HASH,
    )

    try:
        await bot_client.start(bot_token=config.BOT_TOKEN)
        log.info("Bot client connected.")

        await user_client.start()
        me = await user_client.get_me()
        log.info(f"User client connected as: {me.first_name} (@{me.username})")

        await send_status(
            bot_client,
            f"✅ *Monitor started*\n"
            f"User: `{me.first_name}`\n"
            f"Channels: `{len(config.WATCHED_CHANNELS)}`\n"
            f"Patterns: `{len(patterns)}`\n"
            f"Backfill: `{config.BACKFILL_LIMIT} msg/channel`",
        )

        await warm_entity_cache(user_client)
        register_handlers(user_client, bot_client, patterns)
        log.info("Live listener registered.")

        await backfill_all(user_client, bot_client, patterns)

        log.info("Listening for new messages... (Ctrl+C to stop)")
        await user_client.run_until_disconnected()
    finally:
        # Disconnect both clients even if the first disconnect fails.
        try:
            await user_client.disconnect()
        finally:
            await bot_client.disconnect()

    log.info("Monitor stopped.")
|
||||
|
||||
|
||||
# ─── Entry point ──────────────────────────────────────────────────────────────
|
||||
|
||||
def _cleanup_temp_dir() -> None:
    """Empty config.TEMP_DIR (if it exists) and make sure it exists afterwards."""
    log.info("Cleaning up tmp/...")
    if config.TEMP_DIR.exists():
        shutil.rmtree(config.TEMP_DIR, ignore_errors=True)
        # rmtree(ignore_errors=True) may leave the directory behind, so the
        # re-creation must tolerate an existing directory.
        config.TEMP_DIR.mkdir(parents=True, exist_ok=True)


def main():
    """
    Parse the command line and run the monitor in TUI or plain CLI mode.

    ``--no-tui`` selects the asyncio CLI loop; otherwise the Textual TUI is
    started. If the TUI cannot be imported, a hint is printed to stderr and
    the process exits with status 1. In both modes the temp directory is
    cleaned up on the way out, including after Ctrl+C.
    """
    parser = argparse.ArgumentParser(description="ULP Credential Monitor")
    parser.add_argument(
        "--no-tui",
        action="store_true",
        help="Run in plain CLI mode (no Textual TUI)",
    )
    args = parser.parse_args()

    if args.no_tui:
        try:
            asyncio.run(_cli_main())
        except KeyboardInterrupt:
            log.info("Interrupted by user.")
        finally:
            _cleanup_temp_dir()
            log.info("Done.")
    else:
        try:
            from tui.app import run_tui
        except ImportError as exc:
            # The failure is not necessarily a missing Textual install, so
            # show the underlying error alongside the usual hint.
            print(
                f"⚠ Could not load the TUI: {exc}\n"
                "⚠ Textual is not installed. Install it with:\n"
                " pip install textual\n"
                "Or run in plain CLI mode:\n"
                " python main.py --no-tui",
                file=sys.stderr,
            )
            sys.exit(1)

        try:
            run_tui()
        except KeyboardInterrupt:
            pass
        finally:
            _cleanup_temp_dir()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
2
pytest.ini
Normal file
2
pytest.ini
Normal file
@@ -0,0 +1,2 @@
|
||||
[pytest]
|
||||
testpaths = tests
|
||||
1
requirements-dev.txt
Normal file
1
requirements-dev.txt
Normal file
@@ -0,0 +1 @@
|
||||
pytest
|
||||
16
requirements.txt
Normal file
16
requirements.txt
Normal file
@@ -0,0 +1,16 @@
|
||||
# Telegram
|
||||
telethon
|
||||
tgcrypto
|
||||
|
||||
# TUI
|
||||
textual
|
||||
|
||||
# Config
|
||||
python-dotenv
|
||||
|
||||
# Progress bars (CLI mode)
|
||||
tqdm
|
||||
|
||||
# Archive extraction
|
||||
py7zr
|
||||
rarfile
|
||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
31
tests/conftest.py
Normal file
31
tests/conftest.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import os
|
||||
|
||||
# Must be set before config.py is imported by any module.
|
||||
# load_dotenv() runs at import time; these setdefaults fill the gap when .env is absent.
|
||||
os.environ.setdefault("API_ID", "12345")
|
||||
os.environ.setdefault("API_HASH", "dummy_hash_for_tests")
|
||||
os.environ.setdefault("BOT_TOKEN", "0:dummy_bot_token")
|
||||
os.environ.setdefault("NOTIFY_CHAT_ID", "99999")
|
||||
|
||||
import pytest
|
||||
import config
|
||||
import utils.scorer as scorer
|
||||
|
||||
# Two test keywords:
|
||||
# @testcorp\.com — employee email domain (triggers CRITICAL)
|
||||
# testcorp\.com — plain domain match (triggers LOW baseline)
|
||||
TEST_KEYWORDS = [r"@testcorp\.com", r"testcorp\.com"]
|
||||
|
||||
|
||||
@pytest.fixture
def patched_keywords(monkeypatch):
    """
    Swap in TEST_KEYWORDS for one test and rebuild the scorer's derived globals.

    scorer.py binds TARGET_KEYWORDS with ``from config import ...``, so
    patching ``config`` alone would not reach it; both modules are patched.
    EMPLOYEE_DOMAINS and ORG_DOMAINS are then recomputed by the scorer's own
    builders so the scoring logic sees the test patterns.
    """
    for module in (config, scorer):
        monkeypatch.setattr(module, "TARGET_KEYWORDS", TEST_KEYWORDS)

    derived_globals = (
        ("EMPLOYEE_DOMAINS", scorer._build_employee_domains),
        ("ORG_DOMAINS", scorer._build_org_domains),
    )
    for attr_name, builder in derived_globals:
        monkeypatch.setattr(scorer, attr_name, builder())
|
||||
55
tests/test_cache.py
Normal file
55
tests/test_cache.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""
|
||||
Tests for utils/cache.py — file-ID deduplication cache.
|
||||
|
||||
Each test gets an isolated cache file via the `isolated_cache` fixture
|
||||
so tests never touch data/cache.json.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import utils.cache as cache_module
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def isolated_cache(tmp_path, monkeypatch):
    """Redirect the cache module's CACHE_FILE into the test's tmp_path."""
    cache_path = tmp_path / "cache.json"
    monkeypatch.setattr(cache_module, "CACHE_FILE", cache_path)
|
||||
|
||||
|
||||
def test_unseen_id_returns_false():
    """An ID that was never marked is reported as not seen."""
    seen = cache_module.is_seen(12345)
    assert seen is False
|
||||
|
||||
|
||||
def test_mark_seen_makes_id_seen():
    """Marking an ID makes a following is_seen() call return True."""
    file_id = 12345
    cache_module.mark_seen(file_id)
    assert cache_module.is_seen(file_id) is True
|
||||
|
||||
|
||||
def test_multiple_ids_stored_independently():
    """Several marked IDs are all seen; an unmarked one is not."""
    marked_ids = (1, 2, 3)
    for file_id in marked_ids:
        cache_module.mark_seen(file_id)

    for file_id in marked_ids:
        assert cache_module.is_seen(file_id)
    assert not cache_module.is_seen(4)
|
||||
|
||||
|
||||
def test_persists_to_disk_between_calls():
    """
    is_seen() and mark_seen() each load from disk independently.
    This verifies the persist-on-write / load-on-read contract
    (simulating what happens across separate function calls in the bot loop).
    """
    cache_module.mark_seen(999)
    # Without this check the test would also pass for a purely in-memory cache:
    # confirm the write actually reached the (monkeypatched) cache file.
    assert cache_module.CACHE_FILE.exists()
    assert cache_module.is_seen(999) is True
|
||||
|
||||
|
||||
def test_missing_cache_file_handled_gracefully(tmp_path, monkeypatch):
    """is_seen() returns False when CACHE_FILE points at a path that was never created."""
    absent_path = tmp_path / "nonexistent.json"
    monkeypatch.setattr(cache_module, "CACHE_FILE", absent_path)
    seen = cache_module.is_seen(42)
    assert seen is False
|
||||
|
||||
|
||||
def test_mark_seen_is_idempotent():
    """Marking the same ID three times still leaves it seen."""
    for _ in range(3):
        cache_module.mark_seen(7)
    assert cache_module.is_seen(7) is True
|
||||
188
tests/test_database.py
Normal file
188
tests/test_database.py
Normal file
@@ -0,0 +1,188 @@
|
||||
"""
|
||||
Tests for utils/database.py — SQLite persistence layer.
|
||||
|
||||
Each test gets an isolated, throwaway DB file (under pytest's tmp_path) via the `isolated_db`
|
||||
fixture so tests never touch data/hits.db.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import utils.database as db_module
|
||||
from utils.scorer import ScoredHit, CRITICAL, HIGH, MEDIUM, LOW
|
||||
|
||||
|
||||
def make_hit(severity=LOW, url="testcorp.com", username="user", password="pass", raw=None):
    """
    Build a minimal ScoredHit for insertion tests.

    The score is looked up from ``severity`` (CRITICAL 40, HIGH 30, MEDIUM 20,
    LOW 10). A falsy ``raw`` falls back to ``"<url>|<username>|<password>"``.
    """
    score_for = {CRITICAL: 40, HIGH: 30, MEDIUM: 20, LOW: 10}
    raw_line = raw if raw else f"{url}|{username}|{password}"
    return ScoredHit(
        raw=raw_line,
        severity=severity,
        score=score_for[severity],
        reasons=["Test reason"],
        url=url,
        username=username,
        password=password,
    )
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def isolated_db(tmp_path, monkeypatch):
    """Point the database module at a per-test DB file and run init_db() on it."""
    scratch_db = tmp_path / "test_hits.db"
    monkeypatch.setattr(db_module, "DB_FILE", scratch_db)
    db_module.init_db()
|
||||
|
||||
|
||||
# ─── init_db ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_init_db_is_idempotent():
    """Repeated init_db() calls on an already initialised DB must not raise."""
    # The autouse isolated_db fixture has already called init_db() once.
    for _ in range(2):
        db_module.init_db()
|
||||
|
||||
|
||||
# ─── insert_hits ──────────────────────────────────────────────────────────────
|
||||
|
||||
def test_insert_returns_correct_row_count():
    """insert_hits() returns 2 when given two hits."""
    batch = [make_hit(), make_hit(severity=CRITICAL)]
    inserted = db_module.insert_hits(batch, source="testchan", filename="combo.txt")
    assert inserted == 2
|
||||
|
||||
|
||||
def test_insert_stores_all_fields():
    """Every column of an inserted hit can be read back through search()."""
    stored = make_hit(severity=HIGH, url="intranet.testcorp.com", username="jdoe", password="s3cr3t")
    db_module.insert_hits([stored], source="mychan", filename="creds.zip")

    matches = db_module.search("jdoe")
    assert len(matches) == 1

    record = matches[0]
    # Column -> expected value, checked in the same order as listed.
    expected = {
        "url": "intranet.testcorp.com",
        "username": "jdoe",
        "password": "s3cr3t",
        "severity": HIGH,
        "score": 30,
        "source": "mychan",
        "filename": "creds.zip",
        "seen_before": 0,
    }
    for column, value in expected.items():
        assert record[column] == value
|
||||
|
||||
|
||||
def test_insert_seen_before_flag():
    """A hit inserted with seen_before=True reads back with seen_before == 1."""
    db_module.insert_hits([make_hit()], source="chan", filename="f.txt", seen_before=True)
    matches = db_module.search("testcorp")
    first_row = matches[0]
    assert first_row["seen_before"] == 1
|
||||
|
||||
|
||||
# ─── search ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_search_finds_by_username():
    """search() finds a row from a fragment of its username."""
    full_username = "jdoe@testcorp.com"
    db_module.insert_hits([make_hit(username=full_username)], source="c", filename="f.txt")
    matches = db_module.search("jdoe")
    assert len(matches) == 1
    assert matches[0]["username"] == full_username
|
||||
|
||||
|
||||
def test_search_finds_by_url():
    """search() finds a row from a fragment of its URL."""
    url_hit = make_hit(url="admin.testcorp.com")
    db_module.insert_hits([url_hit], source="c", filename="f.txt")
    matches = db_module.search("admin.testcorp")
    assert len(matches) == 1
|
||||
|
||||
|
||||
def test_search_finds_by_raw():
    """search() finds a row from a token that appears only in its raw line."""
    raw_hit = make_hit(raw="raw_unique_token_xyz")
    db_module.insert_hits([raw_hit], source="c", filename="f.txt")
    matches = db_module.search("unique_token")
    assert len(matches) == 1
|
||||
|
||||
|
||||
def test_search_returns_empty_for_no_match():
    """search() returns an empty list when nothing matches the keyword."""
    db_module.insert_hits([make_hit()], source="c", filename="f.txt")
    matches = db_module.search("zzznomatch_xyz")
    assert matches == []
|
||||
|
||||
|
||||
def test_search_sorted_by_score_descending():
    """The first search() result must not score lower than the last one."""
    # The LOW hit goes in first, the CRITICAL hit second.
    low_hit = make_hit(severity=LOW)
    db_module.insert_hits([low_hit], source="c", filename="f.txt")
    critical_hit = make_hit(severity=CRITICAL, url="admin.testcorp.com")
    db_module.insert_hits([critical_hit], source="c", filename="f.txt")

    matches = db_module.search("testcorp")
    top_score = matches[0]["score"]
    bottom_score = matches[-1]["score"]
    assert top_score >= bottom_score
|
||||
|
||||
|
||||
# ─── by_severity ──────────────────────────────────────────────────────────────
|
||||
|
||||
def test_by_severity_returns_correct_severity():
    """by_severity(CRITICAL) returns only the CRITICAL row."""
    critical_hit = make_hit(severity=CRITICAL, url="admin.testcorp.com")
    low_hit = make_hit(severity=LOW)
    for hit in (critical_hit, low_hit):
        db_module.insert_hits([hit], source="c", filename="f.txt")

    matches = db_module.by_severity(CRITICAL)
    assert len(matches) == 1
    assert matches[0]["severity"] == CRITICAL
|
||||
|
||||
|
||||
def test_by_severity_excludes_duplicates():
    """Rows stored with seen_before=1 are kept for stats only and must not show up in by_severity()."""
    duplicate = make_hit(severity=HIGH, url="intranet.testcorp.com")
    db_module.insert_hits([duplicate], source="c", filename="f.txt", seen_before=True)
    matches = db_module.by_severity(HIGH)
    assert matches == []
|
||||
|
||||
|
||||
def test_by_severity_returns_empty_when_none():
    """by_severity() on a DB with no rows returns an empty list."""
    matches = db_module.by_severity(CRITICAL)
    assert matches == []
|
||||
|
||||
|
||||
# ─── stats ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_stats_counts_by_severity():
    """With one unique hit per severity, stats() reports 1 each, 4 total and no duplicates."""
    one_of_each = [
        make_hit(severity=CRITICAL, url="admin.testcorp.com"),
        make_hit(severity=HIGH, url="intranet.testcorp.com"),
        make_hit(severity=MEDIUM, url="app.testcorp.com"),
        make_hit(severity=LOW),
    ]
    # One insert_hits() call per hit, in CRITICAL -> LOW order.
    for hit in one_of_each:
        db_module.insert_hits([hit], source="c", filename="f.txt")

    summary = db_module.stats()
    expected = {
        "critical": 1,
        "high": 1,
        "medium": 1,
        "low": 1,
        "total": 4,
        "unique": 4,
        "duplicates": 0,
    }
    for key, value in expected.items():
        assert summary[key] == value
|
||||
|
||||
|
||||
def test_stats_separates_duplicates():
    """The same hit inserted once fresh and once as seen_before counts as 1 unique + 1 duplicate."""
    hit = make_hit()
    for already_seen in (False, True):
        db_module.insert_hits([hit], source="c", filename="f.txt", seen_before=already_seen)

    summary = db_module.stats()
    assert summary["total"] == 2
    assert summary["unique"] == 1
    assert summary["duplicates"] == 1
|
||||
|
||||
|
||||
def test_stats_severity_counts_exclude_duplicates():
    """Per-severity counters in stats() ignore rows stored as seen_before."""
    critical_hit = make_hit(severity=CRITICAL, url="admin.testcorp.com")
    for already_seen in (False, True):
        db_module.insert_hits([critical_hit], source="c", filename="f.txt", seen_before=already_seen)

    summary = db_module.stats()
    # Only the unique insert is counted.
    assert summary["critical"] == 1
|
||||
|
||||
|
||||
def test_stats_empty_db():
    """stats() on a DB with no rows reports zero totals and no top source."""
    summary = db_module.stats()
    for counter in ("total", "unique"):
        assert summary[counter] == 0
    assert summary["top_source"] is None
|
||||
|
||||
|
||||
def test_stats_top_source():
    """The source with the most inserted hits is reported as top_source."""
    # channelA gets two hits, channelB gets one.
    for channel in ("channelA", "channelA", "channelB"):
        db_module.insert_hits([make_hit()], source=channel, filename="f.txt")

    summary = db_module.stats()
    top = summary["top_source"]
    assert top["source"] == "channelA"
|
||||
|
||||
|
||||
# ─── recent ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_recent_respects_limit():
    """recent(limit=3) returns exactly three rows when five are stored."""
    raw_lines = [f"testcorp.com|user{i}|pass" for i in range(5)]
    for raw_line in raw_lines:
        db_module.insert_hits([make_hit(raw=raw_line)], source="c", filename="f.txt")

    latest = db_module.recent(limit=3)
    assert len(latest) == 3
|
||||
|
||||
|
||||
def test_recent_returns_all_when_under_limit():
    """recent() returns every stored row when fewer rows exist than the limit."""
    for _ in range(2):
        db_module.insert_hits([make_hit()], source="c", filename="f.txt")

    latest = db_module.recent(limit=50)
    assert len(latest) == 2
|
||||
223
tests/test_processor.py
Normal file
223
tests/test_processor.py
Normal file
@@ -0,0 +1,223 @@
|
||||
"""
|
||||
Tests for core/processor.py — archive extraction and line-by-line search.
|
||||
|
||||
No Telegram deps, no async. Tests create real archive fixtures in tmp_path
|
||||
so process_file's cleanup guarantee can be verified against actual disk state.
|
||||
"""
|
||||
|
||||
import zipfile
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
from core.processor import compile_patterns, search_file, process_file
|
||||
|
||||
|
||||
@pytest.fixture
def patterns():
    """Compiled pattern list for the single test keyword ``testcorp\\.com``."""
    keywords = [r"testcorp\.com"]
    return compile_patterns(keywords)
|
||||
|
||||
|
||||
# ─── compile_patterns ─────────────────────────────────────────────────────────
|
||||
|
||||
class TestCompilePatterns:
    """compile_patterns() turns keyword strings into searchable pattern objects."""

    def test_returns_case_insensitive_patterns(self):
        """A lower-case keyword matches upper- and mixed-case text."""
        pattern = compile_patterns([r"hello"])[0]
        for text in ("HELLO", "Hello"):
            assert pattern.search(text) is not None

    def test_multiple_patterns(self):
        """One pattern is returned per keyword, in input order."""
        compiled = compile_patterns([r"alpha", r"beta"])
        assert len(compiled) == 2
        alpha, beta = compiled[0], compiled[1]
        assert alpha.search("alpha_line")
        assert beta.search("beta_line")

    def test_empty_list(self):
        """No keywords in, no patterns out."""
        compiled = compile_patterns([])
        assert compiled == []
|
||||
|
||||
|
||||
# ─── search_file ──────────────────────────────────────────────────────────────
|
||||
|
||||
class TestSearchFile:
    """search_file() returns the lines of a text file that match the patterns."""

    def test_returns_matching_lines(self, tmp_path, patterns):
        """Only the line containing the keyword is returned."""
        combo = tmp_path / "combo.txt"
        combo.write_text("testcorp.com|user|pass\nothersite.com|user|pass\n")
        found = search_file(combo, patterns)
        assert found == ["testcorp.com|user|pass"]

    def test_returns_empty_when_no_match(self, tmp_path, patterns):
        """A file without any matching line yields an empty list."""
        combo = tmp_path / "combo.txt"
        combo.write_text("nomatch.com|user|pass\nanother.net|x|y\n")
        found = search_file(combo, patterns)
        assert found == []

    def test_strips_whitespace_from_returned_lines(self, tmp_path, patterns):
        """Surrounding whitespace is removed from a returned line."""
        combo = tmp_path / "combo.txt"
        combo.write_text(" testcorp.com|user|pass \n")
        found = search_file(combo, patterns)
        assert found[0] == "testcorp.com|user|pass"

    def test_skips_blank_lines(self, tmp_path, patterns):
        """Blank lines around a match do not appear in the result."""
        combo = tmp_path / "combo.txt"
        combo.write_text("\n\ntestcorp.com|user|pass\n\n")
        found = search_file(combo, patterns)
        assert found == ["testcorp.com|user|pass"]

    def test_handles_encoding_errors_gracefully(self, tmp_path, patterns):
        """Combo files are often messy, so invalid bytes must not crash the search."""
        combo = tmp_path / "combo.txt"
        payload = b"".join(
            [
                b"testcorp.com|user1|pass\n",
                b"\xff\xfe invalid bytes here\n",
                b"testcorp.com|user2|pass\n",
            ]
        )
        combo.write_bytes(payload)
        found = search_file(combo, patterns)
        assert len(found) == 2

    def test_multiple_matching_lines_all_returned(self, tmp_path, patterns):
        """Both matching lines are returned even with a non-matching line between them."""
        combo = tmp_path / "combo.txt"
        content = (
            "testcorp.com|alice|pass1\n"
            "nomatch.com|bob|pass2\n"
            "testcorp.com|carol|pass3\n"
        )
        combo.write_text(content)
        found = search_file(combo, patterns)
        assert len(found) == 2
|
||||
|
||||
|
||||
# ─── process_file — plain .txt ────────────────────────────────────────────────
|
||||
|
||||
class TestProcessFilePlainText:
    """process_file() on a plain .txt input: returns hits and removes the file."""

    def test_returns_hits(self, tmp_path, patterns):
        """The matching line is returned; the non-matching one is not."""
        combo = tmp_path / "combo.txt"
        combo.write_text("testcorp.com|user|pass\nnomatch.com|x|y\n")
        found = process_file(combo, patterns)
        assert found == ["testcorp.com|user|pass"]

    def test_deletes_file_after_processing(self, tmp_path, patterns):
        """The input file no longer exists once process_file() returns."""
        combo = tmp_path / "combo.txt"
        combo.write_text("testcorp.com|user|pass\n")
        process_file(combo, patterns)
        assert not combo.exists()

    def test_deletes_file_even_with_no_hits(self, tmp_path, patterns):
        """The input file is removed even when nothing matched."""
        combo = tmp_path / "combo.txt"
        combo.write_text("nomatch.com|x|y\n")
        found = process_file(combo, patterns)
        assert found == []
        assert not combo.exists()
|
||||
|
||||
|
||||
# ─── process_file — .zip extraction ──────────────────────────────────────────
|
||||
|
||||
class TestProcessFileZip:
    """process_file() on .zip input: extraction, hits and cleanup."""

    def _make_zip(self, tmp_path: Path, content: str, filename="content.txt") -> Path:
        """Create tmp_path/combo.zip containing one text member and return its path."""
        member = tmp_path / filename
        member.write_text(content)
        archive_path = tmp_path / "combo.zip"
        with zipfile.ZipFile(archive_path, "w") as archive:
            archive.write(member, filename)
        # Drop the loose text file so only the archive is left on disk.
        member.unlink()
        return archive_path

    def test_extracts_and_returns_hits(self, tmp_path, patterns):
        """Matching lines inside the archive are returned."""
        archive_path = self._make_zip(tmp_path, "testcorp.com|user|pass\nnomatch.com|x|y\n")
        found = process_file(archive_path, patterns)
        assert found == ["testcorp.com|user|pass"]

    def test_deletes_zip_after_processing(self, tmp_path, patterns):
        """The archive itself is gone after processing."""
        archive_path = self._make_zip(tmp_path, "testcorp.com|user|pass\n")
        process_file(archive_path, patterns)
        assert not archive_path.exists()

    def test_deletes_extract_dir_after_processing(self, tmp_path, patterns):
        """The sibling directory named after the zip stem does not remain."""
        archive_path = self._make_zip(tmp_path, "testcorp.com|user|pass\n")
        extract_dir = tmp_path / "combo"
        process_file(archive_path, patterns)
        assert not extract_dir.exists()

    def test_no_hits_still_cleans_up(self, tmp_path, patterns):
        """Archive and extract directory are removed even without any hit."""
        archive_path = self._make_zip(tmp_path, "nomatch.com|x|y\n")
        extract_dir = tmp_path / "combo"
        process_file(archive_path, patterns)
        for leftover in (archive_path, extract_dir):
            assert not leftover.exists()

    def test_zip_with_multiple_txt_files(self, tmp_path, patterns):
        """Hits from every text member of the archive are collected."""
        members = {
            "a.txt": "testcorp.com|alice|pass\n",
            "b.txt": "testcorp.com|bob|pass\n",
        }
        for member_name, content in members.items():
            (tmp_path / member_name).write_text(content)

        archive_path = tmp_path / "combo.zip"
        with zipfile.ZipFile(archive_path, "w") as archive:
            for member_name in members:
                archive.write(tmp_path / member_name, member_name)
        for member_name in members:
            (tmp_path / member_name).unlink()

        found = process_file(archive_path, patterns)
        assert len(found) == 2
|
||||
|
||||
|
||||
# ─── process_file — nested archives ──────────────────────────────────────────
|
||||
|
||||
class TestProcessFileNested:
    """process_file() on an archive that contains another archive."""

    def test_nested_zip_is_recursed(self, tmp_path, patterns):
        """A hit inside inner.zip (itself inside outer.zip) is found and everything is cleaned up."""
        payload = tmp_path / "inner.txt"
        payload.write_text("testcorp.com|user|pass\n")

        inner_archive = tmp_path / "inner.zip"
        with zipfile.ZipFile(inner_archive, "w") as archive:
            archive.write(payload, "inner.txt")
        payload.unlink()

        outer_archive = tmp_path / "outer.zip"
        with zipfile.ZipFile(outer_archive, "w") as archive:
            archive.write(inner_archive, "inner.zip")
        inner_archive.unlink()

        found = process_file(outer_archive, patterns)
        assert found == ["testcorp.com|user|pass"]
        assert not outer_archive.exists()
        leftover_dir = tmp_path / "outer"
        assert not leftover_dir.exists()
|
||||
|
||||
|
||||
# ─── process_file — password-protected .7z ───────────────────────────────────
|
||||
|
||||
class TestProcessFile7zPassword:
    """process_file() on password-protected .7z archives (skipped without py7zr)."""

    @staticmethod
    def _make_encrypted_7z(tmp_path, password):
        """
        Create tmp_path/combo.7z holding one matching line, encrypted with *password*.

        Skips the calling test when py7zr cannot be imported.
        """
        py7zr = pytest.importorskip("py7zr", reason="py7zr not installed")

        txt = tmp_path / "content.txt"
        txt.write_text("testcorp.com|user|pass\n")
        szf = tmp_path / "combo.7z"
        with py7zr.SevenZipFile(szf, "w", password=password) as z:
            z.write(txt, "content.txt")
        # Leave only the archive on disk.
        txt.unlink()
        return szf

    def test_unlocks_with_correct_password(self, tmp_path, patterns, monkeypatch):
        """An archive whose password is in ARCHIVE_PASSWORDS is opened, searched and removed."""
        szf = self._make_encrypted_7z(tmp_path, "secretpwd")

        import core.processor as proc_module

        # Isolate to a single known password so the test doesn't depend on config
        monkeypatch.setattr(proc_module, "ARCHIVE_PASSWORDS", [b"secretpwd"])

        hits = process_file(szf, patterns)
        assert hits == ["testcorp.com|user|pass"]
        assert not szf.exists()

    def test_skips_when_no_password_matches(self, tmp_path, patterns, monkeypatch):
        """An archive that none of the ARCHIVE_PASSWORDS opens yields no hits."""
        szf = self._make_encrypted_7z(tmp_path, "correctpwd")

        import core.processor as proc_module

        monkeypatch.setattr(proc_module, "ARCHIVE_PASSWORDS", [b"wrongpwd"])

        # No hits — archive could not be opened
        hits = process_file(szf, patterns)
        assert hits == []
|
||||
282
tests/test_scorer.py
Normal file
282
tests/test_scorer.py
Normal file
@@ -0,0 +1,282 @@
|
||||
"""
|
||||
Tests for utils/scorer.py — severity scoring and ULP line parsing.
|
||||
|
||||
All tests except TestULPParsingRealWorld use the `patched_keywords` fixture (see conftest.py), which
|
||||
replaces TARGET_KEYWORDS with two entries:
|
||||
@testcorp.com — employee email domain (CRITICAL trigger)
|
||||
testcorp.com — plain domain match (LOW baseline)
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from utils.scorer import score_hit, score_hits, summarize, CRITICAL, HIGH, MEDIUM, LOW
|
||||
|
||||
|
||||
# ─── ULP line parsing ─────────────────────────────────────────────────────────
|
||||
|
||||
class TestULPParsing:
    """score_hit() splits a ULP line into url / username / password fields."""

    def test_parses_pipe_separated_fields(self, patched_keywords):
        """Pipe-separated lines yield the three fields in order."""
        hit = score_hit("site.com|jdoe@testcorp.com|pass123")
        parsed = (hit.url, hit.username, hit.password)
        assert parsed[0] == "site.com"
        assert parsed[1] == "jdoe@testcorp.com"
        assert parsed[2] == "pass123"

    def test_parses_colon_separated_fields(self, patched_keywords):
        """Colon-separated lines yield the three fields in order."""
        # 'site.com' contains no colon, so the url field captures it cleanly.
        hit = score_hit("site.com:jdoe@testcorp.com:pass123")
        parsed = (hit.url, hit.username, hit.password)
        assert parsed[0] == "site.com"
        assert parsed[1] == "jdoe@testcorp.com"
        assert parsed[2] == "pass123"

    def test_malformed_line_yields_none_fields(self, patched_keywords):
        """A line without separators leaves all three fields as None."""
        hit = score_hit("justaplaindomainmatch_testcorp.com")
        for field in (hit.url, hit.username, hit.password):
            assert field is None

    def test_raw_field_preserved_exactly(self, patched_keywords):
        """The original line is kept unchanged in hit.raw."""
        original = "site.com|jdoe@testcorp.com|pass123"
        assert score_hit(original).raw == original
|
||||
|
||||
|
||||
# ─── Real-world ULP format coverage ──────────────────────────────────────────
|
||||
|
||||
class TestULPParsingRealWorld:
    """
    Field-extraction checks against realistic stealer-log line shapes.

    Each case asserts only url / username / password (never severity),
    which is why the patched_keywords fixture is not requested here.
    """

    # (line, expected url, expected username, expected password)
    _CASES = [
        # Protocol + port + path, colon separator.
        # The port is digits followed by '/', so it belongs to the URL.
        ("http://portal.fakehosp.example.com:88/:55512309-1:hunter2",
         "http://portal.fakehosp.example.com:88/", "55512309-1", "hunter2"),
        ("http://portal.fakehosp.example.com:8085/app/booking/:3:letmein",
         "http://portal.fakehosp.example.com:8085/app/booking/", "3", "letmein"),
        ("https://portal.fakehosp.example.com:81/app/FrmResetPassword.aspx:30219876-K:Spr!ng22@",
         "https://portal.fakehosp.example.com:81/app/FrmResetPassword.aspx",
         "30219876-K", "Spr!ng22@"),

        # Protocol, no port; the ID-style username looks like a port but has a hyphen.
        # ':\d+-' must NOT be taken as a port (no '/' follows the digits).
        ("https://booking.fakehosp.example.com:40293817-6:Summ3r99..",
         "https://booking.fakehosp.example.com", "40293817-6", "Summ3r99.."),
        ("https://booking.fakehosp.example.com/:40293817-6:Summ3r99..",
         "https://booking.fakehosp.example.com/", "40293817-6", "Summ3r99.."),

        # Protocol + email username directly after the host or path.
        ("https://booking.fakehosp.example.com:carlos.gomez@gmail.com:Qwerty99",
         "https://booking.fakehosp.example.com", "carlos.gomez@gmail.com", "Qwerty99"),
        ("https://accounts.saas-vendor.example.com/signin:jdoe@fakehosp.example.com:W1nter20",
         "https://accounts.saas-vendor.example.com/signin", "jdoe@fakehosp.example.com", "W1nter20"),
        ("https://login.sso-provider.example.com/common/oauth2/authorize:jdoe@fakehosp.example.com:Passw0rd!",
         "https://login.sso-provider.example.com/common/oauth2/authorize",
         "jdoe@fakehosp.example.com", "Passw0rd!"),

        # Pipe separator (unambiguous, so the port stays in the URL).
        ("http://portal.fakehosp.example.com:88/|22.987.654-3|florida88",
         "http://portal.fakehosp.example.com:88/", "22.987.654-3", "florida88"),
        ("https://booking.fakehosp.example.com/|77341209-0|Ninja42",
         "https://booking.fakehosp.example.com/", "77341209-0", "Ninja42"),

        # Mixed separators: pipe after the URL, colon between user and password.
        ("http://portal.fakehosp.example.com:8085/app/booking/|Z:wd1980wd",
         "http://portal.fakehosp.example.com:8085/app/booking/", "Z", "wd1980wd"),

        # No protocol, port in URL.
        ("portal.fakehosp.example.com:88/:22.987.654-3:florida88",
         "portal.fakehosp.example.com:88/", "22.987.654-3", "florida88"),

        # No protocol, no port: plain colon separators.
        ("booking.fakehosp.example.com:66778899-7:correcthorse",
         "booking.fakehosp.example.com", "66778899-7", "correcthorse"),
        ("booking.fakehosp.example.com/:smithjohnathan:Bb881955",
         "booking.fakehosp.example.com/", "smithjohnathan", "Bb881955"),

        # Password with special characters.
        ("https://booking.fakehosp.example.com/:11223344-5:dragonball99*",
         "https://booking.fakehosp.example.com/", "11223344-5", "dragonball99*"),
        ("https://booking.fakehosp.example.com/:9988776-65:abc.456#",
         "https://booking.fakehosp.example.com/", "9988776-65", "abc.456#"),

        # Semicolon separator.
        ("booking.fakehosp.example.com;smithjohnathan;Bb881955",
         "booking.fakehosp.example.com", "smithjohnathan", "Bb881955"),
    ]

    @pytest.mark.parametrize(("line", "exp_url", "exp_user", "exp_pass"), _CASES)
    def test_real_world_ulp_parsing(self, line, exp_url, exp_user, exp_pass):
        """score_hit() extracts the expected url, username and password from *line*."""
        parsed = score_hit(line)
        assert parsed.url == exp_url, f"URL mismatch for: {line!r}"
        assert parsed.username == exp_user, f"Username mismatch for: {line!r}"
        assert parsed.password == exp_pass, f"Password mismatch for: {line!r}"
|
||||
|
||||
|
||||
# ─── Severity classification ──────────────────────────────────────────────────
|
||||
|
||||
class TestSeverityClassification:
    """Severity assigned by score_hit() under the patched test keywords."""

    def test_employee_email_in_username_is_critical(self, patched_keywords):
        """An employee-domain email as the username is CRITICAL."""
        hit = score_hit("site.com|jdoe@testcorp.com|pass123")
        assert hit.severity == CRITICAL

    def test_gmail_on_org_url_is_not_critical(self, patched_keywords):
        """
        Core documented footgun: org domain appears in the URL, but the
        credential username is a gmail address. Must NOT be CRITICAL.
        The employee-domain pattern requires a literal '@' before the domain,
        so 'testcorp.com' in the URL field never triggers it.
        """
        hit = score_hit("testcorp.com|user@gmail.com|pass123")
        assert hit.severity != CRITICAL

    def test_critical_service_subdomain_is_critical(self, patched_keywords):
        """The admin.* subdomain is classified CRITICAL."""
        hit = score_hit("admin.testcorp.com|user|pass123")
        assert hit.severity == CRITICAL

    def test_vpn_subdomain_is_critical(self, patched_keywords):
        """The vpn.* subdomain is classified CRITICAL."""
        hit = score_hit("vpn.testcorp.com|user|pass123")
        assert hit.severity == CRITICAL

    def test_gitlab_subdomain_is_critical(self, patched_keywords):
        """The gitlab.* subdomain is classified CRITICAL."""
        hit = score_hit("gitlab.testcorp.com|user|pass123")
        assert hit.severity == CRITICAL

    def test_intranet_subdomain_is_high(self, patched_keywords):
        """The intranet.* subdomain is classified HIGH."""
        hit = score_hit("intranet.testcorp.com|user|pass123")
        assert hit.severity == HIGH

    def test_sso_subdomain_is_high(self, patched_keywords):
        """The sso.* subdomain is classified HIGH."""
        hit = score_hit("sso.testcorp.com|user|pass123")
        assert hit.severity == HIGH

    def test_app_subdomain_is_medium(self, patched_keywords):
        """The app.* subdomain is classified MEDIUM."""
        hit = score_hit("app.testcorp.com|user|pass123")
        assert hit.severity == MEDIUM

    def test_booking_subdomain_is_medium(self, patched_keywords):
        """The booking.* subdomain is classified MEDIUM."""
        hit = score_hit("booking.testcorp.com|user|pass123")
        assert hit.severity == MEDIUM

    def test_plain_domain_match_is_low(self, patched_keywords):
        """A bare org-domain match with no special subdomain is LOW."""
        hit = score_hit("testcorp.com|user|pass123")
        assert hit.severity == LOW

    def test_employee_email_beats_high_service(self, patched_keywords):
        """Employee email domain must win over a HIGH service classification."""
        hit = score_hit("intranet.testcorp.com|jdoe@testcorp.com|pass")
        assert hit.severity == CRITICAL

    def test_employee_email_beats_medium_service(self, patched_keywords):
        """Employee email domain must win over a MEDIUM service classification."""
        hit = score_hit("app.testcorp.com|jdoe@testcorp.com|pass")
        assert hit.severity == CRITICAL

    def test_multiple_checks_accumulate_reasons(self, patched_keywords):
        """A line matching both employee email and a critical service URL collects both reasons."""
        hit = score_hit("admin.testcorp.com|jdoe@testcorp.com|pass")
        assert hit.severity == CRITICAL
        assert len(hit.reasons) >= 2

    # Parametrized so that every severity level is reported independently; a
    # single in-test loop would stop at the first failing line.
    @pytest.mark.parametrize(
        ("line", "expected_severity"),
        [
            ("admin.testcorp.com|user|pass", CRITICAL),
            ("intranet.testcorp.com|user|pass", HIGH),
            ("app.testcorp.com|user|pass", MEDIUM),
            ("testcorp.com|user|pass", LOW),
        ],
    )
    def test_score_matches_severity(self, patched_keywords, line, expected_severity):
        """hit.score equals the SEVERITY_SCORES entry for the hit's severity."""
        from utils.scorer import SEVERITY_SCORES

        hit = score_hit(line)
        assert hit.severity == expected_severity, f"Severity mismatch for: {line!r}"
        assert hit.score == SEVERITY_SCORES[expected_severity], f"Score mismatch for: {line!r}"
|
||||
|
||||
|
||||
# ─── Weak password flags ──────────────────────────────────────────────────────
|
||||
|
||||
class TestWeakPasswordFlags:
    """Password-quality reasons attached by score_hit()."""

    def test_short_password_adds_reason(self, patched_keywords):
        """A three-character password produces a 'Weak password' reason."""
        reasons = score_hit("testcorp.com|user|abc").reasons
        flagged = [r for r in reasons if "Weak password" in r]
        assert flagged

    def test_common_password_adds_reason(self, patched_keywords):
        """The password 'password' produces a 'Common password' reason."""
        reasons = score_hit("testcorp.com|user|password").reasons
        flagged = [r for r in reasons if "Common password" in r]
        assert flagged

    def test_weak_password_does_not_escalate_severity(self, patched_keywords):
        """Weak-password flags are informational and must not change severity."""
        hit = score_hit("testcorp.com|user|abc")
        assert hit.severity == LOW

    def test_strong_password_adds_no_warning(self, patched_keywords):
        """A long mixed-character password yields no password-related reason."""
        hit = score_hit("testcorp.com|user|Xk9#mP2qLrTv")
        for reason in hit.reasons:
            lowered = reason.lower()
            # Ignore reasons about employee / domain / service classification.
            if "Employee" in reason or "domain" in lowered or "service" in lowered:
                continue
            assert "password" not in lowered
|
||||
|
||||
|
||||
# ─── score_hits and summarize ─────────────────────────────────────────────────
|
||||
|
||||
class TestScoreHitsAndSummarize:
    """Batch scoring (score_hits) and per-severity counting (summarize)."""

    def test_score_hits_sorted_descending(self, patched_keywords):
        """score_hits() returns hits ordered from highest to lowest score."""
        shuffled_lines = [
            "testcorp.com|user|pass",  # LOW
            "admin.testcorp.com|user|pass",  # CRITICAL
            "intranet.testcorp.com|user|pass",  # HIGH
            "app.testcorp.com|user|pass",  # MEDIUM
        ]
        observed = [hit.score for hit in score_hits(shuffled_lines)]
        assert observed == sorted(observed, reverse=True)

    def test_summarize_counts_each_severity(self, patched_keywords):
        """One line per severity gives a count of 1 for every severity."""
        one_per_level = [
            "admin.testcorp.com|user|pass",  # CRITICAL
            "intranet.testcorp.com|user|pass",  # HIGH
            "app.testcorp.com|user|pass",  # MEDIUM
            "testcorp.com|user|pass",  # LOW
        ]
        counts = summarize(score_hits(one_per_level))
        for level in (CRITICAL, HIGH, MEDIUM, LOW):
            assert counts[level] == 1

    def test_summarize_zero_for_absent_severities(self, patched_keywords):
        """Severities with no hits are present in the summary with a count of 0."""
        only_low = score_hits(["testcorp.com|user|pass"])
        counts = summarize(only_low)
        for level in (CRITICAL, HIGH, MEDIUM):
            assert counts[level] == 0
        assert counts[LOW] == 1

    def test_score_hits_empty_list(self, patched_keywords):
        """Scoring no lines returns an empty list."""
        scored = score_hits([])
        assert scored == []
|
||||
1
tui/__init__.py
Normal file
1
tui/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""tui — Textual TUI frontend and event bus."""
|
||||
130
tui/app.md
Normal file
130
tui/app.md
Normal file
@@ -0,0 +1,130 @@
|
||||
# tui/app.py
|
||||
|
||||
Textual TUI frontend. Entry point: `run_tui()`.
|
||||
|
||||
## Entry point
|
||||
|
||||
```python
|
||||
from tui.app import run_tui
|
||||
run_tui() # called by main.py
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Screen hierarchy
|
||||
|
||||
```
|
||||
MonitorApp (App)
|
||||
├── [default screen]
|
||||
│ ├── Header
|
||||
│ ├── #top-row (Horizontal)
|
||||
│ │ ├── DownloadPanel #dl-panel
|
||||
│ │ └── HitsPanel #hits-panel
|
||||
│ ├── StatsPanel #stats-panel
|
||||
│ ├── ChannelPanel #ch-panel
|
||||
│ └── Footer
|
||||
├── SearchScreen (push/pop via 's')
|
||||
├── HitsDBScreen (push/pop via 'h')
|
||||
└── KeywordsScreen (push/pop via 'k')
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## MonitorApp
|
||||
|
||||
### Threading model
|
||||
- **Bot backend** → `threading.Thread(daemon=True)` with its own `asyncio.new_event_loop()`
|
||||
Runs `_bot_main()` — Telethon is completely isolated from Textual's loop.
|
||||
- **TUI drain** → `set_interval(0.1, _drain_bus)` — polls `queue.Queue` every 100ms on Textual's loop.
|
||||
|
||||
### Key methods
|
||||
|
||||
| Method | Description |
|
||||
|--------|-------------|
|
||||
| `on_mount()` | Calls `bus.init_bus()`, starts bot thread, sets drain interval |
|
||||
| `_drain_bus()` | Drains all pending events from `queue.Queue`, dispatches to widgets |
|
||||
| `_run_bot_thread()` | Thread entry: creates event loop, runs `_bot_main()` |
|
||||
| `_bot_main()` | Async bot backend: connect, auth, backfill, live handler loop |
|
||||
| `_signal_channel_changed()` | Thread-safely sets the bot loop's `asyncio.Event` via `call_soon_threadsafe` |
|
||||
|
||||
### Keybindings
|
||||
|
||||
| Key | Action |
|
||||
|-----|--------|
|
||||
| `s` | Push `SearchScreen` |
|
||||
| `h` | Push `HitsDBScreen` |
|
||||
| `k` | Push `KeywordsScreen` |
|
||||
| `c` | Clear download + hits logs |
|
||||
| `r` | Force-refresh stats bar |
|
||||
| `q` / `ctrl+c` | Quit |
|
||||
|
||||
---
|
||||
|
||||
## Widgets
|
||||
|
||||
### DownloadPanel
|
||||
Left panel. Two `RichLog` widgets separated by a dashed line:
|
||||
- **top** (`#tdl-out`): raw tdl output lines (ANSI stripped)
|
||||
- **bottom** (`#dl-log`): structured download status entries
|
||||
|
||||
Methods: `tdl_line(line)`, `queued(filename, size_mb, source, password)`, `status(filename, state, via)`, `clear_logs()`
|
||||
|
||||
States for `status()`: `queued` · `downloading` · `done_tdl` · `done_tel` · `failed`
|
||||
|
||||
### HitsPanel
|
||||
Right panel. Single `RichLog` with color-coded hit entries.
|
||||
Reactive `hit_count` updates the panel title badge automatically.
|
||||
|
||||
Methods: `add_hit(severity, raw, source, filename, reasons)`, `clear_log()`
|
||||
|
||||
### StatsPanel
|
||||
Slim horizontal bar. Polls `utils.database.stats()` every 10s via `set_interval`.
|
||||
Also refreshed immediately on each `EvHit` event.
|
||||
|
||||
### ChannelPanel
|
||||
Bottom panel. `ListView` + `Input` + buttons.
|
||||
Add/remove posts `EvChannelAdded` / `EvChannelRemoved` onto the bus.
|
||||
Changes take effect immediately (the message handler is re-registered) but are not written back to `config.py`.
|
||||
|
||||
---
|
||||
|
||||
## Screens
|
||||
|
||||
### SearchScreen (`s`)
|
||||
- Text input → queries `utils.database.search(keyword)`
|
||||
- Results in a `DataTable` with columns: Sev, Time, URL, Username, Password, Source, File
|
||||
- Submit with `↵` or Search button; `Escape` to dismiss
|
||||
|
||||
### HitsDBScreen (`h`)
|
||||
- Toolbar buttons + number keys filter by severity
|
||||
- `r` → recent 50, `1`→CRITICAL, `2`→HIGH, `3`→MEDIUM, `4`→LOW
|
||||
- Calls `utils.database.recent()` / `by_severity()`
|
||||
|
||||
### KeywordsScreen (`k`)
|
||||
- Live-edit `config.TARGET_KEYWORDS`
|
||||
- Validates regex before adding
|
||||
- On change: rebuilds `utils.scorer.EMPLOYEE_DOMAINS` and `ORG_DOMAINS`
|
||||
- Bot handler recompiles patterns on the next incoming message automatically
|
||||
- **Changes are in-memory only** — copy to `config.py` to persist
|
||||
|
||||
---
|
||||
|
||||
## Bot auth flow (`_bot_main`)
|
||||
|
||||
```
|
||||
await bot_client.connect()
|
||||
if not await bot_client.is_user_authorized() → sign_in(bot_token=...)
|
||||
await user_client.connect()
|
||||
if not await user_client.is_user_authorized() → log error + return (must run --no-tui first)
|
||||
warm_entity_cache()
|
||||
_make_handler(channels) ← registers NewMessage handler
|
||||
backfill_all()
|
||||
run_until_disconnected() ┐
|
||||
_watch_channels() ┘ gathered
|
||||
```
|
||||
|
||||
Channel-change signal path:
|
||||
```
|
||||
ChannelPanel button → EvChannel* on bus → _drain_bus → _signal_channel_changed()
|
||||
→ call_soon_threadsafe(asyncio.Event.set) → _watch_channels() wakes → _make_handler()
|
||||
```
|
||||
1016
tui/app.py
Normal file
1016
tui/app.py
Normal file
File diff suppressed because it is too large
Load Diff
66
tui/events.md
Normal file
66
tui/events.md
Normal file
@@ -0,0 +1,66 @@
|
||||
# tui/events.py
|
||||
|
||||
Thread-safe event bus between the bot backend thread and the Textual TUI.
|
||||
The bot thread calls `post()`. The TUI drains the queue every 100ms via `_drain_bus()`.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from tui import events as bus # from core/ and tui/app.py
|
||||
from tui.events import post, init_bus, get_bus, tui_active
|
||||
```
|
||||
|
||||
### `init_bus() -> queue.Queue`
|
||||
Creates the `queue.Queue`. Called inside `MonitorApp.on_mount()` — **must run on Textual's event loop**, not before `App.run()`.
|
||||
|
||||
### `post(event: Any) -> None`
|
||||
Fire-and-forget from any thread. Silently drops if bus not initialised.
|
||||
Uses `queue.Queue.put_nowait()` — never blocks.
|
||||
|
||||
### `get_bus() -> queue.Queue | None`
|
||||
Returns the queue for the TUI consumer to drain.
|
||||
|
||||
### `tui_active: bool`
|
||||
Set to `True` by `init_bus()`. Checked by `core/tdl_downloader.py` to decide whether to pipe tdl output or inherit the terminal.
|
||||
|
||||
---
|
||||
|
||||
## Event types
|
||||
|
||||
| Class | Fields | Posted by | Consumed by |
|
||||
|-------|--------|-----------|-------------|
|
||||
| `EvDownloadQueued` | `batch_id, filename, size_mb, source, password` | `tdl_downloader`, `scraper` | `DownloadPanel.queued()` |
|
||||
| `EvDownloadStarted` | `batch_id, filename` | `tdl_downloader`, `scraper` | `DownloadPanel.status("downloading")` |
|
||||
| `EvDownloadDone` | `batch_id, filename, via` | `tdl_downloader`, `scraper` | `DownloadPanel.status("done_tdl"\|"done_tel")` |
|
||||
| `EvDownloadFailed` | `batch_id, filename, reason` | `tdl_downloader`, `scraper` | `DownloadPanel.status("failed")` |
|
||||
| `EvTdlOutput` | `line` | `tdl_downloader._relay()` | `DownloadPanel.tdl_line()` |
|
||||
| `EvHit` | `severity, raw, source, filename, reasons` | `notifier.notify()` | `HitsPanel.add_hit()` + `StatsPanel.refresh_stats()` |
|
||||
| `EvChannelAdded` | `channel` | `ChannelPanel.on_button_pressed()` | `_drain_bus` → `_signal_channel_changed()` |
|
||||
| `EvChannelRemoved` | `channel` | `ChannelPanel.on_button_pressed()` | `_drain_bus` → `_signal_channel_changed()` |
|
||||
| `EvStatus` | `text, level` | everywhere | `MonitorApp.notify()` toast |
|
||||
|
||||
`level` on `EvStatus`: `"info"` (default) · `"warning"` · `"error"`
|
||||
|
||||
---
|
||||
|
||||
## Threading model
|
||||
|
||||
```
|
||||
Bot thread (own asyncio loop)
|
||||
└─ bus.post(event) ← queue.Queue.put_nowait() [thread-safe]
|
||||
↓
|
||||
queue.Queue
|
||||
↓
|
||||
Textual thread (Textual's loop)
|
||||
└─ _drain_bus() [set_interval 100ms]
|
||||
└─ q.get_nowait() loop
|
||||
└─ dispatch to widgets [safe, same thread as Textual]
|
||||
```
|
||||
|
||||
Channel changes flow the other way:
|
||||
```
|
||||
_drain_bus sees EvChannelAdded/Removed
|
||||
→ _signal_channel_changed()
|
||||
→ loop.call_soon_threadsafe(asyncio.Event.set)
|
||||
→ bot thread's _watch_channels() wakes
|
||||
```
|
||||
114
tui/events.py
Normal file
114
tui/events.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""
|
||||
tui/events.py — Thread-safe event bus between the bot backend and the TUI.
|
||||
|
||||
The bot backend runs in a dedicated thread with its own asyncio event loop
|
||||
(completely isolated from Textual's loop). Events are posted via a standard
|
||||
queue.Queue (thread-safe), and the TUI consumer polls it from Textual's loop
|
||||
every 100 ms via _drain_bus(), which empties it with get_nowait().
|
||||
|
||||
post() is safe to call from any thread or any asyncio loop.
|
||||
"""
|
||||
|
||||
import queue
|
||||
import threading
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
# Thread-safe queue — works across the bot thread and Textual's thread.
|
||||
_queue: queue.Queue | None = None
|
||||
_queue_lock = threading.Lock()
|
||||
|
||||
# Set to True when the TUI is running so tdl pipes output instead of
|
||||
# writing directly to the terminal.
|
||||
tui_active: bool = False
|
||||
|
||||
|
||||
def init_bus() -> queue.Queue:
    """Create a fresh event queue and flag the TUI as active.

    Meant to be called once from MonitorApp.on_mount(). Each call replaces
    the module-level queue with a new, unbounded ``queue.Queue`` and sets
    ``tui_active`` to True.

    Returns:
        The newly created queue.
    """
    global _queue, tui_active
    bus = queue.Queue()
    _queue, tui_active = bus, True
    return bus
|
||||
|
||||
|
||||
def get_bus() -> queue.Queue | None:
    """Return the shared event queue, or None if init_bus() has not run yet."""
    return _queue
|
||||
|
||||
|
||||
def post(event: Any) -> None:
    """Enqueue *event* for the TUI without ever blocking the caller.

    Callable from any thread. The event is silently discarded when the bus
    has not been created yet, or when the queue reports it is full.
    """
    if _queue is None:
        return
    try:
        _queue.put_nowait(event)
    except queue.Full:
        # Best-effort delivery: losing an event is preferred over blocking
        # or raising inside the posting thread.
        pass
|
||||
|
||||
|
||||
# ─── Event types ──────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
class EvDownloadQueued:
|
||||
"""A file has been accepted and is waiting for tdl."""
|
||||
batch_id: str
|
||||
filename: str
|
||||
size_mb: float
|
||||
source: str
|
||||
password: str | None
|
||||
|
||||
|
||||
@dataclass
class EvDownloadStarted:
    """Posted when tdl has begun transferring the file."""

    batch_id: str
    filename: str
|
||||
|
||||
|
||||
@dataclass
class EvDownloadDone:
    """Posted when a file is fully downloaded (by tdl or the Telethon fallback)."""

    batch_id: str
    filename: str
    via: str  # "tdl" | "telethon"
|
||||
|
||||
|
||||
@dataclass
class EvDownloadFailed:
    """Posted when every download attempt for the file has failed."""

    batch_id: str
    filename: str
    reason: str
|
||||
|
||||
|
||||
@dataclass
class EvTdlOutput:
    """One line of tdl's stdout/stderr output (TUI mode only)."""

    line: str
|
||||
|
||||
|
||||
@dataclass
class EvHit:
    """A scored credential hit to be shown in the hits panel."""

    severity: str
    raw: str
    source: str
    filename: str
    # default_factory gives every event its own list instead of a shared one.
    reasons: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
|
||||
class EvChannelAdded:
|
||||
"""A channel was added to the live watch list."""
|
||||
channel: str | int
|
||||
|
||||
|
||||
@dataclass
|
||||
class EvChannelRemoved:
|
||||
"""A channel was removed from the live watch list."""
|
||||
channel: str | int
|
||||
|
||||
|
||||
@dataclass
class EvStatus:
    """Generic single-line status message (startup notices, errors, etc.)."""

    text: str
    level: str = "info"  # "info" | "warning" | "error"
|
||||
1
utils/__init__.py
Normal file
1
utils/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""utils — pure logic modules with no Telegram dependencies."""
|
||||
32
utils/cache.md
Normal file
32
utils/cache.md
Normal file
@@ -0,0 +1,32 @@
|
||||
# utils/cache.py
|
||||
|
||||
Tracks already-processed Telegram document IDs to avoid redownloading.
|
||||
Persists to `data/cache.json` as a JSON array of integers.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from utils.cache import is_seen, mark_seen
|
||||
```
|
||||
|
||||
### `is_seen(file_id: int) -> bool`
|
||||
Returns `True` if this document ID has been processed before.
|
||||
Loads from disk on every call, so it always reflects the current file contents, including IDs written by another process (slightly slow for hot loops — not an issue given download cadence).
|
||||
|
||||
### `mark_seen(file_id: int) -> None`
|
||||
Adds `file_id` to the cache and persists to disk.
|
||||
|
||||
---
|
||||
|
||||
## Storage
|
||||
|
||||
- **File:** `data/cache.json`
|
||||
- **Format:** JSON array of integers — `[123456789, 987654321, ...]`
|
||||
- **No expiry** — grows indefinitely. Safe to delete to re-process all files.
|
||||
|
||||
---
|
||||
|
||||
## Notes
|
||||
|
||||
- `is_seen` + `mark_seen` are called in `core/scraper.py` after a successful download+process cycle, not before — so a file that fails mid-process will be retried on next run.
|
||||
- Not thread-safe (load/modify/save is not atomic). Acceptable because downloads are sequential within the bot loop.
|
||||
38
utils/cache.py
Normal file
38
utils/cache.py
Normal file
@@ -0,0 +1,38 @@
|
||||
"""
|
||||
cache.py — Tracks already-processed file IDs to avoid redownloading.
|
||||
Persists to a simple JSON file on disk.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
CACHE_FILE = Path("./data/cache.json")
|
||||
|
||||
|
||||
def _load() -> set:
    """Read the set of already-processed file IDs from CACHE_FILE.

    Returns:
        The cached IDs, or an empty set when the cache file does not exist
        or cannot be read or parsed.
    """
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            return set(json.load(f))
    except FileNotFoundError:
        # First run, or the cache was deleted on purpose: nothing seen yet.
        return set()
    except (OSError, ValueError, TypeError) as exc:
        # Unreadable or corrupt cache (json.JSONDecodeError is a ValueError;
        # TypeError covers a payload that cannot be turned into a set). Keep
        # the best-effort fallback, but make it visible: an empty cache means
        # every file will be processed again.
        log.warning("Ignoring unreadable cache file %s: %s", CACHE_FILE, exc)
        return set()
|
||||
|
||||
|
||||
def _save(seen: set) -> None:
    """Persist *seen* to CACHE_FILE as a JSON array.

    The data is written to a temporary sibling file and then moved over the
    real cache, so an interrupted write cannot leave a truncated cache.json
    behind (which _load() would treat as "nothing seen"). The parent
    directory is created when missing.
    """
    CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(list(seen), f)
    # Same directory, hence same filesystem: the replace is a single rename.
    tmp_file.replace(CACHE_FILE)
|
||||
|
||||
|
||||
def is_seen(file_id: int) -> bool:
    """Return True when *file_id* is already recorded in the on-disk cache."""
    seen_ids = _load()
    return file_id in seen_ids
|
||||
|
||||
|
||||
def mark_seen(file_id: int) -> None:
    """Record *file_id* as processed and write the updated cache to disk."""
    seen_ids = _load()
    seen_ids |= {file_id}
    _save(seen_ids)
    log.debug(f" Cached file ID {file_id}")
|
||||
89
utils/database.md
Normal file
89
utils/database.md
Normal file
@@ -0,0 +1,89 @@
|
||||
# utils/database.py
|
||||
|
||||
SQLite persistence layer for credential hits.
|
||||
DB file: `data/hits.db`
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from utils.database import init_db, insert_hits, search, recent, by_severity, stats
|
||||
```
|
||||
|
||||
### Setup
|
||||
|
||||
#### `init_db() -> None`
|
||||
Creates `hits` table and indexes if they don't exist. Call once on startup.
|
||||
Safe to call multiple times (idempotent).
|
||||
|
||||
---
|
||||
|
||||
### Writing
|
||||
|
||||
#### `insert_hits(scored_hits, source, filename, seen_before=False) -> int`
|
||||
Inserts a list of `ScoredHit` objects. Returns row count inserted.
|
||||
|
||||
```python
|
||||
insert_hits(new_hits, source="channelname", filename="combo.zip")
|
||||
insert_hits(dupe_hits, source="channelname", filename="combo.zip", seen_before=True)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Querying
|
||||
|
||||
#### `search(keyword: str) -> list[sqlite3.Row]`
|
||||
Substring match (SQL `LIKE '%keyword%'`, not a full-text index) across `url`, `username`, `raw`. Returns rows sorted by score DESC, timestamp DESC.
|
||||
|
||||
#### `recent(limit: int = 50) -> list[sqlite3.Row]`
|
||||
Most recent hits, newest first.
|
||||
|
||||
#### `by_severity(severity: str) -> list[sqlite3.Row]`
|
||||
All unique (non-duplicate) hits at a given severity, newest first.
|
||||
`severity` must be one of: `"CRITICAL"`, `"HIGH"`, `"MEDIUM"`, `"LOW"`
|
||||
|
||||
#### `stats() -> dict`
|
||||
Returns summary counters:
|
||||
```python
|
||||
{
|
||||
"total": int, # all rows
|
||||
"unique": int, # seen_before=0
|
||||
"duplicates": int, # seen_before=1
|
||||
"critical": int, # unique CRITICAL
|
||||
"high": int,
|
||||
"medium": int,
|
||||
"low": int,
|
||||
"sources": int, # distinct source channels
|
||||
"top_source": {"source": str, "cnt": int} | None,
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Schema
|
||||
|
||||
```sql
|
||||
hits (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
url TEXT,
|
||||
username TEXT,
|
||||
password TEXT,
|
||||
raw TEXT NOT NULL, -- full original credential line
|
||||
source TEXT, -- channel username or ID
|
||||
filename TEXT, -- downloaded file name
|
||||
timestamp TEXT NOT NULL, -- "YYYY-MM-DD HH:MM:SS UTC"
|
||||
severity TEXT NOT NULL, -- CRITICAL/HIGH/MEDIUM/LOW
|
||||
score INTEGER NOT NULL, -- 40/30/20/10
|
||||
reasons TEXT, -- pipe-separated reason strings
|
||||
seen_before INTEGER NOT NULL -- 0=new, 1=duplicate
|
||||
)
|
||||
```
|
||||
|
||||
Indexes: `url`, `username`, `source`, `timestamp`, `severity`.
|
||||
|
||||
---
|
||||
|
||||
## Notes
|
||||
|
||||
- Each query opens and closes its own connection via the `_connect()` context manager.
|
||||
- `conn.row_factory = sqlite3.Row` — rows support both index and column-name access.
|
||||
- Transactions: commit on success, rollback on exception.
|
||||
171
utils/database.py
Normal file
171
utils/database.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""
|
||||
database.py — SQLite storage for credential hits.
|
||||
|
||||
Schema:
|
||||
hits table:
|
||||
- id auto-increment primary key
|
||||
- url the target URL from the credential line
|
||||
- username extracted username/email
|
||||
- password extracted password
|
||||
- raw the full original line
|
||||
- source channel/bot it came from
|
||||
- filename the file it was found in
|
||||
- timestamp UTC time of discovery
|
||||
- severity CRITICAL / HIGH / MEDIUM / LOW
|
||||
- score numeric score (higher = worse)
|
||||
- reasons pipe-separated list of scoring reasons
|
||||
- seen_before whether this was a duplicate (for stats)
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from contextlib import contextmanager
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
DB_FILE = Path("./data/hits.db")
|
||||
|
||||
|
||||
# ─── Setup ────────────────────────────────────────────────────────────────────
|
||||
|
||||
@contextmanager
def _connect():
    """Open a connection to DB_FILE for the duration of a ``with`` block.

    Yields a sqlite3 connection whose rows are ``sqlite3.Row`` objects.
    The transaction is committed when the block finishes normally; if an
    Exception is raised (inside the block or by the commit itself) the
    transaction is rolled back and the exception re-raised. The connection
    is closed in every case.
    """
    conn = sqlite3.connect(DB_FILE)
    # Rows become sqlite3.Row so callers can use column names as well as indexes.
    conn.row_factory = sqlite3.Row
    try:
        yield conn
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
|
||||
|
||||
|
||||
def init_db() -> None:
    """Create the ``hits`` table and its lookup indexes when they are missing.

    Every statement uses IF NOT EXISTS, so repeated calls are harmless.
    """
    indexed_columns = ("url", "username", "source", "timestamp", "severity")
    with _connect() as conn:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS hits (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT,
                username TEXT,
                password TEXT,
                raw TEXT NOT NULL,
                source TEXT,
                filename TEXT,
                timestamp TEXT NOT NULL,
                severity TEXT NOT NULL DEFAULT 'LOW',
                score INTEGER NOT NULL DEFAULT 10,
                reasons TEXT,
                seen_before INTEGER NOT NULL DEFAULT 0
            )
        """)
        # One single-column index per lookup column, named idx_<column>.
        for column in indexed_columns:
            conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{column} ON hits({column})")
    log.info(f"Database ready: {DB_FILE}")
|
||||
|
||||
|
||||
# ─── Writing ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def insert_hits(
    scored_hits: list,
    source: str,
    filename: str,
    seen_before: bool = False,
) -> int:
    """Insert a batch of ScoredHit objects into the ``hits`` table.

    All rows of one call share the same UTC timestamp, *source*, *filename*
    and duplicate flag.

    Args:
        scored_hits: Objects exposing url, username, password, raw,
            severity, score and reasons attributes.
        source: Channel/bot the hits came from.
        filename: Name of the file the hits were found in.
        seen_before: Stored as 1 when True (duplicate), otherwise 0.

    Returns:
        The number of rows inserted (0 for an empty batch).
    """
    # Nothing to write: skip the connection, the transaction and the
    # misleading "inserted 0 row(s)" log line.
    if not scored_hits:
        return 0

    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
    duplicate_flag = 1 if seen_before else 0
    rows = [
        (
            h.url,
            h.username,
            h.password,
            h.raw,
            source,
            filename,
            timestamp,
            h.severity,
            h.score,
            " | ".join(h.reasons),  # reasons are stored pipe-separated
            duplicate_flag,
        )
        for h in scored_hits
    ]

    with _connect() as conn:
        conn.executemany("""
            INSERT INTO hits
            (url, username, password, raw, source, filename, timestamp,
            severity, score, reasons, seen_before)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, rows)

    log.info(f" DB: inserted {len(rows)} row(s) from {filename}")
    return len(rows)
|
||||
|
||||
|
||||
# ─── Querying ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def search(keyword: str) -> list[sqlite3.Row]:
    """Find hits whose url, username or raw line contains *keyword*.

    The keyword is matched literally as a substring: the LIKE wildcards
    ``%`` and ``_`` are escaped, so searching for ``john_doe`` no longer
    also matches ``johnXdoe``.

    Args:
        keyword: Text to look for.

    Returns:
        Matching rows, highest score first, then newest first.
    """
    # Escape the escape character first, then the two LIKE wildcards.
    escaped = (
        keyword.replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )
    needle = f"%{escaped}%"
    with _connect() as conn:
        return conn.execute("""
            SELECT * FROM hits
            WHERE url LIKE ? ESCAPE '\\'
               OR username LIKE ? ESCAPE '\\'
               OR raw LIKE ? ESCAPE '\\'
            ORDER BY score DESC, timestamp DESC
        """, (needle,) * 3).fetchall()
|
||||
|
||||
|
||||
def recent(limit: int = 50) -> list[sqlite3.Row]:
    """Return up to *limit* hits ordered by timestamp, latest first."""
    query = """
        SELECT * FROM hits
        ORDER BY timestamp DESC
        LIMIT ?
    """
    with _connect() as conn:
        cursor = conn.execute(query, (limit,))
        return cursor.fetchall()
|
||||
|
||||
|
||||
def by_severity(severity: str) -> list[sqlite3.Row]:
    """Return the non-duplicate hits (seen_before = 0) with the given severity.

    Rows are ordered by timestamp, latest first.
    """
    query = """
        SELECT * FROM hits
        WHERE severity = ? AND seen_before = 0
        ORDER BY timestamp DESC
    """
    with _connect() as conn:
        cursor = conn.execute(query, (severity,))
        return cursor.fetchall()
|
||||
|
||||
|
||||
def stats() -> dict:
    """Collect summary counters over the ``hits`` table.

    Returns:
        A dict with the keys total, unique, duplicates, critical, high,
        medium, low, sources and top_source. The per-severity counters only
        count rows with seen_before=0; top_source is a
        ``{"source": ..., "cnt": ...}`` dict, or None when the table is empty.
    """
    with _connect() as conn:

        def scalar(sql: str):
            # First column of the first row of a single-value query.
            return conn.execute(sql).fetchone()[0]

        total = scalar("SELECT COUNT(*) FROM hits")
        unique = scalar("SELECT COUNT(*) FROM hits WHERE seen_before=0")
        critical = scalar("SELECT COUNT(*) FROM hits WHERE severity='CRITICAL' AND seen_before=0")
        high = scalar("SELECT COUNT(*) FROM hits WHERE severity='HIGH' AND seen_before=0")
        medium = scalar("SELECT COUNT(*) FROM hits WHERE severity='MEDIUM' AND seen_before=0")
        low = scalar("SELECT COUNT(*) FROM hits WHERE severity='LOW' AND seen_before=0")
        sources = scalar("SELECT COUNT(DISTINCT source) FROM hits")
        busiest = conn.execute("""
            SELECT source, COUNT(*) as cnt FROM hits
            GROUP BY source ORDER BY cnt DESC LIMIT 1
        """).fetchone()

    summary = {
        "total": total,
        "unique": unique,
        "duplicates": total - unique,
        "critical": critical,
        "high": high,
        "medium": medium,
        "low": low,
        "sources": sources,
    }
    summary["top_source"] = dict(busiest) if busiest else None
    return summary
|
||||
87
utils/scorer.md
Normal file
87
utils/scorer.md
Normal file
@@ -0,0 +1,87 @@
|
||||
# utils/scorer.py
|
||||
|
||||
Severity scoring for credential hits. No Telegram deps. Pure logic.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from utils.scorer import score_hit, score_hits, summarize, ScoredHit
|
||||
from utils.scorer import CRITICAL, HIGH, MEDIUM, LOW, SEVERITY_EMOJI, SEVERITY_SCORES
|
||||
```
|
||||
|
||||
### `score_hit(line: str) -> ScoredHit`
|
||||
Score a single raw credential line. Parses ULP format (`url:user:pass`), runs all checks, returns a `ScoredHit`.
|
||||
|
||||
### `score_hits(lines: list[str]) -> list[ScoredHit]`
|
||||
Score a list of lines. Returns sorted descending by score.
|
||||
|
||||
### `summarize(scored: list[ScoredHit]) -> dict`
|
||||
Returns `{CRITICAL: n, HIGH: n, MEDIUM: n, LOW: n}`.
|
||||
|
||||
---
|
||||
|
||||
## ScoredHit dataclass
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `raw` | str | Original credential line |
|
||||
| `severity` | str | CRITICAL / HIGH / MEDIUM / LOW |
|
||||
| `score` | int | 40 / 30 / 20 / 10 |
|
||||
| `reasons` | list[str] | Human-readable match reasons |
|
||||
| `url` | str\|None | Parsed URL field |
|
||||
| `username` | str\|None | Parsed username/email field |
|
||||
| `password` | str\|None | Parsed password field |
|
||||
| `.emoji` | property | 🔴🟠🟡🟢 |
|
||||
|
||||
---
|
||||
|
||||
## Scoring rules (highest match wins)
|
||||
|
||||
| Severity | Triggers |
|
||||
|----------|----------|
|
||||
| CRITICAL | Employee email domain after `@` in username/line · Privileged service URL (admin, vpn, ssh, rdp, gitlab, jira…) |
|
||||
| HIGH | Internal service URL (intranet, erp, crm, sso, owa, sharepoint…) |
|
||||
| MEDIUM | Client-facing URL (app, patient, booking, helpdesk…) |
|
||||
| LOW | Org domain appears anywhere in line (baseline) |
|
||||
|
||||
Check 6 (no severity change): flags weak passwords ≤6 chars or common strings.
|
||||
|
||||
---
|
||||
|
||||
## Employee domain matching
|
||||
|
||||
Keywords in `config.TARGET_KEYWORDS` containing `@` become employee patterns.
|
||||
Pattern: `@<domain>(?:[^a-zA-Z0-9.\-]|$)` — requires literal `@` before the domain.
|
||||
**`user@gmail.com` on a URL containing `myorg.cl` does NOT trigger CRITICAL.**
|
||||
|
||||
Keywords without `@` go only to `ORG_DOMAINS` (LOW baseline).
|
||||
|
||||
---
|
||||
|
||||
## ULP line parser (`ULP_PATTERN`)
|
||||
|
||||
Separators: `:` `;` `,` `|` `\t` (any of these between the three fields).
|
||||
|
||||
The URL field handles two common stealer-log complications:
|
||||
|
||||
1. **`://` not treated as separator** — the optional scheme prefix `(?:https?|ftp)://` is consumed before the character-class match, so `https://` never gets split at the colon.
|
||||
|
||||
2. **Port + path consumed into the URL** — the optional group `(?::\d+/[^\s:;,|\t]*)` absorbs `:port/path` when the port is pure digits immediately followed by `/`. This correctly handles `http://host:8085/path/:user:pass` but intentionally skips patterns like `:24145487-8` (RUT number — hyphen after digits, no `/`).
|
||||
|
||||
**Known limitation:** A bare port with no path (e.g. `https://host:8080:user:pass`) will mis-parse `8080` as the username. This is not observed in practice — stealer logs always include at least a trailing `/`.
|
||||
|
||||
---
|
||||
|
||||
## Module-level globals (rebuilt on import + via KeywordsScreen)
|
||||
|
||||
| Name | Type | Description |
|
||||
|------|------|-------------|
|
||||
| `EMPLOYEE_DOMAINS` | `list[tuple[str, Pattern]]` | `(domain_str, anchored_pattern)` for `@`-keywords |
|
||||
| `ORG_DOMAINS` | `list[Pattern]` | Plain domain patterns for all keywords |
|
||||
|
||||
To rebuild after editing `config.TARGET_KEYWORDS` at runtime:
|
||||
```python
|
||||
import utils.scorer as scorer
|
||||
scorer.EMPLOYEE_DOMAINS = scorer._build_employee_domains()
|
||||
scorer.ORG_DOMAINS = scorer._build_org_domains()
|
||||
```
|
||||
273
utils/scorer.py
Normal file
273
utils/scorer.py
Normal file
@@ -0,0 +1,273 @@
|
||||
"""
|
||||
scorer.py — Severity scoring for credential hits.
|
||||
|
||||
Scoring logic (highest match wins):
|
||||
|
||||
CRITICAL — Employee credentials (internal email domain)
|
||||
e.g. jdoe@yourclinic.cl:password
|
||||
— Admin/privileged service URLs
|
||||
e.g. admin., vpn., ssh., rdp., gitlab., jira.
|
||||
|
||||
HIGH — Internal-facing services
|
||||
e.g. intranet., erp., crm., portal., citrix.
|
||||
— Password manager or SSO hits
|
||||
— Any credential where username looks like an employee email
|
||||
|
||||
MEDIUM — Client-facing portals
|
||||
e.g. app., patient., client., booking.
|
||||
— Domain match on a non-privileged service
|
||||
|
||||
LOW — Generic domain keyword match
|
||||
— No URL parsed, just a raw domain mention
|
||||
|
||||
Each scored hit is returned as a ScoredHit dataclass with:
|
||||
- severity: CRITICAL / HIGH / MEDIUM / LOW
|
||||
- score: int (higher = worse)
|
||||
- reasons: list of human-readable reasons
|
||||
- raw: original line
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from config import TARGET_KEYWORDS
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ─── Severity levels ─────────────────────────────────────────────────────────
|
||||
|
||||
# Severity labels, listed from most to least severe.
CRITICAL = "CRITICAL"
HIGH = "HIGH"
MEDIUM = "MEDIUM"
LOW = "LOW"

# Numeric score for each severity label (higher = worse).
SEVERITY_SCORES = {
    CRITICAL: 40,
    HIGH: 30,
    MEDIUM: 20,
    LOW: 10,
}

# Coloured marker shown next to each severity label.
SEVERITY_EMOJI = {
    CRITICAL: "🔴",
    HIGH: "🟠",
    MEDIUM: "🟡",
    LOW: "🟢",
}
|
||||
|
||||
|
||||
# ─── Pattern banks ───────────────────────────────────────────────────────────
|
||||
|
||||
# Subdomains/services that indicate privileged access.
# All three service banks share the same prefix: the service name must sit at
# the start of the string, right after "http://" / "https://", or right after
# a dot. There is no trailing boundary, so a name also matches as the prefix
# of a longer label (e.g. "admin" matches "administracion").
CRITICAL_SERVICES = re.compile(
    r"(?:^|https?://|\.)"
    r"(admin|vpn|ssh|rdp|ftp|sftp|gitlab|github|bitbucket|jenkins|"
    r"jira|confluence|grafana|kibana|sentry|vault|bastion|jump|"
    r"firewall|router|switch|proxy|ldap|ad\.|activedirectory|"
    r"exchange|mail\.)",
    re.IGNORECASE
)

# Internal-facing services (HIGH).
HIGH_SERVICES = re.compile(
    r"(?:^|https?://|\.)"
    r"(intranet|erp|crm|portal|citrix|workspace|webmail|owa|"
    r"sharepoint|teams|slack|zoom|meet|sso|login|auth|oauth|"
    r"accounts?|dashboard|internal|corp|staff|hr|payroll|"
    r"finance|accounting)",
    re.IGNORECASE
)

# Client-facing services (MEDIUM).
MEDIUM_SERVICES = re.compile(
    r"(?:^|https?://|\.)"
    r"(app|patient|client|customer|booking|appointment|"
    r"reserva|cita|paciente|user|member|registro|signup|"
    r"support|helpdesk|ticket)",
    re.IGNORECASE
)

# Looks like a corporate email (user@domain); group 1 captures the domain.
# NOTE(review): not used by score_hit() in this module — confirm whether
# other callers rely on it.
EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9._%+\-]+@([a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})")

# ULP line parser
# Separator set: colon, semicolon, comma, pipe, tab.
# URL field: optional scheme (http/https/ftp) consumed first so '://' is never
# mistaken for a separator; then an optional port group ':\d+/' absorbs port+path
# (port is digits immediately followed by '/') so 'http://host:88/path:user:pass'
# yields url='http://host:88/path', not url='http'.
# The password group is '.+', so it keeps everything after the second
# separator, including any further separator characters.
ULP_PATTERN = re.compile(
    r"^(?P<url>"
    r"(?:(?:https?|ftp)://)?[^\s:;,|\t]+"  # optional scheme + host/path
    r"(?::\d+/[^\s:;,|\t]*)?"  # optional :port/path (port = digits then /)
    r")"
    r"(?:[:;,|\t])"
    r"(?P<username>[^\s:;,|\t]+)"
    r"(?:[:;,|\t])"
    r"(?P<password>.+)$"
)
|
||||
|
||||
|
||||
# ─── Derived from config ──────────────────────────────────────────────────────
|
||||
|
||||
def _kw_to_domain(kw: str) -> str:
|
||||
"""Strip regex syntax from a keyword to get a plain domain string."""
|
||||
return kw.replace(r"@", "").replace(r"\.", ".").strip("^$").lstrip(".")
|
||||
|
||||
|
||||
def _build_employee_domains() -> list[tuple[str, re.Pattern]]:
    """Compile the employee-email patterns from TARGET_KEYWORDS.

    Only keywords containing '@' are used. Each pattern requires a literal
    '@' directly before the domain and a non-domain character (or end of
    string) directly after it, so an org domain appearing in a URL cannot
    turn an unrelated address such as @gmail.com into a CRITICAL hit.

    Returns:
        A list of (domain_str, compiled_pattern) tuples.
    """
    # What may follow the domain: anything that cannot continue a host name.
    boundary = r"(?:[^a-zA-Z0-9.\-]|$)"
    compiled = []
    for keyword in TARGET_KEYWORDS:
        if "@" not in keyword:
            continue
        domain = _kw_to_domain(keyword)
        if not domain:
            continue
        anchored = re.compile(r"@" + re.escape(domain) + boundary, re.IGNORECASE)
        compiled.append((domain, anchored))
    return compiled
|
||||
|
||||
EMPLOYEE_DOMAINS = _build_employee_domains()
|
||||
|
||||
|
||||
def _build_org_domains() -> list[re.Pattern]:
    """Compile every keyword in TARGET_KEYWORDS into a plain domain pattern.

    The patterns are unanchored and case-insensitive; they back the LOW
    baseline check, which only asks whether the org domain occurs anywhere
    in the line. Keywords that reduce to an empty domain are skipped.
    """
    return [
        re.compile(re.escape(domain), re.IGNORECASE)
        for keyword in TARGET_KEYWORDS
        if (domain := _kw_to_domain(keyword))
    ]
|
||||
|
||||
ORG_DOMAINS = _build_org_domains()
|
||||
|
||||
|
||||
|
||||
# ─── Scoring logic ────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
class ScoredHit:
|
||||
raw: str
|
||||
severity: str
|
||||
score: int
|
||||
reasons: list[str] = field(default_factory=list)
|
||||
url: str | None = None
|
||||
username: str | None = None
|
||||
password: str | None = None
|
||||
|
||||
@property
|
||||
def emoji(self) -> str:
|
||||
return SEVERITY_EMOJI.get(self.severity, "⚪")
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"{self.emoji} [{self.severity}] {self.raw}"
|
||||
|
||||
|
||||
def score_hit(line: str) -> ScoredHit:
    """Score one credential line.

    The line is stripped, parsed into url/username/password when it matches
    ULP_PATTERN, and run through the checks below. The most severe level
    that matched wins; LOW is the default.

    Returns:
        A ScoredHit carrying the stripped line, the final severity and
        score, the reasons in check order, and the parsed fields (None
        when the line did not parse).
    """
    line = line.strip()
    reasons = []
    matched_levels = []

    # Split into ULP fields when the line has that shape.
    parsed = ULP_PATTERN.match(line)
    if parsed is None:
        url = username = password = None
    else:
        url, username, password = parsed.group("url", "username", "password")

    # Check 1: employee email domain, in the username first, then anywhere in
    # the line. Either way the pattern needs a literal '@' before the domain,
    # so an org domain inside a URL never flags an unrelated email.
    for domain_str, employee_pat in EMPLOYEE_DOMAINS:
        if employee_pat.search(username or "") or employee_pat.search(line):
            matched_levels.append(CRITICAL)
            reasons.append(f"Employee email domain: {domain_str}")
            break

    # Checks 2-4: classify the URL against each service bank. A URL can hit
    # several banks; every hit is recorded.
    if url:
        url_checks = (
            (CRITICAL_SERVICES, CRITICAL, "Critical service URL"),
            (HIGH_SERVICES, HIGH, "High-value internal service"),
            (MEDIUM_SERVICES, MEDIUM, "Client-facing service"),
        )
        for services, level, label in url_checks:
            if services.search(url):
                matched_levels.append(level)
                reasons.append(f"{label}: {url}")

    # Check 5: org-domain baseline, recorded only when nothing else matched.
    if any(org_pat.search(line) for org_pat in ORG_DOMAINS) and not matched_levels:
        matched_levels.append(LOW)
        reasons.append("Org domain match in line")

    # Check 6: weak/common password. Adds reasons only, never severity.
    if password:
        if len(password) <= 6:
            reasons.append(f"⚠ Weak password ({len(password)} chars)")
        if password.lower() in {"123456", "password", "qwerty", "111111", "admin", "letmein"}:
            reasons.append(f"⚠ Common password: {password}")

    # Most severe matched level wins; LOW when nothing matched.
    final_severity = next(
        (level for level in (CRITICAL, HIGH, MEDIUM, LOW) if level in matched_levels),
        LOW,
    )

    if not reasons:
        reasons.append("Pattern match")

    return ScoredHit(
        raw=line,
        severity=final_severity,
        score=SEVERITY_SCORES[final_severity],
        reasons=reasons,
        url=url,
        username=username,
        password=password,
    )
|
||||
|
||||
|
||||
def score_hits(lines: list[str]) -> list[ScoredHit]:
    """Score every line and return the hits ordered from highest to lowest score.

    The sort is stable, so hits with equal scores keep their input order.
    """
    return sorted(
        (score_hit(entry) for entry in lines),
        key=lambda hit: hit.score,
        reverse=True,
    )
|
||||
|
||||
|
||||
def summarize(scored: list[ScoredHit]) -> dict:
    """Tally the hits per severity label.

    Returns:
        A dict keyed by CRITICAL, HIGH, MEDIUM and LOW, each starting at 0.
        A hit with any other severity raises KeyError.
    """
    counts = dict.fromkeys((CRITICAL, HIGH, MEDIUM, LOW), 0)
    for hit in scored:
        counts[hit.severity] += 1
    return counts
|
||||
Reference in New Issue
Block a user