Initial commit: ULPgrammer
- Core Telegram monitoring pipeline (scraper, processor, notifier, downloaders) - Textual TUI frontend with thread-safe event bus - SQLite persistence, severity scoring, dedup cache - Fixed ULP parser: handles https:// truncation, port+path URLs, semicolon separator - Test suite: 88 tests across scorer, cache, database, processor
This commit is contained in:
25
.claudeignore
Normal file
25
.claudeignore
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# Sessions
|
||||||
|
*.session
|
||||||
|
*.session-journal
|
||||||
|
bot_session*
|
||||||
|
|
||||||
|
# Data — keep the folder, ignore contents
|
||||||
|
data/hits.db
|
||||||
|
data/hits.txt
|
||||||
|
data/hits.csv
|
||||||
|
data/dedup.json
|
||||||
|
data/cache.json
|
||||||
|
data/tmp/
|
||||||
|
data/logs/
|
||||||
|
!data/.gitkeep
|
||||||
|
|
||||||
|
# Env
|
||||||
|
.env
|
||||||
|
|
||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
|
||||||
22
.env.example
Normal file
22
.env.example
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# ─── Telegram API credentials ──────────────────────────────────────────────
|
||||||
|
# Get these from https://my.telegram.org → API development tools
|
||||||
|
API_ID=12345678
|
||||||
|
API_HASH=your_api_hash_here
|
||||||
|
|
||||||
|
# ─── Bot credentials ────────────────────────────────────────────────────────
|
||||||
|
# Create a bot via @BotFather and paste the token here
|
||||||
|
BOT_TOKEN=123456789:ABCdefGHIjklMNOpqrSTUvwxYZ
|
||||||
|
|
||||||
|
# ─── Alert destination ──────────────────────────────────────────────────────
|
||||||
|
# Chat ID to send hit notifications to (your personal ID or a group)
|
||||||
|
# Tip: message @userinfobot on Telegram to get your ID
|
||||||
|
NOTIFY_CHAT_ID=987654321
|
||||||
|
|
||||||
|
# ─── Session name (just a filename, no extension needed) ────────────────────
|
||||||
|
SESSION_NAME=monitor_session
|
||||||
|
|
||||||
|
# ─── tdl (fast Go downloader) — optional but strongly recommended ───────────
|
||||||
|
# Install: https://github.com/iyear/tdl
|
||||||
|
# After installing, run once: tdl login -n <SESSION_NAME>
|
||||||
|
# SESSION_NAME above is shared between Telethon and tdl — no double login needed.
|
||||||
|
# If tdl is not on PATH the bot falls back to Telethon automatically.
|
||||||
28
.gitignore
vendored
Normal file
28
.gitignore
vendored
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
# Sessions
|
||||||
|
*.session
|
||||||
|
*.session-journal
|
||||||
|
bot_session*
|
||||||
|
|
||||||
|
# Data — keep the folder, ignore contents
|
||||||
|
data/hits.db
|
||||||
|
data/hits.txt
|
||||||
|
data/hits.csv
|
||||||
|
data/dedup.json
|
||||||
|
data/cache.json
|
||||||
|
data/tmp/
|
||||||
|
data/logs/
|
||||||
|
!data/.gitkeep
|
||||||
|
|
||||||
|
# Env
|
||||||
|
.env
|
||||||
|
|
||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
|
||||||
|
# Claude things
|
||||||
|
CLAUDE.md
|
||||||
|
.claude/*
|
||||||
182
QUICK_REF.md
Normal file
182
QUICK_REF.md
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
# ULP Monitor — Quick Reference
|
||||||
|
|
||||||
|
> For Claude Code: read the per-file `.md` alongside each `.py` before editing.
|
||||||
|
> Full docs in `README.md`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Project layout
|
||||||
|
|
||||||
|
```
|
||||||
|
ulp_monitor/
|
||||||
|
├── main.py Entry point (--no-tui flag for CLI mode)
|
||||||
|
├── config.py All settings — edit this for keywords, channels, paths
|
||||||
|
│
|
||||||
|
├── core/ Telegram I/O pipeline (all async, Telethon-dependent)
|
||||||
|
│ ├── scraper.py Live listener + backfill orchestration
|
||||||
|
│ ├── tdl_downloader.py tdl subprocess wrapper + Telethon fallback
|
||||||
|
│ ├── bot_downloader.py Inline "DOWNLOAD" button click flow
|
||||||
|
│ ├── processor.py Archive extraction (.zip/.7z/.rar) + line search
|
||||||
|
│ └── notifier.py Scoring → dedup → DB → hits.txt/csv → Telegram alert
|
||||||
|
│
|
||||||
|
├── utils/ Pure logic, no Telegram deps, no async
|
||||||
|
│ ├── scorer.py Severity scoring (CRITICAL/HIGH/MEDIUM/LOW)
|
||||||
|
│ ├── cache.py Seen file-ID dedup (data/cache.json)
|
||||||
|
│ └── database.py SQLite read/write (data/hits.db)
|
||||||
|
│
|
||||||
|
├── tui/ Textual TUI — runs in main thread
|
||||||
|
│ ├── app.py MonitorApp + all screens + bot thread launcher
|
||||||
|
│ └── events.py Thread-safe queue.Queue event bus
|
||||||
|
│
|
||||||
|
└── data/ Runtime output — gitignored
|
||||||
|
├── hits.db
|
||||||
|
├── hits.txt
|
||||||
|
├── hits.csv
|
||||||
|
├── cache.json
|
||||||
|
├── dedup.json
|
||||||
|
└── logs/monitor.log
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Data flow
|
||||||
|
|
||||||
|
```
|
||||||
|
Telegram channel
|
||||||
|
└─ new message with file / download button
|
||||||
|
│
|
||||||
|
├─ core/scraper.py detects + guards (size, extension, dedup)
|
||||||
|
│
|
||||||
|
├─ core/tdl_downloader.py downloads via tdl (batched)
|
||||||
|
│ └─ core/scraper.py Telethon fallback if tdl fails
|
||||||
|
│
|
||||||
|
├─ core/bot_downloader.py handles inline button → bot reply flow
|
||||||
|
│
|
||||||
|
├─ core/processor.py extracts archive → searches .txt line by line
|
||||||
|
│
|
||||||
|
└─ core/notifier.py scores → deduplicates → persists → alerts
|
||||||
|
├─ utils/scorer.py
|
||||||
|
├─ utils/database.py
|
||||||
|
└─ tui/events.py posts EvHit to TUI
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Threading architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
main thread (Textual's event loop)
|
||||||
|
├─ MonitorApp.on_mount()
|
||||||
|
│ ├─ bus.init_bus() creates queue.Queue on THIS loop
|
||||||
|
│ ├─ threading.Thread → _run_bot_thread()
|
||||||
|
│ └─ set_interval(0.1, _drain_bus)
|
||||||
|
│
|
||||||
|
├─ _drain_bus() [every 100ms]
|
||||||
|
│ └─ queue.Queue.get_nowait() → dispatch to widgets
|
||||||
|
│
|
||||||
|
└─ Textual widgets, screens, keybindings
|
||||||
|
|
||||||
|
bot thread (own asyncio event loop)
|
||||||
|
└─ _bot_main()
|
||||||
|
├─ bot_client.connect() + sign_in()
|
||||||
|
├─ user_client.connect() + is_user_authorized()
|
||||||
|
├─ warm_entity_cache()
|
||||||
|
├─ _make_handler() → NewMessage handler registered
|
||||||
|
├─ backfill_all()
|
||||||
|
└─ run_until_disconnected() + _watch_channels() [gathered]
|
||||||
|
|
||||||
|
cross-thread communication
|
||||||
|
bot → TUI: bus.post(event) [queue.Queue.put_nowait, always safe]
|
||||||
|
TUI → bot: loop.call_soon_threadsafe() [asyncio.Event.set for channel changes]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Config quick reference (`config.py`)
|
||||||
|
|
||||||
|
| Setting | Type | Description |
|
||||||
|
|---------|------|-------------|
|
||||||
|
| `API_ID` | int | From my.telegram.org |
|
||||||
|
| `API_HASH` | str | From my.telegram.org |
|
||||||
|
| `BOT_TOKEN` | str | From @BotFather |
|
||||||
|
| `NOTIFY_CHAT_ID` | int | Your Telegram user/group ID |
|
||||||
|
| `SESSION_NAME` | str | Session file name (default: `monitor_session`) |
|
||||||
|
| `TARGET_KEYWORDS` | list[str] | Regex patterns. `@`-prefixed → employee email (CRITICAL). Plain → domain match (LOW) |
|
||||||
|
| `WATCHED_CHANNELS` | list[str\|int] | Usernames or `-100xxxxxxxxxx` IDs |
|
||||||
|
| `BACKFILL_LIMIT` | int | Messages to scan per channel on startup (0 = off) |
|
||||||
|
| `ALLOWED_EXTENSIONS` | set | `.txt .zip .7z .rar` |
|
||||||
|
| `MAX_FILE_SIZE` | int | Bytes (default 4 GB) |
|
||||||
|
| `ARCHIVE_PASSWORDS` | list[bytes] | Tried in order on locked archives |
|
||||||
|
| `TDL_NAMESPACE` | str\|None | `tdl login -n <name>` namespace |
|
||||||
|
| `TDL_THREADS` | int | Chunk workers per file (`-t`) |
|
||||||
|
| `TDL_PERFILE` | int | Concurrent files per tdl call (`-l`) |
|
||||||
|
| `TDL_AMOUNT` | int | Messages per batch |
|
||||||
|
| `TEMP_DIR` | Path | `data/tmp` |
|
||||||
|
| `HITS_FILE` | Path | `data/hits.txt` |
|
||||||
|
| `LOG_FILE` | Path | `data/logs/monitor.log` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Severity scoring summary
|
||||||
|
|
||||||
|
| Severity | Score | Triggers |
|
||||||
|
|----------|-------|----------|
|
||||||
|
| CRITICAL | 40 | Employee email (`@myorg.cl` in username) · Privileged service URL (admin, vpn, rdp, gitlab…) |
|
||||||
|
| HIGH | 30 | Internal service URL (intranet, erp, sso, owa…) |
|
||||||
|
| MEDIUM | 20 | Client-facing URL (app, booking, helpdesk…) |
|
||||||
|
| LOW | 10 | Org domain appears anywhere in line |
|
||||||
|
|
||||||
|
`@`-keyword rule: pattern requires literal `@` before domain — `user@gmail.com` on a URL containing `myorg.cl` does **not** trigger CRITICAL.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## TUI keybindings
|
||||||
|
|
||||||
|
| Key | Action | Screen |
|
||||||
|
|-----|--------|--------|
|
||||||
|
| `s` | Search hits DB | → SearchScreen |
|
||||||
|
| `h` | Browse hits by severity | → HitsDBScreen |
|
||||||
|
| `k` | Edit keyword patterns live | → KeywordsScreen |
|
||||||
|
| `c` | Clear download + hits logs | main |
|
||||||
|
| `r` | Force-refresh stats bar | main |
|
||||||
|
| `q` / `ctrl+c` | Quit | any |
|
||||||
|
| `Escape` | Back to main | sub-screens |
|
||||||
|
| `1`/`2`/`3`/`4` | Filter CRITICAL/HIGH/MEDIUM/LOW | HitsDBScreen |
|
||||||
|
| `r` | Load recent 50 | HitsDBScreen |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Per-file reference docs
|
||||||
|
|
||||||
|
| File | Reference |
|
||||||
|
|------|-----------|
|
||||||
|
| `utils/scorer.py` | `utils/scorer.md` |
|
||||||
|
| `utils/cache.py` | `utils/cache.md` |
|
||||||
|
| `utils/database.py` | `utils/database.md` |
|
||||||
|
| `core/scraper.py` | `core/scraper.md` |
|
||||||
|
| `core/processor.py` | `core/processor.md` |
|
||||||
|
| `core/notifier.py` | `core/notifier.md` |
|
||||||
|
| `core/tdl_downloader.py` | `core/tdl_downloader.md` |
|
||||||
|
| `core/bot_downloader.py` | `core/bot_downloader.md` |
|
||||||
|
| `tui/app.py` | `tui/app.md` |
|
||||||
|
| `tui/events.py` | `tui/events.md` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Common tasks
|
||||||
|
|
||||||
|
**Add a new keyword at runtime:** open the TUI → press `k` → add pattern → active immediately. Copy to `config.TARGET_KEYWORDS` to persist.
|
||||||
|
|
||||||
|
**Add a channel at runtime:** type username or numeric ID in the Channels panel → ➕ Add. Handler re-registers immediately. Edit `config.WATCHED_CHANNELS` to persist.
|
||||||
|
|
||||||
|
**Query hits from CLI:**
|
||||||
|
```bash
|
||||||
|
sqlite3 data/hits.db "SELECT severity, username, url FROM hits WHERE seen_before=0 ORDER BY score DESC LIMIT 20"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Re-process all files** (wipe cache):
|
||||||
|
```bash
|
||||||
|
rm data/cache.json data/dedup.json
|
||||||
|
```
|
||||||
|
|
||||||
|
**Check what's happening:** `tail -f data/logs/monitor.log`
|
||||||
146
README.md
Normal file
146
README.md
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
# ULP Credential Monitor
|
||||||
|
|
||||||
|
A Telegram-based credential exposure monitor for threat intelligence teams.
|
||||||
|
Watches channels for combo/stealer log files and alerts you when your
|
||||||
|
organization's credentials appear in them.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## How it works
|
||||||
|
|
||||||
|
```
|
||||||
|
User session (Telethon)
|
||||||
|
└─ watches N channels
|
||||||
|
└─ detects file attachments (.txt, .zip, .7z, .rar)
|
||||||
|
└─ downloads → extracts → searches line by line
|
||||||
|
└─ hit? → writes to data/ + sends bot alert
|
||||||
|
└─ no hit? → deletes file, moves on
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Project structure
|
||||||
|
|
||||||
|
```
|
||||||
|
ulp_monitor/
|
||||||
|
├── main.py Entry point
|
||||||
|
├── config.py All settings (keywords, channels, paths)
|
||||||
|
│
|
||||||
|
├── core/ Telegram I/O pipeline
|
||||||
|
│ ├── scraper.py Live listener + backfill
|
||||||
|
│ ├── tdl_downloader.py Fast downloads via tdl (Go MTProto)
|
||||||
|
│ ├── bot_downloader.py Inline button / bot-dispatched file flows
|
||||||
|
│ ├── processor.py Archive extraction + line-by-line search
|
||||||
|
│ └── notifier.py hits.txt / hits.csv writer + bot alerts
|
||||||
|
│
|
||||||
|
├── utils/ Pure logic — no Telegram dependencies
|
||||||
|
│ ├── scorer.py Hit severity scoring
|
||||||
|
│ ├── cache.py Seen-file deduplication
|
||||||
|
│ └── database.py SQLite persistence layer
|
||||||
|
│
|
||||||
|
├── tui/ Textual TUI frontend
|
||||||
|
│ ├── app.py MonitorApp + all Screen classes
|
||||||
|
│ └── events.py Thread-safe event bus (bot thread → TUI)
|
||||||
|
│
|
||||||
|
└── data/ Runtime-generated (gitignored)
|
||||||
|
├── hits.db SQLite database
|
||||||
|
├── hits.txt Human-readable hit log
|
||||||
|
├── hits.csv CSV hit log (importable into Excel / pandas)
|
||||||
|
├── dedup.json Deduplication hashes
|
||||||
|
├── cache.json Seen file-ID cache
|
||||||
|
└── logs/monitor.log
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
### 1. Get Telegram API credentials
|
||||||
|
- Go to https://my.telegram.org → *API development tools*
|
||||||
|
- Create an app → note your `api_id` and `api_hash`
|
||||||
|
|
||||||
|
### 2. Create a bot
|
||||||
|
- Message [@BotFather](https://t.me/BotFather) → `/newbot`
|
||||||
|
- Start a chat with your new bot before running
|
||||||
|
|
||||||
|
### 3. Get your chat ID
|
||||||
|
- Message [@userinfobot](https://t.me/userinfobot)
|
||||||
|
|
||||||
|
### 4. Configure
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
# fill in API_ID, API_HASH, BOT_TOKEN, NOTIFY_CHAT_ID
|
||||||
|
```
|
||||||
|
|
||||||
|
Open `config.py` and set:
|
||||||
|
|
||||||
|
- **`TARGET_KEYWORDS`** — your org's domains and email patterns.
|
||||||
|
Keywords with `@` (e.g. `r"@myorg\.cl"`) are **employee email domains** → CRITICAL.
|
||||||
|
Keywords without `@` are plain domain matches → LOW baseline.
|
||||||
|
- **`WATCHED_CHANNELS`** — channel usernames or numeric IDs
|
||||||
|
- **`BACKFILL_LIMIT`** — past messages to scan per channel on startup
|
||||||
|
|
||||||
|
### 5. Install dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
# rarfile needs the unrar binary:
|
||||||
|
# Ubuntu/Debian: sudo apt install unrar
|
||||||
|
# macOS: brew install rar
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5a. Install tdl (strongly recommended)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -sSL https://raw.githubusercontent.com/iyear/tdl/main/scripts/install.sh | bash
|
||||||
|
tdl login -n ulpmon   # namespace must match TDL_NAMESPACE in config.py (currently "ulpmon")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. First run — complete Telegram auth
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python main.py --no-tui
|
||||||
|
# follow the phone + 2FA prompts once
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7. Run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python main.py # TUI mode (recommended)
|
||||||
|
python main.py --no-tui # plain CLI
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## TUI keybindings
|
||||||
|
|
||||||
|
| Key | Action |
|
||||||
|
|-----|--------|
|
||||||
|
| `s` | Search hits database |
|
||||||
|
| `h` | Browse hits by severity |
|
||||||
|
| `k` | Edit keyword patterns live |
|
||||||
|
| `c` | Clear logs |
|
||||||
|
| `r` | Refresh stats |
|
||||||
|
| `q` | Quit |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Output
|
||||||
|
|
||||||
|
| File | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| `data/hits.db` | SQLite — all hits with scores, severity, dedup flag |
|
||||||
|
| `data/hits.txt` | Human-readable grouped log |
|
||||||
|
| `data/hits.csv` | CSV — easy to pull into Excel / pandas |
|
||||||
|
| `data/logs/monitor.log` | Full run log |
|
||||||
|
|
||||||
|
Telegram alerts fire for CRITICAL / HIGH / MEDIUM only. LOW is stored silently.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- **Session files are sensitive** — equivalent to a logged-in account. Gitignored, never share.
|
||||||
|
- **Flood limits** — `FloodWaitError` is handled automatically.
|
||||||
|
- **Private channels** — your user account must already be a member.
|
||||||
100
config.py
Normal file
100
config.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
"""
|
||||||
|
config.py — Loads and validates all settings from .env
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
# -- Timeouts --
|
||||||
|
BOT_REPLY_TIMEOUT = 10
|
||||||
|
|
||||||
|
# ─── Telegram credentials ────────────────────────────────────────────────────
|
||||||
|
API_ID = int(os.environ["API_ID"])
|
||||||
|
API_HASH = os.environ["API_HASH"]
|
||||||
|
BOT_TOKEN = os.environ["BOT_TOKEN"]
|
||||||
|
NOTIFY_CHAT_ID = int(os.environ["NOTIFY_CHAT_ID"])
|
||||||
|
SESSION_NAME = os.getenv("SESSION_NAME", "monitor_session")
|
||||||
|
|
||||||
|
# ─── Target keywords ─────────────────────────────────────────────────────────
|
||||||
|
# Add your org's domains, email patterns, IP ranges, known usernames, etc.
|
||||||
|
# All patterns are case-insensitive regex.
|
||||||
|
TARGET_KEYWORDS: list[str] = [
|
||||||
|
r"sanatorioaleman\.cl",
|
||||||
|
r"@sanatorioaleman\.cl",
|
||||||
|
# r"192\.168\.10\.", # internal IP range example
|
||||||
|
# r"specificuser", # known internal usernames
|
||||||
|
]
|
||||||
|
|
||||||
|
# ─── Channels to watch ───────────────────────────────────────────────────────
|
||||||
|
# Use usernames (without @) or numeric channel IDs (-100xxxxxxxxxx)
|
||||||
|
WATCHED_CHANNELS: list[str | int] = [
|
||||||
|
#-1002230225603,
|
||||||
|
"cloudxlog",
|
||||||
|
#-1001967030016, # daisycloud
|
||||||
|
#"berserklogs", # berserklogs
|
||||||
|
#"BorwitaFreeLogs", # borwita
|
||||||
|
-1002748707556, # darkcloud
|
||||||
|
-1001684073398, # BHF Cloud
|
||||||
|
-1003163621939, # Wich Love from R
|
||||||
|
-1003611713618, # Khazan Cloud
|
||||||
|
-1003328682684, # LogsPlanet
|
||||||
|
-1003204260194, # JDP
|
||||||
|
-1002828367761, # HesoyamCloud
|
||||||
|
-1003513974925, # Slurm Logs
|
||||||
|
-1003599300787, # Arhont Corp
|
||||||
|
-1002582513379, # OnlyLogs
|
||||||
|
-1002788333372, # Ickis Cloud
|
||||||
|
#-1001234567890, # private channel by ID
|
||||||
|
]
|
||||||
|
|
||||||
|
# ─── File handling ───────────────────────────────────────────────────────────
# All runtime output lives under ./data so that it matches the documented
# project layout and the data/* entries in .gitignore. hits.txt (and the
# hits.csv derived from it) contain leaked credentials, so they must never
# end up in an un-ignored location.
# NOTE(review): assumes the code that consumes these paths creates missing
# parent directories — confirm against main.py.
TEMP_DIR = Path("./data/tmp")
HITS_FILE = Path("./data/hits.txt")
LOG_FILE = Path("./data/logs/monitor.log")

# Extensions to download and process
ALLOWED_EXTENSIONS = {".txt", ".zip", ".7z", ".rar"}

# Max file size to download (bytes). Default: 4 GB.
# Larger files are skipped to avoid abuse of your session.
MAX_FILE_SIZE = 4 * 1024 * 1024 * 1024  # 4 GB (Telegram Premium max)
|
||||||
|
|
||||||
|
# ─── Archive passwords to try ────────────────────────────────────────────────
|
||||||
|
ARCHIVE_PASSWORDS: list[bytes] = [
|
||||||
|
b"1234",
|
||||||
|
b"0000",
|
||||||
|
b"infected",
|
||||||
|
b"telegram",
|
||||||
|
b"password",
|
||||||
|
b"12345",
|
||||||
|
b"",
|
||||||
|
b"Borwita",
|
||||||
|
b"@WichLoveFromR",
|
||||||
|
]
|
||||||
|
|
||||||
|
# ─── Backfill settings ───────────────────────────────────────────────────────
|
||||||
|
# How many historical messages to scan per channel on startup (0 = skip backfill)
|
||||||
|
BACKFILL_LIMIT = 500
|
||||||
|
|
||||||
|
# ─── tdl downloader settings ─────────────────────────────────────────────────
|
||||||
|
# Namespace tdl was logged into. Run `tdl login` with no -n flag → namespace
|
||||||
|
# is "default". Run `tdl login -n foo` → namespace is "foo".
|
||||||
|
# Set to None to omit -n entirely (tdl will use "default" anyway).
|
||||||
|
TDL_NAMESPACE: str | None = "ulpmon"
|
||||||
|
|
||||||
|
# Parallel chunk workers per file (-t / --threads global flag)
|
||||||
|
TDL_THREADS = 8
|
||||||
|
|
||||||
|
# Max concurrent files per tdl invocation (-l / --limit global flag)
|
||||||
|
TDL_PERFILE = 4
|
||||||
|
|
||||||
|
# Max messages to batch into a single tdl invocation during backfill.
|
||||||
|
# tdl handles the parallelism internally via -l and -t.
|
||||||
|
TDL_AMOUNT = 4
|
||||||
|
|
||||||
|
# Whether to use a Telegram takeout session for downloads (lower flood limits).
|
||||||
|
# Takeout sessions are rate-limited differently — good for bulk backfill.
|
||||||
|
TDL_TAKEOUT = True
|
||||||
1
core/__init__.py
Normal file
1
core/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""core — Telegram I/O pipeline (scraper, downloader, processor, notifier)."""
|
||||||
68
core/bot_downloader.md
Normal file
68
core/bot_downloader.md
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
# core/bot_downloader.py
|
||||||
|
|
||||||
|
Handles "click to download" inline button flows. Some Telegram channels post files via a bot behind a button rather than directly attaching them.
|
||||||
|
|
||||||
|
## Public API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from core.bot_downloader import (
|
||||||
|
handle_bot_download_message,
|
||||||
|
has_download_button,
|
||||||
|
extract_password,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### `handle_bot_download_message(client, bot, msg, source_name, patterns, password=None)`
|
||||||
|
**async.** Full pipeline:
|
||||||
|
1. Detect download button
|
||||||
|
2. Click it (URL button → `/start payload` to the bot; callback button → `.click()`)
|
||||||
|
3. Wait up to `BOT_REPLY_TIMEOUT` seconds for the bot to send a file back
|
||||||
|
4. Hand each file response to `core.scraper.handle_message()`
|
||||||
|
|
||||||
|
### `has_download_button(msg) -> bool`
|
||||||
|
Returns `True` if the message contains a recognisable download button.
|
||||||
|
Checked in live handler and backfill before calling this module.
|
||||||
|
|
||||||
|
### `extract_password(msg) -> str | None`
|
||||||
|
Scans message text for `Pass: ...` / `Password: ...` / `Contraseña: ...` / `Contrasena: ...` / `Clave: ...` patterns (case-insensitive).
|
||||||
|
Returns the extracted password string, or `None`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Button detection
|
||||||
|
|
||||||
|
Recognised button text keywords (case-insensitive):
|
||||||
|
```
|
||||||
|
DOWNLOAD, DESCARGAR, GET FILE, GET PACK, ⬇, 📥
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## URL button flow (most common)
|
||||||
|
|
||||||
|
```
|
||||||
|
Button URL: https://t.me/SomeBot?start=ABC123
|
||||||
|
→ parse bot username + payload
|
||||||
|
→ client.send_message(bot_entity, "/start ABC123")
|
||||||
|
→ poll get_messages(bot_entity, limit=3) every 1s for BOT_REPLY_TIMEOUT seconds
|
||||||
|
→ return file messages found
|
||||||
|
```
|
||||||
|
|
||||||
|
## Callback button flow (fallback)
|
||||||
|
|
||||||
|
```
|
||||||
|
btn.click()
|
||||||
|
→ sleep 2s
|
||||||
|
→ get_messages(sender, limit=5)
|
||||||
|
→ return file messages found
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Constants
|
||||||
|
|
||||||
|
| Name | Value | Description |
|
||||||
|
|------|-------|-------------|
|
||||||
|
| `BOT_REPLY_TIMEOUT` | `10` | Seconds to wait for bot file reply |
|
||||||
|
| `DOWNLOAD_BUTTON_KEYWORDS` | see above | Button text triggers |
|
||||||
|
| `PASSWORD_PATTERN` | regex | Matches `Pass[word]: value` in message text |
|
||||||
161
core/bot_downloader.py
Normal file
161
core/bot_downloader.py
Normal file
@@ -0,0 +1,161 @@
|
|||||||
|
"""
|
||||||
|
bot_downloader.py — Handles "click to download" inline button flows.
|
||||||
|
|
||||||
|
Some Telegram channels post messages with a DOWNLOAD button that triggers
|
||||||
|
a bot to send you the actual file. This module simulates that click and
|
||||||
|
captures the bot's file response.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from telethon import TelegramClient
|
||||||
|
from telethon.tl.types import MessageMediaDocument, KeyboardButtonUrl
|
||||||
|
from telethon.errors import FloodWaitError
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
DOWNLOAD_BUTTON_KEYWORDS = ["DOWNLOAD", "DESCARGAR", "GET FILE", "GET PACK", "⬇", "📥"]
|
||||||
|
BOT_REPLY_TIMEOUT = 10
|
||||||
|
|
||||||
|
PASSWORD_PATTERN = re.compile(
|
||||||
|
r"(?:Pass|Password|Contraseña|Contrasena|Clave)[\s]*:[\s]*(.+)$",
|
||||||
|
re.IGNORECASE | re.MULTILINE
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Password extraction ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def extract_password(msg) -> str | None:
|
||||||
|
if not msg.text:
|
||||||
|
return None
|
||||||
|
match = PASSWORD_PATTERN.search(msg.text)
|
||||||
|
if match:
|
||||||
|
pwd = match.group(1).strip()
|
||||||
|
# Strip markdown formatting characters
|
||||||
|
pwd = pwd.strip("*`_~")
|
||||||
|
log.info(f" Found password in message: '{pwd}'")
|
||||||
|
return pwd
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Button detection ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def find_download_button(msg):
    """
    Look through the inline keyboard of ``msg`` for a download-style button.

    A button qualifies when its upper-cased label contains any entry of
    DOWNLOAD_BUTTON_KEYWORDS. Rows are scanned top to bottom, buttons left
    to right; the first qualifying button is returned, otherwise None.
    """
    keyboard = msg.buttons
    if not keyboard:
        return None

    for candidate in (button for row in keyboard for button in row):
        for keyword in DOWNLOAD_BUTTON_KEYWORDS:
            if keyword in candidate.text.upper():
                return candidate

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def has_download_button(msg) -> bool:
    """Tell whether ``find_download_button`` locates a button on ``msg``."""
    button = find_download_button(msg)
    return button is not None
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Click + wait flow ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def click_download_button(client: TelegramClient, msg) -> list:
    """
    Click the download button on ``msg`` and collect the file reply.

    Two button kinds are handled:

    * URL button (``https://t.me/<bot>?start=<payload>``): sends
      ``/start <payload>`` to the bot, then polls the bot chat once per
      second for up to ``BOT_REPLY_TIMEOUT`` seconds.
    * Any other button: calls ``btn.click()``, waits two seconds and reads
      the most recent messages from the original message's sender.

    Args:
        client: Telethon client used to message the bot and fetch replies.
        msg: The message carrying the inline keyboard.

    Returns:
        Messages whose media is a ``MessageMediaDocument``; an empty list
        when there is no download button, the URL is not recognised, a
        Telegram call fails, or no file arrives in time.
    """
    btn = find_download_button(msg)
    if not btn:
        return []

    log.info(f" Clicking button: '{btn.text}'")

    # ── URL button (most common) ───────────────────────────────────────────
    if isinstance(btn.button, KeyboardButtonUrl):
        url = btn.button.url  # e.g. https://t.me/SomeBot?start=ABC123

        match = re.search(r"t\.me/([A-Za-z0-9_]+)\?start=(.+)", url)
        if not match:
            log.warning(f" Unrecognised URL format: {url}")
            return []

        bot_username, payload = match.group(1), match.group(2)
        log.info(f" → Messaging @{bot_username} with /start {payload}")

        try:
            bot_entity = await client.get_entity(bot_username)
            sent = await client.send_message(bot_entity, f"/start {payload}")
        except Exception as e:
            log.error(f" Failed to message bot: {e}")
            return []

        # Poll for reply
        log.info(f" Waiting up to {BOT_REPLY_TIMEOUT}s for bot reply...")
        for _ in range(BOT_REPLY_TIMEOUT):
            await asyncio.sleep(1)
            try:
                # min_id limits the result to messages newer than our own
                # /start, so a file left in the chat by an earlier request
                # is never mistaken for the reply to this one.
                recent = await client.get_messages(
                    bot_entity, limit=3, min_id=sent.id
                )
            except Exception as e:
                # A failed poll is not a timeout; report it as what it is.
                log.warning(f" Poll error: {e}")
                return []

            files = [m for m in recent if isinstance(m.media, MessageMediaDocument)]
            if files:
                log.info(" ✓ Got file from bot.")
                return files

        log.warning(f" Bot did not reply within {BOT_REPLY_TIMEOUT}s.")
        return []

    # ── Callback button (less common) ──────────────────────────────────────
    try:
        await btn.click()
        await asyncio.sleep(2)
    except Exception as e:
        log.error(f" Callback click failed: {e}")
        return []

    try:
        # NOTE(review): this reads the latest messages of whoever sent the
        # button message; presumably that chat is where the file shows up —
        # confirm, since older documents in it would be returned as well.
        sender = await msg.get_sender()
        recent = await client.get_messages(sender, limit=5)
        return [m for m in recent if isinstance(m.media, MessageMediaDocument)]
    except Exception as e:
        log.warning(f" Fallback poll failed: {e}")
        return []
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Main entry point ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def handle_bot_download_message(
    client: TelegramClient,
    bot: TelegramClient,
    msg,
    source_name: str,
    patterns,
    password: str | None = None,
) -> None:
    """
    Process a message that offers its file behind a download button.

    Steps:
    1. Bail out unless the message has a download button
    2. Click the button and collect the file messages sent back
    3. Pass every file message to core.scraper.handle_message(), tagging
       the source as "<source_name>[bot]" and forwarding ``password``
    """
    if not has_download_button(msg):
        return

    log.info(f"[BotDL] Download button detected in {source_name}")

    file_messages = await click_download_button(client, msg)
    if not file_messages:
        log.warning(f"[BotDL] No file received for message in {source_name}.")
        return

    # Imported here rather than at module level — presumably to avoid a
    # circular import with core.scraper; confirm before moving it.
    from core.scraper import handle_message

    tagged_source = f"{source_name}[bot]"
    for reply in file_messages:
        if hasattr(reply.media, 'document'):
            attrs = getattr(reply.media.document, 'attributes', [])
        else:
            attrs = 'none'
        log.info(f" [BotDL] Response media type: {type(reply.media).__name__}, attrs: {attrs}")
        await handle_message(client, bot, reply, tagged_source, patterns, password=password)
|
||||||
67
core/notifier.md
Normal file
67
core/notifier.md
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
# core/notifier.py
|
||||||
|
|
||||||
|
Scores hits, deduplicates, persists to disk and DB, sends Telegram alerts.
|
||||||
|
|
||||||
|
## Public API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from core.notifier import notify, send_status
|
||||||
|
```
|
||||||
|
|
||||||
|
### `notify(bot, hits: list[str], source: str, filename: str)`
|
||||||
|
**async.** Full notification pipeline:
|
||||||
|
1. `score_hits(hits)` → `list[ScoredHit]`
|
||||||
|
2. Deduplicate via SHA-256 hashes (`data/dedup.json`)
|
||||||
|
3. `insert_hits()` into SQLite for new + dupes (flagged accordingly)
|
||||||
|
4. `write_hits()` → append to `data/hits.txt`
|
||||||
|
5. `write_hits_csv()` → append to `data/hits.csv`
|
||||||
|
6. `send_alert()` → Telegram message for CRITICAL/HIGH/MEDIUM only
|
||||||
|
7. Post `EvHit` events onto the TUI bus for each new hit
|
||||||
|
|
||||||
|
### `send_status(bot, message: str)`
|
||||||
|
**async.** Sends a plain Markdown message to `config.NOTIFY_CHAT_ID`. Used for startup/status notifications.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Internal functions
|
||||||
|
|
||||||
|
| Function | Description |
|
||||||
|
|----------|-------------|
|
||||||
|
| `deduplicate(hits)` | Returns `(new_hits, dupe_hits)`; updates `data/dedup.json` |
|
||||||
|
| `write_hits(scored_hits, source)` | Appends grouped human-readable block to `data/hits.txt` |
|
||||||
|
| `write_hits_csv(scored_hits, source, filename)` | Appends rows to `data/hits.csv`; writes header on first call |
|
||||||
|
| `send_alert(bot, scored_hits, source, filename)` | Sends Telegram message grouped by severity; skips if all LOW |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Output files
|
||||||
|
|
||||||
|
| File | Format | Notes |
|
||||||
|
|------|--------|-------|
|
||||||
|
| `data/hits.txt` | Plain text, grouped by severity | Human-readable, append-only |
|
||||||
|
| `data/hits.csv` | CSV with header | Columns: `timestamp, severity, score, url, username, password, reasons, source, filename` |
|
||||||
|
| `data/dedup.json` | JSON array of SHA-256 hex strings | Hashes of `line.strip().lower()` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Alert behaviour
|
||||||
|
|
||||||
|
- CRITICAL / HIGH / MEDIUM → Telegram alert sent immediately
|
||||||
|
- LOW → stored in DB + files, **no** Telegram alert
|
||||||
|
- Duplicates → stored in DB with `seen_before=1`, no alert, no file write
|
||||||
|
|
||||||
|
## Telegram alert format
|
||||||
|
|
||||||
|
```
|
||||||
|
🚨 Credential hit(s) detected
|
||||||
|
📁 `filename`
|
||||||
|
📢 `source`
|
||||||
|
🕐 `timestamp`
|
||||||
|
|
||||||
|
Summary: 🔴 N 🟠 N 🟡 N 🟢 N
|
||||||
|
|
||||||
|
🔴 CRITICAL (N)
|
||||||
|
`url:user:pass`
|
||||||
|
↳ reason | reason
|
||||||
|
... (up to 10 per severity; remainder counted)
|
||||||
|
```
|
||||||
248
core/notifier.py
Normal file
248
core/notifier.py
Normal file
@@ -0,0 +1,248 @@
|
|||||||
|
"""
|
||||||
|
notifier.py — Persists hits to disk and sends Telegram bot alerts.
|
||||||
|
|
||||||
|
Includes:
|
||||||
|
- Severity scoring via scorer.py
|
||||||
|
- Deduplication: same credential never written or alerted twice
|
||||||
|
- SQLite storage via database.py
|
||||||
|
- hits.txt kept as a human-readable backup
|
||||||
|
- Telegram alerts grouped by severity
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from telethon import TelegramClient
|
||||||
|
|
||||||
|
import csv
|
||||||
|
|
||||||
|
from config import HITS_FILE, NOTIFY_CHAT_ID
|
||||||
|
from utils.scorer import score_hits, summarize, CRITICAL, HIGH, MEDIUM, LOW, SEVERITY_EMOJI
|
||||||
|
from utils.database import insert_hits
|
||||||
|
from tui import events as bus
|
||||||
|
|
||||||
|
HITS_CSV = HITS_FILE.with_suffix(".csv")
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MAX_PREVIEW = 10 # hits to show per severity group in alert
|
||||||
|
DEDUP_FILE = Path("./data/dedup.json")
|
||||||
|
|
||||||
|
# Only alert immediately for these severities — LOW hits are silent
|
||||||
|
ALERT_SEVERITIES = {CRITICAL, HIGH, MEDIUM}
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Deduplication ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _hash(line: str) -> str:
    """Return the SHA-256 hex digest used as the dedup key for *line*.

    Surrounding whitespace is stripped and the text is lower-cased before
    hashing, so lines that differ only in padding or letter case share a key.
    """
    normalised = line.strip().lower()
    digest = hashlib.sha256(normalised.encode())
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _load_seen_hashes() -> set:
    """Load the set of already-seen hit digests from ``DEDUP_FILE``.

    Returns an empty set when the file does not exist yet. An unreadable or
    corrupt file also yields an empty set (dedup stays best-effort), but the
    problem is logged so that lost dedup state does not go unnoticed.
    """
    try:
        with open(DEDUP_FILE, "r", encoding="utf-8") as f:
            return set(json.load(f))
    except FileNotFoundError:
        # First run: nothing has been recorded yet.
        return set()
    except (OSError, ValueError, TypeError) as e:
        # ValueError covers invalid JSON and undecodable bytes; TypeError
        # covers a JSON payload that cannot be turned into a set.
        log.warning(f"Could not load dedup file, starting empty: {e}")
        return set()
|
||||||
|
|
||||||
|
|
||||||
|
def _save_seen_hashes(seen: set) -> None:
    """Persist the set of seen hit digests to ``DEDUP_FILE`` as a JSON array.

    The parent directory is created on demand. The data is written to a
    sibling temporary file and then moved over the target, so an interrupted
    write cannot leave a truncated dedup file behind. Failures are logged and
    otherwise ignored: losing dedup state must not abort the pipeline.
    """
    tmp_path = DEDUP_FILE.with_name(DEDUP_FILE.name + ".tmp")
    try:
        DEDUP_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(list(seen), f)
        # Path.replace overwrites the destination in a single rename step.
        tmp_path.replace(DEDUP_FILE)
    except Exception as e:
        log.warning(f"Could not save dedup file: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def deduplicate(hits: list) -> tuple[list, list]:
    """Split scored hits into never-seen-before hits and duplicates.

    Accepts a list of ScoredHit objects (only their ``raw`` attribute is read
    here) and returns ``(new_hits, dupe_hits)``. A hit counts as a duplicate
    when its digest was recorded by an earlier call *or* already occurred
    earlier in this same batch, so one credential repeated inside a single
    file is written and alerted only once. Newly seen digests are persisted
    via ``_save_seen_hashes``.
    """
    seen = _load_seen_hashes()
    new_hits = []
    dupe_hits = []
    new_hashes = set()

    for h in hits:
        digest = _hash(h.raw)
        # Check the current batch as well as the persisted history.
        if digest in seen or digest in new_hashes:
            dupe_hits.append(h)
        else:
            new_hits.append(h)
            new_hashes.add(digest)

    if new_hashes:
        seen.update(new_hashes)
        _save_seen_hashes(seen)

    log.info(
        f" Dedup: {len(hits)} raw hit(s) → "
        f"{len(new_hits)} new, {len(dupe_hits)} duplicate(s)"
    )
    return new_hits, dupe_hits
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _timestamp() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DD HH:MM:SS UTC``."""
    now_utc = datetime.now(tz=timezone.utc)
    return f"{now_utc:%Y-%m-%d %H:%M:%S} UTC"
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Output ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def write_hits(scored_hits: list, source: str) -> None:
    """Append a human-readable block for *scored_hits* to ``HITS_FILE``.

    The block starts with a header (source, timestamp, per-severity counts)
    and then lists the hits grouped by severity from CRITICAL down to LOW,
    each followed by the reasons for its score. Empty groups are omitted.
    """
    HITS_FILE.parent.mkdir(parents=True, exist_ok=True)
    counts = summarize(scored_hits)
    rule = "=" * 60

    with open(HITS_FILE, "a", encoding="utf-8") as out:
        out.write(
            f"\n{rule}\n"
            f"Source : {source}\n"
            f"Time : {_timestamp()}\n"
            f"Hits : {len(scored_hits)} "
            f"(CRITICAL={counts[CRITICAL]} HIGH={counts[HIGH]} "
            f"MEDIUM={counts[MEDIUM]} LOW={counts[LOW]})\n"
            f"{rule}\n"
        )

        for level in (CRITICAL, HIGH, MEDIUM, LOW):
            members = [hit for hit in scored_hits if hit.severity == level]
            if members:
                out.write(f"\n{SEVERITY_EMOJI[level]} {level} ({len(members)})\n")
                for hit in members:
                    out.write(f" {hit.raw}\n")
                    out.write(f" → {' | '.join(hit.reasons)}\n")

    log.info(f" Wrote {len(scored_hits)} hit(s) to {HITS_FILE}")
|
||||||
|
|
||||||
|
|
||||||
|
def write_hits_csv(scored_hits: list, source: str, filename: str) -> None:
    """Append one CSV row per hit to ``HITS_CSV``.

    The header row is written when the file does not exist yet or exists but
    is still empty (for example after being truncated by hand), so the CSV
    always starts with its column names. All rows of one call share the same
    timestamp.
    """
    HITS_CSV.parent.mkdir(parents=True, exist_ok=True)
    # An existing zero-byte file needs the header just like a missing one.
    write_header = not HITS_CSV.exists() or HITS_CSV.stat().st_size == 0
    timestamp = _timestamp()

    with open(HITS_CSV, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow([
                "timestamp", "severity", "score", "url", "username",
                "password", "reasons", "source", "filename",
            ])
        writer.writerows(
            [
                timestamp, h.severity, h.score,
                h.url or "", h.username or "", h.password or "",
                " | ".join(h.reasons), source, filename,
            ]
            for h in scored_hits
        )

    log.info(f" Wrote {len(scored_hits)} hit(s) to {HITS_CSV}")
|
||||||
|
|
||||||
|
|
||||||
|
async def send_alert(
    bot: TelegramClient,
    scored_hits: list,
    source: str,
    filename: str,
) -> None:
    """Send a Telegram alert for *scored_hits*, grouped by severity.

    Only CRITICAL, HIGH and MEDIUM hits are listed; if there are none, no
    message is sent. The summary line still counts every severity. At most
    ``MAX_PREVIEW`` hits are shown per severity, with the remainder counted.

    Backticks in ``filename``, ``source`` and the raw hit text are replaced so
    they cannot terminate the Markdown code spans they are embedded in. The
    message is also kept within Telegram's message size limit: once the
    budget is used up, remaining hits are dropped and a truncation notice is
    appended, instead of the whole alert failing to send.

    Send failures are logged and swallowed.
    """
    alertable = [h for h in scored_hits if h.severity in ALERT_SEVERITIES]
    if not alertable:
        log.info(" No alertable hits (all LOW) — skipping Telegram notification.")
        return

    def _code(text) -> str:
        # A backtick would close the surrounding `code` span early.
        return str(text).replace("`", "'")

    summary = summarize(scored_hits)
    lines = [
        "🚨 *Credential hit(s) detected*",
        "",
        f"📁 `{_code(filename)}`",
        f"📢 `{_code(source)}`",
        f"🕐 `{_timestamp()}`",
        "",
        "*Summary:*",
        f"🔴 CRITICAL: `{summary[CRITICAL]}` "
        f"🟠 HIGH: `{summary[HIGH]}` "
        f"🟡 MEDIUM: `{summary[MEDIUM]}` "
        f"🟢 LOW: `{summary[LOW]}`",
    ]

    # Telegram rejects text messages longer than 4096 characters. Measuring
    # the raw Markdown is conservative, since markup is stripped on parsing.
    max_len = 4096
    notice = "_...message truncated (Telegram length limit)_"
    budget = max_len - len(notice) - 1
    used = sum(len(line) + 1 for line in lines)
    truncated = False

    def _add(*new_lines: str) -> bool:
        """Append lines if they still fit the budget; report success."""
        nonlocal used
        cost = sum(len(line) + 1 for line in new_lines)
        if used + cost > budget:
            return False
        lines.extend(new_lines)
        used += cost
        return True

    for severity in [CRITICAL, HIGH, MEDIUM]:
        group = [h for h in alertable if h.severity == severity]
        if not group:
            continue
        emoji = SEVERITY_EMOJI[severity]
        if not _add(f"\n{emoji} *{severity}* ({len(group)})"):
            truncated = True
            break
        for h in group[:MAX_PREVIEW]:
            # A hit and its reasons are added together or not at all.
            if not _add(f"`{_code(h.raw)}`", f"_↳ {' | '.join(h.reasons)}_"):
                truncated = True
                break
        if truncated:
            break
        if len(group) > MAX_PREVIEW:
            _add(f"_...and {len(group) - MAX_PREVIEW} more_")

    if truncated:
        lines.append(notice)

    try:
        await bot.send_message(NOTIFY_CHAT_ID, "\n".join(lines), parse_mode="markdown")
    except Exception as e:
        log.error(f"Failed to send Telegram alert: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Main entry point ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def notify(bot: TelegramClient, hits: list[str], source: str, filename: str) -> None:
    """Run the full notification pipeline for the raw hit lines of one file.

    Steps, in order:
      1. Score all hits.
      2. Split them into new hits and duplicates.
      3. Insert both groups into the database, flagged via ``seen_before``.
      4. Post each new hit to the TUI event bus.
      5. Append the new hits to the text and CSV hit files.
      6. Send a Telegram alert for the new hits.

    Returns early when *hits* is empty, and after step 3 when every hit is a
    duplicate.
    """
    if not hits:
        return

    scored = score_hits(hits)
    log.info(f" Scored {len(scored)} hit(s) — {summarize(scored)}")

    new_hits, dupe_hits = deduplicate(scored)

    # Everything goes into the DB; only the seen_before flag differs.
    for group, already_seen in ((new_hits, False), (dupe_hits, True)):
        if group:
            insert_hits(group, source, filename, seen_before=already_seen)

    if not new_hits:
        log.info(" All hits already seen before — no alert sent.")
        return

    for hit in new_hits:
        event = bus.EvHit(
            severity=hit.severity,
            raw=hit.raw,
            source=source,
            filename=filename,
            reasons=hit.reasons,
        )
        bus.post(event)

    write_hits(new_hits, source)
    write_hits_csv(new_hits, source, filename)
    await send_alert(bot, new_hits, source, filename)
|
||||||
|
|
||||||
|
|
||||||
|
async def send_status(bot: TelegramClient, message: str) -> None:
    """Send *message* to ``NOTIFY_CHAT_ID`` with Markdown parsing enabled.

    Delivery is best-effort: any failure is logged and not re-raised.
    """
    try:
        await bot.send_message(
            NOTIFY_CHAT_ID,
            message,
            parse_mode="markdown",
        )
    except Exception as err:
        log.error(f"Failed to send status message: {err}")
|
||||||
69
core/processor.md
Normal file
69
core/processor.md
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
# core/processor.py
|
||||||
|
|
||||||
|
Archive extraction and hit searching. No Telegram deps, no async.
|
||||||
|
|
||||||
|
## Public API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from core.processor import compile_patterns, process_file
|
||||||
|
```
|
||||||
|
|
||||||
|
### `compile_patterns(keywords: list[str]) -> list[re.Pattern]`
|
||||||
|
Compiles a list of keyword strings into case-insensitive regex patterns.
|
||||||
|
Call once at startup; pass the result everywhere patterns are needed.
|
||||||
|
|
||||||
|
```python
|
||||||
|
patterns = compile_patterns(config.TARGET_KEYWORDS)
|
||||||
|
```
|
||||||
|
|
||||||
|
### `process_file(filepath: Path, patterns, password=None) -> list[str]`
|
||||||
|
Full pipeline: unpack → search each `.txt` → recurse into nested archives → clean up everything.
|
||||||
|
Returns list of matching raw lines (hits). Deletes the original file and all extracted contents on completion.
|
||||||
|
|
||||||
|
```python
|
||||||
|
hits = process_file(Path("data/tmp/combo.zip"), patterns, password="infected")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Internal functions
|
||||||
|
|
||||||
|
| Function | Signature | Description |
|
||||||
|
|----------|-----------|-------------|
|
||||||
|
| `search_file` | `(filepath, patterns) -> list[str]` | Stream-reads `.txt` line by line; ignores encoding errors |
|
||||||
|
| `unpack` | `(filepath, extra_password) -> (files, extract_dir\|None)` | Dispatches to correct extractor; plain `.txt` returned as-is |
|
||||||
|
| `extract_zip` | `(filepath, dest, extra_password)` | Tries no password first, then `ARCHIVE_PASSWORDS` list |
|
||||||
|
| `extract_7z` | `(filepath, dest, extra_password)` | Requires `py7zr`; skips if not installed |
|
||||||
|
| `extract_rar` | `(filepath, dest, extra_password)` | Requires `rarfile` + `unrar` binary |
|
||||||
|
| `_try_passwords` | `(extract_fn, passwords)` | Iterates password list, stops on first success |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Supported formats
|
||||||
|
|
||||||
|
| Extension | Library | Notes |
|
||||||
|
|-----------|---------|-------|
|
||||||
|
| `.txt` | built-in | Streamed line by line; never loaded fully into memory |
|
||||||
|
| `.zip` | `zipfile` | stdlib |
|
||||||
|
| `.7z` | `py7zr` | optional; skipped if not installed |
|
||||||
|
| `.rar` | `rarfile` | optional; requires `unrar` system binary |
|
||||||
|
|
||||||
|
Nested archives are recursed **one level** only.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Password order
|
||||||
|
|
||||||
|
1. `extra_password` (from message/channel carry-forward) — tried first
|
||||||
|
2. `config.ARCHIVE_PASSWORDS` — tried in order
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Cleanup guarantee
|
||||||
|
|
||||||
|
`process_file` always deletes:
|
||||||
|
- Extracted individual files
|
||||||
|
- Extract subdirectory
|
||||||
|
- Original downloaded file
|
||||||
|
|
||||||
|
Even if no hits are found.
|
||||||
233
core/processor.py
Normal file
233
core/processor.py
Normal file
@@ -0,0 +1,233 @@
|
|||||||
|
"""
|
||||||
|
processor.py — Archive extraction and hit searching logic.
|
||||||
|
|
||||||
|
Supports: .txt, .zip, .7z, .rar
|
||||||
|
Stream-processes files line by line — safe for large combo lists.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
import re
import shutil
import zipfile
from pathlib import Path

# Both archive back-ends are optional: when a package is missing, the
# matching HAS_* flag is False and that archive type is skipped at runtime.
try:
    import py7zr
    HAS_7Z = True
except ImportError:
    HAS_7Z = False

try:
    import rarfile
except ImportError:
    HAS_RAR = False
else:
    # Configure the extractor only once the import is known to have worked;
    # an unconditional top-level import would defeat the guard above.
    rarfile.UNRAR_TOOL = "unrar"
    HAS_RAR = True
|
||||||
|
|
||||||
|
from config import ARCHIVE_PASSWORDS
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Searching ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compile_patterns(keywords: list[str]) -> list[re.Pattern]:
    """Compile each keyword into a case-insensitive regular expression.

    Keywords are passed to ``re.compile`` unchanged, so regex metacharacters
    in a keyword keep their special meaning.
    """
    compiled: list[re.Pattern] = []
    for keyword in keywords:
        compiled.append(re.compile(keyword, flags=re.IGNORECASE))
    return compiled
|
||||||
|
|
||||||
|
|
||||||
|
def search_file(filepath: Path, patterns: list[re.Pattern]) -> list[str]:
    """Return the stripped lines of *filepath* that match any of *patterns*.

    The file is read one line at a time as UTF-8 with undecodable bytes
    dropped. Blank lines are skipped. If the file cannot be read, a warning
    is logged and the hits collected so far are returned.
    """
    matches: list[str] = []
    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as handle:
            for raw_line in handle:
                candidate = raw_line.strip()
                if not candidate:
                    continue
                for pattern in patterns:
                    if pattern.search(candidate):
                        matches.append(candidate)
                        break  # one match is enough for this line
    except Exception as e:
        log.warning(f"Could not read {filepath.name}: {e}")
    return matches
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Extraction ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _try_passwords(extract_fn, passwords: list[bytes]) -> bool:
    """Call *extract_fn* with each password until one call does not raise.

    Returns True as soon as an attempt succeeds (later passwords are not
    tried) and False when every attempt raised or the list is empty.
    """
    def _unlocks(candidate) -> bool:
        try:
            extract_fn(candidate)
        except Exception:
            return False
        return True

    # any() short-circuits, so extraction stops at the first success.
    return any(_unlocks(candidate) for candidate in passwords)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_zip(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
    """Extract a ZIP archive into *dest* and return the extracted files.

    Extraction is first attempted without a password. If that raises
    ``RuntimeError``, *extra_password* (when given) is tried first, followed
    by every entry of ``ARCHIVE_PASSWORDS`` in order.

    Returns every file found under *dest* afterwards, or an empty list when
    the archive is invalid, cannot be unlocked, or extraction fails. Problems
    are logged, never raised.
    """
    # Candidate list: the caller-supplied password takes priority.
    passwords = ARCHIVE_PASSWORDS.copy()
    if extra_password:
        passwords.insert(0, extra_password.encode())
    extracted: list[Path] = []
    try:
        with zipfile.ZipFile(filepath) as zf:
            def try_extract(pwd: bytes):
                zf.extractall(dest, pwd=pwd or None)

            try:
                zf.extractall(dest)
            except RuntimeError:
                log.info(" ZIP is password-protected, trying common passwords...")
                # Use the merged list so extra_password is actually tried.
                if not _try_passwords(try_extract, passwords):
                    log.warning(f" Could not unlock {filepath.name} — skipping.")
                    return []

        extracted = [p for p in dest.rglob("*") if p.is_file()]
    except zipfile.BadZipFile:
        log.warning(f" {filepath.name} is not a valid ZIP.")
    except Exception as e:
        log.warning(f" ZIP extraction error on {filepath.name}: {e}")
    return extracted
|
||||||
|
|
||||||
|
|
||||||
|
def extract_7z(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
    """Extract a 7z archive into *dest* and return the extracted files.

    Returns an empty list right away when ``py7zr`` is not installed.
    Otherwise extraction is attempted without a password; if ``py7zr``
    reports that one is required, *extra_password* (when given) is tried
    first, followed by every entry of ``ARCHIVE_PASSWORDS`` in order.

    Returns every file found under *dest* afterwards, or an empty list when
    the archive cannot be unlocked or extraction fails. Problems are logged,
    never raised.
    """
    if not HAS_7Z:
        log.warning("py7zr not installed — skipping .7z file.")
        return []
    extracted: list[Path] = []
    # Candidate list: the caller-supplied password takes priority.
    passwords = ARCHIVE_PASSWORDS.copy()
    if extra_password:
        passwords.insert(0, extra_password.encode())

    try:
        # Try without password first
        try:
            with py7zr.SevenZipFile(filepath, mode="r") as z:
                z.extractall(dest)
        except py7zr.exceptions.PasswordRequired:
            log.info(" 7z is password-protected, trying common passwords...")
            success = False
            # Iterate the merged list so extra_password is actually tried.
            for pwd in passwords:
                try:
                    # Candidates are bytes; py7zr is handed a str here.
                    with py7zr.SevenZipFile(filepath, mode="r", password=pwd.decode()) as z:
                        z.extractall(dest)
                    success = True
                    break
                except Exception:
                    continue
            if not success:
                log.warning(f" Could not unlock {filepath.name} — skipping.")
                return []

        extracted = [p for p in dest.rglob("*") if p.is_file()]
    except Exception as e:
        log.warning(f" 7z extraction error on {filepath.name}: {e}")
    return extracted
|
||||||
|
|
||||||
|
|
||||||
|
def extract_rar(filepath: Path, dest: Path, extra_password: str | None = None) -> list[Path]:
    """Extract a RAR archive into *dest* and return the extracted files.

    Returns an empty list right away when ``rarfile`` is not installed.
    Otherwise extraction is attempted without a password. A ``BadRarFile``
    error aborts; any other error is treated as "probably password-protected"
    and *extra_password* (when given) is tried first, followed by every entry
    of ``ARCHIVE_PASSWORDS`` in order.

    Returns every file found under *dest* afterwards, or an empty list when
    the archive is invalid, cannot be unlocked, or extraction fails. Problems
    are logged, never raised.
    """
    if not HAS_RAR:
        log.warning("rarfile not installed — skipping .rar file.")
        return []

    # Candidate list: the caller-supplied password takes priority.
    passwords = ARCHIVE_PASSWORDS.copy()
    if extra_password:
        passwords.insert(0, extra_password.encode())
    extracted: list[Path] = []
    try:
        with rarfile.RarFile(filepath) as rf:
            def try_extract(pwd: bytes):
                # Candidates are bytes; rarfile is handed a str here.
                rf.extractall(dest, pwd=pwd.decode() if pwd else None)

            try:
                rf.extractall(dest)
            except rarfile.BadRarFile:
                log.warning(f" {filepath.name} is not a valid RAR.")
                return []
            except Exception:
                log.info(" RAR may be password-protected, trying common passwords...")
                # Use the merged list so extra_password is actually tried.
                if not _try_passwords(try_extract, passwords):
                    log.warning(f" Could not unlock {filepath.name} — skipping.")
                    return []

        extracted = [p for p in dest.rglob("*") if p.is_file()]
    except Exception as e:
        log.warning(f" RAR extraction error on {filepath.name}: {e}")
    return extracted
|
||||||
|
|
||||||
|
|
||||||
|
def unpack(filepath: Path, extra_password: str | None = None) -> tuple[list[Path], Path | None]:
|
||||||
|
"""
|
||||||
|
Unpacks an archive into a sibling directory.
|
||||||
|
Returns (list of extracted files, extract_dir or None).
|
||||||
|
If it's not an archive, returns ([filepath], None).
|
||||||
|
"""
|
||||||
|
suffix = filepath.suffix.lower()
|
||||||
|
extract_dir = filepath.parent / filepath.stem
|
||||||
|
|
||||||
|
if suffix == ".zip":
|
||||||
|
extract_dir.mkdir(exist_ok=True)
|
||||||
|
files = extract_zip(filepath, extract_dir, extra_password)
|
||||||
|
return files, extract_dir
|
||||||
|
|
||||||
|
elif suffix == ".7z":
|
||||||
|
extract_dir.mkdir(exist_ok=True)
|
||||||
|
files = extract_7z(filepath, extract_dir, extra_password)
|
||||||
|
return files, extract_dir
|
||||||
|
|
||||||
|
elif suffix == ".rar":
|
||||||
|
extract_dir.mkdir(exist_ok=True)
|
||||||
|
files = extract_rar(filepath, extract_dir, extra_password)
|
||||||
|
return files, extract_dir
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Plain file — return as-is, no extract dir to clean up
|
||||||
|
return [filepath], None
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Main entry point ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def process_file(
    filepath: Path,
    patterns,
    password: str | None = None,
    *,
    _depth: int = 0,
) -> list[str]:
    """Unpack *filepath*, search every ``.txt`` inside it, then clean up.

    Args:
        filepath: Downloaded file, either a plain file or a supported archive.
        patterns: Compiled patterns handed to ``search_file``.
        password: Optional archive password, tried before the common list. It
            is also forwarded to nested archives.
        _depth: Internal recursion counter; callers should not pass it.

    Returns:
        All matching lines found in the file and in archives nested one level
        below it. Archives nested deeper are skipped (and still deleted), so a
        crafted archive cannot force unbounded recursion.

    Extracted files, the extract directory and *filepath* itself are removed
    in a ``finally`` block, so cleanup also happens when processing raises.
    """
    log.info(f" Processing: {filepath.name}")
    all_hits: list[str] = []
    extract_dir: Path | None = None

    try:
        files, extract_dir = unpack(filepath, extra_password=password)

        for f in files:
            suffix = f.suffix.lower()
            if suffix == ".txt":
                hits = search_file(f, patterns)
                if hits:
                    log.info(f" ✓ {len(hits)} hit(s) in {f.name}")
                    all_hits.extend(hits)

            # Nested archives — recurse one level
            elif suffix in {".zip", ".7z", ".rar"} and f != filepath:
                if _depth < 1:
                    log.info(f" → Nested archive: {f.name}")
                    nested_hits = process_file(f, patterns, password, _depth=_depth + 1)
                    all_hits.extend(nested_hits)
                    continue  # process_file already cleaned up f
                log.warning(f" → Skipping deeper nested archive: {f.name}")

            # Clean up extracted file
            try:
                f.unlink(missing_ok=True)
            except OSError:
                pass  # best-effort; the rmtree below retries for archives
    finally:
        # Clean up extract dir
        if extract_dir is not None and extract_dir.exists():
            shutil.rmtree(extract_dir, ignore_errors=True)

        # Clean up original download
        try:
            filepath.unlink(missing_ok=True)
        except OSError:
            pass

    return all_hits
|
||||||
65
core/scraper.md
Normal file
65
core/scraper.md
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
# core/scraper.py
|
||||||
|
|
||||||
|
Telethon user-client layer. Handles live listening, backfill, and the single-message download pipeline.
|
||||||
|
|
||||||
|
## Public API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from core.scraper import handle_message, backfill_all, register_handlers, warm_entity_cache
|
||||||
|
```
|
||||||
|
|
||||||
|
### `handle_message(client, bot, msg, source_name, patterns, password=None)`
|
||||||
|
**async.** Full pipeline for one document message:
|
||||||
|
1. Extract filename + size, check allowlist + size guard
|
||||||
|
2. Check `utils.cache` — skip if already seen
|
||||||
|
3. Try `tdl` download → Telethon fallback
|
||||||
|
4. `core.processor.process_file()` → hits
|
||||||
|
5. `core.notifier.notify()` if hits found
|
||||||
|
6. `utils.cache.mark_seen()`
|
||||||
|
|
||||||
|
Called by: live handler, `bot_downloader`, backfill fallback path.
|
||||||
|
|
||||||
|
### `backfill_all(client, bot, patterns)`
|
||||||
|
**async.** Iterates `config.WATCHED_CHANNELS`, calls `backfill_channel()` for each.
|
||||||
|
No-op if `config.BACKFILL_LIMIT == 0`.
|
||||||
|
|
||||||
|
### `register_handlers(client, bot, patterns)`
|
||||||
|
Registers a `NewMessage` Telethon event handler on `config.WATCHED_CHANNELS`.
|
||||||
|
Used in **CLI mode only** (`--no-tui`). The TUI manages its own handler via `_make_handler()` in `tui/app.py`.
|
||||||
|
|
||||||
|
### `warm_entity_cache(client)`
|
||||||
|
**async.** Iterates `client.iter_dialogs()` so Telethon caches entity mappings.
|
||||||
|
Must be called before using raw numeric channel IDs.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Internal functions
|
||||||
|
|
||||||
|
| Function | Description |
|
||||||
|
|----------|-------------|
|
||||||
|
| `get_filename(msg)` | Extracts filename from `MessageMediaDocument`; falls back to `{msg_id}{ext}` from MIME |
|
||||||
|
| `get_filesize(msg)` | Returns document size in bytes |
|
||||||
|
| `is_processable(filename, size)` | Checks extension allowlist + size limit; returns `(bool, reason)` |
|
||||||
|
| `_make_dest(msg, filename)` | Resolves temp path, handles collision with `{msg_id}_{filename}` |
|
||||||
|
| `_telethon_download(client, msg, dest, ...)` | Telethon fallback with tqdm progress + flood-wait handling. Posts `EvDownload*` bus events |
|
||||||
|
| `backfill_channel(client, bot, channel, patterns, limit)` | Scans history with password carry-forward; batches via tdl |
|
||||||
|
| `_process_batch(client, bot, batch, patterns)` | One tdl invocation for up to `TDL_AMOUNT` messages; per-file Telethon fallback |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Password carry-forward (backfill)
|
||||||
|
|
||||||
|
Channels often post the archive password as a separate text message.
|
||||||
|
`backfill_channel` iterates newest→oldest, carrying `last_password` so both older and newer file messages in the same scan pick it up.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Download strategy
|
||||||
|
|
||||||
|
```
|
||||||
|
is_tdl_available()?
|
||||||
|
yes → download_single_with_tdl() / download_batch_with_tdl()
|
||||||
|
↓ failed?
|
||||||
|
_telethon_download()
|
||||||
|
no → _telethon_download() directly
|
||||||
|
```
|
||||||
410
core/scraper.py
Normal file
410
core/scraper.py
Normal file
@@ -0,0 +1,410 @@
|
|||||||
|
"""
|
||||||
|
scraper.py — Telethon user client.
|
||||||
|
|
||||||
|
Handles:
|
||||||
|
- Listening for new file messages in watched channels
|
||||||
|
- Listening for messages with inline download buttons (bot-dispatched files)
|
||||||
|
- Backfilling recent channel history on startup (batched via tdl)
|
||||||
|
- Downloading files safely (size guard, flood wait)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
from telethon import TelegramClient, events
|
||||||
|
from telethon.errors import FloodWaitError, ChannelPrivateError, UsernameNotOccupiedError
|
||||||
|
from telethon.tl.types import (
|
||||||
|
MessageMediaDocument,
|
||||||
|
DocumentAttributeFilename,
|
||||||
|
InputDocumentFileLocation,
|
||||||
|
)
|
||||||
|
|
||||||
|
from config import (
|
||||||
|
ALLOWED_EXTENSIONS,
|
||||||
|
BACKFILL_LIMIT,
|
||||||
|
MAX_FILE_SIZE,
|
||||||
|
TEMP_DIR,
|
||||||
|
WATCHED_CHANNELS,
|
||||||
|
TDL_AMOUNT,
|
||||||
|
)
|
||||||
|
from core.bot_downloader import handle_bot_download_message, has_download_button, extract_password
|
||||||
|
from utils.cache import is_seen, mark_seen
|
||||||
|
from core.processor import process_file
|
||||||
|
from core.notifier import notify
|
||||||
|
from core.tdl_downloader import (
|
||||||
|
BatchEntry,
|
||||||
|
download_batch_with_tdl,
|
||||||
|
download_single_with_tdl,
|
||||||
|
is_tdl_available,
|
||||||
|
)
|
||||||
|
from tui import events as bus
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def get_filename(msg) -> str | None:
    """Return a filename for the document attached to *msg*.

    Returns None when the message media is not a document. Otherwise the
    name from the document's first filename attribute is used. If there is no
    such attribute, a name is synthesised from the message id plus an
    extension derived from the MIME type (``.bin`` for unknown types).
    """
    media = msg.media
    if not isinstance(media, MessageMediaDocument):
        return None

    document = media.document
    name_attr = next(
        (a for a in document.attributes if isinstance(a, DocumentAttributeFilename)),
        None,
    )
    if name_attr is not None:
        return name_attr.file_name

    # No explicit filename: fall back to the MIME type.
    extension_by_mime = {
        "application/x-rar-compressed": ".rar",
        "application/vnd.rar": ".rar",
        "application/zip": ".zip",
        "application/x-7z-compressed": ".7z",
        "text/plain": ".txt",
    }
    mime_type = getattr(document, "mime_type", "") or ""
    extension = extension_by_mime.get(mime_type, ".bin")
    return f"{msg.id}{extension}"
|
||||||
|
|
||||||
|
|
||||||
|
def get_filesize(msg) -> int:
    """Return the attached document's size in bytes.

    Yields 0 when the message media is not a document or the size is unset.
    """
    if isinstance(msg.media, MessageMediaDocument):
        return msg.media.document.size or 0
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def is_processable(filename: str, size: int) -> tuple[bool, str]:
    """Decide whether a file should be downloaded.

    Returns ``(True, "")`` when the lower-cased extension is in
    ``ALLOWED_EXTENSIONS`` and *size* does not exceed ``MAX_FILE_SIZE``.
    Otherwise returns ``(False, reason)`` with a human-readable reason. The
    extension is checked before the size.
    """
    extension = Path(filename).suffix.lower()
    if extension not in ALLOWED_EXTENSIONS:
        return False, f"extension {extension!r} not in allowlist"

    if size <= MAX_FILE_SIZE:
        return True, ""

    bytes_per_mb = 1024 * 1024
    mb = size / bytes_per_mb
    return False, f"too large ({mb:.1f} MB > {MAX_FILE_SIZE // (1024 * 1024)} MB limit)"
|
||||||
|
|
||||||
|
|
||||||
|
def _make_dest(msg, filename: str) -> Path:
    """Return the download path for *filename* inside ``TEMP_DIR``.

    The filename comes from the remote sender, so only its final path
    component is used. Directory parts such as ``../`` or an absolute prefix
    are discarded so the download cannot land outside ``TEMP_DIR``. If no
    usable component remains, ``{msg.id}.bin`` is used instead.

    ``TEMP_DIR`` is created when missing, including parent directories. When
    the target already exists, the name is prefixed with the message id to
    avoid overwriting it.
    """
    TEMP_DIR.mkdir(parents=True, exist_ok=True)

    # Treat backslashes as separators too, then keep only the last component.
    safe_name = Path(filename.replace("\\", "/")).name
    if safe_name in ("", ".", ".."):
        safe_name = f"{msg.id}.bin"

    dest = TEMP_DIR / safe_name
    if dest.exists():
        dest = TEMP_DIR / f"{msg.id}_{safe_name}"
    return dest
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Telethon fallback download ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def _telethon_download(client: TelegramClient, msg, dest: Path, filename: str, size: int, batch_id: str | None = None) -> bool:
    """Download one document via Telethon, reporting progress on the bus.

    Args:
        client: Telethon client used for the download.
        msg: Message whose ``media.document`` is downloaded.
        dest: Target path for the file.
        filename: Display name for the progress bar, logs and bus events.
        size: Expected size in bytes (progress-bar total).
        batch_id: Id of an already-queued download. When None, a fresh id is
            generated and an ``EvDownloadQueued`` event is posted first.

    Returns:
        True on success, False on failure. Every failure posts
        ``EvDownloadFailed``, including a failed retry after a flood wait,
        so the exception never escapes to the caller.
    """
    _bid = batch_id or f"telethon_{int(time.monotonic_ns())}"
    if batch_id is None:
        # Standalone call (not already queued by tdl path) — post queued event
        bus.post(bus.EvDownloadQueued(
            batch_id=_bid, filename=filename,
            size_mb=round(size / (1024 * 1024), 2),
            source="telethon", password=None,
        ))
    bus.post(bus.EvDownloadStarted(batch_id=_bid, filename=filename))
    try:
        with tqdm(
            total=size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc=filename[:40],
            colour="cyan",
        ) as pbar:
            async def progress(current, total):
                pbar.n = current
                pbar.refresh()

            doc = msg.media.document
            location = InputDocumentFileLocation(
                id=doc.id,
                access_hash=doc.access_hash,
                file_reference=doc.file_reference,
                thumb_size="",
            )
            await client.download_file(
                location,
                file=dest,
                part_size_kb=512,
                progress_callback=progress,
            )
            bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon"))
            return True
    except FloodWaitError as e:
        log.warning(f" Flood wait: sleeping {e.seconds}s...")
        await asyncio.sleep(e.seconds)
        # One retry after the wait; it must not escape unreported either.
        try:
            await client.download_media(msg, file=dest)
        except Exception as retry_err:
            log.error(f" Telethon download failed for {filename}: {retry_err}")
            bus.post(bus.EvDownloadFailed(batch_id=_bid, filename=filename, reason=str(retry_err)))
            return False
        bus.post(bus.EvDownloadDone(batch_id=_bid, filename=filename, via="telethon"))
        return True
    except Exception as e:
        log.error(f" Telethon download failed for {filename}: {e}")
        bus.post(bus.EvDownloadFailed(batch_id=_bid, filename=filename, reason=str(e)))
        return False
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Single-message pipeline (live handler + bot_downloader) ──────────────────
|
||||||
|
|
||||||
|
async def handle_message(
    client: TelegramClient,
    bot: TelegramClient,
    msg,
    source_name: str,
    patterns,
    password: str | None = None,
) -> None:
    """Download, process and report a single document message.

    The message is skipped when no filename can be derived, when
    ``is_processable`` rejects it, or when its document id is already marked
    as seen. Otherwise the file is downloaded (tdl first when available, then
    Telethon as fallback), searched via ``process_file``, marked as seen, and
    any hits are handed to ``notify``.
    """
    filename = get_filename(msg)
    if not filename:
        log.warning(" handle_message: could not extract filename, skipping.")
        return

    size = get_filesize(msg)
    accepted, why_not = is_processable(filename, size)
    if not accepted:
        log.warning(f" handle_message: skipping '{filename}' — {why_not}")
        return

    doc_id = msg.media.document.id
    if is_seen(doc_id):
        log.info(f" Skipping {filename} — already processed.")
        return

    dest = _make_dest(msg, filename)
    log.info(f"↓ Downloading: {filename} ({size / 1024:.1f} KB) from {source_name}")

    # tdl single → Telethon fallback
    if is_tdl_available():
        downloaded = await download_single_with_tdl(msg, dest)
    else:
        downloaded = False

    if not downloaded:
        if is_tdl_available():
            log.warning(" [tdl] failed — falling back to Telethon")
        downloaded = await _telethon_download(client, msg, dest, filename, size)

    if not downloaded:
        log.error(f" All download attempts failed for {filename}")
        return

    hits = process_file(dest, patterns, password=password)
    mark_seen(doc_id)

    if not hits:
        log.info(f" No hits in {filename}")
        return
    await notify(bot, hits, source_name, filename)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Batch pipeline (backfill only) ───────────────────────────────────────────
|
||||||
|
|
||||||
|
async def _process_batch(
    client: TelegramClient,
    bot: TelegramClient,
    batch: list[tuple],  # list of (msg, source_name, password)
    patterns,
) -> int:
    """
    Download a batch of messages with one tdl invocation, then scan each file.

    Every (msg, source_name, password) tuple whose filename can be resolved
    becomes a BatchEntry. Entries that tdl did not deliver are retried one by
    one through Telethon. Each file that ends up on disk is scanned, marked
    as seen, and reported through `bot` when it contains hits.

    Returns the number of files that were downloaded and scanned.
    """
    if not batch:
        return 0

    # Messages without a resolvable filename are dropped silently.
    entries: list[BatchEntry] = [
        BatchEntry(
            msg=msg,
            filename=fname,
            dest=_make_dest(msg, fname),
            doc_id=msg.media.document.id,
            source_name=source_name,
            password=password,
        )
        for msg, source_name, password in batch
        if (fname := get_filename(msg))
    ]

    names = ", ".join(e.filename for e in entries)
    log.info(f"[Batch] {len(entries)} file(s): {names}")

    # Single tdl run covering the whole batch; maps doc_id -> success flag.
    results = await download_batch_with_tdl(entries)

    processed = 0
    for entry in entries:
        on_disk = results.get(entry.doc_id, False)

        if not on_disk:
            # tdl missed this one: retry it individually via Telethon.
            log.info(f" [Batch] Telethon fallback: {entry.filename}")
            on_disk = await _telethon_download(
                client,
                entry.msg,
                entry.dest,
                entry.filename,
                get_filesize(entry.msg),
            )
            if not on_disk:
                log.error(f" [Batch] All attempts failed: {entry.filename}")
                continue

        hits = process_file(entry.dest, patterns, password=entry.password)
        mark_seen(entry.doc_id)

        if hits:
            await notify(bot, hits, entry.source_name, entry.filename)
        else:
            log.info(f" No hits in {entry.filename}")

        processed += 1

    return processed
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Backfill ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def backfill_channel(
    client: TelegramClient,
    bot: TelegramClient,
    channel: str | int,
    patterns,
    limit: int,
) -> None:
    """
    Walk the most recent `limit` messages of `channel` and process attachments.

    Document messages are queued into tdl batches of TDL_AMOUNT when tdl is
    installed, or handled one at a time through handle_message() otherwise.
    Messages carrying a download button are handled individually after any
    pending batch has been flushed. Access errors and unexpected exceptions
    are logged, not raised.
    """
    log.info(f"[Backfill] Scanning history: {channel} (last {limit} messages)")
    total = 0
    pending: list[tuple] = []  # (msg, source_name, password)
    last_password: str | None = None  # most recent password seen so far
    source_name = str(channel)

    async def _flush_pending():
        """Process whatever is queued and add the result to the running total."""
        nonlocal total
        if pending:
            total += await _process_batch(client, bot, pending, patterns)
            pending.clear()

    try:
        async for msg in client.iter_messages(channel, limit=limit):
            # Password carry-over: iter_messages yields newest → oldest, so a
            # password post shown above the files in the channel is seen here
            # AFTER those files. A fresh password replaces the remembered one;
            # a message without its own password inherits the remembered one,
            # so the (older) file messages that follow pick it up.
            last_password = extract_password(msg) or last_password
            password = last_password

            if msg.media and isinstance(msg.media, MessageMediaDocument):
                filename = get_filename(msg)
                size = get_filesize(msg)
                if not filename:
                    continue

                ok, reason = is_processable(filename, size)
                if not ok:
                    log.warning(f" [Backfill] Skipping '{filename}' — {reason}")
                    continue

                if is_seen(msg.media.document.id):
                    log.info(f" [Backfill] Already seen: {filename}")
                    continue

                if not is_tdl_available():
                    # No tdl: go straight through the single-message pipeline.
                    await handle_message(client, bot, msg, source_name, patterns, password=password)
                    total += 1
                    await asyncio.sleep(0.5)
                    continue

                pending.append((msg, source_name, password))
                if len(pending) >= TDL_AMOUNT:
                    await _flush_pending()
                continue

            if msg.buttons and has_download_button(msg):
                # Bot-button messages cannot be batched; drain the queue first
                # and then handle this one on its own.
                await _flush_pending()
                await handle_bot_download_message(client, bot, msg, source_name, patterns, password=password)
                total += 1
                await asyncio.sleep(1.5)

        # Process the final, possibly partial, batch.
        await _flush_pending()

    except (ChannelPrivateError, UsernameNotOccupiedError) as e:
        log.error(f"[Backfill] Cannot access {channel}: {e}")
    except Exception as e:
        log.error(f"[Backfill] Error scanning {channel}: {e}")

    log.info(f"[Backfill] Done: {channel} — {total} file(s) processed")
|
||||||
|
|
||||||
|
|
||||||
|
async def backfill_all(
    client: TelegramClient,
    bot: TelegramClient,
    patterns,
) -> None:
    """Run backfill_channel() over every watched channel, one after another.

    Does nothing except log when BACKFILL_LIMIT is zero or negative.
    """
    if BACKFILL_LIMIT <= 0:
        log.info("[Backfill] Disabled (BACKFILL_LIMIT=0)")
        return

    log.info(f"[Backfill] Starting for {len(WATCHED_CHANNELS)} channel(s)...")
    # Channels are awaited one at a time, never concurrently.
    for watched in WATCHED_CHANNELS:
        await backfill_channel(client, bot, watched, patterns, BACKFILL_LIMIT)
    log.info("[Backfill] Complete.")
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Entity cache warmup ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def warm_entity_cache(client: TelegramClient) -> None:
    """
    Iterate the account's full dialog list once, discarding every item.

    Intended to let Telethon cache entity mappings so that raw numeric IDs
    can be used afterwards.
    """
    log.info("Warming entity cache (fetching dialogs)...")
    # The dialogs themselves are not needed; only the iteration matters.
    async for _dialog in client.iter_dialogs():
        continue
    log.info("Entity cache ready.")
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Live listener ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def register_handlers(
    client: TelegramClient,
    bot: TelegramClient,
    patterns,
) -> None:
    """Attach a NewMessage handler covering every channel in WATCHED_CHANNELS.

    The handler dispatches document messages to handle_message() and
    download-button messages to handle_bot_download_message().
    """

    # Last password seen per chat id. Channels often post the password in a
    # separate text message from the file, so a file message without its own
    # password falls back to the most recent one seen in the same chat.
    password_by_chat: dict[int, str] = {}

    @client.on(events.NewMessage(chats=WATCHED_CHANNELS))
    async def on_new_message(event):
        msg = event.message
        chat_id = event.chat_id

        # Prefer the channel username for log/notification text; fall back to
        # the numeric id when the chat object is missing or has no username.
        try:
            source = event.chat.username or str(event.chat_id)
        except Exception:
            source = str(event.chat_id)

        log.info(f"[Live] New message in {source}")

        # Remember any password carried by this message.
        msg_password = extract_password(msg)
        if msg_password:
            password_by_chat[chat_id] = msg_password
            log.debug(f"[Live] Password cached for {source}: '{msg_password}'")

        password = msg_password or password_by_chat.get(chat_id)

        if msg.media and isinstance(msg.media, MessageMediaDocument):
            await handle_message(client, bot, msg, source, patterns, password=password)
            return
        if msg.buttons and has_download_button(msg):
            await handle_bot_download_message(client, bot, msg, source, patterns, password=password)
|
||||||
70
core/tdl_downloader.md
Normal file
70
core/tdl_downloader.md
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
# core/tdl_downloader.py
|
||||||
|
|
||||||
|
Fast file downloads via `tdl` (Go MTProto). Falls back gracefully if tdl is not installed.
|
||||||
|
|
||||||
|
## Public API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from core.tdl_downloader import (
|
||||||
|
is_tdl_available,
|
||||||
|
download_single_with_tdl,
|
||||||
|
download_batch_with_tdl,
|
||||||
|
BatchEntry,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### `is_tdl_available() -> bool`
|
||||||
|
Returns `True` if `tdl` binary is on PATH.
|
||||||
|
|
||||||
|
### `download_single_with_tdl(msg, dest: Path) -> bool`
|
||||||
|
**async.** Downloads one message's document. Returns `True` on success.
|
||||||
|
Used by the live handler and `bot_downloader`.
|
||||||
|
|
||||||
|
### `download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]`
|
||||||
|
**async.** Downloads up to `TDL_AMOUNT` messages in a single `tdl dl` invocation.
|
||||||
|
Returns `{doc_id: True|False}` — `False` means Telethon fallback needed.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## BatchEntry dataclass
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class BatchEntry:
|
||||||
|
msg: object # Telethon Message
|
||||||
|
filename: str
|
||||||
|
dest: Path # final destination path in TEMP_DIR
|
||||||
|
doc_id: int # msg.media.document.id
|
||||||
|
source_name: str
|
||||||
|
password: str | None
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## TUI output pipeline
|
||||||
|
|
||||||
|
In TUI mode (`bus.tui_active == True`), `_run_tdl` pipes stdout+stderr and relays lines as `EvTdlOutput` events in real time.
|
||||||
|
**Reads raw 256-byte chunks** (not line-by-line) and splits on `\r` and `\n`, because tdl uses `\r` to overwrite its progress bar in place.
|
||||||
|
|
||||||
|
In CLI mode: subprocess inherits the terminal, progress bars render natively.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Staging directory isolation
|
||||||
|
|
||||||
|
Each batch/single download gets a unique `data/tmp/_tdl_{monotonic_ns}/` staging dir.
|
||||||
|
After `tdl` exits, files are matched by name (with fuzzy stem fallback for `filenamify()` mangling) and moved to final `dest`. Staging dir is removed regardless of outcome.
|
||||||
|
|
||||||
|
`--template '{{ filenamify .FileName }}'` makes tdl name each file after its original Telegram filename instead of tdl's default `DialogID_MessageID_filename` format.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Config knobs (`config.py`)
|
||||||
|
|
||||||
|
| Setting | Default | Description |
|
||||||
|
|---------|---------|-------------|
|
||||||
|
| `TDL_NAMESPACE` | `"default"` | `-n` flag; `None` omits it |
|
||||||
|
| `TDL_THREADS` | `8` | `-t` chunk workers per file |
|
||||||
|
| `TDL_PERFILE` | `4` | `-l` concurrent files per invocation |
|
||||||
|
| `TDL_AMOUNT` | `4` | Max messages per batch |
|
||||||
|
| `TDL_TAKEOUT` | `False` | `--takeout` session flag |
|
||||||
363
core/tdl_downloader.py
Normal file
363
core/tdl_downloader.py
Normal file
@@ -0,0 +1,363 @@
|
|||||||
|
"""
|
||||||
|
tdl_downloader.py — Fast file downloads via tdl (Go MTProto implementation).
|
||||||
|
|
||||||
|
Install: https://github.com/iyear/tdl
|
||||||
|
curl -sSL https://raw.githubusercontent.com/iyear/tdl/main/scripts/install.sh | bash
|
||||||
|
|
||||||
|
First-time setup — log in once:
|
||||||
|
tdl login # saves to namespace "default"
|
||||||
|
tdl login -n myns # saves to a named namespace
|
||||||
|
|
||||||
|
Relevant config.py knobs:
|
||||||
|
TDL_NAMESPACE str|None Session namespace (default "default"; None omits -n)
|
||||||
|
TDL_THREADS int Chunk workers per file (-t, default 4)
|
||||||
|
TDL_PERFILE int Concurrent files (-l, default 4)
|
||||||
|
TDL_AMOUNT int Messages per tdl batch (default 4)
|
||||||
|
TDL_TAKEOUT bool Use takeout session (--takeout)
|
||||||
|
|
||||||
|
Flag reference:
|
||||||
|
Global (BEFORE subcommand): -n --ns, -t --threads, -l --limit
|
||||||
|
dl-specific: -u --url, -d --dir, --template, --continue, --takeout
|
||||||
|
|
||||||
|
Download isolation strategy:
|
||||||
|
Each batch gets its own staging subdirectory (TEMP_DIR/_tdl_<monotonic_ns>/) so that
|
||||||
|
concurrent downloads and homoglyph filename collisions can never cause tdl's
|
||||||
|
internal .tmp → final rename to fail. Files are moved to TEMP_DIR after
|
||||||
|
the batch completes and the staging dir is removed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from config import TDL_NAMESPACE, TDL_THREADS, TDL_PERFILE, TDL_TAKEOUT, TEMP_DIR
|
||||||
|
from tui import events as bus
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Availability ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def is_tdl_available() -> bool:
    """Return True when an executable named ``tdl`` can be found on PATH."""
    tdl_path = shutil.which("tdl")
    return tdl_path is not None
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Message → URL ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _build_message_url(msg) -> str:
    """
    Turn a Telethon Message into a ``https://t.me/c/<peer_id>/<msg_id>`` link.

    The id is taken from the first of ``channel_id``, ``chat_id`` or
    ``user_id`` that exists on ``msg.peer_id``.

    Raises:
        ValueError: if the peer exposes none of those attributes.
    """
    peer = msg.peer_id
    # Order matters: channel ids win over chat ids, which win over user ids.
    # NOTE(review): t.me/c/ links are normally channel links; confirm that tdl
    # accepts them for chat/user peers.
    for id_attr in ("channel_id", "chat_id", "user_id"):
        if hasattr(peer, id_attr):
            return f"https://t.me/c/{getattr(peer, id_attr)}/{msg.id}"
    raise ValueError(f"Cannot build message URL from peer: {peer!r}")
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Command builder ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _build_cmd(urls: list[str], staging_dir: Path) -> list[str]:
    """
    Assemble the argv list for one ``tdl dl`` invocation.

    Layout: ``tdl <global flags> dl <-u url>... <dl flags>``. The global flags
    (-n, -t, -l) have to come BEFORE the subcommand.

    staging_dir is expected to be a fresh per-batch directory, so tdl's
    internal .tmp → final rename cannot hit an existing file of the same
    name.

    dl flags used:
      --template '{{ filenamify .FileName }}'  keep only the original
          filename (no DialogID_MessageID_ prefix).
      --continue  resume interrupted downloads instead of restarting them.
      --takeout   appended only when TDL_TAKEOUT is set.

    --skip-same is deliberately left out: deduplication already happens
    upstream via is_seen(), and --skip-same can make the .tmp rename fail when
    a same-named file is already present in the directory.
    """
    namespace_flags = ["-n", str(TDL_NAMESPACE)] if TDL_NAMESPACE else []
    global_flags: list[str] = [
        *namespace_flags,
        "-t", str(TDL_THREADS),
        "-l", str(TDL_PERFILE),
    ]

    # One "-u <url>" pair per message link, in the order given.
    url_flags: list[str] = [token for url in urls for token in ("-u", url)]

    dl_flags = ["-d", str(staging_dir), "--template", "{{ filenamify .FileName }}", "--continue"]
    if TDL_TAKEOUT:
        dl_flags.append("--takeout")

    return ["tdl"] + global_flags + ["dl"] + url_flags + dl_flags
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Runner ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# ANSI escape stripper — tdl emits colour codes even when not a TTY
|
||||||
|
import re as _re

# Matches CSI sequences ending in m/G/K/H/F/J or an uppercase letter, the
# keypad-mode escapes (ESC = / ESC >), and private-mode set/reset (ESC [ ? n h/l).
_ANSI_RE = _re.compile(r"\x1b\[[0-9;]*[mGKHFJA-Z]|\x1b=|\x1b>|\x1b\[\?[0-9]+[hl]")


def _strip_ansi(text: str) -> str:
    """Return `text` with every escape sequence matched by _ANSI_RE removed."""
    cleaned = _ANSI_RE.sub("", text)
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
async def _run_tdl(cmd: list[str], label: str) -> bool:
    """
    Spawn tdl and handle its output depending on whether the TUI is running.

    - TUI mode: pipe stdout+stderr, read raw chunks (NOT line-by-line),
      split on both \\r and \\n, strip ANSI, and post each non-empty segment
      immediately as EvTdlOutput. tdl uses \\r to overwrite its progress bar
      in place, so async-for-line on the stream would block until EOF;
      chunk-reading + manual split delivers progress live.
    - CLI mode: inherit the terminal so tdl's progress bars render natively.

    Args:
        cmd:   full argv list, as produced by _build_cmd().
        label: short description used only in the success/failure log line.

    Returns:
        True on exit code 0; False on a non-zero exit, a missing binary, or
        any other exception raised while spawning/relaying.
    """
    # Local import keeps this block self-contained.
    import codecs

    log.debug(f"[tdl] cmd: {' '.join(cmd)}")
    try:
        if bus.tui_active:
            proc = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )

            async def _relay(stream):
                # An incremental decoder carries a partial multi-byte UTF-8
                # sequence over to the next read. Decoding each 256-byte chunk
                # independently would turn any character that straddles a
                # chunk boundary into replacement characters.
                decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
                buf = ""
                while True:
                    chunk = await stream.read(256)
                    if not chunk:
                        break
                    buf += decoder.decode(chunk)
                    # Split on both \r and \n; process all complete segments
                    parts = _re.split(r"[\r\n]", buf)
                    # Last element may be an incomplete segment — keep in buffer
                    buf = parts[-1]
                    for part in parts[:-1]:
                        clean = _strip_ansi(part).strip()
                        if clean:
                            bus.post(bus.EvTdlOutput(line=clean))
                # Flush bytes still held by the decoder, then the text buffer
                buf += decoder.decode(b"", final=True)
                if buf:
                    clean = _strip_ansi(buf).strip()
                    if clean:
                        bus.post(bus.EvTdlOutput(line=clean))

            # Drain both pipes concurrently so neither can fill up and stall tdl.
            await asyncio.gather(_relay(proc.stdout), _relay(proc.stderr))
            await proc.wait()
        else:
            proc = await asyncio.create_subprocess_exec(*cmd)
            await proc.wait()

        if proc.returncode == 0:
            log.info(f"[tdl] ✓ {label}")
            return True
        else:
            log.error(f"[tdl] ✗ exit {proc.returncode} — {label}")
            return False
    except FileNotFoundError:
        log.error("[tdl] binary not found at runtime")
        return False
    except Exception as e:
        log.error(f"[tdl] Unexpected error: {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Staging dir helpers ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _make_staging_dir() -> Path:
    """Create and return a staging directory ``_tdl_<monotonic_ns>`` under TEMP_DIR.

    The directory name is derived from time.monotonic_ns(); missing parent
    directories are created and an already existing directory is accepted.
    """
    stamp = int(time.monotonic_ns())
    staging_root = TEMP_DIR.resolve()
    staging = staging_root / f"_tdl_{stamp}"
    staging.mkdir(parents=True, exist_ok=True)
    return staging
|
||||||
|
|
||||||
|
|
||||||
|
def _find_in_staging(staging: Path, expected_name: str) -> Path | None:
|
||||||
|
"""
|
||||||
|
Locate a downloaded file in the staging dir by matching its name.
|
||||||
|
filenamify() can munge characters (strips @, collapses unicode, etc.)
|
||||||
|
so we do a normalised stem comparison as a fallback.
|
||||||
|
"""
|
||||||
|
# Exact match first
|
||||||
|
exact = staging / expected_name
|
||||||
|
if exact.exists():
|
||||||
|
return exact
|
||||||
|
|
||||||
|
expected_stem = Path(expected_name).stem.lower().lstrip("@").replace(" ", "")
|
||||||
|
expected_suffix = Path(expected_name).suffix.lower()
|
||||||
|
|
||||||
|
for candidate in staging.iterdir():
|
||||||
|
if not candidate.is_file():
|
||||||
|
continue
|
||||||
|
if candidate.suffix.lower() != expected_suffix:
|
||||||
|
continue
|
||||||
|
cand_stem = candidate.stem.lower().lstrip("@").replace(" ", "")
|
||||||
|
if cand_stem == expected_stem:
|
||||||
|
return candidate
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _move_from_staging(staging: Path, expected_name: str, final_dest: Path) -> bool:
    """
    Locate `expected_name` in `staging` and rename it to `final_dest`.

    Returns True when the rename succeeded. Returns False, after logging,
    when the file could not be located or the rename raised.
    """
    source = _find_in_staging(staging, expected_name)
    if not source:
        log.warning(f"[tdl] Not found in staging: '{expected_name}' (staging: {staging})")
        return False

    try:
        source.rename(final_dest)
        log.debug(f"[tdl] Moved: {source.name} → {final_dest}")
    except Exception as e:
        log.error(f"[tdl] Move failed {source} → {final_dest}: {e}")
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup_staging(staging: Path) -> None:
    """Best-effort removal of a staging directory tree; never raises Exception."""
    try:
        # ignore_errors makes rmtree skip anything it cannot delete.
        shutil.rmtree(staging, ignore_errors=True)
    except Exception:
        # Cleanup failures must not affect the download result.
        return
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Public API ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class BatchEntry:
|
||||||
|
"""Carries everything needed to process one file after a batch download."""
|
||||||
|
msg: object # Telethon Message
|
||||||
|
filename: str
|
||||||
|
dest: Path
|
||||||
|
doc_id: int
|
||||||
|
source_name: str
|
||||||
|
password: str | None
|
||||||
|
|
||||||
|
|
||||||
|
async def download_batch_with_tdl(entries: list[BatchEntry]) -> dict[int, bool]:
    """
    Download a batch of messages in a single tdl invocation.

    Each batch gets its own staging subdirectory so filenames can never
    collide with existing files in TEMP_DIR. After tdl exits, files are
    moved from staging to their final dest paths. The staging directory is
    removed even when the download is cancelled or raises.

    Args:
        entries: files to fetch; entries whose message cannot be turned into
            a URL are reported as False without being passed to tdl.

    Returns:
        dict mapping doc_id → True (ready at entry.dest) / False (fallback
        needed). Empty when `entries` is empty.
    """
    if not entries:
        return {}

    if not is_tdl_available():
        log.warning("[tdl] not available — all entries need Telethon fallback")
        return {e.doc_id: False for e in entries}

    # Pair every entry with its message URL, dropping those without one.
    valid_entries: list[tuple[BatchEntry, str]] = []
    for entry in entries:
        try:
            valid_entries.append((entry, _build_message_url(entry.msg)))
        except ValueError as exc:
            log.error(f"[tdl] Skipping {entry.filename}: {exc}")

    if not valid_entries:
        return {e.doc_id: False for e in entries}

    # Built once so the per-entry membership test below is O(1).
    valid_ids = {e.doc_id for e, _ in valid_entries}

    batch_id = f"batch_{int(time.monotonic_ns())}"
    names = ", ".join(e.filename for e, _ in valid_entries)
    log.info(f"[tdl] Batch ({len(valid_entries)} files): {names}")

    # Notify TUI: all files in this batch are queued
    for entry, _ in valid_entries:
        size_mb = (entry.msg.media.document.size or 0) / (1024 * 1024)
        bus.post(bus.EvDownloadQueued(
            batch_id=batch_id,
            filename=entry.filename,
            size_mb=round(size_mb, 2),
            source=entry.source_name,
            password=entry.password,
        ))

    staging = _make_staging_dir()
    try:
        cmd = _build_cmd([u for _, u in valid_entries], staging)

        # Signal batch started
        for entry, _ in valid_entries:
            bus.post(bus.EvDownloadStarted(batch_id=batch_id, filename=entry.filename))

        tdl_ok = await _run_tdl(cmd, f"batch of {len(valid_entries)}")

        results: dict[int, bool] = {}
        for entry in entries:
            if entry.doc_id not in valid_ids:
                # Never handed to tdl (no URL) — caller must fall back.
                results[entry.doc_id] = False
                continue

            if tdl_ok:
                moved = _move_from_staging(staging, entry.filename, entry.dest)
                results[entry.doc_id] = moved
                if moved:
                    bus.post(bus.EvDownloadDone(batch_id=batch_id, filename=entry.filename, via="tdl"))
                else:
                    log.warning(f"[tdl] Fallback needed: {entry.filename}")
                    bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=entry.filename, reason="staging mismatch"))
            else:
                results[entry.doc_id] = False
                bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=entry.filename, reason="tdl exit error"))

        return results
    finally:
        # Runs on success, on exceptions and on task cancellation, so the
        # staging directory cannot be left behind.
        _cleanup_staging(staging)
|
||||||
|
|
||||||
|
|
||||||
|
async def download_single_with_tdl(msg, dest: Path) -> bool:
    """
    Download a single message with tdl. Used by the live handler and
    bot_downloader where batching doesn't apply.

    The file is fetched into a fresh staging directory and then moved to
    `dest`. The staging directory is removed even when the download is
    cancelled or raises.

    Args:
        msg:  Telethon Message whose document should be fetched.
        dest: final path of the downloaded file; dest.name is also the name
            looked up in the staging directory.

    Returns:
        True when the file is ready at `dest`; False when tdl is missing,
        no URL can be built, tdl exits non-zero, or the move fails.
    """
    if not is_tdl_available():
        log.warning("[tdl] not available — falling back to Telethon")
        return False

    try:
        url = _build_message_url(msg)
    except ValueError as e:
        log.error(f"[tdl] Cannot build URL: {e}")
        return False

    batch_id = f"single_{int(time.monotonic_ns())}"
    size_mb = (msg.media.document.size or 0) / (1024 * 1024) if hasattr(msg, "media") and msg.media else 0
    bus.post(bus.EvDownloadQueued(
        batch_id=batch_id, filename=dest.name,
        size_mb=round(size_mb, 2), source="live", password=None,
    ))
    bus.post(bus.EvDownloadStarted(batch_id=batch_id, filename=dest.name))

    staging = _make_staging_dir()
    try:
        cmd = _build_cmd([url], staging)
        log.info(f"[tdl] Single: {dest.name} ({url})")
        tdl_ok = await _run_tdl(cmd, dest.name)

        # Only look in staging when tdl reported success.
        result = _move_from_staging(staging, dest.name, dest) if tdl_ok else False
    finally:
        # Runs on success, on exceptions and on task cancellation, so the
        # staging directory cannot be left behind.
        _cleanup_staging(staging)

    if result:
        bus.post(bus.EvDownloadDone(batch_id=batch_id, filename=dest.name, via="tdl"))
    else:
        bus.post(bus.EvDownloadFailed(batch_id=batch_id, filename=dest.name, reason="tdl failed"))
    return result
|
||||||
0
data/.gitkeep
Normal file
0
data/.gitkeep
Normal file
54
logs/monitor.log
Normal file
54
logs/monitor.log
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
2026-04-02 00:45:48,909 [INFO] utils.database: Database ready: data/hits.db
|
||||||
|
2026-04-02 00:45:49,119 [INFO] telethon.crypto.aes: libssl detected, it will be used for encryption
|
||||||
|
2026-04-02 00:45:49,156 [INFO] utils.database: Database ready: data/hits.db
|
||||||
|
2026-04-02 00:45:49,159 [INFO] tui.app: [bot] Connecting bot_client...
|
||||||
|
2026-04-02 00:45:49,159 [INFO] telethon.network.mtprotosender: Connecting to 149.154.175.59:443/TcpFull...
|
||||||
|
2026-04-02 00:45:49,203 [INFO] tui.app: [bus] EvStatus: Starting — 12 channel(s), 2 pattern(s)
|
||||||
|
2026-04-02 00:45:49,281 [INFO] telethon.network.mtprotosender: Connection to 149.154.175.59:443/TcpFull complete!
|
||||||
|
2026-04-02 00:45:49,900 [INFO] tui.app: [bot] bot_client connected, authorizing...
|
||||||
|
2026-04-02 00:45:49,901 [INFO] tui.app: [bot] bot_client ready
|
||||||
|
2026-04-02 00:45:49,901 [INFO] tui.app: [bot] Connecting user_client...
|
||||||
|
2026-04-02 00:45:49,901 [INFO] telethon.network.mtprotosender: Connecting to 149.154.175.59:443/TcpFull...
|
||||||
|
2026-04-02 00:45:49,908 [INFO] __main__: Cleaning up tmp/...
|
||||||
|
2026-04-02 00:54:16,429 [INFO] utils.database: Database ready: data/hits.db
|
||||||
|
2026-04-02 00:54:16,638 [INFO] telethon.crypto.aes: libssl detected, it will be used for encryption
|
||||||
|
2026-04-02 00:54:16,666 [ERROR] tui.app: [bot-thread] Unhandled exception: cannot import name 'HITS_CSV' from 'config' (/home/anti/Tools/sj/telegrammer/config.py)
|
||||||
|
Traceback (most recent call last):
|
||||||
|
File "/home/anti/Tools/sj/telegrammer/tui/app.py", line 848, in _run_bot_thread
|
||||||
|
loop.run_until_complete(self._bot_main())
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
|
||||||
|
File "/usr/lib64/python3.14/asyncio/base_events.py", line 719, in run_until_complete
|
||||||
|
return future.result()
|
||||||
|
~~~~~~~~~~~~~^^
|
||||||
|
File "/home/anti/Tools/sj/telegrammer/tui/app.py", line 865, in _bot_main
|
||||||
|
from core.notifier import send_status
|
||||||
|
File "/home/anti/Tools/sj/telegrammer/core/notifier.py", line 22, in <module>
|
||||||
|
from config import HITS_FILE, HITS_CSV, NOTIFY_CHAT_ID
|
||||||
|
ImportError: cannot import name 'HITS_CSV' from 'config' (/home/anti/Tools/sj/telegrammer/config.py)
|
||||||
|
2026-04-02 00:54:16,716 [INFO] tui.app: [bus] EvStatus: Bot thread crashed: cannot import name 'HITS_CSV' from 'config' (/home/anti/Tools/sj/telegrammer/config.py)
|
||||||
|
2026-04-02 00:54:22,624 [INFO] __main__: Cleaning up tmp/...
|
||||||
|
2026-04-02 00:54:34,773 [INFO] utils.database: Database ready: data/hits.db
|
||||||
|
2026-04-02 00:54:34,983 [INFO] telethon.crypto.aes: libssl detected, it will be used for encryption
|
||||||
|
2026-04-02 00:54:35,015 [INFO] utils.database: Database ready: data/hits.db
|
||||||
|
2026-04-02 00:54:35,015 [INFO] tui.app: [bot] Connecting bot_client...
|
||||||
|
2026-04-02 00:54:35,015 [INFO] telethon.network.mtprotosender: Connecting to 149.154.175.59:443/TcpFull...
|
||||||
|
2026-04-02 00:54:35,063 [INFO] tui.app: [bus] EvStatus: Starting — 12 channel(s), 2 pattern(s)
|
||||||
|
2026-04-02 00:54:35,120 [INFO] telethon.network.mtprotosender: Connection to 149.154.175.59:443/TcpFull complete!
|
||||||
|
2026-04-02 00:54:35,698 [INFO] tui.app: [bot] bot_client connected, authorizing...
|
||||||
|
2026-04-02 00:54:35,698 [INFO] tui.app: [bot] bot_client ready
|
||||||
|
2026-04-02 00:54:35,698 [INFO] tui.app: [bot] Connecting user_client...
|
||||||
|
2026-04-02 00:54:35,698 [INFO] telethon.network.mtprotosender: Connecting to 149.154.175.59:443/TcpFull...
|
||||||
|
2026-04-02 00:54:35,810 [INFO] telethon.network.mtprotosender: Connection to 149.154.175.59:443/TcpFull complete!
|
||||||
|
2026-04-02 00:54:36,420 [INFO] tui.app: [bot] user_client connected, checking auth...
|
||||||
|
2026-04-02 00:54:36,420 [INFO] tui.app: [bot] user_client ready
|
||||||
|
2026-04-02 00:54:36,563 [INFO] tui.app: [bus] EvStatus: Connected as 4n (@clp_c)
|
||||||
|
2026-04-02 00:54:36,653 [INFO] core.scraper: Warming entity cache (fetching dialogs)...
|
||||||
|
2026-04-02 00:54:38,437 [INFO] core.scraper: Entity cache ready.
|
||||||
|
2026-04-02 00:54:38,437 [INFO] tui.app: [bot] Handler registered for 12 channel(s)
|
||||||
|
2026-04-02 00:54:38,437 [INFO] core.scraper: [Backfill] Starting for 12 channel(s)...
|
||||||
|
2026-04-02 00:54:38,437 [INFO] core.scraper: [Backfill] Scanning history: cloudxlog (last 500 messages)
|
||||||
|
2026-04-02 00:54:38,463 [INFO] tui.app: [bus] EvStatus: Watching 12 channel(s)
|
||||||
|
2026-04-02 00:54:38,463 [INFO] tui.app: [bus] EvStatus: Live listener active
|
||||||
|
2026-04-02 00:54:38,585 [INFO] core.scraper: [Batch] 4 file(s): @cloud t13.txt, @cloud t12.txt, @cloud t11.txt, @cloud t10.txt
|
||||||
|
2026-04-02 00:54:38,585 [INFO] core.tdl_downloader: [tdl] Batch (4 files): @cloud t13.txt, @cloud t12.txt, @cloud t11.txt, @cloud t10.txt
|
||||||
|
2026-04-02 00:54:40,248 [INFO] __main__: Cleaning up tmp/...
|
||||||
142
main.py
Normal file
142
main.py
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
"""
|
||||||
|
main.py — Entry point for the ULP credential monitor.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python main.py # TUI mode (default, requires textual)
|
||||||
|
python main.py --no-tui # Plain CLI mode
|
||||||
|
|
||||||
|
First run will prompt for your Telegram phone number and 2FA code
|
||||||
|
to create a session file. Subsequent runs are fully automatic.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
import shutil
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import config
|
||||||
|
from utils.database import init_db
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Logging setup ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Both the log directory and the scratch download directory must exist before
# the file handler (and later the downloaders) try to write into them.
for _required_dir in (config.LOG_FILE.parent, config.TEMP_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)

# Only a file handler is attached here; CLI mode adds a stdout handler itself.
_file_handler = logging.FileHandler(config.LOG_FILE, encoding="utf-8")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=[_file_handler],
)
log = logging.getLogger(__name__)

# Initialise the persistence layer once, at import time, for both run modes.
init_db()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Plain CLI mode ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def _cli_main():
    """Run the monitor in plain CLI mode, without the TUI.

    Mirrors log output to stdout, connects the bot and user Telegram
    clients, sends a start-up status message through the bot, registers the
    live handlers, runs the backfill and then blocks until the user client
    disconnects.
    """
    # The module-level logging setup only writes to the log file; in CLI mode
    # the same records are echoed to the terminal as well.
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    # Imported locally, as in the original, so that merely importing this
    # module does not pull in Telethon or the core pipeline.
    from telethon import TelegramClient
    from core.processor import compile_patterns
    from core.notifier import send_status
    from core.scraper import backfill_all, register_handlers, warm_entity_cache

    log.info("=" * 60)
    log.info(" ULP Credential Monitor — CLI mode")
    log.info("=" * 60)

    patterns = compile_patterns(config.TARGET_KEYWORDS)
    log.info(f"Loaded {len(patterns)} keyword pattern(s)")
    log.info(f"Watching {len(config.WATCHED_CHANNELS)} channel(s)")

    user_client = TelegramClient(
        config.SESSION_NAME, config.API_ID, config.API_HASH,
        connection_retries=5, auto_reconnect=True, request_retries=5,
    )
    bot_client = TelegramClient(
        "bot_session", config.API_ID, config.API_HASH,
    )

    # ``async with client`` calls ``client.start()`` with no arguments, which
    # falls back to an interactive "phone (or bot token)" prompt when the
    # session is not authorised yet. Log the bot in with its token *before*
    # entering the context managers so a fresh ``bot_session`` never prompts.
    await bot_client.start(bot_token=config.BOT_TOKEN)

    # The bot is entered first so that it is still disconnected cleanly if the
    # interactive user login below is aborted.
    async with bot_client, user_client:
        log.info("Bot client connected.")

        # Already started by the context manager; kept as an explicit,
        # harmless no-op for an authorised session.
        await user_client.start()
        me = await user_client.get_me()
        log.info(f"User client connected as: {me.first_name} (@{me.username})")

        await send_status(
            bot_client,
            f"✅ *Monitor started*\n"
            f"User: `{me.first_name}`\n"
            f"Channels: `{len(config.WATCHED_CHANNELS)}`\n"
            f"Patterns: `{len(patterns)}`\n"
            f"Backfill: `{config.BACKFILL_LIMIT} msg/channel`",
        )

        await warm_entity_cache(user_client)
        # Handlers are registered before the backfill starts, matching the
        # original ordering.
        register_handlers(user_client, bot_client, patterns)
        log.info("Live listener registered.")

        await backfill_all(user_client, bot_client, patterns)

        log.info("Listening for new messages... (Ctrl+C to stop)")
        await user_client.run_until_disconnected()

    log.info("Monitor stopped.")
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Entry point ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _cleanup_tmp():
    """Empty the temporary directory and make sure it exists afterwards."""
    log.info("Cleaning up tmp/...")
    # rmtree is silent both when the directory is missing and when some entry
    # cannot be removed (ignore_errors=True).
    shutil.rmtree(config.TEMP_DIR, ignore_errors=True)
    # Because a failed removal can leave the directory behind, a bare mkdir()
    # would raise FileExistsError here; tolerate an existing directory.
    config.TEMP_DIR.mkdir(parents=True, exist_ok=True)


def main():
    """Parse the command line and run the monitor in TUI or plain CLI mode.

    With ``--no-tui`` the asyncio CLI loop is run. Otherwise the Textual TUI
    is started; if importing it fails, a hint is printed to stderr and the
    process exits with status 1. In both modes the temporary directory is
    cleaned up on the way out, including after Ctrl+C.
    """
    parser = argparse.ArgumentParser(description="ULP Credential Monitor")
    parser.add_argument(
        "--no-tui",
        action="store_true",
        help="Run in plain CLI mode (no Textual TUI)",
    )
    args = parser.parse_args()

    if args.no_tui:
        try:
            asyncio.run(_cli_main())
        except KeyboardInterrupt:
            log.info("Interrupted by user.")
        finally:
            _cleanup_tmp()
            log.info("Done.")
        return

    try:
        from tui.app import run_tui
    except ImportError:
        print(
            "⚠ Textual is not installed. Install it with:\n"
            " pip install textual\n"
            "Or run in plain CLI mode:\n"
            " python main.py --no-tui",
            file=sys.stderr,
        )
        sys.exit(1)

    try:
        run_tui()
    except KeyboardInterrupt:
        # Ctrl+C is a normal way to leave the TUI; nothing to report.
        pass
    finally:
        _cleanup_tmp()


if __name__ == "__main__":
    main()
|
||||||
2
pytest.ini
Normal file
2
pytest.ini
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
[pytest]
|
||||||
|
testpaths = tests
|
||||||
1
requirements-dev.txt
Normal file
1
requirements-dev.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
pytest
|
||||||
16
requirements.txt
Normal file
16
requirements.txt
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# Telegram
|
||||||
|
telethon
|
||||||
|
tgcrypto
|
||||||
|
|
||||||
|
# TUI
|
||||||
|
textual
|
||||||
|
|
||||||
|
# Config
|
||||||
|
python-dotenv
|
||||||
|
|
||||||
|
# Progress bars (CLI mode)
|
||||||
|
tqdm
|
||||||
|
|
||||||
|
# Archive extraction
|
||||||
|
py7zr
|
||||||
|
rarfile
|
||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
31
tests/conftest.py
Normal file
31
tests/conftest.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
# Must be set before config.py is imported by any module.
|
||||||
|
# load_dotenv() runs at import time; these setdefaults fill the gap when .env is absent.
|
||||||
|
# Dummy credentials, applied only where the environment has not already
# provided a value.
_TEST_ENV_DEFAULTS = {
    "API_ID": "12345",
    "API_HASH": "dummy_hash_for_tests",
    "BOT_TOKEN": "0:dummy_bot_token",
    "NOTIFY_CHAT_ID": "99999",
}
for _env_name, _env_value in _TEST_ENV_DEFAULTS.items():
    os.environ.setdefault(_env_name, _env_value)
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import config
|
||||||
|
import utils.scorer as scorer
|
||||||
|
|
||||||
|
# Two test keywords:
|
||||||
|
# @testcorp\.com — employee email domain (triggers CRITICAL)
|
||||||
|
# testcorp\.com — plain domain match (triggers LOW baseline)
|
||||||
|
TEST_KEYWORDS = [r"@testcorp\.com", r"testcorp\.com"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def patched_keywords(monkeypatch):
    """
    Swap in TEST_KEYWORDS for one test and rebuild the scorer's derived
    module-level tables so scoring runs against the known test patterns.
    """
    # scorer.py binds TARGET_KEYWORDS via `from config import ...`, so patching
    # config alone would not reach it; both modules are patched.
    for module in (config, scorer):
        monkeypatch.setattr(module, "TARGET_KEYWORDS", TEST_KEYWORDS)

    # Rebuilt only after the keywords are patched, in the original order.
    employee_domains = scorer._build_employee_domains()
    monkeypatch.setattr(scorer, "EMPLOYEE_DOMAINS", employee_domains)
    org_domains = scorer._build_org_domains()
    monkeypatch.setattr(scorer, "ORG_DOMAINS", org_domains)
|
||||||
55
tests/test_cache.py
Normal file
55
tests/test_cache.py
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
"""
|
||||||
|
Tests for utils/cache.py — file-ID deduplication cache.
|
||||||
|
|
||||||
|
Each test gets an isolated cache file via the `isolated_cache` fixture
|
||||||
|
so tests never touch data/cache.json.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import utils.cache as cache_module
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def isolated_cache(tmp_path, monkeypatch):
    """Redirect the cache module to a per-test file under tmp_path."""
    scratch_cache = tmp_path / "cache.json"
    monkeypatch.setattr(cache_module, "CACHE_FILE", scratch_cache)
|
||||||
|
|
||||||
|
|
||||||
|
def test_unseen_id_returns_false():
    """An ID that was never marked is reported as not seen."""
    seen = cache_module.is_seen(12345)
    assert seen is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_mark_seen_makes_id_seen():
    """Marking an ID makes a later lookup of that ID return True."""
    file_id = 12345
    cache_module.mark_seen(file_id)
    assert cache_module.is_seen(file_id) is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_multiple_ids_stored_independently():
    """Several marked IDs are all remembered; an unmarked one is not."""
    for file_id in (1, 2, 3):
        cache_module.mark_seen(file_id)

    for file_id in (1, 2, 3):
        assert cache_module.is_seen(file_id)
    assert not cache_module.is_seen(4)
|
||||||
|
|
||||||
|
|
||||||
|
def test_persists_to_disk_between_calls():
    """
    is_seen() and mark_seen() each load from disk independently.
    This verifies the persist-on-write / load-on-read contract
    (simulating what happens across separate function calls in the bot loop).
    """
    cache_module.mark_seen(999)
    # Without this check the test is identical to test_mark_seen_makes_id_seen
    # and would pass for a purely in-memory cache. CACHE_FILE is the tmp_path
    # file installed by the isolated_cache fixture.
    assert cache_module.CACHE_FILE.exists()
    assert cache_module.is_seen(999) is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_missing_cache_file_handled_gracefully(tmp_path, monkeypatch):
    """Looking up an ID while the cache file does not exist returns False."""
    absent_file = tmp_path / "nonexistent.json"
    monkeypatch.setattr(cache_module, "CACHE_FILE", absent_file)
    assert cache_module.is_seen(42) is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_mark_seen_is_idempotent():
    """Marking the same ID repeatedly still leaves it reported as seen."""
    for _ in range(3):
        cache_module.mark_seen(7)
    assert cache_module.is_seen(7) is True
|
||||||
188
tests/test_database.py
Normal file
188
tests/test_database.py
Normal file
@@ -0,0 +1,188 @@
|
|||||||
|
"""
|
||||||
|
Tests for utils/database.py — SQLite persistence layer.
|
||||||
|
|
||||||
|
Each test gets an isolated in-memory-equivalent DB via the `isolated_db`
|
||||||
|
fixture so tests never touch data/hits.db.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import utils.database as db_module
|
||||||
|
from utils.scorer import ScoredHit, CRITICAL, HIGH, MEDIUM, LOW
|
||||||
|
|
||||||
|
|
||||||
|
def make_hit(severity=LOW, url="testcorp.com", username="user", password="pass", raw=None):
    """Build a minimal ScoredHit for insertion tests.

    The score is looked up from the severity (CRITICAL 40, HIGH 30,
    MEDIUM 20, LOW 10). When ``raw`` is not given (or is empty) it is
    synthesised as ``url|username|password``.
    """
    score_for = {CRITICAL: 40, HIGH: 30, MEDIUM: 20, LOW: 10}
    raw_line = raw if raw else f"{url}|{username}|{password}"
    return ScoredHit(
        raw=raw_line,
        severity=severity,
        score=score_for[severity],
        reasons=["Test reason"],
        url=url,
        username=username,
        password=password,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def isolated_db(tmp_path, monkeypatch):
    """Point the database module at a per-test file and initialise it."""
    scratch_db = tmp_path / "test_hits.db"
    monkeypatch.setattr(db_module, "DB_FILE", scratch_db)
    db_module.init_db()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── init_db ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_init_db_is_idempotent():
    """Calling init_db() again on an initialised database must not raise."""
    for _ in range(2):
        db_module.init_db()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── insert_hits ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_insert_returns_correct_row_count():
    """insert_hits reports how many hits it was given."""
    batch = [make_hit(), make_hit(severity=CRITICAL)]
    inserted = db_module.insert_hits(batch, source="testchan", filename="combo.txt")
    assert inserted == 2


def test_insert_stores_all_fields():
    """Every column of an inserted hit can be read back through search()."""
    hit = make_hit(severity=HIGH, url="intranet.testcorp.com", username="jdoe", password="s3cr3t")
    db_module.insert_hits([hit], source="mychan", filename="creds.zip")

    matches = db_module.search("jdoe")
    assert len(matches) == 1

    stored = matches[0]
    expected_columns = {
        "url": "intranet.testcorp.com",
        "username": "jdoe",
        "password": "s3cr3t",
        "severity": HIGH,
        "score": 30,
        "source": "mychan",
        "filename": "creds.zip",
        "seen_before": 0,
    }
    for column, value in expected_columns.items():
        assert stored[column] == value


def test_insert_seen_before_flag():
    """seen_before=True is stored as 1."""
    db_module.insert_hits([make_hit()], source="chan", filename="f.txt", seen_before=True)
    matches = db_module.search("testcorp")
    assert matches[0]["seen_before"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
# ─── search ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_search_finds_by_username():
    """A fragment of the username is enough to find the row."""
    hit = make_hit(username="jdoe@testcorp.com")
    db_module.insert_hits([hit], source="c", filename="f.txt")

    found = db_module.search("jdoe")
    assert len(found) == 1
    assert found[0]["username"] == "jdoe@testcorp.com"


def test_search_finds_by_url():
    """A fragment of the URL is enough to find the row."""
    hit = make_hit(url="admin.testcorp.com")
    db_module.insert_hits([hit], source="c", filename="f.txt")

    found = db_module.search("admin.testcorp")
    assert len(found) == 1


def test_search_finds_by_raw():
    """A fragment that only occurs in the raw line is enough to find the row."""
    hit = make_hit(raw="raw_unique_token_xyz")
    db_module.insert_hits([hit], source="c", filename="f.txt")

    found = db_module.search("unique_token")
    assert len(found) == 1


def test_search_returns_empty_for_no_match():
    """A term matching nothing yields an empty list."""
    db_module.insert_hits([make_hit()], source="c", filename="f.txt")
    found = db_module.search("zzznomatch_xyz")
    assert found == []


def test_search_sorted_by_score_descending():
    """The first result never scores lower than the last one."""
    low_hit = make_hit(severity=LOW)
    critical_hit = make_hit(severity=CRITICAL, url="admin.testcorp.com")
    db_module.insert_hits([low_hit], source="c", filename="f.txt")
    db_module.insert_hits([critical_hit], source="c", filename="f.txt")

    found = db_module.search("testcorp")
    top, bottom = found[0], found[-1]
    assert top["score"] >= bottom["score"]
|
||||||
|
|
||||||
|
|
||||||
|
# ─── by_severity ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_by_severity_returns_correct_severity():
    """Only rows of the requested severity are returned."""
    critical_hit = make_hit(severity=CRITICAL, url="admin.testcorp.com")
    low_hit = make_hit(severity=LOW)
    db_module.insert_hits([critical_hit], source="c", filename="f.txt")
    db_module.insert_hits([low_hit], source="c", filename="f.txt")

    rows = db_module.by_severity(CRITICAL)
    assert len(rows) == 1
    assert rows[0]["severity"] == CRITICAL


def test_by_severity_excludes_duplicates():
    """seen_before=1 rows must be invisible to by_severity — they are stored for stats only."""
    duplicate = make_hit(severity=HIGH, url="intranet.testcorp.com")
    db_module.insert_hits([duplicate], source="c", filename="f.txt", seen_before=True)

    rows = db_module.by_severity(HIGH)
    assert rows == []


def test_by_severity_returns_empty_when_none():
    """An empty database yields an empty list."""
    rows = db_module.by_severity(CRITICAL)
    assert rows == []
|
||||||
|
|
||||||
|
|
||||||
|
# ─── stats ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_stats_counts_by_severity():
    """One unique hit per severity yields a count of one each and four in total."""
    one_per_severity = [
        (CRITICAL, "admin.testcorp.com"),
        (HIGH, "intranet.testcorp.com"),
        (MEDIUM, "app.testcorp.com"),
        (LOW, "testcorp.com"),
    ]
    for severity, url in one_per_severity:
        hit = make_hit(severity=severity, url=url)
        db_module.insert_hits([hit], source="c", filename="f.txt")

    summary = db_module.stats()
    expected = {
        "critical": 1,
        "high": 1,
        "medium": 1,
        "low": 1,
        "total": 4,
        "unique": 4,
        "duplicates": 0,
    }
    for key, count in expected.items():
        assert summary[key] == count


def test_stats_separates_duplicates():
    """A seen_before row counts towards total and duplicates, not unique."""
    hit = make_hit()
    for already_seen in (False, True):
        db_module.insert_hits([hit], source="c", filename="f.txt", seen_before=already_seen)

    summary = db_module.stats()
    assert summary["total"] == 2
    assert summary["unique"] == 1
    assert summary["duplicates"] == 1


def test_stats_severity_counts_exclude_duplicates():
    """Per-severity counts ignore rows inserted with seen_before=True."""
    hit = make_hit(severity=CRITICAL, url="admin.testcorp.com")
    for already_seen in (False, True):
        db_module.insert_hits([hit], source="c", filename="f.txt", seen_before=already_seen)

    summary = db_module.stats()
    assert summary["critical"] == 1  # only the unique one


def test_stats_empty_db():
    """With no rows, the totals are zero and there is no top source."""
    summary = db_module.stats()
    assert summary["total"] == 0
    assert summary["unique"] == 0
    assert summary["top_source"] is None


def test_stats_top_source():
    """The source with the most rows is reported as top_source."""
    for channel in ("channelA", "channelA", "channelB"):
        db_module.insert_hits([make_hit()], source=channel, filename="f.txt")

    summary = db_module.stats()
    assert summary["top_source"]["source"] == "channelA"
|
||||||
|
|
||||||
|
|
||||||
|
# ─── recent ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_recent_respects_limit():
    """recent(limit=3) returns three rows even though five are stored."""
    for index in range(5):
        hit = make_hit(raw=f"testcorp.com|user{index}|pass")
        db_module.insert_hits([hit], source="c", filename="f.txt")

    latest = db_module.recent(limit=3)
    assert len(latest) == 3


def test_recent_returns_all_when_under_limit():
    """With fewer rows than the limit, every row is returned."""
    for _ in range(2):
        db_module.insert_hits([make_hit()], source="c", filename="f.txt")

    latest = db_module.recent(limit=50)
    assert len(latest) == 2
|
||||||
223
tests/test_processor.py
Normal file
223
tests/test_processor.py
Normal file
@@ -0,0 +1,223 @@
|
|||||||
|
"""
|
||||||
|
Tests for core/processor.py — archive extraction and line-by-line search.
|
||||||
|
|
||||||
|
No Telegram deps, no async. Tests create real archive fixtures in tmp_path
|
||||||
|
so process_file's cleanup guarantee can be verified against actual disk state.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import zipfile
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from core.processor import compile_patterns, search_file, process_file
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def patterns():
    """Compiled pattern list containing the single keyword ``testcorp\\.com``."""
    keywords = [r"testcorp\.com"]
    return compile_patterns(keywords)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── compile_patterns ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestCompilePatterns:
    """compile_patterns: keyword strings in, searchable patterns out."""

    def test_returns_case_insensitive_patterns(self):
        compiled = compile_patterns([r"hello"])
        hello = compiled[0]
        for text in ("HELLO", "Hello"):
            assert hello.search(text) is not None

    def test_multiple_patterns(self):
        compiled = compile_patterns([r"alpha", r"beta"])
        assert len(compiled) == 2
        alpha, beta = compiled
        assert alpha.search("alpha_line")
        assert beta.search("beta_line")

    def test_empty_list(self):
        compiled = compile_patterns([])
        assert compiled == []
|
||||||
|
|
||||||
|
|
||||||
|
# ─── search_file ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestSearchFile:
    """search_file: scanning a text file for pattern matches."""

    def test_returns_matching_lines(self, tmp_path, patterns):
        combo = tmp_path / "combo.txt"
        combo.write_text("testcorp.com|user|pass\nothersite.com|user|pass\n")
        matched = search_file(combo, patterns)
        assert matched == ["testcorp.com|user|pass"]

    def test_returns_empty_when_no_match(self, tmp_path, patterns):
        combo = tmp_path / "combo.txt"
        combo.write_text("nomatch.com|user|pass\nanother.net|x|y\n")
        matched = search_file(combo, patterns)
        assert matched == []

    def test_strips_whitespace_from_returned_lines(self, tmp_path, patterns):
        combo = tmp_path / "combo.txt"
        combo.write_text(" testcorp.com|user|pass \n")
        matched = search_file(combo, patterns)
        assert matched[0] == "testcorp.com|user|pass"

    def test_skips_blank_lines(self, tmp_path, patterns):
        combo = tmp_path / "combo.txt"
        combo.write_text("\n\ntestcorp.com|user|pass\n\n")
        matched = search_file(combo, patterns)
        assert matched == ["testcorp.com|user|pass"]

    def test_handles_encoding_errors_gracefully(self, tmp_path, patterns):
        """Combo files are often messy — invalid bytes must not crash the search."""
        payload = (
            b"testcorp.com|user1|pass\n"
            b"\xff\xfe invalid bytes here\n"
            b"testcorp.com|user2|pass\n"
        )
        combo = tmp_path / "combo.txt"
        combo.write_bytes(payload)
        matched = search_file(combo, patterns)
        assert len(matched) == 2

    def test_multiple_matching_lines_all_returned(self, tmp_path, patterns):
        text = (
            "testcorp.com|alice|pass1\n"
            "nomatch.com|bob|pass2\n"
            "testcorp.com|carol|pass3\n"
        )
        combo = tmp_path / "combo.txt"
        combo.write_text(text)
        matched = search_file(combo, patterns)
        assert len(matched) == 2
|
||||||
|
|
||||||
|
|
||||||
|
# ─── process_file — plain .txt ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestProcessFilePlainText:
    """process_file on a plain .txt: returns the hits and removes the file."""

    def test_returns_hits(self, tmp_path, patterns):
        combo = tmp_path / "combo.txt"
        combo.write_text("testcorp.com|user|pass\nnomatch.com|x|y\n")
        found = process_file(combo, patterns)
        assert found == ["testcorp.com|user|pass"]

    def test_deletes_file_after_processing(self, tmp_path, patterns):
        combo = tmp_path / "combo.txt"
        combo.write_text("testcorp.com|user|pass\n")
        process_file(combo, patterns)
        assert not combo.exists()

    def test_deletes_file_even_with_no_hits(self, tmp_path, patterns):
        combo = tmp_path / "combo.txt"
        combo.write_text("nomatch.com|x|y\n")
        found = process_file(combo, patterns)
        assert found == []
        assert not combo.exists()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── process_file — .zip extraction ──────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestProcessFileZip:
    """process_file on a .zip: extract, search, then clean up after itself."""

    def _make_zip(self, tmp_path: Path, content: str, filename="content.txt") -> Path:
        """Create ``combo.zip`` in tmp_path holding one text member; return its path."""
        member_source = tmp_path / filename
        member_source.write_text(content)
        archive_path = tmp_path / "combo.zip"
        with zipfile.ZipFile(archive_path, "w") as archive:
            archive.write(member_source, filename)
        # Only the archive should remain on disk for the test to process.
        member_source.unlink()
        return archive_path

    def test_extracts_and_returns_hits(self, tmp_path, patterns):
        archive_path = self._make_zip(tmp_path, "testcorp.com|user|pass\nnomatch.com|x|y\n")
        found = process_file(archive_path, patterns)
        assert found == ["testcorp.com|user|pass"]

    def test_deletes_zip_after_processing(self, tmp_path, patterns):
        archive_path = self._make_zip(tmp_path, "testcorp.com|user|pass\n")
        process_file(archive_path, patterns)
        assert not archive_path.exists()

    def test_deletes_extract_dir_after_processing(self, tmp_path, patterns):
        archive_path = self._make_zip(tmp_path, "testcorp.com|user|pass\n")
        extraction_dir = tmp_path / "combo"  # sibling dir named after zip stem
        process_file(archive_path, patterns)
        assert not extraction_dir.exists()

    def test_no_hits_still_cleans_up(self, tmp_path, patterns):
        archive_path = self._make_zip(tmp_path, "nomatch.com|x|y\n")
        extraction_dir = tmp_path / "combo"
        process_file(archive_path, patterns)
        assert not archive_path.exists()
        assert not extraction_dir.exists()

    def test_zip_with_multiple_txt_files(self, tmp_path, patterns):
        members = {
            "a.txt": "testcorp.com|alice|pass\n",
            "b.txt": "testcorp.com|bob|pass\n",
        }
        for member_name, text in members.items():
            (tmp_path / member_name).write_text(text)

        archive_path = tmp_path / "combo.zip"
        with zipfile.ZipFile(archive_path, "w") as archive:
            for member_name in members:
                archive.write(tmp_path / member_name, member_name)

        for member_name in members:
            (tmp_path / member_name).unlink()

        found = process_file(archive_path, patterns)
        assert len(found) == 2
|
||||||
|
|
||||||
|
|
||||||
|
# ─── process_file — nested archives ──────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestProcessFileNested:
    """process_file on an archive that contains another archive."""

    def test_nested_zip_is_recursed(self, tmp_path, patterns):
        # Innermost layer: the text file holding the matching line.
        leaf = tmp_path / "inner.txt"
        leaf.write_text("testcorp.com|user|pass\n")

        inner_archive = tmp_path / "inner.zip"
        with zipfile.ZipFile(inner_archive, "w") as archive:
            archive.write(leaf, "inner.txt")
        leaf.unlink()

        # Outer layer: a zip whose only member is the inner zip.
        outer_archive = tmp_path / "outer.zip"
        with zipfile.ZipFile(outer_archive, "w") as archive:
            archive.write(inner_archive, "inner.zip")
        inner_archive.unlink()

        found = process_file(outer_archive, patterns)
        assert found == ["testcorp.com|user|pass"]
        assert not outer_archive.exists()
        assert not (tmp_path / "outer").exists()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── process_file — password-protected .7z ───────────────────────────────────
|
||||||
|
|
||||||
|
class TestProcessFile7zPassword:
    """process_file on a password-protected .7z archive.

    Both tests are skipped when py7zr is not installed.
    """

    @staticmethod
    def _make_encrypted_7z(py7zr, tmp_path, password):
        """Create ``combo.7z`` in tmp_path, encrypted with *password*, holding one matching line."""
        member_source = tmp_path / "content.txt"
        member_source.write_text("testcorp.com|user|pass\n")
        archive_path = tmp_path / "combo.7z"
        with py7zr.SevenZipFile(archive_path, "w", password=password) as archive:
            archive.write(member_source, "content.txt")
        # Only the archive should remain on disk for the test to process.
        member_source.unlink()
        return archive_path

    def test_unlocks_with_correct_password(self, tmp_path, patterns, monkeypatch):
        py7zr = pytest.importorskip("py7zr")

        import core.processor as proc_module

        # Isolate to a single known password so the test doesn't depend on config
        monkeypatch.setattr(proc_module, "ARCHIVE_PASSWORDS", [b"secretpwd"])

        archive_path = self._make_encrypted_7z(py7zr, tmp_path, "secretpwd")

        hits = process_file(archive_path, patterns)
        assert hits == ["testcorp.com|user|pass"]
        assert not archive_path.exists()

    def test_skips_when_no_password_matches(self, tmp_path, patterns, monkeypatch):
        py7zr = pytest.importorskip("py7zr")

        import core.processor as proc_module

        monkeypatch.setattr(proc_module, "ARCHIVE_PASSWORDS", [b"wrongpwd"])

        archive_path = self._make_encrypted_7z(py7zr, tmp_path, "correctpwd")

        # No hits — archive could not be opened
        hits = process_file(archive_path, patterns)
        assert hits == []
|
||||||
282
tests/test_scorer.py
Normal file
282
tests/test_scorer.py
Normal file
@@ -0,0 +1,282 @@
|
|||||||
|
"""
|
||||||
|
Tests for utils/scorer.py — severity scoring and ULP line parsing.
|
||||||
|
|
||||||
|
All tests use the `patched_keywords` fixture (see conftest.py) which
|
||||||
|
replaces TARGET_KEYWORDS with two entries:
|
||||||
|
@testcorp.com — employee email domain (CRITICAL trigger)
|
||||||
|
testcorp.com — plain domain match (LOW baseline)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from utils.scorer import score_hit, score_hits, summarize, CRITICAL, HIGH, MEDIUM, LOW
|
||||||
|
|
||||||
|
|
||||||
|
# ─── ULP line parsing ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestULPParsing:
    """Field extraction from simple url/username/password lines."""

    def test_parses_pipe_separated_fields(self, patched_keywords):
        parsed = score_hit("site.com|jdoe@testcorp.com|pass123")
        fields = (parsed.url, parsed.username, parsed.password)
        assert fields[0] == "site.com"
        assert fields[1] == "jdoe@testcorp.com"
        assert fields[2] == "pass123"

    def test_parses_colon_separated_fields(self, patched_keywords):
        # 'site.com' has no colon, so url field captures it cleanly
        parsed = score_hit("site.com:jdoe@testcorp.com:pass123")
        fields = (parsed.url, parsed.username, parsed.password)
        assert fields[0] == "site.com"
        assert fields[1] == "jdoe@testcorp.com"
        assert fields[2] == "pass123"

    def test_malformed_line_yields_none_fields(self, patched_keywords):
        parsed = score_hit("justaplaindomainmatch_testcorp.com")
        for value in (parsed.url, parsed.username, parsed.password):
            assert value is None

    def test_raw_field_preserved_exactly(self, patched_keywords):
        original_line = "site.com|jdoe@testcorp.com|pass123"
        parsed = score_hit(original_line)
        assert parsed.raw == original_line
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Real-world ULP format coverage ──────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestULPParsingRealWorld:
    """
    Parametrized against real stealer-log lines.
    Only field extraction is asserted (url/username/password), not severity,
    so no patched_keywords fixture is needed.
    """

    # Each entry: (input line, expected url, expected username, expected password).
    _REAL_WORLD_CASES = [
        # ── Protocol + port + path, colon separator ──────────────────────────
        # Port is digits followed by '/' — must be consumed as part of the URL.
        (
            "http://portal.fakehosp.example.com:88/:55512309-1:hunter2",
            "http://portal.fakehosp.example.com:88/",
            "55512309-1",
            "hunter2",
        ),
        (
            "http://portal.fakehosp.example.com:8085/app/booking/:3:letmein",
            "http://portal.fakehosp.example.com:8085/app/booking/",
            "3",
            "letmein",
        ),
        (
            "https://portal.fakehosp.example.com:81/app/FrmResetPassword.aspx:30219876-K:Spr!ng22@",
            "https://portal.fakehosp.example.com:81/app/FrmResetPassword.aspx",
            "30219876-K",
            "Spr!ng22@",
        ),

        # ── Protocol + no port, ID-style username looks like port but has hyphen ──
        # ':\d+-' must NOT be consumed as a port (no '/' after the digits).
        (
            "https://booking.fakehosp.example.com:40293817-6:Summ3r99..",
            "https://booking.fakehosp.example.com",
            "40293817-6",
            "Summ3r99..",
        ),
        (
            "https://booking.fakehosp.example.com/:40293817-6:Summ3r99..",
            "https://booking.fakehosp.example.com/",
            "40293817-6",
            "Summ3r99..",
        ),

        # ── Protocol + email username directly after host (no trailing slash) ─
        (
            "https://booking.fakehosp.example.com:carlos.gomez@gmail.com:Qwerty99",
            "https://booking.fakehosp.example.com",
            "carlos.gomez@gmail.com",
            "Qwerty99",
        ),
        (
            "https://accounts.saas-vendor.example.com/signin:jdoe@fakehosp.example.com:W1nter20",
            "https://accounts.saas-vendor.example.com/signin",
            "jdoe@fakehosp.example.com",
            "W1nter20",
        ),
        (
            "https://login.sso-provider.example.com/common/oauth2/authorize:jdoe@fakehosp.example.com:Passw0rd!",
            "https://login.sso-provider.example.com/common/oauth2/authorize",
            "jdoe@fakehosp.example.com",
            "Passw0rd!",
        ),

        # ── Pipe separator (unambiguous — port stays in URL) ──────────────────
        (
            "http://portal.fakehosp.example.com:88/|22.987.654-3|florida88",
            "http://portal.fakehosp.example.com:88/",
            "22.987.654-3",
            "florida88",
        ),
        (
            "https://booking.fakehosp.example.com/|77341209-0|Ninja42",
            "https://booking.fakehosp.example.com/",
            "77341209-0",
            "Ninja42",
        ),

        # ── Mixed separators: pipe after URL, colon between user/password ─────
        (
            "http://portal.fakehosp.example.com:8085/app/booking/|Z:wd1980wd",
            "http://portal.fakehosp.example.com:8085/app/booking/",
            "Z",
            "wd1980wd",
        ),

        # ── No protocol, port in URL ─────────────────────────────────────────
        (
            "portal.fakehosp.example.com:88/:22.987.654-3:florida88",
            "portal.fakehosp.example.com:88/",
            "22.987.654-3",
            "florida88",
        ),

        # ── No protocol, no port — plain colon separators ────────────────────
        (
            "booking.fakehosp.example.com:66778899-7:correcthorse",
            "booking.fakehosp.example.com",
            "66778899-7",
            "correcthorse",
        ),
        (
            "booking.fakehosp.example.com/:smithjohnathan:Bb881955",
            "booking.fakehosp.example.com/",
            "smithjohnathan",
            "Bb881955",
        ),

        # ── Password with special characters ─────────────────────────────────
        (
            "https://booking.fakehosp.example.com/:11223344-5:dragonball99*",
            "https://booking.fakehosp.example.com/",
            "11223344-5",
            "dragonball99*",
        ),
        (
            "https://booking.fakehosp.example.com/:9988776-65:abc.456#",
            "https://booking.fakehosp.example.com/",
            "9988776-65",
            "abc.456#",
        ),

        # ── Semicolon separator ───────────────────────────────────────────────
        (
            "booking.fakehosp.example.com;smithjohnathan;Bb881955",
            "booking.fakehosp.example.com",
            "smithjohnathan",
            "Bb881955",
        ),
    ]

    @pytest.mark.parametrize("line,exp_url,exp_user,exp_pass", _REAL_WORLD_CASES)
    def test_real_world_ulp_parsing(self, line, exp_url, exp_user, exp_pass):
        parsed = score_hit(line)
        assert parsed.url == exp_url, f"URL mismatch for: {line!r}"
        assert parsed.username == exp_user, f"Username mismatch for: {line!r}"
        assert parsed.password == exp_pass, f"Password mismatch for: {line!r}"
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Severity classification ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestSeverityClassification:
    """Severity assigned by ``score_hit`` for employee e-mails and service subdomains."""

    def test_employee_email_in_username_is_critical(self, patched_keywords):
        assert score_hit("site.com|jdoe@testcorp.com|pass123").severity == CRITICAL

    def test_gmail_on_org_url_is_not_critical(self, patched_keywords):
        """
        Documented footgun: the org domain is only in the URL field while the
        username is a gmail address. That must NOT be rated CRITICAL.
        The employee-domain pattern needs a literal '@' in front of the domain,
        so 'testcorp.com' appearing in the URL never triggers it.
        """
        scored = score_hit("testcorp.com|user@gmail.com|pass123")
        assert scored.severity != CRITICAL

    def test_critical_service_subdomain_is_critical(self, patched_keywords):
        assert score_hit("admin.testcorp.com|user|pass123").severity == CRITICAL

    def test_vpn_subdomain_is_critical(self, patched_keywords):
        assert score_hit("vpn.testcorp.com|user|pass123").severity == CRITICAL

    def test_gitlab_subdomain_is_critical(self, patched_keywords):
        assert score_hit("gitlab.testcorp.com|user|pass123").severity == CRITICAL

    def test_intranet_subdomain_is_high(self, patched_keywords):
        assert score_hit("intranet.testcorp.com|user|pass123").severity == HIGH

    def test_sso_subdomain_is_high(self, patched_keywords):
        assert score_hit("sso.testcorp.com|user|pass123").severity == HIGH

    def test_app_subdomain_is_medium(self, patched_keywords):
        assert score_hit("app.testcorp.com|user|pass123").severity == MEDIUM

    def test_booking_subdomain_is_medium(self, patched_keywords):
        assert score_hit("booking.testcorp.com|user|pass123").severity == MEDIUM

    def test_plain_domain_match_is_low(self, patched_keywords):
        assert score_hit("testcorp.com|user|pass123").severity == LOW

    def test_employee_email_beats_high_service(self, patched_keywords):
        """An employee e-mail domain outranks a HIGH service classification."""
        scored = score_hit("intranet.testcorp.com|jdoe@testcorp.com|pass")
        assert scored.severity == CRITICAL

    def test_employee_email_beats_medium_service(self, patched_keywords):
        scored = score_hit("app.testcorp.com|jdoe@testcorp.com|pass")
        assert scored.severity == CRITICAL

    def test_multiple_checks_accumulate_reasons(self, patched_keywords):
        """Employee e-mail plus a critical service URL must both be recorded as reasons."""
        scored = score_hit("admin.testcorp.com|jdoe@testcorp.com|pass")
        assert scored.severity == CRITICAL
        assert len(scored.reasons) >= 2

    def test_score_matches_severity(self, patched_keywords):
        from utils.scorer import SEVERITY_SCORES

        # Checked in the same order as before: CRITICAL, HIGH, MEDIUM, LOW.
        expected_by_line = {
            "admin.testcorp.com|user|pass": CRITICAL,
            "intranet.testcorp.com|user|pass": HIGH,
            "app.testcorp.com|user|pass": MEDIUM,
            "testcorp.com|user|pass": LOW,
        }
        for raw_line, level in expected_by_line.items():
            scored = score_hit(raw_line)
            assert scored.score == SEVERITY_SCORES[level]
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Weak password flags ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestWeakPasswordFlags:
    """Weak/common password checks add reasons without changing severity."""

    def test_short_password_adds_reason(self, patched_keywords):
        reasons = score_hit("testcorp.com|user|abc").reasons
        assert any("Weak password" in reason for reason in reasons)

    def test_common_password_adds_reason(self, patched_keywords):
        reasons = score_hit("testcorp.com|user|password").reasons
        assert any("Common password" in reason for reason in reasons)

    def test_weak_password_does_not_escalate_severity(self, patched_keywords):
        """A weak-password flag is informational only; severity stays as it was."""
        assert score_hit("testcorp.com|user|abc").severity == LOW

    def test_strong_password_adds_no_warning(self, patched_keywords):
        scored = score_hit("testcorp.com|user|Xk9#mP2qLrTv")
        # Reasons about the employee/domain/service match are not password
        # warnings, so they are excluded before looking for "password".
        password_warnings = [
            reason
            for reason in scored.reasons
            if "Employee" not in reason
            and "domain" not in reason.lower()
            and "service" not in reason.lower()
            and "password" in reason.lower()
        ]
        assert not password_warnings
|
||||||
|
|
||||||
|
|
||||||
|
# ─── score_hits and summarize ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestScoreHitsAndSummarize:
    """Batch scoring order and the per-severity summary counters."""

    def test_score_hits_sorted_descending(self, patched_keywords):
        unsorted_lines = [
            "testcorp.com|user|pass",           # LOW
            "admin.testcorp.com|user|pass",     # CRITICAL
            "intranet.testcorp.com|user|pass",  # HIGH
            "app.testcorp.com|user|pass",       # MEDIUM
        ]
        result_scores = [scored.score for scored in score_hits(unsorted_lines)]
        expected = list(result_scores)
        expected.sort(reverse=True)
        assert result_scores == expected

    def test_summarize_counts_each_severity(self, patched_keywords):
        one_per_level = [
            "admin.testcorp.com|user|pass",     # CRITICAL
            "intranet.testcorp.com|user|pass",  # HIGH
            "app.testcorp.com|user|pass",       # MEDIUM
            "testcorp.com|user|pass",           # LOW
        ]
        summary = summarize(score_hits(one_per_level))
        for level in (CRITICAL, HIGH, MEDIUM, LOW):
            assert summary[level] == 1

    def test_summarize_zero_for_absent_severities(self, patched_keywords):
        summary = summarize(score_hits(["testcorp.com|user|pass"]))  # LOW only
        for absent_level in (CRITICAL, HIGH, MEDIUM):
            assert summary[absent_level] == 0
        assert summary[LOW] == 1

    def test_score_hits_empty_list(self, patched_keywords):
        assert score_hits([]) == []
|
||||||
1
tui/__init__.py
Normal file
1
tui/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""tui — Textual TUI frontend and event bus."""
|
||||||
130
tui/app.md
Normal file
130
tui/app.md
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
# tui/app.py
|
||||||
|
|
||||||
|
Textual TUI frontend. Entry point: `run_tui()`.
|
||||||
|
|
||||||
|
## Entry point
|
||||||
|
|
||||||
|
```python
|
||||||
|
from tui.app import run_tui
|
||||||
|
run_tui() # called by main.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Screen hierarchy
|
||||||
|
|
||||||
|
```
|
||||||
|
MonitorApp (App)
|
||||||
|
├── [default screen]
|
||||||
|
│ ├── Header
|
||||||
|
│ ├── #top-row (Horizontal)
|
||||||
|
│ │ ├── DownloadPanel #dl-panel
|
||||||
|
│ │ └── HitsPanel #hits-panel
|
||||||
|
│ ├── StatsPanel #stats-panel
|
||||||
|
│ ├── ChannelPanel #ch-panel
|
||||||
|
│ └── Footer
|
||||||
|
├── SearchScreen (push/pop via 's')
|
||||||
|
├── HitsDBScreen (push/pop via 'h')
|
||||||
|
└── KeywordsScreen (push/pop via 'k')
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## MonitorApp
|
||||||
|
|
||||||
|
### Threading model
|
||||||
|
- **Bot backend** → `threading.Thread(daemon=True)` with its own `asyncio.new_event_loop()`
|
||||||
|
Runs `_bot_main()` — Telethon is completely isolated from Textual's loop.
|
||||||
|
- **TUI drain** → `set_interval(0.1, _drain_bus)` — polls `queue.Queue` every 100ms on Textual's loop.
|
||||||
|
|
||||||
|
### Key methods
|
||||||
|
|
||||||
|
| Method | Description |
|
||||||
|
|--------|-------------|
|
||||||
|
| `on_mount()` | Calls `bus.init_bus()`, starts bot thread, sets drain interval |
|
||||||
|
| `_drain_bus()` | Drains all pending events from `queue.Queue`, dispatches to widgets |
|
||||||
|
| `_run_bot_thread()` | Thread entry: creates event loop, runs `_bot_main()` |
|
||||||
|
| `_bot_main()` | Async bot backend: connect, auth, backfill, live handler loop |
|
||||||
|
| `_signal_channel_changed()` | Thread-safely sets the bot loop's `asyncio.Event` via `call_soon_threadsafe` |
|
||||||
|
|
||||||
|
### Keybindings
|
||||||
|
|
||||||
|
| Key | Action |
|
||||||
|
|-----|--------|
|
||||||
|
| `s` | Push `SearchScreen` |
|
||||||
|
| `h` | Push `HitsDBScreen` |
|
||||||
|
| `k` | Push `KeywordsScreen` |
|
||||||
|
| `c` | Clear download + hits logs |
|
||||||
|
| `r` | Force-refresh stats bar |
|
||||||
|
| `q` / `ctrl+c` | Quit |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Widgets
|
||||||
|
|
||||||
|
### DownloadPanel
|
||||||
|
Left panel. Two `RichLog` widgets separated by a dashed line:
|
||||||
|
- **top** (`#tdl-out`): raw tdl output lines (ANSI stripped)
|
||||||
|
- **bottom** (`#dl-log`): structured download status entries
|
||||||
|
|
||||||
|
Methods: `tdl_line(line)`, `queued(filename, size_mb, source, password)`, `status(filename, state, via)`, `clear_logs()`
|
||||||
|
|
||||||
|
States for `status()`: `queued` · `downloading` · `done_tdl` · `done_tel` · `failed`
|
||||||
|
|
||||||
|
### HitsPanel
|
||||||
|
Right panel. Single `RichLog` with color-coded hit entries.
|
||||||
|
Reactive `hit_count` updates the panel title badge automatically.
|
||||||
|
|
||||||
|
Methods: `add_hit(severity, raw, source, filename, reasons)`, `clear_log()`
|
||||||
|
|
||||||
|
### StatsPanel
|
||||||
|
Slim horizontal bar. Polls `utils.database.stats()` every 10s via `set_interval`.
|
||||||
|
Also refreshed immediately on each `EvHit` event.
|
||||||
|
|
||||||
|
### ChannelPanel
|
||||||
|
Bottom panel. `ListView` + `Input` + buttons.
|
||||||
|
Add/remove posts `EvChannelAdded` / `EvChannelRemoved` onto the bus.
|
||||||
|
Changes apply immediately (handler re-registered). Not persisted to `config.py` automatically.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Screens
|
||||||
|
|
||||||
|
### SearchScreen (`s`)
|
||||||
|
- Text input → queries `utils.database.search(keyword)`
|
||||||
|
- Results in a `DataTable` with columns: Sev, Time, URL, Username, Password, Source, File
|
||||||
|
- Submit with `↵` or Search button; `Escape` to dismiss
|
||||||
|
|
||||||
|
### HitsDBScreen (`h`)
|
||||||
|
- Toolbar buttons + number keys filter by severity
|
||||||
|
- `r` → recent 50, `1`→CRITICAL, `2`→HIGH, `3`→MEDIUM, `4`→LOW
|
||||||
|
- Calls `utils.database.recent()` / `by_severity()`
|
||||||
|
|
||||||
|
### KeywordsScreen (`k`)
|
||||||
|
- Live-edit `config.TARGET_KEYWORDS`
|
||||||
|
- Validates regex before adding
|
||||||
|
- On change: rebuilds `utils.scorer.EMPLOYEE_DOMAINS` and `ORG_DOMAINS`
|
||||||
|
- Bot handler recompiles patterns on the next incoming message automatically
|
||||||
|
- **Changes are in-memory only** — copy to `config.py` to persist
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Bot auth flow (`_bot_main`)
|
||||||
|
|
||||||
|
```
|
||||||
|
await bot_client.connect()
|
||||||
|
await bot_client.is_user_authorized()? → sign_in(bot_token=...)
|
||||||
|
await user_client.connect()
|
||||||
|
await user_client.is_user_authorized()? → log error + return (must run --no-tui first)
|
||||||
|
warm_entity_cache()
|
||||||
|
_make_handler(channels) ← registers NewMessage handler
|
||||||
|
backfill_all()
|
||||||
|
run_until_disconnected() ┐
|
||||||
|
_watch_channels() ┘ gathered
|
||||||
|
```
|
||||||
|
|
||||||
|
Channel-change signal path:
|
||||||
|
```
|
||||||
|
ChannelPanel button → EvChannel* on bus → _drain_bus → _signal_channel_changed()
|
||||||
|
→ call_soon_threadsafe(asyncio.Event.set) → _watch_channels() wakes → _make_handler()
|
||||||
|
```
|
||||||
1016
tui/app.py
Normal file
1016
tui/app.py
Normal file
File diff suppressed because it is too large
Load Diff
66
tui/events.md
Normal file
66
tui/events.md
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
# tui/events.py
|
||||||
|
|
||||||
|
Thread-safe event bus between the bot backend thread and the Textual TUI.
|
||||||
|
The bot thread calls `post()`. The TUI drains the queue every 100ms via `_drain_bus()`.
|
||||||
|
|
||||||
|
## Public API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from tui import events as bus # from core/ and tui/app.py
|
||||||
|
from tui.events import post, init_bus, get_bus, tui_active
|
||||||
|
```
|
||||||
|
|
||||||
|
### `init_bus() -> queue.Queue`
|
||||||
|
Creates the `queue.Queue`. Called inside `MonitorApp.on_mount()` — **must run on Textual's event loop**, not before `App.run()`.
|
||||||
|
|
||||||
|
### `post(event: Any) -> None`
|
||||||
|
Fire-and-forget from any thread. Silently drops if bus not initialised.
|
||||||
|
Uses `queue.Queue.put_nowait()` — never blocks.
|
||||||
|
|
||||||
|
### `get_bus() -> queue.Queue | None`
|
||||||
|
Returns the queue for the TUI consumer to drain.
|
||||||
|
|
||||||
|
### `tui_active: bool`
|
||||||
|
Set to `True` by `init_bus()`. Checked by `core/tdl_downloader.py` to decide whether to pipe tdl output or inherit the terminal.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Event types
|
||||||
|
|
||||||
|
| Class | Fields | Posted by | Consumed by |
|
||||||
|
|-------|--------|-----------|-------------|
|
||||||
|
| `EvDownloadQueued` | `batch_id, filename, size_mb, source, password` | `tdl_downloader`, `scraper` | `DownloadPanel.queued()` |
|
||||||
|
| `EvDownloadStarted` | `batch_id, filename` | `tdl_downloader`, `scraper` | `DownloadPanel.status("downloading")` |
|
||||||
|
| `EvDownloadDone` | `batch_id, filename, via` | `tdl_downloader`, `scraper` | `DownloadPanel.status("done_tdl"\|"done_tel")` |
|
||||||
|
| `EvDownloadFailed` | `batch_id, filename, reason` | `tdl_downloader`, `scraper` | `DownloadPanel.status("failed")` |
|
||||||
|
| `EvTdlOutput` | `line` | `tdl_downloader._relay()` | `DownloadPanel.tdl_line()` |
|
||||||
|
| `EvHit` | `severity, raw, source, filename, reasons` | `notifier.notify()` | `HitsPanel.add_hit()` + `StatsPanel.refresh_stats()` |
|
||||||
|
| `EvChannelAdded` | `channel` | `ChannelPanel.on_button_pressed()` | `_drain_bus` → `_signal_channel_changed()` |
|
||||||
|
| `EvChannelRemoved` | `channel` | `ChannelPanel.on_button_pressed()` | `_drain_bus` → `_signal_channel_changed()` |
|
||||||
|
| `EvStatus` | `text, level` | everywhere | `MonitorApp.notify()` toast |
|
||||||
|
|
||||||
|
`level` on `EvStatus`: `"info"` (default) · `"warning"` · `"error"`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Threading model
|
||||||
|
|
||||||
|
```
|
||||||
|
Bot thread (own asyncio loop)
|
||||||
|
└─ bus.post(event) ← queue.Queue.put_nowait() [thread-safe]
|
||||||
|
↓
|
||||||
|
queue.Queue
|
||||||
|
↓
|
||||||
|
Textual thread (Textual's loop)
|
||||||
|
└─ _drain_bus() [set_interval 100ms]
|
||||||
|
└─ q.get_nowait() loop
|
||||||
|
└─ dispatch to widgets [safe, same thread as Textual]
|
||||||
|
```
|
||||||
|
|
||||||
|
Channel changes flow the other way:
|
||||||
|
```
|
||||||
|
_drain_bus sees EvChannelAdded/Removed
|
||||||
|
→ _signal_channel_changed()
|
||||||
|
→ loop.call_soon_threadsafe(asyncio.Event.set)
|
||||||
|
→ bot thread's _watch_channels() wakes
|
||||||
|
```
|
||||||
114
tui/events.py
Normal file
114
tui/events.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
"""
|
||||||
|
events.py — Thread-safe event bus between the bot backend and the TUI.
|
||||||
|
|
||||||
|
The bot backend runs in a dedicated thread with its own asyncio event loop
|
||||||
|
(completely isolated from Textual's loop). Events are posted via a standard
|
||||||
|
queue.Queue (thread-safe), and the TUI consumer polls it from Textual's loop
|
||||||
|
on a 100 ms timer (MonitorApp._drain_bus), draining it with get_nowait().
|
||||||
|
|
||||||
|
post() is safe to call from any thread or any asyncio loop.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import queue
|
||||||
|
import threading
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
# Thread-safe queue — works across the bot thread and Textual's thread.
|
||||||
|
_queue: queue.Queue | None = None
|
||||||
|
_queue_lock = threading.Lock()
|
||||||
|
|
||||||
|
# Set to True when the TUI is running so tdl pipes output instead of
|
||||||
|
# writing directly to the terminal.
|
||||||
|
tui_active: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
def init_bus() -> queue.Queue:
    """Create the event queue and flag the TUI as active.

    Call once from MonitorApp.on_mount(). Each call rebinds the module-level
    ``_queue`` to a fresh, unbounded ``queue.Queue`` (the module stops
    referencing any earlier queue) and sets ``tui_active`` to True.

    Returns:
        The newly created queue.
    """
    global _queue, tui_active
    _queue = queue.Queue()
    tui_active = True
    return _queue
|
||||||
|
|
||||||
|
|
||||||
|
def get_bus() -> queue.Queue | None:
    """Return the current event queue, or None if init_bus() has not run yet."""
    return _queue
|
||||||
|
|
||||||
|
|
||||||
|
def post(event: Any) -> None:
    """Enqueue *event* without blocking; a no-op while the bus is not set up.

    Safe to call from any thread. If the queue reports it is full, the event
    is dropped silently.
    """
    bus = _queue
    if bus is None:
        return
    try:
        bus.put_nowait(event)
    except queue.Full:
        pass
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Event types ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@dataclass
class EvDownloadQueued:
    """A file has been accepted and is waiting for tdl.

    tui/events.md lists ``tdl_downloader`` and ``scraper`` as producers and
    ``DownloadPanel.queued()`` as the consumer.
    """
    batch_id: str  # NOTE(review): presumably identifies the download batch — confirm with producers
    filename: str  # name of the queued file
    size_mb: float  # file size; megabytes, judging by the field name
    source: str  # NOTE(review): presumably the channel the file came from — confirm
    password: str | None  # NOTE(review): presumably an archive password, None when there is none — confirm
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EvDownloadStarted:
    """tdl has begun transferring this file.

    tui/events.md maps this event to ``DownloadPanel.status("downloading")``.
    """
    batch_id: str  # NOTE(review): presumably the same batch id as the matching EvDownloadQueued — confirm
    filename: str  # name of the file being transferred
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EvDownloadDone:
    """File fully downloaded (tdl or Telethon fallback).

    tui/events.md maps this event to ``DownloadPanel.status("done_tdl")`` or
    ``DownloadPanel.status("done_tel")``.
    """
    batch_id: str  # NOTE(review): presumably the same batch id as the matching EvDownloadQueued — confirm
    filename: str  # name of the downloaded file
    via: str  # "tdl" | "telethon" — which downloader finished the file
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EvDownloadFailed:
    """All download attempts failed.

    tui/events.md maps this event to ``DownloadPanel.status("failed")``.
    """
    batch_id: str  # NOTE(review): presumably the same batch id as the matching EvDownloadQueued — confirm
    filename: str  # name of the file that could not be downloaded
    reason: str  # failure description supplied by the producer
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EvTdlOutput:
    """A line of output from tdl's stdout/stderr (TUI mode only).

    tui/events.md lists ``tdl_downloader._relay()`` as the producer and
    ``DownloadPanel.tdl_line()`` as the consumer.
    """
    line: str  # one line of tdl output
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EvHit:
    """A scored credential hit to display in the hits panel.

    tui/events.md lists ``notifier.notify()`` as the producer; the TUI feeds it
    to ``HitsPanel.add_hit()`` and then refreshes ``StatsPanel``.
    """
    severity: str  # NOTE(review): presumably one of the scorer levels (CRITICAL/HIGH/MEDIUM/LOW) — confirm
    raw: str  # NOTE(review): presumably the original credential line — confirm with notifier
    source: str  # NOTE(review): presumably the channel the hit came from — confirm
    filename: str  # NOTE(review): presumably the file the hit was found in — confirm
    reasons: list[str] = field(default_factory=list)  # match reasons; a fresh empty list per instance by default
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EvChannelAdded:
    """A channel was added to the live watch list.

    tui/events.md lists ``ChannelPanel.on_button_pressed()`` as the producer;
    ``_drain_bus`` reacts by calling ``_signal_channel_changed()``.
    """
    channel: str | int  # NOTE(review): presumably a channel username (str) or numeric id (int) — confirm
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EvChannelRemoved:
    """A channel was removed from the live watch list.

    tui/events.md lists ``ChannelPanel.on_button_pressed()`` as the producer;
    ``_drain_bus`` reacts by calling ``_signal_channel_changed()``.
    """
    channel: str | int  # NOTE(review): presumably a channel username (str) or numeric id (int) — confirm
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EvStatus:
    """Generic one-line status message (startup, errors, etc.).

    tui/events.md says the TUI shows it as a ``MonitorApp.notify()`` toast.
    """
    text: str  # the message to display
    level: str = "info"  # "info" | "warning" | "error"
|
||||||
1
utils/__init__.py
Normal file
1
utils/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""utils — pure logic modules with no Telegram dependencies."""
|
||||||
32
utils/cache.md
Normal file
32
utils/cache.md
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
# utils/cache.py
|
||||||
|
|
||||||
|
Tracks already-processed Telegram document IDs to avoid redownloading.
|
||||||
|
Persists to `data/cache.json` as a JSON array of integers.
|
||||||
|
|
||||||
|
## Public API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from utils.cache import is_seen, mark_seen
|
||||||
|
```
|
||||||
|
|
||||||
|
### `is_seen(file_id: int) -> bool`
|
||||||
|
Returns `True` if this document ID has been processed before.
|
||||||
|
Loads from disk on every call (safe for multi-process, slightly slow for hot loops — not an issue given download cadence).
|
||||||
|
|
||||||
|
### `mark_seen(file_id: int) -> None`
|
||||||
|
Adds `file_id` to the cache and persists to disk.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Storage
|
||||||
|
|
||||||
|
- **File:** `data/cache.json`
|
||||||
|
- **Format:** JSON array of integers — `[123456789, 987654321, ...]`
|
||||||
|
- **No expiry** — grows indefinitely. Safe to delete to re-process all files.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- `is_seen` + `mark_seen` are called in `core/scraper.py` after a successful download+process cycle, not before — so a file that fails mid-process will be retried on next run.
|
||||||
|
- Not thread-safe (load/modify/save is not atomic). Acceptable because downloads are sequential within the bot loop.
|
||||||
38
utils/cache.py
Normal file
38
utils/cache.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
"""
|
||||||
|
cache.py — Tracks already-processed file IDs to avoid redownloading.
|
||||||
|
Persists to a simple JSON file on disk.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
CACHE_FILE = Path("./data/cache.json")
|
||||||
|
|
||||||
|
|
||||||
|
def _load() -> set:
    """Read the set of already-processed file IDs from ``CACHE_FILE``.

    Returns:
        The cached IDs, or an empty set when the cache file does not exist.
        An unreadable or malformed cache is also treated as empty (the worst
        case is that files are processed again), but the problem is logged
        rather than swallowed silently.
    """
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            return set(json.load(f))
    except FileNotFoundError:
        # First run: nothing has been cached yet.
        return set()
    except (OSError, ValueError, TypeError) as exc:
        # ValueError covers invalid JSON and decoding errors; TypeError covers
        # JSON that is valid but cannot be turned into a set (e.g. a number).
        log.warning("Ignoring unreadable cache file %s: %s", CACHE_FILE, exc)
        return set()
|
||||||
|
|
||||||
|
|
||||||
|
def _save(seen: set) -> None:
    """Persist *seen* to ``CACHE_FILE`` as a JSON array.

    The parent directory is created on demand. The data is first written to a
    sibling temporary file and then renamed over ``CACHE_FILE``, so an
    interrupted write cannot leave a truncated cache behind (which ``_load``
    would read as "nothing seen yet").
    """
    CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(list(seen), f)
    # Path.replace overwrites the destination in a single rename.
    tmp_file.replace(CACHE_FILE)
|
||||||
|
|
||||||
|
|
||||||
|
def is_seen(file_id: int) -> bool:
    """Report whether *file_id* is already recorded in the on-disk cache."""
    seen_ids = _load()
    return file_id in seen_ids
|
||||||
|
|
||||||
|
|
||||||
|
def mark_seen(file_id: int) -> None:
    """Record *file_id* in the cache and write the cache back to disk."""
    _save(_load() | {file_id})
    log.debug(f" Cached file ID {file_id}")
|
||||||
89
utils/database.md
Normal file
89
utils/database.md
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
# utils/database.py
|
||||||
|
|
||||||
|
SQLite persistence layer for credential hits.
|
||||||
|
DB file: `data/hits.db`
|
||||||
|
|
||||||
|
## Public API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from utils.database import init_db, insert_hits, search, recent, by_severity, stats
|
||||||
|
```
|
||||||
|
|
||||||
|
### Setup
|
||||||
|
|
||||||
|
#### `init_db() -> None`
|
||||||
|
Creates `hits` table and indexes if they don't exist. Call once on startup.
|
||||||
|
Safe to call multiple times (idempotent).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Writing
|
||||||
|
|
||||||
|
#### `insert_hits(scored_hits, source, filename, seen_before=False) -> int`
|
||||||
|
Inserts a list of `ScoredHit` objects. Returns row count inserted.
|
||||||
|
|
||||||
|
```python
|
||||||
|
insert_hits(new_hits, source="channelname", filename="combo.zip")
|
||||||
|
insert_hits(dupe_hits, source="channelname", filename="combo.zip", seen_before=True)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Querying
|
||||||
|
|
||||||
|
#### `search(keyword: str) -> list[sqlite3.Row]`
|
||||||
|
Substring search (SQL `LIKE '%keyword%'`, not an FTS index) across `url`, `username`, and `raw`. Returns rows sorted by score DESC, timestamp DESC.
|
||||||
|
|
||||||
|
#### `recent(limit: int = 50) -> list[sqlite3.Row]`
|
||||||
|
Most recent hits, newest first.
|
||||||
|
|
||||||
|
#### `by_severity(severity: str) -> list[sqlite3.Row]`
|
||||||
|
All unique (non-duplicate) hits at a given severity, newest first.
|
||||||
|
`severity` must be one of: `"CRITICAL"`, `"HIGH"`, `"MEDIUM"`, `"LOW"`
|
||||||
|
|
||||||
|
#### `stats() -> dict`
|
||||||
|
Returns summary counters:
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"total": int, # all rows
|
||||||
|
"unique": int, # seen_before=0
|
||||||
|
"duplicates": int, # seen_before=1
|
||||||
|
"critical": int, # unique CRITICAL
|
||||||
|
"high": int,
|
||||||
|
"medium": int,
|
||||||
|
"low": int,
|
||||||
|
"sources": int, # distinct source channels
|
||||||
|
"top_source": {"source": str, "cnt": int} | None,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Schema
|
||||||
|
|
||||||
|
```sql
|
||||||
|
hits (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
url TEXT,
|
||||||
|
username TEXT,
|
||||||
|
password TEXT,
|
||||||
|
raw TEXT NOT NULL, -- full original credential line
|
||||||
|
source TEXT, -- channel username or ID
|
||||||
|
filename TEXT, -- downloaded file name
|
||||||
|
timestamp TEXT NOT NULL, -- "YYYY-MM-DD HH:MM:SS UTC"
|
||||||
|
severity TEXT NOT NULL, -- CRITICAL/HIGH/MEDIUM/LOW
|
||||||
|
score INTEGER NOT NULL, -- 40/30/20/10
|
||||||
|
reasons TEXT, -- pipe-separated reason strings
|
||||||
|
seen_before INTEGER NOT NULL -- 0=new, 1=duplicate
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Indexes: `url`, `username`, `source`, `timestamp`, `severity`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Each query opens and closes its own connection via the `_connect()` context manager.
|
||||||
|
- `conn.row_factory = sqlite3.Row` — rows support both index and column-name access.
|
||||||
|
- Transactions: commit on success, rollback on exception.
|
||||||
171
utils/database.py
Normal file
171
utils/database.py
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
"""
|
||||||
|
database.py — SQLite storage for credential hits.
|
||||||
|
|
||||||
|
Schema:
|
||||||
|
hits table:
|
||||||
|
- id auto-increment primary key
|
||||||
|
- url the target URL from the credential line
|
||||||
|
- username extracted username/email
|
||||||
|
- password extracted password
|
||||||
|
- raw the full original line
|
||||||
|
- source channel/bot it came from
|
||||||
|
- filename the file it was found in
|
||||||
|
- timestamp UTC time of discovery
|
||||||
|
- severity CRITICAL / HIGH / MEDIUM / LOW
|
||||||
|
- score numeric score (higher = worse)
|
||||||
|
- reasons pipe-separated list of scoring reasons
|
||||||
|
- seen_before whether this was a duplicate (for stats)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
DB_FILE = Path("./data/hits.db")
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Setup ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@contextmanager
def _connect():
    """Yield a SQLite connection to ``DB_FILE`` wrapped in one transaction.

    Rows are returned as ``sqlite3.Row``. The transaction is committed when
    the ``with`` body finishes normally and rolled back (with the exception
    re-raised) when it raises. The connection is always closed afterwards.
    """
    # sqlite3 creates a missing database file, but not a missing directory.
    DB_FILE.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_FILE)
    conn.row_factory = sqlite3.Row
    try:
        yield conn
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def init_db() -> None:
    """Ensure the ``hits`` table and its lookup indexes exist.

    Every statement uses ``IF NOT EXISTS``, so the call can be repeated.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS hits (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT,
            username TEXT,
            password TEXT,
            raw TEXT NOT NULL,
            source TEXT,
            filename TEXT,
            timestamp TEXT NOT NULL,
            severity TEXT NOT NULL DEFAULT 'LOW',
            score INTEGER NOT NULL DEFAULT 10,
            reasons TEXT,
            seen_before INTEGER NOT NULL DEFAULT 0
        )
    """
    indexed_columns = ("url", "username", "source", "timestamp", "severity")
    with _connect() as conn:
        conn.execute(create_table_sql)
        for column in indexed_columns:
            conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{column} ON hits({column})")
    log.info(f"Database ready: {DB_FILE}")
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Writing ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def insert_hits(
    scored_hits: list,
    source: str,
    filename: str,
    seen_before: bool = False,
) -> int:
    """
    Store a batch of ScoredHit objects in the ``hits`` table.

    All rows of one call share the same UTC timestamp, *source*, *filename*
    and duplicate flag. Each hit's reasons are joined with " | ".

    Returns the number of rows inserted.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
    duplicate_flag = int(bool(seen_before))
    records = [
        (
            hit.url,
            hit.username,
            hit.password,
            hit.raw,
            source,
            filename,
            stamp,
            hit.severity,
            hit.score,
            " | ".join(hit.reasons),
            duplicate_flag,
        )
        for hit in scored_hits
    ]
    insert_sql = """
        INSERT INTO hits
        (url, username, password, raw, source, filename, timestamp,
        severity, score, reasons, seen_before)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """

    with _connect() as conn:
        conn.executemany(insert_sql, records)

    inserted = len(records)
    log.info(f" DB: inserted {inserted} row(s) from {filename}")
    return inserted
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Querying ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def search(keyword: str) -> list[sqlite3.Row]:
    """Return hits whose url, username or raw line contains *keyword*.

    The keyword is matched literally as a substring: the LIKE wildcards
    ``%`` and ``_`` inside it are escaped, so searching for ``john_doe`` does
    not also match ``johnXdoe``. An empty keyword matches every row.

    Returns:
        Matching rows ordered by score (highest first), then timestamp
        (newest first).
    """
    # Escape the escape character first, then the two LIKE wildcards.
    escaped = (
        keyword.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    pattern = f"%{escaped}%"
    with _connect() as conn:
        return conn.execute(
            "SELECT * FROM hits "
            "WHERE url LIKE ? ESCAPE '\\' "
            "OR username LIKE ? ESCAPE '\\' "
            "OR raw LIKE ? ESCAPE '\\' "
            "ORDER BY score DESC, timestamp DESC",
            (pattern,) * 3,
        ).fetchall()
|
||||||
|
|
||||||
|
|
||||||
|
def recent(limit: int = 50) -> list[sqlite3.Row]:
    """Return up to *limit* hits, newest first.

    Rows inserted by one ``insert_hits`` call share a single timestamp, so the
    auto-increment ``id`` is used as a tie-breaker to keep the order
    deterministic (later inserts first).
    """
    with _connect() as conn:
        return conn.execute("""
            SELECT * FROM hits
            ORDER BY timestamp DESC, id DESC
            LIMIT ?
        """, (limit,)).fetchall()
|
||||||
|
|
||||||
|
|
||||||
|
def by_severity(severity: str) -> list[sqlite3.Row]:
    """Return the non-duplicate hits of one severity level, newest first.

    Only rows with ``seen_before = 0`` are returned; duplicates are left out.
    *severity* is compared for exact equality with the stored value. Rows
    with the same timestamp are ordered by ``id`` (later inserts first) so
    the result order is deterministic.
    """
    with _connect() as conn:
        return conn.execute("""
            SELECT * FROM hits
            WHERE severity = ? AND seen_before = 0
            ORDER BY timestamp DESC, id DESC
        """, (severity,)).fetchall()
|
||||||
|
|
||||||
|
|
||||||
|
def stats() -> dict:
    """Return summary counters for the ``hits`` table.

    Returns:
        A dict with the keys ``total`` (all rows), ``unique`` (rows with
        ``seen_before = 0``), ``duplicates`` (``total - unique``),
        ``critical`` / ``high`` / ``medium`` / ``low`` (unique rows per
        severity), ``sources`` (number of distinct sources) and
        ``top_source`` (``{"source": ..., "cnt": ...}`` for the source with
        the most rows, or None when the table is empty).
    """
    with _connect() as conn:
        total = conn.execute("SELECT COUNT(*) FROM hits").fetchone()[0]
        # One grouped scan instead of a separate COUNT(*) per severity.
        unique_by_severity = {
            row[0]: row[1]
            for row in conn.execute(
                "SELECT severity, COUNT(*) FROM hits "
                "WHERE seen_before=0 GROUP BY severity"
            )
        }
        sources = conn.execute("SELECT COUNT(DISTINCT source) FROM hits").fetchone()[0]
        top_source = conn.execute("""
            SELECT source, COUNT(*) as cnt FROM hits
            GROUP BY source ORDER BY cnt DESC LIMIT 1
        """).fetchone()

    # Every unique row falls into exactly one severity group.
    unique = sum(unique_by_severity.values())
    return {
        "total": total,
        "unique": unique,
        "duplicates": total - unique,
        "critical": unique_by_severity.get("CRITICAL", 0),
        "high": unique_by_severity.get("HIGH", 0),
        "medium": unique_by_severity.get("MEDIUM", 0),
        "low": unique_by_severity.get("LOW", 0),
        "sources": sources,
        "top_source": dict(top_source) if top_source else None,
    }
|
||||||
87
utils/scorer.md
Normal file
87
utils/scorer.md
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
# utils/scorer.py
|
||||||
|
|
||||||
|
Severity scoring for credential hits. No Telegram deps. Pure logic.
|
||||||
|
|
||||||
|
## Public API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from utils.scorer import score_hit, score_hits, summarize, ScoredHit
|
||||||
|
from utils.scorer import CRITICAL, HIGH, MEDIUM, LOW, SEVERITY_EMOJI, SEVERITY_SCORES
|
||||||
|
```
|
||||||
|
|
||||||
|
### `score_hit(line: str) -> ScoredHit`
|
||||||
|
Score a single raw credential line. Parses ULP format (`url:user:pass`), runs all checks, returns a `ScoredHit`.
|
||||||
|
|
||||||
|
### `score_hits(lines: list[str]) -> list[ScoredHit]`
|
||||||
|
Score a list of lines. Returns sorted descending by score.
|
||||||
|
|
||||||
|
### `summarize(scored: list[ScoredHit]) -> dict`
|
||||||
|
Returns `{CRITICAL: n, HIGH: n, MEDIUM: n, LOW: n}`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ScoredHit dataclass
|
||||||
|
|
||||||
|
| Field | Type | Description |
|
||||||
|
|-------|------|-------------|
|
||||||
|
| `raw` | str | Original credential line |
|
||||||
|
| `severity` | str | CRITICAL / HIGH / MEDIUM / LOW |
|
||||||
|
| `score` | int | 40 / 30 / 20 / 10 |
|
||||||
|
| `reasons` | list[str] | Human-readable match reasons |
|
||||||
|
| `url` | str\|None | Parsed URL field |
|
||||||
|
| `username` | str\|None | Parsed username/email field |
|
||||||
|
| `password` | str\|None | Parsed password field |
|
||||||
|
| `.emoji` | property | 🔴🟠🟡🟢 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Scoring rules (highest match wins)
|
||||||
|
|
||||||
|
| Severity | Triggers |
|
||||||
|
|----------|----------|
|
||||||
|
| CRITICAL | Employee email domain after `@` in username/line · Privileged service URL (admin, vpn, ssh, rdp, gitlab, jira…) |
|
||||||
|
| HIGH | Internal service URL (intranet, erp, crm, sso, owa, sharepoint…) |
|
||||||
|
| MEDIUM | Client-facing URL (app, patient, booking, helpdesk…) |
|
||||||
|
| LOW | Org domain appears anywhere in line (baseline) |
|
||||||
|
|
||||||
|
Check 6 never changes the severity; it only appends a warning to `reasons` when the password is 6 characters or shorter, or is one of a short list of common passwords.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Employee domain matching
|
||||||
|
|
||||||
|
Keywords in `config.TARGET_KEYWORDS` containing `@` become employee patterns.
|
||||||
|
Pattern: `@<domain>(?:[^a-zA-Z0-9.\-]|$)` — requires literal `@` before the domain.
|
||||||
|
**`user@gmail.com` on a URL containing `myorg.cl` does NOT trigger CRITICAL.**
|
||||||
|
|
||||||
|
Keywords without `@` go only to `ORG_DOMAINS` (LOW baseline).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ULP line parser (`ULP_PATTERN`)
|
||||||
|
|
||||||
|
Separators: `:` `;` `,` `|` `\t` (any of these between the three fields).
|
||||||
|
|
||||||
|
The URL field handles two common stealer-log complications:
|
||||||
|
|
||||||
|
1. **`://` not treated as separator** — the optional scheme prefix `(?:https?|ftp)://` is consumed before the character-class match, so `https://` never gets split at the colon.
|
||||||
|
|
||||||
|
2. **Port + path consumed into the URL** — the optional group `(?::\d+/[^\s:;,|\t]*)?` absorbs `:port/path` when the port is pure digits immediately followed by `/`. This correctly handles `http://host:8085/path/:user:pass` but intentionally skips patterns like `:24145487-8` (RUT number — hyphen after digits, no `/`).
|
||||||
|
|
||||||
|
**Known limitation:** A bare port with no path (e.g. `https://host:8080:user:pass`) will mis-parse `8080` as the username. This is not observed in practice — stealer logs always include at least a trailing `/`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Module-level globals (rebuilt on import + via KeywordsScreen)
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
|------|------|-------------|
|
||||||
|
| `EMPLOYEE_DOMAINS` | `list[tuple[str, Pattern]]` | `(domain_str, anchored_pattern)` for `@`-keywords |
|
||||||
|
| `ORG_DOMAINS` | `list[Pattern]` | Plain domain patterns for all keywords |
|
||||||
|
|
||||||
|
To rebuild after editing `config.TARGET_KEYWORDS` at runtime:
|
||||||
|
```python
|
||||||
|
import utils.scorer as scorer
|
||||||
|
scorer.EMPLOYEE_DOMAINS = scorer._build_employee_domains()
|
||||||
|
scorer.ORG_DOMAINS = scorer._build_org_domains()
|
||||||
|
```
|
||||||
273
utils/scorer.py
Normal file
273
utils/scorer.py
Normal file
@@ -0,0 +1,273 @@
|
|||||||
|
"""
|
||||||
|
scorer.py — Severity scoring for credential hits.
|
||||||
|
|
||||||
|
Scoring logic (highest match wins):
|
||||||
|
|
||||||
|
CRITICAL — Employee credentials (internal email domain)
|
||||||
|
e.g. jdoe@yourclinic.cl:password
|
||||||
|
— Admin/privileged service URLs
|
||||||
|
e.g. admin., vpn., ssh., rdp., gitlab., jira.
|
||||||
|
|
||||||
|
HIGH — Internal-facing services
|
||||||
|
e.g. intranet., erp., crm., portal., citrix.
|
||||||
|
— Password manager or SSO hits
|
||||||
|
— (Usernames on an employee email domain are scored CRITICAL, not HIGH; see above)
|
||||||
|
|
||||||
|
MEDIUM — Client-facing portals
|
||||||
|
e.g. app., patient., client., booking.
|
||||||
|
— Domain match on a non-privileged service
|
||||||
|
|
||||||
|
LOW — Generic domain keyword match
|
||||||
|
— No URL parsed, just a raw domain mention
|
||||||
|
|
||||||
|
Each scored hit is returned as a ScoredHit dataclass with:
|
||||||
|
- severity: CRITICAL / HIGH / MEDIUM / LOW
|
||||||
|
- score: int (higher = worse)
|
||||||
|
- reasons: list of human-readable reasons
|
||||||
|
- raw: original line
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from config import TARGET_KEYWORDS
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Severity levels ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Severity labels, most severe first. The label text doubles as the value
# stored on each ScoredHit.
CRITICAL, HIGH, MEDIUM, LOW = "CRITICAL", "HIGH", "MEDIUM", "LOW"

# Numeric weight per severity; a higher number means a worse hit.
SEVERITY_SCORES = {CRITICAL: 40, HIGH: 30, MEDIUM: 20, LOW: 10}

# Marker shown next to each severity when a hit is rendered as text.
SEVERITY_EMOJI = {CRITICAL: "🔴", HIGH: "🟠", MEDIUM: "🟡", LOW: "🟢"}
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Pattern banks ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Subdomains/services that indicate privileged access
|
||||||
|
def _service_pattern(*names: str) -> re.Pattern:
    r"""Compile a case-insensitive matcher for service names in a URL.

    A name only counts when it sits at the very start of the string,
    directly after an ``http://`` / ``https://`` scheme, or directly
    after a dot. Names are regex fragments, so they may carry their own
    syntax (``ad\.``, ``accounts?``). Nothing anchors the end of a name,
    so longer words starting with a name also match.
    """
    return re.compile(
        r"(?:^|https?://|\.)(" + "|".join(names) + ")",
        re.IGNORECASE,
    )


# Privileged / administrative access.
CRITICAL_SERVICES = _service_pattern(
    "admin", "vpn", "ssh", "rdp", "ftp", "sftp",
    "gitlab", "github", "bitbucket", "jenkins",
    "jira", "confluence", "grafana", "kibana", "sentry",
    "vault", "bastion", "jump",
    "firewall", "router", "switch", "proxy",
    "ldap", r"ad\.", "activedirectory",
    "exchange", r"mail\.",
)

# Internal-facing business services.
HIGH_SERVICES = _service_pattern(
    "intranet", "erp", "crm", "portal", "citrix", "workspace",
    "webmail", "owa",
    "sharepoint", "teams", "slack", "zoom", "meet",
    "sso", "login", "auth", "oauth",
    "accounts?", "dashboard", "internal", "corp", "staff",
    "hr", "payroll",
    "finance", "accounting",
)

# Client-facing portals.
MEDIUM_SERVICES = _service_pattern(
    "app", "patient", "client", "customer", "booking", "appointment",
    "reserva", "cita", "paciente", "user", "member", "registro", "signup",
    "support", "helpdesk", "ticket",
)
|
||||||
|
|
||||||
|
# Looks like a corporate email (user@domain)
|
||||||
|
# Matches local-part@domain; group 1 captures the domain (requires a dot and
# a TLD of at least two letters).
# NOTE(review): no scoring check visible in this module references this
# pattern; confirm it is still used elsewhere before relying on it.
EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9._%+\-]+@([a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})")
|
||||||
|
|
||||||
|
# ULP line parser
|
||||||
|
# Separator set: colon, semicolon, comma, pipe, tab.
|
||||||
|
# URL field: optional scheme (http/https/ftp) consumed first so '://' is never
|
||||||
|
# mistaken for a separator; then an optional port group ':\d+/' absorbs port+path
|
||||||
|
# (port is digits immediately followed by '/') so 'http://host:88/path:user:pass'
|
||||||
|
# yields url='http://host:88/path', not url='http'.
|
||||||
|
# Building blocks for the url / username / password line parser.
_ULP_SEP = r"(?:[:;,|\t])"       # exactly one separator character
_ULP_FIELD_CHAR = r"[^\s:;,|\t]"  # anything that is not whitespace or a separator

ULP_PATTERN = re.compile(
    "".join((
        r"^(?P<url>",
        # Optional scheme is consumed whole, so its ':' never acts as a separator.
        r"(?:(?:https?|ftp)://)?", _ULP_FIELD_CHAR, "+",
        # Optional ':port/path' (digits followed by '/') stays inside the URL.
        r"(?::\d+/", _ULP_FIELD_CHAR, "*)?",
        r")",
        _ULP_SEP,
        r"(?P<username>", _ULP_FIELD_CHAR, "+)",
        _ULP_SEP,
        # The password takes the rest of the line, separators included.
        r"(?P<password>.+)$",
    ))
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Derived from config ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _kw_to_domain(kw: str) -> str:
    r"""Reduce a keyword regex to the plain domain string it targets.

    Steps, in order:

    1. Keep only the text after the last ``@``. A keyword such as
       ``user@org\.cl`` therefore yields ``org.cl`` rather than gluing
       the local part onto the domain.
    2. Unescape backslash-escaped punctuation (``\.`` -> ``.``,
       ``\-`` -> ``-``). Escapes followed by a letter or digit are left
       alone because they are regex classes, not literal characters.
    3. Strip ``^`` / ``$`` anchors from both ends and any leading dots.

    Args:
        kw: A keyword as written in the configuration (may contain
            regex syntax).

    Returns:
        The bare domain, or an empty string when nothing is left.
    """
    domain = kw.rpartition("@")[2]
    domain = re.sub(r"\\([^a-zA-Z0-9])", r"\1", domain)
    return domain.strip("^$").lstrip(".")
|
||||||
|
|
||||||
|
|
||||||
|
def _build_employee_domains() -> list[tuple[str, re.Pattern]]:
    """Compile one '@domain' matcher per '@'-bearing keyword.

    Only entries of TARGET_KEYWORDS that contain '@' take part. Each
    compiled pattern demands a literal '@' right before the domain and a
    character that cannot continue a hostname (or end of string) right
    after it, so an org domain appearing inside a URL cannot turn an
    unrelated address such as @gmail.com into an employee match.

    Returns:
        (domain_str, compiled_pattern) pairs in keyword order; keywords
        that reduce to an empty domain are dropped.
    """
    at_domains = (_kw_to_domain(kw) for kw in TARGET_KEYWORDS if "@" in kw)
    return [
        (
            name,
            re.compile(
                r"@" + re.escape(name) + r"(?:[^a-zA-Z0-9.\-]|$)",
                re.IGNORECASE,
            ),
        )
        for name in at_domains
        if name
    ]
|
||||||
|
|
||||||
|
# Evaluated once when the module is imported. score_hit() reads this global
# on every call, so assigning a fresh _build_employee_domains() result to it
# takes effect immediately.
EMPLOYEE_DOMAINS = _build_employee_domains()
|
||||||
|
|
||||||
|
|
||||||
|
def _build_org_domains() -> list[re.Pattern]:
    """Compile every keyword into a plain, case-insensitive domain matcher.

    These patterns back the LOW baseline check: they match the org
    domain wherever it occurs in a line. Keywords that reduce to an
    empty domain are dropped.
    """
    plain_domains = (_kw_to_domain(kw) for kw in TARGET_KEYWORDS)
    return [
        re.compile(re.escape(name), re.IGNORECASE)
        for name in plain_domains
        if name
    ]
|
||||||
|
|
||||||
|
# Evaluated once when the module is imported. score_hit() reads this global
# on every call, so assigning a fresh _build_org_domains() result to it takes
# effect immediately.
ORG_DOMAINS = _build_org_domains()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Scoring logic ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ScoredHit:
|
||||||
|
raw: str
|
||||||
|
severity: str
|
||||||
|
score: int
|
||||||
|
reasons: list[str] = field(default_factory=list)
|
||||||
|
url: str | None = None
|
||||||
|
username: str | None = None
|
||||||
|
password: str | None = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def emoji(self) -> str:
|
||||||
|
return SEVERITY_EMOJI.get(self.severity, "⚪")
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
return f"{self.emoji} [{self.severity}] {self.raw}"
|
||||||
|
|
||||||
|
|
||||||
|
# Passwords common enough to be called out by name (compared lower-cased).
_COMMON_PASSWORDS = frozenset(
    {"123456", "password", "qwerty", "111111", "admin", "letmein"}
)


def score_hit(line: str) -> ScoredHit:
    """Score a single credential line.

    The line is stripped, parsed with ULP_PATTERN when it has the
    three-field shape, and run through six checks. The highest severity
    that any check produced wins; LOW is the default when nothing matched.

    Args:
        line: One raw credential line.

    Returns:
        A ScoredHit holding the stripped line, the final severity and its
        numeric score, the reasons collected along the way, and the parsed
        url / username / password (each ``None`` when the line did not
        parse).
    """
    line = line.strip()
    reasons: list[str] = []
    matched: set[str] = set()

    # Parse ULP fields if possible.
    url = username = password = None
    parsed = ULP_PATTERN.match(line)
    if parsed:
        url, username, password = parsed.group("url", "username", "password")

    # Check 1: employee email domain.
    # Searching the whole line is sufficient: a parsed username is a
    # substring of the line and is always followed there by a separator,
    # which satisfies the pattern's trailing boundary. The pattern itself
    # requires a literal '@' before the domain, so an org domain inside the
    # URL cannot promote an unrelated address.
    for domain_str, pat in EMPLOYEE_DOMAINS:
        if pat.search(line):
            matched.add(CRITICAL)
            reasons.append(f"Employee email domain: {domain_str}")
            break

    # Checks 2-4: classify the URL. All three run, so one URL can contribute
    # several reasons; only the highest severity survives below.
    if url:
        url_checks = (
            (CRITICAL, CRITICAL_SERVICES, "Critical service URL"),
            (HIGH, HIGH_SERVICES, "High-value internal service"),
            (MEDIUM, MEDIUM_SERVICES, "Client-facing service"),
        )
        for severity, pattern, label in url_checks:
            if pattern.search(url):
                matched.add(severity)
                reasons.append(f"{label}: {url}")

    # Check 5: baseline org-domain match, used only when nothing else fired.
    if not matched and any(p.search(line) for p in ORG_DOMAINS):
        matched.add(LOW)
        reasons.append("Org domain match in line")

    # Check 6: password quality notes. These never change the severity.
    if password:
        if len(password) <= 6:
            reasons.append(f"⚠ Weak password ({len(password)} chars)")
        if password.lower() in _COMMON_PASSWORDS:
            reasons.append(f"⚠ Common password: {password}")

    # Resolve the final severity: highest matched level, LOW by default.
    final_severity = next(
        (level for level in (CRITICAL, HIGH, MEDIUM, LOW) if level in matched),
        LOW,
    )

    if not reasons:
        reasons.append("Pattern match")

    return ScoredHit(
        raw=line,
        severity=final_severity,
        score=SEVERITY_SCORES[final_severity],
        reasons=reasons,
        url=url,
        username=username,
        password=password,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def score_hits(lines: list[str]) -> list[ScoredHit]:
    """Score every line and return the results, worst first.

    Hits with equal scores keep the relative order of their input lines.
    """
    return sorted(
        (score_hit(entry) for entry in lines),
        key=lambda hit: hit.score,
        reverse=True,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def summarize(scored: list[ScoredHit]) -> dict:
    """Tally hits per severity level.

    Returns a dict keyed by CRITICAL, HIGH, MEDIUM and LOW (in that
    order); levels with no hits are present with a count of 0.
    """
    tally = dict.fromkeys((CRITICAL, HIGH, MEDIUM, LOW), 0)
    for hit in scored:
        tally[hit.severity] = tally[hit.severity] + 1
    return tally
|
||||||
Reference in New Issue
Block a user