Initial commit: ULPgrammer

- Core Telegram monitoring pipeline (scraper, processor, notifier, downloaders)
- Textual TUI frontend with thread-safe event bus
- SQLite persistence, severity scoring, dedup cache
- Fixed ULP parser: handles https:// truncation, port+path URLs, semicolon separator
- Test suite: 88 tests across scorer, cache, database, processor
2026-04-02 01:58:49 -03:00
commit 48f486ac97
41 changed files with 5270 additions and 0 deletions
--- a/utils/__init__.py
+++ b/utils/__init__.py
@@ -0,0 +1 @@
+"""utils — pure logic modules with no Telegram dependencies."""
--- a/utils/cache.md
+++ b/utils/cache.md
@@ -0,0 +1,32 @@
+# utils/cache.py
+
+Tracks already-processed Telegram document IDs to avoid redownloading.  
+Persists to `data/cache.json` as a JSON array of integers.
+
+## Public API
+
+```python
+from utils.cache import is_seen, mark_seen
+```
+
+### `is_seen(file_id: int) -> bool`
+Returns `True` if this document ID has been processed before.  
+Loads from disk on every call, so the answer always reflects the file's current contents, including entries written by other processes (slightly slow for hot loops — not an issue given download cadence).
+
+### `mark_seen(file_id: int) -> None`
+Adds `file_id` to the cache and persists to disk.
+
+---
+
+## Storage
+
+- **File:** `data/cache.json`
+- **Format:** JSON array of integers — `[123456789, 987654321, ...]`
+- **No expiry** — grows indefinitely. Safe to delete to re-process all files.
+
+---
+
+## Notes
+
+- `is_seen` + `mark_seen` are called in `core/scraper.py` after a successful download+process cycle, not before — so a file that fails mid-process will be retried on next run.
+- Not thread-safe (load/modify/save is not atomic). Acceptable because downloads are sequential within the bot loop.
--- a/utils/cache.py
+++ b/utils/cache.py
@@ -0,0 +1,38 @@
+"""
+cache.py — Tracks already-processed file IDs to avoid redownloading.
+Persists to a simple JSON file on disk.
+"""
+
+import json
+import logging
+from pathlib import Path
+
+log = logging.getLogger(__name__)
+
+CACHE_FILE = Path("./data/cache.json")
+
+
+def _load() -> set:
+    """
+    Read the set of already-seen file IDs from CACHE_FILE.
+
+    Returns an empty set when the cache file does not exist, or when it
+    cannot be read or parsed. The second case is logged as a warning
+    instead of being swallowed silently, because the next _save() call
+    will overwrite the unreadable file with a fresh cache.
+    """
+    try:
+        with open(CACHE_FILE, "r", encoding="utf-8") as f:
+            return set(json.load(f))
+    except FileNotFoundError:
+        # First run, or the cache was deleted on purpose to re-process files.
+        return set()
+    except Exception as exc:
+        # Deliberately broad: loading the cache is best-effort and must never
+        # stop the caller, but a damaged cache should be visible in the logs.
+        log.warning("Cache file %s is unreadable (%s); starting empty", CACHE_FILE, exc)
+        return set()
+
+
+def _save(seen: set) -> None:
+    """
+    Persist the seen-ID set to CACHE_FILE as a JSON array.
+
+    Creates the parent directory when it is missing. The data is written to
+    a sibling "<name>.tmp" file first and then renamed over CACHE_FILE, so a
+    write that is interrupted half-way does not truncate the existing cache.
+    """
+    CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
+    tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
+    with open(tmp_file, "w", encoding="utf-8") as f:
+        json.dump(list(seen), f)
+    # Path.replace() overwrites the destination with a single rename.
+    tmp_file.replace(CACHE_FILE)
+
+
+def is_seen(file_id: int) -> bool:
+    """Return True when file_id is present in the cache loaded from disk."""
+    seen_ids = _load()
+    return file_id in seen_ids
+
+
+def mark_seen(file_id: int) -> None:
+    """Add file_id to the cached ID set and write the set back to disk."""
+    updated_ids = _load() | {file_id}
+    _save(updated_ids)
+    log.debug(f"  Cached file ID {file_id}")
--- a/utils/database.md
+++ b/utils/database.md
@@ -0,0 +1,89 @@
+# utils/database.py
+
+SQLite persistence layer for credential hits.  
+DB file: `data/hits.db`
+
+## Public API
+
+```python
+from utils.database import init_db, insert_hits, search, recent, by_severity, stats
+```
+
+### Setup
+
+#### `init_db() -> None`
+Creates `hits` table and indexes if they don't exist. Call once on startup.  
+Safe to call multiple times (idempotent).
+
+---
+
+### Writing
+
+#### `insert_hits(scored_hits, source, filename, seen_before=False) -> int`
+Inserts a list of `ScoredHit` objects. Returns row count inserted.
+
+```python
+insert_hits(new_hits, source="channelname", filename="combo.zip")
+insert_hits(dupe_hits, source="channelname", filename="combo.zip", seen_before=True)
+```
+
+---
+
+### Querying
+
+#### `search(keyword: str) -> list[sqlite3.Row]`
+Substring search (SQL `LIKE '%keyword%'`) across `url`, `username`, `raw`. Returns rows sorted by score DESC, timestamp DESC.
+
+#### `recent(limit: int = 50) -> list[sqlite3.Row]`
+Most recent hits, newest first.
+
+#### `by_severity(severity: str) -> list[sqlite3.Row]`
+All unique (non-duplicate) hits at a given severity, newest first.  
+`severity` must be one of: `"CRITICAL"`, `"HIGH"`, `"MEDIUM"`, `"LOW"`
+
+#### `stats() -> dict`
+Returns summary counters:
+```python
+{
+    "total":      int,   # all rows
+    "unique":     int,   # seen_before=0
+    "duplicates": int,   # seen_before=1
+    "critical":   int,   # unique CRITICAL
+    "high":       int,
+    "medium":     int,
+    "low":        int,
+    "sources":    int,   # distinct source channels
+    "top_source": {"source": str, "cnt": int} | None,
+}
+```
+
+---
+
+## Schema
+
+```sql
+hits (
+    id          INTEGER PRIMARY KEY AUTOINCREMENT,
+    url         TEXT,
+    username    TEXT,
+    password    TEXT,
+    raw         TEXT NOT NULL,      -- full original credential line
+    source      TEXT,               -- channel username or ID
+    filename    TEXT,               -- downloaded file name
+    timestamp   TEXT NOT NULL,      -- "YYYY-MM-DD HH:MM:SS UTC"
+    severity    TEXT NOT NULL,      -- CRITICAL/HIGH/MEDIUM/LOW
+    score       INTEGER NOT NULL,   -- 40/30/20/10
+    reasons     TEXT,               -- pipe-separated reason strings
+    seen_before INTEGER NOT NULL    -- 0=new, 1=duplicate
+)
+```
+
+Indexes: `url`, `username`, `source`, `timestamp`, `severity`.
+
+---
+
+## Notes
+
+- Each query opens and closes its own connection via the `_connect()` context manager.
+- `conn.row_factory = sqlite3.Row` — rows support both index and column-name access.
+- Transactions: commit on success, rollback on exception.
--- a/utils/database.py
+++ b/utils/database.py
@@ -0,0 +1,171 @@
+"""
+database.py — SQLite storage for credential hits.
+
+Schema:
+  hits table:
+    - id          auto-increment primary key
+    - url         the target URL from the credential line
+    - username    extracted username/email
+    - password    extracted password
+    - raw         the full original line
+    - source      channel/bot it came from
+    - filename    the file it was found in
+    - timestamp   UTC time of discovery
+    - severity    CRITICAL / HIGH / MEDIUM / LOW
+    - score       numeric score (higher = worse)
+    - reasons     pipe-separated list of scoring reasons
+    - seen_before whether this was a duplicate (for stats)
+"""
+
+import sqlite3
+import logging
+from datetime import datetime, timezone
+from pathlib import Path
+from contextlib import contextmanager
+
+log = logging.getLogger(__name__)
+
+DB_FILE = Path("./data/hits.db")
+
+
+# ─── Setup ────────────────────────────────────────────────────────────────────
+
+@contextmanager
+def _connect():
+    """
+    Yield a sqlite3 connection to DB_FILE whose rows are sqlite3.Row objects.
+
+    The transaction is committed when the with-body finishes normally and
+    rolled back (with the exception re-raised) when it raises. The
+    connection is closed in either case.
+    """
+    # sqlite3.connect() creates a missing database file but not a missing
+    # directory, so make sure the parent folder exists first.
+    DB_FILE.parent.mkdir(parents=True, exist_ok=True)
+    conn = sqlite3.connect(DB_FILE)
+    conn.row_factory = sqlite3.Row
+    try:
+        yield conn
+        conn.commit()
+    except Exception:
+        conn.rollback()
+        raise
+    finally:
+        conn.close()
+
+
+def init_db() -> None:
+    """Create the hits table and its five indexes when they are missing."""
+    create_table_sql = """
+            CREATE TABLE IF NOT EXISTS hits (
+                id          INTEGER PRIMARY KEY AUTOINCREMENT,
+                url         TEXT,
+                username    TEXT,
+                password    TEXT,
+                raw         TEXT NOT NULL,
+                source      TEXT,
+                filename    TEXT,
+                timestamp   TEXT NOT NULL,
+                severity    TEXT NOT NULL DEFAULT 'LOW',
+                score       INTEGER NOT NULL DEFAULT 10,
+                reasons     TEXT,
+                seen_before INTEGER NOT NULL DEFAULT 0
+            )
+        """
+    # One index per column the query helpers filter or sort on.
+    create_index_sql = (
+        "CREATE INDEX IF NOT EXISTS idx_url       ON hits(url)",
+        "CREATE INDEX IF NOT EXISTS idx_username  ON hits(username)",
+        "CREATE INDEX IF NOT EXISTS idx_source    ON hits(source)",
+        "CREATE INDEX IF NOT EXISTS idx_timestamp ON hits(timestamp)",
+        "CREATE INDEX IF NOT EXISTS idx_severity  ON hits(severity)",
+    )
+    with _connect() as conn:
+        conn.execute(create_table_sql)
+        for statement in create_index_sql:
+            conn.execute(statement)
+    log.info(f"Database ready: {DB_FILE}")
+
+
+# ─── Writing ─────────────────────────────────────────────────────────────────
+
+def insert_hits(
+    scored_hits: list,
+    source: str,
+    filename: str,
+    seen_before: bool = False,
+) -> int:
+    """
+    Store a batch of ScoredHit objects as rows of the hits table.
+
+    Every row in the batch shares one UTC timestamp, the given source and
+    filename, and the same seen_before flag (stored as 1 or 0). Each hit's
+    reasons are joined with " | ". Returns the number of rows written.
+    """
+    now_utc = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
+    duplicate_flag = 1 if seen_before else 0
+    rows = [
+        (
+            hit.url,
+            hit.username,
+            hit.password,
+            hit.raw,
+            source,
+            filename,
+            now_utc,
+            hit.severity,
+            hit.score,
+            " | ".join(hit.reasons),
+            duplicate_flag,
+        )
+        for hit in scored_hits
+    ]
+
+    insert_sql = """
+            INSERT INTO hits
+              (url, username, password, raw, source, filename, timestamp,
+               severity, score, reasons, seen_before)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        """
+    with _connect() as conn:
+        conn.executemany(insert_sql, rows)
+
+    inserted = len(rows)
+    log.info(f"  DB: inserted {inserted} row(s) from {filename}")
+    return inserted
+
+
+# ─── Querying ─────────────────────────────────────────────────────────────────
+
+def search(keyword: str) -> list[sqlite3.Row]:
+    """
+    Find hits whose url, username, or raw line contains keyword.
+
+    The keyword is matched literally: the LIKE wildcards '%' and '_' (and
+    the escape character '\\' itself) are escaped before the query runs, so
+    searching for "john_doe" does not also match "johnXdoe". Results are
+    ordered by score, then timestamp, both descending.
+    """
+    # Escape the backslash first so the escapes added below are not doubled.
+    escaped = (
+        keyword.replace("\\", "\\\\")
+        .replace("%", "\\%")
+        .replace("_", "\\_")
+    )
+    like_arg = f"%{escaped}%"
+    with _connect() as conn:
+        return conn.execute("""
+            SELECT * FROM hits
+            WHERE url LIKE ? ESCAPE '\\'
+               OR username LIKE ? ESCAPE '\\'
+               OR raw LIKE ? ESCAPE '\\'
+            ORDER BY score DESC, timestamp DESC
+        """, (like_arg,) * 3).fetchall()
+
+
+def recent(limit: int = 50) -> list[sqlite3.Row]:
+    """Fetch up to `limit` hits ordered by timestamp, newest first."""
+    query = """
+            SELECT * FROM hits
+            ORDER BY timestamp DESC
+            LIMIT ?
+        """
+    with _connect() as conn:
+        cursor = conn.execute(query, (limit,))
+        return cursor.fetchall()
+
+
+def by_severity(severity: str) -> list[sqlite3.Row]:
+    """
+    Fetch the non-duplicate hits (seen_before = 0) of one severity level,
+    newest first.
+    """
+    query = """
+            SELECT * FROM hits
+            WHERE severity = ? AND seen_before = 0
+            ORDER BY timestamp DESC
+        """
+    params = (severity,)
+    with _connect() as conn:
+        return conn.execute(query, params).fetchall()
+
+
+def stats() -> dict:
+    """
+    Collect summary counters over the hits table.
+
+    The per-severity counters only include rows with seen_before=0, while
+    "total", "sources" and "top_source" are computed over every row.
+    "duplicates" is derived as total minus unique.
+    """
+    with _connect() as conn:
+
+        def count(sql: str) -> int:
+            # Run a single-value COUNT query and unwrap the number.
+            return conn.execute(sql).fetchone()[0]
+
+        total = count("SELECT COUNT(*) FROM hits")
+        unique = count("SELECT COUNT(*) FROM hits WHERE seen_before=0")
+        critical = count("SELECT COUNT(*) FROM hits WHERE severity='CRITICAL' AND seen_before=0")
+        high = count("SELECT COUNT(*) FROM hits WHERE severity='HIGH' AND seen_before=0")
+        medium = count("SELECT COUNT(*) FROM hits WHERE severity='MEDIUM' AND seen_before=0")
+        low = count("SELECT COUNT(*) FROM hits WHERE severity='LOW' AND seen_before=0")
+        sources = count("SELECT COUNT(DISTINCT source) FROM hits")
+        top_row = conn.execute("""
+            SELECT source, COUNT(*) as cnt FROM hits
+            GROUP BY source ORDER BY cnt DESC LIMIT 1
+        """).fetchone()
+
+    summary = {
+        "total":      total,
+        "unique":     unique,
+        "duplicates": total - unique,
+        "critical":   critical,
+        "high":       high,
+        "medium":     medium,
+        "low":        low,
+        "sources":    sources,
+        # No row comes back when the table is empty.
+        "top_source": dict(top_row) if top_row else None,
+    }
+    return summary
--- a/utils/scorer.md
+++ b/utils/scorer.md
@@ -0,0 +1,87 @@
+# utils/scorer.py
+
+Severity scoring for credential hits. No Telegram deps. Pure logic.
+
+## Public API
+
+```python
+from utils.scorer import score_hit, score_hits, summarize, ScoredHit
+from utils.scorer import CRITICAL, HIGH, MEDIUM, LOW, SEVERITY_EMOJI, SEVERITY_SCORES
+```
+
+### `score_hit(line: str) -> ScoredHit`
+Score a single raw credential line. Parses ULP format (`url:user:pass`), runs all checks, returns a `ScoredHit`.
+
+### `score_hits(lines: list[str]) -> list[ScoredHit]`
+Score a list of lines. Returns sorted descending by score.
+
+### `summarize(scored: list[ScoredHit]) -> dict`
+Returns `{CRITICAL: n, HIGH: n, MEDIUM: n, LOW: n}`.
+
+---
+
+## ScoredHit dataclass
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `raw` | str | Original credential line |
+| `severity` | str | CRITICAL / HIGH / MEDIUM / LOW |
+| `score` | int | 40 / 30 / 20 / 10 |
+| `reasons` | list[str] | Human-readable match reasons |
+| `url` | str\|None | Parsed URL field |
+| `username` | str\|None | Parsed username/email field |
+| `password` | str\|None | Parsed password field |
+| `.emoji` | property | 🔴🟠🟡🟢 |
+
+---
+
+## Scoring rules (highest match wins)
+
+| Severity | Triggers |
+|----------|----------|
+| CRITICAL | Employee email domain after `@` in username/line · Privileged service URL (admin, vpn, ssh, rdp, gitlab, jira…) |
+| HIGH | Internal service URL (intranet, erp, crm, sso, owa, sharepoint…) |
+| MEDIUM | Client-facing URL (app, patient, booking, helpdesk…) |
+| LOW | Org domain appears anywhere in line (baseline) |
+
+Check 6 (no severity change): flags weak passwords ≤6 chars or common strings.
+
+---
+
+## Employee domain matching
+
+Keywords in `config.TARGET_KEYWORDS` containing `@` become employee patterns.  
+Pattern: `@<domain>(?:[^a-zA-Z0-9.\-]|$)` — requires literal `@` before the domain.  
+**`user@gmail.com` on a URL containing `myorg.cl` does NOT trigger CRITICAL.**
+
+Keywords without `@` go only to `ORG_DOMAINS` (LOW baseline).
+
+---
+
+## ULP line parser (`ULP_PATTERN`)
+
+Separators: `:` `;` `,` `|` `\t` (any of these between the three fields).
+
+The URL field handles two common stealer-log complications:
+
+1. **`://` not treated as separator** — the optional scheme prefix `(?:https?|ftp)://` is consumed before the character-class match, so `https://` never gets split at the colon.
+
+2. **Port + path consumed into the URL** — the optional group `(?::\d+/[^\s:;,|\t]*)` absorbs `:port/path` when the port is pure digits immediately followed by `/`. This correctly handles `http://host:8085/path/:user:pass` but intentionally skips patterns like `:24145487-8` (RUT number — hyphen after digits, no `/`).
+
+**Known limitation:** A bare port with no path (e.g. `https://host:8080:user:pass`) will mis-parse `8080` as the username. This is not observed in practice — stealer logs always include at least a trailing `/`.
+
+---
+
+## Module-level globals (rebuilt on import + via KeywordsScreen)
+
+| Name | Type | Description |
+|------|------|-------------|
+| `EMPLOYEE_DOMAINS` | `list[tuple[str, Pattern]]` | `(domain_str, anchored_pattern)` for `@`-keywords |
+| `ORG_DOMAINS` | `list[Pattern]` | Plain domain patterns for all keywords |
+
+To rebuild after editing `config.TARGET_KEYWORDS` at runtime:
+```python
+import utils.scorer as scorer
+scorer.EMPLOYEE_DOMAINS = scorer._build_employee_domains()
+scorer.ORG_DOMAINS      = scorer._build_org_domains()
+```
--- a/utils/scorer.py
+++ b/utils/scorer.py
@@ -0,0 +1,273 @@
+"""
+scorer.py — Severity scoring for credential hits.
+
+Scoring logic (highest match wins):
+
+  CRITICAL  — Employee credentials (internal email domain)
+                e.g. jdoe@yourclinic.cl:password
+              — Admin/privileged service URLs
+                e.g. admin., vpn., ssh., rdp., gitlab., jira.
+
+  HIGH      — Internal-facing services
+                e.g. intranet., erp., crm., portal., citrix.
+              — Password manager or SSO hits
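+              — Any credential where username looks like an employee email
+                (NOTE: score_hit() has no HIGH check for this; employee-domain
+                emails are scored CRITICAL by Check 1)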
+
+  MEDIUM    — Client-facing portals
+                e.g. app., patient., client., booking.
+              — Domain match on a non-privileged service
+
+  LOW       — Generic domain keyword match
+              — No URL parsed, just a raw domain mention
+
+Each scored hit is returned as a ScoredHit dataclass with:
+  - severity:    CRITICAL / HIGH / MEDIUM / LOW
+  - score:       int (higher = worse)
+  - reasons:     list of human-readable reasons
+  - raw:         original line
+"""
+
+import re
+import logging
+from dataclasses import dataclass, field
+from config import TARGET_KEYWORDS
+
+log = logging.getLogger(__name__)
+
+
+# ─── Severity levels ─────────────────────────────────────────────────────────
+
+# Severity labels, stored as plain strings in ScoredHit.severity.
+CRITICAL = "CRITICAL"
+HIGH     = "HIGH"
+MEDIUM   = "MEDIUM"
+LOW      = "LOW"
+
+# Numeric score for each severity label (higher = worse).
+SEVERITY_SCORES = {
+    CRITICAL: 40,
+    HIGH:     30,
+    MEDIUM:   20,
+    LOW:      10,
+}
+
+# Emoji for each severity label; ScoredHit.emoji falls back to "⚪" for a
+# label that is not listed here.
+SEVERITY_EMOJI = {
+    CRITICAL: "🔴",
+    HIGH:     "🟠",
+    MEDIUM:   "🟡",
+    LOW:      "🟢",
+}
+
+
+# ─── Pattern banks ───────────────────────────────────────────────────────────
+
+# Subdomains/services that indicate privileged access
+# All three banks share the same shape: a service name is matched only at the
+# start of the string, right after "http://" / "https://", or right after a
+# dot. Nothing anchors the END of the name, so a prefix is enough — e.g. "app"
+# in MEDIUM_SERVICES also matches a host that starts with "apple".
+# NOTE(review): "ftp" at the start of the string also matches the scheme of
+# any "ftp://..." URL — confirm that is intended.
+CRITICAL_SERVICES = re.compile(
+    r"(?:^|https?://|\.)"
+    r"(admin|vpn|ssh|rdp|ftp|sftp|gitlab|github|bitbucket|jenkins|"
+    r"jira|confluence|grafana|kibana|sentry|vault|bastion|jump|"
+    r"firewall|router|switch|proxy|ldap|ad\.|activedirectory|"
+    r"exchange|mail\.)",
+    re.IGNORECASE
+)
+
+# Internal-facing services; a match contributes HIGH in score_hit().
+HIGH_SERVICES = re.compile(
+    r"(?:^|https?://|\.)"
+    r"(intranet|erp|crm|portal|citrix|workspace|webmail|owa|"
+    r"sharepoint|teams|slack|zoom|meet|sso|login|auth|oauth|"
+    r"accounts?|dashboard|internal|corp|staff|hr|payroll|"
+    r"finance|accounting)",
+    re.IGNORECASE
+)
+
+# Client-facing services; a match contributes MEDIUM in score_hit().
+MEDIUM_SERVICES = re.compile(
+    r"(?:^|https?://|\.)"
+    r"(app|patient|client|customer|booking|appointment|"
+    r"reserva|cita|paciente|user|member|registro|signup|"
+    r"support|helpdesk|ticket)",
+    re.IGNORECASE
+)
+
+# Matches any user@domain.tld address (not only corporate ones) and captures
+# the domain part in group 1.
+# NOTE(review): nothing else in this module references EMAIL_PATTERN — check
+# whether another module imports it before removing it.
+EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9._%+\-]+@([a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})")
+
+# ULP line parser
+# Separator set: colon, semicolon, comma, pipe, tab.
+# URL field: optional scheme (http/https/ftp) consumed first so '://' is never
+# mistaken for a separator; then an optional port group ':\d+/' absorbs port+path
+# (port is digits immediately followed by '/') so 'http://host:88/path:user:pass'
+# yields url='http://host:88/path', not url='http'.
+# Named groups: url, username, password. The password group is '.+', so it
+# keeps everything up to the end of the line, including further separators.
+# A bare port without a following '/' (host:8080:user:pass) is NOT absorbed
+# into the URL; '8080' is then captured as the username.
+ULP_PATTERN = re.compile(
+    r"^(?P<url>"
+        r"(?:(?:https?|ftp)://)?[^\s:;,|\t]+"  # optional scheme + host/path
+        r"(?::\d+/[^\s:;,|\t]*)?"              # optional :port/path (port = digits then /)
+    r")"
+    r"(?:[:;,|\t])"
+    r"(?P<username>[^\s:;,|\t]+)"
+    r"(?:[:;,|\t])"
+    r"(?P<password>.+)$"
+)
+
+
+# ─── Derived from config ──────────────────────────────────────────────────────
+
+def _kw_to_domain(kw: str) -> str:
+    """
+    Reduce a keyword to a plain domain string.
+
+    Removes every '@', turns the escaped dot '\\.' into '.', strips '^' and
+    '$' from both ends, and finally drops any leading dots.
+    """
+    without_at = kw.replace(r"@", "")
+    unescaped = without_at.replace(r"\.", ".")
+    unanchored = unescaped.strip("^$")
+    return unanchored.lstrip(".")
+
+
+def _build_employee_domains() -> list[tuple[str, re.Pattern]]:
+    """
+    Build the employee-email patterns from TARGET_KEYWORDS.
+
+    Only keywords containing '@' are used. Each resulting pattern requires
+    a literal '@' directly before the domain, followed by either the end of
+    the text or a character that cannot continue a domain name, so a URL
+    that merely contains the org domain does not turn an unrelated address
+    such as @gmail.com into a CRITICAL hit.
+
+    Returns a list of (domain_str, compiled_pattern) tuples.
+    """
+    employee_patterns = []
+    for keyword in TARGET_KEYWORDS:
+        if "@" not in keyword:
+            continue
+        domain = _kw_to_domain(keyword)
+        if not domain:
+            # Nothing left after stripping the regex syntax.
+            continue
+        anchored = re.compile(
+            r"@" + re.escape(domain) + r"(?:[^a-zA-Z0-9.\-]|$)",
+            re.IGNORECASE,
+        )
+        employee_patterns.append((domain, anchored))
+    return employee_patterns
+
+EMPLOYEE_DOMAINS = _build_employee_domains()
+
+
+def _build_org_domains() -> list[re.Pattern]:
+    """
+    Compile every keyword in TARGET_KEYWORDS into a case-insensitive literal
+    domain pattern, used for the LOW baseline match anywhere in a line.
+    Keywords that reduce to an empty domain are skipped.
+    """
+    domains = (_kw_to_domain(keyword) for keyword in TARGET_KEYWORDS)
+    return [
+        re.compile(re.escape(domain), re.IGNORECASE)
+        for domain in domains
+        if domain
+    ]
+
+ORG_DOMAINS = _build_org_domains()
+
+
+
+# ─── Scoring logic ────────────────────────────────────────────────────────────
+
+@dataclass
+class ScoredHit:
+    """One credential line together with its scoring result."""
+
+    # The credential line this hit was built from.
+    raw: str
+    # Severity label: CRITICAL / HIGH / MEDIUM / LOW.
+    severity: str
+    # Numeric score for the severity.
+    score: int
+    # Human-readable reasons; each instance gets its own list.
+    reasons: list[str] = field(default_factory=list)
+    # Parsed credential fields; None unless provided.
+    url: str | None = None
+    username: str | None = None
+    password: str | None = None
+
+    @property
+    def emoji(self) -> str:
+        """Emoji for this hit's severity, or "⚪" when the severity has none."""
+        try:
+            return SEVERITY_EMOJI[self.severity]
+        except KeyError:
+            return "⚪"
+
+    def __str__(self) -> str:
+        """Render the hit as "<emoji> [<severity>] <raw line>"."""
+        return f"{self.emoji} [{self.severity}] {self.raw}"
+
+
+def score_hit(line: str) -> ScoredHit:
+    """
+    Score a single credential line.
+
+    The line is stripped, parsed with ULP_PATTERN when possible, and run
+    through six checks. The highest severity any check produced wins; a
+    line that triggers none of them is reported as LOW.
+
+    Returns a ScoredHit carrying the stripped line, the final severity and
+    its numeric score, the collected reasons, and the parsed url / username /
+    password (each None when the line did not parse).
+    """
+    line    = line.strip()
+    reasons = []
+    scores  = []
+
+    # Parse ULP fields if possible
+    url = username = password = None
+    m = ULP_PATTERN.match(line)
+    if m:
+        url      = m.group("url")
+        username = m.group("username")
+        password = m.group("password")
+
+    # ── Check 1: Employee email domain in username or line ───────────────
+    # EMPLOYEE_DOMAINS entries are (domain_str, pattern) where the pattern
+    # requires '@' immediately before the domain, so a URL containing the
+    # org domain never triggers a CRITICAL on an unrelated email (@gmail etc).
+    # The parsed username is tried first, then the full line.
+    for domain_str, pat in EMPLOYEE_DOMAINS:
+        if pat.search(username or "") or pat.search(line):
+            scores.append(CRITICAL)
+            reasons.append(f"Employee email domain: {domain_str}")
+            break
+
+    # ── Check 2: Is the URL a privileged/critical service? ────────────────
+    if url and CRITICAL_SERVICES.search(url):
+        scores.append(CRITICAL)
+        reasons.append(f"Critical service URL: {url}")
+
+    # ── Check 3: Is the URL a high-value internal service? ────────────────
+    if url and HIGH_SERVICES.search(url):
+        scores.append(HIGH)
+        reasons.append(f"High-value internal service: {url}")
+
+    # ── Check 4: Is the URL a client-facing service? ──────────────────────
+    if url and MEDIUM_SERVICES.search(url):
+        scores.append(MEDIUM)
+        reasons.append(f"Client-facing service: {url}")
+
+    # ── Check 5: Generic org domain match (baseline) ─────────────────────
+    # Only recorded when no earlier check scored the line.
+    if not scores and any(pattern.search(line) for pattern in ORG_DOMAINS):
+        scores.append(LOW)
+        reasons.append("Org domain match in line")
+
+    # ── Check 6: Weak/common password flag (adds a reason, not a score) ──
+    if password:
+        if len(password) <= 6:
+            reasons.append(f"⚠ Weak password ({len(password)} chars)")
+        if password.lower() in {"123456", "password", "qwerty", "111111", "admin", "letmein"}:
+            reasons.append(f"⚠ Common password: {password}")
+
+    # ── Resolve final severity ────────────────────────────────────────────
+    # First label, from most to least severe, that any check produced;
+    # LOW when nothing scored.
+    final_severity = next(
+        (s for s in (CRITICAL, HIGH, MEDIUM, LOW) if s in scores),
+        LOW,
+    )
+
+    if not reasons:
+        reasons.append("Pattern match")
+
+    return ScoredHit(
+        raw      = line,
+        severity = final_severity,
+        score    = SEVERITY_SCORES[final_severity],
+        reasons  = reasons,
+        url      = url,
+        username = username,
+        password = password,
+    )
+
+
+def score_hits(lines: list[str]) -> list[ScoredHit]:
+    """
+    Score every line with score_hit() and return the results ordered from
+    highest to lowest score.
+    """
+    return sorted(
+        (score_hit(line) for line in lines),
+        key=lambda hit: hit.score,
+        reverse=True,
+    )
+
+
+def summarize(scored: list[ScoredHit]) -> dict:
+    """Tally the scored hits per severity level (CRITICAL, HIGH, MEDIUM, LOW)."""
+    counts = dict.fromkeys((CRITICAL, HIGH, MEDIUM, LOW), 0)
+    for hit in scored:
+        counts[hit.severity] = counts[hit.severity] + 1
+    return counts
				`@@ -0,0 +1 @@`
				`"""utils — pure logic modules with no Telegram dependencies."""`