""" database.py — SQLite storage for credential hits. Schema: hits table: - id auto-increment primary key - url the target URL from the credential line - username extracted username/email - password extracted password - raw the full original line - source channel/bot it came from - filename the file it was found in - timestamp UTC time of discovery - severity CRITICAL / HIGH / MEDIUM / LOW - score numeric score (higher = worse) - reasons pipe-separated list of scoring reasons - seen_before whether this was a duplicate (for stats) """ import sqlite3 import logging from datetime import datetime, timezone from pathlib import Path from contextlib import contextmanager log = logging.getLogger(__name__) DB_FILE = Path("./data/hits.db") # ─── Setup ──────────────────────────────────────────────────────────────────── @contextmanager def _connect(): conn = sqlite3.connect(DB_FILE) conn.row_factory = sqlite3.Row try: yield conn conn.commit() except Exception: conn.rollback() raise finally: conn.close() def init_db() -> None: """Create tables if they don't exist yet.""" with _connect() as conn: conn.execute(""" CREATE TABLE IF NOT EXISTS hits ( id INTEGER PRIMARY KEY AUTOINCREMENT, url TEXT, username TEXT, password TEXT, raw TEXT NOT NULL, source TEXT, filename TEXT, timestamp TEXT NOT NULL, severity TEXT NOT NULL DEFAULT 'LOW', score INTEGER NOT NULL DEFAULT 10, reasons TEXT, seen_before INTEGER NOT NULL DEFAULT 0 ) """) conn.execute("CREATE INDEX IF NOT EXISTS idx_url ON hits(url)") conn.execute("CREATE INDEX IF NOT EXISTS idx_username ON hits(username)") conn.execute("CREATE INDEX IF NOT EXISTS idx_source ON hits(source)") conn.execute("CREATE INDEX IF NOT EXISTS idx_timestamp ON hits(timestamp)") conn.execute("CREATE INDEX IF NOT EXISTS idx_severity ON hits(severity)") log.info(f"Database ready: {DB_FILE}") # ─── Writing ───────────────────────────────────────────────────────────────── def insert_hits( scored_hits: list, source: str, filename: str, seen_before: bool = False, ) -> int: """ Insert a list of ScoredHit objects into the database. Returns the number of rows inserted. """ timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC") rows = [] for h in scored_hits: rows.append(( h.url, h.username, h.password, h.raw, source, filename, timestamp, h.severity, h.score, " | ".join(h.reasons), 1 if seen_before else 0, )) with _connect() as conn: conn.executemany(""" INSERT INTO hits (url, username, password, raw, source, filename, timestamp, severity, score, reasons, seen_before) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, rows) log.info(f" DB: inserted {len(rows)} row(s) from {filename}") return len(rows) # ─── Querying ───────────────────────────────────────────────────────────────── def search(keyword: str) -> list[sqlite3.Row]: """Search hits by keyword across url, username, raw fields.""" with _connect() as conn: return conn.execute(""" SELECT * FROM hits WHERE url LIKE ? OR username LIKE ? OR raw LIKE ? ORDER BY score DESC, timestamp DESC """, (f"%{keyword}%",) * 3).fetchall() def recent(limit: int = 50) -> list[sqlite3.Row]: """Return the most recent hits.""" with _connect() as conn: return conn.execute(""" SELECT * FROM hits ORDER BY timestamp DESC LIMIT ? """, (limit,)).fetchall() def by_severity(severity: str) -> list[sqlite3.Row]: """Return all hits of a given severity level.""" with _connect() as conn: return conn.execute(""" SELECT * FROM hits WHERE severity = ? AND seen_before = 0 ORDER BY timestamp DESC """, (severity,)).fetchall() def recent_for_domains(patterns: list[str], limit: int = 100) -> list[sqlite3.Row]: """Return recent hits whose `raw` field matches any of the given regex-like patterns.""" if not patterns: return [] conditions = " OR ".join("raw LIKE ?" for _ in patterns) args = [f"%{p.replace(r'\.','.').replace('@','').replace('^','').replace('$','')}%" for p in patterns] args.append(limit) with _connect() as conn: return conn.execute( f"SELECT * FROM hits WHERE ({conditions}) ORDER BY timestamp DESC LIMIT ?", args, ).fetchall() def count_by_severity_for_domains(patterns: list[str]) -> dict: """Severity counts filtered to hits matching any of the given patterns.""" if not patterns: return {"CRITICAL": 0, "HIGH": 0, "MEDIUM": 0, "LOW": 0} conditions = " OR ".join("raw LIKE ?" for _ in patterns) args = [f"%{p.replace(r'\.','.').replace('@','').replace('^','').replace('$','')}%" for p in patterns] with _connect() as conn: rows = conn.execute( f"SELECT severity, COUNT(*) FROM hits WHERE ({conditions}) GROUP BY severity", args, ).fetchall() counts = {"CRITICAL": 0, "HIGH": 0, "MEDIUM": 0, "LOW": 0} for row in rows: if row[0] in counts: counts[row[0]] = row[1] return counts def count_by_severity() -> dict: """Overall severity counts (unique hits only).""" with _connect() as conn: rows = conn.execute( "SELECT severity, COUNT(*) FROM hits WHERE seen_before=0 GROUP BY severity" ).fetchall() counts = {"CRITICAL": 0, "HIGH": 0, "MEDIUM": 0, "LOW": 0} for row in rows: if row[0] in counts: counts[row[0]] = row[1] return counts def stats() -> dict: """Return summary statistics.""" with _connect() as conn: total = conn.execute("SELECT COUNT(*) FROM hits").fetchone()[0] unique = conn.execute("SELECT COUNT(*) FROM hits WHERE seen_before=0").fetchone()[0] critical = conn.execute("SELECT COUNT(*) FROM hits WHERE severity='CRITICAL' AND seen_before=0").fetchone()[0] high = conn.execute("SELECT COUNT(*) FROM hits WHERE severity='HIGH' AND seen_before=0").fetchone()[0] medium = conn.execute("SELECT COUNT(*) FROM hits WHERE severity='MEDIUM' AND seen_before=0").fetchone()[0] low = conn.execute("SELECT COUNT(*) FROM hits WHERE severity='LOW' AND seen_before=0").fetchone()[0] sources = conn.execute("SELECT COUNT(DISTINCT source) FROM hits").fetchone()[0] top_source = conn.execute(""" SELECT source, COUNT(*) as cnt FROM hits GROUP BY source ORDER BY cnt DESC LIMIT 1 """).fetchone() return { "total": total, "unique": unique, "duplicates": total - unique, "critical": critical, "high": high, "medium": medium, "low": low, "sources": sources, "top_source": dict(top_source) if top_source else None, }