# stealergram/utils/database.py

"""
database.py - SQLite storage for credential hits.

Schema:
  hits table:
    - id          auto-increment primary key
    - url         the target URL from the credential line
    - username    extracted username/email
    - password    extracted password
    - raw         the full original line
    - source      channel/bot it came from
    - filename    the file it was found in
    - timestamp   UTC time of discovery
    - severity    CRITICAL / HIGH / MEDIUM / LOW
    - score       numeric score (higher = worse)
    - reasons     pipe-separated list of scoring reasons
    - seen_before whether this was a duplicate (for stats)
"""

import sqlite3
import logging
from datetime import datetime, timezone
from pathlib import Path
from contextlib import contextmanager

# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)

# Location of the SQLite database file. The path is relative, so it is
# resolved against the process's current working directory.
DB_FILE = Path("./data/hits.db")


# ─── Setup ────────────────────────────────────────────────────────────────────

@contextmanager
def _connect():
    """Yield a SQLite connection to ``DB_FILE`` wrapped in one transaction.

    Rows come back as ``sqlite3.Row`` objects. When the ``with`` body finishes
    normally the transaction is committed; if the body (or the commit itself)
    raises an ``Exception`` the transaction is rolled back and the error is
    re-raised. The connection is closed in every case.
    """
    db = sqlite3.connect(DB_FILE)
    db.row_factory = sqlite3.Row  # rows can be indexed by column name
    try:
        yield db
        db.commit()
    except Exception:
        # Undo partial writes, then let the caller see the original error.
        db.rollback()
        raise
    finally:
        db.close()


def init_db() -> None:
    """Create the database file, the ``hits`` table and its indexes if missing.

    Safe to call repeatedly: every statement uses ``IF NOT EXISTS`` and the
    directory creation tolerates an existing directory.
    """
    # SQLite creates a missing database file but not a missing directory;
    # connecting would otherwise fail with "unable to open database file".
    DB_FILE.parent.mkdir(parents=True, exist_ok=True)

    with _connect() as conn:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS hits (
                id          INTEGER PRIMARY KEY AUTOINCREMENT,
                url         TEXT,
                username    TEXT,
                password    TEXT,
                raw         TEXT NOT NULL,
                source      TEXT,
                filename    TEXT,
                timestamp   TEXT NOT NULL,
                severity    TEXT NOT NULL DEFAULT 'LOW',
                score       INTEGER NOT NULL DEFAULT 10,
                reasons     TEXT,
                seen_before INTEGER NOT NULL DEFAULT 0
            )
        """)
        # One single-column index per field the query helpers filter or sort on.
        for column in ("url", "username", "source", "timestamp", "severity"):
            conn.execute(
                f"CREATE INDEX IF NOT EXISTS idx_{column} ON hits({column})"
            )
    log.info("Database ready: %s", DB_FILE)


# ─── Writing ─────────────────────────────────────────────────────────────────

def insert_hits(
    scored_hits: list,
    source: str,
    filename: str,
    seen_before: bool = False,
) -> int:
    """
    Store a batch of scored hits in the ``hits`` table.

    Each element of ``scored_hits`` must expose ``url``, ``username``,
    ``password``, ``raw``, ``severity``, ``score`` and an iterable of strings
    ``reasons``. All rows of one call share the same UTC timestamp, ``source``,
    ``filename`` and ``seen_before`` flag.

    Returns the number of rows inserted.
    """
    # Stamp the whole batch once so every row of a file carries the same time.
    batch_time = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
    duplicate_flag = 1 if seen_before else 0  # stored as INTEGER 0/1

    rows = [
        (
            hit.url,
            hit.username,
            hit.password,
            hit.raw,
            source,
            filename,
            batch_time,
            hit.severity,
            hit.score,
            " | ".join(hit.reasons),
            duplicate_flag,
        )
        for hit in scored_hits
    ]

    with _connect() as conn:
        conn.executemany("""
            INSERT INTO hits
              (url, username, password, raw, source, filename, timestamp,
               severity, score, reasons, seen_before)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, rows)

    inserted = len(rows)
    log.info(f"  DB: inserted {inserted} row(s) from {filename}")
    return inserted


# ─── Querying ─────────────────────────────────────────────────────────────────

def search(keyword: str) -> list[sqlite3.Row]:
    """Search hits for a literal keyword in the url, username or raw fields.

    The keyword is matched as a substring. ``%`` and ``_`` in the keyword are
    escaped so they match themselves instead of acting as LIKE wildcards
    (otherwise ``john_doe`` would also match ``johnXdoe``).

    Results are ordered by score, highest first, then by timestamp, newest
    first. An empty keyword matches every row whose url, username or raw
    field is not NULL.
    """
    # "!" is the escape character declared in the ESCAPE clauses below, so it
    # must be doubled first, before it is used to escape the wildcards.
    escaped = keyword.replace("!", "!!").replace("%", "!%").replace("_", "!_")
    needle = f"%{escaped}%"
    with _connect() as conn:
        return conn.execute("""
            SELECT * FROM hits
            WHERE url LIKE ? ESCAPE '!'
               OR username LIKE ? ESCAPE '!'
               OR raw LIKE ? ESCAPE '!'
            ORDER BY score DESC, timestamp DESC
        """, (needle,) * 3).fetchall()


def recent(limit: int = 50) -> list[sqlite3.Row]:
    """Fetch up to ``limit`` hits, ordered by timestamp with the newest first."""
    params = (limit,)
    with _connect() as conn:
        cursor = conn.execute("""
            SELECT * FROM hits
            ORDER BY timestamp DESC
            LIMIT ?
        """, params)
        return cursor.fetchall()


def by_severity(severity: str) -> list[sqlite3.Row]:
    """Fetch hits with the given severity that are not flagged as seen before.

    Only rows with ``seen_before = 0`` are returned, newest first. The
    severity is compared with ``=``, so it must match the stored value exactly.
    """
    params = (severity,)
    with _connect() as conn:
        cursor = conn.execute("""
            SELECT * FROM hits
            WHERE severity = ? AND seen_before = 0
            ORDER BY timestamp DESC
        """, params)
        return cursor.fetchall()


def _domain_like_args(patterns: list[str]) -> list[str]:
    """Convert regex-like domain patterns into ``%substring%`` LIKE arguments.

    For each pattern the escaped dot ``\\.`` becomes a plain ``.`` and the
    characters ``@``, ``^`` and ``$`` are dropped; the result is wrapped in
    ``%`` so it matches anywhere in the column. Other regex syntax is left
    untouched, and ``%`` / ``_`` in a pattern still act as LIKE wildcards.
    """
    # Built outside an f-string on purpose: a backslash inside an f-string
    # replacement field is a SyntaxError before Python 3.12.
    drop_anchors = str.maketrans("", "", "@^$")
    return [
        "%" + pattern.replace("\\.", ".").translate(drop_anchors) + "%"
        for pattern in patterns
    ]


def recent_for_domains(patterns: list[str], limit: int = 100) -> list[sqlite3.Row]:
    """Return recent hits whose `raw` field matches any of the given regex-like patterns.

    Patterns are reduced to substrings (see ``_domain_like_args``) and matched
    with LIKE against ``raw``. At most ``limit`` rows are returned, newest
    first. An empty ``patterns`` list returns ``[]`` without querying.
    """
    if not patterns:
        return []
    conditions = " OR ".join("raw LIKE ?" for _ in patterns)
    # One bound value per LIKE placeholder, followed by the LIMIT value.
    args = [*_domain_like_args(patterns), limit]
    with _connect() as conn:
        return conn.execute(
            f"SELECT * FROM hits WHERE ({conditions}) ORDER BY timestamp DESC LIMIT ?",
            args,
        ).fetchall()


def count_by_severity_for_domains(patterns: list[str]) -> dict:
    """Severity counts filtered to hits matching any of the given patterns.

    Returns a dict with the keys ``CRITICAL``, ``HIGH``, ``MEDIUM`` and
    ``LOW``; levels with no matching rows stay at 0 and any other stored
    severity value is ignored. Unlike ``count_by_severity`` this query does
    not filter on ``seen_before``. An empty ``patterns`` list returns all
    zeros without querying.
    """
    counts = dict.fromkeys(("CRITICAL", "HIGH", "MEDIUM", "LOW"), 0)
    if not patterns:
        return counts
    conditions = " OR ".join("raw LIKE ?" for _ in patterns)
    args = _domain_like_args(patterns)
    with _connect() as conn:
        rows = conn.execute(
            f"SELECT severity, COUNT(*) FROM hits WHERE ({conditions}) GROUP BY severity",
            args,
        ).fetchall()
    for severity, total in rows:
        if severity in counts:
            counts[severity] = total
    return counts


def count_by_severity() -> dict:
    """Count hits per severity level, ignoring rows flagged as seen before.

    The result always contains the keys ``CRITICAL``, ``HIGH``, ``MEDIUM``
    and ``LOW``; levels without rows stay at 0 and any other stored severity
    value is left out.
    """
    with _connect() as conn:
        grouped = conn.execute(
            "SELECT severity, COUNT(*) FROM hits WHERE seen_before=0 GROUP BY severity"
        ).fetchall()
    tally = dict.fromkeys(("CRITICAL", "HIGH", "MEDIUM", "LOW"), 0)
    for level, amount in grouped:
        if level in tally:
            tally[level] = amount
    return tally


def stats() -> dict:
    """Build a summary of the ``hits`` table.

    Keys: ``total`` (all rows), ``unique`` (rows with ``seen_before=0``),
    ``duplicates`` (``total - unique``), per-severity counts ``critical`` /
    ``high`` / ``medium`` / ``low`` (seen_before=0 rows only), ``sources``
    (distinct source count) and ``top_source`` (a ``{"source", "cnt"}`` dict
    for the source with the most rows, or ``None`` when the table is empty).
    """
    with _connect() as conn:
        def scalar(sql: str):
            # Run a single-value query and unwrap its only column.
            return conn.execute(sql).fetchone()[0]

        total = scalar("SELECT COUNT(*) FROM hits")
        unique = scalar("SELECT COUNT(*) FROM hits WHERE seen_before=0")
        per_level = {}
        for level in ("CRITICAL", "HIGH", "MEDIUM", "LOW"):
            per_level[level.lower()] = scalar(
                f"SELECT COUNT(*) FROM hits WHERE severity='{level}' AND seen_before=0"
            )
        sources = scalar("SELECT COUNT(DISTINCT source) FROM hits")
        busiest = conn.execute("""
            SELECT source, COUNT(*) as cnt FROM hits
            GROUP BY source ORDER BY cnt DESC LIMIT 1
        """).fetchone()

    summary = {
        "total": total,
        "unique": unique,
        "duplicates": total - unique,
    }
    summary.update(per_level)
    summary["sources"] = sources
    summary["top_source"] = dict(busiest) if busiest else None
    return summary