merge testing->tomerge/main #7

Open
anti wants to merge 242 commits from testing into tomerge/main
6 changed files with 593 additions and 0 deletions
Showing only changes of commit ddfb232590 - Show all commits

View File

@@ -0,0 +1,5 @@
"""DECNET profiler — standalone attacker profile builder worker."""
from decnet.profiler.worker import attacker_profile_worker
__all__ = ["attacker_profile_worker"]

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,375 @@
"""
Behavioral and timing analysis for DECNET attacker profiles.
Consumes the chronological `LogEvent` stream already built by
`decnet.correlation.engine.CorrelationEngine` and derives per-IP metrics:
- Inter-event timing statistics (mean / median / stdev / min / max)
- Coefficient-of-variation (jitter metric)
- Beaconing vs. interactive vs. scanning classification
- Tool attribution against known C2 frameworks (Cobalt Strike, Sliver,
Havoc, Mythic) using default beacon/jitter profiles
- Recon → exfil phase sequencing (latency between the last recon event
and the first exfil-like event)
- OS / TCP fingerprint + retransmit rollup from sniffer-emitted events
Pure-Python; no external dependencies. All functions are safe to call from
both sync and async contexts.
"""
from __future__ import annotations
import json
import statistics
from collections import Counter
from typing import Any
from decnet.correlation.parser import LogEvent
# ─── Event-type taxonomy ────────────────────────────────────────────────────
# Sniffer-emitted packet events that feed into fingerprint rollup
# (consumed by sniffer_rollup below).
_SNIFFER_SYN_EVENT: str = "tcp_syn_fingerprint"
_SNIFFER_FLOW_EVENT: str = "tcp_flow_timing"
# Events that signal "recon" phase (scans, probes, auth attempts).
_RECON_EVENT_TYPES: frozenset[str] = frozenset({
    "scan", "connection", "banner", "probe",
    "login_attempt", "auth", "auth_failure",
})
# Events that signal "exfil" / action-on-objective phase.
# NOTE: kept disjoint from _RECON_EVENT_TYPES so phase classification is
# unambiguous per event.
_EXFIL_EVENT_TYPES: frozenset[str] = frozenset({
    "download", "upload", "file_transfer", "data_exfil",
    "command", "exec", "query", "shell_input",
})
# Fields carrying payload byte counts (for "large payload" detection),
# checked in this order by phase_sequence.
_PAYLOAD_SIZE_FIELDS: tuple[str, ...] = ("bytes", "size", "content_length")
# ─── C2 tool attribution signatures ─────────────────────────────────────────
#
# Each entry lists the default beacon cadence profile of a popular C2.
# A profile *matches* an attacker when:
#   - mean inter-event time is within ±`interval_tolerance` seconds, AND
#   - jitter (cv = stdev / mean) is within ±`jitter_tolerance`
#
# These defaults are documented in each framework's public user guides;
# real operators often tune them, so attribution is advisory, not definitive.
_TOOL_SIGNATURES: tuple[dict[str, Any], ...] = (
    {
        "name": "cobalt_strike",
        "interval_s": 60.0,
        "interval_tolerance_s": 8.0,
        "jitter_cv": 0.20,
        "jitter_tolerance": 0.05,
    },
    {
        "name": "sliver",
        "interval_s": 60.0,
        "interval_tolerance_s": 10.0,
        "jitter_cv": 0.30,
        "jitter_tolerance": 0.08,
    },
    {
        "name": "havoc",
        "interval_s": 45.0,
        "interval_tolerance_s": 8.0,
        "jitter_cv": 0.10,
        "jitter_tolerance": 0.03,
    },
    {
        "name": "mythic",
        "interval_s": 30.0,
        "interval_tolerance_s": 6.0,
        "jitter_cv": 0.15,
        "jitter_tolerance": 0.03,
    },
)
# ─── Timing stats ───────────────────────────────────────────────────────────
def _null_timing_stats(event_count: int, duration_s: float) -> dict[str, Any]:
    """Stats dict for degenerate inputs: all interval-derived fields are None."""
    return {
        "event_count": event_count,
        "duration_s": round(duration_s, 3),
        "mean_iat_s": None,
        "median_iat_s": None,
        "stdev_iat_s": None,
        "min_iat_s": None,
        "max_iat_s": None,
        "cv": None,
    }
def timing_stats(events: list[LogEvent]) -> dict[str, Any]:
    """
    Compute inter-arrival-time (IAT) statistics across *events*.

    Events are sorted by timestamp internally, so callers may pass them in
    any order. Returns a dict with:
        mean_iat_s, median_iat_s, stdev_iat_s, min_iat_s, max_iat_s, cv,
        event_count, duration_s
    With fewer than two events — or no usable (non-negative) intervals —
    the interval-based fields are None and duration_s reflects the span
    between the first and last event seen (0.0 for an empty list).
    """
    if not events:
        return _null_timing_stats(0, 0.0)
    sorted_events = sorted(events, key=lambda e: e.timestamp)
    duration_s = (sorted_events[-1].timestamp - sorted_events[0].timestamp).total_seconds()
    if len(sorted_events) < 2:
        return _null_timing_stats(len(sorted_events), duration_s)
    iats = [
        (sorted_events[i].timestamp - sorted_events[i - 1].timestamp).total_seconds()
        for i in range(1, len(sorted_events))
    ]
    # Exclude spuriously-negative (clock-skew) intervals.
    iats = [v for v in iats if v >= 0]
    if not iats:
        return _null_timing_stats(len(sorted_events), duration_s)
    mean = statistics.fmean(iats)
    median = statistics.median(iats)
    # Population stdev: the observed intervals are the whole population here.
    stdev = statistics.pstdev(iats) if len(iats) > 1 else 0.0
    # Coefficient of variation — the jitter metric used by classify_behavior.
    cv = (stdev / mean) if mean > 0 else None
    return {
        "event_count": len(sorted_events),
        "duration_s": round(duration_s, 3),
        "mean_iat_s": round(mean, 3),
        "median_iat_s": round(median, 3),
        "stdev_iat_s": round(stdev, 3),
        "min_iat_s": round(min(iats), 3),
        "max_iat_s": round(max(iats), 3),
        "cv": round(cv, 4) if cv is not None else None,
    }
# ─── Behavior classification ────────────────────────────────────────────────
def classify_behavior(stats: dict[str, Any], services_count: int) -> str:
    """
    Bucket an attacker's cadence into a coarse behavior class.

    Returns one of: beaconing | interactive | scanning | mixed | unknown.
    Heuristics (evaluated in priority order):
      * scanning    — ≥ 3 services touched in short bursts (mean IAT < 3 s), ≥ 5 events
      * beaconing   — regular cadence: CV < 0.35, mean IAT ≥ 5 s, ≥ 5 events
      * interactive — fast but irregular: mean IAT < 3 s, CV ≥ 0.5, ≥ 10 events
      * mixed       — anything else with enough data
      * unknown     — too few data points to judge
    """
    count = stats.get("event_count") or 0
    mean_iat = stats.get("mean_iat_s")
    jitter = stats.get("cv")
    if count < 3 or mean_iat is None:
        return "unknown"
    fast = mean_iat < 3.0
    # Scanning: many services, fast bursts, few events per service.
    if services_count >= 3 and fast and count >= 5:
        return "scanning"
    if jitter is not None:
        # Beaconing: regular cadence over many events.
        if jitter < 0.35 and mean_iat >= 5.0 and count >= 5:
            return "beaconing"
        # Interactive: short, irregular intervals.
        if jitter >= 0.5 and fast and count >= 10:
            return "interactive"
    return "mixed"
# ─── C2 tool attribution ────────────────────────────────────────────────────
def guess_tool(mean_iat_s: float | None, cv: float | None) -> str | None:
    """
    Match a (mean inter-arrival time, jitter CV) pair against the known C2
    default beacon profiles in `_TOOL_SIGNATURES`.

    Returns the tool name only when exactly one signature matches; zero or
    multiple matches return None so we never emit a false attribution.
    """
    if mean_iat_s is None or cv is None:
        return None
    matched = [
        sig["name"]
        for sig in _TOOL_SIGNATURES
        if abs(mean_iat_s - sig["interval_s"]) <= sig["interval_tolerance_s"]
        and abs(cv - sig["jitter_cv"]) <= sig["jitter_tolerance"]
    ]
    return matched[0] if len(matched) == 1 else None
# ─── Phase sequencing ───────────────────────────────────────────────────────
def phase_sequence(events: list[LogEvent]) -> dict[str, Any]:
    """
    Derive recon→exfil phase-transition info for one attacker.

    Returns:
        recon_end_ts       : ISO timestamp of the last recon-class event (or None)
        exfil_start_ts     : ISO timestamp of the first exfil-class event (or None)
        exfil_latency_s    : seconds between them (None unless both are present
                             and the exfil start is not before the recon end)
        large_payload_count: count of events whose *fields* report a payload
                             ≥ 1 MiB (heuristic for bulk data transfer)
    """
    def _has_large_payload(ev: LogEvent) -> bool:
        # True when any recognized size field parses to >= 1 MiB.
        for field_name in _PAYLOAD_SIZE_FIELDS:
            raw = ev.fields.get(field_name)
            if raw is None:
                continue
            try:
                if int(raw) >= 1_048_576:
                    return True
            except (TypeError, ValueError):
                continue
        return False

    last_recon = None
    first_exfil = None
    bulk_count = 0
    for ev in sorted(events, key=lambda x: x.timestamp):
        if ev.event_type in _RECON_EVENT_TYPES:
            last_recon = ev.timestamp
        elif ev.event_type in _EXFIL_EVENT_TYPES and first_exfil is None:
            first_exfil = ev.timestamp
        if _has_large_payload(ev):
            bulk_count += 1
    latency_s: float | None = None
    if last_recon is not None and first_exfil is not None and first_exfil >= last_recon:
        latency_s = round((first_exfil - last_recon).total_seconds(), 3)
    return {
        "recon_end_ts": last_recon.isoformat() if last_recon else None,
        "exfil_start_ts": first_exfil.isoformat() if first_exfil else None,
        "exfil_latency_s": latency_s,
        "large_payload_count": bulk_count,
    }
# ─── Sniffer rollup (OS fingerprint + retransmits) ──────────────────────────
def sniffer_rollup(events: list[LogEvent]) -> dict[str, Any]:
    """
    Roll up sniffer-emitted `tcp_syn_fingerprint` and `tcp_flow_timing`
    events into a per-attacker summary.

    Returns a dict with:
        os_guess         : most frequently observed OS label (or None)
        hop_distance     : median of reported hop distances (or None)
        tcp_fingerprint  : latest SYN fingerprint snapshot ({} if none seen)
        retransmit_count : total retransmits across all flow-timing events
    """
    os_guesses: list[str] = []
    hops: list[int] = []
    tcp_fp: dict[str, Any] | None = None
    retransmits = 0
    for e in events:
        if e.event_type == _SNIFFER_SYN_EVENT:
            og = e.fields.get("os_guess")
            if og:
                os_guesses.append(og)
            # Only record hop distance when the sniffer actually reported one.
            # (Previously a missing field defaulted to "0", so every SYN event
            # without it contributed a bogus hop of 0 and dragged the median
            # toward zero.)
            raw_hops = e.fields.get("hop_distance")
            if raw_hops is not None:
                try:
                    hops.append(int(raw_hops))
                except (TypeError, ValueError):
                    pass
            # Keep the latest fingerprint snapshot.
            tcp_fp = {
                "window": _int_or_none(e.fields.get("window")),
                "wscale": _int_or_none(e.fields.get("wscale")),
                "mss": _int_or_none(e.fields.get("mss")),
                "options_sig": e.fields.get("options_sig", ""),
                "has_sack": e.fields.get("has_sack") == "true",
                "has_timestamps": e.fields.get("has_timestamps") == "true",
            }
        elif e.event_type == _SNIFFER_FLOW_EVENT:
            try:
                retransmits += int(e.fields.get("retransmits", "0"))
            except (TypeError, ValueError):
                pass
    # Mode for the OS bucket — most frequently observed label.
    os_guess: str | None = None
    if os_guesses:
        os_guess = Counter(os_guesses).most_common(1)[0][0]
    # Median hop distance (robust to the occasional weird TTL).
    hop_distance: int | None = None
    if hops:
        hop_distance = int(statistics.median(hops))
    return {
        "os_guess": os_guess,
        "hop_distance": hop_distance,
        "tcp_fingerprint": tcp_fp or {},
        "retransmit_count": retransmits,
    }
def _int_or_none(v: Any) -> int | None:
    """Best-effort int coercion: None for missing, blank, or unparsable values."""
    if v in (None, ""):
        return None
    try:
        return int(v)
    except (TypeError, ValueError):
        return None
# ─── Composite: build the full AttackerBehavior record ──────────────────────
def build_behavior_record(events: list[LogEvent]) -> dict[str, Any]:
    """
    Build the dict to persist in the `attacker_behavior` table.

    JSON-typed columns are encoded here so the repository layer stays
    schema-agnostic; callers just hand us the raw event list.
    """
    # Timing stats run over *all* events (unfiltered): a C2 beacon often
    # reuses the same "connection" event_type on each check-in, so filtering
    # by event type would destroy exactly the signal we want.
    stats = timing_stats(events)
    distinct_services = {e.service for e in events}
    behavior_class = classify_behavior(stats, len(distinct_services))
    tool_name = guess_tool(stats.get("mean_iat_s"), stats.get("cv"))
    phases = phase_sequence(events)
    rollup = sniffer_rollup(events)
    # Beacon-specific projection: interval/jitter are only meaningful when
    # the flow actually classified as beaconing — otherwise they're noise.
    beacon_interval_s: float | None = None
    beacon_jitter_pct: float | None = None
    if behavior_class == "beaconing":
        beacon_interval_s = stats.get("mean_iat_s")
        jitter_cv = stats.get("cv")
        if jitter_cv is not None:
            beacon_jitter_pct = round(jitter_cv * 100, 2)
    return {
        "os_guess": rollup["os_guess"],
        "hop_distance": rollup["hop_distance"],
        "tcp_fingerprint": json.dumps(rollup["tcp_fingerprint"]),
        "retransmit_count": rollup["retransmit_count"],
        "behavior_class": behavior_class,
        "beacon_interval_s": beacon_interval_s,
        "beacon_jitter_pct": beacon_jitter_pct,
        "tool_guess": tool_name,
        "timing_stats": json.dumps(stats),
        "phase_sequence": json.dumps(phases),
    }

213
decnet/profiler/worker.py Normal file
View File

@@ -0,0 +1,213 @@
"""
Attacker profile builder — incremental background worker.
Maintains a persistent CorrelationEngine and a log-ID cursor across cycles.
On cold start (first cycle or process restart), performs one full build from
all stored logs. Subsequent cycles fetch only new logs via the cursor,
ingest them into the existing engine, and rebuild profiles for affected IPs
only.
Complexity per cycle: O(new_logs + affected_ips) instead of O(total_logs²).
"""
from __future__ import annotations
import asyncio
import json
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any
from decnet.correlation.engine import CorrelationEngine
from decnet.correlation.parser import LogEvent
from decnet.logging import get_logger
from decnet.profiler.behavioral import build_behavior_record
from decnet.web.db.repository import BaseRepository
logger = get_logger("attacker_worker")
# Max log rows fetched per repository call while draining the backlog.
_BATCH_SIZE = 500
# Key under which the worker persists its log-id cursor via repo.set_state().
_STATE_KEY = "attacker_worker_cursor"
# Event types that indicate active command/query execution (not just connection/scan)
_COMMAND_EVENT_TYPES = frozenset({
    "command", "exec", "query", "input", "shell_input",
    "execute", "run", "sql_query", "redis_command",
})
# Fields that carry the executed command/query text, checked in this order.
_COMMAND_FIELDS = ("command", "query", "input", "line", "sql", "cmd")
@dataclass
class _WorkerState:
    # Cross-cycle worker state held in process memory.
    engine: CorrelationEngine = field(default_factory=CorrelationEngine)  # long-lived log parser/correlator
    last_log_id: int = 0  # highest log row id already ingested (the cursor)
    initialized: bool = False  # True once the cold-start full rebuild has completed
async def attacker_profile_worker(repo: BaseRepository, *, interval: int = 30) -> None:
    """
    Periodically update the Attacker table, incrementally.

    Designed to run forever as an asyncio Task: each cycle sleeps *interval*
    seconds, then ingests new logs and rebuilds the affected profiles.
    Unexpected errors are logged with a traceback and the loop continues;
    task cancellation still propagates because asyncio.CancelledError is
    not an Exception subclass.
    """
    logger.info("attacker profile worker started interval=%ds", interval)
    state = _WorkerState()
    while True:
        await asyncio.sleep(interval)
        try:
            await _incremental_update(repo, state)
        except Exception:
            # logger.exception keeps the traceback; logging only str(exc)
            # made cycle failures nearly impossible to debug.
            logger.exception("attacker worker: update failed")
async def _incremental_update(repo: BaseRepository, state: _WorkerState) -> None:
    """One worker cycle: ingest new logs past the cursor, refresh touched IPs."""
    if not state.initialized:
        await _cold_start(repo, state)
        return
    touched_ips: set[str] = set()
    while True:
        rows = await repo.get_logs_after_id(state.last_log_id, limit=_BATCH_SIZE)
        if not rows:
            break
        for row in rows:
            parsed = state.engine.ingest(row["raw_line"])
            if parsed and parsed.attacker_ip:
                touched_ips.add(parsed.attacker_ip)
            state.last_log_id = row["id"]
        # A short batch means the backlog is drained — skip one extra round trip.
        if len(rows) < _BATCH_SIZE:
            break
    if touched_ips:
        await _update_profiles(repo, state, touched_ips)
    # Persist the cursor even when nothing changed, so restarts stay cheap.
    await repo.set_state(_STATE_KEY, {"last_log_id": state.last_log_id})
    if touched_ips:
        logger.info("attacker worker: updated %d profiles (incremental)", len(touched_ips))
async def _cold_start(repo: BaseRepository, state: _WorkerState) -> None:
    """Full rebuild from every stored log (first cycle or process restart)."""
    rows = await repo.get_all_logs_raw()
    if not rows:
        # Nothing stored yet — just anchor the cursor at the current max id.
        state.last_log_id = await repo.get_max_log_id()
        state.initialized = True
        await repo.set_state(_STATE_KEY, {"last_log_id": state.last_log_id})
        return
    for row in rows:
        state.engine.ingest(row["raw_line"])
        if row["id"] > state.last_log_id:
            state.last_log_id = row["id"]
    # NOTE(review): reaches into the engine's private event index; presumably
    # keys are attacker IPs (matches _update_profiles' usage) — a public
    # accessor on CorrelationEngine would be cleaner.
    every_ip = set(state.engine._events.keys())
    await _update_profiles(repo, state, every_ip)
    await repo.set_state(_STATE_KEY, {"last_log_id": state.last_log_id})
    state.initialized = True
    logger.info("attacker worker: cold start rebuilt %d profiles", len(every_ip))
async def _update_profiles(
    repo: BaseRepository,
    state: _WorkerState,
    ips: set[str],
) -> None:
    """Rebuild and upsert the attacker record (plus behavior rollup) per IP."""
    traversals_by_ip = {t.attacker_ip: t for t in state.engine.traversals(min_deckies=2)}
    bounties_by_ip = await repo.get_bounties_for_ips(ips)
    for ip in ips:
        ip_events = state.engine._events.get(ip, [])
        if not ip_events:
            continue
        record = _build_record(
            ip,
            ip_events,
            traversals_by_ip.get(ip),
            bounties_by_ip.get(ip, []),
            _extract_commands_from_events(ip_events),
        )
        attacker_uuid = await repo.upsert_attacker(record)
        # The behavioral / fingerprint rollup lives in a sibling table, so a
        # failure there must never block the core attacker profile upsert.
        try:
            behavior = build_behavior_record(ip_events)
            await repo.upsert_attacker_behavior(attacker_uuid, behavior)
        except Exception as exc:
            logger.error("attacker worker: behavior upsert failed for %s: %s", ip, exc)
def _build_record(
    ip: str,
    events: list[LogEvent],
    traversal: Any,
    bounties: list[dict[str, Any]],
    commands: list[dict[str, Any]],
) -> dict[str, Any]:
    """Assemble the row dict persisted to the attacker table for one IP."""
    timestamps = [e.timestamp for e in events]
    services = sorted({e.service for e in events})
    decky_set = {e.decky for e in events}
    # Traversal order wins when known; otherwise fall back to first-contact order.
    deckies = traversal.deckies if traversal else _first_contact_deckies(events)
    fingerprints = [b for b in bounties if b.get("bounty_type") == "fingerprint"]
    credential_count = sum(1 for b in bounties if b.get("bounty_type") == "credential")
    return {
        "ip": ip,
        "first_seen": min(timestamps),
        "last_seen": max(timestamps),
        "event_count": len(events),
        "service_count": len(services),
        "decky_count": len(decky_set),
        "services": json.dumps(services),
        "deckies": json.dumps(deckies),
        "traversal_path": traversal.path if traversal else None,
        "is_traversal": traversal is not None,
        "bounty_count": len(bounties),
        "credential_count": credential_count,
        "fingerprints": json.dumps(fingerprints),
        "commands": json.dumps(commands),
        "updated_at": datetime.now(timezone.utc),
    }
def _first_contact_deckies(events: list[LogEvent]) -> list[str]:
    """
    Return unique deckies in first-contact (chronological) order.

    Used for attackers without a recognized traversal path. Membership is
    tracked in a set for O(1) lookups instead of the previous
    `decky not in result_list` scan, which was O(n²) overall.
    """
    ordered: list[str] = []
    seen: set[str] = set()
    for e in sorted(events, key=lambda x: x.timestamp):
        if e.decky not in seen:
            seen.add(e.decky)
            ordered.append(e.decky)
    return ordered
def _extract_commands_from_events(events: list[LogEvent]) -> list[dict[str, Any]]:
    """
    Extract executed commands from LogEvent objects.

    Operates directly on LogEvent.fields (already a dict), so no JSON
    parsing is needed. Only events whose type is in _COMMAND_EVENT_TYPES
    contribute; the first truthy field from _COMMAND_FIELDS supplies the
    command text.
    """
    extracted: list[dict[str, Any]] = []
    for ev in events:
        if ev.event_type not in _COMMAND_EVENT_TYPES:
            continue
        text: str | None = None
        for field_name in _COMMAND_FIELDS:
            candidate = ev.fields.get(field_name)
            if candidate:
                text = str(candidate)
                break
        if not text:
            continue
        extracted.append({
            "service": ev.service,
            "decky": ev.decky,
            "command": text,
            "timestamp": ev.timestamp.isoformat(),
        })
    return extracted