merge: testing → main (reconcile 2-week divergence)

2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions
--- a/decnet/profiler/init.py
+++ b/decnet/profiler/init.py
@@ -0,0 +1,5 @@
+"""DECNET profiler — standalone attacker profile builder worker."""
+
+from decnet.profiler.worker import attacker_profile_worker
+
+__all__ = ["attacker_profile_worker"]
--- a/decnet/profiler/behavioral.py
+++ b/decnet/profiler/behavioral.py
@@ -0,0 +1,107 @@
+"""
+Behavioral and timing analysis for DECNET attacker profiles.
+
+This module is the orchestrator: it composes the topical sub-modules
+(`timing`, `classify`, `tools`, `phases`, `fingerprint`) into the single
+`attacker_behavior` record persisted by the profiler worker.
+
+The individual detectors live in sibling modules:
+  - `timing.py`      — inter-arrival-time statistics
+  - `classify.py`    — behavior bucket (beaconing / scanning / …)
+  - `tools.py`       — C2 beacon cadence + HTTP-header tool attribution
+  - `phases.py`      — recon → exfil phase sequencing
+  - `fingerprint.py` — sniffer + prober TCP/OS fingerprint rollup
+
+Their public symbols are re-exported here for backward compatibility with
+callers and tests that import directly from `decnet.profiler.behavioral`.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from decnet.correlation.parser import LogEvent
+from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
+
+from .classify import classify_behavior
+from .fingerprint import sniffer_rollup
+from .phases import phase_sequence
+from .timing import timing_stats
+from .tools import detect_tools_from_headers, guess_tool, guess_tools
+
+__all__ = [
+    "build_behavior_record",
+    "classify_behavior",
+    "detect_tools_from_headers",
+    "guess_tool",
+    "guess_tools",
+    "phase_sequence",
+    "sniffer_rollup",
+    "timing_stats",
+]
+
+
+@_traced("profiler.build_behavior_record")
+def build_behavior_record(events: list[LogEvent]) -> dict[str, Any]:
+    """
+    Assemble the row dict that gets persisted in the `attacker_behavior` table.
+
+    JSON-typed fields are encoded here with `json.dumps`, so callers (the
+    profiler worker) do not pre-serialize anything; doing it in one place
+    keeps the repo layer schema-agnostic.
+    """
+    # Timing is measured over the full, unfiltered event list: a C2 beacon
+    # tends to reuse one "connection" event_type per check-in, and filtering
+    # by type would discard exactly that cadence signal.
+    timing = timing_stats(events)
+    distinct_services = len({ev.service for ev in events})
+    behavior_class = classify_behavior(timing, distinct_services)
+    fp_rollup = sniffer_rollup(events)
+    phases = phase_sequence(events)
+
+    mean_iat = timing.get("mean_iat_s")
+    jitter_cv = timing.get("cv")
+
+    # Cadence-based matches first, header-based ones after; dict.fromkeys()
+    # drops duplicates while keeping first-seen order.
+    cadence_tools = guess_tools(mean_iat, jitter_cv)
+    tools: list[str] = list(dict.fromkeys(cadence_tools + detect_tools_from_headers(events)))
+
+    # A TCP-level "nmap" verdict is promoted to the front of the list: it is
+    # derived from the handshake alone, so it also covers raw port scans
+    # where no HTTP service (and hence no header match) is involved.
+    # NOTE(review): sniffer_rollup can render provider matches as
+    # "<os> <flavor>" — confirm an exact "nmap" label still reaches this check.
+    os_guess = fp_rollup["os_guess"]
+    if os_guess == "nmap" and "nmap" not in tools:
+        tools.insert(0, "nmap")
+
+    # Interval/jitter are only reported for flows classified as beaconing;
+    # for every other class the numbers would just be noise.
+    beacon_interval_s: float | None = None
+    beacon_jitter_pct: float | None = None
+    if behavior_class == "beaconing":
+        beacon_interval_s = mean_iat
+        if jitter_cv is not None:
+            beacon_jitter_pct = round(jitter_cv * 100, 2)
+
+    # Emit a summary span so the verdict is visible in traces.
+    with _get_tracer("profiler").start_as_current_span("profiler.behavior_summary") as span:
+        span.set_attribute("behavior_class", behavior_class)
+        span.set_attribute("os_guess", os_guess or "unknown")
+        span.set_attribute("tool_count", len(tools))
+        span.set_attribute("event_count", timing.get("event_count", 0))
+        if tools:
+            span.set_attribute("tools", ",".join(tools))
+
+    # Empty lists are stored as NULL rather than "[]".
+    kex_lists = fp_rollup.get("kex_order_raw") or []
+    client_banners = fp_rollup.get("ssh_client_banners") or []
+    return {
+        "os_guess": os_guess,
+        "hop_distance": fp_rollup["hop_distance"],
+        "tcp_fingerprint": json.dumps(fp_rollup["tcp_fingerprint"]),
+        "kex_order_raw": json.dumps(kex_lists) if kex_lists else None,
+        "ssh_client_banners": json.dumps(client_banners) if client_banners else None,
+        "retransmit_count": fp_rollup["retransmit_count"],
+        "behavior_class": behavior_class,
+        "beacon_interval_s": beacon_interval_s,
+        "beacon_jitter_pct": beacon_jitter_pct,
+        "tool_guesses": json.dumps(tools),
+        "timing_stats": json.dumps(timing),
+        "phase_sequence": json.dumps(phases),
+    }
--- a/decnet/profiler/classify.py
+++ b/decnet/profiler/classify.py
@@ -0,0 +1,57 @@
+"""Coarse behavior classification for DECNET attacker profiles."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from decnet.telemetry import traced as _traced
+
+
+@_traced("profiler.classify_behavior")
+def classify_behavior(stats: dict[str, Any], services_count: int) -> str:
+    """
+    Assign one coarse behavior bucket:
+      unknown | slow_scan | scanning | brute_force | beaconing | interactive | mixed
+
+    Rules, in the order they are tested (first hit wins):
+      * `unknown`     — < 3 events, or no mean IAT available
+      * `slow_scan`   — ≥ 2 services, mean IAT ≥ 10 s, ≥ 4 events
+      * `scanning`    — ≥ 3 services with mean IAT < 10 s, or
+                        ≥ 2 services with mean IAT < 2 s
+      * `brute_force` — 1 service, ≥ 8 events, mean IAT < 5 s, CV < 0.6
+      * `beaconing`   — CV < 0.35, mean IAT ≥ 5 s, ≥ 4 events
+      * `interactive` — CV ≥ 0.5, mean IAT < 5 s, ≥ 6 events
+      * `mixed`       — enough data, but nothing above matched
+    """
+    count = stats.get("event_count") or 0
+    mean_iat = stats.get("mean_iat_s")
+    if count < 3 or mean_iat is None:
+        return "unknown"
+
+    # Low-and-slow multi-service sessions are bucketed before the generic
+    # scanning rule so long gaps are not mistaken for a fast sweep.
+    multi_service = services_count >= 2
+    if multi_service and mean_iat >= 10.0 and count >= 4:
+        return "slow_scan"
+
+    # Scanning: a broad sweep (≥ 3 services) or a very rapid one (≥ 2 services).
+    if count >= 3 and (
+        (services_count >= 3 and mean_iat < 10.0)
+        or (multi_service and mean_iat < 2.0)
+    ):
+        return "scanning"
+
+    # Every remaining bucket is defined in terms of jitter; without a CV
+    # value none of them can match.
+    jitter = stats.get("cv")
+    if jitter is None:
+        return "mixed"
+
+    # Brute force: one service hammered rapidly and fairly regularly.
+    if services_count == 1 and count >= 8 and mean_iat < 5.0 and jitter < 0.6:
+        return "brute_force"
+
+    # Beaconing: steady cadence with at least 5 s between events.
+    if jitter < 0.35 and mean_iat >= 5.0 and count >= 4:
+        return "beaconing"
+
+    # Interactive: short, irregular gaps (human, or tool with think time).
+    if jitter >= 0.5 and mean_iat < 5.0 and count >= 6:
+        return "interactive"
+
+    return "mixed"
--- a/decnet/profiler/fingerprint.py
+++ b/decnet/profiler/fingerprint.py
@@ -0,0 +1,296 @@
+"""OS / TCP fingerprint rollup for DECNET attacker profiles.
+
+Consumes sniffer-emitted `tcp_syn_fingerprint` / `tcp_flow_timing` events and
+active prober `tcpfp_fingerprint` events; derives a per-attacker summary
+(os_guess, hop_distance, tcp_fingerprint snapshot, retransmit_count).
+"""
+
+from __future__ import annotations
+
+import logging
+import statistics
+from collections import Counter
+from typing import Any, Optional
+
+from decnet.correlation.parser import LogEvent
+from decnet.prober.osfp import OsMatch, get_all_providers
+from decnet.sniffer.p0f import initial_ttl as _initial_ttl_bucket
+from decnet.telemetry import traced as _traced
+
+_log = logging.getLogger("decnet.profiler.fingerprint")
+
+# Sniffer-emitted packet events that feed into fingerprint rollup.
+_SNIFFER_SYN_EVENT: str  = "tcp_syn_fingerprint"
+_SNIFFER_FLOW_EVENT: str = "tcp_flow_timing"
+# Prober-emitted active-probe result (SYN-ACK fingerprint of attacker machine).
+_PROBER_TCPFP_EVENT: str = "tcpfp_fingerprint"
+# Prober-emitted HASSHServer fingerprint; carries the raw kex_algorithms string.
+_PROBER_HASSH_EVENT: str = "hassh_fingerprint"
+# Sniffer-emitted SSH client identification string (RFC 4253 §4.2).
+_SNIFFER_SSH_BANNER_EVENT: str = "ssh_client_banner"
+
+# Canonical initial TTL for each coarse OS bucket.  Used to derive hop
+# distance when only the observed TTL is available (prober path).
+_INITIAL_TTL: dict[str, int] = {
+    "linux":    64,
+    "windows":  128,
+    "embedded": 255,
+}
+
+
+def _os_from_ttl(ttl_str: str | None) -> str | None:
+    """Map an observed TTL onto a coarse OS bucket; None when nothing fits."""
+    if not ttl_str:
+        return None
+    try:
+        observed = int(ttl_str)
+    except (TypeError, ValueError):
+        return None
+    # Inclusive (low, high) TTL windows, one per OS bucket.
+    windows = (
+        ("linux", 55, 70),
+        ("windows", 115, 135),
+        ("embedded", 235, 255),
+    )
+    for label, low, high in windows:
+        if low <= observed <= high:
+            return label
+    return None
+
+
+def _int_or_none(v: Any) -> int | None:
+    """Coerce *v* with `int()`; None, "" and unparseable values yield None."""
+    blank = v is None or v == ""
+    try:
+        return None if blank else int(v)
+    except (TypeError, ValueError):
+        return None
+
+
+def _match_via_osfp_providers(
+    tcp_fp: dict[str, Any] | None,
+    modal_ttl: str | None,
+    context: str,
+) -> Optional[OsMatch]:
+    """Feed the current tcp_fp snapshot through every enabled OS-fingerprint
+    provider and return the best match, or None.
+
+    Args:
+        tcp_fp: Latest TCP fingerprint snapshot; `window`, `wscale`, `mss`
+            and `options_sig` are read from it. Falsy → no match attempted.
+        modal_ttl: Most common observed TTL (or None); converted to an
+            initial-TTL bucket before matching.
+        context: Forwarded to providers as `obs["context"]`.
+
+    Returns:
+        The match with the highest `confidence` across all providers, or
+        None when nothing matched.
+
+    Must never raise on provider trouble — factory / provider failures
+    collapse to None so a corrupt .fp file or misconfigured
+    DECNET_OSFP_PROVIDERS env var can't wedge the profile rebuild for an
+    entire attacker. Worst case: the caller falls back to the modal-label /
+    TTL-bucket path that existed before this wiring.
+    """
+    if not tcp_fp:
+        return None
+    # Convert the observed TTL (which may be N hops below the initial TTL
+    # the remote OS uses) to the canonical initial-TTL bucket the p0f v2
+    # DB expects (32 / 64 / 128 / 255).
+    try:
+        ttl_int = int(modal_ttl) if modal_ttl is not None else None
+    except (TypeError, ValueError):
+        ttl_int = None
+    initial_ttl_bucket = _initial_ttl_bucket(ttl_int) if ttl_int is not None else None
+
+    obs: dict[str, Any] = {
+        "window":      tcp_fp.get("window"),
+        "wscale":      tcp_fp.get("wscale"),
+        "mss":         tcp_fp.get("mss"),
+        "options_sig": tcp_fp.get("options_sig"),
+        "ttl":         initial_ttl_bucket,
+        # DF and total_len are not captured today — passed as None so
+        # Signature.score treats them as soft fields (skip check when
+        # missing). Promote to hard fields once the sniffer/prober
+        # emit them on tcp_syn_fingerprint / tcpfp_fingerprint.
+        "df":          None,
+        "total_len":   None,
+        # Sniffer doesn't yet emit a quirks SD field, so the matcher
+        # sees an empty set — which matches signatures with no quirks
+        # (the common case) but not signatures with specific quirks.
+        # That's correct behaviour, not a bug.
+        "quirks":      frozenset(),
+        "context":     context,
+    }
+
+    try:
+        providers = get_all_providers()
+    except Exception as exc:  # noqa: BLE001 — must not propagate
+        _log.warning("osfp: provider init failed, skipping match: %s", exc)
+        return None
+
+    best: Optional[OsMatch] = None
+    for provider in providers:
+        try:
+            match = provider.match(obs)
+            # The confidence comparison stays inside the guard: a match whose
+            # confidence is missing or not comparable is a provider failure
+            # too and must not escape this function.
+            if match is not None and (best is None or match.confidence > best.confidence):
+                best = match
+        except Exception as exc:  # noqa: BLE001 — must not propagate
+            # getattr keeps the handler itself from raising when the
+            # misbehaving provider has no `.name` attribute.
+            _log.warning(
+                "osfp: provider %s raised during match: %s",
+                getattr(provider, "name", type(provider).__name__),
+                exc,
+            )
+    return best
+
+
+@_traced("profiler.sniffer_rollup")
+def sniffer_rollup(events: list[LogEvent]) -> dict[str, Any]:
+    """
+    Roll up sniffer and prober events (`tcp_syn_fingerprint`,
+    `tcp_flow_timing`, `tcpfp_fingerprint`, `hassh_fingerprint`,
+    `ssh_client_banner`) into a per-attacker summary.
+
+    OS guess priority:
+      1. Highest-confidence OS-fingerprint provider match on the last
+         tcp_fp snapshot.
+      2. Modal sniffer `os_guess` label (ignoring "unknown"/empty).
+      3. TTL-based coarse bucket (linux / windows / embedded) as fallback.
+    Hop distance: median of positive values only (sniffer-reported, or
+    derived from the prober's observed TTL).
+
+    Returns a dict with keys `os_guess`, `hop_distance`, `tcp_fingerprint`
+    ({} when no fingerprint event was seen), `retransmit_count`,
+    `kex_order_raw` and `ssh_client_banners` (both de-duplicated lists in
+    first-seen order).
+    """
+    os_guesses: list[str] = []
+    ttl_values: list[str] = []
+    hops: list[int] = []
+    tcp_fp: dict[str, Any] | None = None
+    ipid_latest: str | None = None
+    isn_latest: str | None = None
+    # Tracks which event set tcp_fp last — picks the provider "context"
+    # (syn vs synack) when we feed the p0f-v2 matcher below.
+    tcp_fp_context: str = "syn"
+    retransmits = 0
+    kex_order_raw: list[str] = []
+    _kex_seen: set[str] = set()
+    ssh_client_banners: list[str] = []
+    _ssh_banner_seen: set[str] = set()
+
+    for e in events:
+        if e.event_type == _SNIFFER_SYN_EVENT:
+            og = e.fields.get("os_guess")
+            if og and og != "unknown":
+                os_guesses.append(og)
+
+            # Collect raw TTL for fallback OS derivation.
+            ttl_raw = e.fields.get("ttl") or e.fields.get("initial_ttl")
+            if ttl_raw:
+                ttl_values.append(ttl_raw)
+
+            # Only include hop distances that are valid and non-zero.
+            hop_raw = e.fields.get("hop_distance")
+            if hop_raw:
+                try:
+                    hop_val = int(hop_raw)
+                    if hop_val > 0:
+                        hops.append(hop_val)
+                except (TypeError, ValueError):
+                    pass
+
+            # Keep the most recently iterated snapshot (events are taken in
+            # the order given; nothing is re-sorted here).
+            tcp_fp = {
+                "window": _int_or_none(e.fields.get("window")),
+                "wscale": _int_or_none(e.fields.get("wscale")),
+                "mss": _int_or_none(e.fields.get("mss")),
+                "options_sig": e.fields.get("options_sig", ""),
+                "has_sack": e.fields.get("has_sack") == "true",
+                "has_timestamps": e.fields.get("has_timestamps") == "true",
+                "tos": _int_or_none(e.fields.get("tos")),
+                "dscp": _int_or_none(e.fields.get("dscp")),
+                "ecn": _int_or_none(e.fields.get("ecn")),
+            }
+            # Sequence classifications converge as samples accumulate; the
+            # most recent non-"unknown" label wins so a later "unknown" event
+            # (e.g. a deque reset) doesn't overwrite a confident verdict.
+            ipid_class = e.fields.get("ipid_class")
+            if ipid_class and ipid_class != "unknown":
+                ipid_latest = ipid_class
+            tcp_fp["ipid_class"] = ipid_latest
+            isn_class = e.fields.get("isn_class")
+            if isn_class and isn_class != "unknown":
+                isn_latest = isn_class
+            tcp_fp["isn_class"] = isn_latest
+            tcp_fp_context = "syn"
+
+        elif e.event_type == _SNIFFER_FLOW_EVENT:
+            # Unparseable retransmit counters are ignored rather than fatal.
+            try:
+                retransmits += int(e.fields.get("retransmits", "0"))
+            except (TypeError, ValueError):
+                pass
+
+        elif e.event_type == _PROBER_HASSH_EVENT:
+            # Prober HASSHServer probe: preserve the raw kex_algorithms list
+            # for post-hoc ordering analysis. Dedup because a single attacker
+            # SSH service will emit the same list per port/probe cycle.
+            kex = e.fields.get("kex_algorithms")
+            if kex and kex not in _kex_seen:
+                kex_order_raw.append(kex)
+                _kex_seen.add(kex)
+
+        elif e.event_type == _SNIFFER_SSH_BANNER_EVENT:
+            # Sniffer-observed SSH identification string from attacker.
+            # Dedup: the same attacker will reuse the same client banner
+            # across flows/reconnects; record distinct values in order seen.
+            banner = e.fields.get("ssh_version")
+            if banner and banner not in _ssh_banner_seen:
+                ssh_client_banners.append(banner)
+                _ssh_banner_seen.add(banner)
+
+        elif e.event_type == _PROBER_TCPFP_EVENT:
+            # Active-probe result: prober sent SYN to attacker, got SYN-ACK back.
+            # Field names differ from the passive sniffer (different emitter).
+            ttl_raw = e.fields.get("ttl")
+            if ttl_raw:
+                ttl_values.append(ttl_raw)
+
+                # Derive hop distance from observed TTL vs canonical initial TTL.
+                os_hint = _os_from_ttl(ttl_raw)
+                if os_hint:
+                    initial = _INITIAL_TTL.get(os_hint)
+                    if initial:
+                        try:
+                            hop_val = initial - int(ttl_raw)
+                            if hop_val > 0:
+                                hops.append(hop_val)
+                        except (TypeError, ValueError):
+                            pass
+
+            # Prober uses window_size/window_scale/options_order instead of
+            # the sniffer's window/wscale/options_sig. Unlike the sniffer
+            # snapshot, this one carries no ipid_class / isn_class keys.
+            tcp_fp = {
+                "window":         _int_or_none(e.fields.get("window_size")),
+                "wscale":         _int_or_none(e.fields.get("window_scale")),
+                "mss":            _int_or_none(e.fields.get("mss")),
+                "options_sig":    e.fields.get("options_order", ""),
+                "has_sack":       e.fields.get("sack_ok") == "1",
+                "has_timestamps": e.fields.get("timestamp") == "1",
+                "tos":            _int_or_none(e.fields.get("tos")),
+                "dscp":           _int_or_none(e.fields.get("dscp")),
+                "ecn":            _int_or_none(e.fields.get("ecn")),
+            }
+            tcp_fp_context = "synack"  # prober sent SYN, captured attacker's SYN-ACK
+
+    # OS-guess resolution chain:
+    #   1. p0f-v2 (or whichever providers DECNET_OSFP_PROVIDERS enables)
+    #      matched against the latest tcp_fp snapshot — the 375-sig
+    #      vendored DB is far more discriminating than what follows.
+    #   2. Modal sniffer-emitted label from the old ~10-sig hand-rolled
+    #      table in decnet/sniffer/p0f.py. Kept as fallback because the
+    #      vendored v2 DB predates post-2006 kernels.
+    #   3. TTL bucket (linux / windows / embedded). Coarse; still None
+    #      when the modal TTL falls outside every bucket window.
+    os_guess: str | None = None
+    modal_ttl = Counter(ttl_values).most_common(1)[0][0] if ttl_values else None
+
+    osfp_match = _match_via_osfp_providers(tcp_fp, modal_ttl, tcp_fp_context)
+    if osfp_match is not None:
+        # Render "Linux" + "2.6.x kernel" as "Linux 2.6.x kernel" — a single
+        # string fits the existing os_guess column contract. Flavor can be
+        # empty for generic signatures, in which case we just emit the OS.
+        os_guess = osfp_match.os if not osfp_match.flavor else f"{osfp_match.os} {osfp_match.flavor}"
+    elif os_guesses:
+        os_guess = Counter(os_guesses).most_common(1)[0][0]
+    elif modal_ttl is not None:
+        os_guess = _os_from_ttl(modal_ttl)
+
+    # Median hop distance (robust to the occasional weird TTL). int()
+    # truncates the .5 an even-sized sample can produce.
+    hop_distance: int | None = None
+    if hops:
+        hop_distance = int(statistics.median(hops))
+
+    return {
+        "os_guess": os_guess,
+        "hop_distance": hop_distance,
+        "tcp_fingerprint": tcp_fp or {},
+        "retransmit_count": retransmits,
+        "kex_order_raw": kex_order_raw,
+        "ssh_client_banners": ssh_client_banners,
+    }
--- a/decnet/profiler/identity_rollup.py
+++ b/decnet/profiler/identity_rollup.py
@@ -0,0 +1,109 @@
+"""Identity-level fingerprint rollup.
+
+The clusterer mints :class:`AttackerIdentity` rows (and merges them) from
+union-find over per-IP :class:`Attacker` observations. Each ``Attacker``
+row already carries a ``fingerprints`` JSON list — the output of the
+profiler's ``_build_record`` flatten of every ``bounty_type='fingerprint'``
+bounty seen for that IP. This module distils that per-observation list
+into the cross-observation summary columns on ``AttackerIdentity``:
+
+* ``ja3_hashes``        — TLS ClientHello fingerprints
+* ``hassh_hashes``      — SSH KEX fingerprints
+* ``tls_cert_sha256``   — leaf cert SHA-256s presented by attacker-run
+                          TLS servers (active-prober capture)
+
+These are JSON-serialised ``list[str]`` columns shaped for federation
+gossip — same wire format the campaign clusterer reads. The values are
+deduplicated and sorted so two clusterer runs over the same input produce
+byte-identical column writes.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any, Iterable, Optional
+
+
+# Bounty payload key per fingerprint family. Only fingerprints whose
+# payload carries a stable scalar identifier roll up cleanly here —
+# tcpfp / http_quirks / ja4l etc. don't fit the "list of hashes" shape
+# and stay out of the rollup until they get their own columns.
+_PAYLOAD_KEY_BY_FP_TYPE: dict[str, str] = {
+    "ja3":             "ja3",
+    "hassh_server":    "hash",
+    "tls_certificate": "cert_sha256",
+}
+
+# Summary column each fingerprint family is collected into. Keys mirror
+# _PAYLOAD_KEY_BY_FP_TYPE; extract_fp_summaries skips any family that is
+# missing from either map.
+_COLUMN_BY_FP_TYPE: dict[str, str] = {
+    "ja3":             "ja3_hashes",
+    "hassh_server":    "hassh_hashes",
+    "tls_certificate": "tls_cert_sha256",
+}
+
+
+def _payload_of(entry: Any) -> dict[str, Any]:
+    """Extract a fingerprint bounty entry's payload as a dict ({} if unusable)."""
+    if not isinstance(entry, dict):
+        return {}
+    raw = entry.get("payload")
+    if not isinstance(raw, (dict, str)):
+        # No dict/str payload: assume a legacy entry whose payload fields
+        # were flattened onto the entry itself.
+        return entry
+    if isinstance(raw, dict):
+        return raw
+    # String payloads are expected to hold a JSON object.
+    try:
+        decoded = json.loads(raw)
+    except (TypeError, ValueError):
+        return {}
+    return decoded if isinstance(decoded, dict) else {}
+
+
+def _parse_fingerprints(raw: Any) -> list[dict[str, Any]]:
+    """Leniently decode an Attacker.fingerprints value into its dict entries."""
+    candidate = raw
+    if isinstance(candidate, str):
+        try:
+            candidate = json.loads(candidate)
+        except (TypeError, ValueError):
+            return []
+    # Anything that is not a list by now (None, scalars, JSON objects, …)
+    # holds no entries; non-dict items inside a list are dropped.
+    if not isinstance(candidate, list):
+        return []
+    return [item for item in candidate if isinstance(item, dict)]
+
+
+def extract_fp_summaries(
+    member_rows: Iterable[dict[str, Any]],
+) -> dict[str, Optional[str]]:
+    """Collect fingerprint hashes from the given Attacker rows, per column.
+
+    The result has one key per summary column (``ja3_hashes``,
+    ``hassh_hashes``, ``tls_cert_sha256``). Each value is a JSON-encoded
+    ``list[str]`` — de-duplicated and sorted — or ``None`` when no hash
+    was found for that column. ``None`` rather than ``"[]"`` keeps the
+    column NULL, so downstream readers can tell "no data yet" apart from
+    "actively known to be empty".
+
+    Pure: no DB, no clock, no I/O. The clusterer drives the call.
+    """
+    seen: dict[str, set[str]] = {}
+    for column in _COLUMN_BY_FP_TYPE.values():
+        seen[column] = set()
+
+    for row in member_rows:
+        payloads = map(_payload_of, _parse_fingerprints(row.get("fingerprints")))
+        for payload in payloads:
+            family = payload.get("fingerprint_type")
+            if not isinstance(family, str):
+                continue
+            hash_key = _PAYLOAD_KEY_BY_FP_TYPE.get(family)
+            column = _COLUMN_BY_FP_TYPE.get(family)
+            if hash_key is None or column is None:
+                # Family without a hash-list column: not rolled up.
+                continue
+            digest = payload.get(hash_key)
+            if isinstance(digest, str) and digest:
+                seen[column].add(digest)
+
+    # Sorting makes the serialized output independent of row order.
+    summaries: dict[str, Optional[str]] = {}
+    for column, hashes in seen.items():
+        summaries[column] = json.dumps(sorted(hashes)) if hashes else None
+    return summaries
--- a/decnet/profiler/phases.py
+++ b/decnet/profiler/phases.py
@@ -0,0 +1,68 @@
+"""Recon → exfil phase sequencing for DECNET attacker profiles."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from decnet.correlation.parser import LogEvent
+from decnet.telemetry import traced as _traced
+
+# Events that signal "recon" phase (scans, probes, auth attempts).
+_RECON_EVENT_TYPES: frozenset[str] = frozenset({
+    "scan", "connection", "banner", "probe",
+    "login_attempt", "auth", "auth_failure",
+})
+
+# Events that signal "exfil" / action-on-objective phase.
+_EXFIL_EVENT_TYPES: frozenset[str] = frozenset({
+    "download", "upload", "file_transfer", "data_exfil",
+    "command", "exec", "query", "shell_input",
+})
+
+# Fields carrying payload byte counts (for "large payload" detection).
+_PAYLOAD_SIZE_FIELDS: tuple[str, ...] = ("bytes", "size", "content_length")
+
+
+@_traced("profiler.phase_sequence")
+def phase_sequence(events: list[LogEvent]) -> dict[str, Any]:
+    """
+    Summarize the recon → exfil transition for a set of events.
+
+    Result keys:
+      recon_end_ts       : ISO timestamp of the last recon-class event, or None
+      exfil_start_ts     : ISO timestamp of the first exfil-class event, or None
+      exfil_latency_s    : seconds from recon end to exfil start; None unless
+                           both exist and exfil does not start before recon ends
+      large_payload_count: number of events with a size field ≥ 1 MiB
+                           (heuristic for bulk data transfer)
+    """
+    def _has_large_payload(event: LogEvent) -> bool:
+        # Any size field at or above 1 MiB flags the event (counted once);
+        # missing or non-numeric values are skipped.
+        for field_name in _PAYLOAD_SIZE_FIELDS:
+            value = event.fields.get(field_name)
+            if value is None:
+                continue
+            try:
+                if int(value) >= 1_048_576:
+                    return True
+            except (TypeError, ValueError):
+                continue
+        return False
+
+    last_recon = None
+    first_exfil = None
+    large_payloads = 0
+
+    # Walk the events chronologically: recon keeps moving forward to the
+    # latest hit, exfil sticks to the earliest one.
+    for event in sorted(events, key=lambda x: x.timestamp):
+        kind = event.event_type
+        if kind in _RECON_EVENT_TYPES:
+            last_recon = event.timestamp
+        elif first_exfil is None and kind in _EXFIL_EVENT_TYPES:
+            first_exfil = event.timestamp
+
+        if _has_large_payload(event):
+            large_payloads += 1
+
+    latency_s: float | None = None
+    both_seen = last_recon is not None and first_exfil is not None
+    if both_seen and first_exfil >= last_recon:
+        latency_s = round((first_exfil - last_recon).total_seconds(), 3)
+
+    return {
+        "recon_end_ts": last_recon.isoformat() if last_recon else None,
+        "exfil_start_ts": first_exfil.isoformat() if first_exfil else None,
+        "exfil_latency_s": latency_s,
+        "large_payload_count": large_payloads,
+    }
--- a/decnet/profiler/timing.py
+++ b/decnet/profiler/timing.py
@@ -0,0 +1,82 @@
+"""Inter-arrival timing statistics for DECNET attacker profiles."""
+
+from __future__ import annotations
+
+import statistics
+from typing import Any
+
+from decnet.correlation.parser import LogEvent
+from decnet.telemetry import traced as _traced
+
+
+@_traced("profiler.timing_stats")
+def timing_stats(events: list[LogEvent]) -> dict[str, Any]:
+    """
+    Summarize the gaps between consecutive events, ordered by timestamp.
+
+    The result always carries the same keys:
+      event_count, duration_s, mean_iat_s, median_iat_s, stdev_iat_s,
+      min_iat_s, max_iat_s, cv
+
+    With fewer than two events (or no usable interval) every interval-based
+    field is None; `cv` is also None when the mean interval is zero.
+    """
+    def _summary(count: int, duration: float, **interval_fields: Any) -> dict[str, Any]:
+        # Single place that fixes the key order shared by every return
+        # path; interval fields that are not supplied default to None.
+        summary: dict[str, Any] = {"event_count": count, "duration_s": duration}
+        for key in (
+            "mean_iat_s", "median_iat_s", "stdev_iat_s",
+            "min_iat_s", "max_iat_s", "cv",
+        ):
+            summary[key] = interval_fields.get(key)
+        return summary
+
+    if not events:
+        return _summary(0, 0.0)
+
+    ordered = sorted(events, key=lambda e: e.timestamp)
+    span_s = round((ordered[-1].timestamp - ordered[0].timestamp).total_seconds(), 3)
+
+    # Gaps between chronological neighbours (none for a single event).
+    # Negative values (clock skew) are filtered out as a safety net.
+    gaps = [
+        gap
+        for gap in (
+            (later.timestamp - earlier.timestamp).total_seconds()
+            for earlier, later in zip(ordered, ordered[1:])
+        )
+        if gap >= 0
+    ]
+    if not gaps:
+        return _summary(len(ordered), span_s)
+
+    mean = statistics.fmean(gaps)
+    median = statistics.median(gaps)
+    # Population stdev; a single gap has no spread.
+    stdev = statistics.pstdev(gaps) if len(gaps) > 1 else 0.0
+    # Coefficient of variation is undefined for a zero mean.
+    cv = (stdev / mean) if mean > 0 else None
+
+    return _summary(
+        len(ordered),
+        span_s,
+        mean_iat_s=round(mean, 3),
+        median_iat_s=round(median, 3),
+        stdev_iat_s=round(stdev, 3),
+        min_iat_s=round(min(gaps), 3),
+        max_iat_s=round(max(gaps), 3),
+        cv=round(cv, 4) if cv is not None else None,
+    )
--- a/decnet/profiler/tools.py
+++ b/decnet/profiler/tools.py
@@ -0,0 +1,179 @@
+"""Tool attribution for DECNET attacker profiles.
+
+Two detection paths:
+
+  * `guess_tools()` — matches beacon cadence (mean IAT + CV jitter) against
+    known C2 default profiles (Cobalt Strike, Sliver, Havoc, Mythic).
+  * `detect_tools_from_headers()` — scans HTTP `request` events for
+    tool-identifying User-Agent / X-Mailer / etc. headers (Nmap NSE, sqlmap,
+    nuclei, masscan, metasploit, curl, and friends).
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from typing import Any
+
+from decnet.correlation.parser import LogEvent
+from decnet.telemetry import traced as _traced
+
+# ─── C2 tool attribution signatures (beacon timing) ─────────────────────────
+#
+# Each entry lists the default beacon cadence profile of a popular C2.
+# A profile *matches* an attacker when:
+#   - mean inter-event time is within ±`interval_tolerance_s` seconds of
+#     `interval_s`, AND
+#   - jitter (cv = stdev / mean) is within ±`jitter_tolerance` of `jitter_cv`
+#
+# Both bounds are inclusive (a value sitting exactly on the tolerance edge
+# still matches).
+#
+# Multiple matches are all returned (attacker may run multiple implants).
+# The cobalt_strike and sliver windows overlap, so one cadence can match both.
+
+_TOOL_SIGNATURES: tuple[dict[str, Any], ...] = (
+    {
+        "name": "cobalt_strike",
+        "interval_s": 60.0,
+        "interval_tolerance_s": 8.0,
+        "jitter_cv": 0.20,
+        "jitter_tolerance": 0.05,
+    },
+    {
+        "name": "sliver",
+        "interval_s": 60.0,
+        "interval_tolerance_s": 10.0,
+        "jitter_cv": 0.30,
+        "jitter_tolerance": 0.08,
+    },
+    {
+        "name": "havoc",
+        "interval_s": 45.0,
+        "interval_tolerance_s": 8.0,
+        "jitter_cv": 0.10,
+        "jitter_tolerance": 0.03,
+    },
+    {
+        "name": "mythic",
+        "interval_s": 30.0,
+        "interval_tolerance_s": 6.0,
+        "jitter_cv": 0.15,
+        "jitter_tolerance": 0.03,
+    },
+)
+
+# ─── Header-based tool signatures ───────────────────────────────────────────
+#
+# Scanned against HTTP `request` events by `detect_tools_from_headers()`.
+#
+# `pattern` is a case-insensitive substring, unless it starts with `^`, in
+# which case it is applied as a case-insensitive regex via `re.match` (so it
+# is anchored at the start of the header value).
+#
+# `header` must be written in lowercase: it is looked up in a copy of the
+# event's headers dict whose keys have been lowercased.
+#
+# Each tool name is reported at most once per scan, in detection order.
+
+_HEADER_TOOL_SIGNATURES: tuple[dict[str, str], ...] = (
+    {"name": "nmap",             "header": "user-agent", "pattern": "Nmap Scripting Engine"},
+    {"name": "gophish",          "header": "x-mailer",   "pattern": "gophish"},
+    {"name": "nikto",            "header": "user-agent", "pattern": "Nikto"},
+    {"name": "sqlmap",           "header": "user-agent", "pattern": "sqlmap"},
+    {"name": "nuclei",           "header": "user-agent", "pattern": "Nuclei"},
+    {"name": "masscan",          "header": "user-agent", "pattern": "masscan"},
+    {"name": "zgrab",            "header": "user-agent", "pattern": "zgrab"},
+    {"name": "metasploit",       "header": "user-agent", "pattern": "Metasploit"},
+    {"name": "curl",             "header": "user-agent", "pattern": "^curl/"},
+    {"name": "python_requests",  "header": "user-agent", "pattern": "python-requests"},
+    {"name": "gobuster",         "header": "user-agent", "pattern": "gobuster"},
+    {"name": "dirbuster",        "header": "user-agent", "pattern": "DirBuster"},
+    {"name": "hydra",            "header": "user-agent", "pattern": "hydra"},
+    {"name": "wfuzz",            "header": "user-agent", "pattern": "Wfuzz"},
+)
+
+
+def guess_tools(mean_iat_s: float | None, cv: float | None) -> list[str]:
+    """
+    Return every C2 profile whose default beacon cadence fits the inputs.
+
+    A profile fits when *mean_iat_s* lies within its interval tolerance and
+    *cv* lies within its jitter tolerance (both bounds inclusive).  The
+    result follows the order of `_TOOL_SIGNATURES` and may be empty; it can
+    hold several names because more than one implant profile may fit.
+    Either input being None means there is nothing to compare, so the
+    result is an empty list.
+    """
+    if mean_iat_s is None or cv is None:
+        return []
+
+    # Interval is tested first; the jitter test is only evaluated for
+    # profiles that already passed the interval test.
+    return [
+        profile["name"]
+        for profile in _TOOL_SIGNATURES
+        if not (abs(mean_iat_s - profile["interval_s"]) > profile["interval_tolerance_s"])
+        and not (abs(cv - profile["jitter_cv"]) > profile["jitter_tolerance"])
+    ]
+
+
+# Legacy single-result wrapper, kept so callers that expect one string (or
+# None) keep working.  Deprecated.  An ambiguous cadence — one that matches
+# several profiles — yields None rather than an arbitrary first hit.
+def guess_tool(mean_iat_s: float | None, cv: float | None) -> str | None:
+    """Deprecated: use guess_tools() instead.
+
+    Returns the tool name only when exactly one profile matches; returns
+    None when nothing matches or when more than one profile matches.
+    """
+    matches = guess_tools(mean_iat_s, cv)
+    return matches[0] if len(matches) == 1 else None
+
+
+def _coerce_headers(raw_headers: Any) -> dict[Any, Any] | None:
+    """Return *raw_headers* as a dict, or None when it cannot be used.
+
+    Headers may arrive as a dict already (in-memory / test paths), as a JSON
+    string, or as a Python-repr string (legacy events).  Anything that does
+    not end up as a dict — including valid JSON that is not an object, such
+    as ``"[]"``, ``"42"`` or ``"null"`` — yields None so the caller can skip
+    the event instead of failing on ``.items()``.
+    """
+    if isinstance(raw_headers, dict):
+        return raw_headers
+    if not isinstance(raw_headers, str):
+        return None
+
+    try:
+        parsed = json.loads(raw_headers)
+    except (json.JSONDecodeError, ValueError):
+        # Backward-compat: events written before the JSON-encode fix were
+        # serialized as Python repr via str(dict).  ast.literal_eval handles
+        # that safely (no arbitrary code execution).
+        import ast as _ast
+
+        try:
+            parsed = _ast.literal_eval(raw_headers)
+        except Exception:  # nosec B112 — skip unparseable header values
+            return None
+
+    return parsed if isinstance(parsed, dict) else None
+
+
+@_traced("profiler.detect_tools_from_headers")
+def detect_tools_from_headers(events: list[LogEvent]) -> list[str]:
+    """
+    Scan HTTP `request` events for tool-identifying headers.
+
+    Each event's `headers` field is normalised to a dict with lowercase
+    keys and string values, then checked against every entry of
+    `_HEADER_TOOL_SIGNATURES`.  Events of another type, events without
+    headers, and events whose headers cannot be turned into a dict are
+    skipped.
+
+    Returns a deduplicated list of matched tool names in detection order.
+    """
+    found: list[str] = []
+    seen: set[str] = set()
+
+    for e in events:
+        if e.event_type != "request":
+            continue
+
+        raw_headers = e.fields.get("headers")
+        if not raw_headers:
+            continue
+
+        headers = _coerce_headers(raw_headers)
+        if headers is None:
+            continue
+
+        # Normalise header keys to lowercase for matching.  str() guards
+        # against non-string keys, which the legacy repr path can produce.
+        lc_headers: dict[str, str] = {
+            str(k).lower(): str(v) for k, v in headers.items()
+        }
+
+        for sig in _HEADER_TOOL_SIGNATURES:
+            name = sig["name"]
+            if name in seen:
+                continue
+            value = lc_headers.get(sig["header"])
+            if value is None:
+                continue
+
+            pattern = sig["pattern"]
+            if pattern.startswith("^"):
+                # Anchored signatures are regexes matched from the start.
+                matched = re.match(pattern, value, re.IGNORECASE) is not None
+            else:
+                matched = pattern.lower() in value.lower()
+
+            if matched:
+                found.append(name)
+                seen.add(name)
+
+    return found
--- a/decnet/profiler/worker.py
+++ b/decnet/profiler/worker.py
@@ -0,0 +1,442 @@
+"""
+Attacker profile builder — incremental background worker.
+
+Maintains a persistent CorrelationEngine and a log-ID cursor across cycles.
+On cold start (first cycle or process restart), performs one full build from
+all stored logs.  Subsequent cycles fetch only new logs via the cursor,
+ingest them into the existing engine, and rebuild profiles for affected IPs
+only.
+
+Complexity per cycle: O(new_logs + affected_ips) instead of O(total_logs²).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import json
+import re
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any, Callable
+
+from decnet.bus import topics as _topics
+from decnet.bus.factory import get_bus
+from decnet.bus.publish import (
+    make_thread_safe_publisher,
+    run_control_listener,
+    run_health_heartbeat,
+)
+from decnet.correlation.engine import CorrelationEngine
+from decnet.correlation.parser import LogEvent
+from decnet.asn import enrich_ip as enrich_ip_asn
+from decnet.geoip import enrich_ip
+from decnet.geoip.ptr import resolve_ptr_record
+from decnet.logging import get_logger
+from decnet.profiler.behavioral import build_behavior_record
+from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
+from decnet.web.db.repository import BaseRepository
+
+logger = get_logger("attacker_worker")
+
+# Maximum number of log rows requested per `get_logs_after_id` call.
+_BATCH_SIZE = 500
+# Repository state key under which the `{"last_log_id": ...}` cursor is stored.
+_STATE_KEY = "attacker_worker_cursor"
+
+# Event types that indicate active command/query execution — the
+# shell-family subset of INTERACTION_EVENT_TYPES in
+# decnet/correlation/event_kinds.py. Kept here because this set is a
+# stricter filter (commands that carry text to extract, vs. interactions
+# like RCPT TO or file upload that don't). A test in
+# tests/profiler/ asserts it's a subset of the canonical interaction
+# set so they can't drift.
+_COMMAND_EVENT_TYPES = frozenset({
+    "command", "exec", "query", "input", "shell_input",
+    "execute", "run", "sql_query", "redis_command",
+})
+
+# Fields that carry the executed command/query text. They are probed in this
+# order and the first non-empty value wins.
+_COMMAND_FIELDS = ("command", "query", "input", "line", "sql", "cmd")
+
+# SMTP events that carry a recipient email address. `rcpt_to` fires once per
+# accepted RCPT (open-relay mode), `rcpt_denied` once per denied RCPT
+# (harvester mode). `message_accepted` carries the comma-joined rcpt list
+# on the final DATA commit — covered for replay safety, though every
+# address it contains already arrived via `rcpt_to` earlier in the session.
+_SMTP_RCPT_EVENTS = frozenset({"rcpt_to", "rcpt_denied", "message_accepted"})
+
+# Pseudo-TLDs we never want to report on: the RFC 6761 special-use names
+# plus common lab-only values. Matching happens on the *last* label only, so
+# `foo.example` is filtered but `example.corp` is not.
+# NOTE(review): `foo.example.com` ends in `com` and is therefore NOT filtered
+# by the last-label check. If reserved second-level names such as
+# `example.com` should be dropped too, the check in `_normalize_smtp_domain`
+# needs extending — confirm the intended behaviour.
+_BLOCKED_TLDS = frozenset({"invalid", "test", "localhost", "local", "example"})
+
+
+@dataclass
+class _WorkerState:
+    """Mutable state the profiler worker carries from one update cycle to the next."""
+
+    # Correlation engine that log lines are ingested into on every cycle.
+    engine: CorrelationEngine = field(default_factory=CorrelationEngine)
+    # ID of the last log row ingested; used as the cursor for the next fetch.
+    last_log_id: int = 0
+    # Set to True once a saved cursor has been restored at startup or the
+    # first update pass has finished ingesting.
+    initialized: bool = False
+    # Optional bus hook — fires ``("scored", payload)`` per profile upsert.
+    # None when the bus is disabled or unreachable.
+    publish_attacker: Callable[[str, dict[str, Any]], None] | None = None
+    # Set of IPs we've already tried to PTR-resolve in this worker's
+    # lifetime. Bounds retry to once per worker boot so a persistently
+    # NXDOMAIN-returning IP doesn't burn 2s of tick time on every cycle.
+    ptr_attempted: set[str] = field(default_factory=set)
+
+
+async def attacker_profile_worker(repo: BaseRepository, *, interval: int = 30) -> None:
+    """Periodically updates the Attacker table incrementally. Designed to run as an asyncio Task.
+
+    Lifecycle:
+      1. Try to connect to the bus; on any failure continue without it.
+      2. Build the worker state and restore the persisted log cursor, if any.
+      3. Start the heartbeat and control-listener tasks.
+      4. Loop: wait up to *interval* seconds (or until shutdown is signalled),
+         then run one incremental update.  The wait comes first, so the first
+         update runs one interval after startup.
+      5. On exit, cancel the helper tasks and close the bus.
+
+    Args:
+        repo: Repository used for cursor state, log reads and profile upserts.
+        interval: Seconds between update cycles.
+
+    A failing update cycle is logged and does not stop the loop.
+    """
+    logger.info("attacker profile worker started interval=%ds", interval)
+
+    # Optional bus wiring — correlator-family publishes ride on the profiler
+    # worker because CorrelationEngine lives inside it.  If the bus is off or
+    # unreachable the engine runs with publish_fn=None and downstream degrades
+    # to DB-only.
+    bus = None
+    try:
+        bus = get_bus(client_name="profiler")
+        await bus.connect()
+    except Exception as exc:
+        logger.warning("profiler: bus unavailable, continuing without publish: %s", exc)
+        bus = None
+
+    loop = asyncio.get_running_loop()
+    raw_publish = make_thread_safe_publisher(bus, loop) if bus is not None else None
+
+    # Always a callable (never None): it silently drops the event when no bus
+    # publisher is available.
+    def _publish_attacker(event_type: str, payload: dict[str, Any]) -> None:
+        if raw_publish is None:
+            return
+        raw_publish(_topics.attacker(event_type), payload, event_type)
+
+    state = _WorkerState(
+        engine=CorrelationEngine(publish_fn=_publish_attacker),
+        publish_attacker=_publish_attacker,
+    )
+    # NOTE(review): the engine created above starts empty. When a saved cursor
+    # is restored, only logs newer than the cursor are ingested afterwards, so
+    # profiles are rebuilt from the events seen since this process started.
+    # The module docstring describes a full rebuild on process restart —
+    # confirm which behaviour is intended and that upserts built from partial
+    # history do not clobber stored totals.
+    _saved_cursor = await repo.get_state(_STATE_KEY)
+    if _saved_cursor:
+        state.last_log_id = _saved_cursor.get("last_log_id", 0)
+        state.initialized = True
+        logger.info("attacker worker: resumed from cursor last_log_id=%d", state.last_log_id)
+
+    # Workers panel wiring: heartbeat + bus-driven stop.  Main loop is
+    # pure asyncio sleep/await, so an event-based control listener
+    # drops in cleanly without a SIGTERM self-signal.
+    # NOTE(review): `bus` may be None here (bus unavailable); presumably both
+    # helpers tolerate that — verify against decnet.bus.publish.
+    shutdown = asyncio.Event()
+    heartbeat_task = asyncio.create_task(run_health_heartbeat(bus, "profiler"))
+    control_task = asyncio.create_task(
+        run_control_listener(bus, "profiler", shutdown),
+    )
+    try:
+        while not shutdown.is_set():
+            # Interruptible sleep: returns early as soon as shutdown is set.
+            try:
+                await asyncio.wait_for(shutdown.wait(), timeout=interval)
+            except asyncio.TimeoutError:
+                pass  # normal tick
+            if shutdown.is_set():
+                break
+            # Best-effort cycle: log and carry on so one bad batch cannot
+            # kill the worker.
+            try:
+                await _incremental_update(repo, state)
+            except Exception as exc:
+                logger.error("attacker worker: update failed: %s", exc)
+    finally:
+        # Cancel the helper tasks and wait for each to finish, swallowing the
+        # resulting CancelledError (or any error they raise while stopping).
+        for t in (heartbeat_task, control_task):
+            t.cancel()
+            with contextlib.suppress(Exception, asyncio.CancelledError):
+                await t
+        if bus is not None:
+            with contextlib.suppress(Exception):
+                await bus.close()
+
+
+@_traced("profiler.incremental_update")
+async def _incremental_update(repo: BaseRepository, state: _WorkerState) -> None:
+    """Ingest every log row past the cursor and refresh the affected profiles.
+
+    Rows are pulled in pages of `_BATCH_SIZE`, fed to the engine, and the
+    in-memory cursor is advanced row by row.  Profiles are rebuilt only for
+    the attacker IPs seen in this pass.  The cursor is persisted once per
+    pass, after the profile update when there is one.
+    """
+    cold_start = not state.initialized
+    touched_ips: set[str] = set()
+
+    more_pages = True
+    while more_pages:
+        rows = await repo.get_logs_after_id(state.last_log_id, limit=_BATCH_SIZE)
+        if not rows:
+            break
+
+        for row in rows:
+            parsed = state.engine.ingest(row["raw_line"])
+            if parsed and parsed.attacker_ip:
+                touched_ips.add(parsed.attacker_ip)
+            state.last_log_id = row["id"]
+
+        # Hand control back to the event loop between pages.
+        await asyncio.sleep(0)
+
+        # A short page means the backlog is drained.
+        more_pages = len(rows) >= _BATCH_SIZE
+
+    state.initialized = True
+
+    if touched_ips:
+        await _update_profiles(repo, state, touched_ips)
+    await repo.set_state(_STATE_KEY, {"last_log_id": state.last_log_id})
+    if not touched_ips:
+        return
+
+    profile_count = len(touched_ips)
+    if cold_start:
+        logger.info("attacker worker: cold start rebuilt %d profiles", profile_count)
+    else:
+        logger.info("attacker worker: updated %d profiles (incremental)", profile_count)
+
+
+# Upper bound on PTR lookups allowed in flight at the same time.
+_PTR_CONCURRENCY = 10
+
+
+async def _resolve_ptrs_for(ips: list[str]) -> dict[str, Any]:
+    """Look up the PTR record of every address in *ips*, a few at a time.
+
+    The result maps each input IP to whatever ``resolve_ptr_record``
+    returned for it.  At most ``_PTR_CONCURRENCY`` lookups run at once so
+    that a cold start with hundreds of new IPs does not flood the OS
+    resolver.  An empty input gives an empty dict.
+    """
+    if not ips:
+        return {}
+
+    limiter = asyncio.Semaphore(_PTR_CONCURRENCY)
+
+    async def _lookup(address: str) -> tuple[str, Any]:
+        async with limiter:
+            ptr = await resolve_ptr_record(address)
+            return address, ptr
+
+    pairs = await asyncio.gather(*map(_lookup, ips))
+    return dict(pairs)
+
+
+@_traced("profiler.update_profiles")
+async def _update_profiles(
+    repo: BaseRepository,
+    state: _WorkerState,
+    ips: set[str],
+) -> None:
+    """Rebuild and persist the attacker profile for each address in *ips*.
+
+    For every IP that has events in the engine this:
+      1. builds the profile record and upserts it,
+      2. backfills the attacker UUID onto that IP's credential rows,
+      3. publishes a ``scored`` event when a publisher is configured,
+      4. upserts the behavioral rollup,
+      5. bumps the SMTP target counter for each victim domain found.
+
+    Steps 2-5 are soft-fail: an error is logged and processing continues.
+    An error from the core upsert in step 1 is not caught here and
+    propagates to the caller.
+
+    Args:
+        repo: Repository used for bounty lookup and all upserts.
+        state: Worker state holding the engine, publisher and PTR bookkeeping.
+        ips: Attacker IPs whose profiles need refreshing.
+    """
+    traversal_map = {t.attacker_ip: t for t in state.engine.traversals(min_deckies=2)}
+    bounties_map = await repo.get_bounties_for_ips(ips)
+
+    # PTR resolution: one shot per IP per worker lifetime. OS resolver
+    # caches, so re-runs on worker restart hit cache instantly for IPs
+    # resolved recently; only never-seen addresses pay the 2s ceiling.
+    # NOTE(review): IPs are marked as attempted *before* the lookup runs, so
+    # if `_resolve_ptrs_for` raises they are not retried by this worker —
+    # confirm that is acceptable.
+    fresh = [ip for ip in ips if ip not in state.ptr_attempted]
+    for ip in fresh:
+        state.ptr_attempted.add(ip)
+    ptrs = await _resolve_ptrs_for(fresh)
+
+    _tracer = _get_tracer("profiler")
+    for ip in ips:
+        # Reads the engine's private per-IP event store directly.
+        events = state.engine._events.get(ip, [])
+        if not events:
+            continue
+
+        with _tracer.start_as_current_span("profiler.process_ip") as _span:
+            _span.set_attribute("attacker_ip", ip)
+            _span.set_attribute("event_count", len(events))
+
+            traversal = traversal_map.get(ip)
+            bounties = bounties_map.get(ip, [])
+            commands = _extract_commands_from_events(events)
+
+            if ip in ptrs:
+                # Freshly resolved this cycle (the value may be None).
+                record = _build_record(
+                    ip, events, traversal, bounties, commands,
+                    ptr_record=ptrs[ip],
+                )
+            else:
+                # Not in ptrs → already attempted in a prior cycle → skip
+                # kwarg so upsert preserves whatever's stored.
+                record = _build_record(ip, events, traversal, bounties, commands)
+            attacker_uuid = await repo.upsert_attacker(record)
+
+            # Backfill Credential.attacker_uuid for every credential row
+            # captured before the profiler had minted this Attacker. The
+            # capture path runs before the profiler — coupling them would
+            # create a chicken-and-egg ordering bug. Soft-fail so a backfill
+            # error never blocks the next attacker.
+            try:
+                await repo.update_credential_attacker_uuid(ip, attacker_uuid)
+            except Exception as exc:
+                _span.record_exception(exc)
+                logger.error("attacker worker: credential backfill failed for %s: %s", ip, exc)
+
+            _span.set_attribute("is_traversal", traversal is not None)
+            _span.set_attribute("bounty_count", len(bounties))
+            _span.set_attribute("command_count", len(commands))
+
+            # Announce the refreshed profile; a publish failure is only a
+            # warning because the database write has already succeeded.
+            if state.publish_attacker is not None:
+                try:
+                    state.publish_attacker("scored", {
+                        "attacker_ip": ip,
+                        "event_count": record["event_count"],
+                        "service_count": record["service_count"],
+                        "decky_count": record["decky_count"],
+                        "bounty_count": record["bounty_count"],
+                        "credential_count": record["credential_count"],
+                        "is_traversal": record["is_traversal"],
+                    })
+                except Exception as exc:
+                    logger.warning("attacker worker: scored publish failed for %s: %s", ip, exc)
+
+            # Behavioral / fingerprint rollup lives in a sibling table so failures
+            # here never block the core attacker profile upsert.
+            try:
+                behavior = build_behavior_record(events)
+                await repo.upsert_attacker_behavior(attacker_uuid, behavior)
+            except Exception as exc:
+                _span.record_exception(exc)
+                logger.error("attacker worker: behavior upsert failed for %s: %s", ip, exc)
+
+            # SMTP victim-domain tracking — extract domains from RCPT events
+            # and upsert one row per (attacker, domain) pair. Same
+            # soft-fail posture as the behavior rollup: errors here must
+            # not block the next attacker.
+            # NOTE(review): `events` appears to be everything the engine holds
+            # for this IP, not just this cycle's new rows, so each domain is
+            # incremented once per cycle in which the IP is touched — confirm
+            # that is the intended meaning of the counter.
+            try:
+                for domain in _extract_smtp_domains(events):
+                    await repo.increment_smtp_target(attacker_uuid, domain)
+            except Exception as exc:
+                _span.record_exception(exc)
+                logger.error("attacker worker: smtp target upsert failed for %s: %s", ip, exc)
+
+
+_UNSET = object()  # sentinel — distinguishes "not passed" from "None"
+
+
+def _build_record(
+    ip: str,
+    events: list[LogEvent],
+    traversal: Any,
+    bounties: list[dict[str, Any]],
+    commands: list[dict[str, Any]],
+    *,
+    ptr_record: Any = _UNSET,
+) -> dict[str, Any]:
+    """Assemble the attacker row for *ip* from its events and bounties.
+
+    List-valued fields (services, deckies, fingerprints, commands) are stored
+    as JSON strings.  The decky list comes from *traversal* when one exists,
+    otherwise from first-contact order of the events.
+
+    The ``ptr_record`` key is present in the result only when the caller
+    passed the keyword (None included); leaving it out lets the upsert keep
+    whatever value the row already has.
+    """
+    service_names = sorted({ev.service for ev in events})
+
+    if traversal:
+        decky_names = traversal.deckies
+    else:
+        decky_names = _first_contact_deckies(events)
+
+    # Split the bounties by type in a single pass.
+    fingerprint_bounties: list[dict[str, Any]] = []
+    credential_total = 0
+    for bounty in bounties:
+        bounty_type = bounty.get("bounty_type")
+        if bounty_type == "fingerprint":
+            fingerprint_bounties.append(bounty)
+        elif bounty_type == "credential":
+            credential_total += 1
+
+    country_code, country_source = enrich_ip(ip)
+    asn, as_name, asn_source = enrich_ip_asn(ip)
+
+    timestamps = [ev.timestamp for ev in events]
+    distinct_deckies = {ev.decky for ev in events}
+
+    record: dict[str, Any] = {}
+    record["ip"] = ip
+    record["first_seen"] = min(timestamps)
+    record["last_seen"] = max(timestamps)
+    record["event_count"] = len(events)
+    record["service_count"] = len(service_names)
+    record["decky_count"] = len(distinct_deckies)
+    record["services"] = json.dumps(service_names)
+    record["deckies"] = json.dumps(decky_names)
+    record["traversal_path"] = traversal.path if traversal else None
+    record["is_traversal"] = traversal is not None
+    record["bounty_count"] = len(bounties)
+    record["credential_count"] = credential_total
+    record["fingerprints"] = json.dumps(fingerprint_bounties)
+    record["commands"] = json.dumps(commands)
+    record["country_code"] = country_code
+    record["country_source"] = country_source
+    record["asn"] = asn
+    record["as_name"] = as_name
+    record["asn_source"] = asn_source
+    record["updated_at"] = datetime.now(timezone.utc)
+
+    # Only an explicitly supplied PTR value is written; the sentinel default
+    # means "leave the stored value alone".
+    if ptr_record is not _UNSET:
+        record["ptr_record"] = ptr_record
+    return record
+
+
+def _first_contact_deckies(events: list[LogEvent]) -> list[str]:
+    """Return unique deckies in first-contact order (for non-traversal attackers).
+
+    Events are ordered by timestamp (ties keep their input order) and each
+    decky is listed once, at the position of its earliest event.  An empty
+    input gives an empty list.
+    """
+    chronological = sorted(events, key=lambda ev: ev.timestamp)
+    # dict keys keep insertion order, so fromkeys() de-duplicates in a single
+    # pass instead of re-scanning a list for every event.
+    return list(dict.fromkeys(ev.decky for ev in chronological))
+
+
+def _extract_commands_from_events(events: list[LogEvent]) -> list[dict[str, Any]]:
+    """
+    Collect the commands/queries an attacker executed, in event order.
+
+    Only events whose type is in `_COMMAND_EVENT_TYPES` are considered.  The
+    command text is the first non-empty value among `_COMMAND_FIELDS`, read
+    straight from `LogEvent.fields`; events with no such value are dropped.
+    Each entry holds the service, decky, command text and ISO timestamp.
+    """
+    extracted: list[dict[str, Any]] = []
+    for ev in events:
+        if ev.event_type in _COMMAND_EVENT_TYPES:
+            # Lazily probe the candidate fields and stop at the first hit.
+            text = next(
+                (str(value) for value in map(ev.fields.get, _COMMAND_FIELDS) if value),
+                None,
+            )
+            if text:
+                extracted.append({
+                    "service": ev.service,
+                    "decky": ev.decky,
+                    "command": text,
+                    "timestamp": ev.timestamp.isoformat(),
+                })
+    return extracted
+
+
+_SMTP_ADDR_RE = re.compile(r"<?([^\s<>@]+)@([A-Za-z0-9.-]+\.[A-Za-z]{2,})>?")
+
+
+def _normalize_smtp_domain(raw: str) -> str | None:
+    """Pull the lowercased domain out of an envelope-address fragment.
+
+    Gives None for empty input, for text that contains no email address, and
+    for domains whose last label is in `_BLOCKED_TLDS`.  The local-part (the
+    bit before `@`) is deliberately discarded: only the targeted
+    organisation's domain is kept, never user-identifying data.
+    """
+    if not raw:
+        return None
+
+    found = _SMTP_ADDR_RE.search(raw.strip())
+    domain = found.group(2).lower().strip(".") if found else ""
+    if not domain:
+        return None
+
+    # Everything after the final dot (or the whole string if there is none).
+    _, _, last_label = domain.rpartition(".")
+    return None if last_label in _BLOCKED_TLDS else domain
+
+
+def _extract_smtp_domains(events: list[LogEvent]) -> set[str]:
+    """Return the distinct victim domains an attacker addressed over SMTP.
+
+    Only `smtp` events whose type is in `_SMTP_RCPT_EVENTS` contribute.
+    `message_accepted` supplies a comma-joined recipient list in its
+    `rcpt_to` field; the other event types supply one address in `value`.
+    A set is returned on purpose: repeated hits on one domain collapse to a
+    single entry, so the caller increments each domain once per call.
+    """
+    found: set[str] = set()
+    for ev in events:
+        if not (ev.service == "smtp" and ev.event_type in _SMTP_RCPT_EVENTS):
+            continue
+
+        if ev.event_type != "message_accepted":
+            fragments = [ev.fields.get("value", "")]
+        else:
+            joined = ev.fields.get("rcpt_to", "")
+            fragments = joined.split(",") if joined else []
+
+        # Drop fragments that did not normalise to a usable domain.
+        found.update(filter(None, map(_normalize_smtp_domain, fragments)))
+    return found