merge: testing → main (reconcile 2-week divergence)

2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions
--- a/decnet/clustering/impl/__init__.py
+++ b/decnet/clustering/impl/__init__.py
@@ -0,0 +1,6 @@
+"""Concrete clusterer implementations.
+
+Each clusterer module defines one :class:`~decnet.clustering.base.Clusterer`
+subclass (helpers such as ``similarity`` sit alongside). New implementations
+register themselves in :func:`decnet.clustering.factory.get_clusterer`.
+"""
--- a/decnet/clustering/impl/connected_components.py
+++ b/decnet/clustering/impl/connected_components.py
@@ -0,0 +1,379 @@
+"""Connected-components identity clusterer (v1).
+
+Builds a similarity graph over observations (per-IP attacker rows),
+runs union-find over edges that pass a confidence threshold, and writes
+one ``attacker_identities`` row per component.
+
+**v1 signal coverage:**
+
+* Edges come from ``combined_edge_weight`` (all four tiers), but only a
+  high-tier match — JA3 / HASSH / payload-hash / C2-endpoint — is
+  enough on its own to reach the threshold. The production tick
+  currently projects JA3 + HASSH + ASN only; payload, C2, credentials
+  and commands are not projected yet and join in later commits. The
+  fixture tests exercise the full signal set through the in-memory path.
+
+Phase-handoff edges land in a later commit. Edges MUST stay
+time-agnostic — fixture 7 forbids recency-decay clustering.
+
+**v1 behavior:**
+
+The clusterer assigns identities to NULL observations, merges existing
+identities when a single predicted component spans them, and revokes
+prior merges when the predicted component splits a merged-out identity
+away from its winner. Observations stay FK'd to their original identity
+row throughout — merges are soft pointers via
+``attacker_identities.merged_into_uuid``, never observation re-points.
+That keeps the audit trail intact and lets cached subscribers resolve
+merged-out UUIDs through the chain.
+"""
+from __future__ import annotations
+
+import json
+import uuid as _uuid
+from datetime import datetime, timezone
+from typing import Any, Iterable, Optional
+
+from decnet.clustering.base import Clusterer, ClusterResult
+from decnet.clustering.impl.similarity import (
+    EDGE_THRESHOLD,
+    Observation,
+    combined_edge_weight,
+)
+from decnet.logging import get_logger
+from decnet.profiler.identity_rollup import extract_fp_summaries
+from decnet.web.db.repository import BaseRepository
+
+log = get_logger("clustering.connected_components")
+
+
+def cluster_observations(
+    observations: Iterable[Observation],
+) -> dict[str, str]:
+    """Partition observations into connected components of the edge graph.
+
+    An edge joins two observations when ``combined_edge_weight`` reaches
+    ``EDGE_THRESHOLD``; components are found with union-find. Pure: no
+    DB, no clock, no I/O, so fixture tests and ``tick`` can share it.
+
+    Each component is labelled after its smallest ``observation_id``, so
+    labels do not depend on input order. Singletons get their own label.
+
+    Returns ``{observation_id: cluster_id}``. Cluster ids are opaque
+    strings — callers must not rely on their format.
+    """
+    obs_list = list(observations)
+    parent: dict[str, str] = {o.observation_id: o.observation_id for o in obs_list}
+
+    def find(x: str) -> str:
+        # Path halving: re-point each visited node at its grandparent.
+        while parent[x] != x:
+            parent[x] = parent[parent[x]]
+            x = parent[x]
+        return x
+
+    def union(x: str, y: str) -> None:
+        # The smaller root always survives, so a component's root is its
+        # smallest id regardless of the order edges are discovered in.
+        lo, hi = sorted((find(x), find(y)))
+        parent[hi] = lo
+
+    for i, a in enumerate(obs_list):
+        for b in obs_list[i + 1:]:
+            if combined_edge_weight(a, b) >= EDGE_THRESHOLD:
+                union(a.observation_id, b.observation_id)
+
+    # Every root is its component's minimum, which makes the labels canonical.
+    return {o.observation_id: f"cc-{find(o.observation_id)}" for o in obs_list}
+
+
+def from_attacker_row(row: dict[str, Any]) -> Observation:
+    """Project an ``Attacker`` row dict into an :class:`Observation`.
+
+    Reads JA3 / HASSH from ``row["fingerprints"]`` (a JSON string or an
+    already-decoded list of entry dicts with ``kind`` and ``hash`` /
+    ``value``). When several entries share a kind the last one in the
+    list wins, since :class:`Observation` keeps one value per slot.
+
+    Malformed fingerprints never raise: bad JSON, a non-list payload,
+    non-dict entries and hash-less entries are ignored. ``row["uuid"]``
+    is required. Payload, C2, credentials and commands stay empty.
+    """
+    raw = row.get("fingerprints") or "[]"
+    try:
+        entries = json.loads(raw) if isinstance(raw, str) else list(raw)
+    except (TypeError, ValueError):
+        entries = []
+    if not isinstance(entries, list):
+        # e.g. ``null`` or a bare number: valid JSON, but not a list.
+        entries = []
+
+    ja3: Optional[str] = None
+    hassh: Optional[str] = None
+    for entry in entries:
+        if not isinstance(entry, dict):
+            continue
+        kind = entry.get("kind")
+        h = entry.get("hash") or entry.get("value")
+        if not h:
+            continue
+        if kind == "ja3":
+            ja3 = h
+        elif kind == "hassh":
+            hassh = h
+
+    return Observation(
+        observation_id=row["uuid"],
+        ja3=ja3,
+        hassh=hassh,
+        asn=row.get("asn"),
+    )
+
+
+class ConnectedComponentsClusterer(Clusterer):
+    """Connected-components clusterer; see the module docstring for scope."""
+
+    name = "connected_components"
+
+    async def tick(self, repo: BaseRepository) -> ClusterResult:
+        """Run one clustering pass and report what changed.
+
+        Reads attacker rows and identities from ``repo``, clusters the
+        rows, then (pass 1) mints / merges identities and links
+        unassigned rows, and (pass 2) undoes merges the fresh labels
+        contradict. Failed repo calls are logged and skipped; a failed
+        read returns a default-constructed :class:`ClusterResult`.
+        """
+        try:
+            rows = await repo.list_attackers_for_clustering()
+        except Exception:  # noqa: BLE001
+            log.exception("clusterer: failed to read attackers")
+            return ClusterResult()
+
+        if not rows:
+            return ClusterResult()
+
+        # Resolve every identity to its canonical winner up front:
+        # ``identity_chain[u]`` is the winner reached by following u's
+        # merged_into_uuid chain (``u`` itself when not merged out).
+        try:
+            all_identities = await repo.list_all_identities()
+        except Exception:  # noqa: BLE001
+            log.exception("clusterer: failed to read identities")
+            return ClusterResult()
+        identity_chain = _build_merge_chain(all_identities)
+
+        # Project each row to an Observation, then label the components.
+        observations: list[Observation] = []
+        row_by_id: dict[str, dict[str, Any]] = {}
+        for r in rows:
+            obs = from_attacker_row(r)
+            observations.append(obs)
+            row_by_id[obs.observation_id] = r
+        labels = cluster_observations(observations)
+
+        # Invert the labels: cluster id -> member observation ids.
+        components: dict[str, list[str]] = {}
+        for obs_id, cluster_id in labels.items():
+            components.setdefault(cluster_id, []).append(obs_id)
+
+        result = ClusterResult()
+        now = datetime.now(timezone.utc)
+
+        # Pass 1 — per-component reconciliation: form, link, merge.
+        for member_ids in components.values():
+            literal_ids = {
+                row_by_id[m]["identity_id"] for m in member_ids
+                if row_by_id[m].get("identity_id")
+            }
+            effective_ids = {identity_chain.get(i, i) for i in literal_ids}
+            unassigned = [
+                m for m in member_ids
+                if not row_by_id[m].get("identity_id")
+            ]
+
+            if not effective_ids:
+                # No member carries an identity yet — mint a fresh one.
+                identity_uuid = str(_uuid.uuid4())
+                try:
+                    await repo.create_attacker_identity({
+                        "uuid": identity_uuid,
+                        "schema_version": 1,
+                        "first_seen_at": now,
+                        "last_seen_at": now,
+                        "created_at": now,
+                        "updated_at": now,
+                        "observation_count": len(member_ids),
+                    })
+                except Exception:  # noqa: BLE001
+                    log.exception(
+                        "clusterer: failed to create identity for component %s",
+                        member_ids,
+                    )
+                    continue
+
+                linked: list[str] = []
+                for obs_id in member_ids:
+                    if await _link(repo, obs_id, identity_uuid):
+                        linked.append(obs_id)
+                if linked:
+                    result.identities_formed.append({
+                        "identity_uuid": identity_uuid,
+                        "observation_uuids": linked,
+                    })
+                await _roll_up_fingerprints(
+                    repo, identity_uuid, [row_by_id[m] for m in member_ids],
+                )
+                continue
+
+            # Lowest uuid string wins: deterministic across runs and
+            # independent of row insertion order.
+            winner_uuid = min(effective_ids)
+            losers = effective_ids - {winner_uuid}
+
+            for loser_uuid in losers:
+                try:
+                    await repo.update_identity_merged_into(loser_uuid, winner_uuid)
+                except Exception:  # noqa: BLE001
+                    log.exception(
+                        "clusterer: failed to merge %s -> %s",
+                        loser_uuid, winner_uuid,
+                    )
+                    continue
+                identity_chain[loser_uuid] = winner_uuid
+                result.identities_merged.append({
+                    "winner_uuid": winner_uuid,
+                    "loser_uuid": loser_uuid,
+                })
+
+            # Attach unassigned rows to the winner so the next tick sees
+            # a single-identity component with nothing left to link.
+            for obs_id in unassigned:
+                if await _link(repo, obs_id, winner_uuid):
+                    result.observations_linked.append({
+                        "identity_uuid": winner_uuid,
+                        "observation_uuid": obs_id,
+                    })
+
+            # Re-roll the winner's fingerprint summary from every row in
+            # this component, loser-side rows included.
+            # NOTE(review): rows the winner owns in *other* components are
+            # not passed here — confirm the repo call merges, not replaces.
+            await _roll_up_fingerprints(
+                repo, winner_uuid, [row_by_id[m] for m in member_ids],
+            )
+
+        # Pass 2 — revocable-merge undo. A merged-out identity whose rows
+        # no longer share any cluster label with its winner's rows has
+        # been contradicted by new evidence: clear merged_into_uuid and
+        # record the unmerge. Rows FK'd to the resurrected loser are left
+        # untouched; the chain simply stops following.
+        observations_by_literal_identity: dict[str, list[str]] = {}
+        for obs_id, r in row_by_id.items():
+            iid = r.get("identity_id")
+            if iid:
+                observations_by_literal_identity.setdefault(iid, []).append(obs_id)
+
+        for identity_row in all_identities:
+            if not identity_row.get("merged_into_uuid"):
+                continue
+            loser_uuid = identity_row["uuid"]
+            winner_uuid = identity_chain.get(loser_uuid, loser_uuid)
+            if winner_uuid == loser_uuid:
+                continue  # chain resolves back to itself — nothing to undo
+            loser_obs = observations_by_literal_identity.get(loser_uuid, [])
+            winner_obs = observations_by_literal_identity.get(winner_uuid, [])
+            if not loser_obs or not winner_obs:
+                # One side has no rows in this batch — can't disprove the merge.
+                continue
+            loser_clusters = {labels[o] for o in loser_obs}
+            winner_clusters = {labels[o] for o in winner_obs}
+            if loser_clusters & winner_clusters:
+                continue  # at least one shared cluster label — merge stands
+            try:
+                await repo.update_identity_merged_into(loser_uuid, None)
+            except Exception:  # noqa: BLE001
+                log.exception(
+                    "clusterer: failed to unmerge %s from %s",
+                    loser_uuid, winner_uuid,
+                )
+                continue
+            identity_chain[loser_uuid] = loser_uuid
+            result.identities_unmerged.append({
+                "resurrected_uuid": loser_uuid,
+                "former_winner_uuid": winner_uuid,
+            })
+
+        return result
+
+
+def _build_merge_chain(
+    identities: list[dict[str, Any]],
+) -> dict[str, str]:
+    """Map every identity uuid to the canonical winner of its merge chain.
+
+    ``merged_into_uuid`` is followed to the end of the chain, however
+    long. A pointer to an unknown uuid ends the walk at that uuid; a
+    cycle resolves to the smallest uuid on it. Unmerged rows map to self.
+    """
+    by_uuid: dict[str, dict[str, Any]] = {i["uuid"]: i for i in identities}
+    chain: dict[str, str] = {}
+    for uuid_ in by_uuid:
+        path = [uuid_]  # uuids visited so far, in order; used to spot cycles
+        cur = uuid_
+        while cur in by_uuid:
+            nxt = by_uuid[cur].get("merged_into_uuid")
+            if not nxt or nxt == cur:
+                break
+            if nxt in path:
+                cur = min(path[path.index(nxt):])
+                break
+            path.append(nxt)
+            cur = nxt
+        chain[uuid_] = cur
+    return chain
+
+
+async def _link(
+    repo: BaseRepository, observation_uuid: str, identity_uuid: str,
+) -> bool:
+    """Point one attacker row at an identity; report whether it stuck.
+
+    Any repo failure is logged with its traceback and turned into
+    ``False`` so callers can keep going and simply skip the row.
+    """
+    try:
+        await repo.set_attacker_identity_id(observation_uuid, identity_uuid)
+    except Exception:  # noqa: BLE001
+        log.exception(
+            "clusterer: failed to link obs=%s -> identity=%s",
+            observation_uuid, identity_uuid,
+        )
+        return False
+    return True
+
+
+async def _roll_up_fingerprints(
+    repo: BaseRepository,
+    identity_uuid: str,
+    member_rows: list[dict[str, Any]],
+) -> None:
+    """Write the members' combined fingerprint summary onto the identity.
+
+    Best-effort: a failed write is logged and swallowed, leaving the
+    summary columns stale until a later pass rewrites them."""
+    fp_columns = extract_fp_summaries(member_rows)
+    try:
+        await repo.update_identity_fingerprints(identity_uuid, **fp_columns)
+    except Exception:  # noqa: BLE001
+        log.exception(
+            "clusterer: failed to roll up fingerprints for identity=%s",
+            identity_uuid,
+        )
+
+
+__all__ = [
+    "ConnectedComponentsClusterer",
+    "cluster_observations",
+    "from_attacker_row",
+]
--- a/decnet/clustering/impl/similarity.py
+++ b/decnet/clustering/impl/similarity.py
@@ -0,0 +1,313 @@
+"""Similarity-graph primitives for the connected-components clusterer.
+
+Each function takes two :class:`Observation` projections and returns a
+similarity score in ``[0.0, 1.0]``. :func:`combined_edge_weight` folds
+them into a single edge weight; the connected-components impl applies
+:data:`EDGE_THRESHOLD` to it and runs union-find.
+
+**Time-agnostic.** Edges MUST NOT depend on observation timestamps.
+Fixture 7 (``slow_burn``) proves recency-decay clustering fragments
+multi-month APT campaigns; the production graph cannot silently expire
+old edges. Timestamps are still useful for *audit* (the ``first_seen``
+on the resulting identity row) but never for *similarity*.
+
+**Weight tiers** (from `development/IDENTITY_RESOLUTION.md`):
+
+* High — JA3 / HASSH / payload-hash / C2-callback exact match. Stable
+  signals an attacker can't cheaply rotate. A single high-tier match
+  supports identity strongly.
+* Medium — command-sequence Jaccard, bucketed by UKC phase. Tooling
+  habits leak through command order; phase-bucketing avoids comparing
+  a Discovery cmd-list to an Exploitation one.
+* Low — credential-attempt-set Jaccard. Defeated alone by fixture 1
+  (``shared_wordlist``) where two campaigns share rockyou but diverge
+  on infra.
+* Very low — ASN match. Defeated alone by fixture 2 (``vpn_hopping``)
+  where one identity rotates across many ASNs.
+
+The functions are pure (no DB, no I/O); the worker maps observations
+into :class:`Observation` once per tick and feeds these into the
+graph builder.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Mapping, Optional
+
+# ─── Observation projection ─────────────────────────────────────────────────
+
+
+@dataclass(frozen=True)
+class Observation:
+    """Minimal projection of a per-IP attacker observation.
+
+    Built once per ``Attacker`` row by the worker, or per
+    ``SyntheticAttacker`` in tests via :func:`from_synthetic`. A
+    tight projection shields the graph code from schema drift.
+
+    Instances are immutable and hashable: set-typed fields are
+    :class:`frozenset`; the ``commands_by_phase`` mapping takes
+    part in equality but is left out of the hash.
+    """
+
+    observation_id: str
+    """Stable ID: ``Attacker.uuid`` in production, ``attacker_id`` in tests."""
+
+    ja3: Optional[str] = None
+    hassh: Optional[str] = None
+    asn: Optional[int] = None
+
+    payload_hashes: frozenset[str] = field(default_factory=frozenset)
+    c2_endpoints: frozenset[str] = field(default_factory=frozenset)
+    credentials: frozenset[tuple[str, str]] = field(default_factory=frozenset)
+
+    commands_by_phase: Mapping[str, tuple[str, ...]] = field(
+        default_factory=dict, hash=False,  # a dict cannot be hashed
+    )
+    """``UKCPhase.value`` → ordered commands seen in that phase; {} if none."""
+
+
+# ─── Edge functions ─────────────────────────────────────────────────────────
+
+
+def _fingerprints_fully_disagree(a: Observation, b: Observation) -> bool:
+    """Report whether every comparable fingerprint slot differs.
+
+    A slot (JA3 or HASSH) is comparable only when both observations
+    carry a non-null value for it. The result is ``True`` when at
+    least one slot is comparable and no comparable slot matches.
+
+    ``False`` therefore covers two cases: nothing to compare (a null on
+    either side of both slots — no evidence, no veto), and at least
+    one comparable slot agreeing.
+
+    :func:`high_weight_edge` uses this as a soft veto on shared
+    payload / C2 signals: distinct stable TLS + SSH stacks mean a
+    shared C2 endpoint is a *campaign*-level signal (cooperating
+    operators), not an identity-level one. Fixture 5
+    (``multi_operator``) is the canonical demonstration.
+    """
+    verdicts = [
+        left == right
+        for left, right in ((a.ja3, b.ja3), (a.hassh, b.hassh))
+        if left is not None and right is not None
+    ]
+    # An empty list means nothing was comparable; any ``True`` means a
+    # slot agrees. Either way there is no full disagreement.
+    return bool(verdicts) and not any(verdicts)
+
+
+def high_weight_edge(a: Observation, b: Observation) -> float:
+    """Score the high tier: JA3 / HASSH / payload-hash / C2 exact match.
+
+    The result is binary — ``1.0`` or ``0.0`` — and is decided in order:
+
+    1. Equal, non-null JA3 or HASSH → ``1.0``. These are tooling
+       signals, the kind the design doc calls "stable signals an
+       attacker can't cheaply rotate", so one match is enough.
+    2. Otherwise, if every comparable fingerprint slot disagrees
+       → ``0.0``, whatever the infra signals say (the veto).
+    3. Otherwise, a shared payload hash or a shared C2 endpoint
+       → ``1.0``; no overlap → ``0.0``.
+
+    **Why the veto.** Payload and C2 are infra signals that two
+    cooperating operators (different identities) can share, while
+    JA3 + HASSH differ when the operators really are different
+    humans with different tool stacks. Zeroing the payload/C2
+    contribution on full fingerprint disagreement stops a
+    campaign-level co-op signal from fusing distinct identities.
+    Fixture 5 (``multi_operator``) demonstrates it: shared stage-1
+    payload + shared C2, distinct JA3/HASSH — two identities.
+
+    JA4 will join this tier as a sibling of JA3 once the prober
+    emits it (``ATTACKER_FINGERPRINTED`` already carries a JA4 slot
+    in ``AttackerIdentity``); the function shape doesn't change.
+    """
+    for mine, theirs in ((a.ja3, b.ja3), (a.hassh, b.hassh)):
+        if mine is not None and mine == theirs:
+            return 1.0
+    if _fingerprints_fully_disagree(a, b):
+        # Stable-tool disagreement vetoes shared-infra signals.
+        return 0.0
+    infra_pairs = (
+        (a.payload_hashes, b.payload_hashes),
+        (a.c2_endpoints, b.c2_endpoints),
+    )
+    shared_infra = any(x and y and (x & y) for x, y in infra_pairs)
+    return 1.0 if shared_infra else 0.0
+
+
+def medium_weight_edge(a: Observation, b: Observation) -> float:
+    """Score the medium tier: best per-phase command-set Jaccard.
+
+    Only UKC phases present on *both* observations are compared —
+    a Discovery command list says nothing about an Exploitation
+    one. Within a shared phase each command sequence is collapsed
+    to a set (order and repeats are ignored for now; the order
+    signal is reserved for a future feature) and scored as
+    ``|A ∩ B| / |A ∪ B|``.
+
+    The result is the **maximum** over the shared phases, so one
+    strong phase match isn't averaged away by another phase where
+    the actors diverge.
+
+    Example: ``{"recon": ("ls", "id")}`` against
+    ``{"recon": ("id", "ps")}`` shares one command of three → ``1/3``.
+
+    Returns ``0.0`` when no phase is observed on both sides, or
+    when every shared phase is empty on both sides.
+    """
+    def jaccard(left: set[str], right: set[str]) -> float:
+        """Jaccard index of two sets; ``0.0`` when both are empty."""
+        either = left | right
+        return len(left & right) / len(either) if either else 0.0
+
+    # One score per phase seen on both sides; a phase seen on only one
+    # side is never compared.
+    scores = [
+        jaccard(set(a.commands_by_phase[p]), set(b.commands_by_phase[p]))
+        for p in set(a.commands_by_phase) & set(b.commands_by_phase)
+    ]
+    return max(scores, default=0.0)
+
+
+def low_weight_edge(a: Observation, b: Observation) -> float:
+    """Score the low tier: Jaccard of attempted credential pairs.
+
+    Compares the two sets of ``(username, password)`` tuples. Campaigns
+    burning the same wordlist score high here (fixture 1), so the
+    signal is dangerous in isolation; the clusterer only combines it
+    with other tiers and never lets it cross the threshold alone.
+
+    Example: ``{("root", "toor")}`` against
+    ``{("root", "toor"), ("admin", "admin")}`` scores ``0.5``.
+
+    Returns ``0.0`` when either side attempted no credentials.
+    """
+    mine, theirs = a.credentials, b.credentials
+    if mine and theirs:
+        # Both sets are non-empty, so the union cannot be empty.
+        return len(mine & theirs) / len(mine | theirs)
+    return 0.0
+
+
+def very_low_weight_edge(a: Observation, b: Observation) -> float:
+    """Score the very-low tier: ``1.0`` on ASN equality, else ``0.0``.
+
+    A null ASN on either side never matches. ASN-only clustering is
+    a known failure mode — fixture 2 (``vpn_hopping``) has a single
+    identity rotating across many ASNs — so the combination logic
+    weights this tier too low to cross threshold by itself.
+    """
+    both_known = a.asn is not None and b.asn is not None
+    if both_known and a.asn == b.asn:
+        return 1.0
+    return 0.0
+
+
+# ─── Combined weight ────────────────────────────────────────────────────────
+
+#: Tier multipliers applied to the per-tier edge scores when combining
+#: into a single weight. Tuned so that:
+#:
+#: * High-tier agreement alone (1.0) meets the 1.0 threshold.
+#: * Medium-tier alone (max 1.0) yields 0.6 — below threshold.
+#: * Low-tier alone (max 1.0) yields 0.2 — defeats fixture 1's
+#:   credential-overlap-only failure mode.
+#: * Very-low alone (max 1.0) yields 0.05 — defeats fixture 2's
+#:   ASN-rotation failure mode.
+#:
+#: The three weaker tiers sum to at most 0.85, so no combination of
+#: them reaches the threshold without a high-tier match. Keep that
+#: invariant when retuning: ratios matter more than absolute values.
+TIER_WEIGHTS = {
+    "high": 1.0,
+    "medium": 0.6,
+    "low": 0.2,
+    "very_low": 0.05,
+}
+
+#: Minimum combined edge weight (inclusive) for a pair to become an
+#: edge of the similarity graph. The connected-components impl only
+#: unions pairs whose weight is at or above this value.
+EDGE_THRESHOLD = 1.0
+
+
+def combined_edge_weight(a: Observation, b: Observation) -> float:
+    """Combine the four tier scores into a single edge weight.
+
+    Each tier function returns a score in ``[0, 1]``; every score is
+    multiplied by its :data:`TIER_WEIGHTS` entry and the products are
+    added. With the current weights only a high-tier match can reach
+    :data:`EDGE_THRESHOLD` — the three weaker tiers together top out
+    at ``0.6 + 0.2 + 0.05``.
+
+    All four tiers are evaluated on every call. A tier whose inputs
+    are missing on either observation contributes ``0.0``, so
+    sparsely populated observations need no special casing.
+
+    Phase-handoff edges are a separate edge family, not a tier, and
+    are not part of this sum.
+
+    The connected-components clusterer compares the result against
+    :data:`EDGE_THRESHOLD` to decide whether to draw an edge. Pure
+    and time-agnostic — fixture 7 forbids recency-decay weighting.
+    """
+    high = TIER_WEIGHTS["high"] * high_weight_edge(a, b)
+    medium = TIER_WEIGHTS["medium"] * medium_weight_edge(a, b)
+    low = TIER_WEIGHTS["low"] * low_weight_edge(a, b)
+    very_low = TIER_WEIGHTS["very_low"] * very_low_weight_edge(a, b)
+
+    # Added left to right, strongest tier first.
+    return high + medium + low + very_low
+
+
+# ─── Adapter for the synthetic-corpus tests ─────────────────────────────────
+
+
+def from_synthetic(att) -> Observation:  # type: ignore[no-untyped-def]
+    """Build an :class:`Observation` from a ``SyntheticAttacker``.
+
+    Fingerprints and ASN are copied from the attacker itself; the
+    rest is gathered across ``att.sessions``: distinct payload
+    hashes, distinct C2 callbacks, distinct credential pairs, and
+    commands concatenated per phase in session order. Sessions
+    without commands do not create a phase entry.
+
+    Lives here so test code doesn't import the factory shape into
+    the production module — the adapter is a documented
+    integration point, imported lazily by callers.
+    """
+    sessions = list(att.sessions)
+    payloads = frozenset(s.payload_hash for s in sessions if s.payload_hash)
+    callbacks = frozenset(s.c2_callback for s in sessions if s.c2_callback)
+    creds = frozenset(tuple(c) for s in sessions for c in s.credentials_tried)
+
+    by_phase: dict[str, list[str]] = {}
+    for s in sessions:
+        if s.commands:
+            by_phase.setdefault(s.phase.value, []).extend(s.commands)
+
+    return Observation(
+        observation_id=att.attacker_id,
+        ja3=att.ja3,
+        hassh=att.hassh,
+        asn=att.asn,
+        payload_hashes=payloads,
+        c2_endpoints=callbacks,
+        credentials=creds,
+        commands_by_phase={k: tuple(v) for k, v in by_phase.items()},
+    )
+
+
+__all__ = [
+    "Observation",
+    "high_weight_edge",
+    "medium_weight_edge",
+    "low_weight_edge",
+    "very_low_weight_edge",
+    "combined_edge_weight",
+    "from_synthetic",
+    "EDGE_THRESHOLD",
+    "TIER_WEIGHTS",
+]