merge: testing → main (reconcile 2-week divergence)

2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions
--- a/decnet/clustering/__init__.py
+++ b/decnet/clustering/__init__.py
@@ -0,0 +1 @@
+"""Campaign clustering — see development/CAMPAIGN_CLUSTERING.md."""
--- a/decnet/clustering/base.py
+++ b/decnet/clustering/base.py
@@ -0,0 +1,83 @@
+"""Identity-resolution clusterer protocol.
+
+Each concrete clusterer (``decnet.clustering.impl.connected_components``,
+and any future variant) implements this. Callers must obtain the active
+clusterer via :func:`decnet.clustering.factory.get_clusterer` — never
+instantiate a concrete class directly.
+
+The clusterer mirrors the provider-subpackage convention used by
+:mod:`decnet.bus` and :mod:`decnet.web.db`: ``base.py`` defines the
+protocol, ``factory.py`` dispatches on ``DECNET_CLUSTERER_TYPE``, and
+``impl/`` holds concrete implementations.
+
+Distinct from the ``tests/factories/campaign_factory.py`` namespace —
+that's the synthetic-data DSL used by the fixture suite. The clusterer
+here is the production worker that the fixture suite *gates*.
+"""
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any
+
+from decnet.web.db.repository import BaseRepository
+
+
+@dataclass
+class ClusterResult:
+    """Side-effects produced by a single clusterer ``tick``.
+
+    The worker shell consumes these to publish on the bus
+    (``identity.formed`` / ``identity.observation.linked`` /
+    ``identity.merged`` / ``identity.unmerged``). The clusterer itself
+    has already committed any DB writes by the time it returns this —
+    losing a publish is at most a few seconds of UI latency.
+
+    Every field defaults to a fresh empty list, so a bare
+    ``ClusterResult()`` is the "nothing to publish" value.
+    """
+
+    # All four fields use ``default_factory=list`` so that instances never
+    # share one mutable list between them.
+    identities_formed: list[dict[str, Any]] = field(default_factory=list)
+    """One dict per newly created identity. Shape:
+    ``{"identity_uuid": str, "observation_uuids": [str, ...]}``."""
+
+    observations_linked: list[dict[str, Any]] = field(default_factory=list)
+    """One dict per observation attached to an existing identity. Shape:
+    ``{"identity_uuid": str, "observation_uuid": str}``."""
+
+    identities_merged: list[dict[str, Any]] = field(default_factory=list)
+    """One dict per merge. Shape: ``{"winner_uuid": str,
+    "loser_uuid": str}``."""
+
+    identities_unmerged: list[dict[str, Any]] = field(default_factory=list)
+    """One dict per revoked merge (contradicting evidence re-split a
+    previously-merged pair). Shape:
+    ``{"resurrected_uuid": str, "former_winner_uuid": str}``.
+
+    Reserved for the revocable-merge work; the skeleton clusterer never
+    produces these. Subscribers on ``identity.>`` should still handle
+    them from day one — see ``identity.unmerged`` in
+    :mod:`decnet.bus.topics`.
+    """
+
+
+class Clusterer(ABC):
+    """Abstract identity-resolution clusterer.
+
+    Single-method contract: ``tick`` reads pending observations from the
+    repo, runs a clustering pass, commits ``attacker_identities`` rows +
+    sets ``attackers.identity_id``, and returns a :class:`ClusterResult`
+    summarising the side-effects so the worker shell can publish.
+
+    Implementations MUST NOT raise from ``tick``: a single bad pass
+    cannot be allowed to crash the worker. Internal failures should be
+    logged and the method should return an empty :class:`ClusterResult`.
+    """
+
+    #: Short tag — surfaces in logs and in
+    #: ``DECNET_CLUSTERER_TYPE`` for factory dispatch.
+    #: Declared here as an annotation only (no value is assigned), so each
+    #: concrete subclass has to provide its own.
+    name: str
+
+    @abstractmethod
+    async def tick(self, repo: BaseRepository) -> ClusterResult:
+        """Run a single clustering pass. See class docstring.
+
+        Args:
+            repo: Repository the pass reads from and writes to.
+
+        Returns:
+            The side-effects of the pass; an empty :class:`ClusterResult`
+            when nothing happened or the pass failed internally.
+        """
+
+
+__all__ = ["Clusterer", "ClusterResult"]
--- a/decnet/clustering/campaign/__init__.py
+++ b/decnet/clustering/campaign/__init__.py
@@ -0,0 +1,5 @@
+"""Campaign clusterer — groups resolved identities into operations.
+
+The layer above identity resolution. See
+``development/CAMPAIGN_CLUSTERING.md`` for the signal taxonomy.
+"""
--- a/decnet/clustering/campaign/base.py
+++ b/decnet/clustering/campaign/base.py
@@ -0,0 +1,66 @@
+"""Campaign clusterer protocol — layer above identity resolution.
+
+Mirrors :mod:`decnet.clustering.base` for the layer above. Each concrete
+campaign clusterer implements :class:`CampaignClusterer`; callers obtain
+the active instance via
+:func:`decnet.clustering.campaign.factory.get_campaign_clusterer`.
+
+The result shape parallels :class:`ClusterResult` but speaks campaign
+vocabulary: campaigns formed, identities assigned, campaigns merged,
+campaigns unmerged.
+"""
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any
+
+from decnet.web.db.repository import BaseRepository
+
+
+@dataclass
+class CampaignClusterResult:
+    """Side-effects produced by a single campaign-clusterer ``tick``.
+
+    Consumed by the worker shell to publish on the bus
+    (``campaign.formed`` / ``campaign.identity.assigned`` /
+    ``campaign.merged`` / ``campaign.unmerged`` plus the cross-family
+    ``identity.campaign.assigned``).  DB writes are already committed
+    by the time this returns.
+
+    Every field defaults to a fresh empty list, so a bare
+    ``CampaignClusterResult()`` is the "nothing to publish" value.
+    """
+
+    # One dict per campaign created during the tick, listing the identities
+    # that were linked to it.
+    campaigns_formed: list[dict[str, Any]] = field(default_factory=list)
+    """``{"campaign_uuid": str, "identity_uuids": [str, ...]}``."""
+
+    # One dict per identity attached to an already-existing campaign.
+    # ``prior_campaign_uuid`` is ``None`` when the identity had no campaign.
+    identities_assigned: list[dict[str, Any]] = field(default_factory=list)
+    """``{"campaign_uuid": str, "identity_uuid": str,
+    "prior_campaign_uuid": Optional[str]}``."""
+
+    # One dict per campaign (the loser) folded into another (the winner).
+    campaigns_merged: list[dict[str, Any]] = field(default_factory=list)
+    """``{"winner_uuid": str, "loser_uuid": str}``."""
+
+    # One dict per previously recorded merge that was revoked this tick.
+    campaigns_unmerged: list[dict[str, Any]] = field(default_factory=list)
+    """``{"resurrected_uuid": str, "former_winner_uuid": str}``."""
+
+
+class CampaignClusterer(ABC):
+    """Abstract campaign clusterer.
+
+    Single-method contract mirroring :class:`Clusterer`: ``tick`` reads
+    identities from the repo, projects them to a campaign-level feature
+    shape, runs a clustering pass, commits ``campaigns`` rows + sets
+    ``attacker_identities.campaign_id``, and returns a
+    :class:`CampaignClusterResult` summarising side-effects.
+
+    Implementations MUST NOT raise from ``tick``: a single bad pass
+    cannot be allowed to crash the worker.
+    """
+
+    #: Short tag identifying the implementation. Declared as an annotation
+    #: only (no value is assigned here), so each concrete subclass has to
+    #: provide its own.
+    name: str
+
+    @abstractmethod
+    async def tick(self, repo: BaseRepository) -> CampaignClusterResult:
+        """Run a single campaign clustering pass.
+
+        Args:
+            repo: Repository the pass reads from and writes to.
+
+        Returns:
+            The side-effects of the pass. Because ``tick`` must not raise,
+            a failed pass is reported as an empty
+            :class:`CampaignClusterResult` rather than as an exception.
+        """
+
+
+__all__ = ["CampaignClusterer", "CampaignClusterResult"]
--- a/decnet/clustering/campaign/factory.py
+++ b/decnet/clustering/campaign/factory.py
@@ -0,0 +1,31 @@
+"""Campaign-clusterer factory.
+
+Mirrors :mod:`decnet.clustering.factory` for the campaign layer.
+Configuration knob ``DECNET_CAMPAIGN_CLUSTERER_TYPE``; default
+``"connected_components"``.
+"""
+from __future__ import annotations
+
+import os
+
+from decnet.clustering.campaign.base import CampaignClusterer
+
+_KNOWN: tuple[str, ...] = ("connected_components",)
+_DEFAULT = "connected_components"
+
+
+def get_campaign_clusterer() -> CampaignClusterer:
+    """Build the campaign clusterer selected by the environment.
+
+    Reads ``DECNET_CAMPAIGN_CLUSTERER_TYPE`` (falling back to the module
+    default), normalises it by stripping whitespace and lower-casing,
+    and returns a new instance of the matching implementation.
+
+    Raises:
+        ValueError: the normalised name is not a known clusterer.
+    """
+    configured = os.environ.get("DECNET_CAMPAIGN_CLUSTERER_TYPE", _DEFAULT)
+    name = configured.strip().lower()
+    if name != "connected_components":
+        raise ValueError(
+            f"Unknown campaign clusterer: {name!r}. Known: {_KNOWN}"
+        )
+
+    # Imported inside the function so the implementation module is only
+    # loaded once it has actually been selected.
+    from decnet.clustering.campaign.impl.connected_components import (
+        ConnectedComponentsCampaignClusterer,
+    )
+    return ConnectedComponentsCampaignClusterer()
+
+
+__all__ = ["get_campaign_clusterer"]
--- a/decnet/clustering/campaign/impl/__init__.py
+++ b/decnet/clustering/campaign/impl/__init__.py
--- a/decnet/clustering/campaign/impl/connected_components.py
+++ b/decnet/clustering/campaign/impl/connected_components.py
@@ -0,0 +1,304 @@
+"""Connected-components campaign clusterer (v1).
+
+Builds a similarity graph over identities (the layer below — already
+clustered from raw observations), runs union-find over edges that pass
+:data:`CAMPAIGN_EDGE_THRESHOLD`, and writes one ``campaigns`` row per
+component.
+
+Mirror of :mod:`decnet.clustering.impl.connected_components` for the
+layer above. Same revocable-merge discipline: identities stay FK'd to
+their original campaign row throughout, soft pointers via
+``campaigns.merged_into_uuid``.
+
+**Time-agnostic.** Edges depend only on pairwise relative offsets —
+fixture F7 (slow_burn) invariant carries forward to this layer.
+"""
+from __future__ import annotations
+
+import json
+import uuid as _uuid
+from datetime import datetime, timezone
+from typing import Any, Iterable, Optional
+
+from decnet.clustering.campaign.base import (
+    CampaignClusterer,
+    CampaignClusterResult,
+)
+from decnet.clustering.campaign.impl.similarity import (
+    CAMPAIGN_EDGE_THRESHOLD,
+    IdentityFeatures,
+    combined_campaign_weight,
+)
+from decnet.logging import get_logger
+from decnet.web.db.repository import BaseRepository
+
+log = get_logger("clustering.campaign.connected_components")
+
+
+def cluster_identities(
+    features: Iterable[IdentityFeatures],
+) -> dict[str, str]:
+    """Label every identity with the connected component it belongs to.
+
+    An edge joins two identities when their combined campaign weight
+    reaches :data:`CAMPAIGN_EDGE_THRESHOLD`; components are tracked with
+    a union-find forest. No DB, clock or I/O is touched.
+
+    Args:
+        features: Identity projections to cluster. Consumed once, so a
+            generator is fine.
+
+    Returns:
+        ``{identity_uuid: cluster_id}`` where ``cluster_id`` is
+        ``"cmp-"`` plus the uuid of the component's root. An identity
+        with no edges is its own root. Treat the ids as opaque.
+    """
+    nodes = list(features)
+    root_of: dict[str, str] = {}
+    for node in nodes:
+        root_of[node.identity_uuid] = node.identity_uuid
+
+    def resolve(node_id: str) -> str:
+        # Path halving: re-point each visited node at its grandparent
+        # while walking up to the root.
+        while root_of[node_id] != node_id:
+            root_of[node_id] = root_of[root_of[node_id]]
+            node_id = root_of[node_id]
+        return node_id
+
+    # Every unordered pair is scored exactly once, always as
+    # (earlier, later) in input order.
+    for idx, left in enumerate(nodes):
+        for right in nodes[idx + 1:]:
+            if combined_campaign_weight(left, right) >= CAMPAIGN_EDGE_THRESHOLD:
+                left_root = resolve(left.identity_uuid)
+                right_root = resolve(right.identity_uuid)
+                if left_root != right_root:
+                    root_of[left_root] = right_root
+
+    labels: dict[str, str] = {}
+    for node in nodes:
+        labels[node.identity_uuid] = f"cmp-{resolve(node.identity_uuid)}"
+    return labels
+
+
+def from_identity_row(row: dict[str, Any]) -> IdentityFeatures:
+    """Project one identity row dict into an :class:`IdentityFeatures`.
+
+    ``row`` is the shape returned by
+    ``BaseRepository.list_identities_for_clustering``. Only three keys
+    are read here: ``uuid`` (required) plus ``payload_simhashes`` and
+    ``c2_endpoints`` (each a JSON ``list[str]`` or null). Any other
+    column on the row, such as the JA3 / HASSH hashes, is ignored.
+
+    Every other feature (ASN / tooling cohort, decky set, session
+    windows, phase-handoff maps) keeps its empty default until the
+    production-row adapter learns to mine logs for them (TODO.md
+    "production-side payload + C2 + commands joins"). For rows projected
+    here, only the payload / C2 overlap can therefore contribute to the
+    campaign edge weight; the fixture path exercises the full feature
+    set via :func:`from_synthetic_identity`.
+
+    Raises:
+        KeyError: ``row`` has no ``"uuid"`` key.
+    """
+    return IdentityFeatures(
+        identity_uuid=row["uuid"],
+        payload_hashes=frozenset(
+            _parse_json_list(row.get("payload_simhashes")),
+        ),
+        c2_endpoints=frozenset(
+            _parse_json_list(row.get("c2_endpoints")),
+        ),
+    )
+
+
+def _parse_json_list(raw: Any) -> list[str]:
+    """Normalise a JSON-list column value into a list of strings.
+
+    Args:
+        raw: The column value. Usually a JSON-encoded list or ``None``.
+            A list or tuple that is already decoded (for example from a
+            backend with a native JSON column type) is accepted as-is
+            instead of being discarded.
+
+    Returns:
+        The list items converted with ``str``, with ``None`` items
+        dropped. Empty, null, undecodable or non-list input yields ``[]``
+        — this function never raises on bad data.
+    """
+    if not raw:
+        return []
+    if isinstance(raw, (list, tuple)):
+        # Already decoded; json.loads() would raise TypeError and the
+        # values would be silently lost.
+        decoded = list(raw)
+    else:
+        try:
+            decoded = json.loads(raw)
+        except (TypeError, ValueError):
+            return []
+    if not isinstance(decoded, list):
+        return []
+    return [str(x) for x in decoded if x is not None]
+
+
+class ConnectedComponentsCampaignClusterer(CampaignClusterer):
+    """Connected-components campaign clusterer.
+
+    Each ``tick`` re-clusters every active identity and then reconciles
+    the predicted components against the stored ``campaigns`` rows.
+    """
+
+    name = "connected_components"
+
+    async def tick(self, repo: BaseRepository) -> CampaignClusterResult:
+        """Run one clustering pass and commit the resulting campaign changes.
+
+        Args:
+            repo: Repository used to read identities / campaigns and to
+                write campaign rows, merge pointers and identity links.
+
+        Returns:
+            The side-effects that were written successfully. The result
+            is empty when there are no identities, or when reading,
+            projecting or clustering fails. A failed individual write is
+            logged and skipped; the pass carries on.
+        """
+        try:
+            rows = await repo.list_identities_for_clustering()
+        except Exception:  # noqa: BLE001
+            log.exception("campaign clusterer: failed to read identities")
+            return CampaignClusterResult()
+
+        if not rows:
+            return CampaignClusterResult()
+
+        # Pre-compute the campaign merge chain so an identity's
+        # "effective" campaign follows merged_into_uuid up to the winner.
+        try:
+            all_campaigns = await repo.list_all_campaigns()
+        except Exception:  # noqa: BLE001
+            log.exception("campaign clusterer: failed to read campaigns")
+            return CampaignClusterResult()
+
+        # Project + cluster.  Skip identities that are themselves
+        # merged out — their winner is the active row and gets clustered
+        # on its own.  This keeps the campaign graph from double-counting.
+        #
+        # These steps index rows by "uuid" and run the pure clustering
+        # code, so a malformed row can raise. They are guarded to honour
+        # the "tick MUST NOT raise" contract; nothing has been written
+        # yet, so bailing out with an empty result is safe.
+        try:
+            campaign_chain = _build_merge_chain(all_campaigns)
+            active_rows = [r for r in rows if not r.get("merged_into_uuid")]
+            feature_list: list[IdentityFeatures] = [
+                from_identity_row(r) for r in active_rows
+            ]
+            row_by_uuid: dict[str, dict[str, Any]] = {
+                r["uuid"]: r for r in active_rows
+            }
+            labels = cluster_identities(feature_list)
+        except Exception:  # noqa: BLE001
+            log.exception(
+                "campaign clusterer: failed to project/cluster identities",
+            )
+            return CampaignClusterResult()
+
+        # Group identities by predicted cluster.
+        components: dict[str, list[str]] = {}
+        for identity_uuid, cluster_id in labels.items():
+            components.setdefault(cluster_id, []).append(identity_uuid)
+
+        result = CampaignClusterResult()
+        now = datetime.now(timezone.utc)
+
+        # Pass 1 — per-component reconciliation: form, link, merge.
+        for member_ids in components.values():
+            # Campaign ids exactly as stored on the member identities...
+            literal_campaign_ids = {
+                row_by_uuid[m]["campaign_id"] for m in member_ids
+                if row_by_uuid[m].get("campaign_id")
+            }
+            # ...and the same ids resolved through earlier merges.
+            effective_ids = {
+                campaign_chain.get(c, c) for c in literal_campaign_ids
+            }
+            unassigned = [
+                m for m in member_ids
+                if not row_by_uuid[m].get("campaign_id")
+            ]
+
+            if not effective_ids:
+                # No member belongs to a campaign yet: form a new one.
+                campaign_uuid = str(_uuid.uuid4())
+                try:
+                    await repo.create_campaign({
+                        "uuid": campaign_uuid,
+                        "schema_version": 1,
+                        "first_seen_at": now,
+                        "last_seen_at": now,
+                        "created_at": now,
+                        "updated_at": now,
+                        "identity_count": len(member_ids),
+                    })
+                except Exception:  # noqa: BLE001
+                    log.exception(
+                        "campaign clusterer: failed to create campaign for "
+                        "component %s", member_ids,
+                    )
+                    continue
+
+                # Only identities whose link actually succeeded are reported.
+                linked: list[str] = []
+                for identity_uuid in member_ids:
+                    if await _link(repo, identity_uuid, campaign_uuid):
+                        linked.append(identity_uuid)
+                if linked:
+                    result.campaigns_formed.append({
+                        "campaign_uuid": campaign_uuid,
+                        "identity_uuids": linked,
+                    })
+                continue
+
+            # The component spans one or more existing campaigns. The
+            # smallest uuid (string order) wins, which keeps the choice
+            # deterministic.
+            winner_uuid = min(effective_ids)
+            losers = effective_ids - {winner_uuid}
+
+            for loser_uuid in losers:
+                try:
+                    await repo.update_campaign_merged_into(
+                        loser_uuid, winner_uuid,
+                    )
+                except Exception:  # noqa: BLE001
+                    log.exception(
+                        "campaign clusterer: failed to merge %s -> %s",
+                        loser_uuid, winner_uuid,
+                    )
+                    continue
+                # Keep the in-memory chain in step with the committed write.
+                campaign_chain[loser_uuid] = winner_uuid
+                result.campaigns_merged.append({
+                    "winner_uuid": winner_uuid,
+                    "loser_uuid": loser_uuid,
+                })
+
+            for identity_uuid in unassigned:
+                if await _link(repo, identity_uuid, winner_uuid):
+                    result.identities_assigned.append({
+                        "campaign_uuid": winner_uuid,
+                        "identity_uuid": identity_uuid,
+                        "prior_campaign_uuid": None,
+                    })
+
+        # Pass 2 — revocable-merge undo for campaigns. Same shape as
+        # the identity-side check: if a merged-out campaign's
+        # identities no longer cluster with the winner's, revoke.
+        identities_by_literal_campaign: dict[str, list[str]] = {}
+        for identity_uuid, r in row_by_uuid.items():
+            cid = r.get("campaign_id")
+            if cid:
+                identities_by_literal_campaign.setdefault(cid, []).append(
+                    identity_uuid,
+                )
+
+        # ``all_campaigns`` was read before pass 1, so only merges that
+        # were already stored at the start of the tick are examined here.
+        for campaign_row in all_campaigns:
+            if not campaign_row.get("merged_into_uuid"):
+                continue
+            loser_uuid = campaign_row["uuid"]
+            winner_uuid = campaign_chain.get(loser_uuid, loser_uuid)
+            if winner_uuid == loser_uuid:
+                continue
+            loser_idents = identities_by_literal_campaign.get(loser_uuid, [])
+            winner_idents = identities_by_literal_campaign.get(winner_uuid, [])
+            # Without identities on both sides there is nothing to compare.
+            if not loser_idents or not winner_idents:
+                continue
+            loser_clusters = {labels[i] for i in loser_idents if i in labels}
+            winner_clusters = {labels[i] for i in winner_idents if i in labels}
+            # A shared component means the merge is still supported.
+            if loser_clusters & winner_clusters:
+                continue
+            try:
+                await repo.update_campaign_merged_into(loser_uuid, None)
+            except Exception:  # noqa: BLE001
+                log.exception(
+                    "campaign clusterer: failed to unmerge %s from %s",
+                    loser_uuid, winner_uuid,
+                )
+                continue
+            campaign_chain[loser_uuid] = loser_uuid
+            result.campaigns_unmerged.append({
+                "resurrected_uuid": loser_uuid,
+                "former_winner_uuid": winner_uuid,
+            })
+
+        return result
+
+
+def _build_merge_chain(
+    rows: list[dict[str, Any]],
+) -> dict[str, str]:
+    """Resolve every campaign uuid to the campaign it currently ends up in.
+
+    Follows ``merged_into_uuid`` pointers from each row, taking at most
+    eight hops so that a pointer cycle or a pathologically long chain
+    cannot loop forever.
+
+    Args:
+        rows: Campaign row dicts; each must carry ``"uuid"`` and may
+            carry ``"merged_into_uuid"``.
+
+    Returns:
+        ``{uuid: resolved_uuid}`` for every row. A campaign that was
+        never merged maps to itself. A pointer to a uuid that is not in
+        ``rows`` resolves to that unknown uuid.
+    """
+    hop_limit = 8
+    row_index: dict[str, dict[str, Any]] = {}
+    for entry in rows:
+        row_index[entry["uuid"]] = entry
+
+    resolved: dict[str, str] = {}
+    for start in row_index:
+        node = start
+        hops = 0
+        while hops < hop_limit:
+            record = row_index.get(node)
+            target = None if record is None else record.get("merged_into_uuid")
+            # Stop at an unknown row, an unmerged row, or a self-pointer.
+            if not target or target == node:
+                break
+            node = target
+            hops += 1
+        resolved[start] = node
+    return resolved
+
+
+async def _link(
+    repo: BaseRepository, identity_uuid: str, campaign_uuid: str,
+) -> bool:
+    """Point an identity at a campaign without ever raising.
+
+    Args:
+        repo: Repository that performs the write.
+        identity_uuid: Identity to update.
+        campaign_uuid: Campaign the identity should reference.
+
+    Returns:
+        ``True`` when the write succeeded; ``False`` when it raised, in
+        which case the exception is logged and swallowed.
+    """
+    try:
+        await repo.set_identity_campaign_id(identity_uuid, campaign_uuid)
+    except Exception:  # noqa: BLE001
+        log.exception(
+            "campaign clusterer: failed to link identity=%s -> campaign=%s",
+            identity_uuid, campaign_uuid,
+        )
+        return False
+    else:
+        return True
+
+
+__all__ = [
+    "ConnectedComponentsCampaignClusterer",
+    "cluster_identities",
+    "from_identity_row",
+]
--- a/decnet/clustering/campaign/impl/similarity.py
+++ b/decnet/clustering/campaign/impl/similarity.py
@@ -0,0 +1,441 @@
+"""Similarity-graph primitives for the campaign clusterer.
+
+The campaign clusterer reads ``AttackerIdentity`` rows (the layer below)
+and groups them into operations. The graph it builds is **not** the
+identity-level graph: identity-level signals don't translate 1:1, and
+some that get vetoed at identity level (shared infra) are the *primary
+positive signal* at campaign level.
+
+Campaign-layer counterpart of ``decnet.clustering.impl.similarity``
+(the identity layer); see that module for the four-tier identity taxonomy.
+
+**Time-agnostic.** Same F7 invariant as the identity layer — edges
+MUST depend only on *pairwise relative* offsets, never on absolute
+clocks. Shift two identities' session windows by the same Δ and the
+edge weights MUST be identical. The temporal-overlap edge below uses
+this invariant explicitly.
+
+**Edge families** (from ``development/CAMPAIGN_CLUSTERING.md``):
+
+* **Phase-handoff** — A ends in ``COMMAND_AND_CONTROL`` / ``PERSISTENCE``
+  on decky D, B begins ``DISCOVERY`` / ``LATERAL_MOVEMENT`` on D
+  within window W. Load-bearing for fixture F5 (multi_operator) — the
+  signal the identity-side fingerprint-disagreement veto deliberately
+  doesn't try to be.
+* **Shared-infra** — Jaccard over aggregated payload-hashes /
+  C2-endpoints / decky-set across the identities' member observations.
+  Vetoed at identity level (``ed32358``); primary positive signal here.
+* **Temporal overlap** — sessions inside a bounded *relative* window.
+  Campaigns are operations and operations have bounded duration;
+  overlap of distinct identities on shared infra is the canonical
+  co-op pattern.
+* **Cohort** — ASN-cohort + tooling-cohort weak signals. Defeated alone
+  (per F2); useful as supporting weight only.
+
+The functions are pure (no DB, no I/O); the worker maps identities into
+:class:`IdentityFeatures` once per tick and feeds these into the graph
+builder in a sibling module.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Mapping, Optional
+
+
+# ─── Identity-level projection ──────────────────────────────────────────────
+
+
+# NOTE: ``frozen=True`` only blocks attribute re-assignment. The
+# ``Mapping``-typed fields default to plain ``dict`` objects, which stay
+# mutable in place. Because the generated ``__hash__`` hashes every field,
+# ``hash()`` on an instance raises ``TypeError`` while those fields hold
+# dicts — do not use instances as dict keys or set members.
+@dataclass(frozen=True)
+class IdentityFeatures:
+    """Minimal projection of an :class:`AttackerIdentity` row.
+
+    Built once per identity by the worker (or per fixture identity in
+    tests via :func:`from_synthetic_identity`). Keeping the projection
+    tight isolates the campaign-graph code from schema drift on the
+    identity layer.
+
+    Only ``identity_uuid`` is required; every other field defaults to
+    an empty container.
+    """
+
+    identity_uuid: str
+    """Stable ID — production: ``AttackerIdentity.uuid``."""
+
+    asn_cohort: frozenset[int] = field(default_factory=frozenset)
+    """All ASNs observed across the identity's member observations.
+    A single rotating actor (F2) appears in many ASNs; the *set*
+    overlap is the cohort signal."""
+
+    tooling_cohort: frozenset[str] = field(default_factory=frozenset)
+    """Tooling labels (e.g. ``"hydra"``, ``"hping"``) inferred from
+    fingerprints / commands. Empty until tooling-attribution lands."""
+
+    payload_hashes: frozenset[str] = field(default_factory=frozenset)
+    """Aggregated payload hashes across member observations."""
+
+    c2_endpoints: frozenset[str] = field(default_factory=frozenset)
+    """Aggregated C2 endpoints across member observations."""
+
+    decky_set: frozenset[str] = field(default_factory=frozenset)
+    """Aggregated decky IDs the identity touched."""
+
+    commands_by_phase_on_decky: Mapping[
+        tuple[str, str], tuple[str, ...]
+    ] = field(default_factory=dict)
+    """``(decky_id, UKCPhase.value)`` → ordered command sequence
+    observed on that decky in that phase. Required for the
+    phase-handoff edge — same decky is the join key. Empty when
+    ``commands_by_phase`` is unavailable on the production-row
+    adapter (deferred per TODO.md until log-mining lands)."""
+
+    session_windows: tuple[tuple[float, float], ...] = ()
+    """Per-session ``(start_ts, end_ts)`` tuples in seconds since
+    epoch. Used ONLY for pairwise relative deltas — never compared
+    to an absolute clock. F7 (slow_burn) invariance check verifies
+    that adding Δ to every entry on both sides yields the same edge
+    weight."""
+
+    last_phase_per_decky: Mapping[str, str] = field(default_factory=dict)
+    """``decky_id`` → last UKC phase observed on that decky. The
+    "from" side of a phase handoff."""
+
+    first_phase_per_decky: Mapping[str, str] = field(default_factory=dict)
+    """``decky_id`` → first UKC phase observed on that decky. The
+    "to" side of a phase handoff."""
+
+    last_seen_per_decky: Mapping[str, float] = field(default_factory=dict)
+    """``decky_id`` → last activity timestamp on that decky. Pairs
+    with :attr:`first_seen_per_decky` to compute pairwise handoff
+    gap relative to the two identities (no absolute clock)."""
+
+    first_seen_per_decky: Mapping[str, float] = field(default_factory=dict)
+    """``decky_id`` → first activity timestamp on that decky."""
+
+
+# ─── Phase-handoff edge ─────────────────────────────────────────────────────
+
+
+#: Phases that mark a *handoff-out* — operator A is finished setting
+#: up a foothold and the next operator can step in. Drawn from the
+#: STAGE_IN tail (PERSISTENCE / COMMAND_AND_CONTROL) per the UKC
+#: vocabulary; expanding this set is a tunable knob.
+#: Membership is an exact string match, so entries must be spelled
+#: exactly like the phase strings stored on the features (lower-case
+#: snake_case here).
+HANDOFF_OUT_PHASES: frozenset[str] = frozenset({
+    "command_and_control",
+    "persistence",
+})
+
+#: Phases that mark a *handoff-in* — operator B picks up a prepared
+#: foothold and starts operating through the network. STAGE_THROUGH
+#: head (DISCOVERY / LATERAL_MOVEMENT).
+#: Matched the same way: exact, case-sensitive string membership.
+HANDOFF_IN_PHASES: frozenset[str] = frozenset({
+    "discovery",
+    "lateral_movement",
+})
+
+#: Default handoff-window in seconds. The "B starts within W of A's
+#: end" guard. Bounded relative to the pair — fixture F7 invariant
+#: still holds because shifting both timestamps preserves the gap.
+#: The handoff check accepts a gap of 0 up to and including this value;
+#: a negative gap (B active before A finished) never counts.
+DEFAULT_HANDOFF_WINDOW_S: float = 24 * 3600.0  # 24h
+
+
+def phase_handoff_weight(
+    a: IdentityFeatures,
+    b: IdentityFeatures,
+    window_s: float = DEFAULT_HANDOFF_WINDOW_S,
+) -> float:
+    """Score the phase-handoff edge between two identities (F5 signal).
+
+    A handoff exists when, on some decky D, one identity's last phase
+    is in :data:`HANDOFF_OUT_PHASES`, the other's first phase on D is
+    in :data:`HANDOFF_IN_PHASES`, and the second identity's first
+    activity on D comes no more than ``window_s`` seconds after the
+    first identity's last activity there. Both directions (A → B and
+    B → A) are checked, so the result is symmetric in its arguments.
+
+    Only the gap between the two timestamps is compared against the
+    window, so shifting every timestamp by the same Δ leaves the
+    result unchanged (F7 invariance).
+
+    Args:
+        a: First identity.
+        b: Second identity.
+        window_s: Largest accepted gap, in seconds.
+
+    Returns:
+        ``1.0`` when a handoff exists in either direction, else ``0.0``.
+    """
+    a_to_b = _directed_handoff(a, b, window_s)
+    b_to_a = _directed_handoff(b, a, window_s)
+    return max(a_to_b, b_to_a)
+
+
+def _directed_handoff(
+    out: IdentityFeatures, in_: IdentityFeatures, window_s: float,
+) -> float:
+    """Check the one-directional handoff ``out`` → ``in_``.
+
+    Returns ``1.0`` as soon as one decky satisfies all of the following,
+    and ``0.0`` when none does:
+
+    * ``out`` has a last phase on it and ``in_`` has a first phase on it;
+    * that last phase is a handoff-out phase and that first phase is a
+      handoff-in phase;
+    * both timestamps are known, and ``in_`` first appears between 0 and
+      ``window_s`` seconds (inclusive) after ``out`` was last seen.
+    """
+    for decky, out_phase in out.last_phase_per_decky.items():
+        # Only deckies that both sides have a phase for can carry a handoff.
+        if decky not in in_.first_phase_per_decky:
+            continue
+        in_phase = in_.first_phase_per_decky.get(decky)
+        if out_phase not in HANDOFF_OUT_PHASES:
+            continue
+        if in_phase not in HANDOFF_IN_PHASES:
+            continue
+        left_at = out.last_seen_per_decky.get(decky)
+        arrived_at = in_.first_seen_per_decky.get(decky)
+        if left_at is None or arrived_at is None:
+            continue
+        # A negative gap means in_ showed up before out had finished.
+        if 0.0 <= arrived_at - left_at <= window_s:
+            return 1.0
+    return 0.0
+
+
+# ─── Shared-infra edge ──────────────────────────────────────────────────────
+
+
+def shared_infra_weight(a: IdentityFeatures, b: IdentityFeatures) -> float:
+    """Jaccard over payload-hashes ∪ C2-endpoints.
+
+    Each identity's payload hashes and C2 endpoints are pooled into one
+    set; the score is ``|A ∩ B| / |A ∪ B|`` over those pooled sets.
+
+    Excludes ``decky_set`` deliberately: decky overlap is a *fleet
+    scarcity* artifact (a small fleet means many distinct campaigns
+    hit the same deckies) and would fuse F1's two unrelated campaigns
+    on shared targeting. Payload hashes and C2 endpoints are
+    operational artifacts; distinct campaigns rarely share them.
+
+    At identity level this gets vetoed by the fingerprint-disagreement
+    rule (``ed32358``); at campaign level it's the *primary* positive
+    signal — distinct identities sharing payload + C2 is the canonical
+    co-op pattern (F5 multi_operator).
+
+    The decky-overlap signal lives in :func:`cohort_weight` instead
+    where its weak-tier multiplier prevents F1-style false merges.
+
+    Returns:
+        A value in ``[0.0, 1.0]``; ``0.0`` when both sides are empty and
+        ``1.0`` only when the two pooled sets are identical.
+    """
+    a_set = a.payload_hashes | a.c2_endpoints
+    b_set = b.payload_hashes | b.c2_endpoints
+    union = a_set | b_set
+    # An empty union means both sides are empty; this single guard also
+    # protects the division below.
+    if not union:
+        return 0.0
+    return len(a_set & b_set) / len(union)
+
+
+# ─── Temporal-overlap edge ──────────────────────────────────────────────────
+
+
+def _merge_windows(
+    windows: tuple[tuple[float, float], ...],
+) -> list[tuple[float, float]]:
+    """Collapse session windows into sorted, non-overlapping spans."""
+    spans: list[tuple[float, float]] = []
+    for start, end in sorted(windows):
+        if end <= start:
+            # Zero-length or inverted window: contributes no session time.
+            continue
+        if spans and start <= spans[-1][1]:
+            # Touches or overlaps the previous span: extend it if needed.
+            if end > spans[-1][1]:
+                spans[-1] = (spans[-1][0], end)
+        else:
+            spans.append((start, end))
+    return spans
+
+
+def temporal_overlap_weight(
+    a: IdentityFeatures, b: IdentityFeatures,
+) -> float:
+    """Pairwise-relative temporal overlap fraction.
+
+    Returns the fraction of A's total session time that overlaps any
+    B session, capped at ``1.0``. Each side's windows are first merged
+    into disjoint spans, so time covered by several overlapping windows
+    is counted once. Without that, two overlapping B sessions would
+    double-count the same stretch of A's time and inflate the score.
+
+    Pairwise-relative: the value is invariant under a uniform Δ-shift
+    of every timestamp on both sides (F7 fixture's invariant). Returns
+    ``0.0`` when either side has no usable session window.
+
+    Directional: the result is normalised by A's session time only, so
+    swapping the arguments can give a different value.
+
+    Two non-cooperating actors with bounded operations rarely overlap
+    by chance; co-op campaigns overlap heavily. Defeated alone (one
+    overlapping minute means little) — combined with shared-infra
+    or handoff it pulls a pair over threshold.
+    """
+    a_spans = _merge_windows(a.session_windows)
+    b_spans = _merge_windows(b.session_windows)
+    if not a_spans or not b_spans:
+        return 0.0
+    # Every merged span has end > start, so the total is strictly positive.
+    a_total = sum(end - start for start, end in a_spans)
+    overlap = 0.0
+    for a_start, a_end in a_spans:
+        for b_start, b_end in b_spans:
+            lo = max(a_start, b_start)
+            hi = min(a_end, b_end)
+            if hi > lo:
+                overlap += hi - lo
+    return min(1.0, overlap / a_total)
+
+
+# ─── Cohort edges ───────────────────────────────────────────────────────────
+
+
+def cohort_weight(a: IdentityFeatures, b: IdentityFeatures) -> float:
+    """ASN-cohort + tooling-cohort + decky-overlap weak signal.
+
+    Jaccard over the union of ASN cohort, tooling cohort, and decky
+    set. F2's failure mode (one identity rotating across many ASNs)
+    doesn't apply at *campaign* level — but multiple identities
+    cooperating out of the same hosting cohort is plausible co-op
+    evidence. Decky overlap lives here (not in
+    :func:`shared_infra_weight`) because decky scarcity in a small
+    honeypot fleet would otherwise fuse unrelated campaigns hitting
+    the same SSH targets (F1 shared_wordlist).
+
+    Weak by design: the combined-weight tier multiplier keeps this
+    from crossing threshold alone.
+
+    Returns:
+        A value in ``[0.0, 1.0]``; ``0.0`` when both sides are empty.
+    """
+    # Tag every member with its family so equal raw values from different
+    # families (say a tool and a decky both named "x") never match.
+    a_set = (
+        {("asn", str(x)) for x in a.asn_cohort}
+        | {("tool", x) for x in a.tooling_cohort}
+        | {("decky", x) for x in a.decky_set}
+    )
+    b_set = (
+        {("asn", str(x)) for x in b.asn_cohort}
+        | {("tool", x) for x in b.tooling_cohort}
+        | {("decky", x) for x in b.decky_set}
+    )
+    union = a_set | b_set
+    # An empty union means both sides are empty; this single guard also
+    # protects the division below.
+    if not union:
+        return 0.0
+    return len(a_set & b_set) / len(union)
+
+
+# ─── Combined campaign-level weight ─────────────────────────────────────────
+
+
+#: Tier multipliers for the campaign graph. Tuned so:
+#:
+#: * Phase-handoff alone (max 1.0) crosses threshold — a clean
+#:   F5-style handoff is sufficient evidence on its own.
+#: * Shared-infra alone (max 1.0) crosses threshold — payload+C2
+#:   overlap is the canonical co-op signal (F5 multi_operator's
+#:   intended pass condition; decky overlap was deliberately moved
+#:   to :func:`cohort_weight` to avoid F1's false merge on shared
+#:   targeting).
+#: * Temporal overlap alone (max 1.0) yields 0.4 — supporting weight.
+#: * Cohort alone (max 1.0) yields 0.1 — defeats F1's shared-decky
+#:   failure mode and F2's rotating-ASN one.
+#:
+#: F1 shared_wordlist: payload+C2 = ∅ on both sides → shared_infra =
+#: 0; ASN+decky overlap fires cohort but at 0.1 stays well below
+#: threshold. F2 vpn_hopping is folded by the identity layer first,
+#: so the campaign clusterer sees one identity → one campaign.
+#:
+#: NOTE: with a multiplier of 1.0 against a threshold of 1.0, a family
+#: crosses *alone* only at its maximum score. For shared-infra (a
+#: Jaccard ratio) that means the two pooled payload/C2 sets must be
+#: identical; any partial overlap needs help from another family.
+#: Temporal overlap and cohort together add at most 0.4 + 0.1 = 0.5.
+CAMPAIGN_TIER_WEIGHTS: dict[str, float] = {
+    "phase_handoff": 1.0,
+    "shared_infra": 1.0,
+    "temporal_overlap": 0.4,
+    "cohort": 0.1,
+}
+
+#: Threshold a combined campaign-edge weight must meet to survive
+#: into the similarity graph. The comparison is inclusive (``>=``).
+CAMPAIGN_EDGE_THRESHOLD: float = 1.0
+
+
+def combined_campaign_weight(
+    a: IdentityFeatures,
+    b: IdentityFeatures,
+    *,
+    handoff_window_s: float = DEFAULT_HANDOFF_WINDOW_S,
+) -> float:
+    """Return the tier-weighted sum of the four pairwise campaign scores.
+
+    Each score is scaled by its :data:`CAMPAIGN_TIER_WEIGHTS` entry; the
+    worker compares the total against :data:`CAMPAIGN_EDGE_THRESHOLD` to
+    decide whether to draw an edge. Pure / time-agnostic (F7 invariant).
+    """
+    tiers = CAMPAIGN_TIER_WEIGHTS
+    handoff = phase_handoff_weight(a, b, handoff_window_s)
+    infra = shared_infra_weight(a, b)
+    overlap = temporal_overlap_weight(a, b)
+    cohort = cohort_weight(a, b)
+    # Accumulate strictly left to right so float rounding is a plain a+b+c+d.
+    weighted = tiers["phase_handoff"] * handoff + tiers["shared_infra"] * infra
+    weighted += tiers["temporal_overlap"] * overlap
+    return weighted + tiers["cohort"] * cohort
+
+
+# ─── Adapter for synthetic-fixture tests ────────────────────────────────────
+
+
+def from_synthetic_identity(att, identity_uuid: Optional[str] = None) -> IdentityFeatures:  # type: ignore[no-untyped-def]
+    """Project one ``SyntheticAttacker`` onto an :class:`IdentityFeatures`.
+
+    One ``SyntheticAttacker`` stands in for one identity, which is enough
+    for fixture validation: the campaign-clusterer consumes identities,
+    not raw observations. The worker's production-row adapter (commit 3)
+    builds the same shape from real ``AttackerIdentity`` rows plus their
+    member observations.
+
+    Defined in the production module as a documented integration point,
+    so test code doesn't import the factory shape into it.
+    """
+    hashes: set[str] = set()
+    callbacks: set[str] = set()
+    deckies: set[str] = set()
+    asns: set[int] = set() if att.asn is None else {att.asn}
+    windows: list[tuple[float, float]] = []
+
+    # Per-decky and per-(decky, phase) accumulators.
+    commands: dict[tuple[str, str], list[str]] = {}
+    first_phase: dict[str, str] = {}
+    last_phase: dict[str, str] = {}
+    first_seen: dict[str, float] = {}
+    last_seen: dict[str, float] = {}
+
+    # The campaign DSL emits SyntheticSessions in monotonically increasing
+    # time order by construction, so "first" / "last" per decky fall out
+    # of plain iteration order.
+    for session in att.sessions:
+        if session.payload_hash:
+            hashes.add(session.payload_hash)
+        if session.c2_callback:
+            callbacks.add(session.c2_callback)
+        decky = getattr(session, "decky", None) or getattr(session, "decky_id", None)
+        if decky:
+            deckies.add(decky)
+
+        # Two timestamp shapes are accepted: SyntheticSession carries
+        # ``started_at`` (datetime) + ``duration_s``, while the
+        # production-row adapter (commit 3) supplies ``start_ts`` /
+        # ``end_ts`` directly.
+        started_at = getattr(session, "started_at", None)
+        duration_s = getattr(session, "duration_s", None)
+        if started_at is None:
+            begin = getattr(session, "start_ts", None)
+            end = getattr(session, "end_ts", None)
+        else:
+            begin = started_at.timestamp()
+            end = begin + (float(duration_s) if duration_s else 0.0)
+        if begin is not None and end is not None:
+            windows.append((float(begin), float(end)))
+
+        phase = session.phase.value if hasattr(session, "phase") else None
+        if not (decky and phase):
+            continue
+        if session.commands:
+            commands.setdefault((decky, phase), []).extend(session.commands)
+        if decky not in first_phase:
+            first_phase[decky] = phase
+            if begin is not None:
+                first_seen[decky] = float(begin)
+        last_phase[decky] = phase
+        # Prefer the session end; fall back to its start when no end is known.
+        latest = end if end is not None else begin
+        if latest is not None:
+            last_seen[decky] = float(latest)
+
+    return IdentityFeatures(
+        identity_uuid=identity_uuid or att.attacker_id,
+        asn_cohort=frozenset(asns),
+        tooling_cohort=frozenset(),
+        payload_hashes=frozenset(hashes),
+        c2_endpoints=frozenset(callbacks),
+        decky_set=frozenset(deckies),
+        commands_by_phase_on_decky={k: tuple(v) for k, v in commands.items()},
+        session_windows=tuple(windows),
+        last_phase_per_decky=dict(last_phase),
+        first_phase_per_decky=dict(first_phase),
+        last_seen_per_decky=dict(last_seen),
+        first_seen_per_decky=dict(first_seen),
+    )
+
+
+__all__ = [
+    "IdentityFeatures",
+    "phase_handoff_weight",
+    "shared_infra_weight",
+    "temporal_overlap_weight",
+    "cohort_weight",
+    "combined_campaign_weight",
+    "from_synthetic_identity",
+    "HANDOFF_OUT_PHASES",
+    "HANDOFF_IN_PHASES",
+    "DEFAULT_HANDOFF_WINDOW_S",
+    "CAMPAIGN_TIER_WEIGHTS",
+    "CAMPAIGN_EDGE_THRESHOLD",
+]
--- a/decnet/clustering/campaign/worker.py
+++ b/decnet/clustering/campaign/worker.py
@@ -0,0 +1,191 @@
+"""Long-running campaign-clusterer worker.
+
+Mirrors :mod:`decnet.clustering.worker` for the layer above. Bus-woken
+on ``identity.>`` (not ``attacker.>`` — the campaign clusterer reads
+identities, not raw observations); falls back to a 60s slow-tick poll
+when the bus is unavailable.
+
+Publishes the four ``campaign.*`` events plus the cross-family
+``identity.campaign.assigned`` so existing identity-stream subscribers
+see campaign-id changes without subscribing to ``campaign.>``.
+"""
+from __future__ import annotations
+
+import asyncio
+import contextlib
+from typing import Optional
+
+from decnet.bus import topics as _topics
+from decnet.bus.base import BaseBus
+from decnet.bus.factory import get_bus
+from decnet.bus.publish import (
+    publish_safely,
+    run_control_listener_signal as _run_control_listener_signal,
+    run_health_heartbeat as _run_health_heartbeat,
+)
+from decnet.clustering.campaign.base import (
+    CampaignClusterer,
+    CampaignClusterResult,
+)
+from decnet.clustering.campaign.factory import get_campaign_clusterer
+from decnet.logging import get_logger
+from decnet.web.db.repository import BaseRepository
+
+log = get_logger("clustering.campaign.worker")
+
+_DEFAULT_POLL_SECS = 60.0
+_WORKER_NAME = "campaign-clusterer"
+
+
+async def run_campaign_clusterer_loop(
+    repo: BaseRepository,
+    *,
+    poll_interval_secs: float = _DEFAULT_POLL_SECS,
+    clusterer: Optional[CampaignClusterer] = None,
+    shutdown: Optional[asyncio.Event] = None,
+) -> None:
+    """Tick and publish in a loop — once at start, then after each bus wake
+    or poll timeout — until ``shutdown`` is set or the task is cancelled."""
+    if clusterer is None:
+        clusterer = get_campaign_clusterer()
+    log.info(
+        "campaign-clusterer started impl=%s poll_interval_secs=%s",
+        clusterer.name, poll_interval_secs,
+    )
+
+    bus: Optional[BaseBus] = None
+    wake = asyncio.Event()
+    wake_tasks: list[asyncio.Task] = []
+    heartbeat_task: Optional[asyncio.Task] = None
+    try:
+        candidate = get_bus(client_name=_WORKER_NAME)
+        await candidate.connect()
+        bus = candidate
+        # Any identity-layer event (formed / linked / merged / unmerged)
+        # changes the clusterer's input set, so all of them wake the loop.
+        wake_tasks.append(asyncio.create_task(
+            _wake_on(bus, wake, f"{_topics.IDENTITY}.>"),
+        ))
+        heartbeat_task = asyncio.create_task(
+            _run_health_heartbeat(bus, _WORKER_NAME),
+        )
+        wake_tasks.append(asyncio.create_task(
+            _run_control_listener_signal(bus, _WORKER_NAME),
+        ))
+    except Exception as exc:  # noqa: BLE001
+        log.warning(
+            "campaign-clusterer: bus unavailable, running in poll-only "
+            "mode: %s", exc,
+        )
+
+    if shutdown is None:
+        shutdown = asyncio.Event()
+
+    try:
+        while not shutdown.is_set():  # NOTE(review): only re-checked after the wait below returns
+            try:
+                result = await clusterer.tick(repo)
+            except Exception:  # noqa: BLE001
+                log.exception("campaign-clusterer: tick failed")
+                result = CampaignClusterResult()  # failed tick: publish a default result
+
+            await _publish_result(bus, result)
+
+            try:
+                await asyncio.wait_for(
+                    wake.wait(), timeout=float(poll_interval_secs),
+                )
+            except asyncio.TimeoutError:
+                pass
+            wake.clear()  # re-arm for the next cycle
+    except (asyncio.CancelledError, KeyboardInterrupt):  # NOTE(review): swallowed, not re-raised
+        log.info("campaign-clusterer stopped")
+    finally:
+        for t in wake_tasks:
+            t.cancel()
+        if heartbeat_task is not None:
+            heartbeat_task.cancel()
+        for t in (*wake_tasks, heartbeat_task):  # await the cancelled tasks so they finish unwinding
+            if t is None:
+                continue
+            with contextlib.suppress(asyncio.CancelledError, Exception):
+                await t
+        if bus is not None:
+            with contextlib.suppress(Exception):
+                await bus.close()
+
+
+async def _publish_result(
+    bus: Optional[BaseBus], result: CampaignClusterResult,
+) -> None:
+    """Publish each entry of ``result`` on its ``campaign.*`` topic.
+
+    Formed campaigns and identity assignments are also mirrored onto the
+    cross-family ``identity.campaign.assigned`` topic, one event per
+    identity. Emission order: formed, assigned, merged, unmerged.
+    ``bus`` is handed to ``publish_safely`` as-is and is ``None`` when
+    the worker runs in poll-only mode.
+    """
+
+    async def on_campaign(kind: str, payload: dict) -> None:
+        # ``kind`` doubles as topic suffix and event type.
+        await publish_safely(
+            bus,
+            _topics.campaign(kind),
+            payload,
+            event_type=kind,
+        )
+
+    async def on_identity(payload: dict) -> None:
+        kind = _topics.IDENTITY_CAMPAIGN_ASSIGNED
+        await publish_safely(
+            bus,
+            _topics.identity(kind),
+            payload,
+            event_type=kind,
+        )
+
+    for formed in result.campaigns_formed:
+        await on_campaign(_topics.CAMPAIGN_FORMED, formed)
+        # Mirror per member identity so the existing identity SSE stream
+        # sees the badge update.
+        for member_uuid in formed.get("identity_uuids", []):
+            await on_identity({
+                "identity_uuid": member_uuid,
+                "campaign_uuid": formed["campaign_uuid"],
+                "prior_campaign_uuid": None,
+            })
+
+    # Assignments go out on both families, campaign topic first.
+    for assigned in result.identities_assigned:
+        await on_campaign(_topics.CAMPAIGN_IDENTITY_ASSIGNED, assigned)
+        await on_identity({
+            "identity_uuid": assigned["identity_uuid"],
+            "campaign_uuid": assigned["campaign_uuid"],
+            "prior_campaign_uuid": assigned.get("prior_campaign_uuid"),
+        })
+
+    # Merges and unmerges are campaign-family events only.
+    for merged in result.campaigns_merged:
+        await on_campaign(_topics.CAMPAIGN_MERGED, merged)
+
+    for unmerged in result.campaigns_unmerged:
+        await on_campaign(_topics.CAMPAIGN_UNMERGED, unmerged)
+
+
+async def _wake_on(bus: BaseBus, wake: asyncio.Event, pattern: str) -> None:
+    """Set ``wake`` for every event on ``pattern``; warn if the stream dies."""
+    try:
+        subscription = bus.subscribe(pattern)
+        async with subscription:
+            async for _ in subscription:
+                wake.set()
+    except asyncio.CancelledError:
+        raise  # shutdown path: let cancellation propagate untouched
+    except Exception as exc:  # noqa: BLE001
+        # Swallowed on purpose: the caller's poll timeout still drives ticks.
+        msg = "campaign-clusterer: subscriber for %s died (%s); falling back to poll"
+        log.warning(msg, pattern, exc)
+
+
+__all__ = ["run_campaign_clusterer_loop"]
--- a/decnet/clustering/factory.py
+++ b/decnet/clustering/factory.py
@@ -0,0 +1,46 @@
+"""Clusterer factory.
+
+Returns the active :class:`~decnet.clustering.base.Clusterer` instance.
+Mirrors :mod:`decnet.bus.factory` and :mod:`decnet.web.db.factory`:
+callers obtain the clusterer via :func:`get_clusterer` rather than
+importing a concrete impl directly.
+
+Configuration knobs (env-overridable):
+
+* ``DECNET_CLUSTERER_TYPE`` — which implementation to use. Default
+  ``"connected_components"``. Unknown values raise :class:`ValueError`
+  so a typo in ``decnet.ini`` surfaces immediately rather than silently
+  falling back.
+
+The ``connected_components`` implementation is the v1 production
+clusterer. Other implementations (e.g. an HDBSCAN variant) can land
+here later without churning callers.
+"""
+from __future__ import annotations
+
+import os
+
+from decnet.clustering.base import Clusterer
+
+_KNOWN_CLUSTERERS = ("connected_components",)
+_DEFAULT_CLUSTERER = "connected_components"
+
+
+def get_clusterer() -> Clusterer:
+    """Instantiate the clusterer selected by ``DECNET_CLUSTERER_TYPE``.
+
+    The value is stripped and lower-cased and defaults to
+    ``_DEFAULT_CLUSTERER`` when unset. The concrete impl is imported
+    lazily; an unrecognised name raises :class:`ValueError`.
+    """
+    configured = os.environ.get("DECNET_CLUSTERER_TYPE", _DEFAULT_CLUSTERER)
+    name = configured.strip().lower()
+    if name != "connected_components":
+        raise ValueError(f"Unknown clusterer: {name!r}. Known: {_KNOWN_CLUSTERERS}")
+    from decnet.clustering.impl.connected_components import (
+        ConnectedComponentsClusterer,
+    )
+    return ConnectedComponentsClusterer()
+
+
+__all__ = ["get_clusterer"]
--- a/decnet/clustering/impl/__init__.py
+++ b/decnet/clustering/impl/__init__.py
@@ -0,0 +1,6 @@
+"""Concrete clusterer implementations.
+
+Each module here contains exactly one :class:`~decnet.clustering.base.Clusterer`
+subclass. New implementations register themselves in
+:func:`decnet.clustering.factory.get_clusterer`.
+"""
--- a/decnet/clustering/impl/connected_components.py
+++ b/decnet/clustering/impl/connected_components.py
@@ -0,0 +1,379 @@
+"""Connected-components identity clusterer (v1).
+
+Builds a similarity graph over observations (per-IP attacker rows),
+runs union-find over edges that pass a confidence threshold, and writes
+one ``attacker_identities`` row per component.
+
+**v1 signal coverage (this commit):**
+
+* High-weight tier: JA3 / HASSH / payload-hash / C2-endpoint exact
+  match (alone enough to cluster). The production tick currently sees
+  JA3 + HASSH only — payload + C2 require log mining and join in
+  later commits. The fixture tests exercise the full high-weight set
+  through the in-memory path.
+
+Subsequent commits add medium / low / very-low tier edges and phase-
+handoff edges (merge revocation already ships — see v1 behavior). Edges
+MUST stay time-agnostic — fixture 7 forbids recency-decay clustering.
+
+**v1 behavior:**
+
+The clusterer assigns identities to NULL observations, merges existing
+identities when a single predicted component spans them, and revokes
+prior merges when the predicted component splits a merged-out identity
+away from its winner. Observations stay FK'd to their original identity
+row throughout — merges are soft pointers via
+``attacker_identities.merged_into_uuid``, never observation re-points.
+That keeps the audit trail intact and lets cached subscribers resolve
+merged-out UUIDs through the chain.
+"""
+from __future__ import annotations
+
+import json
+import uuid as _uuid
+from datetime import datetime, timezone
+from typing import Any, Iterable, Optional
+
+from decnet.clustering.base import Clusterer, ClusterResult
+from decnet.clustering.impl.similarity import (
+    EDGE_THRESHOLD,
+    Observation,
+    combined_edge_weight,
+)
+from decnet.logging import get_logger
+from decnet.profiler.identity_rollup import extract_fp_summaries
+from decnet.web.db.repository import BaseRepository
+
+log = get_logger("clustering.connected_components")
+
+
+def cluster_observations(
+    observations: Iterable[Observation],
+) -> dict[str, str]:
+    """Label each observation with its connected component.
+
+    Two observations are joined when ``combined_edge_weight`` reaches
+    ``EDGE_THRESHOLD``; components are the transitive closure of those
+    edges (union-find). Pure — no DB, no clock, no I/O — so the fixture
+    tests and the production ``tick`` share it.
+
+    Returns ``{observation_id: cluster_id}``. An isolated observation
+    still gets its own cluster id. Cluster ids are opaque strings
+    (callers must not rely on their format) and are reproducible for
+    a given input sequence and edge function.
+    """
+    items = list(observations)
+    root_of: dict[str, str] = {}
+    for item in items:
+        root_of[item.observation_id] = item.observation_id
+
+    def find(node: str) -> str:
+        top = node
+        while root_of[top] != top:
+            top = root_of[top]
+        while root_of[node] != top:  # flatten the path just walked
+            root_of[node], node = top, root_of[node]
+        return top
+
+    # Visit every unordered pair once, in input order. On a qualifying
+    # edge the left observation's root is re-parented under the right's.
+    pairs = ((a, b) for n, a in enumerate(items, 1) for b in items[n:])
+    for a, b in pairs:
+        if combined_edge_weight(a, b) >= EDGE_THRESHOLD:
+            a_root, b_root = find(a.observation_id), find(b.observation_id)
+            if a_root != b_root:
+                root_of[a_root] = b_root
+
+    return {o.observation_id: f"cc-{find(o.observation_id)}" for o in items}
+
+
+def from_attacker_row(row: dict[str, Any]) -> Observation:
+    """Project an ``Attacker`` row dict into an :class:`Observation`.
+
+    JA3 / HASSH come from the ``fingerprints`` value, which may be a JSON
+    string or an already-decoded sequence of entry dicts. When several
+    entries of one kind are present the last one in the list wins (the
+    list is assumed to be oldest-first, so that is the most recent);
+    :class:`Observation` is a single-row projection and keeps one value
+    per kind.
+
+    A malformed ``fingerprints`` value never raises: undecodable JSON, a
+    decoded value that is not a list (e.g. the JSON literal ``null``), and
+    non-dict or hash-less entries all contribute nothing. Payload + C2 +
+    commands are left empty — log mining lands in later commits.
+    """
+    raw = row.get("fingerprints") or "[]"
+    try:
+        entries = json.loads(raw) if isinstance(raw, str) else list(raw)
+    except (TypeError, ValueError):
+        entries = []
+    if not isinstance(entries, list):
+        # Valid JSON that is not an array (``null``, a number, an object)
+        # cannot be walked as a list of fingerprint entries.
+        entries = []
+
+    # Later entries overwrite earlier ones: the last of each kind survives.
+    latest: dict[str, str] = {}
+    for entry in entries:
+        if not isinstance(entry, dict):
+            continue
+        kind = entry.get("kind")
+        digest = entry.get("hash") or entry.get("value")
+        if digest and kind in ("ja3", "hassh"):
+            latest[kind] = digest
+
+    return Observation(
+        observation_id=row["uuid"],
+        ja3=latest.get("ja3"),
+        hassh=latest.get("hassh"),
+        asn=row.get("asn"),
+    )
+
+
+class ConnectedComponentsClusterer(Clusterer):
+    """Connected-components clusterer over the similarity graph.
+
+    See module docstring for v1 signal coverage and behavior notes.
+    """
+
+    name = "connected_components"
+
+    async def tick(self, repo: BaseRepository) -> ClusterResult:
+        """Run one pass: form / link / merge identities per predicted
+        component, then revoke merges the new clustering contradicts."""
+        try:
+            rows = await repo.list_attackers_for_clustering()
+        except Exception:  # noqa: BLE001
+            log.exception("clusterer: failed to read attackers")
+            return ClusterResult()
+
+        if not rows:
+            return ClusterResult()
+
+        # ``identity_chain[u]`` is the canonical winner for identity ``u``
+        # (following merged_into_uuid; ``u`` itself when not merged out),
+        # pre-computed so post-merge membership is resolved in one place.
+        try:
+            all_identities = await repo.list_all_identities()
+        except Exception:  # noqa: BLE001
+            log.exception("clusterer: failed to read identities")
+            return ClusterResult()
+        identity_chain = _build_merge_chain(all_identities)
+
+        # Project rows into Observations and cluster them.
+        observations: list[Observation] = []
+        row_by_id: dict[str, dict[str, Any]] = {}
+        for r in rows:
+            obs = from_attacker_row(r)
+            observations.append(obs)
+            row_by_id[obs.observation_id] = r
+        labels = cluster_observations(observations)
+
+        # Group observation ids by predicted cluster.
+        components: dict[str, list[str]] = {}
+        for obs_id, cluster_id in labels.items():
+            components.setdefault(cluster_id, []).append(obs_id)
+
+        result = ClusterResult()
+        now = datetime.now(timezone.utc)
+
+        # Pass 1 — per-component reconciliation: form, link, merge.
+        for member_ids in components.values():
+            literal_ids = {
+                row_by_id[m]["identity_id"] for m in member_ids
+                if row_by_id[m].get("identity_id")
+            }
+            effective_ids = {identity_chain.get(i, i) for i in literal_ids}
+            unassigned = [
+                m for m in member_ids
+                if not row_by_id[m].get("identity_id")
+            ]
+
+            if not effective_ids:
+                # Fresh component — mint a new identity.
+                identity_uuid = str(_uuid.uuid4())
+                try:
+                    await repo.create_attacker_identity({
+                        "uuid": identity_uuid,
+                        "schema_version": 1,
+                        "first_seen_at": now,
+                        "last_seen_at": now,
+                        "created_at": now,
+                        "updated_at": now,
+                        "observation_count": len(member_ids),
+                    })
+                except Exception:  # noqa: BLE001
+                    log.exception(
+                        "clusterer: failed to create identity for component %s",
+                        member_ids,
+                    )
+                    continue
+
+                linked: list[str] = []
+                for obs_id in member_ids:
+                    if await _link(repo, obs_id, identity_uuid):
+                        linked.append(obs_id)
+                if linked:
+                    result.identities_formed.append({
+                        "identity_uuid": identity_uuid,
+                        "observation_uuids": linked,
+                    })
+                await _roll_up_fingerprints(
+                    repo, identity_uuid, [row_by_id[m] for m in member_ids],
+                )
+                continue
+
+            # Deterministic winner (smallest uuid string), independent of
+            # row order, so repeated runs merge in the same direction.
+            winner_uuid = min(effective_ids)
+            losers = effective_ids - {winner_uuid}
+
+            for loser_uuid in losers:
+                try:
+                    await repo.update_identity_merged_into(loser_uuid, winner_uuid)
+                except Exception:  # noqa: BLE001
+                    log.exception(
+                        "clusterer: failed to merge %s -> %s",
+                        loser_uuid, winner_uuid,
+                    )
+                    continue
+                identity_chain[loser_uuid] = winner_uuid
+                result.identities_merged.append({
+                    "winner_uuid": winner_uuid,
+                    "loser_uuid": loser_uuid,
+                })
+
+            # Link unassigned observations to the winner so later ticks
+            # see them as already assigned.
+            # NOTE(review): this runs before Pass 2, so an observation can
+            # be linked to a winner whose merged-out loser (the one it
+            # actually clusters with) is unmerged later in this tick — confirm.
+            for obs_id in unassigned:
+                if await _link(repo, obs_id, winner_uuid):
+                    result.observations_linked.append({
+                        "identity_uuid": winner_uuid,
+                        "observation_uuid": obs_id,
+                    })
+
+            # Re-roll the winner's fingerprint summary from every row in
+            # this component, loser-side rows included: the merge unifies
+            # their evidence even though the loser's identity row remains.
+            await _roll_up_fingerprints(
+                repo, winner_uuid, [row_by_id[m] for m in member_ids],
+            )
+
+        # Pass 2 — revocable-merge undo. For each identity that was already
+        # merged out when this tick started, compare the cluster labels of
+        # its own observations with those of its canonical winner's. No
+        # shared label means new evidence contradicts the merge: clear
+        # merged_into_uuid and emit identity.unmerged. Observations keep
+        # their literal identity_id; only the chain stops following.
+        observations_by_literal_identity: dict[str, list[str]] = {}
+        for obs_id, r in row_by_id.items():
+            iid = r.get("identity_id")
+            if iid:
+                observations_by_literal_identity.setdefault(iid, []).append(obs_id)
+
+        for identity_row in all_identities:
+            if not identity_row.get("merged_into_uuid"):
+                continue
+            loser_uuid = identity_row["uuid"]
+            winner_uuid = identity_chain.get(loser_uuid, loser_uuid)
+            if winner_uuid == loser_uuid:
+                continue  # broken chain — paranoia
+            loser_obs = observations_by_literal_identity.get(loser_uuid, [])
+            winner_obs = observations_by_literal_identity.get(winner_uuid, [])
+            if not loser_obs or not winner_obs:
+                # One side has no observations here — can't disprove the merge.
+                continue
+            loser_clusters = {labels[o] for o in loser_obs}
+            winner_clusters = {labels[o] for o in winner_obs}
+            if loser_clusters & winner_clusters:
+                continue  # still co-clustered with winner — merge stands
+            try:
+                await repo.update_identity_merged_into(loser_uuid, None)
+            except Exception:  # noqa: BLE001
+                log.exception(
+                    "clusterer: failed to unmerge %s from %s",
+                    loser_uuid, winner_uuid,
+                )
+                continue
+            identity_chain[loser_uuid] = loser_uuid
+            result.identities_unmerged.append({
+                "resurrected_uuid": loser_uuid,
+                "former_winner_uuid": winner_uuid,
+            })
+
+        return result
+
+
+def _build_merge_chain(
+    identities: list[dict[str, Any]],
+) -> dict[str, str]:
+    """Map every identity uuid to the end of its ``merged_into_uuid`` chain.
+
+    Returns one entry per uuid in ``identities``; an identity that is not
+    merged out maps to itself. The walk from each uuid stops at the first
+    of: a uuid with no row in ``identities``, an empty pointer, a pointer
+    to itself, or ``_MAX_HOPS`` hops. The hop cap is what bounds an
+    accidental cycle, so the result for a cycle (or an over-long chain)
+    is simply wherever the walk stood when the cap hit.
+    """
+    _MAX_HOPS = 8
+    rows = {identity["uuid"]: identity for identity in identities}
+
+    def resolve(start: str) -> str:
+        here, hops = start, 0
+        while hops < _MAX_HOPS and here in rows:
+            target = rows[here].get("merged_into_uuid")
+            if not target or target == here:
+                break
+            here, hops = target, hops + 1
+        return here
+
+    return {uuid_: resolve(uuid_) for uuid_ in rows}
+
+
+async def _link(
+    repo: BaseRepository, observation_uuid: str, identity_uuid: str,
+) -> bool:
+    """Point one observation at an identity; report whether it stuck.
+
+    Any failure of ``repo.set_attacker_identity_id`` is logged and turned
+    into ``False`` so the tick's branches need no try/except of their own.
+    """
+    linked = False
+    try:
+        await repo.set_attacker_identity_id(observation_uuid, identity_uuid)
+    except Exception:  # noqa: BLE001
+        msg = "clusterer: failed to link obs=%s -> identity=%s"
+        log.exception(msg, observation_uuid, identity_uuid)
+    else:
+        linked = True
+    return linked
+
+
+async def _roll_up_fingerprints(
+    repo: BaseRepository,
+    identity_uuid: str,
+    member_rows: list[dict[str, Any]],
+) -> None:
+    """Project member observations' fingerprint blobs onto the identity's
+    summary columns. Best-effort: a failure to extract *or* to write the
+    summaries is logged but never breaks the clusterer tick — the columns
+    just stay stale until the next pass."""
+    try:
+        # Extraction is guarded too, so a row it cannot digest doesn't abort the tick.
+        summaries = extract_fp_summaries(member_rows)
+        await repo.update_identity_fingerprints(identity_uuid, **summaries)
+    except Exception:  # noqa: BLE001
+        log.exception(
+            "clusterer: failed to roll up fingerprints for identity=%s", identity_uuid,
+        )
+
+
+__all__ = [
+    "ConnectedComponentsClusterer",
+    "cluster_observations",
+    "from_attacker_row",
+]
--- a/decnet/clustering/impl/similarity.py
+++ b/decnet/clustering/impl/similarity.py
@@ -0,0 +1,313 @@
+"""Similarity-graph primitives for the connected-components clusterer.
+
+Each function takes two :class:`Observation` projections and returns a
+similarity score in ``[0.0, 1.0]``. The connected-components impl
+(landing in subsequent commits) decides how to combine these into a
+single edge weight, applies a threshold, and runs union-find.
+
+**Time-agnostic.** Edges MUST NOT depend on observation timestamps.
+Fixture 7 (``slow_burn``) proves recency-decay clustering fragments
+multi-month APT campaigns; the production graph cannot silently expire
+old edges. Timestamps are still useful for *audit* (the ``first_seen``
+on the resulting identity row) but never for *similarity*.
+
+**Weight tiers** (from ``development/IDENTITY_RESOLUTION.md``):
+
+* High — JA3 / HASSH / payload-hash / C2-callback exact match. Stable
+  signals an attacker can't cheaply rotate. A single high-tier match
+  supports identity strongly.
+* Medium — command-sequence Jaccard, bucketed by UKC phase. Tooling
+  habits leak through command order; phase-bucketing avoids comparing
+  a Discovery cmd-list to an Exploitation one.
+* Low — credential-attempt-set Jaccard. Defeated alone by fixture 1
+  (``shared_wordlist``) where two campaigns share rockyou but diverge
+  on infra.
+* Very low — ASN match. Defeated alone by fixture 2 (``vpn_hopping``)
+  where one identity rotates across many ASNs.
+
+The functions are pure (no DB, no I/O); the worker maps observations
+into :class:`Observation` once per tick and feeds these into the
+graph builder.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Mapping, Optional
+
+# ─── Observation projection ─────────────────────────────────────────────────
+
+
+@dataclass(frozen=True)
+class Observation:
+    """Minimal projection of a per-IP attacker observation.
+
+    Built once per ``Attacker`` row by the worker (or per
+    ``SyntheticAttacker`` in tests via :func:`from_synthetic`).
+    Keeping the projection tight isolates the graph code from schema
+    drift on either side.
+
+    All set-typed fields are :class:`frozenset` so they hash and so
+    callers don't accidentally mutate them mid-pass.
+
+    Instances are hashable: ``commands_by_phase`` is a mapping (not
+    hashable in general), so it is excluded from the generated
+    ``__hash__`` while still taking part in ``__eq__``. Equal
+    observations therefore still hash equal.
+    """
+
+    observation_id: str
+    """Stable ID — for production, the ``Attacker.uuid``; for tests,
+    the ``SyntheticAttacker.attacker_id``."""
+
+    ja3: Optional[str] = None
+    hassh: Optional[str] = None
+    asn: Optional[int] = None
+
+    payload_hashes: frozenset[str] = field(default_factory=frozenset)
+    c2_endpoints: frozenset[str] = field(default_factory=frozenset)
+    credentials: frozenset[tuple[str, str]] = field(default_factory=frozenset)
+
+    # hash=False: the default value is a plain dict, which would make
+    # hash(Observation(...)) raise TypeError if it fed the generated
+    # __hash__. The field is still compared for equality.
+    commands_by_phase: Mapping[str, tuple[str, ...]] = field(
+        default_factory=dict, hash=False,
+    )
+    """``UKCPhase.value`` → ordered command sequence observed in that
+    phase. Empty dict when no command-bearing sessions were seen."""
+
+
+# ─── Edge functions ─────────────────────────────────────────────────────────
+
+
+def _fingerprints_fully_disagree(a: Observation, b: Observation) -> bool:
+    """Report whether every comparable fingerprint slot differs.
+
+    A slot (JA3 or HASSH) is *comparable* when both observations carry
+    a non-null value for it. The result is a soft veto on shared
+    C2 / payload signals: two observations with distinct stable
+    TLS + SSH stacks that share a C2 endpoint look like cooperating
+    operators (a *campaign*-level signal), not one identity. Fixture 5
+    (``multi_operator``) is the canonical demonstration.
+
+    Returns ``False`` when no slot is comparable (no evidence of
+    disagreement, so no veto) and ``False`` as soon as any comparable
+    slot agrees. Returns ``True`` only when at least one slot is
+    comparable and all comparable slots differ.
+    """
+    comparable = [
+        (mine, theirs)
+        for mine, theirs in ((a.ja3, b.ja3), (a.hassh, b.hassh))
+        if mine is not None and theirs is not None
+    ]
+    if not comparable:
+        return False
+    return all(mine != theirs for mine, theirs in comparable)
+
+
+def high_weight_edge(a: Observation, b: Observation) -> float:
+    """Score the high tier: JA3 / HASSH / payload-hash / C2 exact match.
+
+    Yields ``1.0`` when any of the four signals matches (non-null /
+    non-empty on both sides) and ``0.0`` otherwise. One matching
+    high-tier signal is deliberately enough — these are the signals
+    the design doc describes as "stable signals an attacker can't
+    cheaply rotate."
+
+    Evaluation order:
+
+    1. A JA3 or HASSH match returns ``1.0`` immediately; nothing can
+       veto an agreeing tooling fingerprint.
+    2. **Fingerprint-disagreement veto.** Payload and C2 are infra
+       signals that cooperating operators (distinct identities) can
+       share, whereas JA3 / HASSH differ between distinct tool stacks.
+       If every comparable fingerprint slot disagrees, the payload/C2
+       contribution is dropped and the score is ``0.0``. Fixture 5
+       (``multi_operator``) — shared stage-1 payload and C2, distinct
+       JA3/HASSH per operator — must stay two identities.
+    3. Otherwise a shared payload hash or shared C2 endpoint returns
+       ``1.0``.
+
+    JA4 will join this tier as a sibling of JA3 once the prober emits
+    it (``ATTACKER_FINGERPRINTED`` already carries a JA4 slot in
+    ``AttackerIdentity``); the function shape doesn't change.
+    """
+    # Tooling fingerprints: a non-null value equal on both sides wins.
+    for mine, theirs in ((a.ja3, b.ja3), (a.hassh, b.hassh)):
+        if mine is not None and mine == theirs:
+            return 1.0
+
+    # Stable-tool disagreement vetoes shared-infra signals.
+    if _fingerprints_fully_disagree(a, b):
+        return 0.0
+
+    # Infra signals: any overlap between two non-empty sets counts.
+    infra_pairs = (
+        (a.payload_hashes, b.payload_hashes),
+        (a.c2_endpoints, b.c2_endpoints),
+    )
+    for mine, theirs in infra_pairs:
+        if mine and theirs and (mine & theirs):
+            return 1.0
+    return 0.0
+
+
+def medium_weight_edge(a: Observation, b: Observation) -> float:
+    """Score the medium tier: phase-bucketed command Jaccard.
+
+    Only UKC phases present on *both* observations are compared —
+    a Discovery command list against an Exploitation one carries no
+    meaning. Within each shared phase the command sequences are
+    collapsed to sets (order is ignored here) and their Jaccard
+    similarity is computed.
+
+    The result is the **maximum** Jaccard over the shared phases, so
+    one strong phase match is not diluted by a phase where the actors
+    diverge. Shared phases whose command sets are both empty are
+    skipped. Returns ``0.0`` when there is no shared phase or nothing
+    to compare.
+    """
+    common_phases = a.commands_by_phase.keys() & b.commands_by_phase.keys()
+    scores: list[float] = []
+    for phase in common_phases:
+        left = set(a.commands_by_phase[phase])
+        right = set(b.commands_by_phase[phase])
+        combined = left | right
+        if combined:
+            scores.append(len(left & right) / len(combined))
+    return max(scores, default=0.0)
+
+
+def low_weight_edge(a: Observation, b: Observation) -> float:
+    """Score the low tier: Jaccard of attempted credentials.
+
+    Compares the sets of ``(username, password)`` tuples. Two campaigns
+    working through the same wordlist score high here — fixture 1
+    shows why this signal is dangerous on its own — so the
+    connected-components impl must combine it with other signals and
+    never let it cross threshold alone.
+
+    Returns ``0.0`` when either side attempted no credentials.
+    """
+    mine, theirs = a.credentials, b.credentials
+    if not (mine and theirs):
+        return 0.0
+    # Both sets are non-empty here, so the union cannot be empty.
+    return len(mine & theirs) / len(mine | theirs)
+
+
+def very_low_weight_edge(a: Observation, b: Observation) -> float:
+    """Score the very-low tier: ASN equality.
+
+    ``1.0`` when both observations carry a non-null ASN and the two
+    are equal; ``0.0`` in every other case. Fixture 2
+    (``vpn_hopping``) shows ASN-only clustering is a failure mode —
+    one identity legitimately rotates across many ASNs — so the
+    combination logic weights this tier such that ASN agreement alone
+    never crosses threshold.
+    """
+    both_known = a.asn is not None and b.asn is not None
+    return 1.0 if both_known and a.asn == b.asn else 0.0
+
+
+# ─── Combined weight ────────────────────────────────────────────────────────
+
+#: Tier multipliers applied to the per-tier edge scores when combining
+#: into a single weight. Tuned so that:
+#:
+#: * High-tier agreement alone (1.0) crosses the 1.0 threshold.
+#: * Medium-tier alone (max 1.0) yields 0.6 — below threshold.
+#: * Low-tier alone (max 1.0) yields 0.2 — defeats fixture 1's
+#:   credential-overlap-only failure mode.
+#: * Very-low alone (max 1.0) yields 0.05 — defeats fixture 2's
+#:   ASN-rotation failure mode.
+#: * Medium + low + very-low at their maxima sum to 0.85 — still
+#:   below threshold, so this weighted sum cannot reach threshold
+#:   without a high-tier match.
+#:
+#: The ratio between tiers matters more than the absolute values: a
+#: tier should never combine its way past threshold without help from
+#: a stronger one.
+TIER_WEIGHTS = {
+    "high": 1.0,
+    "medium": 0.6,
+    "low": 0.2,
+    "very_low": 0.05,
+}
+
+#: Threshold a combined edge weight must meet to survive into the
+#: similarity graph. The connected-components impl drops anything
+#: under this before running union-find.
+EDGE_THRESHOLD = 1.0
+
+
+def combined_edge_weight(a: Observation, b: Observation) -> float:
+    """Combine the four tier scores into one weighted edge weight.
+
+    Each per-tier function returns a score in ``[0, 1]``; each score
+    is multiplied by its entry in :data:`TIER_WEIGHTS` and the products
+    are added in tier order (high, medium, low, very_low). Stronger
+    tiers dominate, and weaker ones cannot combine their way past
+    threshold.
+
+    The connected-components clusterer compares the result against
+    :data:`EDGE_THRESHOLD` to decide whether to draw an edge. Pure /
+    time-agnostic — fixture 7 forbids recency-decay weighting.
+
+    Tiers whose inputs are missing contribute ``0.0`` because the
+    underlying functions return ``0.0`` in that case. Phase-handoff is
+    a separate edge family, not a tier, and is not part of this sum.
+    """
+    tier_scorers = (
+        ("high", high_weight_edge),
+        ("medium", medium_weight_edge),
+        ("low", low_weight_edge),
+        ("very_low", very_low_weight_edge),
+    )
+    # Plain left-to-right accumulation keeps the float result identical
+    # to a chained ``+`` expression.
+    total = 0.0
+    for tier, scorer in tier_scorers:
+        total += TIER_WEIGHTS[tier] * scorer(a, b)
+    return total
+
+
+# ─── Adapter for the synthetic-corpus tests ─────────────────────────────────
+
+
+def from_synthetic(att) -> Observation:  # type: ignore[no-untyped-def]
+    """Project a ``SyntheticAttacker`` onto an :class:`Observation`.
+
+    Kept in this module so test code does not leak the factory shape
+    into production code — the adapter is a documented integration
+    point. Callers import it lazily.
+
+    Walks ``att.sessions`` once, collecting:
+
+    * every truthy ``payload_hash`` and ``c2_callback``,
+    * every entry of ``credentials_tried`` (coerced to a tuple),
+    * ``commands`` appended per ``phase.value``, preserving session
+      order within each phase.
+
+    ``ja3`` / ``hassh`` / ``asn`` and the ID are copied straight from
+    *att*.
+    """
+    hashes_seen: set[str] = set()
+    c2_seen: set[str] = set()
+    creds_seen: set[tuple[str, str]] = set()
+    cmds_per_phase: dict[str, list[str]] = {}
+
+    for session in att.sessions:
+        if session.payload_hash:
+            hashes_seen.add(session.payload_hash)
+        if session.c2_callback:
+            c2_seen.add(session.c2_callback)
+        creds_seen.update(tuple(pair) for pair in session.credentials_tried)
+        if session.commands:
+            bucket = cmds_per_phase.setdefault(session.phase.value, [])
+            bucket.extend(session.commands)
+
+    frozen_cmds = {
+        phase: tuple(cmds) for phase, cmds in cmds_per_phase.items()
+    }
+    return Observation(
+        observation_id=att.attacker_id,
+        ja3=att.ja3,
+        hassh=att.hassh,
+        asn=att.asn,
+        payload_hashes=frozenset(hashes_seen),
+        c2_endpoints=frozenset(c2_seen),
+        credentials=frozenset(creds_seen),
+        commands_by_phase=frozen_cmds,
+    )
+
+
+# Public surface of the similarity module: the projection type, the
+# four per-tier scorers, the combined weight, the synthetic adapter,
+# and the tuning constants.
+__all__ = [
+    "Observation",
+    "high_weight_edge",
+    "medium_weight_edge",
+    "low_weight_edge",
+    "very_low_weight_edge",
+    "combined_edge_weight",
+    "from_synthetic",
+    "EDGE_THRESHOLD",
+    "TIER_WEIGHTS",
+]
--- a/decnet/clustering/ukc.py
+++ b/decnet/clustering/ukc.py
@@ -0,0 +1,108 @@
+"""
+Unified Kill Chain phase vocabulary (Pols, 2017).
+
+Used as the canonical phase enum for campaign clustering and (eventually)
+the MITRE ATT&CK / TTPs-tagging worker. UKC tactic names map cleanly onto
+ATT&CK tactics, so emitting these labels in synthetic data and runtime
+phase inference avoids a renaming pass when TTP-tagging lands.
+
+A honeypot does not observe the entire chain. Pre-target phases (OSINT
+reconnaissance, resource development, weaponization, social engineering)
+happen before any decky is touched. The DSL allows the full enum so a
+campaign spec can describe an end-to-end story; the synthetic generator
+emits no events for unobservable phases.
+"""
+from __future__ import annotations
+
+from enum import Enum
+
+
+class UKCPhase(str, Enum):
+    """Unified Kill Chain phases, listed in In / Through / Out order.
+
+    Members are ``str`` subclasses; each value is the lowercase
+    snake_case form of the member name.
+    """
+
+    # In — initial foothold
+    RECONNAISSANCE = "reconnaissance"
+    RESOURCE_DEVELOPMENT = "resource_development"
+    WEAPONIZATION = "weaponization"
+    DELIVERY = "delivery"
+    SOCIAL_ENGINEERING = "social_engineering"
+    EXPLOITATION = "exploitation"
+    PERSISTENCE = "persistence"
+    DEFENSE_EVASION = "defense_evasion"
+    COMMAND_AND_CONTROL = "command_and_control"
+    # Through — network propagation
+    PIVOTING = "pivoting"
+    DISCOVERY = "discovery"
+    PRIVILEGE_ESCALATION = "privilege_escalation"
+    EXECUTION = "execution"
+    CREDENTIAL_ACCESS = "credential_access"
+    LATERAL_MOVEMENT = "lateral_movement"
+    # Out — action on objectives
+    COLLECTION = "collection"
+    EXFILTRATION = "exfiltration"
+    IMPACT = "impact"
+    OBJECTIVES = "objectives"
+
+
+# Phases a honeypot can plausibly observe. Pre-target phases are excluded:
+# OSINT recon, infrastructure-stand-up, payload authoring, and human-target
+# manipulation all happen before the attacker touches a decky. The synthetic
+# generator validates campaign specs against this set and warns (but does
+# not error) on unobservable phases — a campaign can describe them; we just
+# emit no events.
+#
+# Concretely, the four UKCPhase members left out of this set are
+# RECONNAISSANCE, RESOURCE_DEVELOPMENT, WEAPONIZATION and
+# SOCIAL_ENGINEERING; the other fifteen are listed explicitly.
+OBSERVABLE_PHASES: frozenset[UKCPhase] = frozenset({
+    UKCPhase.DELIVERY,
+    UKCPhase.EXPLOITATION,
+    UKCPhase.PERSISTENCE,
+    UKCPhase.DEFENSE_EVASION,
+    UKCPhase.COMMAND_AND_CONTROL,
+    UKCPhase.PIVOTING,
+    UKCPhase.DISCOVERY,
+    UKCPhase.PRIVILEGE_ESCALATION,
+    UKCPhase.EXECUTION,
+    UKCPhase.CREDENTIAL_ACCESS,
+    UKCPhase.LATERAL_MOVEMENT,
+    UKCPhase.COLLECTION,
+    UKCPhase.EXFILTRATION,
+    UKCPhase.IMPACT,
+    UKCPhase.OBJECTIVES,
+})
+
+
+# Stage groupings — useful for the multi_operator fixture (operators tend
+# to split along the In / Through / Out boundary) and for downstream
+# UI rendering of campaign timelines.
+#
+# The three sets are disjoint and together list every UKCPhase member
+# (9 + 6 + 4). Keep it that way when adding a phase: ``stage_of`` reports
+# "out" for anything that is in neither STAGE_IN nor STAGE_THROUGH.
+STAGE_IN: frozenset[UKCPhase] = frozenset({
+    UKCPhase.RECONNAISSANCE,
+    UKCPhase.RESOURCE_DEVELOPMENT,
+    UKCPhase.WEAPONIZATION,
+    UKCPhase.DELIVERY,
+    UKCPhase.SOCIAL_ENGINEERING,
+    UKCPhase.EXPLOITATION,
+    UKCPhase.PERSISTENCE,
+    UKCPhase.DEFENSE_EVASION,
+    UKCPhase.COMMAND_AND_CONTROL,
+})
+
+STAGE_THROUGH: frozenset[UKCPhase] = frozenset({
+    UKCPhase.PIVOTING,
+    UKCPhase.DISCOVERY,
+    UKCPhase.PRIVILEGE_ESCALATION,
+    UKCPhase.EXECUTION,
+    UKCPhase.CREDENTIAL_ACCESS,
+    UKCPhase.LATERAL_MOVEMENT,
+})
+
+STAGE_OUT: frozenset[UKCPhase] = frozenset({
+    UKCPhase.COLLECTION,
+    UKCPhase.EXFILTRATION,
+    UKCPhase.IMPACT,
+    UKCPhase.OBJECTIVES,
+})
+
+
+def stage_of(phase: UKCPhase) -> str:
+    """Map *phase* to its stage label: ``'in'``, ``'through'`` or ``'out'``.
+
+    Membership is checked against ``STAGE_IN`` and then
+    ``STAGE_THROUGH``; anything found in neither is reported as
+    ``'out'`` (``STAGE_OUT`` itself is not consulted).
+    """
+    for label, members in (("in", STAGE_IN), ("through", STAGE_THROUGH)):
+        if phase in members:
+            return label
+    return "out"
--- a/decnet/clustering/worker.py
+++ b/decnet/clustering/worker.py
@@ -0,0 +1,180 @@
+"""Long-running identity-resolution clusterer worker.
+
+Runs :meth:`Clusterer.tick` on bus-wake or slow-tick fallback. Mirrors
+:mod:`decnet.intel.worker` and :mod:`decnet.correlation.reuse_worker`:
+woken on ``attacker.observed`` and ``attacker.scored`` for sub-second
+latency, falls back to a 60s poll when the bus is unavailable.
+
+The clusterer itself owns its DB writes (``attacker_identities`` +
+``attackers.identity_id`` updates). The worker shell is responsible only
+for:
+
+* lifecycle (bus connect, heartbeat, control listener, clean shutdown),
+* publishing ``identity.formed`` / ``identity.observation.linked`` /
+  ``identity.merged`` / ``identity.unmerged`` from the
+  :class:`ClusterResult` returned by ``tick``.
+
+The skeleton ``ConnectedComponentsClusterer.tick`` returns an empty
+result, so this worker runs but emits no identity events until edges
+are wired in.
+"""
+from __future__ import annotations
+
+import asyncio
+import contextlib
+from typing import Optional
+
+from decnet.bus import topics as _topics
+from decnet.bus.base import BaseBus
+from decnet.bus.factory import get_bus
+from decnet.bus.publish import (
+    publish_safely,
+    run_control_listener_signal as _run_control_listener_signal,
+    run_health_heartbeat as _run_health_heartbeat,
+)
+from decnet.clustering.base import Clusterer, ClusterResult
+from decnet.clustering.factory import get_clusterer
+from decnet.logging import get_logger
+from decnet.web.db.repository import BaseRepository
+
+log = get_logger("clustering.worker")
+
+_DEFAULT_POLL_SECS = 60.0
+
+
+async def run_clusterer_loop(
+    repo: BaseRepository,
+    *,
+    poll_interval_secs: float = _DEFAULT_POLL_SECS,
+    clusterer: Optional[Clusterer] = None,
+    shutdown: Optional[asyncio.Event] = None,
+) -> None:
+    """Run the identity clusterer until cancelled or told to stop.
+
+    Each iteration calls ``clusterer.tick(repo)``, publishes the
+    returned :class:`ClusterResult` on the bus, then idles until a bus
+    wake, the *shutdown* event, or *poll_interval_secs* — whichever
+    comes first.
+
+    Args:
+        repo: Repository handed to every ``tick`` call.
+        poll_interval_secs: Upper bound on the idle wait between ticks.
+        clusterer: Implementation to drive; defaults to
+            :func:`get_clusterer` — tests pass a fake.
+        shutdown: Optional external stop signal. Setting it ends the
+            idle wait immediately, so the loop exits without waiting
+            out the poll interval.
+
+    The loop also exits cleanly (logs and returns) on
+    :class:`asyncio.CancelledError` and :class:`KeyboardInterrupt`.
+    A failing ``tick`` is logged and treated as an empty result. If
+    the bus cannot be set up, the loop runs in poll-only mode.
+    """
+    if clusterer is None:
+        clusterer = get_clusterer()
+    log.info(
+        "clusterer started impl=%s poll_interval_secs=%s",
+        clusterer.name, poll_interval_secs,
+    )
+
+    bus: Optional[BaseBus] = None
+    wake = asyncio.Event()
+    wake_tasks: list[asyncio.Task] = []
+    heartbeat_task: Optional[asyncio.Task] = None
+    try:
+        candidate = get_bus(client_name="clusterer")
+        await candidate.connect()
+        # Only publish through the bus once connect() has succeeded.
+        bus = candidate
+        wake_tasks.append(asyncio.create_task(
+            _wake_on(bus, wake, _topics.attacker(_topics.ATTACKER_OBSERVED)),
+        ))
+        wake_tasks.append(asyncio.create_task(
+            _wake_on(bus, wake, _topics.attacker(_topics.ATTACKER_SCORED)),
+        ))
+        heartbeat_task = asyncio.create_task(
+            _run_health_heartbeat(bus, "clusterer"),
+        )
+        # Tracked with the wake tasks so it is cancelled with them.
+        wake_tasks.append(asyncio.create_task(
+            _run_control_listener_signal(bus, "clusterer"),
+        ))
+    except Exception as exc:  # noqa: BLE001
+        log.warning(
+            "clusterer: bus unavailable, running in poll-only mode: %s", exc,
+        )
+
+    if shutdown is None:
+        shutdown = asyncio.Event()
+
+    try:
+        while not shutdown.is_set():
+            try:
+                result = await clusterer.tick(repo)
+            except Exception:  # noqa: BLE001
+                log.exception("clusterer: tick failed")
+                result = ClusterResult()
+
+            await _publish_result(bus, result)
+
+            # Idle until woken, stopped, or the poll interval elapses.
+            # Waiting on ``shutdown`` too means a stop request is not
+            # delayed by up to a full poll interval.
+            await _wait_for_wake_or_shutdown(
+                wake, shutdown, float(poll_interval_secs),
+            )
+            wake.clear()
+    except (asyncio.CancelledError, KeyboardInterrupt):
+        log.info("clusterer stopped")
+    finally:
+        for t in wake_tasks:
+            t.cancel()
+        if heartbeat_task is not None:
+            heartbeat_task.cancel()
+        # Await every cancelled task so none is left pending; their
+        # cancellation (or any late error) is deliberately ignored.
+        for t in (*wake_tasks, heartbeat_task):
+            if t is None:
+                continue
+            with contextlib.suppress(asyncio.CancelledError, Exception):
+                await t
+        if bus is not None:
+            with contextlib.suppress(Exception):
+                await bus.close()
+
+
+async def _wait_for_wake_or_shutdown(
+    wake: asyncio.Event, shutdown: asyncio.Event, timeout: float,
+) -> None:
+    """Return once *wake* or *shutdown* is set, or after *timeout* seconds."""
+    waiters = [
+        asyncio.ensure_future(wake.wait()),
+        asyncio.ensure_future(shutdown.wait()),
+    ]
+    try:
+        await asyncio.wait(
+            waiters, timeout=timeout, return_when=asyncio.FIRST_COMPLETED,
+        )
+    finally:
+        # Runs on normal return and on cancellation alike, so no waiter
+        # task outlives this call.
+        for waiter in waiters:
+            waiter.cancel()
+        await asyncio.gather(*waiters, return_exceptions=True)
+
+
+async def _publish_result(bus: Optional[BaseBus], result: ClusterResult) -> None:
+    """Publish every entry of *result* on its ``identity.*`` topic.
+
+    Entries go out via ``publish_safely`` in a fixed order: formed,
+    then observation-linked, then merged, then unmerged. Each entry is
+    sent as-is as the payload, with the topic constant doubling as the
+    ``event_type``.
+    """
+    fanout = (
+        (result.identities_formed, _topics.IDENTITY_FORMED),
+        (result.observations_linked, _topics.IDENTITY_OBSERVATION_LINKED),
+        (result.identities_merged, _topics.IDENTITY_MERGED),
+        (result.identities_unmerged, _topics.IDENTITY_UNMERGED),
+    )
+    for payloads, event_type in fanout:
+        for payload in payloads:
+            await publish_safely(
+                bus,
+                _topics.identity(event_type),
+                payload,
+                event_type=event_type,
+            )
+
+
+async def _wake_on(bus: BaseBus, wake: asyncio.Event, pattern: str) -> None:
+    """Flip *wake* every time *pattern* fires on the bus.
+
+    Survives transient subscriber errors by logging and exiting; the
+    poll-interval fallback keeps the loop alive in poll-only mode.
+
+    This coroutine does not resubscribe: once the subscription ends or
+    fails it returns, and nothing in this function restarts it.
+    """
+    try:
+        sub = bus.subscribe(pattern)
+        async with sub:
+            # The event payload is ignored; only its arrival matters.
+            async for _event in sub:
+                wake.set()
+    except asyncio.CancelledError:
+        # Let cancellation propagate so the owner that cancelled this
+        # task sees it finish as cancelled rather than as a logged error.
+        raise
+    except Exception as exc:  # noqa: BLE001
+        log.warning(
+            "clusterer: subscriber for %s died (%s); falling back to poll",
+            pattern, exc,
+        )
+
+
+# Only the loop entry point is public; the publish / wake helpers are private.
+__all__ = ["run_clusterer_loop"]
				`@@ -0,0 +1 @@`
				`"""Campaign clustering — see development/CAMPAIGN_CLUSTERING.md."""`