feat(clustering): UKC phase enum + synthetic campaign factory + metric harness

Pre-implementation scaffolding for campaign clustering. The simulator is the spec — algorithm code follows once fixtures + metrics are stable.

* decnet/clustering/ukc.py — UKCPhase enum (19 phases across In/Through/Out stages), OBSERVABLE_PHASES set, stage_of() helper. Vocabulary aligns with future MITRE ATT&CK tagging so synthetic data and runtime phase inference don't need renaming when TTP-tagging lands.
* tests/factories/campaign_factory.py — YAML DSL parser + deterministic generator emitting truth-labeled SyntheticAttacker / SyntheticSession records. Validates phase names, warns on unobservable phases, supports multi-campaign + noise corpora.
* tests/clustering/metrics.py — pure-Python ARI / homogeneity / completeness / singleton_recall (no sklearn dep). Decided before any algorithm exists, on purpose.
* tests/fixtures/campaigns/lone_wolf.{yaml,expected.yaml} — fixture 3 from the design doc; simplest of the six, exercises the full pipeline with an identity-clusterer placeholder.
* development/CAMPAIGN_CLUSTERING.md — design spec for the feature.
* development/DEVELOPMENT_V2.md — note on DSL evolution path (concurrent phases, multi-actor per phase) deferred post-v1.
2026-04-26 06:29:10 -04:00
parent 3eb67c9400
commit 00254629f8
13 changed files with 1227 additions and 0 deletions
--- a/tests/clustering/__init__.py
+++ b/tests/clustering/__init__.py
--- a/tests/clustering/metrics.py
+++ b/tests/clustering/metrics.py
@@ -0,0 +1,179 @@
+"""
+Clustering metric harness — see development/CAMPAIGN_CLUSTERING.md §3.
+
+Decided BEFORE any clustering algorithm exists, on purpose: if the
+metrics get picked after seeing results, they'll flatter whatever the
+algorithm happens to produce.
+
+Four metrics, none on its own sufficient:
+
+  * Adjusted Rand Index — headline number, chance-corrected agreement
+    between predicted clusters and ground truth.
+  * Homogeneity — each predicted cluster contains only one true class.
+    Catches FALSE MERGES (campaigns wrongly fused).
+  * Completeness — every member of a true class lands in the same
+    predicted cluster. Catches FALSE SPLITS (one campaign wrongly torn
+    apart).
+  * Singleton recall — fraction of ground-truth singletons (lone wolves,
+    background noise) that are kept singleton by the clusterer.
+
+Implemented from first principles in pure Python so the test harness
+doesn't pull sklearn/numpy into the runtime dependency surface.
+"""
+from __future__ import annotations
+
+import math
+from collections import Counter, defaultdict
+
+
+def _comb2(n: int) -> int:
+    """C(n, 2) — number of unordered pairs from n items."""
+    return n * (n - 1) // 2 if n >= 2 else 0
+
+
+def adjusted_rand_index(truth: dict[str, str], pred: dict[str, str]) -> float:
+    """
+    Adjusted Rand Index between two clusterings over the same item set.
+
+    Range: typically [0, 1]; can dip negative for worse-than-random
+    labelings. 1.0 = identical partitions (up to label renaming),
+    0.0 ≈ chance agreement.
+
+    Both args map item_id -> cluster_id. Items must align exactly.
+    """
+    if set(truth) != set(pred):
+        raise ValueError(
+            "ARI requires identical item sets in truth and pred "
+            f"(missing in pred: {set(truth) - set(pred)}, "
+            f"missing in truth: {set(pred) - set(truth)})"
+        )
+    n = len(truth)
+    if n < 2:
+        return 1.0  # trivially "agree" on <2 items
+
+    # Build the contingency table n_ij = |cluster_i ∩ class_j|.
+    contingency: dict[tuple[str, str], int] = defaultdict(int)
+    for item, t_label in truth.items():
+        p_label = pred[item]
+        contingency[(p_label, t_label)] += 1
+
+    sum_comb = sum(_comb2(v) for v in contingency.values())
+    a_counts = Counter(pred.values())   # row sums (predicted clusters)
+    b_counts = Counter(truth.values())  # column sums (true classes)
+    sum_a = sum(_comb2(v) for v in a_counts.values())
+    sum_b = sum(_comb2(v) for v in b_counts.values())
+    total_pairs = _comb2(n)
+
+    expected = (sum_a * sum_b) / total_pairs if total_pairs else 0.0
+    max_index = (sum_a + sum_b) / 2
+    if max_index == expected:
+        # Degenerate: both clusterings are trivially equal in structure
+        # (both all-singletons, or both one-big-cluster). The math forces
+        # this — see the algebra of max_index = expected. The induced
+        # partitions are necessarily identical, so ARI is 1.0. (sklearn
+        # adopts the same convention.)
+        return 1.0
+    return (sum_comb - expected) / (max_index - expected)
+
+
+def _entropy(counts: list[int], total: int) -> float:
+    if total == 0:
+        return 0.0
+    h = 0.0
+    for c in counts:
+        if c == 0:
+            continue
+        p = c / total
+        h -= p * math.log(p)
+    return h
+
+
+def _conditional_entropy(
+    contingency: dict[tuple[str, str], int],
+    given_counts: dict[str, int],
+    total: int,
+) -> float:
+    """H(rows | cols) — i.e. entropy of class within each cluster."""
+    if total == 0:
+        return 0.0
+    h = 0.0
+    by_col: dict[str, list[int]] = defaultdict(list)
+    for (row, col), v in contingency.items():
+        by_col[col].append(v)
+    for col, vs in by_col.items():
+        col_total = given_counts[col]
+        if col_total == 0:
+            continue
+        col_entropy = _entropy(vs, col_total)
+        h += (col_total / total) * col_entropy
+    return h
+
+
+def homogeneity(truth: dict[str, str], pred: dict[str, str]) -> float:
+    """
+    1 - H(truth | pred) / H(truth). 1.0 = each predicted cluster
+    contains only members of a single true class (no false merges).
+    """
+    n = len(truth)
+    if n == 0:
+        return 1.0
+    contingency: dict[tuple[str, str], int] = defaultdict(int)
+    for item, t in truth.items():
+        contingency[(t, pred[item])] += 1
+    truth_counts = Counter(truth.values())
+    pred_counts = Counter(pred.values())
+    h_truth = _entropy(list(truth_counts.values()), n)
+    if h_truth == 0:
+        return 1.0
+    h_truth_given_pred = _conditional_entropy(contingency, dict(pred_counts), n)
+    return 1.0 - (h_truth_given_pred / h_truth)
+
+
+def completeness(truth: dict[str, str], pred: dict[str, str]) -> float:
+    """
+    1 - H(pred | truth) / H(pred). 1.0 = all members of each true class
+    are assigned to the same predicted cluster (no false splits).
+    """
+    n = len(truth)
+    if n == 0:
+        return 1.0
+    contingency: dict[tuple[str, str], int] = defaultdict(int)
+    for item, t in truth.items():
+        contingency[(pred[item], t)] += 1
+    pred_counts = Counter(pred.values())
+    truth_counts = Counter(truth.values())
+    h_pred = _entropy(list(pred_counts.values()), n)
+    if h_pred == 0:
+        return 1.0
+    h_pred_given_truth = _conditional_entropy(contingency, dict(truth_counts), n)
+    return 1.0 - (h_pred_given_truth / h_pred)
+
+
+def singleton_recall(truth: dict[str, str], pred: dict[str, str]) -> float:
+    """
+    Fraction of ground-truth singletons that the clusterer kept singleton.
+
+    A "true singleton" is an item whose truth-campaign has exactly one
+    member (lone wolves, background noise scanners). The metric exists
+    because ARI/homogeneity/completeness all dilute the cost of a
+    clusterer that absorbs noise into real campaigns — and noise
+    absorption is the failure mode that makes campaign attribution
+    useless in practice.
+    """
+    truth_counts = Counter(truth.values())
+    true_singletons = [item for item, t in truth.items() if truth_counts[t] == 1]
+    if not true_singletons:
+        return 1.0
+    pred_counts = Counter(pred.values())
+    kept = sum(1 for item in true_singletons if pred_counts[pred[item]] == 1)
+    return kept / len(true_singletons)
+
+
+def score(truth: dict[str, str], pred: dict[str, str]) -> dict[str, float]:
+    """One-shot bundle the four metrics for fixture reports."""
+    return {
+        "adjusted_rand_index": adjusted_rand_index(truth, pred),
+        "homogeneity": homogeneity(truth, pred),
+        "completeness": completeness(truth, pred),
+        "singleton_recall": singleton_recall(truth, pred),
+    }
--- a/tests/clustering/test_campaign_factory.py
+++ b/tests/clustering/test_campaign_factory.py
@@ -0,0 +1,112 @@
+"""Determinism + DSL-validation tests for the synthetic campaign factory."""
+from __future__ import annotations
+
+import pytest
+
+from decnet.clustering.ukc import UKCPhase
+from tests.factories.campaign_factory import (
+    DSLValidationError,
+    generate,
+)
+
+
+def _minimal_spec() -> dict:
+    return {
+        "campaign": {
+            "id": "c-test",
+            "actors": [{"id": "a-1", "asn": 64512}],
+            "phases": [{"name": "delivery", "actor": "a-1"}],
+            "duration_days": 1,
+        }
+    }
+
+
+def test_generation_is_deterministic_given_seed() -> None:
+    spec = _minimal_spec()
+    a = generate(spec, seed=42)
+    b = generate(spec, seed=42)
+    # IDs are RNG-driven — same seed must produce identical IDs, not
+    # merely identical structure. Otherwise federation gossip and
+    # fixture diffing both break.
+    assert [att.attacker_id for att in a.attackers] == [
+        att.attacker_id for att in b.attackers
+    ]
+    assert [s.session_id for s in a.sessions] == [s.session_id for s in b.sessions]
+
+
+def test_different_seeds_produce_different_ids() -> None:
+    spec = _minimal_spec()
+    a = generate(spec, seed=1)
+    b = generate(spec, seed=2)
+    assert a.attackers[0].attacker_id != b.attackers[0].attacker_id
+
+
+def test_truth_labels_match_dsl() -> None:
+    spec = _minimal_spec()
+    corpus = generate(spec, seed=0)
+    assert corpus.attackers[0].truth_campaign_id == "c-test"
+    assert corpus.attackers[0].truth_actor_id == "a-1"
+    # truth_labels() returns the dict the metric harness consumes.
+    labels = corpus.truth_labels()
+    assert labels[corpus.attackers[0].attacker_id] == "c-test"
+
+
+def test_unobservable_phase_emits_no_events() -> None:
+    spec = _minimal_spec()
+    spec["campaign"]["phases"] = [
+        {"name": "reconnaissance", "actor": "a-1"},  # pre-target, unobservable
+        {"name": "delivery", "actor": "a-1"},
+    ]
+    corpus = generate(spec, seed=0)
+    # Only the delivery phase should produce sessions.
+    assert all(s.phase == UKCPhase.DELIVERY for s in corpus.sessions)
+    assert len(corpus.sessions) == 1
+
+
+def test_unknown_phase_name_raises() -> None:
+    spec = _minimal_spec()
+    spec["campaign"]["phases"] = [{"name": "make_coffee", "actor": "a-1"}]
+    with pytest.raises(DSLValidationError, match="unknown UKC phase"):
+        generate(spec, seed=0)
+
+
+def test_phase_referencing_unknown_actor_raises() -> None:
+    spec = _minimal_spec()
+    spec["campaign"]["phases"] = [{"name": "delivery", "actor": "ghost"}]
+    with pytest.raises(DSLValidationError, match="unknown actor"):
+        generate(spec, seed=0)
+
+
+def test_noise_scanners_are_truth_singletons() -> None:
+    spec = {
+        "corpus": {
+            "campaigns": [_minimal_spec()],
+            "noise": {"scanner_count": 5},
+        }
+    }
+    corpus = generate(spec, seed=0)
+    # 1 campaign actor + 5 noise scanners = 6 distinct truth campaigns.
+    truth_campaigns = {a.truth_campaign_id for a in corpus.attackers}
+    assert len(truth_campaigns) == 6
+
+
+def test_multi_actor_campaign_shares_campaign_id() -> None:
+    spec = {
+        "campaign": {
+            "id": "c-shared",
+            "actors": [
+                {"id": "a-1", "asn": 14061},
+                {"id": "a-2", "asn": 14061},
+            ],
+            "phases": [
+                {"name": "delivery", "actor": "a-1"},
+                {"name": "discovery", "actor": "a-2"},
+            ],
+            "duration_days": 1,
+        }
+    }
+    corpus = generate(spec, seed=0)
+    truth = corpus.truth_labels()
+    # Both attacker rows must point to the SAME truth_campaign_id —
+    # this is the property fixture 5 (multi_operator) hinges on.
+    assert set(truth.values()) == {"c-shared"}
--- a/tests/clustering/test_lone_wolf_fixture.py
+++ b/tests/clustering/test_lone_wolf_fixture.py
@@ -0,0 +1,92 @@
+"""
+End-to-end pipeline test for fixture 3 (lone_wolf).
+
+Loads the YAML spec, runs the synthetic generator, applies a placeholder
+identity clusterer (each attacker → its own cluster), scores against
+the expected bounds. This is the simplest of the six fixtures and is
+deliberately the first one wired up — its ground truth is all
+singletons, so an identity clusterer trivially passes, which proves the
+DSL→factory→metrics pipeline works before any real algorithm is built.
+
+Once the connected-components clusterer (CAMPAIGN_CLUSTERING.md §4)
+lands, this test will swap the placeholder for the real implementation
+and the same fixture must continue to pass.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+import yaml
+
+from tests.clustering.metrics import score
+from tests.factories.campaign_factory import GeneratedCorpus, generate, load_yaml
+
+FIXTURE_DIR = Path(__file__).parent.parent / "fixtures" / "campaigns"
+
+
+def _identity_clusterer(corpus: GeneratedCorpus) -> dict[str, str]:
+    """Every attacker is its own cluster. Trivially correct on lone_wolf."""
+    return {a.attacker_id: f"cluster-{a.attacker_id}" for a in corpus.attackers}
+
+
+def test_lone_wolf_pipeline_passes_bounds() -> None:
+    spec = load_yaml(FIXTURE_DIR / "lone_wolf.yaml")
+    bounds = yaml.safe_load((FIXTURE_DIR / "lone_wolf.expected.yaml").read_text())
+
+    corpus = generate(spec, seed=0)
+    truth = corpus.truth_labels()
+    pred = _identity_clusterer(corpus)
+    metrics = score(truth, pred)
+
+    failures = []
+    for name, bound in bounds.items():
+        observed = metrics[name]
+        if observed < bound["min"]:
+            failures.append(f"{name}={observed:.3f} < min {bound['min']:.3f}")
+    assert not failures, "fixture bounds violated: " + "; ".join(failures)
+
+
+def test_lone_wolf_corpus_shape() -> None:
+    """Sanity: 1 wolf + 8 noise scanners = 9 attackers, 9 sessions."""
+    spec = load_yaml(FIXTURE_DIR / "lone_wolf.yaml")
+    corpus = generate(spec, seed=0)
+    assert len(corpus.attackers) == 9
+    assert len(corpus.sessions) == 9
+    # Every attacker is a truth-singleton (its own campaign).
+    truth_campaigns = {a.truth_campaign_id for a in corpus.attackers}
+    assert len(truth_campaigns) == 9
+
+
+def test_identity_clusterer_fails_on_a_real_campaign() -> None:
+    """
+    Sanity for the harness, NOT a test of the clusterer: a real
+    multi-actor campaign should make the placeholder identity clusterer
+    fail completeness, since each truth-campaign gets fragmented into
+    one-member clusters. If this didn't fail, our metrics would be
+    blind to false splits — and that's the entire point of fixture 4
+    and 5 in the design doc.
+    """
+    spec = {
+        "campaign": {
+            "id": "c-real",
+            "actors": [
+                {"id": "a-1", "asn": 14061},
+                {"id": "a-2", "asn": 14061},
+            ],
+            "phases": [
+                {"name": "delivery", "actor": "a-1"},
+                {"name": "discovery", "actor": "a-2"},
+            ],
+            "duration_days": 1,
+        }
+    }
+    corpus = generate(spec, seed=0)
+    truth = corpus.truth_labels()
+    pred = _identity_clusterer(corpus)
+    metrics = score(truth, pred)
+    # Identity clusterer splits the one true campaign across 2 clusters
+    # → completeness drops below 1.0. This must hold or our metrics
+    # aren't catching what they're supposed to catch.
+    assert metrics["completeness"] < 1.0
+    assert metrics["homogeneity"] == pytest.approx(1.0)  # no false merges, just splits
--- a/tests/clustering/test_metrics.py
+++ b/tests/clustering/test_metrics.py
@@ -0,0 +1,76 @@
+"""Sanity tests for the clustering metric harness."""
+from __future__ import annotations
+
+import pytest
+
+from tests.clustering.metrics import (
+    adjusted_rand_index,
+    completeness,
+    homogeneity,
+    score,
+    singleton_recall,
+)
+
+
+def test_perfect_agreement_scores_one() -> None:
+    truth = {"a": "C1", "b": "C1", "c": "C2", "d": "C2"}
+    # Same partition, different label names — clustering doesn't preserve
+    # names, so renamed-but-isomorphic must still score 1.0.
+    pred = {"a": "X", "b": "X", "c": "Y", "d": "Y"}
+    s = score(truth, pred)
+    assert s["adjusted_rand_index"] == pytest.approx(1.0)
+    assert s["homogeneity"] == pytest.approx(1.0)
+    assert s["completeness"] == pytest.approx(1.0)
+    assert s["singleton_recall"] == pytest.approx(1.0)
+
+
+def test_all_singletons_perfect() -> None:
+    truth = {"a": "A", "b": "B", "c": "C"}
+    pred = {"a": "1", "b": "2", "c": "3"}
+    s = score(truth, pred)
+    assert s["singleton_recall"] == pytest.approx(1.0)
+    assert s["adjusted_rand_index"] == pytest.approx(1.0)
+
+
+def test_false_merge_drops_homogeneity() -> None:
+    truth = {"a": "C1", "b": "C2"}
+    pred = {"a": "X", "b": "X"}  # merged two distinct campaigns
+    assert homogeneity(truth, pred) == pytest.approx(0.0)
+    # Completeness is fine (each true class lives in one cluster).
+    assert completeness(truth, pred) == pytest.approx(1.0)
+
+
+def test_false_split_drops_completeness() -> None:
+    truth = {"a": "C1", "b": "C1"}
+    pred = {"a": "X", "b": "Y"}  # split one campaign into two clusters
+    assert completeness(truth, pred) == pytest.approx(0.0)
+    assert homogeneity(truth, pred) == pytest.approx(1.0)
+
+
+def test_singleton_recall_penalises_noise_absorption() -> None:
+    # 3 lone wolves + 1 real campaign with 2 members.
+    truth = {"w1": "wolf1", "w2": "wolf2", "w3": "wolf3", "c1": "C", "c2": "C"}
+    # Clusterer absorbs all wolves into the campaign.
+    pred = dict.fromkeys(truth, "BIG")
+    assert singleton_recall(truth, pred) == pytest.approx(0.0)
+    # And a clusterer that keeps wolves singleton should score 1.0
+    # on this metric, regardless of what it does with the campaign.
+    pred_ok = {"w1": "1", "w2": "2", "w3": "3", "c1": "C", "c2": "C"}
+    assert singleton_recall(truth, pred_ok) == pytest.approx(1.0)
+
+
+def test_mismatched_item_sets_raises() -> None:
+    with pytest.raises(ValueError):
+        adjusted_rand_index({"a": "X"}, {"b": "Y"})
+
+
+def test_random_labels_low_ari() -> None:
+    # ARI of an arbitrary partition vs. ground truth should be near 0,
+    # not near 1 — this is the chance-correction guarantee.
+    truth = {f"i{n}": f"C{n // 4}" for n in range(20)}
+    # Pred that ignores truth: just shuffles items into 5 buckets in
+    # an order uncorrelated with truth.
+    pred = {f"i{n}": f"X{(n * 7) % 5}" for n in range(20)}
+    ari = adjusted_rand_index(truth, pred)
+    # Loose bound — the point is "much closer to 0 than to 1".
+    assert ari < 0.3