feat(clustering): UKC phase enum + synthetic campaign factory + metric harness
Pre-implementation scaffolding for campaign clustering. The simulator is
the spec — algorithm code follows once fixtures + metrics are stable.
* decnet/clustering/ukc.py — UKCPhase enum (19 phases across In/Through/Out
stages), OBSERVABLE_PHASES set, stage_of() helper. Vocabulary aligns
with future MITRE ATT&CK tagging so synthetic data and runtime phase
inference don't need renaming when TTP-tagging lands.
* tests/factories/campaign_factory.py — YAML DSL parser + deterministic
generator emitting truth-labeled SyntheticAttacker / SyntheticSession
records. Validates phase names, warns on unobservable phases, supports
multi-campaign + noise corpora.
* tests/clustering/metrics.py — pure-Python ARI / homogeneity /
completeness / singleton_recall (no sklearn dep). Decided before any
algorithm exists, on purpose.
* tests/fixtures/campaigns/lone_wolf.{yaml,expected.yaml} — fixture 3
from the design doc; simplest of the six, exercises the full pipeline
with an identity-clusterer placeholder.
* development/CAMPAIGN_CLUSTERING.md — design spec for the feature.
* development/DEVELOPMENT_V2.md — note on DSL evolution path
(concurrent phases, multi-actor per phase) deferred post-v1.
This commit is contained in:
0
tests/clustering/__init__.py
Normal file
0
tests/clustering/__init__.py
Normal file
179
tests/clustering/metrics.py
Normal file
179
tests/clustering/metrics.py
Normal file
@@ -0,0 +1,179 @@
|
||||
"""
|
||||
Clustering metric harness — see development/CAMPAIGN_CLUSTERING.md §3.
|
||||
|
||||
Decided BEFORE any clustering algorithm exists, on purpose: if the
|
||||
metrics get picked after seeing results, they'll flatter whatever the
|
||||
algorithm happens to produce.
|
||||
|
||||
Four metrics, none on its own sufficient:
|
||||
|
||||
* Adjusted Rand Index — headline number, chance-corrected agreement
|
||||
between predicted clusters and ground truth.
|
||||
* Homogeneity — each predicted cluster contains only one true class.
|
||||
Catches FALSE MERGES (campaigns wrongly fused).
|
||||
* Completeness — every member of a true class lands in the same
|
||||
predicted cluster. Catches FALSE SPLITS (one campaign wrongly torn
|
||||
apart).
|
||||
* Singleton recall — fraction of ground-truth singletons (lone wolves,
|
||||
background noise) that are kept singleton by the clusterer.
|
||||
|
||||
Implemented from first principles in pure Python so the test harness
|
||||
doesn't pull sklearn/numpy into the runtime dependency surface.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
|
||||
def _comb2(n: int) -> int:
    """Return C(n, 2), the count of unordered pairs among ``n`` items.

    Any ``n`` smaller than 2 yields 0.
    """
    if n < 2:
        return 0
    return (n * (n - 1)) // 2
|
||||
|
||||
|
||||
def adjusted_rand_index(truth: dict[str, str], pred: dict[str, str]) -> float:
    """
    Compute the Adjusted Rand Index of ``pred`` against ``truth``.

    Both arguments map item_id -> cluster_id and must cover exactly the
    same item ids. Identical partitions score 1.0 regardless of label
    names, chance-level agreement scores about 0.0, and labelings that
    are worse than chance score below zero.

    Fewer than two items is treated as trivial agreement and returns 1.0.

    Raises:
        ValueError: if the key sets of ``truth`` and ``pred`` differ.
    """
    truth_items, pred_items = set(truth), set(pred)
    if truth_items != pred_items:
        raise ValueError(
            "ARI requires identical item sets in truth and pred "
            f"(missing in pred: {truth_items - pred_items}, "
            f"missing in truth: {pred_items - truth_items})"
        )

    n = len(truth)
    if n < 2:
        return 1.0  # no pair exists to disagree on

    # Contingency cells: how many items share a given
    # (predicted cluster, true class) combination.
    cells = Counter((pred[item], t_label) for item, t_label in truth.items())

    # Pair counts: co-clustered in both labelings, in pred only (row
    # sums), and in truth only (column sums).
    pairs_in_both = sum(_comb2(size) for size in cells.values())
    pairs_in_pred = sum(_comb2(size) for size in Counter(pred.values()).values())
    pairs_in_truth = sum(_comb2(size) for size in Counter(truth.values()).values())

    # n >= 2 here, so the total number of pairs is at least 1 and the
    # division below is always defined.
    expected = (pairs_in_pred * pairs_in_truth) / _comb2(n)
    ceiling = (pairs_in_pred + pairs_in_truth) / 2

    if ceiling == expected:
        # Degenerate case: this equality only holds when both labelings
        # are all-singletons or both are one big cluster, i.e. the two
        # partitions coincide. Report perfect agreement instead of 0/0.
        return 1.0
    return (pairs_in_both - expected) / (ceiling - expected)
|
||||
|
||||
|
||||
def _entropy(counts: list[int], total: int) -> float:
    """Return the Shannon entropy (natural log) of a count distribution.

    Each count is turned into a probability by dividing by ``total``.
    Zero counts contribute nothing, and a ``total`` of 0 yields 0.0.
    """
    if total == 0:
        return 0.0
    acc = 0.0
    for count in (c for c in counts if c != 0):
        share = count / total
        acc -= share * math.log(share)
    return acc
|
||||
|
||||
|
||||
def _conditional_entropy(
    contingency: dict[tuple[str, str], int],
    given_counts: dict[str, int],
    total: int,
) -> float:
    """Return H(rows | cols) for a contingency table.

    ``contingency`` is keyed by ``(row_label, col_label)``; the second
    element is the conditioning label. ``given_counts`` holds the item
    count for each conditioning label and ``total`` the overall item
    count. The result is the entropy of the cells within each
    conditioning label, weighted by that label's share of ``total``.
    A ``total`` of 0 yields 0.0.
    """
    if total == 0:
        return 0.0

    # Group the cell counts by their conditioning (column) label.
    cells_by_given: dict[str, list[int]] = defaultdict(list)
    for (_, given_label), cell in contingency.items():
        cells_by_given[given_label].append(cell)

    weighted = 0.0
    for given_label, cells in cells_by_given.items():
        group_size = given_counts[given_label]
        if group_size == 0:
            continue
        weighted += (group_size / total) * _entropy(cells, group_size)
    return weighted
|
||||
|
||||
|
||||
def homogeneity(truth: dict[str, str], pred: dict[str, str]) -> float:
    """
    Homogeneity: 1 - H(truth | pred) / H(truth).

    1.0 means every predicted cluster contains members of a single true
    class only (no false merges). Both arguments map
    item_id -> cluster_id and must cover exactly the same item ids.

    Returns 1.0 for an empty item set and when the truth labeling has
    zero entropy (a single true class), since the ratio is undefined.

    Raises:
        ValueError: if the key sets of ``truth`` and ``pred`` differ.
            Without this check, extra items in ``pred`` would silently
            skew the cluster weights.
    """
    truth_items, pred_items = set(truth), set(pred)
    if truth_items != pred_items:
        raise ValueError(
            "homogeneity requires identical item sets in truth and pred "
            f"(missing in pred: {truth_items - pred_items}, "
            f"missing in truth: {pred_items - truth_items})"
        )

    n = len(truth)
    if n == 0:
        return 1.0

    truth_counts = Counter(truth.values())
    h_truth = _entropy(list(truth_counts.values()), n)
    if h_truth == 0:
        return 1.0

    # Rows are true classes, columns are predicted clusters, so the
    # conditional entropy below conditions on the prediction.
    contingency: dict[tuple[str, str], int] = defaultdict(int)
    for item, t in truth.items():
        contingency[(t, pred[item])] += 1
    pred_counts = Counter(pred.values())

    h_truth_given_pred = _conditional_entropy(contingency, dict(pred_counts), n)
    return 1.0 - (h_truth_given_pred / h_truth)
|
||||
|
||||
|
||||
def completeness(truth: dict[str, str], pred: dict[str, str]) -> float:
    """
    Completeness: 1 - H(pred | truth) / H(pred).

    1.0 means all members of each true class land in the same predicted
    cluster (no false splits). Both arguments map item_id -> cluster_id
    and must cover exactly the same item ids.

    Returns 1.0 for an empty item set and when the predicted labeling
    has zero entropy (a single cluster), since the ratio is undefined.

    Raises:
        ValueError: if the key sets of ``truth`` and ``pred`` differ.
            Without this check, extra items in ``pred`` would silently
            skew the predicted-cluster entropy.
    """
    truth_items, pred_items = set(truth), set(pred)
    if truth_items != pred_items:
        raise ValueError(
            "completeness requires identical item sets in truth and pred "
            f"(missing in pred: {truth_items - pred_items}, "
            f"missing in truth: {pred_items - truth_items})"
        )

    n = len(truth)
    if n == 0:
        return 1.0

    pred_counts = Counter(pred.values())
    h_pred = _entropy(list(pred_counts.values()), n)
    if h_pred == 0:
        return 1.0

    # Rows are predicted clusters, columns are true classes, so the
    # conditional entropy below conditions on the truth.
    contingency: dict[tuple[str, str], int] = defaultdict(int)
    for item, t in truth.items():
        contingency[(pred[item], t)] += 1
    truth_counts = Counter(truth.values())

    h_pred_given_truth = _conditional_entropy(contingency, dict(truth_counts), n)
    return 1.0 - (h_pred_given_truth / h_pred)
|
||||
|
||||
|
||||
def singleton_recall(truth: dict[str, str], pred: dict[str, str]) -> float:
    """
    Fraction of ground-truth singletons that the clusterer kept singleton.

    A "true singleton" is an item whose truth label is held by exactly
    one item (lone wolves, background noise scanners). It counts as
    kept when its predicted cluster also has exactly one member. The
    metric exists because ARI/homogeneity/completeness all dilute the
    cost of a clusterer that absorbs noise into real campaigns.

    Both arguments map item_id -> cluster_id and must cover exactly the
    same item ids. Returns 1.0 when there are no true singletons.

    Raises:
        ValueError: if the key sets of ``truth`` and ``pred`` differ.
            Cluster sizes are counted over all of ``pred``, so extra or
            missing items would silently change which clusters look
            like singletons.
    """
    truth_items, pred_items = set(truth), set(pred)
    if truth_items != pred_items:
        raise ValueError(
            "singleton_recall requires identical item sets in truth and pred "
            f"(missing in pred: {truth_items - pred_items}, "
            f"missing in truth: {pred_items - truth_items})"
        )

    truth_sizes = Counter(truth.values())
    true_singletons = [
        item for item, label in truth.items() if truth_sizes[label] == 1
    ]
    if not true_singletons:
        return 1.0

    pred_sizes = Counter(pred.values())
    kept = sum(1 for item in true_singletons if pred_sizes[pred[item]] == 1)
    return kept / len(true_singletons)
|
||||
|
||||
|
||||
def score(truth: dict[str, str], pred: dict[str, str]) -> dict[str, float]:
    """Compute all four metrics in one call, keyed by metric name.

    The Adjusted Rand Index is evaluated first, then homogeneity,
    completeness and singleton recall.
    """
    report: dict[str, float] = {}
    report["adjusted_rand_index"] = adjusted_rand_index(truth, pred)
    report["homogeneity"] = homogeneity(truth, pred)
    report["completeness"] = completeness(truth, pred)
    report["singleton_recall"] = singleton_recall(truth, pred)
    return report
|
||||
112
tests/clustering/test_campaign_factory.py
Normal file
112
tests/clustering/test_campaign_factory.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Determinism + DSL-validation tests for the synthetic campaign factory."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from decnet.clustering.ukc import UKCPhase
|
||||
from tests.factories.campaign_factory import (
|
||||
DSLValidationError,
|
||||
generate,
|
||||
)
|
||||
|
||||
|
||||
def _minimal_spec() -> dict:
    """Build a fresh single-campaign spec: one actor, one delivery phase."""
    actor = {"id": "a-1", "asn": 64512}
    phase = {"name": "delivery", "actor": "a-1"}
    campaign = {
        "id": "c-test",
        "actors": [actor],
        "phases": [phase],
        "duration_days": 1,
    }
    return {"campaign": campaign}
|
||||
|
||||
|
||||
def test_generation_is_deterministic_given_seed() -> None:
    """Two runs over the same spec and seed must yield identical IDs."""
    spec = _minimal_spec()
    first = generate(spec, seed=42)
    second = generate(spec, seed=42)

    # IDs are RNG-driven, so the same seed must reproduce the exact IDs
    # and not merely the same structure. Otherwise federation gossip
    # and fixture diffing both break.
    first_attacker_ids = [att.attacker_id for att in first.attackers]
    second_attacker_ids = [att.attacker_id for att in second.attackers]
    assert first_attacker_ids == second_attacker_ids

    first_session_ids = [s.session_id for s in first.sessions]
    second_session_ids = [s.session_id for s in second.sessions]
    assert first_session_ids == second_session_ids
|
||||
|
||||
|
||||
def test_different_seeds_produce_different_ids() -> None:
    """Changing the seed must change the first attacker's generated ID."""
    spec = _minimal_spec()
    id_for_seed_1 = generate(spec, seed=1).attackers[0].attacker_id
    id_for_seed_2 = generate(spec, seed=2).attackers[0].attacker_id
    assert id_for_seed_1 != id_for_seed_2
|
||||
|
||||
|
||||
def test_truth_labels_match_dsl() -> None:
    """Truth campaign/actor labels must echo the ids written in the spec."""
    corpus = generate(_minimal_spec(), seed=0)
    attacker = corpus.attackers[0]
    assert attacker.truth_campaign_id == "c-test"
    assert attacker.truth_actor_id == "a-1"

    # truth_labels() is the attacker_id -> campaign_id dict that the
    # metric harness consumes.
    labels = corpus.truth_labels()
    assert labels[corpus.attackers[0].attacker_id] == "c-test"
|
||||
|
||||
|
||||
def test_unobservable_phase_emits_no_events() -> None:
    """A pre-target phase must not generate sessions; delivery still does."""
    unobservable = {"name": "reconnaissance", "actor": "a-1"}  # pre-target
    observable = {"name": "delivery", "actor": "a-1"}
    spec = _minimal_spec()
    spec["campaign"]["phases"] = [unobservable, observable]

    corpus = generate(spec, seed=0)

    # Only the delivery phase should have produced sessions.
    for session in corpus.sessions:
        assert session.phase == UKCPhase.DELIVERY
    assert len(corpus.sessions) == 1
|
||||
|
||||
|
||||
def test_unknown_phase_name_raises() -> None:
    """A phase name outside the UKC vocabulary must be rejected."""
    spec = _minimal_spec()
    bogus_phase = {"name": "make_coffee", "actor": "a-1"}
    spec["campaign"]["phases"] = [bogus_phase]
    with pytest.raises(DSLValidationError, match="unknown UKC phase"):
        generate(spec, seed=0)
|
||||
|
||||
|
||||
def test_phase_referencing_unknown_actor_raises() -> None:
    """A phase that names an undeclared actor must be rejected."""
    spec = _minimal_spec()
    orphan_phase = {"name": "delivery", "actor": "ghost"}
    spec["campaign"]["phases"] = [orphan_phase]
    with pytest.raises(DSLValidationError, match="unknown actor"):
        generate(spec, seed=0)
|
||||
|
||||
|
||||
def test_noise_scanners_are_truth_singletons() -> None:
    """Each noise scanner must get a truth campaign of its own.

    Counting distinct campaign ids alone does not prove singletons: a
    corpus with extra attackers sharing an id would still show six
    distinct ids. Asserting the attacker count as well pins every
    attacker to a unique truth campaign.
    """
    spec = {
        "corpus": {
            "campaigns": [_minimal_spec()],
            "noise": {"scanner_count": 5},
        }
    }
    corpus = generate(spec, seed=0)

    # 1 campaign actor + 5 noise scanners = 6 attackers, each in a
    # distinct truth campaign.
    campaign_ids = [a.truth_campaign_id for a in corpus.attackers]
    assert len(campaign_ids) == 6
    assert len(set(campaign_ids)) == 6
|
||||
|
||||
|
||||
def test_multi_actor_campaign_shares_campaign_id() -> None:
    """Two actors declared in one campaign must share its truth label.

    The attacker count is asserted too, because comparing only the set
    of labels would pass vacuously if a single attacker were generated.
    """
    spec = {
        "campaign": {
            "id": "c-shared",
            "actors": [
                {"id": "a-1", "asn": 14061},
                {"id": "a-2", "asn": 14061},
            ],
            "phases": [
                {"name": "delivery", "actor": "a-1"},
                {"name": "discovery", "actor": "a-2"},
            ],
            "duration_days": 1,
        }
    }
    corpus = generate(spec, seed=0)
    truth = corpus.truth_labels()

    # Both attacker rows must exist and point to the SAME
    # truth_campaign_id. This is the property fixture 5
    # (multi_operator) hinges on.
    assert len(truth) == 2
    assert set(truth.values()) == {"c-shared"}
|
||||
92
tests/clustering/test_lone_wolf_fixture.py
Normal file
92
tests/clustering/test_lone_wolf_fixture.py
Normal file
@@ -0,0 +1,92 @@
|
||||
"""
|
||||
End-to-end pipeline test for fixture 3 (lone_wolf).
|
||||
|
||||
Loads the YAML spec, runs the synthetic generator, applies a placeholder
|
||||
identity clusterer (each attacker → its own cluster), scores against
|
||||
the expected bounds. This is the simplest of the six fixtures and is
|
||||
deliberately the first one wired up — its ground truth is all
|
||||
singletons, so an identity clusterer trivially passes, which proves the
|
||||
DSL→factory→metrics pipeline works before any real algorithm is built.
|
||||
|
||||
Once the connected-components clusterer (CAMPAIGN_CLUSTERING.md §4)
|
||||
lands, this test will swap the placeholder for the real implementation
|
||||
and the same fixture must continue to pass.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
from tests.clustering.metrics import score
|
||||
from tests.factories.campaign_factory import GeneratedCorpus, generate, load_yaml
|
||||
|
||||
# Campaign fixture directory: "fixtures/campaigns" under the directory
# two levels above this file.
FIXTURE_DIR = Path(__file__).parent.parent / "fixtures" / "campaigns"
|
||||
|
||||
|
||||
def _identity_clusterer(corpus: GeneratedCorpus) -> dict[str, str]:
    """Placeholder clusterer: put every attacker in a cluster of its own.

    Returns a mapping attacker_id -> "cluster-<attacker_id>".
    """
    assignment: dict[str, str] = {}
    for attacker in corpus.attackers:
        assignment[attacker.attacker_id] = f"cluster-{attacker.attacker_id}"
    return assignment
|
||||
|
||||
|
||||
def test_lone_wolf_pipeline_passes_bounds() -> None:
    """Run lone_wolf through generate -> cluster -> score and check bounds.

    The expected-bounds YAML maps metric name -> {"min": value}; every
    listed metric must reach its minimum. The file is read as UTF-8
    explicitly so the result does not depend on the platform's locale
    encoding, and an empty bounds file fails the test instead of
    passing with nothing checked.
    """
    spec = load_yaml(FIXTURE_DIR / "lone_wolf.yaml")
    bounds_path = FIXTURE_DIR / "lone_wolf.expected.yaml"
    bounds = yaml.safe_load(bounds_path.read_text(encoding="utf-8"))
    assert bounds, f"no bounds declared in {bounds_path}"

    corpus = generate(spec, seed=0)
    truth = corpus.truth_labels()
    pred = _identity_clusterer(corpus)
    metrics = score(truth, pred)

    # Collect every violation so one run reports all failing metrics.
    failures = []
    for name, bound in bounds.items():
        observed = metrics[name]
        if observed < bound["min"]:
            failures.append(f"{name}={observed:.3f} < min {bound['min']:.3f}")
    assert not failures, "fixture bounds violated: " + "; ".join(failures)
|
||||
|
||||
|
||||
def test_lone_wolf_corpus_shape() -> None:
    """Sanity: 1 wolf + 8 noise scanners = 9 attackers, 9 sessions."""
    corpus = generate(load_yaml(FIXTURE_DIR / "lone_wolf.yaml"), seed=0)

    attacker_count = len(corpus.attackers)
    session_count = len(corpus.sessions)
    assert attacker_count == 9
    assert session_count == 9

    # Every attacker is a truth-singleton: nine distinct campaign ids.
    distinct_campaigns = set()
    for attacker in corpus.attackers:
        distinct_campaigns.add(attacker.truth_campaign_id)
    assert len(distinct_campaigns) == 9
|
||||
|
||||
|
||||
def test_identity_clusterer_fails_on_a_real_campaign() -> None:
    """
    Harness sanity check, NOT a test of the clusterer.

    On a real multi-actor campaign the placeholder identity clusterer
    fragments the single truth campaign into one-member clusters, so
    completeness must fall below 1.0. If it did not, the metrics would
    be blind to false splits, which is the entire point of fixtures 4
    and 5 in the design doc.
    """
    actors = [
        {"id": "a-1", "asn": 14061},
        {"id": "a-2", "asn": 14061},
    ]
    phases = [
        {"name": "delivery", "actor": "a-1"},
        {"name": "discovery", "actor": "a-2"},
    ]
    spec = {
        "campaign": {
            "id": "c-real",
            "actors": actors,
            "phases": phases,
            "duration_days": 1,
        }
    }

    corpus = generate(spec, seed=0)
    metrics = score(corpus.truth_labels(), _identity_clusterer(corpus))

    # The one true campaign is split across separate clusters, so
    # completeness drops below 1.0. This must hold or the metrics are
    # not catching what they are supposed to catch.
    assert metrics["completeness"] < 1.0
    # No false merges happened, only splits.
    assert metrics["homogeneity"] == pytest.approx(1.0)
|
||||
76
tests/clustering/test_metrics.py
Normal file
76
tests/clustering/test_metrics.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""Sanity tests for the clustering metric harness."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.clustering.metrics import (
|
||||
adjusted_rand_index,
|
||||
completeness,
|
||||
homogeneity,
|
||||
score,
|
||||
singleton_recall,
|
||||
)
|
||||
|
||||
|
||||
def test_perfect_agreement_scores_one() -> None:
    """An identical partition under renamed labels scores 1.0 everywhere."""
    truth = {"a": "C1", "b": "C1", "c": "C2", "d": "C2"}
    # Same partition, different label names. Clustering does not
    # preserve names, so renamed-but-isomorphic must still score 1.0.
    pred = {"a": "X", "b": "X", "c": "Y", "d": "Y"}

    s = score(truth, pred)
    for metric_name in (
        "adjusted_rand_index",
        "homogeneity",
        "completeness",
        "singleton_recall",
    ):
        assert s[metric_name] == pytest.approx(1.0)
|
||||
|
||||
|
||||
def test_all_singletons_perfect() -> None:
    """All-singleton truth matched by all-singleton pred is a perfect score."""
    items = ("a", "b", "c")
    truth = dict(zip(items, ("A", "B", "C")))
    pred = dict(zip(items, ("1", "2", "3")))

    s = score(truth, pred)
    assert s["singleton_recall"] == pytest.approx(1.0)
    assert s["adjusted_rand_index"] == pytest.approx(1.0)
|
||||
|
||||
|
||||
def test_false_merge_drops_homogeneity() -> None:
    """Fusing two distinct campaigns zeroes homogeneity, not completeness."""
    truth = {"a": "C1", "b": "C2"}
    merged = {"a": "X", "b": "X"}  # two distinct campaigns in one cluster

    observed_homogeneity = homogeneity(truth, merged)
    assert observed_homogeneity == pytest.approx(0.0)

    # Completeness is unaffected: each true class lives in one cluster.
    observed_completeness = completeness(truth, merged)
    assert observed_completeness == pytest.approx(1.0)
|
||||
|
||||
|
||||
def test_false_split_drops_completeness() -> None:
    """Tearing one campaign apart zeroes completeness, not homogeneity."""
    truth = {"a": "C1", "b": "C1"}
    split = {"a": "X", "b": "Y"}  # one campaign spread over two clusters

    observed_completeness = completeness(truth, split)
    assert observed_completeness == pytest.approx(0.0)

    observed_homogeneity = homogeneity(truth, split)
    assert observed_homogeneity == pytest.approx(1.0)
|
||||
|
||||
|
||||
def test_singleton_recall_penalises_noise_absorption() -> None:
    """Absorbing lone wolves into a campaign must drive the recall to 0."""
    # 3 lone wolves + 1 real campaign with 2 members.
    truth = {"w1": "wolf1", "w2": "wolf2", "w3": "wolf3", "c1": "C", "c2": "C"}

    # A clusterer that absorbs every wolf into one big cluster.
    absorbed = {item: "BIG" for item in truth}
    assert singleton_recall(truth, absorbed) == pytest.approx(0.0)

    # A clusterer that keeps each wolf on its own scores 1.0 on this
    # metric, regardless of what it does with the campaign.
    wolves_kept = {"w1": "1", "w2": "2", "w3": "3", "c1": "C", "c2": "C"}
    assert singleton_recall(truth, wolves_kept) == pytest.approx(1.0)
|
||||
|
||||
|
||||
def test_mismatched_item_sets_raises() -> None:
    """ARI must reject truth/pred dicts that cover different items."""
    truth = {"a": "X"}
    pred = {"b": "Y"}
    with pytest.raises(ValueError):
        adjusted_rand_index(truth, pred)
|
||||
|
||||
|
||||
def test_random_labels_low_ari() -> None:
    """A labeling that ignores the truth must not score a high ARI."""
    # ARI of an arbitrary partition against ground truth should be near
    # 0, not near 1. This is the chance-correction guarantee.
    truth: dict[str, str] = {}
    pred: dict[str, str] = {}
    for n in range(20):
        item = f"i{n}"
        # Truth groups consecutive items in blocks of four.
        truth[item] = f"C{n // 4}"
        # Pred spreads items over 5 buckets by (n * 7) % 5, without
        # regard to the truth blocks.
        pred[item] = f"X{(n * 7) % 5}"

    ari = adjusted_rand_index(truth, pred)
    # Loose bound: the point is "much closer to 0 than to 1".
    assert ari < 0.3
|
||||
Reference in New Issue
Block a user