merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

View File

@@ -0,0 +1,488 @@
"""
Shared helpers for fixture-driven clustering tests.
Each fixture lives at `tests/fixtures/campaigns/<name>.yaml` with a paired
`<name>.expected.yaml` bounds file. The harness here keeps every per-
fixture test file down to "load corpus → predict → assert bounds" without
copy-pasting the bound-walk loop or reference clusterers across files.
Reference clusterers are provided as the algorithm under test in each
fixture's bound assertions; their names describe the *signal* they
cluster on, not the quality of the result.
* `identity_clusterer` — every attacker is its own cluster. Trivially
passes any fixture whose ground truth is all singletons (lone_wolf,
shared_wordlist before merge, etc). Useful as a green baseline while
the real connected-components algorithm is under construction.
* `fingerprint_clusterer` — groups attackers by ``(ja3, hassh)``.
Approximates the "stable signals an attacker can't cheaply rotate"
arm of the planned similarity graph (see IDENTITY_RESOLUTION.md
Premise). Folds rotated-IP observations of one actor into one
cluster when the actor's JA3 + HASSH stay stable. Attackers whose
fingerprints are both NULL (typical of un-fingerprinted noise
scanners) are treated as un-mergeable — each becomes its own
singleton — so this clusterer doesn't trivially fuse all noise
into one mega-cluster.
* `credential_jaccard_clusterer` — deliberately-bad reference that
merges any two attackers whose credential-attempt sets overlap above
a threshold. Exists so fixtures like `shared_wordlist` can prove
they fail a clusterer that relies on credential overlap alone — the
whole point of fixture #1.
* `asn_clusterer` — deliberately-bad reference that groups attackers
by source ASN. Exists so fixtures like `vpn_hopping` (fixture #2)
can prove they fail a clusterer that treats ASN match as a
high-weight signal — VPN/proxy hopping shatters ASN within a single
identity and a clusterer that leans on it tanks completeness.
* `time_window_clusterer` — deliberately-bad reference that unions
attackers whose session time-ranges are within ``gap_days`` of each
other. Exists so fixtures like `paused_campaign` (fixture #4) can
prove they fail a clusterer that treats short-window time proximity
as a primary signal — operators pause, sleep, take weekends.
* `c2_callback_clusterer` — union-find on overlapping C2 callback
sets. Pass-clusterer for fixture 5 (multi_operator), where two
operators with distinct tooling share a C2 endpoint as the
load-bearing campaign signal. Attackers with no C2 endpoints
become their own singleton.
* `shift_clusterer` — deliberately-bad reference that buckets
attackers by majority session-start hour into night/day/swing.
Exists so fixture 5 can prove it fails a clusterer that treats
shift schedule as a primary signal — operators on different
schedules can still share a campaign.
* `composite_signals_clusterer` — union-find that combines
``(ja3, hassh)`` match OR shared C2 callback into the same
cluster. Approximates the planned similarity graph well enough
to score the combined-corpus fixture (fixture 6, noise_floor).
* `recency_decay_clusterer` — deliberately-bad reference that
starts from the same composite signal graph but weights each
edge by ``exp(-time_distance / half_life_days)`` and drops
edges below a threshold. Adversarial reference for fixture 7
(slow_burn): the canonical production failure mode where a
graph clusterer with recency decay fragments long-running
APT campaigns by silently expiring multi-week-old edges.
"""
from __future__ import annotations
from collections.abc import Callable
from pathlib import Path
import yaml
from tests.clustering.metrics import score
from tests.factories.campaign_factory import GeneratedCorpus
PredictFn = Callable[[GeneratedCorpus], dict[str, str]]
def assert_fixture_bounds(
    corpus: GeneratedCorpus,
    predict: PredictFn,
    expected_path: str | Path,
    *,
    truth_level: str = "campaign",
) -> dict[str, float]:
    """Score ``predict`` on ``corpus`` and assert every declared floor holds.

    The bounds file at ``expected_path`` is YAML mapping a metric name to
    a mapping with a ``min`` key (e.g. ``homogeneity: {min: 0.9}``).
    Metric names are the keys of the dict returned by ``score``.

    Args:
        corpus: Corpus to cluster; must provide ``truth_labels(level=...)``.
        predict: Clusterer under test; returns ``{attacker_id: label}``.
        expected_path: Path to the ``<name>.expected.yaml`` bounds file.
        truth_level: Forwarded to ``corpus.truth_labels`` to select the
            oracle: ``"campaign"`` (default), ``"identity"`` or ``"actor"``.

    Returns:
        The observed metrics dict, so callers can make additional
        assertions (e.g. "homogeneity is *exactly* 1.0 for this fixture").

    Raises:
        ValueError: The bounds file does not hold a mapping (for example
            it is empty, which ``yaml.safe_load`` reports as ``None``).
        AssertionError: A metric is below its floor, a bound names a
            metric ``score`` does not produce, or a bound has no ``min``.
    """
    bounds = yaml.safe_load(Path(expected_path).read_text(encoding="utf-8"))
    if not isinstance(bounds, dict):
        # An empty file would otherwise crash on ``None.items()`` with an
        # error that says nothing about which fixture is broken.
        raise ValueError(
            f"{expected_path}: expected a mapping of metric -> {{min: ...}}, "
            f"got {type(bounds).__name__}"
        )

    truth = corpus.truth_labels(level=truth_level)
    pred = predict(corpus)
    metrics = score(truth, pred)

    # Collect every problem before failing so one run reports them all.
    failures: list[str] = []
    for name, bound in bounds.items():
        if name not in metrics:
            failures.append(f"{name}: unknown metric (known: {sorted(metrics)})")
            continue
        if not isinstance(bound, dict) or "min" not in bound:
            failures.append(f"{name}: bound has no 'min' entry")
            continue
        observed = metrics[name]
        floor = bound["min"]
        if observed < floor:
            failures.append(f"{name}={observed:.3f} < min {floor:.3f}")

    assert not failures, (
        "fixture bounds violated: " + "; ".join(failures)
        + f" (full metrics: {metrics})"
    )
    return metrics
# ─── Reference clusterers ───────────────────────────────────────────────────
def identity_clusterer(corpus: GeneratedCorpus) -> dict[str, str]:
    """Assign every attacker a cluster of its own.

    Each label is ``cluster-<attacker_id>``, so attackers with different
    ids never share a cluster.
    """
    labels: dict[str, str] = {}
    for attacker in corpus.attackers:
        labels[attacker.attacker_id] = f"cluster-{attacker.attacker_id}"
    return labels
def fingerprint_clusterer(corpus: GeneratedCorpus) -> dict[str, str]:
    """Cluster attackers on their ``(ja3, hassh)`` pair.

    Attackers whose ``ja3`` and ``hassh`` are both ``None`` get a
    per-attacker ``fp-singleton-<id>`` label, so un-fingerprinted rows
    are never merged with each other. Everyone else is labelled
    ``fp::<ja3>::<hassh>``; a pair with only one side missing still
    counts as a fingerprint.
    """

    def label_for(attacker) -> str:
        has_fingerprint = not (attacker.ja3 is None and attacker.hassh is None)
        if has_fingerprint:
            return f"fp::{attacker.ja3}::{attacker.hassh}"
        # Nothing to match on: keep the row in its own cluster.
        return f"fp-singleton-{attacker.attacker_id}"

    return {
        attacker.attacker_id: label_for(attacker)
        for attacker in corpus.attackers
    }
def asn_clusterer(corpus: GeneratedCorpus) -> dict[str, str]:
    """Cluster attackers purely on source ASN (label ``asn-<asn>``).

    Deliberately-bad reference clusterer; see fixture 2.
    """
    labels: dict[str, str] = {}
    for attacker in corpus.attackers:
        labels[attacker.attacker_id] = f"asn-{attacker.asn}"
    return labels
def _union_find(ids: list[str]) -> tuple[
dict[str, str], Callable[[str], str], Callable[[str, str], None]
]:
"""Return (parent, find, union) for a fresh union-find over ``ids``."""
parent: dict[str, str] = {aid: aid for aid in ids}
def find(x: str) -> str:
while parent[x] != x:
parent[x] = parent[parent[x]]
x = parent[x]
return x
def union(x: str, y: str) -> None:
rx, ry = find(x), find(y)
if rx != ry:
parent[rx] = ry
return parent, find, union
def c2_callback_clusterer(corpus: GeneratedCorpus) -> dict[str, str]:
    """Cluster attackers whose C2 callback sets share at least one entry.

    An attacker's callback set is every truthy ``c2_callback`` across
    its sessions. Overlapping sets are merged transitively through
    union-find. Attackers with an empty set are labelled
    ``c2-none-<id>`` and therefore stay in a cluster of their own.
    """
    endpoints: dict[str, set[str]] = {
        attacker.attacker_id: {
            sess.c2_callback for sess in attacker.sessions if sess.c2_callback
        }
        for attacker in corpus.attackers
    }
    order = list(endpoints)
    _parent, find, union = _union_find(order)

    # Only attackers that actually called back can take part in an edge.
    callers = [aid for aid in order if endpoints[aid]]
    for pos, left in enumerate(callers):
        for right in callers[pos + 1:]:
            if endpoints[left] & endpoints[right]:
                union(left, right)

    return {
        aid: (f"c2-{find(aid)}" if endpoints[aid] else f"c2-none-{aid}")
        for aid in order
    }
def shift_clusterer(corpus: GeneratedCorpus) -> dict[str, str]:
    """Cluster attackers by the shift most of their sessions start in.

    Deliberately-bad reference clusterer; see fixture 5.

    Shifts, keyed on ``started_at.hour``:

    * night: 22-23 and 0-5
    * day: 6-13
    * swing: every other hour (14-21)

    Ties go to the shift seen first in the attacker's session order.
    Attackers without sessions are labelled ``shift-none-<id>`` and so
    stay singleton.
    """
    shift_by_hour: dict[int, str] = {
        hour: "night" for hour in (22, 23, 0, 1, 2, 3, 4, 5)
    }
    shift_by_hour.update({hour: "day" for hour in range(6, 14)})

    labels: dict[str, str] = {}
    for attacker in corpus.attackers:
        if attacker.sessions:
            tally: dict[str, int] = {}
            for sess in attacker.sessions:
                # Any hour that is neither night nor day counts as swing.
                shift = shift_by_hour.get(sess.started_at.hour, "swing")
                tally[shift] = tally.get(shift, 0) + 1
            winner = max(tally, key=tally.get)
            labels[attacker.attacker_id] = f"shift-{winner}"
        else:
            labels[attacker.attacker_id] = f"shift-none-{attacker.attacker_id}"
    return labels
def composite_signals_clusterer(corpus: GeneratedCorpus) -> dict[str, str]:
    """Cluster on equal ``(ja3, hassh)`` OR overlapping C2 callback sets.

    Two kinds of edge feed a single union-find:

    * fingerprint: attackers with an identical ``(ja3, hassh)`` pair,
      ignoring attackers whose ``ja3`` and ``hassh`` are both ``None``;
    * C2: attackers whose sets of truthy session ``c2_callback`` values
      intersect.

    Attackers with neither a fingerprint nor a callback are labelled
    ``composite-singleton-<id>``; all others get ``composite-<root>``.
    """
    endpoints: dict[str, set[str]] = {}
    prints: dict[str, tuple[str | None, str | None] | None] = {}
    for attacker in corpus.attackers:
        aid = attacker.attacker_id
        endpoints[aid] = {
            sess.c2_callback for sess in attacker.sessions if sess.c2_callback
        }
        unfingerprinted = attacker.ja3 is None and attacker.hassh is None
        prints[aid] = None if unfingerprinted else (attacker.ja3, attacker.hassh)

    order = list(endpoints)
    _parent, find, union = _union_find(order)

    # Fingerprint edges: link each group member to the group's first member.
    groups: dict[tuple[str | None, str | None], list[str]] = {}
    for aid, fp in prints.items():
        if fp is not None:
            groups.setdefault(fp, []).append(aid)
    for anchor, *others in groups.values():
        for other in others:
            union(anchor, other)

    # C2 edges: any two attackers sharing at least one callback.
    callers = [aid for aid in order if endpoints[aid]]
    for pos, left in enumerate(callers):
        for right in callers[pos + 1:]:
            if endpoints[left] & endpoints[right]:
                union(left, right)

    labels: dict[str, str] = {}
    for aid in order:
        signal_free = prints[aid] is None and not endpoints[aid]
        if signal_free:
            labels[aid] = f"composite-singleton-{aid}"
        else:
            labels[aid] = f"composite-{find(aid)}"
    return labels
def recency_decay_clusterer(
    corpus: GeneratedCorpus,
    *,
    half_life_days: float = 14.0,
    threshold: float = 0.5,
) -> dict[str, str]:
    """Composite-signal clustering whose edges fade with time distance.

    Deliberately-bad reference for fixture 7 (slow_burn). An edge exists
    between two attackers when their ``(ja3, hassh)`` pairs match or
    their C2 callback sets overlap. The edge weight is
    ``exp(-gap_days / half_life_days)``, where ``gap_days`` is the
    distance between the two attackers' activity midpoints. Edges
    weighing less than ``threshold`` are discarded before components
    are formed.

    NOTE: ``half_life_days`` is applied as the exponential time
    constant, not a base-2 half-life, so with ``threshold=0.5`` an edge
    survives only while the gap is at most ``half_life_days * ln 2``.

    Attackers without sessions have no midpoint and take part in no
    edge. Attackers with neither fingerprint nor callback are labelled
    ``recency-singleton-<id>``; all others get ``recency-<root>``.
    """
    import math
    from datetime import timedelta

    def activity_midpoint(sessions):
        """Halfway point between first start and last end, or None."""
        if not sessions:
            return None
        first_start = min(sess.started_at for sess in sessions)
        last_end = max(
            sess.started_at + timedelta(seconds=sess.duration_s)
            for sess in sessions
        )
        return first_start + (last_end - first_start) / 2

    endpoints: dict[str, set[str]] = {}
    prints: dict[str, tuple[str | None, str | None] | None] = {}
    midpoints: dict[str, "object | None"] = {}
    for attacker in corpus.attackers:
        aid = attacker.attacker_id
        endpoints[aid] = {
            sess.c2_callback for sess in attacker.sessions if sess.c2_callback
        }
        unfingerprinted = attacker.ja3 is None and attacker.hassh is None
        prints[aid] = None if unfingerprinted else (attacker.ja3, attacker.hassh)
        midpoints[aid] = activity_midpoint(attacker.sessions)

    order = list(endpoints)
    _parent, find, union = _union_find(order)

    def edge_strength(a: str, b: str) -> float:
        """Undecayed signal: 1.0 on fingerprint or C2 match, else 0.0."""
        fp_a, fp_b = prints[a], prints[b]
        same_print = fp_a is not None and fp_b is not None and fp_a == fp_b
        if same_print:
            return 1.0
        cb_a, cb_b = endpoints[a], endpoints[b]
        if cb_a and cb_b and (cb_a & cb_b):
            return 1.0
        return 0.0

    # Only attackers with a midpoint can be placed on the time axis.
    timed = [aid for aid in order if midpoints[aid] is not None]
    for pos, left in enumerate(timed):
        for right in timed[pos + 1:]:
            base = edge_strength(left, right)
            if base <= 0.0:
                continue
            delta = midpoints[left] - midpoints[right]
            gap_days = abs(delta.total_seconds()) / 86400.0
            weight = base * math.exp(-gap_days / half_life_days)
            if weight >= threshold:
                union(left, right)

    return {
        aid: (
            f"recency-singleton-{aid}"
            if prints[aid] is None and not endpoints[aid]
            else f"recency-{find(aid)}"
        )
        for aid in order
    }
def time_window_clusterer(
    corpus: GeneratedCorpus, *, gap_days: float = 1.0
) -> dict[str, str]:
    """Cluster attackers whose activity ranges overlap or nearly touch.

    Deliberately-bad reference for fixture 4 (paused_campaign): a
    campaign that goes silent for longer than ``gap_days`` is split
    into "before pause" and "after pause" clusters, breaching
    completeness. Time proximity is a weak hint, not a hard partition.

    An attacker's range runs from its earliest session start to its
    latest session end (``started_at + duration_s``). Two attackers are
    linked when the distance between their ranges is at most
    ``gap_days`` (zero when the ranges overlap); links are closed
    transitively via the shared ``_union_find`` helper.

    Args:
        corpus: Corpus whose attackers are clustered.
        gap_days: Largest separation, in days, that still links two
            attackers.

    Returns:
        ``{attacker_id: root_attacker_id}``. Labels are bare attacker
        ids with no prefix. Attackers without sessions are never
        linked, so each maps to itself.
    """
    from datetime import timedelta

    gap = timedelta(days=gap_days)
    ids = [a.attacker_id for a in corpus.attackers]

    # Activity range per attacker; attackers without sessions are left out.
    ranges: dict[str, tuple] = {}
    for att in corpus.attackers:
        if not att.sessions:
            continue
        starts = [s.started_at for s in att.sessions]
        ends = [
            s.started_at + timedelta(seconds=s.duration_s) for s in att.sessions
        ]
        ranges[att.attacker_id] = (min(starts), max(ends))

    # Use the module's shared helper rather than a private copy of it.
    _parent, find, union = _union_find(ids)

    keys = list(ranges)
    for i, a in enumerate(keys):
        a_start, a_end = ranges[a]
        for b in keys[i + 1:]:
            b_start, b_end = ranges[b]
            # Time-distance between the two ranges (0 if they overlap).
            if a_end < b_start:
                separation = b_start - a_end
            elif b_end < a_start:
                separation = a_start - b_end
            else:
                separation = timedelta(0)
            if separation <= gap:
                union(a, b)

    return {aid: find(aid) for aid in ids}
def credential_jaccard_clusterer(
    corpus: GeneratedCorpus, *, threshold: float = 0.5
) -> dict[str, str]:
    """Cluster attackers whose credential-attempt sets look alike.

    Deliberately-bad reference, NOT the real algorithm. It exists so
    fixtures that target credential-overlap failure modes (fixture 1:
    shared_wordlist) demonstrably catch a clusterer that leans on
    credential signals alone.

    An attacker's set is the union of ``credentials_tried`` over its
    sessions. Two attackers are linked when the Jaccard index of their
    sets is at least ``threshold``; links are closed transitively via
    the shared ``_union_find`` helper.

    Args:
        corpus: Corpus whose attackers are clustered.
        threshold: Minimum Jaccard index (inclusive) for an edge.

    Returns:
        ``{attacker_id: root_attacker_id}``. Attackers that tried no
        credentials are never linked, so each maps to itself.
    """
    # Build per-attacker credential sets.
    creds: dict[str, set[tuple[str, str]]] = {}
    for att in corpus.attackers:
        tried: set[tuple[str, str]] = set()
        for sess in att.sessions:
            tried.update(sess.credentials_tried)
        creds[att.attacker_id] = tried

    ids = list(creds)
    # Use the module's shared helper rather than a private copy of it.
    _parent, find, union = _union_find(ids)

    for i, a in enumerate(ids):
        sa = creds[a]
        if not sa:
            continue
        for b in ids[i + 1:]:
            sb = creds[b]
            if not sb:
                continue
            # Both sets are non-empty here, so the union cannot be empty
            # and the division needs no zero guard.
            jaccard = len(sa & sb) / len(sa | sb)
            if jaccard >= threshold:
                union(a, b)

    return {aid: find(aid) for aid in ids}

179
tests/clustering/metrics.py Normal file
View File

@@ -0,0 +1,179 @@
"""
Clustering metric harness — see development/CAMPAIGN_CLUSTERING.md §3.
Decided BEFORE any clustering algorithm exists, on purpose: if the
metrics get picked after seeing results, they'll flatter whatever the
algorithm happens to produce.
Four metrics, none on its own sufficient:
* Adjusted Rand Index — headline number, chance-corrected agreement
between predicted clusters and ground truth.
* Homogeneity — each predicted cluster contains only one true class.
Catches FALSE MERGES (campaigns wrongly fused).
* Completeness — every member of a true class lands in the same
predicted cluster. Catches FALSE SPLITS (one campaign wrongly torn
apart).
* Singleton recall — fraction of ground-truth singletons (lone wolves,
background noise) that are kept singleton by the clusterer.
Implemented from first principles in pure Python so the test harness
doesn't pull sklearn/numpy into the runtime dependency surface.
"""
from __future__ import annotations
import math
from collections import Counter, defaultdict
def _comb2(n: int) -> int:
"""C(n, 2) — number of unordered pairs from n items."""
return n * (n - 1) // 2 if n >= 2 else 0
def adjusted_rand_index(truth: dict[str, str], pred: dict[str, str]) -> float:
    """Chance-corrected pair agreement between two clusterings.

    Both arguments map ``item_id -> cluster_id`` and must cover exactly
    the same items. 1.0 means identical partitions (up to label
    renaming), roughly 0.0 means chance-level agreement, and negative
    values mean worse than chance.

    Returns 1.0 for fewer than two items, and when both partitions are
    all-singletons or both are one big cluster.

    Raises:
        ValueError: ``truth`` and ``pred`` do not share the same keys.
    """
    if set(truth) != set(pred):
        raise ValueError(
            "ARI requires identical item sets in truth and pred "
            f"(missing in pred: {set(truth) - set(pred)}, "
            f"missing in truth: {set(pred) - set(truth)})"
        )

    n = len(truth)
    if n < 2:
        # Nothing to disagree about with fewer than two items.
        return 1.0

    # Contingency table: size of each (predicted cluster, true class) cell.
    cells = Counter((pred[item], label) for item, label in truth.items())
    agreeing_pairs = sum(_comb2(size) for size in cells.values())

    # Pairs co-clustered by each side on its own (row and column sums).
    pred_pairs = sum(_comb2(size) for size in Counter(pred.values()).values())
    truth_pairs = sum(_comb2(size) for size in Counter(truth.values()).values())

    all_pairs = _comb2(n)
    expected = (pred_pairs * truth_pairs) / all_pairs if all_pairs else 0.0
    ceiling = (pred_pairs + truth_pairs) / 2

    if ceiling == expected:
        # Only reachable when both sides co-cluster no pairs or all pairs,
        # i.e. both all-singletons or both one big cluster. The partitions
        # are then identical, so the score is 1.0.
        return 1.0
    return (agreeing_pairs - expected) / (ceiling - expected)
def _entropy(counts: list[int], total: int) -> float:
if total == 0:
return 0.0
h = 0.0
for c in counts:
if c == 0:
continue
p = c / total
h -= p * math.log(p)
return h
def _conditional_entropy(
    contingency: dict[tuple[str, str], int],
    given_counts: dict[str, int],
    total: int,
) -> float:
    """Return H(rows | cols) for a ``(row, col) -> count`` table.

    ``given_counts`` holds the size of each column (the conditioning
    variable) and ``total`` the number of items. Each column adds its
    own row-entropy, weighted by the column's share of ``total``.
    """
    if total == 0:
        return 0.0

    # Gather the cell counts belonging to each conditioning value.
    cells_by_given: dict[str, list[int]] = {}
    for (_row, given), count in contingency.items():
        cells_by_given.setdefault(given, []).append(count)

    acc = 0.0
    for given, cell_counts in cells_by_given.items():
        size = given_counts[given]
        if size != 0:
            acc += (size / total) * _entropy(cell_counts, size)
    return acc
def homogeneity(truth: dict[str, str], pred: dict[str, str]) -> float:
    """Return ``1 - H(truth | pred) / H(truth)``.

    1.0 means every predicted cluster holds members of a single true
    class (no false merges). Also 1.0 for an empty ``truth`` and when
    the truth labels carry no entropy (a single class).
    """
    n = len(truth)
    if n == 0:
        return 1.0

    class_sizes = Counter(truth.values())
    truth_entropy = _entropy(list(class_sizes.values()), n)
    if truth_entropy == 0:
        return 1.0

    # Rows are true classes, columns the predicted clusters conditioned on.
    cells = Counter((label, pred[item]) for item, label in truth.items())
    cluster_sizes = Counter(pred.values())
    leftover = _conditional_entropy(cells, dict(cluster_sizes), n)
    return 1.0 - (leftover / truth_entropy)
def completeness(truth: dict[str, str], pred: dict[str, str]) -> float:
    """Return ``1 - H(pred | truth) / H(pred)``.

    1.0 means all members of each true class land in the same
    predicted cluster (no false splits). Also 1.0 for an empty
    ``truth`` and when the prediction carries no entropy (a single
    cluster).
    """
    n = len(truth)
    if n == 0:
        return 1.0

    cluster_sizes = Counter(pred.values())
    pred_entropy = _entropy(list(cluster_sizes.values()), n)
    if pred_entropy == 0:
        return 1.0

    # Rows are predicted clusters, columns the true classes conditioned on.
    cells = Counter((pred[item], label) for item, label in truth.items())
    class_sizes = Counter(truth.values())
    leftover = _conditional_entropy(cells, dict(class_sizes), n)
    return 1.0 - (leftover / pred_entropy)
def singleton_recall(truth: dict[str, str], pred: dict[str, str]) -> float:
    """Share of ground-truth singletons the clusterer left alone.

    An item is a true singleton when its truth label occurs exactly
    once. It counts as kept when its predicted label also occurs
    exactly once. With no true singletons the result is 1.0.

    The metric exists because ARI, homogeneity and completeness all
    dilute the cost of absorbing noise into real campaigns.
    """
    class_sizes = Counter(truth.values())
    lone_items = [
        item for item, label in truth.items() if class_sizes[label] == 1
    ]
    if not lone_items:
        return 1.0

    cluster_sizes = Counter(pred.values())
    survivors = [
        item for item in lone_items if cluster_sizes[pred[item]] == 1
    ]
    return len(survivors) / len(lone_items)
def score(truth: dict[str, str], pred: dict[str, str]) -> dict[str, float]:
    """Compute all four clustering metrics and return them keyed by name."""
    report: dict[str, float] = {}
    report["adjusted_rand_index"] = adjusted_rand_index(truth, pred)
    report["homogeneity"] = homogeneity(truth, pred)
    report["completeness"] = completeness(truth, pred)
    report["singleton_recall"] = singleton_recall(truth, pred)
    return report

View File

@@ -0,0 +1,318 @@
"""Determinism + DSL-validation tests for the synthetic campaign factory."""
from __future__ import annotations
import pytest
from decnet.clustering.ukc import UKCPhase
from tests.factories.campaign_factory import (
DSLValidationError,
generate,
)
def _minimal_spec() -> dict:
return {
"campaign": {
"id": "c-test",
"actors": [{"id": "a-1", "asn": 64512}],
"phases": [{"name": "delivery", "actor": "a-1"}],
"duration_days": 1,
}
}
def test_generation_is_deterministic_given_seed() -> None:
    """Generating twice with the same spec and seed yields identical IDs."""
    spec = _minimal_spec()
    first = generate(spec, seed=42)
    second = generate(spec, seed=42)

    # IDs are RNG-driven, so the same seed must reproduce the IDs
    # themselves, not merely the same structure. Otherwise federation
    # gossip and fixture diffing both break.
    first_attacker_ids = [att.attacker_id for att in first.attackers]
    second_attacker_ids = [att.attacker_id for att in second.attackers]
    assert first_attacker_ids == second_attacker_ids

    first_session_ids = [sess.session_id for sess in first.sessions]
    second_session_ids = [sess.session_id for sess in second.sessions]
    assert first_session_ids == second_session_ids
def test_different_seeds_produce_different_ids() -> None:
    """Seeds 1 and 2 give the first attacker different IDs."""
    spec = _minimal_spec()
    id_seed_1, id_seed_2 = [
        generate(spec, seed=seed).attackers[0].attacker_id for seed in (1, 2)
    ]
    assert id_seed_1 != id_seed_2
def test_truth_labels_match_dsl() -> None:
    """Ground-truth campaign and actor IDs come straight from the spec."""
    corpus = generate(_minimal_spec(), seed=0)
    attacker = corpus.attackers[0]
    assert attacker.truth_campaign_id == "c-test"
    assert attacker.truth_actor_id == "a-1"

    # truth_labels() returns the dict the metric harness consumes.
    labels = corpus.truth_labels()
    assert labels[attacker.attacker_id] == "c-test"
def test_unobservable_phase_emits_no_events() -> None:
    """A reconnaissance phase adds no sessions; only delivery does."""
    recon = {"name": "reconnaissance", "actor": "a-1"}  # pre-target, unobservable
    delivery = {"name": "delivery", "actor": "a-1"}
    spec = _minimal_spec()
    spec["campaign"]["phases"] = [recon, delivery]

    corpus = generate(spec, seed=0)

    # Every session must belong to the delivery phase, and there is one.
    for sess in corpus.sessions:
        assert sess.phase == UKCPhase.DELIVERY
    assert len(corpus.sessions) == 1
def test_unknown_phase_name_raises() -> None:
    """A phase name that is not a UKC phase is rejected by the DSL."""
    bogus_phase = {"name": "make_coffee", "actor": "a-1"}
    spec = _minimal_spec()
    spec["campaign"]["phases"] = [bogus_phase]
    with pytest.raises(DSLValidationError, match="unknown UKC phase"):
        generate(spec, seed=0)
def test_phase_referencing_unknown_actor_raises() -> None:
    """A phase pointing at an undeclared actor is rejected by the DSL."""
    orphan_phase = {"name": "delivery", "actor": "ghost"}
    spec = _minimal_spec()
    spec["campaign"]["phases"] = [orphan_phase]
    with pytest.raises(DSLValidationError, match="unknown actor"):
        generate(spec, seed=0)
def test_noise_scanners_are_truth_singletons() -> None:
    """Each noise scanner gets a truth campaign ID of its own."""
    corpus_spec = {
        "campaigns": [_minimal_spec()],
        "noise": {"scanner_count": 5},
    }
    corpus = generate({"corpus": corpus_spec}, seed=0)

    # 1 campaign actor + 5 noise scanners = 6 distinct truth campaigns.
    distinct_campaigns = set()
    for attacker in corpus.attackers:
        distinct_campaigns.add(attacker.truth_campaign_id)
    assert len(distinct_campaigns) == 6
def test_multi_actor_campaign_shares_campaign_id() -> None:
    """Two actors in one campaign both carry that campaign's truth label."""
    actors = [
        {"id": "a-1", "asn": 14061},
        {"id": "a-2", "asn": 14061},
    ]
    phases = [
        {"name": "delivery", "actor": "a-1"},
        {"name": "discovery", "actor": "a-2"},
    ]
    spec = {
        "campaign": {
            "id": "c-shared",
            "actors": actors,
            "phases": phases,
            "duration_days": 1,
        }
    }
    labels = generate(spec, seed=0).truth_labels()

    # Both attacker rows must point to the SAME truth_campaign_id;
    # this is the property fixture 5 (multi_operator) hinges on.
    assert set(labels.values()) == {"c-shared"}
# ─── ip_pool: rotating — identity-resolution fixture support ────────────────
def test_rotating_ip_pool_emits_one_row_per_rotation_count() -> None:
    """A rotating actor with ``rotation_count: 5`` yields five attacker rows."""
    rotating_actor = {
        "id": "a-1",
        "asn": 14061,
        "ip_pool": "rotating",
        "rotation_count": 5,
        "ja3": "JA3-fixed",
        "hassh": "HASSH-fixed",
    }
    delivery = {
        "name": "delivery",
        "actor": "a-1",
        "target_selector": {"count": 10},
    }
    spec = {
        "campaign": {
            "id": "c-rotating",
            "actors": [rotating_actor],
            "phases": [delivery],
            "duration_days": 1,
        }
    }
    rows = generate(spec, seed=0).attackers
    assert len(rows) == 5
def test_rotating_rows_share_identity_and_fingerprints_but_differ_on_ip() -> None:
    """Rotated rows agree on truth labels and fingerprints, not on IP.

    ``truth_identity_id``, ``truth_actor_id``, ``truth_campaign_id``,
    ``ja3`` and ``hassh`` must each take a single value across the
    rows, while ``ip`` must take five distinct values.
    """
    rotating_actor = {
        "id": "a-1",
        "asn": 14061,
        "ip_pool": "rotating",
        "rotation_count": 5,
        "ja3": "JA3-fixed",
        "hassh": "HASSH-fixed",
    }
    delivery = {
        "name": "delivery",
        "actor": "a-1",
        "target_selector": {"count": 5},
    }
    spec = {
        "campaign": {
            "id": "c-vpn-hop",
            "actors": [rotating_actor],
            "phases": [delivery],
            "duration_days": 1,
        }
    }
    rows = generate(spec, seed=0).attackers

    # Stable: one shared value per attribute across all rows.
    stable_attrs = (
        "truth_identity_id",
        "truth_actor_id",
        "truth_campaign_id",
        "ja3",
        "hassh",
    )
    for attr in stable_attrs:
        assert len({getattr(row, attr) for row in rows}) == 1

    # Rotating: 5 distinct IPs.
    assert len({row.ip for row in rows}) == 5
def test_rotation_asns_distributed_across_rows() -> None:
    """With five ``rotation_asns`` and five rotations, row i gets ASN i."""
    rotation_asns = [14061, 7922, 16509, 14618, 13335]
    rotating_actor = {
        "id": "a-1",
        "asn": 14061,  # primary, ignored when rotation_asns is set
        "ip_pool": "rotating",
        "rotation_count": 5,
        "rotation_asns": rotation_asns,
        "ja3": "x",
        "hassh": "y",
    }
    delivery = {
        "name": "delivery",
        "actor": "a-1",
        "target_selector": {"count": 5},
    }
    spec = {
        "campaign": {
            "id": "c-multi-asn",
            "actors": [rotating_actor],
            "phases": [delivery],
            "duration_days": 1,
        }
    }
    corpus = generate(spec, seed=0)
    observed = [row.asn for row in corpus.attackers]
    assert observed == [14061, 7922, 16509, 14618, 13335]
def test_rotation_asns_cycle_when_shorter_than_count() -> None:
    """Two ``rotation_asns`` spread over five rotations repeat in order."""
    rotating_actor = {
        "id": "a-1",
        "ip_pool": "rotating",
        "rotation_count": 5,
        "rotation_asns": [100, 200],
        "ja3": "x",
        "hassh": "y",
    }
    spec = {
        "campaign": {
            "id": "c-cycle",
            "actors": [rotating_actor],
            "phases": [{"name": "delivery", "actor": "a-1"}],
            "duration_days": 1,
        }
    }
    corpus = generate(spec, seed=0)
    observed = [row.asn for row in corpus.attackers]
    assert observed == [100, 200, 100, 200, 100]
def test_sessions_distribute_round_robin_across_rotated_rows() -> None:
    """Nine sessions over three rotated rows leave three sessions per row.

    The even split is what gives every observation row a session
    timeline of its own for the clusterer to join on fingerprints.
    """
    rotating_actor = {
        "id": "a-1",
        "ip_pool": "rotating",
        "rotation_count": 3,
        "ja3": "x",
        "hassh": "y",
    }
    delivery = {
        "name": "delivery",
        "actor": "a-1",
        "target_selector": {"count": 9},
    }
    spec = {
        "campaign": {
            "id": "c-rr",
            "actors": [rotating_actor],
            "phases": [delivery],
            "duration_days": 1,
        }
    }
    corpus = generate(spec, seed=0)
    per_row = [len(row.sessions) for row in corpus.attackers]
    per_row.sort()
    assert per_row == [3, 3, 3]
def test_truth_labels_at_identity_level() -> None:
    """``truth_labels(level="identity")`` gives rotated rows one shared label.

    Four rotated rows for one DSL actor produce four entries (one per
    attacker row), all carrying the same identity label.
    """
    rotating_actor = {
        "id": "a-1",
        "ip_pool": "rotating",
        "rotation_count": 4,
        "ja3": "x",
        "hassh": "y",
    }
    delivery = {
        "name": "delivery",
        "actor": "a-1",
        "target_selector": {"count": 4},
    }
    spec = {
        "campaign": {
            "id": "c-rot",
            "actors": [rotating_actor],
            "phases": [delivery],
            "duration_days": 1,
        }
    }
    labels = generate(spec, seed=0).truth_labels(level="identity")

    assert len(labels) == 4  # one per attacker row
    # All 4 attackers share one identity label.
    distinct_labels = set(labels.values())
    assert len(distinct_labels) == 1
def test_truth_labels_unknown_level_raises() -> None:
    """An unrecognised ``level`` makes ``truth_labels`` raise ValueError."""
    corpus = generate(_minimal_spec(), seed=0)
    with pytest.raises(ValueError, match="unknown truth-label level"):
        corpus.truth_labels(level="campaign-but-spelled-wrong")
def test_sticky_default_unchanged_back_compat() -> None:
    """The default spec still yields one row with a non-empty identity ID.

    Also checks that ``truth_labels()`` without arguments keeps
    returning campaign labels.
    """
    corpus = generate(_minimal_spec(), seed=0)
    rows = corpus.attackers
    assert len(rows) == 1
    assert rows[0].truth_identity_id != ""

    # Default truth_labels still returns campaign labels.
    campaign_labels = set(corpus.truth_labels().values())
    assert campaign_labels == {"c-test"}
def test_rotated_sessions_carry_identity_label() -> None:
    """Every session's ``truth_identity_id`` equals its parent attacker's."""
    rotating_actor = {
        "id": "a-1",
        "ip_pool": "rotating",
        "rotation_count": 3,
        "ja3": "x",
        "hassh": "y",
    }
    delivery = {
        "name": "delivery",
        "actor": "a-1",
        "target_selector": {"count": 6},
    }
    spec = {
        "campaign": {
            "id": "c-rot",
            "actors": [rotating_actor],
            "phases": [delivery],
            "duration_days": 1,
        }
    }
    corpus = generate(spec, seed=0)

    attackers_by_id = {att.attacker_id: att for att in corpus.attackers}
    for sess in corpus.sessions:
        owner = attackers_by_id[sess.attacker_id]
        assert sess.truth_identity_id == owner.truth_identity_id

View File

@@ -0,0 +1,344 @@
"""Tests for campaign-level similarity primitives.
Covers, in order:
* Each edge family in isolation — phase-handoff, shared-infra,
temporal-overlap, cohort.
* The F7 (slow_burn) time-agnostic invariant — shifting every
timestamp on both sides by the same Δ preserves every edge weight.
* The F1 (shared_wordlist) failure mode — shared cohort alone must
NOT push a pair over threshold.
* The F5 (multi_operator) target — phase-handoff alone (the
load-bearing campaign-level signal) DOES cross threshold.
* Tier-combination arithmetic — shared-infra + temporal overlap
(the canonical co-op pattern) crosses threshold; shared-infra +
cohort does not.
"""
from __future__ import annotations
import pytest
from decnet.clustering.campaign.impl.similarity import (
CAMPAIGN_EDGE_THRESHOLD,
DEFAULT_HANDOFF_WINDOW_S,
IdentityFeatures,
cohort_weight,
combined_campaign_weight,
phase_handoff_weight,
shared_infra_weight,
temporal_overlap_weight,
)
def _features(uuid: str, **kwargs) -> IdentityFeatures:
    """Build an ``IdentityFeatures`` for ``uuid``, forwarding extra fields."""
    return IdentityFeatures(**kwargs, identity_uuid=uuid)
# ─── phase_handoff_weight ────────────────────────────────────────────────────
def test_phase_handoff_clean_out_to_in_within_window():
    """A C2 exit followed ten minutes later by discovery on the same decky scores 1.0."""
    ended_at = 1000.0
    outgoing = _features(
        "a",
        last_phase_per_decky={"d1": "command_and_control"},
        last_seen_per_decky={"d1": ended_at},
    )
    incoming = _features(
        "b",
        first_phase_per_decky={"d1": "discovery"},
        first_seen_per_decky={"d1": ended_at + 600.0},  # 10 min later
    )
    weight = phase_handoff_weight(outgoing, incoming)
    assert weight == 1.0
def test_phase_handoff_symmetric():
    """The handoff weight is the same whichever identity is passed first."""
    # Here B finishes and A picks up a minute later.
    finished_at = 5000.0
    picker_up = _features(
        "a",
        first_phase_per_decky={"d1": "lateral_movement"},
        first_seen_per_decky={"d1": finished_at + 60.0},
    )
    finisher = _features(
        "b",
        last_phase_per_decky={"d1": "persistence"},
        last_seen_per_decky={"d1": finished_at},
    )
    assert phase_handoff_weight(picker_up, finisher) == 1.0
    assert phase_handoff_weight(finisher, picker_up) == 1.0
def test_phase_handoff_no_decky_overlap():
    """Identities seen on different deckies (d1 vs d2) get a zero handoff weight."""
    on_d1 = _features(
        "a",
        last_phase_per_decky={"d1": "command_and_control"},
        last_seen_per_decky={"d1": 1000.0},
    )
    on_d2 = _features(
        "b",
        first_phase_per_decky={"d2": "discovery"},
        first_seen_per_decky={"d2": 1100.0},
    )
    weight = phase_handoff_weight(on_d1, on_d2)
    assert weight == 0.0
def test_phase_handoff_phase_mismatch():
    """A last phase of ``exploitation`` is not treated as a handoff-out phase → 0.0."""
    outgoing = _features(
        "a",
        last_phase_per_decky={"d1": "exploitation"},
        last_seen_per_decky={"d1": 1000.0},
    )
    incoming = _features(
        "b",
        first_phase_per_decky={"d1": "discovery"},
        first_seen_per_decky={"d1": 1100.0},
    )
    weight = phase_handoff_weight(outgoing, incoming)
    assert weight == 0.0
def test_phase_handoff_outside_window():
    """A pickup one hour beyond ``DEFAULT_HANDOFF_WINDOW_S`` yields no handoff edge."""
    # A's last sighting is t=0, so B's first sighting equals the gap itself.
    too_late = DEFAULT_HANDOFF_WINDOW_S + 3600.0
    outgoing = _features(
        "a",
        last_phase_per_decky={"d1": "command_and_control"},
        last_seen_per_decky={"d1": 0.0},
    )
    incoming = _features(
        "b",
        first_phase_per_decky={"d1": "discovery"},
        first_seen_per_decky={"d1": too_late},
    )
    assert phase_handoff_weight(outgoing, incoming) == 0.0
def test_phase_handoff_negative_gap_rejected():
    """B's first sighting precedes A's last one — overlap, not a handoff → 0.0."""
    outgoing = _features(
        "a",
        last_phase_per_decky={"d1": "persistence"},
        last_seen_per_decky={"d1": 2000.0},
    )
    early_starter = _features(
        "b",
        first_phase_per_decky={"d1": "lateral_movement"},
        first_seen_per_decky={"d1": 1000.0},
    )
    weight = phase_handoff_weight(outgoing, early_starter)
    assert weight == 0.0
# ─── shared_infra_weight ─────────────────────────────────────────────────────
def test_shared_infra_full_overlap():
    """Identical payload hashes, C2 endpoints and decky sets score 1.0."""
    common = {
        "payload_hashes": frozenset({"hash-1"}),
        "c2_endpoints": frozenset({"1.2.3.4:443"}),
        "decky_set": frozenset({"d1"}),
    }
    left = _features("a", **common)
    right = _features("b", **common)
    assert shared_infra_weight(left, right) == 1.0
def test_shared_infra_no_overlap():
    """Disjoint payload hashes with nothing else set score 0.0."""
    left = _features("a", payload_hashes=frozenset({"hash-a"}))
    right = _features("b", payload_hashes=frozenset({"hash-b"}))
    weight = shared_infra_weight(left, right)
    assert weight == 0.0
def test_shared_infra_empty_returns_zero():
    """Two identities built with default (unset) features score 0.0."""
    blank_pair = (_features("a"), _features("b"))
    assert shared_infra_weight(*blank_pair) == 0.0
# ─── temporal_overlap_weight ─────────────────────────────────────────────────
def test_temporal_overlap_full():
    """Identical single session windows overlap completely → 1.0."""
    window = ((0.0, 100.0),)
    left = _features("a", session_windows=window)
    right = _features("b", session_windows=window)
    assert temporal_overlap_weight(left, right) == 1.0
def test_temporal_overlap_partial():
    """Windows [0, 100] and [50, 150] share 50 of A's 100 seconds → ≈ 0.5."""
    left = _features("a", session_windows=((0.0, 100.0),))
    right = _features("b", session_windows=((50.0, 150.0),))
    weight = temporal_overlap_weight(left, right)
    assert weight == pytest.approx(0.5)
def test_temporal_overlap_disjoint():
    """Windows that never intersect score 0.0."""
    early = _features("a", session_windows=((0.0, 100.0),))
    late = _features("b", session_windows=((200.0, 300.0),))
    weight = temporal_overlap_weight(early, late)
    assert weight == 0.0
def test_temporal_overlap_empty():
    """An identity with no session windows overlaps nothing → 0.0."""
    no_sessions = _features("a")
    one_session = _features("b", session_windows=((0.0, 100.0),))
    weight = temporal_overlap_weight(no_sessions, one_session)
    assert weight == 0.0
# ─── cohort_weight ───────────────────────────────────────────────────────────
def test_cohort_asn_overlap():
    """Identical single-ASN cohorts score 1.0."""
    left = _features("a", asn_cohort=frozenset({64512}))
    right = _features("b", asn_cohort=frozenset({64512}))
    weight = cohort_weight(left, right)
    assert weight == 1.0
def test_cohort_disjoint():
    """Different ASNs (64512 vs 64513) score 0.0."""
    left = _features("a", asn_cohort=frozenset({64512}))
    right = _features("b", asn_cohort=frozenset({64513}))
    weight = cohort_weight(left, right)
    assert weight == 0.0
# ─── F7 time-agnostic invariant ──────────────────────────────────────────────
def test_f7_invariant_temporal_overlap_unchanged_under_shift():
    """Fixture-7 (slow_burn) invariant for temporal overlap.

    Shifting every timestamp on BOTH sides by the same Δ (here 90 days)
    must leave the edge weight unchanged: edges are pairwise-relative.
    """
    delta = 90 * 24 * 3600.0

    def _moved(windows):
        # Translate each (start, end) window by the same offset.
        return tuple((start + delta, end + delta) for start, end in windows)

    a = _features("a", session_windows=((0.0, 100.0), (300.0, 400.0)))
    b = _features("b", session_windows=((50.0, 150.0), (350.0, 450.0)))
    baseline = temporal_overlap_weight(a, b)

    a_later = _features("a", session_windows=_moved(a.session_windows))
    b_later = _features("b", session_windows=_moved(b.session_windows))
    assert temporal_overlap_weight(a_later, b_later) == pytest.approx(baseline)
def test_f7_invariant_phase_handoff_unchanged_under_shift():
    """A 90-day shift of both identities keeps the handoff weight at exactly 1.0."""
    delta = 90 * 24 * 3600.0

    def _moved(seen_per_decky):
        # Same decky keys, every timestamp translated by the shared offset.
        return {decky: ts + delta for decky, ts in seen_per_decky.items()}

    a = _features(
        "a",
        last_phase_per_decky={"d1": "command_and_control"},
        last_seen_per_decky={"d1": 1000.0},
    )
    b = _features(
        "b",
        first_phase_per_decky={"d1": "discovery"},
        first_seen_per_decky={"d1": 1600.0},
    )
    baseline = phase_handoff_weight(a, b)

    a_later = _features(
        "a",
        last_phase_per_decky=dict(a.last_phase_per_decky),
        last_seen_per_decky=_moved(a.last_seen_per_decky),
    )
    b_later = _features(
        "b",
        first_phase_per_decky=dict(b.first_phase_per_decky),
        first_seen_per_decky=_moved(b.first_seen_per_decky),
    )
    shifted = phase_handoff_weight(a_later, b_later)
    assert shifted == baseline
    assert baseline == 1.0
# ─── Combined-weight + threshold semantics ──────────────────────────────────
def test_phase_handoff_alone_crosses_threshold():
    """F5 multi_operator's load-bearing signal: a handoff by itself reaches the threshold."""
    outgoing = _features(
        "a",
        last_phase_per_decky={"d1": "persistence"},
        last_seen_per_decky={"d1": 1000.0},
    )
    incoming = _features(
        "b",
        first_phase_per_decky={"d1": "lateral_movement"},
        first_seen_per_decky={"d1": 1100.0},
    )
    weight = combined_campaign_weight(outgoing, incoming)
    assert weight >= CAMPAIGN_EDGE_THRESHOLD
def test_cohort_alone_below_threshold():
    """F2 vpn_hopping at campaign level: a shared ASN cohort alone stays under threshold."""
    left = _features("a", asn_cohort=frozenset({64512}))
    right = _features("b", asn_cohort=frozenset({64512}))
    weight = combined_campaign_weight(left, right)
    assert weight < CAMPAIGN_EDGE_THRESHOLD
def test_shared_infra_alone_crosses_threshold():
    """Shared payload + C2 by themselves reach the threshold — F5's intended pass condition."""
    infra = {
        "payload_hashes": frozenset({"h"}),
        "c2_endpoints": frozenset({"c"}),
    }
    left = _features("a", **infra)
    right = _features("b", **infra)
    assert combined_campaign_weight(left, right) >= CAMPAIGN_EDGE_THRESHOLD
def test_decky_overlap_alone_below_threshold():
    """F1's failure mode: shared targeting on a small fleet is NOT co-op.

    Both identities hit deckies d1 and d2 but carry no payload or C2 data
    and come from different ASNs; the combined weight must stay below
    ``CAMPAIGN_EDGE_THRESHOLD``.
    """
    targets = frozenset({"d1", "d2"})
    left = _features("a", decky_set=targets, asn_cohort=frozenset({64512}))
    right = _features("b", decky_set=targets, asn_cohort=frozenset({64513}))
    weight = combined_campaign_weight(left, right)
    assert weight < CAMPAIGN_EDGE_THRESHOLD
def test_combined_invariant_under_shift():
    """End-to-end F7 invariant: a 90-day shift leaves the combined weight unchanged."""
    delta = 90 * 24 * 3600.0

    def _moved_map(seen_per_decky):
        # Per-decky timestamps translated by the shared offset.
        return {decky: ts + delta for decky, ts in seen_per_decky.items()}

    def _moved_windows(windows):
        # Session windows translated by the shared offset.
        return tuple((start + delta, end + delta) for start, end in windows)

    a = _features(
        "a",
        last_phase_per_decky={"d1": "persistence"},
        last_seen_per_decky={"d1": 1000.0},
        session_windows=((0.0, 1500.0),),
        payload_hashes=frozenset({"h"}),
    )
    b = _features(
        "b",
        first_phase_per_decky={"d1": "discovery"},
        first_seen_per_decky={"d1": 1100.0},
        session_windows=((1100.0, 2000.0),),
        payload_hashes=frozenset({"h"}),
    )
    baseline = combined_campaign_weight(a, b)

    a_later = IdentityFeatures(
        identity_uuid=a.identity_uuid,
        last_phase_per_decky=dict(a.last_phase_per_decky),
        last_seen_per_decky=_moved_map(a.last_seen_per_decky),
        session_windows=_moved_windows(a.session_windows),
        payload_hashes=a.payload_hashes,
    )
    b_later = IdentityFeatures(
        identity_uuid=b.identity_uuid,
        first_phase_per_decky=dict(b.first_phase_per_decky),
        first_seen_per_decky=_moved_map(b.first_seen_per_decky),
        session_windows=_moved_windows(b.session_windows),
        payload_hashes=b.payload_hashes,
    )
    assert combined_campaign_weight(a_later, b_later) == pytest.approx(baseline)

View File

@@ -0,0 +1,357 @@
"""End-to-end tests for the campaign-clusterer worker shell + tick.
Mirrors :mod:`tests.clustering.test_clusterer_worker` for the layer
above. Covers shell lifecycle (shutdown / cancel / raising tick),
end-to-end ``tick`` against SQLite (form, link, merge, revoke), bus
fan-out to the four ``campaign.*`` topics + cross-family
``identity.campaign.assigned``, factory dispatch, and CLI gating.
"""
from __future__ import annotations
import asyncio
import json
from datetime import datetime, timezone
import pytest
from decnet.bus import topics as _topics
from decnet.clustering.campaign.base import (
CampaignClusterer,
CampaignClusterResult,
)
from decnet.clustering.campaign.factory import get_campaign_clusterer
from decnet.clustering.campaign.impl.connected_components import (
ConnectedComponentsCampaignClusterer,
cluster_identities,
from_identity_row,
)
from decnet.clustering.campaign.impl.similarity import IdentityFeatures
from decnet.clustering.campaign.worker import run_campaign_clusterer_loop
from decnet.web.db.factory import get_repository
@pytest.fixture
async def repo(tmp_path):
    """Return an initialised repository backed by ``campaign.db`` under ``tmp_path``."""
    db_file = tmp_path / "campaign.db"
    repository = get_repository(db_path=str(db_file))
    await repository.initialize()
    return repository
@pytest.fixture(autouse=True)
def _no_bus(monkeypatch):
    """Disable the bus for every test so workers poll only — no real Unix socket."""
    flag, disabled = "DECNET_BUS_ENABLED", "false"
    monkeypatch.setenv(flag, disabled)
# ─── Test doubles ───────────────────────────────────────────────────────────
class _FakeClusterer(CampaignClusterer):
    """Test double: replays canned results in order, then empty results."""

    name = "fake"

    def __init__(self, results=None) -> None:
        # Copy so popping never mutates the caller's list.
        self._results = list(results) if results else []
        self.calls = 0  # number of tick() invocations so far

    async def tick(self, repo) -> CampaignClusterResult:
        """Count the call and hand back the next canned result, if any."""
        self.calls += 1
        if not self._results:
            return CampaignClusterResult()
        return self._results.pop(0)
class _RaisingClusterer(CampaignClusterer):
    """Test double whose every tick fails with ``RuntimeError("boom")``."""

    name = "raising"

    def __init__(self) -> None:
        self.calls = 0  # number of tick() invocations so far

    async def tick(self, repo) -> CampaignClusterResult:
        """Count the call, then raise."""
        self.calls = self.calls + 1
        raise RuntimeError("boom")
# ─── Shell lifecycle ────────────────────────────────────────────────────────
@pytest.mark.anyio
async def test_loop_exits_on_shutdown(repo):
    """Setting the shutdown event ends the loop after at least one tick."""
    stop = asyncio.Event()
    fake = _FakeClusterer()
    loop_coro = run_campaign_clusterer_loop(
        repo,
        poll_interval_secs=0.05,
        clusterer=fake,
        shutdown=stop,
    )
    runner = asyncio.create_task(loop_coro)
    await asyncio.sleep(0.12)
    stop.set()
    await asyncio.wait_for(runner, timeout=2.0)
    assert fake.calls >= 1
@pytest.mark.anyio
async def test_loop_exits_on_cancel(repo):
    """Cancelling the loop task lets it finish; awaiting it must not raise here."""
    fake = _FakeClusterer()
    loop_coro = run_campaign_clusterer_loop(
        repo,
        poll_interval_secs=0.05,
        clusterer=fake,
    )
    runner = asyncio.create_task(loop_coro)
    await asyncio.sleep(0.1)
    runner.cancel()
    await asyncio.wait_for(runner, timeout=2.0)
    assert fake.calls >= 1
@pytest.mark.anyio
async def test_tick_failure_does_not_crash_loop(repo):
    """A clusterer that raises on every tick is still ticked repeatedly."""
    stop = asyncio.Event()
    failing = _RaisingClusterer()
    loop_coro = run_campaign_clusterer_loop(
        repo,
        poll_interval_secs=0.05,
        clusterer=failing,
        shutdown=stop,
    )
    runner = asyncio.create_task(loop_coro)
    await asyncio.sleep(0.2)
    stop.set()
    await asyncio.wait_for(runner, timeout=2.0)
    # At least two calls means the loop survived the first raise.
    assert failing.calls >= 2
# ─── Bus fan-out ────────────────────────────────────────────────────────────
@pytest.mark.anyio
async def test_publishes_campaign_result_on_bus(monkeypatch, repo):
    """Each list in the canned result fans out to its ``campaign.*`` topic.

    Also checks the cross-family ``identity.campaign.assigned`` signal for
    every identity named in ``campaigns_formed`` and ``identities_assigned``.
    """
    published: list[tuple[str, dict, str]] = []

    async def _fake_publish(bus, topic, payload, event_type=""):
        # Record instead of publishing; the bus argument is ignored.
        published.append((topic, payload, event_type))

    monkeypatch.setattr(
        "decnet.clustering.campaign.worker.publish_safely", _fake_publish,
    )

    formed = {"campaign_uuid": "c-1", "identity_uuids": ["i-1", "i-2"]}
    assigned = {
        "campaign_uuid": "c-1",
        "identity_uuid": "i-3",
        "prior_campaign_uuid": None,
    }
    merged = {"winner_uuid": "c-1", "loser_uuid": "c-2"}
    unmerged = {"resurrected_uuid": "c-2", "former_winner_uuid": "c-1"}
    canned = CampaignClusterResult(
        campaigns_formed=[formed],
        identities_assigned=[assigned],
        campaigns_merged=[merged],
        campaigns_unmerged=[unmerged],
    )

    fake = _FakeClusterer(results=[canned])
    stop = asyncio.Event()
    runner = asyncio.create_task(
        run_campaign_clusterer_loop(
            repo,
            poll_interval_secs=0.05,
            clusterer=fake,
            shutdown=stop,
        )
    )
    await asyncio.sleep(0.1)
    stop.set()
    await asyncio.wait_for(runner, timeout=2.0)

    seen_topics = {entry[0] for entry in published}
    for suffix in (
        _topics.CAMPAIGN_FORMED,
        _topics.CAMPAIGN_IDENTITY_ASSIGNED,
        _topics.CAMPAIGN_MERGED,
        _topics.CAMPAIGN_UNMERGED,
    ):
        assert _topics.campaign(suffix) in seen_topics

    # Cross-family signal — every campaigns_formed identity AND every
    # identities_assigned identity should fire identity.campaign.assigned.
    cross_topic = _topics.identity(_topics.IDENTITY_CAMPAIGN_ASSIGNED)
    cross_idents = {
        payload["identity_uuid"]
        for topic, payload, _ in published
        if topic == cross_topic
    }
    assert {"i-1", "i-2", "i-3"} <= cross_idents
# ─── Pure clusterer + projection ────────────────────────────────────────────
def test_cluster_identities_singletons():
    """Two feature-less identities receive different cluster labels."""
    pair = [IdentityFeatures(identity_uuid=uuid) for uuid in ("a", "b")]
    labels = cluster_identities(pair)
    assert labels["a"] != labels["b"]
def test_cluster_identities_phase_handoff_unions():
    """A C2 → discovery handoff on the same decky puts both identities under one label."""
    outgoing = IdentityFeatures(
        identity_uuid="a",
        last_phase_per_decky={"d1": "command_and_control"},
        last_seen_per_decky={"d1": 1000.0},
    )
    incoming = IdentityFeatures(
        identity_uuid="b",
        first_phase_per_decky={"d1": "discovery"},
        first_seen_per_decky={"d1": 1100.0},
    )
    labels = cluster_identities([outgoing, incoming])
    assert labels["a"] == labels["b"]
def test_from_identity_row_parses_json_lists():
    """JSON-encoded list columns are projected into frozensets on the features object."""
    row = {
        "uuid": "i-1",
        "payload_simhashes": json.dumps(["h1", "h2"]),
        "c2_endpoints": json.dumps(["c1"]),
    }
    features = from_identity_row(row)
    assert features.identity_uuid == "i-1"
    assert features.payload_hashes == frozenset({"h1", "h2"})
    assert features.c2_endpoints == frozenset({"c1"})
def test_from_identity_row_handles_null_and_garbage():
    """A ``None`` column and a non-JSON string both project to an empty frozenset."""
    row = {
        "uuid": "i-1",
        "payload_simhashes": None,
        "c2_endpoints": "not-json",
    }
    features = from_identity_row(row)
    empty = frozenset()
    assert features.payload_hashes == empty
    assert features.c2_endpoints == empty
# ─── End-to-end tick against SQLite ────────────────────────────────────────
async def _create_identity(repo, uuid: str, **kwargs) -> str:
now = datetime.now(timezone.utc)
return await repo.create_attacker_identity({
"uuid": uuid,
"first_seen_at": now,
"last_seen_at": now,
"payload_simhashes": kwargs.get("payload_simhashes"),
"c2_endpoints": kwargs.get("c2_endpoints"),
})
@pytest.mark.anyio
async def test_tick_empty_db_returns_empty_result(repo):
    """A tick against a freshly initialised repo reports nothing in any result list."""
    clusterer = ConnectedComponentsCampaignClusterer()
    outcome = await clusterer.tick(repo)
    assert outcome.campaigns_formed == []
    assert outcome.identities_assigned == []
    assert outcome.campaigns_merged == []
    assert outcome.campaigns_unmerged == []
@pytest.mark.anyio
async def test_tick_forms_campaign_for_shared_infra_co_op(repo):
    """Two identities with shared payload + C2 fold to one campaign.

    The canonical F5-style co-op pattern, exercised end-to-end: both rows
    store the same JSON-encoded ``payload_simhashes`` and ``c2_endpoints``,
    and a single tick must report one formed campaign holding both.
    """
    shared_columns = {
        "payload_simhashes": json.dumps(["h1"]),
        "c2_endpoints": json.dumps(["c1"]),
    }
    for uuid in ("i1", "i2"):
        await _create_identity(repo, uuid, **shared_columns)

    clusterer = ConnectedComponentsCampaignClusterer()
    outcome = await clusterer.tick(repo)

    assert len(outcome.campaigns_formed) == 1
    members = set(outcome.campaigns_formed[0]["identity_uuids"])
    assert members == {"i1", "i2"}
@pytest.mark.anyio
async def test_tick_keeps_distinct_payloads_separate(repo):
    """No payload/C2 overlap → each identity forms its own campaign."""
    seeds = (
        ("i1", ["h1"], ["c1"]),
        ("i2", ["h2"], ["c2"]),
    )
    for uuid, payloads, endpoints in seeds:
        await _create_identity(
            repo,
            uuid,
            payload_simhashes=json.dumps(payloads),
            c2_endpoints=json.dumps(endpoints),
        )

    clusterer = ConnectedComponentsCampaignClusterer()
    outcome = await clusterer.tick(repo)
    assert len(outcome.campaigns_formed) == 2
@pytest.mark.anyio
async def test_tick_idempotent_links_existing_identity(repo):
    """A second tick on unchanged input forms nothing new and keeps the first link."""
    await _create_identity(repo, "i1")
    clusterer = ConnectedComponentsCampaignClusterer()

    first = await clusterer.tick(repo)
    assert len(first.campaigns_formed) == 1
    campaign_uuid = first.campaigns_formed[0]["campaign_uuid"]

    second = await clusterer.tick(repo)
    # Identity already linked — no new campaign, no new assignment.
    assert second.campaigns_formed == []
    assert second.identities_assigned == []

    # And the existing assignment persisted.
    member_count = await repo.count_identities_for_campaign(campaign_uuid)
    assert member_count == 1
@pytest.mark.anyio
async def test_tick_skips_merged_out_identities(repo):
    """An identity soft-merged into another must not appear as a cluster input."""
    for uuid in ("i1", "i2"):
        await _create_identity(repo, uuid)
    # Soft-merge i2 into i1 at the identity layer.
    await repo.update_identity_merged_into("i2", "i1")

    clusterer = ConnectedComponentsCampaignClusterer()
    outcome = await clusterer.tick(repo)

    # Only i1 is an active row; one campaign formed, with one identity.
    assert len(outcome.campaigns_formed) == 1
    only_campaign = outcome.campaigns_formed[0]
    assert only_campaign["identity_uuids"] == ["i1"]
# ─── Factory + CLI gating ────────────────────────────────────────────────────
def test_factory_default(monkeypatch):
    """With no clusterer type configured, the factory returns the connected-components impl."""
    # Clear any ambient override so the test really exercises the default
    # rather than whatever the developer's / CI environment happens to set.
    monkeypatch.delenv("DECNET_CAMPAIGN_CLUSTERER_TYPE", raising=False)
    c = get_campaign_clusterer()
    assert isinstance(c, ConnectedComponentsCampaignClusterer)
def test_factory_unknown_raises(monkeypatch):
    """An unrecognised ``DECNET_CAMPAIGN_CLUSTERER_TYPE`` makes the factory raise ``ValueError``."""
    env_var, bogus = "DECNET_CAMPAIGN_CLUSTERER_TYPE", "nope"
    monkeypatch.setenv(env_var, bogus)
    with pytest.raises(ValueError):
        get_campaign_clusterer()
def test_campaign_clusterer_registered_in_cli():
    """``campaign-clusterer`` is listed in ``MASTER_ONLY_COMMANDS``."""
    from decnet.cli import gating

    assert "campaign-clusterer" in gating.MASTER_ONLY_COMMANDS
def test_campaign_topic_builder_round_trips():
    """Topic builders yield the expected dotted topic strings."""
    formed = _topics.campaign(_topics.CAMPAIGN_FORMED)
    assert formed == "campaign.formed"

    campaign_assigned = _topics.campaign(_topics.CAMPAIGN_IDENTITY_ASSIGNED)
    assert campaign_assigned == "campaign.identity.assigned"

    identity_assigned = _topics.identity(_topics.IDENTITY_CAMPAIGN_ASSIGNED)
    assert identity_assigned == "identity.campaign.assigned"

View File

@@ -0,0 +1,34 @@
"""Tests for :mod:`decnet.clustering.factory`."""
from __future__ import annotations
import pytest
from decnet.clustering.base import Clusterer
from decnet.clustering.factory import get_clusterer
from decnet.clustering.impl.connected_components import ConnectedComponentsClusterer
def test_default_returns_connected_components(monkeypatch):
    """With ``DECNET_CLUSTERER_TYPE`` unset, the factory yields the connected-components clusterer."""
    monkeypatch.delenv("DECNET_CLUSTERER_TYPE", raising=False)
    clusterer = get_clusterer()
    for expected_type in (ConnectedComponentsClusterer, Clusterer):
        assert isinstance(clusterer, expected_type)
    assert clusterer.name == "connected_components"
def test_explicit_connected_components(monkeypatch):
    """Naming ``connected_components`` explicitly selects that clusterer."""
    env_var, choice = "DECNET_CLUSTERER_TYPE", "connected_components"
    monkeypatch.setenv(env_var, choice)
    clusterer = get_clusterer()
    assert isinstance(clusterer, ConnectedComponentsClusterer)
def test_unknown_clusterer_type_raises(monkeypatch):
    """An unrecognised type raises ``ValueError`` whose message matches "Unknown clusterer"."""
    env_var, bogus = "DECNET_CLUSTERER_TYPE", "nope"
    monkeypatch.setenv(env_var, bogus)
    with pytest.raises(ValueError, match="Unknown clusterer"):
        get_clusterer()
def test_case_insensitive(monkeypatch):
    """An upper-cased, space-padded type name still resolves to connected components."""
    padded_upper = "  CONNECTED_COMPONENTS  "
    monkeypatch.setenv("DECNET_CLUSTERER_TYPE", padded_upper)
    clusterer = get_clusterer()
    assert isinstance(clusterer, ConnectedComponentsClusterer)

View File

@@ -0,0 +1,182 @@
"""End-to-end tests for the clusterer worker shell.
The skeleton clusterer is a no-op; these tests cover the shell:
* exits cleanly on shutdown signal (and via cancel)
* invokes ``tick`` on each loop iteration
* publishes :class:`ClusterResult` side-effects on the right topics
* a clusterer raising from ``tick`` is logged and does not crash the loop
"""
from __future__ import annotations
import asyncio
import pytest
from decnet.bus import topics as _topics
from decnet.clustering.base import Clusterer, ClusterResult
from decnet.clustering.impl.connected_components import ConnectedComponentsClusterer
from decnet.clustering.worker import run_clusterer_loop
from decnet.web.db.factory import get_repository
@pytest.fixture
async def repo(tmp_path):
    """Return an initialised repository backed by ``clusterer.db`` under ``tmp_path``."""
    db_file = tmp_path / "clusterer.db"
    repository = get_repository(db_path=str(db_file))
    await repository.initialize()
    return repository
@pytest.fixture(autouse=True)
def _no_bus(monkeypatch):
    """Disable the bus for every test so workers poll only — no real Unix socket."""
    flag, disabled = "DECNET_BUS_ENABLED", "false"
    monkeypatch.setenv(flag, disabled)
class _FakeClusterer(Clusterer):
    """Test double: replays canned :class:`ClusterResult` objects, then empty ones."""

    name = "fake"

    def __init__(self, results: list[ClusterResult] | None = None) -> None:
        # Copy so popping never mutates the caller's list.
        self._results = list(results) if results else []
        self.calls = 0  # number of tick() invocations so far

    async def tick(self, repo) -> ClusterResult:
        """Count the call and hand back the next canned result, if any."""
        self.calls += 1
        if not self._results:
            return ClusterResult()
        return self._results.pop(0)
class _RaisingClusterer(Clusterer):
    """Test double whose every tick fails with ``RuntimeError("boom")``."""

    name = "raising"

    def __init__(self) -> None:
        self.calls = 0  # number of tick() invocations so far

    async def tick(self, repo) -> ClusterResult:
        """Count the call, then raise."""
        self.calls = self.calls + 1
        raise RuntimeError("boom")
@pytest.mark.anyio
async def test_loop_exits_on_shutdown_signal(repo):
    """Setting the shutdown event ends the loop after at least one tick."""
    stop = asyncio.Event()
    fake = _FakeClusterer()
    loop_coro = run_clusterer_loop(
        repo, poll_interval_secs=0.05, clusterer=fake, shutdown=stop,
    )
    runner = asyncio.create_task(loop_coro)
    await asyncio.sleep(0.12)
    stop.set()
    await asyncio.wait_for(runner, timeout=2.0)
    assert fake.calls >= 1
@pytest.mark.anyio
async def test_loop_exits_on_cancel(repo):
    """Cancelling the loop task lets it finish; awaiting it must not raise here."""
    fake = _FakeClusterer()
    loop_coro = run_clusterer_loop(
        repo, poll_interval_secs=0.05, clusterer=fake,
    )
    runner = asyncio.create_task(loop_coro)
    await asyncio.sleep(0.1)
    runner.cancel()
    # The loop catches CancelledError and exits cleanly, mirroring the
    # intel + reuse worker shells.
    await asyncio.wait_for(runner, timeout=2.0)
    assert fake.calls >= 1
@pytest.mark.anyio
async def test_tick_failure_does_not_crash_loop(repo):
    """A clusterer raising from tick must be logged, not propagated."""
    stop = asyncio.Event()
    failing = _RaisingClusterer()
    loop_coro = run_clusterer_loop(
        repo, poll_interval_secs=0.05, clusterer=failing, shutdown=stop,
    )
    runner = asyncio.create_task(loop_coro)
    await asyncio.sleep(0.2)
    stop.set()
    await asyncio.wait_for(runner, timeout=2.0)
    # At least two calls means the loop kept ticking despite the raise.
    assert failing.calls >= 2
@pytest.mark.anyio
async def test_skeleton_clusterer_returns_empty_result(repo):
    """A tick against a freshly initialised repo leaves every result list empty."""
    clusterer = ConnectedComponentsClusterer()
    outcome = await clusterer.tick(repo)
    assert outcome.identities_formed == []
    assert outcome.observations_linked == []
    assert outcome.identities_merged == []
    assert outcome.identities_unmerged == []
@pytest.mark.anyio
async def test_publishes_cluster_result_on_bus(monkeypatch, repo):
    """Every entry in ClusterResult fans out to the correct topic."""
    published: list[tuple[str, dict, str]] = []

    async def _fake_publish(bus, topic, payload, event_type=""):
        # Record instead of publishing; the bus argument is ignored.
        published.append((topic, payload, event_type))

    monkeypatch.setattr(
        "decnet.clustering.worker.publish_safely", _fake_publish,
    )

    formed = {"identity_uuid": "id-1", "observation_uuids": ["obs-1", "obs-2"]}
    linked = {"identity_uuid": "id-1", "observation_uuid": "obs-3"}
    merged = {"winner_uuid": "id-1", "loser_uuid": "id-2"}
    unmerged = {"resurrected_uuid": "id-2", "former_winner_uuid": "id-1"}
    canned = ClusterResult(
        identities_formed=[formed],
        observations_linked=[linked],
        identities_merged=[merged],
        identities_unmerged=[unmerged],
    )

    fake = _FakeClusterer(results=[canned])
    stop = asyncio.Event()
    runner = asyncio.create_task(
        run_clusterer_loop(
            repo, poll_interval_secs=0.05, clusterer=fake, shutdown=stop,
        )
    )
    await asyncio.sleep(0.1)
    stop.set()
    await asyncio.wait_for(runner, timeout=2.0)

    seen_topics = {entry[0] for entry in published}
    for suffix in (
        _topics.IDENTITY_FORMED,
        _topics.IDENTITY_OBSERVATION_LINKED,
        _topics.IDENTITY_MERGED,
        _topics.IDENTITY_UNMERGED,
    ):
        assert _topics.identity(suffix) in seen_topics
def test_clusterer_registered_in_cli():
    """`decnet clusterer` is registered as a master-only command.

    Plain synchronous test: the check is a set-membership assertion, so it
    needs neither an event loop nor the anyio marker.
    """
    from decnet.cli.gating import MASTER_ONLY_COMMANDS

    assert "clusterer" in MASTER_ONLY_COMMANDS

View File

@@ -0,0 +1,808 @@
"""Tests for the connected-components clusterer (commit 4 — high-weight edges).
Covers, in order:
* The pure ``cluster_observations`` algorithm — singletons stay
isolated, exact-match high-weight signals fold them together,
un-fingerprinted observations stay un-mergeable.
* The production-row adapter ``from_attacker_row`` — JA3 / HASSH
recovered from the fingerprints JSON; absent fields project to
``None``.
* End-to-end ``tick`` against a real SQLite repo: seeded attackers
with shared / divergent fingerprints get the right identity rows
written and the right ``identity_id`` links set.
* Three fixture-bound assertions: lone_wolf (pure singletons),
shared_wordlist (no fingerprint signal — singletons), and
vpn_hopping at identity-level (one identity from 5 rotated IPs
via shared JA3 + HASSH).
The tick is bus-free here — the worker shell tests cover bus fan-out
separately. We're validating the algorithm + DB writes here.
"""
from __future__ import annotations
import json
from datetime import datetime, timezone
from pathlib import Path
import pytest
from decnet.clustering.impl.connected_components import (
ConnectedComponentsClusterer,
cluster_observations,
from_attacker_row,
)
from decnet.clustering.impl.similarity import Observation, from_synthetic
from decnet.web.db.factory import get_repository
FIXTURE_DIR = Path(__file__).parent.parent / "fixtures" / "campaigns"
# ─── pure algorithm ─────────────────────────────────────────────────────────
def _obs(obs_id: str, **kwargs) -> Observation:
    """Build an ``Observation`` with id *obs_id*, forwarding all other keywords as-is."""
    signal_fields = kwargs
    return Observation(observation_id=obs_id, **signal_fields)
def test_cluster_observations_singletons_stay_isolated():
    """Two different JA3s plus one un-fingerprinted observation get three distinct labels."""
    observations = [
        _obs("a", ja3="ja3-a"),
        _obs("b", ja3="ja3-b"),
        _obs("c"),  # no fingerprint
    ]
    labels = cluster_observations(observations)
    for first, second in (("a", "b"), ("b", "c"), ("a", "c")):
        assert labels[first] != labels[second]
def test_cluster_observations_ja3_match_unions():
    """Observations sharing a JA3 share a label; a different JA3 stays apart."""
    observations = [
        _obs("a", ja3="ja3-shared"),
        _obs("b", ja3="ja3-shared"),
        _obs("c", ja3="ja3-other"),
    ]
    labels = cluster_observations(observations)
    assert labels["a"] == labels["b"]
    assert labels["a"] != labels["c"]
def test_cluster_observations_unfingerprinted_stay_separate():
    """Two observations with no signals must NOT collapse into one
    cluster — that would fuse every noise scanner together."""
    blank_pair = [_obs("a"), _obs("b")]
    labels = cluster_observations(blank_pair)
    assert labels["a"] != labels["b"]
def test_cluster_observations_transitive_via_payload():
    """A↔B via JA3, B↔C via payload → A, B, C all in one component."""
    shared_payload = frozenset({"pl-1"})
    observations = [
        _obs("a", ja3="ja3-x"),
        _obs("b", ja3="ja3-x", payload_hashes=shared_payload),
        _obs("c", payload_hashes=shared_payload),
    ]
    labels = cluster_observations(observations)
    assert labels["a"] == labels["b"]
    assert labels["b"] == labels["c"]
def test_cluster_observations_empty_input():
    """No observations in → an empty label mapping out."""
    labels = cluster_observations([])
    assert labels == {}
def test_cluster_observations_deterministic():
    """Same input → same labels. Load-bearing for fixture stability."""
    observations = [_obs("a", ja3="x"), _obs("b", ja3="x"), _obs("c")]
    first_run = cluster_observations(observations)
    second_run = cluster_observations(observations)
    assert first_run == second_run
# ─── production-row adapter ────────────────────────────────────────────────
def test_from_attacker_row_extracts_ja3_and_hassh():
    """JA3 and HASSH are recovered from the fingerprints JSON; uuid and asn carry over."""
    fingerprint_entries = [
        {"kind": "ja3", "hash": "ja3-abc"},
        {"kind": "hassh", "hash": "hassh-def"},
        {"kind": "jarm", "hash": "jarm-ghi"},  # not used in v1
    ]
    row = {
        "uuid": "att-1",
        "asn": 64500,
        "identity_id": None,
        "fingerprints": json.dumps(fingerprint_entries),
    }
    observation = from_attacker_row(row)
    assert observation.observation_id == "att-1"
    assert observation.ja3 == "ja3-abc"
    assert observation.hassh == "hassh-def"
    assert observation.asn == 64500
def test_from_attacker_row_handles_empty_fingerprints():
    """An empty fingerprints list and a NULL asn project to ``None`` fields."""
    row = {
        "uuid": "att-2",
        "asn": None,
        "identity_id": None,
        "fingerprints": "[]",
    }
    observation = from_attacker_row(row)
    assert observation.ja3 is None
    assert observation.hassh is None
    assert observation.asn is None
def test_from_attacker_row_handles_malformed_json():
    """An unparseable fingerprints string yields no JA3 / HASSH instead of raising."""
    row = {
        "uuid": "att-3",
        "asn": None,
        "identity_id": None,
        "fingerprints": "not json",
    }
    observation = from_attacker_row(row)
    assert observation.ja3 is None
    assert observation.hassh is None
# ─── end-to-end tick against SQLite ────────────────────────────────────────
@pytest.fixture
async def repo(tmp_path):
    """Return an initialised repository backed by ``clusterer.db`` under ``tmp_path``."""
    db_file = tmp_path / "clusterer.db"
    repository = get_repository(db_path=str(db_file))
    await repository.initialize()
    return repository
async def _seed_attacker(
repo, ip: str, *,
ja3: str | None = None,
hassh: str | None = None,
asn: int | None = None,
cert_sha256: str | None = None,
) -> str:
now = datetime.now(timezone.utc)
# Two-shape fingerprint payload:
# - the "kind" entries feed the clusterer's from_attacker_row
# (test-fixture shape, line ~115 of connected_components.py)
# - the "bounty_type/payload" entries feed identity_rollup's
# extract_fp_summaries (production shape, written by the
# profiler from real bounty rows). Both shapes coexist in
# the same JSON list so the same seed exercises clustering
# AND the identity-column rollup.
fingerprints: list[dict] = []
if ja3:
fingerprints.append({"kind": "ja3", "hash": ja3})
fingerprints.append({
"bounty_type": "fingerprint",
"payload": {"fingerprint_type": "ja3", "ja3": ja3},
})
if hassh:
fingerprints.append({"kind": "hassh", "hash": hassh})
fingerprints.append({
"bounty_type": "fingerprint",
"payload": {"fingerprint_type": "hassh_server", "hash": hassh},
})
if cert_sha256:
fingerprints.append({
"bounty_type": "fingerprint",
"payload": {
"fingerprint_type": "tls_certificate",
"cert_sha256": cert_sha256,
},
})
return await repo.upsert_attacker({
"ip": ip,
"first_seen": now,
"last_seen": now,
"event_count": 1,
"asn": asn,
"fingerprints": json.dumps(fingerprints),
})
@pytest.mark.anyio
async def test_tick_on_empty_db_is_noop(repo):
    """A tick with no attackers seeded forms no identities and links no observations."""
    clusterer = ConnectedComponentsClusterer()
    outcome = await clusterer.tick(repo)
    assert outcome.identities_formed == []
    assert outcome.observations_linked == []
@pytest.mark.anyio
async def test_tick_clusters_shared_ja3(repo):
    """Two observations with the same JA3 → one identity row, both linked."""
    first_uuid = await _seed_attacker(repo, "1.1.1.1", ja3="ja3-x", asn=64500)
    second_uuid = await _seed_attacker(repo, "2.2.2.2", ja3="ja3-x", asn=64501)
    seeded = {first_uuid, second_uuid}

    clusterer = ConnectedComponentsClusterer()
    outcome = await clusterer.tick(repo)

    assert len(outcome.identities_formed) == 1
    formed = outcome.identities_formed[0]
    assert set(formed["observation_uuids"]) == seeded

    # Identity row exists and both attackers FK to it.
    identity_uuid = formed["identity_uuid"]
    identity_row = await repo.get_identity_by_uuid(identity_uuid)
    assert identity_row is not None
    assert identity_row["uuid"] == identity_uuid

    linked_rows = await repo.list_observations_for_identity(identity_uuid)
    linked_uuids = {observation["uuid"] for observation in linked_rows}
    assert linked_uuids == seeded
@pytest.mark.anyio
async def test_tick_keeps_distinct_ja3_separate(repo):
    """Two divergent JA3s with no other shared signal → two singleton
    identities, each holding exactly one observation."""
    await _seed_attacker(repo, "1.1.1.1", ja3="ja3-a")
    await _seed_attacker(repo, "2.2.2.2", ja3="ja3-b")
    c = ConnectedComponentsClusterer()
    result = await c.tick(repo)
    # Singletons get identity rows of their own (one observation per cluster).
    assert len(result.identities_formed) == 2
    for formed in result.identities_formed:
        assert len(formed["observation_uuids"]) == 1
@pytest.mark.anyio
async def test_tick_merges_two_identities_when_component_spans_them(repo):
    """Two pre-existing identities whose observations now cluster
    together (e.g. a previously-missing fingerprint shows up) get
    soft-merged: the smaller-uuid identity wins, the loser's
    merged_into_uuid is set, observations stay FK'd to their
    original identity row."""
    # Tick 1: two distinct fingerprints → two distinct identities.
    a = await _seed_attacker(repo, "1.1.1.1", ja3="ja3-A")
    b = await _seed_attacker(repo, "2.2.2.2", ja3="ja3-B")
    c = ConnectedComponentsClusterer()
    first = await c.tick(repo)
    assert len(first.identities_formed) == 2

    # Snapshot the two identity uuids; we'll need them after the merge.
    identities_after_first = await repo.list_all_identities()
    assert len(identities_after_first) == 2
    uuids = sorted(i["uuid"] for i in identities_after_first)
    expected_winner, expected_loser = uuids[0], uuids[1]

    # Tick 2: bridge the two components. A single observation cannot carry
    # both JA3 values, so the bridge is built in two steps:
    #   1. a third attacker shares JA3 with A and introduces a new HASSH;
    #   2. B's row is re-upserted so it carries that same HASSH.
    # A ↔ bridge (JA3) and bridge ↔ B (HASSH) then join A and B transitively.
    await _seed_attacker(repo, "3.3.3.3", ja3="ja3-A", hassh="hassh-bridge")
    now = datetime.now(timezone.utc)
    await repo.upsert_attacker({
        "ip": "2.2.2.2", "first_seen": now, "last_seen": now,
        "event_count": 1,
        "fingerprints": json.dumps([
            {"kind": "ja3", "hash": "ja3-B"},
            {"kind": "hassh", "hash": "hassh-bridge"},
        ]),
    })

    second = await c.tick(repo)
    assert len(second.identities_merged) == 1
    merge = second.identities_merged[0]
    assert merge["winner_uuid"] == expected_winner
    assert merge["loser_uuid"] == expected_loser

    # The loser's row still exists with merged_into_uuid set.
    all_after = {i["uuid"]: i for i in await repo.list_all_identities()}
    assert all_after[expected_loser]["merged_into_uuid"] == expected_winner
    assert all_after[expected_winner]["merged_into_uuid"] is None

    # Observations stay FK'd to their original identity row — the
    # merge is a soft pointer, NOT a re-point.
    a_row = await repo.get_attacker_by_uuid(a)
    b_row = await repo.get_attacker_by_uuid(b)
    assert a_row["identity_id"] in {expected_winner, expected_loser}
    assert b_row["identity_id"] in {expected_winner, expected_loser}
@pytest.mark.anyio
async def test_tick_unmerges_when_observations_diverge(repo):
    """A soft-merge is revoked once its observations stop clustering.

    The test hand-builds a soft-merged identity pair through the repo,
    rewrites one attacker's fingerprints so the pair no longer shares a
    signal, and expects the next tick to clear ``merged_into_uuid`` and
    report the pair under ``identities_unmerged``.
    """
    import json as _json
    from datetime import datetime, timezone

    stamp = datetime.now(timezone.utc)

    # Both attackers carry the same JA3, so the first tick forms a
    # single identity for them.
    obs_a = await _seed_attacker(repo, "1.1.1.1", ja3="ja3-shared")
    obs_b = await _seed_attacker(repo, "2.2.2.2", ja3="ja3-shared")
    clusterer = ConnectedComponentsClusterer()
    initial = await clusterer.tick(repo)
    assert len(initial.identities_formed) == 1
    original_uuid = initial.identities_formed[0]["identity_uuid"]

    # Hand-craft a second identity row and move observation b onto it,
    # emulating a state reached over several ticks (form, then merge).
    extra_uuid = "00000000-0000-0000-0000-00000000bbbb"
    identity_row = {
        "uuid": extra_uuid,
        "schema_version": 1,
        "first_seen_at": stamp, "last_seen_at": stamp,
        "created_at": stamp, "updated_at": stamp,
        "observation_count": 1,
    }
    await repo.create_attacker_identity(identity_row)
    await repo.set_attacker_identity_id(obs_b, extra_uuid)

    # Min-uuid rule: the lexicographically smaller uuid is the winner.
    winner, loser = sorted((original_uuid, extra_uuid))
    if loser == original_uuid:
        # Swap ownership so observation a sits on the winner and
        # observation b on the loser before recording the merge.
        await repo.set_attacker_identity_id(obs_a, winner)
        await repo.set_attacker_identity_id(obs_b, loser)
    await repo.update_identity_merged_into(loser, winner)

    # The soft-merge must be visible before we break it.
    before = {}
    for row in await repo.list_all_identities():
        before[row["uuid"]] = row
    assert before[loser]["merged_into_uuid"] == winner

    # Give 2.2.2.2 a different JA3 so a and b no longer cluster.
    diverged_fingerprints = _json.dumps([{"kind": "ja3", "hash": "ja3-different"}])
    await repo.upsert_attacker({
        "ip": "2.2.2.2", "first_seen": stamp, "last_seen": stamp,
        "event_count": 1,
        "fingerprints": diverged_fingerprints,
    })

    # The next tick should notice the divergence and revoke the merge.
    outcome = await clusterer.tick(repo)
    assert len(outcome.identities_unmerged) == 1
    event = outcome.identities_unmerged[0]
    assert event["resurrected_uuid"] == loser
    assert event["former_winner_uuid"] == winner

    after = {}
    for row in await repo.list_all_identities():
        after[row["uuid"]] = row
    assert after[loser]["merged_into_uuid"] is None
    assert after[winner]["merged_into_uuid"] is None
@pytest.mark.anyio
async def test_tick_is_idempotent_under_no_changes(repo):
    """A second tick over unchanged state must report no events at all."""
    seeds = (("1.1.1.1", "ja3-x"), ("2.2.2.2", "ja3-x"), ("3.3.3.3", "ja3-y"))
    for ip, ja3 in seeds:
        await _seed_attacker(repo, ip, ja3=ja3)

    clusterer = ConnectedComponentsClusterer()
    initial = await clusterer.tick(repo)
    repeat = await clusterer.tick(repo)

    # Every event list on the repeat tick must be empty.
    assert repeat.identities_formed == []
    assert repeat.observations_linked == []
    assert repeat.identities_merged == []
    assert repeat.identities_unmerged == []

    # Sanity: the initial tick did do something.
    assert initial.identities_formed
@pytest.mark.anyio
async def test_tick_links_new_observation_to_existing_identity(repo):
    """First tick: 2 attackers cluster into one identity. Second tick:
    a new attacker with the same JA3 should get linked, not minted.

    The "not minted" half is checked by requiring ``identities_formed``
    to be empty on the second tick. Merely checking that the existing
    uuid is absent from the formed list could never fail, because a
    freshly minted identity would carry a different uuid anyway.
    """
    await _seed_attacker(repo, "1.1.1.1", ja3="ja3-x")
    await _seed_attacker(repo, "2.2.2.2", ja3="ja3-x")
    c = ConnectedComponentsClusterer()
    first = await c.tick(repo)
    assert len(first.identities_formed) == 1

    # New observation arrives; same JA3.
    d = await _seed_attacker(repo, "3.3.3.3", ja3="ja3-x")
    second = await c.tick(repo)

    # The only component already has an identity, so nothing new may
    # be formed; the new observation must surface as a link instead.
    assert not second.identities_formed, (
        "second tick must link to the existing identity, not mint a new one"
    )
    linked_uuids = {l_["observation_uuid"] for l_ in second.observations_linked}
    assert d in linked_uuids
# ─── identity fingerprint rollup ───────────────────────────────────────────
@pytest.mark.anyio
async def test_tick_rolls_up_fingerprint_columns_on_create(repo):
    """Minting an identity must fill its fingerprint rollup columns.

    ``ja3_hashes``, ``hassh_hashes`` and ``tls_cert_sha256`` on the new
    row are expected to hold the deduplicated, sorted values gathered
    from every member observation.
    """
    cert_one, cert_two = "ab" * 32, "cd" * 32
    for ip, cert in (("1.1.1.1", cert_one), ("2.2.2.2", cert_two)):
        await _seed_attacker(
            repo, ip, ja3="ja3-x", hassh="hassh-y", cert_sha256=cert,
        )

    clusterer = ConnectedComponentsClusterer()
    outcome = await clusterer.tick(repo)
    assert len(outcome.identities_formed) == 1
    minted_uuid = outcome.identities_formed[0]["identity_uuid"]

    by_uuid = {}
    for row in await repo.list_all_identities():
        by_uuid[row["uuid"]] = row
    minted = by_uuid[minted_uuid]

    assert json.loads(minted["ja3_hashes"]) == ["ja3-x"]
    assert json.loads(minted["hassh_hashes"]) == ["hassh-y"]
    assert json.loads(minted["tls_cert_sha256"]) == sorted([cert_one, cert_two])
@pytest.mark.anyio
async def test_tick_rolls_up_fingerprints_on_link(repo):
    """Linking a new observation must extend the identity's cert rollup."""
    old_cert = "ab" * 32
    new_cert = "cd" * 32

    await _seed_attacker(repo, "1.1.1.1", ja3="ja3-x", cert_sha256=old_cert)
    clusterer = ConnectedComponentsClusterer()
    initial = await clusterer.tick(repo)
    target_uuid = initial.identities_formed[0]["identity_uuid"]

    # A second attacker with the same JA3 but a cert not seen before.
    await _seed_attacker(repo, "2.2.2.2", ja3="ja3-x", cert_sha256=new_cert)
    await clusterer.tick(repo)

    by_uuid = {}
    for row in await repo.list_all_identities():
        by_uuid[row["uuid"]] = row
    rolled_up = json.loads(by_uuid[target_uuid]["tls_cert_sha256"])
    assert rolled_up == sorted([old_cert, new_cert])
@pytest.mark.anyio
async def test_tick_leaves_columns_null_when_no_fingerprints(repo):
    """Two attackers with NO fingerprint signal cluster as separate
    singletons; their identity rows must keep all rollup columns NULL
    (not "[]" — NULL distinguishes 'no signal yet' from 'known empty')."""
    await _seed_attacker(repo, "1.1.1.1")
    await _seed_attacker(repo, "2.2.2.2")
    c = ConnectedComponentsClusterer()
    await c.tick(repo)
    identities = await repo.list_all_identities()
    # Guard against a vacuous pass: with no rows the loop below would
    # assert nothing at all.
    assert identities, "tick produced no identity rows to inspect"
    for identity in identities:
        assert identity["ja3_hashes"] is None
        assert identity["hassh_hashes"] is None
        assert identity["tls_cert_sha256"] is None
# ─── fixture-bound assertions (in-memory) ──────────────────────────────────
def _production_clusterer_predict(corpus) -> dict[str, str]:
    """Run the production ``cluster_observations`` over a corpus.

    Mirrors the reference clusterer signature (corpus → dict) so it can
    be passed to ``assert_fixture_bounds``. Only the corpus passed in is
    read; the DB-side path is covered by the tick tests.

    Returns a mapping of observation id → cluster id. Observations that
    are alone in their cluster are relabelled ``cc-singleton-<obs_id>``
    so the metrics see every singleton as a distinct class.
    """
    from collections import Counter

    obs = [from_synthetic(att) for att in corpus.attackers]
    labels = cluster_observations(obs)
    # Count members per cluster id so singletons can be told apart.
    cluster_sizes = Counter(labels.values())
    return {
        obs_id: (f"cc-singleton-{obs_id}" if cluster_sizes[cid] == 1 else cid)
        for obs_id, cid in labels.items()
    }
def test_lone_wolf_passes_with_production_clusterer():
    """Fixture 3 (lone_wolf): the production clusterer must satisfy the
    bounds recorded in ``lone_wolf.expected.yaml``."""
    from tests.clustering.fixture_harness import assert_fixture_bounds
    from tests.factories.campaign_factory import generate, load_yaml

    spec = load_yaml(FIXTURE_DIR / "lone_wolf.yaml")
    corpus = generate(spec, seed=0)
    bounds_path = FIXTURE_DIR / "lone_wolf.expected.yaml"
    assert_fixture_bounds(corpus, _production_clusterer_predict, bounds_path)
def test_shared_wordlist_passes_with_production_clusterer():
    """Fixture 1 (shared_wordlist): the production clusterer must
    satisfy the bounds recorded in ``shared_wordlist.expected.yaml``."""
    from tests.clustering.fixture_harness import assert_fixture_bounds
    from tests.factories.campaign_factory import generate, load_yaml

    spec = load_yaml(FIXTURE_DIR / "shared_wordlist.yaml")
    corpus = generate(spec, seed=0)
    bounds_path = FIXTURE_DIR / "shared_wordlist.expected.yaml"
    assert_fixture_bounds(corpus, _production_clusterer_predict, bounds_path)
def test_paused_campaign_passes_with_production_clusterer():
    """Fixture 4 (paused_campaign): the production clusterer must
    satisfy the bounds recorded in ``paused_campaign.expected.yaml``."""
    from tests.clustering.fixture_harness import assert_fixture_bounds
    from tests.factories.campaign_factory import generate, load_yaml

    spec = load_yaml(FIXTURE_DIR / "paused_campaign.yaml")
    corpus = generate(spec, seed=0)
    bounds_path = FIXTURE_DIR / "paused_campaign.expected.yaml"
    assert_fixture_bounds(corpus, _production_clusterer_predict, bounds_path)
def test_multi_operator_keeps_distinct_identities_with_production_clusterer():
    """Fixture 5 (multi_operator), scored at identity level.

    The production clusterer must yield exactly two predicted clusters
    and recover the identity-level truth labels perfectly.
    """
    from tests.factories.campaign_factory import generate, load_yaml
    from tests.clustering.metrics import score

    spec = load_yaml(FIXTURE_DIR / "multi_operator.yaml")
    corpus = generate(spec, seed=0)
    predicted = _production_clusterer_predict(corpus)

    # Exactly two predicted clusters: the operators must not be merged.
    distinct_clusters = set(predicted.values())
    assert len(distinct_clusters) == 2

    truth = corpus.truth_labels(level="identity")
    metrics = score(truth, predicted)
    # Perfect identity-level recovery on both metrics.
    for metric_name in ("adjusted_rand_index", "homogeneity"):
        assert metrics[metric_name] == pytest.approx(1.0)
def test_cluster_observations_credentials_alone_does_not_fuse():
    """Identical credential sets, with nothing else in common, must not
    put two observations into the same cluster."""
    observations = [
        Observation(
            observation_id=obs_id,
            credentials=frozenset({("root", "toor"), ("admin", "admin")}),
        )
        for obs_id in ("a", "b")
    ]
    labels = cluster_observations(observations)
    assert labels["a"] != labels["b"]
def test_cluster_observations_asn_alone_does_not_fuse():
    """A shared ASN, with nothing else in common, must not put two
    observations into the same cluster."""
    observations = [
        Observation(observation_id=obs_id, asn=64500) for obs_id in ("a", "b")
    ]
    labels = cluster_observations(observations)
    assert labels["a"] != labels["b"]
def test_cluster_observations_all_weak_signals_combined_does_not_fuse():
    """Matching ASN, credentials and commands together must still leave
    two observations in separate clusters."""
    observations = [
        Observation(
            observation_id=obs_id,
            asn=64500,
            credentials=frozenset({("root", "toor"), ("admin", "admin")}),
            commands_by_phase={"discovery": ("ls", "id")},
        )
        for obs_id in ("a", "b")
    ]
    labels = cluster_observations(observations)
    assert labels["a"] != labels["b"]
def test_shared_wordlist_no_false_merge_at_identity_level():
    """F1 ratchet: scored at identity level, the production clusterer
    must not fuse credential-sharing observations.

    Tightens the F1 bound in two ways: every attacker row must land in
    its own predicted cluster, and homogeneity against the
    identity-level truth labels must be 1.0 (no predicted cluster mixes
    truth identities).
    """
    from tests.factories.campaign_factory import generate, load_yaml
    from tests.clustering.metrics import score

    corpus = generate(load_yaml(FIXTURE_DIR / "shared_wordlist.yaml"), seed=0)
    pred = _production_clusterer_predict(corpus)
    metrics = score(corpus.truth_labels(level="identity"), pred)
    # Each row must land in its own predicted cluster — anything else
    # is a false merge driven by the credential-overlap signal.
    assert len(set(pred.values())) == len(corpus.attackers)
    assert metrics["homogeneity"] == pytest.approx(1.0)
def test_vpn_hopping_asn_alone_would_have_fragmented_but_doesnt():
    """F2 ratchet: the vpn_hopping corpus spans five ASNs, yet the
    production clusterer must still predict a single cluster."""
    from tests.factories.campaign_factory import generate, load_yaml

    spec = load_yaml(FIXTURE_DIR / "vpn_hopping.yaml")
    corpus = generate(spec, seed=0)
    predicted = _production_clusterer_predict(corpus)

    distinct_asns = set()
    for attacker in corpus.attackers:
        distinct_asns.add(attacker.asn)
    assert len(distinct_asns) == 5, "fixture sanity: 5 distinct ASNs"

    # One predicted cluster overall, not one per ASN.
    predicted_clusters = set(predicted.values())
    assert len(predicted_clusters) == 1
def test_cluster_observations_medium_alone_does_not_fuse():
    """An identical command sequence, with nothing else in common, must
    not put two observations into the same cluster."""
    observations = [
        Observation(
            observation_id=obs_id,
            commands_by_phase={"discovery": ("ls", "id", "uname")},
        )
        for obs_id in ("a", "b")
    ]
    labels = cluster_observations(observations)
    assert labels["a"] != labels["b"]
def _build_noise_floor_corpus():
    """Expand noise_floor.yaml's include_fixtures block into one corpus.

    Each included fixture is either a multi-campaign ``corpus`` spec
    (its campaigns are appended and its noise scanner count is added
    up) or a single ``campaign`` spec (wrapped and appended). The
    combined spec is generated with ``seed=0``.

    Keys that are present but null in the YAML (``campaigns``,
    ``noise``, ``scanner_count``, ``extra_noise_scanners``) are treated
    the same as absent keys.
    """
    import yaml as _yaml
    from typing import Any
    from tests.factories.campaign_factory import generate, load_yaml

    declared = _yaml.safe_load(
        (FIXTURE_DIR / "noise_floor.yaml").read_text(encoding="utf-8")
    )
    campaigns: list[dict[str, Any]] = []
    inherited_noise = 0
    for fname in declared["include_fixtures"]:
        sub = load_yaml(FIXTURE_DIR / fname)
        if "corpus" in sub:
            sub_corpus = sub["corpus"]
            # ``or`` fallbacks keep a null YAML value from crashing
            # extend() / int().
            campaigns.extend(sub_corpus.get("campaigns") or [])
            noise = sub_corpus.get("noise") or {}
            inherited_noise += int(noise.get("scanner_count") or 0)
        else:
            campaigns.append({"campaign": sub["campaign"]})
    extra = int(declared.get("extra_noise_scanners") or 0)
    spec = {"corpus": {
        "campaigns": campaigns,
        "noise": {"scanner_count": inherited_noise + extra},
    }}
    return generate(spec, seed=0)
def test_noise_floor_singleton_recall_holds_with_production_clusterer():
    """Fixture 6 ratchet — noise-floor isolation.

    On the expanded noise_floor corpus, the production clusterer's
    ``singleton_recall`` against campaign-level truth labels must be at
    least 0.95. Campaign-level scoring is used here; the companion test
    ``test_noise_floor_intra_campaign_recovery_with_production_clusterer``
    checks that the constituent campaigns still resolve.
    """
    from tests.clustering.metrics import score

    corpus = _build_noise_floor_corpus()
    truth = corpus.truth_labels(level="campaign")
    predicted = _production_clusterer_predict(corpus)
    metrics = score(truth, predicted)
    assert metrics["singleton_recall"] >= 0.95, metrics
def test_noise_floor_intra_campaign_recovery_with_production_clusterer():
    """The other half of F6: campaigns must still resolve inside the
    combined noise_floor corpus.

    Checks two things on the expanded corpus: the five
    ``vpn-hopping-001`` rows share one predicted cluster, and the
    ``shared-wordlist-A`` / ``shared-wordlist-B`` rows never share a
    predicted cluster.
    """
    corpus = _build_noise_floor_corpus()
    predicted = _production_clusterer_predict(corpus)

    def members_of(campaign_id):
        # Attacker ids whose truth campaign is ``campaign_id``.
        return [
            att.attacker_id
            for att in corpus.attackers
            if att.truth_campaign_id == campaign_id
        ]

    def clusters_of(member_ids):
        # Predicted cluster ids covering the given attackers.
        return {predicted[member_id] for member_id in member_ids}

    # vpn_hopping: five rotation rows, one predicted cluster.
    vpn_members = members_of("vpn-hopping-001")
    assert len(vpn_members) == 5
    assert len(clusters_of(vpn_members)) == 1, (
        "vpn_hopping must consolidate to one cluster across rotations"
    )

    # shared_wordlist A and B must end up in disjoint sets of clusters.
    wordlist_a = members_of("shared-wordlist-A")
    wordlist_b = members_of("shared-wordlist-B")
    assert wordlist_a and wordlist_b
    clusters_a = clusters_of(wordlist_a)
    clusters_b = clusters_of(wordlist_b)
    assert clusters_a.isdisjoint(clusters_b), (
        "shared_wordlist A and B must not share a cluster"
    )
def test_slow_burn_passes_with_production_clusterer():
    """Fixture 7 (slow_burn): the production clusterer must satisfy the
    YAML bounds, predict a single cluster for the whole corpus, and
    reach completeness 1.0."""
    from tests.clustering.fixture_harness import assert_fixture_bounds
    from tests.factories.campaign_factory import generate, load_yaml

    spec = load_yaml(FIXTURE_DIR / "slow_burn.yaml")
    corpus = generate(spec, seed=0)
    bounds_path = FIXTURE_DIR / "slow_burn.expected.yaml"
    metrics = assert_fixture_bounds(
        corpus, _production_clusterer_predict, bounds_path,
    )

    predicted = _production_clusterer_predict(corpus)
    # Everything in one cluster — the F7 contract.
    predicted_clusters = set(predicted.values())
    assert len(predicted_clusters) == 1
    assert metrics["completeness"] == pytest.approx(1.0)
def test_slow_burn_time_shift_invariance():
    """Shifting every timestamp in the slow_burn corpus by the same
    delta must leave the predicted cluster membership unchanged."""
    from datetime import timedelta
    from tests.factories.campaign_factory import generate, load_yaml

    def _canonical(pred: dict[str, str]) -> set[frozenset[str]]:
        # Cluster ids are opaque labels, so compare the membership
        # groupings: a set of frozensets of co-clustered observation ids.
        groups: dict[str, set[str]] = {}
        for oid, cid in pred.items():
            groups.setdefault(cid, set()).add(oid)
        return {frozenset(g) for g in groups.values()}

    spec = load_yaml(FIXTURE_DIR / "slow_burn.yaml")
    corpus = generate(spec, seed=0)
    baseline = _production_clusterer_predict(corpus)

    # Move every attacker and session forward by 90 days, in place.
    shift = timedelta(days=90)
    for attacker in corpus.attackers:
        attacker.first_seen = attacker.first_seen + shift
        attacker.last_seen = attacker.last_seen + shift
        for session in attacker.sessions:
            session.started_at = session.started_at + shift

    shifted = _production_clusterer_predict(corpus)
    assert _canonical(baseline) == _canonical(shifted)
def test_vpn_hopping_passes_at_identity_level_with_production_clusterer():
    """Fixture 2 (vpn_hopping), scored at identity level: the production
    clusterer must satisfy the YAML bounds and reach 1.0 on both
    adjusted Rand index and completeness."""
    from tests.clustering.fixture_harness import assert_fixture_bounds
    from tests.factories.campaign_factory import generate, load_yaml

    spec = load_yaml(FIXTURE_DIR / "vpn_hopping.yaml")
    corpus = generate(spec, seed=0)
    bounds_path = FIXTURE_DIR / "vpn_hopping.expected.yaml"
    metrics = assert_fixture_bounds(
        corpus,
        _production_clusterer_predict,
        bounds_path,
        truth_level="identity",
    )
    for metric_name in ("adjusted_rand_index", "completeness"):
        assert metrics[metric_name] == pytest.approx(1.0)

View File

@@ -0,0 +1,278 @@
"""Run the production campaign clusterer through all 7 fixtures.
The 7 fixtures' YAML bounds were tuned for *reference* clusterers
(``c2_callback_clusterer``, ``composite_signals_clusterer``, etc.).
The production campaign clusterer (``ConnectedComponentsCampaignClusterer``)
is the system under test now; this module asserts it meets every
existing bound, plus a few stricter per-fixture invariants where the
algorithm should — by design — score perfectly.
The pure path is what's exercised here: ``cluster_identities``
operating over ``IdentityFeatures`` projected via
``from_synthetic_identity``. ``SyntheticAttacker`` rows are first
grouped into identities by the identity clusterer, and each group is
merged into one ``IdentityFeatures`` that the campaign clusterer reads.
End-to-end DB-backed validation is in ``test_campaign_worker.py``.
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
import pytest
import yaml
from decnet.clustering.campaign.impl.connected_components import (
cluster_identities,
)
from decnet.clustering.campaign.impl.similarity import (
IdentityFeatures,
from_synthetic_identity,
)
from decnet.clustering.impl.connected_components import cluster_observations
from decnet.clustering.impl.similarity import from_synthetic
from tests.clustering.fixture_harness import assert_fixture_bounds
from tests.clustering.metrics import score
from tests.factories.campaign_factory import generate, load_yaml
FIXTURE_DIR = Path(__file__).parent.parent / "fixtures" / "campaigns"
def _load_corpus(yaml_name: str) -> Any:
    """Load a fixture; expand the noise_floor composite if required.

    A fixture whose YAML has an ``include_fixtures`` key is a composite:
    every listed fixture is loaded, its campaigns are collected, and the
    noise scanner counts are summed (plus ``extra_noise_scanners``)
    into one combined spec. Any other fixture is generated as-is. Both
    paths use ``seed=0``.

    Keys that are present but null in the YAML (``campaigns``,
    ``noise``, ``scanner_count``, ``extra_noise_scanners``) are treated
    the same as absent keys.
    """
    path = FIXTURE_DIR / yaml_name
    raw = yaml.safe_load(path.read_text(encoding="utf-8"))
    if "include_fixtures" not in raw:
        return generate(load_yaml(path), seed=0)

    # Mirror tests/clustering/test_noise_floor_fixture.py's expander —
    # noise_floor is the only fixture that uses this format.
    campaigns: list[dict[str, Any]] = []
    inherited_noise = 0
    for fname in raw["include_fixtures"]:
        sub = load_yaml(FIXTURE_DIR / fname)
        if "corpus" in sub:
            sub_corpus = sub["corpus"]
            # ``or`` fallbacks keep a null YAML value from crashing
            # extend() / int().
            campaigns.extend(sub_corpus.get("campaigns") or [])
            noise = sub_corpus.get("noise") or {}
            inherited_noise += int(noise.get("scanner_count") or 0)
        else:
            campaigns.append({"campaign": sub["campaign"]})
    extra = int(raw.get("extra_noise_scanners") or 0)
    spec: Any = {
        "corpus": {
            "campaigns": campaigns,
            "noise": {"scanner_count": inherited_noise + extra},
        }
    }
    return generate(spec, seed=0)
def production_campaign_clusterer(corpus) -> dict[str, str]:
    """Predict-fn adapter that chains identity and campaign clustering.

    Attackers are first clustered into identities; each identity's
    members are merged into one ``IdentityFeatures``; identities are
    then clustered into campaigns. The result maps every attacker id
    to the campaign cluster id of the identity it belongs to.
    """
    attackers = corpus.attackers

    # Layer 1: identity clustering over per-attacker observations.
    identity_of = cluster_observations([from_synthetic(att) for att in attackers])

    # Bucket attackers under the identity cluster they were assigned to.
    members_by_identity: dict[str, list] = {}
    for att in attackers:
        bucket = members_by_identity.setdefault(identity_of[att.attacker_id], [])
        bucket.append(att)

    # Layer 2: one merged feature record per identity, then campaign
    # clustering over those records.
    features: list[IdentityFeatures] = [
        _merge_features(identity_id, members)
        for identity_id, members in members_by_identity.items()
    ]
    campaign_of = cluster_identities(features)

    # attacker → identity → campaign.
    prediction: dict[str, str] = {}
    for att in attackers:
        prediction[att.attacker_id] = campaign_of[identity_of[att.attacker_id]]
    return prediction
def _merge_features(identity_uuid: str, members) -> IdentityFeatures:
    """Fold the per-attacker ``IdentityFeatures`` of ``members`` into one.

    Set-valued fields are unioned and session windows are concatenated
    in member order. Per-decky first-seen keeps the smallest timestamp
    and last-seen the largest, each with the phase recorded by the
    member that supplied it (``""`` when that member has none).
    Commands are concatenated per ``commands_by_phase_on_decky`` key.
    """
    projected = [
        from_synthetic_identity(member, identity_uuid=identity_uuid)
        for member in members
    ]

    asns: set[int] = set()
    payloads: set[str] = set()
    c2s: set[str] = set()
    deckies: set[str] = set()
    windows: list[tuple[float, float]] = []
    latest_phase: dict[str, str] = {}
    earliest_phase: dict[str, str] = {}
    latest_seen: dict[str, float] = {}
    earliest_seen: dict[str, float] = {}
    commands: dict[tuple[str, str], list[str]] = {}

    for feat in projected:
        asns |= feat.asn_cohort
        payloads |= feat.payload_hashes
        c2s |= feat.c2_endpoints
        deckies |= feat.decky_set
        windows.extend(feat.session_windows)

        for decky, seen in feat.first_seen_per_decky.items():
            known = earliest_seen.get(decky)
            if known is not None and not seen < known:
                # An earlier (or equal) sighting is already recorded.
                continue
            earliest_seen[decky] = seen
            earliest_phase[decky] = feat.first_phase_per_decky.get(decky, "")

        for decky, seen in feat.last_seen_per_decky.items():
            known = latest_seen.get(decky)
            if known is not None and not seen > known:
                # A later (or equal) sighting is already recorded.
                continue
            latest_seen[decky] = seen
            latest_phase[decky] = feat.last_phase_per_decky.get(decky, "")

        for key, cmds in feat.commands_by_phase_on_decky.items():
            commands.setdefault(key, []).extend(cmds)

    frozen_commands = {}
    for key, cmds in commands.items():
        frozen_commands[key] = tuple(cmds)

    return IdentityFeatures(
        identity_uuid=identity_uuid,
        asn_cohort=frozenset(asns),
        payload_hashes=frozenset(payloads),
        c2_endpoints=frozenset(c2s),
        decky_set=frozenset(deckies),
        session_windows=tuple(windows),
        last_phase_per_decky=latest_phase,
        first_phase_per_decky=earliest_phase,
        last_seen_per_decky=latest_seen,
        first_seen_per_decky=earliest_seen,
        commands_by_phase_on_decky=frozen_commands,
    )
# ─── Per-fixture bound assertions ───────────────────────────────────────────
@pytest.mark.parametrize(
    "yaml_name,expected_name,truth_level",
    [
        ("lone_wolf.yaml", "lone_wolf.expected.yaml", "campaign"),
        ("shared_wordlist.yaml", "shared_wordlist.expected.yaml", "campaign"),
        ("vpn_hopping.yaml", "vpn_hopping.expected.yaml", "campaign"),
        ("paused_campaign.yaml", "paused_campaign.expected.yaml", "campaign"),
        ("multi_operator.yaml", "multi_operator.expected.yaml", "campaign"),
        ("noise_floor.yaml", "noise_floor.expected.yaml", "campaign"),
        ("slow_burn.yaml", "slow_burn.expected.yaml", "campaign"),
    ],
)
def test_production_campaign_clusterer_passes_fixture_bounds(
    yaml_name: str, expected_name: str, truth_level: str,
) -> None:
    """Each fixture's YAML bounds must hold for the chained production
    clusterer at the given truth level."""
    corpus = _load_corpus(yaml_name)
    bounds_path = FIXTURE_DIR / expected_name
    assert_fixture_bounds(
        corpus,
        production_campaign_clusterer,
        bounds_path,
        truth_level=truth_level,
    )
# ─── Per-fixture sharpness assertions (production clusterer specifics) ─────
#
# These tighten the YAML bounds for fixtures where the production
# clusterer is expected to score *perfectly*. They live as Python
# assertions (not YAML) so they only gate the production clusterer —
# the YAML bounds stay loose for the reference-clusterer tests in the
# per-fixture files. Ratcheting these up over time is safe; the YAML
# bounds remain the floor that *every* tested clusterer must beat.
def test_f3_lone_wolf_perfect_score() -> None:
    """lone_wolf: singleton recall and ARI must both be 1.0 at campaign
    level."""
    corpus = _load_corpus("lone_wolf.yaml")
    truth = corpus.truth_labels(level="campaign")
    predicted = production_campaign_clusterer(corpus)
    metrics = score(truth, predicted)
    for metric_name in ("singleton_recall", "adjusted_rand_index"):
        assert metrics[metric_name] == pytest.approx(1.0)
def test_f1_shared_wordlist_no_false_merge() -> None:
    """Two campaigns burning the same wordlist must NOT fuse.

    Passes when no predicted cluster contains attackers from more than
    one truth campaign.
    """
    corpus = _load_corpus("shared_wordlist.yaml")
    pred = production_campaign_clusterer(corpus)
    truth = corpus.truth_labels(level="campaign")
    # Both mappings must cover the same attackers, or the lookup below
    # would be meaningless.
    assert pred.keys() == truth.keys()
    # No predicted cluster spans two truth campaigns.
    pred_to_truth: dict[str, set[str]] = {}
    for aid, p in pred.items():
        pred_to_truth.setdefault(p, set()).add(truth[aid])
    assert all(len(s) == 1 for s in pred_to_truth.values()), (
        f"shared_wordlist: predicted cluster spans multiple campaigns: "
        f"{pred_to_truth}"
    )
def test_f5_multi_operator_folds_to_one_campaign() -> None:
    """multi_operator: every attacker must land in a single predicted
    campaign, with ARI 1.0 at campaign level."""
    corpus = _load_corpus("multi_operator.yaml")
    pred = production_campaign_clusterer(corpus)
    cluster_ids = set(pred.values())
    # The two message fragments are concatenated, so the first one
    # carries an explicit separator.
    assert len(cluster_ids) == 1, (
        f"multi_operator: expected 1 campaign, got {len(cluster_ids)}; "
        f"predictions: {pred}"
    )
    metrics = score(corpus.truth_labels(level="campaign"), pred)
    assert metrics["adjusted_rand_index"] == pytest.approx(1.0)
def test_f7_slow_burn_time_shift_invariance() -> None:
    """Shift every timestamp +90 days — the partition must not change.

    Attacker first/last-seen and every session start are moved by the
    same delta in place; the predicted cluster partition before and
    after the shift must be equal.
    """
    from datetime import timedelta

    corpus = _load_corpus("slow_burn.yaml")
    partition_before = _partition(production_campaign_clusterer(corpus))

    shift = timedelta(days=90)
    for attacker in corpus.attackers:
        attacker.first_seen += shift
        attacker.last_seen += shift
        for session in attacker.sessions:
            session.started_at += shift

    # Labels are opaque; only the grouping of members is compared.
    partition_after = _partition(production_campaign_clusterer(corpus))
    assert partition_before == partition_after, (
        f"slow_burn: +90d shift changed the predicted partition\n"
        f"base: {partition_before}\n"
        f"shifted: {partition_after}"
    )
def _partition(labels: dict[str, str]) -> set[frozenset[str]]:
"""Return the cluster partition (set of frozensets of member ids).
Cluster id strings are arbitrary; the equivalence we care about is
"which ids ended up in the same cluster?".
"""
by_cluster: dict[str, set[str]] = {}
for member, cluster_id in labels.items():
by_cluster.setdefault(cluster_id, set()).add(member)
return {frozenset(s) for s in by_cluster.values()}

View File

@@ -0,0 +1,74 @@
"""
End-to-end pipeline test for fixture 3 (lone_wolf).
Loads the YAML spec, runs the synthetic generator, applies the
identity-clusterer placeholder (each attacker → its own cluster), and
scores against the expected bounds. This is the simplest of the campaign
fixtures and is deliberately the first one wired up — its ground truth
is all singletons, so an identity clusterer trivially passes, which
proves the DSL → factory → metrics pipeline works before any real
algorithm is built.
Once the connected-components clusterer (CAMPAIGN_CLUSTERING.md §4)
lands, the same fixture must continue to pass.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from tests.clustering.fixture_harness import (
assert_fixture_bounds,
identity_clusterer,
)
from tests.clustering.metrics import score
from tests.factories.campaign_factory import generate, load_yaml
FIXTURE_DIR = Path(__file__).parent.parent / "fixtures" / "campaigns"
def test_lone_wolf_pipeline_passes_bounds() -> None:
    """The placeholder identity clusterer must satisfy lone_wolf's
    expected bounds."""
    corpus = generate(load_yaml(FIXTURE_DIR / "lone_wolf.yaml"), seed=0)
    bounds_path = FIXTURE_DIR / "lone_wolf.expected.yaml"
    assert_fixture_bounds(corpus, identity_clusterer, bounds_path)
def test_lone_wolf_corpus_shape() -> None:
    """Sanity: 1 wolf + 8 noise scanners = 9 attackers, 9 sessions."""
    corpus = generate(load_yaml(FIXTURE_DIR / "lone_wolf.yaml"), seed=0)
    assert len(corpus.attackers) == 9
    assert len(corpus.sessions) == 9
    # All nine attackers must carry distinct truth campaign ids, i.e. the
    # ground truth is made of singletons only.
    distinct_campaigns = set()
    for attacker in corpus.attackers:
        distinct_campaigns.add(attacker.truth_campaign_id)
    assert len(distinct_campaigns) == 9
def test_identity_clusterer_fails_on_a_real_campaign() -> None:
    """
    Harness sanity check, NOT a test of the clusterer.

    A campaign with two actors is fragmented by the placeholder identity
    clusterer into one-member clusters, so completeness must drop below
    1.0 while homogeneity stays at 1.0. If completeness did not drop, the
    metrics would be blind to false splits — the entire point of fixtures
    4 and 5 in the design doc.
    """
    actors = [
        {"id": "a-1", "asn": 14061},
        {"id": "a-2", "asn": 14061},
    ]
    phases = [
        {"name": "delivery", "actor": "a-1"},
        {"name": "discovery", "actor": "a-2"},
    ]
    spec = {
        "campaign": {
            "id": "c-real",
            "actors": actors,
            "phases": phases,
            "duration_days": 1,
        }
    }
    corpus = generate(spec, seed=0)
    pred = identity_clusterer(corpus)
    metrics = score(corpus.truth_labels(), pred)
    assert metrics["completeness"] < 1.0
    assert metrics["homogeneity"] == pytest.approx(1.0)

View File

@@ -0,0 +1,76 @@
"""Sanity tests for the clustering metric harness."""
from __future__ import annotations
import pytest
from tests.clustering.metrics import (
adjusted_rand_index,
completeness,
homogeneity,
score,
singleton_recall,
)
def test_perfect_agreement_scores_one() -> None:
    """A renamed-but-isomorphic prediction scores 1.0 on every metric."""
    truth = {"a": "C1", "b": "C1", "c": "C2", "d": "C2"}
    # Clustering does not preserve label names, so the prediction uses
    # different ids for the very same grouping.
    pred = {"a": "X", "b": "X", "c": "Y", "d": "Y"}
    result = score(truth, pred)
    for metric in (
        "adjusted_rand_index",
        "homogeneity",
        "completeness",
        "singleton_recall",
    ):
        assert result[metric] == pytest.approx(1.0)
def test_all_singletons_perfect() -> None:
    """All-singleton truth matched by an all-singleton prediction scores 1.0."""
    truth = {"a": "A", "b": "B", "c": "C"}
    pred = {"a": "1", "b": "2", "c": "3"}
    result = score(truth, pred)
    for metric in ("singleton_recall", "adjusted_rand_index"):
        assert result[metric] == pytest.approx(1.0)
def test_false_merge_drops_homogeneity() -> None:
    """Merging two distinct campaigns zeroes homogeneity but not completeness."""
    truth = {"a": "C1", "b": "C2"}
    merged = {"a": "X", "b": "X"}  # two distinct campaigns in one cluster
    observed_homogeneity = homogeneity(truth, merged)
    assert observed_homogeneity == pytest.approx(0.0)
    # Each true class still lives inside a single cluster, so
    # completeness is unaffected.
    observed_completeness = completeness(truth, merged)
    assert observed_completeness == pytest.approx(1.0)
def test_false_split_drops_completeness() -> None:
    """Splitting one campaign in two zeroes completeness but not homogeneity."""
    truth = {"a": "C1", "b": "C1"}
    split = {"a": "X", "b": "Y"}  # one campaign spread over two clusters
    observed_completeness = completeness(truth, split)
    assert observed_completeness == pytest.approx(0.0)
    observed_homogeneity = homogeneity(truth, split)
    assert observed_homogeneity == pytest.approx(1.0)
def test_singleton_recall_penalises_noise_absorption() -> None:
    """Singleton recall is 0.0 when wolves are absorbed and 1.0 when they stay alone."""
    # Ground truth: 3 lone wolves + 1 real campaign with 2 members.
    truth = {"w1": "wolf1", "w2": "wolf2", "w3": "wolf3", "c1": "C", "c2": "C"}

    # A clusterer that swallows every wolf into the campaign cluster.
    absorbing = {item: "BIG" for item in truth}
    assert singleton_recall(truth, absorbing) == pytest.approx(0.0)

    # A clusterer that keeps the wolves singleton scores 1.0 on this
    # metric, regardless of what it does with the campaign.
    preserving = {"w1": "1", "w2": "2", "w3": "3", "c1": "C", "c2": "C"}
    assert singleton_recall(truth, preserving) == pytest.approx(1.0)
def test_mismatched_item_sets_raises() -> None:
    """ARI over labelings that cover different item ids raises ValueError."""
    truth_items = {"a": "X"}
    pred_items = {"b": "Y"}
    with pytest.raises(ValueError):
        adjusted_rand_index(truth_items, pred_items)
def test_random_labels_low_ari() -> None:
    """ARI of a truth-ignoring partition stays well below 1 (chance correction)."""
    item_ids = [f"i{n}" for n in range(20)]
    # Truth: five consecutive blocks of four items each.
    truth = {item: f"C{n // 4}" for n, item in enumerate(item_ids)}
    # Prediction: five buckets assigned in an order uncorrelated with
    # the truth blocks.
    pred = {item: f"X{(n * 7) % 5}" for n, item in enumerate(item_ids)}
    observed = adjusted_rand_index(truth, pred)
    # Loose bound — the point is "much closer to 0 than to 1".
    assert observed < 0.3

View File

@@ -0,0 +1,134 @@
"""
End-to-end pipeline test for fixture 5 (multi_operator).
One campaign, two operators with distinct UKC roles, distinct
tooling (different JA3 + HASSH), distinct ASNs and IPs, on
opposite shift schedules. What ties them is shared C2 callback +
shared stage-1 payload hash — the planned similarity graph's
"payload simhash + C2 endpoint match" arms are what should resolve
them as one campaign.
Three tests cover this (a fourth, `test_fingerprint_clusterer_cannot_resolve_this_fixture`, is a harness sanity check):
1. `test_multi_operator_corpus_shape` — sanity: two attackers, one
campaign, distinct fingerprints, shared C2 callback present in
both rows' sessions, distinct shift hours.
2. `test_multi_operator_pipeline_passes_bounds` — runs
`c2_callback_clusterer` (the appropriate pass-clusterer for
this fixture, since fingerprint_clusterer would split the two
distinct operators). Folds both rows into one cluster via the
shared C2 endpoint.
3. `test_shift_clusterer_fragments_campaign` — runs the deliberately
bad `shift_clusterer`. Actor A on night shift and Actor B on day
shift split into two clusters → completeness collapses → the
bound floor on completeness rejects the bad clusterer. This is
the canonical proof that operational-schedule overlap is NOT a
campaign signal.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from tests.clustering.fixture_harness import (
assert_fixture_bounds,
c2_callback_clusterer,
fingerprint_clusterer,
shift_clusterer,
)
from tests.clustering.metrics import score
from tests.factories.campaign_factory import generate, load_yaml
FIXTURE_DIR = Path(__file__).parent.parent / "fixtures" / "campaigns"
FIXTURE_YAML = FIXTURE_DIR / "multi_operator.yaml"
EXPECTED_YAML = FIXTURE_DIR / "multi_operator.expected.yaml"
def test_multi_operator_corpus_shape() -> None:
    """Sanity-check the generated multi_operator corpus before any clustering."""
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    assert len(corpus.attackers) == 2
    assert {row.truth_campaign_id for row in corpus.attackers} == {"multi-operator-001"}

    # The operators are different people using different tools, so both
    # fingerprint slots must hold two distinct values.
    distinct_ja3 = {row.ja3 for row in corpus.attackers}
    distinct_hassh = {row.hassh for row in corpus.attackers}
    assert len(distinct_ja3) == 2
    assert len(distinct_hassh) == 2

    # The shared C2 callback must show up in both rows' sessions.
    rows = {row.truth_actor_id: row for row in corpus.attackers}
    broker = rows["ops-broker-night"]
    postex = rows["ops-postex-day"]
    callbacks_per_operator = [
        {sess.c2_callback for sess in operator.sessions if sess.c2_callback}
        for operator in (broker, postex)
    ]
    for callbacks in callbacks_per_operator:
        assert "c2.shared-op.example" in callbacks

    # Shifts are disjoint — load-bearing for the adversarial test.
    night_hours = {22, 23, 0, 1, 2, 3}
    day_hours = {9, 10, 11, 12, 13}
    broker_hours = {sess.started_at.hour for sess in broker.sessions}
    postex_hours = {sess.started_at.hour for sess in postex.sessions}
    assert broker_hours <= night_hours
    assert postex_hours <= day_hours
def test_multi_operator_pipeline_passes_bounds() -> None:
    """`c2_callback_clusterer` meets the fixture bounds and yields exactly one cluster."""
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    metrics = assert_fixture_bounds(corpus, c2_callback_clusterer, EXPECTED_YAML)
    pred = c2_callback_clusterer(corpus)
    cluster_ids = set(pred.values())
    assert len(cluster_ids) == 1, (
        "c2_callback_clusterer should fold both operators into one cluster"
    )
    assert metrics["adjusted_rand_index"] == pytest.approx(1.0)
def test_fingerprint_clusterer_cannot_resolve_this_fixture() -> None:
    """
    Harness sanity check, NOT a test of the clusterer.

    With two distinct fingerprints and a single truth campaign,
    `fingerprint_clusterer` yields 2 clusters and completeness collapses
    to 0. That is *why* this fixture's pass-clusterer is
    `c2_callback_clusterer`: it documents which signal actually carries
    the campaign here.
    """
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    pred = fingerprint_clusterer(corpus)
    cluster_ids = set(pred.values())
    assert len(cluster_ids) == 2
    campaign_truth = corpus.truth_labels(level="campaign")
    metrics = score(campaign_truth, pred)
    assert metrics["completeness"] == pytest.approx(0.0)
def test_shift_clusterer_fragments_campaign() -> None:
    """
    The fixture's reason for being.

    Bucketing attackers by shift puts the two operators into a 'night'
    and a 'day' cluster, i.e. 2 predicted clusters against 1 truth
    campaign, so completeness collapses and breaches its floor.

    If the completeness floor ever stops being breached here, the fixture
    has lost its discrimination power.
    """
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    pred = shift_clusterer(corpus)
    buckets = set(pred.values())
    assert buckets == {"shift-night", "shift-day"}, (
        f"expected one night cluster + one day cluster, got {buckets}"
    )
    metrics = score(corpus.truth_labels(level="campaign"), pred)
    assert metrics["completeness"] == pytest.approx(0.0)

    # Metric floors the bad clusterer is checked against.
    floors = {
        "adjusted_rand_index": 0.85,
        "homogeneity": 0.90,
        "completeness": 0.80,
        "singleton_recall": 0.95,
    }
    breaches = []
    for metric_name, floor in floors.items():
        if metrics[metric_name] < floor:
            breaches.append(metric_name)
    assert "completeness" in breaches, (
        f"fixture failed to catch the bad clusterer; observed metrics: {metrics}"
    )

View File

@@ -0,0 +1,167 @@
"""
End-to-end pipeline test for fixture 6 (noise_floor).
Composite corpus: bundles all five prior fixtures' campaigns + 10
Delivery-only noise scanners on top of lone_wolf's 8 inherited
ones. The fixture exists to catch cross-corpus interference —
signal collisions, factory ID re-use, clusterer ambiguity that
shows up only when multiple campaigns are scored together. Each
constituent fixture already ships its own in-fixture adversarial
test; fixture 6 covers a different failure class.
The composition is declared in `noise_floor.yaml` via an
``include_fixtures`` block (a fixture-6-specific format). The
loader in this test file expands it into a full
``corpus.campaigns`` spec at runtime, so the factory itself stays
unaware of the include mechanism.
Three tests cover this:
1. `test_noise_floor_corpus_integrity` — every constituent
fixture's campaigns + actors are present in the merged corpus
with their truth labels intact, and the 10 extra noise scanners
are present alongside lone_wolf's 8 (truth-singletons all).
2. `test_noise_floor_pipeline_passes_bounds` — runs
`composite_signals_clusterer` against the merged corpus.
Approximates the planned similarity graph well enough that
every campaign resolves and every singleton stays singleton.
Trips the bound floors if any cross-fixture interference creeps
in (signal collisions across fixtures' JA3/HASSH/C2 strings).
3. `test_noise_floor_singleton_recall_holds` — explicit assertion
that every truth-singleton (the lone wolf, the 8 inherited noise
scanners, the 10 extra noise scanners, and the two single-actor
shared_wordlist campaigns — 21 total) ends up in a singleton
predicted cluster. Singleton recall is the load-bearing metric
for this fixture.
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
import pytest
import yaml
from tests.clustering.fixture_harness import (
assert_fixture_bounds,
composite_signals_clusterer,
)
from tests.clustering.metrics import score
from tests.factories.campaign_factory import generate, load_yaml
FIXTURE_DIR = Path(__file__).parent.parent / "fixtures" / "campaigns"
FIXTURE_YAML = FIXTURE_DIR / "noise_floor.yaml"
EXPECTED_YAML = FIXTURE_DIR / "noise_floor.expected.yaml"
def _expand_noise_floor_spec() -> dict[str, Any]:
    """Expand noise_floor.yaml's ``include_fixtures`` block into one corpus spec.

    Each listed fixture is loaded. A corpus-shaped fixture contributes its
    ``campaigns`` list and its noise ``scanner_count``; any other fixture
    contributes its single ``campaign`` entry. The top-level
    ``extra_noise_scanners`` value (0 when absent) is added to the
    inherited scanner count. The returned dict is passed straight to the
    factory's ``generate()`` by the tests in this file.
    """
    manifest = yaml.safe_load(FIXTURE_YAML.read_text(encoding="utf-8"))
    merged_campaigns: list[dict[str, Any]] = []
    inherited_scanners = 0
    for fixture_name in manifest["include_fixtures"]:
        constituent = load_yaml(FIXTURE_DIR / fixture_name)
        if "corpus" not in constituent:
            # Single-campaign fixture: wrap it in the corpus list shape.
            merged_campaigns.append({"campaign": constituent["campaign"]})
            continue
        corpus_block = constituent["corpus"]
        merged_campaigns.extend(corpus_block.get("campaigns", []))
        # ``noise`` may be absent or explicitly null; both count as zero.
        noise_block = corpus_block.get("noise") or {}
        inherited_scanners += int(noise_block.get("scanner_count", 0))
    extra_scanners = int(manifest.get("extra_noise_scanners", 0))
    noise_spec = {"scanner_count": inherited_scanners + extra_scanners}
    return {"corpus": {"campaigns": merged_campaigns, "noise": noise_spec}}
def test_noise_floor_corpus_integrity() -> None:
    """The merged corpus keeps every constituent campaign id, 18 noise scanners and 12 real attackers."""
    corpus = generate(_expand_noise_floor_spec(), seed=0)
    truth_campaigns = {a.truth_campaign_id for a in corpus.attackers}

    # A campaign id missing from the merged corpus means the loader
    # dropped one of the constituent fixtures.
    expected_campaign_ids = {
        "shared-wordlist-A",
        "shared-wordlist-B",
        "vpn-hopping-001",
        "lone-wolf-001",
        "paused-campaign-001",
        "multi-operator-001",
    }
    assert expected_campaign_ids <= truth_campaigns, (
        f"missing campaign ids: {expected_campaign_ids - truth_campaigns}"
    )

    # Split the attackers by truth-campaign prefix in one pass.
    noise_attackers = []
    real_attackers = []
    for attacker in corpus.attackers:
        if attacker.truth_campaign_id.startswith("noise-scanner-"):
            noise_attackers.append(attacker)
        else:
            real_attackers.append(attacker)

    # Noise scanner count: 8 inherited from lone_wolf + 10 added.
    assert len(noise_attackers) == 18
    # Every noise scanner is its own truth-campaign (singleton).
    assert len({a.truth_campaign_id for a in noise_attackers}) == 18

    # Real-campaign attackers: 2 (shared_wordlist) + 5 (vpn_hopping) +
    # 1 (lone_wolf wolf) + 2 (paused_campaign) + 2 (multi_operator) = 12.
    assert len(real_attackers) == 12, (
        f"expected 12 campaign-driven attackers, got {len(real_attackers)}"
    )
def test_noise_floor_pipeline_passes_bounds() -> None:
    """`composite_signals_clusterer` meets the noise_floor bounds on the merged corpus."""
    corpus = generate(_expand_noise_floor_spec(), seed=0)
    metrics = assert_fixture_bounds(corpus, composite_signals_clusterer, EXPECTED_YAML)
    # The combined corpus is heterogeneous, so a perfect ARI is not
    # required (the bound is loose at 0.85). Still verify the harness
    # returned sensible numbers.
    ari = metrics["adjusted_rand_index"]
    assert ari >= 0.85
    recall = metrics["singleton_recall"]
    assert recall >= 0.95
def test_noise_floor_singleton_recall_holds() -> None:
    """Every truth-singleton must remain singleton under the composite clusterer.

    The composite holds 21 truth-singletons: the lone wolf, 18 noise
    scanners, and the two shared_wordlist actors (each of those campaigns
    has a single actor). Noise absorption is the failure mode that makes
    campaign attribution useless in practice.
    """
    from collections import Counter

    corpus = generate(_expand_noise_floor_spec(), seed=0)
    pred = composite_signals_clusterer(corpus)
    truth = corpus.truth_labels(level="campaign")

    # Cluster sizes on both sides, keyed by label.
    truth_sizes = Counter(truth.values())
    pred_sizes = Counter(pred.values())

    true_singletons = []
    for aid, campaign in truth.items():
        if truth_sizes[campaign] == 1:
            true_singletons.append(aid)
    assert len(true_singletons) == 21, (
        f"expected 21 truth-singletons, got {len(true_singletons)}"
    )

    # A truth-singleton is "absorbed" when its predicted cluster holds
    # anything other than exactly one member.
    absorbed = [aid for aid in true_singletons if pred_sizes[pred[aid]] != 1]
    assert not absorbed, (
        f"composite clusterer absorbed {len(absorbed)} singletons into "
        f"larger clusters: {absorbed[:5]}"
    )
    metrics = score(truth, pred)
    assert metrics["singleton_recall"] == pytest.approx(1.0)

View File

@@ -0,0 +1,140 @@
"""
End-to-end pipeline test for fixture 4 (paused_campaign).
One campaign, two operational windows separated by a multi-day
silent stretch (days 3-5, 0-indexed [2, 4]). Modeled as two DSL
actors sharing JA3 + HASSH + payload + C2 callback — the
fingerprint-stable signals a real clusterer should resolve on.
Their ``active_days`` differ so each row's sessions land in
disjoint time ranges; this is what gives the adversarial
``time_window_clusterer`` something to fragment.
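Three tests cover this (two further sanity checks confirm that the gap threshold drives the split and that the pause window is truly silent):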
1. `test_paused_campaign_corpus_shape` — sanity: 2 attackers, both
share campaign id, sessions are time-disjoint across the pause
window.
2. `test_paused_campaign_pipeline_passes_bounds` —
`fingerprint_clusterer` reference folds both rows into one
cluster (shared JA3 + HASSH). Trivially green at campaign-level
scoring; the test is a ratchet point for the real algorithm.
3. `test_time_window_clusterer_fragments_campaign` — runs the
deliberately-bad `time_window_clusterer`. With a 4-day silent
stretch and a 1-day union threshold, the two halves cannot be
bridged → 2 clusters → completeness collapses → bound rejected.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from tests.clustering.fixture_harness import (
assert_fixture_bounds,
fingerprint_clusterer,
time_window_clusterer,
)
from tests.clustering.metrics import score
from tests.factories.campaign_factory import generate, load_yaml
FIXTURE_DIR = Path(__file__).parent.parent / "fixtures" / "campaigns"
FIXTURE_YAML = FIXTURE_DIR / "paused_campaign.yaml"
EXPECTED_YAML = FIXTURE_DIR / "paused_campaign.expected.yaml"
def test_paused_campaign_corpus_shape() -> None:
    """Sanity-check the paused_campaign corpus: 2 rows, shared fingerprints, disjoint day windows."""
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    assert len(corpus.attackers) == 2
    assert {row.truth_campaign_id for row in corpus.attackers} == {"paused-campaign-001"}

    # Both rows share the operator's JA3 and HASSH — the load-bearing
    # signal that lets fingerprint_clusterer fold them.
    distinct_ja3 = {row.ja3 for row in corpus.attackers}
    distinct_hassh = {row.hassh for row in corpus.attackers}
    assert len(distinct_ja3) == 1
    assert len(distinct_hassh) == 1

    # Each row's session timeline lives in its actor's active_days.
    rows_by_actor = {row.truth_actor_id: row for row in corpus.attackers}
    first_sprint = rows_by_actor["ops-sprint-1"]
    second_sprint = rows_by_actor["ops-sprint-2"]
    sprint_1_days = {sess.started_at.day for sess in first_sprint.sessions}
    sprint_2_days = {sess.started_at.day for sess in second_sprint.sessions}
    # Epoch is 2026-01-01; active_days [0,1] → calendar days 1,2;
    # active_days [5,6] → calendar days 6,7.
    assert sprint_1_days <= {1, 2}, f"sprint-1 leaked outside its window: {sprint_1_days}"
    assert sprint_2_days <= {6, 7}, f"sprint-2 leaked outside its window: {sprint_2_days}"
def test_paused_campaign_pipeline_passes_bounds() -> None:
    """`fingerprint_clusterer` meets the fixture bounds and folds both rows into one cluster."""
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    metrics = assert_fixture_bounds(corpus, fingerprint_clusterer, EXPECTED_YAML)
    # Both rows share fingerprints → one predicted cluster.
    pred = fingerprint_clusterer(corpus)
    cluster_ids = set(pred.values())
    assert len(cluster_ids) == 1
    # Truth = 1 campaign of 2 rows; pred = 1 cluster of 2 rows → ARI 1.0.
    assert metrics["adjusted_rand_index"] == pytest.approx(1.0)
def test_time_window_clusterer_fragments_campaign() -> None:
    """
    The fixture's reason for being.

    With a 1-day union threshold the bad clusterer cannot bridge the
    silence between the two operational windows, so the campaign splits
    in two and completeness collapses below its floor.

    If the completeness floor ever stops being breached here, the fixture
    has lost its discrimination power.
    """
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    pred = time_window_clusterer(corpus, gap_days=1.0)
    assert len(set(pred.values())) == 2, (
        f"time-window clusterer should split into 2 clusters, got {len(set(pred.values()))}"
    )
    metrics = score(corpus.truth_labels(level="campaign"), pred)
    assert metrics["completeness"] == pytest.approx(0.0)

    # Metric floors the bad clusterer is checked against.
    floors = {
        "adjusted_rand_index": 0.85,
        "homogeneity": 0.90,
        "completeness": 0.80,
        "singleton_recall": 0.95,
    }
    breaches = []
    for metric_name, floor in floors.items():
        if metrics[metric_name] < floor:
            breaches.append(metric_name)
    assert "completeness" in breaches, (
        f"fixture failed to catch the bad clusterer; observed metrics: {metrics}"
    )
def test_time_window_clusterer_with_huge_gap_does_not_fragment() -> None:
    """
    Sanity check for the time-window reference clusterer.

    With a gap threshold larger than the campaign's silent stretch the
    two halves union into one cluster, confirming that the split seen at
    gap_days=1 depends on the threshold and not on something unrelated.
    (Pause is days 3-5 → max separation between session ranges is
    ≈4 days; gap_days=10 must bridge.)
    """
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    pred = time_window_clusterer(corpus, gap_days=10.0)
    cluster_ids = set(pred.values())
    assert len(cluster_ids) == 1
def test_silent_stretch_actually_silent() -> None:
    """No session may start on a calendar day inside the configured pause window."""
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    # 1-indexed calendar days; pause_windows [[2,4]] in 0-indexed terms.
    pause_calendar_days = {3, 4, 5}
    leaked = []
    for session in corpus.sessions:
        if session.started_at.day in pause_calendar_days:
            leaked.append(session)
    assert not leaked, (
        f"sessions leaked into the silent stretch: "
        f"{[(s.session_id, s.started_at) for s in leaked]}"
    )

View File

@@ -0,0 +1,117 @@
"""
End-to-end pipeline test for fixture 1 (shared_wordlist).
Two campaigns. Same SSH credential wordlist. Everything else divergent
— ASN, IPs, JA3, HASSH, active hours.
The fixture exists to defeat one specific failure mode: a clusterer
that leans on credential-list overlap as a primary signal. Commodity
wordlists (rockyou, defaults lists, top-1k common-credentials) are
shared by hundreds of unrelated actors — credential overlap alone
cannot identify a campaign.
Two tests cover this (plus a corpus-shape sanity check and a bound-floor check on the bad clusterer):
1. `test_shared_wordlist_pipeline_passes_bounds` — runs the placeholder
identity clusterer against the fixture. Trivially green (each
campaign has one actor → identity puts each in its own cluster).
This is the ratchet point: when the real algorithm replaces the
placeholder, this test must continue to pass.
2. `test_credential_jaccard_clusterer_fails_homogeneity` — runs a
deliberately-bad clusterer that merges any two attackers whose
credential sets overlap above 50% Jaccard. Proves the fixture
actually catches what it's designed to catch: this clusterer DOES
merge the two campaigns, and the fixture's homogeneity floor (0.90)
is breached. If this test ever passes, our fixture or our metric
harness is broken.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from tests.clustering.fixture_harness import (
assert_fixture_bounds,
credential_jaccard_clusterer,
identity_clusterer,
)
from tests.clustering.metrics import score
from tests.factories.campaign_factory import generate, load_yaml
FIXTURE_DIR = Path(__file__).parent.parent / "fixtures" / "campaigns"
def test_shared_wordlist_pipeline_passes_bounds() -> None:
    """Run the identity clusterer over the shared_wordlist corpus through the bounds harness."""
    wordlist_spec = load_yaml(FIXTURE_DIR / "shared_wordlist.yaml")
    wordlist_corpus = generate(wordlist_spec, seed=0)
    expected_path = FIXTURE_DIR / "shared_wordlist.expected.yaml"
    assert_fixture_bounds(wordlist_corpus, identity_clusterer, expected_path)
def test_shared_wordlist_corpus_shape() -> None:
    """Sanity: 2 campaigns × 1 actor = 2 attackers, each trying the shared 8-entry wordlist."""
    corpus = generate(load_yaml(FIXTURE_DIR / "shared_wordlist.yaml"), seed=0)
    assert len(corpus.attackers) == 2
    campaign_ids = set(corpus.truth_labels().values())
    assert campaign_ids == {"shared-wordlist-A", "shared-wordlist-B"}
    for att in corpus.attackers:
        # Keep only the sessions that actually tried credentials.
        with_credentials = list(filter(lambda sess: sess.credentials_tried, att.sessions))
        assert with_credentials, f"attacker {att.attacker_id} has no credential sessions"
        # Each such session must carry the same 8-entry wordlist.
        for sess in with_credentials:
            assert len(sess.credentials_tried) == 8
def test_credential_jaccard_clusterer_fails_homogeneity() -> None:
    """
    The fixture's reason for being.

    A naive clusterer that merges on credential-set Jaccard ≥ 0.5 fuses
    the two campaigns (Jaccard = 1.0 on shared wordlists), and that fusion
    drives homogeneity to 0 — exactly the failure mode the fixture
    protects against.

    If the bad clusterer ever scores high on this fixture, the fixture
    has lost its discrimination power and needs to be re-examined.
    """
    corpus = generate(load_yaml(FIXTURE_DIR / "shared_wordlist.yaml"), seed=0)
    pred = credential_jaccard_clusterer(corpus, threshold=0.5)
    metrics = score(corpus.truth_labels(), pred)
    # This clusterer must have merged the two campaigns.
    cluster_ids = set(pred.values())
    assert len(cluster_ids) == 1, (
        "credential-Jaccard clusterer should merge both campaigns into one"
    )
    # Homogeneity must collapse — the signal a fixture-aware CI gate
    # would use to reject the bad clusterer.
    assert metrics["homogeneity"] == pytest.approx(0.0)
def test_naive_clusterer_does_not_fool_the_fixture() -> None:
    """
    Belt-and-braces check on the bound floors.

    The bad clusterer collapses homogeneity but may still pass *some*
    metrics (completeness is actually 1.0, since all members of each true
    campaign land in the single mega-cluster). The homogeneity floor
    (0.90) is the one that must reject it.
    """
    corpus = generate(load_yaml(FIXTURE_DIR / "shared_wordlist.yaml"), seed=0)
    pred = credential_jaccard_clusterer(corpus, threshold=0.5)
    metrics = score(corpus.truth_labels(), pred)

    # Metric floors the bad clusterer is checked against.
    floors = {
        "adjusted_rand_index": 0.85,
        "homogeneity": 0.90,
        "completeness": 0.80,
        "singleton_recall": 0.95,
    }
    breaches = []
    for metric_name, floor in floors.items():
        if metrics[metric_name] < floor:
            breaches.append(metric_name)
    assert "homogeneity" in breaches, (
        f"fixture failed to catch the bad clusterer; observed metrics: {metrics}"
    )

View File

@@ -0,0 +1,348 @@
"""Unit tests for the similarity-graph primitives.
Each edge function is tested in isolation: agreement → high score,
disagreement → zero, missing-data → zero. Combination logic +
thresholds live in the connected-components impl and are covered by
the fixture suite once those land.
"""
from __future__ import annotations
import pytest
from decnet.clustering.impl.similarity import (
EDGE_THRESHOLD,
Observation,
combined_edge_weight,
from_synthetic,
high_weight_edge,
low_weight_edge,
medium_weight_edge,
very_low_weight_edge,
)
def _obs(**kwargs) -> Observation:
    """Construct an Observation, defaulting ``observation_id`` to "obs-x" when not given."""
    # Caller-supplied keys win over the default because they are merged last.
    fields = {"observation_id": "obs-x", **kwargs}
    return Observation(**fields)
# ─── high_weight_edge ──────────────────────────────────────────────────────
def test_high_weight_ja3_match():
    """Identical JA3 on both observations yields a high-tier score of 1.0."""
    left, right = _obs(ja3="ja3-stable"), _obs(ja3="ja3-stable")
    assert high_weight_edge(left, right) == 1.0
def test_high_weight_hassh_match():
    """Identical HASSH on both observations yields a high-tier score of 1.0."""
    left, right = _obs(hassh="hassh-stable"), _obs(hassh="hassh-stable")
    assert high_weight_edge(left, right) == 1.0
def test_high_weight_payload_hash_overlap():
    """One payload hash in common ("pl-2") is enough for a score of 1.0."""
    left = _obs(payload_hashes=frozenset(["pl-1", "pl-2"]))
    right = _obs(payload_hashes=frozenset(["pl-2", "pl-3"]))
    assert high_weight_edge(left, right) == 1.0
def test_high_weight_c2_overlap():
    """One C2 endpoint in common is enough for a score of 1.0."""
    left = _obs(c2_endpoints=frozenset(["c2.example.com"]))
    right = _obs(c2_endpoints=frozenset(["c2.example.com", "c2-alt.example.com"]))
    assert high_weight_edge(left, right) == 1.0
def test_high_weight_no_match():
    """Different JA3, HASSH and payload hashes give a score of 0.0."""
    left = _obs(ja3="ja3-a", hassh="hassh-a", payload_hashes=frozenset(["x"]))
    right = _obs(ja3="ja3-b", hassh="hassh-b", payload_hashes=frozenset(["y"]))
    assert high_weight_edge(left, right) == 0.0
def test_high_weight_both_null_ja3_does_not_match():
    """Null fingerprints on both sides are not 'agreement'.

    Treating them as a match would fuse every un-fingerprinted noise
    scanner into one mega-cluster.
    """
    left = _obs(ja3=None, hassh=None)
    right = _obs(ja3=None, hassh=None)
    assert high_weight_edge(left, right) == 0.0
# ─── fingerprint-disagreement veto on payload / C2 ──────────────────────────
def test_high_weight_veto_on_fingerprint_disagreement_with_shared_c2():
    """Fixture 5 protection: distinct JA3 + HASSH with a shared C2
    endpoint must NOT score as an identity match."""
    shared_c2 = frozenset(["c2.shared.example"])
    left = _obs(ja3="ja3-A", hassh="hassh-A", c2_endpoints=shared_c2)
    right = _obs(ja3="ja3-B", hassh="hassh-B", c2_endpoints=shared_c2)
    assert high_weight_edge(left, right) == 0.0
def test_high_weight_veto_on_fingerprint_disagreement_with_shared_payload():
    """Distinct JA3 + HASSH with a shared payload hash is vetoed as well."""
    shared_payload = frozenset(["stage1"])
    left = _obs(ja3="ja3-A", hassh="hassh-A", payload_hashes=shared_payload)
    right = _obs(ja3="ja3-B", hassh="hassh-B", payload_hashes=shared_payload)
    assert high_weight_edge(left, right) == 0.0
def test_high_weight_no_veto_when_fingerprints_unknown():
    """Un-fingerprinted observations sharing a C2 endpoint still score 1.0 —
    there is no veto without evidence of disagreement."""
    shared_c2 = frozenset(["c2.shared.example"])
    left = _obs(c2_endpoints=shared_c2)
    right = _obs(c2_endpoints=shared_c2)
    assert high_weight_edge(left, right) == 1.0
def test_high_weight_no_veto_when_one_side_unknown():
    """Fingerprints present on one side only give no disagreement evidence,
    so a shared C2 endpoint still scores 1.0."""
    shared_c2 = frozenset(["c2.shared.example"])
    fingerprinted = _obs(ja3="ja3-A", hassh="hassh-A", c2_endpoints=shared_c2)
    unfingerprinted = _obs(c2_endpoints=shared_c2)
    assert high_weight_edge(fingerprinted, unfingerprinted) == 1.0
def test_high_weight_partial_fingerprint_agreement_no_veto():
    """JA3 agrees while HASSH disagrees → some agreement → no veto.

    The veto is reserved for FULL fingerprint disagreement, so this pair
    still scores 1.0.
    """
    shared_c2 = frozenset(["c2.shared.example"])
    left = _obs(ja3="ja3-shared", hassh="hassh-A", c2_endpoints=shared_c2)
    right = _obs(ja3="ja3-shared", hassh="hassh-B", c2_endpoints=shared_c2)
    assert high_weight_edge(left, right) == 1.0
def test_high_weight_partial_disagreement_one_slot_only_vetoes():
    """JA3 is the only comparable slot and it disagrees → veto.

    HASSH is null on both sides here, so it cannot be compared. The only
    available fingerprint evidence is disagreement, and the shared C2
    endpoint does not rescue the pair.
    """
    shared_c2 = frozenset(["c2.shared.example"])
    left = _obs(ja3="ja3-A", hassh=None, c2_endpoints=shared_c2)
    right = _obs(ja3="ja3-B", hassh=None, c2_endpoints=shared_c2)
    assert high_weight_edge(left, right) == 0.0
def test_high_weight_empty_sets_no_match():
    """Empty payload and C2 sets on both sides do not count as overlap."""
    nothing = frozenset()
    left = _obs(payload_hashes=nothing, c2_endpoints=nothing)
    right = _obs(payload_hashes=nothing, c2_endpoints=nothing)
    assert high_weight_edge(left, right) == 0.0
# ─── medium_weight_edge ────────────────────────────────────────────────────
def test_medium_weight_jaccard_full_match_in_one_phase():
    """Identical command tuples in the same phase score 1.0."""
    discovery_cmds = ("ls", "id", "uname -a")
    left = _obs(commands_by_phase={"discovery": discovery_cmds})
    right = _obs(commands_by_phase={"discovery": discovery_cmds})
    assert medium_weight_edge(left, right) == pytest.approx(1.0)
def test_medium_weight_jaccard_partial_match():
    """Two of four distinct commands shared within a phase scores 0.5."""
    left = _obs(commands_by_phase={"discovery": ("ls", "id", "uname -a", "whoami")})
    right = _obs(commands_by_phase={"discovery": ("ls", "id")})
    # |A∩B|=2, |A∪B|=4 → 0.5
    expected = 0.5
    assert medium_weight_edge(left, right) == pytest.approx(expected)
def test_medium_weight_picks_max_across_phases():
    """With several shared phases, the best-matching phase sets the score."""
    exploit_cmds = ("./payload", "chmod +x payload")
    left_phases = {
        "discovery": ("ls",),
        "exploitation": exploit_cmds,
    }
    right_phases = {
        "discovery": ("ps",),  # no overlap with "ls" → 0.0
        "exploitation": exploit_cmds,  # identical → 1.0
    }
    left = _obs(commands_by_phase=left_phases)
    right = _obs(commands_by_phase=right_phases)
    assert medium_weight_edge(left, right) == pytest.approx(1.0)
def test_medium_weight_no_shared_phase_returns_zero():
    """Observations with no phase in common score 0.0."""
    discovery_only = _obs(commands_by_phase={"discovery": ("ls",)})
    exploitation_only = _obs(commands_by_phase={"exploitation": ("./payload",)})
    assert medium_weight_edge(discovery_only, exploitation_only) == 0.0
def test_medium_weight_disjoint_commands_in_shared_phase():
    """A shared phase with no command in common scores 0.0."""
    left = _obs(commands_by_phase={"discovery": ("ls",)})
    right = _obs(commands_by_phase={"discovery": ("ps",)})
    # |A∩B|=0, |A∪B|=2
    assert medium_weight_edge(left, right) == 0.0
def test_medium_weight_empty_corpora_returns_zero():
    """Two default observations (no commands supplied) score 0.0."""
    left, right = _obs(), _obs()
    assert medium_weight_edge(left, right) == 0.0
# ─── low_weight_edge ───────────────────────────────────────────────────────
def test_low_weight_credential_jaccard_match():
    """Identical credential sets score 1.0."""
    wordlist = frozenset([("root", "toor"), ("admin", "admin")])
    left = _obs(credentials=wordlist)
    right = _obs(credentials=wordlist)
    assert low_weight_edge(left, right) == pytest.approx(1.0)
def test_low_weight_credential_partial_overlap():
    """One shared credential out of three distinct ones scores 1/3."""
    common = ("root", "toor")
    left = _obs(credentials=frozenset([common, ("admin", "admin")]))
    right = _obs(credentials=frozenset([common, ("user", "user")]))
    assert low_weight_edge(left, right) == pytest.approx(1 / 3)
def test_low_weight_no_credentials_returns_zero():
    """A side with no credentials supplied gives a score of 0.0."""
    without_credentials = _obs()
    with_credentials = _obs(credentials=frozenset([("root", "toor")]))
    assert low_weight_edge(without_credentials, with_credentials) == 0.0
# ─── very_low_weight_edge ──────────────────────────────────────────────────
def test_very_low_weight_asn_match():
    """Equal ASNs score 1.0 on the very-low tier."""
    left, right = _obs(asn=64500), _obs(asn=64500)
    assert very_low_weight_edge(left, right) == 1.0
def test_very_low_weight_asn_mismatch():
    """Different ASNs score 0.0 on the very-low tier."""
    left, right = _obs(asn=64500), _obs(asn=64501)
    assert very_low_weight_edge(left, right) == 0.0
def test_very_low_weight_asn_null_returns_zero():
    """A null ASN on one side scores 0.0."""
    unknown_asn = _obs(asn=None)
    known_asn = _obs(asn=64500)
    assert very_low_weight_edge(unknown_asn, known_asn) == 0.0
# ─── time-agnostic invariant ───────────────────────────────────────────────
def test_observations_carry_no_timestamps():
    """Structural guard: ``Observation`` declares no time-like fields.

    The dataclass field names are checked against a fixed list of
    timestamp-style names, so an edge function has no time attribute
    to read. Fixture 7 forbids recency-decay clustering.
    """
    declared = set(Observation.__dataclass_fields__)
    time_like = {"first_seen", "last_seen", "started_at", "session_midpoint", "timestamp"}
    no_overlap = declared.isdisjoint(time_like)
    assert no_overlap, (
        f"Observation grew time fields: {declared & time_like}. "
        "Fixture 7 (slow_burn) forbids recency-aware clustering."
    )
# ─── from_synthetic adapter ────────────────────────────────────────────────
# ─── combined_edge_weight tier discipline ─────────────────────────────────
def test_combined_high_alone_crosses_threshold():
    """A shared JA3 alone yields a combined weight at or above EDGE_THRESHOLD."""
    fingerprint = "ja3-shared"
    first = _obs(ja3=fingerprint)
    second = _obs(ja3=fingerprint)
    weight = combined_edge_weight(first, second)
    assert weight >= EDGE_THRESHOLD
def test_combined_medium_alone_below_threshold():
    """Matching commands alone must NOT cluster.

    The medium tier is a supporting signal, never a clustering driver
    on its own: the weight is positive but stays under EDGE_THRESHOLD.
    """
    first = _obs(commands_by_phase=dict(discovery=("ls", "id", "uname")))
    second = _obs(commands_by_phase=dict(discovery=("ls", "id", "uname")))
    assert 0 < combined_edge_weight(first, second) < EDGE_THRESHOLD
def test_combined_low_alone_below_threshold():
    """Credential-only overlap must NOT cluster — fixture 1's failure mode."""
    pairs = (("root", "toor"), ("admin", "admin"))
    first = _obs(credentials=frozenset(pairs))
    second = _obs(credentials=frozenset(pairs))
    assert 0 < combined_edge_weight(first, second) < EDGE_THRESHOLD
def test_combined_very_low_alone_below_threshold():
    """ASN-only overlap must NOT cluster — fixture 2's failure mode."""
    shared_asn = 64500
    first = _obs(asn=shared_asn)
    second = _obs(asn=shared_asn)
    assert 0 < combined_edge_weight(first, second) < EDGE_THRESHOLD
def test_combined_all_weak_tiers_still_below_threshold():
    """Medium, low and very-low signals stacked still stay under threshold.

    Only a high-tier signal is meant to reach EDGE_THRESHOLD.
    """

    def weak_only():
        # Fresh observation matching on ASN, credentials and commands,
        # with no ja3 or hassh passed.
        return _obs(
            asn=64500,
            credentials=frozenset({("root", "toor")}),
            commands_by_phase={"discovery": ("ls",)},
        )

    first = weak_only()
    second = weak_only()
    # Expected sum per the tier weights (defined outside this test —
    # confirm if they change):
    # 0.6*1.0 (medium) + 0.2*1.0 (low) + 0.05*1.0 (very_low) = 0.85
    assert combined_edge_weight(first, second) < EDGE_THRESHOLD
def test_combined_high_plus_medium_clusters():
    """A shared JA3 plus matching commands reaches EDGE_THRESHOLD."""
    first = _obs(ja3="ja3-x", commands_by_phase=dict(discovery=("ls",)))
    second = _obs(ja3="ja3-x", commands_by_phase=dict(discovery=("ls",)))
    weight = combined_edge_weight(first, second)
    assert weight >= EDGE_THRESHOLD
def test_combined_no_signal_returns_zero():
    """Two default observations produce a combined weight of exactly 0.0."""
    assert combined_edge_weight(_obs(), _obs()) == 0.0
def test_from_synthetic_round_trip():
    """``from_synthetic`` projects a SyntheticAttacker into an Observation.

    Builds one attacker holding a single DISCOVERY-phase session, then
    checks each projected Observation attribute against the value that
    was put in.
    """
    from datetime import datetime, timezone
    from tests.factories.campaign_factory import (
        SyntheticAttacker, SyntheticSession,
    )
    from decnet.clustering.ukc import UKCPhase

    stamp = datetime.now(timezone.utc)
    session = SyntheticSession(
        session_id="s1",
        attacker_id="a1",
        decky_id="d1",
        started_at=stamp,
        duration_s=10.0,
        phase=UKCPhase.DISCOVERY,
        commands=["ls", "id"],
        credentials_tried=[("root", "toor")],
        payload_hash="pl-1",
        c2_callback="c2.example.com",
        truth_campaign_id="c1",
        truth_actor_id="actor-1",
    )
    attacker = SyntheticAttacker(
        attacker_id="a1", ip="1.1.1.1", asn=64500,
        ja3="ja3-x", hassh="hassh-y",
        first_seen=stamp, last_seen=stamp,
        truth_campaign_id="c1", truth_actor_id="actor-1",
        sessions=[session],
    )

    projected = from_synthetic(attacker)

    # Checked in this order; the first mismatch fails the test.
    expectations = (
        ("observation_id", "a1"),
        ("ja3", "ja3-x"),
        ("hassh", "hassh-y"),
        ("asn", 64500),
        ("payload_hashes", frozenset({"pl-1"})),
        ("c2_endpoints", frozenset({"c2.example.com"})),
        ("credentials", frozenset({("root", "toor")})),
        ("commands_by_phase", {"discovery": ("ls", "id")}),
    )
    for attr, wanted in expectations:
        assert getattr(projected, attr) == wanted

View File

@@ -0,0 +1,128 @@
"""
End-to-end pipeline test for fixture 7 (slow_burn).
90-day APT campaign with three operational windows separated by
multi-week silences. Models the real operational tempo of an APT
working a deep nested topology (MazeNET-style): recon over weeks,
exploitation later, action-on-objectives later still. The unique
signal this fixture stresses is TIME-AGNOSTIC IDENTITY — a
clusterer that silently expires old edges fragments any campaign
that operates over months.
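Four tests cover this (the three below, plus
`test_recency_decay_clusterer_with_long_halflife_does_not_fragment`,
a sanity check that a 365-day half-life keeps the campaign in one
cluster):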
1. `test_slow_burn_corpus_shape` — sanity: 3 attackers, all share
campaign id and operator fingerprint, sessions land in their
respective operational windows.
2. `test_slow_burn_pipeline_passes_bounds` —
`composite_signals_clusterer` (fingerprint OR C2 — time-agnostic)
folds all three windows into one cluster.
3. `test_recency_decay_clusterer_fragments_campaign` — runs the
deliberately-bad `recency_decay_clusterer` with a 14-day half-
life and a 0.5 weight threshold. Edges between adjacent
operational windows (24+ days apart) decay below threshold and
drop. The campaign splits into three clusters; completeness
collapses; the bound floor rejects the bad clusterer.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from tests.clustering.fixture_harness import (
assert_fixture_bounds,
composite_signals_clusterer,
recency_decay_clusterer,
)
from tests.clustering.metrics import score
from tests.factories.campaign_factory import generate, load_yaml
# Fixture spec and its paired bounds file, resolved relative to this
# file: <two directories up>/fixtures/campaigns/.
FIXTURE_DIR = Path(__file__).parent.parent.joinpath("fixtures", "campaigns")
FIXTURE_YAML = FIXTURE_DIR.joinpath("slow_burn.yaml")
EXPECTED_YAML = FIXTURE_DIR.joinpath("slow_burn.expected.yaml")
def test_slow_burn_corpus_shape() -> None:
    """Sanity-check the generated slow_burn corpus.

    Three attackers, one campaign id, one JA3 and one HASSH across all
    rows, and every session's day-of-year inside its actor's window.
    """
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    rows = corpus.attackers
    assert len(rows) == 3

    campaign_ids = {row.truth_campaign_id for row in rows}
    assert campaign_ids == {"slow-burn-001"}

    # Operator fingerprint stays stable across all three windows.
    distinct_ja3 = {row.ja3 for row in rows}
    distinct_hassh = {row.hassh for row in rows}
    assert len(distinct_ja3) == 1
    assert len(distinct_hassh) == 1

    # Each row's sessions land in its operational window.
    # Epoch is 2026-01-01 (day-of-year 1). active_days [7-11] →
    # day-of-year [8-12]; [35-39] → [36-40]; [75-79] → [76-80].
    windows = {
        "ops-recon": {8, 9, 10, 11, 12},
        "ops-exploit": {36, 37, 38, 39, 40},
        "ops-action": {76, 77, 78, 79, 80},
    }
    by_actor = {row.truth_actor_id: row for row in rows}
    # Collect every actor's observed days before asserting any window.
    observed = {
        actor: {s.started_at.timetuple().tm_yday for s in by_actor[actor].sessions}
        for actor in windows
    }
    for actor, allowed in windows.items():
        assert observed[actor] <= allowed, observed[actor]
def test_slow_burn_pipeline_passes_bounds() -> None:
    """``composite_signals_clusterer`` must pass the slow_burn bounds.

    Beyond the bounds in EXPECTED_YAML, the prediction must use a single
    cluster label and the adjusted Rand index must be 1.0.
    """
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    metrics = assert_fixture_bounds(corpus, composite_signals_clusterer, EXPECTED_YAML)

    cluster_labels = set(composite_signals_clusterer(corpus).values())
    assert len(cluster_labels) == 1, (
        "composite_signals_clusterer should fold all three windows into one cluster"
    )
    assert metrics["adjusted_rand_index"] == pytest.approx(1.0)
def test_recency_decay_clusterer_fragments_campaign() -> None:
    """
    The fixture's reason for being. With a 14-day half-life and a 0.5
    threshold, the recency-decay reference is expected to drop the
    edges between operational windows that sit 24+ days apart, so the
    campaign fragments into three clusters and completeness collapses.

    If the bad clusterer ever satisfies the completeness floor, the
    fixture has lost its discrimination power.
    """
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    pred = recency_decay_clusterer(corpus, half_life_days=14.0, threshold=0.5)

    assert len(set(pred.values())) == 3, (
        f"recency-decay clusterer should split into 3 clusters, "
        f"got {len(set(pred.values()))}"
    )

    metrics = score(corpus.truth_labels(level="campaign"), pred)
    assert metrics["completeness"] == pytest.approx(0.0)

    # Hard-coded floors the bad clusterer is checked against.
    floors = {
        "adjusted_rand_index": 0.85,
        "homogeneity": 0.90,
        "completeness": 0.80,
        "singleton_recall": 0.95,
    }
    breached = {name for name, floor in floors.items() if metrics[name] < floor}
    assert "completeness" in breached, (
        f"fixture failed to catch the bad clusterer; observed metrics: {metrics}"
    )
def test_recency_decay_clusterer_with_long_halflife_does_not_fragment() -> None:
    """
    Sanity check for the recency-decay reference: with a half-life
    longer than the campaign, every edge should survive and the three
    windows union into one cluster. This shows the fragmentation seen
    at 14 days comes from the half-life parameter, not something
    unrelated. (Assuming exponential half-life decay, 40 days at a
    365-day half-life leaves ~0.93, well above the 0.5 threshold.)
    """
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    pred = recency_decay_clusterer(corpus, half_life_days=365.0, threshold=0.5)
    cluster_labels = set(pred.values())
    assert len(cluster_labels) == 1

View File

@@ -0,0 +1,126 @@
"""
End-to-end pipeline test for fixture 2 (vpn_hopping).
One campaign, one actor, ip_pool: rotating across 5 distinct ASNs.
JA3, HASSH, and payload_hash stable across every rotation. The
fixture is the canonical "same hands, different IP/ASN" scenario
that motivates Identity Resolution (see development/
IDENTITY_RESOLUTION.md — these are the signals "the attacker can't
cheaply rotate"). It also stresses the clusterer's weighting of
ASN: the real similarity graph weights ASN match "very low" because
VPN/proxy hopping shatters ASN within a single identity.
Besides the `test_vpn_hopping_corpus_shape` sanity check, three tests
cover this:
1. `test_vpn_hopping_pipeline_passes_bounds_at_campaign_level` —
`fingerprint_clusterer` reference folds all 5 rotated rows into
one cluster (shared JA3 + HASSH). Trivially green at campaign-
level scoring; the test is a ratchet point for the real algorithm
to keep passing once it lands.
2. `test_vpn_hopping_pipeline_passes_bounds_at_identity_level` —
same clusterer, scored against the identity-level oracle. Verifies
the factory's `truth_identity_id` plumbing across rotated rows
(commit f6b8375) actually expresses the right ground truth: 5
observations → 1 identity.
3. `test_asn_clusterer_fragments_campaign` — runs the deliberately-
bad `asn_clusterer` reference. The 5 rotation_asns become 5
singleton clusters → completeness collapses to ~0, ARI collapses,
and the fixture's bound floor on completeness (0.80) rejects the
bad clusterer. If this test ever passes, the fixture has lost its
discrimination power.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from tests.clustering.fixture_harness import (
asn_clusterer,
assert_fixture_bounds,
fingerprint_clusterer,
)
from tests.clustering.metrics import score
from tests.factories.campaign_factory import generate, load_yaml
# Fixture spec and its paired bounds file, resolved relative to this
# file: <two directories up>/fixtures/campaigns/.
FIXTURE_DIR = Path(__file__).parent.parent.joinpath("fixtures", "campaigns")
FIXTURE_YAML = FIXTURE_DIR.joinpath("vpn_hopping.yaml")
EXPECTED_YAML = FIXTURE_DIR.joinpath("vpn_hopping.expected.yaml")
def test_vpn_hopping_corpus_shape() -> None:
    """One actor, rotation_count=5 → 5 observation rows, 1 identity, 1 campaign."""
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    rows = corpus.attackers
    assert len(rows) == 5

    # Gather all three ground-truth label sets before asserting on any.
    campaign_ids = {row.truth_campaign_id for row in rows}
    identity_ids = {row.truth_identity_id for row in rows}
    actor_ids = {row.truth_actor_id for row in rows}
    assert campaign_ids == {"vpn-hopping-001"}
    assert len(identity_ids) == 1, "all 5 rotations must share one truth_identity_id"
    assert actor_ids == {"hopper-a"}

    # Every rotation has its own ASN and its own IP.
    distinct_asns = {row.asn for row in rows}
    assert distinct_asns == {64512, 64513, 64514, 64515, 64516}
    distinct_ips = {row.ip for row in rows}
    assert len(distinct_ips) == 5, "rotation must produce 5 distinct IPs"

    # Stable fingerprints across every row — the load-bearing signal.
    distinct_ja3 = {row.ja3 for row in rows}
    distinct_hassh = {row.hassh for row in rows}
    assert len(distinct_ja3) == 1
    assert len(distinct_hassh) == 1
def test_vpn_hopping_pipeline_passes_bounds_at_campaign_level() -> None:
    """``fingerprint_clusterer`` must satisfy the vpn_hopping bounds file
    (called with ``assert_fixture_bounds``' default truth level)."""
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    assert_fixture_bounds(corpus, fingerprint_clusterer, EXPECTED_YAML)
def test_vpn_hopping_pipeline_passes_bounds_at_identity_level() -> None:
    """``fingerprint_clusterer`` scored against identity-level truth.

    All 5 observations should land in one predicted cluster and share
    one truth identity, so ARI and completeness must both be 1.0.
    """
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    metrics = assert_fixture_bounds(
        corpus,
        fingerprint_clusterer,
        EXPECTED_YAML,
        truth_level="identity",
    )
    perfect = pytest.approx(1.0)
    assert metrics["adjusted_rand_index"] == perfect
    assert metrics["completeness"] == perfect
def test_asn_clusterer_fragments_campaign() -> None:
    """
    The fixture's reason for being. Grouping by ASN shatters the
    campaign into 5 singletons, so completeness goes to 0: the one
    true class is split across 5 predicted clusters. The completeness
    floor (0.80) must reject this.

    If asn_clusterer ever satisfies the completeness floor, the fixture
    has lost its discrimination power.
    """
    corpus = generate(load_yaml(FIXTURE_YAML), seed=0)
    pred = asn_clusterer(corpus)

    # 5 distinct ASNs in the rotation → 5 distinct predicted clusters.
    cluster_labels = set(pred.values())
    assert len(cluster_labels) == 5

    metrics = score(corpus.truth_labels(level="campaign"), pred)
    # Completeness collapses — the failure mode the fixture protects
    # against.
    assert metrics["completeness"] == pytest.approx(0.0)
    # ARI collapses too (very different partitions).
    assert metrics["adjusted_rand_index"] < 0.1

    # Hard-coded floors that would reject this clusterer.
    floors = {
        "adjusted_rand_index": 0.85,
        "homogeneity": 0.90,
        "completeness": 0.80,
        "singleton_recall": 0.95,
    }
    breached = {name for name, floor in floors.items() if metrics[name] < floor}
    assert "completeness" in breached, (
        f"fixture failed to catch the bad clusterer; observed metrics: {metrics}"
    )