diff --git a/tests/clustering/fixture_harness.py b/tests/clustering/fixture_harness.py index 64e860aa..a2a95df7 100644 --- a/tests/clustering/fixture_harness.py +++ b/tests/clustering/fixture_harness.py @@ -6,18 +6,36 @@ Each fixture lives at `tests/fixtures/campaigns/.yaml` with paired fixture test file down to "load corpus → predict → assert bounds" without copy-pasting the bound-walk loop or reference clusterers across files. -Two reference clusterers are provided: +Reference clusterers are provided as the algorithm under test in each +fixture's bound assertions; their names describe the *signal* they +cluster on, not the quality of the result. * `identity_clusterer` — every attacker is its own cluster. Trivially passes any fixture whose ground truth is all singletons (lone_wolf, shared_wordlist before merge, etc). Useful as a green baseline while the real connected-components algorithm is under construction. +* `fingerprint_clusterer` — groups attackers by ``(ja3, hassh)``. + Approximates the "stable signals an attacker can't cheaply rotate" + arm of the planned similarity graph (see IDENTITY_RESOLUTION.md + Premise). Folds rotated-IP observations of one actor into one + cluster when the actor's JA3 + HASSH stay stable. Attackers whose + fingerprints are both NULL (typical of un-fingerprinted noise + scanners) are treated as un-mergeable — each becomes its own + singleton — so this clusterer doesn't trivially fuse all noise + into one mega-cluster. + * `credential_jaccard_clusterer` — deliberately-bad reference that merges any two attackers whose credential-attempt sets overlap above a threshold. Exists so fixtures like `shared_wordlist` can prove they fail a clusterer that relies on credential overlap alone — the whole point of fixture #1. + +* `asn_clusterer` — deliberately-bad reference that groups attackers + by source ASN. 
Exists so fixtures like `vpn_hopping` (fixture #2) + can prove they fail a clusterer that treats ASN match as a + high-weight signal — VPN/proxy hopping shatters ASN within a single + identity and a clusterer that leans on it tanks completeness. """ from __future__ import annotations @@ -77,6 +95,28 @@ def identity_clusterer(corpus: GeneratedCorpus) -> dict[str, str]: return {a.attacker_id: f"cluster-{a.attacker_id}" for a in corpus.attackers} +def fingerprint_clusterer(corpus: GeneratedCorpus) -> dict[str, str]: + """Group by ``(ja3, hassh)``. Un-fingerprinted rows stay singleton. + + Approximates the stable-signal arm of the planned similarity graph; + the real algorithm in `decnet/clustering/` will extend this with + payload simhashes, C2 callback overlap, and phase-handoff edges. + """ + pred: dict[str, str] = {} + for att in corpus.attackers: + if att.ja3 is None and att.hassh is None: + # No fingerprint to share — un-mergeable, own cluster. + pred[att.attacker_id] = f"fp-singleton-{att.attacker_id}" + else: + pred[att.attacker_id] = f"fp::{att.ja3}::{att.hassh}" + return pred + + +def asn_clusterer(corpus: GeneratedCorpus) -> dict[str, str]: + """Group by source ASN. Deliberately-bad — see fixture 2.""" + return {a.attacker_id: f"asn-{a.asn}" for a in corpus.attackers} + + def credential_jaccard_clusterer( corpus: GeneratedCorpus, *, threshold: float = 0.5 ) -> dict[str, str]: diff --git a/tests/clustering/test_vpn_hopping_fixture.py b/tests/clustering/test_vpn_hopping_fixture.py new file mode 100644 index 00000000..e4168540 --- /dev/null +++ b/tests/clustering/test_vpn_hopping_fixture.py @@ -0,0 +1,126 @@ +""" +End-to-end pipeline test for fixture 2 (vpn_hopping). + +One campaign, one actor, ip_pool: rotating across 5 distinct ASNs. +JA3, HASSH, and payload_hash stable across every rotation. 
The +fixture is the canonical "same hands, different IP/ASN" scenario +that motivates Identity Resolution (see development/ +IDENTITY_RESOLUTION.md — these are the signals "the attacker can't +cheaply rotate"). It also stresses the clusterer's weighting of +ASN: the real similarity graph weights ASN match "very low" because +VPN/proxy hopping shatters ASN within a single identity. + +Three tests cover this, alongside a corpus-shape sanity check: + +1. `test_vpn_hopping_pipeline_passes_bounds_at_campaign_level` — + `fingerprint_clusterer` reference folds all 5 rotated rows into + one cluster (shared JA3 + HASSH). Trivially green at campaign- + level scoring; the test is a ratchet point for the real algorithm + to keep passing once it lands. + +2. `test_vpn_hopping_pipeline_passes_bounds_at_identity_level` — + same clusterer, scored against the identity-level oracle. Verifies + the factory's `truth_identity_id` plumbing across rotated rows + (commit f6b8375) actually expresses the right ground truth: 5 + observations → 1 identity. + +3. `test_asn_clusterer_fragments_campaign` — runs the deliberately- + bad `asn_clusterer` reference. The 5 rotation_asns become 5 + singleton clusters → completeness collapses to ~0, ARI collapses, + and the fixture's bound floor on completeness (0.80) rejects the + bad clusterer. If this test ever fails, the fixture has lost its + discrimination power. 
+""" +from __future__ import annotations + +from pathlib import Path + +import pytest + +from tests.clustering.fixture_harness import ( + asn_clusterer, + assert_fixture_bounds, + fingerprint_clusterer, +) +from tests.clustering.metrics import score +from tests.factories.campaign_factory import generate, load_yaml + +FIXTURE_DIR = Path(__file__).parent.parent / "fixtures" / "campaigns" +FIXTURE_YAML = FIXTURE_DIR / "vpn_hopping.yaml" +EXPECTED_YAML = FIXTURE_DIR / "vpn_hopping.expected.yaml" + + +def test_vpn_hopping_corpus_shape() -> None: + """One actor, rotation_count=5 → 5 observation rows, 1 identity, 1 campaign.""" + spec = load_yaml(FIXTURE_YAML) + corpus = generate(spec, seed=0) + assert len(corpus.attackers) == 5 + truth_campaigns = {a.truth_campaign_id for a in corpus.attackers} + truth_identities = {a.truth_identity_id for a in corpus.attackers} + truth_actors = {a.truth_actor_id for a in corpus.attackers} + assert truth_campaigns == {"vpn-hopping-001"} + assert len(truth_identities) == 1, "all 5 rotations must share one truth_identity_id" + assert truth_actors == {"hopper-a"} + asns = {a.asn for a in corpus.attackers} + assert asns == {64512, 64513, 64514, 64515, 64516} + ips = {a.ip for a in corpus.attackers} + assert len(ips) == 5, "rotation must produce 5 distinct IPs" + # Stable fingerprints across every row — the load-bearing signal. 
+ ja3s = {a.ja3 for a in corpus.attackers} + hasshs = {a.hassh for a in corpus.attackers} + assert len(ja3s) == 1 + assert len(hasshs) == 1 + + +def test_vpn_hopping_pipeline_passes_bounds_at_campaign_level() -> None: + spec = load_yaml(FIXTURE_YAML) + corpus = generate(spec, seed=0) + assert_fixture_bounds(corpus, fingerprint_clusterer, EXPECTED_YAML) + + +def test_vpn_hopping_pipeline_passes_bounds_at_identity_level() -> None: + spec = load_yaml(FIXTURE_YAML) + corpus = generate(spec, seed=0) + metrics = assert_fixture_bounds( + corpus, fingerprint_clusterer, EXPECTED_YAML, truth_level="identity" + ) + # All 5 observations should land in the same predicted cluster + # AND share one truth identity → ARI is exactly 1.0. + assert metrics["adjusted_rand_index"] == pytest.approx(1.0) + assert metrics["completeness"] == pytest.approx(1.0) + + +def test_asn_clusterer_fragments_campaign() -> None: + """ + The fixture's reason for being. Group by ASN and the campaign + shatters into 5 singletons — completeness goes to 0 because the + one true class is split across 5 predicted clusters. The bound + floor on completeness (0.80) must reject this. + + If this test ever passes (asn_clusterer satisfies the bounds), + the fixture has lost its discrimination power. + """ + spec = load_yaml(FIXTURE_YAML) + corpus = generate(spec, seed=0) + pred = asn_clusterer(corpus) + # 5 distinct ASNs in the rotation → 5 distinct predicted clusters. + assert len(set(pred.values())) == 5 + + metrics = score(corpus.truth_labels(level="campaign"), pred) + # Completeness collapses — that's the failure mode the fixture + # protects against. + assert metrics["completeness"] == pytest.approx(0.0) + # ARI collapses too (very different partitions). + assert metrics["adjusted_rand_index"] < 0.1 + + # The bound floor would reject this clusterer. 
+ bounds = { + "adjusted_rand_index": 0.85, + "homogeneity": 0.90, + "completeness": 0.80, + "singleton_recall": 0.95, + } + breaches = [k for k, floor in bounds.items() if metrics[k] < floor] + assert "completeness" in breaches, ( + f"fixture failed to catch the bad clusterer; observed metrics: {metrics}" + ) diff --git a/tests/fixtures/campaigns/vpn_hopping.expected.yaml b/tests/fixtures/campaigns/vpn_hopping.expected.yaml new file mode 100644 index 00000000..e9b7fb6c --- /dev/null +++ b/tests/fixtures/campaigns/vpn_hopping.expected.yaml @@ -0,0 +1,25 @@ +# Bounds for fixture 2 (vpn_hopping). +# +# Ground truth at campaign-level: 1 campaign of 5 observation rows. +# Ground truth at identity-level: 1 identity of 5 observation rows. +# A correct algorithm scores 1.0 across every metric on this fixture. +# +# Completeness is the load-bearing metric: a clusterer that fragments +# the campaign by IP/ASN tanks completeness (the one true class is +# split across many predicted clusters). The adversarial asn_clusterer +# in the test file demonstrates this and the bound below rejects it. +# +# No true singletons in this fixture — singleton_recall is trivially +# 1.0 (the metric returns 1.0 when truth has no singletons). +# +# Bounds are loose at v1; tighten as the algorithm matures. Loosening +# any bound to make CI pass requires PR-comment justification (per +# CAMPAIGN_CLUSTERING.md §2). +adjusted_rand_index: + min: 0.85 +homogeneity: + min: 0.90 +completeness: + min: 0.80 +singleton_recall: + min: 0.95 diff --git a/tests/fixtures/campaigns/vpn_hopping.yaml b/tests/fixtures/campaigns/vpn_hopping.yaml new file mode 100644 index 00000000..802c071b --- /dev/null +++ b/tests/fixtures/campaigns/vpn_hopping.yaml @@ -0,0 +1,55 @@ +# Fixture 2 (vpn_hopping) — see development/CAMPAIGN_CLUSTERING.md §2 +# and development/IDENTITY_RESOLUTION.md. +# +# One campaign, one actor, rotating across 5 distinct ASNs. 
JA3, HASSH, +# and payload_hash are STABLE across every rotation — these are the +# signals "the attacker can't cheaply rotate" (per the identity +# resolution design doc) and they're the reason a clusterer should +# recover all 5 observation rows as ONE identity, ONE campaign. +# +# Ground truth (verified at every level): +# - 5 observations → 1 identity → 1 campaign (per truth_labels()) +# +# Pass condition: a fingerprint-driven clusterer must fold all 5 rows +# into one cluster at both campaign-level and identity-level scoring. +# +# Adversarial condition: an asn_clusterer (group attackers by ASN — +# the textbook bad heuristic) must fragment the campaign into 5 +# pieces and breach the completeness floor. This is what proves "ASN +# match" is correctly weighted "very low" in the planned similarity +# graph (per TODO clusterer feature list). +# +# ASN choice: synthetic private-use values (RFC 6996 64512–65534) so +# the fixture never collides with real-world data and signals "not +# real" to readers at a glance. +campaign: + id: vpn-hopping-001 + actors: + - id: hopper-a + asn: 64512 # primary; rotation_asns overrides per row + ip_pool: rotating + rotation_count: 5 + rotation_asns: [64512, 64513, 64514, 64515, 64516] + ja3: "771,4865-4866-4867-49195-49199-49196-49200,0-23-65281-10-11-35-16-5-13-18-51-45-43-27,29-23-24,0" + hassh: "vpn-hopper-cccccccc-cccccccc-cccccccc" + hours_active_utc: [12, 13, 14, 15, 16] + jitter_seconds: 60 + phases: + - name: delivery + actor: hopper-a + target_selector: { service: ssh, count: 5 } + dwell_seconds: 1 + - name: exploitation + actor: hopper-a + tool_signature: + # Stable payload across every rotation — same dropper from + # whatever staging the operator uses, regardless of which VPN + # exit they emerge from. 
+ payload_hash: "vpn-hopper-stage1-payload" + target_selector: { service: ssh, count: 5 } + dwell_seconds: 5 + - name: discovery + actor: hopper-a + target_selector: { service: ssh, count: 5 } + dwell_seconds: 5 + duration_days: 2