feat(clustering): UKC phase enum + synthetic campaign factory + metric harness
Pre-implementation scaffolding for campaign clustering. The simulator is
the spec — algorithm code follows once fixtures + metrics are stable.
* decnet/clustering/ukc.py — UKCPhase enum (19 phases across In/Through/Out
stages), OBSERVABLE_PHASES set, stage_of() helper. Vocabulary aligns
with future MITRE ATT&CK tagging so synthetic data and runtime phase
inference don't need renaming when TTP-tagging lands.
* tests/factories/campaign_factory.py — YAML DSL parser + deterministic
generator emitting truth-labeled SyntheticAttacker / SyntheticSession
records. Validates phase names, warns on unobservable phases, supports
multi-campaign + noise corpora.
* tests/clustering/metrics.py — pure-Python ARI / homogeneity /
completeness / singleton_recall (no sklearn dep). Decided before any
algorithm exists, on purpose.
* tests/fixtures/campaigns/lone_wolf.{yaml,expected.yaml} — fixture 3
from the design doc; simplest of the six, exercises the full pipeline
with an identity-clusterer placeholder.
* development/CAMPAIGN_CLUSTERING.md — design spec for the feature.
* development/DEVELOPMENT_V2.md — note on DSL evolution path
(concurrent phases, multi-actor per phase) deferred post-v1.
This commit is contained in:
112
tests/clustering/test_campaign_factory.py
Normal file
112
tests/clustering/test_campaign_factory.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Determinism + DSL-validation tests for the synthetic campaign factory."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from decnet.clustering.ukc import UKCPhase
|
||||
from tests.factories.campaign_factory import (
|
||||
DSLValidationError,
|
||||
generate,
|
||||
)
|
||||
|
||||
|
||||
def _minimal_spec() -> dict:
|
||||
return {
|
||||
"campaign": {
|
||||
"id": "c-test",
|
||||
"actors": [{"id": "a-1", "asn": 64512}],
|
||||
"phases": [{"name": "delivery", "actor": "a-1"}],
|
||||
"duration_days": 1,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def test_generation_is_deterministic_given_seed() -> None:
|
||||
spec = _minimal_spec()
|
||||
a = generate(spec, seed=42)
|
||||
b = generate(spec, seed=42)
|
||||
# IDs are RNG-driven — same seed must produce identical IDs, not
|
||||
# merely identical structure. Otherwise federation gossip and
|
||||
# fixture diffing both break.
|
||||
assert [att.attacker_id for att in a.attackers] == [
|
||||
att.attacker_id for att in b.attackers
|
||||
]
|
||||
assert [s.session_id for s in a.sessions] == [s.session_id for s in b.sessions]
|
||||
|
||||
|
||||
def test_different_seeds_produce_different_ids() -> None:
|
||||
spec = _minimal_spec()
|
||||
a = generate(spec, seed=1)
|
||||
b = generate(spec, seed=2)
|
||||
assert a.attackers[0].attacker_id != b.attackers[0].attacker_id
|
||||
|
||||
|
||||
def test_truth_labels_match_dsl() -> None:
|
||||
spec = _minimal_spec()
|
||||
corpus = generate(spec, seed=0)
|
||||
assert corpus.attackers[0].truth_campaign_id == "c-test"
|
||||
assert corpus.attackers[0].truth_actor_id == "a-1"
|
||||
# truth_labels() returns the dict the metric harness consumes.
|
||||
labels = corpus.truth_labels()
|
||||
assert labels[corpus.attackers[0].attacker_id] == "c-test"
|
||||
|
||||
|
||||
def test_unobservable_phase_emits_no_events() -> None:
|
||||
spec = _minimal_spec()
|
||||
spec["campaign"]["phases"] = [
|
||||
{"name": "reconnaissance", "actor": "a-1"}, # pre-target, unobservable
|
||||
{"name": "delivery", "actor": "a-1"},
|
||||
]
|
||||
corpus = generate(spec, seed=0)
|
||||
# Only the delivery phase should produce sessions.
|
||||
assert all(s.phase == UKCPhase.DELIVERY for s in corpus.sessions)
|
||||
assert len(corpus.sessions) == 1
|
||||
|
||||
|
||||
def test_unknown_phase_name_raises() -> None:
|
||||
spec = _minimal_spec()
|
||||
spec["campaign"]["phases"] = [{"name": "make_coffee", "actor": "a-1"}]
|
||||
with pytest.raises(DSLValidationError, match="unknown UKC phase"):
|
||||
generate(spec, seed=0)
|
||||
|
||||
|
||||
def test_phase_referencing_unknown_actor_raises() -> None:
|
||||
spec = _minimal_spec()
|
||||
spec["campaign"]["phases"] = [{"name": "delivery", "actor": "ghost"}]
|
||||
with pytest.raises(DSLValidationError, match="unknown actor"):
|
||||
generate(spec, seed=0)
|
||||
|
||||
|
||||
def test_noise_scanners_are_truth_singletons() -> None:
|
||||
spec = {
|
||||
"corpus": {
|
||||
"campaigns": [_minimal_spec()],
|
||||
"noise": {"scanner_count": 5},
|
||||
}
|
||||
}
|
||||
corpus = generate(spec, seed=0)
|
||||
# 1 campaign actor + 5 noise scanners = 6 distinct truth campaigns.
|
||||
truth_campaigns = {a.truth_campaign_id for a in corpus.attackers}
|
||||
assert len(truth_campaigns) == 6
|
||||
|
||||
|
||||
def test_multi_actor_campaign_shares_campaign_id() -> None:
|
||||
spec = {
|
||||
"campaign": {
|
||||
"id": "c-shared",
|
||||
"actors": [
|
||||
{"id": "a-1", "asn": 14061},
|
||||
{"id": "a-2", "asn": 14061},
|
||||
],
|
||||
"phases": [
|
||||
{"name": "delivery", "actor": "a-1"},
|
||||
{"name": "discovery", "actor": "a-2"},
|
||||
],
|
||||
"duration_days": 1,
|
||||
}
|
||||
}
|
||||
corpus = generate(spec, seed=0)
|
||||
truth = corpus.truth_labels()
|
||||
# Both attacker rows must point to the SAME truth_campaign_id —
|
||||
# this is the property fixture 5 (multi_operator) hinges on.
|
||||
assert set(truth.values()) == {"c-shared"}
|
||||
Reference in New Issue
Block a user