Pre-implementation scaffolding for campaign clustering. The simulator is
the spec — algorithm code follows once fixtures + metrics are stable.
* decnet/clustering/ukc.py — UKCPhase enum (19 phases across In/Through/Out
stages), OBSERVABLE_PHASES set, stage_of() helper. Vocabulary aligns
with future MITRE ATT&CK tagging so synthetic data and runtime phase
inference don't need renaming when TTP-tagging lands.
* tests/factories/campaign_factory.py — YAML DSL parser + deterministic
generator emitting truth-labeled SyntheticAttacker / SyntheticSession
records. Validates phase names, warns on unobservable phases, supports
multi-campaign + noise corpora.
* tests/clustering/metrics.py — pure-Python ARI / homogeneity /
completeness / singleton_recall (no sklearn dep). The metrics were
deliberately fixed before any algorithm exists.
* tests/fixtures/campaigns/lone_wolf.{yaml,expected.yaml} — fixture 3
from the design doc; simplest of the six, exercises the full pipeline
with an identity-clusterer placeholder.
* development/CAMPAIGN_CLUSTERING.md — design spec for the feature.
* development/DEVELOPMENT_V2.md — note on DSL evolution path
(concurrent phases, multi-actor per phase) deferred post-v1.
113 lines
3.6 KiB
Python
113 lines
3.6 KiB
Python
"""Determinism + DSL-validation tests for the synthetic campaign factory."""

from __future__ import annotations

import pytest

from decnet.clustering.ukc import UKCPhase
from tests.factories.campaign_factory import (
    DSLValidationError,
    generate,
)
|
|
|
|
|
|
def _minimal_spec() -> dict:
    """Return the smallest campaign spec used as a baseline by these tests.

    The spec describes one campaign (``c-test``) with a single actor
    (``a-1``), a single ``delivery`` phase and a one-day duration.  A new
    dict is built on every call, so a test may mutate the result (for
    example by replacing ``phases``) without affecting other tests.
    """
    return {
        "campaign": {
            "id": "c-test",
            "actors": [{"id": "a-1", "asn": 64512}],
            "phases": [{"name": "delivery", "actor": "a-1"}],
            "duration_days": 1,
        }
    }
|
|
|
|
|
|
def test_generation_is_deterministic_given_seed() -> None:
    """Generating twice from the same spec and seed yields identical IDs."""
    spec = _minimal_spec()
    a = generate(spec, seed=42)
    b = generate(spec, seed=42)
    # IDs are RNG-driven — same seed must produce identical IDs, not
    # merely identical structure. Otherwise federation gossip and
    # fixture diffing both break.
    assert [att.attacker_id for att in a.attackers] == [
        att.attacker_id for att in b.attackers
    ]
    assert [s.session_id for s in a.sessions] == [s.session_id for s in b.sessions]
|
|
|
|
|
|
def test_different_seeds_produce_different_ids() -> None:
    """The first attacker's ID differs when only the seed changes."""
    spec = _minimal_spec()
    a = generate(spec, seed=1)
    b = generate(spec, seed=2)
    assert a.attackers[0].attacker_id != b.attackers[0].attacker_id
|
|
|
|
|
|
def test_truth_labels_match_dsl() -> None:
    """Truth campaign/actor labels on the attacker mirror the spec's IDs."""
    spec = _minimal_spec()
    corpus = generate(spec, seed=0)
    assert corpus.attackers[0].truth_campaign_id == "c-test"
    assert corpus.attackers[0].truth_actor_id == "a-1"
    # truth_labels() returns the dict the metric harness consumes.
    labels = corpus.truth_labels()
    assert labels[corpus.attackers[0].attacker_id] == "c-test"
|
|
|
|
|
|
def test_unobservable_phase_emits_no_events() -> None:
    """A reconnaissance phase adds no sessions; only delivery produces one."""
    spec = _minimal_spec()
    spec["campaign"]["phases"] = [
        {"name": "reconnaissance", "actor": "a-1"},  # pre-target, unobservable
        {"name": "delivery", "actor": "a-1"},
    ]
    corpus = generate(spec, seed=0)
    # Only the delivery phase should produce sessions.  The count is checked
    # first because all() over an empty sequence is vacuously true, so the
    # count is the assertion that reports an empty corpus.
    assert len(corpus.sessions) == 1
    assert all(s.phase == UKCPhase.DELIVERY for s in corpus.sessions)
|
|
|
|
|
|
def test_unknown_phase_name_raises() -> None:
    """A phase name outside the UKC vocabulary raises DSLValidationError."""
    spec = _minimal_spec()
    spec["campaign"]["phases"] = [{"name": "make_coffee", "actor": "a-1"}]
    # match= is a regex search against the exception message.
    with pytest.raises(DSLValidationError, match="unknown UKC phase"):
        generate(spec, seed=0)
|
|
|
|
|
|
def test_phase_referencing_unknown_actor_raises() -> None:
    """A phase naming an actor not in ``actors`` raises DSLValidationError."""
    spec = _minimal_spec()
    spec["campaign"]["phases"] = [{"name": "delivery", "actor": "ghost"}]
    # match= is a regex search against the exception message.
    with pytest.raises(DSLValidationError, match="unknown actor"):
        generate(spec, seed=0)
|
|
|
|
|
|
def test_noise_scanners_are_truth_singletons() -> None:
    """Each noise scanner gets its own truth campaign, distinct from c-test."""
    spec = {
        "corpus": {
            "campaigns": [_minimal_spec()],
            "noise": {"scanner_count": 5},
        }
    }
    corpus = generate(spec, seed=0)
    # 1 campaign actor + 5 noise scanners = 6 distinct truth campaigns.
    truth_campaigns = {a.truth_campaign_id for a in corpus.attackers}
    assert len(truth_campaigns) == 6
|
|
|
|
|
|
def test_multi_actor_campaign_shares_campaign_id() -> None:
    """All attackers generated for a two-actor campaign share one truth label."""
    spec = {
        "campaign": {
            "id": "c-shared",
            "actors": [
                {"id": "a-1", "asn": 14061},
                {"id": "a-2", "asn": 14061},
            ],
            "phases": [
                {"name": "delivery", "actor": "a-1"},
                {"name": "discovery", "actor": "a-2"},
            ],
            "duration_days": 1,
        }
    }
    corpus = generate(spec, seed=0)
    truth = corpus.truth_labels()
    # Both attacker rows must point to the SAME truth_campaign_id —
    # this is the property fixture 5 (multi_operator) hinges on.
    assert set(truth.values()) == {"c-shared"}
|