test(clustering): fixture 4 paused_campaign + active_days/time_window
Adds the actor.active_days primitive to the campaign factory so a DSL actor can be bound to specific day indexes. Falls back to the non-paused day pool when absent (existing fixtures unchanged). Intersects with pause_windows so the campaign-wide silence still wins if both are set. Adds time_window_clusterer reference to fixture_harness — union-find over attackers, edge if their session time-ranges are within gap_days of each other. Deliberately-bad reference for fixture 4: multi-day silent stretches fragment a single campaign because the clusterer has no signal that bridges the gap. Fixture 4 (paused_campaign): one campaign modeled as two DSL actors representing the operator's two operational windows (active days 1-2 and 6-7), separated by a silent stretch (days 3-5). Both share JA3 + HASSH + payload + C2 callback; only their active_days differ. Five tests: corpus shape (rows in their windows, shared signals), pipeline pass via fingerprint_clusterer at level=campaign, adversarial fragmentation via time_window_clusterer (1-day union threshold cannot bridge the 4-day silence → completeness collapses), huge-gap sanity (gap_days=10 unions both halves), silent-stretch invariant (no session leaks into the configured pause window). Identity-level scoring is fixture 2's job; this fixture is campaign-level only — modeling caveat documented in the YAML.
This commit is contained in:
@@ -36,6 +36,12 @@ cluster on, not the quality of the result.
|
||||
can prove they fail a clusterer that treats ASN match as a
|
||||
high-weight signal — VPN/proxy hopping shatters ASN within a single
|
||||
identity and a clusterer that leans on it tanks completeness.
|
||||
|
||||
* `time_window_clusterer` — deliberately-bad reference that unions
|
||||
attackers whose session time-ranges are within ``gap_days`` of each
|
||||
other. Exists so fixtures like `paused_campaign` (fixture #4) can
|
||||
prove they fail a clusterer that treats short-window time proximity
|
||||
as a primary signal — operators pause, sleep, take weekends.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -117,6 +123,65 @@ def asn_clusterer(corpus: GeneratedCorpus) -> dict[str, str]:
|
||||
return {a.attacker_id: f"asn-{a.asn}" for a in corpus.attackers}
|
||||
|
||||
|
||||
def time_window_clusterer(
    corpus: GeneratedCorpus, *, gap_days: float = 1.0
) -> dict[str, str]:
    """Cluster attackers by temporal proximity of their activity spans.

    Each attacker with at least one session gets an activity span running
    from its earliest ``started_at`` to its latest session end
    (``started_at`` plus ``duration_s`` seconds). Two attackers are linked
    when their spans overlap or the idle time between them is at most
    ``gap_days``; clusters are the connected components of those links,
    computed with union-find.

    This is a deliberately-bad reference for fixture 4 (paused_campaign):
    a campaign that goes quiet for several days is split into a
    "before pause" and an "after pause" cluster, which breaches
    completeness. A real algorithm must treat short-window time proximity
    as a weak hint only — operators pause, sleep, switch shifts and take
    weekends.

    Args:
        corpus: Object exposing ``attackers``; each attacker provides
            ``attacker_id`` and ``sessions``, and each session provides
            ``started_at`` and ``duration_s``.
        gap_days: Largest idle time, in days, that still links two spans.
            The bound is inclusive.

    Returns:
        Mapping of every attacker id to the id of its cluster's
        representative attacker. Attackers with no sessions map to
        themselves (singleton clusters).
    """
    from datetime import timedelta

    max_gap = timedelta(days=gap_days)
    no_gap = timedelta(0)
    attacker_ids = [att.attacker_id for att in corpus.attackers]

    # Activity span (first start, last end) for attackers that have sessions.
    spans: dict[str, tuple] = {}
    for att in corpus.attackers:
        if att.sessions:
            first_start = min([s.started_at for s in att.sessions])
            last_end = max(
                [
                    s.started_at + timedelta(seconds=s.duration_s)
                    for s in att.sessions
                ]
            )
            spans[att.attacker_id] = (first_start, last_end)

    # Union-find forest: every attacker starts as its own representative.
    root_of: dict[str, str] = dict(zip(attacker_ids, attacker_ids))

    def _root(node: str) -> str:
        # Walk up to the representative, halving the path along the way.
        while True:
            up = root_of[node]
            if up == node:
                return node
            root_of[node] = root_of[up]
            node = root_of[node]

    def _idle_between(first: tuple, second: tuple):
        # Idle time separating two spans; zero when they touch or overlap.
        first_start, first_end = first
        second_start, second_end = second
        if first_end < second_start:
            return second_start - first_end
        if second_end < first_start:
            return first_start - second_end
        return no_gap

    # Compare every unordered pair once, in insertion order of the spans.
    ordered = list(spans)
    for next_pos, left in enumerate(ordered, start=1):
        left_span = spans[left]
        for right in ordered[next_pos:]:
            if _idle_between(left_span, spans[right]) > max_gap:
                continue
            left_root, right_root = _root(left), _root(right)
            if left_root != right_root:
                # The left representative is always attached under the right.
                root_of[left_root] = right_root

    return {aid: _root(aid) for aid in attacker_ids}
|
||||
|
||||
|
||||
def credential_jaccard_clusterer(
|
||||
corpus: GeneratedCorpus, *, threshold: float = 0.5
|
||||
) -> dict[str, str]:
|
||||
|
||||
Reference in New Issue
Block a user