merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
6
decnet/clustering/impl/__init__.py
Normal file
6
decnet/clustering/impl/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""Concrete clusterer implementations.
|
||||
|
||||
Each module here contains exactly one :class:`~decnet.clustering.base.Clusterer`
|
||||
subclass. New implementations register themselves in
|
||||
:func:`decnet.clustering.factory.get_clusterer`.
|
||||
"""
|
||||
379
decnet/clustering/impl/connected_components.py
Normal file
379
decnet/clustering/impl/connected_components.py
Normal file
@@ -0,0 +1,379 @@
|
||||
"""Connected-components identity clusterer (v1).
|
||||
|
||||
Builds a similarity graph over observations (per-IP attacker rows),
|
||||
runs union-find over edges that pass a confidence threshold, and writes
|
||||
one ``attacker_identities`` row per component.
|
||||
|
||||
**v1 signal coverage (this commit):**
|
||||
|
||||
* High-weight tier: JA3 / HASSH / payload-hash / C2-endpoint exact
|
||||
match (alone enough to cluster). The production tick currently sees
|
||||
JA3 + HASSH only — payload + C2 require log mining and join in
|
||||
later commits. The fixture tests exercise the full high-weight set
|
||||
through the in-memory path.
|
||||
|
||||
Subsequent commits add medium / low / very-low tier edges, phase-
|
||||
handoff edges, and revocable merges. Edges MUST stay time-agnostic
|
||||
— fixture 7 forbids recency-decay clustering.
|
||||
|
||||
**v1 behavior:**
|
||||
|
||||
The clusterer assigns identities to NULL observations, merges existing
|
||||
identities when a single predicted component spans them, and revokes
|
||||
prior merges when the predicted component splits a merged-out identity
|
||||
away from its winner. Observations stay FK'd to their original identity
|
||||
row throughout — merges are soft pointers via
|
||||
``attacker_identities.merged_into_uuid``, never observation re-points.
|
||||
That keeps the audit trail intact and lets cached subscribers resolve
|
||||
merged-out UUIDs through the chain.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import uuid as _uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Iterable, Optional
|
||||
|
||||
from decnet.clustering.base import Clusterer, ClusterResult
|
||||
from decnet.clustering.impl.similarity import (
|
||||
EDGE_THRESHOLD,
|
||||
Observation,
|
||||
combined_edge_weight,
|
||||
)
|
||||
from decnet.logging import get_logger
|
||||
from decnet.profiler.identity_rollup import extract_fp_summaries
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
|
||||
log = get_logger("clustering.connected_components")
|
||||
|
||||
|
||||
def cluster_observations(
    observations: Iterable[Observation],
) -> dict[str, str]:
    """Run connected-components over the similarity graph.

    Pure: no DB, no clock, no I/O. Every unordered pair of observations
    is scored with :func:`combined_edge_weight`; pairs scoring at least
    :data:`EDGE_THRESHOLD` are joined with union-find, and each
    observation is labelled after the component it ends up in.

    The label of a component is derived from its *smallest* member id,
    so the mapping depends only on the set of observations and the edge
    function — not on the order in which the observations are supplied.

    Singletons get their own per-observation cluster id so callers can
    distinguish "isolated observation" from "merged into nothing."

    :param observations: observations to cluster; consumed exactly once.
    :returns: ``{observation_id: cluster_id}``. Cluster ids are opaque
        strings — callers must not rely on their format.
    """
    from itertools import combinations

    obs_list = list(observations)
    parent: dict[str, str] = {o.observation_id: o.observation_id for o in obs_list}

    def find(x: str) -> str:
        # Path halving: point every visited node at its grandparent.
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(x: str, y: str) -> None:
        rx, ry = find(x), find(y)
        if rx == ry:
            return
        # Always keep the smaller id as the root. The root of a set is
        # therefore the minimum of its members, which makes the final
        # labels independent of input order.
        if ry < rx:
            rx, ry = ry, rx
        parent[ry] = rx

    # combinations() yields each unordered pair once, in the same order
    # as a nested "i < j" loop, without copying a list slice per row.
    for a, b in combinations(obs_list, 2):
        if combined_edge_weight(a, b) >= EDGE_THRESHOLD:
            union(a.observation_id, b.observation_id)

    return {o.observation_id: f"cc-{find(o.observation_id)}" for o in obs_list}
|
||||
|
||||
|
||||
def from_attacker_row(row: dict[str, Any]) -> Observation:
    """Project an ``Attacker`` row dict into an :class:`Observation`.

    Pulls JA3 / HASSH out of the row's ``fingerprints`` value, which may
    be a JSON-encoded array (``str`` / ``bytes``) or an already-decoded
    iterable of entries. Each usable entry is a dict with a ``kind`` of
    ``"ja3"`` or ``"hassh"`` and the fingerprint under ``hash`` (or,
    failing that, ``value``). When several entries share a kind, the
    last one in list order wins, because :class:`Observation` carries a
    single value per slot.

    Anything unusable is ignored rather than raised: undecodable JSON,
    JSON that is not an array (``null``, a number, an object), non-dict
    entries, and entries with an empty fingerprint.

    Payload hashes, C2 endpoints, credentials and commands are not
    populated here and keep the :class:`Observation` defaults.

    :param row: attacker row; must contain ``uuid``. ``fingerprints``
        and ``asn`` are optional.
    :returns: the projected observation.
    :raises KeyError: if ``row`` has no ``uuid`` key.
    """
    raw = row.get("fingerprints") or "[]"
    try:
        if isinstance(raw, (str, bytes, bytearray)):
            entries = json.loads(raw)
        else:
            entries = list(raw)
    except (TypeError, ValueError):
        entries = []
    if not isinstance(entries, list):
        # Valid JSON that is not an array would otherwise blow up (or be
        # mis-iterated) in the loop below, outside the try block.
        entries = []

    ja3: Optional[str] = None
    hassh: Optional[str] = None
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        kind = entry.get("kind")
        h = entry.get("hash") or entry.get("value")
        if not h:
            continue
        # Later entries overwrite earlier ones: last value per kind wins.
        if kind == "ja3":
            ja3 = h
        elif kind == "hassh":
            hassh = h

    return Observation(
        observation_id=row["uuid"],
        ja3=ja3,
        hassh=hassh,
        asn=row.get("asn"),
    )
|
||||
|
||||
|
||||
class ConnectedComponentsClusterer(Clusterer):
    """Connected-components clusterer over the similarity graph.

    See module docstring for v1 signal coverage and behavior notes.
    """

    name = "connected_components"

    async def tick(self, repo: BaseRepository) -> ClusterResult:
        """Run one clustering pass and persist the outcome through ``repo``.

        Reads the attacker rows and the identity rows, clusters the
        projected observations, then reconciles in two passes:

        * Pass 1, per predicted component: if no member carries an
          identity, create one and link every member to it. Otherwise
          pick the smallest effective identity uuid as winner, point
          every other effective identity at it via ``merged_into_uuid``,
          and link the still-unassigned members to the winner.
        * Pass 2, per identity whose snapshot row has ``merged_into_uuid``
          set: if none of its observations shares a predicted cluster
          with the winner's observations, clear the pointer.

        Repository failures never propagate from the guarded calls: a
        failed initial read returns an empty :class:`ClusterResult`; a
        failed write is logged and that step is skipped.

        :param repo: repository used for every read and write.
        :returns: a :class:`ClusterResult` whose ``identities_formed``,
            ``identities_merged``, ``observations_linked`` and
            ``identities_unmerged`` lists describe what this pass did.
        """
        try:
            rows = await repo.list_attackers_for_clustering()
        except Exception:  # noqa: BLE001
            log.exception("clusterer: failed to read attackers")
            return ClusterResult()

        if not rows:
            return ClusterResult()

        # Build the merge chain so a row's "effective" identity follows
        # merged_into_uuid up to the canonical winner. Pre-computing it
        # lets us reason about post-merge identity membership in one
        # place. ``identity_chain[u]`` is the canonical winner for
        # identity ``u`` (or ``u`` itself if not merged out).
        try:
            all_identities = await repo.list_all_identities()
        except Exception:  # noqa: BLE001
            log.exception("clusterer: failed to read identities")
            return ClusterResult()
        identity_chain = _build_merge_chain(all_identities)

        # Project + cluster.
        # NOTE(review): from_attacker_row is not guarded here, so a row
        # it cannot project would abort the tick — confirm rows always
        # carry a "uuid".
        observations: list[Observation] = []
        row_by_id: dict[str, dict[str, Any]] = {}
        for r in rows:
            obs = from_attacker_row(r)
            observations.append(obs)
            row_by_id[obs.observation_id] = r
        labels = cluster_observations(observations)

        # Group observations by predicted cluster.
        components: dict[str, list[str]] = {}
        for obs_id, cluster_id in labels.items():
            components.setdefault(cluster_id, []).append(obs_id)

        result = ClusterResult()
        # One timestamp for the whole pass, so every identity minted in
        # this tick carries the same created/seen times.
        now = datetime.now(timezone.utc)

        # Pass 1 — per-component reconciliation: form, link, merge.
        for member_ids in components.values():
            # Identities the member rows point at directly...
            literal_ids = {
                row_by_id[m]["identity_id"] for m in member_ids
                if row_by_id[m].get("identity_id")
            }
            # ...and the same set resolved through the merge chain.
            effective_ids = {identity_chain.get(i, i) for i in literal_ids}
            unassigned = [
                m for m in member_ids
                if not row_by_id[m].get("identity_id")
            ]

            if not effective_ids:
                # Fresh component — mint a new identity.
                identity_uuid = str(_uuid.uuid4())
                try:
                    await repo.create_attacker_identity({
                        "uuid": identity_uuid,
                        "schema_version": 1,
                        "first_seen_at": now,
                        "last_seen_at": now,
                        "created_at": now,
                        "updated_at": now,
                        "observation_count": len(member_ids),
                    })
                except Exception:  # noqa: BLE001
                    log.exception(
                        "clusterer: failed to create identity for component %s",
                        member_ids,
                    )
                    continue

                # Only observations whose link write succeeded are
                # reported as part of the new identity.
                linked: list[str] = []
                for obs_id in member_ids:
                    if await _link(repo, obs_id, identity_uuid):
                        linked.append(obs_id)
                if linked:
                    result.identities_formed.append({
                        "identity_uuid": identity_uuid,
                        "observation_uuids": linked,
                    })
                    # NOTE(review): indentation was lost in the copy this
                    # was reconstructed from; the roll-up is assumed to
                    # sit inside ``if linked`` — confirm against the
                    # repository.
                    await _roll_up_fingerprints(
                        repo, identity_uuid, [row_by_id[m] for m in member_ids],
                    )
                continue

            # Deterministic winner so two clusterer runs produce the
            # same merge direction. Sorting by uuid string is stable
            # and doesn't depend on row insertion order.
            winner_uuid = min(effective_ids)
            losers = effective_ids - {winner_uuid}

            for loser_uuid in losers:
                try:
                    await repo.update_identity_merged_into(loser_uuid, winner_uuid)
                except Exception:  # noqa: BLE001
                    log.exception(
                        "clusterer: failed to merge %s -> %s",
                        loser_uuid, winner_uuid,
                    )
                    continue
                # Keep the in-memory chain in step with the write so
                # later components and pass 2 see the new pointer.
                identity_chain[loser_uuid] = winner_uuid
                result.identities_merged.append({
                    "winner_uuid": winner_uuid,
                    "loser_uuid": loser_uuid,
                })

            # Link any unassigned observations in the component to the
            # winner so a subsequent tick sees a single-identity
            # component and skips this branch entirely.
            for obs_id in unassigned:
                if await _link(repo, obs_id, winner_uuid):
                    result.observations_linked.append({
                        "identity_uuid": winner_uuid,
                        "observation_uuid": obs_id,
                    })

            # Re-roll the winner's fingerprint summary across every
            # observation now in this component (including the loser
            # side — the merge unifies their evidence even though the
            # loser's identity row stays FK'd via merged_into_uuid).
            await _roll_up_fingerprints(
                repo, winner_uuid, [row_by_id[m] for m in member_ids],
            )

        # Pass 2 — revocable-merge undo. For each currently-merged-out
        # identity, check whether its observations still cluster with
        # the winner's. If not, the merge is contradicted by new
        # evidence — clear merged_into_uuid and emit identity.unmerged.
        # Observations FK'd to the resurrected loser stay where they
        # were; the chain just stops following.
        #
        # This index is built from the rows read at the top of the tick,
        # so links written during pass 1 are not reflected in it.
        observations_by_literal_identity: dict[str, list[str]] = {}
        for obs_id, r in row_by_id.items():
            iid = r.get("identity_id")
            if iid:
                observations_by_literal_identity.setdefault(iid, []).append(obs_id)

        # ``all_identities`` is likewise the pre-pass-1 snapshot: only
        # identities that were already merged out before this tick are
        # candidates for unmerging.
        for identity_row in all_identities:
            if not identity_row.get("merged_into_uuid"):
                continue
            loser_uuid = identity_row["uuid"]
            winner_uuid = identity_chain.get(loser_uuid, loser_uuid)
            if winner_uuid == loser_uuid:
                continue  # broken chain — paranoia
            loser_obs = observations_by_literal_identity.get(loser_uuid, [])
            winner_obs = observations_by_literal_identity.get(winner_uuid, [])
            if not loser_obs or not winner_obs:
                # No observations either side — can't disprove the merge.
                continue
            loser_clusters = {labels[o] for o in loser_obs}
            winner_clusters = {labels[o] for o in winner_obs}
            if loser_clusters & winner_clusters:
                continue  # still co-clustered with winner — merge stands
            try:
                await repo.update_identity_merged_into(loser_uuid, None)
            except Exception:  # noqa: BLE001
                log.exception(
                    "clusterer: failed to unmerge %s from %s",
                    loser_uuid, winner_uuid,
                )
                continue
            # The loser is canonical again from here on.
            identity_chain[loser_uuid] = loser_uuid
            result.identities_unmerged.append({
                "resurrected_uuid": loser_uuid,
                "former_winner_uuid": winner_uuid,
            })

        return result
|
||||
|
||||
|
||||
def _build_merge_chain(
|
||||
identities: list[dict[str, Any]],
|
||||
) -> dict[str, str]:
|
||||
"""Build a uuid → canonical-winner map from a list of identity rows.
|
||||
|
||||
Follows ``merged_into_uuid`` to a fixed point per identity, with a
|
||||
hop cap to defend against accidental cycles. The returned dict
|
||||
contains an entry for every identity uuid (mapping to itself if
|
||||
not merged out).
|
||||
"""
|
||||
_MAX_HOPS = 8
|
||||
by_uuid: dict[str, dict[str, Any]] = {i["uuid"]: i for i in identities}
|
||||
chain: dict[str, str] = {}
|
||||
for uuid_ in by_uuid:
|
||||
cur = uuid_
|
||||
for _ in range(_MAX_HOPS):
|
||||
row = by_uuid.get(cur)
|
||||
if row is None:
|
||||
break
|
||||
nxt = row.get("merged_into_uuid")
|
||||
if not nxt or nxt == cur:
|
||||
break
|
||||
cur = nxt
|
||||
chain[uuid_] = cur
|
||||
return chain
|
||||
|
||||
|
||||
async def _link(
    repo: BaseRepository, observation_uuid: str, identity_uuid: str,
) -> bool:
    """Point one observation at an identity; report whether it stuck.

    Calls ``repo.set_attacker_identity_id`` and converts any exception
    into a logged failure, so callers can branch on a plain boolean
    instead of repeating the same ``try`` block at every call site.

    :returns: ``True`` if the repository call completed, ``False`` if it
        raised (the exception is logged, not propagated).
    """
    try:
        await repo.set_attacker_identity_id(observation_uuid, identity_uuid)
    except Exception:  # noqa: BLE001
        log.exception(
            "clusterer: failed to link obs=%s -> identity=%s",
            observation_uuid, identity_uuid,
        )
        return False
    return True
|
||||
|
||||
|
||||
async def _roll_up_fingerprints(
    repo: BaseRepository,
    identity_uuid: str,
    member_rows: list[dict[str, Any]],
) -> None:
    """Project member observations' fingerprint blobs onto the identity's
    summary columns.

    Best-effort: a failure while summarising the rows *or* while writing
    the result is logged and swallowed, so it never breaks the clusterer
    tick — the columns just stay stale until the next pass.

    :param repo: repository used for the write.
    :param identity_uuid: identity whose summary columns are updated.
    :param member_rows: attacker rows whose fingerprints are summarised.
    """
    try:
        # Summarising sits inside the guard as well: one malformed blob
        # must not abort a tick that has already written merges/links.
        summaries = extract_fp_summaries(member_rows)
        await repo.update_identity_fingerprints(identity_uuid, **summaries)
    except Exception:  # noqa: BLE001
        log.exception(
            "clusterer: failed to roll up fingerprints for identity=%s",
            identity_uuid,
        )
|
||||
|
||||
|
||||
# Public names of this module (what ``from ... import *`` exposes).
__all__ = [
    "ConnectedComponentsClusterer",
    "cluster_observations",
    "from_attacker_row",
]
|
||||
313
decnet/clustering/impl/similarity.py
Normal file
313
decnet/clustering/impl/similarity.py
Normal file
@@ -0,0 +1,313 @@
|
||||
"""Similarity-graph primitives for the connected-components clusterer.
|
||||
|
||||
Each function takes two :class:`Observation` projections and returns a
|
||||
similarity score in ``[0.0, 1.0]``. The connected-components impl
|
||||
(landing in subsequent commits) decides how to combine these into a
|
||||
single edge weight, applies a threshold, and runs union-find.
|
||||
|
||||
**Time-agnostic.** Edges MUST NOT depend on observation timestamps.
|
||||
Fixture 7 (``slow_burn``) proves recency-decay clustering fragments
|
||||
multi-month APT campaigns; the production graph cannot silently expire
|
||||
old edges. Timestamps are still useful for *audit* (the ``first_seen``
|
||||
on the resulting identity row) but never for *similarity*.
|
||||
|
||||
**Weight tiers** (from `development/IDENTITY_RESOLUTION.md`):
|
||||
|
||||
* High — JA3 / HASSH / payload-hash / C2-callback exact match. Stable
|
||||
signals an attacker can't cheaply rotate. A single high-tier match
|
||||
supports identity strongly.
|
||||
* Medium — command-sequence Jaccard, bucketed by UKC phase. Tooling
|
||||
habits leak through command order; phase-bucketing avoids comparing
|
||||
a Discovery cmd-list to an Exploitation one.
|
||||
* Low — credential-attempt-set Jaccard. Defeated alone by fixture 1
|
||||
(``shared_wordlist``) where two campaigns share rockyou but diverge
|
||||
on infra.
|
||||
* Very low — ASN match. Defeated alone by fixture 2 (``vpn_hopping``)
|
||||
where one identity rotates across many ASNs.
|
||||
|
||||
The functions are pure (no DB, no I/O); the worker maps observations
|
||||
into :class:`Observation` once per tick and feeds these into the
|
||||
graph builder.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Mapping, Optional
|
||||
|
||||
# ─── Observation projection ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Observation:
    """Minimal projection of a per-IP attacker observation.

    Built once per ``Attacker`` row by the worker (or per
    ``SyntheticAttacker`` in tests via :func:`from_synthetic`).
    Keeping the projection tight isolates the graph code from schema
    drift on either side.

    Instances are immutable and hashable. All set-typed fields are
    :class:`frozenset` so they hash and so callers don't accidentally
    mutate them mid-pass. ``commands_by_phase`` is a mapping (a plain
    ``dict`` by default), which cannot be hashed, so it takes part in
    equality but is left out of the hash.
    """

    observation_id: str
    """Stable ID — for production, the ``Attacker.uuid``; for tests,
    the ``SyntheticAttacker.attacker_id``."""

    ja3: Optional[str] = None
    hassh: Optional[str] = None
    asn: Optional[int] = None

    payload_hashes: frozenset[str] = field(default_factory=frozenset)
    c2_endpoints: frozenset[str] = field(default_factory=frozenset)
    credentials: frozenset[tuple[str, str]] = field(default_factory=frozenset)

    # hash=False: without it the generated __hash__ would try to hash the
    # dict and raise TypeError. Equal observations still hash equal,
    # because the hash uses a subset of the compared fields.
    commands_by_phase: Mapping[str, tuple[str, ...]] = field(
        default_factory=dict, hash=False,
    )
    """``UKCPhase.value`` → ordered command sequence observed in that
    phase. Empty dict when no command-bearing sessions were seen."""
|
||||
|
||||
|
||||
# ─── Edge functions ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _fingerprints_fully_disagree(a: Observation, b: Observation) -> bool:
|
||||
"""True iff every comparable fingerprint slot disagrees.
|
||||
|
||||
"Comparable" = both sides have a non-null value for that slot.
|
||||
Used as a soft-veto on shared C2 / payload signals: when two
|
||||
observations have distinct stable TLS + SSH stacks, sharing a C2
|
||||
endpoint is a *campaign*-level signal (cooperating operators,
|
||||
distinct identities) — not an identity-level one. Fixture 5
|
||||
(``multi_operator``) is the canonical demonstration.
|
||||
|
||||
Returns ``False`` when no fingerprint slot is comparable (any-null
|
||||
cases) — without evidence of disagreement we don't veto. Also
|
||||
``False`` when at least one slot agrees.
|
||||
"""
|
||||
ja3_comparable = a.ja3 is not None and b.ja3 is not None
|
||||
hassh_comparable = a.hassh is not None and b.hassh is not None
|
||||
if not (ja3_comparable or hassh_comparable):
|
||||
return False
|
||||
if ja3_comparable and a.ja3 == b.ja3:
|
||||
return False
|
||||
if hassh_comparable and a.hassh == b.hassh:
|
||||
return False
|
||||
if ja3_comparable and hassh_comparable:
|
||||
return a.ja3 != b.ja3 and a.hassh != b.hassh
|
||||
return True # exactly one slot is comparable, and it disagrees
|
||||
|
||||
|
||||
def high_weight_edge(a: Observation, b: Observation) -> float:
    """Score the high tier: exact JA3 / HASSH / payload-hash / C2 match.

    Evaluation order:

    1. A shared non-null JA3 or a shared non-null HASSH scores ``1.0``
       immediately.
    2. Otherwise, if :func:`_fingerprints_fully_disagree` reports that
       every comparable fingerprint slot differs, the score is ``0.0``
       — shared payloads or C2 endpoints are not consulted at all.
    3. Otherwise, any payload hash or any C2 endpoint present on both
       sides scores ``1.0``.

    :returns: ``1.0`` or ``0.0``; there is no partial credit in this tier.
    """
    same_tooling = (
        (a.ja3 is not None and a.ja3 == b.ja3)
        or (a.hassh is not None and a.hassh == b.hassh)
    )
    if same_tooling:
        return 1.0

    if _fingerprints_fully_disagree(a, b):
        # Stable-tool disagreement vetoes shared-infra signals.
        return 0.0

    infra_slots = (
        (a.payload_hashes, b.payload_hashes),
        (a.c2_endpoints, b.c2_endpoints),
    )
    for mine, theirs in infra_slots:
        if mine and theirs and (mine & theirs):
            return 1.0
    return 0.0
|
||||
|
||||
|
||||
def medium_weight_edge(a: Observation, b: Observation) -> float:
    """Score the medium tier: best per-phase command-set Jaccard.

    Only phases present in both observations' ``commands_by_phase`` are
    compared. Within a phase each command sequence is collapsed to a
    set (order and repetition are ignored) and scored as
    ``|intersection| / |union|``. The result is the highest score over
    all shared phases, so one strongly matching phase is not diluted by
    another where the two sides diverge.

    :returns: a value in ``[0.0, 1.0]``; ``0.0`` when there is no shared
        phase or every shared phase is empty on both sides.
    """
    common_phases = set(a.commands_by_phase).intersection(b.commands_by_phase)
    scores = []
    for phase in common_phases:
        left = set(a.commands_by_phase[phase])
        right = set(b.commands_by_phase[phase])
        either = left | right
        if either:  # skip phases with no commands on either side
            scores.append(len(left & right) / len(either))
    return max(scores, default=0.0)
|
||||
|
||||
|
||||
def low_weight_edge(a: Observation, b: Observation) -> float:
    """Score the low tier: Jaccard of attempted credential pairs.

    Compares the two ``credentials`` sets of ``(username, password)``
    tuples as ``|intersection| / |union|``.

    :returns: a value in ``[0.0, 1.0]``; ``0.0`` when either side
        attempted no credentials.
    """
    mine, theirs = a.credentials, b.credentials
    if not (mine and theirs):
        return 0.0
    # Both sides are non-empty here, so the union cannot be empty.
    return len(mine & theirs) / len(mine | theirs)
|
||||
|
||||
|
||||
def very_low_weight_edge(a: Observation, b: Observation) -> float:
    """Score the very-low tier: ASN equality.

    :returns: ``1.0`` when both observations have a non-null ``asn`` and
        the two values are equal; ``0.0`` in every other case, including
        when either ASN is unknown.
    """
    both_known = a.asn is not None and b.asn is not None
    return 1.0 if both_known and a.asn == b.asn else 0.0
|
||||
|
||||
|
||||
# ─── Combined weight ────────────────────────────────────────────────────────
|
||||
|
||||
#: Tier multipliers applied to the per-tier edge scores when combining
#: into a single weight. Each per-tier score is at most 1.0, so with
#: these values:
#:
#: * High-tier agreement alone (1.0) reaches the 1.0 threshold.
#: * Medium-tier alone yields at most 0.6 — below threshold.
#: * Low-tier alone yields at most 0.2 — defeats fixture 1's
#:   credential-overlap-only failure mode.
#: * Very-low alone yields at most 0.05 — defeats fixture 2's
#:   ASN-rotation failure mode.
#: * Medium + low + very-low together top out at 0.85, still below
#:   threshold.
#:
#: The ratio between tiers matters more than the absolute values: a
#: tier should never combine its way past threshold without help from
#: a stronger one.
TIER_WEIGHTS = {
    "high": 1.0,
    "medium": 0.6,
    "low": 0.2,
    "very_low": 0.05,
}

#: Threshold a combined edge weight must meet to survive into the
#: similarity graph. The connected-components impl drops anything
#: under this before running union-find.
EDGE_THRESHOLD = 1.0
|
||||
|
||||
|
||||
def combined_edge_weight(a: Observation, b: Observation) -> float:
    """Combine the four tier scores into one edge weight.

    Each tier function is evaluated once for the pair, multiplied by
    its entry in :data:`TIER_WEIGHTS`, and the four products are added
    in high → medium → low → very-low order. A tier whose inputs are
    missing contributes ``0.0``, because its function returns ``0.0``
    in that case.

    No timestamp is read anywhere in the computation, so the weight is
    time-agnostic. The connected-components clusterer compares the
    result against :data:`EDGE_THRESHOLD` to decide whether to draw an
    edge.

    :returns: the weighted sum of the four tier scores.
    """
    high = TIER_WEIGHTS["high"] * high_weight_edge(a, b)
    medium = TIER_WEIGHTS["medium"] * medium_weight_edge(a, b)
    low = TIER_WEIGHTS["low"] * low_weight_edge(a, b)
    very_low = TIER_WEIGHTS["very_low"] * very_low_weight_edge(a, b)
    # Plain left-to-right addition (not sum()) keeps float rounding fixed.
    return high + medium + low + very_low
|
||||
|
||||
|
||||
# ─── Adapter for the synthetic-corpus tests ─────────────────────────────────
|
||||
|
||||
|
||||
def from_synthetic(att) -> Observation:  # type: ignore[no-untyped-def]
    """Adapt a ``SyntheticAttacker`` into an :class:`Observation`.

    Walks ``att.sessions`` once and accumulates, across all sessions:
    the distinct payload hashes, the distinct C2 callbacks, the distinct
    credential pairs (each coerced to a tuple), and — keyed by
    ``session.phase.value`` — the commands in the order the sessions
    list them. ``attacker_id``, ``ja3``, ``hassh`` and ``asn`` are
    copied straight from ``att``.

    :returns: an observation whose set-valued fields are frozensets and
        whose per-phase command lists are tuples.
    """
    seen_payloads: set[str] = set()
    seen_c2: set[str] = set()
    seen_creds: set[tuple[str, str]] = set()
    phase_commands: dict[str, list[str]] = {}

    for session in att.sessions:
        if session.payload_hash:
            seen_payloads.add(session.payload_hash)
        if session.c2_callback:
            seen_c2.add(session.c2_callback)
        seen_creds.update(tuple(pair) for pair in session.credentials_tried)
        if session.commands:
            bucket = phase_commands.setdefault(session.phase.value, [])
            bucket.extend(session.commands)

    frozen_commands = {
        phase: tuple(cmds) for phase, cmds in phase_commands.items()
    }
    return Observation(
        observation_id=att.attacker_id,
        ja3=att.ja3,
        hassh=att.hassh,
        asn=att.asn,
        payload_hashes=frozenset(seen_payloads),
        c2_endpoints=frozenset(seen_c2),
        credentials=frozenset(seen_creds),
        commands_by_phase=frozen_commands,
    )
|
||||
|
||||
|
||||
# Public names of this module (what ``from ... import *`` exposes).
# ``_fingerprints_fully_disagree`` is deliberately not listed.
__all__ = [
    "Observation",
    "high_weight_edge",
    "medium_weight_edge",
    "low_weight_edge",
    "very_low_weight_edge",
    "combined_edge_weight",
    "from_synthetic",
    "EDGE_THRESHOLD",
    "TIER_WEIGHTS",
]
|
||||
Reference in New Issue
Block a user