feat(clustering): revocable merges (merge + unmerge)

Reworks the clusterer's tick to handle multi-identity components and
re-evaluate prior merges. Two passes per tick:

Pass 1 — per-component reconciliation:
  * Fresh component → mint identity (commit 4 path).
  * Single-identity component → link unassigned observations.
  * Multi-identity component → soft-merge: pick the smallest-uuid
    winner deterministically, set merged_into_uuid on each loser,
    link unassigned observations to the winner. Observations stay
    FK'd to their original identity row — the merge is a soft
    pointer, not a re-point. Audit trail preserved; cached
    subscribers resolve through the chain.

Pass 2 — revocable-merge undo:
  * For each merged-out identity, check whether its observations
    still cluster with its winner's. If not, the merge is
    contradicted by new evidence — clear merged_into_uuid and emit
    identities_unmerged. The resurrected identity keeps its original
    uuid, so subscribers that cached it during the merged interval
    re-attach without a new lookup.

A pre-built merge-chain dict feeds Pass 1 so the effective-identity
lookup is O(1) per observation. Chain resolution has a hop cap as a
safeguard against accidental cycles in the underlying state.

Repo additions on BaseRepository + SQLModelRepository:
  * list_all_identities() — includes merged-out rows.
  * update_identity_merged_into(uuid, winner_or_None) — single
    setter for both merge and unmerge.
DummyRepo coverage stub updated.

Tests:
  * Two distinct identities, once bridged by a new observation, are
    merged with the smaller uuid as winner.
  * A pre-seeded soft-merge whose underlying observations diverge
    gets revoked; resurrected uuid emerges with merged_into_uuid
    cleared.
  * Tick is idempotent under no state changes.
This commit is contained in:
2026-04-26 08:33:32 -04:00
parent 87412da1ca
commit e364ef8859
5 changed files with 343 additions and 50 deletions

View File

@@ -206,6 +206,151 @@ async def test_tick_keeps_distinct_ja3_separate(repo):
assert len(formed["observation_uuids"]) == 1
@pytest.mark.anyio
async def test_tick_merges_two_identities_when_component_spans_them(repo):
    """Two pre-existing identities whose observations now cluster
    together (e.g. a previously-missing fingerprint shows up) get
    soft-merged: the smaller-uuid identity wins, the loser's
    merged_into_uuid is set, observations stay FK'd to their
    original identity row."""
    import json as _json
    from datetime import datetime, timezone

    # Tick 1: two distinct fingerprints → two distinct identities.
    a = await _seed_attacker(repo, "1.1.1.1", ja3="ja3-A")
    b = await _seed_attacker(repo, "2.2.2.2", ja3="ja3-B")
    c = ConnectedComponentsClusterer()
    first = await c.tick(repo)
    assert len(first.identities_formed) == 2

    # Snapshot the two identity uuids; we'll need them after the merge.
    identities_after_first = await repo.list_all_identities()
    assert len(identities_after_first) == 2
    uuids = sorted(i["uuid"] for i in identities_after_first)
    expected_winner, expected_loser = uuids[0], uuids[1]

    # Snapshot which identity each observation is FK'd to *before* the
    # merge, so the "soft pointer, not a re-point" claim can be checked
    # by comparison afterwards rather than by a loose membership test.
    a_identity_before = (await repo.get_attacker_by_uuid(a))["identity_id"]
    b_identity_before = (await repo.get_attacker_by_uuid(b))["identity_id"]

    # Tick 2 setup: bridge the two components.
    #   * A third attacker shares ja3-A with attacker A and additionally
    #     carries hassh-bridge.
    #   * Attacker B's row is re-upserted so it keeps ja3-B and also
    #     carries hassh-bridge.
    # The third attacker therefore links to A via JA3 and to B via HASSH,
    # which puts both pre-existing identities in one component.
    await _seed_attacker(repo, "3.3.3.3", ja3="ja3-A", hassh="hassh-bridge")
    now = datetime.now(timezone.utc)
    await repo.upsert_attacker({
        "ip": "2.2.2.2", "first_seen": now, "last_seen": now,
        "event_count": 1,
        "fingerprints": _json.dumps([
            {"kind": "ja3", "hash": "ja3-B"},
            {"kind": "hassh", "hash": "hassh-bridge"},
        ]),
    })

    second = await c.tick(repo)
    assert len(second.identities_merged) == 1
    merge = second.identities_merged[0]
    assert merge["winner_uuid"] == expected_winner
    assert merge["loser_uuid"] == expected_loser

    # The loser's row still exists with merged_into_uuid set.
    all_after = {i["uuid"]: i for i in await repo.list_all_identities()}
    assert all_after[expected_loser]["merged_into_uuid"] == expected_winner
    assert all_after[expected_winner]["merged_into_uuid"] is None

    # Observations stay FK'd to their original identity row — the
    # merge is a soft pointer, NOT a re-point. A re-point would move
    # the loser's observation onto the winner, so require that each
    # row's identity_id is unchanged and that the two rows still
    # reference the two distinct identities.
    a_row = await repo.get_attacker_by_uuid(a)
    b_row = await repo.get_attacker_by_uuid(b)
    assert a_row["identity_id"] == a_identity_before
    assert b_row["identity_id"] == b_identity_before
    assert {a_row["identity_id"], b_row["identity_id"]} == {
        expected_winner,
        expected_loser,
    }
@pytest.mark.anyio
async def test_tick_unmerges_when_observations_diverge(repo):
    """Start from a hand-built soft-merged pair, then alter one
    observation's fingerprints so the pair stops clustering. The next
    tick is expected to clear merged_into_uuid on the merged-out
    identity and report it in identities_unmerged."""
    import json as _json
    from datetime import datetime, timezone

    stamp = datetime.now(timezone.utc)

    # Both attackers share a JA3, so the first tick forms one identity.
    obs_a = await _seed_attacker(repo, "1.1.1.1", ja3="ja3-shared")
    obs_b = await _seed_attacker(repo, "2.2.2.2", ja3="ja3-shared")
    clusterer = ConnectedComponentsClusterer()
    formed_tick = await clusterer.tick(repo)
    assert len(formed_tick.identities_formed) == 1
    minted_uuid = formed_tick.identities_formed[0]["identity_uuid"]

    # Build the soft-merge state by hand through the repo: create a
    # second identity row, move observation b onto it, then point one
    # identity at the other. This stands in for a state the clusterer
    # would reach over several ticks (form, then merge).
    handmade_uuid = "00000000-0000-0000-0000-00000000bbbb"
    identity_row = {
        "uuid": handmade_uuid,
        "schema_version": 1,
        "first_seen_at": stamp, "last_seen_at": stamp,
        "created_at": stamp, "updated_at": stamp,
        "observation_count": 1,
    }
    await repo.create_attacker_identity(identity_row)
    await repo.set_attacker_identity_id(obs_b, handmade_uuid)

    # The merge direction follows the min-uuid rule: smaller uuid wins.
    winner, loser = sorted((minted_uuid, handmade_uuid))
    if minted_uuid == loser:
        # The minted identity sorted higher, so it is the loser here.
        # Re-point the observations so a sits on the winner and b on
        # the loser, keeping the layout consistent with that rule.
        await repo.set_attacker_identity_id(obs_a, winner)
        await repo.set_attacker_identity_id(obs_b, loser)
    await repo.update_identity_merged_into(loser, winner)

    # Confirm the soft-merge pointer is in place before diverging.
    before = {}
    for row in await repo.list_all_identities():
        before[row["uuid"]] = row
    assert before[loser]["merged_into_uuid"] == winner

    # Give b a different JA3 so a and b no longer share a fingerprint.
    diverged_fingerprints = _json.dumps([{"kind": "ja3", "hash": "ja3-different"}])
    await repo.upsert_attacker({
        "ip": "2.2.2.2", "first_seen": stamp, "last_seen": stamp,
        "event_count": 1,
        "fingerprints": diverged_fingerprints,
    })

    # This tick should notice the divergence and revoke the merge.
    revoke_tick = await clusterer.tick(repo)
    assert len(revoke_tick.identities_unmerged) == 1
    event = revoke_tick.identities_unmerged[0]
    assert event["resurrected_uuid"] == loser
    assert event["former_winner_uuid"] == winner

    after = {}
    for row in await repo.list_all_identities():
        after[row["uuid"]] = row
    assert after[loser]["merged_into_uuid"] is None
    assert after[winner]["merged_into_uuid"] is None
@pytest.mark.anyio
async def test_tick_is_idempotent_under_no_changes(repo):
    """A second tick run back-to-back with the first, with nothing
    changed in between, must report no formed, linked, merged or
    unmerged entries."""
    seeds = (
        ("1.1.1.1", "ja3-x"),
        ("2.2.2.2", "ja3-x"),
        ("3.3.3.3", "ja3-y"),
    )
    for ip, ja3_hash in seeds:
        await _seed_attacker(repo, ip, ja3=ja3_hash)

    clusterer = ConnectedComponentsClusterer()
    initial = await clusterer.tick(repo)
    repeat = await clusterer.tick(repo)

    # The repeat tick reports nothing in any of its result lists.
    assert repeat.identities_formed == []
    assert repeat.observations_linked == []
    assert repeat.identities_merged == []
    assert repeat.identities_unmerged == []

    # Sanity: the initial tick did form at least one identity, so the
    # empty repeat is not simply the clusterer doing nothing at all.
    assert initial.identities_formed
@pytest.mark.anyio
async def test_tick_links_new_observation_to_existing_identity(repo):
"""First tick: 2 attackers cluster into one identity. Second tick: