feat(clustering): roll session digraph SimHashes into identity centroid

The identity clusterer folds an identity's per-session
motor.digraph_simhash observations into one 8-byte bitwise-majority
centroid (denoises per-session jitter) and writes it to
AttackerIdentity.kd_digraph_simhash via update_identity_fingerprints —
the orphaned column is now populated. list_identities_for_clustering
projects it so the campaign clusterer can read it.

Also extends the abstract repository interface and the DummyRepo stub (plus its coverage test) to accept the new kd_digraph_simhash argument.
This commit is contained in:
2026-06-16 17:05:34 -04:00
parent 66c73ce59d
commit 869d1eabb7
7 changed files with 150 additions and 10 deletions

View File

@@ -0,0 +1,47 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""``_digraph_centroid`` — bitwise-majority rollup of session SimHashes."""
from __future__ import annotations
from decnet.clustering.impl.connected_components import _digraph_centroid
from decnet.util.simhash import from_bytes8, to_bytes8
_ALL_ONES = (1 << 64) - 1
class _FakeRepo:
    """Repository stand-in that serves canned digraph observations.

    The constructor takes SimHashes as integers and stores each one as the
    hex string of its ``to_bytes8`` encoding in ``_values``. Tests may
    overwrite ``_values`` directly to inject arbitrary raw strings.
    """

    def __init__(self, hash_ints: list[int]) -> None:
        encoded = []
        for hash_int in hash_ints:
            encoded.append(to_bytes8(hash_int).hex())
        self._values = encoded

    async def observations_for_identity_primitive(self, identity_uuid, primitive):
        """Return one ``{"value": <hex>}`` row per stored value.

        ``identity_uuid`` is ignored: the same rows are returned for any
        identity. Only the digraph SimHash primitive may be requested.
        """
        assert primitive == "motor.digraph_simhash"
        observations = []
        for hex_value in self._values:
            observations.append({"value": hex_value})
        return observations
async def test_no_observations_returns_none() -> None:
    """An identity with no digraph observations has no centroid."""
    empty_repo = _FakeRepo([])
    centroid = await _digraph_centroid(empty_repo, "id")
    assert centroid is None
async def test_single_session_centroid_is_that_hash() -> None:
    """With a single session, the centroid equals that session's SimHash."""
    only_hash = 0xDEADBEEFCAFEF00D
    centroid = await _digraph_centroid(_FakeRepo([only_hash]), "id")
    assert from_bytes8(centroid) == only_hash
async def test_majority_wins_per_bit() -> None:
    """A bit set in more than half of the sessions is set in the centroid."""
    # Two of the three sessions are all-ones, so every bit position has a
    # 2-vs-1 majority for "set" and the centroid comes out all-ones.
    session_hashes = [_ALL_ONES, _ALL_ONES, 0]
    centroid = await _digraph_centroid(_FakeRepo(session_hashes), "id")
    assert from_bytes8(centroid) == _ALL_ONES
async def test_tie_is_not_set() -> None:
    """An even split on a bit leaves that bit cleared in the centroid."""
    # Every bit is a 1-1 tie; a majority needs strictly more than half of
    # the sessions, so no bit is set.
    session_hashes = [_ALL_ONES, 0]
    centroid = await _digraph_centroid(_FakeRepo(session_hashes), "id")
    assert from_bytes8(centroid) == 0
async def test_garbage_values_skipped() -> None:
    """Unusable observation values are ignored when building the centroid."""
    valid_hex = to_bytes8(_ALL_ONES).hex()
    fake = _FakeRepo([])
    # Bypass the constructor's encoding to inject raw strings: one that is
    # not hex at all, one that is hex but only 4 bytes long, and one valid
    # 8-byte value. Only the last should contribute.
    fake._values = ["not-hex-zz", "deadbeef", valid_hex]
    centroid = await _digraph_centroid(fake, "id")
    assert from_bytes8(centroid) == _ALL_ONES

View File

@@ -96,8 +96,8 @@ class DummyRepo(BaseRepository):
async def set_attacker_identity_id(self, a, i): await super().set_attacker_identity_id(a, i)
async def list_all_identities(self): await super().list_all_identities(); return []
async def update_identity_merged_into(self, u, w): await super().update_identity_merged_into(u, w)
async def update_identity_fingerprints(self, u, *, ja3_hashes=None, hassh_hashes=None, tls_cert_sha256=None):
await super().update_identity_fingerprints(u, ja3_hashes=ja3_hashes, hassh_hashes=hassh_hashes, tls_cert_sha256=tls_cert_sha256)
async def update_identity_fingerprints(self, u, *, ja3_hashes=None, hassh_hashes=None, tls_cert_sha256=None, kd_digraph_simhash=None):
await super().update_identity_fingerprints(u, ja3_hashes=ja3_hashes, hassh_hashes=hassh_hashes, tls_cert_sha256=tls_cert_sha256, kd_digraph_simhash=kd_digraph_simhash)
# Campaign clustering (this PR)
async def get_campaign_by_uuid(self, u): await super().get_campaign_by_uuid(u)
async def list_campaigns(self, limit=50, offset=0): await super().list_campaigns(limit, offset); return []
@@ -260,7 +260,7 @@ async def test_base_repo_coverage():
await dr.list_all_identities()
await dr.update_identity_merged_into("a", "b")
await dr.update_identity_merged_into("a", None)
await dr.update_identity_fingerprints("a", ja3_hashes='["x"]', hassh_hashes=None, tls_cert_sha256='["y"]')
await dr.update_identity_fingerprints("a", ja3_hashes='["x"]', hassh_hashes=None, tls_cert_sha256='["y"]', kd_digraph_simhash=b"\x00" * 8)
await dr.get_campaign_by_uuid("a")
await dr.list_campaigns()
await dr.count_campaigns()

View File

@@ -0,0 +1,45 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""kd_digraph_simhash round-trips through update_identity_fingerprints
and the campaign-clustering projection."""
from __future__ import annotations
from datetime import datetime, timezone
import pytest
from decnet.web.db.factory import get_repository
@pytest.fixture
async def repo(tmp_path):
    """Return an initialized repository whose ``db_path`` is a fresh
    ``kd.db`` under pytest's per-test ``tmp_path`` directory."""
    db_file = tmp_path / "kd.db"
    repository = get_repository(db_path=str(db_file))
    await repository.initialize()
    return repository
@pytest.mark.asyncio
async def test_fingerprint_write_and_clustering_read(repo):
    """A kd_digraph_simhash written via update_identity_fingerprints is
    returned unchanged by list_identities_for_clustering."""
    now = datetime.now(timezone.utc)
    await repo.create_attacker_identity({
        "uuid": "id-kd", "first_seen_at": now, "last_seen_at": now,
    })
    raw = bytes.fromhex("deadbeefcafef00d")
    await repo.update_identity_fingerprints("id-kd", kd_digraph_simhash=raw)
    rows = await repo.list_identities_for_clustering()
    # Pass a default to next(): a bare next() on an exhausted generator
    # raises StopIteration, which a coroutine turns into an opaque
    # "RuntimeError: coroutine raised StopIteration" instead of a clear
    # test failure.
    row = next((r for r in rows if r["uuid"] == "id-kd"), None)
    assert row is not None, "identity 'id-kd' missing from clustering projection"
    # bytes() normalizes whatever buffer type the backend hands back.
    assert bytes(row["kd_digraph_simhash"]) == raw
@pytest.mark.asyncio
async def test_fingerprint_overwrite_to_none(repo):
    """Writing kd_digraph_simhash=None after a value clears the stored hash."""
    now = datetime.now(timezone.utc)
    await repo.create_attacker_identity({
        "uuid": "id-kd2", "first_seen_at": now, "last_seen_at": now,
    })
    await repo.update_identity_fingerprints("id-kd2", kd_digraph_simhash=b"\x01" * 8)
    # A later pass with no biometric clears it (full-overwrite contract).
    await repo.update_identity_fingerprints("id-kd2", kd_digraph_simhash=None)
    rows = await repo.list_identities_for_clustering()
    # Pass a default to next(): a bare next() on an exhausted generator
    # raises StopIteration, which a coroutine turns into an opaque
    # "RuntimeError: coroutine raised StopIteration" instead of a clear
    # test failure.
    row = next((r for r in rows if r["uuid"] == "id-kd2"), None)
    assert row is not None, "identity 'id-kd2' missing from clustering projection"
    assert row["kd_digraph_simhash"] is None