feat(clustering): link identities by keystroke-rhythm proximity

Campaign clusterer gains a keystroke edge: when two identities'
kd_digraph_simhash centroids are within KD_HAMMING_MAX bits, a graded
weight (1.0 at identical, fading to 0 at the cutoff) feeds the campaign
graph. Supporting tier (0.6) — a typing match plus temporal overlap
reaches threshold, but typing alone never merges (FP guard against
coarse, noisy terminal timing).

Projects the column through IdentityFeatures + from_identity_row.
This commit is contained in:
2026-06-16 17:09:42 -04:00
parent 869d1eabb7
commit c9e4bf4022
3 changed files with 133 additions and 0 deletions

View File

@@ -31,6 +31,7 @@ from decnet.clustering.campaign.impl.similarity import (
combined_campaign_weight,
)
from decnet.logging import get_logger
from decnet.util.simhash import from_bytes8
from decnet.web.db.repository import BaseRepository
log = get_logger("clustering.campaign.connected_components")
@@ -94,6 +95,7 @@ def from_identity_row(
payload_hashes = _parse_json_list(row.get("payload_simhashes"))
c2_endpoints = _parse_json_list(row.get("c2_endpoints"))
kd_digraph_simhash = _parse_kd_simhash(row.get("kd_digraph_simhash"))
first_phase_per_decky: dict[str, str] = {}
last_phase_per_decky: dict[str, str] = {}
@@ -127,6 +129,7 @@ def from_identity_row(
identity_uuid=row["uuid"],
payload_hashes=frozenset(payload_hashes),
c2_endpoints=frozenset(c2_endpoints),
kd_digraph_simhash=kd_digraph_simhash,
decky_set=frozenset(decky_set),
first_phase_per_decky=first_phase_per_decky,
last_phase_per_decky=last_phase_per_decky,
@@ -135,6 +138,24 @@ def from_identity_row(
)
def _parse_kd_simhash(raw: Any) -> Optional[int]:
"""Project the ``kd_digraph_simhash`` column into a 64-bit int.
The column is 8 raw bytes (BINARY(8) / BLOB) or NULL; a hex string
is tolerated for fixture rows. Returns ``None`` on absent/malformed.
"""
if raw is None:
return None
if isinstance(raw, (bytes, bytearray)) and len(raw) == 8:
return from_bytes8(bytes(raw))
if isinstance(raw, str) and len(raw) == 16:
try:
return from_bytes8(bytes.fromhex(raw))
except ValueError:
return None
return None
def _parse_json_list(raw: Optional[str]) -> list[str]:
if not raw:
return []