feat(profiler): extract motor.digraph_simhash keystroke biometric
Per-session 64-bit SimHash of inter-keystroke digraph flight times: walk single-char input events, accumulate flight time per (c1,c2), bucket the median, Charikar-SimHash the bucketed pairs. Locality-sensitive so the same typist is Hamming-close across sessions; pastes and think-pauses break the chain; silent below the sample-size floor. New shared decnet/util/simhash.py (simhash64/hamming64/bytes helpers). Registered as a conditional Tier-A primitive (count 37->38); requires behave-shell>=0.1.2.
This commit is contained in:
66
decnet/util/simhash.py
Normal file
66
decnet/util/simhash.py
Normal file
@@ -0,0 +1,66 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Charikar 64-bit SimHash + Hamming helpers.
|
||||
|
||||
Locality-sensitive fingerprint: inputs that share most weighted tokens
|
||||
produce hashes a few bits apart (small Hamming distance), so near-
|
||||
duplicates cluster without storing the raw feature vector. Used by the
|
||||
keystroke-digraph biometric (``decnet/profiler/.../motor.py``) and the
|
||||
campaign clusterer's typing-similarity edge.
|
||||
|
||||
ponytail: ``templates/smtp/server.py:_body_simhash`` is the same
|
||||
algorithm, inlined to keep slim decky containers from importing decnet.
|
||||
Left as-is to avoid pulling decnet into decky images; dedup here only if
|
||||
a third caller appears.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
from collections.abc import Mapping
|
||||
|
||||
# Width of the fingerprint in bits.
_BITS = 64
# All-ones mask of that width; used to clamp Python's unbounded ints to 64 bits.
_MASK = 2 ** _BITS - 1
|
||||
|
||||
|
||||
def simhash64(weighted_tokens: Mapping[str, int]) -> int:
    """Compute a Charikar SimHash fingerprint of ``_BITS`` bits.

    Each token with a positive weight is hashed to an integer taken from
    the first 8 bytes (big-endian) of its MD5 digest. For every bit
    position the token's weight is added to a running tally when that bit
    is set in the token hash and subtracted when it is clear. The result
    has a 1 in every position whose final tally is strictly positive.

    MD5 is used purely as a fast content fingerprint, not as a security
    primitive (hence ``usedforsecurity=False``).

    Args:
        weighted_tokens: Mapping of token text to its weight. Entries with
            a weight ``<= 0`` are ignored.

    Returns:
        The fingerprint as a non-negative int. ``0`` is returned for a
        falsy/empty mapping or when no token carries a positive weight.
    """
    # Keep the explicit guard: any falsy argument short-circuits to 0.
    if not weighted_tokens:
        return 0

    tally = [0] * _BITS
    for token, w in weighted_tokens.items():
        if w <= 0:
            continue
        # errors="replace" keeps un-encodable text (e.g. lone surrogates)
        # from raising; it is hashed in its replaced form instead.
        encoded = token.encode("utf-8", errors="replace")
        digest = hashlib.md5(encoded, usedforsecurity=False).digest()
        fingerprint = int.from_bytes(digest[:8], "big")
        for pos in range(_BITS):
            delta = w if (fingerprint >> pos) & 1 else -w
            tally[pos] += delta

    # Bit positions are distinct, so summing the powers of two is the same
    # as OR-ing them together.
    return sum(1 << pos for pos, score in enumerate(tally) if score > 0)
|
||||
|
||||
|
||||
def hamming64(a: int, b: int) -> int:
    """Return the Hamming distance between *a* and *b*.

    The XOR of the two values is masked with ``_MASK`` before counting, so
    only bits inside the fingerprint width contribute to the distance.
    """
    differing = (a ^ b) & _MASK
    return differing.bit_count()
|
||||
|
||||
|
||||
def to_bytes8(value: int) -> bytes:
    """Serialize *value* as 8 big-endian bytes (for ``BINARY(8)`` storage).

    The value is first masked with ``_MASK``, so bits above the fingerprint
    width are dropped rather than causing an overflow error.
    """
    clamped = value & _MASK
    return clamped.to_bytes(8, byteorder="big")
|
||||
|
||||
|
||||
def from_bytes8(raw: bytes) -> int:
    """Decode big-endian bytes into an unsigned int.

    Intended for 8-byte input; the length is not checked here, so the
    result is simply the big-endian value of whatever bytes are passed
    (``b""`` decodes to ``0``).
    """
    return int.from_bytes(raw, byteorder="big")
|
||||
Reference in New Issue
Block a user