feat(profiler): extract motor.digraph_simhash keystroke biometric

Per-session 64-bit SimHash of inter-keystroke digraph flight times:
walk single-char input events, accumulate flight time per (c1,c2),
bucket the median, Charikar-SimHash the bucketed pairs. Locality-
sensitive so the same typist is Hamming-close across sessions; pastes
and think-pauses break the chain; silent below the sample-size floor.

New shared decnet/util/simhash.py (simhash64/hamming64/bytes helpers).
Registered as a conditional Tier-A primitive (count 37->38); requires
behave-shell>=0.1.2.
This commit is contained in:
2026-06-16 16:59:57 -04:00
parent 372375194c
commit 66c73ce59d
12 changed files with 283 additions and 5 deletions

0
tests/util/__init__.py Normal file
View File

View File

@@ -0,0 +1,38 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Charikar SimHash util — determinism, LSH property, byte round-trip."""
from __future__ import annotations
from decnet.util.simhash import from_bytes8, hamming64, simhash64, to_bytes8
def test_empty_or_zero_weight_is_zero() -> None:
    """An empty token map, or one with only non-positive weights, hashes to 0."""
    no_tokens = {}
    assert simhash64(no_tokens) == 0

    # Zero and negative weights are expected to be skipped, leaving nothing to hash.
    non_positive = {"a": 0, "b": -3}
    assert simhash64(non_positive) == 0
def test_deterministic() -> None:
    """Hashing an equal copy of the same token weights gives the same value."""
    weights = {"th": 3, "he": 2, "er": 1}
    first = simhash64(weights)
    # A fresh dict with equal contents must not change the result.
    second = simhash64(dict(weights))
    assert first == second
def test_near_duplicate_low_hamming() -> None:
    """A copy hashes identically; one extra light token flips at most 8 bits."""
    reference = {}
    for idx in range(40):
        reference[f"dg{idx}"] = (idx % 5) + 1

    clone = dict(reference)
    # Same tokens in the same order, plus one extra low-weight token at the end.
    nudged = {**reference, "NEW_PAIR": 1}

    assert hamming64(simhash64(reference), simhash64(clone)) == 0
    assert hamming64(simhash64(reference), simhash64(nudged)) <= 8
def test_disjoint_high_hamming() -> None:
    """Token sets sharing no keys should hash at least 20 bits apart."""
    left = dict.fromkeys((f"a{i}" for i in range(30)), 2)
    right = dict.fromkeys((f"b{i}" for i in range(30)), 2)

    # Unrelated token sets are expected to differ in roughly half of the
    # 64 bits, so a floor of 20 leaves comfortable margin.
    distance = hamming64(simhash64(left), simhash64(right))
    assert distance >= 20
def test_bytes_roundtrip_is_8_bytes() -> None:
    """to_bytes8 yields exactly 8 bytes and from_bytes8 recovers the hash."""
    digest = simhash64({"x": 1, "y": 2, "z": 5})
    packed = to_bytes8(digest)

    assert isinstance(packed, bytes)
    assert len(packed) == 8
    # Decoding the packed form must give back the original integer.
    assert from_bytes8(packed) == digest