feat(profiler): extract motor.digraph_simhash keystroke biometric
Per-session 64-bit SimHash of inter-keystroke digraph flight times: walk single-char input events, accumulate flight times per (c1,c2) pair, bucket the median, and Charikar-SimHash the bucketed pairs. The hash is locality-sensitive, so the same typist is Hamming-close across sessions; pastes and think-pauses break the chain; the extractor stays silent below the sample-size floor. Adds a new shared decnet/util/simhash.py (simhash64/hamming64/bytes helpers). Registered as a conditional Tier-A primitive (count 37->38); requires behave-shell>=0.1.2.
This commit is contained in:
0
tests/util/__init__.py
Normal file
0
tests/util/__init__.py
Normal file
38
tests/util/test_simhash.py
Normal file
38
tests/util/test_simhash.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Charikar SimHash util — determinism, LSH property, byte round-trip."""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.util.simhash import from_bytes8, hamming64, simhash64, to_bytes8
|
||||
|
||||
|
||||
def test_empty_or_zero_weight_is_zero() -> None:
    """Check that inputs with nothing to contribute hash to zero.

    Covers an empty mapping and a mapping whose weights are all
    non-positive (those tokens are expected to be skipped).
    """
    degenerate_inputs = (
        {},
        {"a": 0, "b": -3},  # non-positive weights skipped
    )
    for weights in degenerate_inputs:
        assert simhash64(weights) == 0
|
||||
|
||||
|
||||
def test_deterministic() -> None:
    """Check that an equal copy of the same token weights hashes identically."""
    weights = {"th": 3, "he": 2, "er": 1}
    first_hash = simhash64(weights)
    # Hash a fresh dict with equal contents rather than the same object.
    second_hash = simhash64(dict(weights))
    assert first_hash == second_hash
|
||||
|
||||
|
||||
def test_near_duplicate_low_hamming() -> None:
    """Check that near-identical token sets land close in Hamming distance.

    An exact copy must be at distance 0; adding a single weight-1 token to a
    40-token set must move the hash by at most 8 bits.
    """
    base = {}
    for i in range(40):
        base[f"dg{i}"] = (i % 5) + 1  # weights cycle through 1..5

    same_content = dict(base)
    # One extra low-weight token on top of the unchanged base set.
    with_extra = {**base, "NEW_PAIR": 1}

    distance_to_copy = hamming64(simhash64(base), simhash64(same_content))
    assert distance_to_copy == 0

    distance_to_perturbed = hamming64(simhash64(base), simhash64(with_extra))
    assert distance_to_perturbed <= 8
|
||||
|
||||
|
||||
def test_disjoint_high_hamming() -> None:
    """Check that token sets with no keys in common hash far apart."""
    left = dict.fromkeys((f"a{i}" for i in range(30)), 2)
    right = dict.fromkeys((f"b{i}" for i in range(30)), 2)
    distance = hamming64(simhash64(left), simhash64(right))
    # Unrelated token sets should differ in roughly half of the 64 bits,
    # so a floor of 20 leaves comfortable margin.
    assert distance >= 20
|
||||
|
||||
|
||||
def test_bytes_roundtrip_is_8_bytes() -> None:
    """Check that a hash serialises to exactly 8 bytes and decodes back unchanged."""
    digest = simhash64({"x": 1, "y": 2, "z": 5})
    packed = to_bytes8(digest)
    # Type is checked before length, so a non-bytes result fails here first.
    assert isinstance(packed, bytes)
    assert len(packed) == 8
    assert from_bytes8(packed) == digest
|
||||
Reference in New Issue
Block a user