feat(profiler): extract motor.digraph_simhash keystroke biometric
Per-session 64-bit SimHash of inter-keystroke digraph flight times: walk single-char input events, accumulate flight time per (c1,c2), bucket the median, Charikar-SimHash the bucketed pairs. Locality-sensitive, so the same typist is Hamming-close across sessions; pastes and think-pauses break the chain; silent below the sample-size floor. New shared decnet/util/simhash.py (simhash64/hamming64/bytes helpers). Registered as a conditional Tier-A primitive (count 37->38); requires behave-shell>=0.1.2.
This commit is contained in:
@@ -109,6 +109,7 @@ PHASE_G_CONDITIONAL_PRIMITIVES: frozenset[str] = frozenset({
|
||||
"emotional_valence.arousal", # needs typing bursts
|
||||
"emotional_valence.valence", # needs ≥ 80 typed letters
|
||||
"emotional_valence.frustration_venting", # needs ≥ 30 typed letters
|
||||
"motor.digraph_simhash", # needs ≥ 8 distinct digraphs / ≥ 20 samples
|
||||
})
|
||||
|
||||
# Backwards-compatible aliases for any external import — earlier phases
|
||||
|
||||
69
tests/profiler/behave_shell/test_motor_digraph_simhash.py
Normal file
69
tests/profiler/behave_shell/test_motor_digraph_simhash.py
Normal file
@@ -0,0 +1,69 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""``motor.digraph_simhash`` — keystroke-rhythm biometric.
|
||||
|
||||
Builds typed input streams (single-char ``"i"`` events at a fixed
|
||||
inter-key gap) and asserts the LSH property: same typist → Hamming-close,
|
||||
different cadence → far apart, pastes excluded, thin sessions silent.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.profiler.behave_shell import extract_session
|
||||
from decnet.profiler.behave_shell._parse import AsciinemaEvent
|
||||
from decnet.util.simhash import from_bytes8, hamming64
|
||||
|
||||
# A realistic multi-command session: plenty of distinct digraphs, > 20 samples.
|
||||
_PHRASE = "ls -la /etc; cat /etc/passwd; whoami; uname -a; netstat -tlnp\r"
|
||||
|
||||
|
||||
def _typed(phrase: str, dt: float, *, start: float = 0.0) -> list[AsciinemaEvent]:
    """Build one single-character ``"i"`` event per character of *phrase*.

    The first event is stamped ``start``; every later stamp is the previous
    stamp plus ``dt``. An empty *phrase* yields an empty list.
    """
    stamps = [start]
    for _ in phrase:
        # Running sum on purpose: each stamp derives from the previous one.
        stamps.append(stamps[-1] + dt)
    # zip() stops at the end of ``phrase``, discarding the one surplus stamp.
    return [(stamp, "i", key) for stamp, key in zip(stamps, phrase)]
|
||||
|
||||
|
||||
def _digraph_obs(events: list[AsciinemaEvent], sid: str):
    """Run ``extract_session`` and keep only ``motor.digraph_simhash`` results.

    Returns a list (possibly empty) of the emitted observations whose
    ``primitive`` attribute equals ``"motor.digraph_simhash"``.
    """
    emitted = [*extract_session(events, sid=sid)]
    return [ob for ob in emitted if ob.primitive == "motor.digraph_simhash"]
|
||||
|
||||
|
||||
def _hash_int(obs) -> int:
    """Decode the hex string in ``obs.value`` and convert it with ``from_bytes8``."""
    raw = bytes.fromhex(obs.value)
    return from_bytes8(raw)
|
||||
|
||||
|
||||
def test_emits_one_observation_for_a_normal_session() -> None:
    """A normally typed session yields exactly one well-formed observation."""
    found = _digraph_obs(_typed(_PHRASE, 0.12), "dg-basic")
    assert len(found) == 1
    fingerprint = found[0]
    assert len(fingerprint.value) == 16  # 64-bit hex
    assert 0.0 < fingerprint.confidence <= 0.95
|
||||
|
||||
|
||||
def test_same_typist_identical_timing_is_identical() -> None:
    """Two sessions with the same text and gap hash to the same value."""
    first, second = (
        _digraph_obs(_typed(_PHRASE, 0.12), sid)[0] for sid in ("dg-a", "dg-b")
    )
    # Identical text + timing → identical fingerprint (0 Hamming).
    distance = hamming64(_hash_int(first), _hash_int(second))
    assert distance == 0
|
||||
|
||||
|
||||
def test_different_cadence_separates() -> None:
    """The same phrase typed at 0.05 s and 0.45 s gaps must not hash equal."""
    quick_events = _typed(_PHRASE, 0.05)
    slow_events = _typed(_PHRASE, 0.45)
    quick = _digraph_obs(quick_events, "dg-fast")[0]
    unhurried = _digraph_obs(slow_events, "dg-slow")[0]
    # Same vocabulary, very different flight-time buckets → the hashes diverge.
    distance = hamming64(_hash_int(quick), _hash_int(unhurried))
    assert distance > 0
|
||||
|
||||
|
||||
def test_pastes_do_not_form_digraphs() -> None:
    """A session consisting only of multi-character inputs emits nothing."""
    # A session made of large paste events (len >= 4) carries no single-char
    # keystrokes, so no digraphs and no observation.
    pasted = "sudo apt-get update"
    events: list[AsciinemaEvent] = [(float(tick), "i", pasted) for tick in range(10)]
    assert _digraph_obs(events, "dg-paste") == []
|
||||
|
||||
|
||||
def test_thin_session_is_silent() -> None:
    """A three-keystroke session produces no observation."""
    # Below MIN_DIGRAPHS_FOR_SIMHASH / MIN_DIGRAPH_SAMPLES → no emission.
    sparse = _typed("ls\r", 0.1)
    assert _digraph_obs(sparse, "dg-thin") == []
|
||||
@@ -91,10 +91,14 @@ def test_no_extractor_set_drifts_from_registry() -> None:
|
||||
)
|
||||
|
||||
|
||||
def test_tier_a_count_is_37() -> None:
|
||||
"""Sanity check: Tier-A count matches the design doc (37 primitives)."""
|
||||
assert len(_tier_a_primitives()) == 37, (
|
||||
f"Expected 37 Tier-A primitives per BEHAVE-EXTRACTOR.md; "
|
||||
def test_tier_a_count_is_38() -> None:
    """Sanity check: the Tier-A primitive count matches the design doc.

    The expected count is 38: behave-shell 0.1.2 added
    ``motor.digraph_simhash`` (the keystroke-rhythm biometric) to the
    previous 37.
    """
    tier_a = _tier_a_primitives()
    assert len(tier_a) == 38, (
        f"Expected 38 Tier-A primitives per BEHAVE-EXTRACTOR.md; "
        f"got {len(tier_a)}. Update Phase H if the "
        f"spec genuinely changed, or adjust TIER_B_ALLOWLIST."
    )
|
||||
|
||||
0
tests/util/__init__.py
Normal file
0
tests/util/__init__.py
Normal file
38
tests/util/test_simhash.py
Normal file
38
tests/util/test_simhash.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Charikar SimHash util — determinism, LSH property, byte round-trip."""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.util.simhash import from_bytes8, hamming64, simhash64, to_bytes8
|
||||
|
||||
|
||||
def test_empty_or_zero_weight_is_zero() -> None:
    """An empty mapping, or one with only non-positive weights, hashes to 0."""
    no_tokens: dict[str, int] = {}
    non_positive = {"a": 0, "b": -3}  # non-positive weights skipped
    for tokens in (no_tokens, non_positive):
        assert simhash64(tokens) == 0
|
||||
|
||||
|
||||
def test_deterministic() -> None:
    """Hashing a mapping and an equal copy of it gives the same value."""
    tokens = {"th": 3, "he": 2, "er": 1}
    clone = dict(tokens)
    assert simhash64(tokens) == simhash64(clone)
|
||||
|
||||
|
||||
def test_near_duplicate_low_hamming() -> None:
    """One extra low-weight token moves the hash by at most 8 bits."""
    base = {f"dg{i}": (i % 5) + 1 for i in range(40)}
    identical = dict(base)
    perturbed = {**base, "NEW_PAIR": 1}  # one extra low-weight token
    base_hash = simhash64(base)
    assert hamming64(base_hash, simhash64(identical)) == 0
    assert hamming64(base_hash, simhash64(perturbed)) <= 8
|
||||
|
||||
|
||||
def test_disjoint_high_hamming() -> None:
    """Two token sets with no keys in common differ in at least 20 bits."""
    left = dict.fromkeys((f"a{i}" for i in range(30)), 2)
    right = dict.fromkeys((f"b{i}" for i in range(30)), 2)
    # Unrelated token sets ≈ half the 64 bits differ; comfortably ≥ 20.
    distance = hamming64(simhash64(left), simhash64(right))
    assert distance >= 20
|
||||
|
||||
|
||||
def test_bytes_roundtrip_is_8_bytes() -> None:
    """``to_bytes8`` gives 8 bytes that ``from_bytes8`` turns back into the hash."""
    digest = simhash64({"x": 1, "y": 2, "z": 5})
    packed = to_bytes8(digest)
    assert isinstance(packed, bytes)
    assert len(packed) == 8
    assert from_bytes8(packed) == digest
|
||||
Reference in New Issue
Block a user