feat(profiler): extract motor.digraph_simhash keystroke biometric

Per-session 64-bit SimHash of inter-keystroke digraph flight times:
walk single-char input events, collect flight times per (c1,c2),
bucket each pair's median, Charikar-SimHash the bucketed pairs. Locality-
sensitive so the same typist is Hamming-close across sessions; pastes
and think-pauses break the chain; silent below the distinct-pair and
total-sample floors.

New shared decnet/util/simhash.py (simhash64/hamming64/bytes helpers).
Registered as a conditional Tier-A primitive (count 37->38); requires
behave-shell>=0.1.2.
This commit is contained in:
2026-06-16 16:59:57 -04:00
parent 372375194c
commit 66c73ce59d
12 changed files with 283 additions and 5 deletions

View File

@@ -52,6 +52,7 @@ from decnet.profiler.behave_shell._features.temporal import (
)
from decnet.profiler.behave_shell._features.motor import (
command_chunking,
digraph_simhash,
error_correction,
input_modality,
keystroke_cadence,
@@ -68,6 +69,7 @@ FEATURES: tuple[FeatureFn, ...] = (
input_modality,
paste_burst_rate,
keystroke_cadence,
digraph_simhash,
motor_stability,
error_correction,
command_chunking,

View File

@@ -15,13 +15,18 @@ from behave_core.spec.envelope import Observation
from decnet.profiler.behave_shell._ctx import SessionContext
from decnet.profiler.behave_shell._features._emit import make_observation
from decnet.util.simhash import simhash64
from decnet.profiler.behave_shell._thresholds import (
BACKSPACE_IMMEDIATE_MAX_S,
CMD_CHUNKING_FLUENT_CV_MAX,
CV_BURSTY_MAX,
CV_MACHINE_MAX,
CV_STEADY_MAX,
DIGRAPH_FLIGHT_BUCKETS_S,
IKI_MACHINE_MAX_S,
IKI_THINK_MAX_S,
MIN_DIGRAPH_SAMPLES,
MIN_DIGRAPHS_FOR_SIMHASH,
MIN_INPUTS_FOR_CADENCE,
MODALITY_PASTED_MIN,
MODALITY_TYPED_MAX,
@@ -421,3 +426,64 @@ def pipe_chaining_depth(ctx: SessionContext) -> Iterator[Observation]:
value=value,
confidence=confidence,
)
def _flight_bucket(seconds: float) -> int:
    """Map a digraph flight time onto the index of its coarse bucket.

    ``DIGRAPH_FLIGHT_BUCKETS_S`` lists the buckets' upper edges. The result
    is the index of the first edge strictly greater than ``seconds``; when
    no edge exceeds it, the overflow index ``len(DIGRAPH_FLIGHT_BUCKETS_S)``
    is returned. The buckets are deliberately coarse so that per-keystroke
    jitter does not move a typist's median into a different bucket.
    """
    overflow_index = len(DIGRAPH_FLIGHT_BUCKETS_S)
    return next(
        (
            index
            for index, upper_edge in enumerate(DIGRAPH_FLIGHT_BUCKETS_S)
            if seconds < upper_edge
        ),
        overflow_index,
    )
def digraph_simhash(ctx: SessionContext) -> Iterator[Observation]:
    """Yield ``motor.digraph_simhash``: a 64-bit SimHash of digraph timing.

    A digraph is a pair of consecutive single-character input events
    ``(c1, c2)``; its flight time is the gap between the two event
    timestamps. Multi-character events (pastes, escape sequences) reset the
    pairing entirely. A gap that is non-positive or longer than
    ``IKI_THINK_MAX_S`` is not recorded, but the later keystroke still
    becomes the first half of the next candidate pair.

    For every digraph the *median* flight time is bucketed via
    ``_flight_bucket`` and turned into a ``"<c1c2>:<bucket>"`` token weighted
    by how many samples that digraph contributed. The weighted tokens are
    passed to ``simhash64`` and the result is emitted as a 16-digit
    zero-padded hex string.

    Nothing is yielded when fewer than ``MIN_DIGRAPHS_FOR_SIMHASH`` distinct
    digraphs or fewer than ``MIN_DIGRAPH_SAMPLES`` total samples were seen.
    """
    samples_by_pair: dict[str, list[float]] = {}
    last_time: float | None = None
    last_char: str | None = None

    for stamp, _kind, data in ctx.input_events:
        if len(data) != 1:
            # Not a lone keystroke (paste / control / escape burst): forget
            # the predecessor so no pair spans this event.
            last_time = None
            last_char = None
            continue

        if last_char is not None and last_time is not None:
            gap = stamp - last_time
            # Keep only strictly positive gaps within the think-pause limit.
            if 0.0 < gap <= IKI_THINK_MAX_S:
                pair = last_char + data
                if pair not in samples_by_pair:
                    samples_by_pair[pair] = []
                samples_by_pair[pair].append(gap)

        last_time = stamp
        last_char = data

    distinct_pairs = len(samples_by_pair)
    sample_total = sum(map(len, samples_by_pair.values()))
    has_enough_signal = (
        distinct_pairs >= MIN_DIGRAPHS_FOR_SIMHASH
        and sample_total >= MIN_DIGRAPH_SAMPLES
    )
    if not has_enough_signal:
        return

    # token -> weight, where the weight is the digraph's sample count.
    weighted_tokens: dict[str, int] = {}
    for pair, gaps in samples_by_pair.items():
        bucket = _flight_bucket(statistics.median(gaps))
        weighted_tokens[f"{pair}:{bucket}"] = len(gaps)

    # More distinct digraphs means a steadier fingerprint, but the score is
    # capped at 0.95: a biometric inferred from terminal timing is never
    # treated as certain.
    confidence = min(0.95, 0.40 + 0.05 * distinct_pairs)

    fingerprint = simhash64(weighted_tokens)
    yield make_observation(
        ctx,
        primitive="motor.digraph_simhash",
        value=f"{fingerprint:016x}",
        confidence=confidence,
    )

View File

@@ -292,6 +292,17 @@ CV_BURSTY_MAX: float = 1.50
# Need this many input events before we'll claim a cadence at all.
MIN_INPUTS_FOR_CADENCE: int = 5
# ── motor.digraph_simhash (keystroke-rhythm biometric) ──────────────────────
# A digraph is two consecutive single-char keystrokes; its flight time is the
# inter-event gap. We need enough distinct pairs AND enough samples before the
# SimHash is stable enough to fingerprint a typist — below this it's noise.
# Floor on the number of distinct digraphs seen in one session.
MIN_DIGRAPHS_FOR_SIMHASH: int = 8
# Floor on the total flight-time samples, summed over all digraphs.
MIN_DIGRAPH_SAMPLES: int = 20
# Median flight time per digraph is quantized into these log-spaced buckets
# (upper edges, seconds). Coarse on purpose: the same typist must land in the
# same bucket despite jitter, while a clearly faster/slower typist separates.
# Six edges yield seven bucket indices (0..6): a median below 0.03 s is bucket
# 0, and a median at or above the last edge (1.0 s) falls in the final bucket.
DIGRAPH_FLIGHT_BUCKETS_S: tuple[float, ...] = (0.03, 0.06, 0.12, 0.25, 0.5, 1.0)
# ── motor.motor_stability (Step B.2) ────────────────────────────────────────
# Tremor proxy: fraction of within-burst IATs below TREMOR_FAST_FLOOR_S
# (30 ms — physiologically implausible double-press floor; humans can't