feat(profiler): extract motor.digraph_simhash keystroke biometric
Per-session 64-bit SimHash of inter-keystroke digraph flight times: walk single-char input events, accumulate flight time per (c1,c2), bucket the median, Charikar-SimHash the bucketed pairs. Locality-sensitive so the same typist is Hamming-close across sessions; pastes and think-pauses break the chain; silent below the sample-size floor. New shared decnet/util/simhash.py (simhash64/hamming64/bytes helpers). Registered as a conditional Tier-A primitive (count 37->38); requires behave-shell>=0.1.2.
This commit is contained in:
@@ -52,6 +52,7 @@ from decnet.profiler.behave_shell._features.temporal import (
|
||||
)
|
||||
from decnet.profiler.behave_shell._features.motor import (
|
||||
command_chunking,
|
||||
digraph_simhash,
|
||||
error_correction,
|
||||
input_modality,
|
||||
keystroke_cadence,
|
||||
@@ -68,6 +69,7 @@ FEATURES: tuple[FeatureFn, ...] = (
|
||||
input_modality,
|
||||
paste_burst_rate,
|
||||
keystroke_cadence,
|
||||
digraph_simhash,
|
||||
motor_stability,
|
||||
error_correction,
|
||||
command_chunking,
|
||||
|
||||
@@ -15,13 +15,18 @@ from behave_core.spec.envelope import Observation
|
||||
|
||||
from decnet.profiler.behave_shell._ctx import SessionContext
|
||||
from decnet.profiler.behave_shell._features._emit import make_observation
|
||||
from decnet.util.simhash import simhash64
|
||||
from decnet.profiler.behave_shell._thresholds import (
|
||||
BACKSPACE_IMMEDIATE_MAX_S,
|
||||
CMD_CHUNKING_FLUENT_CV_MAX,
|
||||
CV_BURSTY_MAX,
|
||||
CV_MACHINE_MAX,
|
||||
CV_STEADY_MAX,
|
||||
DIGRAPH_FLIGHT_BUCKETS_S,
|
||||
IKI_MACHINE_MAX_S,
|
||||
IKI_THINK_MAX_S,
|
||||
MIN_DIGRAPH_SAMPLES,
|
||||
MIN_DIGRAPHS_FOR_SIMHASH,
|
||||
MIN_INPUTS_FOR_CADENCE,
|
||||
MODALITY_PASTED_MIN,
|
||||
MODALITY_TYPED_MAX,
|
||||
@@ -421,3 +426,64 @@ def pipe_chaining_depth(ctx: SessionContext) -> Iterator[Observation]:
|
||||
value=value,
|
||||
confidence=confidence,
|
||||
)
|
||||
|
||||
|
||||
def _flight_bucket(seconds: float) -> int:
    """Map a digraph flight time onto its coarse bucket index.

    The result is the position of the first entry of
    ``DIGRAPH_FLIGHT_BUCKETS_S`` that is strictly greater than ``seconds``,
    or ``len(DIGRAPH_FLIGHT_BUCKETS_S)`` when no entry exceeds it, so the
    range is 0..len(DIGRAPH_FLIGHT_BUCKETS_S) inclusive. The buckets are
    deliberately coarse: that is what lets the same typist collide across
    sessions despite per-keystroke jitter.
    """
    edges = DIGRAPH_FLIGHT_BUCKETS_S
    return next(
        (index for index, upper in enumerate(edges) if seconds < upper),
        len(edges),  # slower than every edge -> overflow bucket
    )
|
||||
|
||||
|
||||
def digraph_simhash(ctx: SessionContext) -> Iterator[Observation]:
    """Yield ``motor.digraph_simhash``: a 64-bit LSH fingerprint of the
    operator's per-digraph keystroke flight times.

    Walks ``ctx.input_events`` in order. Two consecutive single-char
    events ``(c1, c2)`` form a digraph whose flight time is the gap between
    their timestamps. A multi-char event (paste / escape sequence) resets
    the chain. A gap that is non-positive or longer than
    ``IKI_THINK_MAX_S`` is not recorded, but the current key still becomes
    the start of the next pair.

    Each digraph's *median* flight time is bucketed and the resulting
    ``"<c1c2>:<bucket>"`` tokens are SimHashed, weighted by how many
    samples that digraph contributed. The value is emitted as 16 lowercase
    hex digits.

    Yields nothing when there are fewer than ``MIN_DIGRAPHS_FOR_SIMHASH``
    distinct digraphs or fewer than ``MIN_DIGRAPH_SAMPLES`` samples in
    total — too little signal to fingerprint.
    """
    samples: dict[str, list[float]] = {}
    last_t: float | None = None
    last_c: str | None = None
    for stamp, _kind, payload in ctx.input_events:
        if len(payload) == 1:
            if last_c is not None and last_t is not None:
                gap = stamp - last_t
                if 0.0 < gap <= IKI_THINK_MAX_S:
                    samples.setdefault(last_c + payload, []).append(gap)
            last_t, last_c = stamp, payload
        else:
            # paste / control / escape burst — not a single keystroke, so
            # the next key must not be paired with whatever came before it
            last_t = last_c = None

    distinct = len(samples)
    total = sum(map(len, samples.values()))
    if distinct < MIN_DIGRAPHS_FOR_SIMHASH or total < MIN_DIGRAPH_SAMPLES:
        return

    # One token per digraph: the pair plus its median-flight bucket,
    # weighted by the number of samples behind that median.
    weighted: dict[str, int] = {}
    for pair, gaps in samples.items():
        bucket = _flight_bucket(statistics.median(gaps))
        weighted[f"{pair}:{bucket}"] = len(gaps)

    fingerprint = simhash64(weighted)
    yield make_observation(
        ctx,
        primitive="motor.digraph_simhash",
        value=format(fingerprint, "016x"),
        # Confidence grows with the number of distinct digraphs (more
        # pairs = more stable fingerprint), capped at 0.95 — never claim
        # certainty on a biometric inferred from terminal timing.
        confidence=min(0.95, 0.40 + 0.05 * distinct),
    )
|
||||
|
||||
@@ -292,6 +292,17 @@ CV_BURSTY_MAX: float = 1.50
|
||||
# Need this many input events before we'll claim a cadence at all.
|
||||
MIN_INPUTS_FOR_CADENCE: int = 5
|
||||
|
||||
# ── motor.digraph_simhash (keystroke-rhythm biometric) ──────────────────────
|
||||
# A digraph is two consecutive single-char keystrokes; its flight time is the
|
||||
# inter-event gap. We need enough distinct pairs AND enough samples before the
|
||||
# SimHash is stable enough to fingerprint a typist — below this it's noise.
|
||||
MIN_DIGRAPHS_FOR_SIMHASH: int = 8
|
||||
MIN_DIGRAPH_SAMPLES: int = 20
|
||||
# Median flight time per digraph is quantized into these log-spaced buckets
|
||||
# (upper edges, seconds). Coarse on purpose: the same typist must land in the
|
||||
# same bucket despite jitter, while a clearly faster/slower typist separates.
|
||||
DIGRAPH_FLIGHT_BUCKETS_S: tuple[float, ...] = (0.03, 0.06, 0.12, 0.25, 0.5, 1.0)
|
||||
|
||||
# ── motor.motor_stability (Step B.2) ────────────────────────────────────────
|
||||
# Tremor proxy: fraction of within-burst IATs below TREMOR_FAST_FLOOR_S
|
||||
# (30 ms — physiologically implausible double-press floor; humans can't
|
||||
|
||||
2
decnet/util/__init__.py
Normal file
2
decnet/util/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Small cross-cutting helpers with no domain home of their own."""
|
||||
66
decnet/util/simhash.py
Normal file
66
decnet/util/simhash.py
Normal file
@@ -0,0 +1,66 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Charikar 64-bit SimHash + Hamming helpers.
|
||||
|
||||
Locality-sensitive fingerprint: inputs that share most weighted tokens
|
||||
produce hashes a few bits apart (small Hamming distance), so near-
|
||||
duplicates cluster without storing the raw feature vector. Used by the
|
||||
keystroke-digraph biometric (``decnet/profiler/.../motor.py``) and the
|
||||
campaign clusterer's typing-similarity edge.
|
||||
|
||||
ponytail: ``templates/smtp/server.py:_body_simhash`` is the same
|
||||
algorithm, inlined to keep slim decky containers from importing decnet.
|
||||
Left as-is to avoid pulling decnet into decky images; dedup here only if
|
||||
a third caller appears.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
from collections.abc import Mapping
|
||||
|
||||
# Width of the fingerprint, and the mask that selects its low 64 bits.
_BITS = 64
_MASK = (1 << _BITS) - 1


def simhash64(weighted_tokens: Mapping[str, int]) -> int:
    """Compute a Charikar 64-bit SimHash over frequency-weighted tokens.

    Every token with a positive weight votes on each of the 64 bit
    positions: ``+weight`` where its own 64-bit hash has a 1, ``-weight``
    where it has a 0. A bit of the result is set only when that position's
    vote total is strictly positive, so ties resolve to 0.

    Returns 0 for an empty mapping or when no token has a positive
    weight — callers treat 0 as "no signal". The per-token hash is the
    first 8 bytes of md5, read big-endian: a content fingerprint, not a
    security primitive.
    """
    if not weighted_tokens:
        return 0
    tally = [0] * _BITS
    for token, count in weighted_tokens.items():
        if count <= 0:
            continue
        # md5[:8] is fast and 64 bits is all we need;
        # usedforsecurity=False clears B324.
        digest = hashlib.md5(
            token.encode("utf-8", errors="replace"), usedforsecurity=False,
        ).digest()
        word = int.from_bytes(digest[:8], "big")
        for pos in range(_BITS):
            if (word >> pos) & 1:
                tally[pos] += count
            else:
                tally[pos] -= count
    # Distinct powers of two never carry, so summing equals OR-ing them.
    return sum(1 << pos for pos, score in enumerate(tally) if score > 0)
|
||||
|
||||
|
||||
def hamming64(a: int, b: int) -> int:
    """Count the bit positions at which two 64-bit ints differ.

    The XOR is masked with ``_MASK`` before counting, so the count only
    covers the bits that mask selects and the masked value is never
    negative.
    """
    differing = (a ^ b) & _MASK
    return bin(differing).count("1")
|
||||
|
||||
|
||||
def to_bytes8(value: int) -> bytes:
    """Pack a 64-bit int as 8 big-endian bytes (for ``BINARY(8)`` storage).

    ``value`` is masked with ``_MASK`` before packing.
    """
    low64 = value & _MASK
    return low64.to_bytes(8, byteorder="big")
|
||||
|
||||
|
||||
def from_bytes8(raw: bytes) -> int:
    """Decode 8 big-endian bytes into an unsigned 64-bit int.

    The length of ``raw`` is not checked here; an empty value decodes
    to 0.
    """
    decoded = int.from_bytes(raw, byteorder="big")
    return decoded
|
||||
Reference in New Issue
Block a user