feat(profiler/behave_shell): emit environmental.keyboard_layout
ANTI authorised dropping the PII boundary for this primitive. ctx gains typed_unigram_counts / typed_bigram_counts / typed_letter_count, populated during the existing single-pass input walk (paste-class events excluded).

Two-axis classifier:
* layout-artefact unigrams take priority — q rate above its floor with low English saturation → azerty; z rate above its floor with y rate below its ceiling → qwertz
* fallback to English-bigram saturation: ≥ floor → qwerty, else other

Sample-size floor is 200 typed letters; the bigram histogram is capped at the top 64 entries to bound memory. Confidence cap stays moderate (0.40-0.55) because this is a heuristic discriminator.
This commit is contained in:
@@ -12,7 +12,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import math
|
import math
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Iterable
|
from typing import Iterable, Mapping
|
||||||
|
|
||||||
from decnet.profiler.behave_shell._parse import (
|
from decnet.profiler.behave_shell._parse import (
|
||||||
AsciinemaEvent,
|
AsciinemaEvent,
|
||||||
@@ -26,6 +26,7 @@ from decnet.profiler.behave_shell._parse import (
|
|||||||
)
|
)
|
||||||
from decnet.profiler.behave_shell._thresholds import (
|
from decnet.profiler.behave_shell._thresholds import (
|
||||||
IKI_THINK_MAX_S,
|
IKI_THINK_MAX_S,
|
||||||
|
LAYOUT_BIGRAM_TOP_N,
|
||||||
PASTE_BURST_MAX_IAT_S,
|
PASTE_BURST_MAX_IAT_S,
|
||||||
PASTE_MIN_CHARS_PER_EVENT,
|
PASTE_MIN_CHARS_PER_EVENT,
|
||||||
PROMPT_LINE_MAX_CHARS,
|
PROMPT_LINE_MAX_CHARS,
|
||||||
@@ -69,6 +70,12 @@ class SessionContext:
|
|||||||
# Step F.0 derivations — PS1 prompt lines detected in the output stream
|
# Step F.0 derivations — PS1 prompt lines detected in the output stream
|
||||||
prompt_lines: tuple[PromptLine, ...] = field(default_factory=tuple)
|
prompt_lines: tuple[PromptLine, ...] = field(default_factory=tuple)
|
||||||
|
|
||||||
|
# Step F.4 derivations — typed-only character histograms for keyboard
|
||||||
|
# layout fingerprinting (PII boundary lifted by ANTI for Phase F).
|
||||||
|
typed_unigram_counts: Mapping[str, int] = field(default_factory=dict)
|
||||||
|
typed_bigram_counts: Mapping[str, int] = field(default_factory=dict)
|
||||||
|
typed_letter_count: int = 0
|
||||||
|
|
||||||
|
|
||||||
def _detect_paste_bursts(
|
def _detect_paste_bursts(
|
||||||
inputs: list[AsciinemaEvent],
|
inputs: list[AsciinemaEvent],
|
||||||
@@ -300,6 +307,46 @@ def _output_bytes_between(
|
|||||||
return sum(len(d) for t, _k, d in outputs if start <= t < end)
|
return sum(len(d) for t, _k, d in outputs if start <= t < end)
|
||||||
|
|
||||||
|
|
||||||
|
def _typed_char_histograms(
    inputs: list[AsciinemaEvent],
) -> tuple[Mapping[str, int], Mapping[str, int], int]:
    """Build typed-only letter unigram and bigram histograms from input events.

    Events whose payload is at least ``PASTE_MIN_CHARS_PER_EVENT``
    characters long are treated as pastes and skipped — pasted text
    reveals nothing about the operator's keyboard. In the remaining
    events only ASCII letters are counted, folded to lower case.

    A bigram is recorded for every pair of consecutive letters. The chain
    carries over from one event to the next and is broken by any
    non-letter character, by a skipped paste event, or by an escape
    sequence.

    Escape sequences: a short event may carry a terminal control sequence
    rather than typed text (e.g. up-arrow is ``ESC [ A``). Its trailing
    letter is a key code, not a typed letter, so once an ESC byte is seen
    the rest of that event is ignored. A sequence split across several
    events is not detected.

    Args:
        inputs: input events as ``(time, kind, data)`` triples; only
            ``data`` is inspected.

    Returns:
        ``(unigrams, bigrams, total_letters)``. ``bigrams`` is cut down to
        the ``LAYOUT_BIGRAM_TOP_N`` most frequent entries after the walk,
        so the cap bounds what is stored on the context, not the size of
        the working dict while walking. ``total_letters`` is the number of
        letters counted into ``unigrams``.
    """
    unigrams: dict[str, int] = {}
    bigrams: dict[str, int] = {}
    total_letters = 0
    last_letter: str | None = None
    for _t, _kind, data in inputs:
        if len(data) >= PASTE_MIN_CHARS_PER_EVENT:
            # Paste-class event: skip it and do not chain a bigram across it.
            last_letter = None
            continue
        for c in data:
            if c == "\x1b":
                # Start of a control sequence: the remaining characters of
                # this event are key codes, not typed text.
                last_letter = None
                break
            if c.isascii() and c.isalpha():
                lower = c.lower()
                unigrams[lower] = unigrams.get(lower, 0) + 1
                total_letters += 1
                if last_letter is not None:
                    big = last_letter + lower
                    bigrams[big] = bigrams.get(big, 0) + 1
                last_letter = lower
            else:
                # Digits, punctuation, whitespace and non-ASCII break the chain.
                last_letter = None
    if len(bigrams) > LAYOUT_BIGRAM_TOP_N:
        # Stable sort: equally frequent bigrams keep first-seen order.
        top = sorted(bigrams.items(), key=lambda kv: -kv[1])[:LAYOUT_BIGRAM_TOP_N]
        bigrams = dict(top)
    return unigrams, bigrams, total_letters
|
||||||
|
|
||||||
|
|
||||||
def _output_window(
|
def _output_window(
|
||||||
outputs: list[AsciinemaEvent],
|
outputs: list[AsciinemaEvent],
|
||||||
start: float,
|
start: float,
|
||||||
@@ -385,6 +432,7 @@ def build_session_context(
|
|||||||
for i in range(len(commands) - 1)
|
for i in range(len(commands) - 1)
|
||||||
)
|
)
|
||||||
intra_command_iats = _per_command_iats(commands, inputs)
|
intra_command_iats = _per_command_iats(commands, inputs)
|
||||||
|
typed_uni, typed_bi, typed_letters = _typed_char_histograms(inputs)
|
||||||
|
|
||||||
return SessionContext(
|
return SessionContext(
|
||||||
sid=sid,
|
sid=sid,
|
||||||
@@ -407,4 +455,7 @@ def build_session_context(
|
|||||||
kill_line_count=kill_line_count,
|
kill_line_count=kill_line_count,
|
||||||
intra_command_iats=intra_command_iats,
|
intra_command_iats=intra_command_iats,
|
||||||
prompt_lines=prompt_lines,
|
prompt_lines=prompt_lines,
|
||||||
|
typed_unigram_counts=typed_uni,
|
||||||
|
typed_bigram_counts=typed_bi,
|
||||||
|
typed_letter_count=typed_letters,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ from decnet.profiler.behave_shell._features.cognitive import (
|
|||||||
inter_command_latency_class,
|
inter_command_latency_class,
|
||||||
)
|
)
|
||||||
from decnet.profiler.behave_shell._features.environmental import (
|
from decnet.profiler.behave_shell._features.environmental import (
|
||||||
|
keyboard_layout,
|
||||||
locale,
|
locale,
|
||||||
shell_type,
|
shell_type,
|
||||||
terminal_multiplexer,
|
terminal_multiplexer,
|
||||||
@@ -75,4 +76,5 @@ FEATURES: tuple[FeatureFn, ...] = (
|
|||||||
shell_type,
|
shell_type,
|
||||||
terminal_multiplexer,
|
terminal_multiplexer,
|
||||||
locale,
|
locale,
|
||||||
|
keyboard_layout,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ which F.1 / F.3 / E.4 read.
|
|||||||
Step F.1: ``environmental.shell_type``.
|
Step F.1: ``environmental.shell_type``.
|
||||||
Step F.2: ``environmental.terminal_multiplexer``.
|
Step F.2: ``environmental.terminal_multiplexer``.
|
||||||
Step F.3: ``environmental.locale``.
|
Step F.3: ``environmental.locale``.
|
||||||
|
Step F.4: ``environmental.keyboard_layout``.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
@@ -21,6 +22,13 @@ from decnet.profiler.behave_shell._ctx import SessionContext
|
|||||||
from decnet.profiler.behave_shell._features._emit import make_observation
|
from decnet.profiler.behave_shell._features._emit import make_observation
|
||||||
from decnet.profiler.behave_shell._parse import PromptLine, strip_ansi
|
from decnet.profiler.behave_shell._parse import PromptLine, strip_ansi
|
||||||
from decnet.profiler.behave_shell._thresholds import (
|
from decnet.profiler.behave_shell._thresholds import (
|
||||||
|
LAYOUT_AZERTY_ENG_MAX,
|
||||||
|
LAYOUT_AZERTY_Q_MIN,
|
||||||
|
LAYOUT_MIN_TYPED_LETTERS,
|
||||||
|
LAYOUT_QWERTY_ENG_MIN,
|
||||||
|
LAYOUT_QWERTZ_Y_MAX,
|
||||||
|
LAYOUT_QWERTZ_Z_MIN,
|
||||||
|
LAYOUT_TOP_ENG_BIGRAMS,
|
||||||
LOCALE_MIN_VALUE_LENGTH,
|
LOCALE_MIN_VALUE_LENGTH,
|
||||||
SHELL_TYPE_MIN_PROMPTS,
|
SHELL_TYPE_MIN_PROMPTS,
|
||||||
)
|
)
|
||||||
@@ -228,3 +236,64 @@ def locale(ctx: SessionContext) -> Iterator[Observation]:
|
|||||||
value=best_value,
|
value=best_value,
|
||||||
confidence=0.80,
|
confidence=0.80,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def keyboard_layout(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``environmental.keyboard_layout``.

    Classifies the operator's keyboard layout from the typed-only
    character histograms carried on ``ctx``. Nothing is emitted when
    ``ctx.typed_letter_count`` is below ``LAYOUT_MIN_TYPED_LETTERS`` —
    the histograms are too thin to discriminate honestly.

    Signals:

    * **English-bigram saturation** — share of the bigram counts on
      ``ctx`` that fall on ``LAYOUT_TOP_ENG_BIGRAMS``. The denominator is
      the sum of the bigram counts as stored on ``ctx``, i.e. after the
      context builder's top-N cap.
    * **Layout-artefact unigram rates** — ``q``, ``z`` and ``y`` counts
      as a fraction of all typed letters.

    Decision order (first match wins):

    1. ``q`` rate above floor AND saturation below ceiling → ``azerty``
       (AZERTY's `a` is on QWERTY's `q` position; mistypes bleed `q`)
    2. ``z`` rate above floor AND ``y`` rate below ceiling → ``qwertz``
       (QWERTZ swaps `y`/`z`)
    3. saturation at or above the QWERTY floor → ``qwerty``
    4. otherwise → ``other``

    The artefact checks run before the QWERTY check because AZERTY/QWERTZ
    operators may still hit some English bigrams.

    Confidence is 0.40 below 500 typed letters and 0.55 from 500 up.
    """
    letter_total = ctx.typed_letter_count
    if letter_total < LAYOUT_MIN_TYPED_LETTERS:
        return

    unigram_counts = ctx.typed_unigram_counts
    bigram_counts = ctx.typed_bigram_counts

    bigram_total = sum(bigram_counts.values())
    if bigram_total > 0:
        english_hits = sum(
            bigram_counts.get(pair, 0) for pair in LAYOUT_TOP_ENG_BIGRAMS
        )
        eng_saturation = english_hits / bigram_total
    else:
        eng_saturation = 0.0

    q_rate, z_rate, y_rate = (
        unigram_counts.get(letter, 0) / letter_total for letter in ("q", "z", "y")
    )

    def _classify() -> str:
        # Artefact checks first; English saturation is only the fallback.
        if q_rate > LAYOUT_AZERTY_Q_MIN and eng_saturation < LAYOUT_AZERTY_ENG_MAX:
            return "azerty"
        if z_rate > LAYOUT_QWERTZ_Z_MIN and y_rate < LAYOUT_QWERTZ_Y_MAX:
            return "qwertz"
        if eng_saturation >= LAYOUT_QWERTY_ENG_MIN:
            return "qwerty"
        return "other"

    yield make_observation(
        ctx,
        primitive="environmental.keyboard_layout",
        value=_classify(),
        confidence=0.40 if letter_total < 500 else 0.55,
    )
|
||||||
|
|||||||
@@ -238,6 +238,31 @@ SHELL_TYPE_MIN_PROMPTS: int = 3
|
|||||||
# noise and skip emission (a single 'C' or 'en' is too thin).
|
# noise and skip emission (a single 'C' or 'en' is too thin).
|
||||||
LOCALE_MIN_VALUE_LENGTH: int = 2
|
LOCALE_MIN_VALUE_LENGTH: int = 2
|
||||||
|
|
||||||
|
# ── environmental.keyboard_layout (Step F.4) ───────────────────────────────
# ANTI authorised dropping the PII boundary for this primitive: typed
# unigram/bigram histograms are carried on SessionContext and feed two
# independent layout signals:
#
#   1. English-bigram saturation (presumed-QWERTY signal)
#   2. Layout-artefact unigram rates (q for AZERTY, z/y swap for QWERTZ)
#
# Minimum number of typed letters; below this no observation is emitted.
LAYOUT_MIN_TYPED_LETTERS: int = 200
# Maximum number of bigram entries kept in the histogram. Bounds memory
# while retaining the most frequent bigrams, which drive the saturation
# signal.
LAYOUT_BIGRAM_TOP_N: int = 64
# The ten most common English bigrams. A high combined share of these is
# taken as evidence of QWERTY (the dominant English-typing layout).
LAYOUT_TOP_ENG_BIGRAMS: frozenset[str] = frozenset(
    ("th", "he", "in", "er", "an", "re", "on", "at", "nd", "ha")
)
# Layout-artefact thresholds. Rates are fractions of all typed ASCII letters.
# AZERTY: high `q` rate (mistyping AZERTY's `a`) ...
LAYOUT_AZERTY_Q_MIN: float = 0.020
# ... combined with low English-bigram saturation.
LAYOUT_AZERTY_ENG_MAX: float = 0.050
# QWERTZ: high `z` rate (German content / QWERTZ) ...
LAYOUT_QWERTZ_Z_MIN: float = 0.030
# ... combined with a low `y` rate (the y/z swap signature).
LAYOUT_QWERTZ_Y_MAX: float = 0.010
# QWERTY: English-bigram saturation floor.
LAYOUT_QWERTY_ENG_MIN: float = 0.080
|
||||||
|
|
||||||
# ── motor.keystroke_cadence (Step B.1) ──────────────────────────────────────
|
# ── motor.keystroke_cadence (Step B.1) ──────────────────────────────────────
|
||||||
# Typing bursts split at gaps > IKI_THINK_MAX_S so think-pauses between
|
# Typing bursts split at gaps > IKI_THINK_MAX_S so think-pauses between
|
||||||
# commands don't inflate the within-burst CV. Mirrors the prototype's
|
# commands don't inflate the within-burst CV. Mirrors the prototype's
|
||||||
|
|||||||
@@ -0,0 +1,79 @@
|
|||||||
|
"""Step F.4: ``environmental.keyboard_layout``."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from decnet.profiler.behave_shell import extract_session
|
||||||
|
from decnet.profiler.behave_shell._parse import AsciinemaEvent
|
||||||
|
|
||||||
|
|
||||||
|
PRIMITIVE = "environmental.keyboard_layout"
|
||||||
|
|
||||||
|
|
||||||
|
def _of(observations: list, primitive: str):
|
||||||
|
obs = [o for o in observations if o.primitive == primitive]
|
||||||
|
assert len(obs) == 1, f"expected exactly one {primitive}, got {len(obs)}"
|
||||||
|
return obs[0]
|
||||||
|
|
||||||
|
|
||||||
|
def _typed_session(text: str, t0: float = 0.0, dt: float = 0.05) -> list[AsciinemaEvent]:
|
||||||
|
"""Type ``text`` char-by-char and run as one command."""
|
||||||
|
events: list[AsciinemaEvent] = [
|
||||||
|
(t0 + i * dt, "i", c) for i, c in enumerate(text)
|
||||||
|
]
|
||||||
|
events.append((t0 + len(text) * dt, "i", "\r"))
|
||||||
|
return events
|
||||||
|
|
||||||
|
|
||||||
|
def test_below_min_typed_letters_no_emission() -> None:
    """A two-letter session is below the sample floor, so nothing is emitted."""
    observations = list(extract_session(_typed_session("hi"), sid="kl-tiny"))
    layout_obs = [o for o in observations if o.primitive == PRIMITIVE]
    assert layout_obs == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_english_text_emits_qwerty() -> None:
    """English prose, repeated to clear LAYOUT_MIN_TYPED_LETTERS, → qwerty."""
    sentence = "the quick brown fox jumps over the lazy dog and then he ran inside the house "
    events = _typed_session(sentence * 5)
    observations = list(extract_session(events, sid="kl-en"))
    assert _of(observations, PRIMITIVE).value == "qwerty"
|
||||||
|
|
||||||
|
|
||||||
|
def test_french_with_q_artifacts_emits_azerty() -> None:
    """High `q` rate AND low English saturation → azerty.

    The text is dominated by `q`-runs and consonant clusters that form
    none of the top-10 English bigrams (no `er` / `he` / `th`).
    """
    q_runs = "qqqqqqq " * 50
    clusters = "qsdfg " * 30 + "qpkml " * 30
    events = _typed_session(q_runs + clusters)
    observations = list(extract_session(events, sid="kl-fr"))
    assert _of(observations, PRIMITIVE).value == "azerty"
|
||||||
|
|
||||||
|
|
||||||
|
def test_german_with_z_artifacts_emits_qwertz() -> None:
    """High `z` rate AND low `y` rate → qwertz."""
    # Simulated German text: many `z`, no `y` at all.
    phrase = (
        "zwei zauber zaehlen zwischen zwanzig zelten "
        "zaubern zwanzig zwerge zaehlen zaubern zwanzig "
    )
    events = _typed_session(phrase * 5)
    observations = list(extract_session(events, sid="kl-de"))
    assert _of(observations, PRIMITIVE).value == "qwertz"
|
||||||
|
|
||||||
|
|
||||||
|
def test_random_low_signal_emits_other() -> None:
    """Non-English text with no layout artefacts → other."""
    # No top-10 English bigrams and no `q` / `z` letters in the sample.
    events = _typed_session("kpfm vbnj wxlc " * 30)
    observations = list(extract_session(events, sid="kl-other"))
    assert _of(observations, PRIMITIVE).value == "other"
|
||||||
|
|
||||||
|
|
||||||
|
def test_pasted_text_does_not_count() -> None:
    """A long paste must not drive layout — only typed characters count.

    The whole text arrives as one multi-character input event, which is
    expected to be paste-class (assuming the paste threshold is well
    below its length). F.4's histograms exclude pastes, so the typed
    letter count stays at zero and emission is skipped.
    """
    pasted = "the quick brown fox jumps over the lazy dog " * 10
    events: list[AsciinemaEvent] = [
        (0.0, "i", pasted),
        (1.0, "i", "\r"),
    ]
    observations = list(extract_session(events, sid="kl-paste"))
    layout_obs = [o for o in observations if o.primitive == PRIMITIVE]
    assert layout_obs == []
|
||||||
Reference in New Issue
Block a user