From cd7c7ea5a22db93a014d73946d07e137a2008536 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 4 May 2026 00:38:24 -0400 Subject: [PATCH] feat(profiler/behave_shell): emit environmental.keyboard_layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANTI authorised dropping the PII boundary for this primitive. ctx gains typed_unigram_counts / typed_bigram_counts / typed_letter_count populated during the existing single-pass input walk (paste-class events excluded). Two-axis classifier: * layout-artefact unigrams take priority — q rate above floor with low English saturation → azerty; z above floor with y below → qwertz * fallback to English-bigram saturation: ≥ floor → qwerty, else other Sample-size floor 200 typed letters; bigram histogram capped at top-64 to bound memory. Confidence cap stays moderate (0.40-0.55) — heuristic discriminator. --- decnet/profiler/behave_shell/_ctx.py | 53 ++++++++++++- .../behave_shell/_features/__init__.py | 2 + .../behave_shell/_features/environmental.py | 69 ++++++++++++++++ decnet/profiler/behave_shell/_thresholds.py | 25 ++++++ .../test_environmental_keyboard_layout.py | 79 +++++++++++++++++++ 5 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 tests/profiler/behave_shell/test_environmental_keyboard_layout.py diff --git a/decnet/profiler/behave_shell/_ctx.py b/decnet/profiler/behave_shell/_ctx.py index dba044a5..d43e8f4e 100644 --- a/decnet/profiler/behave_shell/_ctx.py +++ b/decnet/profiler/behave_shell/_ctx.py @@ -12,7 +12,7 @@ from __future__ import annotations import math from dataclasses import dataclass, field -from typing import Iterable +from typing import Iterable, Mapping from decnet.profiler.behave_shell._parse import ( AsciinemaEvent, @@ -26,6 +26,7 @@ from decnet.profiler.behave_shell._parse import ( ) from decnet.profiler.behave_shell._thresholds import ( IKI_THINK_MAX_S, + LAYOUT_BIGRAM_TOP_N, PASTE_BURST_MAX_IAT_S, PASTE_MIN_CHARS_PER_EVENT, 
def _typed_char_histograms(
    inputs: list[AsciinemaEvent],
) -> tuple[Mapping[str, int], Mapping[str, int], int]:
    """Build typed-only letter unigram and bigram histograms from input events.

    Each event is unpacked as ``(time, kind, data)``; only ``data`` is read.
    Two classes of event are skipped whole, and each also breaks any bigram
    chain in progress:

    * paste-class events (``len(data) >= PASTE_MIN_CHARS_PER_EVENT``) —
      pasted text reveals nothing about the operator's keyboard;
    * events whose data starts with ESC (0x1B) — terminal control sequences
      such as cursor keys (``ESC [ A``) end in an ASCII letter that was
      never typed as text, and would otherwise be counted as one.

    Within the remaining events only ASCII letters are counted, folded to
    lower case. Bigrams chain only across consecutive ASCII letters; any
    other character (digit, space, punctuation, control) breaks the chain.
    The chain is *not* reset between consecutive kept events, so letters
    typed one event at a time still form bigrams.

    Returns:
        ``(unigrams, bigrams, total_letters)``. ``total_letters`` equals
        the sum of the unigram counts. ``bigrams`` is cut down to the
        ``LAYOUT_BIGRAM_TOP_N`` highest-count entries when it holds more
        than that; ties keep first-seen order. The cut happens after the
        walk, so it limits what is stored on the context rather than peak
        memory during the walk (keys are pairs of lower-case ASCII
        letters, so at most 26 * 26 exist). Callers that sum the returned
        bigram counts only see the retained entries.
    """
    unigrams: dict[str, int] = {}
    bigrams: dict[str, int] = {}
    total_letters = 0
    prev: str | None = None  # previous letter of the current chain, if any
    for _t, _kind, data in inputs:
        # NOTE(review): the ESC test assumes the recorder delivers a key's
        # whole escape sequence as one input event — confirm against the
        # capture path.
        if len(data) >= PASTE_MIN_CHARS_PER_EVENT or data.startswith("\x1b"):
            prev = None
            continue
        for ch in data:
            if not (ch.isascii() and ch.isalpha()):
                prev = None
                continue
            letter = ch.lower()
            unigrams[letter] = unigrams.get(letter, 0) + 1
            total_letters += 1
            if prev is not None:
                pair = prev + letter
                bigrams[pair] = bigrams.get(pair, 0) + 1
            prev = letter
    if len(bigrams) > LAYOUT_BIGRAM_TOP_N:
        # sorted() is stable, so equal counts keep their first-seen order.
        ranked = sorted(bigrams.items(), key=lambda kv: -kv[1])
        bigrams = dict(ranked[:LAYOUT_BIGRAM_TOP_N])
    return unigrams, bigrams, total_letters
def keyboard_layout(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``environmental.keyboard_layout``.

    Classifies the operator's keyboard layout from the typed-only letter
    histograms on ``ctx`` and yields at most one observation whose value
    is ``"azerty"``, ``"qwertz"``, ``"qwerty"`` or ``"other"``.

    Signals derived from ``ctx``:

    * **English-bigram saturation** — summed count of the bigrams in
      ``LAYOUT_TOP_ENG_BIGRAMS`` divided by the sum of all counts in
      ``ctx.typed_bigram_counts`` (``0.0`` when that histogram is empty).
      The denominator is whatever the histogram holds; if it was truncated
      upstream, the ratio is relative to the retained entries only.
    * **Layout-artefact unigram rates** — the ``q``, ``z`` and ``y``
      counts, each divided by ``ctx.typed_letter_count``.

    Decision order (first match wins):

    1. ``q`` rate > ``LAYOUT_AZERTY_Q_MIN`` AND saturation <
       ``LAYOUT_AZERTY_ENG_MAX`` → ``azerty``
       (AZERTY's `a` is on QWERTY's `q` position; mistypes bleed `q`).
    2. ``z`` rate > ``LAYOUT_QWERTZ_Z_MIN`` AND ``y`` rate <
       ``LAYOUT_QWERTZ_Y_MAX`` → ``qwertz`` (QWERTZ swaps `y`/`z`).
    3. saturation >= ``LAYOUT_QWERTY_ENG_MIN`` → ``qwerty``.
    4. otherwise → ``other``.

    The layout-artefact rules run before the QWERTY rule because
    AZERTY/QWERTZ operators may still hit some English bigrams. The two
    signals are not independent: the AZERTY rule consults both.

    Nothing is emitted when the typed letter count is below
    ``LAYOUT_MIN_TYPED_LETTERS`` — the histograms are too thin to
    discriminate honestly. Confidence is 0.40 below 500 typed letters and
    0.55 from 500 upward.
    """
    total_letters = ctx.typed_letter_count
    # The second test keeps the rate divisions below safe even if the
    # sample-size floor is ever tuned down to zero.
    if total_letters < LAYOUT_MIN_TYPED_LETTERS or total_letters <= 0:
        return
    uni = ctx.typed_unigram_counts
    bi = ctx.typed_bigram_counts
    total_bigrams = sum(bi.values())

    if total_bigrams > 0:
        eng_hits = sum(bi.get(b, 0) for b in LAYOUT_TOP_ENG_BIGRAMS)
        eng_saturation = eng_hits / total_bigrams
    else:
        eng_saturation = 0.0
    q_rate = uni.get("q", 0) / total_letters
    z_rate = uni.get("z", 0) / total_letters
    y_rate = uni.get("y", 0) / total_letters

    if q_rate > LAYOUT_AZERTY_Q_MIN and eng_saturation < LAYOUT_AZERTY_ENG_MAX:
        value = "azerty"
    elif z_rate > LAYOUT_QWERTZ_Z_MIN and y_rate < LAYOUT_QWERTZ_Y_MAX:
        value = "qwertz"
    elif eng_saturation >= LAYOUT_QWERTY_ENG_MIN:
        value = "qwerty"
    else:
        value = "other"

    # Heuristic discriminator: confidence stays moderate, with a small
    # step up once the sample reaches 500 typed letters.
    confidence = 0.40 if total_letters < 500 else 0.55
    yield make_observation(
        ctx,
        primitive="environmental.keyboard_layout",
        value=value,
        confidence=confidence,
    )
# thin).
LOCALE_MIN_VALUE_LENGTH: int = 2

# ── environmental.keyboard_layout (Step F.4) ───────────────────────────────
# ANTI authorised dropping the PII boundary for this primitive: typed
# unigram/bigram histograms ride on SessionContext and feed two layout
# signals:
#
#   1. English-bigram saturation (presumed-QWERTY signal)
#   2. Layout-artefact unigram rates (q for AZERTY, z/y swap for QWERTZ)

# Minimum number of typed ASCII letters; emission is skipped below this.
LAYOUT_MIN_TYPED_LETTERS: int = 200

# Maximum number of bigram entries kept in the histogram — only the
# highest-count bigrams, which drive the saturation signal, are retained.
LAYOUT_BIGRAM_TOP_N: int = 64

# The ten English bigrams whose summed share of typed bigrams is the
# saturation signal; a high share presumes QWERTY (the dominant
# English-typing layout).
LAYOUT_TOP_ENG_BIGRAMS: frozenset[str] = frozenset(
    ("th", "he", "in", "er", "an", "re", "on", "at", "nd", "ha")
)

# English-bigram saturation bounds (fractions of typed bigrams).
LAYOUT_QWERTY_ENG_MIN: float = 0.080   # at or above this → qwerty
LAYOUT_AZERTY_ENG_MAX: float = 0.050   # azerty also needs saturation below this

# Layout-artefact unigram bounds. Fractions are over total ASCII letters
# typed.
LAYOUT_AZERTY_Q_MIN: float = 0.020     # high `q` rate (mistyping AZERTY's `a`)
LAYOUT_QWERTZ_Z_MIN: float = 0.030     # high `z` rate (German content / QWERTZ)
LAYOUT_QWERTZ_Y_MAX: float = 0.010     # paired with a low `y` rate (y/z swap)

# ── motor.keystroke_cadence (Step B.1) ──────────────────────────────────────
# Typing bursts split at gaps > IKI_THINK_MAX_S so think-pauses between
# commands don't inflate the within-burst CV.
"""Step F.4: ``environmental.keyboard_layout``."""
from __future__ import annotations

from decnet.profiler.behave_shell import extract_session
from decnet.profiler.behave_shell._parse import AsciinemaEvent


PRIMITIVE = "environmental.keyboard_layout"


def _of(observations: list, primitive: str):
    """Return the single observation for ``primitive``; fail unless exactly one exists."""
    matches = [o for o in observations if o.primitive == primitive]
    count = len(matches)
    assert count == 1, f"expected exactly one {primitive}, got {count}"
    return matches[0]


def _typed_session(text: str, t0: float = 0.0, dt: float = 0.05) -> list[AsciinemaEvent]:
    """Build one input event per character of ``text``, ``dt`` apart, then a final ``\\r``."""
    events: list[AsciinemaEvent] = []
    for idx, ch in enumerate(text):
        events.append((t0 + idx * dt, "i", ch))
    events.append((t0 + len(text) * dt, "i", "\r"))
    return events


def _layout_observations(events: list[AsciinemaEvent], sid: str) -> list:
    """Run the extractor and keep only the keyboard-layout observations."""
    return [o for o in extract_session(events, sid=sid) if o.primitive == PRIMITIVE]


def test_below_min_typed_letters_no_emission() -> None:
    """Two typed letters are far below the sample-size floor: nothing is emitted."""
    assert _layout_observations(_typed_session("hi"), "kl-tiny") == []


def test_english_text_emits_qwerty() -> None:
    """English sentence repeated to clear LAYOUT_MIN_TYPED_LETTERS (200)."""
    pangram = "the quick brown fox jumps over the lazy dog and then he ran inside the house "
    events = _typed_session(pangram * 5)
    observations = list(extract_session(events, sid="kl-en"))
    assert _of(observations, PRIMITIVE).value == "qwerty"


def test_french_with_q_artifacts_emits_azerty() -> None:
    """High `q` rate AND low English saturation → azerty.

    The text is dominated by `q`-runs and consonant clusters that do not
    form top-10 English bigrams (no `er` / `he` / `th`).
    """
    text = ("qqqqqqq " * 50 + "qsdfg " * 30 + "qpkml " * 30)
    observations = list(extract_session(_typed_session(text), sid="kl-fr"))
    assert _of(observations, PRIMITIVE).value == "azerty"


def test_german_with_z_artifacts_emits_qwertz() -> None:
    """High `z` rate AND low `y` rate → qwertz."""
    # German-looking text: lots of z, no y.
    text = (
        "zwei zauber zaehlen zwischen zwanzig zelten "
        "zaubern zwanzig zwerge zaehlen zaubern zwanzig "
    ) * 5
    observations = list(extract_session(_typed_session(text), sid="kl-de"))
    assert _of(observations, PRIMITIVE).value == "qwertz"


def test_random_low_signal_emits_other() -> None:
    """Non-English, low-bigram-saturation text → other."""
    # No English digraphs and no artefact unigrams (q / z).
    text = ("kpfm vbnj wxlc " * 30)
    observations = list(extract_session(_typed_session(text), sid="kl-other"))
    assert _of(observations, PRIMITIVE).value == "other"


def test_pasted_text_does_not_count() -> None:
    """A long paste shouldn't drive layout — only typed chars count.

    Everything arrives as a single 'paste-class' input event (>= 4 chars):
    F.4's histograms exclude pastes, so the typed letter count stays at
    zero and emission is skipped.
    """
    pangram = "the quick brown fox jumps over the lazy dog " * 10
    events: list[AsciinemaEvent] = [(0.0, "i", pangram), (1.0, "i", "\r")]
    assert _layout_observations(events, "kl-paste") == []