feat(profiler/behave_shell): emit environmental.keyboard_layout

ANTI authorised dropping the PII boundary for this primitive. ctx gains typed_unigram_counts / typed_bigram_counts / typed_letter_count, populated by a dedicated single pass over the input events (paste-class events excluded).

Two-axis classifier:
* layout-artefact unigrams take priority — q rate above floor with low English saturation → azerty; z above floor with y below → qwertz
* fallback to English-bigram saturation: ≥ floor → qwerty, else other

Sample-size floor is 200 typed letters; the bigram histogram is capped at top-64 to bound memory. Confidence cap stays moderate (0.40-0.55) — heuristic discriminator.
2026-05-04 00:38:24 -04:00
parent b7ff5d2cc1
commit cd7c7ea5a2
5 changed files with 227 additions and 1 deletions
--- a/decnet/profiler/behave_shell/_ctx.py
+++ b/decnet/profiler/behave_shell/_ctx.py
@@ -12,7 +12,7 @@ from __future__ import annotations

 import math
 from dataclasses import dataclass, field
-from typing import Iterable
+from typing import Iterable, Mapping

 from decnet.profiler.behave_shell._parse import (
    AsciinemaEvent,
@@ -26,6 +26,7 @@ from decnet.profiler.behave_shell._parse import (
 )
 from decnet.profiler.behave_shell._thresholds import (
    IKI_THINK_MAX_S,
+    LAYOUT_BIGRAM_TOP_N,
    PASTE_BURST_MAX_IAT_S,
    PASTE_MIN_CHARS_PER_EVENT,
    PROMPT_LINE_MAX_CHARS,
@@ -69,6 +70,12 @@ class SessionContext:
    # Step F.0 derivations — PS1 prompt lines detected in the output stream
    prompt_lines: tuple[PromptLine, ...] = field(default_factory=tuple)

+    # Step F.4 derivations — typed-only character histograms for keyboard
+    # layout fingerprinting (PII boundary lifted by ANTI for Phase F).
+    typed_unigram_counts: Mapping[str, int] = field(default_factory=dict)
+    typed_bigram_counts: Mapping[str, int] = field(default_factory=dict)
+    typed_letter_count: int = 0
+

 def _detect_paste_bursts(
    inputs: list[AsciinemaEvent],
@@ -300,6 +307,46 @@ def _output_bytes_between(
    return sum(len(d) for t, _k, d in outputs if start <= t < end)


+def _typed_char_histograms(
+    inputs: list[AsciinemaEvent],
+) -> tuple[Mapping[str, int], Mapping[str, int], int]:
+    """Build lowercase letter unigram + bigram histograms from typed input.
+
+    Paste-class events (``len(data) >= PASTE_MIN_CHARS_PER_EVENT``) are
+    skipped — pasted text reveals nothing about the operator's keyboard.
+    ASCII letters are lowercased; adjacent letters form a bigram even
+    across events, and any other character or a paste breaks the chain.
+
+    Returns ``(unigrams, bigrams, total_letters)``. Only the
+    ``LAYOUT_BIGRAM_TOP_N`` most frequent bigrams are returned (ties keep
+    first-seen order); the cap is applied after the walk, so it bounds
+    the returned mapping rather than the working histogram.
+    """
+    unigrams: dict[str, int] = {}
+    bigrams: dict[str, int] = {}
+    total_letters = 0
+    last_letter: str | None = None  # previous typed letter; None while the chain is broken
+    for _t, _kind, data in inputs:
+        if len(data) >= PASTE_MIN_CHARS_PER_EVENT:
+            last_letter = None  # a paste must not bridge two typed letters
+            continue
+        for c in data:
+            if c.isascii() and c.isalpha():
+                lower = c.lower()
+                unigrams[lower] = unigrams.get(lower, 0) + 1
+                total_letters += 1
+                if last_letter is not None:
+                    big = last_letter + lower
+                    bigrams[big] = bigrams.get(big, 0) + 1
+                last_letter = lower
+            else:
+                last_letter = None  # any non-ASCII-letter character ends the chain
+    if len(bigrams) > LAYOUT_BIGRAM_TOP_N:  # keep only the head of the distribution
+        top = sorted(bigrams.items(), key=lambda kv: -kv[1])[:LAYOUT_BIGRAM_TOP_N]
+        bigrams = dict(top)
+    return unigrams, bigrams, total_letters
+
+
 def _output_window(
    outputs: list[AsciinemaEvent],
    start: float,
@@ -385,6 +432,7 @@ def build_session_context(
        for i in range(len(commands) - 1)
    )
    intra_command_iats = _per_command_iats(commands, inputs)
+    typed_uni, typed_bi, typed_letters = _typed_char_histograms(inputs)

    return SessionContext(
        sid=sid,
@@ -407,4 +455,7 @@ def build_session_context(
        kill_line_count=kill_line_count,
        intra_command_iats=intra_command_iats,
        prompt_lines=prompt_lines,
+        typed_unigram_counts=typed_uni,
+        typed_bigram_counts=typed_bi,
+        typed_letter_count=typed_letters,
    )
--- a/decnet/profiler/behave_shell/_features/init.py
+++ b/decnet/profiler/behave_shell/_features/init.py
@@ -25,6 +25,7 @@ from decnet.profiler.behave_shell._features.cognitive import (
    inter_command_latency_class,
 )
 from decnet.profiler.behave_shell._features.environmental import (
+    keyboard_layout,
    locale,
    shell_type,
    terminal_multiplexer,
@@ -75,4 +76,5 @@ FEATURES: tuple[FeatureFn, ...] = (
    shell_type,
    terminal_multiplexer,
    locale,
+    keyboard_layout,
 )
--- a/decnet/profiler/behave_shell/_features/environmental.py
+++ b/decnet/profiler/behave_shell/_features/environmental.py
@@ -8,6 +8,7 @@ which F.1 / F.3 / E.4 read.
 Step F.1: ``environmental.shell_type``.
 Step F.2: ``environmental.terminal_multiplexer``.
 Step F.3: ``environmental.locale``.
+Step F.4: ``environmental.keyboard_layout``.
 """
 from __future__ import annotations

@@ -21,6 +22,13 @@ from decnet.profiler.behave_shell._ctx import SessionContext
 from decnet.profiler.behave_shell._features._emit import make_observation
 from decnet.profiler.behave_shell._parse import PromptLine, strip_ansi
 from decnet.profiler.behave_shell._thresholds import (
+    LAYOUT_AZERTY_ENG_MAX,
+    LAYOUT_AZERTY_Q_MIN,
+    LAYOUT_MIN_TYPED_LETTERS,
+    LAYOUT_QWERTY_ENG_MIN,
+    LAYOUT_QWERTZ_Y_MAX,
+    LAYOUT_QWERTZ_Z_MIN,
+    LAYOUT_TOP_ENG_BIGRAMS,
    LOCALE_MIN_VALUE_LENGTH,
    SHELL_TYPE_MIN_PROMPTS,
 )
@@ -228,3 +236,64 @@ def locale(ctx: SessionContext) -> Iterator[Observation]:
        value=best_value,
        confidence=0.80,
    )
+
+
+def keyboard_layout(ctx: SessionContext) -> Iterator[Observation]:
+    """Emit ``environmental.keyboard_layout``.
+
+    Two signals are read from the typed-only character histograms:
+
+    1. **English-bigram saturation** — share of the counts in
+       ``ctx.typed_bigram_counts`` that fall on ``LAYOUT_TOP_ENG_BIGRAMS``.
+    2. **Layout-artefact unigrams** — rates of ``q``, ``z`` and ``y``
+       over all typed ASCII letters.
+
+    Decision chain, first match wins:
+
+       * ``q`` rate above floor AND saturation below its cap → ``azerty``
+         (AZERTY's `a` is on QWERTY's `q` position; mistypes bleed `q`)
+       * ``z`` rate above floor AND ``y`` rate below its cap → ``qwertz``
+         (QWERTZ swaps `y`/`z`)
+       * saturation at or above its floor → ``qwerty``; else ``other``
+
+    Artefact checks fire before QWERTY because AZERTY/QWERTZ operators
+    may still hit some English bigrams. Confidence is 0.40 below 500
+    typed letters and 0.55 otherwise.
+
+    Nothing is emitted when the typed letter count is below
+    ``LAYOUT_MIN_TYPED_LETTERS`` — the histograms are too thin.
+    """
+    if ctx.typed_letter_count < LAYOUT_MIN_TYPED_LETTERS:
+        return
+    uni = ctx.typed_unigram_counts
+    bi = ctx.typed_bigram_counts
+    total_letters = ctx.typed_letter_count
+    total_bigrams = sum(bi.values())  # NOTE(review): bi is top-N capped upstream, so the tail is excluded here
+
+    eng_saturation = (
+        sum(bi.get(b, 0) for b in LAYOUT_TOP_ENG_BIGRAMS) / total_bigrams
+        if total_bigrams > 0 else 0.0
+    )
+    q_rate = uni.get("q", 0) / total_letters  # total_letters is at least the emission floor here
+    z_rate = uni.get("z", 0) / total_letters
+    y_rate = uni.get("y", 0) / total_letters
+
+    if q_rate > LAYOUT_AZERTY_Q_MIN and eng_saturation < LAYOUT_AZERTY_ENG_MAX:
+        value = "azerty"
+    elif z_rate > LAYOUT_QWERTZ_Z_MIN and y_rate < LAYOUT_QWERTZ_Y_MAX:
+        value = "qwertz"
+    elif eng_saturation >= LAYOUT_QWERTY_ENG_MIN:
+        value = "qwerty"
+    else:
+        value = "other"
+
+    if total_letters < 500:  # thinner samples get the lower confidence tier
+        confidence = 0.40
+    else:
+        confidence = 0.55
+    yield make_observation(
+        ctx,
+        primitive="environmental.keyboard_layout",
+        value=value,
+        confidence=confidence,
+    )
--- a/decnet/profiler/behave_shell/_thresholds.py
+++ b/decnet/profiler/behave_shell/_thresholds.py
@@ -238,6 +238,31 @@ SHELL_TYPE_MIN_PROMPTS: int = 3
 # noise and skip emission (a single 'C' or 'en' is too thin).
 LOCALE_MIN_VALUE_LENGTH: int = 2

+# ── environmental.keyboard_layout (Step F.4) ───────────────────────────────
+# ANTI authorised dropping the PII boundary for this primitive — typed
+# bigram/unigram histograms ride on SessionContext to feed two
+# complementary layout signals:
+#
+#   1. English-bigram saturation (presumed-QWERTY signal)
+#   2. Layout-artefact unigram rates (q for AZERTY, z/y swap for QWERTZ)
+#
+# Sample-size floor; below this typed-letter-count we skip emission.
+LAYOUT_MIN_TYPED_LETTERS: int = 200
+# Cap on the bigram histogram stored on the context — keeps it small
+# while retaining the top bigrams that drive the saturation signal.
+LAYOUT_BIGRAM_TOP_N: int = 64
+# Top-10 English bigrams. A high combined share of these among the typed
+# bigrams is read as QWERTY (the dominant English-typing layout).
+LAYOUT_TOP_ENG_BIGRAMS: frozenset[str] = frozenset({
+    "th", "he", "in", "er", "an", "re", "on", "at", "nd", "ha",
+})
+# Thresholds: q/z/y values are shares of typed letters, *_ENG_* of typed bigrams.
+LAYOUT_AZERTY_Q_MIN: float = 0.020      # high `q` rate (mistyping AZERTY's `a`)
+LAYOUT_AZERTY_ENG_MAX: float = 0.050    # AND low English-bigram saturation
+LAYOUT_QWERTZ_Z_MIN: float = 0.030      # high `z` rate (German content / QWERTZ)
+LAYOUT_QWERTZ_Y_MAX: float = 0.010      # AND `y` swap signature (very low `y` rate)
+LAYOUT_QWERTY_ENG_MIN: float = 0.080    # English-bigram saturation floor
+
 # ── motor.keystroke_cadence (Step B.1) ──────────────────────────────────────
 # Typing bursts split at gaps > IKI_THINK_MAX_S so think-pauses between
 # commands don't inflate the within-burst CV. Mirrors the prototype's
--- a/tests/profiler/behave_shell/test_environmental_keyboard_layout.py
+++ b/tests/profiler/behave_shell/test_environmental_keyboard_layout.py
@@ -0,0 +1,79 @@
+"""Step F.4: ``environmental.keyboard_layout``."""
+from __future__ import annotations
+
+from decnet.profiler.behave_shell import extract_session
+from decnet.profiler.behave_shell._parse import AsciinemaEvent
+
+
+PRIMITIVE = "environmental.keyboard_layout"
+
+
+def _of(observations: list, primitive: str):  # return the single observation for ``primitive``
+    obs = [o for o in observations if o.primitive == primitive]
+    assert len(obs) == 1, f"expected exactly one {primitive}, got {len(obs)}"  # zero or duplicates fail here
+    return obs[0]
+
+
+def _typed_session(text: str, t0: float = 0.0, dt: float = 0.05) -> list[AsciinemaEvent]:
+    """Build one ``"i"`` event per char of ``text``, ``dt`` apart from ``t0``, then a carriage return."""
+    events: list[AsciinemaEvent] = [
+        (t0 + i * dt, "i", c) for i, c in enumerate(text)  # single-char events, so none is paste-sized
+    ]
+    events.append((t0 + len(text) * dt, "i", "\r"))  # terminating Enter keystroke
+    return events
+
+
+def test_below_min_typed_letters_no_emission() -> None:
+    out = list(extract_session(_typed_session("hi"), sid="kl-tiny"))  # two letters, far below the emission floor
+    assert [o for o in out if o.primitive == PRIMITIVE] == []  # primitive must be absent, not defaulted
+
+
+def test_english_text_emits_qwerty() -> None:
+    """English sentence (pangram plus a clause) repeated to clear LAYOUT_MIN_TYPED_LETTERS."""
+    pangram = "the quick brown fox jumps over the lazy dog and then he ran inside the house "
+    text = pangram * 5
+    obs = _of(list(extract_session(_typed_session(text), sid="kl-en")), PRIMITIVE)
+    assert obs.value == "qwerty"
+
+
+def test_french_with_q_artifacts_emits_azerty() -> None:
+    """High `q` rate AND low English saturation → azerty.
+
+    Synthetic text dominated by `q`-runs and consonant clusters whose
+    bigrams never land on the top-10 English set (no `er` / `he` / `th`).
+    """
+    text = ("qqqqqqq " * 50 + "qsdfg " * 30 + "qpkml " * 30)
+    obs = _of(list(extract_session(_typed_session(text), sid="kl-fr")), PRIMITIVE)
+    assert obs.value == "azerty"
+
+
+def test_german_with_z_artifacts_emits_qwertz() -> None:
+    """High `z` rate AND low `y` rate → qwertz."""
+    # German-like text: lots of z, no y; it has English bigrams too, but the z/y rule is checked first
+    text = (
+        "zwei zauber zaehlen zwischen zwanzig zelten "
+        "zaubern zwanzig zwerge zaehlen zaubern zwanzig "
+    ) * 5
+    obs = _of(list(extract_session(_typed_session(text), sid="kl-de")), PRIMITIVE)
+    assert obs.value == "qwertz"
+
+
+def test_random_low_signal_emits_other() -> None:
+    """Non-English text with zero English-bigram saturation and no artefact letters → other."""
+    # Consonant clusters containing none of the top-10 English bigrams and no q / z / y
+    text = ("kpfm vbnj wxlc " * 30)
+    obs = _of(list(extract_session(_typed_session(text), sid="kl-other")), PRIMITIVE)
+    assert obs.value == "other"
+
+
+def test_pasted_text_does_not_count() -> None:
+    """A long paste shouldn't drive layout — only typed chars count.
+
+    Send everything as a single 'paste-class' input event (length >=
+    ``PASTE_MIN_CHARS_PER_EVENT``, i.e. 4 chars if unchanged — confirm):
+    pastes are excluded, so the letter count stays 0 and emission is skipped.
+    """
+    pangram = "the quick brown fox jumps over the lazy dog " * 10
+    events: list[AsciinemaEvent] = [(0.0, "i", pangram), (1.0, "i", "\r")]  # one bulk event plus Enter
+    out = list(extract_session(events, sid="kl-paste"))
+    assert [o for o in out if o.primitive == PRIMITIVE] == []