Files
DECNET/tests/profiler/behave_shell/test_environmental_keyboard_layout.py
anti cd7c7ea5a2 feat(profiler/behave_shell): emit environmental.keyboard_layout
ANTI authorised dropping the PII boundary for this primitive. ctx
gains typed_unigram_counts / typed_bigram_counts / typed_letter_count
populated during the existing single-pass input walk (paste-class
events excluded).

Two-axis classifier:
* layout-artefact unigrams take priority — q rate above floor with
  low English saturation → azerty; z above floor with y below → qwertz
* fallback to English-bigram saturation: ≥ floor → qwerty, else other

Sample-size floor 200 typed letters; bigram histogram capped at
top-64 to bound memory. Confidence cap stays moderate (0.40-0.55) —
heuristic discriminator.
2026-05-04 00:38:24 -04:00

80 lines
3.0 KiB
Python

"""Step F.4: ``environmental.keyboard_layout``."""
from __future__ import annotations
from decnet.profiler.behave_shell import extract_session
from decnet.profiler.behave_shell._parse import AsciinemaEvent
PRIMITIVE = "environmental.keyboard_layout"
def _of(observations: list, primitive: str):
obs = [o for o in observations if o.primitive == primitive]
assert len(obs) == 1, f"expected exactly one {primitive}, got {len(obs)}"
return obs[0]
def _typed_session(text: str, t0: float = 0.0, dt: float = 0.05) -> list[AsciinemaEvent]:
"""Type ``text`` char-by-char and run as one command."""
events: list[AsciinemaEvent] = [
(t0 + i * dt, "i", c) for i, c in enumerate(text)
]
events.append((t0 + len(text) * dt, "i", "\r"))
return events
def test_below_min_typed_letters_no_emission() -> None:
out = list(extract_session(_typed_session("hi"), sid="kl-tiny"))
assert [o for o in out if o.primitive == PRIMITIVE] == []
def test_english_text_emits_qwerty() -> None:
"""Pangram repeated to clear LAYOUT_MIN_TYPED_LETTERS (200)."""
pangram = "the quick brown fox jumps over the lazy dog and then he ran inside the house "
text = pangram * 5
obs = _of(list(extract_session(_typed_session(text), sid="kl-en")), PRIMITIVE)
assert obs.value == "qwerty"
def test_french_with_q_artifacts_emits_azerty() -> None:
"""High `q` rate AND low English saturation → azerty.
Construct text dominated by `q`-runs and consonant clusters that
don't form top-10 English bigrams (avoiding `er` / `he` / `th`).
"""
text = ("qqqqqqq " * 50 + "qsdfg " * 30 + "qpkml " * 30)
obs = _of(list(extract_session(_typed_session(text), sid="kl-fr")), PRIMITIVE)
assert obs.value == "azerty"
def test_german_with_z_artifacts_emits_qwertz() -> None:
"""High `z` rate AND low `y` rate → qwertz."""
# German text simulation: lots of z, almost no y
text = (
"zwei zauber zaehlen zwischen zwanzig zelten "
"zaubern zwanzig zwerge zaehlen zaubern zwanzig "
) * 5
obs = _of(list(extract_session(_typed_session(text), sid="kl-de")), PRIMITIVE)
assert obs.value == "qwertz"
def test_random_low_signal_emits_other() -> None:
"""Random non-English low-bigram-saturation text → other."""
# Generate text without English digraphs and without artifact unigrams
text = ("kpfm vbnj wxlc " * 30)
obs = _of(list(extract_session(_typed_session(text), sid="kl-other")), PRIMITIVE)
assert obs.value == "other"
def test_pasted_text_does_not_count() -> None:
"""A long paste shouldn't drive layout — only typed chars count.
Send everything as a single 'paste-class' input event (>= 4 chars):
F.4's histograms exclude pastes, so the typed letter count stays
at zero and emission is skipped.
"""
pangram = "the quick brown fox jumps over the lazy dog " * 10
events: list[AsciinemaEvent] = [(0.0, "i", pangram), (1.0, "i", "\r")]
out = list(extract_session(events, sid="kl-paste"))
assert [o for o in out if o.primitive == PRIMITIVE] == []