feat(profiler/behave_shell): emit cognitive.error_resilience.frustration_typing

Compares the median within-command IAT of commands that follow an errored command against that of commands that follow a successful one. The relative absolute delta is bucketed into low / moderate / high. Emission is skipped when either group is empty (no errors, or no clean baseline). v0.1; D.8 re-tunes.
2026-05-04 00:00:36 -04:00
parent b704352783
commit 8183218d29
4 changed files with 172 additions and 0 deletions
--- a/decnet/profiler/behave_shell/_features/init.py
+++ b/decnet/profiler/behave_shell/_features/init.py
@@ -14,6 +14,7 @@ from decnet.profiler.behave_shell._ctx import SessionContext
 from decnet.profiler.behave_shell._features.cognitive import (
    cognitive_load,
    command_branch_diversity,
    error_resilience_frustration_typing,
    error_resilience_retry_tactic,
    exploration_style,
    feedback_loop_engagement,
@@ -55,4 +56,5 @@ FEATURES: tuple[FeatureFn, ...] = (
    planning_depth,
    tool_vocabulary,
    error_resilience_retry_tactic,
    error_resilience_frustration_typing,
 )
--- a/decnet/profiler/behave_shell/_features/cognitive.py
+++ b/decnet/profiler/behave_shell/_features/cognitive.py
@@ -25,6 +25,8 @@ from decnet.profiler.behave_shell._thresholds import (
    EXPLORATION_TARGETED_REP_MIN,
    FEEDBACK_CORRELATION_MIN,
    FEEDBACK_MIN_PAIRS,
    FRUSTRATION_LOW_MAX,
    FRUSTRATION_MODERATE_MAX,
    IKI_THINK_MAX_S,
    INTER_CMD_DELIBERATE_MAX,
    INTER_CMD_INSTANT_MAX,
@@ -186,6 +188,61 @@ def feedback_loop_engagement(ctx: SessionContext) -> Iterator[Observation]:
    )
 def error_resilience_frustration_typing(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.error_resilience.frustration_typing``.

    Every command after the first that has at least one within-command
    IAT contributes the median of those IATs to one of two groups,
    selected by whether the *preceding* command errored. The medians of
    the two groups are then compared as a relative absolute delta
    against the post-success group and bucketed into ``low`` /
    ``moderate`` / ``high``. A large delta means typing speed changed
    after a failure, in either direction (faster or slower).

    Nothing is emitted when:

    * there are fewer than two commands, or the per-command IAT list
      does not line up one-to-one with the commands;
    * either group is empty (no errors, or no clean baseline);
    * the post-success median is not strictly positive, since it is the
      divisor.

    Confidence is 0.40 when the post-error group holds fewer than
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` samples, 0.60 otherwise.
    """
    commands = ctx.commands
    per_command_iats = ctx.intra_command_iats
    if len(commands) < 2 or len(per_command_iats) != len(commands):
        return

    after_error: list[float] = []
    after_success: list[float] = []
    for prev_idx in range(len(commands) - 1):
        iats = per_command_iats[prev_idx + 1]
        if not iats:
            # Nothing to measure for this command (no within-command gaps).
            continue
        typical_gap = statistics.median(iats)
        group = after_error if commands[prev_idx].errored else after_success
        group.append(typical_gap)

    if not (after_error and after_success):
        return

    post_error_median = statistics.median(after_error)
    baseline_median = statistics.median(after_success)
    if baseline_median <= 0.0:
        return

    relative_shift = abs(post_error_median - baseline_median) / baseline_median

    # First bound the shift falls strictly below wins; otherwise "high".
    level = "high"
    for upper_bound, label in (
        (FRUSTRATION_LOW_MAX, "low"),
        (FRUSTRATION_MODERATE_MAX, "moderate"),
    ):
        if relative_shift < upper_bound:
            level = label
            break

    enough_samples = len(after_error) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.error_resilience.frustration_typing",
        value=level,
        confidence=0.60 if enough_samples else 0.40,
    )
 def error_resilience_retry_tactic(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.error_resilience.retry_tactic``.
--- a/decnet/profiler/behave_shell/_thresholds.py
+++ b/decnet/profiler/behave_shell/_thresholds.py
@@ -155,6 +155,21 @@ PLANNING_REACTIVE_MIN: float = 0.50
 TOOL_VOCAB_NARROW_MAX: int = 3
 TOOL_VOCAB_BROAD_MIN: int = 10
 # ── cognitive.error_resilience.frustration_typing (Step D.6) ───────────────
 # Compare the median within-command IAT of commands *following* an
 # errored command against the same statistic for commands following a
 # successful command. The relative absolute delta:
 #
 #   delta = |median_post_error - median_post_success| / median_post_success
 #
 # is bucketed with exclusive upper bounds (strict ``<``), so a delta that
 # is exactly equal to a bound lands in the next bucket up:
 #
 #   delta < FRUSTRATION_LOW_MAX        → low
 #   delta < FRUSTRATION_MODERATE_MAX   → moderate
 #   else                               → high
 #
 # Both values are dimensionless fractions of the post-success median
 # (0.10 = a 10 % change in either direction).
 #
 # v0.1; D.8 re-tunes.
 FRUSTRATION_LOW_MAX: float = 0.10
 FRUSTRATION_MODERATE_MAX: float = 0.30
 # ── motor.keystroke_cadence (Step B.1) ──────────────────────────────────────
 # Typing bursts split at gaps > IKI_THINK_MAX_S so think-pauses between
 # commands don't inflate the within-burst CV. Mirrors the prototype's
--- a/tests/profiler/behave_shell/test_cognitive_error_resilience_frustration_typing.py
+++ b/tests/profiler/behave_shell/test_cognitive_error_resilience_frustration_typing.py
@@ -0,0 +1,98 @@
 """Step D.6: ``cognitive.error_resilience.frustration_typing``."""
 from __future__ import annotations
 from decnet.profiler.behave_shell import extract_session
 from decnet.profiler.behave_shell._parse import AsciinemaEvent
 PRIMITIVE = "cognitive.error_resilience.frustration_typing"
 def _of(observations: list, primitive: str):
    """Return the one observation whose ``primitive`` matches.

    Fails with an ``AssertionError`` unless exactly one observation in
    ``observations`` carries the requested primitive.
    """
    matches = list(filter(lambda o: o.primitive == primitive, observations))
    found = len(matches)
    assert found == 1, f"expected exactly one {primitive}, got {found}"
    return matches[0]
 def _typed(text: str, t0: float, dt: float) -> list[AsciinemaEvent]:
    """Return one ``"i"`` event per character of ``text``.

    The k-th character (0-based) is stamped ``t0 + k * dt``.
    """
    stamps = (t0 + k * dt for k in range(len(text)))
    return [(stamp, "i", char) for stamp, char in zip(stamps, text)]
 def _build(blocks: list[tuple[str, bool, float]]) -> list[AsciinemaEvent]:
    """Synthesise a session from ``(token, errored, dt)`` tuples.

    The i-th command starts at ``i * 2.0`` seconds and is typed as
    ``token`` followed by a carriage return, with ``dt`` seconds between
    keystrokes. A single output event follows 0.10 s after the carriage
    return keystroke: a bash "command not found" line when ``errored``
    is true, ``"ok\\n"`` otherwise.
    """
    timeline: list[AsciinemaEvent] = []
    for slot, (token, failed, gap) in enumerate(blocks):
        start = slot * 2.0
        timeline += _typed(f"{token}\r", t0=start, dt=gap)
        # The "\r" is keystroke number len(token), hence this timestamp.
        enter_at = start + len(token) * gap
        reply = f"bash: {token}: command not found\n" if failed else "ok\n"
        timeline.append((enter_at + 0.10, "o", reply))
    return timeline
 def test_no_errors_no_emission() -> None:
    """No block is flagged as errored, so the primitive must not be emitted."""
    clean_session = _build([("ls", False, 0.05) for _ in range(5)])
    observations = list(extract_session(clean_session, sid="ft-clean"))
    assert not any(o.primitive == PRIMITIVE for o in observations)
 def test_no_baseline_no_emission() -> None:
    """Every command errored — no clean baseline → skip emission."""
    all_failed = _build([("foo", True, 0.05) for _ in range(5)])
    observations = list(extract_session(all_failed, sid="ft-allerr"))
    assert not any(o.primitive == PRIMITIVE for o in observations)
 def test_matching_speeds_emit_low() -> None:
    """Same dt for post-error and post-success commands → delta ≈ 0 → low."""
    ok = ("ok", False, 0.05)
    err = ("foo", True, 0.05)
    # Each ``ok`` right after an ``err`` is a post-error sample; every
    # other non-initial command is a post-success sample. All share dt=0.05.
    blocks = [ok, ok, err, ok, ok, err, ok, ok]
    observations = list(extract_session(_build(blocks), sid="ft-low"))
    assert _of(observations, PRIMITIVE).value == "low"
 def test_huge_speed_change_emits_high() -> None:
    """Post-error commands typed 4x slower than post-success → delta=3 → high."""
    fast_ok = ("ok", False, 0.05)
    slow_ok = ("ok", False, 0.20)  # 4x slower; used right after each error
    err = ("foo", True, 0.05)
    blocks = [fast_ok, fast_ok, err, slow_ok, fast_ok, err, slow_ok, fast_ok]
    observations = list(extract_session(_build(blocks), sid="ft-high"))
    assert _of(observations, PRIMITIVE).value == "high"
 def test_low_post_error_count_reduces_confidence() -> None:
    """One post-error sample must score lower confidence than six."""
    ok = ("ok", False, 0.05)
    err = ("foo", True, 0.05)
    short = [ok, err, ok, ok]
    # A leading ``ok`` followed by six (error, ok) pairs.
    full_blocks = [ok] + [err, ok] * 6
    sparse = _of(list(extract_session(_build(short), sid="ft-short")), PRIMITIVE)
    dense = _of(list(extract_session(_build(full_blocks), sid="ft-full")), PRIMITIVE)
    assert sparse.confidence < dense.confidence