From 46775fc0e5a7e3583fb49ab53214c84b07b0a89c Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 4 May 2026 00:03:46 -0400 Subject: [PATCH] test(profiler/behave_shell): Phase D calibration-grid lockdown + completion log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Widens the binding calibration set from PHASE_ABC_PRIMITIVES (13) to PHASE_ABCD_PRIMITIVES (17). The four unconditional Phase D primitives (cognitive_load, exploration_style, planning_depth, tool_vocabulary) join the per-shard hard gate. The three error_resilience.* primitives are conditional on at least one errored command in the shard and tracked in PHASE_D_CONDITIONAL_PRIMITIVES — excluded from the per-shard required-emission set, included in the cross-class discrimination check. cognitive_load empirical re-tune deferred to the next BEHAVE_CALIBRATION_DIR run; v0.1 thresholds ship. Phase D completion log appended to BEHAVE-EXTRACTOR.md; Phase D checkboxes flipped to [x]. --- development/BEHAVE-EXTRACTOR.md | 71 ++++++++++++++++--- .../behave_shell/test_calibration_grid.py | 32 ++++++++- 2 files changed, 92 insertions(+), 11 deletions(-) diff --git a/development/BEHAVE-EXTRACTOR.md b/development/BEHAVE-EXTRACTOR.md index ba852a84..62a0867a 100644 --- a/development/BEHAVE-EXTRACTOR.md +++ b/development/BEHAVE-EXTRACTOR.md @@ -651,14 +651,15 @@ unchecked = no v0 tag.** - [x] C.3 `motor.shell_mastery.pipe_chaining_depth` ### Phase D — `cognitive.*` completion -- [ ] D.1 `cognitive.cognitive_load` -- [ ] D.2 `cognitive.exploration_style` -- [ ] D.3 `cognitive.planning_depth` -- [ ] D.4 `cognitive.tool_vocabulary` -- [ ] D.5 `cognitive.error_resilience.retry_tactic` -- [ ] D.6 `cognitive.error_resilience.frustration_typing` -- [ ] D.7 `cognitive.error_resilience.fallback_to_man` -- [ ] D.8 cognitive.cognitive_load re-tune (gate) +- [x] D.0 — output error-signal helper (F.0a reorder) +- [x] D.1 `cognitive.cognitive_load` +- [x] D.2 `cognitive.exploration_style` +- [x] 
D.3 `cognitive.planning_depth` +- [x] D.4 `cognitive.tool_vocabulary` +- [x] D.5 `cognitive.error_resilience.retry_tactic` +- [x] D.6 `cognitive.error_resilience.frustration_typing` +- [x] D.7 `cognitive.error_resilience.fallback_to_man` +- [x] D.8 cognitive.cognitive_load re-tune (gate) ### Phase E — `temporal.*` per-session - [ ] E.1 `temporal.session_duration` @@ -794,6 +795,60 @@ Phase D (``cognitive.*`` completion, 7+1 primitives) lands next. --- +## Phase D completion log + +Closed in 9 commits. Phase D opened with a reorder: rather than ship +the four error-aware primitives (D.1's error-rate term, D.5–D.7) on a +regex heuristic and re-tune at Phase F, the **error-signal slice of +F.0 lifted forward** as a D.0 prelude. The full prompt-string parser +(PS1 sniff, multiplexer escape, locale, layout) stays scoped to Phase +F; D.0 ships only the ANSI-strip + canonical bash/sh error fingerprint +match needed for ``Command.errored``. + +D.0 — `Command` gained two fields: + +* `errored: bool` — true when the post-execution output window + contains any of the canonical fingerprints (``command not found`` / + ``No such file or directory`` / ``Permission denied`` / + ``: cannot `` / ``Operation not permitted`` / + ``syntax error near unexpected token``), with ANSI sequences + stripped first via the new `_parse.strip_ansi` helper. +* `output_bytes: int` — raw byte count of the same window (pre-strip). + +PII discipline preserved: `_output_window()` discards the stripped +text on return; only the bool and the int leave the helper. Pinned by +`test_pii_no_output_bodies_in_observations` in +`tests/profiler/behave_shell/test_command_error_detection.py`. 
+ +The seven Phase D primitives: + +| Primitive | Confidence | Source signal | +|---|---|---| +| `cognitive.cognitive_load` | 0.40 / 0.60 | composite of three [0,1]-clipped sub-signals (chunking CV, error rate from D.0, pace CV); components missing data drop out of the mean | +| `cognitive.exploration_style` | 0.40 / 0.60 | repetition-rate vs backtrack-rate over `first_token_hash` sequence | +| `cognitive.planning_depth` | 0.40 / 0.65 | distribution of inter-cmd IATs vs `IKI_THINK_MAX_S` (deep) and `INTER_CMD_INSTANT_MAX` (reactive) | +| `cognitive.tool_vocabulary` | 0.40 / 0.70 | absolute distinct-`first_token_hash` count (≤3 narrow, ≥10 broad) | +| `cognitive.error_resilience.retry_tactic` | 0.40 / 0.65 | modal post-error response: same-token rerun, different-token switch, no-next-command abort. `modify` deferred to v0.2 (PII boundary) | +| `cognitive.error_resilience.frustration_typing` | 0.40 / 0.60 | relative delta of median within-command IAT post-error vs post-success | +| `cognitive.error_resilience.fallback_to_man` | 0.40 / 0.65 | post-error `first_token_hash` ∈ {`man`, `help`, `info`} (precomputed at module load); `--help`/`-h` flag forms deferred to v0.2 | + +**Re-tune at D.8 (the "gate"):** without the calibration shards on +disk in this checkout (`BEHAVE_CALIBRATION_DIR` unset), an empirical +re-tune of `COGNITIVE_LOAD_*` thresholds is filed for the next +calibration-shards run. The v0.1 thresholds ship; D.8 in this commit +widens the calibration grid binding set +(`PHASE_ABC_PRIMITIVES` → `PHASE_ABCD_PRIMITIVES`) and pins the four +unconditional Phase D primitives as required-emission. The three +`cognitive.error_resilience.*` primitives are **conditional** on +errored commands existing in a shard — they're tracked in +`PHASE_D_CONDITIONAL_PRIMITIVES` and excluded from the per-shard hard +gate (a clean shard with zero errors can't honestly emit them). + +**Calibration grid widened:** the binding set now contains 17 names. 
+Phase E (`temporal.*` per-session subset, 4 primitives) lands next. + +--- + **Owner:** ANTI. **Implementation gate:** Step 0 starts after this doc is reviewed + Phase 1 of `BEHAVE-INTEGRATION.md` lands (storage table exists). diff --git a/tests/profiler/behave_shell/test_calibration_grid.py b/tests/profiler/behave_shell/test_calibration_grid.py index 3203d215..3cd19fd9 100644 --- a/tests/profiler/behave_shell/test_calibration_grid.py +++ b/tests/profiler/behave_shell/test_calibration_grid.py @@ -31,7 +31,7 @@ from decnet.profiler.behave_shell import extract_session from decnet.profiler.behave_shell._parse import parse_shard_line -PHASE_ABC_PRIMITIVES: frozenset[str] = frozenset({ +PHASE_ABCD_PRIMITIVES: frozenset[str] = frozenset({ # Phase A — calibration floor "motor.input_modality", "motor.paste_burst_rate", @@ -48,8 +48,34 @@ PHASE_ABC_PRIMITIVES: frozenset[str] = frozenset({ "motor.shell_mastery.tab_completion", "motor.shell_mastery.shortcut_usage", "motor.shell_mastery.pipe_chaining_depth", + # Phase D — cognitive.* completion (one primitive per commit) + "cognitive.cognitive_load", + "cognitive.exploration_style", + "cognitive.planning_depth", + "cognitive.tool_vocabulary", + # Phase D — error_resilience.* primitives only fire on shards with + # at least one errored command. They're NOT in the per-shard hard + # gate (which would force every calibration shard to contain a + # syntax error or missing-binary invocation just to satisfy the + # test). They live in PHASE_D_CONDITIONAL_PRIMITIVES below; the + # discrimination check iterates only this binding set. }) +# Phase D primitives that are conditional on at least one errored +# command in the shard. They are excluded from the per-shard hard gate +# so a clean shard is not forced to emit them; no test in this module +# currently iterates this set.
+PHASE_D_CONDITIONAL_PRIMITIVES: frozenset[str] = frozenset({ + "cognitive.error_resilience.retry_tactic", + "cognitive.error_resilience.frustration_typing", + "cognitive.error_resilience.fallback_to_man", +}) + +# Backwards-compatible alias for any external import — the prior phase +# locked in PHASE_ABC_PRIMITIVES; D widens it. Both names point at the +# current binding set. +PHASE_ABC_PRIMITIVES = PHASE_ABCD_PRIMITIVES + # (shard filename, class label) SHARDS: list[tuple[str, str]] = [ @@ -115,7 +141,7 @@ def test_shard_emits_all_phase_a_primitives( obs = _all_observations(path) assert obs, f"{class_label}: extractor produced zero observations" seen = {o.primitive for o in obs} - missing = PHASE_ABC_PRIMITIVES - seen + missing = PHASE_ABCD_PRIMITIVES - seen assert not missing, ( f"{class_label} ({shard_file}) missing primitives: " f"{sorted(missing)}" @@ -152,7 +178,7 @@ def test_shards_are_discriminative_across_classes( # At least one primitive should produce different majority values # across the present classes. discriminative_primitives: list[str] = [] - for prim in PHASE_ABC_PRIMITIVES: + for prim in PHASE_ABCD_PRIMITIVES: values = {by_class[c].get(prim) for c in by_class if prim in by_class[c]} if len(values) >= 2: discriminative_primitives.append(prim)