test(profiler/behave_shell): Phase D calibration-grid lockdown + completion log

Widens the binding calibration set from PHASE_ABC_PRIMITIVES (13) to PHASE_ABCD_PRIMITIVES (17). The four unconditional Phase D primitives (cognitive_load, exploration_style, planning_depth, tool_vocabulary) join the per-shard hard gate. The three error_resilience.* primitives are conditional on at least one errored command in the shard and tracked in PHASE_D_CONDITIONAL_PRIMITIVES — excluded from the per-shard required-emission set, included in the cross-class discrimination check. cognitive_load empirical re-tune deferred to the next BEHAVE_CALIBRATION_DIR run; v0.1 thresholds ship. Phase D completion log appended to BEHAVE-EXTRACTOR.md; Phase D checkboxes flipped to [x].
2026-05-04 00:03:46 -04:00
parent 0fba6b6113
commit 46775fc0e5
2 changed files with 92 additions and 11 deletions
--- a/tests/profiler/behave_shell/test_calibration_grid.py
+++ b/tests/profiler/behave_shell/test_calibration_grid.py
@@ -31,7 +31,7 @@ from decnet.profiler.behave_shell import extract_session
 from decnet.profiler.behave_shell._parse import parse_shard_line


-PHASE_ABC_PRIMITIVES: frozenset[str] = frozenset({
+PHASE_ABCD_PRIMITIVES: frozenset[str] = frozenset({
    # Phase A — calibration floor
    "motor.input_modality",
    "motor.paste_burst_rate",
@@ -48,8 +48,34 @@ PHASE_ABC_PRIMITIVES: frozenset[str] = frozenset({
    "motor.shell_mastery.tab_completion",
    "motor.shell_mastery.shortcut_usage",
    "motor.shell_mastery.pipe_chaining_depth",
+    # Phase D — cognitive.* completion (one primitive per commit)
+    "cognitive.cognitive_load",
+    "cognitive.exploration_style",
+    "cognitive.planning_depth",
+    "cognitive.tool_vocabulary",
+    # Phase D — error_resilience.* primitives only fire on shards with
+    # at least one errored command. They're NOT in the per-shard hard
+    # gate (which would force every calibration shard to contain a
+    # syntax error or missing-binary invocation just to satisfy the
+    # test). They ARE included in the discrimination check below as
+    # "if you have them, they should agree across-class".
 })

+# Phase D primitives that are conditional on at least one errored
+# command in the shard. These widen the universe the calibration grid
+# *checks* for discriminative output but don't force every shard to
+# emit them.
+PHASE_D_CONDITIONAL_PRIMITIVES: frozenset[str] = frozenset({
+    "cognitive.error_resilience.retry_tactic",
+    "cognitive.error_resilience.frustration_typing",
+    "cognitive.error_resilience.fallback_to_man",
+})
+
+# Backwards-compatible alias for any external import — the prior phase
+# locked in PHASE_ABC_PRIMITIVES; D widens it. Both names point at the
+# current binding set.
+PHASE_ABC_PRIMITIVES = PHASE_ABCD_PRIMITIVES
+

 # (shard filename, class label)
 SHARDS: list[tuple[str, str]] = [
@@ -115,7 +141,7 @@ def test_shard_emits_all_phase_a_primitives(
    obs = _all_observations(path)
    assert obs, f"{class_label}: extractor produced zero observations"
    seen = {o.primitive for o in obs}
-    missing = PHASE_ABC_PRIMITIVES - seen
+    missing = PHASE_ABCD_PRIMITIVES - seen
    assert not missing, (
        f"{class_label} ({shard_file}) missing primitives: "
        f"{sorted(missing)}"
@@ -152,7 +178,7 @@ def test_shards_are_discriminative_across_classes(
    # At least one primitive should produce different majority values
    # across the present classes.
    discriminative_primitives: list[str] = []
-    for prim in PHASE_ABC_PRIMITIVES:
+    for prim in PHASE_ABCD_PRIMITIVES:
        values = {by_class[c].get(prim) for c in by_class if prim in by_class[c]}
        if len(values) >= 2:
            discriminative_primitives.append(prim)