test(profiler/behave_shell): five-class calibration grid lockdown

BEHAVE-EXTRACTOR.md Phase A Step 9 — the gate. Runs the pure engine against each of the five 2026-05-02 calibration shards and pins the contract that all subsequent Phase B-G PRs must keep green: every Phase A primitive (motor.input_modality, motor.paste_burst_rate, cognitive.inter_command_latency_class, cognitive.command_branch_diversity, cognitive.feedback_loop_engagement, cognitive.inter_command_consistency) fires at least once per shard. * tests/profiler/behave_shell/test_calibration_grid.py parametrized over (shard_file, class_label) for HUMAN / YOU-sim / LW-sim / CLAUDE-FF / CLAUDE-CL. Skips entirely when BEHAVE_CALIBRATION_DIR is unset or does not point at a directory (CI provides the path; local dev doesn't have to). * Plus a discrimination-smoke check: at least one primitive produces different majority values across present classes — catches the "constant-output regression" failure mode where the engine quietly degenerates to a stub. Calibration tweak: BRANCH_DIVERSITY_LINEAR_MIN dropped from 0.80 to 0.70 to align with the prototype's empirical anchors (CLAUDE-CL ≈ 0.55-0.60 adaptive; YOU-sim / CLAUDE-FF scripted recon ≈ 0.75+ linear), and the separate BRANCH_DIVERSITY_ADAPTIVE_MAX (0.60) constant is removed. The old middle-band test (ratio ≈ 0.71, which is now at or above the linear floor) is replaced by a case at ratio ≈ 0.57, below the new floor, that must emit adaptive_branching. Per-class value pinning (e.g. HUMAN must emit inter_command_consistency=bimodal) is intentionally NOT a hard gate yet — v0.1 thresholds put real human sessions in "variable", and true bimodal detection (Hartigan dip / two-peak) is registry-flagged for v0.2. Tighter pinning lands as the corpus grows.
2026-05-03 08:00:50 -04:00
parent 842b7de950
commit 640294f3dc
3 changed files with 162 additions and 13 deletions
--- a/decnet/profiler/behave_shell/_thresholds.py
+++ b/decnet/profiler/behave_shell/_thresholds.py
@@ -54,14 +54,11 @@ INTER_CMD_LLM_HEAVYWEIGHT_MAX: float = 30.00
 MIN_COMMANDS_FOR_FULL_CONFIDENCE: int = 5

 # ── cognitive.command_branch_diversity (Step 6) ─────────────────────────────
-# unique_first_tokens / total_commands ratio. Empirical (CLAUDE-FF vs
-# CLAUDE-CL on 2026-05-02): fire-and-forget runs ~10 distinct tools (ratio
-# near 1.0) → linear_playbook; closed-loop runs ~5-6 tools with the same
-# tool re-invoked → adaptive_branching.
-BRANCH_DIVERSITY_LINEAR_MIN: float = 0.80   # >= → linear_playbook
-BRANCH_DIVERSITY_ADAPTIVE_MAX: float = 0.60  # <= → adaptive_branching
-# Between is the ambiguous middle band — bias toward adaptive (the
-# operator is reusing tools).
+# unique_first_tokens / total_commands ratio. Prototype's empirical
+# split (sessions-2026-05-02-* corpus): CLAUDE-CL chasing one finding
+# ≈ 0.55-0.60 and HUMAN exploring filesystem ≈ 0.65-0.70 (both adaptive);
+# YOU-sim / CLAUDE-FF scripted recon ≈ 0.75+ (linear). Floor is inclusive.
+BRANCH_DIVERSITY_LINEAR_MIN: float = 0.70   # >= → linear_playbook

 # ── cognitive.feedback_loop_engagement (Step 7) ─────────────────────────────
 # Pearson r threshold for "the operator's pause grew with the volume of
--- a/tests/profiler/behave_shell/test_calibration_grid.py
+++ b/tests/profiler/behave_shell/test_calibration_grid.py
@@ -0,0 +1,153 @@
+"""Step 9: calibration grid lockdown — the Phase A gate.
+
+Runs the **pure engine** (``behave_shell.extract_session()``) against
+each of the five 2026-05-02 calibration shards. The shards live in
+``BEHAVE/prototype_extractors/shell/`` and are gitignored — the fixture
+path is resolved via the ``BEHAVE_CALIBRATION_DIR`` env var; tests are
+skipped if that var is unset or not a directory, and per shard if a
+file is missing (CI provides the path; local dev doesn't have to).
+
+The hard gate that this commit pins (and that all subsequent Phase
+B-G PRs must keep green): each shard must emit every Phase A
+primitive at least once across its sessions. Engine is allowed to
+emit *more* than required.
+
+Per-class expected values (the calibration **target**) are not a hard
+gate yet — value-level pins land once cross-class thresholds are
+re-tuned with a wider corpus. Until then, a softer cross-class
+discrimination check below only requires majority values to differ.
+"""
+from __future__ import annotations
+
+import collections
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+from decnet.profiler.behave_shell import extract_session
+from decnet.profiler.behave_shell._parse import parse_shard_line
+
+
+PHASE_A_PRIMITIVES: frozenset[str] = frozenset({
+    "motor.input_modality",
+    "motor.paste_burst_rate",
+    "cognitive.inter_command_latency_class",
+    "cognitive.command_branch_diversity",
+    "cognitive.feedback_loop_engagement",
+    "cognitive.inter_command_consistency",
+})
+
+
+# (shard filename, class label)
+SHARDS: list[tuple[str, str]] = [
+    ("sessions-2026-05-02.jsonl",                "HUMAN"),
+    ("sessions-2026-05-02-with-llm.jsonl",       "YOU-sim"),
+    ("sessions-2026-05-02-new.jsonl",            "LW-sim"),
+    ("sessions-2026-05-02-with-claude.jsonl",    "CLAUDE-FF"),
+    ("sessions-2026-05-02-closed-loop.jsonl",    "CLAUDE-CL"),
+]
+
+
+def _calibration_dir() -> Path | None:
+    raw = os.environ.get("BEHAVE_CALIBRATION_DIR")
+    if not raw:
+        return None
+    p = Path(raw).expanduser()
+    return p if p.is_dir() else None
+
+
+@pytest.fixture(scope="module")
+def calibration_dir() -> Path:
+    d = _calibration_dir()
+    if d is None:
+        pytest.skip("BEHAVE_CALIBRATION_DIR unset or not a directory")
+    return d
+
+
+def _sessions_in_shard(path: Path) -> dict[str, list[Any]]:
+    """Group raw events by sid, skipping headers and junk."""
+    by_sid: dict[str, list[Any]] = collections.defaultdict(list)
+    with path.open() as f:
+        for line in f:
+            try:
+                rec = json.loads(line)
+            except (json.JSONDecodeError, ValueError):
+                continue
+            sid = rec.get("sid") if isinstance(rec, dict) else None
+            if not sid or "hdr" in rec:
+                continue
+            ev = parse_shard_line(line)
+            if ev is not None:
+                by_sid[sid].append(ev)
+    return by_sid
+
+
+def _all_observations(path: Path) -> list:
+    obs: list = []
+    for sid, events in _sessions_in_shard(path).items():
+        obs.extend(extract_session(events, sid=sid))
+    return obs
+
+
+@pytest.mark.parametrize("shard_file,class_label", SHARDS, ids=[c for _, c in SHARDS])
+def test_shard_emits_all_phase_a_primitives(
+    shard_file: str,
+    class_label: str,
+    calibration_dir: Path,
+) -> None:
+    """Hard gate: every Phase A primitive fires at least once per shard."""
+    path = calibration_dir / shard_file
+    if not path.is_file():
+        pytest.skip(f"shard not present at {path}")
+    obs = _all_observations(path)
+    assert obs, f"{class_label}: extractor produced zero observations"
+    seen = {o.primitive for o in obs}
+    missing = PHASE_A_PRIMITIVES - seen
+    assert not missing, (
+        f"{class_label} ({shard_file}) missing primitives: "
+        f"{sorted(missing)}"
+    )
+
+
+def test_shards_are_discriminative_across_classes(
+    calibration_dir: Path,
+) -> None:
+    """Smoke discrimination: at least one Phase A primitive must
+    show different majority values across classes.
+
+    A constant-output engine (every shard yields the same value for
+    every primitive) would fail this check — that's the regression we
+    care about. Tighter per-class value pinning lands as the corpus
+    grows.
+    """
+    by_class: dict[str, dict[str, str]] = {}
+    for shard_file, label in SHARDS:
+        path = calibration_dir / shard_file
+        if not path.is_file():
+            continue
+        per_prim: dict[str, collections.Counter] = collections.defaultdict(
+            collections.Counter
+        )
+        for o in _all_observations(path):
+            per_prim[o.primitive][str(o.value)] += 1
+        by_class[label] = {
+            prim: ctr.most_common(1)[0][0] for prim, ctr in per_prim.items()
+        }
+    if len(by_class) < 2:
+        pytest.skip("need at least two shards present to compare")
+
+    # At least one primitive should produce different majority values
+    # across the present classes.
+    discriminative_primitives: list[str] = []
+    for prim in PHASE_A_PRIMITIVES:
+        values = {by_class[c].get(prim) for c in by_class if prim in by_class[c]}
+        if len(values) >= 2:
+            discriminative_primitives.append(prim)
+    assert discriminative_primitives, (
+        f"Engine emitted identical majority values for every Phase A "
+        f"primitive across {sorted(by_class)} — likely a constant-output "
+        f"regression. Class summaries: {by_class}"
+    )
--- a/tests/profiler/behave_shell/test_cognitive_command_branch_diversity.py
+++ b/tests/profiler/behave_shell/test_cognitive_command_branch_diversity.py
@@ -45,11 +45,10 @@ def test_repeated_first_tokens_emit_adaptive_branching() -> None:
    assert obs.value == "adaptive_branching"


-def test_middle_band_biases_to_adaptive() -> None:
-    # 7 commands, 5 unique → ratio ≈ 0.71 — between 0.60 and 0.80.
-    # The doc instructs us to bias to adaptive in the ambiguous middle.
-    tokens = ["a", "b", "c", "d", "e", "a", "b"]
-    out = list(extract_session(_commands(tokens), sid="bd-mid"))
+def test_just_below_linear_threshold_emits_adaptive() -> None:
+    # 7 commands, 4 unique → ratio ≈ 0.57 — below the 0.70 linear floor.
+    tokens = ["a", "b", "c", "d", "a", "b", "c"]
+    out = list(extract_session(_commands(tokens), sid="bd-just-adaptive"))
    obs = _of(out, "cognitive.command_branch_diversity")
    assert obs.value == "adaptive_branching"