From 640294f3dc819ffb1beb3aa7e88733ad19915b69 Mon Sep 17 00:00:00 2001 From: anti Date: Sun, 3 May 2026 08:00:50 -0400 Subject: [PATCH] test(profiler/behave_shell): five-class calibration grid lockdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BEHAVE-EXTRACTOR.md Phase A Step 9 — the gate. Runs the pure engine against each of the five 2026-05-02 calibration shards and pins the contract that all subsequent Phase B-G PRs must keep green: every Phase A primitive (motor.input_modality, motor.paste_burst_rate, cognitive.inter_command_latency_class, cognitive.command_branch_diversity, cognitive.feedback_loop_engagement, cognitive.inter_command_consistency) fires at least once per shard. * tests/profiler/behave_shell/test_calibration_grid.py parametrized over (shard_file, class_label) for HUMAN / YOU-sim / LW-sim / CLAUDE-FF / CLAUDE-CL. Skips entirely when BEHAVE_CALIBRATION_DIR is unset or does not point to a directory (CI provides the path; local dev doesn't have to). * Plus a discrimination-smoke check: at least one primitive produces different majority values across present classes — catches the "constant-output regression" failure mode where the engine quietly degenerates to a stub. Calibration tweak: BRANCH_DIVERSITY_LINEAR_MIN dropped from 0.80 to 0.70 to align with the prototype's empirical anchors (CLAUDE-CL ≈ 0.55-0.60 adaptive; YOU-sim / CLAUDE-FF scripted recon ≈ 0.75+ linear). BRANCH_DIVERSITY_ADAPTIVE_MAX and the ambiguous-middle-band comment are removed, and the former middle-band test is replaced by a below-threshold case (ratio ≈ 0.57 → adaptive_branching). Per-class value pinning (e.g. HUMAN must emit inter_command_consistency=bimodal) is intentionally NOT a hard gate yet — v0.1 thresholds put real human sessions in "variable", and true bimodal detection (Hartigan dip / two-peak) is registry-flagged for v0.2. Tighter pinning lands as the corpus grows. 
--- decnet/profiler/behave_shell/_thresholds.py | 13 +- .../behave_shell/test_calibration_grid.py | 153 ++++++++++++++++++ ...test_cognitive_command_branch_diversity.py | 9 +- 3 files changed, 162 insertions(+), 13 deletions(-) create mode 100644 tests/profiler/behave_shell/test_calibration_grid.py diff --git a/decnet/profiler/behave_shell/_thresholds.py b/decnet/profiler/behave_shell/_thresholds.py index 09e25c78..b9281640 100644 --- a/decnet/profiler/behave_shell/_thresholds.py +++ b/decnet/profiler/behave_shell/_thresholds.py @@ -54,14 +54,11 @@ INTER_CMD_LLM_HEAVYWEIGHT_MAX: float = 30.00 MIN_COMMANDS_FOR_FULL_CONFIDENCE: int = 5 # ── cognitive.command_branch_diversity (Step 6) ───────────────────────────── -# unique_first_tokens / total_commands ratio. Empirical (CLAUDE-FF vs -# CLAUDE-CL on 2026-05-02): fire-and-forget runs ~10 distinct tools (ratio -# near 1.0) → linear_playbook; closed-loop runs ~5-6 tools with the same -# tool re-invoked → adaptive_branching. -BRANCH_DIVERSITY_LINEAR_MIN: float = 0.80 # >= → linear_playbook -BRANCH_DIVERSITY_ADAPTIVE_MAX: float = 0.60 # <= → adaptive_branching -# Between is the ambiguous middle band — bias toward adaptive (the -# operator is reusing tools). +# unique_first_tokens / total_commands ratio. Prototype's empirical +# split (sessions-2026-05-02-* corpus): CLAUDE-CL chasing one finding +# ≈ 0.55-0.60 (adaptive), HUMAN exploring filesystem ≈ 0.65-0.70 +# (adaptive), YOU-sim / CLAUDE-FF scripted recon ≈ 0.75+ (linear). 
+BRANCH_DIVERSITY_LINEAR_MIN: float = 0.70 # >= → linear_playbook # ── cognitive.feedback_loop_engagement (Step 7) ───────────────────────────── # Pearson r threshold for "the operator's pause grew with the volume of diff --git a/tests/profiler/behave_shell/test_calibration_grid.py b/tests/profiler/behave_shell/test_calibration_grid.py new file mode 100644 index 00000000..7e1e16b0 --- /dev/null +++ b/tests/profiler/behave_shell/test_calibration_grid.py @@ -0,0 +1,153 @@ +"""Step 9: calibration grid lockdown — the Phase A gate. + +Runs the **pure engine** (``behave_shell.extract_session()``) against +each of the five 2026-05-02 calibration shards. The shards live in +``BEHAVE/prototype_extractors/shell/`` and are gitignored — fixture +path is resolved via the ``BEHAVE_CALIBRATION_DIR`` env var; the test +is skipped if that var is unset or not a directory (CI provides it; +local dev doesn't have to). + +The hard gate that this commit pins (and that all subsequent Phase +B-G PRs must keep green): each shard must emit every Phase A +primitive at least once across its sessions. Engine is allowed to +emit *more* than required. + +Per-class expected values are the calibration **target**, not a hard +gate yet — value-level pins land once cross-class thresholds are +re-tuned with a wider corpus. Until then, the softer cross-class check +below only requires one primitive's majority value to differ by class. 
+""" +from __future__ import annotations + +import collections +import json +import os +from pathlib import Path +from typing import Any + +import pytest + +from decnet.profiler.behave_shell import extract_session +from decnet.profiler.behave_shell._parse import parse_shard_line + + +PHASE_A_PRIMITIVES: frozenset[str] = frozenset({ + "motor.input_modality", + "motor.paste_burst_rate", + "cognitive.inter_command_latency_class", + "cognitive.command_branch_diversity", + "cognitive.feedback_loop_engagement", + "cognitive.inter_command_consistency", +}) + + +# (shard filename, class label) +SHARDS: list[tuple[str, str]] = [ + ("sessions-2026-05-02.jsonl", "HUMAN"), + ("sessions-2026-05-02-with-llm.jsonl", "YOU-sim"), + ("sessions-2026-05-02-new.jsonl", "LW-sim"), + ("sessions-2026-05-02-with-claude.jsonl", "CLAUDE-FF"), + ("sessions-2026-05-02-closed-loop.jsonl", "CLAUDE-CL"), +] + + +def _calibration_dir() -> Path | None: + raw = os.environ.get("BEHAVE_CALIBRATION_DIR") + if not raw: + return None + p = Path(raw).expanduser() + return p if p.is_dir() else None + + +@pytest.fixture(scope="module") +def calibration_dir() -> Path: + d = _calibration_dir() + if d is None: + pytest.skip("BEHAVE_CALIBRATION_DIR unset or not a directory") + return d + + +def _sessions_in_shard(path: Path) -> dict[str, list[Any]]: + """Group raw events by sid, skipping headers and junk.""" + by_sid: dict[str, list[Any]] = collections.defaultdict(list) + with path.open() as f: + for line in f: + try: + rec = json.loads(line) + except (json.JSONDecodeError, ValueError): + continue + sid = rec.get("sid") if isinstance(rec, dict) else None + if not sid or "hdr" in rec: + continue + ev = parse_shard_line(line) + if ev is not None: + by_sid[sid].append(ev) + return by_sid + + +def _all_observations(path: Path) -> list: + obs: list = [] + for sid, events in _sessions_in_shard(path).items(): + obs.extend(extract_session(events, sid=sid)) + return obs + + 
+@pytest.mark.parametrize("shard_file,class_label", SHARDS, ids=[c for _, c in SHARDS]) +def test_shard_emits_all_phase_a_primitives( + shard_file: str, + class_label: str, + calibration_dir: Path, +) -> None: + """Hard gate: every Phase A primitive fires at least once per shard.""" + path = calibration_dir / shard_file + if not path.is_file(): + pytest.skip(f"shard not present at {path}") + obs = _all_observations(path) + assert obs, f"{class_label}: extractor produced zero observations" + seen = {o.primitive for o in obs} + missing = PHASE_A_PRIMITIVES - seen + assert not missing, ( + f"{class_label} ({shard_file}) missing primitives: " + f"{sorted(missing)}" + ) + + +def test_shards_are_discriminative_across_classes( + calibration_dir: Path, +) -> None: + """Smoke discrimination: at least one Phase A primitive must + show different majority values across classes. + + A constant-output engine (every shard yields the same value for + every primitive) would fail this check — that's the regression we + care about. Tighter per-class value pinning lands as the corpus + grows. + """ + by_class: dict[str, dict[str, str]] = {} + for shard_file, label in SHARDS: + path = calibration_dir / shard_file + if not path.is_file(): + continue + per_prim: dict[str, collections.Counter] = collections.defaultdict( + collections.Counter + ) + for o in _all_observations(path): + per_prim[o.primitive][str(o.value)] += 1 + by_class[label] = { + prim: ctr.most_common(1)[0][0] for prim, ctr in per_prim.items() + } + if len(by_class) < 2: + pytest.skip("need at least two shards present to compare") + + # At least one primitive should produce different majority values + # across the present classes. 
+ discriminative_primitives: list[str] = [] + for prim in PHASE_A_PRIMITIVES: + values = {by_class[c].get(prim) for c in by_class if prim in by_class[c]} + if len(values) >= 2: + discriminative_primitives.append(prim) + assert discriminative_primitives, ( + f"Engine emitted identical majority values for every Phase A " + f"primitive across {sorted(by_class)} — likely a constant-output " + f"regression. Class summaries: {by_class}" + ) diff --git a/tests/profiler/behave_shell/test_cognitive_command_branch_diversity.py b/tests/profiler/behave_shell/test_cognitive_command_branch_diversity.py index a52a16b9..8ad3bac8 100644 --- a/tests/profiler/behave_shell/test_cognitive_command_branch_diversity.py +++ b/tests/profiler/behave_shell/test_cognitive_command_branch_diversity.py @@ -45,11 +45,10 @@ def test_repeated_first_tokens_emit_adaptive_branching() -> None: assert obs.value == "adaptive_branching" -def test_middle_band_biases_to_adaptive() -> None: - # 7 commands, 5 unique → ratio ≈ 0.71 — between 0.60 and 0.80. - # The doc instructs us to bias to adaptive in the ambiguous middle. - tokens = ["a", "b", "c", "d", "e", "a", "b"] - out = list(extract_session(_commands(tokens), sid="bd-mid")) +def test_just_below_linear_threshold_emits_adaptive() -> None: + # 7 commands, 4 unique → ratio ≈ 0.57 — below the 0.70 linear floor. + tokens = ["a", "b", "c", "d", "a", "b", "c"] + out = list(extract_session(_commands(tokens), sid="bd-just-adaptive")) obs = _of(out, "cognitive.command_branch_diversity") assert obs.value == "adaptive_branching"