# DECNET/tests/profiler/behave_shell/test_calibration_grid.py

"""Step 9: calibration grid lockdown — the Phase A gate.

Runs the **pure engine** (``behave_shell.extract_session()``) against
each of the five 2026-05-02 calibration shards. The shards live in
``BEHAVE/prototype_extractors/shell/`` and are gitignored — fixture
path is resolved via the ``BEHAVE_CALIBRATION_DIR`` env var; the test
is skipped if that var is unset (CI provides it; local dev doesn't
have to).

The hard gate that this file pins (first set at Phase A, widened by
each of Phases B-G, and to be kept green by every later PR): each
shard must emit every primitive in ``PHASE_ABCDEFG_PRIMITIVES`` at
least once across its sessions. The engine is allowed to emit *more*
than required.

Per-class expected values (the calibration **target**, not a hard
gate yet — value-level pins land once cross-class thresholds are
re-tuned with a wider corpus) are pinned in a softer cross-class
discrimination check below.
"""
from __future__ import annotations

import collections
import json
import os
from pathlib import Path
from typing import Any

import pytest

from decnet.profiler.behave_shell import extract_session
from decnet.profiler.behave_shell._parse import parse_shard_line


# Hard-gate set: every primitive listed here must be observed at least
# once in each calibration shard (enforced by
# ``test_shard_emits_all_phase_a_primitives``). The cross-class
# discrimination smoke test also iterates over this set. Entries are
# grouped by the phase that introduced them.
PHASE_ABCDEFG_PRIMITIVES: frozenset[str] = frozenset({
    # Phase A — calibration floor
    "motor.input_modality",
    "motor.paste_burst_rate",
    "cognitive.inter_command_latency_class",
    "cognitive.command_branch_diversity",
    "cognitive.feedback_loop_engagement",
    "cognitive.inter_command_consistency",
    # Phase B — motor.* completion
    "motor.keystroke_cadence",
    "motor.motor_stability",
    "motor.error_correction",
    "motor.command_chunking",
    # Phase C — motor.shell_mastery.*
    "motor.shell_mastery.tab_completion",
    "motor.shell_mastery.shortcut_usage",
    "motor.shell_mastery.pipe_chaining_depth",
    # Phase D — cognitive.* completion (the error_resilience.*
    # primitives are conditional, see PHASE_D_CONDITIONAL_PRIMITIVES
    # below)
    "cognitive.cognitive_load",
    "cognitive.exploration_style",
    "cognitive.planning_depth",
    "cognitive.tool_vocabulary",
    # Phase E — temporal.* per-session subset
    "temporal.session_duration",
    "temporal.escalation_pattern",
    "temporal.lifecycle_markers.landing_ritual",
    # Phase F — environmental.* output-stream block + carry-over E.4
    # (locale and keyboard_layout are conditional — see
    # PHASE_F_CONDITIONAL_PRIMITIVES)
    "environmental.shell_type",
    "environmental.terminal_multiplexer",
    "environmental.numpad_usage",
    "temporal.lifecycle_markers.exit_behavior",
    # Phase G — operational.* + emotional_valence.* (hard subset).
    # The remaining Phase G primitives are gated by sample-size floors
    # and live in PHASE_G_CONDITIONAL_PRIMITIVES below (objective needs
    # classified commands, multi_actor needs ≥ 8 commands, arousal needs
    # typing bursts, valence / frustration_venting need typed-letter
    # floors).
    "operational.opsec_discipline",
    "operational.cleanup_behavior",
    "emotional_valence.stress_response",
})

# Phase D primitives that are conditional on at least one errored
# command appearing in the shard, and are therefore kept out of the
# per-shard hard gate (PHASE_ABCDEFG_PRIMITIVES). They are intended to
# widen the universe the calibration grid considers for discriminative
# output without forcing every shard to emit them.
PHASE_D_CONDITIONAL_PRIMITIVES: frozenset[str] = frozenset({
    "cognitive.error_resilience.retry_tactic",
    "cognitive.error_resilience.frustration_typing",
    "cognitive.error_resilience.fallback_to_man",
})

# Phase F primitives that are conditional on shard content and are
# therefore kept out of the per-shard hard gate.
# * ``environmental.locale`` fires only when the shard's output contains
#   an env / locale dump (LANG=, LC_ALL=, LC_CTYPE=).
# * ``environmental.keyboard_layout`` requires LAYOUT_MIN_TYPED_LETTERS
#   (200) typed letters per session — short SSH-recon shards (the
#   2026-05-02 calibration corpus) max out around 90 typed letters
#   per session because most input is pasted rather than typed.
#   v0 honestly keeps the 200-letter floor rather than tuning it down
#   just to pass; corpora with longer typed text will surface it.
PHASE_F_CONDITIONAL_PRIMITIVES: frozenset[str] = frozenset({
    "environmental.locale",
    "environmental.keyboard_layout",
})

# Phase G primitives that are gated by sample-size floors and may
# legitimately skip emission on shards that don't meet them. They are
# meant to be tracked for grid discrimination but are not part of the
# per-shard hard gate. The trailing comment on each entry names the
# floor that gates it.
PHASE_G_CONDITIONAL_PRIMITIVES: frozenset[str] = frozenset({
    "operational.objective",                # needs ≥ 3 classified commands
    "operational.multi_actor_indicators",   # needs ≥ 8 commands
    "emotional_valence.arousal",            # needs typing bursts
    "emotional_valence.valence",            # needs ≥ 80 typed letters
    "emotional_valence.frustration_venting",  # needs ≥ 30 typed letters
})

# Backwards-compatible aliases for any external import — earlier phases
# locked in narrower sets; later phases widened them. Every alias is
# bound to the very same frozenset object as PHASE_ABCDEFG_PRIMITIVES,
# so code importing an older name now gets the full current set rather
# than the narrower set that name originally described.
PHASE_ABCDEF_PRIMITIVES = PHASE_ABCDEFG_PRIMITIVES
PHASE_ABCDE_PRIMITIVES = PHASE_ABCDEFG_PRIMITIVES
PHASE_ABCD_PRIMITIVES = PHASE_ABCDEFG_PRIMITIVES
PHASE_ABC_PRIMITIVES = PHASE_ABCDEFG_PRIMITIVES


# Calibration shards as (shard filename, class label) pairs.
# * The filename is joined onto the calibration directory by the tests.
# * The class label doubles as the pytest parametrize id and as the key
#   under which per-class majority values are compared.
SHARDS: list[tuple[str, str]] = [
    ("sessions-2026-05-02.jsonl",                "HUMAN"),
    ("sessions-2026-05-02-with-llm.jsonl",       "YOU-sim"),
    ("sessions-2026-05-02-new.jsonl",            "LW-sim"),
    ("sessions-2026-05-02-with-claude.jsonl",    "CLAUDE-FF"),
    ("sessions-2026-05-02-closed-loop.jsonl",    "CLAUDE-CL"),
]


def _calibration_dir() -> Path | None:
    raw = os.environ.get("BEHAVE_CALIBRATION_DIR")
    if not raw:
        return None
    p = Path(raw).expanduser()
    return p if p.is_dir() else None


@pytest.fixture(scope="module")
def calibration_dir() -> Path:
    """Module-scoped fixture yielding the calibration shard directory.

    Skips the requesting test when ``_calibration_dir()`` cannot resolve
    a usable directory.
    """
    resolved = _calibration_dir()
    if resolved is not None:
        return resolved
    pytest.skip("BEHAVE_CALIBRATION_DIR unset or not a directory")


def _sessions_in_shard(path: Path) -> dict[str, list[Any]]:
    """Group the parsed events of one JSONL shard by session id.

    Each line is first decoded as JSON so that records which are not
    objects, carry no truthy ``sid``, or contain an ``hdr`` key (header
    records) can be skipped. Surviving lines are handed, as raw text, to
    ``parse_shard_line``; lines for which it returns ``None`` are dropped
    as well.

    Args:
        path: Location of the ``.jsonl`` shard to read.

    Returns:
        Mapping of ``sid`` to that session's parsed events, in file order.
    """
    by_sid: dict[str, list[Any]] = collections.defaultdict(list)
    # JSON text is UTF-8 (RFC 8259). Pin the encoding so the grid reads
    # the shard identically regardless of the host's locale default.
    with path.open(encoding="utf-8") as f:
        for line in f:
            try:
                rec = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass, so this
                # covers blank lines and any other non-JSON junk.
                continue
            if not isinstance(rec, dict):
                continue
            sid = rec.get("sid")
            if not sid or "hdr" in rec:
                continue
            ev = parse_shard_line(line)
            if ev is not None:
                by_sid[sid].append(ev)
    return by_sid


def _all_observations(path: Path) -> list:
    """Run the extractor over every session in a shard.

    Returns one flat list holding whatever ``extract_session`` yields for
    each session, concatenated in the order the sessions are grouped.
    """
    return [
        observation
        for session_id, session_events in _sessions_in_shard(path).items()
        for observation in extract_session(session_events, sid=session_id)
    ]


@pytest.mark.parametrize(
    "shard_file,class_label",
    SHARDS,
    ids=[label for _file, label in SHARDS],
)
def test_shard_emits_all_phase_a_primitives(
    shard_file: str,
    class_label: str,
    calibration_dir: Path,
) -> None:
    """Hard gate: a shard must emit every required primitive at least once.

    "Required" means membership in ``PHASE_ABCDEFG_PRIMITIVES``; the test
    name predates the widening of that set beyond Phase A. Emitting
    additional primitives is fine. The test is skipped when the shard
    file is not present in the calibration directory.
    """
    shard_path = calibration_dir / shard_file
    if not shard_path.is_file():
        pytest.skip(f"shard not present at {shard_path}")

    observations = _all_observations(shard_path)
    assert observations, f"{class_label}: extractor produced zero observations"

    emitted = {observation.primitive for observation in observations}
    absent = sorted(PHASE_ABCDEFG_PRIMITIVES.difference(emitted))
    assert not absent, (
        f"{class_label} ({shard_file}) missing primitives: "
        f"{absent}"
    )


def test_shards_are_discriminative_across_classes(
    calibration_dir: Path,
) -> None:
    """Smoke discrimination: at least one tracked primitive must show
    different majority values across classes.

    "Tracked" covers the hard-gate set (``PHASE_ABCDEFG_PRIMITIVES``)
    plus the conditional Phase D / F / G sets, which are documented as
    widening the discrimination universe without being required of
    every shard.

    A constant-output engine (every shard yields the same value for
    every primitive) would fail this check — that's the regression we
    care about. Tighter per-class value pinning lands as the corpus
    grows.

    Shards missing from the calibration directory are ignored; the test
    is skipped when fewer than two shards are present.
    """
    # class label -> {primitive -> most frequent stringified value}
    by_class: dict[str, dict[str, str]] = {}
    for shard_file, label in SHARDS:
        path = calibration_dir / shard_file
        if not path.is_file():
            continue
        per_prim: dict[str, collections.Counter] = collections.defaultdict(
            collections.Counter
        )
        for o in _all_observations(path):
            # Values are stringified so unhashable or mixed-type values
            # can still be counted and compared.
            per_prim[o.primitive][str(o.value)] += 1
        by_class[label] = {
            prim: ctr.most_common(1)[0][0] for prim, ctr in per_prim.items()
        }
    if len(by_class) < 2:
        pytest.skip("need at least two shards present to compare")

    tracked = (
        PHASE_ABCDEFG_PRIMITIVES
        | PHASE_D_CONDITIONAL_PRIMITIVES
        | PHASE_F_CONDITIONAL_PRIMITIVES
        | PHASE_G_CONDITIONAL_PRIMITIVES
    )

    # A primitive is discriminative when the classes that emitted it do
    # not all agree on its majority value. Classes that never emitted it
    # are left out rather than counted as a distinct "missing" value.
    # Sorting makes the resulting list independent of set hash order.
    discriminative_primitives: list[str] = [
        prim
        for prim in sorted(tracked)
        if len({
            summary[prim] for summary in by_class.values() if prim in summary
        }) >= 2
    ]
    assert discriminative_primitives, (
        f"Engine emitted identical majority values for every tracked "
        f"primitive across {sorted(by_class)} — likely a constant-output "
        f"regression. Class summaries: {by_class}"
    )