BEHAVE-EXTRACTOR.md Phase A Step 9 — the gate. Runs the pure engine against each of the five 2026-05-02 calibration shards and pins the contract that all subsequent Phase B-G PRs must keep green: every Phase A primitive (motor.input_modality, motor.paste_burst_rate, cognitive.inter_command_latency_class, cognitive.command_branch_diversity, cognitive.feedback_loop_engagement, cognitive.inter_command_consistency) fires at least once per shard.

* tests/profiler/behave_shell/test_calibration_grid.py is parametrized over (shard_file, class_label) for HUMAN / YOU-sim / LW-sim / CLAUDE-FF / CLAUDE-CL. It skips entirely when BEHAVE_CALIBRATION_DIR is unset (CI provides the path; local dev doesn't have to).
* Plus a discrimination-smoke check: at least one primitive produces different majority values across present classes — this catches the "constant-output regression" failure mode where the engine quietly degenerates to a stub.

Calibration tweak: BRANCH_DIVERSITY_LINEAR_MIN dropped from 0.80 to 0.70 to align with the prototype's empirical anchors (CLAUDE-CL ≈ 0.55-0.60 adaptive; YOU-sim / CLAUDE-FF scripted recon ≈ 0.75+ linear). The test for the middle band is re-pinned at the new boundary.

Per-class value pinning (e.g. HUMAN must emit inter_command_consistency=bimodal) is intentionally NOT a hard gate yet — v0.1 thresholds put real human sessions in "variable", and true bimodal detection (Hartigan dip / two-peak) is registry-flagged for v0.2. Tighter pinning lands as the corpus grows.
64 lines
2.4 KiB
Python
64 lines
2.4 KiB
Python
"""Step 6: ``cognitive.command_branch_diversity``."""
|
|
from __future__ import annotations
|
|
|
|
from decnet.profiler.behave_shell import extract_session
|
|
from decnet.profiler.behave_shell._parse import AsciinemaEvent
|
|
|
|
|
|
def _of(observations: list, primitive: str):
    """Return the single observation whose ``primitive`` equals *primitive*.

    Raises ``AssertionError`` unless exactly one entry of *observations*
    carries the requested primitive name.
    """
    matching = [item for item in observations if item.primitive == primitive]
    found = len(matching)
    assert found == 1, f"expected exactly one {primitive}, got {found}"
    # Exactly one element is guaranteed here, so unpacking cannot fail.
    (only,) = matching
    return only
|
|
|
|
|
|
def _commands(first_tokens: list[str]) -> list[AsciinemaEvent]:
    """Build one ``"i"`` event per token, spaced one second apart.

    Each token is followed by a literal `` arg`` and a carriage return.
    Timestamps start at 0.0 and advance by 1.0 per command.
    """
    return [
        (float(index), "i", f"{token} arg\r")
        for index, token in enumerate(first_tokens)
    ]
|
|
|
|
|
|
def test_under_floor_emits_unknown_high_confidence() -> None:
    """A three-command session must yield ``unknown`` with confidence 1.0."""
    events = _commands(["ls", "ps", "id"])
    observations = list(extract_session(events, sid="bd-low"))
    verdict = _of(observations, "cognitive.command_branch_diversity")
    assert verdict.value == "unknown"
    assert verdict.confidence == 1.0
|
|
|
|
|
|
def test_unique_first_tokens_emit_linear_playbook() -> None:
    """Eight distinct first tokens must be classed ``linear_playbook``."""
    # 8 distinct tools — ratio 1.0 → linear_playbook
    distinct_tools = [
        "uname",
        "id",
        "whoami",
        "pwd",
        "ls",
        "ps",
        "netstat",
        "ss",
    ]
    events = _commands(distinct_tools)
    observations = list(extract_session(events, sid="bd-linear"))
    verdict = _of(observations, "cognitive.command_branch_diversity")
    assert verdict.value == "linear_playbook"
    assert verdict.confidence == 0.80
|
|
|
|
|
|
def test_repeated_first_tokens_emit_adaptive_branching() -> None:
    """Heavily repeated first tokens must be classed ``adaptive_branching``."""
    # 8 commands, only 3 distinct — ratio 0.375 < 0.60
    repeated_tools = ["curl", "curl", "curl", "ls", "curl", "ls", "curl", "ps"]
    events = _commands(repeated_tools)
    observations = list(extract_session(events, sid="bd-adaptive"))
    verdict = _of(observations, "cognitive.command_branch_diversity")
    assert verdict.value == "adaptive_branching"
|
|
|
|
|
|
def test_just_below_linear_threshold_emits_adaptive() -> None:
    """Four unique tokens out of seven must be classed ``adaptive_branching``."""
    # 7 commands, 4 unique → ratio ≈ 0.57 — below the 0.70 linear floor.
    cycling_tokens = ["a", "b", "c", "d", "a", "b", "c"]
    events = _commands(cycling_tokens)
    observations = list(extract_session(events, sid="bd-just-adaptive"))
    verdict = _of(observations, "cognitive.command_branch_diversity")
    assert verdict.value == "adaptive_branching"
|
|
|
|
|
|
def test_pii_no_command_bodies_in_observation() -> None:
    """The serialised observation must not contain the raw command token."""
    sensitive_token = "secret_arg_payload"
    events = _commands([sensitive_token] * 6)
    observations = list(extract_session(events, sid="bd-pii"))
    verdict = _of(observations, "cognitive.command_branch_diversity")
    # Whatever the verdict, the raw token must not be in the dump
    dumped = verdict.model_dump_json()
    assert sensitive_token not in dumped
|