# Registry of feature functions exported by this package. Every entry is
# annotated as a ``FeatureFn``: a callable that takes a ``SessionContext``
# and returns an iterable of ``Observation`` objects.
# NOTE(review): how consumers iterate this tuple is not visible here —
# presumably in listed order; confirm before relying on ordering.
FEATURES: tuple[FeatureFn, ...] = (
    # motor.* — imported from ``_features.motor``
    input_modality,
    paste_burst_rate,
    # cognitive.* — imported from ``_features.cognitive``
    inter_command_latency_class,
)
def _bucket_inter_cmd_latency(median_iat: float) -> str:
    """Translate a median inter-command gap into its latency-class label.

    Each threshold is an inclusive upper edge. The edges are tried in the
    order listed and the first one that is ``>= median_iat`` decides the
    label; a value above every edge is labelled ``"long"``.
    """
    # Built per call so the lookup reads the threshold constants at call
    # time, exactly as a chain of comparisons would.
    upper_edges = (
        (INTER_CMD_INSTANT_MAX, "instant"),
        (INTER_CMD_TYPING_MAX, "typing_speed"),
        (INTER_CMD_DELIBERATE_MAX, "deliberate"),
        (INTER_CMD_LLM_LIGHTWEIGHT_MAX, "llm_lightweight"),
        (INTER_CMD_LLM_HEAVYWEIGHT_MAX, "llm_heavyweight"),
    )
    for upper_edge, label in upper_edges:
        if median_iat <= upper_edge:
            return label
    return "long"


def inter_command_latency_class(ctx: SessionContext) -> Iterator[Observation]:
    """Yield the ``cognitive.inter_command_latency_class`` observation.

    The value is the bucket label for the median of ``ctx.inter_cmd_iats``
    (the operator's *thinking pace* between commands, bucketed against
    calibrated thresholds; intended to split LW-sim / CLAUDE-FF /
    CLAUDE-CL).

    Yields nothing when ``ctx.inter_cmd_iats`` is empty (falsy); otherwise
    yields exactly one observation built by ``make_observation``.
    Confidence is 0.80, dropping to 0.40 when ``ctx.commands`` holds fewer
    than ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` entries.
    """
    iats = ctx.inter_cmd_iats
    if not iats:
        return
    latency_label = _bucket_inter_cmd_latency(statistics.median(iats))
    # Sample-size honesty: too few commands → half the usual confidence.
    enough_commands = len(ctx.commands) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.inter_command_latency_class",
        value=latency_label,
        confidence=0.80 if enough_commands else 0.40,
    )
# ── cognitive.inter_command_latency_class (Step 5) ──────────────────────────
# Bucket edges (seconds) for the median inter-command IAT. Prototype
# values; v0.2 splits the original llm_roundtrip 2-8s band into
# llm_lightweight (orchestrated agents w/ small models / terse prompts) and
# llm_heavyweight (reasoning-class agents in tool loops with text
# generation between calls). Empirical anchor: Claude Opus driving recon
# via tmux send-keys produced a median of 15.5s.
# Each edge is an inclusive upper bound (the bucketing code compares with
# ``<=``).
# NOTE(review): with the edges below, llm_lightweight covers (2.0, 8.0] and
# llm_heavyweight covers (8.0, 30.0] — i.e. the old 2-8s band is kept and
# an 8-30s band is added, rather than 2-8s being subdivided. Confirm the
# wording above against the intended design.
INTER_CMD_INSTANT_MAX: float = 0.30
INTER_CMD_TYPING_MAX: float = 1.50
INTER_CMD_DELIBERATE_MAX: float = 2.00
INTER_CMD_LLM_LIGHTWEIGHT_MAX: float = 8.00
INTER_CMD_LLM_HEAVYWEIGHT_MAX: float = 30.00

# Sample-size floor for inter-command IAT primitives. Below this we
# halve the confidence per BEHAVE-EXTRACTOR.md "sample-size honesty".
# The floor counts commands, not IATs.
MIN_COMMANDS_FOR_FULL_CONFIDENCE: int = 5

# ── cognitive.command_branch_diversity (Step 6) ─────────────────────────────
# unique_first_tokens / total_commands ratio. Empirical (CLAUDE-FF vs
# CLAUDE-CL on 2026-05-02): fire-and-forget runs ~10 distinct tools (ratio
# near 1.0) → linear_playbook; closed-loop runs ~5-6 tools with the same
# tool re-invoked → adaptive_branching.
# NOTE(review): the Step 6-8 constants below are not referenced by any code
# visible in this change; they appear to be staged for later steps.
BRANCH_DIVERSITY_LINEAR_MIN: float = 0.80  # >= → linear_playbook
BRANCH_DIVERSITY_ADAPTIVE_MAX: float = 0.60  # <= → adaptive_branching
# Ratios strictly between the two edges form the ambiguous middle band —
# bias toward adaptive (the operator is reusing tools).

# ── cognitive.feedback_loop_engagement (Step 7) ─────────────────────────────
# Pearson r threshold for "the operator's pause grew with the volume of
# preceding output". |r| > this → significant; sign carries direction.
FEEDBACK_CORRELATION_MIN: float = 0.30
# Need at least this many (output_bytes, next_pause) pairs to even
# attempt a correlation. Below this the answer is "unknown".
FEEDBACK_MIN_PAIRS: int = 5

# ── cognitive.inter_command_consistency (Step 8) ────────────────────────────
# CV (stdev / mean) of inter-command IATs. Empirical (this corpus):
# human session CV=0.94 → variable; LLM-simulated CV=0.24 → metronomic;
# anything beyond 1.5 is heuristically "bimodal" (real bimodal detection
# via Hartigan dip is filed for v0.2).
PAUSE_CV_METRONOMIC_MAX: float = 0.40
PAUSE_CV_BIMODAL_MIN: float = 1.50
# Name of the primitive every test in this module inspects.
_PRIMITIVE = "cognitive.inter_command_latency_class"


def _of(observations: list, primitive: str):
    """Return the single observation named ``primitive``.

    Fails the calling test unless exactly one such observation exists.
    """
    obs = [o for o in observations if o.primitive == primitive]
    assert len(obs) == 1, f"expected exactly one {primitive}, got {len(obs)}"
    return obs[0]


def _command_stream(starts: list[float]) -> list[AsciinemaEvent]:
    """Build an input stream with one command per entry of ``starts``.

    Each command is an ``"x"`` keystroke at the start time followed by a
    carriage return 0.05s later.
    """
    return [
        event
        for s in starts
        for event in ((s, "i", "x"), (s + 0.05, "i", "\r"))
    ]


def _latency_class_for(step: float, sid: str) -> str:
    """Run six commands spaced ``step`` seconds apart; return the bucket."""
    starts = [i * step for i in range(6)]
    out = list(extract_session(_command_stream(starts), sid=sid))
    return _of(out, _PRIMITIVE).value


def test_no_commands_means_no_observation() -> None:
    out = list(extract_session([], sid="lat-empty"))
    assert not any(o.primitive == _PRIMITIVE for o in out)


def test_single_command_no_iat_no_observation() -> None:
    # One command leaves no gap to measure.
    out = list(extract_session(_command_stream([0.0]), sid="lat-1"))
    assert not any(o.primitive == _PRIMITIVE for o in out)


def test_instant_bucket() -> None:
    # Commands every 0.15s — under the 0.30s instant cap.
    assert _latency_class_for(0.15, "lat-instant") == "instant"


def test_typing_speed_bucket() -> None:
    # Commands every 1.0s — between the 0.30s and 1.50s edges.
    assert _latency_class_for(1.0, "lat-typing") == "typing_speed"


def test_deliberate_bucket() -> None:
    # Commands every 1.9s — above typing (1.5), under the deliberate cap (2.0).
    assert _latency_class_for(1.9, "lat-deliberate") == "deliberate"


def test_llm_lightweight_bucket() -> None:
    # Commands every 5.05s — inside the 2-8s band.
    assert _latency_class_for(5.05, "lat-lwt") == "llm_lightweight"


def test_llm_heavyweight_bucket() -> None:
    # Commands every 15.05s — inside the 8-30s band; near the Claude Opus
    # empirical anchor.
    assert _latency_class_for(15.05, "lat-hvy") == "llm_heavyweight"


def test_long_bucket() -> None:
    # Commands every 60s — beyond the 30s edge.
    assert _latency_class_for(60.0, "lat-long") == "long"


def test_low_sample_count_reduces_confidence() -> None:
    # 2 commands → 1 IAT; below the sample-size floor.
    short = list(extract_session(_command_stream([0.0, 1.0]), sid="lat-low"))
    full = list(
        extract_session(
            _command_stream([i * 1.0 for i in range(6)]), sid="lat-full"
        )
    )
    s = _of(short, _PRIMITIVE)
    f = _of(full, _PRIMITIVE)
    assert s.confidence < f.confidence