DECNET/decnet/profiler/behave_shell/_features/cognitive.py

"""``cognitive.*`` feature functions.

Step 5: ``cognitive.inter_command_latency_class``.
Step 6: ``cognitive.command_branch_diversity``.
Step 7: ``cognitive.feedback_loop_engagement``.
Step 8: ``cognitive.inter_command_consistency``.
Step D.1: ``cognitive.cognitive_load``.
"""
from __future__ import annotations

import statistics
from typing import Iterator

from behave_core.spec.envelope import Observation

from decnet.profiler.behave_shell._ctx import SessionContext
from decnet.profiler.behave_shell._features._emit import make_observation
from decnet.profiler.behave_shell._parse import hash_token
from decnet.profiler.behave_shell._thresholds import (
    BRANCH_DIVERSITY_LINEAR_MIN,
    COGNITIVE_LOAD_CHUNKING_REF_CV,
    COGNITIVE_LOAD_LOW_MAX,
    COGNITIVE_LOAD_MEDIUM_MAX,
    COGNITIVE_LOAD_PACE_REF_CV,
    EXPLORATION_CHAOTIC_BACKTRACK_MIN,
    EXPLORATION_TARGETED_REP_MIN,
    FEEDBACK_CORRELATION_MIN,
    FEEDBACK_MIN_PAIRS,
    FRUSTRATION_LOW_MAX,
    FRUSTRATION_MODERATE_MAX,
    IKI_THINK_MAX_S,
    INTER_CMD_DELIBERATE_MAX,
    INTER_CMD_INSTANT_MAX,
    INTER_CMD_LLM_HEAVYWEIGHT_MAX,
    INTER_CMD_LLM_LIGHTWEIGHT_MAX,
    INTER_CMD_TYPING_MAX,
    MIN_COMMANDS_FOR_FULL_CONFIDENCE,
    PAUSE_CV_BIMODAL_MIN,
    PAUSE_CV_METRONOMIC_MAX,
    PLANNING_DEEP_MIN,
    PLANNING_REACTIVE_MIN,
    TOOL_VOCAB_BROAD_MIN,
    TOOL_VOCAB_NARROW_MAX,
)


# Precomputed at import time so the per-session hot loop is a set
# membership check, not 3 sha256 ops per command. The ``--help`` /
# ``-h`` flag forms can't be detected here — they're not first tokens
# (PII discipline keeps only the *first* token's hash). v0.2 will
# reconsider once corpus calibration justifies storing arg-token
# hashes too.
_HELP_FAMILY_HASHES: frozenset[str] = frozenset({
    hash_token("man"),
    hash_token("help"),
    hash_token("info"),
})


def _clip01(x: float) -> float:
    if x < 0.0:
        return 0.0
    if x > 1.0:
        return 1.0
    return x


def _cv(xs: tuple[float, ...] | list[float]) -> float | None:
    """Coefficient of variation; ``None`` if undefined (n<2 or mean==0)."""
    if len(xs) < 2:
        return None
    mean = statistics.fmean(xs)
    if mean <= 0.0:
        return None
    return statistics.stdev(xs) / mean


def _bucket_inter_cmd_latency(median_iat: float) -> str:
    if median_iat <= INTER_CMD_INSTANT_MAX:
        return "instant"
    if median_iat <= INTER_CMD_TYPING_MAX:
        return "typing_speed"
    if median_iat <= INTER_CMD_DELIBERATE_MAX:
        return "deliberate"
    if median_iat <= INTER_CMD_LLM_LIGHTWEIGHT_MAX:
        return "llm_lightweight"
    if median_iat <= INTER_CMD_LLM_HEAVYWEIGHT_MAX:
        return "llm_heavyweight"
    return "long"


def inter_command_latency_class(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.inter_command_latency_class``.

    Operator's *thinking pace* between commands, bucketed against
    calibrated thresholds. Splits LW-sim / CLAUDE-FF / CLAUDE-CL.
    """
    if not ctx.inter_cmd_iats:
        return
    median_iat = statistics.median(ctx.inter_cmd_iats)
    bucket = _bucket_inter_cmd_latency(median_iat)
    # Sample-size honesty: < 5 commands → halve confidence
    if len(ctx.commands) < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        confidence = 0.40
    else:
        confidence = 0.80
    yield make_observation(
        ctx,
        primitive="cognitive.inter_command_latency_class",
        value=bucket,
        confidence=confidence,
    )


def command_branch_diversity(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.command_branch_diversity``.

    Content-based discriminator (no timing): unique first-token ratio
    over total commands. Splits CLAUDE-FF (linear_playbook) from
    CLAUDE-CL (adaptive_branching). The empirical anchor on
    2026-05-02: fire-and-forget runs ~10 distinct tools; closed-loop
    runs 5-6 with ``curl`` re-invoked as the operator chases threads.
    """
    n = len(ctx.commands)
    if n == 0:
        # No commands at all → nothing honest to say. Skip emission.
        return
    if n < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        # Registry admits "unknown"; absence of *enough* data is itself
        # a high-confidence answer.
        yield make_observation(
            ctx,
            primitive="cognitive.command_branch_diversity",
            value="unknown",
            confidence=1.0,
        )
        return
    unique = len({c.first_token_hash for c in ctx.commands})
    ratio = unique / n
    if ratio >= BRANCH_DIVERSITY_LINEAR_MIN:
        value = "linear_playbook"
    else:
        # Anything below the linear floor is treated as adaptive — the
        # operator is reusing tools, the discriminative signal we
        # actually want.
        value = "adaptive_branching"
    yield make_observation(
        ctx,
        primitive="cognitive.command_branch_diversity",
        value=value,
        confidence=0.80,
    )


def feedback_loop_engagement(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.feedback_loop_engagement``.

    Pearson correlation between ``output_per_cmd[i]`` (bytes the
    operator saw before the next command) and
    ``inter_cmd_iats[i]`` (the pause that followed). closed_loop
    operators read more before pausing more; fire_and_forget operators
    pace independently of output. CUTS ACROSS the LLM/human axis —
    closed-loop LLMs and reading humans both score closed_loop.

    First primitive that depends on output events: zero output events
    in the shard → emit ``unknown`` at confidence 1.0 (no honest
    correlation possible) and exit.
    """
    pairs = list(zip(ctx.output_per_cmd, ctx.inter_cmd_iats))
    if not ctx.output_events or len(pairs) < FEEDBACK_MIN_PAIRS:
        if not ctx.commands:
            return
        yield make_observation(
            ctx,
            primitive="cognitive.feedback_loop_engagement",
            value="unknown",
            confidence=1.0,
        )
        return
    xs = [float(p[0]) for p in pairs]
    ys = [float(p[1]) for p in pairs]
    try:
        r = statistics.correlation(xs, ys)
    except statistics.StatisticsError:
        # Constant series on either axis — correlation undefined.
        yield make_observation(
            ctx,
            primitive="cognitive.feedback_loop_engagement",
            value="unknown",
            confidence=1.0,
        )
        return
    if r > FEEDBACK_CORRELATION_MIN:
        value = "closed_loop"
    else:
        value = "fire_and_forget"
    yield make_observation(
        ctx,
        primitive="cognitive.feedback_loop_engagement",
        value=value,
        confidence=0.75,
    )


def error_resilience_fallback_to_man(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.error_resilience.fallback_to_man``.

    For each errored command, check whether the operator's next
    command is ``man`` / ``help`` / ``info`` — i.e. they reached for
    the manual rather than re-trying or pivoting. If at least one
    errored command triggered this fallback → ``present``; otherwise
    ``absent``.

    Skip emission when no commands errored — the registry's binary
    has no ``unknown``, and emitting ``absent`` from no observation
    at all would be dishonest.

    The ``--help`` / ``-h`` flag forms can't fire this primitive in
    v0.1: they aren't first tokens, and the engine only retains
    ``first_token_hash`` per command (PII discipline). Filed for v0.2.
    """
    errored_indices = [i for i, c in enumerate(ctx.commands) if c.errored]
    if not errored_indices:
        return
    fallback_count = 0
    for i in errored_indices:
        if i + 1 >= len(ctx.commands):
            continue
        if ctx.commands[i + 1].first_token_hash in _HELP_FAMILY_HASHES:
            fallback_count += 1
    value = "present" if fallback_count > 0 else "absent"

    if len(errored_indices) < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        confidence = 0.40
    else:
        confidence = 0.65
    yield make_observation(
        ctx,
        primitive="cognitive.error_resilience.fallback_to_man",
        value=value,
        confidence=confidence,
    )


def error_resilience_frustration_typing(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.error_resilience.frustration_typing``.

    Compares median within-command IAT for commands *following* an
    errored command against the same statistic for commands following
    a successful command. A large relative delta indicates the operator
    typed differently after a failure — speed-up (rage / fluency) or
    slowdown (caution); both are signs of arousal.

    Skip emission when either group is empty (no errors, or every
    command errored — no clean baseline). Sample-size honesty drops
    confidence below the floor.
    """
    post_err: list[float] = []
    post_ok: list[float] = []
    cmds = ctx.commands
    intra = ctx.intra_command_iats
    if len(cmds) < 2 or len(intra) != len(cmds):
        return
    for i in range(1, len(cmds)):
        cmd_iats = intra[i]
        if not cmd_iats:
            continue
        m = statistics.median(cmd_iats)
        if cmds[i - 1].errored:
            post_err.append(m)
        else:
            post_ok.append(m)
    if not post_err or not post_ok:
        return
    median_err = statistics.median(post_err)
    median_ok = statistics.median(post_ok)
    if median_ok <= 0.0:
        return
    delta = abs(median_err - median_ok) / median_ok

    if delta < FRUSTRATION_LOW_MAX:
        value = "low"
    elif delta < FRUSTRATION_MODERATE_MAX:
        value = "moderate"
    else:
        value = "high"

    if len(post_err) < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        confidence = 0.40
    else:
        confidence = 0.60
    yield make_observation(
        ctx,
        primitive="cognitive.error_resilience.frustration_typing",
        value=value,
        confidence=confidence,
    )


def error_resilience_retry_tactic(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.error_resilience.retry_tactic``.

    For each command with ``Command.errored=True``, classify the
    operator's response by the *next* command:

    * **rerun** — same first_token_hash as the errored command. The
      operator re-invoked the same tool (often after fixing args
      mid-edit, but we can't see args).
    * **switch** — different first_token_hash. Pivoted to a different
      tool.
    * **abort** — no next command. Session ended after the error.

    The session's reported tactic is the **modal** response across all
    errored commands (with ties broken in registry order: rerun >
    modify > switch > abort). Skip emission entirely when no commands
    errored — the registry has no ``unknown`` here, and silence is the
    most honest answer.

    The ``modify`` value (edit-and-retry) requires within-command
    diffing of arg tokens, which crosses the PII boundary the engine
    holds (only ``first_token_hash`` is retained per command). v0.1
    therefore never emits ``modify``; v0.2 will once the PII trade-off
    is revisited against a real attacker corpus.
    """
    errored = [(i, c) for i, c in enumerate(ctx.commands) if c.errored]
    if not errored:
        return
    counts = {"rerun": 0, "switch": 0, "abort": 0}
    for i, cmd in errored:
        if i + 1 >= len(ctx.commands):
            counts["abort"] += 1
        elif ctx.commands[i + 1].first_token_hash == cmd.first_token_hash:
            counts["rerun"] += 1
        else:
            counts["switch"] += 1
    # Registry-order tiebreak (rerun > modify > switch > abort).
    # `modify` deferred — never increments here.
    order = ("rerun", "switch", "abort")
    value = max(order, key=lambda k: counts[k])

    if len(errored) < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        confidence = 0.40
    else:
        confidence = 0.65
    yield make_observation(
        ctx,
        primitive="cognitive.error_resilience.retry_tactic",
        value=value,
        confidence=confidence,
    )


def tool_vocabulary(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.tool_vocabulary`` ∈ {narrow, moderate, broad}.

    Absolute count of distinct first_token_hashes. Skip emission when
    no commands exist; below the sample-size floor we still emit, but
    at confidence 0.40 — a session with few commands but five distinct
    tools is genuinely a moderate-vocabulary signal.
    """
    if not ctx.commands:
        return
    distinct = len({c.first_token_hash for c in ctx.commands})
    if distinct <= TOOL_VOCAB_NARROW_MAX:
        value = "narrow"
    elif distinct >= TOOL_VOCAB_BROAD_MIN:
        value = "broad"
    else:
        value = "moderate"
    if len(ctx.commands) < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        confidence = 0.40
    else:
        confidence = 0.70
    yield make_observation(
        ctx,
        primitive="cognitive.tool_vocabulary",
        value=value,
        confidence=confidence,
    )


def planning_depth(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.planning_depth`` ∈ {deep, shallow, reactive}.

    Read off the distribution of inter-command IATs:

    * **deep** — many think-pauses (> ``IKI_THINK_MAX_S``). The
      operator stops to think between commands.
    * **reactive** — most pauses are sub-instant
      (≤ ``INTER_CMD_INSTANT_MAX``). Knee-jerk pacing — automated
      runner, prepared playbook, or an LLM with no internal latency.
    * **shallow** — neither: mostly typing-speed pauses, no extended
      contemplation.

    Skip emission when no inter-command IATs exist (one or zero
    commands); the registry has no ``unknown`` for this primitive.
    """
    iats = ctx.inter_cmd_iats
    if not iats:
        return
    n = len(iats)
    deep_count = sum(1 for x in iats if x > IKI_THINK_MAX_S)
    reactive_count = sum(1 for x in iats if x <= INTER_CMD_INSTANT_MAX)
    deep_frac = deep_count / n
    reactive_frac = reactive_count / n

    if deep_frac >= PLANNING_DEEP_MIN:
        value = "deep"
    elif reactive_frac >= PLANNING_REACTIVE_MIN:
        value = "reactive"
    else:
        value = "shallow"

    if len(ctx.commands) < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        confidence = 0.40
    else:
        confidence = 0.65
    yield make_observation(
        ctx,
        primitive="cognitive.planning_depth",
        value=value,
        confidence=confidence,
    )


def exploration_style(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.exploration_style`` ∈ {methodical, chaotic, targeted}.

    Two-axis classification over the first_token_hash sequence:

    * **methodical** — low repetition, low backtracks. Operator marches
      forward through new tools.
    * **targeted** — high repetition (R ≥ EXPLORATION_TARGETED_REP_MIN).
      Same tool re-invoked repeatedly; the operator is drilling.
    * **chaotic** — high backtrack rate (J ≥ EXPLORATION_CHAOTIC_BACKTRACK_MIN).
      Jumps among previously-used tools without a clear thread.

    The registry doesn't permit ``unknown``; below the
    MIN_COMMANDS_FOR_FULL_CONFIDENCE floor we emit at confidence 0.40
    rather than skip — the engine has *some* signal, just less of it.
    Skip emission only when there are no commands at all.
    """
    n = len(ctx.commands)
    if n == 0:
        return
    hashes = [c.first_token_hash for c in ctx.commands]
    unique = len(set(hashes))
    repetition_rate = 0.0 if n == 0 else 1.0 - (unique / n)

    # Backtrack: at position i, hashes[i] previously seen at index < i-1
    # and not equal to hashes[i-1]. (Repeating the immediate predecessor
    # is "drilling", picked up by repetition_rate; backtrack is the
    # non-local jump signal.)
    seen_before: set[str] = set()
    backtracks = 0
    transitions = 0
    if hashes:
        seen_before.add(hashes[0])
    for i in range(1, n):
        transitions += 1
        if hashes[i] != hashes[i - 1] and hashes[i] in seen_before:
            backtracks += 1
        seen_before.add(hashes[i])
    backtrack_rate = (backtracks / transitions) if transitions else 0.0

    if backtrack_rate >= EXPLORATION_CHAOTIC_BACKTRACK_MIN:
        value = "chaotic"
    elif repetition_rate >= EXPLORATION_TARGETED_REP_MIN:
        value = "targeted"
    else:
        value = "methodical"

    if n < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        confidence = 0.40
    else:
        confidence = 0.60
    yield make_observation(
        ctx,
        primitive="cognitive.exploration_style",
        value=value,
        confidence=confidence,
    )


def cognitive_load(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.cognitive_load`` ∈ {low, medium, high}.

    Composite of three [0, 1]-clipped sub-signals, mean-aggregated:

    * **chunking** — median CV of intra-command IATs / reference CV.
      Fragmented mid-command typing → high contribution.
    * **errors** — fraction of commands whose post-execution output
      matched a canonical error fingerprint (``Command.errored`` from
      Step D.0). Failures pile load.
    * **pace variability** — CV of inter-command IATs / reference CV.
      A spread of think-pause durations → unsettled cadence → load.

    Components missing data contribute 0.0 (no penalty for an absent
    signal), and the composite normalises by *available* component
    count so a session with zero inter-command pauses isn't punished
    for the silence. Skip emission entirely when no commands at all
    exist — there's no honest answer.

    v0.1 thresholds; D.8 re-tunes once the rest of Phase D is stable.
    """
    if not ctx.commands:
        return

    # Component A: chunking variance — median within-command CV
    per_cmd_cvs: list[float] = []
    for cmd_iats in ctx.intra_command_iats:
        cv = _cv(cmd_iats)
        if cv is not None:
            per_cmd_cvs.append(cv)
    if per_cmd_cvs:
        chunking_load: float | None = _clip01(
            statistics.median(per_cmd_cvs) / COGNITIVE_LOAD_CHUNKING_REF_CV
        )
    else:
        chunking_load = None

    # Component B: error rate
    error_load: float = sum(1 for c in ctx.commands if c.errored) / len(ctx.commands)
    error_load = _clip01(error_load)

    # Component C: pace variability — CV of inter-command IATs
    pace_cv = _cv(ctx.inter_cmd_iats)
    if pace_cv is not None:
        pace_load: float | None = _clip01(pace_cv / COGNITIVE_LOAD_PACE_REF_CV)
    else:
        pace_load = None

    components = [c for c in (chunking_load, error_load, pace_load) if c is not None]
    if not components:
        return
    load = sum(components) / len(components)

    if load < COGNITIVE_LOAD_LOW_MAX:
        value = "low"
    elif load < COGNITIVE_LOAD_MEDIUM_MAX:
        value = "medium"
    else:
        value = "high"

    if len(ctx.commands) < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        confidence = 0.40
    else:
        # Composite over three soft sub-signals — held below the
        # cap of single-source primitives. D.8 re-tunes.
        confidence = 0.60
    yield make_observation(
        ctx,
        primitive="cognitive.cognitive_load",
        value=value,
        confidence=confidence,
    )


def inter_command_consistency(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.inter_command_consistency``.

    CV (stdev / mean) of inter-command IATs.

    * ``metronomic`` (CV < 0.40) → LLM-pure. Empirical anchor:
      LLM-simulated session CV ≈ 0.24 in this corpus.
    * ``variable`` (0.40 ≤ CV < 1.50) → human. Empirical anchor:
      human session CV ≈ 0.94.
    * ``bimodal`` (CV ≥ 1.50) → LLM-assisted human, heuristic. v0.1
      uses CV-only; true bimodal detection (Hartigan dip / two-peak)
      is filed for v0.2 per the registry's ``notes:`` field.
    """
    iats = ctx.inter_cmd_iats
    if len(iats) < 2:
        return
    mean = statistics.fmean(iats)
    if mean <= 0.0:
        return
    cv = statistics.stdev(iats) / mean
    if cv < PAUSE_CV_METRONOMIC_MAX:
        value = "metronomic"
    elif cv >= PAUSE_CV_BIMODAL_MIN:
        value = "bimodal"
    else:
        value = "variable"
    confidence = (
        0.40 if len(ctx.commands) < MIN_COMMANDS_FOR_FULL_CONFIDENCE else 0.75
    )
    yield make_observation(
        ctx,
        primitive="cognitive.inter_command_consistency",
        value=value,
        confidence=confidence,
    )