feat(profiler/behave_shell): command segmentation in SessionContext

BEHAVE-EXTRACTOR.md Phase A Step 4. Pure refactor inside _ctx.py — no new feature emits. Lays the shared utility for the three cognitive primitives next in line (Steps 5-7).

* Command dataclass (frozen): start_ts, end_ts, first_token_hash. PII-safe by construction — only the first whitespace-delimited token of the command is retained, and only as a sha256 hash (decnet/profiler/behave_shell/_parse.py:hash_token).
* _segment_commands walks input events char-by-char, splits on \r / \n, hashes the first token, drops the rest.
* SessionContext gains commands, inter_cmd_iats, output_per_cmd. output_per_cmd[i] counts output bytes (the summed length of output-event data) between commands[i].end_ts and commands[i+1].start_ts — the natural pairing for Step 7 (feedback_loop_engagement).

Tests: empty / unterminated streams, single command (CR + LF terminators), paste-with-newline, multi-command IAT pairing, output-byte counting between boundaries, blank-line skip, first-token-only PII discipline.
2026-05-03 07:50:55 -04:00
parent 6763fceb0b
commit f3880b24d1
3 changed files with 195 additions and 13 deletions
--- a/decnet/profiler/behave_shell/_ctx.py
+++ b/decnet/profiler/behave_shell/_ctx.py
@@ -13,7 +13,12 @@ from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import Iterable

-from decnet.profiler.behave_shell._parse import AsciinemaEvent, PasteBurst
+from decnet.profiler.behave_shell._parse import (
+    AsciinemaEvent,
+    Command,
+    PasteBurst,
+    hash_token,
+)
 from decnet.profiler.behave_shell._thresholds import (
    PASTE_BURST_MAX_IAT_S,
    PASTE_MIN_CHARS_PER_EVENT,
@@ -37,6 +42,11 @@ class SessionContext:
    paste_bursts: tuple[PasteBurst, ...] = field(default_factory=tuple)
    paste_event_count: int = 0

+    # Step 4 derivations — command segmentation
+    commands: tuple[Command, ...] = field(default_factory=tuple)
+    inter_cmd_iats: tuple[float, ...] = field(default_factory=tuple)
+    output_per_cmd: tuple[int, ...] = field(default_factory=tuple)
+

 def _detect_paste_bursts(
    inputs: list[AsciinemaEvent],
@@ -92,6 +102,48 @@ def _detect_paste_bursts(
    return tuple(bursts), paste_count


+def _segment_commands(inputs: list[AsciinemaEvent]) -> tuple[Command, ...]:
+    """Split the input stream on ``\\r`` / ``\\n`` into commands.
+
+    Each event is unpacked as ``(timestamp, kind, data)`` and ``data``
+    is walked one character at a time, so a single pasted event may
+    yield several commands.
+
+    PII discipline: only the characters of the first
+    whitespace-delimited token of a line are ever buffered, and that
+    token leaves this function only as ``hash_token(token)``. Anything
+    after the first token is skipped without being stored.
+
+    Lines that contain no token at all (empty, or whitespace only)
+    yield no command, so they cannot show up as a phantom entry that
+    hashes the empty string. An unterminated trailing line (no final
+    ``\\r`` / ``\\n``) also yields no command.
+
+    Args:
+        inputs: input events, in stream order.
+
+    Returns:
+        One ``Command`` per terminated, non-blank line. ``start_ts`` is
+        the timestamp of the event that carried the line's first
+        character (leading whitespace included); ``end_ts`` is the
+        timestamp of the event that carried the terminator.
+    """
+    cmds: list[Command] = []
+    token_chars: list[str] = []
+    token_done = False
+    line_start_ts: float | None = None
+
+    for t, _kind, data in inputs:
+        for c in data:
+            if c in ("\r", "\n"):
+                # Emit only when the line held a real token; blank and
+                # whitespace-only lines are skipped entirely.
+                if token_chars:
+                    cmds.append(Command(
+                        start_ts=line_start_ts if line_start_ts is not None else t,
+                        end_ts=t,
+                        first_token_hash=hash_token("".join(token_chars)),
+                    ))
+                # Reset per-line state on every boundary, so the second
+                # half of a ``\r\n`` pair is seen as an empty line.
+                token_chars = []
+                token_done = False
+                line_start_ts = None
+                continue
+
+            if line_start_ts is None:
+                line_start_ts = t
+            if token_done:
+                # Argument text: never buffered.
+                continue
+            if c.isspace():
+                # Leading whitespace is ignored; whitespace after at
+                # least one token character closes the token.
+                token_done = bool(token_chars)
+            else:
+                token_chars.append(c)
+
+    return tuple(cmds)
+
+
+def _output_bytes_between(
+    outputs: list[AsciinemaEvent],
+    start: float,
+    end: float,
+) -> int:
+    """Sum the payload lengths of output events inside ``[start, end)``.
+
+    The window is half-open: an event stamped exactly ``start`` is
+    counted, one stamped exactly ``end`` is not. The size of an event
+    is ``len()`` of its data string, i.e. a character count rather
+    than an encoded byte count.
+
+    Args:
+        outputs: events unpacked as ``(timestamp, kind, data)``.
+        start: inclusive lower bound of the window.
+        end: exclusive upper bound of the window.
+
+    Returns:
+        The summed length; ``0`` when no event falls in the window.
+    """
+    total = 0
+    for ts, _kind, payload in outputs:
+        if start <= ts < end:
+            total += len(payload)
+    return total
+
+
 def build_session_context(
    events: Iterable[AsciinemaEvent],
    *,
@@ -127,6 +179,15 @@ def build_session_context(
        max(0.0, inputs[i][0] - inputs[i - 1][0]) for i in range(1, len(inputs))
    )
    paste_bursts, paste_count = _detect_paste_bursts(inputs)
+    commands = _segment_commands(inputs)
+    inter_cmd_iats = tuple(
+        max(0.0, commands[i + 1].start_ts - commands[i].end_ts)
+        for i in range(len(commands) - 1)
+    )
+    output_per_cmd = tuple(
+        _output_bytes_between(outputs, commands[i].end_ts, commands[i + 1].start_ts)
+        for i in range(len(commands) - 1)
+    )

    return SessionContext(
        sid=sid,
@@ -140,4 +201,7 @@ def build_session_context(
        iats=iats,
        paste_bursts=paste_bursts,
        paste_event_count=paste_count,
+        commands=commands,
+        inter_cmd_iats=inter_cmd_iats,
+        output_per_cmd=output_per_cmd,
    )
--- a/decnet/profiler/behave_shell/_parse.py
+++ b/decnet/profiler/behave_shell/_parse.py
@@ -13,6 +13,7 @@ turn a raw JSON string into one.
 """
 from __future__ import annotations

+import hashlib
 import json
 from dataclasses import dataclass
 from typing import Iterable, Iterator, Literal, Tuple
@@ -23,18 +24,7 @@ AsciinemaEvent = Tuple[float, EventKind, str]

@dataclass(frozen=True, slots=True)
 class PasteBurst:
-    """Contiguous run of paste-class input events.
-
-    A paste-class event is a single input event whose ``data`` length
-    is at least ``PASTE_MIN_CHARS_PER_EVENT`` — terminal pastes from
-    xterm/kitty/iTerm arrive as one bulk write, so checking event size
-    is the cheap-and-correct proxy for the bracketed-paste signal we
-    don't get to see.
-
-    Multiple consecutive paste-class events with low IATs collapse
-    into one ``PasteBurst`` for higher-level reasoning (paste-rate /
-    paste-style classification later).
-    """
+    """Contiguous run of paste-class input events."""

    start_ts: float
    end_ts: float
@@ -42,6 +32,33 @@ class PasteBurst:
    event_count: int


+@dataclass(frozen=True, slots=True)
+class Command:
+    """One command-line invocation, segmented from the input stream.
+
+    Immutable record (frozen, slotted dataclass) holding two
+    timestamps and a hash; it has no field for command text.
+
+    PII discipline (per ``BEHAVE-INTEGRATION.md`` and the BEHAVE
+    envelope's pinned policy): only the *first token* of the command
+    is retained, and only as a sha256 hash. The full command body
+    never enters the engine's data structures, never goes to the bus,
+    never ends up in the database. ``first_token_hash`` lets
+    cognitive.command_branch_diversity (Step 6) count distinct
+    invocations without learning anything about argument values.
+
+    NOTE(review): the hash is an unsalted sha256, so short, common
+    tokens can presumably be recovered by dictionary lookup; treat it
+    as a pseudonym rather than as anonymisation. Confirm against the
+    pinned policy.
+
+    ``end_ts`` is the timestamp of the ``\\r`` / ``\\n`` that
+    terminated the command; ``start_ts`` is the first character typed
+    or pasted into it.
+    """
+
+    # Timestamp of the event carrying the command's first character.
+    start_ts: float
+    # Timestamp of the event carrying the terminating ``\r`` / ``\n``.
+    end_ts: float
+    # Hex sha256 digest of the command's first whitespace-delimited token.
+    first_token_hash: str
+
+
+def hash_token(token: str) -> str:
+    """Return the sha256 hex digest of ``token``'s UTF-8 encoding.
+
+    This is the only handle on a command that is kept; the token text
+    itself is not returned or stored here.
+
+    Hashing never raises on string content: lone surrogate code
+    points (which ``json.loads`` can produce from ``\\udXXX`` escapes)
+    are encoded instead of triggering ``UnicodeEncodeError``. For
+    well-formed text the digest is identical to hashing the plain
+    UTF-8 encoding.
+
+    NOTE(review): the digest is unsalted, so short, common tokens can
+    presumably be recovered by dictionary lookup — confirm this meets
+    the PII policy.
+
+    Args:
+        token: the token to hash; the empty string is accepted.
+
+    Returns:
+        A 64-character lowercase hexadecimal string.
+    """
+    # ``surrogatepass`` keeps encoding total and deterministic, so a
+    # malformed token in captured input cannot crash the caller.
+    encoded = token.encode("utf-8", "surrogatepass")
+    return hashlib.sha256(encoded).hexdigest()
+
+
 def parse_shard_line(line: str) -> AsciinemaEvent | None:
    """Turn one shard JSONL line into an :data:`AsciinemaEvent`.