feat(profiler/behave_shell): command segmentation in SessionContext

BEHAVE-EXTRACTOR.md Phase A Step 4. Pure refactor inside _ctx.py —
no new features are emitted. Lays the shared groundwork for the three
cognitive primitives that come next (Steps 5-7).

* Command dataclass (frozen): start_ts, end_ts, first_token_hash.
  PII-safe by construction — only the first whitespace-delimited
  token of the command is retained, and only as a sha256 hash
  (decnet/profiler/behave_shell/_parse.py:hash_token).
* _segment_commands walks input events char-by-char, splits on
  \r / \n, hashes the first token, drops the rest.
* SessionContext gains commands, inter_cmd_iats, output_per_cmd.
  output_per_cmd[i] counts bytes between commands[i].end_ts and
  commands[i+1].start_ts — the natural pairing for Step 7
  (feedback_loop_engagement).

Tests: empty / unterminated streams, single command (CR + LF
terminators), paste-with-newline, multi-command IAT pairing,
output-byte counting between boundaries, blank-line skip,
first-token-only PII discipline.
This commit is contained in:
2026-05-03 07:50:55 -04:00
parent 6763fceb0b
commit f3880b24d1
3 changed files with 195 additions and 13 deletions

View File

@@ -13,6 +13,7 @@ turn a raw JSON string into one.
"""
from __future__ import annotations
import hashlib
import json
from dataclasses import dataclass
from typing import Iterable, Iterator, Literal, Tuple
@@ -23,18 +24,7 @@ AsciinemaEvent = Tuple[float, EventKind, str]
@dataclass(frozen=True, slots=True)
class PasteBurst:
"""Contiguous run of paste-class input events.
A paste-class event is a single input event whose ``data`` length
is at least ``PASTE_MIN_CHARS_PER_EVENT`` — terminal pastes from
xterm/kitty/iTerm arrive as one bulk write, so checking event size
is the cheap-and-correct proxy for the bracketed-paste signal we
don't get to see.
Multiple consecutive paste-class events with low IATs collapse
into one ``PasteBurst`` for higher-level reasoning (paste-rate /
paste-style classification later).
"""
"""Contiguous run of paste-class input events."""
start_ts: float
end_ts: float
@@ -42,6 +32,33 @@ class PasteBurst:
event_count: int
@dataclass(frozen=True, slots=True)
class Command:
"""One command-line invocation, segmented from the input stream.
PII discipline (per ``BEHAVE-INTEGRATION.md`` and the BEHAVE
envelope's pinned policy): only the *first token* of the command
is retained, and only as a sha256 hash. The full command body
never enters the engine's data structures, never goes to the bus,
never ends up in the database. ``first_token_hash`` lets
cognitive.command_branch_diversity (Step 6) count distinct
invocations without learning anything about argument values.
``end_ts`` is the timestamp of the ``\\r`` / ``\\n`` that
terminated the command; ``start_ts`` is the first character typed
or pasted into it.
"""
start_ts: float
end_ts: float
first_token_hash: str
def hash_token(token: str) -> str:
    """Return the sha256 hex digest of ``token``.

    The digest is the only PII-safe handle on a command: callers keep
    the hash and discard the token text itself.

    The token is encoded as UTF-8 with the ``surrogatepass`` error
    handler. Strict UTF-8 refuses lone surrogate code points, which
    JSON decoding can yield from an escape such as ``"\\ud800"``; with
    ``surrogatepass`` such a token still hashes deterministically
    instead of raising ``UnicodeEncodeError``. Well-formed text
    produces exactly the same bytes, and therefore the same digest, as
    strict UTF-8 encoding.

    Args:
        token: The text to hash (may be empty).

    Returns:
        The 64-character lowercase hexadecimal sha256 digest.
    """
    # Only strings that strict UTF-8 would reject are affected by the
    # error handler; every other input encodes byte-for-byte as before.
    encoded = token.encode("utf-8", errors="surrogatepass")
    return hashlib.sha256(encoded).hexdigest()
def parse_shard_line(line: str) -> AsciinemaEvent | None:
"""Turn one shard JSONL line into an :data:`AsciinemaEvent`.