From f3880b24d14fc9556f8fac86059290d7fa92b9e2 Mon Sep 17 00:00:00 2001 From: anti Date: Sun, 3 May 2026 07:50:55 -0400 Subject: [PATCH] feat(profiler/behave_shell): command segmentation in SessionContext MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BEHAVE-EXTRACTOR.md Phase A Step 4. Internal groundwork in _ctx.py and _parse.py — no new feature emits. Lays the shared utility for the three cognitive primitives next in line (Steps 5-7). * Command dataclass (frozen): start_ts, end_ts, first_token_hash. PII-safe by construction — only the first whitespace-delimited token of the command is retained, and only as a sha256 hash (decnet/profiler/behave_shell/_parse.py:hash_token). * _segment_commands walks input events char-by-char, splits on \r / \n, hashes the first token, drops the rest. * SessionContext gains commands, inter_cmd_iats, output_per_cmd. output_per_cmd[i] sums len(data) of the output events with commands[i].end_ts <= t < commands[i+1].start_ts, so it has len(commands) - 1 entries and output after the last command is not counted — the natural pairing for Step 7 (feedback_loop_engagement). Tests: empty / unterminated streams, single command (CR + LF terminators), paste-with-newline, multi-command IAT pairing, output-byte counting between boundaries, blank-line skip, first-token-only PII discipline. 
--- decnet/profiler/behave_shell/_ctx.py | 66 +++++++++++- decnet/profiler/behave_shell/_parse.py | 41 ++++--- .../behave_shell/test_command_segmentation.py | 101 ++++++++++++++++++ 3 files changed, 195 insertions(+), 13 deletions(-) create mode 100644 tests/profiler/behave_shell/test_command_segmentation.py diff --git a/decnet/profiler/behave_shell/_ctx.py b/decnet/profiler/behave_shell/_ctx.py index cd0b7913..93c35a2d 100644 --- a/decnet/profiler/behave_shell/_ctx.py +++ b/decnet/profiler/behave_shell/_ctx.py @@ -13,7 +13,12 @@ from __future__ import annotations from dataclasses import dataclass, field from typing import Iterable -from decnet.profiler.behave_shell._parse import AsciinemaEvent, PasteBurst +from decnet.profiler.behave_shell._parse import ( + AsciinemaEvent, + Command, + PasteBurst, + hash_token, +) from decnet.profiler.behave_shell._thresholds import ( PASTE_BURST_MAX_IAT_S, PASTE_MIN_CHARS_PER_EVENT, @@ -37,6 +42,11 @@ class SessionContext: paste_bursts: tuple[PasteBurst, ...] = field(default_factory=tuple) paste_event_count: int = 0 + # Step 4 derivations — command segmentation + commands: tuple[Command, ...] = field(default_factory=tuple) + inter_cmd_iats: tuple[float, ...] = field(default_factory=tuple) + output_per_cmd: tuple[int, ...] = field(default_factory=tuple) + def _detect_paste_bursts( inputs: list[AsciinemaEvent], @@ -92,6 +102,48 @@ def _detect_paste_bursts( return tuple(bursts), paste_count +def _segment_commands(inputs: list[AsciinemaEvent]) -> tuple[Command, ...]: + """Walk input events, splitting on ``\\r`` / ``\\n`` into commands. + + PII discipline: only the first whitespace-delimited token is + retained, and only as a sha256 hash. Buffer contents are dropped + on every command boundary; blank or whitespace-only lines and an + unterminated trailing buffer (no final newline) yield no command. 
+ """ + cmds: list[Command] = [] + buf_chars: list[str] = [] + buf_start_ts: float | None = None + + for t, _kind, data in inputs: + for c in data: + if c in ("\r", "\n"): + text = "".join(buf_chars).strip() + if text: # blank / whitespace-only lines emit no command + first_token = text.split(maxsplit=1)[0] + cmds.append(Command( + start_ts=buf_start_ts if buf_start_ts is not None else t, + end_ts=t, + first_token_hash=hash_token(first_token), + )) + buf_chars = [] + buf_start_ts = None + else: + if not buf_chars: + buf_start_ts = t + buf_chars.append(c) + + return tuple(cmds) + + +def _output_bytes_between( + outputs: list[AsciinemaEvent], + start: float, + end: float, +) -> int: + """Total ``len(d)`` of output events with ``start <= t < end``.""" + return sum(len(d) for t, _k, d in outputs if start <= t < end) + + def build_session_context( events: Iterable[AsciinemaEvent], *, @@ -127,6 +179,15 @@ def build_session_context( max(0.0, inputs[i][0] - inputs[i - 1][0]) for i in range(1, len(inputs)) ) paste_bursts, paste_count = _detect_paste_bursts(inputs) + commands = _segment_commands(inputs) + inter_cmd_iats = tuple( + max(0.0, commands[i + 1].start_ts - commands[i].end_ts) + for i in range(len(commands) - 1) + ) + output_per_cmd = tuple( + _output_bytes_between(outputs, commands[i].end_ts, commands[i + 1].start_ts) + for i in range(len(commands) - 1) + ) return SessionContext( sid=sid, @@ -140,4 +201,7 @@ def build_session_context( iats=iats, paste_bursts=paste_bursts, paste_event_count=paste_count, + commands=commands, + inter_cmd_iats=inter_cmd_iats, + output_per_cmd=output_per_cmd, ) diff --git a/decnet/profiler/behave_shell/_parse.py b/decnet/profiler/behave_shell/_parse.py index c8178273..d9cac899 100644 --- a/decnet/profiler/behave_shell/_parse.py +++ b/decnet/profiler/behave_shell/_parse.py @@ -13,6 +13,7 @@ turn a raw JSON string into one. 
""" from __future__ import annotations +import hashlib import json from dataclasses import dataclass from typing import Iterable, Iterator, Literal, Tuple @@ -23,18 +24,7 @@ AsciinemaEvent = Tuple[float, EventKind, str] @dataclass(frozen=True, slots=True) class PasteBurst: - """Contiguous run of paste-class input events. - - A paste-class event is a single input event whose ``data`` length - is at least ``PASTE_MIN_CHARS_PER_EVENT`` — terminal pastes from - xterm/kitty/iTerm arrive as one bulk write, so checking event size - is the cheap-and-correct proxy for the bracketed-paste signal we - don't get to see. - - Multiple consecutive paste-class events with low IATs collapse - into one ``PasteBurst`` for higher-level reasoning (paste-rate / - paste-style classification later). - """ + """Contiguous run of paste-class input events.""" start_ts: float end_ts: float @@ -42,6 +32,33 @@ class PasteBurst: event_count: int +@dataclass(frozen=True, slots=True) +class Command: + """One command-line invocation, segmented from the input stream. + + PII discipline (per ``BEHAVE-INTEGRATION.md`` and the BEHAVE + envelope's pinned policy): only the *first token* of the command + is retained, and only as a sha256 hash. The full command body + never enters the engine's data structures, never goes to the bus, + never ends up in the database. ``first_token_hash`` lets + cognitive.command_branch_diversity (Step 6) count distinct + invocations without learning anything about argument values. + + ``end_ts`` is the timestamp of the ``\\r`` / ``\\n`` that + terminated the command; ``start_ts`` is the first character typed + or pasted into it. 
+ """ + + start_ts: float + end_ts: float + first_token_hash: str + + +def hash_token(token: str) -> str: + """sha256-hex of a token; the only PII-safe handle on a command.""" + return hashlib.sha256(token.encode("utf-8")).hexdigest() + + def parse_shard_line(line: str) -> AsciinemaEvent | None: """Turn one shard JSONL line into an :data:`AsciinemaEvent`. diff --git a/tests/profiler/behave_shell/test_command_segmentation.py b/tests/profiler/behave_shell/test_command_segmentation.py new file mode 100644 index 00000000..dd3f308b --- /dev/null +++ b/tests/profiler/behave_shell/test_command_segmentation.py @@ -0,0 +1,101 @@ +"""Step 4: command segmentation in SessionContext. + +PII discipline: full command body never enters the engine. Only the +first-token sha256 hash is retained. +""" +from __future__ import annotations + +from decnet.profiler.behave_shell._ctx import build_session_context +from decnet.profiler.behave_shell._parse import AsciinemaEvent, hash_token + + +def _ctx(events: list[AsciinemaEvent]): + return build_session_context(events, sid="t-cmds", source="test") + + +def test_no_input_means_no_commands() -> None: + ctx = _ctx([]) + assert ctx.commands == () + assert ctx.inter_cmd_iats == () + assert ctx.output_per_cmd == () + + +def test_unterminated_input_yields_no_command() -> None: + # No trailing newline → no command boundary observed + events: list[AsciinemaEvent] = [(0.0, "i", "ls"), (0.1, "i", " -la")] + ctx = _ctx(events) + assert ctx.commands == () + + +def test_single_command_carriage_return_terminator() -> None: + events: list[AsciinemaEvent] = [ + (0.0, "i", "l"), (0.1, "i", "s"), (0.2, "i", "\r"), + ] + ctx = _ctx(events) + assert len(ctx.commands) == 1 + cmd = ctx.commands[0] + assert cmd.start_ts == 0.0 + assert cmd.end_ts == 0.2 + assert cmd.first_token_hash == hash_token("ls") + + +def test_paste_event_with_full_command() -> None: + # A pasted command line all in one event, terminated by \r in the + # paste itself. 
+ events: list[AsciinemaEvent] = [(1.0, "i", "echo hello world\r")] + ctx = _ctx(events) + assert len(ctx.commands) == 1 + cmd = ctx.commands[0] + assert cmd.first_token_hash == hash_token("echo") + # start_ts and end_ts both come from the single event timestamp + assert cmd.start_ts == 1.0 + assert cmd.end_ts == 1.0 + + +def test_lf_terminator_also_segments() -> None: + events: list[AsciinemaEvent] = [(0.0, "i", "ls -la\n")] + ctx = _ctx(events) + assert len(ctx.commands) == 1 + assert ctx.commands[0].first_token_hash == hash_token("ls") + + +def test_multiple_commands_get_inter_cmd_iats() -> None: + events: list[AsciinemaEvent] = [ + (0.0, "i", "a"), (0.1, "i", "\r"), # cmd "a" ends at 0.1 + (1.5, "i", "b"), (1.6, "i", "\r"), # cmd "b" starts at 1.5, ends at 1.6 + (3.0, "i", "c"), (3.1, "i", "\r"), # cmd "c" starts at 3.0 + ] + ctx = _ctx(events) + assert len(ctx.commands) == 3 + # IATs between command end and next command start + assert ctx.inter_cmd_iats == (1.5 - 0.1, 3.0 - 1.6) + + +def test_output_per_cmd_counts_bytes_between_command_boundaries() -> None: + events: list[AsciinemaEvent] = [ + (0.0, "i", "ls\r"), + (0.1, "o", "file.txt\r\n"), # 10 bytes + (0.2, "o", "other.txt\r\n"), # 11 bytes + (1.0, "i", "ps\r"), + (1.1, "o", "PID TTY\r\n"), # 9 bytes after the last command; not counted (no next command to pair with) + ] + ctx = _ctx(events) + assert len(ctx.commands) == 2 + # Pair is (cmd0.end_ts, cmd1.start_ts) = (0.0, 1.0); 21 bytes fall in + assert ctx.output_per_cmd == (21,) + + +def test_first_token_only_hashes_first_word() -> None: + events: list[AsciinemaEvent] = [(0.0, "i", "curl -sS https://target/\r")] + ctx = _ctx(events) + assert ctx.commands[0].first_token_hash == hash_token("curl") + # Argument values must not appear in the stored Command records + assert "target" not in str(ctx.commands) + + +def test_blank_line_does_not_emit_command() -> None: + # Hitting Enter on an empty prompt should not register a command + events: list[AsciinemaEvent] = [(0.0, "i", "\r"), (0.5, 
"i", "ls\r")] + ctx = _ctx(events) + assert len(ctx.commands) == 1 + assert ctx.commands[0].first_token_hash == hash_token("ls")