feat(profiler/behave_shell): command segmentation in SessionContext

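BEHAVE-EXTRACTOR.md Phase A Step 4. Internal plumbing only — the
segmentation logic lives in _ctx.py, with the Command dataclass and
hash_token helper added to _parse.py; no new feature emits. Lays the
shared utility for the three cognitive primitives next in line
(Steps 5-7).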

* Command dataclass (frozen): start_ts, end_ts, first_token_hash.
  PII-safe by construction — only the first whitespace-delimited
  token of the command is retained, and only as a sha256 hash
  (decnet/profiler/behave_shell/_parse.py:hash_token).
* _segment_commands walks input events char-by-char, splits on
  \r / \n, hashes the first token, drops the rest.
* SessionContext gains commands, inter_cmd_iats, output_per_cmd.
  output_per_cmd[i] counts bytes between commands[i].end_ts and
  commands[i+1].start_ts — the natural pairing for Step 7
  (feedback_loop_engagement).

Tests: empty / unterminated streams, single command (CR + LF
terminators), paste-with-newline, multi-command IAT pairing,
output-byte counting between boundaries, blank-line skip,
first-token-only PII discipline.
This commit is contained in:
2026-05-03 07:50:55 -04:00
parent 6763fceb0b
commit f3880b24d1
3 changed files with 195 additions and 13 deletions

View File

@@ -13,7 +13,12 @@ from __future__ import annotations
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Iterable from typing import Iterable
from decnet.profiler.behave_shell._parse import AsciinemaEvent, PasteBurst from decnet.profiler.behave_shell._parse import (
AsciinemaEvent,
Command,
PasteBurst,
hash_token,
)
from decnet.profiler.behave_shell._thresholds import ( from decnet.profiler.behave_shell._thresholds import (
PASTE_BURST_MAX_IAT_S, PASTE_BURST_MAX_IAT_S,
PASTE_MIN_CHARS_PER_EVENT, PASTE_MIN_CHARS_PER_EVENT,
@@ -37,6 +42,11 @@ class SessionContext:
paste_bursts: tuple[PasteBurst, ...] = field(default_factory=tuple) paste_bursts: tuple[PasteBurst, ...] = field(default_factory=tuple)
paste_event_count: int = 0 paste_event_count: int = 0
# Step 4 derivations — command segmentation
commands: tuple[Command, ...] = field(default_factory=tuple)
inter_cmd_iats: tuple[float, ...] = field(default_factory=tuple)
output_per_cmd: tuple[int, ...] = field(default_factory=tuple)
def _detect_paste_bursts( def _detect_paste_bursts(
inputs: list[AsciinemaEvent], inputs: list[AsciinemaEvent],
@@ -92,6 +102,48 @@ def _detect_paste_bursts(
return tuple(bursts), paste_count return tuple(bursts), paste_count
def _segment_commands(inputs: list[AsciinemaEvent]) -> tuple[Command, ...]:
    """Walk input events, splitting on ``\\r`` / ``\\n`` into commands.

    PII discipline: only the first whitespace-delimited token is
    retained, and only as a sha256 hash. Buffer contents are dropped
    on every command boundary; an unterminated trailing buffer (no
    final newline) yields no command.

    Blank lines are skipped. A line made up solely of whitespace is
    treated the same way: it has no first token, so it produces no
    command rather than one carrying the hash of the empty string.

    Args:
        inputs: Input events as ``(timestamp, kind, data)`` tuples, in
            the order they should be walked.

    Returns:
        One ``Command`` per terminated, non-blank line. ``start_ts`` is
        the timestamp of the event that supplied the first buffered
        character; ``end_ts`` is the timestamp of the event containing
        the terminator.
    """
    cmds: list[Command] = []
    buf_chars: list[str] = []
    buf_start_ts: float | None = None

    for t, _kind, data in inputs:
        for c in data:
            if c not in ("\r", "\n"):
                if not buf_chars:
                    # First character of a new line fixes its start time.
                    buf_start_ts = t
                buf_chars.append(c)
                continue

            if not buf_chars:
                # Enter on an empty prompt, or the "\n" half of "\r\n".
                continue

            # str.split() with no separator ignores leading/trailing
            # whitespace, so an all-whitespace buffer yields no tokens.
            tokens = "".join(buf_chars).split(maxsplit=1)
            if tokens:
                cmds.append(Command(
                    start_ts=buf_start_ts if buf_start_ts is not None else t,
                    end_ts=t,
                    first_token_hash=hash_token(tokens[0]),
                ))
            # Drop the raw text at every boundary, emitted or not.
            buf_chars = []
            buf_start_ts = None

    return tuple(cmds)
def _output_bytes_between(
outputs: list[AsciinemaEvent],
start: float,
end: float,
) -> int:
"""Total ``len(d)`` of output events with ``start <= t < end``."""
return sum(len(d) for t, _k, d in outputs if start <= t < end)
def build_session_context( def build_session_context(
events: Iterable[AsciinemaEvent], events: Iterable[AsciinemaEvent],
*, *,
@@ -127,6 +179,15 @@ def build_session_context(
max(0.0, inputs[i][0] - inputs[i - 1][0]) for i in range(1, len(inputs)) max(0.0, inputs[i][0] - inputs[i - 1][0]) for i in range(1, len(inputs))
) )
paste_bursts, paste_count = _detect_paste_bursts(inputs) paste_bursts, paste_count = _detect_paste_bursts(inputs)
commands = _segment_commands(inputs)
inter_cmd_iats = tuple(
max(0.0, commands[i + 1].start_ts - commands[i].end_ts)
for i in range(len(commands) - 1)
)
output_per_cmd = tuple(
_output_bytes_between(outputs, commands[i].end_ts, commands[i + 1].start_ts)
for i in range(len(commands) - 1)
)
return SessionContext( return SessionContext(
sid=sid, sid=sid,
@@ -140,4 +201,7 @@ def build_session_context(
iats=iats, iats=iats,
paste_bursts=paste_bursts, paste_bursts=paste_bursts,
paste_event_count=paste_count, paste_event_count=paste_count,
commands=commands,
inter_cmd_iats=inter_cmd_iats,
output_per_cmd=output_per_cmd,
) )

View File

@@ -13,6 +13,7 @@ turn a raw JSON string into one.
""" """
from __future__ import annotations from __future__ import annotations
import hashlib
import json import json
from dataclasses import dataclass from dataclasses import dataclass
from typing import Iterable, Iterator, Literal, Tuple from typing import Iterable, Iterator, Literal, Tuple
@@ -23,18 +24,7 @@ AsciinemaEvent = Tuple[float, EventKind, str]
@dataclass(frozen=True, slots=True) @dataclass(frozen=True, slots=True)
class PasteBurst: class PasteBurst:
"""Contiguous run of paste-class input events. """Contiguous run of paste-class input events."""
A paste-class event is a single input event whose ``data`` length
is at least ``PASTE_MIN_CHARS_PER_EVENT`` — terminal pastes from
xterm/kitty/iTerm arrive as one bulk write, so checking event size
is the cheap-and-correct proxy for the bracketed-paste signal we
don't get to see.
Multiple consecutive paste-class events with low IATs collapse
into one ``PasteBurst`` for higher-level reasoning (paste-rate /
paste-style classification later).
"""
start_ts: float start_ts: float
end_ts: float end_ts: float
@@ -42,6 +32,33 @@ class PasteBurst:
event_count: int event_count: int
@dataclass(frozen=True, slots=True)
class Command:
"""One command-line invocation, segmented from the input stream.
PII discipline (per ``BEHAVE-INTEGRATION.md`` and the BEHAVE
envelope's pinned policy): only the *first token* of the command
is retained, and only as a sha256 hash. The full command body
never enters the engine's data structures, never goes to the bus,
never ends up in the database. ``first_token_hash`` lets
cognitive.command_branch_diversity (Step 6) count distinct
invocations without learning anything about argument values.
``end_ts`` is the timestamp of the ``\\r`` / ``\\n`` that
terminated the command; ``start_ts`` is the first character typed
or pasted into it.
"""
start_ts: float
end_ts: float
first_token_hash: str
def hash_token(token: str) -> str:
    """sha256-hex of a token; the only PII-safe handle on a command.

    Args:
        token: The text to hash.

    Returns:
        The 64-character lowercase hex digest of the token's UTF-8
        encoding.
    """
    # Recorded terminal input can carry lone surrogates (json.loads
    # accepts "\udXXX" escapes). A strict UTF-8 encode would raise on
    # them; "surrogatepass" hashes them instead and leaves the digest
    # of every well-formed string unchanged.
    encoded = token.encode("utf-8", errors="surrogatepass")
    return hashlib.sha256(encoded).hexdigest()
def parse_shard_line(line: str) -> AsciinemaEvent | None: def parse_shard_line(line: str) -> AsciinemaEvent | None:
"""Turn one shard JSONL line into an :data:`AsciinemaEvent`. """Turn one shard JSONL line into an :data:`AsciinemaEvent`.

View File

@@ -0,0 +1,101 @@
"""Step 4: command segmentation in SessionContext.
PII discipline: full command body never enters the engine. Only the
first-token sha256 hash is retained.
"""
from __future__ import annotations
from decnet.profiler.behave_shell._ctx import build_session_context
from decnet.profiler.behave_shell._parse import AsciinemaEvent, hash_token
def _ctx(events: list[AsciinemaEvent]):
    """Build a session context for *events* using fixed test sid/source."""
    session = build_session_context(events, sid="t-cmds", source="test")
    return session
def test_no_input_means_no_commands() -> None:
    """An empty event stream leaves every command-derived tuple empty."""
    empty = _ctx([])
    for derived in (empty.commands, empty.inter_cmd_iats, empty.output_per_cmd):
        assert derived == ()
def test_unterminated_input_yields_no_command() -> None:
    """Input that never reaches a line terminator is not a command."""
    # Two typed fragments, neither of which contains a newline.
    typed: list[AsciinemaEvent] = [(0.0, "i", "ls"), (0.1, "i", " -la")]
    assert _ctx(typed).commands == ()
def test_single_command_carriage_return_terminator() -> None:
    """Keystrokes ended by a carriage return form exactly one command."""
    keystrokes: list[AsciinemaEvent] = [
        (0.0, "i", "l"),
        (0.1, "i", "s"),
        (0.2, "i", "\r"),
    ]
    segmented = _ctx(keystrokes).commands
    assert len(segmented) == 1
    only = segmented[0]
    # Starts at the first keystroke, ends at the terminator.
    assert only.start_ts == 0.0
    assert only.end_ts == 0.2
    assert only.first_token_hash == hash_token("ls")
def test_paste_event_with_full_command() -> None:
    """A whole command line pasted as one event, carriage return included."""
    pasted: list[AsciinemaEvent] = [(1.0, "i", "echo hello world\r")]
    segmented = _ctx(pasted).commands
    assert len(segmented) == 1
    only = segmented[0]
    assert only.first_token_hash == hash_token("echo")
    # The single event supplies both boundary timestamps.
    assert only.start_ts == 1.0
    assert only.end_ts == 1.0
def test_lf_terminator_also_segments() -> None:
    """A line feed ends a command just as a carriage return does."""
    line: list[AsciinemaEvent] = [(0.0, "i", "ls -la\n")]
    segmented = _ctx(line).commands
    assert len(segmented) == 1
    assert segmented[0].first_token_hash == hash_token("ls")
def test_multiple_commands_get_inter_cmd_iats() -> None:
    """Three commands produce two end-to-next-start gaps."""
    typed: list[AsciinemaEvent] = [
        # cmd "a": ends at 0.1
        (0.0, "i", "a"),
        (0.1, "i", "\r"),
        # cmd "b": starts at 1.5, ends at 1.6
        (1.5, "i", "b"),
        (1.6, "i", "\r"),
        # cmd "c": starts at 3.0
        (3.0, "i", "c"),
        (3.1, "i", "\r"),
    ]
    session = _ctx(typed)
    assert len(session.commands) == 3
    # Each gap runs from one command's end to the next command's start.
    expected_gaps = (1.5 - 0.1, 3.0 - 1.6)
    assert session.inter_cmd_iats == expected_gaps
def test_output_per_cmd_counts_bytes_between_command_boundaries() -> None:
    """Output between two commands is attributed to the first of them."""
    stream: list[AsciinemaEvent] = [
        (0.0, "i", "ls\r"),
        (0.1, "o", "file.txt\r\n"),   # 10 bytes
        (0.2, "o", "other.txt\r\n"),  # 11 bytes
        (1.0, "i", "ps\r"),
        # 9 bytes, but after the second command: outside any pair.
        (1.1, "o", "PID TTY\r\n"),
    ]
    session = _ctx(stream)
    assert len(session.commands) == 2
    # Window is (cmd0.end_ts, cmd1.start_ts) = (0.0, 1.0): 10 + 11 = 21.
    assert session.output_per_cmd == (21,)
def test_first_token_only_hashes_first_word() -> None:
    """Only the leading word is hashed; argument text is not retained."""
    typed: list[AsciinemaEvent] = [(0.0, "i", "curl -sS https://target/\r")]
    session = _ctx(typed)
    assert session.commands[0].first_token_hash == hash_token("curl")
    # The argument value must not appear in the commands' repr.
    rendered = str(session.commands)
    assert "target" not in rendered
def test_blank_line_does_not_emit_command() -> None:
    """Enter on an empty prompt is ignored; the next real line counts."""
    typed: list[AsciinemaEvent] = [(0.0, "i", "\r"), (0.5, "i", "ls\r")]
    segmented = _ctx(typed).commands
    assert len(segmented) == 1
    assert segmented[0].first_token_hash == hash_token("ls")