feat(profiler/behave_shell): command segmentation in SessionContext
BEHAVE-EXTRACTOR.md Phase A, Step 4. Pure refactor inside _ctx.py — no new features are emitted. Lays down the shared utility for the three cognitive primitives that come next (Steps 5-7).

* Command dataclass (frozen): start_ts, end_ts, first_token_hash. PII-safe by construction — only the first whitespace-delimited token of the command is retained, and only as a sha256 hash (decnet/profiler/behave_shell/_parse.py:hash_token).
* _segment_commands walks input events char-by-char, splits on \r / \n, hashes the first token, and drops the rest.
* SessionContext gains commands, inter_cmd_iats, and output_per_cmd. output_per_cmd[i] counts the output bytes between commands[i].end_ts and commands[i+1].start_ts — the natural pairing for Step 7 (feedback_loop_engagement).

Tests: empty / unterminated streams, single command (CR and LF terminators), paste-with-newline, multi-command IAT pairing, output-byte counting between boundaries, blank-line skip, first-token-only PII discipline.
This commit is contained in:
@@ -13,7 +13,12 @@ from __future__ import annotations
|
|||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
from decnet.profiler.behave_shell._parse import AsciinemaEvent, PasteBurst
|
from decnet.profiler.behave_shell._parse import (
|
||||||
|
AsciinemaEvent,
|
||||||
|
Command,
|
||||||
|
PasteBurst,
|
||||||
|
hash_token,
|
||||||
|
)
|
||||||
from decnet.profiler.behave_shell._thresholds import (
|
from decnet.profiler.behave_shell._thresholds import (
|
||||||
PASTE_BURST_MAX_IAT_S,
|
PASTE_BURST_MAX_IAT_S,
|
||||||
PASTE_MIN_CHARS_PER_EVENT,
|
PASTE_MIN_CHARS_PER_EVENT,
|
||||||
@@ -37,6 +42,11 @@ class SessionContext:
|
|||||||
paste_bursts: tuple[PasteBurst, ...] = field(default_factory=tuple)
|
paste_bursts: tuple[PasteBurst, ...] = field(default_factory=tuple)
|
||||||
paste_event_count: int = 0
|
paste_event_count: int = 0
|
||||||
|
|
||||||
|
# Step 4 derivations — command segmentation
|
||||||
|
commands: tuple[Command, ...] = field(default_factory=tuple)
|
||||||
|
inter_cmd_iats: tuple[float, ...] = field(default_factory=tuple)
|
||||||
|
output_per_cmd: tuple[int, ...] = field(default_factory=tuple)
|
||||||
|
|
||||||
|
|
||||||
def _detect_paste_bursts(
|
def _detect_paste_bursts(
|
||||||
inputs: list[AsciinemaEvent],
|
inputs: list[AsciinemaEvent],
|
||||||
@@ -92,6 +102,48 @@ def _detect_paste_bursts(
|
|||||||
return tuple(bursts), paste_count
|
return tuple(bursts), paste_count
|
||||||
|
|
||||||
|
|
||||||
|
def _segment_commands(inputs: list[AsciinemaEvent]) -> tuple[Command, ...]:
    """Split the input stream into commands on ``\\r`` / ``\\n``.

    Events are walked character by character, so one paste event that
    carries several terminated lines yields several commands.

    PII discipline: only the first whitespace-delimited token of each
    line is retained, and only via ``hash_token``. The buffered
    characters are discarded at every command boundary.

    Lines that are empty or contain nothing but whitespace produce no
    command, and an unterminated trailing buffer (no final newline) is
    dropped.

    Args:
        inputs: Input events as ``(timestamp, kind, data)`` tuples, in
            stream order.

    Returns:
        One ``Command`` per non-blank terminated line. ``start_ts`` is
        the timestamp of the event that supplied the line's first
        buffered character; ``end_ts`` is the timestamp of the event
        that supplied the terminator.
    """
    cmds: list[Command] = []
    buf_chars: list[str] = []
    buf_start_ts: float | None = None

    for t, _kind, data in inputs:
        for c in data:
            if c not in ("\r", "\n"):
                if not buf_chars:
                    # First character of a new line: remember when it arrived.
                    buf_start_ts = t
                buf_chars.append(c)
                continue

            # Terminator reached. ``str.split`` with no separator ignores
            # leading/trailing whitespace, so an empty or whitespace-only
            # buffer gives no tokens and is skipped instead of being
            # recorded as a command hashed from the empty string.
            tokens = "".join(buf_chars).split(maxsplit=1)
            if tokens:
                cmds.append(Command(
                    start_ts=buf_start_ts if buf_start_ts is not None else t,
                    end_ts=t,
                    first_token_hash=hash_token(tokens[0]),
                ))
            # Drop the raw text whether or not a command was emitted.
            buf_chars = []
            buf_start_ts = None

    return tuple(cmds)
|
||||||
|
|
||||||
|
|
||||||
|
def _output_bytes_between(
|
||||||
|
outputs: list[AsciinemaEvent],
|
||||||
|
start: float,
|
||||||
|
end: float,
|
||||||
|
) -> int:
|
||||||
|
"""Total ``len(d)`` of output events with ``start <= t < end``."""
|
||||||
|
return sum(len(d) for t, _k, d in outputs if start <= t < end)
|
||||||
|
|
||||||
|
|
||||||
def build_session_context(
|
def build_session_context(
|
||||||
events: Iterable[AsciinemaEvent],
|
events: Iterable[AsciinemaEvent],
|
||||||
*,
|
*,
|
||||||
@@ -127,6 +179,15 @@ def build_session_context(
|
|||||||
max(0.0, inputs[i][0] - inputs[i - 1][0]) for i in range(1, len(inputs))
|
max(0.0, inputs[i][0] - inputs[i - 1][0]) for i in range(1, len(inputs))
|
||||||
)
|
)
|
||||||
paste_bursts, paste_count = _detect_paste_bursts(inputs)
|
paste_bursts, paste_count = _detect_paste_bursts(inputs)
|
||||||
|
commands = _segment_commands(inputs)
|
||||||
|
inter_cmd_iats = tuple(
|
||||||
|
max(0.0, commands[i + 1].start_ts - commands[i].end_ts)
|
||||||
|
for i in range(len(commands) - 1)
|
||||||
|
)
|
||||||
|
output_per_cmd = tuple(
|
||||||
|
_output_bytes_between(outputs, commands[i].end_ts, commands[i + 1].start_ts)
|
||||||
|
for i in range(len(commands) - 1)
|
||||||
|
)
|
||||||
|
|
||||||
return SessionContext(
|
return SessionContext(
|
||||||
sid=sid,
|
sid=sid,
|
||||||
@@ -140,4 +201,7 @@ def build_session_context(
|
|||||||
iats=iats,
|
iats=iats,
|
||||||
paste_bursts=paste_bursts,
|
paste_bursts=paste_bursts,
|
||||||
paste_event_count=paste_count,
|
paste_event_count=paste_count,
|
||||||
|
commands=commands,
|
||||||
|
inter_cmd_iats=inter_cmd_iats,
|
||||||
|
output_per_cmd=output_per_cmd,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ turn a raw JSON string into one.
|
|||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Iterable, Iterator, Literal, Tuple
|
from typing import Iterable, Iterator, Literal, Tuple
|
||||||
@@ -23,18 +24,7 @@ AsciinemaEvent = Tuple[float, EventKind, str]
|
|||||||
|
|
||||||
@dataclass(frozen=True, slots=True)
|
@dataclass(frozen=True, slots=True)
|
||||||
class PasteBurst:
|
class PasteBurst:
|
||||||
"""Contiguous run of paste-class input events.
|
"""Contiguous run of paste-class input events."""
|
||||||
|
|
||||||
A paste-class event is a single input event whose ``data`` length
|
|
||||||
is at least ``PASTE_MIN_CHARS_PER_EVENT`` — terminal pastes from
|
|
||||||
xterm/kitty/iTerm arrive as one bulk write, so checking event size
|
|
||||||
is the cheap-and-correct proxy for the bracketed-paste signal we
|
|
||||||
don't get to see.
|
|
||||||
|
|
||||||
Multiple consecutive paste-class events with low IATs collapse
|
|
||||||
into one ``PasteBurst`` for higher-level reasoning (paste-rate /
|
|
||||||
paste-style classification later).
|
|
||||||
"""
|
|
||||||
|
|
||||||
start_ts: float
|
start_ts: float
|
||||||
end_ts: float
|
end_ts: float
|
||||||
@@ -42,6 +32,33 @@ class PasteBurst:
|
|||||||
event_count: int
|
event_count: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True, slots=True)
|
||||||
|
class Command:
|
||||||
|
"""One command-line invocation, segmented from the input stream.
|
||||||
|
|
||||||
|
PII discipline (per ``BEHAVE-INTEGRATION.md`` and the BEHAVE
|
||||||
|
envelope's pinned policy): only the *first token* of the command
|
||||||
|
is retained, and only as a sha256 hash. The full command body
|
||||||
|
never enters the engine's data structures, never goes to the bus,
|
||||||
|
never ends up in the database. ``first_token_hash`` lets
|
||||||
|
cognitive.command_branch_diversity (Step 6) count distinct
|
||||||
|
invocations without learning anything about argument values.
|
||||||
|
|
||||||
|
``end_ts`` is the timestamp of the ``\\r`` / ``\\n`` that
|
||||||
|
terminated the command; ``start_ts`` is the first character typed
|
||||||
|
or pasted into it.
|
||||||
|
"""
|
||||||
|
|
||||||
|
start_ts: float
|
||||||
|
end_ts: float
|
||||||
|
first_token_hash: str
|
||||||
|
|
||||||
|
|
||||||
|
def hash_token(token: str) -> str:
    """Return the sha256 hex digest of ``token``; the only PII-safe handle on a command.

    The token is encoded as UTF-8 with the ``surrogatepass`` error
    handler. Well-formed strings hash exactly as with a plain UTF-8
    encode, while a token containing a lone surrogate still gets a
    deterministic digest instead of raising ``UnicodeEncodeError``.
    Lone surrogates can appear because ``json.loads`` accepts unpaired
    ``\\udXXX`` escapes.

    Args:
        token: The text to hash.

    Returns:
        A 64-character lowercase hexadecimal digest.
    """
    return hashlib.sha256(token.encode("utf-8", "surrogatepass")).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def parse_shard_line(line: str) -> AsciinemaEvent | None:
|
def parse_shard_line(line: str) -> AsciinemaEvent | None:
|
||||||
"""Turn one shard JSONL line into an :data:`AsciinemaEvent`.
|
"""Turn one shard JSONL line into an :data:`AsciinemaEvent`.
|
||||||
|
|
||||||
|
|||||||
101
tests/profiler/behave_shell/test_command_segmentation.py
Normal file
101
tests/profiler/behave_shell/test_command_segmentation.py
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
"""Step 4: command segmentation in SessionContext.
|
||||||
|
|
||||||
|
PII discipline: full command body never enters the engine. Only the
|
||||||
|
first-token sha256 hash is retained.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from decnet.profiler.behave_shell._ctx import build_session_context
|
||||||
|
from decnet.profiler.behave_shell._parse import AsciinemaEvent, hash_token
|
||||||
|
|
||||||
|
|
||||||
|
def _ctx(events: list[AsciinemaEvent]):
    """Build a session context for ``events`` using fixed test sid/source values."""
    session = build_session_context(events, sid="t-cmds", source="test")
    return session
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_input_means_no_commands() -> None:
    """An empty event stream leaves every command-level tuple empty."""
    session = _ctx([])
    assert session.commands == ()
    assert session.inter_cmd_iats == ()
    assert session.output_per_cmd == ()
|
||||||
|
|
||||||
|
|
||||||
|
def test_unterminated_input_yields_no_command() -> None:
    """Keystrokes that are never followed by a newline form no command."""
    # Neither event carries a \r or \n, so no boundary is ever observed.
    typed: list[AsciinemaEvent] = [
        (0.0, "i", "ls"),
        (0.1, "i", " -la"),
    ]
    assert _ctx(typed).commands == ()
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_command_carriage_return_terminator() -> None:
    """Typing ``l``, ``s``, then ``\\r`` yields one command spanning 0.0 to 0.2."""
    keystrokes: list[AsciinemaEvent] = [
        (0.0, "i", "l"),
        (0.1, "i", "s"),
        (0.2, "i", "\r"),
    ]
    segmented = _ctx(keystrokes).commands
    assert len(segmented) == 1

    only = segmented[0]
    assert only.start_ts == 0.0
    assert only.end_ts == 0.2
    assert only.first_token_hash == hash_token("ls")
|
||||||
|
|
||||||
|
|
||||||
|
def test_paste_event_with_full_command() -> None:
    """A whole command line pasted in one event, ``\\r`` included, is one command."""
    pasted: list[AsciinemaEvent] = [(1.0, "i", "echo hello world\r")]
    segmented = _ctx(pasted).commands
    assert len(segmented) == 1

    only = segmented[0]
    assert only.first_token_hash == hash_token("echo")
    # Text and terminator share one event, so both timestamps match it.
    assert only.start_ts == 1.0
    assert only.end_ts == 1.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_lf_terminator_also_segments() -> None:
    """A line ending in ``\\n`` is segmented just like one ending in ``\\r``."""
    stream: list[AsciinemaEvent] = [(0.0, "i", "ls -la\n")]
    segmented = _ctx(stream).commands
    assert len(segmented) == 1
    assert segmented[0].first_token_hash == hash_token("ls")
|
||||||
|
|
||||||
|
|
||||||
|
def test_multiple_commands_get_inter_cmd_iats() -> None:
    """Three commands give two gaps, each measured from end to next start."""
    stream: list[AsciinemaEvent] = [
        # cmd "a": starts at 0.0, ends at 0.1
        (0.0, "i", "a"),
        (0.1, "i", "\r"),
        # cmd "b": starts at 1.5, ends at 1.6
        (1.5, "i", "b"),
        (1.6, "i", "\r"),
        # cmd "c": starts at 3.0
        (3.0, "i", "c"),
        (3.1, "i", "\r"),
    ]
    session = _ctx(stream)
    assert len(session.commands) == 3
    # Each gap runs from one command's end_ts to the next command's start_ts.
    expected_gaps = (1.5 - 0.1, 3.0 - 1.6)
    assert session.inter_cmd_iats == expected_gaps
|
||||||
|
|
||||||
|
|
||||||
|
def test_output_per_cmd_counts_bytes_between_command_boundaries() -> None:
    """Output between two commands is attributed to the first of the pair."""
    stream: list[AsciinemaEvent] = [
        (0.0, "i", "ls\r"),
        (0.1, "o", "file.txt\r\n"),   # 10 bytes
        (0.2, "o", "other.txt\r\n"),  # 11 bytes
        (1.0, "i", "ps\r"),
        (1.1, "o", "PID TTY\r\n"),    # 9 bytes, after cmd 2: outside any paired window
    ]
    session = _ctx(stream)
    assert len(session.commands) == 2
    # The window is (cmd0.end_ts, cmd1.start_ts) = (0.0, 1.0); the 21 bytes
    # written at 0.1 and 0.2 fall inside it.
    assert session.output_per_cmd == (21,)
|
||||||
|
|
||||||
|
|
||||||
|
def test_first_token_only_hashes_first_word() -> None:
    """Only the leading word is hashed; arguments leave no trace in the commands."""
    stream: list[AsciinemaEvent] = [(0.0, "i", "curl -sS https://target/\r")]
    segmented = _ctx(stream).commands
    assert segmented[0].first_token_hash == hash_token("curl")
    # The argument text must not appear in the stored commands' string form.
    assert "target" not in str(segmented)
|
||||||
|
|
||||||
|
|
||||||
|
def test_blank_line_does_not_emit_command() -> None:
    """Pressing Enter at an empty prompt does not count as a command."""
    stream: list[AsciinemaEvent] = [
        (0.0, "i", "\r"),    # bare Enter, nothing typed
        (0.5, "i", "ls\r"),
    ]
    segmented = _ctx(stream).commands
    assert len(segmented) == 1
    assert segmented[0].first_token_hash == hash_token("ls")
|
||||||
Reference in New Issue
Block a user