feat(profiler/behave_shell): scaffold extract_session entry point

BEHAVE-EXTRACTOR.md Phase A Step 0. Lays the package skeleton
(__init__/extract/_parse/_ctx/_thresholds/_features) with empty
FEATURES = (), so the worker plumbing in BEHAVE-INTEGRATION Phase 4
has a stable import path before any primitive lands.

extract_session() builds a SessionContext once and fans the
registered feature functions across it; at Step 0 that fan-out is
empty and the function yields nothing. Step 1 (asciinema parser +
paste-burst detector) and Step 2 (motor.input_modality) land next.

Smoke suite asserts the empty contract: empty stream → no
observations, single event → t_start == t_end, multi-event → events
routed into input_events / output_events by kind, evidence_ref
defaults to "session:<sid>" or honours an explicit override.
This commit is contained in:
2026-05-03 07:42:09 -04:00
parent a2a61b636e
commit f8eae04e5d
8 changed files with 286 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
"""BEHAVE-SHELL extraction engine — DECNET's official implementation.
Per ``development/BEHAVE-EXTRACTOR.md``: this package is a pure
library. Workers (``BEHAVE-INTEGRATION.md`` Phase 4) own I/O, bus
emission, and persistence. The engine just turns one PTY session into
``Iterable[Observation]``.
BEHAVE is the spec; DECNET is the engine.
"""
from __future__ import annotations
from decnet.profiler.behave_shell.extract import (
DEFAULT_SOURCE,
build_context,
extract_session,
)
# Public surface of the package: exactly the three names re-exported from
# ``decnet.profiler.behave_shell.extract`` by the import above.
__all__ = ["DEFAULT_SOURCE", "build_context", "extract_session"]

View File

@@ -0,0 +1,78 @@
"""SessionContext: precomputed bundle every feature function reads from.
A naïve engine re-walks the event stream once per primitive. We don't
do that — one walk over the events builds this context, every feature
reads from it. Adding a new feature is O(1) cost on the parse side.
Step 0 ships only the structural fields (sid / source / evidence_ref /
timing envelope). Step 1+ fills ``iats`` / ``paste_bursts`` /
``commands`` / ``inter_cmd_iats`` / ``output_per_cmd``.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Iterable
from decnet.profiler.behave_shell._parse import AsciinemaEvent
@dataclass(frozen=True, slots=True)
class SessionContext:
    """Immutable, precomputed view of one session shared by all features.

    Instances are produced by ``build_session_context``; the dataclass is
    frozen and slotted, so fields cannot be reassigned or added after
    construction.
    """

    # Session identifier supplied by the caller.
    sid: str
    # Caller-supplied source tag, stored unchanged.
    source: str
    # Evidence pointer; ``build_session_context`` falls back to
    # ``"session:<sid>"`` when the caller passes none.
    evidence_ref: str
    # Start of the timing envelope (0.0 for an empty stream).
    t_start: float
    # End of the timing envelope (0.0 for an empty stream).
    t_end: float
    # ``t_end - t_start``, clamped so it is never negative.
    duration_s: float
    # Events whose kind is ``"i"``, in stream order.
    input_events: tuple[AsciinemaEvent, ...] = field(default_factory=tuple)
    # Events whose kind is ``"o"``, in stream order.
    output_events: tuple[AsciinemaEvent, ...] = field(default_factory=tuple)
def build_session_context(
    events: Iterable[AsciinemaEvent],
    *,
    sid: str,
    source: str,
    evidence_ref: str | None = None,
) -> SessionContext:
    """Build the ``SessionContext`` for ``events`` in a single pass.

    Args:
        events: Iterable of ``(t, kind, data)`` tuples. It is consumed
            exactly once, so a one-shot iterator is acceptable.
        sid: Session identifier stored on the context.
        source: Source tag stored on the context unchanged.
        evidence_ref: Explicit evidence pointer. When ``None`` (or empty)
            it defaults to ``"session:" + sid`` so callers that don't yet
            plumb a real evidence pointer still get a stable string.
            Workers should pass an explicit pointer to the on-disk shard.

    Returns:
        A ``SessionContext`` whose ``t_start`` is the timestamp of the
        first event, whose ``t_end`` is the largest timestamp seen, and
        whose ``input_events`` / ``output_events`` hold the ``"i"`` /
        ``"o"`` events in stream order. An empty stream yields a
        zero-duration context anchored at ``0.0``.
    """
    inputs: list[AsciinemaEvent] = []
    outputs: list[AsciinemaEvent] = []
    t_first: float | None = None
    # ``None`` rather than ``0.0`` as the "nothing seen yet" marker: a 0.0
    # seed would be reported as ``t_end`` for a stream whose timestamps are
    # all negative, even though no event carries that timestamp.
    t_last: float | None = None

    for ev in events:
        t, kind, _ = ev
        if t_first is None:
            t_first = t
        if t_last is None or t > t_last:
            t_last = t
        if kind == "i":
            inputs.append(ev)
        elif kind == "o":
            outputs.append(ev)
        # Any other kind still widens the timing envelope above but is
        # not stored in either bucket.

    if t_first is None or t_last is None:
        # Empty stream: well-formed zero-duration envelope.
        t_start = 0.0
        t_end = 0.0
    else:
        t_start = t_first
        t_end = t_last

    return SessionContext(
        sid=sid,
        source=source,
        evidence_ref=evidence_ref or f"session:{sid}",
        t_start=t_start,
        t_end=t_end,
        duration_s=max(0.0, t_end - t_start),
        input_events=tuple(inputs),
        output_events=tuple(outputs),
    )

View File

@@ -0,0 +1,20 @@
"""Registered feature functions.
Each entry takes a ``SessionContext`` and yields zero or more
``Observation`` instances. Adding a primitive = adding a function in a
sibling module and appending it to ``FEATURES``.
Step 0 ships an empty tuple — extract_session() is wired but emits
nothing until Step 2.
"""
from __future__ import annotations
from typing import Callable, Iterable
from decnet_behave_core.spec.envelope import Observation
from decnet.profiler.behave_shell._ctx import SessionContext
# Signature every registered feature must satisfy: it receives the shared
# SessionContext and returns an iterable of Observation objects.
FeatureFn = Callable[[SessionContext], Iterable[Observation]]
# Registry that ``extract_session`` iterates in order; intentionally empty
# at Step 0.
FEATURES: tuple[FeatureFn, ...] = ()

View File

@@ -0,0 +1,14 @@
"""Asciinema event types.
The on-disk shard format is a list of 3-tuples ``(t, kind, data)`` where
``t`` is seconds since session start (float), ``kind`` is ``'i'`` (input)
or ``'o'`` (output), and ``data`` is the captured bytes decoded as a
Python ``str``. Step 0 ships only the type aliases — Step 1 fills the
parsing helpers and paste-burst detector.
"""
from __future__ import annotations
from typing import Literal, Tuple
# Direction tag of an event: "i" for input, "o" for output.
EventKind = Literal["i", "o"]
# One captured event as ``(t, kind, data)``: timestamp in seconds (float),
# direction tag, and the captured data as ``str``.
AsciinemaEvent = Tuple[float, EventKind, str]

View File

@@ -0,0 +1,11 @@
"""Numeric thresholds for BEHAVE-SHELL primitive classification.
Each constant added here cites its calibration source. When the
registry's ``notes:`` field disagrees with a constant in this file the
registry is authoritative — fix the constant and re-run the
calibration grid.
Step 0 ships this file empty by design; thresholds land alongside the
feature functions that consume them (Steps 1+).
"""
from __future__ import annotations

View File

@@ -0,0 +1,51 @@
"""Public extraction entry point.
``extract_session`` is the only function workers call. It builds a
:class:`SessionContext` once and fans the registered feature functions
across it. Pure library: no I/O, no bus, no DB. The worker
(``BEHAVE-INTEGRATION.md`` Phase 4) is responsible for those.
"""
from __future__ import annotations
from typing import Iterable, Iterator
from decnet_behave_core.spec.envelope import Observation
from decnet.profiler.behave_shell._ctx import SessionContext, build_session_context
from decnet.profiler.behave_shell._features import FEATURES
from decnet.profiler.behave_shell._parse import AsciinemaEvent
# Default ``source`` value used by ``extract_session`` / ``build_context``
# when the caller does not supply one.
DEFAULT_SOURCE = "decnet/profiler/behave_shell/extract.py"
def extract_session(
    events: Iterable[AsciinemaEvent],
    *,
    sid: str,
    source: str = DEFAULT_SOURCE,
    evidence_ref: str | None = None,
) -> Iterator[Observation]:
    """Stream the BEHAVE-SHELL observations produced for one session.

    The session context is built once from ``events`` and then handed to
    each function registered in ``FEATURES``, in registration order.
    Because this is a generator, nothing runs (including the context
    build) until the result is iterated.

    Args:
        events: Iterable of ``(t, kind, data)`` tuples — see
            ``_parse.AsciinemaEvent``.
        sid: Identifies the session for evidence pointers and downstream
            joins.
        source: Source tag stored on the context.
        evidence_ref: Optional explicit evidence pointer, forwarded to
            the context builder.

    Yields:
        Every observation emitted by every registered feature function.
    """
    session_ctx = build_session_context(
        events,
        sid=sid,
        source=source,
        evidence_ref=evidence_ref,
    )
    for run_feature in FEATURES:
        yield from run_feature(session_ctx)
def build_context(
    events: Iterable[AsciinemaEvent],
    *,
    sid: str,
    source: str = DEFAULT_SOURCE,
    evidence_ref: str | None = None,
) -> SessionContext:
    """Return the ``SessionContext`` for ``events`` without running features.

    Public wrapper around ``build_session_context`` for tests and future
    debug tools; all arguments are forwarded unchanged, with ``source``
    defaulting to ``DEFAULT_SOURCE``.
    """
    session_ctx = build_session_context(
        events,
        sid=sid,
        source=source,
        evidence_ref=evidence_ref,
    )
    return session_ctx

View File

View File

@@ -0,0 +1,94 @@
"""Step 0 smoke: prove the wiring before any logic.
Before any feature function lands, verify:
* the public ``extract_session`` import path resolves;
* an empty event stream yields zero observations and a well-formed
zero-duration ``SessionContext``;
* a single input event yields a context with ``t_start == t_end``
and ``duration_s == 0.0``;
* a multi-event stream populates ``t_start`` / ``t_end`` /
``duration_s`` correctly and routes events into the
``input_events`` / ``output_events`` slots by kind;
* ``FEATURES`` is empty at Step 0 — the empty contract is the gate
that the next step must intentionally break.
"""
from __future__ import annotations
from decnet.profiler.behave_shell import (
DEFAULT_SOURCE,
build_context,
extract_session,
)
from decnet.profiler.behave_shell._features import FEATURES
from decnet.profiler.behave_shell._parse import AsciinemaEvent
def test_features_tuple_is_empty_at_step_0() -> None:
    """Step 0 gate: the feature registry holds no functions yet."""
    no_features: tuple = ()
    assert FEATURES == no_features
def test_default_source_is_canonical_path() -> None:
    """``DEFAULT_SOURCE`` is pinned to the extract module's path string."""
    canonical_path = "decnet/profiler/behave_shell/extract.py"
    assert DEFAULT_SOURCE == canonical_path
def test_extract_session_empty_stream_yields_no_observations() -> None:
    """Draining ``extract_session`` over an empty stream gives an empty list."""
    observations = list(extract_session([], sid="sess-empty"))
    assert observations == []
def test_build_context_empty_stream_zero_duration() -> None:
    """An empty stream produces a well-formed, zero-duration context."""
    session_id = "sess-empty"
    ctx = build_context([], sid=session_id)

    # Identity fields: sid echoed back, defaults applied for the rest.
    assert ctx.sid == session_id
    assert ctx.source == DEFAULT_SOURCE
    assert ctx.evidence_ref == "session:sess-empty"

    # Timing envelope collapses to zero.
    assert (ctx.t_start, ctx.t_end, ctx.duration_s) == (0.0, 0.0, 0.0)

    # No events routed anywhere.
    assert (ctx.input_events, ctx.output_events) == ((), ())
def test_build_context_single_input_event() -> None:
    """One input event: start and end coincide and duration is zero."""
    only_event: AsciinemaEvent = (1.5, "i", "a")
    ctx = build_context([only_event], sid="sess-1")

    assert (ctx.t_start, ctx.t_end) == (1.5, 1.5)
    assert ctx.duration_s == 0.0
    assert ctx.input_events == (only_event,)
    assert ctx.output_events == ()
def test_build_context_multi_event_routes_by_kind() -> None:
    """Mixed stream: envelope spans all events, buckets split by kind."""
    first_input: AsciinemaEvent = (0.0, "i", "l")
    last_output: AsciinemaEvent = (0.3, "o", "file.txt\r\n")
    stream: list[AsciinemaEvent] = [
        first_input,
        (0.1, "i", "s"),
        (0.2, "o", "ls\r\n"),
        last_output,
        (0.5, "i", "\r"),
    ]

    ctx = build_context(stream, sid="sess-multi")

    assert ctx.t_start == 0.0
    assert ctx.t_end == 0.5
    assert ctx.duration_s == 0.5
    assert len(ctx.input_events) == 3
    assert len(ctx.output_events) == 2
    # Within each bucket the stream order must be preserved.
    assert ctx.input_events[0] == first_input
    assert ctx.output_events[-1] == last_output
def test_extract_session_explicit_evidence_ref_overrides_default() -> None:
    """An explicit ``evidence_ref`` is stored verbatim on the built context.

    Exercised through ``build_context``, which exposes the context that
    ``extract_session`` would build.
    """
    shard_pointer = "shard:/var/log/d/sess-x.cast"
    single_event = [(0.0, "i", "x")]
    ctx = build_context(single_event, sid="sess-x", evidence_ref=shard_pointer)
    assert ctx.evidence_ref == shard_pointer
def test_extract_session_with_features_still_empty() -> None:
    """With no features registered, a populated stream still emits nothing."""
    keystrokes: list[AsciinemaEvent] = []
    for tick, char in enumerate("hello\r"):
        keystrokes.append((tick / 10.0, "i", char))

    observations = list(extract_session(keystrokes, sid="sess-features-empty"))
    assert observations == []