From e94ab608d9510329829f5a08db1d7e69c0448e41 Mon Sep 17 00:00:00 2001 From: anti Date: Fri, 8 May 2026 22:52:46 -0400 Subject: [PATCH] fix(profiler/behave_shell): tolerate non-UTF-8 bytes in shard reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real-world bug surfaced on the first live decky run: sessrec.c's json_escape (decnet/templates/_shared/sessrec/sessrec.c:111-141) only escapes bytes < 0x20 + DEL — bytes >= 0x80 pass through raw. An attacker pasting Latin-1 / GB18030 / any non-UTF-8 8-bit text yields a shard line that chokes Python's default UTF-8 text-mode read with 'utf-8 codec can't decode byte 0xac'. Three changes: 1. _events_for_sid now opens with errors='surrogateescape', preserving byte fidelity through the JSON parse. Surrogate-half chars correctly fail isascii() / isalpha() so the typed-letter histograms filter them out automatically. Tightening sessrec.c to escape >= 0x80 is filed for v0.2 — that's the proper forensic-data fix; the surrogateescape read makes the engine robust meanwhile. 2. Regression test (test_handler_tolerates_non_utf8_bytes_in_shard) builds a shard with raw 0xAC bytes inside a JSON 'data' string and asserts the handler still persists observations. 3. Collector's _emit_session now logs at WARNING (was DEBUG) when find_shard_with_sid returns None, citing the three usual causes (ARTIFACTS_ROOT perms, _SERVICE_RE whitelist, sessrec/collector race). Surfaces the silent-skip class of bug in seconds instead of hours — the first live run hid a perm mismatch (User=anti without SupplementaryGroups=decnet) for an entire session window before the symptom was traced upstream. --- decnet/collector/worker.py | 24 ++++++++++-- decnet/profiler/behave_shell/_handler.py | 10 ++++- .../test_handler_session_ended.py | 38 ++++++++++++++++++- 3 files changed, 66 insertions(+), 6 deletions(-) diff --git a/decnet/collector/worker.py b/decnet/collector/worker.py index 3a95bde7..7c1f4e12 100644 --- a/decnet/collector/worker.py +++ b/decnet/collector/worker.py @@ -313,16 +313,34 @@ class _SessionAggregator: # consumer skips honestly. Additive field; existing TTP consumers # ignore it. shard_path: str | None = None + resolve_error: str | None = None if sid and decky and service: try: resolved = find_shard_with_sid(decky, service, sid) except (ValueError, OSError, PermissionError) as exc: - logger.debug( - "collector: shard resolve failed for sid=%s: %s", sid, exc, - ) + resolve_error = f"{type(exc).__name__}: {exc}" resolved = None if resolved is not None: shard_path = str(resolved) + if shard_path is None and sid: + # Loud-by-default — the BEHAVE-SHELL handler will skip + # session.ended events with shard_path=None, so a silent + # miss here means the profiler panel never hydrates. Surface + # the most common failure modes inline so the operator can + # diagnose without grepping decnet/artifacts/shards.py. + # + # 1. ARTIFACTS_ROOT not readable by the collector's user + # (perm 0750 decnet:decnet vs. User=anti without + # SupplementaryGroups=decnet). + # 2. service whitelist (_SERVICE_RE accepts ssh|telnet only). + # 3. sessrec hasn't flushed the shard for this sid yet + # (collector tick won the race; next tick recovers). + logger.warning( + "collector: shard_path=None decky=%s service=%s sid=%s " + "(error=%s) — profiler will skip this session.ended; " + "check ARTIFACTS_ROOT perms / service whitelist", + decky, service, sid, resolve_error or "shard not found", + ) payload: dict[str, Any] = { "session_id": sid or None, diff --git a/decnet/profiler/behave_shell/_handler.py b/decnet/profiler/behave_shell/_handler.py index 777a8aa0..f42d59b3 100644 --- a/decnet/profiler/behave_shell/_handler.py +++ b/decnet/profiler/behave_shell/_handler.py @@ -53,9 +53,17 @@ def _events_for_sid(shard: Path, sid: str) -> list[AsciinemaEvent]: Mirrors the loader pattern in ``tests/profiler/behave_shell/test_calibration_grid.py``: skip headers / non-matching sids / unparseable lines silently. + + ``errors="surrogateescape"`` because sessrec.c's json_escape only + escapes bytes < 0x20 + DEL — bytes >= 0x80 pass through raw, so + a real shard with Latin-1 / GB18030 / arbitrary 8-bit attacker + paste content is NOT valid UTF-8. surrogateescape preserves byte + fidelity through the JSON read; downstream isalpha() / isascii() + correctly filter the surrogate-half chars out of the typed-letter + histograms. Filed for v0.2: tighten sessrec.c to escape >= 0x80. """ events: list[AsciinemaEvent] = [] - with shard.open() as f: + with shard.open(encoding="utf-8", errors="surrogateescape") as f: for line in f: try: rec = json.loads(line) diff --git a/tests/profiler/behave_shell/test_handler_session_ended.py b/tests/profiler/behave_shell/test_handler_session_ended.py index d3fa6f33..2e3be1da 100644 --- a/tests/profiler/behave_shell/test_handler_session_ended.py +++ b/tests/profiler/behave_shell/test_handler_session_ended.py @@ -10,8 +10,6 @@ import json from typing import Any from unittest.mock import AsyncMock -import pytest - from decnet.profiler.behave_shell._handler import ( _build_evidence_ref, handle_session_ended, @@ -180,6 +178,42 @@ async def test_publish_none_is_silent(tmp_path) -> None: assert n > 0 +async def test_handler_tolerates_non_utf8_bytes_in_shard(tmp_path) -> None: + """Real-world regression: sessrec.c's json_escape only escapes + bytes < 0x20 + DEL, so bytes >= 0x80 pass through raw. An attacker + pasting Latin-1 (or any non-UTF-8) text yields a shard line that + chokes Python's default UTF-8 text-mode read. The handler must + open with errors='surrogateescape' and process the session anyway. + """ + sid = _SID + shard_dir = tmp_path / _DECKY / _SERVICE / "transcripts" + shard_dir.mkdir(parents=True, exist_ok=True) + shard = shard_dir / "sessions-2026-05-09.jsonl" + # Header (valid JSON). + header = b'{"sid":"' + sid.encode() + b'","hdr":{"version":2,"width":80,"height":24,"timestamp":1}}\n' + # Synthetic typing burst — six commands so the calibration floor fires. + body_lines: list[bytes] = [] + text = "ls\rps\rid\rwhoami\rpwd\runame\r" + for i, c in enumerate(text): + body_lines.append( + f'{{"sid":"{sid}","t":{i * 0.05},"ch":"i","d":"{c}"}}\n'.encode() + ) + # ONE output event with a raw 0xAC byte inside the data string — + # exactly what sessrec emits for Latin-1 content. + body_lines.append( + b'{"sid":"' + sid.encode() + + b'","t":5.0,"ch":"o","d":"prefix\xac\xac\xacsuffix"}\n' + ) + with shard.open("wb") as f: + f.write(header) + for line in body_lines: + f.write(line) + + repo = _make_repo() + n = await handle_session_ended(repo, _payload(str(shard)), None) + assert n > 0, "handler should persist at least one observation despite raw bytes" + + async def test_attacker_uuid_in_payload_for_filter(tmp_path) -> None: """Phase 5 amendment: every published observation carries the DECNET-side ``attacker_uuid`` denorm (NOT the BEHAVE