feat(ttp): EmailLifter disk-reach for body-aware predicates (DEBT-047)

R0047 (BEC) and the encoded-payload predicate substring-match against
the email body. Shipping raw body text on the abstracted service bus
is the wrong privacy stance — the bus transport may swap from UNIX
socket to networked at any time, and "loopback today" is not a license
to put PII on the wire.

EmailLifter now opens the .eml lazily from
/var/lib/decnet/artifacts/{decky_id}/smtp/{stored_as} when a body-aware
predicate runs and parses the body in-process via stdlib email +
policy.default. The decoded body is memoized into the payload dict so
multiple body-aware predicates on the same event open the file once.

The bus envelope carries only the artifact pointer (decky_id + stored_as);
raw body bytes stay on the host's disk and never travel the agent → master
hop. Filesystem access on agents is unblocked by DEBT-035 (setgid +
group-readable artifacts root, paid 2026-05-02).

The legacy inline body_text path is preserved — when the producer ships
body_text on the bus the helper short-circuits without opening the file.
This commit is contained in:
2026-05-02 20:05:54 -04:00
parent 7036a86e76
commit e972d870de
2 changed files with 225 additions and 4 deletions

View File

@@ -19,11 +19,16 @@ from __future__ import annotations
import base64
import binascii
import email
import email.errors
import email.message
import email.policy
import hashlib
import re
from collections.abc import Callable
from typing import Any, Final
from decnet.artifacts.paths import ArtifactPathError, resolve_artifact_path
from decnet.ttp.base import TaggerEvent, TolerantTagger
from decnet.ttp.impl._emit import emit_tags
from decnet.ttp.impl._rule_index import RuleIndex
@@ -241,12 +246,78 @@ def _p_malicious_attachment(
return None
def _extract_body_text(msg: email.message.EmailMessage) -> str | None:
"""Best-effort plain-text body extraction from a parsed email.
Prefers ``text/plain``. Falls back to ``text/html`` (raw — predicates
here are substring-matchers, no need to de-tag). Returns None when
the message has no readable text part. Requires the message to have
been parsed with ``policy=email.policy.default`` so parts are
``EmailMessage`` instances (``get_content`` is policy-conditional).
"""
candidates: list[email.message.EmailMessage] = list(msg.walk())
for content_type in ("text/plain", "text/html"):
for part in candidates:
if part.get_content_type() != content_type:
continue
try:
content = part.get_content()
except (LookupError, ValueError, KeyError):
continue
if isinstance(content, str):
return content
return None
def _load_body_text(payload: dict[str, Any]) -> str | None:
"""Return the email body text for predicates that need it.
If the bus payload already carries ``body_text`` (older deployments
or master-side producers), use it. Otherwise disk-reach: open the
``.eml`` from ``/var/lib/decnet/artifacts/{decky_id}/smtp/{stored_as}``
and parse the body in-process.
The decoded body is memoized back into the payload dict so the next
predicate on the same event reuses it without re-opening the file.
The bus envelope only carries the artifact pointer (``decky_id`` +
``stored_as``); raw body bytes never cross the host boundary
(DEBT-047). Returns None on any failure — predicates then short
circuit to no-match, matching pre-disk-reach behavior when fields
were absent.
"""
existing = payload.get("body_text")
if isinstance(existing, str):
return existing
decky_id = payload.get("decky_id")
stored_as = payload.get("stored_as")
if not isinstance(decky_id, str) or not isinstance(stored_as, str):
return None
try:
path = resolve_artifact_path(decky_id, stored_as, "smtp")
except ArtifactPathError:
return None
try:
with open(path, "rb") as fh:
msg = email.message_from_binary_file(
fh, policy=email.policy.default,
)
except (OSError, email.errors.MessageError):
return None
body = _extract_body_text(msg)
if body is None:
return None
payload["body_text"] = body
return body
def _p_bec(
spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
subject = payload.get("subject")
body_text = payload.get("body_text")
if not isinstance(subject, str) or not isinstance(body_text, str):
if not isinstance(subject, str):
return None
body_text = _load_body_text(payload)
if body_text is None:
return None
subj_kws = spec.get("subject_keywords", [])
body_kws = spec.get("body_action_keywords", [])
@@ -278,8 +349,8 @@ def _p_encoded_payload(
spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
min_bytes = int(spec.get("min_bytes", 4096))
body_text = payload.get("body_text")
if not isinstance(body_text, str) or not body_text:
body_text = _load_body_text(payload)
if not body_text:
return None
# Upstream may pre-compute the largest decoded base64 length.
body_b64_bytes = payload.get("body_base64_bytes")

View File

@@ -0,0 +1,150 @@
"""Disk-reach tests for EmailLifter (DEBT-047).
When the bus payload omits ``body_text`` but carries ``decky_id`` +
``stored_as``, body-aware predicates (R0047 BEC, encoded-payload) must
open the stored ``.eml`` from the artifact tree and parse the body
in-process. Bus carries only the pointer; raw body bytes stay on
host disk.
"""
from __future__ import annotations
from email.message import EmailMessage
import pytest
from decnet.artifacts import paths as artifact_paths
from decnet.ttp.impl import email_lifter as lifter_mod
# Artifact pointer shared by every test in this module: ``_DECKY`` is used as
# the payload's ``decky_id`` and as the directory name under the artifacts
# root; ``_STORED_AS`` is the payload's ``stored_as`` and the ``.eml`` filename.
_DECKY = "test-decky-01"
_STORED_AS = "2026-04-18T02:22:56Z_abc123def456_msg.eml"
def _write_eml(root, body_text, *, content_type="text/plain"):
    """Write a fixture ``.eml`` to ``root/_DECKY/smtp/_STORED_AS`` and return its path.

    With the default ``content_type`` the message is a single ``text/plain``
    part holding ``body_text``. For any other value the message gets a
    fixed ``"plain fallback"`` text part plus ``body_text`` as an HTML
    alternative.
    """
    message = EmailMessage()
    for header, value in (
        ("From", "alice@evil.example"),
        ("To", "victim@target.example"),
        ("Subject", "URGENT: wire transfer needed"),
    ):
        message[header] = value

    if content_type != "text/plain":
        message.set_content("plain fallback")
        message.add_alternative(body_text, subtype="html")
    else:
        message.set_content(body_text)

    target = root / _DECKY / "smtp" / _STORED_AS
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(bytes(message))
    return target
@pytest.fixture
def root(tmp_path, monkeypatch):
    """Point ``artifact_paths.ARTIFACTS_ROOT`` at ``tmp_path`` and hand that directory to the test."""
    artifacts_root = tmp_path
    # monkeypatch restores the original attribute when the test finishes.
    monkeypatch.setattr(artifact_paths, "ARTIFACTS_ROOT", artifacts_root)
    return artifacts_root
# Rule spec handed to ``_p_bec`` by the tests below: keywords looked for in
# the subject and in the body respectively.
_BEC_SPEC = dict(
    subject_keywords=["wire transfer", "urgent"],
    body_action_keywords=["bank", "iban", "account"],
)
def test_p_bec_matches_via_disk_reach(root):
    """BEC predicate matches when the body is only available in the stored ``.eml``."""
    _write_eml(root, "Please update our bank account / IBAN before EOD.")
    event = dict(
        subject="URGENT: wire transfer needed",
        decky_id=_DECKY,
        stored_as=_STORED_AS,
    )

    match = lifter_mod._p_bec(_BEC_SPEC, event)

    assert match is not None
    assert match["matched_subject_kw"] == "wire transfer"
    assert match["matched_body_kw"] in {"bank", "iban"}
    # The decoded body must have been written back onto the payload.
    assert "bank" in event["body_text"].lower()
def test_p_bec_no_match_when_eml_missing(root):
    """A pointer to a file that was never written yields no match."""
    event = dict(
        subject="URGENT: wire transfer needed",
        decky_id=_DECKY,
        stored_as=_STORED_AS,
    )
    outcome = lifter_mod._p_bec(_BEC_SPEC, event)
    assert outcome is None
def test_p_bec_no_match_without_pointer(root):
    """With neither ``body_text`` nor an artifact pointer there is nothing to match on."""
    event = dict(subject="URGENT: wire transfer needed")
    outcome = lifter_mod._p_bec(_BEC_SPEC, event)
    assert outcome is None
def test_inline_body_text_takes_precedence(root, monkeypatch):
    """An inline ``body_text`` on the payload must bypass artifact path resolution."""

    def _fail_on_disk_reach(*args, **kwargs):
        raise AssertionError("disk-reach must not run when body_text inline")

    # Any attempt to resolve the artifact path now fails the test loudly.
    monkeypatch.setattr(lifter_mod, "resolve_artifact_path", _fail_on_disk_reach)

    inline_body = "Please remit IBAN bank details now."
    event = dict(
        subject="URGENT: wire transfer needed",
        body_text=inline_body,
        decky_id=_DECKY,
        stored_as=_STORED_AS,
    )
    assert lifter_mod._p_bec(_BEC_SPEC, event) is not None
def test_body_cache_avoids_second_open(root, monkeypatch):
    """Two predicate runs on one payload must parse the ``.eml`` only once."""
    _write_eml(root, "wire to our bank IBAN now")

    original_parser = lifter_mod.email.message_from_binary_file
    parse_calls: list[str] = []

    def _counting_parser(fh, *args, **kwargs):
        # Record the call, then delegate to the real parser.
        parse_calls.append("opened")
        return original_parser(fh, *args, **kwargs)

    monkeypatch.setattr(
        lifter_mod.email, "message_from_binary_file", _counting_parser,
    )

    event = dict(
        subject="URGENT: wire transfer needed",
        decky_id=_DECKY,
        stored_as=_STORED_AS,
    )
    for _ in range(2):
        lifter_mod._p_bec(_BEC_SPEC, event)

    assert len(parse_calls) == 1
def test_html_fallback_when_no_text_plain(tmp_path, monkeypatch):
    """A message whose only part is ``text/html`` still feeds the BEC predicate."""
    monkeypatch.setattr(artifact_paths, "ARTIFACTS_ROOT", tmp_path)

    eml_path = tmp_path / _DECKY / "smtp" / _STORED_AS
    eml_path.parent.mkdir(parents=True)
    # Hand-written single-part HTML message: no text/plain part at all.
    eml_path.write_bytes(
        b"From: a@b\r\nTo: c@d\r\nSubject: t\r\n"
        b"Content-Type: text/html; charset=utf-8\r\n\r\n"
        b"<html><body>please send our IBAN bank info</body></html>"
    )

    event = dict(
        subject="URGENT: wire transfer needed",
        decky_id=_DECKY,
        stored_as=_STORED_AS,
    )
    assert lifter_mod._p_bec(_BEC_SPEC, event) is not None
def test_invalid_pointer_rejected(root):
    """A traversal-style ``decky_id`` must not crash and must produce no match."""
    event = dict(
        subject="URGENT: wire transfer needed",
        decky_id="../etc",
        stored_as=_STORED_AS,
    )
    outcome = lifter_mod._p_bec(_BEC_SPEC, event)
    assert outcome is None