Files
DECNET/decnet/ttp/impl/email_lifter.py
anti e972d870de feat(ttp): EmailLifter disk-reach for body-aware predicates (DEBT-047)
R0047 (BEC) and the encoded-payload predicate substring-match against
the email body. Shipping raw body text on the abstracted service bus
is the wrong privacy stance — the bus transport may swap from UNIX
socket to networked at any time, and "loopback today" is not a license
to put PII on the wire.

EmailLifter now opens the .eml lazily from
/var/lib/decnet/artifacts/{decky_id}/smtp/{stored_as} when a body-aware
predicate runs and parses the body in-process via stdlib email +
policy.default. The decoded body is memoized into the payload dict so
multiple body-aware predicates on the same event open the file once.

Bus envelope only carries the artifact pointer (decky_id + stored_as);
raw body bytes never cross the host disk boundary on the agent → master
hop. Filesystem access on agents is unblocked by DEBT-035 (setgid +
group-readable artifacts root, paid 2026-05-02).

The legacy inline body_text path is preserved — when the producer ships
body_text on the bus the helper short-circuits without opening the file.
2026-05-02 20:05:54 -04:00

474 lines
17 KiB
Python

"""Email lifter — SMTP message-level technique tagger (E.3.12).
Reads pre-parsed SMTP message payload (headers as a name-only list,
body sha + body text already truncated/scrubbed by the upstream worker,
attachment hashes + names) and emits Initial-Access / Phishing /
Resource-Development techniques per Appendix A.6.
PII discipline (TTP_TAGGING.md §"Hard parts §6") is enforced at the
lifter layer: emitted ``TTPTag.evidence`` only carries fields that
conform to :class:`~decnet.web.db.models.ttp.EmailEvidence`
(``body_sha256``, ``matched_headers`` — names not values,
``rcpt_domain_set`` — domains not addresses, ``attachment_sha256s``,
``rcpt_count``) plus a small set of match-discriminator strings
(``matched_kit``, ``matched_trigger``, ``matched_url``). Raw From /
Return-Path / RCPT addresses, raw body bytes, and decoded payload
previews NEVER appear in evidence.
"""
from __future__ import annotations
import base64
import binascii
import email
import email.errors
import email.message
import email.policy
import hashlib
import re
from collections.abc import Callable
from typing import Any, Final
from decnet.artifacts.paths import ArtifactPathError, resolve_artifact_path
from decnet.ttp.base import TaggerEvent, TolerantTagger
from decnet.ttp.impl._emit import emit_tags
from decnet.ttp.impl._rule_index import RuleIndex
from decnet.ttp.impl._state import is_active
from decnet.ttp.impl.rule_engine import CompiledRule
from decnet.ttp.store.base import RuleStore
from decnet.web.db.models.ttp import TTPTag
Predicate = Callable[
[dict[str, Any], dict[str, Any]],
"dict[str, Any] | None",
]
# ── Helpers ─────────────────────────────────────────────────────────
def _domain(addr_or_domain: str | None) -> str | None:
if not isinstance(addr_or_domain, str):
return None
if not addr_or_domain:
return None
if "@" in addr_or_domain:
return addr_or_domain.split("@", 1)[1].lower().strip()
return addr_or_domain.lower().strip()
def _safe_evidence(payload: dict[str, Any]) -> dict[str, Any]:
"""Build the EmailEvidence-conformant base evidence dict.
Only PII-safe keys: body sha (already a hash), header NAMES (not
values), recipient DOMAINS (not addresses), attachment hashes,
rcpt count. Raw addresses, raw body, raw header values explicitly
excluded.
"""
rcpt_domains_raw = payload.get("rcpt_domains") or []
rcpt_domains = [
d.lower() for d in rcpt_domains_raw if isinstance(d, str)
]
attachment_hashes = payload.get("attachment_sha256s") or []
if not isinstance(attachment_hashes, list):
attachment_hashes = []
body_sha = payload.get("body_sha256") or ""
if not isinstance(body_sha, str):
body_sha = ""
rcpt_count = payload.get("rcpt_count")
if not isinstance(rcpt_count, int):
rcpt_count = 0
return {
"body_sha256": body_sha,
"matched_headers": [],
"rcpt_domain_set": sorted(set(rcpt_domains)),
"attachment_sha256s": [
h for h in attachment_hashes if isinstance(h, str)
],
"rcpt_count": rcpt_count,
}
# ── Per-rule predicates ─────────────────────────────────────────────
def _p_open_relay(
spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
threshold = int(spec.get("rcpt_threshold", 10))
rcpt_count = payload.get("rcpt_count")
if not isinstance(rcpt_count, int) or rcpt_count < threshold:
return None
if spec.get("require_foreign_from"):
from_domain = _domain(payload.get("from_domain") or payload.get("from"))
mail_from = _domain(
payload.get("mail_from_domain") or payload.get("mail_from"),
)
if not from_domain or not mail_from or from_domain == mail_from:
return None
return {"matched_headers": ["From", "Mail-From"]}
def _p_mass_phish(
spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
threshold = int(spec.get("rcpt_threshold", 25))
rcpt_count = payload.get("rcpt_count")
if not isinstance(rcpt_count, int) or rcpt_count < threshold:
return None
# The "campaign" half: upstream must have observed body simhash
# recurring across recipients. Without that signal, high-RCPT alone
# is open-relay territory (R0041), not mass-phish. The simhash
# derivation lives in the SMTP worker (out of scope here).
if not isinstance(payload.get("body_simhash"), (str, int)):
return None
return {}
def _p_xmailer_kit(
spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
x_mailer = payload.get("x_mailer")
if not isinstance(x_mailer, str) or not x_mailer:
return None
matched_kit = payload.get("matched_kit")
if isinstance(matched_kit, str) and matched_kit:
return {"matched_kit": matched_kit, "matched_headers": ["X-Mailer"]}
# Catalogue match flag — upstream marks it via xmailer_kit_match.
if payload.get("xmailer_kit_match") is True:
return {"matched_headers": ["X-Mailer"]}
return None
_PUNYCODE_PREFIX_DEFAULT: Final[str] = "xn--"
def _p_idn_url(
spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
prefix = spec.get("punycode_prefix") or _PUNYCODE_PREFIX_DEFAULT
if not isinstance(prefix, str):
prefix = _PUNYCODE_PREFIX_DEFAULT
urls = payload.get("urls") or []
if not isinstance(urls, list):
return None
for url in urls:
if isinstance(url, str) and prefix in url:
# Carry only the punycode-bearing host portion as a match
# discriminator. NEVER carry the full URL (could contain
# credential-harvest path with PII).
host = _extract_host(url)
return {
"matched_url_host": host or "",
"matched_headers": ["body"],
}
return None
def _extract_host(url: str) -> str | None:
m = re.match(r"https?://([^/]+)", url)
if m:
return m.group(1).lower()
return None
def _p_sender_masquerade(
spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
signals_raw = spec.get("signals", [])
if not isinstance(signals_raw, list):
return None
signals = {s for s in signals_raw if isinstance(s, str)}
matched: list[str] = []
if "from_returnpath_mismatch" in signals:
if (
_domain(payload.get("from_domain")) is not None
and _domain(payload.get("return_path_domain")) is not None
and _domain(payload.get("from_domain"))
!= _domain(payload.get("return_path_domain"))
):
matched.append("from_returnpath_mismatch")
if "from_mailfrom_mismatch" in signals:
if (
_domain(payload.get("from_domain")) is not None
and _domain(payload.get("mail_from_domain")) is not None
and _domain(payload.get("from_domain"))
!= _domain(payload.get("mail_from_domain"))
):
matched.append("from_mailfrom_mismatch")
if "dkim_fail" in signals and payload.get("dkim_signed") is False:
matched.append("dkim_fail")
if "spf_fail" in signals and payload.get("spf_pass") is False:
matched.append("spf_fail")
if not matched:
return None
headers: list[str] = []
if any("from_" in m for m in matched):
headers.extend(["From", "Return-Path"])
if "dkim_fail" in matched:
headers.append("DKIM-Signature")
if "spf_fail" in matched:
headers.append("Authentication-Results")
return {
"matched_signals": matched,
"matched_headers": sorted(set(headers)),
}
def _p_malicious_attachment(
spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
triggers_raw = spec.get("triggers", [])
triggers = (
{t for t in triggers_raw if isinstance(t, str)}
if isinstance(triggers_raw, list)
else set()
)
if "office_macro" in triggers and payload.get("attachment_macros") is True:
return {"matched_trigger": "office_macro"}
if (
"protected_archive" in triggers
and payload.get("attachment_password_protected") is True
):
return {"matched_trigger": "protected_archive"}
if "html_smuggling" in triggers and payload.get("html_smuggling") is True:
return {"matched_trigger": "html_smuggling"}
if "mal_hash_match" in triggers and payload.get("mal_hash_match") is True:
return {"matched_trigger": "mal_hash_match"}
extensions = payload.get("attachment_extensions") or []
if isinstance(extensions, list):
ext_set = {
e.lower().lstrip(".") for e in extensions if isinstance(e, str)
}
for ext_trigger in ("lnk", "iso", "img"):
if ext_trigger in triggers and ext_trigger in ext_set:
return {"matched_trigger": ext_trigger}
return None
def _extract_body_text(msg: email.message.EmailMessage) -> str | None:
"""Best-effort plain-text body extraction from a parsed email.
Prefers ``text/plain``. Falls back to ``text/html`` (raw — predicates
here are substring-matchers, no need to de-tag). Returns None when
the message has no readable text part. Requires the message to have
been parsed with ``policy=email.policy.default`` so parts are
``EmailMessage`` instances (``get_content`` is policy-conditional).
"""
candidates: list[email.message.EmailMessage] = list(msg.walk())
for content_type in ("text/plain", "text/html"):
for part in candidates:
if part.get_content_type() != content_type:
continue
try:
content = part.get_content()
except (LookupError, ValueError, KeyError):
continue
if isinstance(content, str):
return content
return None
def _load_body_text(payload: dict[str, Any]) -> str | None:
"""Return the email body text for predicates that need it.
If the bus payload already carries ``body_text`` (older deployments
or master-side producers), use it. Otherwise disk-reach: open the
``.eml`` from ``/var/lib/decnet/artifacts/{decky_id}/smtp/{stored_as}``
and parse the body in-process.
The decoded body is memoized back into the payload dict so the next
predicate on the same event reuses it without re-opening the file.
The bus envelope only carries the artifact pointer (``decky_id`` +
``stored_as``); raw body bytes never cross the host boundary
(DEBT-047). Returns None on any failure — predicates then short
circuit to no-match, matching pre-disk-reach behavior when fields
were absent.
"""
existing = payload.get("body_text")
if isinstance(existing, str):
return existing
decky_id = payload.get("decky_id")
stored_as = payload.get("stored_as")
if not isinstance(decky_id, str) or not isinstance(stored_as, str):
return None
try:
path = resolve_artifact_path(decky_id, stored_as, "smtp")
except ArtifactPathError:
return None
try:
with open(path, "rb") as fh:
msg = email.message_from_binary_file(
fh, policy=email.policy.default,
)
except (OSError, email.errors.MessageError):
return None
body = _extract_body_text(msg)
if body is None:
return None
payload["body_text"] = body
return body
def _p_bec(
spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
subject = payload.get("subject")
if not isinstance(subject, str):
return None
body_text = _load_body_text(payload)
if body_text is None:
return None
subj_kws = spec.get("subject_keywords", [])
body_kws = spec.get("body_action_keywords", [])
if not isinstance(subj_kws, list) or not isinstance(body_kws, list):
return None
subj_l = subject.lower()
body_l = body_text.lower()
subj_hit = next(
(k for k in subj_kws if isinstance(k, str) and k.lower() in subj_l),
None,
)
body_hit = next(
(k for k in body_kws if isinstance(k, str) and k.lower() in body_l),
None,
)
if not subj_hit or not body_hit:
return None
return {
"matched_subject_kw": subj_hit,
"matched_body_kw": body_hit,
"matched_headers": ["Subject"],
}
_BASE64_RE = re.compile(r"[A-Za-z0-9+/]{32,}={0,2}")
def _p_encoded_payload(
spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
min_bytes = int(spec.get("min_bytes", 4096))
body_text = _load_body_text(payload)
if not body_text:
return None
# Upstream may pre-compute the largest decoded base64 length.
body_b64_bytes = payload.get("body_base64_bytes")
if isinstance(body_b64_bytes, int) and body_b64_bytes >= min_bytes:
return {"encoded_byte_count": body_b64_bytes}
# Fallback: best-effort scan of the body text. Cap the work at the
# first match >= threshold to avoid quadratic behavior on a hostile
# body. Decoded bytes are NEVER returned — only the count.
for m in _BASE64_RE.finditer(body_text):
chunk = m.group(0)
try:
decoded = base64.b64decode(chunk, validate=True)
except (binascii.Error, ValueError):
continue
if len(decoded) >= min_bytes:
return {"encoded_byte_count": len(decoded)}
return None
_PREDICATES: Final[dict[str, Predicate]] = {
"lifter:email_open_relay": _p_open_relay,
"lifter:email_mass_phish": _p_mass_phish,
"lifter:email_xmailer_kit": _p_xmailer_kit,
"lifter:email_idn_url": _p_idn_url,
"lifter:email_sender_masquerade": _p_sender_masquerade,
"lifter:email_malicious_attachment": _p_malicious_attachment,
"lifter:email_bec": _p_bec,
"lifter:email_encoded_payload": _p_encoded_payload,
}
# Allowed keys in TTPTag.evidence for source_kind=email. Used both as
# the assembly contract here AND by tests/ttp/test_email_lifter.py to
# guard against a future predicate accidentally leaking PII.
_EMAIL_EVIDENCE_ALLOWED_KEYS: Final[frozenset[str]] = frozenset({
# EmailEvidence base
"body_sha256",
"matched_headers",
"rcpt_domain_set",
"attachment_sha256s",
"rcpt_count",
# PII-safe match discriminators
"matched_kit",
"matched_trigger",
"matched_url_host",
"matched_signals",
"matched_subject_kw",
"matched_body_kw",
"encoded_byte_count",
})
def _filter_evidence(evidence: dict[str, Any]) -> dict[str, Any]:
"""Drop any key not in the PII-safe allowlist.
Defense-in-depth: even if a predicate accidentally returns a raw
address or body field, this filter strips it before the tag is
constructed. Asserted by ``test_email_lifter.py``.
"""
return {
k: v for k, v in evidence.items()
if k in _EMAIL_EVIDENCE_ALLOWED_KEYS
}
class EmailLifter(TolerantTagger):
name = "email"
HANDLES = frozenset({"email"})
OWNED_PREFIX: Final[str] = "lifter:email_"
def __init__(self, store: RuleStore) -> None:
self._store = store
self._index = RuleIndex()
@classmethod
def _owns(cls, rule: CompiledRule) -> bool:
kind = rule.match_spec.get("kind", "")
return isinstance(kind, str) and kind.startswith(cls.OWNED_PREFIX)
async def watch_store(self) -> None:
await self._index.watch(self._store, predicate=self._owns)
async def _tag_impl(self, event: TaggerEvent) -> list[TTPTag]:
out: list[TTPTag] = []
base_evidence = _safe_evidence(event.payload)
for rule in self._index.values():
if event.source_kind not in rule.applies_to:
continue
if not is_active(rule.state):
continue
kind = rule.match_spec.get("kind", "")
handler = _PREDICATES.get(kind)
if handler is None:
continue
extra = handler(rule.match_spec, event.payload)
if extra is None:
continue
evidence = dict(base_evidence)
# Allow predicates to extend matched_headers without
# clobbering the base list.
extra_headers = extra.pop("matched_headers", None)
if isinstance(extra_headers, list):
merged = list(evidence.get("matched_headers", []))
merged.extend(h for h in extra_headers if isinstance(h, str))
evidence["matched_headers"] = sorted(set(merged))
evidence.update(extra)
evidence = _filter_evidence(evidence)
# Body sha is required by EmailEvidence; if upstream
# didn't supply one, derive from body_text (best-effort).
if not evidence.get("body_sha256"):
body_text = event.payload.get("body_text")
if isinstance(body_text, str) and body_text:
evidence["body_sha256"] = hashlib.sha256(
body_text.encode("utf-8", errors="replace"),
).hexdigest()
out.extend(emit_tags(rule, event, evidence))
return out
__all__ = ["EmailLifter"]