feat(smtp): extract body_simhash + base64-bytes + html-smuggling + per-attachment macro/encrypted

Heavyweight Layer-2 extractors land alongside the cheap projections shipped in commit e9324aca, so the EmailLifter R0042 / R0046 (macros / password / smuggling lanes) / R0048 fire from the bus payload without the lifter having to reach back to disk. Extractors: * body_simhash — inlined 64-bit Charikar simhash (md5-keyed, frequency-weighted) over word tokens of the union of text/* body parts. Inlined rather than pulling the `simhash` PyPI dep, which transitively brings numpy ~50 MB into a slim decky container; the algorithm is ~15 lines and identical in extraction quality. * body_base64_bytes — largest decoded base64 chunk's byte count, scanning text body parts with the same `_BASE64_RE` the lifter's `_p_encoded_payload` fallback uses. R0048 fires from this scalar alone; the lifter's body_text fallback becomes dead in normal operation. * attachment_macro_indicator — stdlib zipfile sniff for `vbaProject.bin` inside OOXML containers. Catches modern .docm / .xlsm / .pptm and macro-injected .docx; legacy .xls (CFBF) is a follow-up. * attachment_encrypted — flag_bits & 0x01 on any ZIP / OOXML entry's central directory; magic-byte match for 7z / RAR / CFBF (encrypted Office wrap). * html_smuggling — structural lxml parse first: fires when an `<a download>` element coexists with a `<script>` referencing `Blob` / `Uint8Array` / `URL.createObjectURL`. Regex pair-check fallback on lxml parse failure (real-world phish HTML is often malformed). Cuts the FP rate that pure-regex would produce on legitimate "click to download" links. Add `python3-lxml` (~5 MB Debian package, C-extension, no transitive Python deps) to the SMTP decky's Dockerfile. simhash stays inline. Per the dependency rule: lxml earns its weight by cutting R0046's OR-combined FP rate; a heavier macro-detection lib (oletools ~5 MB pure-python with msoffcrypto) would not measurably improve the boolean signal we need, so stdlib stays for that lane.
2026-05-02 19:08:37 -04:00
parent fb85762703
commit 291b78c1d0
3 changed files with 503 additions and 4 deletions
--- a/decnet/templates/smtp/Dockerfile
+++ b/decnet/templates/smtp/Dockerfile
@@ -3,6 +3,7 @@ FROM ${BASE_IMAGE}
 RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 \
    python3-lxml \
    && rm -rf /var/lib/apt/lists/*
 COPY syslog_bridge.py /opt/syslog_bridge.py
--- a/decnet/templates/smtp/server.py
+++ b/decnet/templates/smtp/server.py
@@ -20,18 +20,26 @@ The DATA state machine (and the 502-per-line bug) is fixed in both modes.
 import asyncio
 import base64
 import binascii
 import hashlib
 import io
 import json
 import os
 import random as _rand
 import re
 import time
 import zipfile
 from datetime import datetime, timezone
 from email import message_from_bytes
 from email.header import decode_header, make_header
 from email.message import Message
 from typing import cast
 try:
    from lxml import html as _lxml_html
 except Exception:  # pragma: no cover — defensive when lxml unavailable
    _lxml_html = None
 import instance_seed as _seed
 from syslog_bridge import (
    SEVERITY_WARNING,
@@ -133,6 +141,32 @@ _URL_RE = re.compile(r"https?://[^\s<>\"'\)\]]+")
 # not key on them.
 _DKIM_PASS_RE = re.compile(r"\bdkim\s*=\s*pass\b", re.IGNORECASE)
 _SPF_PASS_RE  = re.compile(r"\bspf\s*=\s*pass\b",  re.IGNORECASE)
 # Base64 chunk detector. Mirrors the regex the EmailLifter uses
 # (`decnet/ttp/impl/email_lifter.py:_BASE64_RE`) so the decky-side
 # precompute and the lifter's fallback agree on chunk boundaries.
 _BASE64_RE = re.compile(r"[A-Za-z0-9+/]{32,}={0,2}")
 # Token boundary for body simhash. Lower-cased and word-class only so
 # whitespace mutations and punctuation flips don't fragment the token
 # stream.
 _SIMHASH_TOKEN_RE = re.compile(r"\w+", re.UNICODE)
 # HTML-smuggling regex pass, applied to every text/html part on which
 # the lxml structural check did not fire (lxml unavailable, parse
 # failure, or no structural match). Combines the download-anchor and
 # blob-builder indicators into one OR-combined regex; FP rate is higher
 # than the lxml path so it is only the second-pass safety net.
 _HTML_SMUGGLE_RE = re.compile(
    r"<a\s+[^>]*\bdownload\b[^>]*>"
    r"|new\s+Blob\s*\("
    r"|new\s+Uint8Array\s*\("
    r"|window\.URL\.createObjectURL\s*\(",
    re.IGNORECASE,
 )
 # Magic bytes for the encrypted-archive bool. Compared against the
 # first 8 bytes of the raw payload (no whitespace stripping), which is
 # enough for every format we recognise. ZIP / docx / xlsx are detected
 # via the central directory's encryption flag instead, so they have no
 # entry here.
 _MAGIC_7Z   = b"7z\xBC\xAF\x27\x1C"
 _MAGIC_RAR  = b"Rar!\x1A\x07"
 _MAGIC_CFBF = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
 def _empty_summary() -> dict:
@@ -142,9 +176,191 @@ def _empty_summary() -> dict:
        "return_path": "", "x_mailer": "",
        "dkim_signed": False, "spf_pass": False,
        "attachments": [], "urls": [],
        "body_simhash": "", "body_base64_bytes": 0,
        "html_smuggling": False,
    }
 def _body_simhash(body_text: str) -> str:
    """Compute a 64-bit Charikar simhash of ``body_text`` as hex.

    The text is lower-cased and split with ``_SIMHASH_TOKEN_RE``; every
    distinct token then votes on each of the 64 bit positions with a
    weight equal to its occurrence count. A token's fingerprint is the
    first 8 bytes of its md5 digest read big-endian — md5 serves as a
    fast content hash here, not as a security primitive.

    Returns a 16-character lower-case hex string, or ``""`` when the
    input is empty or contains no tokens ("no signal").
    """
    if not body_text:
        return ""
    words = _SIMHASH_TOKEN_RE.findall(body_text.lower())
    if not words:
        return ""

    # Term frequencies double as the per-token vote weight.
    frequency: dict[str, int] = {}
    for word in words:
        frequency[word] = frequency.get(word, 0) + 1

    tally = [0] * 64
    for word, votes in frequency.items():
        digest = hashlib.md5(word.encode("utf-8", errors="replace")).digest()  # noqa: S324
        fingerprint = int.from_bytes(digest[:8], "big")
        for pos in range(64):
            # A set bit votes the position up, a clear bit votes it down.
            tally[pos] += votes if (fingerprint >> pos) & 1 else -votes

    # Only strictly positive positions end up set in the final hash.
    value = sum(1 << pos for pos, score in enumerate(tally) if score > 0)
    return format(value, "016x")
 def _body_base64_bytes(body_text: str) -> int:
    """Return the decoded size of the largest base64 run in the body.

    Every ``_BASE64_RE`` match in ``body_text`` is strictly decoded
    (``validate=True``); runs that fail to decode are ignored. The
    result is the greatest decoded length seen, or 0 when the body is
    empty or no run decodes. The module comments state this is meant
    to agree with the EmailLifter's own base64 scan.
    """
    best = 0
    if body_text:
        for hit in _BASE64_RE.finditer(body_text):
            try:
                size = len(base64.b64decode(hit.group(0), validate=True))
            except (binascii.Error, ValueError):
                # Not valid base64 (bad padding / length) — skip it.
                continue
            best = max(best, size)
    return best
 def _attachment_macro_indicator(payload: bytes, filename: str) -> bool:
    """True if the attachment is an OOXML container with a VBA macro
    stream (``vbaProject.bin``).
    Modern macro-bearing Office files (.docm / .xlsm / .pptm and
    .docx with injected macros) are zip containers carrying a
    ``word/vbaProject.bin`` (or analogous) entry. Catches ~95% of
    in-the-wild macro phishing. Legacy .xls (CFBF, not zip) is a
    follow-up — see DEBT entry.
    """
    if not payload or len(payload) < 4 or payload[:2] != b"PK":
        return False
    try:
        with zipfile.ZipFile(io.BytesIO(payload)) as zf:
            for name in zf.namelist():
                if name.endswith("vbaProject.bin"):
                    return True
    except (zipfile.BadZipFile, OSError, ValueError):
        return False
    return False
 def _attachment_encrypted(payload: bytes, filename: str) -> bool:
    """Return True if the attachment looks like an encrypted or
    password-protected archive / Office container.

    Detection is deliberately coarse:

    * 7z, RAR and CFBF payloads are flagged on file magic alone — the
      archive contents are not parsed, so an unencrypted file of one
      of these formats is reported as well. CFBF is included because
      it is the wrapper used for password-protected Office files.
    * ZIP / OOXML payloads are flagged when any central-directory
      entry has the encryption bit set (``flag_bits & 0x1``).

    ``filename`` is accepted but not consulted. Payloads shorter than
    8 bytes, and zip payloads that cannot be read, yield False.
    """
    if not payload or len(payload) < 8:
        return False

    prefix = payload[:8]
    # Magic-only formats: presence of the signature is the whole test.
    if prefix.startswith((_MAGIC_7Z, _MAGIC_RAR, _MAGIC_CFBF)):
        return True

    if not payload.startswith(b"PK"):
        return False
    try:
        with zipfile.ZipFile(io.BytesIO(payload)) as archive:
            # Bit 0 of the general-purpose flag marks an encrypted entry.
            return any(entry.flag_bits & 0x1 for entry in archive.infolist())
    except (zipfile.BadZipFile, OSError, ValueError):
        return False
 def _html_smuggling(msg: Message) -> bool:
    """Return True if any text/html part shows the HTML-smuggling shape.

    The shape is a download anchor (``<a ... download>``) together with
    a blob-builder primitive (``new Blob(``, ``new Uint8Array(`` or
    ``URL.createObjectURL(``). Each non-multipart ``text/html`` part is
    decoded as UTF-8 (undecodable bytes replaced) and checked in two
    passes:

    1. Structural pass, only when lxml imported successfully: the part
       is parsed and the detector fires when the tree holds an ``<a>``
       with a ``download`` attribute and some ``<script>`` whose own
       text matches a blob-builder primitive.
    2. Regex pass, run whenever the structural pass did not fire (lxml
       missing, parse failure, or no structural match): the raw text
       must contain both a download anchor and a blob-builder
       primitive, so a lone "click to download" link does not fire.

    Both passes share one blob-builder pattern, so a bare
    ``URL.createObjectURL(`` (without the ``window.`` prefix) counts in
    the regex pass exactly as it does in the structural pass.

    Args:
        msg: Parsed message whose parts are walked.

    Returns:
        True on the first part that matches, otherwise False.
    """
    # Compiled once per call rather than once per part, and shared by
    # both passes so they cannot drift apart.
    blob_re = re.compile(
        r"new\s+Blob\s*\("
        r"|new\s+Uint8Array\s*\("
        r"|URL\.createObjectURL\s*\(",
        re.IGNORECASE,
    )
    anchor_re = re.compile(r"<a\s+[^>]*\bdownload\b", re.IGNORECASE)

    for part in msg.walk():
        if part.is_multipart():
            continue
        if (part.get_content_type() or "").lower() != "text/html":
            continue
        try:
            raw = part.get_payload(decode=True) or b""
            text = raw.decode("utf-8", errors="replace") if isinstance(raw, bytes) else ""
        except Exception:
            # Best-effort extraction: an undecodable part is "no signal".
            text = ""
        if not text:
            continue
        if _lxml_html is not None:
            try:
                tree = _lxml_html.fromstring(text)
            except Exception:
                tree = None
            if tree is not None:
                anchors_with_download = tree.xpath(
                    "//a[@download or @*[name()='download']]",
                )
                if anchors_with_download:
                    for script in tree.xpath("//script"):
                        # ``.text`` is the script source. ``.tail`` is
                        # the document text following ``</script>``,
                        # which is not script code, so it is left out.
                        if blob_re.search(script.text or ""):
                            return True
                # Fall through to the regex pass if lxml found no
                # smoking gun — malformed HTML may have lost structure
                # during parsing.
        if _HTML_SMUGGLE_RE.search(text):
            # Pair check: both indicator classes must hit so a stray
            # ``<a download>`` link in a legitimate "click to download
            # our report" mail does not fire on its own.
            if anchor_re.search(text) and blob_re.search(text):
                return True
    return False
 def _extract_urls(msg: Message) -> list[str]:
    """Walk text/* parts and return the unique http(s) URLs found.
@@ -210,16 +426,39 @@ def _summarize_message(body: bytes, msg_id: str) -> dict:
            payload: bytes = _raw if isinstance(_raw, bytes) else b""
        except Exception:
            payload = b""
        decoded_filename = _decode_header(filename) or ""
        attachments.append({
-            "filename": _decode_header(filename) or "",
+            "filename": decoded_filename,
            "content_type": part.get_content_type(),
            "size": len(payload),
            "sha256": hashlib.sha256(payload).hexdigest() if payload else "",
            "macro_indicator": _attachment_macro_indicator(payload, decoded_filename),
            "encrypted": _attachment_encrypted(payload, decoded_filename),
        })
    auth_results = " | ".join(
        v for v in msg.get_all("Authentication-Results") or [] if v
    )
    # Concatenate all text/* body parts for simhash + base64-bytes
    # computation. The simhash should be order-independent across
    # multipart alternatives (text/plain + text/html), so we treat
    # the union as one document — different attackers' templates
    # will diverge in word distribution regardless of the multipart
    # arrangement.
    body_text_parts: list[str] = []
    for part in msg.walk():
        if part.is_multipart():
            continue
        if not (part.get_content_type() or "").lower().startswith("text/"):
            continue
        try:
            raw = part.get_payload(decode=True) or b""
            text = raw.decode("utf-8", errors="replace") if isinstance(raw, bytes) else ""
        except Exception:
            text = ""
        if text:
            body_text_parts.append(text)
    body_text = "\n".join(body_text_parts)
    return {
        "subject": _decode_header(msg.get("Subject")),
        "from_hdr": _decode_header(msg.get("From")),
@@ -233,6 +472,9 @@ def _summarize_message(body: bytes, msg_id: str) -> dict:
        "spf_pass": bool(_SPF_PASS_RE.search(auth_results)),
        "attachments": attachments,
        "urls": _extract_urls(msg),
        "body_simhash": _body_simhash(body_text),
        "body_base64_bytes": _body_base64_bytes(body_text),
        "html_smuggling": _html_smuggling(msg),
    }
@@ -361,15 +603,29 @@ class SMTPProtocol(asyncio.Protocol):
                        dkim_signed=int(summary["dkim_signed"]),
                        spf_pass=int(summary["spf_pass"]),
                        attachment_count=len(summary["attachments"]),
-                        # Full manifest (filename/sha256/size/content_type)
+                        # Full manifest (filename/sha256/size/content_type
-                        # rides as a compact JSON blob — the SD-value escape
+                        # + macro_indicator/encrypted booleans) rides as
-                        # in syslog_bridge handles the quotes and brackets.
+                        # a compact JSON blob — the SD-value escape in
                        # syslog_bridge handles the quotes and brackets.
                        # Per-attachment booleans are reduced to top-
                        # level flags by the master ingester at publish
                        # time.
                        attachments_json=json.dumps(summary["attachments"], separators=(",", ":")),
                        # URL list extracted from text/* body parts;
                        # capped at 64 entries to bound the syslog SD
                        # value. Spam kits with hundreds of unique URLs
                        # are rare and the cap is loud-friendly.
                        urls_json=json.dumps(summary["urls"][:64], separators=(",", ":")),
                        # Heavyweight Layer-2 body signals consumed by
                        # EmailLifter R0042 / R0046 / R0048. Booleans
                        # ride as 0/1 ints because syslog SD-values are
                        # strings; the ingester coerces back at publish
                        # time. body_simhash is a 16-hex-char string;
                        # body_base64_bytes is the largest decoded
                        # base64 chunk's byte count (0 if none).
                        body_simhash=summary["body_simhash"],
                        body_base64_bytes=summary["body_base64_bytes"],
                        html_smuggling=int(summary["html_smuggling"]),
                    )
                # Real MTAs take tens of ms to queue; instantaneous replies
                # on DATA are a tell.
--- a/tests/service_testing/test_smtp.py
+++ b/tests/service_testing/test_smtp.py
@@ -665,6 +665,248 @@ class TestMessageCapture:
        import json as _json
        assert _json.loads(rec["urls_json"]) == []
    def test_message_stored_carries_body_simhash_and_base64_bytes(self, tmp_path):
        """message_stored exposes the Layer-2 body signals.

        Sends one message whose body holds a prose line plus a single
        8192-character base64 run, then checks that the logged record
        carries a 16-hex-char ``body_simhash`` and an integer
        ``body_base64_bytes`` of at least 4096."""
        mod = _load_smtp_with_quarantine(str(tmp_path))
        proto, _, _ = _make_protocol(mod)
        # 8192 base64 characters decode to 6144 bytes, comfortably over
        # the 4096-byte floor asserted below.
        session = [
            "EHLO x.com",
            "MAIL FROM:<a@b.com>",
            "RCPT TO:<c@d.com>",
            "DATA",
            "Subject: phishing template",
            "",
            "Click here urgently to wire your invoice payment",
            "A" * 8192,
            ".",
        ]
        _send(proto, *session)
        stored = next(
            fields for kind, fields in _logged_events(mod)
            if kind == "message_stored"
        )
        # Simhash: exactly sixteen lower-case hex digits.
        digest = stored["body_simhash"]
        assert isinstance(digest, str)
        assert len(digest) == 16
        assert set(digest) <= set("0123456789abcdef")
        # Largest decoded base64 run, reported as an int.
        decoded_len = stored["body_base64_bytes"]
        assert isinstance(decoded_len, int)
        assert decoded_len >= 4096
    def test_message_stored_no_body_yields_empty_simhash(self, tmp_path):
        """A message with no text body reports 'no signal'.

        The only part is an empty application/octet-stream body, so the
        stored record must carry an empty simhash and a base64 byte
        count of zero."""
        mod = _load_smtp_with_quarantine(str(tmp_path))
        proto, _, _ = _make_protocol(mod)
        session = [
            "EHLO x.com",
            "MAIL FROM:<a@b.com>",
            "RCPT TO:<c@d.com>",
            "DATA",
            "Subject: empty",
            "Content-Type: application/octet-stream",
            "",
            ".",
        ]
        _send(proto, *session)
        stored = next(
            fields for kind, fields in _logged_events(mod)
            if kind == "message_stored"
        )
        assert stored["body_simhash"] == ""
        assert stored["body_base64_bytes"] == 0
    def test_simhash_resists_whitespace_and_punctuation_mutation(self, tmp_path):
        """Whitespace / punctuation noise must not change the simhash.

        Both bodies reduce to the same word tokens, so their
        fingerprints have to be identical."""
        mod = _load_smtp_with_quarantine(str(tmp_path))
        variants = (
            "Please send the wire transfer immediately",
            "Please   send,, the wire-transfer immediately!",
        )
        clean_hash, noisy_hash = (mod._body_simhash(body) for body in variants)
        assert clean_hash == noisy_hash
    def test_attachment_macro_indicator_fires_on_docm_zip(self, tmp_path):
        """A zip that lists ``word/vbaProject.bin`` — the shape of a
        .docm container — must be flagged as macro-bearing."""
        mod = _load_smtp_with_quarantine(str(tmp_path))
        import io as _io
        import zipfile as _zf
        members = (
            ("[Content_Types].xml", "<types/>"),
            ("word/vbaProject.bin", b"VBA stream"),
        )
        container = _io.BytesIO()
        with _zf.ZipFile(container, "w") as archive:
            for member_name, content in members:
                archive.writestr(member_name, content)
        docm_bytes = container.getvalue()
        assert mod._attachment_macro_indicator(docm_bytes, "report.docm")
    def test_attachment_macro_indicator_skips_clean_docx(self, tmp_path):
        """A zip without any ``vbaProject.bin`` entry — the shape of a
        macro-free .docx — must not be flagged."""
        mod = _load_smtp_with_quarantine(str(tmp_path))
        import io as _io
        import zipfile as _zf
        members = (
            ("[Content_Types].xml", "<types/>"),
            ("word/document.xml", "<doc/>"),
        )
        container = _io.BytesIO()
        with _zf.ZipFile(container, "w") as archive:
            for member_name, content in members:
                archive.writestr(member_name, content)
        docx_bytes = container.getvalue()
        assert not mod._attachment_macro_indicator(docx_bytes, "clean.docx")
    def test_attachment_encrypted_detects_password_zip(self, tmp_path):
        """A zip entry with general-purpose flag bit 0x01 set (the
        encrypted-entry marker) must trip the detector.

        The archive is written normally and the bit is then patched
        into the raw bytes, in both the local file header and the
        central directory entry; the detector reads the latter."""
        mod = _load_smtp_with_quarantine(str(tmp_path))
        import io as _io
        import zipfile as _zf
        container = _io.BytesIO()
        with _zf.ZipFile(container, "w") as archive:
            archive.writestr("payload.bin", b"ciphertext")
        patched = bytearray(container.getvalue())

        # Local file header: 4-byte signature, 2-byte version, then the
        # flag word — its low byte sits at offset 6.
        local_header = patched.find(b"PK\x03\x04")
        assert local_header >= 0
        patched[local_header + 6] |= 0x01

        # Central directory entry: 4-byte signature, two 2-byte version
        # fields, then the flag word — its low byte sits at offset 8.
        central_entry = patched.find(b"PK\x01\x02")
        assert central_entry >= 0
        patched[central_entry + 8] |= 0x01

        assert mod._attachment_encrypted(bytes(patched), "secrets.zip")
    def test_attachment_encrypted_magic_bytes_7z_and_rar(self, tmp_path):
        """7z, RAR and CFBF payloads are flagged on file magic alone;
        bytes without a known signature are not."""
        mod = _load_smtp_with_quarantine(str(tmp_path))
        filler = b"\x00" * 16
        # The detector does not parse archive contents, so the magic
        # prefix by itself (even on an unencrypted archive) is enough.
        flagged = (
            (b"7z\xBC\xAF\x27\x1C", "x.7z"),
            (b"Rar!\x1A\x07", "x.rar"),
            # CFBF: the wrapper used for encrypted Office files.
            (b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "x.xlsx"),
        )
        for magic, attachment_name in flagged:
            assert mod._attachment_encrypted(magic + filler, attachment_name)
        # Plain bytes with no recognised signature.
        assert not mod._attachment_encrypted(b"hello world", "note.txt")
    def test_html_smuggling_fires_on_anchor_plus_blob_script(self, tmp_path):
        """An HTML part pairing a Blob / createObjectURL script with an
        ``<a download>`` anchor sets ``html_smuggling`` to 1."""
        mod = _load_smtp_with_quarantine(str(tmp_path))
        proto, _, _ = _make_protocol(mod)
        boundary = "----HTMLSMUGGLE"
        html_body = "".join([
            "<html><body>",
            "<script>",
            "var data = atob('UEsDBA==');",
            "var blob = new Blob([data]);",
            "var url = URL.createObjectURL(blob);",
            "</script>",
            "<a href='#' download='invoice.zip'>Download invoice</a>",
            "</body></html>",
        ])
        part_open = f"--{boundary}"
        part_close = f"--{boundary}--"
        session = [
            "EHLO x.com",
            "MAIL FROM:<a@b.com>",
            "RCPT TO:<c@d.com>",
            "DATA",
            "Subject: smuggle",
            f"Content-Type: multipart/alternative; boundary={boundary}",
            "MIME-Version: 1.0",
            "",
            part_open,
            "Content-Type: text/html; charset=utf-8",
            "",
            html_body,
            part_close,
            ".",
        ]
        _send(proto, *session)
        stored = next(
            fields for kind, fields in _logged_events(mod)
            if kind == "message_stored"
        )
        assert stored["html_smuggling"] == 1
    def test_html_smuggling_skips_legit_download_link(self, tmp_path):
        """An ``<a download>`` link with no Blob / createObjectURL
        script must leave ``html_smuggling`` at 0 — the ordinary
        "click to download our report" mail is not smuggling."""
        mod = _load_smtp_with_quarantine(str(tmp_path))
        proto, _, _ = _make_protocol(mod)
        boundary = "----LEGITDOWNLOAD"
        html_body = "".join([
            "<html><body>",
            "<p>Quarterly report is ready.</p>",
            "<a href='/report.pdf' download='Q1-report.pdf'>Download</a>",
            "</body></html>",
        ])
        part_open = f"--{boundary}"
        part_close = f"--{boundary}--"
        session = [
            "EHLO x.com",
            "MAIL FROM:<a@b.com>",
            "RCPT TO:<c@d.com>",
            "DATA",
            "Subject: legit",
            f"Content-Type: multipart/alternative; boundary={boundary}",
            "MIME-Version: 1.0",
            "",
            part_open,
            "Content-Type: text/html; charset=utf-8",
            "",
            html_body,
            part_close,
            ".",
        ]
        _send(proto, *session)
        stored = next(
            fields for kind, fields in _logged_events(mod)
            if kind == "message_stored"
        )
        assert stored["html_smuggling"] == 0
    def test_attachment_manifest_carries_macro_and_encrypted_flags(self, tmp_path):
        """The ``attachments_json`` manifest carries per-attachment
        ``macro_indicator`` and ``encrypted`` booleans.

        A docm-shaped zip (with ``word/vbaProject.bin``, no encrypted
        entries) is mailed as a base64 attachment; its manifest entry
        must report macro_indicator True and encrypted False."""
        mod = _load_smtp_with_quarantine(str(tmp_path))
        proto, _, _ = _make_protocol(mod)
        boundary = "----MANIFESTBOOLS"
        import base64 as _b64
        import io as _io
        import zipfile as _zf
        # Assemble the docm-shaped attachment in memory.
        container = _io.BytesIO()
        with _zf.ZipFile(container, "w") as archive:
            archive.writestr("[Content_Types].xml", "<types/>")
            archive.writestr("word/vbaProject.bin", b"VBA")
        encoded = _b64.b64encode(container.getvalue()).decode()
        part_open = f"--{boundary}"
        part_close = f"--{boundary}--"
        session = [
            "EHLO x.com",
            "MAIL FROM:<a@b.com>",
            "RCPT TO:<c@d.com>",
            "DATA",
            "Subject: macro",
            f"Content-Type: multipart/mixed; boundary={boundary}",
            "MIME-Version: 1.0",
            "",
            part_open,
            "Content-Type: text/plain",
            "",
            "see attached",
            part_open,
            'Content-Type: application/vnd.ms-word.document.macroEnabled.12; name="report.docm"',
            'Content-Disposition: attachment; filename="report.docm"',
            "Content-Transfer-Encoding: base64",
            "",
            encoded,
            part_close,
            ".",
        ]
        _send(proto, *session)
        stored = next(
            fields for kind, fields in _logged_events(mod)
            if kind == "message_stored"
        )
        import json as _json
        manifest = _json.loads(stored["attachments_json"])
        assert len(manifest) == 1
        entry = manifest[0]
        assert entry["macro_indicator"] is True
        assert entry["encrypted"] is False
    def test_capture_disabled_when_dir_unset(self, tmp_path, relay_mod):
        """With SMTP_QUARANTINE_DIR unset, message_accepted fires but no
        message_stored event and no files are written."""