DECNET/decnet/ttp/impl/email_lifter.py

"""Email lifter — SMTP message-level technique tagger (E.3.12).

Reads pre-parsed SMTP message payload (headers as a name-only list,
body sha + body text already truncated/scrubbed by the upstream worker,
attachment hashes + names) and emits Initial-Access / Phishing /
Resource-Development techniques per Appendix A.6.

PII discipline (TTP_TAGGING.md §"Hard parts §6") is enforced at the
lifter layer: emitted ``TTPTag.evidence`` only carries fields that
conform to :class:`~decnet.web.db.models.ttp.EmailEvidence`
(``body_sha256``, ``matched_headers`` — names not values,
``rcpt_domain_set`` — domains not addresses, ``attachment_sha256s``,
``rcpt_count``) plus a small set of match-discriminator strings
(``matched_kit``, ``matched_trigger``, ``matched_url``). Raw From /
Return-Path / RCPT addresses, raw body bytes, and decoded payload
previews NEVER appear in evidence.
"""
from __future__ import annotations

import base64
import binascii
import email
import email.errors
import email.message
import email.policy
import hashlib
import re
from collections.abc import Callable
from typing import Any, Final

from decnet.artifacts.paths import ArtifactPathError, resolve_artifact_path
from decnet.ttp.base import TaggerEvent, TolerantTagger
from decnet.ttp.impl._emit import emit_tags
from decnet.ttp.impl._rule_index import RuleIndex
from decnet.ttp.impl._state import is_active
from decnet.ttp.impl.rule_engine import CompiledRule
from decnet.ttp.store.base import RuleStore
from decnet.web.db.models.ttp import TTPTag


Predicate = Callable[
    [dict[str, Any], dict[str, Any]],
    "dict[str, Any] | None",
]


# ── Helpers ─────────────────────────────────────────────────────────


def _domain(addr_or_domain: str | None) -> str | None:
    if not isinstance(addr_or_domain, str):
        return None
    if not addr_or_domain:
        return None
    if "@" in addr_or_domain:
        return addr_or_domain.split("@", 1)[1].lower().strip()
    return addr_or_domain.lower().strip()


def _safe_evidence(payload: dict[str, Any]) -> dict[str, Any]:
    """Build the EmailEvidence-conformant base evidence dict.

    Only PII-safe keys: body sha (already a hash), header NAMES (not
    values), recipient DOMAINS (not addresses), attachment hashes,
    rcpt count. Raw addresses, raw body, raw header values explicitly
    excluded.
    """
    rcpt_domains_raw = payload.get("rcpt_domains") or []
    rcpt_domains = [
        d.lower() for d in rcpt_domains_raw if isinstance(d, str)
    ]
    attachment_hashes = payload.get("attachment_sha256s") or []
    if not isinstance(attachment_hashes, list):
        attachment_hashes = []
    body_sha = payload.get("body_sha256") or ""
    if not isinstance(body_sha, str):
        body_sha = ""
    rcpt_count = payload.get("rcpt_count")
    if not isinstance(rcpt_count, int):
        rcpt_count = 0
    return {
        "body_sha256": body_sha,
        "matched_headers": [],
        "rcpt_domain_set": sorted(set(rcpt_domains)),
        "attachment_sha256s": [
            h for h in attachment_hashes if isinstance(h, str)
        ],
        "rcpt_count": rcpt_count,
    }


# ── Per-rule predicates ─────────────────────────────────────────────


def _p_open_relay(
    spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
    threshold = int(spec.get("rcpt_threshold", 10))
    rcpt_count = payload.get("rcpt_count")
    if not isinstance(rcpt_count, int) or rcpt_count < threshold:
        return None
    if spec.get("require_foreign_from"):
        from_domain = _domain(payload.get("from_domain") or payload.get("from"))
        mail_from = _domain(
            payload.get("mail_from_domain") or payload.get("mail_from"),
        )
        if not from_domain or not mail_from or from_domain == mail_from:
            return None
    return {"matched_headers": ["From", "Mail-From"]}


def _p_mass_phish(
    spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
    threshold = int(spec.get("rcpt_threshold", 25))
    rcpt_count = payload.get("rcpt_count")
    if not isinstance(rcpt_count, int) or rcpt_count < threshold:
        return None
    # The "campaign" half: upstream must have observed body simhash
    # recurring across recipients. Without that signal, high-RCPT alone
    # is open-relay territory (R0041), not mass-phish. The simhash
    # derivation lives in the SMTP worker (out of scope here).
    if not isinstance(payload.get("body_simhash"), (str, int)):
        return None
    return {}


def _p_xmailer_kit(
    spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
    x_mailer = payload.get("x_mailer")
    if not isinstance(x_mailer, str) or not x_mailer:
        return None
    matched_kit = payload.get("matched_kit")
    if isinstance(matched_kit, str) and matched_kit:
        return {"matched_kit": matched_kit, "matched_headers": ["X-Mailer"]}
    # Catalogue match flag — upstream marks it via xmailer_kit_match.
    if payload.get("xmailer_kit_match") is True:
        return {"matched_headers": ["X-Mailer"]}
    return None


_PUNYCODE_PREFIX_DEFAULT: Final[str] = "xn--"


def _p_idn_url(
    spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
    prefix = spec.get("punycode_prefix") or _PUNYCODE_PREFIX_DEFAULT
    if not isinstance(prefix, str):
        prefix = _PUNYCODE_PREFIX_DEFAULT
    urls = payload.get("urls") or []
    if not isinstance(urls, list):
        return None
    for url in urls:
        if isinstance(url, str) and prefix in url:
            # Carry only the punycode-bearing host portion as a match
            # discriminator. NEVER carry the full URL (could contain
            # credential-harvest path with PII).
            host = _extract_host(url)
            return {
                "matched_url_host": host or "",
                "matched_headers": ["body"],
            }
    return None


def _extract_host(url: str) -> str | None:
    m = re.match(r"https?://([^/]+)", url)
    if m:
        return m.group(1).lower()
    return None


def _p_sender_masquerade(
    spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
    signals_raw = spec.get("signals", [])
    if not isinstance(signals_raw, list):
        return None
    signals = {s for s in signals_raw if isinstance(s, str)}
    matched: list[str] = []
    if "from_returnpath_mismatch" in signals:
        if (
            _domain(payload.get("from_domain")) is not None
            and _domain(payload.get("return_path_domain")) is not None
            and _domain(payload.get("from_domain"))
            != _domain(payload.get("return_path_domain"))
        ):
            matched.append("from_returnpath_mismatch")
    if "from_mailfrom_mismatch" in signals:
        if (
            _domain(payload.get("from_domain")) is not None
            and _domain(payload.get("mail_from_domain")) is not None
            and _domain(payload.get("from_domain"))
            != _domain(payload.get("mail_from_domain"))
        ):
            matched.append("from_mailfrom_mismatch")
    if "dkim_fail" in signals and payload.get("dkim_signed") is False:
        matched.append("dkim_fail")
    if "spf_fail" in signals and payload.get("spf_pass") is False:
        matched.append("spf_fail")
    if not matched:
        return None
    headers: list[str] = []
    if any("from_" in m for m in matched):
        headers.extend(["From", "Return-Path"])
    if "dkim_fail" in matched:
        headers.append("DKIM-Signature")
    if "spf_fail" in matched:
        headers.append("Authentication-Results")
    return {
        "matched_signals": matched,
        "matched_headers": sorted(set(headers)),
    }


def _p_malicious_attachment(
    spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
    triggers_raw = spec.get("triggers", [])
    triggers = (
        {t for t in triggers_raw if isinstance(t, str)}
        if isinstance(triggers_raw, list)
        else set()
    )
    if "office_macro" in triggers and payload.get("attachment_macros") is True:
        return {"matched_trigger": "office_macro"}
    if (
        "protected_archive" in triggers
        and payload.get("attachment_password_protected") is True
    ):
        return {"matched_trigger": "protected_archive"}
    if "html_smuggling" in triggers and payload.get("html_smuggling") is True:
        return {"matched_trigger": "html_smuggling"}
    if "mal_hash_match" in triggers and payload.get("mal_hash_match") is True:
        return {"matched_trigger": "mal_hash_match"}
    extensions = payload.get("attachment_extensions") or []
    if isinstance(extensions, list):
        ext_set = {
            e.lower().lstrip(".") for e in extensions if isinstance(e, str)
        }
        for ext_trigger in ("lnk", "iso", "img"):
            if ext_trigger in triggers and ext_trigger in ext_set:
                return {"matched_trigger": ext_trigger}
    return None


def _extract_body_text(msg: email.message.EmailMessage) -> str | None:
    """Best-effort plain-text body extraction from a parsed email.

    Prefers ``text/plain``. Falls back to ``text/html`` (raw — predicates
    here are substring-matchers, no need to de-tag). Returns None when
    the message has no readable text part. Requires the message to have
    been parsed with ``policy=email.policy.default`` so parts are
    ``EmailMessage`` instances (``get_content`` is policy-conditional).
    """
    candidates: list[email.message.EmailMessage] = list(msg.walk())
    for content_type in ("text/plain", "text/html"):
        for part in candidates:
            if part.get_content_type() != content_type:
                continue
            try:
                content = part.get_content()
            except (LookupError, ValueError, KeyError):
                continue
            if isinstance(content, str):
                return content
    return None


def _load_body_text(payload: dict[str, Any]) -> str | None:
    """Return the email body text for predicates that need it.

    If the bus payload already carries ``body_text`` (older deployments
    or master-side producers), use it. Otherwise disk-reach: open the
    ``.eml`` from ``/var/lib/decnet/artifacts/{decky_id}/smtp/{stored_as}``
    and parse the body in-process.

    The decoded body is memoized back into the payload dict so the next
    predicate on the same event reuses it without re-opening the file.
    The bus envelope only carries the artifact pointer (``decky_id`` +
    ``stored_as``); raw body bytes never cross the host boundary
    (DEBT-047). Returns None on any failure — predicates then short
    circuit to no-match, matching pre-disk-reach behavior when fields
    were absent.
    """
    existing = payload.get("body_text")
    if isinstance(existing, str):
        return existing
    decky_id = payload.get("decky_id")
    stored_as = payload.get("stored_as")
    if not isinstance(decky_id, str) or not isinstance(stored_as, str):
        return None
    try:
        path = resolve_artifact_path(decky_id, stored_as, "smtp")
    except ArtifactPathError:
        return None
    try:
        with open(path, "rb") as fh:
            msg = email.message_from_binary_file(
                fh, policy=email.policy.default,
            )
    except (OSError, email.errors.MessageError):
        return None
    body = _extract_body_text(msg)
    if body is None:
        return None
    payload["body_text"] = body
    return body


def _p_bec(
    spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
    subject = payload.get("subject")
    if not isinstance(subject, str):
        return None
    body_text = _load_body_text(payload)
    if body_text is None:
        return None
    subj_kws = spec.get("subject_keywords", [])
    body_kws = spec.get("body_action_keywords", [])
    if not isinstance(subj_kws, list) or not isinstance(body_kws, list):
        return None
    subj_l = subject.lower()
    body_l = body_text.lower()
    subj_hit = next(
        (k for k in subj_kws if isinstance(k, str) and k.lower() in subj_l),
        None,
    )
    body_hit = next(
        (k for k in body_kws if isinstance(k, str) and k.lower() in body_l),
        None,
    )
    if not subj_hit or not body_hit:
        return None
    return {
        "matched_subject_kw": subj_hit,
        "matched_body_kw": body_hit,
        "matched_headers": ["Subject"],
    }


_BASE64_RE = re.compile(r"[A-Za-z0-9+/]{32,}={0,2}")


def _p_encoded_payload(
    spec: dict[str, Any], payload: dict[str, Any],
) -> dict[str, Any] | None:
    min_bytes = int(spec.get("min_bytes", 4096))
    body_text = _load_body_text(payload)
    if not body_text:
        return None
    # Upstream may pre-compute the largest decoded base64 length.
    body_b64_bytes = payload.get("body_base64_bytes")
    if isinstance(body_b64_bytes, int) and body_b64_bytes >= min_bytes:
        return {"encoded_byte_count": body_b64_bytes}
    # Fallback: best-effort scan of the body text. Cap the work at the
    # first match >= threshold to avoid quadratic behavior on a hostile
    # body. Decoded bytes are NEVER returned — only the count.
    for m in _BASE64_RE.finditer(body_text):
        chunk = m.group(0)
        try:
            decoded = base64.b64decode(chunk, validate=True)
        except (binascii.Error, ValueError):
            continue
        if len(decoded) >= min_bytes:
            return {"encoded_byte_count": len(decoded)}
    return None


_PREDICATES: Final[dict[str, Predicate]] = {
    "lifter:email_open_relay": _p_open_relay,
    "lifter:email_mass_phish": _p_mass_phish,
    "lifter:email_xmailer_kit": _p_xmailer_kit,
    "lifter:email_idn_url": _p_idn_url,
    "lifter:email_sender_masquerade": _p_sender_masquerade,
    "lifter:email_malicious_attachment": _p_malicious_attachment,
    "lifter:email_bec": _p_bec,
    "lifter:email_encoded_payload": _p_encoded_payload,
}


# Allowed keys in TTPTag.evidence for source_kind=email. Used both as
# the assembly contract here AND by tests/ttp/test_email_lifter.py to
# guard against a future predicate accidentally leaking PII.
_EMAIL_EVIDENCE_ALLOWED_KEYS: Final[frozenset[str]] = frozenset({
    # EmailEvidence base
    "body_sha256",
    "matched_headers",
    "rcpt_domain_set",
    "attachment_sha256s",
    "rcpt_count",
    # PII-safe match discriminators
    "matched_kit",
    "matched_trigger",
    "matched_url_host",
    "matched_signals",
    "matched_subject_kw",
    "matched_body_kw",
    "encoded_byte_count",
})


def _filter_evidence(evidence: dict[str, Any]) -> dict[str, Any]:
    """Drop any key not in the PII-safe allowlist.

    Defense-in-depth: even if a predicate accidentally returns a raw
    address or body field, this filter strips it before the tag is
    constructed. Asserted by ``test_email_lifter.py``.
    """
    return {
        k: v for k, v in evidence.items()
        if k in _EMAIL_EVIDENCE_ALLOWED_KEYS
    }


class EmailLifter(TolerantTagger):
    name = "email"
    HANDLES = frozenset({"email"})
    OWNED_PREFIX: Final[str] = "lifter:email_"

    def __init__(self, store: RuleStore) -> None:
        self._store = store
        self._index = RuleIndex()

    @classmethod
    def _owns(cls, rule: CompiledRule) -> bool:
        kind = rule.match_spec.get("kind", "")
        return isinstance(kind, str) and kind.startswith(cls.OWNED_PREFIX)

    async def watch_store(self) -> None:
        await self._index.watch(self._store, predicate=self._owns)

    async def _tag_impl(self, event: TaggerEvent) -> list[TTPTag]:
        out: list[TTPTag] = []
        base_evidence = _safe_evidence(event.payload)
        for rule in self._index.values():
            if event.source_kind not in rule.applies_to:
                continue
            if not is_active(rule.state):
                continue
            kind = rule.match_spec.get("kind", "")
            handler = _PREDICATES.get(kind)
            if handler is None:
                continue
            extra = handler(rule.match_spec, event.payload)
            if extra is None:
                continue
            evidence = dict(base_evidence)
            # Allow predicates to extend matched_headers without
            # clobbering the base list.
            extra_headers = extra.pop("matched_headers", None)
            if isinstance(extra_headers, list):
                merged = list(evidence.get("matched_headers", []))
                merged.extend(h for h in extra_headers if isinstance(h, str))
                evidence["matched_headers"] = sorted(set(merged))
            evidence.update(extra)
            evidence = _filter_evidence(evidence)
            # Body sha is required by EmailEvidence; if upstream
            # didn't supply one, derive from body_text (best-effort).
            if not evidence.get("body_sha256"):
                body_text = event.payload.get("body_text")
                if isinstance(body_text, str) and body_text:
                    evidence["body_sha256"] = hashlib.sha256(
                        body_text.encode("utf-8", errors="replace"),
                    ).hexdigest()
            out.extend(emit_tags(rule, event, evidence))
        return out


__all__ = ["EmailLifter"]