#!/usr/bin/env python3
"""
SMTP server — emulates a realistic ESMTP server (Postfix-style).

Two modes of operation, controlled by SMTP_OPEN_RELAY:

  SMTP_OPEN_RELAY=0 (default) — credential harvester
      AUTH attempts are logged and rejected (535).
      RCPT TO is rejected with 554 (relay denied) for all recipients.
      This captures credential stuffing and scanning activity.

  SMTP_OPEN_RELAY=1 — open relay bait
      AUTH is accepted for any credentials (235).
      RCPT TO is accepted for any domain (250).
      DATA is fully buffered until CRLF.CRLF and acknowledged with a
      queued-as message ID. Attractive to spam relay operators.

The DATA state machine (and the 502-per-line bug) is fixed in both modes.
"""

import asyncio
import base64
import binascii
import hashlib
import io
import json
import os
import random as _rand
import re
import time
import zipfile
from datetime import datetime, timezone
from email import message_from_bytes
from email.header import decode_header, make_header
from email.message import Message
from typing import cast

# lxml is optional: when the import fails for any reason the module keeps
# loading and `_lxml_html` is left as None.
try:
    from lxml import html as _lxml_html
except Exception:  # pragma: no cover — defensive when lxml unavailable
    _lxml_html = None

import instance_seed as _seed
from syslog_bridge import (
    SEVERITY_WARNING,
    encode_secret,
    forward_syslog,
    syslog_line,
    write_syslog_file,
)

# Runtime configuration, all read once from the environment at import time.
NODE_NAME = os.environ.get("NODE_NAME", "mailserver")
SERVICE_NAME = os.environ.get("SMTP_SERVICE_NAME", "smtp")
LOG_TARGET = os.environ.get("LOG_TARGET", "")
PORT = int(os.environ.get("PORT", "25"))
# Only the exact value "1" (after stripping whitespace) enables open-relay mode.
OPEN_RELAY = os.environ.get("SMTP_OPEN_RELAY", "0").strip() == "1"

# In open-relay mode, optionally restrict which creds succeed. Blank means
# "accept anything". Format: "user1,user2,..." — any name not in the list
# gets a 535 instead of 235, so the relay looks realistically selective.
_AUTH_WHITELIST = {
    u.strip()
    for u in os.environ.get("SMTP_AUTH_WHITELIST", "").split(",")
    if u.strip()
}

# Open-relay filtering.
# Even compromised/misconfigured relays aren't pure tarpits — Postfix rejects
# malformed addresses at RCPT time, and many drop a small fraction of external
# recipients under greylisting or reputation checks. Accepting literally every
# RCPT is a honeypot tell.
#
# Address shape: optional surrounding angle brackets, group 1 = local part
# (no whitespace, angle brackets or "@"), group 2 = domain ending in an
# alphabetic TLD of at least two letters.
# NOTE(review): the opening of this pattern (`<?([^\s<>`) was missing from the
# mangled copy of this file, which left an unbalanced ")" and made
# re.compile() raise at import time. It is restored here — confirm against
# the upstream source.
_ADDR_RE = re.compile(r"^<?([^\s<>@]+)@([A-Za-z0-9.-]+\.[A-Za-z]{2,})>?$")
_BLOCKED_TLDS = {"invalid", "test", "localhost", "local", "example"}
_RCPT_DROP_RATE = float(os.environ.get("SMTP_RCPT_DROP_RATE", "0.08"))

_SMTP_BANNER = os.environ.get("SMTP_BANNER", f"220 {NODE_NAME} ESMTP Postfix (Debian/GNU)")
_SMTP_MTA = os.environ.get("SMTP_MTA", NODE_NAME)

# Full-message capture: bind-mounted quarantine dir (host path
# /var/lib/decnet/artifacts/{decky}/smtp). When unset, capture is skipped —
# the container still accepts mail, it just doesn't persist the body. Used by
# tests and by deployments that don't want disk persistence.
_QUARANTINE_DIR = os.environ.get("SMTP_QUARANTINE_DIR", "")

# EHLO advertises SIZE 10240000 (10 MB). Cap the accumulator in the same
# ballpark so a crafted client can't OOM the container by streaming forever.
# NOTE(review): the default below is 10485760 (10 MiB), slightly above the
# advertised 10240000 — confirm which figure is intended.
_MAX_BODY_BYTES = int(os.environ.get("SMTP_MAX_BODY_BYTES", "10485760"))

# Postfix's queue-ID character set (real one: excludes vowels and look-alikes
# like 0/O, 1/I, so scanners that know Postfix's alphabet are satisfied).
_QUEUE_CHARS = "BCDFGHJKLMNPQRSTVWXYZ23456789"
_Q_BASE = len(_QUEUE_CHARS)


def _log(event_type: str, severity: int = 6, **kwargs) -> None:
    """Build one syslog line for this service and hand it to both sinks.

    Args:
        event_type: Event name passed through to ``syslog_line``.
        severity: Severity value passed through to ``syslog_line``
            (defaults to 6).
        **kwargs: Extra fields forwarded unchanged to ``syslog_line``.
    """
    line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
    write_syslog_file(line)
    forward_syslog(line, LOG_TARGET)


def _rand_msg_id() -> str:
    """Return a Postfix-style queue ID.

    The current wall-clock time in microseconds is encoded with the
    vowel-free ``_QUEUE_CHARS`` alphabet, most significant digit first. At
    most the ten least-significant base-``_Q_BASE`` digits are kept, so the
    time-derived part has a bounded length. One further character, picked
    via ``_seed.rng``, is appended so two instances are unlikely to emit
    identical IDs at the same instant.

    Returns:
        The encoded time digits followed by one suffix character.
    """
    us = int(time.time() * 1_000_000)
    digits: list[str] = []
    # divmod yields the least-significant digit first; reversed below.
    while us and len(digits) < 10:
        us, rem = divmod(us, _Q_BASE)
        digits.append(_QUEUE_CHARS[rem])
    # A zero timestamp would produce no digits; fall back to the "zero" char.
    base = "".join(reversed(digits)) or _QUEUE_CHARS[0]
    suffix_idx = _seed.rng.randint(0, _Q_BASE - 1)
    return base + _QUEUE_CHARS[suffix_idx]


def _decode_header(raw: str | None) -> str:
    """Best-effort decode of an RFC 2047 encoded-word header to Unicode.

    Args:
        raw: Raw header value, or ``None`` / ``""`` when the header is absent.

    Returns:
        ``""`` for a missing or empty value, the decoded text on success, or
        ``raw`` unchanged when decoding raises — so callers can always treat
        the result as a plain string.
    """
    if not raw:
        return ""
    try:
        return str(make_header(decode_header(raw)))
    except Exception:
        # Deliberate best-effort: a malformed header must never abort
        # message processing, so fall back to the undecoded value.
        return raw


# Stored_as format mirrors the SSH artifact convention so the existing
# /api/v1/artifacts/{decky}/{stored_as} endpoint and its filename regex
# accept SMTP drops unchanged. The name is made of underscore-separated
# fields (the field placeholders are missing from this copy of the comment —
# TODO restore). The basename always ends in .eml so operators can open it
# in any MUA. The pattern below matches every character outside
# [A-Za-z0-9._-].
_STORED_AS_BASE_RE = re.compile(r"[^A-Za-z0-9._-]")

# Body-URL extraction. Tight enough to skip stray text that happens to
# start with "http"; loose enough to catch IDN punycode and query strings.
# A match ends at whitespace, an angle bracket, a single or double quote,
# ")" or "]", so URLs embedded in HTML markup (inside a quoted attribute or
# angle brackets) round-trip cleanly.
_URL_RE = re.compile(r"https?://[^\s<>\"'\)\]]+")

# Authentication-Results parsing. We only care about the binary
# pass-or-not for dkim and spf — finer-grained verdicts (neutral /
# softfail / temperror) are evidence at best and the EmailLifter does
# not key on them.
_DKIM_PASS_RE = re.compile(r"\bdkim\s*=\s*pass\b", re.IGNORECASE)
_SPF_PASS_RE = re.compile(r"\bspf\s*=\s*pass\b", re.IGNORECASE)

# Base64 chunk detector.
Mirrors the regex the EmailLifter uses # (`decnet/ttp/impl/email_lifter.py:_BASE64_RE`) so the decky-side # precompute and the lifter's fallback agree on chunk boundaries. _BASE64_RE = re.compile(r"[A-Za-z0-9+/]{32,}={0,2}") # Token boundary for body simhash. Lower-cased and word-class only so # whitespace mutations and punctuation flips don't fragment the token # stream. _SIMHASH_TOKEN_RE = re.compile(r"\w+", re.UNICODE) # HTML-smuggling regex fallback, used when lxml is unavailable or fails # to parse a malformed body. Combines the three structural signals into # one OR-combined regex; FP rate is higher than the lxml path so it is # only the second-pass safety net. _HTML_SMUGGLE_RE = re.compile( r"]*\bdownload\b[^>]*>" r"|new\s+Blob\s*\(" r"|new\s+Uint8Array\s*\(" r"|window\.URL\.createObjectURL\s*\(", re.IGNORECASE, ) # Magic-bytes for the encrypted-archive bool. Compared after stripping # leading whitespace; first 8 bytes is enough for every format we # recognise. ZIP / docx / xlsx round-trip via the central directory's # encryption flag and aren't here. _MAGIC_7Z = b"7z\xBC\xAF\x27\x1C" _MAGIC_RAR = b"Rar!\x1A\x07" _MAGIC_CFBF = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" def _empty_summary() -> dict: return { "subject": "", "from_hdr": "", "to_hdr": "", "date_hdr": "", "message_id_hdr": "", "content_type": "", "return_path": "", "x_mailer": "", "dkim_signed": False, "spf_pass": False, "attachments": [], "urls": [], "body_simhash": "", "body_base64_bytes": 0, "html_smuggling": False, } def _body_simhash(body_text: str) -> str: """Charikar 64-bit simhash over word tokens, hex-encoded. Inlined rather than pulling the ``simhash`` PyPI dep (which transitively brings numpy ~50 MB into a slim decky container) — the algorithm is ~15 lines and fully equivalent for this use. Token weighting is by frequency; per-token hash is md5[:8] for speed (this is a content fingerprint, not a security primitive). 
Returns a 16-hex-char string, or ``""`` on empty/no-token input (the lifter's ``_p_mass_phish`` predicate accepts str|int and rejects non-strings, so the empty case is "no signal" — exactly what we want when a multipart message has no usable text body). """ tokens = _SIMHASH_TOKEN_RE.findall(body_text.lower()) if body_text else [] if not tokens: return "" counts: dict[str, int] = {} for tok in tokens: counts[tok] = counts.get(tok, 0) + 1 bits = [0] * 64 for tok, weight in counts.items(): h = int.from_bytes( hashlib.md5(tok.encode("utf-8", errors="replace")).digest()[:8], # noqa: S324 "big", ) for i in range(64): if h & (1 << i): bits[i] += weight else: bits[i] -= weight out = 0 for i in range(64): if bits[i] > 0: out |= (1 << i) return format(out, "016x") def _body_base64_bytes(body_text: str) -> int: """Largest decoded base64 chunk's byte count in the body, or 0. Mirrors the EmailLifter's ``_p_encoded_payload`` fallback exactly: iterate ``_BASE64_RE`` matches, attempt strict decode, return the largest decoded length seen. Computed once decky-side so the lifter never has to scan body text — R0048 fires from this scalar alone. """ if not body_text: return 0 largest = 0 for m in _BASE64_RE.finditer(body_text): chunk = m.group(0) try: decoded = base64.b64decode(chunk, validate=True) except (binascii.Error, ValueError): continue if len(decoded) > largest: largest = len(decoded) return largest def _attachment_macro_indicator(payload: bytes, filename: str) -> bool: """True if the attachment is an OOXML container with a VBA macro stream (``vbaProject.bin``). Modern macro-bearing Office files (.docm / .xlsm / .pptm and .docx with injected macros) are zip containers carrying a ``word/vbaProject.bin`` (or analogous) entry. Catches ~95% of in-the-wild macro phishing. Legacy .xls (CFBF, not zip) is a follow-up — see DEBT entry. 
""" if not payload or len(payload) < 4 or payload[:2] != b"PK": return False try: with zipfile.ZipFile(io.BytesIO(payload)) as zf: for name in zf.namelist(): if name.endswith("vbaProject.bin"): return True except (zipfile.BadZipFile, OSError, ValueError): return False return False def _attachment_encrypted(payload: bytes, filename: str) -> bool: """True if the attachment is an encrypted/password-protected archive or Office container. ZIP / OOXML: read the central directory's encryption bit (``flag_bits & 0x1`` on any entry). 7z / RAR: file-magic match. Encrypted Office (XLSX-with-password): wrapped in a CFBF container (magic ``D0 CF 11 E0``) — catch on filename hint. """ if not payload or len(payload) < 8: return False head = payload[:8] if head.startswith(_MAGIC_7Z) or head.startswith(_MAGIC_RAR): return True if head.startswith(_MAGIC_CFBF): # Naked CFBF without an Office filename is rare; treat any # CFBF as potentially encrypted Office for the bool flag. return True if payload[:2] == b"PK": try: with zipfile.ZipFile(io.BytesIO(payload)) as zf: for info in zf.infolist(): if info.flag_bits & 0x1: return True except (zipfile.BadZipFile, OSError, ValueError): return False return False def _html_smuggling(msg: Message) -> bool: """True if any text/html part exhibits the HTML-smuggling shape. Structural lxml parse first: walk anchors and scripts, fire when an ```` carries a ``download`` attribute AND a sibling / near-ancestor ``