feat(canary): honeydoc_docx + honeydoc_pdf generators

honeydoc previously emitted HTML only — operators picking 'Document' out of the dropdown got a .html file dropped at /Documents/quarterly_report.docx, which any attacker would clock the moment they ran 'file' on it. Two new generators that emit the real artifact format: - honeydoc_docx: stdlib zipfile only. Builds a minimal but valid Office Open XML zip with the same Q3 review body as the HTML flavor and an external-image relationship pointing at the callback URL — same trick the operator-upload DOCX instrumenter uses, fetched on document open by Word and LibreOffice. Reuses _drawing() and _next_rid() from instrumenters/docx.py to keep the body/relationships shape identical between synthesised and instrumented files. - honeydoc_pdf: pikepdf-backed. One-page PDF in the 14 base fonts (Helvetica, no font embedding), realistic body, /OpenAction /URI on the catalog so most viewers fire the callback on document open. Falls back to a clear error if pikepdf is missing so the operator can switch to honeydoc / honeydoc_docx. Default placement paths now reflect each generator's true extension (.html / .docx / .pdf) so the UI suggests something sensible. Both generators surfaced in the New Token modal's generator dropdown.
2026-04-27 13:44:20 -04:00
parent c17b9e01c8
commit 5ac8e0f91a
8 changed files with 312 additions and 5 deletions
--- a/decnet/canary/factory.py
+++ b/decnet/canary/factory.py
@@ -18,6 +18,8 @@ KNOWN_GENERATORS: Tuple[str, ...] = (
    "ssh_key",
    "aws_creds",
    "honeydoc",
+    "honeydoc_docx",
+    "honeydoc_pdf",
 )

 KNOWN_INSTRUMENTERS: Tuple[str, ...] = (
@@ -52,6 +54,12 @@ def get_generator(name: str) -> CanaryGenerator:
    if name == "honeydoc":
        from decnet.canary.generators.honeydoc import HoneydocGenerator
        return HoneydocGenerator()
+    if name == "honeydoc_docx":
+        from decnet.canary.generators.honeydoc_docx import HoneydocDocxGenerator
+        return HoneydocDocxGenerator()
+    if name == "honeydoc_pdf":
+        from decnet.canary.generators.honeydoc_pdf import HoneydocPdfGenerator
+        return HoneydocPdfGenerator()
    raise ValueError(
        f"Unknown canary generator: {name!r}. Known: {KNOWN_GENERATORS}"
    )
--- a/decnet/canary/generators/honeydoc_docx.py
+++ b/decnet/canary/generators/honeydoc_docx.py
@@ -0,0 +1,133 @@
+"""Real-DOCX honeydoc generator.
+
+Synthesises a minimal but structurally valid DOCX from scratch via
+stdlib :mod:`zipfile`, then uses the same external-image relationship
+trick that powers :mod:`decnet.canary.instrumenters.docx` to embed
+the callback URL.  No python-docx dependency.
+
+The output opens cleanly in Word / LibreOffice; both fetch the
+external image relationship on document load.
+"""
+from __future__ import annotations
+
+import io
+import zipfile
+
+from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
+from decnet.canary.instrumenters.docx import _drawing, _next_rid
+
+
+# ``[Content_Types].xml``: default content types for ``.xml`` / ``.rels``
+# parts plus the override that marks ``/word/document.xml`` as the main
+# WordprocessingML document part.
+_CONTENT_TYPES = (
+    b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+    b'<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
+    b'<Default Extension="xml" ContentType="application/xml"/>'
+    b'<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
+    b'<Override PartName="/word/document.xml" '
+    b'ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>'
+    b'</Types>'
+)
+
+# ``_rels/.rels``: the single package-level relationship pointing at the
+# main document part.
+_PACKAGE_RELS = (
+    b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+    b'<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
+    b'<Relationship Id="rId1" '
+    b'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" '
+    b'Target="word/document.xml"/>'
+    b'</Relationships>'
+)
+
+# Body text, one entry per paragraph.  An empty string stands for a blank
+# spacer paragraph.
+_BODY_PARAGRAPHS = (
+    "Q3 Operations Review (DRAFT — DO NOT DISTRIBUTE)",
+    "",
+    (
+        "Forecast and remediation timeline below. Numbers are preliminary "
+        "and subject to revision before the all-hands."
+    ),
+    "",
+    "Region        Incidents     MTTR (h)",
+    "us-east       14            3.2",
+    "us-west       9             4.7",
+    "eu-central    22            2.1",
+    "",
+    "Internal contact: secops@internal",
+)
+
+
+def _document_xml(rid_with_drawing: str | None = None) -> bytes:
+    """Render ``word/document.xml`` for the synthesised honeydoc.
+
+    Each non-empty entry of ``_BODY_PARAGRAPHS`` becomes one
+    whitespace-preserving ``<w:p>`` run; each empty entry becomes an
+    empty ``<w:p/>`` spacer.  When ``rid_with_drawing`` is truthy, the
+    markup returned by the DOCX instrumenter's ``_drawing()`` for that
+    rId is appended after the last paragraph so the body references the
+    external image relationship.
+    """
+    rendered = "".join(
+        f'<w:p><w:r><w:t xml:space="preserve">{_xml_escape(text)}</w:t></w:r></w:p>'
+        if text
+        else "<w:p/>"
+        for text in _BODY_PARAGRAPHS
+    )
+    # NOTE(review): only ``xmlns:w`` is declared on the root element, so
+    # ``_drawing()`` presumably declares any other prefixes it uses
+    # inline -- confirm against instrumenters/docx.py.
+    tracker = _drawing(rid_with_drawing).decode() if rid_with_drawing else ""
+    pieces = [
+        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>',
+        '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">',
+        "<w:body>",
+        rendered,
+        tracker,
+        "</w:body>",
+        "</w:document>",
+    ]
+    return "".join(pieces).encode()
+
+
+def _xml_escape(s: str) -> str:
+    """Escape ``&``, ``<`` and ``>`` so *s* is safe as XML text content.
+
+    Quotes are left untouched, so the result is only suitable for
+    element text, not for attribute values.
+    """
+    # One pass over the input; each source character is mapped at most
+    # once, so already-produced entities are never re-escaped.
+    entity_map = str.maketrans({"&": "&amp;", "<": "&lt;", ">": "&gt;"})
+    return s.translate(entity_map)
+
+
+def _document_rels(rid: str, url: str) -> bytes:
+    """Build ``word/_rels/document.xml.rels`` with one external image relationship.
+
+    ``rid`` is the relationship id referenced from the document body and
+    ``url`` is the callback URL used as the external ``Target``.  The
+    URL is XML-escaped before being placed in the attribute, so a
+    callback base carrying a query string (``&``) or other markup
+    characters still yields a well-formed part.  Returns the encoded
+    XML bytes.
+    """
+    # Function-scope import keeps this helper self-contained.
+    from xml.sax.saxutils import escape
+
+    # ``escape`` handles & < > by default; the extra entity covers the
+    # double quote because the value sits inside a "..."-quoted attribute.
+    target = escape(url, {'"': "&quot;"})
+    return (
+        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+        '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
+        f'<Relationship Id="{rid}" '
+        'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" '
+        f'Target="{target}" TargetMode="External"/>'
+        '</Relationships>'
+    ).encode()
+
+
+class HoneydocDocxGenerator(CanaryGenerator):
+    """Generator that synthesises a from-scratch DOCX carrying the callback URL.
+
+    The archive holds four parts: content types, package relationships,
+    the document body, and the document relationships that point an
+    external image at the callback URL.
+    """
+
+    name = "honeydoc_docx"
+
+    # Timestamp stamped on every zip member.  ``ZipFile.writestr`` given a
+    # plain archive name records the current wall-clock time, which would
+    # contradict the back-dated ``mtime_offset`` on the artifact and make
+    # the bytes differ between otherwise identical runs.  1980-01-01 is
+    # the ``ZipInfo`` default and the earliest date the format can hold.
+    _ZIP_DATE_TIME = (1980, 1, 1, 0, 0, 0)
+
+    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
+        """Return the DOCX artifact for *ctx*.
+
+        The callback URL is ``<ctx.http_base>/c/<ctx.callback_token>``
+        with any trailing slash on the base removed.  ``path`` is left
+        empty on the returned artifact.
+        """
+        url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
+        # Pick a stable rId — there's only one relationship in the
+        # synthesised file, so any unused id works.  Reuse the
+        # instrumenter's allocator against the bare relationships
+        # skeleton for parity with operator-uploaded DOCX flow.
+        skeleton = (
+            b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+            b'<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
+            b'</Relationships>'
+        )
+        rid = _next_rid(skeleton)
+
+        parts = (
+            ("[Content_Types].xml", _CONTENT_TYPES),
+            ("_rels/.rels", _PACKAGE_RELS),
+            ("word/document.xml", _document_xml(rid)),
+            ("word/_rels/document.xml.rels", _document_rels(rid, url)),
+        )
+
+        out = io.BytesIO()
+        with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zf:
+            for arcname, payload in parts:
+                member = zipfile.ZipInfo(arcname, date_time=self._ZIP_DATE_TIME)
+                # An explicit ZipInfo defaults to ZIP_STORED and carries no
+                # permission bits; restore what writestr(arcname, ...) used.
+                member.compress_type = zipfile.ZIP_DEFLATED
+                member.external_attr = 0o600 << 16
+                zf.writestr(member, payload)
+
+        return CanaryArtifact(
+            path="",
+            content=out.getvalue(),
+            mode=0o644,
+            mtime_offset=-86400 * 21,
+            generator=self.name,
+            notes=[
+                "synthesised DOCX with realistic Q3 review body",
+                f"external-image relationship {rid} -> {url}",
+            ],
+        )
--- a/decnet/canary/generators/honeydoc_pdf.py
+++ b/decnet/canary/generators/honeydoc_pdf.py
@@ -0,0 +1,127 @@
+"""Real-PDF honeydoc generator (uses :mod:`pikepdf`).
+
+Builds a one-page PDF with the same Q3-review body as the HTML/DOCX
+flavors and installs an ``/OpenAction`` ``/URI`` action on the
+catalog so most viewers fire the callback the moment the document
+opens.
+
+Pikepdf is now a hard dependency for this generator (the operator
+installed it explicitly so we can use it).  We still surface a
+clear :class:`InstrumenterRejectedError` when imports fail, so a
+deployment without pikepdf can fall back to the DOCX or HTML
+generators rather than crashing the API.
+"""
+from __future__ import annotations
+
+import io
+
+from decnet.canary.base import (
+    CanaryArtifact,
+    CanaryContext,
+    CanaryGenerator,
+    InstrumenterRejectedError,
+)
+
+
+# Page body as ``(text, font size in points)`` pairs, one per rendered
+# line.  Empty text yields a blank line.
+_BODY_LINES = (
+    ("Q3 Operations Review (DRAFT — DO NOT DISTRIBUTE)", 14),
+    ("", 12),
+    ("Forecast and remediation timeline below.", 11),
+    ("Numbers are preliminary, subject to revision.", 11),
+    ("", 12),
+    ("Region        Incidents     MTTR (h)", 11),
+    ("us-east       14            3.2", 11),
+    ("us-west       9             4.7", 11),
+    ("eu-central    22            2.1", 11),
+    ("", 12),
+    ("Internal contact: secops@internal", 11),
+)
+
+
+class HoneydocPdfGenerator(CanaryGenerator):
+    """Generator that synthesises a one-page PDF with a callback ``/OpenAction``."""
+
+    name = "honeydoc_pdf"
+
+    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
+        """Return the PDF artifact for *ctx*.
+
+        The callback URL is ``<ctx.http_base>/c/<ctx.callback_token>``
+        with any trailing slash on the base removed.  Raises
+        :class:`InstrumenterRejectedError` when pikepdf cannot be
+        imported.  ``path`` is left empty on the returned artifact.
+        """
+        # Imported lazily so this module still imports on deployments
+        # that do not have pikepdf installed.
+        try:
+            from pikepdf import Pdf, Name, Dictionary, String  # type: ignore[import-not-found]
+        except ImportError as exc:
+            message = (
+                "honeydoc_pdf requires pikepdf; install it (`pip install "
+                "pikepdf`) or pick honeydoc / honeydoc_docx instead."
+            )
+            raise InstrumenterRejectedError(message) from exc
+
+        url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
+
+        document = Pdf.new()
+        # Helvetica belongs to the 14 standard PDF fonts, so the font is
+        # referenced by name only and nothing is embedded.
+        helvetica = document.make_indirect(Dictionary(
+            Type=Name("/Font"),
+            Subtype=Name("/Type1"),
+            BaseFont=Name("/Helvetica"),
+        ))
+
+        # One text object for the whole page.  The origin is bottom-left
+        # on a 612 x 792 pt (US Letter) page: start at (72, 750) and move
+        # down 18 pt before every line except the first.
+        operators: list[str] = ["BT /F1 12 Tf 72 750 Td"]
+        for index, (text, point_size) in enumerate(_BODY_LINES):
+            if index:
+                operators.append("0 -18 Td")
+            operators.extend((
+                f"/F1 {point_size} Tf",
+                f"({_pdf_escape(text)}) Tj",
+            ))
+        operators.append("ET")
+        stream_bytes = "\n".join(operators).encode("latin-1")
+
+        stream_obj = document.make_stream(stream_bytes)
+
+        page = document.add_blank_page(page_size=(612, 792))
+        page[Name("/Resources")] = Dictionary(
+            Font=Dictionary(F1=helvetica),
+        )
+        page[Name("/Contents")] = stream_obj
+
+        # Catalog-level /OpenAction with a /URI action pointing at the
+        # callback.  Viewers that honour it resolve the URI when the
+        # document opens; many ask the user first, and that prompt is
+        # itself a tell.
+        open_action = Dictionary(
+            Type=Name("/Action"),
+            S=Name("/URI"),
+            URI=String(url),
+        )
+        document.Root[Name("/OpenAction")] = open_action
+
+        buffer = io.BytesIO()
+        document.save(buffer)
+        return CanaryArtifact(
+            path="",
+            content=buffer.getvalue(),
+            mode=0o644,
+            mtime_offset=-86400 * 21,
+            generator=self.name,
+            notes=[
+                "synthesised one-page PDF with realistic Q3 review body",
+                f"/OpenAction /URI -> {url}",
+            ],
+        )
+
+
+def _pdf_escape(s: str) -> str:
+    """Make *s* safe for a PDF literal string in a Latin-1 content stream.
+
+    PDF string literals are wrapped in ``( … )``; inner ``(``, ``)``,
+    and ``\\`` need backslash escapes.  Em-dashes are downgraded to
+    ``--``.  The caller encodes the content stream as Latin-1, so any
+    remaining character outside Latin-1 is replaced with ``?`` rather
+    than left to raise ``UnicodeEncodeError`` at encode time.  Exotic
+    glyphs are lost, but the realistic body sticks to ASCII anyway.
+    """
+    escaped = (
+        s.replace("\\", r"\\")
+         .replace("(", r"\(")
+         .replace(")", r"\)")
+         .replace("—", "--")
+    )
+    # Round-trip through Latin-1 with "replace" so the result is always
+    # encodable by the caller; Latin-1 characters pass through unchanged.
+    return escaped.encode("latin-1", "replace").decode("latin-1")
--- a/decnet/canary/paths.py
+++ b/decnet/canary/paths.py
@@ -25,7 +25,9 @@ _LINUX_DEFAULTS: dict[str, str] = {
    "env_file": "/home/{user}/.env",
    "ssh_key": "/home/{user}/.ssh/id_rsa",
    "aws_creds": "/home/{user}/.aws/credentials",
-    "honeydoc": "/home/{user}/Documents/quarterly_report.docx",
+    "honeydoc": "/home/{user}/Documents/quarterly_report.html",
+    "honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
+    "honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
 }

 _WINDOWS_DEFAULTS: dict[str, str] = {
@@ -33,7 +35,9 @@ _WINDOWS_DEFAULTS: dict[str, str] = {
    "env_file": "/home/{user}/Desktop/prod.env",
    "ssh_key": "/home/{user}/.ssh/id_rsa",  # OpenSSH on Windows uses the same path
    "aws_creds": "/home/{user}/.aws/credentials",
-    "honeydoc": "/home/{user}/Documents/quarterly_report.docx",
+    "honeydoc": "/home/{user}/Documents/quarterly_report.html",
+    "honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
+    "honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
 }