feat(canary): honeydoc_docx + honeydoc_pdf generators

honeydoc previously emitted HTML only — operators picking 'Document'
out of the dropdown got a .html file dropped at /Documents/
quarterly_report.docx, which any attacker would clock the moment they
ran 'file' on it.

Two new generators that emit the real artifact format:

- honeydoc_docx: stdlib zipfile only. Builds a minimal but valid
  Office Open XML zip with the same Q3 review body as the HTML
  flavor and an external-image relationship pointing at the
  callback URL — same trick the operator-upload DOCX instrumenter
  uses, fetched on document open by Word and LibreOffice. Reuses
  _drawing() and _next_rid() from instrumenters/docx.py to keep
  the body/relationships shape identical between synthesised and
  instrumented files.

- honeydoc_pdf: pikepdf-backed. One-page PDF in the 14 base fonts
  (Helvetica, no font embedding), realistic body, /OpenAction /URI
  on the catalog so most viewers fire the callback on document
  open. Falls back to a clear error if pikepdf is missing so the
  operator can switch to honeydoc / honeydoc_docx.

Default placement paths now reflect each generator's true extension
(.html / .docx / .pdf) so the UI suggests something sensible. Both
generators surfaced in the New Token modal's generator dropdown.
This commit is contained in:
2026-04-27 13:44:20 -04:00
parent c17b9e01c8
commit 5ac8e0f91a
8 changed files with 312 additions and 5 deletions

View File

@@ -18,6 +18,8 @@ KNOWN_GENERATORS: Tuple[str, ...] = (
"ssh_key",
"aws_creds",
"honeydoc",
"honeydoc_docx",
"honeydoc_pdf",
)
KNOWN_INSTRUMENTERS: Tuple[str, ...] = (
@@ -52,6 +54,12 @@ def get_generator(name: str) -> CanaryGenerator:
if name == "honeydoc":
from decnet.canary.generators.honeydoc import HoneydocGenerator
return HoneydocGenerator()
if name == "honeydoc_docx":
from decnet.canary.generators.honeydoc_docx import HoneydocDocxGenerator
return HoneydocDocxGenerator()
if name == "honeydoc_pdf":
from decnet.canary.generators.honeydoc_pdf import HoneydocPdfGenerator
return HoneydocPdfGenerator()
raise ValueError(
f"Unknown canary generator: {name!r}. Known: {KNOWN_GENERATORS}"
)

View File

@@ -0,0 +1,133 @@
"""Real-DOCX honeydoc generator.
Synthesises a minimal but structurally valid DOCX from scratch via
stdlib :mod:`zipfile`, then uses the same external-image relationship
trick that powers :mod:`decnet.canary.instrumenters.docx` to embed
the callback URL. No python-docx dependency.
The output opens cleanly in Word / LibreOffice; both fetch the
external image relationship on document load.
"""
from __future__ import annotations
import io
import zipfile
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
from decnet.canary.instrumenters.docx import _drawing, _next_rid
_CONTENT_TYPES = (
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
'<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
'<Default Extension="xml" ContentType="application/xml"/>'
'<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
'<Override PartName="/word/document.xml" '
'ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>'
'</Types>'
).encode()
_PACKAGE_RELS = (
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
'<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
'<Relationship Id="rId1" '
'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" '
'Target="word/document.xml"/>'
'</Relationships>'
).encode()
_BODY_PARAGRAPHS = (
"Q3 Operations Review (DRAFT — DO NOT DISTRIBUTE)",
"",
"Forecast and remediation timeline below. Numbers are preliminary "
"and subject to revision before the all-hands.",
"",
"Region Incidents MTTR (h)",
"us-east 14 3.2",
"us-west 9 4.7",
"eu-central 22 2.1",
"",
"Internal contact: secops@internal",
)
def _document_xml(rid_with_drawing: str | None = None) -> bytes:
"""Build the body XML.
``rid_with_drawing`` is the rId of the external image relationship;
when set, we append the same ``<w:drawing>`` element that the DOCX
instrumenter inserts so the body references the external resource.
"""
paragraphs = []
for line in _BODY_PARAGRAPHS:
if line:
paragraphs.append(
"<w:p><w:r><w:t xml:space=\"preserve\">"
+ _xml_escape(line)
+ "</w:t></w:r></w:p>"
)
else:
paragraphs.append("<w:p/>")
body = "".join(paragraphs)
drawing = _drawing(rid_with_drawing).decode() if rid_with_drawing else ""
return (
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
'<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
f'<w:body>{body}{drawing}</w:body>'
'</w:document>'
).encode()
def _xml_escape(s: str) -> str:
return (
s.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
)
def _document_rels(rid: str, url: str) -> bytes:
return (
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
'<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
f'<Relationship Id="{rid}" '
f'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" '
f'Target="{url}" TargetMode="External"/>'
'</Relationships>'
).encode()
class HoneydocDocxGenerator(CanaryGenerator):
name = "honeydoc_docx"
def generate(self, ctx: CanaryContext) -> CanaryArtifact:
url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
# Pick a stable rId — there's only one relationship in the
# synthesised file, so any unused id works. Reuse the
# instrumenter's allocator against the bare relationships
# skeleton for parity with operator-uploaded DOCX flow.
skeleton = (
b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
b'<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
b'</Relationships>'
)
rid = _next_rid(skeleton)
out = io.BytesIO()
with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zf:
zf.writestr("[Content_Types].xml", _CONTENT_TYPES)
zf.writestr("_rels/.rels", _PACKAGE_RELS)
zf.writestr("word/document.xml", _document_xml(rid))
zf.writestr("word/_rels/document.xml.rels", _document_rels(rid, url))
return CanaryArtifact(
path="",
content=out.getvalue(),
mode=0o644,
mtime_offset=-86400 * 21,
generator=self.name,
notes=[
"synthesised DOCX with realistic Q3 review body",
f"external-image relationship {rid} -> {url}",
],
)

View File

@@ -0,0 +1,127 @@
"""Real-PDF honeydoc generator (uses :mod:`pikepdf`).
Builds a one-page PDF with the same Q3-review body as the HTML/DOCX
flavors and installs an ``/OpenAction`` ``/URI`` action on the
catalog so most viewers fire the callback the moment the document
opens.
Pikepdf is now a hard dependency for this generator (the operator
installed it explicitly so we can use it). We still surface a
clear :class:`InstrumenterRejectedError` when imports fail, so a
deployment without pikepdf can fall back to the DOCX or HTML
generators rather than crashing the API.
"""
from __future__ import annotations
import io
from decnet.canary.base import (
CanaryArtifact,
CanaryContext,
CanaryGenerator,
InstrumenterRejectedError,
)
_BODY_LINES = (
("Q3 Operations Review (DRAFT — DO NOT DISTRIBUTE)", 14),
("", 12),
("Forecast and remediation timeline below.", 11),
("Numbers are preliminary, subject to revision.", 11),
("", 12),
("Region Incidents MTTR (h)", 11),
("us-east 14 3.2", 11),
("us-west 9 4.7", 11),
("eu-central 22 2.1", 11),
("", 12),
("Internal contact: secops@internal", 11),
)
class HoneydocPdfGenerator(CanaryGenerator):
name = "honeydoc_pdf"
def generate(self, ctx: CanaryContext) -> CanaryArtifact:
try:
from pikepdf import Pdf, Name, Dictionary, String # type: ignore[import-not-found]
except ImportError as e:
raise InstrumenterRejectedError(
"honeydoc_pdf requires pikepdf; install it (`pip install "
"pikepdf`) or pick honeydoc / honeydoc_docx instead."
) from e
url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
pdf = Pdf.new()
# Helvetica is one of the 14 PDF base fonts — every viewer ships
# it, so no font embedding is required.
font = pdf.make_indirect(Dictionary(
Type=Name("/Font"),
Subtype=Name("/Type1"),
BaseFont=Name("/Helvetica"),
))
# Build a single content stream that writes each body line at a
# decreasing y-coordinate. PDF coordinates start at the bottom-
# left (US Letter = 612 x 792 points); we lay out lines roughly
# 18 points apart starting near the top.
ops: list[str] = ["BT /F1 12 Tf 72 750 Td"]
first = True
for line, size in _BODY_LINES:
if not first:
ops.append("0 -18 Td")
first = False
ops.append(f"/F1 {size} Tf")
ops.append(f"({_pdf_escape(line)}) Tj")
ops.append("ET")
content_bytes = "\n".join(ops).encode("latin-1")
content_stream = pdf.make_stream(content_bytes)
page = pdf.add_blank_page(page_size=(612, 792))
page[Name("/Resources")] = Dictionary(
Font=Dictionary(F1=font),
)
page[Name("/Contents")] = content_stream
# OpenAction fires the URI when the file is opened in Acrobat,
# Preview, the browser PDF viewer, etc. Most viewers prompt
# before fetching; that prompt itself is a tell, and an
# auto-allow viewer fetches silently.
pdf.Root[Name("/OpenAction")] = Dictionary(
Type=Name("/Action"),
S=Name("/URI"),
URI=String(url),
)
out = io.BytesIO()
pdf.save(out)
return CanaryArtifact(
path="",
content=out.getvalue(),
mode=0o644,
mtime_offset=-86400 * 21,
generator=self.name,
notes=[
"synthesised one-page PDF with realistic Q3 review body",
f"/OpenAction /URI -> {url}",
],
)
def _pdf_escape(s: str) -> str:
"""Escape parens and backslashes for PDF literal-string syntax.
PDF string literals are wrapped in ``( … )``; inner ``(``, ``)``,
and ``\\`` need backslash escapes. Everything else (including
UTF-8 multibyte sequences) round-trips fine because Helvetica's
encoding is WinAnsi-ish — we'll lose exotic glyphs but the
realistic body sticks to ASCII anyway. Em-dashes are downgraded
to ``--`` to avoid the WinAnsi gap.
"""
return (
s.replace("\\", r"\\")
.replace("(", r"\(")
.replace(")", r"\)")
.replace("", "--")
)

View File

@@ -25,7 +25,9 @@ _LINUX_DEFAULTS: dict[str, str] = {
"env_file": "/home/{user}/.env",
"ssh_key": "/home/{user}/.ssh/id_rsa",
"aws_creds": "/home/{user}/.aws/credentials",
"honeydoc": "/home/{user}/Documents/quarterly_report.docx",
"honeydoc": "/home/{user}/Documents/quarterly_report.html",
"honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
"honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
}
_WINDOWS_DEFAULTS: dict[str, str] = {
@@ -33,7 +35,9 @@ _WINDOWS_DEFAULTS: dict[str, str] = {
"env_file": "/home/{user}/Desktop/prod.env",
"ssh_key": "/home/{user}/.ssh/id_rsa", # OpenSSH on Windows uses the same path
"aws_creds": "/home/{user}/.aws/credentials",
"honeydoc": "/home/{user}/Documents/quarterly_report.docx",
"honeydoc": "/home/{user}/Documents/quarterly_report.html",
"honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
"honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
}