feat(canary): honeydoc_docx + honeydoc_pdf generators

honeydoc previously emitted HTML only — operators picking 'Document'
out of the dropdown got a .html file dropped at /Documents/
quarterly_report.docx, which any attacker would clock the moment they
ran 'file' on it.

This commit adds two new generators that emit the real artifact format:

- honeydoc_docx: stdlib zipfile only. Builds a minimal but valid
  Office Open XML zip with the same Q3 review body as the HTML
  flavor and an external-image relationship pointing at the
  callback URL — same trick the operator-upload DOCX instrumenter
  uses, fetched on document open by Word and LibreOffice. Reuses
  _drawing() and _next_rid() from instrumenters/docx.py to keep
  the body/relationships shape identical between synthesised and
  instrumented files.

- honeydoc_pdf: pikepdf-backed. One-page PDF in the 14 base fonts
  (Helvetica, no font embedding), realistic body, /OpenAction /URI
  on the catalog so most viewers fire the callback on document
  open. Falls back to a clear error if pikepdf is missing so the
  operator can switch to honeydoc / honeydoc_docx.

Default placement paths now reflect each generator's true extension
(.html / .docx / .pdf) so the UI suggests something sensible. Both
generators are now surfaced in the New Token modal's generator dropdown.
This commit is contained in:
2026-04-27 13:44:20 -04:00
parent c17b9e01c8
commit 5ac8e0f91a
8 changed files with 312 additions and 5 deletions

View File

@@ -18,6 +18,8 @@ KNOWN_GENERATORS: Tuple[str, ...] = (
"ssh_key",
"aws_creds",
"honeydoc",
"honeydoc_docx",
"honeydoc_pdf",
)
KNOWN_INSTRUMENTERS: Tuple[str, ...] = (
@@ -52,6 +54,12 @@ def get_generator(name: str) -> CanaryGenerator:
if name == "honeydoc":
from decnet.canary.generators.honeydoc import HoneydocGenerator
return HoneydocGenerator()
if name == "honeydoc_docx":
from decnet.canary.generators.honeydoc_docx import HoneydocDocxGenerator
return HoneydocDocxGenerator()
if name == "honeydoc_pdf":
from decnet.canary.generators.honeydoc_pdf import HoneydocPdfGenerator
return HoneydocPdfGenerator()
raise ValueError(
f"Unknown canary generator: {name!r}. Known: {KNOWN_GENERATORS}"
)

View File

@@ -0,0 +1,133 @@
"""Real-DOCX honeydoc generator.
Synthesises a minimal but structurally valid DOCX from scratch via
stdlib :mod:`zipfile`, then uses the same external-image relationship
trick that powers :mod:`decnet.canary.instrumenters.docx` to embed
the callback URL. No python-docx dependency.
The output opens cleanly in Word / LibreOffice; both fetch the
external image relationship on document load.
"""
from __future__ import annotations
import io
import zipfile
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
from decnet.canary.instrumenters.docx import _drawing, _next_rid
# Bytes of the package's ``[Content_Types].xml`` part.  Declares default
# content types for the ``xml`` and ``rels`` extensions and overrides
# ``/word/document.xml`` as the main WordprocessingML document part.
# Encoded once at import time so it can be written into the zip as-is.
_CONTENT_TYPES = (
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
'<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
'<Default Extension="xml" ContentType="application/xml"/>'
'<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
'<Override PartName="/word/document.xml" '
'ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>'
'</Types>'
).encode()
# Bytes of the package-level relationships part.  Holds a single
# ``officeDocument`` relationship (``rId1``) whose target is
# ``word/document.xml``, i.e. it tells a reader where the main body lives.
_PACKAGE_RELS = (
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
'<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
'<Relationship Id="rId1" '
'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" '
'Target="word/document.xml"/>'
'</Relationships>'
).encode()
# Decoy body text for the synthesised document, one entry per paragraph.
# Empty strings act as blank spacer paragraphs.  Adjacent string literals
# (the "Forecast ..." entry) are concatenated into a single paragraph.
_BODY_PARAGRAPHS = (
"Q3 Operations Review (DRAFT — DO NOT DISTRIBUTE)",
"",
"Forecast and remediation timeline below. Numbers are preliminary "
"and subject to revision before the all-hands.",
"",
"Region Incidents MTTR (h)",
"us-east 14 3.2",
"us-west 9 4.7",
"eu-central 22 2.1",
"",
"Internal contact: secops@internal",
)
def _document_xml(rid_with_drawing: str | None = None) -> bytes:
    """Render the ``word/document.xml`` part of the synthesised DOCX.

    Each entry of ``_BODY_PARAGRAPHS`` becomes one ``<w:p>`` element; blank
    entries are emitted as empty ``<w:p/>`` spacers.  When
    ``rid_with_drawing`` is given, the markup produced by the DOCX
    instrumenter's ``_drawing()`` for that relationship id is appended
    after the last paragraph so the body references the external image.

    Returns the encoded XML document.
    """
    rendered = "".join(
        f'<w:p><w:r><w:t xml:space="preserve">{_xml_escape(text)}</w:t></w:r></w:p>'
        if text
        else "<w:p/>"
        for text in _BODY_PARAGRAPHS
    )

    # Only call into the instrumenter helper when a relationship id exists.
    if rid_with_drawing:
        anchor = _drawing(rid_with_drawing).decode()
    else:
        anchor = ""

    prolog = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
    root_open = (
        '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
    )
    xml = f"{prolog}{root_open}<w:body>{rendered}{anchor}</w:body></w:document>"
    return xml.encode()
def _xml_escape(s: str) -> str:
return (
s.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
)
def _document_rels(rid: str, url: str) -> bytes:
return (
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
'<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
f'<Relationship Id="{rid}" '
f'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" '
f'Target="{url}" TargetMode="External"/>'
'</Relationships>'
).encode()
class HoneydocDocxGenerator(CanaryGenerator):
    """Canary generator that synthesises a DOCX carrying a callback URL."""

    name = "honeydoc_docx"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build the DOCX bytes for *ctx* and wrap them in an artifact.

        The callback URL is ``<http_base>/c/<callback_token>`` (with any
        trailing slash on ``http_base`` removed).  It is stored as the
        target of an external-image relationship, and the body XML
        references that relationship via the instrumenter's drawing
        markup.  The artifact's ``path`` is left empty here.
        """
        base = ctx.http_base.rstrip("/")
        callback_url = f"{base}/c/{ctx.callback_token}"

        # The synthesised file has a single document-level relationship,
        # so any unused id would do.  Running the instrumenter's allocator
        # over an empty relationships skeleton keeps id selection in line
        # with the operator-uploaded DOCX flow.
        empty_rels = (
            b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
            b'<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            b'</Relationships>'
        )
        image_rid = _next_rid(empty_rels)

        # Zip members in the order they are written to the archive.
        members = (
            ("[Content_Types].xml", _CONTENT_TYPES),
            ("_rels/.rels", _PACKAGE_RELS),
            ("word/document.xml", _document_xml(image_rid)),
            (
                "word/_rels/document.xml.rels",
                _document_rels(image_rid, callback_url),
            ),
        )
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
            for member_name, payload in members:
                archive.writestr(member_name, payload)

        return CanaryArtifact(
            path="",
            content=buffer.getvalue(),
            mode=0o644,
            # 21 * 86400 — presumably seconds, i.e. backdated three weeks;
            # confirm against how CanaryArtifact consumes mtime_offset.
            mtime_offset=-86400 * 21,
            generator=self.name,
            notes=[
                "synthesised DOCX with realistic Q3 review body",
                f"external-image relationship {image_rid} -> {callback_url}",
            ],
        )

View File

@@ -0,0 +1,127 @@
"""Real-PDF honeydoc generator (uses :mod:`pikepdf`).
Builds a one-page PDF with the same Q3-review body as the HTML/DOCX
flavors and installs an ``/OpenAction`` ``/URI`` action on the
catalog so most viewers fire the callback the moment the document
opens.
Pikepdf is now a hard dependency for this generator (the operator
installed it explicitly so we can use it). We still surface a
clear :class:`InstrumenterRejectedError` when imports fail, so a
deployment without pikepdf can fall back to the DOCX or HTML
generators rather than crashing the API.
"""
from __future__ import annotations
import io
from decnet.canary.base import (
CanaryArtifact,
CanaryContext,
CanaryGenerator,
InstrumenterRejectedError,
)
# Decoy page text as ``(text, font_size)`` pairs, one per rendered line.
# Empty strings produce a blank line.  The title contains an em dash,
# which is outside Latin-1; it has to be downgraded before the content
# stream is encoded.
_BODY_LINES = (
("Q3 Operations Review (DRAFT — DO NOT DISTRIBUTE)", 14),
("", 12),
("Forecast and remediation timeline below.", 11),
("Numbers are preliminary, subject to revision.", 11),
("", 12),
("Region Incidents MTTR (h)", 11),
("us-east 14 3.2", 11),
("us-west 9 4.7", 11),
("eu-central 22 2.1", 11),
("", 12),
("Internal contact: secops@internal", 11),
)
class HoneydocPdfGenerator(CanaryGenerator):
    """Canary generator that emits a one-page PDF with a URI open action."""

    name = "honeydoc_pdf"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build the PDF bytes for *ctx* and wrap them in an artifact.

        The page shows the ``_BODY_LINES`` text in Helvetica, and the
        document catalog gets an ``/OpenAction`` of type ``/URI`` pointing
        at ``<http_base>/c/<callback_token>``.

        Raises:
            InstrumenterRejectedError: if :mod:`pikepdf` cannot be imported.
        """
        # Imported lazily so a deployment without pikepdf only fails when
        # this particular generator is requested.
        try:
            from pikepdf import Pdf, Name, Dictionary, String  # type: ignore[import-not-found]
        except ImportError as e:
            message = (
                "honeydoc_pdf requires pikepdf; install it (`pip install "
                "pikepdf`) or pick honeydoc / honeydoc_docx instead."
            )
            raise InstrumenterRejectedError(message) from e

        base = ctx.http_base.rstrip("/")
        callback_url = f"{base}/c/{ctx.callback_token}"

        document = Pdf.new()

        # Helvetica is one of the 14 standard PDF fonts, so the font
        # dictionary can reference it by name without embedding a program.
        helvetica = document.make_indirect(
            Dictionary(
                Type=Name("/Font"),
                Subtype=Name("/Type1"),
                BaseFont=Name("/Helvetica"),
            )
        )

        # One text object for the whole page.  PDF user space has its
        # origin at the bottom-left (US Letter = 612 x 792 points): start
        # near the top-left margin and step 18 points down before every
        # line except the first.
        operators: list[str] = ["BT /F1 12 Tf 72 750 Td"]
        for index, (text, point_size) in enumerate(_BODY_LINES):
            if index:
                operators.append("0 -18 Td")
            operators.extend(
                (f"/F1 {point_size} Tf", f"({_pdf_escape(text)}) Tj")
            )
        operators.append("ET")
        stream_data = "\n".join(operators).encode("latin-1")
        contents = document.make_stream(stream_data)

        page = document.add_blank_page(page_size=(612, 792))
        page[Name("/Resources")] = Dictionary(Font=Dictionary(F1=helvetica))
        page[Name("/Contents")] = contents

        # The catalog-level open action asks the viewer to resolve the
        # URI as soon as the file is opened.  Viewers that prompt first
        # still reveal the interaction; viewers that auto-allow fetch
        # silently.
        open_action = Dictionary(
            Type=Name("/Action"),
            S=Name("/URI"),
            URI=String(callback_url),
        )
        document.Root[Name("/OpenAction")] = open_action

        buffer = io.BytesIO()
        document.save(buffer)
        return CanaryArtifact(
            path="",
            content=buffer.getvalue(),
            mode=0o644,
            mtime_offset=-86400 * 21,
            generator=self.name,
            notes=[
                "synthesised one-page PDF with realistic Q3 review body",
                f"/OpenAction /URI -> {callback_url}",
            ],
        )
def _pdf_escape(s: str) -> str:
"""Escape parens and backslashes for PDF literal-string syntax.
PDF string literals are wrapped in ``( … )``; inner ``(``, ``)``,
and ``\\`` need backslash escapes. Everything else (including
UTF-8 multibyte sequences) round-trips fine because Helvetica's
encoding is WinAnsi-ish — we'll lose exotic glyphs but the
realistic body sticks to ASCII anyway. Em-dashes are downgraded
to ``--`` to avoid the WinAnsi gap.
"""
return (
s.replace("\\", r"\\")
.replace("(", r"\(")
.replace(")", r"\)")
.replace("", "--")
)

View File

@@ -25,7 +25,9 @@ _LINUX_DEFAULTS: dict[str, str] = {
"env_file": "/home/{user}/.env",
"ssh_key": "/home/{user}/.ssh/id_rsa",
"aws_creds": "/home/{user}/.aws/credentials",
"honeydoc": "/home/{user}/Documents/quarterly_report.docx",
"honeydoc": "/home/{user}/Documents/quarterly_report.html",
"honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
"honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
}
_WINDOWS_DEFAULTS: dict[str, str] = {
@@ -33,7 +35,9 @@ _WINDOWS_DEFAULTS: dict[str, str] = {
"env_file": "/home/{user}/Desktop/prod.env",
"ssh_key": "/home/{user}/.ssh/id_rsa", # OpenSSH on Windows uses the same path
"aws_creds": "/home/{user}/.aws/credentials",
"honeydoc": "/home/{user}/Documents/quarterly_report.docx",
"honeydoc": "/home/{user}/Documents/quarterly_report.html",
"honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
"honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
}

View File

@@ -20,7 +20,8 @@ interface BlobRow {
}
const KNOWN_GENERATORS = [
'git_config', 'env_file', 'ssh_key', 'aws_creds', 'honeydoc',
'git_config', 'env_file', 'ssh_key', 'aws_creds',
'honeydoc', 'honeydoc_docx', 'honeydoc_pdf',
] as const;
type GeneratorName = typeof KNOWN_GENERATORS[number];

View File

@@ -59,7 +59,8 @@ def test_known_lists_are_stable() -> None:
# If anyone adds/removes from the dispatch tables, the test
# surfaces it. Keeps the schema-of-record in one place.
assert KNOWN_GENERATORS == (
"git_config", "env_file", "ssh_key", "aws_creds", "honeydoc",
"git_config", "env_file", "ssh_key", "aws_creds",
"honeydoc", "honeydoc_docx", "honeydoc_pdf",
)
assert KNOWN_INSTRUMENTERS == (
"docx", "xlsx", "pdf", "html", "image", "plain", "passthrough",

View File

@@ -90,6 +90,37 @@ def test_honeydoc_html_is_valid_ish_html() -> None:
assert "width=\"1\" height=\"1\"" in body
def test_honeydoc_docx_produces_valid_zip_with_callback() -> None:
    """The DOCX generator emits a zip with the OOXML parts and callback."""
    import io
    import zipfile

    artifact = get_generator("honeydoc_docx").generate(
        _ctx(callback_token="slugDX")
    )
    assert artifact.content[:4] == b"PK\x03\x04"  # zip magic

    required_parts = {
        "[Content_Types].xml",
        "_rels/.rels",
        "word/document.xml",
        "word/_rels/document.xml.rels",
    }
    with zipfile.ZipFile(io.BytesIO(artifact.content), "r") as archive:
        assert required_parts <= set(archive.namelist())

        # The relationships part must carry the callback as an external target.
        relationships = archive.read("word/_rels/document.xml.rels").decode()
        assert "https://canary.example.test/c/slugDX" in relationships
        assert "TargetMode=\"External\"" in relationships

        # The body must hold the decoy text and reference the drawing.
        document = archive.read("word/document.xml").decode()
        assert "Q3 Operations Review" in document
        assert "<w:drawing>" in document
def test_honeydoc_pdf_produces_valid_pdf_with_openaction() -> None:
    """The PDF generator emits a PDF whose open action is the callback URI."""
    # Skip rather than fail when pikepdf is not installed.
    pikepdf = pytest.importorskip("pikepdf")
    import io

    artifact = get_generator("honeydoc_pdf").generate(
        _ctx(callback_token="slugPDF")
    )
    assert artifact.content[:5] == b"%PDF-"

    # Parse the output again and confirm the OpenAction URI round-trips.
    with pikepdf.open(io.BytesIO(artifact.content)) as reopened:
        open_action = reopened.Root["/OpenAction"]
        assert str(open_action["/S"]) == "/URI"
        expected_uri = "https://canary.example.test/c/slugPDF"
        assert str(open_action["/URI"]) == expected_uri
def test_git_config_remote_url_shape() -> None:
g = get_generator("git_config")
art = g.generate(_ctx(callback_token="slug42"))

View File

@@ -28,7 +28,9 @@ def test_default_user_dispatch() -> None:
("env_file", "windows", "/home/Administrator/Desktop/prod.env"),
("git_config", "linux", "/home/admin/.git/config"),
("ssh_key", "linux", "/home/admin/.ssh/id_rsa"),
("honeydoc", "linux", "/home/admin/Documents/quarterly_report.docx"),
("honeydoc", "linux", "/home/admin/Documents/quarterly_report.html"),
("honeydoc_docx", "linux", "/home/admin/Documents/quarterly_report.docx"),
("honeydoc_pdf", "linux", "/home/admin/Documents/quarterly_report.pdf"),
],
)
def test_default_path_for_known_generators(