merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

@@ -0,0 +1,147 @@
"""DOCX instrumenter — inject a remote image into the body.
DOCX files are zip archives carrying ``word/document.xml`` (the body)
and ``word/_rels/document.xml.rels`` (the relationship table that
maps ``rId`` references to URLs). We:
1. Add a new relationship of type ``image`` whose target is the
canary callback URL and ``TargetMode="External"``.
2. Add a tiny ``<w:drawing>`` element referencing that ``rId`` at
the end of ``word/document.xml`` (just before ``</w:body>``).
Word and LibreOffice both fetch external image relationships when
the document is opened (subject to the user's "trusted source"
toggle, which most enterprise environments disable in favour of
"warn but allow").
We use stdlib ``zipfile`` only — no python-docx dependency — because
the surface we touch is two small XML files and we don't need any of
the higher-level abstractions.
"""
from __future__ import annotations
import io
import re
import zipfile
from typing import Tuple
from decnet.canary.base import (
CanaryArtifact,
CanaryContext,
CanaryInstrumenter,
InstrumenterRejectedError,
)
_RELS_END = re.compile(rb"</Relationships\s*>", re.IGNORECASE)
_BODY_END = re.compile(rb"</w:body\s*>", re.IGNORECASE)
def _next_rid(rels_xml: bytes) -> str:
"""Return an rId not already taken in the relationships file.
Word's loader tolerates non-sequential ids, so we just pick one
well above the typical range to avoid collisions.
"""
used = set(m.group(1).decode() for m in re.finditer(rb'Id="(rId\d+)"', rels_xml))
for n in range(900, 9999):
rid = f"rId{n}"
if rid not in used:
return rid
raise InstrumenterRejectedError("DOCX has too many relationships to allocate a new rId")
def _inject_relationship(rels_xml: bytes, rid: str, url: str) -> bytes:
rel = (
f'<Relationship Id="{rid}" '
f'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" '
f'Target="{url}" TargetMode="External"/>'
).encode()
match = _RELS_END.search(rels_xml)
if not match:
raise InstrumenterRejectedError(
"DOCX rels file has no </Relationships>; refusing to mutate"
)
return rels_xml[:match.start()] + rel + rels_xml[match.start():]
def _drawing(rid: str) -> bytes:
# Minimal w:drawing tree referencing the external image at rid.
# Dimensions are 1 EMU x 1 EMU so the image is invisible; Word
# still fetches the resource on document load.
return (
'<w:p><w:r><w:drawing>'
'<wp:inline xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing">'
'<wp:extent cx="1" cy="1"/><wp:docPr id="1" name="canary"/>'
'<a:graphic xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">'
'<a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture">'
'<pic:pic xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">'
'<pic:nvPicPr><pic:cNvPr id="1" name="canary"/><pic:cNvPicPr/></pic:nvPicPr>'
'<pic:blipFill>'
f'<a:blip xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" r:link="{rid}"/>'
'<a:stretch><a:fillRect/></a:stretch>'
'</pic:blipFill>'
'<pic:spPr><a:xfrm><a:off x="0" y="0"/><a:ext cx="1" cy="1"/></a:xfrm>'
'<a:prstGeom prst="rect"><a:avLst/></a:prstGeom></pic:spPr>'
'</pic:pic></a:graphicData></a:graphic></wp:inline>'
'</w:drawing></w:r></w:p>'
).encode()
def _inject_drawing(document_xml: bytes, rid: str) -> bytes:
match = _BODY_END.search(document_xml)
if not match:
raise InstrumenterRejectedError("DOCX document.xml has no </w:body>")
drawing = _drawing(rid)
return document_xml[:match.start()] + drawing + document_xml[match.start():]
def _mutate(blob: bytes, url: str) -> Tuple[bytes, str]:
try:
with zipfile.ZipFile(io.BytesIO(blob), "r") as zf:
try:
rels = zf.read("word/_rels/document.xml.rels")
doc = zf.read("word/document.xml")
except KeyError as e:
raise InstrumenterRejectedError(
f"DOCX missing expected member: {e.args[0]!r}"
) from e
members = [(zi, zf.read(zi.filename)) for zi in zf.infolist()]
except zipfile.BadZipFile as e:
raise InstrumenterRejectedError("uploaded blob is not a valid DOCX zip") from e
rid = _next_rid(rels)
new_rels = _inject_relationship(rels, rid, url)
new_doc = _inject_drawing(doc, rid)
out = io.BytesIO()
with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zf_out:
for zi, data in members:
if zi.filename == "word/_rels/document.xml.rels":
zf_out.writestr(zi.filename, new_rels)
elif zi.filename == "word/document.xml":
zf_out.writestr(zi.filename, new_doc)
else:
zf_out.writestr(zi, data)
return out.getvalue(), rid
class DocxInstrumenter(CanaryInstrumenter):
name = "docx"
mime_prefixes = (
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
)
def instrument(
self, blob: bytes, ctx: CanaryContext, *, target_path: str,
) -> CanaryArtifact:
url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
mutated, rid = _mutate(blob, url)
return CanaryArtifact(
path=target_path,
content=mutated,
mode=0o644,
mtime_offset=-86400 * 14,
instrumenter=self.name,
notes=[f"injected external-image relationship {rid} -> {url}"],
)