"""DOCX instrumenter — inject a remote image into the body. DOCX files are zip archives carrying ``word/document.xml`` (the body) and ``word/_rels/document.xml.rels`` (the relationship table that maps ``rId`` references to URLs). We: 1. Add a new relationship of type ``image`` whose target is the canary callback URL and ``TargetMode="External"``. 2. Add a tiny ```` element referencing that ``rId`` at the end of ``word/document.xml`` (just before ````). Word and LibreOffice both fetch external image relationships when the document is opened (subject to the user's "trusted source" toggle, which most enterprise environments disable in favour of "warn but allow"). We use stdlib ``zipfile`` only — no python-docx dependency — because the surface we touch is two small XML files and we don't need any of the higher-level abstractions. """ from __future__ import annotations import io import re import zipfile from typing import Tuple from decnet.canary.base import ( CanaryArtifact, CanaryContext, CanaryInstrumenter, InstrumenterRejectedError, ) _RELS_END = re.compile(rb"", re.IGNORECASE) _BODY_END = re.compile(rb"", re.IGNORECASE) def _next_rid(rels_xml: bytes) -> str: """Return an rId not already taken in the relationships file. Word's loader tolerates non-sequential ids, so we just pick one well above the typical range to avoid collisions. """ used = set(m.group(1).decode() for m in re.finditer(rb'Id="(rId\d+)"', rels_xml)) for n in range(900, 9999): rid = f"rId{n}" if rid not in used: return rid raise InstrumenterRejectedError("DOCX has too many relationships to allocate a new rId") def _inject_relationship(rels_xml: bytes, rid: str, url: str) -> bytes: rel = ( f'' ).encode() match = _RELS_END.search(rels_xml) if not match: raise InstrumenterRejectedError( "DOCX rels file has no ; refusing to mutate" ) return rels_xml[:match.start()] + rel + rels_xml[match.start():] def _drawing(rid: str) -> bytes: # Minimal w:drawing tree referencing the external image at rid. # Dimensions are 1 EMU x 1 EMU so the image is invisible; Word # still fetches the resource on document load. return ( '' '' '' '' '' '' '' '' f'' '' '' '' '' '' '' ).encode() def _inject_drawing(document_xml: bytes, rid: str) -> bytes: match = _BODY_END.search(document_xml) if not match: raise InstrumenterRejectedError("DOCX document.xml has no ") drawing = _drawing(rid) return document_xml[:match.start()] + drawing + document_xml[match.start():] def _mutate(blob: bytes, url: str) -> Tuple[bytes, str]: try: with zipfile.ZipFile(io.BytesIO(blob), "r") as zf: try: rels = zf.read("word/_rels/document.xml.rels") doc = zf.read("word/document.xml") except KeyError as e: raise InstrumenterRejectedError( f"DOCX missing expected member: {e.args[0]!r}" ) from e members = [(zi, zf.read(zi.filename)) for zi in zf.infolist()] except zipfile.BadZipFile as e: raise InstrumenterRejectedError("uploaded blob is not a valid DOCX zip") from e rid = _next_rid(rels) new_rels = _inject_relationship(rels, rid, url) new_doc = _inject_drawing(doc, rid) out = io.BytesIO() with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zf_out: for zi, data in members: if zi.filename == "word/_rels/document.xml.rels": zf_out.writestr(zi.filename, new_rels) elif zi.filename == "word/document.xml": zf_out.writestr(zi.filename, new_doc) else: zf_out.writestr(zi, data) return out.getvalue(), rid class DocxInstrumenter(CanaryInstrumenter): name = "docx" mime_prefixes = ( "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ) def instrument( self, blob: bytes, ctx: CanaryContext, *, target_path: str, ) -> CanaryArtifact: url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}" mutated, rid = _mutate(blob, url) return CanaryArtifact( path=target_path, content=mutated, mode=0o644, mtime_offset=-86400 * 14, instrumenter=self.name, notes=[f"injected external-image relationship {rid} -> {url}"], )