feat(canary): package scaffolding (base/factory/paths/storage) + tests

Mirrors the decnet.intel layout (base + factory + lazy concrete
imports). Defines:

- CanaryArtifact / CanaryContext dataclasses + the generator and
  instrumenter ABCs they share
- factory dispatch for generators (git_config/env_file/ssh_key/
  aws_creds/honeydoc) and instrumenters (docx/xlsx/pdf/html/image/
  plain/passthrough), plus pick_instrumenter_for_mime() for MIME-driven
  dispatch on operator uploads
- persona-aware default placement paths (Linux vs. Windows-shaped)
  and absolute-path validation that the API will use to validate
  operator-supplied placement_path values
- on-disk blob store: sha256-keyed two-level fan-out, idempotent
  writes, refcount-aware unlink (the DB row is the source of truth)

Also covers prior commits' tests (bus topics, models, repo CRUD)
under tests/canary/. 79 tests, all pass.
This commit is contained in:
2026-04-27 12:56:01 -04:00
parent 6a0d140e91
commit 8f19adecfe
12 changed files with 989 additions and 0 deletions

37
decnet/canary/__init__.py Normal file
View File

@@ -0,0 +1,37 @@
"""Canary tokens — decoy artifacts planted in decky filesystems.
Public surface is exported here so callers can ``from decnet.canary
import CanaryArtifact, get_generator, get_instrumenter`` without
knowing the submodule layout. Concrete generators / instrumenters
live under :mod:`decnet.canary.generators` and
:mod:`decnet.canary.instrumenters` respectively; the factory keeps
import-time cost down by deferring those imports until first use
(same pattern as :mod:`decnet.intel.factory`).
"""
from __future__ import annotations
from decnet.canary.base import (
CanaryArtifact,
CanaryContext,
CanaryGenerator,
CanaryInstrumenter,
)
from decnet.canary.factory import (
KNOWN_GENERATORS,
KNOWN_INSTRUMENTERS,
get_generator,
get_instrumenter,
pick_instrumenter_for_mime,
)
__all__ = [
"CanaryArtifact",
"CanaryContext",
"CanaryGenerator",
"CanaryInstrumenter",
"KNOWN_GENERATORS",
"KNOWN_INSTRUMENTERS",
"get_generator",
"get_instrumenter",
"pick_instrumenter_for_mime",
]

145
decnet/canary/base.py Normal file
View File

@@ -0,0 +1,145 @@
"""Canary generator / instrumenter ABCs and the artifact dataclass.
Two flavors of producer share the same return shape:
* :class:`CanaryGenerator` synthesises a fake artifact from scratch
(e.g. a plausible ``~/.aws/credentials`` block, a ``.git/config``
pointing at an attacker-bait remote URL). Operators don't supply
any input.
* :class:`CanaryInstrumenter` mutates an operator-uploaded blob to
embed the callback (HTTP slug + DNS host). The original blob bytes
are passed in; the instrumenter returns the mutated version.
Both return a :class:`CanaryArtifact` — the planter doesn't care
which path produced it. Same dataclass keeps the planter's
docker-exec injector trivial.
ABCs intentionally do not include I/O — generators and instrumenters
are pure functions of (slug, host, blob?). All filesystem work
happens in :mod:`decnet.canary.planter` and :mod:`decnet.canary.storage`.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class CanaryContext:
"""Inputs every generator/instrumenter needs to embed a working callback.
``callback_token`` is the unique slug; it appears verbatim in HTTP
URLs (``https://<host>/c/<callback_token>``) and as the leftmost
DNS label (``<callback_token>.canary.<dns_zone>``) so a single
slug resolves to a single :class:`CanaryToken` row regardless of
which path the attacker tripped.
``http_base`` and ``dns_zone`` come from the canary worker's
public-facing config (``DECNET_CANARY_HTTP_BASE``,
``DECNET_CANARY_DNS_ZONE``). When DNS isn't deployed,
``dns_zone`` is empty and instrumenters that only have a DNS
surface (e.g. an artifact whose only realistic embed point is a
hostname) raise.
"""
callback_token: str
http_base: str # e.g. "https://canary.example.test" — no trailing slash
dns_zone: str = "" # e.g. "canary.example.test"; "" disables DNS embeds
persona: str = "linux" # "linux" | "windows" — drives default username, path style
@dataclass
class CanaryArtifact:
"""Bytes-and-placement bundle produced by a generator/instrumenter."""
path: str
"""Absolute path inside the target container."""
content: bytes
"""Final bytes that hit the decky filesystem.
Always raw bytes — the planter base64-encodes for the wire so
binary blobs (DOCX/PNG/PDF) survive ``docker exec sh -c`` safely.
"""
mode: int = 0o600
"""Unix file mode. Defaults to ``0600`` because most realistic
canary placements (``~/.aws/credentials``, ``.env``, ``id_rsa``)
are operator-only. Honeydocs in user docs folders should pass
``0o644``.
"""
mtime_offset: int = 0
"""Seconds relative to *now* for the planted file's mtime.
Negative values backdate the file so it doesn't look like it
appeared the moment the decky was deployed. ``-86400 * 90`` (90
days ago) is a common choice for ``honeydoc`` artifacts; ``0``
means "stamp it now," which is fine for ``aws_creds``-like files
that would plausibly be touched recently.
"""
instrumenter: Optional[str] = None
"""Identifier of the instrumenter that produced this artifact (for
upload-driven tokens). Mirrored into ``CanaryToken.instrumenter``.
Mutually exclusive with :attr:`generator`.
"""
generator: Optional[str] = None
"""Identifier of the generator that produced this artifact (for
synthesised tokens). Mirrored into ``CanaryToken.generator``.
Mutually exclusive with :attr:`instrumenter`.
"""
notes: list[str] = field(default_factory=list)
"""Human-readable notes about the embedding (e.g. "DOCX: injected
1×1 remote image at relsId rId99"). Surfaced in the API
``preview`` response so the operator sees what we did before
planting. Never leaked to the attacker-facing surface.
"""
class CanaryGenerator(ABC):
"""Produces a fake artifact from scratch."""
name: str #: short tag — matches ``CanaryToken.generator``
@abstractmethod
def generate(self, ctx: CanaryContext) -> CanaryArtifact:
"""Synthesise the artifact.
MUST NOT do I/O. MUST be deterministic for the same
``(callback_token, http_base, dns_zone, persona)`` so re-seeding
from :attr:`CanaryToken.secret_seed` produces byte-identical
output and the planter is naturally idempotent.
"""
class CanaryInstrumenter(ABC):
"""Mutates an operator-uploaded blob to embed a callback."""
name: str #: short tag — matches ``CanaryToken.instrumenter``
#: MIME prefixes this instrumenter handles. The factory uses these
#: to dispatch by sniffed content-type. Sub-string match against
#: the prefix list (e.g. ``("application/pdf",)`` or
#: ``("text/",)``).
mime_prefixes: tuple[str, ...] = ()
@abstractmethod
def instrument(
self, blob: bytes, ctx: CanaryContext, *, target_path: str,
) -> CanaryArtifact:
"""Return the mutated bytes with the callback embedded.
MUST raise :class:`InstrumenterRejectedError` when the blob
can't be safely mutated (corrupt zip, encrypted PDF, etc.) so
the API can surface a 400 with the specific reason rather than
silently shipping the original bytes.
"""
class InstrumenterRejectedError(ValueError):
"""Raised when an instrumenter can't safely mutate the input."""

129
decnet/canary/factory.py Normal file
View File

@@ -0,0 +1,129 @@
"""Generator and instrumenter factories.
Same lazy-import pattern as :mod:`decnet.intel.factory` — concrete
implementations stay un-imported until first use so importing
:mod:`decnet.canary` from a CLI subcommand doesn't drag in
``pikepdf`` / ``python-docx`` / ``Pillow`` for callers that only
need the model layer.
"""
from __future__ import annotations
from typing import Tuple
from decnet.canary.base import CanaryGenerator, CanaryInstrumenter
KNOWN_GENERATORS: Tuple[str, ...] = (
"git_config",
"env_file",
"ssh_key",
"aws_creds",
"honeydoc",
)
KNOWN_INSTRUMENTERS: Tuple[str, ...] = (
"docx",
"xlsx",
"pdf",
"html",
"image",
"plain",
"passthrough",
)
def get_generator(name: str) -> CanaryGenerator:
"""Return the generator registered under ``name``.
Raises :class:`ValueError` for unknown names so a typo in the API
request surfaces as a 400 rather than silently producing nothing.
"""
if name == "git_config":
from decnet.canary.generators.git_config import GitConfigGenerator
return GitConfigGenerator()
if name == "env_file":
from decnet.canary.generators.env_file import EnvFileGenerator
return EnvFileGenerator()
if name == "ssh_key":
from decnet.canary.generators.ssh_key import SSHKeyGenerator
return SSHKeyGenerator()
if name == "aws_creds":
from decnet.canary.generators.aws_creds import AWSCredsGenerator
return AWSCredsGenerator()
if name == "honeydoc":
from decnet.canary.generators.honeydoc import HoneydocGenerator
return HoneydocGenerator()
raise ValueError(
f"Unknown canary generator: {name!r}. Known: {KNOWN_GENERATORS}"
)
def get_instrumenter(name: str) -> CanaryInstrumenter:
"""Return the instrumenter registered under ``name``."""
if name == "docx":
from decnet.canary.instrumenters.docx import DocxInstrumenter
return DocxInstrumenter()
if name == "xlsx":
from decnet.canary.instrumenters.xlsx import XlsxInstrumenter
return XlsxInstrumenter()
if name == "pdf":
from decnet.canary.instrumenters.pdf import PdfInstrumenter
return PdfInstrumenter()
if name == "html":
from decnet.canary.instrumenters.html import HtmlInstrumenter
return HtmlInstrumenter()
if name == "image":
from decnet.canary.instrumenters.image import ImageInstrumenter
return ImageInstrumenter()
if name == "plain":
from decnet.canary.instrumenters.plain import PlainInstrumenter
return PlainInstrumenter()
if name == "passthrough":
from decnet.canary.instrumenters.passthrough import PassthroughInstrumenter
return PassthroughInstrumenter()
raise ValueError(
f"Unknown canary instrumenter: {name!r}. Known: {KNOWN_INSTRUMENTERS}"
)
# MIME → instrumenter dispatch. Order matters: we walk the table
# top-to-bottom and the first prefix match wins, so put the more
# specific (DOCX/XLSX) before the generic (zip/octet-stream).
_MIME_DISPATCH: tuple[tuple[str, str], ...] = (
# Office Open XML — DOCX/XLSX share a zip structure but expose
# different inner trees, so dispatch by MIME alias rather than
# zip-poking.
("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx"),
("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "xlsx"),
("application/pdf", "pdf"),
("text/html", "html"),
("application/xhtml+xml", "html"),
("image/png", "image"),
("image/jpeg", "image"),
("image/gif", "image"),
# Plaintext catch-alls — config files, .env, .ini, .yaml, .json,
# source code. All handled by the same regex-substitution pass.
("text/", "plain"),
("application/json", "plain"),
("application/x-yaml", "plain"),
("application/yaml", "plain"),
("application/toml", "plain"),
)
def pick_instrumenter_for_mime(content_type: str) -> str:
"""Return the instrumenter name registered for a sniffed MIME.
Falls back to ``"passthrough"`` for anything we don't have an
embedder for (binary blobs we can't mutate safely — random
container images, archives, executables). ``passthrough`` only
supports DNS-callback tokens (the slug ends up in the filename or
an accompanying README), so the API surfaces that constraint to
the operator before they pick a kind.
"""
if not content_type:
return "passthrough"
lowered = content_type.lower()
for prefix, name in _MIME_DISPATCH:
if lowered.startswith(prefix):
return name
return "passthrough"

78
decnet/canary/paths.py Normal file
View File

@@ -0,0 +1,78 @@
"""Persona-aware path resolution for canary artifacts.
Linux-persona deckies use POSIX-shaped paths under ``/home/<user>``.
"Windows" personas (still Linux containers under the hood — see
:mod:`decnet.archetypes`) use Windows-shaped paths under
``/home/<user>/AppData/...`` so an attacker browsing the filesystem
through a planted RDP/SMB session sees the right shape.
The persona lookup is best-effort: callers pass the
:attr:`decnet.archetypes.Archetype.nmap_os` value (``"linux"`` or
``"windows"``); unknown personas fall through to ``"linux"``.
Operators can always override by passing an explicit
``placement_path`` when creating a token.
"""
from __future__ import annotations
DEFAULT_LINUX_USER = "admin"
DEFAULT_WINDOWS_USER = "Administrator"
# Canonical placements for the synthesizer-driven baseline tokens.
# Operators can override per-token via the API, but these are the
# defaults the deploy-time seed uses.
_LINUX_DEFAULTS: dict[str, str] = {
"git_config": "/home/{user}/.git/config",
"env_file": "/home/{user}/.env",
"ssh_key": "/home/{user}/.ssh/id_rsa",
"aws_creds": "/home/{user}/.aws/credentials",
"honeydoc": "/home/{user}/Documents/quarterly_report.docx",
}
_WINDOWS_DEFAULTS: dict[str, str] = {
"git_config": "/home/{user}/AppData/Local/Programs/Git/etc/gitconfig",
"env_file": "/home/{user}/Desktop/prod.env",
"ssh_key": "/home/{user}/.ssh/id_rsa", # OpenSSH on Windows uses the same path
"aws_creds": "/home/{user}/.aws/credentials",
"honeydoc": "/home/{user}/Documents/quarterly_report.docx",
}
def default_user(persona: str) -> str:
"""Return the conventional unprivileged username for a persona."""
return DEFAULT_WINDOWS_USER if persona == "windows" else DEFAULT_LINUX_USER
def default_path_for(generator: str, persona: str = "linux") -> str:
"""Resolve the default placement path for a synthesized token.
Returns an absolute container path with ``{user}`` already
expanded. Falls back to a sane Linux default for unknown
personas — better to plant *something* than fail the deploy hook.
"""
table = _WINDOWS_DEFAULTS if persona == "windows" else _LINUX_DEFAULTS
template = table.get(generator)
if not template:
# Unknown generator — fall back to a generic /tmp drop so the
# planter still has somewhere to write. The API rejects
# unknown generators upstream, so this branch is defensive.
return f"/tmp/{generator}.canary" # nosec B108 — placement inside attacker-facing decoy container, not host /tmp
return template.format(user=default_user(persona))
def normalize_placement(path: str) -> str:
"""Validate and normalize an operator-supplied placement path.
Forbids relative paths, NUL bytes, and shell metacharacters that
``docker exec sh -c`` can't safely round-trip. Returns the
sanitised path unchanged when valid; raises :class:`ValueError`
otherwise so the API can return a 400 with a clear message.
"""
if not path or not path.startswith("/"):
raise ValueError("placement_path must be absolute (start with '/')")
if "\x00" in path:
raise ValueError("placement_path may not contain NUL")
if "\n" in path or "\r" in path:
raise ValueError("placement_path may not contain newlines")
if "../" in path or path.endswith("/.."):
raise ValueError("placement_path may not contain '..' segments")
return path

89
decnet/canary/storage.py Normal file
View File

@@ -0,0 +1,89 @@
"""Filesystem store for operator-uploaded canary blobs.
Blobs live under ``/var/lib/decnet/canary/blobs/<sha256>`` (override
via ``DECNET_CANARY_BLOB_DIR``) and are deduplicated by content hash.
The DB table :class:`decnet.web.db.models.CanaryBlob` mirrors
metadata; the bytes are read on demand at instrumentation time, so
the API process never holds large operator uploads in memory longer
than the request itself.
Refcount-aware deletion is enforced at the DB layer (see
:meth:`decnet.web.db.repository.BaseRepository.delete_canary_blob`);
this module only provides write/read/unlink primitives keyed by
sha256.
"""
from __future__ import annotations
import hashlib
import os
from pathlib import Path
from typing import Tuple
def blob_dir() -> Path:
"""Return the on-disk root for canary blobs.
Honors ``DECNET_CANARY_BLOB_DIR`` so tests can point at a tmp
path. The directory is created lazily on first write.
"""
raw = os.environ.get("DECNET_CANARY_BLOB_DIR", "/var/lib/decnet/canary/blobs")
return Path(raw)
def _path_for(sha256: str) -> Path:
# Two-level fan-out (``ab/cd/abcd...``) keeps any one directory
# from accumulating thousands of entries on busy fleets. Same
# shape as Git's loose-object store.
if len(sha256) < 4:
raise ValueError("sha256 must be at least 4 chars")
root = blob_dir()
return root / sha256[:2] / sha256[2:4] / sha256
def write_blob(content: bytes) -> Tuple[str, Path, int]:
"""Persist ``content`` under its sha256 path.
Idempotent: if the target file already exists with the same
bytes, no rewrite happens. Returns ``(sha256, path,
size_bytes)``.
"""
sha = hashlib.sha256(content).hexdigest()
target = _path_for(sha)
target.parent.mkdir(parents=True, exist_ok=True)
if not target.exists():
# Atomic-ish: write to a temp sibling and rename. Avoids the
# half-written-file race a concurrent reader would otherwise
# see if we wrote in place.
tmp = target.with_suffix(target.suffix + ".part")
tmp.write_bytes(content)
os.replace(tmp, target)
return sha, target, len(content)
def read_blob(sha256: str) -> bytes:
"""Read the bytes for a stored blob.
Raises :class:`FileNotFoundError` when the on-disk row was unlinked
out of band (operator pruned ``/var/lib/decnet`` by hand) — the
caller (instrumenter dispatch) surfaces it as a 410-ish error so
the operator can re-upload.
"""
return _path_for(sha256).read_bytes()
def unlink_blob(sha256: str) -> bool:
"""Delete the on-disk bytes for ``sha256``.
Returns True if a file was removed, False if it was already gone.
The DB row deletion happens in
:meth:`SQLModelRepository.delete_canary_blob`; this function is
a best-effort companion called *after* the DB delete commits so
a crash between them leaves a recoverable orphan, never a
dangling DB reference.
"""
target = _path_for(sha256)
try:
target.unlink()
except FileNotFoundError:
return False
return True