feat(realism): canary cultivator on the realism contract

Stage 7 — final stage of the realism migration. Canary plants are
now scheduled by the same realism planner that handles inert content,
keeping the orchestrator as the single decision point and avoiding
duplicate diurnal / persona / rate-limit logic in the canary
subsystem.

New surface:

- decnet/canary/cultivator.py: cultivate(plan, repo) builds a
  CanaryContext, calls the right generator (canary_aws_creds ->
  aws_creds, canary_mysql_dump -> mysql_dump, …), persists the
  canary_tokens row before plant so the canary worker can attribute
  callbacks even on plant-time previews. Resolves canary placements
  to credible operator paths (~/.aws/credentials, ~/.ssh/id_rsa,
  /var/backups/db_backup.sql).
- realism/planner.py adds 8 canary content_classes uniformly weighted
  inside a 3% probability gate. Hard-capped: each tick at most one
  canary; create branch falls through to inert otherwise.
- scheduler.pick_file dispatches canary content_class to the
  cultivator; FileAction grows an optional content_bytes field so
  binary canary artifacts (DOCX/PDF/honeydoc) survive the wire
  intact instead of being utf-8 round-tripped.
- SSHDriver._run_file uses content_bytes when set, falls back to
  encoding the str content otherwise.

Stealth (per feedback_stealth.md): cultivator does not introduce
any DECNET literal; the underlying generators are already
stealth-clean and the test suite asserts the contract holds.

Tests cover round-tripping every canary class through the cultivator,
verifying placement-path conventions, persona-login normalisation
("John Smith" -> /home/johnsmith/.aws/credentials), and the
no-DECNET-leak invariant.
This commit is contained in:
2026-04-27 16:47:59 -04:00
parent 4e436da569
commit a07fb3fe08
6 changed files with 392 additions and 10 deletions

172
decnet/canary/cultivator.py Normal file
View File

@@ -0,0 +1,172 @@
"""Realism contract adapter for canary generators.
Stage 7 of the realism migration. The orchestrator's planner picks a
``canary_*`` :class:`~decnet.realism.taxonomy.ContentClass` ~3% of
the time on file ticks; this module turns that pick into a
:class:`~decnet.canary.base.CanaryArtifact` (bytes the SSH driver
plants) plus a persisted :class:`~decnet.web.db.models.CanaryToken`
row so the canary worker recognises the slug when an attacker trips
it.
What this is NOT: it doesn't pick *when* canaries fire — that's the
realism planner's job. It doesn't decide *where* on the filesystem
the canary lands beyond what realism naming + persona conventions
already produce. It's a thin bytes-and-row factory bolted onto the
realism contract.
Stealth (per ``feedback_stealth.md``): we never leak the
``DECNET`` literal into anything that survives to the planted file.
The underlying generators are already stealth-clean; this wrapper
must not undo that.
"""
from __future__ import annotations
import os
import secrets as _secrets
from datetime import datetime, timezone
from typing import Any, Optional
from decnet.canary.base import CanaryArtifact, CanaryContext
from decnet.canary.factory import get_generator
from decnet.logging import get_logger
from decnet.realism.taxonomy import ContentClass, Plan
log = get_logger("canary.cultivator")
# realism content_class → canary generator name. Mirrors
# :data:`decnet.canary.factory.KNOWN_GENERATORS`.
# The string values are what :func:`cultivate` passes to
# ``get_generator`` and records in the token row's ``generator``
# field. A canary class that is missing from this table makes
# :func:`cultivate` raise ``KeyError``.
_CLASS_TO_GENERATOR: dict[ContentClass, str] = {
    ContentClass.CANARY_AWS_CREDS: "aws_creds",
    ContentClass.CANARY_ENV_FILE: "env_file",
    ContentClass.CANARY_GIT_CONFIG: "git_config",
    ContentClass.CANARY_SSH_KEY: "ssh_key",
    ContentClass.CANARY_HONEYDOC: "honeydoc",
    ContentClass.CANARY_HONEYDOC_DOCX: "honeydoc_docx",
    ContentClass.CANARY_HONEYDOC_PDF: "honeydoc_pdf",
    ContentClass.CANARY_MYSQL_DUMP: "mysql_dump",
}
# Path conventions per generator. The realism planner doesn't know
# about decoy-realistic credential locations (``~/.aws/credentials``,
# ``~/.git/config``); we map them per-class here so the planted
# artifact lands somewhere an attacker would actually look.
# Each value is a ``str.format`` template: :func:`_path_for` fills
# ``{persona}`` with the normalised login from :func:`_persona_login`.
# The MySQL dump entry has no placeholder, so it resolves to the same
# path for every persona.
_DEFAULT_PATH: dict[ContentClass, str] = {
    ContentClass.CANARY_AWS_CREDS: "/home/{persona}/.aws/credentials",
    ContentClass.CANARY_ENV_FILE: "/home/{persona}/app/.env",
    ContentClass.CANARY_GIT_CONFIG: "/home/{persona}/.git/config",
    ContentClass.CANARY_SSH_KEY: "/home/{persona}/.ssh/id_rsa",
    ContentClass.CANARY_HONEYDOC: "/home/{persona}/Documents/notes.html",
    ContentClass.CANARY_HONEYDOC_DOCX: "/home/{persona}/Documents/Q3-Operations-Review.docx",
    ContentClass.CANARY_HONEYDOC_PDF: "/home/{persona}/Documents/Q3-Operations-Review.pdf",
    ContentClass.CANARY_MYSQL_DUMP: "/var/backups/db_backup.sql",
}
def _persona_login(persona: str) -> str:
"""Mirror :func:`decnet.realism.naming._home`'s username conventions."""
candidate = persona.lower().replace(" ", "")
if candidate.isalnum() and candidate.isascii() and candidate:
return candidate
return "user"
def _path_for(plan: Plan) -> str:
    """Resolve where the canary described by *plan* should be planted.

    Canary placements follow stronger conventions
    (``~/.aws/credentials``, ``~/.ssh/id_rsa``) than the realism
    namer's vocabulary, so a :data:`_DEFAULT_PATH` entry for the plan's
    content class takes precedence over ``plan.target_path``. The
    template's ``{persona}`` placeholder is filled with the normalised
    login derived from ``plan.persona``.

    When the content class has no entry, ``plan.target_path`` is
    returned unchanged.
    """
    if plan.content_class not in _DEFAULT_PATH:
        return plan.target_path
    login = _persona_login(plan.persona)
    return _DEFAULT_PATH[plan.content_class].format(persona=login)
def _new_callback_token() -> str:
"""16 url-safe bytes — same shape canary slug fields use elsewhere."""
return _secrets.token_urlsafe(16)
async def cultivate(
    plan: Plan,
    repo: Any,
    *,
    http_base: Optional[str] = None,
    dns_zone: Optional[str] = None,
    created_by: str = "system",
) -> CanaryArtifact:
    """Turn a realism *plan* for a canary class into a plantable artifact.

    Steps, in order:

    1. Validate that ``plan.content_class`` is a canary class with a
       mapped generator.
    2. Mint a callback token and build a :class:`CanaryContext`.
    3. Ask the generator for the artifact bytes.
    4. Persist a ``canary_tokens`` row via ``repo.create_canary_token``
       so the canary worker can attribute callbacks to this token.
    5. Return a copy of the artifact carrying the resolved placement
       path for the SSH driver to plant.

    Args:
        plan: Realism plan whose ``content_class`` is a canary class.
        repo: Repository exposing an awaitable ``create_canary_token``.
        http_base: Callback HTTP base. When falsy, the
            ``DECNET_CANARY_HTTP_BASE`` environment variable is used,
            defaulting to ``""``.
        dns_zone: Callback DNS zone. When falsy, the
            ``DECNET_CANARY_DNS_ZONE`` environment variable is used,
            defaulting to ``""``.
        created_by: Value stored in the row's ``created_by`` field.

    Returns:
        A new :class:`CanaryArtifact` whose ``path`` is the placement
        path and whose other fields are taken from the generator's
        artifact.

    Raises:
        ValueError: ``plan.content_class`` is not a canary class.
        KeyError: No generator is mapped for ``plan.content_class``.

    Exceptions raised by the generator or by the repository write are
    not caught here. Per the module's notes, generators that need a
    callback host are expected to raise when none is configured, and
    the caller falls back to a non-canary plan.
    """
    content_class = plan.content_class
    if not content_class.is_canary():
        raise ValueError(
            f"cultivate() called with non-canary content_class={content_class!r}"
        )
    if content_class not in _CLASS_TO_GENERATOR:
        raise KeyError(
            f"no canary generator mapped for content_class={content_class!r}"
        )
    gen_name = _CLASS_TO_GENERATOR[content_class]

    callback_token = _new_callback_token()
    # Explicit arguments win; otherwise fall back to the environment.
    resolved_http_base = http_base or os.environ.get("DECNET_CANARY_HTTP_BASE", "")
    resolved_dns_zone = dns_zone or os.environ.get("DECNET_CANARY_DNS_ZONE", "")
    ctx = CanaryContext(
        callback_token=callback_token,
        http_base=resolved_http_base,
        dns_zone=resolved_dns_zone,
        persona="linux",  # all our deckies are POSIX in MVP
    )

    generated = get_generator(gen_name).generate(ctx)

    # The generator leaves ``path`` empty (the planter normally fills
    # it). The same resolved path goes into both the token row and the
    # returned artifact.
    placement_path = _path_for(plan)

    # Write the row *before* the caller plants the file, so a callback
    # fired during the plant itself (improbable but possible — DOCX
    # viewers can preview autoplay-style) can still be attributed.
    token_row = {
        "kind": "http",  # MVP: all realism-cultivated tokens use HTTP
        "decky_name": plan.decky_name,
        "instrumenter": None,
        "generator": gen_name,
        "placement_path": placement_path,
        "callback_token": callback_token,
        "secret_seed": callback_token,  # deterministic re-seed compatible
        "placed_at": datetime.now(timezone.utc),
        "created_by": created_by,
        "state": "planted",
    }
    await repo.create_canary_token(token_row)

    # Build a fresh artifact instead of mutating the generator's
    # object; only ``path`` differs, and ``notes`` is copied.
    return CanaryArtifact(
        path=placement_path,
        content=generated.content,
        mode=generated.mode,
        mtime_offset=generated.mtime_offset,
        instrumenter=generated.instrumenter,
        generator=generated.generator,
        notes=list(generated.notes),
    )

View File

@@ -198,15 +198,19 @@ class SSHDriver(ActivityDriver):
return result
async def _run_file(self, action: FileAction) -> ActivityResult:
# FileAction's content is a string; the realism path uses
# bytes-typed plant_file so binary blobs (DOCX/PDF, future
# canary artifacts) survive the wire. Encode-once here.
# FileAction.content_bytes wins when set — canary artifacts
# (DOCX/PDF/honeydoc binaries) need their bytes preserved
# exactly. Falls back to utf-8 encoding the str content for
# the inert-realism path.
# mtime carries through from the realism planner so the file
# doesn't stamp at wall-clock-now (the realism failure today).
body = action.content_bytes
if body is None:
body = action.content.encode("utf-8")
return await self.plant_file(
action.dst_name,
action.path,
action.content.encode("utf-8"),
body,
mode=0o644,
mtime=action.mtime,
)

View File

@@ -56,6 +56,11 @@ class FileAction:
content_class: str = ContentClass.NOTE.value
mtime: Optional[datetime] = None
description: str = "file:create"
# Canary artifacts (DOCX/PDF/honeydoc binaries) carry their bytes
# here so re-encoding ``content`` from utf-8 doesn't mangle them.
# When set, the SSH driver uses these bytes directly and ignores
# ``content``.
content_bytes: Optional[bytes] = None
@dataclass(frozen=True)
@@ -183,6 +188,38 @@ async def pick_file(
mtime=plan.mtime,
)
# Canary branch — the cultivator builds the bytes, picks the
# placement path, and persists the canary_tokens row. We map
# the resulting CanaryArtifact to a FileAction so the SSH
# driver's plant_file path is reused unchanged.
if plan.content_class.is_canary():
try:
from decnet.canary import cultivator as _cultivator
artifact = await _cultivator.cultivate(plan, repo)
except Exception: # noqa: BLE001
# Cultivation failed (no http_base/dns_zone configured,
# generator raised, repo write failed). Fall through to
# an inert file plant so the tick isn't wasted.
return FileAction(
dst_uuid=plan.decky_uuid,
dst_name=plan.decky_name,
path=plan.target_path or f"/tmp/.cache-{secrets.token_hex(3)}", # nosec B108
content=plan.body_hint or "",
persona=plan.persona,
content_class=plan.content_class.value,
mtime=plan.mtime,
)
return FileAction(
dst_uuid=plan.decky_uuid,
dst_name=plan.decky_name,
path=artifact.path,
content="", # ignored when content_bytes is set
content_bytes=artifact.content,
persona=plan.persona,
content_class=plan.content_class.value,
mtime=plan.mtime,
)
# Create branch. If LLM is wired, optionally swap body_hint for
# an LLM-authored body. Always keep the deterministic body_hint
# as the fallback the function call returns when LLM

View File

@@ -45,6 +45,21 @@ _SYSTEM_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = (
(ContentClass.LOG_DAEMON, 8),
(ContentClass.CACHE_TMP, 5),
)
# Canary classes are picked rarely. Each plant materialises a real
# CanaryToken row + DNS slug + HTTP URL — flooding the fleet with
# canaries makes the dashboard noisy and the per-decky alert surface
# explode. ~3% of file picks land here.
_CANARY_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = (
(ContentClass.CANARY_AWS_CREDS, 1),
(ContentClass.CANARY_ENV_FILE, 1),
(ContentClass.CANARY_GIT_CONFIG, 1),
(ContentClass.CANARY_SSH_KEY, 1),
(ContentClass.CANARY_HONEYDOC, 1),
(ContentClass.CANARY_HONEYDOC_DOCX, 1),
(ContentClass.CANARY_HONEYDOC_PDF, 1),
(ContentClass.CANARY_MYSQL_DUMP, 1),
)
_CANARY_PROBABILITY = 0.03
def _weighted_pick(
@@ -117,6 +132,33 @@ def pick(
decky, persona = rng.choice(eligible)
# Canary first — they're rare (~3% of file picks), uniformly
# weighted across generators. Falling here means the orchestrator
# plants a callback-bearing artifact this tick instead of an
# inert one.
if rng.random() < _CANARY_PROBABILITY:
content_class = _weighted_pick(_CANARY_CLASS_WEIGHTS, rng)
# Canary placement is the cultivator's job — plan.target_path
# is advisory; a "" lets the cultivator override entirely.
target_path = ""
body_hint = None
mtime = sample_mtime(persona.active_hours, now, rand=rng)
return Plan(
decky_uuid=decky["uuid"],
decky_name=decky["name"],
persona=persona.name,
content_class=content_class,
action="create",
target_path=target_path,
mtime=mtime,
body_hint=body_hint,
notes=(
f"persona={persona.name}",
f"class={content_class.value}",
"kind=canary",
),
)
# User vs system content — biased toward user (realism wins are
# bigger there).
if rng.random() < 0.7:

View File

@@ -0,0 +1,114 @@
"""Realism-driven canary cultivation.
Stage 7 of the realism migration: the orchestrator's planner picks a
canary content_class ~3% of file ticks; the cultivator turns that into
a CanaryArtifact + persisted CanaryToken row.
"""
from __future__ import annotations
from datetime import datetime, timezone
import pytest
import pytest_asyncio
from decnet.canary.cultivator import cultivate
from decnet.realism.taxonomy import ContentClass, Plan
from decnet.web.db.sqlite.repository import SQLiteRepository
@pytest_asyncio.fixture
async def repo(tmp_path):
    """Yield an initialised SQLite repository backed by a per-test DB file."""
    db_file = tmp_path / "decnet.db"
    store = SQLiteRepository(db_path=str(db_file))
    await store.initialize()
    yield store
    # Teardown: dispose of the repository's engine once the test is done.
    await store.engine.dispose()
def _plan(cls: ContentClass, persona: str = "admin") -> Plan:
    """Build a minimal ``create`` plan for *cls* on the test decky ``alpha``.

    ``target_path`` is left empty and ``body_hint`` is ``None``, which
    is the shape the tests feed to the cultivator for canary plans.
    """
    fixed_mtime = datetime(2026, 4, 27, 11, 30, tzinfo=timezone.utc)
    fields = dict(
        decky_uuid="d1",
        decky_name="alpha",
        persona=persona,
        content_class=cls,
        action="create",
        target_path="",
        mtime=fixed_mtime,
        body_hint=None,
    )
    return Plan(**fields)
@pytest.mark.asyncio
async def test_cultivate_records_canary_token_row(repo, monkeypatch):
    """One cultivate() call yields an artifact and exactly one token row."""
    env = {
        "DECNET_CANARY_HTTP_BASE": "https://canary.example.test",
        "DECNET_CANARY_DNS_ZONE": "canary.example.test",
    }
    for name, value in env.items():
        monkeypatch.setenv(name, value)
    expected_path = "/home/admin/.git/config"

    artifact = await cultivate(_plan(ContentClass.CANARY_GIT_CONFIG), repo)

    assert artifact.path == expected_path
    assert artifact.content

    # The row was persisted for this decky, and its callback token is
    # embedded in the planted bytes.
    rows = await repo.list_canary_tokens(decky_name="alpha")
    assert len(rows) == 1
    row = rows[0]
    assert row["generator"] == "git_config"
    assert row["placement_path"] == expected_path
    assert row["callback_token"] in artifact.content.decode("utf-8")
@pytest.mark.asyncio
async def test_cultivate_persists_path_for_each_class(repo, monkeypatch):
    """Every canary class lands at its conventional path and is persisted.

    Covers all eight canary content classes (including the DOCX and PDF
    honeydocs) and checks both the returned artifact's path and the
    ``placement_path`` stored in the ``canary_tokens`` rows.
    """
    monkeypatch.setenv("DECNET_CANARY_HTTP_BASE", "https://canary.example.test")
    monkeypatch.setenv("DECNET_CANARY_DNS_ZONE", "canary.example.test")
    classes_and_paths = {
        ContentClass.CANARY_AWS_CREDS: "/home/admin/.aws/credentials",
        ContentClass.CANARY_ENV_FILE: "/home/admin/app/.env",
        ContentClass.CANARY_GIT_CONFIG: "/home/admin/.git/config",
        ContentClass.CANARY_SSH_KEY: "/home/admin/.ssh/id_rsa",
        ContentClass.CANARY_HONEYDOC: "/home/admin/Documents/notes.html",
        ContentClass.CANARY_HONEYDOC_DOCX: (
            "/home/admin/Documents/Q3-Operations-Review.docx"
        ),
        ContentClass.CANARY_HONEYDOC_PDF: (
            "/home/admin/Documents/Q3-Operations-Review.pdf"
        ),
        ContentClass.CANARY_MYSQL_DUMP: "/var/backups/db_backup.sql",
    }
    for cls, expected in classes_and_paths.items():
        artifact = await cultivate(_plan(cls), repo)
        assert artifact.path == expected, (
            f"{cls.value!r} planted at {artifact.path!r}, want {expected!r}"
        )

    # The test name promises persistence: one row per cultivated class,
    # each recording the same placement path the artifact carries.
    # Compared as a set because row ordering is not asserted here.
    rows = await repo.list_canary_tokens(decky_name="alpha")
    assert len(rows) == len(classes_and_paths)
    persisted_paths = {row["placement_path"] for row in rows}
    assert persisted_paths == set(classes_and_paths.values())
@pytest.mark.asyncio
async def test_cultivate_rejects_non_canary_class(repo):
    """cultivate() refuses an inert content class with a ValueError."""
    inert_plan = _plan(ContentClass.NOTE)
    with pytest.raises(ValueError, match="non-canary"):
        await cultivate(inert_plan, repo)
@pytest.mark.asyncio
async def test_cultivate_persona_login_normalisation(repo, monkeypatch):
    """A display-name persona is normalised into a login for the home path."""
    monkeypatch.setenv("DECNET_CANARY_HTTP_BASE", "https://canary.example.test")
    monkeypatch.setenv("DECNET_CANARY_DNS_ZONE", "canary.example.test")
    plan = _plan(ContentClass.CANARY_AWS_CREDS, persona="John Smith")

    artifact = await cultivate(plan, repo)

    # "John Smith" is lower-cased and its space removed, the same
    # convention the realism namer's _home() is meant to follow.
    assert artifact.path == "/home/johnsmith/.aws/credentials"
@pytest.mark.asyncio
async def test_cultivate_artifact_does_not_leak_decnet_string(repo, monkeypatch):
    """Stealth contract (per feedback_stealth.md): planted canary bytes
    must never contain the DECNET literal, in any letter case — it would
    reveal to an attacker that the file is a honeypot trap.

    Only the text-based canary classes are checked here.
    """
    monkeypatch.setenv("DECNET_CANARY_HTTP_BASE", "https://canary.example.test")
    monkeypatch.setenv("DECNET_CANARY_DNS_ZONE", "canary.example.test")
    text_classes = (
        ContentClass.CANARY_AWS_CREDS,
        ContentClass.CANARY_GIT_CONFIG,
        ContentClass.CANARY_ENV_FILE,
        ContentClass.CANARY_SSH_KEY,
    )
    for cls in text_classes:
        artifact = await cultivate(_plan(cls), repo)
        # Undecodable bytes are replaced rather than raising, so the
        # substring check always runs.
        body = artifact.content.decode("utf-8", errors="replace")
        leaked = "decnet" in body.lower()
        assert not leaked, f"{cls.value!r} body leaked 'decnet': {body[:120]!r}"

View File

@@ -80,16 +80,29 @@ def test_pick_distributes_across_user_and_system_classes() -> None:
assert system_classes, f"no system-class plans in 80 trials: {seen}"
def test_pick_never_returns_canary_class_in_stage3() -> None:
def test_canary_picks_are_rare() -> None:
"""Stage 7: canary content_classes ARE picked, but bounded.
The documented rate is ~3% of file picks per
decnet/realism/planner.py:_CANARY_PROBABILITY. We trial a large
sample and assert the rate stays under a generous ceiling so a
typo bumping the constant to 30% explodes here loudly.
"""
deckies = [_decky()]
for seed in range(40):
canary_count = 0
create_count = 0
for seed in range(500):
plan = pick(deckies, _NOW, rand=random.Random(seed))
if plan is None:
continue
assert not plan.content_class.is_canary(), (
"canary class slipped into the realism planner; cultivator "
"lands in stage 7"
)
create_count += 1
if plan.content_class.is_canary():
canary_count += 1
# 3% target with a 6% upper bound — sampling noise on 500 trials
# is comfortably below this for the documented rate.
rate = canary_count / max(1, create_count)
assert rate <= 0.06, f"canary rate {rate:.2%} exceeds 6% ceiling"
assert canary_count > 0, "expected at least one canary across 500 seeds"
def test_pick_persists_persona_window_in_notes() -> None: