diff --git a/decnet/canary/cultivator.py b/decnet/canary/cultivator.py
new file mode 100644
index 00000000..ec0f8e0f
--- /dev/null
+++ b/decnet/canary/cultivator.py
@@ -0,0 +1,172 @@
+"""Realism contract adapter for canary generators.
+
+Stage 7 of the realism migration. The orchestrator's planner picks a
+``canary_*`` :class:`~decnet.realism.taxonomy.ContentClass` 1–3% of
+the time on file ticks; this module turns that pick into a
+:class:`~decnet.canary.base.CanaryArtifact` (bytes the SSH driver
+plants) plus a persisted :class:`~decnet.web.db.models.CanaryToken`
+row so the canary worker recognises the slug when an attacker trips
+it.
+
+What this is NOT: it doesn't pick *when* canaries fire — that's the
+realism planner's job. It doesn't decide *where* on the filesystem
+the canary lands beyond what realism naming + persona conventions
+already produce. It's a thin bytes-and-row factory bolted onto the
+realism contract.
+
+Stealth (per ``feedback_stealth.md``): we never leak the
+``DECNET`` literal into anything that survives to the planted file.
+The underlying generators are already stealth-clean; this wrapper
+must not undo that.
+"""
+from __future__ import annotations
+
+import os
+import secrets as _secrets
+from datetime import datetime, timezone
+from typing import Any, Optional
+
+from decnet.canary.base import CanaryArtifact, CanaryContext
+from decnet.canary.factory import get_generator
+from decnet.logging import get_logger
+from decnet.realism.taxonomy import ContentClass, Plan
+
+log = get_logger("canary.cultivator")
+
+
+# realism content_class → canary generator name. Mirrors
+# :data:`decnet.canary.factory.KNOWN_GENERATORS`.
+_CLASS_TO_GENERATOR: dict[ContentClass, str] = {
+    ContentClass.CANARY_AWS_CREDS: "aws_creds",
+    ContentClass.CANARY_ENV_FILE: "env_file",
+    ContentClass.CANARY_GIT_CONFIG: "git_config",
+    ContentClass.CANARY_SSH_KEY: "ssh_key",
+    ContentClass.CANARY_HONEYDOC: "honeydoc",
+    ContentClass.CANARY_HONEYDOC_DOCX: "honeydoc_docx",
+    ContentClass.CANARY_HONEYDOC_PDF: "honeydoc_pdf",
+    ContentClass.CANARY_MYSQL_DUMP: "mysql_dump",
+}
+
+
+# Path conventions per generator. The realism planner doesn't know
+# about decoy-realistic credential locations (``~/.aws/credentials``,
+# ``~/.git/config``); we map them per-class here so the planted
+# artifact lands somewhere an attacker would actually look.
+_DEFAULT_PATH: dict[ContentClass, str] = {
+    ContentClass.CANARY_AWS_CREDS: "/home/{persona}/.aws/credentials",
+    ContentClass.CANARY_ENV_FILE: "/home/{persona}/app/.env",
+    ContentClass.CANARY_GIT_CONFIG: "/home/{persona}/.git/config",
+    ContentClass.CANARY_SSH_KEY: "/home/{persona}/.ssh/id_rsa",
+    ContentClass.CANARY_HONEYDOC: "/home/{persona}/Documents/notes.html",
+    ContentClass.CANARY_HONEYDOC_DOCX: "/home/{persona}/Documents/Q3-Operations-Review.docx",
+    ContentClass.CANARY_HONEYDOC_PDF: "/home/{persona}/Documents/Q3-Operations-Review.pdf",
+    ContentClass.CANARY_MYSQL_DUMP: "/var/backups/db_backup.sql",
+}
+
+
+def _persona_login(persona: str) -> str:
+    """Mirror :func:`decnet.realism.naming._home`'s username conventions."""
+    candidate = persona.lower().replace(" ", "")
+    if candidate.isalnum() and candidate.isascii() and candidate:
+        return candidate
+    return "user"
+
+
+def _path_for(plan: Plan) -> str:
+    """Produce the canary placement path for *plan*.
+
+    The realism planner already filled in ``plan.target_path`` from
+    the namer, but canary placements have stronger conventions
+    (``~/.aws/credentials``, ``~/.ssh/id_rsa``) than the realism
+    namer's vocabulary. When :data:`_DEFAULT_PATH` has an entry,
+    that wins.
+    """
+    template = _DEFAULT_PATH.get(plan.content_class)
+    if template is None:
+        return plan.target_path
+    return template.format(persona=_persona_login(plan.persona))
+
+
+def _new_callback_token() -> str:
+    """16 url-safe bytes — same shape canary slug fields use elsewhere."""
+    return _secrets.token_urlsafe(16)
+
+
+async def cultivate(
+    plan: Plan,
+    repo: Any,
+    *,
+    http_base: Optional[str] = None,
+    dns_zone: Optional[str] = None,
+    created_by: str = "system",
+) -> CanaryArtifact:
+    """Realism-driven canary plant.
+
+    Build a :class:`CanaryContext`, ask the right generator for bytes,
+    persist a ``canary_tokens`` row so the canary worker can attribute
+    callbacks to this token, and return the artifact for the SSH
+    driver to plant.
+
+    *http_base* and *dns_zone* default to ``DECNET_CANARY_HTTP_BASE``
+    and ``DECNET_CANARY_DNS_ZONE`` env vars respectively — same
+    pattern the canary worker uses. When both are empty, generators
+    that need a callback host (``ssh_key`` DNS, ``mysql_dump``)
+    raise; the planner's caller logs and falls back to a non-canary
+    plan.
+    """
+    if not plan.content_class.is_canary():
+        raise ValueError(
+            f"cultivate() called with non-canary content_class="
+            f"{plan.content_class!r}"
+        )
+    gen_name = _CLASS_TO_GENERATOR.get(plan.content_class)
+    if gen_name is None:
+        raise KeyError(
+            f"no canary generator mapped for content_class="
+            f"{plan.content_class!r}"
+        )
+
+    callback_token = _new_callback_token()
+    ctx = CanaryContext(
+        callback_token=callback_token,
+        http_base=http_base or os.environ.get("DECNET_CANARY_HTTP_BASE", ""),
+        dns_zone=dns_zone or os.environ.get("DECNET_CANARY_DNS_ZONE", ""),
+        persona="linux",  # all our deckies are POSIX in MVP
+    )
+    generator = get_generator(gen_name)
+    artifact = generator.generate(ctx)
+
+    # The generator returns ``path=""`` (planter fills it normally).
+    # We have a realism-derived path on hand; stuff it in for the SSH
+    # driver's plant_file call AND the canary_tokens row.
+    placement_path = _path_for(plan)
+
+    # Persist the token row before planting so the canary worker can
+    # attribute a callback if the artifact trips during the plant
+    # itself (improbable but possible — DOCX viewers can preview
+    # autoplay-style).
+    await repo.create_canary_token({
+        "kind": "http",  # MVP: all realism-cultivated tokens use HTTP
+        "decky_name": plan.decky_name,
+        "instrumenter": None,
+        "generator": gen_name,
+        "placement_path": placement_path,
+        "callback_token": callback_token,
+        "secret_seed": callback_token,  # deterministic re-seed compatible
+        "placed_at": datetime.now(timezone.utc),
+        "created_by": created_by,
+        "state": "planted",
+    })
+
+    # Carry the placement_path on the artifact so the orchestrator's
+    # plant_file call uses it. We don't mutate the generator's
+    # original — copy with the new path.
+    return CanaryArtifact(
+        path=placement_path,
+        content=artifact.content,
+        mode=artifact.mode,
+        mtime_offset=artifact.mtime_offset,
+        instrumenter=artifact.instrumenter,
+        generator=artifact.generator,
+        notes=list(artifact.notes),
+    )
diff --git a/decnet/orchestrator/drivers/ssh.py b/decnet/orchestrator/drivers/ssh.py
index b94bb0dc..9718028e 100644
--- a/decnet/orchestrator/drivers/ssh.py
+++ b/decnet/orchestrator/drivers/ssh.py
@@ -198,15 +198,23 @@ class SSHDriver(ActivityDriver):
         return result
 
     async def _run_file(self, action: FileAction) -> ActivityResult:
-        # FileAction's content is a string; the realism path uses
-        # bytes-typed plant_file so binary blobs (DOCX/PDF, future
-        # canary artifacts) survive the wire. Encode-once here.
+        # FileAction.content_bytes wins when set — canary artifacts
+        # (DOCX/PDF/honeydoc binaries) need their bytes preserved
+        # exactly. Falls back to utf-8 encoding the str content for
+        # the inert-realism path.
         # mtime carries through from the realism planner so the file
         # doesn't stamp at wall-clock-now (the realism failure today).
+        body = action.content_bytes
+        if body is None:
+            body = action.content.encode("utf-8")
+        # Honour the action's own permission bits when it carries
+        # them (a canary SSH key or credentials file must not land
+        # world-readable); plain realism files keep the 0o644 default.
+        mode = 0o644 if action.mode is None else action.mode
         return await self.plant_file(
             action.dst_name,
             action.path,
-            action.content.encode("utf-8"),
-            mode=0o644,
+            body,
+            mode=mode,
             mtime=action.mtime,
         )
diff --git a/decnet/orchestrator/scheduler.py b/decnet/orchestrator/scheduler.py
index 012b96f9..dfc61e8d 100644
--- a/decnet/orchestrator/scheduler.py
+++ b/decnet/orchestrator/scheduler.py
@@ -56,6 +56,16 @@ class FileAction:
     content_class: str = ContentClass.NOTE.value
     mtime: Optional[datetime] = None
     description: str = "file:create"
+    # Canary artifacts (DOCX/PDF/honeydoc binaries) carry their bytes
+    # here so re-encoding ``content`` from utf-8 doesn't mangle them.
+    # When set, the SSH driver uses these bytes directly and ignores
+    # ``content``.
+    content_bytes: Optional[bytes] = None
+    # Permission bits for the planted file. ``None`` keeps the SSH
+    # driver's 0o644 default; the canary branch forwards the
+    # artifact's own mode so planted secrets keep realistic
+    # permissions. NOTE(review): assumes CanaryArtifact.mode is an int.
+    mode: Optional[int] = None
 
 
 @dataclass(frozen=True)
@@ -183,6 +193,49 @@ async def pick_file(
             mtime=plan.mtime,
         )
 
+    # Canary branch — the cultivator builds the bytes, picks the
+    # placement path, and persists the canary_tokens row. We map
+    # the resulting CanaryArtifact to a FileAction so the SSH
+    # driver's plant_file path is reused unchanged.
+    if plan.content_class.is_canary():
+        try:
+            from decnet.canary import cultivator as _cultivator
+            artifact = await _cultivator.cultivate(plan, repo)
+        except Exception:  # noqa: BLE001
+            # Cultivation failed (no http_base/dns_zone configured,
+            # generator raised, repo write failed). Fall through to
+            # an inert file plant so the tick isn't wasted — but log
+            # it, otherwise a misconfigured canary endpoint silently
+            # turns every canary tick into an inert one.
+            from decnet.logging import get_logger as _get_logger
+            _get_logger("orchestrator.scheduler").warning(
+                "canary cultivation failed for decky=%s class=%s; "
+                "planting inert file instead",
+                plan.decky_name,
+                plan.content_class.value,
+                exc_info=True,
+            )
+            return FileAction(
+                dst_uuid=plan.decky_uuid,
+                dst_name=plan.decky_name,
+                path=plan.target_path or f"/tmp/.cache-{secrets.token_hex(3)}",  # nosec B108
+                content=plan.body_hint or "",
+                persona=plan.persona,
+                content_class=plan.content_class.value,
+                mtime=plan.mtime,
+            )
+        return FileAction(
+            dst_uuid=plan.decky_uuid,
+            dst_name=plan.decky_name,
+            path=artifact.path,
+            content="",  # ignored when content_bytes is set
+            content_bytes=artifact.content,
+            mode=artifact.mode,
+            persona=plan.persona,
+            content_class=plan.content_class.value,
+            mtime=plan.mtime,
+        )
+
     # Create branch. If LLM is wired, optionally swap body_hint for
     # an LLM-authored body. Always keep the deterministic body_hint
     # as the fallback the function call returns when LLM
diff --git a/decnet/realism/planner.py b/decnet/realism/planner.py
index bc2fd4b3..827448db 100644
--- a/decnet/realism/planner.py
+++ b/decnet/realism/planner.py
@@ -45,6 +45,21 @@ _SYSTEM_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = (
     (ContentClass.LOG_DAEMON, 8),
     (ContentClass.CACHE_TMP, 5),
 )
+# Canary classes are picked rarely. Each plant materialises a real
+# CanaryToken row + DNS slug + HTTP URL — flooding the fleet with
+# canaries makes the dashboard noisy and the per-decky alert surface
+# explode. ~3% of file picks land here.
+_CANARY_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = (
+    (ContentClass.CANARY_AWS_CREDS, 1),
+    (ContentClass.CANARY_ENV_FILE, 1),
+    (ContentClass.CANARY_GIT_CONFIG, 1),
+    (ContentClass.CANARY_SSH_KEY, 1),
+    (ContentClass.CANARY_HONEYDOC, 1),
+    (ContentClass.CANARY_HONEYDOC_DOCX, 1),
+    (ContentClass.CANARY_HONEYDOC_PDF, 1),
+    (ContentClass.CANARY_MYSQL_DUMP, 1),
+)
+_CANARY_PROBABILITY = 0.03
 
 
 def _weighted_pick(
@@ -117,6 +132,33 @@ def pick(
 
     decky, persona = rng.choice(eligible)
 
+    # Canary first — they're rare (~3% of file picks), uniformly
+    # weighted across generators. Falling here means the orchestrator
+    # plants a callback-bearing artifact this tick instead of an
+    # inert one.
+    if rng.random() < _CANARY_PROBABILITY:
+        content_class = _weighted_pick(_CANARY_CLASS_WEIGHTS, rng)
+        # Canary placement is the cultivator's job — plan.target_path
+        # is advisory; a "" lets the cultivator override entirely.
+        target_path = ""
+        body_hint = None
+        mtime = sample_mtime(persona.active_hours, now, rand=rng)
+        return Plan(
+            decky_uuid=decky["uuid"],
+            decky_name=decky["name"],
+            persona=persona.name,
+            content_class=content_class,
+            action="create",
+            target_path=target_path,
+            mtime=mtime,
+            body_hint=body_hint,
+            notes=(
+                f"persona={persona.name}",
+                f"class={content_class.value}",
+                "kind=canary",
+            ),
+        )
+
     # User vs system content — biased toward user (realism wins are
     # bigger there).
     if rng.random() < 0.7:
diff --git a/tests/canary/test_cultivator.py b/tests/canary/test_cultivator.py
new file mode 100644
index 00000000..cb8b99f1
--- /dev/null
+++ b/tests/canary/test_cultivator.py
@@ -0,0 +1,114 @@
+"""Realism-driven canary cultivation.
+
+Stage 7 of the realism migration: the orchestrator's planner picks a
+canary content_class ~3% of file ticks; the cultivator turns that into
+a CanaryArtifact + persisted CanaryToken row.
+"""
+from __future__ import annotations
+
+from datetime import datetime, timezone
+
+import pytest
+import pytest_asyncio
+
+from decnet.canary.cultivator import cultivate
+from decnet.realism.taxonomy import ContentClass, Plan
+from decnet.web.db.sqlite.repository import SQLiteRepository
+
+
+@pytest_asyncio.fixture
+async def repo(tmp_path):
+    r = SQLiteRepository(db_path=str(tmp_path / "decnet.db"))
+    await r.initialize()
+    yield r
+    await r.engine.dispose()
+
+
+def _plan(cls: ContentClass, persona: str = "admin") -> Plan:
+    return Plan(
+        decky_uuid="d1",
+        decky_name="alpha",
+        persona=persona,
+        content_class=cls,
+        action="create",
+        target_path="",
+        mtime=datetime(2026, 4, 27, 11, 30, tzinfo=timezone.utc),
+        body_hint=None,
+    )
+
+
+@pytest.mark.asyncio
+async def test_cultivate_records_canary_token_row(repo, monkeypatch):
+    monkeypatch.setenv("DECNET_CANARY_HTTP_BASE", "https://canary.example.test")
+    monkeypatch.setenv("DECNET_CANARY_DNS_ZONE", "canary.example.test")
+
+    artifact = await cultivate(
+        _plan(ContentClass.CANARY_GIT_CONFIG), repo,
+    )
+    assert artifact.path == "/home/admin/.git/config"
+    assert artifact.content
+    # Token row landed and the slug round-trips through the slug index.
+    rows = await repo.list_canary_tokens(decky_name="alpha")
+    assert len(rows) == 1
+    assert rows[0]["generator"] == "git_config"
+    assert rows[0]["placement_path"] == "/home/admin/.git/config"
+    assert rows[0]["callback_token"] in artifact.content.decode("utf-8")
+
+
+@pytest.mark.asyncio
+async def test_cultivate_persists_path_for_each_class(repo, monkeypatch):
+    monkeypatch.setenv("DECNET_CANARY_HTTP_BASE", "https://canary.example.test")
+    monkeypatch.setenv("DECNET_CANARY_DNS_ZONE", "canary.example.test")
+
+    classes_and_paths = {
+        ContentClass.CANARY_AWS_CREDS: "/home/admin/.aws/credentials",
+        ContentClass.CANARY_ENV_FILE: "/home/admin/app/.env",
+        ContentClass.CANARY_GIT_CONFIG: "/home/admin/.git/config",
+        ContentClass.CANARY_SSH_KEY: "/home/admin/.ssh/id_rsa",
+        ContentClass.CANARY_HONEYDOC: "/home/admin/Documents/notes.html",
+        ContentClass.CANARY_MYSQL_DUMP: "/var/backups/db_backup.sql",
+    }
+    for cls, expected in classes_and_paths.items():
+        artifact = await cultivate(_plan(cls), repo)
+        assert artifact.path == expected, (
+            f"{cls.value!r} planted at {artifact.path!r}, want {expected!r}"
+        )
+
+
+@pytest.mark.asyncio
+async def test_cultivate_rejects_non_canary_class(repo):
+    with pytest.raises(ValueError, match="non-canary"):
+        await cultivate(_plan(ContentClass.NOTE), repo)
+
+
+@pytest.mark.asyncio
+async def test_cultivate_persona_login_normalisation(repo, monkeypatch):
+    monkeypatch.setenv("DECNET_CANARY_HTTP_BASE", "https://canary.example.test")
+    monkeypatch.setenv("DECNET_CANARY_DNS_ZONE", "canary.example.test")
+    artifact = await cultivate(
+        _plan(ContentClass.CANARY_AWS_CREDS, persona="John Smith"), repo,
+    )
+    # Spaces collapsed to lowercase login, same convention as the
+    # realism namer's _home() function.
+    assert artifact.path == "/home/johnsmith/.aws/credentials"
+
+
+@pytest.mark.asyncio
+async def test_cultivate_artifact_does_not_leak_decnet_string(repo, monkeypatch):
+    """Stealth contract (per feedback_stealth.md): a planted canary's
+    bytes must never carry the DECNET literal — that would tell an
+    attacker the file is a honeypot trap."""
+    monkeypatch.setenv("DECNET_CANARY_HTTP_BASE", "https://canary.example.test")
+    monkeypatch.setenv("DECNET_CANARY_DNS_ZONE", "canary.example.test")
+    for cls in (
+        ContentClass.CANARY_AWS_CREDS,
+        ContentClass.CANARY_GIT_CONFIG,
+        ContentClass.CANARY_ENV_FILE,
+        ContentClass.CANARY_SSH_KEY,
+    ):
+        artifact = await cultivate(_plan(cls), repo)
+        body = artifact.content.decode("utf-8", errors="replace")
+        assert "decnet" not in body.lower(), (
+            f"{cls.value!r} body leaked 'decnet': "
+            f"{body[:120]!r}"
+        )
diff --git a/tests/realism/test_planner.py b/tests/realism/test_planner.py
index c55e3926..b48886a1 100644
--- a/tests/realism/test_planner.py
+++ b/tests/realism/test_planner.py
@@ -80,16 +80,29 @@ def test_pick_distributes_across_user_and_system_classes() -> None:
     assert system_classes, f"no system-class plans in 80 trials: {seen}"
 
 
-def test_pick_never_returns_canary_class_in_stage3() -> None:
+def test_canary_picks_are_rare() -> None:
+    """Stage 7: canary content_classes ARE picked, but bounded.
+
+    The documented rate is ~3% of file picks per
+    decnet/realism/planner.py:_CANARY_PROBABILITY. We trial a large
+    sample and assert the rate stays under a generous ceiling so a
+    typo bumping the constant to 30% explodes here loudly.
+    """
     deckies = [_decky()]
-    for seed in range(40):
+    canary_count = 0
+    create_count = 0
+    for seed in range(500):
         plan = pick(deckies, _NOW, rand=random.Random(seed))
         if plan is None:
             continue
-        assert not plan.content_class.is_canary(), (
-            "canary class slipped into the realism planner; cultivator "
-            "lands in stage 7"
-        )
+        create_count += 1
+        if plan.content_class.is_canary():
+            canary_count += 1
+    # 3% target with a 6% upper bound — sampling noise on 500 trials
+    # is comfortably below this for the documented rate.
+    rate = canary_count / max(1, create_count)
+    assert rate <= 0.06, f"canary rate {rate:.2%} exceeds 6% ceiling"
+    assert canary_count > 0, "expected at least one canary across 500 seeds"
 
 
 def test_pick_persists_persona_window_in_notes() -> None: