merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

@@ -0,0 +1,9 @@
"""DECNET orchestrator — synthetic life-injection worker.
Drives realistic-looking activity between deckies (inter-decky traffic and
in-decky filesystem mutations) so the honeypot stops looking suspiciously
static. Sole writer of the ``OrchestratorEvent`` table.
"""
from decnet.orchestrator.worker import orchestrator_worker
__all__ = ["orchestrator_worker"]

View File

@@ -0,0 +1,74 @@
"""Activity drivers for the orchestrator.
Concrete drivers register dispatch in :func:`get_driver_for`. Same
lazy-import pattern as :mod:`decnet.canary.factory`: the import-time
cost of :mod:`decnet.orchestrator.drivers` stays low for callers that
only need :class:`ActivityResult` / :class:`ActivityDriver`.
"""
from __future__ import annotations
from decnet.orchestrator.drivers.base import (
ActivityDriver,
ActivityResult,
Driver,
)
from decnet.orchestrator.scheduler import Action, FileAction, TrafficAction
__all__ = [
"ActivityDriver",
"ActivityResult",
"Driver",
"SSHDriver",
"get_driver_for",
]
def __getattr__(name: str): # pragma: no cover - import passthrough
"""Lazy access to concrete drivers.
Avoids dragging the docker-exec / email-driver code into every
consumer that only needs the ABC.
"""
if name == "SSHDriver":
from decnet.orchestrator.drivers.ssh import SSHDriver
return SSHDriver
if name == "EmailDriver":
from decnet.orchestrator.drivers.email import EmailDriver
return EmailDriver
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
def get_driver_for(action: Action) -> ActivityDriver:
    """Return the concrete driver that handles *action*.

    Stage 4 of the realism migration adds this seam so the orchestrator
    worker can dispatch by action type without an isinstance chain in
    ``_one_tick``.  Stage 5 wires the worker to call this function
    instead of holding a single ``SSHDriver`` instance.

    Dispatch table:

    * :class:`TrafficAction` / :class:`FileAction` / :class:`EditAction`
      → :class:`SSHDriver`
    * :class:`EmailAction` → ``EmailDriver``

    Raises:
        TypeError: no driver is registered for the action's type.
    """
    # Lazy imports keep the side-effecting docker-exec / email-driver
    # modules out of every importer's graph.
    from decnet.orchestrator.drivers.ssh import SSHDriver
    # ``EditAction`` is imported here rather than at module level so this
    # fix is self-contained; the ssh driver module already imports it from
    # the same place, so it is guaranteed to exist once SSHDriver loaded.
    from decnet.orchestrator.scheduler import EditAction

    # SSHDriver.run handles all three shapes; EditAction used to be
    # missing here and fell through to the TypeError below.
    if isinstance(action, (TrafficAction, FileAction, EditAction)):
        return SSHDriver()

    # Importing inside the function avoids a cycle with realism.llm at
    # module load time.
    try:
        from decnet.orchestrator.emailgen.scheduler import EmailAction
    except ImportError:  # pragma: no cover - scheduler always exists
        EmailAction = None  # type: ignore[assignment]
    if EmailAction is not None and isinstance(action, EmailAction):
        from decnet.orchestrator.drivers.email import EmailDriver
        return EmailDriver()

    raise TypeError(
        f"no driver registered for action type {type(action).__name__}"
    )

View File

@@ -0,0 +1,92 @@
"""Driver ABC for orchestrator actions.
Each concrete driver (SSH, Email, future HTTP/SMB/MySQL) maps one
:class:`~decnet.orchestrator.scheduler.Action` shape to a side effect
on a target decky and returns an :class:`ActivityResult` the
orchestrator persists.
The ABC lives here, the dispatch factory in
:mod:`decnet.orchestrator.drivers` ``__init__``, and the impls in
sibling modules — same pattern as :mod:`decnet.canary.factory`,
:mod:`decnet.web.db.factory`, and :mod:`decnet.bus.factory`.
Why ABC and not :class:`Protocol`: drivers also expose lower-level
helpers (``plant_file``, ``read_file``) that the planner-driven
realism path will call directly without going through ``run``.
Inheritance pins the contract for those helpers; a structural
protocol would let a typo silently produce a half-implemented driver.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
from decnet.orchestrator.scheduler import Action
@dataclass
class ActivityResult:
    """Result envelope returned by a single driver invocation.

    ``payload`` carries the per-action JSON envelope; the worker writes
    it to the ``OrchestratorEvent.payload`` column and into the bus
    event body.
    """

    # Whether the driver judged the invocation to have worked.
    success: bool
    # Built through a factory so every instance gets its own dict.
    payload: dict[str, Any] = field(default_factory=dict)
class ActivityDriver(ABC):
    """Abstract base shared by every orchestrator driver.

    :meth:`run` is the only abstract member: it is the action-shape
    dispatch each subclass must provide.  :meth:`plant_file` and
    :meth:`read_file` are optional file helpers used by the realism
    edit-in-place path; their defaults raise
    :class:`NotImplementedError`, which is the right behaviour for a
    driver that never touches files on the target decky.
    """

    @abstractmethod
    async def run(self, action: Action) -> ActivityResult:
        """Carry out *action* on the decky it targets."""

    async def plant_file(
        self,
        decky_name: str,
        path: str,
        content: bytes,
        *,
        mode: int = 0o600,
        mtime: datetime | None = None,
    ) -> ActivityResult:
        """Write *content* to *path* inside *decky_name*.

        Content is bytes-typed so binary artifacts (DOCX/PDF) can be
        passed through unchanged.  Drivers with a write transport
        override this; the base implementation always raises.

        Raises:
            NotImplementedError: the driver has no write support.
        """
        driver_name = type(self).__name__
        raise NotImplementedError(f"{driver_name} does not support plant_file")

    async def read_file(self, decky_name: str, path: str) -> bytes:
        """Return the bytes stored at *path* inside *decky_name*.

        Intended for the realism edit-in-place flow, which reads the
        previous body before writing the next iteration back.  The base
        implementation always raises.

        Raises:
            NotImplementedError: the driver has no read support.
        """
        driver_name = type(self).__name__
        raise NotImplementedError(f"{driver_name} does not support read_file")
# Back-compat alias so existing imports of ``Driver`` keep working
# while consumers transition to ``ActivityDriver``. Removed once the
# realism migration is complete.
Driver = ActivityDriver

View File

@@ -0,0 +1,290 @@
"""Email driver — pluggable-LLM EML generation + decky-side delivery.
One :class:`EmailAction` becomes one EML written into the mail decky's
configured emailgen spool directory (``/var/spool/decnet-emails/`` by
default). The IMAP/POP3 service templates read that spool at request
time so attackers see the generated mail in their MUA.
The LLM call goes through :mod:`decnet.realism.llm` — backend-agnostic
by construction so swapping Ollama for the Anthropic API, vLLM, or
llama.cpp is a config change, not a driver rewrite.
Output is parsed-and-repaired into a valid EML using
:mod:`email.mime.*`; the worker then ``docker exec``\\s a ``tee`` to
drop the file inside the target container, followed by a
``touch -d <Date>`` so the file's mtime matches the email's RFC 2822
``Date:`` header.
Per CLAUDE.md "no shell strings": every subprocess invocation uses an
argv list, never ``shell=True``. EML payloads are piped via ``stdin``,
not interpolated into argv.
"""
from __future__ import annotations
import asyncio
import shlex
from datetime import datetime, timezone
from email.mime.text import MIMEText
from email.utils import formatdate
from typing import Any, Optional
from decnet.logging import get_logger
from decnet.orchestrator.drivers.base import ActivityDriver, ActivityResult
from decnet.orchestrator.emailgen.scheduler import EmailAction
from decnet.orchestrator.emailgen.threads import new_message_id
from decnet.realism.llm import LLMBackend, LLMTimeout, get_llm
from decnet.realism.prompts.email import PromptInputs, build as build_prompt
log = get_logger("orchestrator.email")
_DOCKER = "docker"
# docker-exec wall-clock cap for the per-EML write.
_DOCKER_TIMEOUT = 8.0
# Container suffix for the IMAP service on a mail decky.
_IMAP_CONTAINER_SUFFIX = "-imap"
_POP3_CONTAINER_SUFFIX = "-pop3"
# Spool path inside the container. Match the IMAP template's stubbed
# IMAP_EMAIL_SEED location once wiring lands; shipping the constant now
# lets that integration land independently.
_SPOOL_DIR = "/var/spool/decnet-emails"
async def _run_capture(
    argv: list[str],
    *,
    stdin_data: Optional[bytes] = None,
    timeout: float = _DOCKER_TIMEOUT,
) -> tuple[int, str, str]:
    """Spawn *argv*, optionally feeding *stdin_data*. Never raises.

    Returns ``(rc, stdout, stderr)`` with both streams decoded as UTF-8
    (undecodable bytes replaced).  Failures are reported through
    shell-style return codes instead of exceptions:

    * ``127`` — the executable in ``argv[0]`` does not exist.
    * ``126`` — the executable exists but could not be started
      (any other :class:`OSError`, e.g. permission denied).
    * ``124`` — the process outlived *timeout* seconds and was killed.
    * ``-1`` — the process finished without reporting a return code.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdin=asyncio.subprocess.PIPE if stdin_data is not None else None,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as exc:
        return 127, "", f"argv[0] not found: {exc}"
    except OSError as exc:
        # Keeps the "never raises" promise for spawn failures other than
        # a missing binary (EACCES, ENOEXEC, EMFILE, ...).
        return 126, "", f"argv[0] could not be started: {exc}"

    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(stdin_data), timeout=timeout,
        )
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            # Already gone between the timeout and the kill.
            pass
        # Reap the child so its transport and pipes are released before
        # we return.  Bounded, so a process that refuses to die cannot
        # stall the worker a second time.
        try:
            await asyncio.wait_for(proc.wait(), timeout=1.0)
        except asyncio.TimeoutError:
            pass
        return 124, "", "timeout"

    return (
        proc.returncode if proc.returncode is not None else -1,
        stdout.decode("utf-8", "replace"),
        stderr.decode("utf-8", "replace"),
    )
def _container_for(decky_name: str, services: list[str]) -> str:
    """Return the mail container name for *decky_name*.

    Container names follow the ``<decky_name>-<service>`` convention.
    The IMAP container is chosen whenever ``"imap"`` is listed in
    *services*; every other case falls back to the POP3 container.
    """
    suffix = _IMAP_CONTAINER_SUFFIX if "imap" in services else _POP3_CONTAINER_SUFFIX
    return f"{decky_name}{suffix}"
def _parse_subject_and_body(ollama_output: str) -> tuple[str, str]:
"""Split LLM output into (subject, body).
The prompt asks for ``Subject: <subject>\\n\\n<body>``. When the
model misbehaves (e.g. wraps in markdown fences or skips the
Subject line), fall back to a generic subject and treat the whole
output as body. Never raises.
"""
text = ollama_output.strip()
# Strip code fences if the model wrapped output.
if text.startswith("```"):
nl = text.find("\n")
if nl > 0:
text = text[nl + 1:]
if text.endswith("```"):
text = text[: -3]
text = text.strip()
lines = text.splitlines()
if lines and lines[0].lower().startswith("subject:"):
subject = lines[0].split(":", 1)[1].strip()
# Drop the (possibly empty) blank line after Subject.
body_lines = lines[1:]
if body_lines and not body_lines[0].strip():
body_lines = body_lines[1:]
body = "\n".join(body_lines).strip()
if not subject:
subject = "Business Communication"
return subject, body
return "Business Communication", text
def _build_eml(
*,
sender_name: str,
sender_email: str,
recipient_name: str,
recipient_email: str,
subject: str,
body: str,
message_id: str,
in_reply_to: Optional[str],
references: str,
ts: datetime,
) -> bytes:
"""Assemble a valid plain-text RFC 2822 EML."""
msg = MIMEText(body, "plain", "utf-8")
msg["From"] = f"{sender_name} <{sender_email}>"
msg["To"] = f"{recipient_name} <{recipient_email}>"
msg["Subject"] = subject
msg["Date"] = formatdate(ts.timestamp(), localtime=False)
msg["Message-ID"] = message_id
if in_reply_to:
msg["In-Reply-To"] = in_reply_to
if references:
msg["References"] = references
msg["MIME-Version"] = "1.0"
return msg.as_bytes()
class EmailDriver(ActivityDriver):
    """Driver that turns one :class:`EmailAction` into one spooled EML.

    The LLM backend is resolved once in ``__init__`` (or injected, which
    is what tests do).  The driver does not talk to the bus or the DB:
    it returns an :class:`ActivityResult` and leaves persistence to the
    caller.
    """

    def __init__(
        self,
        *,
        llm: Optional[LLMBackend] = None,
        model: Optional[str] = None,
        spool_dir: str = _SPOOL_DIR,
    ) -> None:
        # An injected backend always wins, so tests can hand in a fake
        # without touching the environment.  Otherwise *model* is passed
        # to the backend factory, which lets the worker honour a CLI
        # ``--model`` flag without the backends knowing about the CLI.
        if llm is None:
            llm = get_llm(model=model)
        self._llm = llm
        self.spool_dir = spool_dir

    @property
    def model(self) -> str:
        """Model name reported by the backend (for telemetry / logging)."""
        return self._llm.model

    async def run(self, action: EmailAction) -> ActivityResult:
        """Generate and deliver the email described by *action*."""
        return await self._run_email(action)

    async def _run_email(self, action: EmailAction) -> ActivityResult:
        """Prompt the LLM, build the EML, and write it into the container.

        Three outcomes, distinguished by ``payload["stage"]``:

        * ``"llm"`` — generation timed out or produced nothing usable.
        * ``"delivery"`` — the EML was built but ``docker exec`` failed.
        * ``"delivered"`` — the file was written and its mtime stamped.
        """
        inputs = PromptInputs(
            sender=action.sender,
            recipient=action.recipient,
            context_hint=action.context_hint,
            parent_subject=action.subject_hint,
            parent_excerpt=action.parent_excerpt,
        )
        prompt, mannerisms_used = build_prompt(inputs)
        backend = self._llm

        try:
            generated = await backend.generate(prompt)
        except LLMTimeout as exc:
            log.warning("emailgen llm timeout model=%s: %s", backend.model, exc)
            timeout_payload = {
                "stage": "llm",
                "error": "timeout",
                "model": backend.model,
                "thread_id": action.thread_id,
            }
            return ActivityResult(success=False, payload=timeout_payload)

        gen_ms = generated.latency_ms
        if not (generated.success and generated.text.strip()):
            log.warning(
                "emailgen llm produced no usable output model=%s extra=%r",
                backend.model, generated.extra,
            )
            failure_payload = {
                "stage": "llm",
                "model": backend.model,
                "generation_ms": gen_ms,
                "thread_id": action.thread_id,
            }
            # Only the process-level diagnostics are forwarded.
            for key, value in generated.extra.items():
                if key in ("rc", "stderr"):
                    failure_payload[key] = value
            return ActivityResult(success=False, payload=failure_payload)

        subject, body = _parse_subject_and_body(generated.text)
        sender_domain = action.sender.email.split("@", 1)[1]
        message_id = new_message_id(sender_domain)
        ts = datetime.now(timezone.utc)
        eml_bytes = _build_eml(
            sender_name=action.sender.name,
            sender_email=action.sender.email,
            recipient_name=action.recipient.name,
            recipient_email=action.recipient.email,
            subject=subject,
            body=body,
            message_id=message_id,
            in_reply_to=action.parent_message_id,
            references=action.references,
            ts=ts,
        )

        # Target path: <spool>/<thread_id>/<message-id-derived-name>.eml.
        # One sub-directory per thread keeps the spool listing readable.
        spool_root = self.spool_dir.rstrip("/")
        eml_dir = f"{spool_root}/{action.thread_id}"
        eml_name = message_id.strip("<>").replace("@", "_at_") + ".eml"
        eml_path = f"{eml_dir}/{eml_name}"
        container = _container_for(
            action.mail_decky_name, list(action.mail_decky_services),
        )

        # The file's timestamps are set from the same formatdate() string
        # used for the Date: header, so header and mtime agree.
        date_header = formatdate(ts.timestamp(), localtime=False)
        quoted_path = shlex.quote(eml_path)
        sh_cmd = " && ".join((
            f"mkdir -p {shlex.quote(eml_dir)}",
            f"tee {quoted_path} >/dev/null",
            f"touch -d {shlex.quote(date_header)} {quoted_path}",
        ))
        # The EML itself travels over stdin, never through argv.
        exec_argv = [_DOCKER, "exec", "-i", container, "sh", "-c", sh_cmd]
        rc, _out, err = await _run_capture(
            exec_argv, stdin_data=eml_bytes, timeout=_DOCKER_TIMEOUT,
        )

        delivered = rc == 0
        payload: dict[str, Any] = {
            "stage": "delivered" if delivered else "delivery",
            "model": self.model,
            "generation_ms": gen_ms,
            "bytes": len(eml_bytes),
            "thread_id": action.thread_id,
            "message_id": message_id,
            "subject": subject,
            "language": action.sender.language or "en",
            "mannerisms_used": mannerisms_used,
            "is_reply": action.is_reply,
            "container": container,
            "eml_path": eml_path,
            "rc": rc,
            "stderr": None if delivered else err.strip()[:256],
        }
        if not delivered:
            log.warning(
                "emailgen delivery failed container=%s rc=%d stderr=%r",
                container, rc, err[:200],
            )
        return ActivityResult(success=delivered, payload=payload)

View File

@@ -0,0 +1,293 @@
"""MVP SSH-flavoured driver.
Two action shapes:
* :class:`~decnet.orchestrator.scheduler.TrafficAction` — exec a tiny
Python one-liner *inside the source decky's ssh container* that opens
TCP/22 against the destination decky's IP and reads the SSH banner.
This generates real on-the-wire SSH-protocol traffic between the two
containers (sshd announces the banner on connect), without us having
to ship credentials anywhere.
* :class:`~decnet.orchestrator.scheduler.FileAction` — drop / refresh a
file inside the destination decky's ssh container via ``docker exec``.
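Both spawn ``docker exec`` via :func:`asyncio.create_subprocess_exec`
with argv lists, so a decky name or IP is never interpolated into a
host-side shell. File writes do run a short ``sh -c`` pipeline inside the
container; every path and timestamp in it goes through :func:`shlex.quote`.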
"""
from __future__ import annotations
import asyncio
import shlex
from typing import Any
import base64
from datetime import datetime, timezone
from decnet.logging import get_logger
from decnet.orchestrator.drivers.base import ActivityDriver, ActivityResult
from decnet.orchestrator.scheduler import (
Action,
EditAction,
FileAction,
TrafficAction,
)
log = get_logger("orchestrator.ssh")
_DOCKER = "docker"
# Per-call wall-clock cap. The orchestrator runs serially (one action
# per tick); a wedged docker exec must not stall the whole worker.
_TIMEOUT = 8.0
# Container suffix convention: services/*.py emit container_name as
# ``<decky_name>-<service>``. The MVP only drives the ssh service.
_SSH_CONTAINER_SUFFIX = "-ssh"
def _container_for(decky_name: str) -> str:
    """Return the ssh container name for *decky_name* (``<name>-ssh``)."""
    return "{}{}".format(decky_name, _SSH_CONTAINER_SUFFIX)
async def _run(argv: list[str]) -> tuple[int, str, str]:
    """Run *argv* without stdin and return ``(rc, stdout, stderr)``.

    Thin wrapper over :func:`_run_with_stdin`.  Wall-clock expiry is
    reported as ``(rc=124, "", "timeout")``.  Never raises: success or
    failure is carried in the returned tuple, not in an exception.
    """
    outcome = await _run_with_stdin(argv, None)
    return outcome
async def _run_with_stdin(
    argv: list[str], stdin_bytes: bytes | None,
) -> tuple[int, str, str]:
    """Spawn *argv*, optionally feeding *stdin_bytes*, and capture rc+output.

    Used by :meth:`SSHDriver.plant_file` to stream base64 payloads via
    stdin, which keeps large blobs out of argv (ARG_MAX).

    Returns ``(rc, stdout, stderr)`` with both streams decoded as UTF-8
    (undecodable bytes replaced).  Never raises; failures map to
    shell-style return codes:

    * ``127`` — the executable in ``argv[0]`` does not exist.
    * ``126`` — the executable exists but could not be started
      (any other :class:`OSError`, e.g. permission denied).
    * ``124`` — the process outlived ``_TIMEOUT`` and was killed.
    * ``-1`` — the process finished without reporting a return code.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdin=asyncio.subprocess.PIPE if stdin_bytes is not None else None,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as exc:
        return 127, "", f"argv[0] not found: {exc}"
    except OSError as exc:
        # Keeps the "never raises" promise for spawn failures other than
        # a missing binary (EACCES, ENOEXEC, EMFILE, ...).
        return 126, "", f"argv[0] could not be started: {exc}"

    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(stdin_bytes), timeout=_TIMEOUT,
        )
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            # Already gone between the timeout and the kill.
            pass
        # Reap the child so its transport and pipes are released before
        # we return.  Bounded, so a process that refuses to die cannot
        # stall the serial orchestrator loop a second time.
        try:
            await asyncio.wait_for(proc.wait(), timeout=1.0)
        except asyncio.TimeoutError:
            pass
        return 124, "", "timeout"

    return (
        proc.returncode if proc.returncode is not None else -1,
        stdout.decode("utf-8", "replace"),
        stderr.decode("utf-8", "replace"),
    )
# Python one-liner that probes the destination's SSH banner. Kept inline
# so the driver has zero filesystem dependencies on the host side; the
# *container* needs python3 (ssh service template ships it).
_PROBE_PY = (
"import socket,sys;"
"s=socket.socket();s.settimeout(3);"
"s.connect((sys.argv[1], 22));"
"b=s.recv(128);s.close();"
"sys.stdout.write(b.decode('latin1','replace'))"
)
class SSHDriver(ActivityDriver):
    """Concrete :class:`ActivityDriver` for SSH-flavoured actions.

    Every operation is a ``docker exec`` against the decky's
    ``<decky_name>-ssh`` container.  Failures of the exec itself are
    reported through :class:`ActivityResult` (``success=False`` plus
    ``rc`` / ``stderr`` in the payload); only :meth:`run` on an unknown
    action type and :meth:`read_file` raise.
    """

    async def run(self, action: Action) -> ActivityResult:
        """Dispatch *action* to the handler for its concrete type.

        Raises:
            TypeError: *action* is not a traffic, file, or edit action.
        """
        if isinstance(action, TrafficAction):
            return await self._run_traffic(action)
        if isinstance(action, FileAction):
            return await self._run_file(action)
        if isinstance(action, EditAction):
            return await self._run_edit(action)
        raise TypeError(f"unsupported action type: {type(action)!r}")

    async def _run_traffic(self, action: TrafficAction) -> ActivityResult:
        """Probe the destination's SSH banner from inside the source decky.

        Success requires both a zero exit code and output that starts
        with ``SSH-``.  The payload carries the banner on success and
        the truncated stderr on failure, never both.
        """
        container = _container_for(action.src_name)
        argv = [
            _DOCKER, "exec", container,
            "python3", "-c", _PROBE_PY, action.dst_ip,
        ]
        rc, stdout, stderr = await _run(argv)
        success = rc == 0 and stdout.startswith("SSH-")
        payload: dict[str, Any] = {
            "src_decky": action.src_name,
            "dst_decky": action.dst_name,
            "dst_ip": action.dst_ip,
            "dst_port": 22,
            "rc": rc,
            "banner": stdout.strip()[:128] if success else None,
            "stderr": stderr.strip()[:256] if not success else None,
        }
        if not success:
            log.debug(
                "orchestrator.ssh.traffic failed src=%s dst=%s rc=%d stderr=%r",
                action.src_name, action.dst_name, rc, stderr[:120],
            )
        return ActivityResult(success=success, payload=payload)

    async def _run_edit(self, action: EditAction) -> ActivityResult:
        """Mutate an existing synthetic file in place.

        The next iteration is computed from ``action.previous_body``
        rather than re-read from the container, so the body that gets
        mutated is the one the planner saw when it picked the action.

        Returns a failed result (without touching the container) when
        ``action.content_class`` is not a known content class or when
        the realism engine raises :class:`KeyError` for it.
        """
        from decnet.realism.bodies import next_iteration as _next_iteration
        from decnet.realism.taxonomy import ContentClass

        try:
            cls = ContentClass(action.content_class)
        except ValueError:
            return ActivityResult(
                success=False,
                payload={
                    "dst_decky": action.dst_name,
                    "path": action.path,
                    "error": f"unknown content_class: {action.content_class!r}",
                },
            )
        try:
            new_body = _next_iteration(
                cls, action.persona, action.previous_body,
            )
        except KeyError:
            return ActivityResult(
                success=False,
                payload={
                    "dst_decky": action.dst_name,
                    "path": action.path,
                    "error": (
                        f"content_class={cls!s} does not support edits"
                    ),
                },
            )

        encoded_body = new_body.encode("utf-8")
        result = await self.plant_file(
            action.dst_name,
            action.path,
            encoded_body,
            mode=0o644,
            mtime=action.mtime,
        )
        # Carry edit-specific metadata through to the orchestrator
        # event payload so downstream consumers see what actually landed.
        if result.success:
            result.payload["new_body"] = new_body
            result.payload["new_body_bytes"] = len(encoded_body)
            result.payload["synthetic_file_uuid"] = action.synthetic_file_uuid
        return result

    async def _run_file(self, action: FileAction) -> ActivityResult:
        """Drop or refresh the file described by *action*.

        ``action.content_bytes`` wins when set, so binary artifacts keep
        their exact bytes; otherwise ``action.content`` is UTF-8
        encoded.  ``action.mtime`` is forwarded so the file is not
        stamped at wall-clock now.
        """
        body = action.content_bytes
        if body is None:
            body = action.content.encode("utf-8")
        return await self.plant_file(
            action.dst_name,
            action.path,
            body,
            mode=0o644,
            mtime=action.mtime,
        )

    async def plant_file(
        self,
        decky_name: str,
        path: str,
        content: bytes,
        *,
        mode: int = 0o600,
        mtime: datetime | None = None,
    ) -> ActivityResult:
        """Write *content* to *path* inside *decky_name*'s ssh container.

        The content is base64-encoded and streamed over stdin, then
        decoded in the container, so large or binary payloads never
        appear in argv.  After the write the file is chmod-ed to *mode*
        and touched: to *mtime* (rendered in UTC) when given, otherwise
        to the container's current time.

        Returns an :class:`ActivityResult` whose payload holds the
        target, byte count, exit code, and (on failure only) the
        truncated stderr.
        """
        container = _container_for(decky_name)
        b64 = base64.b64encode(content).decode("ascii")
        quoted_path = shlex.quote(path)
        # Always emit UTC so the container's local TZ cannot shift the
        # resulting mtime.
        if mtime is not None:
            ts = mtime.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
            touch_cmd = f"touch -d {shlex.quote(ts)} {quoted_path}"
        else:
            touch_cmd = f"touch {quoted_path}"
        sh_cmd = (
            f"mkdir -p {shlex.quote(_dirname(path))} && "
            f"base64 -d > {quoted_path} && "
            f"chmod {mode:o} {quoted_path} && "
            f"{touch_cmd}"
        )
        argv = [_DOCKER, "exec", "-i", container, "sh", "-c", sh_cmd]
        rc, _stdout, stderr = await _run_with_stdin(argv, b64.encode("ascii"))
        success = rc == 0
        payload: dict[str, Any] = {
            "dst_decky": decky_name,
            "path": path,
            "bytes": len(content),
            "rc": rc,
            "stderr": stderr.strip()[:256] if not success else None,
        }
        return ActivityResult(success=success, payload=payload)

    async def read_file(self, decky_name: str, path: str) -> bytes:
        """Read *path* from inside *decky_name*'s ssh container.

        The file is base64-encoded inside the container and decoded
        here — the mirror image of :meth:`plant_file`.  Capturing raw
        ``cat`` output is not an option because the subprocess helpers
        decode stdout as UTF-8 with replacement, which would silently
        corrupt any non-UTF-8 content.

        Raises:
            FileNotFoundError: the container reports ``No such file``.
            RuntimeError: any other ``docker exec`` failure, or output
                that is not valid base64.
        """
        container = _container_for(decky_name)
        argv = [_DOCKER, "exec", container, "base64", path]
        rc, stdout, stderr = await _run(argv)
        if rc == 0:
            try:
                # Line wrapping added by ``base64`` is ignored by the
                # decoder's default (non-validating) mode.
                return base64.b64decode(stdout)
            except ValueError as exc:  # binascii.Error is a ValueError
                raise RuntimeError(
                    f"docker exec base64 produced undecodable output: {exc}"
                ) from exc
        if "no such file" in stderr.lower():
            raise FileNotFoundError(f"{path} not present in {decky_name}")
        raise RuntimeError(
            f"docker exec base64 failed rc={rc} stderr={stderr.strip()[:256]!r}"
        )
def _dirname(path: str) -> str:
"""Pure-string dirname. We can't trust ``os.path.dirname`` on the
host to share the destination container's separator semantics, but
deckies are POSIX so a plain ``rfind('/')`` suffices."""
idx = path.rfind("/")
if idx <= 0:
return "/"
return path[:idx]

View File

@@ -0,0 +1,20 @@
"""Emailgen — email-specific delivery, scheduling, and threading.
After stage 5 of the realism migration, ``emailgen`` is no longer a
separate worker / systemd unit / CLI subcommand. It exposes:
* :mod:`decnet.orchestrator.emailgen.scheduler` — the
``EmailAction`` shape and the ``pick(repo)`` policy that decides
which mail decky / sender / recipient / thread an email belongs to.
* :mod:`decnet.orchestrator.emailgen.threads` — RFC 2822 thread chain
helpers (Message-ID generation, Re: / In-Reply-To bookkeeping).
* :mod:`decnet.orchestrator.emailgen.events` — DB-row + bus-topic
builders for email events.
The orchestrator's main worker (:mod:`decnet.orchestrator.worker`)
calls into these modules per tick. LLM glue, persona schema, prompt
builder, and the global persona pool moved to :mod:`decnet.realism`
in stage 2 of the migration; this package keeps only the
email-specific delivery surface.
"""
from __future__ import annotations

View File

@@ -0,0 +1,49 @@
"""DB-row + bus-topic helpers for the emailgen worker.
Mirror of :mod:`decnet.orchestrator.events` for the email action class.
Kept in its own module so the SSH-flavoured orchestrator and the
emailgen worker don't accumulate cross-imports of each other's action
types.
"""
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
from decnet.bus import topics as _topics
from decnet.orchestrator.drivers.base import ActivityResult
from decnet.orchestrator.emailgen.scheduler import EmailAction
def to_row(action: EmailAction, result: ActivityResult) -> dict[str, Any]:
    """Build the keyword arguments for ``OrchestratorEmail(**...)``.

    ``message_id``, ``subject``, ``language`` and ``eml_path`` are read
    from the driver's payload rather than from the action, because the
    driver generates them after the LLM call.  Missing payload keys
    fall back to ``""`` (or, for ``language``, to the sender's language
    and then ``"en"``).  ``ts`` is the current UTC time.
    """
    payload = result.payload or {}
    sender = action.sender
    fallback_language = sender.language or "en"

    row: dict[str, Any] = {"ts": datetime.now(timezone.utc)}
    row["mail_decky_uuid"] = action.mail_decky_uuid
    row["thread_id"] = action.thread_id
    row["message_id"] = payload.get("message_id", "")
    row["in_reply_to"] = action.parent_message_id
    row["sender_email"] = sender.email
    row["recipient_email"] = action.recipient.email
    row["subject"] = payload.get("subject", "")
    row["language"] = payload.get("language", fallback_language)
    row["eml_path"] = payload.get("eml_path", "")
    row["success"] = result.success
    row["payload"] = payload  # repo serialises dict→json
    return row
def topic_for(action: EmailAction) -> str:
    """Return the bus topic for *action*, keyed by its mail decky UUID."""
    decky_uuid = action.mail_decky_uuid
    return _topics.orchestrator(_topics.ORCHESTRATOR_EMAIL, decky_uuid)
def event_type_for(action: EmailAction) -> str:  # noqa: ARG001 — symmetry
    """Return the event type for an email action.

    *action* is accepted but not inspected: every email action maps to
    the same event type.
    """
    event_type = _topics.ORCHESTRATOR_EMAIL
    return event_type

View File

@@ -0,0 +1,255 @@
"""Action picker for the emailgen worker.
One tick = one (mail-decky, sender, recipient, [thread]) decision.
Scope (v1):
- Any running decky returned by ``list_running_deckies`` is an eligible
mail host: topology deckies, fleet deckies, and SWARM shards alike.
- Mail decky = a running decky whose ``services`` includes ``imap`` or
``pop3``.
- Personas for topology deckies come from ``Topology.email_personas``
(JSON list of :class:`EmailPersona`); the topology-wide
``language_default`` fills in any persona that didn't set its own.
Fleet and shard deckies share the pool loaded by
:mod:`decnet.realism.personas_pool`.
Returns ``None`` (skip tick) when:
- no running mail decky,
- the mail decky's topology has fewer than two valid personas,
- nobody is in their ``active_hours`` window right now.
"""
from __future__ import annotations
import secrets
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Optional
from decnet.logging import get_logger
from decnet.orchestrator.emailgen.threads import (
ThreadChain,
new_thread_id,
references_for_reply,
reply_subject,
)
from decnet.realism import personas_pool as global_pool
from decnet.realism.personas import (
EmailPersona,
in_active_hours,
parse_personas,
)
logger = get_logger("orchestrator.emailgen")
_MAIL_SERVICES = ("imap", "pop3")
# Probability of replying on an existing thread when one exists. The
# inverse starts a fresh thread. 0.6 mirrors what mailbox studies find
# for active corporate inboxes — most messages are replies, but not
# overwhelmingly so.
_REPLY_PROBABILITY = 0.6
# Generic context hints fed to the LLM when starting a new thread.
# Deliberately broad — the persona's tone + role is what shapes the
# email; the hint just gives the model a topic to riff on.
_CONTEXT_HINTS: tuple[str, ...] = (
"Q3 budget review and approval",
"Client presentation feedback",
"Project deadline extension request",
"Team building event planning",
"IT system maintenance notification",
"Quarterly performance review",
"Vendor onboarding process",
"Holiday schedule announcement",
"Training session invitation",
"Department restructuring update",
"Client contract negotiation",
"Security audit findings",
"Sales strategy meeting",
"Product launch timeline",
"Office relocation update",
"Travel reimbursement policy change",
)
@dataclass(frozen=True)
class EmailAction:
    """Immutable description of the one email an emailgen tick will send.

    ``thread_id`` identifies the conversation the message belongs to.
    ``parent_message_id`` and ``references`` feed the EML's
    ``In-Reply-To`` and ``References`` headers.

    ``mail_decky_name`` and ``mail_decky_services`` are denormalised
    onto the action so the driver can resolve the container name
    without a second repo round-trip.
    """

    # Where the mail lands.
    mail_decky_uuid: str
    mail_decky_name: str
    mail_decky_services: tuple[str, ...]
    # Who talks to whom.
    sender: EmailPersona
    recipient: EmailPersona
    # Threading.
    thread_id: str
    parent_message_id: Optional[str]
    references: str
    # Prompt inputs.
    subject_hint: Optional[str]  # used as parent subject when replying
    parent_excerpt: Optional[str]  # excerpt from the parent body
    context_hint: str  # only meaningful on new threads
    is_reply: bool
    description: str = "email:send"
def _is_mail_decky(decky: dict[str, Any]) -> bool:
    """Tell whether *decky* lists at least one mail service.

    A missing or empty ``services`` entry means "no".  A plain string is
    rejected outright so that ``in`` cannot degrade into a substring
    match.
    """
    offered = decky.get("services") or []
    if isinstance(offered, str):
        return False
    for wanted in _MAIL_SERVICES:
        if wanted in offered:
            return True
    return False
async def _resolve_personas(
    repo: Any, mail_decky: dict[str, Any],
) -> tuple[list[EmailPersona], str]:
    """Load the personas that apply to *mail_decky*.

    Returns ``(personas, source_label)``.  The label is the decky row's
    ``source`` value (``"unknown"`` when absent) and is returned so the
    caller's log lines can say why a tick was skipped.

    * ``source == "topology"`` — personas are parsed from the parent
      topology's ``email_personas``, using its ``language_default``
      (``"en"`` when unset).  A missing ``topology_id`` or an unknown
      topology yields an empty list.
    * any other source — the shared pool from :mod:`global_pool`.
    """
    source = mail_decky.get("source") or "unknown"
    if source != "topology":
        # Fleet / shard / anything else → global pool.
        return global_pool.load(), source

    topology_id = mail_decky.get("topology_id")
    if not topology_id:
        return [], source

    topology = await repo.get_topology(topology_id)
    if not topology:
        return [], source

    personas = parse_personas(
        topology.get("email_personas"),
        language_default=topology.get("language_default") or "en",
    )
    return personas, source
async def pick(
    repo: Any,
    *,
    rand: Optional[secrets.SystemRandom] = None,
    now: Optional[datetime] = None,
) -> Optional[EmailAction]:
    """Pick one email action against any running mail decky.

    Mail-decky discovery uses the **union view** (``list_running_deckies``):
    MazeNET topology deckies, unihost fleet deckies, and SWARM shards are
    all eligible. Persona source is per-decky-source; see
    :func:`_resolve_personas`.

    Args:
        repo: repository exposing ``list_running_deckies`` plus whatever
            :func:`_resolve_personas` / :func:`_maybe_pick_chain` call.
        rand: random source; a fresh ``secrets.SystemRandom`` when omitted.
        now: wall-clock used for ``active_hours`` filtering — injected so
            tests can pin the hour deterministically.

    Returns:
        An :class:`EmailAction` (reply when an open thread was chosen,
        otherwise a new thread), or ``None`` when there is no running
        mail decky, fewer than two personas (overall or in-hours), or no
        in-hours persona with an address different from the sender's.
    """
    rng = rand or secrets.SystemRandom()
    # NOTE(review): naive local time here, while the file planner in the
    # orchestrator scheduler uses UTC — confirm which clock active_hours
    # is meant to be evaluated against.
    now_dt = now or datetime.now()

    deckies = await repo.list_running_deckies()
    mail_deckies = [d for d in deckies if _is_mail_decky(d)]
    if not mail_deckies:
        logger.debug("emailgen pick: no running mail decky")
        return None
    mail_decky = rng.choice(mail_deckies)

    personas, source = await _resolve_personas(repo, mail_decky)
    if len(personas) < 2:
        logger.debug(
            "emailgen pick: source=%s mail_decky=%s only %d personas; need >=2",
            source, mail_decky.get("uuid"), len(personas),
        )
        return None

    active = [p for p in personas if in_active_hours(p, now_dt.hour)]
    if len(active) < 2:
        logger.debug(
            "emailgen pick: source=%s mail_decky=%s only %d personas in-hours",
            source, mail_decky.get("uuid"), len(active),
        )
        return None

    sender = rng.choice(active)
    # Two in-hours personas do not guarantee two distinct addresses
    # (duplicate entries in a pool); choosing from an empty list would
    # raise IndexError, so skip the tick instead.
    recipients = [p for p in active if p.email != sender.email]
    if not recipients:
        logger.debug(
            "emailgen pick: source=%s mail_decky=%s no in-hours recipient "
            "distinct from sender",
            source, mail_decky.get("uuid"),
        )
        return None
    recipient = rng.choice(recipients)

    # Look up open threads between this pair on this mail decky.
    chain = await _maybe_pick_chain(
        repo, mail_decky["uuid"], sender, recipient, rng=rng,
    )
    services = tuple(mail_decky.get("services") or ())
    decky_name = mail_decky.get("name") or ""

    if chain is not None:
        return EmailAction(
            mail_decky_uuid=mail_decky["uuid"],
            mail_decky_name=decky_name,
            mail_decky_services=services,
            sender=sender,
            recipient=recipient,
            thread_id=chain.thread_id,
            parent_message_id=chain.parent_message_id,
            references=references_for_reply(chain),
            subject_hint=chain.parent_subject,
            parent_excerpt=None,  # repo can populate later if useful
            context_hint=chain.parent_subject,
            is_reply=True,
        )

    return EmailAction(
        mail_decky_uuid=mail_decky["uuid"],
        mail_decky_name=decky_name,
        mail_decky_services=services,
        sender=sender,
        recipient=recipient,
        thread_id=new_thread_id(),
        parent_message_id=None,
        references="",
        subject_hint=None,
        parent_excerpt=None,
        context_hint=rng.choice(_CONTEXT_HINTS),
        is_reply=False,
    )
async def _maybe_pick_chain(
    repo: Any,
    mail_decky_uuid: str,
    sender: EmailPersona,
    recipient: EmailPersona,
    *,
    rng: secrets.SystemRandom,
) -> Optional[ThreadChain]:
    """Roll for a reply; on success return the pair's first listed thread.

    Returns ``None`` when the roll misses (``rng.random()`` is at or
    above ``_REPLY_PROBABILITY``) or when the repo lists no threads
    between *sender* and *recipient* on this mail decky. The repo is
    only queried after a winning roll.
    """
    if rng.random() < _REPLY_PROBABILITY:
        open_threads = await repo.list_orchestrator_email_threads(
            mail_decky_uuid, sender.email, recipient.email, limit=20,
        )
        if open_threads:
            latest = open_threads[0]
            return ThreadChain(
                thread_id=latest["thread_id"],
                parent_message_id=latest["message_id"],
                # The full ancestry is not rebuilt from row history here;
                # v1 ships a single-step chain (the parent only), which
                # the original author judged sufficient for mail clients
                # that group by Subject + In-Reply-To.
                references=(),
                parent_subject=reply_subject(latest["subject"]),
            )
    return None

View File

@@ -0,0 +1,75 @@
"""RFC 2822 thread-chain bookkeeping.
A thread is a worker-side UUID that groups one or more emails between
the same two personas. ``In-Reply-To`` carries the immediate parent's
``Message-ID``; ``References`` carries the full ancestry chain.
On each email pick the emailgen scheduler may (with a fixed probability)
query the repository for recent threads between (sender, recipient); if it
finds one, it emits a reply that continues the first thread returned.
Otherwise it starts a new thread.
"""
from __future__ import annotations
import uuid
from dataclasses import dataclass
from typing import Optional
@dataclass(frozen=True)
class ThreadChain:
    """Immutable view of a thread's chain at a point in time.

    ``thread_id`` is opaque (a UUID string, see :func:`new_thread_id`).
    ``parent_message_id`` is the most recent message in the chain — the
    new reply's ``In-Reply-To`` field.
    ``references`` is the ancestry as a tuple of Message-IDs,
    oldest-first; :func:`references_for_reply` appends the parent and
    joins the ids with single spaces to form the ``References:`` header
    (RFC 2822 §3.6.4).
    ``parent_subject`` carries the subject used when replying.
    NOTE(review): the emailgen scheduler stores the output of
    :func:`reply_subject` here, i.e. a value that already starts with
    ``Re:`` — confirm consumers do not prefix it again.
    """

    thread_id: str
    parent_message_id: str
    references: tuple[str, ...]
    parent_subject: str
def new_thread_id() -> str:
    """Return a fresh random (version 4) UUID in canonical string form."""
    return f"{uuid.uuid4()}"
def reply_subject(parent_subject: str) -> str:
    """Return *parent_subject* carrying exactly one leading ``Re:``.

    Any run of existing ``Re:`` prefixes (case-insensitive, with or
    without whitespace between them) is stripped first, so
    ``Re: RE: re: x`` collapses to ``Re: x`` instead of growing with
    every reply.
    """
    bare = parent_subject.strip()
    while bare.lower().startswith("re:"):
        # Drop the 3-character prefix plus whatever padding follows it.
        bare = bare[3:].lstrip()
    return "Re: " + bare
def references_for_reply(chain: Optional[ThreadChain]) -> str:
    """Render the ``References:`` header value for a reply to *chain*.

    The result is the chain's stored references followed by the parent
    Message-ID, separated by single spaces. A ``None`` chain (a root
    message with nothing to reference) yields the empty string.
    """
    if chain is None:
        return ""
    return " ".join([*chain.references, chain.parent_message_id])
def new_message_id(domain: str) -> str:
    """Mint a ``Message-ID`` value, angle brackets included.

    The local part is 32 random hex characters (a UUID4); the right-hand
    side is *domain* with surrounding whitespace removed, or
    ``localhost`` when nothing is left. Callers pass the sender's email
    domain so generated mail does not share one tell-tale host part.
    """
    host = domain.strip()
    if not host:
        host = "localhost"
    return "<" + uuid.uuid4().hex + "@" + host + ">"

View File

@@ -0,0 +1,68 @@
"""DB-row + bus-topic helpers for the orchestrator."""
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
from decnet.bus import topics as _topics
from decnet.orchestrator.drivers.base import ActivityResult
from decnet.orchestrator.scheduler import (
Action,
EditAction,
FileAction,
TrafficAction,
)
def to_row(action: Action, result: ActivityResult) -> dict[str, Any]:
    """Assemble the keyword arguments for ``OrchestratorEvent(**...)``.

    Every row carries ``ts`` (current UTC time), ``protocol`` (always
    ``"ssh"``), and ``success`` / ``payload`` copied from *result*. The
    action type then decides ``kind``, ``action`` and the decky UUIDs.

    Raises:
        TypeError: *action* is not a traffic, file or edit action.
    """
    row: dict[str, Any] = {
        "ts": datetime.now(timezone.utc),
        "protocol": "ssh",
        "success": result.success,
        "payload": result.payload,  # repo serialises dict→json
    }

    if isinstance(action, TrafficAction):
        specifics = {
            "kind": "traffic",
            "action": f"exec:{action.description}",
            "src_decky_uuid": action.src_uuid,
            "dst_decky_uuid": action.dst_uuid,
        }
    elif isinstance(action, (FileAction, EditAction)):
        # Creates and edits share the "file" kind; the description
        # ("file:create" vs "file:edit" by default) is what lets
        # queries tell them apart.
        specifics = {
            "kind": "file",
            "action": action.description,
            "src_decky_uuid": None,
            "dst_decky_uuid": action.dst_uuid,
        }
    else:
        raise TypeError(f"unsupported action type: {type(action)!r}")

    row.update(specifics)
    return row
def topic_for(action: Action) -> str:
    """Return the bus topic for *action*, scoped to its destination decky.

    Traffic actions use the traffic topic family; file creates and
    edits share the file family.

    Raises:
        TypeError: *action* is not a traffic, file or edit action.
    """
    if isinstance(action, TrafficAction):
        family = _topics.ORCHESTRATOR_TRAFFIC
    elif isinstance(action, (FileAction, EditAction)):
        family = _topics.ORCHESTRATOR_FILE
    else:
        raise TypeError(f"unsupported action type: {type(action)!r}")
    return _topics.orchestrator(family, action.dst_uuid)
def event_type_for(action: Action) -> str:
    """Return the bus event-type constant that matches *action*.

    Raises:
        TypeError: *action* is not a traffic, file or edit action.
    """
    if isinstance(action, TrafficAction):
        event_type = _topics.ORCHESTRATOR_TRAFFIC
    elif isinstance(action, (FileAction, EditAction)):
        event_type = _topics.ORCHESTRATOR_FILE
    else:
        raise TypeError(f"unsupported action type: {type(action)!r}")
    return event_type

View File

@@ -0,0 +1,340 @@
"""Action picker for the orchestrator.
Stage-3 realism: file actions are sourced from
:func:`decnet.realism.planner.pick`, not the old hardcoded
``_FILE_TEMPLATES``/``_USERS`` constants. Persona resolution per
decky still belongs here (the realism planner is pure of
:class:`~decnet.web.db.repository.BaseRepository` knowledge) — we
walk each decky to either ``Topology.email_personas`` or the
``decnet.realism.personas_pool`` global pool, depending on
``decky["source"]``, then hand the resolved set to the planner.
TrafficAction stays untouched: still a flat random pair-pick of
SSH-capable deckies. Email actions land in stage 5 of the realism
migration when the emailgen worker collapses into the orchestrator.
"""
from __future__ import annotations
import json
import secrets
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Optional, Sequence
from decnet.realism import personas_pool
from decnet.realism.personas import EmailPersona, parse_personas
from decnet.realism.planner import pick as _realism_pick
from decnet.realism.taxonomy import ContentClass, Plan
@dataclass(frozen=True)
class TrafficAction:
    """One inter-decky traffic request: *src* connects to *dst*.

    Built by :func:`pick` from two distinct SSH-capable deckies that
    both have an ``ip``.
    """

    # Source decky (the side that initiates the connection).
    src_uuid: str
    src_name: str
    # Destination decky and the address the source should reach it on.
    dst_uuid: str
    dst_name: str
    dst_ip: str
    protocol: str = "ssh"
    # Short label for what the driver does; the events helper records it
    # as ``exec:<description>``.
    description: str = "tcp_connect:22"
@dataclass(frozen=True)
class FileAction:
    """One file plant request the SSH driver materialises.

    Stage-3 realism: ``persona`` / ``content_class`` / ``mtime`` are
    populated when the action came through :func:`pick_file`. Older
    direct constructions (tests, manual operator drives) leave them
    at the defaults — back-compat for the pre-realism call sites
    that haven't migrated yet.
    """

    # Decky the file is planted on.
    dst_uuid: str
    dst_name: str
    # Destination path on the decky and the text body to write there.
    path: str
    content: str
    # Name of the persona the file is attributed to ("" when unknown).
    persona: str = ""
    # String value of a ``ContentClass`` member; defaults to NOTE.
    content_class: str = ContentClass.NOTE.value
    # Modification time chosen by the planner, when there is one.
    mtime: Optional[datetime] = None
    description: str = "file:create"
    # Canary artifacts (DOCX/PDF/honeydoc binaries) carry their bytes
    # here so re-encoding ``content`` from utf-8 doesn't mangle them.
    # When set, the SSH driver uses these bytes directly and ignores
    # ``content``.
    content_bytes: Optional[bytes] = None
@dataclass(frozen=True)
class EditAction:
    """Read-modify-write of an existing synthetic file.

    Stage 3b of the realism migration: a previously-planted ``TODO.md``
    gets a checkbox flipped, a notes file gets a new line appended, a
    cron log gets a fresh entry tacked on. ``synthetic_file_uuid`` is
    the row in ``synthetic_files`` to update; ``previous_body`` is
    what the planner already saw so the driver doesn't double-fetch.
    """

    # Decky that holds the file being edited.
    dst_uuid: str
    dst_name: str
    # Path of the existing file on that decky.
    path: str
    # Persona name and ContentClass string value taken from the plan.
    persona: str
    content_class: str
    # Body as the planner saw it ("" when the plan carried none).
    previous_body: str
    # UUID of the ``synthetic_files`` row; "" when the edit candidate
    # had no uuid, in which case the worker skips the row update.
    synthetic_file_uuid: str
    mtime: Optional[datetime] = None
    description: str = "file:edit"
# Union of the action shapes this module produces. The orchestrator's
# event helpers annotate their parameters with it and dispatch on the
# concrete type via isinstance.
Action = TrafficAction | FileAction | EditAction
def _has_ssh(decky: dict[str, Any]) -> bool:
    """Tell whether *decky* lists ``ssh`` among its services.

    A ``services`` value that is still a string was never deserialised,
    so the answer is "unknown" and reported as ``False`` instead of
    substring-matching the raw text.
    """
    services = decky.get("services") or []
    return not isinstance(services, str) and "ssh" in services
def pick(
    deckies: Sequence[dict[str, Any]],
    *,
    rand: Optional[secrets.SystemRandom] = None,
) -> Optional[Action]:
    """Pick one *traffic* action against the given decky set.

    Eligible deckies are those that list ``ssh`` in their services and
    have a truthy ``ip``. Two distinct deckies are sampled; the first
    becomes the source and the second the destination.

    File actions are produced by :func:`pick_file` (async — needs the
    repo for persona resolution). Which kind runs on a given tick is
    decided by the orchestrator worker's weighted action-kind roll
    (traffic / file / email), not by this function.

    Args:
        deckies: running-decky rows (dicts with ``uuid``, ``name``,
            ``ip``, ``services``).
        rand: random source; a fresh ``secrets.SystemRandom`` when omitted.

    Returns:
        A :class:`TrafficAction`, or ``None`` when fewer than two
        SSH-capable deckies with an IP are available.
    """
    rng = rand or secrets.SystemRandom()
    ssh_deckies = [d for d in deckies if _has_ssh(d) and d.get("ip")]
    if len(ssh_deckies) < 2:
        return None
    src, dst = rng.sample(ssh_deckies, 2)
    return TrafficAction(
        src_uuid=src["uuid"],
        src_name=src["name"],
        dst_uuid=dst["uuid"],
        dst_name=dst["name"],
        dst_ip=dst["ip"],
    )
async def pick_file(
    deckies: Sequence[dict[str, Any]],
    repo: Any,
    *,
    now: Optional[datetime] = None,
    rand: Optional[secrets.SystemRandom] = None,
    llm: Any = None,
    llm_breaker: Any = None,
    llm_timeout: float = 60.0,
) -> Optional[Action]:
    """Realism-driven file action — create or edit.

    Resolves personas per decky (topology pool when the decky has a
    parent topology; global pool otherwise), optionally fetches an edit
    candidate from the synthetic_files table, and asks
    :func:`decnet.realism.planner.pick` to choose what to do. Maps the
    resulting :class:`Plan` to a :class:`FileAction` (create / canary)
    or :class:`EditAction` (edit) the SSH driver can dispatch.

    Args:
        deckies: running-decky rows; only SSH-capable ones with a
            non-empty persona pool are considered.
        repo: repository used for persona resolution, the edit-candidate
            lookup and (through the canary cultivator) canary persistence.
        now: planning timestamp; defaults to the current UTC time.
        rand: random source; a fresh ``secrets.SystemRandom`` when omitted.
        llm: optional LLM client; when set, user-class create bodies are
            requested from ``make_body_with_llm``.
        llm_breaker: circuit breaker handed through to the LLM helper.
        llm_timeout: timeout in seconds handed through to the LLM helper.

    Returns:
        ``None`` when no decky ends up with a usable persona pool or
        when the planner returns no plan; otherwise the mapped action.
    """
    rng = rand or secrets.SystemRandom()
    when = now or datetime.now(timezone.utc)
    enriched = await _resolve_personas(deckies, repo)
    if not enriched:
        return None
    # Pre-fetch a single edit candidate from a random eligible decky,
    # so the planner can decide whether to use it. We pick the decky
    # client-side (cheap) and ask the repo for one row; if there's
    # nothing editable, planner falls back to create.
    edit_candidate = None
    if rng.random() < 0.5 and enriched:
        # Only half the ticks look for an edit candidate — the repo
        # lookup is the expensive part, no point doing it on every tick.
        # NOTE(review): the original comment called this "lower than the
        # planner's 30% edit weight", which does not match 0.5 — confirm
        # the intended relationship between the two numbers.
        candidate_decky = rng.choice(enriched)
        try:
            row = await repo.pick_random_synthetic_file_for_edit(
                candidate_decky["uuid"],
            )
        except Exception:  # noqa: BLE001
            # Best effort: a failed lookup just means "no edit this tick".
            row = None
        if row is not None:
            # Attach the decky name so the planner has it alongside the row.
            row = {**row, "decky_name": candidate_decky["name"]}
            edit_candidate = row
    plan = _realism_pick(enriched, when, edit_candidate=edit_candidate, rand=rng)
    if plan is None:
        return None
    if plan.action == "edit":
        return EditAction(
            dst_uuid=plan.decky_uuid,
            dst_name=plan.decky_name,
            path=plan.target_path,
            persona=plan.persona,
            content_class=plan.content_class.value,
            previous_body=plan.previous_body or "",
            # "" when there was no candidate row or it had no uuid; the
            # worker treats an empty uuid as "skip the row update".
            synthetic_file_uuid=(edit_candidate or {}).get("uuid", ""),
            mtime=plan.mtime,
        )
    # Canary branch — the cultivator builds the bytes, picks the
    # placement path, and persists the canary_tokens row. We map
    # the resulting CanaryArtifact to a FileAction so the SSH
    # driver's plant_file path is reused unchanged.
    if plan.content_class.is_canary():
        try:
            # Imported lazily, only when a canary plan is actually rolled.
            from decnet.canary import cultivator as _cultivator
            artifact = await _cultivator.cultivate(plan, repo)
        except Exception:  # noqa: BLE001
            # Cultivation failed (no http_base/dns_zone configured,
            # generator raised, repo write failed). Fall through to
            # an inert file plant so the tick isn't wasted.
            return FileAction(
                dst_uuid=plan.decky_uuid,
                dst_name=plan.decky_name,
                path=plan.target_path or f"/tmp/.cache-{secrets.token_hex(3)}",  # nosec B108
                content=plan.body_hint or "",
                persona=plan.persona,
                content_class=plan.content_class.value,
                mtime=plan.mtime,
            )
        return FileAction(
            dst_uuid=plan.decky_uuid,
            dst_name=plan.decky_name,
            path=artifact.path,
            content="",  # ignored when content_bytes is set
            content_bytes=artifact.content,
            persona=plan.persona,
            content_class=plan.content_class.value,
            mtime=plan.mtime,
        )
    # Create branch. If LLM is wired, optionally swap body_hint for
    # an LLM-authored body. The deterministic body_hint stays in
    # ``body`` unless the LLM helper is called; per the original note
    # that helper is expected to return the fallback itself when the
    # LLM times out / errors / breaker-trips (not visible here).
    body = plan.body_hint or ""
    if llm is not None and plan.content_class.is_user_class():
        persona_obj = _persona_by_name(enriched, plan.persona)
        if persona_obj is not None:
            from decnet.realism.bodies import make_body_with_llm
            body = await make_body_with_llm(
                plan.content_class,
                persona_obj,
                llm=llm,
                breaker=llm_breaker,
                timeout=llm_timeout,
                rand=rng,
            )
    return FileAction(
        dst_uuid=plan.decky_uuid,
        dst_name=plan.decky_name,
        path=plan.target_path,
        content=body,
        persona=plan.persona,
        content_class=plan.content_class.value,
        mtime=plan.mtime,
    )
def _persona_by_name(
    enriched: list[dict[str, Any]], name: str,
) -> Optional[EmailPersona]:
    """Return the first persona called *name* across *enriched*, else ``None``.

    Deckies are scanned in order, and within each decky its
    ``_realism_personas`` list (missing or empty lists are skipped).
    """
    matches = (
        candidate
        for decky in enriched
        for candidate in (decky.get("_realism_personas") or [])
        if candidate.name == name
    )
    return next(matches, None)
async def _resolve_personas(
    deckies: Sequence[dict[str, Any]],
    repo: Any,
) -> list[dict[str, Any]]:
    """Return copies of the usable deckies, each tagged with its personas.

    The realism planner expects every decky dict to carry
    ``_realism_personas``; the repo lookups happen here so the planner
    never touches the database.

    * Deckies without ``ssh`` are dropped — files are planted over SSH.
    * ``source == "topology"`` with a ``topology_id`` → personas of that
      topology (one repo fetch per topology per call; a failed fetch
      counts as "no personas").
    * Everything else, including an unknown source → the global pool
      from ``personas_pool.load()``, loaded at most once per call.
    * Deckies whose resolved list is empty are dropped.
    """
    per_topology: dict[str, list[EmailPersona]] = {}
    shared_pool: Optional[list[EmailPersona]] = None
    usable: list[dict[str, Any]] = []

    for decky in deckies:
        if not _has_ssh(decky):
            continue

        origin = (decky.get("source") or "").lower()
        topo_id = decky.get("topology_id")

        if origin == "topology" and topo_id:
            if topo_id not in per_topology:
                try:
                    topo_row = await repo.get_topology(topo_id)
                except Exception:  # noqa: BLE001
                    topo_row = None
                per_topology[topo_id] = _topology_personas(topo_row)
            resolved = per_topology[topo_id]
        else:
            if shared_pool is None:
                # Lazy-load once per call; per the original note the
                # pool module keeps its own mtime-checked cache.
                shared_pool = personas_pool.load()
            resolved = shared_pool

        if resolved:
            usable.append({**decky, "_realism_personas": resolved})

    return usable
def _topology_personas(topology: Optional[dict[str, Any]]) -> list[EmailPersona]:
    """Parse the ``email_personas`` field of a topology row.

    Accepts either an already-decoded list or a JSON string. A missing
    topology, a missing field, any other field type, or a string that
    is not valid JSON all produce an empty list. The topology's
    ``language_default`` (``"en"`` when unset) is passed to the parser.
    """
    raw = topology.get("email_personas") if topology else None
    if not isinstance(raw, (list, str)):
        return []

    fallback_language = topology.get("language_default") or "en"
    if isinstance(raw, list):
        return parse_personas(raw, language_default=fallback_language)

    try:
        return parse_personas(json.loads(raw), language_default=fallback_language)
    except json.JSONDecodeError:
        return []
# ``Plan`` is not defined in this module; it is imported from
# ``decnet.realism.taxonomy`` and re-exported here so external callers
# that already import ``Plan`` from the scheduler keep working through
# the migration.
__all__ = [
    "Action",
    "EditAction",
    "FileAction",
    "Plan",
    "TrafficAction",
    "pick",
    "pick_file",
]

View File

@@ -0,0 +1,513 @@
"""Orchestrator main loop.
One tick = one action pick + one driver invocation + one DB write +
one fire-and-forget bus publish. Intentionally serial — MVP honesty:
a wedged docker exec stalls only this worker, never another.
Three action shapes are folded into the single tick after stage 5 of
the realism migration: SSH traffic between deckies, file plants on
deckies (driven by :func:`decnet.realism.planner.pick`), and email
drops into mail-decky maildirs (driven by
:func:`decnet.orchestrator.emailgen.scheduler.pick`). ``decnet
emailgen`` and ``decnet-emailgen.service`` are gone; this worker
covers all three.
Modeled after :mod:`decnet.profiler.worker` for consistency: same
control listener, same heartbeat helper, same shutdown semantics.
"""
from __future__ import annotations
import asyncio
import contextlib
import hashlib
import os
import secrets
from datetime import datetime, timezone
from typing import Any, Optional
from decnet.bus.factory import get_bus
from decnet.bus.publish import (
publish_safely,
run_control_listener,
run_health_heartbeat,
)
from decnet.logging import get_logger
from decnet.orchestrator import events, scheduler
from decnet.orchestrator.drivers import get_driver_for
from decnet.orchestrator.emailgen import (
events as email_events,
scheduler as email_scheduler,
)
from decnet.orchestrator.emailgen.scheduler import EmailAction
from decnet.realism import planner as realism_planner
from decnet.realism.llm.circuit import LLMCircuitBreaker
from decnet.web.db.repository import BaseRepository
logger = get_logger("orchestrator")
# Periodic-prune knobs. Every _PRUNE_EVERY_TICKS ticks the worker trims
# per-decky history so orchestrator_events / orchestrator_emails do not
# grow without bound on long-running fleets. Ordinary ticks pay nothing
# for this; the pruning cost is incurred once per _PRUNE_EVERY_TICKS.
_PRUNE_EVERY_TICKS = 100
_PRUNE_PER_DST_CAP = 10000
_PRUNE_PER_MAIL_DECKY_CAP = 5000
# Refresh planner weights from realism_config every N ticks. Operator
# tunables drift slowly, so a few ticks of latency between a PUT and
# its effect (about five minutes at the default 60 s interval) is fine.
# No bus signal — keeps the path simple and the orchestrator
# self-contained.
_REALISM_CONFIG_REFRESH_TICKS = 5
# Action-kind weights for the per-tick roll. Email is rare because
# each LLM round-trip is expensive (~seconds) and the prior emailgen
# worker only ticked every 5 minutes. At a 60s orchestrator interval,
# a 10% email weight produces ~one email every ~10 minutes — close
# enough to the pre-collapse cadence.
_ACTION_WEIGHTS: tuple[tuple[str, int], ...] = (
    ("traffic", 45),
    ("file", 45),
    ("email", 10),
)
async def orchestrator_worker(
    repo: BaseRepository,
    *,
    interval: int = 60,
    llm_enabled: Optional[bool] = None,
) -> None:
    """Periodically inject synthetic activity into the running fleet.

    Runs as a long-lived asyncio task. Honours the bus control topic
    (``system.orchestrator.control``) for graceful shutdown.

    LLM enrichment for user-class file bodies is opt-in via the
    ``DECNET_REALISM_LLM`` env var (set to ``ollama`` / ``fake`` /
    empty). Pass ``llm_enabled=False`` from the CLI to override
    (``decnet orchestrate --no-llm``). When the LLM is unreachable
    or wedged, a process-local circuit breaker
    (:class:`LLMCircuitBreaker`) trips after 3 consecutive failures
    and the worker falls back to deterministic templates for 60
    seconds before re-probing.

    Args:
        repo: repository used for every read and write the worker makes.
        interval: seconds to wait between ticks (the wait is cut short
            when shutdown is signalled).
        llm_enabled: explicit LLM on/off switch; ``None`` defers to the
            environment (see :func:`_llm_should_enable`).
    """
    logger.info("orchestrator worker started interval=%ds", interval)
    llm: Any = None
    breaker: Optional[LLMCircuitBreaker] = None
    if _llm_should_enable(llm_enabled):
        try:
            # Imported lazily so a disabled LLM never loads the backend.
            from decnet.realism.llm import get_llm
            llm = get_llm()
            breaker = LLMCircuitBreaker()
            logger.info(
                "orchestrator: LLM enrichment enabled backend=%s model=%s",
                os.environ.get("DECNET_REALISM_LLM", "ollama"),
                getattr(llm, "model", "?"),
            )
        except Exception as exc:  # noqa: BLE001
            # LLM is optional: log and keep running without enrichment.
            logger.warning(
                "orchestrator: LLM init failed, continuing without "
                "enrichment: %s", exc,
            )
            llm = None
    bus = None
    try:
        bus = get_bus(client_name="orchestrator")
        await bus.connect()
    except Exception as exc:  # noqa: BLE001
        # The bus is optional too; with bus=None the persist helpers
        # skip publishing and only write to the repo.
        logger.warning(
            "orchestrator: bus unavailable, continuing without publish: %s", exc
        )
        bus = None
    # Initial load — pulls the operator-tuned weights from
    # realism_config so the orchestrator starts ticking with the
    # operator's intent rather than the baked-in defaults. A failure
    # here logs and falls through; the planner already holds defaults.
    await _refresh_realism_config(repo)
    shutdown = asyncio.Event()
    heartbeat_task = asyncio.create_task(
        run_health_heartbeat(
            bus, "orchestrator",
            # Callable, so each heartbeat reports the current breaker state.
            extra=lambda: {"realism": _realism_health_snapshot(llm, breaker)},
        )
    )
    control_task = asyncio.create_task(
        run_control_listener(bus, "orchestrator", shutdown),
    )
    tick_n = 0
    try:
        while not shutdown.is_set():
            # Sleep for ``interval`` seconds, waking early on shutdown.
            try:
                await asyncio.wait_for(shutdown.wait(), timeout=interval)
            except asyncio.TimeoutError:
                pass  # normal tick
            if shutdown.is_set():
                break
            try:
                await _one_tick(repo, bus, llm=llm, breaker=breaker)
            except Exception as exc:  # noqa: BLE001
                # One failed tick must not end the worker.
                logger.error("orchestrator tick failed: %s", exc)
            # Failed ticks count too, so maintenance keeps its cadence.
            tick_n += 1
            if tick_n % _PRUNE_EVERY_TICKS == 0:
                await _periodic_prune(repo)
            if tick_n % _REALISM_CONFIG_REFRESH_TICKS == 0:
                await _refresh_realism_config(repo)
    finally:
        # Stop the helper tasks and wait for each to finish unwinding;
        # their cancellation / errors are deliberately ignored.
        for t in (heartbeat_task, control_task):
            t.cancel()
            with contextlib.suppress(Exception, asyncio.CancelledError):
                await t
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()
async def _periodic_prune(repo: BaseRepository) -> None:
    """Trim the orchestrator event and email tables to their caps.

    The two prunes are independent: each runs in its own ``try`` so a
    failure of one is logged and does not prevent the other. Nothing
    is logged when a prune deletes no rows.
    """
    try:
        if removed := await repo.prune_orchestrator_events(
            per_dst_cap=_PRUNE_PER_DST_CAP,
        ):
            logger.info(
                "orchestrator events prune deleted=%d cap=%d",
                removed, _PRUNE_PER_DST_CAP,
            )
    except Exception as exc:  # noqa: BLE001
        logger.error("orchestrator events prune failed: %s", exc)

    try:
        if removed := await repo.prune_orchestrator_emails(
            per_decky_cap=_PRUNE_PER_MAIL_DECKY_CAP,
        ):
            logger.info(
                "orchestrator emails prune deleted=%d cap=%d",
                removed, _PRUNE_PER_MAIL_DECKY_CAP,
            )
    except Exception as exc:  # noqa: BLE001
        logger.error("orchestrator emails prune failed: %s", exc)
async def _refresh_realism_config(repo: BaseRepository) -> None:
    """Pull operator-tuned weights from realism_config into the planner.

    Failure modes (DB unreachable, malformed or non-text JSON value,
    validation reject) log a warning and leave the planner's current
    weights in place. The orchestrator keeps ticking with whatever it
    had — never blocks on config. This matters because the worker loop
    calls this function outside its per-tick ``try``: an exception
    escaping from here would end the worker.

    Args:
        repo: repository exposing ``get_realism_config``.
    """
    try:
        row = await repo.get_realism_config("weights")
    except Exception as exc:  # noqa: BLE001
        logger.warning("realism config refresh: DB read failed: %s", exc)
        return
    if row is None:
        return  # no overrides set; defaults stand

    import json

    try:
        payload = json.loads(row.get("value") or "{}")
    except (TypeError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError; TypeError is what
        # json.loads raises when the stored value is not str/bytes
        # (e.g. a repo that hands back an already-decoded object).
        logger.warning("realism config refresh: malformed JSON: %s", exc)
        return
    if not isinstance(payload, dict):
        logger.warning("realism config refresh: payload not an object")
        return
    try:
        realism_planner.apply_payload(payload)
    except ValueError as exc:
        logger.warning("realism config refresh: rejected payload: %s", exc)
def _roll_action_kind(rng: secrets.SystemRandom) -> str:
    """Draw one action kind, weighted by ``_ACTION_WEIGHTS``.

    A single integer is drawn from ``1..sum(weights)`` and each kind's
    weight is subtracted in table order until the draw is used up.
    """
    total_weight = sum(weight for _, weight in _ACTION_WEIGHTS)
    remaining = rng.randint(1, total_weight)
    for kind, weight in _ACTION_WEIGHTS:
        remaining -= weight
        if remaining <= 0:
            return kind
    return _ACTION_WEIGHTS[-1][0]  # unreachable, satisfy mypy
def _realism_health_snapshot(
    llm: Any, breaker: Optional[LLMCircuitBreaker],
) -> dict[str, Any]:
    """Describe the realism/LLM state for the heartbeat ``extra`` payload.

    The result always has the same four keys, so consumers can branch
    on ``llm_enabled`` alone. With no LLM every other value is ``None``;
    otherwise the backend comes from ``DECNET_REALISM_LLM`` (default
    ``"ollama"``), the model from ``llm.model`` when present, and the
    breaker state from ``breaker.state`` when a breaker exists.
    """
    enabled = llm is not None
    snapshot: dict[str, Any] = {
        "llm_enabled": enabled,
        "llm_backend": None,
        "llm_model": None,
        "llm_breaker_state": None,
    }
    if enabled:
        snapshot["llm_backend"] = os.environ.get("DECNET_REALISM_LLM", "ollama")
        snapshot["llm_model"] = getattr(llm, "model", None)
        if breaker is not None:
            snapshot["llm_breaker_state"] = breaker.state
    return snapshot
def _llm_should_enable(explicit: Optional[bool]) -> bool:
    """Decide whether LLM enrichment is on.

    An *explicit* value (``--llm`` / ``--no-llm``) always wins. With
    ``None`` the ``DECNET_REALISM_LLM`` environment variable decides:
    after trimming and lower-casing, an empty value or one of ``off``,
    ``none``, ``0``, ``false``, ``disabled`` turns the LLM off; any
    other value (``ollama``, ``fake``, …) turns it on.
    """
    if explicit is None:
        setting = os.environ.get("DECNET_REALISM_LLM", "").strip().lower()
        return setting not in ("", "off", "none", "0", "false", "disabled")
    return explicit
async def _pick_action(
    repo: BaseRepository,
    deckies: list[dict],
    rng: secrets.SystemRandom,
    *,
    llm: Any = None,
    breaker: Optional[LLMCircuitBreaker] = None,
):
    """Roll an action kind and return the first action any kind yields.

    The rolled kind is tried first; the remaining kinds follow in
    ``_ACTION_WEIGHTS`` order, so a fleet shape that silences one
    branch (no SSH pair, no personas, no mail decky) does not waste
    the whole tick. Returns ``None`` when every kind comes up empty.
    An exception from the email picker is logged at debug level and
    treated as "nothing to do".
    """
    rolled = _roll_action_kind(rng)
    # dict.fromkeys keeps first-seen order and drops the repeat of `rolled`.
    attempt_order = list(
        dict.fromkeys([rolled, *(kind for kind, _ in _ACTION_WEIGHTS)])
    )

    for kind in attempt_order:
        candidate = None
        if kind == "traffic":
            candidate = scheduler.pick(deckies, rand=rng)
        elif kind == "file":
            candidate = await scheduler.pick_file(
                deckies, repo, rand=rng,
                llm=llm, llm_breaker=breaker,
            )
        elif kind == "email":
            try:
                candidate = await email_scheduler.pick(repo, rand=rng)
            except Exception as exc:  # noqa: BLE001
                logger.debug("orchestrator: email pick failed: %s", exc)
                candidate = None
        if candidate is not None:
            return candidate
    return None
async def _one_tick(
    repo: BaseRepository,
    bus,
    *,
    llm: Any = None,
    breaker: Optional[LLMCircuitBreaker] = None,
) -> None:
    """Run one orchestrator tick: pick, execute, persist, bookkeep.

    With nothing to do, a debug line summarises the fleet (running
    count, SSH-eligible count, per-source breakdown) and the tick ends.
    Otherwise the matching driver runs the action, the result is
    persisted (email rows for email actions, event rows for the rest),
    and — only on success — the ``synthetic_files`` table is updated
    for file creates and edits. Failures of that last bookkeeping step
    are logged as warnings and never propagate.
    """
    deckies = await repo.list_running_deckies()
    rng = secrets.SystemRandom()
    action = await _pick_action(repo, deckies, rng, llm=llm, breaker=breaker)

    if action is None:
        ssh_eligible = len([
            decky for decky in deckies
            if isinstance(decky.get("services"), list)
            and "ssh" in decky["services"]
            and decky.get("ip")
        ])
        by_source: dict[str, int] = {}
        for decky in deckies:
            label = decky.get("source", "unknown")
            by_source[label] = by_source.get(label, 0) + 1
        logger.debug(
            "orchestrator: no actionable deckies "
            "(running=%d ssh_eligible=%d sources=%s)",
            len(deckies), ssh_eligible, by_source,
        )
        return

    outcome = await get_driver_for(action).run(action)

    if isinstance(action, EmailAction):
        await _persist_email(repo, action, outcome, bus)
    else:
        await _persist_event(repo, action, outcome, bus)

    if not outcome.success:
        return

    if isinstance(action, scheduler.FileAction):
        try:
            await _record_synthetic_file(repo, action)
        except Exception as exc:  # noqa: BLE001
            logger.warning(
                "orchestrator: synthetic_files write failed dst=%s path=%s: %s",
                action.dst_uuid, action.path, exc,
            )
    elif isinstance(action, scheduler.EditAction):
        try:
            await _bump_synthetic_file_after_edit(repo, action, outcome)
        except Exception as exc:  # noqa: BLE001
            logger.warning(
                "orchestrator: synthetic_files edit-bump failed "
                "dst=%s path=%s: %s",
                action.dst_uuid, action.path, exc,
            )
async def _persist_event(repo, action, result, bus) -> None:
    """Store a traffic/file tick result and publish it when a bus exists.

    The database row comes from ``events.to_row``. The bus message
    repeats the row's fields, with the timestamp rendered as an ISO
    string and the payload taken straight from *result*. The summary
    log line is written whether or not a bus is connected.
    """
    record = events.to_row(action, result)
    await repo.record_orchestrator_event(record)

    if bus is not None:
        topic = events.topic_for(action)
        message = {
            "kind": record["kind"],
            "protocol": record["protocol"],
            "action": record["action"],
            "src_decky_uuid": record.get("src_decky_uuid"),
            "dst_decky_uuid": record["dst_decky_uuid"],
            "success": record["success"],
            "payload": result.payload,
            "ts": record["ts"].isoformat(),
        }
        await publish_safely(
            bus, topic, message, event_type=events.event_type_for(action),
        )

    logger.info(
        "orchestrator tick kind=%s success=%s dst=%s",
        record["kind"], record["success"], record["dst_decky_uuid"],
    )
async def _persist_email(repo, action: EmailAction, result, bus) -> None:
    """Store an email tick result and publish it when a bus exists.

    Per the original author, the published message keeps the shape the
    pre-collapse emailgen worker used, so SSE subscribers and dashboards
    see no change on the wire. The summary log line is written whether
    or not a bus is connected.
    """
    record = email_events.to_row(action, result)
    await repo.record_orchestrator_email(record)

    if bus is not None:
        topic = email_events.topic_for(action)
        copied_fields = (
            "mail_decky_uuid",
            "thread_id",
            "message_id",
            "in_reply_to",
            "sender_email",
            "recipient_email",
            "subject",
            "language",
            "success",
        )
        message = {"kind": "email"}
        for field in copied_fields:
            message[field] = record[field]
        message["ts"] = record["ts"].isoformat()
        await publish_safely(
            bus, topic, message,
            event_type=email_events.event_type_for(action),
        )

    logger.info(
        "orchestrator tick kind=email mail_decky=%s thread=%s success=%s reply=%s",
        record["mail_decky_uuid"], record["thread_id"], record["success"],
        action.is_reply,
    )
async def _bump_synthetic_file_after_edit(repo, action, result) -> None:
    """Update the ``synthetic_files`` row after a successful edit.

    ``edit_count`` is incremented and ``last_modified`` set to the
    current UTC time. When the driver reported a non-empty
    ``new_body`` in ``result.payload``, ``content_hash`` (SHA-256 of
    the UTF-8 body) and ``last_body`` are refreshed too — the hash is
    computed here so the orchestrator owns that field.

    Nothing happens when the action carries no row UUID, or when the
    row is not among the decky's listed files (for example pruned
    between planning and execution); a pruned row is never recreated.
    """
    target_uuid = action.synthetic_file_uuid
    if not target_uuid:
        return

    new_body = result.payload.get("new_body", "")
    candidates = await repo.list_synthetic_files(
        decky_uuid=action.dst_uuid, limit=200,
    )

    current = None
    for candidate in candidates:
        if candidate.get("uuid") == target_uuid:
            current = candidate
            break
    if current is None:
        return  # candidate was pruned mid-flight; skip silently

    patch: dict = {
        "last_modified": datetime.now(timezone.utc),
        "edit_count": int(current.get("edit_count", 0)) + 1,
    }
    if new_body:
        digest = hashlib.sha256(new_body.encode("utf-8"))
        patch["content_hash"] = digest.hexdigest()
        patch["last_body"] = new_body
    await repo.update_synthetic_file(action.synthetic_file_uuid, patch)
async def _record_synthetic_file(repo, action) -> None:
    """Persist (or patch) a synthetic_files row after a FileAction plant.

    Idempotent on ``(decky_uuid, path)``: when the insert fails (the
    expected cause being the unique constraint — the file existed
    already), the existing row's ``last_modified`` / ``content_hash`` /
    ``last_body`` are patched and ``edit_count`` bumped, so the
    dashboard's "files this decky has grown" view stays accurate even
    when the orchestrator re-plants the same location.

    ``content_hash`` is the SHA-256 of the bytes that were actually
    planted: ``action.content_bytes`` when set (canary binaries, whose
    ``content`` is an ignored empty string), otherwise the UTF-8
    encoding of ``action.content``. ``last_body`` always stores the
    text body.

    Raises:
        Exception: the original insert error is re-raised when no
            existing row with the same path is found among the decky's
            first 200 listed files.
    """
    body = action.content or ""
    # FileAction documents that the SSH driver writes content_bytes
    # verbatim and ignores `content` when the bytes are set; hashing
    # `content` there would record the hash of an empty string.
    raw_bytes = getattr(action, "content_bytes", None)
    planted = raw_bytes if raw_bytes is not None else body.encode("utf-8")
    content_hash = hashlib.sha256(planted).hexdigest()
    now = datetime.now(timezone.utc)
    row = {
        "decky_uuid": action.dst_uuid,
        "path": action.path,
        "persona": action.persona,
        "content_class": action.content_class,
        "created_at": now,
        "last_modified": now,
        "edit_count": 0,
        "content_hash": content_hash,
        "last_body": body,
    }
    try:
        await repo.record_synthetic_file(row)
    except Exception:  # noqa: BLE001
        # Assume a (decky_uuid, path) collision and look for the row to
        # patch; anything else surfaces again through the bare raise.
        existing = await repo.list_synthetic_files(
            decky_uuid=action.dst_uuid, limit=200,
        )
        match = next(
            (r for r in existing if r.get("path") == action.path), None,
        )
        if match is None:
            raise
        await repo.update_synthetic_file(
            match["uuid"],
            {
                "last_modified": now,
                "content_hash": content_hash,
                "last_body": body,
                "edit_count": int(match.get("edit_count", 0)) + 1,
            },
        )