merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
9
decnet/orchestrator/__init__.py
Normal file
9
decnet/orchestrator/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
||||
"""DECNET orchestrator — synthetic life-injection worker.
|
||||
|
||||
Drives realistic-looking activity between deckies (inter-decky traffic and
|
||||
in-decky filesystem mutations) so the honeypot stops looking suspiciously
|
||||
static. Sole writer of the ``OrchestratorEvent`` table.
|
||||
"""
|
||||
from decnet.orchestrator.worker import orchestrator_worker
|
||||
|
||||
__all__ = ["orchestrator_worker"]
|
||||
74
decnet/orchestrator/drivers/__init__.py
Normal file
74
decnet/orchestrator/drivers/__init__.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""Activity drivers for the orchestrator.
|
||||
|
||||
Concrete drivers register dispatch in :func:`get_driver_for`. Same
|
||||
lazy-import pattern as :mod:`decnet.canary.factory`: the import-time
|
||||
cost of :mod:`decnet.orchestrator.drivers` stays low for callers that
|
||||
only need :class:`ActivityResult` / :class:`ActivityDriver`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.orchestrator.drivers.base import (
|
||||
ActivityDriver,
|
||||
ActivityResult,
|
||||
Driver,
|
||||
)
|
||||
from decnet.orchestrator.scheduler import Action, FileAction, TrafficAction
|
||||
|
||||
__all__ = [
|
||||
"ActivityDriver",
|
||||
"ActivityResult",
|
||||
"Driver",
|
||||
"SSHDriver",
|
||||
"get_driver_for",
|
||||
]
|
||||
|
||||
|
||||
def __getattr__(name: str):  # pragma: no cover - import passthrough
    """Resolve the concrete driver classes on first attribute access.

    Module-level ``__getattr__`` hook: ``SSHDriver`` and ``EmailDriver``
    are imported only when a caller actually asks for them, so consumers
    that only need the ABC never pull in the docker-exec / email-driver
    code. Any other name raises :class:`AttributeError`.
    """
    if name not in ("SSHDriver", "EmailDriver"):
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    if name == "EmailDriver":
        from decnet.orchestrator.drivers.email import EmailDriver as driver_cls
    else:
        from decnet.orchestrator.drivers.ssh import SSHDriver as driver_cls
    return driver_cls
|
||||
|
||||
|
||||
def get_driver_for(action: Action) -> ActivityDriver:
    """Return a new concrete driver instance that handles *action*.

    Stage 4 of the realism migration adds this seam so the orchestrator
    worker can dispatch by action type without an isinstance chain in
    ``_one_tick``. Stage 5 wires the worker to call this function
    instead of holding a single ``SSHDriver`` instance.

    Dispatch table:

    * :class:`TrafficAction` / :class:`FileAction` / :class:`EditAction`
      → :class:`SSHDriver`
    * :class:`EmailAction` → ``EmailDriver``

    Raises:
        TypeError: no driver is registered for the action's type.
    """
    # Lazy imports keep the side-effecting docker-exec / email-driver
    # modules out of every importer's graph.
    from decnet.orchestrator.drivers.ssh import SSHDriver
    # EditAction is imported locally so this function does not depend on
    # the module-level import list; SSHDriver.run already handles it.
    from decnet.orchestrator.scheduler import EditAction

    if isinstance(action, (TrafficAction, FileAction, EditAction)):
        return SSHDriver()

    # Importing EmailAction inside the function avoids a cycle with
    # realism.llm at module load time.
    try:
        from decnet.orchestrator.emailgen.scheduler import EmailAction
    except ImportError:  # pragma: no cover - scheduler always exists
        EmailAction = None  # type: ignore[assignment]
    if EmailAction is not None and isinstance(action, EmailAction):
        from decnet.orchestrator.drivers.email import EmailDriver
        return EmailDriver()

    raise TypeError(
        f"no driver registered for action type {type(action).__name__}"
    )
|
||||
92
decnet/orchestrator/drivers/base.py
Normal file
92
decnet/orchestrator/drivers/base.py
Normal file
@@ -0,0 +1,92 @@
|
||||
"""Driver ABC for orchestrator actions.
|
||||
|
||||
Each concrete driver (SSH, Email, future HTTP/SMB/MySQL) maps one
|
||||
:class:`~decnet.orchestrator.scheduler.Action` shape to a side effect
|
||||
on a target decky and returns an :class:`ActivityResult` the
|
||||
orchestrator persists.
|
||||
|
||||
The ABC lives here, the dispatch factory in
|
||||
:mod:`decnet.orchestrator.drivers` ``__init__``, and the impls in
|
||||
sibling modules — same pattern as :mod:`decnet.canary.factory`,
|
||||
:mod:`decnet.web.db.factory`, and :mod:`decnet.bus.factory`.
|
||||
|
||||
Why ABC and not :class:`Protocol`: drivers also expose lower-level
|
||||
helpers (``plant_file``, ``read_file``) that the planner-driven
|
||||
realism path will call directly without going through ``run``.
|
||||
Inheritance pins the contract for those helpers; a structural
|
||||
protocol would let a typo silently produce a half-implemented driver.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from decnet.orchestrator.scheduler import Action
|
||||
|
||||
|
||||
@dataclass
class ActivityResult:
    """What a single driver invocation produced.

    The worker writes ``payload`` (a per-action JSON envelope) to the
    ``OrchestratorEvent.payload`` column and uses it as the bus event
    body.
    """

    # Whether the driver reported the action as successful.
    success: bool
    # Per-action details; each instance gets its own empty dict by default.
    payload: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
class ActivityDriver(ABC):
    """Common base for every concrete orchestrator driver.

    :meth:`run` is abstract and MUST be implemented: it is the
    action-shape dispatch. :meth:`plant_file` and :meth:`read_file` are
    optional; drivers that touch files on the target decky SHOULD
    override them so the realism edit-in-place path can read an existing
    artifact before mutating it. Drivers that leave them alone (e.g. a
    future pure-traffic driver) raise :class:`NotImplementedError`, and
    the planner avoids picking ``EditAction`` for them.
    """

    @abstractmethod
    async def run(self, action: Action) -> ActivityResult:
        """Execute the action against its target decky."""

    async def plant_file(
        self,
        decky_name: str,
        path: str,
        content: bytes,
        *,
        mode: int = 0o600,
        mtime: datetime | None = None,
    ) -> ActivityResult:
        """Write *content* to *path* inside *decky_name*.

        The base implementation always raises
        :class:`NotImplementedError`; drivers with a write transport
        (docker exec, ssh, etc.) override it. *content* is bytes so
        binary artifacts (DOCX/PDF) survive the wire.
        """
        driver_name = type(self).__name__
        raise NotImplementedError(f"{driver_name} does not support plant_file")

    async def read_file(self, decky_name: str, path: str) -> bytes:
        """Read *path* from inside *decky_name*.

        Needed by the realism edit-in-place flow (stage 3b of the
        realism migration): read the previous body, let the realism
        engine produce the next iteration, write it back. The base
        implementation always raises :class:`NotImplementedError`.
        """
        driver_name = type(self).__name__
        raise NotImplementedError(f"{driver_name} does not support read_file")
|
||||
|
||||
|
||||
# Back-compat alias so existing imports of ``Driver`` keep working
|
||||
# while consumers transition to ``ActivityDriver``. Removed once the
|
||||
# realism migration is complete.
|
||||
Driver = ActivityDriver
|
||||
290
decnet/orchestrator/drivers/email.py
Normal file
290
decnet/orchestrator/drivers/email.py
Normal file
@@ -0,0 +1,290 @@
|
||||
"""Email driver — pluggable-LLM EML generation + decky-side delivery.
|
||||
|
||||
One :class:`EmailAction` becomes one EML written into the mail decky's
|
||||
configured emailgen spool directory (``/var/spool/decnet-emails/`` by
|
||||
default). The IMAP/POP3 service templates read that spool at request
|
||||
time so attackers see the generated mail in their MUA.
|
||||
|
||||
The LLM call goes through :mod:`decnet.realism.llm` — backend-agnostic
|
||||
by construction so swapping Ollama for the Anthropic API, vLLM, or
|
||||
llama.cpp is a config change, not a driver rewrite.
|
||||
Output is parsed-and-repaired into a valid EML using
|
||||
:mod:`email.mime.*`; the worker then ``docker exec``\\s a ``tee`` to
|
||||
drop the file inside the target container, followed by a
|
||||
``touch -d <Date>`` so the file's mtime matches the email's RFC 2822
|
||||
``Date:`` header.
|
||||
|
||||
Per CLAUDE.md "no shell strings": every subprocess invocation uses an
|
||||
argv list, never ``shell=True``. EML payloads are piped via ``stdin``,
|
||||
not interpolated into argv.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import shlex
|
||||
from datetime import datetime, timezone
|
||||
from email.mime.text import MIMEText
|
||||
from email.utils import formatdate
|
||||
from typing import Any, Optional
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.orchestrator.drivers.base import ActivityDriver, ActivityResult
|
||||
from decnet.orchestrator.emailgen.scheduler import EmailAction
|
||||
from decnet.orchestrator.emailgen.threads import new_message_id
|
||||
from decnet.realism.llm import LLMBackend, LLMTimeout, get_llm
|
||||
from decnet.realism.prompts.email import PromptInputs, build as build_prompt
|
||||
|
||||
log = get_logger("orchestrator.email")
|
||||
|
||||
_DOCKER = "docker"
|
||||
# docker-exec wall-clock cap for the per-EML write.
|
||||
_DOCKER_TIMEOUT = 8.0
|
||||
# Container suffix for the IMAP service on a mail decky.
|
||||
_IMAP_CONTAINER_SUFFIX = "-imap"
|
||||
_POP3_CONTAINER_SUFFIX = "-pop3"
|
||||
# Spool path inside the container. Match the IMAP template's stubbed
|
||||
# IMAP_EMAIL_SEED location once wiring lands; shipping the constant now
|
||||
# lets that integration land independently.
|
||||
_SPOOL_DIR = "/var/spool/decnet-emails"
|
||||
|
||||
|
||||
async def _run_capture(
    argv: list[str],
    *,
    stdin_data: Optional[bytes] = None,
    timeout: float = _DOCKER_TIMEOUT,
) -> tuple[int, str, str]:
    """Spawn *argv*, optionally feeding *stdin_data*. Never raises.

    Args:
        argv: Command and arguments; executed without a shell.
        stdin_data: Bytes piped to the child's stdin, or ``None`` to
            leave stdin unattached.
        timeout: Wall-clock cap in seconds for the whole exchange.

    Returns:
        ``(rc, stdout, stderr)`` with both streams decoded as UTF-8
        (undecodable bytes replaced). Synthetic return codes:
        ``127`` when ``argv[0]`` does not exist, ``126`` when the spawn
        fails for another OS-level reason, ``124`` on timeout.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdin=asyncio.subprocess.PIPE if stdin_data is not None else None,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as exc:
        return 127, "", f"argv[0] not found: {exc}"
    except OSError as exc:
        # e.g. PermissionError on a non-executable binary; keep the
        # "never raises" promise instead of crashing the worker tick.
        return 126, "", f"spawn failed: {exc}"
    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(stdin_data), timeout=timeout,
        )
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            # Child exited between the timeout firing and the kill.
            pass
        # Reap the killed child so it does not linger as a zombie.
        await proc.wait()
        return 124, "", "timeout"
    return (
        proc.returncode if proc.returncode is not None else -1,
        stdout.decode("utf-8", "replace"),
        stderr.decode("utf-8", "replace"),
    )
|
||||
|
||||
|
||||
def _container_for(decky_name: str, services: list[str]) -> str:
    """Name the mail container to write into for *decky_name*.

    Prefers the IMAP container when ``"imap"`` is among *services* and
    falls back to POP3 otherwise. Names follow the
    ``<decky_name>-<service>`` convention from the service templates.
    """
    suffix = (
        _IMAP_CONTAINER_SUFFIX if "imap" in services else _POP3_CONTAINER_SUFFIX
    )
    return f"{decky_name}{suffix}"
|
||||
|
||||
|
||||
def _parse_subject_and_body(ollama_output: str) -> tuple[str, str]:
|
||||
"""Split LLM output into (subject, body).
|
||||
|
||||
The prompt asks for ``Subject: <subject>\\n\\n<body>``. When the
|
||||
model misbehaves (e.g. wraps in markdown fences or skips the
|
||||
Subject line), fall back to a generic subject and treat the whole
|
||||
output as body. Never raises.
|
||||
"""
|
||||
text = ollama_output.strip()
|
||||
# Strip code fences if the model wrapped output.
|
||||
if text.startswith("```"):
|
||||
nl = text.find("\n")
|
||||
if nl > 0:
|
||||
text = text[nl + 1:]
|
||||
if text.endswith("```"):
|
||||
text = text[: -3]
|
||||
text = text.strip()
|
||||
lines = text.splitlines()
|
||||
if lines and lines[0].lower().startswith("subject:"):
|
||||
subject = lines[0].split(":", 1)[1].strip()
|
||||
# Drop the (possibly empty) blank line after Subject.
|
||||
body_lines = lines[1:]
|
||||
if body_lines and not body_lines[0].strip():
|
||||
body_lines = body_lines[1:]
|
||||
body = "\n".join(body_lines).strip()
|
||||
if not subject:
|
||||
subject = "Business Communication"
|
||||
return subject, body
|
||||
return "Business Communication", text
|
||||
|
||||
|
||||
def _build_eml(
|
||||
*,
|
||||
sender_name: str,
|
||||
sender_email: str,
|
||||
recipient_name: str,
|
||||
recipient_email: str,
|
||||
subject: str,
|
||||
body: str,
|
||||
message_id: str,
|
||||
in_reply_to: Optional[str],
|
||||
references: str,
|
||||
ts: datetime,
|
||||
) -> bytes:
|
||||
"""Assemble a valid plain-text RFC 2822 EML."""
|
||||
msg = MIMEText(body, "plain", "utf-8")
|
||||
msg["From"] = f"{sender_name} <{sender_email}>"
|
||||
msg["To"] = f"{recipient_name} <{recipient_email}>"
|
||||
msg["Subject"] = subject
|
||||
msg["Date"] = formatdate(ts.timestamp(), localtime=False)
|
||||
msg["Message-ID"] = message_id
|
||||
if in_reply_to:
|
||||
msg["In-Reply-To"] = in_reply_to
|
||||
if references:
|
||||
msg["References"] = references
|
||||
msg["MIME-Version"] = "1.0"
|
||||
return msg.as_bytes()
|
||||
|
||||
|
||||
class EmailDriver(ActivityDriver):
    """Concrete driver for :class:`EmailAction`.

    Stateless across calls — the LLM backend is constructed once at
    init time (or injected for tests). The driver itself does *not*
    know about the bus or DB; it returns an :class:`ActivityResult`
    that the worker pipes onward.
    """

    def __init__(
        self,
        *,
        llm: Optional[LLMBackend] = None,
        model: Optional[str] = None,
        spool_dir: str = _SPOOL_DIR,
    ) -> None:
        """Create the driver.

        Args:
            llm: Backend to use. Takes precedence over *model* so tests
                can inject a fake backend without env-var trickery.
            model: Model name forwarded to ``get_llm`` when *llm* is not
                given; lets the worker honour ``--model`` from the CLI
                without each backend needing to know about CLI flags.
            spool_dir: Directory inside the mail container that
                receives the generated EML files.
        """
        self._llm = llm if llm is not None else get_llm(model=model)
        self.spool_dir = spool_dir

    @property
    def model(self) -> str:
        """Convenience accessor for telemetry / logging."""
        return self._llm.model

    async def run(self, action: EmailAction) -> ActivityResult:
        """Generate one email for *action* and deliver it to the spool."""
        return await self._run_email(action)

    async def _run_email(self, action: EmailAction) -> ActivityResult:
        """Generate, assemble and deliver one EML.

        Failures are reported through the returned
        :class:`ActivityResult`; ``payload["stage"]`` names the step
        that failed (``"build"``, ``"llm"`` or ``"delivery"``) and is
        ``"delivered"`` on success.
        """
        # The Message-ID is minted under the sender's domain. Check for
        # it before spending an LLM call: an address without a domain
        # would otherwise surface as an IndexError instead of a failed
        # ActivityResult.
        _local, at_sign, sender_domain = action.sender.email.partition("@")
        if not at_sign or not sender_domain:
            log.warning(
                "emailgen sender address has no domain thread_id=%s",
                action.thread_id,
            )
            return ActivityResult(
                success=False,
                payload={
                    "stage": "build",
                    "error": "sender email has no domain",
                    "model": self._llm.model,
                    "thread_id": action.thread_id,
                },
            )

        prompt, mannerisms_used = build_prompt(
            PromptInputs(
                sender=action.sender,
                recipient=action.recipient,
                context_hint=action.context_hint,
                parent_subject=action.subject_hint,
                parent_excerpt=action.parent_excerpt,
            )
        )
        try:
            llm_result = await self._llm.generate(prompt)
        except LLMTimeout as exc:
            log.warning("emailgen llm timeout model=%s: %s", self._llm.model, exc)
            return ActivityResult(
                success=False,
                payload={
                    "stage": "llm",
                    "error": "timeout",
                    "model": self._llm.model,
                    "thread_id": action.thread_id,
                },
            )

        gen_ms = llm_result.latency_ms
        if not llm_result.success or not llm_result.text.strip():
            log.warning(
                "emailgen llm produced no usable output model=%s extra=%r",
                self._llm.model, llm_result.extra,
            )
            return ActivityResult(
                success=False,
                payload={
                    "stage": "llm",
                    "model": self._llm.model,
                    "generation_ms": gen_ms,
                    "thread_id": action.thread_id,
                    # Only the backend's rc / stderr are forwarded.
                    **{
                        k: v for k, v in llm_result.extra.items()
                        if k in ("rc", "stderr")
                    },
                },
            )

        subject, body = _parse_subject_and_body(llm_result.text)
        message_id = new_message_id(sender_domain)
        ts = datetime.now(timezone.utc)
        eml_bytes = _build_eml(
            sender_name=action.sender.name,
            sender_email=action.sender.email,
            recipient_name=action.recipient.name,
            recipient_email=action.recipient.email,
            subject=subject,
            body=body,
            message_id=message_id,
            in_reply_to=action.parent_message_id,
            references=action.references,
            ts=ts,
        )

        # Drop the EML into the mail decky's spool dir over docker exec.
        # File path: <spool>/<thread_id>/<message-id-derived-name>.eml.
        # Per-thread sub-directory keeps `ls` in the spool readable by
        # operators inspecting the running decoy.
        eml_filename = message_id.strip("<>").replace("@", "_at_") + ".eml"
        eml_dir = f"{self.spool_dir.rstrip('/')}/{action.thread_id}"
        eml_path = f"{eml_dir}/{eml_filename}"
        container = _container_for(
            action.mail_decky_name, list(action.mail_decky_services),
        )
        # Stamp the file's mtime + atime to match the EML's Date: header
        # so an attacker `ls -lt`'ing the spool doesn't see a wall of
        # files all created within the worker's tick window — the cluster
        # itself is a tell. The same formatdate() string that went into
        # the header is handed to ``touch -d`` (accepted by GNU
        # coreutils), so no extra parsing on the container side.
        eml_date_header = formatdate(ts.timestamp(), localtime=False)
        # Every interpolated value is shlex-quoted; the EML itself
        # travels over stdin, never through argv.
        sh_cmd = (
            f"mkdir -p {shlex.quote(eml_dir)} && "
            f"tee {shlex.quote(eml_path)} >/dev/null && "
            f"touch -d {shlex.quote(eml_date_header)} {shlex.quote(eml_path)}"
        )
        argv = [_DOCKER, "exec", "-i", container, "sh", "-c", sh_cmd]
        rc2, _stdout2, stderr2 = await _run_capture(
            argv, stdin_data=eml_bytes, timeout=_DOCKER_TIMEOUT,
        )
        success = rc2 == 0
        payload: dict[str, Any] = {
            "stage": "delivered" if success else "delivery",
            "model": self.model,
            "generation_ms": gen_ms,
            "bytes": len(eml_bytes),
            "thread_id": action.thread_id,
            "message_id": message_id,
            "subject": subject,
            "language": action.sender.language or "en",
            "mannerisms_used": mannerisms_used,
            "is_reply": action.is_reply,
            "container": container,
            "eml_path": eml_path,
            "rc": rc2,
            "stderr": stderr2.strip()[:256] if not success else None,
        }
        if not success:
            log.warning(
                "emailgen delivery failed container=%s rc=%d stderr=%r",
                container, rc2, stderr2[:200],
            )
        return ActivityResult(success=success, payload=payload)
|
||||
293
decnet/orchestrator/drivers/ssh.py
Normal file
293
decnet/orchestrator/drivers/ssh.py
Normal file
@@ -0,0 +1,293 @@
|
||||
"""MVP SSH-flavoured driver.
|
||||
|
||||
Two action shapes:
|
||||
|
||||
* :class:`~decnet.orchestrator.scheduler.TrafficAction` — exec a tiny
|
||||
Python one-liner *inside the source decky's ssh container* that opens
|
||||
TCP/22 against the destination decky's IP and reads the SSH banner.
|
||||
This generates real on-the-wire SSH-protocol traffic between the two
|
||||
containers (sshd announces the banner on connect), without us having
|
||||
to ship credentials anywhere.
|
||||
* :class:`~decnet.orchestrator.scheduler.FileAction` — drop / refresh a
|
||||
file inside the destination decky's ssh container via ``docker exec``.
|
||||
|
||||
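Both shell out via :func:`asyncio.create_subprocess_exec` with argv
lists on the host side, so an attacker-controllable decky name or IP
can't escape into a host shell. File writes additionally run a short
``sh -c`` pipeline *inside* the container; every path and timestamp
interpolated into it goes through :func:`shlex.quote`.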
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import shlex
|
||||
from typing import Any
|
||||
|
||||
import base64
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.orchestrator.drivers.base import ActivityDriver, ActivityResult
|
||||
from decnet.orchestrator.scheduler import (
|
||||
Action,
|
||||
EditAction,
|
||||
FileAction,
|
||||
TrafficAction,
|
||||
)
|
||||
|
||||
log = get_logger("orchestrator.ssh")
|
||||
|
||||
_DOCKER = "docker"
|
||||
# Per-call wall-clock cap. The orchestrator runs serially (one action
|
||||
# per tick); a wedged docker exec must not stall the whole worker.
|
||||
_TIMEOUT = 8.0
|
||||
|
||||
# Container suffix convention: services/*.py emit container_name as
|
||||
# ``<decky_name>-<service>``. The MVP only drives the ssh service.
|
||||
_SSH_CONTAINER_SUFFIX = "-ssh"
|
||||
|
||||
|
||||
def _container_for(decky_name: str) -> str:
    """Return the ssh container name for *decky_name*.

    Follows the ``<decky_name>-<service>`` convention; this driver only
    targets the ssh service.
    """
    return "{}{}".format(decky_name, _SSH_CONTAINER_SUFFIX)
|
||||
|
||||
|
||||
async def _run(argv: list[str]) -> tuple[int, str, str]:
    """Run *argv* with no stdin and return ``(rc, stdout, stderr)``.

    Thin wrapper over :func:`_run_with_stdin`. A wall-clock expiry
    comes back as ``(124, "", "timeout")``. Never raises —
    orchestrator success/failure is a payload attribute, not an
    exception.
    """
    outcome = await _run_with_stdin(argv, None)
    return outcome
|
||||
|
||||
|
||||
async def _run_with_stdin(
|
||||
argv: list[str], stdin_bytes: bytes | None,
|
||||
) -> tuple[int, str, str]:
|
||||
"""Spawn *argv*, optionally feeding *stdin_bytes*, and capture rc+output.
|
||||
|
||||
Used by :meth:`SSHDriver.plant_file` to stream base64 payloads via
|
||||
stdin (avoids ARG_MAX on large blobs — same fix as the canary
|
||||
planter in commit c17b9e0). Same failure semantics as :func:`_run`.
|
||||
"""
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*argv,
|
||||
stdin=asyncio.subprocess.PIPE if stdin_bytes is not None else None,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
except FileNotFoundError as exc:
|
||||
return 127, "", f"argv[0] not found: {exc}"
|
||||
try:
|
||||
stdout, stderr = await asyncio.wait_for(
|
||||
proc.communicate(stdin_bytes), timeout=_TIMEOUT,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
try:
|
||||
proc.kill()
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
return 124, "", "timeout"
|
||||
return (
|
||||
proc.returncode if proc.returncode is not None else -1,
|
||||
stdout.decode("utf-8", "replace"),
|
||||
stderr.decode("utf-8", "replace"),
|
||||
)
|
||||
|
||||
|
||||
# Python one-liner that probes the destination's SSH banner. Kept inline
|
||||
# so the driver has zero filesystem dependencies on the host side; the
|
||||
# *container* needs python3 (ssh service template ships it).
|
||||
_PROBE_PY = (
|
||||
"import socket,sys;"
|
||||
"s=socket.socket();s.settimeout(3);"
|
||||
"s.connect((sys.argv[1], 22));"
|
||||
"b=s.recv(128);s.close();"
|
||||
"sys.stdout.write(b.decode('latin1','replace'))"
|
||||
)
|
||||
|
||||
|
||||
class SSHDriver(ActivityDriver):
    """Concrete :class:`ActivityDriver` for SSH-flavoured actions.

    Handles :class:`TrafficAction`, :class:`FileAction` and
    :class:`EditAction`, all by ``docker exec``-ing into a decky's ssh
    container.
    """

    async def run(self, action: Action) -> ActivityResult:
        """Dispatch *action* to the matching handler.

        Raises:
            TypeError: *action* is not one of the supported shapes.
        """
        if isinstance(action, TrafficAction):
            return await self._run_traffic(action)
        if isinstance(action, FileAction):
            return await self._run_file(action)
        if isinstance(action, EditAction):
            return await self._run_edit(action)
        raise TypeError(f"unsupported action type: {type(action)!r}")

    async def _run_traffic(self, action: TrafficAction) -> ActivityResult:
        """Probe the destination's SSH banner from inside the source decky.

        Success requires both a zero exit code and a banner that starts
        with ``SSH-``.
        """
        container = _container_for(action.src_name)
        argv = [
            _DOCKER, "exec", container,
            "python3", "-c", _PROBE_PY, action.dst_ip,
        ]
        rc, stdout, stderr = await _run(argv)
        success = rc == 0 and stdout.startswith("SSH-")
        payload: dict[str, Any] = {
            "src_decky": action.src_name,
            "dst_decky": action.dst_name,
            "dst_ip": action.dst_ip,
            "dst_port": 22,
            "rc": rc,
            "banner": stdout.strip()[:128] if success else None,
            "stderr": stderr.strip()[:256] if not success else None,
        }
        if not success:
            log.debug(
                "orchestrator.ssh.traffic failed src=%s dst=%s rc=%d stderr=%r",
                action.src_name, action.dst_name, rc, stderr[:120],
            )
        return ActivityResult(success=success, payload=payload)

    async def _run_edit(self, action: EditAction) -> ActivityResult:
        """Mutate an existing synthetic file in place.

        The realism planner already loaded the previous body from the
        ``synthetic_files`` row, so we don't re-fetch via ``read_file``;
        the body the planner saw is the body we mutate. This avoids a
        TOCTOU window where the file changed between pick and apply
        (the realism worker is the only writer in the MVP, but the
        contract should still be tight).
        """
        from decnet.realism.bodies import next_iteration as _next_iteration
        from decnet.realism.taxonomy import ContentClass

        try:
            cls = ContentClass(action.content_class)
        except ValueError:
            return ActivityResult(
                success=False,
                payload={
                    "dst_decky": action.dst_name,
                    "path": action.path,
                    "error": f"unknown content_class: {action.content_class!r}",
                },
            )
        try:
            new_body = _next_iteration(
                cls, action.persona, action.previous_body,
            )
        except KeyError:
            # Treated as "no edit generator for this content class".
            return ActivityResult(
                success=False,
                payload={
                    "dst_decky": action.dst_name,
                    "path": action.path,
                    "error": (
                        f"content_class={cls!s} does not support edits"
                    ),
                },
            )
        result = await self.plant_file(
            action.dst_name,
            action.path,
            new_body.encode("utf-8"),
            mode=0o644,
            mtime=action.mtime,
        )
        # Carry edit-specific metadata through to the orchestrator
        # event payload so the worker's synthetic_files bump (and the
        # dashboard's lineage view) sees what actually landed.
        if result.success:
            result.payload["new_body"] = new_body
            result.payload["new_body_bytes"] = len(new_body.encode("utf-8"))
            result.payload["synthetic_file_uuid"] = action.synthetic_file_uuid
        return result

    async def _run_file(self, action: FileAction) -> ActivityResult:
        """Drop or refresh the file described by *action*."""
        # FileAction.content_bytes wins when set — canary artifacts
        # (DOCX/PDF/honeydoc binaries) need their bytes preserved
        # exactly. Falls back to utf-8 encoding the str content for
        # the inert-realism path.
        # mtime carries through from the realism planner so the file
        # doesn't stamp at wall-clock-now (the realism failure today).
        body = action.content_bytes
        if body is None:
            body = action.content.encode("utf-8")
        return await self.plant_file(
            action.dst_name,
            action.path,
            body,
            mode=0o644,
            mtime=action.mtime,
        )

    async def plant_file(
        self,
        decky_name: str,
        path: str,
        content: bytes,
        *,
        mode: int = 0o600,
        mtime: datetime | None = None,
    ) -> ActivityResult:
        """Write *content* to *path* inside *decky_name*'s ssh container.

        Streams base64 via stdin (mirrors :mod:`decnet.canary.planter`'s
        ARG_MAX-safe write — see commit c17b9e0). Sets file mode and,
        when *mtime* is provided, ``touch -d`` to backdate the file so
        it doesn't all stamp at wall-clock-now (the realism failure
        this migration is fixing).

        Returns:
            An :class:`ActivityResult` whose ``success`` reflects the
            exit code of the in-container pipeline.
        """
        container = _container_for(decky_name)
        encoded = base64.b64encode(content)
        # The timestamp is always rendered in UTC with an explicit
        # "UTC" suffix so the container's local TZ doesn't drift the
        # mtime. NOTE: astimezone() interprets a naive datetime as
        # host-local time.
        if mtime is not None:
            ts = mtime.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
            touch_cmd = f"touch -d {shlex.quote(ts)} {shlex.quote(path)}"
        else:
            touch_cmd = f"touch {shlex.quote(path)}"
        # Every interpolated value is shlex-quoted; file content goes
        # over stdin, never through the command string.
        sh_cmd = (
            f"mkdir -p {shlex.quote(_dirname(path))} && "
            f"base64 -d > {shlex.quote(path)} && "
            f"chmod {mode:o} {shlex.quote(path)} && "
            f"{touch_cmd}"
        )
        argv = [_DOCKER, "exec", "-i", container, "sh", "-c", sh_cmd]
        rc, _stdout, stderr = await _run_with_stdin(argv, encoded)
        success = rc == 0
        payload: dict[str, Any] = {
            "dst_decky": decky_name,
            "path": path,
            "bytes": len(content),
            "rc": rc,
            "stderr": stderr.strip()[:256] if not success else None,
        }
        return ActivityResult(success=success, payload=payload)

    async def read_file(self, decky_name: str, path: str) -> bytes:
        """Read *path* from inside *decky_name*'s ssh container.

        Used by the realism edit-in-place flow: the driver fetches
        the previous body, the realism engine produces the next
        iteration, the driver re-plants it via :meth:`plant_file`.

        The file is base64-encoded inside the container and decoded
        here. :func:`_run` decodes the child's stdout as UTF-8 with
        replacement, which would corrupt binary content; base64 output
        is pure ASCII and survives that step unchanged.

        Raises:
            FileNotFoundError: the container path doesn't exist
                (stderr mentions ``No such file``).
            RuntimeError: any other failure, carrying the docker
                stderr, or output that is not valid base64.
        """
        container = _container_for(decky_name)
        argv = [_DOCKER, "exec", container, "base64", path]
        rc, stdout, stderr = await _run(argv)
        if rc == 0:
            try:
                return base64.b64decode(stdout)
            except ValueError as exc:  # binascii.Error is a ValueError
                raise RuntimeError(
                    f"docker exec base64 returned undecodable output for {path}"
                ) from exc
        if "no such file" in stderr.lower():
            raise FileNotFoundError(f"{path} not present in {decky_name}")
        raise RuntimeError(
            f"docker exec base64 failed rc={rc} stderr={stderr.strip()[:256]!r}"
        )
|
||||
|
||||
|
||||
def _dirname(path: str) -> str:
|
||||
"""Pure-string dirname. We can't trust ``os.path.dirname`` on the
|
||||
host to share the destination container's separator semantics, but
|
||||
deckies are POSIX so a plain ``rfind('/')`` suffices."""
|
||||
idx = path.rfind("/")
|
||||
if idx <= 0:
|
||||
return "/"
|
||||
return path[:idx]
|
||||
20
decnet/orchestrator/emailgen/__init__.py
Normal file
20
decnet/orchestrator/emailgen/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Emailgen — email-specific delivery, scheduling, and threading.
|
||||
|
||||
After stage 5 of the realism migration, ``emailgen`` is no longer a
|
||||
separate worker / systemd unit / CLI subcommand. It exposes:
|
||||
|
||||
* :mod:`decnet.orchestrator.emailgen.scheduler` — the
|
||||
``EmailAction`` shape and the ``pick(repo)`` policy that decides
|
||||
which mail decky / sender / recipient / thread an email belongs to.
|
||||
* :mod:`decnet.orchestrator.emailgen.threads` — RFC 2822 thread chain
|
||||
helpers (Message-ID generation, Re: / In-Reply-To bookkeeping).
|
||||
* :mod:`decnet.orchestrator.emailgen.events` — DB-row + bus-topic
|
||||
builders for email events.
|
||||
|
||||
The orchestrator's main worker (:mod:`decnet.orchestrator.worker`)
|
||||
calls into these modules per tick. LLM glue, persona schema, prompt
|
||||
builder, and the global persona pool moved to :mod:`decnet.realism`
|
||||
in stage 2 of the migration; this package keeps only the
|
||||
email-specific delivery surface.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
49
decnet/orchestrator/emailgen/events.py
Normal file
49
decnet/orchestrator/emailgen/events.py
Normal file
@@ -0,0 +1,49 @@
|
||||
"""DB-row + bus-topic helpers for the emailgen worker.
|
||||
|
||||
Mirror of :mod:`decnet.orchestrator.events` for the email action class.
|
||||
Kept in its own module so the SSH-flavoured orchestrator and the
|
||||
emailgen worker don't accumulate cross-imports of each other's action
|
||||
types.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.orchestrator.drivers.base import ActivityResult
|
||||
from decnet.orchestrator.emailgen.scheduler import EmailAction
|
||||
|
||||
|
||||
def to_row(action: EmailAction, result: ActivityResult) -> dict[str, Any]:
    """Assemble the keyword arguments for an ``OrchestratorEmail`` row.

    ``message_id``, ``subject``, ``language`` and ``eml_path`` are read
    from the driver's ``result.payload``; thread, parent and persona
    addresses come from *action*. A missing payload is treated as an
    empty dict, so absent values fall back to ``""`` — or, for
    ``language``, to the sender's language and finally ``"en"``.
    """
    driver_payload = result.payload or {}
    fallback_language = action.sender.language or "en"
    return dict(
        ts=datetime.now(timezone.utc),
        mail_decky_uuid=action.mail_decky_uuid,
        thread_id=action.thread_id,
        message_id=driver_payload.get("message_id", ""),
        in_reply_to=action.parent_message_id,
        sender_email=action.sender.email,
        recipient_email=action.recipient.email,
        subject=driver_payload.get("subject", ""),
        language=driver_payload.get("language", fallback_language),
        eml_path=driver_payload.get("eml_path", ""),
        success=result.success,
        payload=driver_payload,  # repo serialises dict→json
    )
|
||||
|
||||
|
||||
def topic_for(action: EmailAction) -> str:
    """Return the bus topic for *action*, keyed by its mail decky's UUID."""
    decky_uuid = action.mail_decky_uuid
    return _topics.orchestrator(_topics.ORCHESTRATOR_EMAIL, decky_uuid)
|
||||
|
||||
|
||||
def event_type_for(action: EmailAction) -> str:  # noqa: ARG001 — symmetry
    """Return the event type shared by every email action.

    *action* is not inspected; the parameter exists only so the
    signature lines up with the other ``event_type_for`` helpers.
    """
    email_event_type = _topics.ORCHESTRATOR_EMAIL
    return email_event_type
|
||||
255
decnet/orchestrator/emailgen/scheduler.py
Normal file
255
decnet/orchestrator/emailgen/scheduler.py
Normal file
@@ -0,0 +1,255 @@
|
||||
"""Action picker for the emailgen worker.
|
||||
|
||||
One tick = one (mail-decky, sender, recipient, [thread]) decision.
|
||||
|
||||
Scope:
- Mail-decky discovery uses the union view (``list_running_deckies``):
  topology, fleet, and SWARM-shard deckies are all eligible mail hosts.
- Mail decky = a running decky whose ``services`` includes
  ``imap`` or ``pop3``.
- Personas for topology deckies come from ``Topology.email_personas``
  (JSON list of :class:`EmailPersona`); the topology-wide
  ``language_default`` fills in any persona that didn't set its own.
  Fleet / shard deckies share the global persona pool instead.
|
||||
|
||||
Returns ``None`` (skip tick) when:
|
||||
- no running mail decky,
|
||||
- the mail decky's persona source yields fewer than two valid personas,
|
||||
- nobody is in their ``active_hours`` window right now.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import secrets
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Any, Optional
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.orchestrator.emailgen.threads import (
|
||||
ThreadChain,
|
||||
new_thread_id,
|
||||
references_for_reply,
|
||||
reply_subject,
|
||||
)
|
||||
from decnet.realism import personas_pool as global_pool
|
||||
from decnet.realism.personas import (
|
||||
EmailPersona,
|
||||
in_active_hours,
|
||||
parse_personas,
|
||||
)
|
||||
|
||||
logger = get_logger("orchestrator.emailgen")
|
||||
|
||||
_MAIL_SERVICES = ("imap", "pop3")
|
||||
# Probability of replying on an existing thread when one exists. The
|
||||
# inverse starts a fresh thread. 0.6 mirrors what mailbox studies find
|
||||
# for active corporate inboxes — most messages are replies, but not
|
||||
# overwhelmingly so.
|
||||
_REPLY_PROBABILITY = 0.6
|
||||
|
||||
# Generic context hints fed to the LLM when starting a new thread.
|
||||
# Deliberately broad — the persona's tone + role is what shapes the
|
||||
# email; the hint just gives the model a topic to riff on.
|
||||
_CONTEXT_HINTS: tuple[str, ...] = (
|
||||
"Q3 budget review and approval",
|
||||
"Client presentation feedback",
|
||||
"Project deadline extension request",
|
||||
"Team building event planning",
|
||||
"IT system maintenance notification",
|
||||
"Quarterly performance review",
|
||||
"Vendor onboarding process",
|
||||
"Holiday schedule announcement",
|
||||
"Training session invitation",
|
||||
"Department restructuring update",
|
||||
"Client contract negotiation",
|
||||
"Security audit findings",
|
||||
"Sales strategy meeting",
|
||||
"Product launch timeline",
|
||||
"Office relocation update",
|
||||
"Travel reimbursement policy change",
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EmailAction:
    """One emailgen tick's decision.

    ``thread_id`` is always set: a reply carries the id of the thread it
    continues, a new thread carries a freshly generated id. The worker
    writes it back to the DB so future ticks can chain further replies.
    ``parent_message_id`` / ``references`` mirror the RFC 2822
    ``In-Reply-To`` / ``References`` headers we'll set on the EML; on a
    new thread they are ``None`` and ``""`` respectively.

    ``mail_decky_name`` / ``mail_decky_services`` are denormalised onto
    the action so the driver doesn't need a second repo round-trip just
    to resolve the container name.
    """
    mail_decky_uuid: str
    mail_decky_name: str
    mail_decky_services: tuple[str, ...]
    sender: EmailPersona
    recipient: EmailPersona
    thread_id: str
    parent_message_id: Optional[str]  # None when starting a new thread
    references: str                   # "" when starting a new thread
    subject_hint: Optional[str]       # used as parent subject when replying
    parent_excerpt: Optional[str]     # excerpt from the parent body
    context_hint: str                 # only meaningful on new threads
    is_reply: bool
    description: str = "email:send"
|
||||
|
||||
|
||||
def _is_mail_decky(decky: dict[str, Any]) -> bool:
    """Tell whether *decky* advertises one of the mailbox services.

    A ``services`` value that is still a string (i.e. not a collection
    of service names) never matches.
    """
    advertised = decky.get("services") or []
    if isinstance(advertised, str):
        return False
    for wanted in _MAIL_SERVICES:
        if wanted in advertised:
            return True
    return False
|
||||
|
||||
|
||||
async def _resolve_personas(
    repo: Any, mail_decky: dict[str, Any],
) -> tuple[list[EmailPersona], str]:
    """Resolve the persona list that applies to *mail_decky*.

    Returns ``(personas, source_label)``. The label is the decky row's
    ``source`` value (``"unknown"`` when absent) so skip-tick logs can
    say where the personas were looked up.

    * ``"topology"`` source → personas parsed from the parent topology's
      ``email_personas``, with the topology's ``language_default``
      (``"en"`` when unset) as the fallback language. A missing
      ``topology_id`` or an unknown topology yields an empty list.
    * any other source → the global persona pool.
    """
    source = mail_decky.get("source") or "unknown"
    if source != "topology":
        # Fleet / shard / anything else → global pool.
        return global_pool.load(), source

    topology_id = mail_decky.get("topology_id")
    if not topology_id:
        return [], source

    topology = await repo.get_topology(topology_id)
    if not topology:
        return [], source

    personas = parse_personas(
        topology.get("email_personas"),
        language_default=topology.get("language_default") or "en",
    )
    return personas, source
|
||||
|
||||
|
||||
async def pick(
    repo: Any,
    *,
    rand: Optional[secrets.SystemRandom] = None,
    now: Optional[datetime] = None,
) -> Optional[EmailAction]:
    """Pick one email action against any running mail decky.

    Mail-decky discovery uses the **union view** (``list_running_deckies``):
    every running decky that advertises a mail service is eligible.
    Persona source is per-decky-source; see :func:`_resolve_personas`.
    *now* is the wall-clock used for ``active_hours`` filtering (only its
    ``hour`` is consulted; defaults to ``datetime.now()``) — injected so
    tests can pin the hour deterministically.

    Returns ``None`` (skip tick) when there is no running mail decky,
    when fewer than two personas resolve, when fewer than two personas
    are inside their active hours, or when no active persona has an
    address different from the chosen sender's.
    """
    rng = rand or secrets.SystemRandom()
    now_dt = now or datetime.now()

    deckies = await repo.list_running_deckies()
    mail_deckies = [d for d in deckies if _is_mail_decky(d)]
    if not mail_deckies:
        logger.debug("emailgen pick: no running mail decky")
        return None

    mail_decky = rng.choice(mail_deckies)
    personas, source = await _resolve_personas(repo, mail_decky)
    if len(personas) < 2:
        logger.debug(
            "emailgen pick: source=%s mail_decky=%s only %d personas; need >=2",
            source, mail_decky.get("uuid"), len(personas),
        )
        return None

    active = [p for p in personas if in_active_hours(p, now_dt.hour)]
    if len(active) < 2:
        logger.debug(
            "emailgen pick: source=%s mail_decky=%s only %d personas in-hours",
            source, mail_decky.get("uuid"), len(active),
        )
        return None

    sender = rng.choice(active)
    # Two active personas may share one address (duplicate entries in the
    # persona list); rng.choice on an empty list would raise IndexError,
    # so skip the tick instead of crashing the worker.
    recipients = [p for p in active if p.email != sender.email]
    if not recipients:
        logger.debug(
            "emailgen pick: source=%s mail_decky=%s no recipient distinct from sender",
            source, mail_decky.get("uuid"),
        )
        return None
    recipient = rng.choice(recipients)

    # Look up open threads between this pair on this mail decky.
    chain = await _maybe_pick_chain(
        repo, mail_decky["uuid"], sender, recipient, rng=rng,
    )

    # Fields that are identical for replies and new threads.
    common: dict[str, Any] = {
        "mail_decky_uuid": mail_decky["uuid"],
        "mail_decky_name": mail_decky.get("name") or "",
        "mail_decky_services": tuple(mail_decky.get("services") or ()),
        "sender": sender,
        "recipient": recipient,
    }

    if chain is not None:
        return EmailAction(
            **common,
            thread_id=chain.thread_id,
            parent_message_id=chain.parent_message_id,
            references=references_for_reply(chain),
            subject_hint=chain.parent_subject,
            parent_excerpt=None,  # repo can populate later if useful
            context_hint=chain.parent_subject,
            is_reply=True,
        )

    return EmailAction(
        **common,
        thread_id=new_thread_id(),
        parent_message_id=None,
        references="",
        subject_hint=None,
        parent_excerpt=None,
        context_hint=rng.choice(_CONTEXT_HINTS),
        is_reply=False,
    )
|
||||
|
||||
|
||||
async def _maybe_pick_chain(
    repo: Any,
    mail_decky_uuid: str,
    sender: EmailPersona,
    recipient: EmailPersona,
    *,
    rng: secrets.SystemRandom,
) -> Optional[ThreadChain]:
    """Decide whether this tick replies on an existing thread.

    Rolls against ``_REPLY_PROBABILITY`` first; only a winning roll pays
    for the repository lookup. Returns a :class:`ThreadChain` built from
    the first row the repo returns for the (sender, recipient) pair on
    this mail decky, or ``None`` when the roll fails or no thread exists.
    """
    roll = rng.random()
    if roll >= _REPLY_PROBABILITY:
        return None

    candidates = await repo.list_orchestrator_email_threads(
        mail_decky_uuid, sender.email, recipient.email, limit=20,
    )
    if not candidates:
        return None

    latest = candidates[0]
    # The full ancestry is not reconstructed from row history here:
    # ``references`` stays empty, so only the parent's Message-ID is
    # carried forward when the References header is built (v1).
    return ThreadChain(
        thread_id=latest["thread_id"],
        parent_message_id=latest["message_id"],
        references=tuple(),
        parent_subject=reply_subject(latest["subject"]),
    )
|
||||
75
decnet/orchestrator/emailgen/threads.py
Normal file
75
decnet/orchestrator/emailgen/threads.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""RFC 2822 thread-chain bookkeeping.
|
||||
|
||||
A thread is a worker-side UUID that groups one or more emails between
|
||||
the same two personas. ``In-Reply-To`` carries the immediate parent's
|
||||
``Message-ID``; ``References`` carries the full ancestry chain.
|
||||
|
||||
The emailgen scheduler queries the repository for the most recent email
|
||||
in any thread between (sender, recipient); if it finds one, it emits a
|
||||
reply (continuing the chain). Otherwise it starts a new thread.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ThreadChain:
    """Immutable view of a thread's chain at a point in time.

    ``thread_id`` is opaque (UUID). ``parent_message_id`` is the most
    recent message in the chain — the new reply's ``In-Reply-To`` field.
    ``references`` is the tuple of ancestor message-ids, oldest-first
    (RFC 2822 §3.6.4); :func:`references_for_reply` joins them with
    single spaces and appends the parent to form the ``References:``
    header. ``parent_subject`` carries the subject we're replying to,
    so the reply can prepend ``Re:`` correctly.
    """
    thread_id: str
    parent_message_id: str
    references: tuple[str, ...]
    parent_subject: str
|
||||
|
||||
|
||||
def new_thread_id() -> str:
    """Return a fresh random (UUID4) thread identifier as a string."""
    fresh = uuid.uuid4()
    return str(fresh)
|
||||
|
||||
|
||||
def reply_subject(parent_subject: str) -> str:
    """Return *parent_subject* carrying exactly one leading ``Re:``.

    Any number of existing ``Re:`` prefixes (matched case-insensitively)
    is stripped before a single ``Re: `` is put back, so a chain of
    replies never accumulates ``Re: Re: Re:``.
    """
    remainder = parent_subject.strip()
    # Peel prefixes one at a time; whitespace between them is dropped.
    while remainder[:3].lower() == "re:":
        remainder = remainder[3:].lstrip()
    return "Re: " + remainder
|
||||
|
||||
|
||||
def references_for_reply(chain: Optional[ThreadChain]) -> str:
    """Render the ``References:`` header value for a reply.

    The result is the chain's ``references`` in their stored order
    followed by its ``parent_message_id``, separated by single spaces.
    A ``None`` chain (root message) gives the empty string.
    """
    if chain is not None:
        return " ".join((*chain.references, chain.parent_message_id))
    return ""
|
||||
|
||||
|
||||
def new_message_id(domain: str) -> str:
    """Generate an RFC 2822 ``Message-ID`` value, angle brackets included.

    The local part is a random UUID4 in hex form; the domain part is
    *domain* with surrounding whitespace removed, or ``localhost`` when
    nothing is left after stripping.
    """
    host = domain.strip()
    if not host:
        host = "localhost"
    local_part = uuid.uuid4().hex
    return "<" + local_part + "@" + host + ">"
|
||||
68
decnet/orchestrator/events.py
Normal file
68
decnet/orchestrator/events.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""DB-row + bus-topic helpers for the orchestrator."""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.orchestrator.drivers.base import ActivityResult
|
||||
from decnet.orchestrator.scheduler import (
|
||||
Action,
|
||||
EditAction,
|
||||
FileAction,
|
||||
TrafficAction,
|
||||
)
|
||||
|
||||
|
||||
def to_row(action: Action, result: ActivityResult) -> dict[str, Any]:
    """Assemble the keyword arguments for an ``OrchestratorEvent`` row.

    Every row carries ``ts`` / ``protocol`` / ``success`` / ``payload``;
    the action type then decides ``kind``, ``action`` and the source /
    destination decky UUIDs. Raises ``TypeError`` for an action type
    that is neither traffic, file, nor edit.
    """
    row: dict[str, Any] = {
        "ts": datetime.now(timezone.utc),
        "protocol": "ssh",
        "success": result.success,
        "payload": result.payload,  # repo serialises dict→json
    }
    if isinstance(action, TrafficAction):
        row["kind"] = "traffic"
        row["action"] = f"exec:{action.description}"
        row["src_decky_uuid"] = action.src_uuid
        row["dst_decky_uuid"] = action.dst_uuid
        return row
    if isinstance(action, (FileAction, EditAction)):
        # Creates and edits share the "file" kind; the action's own
        # description ("file:create" / "file:edit" by default) is what
        # lets queries tell them apart.
        row["kind"] = "file"
        row["action"] = action.description
        row["src_decky_uuid"] = None
        row["dst_decky_uuid"] = action.dst_uuid
        return row
    raise TypeError(f"unsupported action type: {type(action)!r}")
|
||||
|
||||
|
||||
def topic_for(action: Action) -> str:
    """Return the bus topic for *action*, keyed by its destination decky.

    Traffic actions use the traffic topic family; file creates and edits
    share the file family. Raises ``TypeError`` for any other type.
    """
    if isinstance(action, TrafficAction):
        family = _topics.ORCHESTRATOR_TRAFFIC
    elif isinstance(action, (FileAction, EditAction)):
        family = _topics.ORCHESTRATOR_FILE
    else:
        raise TypeError(f"unsupported action type: {type(action)!r}")
    return _topics.orchestrator(family, action.dst_uuid)
|
||||
|
||||
|
||||
def event_type_for(action: Action) -> str:
    """Return the event-type constant matching *action*'s class.

    Raises ``TypeError`` when *action* is not a traffic, file, or edit
    action.
    """
    if isinstance(action, TrafficAction):
        event_type = _topics.ORCHESTRATOR_TRAFFIC
    elif isinstance(action, (FileAction, EditAction)):
        event_type = _topics.ORCHESTRATOR_FILE
    else:
        raise TypeError(f"unsupported action type: {type(action)!r}")
    return event_type
|
||||
340
decnet/orchestrator/scheduler.py
Normal file
340
decnet/orchestrator/scheduler.py
Normal file
@@ -0,0 +1,340 @@
|
||||
"""Action picker for the orchestrator.
|
||||
|
||||
Stage-3 realism: file actions are sourced from
|
||||
:func:`decnet.realism.planner.pick`, not the old hardcoded
|
||||
``_FILE_TEMPLATES``/``_USERS`` constants. Persona resolution per
|
||||
decky still belongs here (the realism planner is pure of
|
||||
:class:`~decnet.web.db.repository.BaseRepository` knowledge) — we
|
||||
walk each decky to either ``Topology.email_personas`` or the
|
||||
``decnet.realism.personas_pool`` global pool, depending on
|
||||
``decky["source"]``, then hand the resolved set to the planner.
|
||||
|
||||
TrafficAction stays untouched: still a flat random pair-pick of
|
||||
SSH-capable deckies. Email actions land in stage 5 of the realism
|
||||
migration when the emailgen worker collapses into the orchestrator.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import secrets
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Optional, Sequence
|
||||
|
||||
from decnet.realism import personas_pool
|
||||
from decnet.realism.personas import EmailPersona, parse_personas
|
||||
from decnet.realism.planner import pick as _realism_pick
|
||||
from decnet.realism.taxonomy import ContentClass, Plan
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TrafficAction:
    """One inter-decky traffic request from a source to a destination decky.

    :func:`pick` fills the ``src_*`` / ``dst_*`` fields from the chosen
    decky dicts' ``uuid`` / ``name`` / ``ip`` keys and leaves
    ``protocol`` and ``description`` at their defaults.
    """
    src_uuid: str
    src_name: str
    dst_uuid: str
    dst_name: str
    dst_ip: str
    protocol: str = "ssh"
    description: str = "tcp_connect:22"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FileAction:
    """One file plant request the SSH driver materialises.

    Stage-3 realism: ``persona`` / ``content_class`` / ``mtime`` are
    populated when the action came through :func:`pick_file`. Older
    direct constructions (tests, manual operator drives) leave them
    at the defaults — back-compat for the pre-realism call sites
    that haven't migrated yet.
    """
    dst_uuid: str
    dst_name: str
    path: str
    # Text body of the file; ignored when ``content_bytes`` is set.
    content: str
    persona: str = ""
    content_class: str = ContentClass.NOTE.value
    mtime: Optional[datetime] = None
    description: str = "file:create"
    # Canary artifacts (DOCX/PDF/honeydoc binaries) carry their bytes
    # here so re-encoding ``content`` from utf-8 doesn't mangle them.
    # When set, the SSH driver uses these bytes directly and ignores
    # ``content``.
    content_bytes: Optional[bytes] = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EditAction:
    """Read-modify-write of an existing synthetic file.

    Stage 3b of the realism migration: a previously-planted ``TODO.md``
    gets a checkbox flipped, a notes file gets a new line appended, a
    cron log gets a fresh entry tacked on. ``synthetic_file_uuid`` is
    the row in ``synthetic_files`` to update; ``previous_body`` is
    what the planner already saw so the driver doesn't double-fetch.
    """
    dst_uuid: str
    dst_name: str
    path: str
    persona: str
    content_class: str
    # :func:`pick_file` passes "" here when the plan has no previous body.
    previous_body: str
    # :func:`pick_file` passes "" here when the edit candidate has no uuid.
    synthetic_file_uuid: str
    mtime: Optional[datetime] = None
    description: str = "file:edit"
|
||||
|
||||
|
||||
Action = TrafficAction | FileAction | EditAction
|
||||
|
||||
|
||||
def _has_ssh(decky: dict[str, Any]) -> bool:
    """Tell whether *decky* lists ``ssh`` among its services.

    A missing/empty ``services`` value, or one that is still a raw
    string (not deserialised — treated as "we don't know"), counts as
    no SSH.
    """
    listed = decky.get("services")
    if not listed or isinstance(listed, str):
        return False
    return "ssh" in listed
|
||||
|
||||
|
||||
def pick(
    deckies: Sequence[dict[str, Any]],
    *,
    rand: Optional[secrets.SystemRandom] = None,
) -> Optional[Action]:
    """Choose one *traffic* action among the given deckies.

    Only deckies that list ``ssh`` and have a truthy ``ip`` take part.
    Two distinct deckies are sampled — the first becomes the source,
    the second the destination. Returns ``None`` when fewer than two
    such deckies exist. File actions come from :func:`pick_file`
    instead (async — it needs the repo for persona resolution).
    """
    chooser = rand or secrets.SystemRandom()

    reachable = []
    for decky in deckies:
        if _has_ssh(decky) and decky.get("ip"):
            reachable.append(decky)
    if len(reachable) < 2:
        return None

    origin, target = chooser.sample(reachable, 2)
    return TrafficAction(
        src_uuid=origin["uuid"],
        src_name=origin["name"],
        dst_uuid=target["uuid"],
        dst_name=target["name"],
        dst_ip=target["ip"],
    )
|
||||
|
||||
|
||||
async def pick_file(
    deckies: Sequence[dict[str, Any]],
    repo: Any,
    *,
    now: Optional[datetime] = None,
    rand: Optional[secrets.SystemRandom] = None,
    llm: Any = None,
    llm_breaker: Any = None,
    llm_timeout: float = 60.0,
) -> Optional[Action]:
    """Realism-driven file action — create or edit.

    Steps:

    1. Attach personas to every eligible decky (:func:`_resolve_personas`).
    2. On roughly half the ticks, ask the repo for one editable synthetic
       file on a randomly chosen eligible decky.
    3. Hand the deckies, timestamp and optional edit candidate to
       :func:`decnet.realism.planner.pick`.
    4. Map the returned plan to an :class:`EditAction` (edit) or a
       :class:`FileAction` (canary artifact or plain create, the latter
       optionally with an LLM-authored body).

    Returns ``None`` when no decky resolves to a non-empty persona list
    or when the planner returns no plan.
    """
    rng = rand or secrets.SystemRandom()
    when = now or datetime.now(timezone.utc)

    enriched = await _resolve_personas(deckies, repo)
    if not enriched:
        return None

    # Only about half the ticks pay for the repo lookup; the planner
    # still makes the final create / edit decision. The decky is chosen
    # client-side (cheap) and the repo is asked for a single row.
    edit_candidate = None
    if rng.random() < 0.5:
        candidate_decky = rng.choice(enriched)
        try:
            found = await repo.pick_random_synthetic_file_for_edit(
                candidate_decky["uuid"],
            )
        except Exception:  # noqa: BLE001
            found = None
        if found is not None:
            edit_candidate = {**found, "decky_name": candidate_decky["name"]}

    plan = _realism_pick(enriched, when, edit_candidate=edit_candidate, rand=rng)
    if plan is None:
        return None

    if plan.action == "edit":
        return EditAction(
            dst_uuid=plan.decky_uuid,
            dst_name=plan.decky_name,
            path=plan.target_path,
            persona=plan.persona,
            content_class=plan.content_class.value,
            previous_body=plan.previous_body or "",
            synthetic_file_uuid=(edit_candidate or {}).get("uuid", ""),
            mtime=plan.mtime,
        )

    # Fields every FileAction built below has in common.
    shared = {
        "dst_uuid": plan.decky_uuid,
        "dst_name": plan.decky_name,
        "persona": plan.persona,
        "content_class": plan.content_class.value,
        "mtime": plan.mtime,
    }

    # Canary branch — the cultivator produces the artifact (bytes and
    # placement path); it is mapped onto a FileAction so the SSH
    # driver's file-plant path is reused unchanged.
    if plan.content_class.is_canary():
        try:
            from decnet.canary import cultivator as _cultivator
            artifact = await _cultivator.cultivate(plan, repo)
        except Exception:  # noqa: BLE001
            # Cultivation failed; plant an inert file instead so the
            # tick isn't wasted.
            inert_path = plan.target_path or f"/tmp/.cache-{secrets.token_hex(3)}"  # nosec B108
            return FileAction(
                path=inert_path,
                content=plan.body_hint or "",
                **shared,
            )
        return FileAction(
            path=artifact.path,
            content="",  # ignored when content_bytes is set
            content_bytes=artifact.content,
            **shared,
        )

    # Create branch. The deterministic body_hint is the starting body;
    # when an LLM is wired and the class is a user class, the body is
    # replaced by whatever make_body_with_llm returns.
    body = plan.body_hint or ""
    if llm is not None and plan.content_class.is_user_class():
        persona_obj = _persona_by_name(enriched, plan.persona)
        if persona_obj is not None:
            from decnet.realism.bodies import make_body_with_llm
            body = await make_body_with_llm(
                plan.content_class,
                persona_obj,
                llm=llm,
                breaker=llm_breaker,
                timeout=llm_timeout,
                rand=rng,
            )
    return FileAction(path=plan.target_path, content=body, **shared)
|
||||
|
||||
|
||||
def _persona_by_name(
    enriched: list[dict[str, Any]], name: str,
) -> Optional[EmailPersona]:
    """Return the first persona called *name* across *enriched*, else ``None``.

    Deckies are scanned in order, each one's ``_realism_personas`` list
    (treated as empty when missing) in order.
    """
    matches = (
        persona
        for decky in enriched
        for persona in (decky.get("_realism_personas") or [])
        if persona.name == name
    )
    return next(matches, None)
|
||||
|
||||
|
||||
async def _resolve_personas(
    deckies: Sequence[dict[str, Any]],
    repo: Any,
) -> list[dict[str, Any]]:
    """Return copies of the eligible deckies with personas attached.

    Each returned dict is the original decky plus a
    ``_realism_personas`` key (list of :class:`EmailPersona`), which is
    what the realism planner consumes; the repo lookups happen here so
    the planner needs no DB access.

    * Deckies without ``ssh`` are dropped — files are planted via SSH.
    * ``source == "topology"`` with a ``topology_id`` → personas of that
      topology, fetched at most once per call; a failed fetch counts as
      "no personas".
    * Everything else (fleet, shard, unknown source) → the global pool,
      loaded at most once per call.
    * Deckies whose persona list ends up empty are dropped.
    """
    resolved: list[dict[str, Any]] = []
    per_topology: dict[str, list[EmailPersona]] = {}
    pool: Optional[list[EmailPersona]] = None

    for decky in deckies:
        if not _has_ssh(decky):
            continue

        origin = (decky.get("source") or "").lower()
        topology_id = decky.get("topology_id")

        if origin == "topology" and topology_id:
            if topology_id in per_topology:
                personas = per_topology[topology_id]
            else:
                try:
                    topology = await repo.get_topology(topology_id)
                except Exception:  # noqa: BLE001
                    topology = None
                personas = _topology_personas(topology)
                per_topology[topology_id] = personas
        else:
            if pool is None:
                # Loaded lazily so calls with only topology deckies
                # never touch the global pool.
                pool = personas_pool.load()
            personas = pool

        if personas:
            resolved.append({**decky, "_realism_personas": personas})

    return resolved
|
||||
|
||||
|
||||
def _topology_personas(topology: Optional[dict[str, Any]]) -> list[EmailPersona]:
|
||||
if not topology:
|
||||
return []
|
||||
raw = topology.get("email_personas")
|
||||
if raw is None:
|
||||
return []
|
||||
if isinstance(raw, list):
|
||||
return parse_personas(raw, language_default=topology.get("language_default") or "en")
|
||||
if isinstance(raw, str):
|
||||
try:
|
||||
return parse_personas(json.loads(raw), language_default=topology.get("language_default") or "en")
|
||||
except json.JSONDecodeError:
|
||||
return []
|
||||
return []
|
||||
|
||||
|
||||
# Lightweight no-op alias kept so external callers that already import
|
||||
# ``Plan`` from the scheduler keep working through the migration.
|
||||
__all__ = [
|
||||
"Action",
|
||||
"EditAction",
|
||||
"FileAction",
|
||||
"Plan",
|
||||
"TrafficAction",
|
||||
"pick",
|
||||
"pick_file",
|
||||
]
|
||||
513
decnet/orchestrator/worker.py
Normal file
513
decnet/orchestrator/worker.py
Normal file
@@ -0,0 +1,513 @@
|
||||
"""Orchestrator main loop.
|
||||
|
||||
One tick = one action pick + one driver invocation + one DB write +
|
||||
one fire-and-forget bus publish. Intentionally serial — MVP honesty:
|
||||
a wedged docker exec stalls only this worker, never another.
|
||||
|
||||
Three action shapes are folded into the single tick after stage 5 of
|
||||
the realism migration: SSH traffic between deckies, file plants on
|
||||
deckies (driven by :func:`decnet.realism.planner.pick`), and email
|
||||
drops into mail-decky maildirs (driven by
|
||||
:func:`decnet.orchestrator.emailgen.scheduler.pick`). ``decnet
|
||||
emailgen`` and ``decnet-emailgen.service`` are gone; this worker
|
||||
covers all three.
|
||||
|
||||
Modeled after :mod:`decnet.profiler.worker` for consistency: same
|
||||
control listener, same heartbeat helper, same shutdown semantics.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import hashlib
|
||||
import os
|
||||
import secrets
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Optional
|
||||
|
||||
from decnet.bus.factory import get_bus
|
||||
from decnet.bus.publish import (
|
||||
publish_safely,
|
||||
run_control_listener,
|
||||
run_health_heartbeat,
|
||||
)
|
||||
from decnet.logging import get_logger
|
||||
from decnet.orchestrator import events, scheduler
|
||||
from decnet.orchestrator.drivers import get_driver_for
|
||||
from decnet.orchestrator.emailgen import (
|
||||
events as email_events,
|
||||
scheduler as email_scheduler,
|
||||
)
|
||||
from decnet.orchestrator.emailgen.scheduler import EmailAction
|
||||
from decnet.realism import planner as realism_planner
|
||||
from decnet.realism.llm.circuit import LLMCircuitBreaker
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
|
||||
logger = get_logger("orchestrator")
|
||||
|
||||
# Periodic-prune knobs. Every _PRUNE_EVERY_TICKS ticks the worker trims
# per-decky history so orchestrator_events / orchestrator_emails cannot
# grow without bound on long-running fleets. The write path pays nothing
# per tick; the cost is paid once every ~100 ticks.
_PRUNE_EVERY_TICKS = 100
_PRUNE_PER_DST_CAP = 10_000
_PRUNE_PER_MAIL_DECKY_CAP = 5_000

# Planner weights are re-read from realism_config every N ticks.
# Operator tunables drift slowly, so ~minute-scale latency between a PUT
# and its effect is acceptable. There is deliberately no bus signal for
# this: polling keeps the path simple and the orchestrator
# self-contained.
_REALISM_CONFIG_REFRESH_TICKS = 5

# Relative weights for the per-tick action-kind roll. Email is kept rare
# because each LLM round-trip is expensive (~seconds) and the prior
# emailgen worker only ticked every 5 minutes. At a 60s orchestrator
# interval a 10% email weight yields roughly one email every ~10
# minutes, close enough to the pre-collapse cadence.
_ACTION_WEIGHTS: tuple[tuple[str, int], ...] = (
    ("traffic", 45),
    ("file", 45),
    ("email", 10),
)
|
||||
|
||||
|
||||
async def orchestrator_worker(
    repo: BaseRepository,
    *,
    interval: int = 60,
    llm_enabled: Optional[bool] = None,
) -> None:
    """Run the orchestrator loop until a shutdown is signalled.

    Long-lived asyncio task: every *interval* seconds it performs one
    tick of synthetic activity against the running fleet. Graceful
    shutdown arrives over the bus control topic
    (``system.orchestrator.control``).

    LLM enrichment of user-class file bodies is opt-in through the
    ``DECNET_REALISM_LLM`` env var (``ollama`` / ``fake`` / empty);
    ``llm_enabled=False`` from the CLI overrides it
    (``decnet orchestrate --no-llm``). When the LLM is unreachable or
    wedged, a process-local circuit breaker
    (:class:`LLMCircuitBreaker`) trips after 3 consecutive failures and
    the worker falls back to deterministic templates for 60 seconds
    before re-probing.

    Args:
        repo: Repository used for every DB read and write.
        interval: Seconds to wait between ticks.
        llm_enabled: Explicit LLM on/off switch; ``None`` defers to the
            environment.
    """
    logger.info("orchestrator worker started interval=%ds", interval)

    # --- optional LLM backend -------------------------------------------
    llm: Any = None
    breaker: Optional[LLMCircuitBreaker] = None
    if _llm_should_enable(llm_enabled):
        try:
            # Imported lazily so a disabled LLM never loads the backend.
            from decnet.realism.llm import get_llm

            llm = get_llm()
            breaker = LLMCircuitBreaker()
            logger.info(
                "orchestrator: LLM enrichment enabled backend=%s model=%s",
                os.environ.get("DECNET_REALISM_LLM", "ollama"),
                getattr(llm, "model", "?"),
            )
        except Exception as exc:  # noqa: BLE001
            logger.warning(
                "orchestrator: LLM init failed, continuing without "
                "enrichment: %s", exc,
            )
            llm = None

    # --- optional bus connection ----------------------------------------
    bus = None
    try:
        bus = get_bus(client_name="orchestrator")
        await bus.connect()
    except Exception as exc:  # noqa: BLE001
        logger.warning(
            "orchestrator: bus unavailable, continuing without publish: %s", exc
        )
        bus = None

    # Load operator-tuned weights before the first tick so the worker
    # starts with the operator's intent rather than baked-in defaults.
    await _refresh_realism_config(repo)

    def _heartbeat_extra() -> dict[str, Any]:
        # Re-evaluated on each call so breaker state is always current.
        return {"realism": _realism_health_snapshot(llm, breaker)}

    shutdown = asyncio.Event()
    heartbeat_task = asyncio.create_task(
        run_health_heartbeat(bus, "orchestrator", extra=_heartbeat_extra),
    )
    control_task = asyncio.create_task(
        run_control_listener(bus, "orchestrator", shutdown),
    )

    ticks_done = 0
    try:
        while not shutdown.is_set():
            # Sleep one interval, waking early if shutdown is set; a
            # timeout simply means "time for the next tick".
            with contextlib.suppress(asyncio.TimeoutError):
                await asyncio.wait_for(shutdown.wait(), timeout=interval)
            if shutdown.is_set():
                break

            try:
                await _one_tick(repo, bus, llm=llm, breaker=breaker)
            except Exception as exc:  # noqa: BLE001
                logger.error("orchestrator tick failed: %s", exc)

            # Failed ticks count too, so maintenance cadence never stalls.
            ticks_done += 1
            if ticks_done % _PRUNE_EVERY_TICKS == 0:
                await _periodic_prune(repo)
            if ticks_done % _REALISM_CONFIG_REFRESH_TICKS == 0:
                await _refresh_realism_config(repo)
    finally:
        # Stop the helper tasks one at a time, then release the bus.
        for task in (heartbeat_task, control_task):
            task.cancel()
            with contextlib.suppress(Exception, asyncio.CancelledError):
                await task
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()
|
||||
|
||||
|
||||
async def _periodic_prune(repo: BaseRepository) -> None:
    """Trim the orchestrator event and email tables to their caps.

    The two prunes are attempted independently, events first: a failure
    in either one is logged at error level and swallowed, so the other
    still runs and nothing propagates to the caller.
    """
    # (prune starter, cap, success log format, failure log format).
    # The starters are lambdas so the repo call happens inside the try.
    jobs = (
        (
            lambda: repo.prune_orchestrator_events(
                per_dst_cap=_PRUNE_PER_DST_CAP,
            ),
            _PRUNE_PER_DST_CAP,
            "orchestrator events prune deleted=%d cap=%d",
            "orchestrator events prune failed: %s",
        ),
        (
            lambda: repo.prune_orchestrator_emails(
                per_decky_cap=_PRUNE_PER_MAIL_DECKY_CAP,
            ),
            _PRUNE_PER_MAIL_DECKY_CAP,
            "orchestrator emails prune deleted=%d cap=%d",
            "orchestrator emails prune failed: %s",
        ),
    )
    for start_prune, cap, done_fmt, failed_fmt in jobs:
        try:
            removed = await start_prune()
            if removed:
                logger.info(done_fmt, removed, cap)
        except Exception as exc:  # noqa: BLE001
            logger.error(failed_fmt, exc)
|
||||
|
||||
|
||||
async def _refresh_realism_config(repo: BaseRepository) -> None:
    """Pull operator-tuned weights from realism_config into the planner.

    Every failure mode (DB unreachable, malformed or undecodable JSON,
    a payload that is not an object, a validation reject from the
    planner) is logged as a warning and leaves the planner's current
    weights in place. The orchestrator keeps ticking with whatever it
    had and never blocks on config.

    Args:
        repo: Repository providing ``get_realism_config``.
    """
    import json  # only this helper needs it

    try:
        row = await repo.get_realism_config("weights")
    except Exception as exc:  # noqa: BLE001
        logger.warning("realism config refresh: DB read failed: %s", exc)
        return
    if row is None:
        return  # no overrides set; defaults stand

    raw = row.get("value") or "{}"
    if isinstance(raw, (str, bytes, bytearray)):
        try:
            payload = json.loads(raw)
        except ValueError as exc:
            # ValueError covers json.JSONDecodeError and the
            # UnicodeDecodeError raised for undecodable bytes.
            logger.warning("realism config refresh: malformed JSON: %s", exc)
            return
    else:
        # The value is already decoded (not text), so json.loads would
        # raise TypeError. Pass it through to the shape check instead.
        payload = raw

    if not isinstance(payload, dict):
        logger.warning("realism config refresh: payload not an object")
        return
    try:
        realism_planner.apply_payload(payload)
    except ValueError as exc:
        logger.warning("realism config refresh: rejected payload: %s", exc)
|
||||
|
||||
|
||||
def _roll_action_kind(rng: secrets.SystemRandom) -> str:
    """Draw one action kind at random, weighted by ``_ACTION_WEIGHTS``.

    A single integer is drawn from ``1..sum(weights)`` and the table is
    walked in order, consuming each weight until the draw is used up.
    """
    remaining = rng.randint(1, sum(weight for _, weight in _ACTION_WEIGHTS))
    for name, weight in _ACTION_WEIGHTS:
        remaining -= weight
        if remaining <= 0:
            return name
    # Not reachable for a draw within the total; keeps the return type total.
    return _ACTION_WEIGHTS[-1][0]
|
||||
|
||||
|
||||
def _realism_health_snapshot(
    llm: Any, breaker: Optional[LLMCircuitBreaker],
) -> dict[str, Any]:
    """Describe the realism subsystem for the heartbeat ``extra`` payload.

    Reports the LLM backend, model and circuit-breaker state so a
    consumer can render status without reaching into worker memory.
    The function is read-only and always returns the same four keys;
    with the LLM disabled (``llm is None``) every field except
    ``llm_enabled`` is ``None``, so consumers can branch on
    ``llm_enabled`` alone.
    """
    enabled = llm is not None
    snapshot: dict[str, Any] = {
        "llm_enabled": enabled,
        "llm_backend": None,
        "llm_model": None,
        "llm_breaker_state": None,
    }
    if not enabled:
        return snapshot

    snapshot["llm_backend"] = os.environ.get("DECNET_REALISM_LLM", "ollama")
    snapshot["llm_model"] = getattr(llm, "model", None)
    if breaker is not None:
        snapshot["llm_breaker_state"] = breaker.state
    return snapshot
|
||||
|
||||
|
||||
def _llm_should_enable(explicit: Optional[bool]) -> bool:
    """Decide whether LLM enrichment is on, from CLI flag or environment.

    An *explicit* value (``--llm`` / ``--no-llm``) always wins. With no
    explicit value the ``DECNET_REALISM_LLM`` env var decides, compared
    after trimming whitespace and lower-casing: unset, empty, ``off``,
    ``none``, ``0``, ``false`` or ``disabled`` turn it off; any other
    value (``ollama`` / ``fake`` / etc.) turns it on.
    """
    if explicit is None:
        setting = os.environ.get("DECNET_REALISM_LLM", "").strip().lower()
        return setting not in ("", "off", "none", "0", "false", "disabled")
    return explicit
|
||||
|
||||
|
||||
async def _pick_action(
    repo: BaseRepository,
    deckies: list[dict],
    rng: secrets.SystemRandom,
    *,
    llm: Any = None,
    breaker: Optional[LLMCircuitBreaker] = None,
):
    """Choose the action for this tick, or ``None`` if nothing is possible.

    One action kind is rolled and tried first; if its picker yields
    nothing, the remaining kinds are tried in ``_ACTION_WEIGHTS`` order.
    That way a fleet shape (decky set, persona pool, mail decky) that
    silences one branch does not waste the whole tick.
    """
    # Rolled kind first, then every other kind in table order, no repeats.
    rolled = _roll_action_kind(rng)
    try_order = list(
        dict.fromkeys([rolled, *(name for name, _ in _ACTION_WEIGHTS)])
    )

    for kind in try_order:
        candidate = None
        if kind == "traffic":
            candidate = scheduler.pick(deckies, rand=rng)
        elif kind == "file":
            candidate = await scheduler.pick_file(
                deckies, repo, rand=rng,
                llm=llm, llm_breaker=breaker,
            )
        elif kind == "email":
            # Only the email picker is treated as best-effort: a failure
            # is logged and the branch counts as quiet.
            try:
                candidate = await email_scheduler.pick(repo, rand=rng)
            except Exception as exc:  # noqa: BLE001
                logger.debug("orchestrator: email pick failed: %s", exc)
                candidate = None
        if candidate is not None:
            return candidate
    return None
|
||||
|
||||
|
||||
async def _one_tick(
    repo: BaseRepository,
    bus,
    *,
    llm: Any = None,
    breaker: Optional[LLMCircuitBreaker] = None,
) -> None:
    """Run a single orchestrator tick: pick, execute, persist, publish.

    Lists the running deckies, picks one action, runs it through the
    matching driver and records the outcome. Email actions are persisted
    through the email path; all others through the event path. After a
    successful file plant or edit, the ``synthetic_files`` bookkeeping is
    updated as well; a failure there is logged as a warning and does not
    fail the tick.
    """
    deckies = await repo.list_running_deckies()
    rng = secrets.SystemRandom()

    action = await _pick_action(repo, deckies, rng, llm=llm, breaker=breaker)
    if action is None:
        # Nothing to do: log enough context to see why the fleet is quiet.
        ssh_eligible = len([
            decky for decky in deckies
            if isinstance(decky.get("services"), list)
            and "ssh" in decky["services"]
            and decky.get("ip")
        ])
        by_source: dict[str, int] = {}
        for decky in deckies:
            origin = decky.get("source", "unknown")
            by_source.setdefault(origin, 0)
            by_source[origin] += 1
        logger.debug(
            "orchestrator: no actionable deckies "
            "(running=%d ssh_eligible=%d sources=%s)",
            len(deckies), ssh_eligible, by_source,
        )
        return

    driver = get_driver_for(action)
    result = await driver.run(action)

    if isinstance(action, EmailAction):
        await _persist_email(repo, action, result, bus)
        return

    await _persist_event(repo, action, result, bus)
    if not result.success:
        return

    # Successful file plants and edits also update synthetic_files.
    if isinstance(action, scheduler.FileAction):
        try:
            await _record_synthetic_file(repo, action)
        except Exception as exc:  # noqa: BLE001
            logger.warning(
                "orchestrator: synthetic_files write failed dst=%s path=%s: %s",
                action.dst_uuid, action.path, exc,
            )
    elif isinstance(action, scheduler.EditAction):
        try:
            await _bump_synthetic_file_after_edit(repo, action, result)
        except Exception as exc:  # noqa: BLE001
            logger.warning(
                "orchestrator: synthetic_files edit-bump failed "
                "dst=%s path=%s: %s",
                action.dst_uuid, action.path, exc,
            )
|
||||
|
||||
|
||||
async def _persist_event(repo, action, result, bus) -> None:
    """Store a non-email tick result and publish it on the bus.

    The row built by ``events.to_row`` is written first. When a bus is
    connected, a summary of that row plus the driver's raw
    ``result.payload`` is then published on the action's topic. An info
    line is logged in either case.
    """
    row = events.to_row(action, result)
    await repo.record_orchestrator_event(row)

    if bus is not None:
        topic = events.topic_for(action)
        # Fields copied from the persisted row; src_decky_uuid is read
        # with .get() because it may be absent from the row.
        message = {
            "kind": row["kind"],
            "protocol": row["protocol"],
            "action": row["action"],
            "src_decky_uuid": row.get("src_decky_uuid"),
            "dst_decky_uuid": row["dst_decky_uuid"],
            "success": row["success"],
            "payload": result.payload,
            "ts": row["ts"].isoformat(),
        }
        event_type = events.event_type_for(action)
        await publish_safely(bus, topic, message, event_type=event_type)

    logger.info(
        "orchestrator tick kind=%s success=%s dst=%s",
        row["kind"], row["success"], row["dst_decky_uuid"],
    )
|
||||
|
||||
|
||||
async def _persist_email(repo, action: EmailAction, result, bus) -> None:
    """Store an email tick result and publish it on the bus.

    The published payload mirrors the pre-collapse emailgen worker's
    shape exactly, so SSE subscribers and dashboards keep working with
    no breaking change on the wire.
    """
    row = email_events.to_row(action, result)
    await repo.record_orchestrator_email(row)

    if bus is not None:
        topic = email_events.topic_for(action)
        # Row columns copied verbatim, in wire order, between the fixed
        # "kind" marker and the ISO-formatted timestamp.
        copied_fields = (
            "mail_decky_uuid",
            "thread_id",
            "message_id",
            "in_reply_to",
            "sender_email",
            "recipient_email",
            "subject",
            "language",
            "success",
        )
        message = {"kind": "email"}
        for field in copied_fields:
            message[field] = row[field]
        message["ts"] = row["ts"].isoformat()
        await publish_safely(
            bus, topic, message,
            event_type=email_events.event_type_for(action),
        )

    logger.info(
        "orchestrator tick kind=email mail_decky=%s thread=%s success=%s reply=%s",
        row["mail_decky_uuid"], row["thread_id"], row["success"], action.is_reply,
    )
|
||||
|
||||
|
||||
async def _bump_synthetic_file_after_edit(repo, action, result) -> None:
    """Update the ``synthetic_files`` row after a successful EditAction.

    Increments ``edit_count`` and refreshes ``last_modified``; when the
    driver reported a non-empty ``result.payload["new_body"]``, it also
    stores that body and its SHA-256 as ``content_hash``. The hash is
    computed here so the orchestrator, not the driver, owns that field.

    Nothing is written when the action carries no
    ``synthetic_file_uuid`` or when the row is not among the decky's
    listed files (for example, it was pruned between planning and
    execution). Resurrecting a pruned row is not this layer's job.
    """
    file_uuid = action.synthetic_file_uuid
    if not file_uuid:
        return

    new_body = result.payload.get("new_body", "")

    # NOTE(review): only the first 200 rows for the decky are searched;
    # presumably a row beyond that window is treated as pruned. Confirm
    # against the repository's ordering.
    candidates = await repo.list_synthetic_files(
        decky_uuid=action.dst_uuid, limit=200,
    )
    current = None
    for candidate in candidates:
        if candidate.get("uuid") == file_uuid:
            current = candidate
            break
    if current is None:
        return  # candidate was pruned mid-flight; skip silently

    changes: dict = {
        "last_modified": datetime.now(timezone.utc),
        "edit_count": int(current.get("edit_count", 0)) + 1,
    }
    if new_body:
        digest = hashlib.sha256(new_body.encode("utf-8"))
        changes["content_hash"] = digest.hexdigest()
        changes["last_body"] = new_body
    await repo.update_synthetic_file(file_uuid, changes)
|
||||
|
||||
|
||||
async def _record_synthetic_file(repo, action) -> None:
    """Insert, or patch, the synthetic_files row for a FileAction plant.

    Idempotent on ``(decky_uuid, path)``. A fresh row is inserted first.
    If the insert raises (expected when the unique constraint fires
    because the file already existed), the existing row for the same path
    is patched instead: ``last_modified`` / ``content_hash`` /
    ``last_body`` are refreshed and ``edit_count`` is bumped, so the
    dashboard's "files this decky has grown" view stays accurate on
    re-plants.

    Raises:
        Exception: the original insert error, when no existing row with
            the same path is found to patch.
    """
    body = action.content or ""
    digest = hashlib.sha256(body.encode("utf-8")).hexdigest()
    stamp = datetime.now(timezone.utc)

    try:
        await repo.record_synthetic_file({
            "decky_uuid": action.dst_uuid,
            "path": action.path,
            "persona": action.persona,
            "content_class": action.content_class,
            "created_at": stamp,
            "last_modified": stamp,
            "edit_count": 0,
            "content_hash": digest,
            "last_body": body,
        })
    except Exception:  # noqa: BLE001
        # Any insert failure is treated as "row already exists": look
        # the row up by path and patch it.
        known = await repo.list_synthetic_files(
            decky_uuid=action.dst_uuid, limit=200,
        )
        prior = None
        for candidate in known:
            if candidate.get("path") == action.path:
                prior = candidate
                break
        if prior is None:
            raise  # not a duplicate we can patch: surface the insert error
        refreshed = {
            "last_modified": stamp,
            "content_hash": digest,
            "last_body": body,
            "edit_count": int(prior.get("edit_count", 0)) + 1,
        }
        await repo.update_synthetic_file(prior["uuid"], refreshed)
|
||||
Reference in New Issue
Block a user