feat(orchestrator): MVP synthetic life-injection worker (SSH only)

Adds a new decnet orchestrate worker whose job is to keep the honeypot
ecosystem from looking suspiciously static — a frozen LAN with no
inter-host traffic and no filesystem aging is its own honeypot tell.

MVP scope:
- New OrchestratorEvent table + repo methods (purpose-built sibling
  to Log so synthetic events stay separable from attacker-driven ones).
- New orchestrator.{activity,file}.<decky_id> bus topics +
  system.orchestrator.health heartbeat.
- SSH-only driver. Traffic action runs python3 inside src container
  to TCP-connect dst:22 and read the SSH banner — real on-the-wire
  SSH-protocol traffic without shipping creds. File action drops or
  refreshes a small file via docker exec on the destination.
- Random scheduler (50/50 traffic/file when >=2 SSH-capable deckies
  are running). Diurnal shaping, role-aware pairing, and session-aware
  backoff are explicit non-goals for MVP.
- CLI registration, systemd unit (SupplementaryGroups=docker),
  worker-registry entry so the dashboard shows orchestrator health.
- 11 tests: scheduler policy, driver argv shape + injection-safety,
  end-to-end one-tick integration with FakeBus + SQLite.
This commit is contained in:
2026-04-26 19:43:20 -04:00
parent cc2deb73f7
commit 4c37ece39e
21 changed files with 972 additions and 1 deletions

View File

@@ -0,0 +1,9 @@
"""DECNET orchestrator — synthetic life-injection worker.
Drives realistic-looking activity between deckies (inter-decky traffic and
in-decky filesystem mutations) so the honeypot stops looking suspiciously
static. Sole writer of the ``OrchestratorEvent`` table.
"""
from decnet.orchestrator.worker import orchestrator_worker
__all__ = ["orchestrator_worker"]

View File

@@ -0,0 +1,5 @@
"""Activity drivers for the orchestrator (MVP: SSH only)."""
from decnet.orchestrator.drivers.base import ActivityResult, Driver
from decnet.orchestrator.drivers.ssh import SSHDriver
__all__ = ["ActivityResult", "Driver", "SSHDriver"]

View File

@@ -0,0 +1,27 @@
"""Driver protocol for orchestrator actions.
Future protocols (HTTP, SMB, MySQL, …) implement this interface alongside
the SSH driver. Kept deliberately minimal — the orchestrator only needs
"run this action and tell me how it went".
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Protocol
from decnet.orchestrator.scheduler import Action
@dataclass
class ActivityResult:
    """Result envelope a driver hands back after running one action.

    ``success`` is the driver's verdict on the invocation. ``payload`` is
    the per-action JSON envelope that the worker stores in the
    ``OrchestratorEvent.payload`` column and sends as the bus event body.
    """

    success: bool
    # default_factory gives every result its own dict instead of a shared one.
    payload: dict[str, Any] = field(default_factory=dict)
class Driver(Protocol):
    """Structural interface implemented by every orchestrator driver."""

    async def run(self, action: Action) -> ActivityResult:
        """Execute *action* and report how it went as an ``ActivityResult``."""
        ...

View File

@@ -0,0 +1,153 @@
"""MVP SSH-flavoured driver.
Two action shapes:
* :class:`~decnet.orchestrator.scheduler.TrafficAction` — exec a tiny
Python one-liner *inside the source decky's ssh container* that opens
TCP/22 against the destination decky's IP and reads the SSH banner.
This generates real on-the-wire SSH-protocol traffic between the two
containers (sshd announces the banner on connect), without us having
to ship credentials anywhere.
* :class:`~decnet.orchestrator.scheduler.FileAction` — drop / refresh a
file inside the destination decky's ssh container via ``docker exec``.
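Both spawn ``docker`` on the host via :func:`asyncio.create_subprocess_exec`
with an argv list — never a host-side shell string — so an
attacker-controllable decky name or IP can't escape into a host shell. The
file action does run ``sh -c`` *inside the container*; every value
interpolated into that command goes through :func:`shlex.quote`.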
"""
from __future__ import annotations
import asyncio
import shlex
from typing import Any
from decnet.logging import get_logger
from decnet.orchestrator.drivers.base import ActivityResult
from decnet.orchestrator.scheduler import Action, FileAction, TrafficAction
# Module logger; driver diagnostics are emitted under "orchestrator.ssh".
log = get_logger("orchestrator.ssh")
# Executable spawned on the host for every action (argv[0] of each command).
_DOCKER = "docker"
# Per-call wall-clock cap, in seconds. The orchestrator runs serially (one
# action per tick); a wedged docker exec must not stall the whole worker.
_TIMEOUT = 8.0
# Container suffix convention: services/*.py emit container_name as
# ``<decky_name>-<service>``. The MVP only drives the ssh service.
_SSH_CONTAINER_SUFFIX = "-ssh"
def _container_for(decky_name: str) -> str:
    """Return the name of the ssh service container belonging to *decky_name*.

    The name is the decky name followed by the module's ssh container suffix.
    """
    return "{}{}".format(decky_name, _SSH_CONTAINER_SUFFIX)
async def _run(argv: list[str]) -> tuple[int, str, str]:
"""Spawn *argv* and capture (rc, stdout, stderr).
Returns ``(rc=124, "", "timeout")`` on wall-clock expiry. Never
raises — orchestrator success/failure is a payload attribute, not
an exception.
"""
try:
proc = await asyncio.create_subprocess_exec(
*argv,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
except FileNotFoundError as exc:
return 127, "", f"argv[0] not found: {exc}"
try:
stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=_TIMEOUT)
except asyncio.TimeoutError:
try:
proc.kill()
except ProcessLookupError:
pass
return 124, "", "timeout"
return (
proc.returncode if proc.returncode is not None else -1,
stdout.decode("utf-8", "replace"),
stderr.decode("utf-8", "replace"),
)
# Python one-liner that probes the destination's SSH banner. Kept inline
# so the driver has zero filesystem dependencies on the host side; the
# *container* needs python3 (ssh service template ships it).
#
# Contract of the snippet: ``sys.argv[1]`` is the destination address. It
# connects to port 22 with a 3-second socket timeout, reads at most 128
# bytes, closes the socket and writes the bytes to stdout decoded as
# latin-1. Any failure surfaces as an uncaught exception in the child.
# NOTE(review): ``socket.socket()`` defaults to AF_INET, so an IPv6
# destination would fail to connect — confirm deckies are IPv4-only.
_PROBE_PY = (
    "import socket,sys;"
    "s=socket.socket();s.settimeout(3);"
    "s.connect((sys.argv[1], 22));"
    "b=s.recv(128);s.close();"
    "sys.stdout.write(b.decode('latin1','replace'))"
)
class SSHDriver:
    """SSH-only :class:`Driver` implementation used by the MVP orchestrator."""

    async def run(self, action: Action) -> ActivityResult:
        """Dispatch *action* to the handler for its concrete type.

        Raises:
            TypeError: *action* is neither a ``TrafficAction`` nor a
                ``FileAction``.
        """
        if isinstance(action, TrafficAction):
            handler = self._run_traffic
        elif isinstance(action, FileAction):
            handler = self._run_file
        else:
            raise TypeError(f"unsupported action type: {type(action)!r}")
        return await handler(action)

    async def _run_traffic(self, action: TrafficAction) -> ActivityResult:
        """Probe the destination's SSH banner from inside the source container.

        The probe counts as successful only when the command exits 0 *and*
        its stdout starts with ``SSH-``. The payload carries the banner on
        success and the (truncated) stderr on failure, never both.
        """
        argv = [
            _DOCKER,
            "exec",
            _container_for(action.src_name),
            "python3",
            "-c",
            _PROBE_PY,
            action.dst_ip,
        ]
        rc, stdout, stderr = await _run(argv)
        ok = rc == 0 and stdout.startswith("SSH-")

        if ok:
            banner = stdout.strip()[:128]
            err_text = None
        else:
            banner = None
            err_text = stderr.strip()[:256]
            log.debug(
                "orchestrator.ssh.traffic failed src=%s dst=%s rc=%d stderr=%r",
                action.src_name, action.dst_name, rc, stderr[:120],
            )

        payload: dict[str, Any] = {
            "src_decky": action.src_name,
            "dst_decky": action.dst_name,
            "dst_ip": action.dst_ip,
            "dst_port": 22,
            "rc": rc,
            "banner": banner,
            "stderr": err_text,
        }
        return ActivityResult(success=ok, payload=payload)

    async def _run_file(self, action: FileAction) -> ActivityResult:
        """Create or refresh ``action.path`` inside the destination container.

        The write happens through a single ``sh -c`` call in the container
        so the parent directory can be created, the content written and the
        file touched in one ``docker exec``. Success means exit code 0.
        """
        # The host side still spawns docker with an argv list. Inside the
        # container a shell *is* used, so every interpolated value goes
        # through shlex.quote before it is placed in the command string.
        # Path validation belongs upstream — the scheduler's templates are
        # fixed.
        quoted_parent = shlex.quote(_dirname(action.path))
        quoted_content = shlex.quote(action.content)
        quoted_path = shlex.quote(action.path)
        sh_cmd = (
            f"mkdir -p {quoted_parent} && "
            f"printf %s {quoted_content} > {quoted_path} && "
            f"touch {quoted_path}"
        )

        rc, stdout, stderr = await _run(
            [_DOCKER, "exec", _container_for(action.dst_name), "sh", "-c", sh_cmd]
        )
        ok = rc == 0

        payload: dict[str, Any] = {
            "dst_decky": action.dst_name,
            "path": action.path,
            "bytes": len(action.content.encode("utf-8")),
            "rc": rc,
            "stderr": None if ok else stderr.strip()[:256],
        }
        return ActivityResult(success=ok, payload=payload)
def _dirname(path: str) -> str:
"""Pure-string dirname. We can't trust ``os.path.dirname`` on the
host to share the destination container's separator semantics, but
deckies are POSIX so a plain ``rfind('/')`` suffices."""
idx = path.rfind("/")
if idx <= 0:
return "/"
return path[:idx]

View File

@@ -0,0 +1,53 @@
"""DB-row + bus-topic helpers for the orchestrator."""
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
from decnet.bus import topics as _topics
from decnet.orchestrator.drivers.base import ActivityResult
from decnet.orchestrator.scheduler import Action, FileAction, TrafficAction
def to_row(action: Action, result: ActivityResult) -> dict[str, Any]:
    """Assemble the keyword arguments for ``OrchestratorEvent(**...)``.

    The columns shared by every event are filled in first; the ones that
    depend on the action type (``kind``, ``action`` and the decky uuids)
    are merged in afterwards.

    Raises:
        TypeError: *action* is neither a ``TrafficAction`` nor a
            ``FileAction``.
    """
    row: dict[str, Any] = {
        "ts": datetime.now(timezone.utc),
        "protocol": "ssh",
        "success": result.success,
        "payload": result.payload,  # repo serialises dict→json
    }

    if isinstance(action, TrafficAction):
        specifics = {
            "kind": "traffic",
            "action": f"exec:{action.description}",
            "src_decky_uuid": action.src_uuid,
            "dst_decky_uuid": action.dst_uuid,
        }
    elif isinstance(action, FileAction):
        # File events have no source decky; only the destination is touched.
        specifics = {
            "kind": "file",
            "action": action.description,
            "src_decky_uuid": None,
            "dst_decky_uuid": action.dst_uuid,
        }
    else:
        raise TypeError(f"unsupported action type: {type(action)!r}")

    row.update(specifics)
    return row
def topic_for(action: Action) -> str:
    """Return the bus topic on which *action* is published.

    Both action types are keyed by the destination decky's uuid; only the
    topic family differs.

    Raises:
        TypeError: *action* is neither a ``TrafficAction`` nor a
            ``FileAction``.
    """
    if isinstance(action, TrafficAction):
        family = _topics.ORCHESTRATOR_ACTIVITY
    elif isinstance(action, FileAction):
        family = _topics.ORCHESTRATOR_FILE
    else:
        raise TypeError(f"unsupported action type: {type(action)!r}")
    return _topics.orchestrator(family, action.dst_uuid)
def event_type_for(action: Action) -> str:
    """Return the bus event type that corresponds to *action*'s class.

    Raises:
        TypeError: *action* is neither a ``TrafficAction`` nor a
            ``FileAction``.
    """
    by_class = (
        (TrafficAction, _topics.ORCHESTRATOR_ACTIVITY),
        (FileAction, _topics.ORCHESTRATOR_FILE),
    )
    for action_cls, event_type in by_class:
        if isinstance(action, action_cls):
            return event_type
    raise TypeError(f"unsupported action type: {type(action)!r}")

View File

@@ -0,0 +1,97 @@
"""Action picker for the orchestrator.
MVP policy: flat random — pick one (src, dst) pair where both deckies
expose SSH, then choose one of {ssh-traffic, file-touch}. No diurnal
shaping, no role-aware pairing — those land in v1.
"""
from __future__ import annotations
import secrets
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Optional, Sequence
# A small set of plausible filenames the orchestrator drops or refreshes.
# Scope on purpose: the file driver is "prove the docker-exec write path
# works", not "generate believable user activity". Realism is v2.
# Paths target the filesystem *inside* a decoy container, not the host.
# Bandit B108 is a host-side concern; suppressed at the data definition.
#
# Each entry is ``(path_template, content_template)``, both expanded with
# ``str.format``. Path templates are formatted with ``ts`` and ``user``
# only, so they must not reference ``{n}``; content templates additionally
# receive ``n``.
_FILE_TEMPLATES: tuple[tuple[str, str], ...] = ( # nosec B108
    ("/tmp/.cache-{ts}.tmp", "session={ts}\n"), # nosec B108
    ("/var/log/cron-{ts}.log", "{ts} CRON[{n}]: ({user}) CMD (run-parts /etc/cron.daily)\n"),
    ("/home/{user}/notes-{ts}.txt", "todo: rotate keys; check on backup task\n"),
)
# Candidate values for the ``{user}`` placeholder above.
_USERS = ("admin", "ubuntu", "service")
@dataclass(frozen=True)
class TrafficAction:
    """Immutable description of one inter-decky traffic action.

    Identifies the source decky (uuid and name) and the destination decky
    (uuid, name and IP address) of the connection.
    """

    # Source decky.
    src_uuid: str
    src_name: str
    # Destination decky.
    dst_uuid: str
    dst_name: str
    dst_ip: str
    # Defaults describing the only traffic shape the MVP knows.
    protocol: str = "ssh"
    description: str = "tcp_connect:22"
@dataclass(frozen=True)
class FileAction:
    """Immutable description of one file drop/refresh inside a decky."""

    # Destination decky.
    dst_uuid: str
    dst_name: str
    # Absolute location of the file and the text to put in it.
    path: str
    content: str
    description: str = "file:create"
# Union of every action shape the scheduler can emit. Consumers tell the
# variants apart with ``isinstance`` on the concrete class.
Action = TrafficAction | FileAction
def _has_ssh(decky: dict[str, Any]) -> bool:
services = decky.get("services") or []
if isinstance(services, str):
return False # not deserialised — treat as "we don't know"
return "ssh" in services
def pick(
    deckies: Sequence[dict[str, Any]],
    *,
    rand: Optional[secrets.SystemRandom] = None,
) -> Optional[Action]:
    """Choose one action to run against the given deckies.

    Only deckies that expose SSH and have an IP are eligible. With two or
    more eligible deckies the choice between traffic and file is a coin
    flip; with exactly one, only a file action is possible.

    Args:
        deckies: Candidate decky records.
        rand: Random source to draw from; a fresh ``SystemRandom`` is used
            when omitted.

    Returns:
        A ``TrafficAction`` or ``FileAction``, or ``None`` when no decky is
        eligible — the worker treats ``None`` as "skip this tick".
    """
    rng = rand or secrets.SystemRandom()

    eligible = [d for d in deckies if _has_ssh(d) and d.get("ip")]
    if not eligible:
        return None

    # The coin is only flipped when a traffic action is actually possible.
    if len(eligible) >= 2 and rng.random() < 0.5:
        src, dst = rng.sample(eligible, 2)
        return TrafficAction(
            src_uuid=src["uuid"],
            src_name=src["name"],
            dst_uuid=dst["uuid"],
            dst_name=dst["name"],
            dst_ip=dst["ip"],
        )

    target = rng.choice(eligible)
    path_template, content_template = rng.choice(_FILE_TEMPLATES)
    fills = {
        "ts": int(datetime.now(timezone.utc).timestamp()),
        "user": rng.choice(_USERS),
    }
    path = path_template.format(**fills)
    content = content_template.format(n=rng.randint(1000, 99999), **fills)
    return FileAction(
        dst_uuid=target["uuid"],
        dst_name=target["name"],
        path=path,
        content=content,
    )

View File

@@ -0,0 +1,114 @@
"""Orchestrator main loop.
One tick = one (src, dst, action) pick + one driver invocation + one DB
write + one fire-and-forget bus publish. Intentionally serial — MVP
honesty: a wedged docker exec stalls only this worker, never another.
Modeled after :mod:`decnet.profiler.worker` for consistency: same control
listener, same heartbeat helper, same shutdown semantics.
"""
from __future__ import annotations
import asyncio
import contextlib
from decnet.bus.factory import get_bus
from decnet.bus.publish import (
publish_safely,
run_control_listener,
run_health_heartbeat,
)
from decnet.logging import get_logger
from decnet.orchestrator import events, scheduler
from decnet.orchestrator.drivers import SSHDriver
from decnet.web.db.repository import BaseRepository
logger = get_logger("orchestrator")
async def orchestrator_worker(
    repo: BaseRepository,
    *,
    interval: int = 60,
) -> None:
    """Periodically inject synthetic activity into the running fleet.

    Runs as a long-lived asyncio task. Honours the bus control topic
    (``system.orchestrator.control``) for graceful shutdown.

    The loop sleeps for *interval* seconds (or until shutdown is
    requested) *before* each tick, so the first tick happens one interval
    after start-up. A failing tick is logged and does not end the loop.

    Args:
        repo: Repository handed to every tick.
        interval: Seconds to wait between ticks.
    """
    logger.info("orchestrator worker started interval=%ds", interval)
    bus = None
    try:
        bus = get_bus(client_name="orchestrator")
        await bus.connect()
    except Exception as exc:  # noqa: BLE001
        # The bus is optional: without it the worker still runs ticks and
        # simply skips publishing.
        logger.warning(
            "orchestrator: bus unavailable, continuing without publish: %s", exc
        )
        bus = None
    driver = SSHDriver()
    shutdown = asyncio.Event()
    # Both helpers are started even when ``bus`` is None.
    # NOTE(review): presumably they tolerate a missing bus — confirm.
    heartbeat_task = asyncio.create_task(run_health_heartbeat(bus, "orchestrator"))
    control_task = asyncio.create_task(
        run_control_listener(bus, "orchestrator", shutdown),
    )
    try:
        while not shutdown.is_set():
            # Interruptible sleep: returns early as soon as shutdown is set.
            try:
                await asyncio.wait_for(shutdown.wait(), timeout=interval)
            except asyncio.TimeoutError:
                pass  # normal tick
            if shutdown.is_set():
                break
            try:
                await _one_tick(repo, driver, bus)
            except Exception as exc:  # noqa: BLE001
                # Keep the worker alive; one bad tick must not kill the loop.
                logger.error("orchestrator tick failed: %s", exc)
    finally:
        # Stop the helper tasks first, then release the bus connection.
        for t in (heartbeat_task, control_task):
            t.cancel()
            with contextlib.suppress(Exception, asyncio.CancelledError):
                await t
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()
async def _one_tick(repo: BaseRepository, driver, bus) -> None:
    """Run a single orchestrator tick.

    Steps, in order: list the running deckies, let the scheduler pick an
    action, run it through *driver*, persist the resulting row, publish it
    on the bus when one is available, and log a one-line summary. When the
    scheduler has nothing to do, the tick ends after a debug log.
    """
    running = await repo.list_running_topology_deckies()
    chosen = scheduler.pick(running)
    if chosen is None:
        # NOTE(review): the count logged is everything the repo returned,
        # before the scheduler's own ssh filter — confirm the label.
        logger.debug(
            "orchestrator: no actionable deckies (running+ssh count=%d)",
            len(running),
        )
        return

    outcome = await driver.run(chosen)
    row = events.to_row(chosen, outcome)
    await repo.record_orchestrator_event(row)

    if bus is not None:
        topic = events.topic_for(chosen)
        # Bus payload mirrors the row but uses iso8601 for ts so SSE
        # consumers don't have to JSON-handle datetime themselves.
        envelope = dict(
            kind=row["kind"],
            protocol=row["protocol"],
            action=row["action"],
            src_decky_uuid=row.get("src_decky_uuid"),
            dst_decky_uuid=row["dst_decky_uuid"],
            success=row["success"],
            payload=outcome.payload,
            ts=row["ts"].isoformat(),
        )
        await publish_safely(
            bus, topic, envelope, event_type=events.event_type_for(chosen)
        )

    logger.info(
        "orchestrator tick kind=%s success=%s dst=%s",
        row["kind"], row["success"], row["dst_decky_uuid"],
    )