feat(orchestrator): MVP synthetic life-injection worker (SSH only)
Adds a new decnet orchestrate worker whose job is to keep the honeypot
ecosystem from looking suspiciously static — a frozen LAN with no
inter-host traffic and no filesystem aging is its own honeypot tell.
MVP scope:
- New OrchestratorEvent table + repo methods (purpose-built sibling
to Log so synthetic events stay separable from attacker-driven ones).
- New orchestrator.{activity,file}.<decky_id> bus topics +
system.orchestrator.health heartbeat.
- SSH-only driver. Traffic action runs python3 inside src container
to TCP-connect dst:22 and read the SSH banner — real on-the-wire
SSH-protocol traffic without shipping creds. File action drops or
refreshes a small file via docker exec on the destination.
- Random scheduler (50/50 traffic/file when >=2 SSH-capable deckies
are running). Diurnal shaping, role-aware pairing, and session-aware
backoff are explicit non-goals for MVP.
- CLI registration, systemd unit (SupplementaryGroups=docker),
worker-registry entry so the dashboard shows orchestrator health.
- 11 tests: scheduler policy, driver argv shape + injection-safety,
end-to-end one-tick integration with FakeBus + SQLite.
This commit is contained in:
9
decnet/orchestrator/__init__.py
Normal file
9
decnet/orchestrator/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
||||
"""DECNET orchestrator — synthetic life-injection worker.
|
||||
|
||||
Drives realistic-looking activity between deckies (inter-decky traffic and
|
||||
in-decky filesystem mutations) so the honeypot stops looking suspiciously
|
||||
static. Sole writer of the ``OrchestratorEvent`` table.
|
||||
"""
|
||||
from decnet.orchestrator.worker import orchestrator_worker
|
||||
|
||||
__all__ = ["orchestrator_worker"]
|
||||
5
decnet/orchestrator/drivers/__init__.py
Normal file
5
decnet/orchestrator/drivers/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Activity drivers for the orchestrator (MVP: SSH only)."""
|
||||
from decnet.orchestrator.drivers.base import ActivityResult, Driver
|
||||
from decnet.orchestrator.drivers.ssh import SSHDriver
|
||||
|
||||
__all__ = ["ActivityResult", "Driver", "SSHDriver"]
|
||||
27
decnet/orchestrator/drivers/base.py
Normal file
27
decnet/orchestrator/drivers/base.py
Normal file
@@ -0,0 +1,27 @@
|
||||
"""Driver protocol for orchestrator actions.
|
||||
|
||||
Future protocols (HTTP, SMB, MySQL, …) implement this interface alongside
|
||||
the SSH driver. Kept deliberately minimal — the orchestrator only needs
|
||||
"run this action and tell me how it went".
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Protocol
|
||||
|
||||
from decnet.orchestrator.scheduler import Action
|
||||
|
||||
|
||||
@dataclass
class ActivityResult:
    """Outcome of one driver invocation.

    ``payload`` is the per-action JSON envelope the worker writes to the
    ``OrchestratorEvent.payload`` column and to the bus event body.
    """

    # Whether the driver judged the action to have succeeded.
    success: bool
    # Per-action details; each instance gets its own fresh dict by default.
    payload: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
class Driver(Protocol):
    """Structural interface for orchestrator drivers.

    Anything exposing an async ``run(action)`` that returns an
    :class:`ActivityResult` satisfies it.
    """

    # Execute one scheduled action and report how it went.
    async def run(self, action: Action) -> ActivityResult: ...
|
||||
153
decnet/orchestrator/drivers/ssh.py
Normal file
153
decnet/orchestrator/drivers/ssh.py
Normal file
@@ -0,0 +1,153 @@
|
||||
"""MVP SSH-flavoured driver.
|
||||
|
||||
Two action shapes:
|
||||
|
||||
* :class:`~decnet.orchestrator.scheduler.TrafficAction` — exec a tiny
|
||||
Python one-liner *inside the source decky's ssh container* that opens
|
||||
TCP/22 against the destination decky's IP and reads the SSH banner.
|
||||
This generates real on-the-wire SSH-protocol traffic between the two
|
||||
containers (sshd announces the banner on connect), without us having
|
||||
to ship credentials anywhere.
|
||||
* :class:`~decnet.orchestrator.scheduler.FileAction` — drop / refresh a
|
||||
file inside the destination decky's ssh container via ``docker exec``.
|
||||
|
||||
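Both shell out via :func:`asyncio.create_subprocess_exec` with argv
lists on the host side, so an attacker-controllable decky name or IP
can't escape into a host shell. The file action additionally runs
``sh -c`` *inside the container*; every value interpolated into that
command string is passed through :func:`shlex.quote` first.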
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import shlex
|
||||
from typing import Any
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.orchestrator.drivers.base import ActivityResult
|
||||
from decnet.orchestrator.scheduler import Action, FileAction, TrafficAction
|
||||
|
||||
# Module-level logger for the SSH driver.
log = get_logger("orchestrator.ssh")

# argv[0] of every command the driver spawns.
_DOCKER = "docker"
# Per-call wall-clock cap. The orchestrator runs serially (one action
# per tick); a wedged docker exec must not stall the whole worker.
_TIMEOUT = 8.0

# Container suffix convention: services/*.py emit container_name as
# ``<decky_name>-<service>``. The MVP only drives the ssh service.
_SSH_CONTAINER_SUFFIX = "-ssh"
|
||||
|
||||
|
||||
def _container_for(decky_name: str) -> str:
    """Return the ssh service container name for *decky_name*.

    The result is the decky name with ``_SSH_CONTAINER_SUFFIX`` appended.
    """
    return "{}{}".format(decky_name, _SSH_CONTAINER_SUFFIX)
|
||||
|
||||
|
||||
async def _run(argv: list[str]) -> tuple[int, str, str]:
    """Spawn *argv* and capture ``(rc, stdout, stderr)``.

    Never raises for spawn or timeout problems — orchestrator
    success/failure is a payload attribute, not an exception. Failures
    are reported through shell-style return codes instead:

    * ``127`` — ``argv[0]`` does not exist.
    * ``126`` — ``argv[0]`` exists but could not be executed (any other
      ``OSError`` raised while spawning, e.g. permission denied).
    * ``124`` — the call exceeded ``_TIMEOUT`` seconds; stderr is the
      literal string ``"timeout"``.

    Args:
        argv: Full argument vector; passed to the OS without a shell.

    Returns:
        The process return code (``-1`` if it is unexpectedly unknown)
        plus stdout and stderr decoded as UTF-8 with replacement.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as exc:
        return 127, "", f"argv[0] not found: {exc}"
    except OSError as exc:
        # PermissionError and friends used to escape, contradicting the
        # "never raises" contract the worker relies on.
        return 126, "", f"argv[0] not executable: {exc}"

    try:
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=_TIMEOUT)
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            # Child exited between the timeout firing and the kill.
            pass
        # Reap the child so its transport and pipes are released before
        # we return, rather than leaving that to a later loop iteration.
        try:
            await proc.wait()
        except Exception:  # noqa: BLE001 - cleanup is best-effort
            pass
        return 124, "", "timeout"

    return (
        proc.returncode if proc.returncode is not None else -1,
        stdout.decode("utf-8", "replace"),
        stderr.decode("utf-8", "replace"),
    )
|
||||
|
||||
|
||||
# Python one-liner that probes the destination's SSH banner. Kept inline
# so the driver has zero filesystem dependencies on the host side; the
# *container* needs python3 (ssh service template ships it).
#
# The script connects to port 22 of the host given as its first argument
# with a 3-second socket timeout, reads up to 128 bytes, and writes them
# (decoded as latin1) to stdout.
_PROBE_PY = (
    "import socket,sys;"
    "s=socket.socket();s.settimeout(3);"
    "s.connect((sys.argv[1], 22));"
    "b=s.recv(128);s.close();"
    "sys.stdout.write(b.decode('latin1','replace'))"
)
|
||||
|
||||
|
||||
class SSHDriver:
    """Concrete :class:`Driver` for the MVP; drives ssh containers only."""

    async def run(self, action: Action) -> ActivityResult:
        """Execute *action* with the handler matching its type.

        Raises:
            TypeError: *action* is neither a ``TrafficAction`` nor a
                ``FileAction``.
        """
        if isinstance(action, TrafficAction):
            handler = self._run_traffic
        elif isinstance(action, FileAction):
            handler = self._run_file
        else:
            raise TypeError(f"unsupported action type: {type(action)!r}")
        return await handler(action)

    async def _run_traffic(self, action: TrafficAction) -> ActivityResult:
        """Probe the destination's SSH banner from inside the source container.

        Success requires a zero exit code *and* output starting with
        ``SSH-``. The payload carries the banner on success and a
        truncated stderr excerpt on failure.
        """
        probe_argv = [
            _DOCKER, "exec", _container_for(action.src_name),
            "python3", "-c", _PROBE_PY, action.dst_ip,
        ]
        rc, out, err = await _run(probe_argv)
        ok = rc == 0 and out.startswith("SSH-")

        if ok:
            banner = out.strip()[:128]
            err_excerpt = None
        else:
            banner = None
            err_excerpt = err.strip()[:256]

        payload: dict[str, Any] = {
            "src_decky": action.src_name,
            "dst_decky": action.dst_name,
            "dst_ip": action.dst_ip,
            "dst_port": 22,
            "rc": rc,
            "banner": banner,
            "stderr": err_excerpt,
        }
        if not ok:
            log.debug(
                "orchestrator.ssh.traffic failed src=%s dst=%s rc=%d stderr=%r",
                action.src_name, action.dst_name, rc, err[:120],
            )
        return ActivityResult(success=ok, payload=payload)

    async def _run_file(self, action: FileAction) -> ActivityResult:
        """Create or overwrite ``action.path`` inside the destination container.

        Success is a zero exit code from the in-container command.
        """
        # One `sh -c` call creates the parent directory, writes the
        # content via `printf %s ... >` and touches the file. Every
        # interpolated value goes through shlex.quote, so nothing in the
        # path or content can break out of its word in the sh command.
        # Path validation still belongs upstream.
        quoted_parent = shlex.quote(_dirname(action.path))
        quoted_content = shlex.quote(action.content)
        quoted_path = shlex.quote(action.path)
        sh_cmd = (
            f"mkdir -p {quoted_parent} && "
            f"printf %s {quoted_content} > {quoted_path} && "
            f"touch {quoted_path}"
        )
        exec_argv = [_DOCKER, "exec", _container_for(action.dst_name), "sh", "-c", sh_cmd]
        rc, _out, err = await _run(exec_argv)
        ok = rc == 0
        payload: dict[str, Any] = {
            "dst_decky": action.dst_name,
            "path": action.path,
            "bytes": len(action.content.encode("utf-8")),
            "rc": rc,
            "stderr": None if ok else err.strip()[:256],
        }
        return ActivityResult(success=ok, payload=payload)
|
||||
|
||||
|
||||
def _dirname(path: str) -> str:
    """Return the parent directory of *path* using plain string handling.

    Everything before the last ``/`` is the parent. When there is no
    ``/`` at all, or the only one is the leading character, the result
    is ``"/"``. ``os.path`` is avoided on purpose: the host's separator
    rules need not match the (POSIX) container's.
    """
    parent, _sep, _leaf = path.rpartition("/")
    return parent if parent else "/"
|
||||
53
decnet/orchestrator/events.py
Normal file
53
decnet/orchestrator/events.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""DB-row + bus-topic helpers for the orchestrator."""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.orchestrator.drivers.base import ActivityResult
|
||||
from decnet.orchestrator.scheduler import Action, FileAction, TrafficAction
|
||||
|
||||
|
||||
def to_row(action: Action, result: ActivityResult) -> dict[str, Any]:
    """Assemble the keyword arguments for ``OrchestratorEvent(**...)``.

    The common columns (timestamp, protocol, success, payload) are filled
    first; the action type then decides ``kind``, ``action`` and the
    source/destination decky UUIDs. File actions have no source decky.

    Raises:
        TypeError: *action* is neither a ``TrafficAction`` nor a
            ``FileAction``.
    """
    row: dict[str, Any] = {
        "ts": datetime.now(timezone.utc),
        "protocol": "ssh",
        "success": result.success,
        "payload": result.payload,  # repo serialises dict→json
    }
    if isinstance(action, TrafficAction):
        row["kind"] = "traffic"
        row["action"] = f"exec:{action.description}"
        row["src_decky_uuid"] = action.src_uuid
        row["dst_decky_uuid"] = action.dst_uuid
        return row
    if isinstance(action, FileAction):
        row["kind"] = "file"
        row["action"] = action.description
        row["src_decky_uuid"] = None
        row["dst_decky_uuid"] = action.dst_uuid
        return row
    raise TypeError(f"unsupported action type: {type(action)!r}")
|
||||
|
||||
|
||||
def topic_for(action: Action) -> str:
    """Return the bus topic for *action*, keyed on its destination decky.

    Raises:
        TypeError: *action* is neither a ``TrafficAction`` nor a
            ``FileAction``.
    """
    if isinstance(action, TrafficAction):
        family = _topics.ORCHESTRATOR_ACTIVITY
    elif isinstance(action, FileAction):
        family = _topics.ORCHESTRATOR_FILE
    else:
        raise TypeError(f"unsupported action type: {type(action)!r}")
    return _topics.orchestrator(family, action.dst_uuid)
|
||||
|
||||
|
||||
def event_type_for(action: Action) -> str:
    """Return the bus event-type constant matching *action*'s type.

    Raises:
        TypeError: *action* is neither a ``TrafficAction`` nor a
            ``FileAction``.
    """
    if isinstance(action, TrafficAction):
        event_type = _topics.ORCHESTRATOR_ACTIVITY
    elif isinstance(action, FileAction):
        event_type = _topics.ORCHESTRATOR_FILE
    else:
        raise TypeError(f"unsupported action type: {type(action)!r}")
    return event_type
|
||||
97
decnet/orchestrator/scheduler.py
Normal file
97
decnet/orchestrator/scheduler.py
Normal file
@@ -0,0 +1,97 @@
|
||||
"""Action picker for the orchestrator.
|
||||
|
||||
MVP policy: flat random — pick one (src, dst) pair where both deckies
|
||||
expose SSH, then choose one of {ssh-traffic, file-touch}. No diurnal
|
||||
shaping, no role-aware pairing — those land in v1.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import secrets
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Optional, Sequence
|
||||
|
||||
# A small set of plausible filenames the orchestrator drops or refreshes.
# Scope on purpose: the file driver is "prove the docker-exec write path
# works", not "generate believable user activity". Realism is v2.
# Paths target the filesystem *inside* a decoy container, not the host.
# Bandit B108 is a host-side concern; suppressed at the data definition.
#
# Each entry is (path template, content template). Both are filled with
# ``str.format``; the placeholders in use are {ts}, {user} and {n}.
_FILE_TEMPLATES: tuple[tuple[str, str], ...] = (  # nosec B108
    ("/tmp/.cache-{ts}.tmp", "session={ts}\n"),  # nosec B108
    ("/var/log/cron-{ts}.log", "{ts} CRON[{n}]: ({user}) CMD (run-parts /etc/cron.daily)\n"),
    ("/home/{user}/notes-{ts}.txt", "todo: rotate keys; check on backup task\n"),
)

# Candidate values for the {user} placeholder above.
_USERS = ("admin", "ubuntu", "service")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TrafficAction:
    """Immutable description of one inter-decky traffic action."""

    # Source decky: where the connection originates.
    src_uuid: str
    src_name: str
    # Destination decky: the connection target.
    dst_uuid: str
    dst_name: str
    dst_ip: str
    # Defaults describe the only traffic shape the MVP schedules.
    protocol: str = "ssh"
    description: str = "tcp_connect:22"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FileAction:
    """Immutable description of one in-decky file write."""

    # Decky whose filesystem is touched.
    dst_uuid: str
    dst_name: str
    # File path and the text to write there.
    path: str
    content: str
    description: str = "file:create"
|
||||
|
||||
|
||||
# Union of every action shape the scheduler can return.
Action = TrafficAction | FileAction
|
||||
|
||||
|
||||
def _has_ssh(decky: dict[str, Any]) -> bool:
    """Tell whether *decky* lists ``"ssh"`` among its services.

    A missing or empty ``services`` value counts as "no". So does a
    plain string: that means the field was never deserialised, and a
    substring match on it would be a guess.
    """
    services = decky.get("services")
    if not services or isinstance(services, str):
        return False
    return "ssh" in services
|
||||
|
||||
|
||||
def pick(
    deckies: Sequence[dict[str, Any]],
    *,
    rand: Optional[secrets.SystemRandom] = None,
) -> Optional[Action]:
    """Choose the next synthetic action for the given decky set.

    Only deckies that expose SSH and have a truthy ``ip`` are eligible.
    With two or more eligible deckies the choice between a traffic and a
    file action is a coin flip; with exactly one, it is always a file
    action.

    Args:
        deckies: Decky records as dicts.
        rand: Random source; a fresh ``secrets.SystemRandom`` when omitted.

    Returns:
        A ``TrafficAction`` or ``FileAction``, or ``None`` when no decky
        is eligible (the worker treats that as "skip this tick").
    """
    rng = rand or secrets.SystemRandom()
    candidates = [d for d in deckies if _has_ssh(d) and d.get("ip")]
    if not candidates:
        return None

    # The length check comes first so the coin is only flipped when a
    # traffic action is actually possible.
    if len(candidates) >= 2 and rng.random() < 0.5:
        source, target = rng.sample(candidates, 2)
        return TrafficAction(
            src_uuid=source["uuid"],
            src_name=source["name"],
            dst_uuid=target["uuid"],
            dst_name=target["name"],
            dst_ip=target["ip"],
        )

    target = rng.choice(candidates)
    path_template, content_template = rng.choice(_FILE_TEMPLATES)
    ts = int(datetime.now(timezone.utc).timestamp())
    user = rng.choice(_USERS)
    path = path_template.format(ts=ts, user=user)
    n = rng.randint(1000, 99999)
    content = content_template.format(ts=ts, user=user, n=n)
    return FileAction(
        dst_uuid=target["uuid"],
        dst_name=target["name"],
        path=path,
        content=content,
    )
|
||||
114
decnet/orchestrator/worker.py
Normal file
114
decnet/orchestrator/worker.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""Orchestrator main loop.
|
||||
|
||||
One tick = one (src, dst, action) pick + one driver invocation + one DB
|
||||
write + one fire-and-forget bus publish. Intentionally serial — MVP
|
||||
honesty: a wedged docker exec stalls only this worker, never another.
|
||||
|
||||
Modeled after :mod:`decnet.profiler.worker` for consistency: same control
|
||||
listener, same heartbeat helper, same shutdown semantics.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
|
||||
from decnet.bus.factory import get_bus
|
||||
from decnet.bus.publish import (
|
||||
publish_safely,
|
||||
run_control_listener,
|
||||
run_health_heartbeat,
|
||||
)
|
||||
from decnet.logging import get_logger
|
||||
from decnet.orchestrator import events, scheduler
|
||||
from decnet.orchestrator.drivers import SSHDriver
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
|
||||
# Module-level logger shared by the worker loop and the per-tick helper.
logger = get_logger("orchestrator")
|
||||
|
||||
|
||||
async def orchestrator_worker(
    repo: BaseRepository,
    *,
    interval: int = 60,
) -> None:
    """Run the orchestrator loop until shutdown is signalled.

    Each iteration sleeps up to *interval* seconds (waking early if the
    shutdown event is set) and then performs one tick. A failing tick is
    logged and the loop carries on. If the bus cannot be reached, the
    worker keeps running without publishing.

    Args:
        repo: Repository used to list deckies and record events.
        interval: Seconds between ticks.
    """
    logger.info("orchestrator worker started interval=%ds", interval)

    bus = None
    try:
        bus = get_bus(client_name="orchestrator")
        await bus.connect()
    except Exception as exc:  # noqa: BLE001
        logger.warning(
            "orchestrator: bus unavailable, continuing without publish: %s", exc
        )
        bus = None

    driver = SSHDriver()
    shutdown = asyncio.Event()
    background_tasks = (
        asyncio.create_task(run_health_heartbeat(bus, "orchestrator")),
        asyncio.create_task(run_control_listener(bus, "orchestrator", shutdown)),
    )
    try:
        while not shutdown.is_set():
            # A timeout here is the normal "time for the next tick" path.
            with contextlib.suppress(asyncio.TimeoutError):
                await asyncio.wait_for(shutdown.wait(), timeout=interval)
            if shutdown.is_set():
                break
            try:
                await _one_tick(repo, driver, bus)
            except Exception as exc:  # noqa: BLE001
                logger.error("orchestrator tick failed: %s", exc)
    finally:
        # Stop the helpers first, then release the bus connection.
        for task in background_tasks:
            task.cancel()
            with contextlib.suppress(Exception, asyncio.CancelledError):
                await task
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()
|
||||
|
||||
|
||||
async def _one_tick(repo: BaseRepository, driver, bus) -> None:
    """Perform one pick → run → record → publish cycle.

    Returns early, after a debug log line, when the scheduler has
    nothing to do. Publishing is skipped when *bus* is ``None``.
    """
    deckies = await repo.list_running_topology_deckies()
    action = scheduler.pick(deckies)
    if action is None:
        logger.debug(
            "orchestrator: no actionable deckies (running+ssh count=%d)",
            len(deckies),
        )
        return

    result = await driver.run(action)
    row = events.to_row(action, result)
    await repo.record_orchestrator_event(row)

    if bus is not None:
        topic = events.topic_for(action)
        # Bus payload mirrors the row but uses iso8601 for ts so SSE
        # consumers don't have to JSON-handle datetime themselves.
        bus_payload = {name: row[name] for name in ("kind", "protocol", "action")}
        bus_payload["src_decky_uuid"] = row.get("src_decky_uuid")
        bus_payload["dst_decky_uuid"] = row["dst_decky_uuid"]
        bus_payload["success"] = row["success"]
        bus_payload["payload"] = result.payload
        bus_payload["ts"] = row["ts"].isoformat()
        event_type = events.event_type_for(action)
        await publish_safely(bus, topic, bus_payload, event_type=event_type)

    logger.info(
        "orchestrator tick kind=%s success=%s dst=%s",
        row["kind"], row["success"], row["dst_decky_uuid"],
    )
|
||||
Reference in New Issue
Block a user