feat(swarm): per-host microservices as systemd units, mutator off agents

Previously `decnet status` on an agent showed every microservice as DOWN
because deploy's auto-spawn was unihost-scoped and the agent CLI gate
hid the per-host commands. Now:

  - collect, probe, profiler, sniffer drop out of MASTER_ONLY_COMMANDS
    (they run per-host; master-side work stays master-gated).
  - mutate stays master-only (it orchestrates swarm-wide respawns).
  - decnet/mutator/ excluded from agent tarballs — never invoked there.
  - decnet/web exclusion narrowed: agents now ship db/ + auth.py +
    dependencies.py (profiler needs the repo singleton); api.py,
    swarm_api.py, ingester.py, router/ and templates/ stay excluded.
  - Four new systemd unit templates (decnet-collector/prober/profiler/
    sniffer) shipped in every enrollment tarball.
  - enroll_bootstrap.sh enables + starts all four alongside agent and
    forwarder at install time.
  - updater restarts the aux units on code push so they pick up the new
    release (best-effort — legacy enrollments without the units won't
    fail the update).
  - status table hides Mutator + API rows in agent mode.
This commit is contained in:
2026-04-19 18:58:48 -04:00
parent ee9ade4cd5
commit 6d7877c679
10 changed files with 172 additions and 19 deletions

View File

@@ -1323,6 +1323,11 @@ def status() -> None:
_status() _status()
registry = _service_registry(str(DECNET_INGEST_LOG_FILE)) registry = _service_registry(str(DECNET_INGEST_LOG_FILE))
# On agents, the Mutator runs master-side only (it schedules decky
# respawns across the swarm) and the API is never shipped. Hide those
# rows so operators aren't chasing permanent DOWN entries.
if _agent_mode_active():
registry = [r for r in registry if r[0] not in {"Mutator", "API"}]
svc_table = Table(title="DECNET Services", show_lines=True) svc_table = Table(title="DECNET Services", show_lines=True)
svc_table.add_column("Service", style="bold cyan") svc_table.add_column("Service", style="bold cyan")
svc_table.add_column("Status") svc_table.add_column("Status")
@@ -1762,13 +1767,17 @@ def db_reset(
# MASTER_ONLY when touching command registration. # MASTER_ONLY when touching command registration.
# #
# Worker-legitimate commands (NOT in these sets): agent, updater, forwarder, # Worker-legitimate commands (NOT in these sets): agent, updater, forwarder,
# status (agents run deckies locally and should be able to inspect them). # status, collect, probe, profiler, sniffer. Agents run deckies locally and
# should be able to inspect them + run the per-host microservices (collector
# streams container logs, prober/profiler characterize attackers hitting
# this host, sniffer captures traffic). Mutator stays master-only because
# it orchestrates respawns across the swarm.
# ─────────────────────────────────────────────────────────────────────────── # ───────────────────────────────────────────────────────────────────────────
MASTER_ONLY_COMMANDS: frozenset[str] = frozenset({ MASTER_ONLY_COMMANDS: frozenset[str] = frozenset({
"api", "swarmctl", "deploy", "redeploy", "teardown", "api", "swarmctl", "deploy", "redeploy", "teardown",
"probe", "collect", "mutate", "listener", "mutate", "listener",
"services", "distros", "correlate", "archetypes", "web", "services", "distros", "correlate", "archetypes", "web",
"profiler", "sniffer", "db-reset", "db-reset",
}) })
MASTER_ONLY_GROUPS: frozenset[str] = frozenset({"swarm"}) MASTER_ONLY_GROUPS: frozenset[str] = frozenset({"swarm"})

View File

@@ -237,6 +237,14 @@ def _run_pip(
AGENT_SYSTEMD_UNIT = "decnet-agent.service" AGENT_SYSTEMD_UNIT = "decnet-agent.service"
FORWARDER_SYSTEMD_UNIT = "decnet-forwarder.service" FORWARDER_SYSTEMD_UNIT = "decnet-forwarder.service"
UPDATER_SYSTEMD_UNIT = "decnet-updater.service" UPDATER_SYSTEMD_UNIT = "decnet-updater.service"
# systemd units for the per-host microservices (collector, prober, profiler,
# sniffer). They execute out of the same /opt/decnet tree as the agent, so
# after a code push they keep running the previous release until restarted.
# Restarting them is best-effort: a legacy enrollment that never installed
# these units must not make the update fail.
AUXILIARY_SYSTEMD_UNITS = (
    "decnet-collector.service",
    "decnet-prober.service",
    "decnet-profiler.service",
    "decnet-sniffer.service",
)
def _systemd_available() -> bool: def _systemd_available() -> bool:
@@ -286,6 +294,13 @@ def _spawn_agent_via_systemd(install_dir: pathlib.Path) -> int:
) )
if fwd.returncode != 0: if fwd.returncode != 0:
log.warning("forwarder restart failed (ignored): %s", fwd.stderr.strip()) log.warning("forwarder restart failed (ignored): %s", fwd.stderr.strip())
for unit in AUXILIARY_SYSTEMD_UNITS:
aux = subprocess.run( # nosec B603 B607
["systemctl", "restart", unit],
check=False, capture_output=True, text=True,
)
if aux.returncode != 0:
log.warning("%s restart failed (ignored): %s", unit, aux.stderr.strip())
pid_out = subprocess.run( # nosec B603 B607 pid_out = subprocess.run( # nosec B603 B607
["systemctl", "show", "--property=MainPID", "--value", AGENT_SYSTEMD_UNIT], ["systemctl", "show", "--property=MainPID", "--value", AGENT_SYSTEMD_UNIT],
check=True, capture_output=True, text=True, check=True, capture_output=True, text=True,

View File

@@ -63,10 +63,19 @@ _EXCLUDES: tuple[str, ...] = (
"wiki-checkout", "wiki-checkout/*", "wiki-checkout", "wiki-checkout/*",
# Frontend is master-only; agents never serve UI. # Frontend is master-only; agents never serve UI.
"decnet_web", "decnet_web/*", "decnet_web/**", "decnet_web", "decnet_web/*", "decnet_web/**",
# Master FastAPI app (API, routers, master-side DB) is not run on agents. # Master API surface. Agents ship with decnet.web.db + auth + dependencies
# The `agent` / `updater` / `forwarder` commands have their own apps under # (the profiler microservice needs the repo singleton), but the FastAPI
# decnet/agent, decnet/updater — they don't import decnet.web. # app itself (api.py, swarm_api.py, the full router tree, the ingester,
"decnet/web", "decnet/web/*", "decnet/web/**", # and the .j2 templates that the master renders into the tarball) has no
# business running on a worker.
"decnet/web/api.py",
"decnet/web/swarm_api.py",
"decnet/web/ingester.py",
"decnet/web/router", "decnet/web/router/*", "decnet/web/router/**",
"decnet/web/templates", "decnet/web/templates/*", "decnet/web/templates/**",
# Mutator is master-only (it schedules decky respawns across the swarm);
# agents never invoke it. Keep it off the worker.
"decnet/mutator", "decnet/mutator/*", "decnet/mutator/**",
"decnet-state.json", "decnet-state.json",
"master.log", "master.json", "master.log", "master.json",
"decnet.tar", "decnet.tar",
@@ -254,7 +263,11 @@ def _build_tarball(
return buf.getvalue() return buf.getvalue()
_SYSTEMD_UNITS = ("decnet-agent", "decnet-forwarder", "decnet-engine", "decnet-updater") _SYSTEMD_UNITS = (
"decnet-agent", "decnet-forwarder", "decnet-engine", "decnet-updater",
# Per-host microservices — activated by enroll_bootstrap.sh.
"decnet-collector", "decnet-prober", "decnet-profiler", "decnet-sniffer",
)
def _render_systemd_unit(name: str, agent_name: str, master_host: str) -> bytes: def _render_systemd_unit(name: str, agent_name: str, master_host: str) -> bytes:

View File

@@ -0,0 +1,20 @@
# systemd unit template: runs `decnet collect` as a long-lived per-host
# service on an enrolled agent. The agent_name placeholder in Description=
# is filled in when the master renders this template for the enrollment
# tarball.
[Unit]
Description=DECNET container log collector — {{ agent_name }}
Documentation=https://github.com/anti/DECNET
# After= only orders startup relative to these units; it does not pull
# decnet-agent.service in by itself.
After=network-online.target decnet-agent.service
Wants=network-online.target
# PartOf= propagates a stop or restart of decnet-agent.service to this unit.
PartOf=decnet-agent.service
[Service]
Type=simple
WorkingDirectory=/opt/decnet
# Presumably what puts the CLI into agent mode — confirm against the CLI.
Environment=DECNET_MODE=agent
# Same path that stdout/stderr are appended to below.
Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.collector.log
# /usr/local/bin/decnet is the symlink into the venv that enroll_bootstrap.sh
# creates.
ExecStart=/usr/local/bin/decnet collect --log-file /var/log/decnet/decnet.log
# Restart 5 s after an unclean exit; a clean exit leaves the unit stopped.
Restart=on-failure
RestartSec=5
# NOTE: the append: output mode needs systemd >= 240.
StandardOutput=append:/var/log/decnet/decnet.collector.log
StandardError=append:/var/log/decnet/decnet.collector.log
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,20 @@
# systemd unit template: runs `decnet probe` as a long-lived per-host
# service on an enrolled agent. The agent_name placeholder in Description=
# is filled in when the master renders this template for the enrollment
# tarball.
[Unit]
Description=DECNET attacker prober (JARM/HASSH/TCP fingerprint) — {{ agent_name }}
Documentation=https://github.com/anti/DECNET
# After= only orders startup relative to these units; it does not pull
# decnet-agent.service in by itself.
After=network-online.target decnet-agent.service
Wants=network-online.target
# PartOf= propagates a stop or restart of decnet-agent.service to this unit.
PartOf=decnet-agent.service
[Service]
Type=simple
WorkingDirectory=/opt/decnet
# Presumably what puts the CLI into agent mode — confirm against the CLI.
Environment=DECNET_MODE=agent
# Same path that stdout/stderr are appended to below.
Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.prober.log
# --interval 300: presumably seconds between probe passes — confirm against
# the `probe` command's option help.
ExecStart=/usr/local/bin/decnet probe --log-file /var/log/decnet/decnet.log --interval 300
# Restart 5 s after an unclean exit; a clean exit leaves the unit stopped.
Restart=on-failure
RestartSec=5
# NOTE: the append: output mode needs systemd >= 240.
StandardOutput=append:/var/log/decnet/decnet.prober.log
StandardError=append:/var/log/decnet/decnet.prober.log
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,20 @@
# systemd unit template: runs `decnet profiler` as a long-lived per-host
# service on an enrolled agent. The agent_name placeholder in Description=
# is filled in when the master renders this template for the enrollment
# tarball.
[Unit]
Description=DECNET attacker profiler — {{ agent_name }}
Documentation=https://github.com/anti/DECNET
# After= only orders startup relative to these units; it does not pull
# decnet-agent.service in by itself.
After=network-online.target decnet-agent.service
Wants=network-online.target
# PartOf= propagates a stop or restart of decnet-agent.service to this unit.
PartOf=decnet-agent.service
[Service]
Type=simple
WorkingDirectory=/opt/decnet
# Presumably what puts the CLI into agent mode — confirm against the CLI.
Environment=DECNET_MODE=agent
# Same path that stdout/stderr are appended to below.
Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.profiler.log
# Unlike the collector/prober/sniffer units, no --log-file is passed here.
# --interval 30: presumably seconds between profiling passes — confirm.
ExecStart=/usr/local/bin/decnet profiler --interval 30
# Restart 5 s after an unclean exit; a clean exit leaves the unit stopped.
Restart=on-failure
RestartSec=5
# NOTE: the append: output mode needs systemd >= 240.
StandardOutput=append:/var/log/decnet/decnet.profiler.log
StandardError=append:/var/log/decnet/decnet.profiler.log
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,24 @@
# systemd unit template: runs `decnet sniffer` as a long-lived per-host
# service on an enrolled agent. The agent_name placeholder in Description=
# is filled in when the master renders this template for the enrollment
# tarball.
[Unit]
Description=DECNET network sniffer — {{ agent_name }}
Documentation=https://github.com/anti/DECNET
# After= only orders startup relative to these units; it does not pull
# decnet-agent.service in by itself.
After=network-online.target decnet-agent.service
Wants=network-online.target
# PartOf= propagates a stop or restart of decnet-agent.service to this unit.
PartOf=decnet-agent.service
[Service]
Type=simple
WorkingDirectory=/opt/decnet
# Presumably what puts the CLI into agent mode — confirm against the CLI.
Environment=DECNET_MODE=agent
# Same path that stdout/stderr are appended to below.
Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.sniffer.log
# scapy needs raw sockets; forwarder already runs with these caps, so we
# mirror the same ambient set here.
# NOTE(review): no User= is set, so (with systemd's default) this runs as
# root and CapabilityBoundingSet= drops every capability other than these
# two — confirm the sniffer needs nothing else (e.g. CAP_DAC_OVERRIDE for
# files it does not own).
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW
CapabilityBoundingSet=CAP_NET_ADMIN CAP_NET_RAW
ExecStart=/usr/local/bin/decnet sniffer --log-file /var/log/decnet/decnet.log
# Restart 5 s after an unclean exit; a clean exit leaves the unit stopped.
Restart=on-failure
RestartSec=5
# NOTE: the append: output mode needs systemd >= 240.
StandardOutput=append:/var/log/decnet/decnet.sniffer.log
StandardError=append:/var/log/decnet/decnet.sniffer.log
[Install]
WantedBy=multi-user.target

View File

@@ -60,15 +60,24 @@ chmod 0755 "$VENV_DIR/bin/decnet"
ln -sf "$VENV_DIR/bin/decnet" /usr/local/bin/decnet ln -sf "$VENV_DIR/bin/decnet" /usr/local/bin/decnet
echo "[DECNET] installing systemd units..." echo "[DECNET] installing systemd units..."
install -Dm0644 etc/systemd/system/decnet-agent.service /etc/systemd/system/decnet-agent.service for unit in \
install -Dm0644 etc/systemd/system/decnet-forwarder.service /etc/systemd/system/decnet-forwarder.service decnet-agent decnet-forwarder decnet-engine \
install -Dm0644 etc/systemd/system/decnet-engine.service /etc/systemd/system/decnet-engine.service decnet-collector decnet-prober decnet-profiler decnet-sniffer; do
install -Dm0644 "etc/systemd/system/${unit}.service" "/etc/systemd/system/${unit}.service"
done
if [[ "$WITH_UPDATER" == "true" ]]; then if [[ "$WITH_UPDATER" == "true" ]]; then
install -Dm0644 etc/systemd/system/decnet-updater.service /etc/systemd/system/decnet-updater.service install -Dm0644 etc/systemd/system/decnet-updater.service /etc/systemd/system/decnet-updater.service
fi fi
systemctl daemon-reload systemctl daemon-reload
ACTIVE_UNITS=(decnet-agent.service decnet-forwarder.service) # Agent + forwarder are the control plane; collector/prober/profiler/sniffer
# are the per-host microservices that used to require `decnet deploy` to
# auto-spawn. With systemd units they come up at boot and auto-restart.
ACTIVE_UNITS=(
decnet-agent.service decnet-forwarder.service
decnet-collector.service decnet-prober.service
decnet-profiler.service decnet-sniffer.service
)
if [[ "$WITH_UPDATER" == "true" ]]; then if [[ "$WITH_UPDATER" == "true" ]]; then
ACTIVE_UNITS+=(decnet-updater.service) ACTIVE_UNITS+=(decnet-updater.service)
fi fi

View File

@@ -184,6 +184,10 @@ async def test_systemd_units_shipped_and_installed(client, auth_token):
assert "etc/systemd/system/decnet-agent.service" in names assert "etc/systemd/system/decnet-agent.service" in names
assert "etc/systemd/system/decnet-forwarder.service" in names assert "etc/systemd/system/decnet-forwarder.service" in names
assert "etc/systemd/system/decnet-engine.service" in names assert "etc/systemd/system/decnet-engine.service" in names
# Per-host microservices get their own systemd units now.
for unit in ("decnet-collector", "decnet-prober",
"decnet-profiler", "decnet-sniffer"):
assert f"etc/systemd/system/{unit}.service" in names, unit
fwd = tf.extractfile("etc/systemd/system/decnet-forwarder.service").read().decode() fwd = tf.extractfile("etc/systemd/system/decnet-forwarder.service").read().decode()
assert "--master-host 10.9.8.7" in fwd assert "--master-host 10.9.8.7" in fwd
@@ -197,8 +201,14 @@ async def test_systemd_units_shipped_and_installed(client, auth_token):
master_host="10.9.8.7")).json()["token"] master_host="10.9.8.7")).json()["token"]
sh = (await client.get(f"/api/v1/swarm/enroll-bundle/{sh_token}.sh")).text sh = (await client.get(f"/api/v1/swarm/enroll-bundle/{sh_token}.sh")).text
assert "systemctl daemon-reload" in sh assert "systemctl daemon-reload" in sh
# Agent + forwarder always enabled; updater conditional on WITH_UPDATER. # Agent + forwarder + per-host microservices always enabled; updater
assert "decnet-agent.service decnet-forwarder.service" in sh # conditional on WITH_UPDATER.
for unit in (
"decnet-agent.service", "decnet-forwarder.service",
"decnet-collector.service", "decnet-prober.service",
"decnet-profiler.service", "decnet-sniffer.service",
):
assert unit in sh, unit
assert "decnet-updater.service" in sh assert "decnet-updater.service" in sh
ini = tf.extractfile("etc/decnet/decnet.ini").read().decode() ini = tf.extractfile("etc/decnet/decnet.ini").read().decode()
@@ -299,9 +309,16 @@ async def test_get_tgz_contents(client, auth_token, tmp_path):
assert ".env.example" not in bad, f"leaked env file: {bad}" assert ".env.example" not in bad, f"leaked env file: {bad}"
# Master-only trees: agents don't run the FastAPI master app or the # Master-only trees: agents don't run the FastAPI master app or the
# React frontend, so shipping them bloats the tarball and widens the # React frontend, so shipping them bloats the tarball and widens the
# worker's attack surface for no benefit. # worker's attack surface for no benefit. decnet/web/db and
# decnet/web/dependencies.py DO ship — the profiler microservice on
# the agent needs the repo singleton.
assert not bad.startswith("decnet_web/"), f"leaked frontend: {bad}" assert not bad.startswith("decnet_web/"), f"leaked frontend: {bad}"
assert not bad.startswith("decnet/web/"), f"leaked master-api: {bad}" assert bad != "decnet/web/api.py", f"leaked master API: {bad}"
assert bad != "decnet/web/swarm_api.py", f"leaked swarm API: {bad}"
assert bad != "decnet/web/ingester.py", f"leaked ingester: {bad}"
assert not bad.startswith("decnet/web/router/"), f"leaked router: {bad}"
assert not bad.startswith("decnet/web/templates/"), f"leaked tpl: {bad}"
assert not bad.startswith("decnet/mutator/"), f"leaked mutator: {bad}"
# INI content is correct # INI content is correct
ini = tf.extractfile("etc/decnet/decnet.ini").read().decode() ini = tf.extractfile("etc/decnet/decnet.ini").read().decode()

View File

@@ -419,11 +419,17 @@ def test_spawn_agent_via_systemd_records_main_pid(
pid = ex._spawn_agent_via_systemd(install_dir) pid = ex._spawn_agent_via_systemd(install_dir)
assert pid == 4711 assert pid == 4711
assert (install_dir / "agent.pid").read_text() == "4711" assert (install_dir / "agent.pid").read_text() == "4711"
# Agent restart, forwarder restart, then MainPID lookup on the agent. # Agent restart, forwarder restart, each aux microservice, then the
# MainPID lookup on the agent.
assert calls[0] == ["systemctl", "restart", ex.AGENT_SYSTEMD_UNIT] assert calls[0] == ["systemctl", "restart", ex.AGENT_SYSTEMD_UNIT]
assert calls[1] == ["systemctl", "restart", ex.FORWARDER_SYSTEMD_UNIT] assert calls[1] == ["systemctl", "restart", ex.FORWARDER_SYSTEMD_UNIT]
assert calls[2][:2] == ["systemctl", "show"] aux_calls = calls[2 : 2 + len(ex.AUXILIARY_SYSTEMD_UNITS)]
assert ex.AGENT_SYSTEMD_UNIT in calls[2] assert aux_calls == [
["systemctl", "restart", unit] for unit in ex.AUXILIARY_SYSTEMD_UNITS
]
show_call = calls[2 + len(ex.AUXILIARY_SYSTEMD_UNITS)]
assert show_call[:2] == ["systemctl", "show"]
assert ex.AGENT_SYSTEMD_UNIT in show_call
def test_spawn_agent_via_systemd_tolerates_missing_forwarder_unit( def test_spawn_agent_via_systemd_tolerates_missing_forwarder_unit(