feat(swarm): per-host microservices as systemd units, mutator off agents
Previously `decnet status` on an agent showed every microservice as DOWN
because deploy's auto-spawn was unihost-scoped and the agent CLI gate
hid the per-host commands. Now:
- collect, probe, profiler, sniffer drop out of MASTER_ONLY_COMMANDS
(they run per-host; master-side work stays master-gated).
- mutate stays master-only (it orchestrates swarm-wide respawns).
- decnet/mutator/ excluded from agent tarballs — never invoked there.
- decnet/web exclusion tightened: ship db/ + auth.py + dependencies.py
(profiler needs the repo singleton), drop api.py, swarm_api.py,
ingester.py, router/, templates/.
- Four new systemd unit templates (decnet-collector/prober/profiler/
sniffer) shipped in every enrollment tarball.
- enroll_bootstrap.sh enables + starts all four alongside agent and
forwarder at install time.
- updater restarts the aux units on code push so they pick up the new
release (best-effort — legacy enrollments without the units won't
fail the update).
- status table hides Mutator + API rows in agent mode.
This commit is contained in:
@@ -1323,6 +1323,11 @@ def status() -> None:
|
|||||||
_status()
|
_status()
|
||||||
|
|
||||||
registry = _service_registry(str(DECNET_INGEST_LOG_FILE))
|
registry = _service_registry(str(DECNET_INGEST_LOG_FILE))
|
||||||
|
# On agents, the Mutator runs master-side only (it schedules decky
|
||||||
|
# respawns across the swarm) and the API is never shipped. Hide those
|
||||||
|
# rows so operators aren't chasing permanent DOWN entries.
|
||||||
|
if _agent_mode_active():
|
||||||
|
registry = [r for r in registry if r[0] not in {"Mutator", "API"}]
|
||||||
svc_table = Table(title="DECNET Services", show_lines=True)
|
svc_table = Table(title="DECNET Services", show_lines=True)
|
||||||
svc_table.add_column("Service", style="bold cyan")
|
svc_table.add_column("Service", style="bold cyan")
|
||||||
svc_table.add_column("Status")
|
svc_table.add_column("Status")
|
||||||
@@ -1762,13 +1767,17 @@ def db_reset(
|
|||||||
# MASTER_ONLY when touching command registration.
|
# MASTER_ONLY when touching command registration.
|
||||||
#
|
#
|
||||||
# Worker-legitimate commands (NOT in these sets): agent, updater, forwarder,
|
# Worker-legitimate commands (NOT in these sets): agent, updater, forwarder,
|
||||||
# status (agents run deckies locally and should be able to inspect them).
|
# status, collect, probe, profiler, sniffer. Agents run deckies locally and
|
||||||
|
# should be able to inspect them + run the per-host microservices (collector
|
||||||
|
# streams container logs, prober/profiler characterize attackers hitting
|
||||||
|
# this host, sniffer captures traffic). Mutator stays master-only because
|
||||||
|
# it orchestrates respawns across the swarm.
|
||||||
# ───────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────
|
||||||
MASTER_ONLY_COMMANDS: frozenset[str] = frozenset({
|
MASTER_ONLY_COMMANDS: frozenset[str] = frozenset({
|
||||||
"api", "swarmctl", "deploy", "redeploy", "teardown",
|
"api", "swarmctl", "deploy", "redeploy", "teardown",
|
||||||
"probe", "collect", "mutate", "listener",
|
"mutate", "listener",
|
||||||
"services", "distros", "correlate", "archetypes", "web",
|
"services", "distros", "correlate", "archetypes", "web",
|
||||||
"profiler", "sniffer", "db-reset",
|
"db-reset",
|
||||||
})
|
})
|
||||||
MASTER_ONLY_GROUPS: frozenset[str] = frozenset({"swarm"})
|
MASTER_ONLY_GROUPS: frozenset[str] = frozenset({"swarm"})
|
||||||
|
|
||||||
|
|||||||
@@ -237,6 +237,14 @@ def _run_pip(
|
|||||||
AGENT_SYSTEMD_UNIT = "decnet-agent.service"
|
AGENT_SYSTEMD_UNIT = "decnet-agent.service"
|
||||||
FORWARDER_SYSTEMD_UNIT = "decnet-forwarder.service"
|
FORWARDER_SYSTEMD_UNIT = "decnet-forwarder.service"
|
||||||
UPDATER_SYSTEMD_UNIT = "decnet-updater.service"
|
UPDATER_SYSTEMD_UNIT = "decnet-updater.service"
|
||||||
|
# Per-host microservices that run out of the same /opt/decnet tree. An
|
||||||
|
# update replaces their code, so we must cycle them alongside the agent or
|
||||||
|
# they keep running the pre-update code. Best-effort: legacy enrollments
|
||||||
|
# without these units installed shouldn't abort the update.
|
||||||
|
AUXILIARY_SYSTEMD_UNITS = (
|
||||||
|
"decnet-collector.service", "decnet-prober.service",
|
||||||
|
"decnet-profiler.service", "decnet-sniffer.service",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _systemd_available() -> bool:
|
def _systemd_available() -> bool:
|
||||||
@@ -286,6 +294,13 @@ def _spawn_agent_via_systemd(install_dir: pathlib.Path) -> int:
|
|||||||
)
|
)
|
||||||
if fwd.returncode != 0:
|
if fwd.returncode != 0:
|
||||||
log.warning("forwarder restart failed (ignored): %s", fwd.stderr.strip())
|
log.warning("forwarder restart failed (ignored): %s", fwd.stderr.strip())
|
||||||
|
for unit in AUXILIARY_SYSTEMD_UNITS:
|
||||||
|
aux = subprocess.run( # nosec B603 B607
|
||||||
|
["systemctl", "restart", unit],
|
||||||
|
check=False, capture_output=True, text=True,
|
||||||
|
)
|
||||||
|
if aux.returncode != 0:
|
||||||
|
log.warning("%s restart failed (ignored): %s", unit, aux.stderr.strip())
|
||||||
pid_out = subprocess.run( # nosec B603 B607
|
pid_out = subprocess.run( # nosec B603 B607
|
||||||
["systemctl", "show", "--property=MainPID", "--value", AGENT_SYSTEMD_UNIT],
|
["systemctl", "show", "--property=MainPID", "--value", AGENT_SYSTEMD_UNIT],
|
||||||
check=True, capture_output=True, text=True,
|
check=True, capture_output=True, text=True,
|
||||||
|
|||||||
@@ -63,10 +63,19 @@ _EXCLUDES: tuple[str, ...] = (
|
|||||||
"wiki-checkout", "wiki-checkout/*",
|
"wiki-checkout", "wiki-checkout/*",
|
||||||
# Frontend is master-only; agents never serve UI.
|
# Frontend is master-only; agents never serve UI.
|
||||||
"decnet_web", "decnet_web/*", "decnet_web/**",
|
"decnet_web", "decnet_web/*", "decnet_web/**",
|
||||||
# Master FastAPI app (API, routers, master-side DB) is not run on agents.
|
# Master API surface. Agents ship with decnet.web.db + auth + dependencies
|
||||||
# The `agent` / `updater` / `forwarder` commands have their own apps under
|
# (the profiler microservice needs the repo singleton), but the FastAPI
|
||||||
# decnet/agent, decnet/updater — they don't import decnet.web.
|
# app itself (api.py, swarm_api.py, the full router tree, the ingester,
|
||||||
"decnet/web", "decnet/web/*", "decnet/web/**",
|
# and the .j2 templates that the master renders into the tarball) has no
|
||||||
|
# business running on a worker.
|
||||||
|
"decnet/web/api.py",
|
||||||
|
"decnet/web/swarm_api.py",
|
||||||
|
"decnet/web/ingester.py",
|
||||||
|
"decnet/web/router", "decnet/web/router/*", "decnet/web/router/**",
|
||||||
|
"decnet/web/templates", "decnet/web/templates/*", "decnet/web/templates/**",
|
||||||
|
# Mutator is master-only (it schedules decky respawns across the swarm);
|
||||||
|
# agents never invoke it. Keep it off the worker.
|
||||||
|
"decnet/mutator", "decnet/mutator/*", "decnet/mutator/**",
|
||||||
"decnet-state.json",
|
"decnet-state.json",
|
||||||
"master.log", "master.json",
|
"master.log", "master.json",
|
||||||
"decnet.tar",
|
"decnet.tar",
|
||||||
@@ -254,7 +263,11 @@ def _build_tarball(
|
|||||||
return buf.getvalue()
|
return buf.getvalue()
|
||||||
|
|
||||||
|
|
||||||
_SYSTEMD_UNITS = ("decnet-agent", "decnet-forwarder", "decnet-engine", "decnet-updater")
|
_SYSTEMD_UNITS = (
|
||||||
|
"decnet-agent", "decnet-forwarder", "decnet-engine", "decnet-updater",
|
||||||
|
# Per-host microservices — activated by enroll_bootstrap.sh.
|
||||||
|
"decnet-collector", "decnet-prober", "decnet-profiler", "decnet-sniffer",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _render_systemd_unit(name: str, agent_name: str, master_host: str) -> bytes:
|
def _render_systemd_unit(name: str, agent_name: str, master_host: str) -> bytes:
|
||||||
|
|||||||
20
decnet/web/templates/decnet-collector.service.j2
Normal file
20
decnet/web/templates/decnet-collector.service.j2
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=DECNET container log collector — {{ agent_name }}
|
||||||
|
Documentation=https://github.com/anti/DECNET
|
||||||
|
After=network-online.target decnet-agent.service
|
||||||
|
Wants=network-online.target
|
||||||
|
PartOf=decnet-agent.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
WorkingDirectory=/opt/decnet
|
||||||
|
Environment=DECNET_MODE=agent
|
||||||
|
Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.collector.log
|
||||||
|
ExecStart=/usr/local/bin/decnet collect --log-file /var/log/decnet/decnet.log
|
||||||
|
Restart=on-failure
|
||||||
|
RestartSec=5
|
||||||
|
StandardOutput=append:/var/log/decnet/decnet.collector.log
|
||||||
|
StandardError=append:/var/log/decnet/decnet.collector.log
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
20
decnet/web/templates/decnet-prober.service.j2
Normal file
20
decnet/web/templates/decnet-prober.service.j2
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=DECNET attacker prober (JARM/HASSH/TCP fingerprint) — {{ agent_name }}
|
||||||
|
Documentation=https://github.com/anti/DECNET
|
||||||
|
After=network-online.target decnet-agent.service
|
||||||
|
Wants=network-online.target
|
||||||
|
PartOf=decnet-agent.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
WorkingDirectory=/opt/decnet
|
||||||
|
Environment=DECNET_MODE=agent
|
||||||
|
Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.prober.log
|
||||||
|
ExecStart=/usr/local/bin/decnet probe --log-file /var/log/decnet/decnet.log --interval 300
|
||||||
|
Restart=on-failure
|
||||||
|
RestartSec=5
|
||||||
|
StandardOutput=append:/var/log/decnet/decnet.prober.log
|
||||||
|
StandardError=append:/var/log/decnet/decnet.prober.log
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
20
decnet/web/templates/decnet-profiler.service.j2
Normal file
20
decnet/web/templates/decnet-profiler.service.j2
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=DECNET attacker profiler — {{ agent_name }}
|
||||||
|
Documentation=https://github.com/anti/DECNET
|
||||||
|
After=network-online.target decnet-agent.service
|
||||||
|
Wants=network-online.target
|
||||||
|
PartOf=decnet-agent.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
WorkingDirectory=/opt/decnet
|
||||||
|
Environment=DECNET_MODE=agent
|
||||||
|
Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.profiler.log
|
||||||
|
ExecStart=/usr/local/bin/decnet profiler --interval 30
|
||||||
|
Restart=on-failure
|
||||||
|
RestartSec=5
|
||||||
|
StandardOutput=append:/var/log/decnet/decnet.profiler.log
|
||||||
|
StandardError=append:/var/log/decnet/decnet.profiler.log
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
24
decnet/web/templates/decnet-sniffer.service.j2
Normal file
24
decnet/web/templates/decnet-sniffer.service.j2
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=DECNET network sniffer — {{ agent_name }}
|
||||||
|
Documentation=https://github.com/anti/DECNET
|
||||||
|
After=network-online.target decnet-agent.service
|
||||||
|
Wants=network-online.target
|
||||||
|
PartOf=decnet-agent.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
WorkingDirectory=/opt/decnet
|
||||||
|
Environment=DECNET_MODE=agent
|
||||||
|
Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.sniffer.log
|
||||||
|
# scapy needs raw sockets; forwarder already runs with these caps, so we
|
||||||
|
# mirror the same ambient set here.
|
||||||
|
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW
|
||||||
|
CapabilityBoundingSet=CAP_NET_ADMIN CAP_NET_RAW
|
||||||
|
ExecStart=/usr/local/bin/decnet sniffer --log-file /var/log/decnet/decnet.log
|
||||||
|
Restart=on-failure
|
||||||
|
RestartSec=5
|
||||||
|
StandardOutput=append:/var/log/decnet/decnet.sniffer.log
|
||||||
|
StandardError=append:/var/log/decnet/decnet.sniffer.log
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
@@ -60,15 +60,24 @@ chmod 0755 "$VENV_DIR/bin/decnet"
|
|||||||
ln -sf "$VENV_DIR/bin/decnet" /usr/local/bin/decnet
|
ln -sf "$VENV_DIR/bin/decnet" /usr/local/bin/decnet
|
||||||
|
|
||||||
echo "[DECNET] installing systemd units..."
|
echo "[DECNET] installing systemd units..."
|
||||||
install -Dm0644 etc/systemd/system/decnet-agent.service /etc/systemd/system/decnet-agent.service
|
for unit in \
|
||||||
install -Dm0644 etc/systemd/system/decnet-forwarder.service /etc/systemd/system/decnet-forwarder.service
|
decnet-agent decnet-forwarder decnet-engine \
|
||||||
install -Dm0644 etc/systemd/system/decnet-engine.service /etc/systemd/system/decnet-engine.service
|
decnet-collector decnet-prober decnet-profiler decnet-sniffer; do
|
||||||
|
install -Dm0644 "etc/systemd/system/${unit}.service" "/etc/systemd/system/${unit}.service"
|
||||||
|
done
|
||||||
if [[ "$WITH_UPDATER" == "true" ]]; then
|
if [[ "$WITH_UPDATER" == "true" ]]; then
|
||||||
install -Dm0644 etc/systemd/system/decnet-updater.service /etc/systemd/system/decnet-updater.service
|
install -Dm0644 etc/systemd/system/decnet-updater.service /etc/systemd/system/decnet-updater.service
|
||||||
fi
|
fi
|
||||||
systemctl daemon-reload
|
systemctl daemon-reload
|
||||||
|
|
||||||
ACTIVE_UNITS=(decnet-agent.service decnet-forwarder.service)
|
# Agent + forwarder are the control plane; collector/prober/profiler/sniffer
|
||||||
|
# are the per-host microservices that used to require `decnet deploy` to
|
||||||
|
# auto-spawn. With systemd units they come up at boot and auto-restart.
|
||||||
|
ACTIVE_UNITS=(
|
||||||
|
decnet-agent.service decnet-forwarder.service
|
||||||
|
decnet-collector.service decnet-prober.service
|
||||||
|
decnet-profiler.service decnet-sniffer.service
|
||||||
|
)
|
||||||
if [[ "$WITH_UPDATER" == "true" ]]; then
|
if [[ "$WITH_UPDATER" == "true" ]]; then
|
||||||
ACTIVE_UNITS+=(decnet-updater.service)
|
ACTIVE_UNITS+=(decnet-updater.service)
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -184,6 +184,10 @@ async def test_systemd_units_shipped_and_installed(client, auth_token):
|
|||||||
assert "etc/systemd/system/decnet-agent.service" in names
|
assert "etc/systemd/system/decnet-agent.service" in names
|
||||||
assert "etc/systemd/system/decnet-forwarder.service" in names
|
assert "etc/systemd/system/decnet-forwarder.service" in names
|
||||||
assert "etc/systemd/system/decnet-engine.service" in names
|
assert "etc/systemd/system/decnet-engine.service" in names
|
||||||
|
# Per-host microservices get their own systemd units now.
|
||||||
|
for unit in ("decnet-collector", "decnet-prober",
|
||||||
|
"decnet-profiler", "decnet-sniffer"):
|
||||||
|
assert f"etc/systemd/system/{unit}.service" in names, unit
|
||||||
|
|
||||||
fwd = tf.extractfile("etc/systemd/system/decnet-forwarder.service").read().decode()
|
fwd = tf.extractfile("etc/systemd/system/decnet-forwarder.service").read().decode()
|
||||||
assert "--master-host 10.9.8.7" in fwd
|
assert "--master-host 10.9.8.7" in fwd
|
||||||
@@ -197,8 +201,14 @@ async def test_systemd_units_shipped_and_installed(client, auth_token):
|
|||||||
master_host="10.9.8.7")).json()["token"]
|
master_host="10.9.8.7")).json()["token"]
|
||||||
sh = (await client.get(f"/api/v1/swarm/enroll-bundle/{sh_token}.sh")).text
|
sh = (await client.get(f"/api/v1/swarm/enroll-bundle/{sh_token}.sh")).text
|
||||||
assert "systemctl daemon-reload" in sh
|
assert "systemctl daemon-reload" in sh
|
||||||
# Agent + forwarder always enabled; updater conditional on WITH_UPDATER.
|
# Agent + forwarder + per-host microservices always enabled; updater
|
||||||
assert "decnet-agent.service decnet-forwarder.service" in sh
|
# conditional on WITH_UPDATER.
|
||||||
|
for unit in (
|
||||||
|
"decnet-agent.service", "decnet-forwarder.service",
|
||||||
|
"decnet-collector.service", "decnet-prober.service",
|
||||||
|
"decnet-profiler.service", "decnet-sniffer.service",
|
||||||
|
):
|
||||||
|
assert unit in sh, unit
|
||||||
assert "decnet-updater.service" in sh
|
assert "decnet-updater.service" in sh
|
||||||
|
|
||||||
ini = tf.extractfile("etc/decnet/decnet.ini").read().decode()
|
ini = tf.extractfile("etc/decnet/decnet.ini").read().decode()
|
||||||
@@ -299,9 +309,16 @@ async def test_get_tgz_contents(client, auth_token, tmp_path):
|
|||||||
assert ".env.example" not in bad, f"leaked env file: {bad}"
|
assert ".env.example" not in bad, f"leaked env file: {bad}"
|
||||||
# Master-only trees: agents don't run the FastAPI master app or the
|
# Master-only trees: agents don't run the FastAPI master app or the
|
||||||
# React frontend, so shipping them bloats the tarball and widens the
|
# React frontend, so shipping them bloats the tarball and widens the
|
||||||
# worker's attack surface for no benefit.
|
# worker's attack surface for no benefit. decnet/web/db, decnet/web/auth.py and
|
||||||
|
# decnet/web/dependencies.py DO ship — the profiler microservice on
|
||||||
|
# the agent needs the repo singleton.
|
||||||
assert not bad.startswith("decnet_web/"), f"leaked frontend: {bad}"
|
assert not bad.startswith("decnet_web/"), f"leaked frontend: {bad}"
|
||||||
assert not bad.startswith("decnet/web/"), f"leaked master-api: {bad}"
|
assert bad != "decnet/web/api.py", f"leaked master API: {bad}"
|
||||||
|
assert bad != "decnet/web/swarm_api.py", f"leaked swarm API: {bad}"
|
||||||
|
assert bad != "decnet/web/ingester.py", f"leaked ingester: {bad}"
|
||||||
|
assert not bad.startswith("decnet/web/router/"), f"leaked router: {bad}"
|
||||||
|
assert not bad.startswith("decnet/web/templates/"), f"leaked tpl: {bad}"
|
||||||
|
assert not bad.startswith("decnet/mutator/"), f"leaked mutator: {bad}"
|
||||||
|
|
||||||
# INI content is correct
|
# INI content is correct
|
||||||
ini = tf.extractfile("etc/decnet/decnet.ini").read().decode()
|
ini = tf.extractfile("etc/decnet/decnet.ini").read().decode()
|
||||||
|
|||||||
@@ -419,11 +419,17 @@ def test_spawn_agent_via_systemd_records_main_pid(
|
|||||||
pid = ex._spawn_agent_via_systemd(install_dir)
|
pid = ex._spawn_agent_via_systemd(install_dir)
|
||||||
assert pid == 4711
|
assert pid == 4711
|
||||||
assert (install_dir / "agent.pid").read_text() == "4711"
|
assert (install_dir / "agent.pid").read_text() == "4711"
|
||||||
# Agent restart, forwarder restart, then MainPID lookup on the agent.
|
# Agent restart, forwarder restart, each aux microservice, then the
|
||||||
|
# MainPID lookup on the agent.
|
||||||
assert calls[0] == ["systemctl", "restart", ex.AGENT_SYSTEMD_UNIT]
|
assert calls[0] == ["systemctl", "restart", ex.AGENT_SYSTEMD_UNIT]
|
||||||
assert calls[1] == ["systemctl", "restart", ex.FORWARDER_SYSTEMD_UNIT]
|
assert calls[1] == ["systemctl", "restart", ex.FORWARDER_SYSTEMD_UNIT]
|
||||||
assert calls[2][:2] == ["systemctl", "show"]
|
aux_calls = calls[2 : 2 + len(ex.AUXILIARY_SYSTEMD_UNITS)]
|
||||||
assert ex.AGENT_SYSTEMD_UNIT in calls[2]
|
assert aux_calls == [
|
||||||
|
["systemctl", "restart", unit] for unit in ex.AUXILIARY_SYSTEMD_UNITS
|
||||||
|
]
|
||||||
|
show_call = calls[2 + len(ex.AUXILIARY_SYSTEMD_UNITS)]
|
||||||
|
assert show_call[:2] == ["systemctl", "show"]
|
||||||
|
assert ex.AGENT_SYSTEMD_UNIT in show_call
|
||||||
|
|
||||||
|
|
||||||
def test_spawn_agent_via_systemd_tolerates_missing_forwarder_unit(
|
def test_spawn_agent_via_systemd_tolerates_missing_forwarder_unit(
|
||||||
|
|||||||
Reference in New Issue
Block a user