feat(swarm): per-host microservices as systemd units, mutator off agents

Previously `decnet status` on an agent showed every microservice as DOWN
because deploy's auto-spawn was unihost-scoped and the agent CLI gate
hid the per-host commands. Now:

  - collect, probe, profiler, sniffer drop out of MASTER_ONLY_COMMANDS
    (they run per-host; master-side work stays master-gated).
  - mutate stays master-only (it orchestrates swarm-wide respawns).
  - decnet/mutator/ excluded from agent tarballs — never invoked there.
  - decnet/web exclusion narrowed from the whole tree to its master-only
    parts: db/ + auth.py + dependencies.py now ship (profiler needs the
    repo singleton); api.py, swarm_api.py, ingester.py, router/ and
    templates/ stay excluded.
  - Four new systemd unit templates (decnet-collector/prober/profiler/
    sniffer) shipped in every enrollment tarball.
  - enroll_bootstrap.sh enables + starts all four alongside agent and
    forwarder at install time.
  - updater restarts the aux units on code push so they pick up the new
    release (best-effort — legacy enrollments without the units won't
    fail the update).
  - status table hides Mutator + API rows in agent mode.
This commit is contained in:
2026-04-19 18:58:48 -04:00
parent ee9ade4cd5
commit 6d7877c679
10 changed files with 172 additions and 19 deletions

View File

@@ -184,6 +184,10 @@ async def test_systemd_units_shipped_and_installed(client, auth_token):
assert "etc/systemd/system/decnet-agent.service" in names
assert "etc/systemd/system/decnet-forwarder.service" in names
assert "etc/systemd/system/decnet-engine.service" in names
# Per-host microservices get their own systemd units now.
for unit in ("decnet-collector", "decnet-prober",
"decnet-profiler", "decnet-sniffer"):
assert f"etc/systemd/system/{unit}.service" in names, unit
fwd = tf.extractfile("etc/systemd/system/decnet-forwarder.service").read().decode()
assert "--master-host 10.9.8.7" in fwd
@@ -197,8 +201,14 @@ async def test_systemd_units_shipped_and_installed(client, auth_token):
master_host="10.9.8.7")).json()["token"]
sh = (await client.get(f"/api/v1/swarm/enroll-bundle/{sh_token}.sh")).text
assert "systemctl daemon-reload" in sh
# Agent + forwarder always enabled; updater conditional on WITH_UPDATER.
assert "decnet-agent.service decnet-forwarder.service" in sh
# Agent + forwarder + per-host microservices always enabled; updater
# conditional on WITH_UPDATER.
for unit in (
"decnet-agent.service", "decnet-forwarder.service",
"decnet-collector.service", "decnet-prober.service",
"decnet-profiler.service", "decnet-sniffer.service",
):
assert unit in sh, unit
assert "decnet-updater.service" in sh
ini = tf.extractfile("etc/decnet/decnet.ini").read().decode()
@@ -299,9 +309,16 @@ async def test_get_tgz_contents(client, auth_token, tmp_path):
assert ".env.example" not in bad, f"leaked env file: {bad}"
# Master-only trees: agents don't run the FastAPI master app or the
# React frontend, so shipping them bloats the tarball and widens the
# worker's attack surface for no benefit.
# worker's attack surface for no benefit. decnet/web/db and
# decnet/web/dependencies.py DO ship — the profiler microservice on
# the agent needs the repo singleton.
assert not bad.startswith("decnet_web/"), f"leaked frontend: {bad}"
assert not bad.startswith("decnet/web/"), f"leaked master-api: {bad}"
assert bad != "decnet/web/api.py", f"leaked master API: {bad}"
assert bad != "decnet/web/swarm_api.py", f"leaked swarm API: {bad}"
assert bad != "decnet/web/ingester.py", f"leaked ingester: {bad}"
assert not bad.startswith("decnet/web/router/"), f"leaked router: {bad}"
assert not bad.startswith("decnet/web/templates/"), f"leaked tpl: {bad}"
assert not bad.startswith("decnet/mutator/"), f"leaked mutator: {bad}"
# INI content is correct
ini = tf.extractfile("etc/decnet/decnet.ini").read().decode()

View File

@@ -419,11 +419,17 @@ def test_spawn_agent_via_systemd_records_main_pid(
pid = ex._spawn_agent_via_systemd(install_dir)
assert pid == 4711
assert (install_dir / "agent.pid").read_text() == "4711"
# Agent restart, forwarder restart, then MainPID lookup on the agent.
# Agent restart, forwarder restart, each aux microservice, then the
# MainPID lookup on the agent.
assert calls[0] == ["systemctl", "restart", ex.AGENT_SYSTEMD_UNIT]
assert calls[1] == ["systemctl", "restart", ex.FORWARDER_SYSTEMD_UNIT]
assert calls[2][:2] == ["systemctl", "show"]
assert ex.AGENT_SYSTEMD_UNIT in calls[2]
aux_calls = calls[2 : 2 + len(ex.AUXILIARY_SYSTEMD_UNITS)]
assert aux_calls == [
["systemctl", "restart", unit] for unit in ex.AUXILIARY_SYSTEMD_UNITS
]
show_call = calls[2 + len(ex.AUXILIARY_SYSTEMD_UNITS)]
assert show_call[:2] == ["systemctl", "show"]
assert ex.AGENT_SYSTEMD_UNIT in show_call
def test_spawn_agent_via_systemd_tolerates_missing_forwarder_unit(