feat(swarm): self-destruct agent on decommission
Decommissioning a worker from the dashboard (or swarm controller) now asks the agent to wipe its own install before the master forgets it. The agent stops decky containers + every decnet-* systemd unit, then deletes /opt/decnet*, /etc/systemd/system/decnet-*, /var/lib/decnet/*, and /usr/local/bin/decnet*. Logs under /var/log are preserved. The reaper runs as a detached /tmp script (start_new_session=True) so it survives the agent process being killed. Self-destruct dispatch is best-effort — a dead worker doesn't block master-side cleanup.
This commit is contained in:
@@ -42,4 +42,49 @@ def test_deploy_rejects_malformed_body() -> None:
|
||||
|
||||
def test_route_set() -> None:
    """The agent app must expose every control endpoint, /self-destruct included."""
    # Not every entry in app.routes necessarily carries a `path` attribute,
    # so filter before collecting.
    paths = {r.path for r in app.routes if hasattr(r, "path")}
    # One superset check is enough: the earlier five-endpoint assertion was a
    # strict subset of this set and could never fail on its own.
    expected = {
        "/health",
        "/status",
        "/deploy",
        "/teardown",
        "/mutate",
        "/self-destruct",
    }
    assert expected <= paths
|
||||
|
||||
|
||||
def test_self_destruct_spawns_reaper_and_returns_fast(monkeypatch, tmp_path) -> None:
    """/self-destruct must write the reaper script and spawn it detached
    (start_new_session=True). We intercept Popen so the test doesn't
    actually nuke anything.

    The reaper script that the endpoint writes is always removed at the end,
    even when one of the content assertions fails.
    """
    import pathlib
    import subprocess as _sp

    from decnet.agent import executor as _exec

    # Every Popen invocation is recorded here instead of being executed.
    spawned: list[dict] = []

    class _FakePopen:
        """Stand-in for subprocess.Popen that only records its arguments."""

        def __init__(self, args, **kw):
            spawned.append({"args": args, "kw": kw})

    # Replace the executor's deployer and state-clearing hook with no-ops
    # (presumably so the handler cannot touch real deployments -- the handler
    # itself is not visible from this test).
    monkeypatch.setattr(_exec, "_deployer", type("X", (), {
        "teardown": staticmethod(lambda _id: None),
    })())
    monkeypatch.setattr(_exec, "clear_state", lambda: None)
    monkeypatch.setattr(_sp, "Popen", _FakePopen)

    client = TestClient(app)
    resp = client.post("/self-destruct")
    assert resp.status_code == 200
    assert resp.json()["status"] == "self_destruct_scheduled"
    assert len(spawned) == 1
    assert spawned[0]["kw"].get("start_new_session") is True

    script_path = spawned[0]["args"][1]
    # Checked *before* the try block: the cleanup below must never delete a
    # path that is not a reaper script.
    assert script_path.startswith("/tmp/decnet-reaper-")

    reaper = pathlib.Path(script_path)
    try:
        # Reaper content sanity check — covers the paths the operator asked for.
        body = reaper.read_text()
        assert "/opt/decnet*" in body
        assert "/etc/systemd/system/decnet-" in body
        assert "/var/lib/decnet/*" in body
        assert "/usr/local/bin/decnet*" in body
        # Logs must be preserved — no `rm` line should touch /var/log.
        for line in body.splitlines():
            stripped = line.strip()
            if stripped.startswith("#") or not stripped:
                continue
            if stripped.startswith("rm "):
                assert "/var/log" not in stripped
    finally:
        # Do not leave the script behind in /tmp when an assertion fails.
        reaper.unlink(missing_ok=True)
|
||||
|
||||
@@ -158,6 +158,60 @@ def test_decommission_removes_host_and_bundle(
|
||||
assert not bundle_dir.exists()
|
||||
|
||||
|
||||
def test_decommission_dispatches_self_destruct_to_agent(
    client: TestClient, ca_dir: pathlib.Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Deleting a host must trigger a self-destruct call on its agent.

    Without that call the worker's agent would keep running after the
    dashboard has dropped its record.
    """
    from decnet.web.router.swarm import api_decommission_host as decom_mod

    # Names of the hosts whose agent received a self-destruct request.
    destructed: list[str] = []

    class _RecordingAgent:
        """Async-context-manager agent stub that records self_destruct calls."""

        def __init__(self, host=None, **_):
            self._host = host if host else {}

        async def __aenter__(self):
            return self

        async def __aexit__(self, *exc):
            return None

        async def self_destruct(self):
            host_name = self._host.get("name") or "?"
            destructed.append(host_name)
            return {"status": "self_destruct_scheduled"}

    monkeypatch.setattr(decom_mod, "AgentClient", _RecordingAgent)

    enroll_payload = {
        "name": "worker-nuke",
        "address": "10.0.0.8",
        "agent_port": 8765,
    }
    enrolled = client.post("/swarm/enroll", json=enroll_payload).json()
    host_uuid = enrolled["host_uuid"]

    response = client.delete(f"/swarm/hosts/{host_uuid}")

    assert response.status_code == 204
    assert destructed == ["worker-nuke"]
|
||||
|
||||
|
||||
def test_decommission_proceeds_when_agent_unreachable(
    client: TestClient, ca_dir: pathlib.Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """An agent whose self-destruct call raises must not block decommission.

    The DELETE still has to return 204 and the host must be gone afterwards.
    """
    from decnet.web.router.swarm import api_decommission_host as decom_mod

    class _UnreachableAgent:
        """Agent stub whose self_destruct always raises."""

        def __init__(self, host=None, **_):
            pass

        async def __aenter__(self):
            return self

        async def __aexit__(self, *exc):
            return None

        async def self_destruct(self):
            raise RuntimeError("connection refused")

    monkeypatch.setattr(decom_mod, "AgentClient", _UnreachableAgent)

    enroll_payload = {
        "name": "worker-dead",
        "address": "10.0.0.8",
        "agent_port": 8765,
    }
    enrolled = client.post("/swarm/enroll", json=enroll_payload).json()

    delete_response = client.delete(f"/swarm/hosts/{enrolled['host_uuid']}")
    assert delete_response.status_code == 204

    # The host record must no longer be retrievable.
    lookup = client.get(f"/swarm/hosts/{enrolled['host_uuid']}")
    assert lookup.status_code == 404
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- /deploy
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user