feat(swarm): self-destruct agent on decommission
Decommissioning a worker from the dashboard (or swarm controller) now asks the agent to wipe its own install before the master forgets it. The agent stops decky containers and the known decnet-* systemd units (agent, engine, collector, forwarder, prober, sniffer, updater), then deletes /opt/decnet*, /etc/systemd/system/decnet-*.service and decnet-*.timer, /var/lib/decnet/*, and /usr/local/bin/decnet*. Logs under /var/log are preserved. The reaper runs as a detached /tmp script (start_new_session=True) so it survives the agent process being killed. Self-destruct dispatch is best-effort — a dead worker doesn't block master-side cleanup.
This commit is contained in:
@@ -115,6 +115,76 @@ def _decky_runtime_states(config: DecnetConfig) -> dict[str, dict[str, Any]]:
|
||||
return out
|
||||
|
||||
|
||||
# Bash source of the self-destruct reaper. self_destruct() writes this text
# to a temp file and runs it with /bin/bash; nothing is interpolated into it.
#
# NOTE(review): the script stops decnet-agent itself. If the agent runs as a
# systemd service with the default KillMode=control-group, that stop kills
# every process in the unit's cgroup, and a new session/process group does
# not leave the cgroup. The script therefore re-runs itself once inside a
# transient scope unit before touching systemd, and falls back to running
# inline when that is not possible. Confirm against the agent's unit file.
_REAPER_SCRIPT = r"""#!/bin/bash
# DECNET agent self-destruct reaper.
# Runs detached from the agent process so it survives the agent's death.
# Waits briefly for the HTTP response to drain, then stops services,
# wipes install paths, and preserves logs.
set +e

# Leave the agent's cgroup before stopping the agent unit: re-run this
# script once inside a transient scope. DECNET_REAPER_DETACHED marks the
# re-run so it does not recurse. If systemd-run is missing or refuses
# (no systemd, no privileges), carry on inline.
if [ -z "$DECNET_REAPER_DETACHED" ] && [ -d /run/systemd/system ] && command -v systemd-run >/dev/null 2>&1; then
    DECNET_REAPER_DETACHED=1 systemd-run --scope /bin/bash "$0" 2>/dev/null && exit 0
fi

sleep 3

# Stop decky containers started by the local deployer (best-effort).
if command -v docker >/dev/null 2>&1; then
    docker ps -q --filter "label=com.docker.compose.project=decnet" | xargs -r docker stop
    docker ps -aq --filter "label=com.docker.compose.project=decnet" | xargs -r docker rm -f
    docker network rm decnet_lan 2>/dev/null
fi

# Stop+disable every systemd unit the installer may have dropped.
for unit in decnet-agent decnet-engine decnet-collector decnet-forwarder decnet-prober decnet-sniffer decnet-updater; do
    systemctl stop "$unit" 2>/dev/null
    systemctl disable "$unit" 2>/dev/null
done

# Nuke install paths. Logs under /var/log/decnet* are intentionally
# preserved — the operator typically wants them for forensic review.
rm -rf /opt/decnet* /var/lib/decnet/* /usr/local/bin/decnet*
rm -f /etc/systemd/system/decnet-*.service /etc/systemd/system/decnet-*.timer

systemctl daemon-reload 2>/dev/null
rm -f "$0"

# Always report success so a surviving parent copy does not repeat the work.
exit 0
"""
|
||||
|
||||
|
||||
async def self_destruct() -> None:
    """Wipe this agent's install by handing the work to a detached reaper.

    Steps, in order:

    1. Run the deployer teardown and clear the saved state in worker
       threads. Any failure is logged and otherwise ignored.
    2. Write ``_REAPER_SCRIPT`` to a new ``decnet-reaper-*.sh`` file under
       ``/tmp`` and make it executable for its owner only.
    3. Launch the script with ``/bin/bash`` in a new session, with all
       standard streams pointed at the null device, and do not wait for it.

    The coroutine returns right after the launch; the script sleeps before
    it deletes anything, which leaves time for the HTTP response to drain.
    """
    import os
    import subprocess  # nosec B404
    import tempfile

    # The reaper force-stops containers too, but the deployer path lets the
    # host-macvlan/ipvlan helper remove its routes in an orderly way first.
    try:
        await asyncio.to_thread(_deployer.teardown, None)
        await asyncio.to_thread(clear_state)
    except Exception:
        log.exception("self_destruct: pre-reap teardown failed — reaper will force-stop containers")

    # The script must not live under /opt/decnet*, which it removes.
    script_fd, script_path = tempfile.mkstemp(prefix="decnet-reaper-", suffix=".sh", dir="/tmp")  # nosec B108 — reaper must outlive /opt/decnet removal
    try:
        os.write(script_fd, _REAPER_SCRIPT.encode())
    finally:
        os.close(script_fd)
    os.chmod(script_path, 0o700)  # nosec B103 — root-owned reaper, needs exec

    # A new session gives the child its own process group, so signals aimed
    # at the agent's process group do not reach the reaper.
    reaper_argv = ["/bin/bash", script_path]
    null_stream = subprocess.DEVNULL
    subprocess.Popen(  # nosec B603
        reaper_argv,
        stdin=null_stream,
        stdout=null_stream,
        stderr=null_stream,
        start_new_session=True,
        close_fds=True,
    )
    log.warning("self_destruct: reaper spawned path=%s — agent will die in ~3s", script_path)
|
||||
|
||||
|
||||
async def status() -> dict[str, Any]:
|
||||
state = await asyncio.to_thread(load_state)
|
||||
if state is None:
|
||||
|
||||
Reference in New Issue
Block a user