From 00d5799a796d66dd9c4b8a7d40b09b17c83f164e Mon Sep 17 00:00:00 2001 From: anti Date: Sun, 19 Apr 2026 21:00:43 -0400 Subject: [PATCH] fix(agent): escape systemd cgroup when spawning self-destruct reaper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reaper was being SIGTERM'd mid-rm because `start_new_session=True` only forks a new POSIX session — it does not escape decnet-agent.service's cgroup. When the reaper ran `systemctl stop decnet-agent`, systemd tore down the whole cgroup (reaper included) before `rm -rf /opt/decnet*` finished, leaving the install on disk. Spawn the reaper via `systemd-run --collect --unit decnet-reaper-<pid>` so it runs in a fresh transient service unit, outside the agent unit. Falls back to bare Popen for non-systemd hosts. --- decnet/agent/executor.py | 33 ++++++++++++++++++++++++++++----- tests/swarm/test_agent_app.py | 8 ++++++-- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/decnet/agent/executor.py b/decnet/agent/executor.py index b93a0cb..3fa331d 100644 --- a/decnet/agent/executor.py +++ b/decnet/agent/executor.py @@ -152,6 +152,7 @@ async def self_destruct() -> None: install footprint. Returns immediately so the HTTP response can drain before the reaper starts deleting files out from under the agent.""" import os + import shutil import subprocess # nosec B404 import tempfile @@ -172,17 +173,39 @@ async def self_destruct() -> None: os.close(fd) os.chmod(path, 0o700) # nosec B103 — root-owned reaper, needs exec - # start_new_session detaches from the agent process group so the - # reaper isn't killed when systemctl stop decnet-agent fires. + # The reaper MUST run outside decnet-agent.service's cgroup — otherwise + # `systemctl stop decnet-agent` SIGTERMs the whole cgroup (reaper included) + # before rm -rf completes. `start_new_session=True` gets us a fresh POSIX + # session but does NOT escape the systemd cgroup.
So we prefer + `systemd-run --unit` (launches the command in a transient service unit + detached from the caller's service), falling back to a bare Popen if + systemd-run is unavailable (non-systemd host / container). + systemd_run = shutil.which("systemd-run") + if systemd_run: + argv = [ + systemd_run, + "--collect", + "--unit", f"decnet-reaper-{os.getpid()}", + "--description", "DECNET agent self-destruct reaper", + "/bin/bash", path, + ] + spawn_kwargs = {"start_new_session": True} + else: + argv = ["/bin/bash", path] + spawn_kwargs = {"start_new_session": True} + subprocess.Popen( # nosec B603 - ["/bin/bash", path], + argv, stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, - start_new_session=True, close_fds=True, + **spawn_kwargs, + ) + log.warning( + "self_destruct: reaper spawned path=%s via=%s — agent will die in ~3s", + path, "systemd-run" if systemd_run else "popen", ) - log.warning("self_destruct: reaper spawned path=%s — agent will die in ~3s", path) async def status() -> dict[str, Any]: diff --git a/tests/swarm/test_agent_app.py b/tests/swarm/test_agent_app.py index 725c399..a4e9fae 100644 --- a/tests/swarm/test_agent_app.py +++ b/tests/swarm/test_agent_app.py @@ -71,8 +71,12 @@ def test_self_destruct_spawns_reaper_and_returns_fast(monkeypatch, tmp_path) -> assert resp.json()["status"] == "self_destruct_scheduled" assert len(spawned) == 1 assert spawned[0]["kw"].get("start_new_session") is True - script_path = spawned[0]["args"][1] - assert script_path.startswith("/tmp/decnet-reaper-") + script_candidates = [ + a for a in spawned[0]["args"] + if isinstance(a, str) and a.startswith("/tmp/decnet-reaper-") + ] + assert len(script_candidates) == 1, spawned[0]["args"] + script_path = script_candidates[0] # Reaper content sanity check — covers the paths the operator asked for. import pathlib body = pathlib.Path(script_path).read_text()