fix(agent): escape systemd cgroup when spawning self-destruct reaper

The reaper was being SIGTERM'd mid-rm because `start_new_session=True`
only forks a new POSIX session — it does not escape decnet-agent.service's
cgroup. When the reaper ran `systemctl stop decnet-agent`, systemd
tore down the whole cgroup (reaper included) before `rm -rf /opt/decnet*`
finished, leaving the install on disk.

Spawn the reaper via `systemd-run --collect --unit decnet-reaper-<pid>`
so it runs as a fresh transient service unit with its own cgroup, outside
the agent unit. Falls back to bare Popen for non-systemd hosts.
This commit is contained in:
2026-04-19 21:00:43 -04:00
parent 14250cacad
commit 00d5799a79
2 changed files with 34 additions and 7 deletions

View File

@@ -152,6 +152,7 @@ async def self_destruct() -> None:
install footprint. Returns immediately so the HTTP response can drain
before the reaper starts deleting files out from under the agent."""
import os
import shutil
import subprocess # nosec B404
import tempfile
@@ -172,17 +173,39 @@ async def self_destruct() -> None:
os.close(fd)
os.chmod(path, 0o700) # nosec B103 — root-owned reaper, needs exec
# start_new_session detaches from the agent process group so the
# reaper isn't killed when systemctl stop decnet-agent fires.
# The reaper MUST run outside decnet-agent.service's cgroup — otherwise
# `systemctl stop decnet-agent` SIGTERMs the whole cgroup (reaper included)
# before rm -rf completes. `start_new_session=True` gets us a fresh POSIX
# session but does NOT escape the systemd cgroup. So we prefer
# `systemd-run` (invoked without `--scope`, so the command is launched as
# a transient service unit detached from the caller's service), falling
# back to a bare Popen if systemd-run is unavailable (non-systemd host /
# container).
systemd_run = shutil.which("systemd-run")
if systemd_run:
argv = [
systemd_run,
"--collect",
"--unit", f"decnet-reaper-{os.getpid()}",
"--description", "DECNET agent self-destruct reaper",
"/bin/bash", path,
]
spawn_kwargs = {"start_new_session": True}
else:
argv = ["/bin/bash", path]
spawn_kwargs = {"start_new_session": True}
subprocess.Popen( # nosec B603
["/bin/bash", path],
argv,
stdin=subprocess.DEVNULL,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True,
close_fds=True,
**spawn_kwargs,
)
log.warning(
"self_destruct: reaper spawned path=%s via=%s — agent will die in ~3s",
path, "systemd-run" if systemd_run else "popen",
)
log.warning("self_destruct: reaper spawned path=%s — agent will die in ~3s", path)
async def status() -> dict[str, Any]:

View File

@@ -71,8 +71,12 @@ def test_self_destruct_spawns_reaper_and_returns_fast(monkeypatch, tmp_path) ->
assert resp.json()["status"] == "self_destruct_scheduled"
assert len(spawned) == 1
assert spawned[0]["kw"].get("start_new_session") is True
script_path = spawned[0]["args"][1]
assert script_path.startswith("/tmp/decnet-reaper-")
script_candidates = [
a for a in spawned[0]["args"]
if isinstance(a, str) and a.startswith("/tmp/decnet-reaper-")
]
assert len(script_candidates) == 1, spawned[0]["args"]
script_path = script_candidates[0]
# Reaper content sanity check — covers the paths the operator asked for.
import pathlib
body = pathlib.Path(script_path).read_text()