fix(agent): escape systemd cgroup when spawning self-destruct reaper

The reaper was being SIGTERM'd mid-rm because `start_new_session=True`
only creates a new POSIX session — it does not escape decnet-agent.service's
cgroup. When the reaper ran `systemctl stop decnet-agent`, systemd
tore down the whole cgroup (reaper included) before `rm -rf /opt/decnet*`
finished, leaving the install on disk.

Spawn the reaper via `systemd-run --collect --unit decnet-reaper-<pid>`
so it runs in its own transient service unit (no `--scope` is passed),
outside the agent unit's cgroup. Falls back to a bare Popen on
non-systemd hosts.
This commit is contained in:
2026-04-19 21:00:43 -04:00
parent 14250cacad
commit 00d5799a79
2 changed files with 34 additions and 7 deletions

View File

@@ -152,6 +152,7 @@ async def self_destruct() -> None:
install footprint. Returns immediately so the HTTP response can drain install footprint. Returns immediately so the HTTP response can drain
before the reaper starts deleting files out from under the agent.""" before the reaper starts deleting files out from under the agent."""
import os import os
import shutil
import subprocess # nosec B404 import subprocess # nosec B404
import tempfile import tempfile
@@ -172,17 +173,39 @@ async def self_destruct() -> None:
os.close(fd) os.close(fd)
os.chmod(path, 0o700) # nosec B103 — root-owned reaper, needs exec os.chmod(path, 0o700) # nosec B103 — root-owned reaper, needs exec
# start_new_session detaches from the agent process group so the # The reaper MUST run outside decnet-agent.service's cgroup — otherwise
# reaper isn't killed when systemctl stop decnet-agent fires. # `systemctl stop decnet-agent` SIGTERMs the whole cgroup (reaper included)
# before rm -rf completes. `start_new_session=True` gets us a fresh POSIX
# session but does NOT escape the systemd cgroup. So we prefer
# `systemd-run --scope` (launches the command in a transient scope
# detached from the caller's service), falling back to a bare Popen if
# systemd-run is unavailable (non-systemd host / container).
systemd_run = shutil.which("systemd-run")
if systemd_run:
argv = [
systemd_run,
"--collect",
"--unit", f"decnet-reaper-{os.getpid()}",
"--description", "DECNET agent self-destruct reaper",
"/bin/bash", path,
]
spawn_kwargs = {"start_new_session": True}
else:
argv = ["/bin/bash", path]
spawn_kwargs = {"start_new_session": True}
subprocess.Popen( # nosec B603 subprocess.Popen( # nosec B603
["/bin/bash", path], argv,
stdin=subprocess.DEVNULL, stdin=subprocess.DEVNULL,
stdout=subprocess.DEVNULL, stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
start_new_session=True,
close_fds=True, close_fds=True,
**spawn_kwargs,
)
log.warning(
"self_destruct: reaper spawned path=%s via=%s — agent will die in ~3s",
path, "systemd-run" if systemd_run else "popen",
) )
log.warning("self_destruct: reaper spawned path=%s — agent will die in ~3s", path)
async def status() -> dict[str, Any]: async def status() -> dict[str, Any]:

View File

@@ -71,8 +71,12 @@ def test_self_destruct_spawns_reaper_and_returns_fast(monkeypatch, tmp_path) ->
assert resp.json()["status"] == "self_destruct_scheduled" assert resp.json()["status"] == "self_destruct_scheduled"
assert len(spawned) == 1 assert len(spawned) == 1
assert spawned[0]["kw"].get("start_new_session") is True assert spawned[0]["kw"].get("start_new_session") is True
script_path = spawned[0]["args"][1] script_candidates = [
assert script_path.startswith("/tmp/decnet-reaper-") a for a in spawned[0]["args"]
if isinstance(a, str) and a.startswith("/tmp/decnet-reaper-")
]
assert len(script_candidates) == 1, spawned[0]["args"]
script_path = script_candidates[0]
# Reaper content sanity check — covers the paths the operator asked for. # Reaper content sanity check — covers the paths the operator asked for.
import pathlib import pathlib
body = pathlib.Path(script_path).read_text() body = pathlib.Path(script_path).read_text()