fix(agent): escape systemd cgroup when spawning self-destruct reaper
The reaper was being SIGTERM'd mid-rm because `start_new_session=True` only creates a new POSIX session — it does not escape decnet-agent.service's cgroup. When the reaper ran `systemctl stop decnet-agent`, systemd tore down the whole cgroup (reaper included) before `rm -rf /opt/decnet*` finished, leaving the install on disk. Spawn the reaper via `systemd-run --collect --unit decnet-reaper-<pid>` so it runs in its own transient service unit, outside the agent unit's cgroup. Falls back to a bare Popen on non-systemd hosts.
This commit is contained in:
@@ -152,6 +152,7 @@ async def self_destruct() -> None:
|
||||
install footprint. Returns immediately so the HTTP response can drain
|
||||
before the reaper starts deleting files out from under the agent."""
|
||||
import os
|
||||
import shutil
|
||||
import subprocess # nosec B404
|
||||
import tempfile
|
||||
|
||||
@@ -172,17 +173,39 @@ async def self_destruct() -> None:
|
||||
os.close(fd)
|
||||
os.chmod(path, 0o700) # nosec B103 — root-owned reaper, needs exec
|
||||
|
||||
# start_new_session detaches from the agent process group so the
|
||||
# reaper isn't killed when systemctl stop decnet-agent fires.
|
||||
# The reaper MUST run outside decnet-agent.service's cgroup — otherwise
|
||||
# `systemctl stop decnet-agent` SIGTERMs the whole cgroup (reaper included)
|
||||
# before rm -rf completes. `start_new_session=True` gets us a fresh POSIX
|
||||
# session but does NOT escape the systemd cgroup. So we prefer
|
||||
# `systemd-run` (launches the command in its own transient unit,
|
||||
# detached from the caller's service), falling back to a bare Popen if
|
||||
# systemd-run is unavailable (non-systemd host / container).
|
||||
systemd_run = shutil.which("systemd-run")
|
||||
if systemd_run:
|
||||
argv = [
|
||||
systemd_run,
|
||||
"--collect",
|
||||
"--unit", f"decnet-reaper-{os.getpid()}",
|
||||
"--description", "DECNET agent self-destruct reaper",
|
||||
"/bin/bash", path,
|
||||
]
|
||||
spawn_kwargs = {"start_new_session": True}
|
||||
else:
|
||||
argv = ["/bin/bash", path]
|
||||
spawn_kwargs = {"start_new_session": True}
|
||||
|
||||
subprocess.Popen( # nosec B603
|
||||
["/bin/bash", path],
|
||||
argv,
|
||||
stdin=subprocess.DEVNULL,
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
start_new_session=True,
|
||||
close_fds=True,
|
||||
**spawn_kwargs,
|
||||
)
|
||||
log.warning(
|
||||
"self_destruct: reaper spawned path=%s via=%s — agent will die in ~3s",
|
||||
path, "systemd-run" if systemd_run else "popen",
|
||||
)
|
||||
log.warning("self_destruct: reaper spawned path=%s — agent will die in ~3s", path)
|
||||
|
||||
|
||||
async def status() -> dict[str, Any]:
|
||||
|
||||
@@ -71,8 +71,12 @@ def test_self_destruct_spawns_reaper_and_returns_fast(monkeypatch, tmp_path) ->
|
||||
assert resp.json()["status"] == "self_destruct_scheduled"
|
||||
assert len(spawned) == 1
|
||||
assert spawned[0]["kw"].get("start_new_session") is True
|
||||
script_path = spawned[0]["args"][1]
|
||||
assert script_path.startswith("/tmp/decnet-reaper-")
|
||||
script_candidates = [
|
||||
a for a in spawned[0]["args"]
|
||||
if isinstance(a, str) and a.startswith("/tmp/decnet-reaper-")
|
||||
]
|
||||
assert len(script_candidates) == 1, spawned[0]["args"]
|
||||
script_path = script_candidates[0]
|
||||
# Reaper content sanity check — covers the paths the operator asked for.
|
||||
import pathlib
|
||||
body = pathlib.Path(script_path).read_text()
|
||||
|
||||
Reference in New Issue
Block a user