fix(updater): restart agent+forwarder+self via systemd on push

Three holes in the systemd integration:
1. _spawn_agent_via_systemd only restarted decnet-agent.service, leaving
   decnet-forwarder.service running the pre-update code (same /opt/decnet
   tree, stale import cache).
2. run_update_self used os.execv regardless of environment — the re-execed
   process kept the updater's existing cgroup/capability inheritance but
   systemd would notice MainPID change and mark the unit degraded.
3. No path to surface a failed forwarder restart (legacy enrollments have
   no forwarder unit).

Now: agent restart first, forwarder restart as best-effort (logged but
non-fatal so legacy workers still update), MainPID still read from the
agent unit. For update-self under systemd, spawn a detached sleep+
systemctl restart so the HTTP response flushes before the unit cycles.
This commit is contained in:
2026-04-19 18:23:10 -04:00
parent a0a241f65d
commit 43b92c7bd6
2 changed files with 89 additions and 4 deletions

View File

@@ -209,6 +209,8 @@ def _run_pip(
AGENT_SYSTEMD_UNIT = "decnet-agent.service"
FORWARDER_SYSTEMD_UNIT = "decnet-forwarder.service"
UPDATER_SYSTEMD_UNIT = "decnet-updater.service"
def _systemd_available() -> bool:
@@ -243,10 +245,21 @@ def _spawn_agent(install_dir: pathlib.Path) -> int:
def _spawn_agent_via_systemd(install_dir: pathlib.Path) -> int:
# Restart agent + forwarder together: both processes run out of the same
# /opt/decnet tree, so a code push that replaces the tree must cycle both
# or the forwarder keeps the pre-update code in memory. Forwarder restart
# is best-effort — a worker without the forwarder unit installed (e.g. a
# legacy enrollment) shouldn't abort the update.
subprocess.run( # nosec B603 B607
["systemctl", "restart", AGENT_SYSTEMD_UNIT],
check=True, capture_output=True, text=True,
)
fwd = subprocess.run( # nosec B603 B607
["systemctl", "restart", FORWARDER_SYSTEMD_UNIT],
check=False, capture_output=True, text=True,
)
if fwd.returncode != 0:
log.warning("forwarder restart failed (ignored): %s", fwd.stderr.strip())
pid_out = subprocess.run( # nosec B603 B607
["systemctl", "show", "--property=MainPID", "--value", AGENT_SYSTEMD_UNIT],
check=True, capture_output=True, text=True,
@@ -556,6 +569,19 @@ def run_update_self(
if exec_cb is not None:
exec_cb(argv) # tests stub this — we don't actually re-exec
return {"status": "self_update_queued", "argv": argv}
# Returns nothing on success (replaces the process image).
# Under systemd, hand the restart to the init system so the new process
# keeps its unit context (capabilities, cgroup, logging target) instead
# of inheriting whatever we had here. Spawn a detached sh that waits for
# this response to flush before issuing the restart — `systemctl restart`
# on our own unit would kill us mid-response and the caller would see a
# connection drop with no indication of success.
if _systemd_available():
subprocess.Popen( # nosec B603 B607
["sh", "-c", f"sleep 1 && systemctl restart {UPDATER_SYSTEMD_UNIT}"],
start_new_session=True,
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
)
return {"status": "self_update_queued", "via": "systemd"}
# Off-systemd fallback: replace the process image directly.
os.execv(argv[0], argv) # nosec B606 - pragma: no cover
return {"status": "self_update_queued"} # pragma: no cover

View File

@@ -279,6 +279,36 @@ def test_update_self_rotates_and_calls_exec_cb(
assert "updater" in seen_argv[0]
def test_update_self_under_systemd_defers_to_systemctl(
monkeypatch: pytest.MonkeyPatch,
install_dir: pathlib.Path,
) -> None:
"""Under systemd, update-self must NOT os.execv — it hands the restart
to systemd so the new process inherits the unit context. A detached
``systemctl restart decnet-updater.service`` is scheduled after a short
sleep so the HTTP response can flush before the unit cycles."""
active = install_dir / "releases" / "active"
active.mkdir()
(active / "marker").write_text("old-updater")
monkeypatch.setattr(ex, "_run_pip", lambda release: _PipOK())
monkeypatch.setattr(ex, "_systemd_available", lambda: True)
popen_calls: list[list[str]] = []
class _FakePopen:
def __init__(self, cmd, **kwargs):
popen_calls.append(cmd)
monkeypatch.setattr(ex.subprocess, "Popen", _FakePopen)
monkeypatch.setattr(ex.os, "execv", lambda *a, **k: pytest.fail("execv taken under systemd"))
tb = _make_tarball({"marker": "new-updater"})
result = ex.run_update_self(tb, sha="USHA", updater_install_dir=install_dir)
assert result == {"status": "self_update_queued", "via": "systemd"}
assert len(popen_calls) == 1
sh_cmd = popen_calls[0]
assert sh_cmd[:2] == ["sh", "-c"]
assert "systemctl restart decnet-updater.service" in sh_cmd[2]
def test_update_self_pip_failure_leaves_active_intact(
monkeypatch: pytest.MonkeyPatch,
install_dir: pathlib.Path,
@@ -374,8 +404,10 @@ def test_spawn_agent_via_systemd_records_main_pid(
calls: list[list[str]] = []
class _Out:
def __init__(self, stdout: str = "") -> None:
def __init__(self, stdout: str = "", returncode: int = 0, stderr: str = "") -> None:
self.stdout = stdout
self.returncode = returncode
self.stderr = stderr
def fake_run(cmd, **kwargs): # type: ignore[no-untyped-def]
calls.append(cmd)
@@ -387,5 +419,32 @@ def test_spawn_agent_via_systemd_records_main_pid(
pid = ex._spawn_agent_via_systemd(install_dir)
assert pid == 4711
assert (install_dir / "agent.pid").read_text() == "4711"
assert calls[0][:2] == ["systemctl", "restart"]
assert calls[1][:2] == ["systemctl", "show"]
# Agent restart, forwarder restart, then MainPID lookup on the agent.
assert calls[0] == ["systemctl", "restart", ex.AGENT_SYSTEMD_UNIT]
assert calls[1] == ["systemctl", "restart", ex.FORWARDER_SYSTEMD_UNIT]
assert calls[2][:2] == ["systemctl", "show"]
assert ex.AGENT_SYSTEMD_UNIT in calls[2]
def test_spawn_agent_via_systemd_tolerates_missing_forwarder_unit(
monkeypatch: pytest.MonkeyPatch,
install_dir: pathlib.Path,
) -> None:
"""Legacy enrollments lack decnet-forwarder.service — restart fails and
must not abort the update."""
class _Out:
def __init__(self, stdout: str = "", returncode: int = 0, stderr: str = "") -> None:
self.stdout = stdout
self.returncode = returncode
self.stderr = stderr
def fake_run(cmd, **kwargs): # type: ignore[no-untyped-def]
if "restart" in cmd and ex.FORWARDER_SYSTEMD_UNIT in cmd:
return _Out(returncode=5, stderr="Unit not found.")
if "show" in cmd:
return _Out("4711\n")
return _Out("")
monkeypatch.setattr(ex.subprocess, "run", fake_run)
pid = ex._spawn_agent_via_systemd(install_dir)
assert pid == 4711