From 43b92c7bd6092b5b8d41b81373882c3c950c45ea Mon Sep 17 00:00:00 2001 From: anti Date: Sun, 19 Apr 2026 18:23:10 -0400 Subject: [PATCH] fix(updater): restart agent+forwarder+self via systemd on push MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three holes in the systemd integration: 1. _spawn_agent_via_systemd only restarted decnet-agent.service, leaving decnet-forwarder.service running the pre-update code (same /opt/decnet tree, stale import cache). 2. run_update_self used os.execv regardless of environment — the re-execed process kept the updater's existing cgroup/capability inheritance but systemd would notice MainPID change and mark the unit degraded. 3. No path to surface a failed forwarder restart (legacy enrollments have no forwarder unit). Now: agent restart first, forwarder restart as best-effort (logged but non-fatal so legacy workers still update), MainPID still read from the agent unit. For update-self under systemd, spawn a detached sleep+ systemctl restart so the HTTP response flushes before the unit cycles. --- decnet/updater/executor.py | 28 ++++++++++- tests/updater/test_updater_executor.py | 65 ++++++++++++++++++++++++-- 2 files changed, 89 insertions(+), 4 deletions(-) diff --git a/decnet/updater/executor.py b/decnet/updater/executor.py index 067b0a6..fb83596 100644 --- a/decnet/updater/executor.py +++ b/decnet/updater/executor.py @@ -209,6 +209,8 @@ def _run_pip( AGENT_SYSTEMD_UNIT = "decnet-agent.service" +FORWARDER_SYSTEMD_UNIT = "decnet-forwarder.service" +UPDATER_SYSTEMD_UNIT = "decnet-updater.service" def _systemd_available() -> bool: @@ -243,10 +245,21 @@ def _spawn_agent(install_dir: pathlib.Path) -> int: def _spawn_agent_via_systemd(install_dir: pathlib.Path) -> int: + # Restart agent + forwarder together: both processes run out of the same + # /opt/decnet tree, so a code push that replaces the tree must cycle both + # or the forwarder keeps the pre-update code in memory. Forwarder restart + # is best-effort — a worker without the forwarder unit installed (e.g. a + # legacy enrollment) shouldn't abort the update. subprocess.run( # nosec B603 B607 ["systemctl", "restart", AGENT_SYSTEMD_UNIT], check=True, capture_output=True, text=True, ) + fwd = subprocess.run( # nosec B603 B607 + ["systemctl", "restart", FORWARDER_SYSTEMD_UNIT], + check=False, capture_output=True, text=True, + ) + if fwd.returncode != 0: + log.warning("forwarder restart failed (ignored): %s", fwd.stderr.strip()) pid_out = subprocess.run( # nosec B603 B607 ["systemctl", "show", "--property=MainPID", "--value", AGENT_SYSTEMD_UNIT], check=True, capture_output=True, text=True, @@ -556,6 +569,19 @@ def run_update_self( if exec_cb is not None: exec_cb(argv) # tests stub this — we don't actually re-exec return {"status": "self_update_queued", "argv": argv} - # Returns nothing on success (replaces the process image). + # Under systemd, hand the restart to the init system so the new process + # keeps its unit context (capabilities, cgroup, logging target) instead + # of inheriting whatever we had here. Spawn a detached sh that waits for + # this response to flush before issuing the restart — `systemctl restart` + # on our own unit would kill us mid-response and the caller would see a + # connection drop with no indication of success. + if _systemd_available(): + subprocess.Popen( # nosec B603 B607 + ["sh", "-c", f"sleep 1 && systemctl restart {UPDATER_SYSTEMD_UNIT}"], + start_new_session=True, + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) + return {"status": "self_update_queued", "via": "systemd"} + # Off-systemd fallback: replace the process image directly. os.execv(argv[0], argv) # nosec B606 - pragma: no cover return {"status": "self_update_queued"} # pragma: no cover diff --git a/tests/updater/test_updater_executor.py b/tests/updater/test_updater_executor.py index cfdaf0e..eff4835 100644 --- a/tests/updater/test_updater_executor.py +++ b/tests/updater/test_updater_executor.py @@ -279,6 +279,36 @@ def test_update_self_rotates_and_calls_exec_cb( assert "updater" in seen_argv[0] +def test_update_self_under_systemd_defers_to_systemctl( + monkeypatch: pytest.MonkeyPatch, + install_dir: pathlib.Path, +) -> None: + """Under systemd, update-self must NOT os.execv — it hands the restart + to systemd so the new process inherits the unit context. A detached + ``systemctl restart decnet-updater.service`` is scheduled after a short + sleep so the HTTP response can flush before the unit cycles.""" + active = install_dir / "releases" / "active" + active.mkdir() + (active / "marker").write_text("old-updater") + monkeypatch.setattr(ex, "_run_pip", lambda release: _PipOK()) + monkeypatch.setattr(ex, "_systemd_available", lambda: True) + + popen_calls: list[list[str]] = [] + class _FakePopen: + def __init__(self, cmd, **kwargs): + popen_calls.append(cmd) + monkeypatch.setattr(ex.subprocess, "Popen", _FakePopen) + monkeypatch.setattr(ex.os, "execv", lambda *a, **k: pytest.fail("execv taken under systemd")) + + tb = _make_tarball({"marker": "new-updater"}) + result = ex.run_update_self(tb, sha="USHA", updater_install_dir=install_dir) + assert result == {"status": "self_update_queued", "via": "systemd"} + assert len(popen_calls) == 1 + sh_cmd = popen_calls[0] + assert sh_cmd[:2] == ["sh", "-c"] + assert "systemctl restart decnet-updater.service" in sh_cmd[2] + + def test_update_self_pip_failure_leaves_active_intact( monkeypatch: pytest.MonkeyPatch, install_dir: pathlib.Path, @@ -374,8 +404,10 @@ def test_spawn_agent_via_systemd_records_main_pid( calls: list[list[str]] = [] class _Out: - def __init__(self, stdout: str = "") -> None: + def __init__(self, stdout: str = "", returncode: int = 0, stderr: str = "") -> None: self.stdout = stdout + self.returncode = returncode + self.stderr = stderr def fake_run(cmd, **kwargs): # type: ignore[no-untyped-def] calls.append(cmd) @@ -387,5 +419,32 @@ def test_spawn_agent_via_systemd_records_main_pid( pid = ex._spawn_agent_via_systemd(install_dir) assert pid == 4711 assert (install_dir / "agent.pid").read_text() == "4711" - assert calls[0][:2] == ["systemctl", "restart"] - assert calls[1][:2] == ["systemctl", "show"] + # Agent restart, forwarder restart, then MainPID lookup on the agent. + assert calls[0] == ["systemctl", "restart", ex.AGENT_SYSTEMD_UNIT] + assert calls[1] == ["systemctl", "restart", ex.FORWARDER_SYSTEMD_UNIT] + assert calls[2][:2] == ["systemctl", "show"] + assert ex.AGENT_SYSTEMD_UNIT in calls[2] + + +def test_spawn_agent_via_systemd_tolerates_missing_forwarder_unit( + monkeypatch: pytest.MonkeyPatch, + install_dir: pathlib.Path, +) -> None: + """Legacy enrollments lack decnet-forwarder.service — restart fails and + must not abort the update.""" + class _Out: + def __init__(self, stdout: str = "", returncode: int = 0, stderr: str = "") -> None: + self.stdout = stdout + self.returncode = returncode + self.stderr = stderr + + def fake_run(cmd, **kwargs): # type: ignore[no-untyped-def] + if "restart" in cmd and ex.FORWARDER_SYSTEMD_UNIT in cmd: + return _Out(returncode=5, stderr="Unit not found.") + if "show" in cmd: + return _Out("4711\n") + return _Out("") + + monkeypatch.setattr(ex.subprocess, "run", fake_run) + pid = ex._spawn_agent_via_systemd(install_dir) + assert pid == 4711