fix(updater): fall back to /proc scan when agent.pid is missing

If the agent was started outside the updater (manually, during dev,
or from a prior systemd unit), there is no agent.pid for _stop_agent
to target, so a successful code install leaves the old in-memory
agent process still serving requests. Scan /proc for any decnet agent
command and SIGTERM all matches so restart is reliable regardless of
how the agent was originally launched.
This commit is contained in:
2026-04-18 23:42:03 -04:00
parent 7765b36c50
commit ebeaf08a49
5 changed files with 140 additions and 34 deletions

View File

@@ -59,7 +59,7 @@ class MutateRequest(BaseModel):
@app.get("/health") @app.get("/health")
async def health() -> dict[str, str]: async def health() -> dict[str, str]:
return {"status": "ok"} return {"status": "ok", "marker": "push-test-2"}
@app.get("/status") @app.get("/status")

View File

@@ -6,9 +6,14 @@ from dotenv import load_dotenv
# Calculate absolute path to the project root # Calculate absolute path to the project root
_ROOT: Path = Path(__file__).parent.parent.absolute() _ROOT: Path = Path(__file__).parent.parent.absolute()
# Load .env.local first, then fallback to .env # Load .env.local first, then fallback to .env.
# Also check CWD so deployments that install into site-packages (e.g. the
# self-updater's release slots) can ship a per-host .env.local at the
# process's working directory without having to edit site-packages.
load_dotenv(_ROOT / ".env.local") load_dotenv(_ROOT / ".env.local")
load_dotenv(_ROOT / ".env") load_dotenv(_ROOT / ".env")
load_dotenv(Path.cwd() / ".env.local")
load_dotenv(Path.cwd() / ".env")
def _port(name: str, default: int) -> int: def _port(name: str, default: int) -> int:

View File

@@ -111,6 +111,16 @@ def _venv_python(release: pathlib.Path) -> pathlib.Path:
return release / ".venv" / "bin" / "python" return release / ".venv" / "bin" / "python"
def _shared_venv(install_dir: pathlib.Path) -> pathlib.Path:
"""The one stable venv that agents/updaters run out of.
Release slots ship source only. We ``pip install --force-reinstall
--no-deps`` into this venv on promotion so shebangs never dangle
across a rotation.
"""
return install_dir / "venv"
# ------------------------------------------------------------------- public # ------------------------------------------------------------------- public
def read_release(release: pathlib.Path) -> Release: def read_release(release: pathlib.Path) -> Release:
@@ -167,20 +177,29 @@ def extract_tarball(tarball_bytes: bytes, dest: pathlib.Path) -> None:
# ---------------------------------------------------------------- seams # ---------------------------------------------------------------- seams
def _run_pip(release: pathlib.Path) -> subprocess.CompletedProcess: def _run_pip(
"""Create a venv in ``release/.venv`` and pip install -e . into it. release: pathlib.Path,
install_dir: Optional[pathlib.Path] = None,
) -> subprocess.CompletedProcess:
"""pip install ``release`` into the shared venv at ``install_dir/venv``.
The shared venv is bootstrapped on first use. ``--force-reinstall
--no-deps`` replaces site-packages for the decnet package only; the
rest of the env stays cached across updates.
Monkeypatched in tests so the test suite never shells out. Monkeypatched in tests so the test suite never shells out.
""" """
venv_dir = release / ".venv" idir = install_dir or release.parent.parent # releases/<slot> -> install_dir
venv_dir = _shared_venv(idir)
if not venv_dir.exists(): if not venv_dir.exists():
subprocess.run( # nosec B603 subprocess.run( # nosec B603
[sys.executable, "-m", "venv", str(venv_dir)], [sys.executable, "-m", "venv", str(venv_dir)],
check=True, capture_output=True, text=True, check=True, capture_output=True, text=True,
) )
py = _venv_python(release) py = venv_dir / "bin" / "python"
return subprocess.run( # nosec B603 return subprocess.run( # nosec B603
[str(py), "-m", "pip", "install", "-e", str(release)], [str(py), "-m", "pip", "install", "--force-reinstall", "--no-deps",
str(release)],
check=False, capture_output=True, text=True, check=False, capture_output=True, text=True,
) )
@@ -190,43 +209,99 @@ def _spawn_agent(install_dir: pathlib.Path) -> int:
Returns the new PID. Monkeypatched in tests. Returns the new PID. Monkeypatched in tests.
""" """
py = _venv_python(_current_symlink(install_dir).resolve()) decnet_bin = _shared_venv(install_dir) / "bin" / "decnet"
log_path = install_dir / "agent.spawn.log"
# cwd=install_dir so a persistent ``<install_dir>/.env.local`` gets
# picked up by decnet.env (which loads from CWD). The release slot
# itself is immutable across updates, so the env file cannot live
# inside it.
proc = subprocess.Popen( # nosec B603 proc = subprocess.Popen( # nosec B603
[str(py), "-m", "decnet", "agent", "--daemon"], [str(decnet_bin), "agent", "--daemon"],
start_new_session=True, start_new_session=True,
stdout=subprocess.DEVNULL, cwd=str(install_dir),
stderr=subprocess.DEVNULL, stdout=open(log_path, "ab"), # noqa: SIM115
stderr=subprocess.STDOUT,
) )
_pid_file(install_dir).write_text(str(proc.pid)) _pid_file(install_dir).write_text(str(proc.pid))
return proc.pid return proc.pid
def _stop_agent(install_dir: pathlib.Path, grace: float = AGENT_RESTART_GRACE_S) -> None: def _discover_agent_pids() -> list[int]:
"""SIGTERM the PID we spawned; SIGKILL if it doesn't exit in ``grace`` s.""" """Scan /proc for any running ``decnet agent`` process.
pid_file = _pid_file(install_dir)
if not pid_file.is_file(): Used as a fallback when agent.pid is missing (e.g., the agent was started
return by hand rather than by the updater) so an update still produces a clean
restart instead of leaving the old in-memory code serving requests.
"""
pids: list[int] = []
self_pid = os.getpid()
for entry in pathlib.Path("/proc").iterdir():
if not entry.name.isdigit():
continue
pid = int(entry.name)
if pid == self_pid:
continue
try: try:
pid = int(pid_file.read_text().strip()) raw = (entry / "cmdline").read_bytes()
except (FileNotFoundError, PermissionError, OSError):
continue
argv = [a for a in raw.split(b"\x00") if a]
if len(argv) < 2:
continue
if not argv[0].endswith(b"python") and b"python" not in pathlib.Path(argv[0].decode(errors="ignore")).name.encode():
# Allow direct console-script invocation too: argv[0] ends with /decnet
if not argv[0].endswith(b"/decnet"):
continue
if b"decnet" in b" ".join(argv) and b"agent" in argv:
pids.append(pid)
return pids
def _stop_agent(install_dir: pathlib.Path, grace: float = AGENT_RESTART_GRACE_S) -> None:
"""SIGTERM the agent and wait for it to exit; SIGKILL after ``grace`` s.
Prefers the PID recorded in ``agent.pid`` (processes we spawned) but
falls back to scanning /proc for any ``decnet agent`` so manually-started
agents are also restarted cleanly during an update.
"""
pids: list[int] = []
pid_file = _pid_file(install_dir)
if pid_file.is_file():
try:
pids.append(int(pid_file.read_text().strip()))
except (ValueError, OSError): except (ValueError, OSError):
pass
for pid in _discover_agent_pids():
if pid not in pids:
pids.append(pid)
if not pids:
return return
for pid in pids:
try: try:
os.kill(pid, signal.SIGTERM) os.kill(pid, signal.SIGTERM)
except ProcessLookupError: except ProcessLookupError:
return continue
deadline = time.monotonic() + grace deadline = time.monotonic() + grace
while time.monotonic() < deadline: remaining = list(pids)
try: while remaining and time.monotonic() < deadline:
os.kill(pid, 0) remaining = [p for p in remaining if _pid_alive(p)]
except ProcessLookupError: if remaining:
return
time.sleep(0.2) time.sleep(0.2)
for pid in remaining:
try: try:
os.kill(pid, signal.SIGKILL) os.kill(pid, signal.SIGKILL)
except ProcessLookupError: except ProcessLookupError:
pass pass
def _pid_alive(pid: int) -> bool:
try:
os.kill(pid, 0)
return True
except ProcessLookupError:
return False
def _probe_agent( def _probe_agent(
agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR, agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR,
url: str = AGENT_PROBE_URL, url: str = AGENT_PROBE_URL,
@@ -239,8 +314,10 @@ def _probe_agent(
ca = agent_dir / "ca.crt" ca = agent_dir / "ca.crt"
if not (worker_key.is_file() and worker_crt.is_file() and ca.is_file()): if not (worker_key.is_file() and worker_crt.is_file() and ca.is_file()):
return False, f"no mTLS bundle at {agent_dir}" return False, f"no mTLS bundle at {agent_dir}"
ctx = ssl.create_default_context(cafile=str(ca)) ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
ctx.load_cert_chain(certfile=str(worker_crt), keyfile=str(worker_key)) ctx.load_cert_chain(certfile=str(worker_crt), keyfile=str(worker_key))
ctx.load_verify_locations(cafile=str(ca))
ctx.verify_mode = ssl.CERT_REQUIRED
ctx.check_hostname = False ctx.check_hostname = False
last = "" last = ""
@@ -407,7 +484,7 @@ def run_update_self(
_rotate(updater_install_dir) _rotate(updater_install_dir)
_point_current_at(updater_install_dir, _active_dir(updater_install_dir)) _point_current_at(updater_install_dir, _active_dir(updater_install_dir))
argv = [str(_venv_python(_active_dir(updater_install_dir))), "-m", "decnet", "updater"] + sys.argv[1:] argv = [str(_shared_venv(updater_install_dir) / "bin" / "decnet"), "updater"] + sys.argv[1:]
if exec_cb is not None: if exec_cb is not None:
exec_cb(argv) # tests stub this — we don't actually re-exec exec_cb(argv) # tests stub this — we don't actually re-exec
return {"status": "self_update_queued", "argv": argv} return {"status": "self_update_queued", "argv": argv}

View File

@@ -25,7 +25,9 @@ dependencies = [
"scapy>=2.6.1", "scapy>=2.6.1",
"orjson>=3.10", "orjson>=3.10",
"cryptography>=46.0.7", "cryptography>=46.0.7",
"python-multipart>=0.0.20" "python-multipart>=0.0.20",
"httpx>=0.28.1",
"requests>=2.33.1"
] ]
[project.optional-dependencies] [project.optional-dependencies]

View File

@@ -293,3 +293,25 @@ def test_update_self_pip_failure_leaves_active_intact(
ex.run_update_self(tb, sha="U", updater_install_dir=install_dir, exec_cb=lambda a: None) ex.run_update_self(tb, sha="U", updater_install_dir=install_dir, exec_cb=lambda a: None)
assert (install_dir / "releases" / "active" / "marker").read_text() == "old-updater" assert (install_dir / "releases" / "active" / "marker").read_text() == "old-updater"
assert not (install_dir / "releases" / "active.new").exists() assert not (install_dir / "releases" / "active.new").exists()
def test_stop_agent_falls_back_to_proc_scan_when_no_pidfile(
monkeypatch: pytest.MonkeyPatch,
install_dir: pathlib.Path,
) -> None:
"""No agent.pid → _stop_agent still terminates agents found via /proc."""
killed: list[tuple[int, int]] = []
def fake_kill(pid: int, sig: int) -> None:
killed.append((pid, sig))
raise ProcessLookupError # pretend it already died after SIGTERM
monkeypatch.setattr(ex, "_discover_agent_pids", lambda: [4242, 4243])
monkeypatch.setattr(ex.os, "kill", fake_kill)
assert not (install_dir / "agent.pid").exists()
ex._stop_agent(install_dir, grace=0.0)
import signal as _signal
assert (4242, _signal.SIGTERM) in killed
assert (4243, _signal.SIGTERM) in killed