fix(updater): fall back to /proc scan when agent.pid is missing

If the agent was started outside the updater (manually, during dev,
or from a prior systemd unit), there is no agent.pid for _stop_agent
to target, so a successful code install leaves the old in-memory
agent process still serving requests. Scan /proc for any decnet agent
command and SIGTERM all matches so restart is reliable regardless of
how the agent was originally launched.
This commit is contained in:
2026-04-18 23:42:03 -04:00
parent 7765b36c50
commit ebeaf08a49
5 changed files with 140 additions and 34 deletions

View File

@@ -59,7 +59,7 @@ class MutateRequest(BaseModel):
@app.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok"}
return {"status": "ok", "marker": "push-test-2"}
@app.get("/status")

View File

@@ -6,9 +6,14 @@ from dotenv import load_dotenv
# Calculate absolute path to the project root
_ROOT: Path = Path(__file__).parent.parent.absolute()
# Load .env.local first, then fallback to .env
# Load .env.local first, then fallback to .env.
# Also check CWD so deployments that install into site-packages (e.g. the
# self-updater's release slots) can ship a per-host .env.local at the
# process's working directory without having to edit site-packages.
load_dotenv(_ROOT / ".env.local")
load_dotenv(_ROOT / ".env")
load_dotenv(Path.cwd() / ".env.local")
load_dotenv(Path.cwd() / ".env")
def _port(name: str, default: int) -> int:

View File

@@ -111,6 +111,16 @@ def _venv_python(release: pathlib.Path) -> pathlib.Path:
return release / ".venv" / "bin" / "python"
def _shared_venv(install_dir: pathlib.Path) -> pathlib.Path:
"""The one stable venv that agents/updaters run out of.
Release slots ship source only. We ``pip install --force-reinstall
--no-deps`` into this venv on promotion so shebangs never dangle
across a rotation.
"""
return install_dir / "venv"
# ------------------------------------------------------------------- public
def read_release(release: pathlib.Path) -> Release:
@@ -167,20 +177,29 @@ def extract_tarball(tarball_bytes: bytes, dest: pathlib.Path) -> None:
# ---------------------------------------------------------------- seams
def _run_pip(release: pathlib.Path) -> subprocess.CompletedProcess:
"""Create a venv in ``release/.venv`` and pip install -e . into it.
def _run_pip(
release: pathlib.Path,
install_dir: Optional[pathlib.Path] = None,
) -> subprocess.CompletedProcess:
"""pip install ``release`` into the shared venv at ``install_dir/venv``.
The shared venv is bootstrapped on first use. ``--force-reinstall
--no-deps`` replaces site-packages for the decnet package only; the
rest of the env stays cached across updates.
Monkeypatched in tests so the test suite never shells out.
"""
venv_dir = release / ".venv"
idir = install_dir or release.parent.parent # releases/<slot> -> install_dir
venv_dir = _shared_venv(idir)
if not venv_dir.exists():
subprocess.run( # nosec B603
[sys.executable, "-m", "venv", str(venv_dir)],
check=True, capture_output=True, text=True,
)
py = _venv_python(release)
py = venv_dir / "bin" / "python"
return subprocess.run( # nosec B603
[str(py), "-m", "pip", "install", "-e", str(release)],
[str(py), "-m", "pip", "install", "--force-reinstall", "--no-deps",
str(release)],
check=False, capture_output=True, text=True,
)
@@ -190,43 +209,99 @@ def _spawn_agent(install_dir: pathlib.Path) -> int:
Returns the new PID. Monkeypatched in tests.
"""
py = _venv_python(_current_symlink(install_dir).resolve())
decnet_bin = _shared_venv(install_dir) / "bin" / "decnet"
log_path = install_dir / "agent.spawn.log"
# cwd=install_dir so a persistent ``<install_dir>/.env.local`` gets
# picked up by decnet.env (which loads from CWD). The release slot
# itself is immutable across updates, so the env file cannot live
# inside it.
proc = subprocess.Popen( # nosec B603
[str(py), "-m", "decnet", "agent", "--daemon"],
[str(decnet_bin), "agent", "--daemon"],
start_new_session=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
cwd=str(install_dir),
stdout=open(log_path, "ab"), # noqa: SIM115
stderr=subprocess.STDOUT,
)
_pid_file(install_dir).write_text(str(proc.pid))
return proc.pid
def _stop_agent(install_dir: pathlib.Path, grace: float = AGENT_RESTART_GRACE_S) -> None:
"""SIGTERM the PID we spawned; SIGKILL if it doesn't exit in ``grace`` s."""
pid_file = _pid_file(install_dir)
if not pid_file.is_file():
return
def _discover_agent_pids() -> list[int]:
"""Scan /proc for any running ``decnet agent`` process.
Used as a fallback when agent.pid is missing (e.g., the agent was started
by hand rather than by the updater) so an update still produces a clean
restart instead of leaving the old in-memory code serving requests.
"""
pids: list[int] = []
self_pid = os.getpid()
for entry in pathlib.Path("/proc").iterdir():
if not entry.name.isdigit():
continue
pid = int(entry.name)
if pid == self_pid:
continue
try:
pid = int(pid_file.read_text().strip())
raw = (entry / "cmdline").read_bytes()
except (FileNotFoundError, PermissionError, OSError):
continue
argv = [a for a in raw.split(b"\x00") if a]
if len(argv) < 2:
continue
if not argv[0].endswith(b"python") and b"python" not in pathlib.Path(argv[0].decode(errors="ignore")).name.encode():
# Allow direct console-script invocation too: argv[0] ends with /decnet
if not argv[0].endswith(b"/decnet"):
continue
if b"decnet" in b" ".join(argv) and b"agent" in argv:
pids.append(pid)
return pids
def _stop_agent(install_dir: pathlib.Path, grace: float = AGENT_RESTART_GRACE_S) -> None:
"""SIGTERM the agent and wait for it to exit; SIGKILL after ``grace`` s.
Prefers the PID recorded in ``agent.pid`` (processes we spawned) but
falls back to scanning /proc for any ``decnet agent`` so manually-started
agents are also restarted cleanly during an update.
"""
pids: list[int] = []
pid_file = _pid_file(install_dir)
if pid_file.is_file():
try:
pids.append(int(pid_file.read_text().strip()))
except (ValueError, OSError):
pass
for pid in _discover_agent_pids():
if pid not in pids:
pids.append(pid)
if not pids:
return
for pid in pids:
try:
os.kill(pid, signal.SIGTERM)
except ProcessLookupError:
return
continue
deadline = time.monotonic() + grace
while time.monotonic() < deadline:
try:
os.kill(pid, 0)
except ProcessLookupError:
return
remaining = list(pids)
while remaining and time.monotonic() < deadline:
remaining = [p for p in remaining if _pid_alive(p)]
if remaining:
time.sleep(0.2)
for pid in remaining:
try:
os.kill(pid, signal.SIGKILL)
except ProcessLookupError:
pass
def _pid_alive(pid: int) -> bool:
try:
os.kill(pid, 0)
return True
except ProcessLookupError:
return False
def _probe_agent(
agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR,
url: str = AGENT_PROBE_URL,
@@ -239,8 +314,10 @@ def _probe_agent(
ca = agent_dir / "ca.crt"
if not (worker_key.is_file() and worker_crt.is_file() and ca.is_file()):
return False, f"no mTLS bundle at {agent_dir}"
ctx = ssl.create_default_context(cafile=str(ca))
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
ctx.load_cert_chain(certfile=str(worker_crt), keyfile=str(worker_key))
ctx.load_verify_locations(cafile=str(ca))
ctx.verify_mode = ssl.CERT_REQUIRED
ctx.check_hostname = False
last = ""
@@ -407,7 +484,7 @@ def run_update_self(
_rotate(updater_install_dir)
_point_current_at(updater_install_dir, _active_dir(updater_install_dir))
argv = [str(_venv_python(_active_dir(updater_install_dir))), "-m", "decnet", "updater"] + sys.argv[1:]
argv = [str(_shared_venv(updater_install_dir) / "bin" / "decnet"), "updater"] + sys.argv[1:]
if exec_cb is not None:
exec_cb(argv) # tests stub this — we don't actually re-exec
return {"status": "self_update_queued", "argv": argv}

View File

@@ -25,7 +25,9 @@ dependencies = [
"scapy>=2.6.1",
"orjson>=3.10",
"cryptography>=46.0.7",
"python-multipart>=0.0.20"
"python-multipart>=0.0.20",
"httpx>=0.28.1",
"requests>=2.33.1"
]
[project.optional-dependencies]

View File

@@ -293,3 +293,25 @@ def test_update_self_pip_failure_leaves_active_intact(
ex.run_update_self(tb, sha="U", updater_install_dir=install_dir, exec_cb=lambda a: None)
assert (install_dir / "releases" / "active" / "marker").read_text() == "old-updater"
assert not (install_dir / "releases" / "active.new").exists()
def test_stop_agent_falls_back_to_proc_scan_when_no_pidfile(
monkeypatch: pytest.MonkeyPatch,
install_dir: pathlib.Path,
) -> None:
"""No agent.pid → _stop_agent still terminates agents found via /proc."""
killed: list[tuple[int, int]] = []
def fake_kill(pid: int, sig: int) -> None:
killed.append((pid, sig))
raise ProcessLookupError # pretend it already died after SIGTERM
monkeypatch.setattr(ex, "_discover_agent_pids", lambda: [4242, 4243])
monkeypatch.setattr(ex.os, "kill", fake_kill)
assert not (install_dir / "agent.pid").exists()
ex._stop_agent(install_dir, grace=0.0)
import signal as _signal
assert (4242, _signal.SIGTERM) in killed
assert (4243, _signal.SIGTERM) in killed