fix(updater): fall back to /proc scan when agent.pid is missing

If the agent was started outside the updater (manually, during dev,
or from a prior systemd unit), there is no agent.pid for _stop_agent
to target, so a successful code install leaves the old in-memory
agent process still serving requests. Scan /proc for any decnet agent
command and SIGTERM all matches so restart is reliable regardless of
how the agent was originally launched.
This commit is contained in:
2026-04-18 23:42:03 -04:00
parent 7765b36c50
commit ebeaf08a49
5 changed files with 140 additions and 34 deletions

View File

@@ -111,6 +111,16 @@ def _venv_python(release: pathlib.Path) -> pathlib.Path:
return release / ".venv" / "bin" / "python"
def _shared_venv(install_dir: pathlib.Path) -> pathlib.Path:
    """Return the path of the single, stable venv under ``install_dir``.

    Agents and updaters run out of this venv. Release slots ship source
    only; on promotion the release is installed here with
    ``pip install --force-reinstall --no-deps`` so that shebangs never
    dangle across a rotation.
    """
    venv_name = "venv"
    return install_dir.joinpath(venv_name)
# ------------------------------------------------------------------- public
def read_release(release: pathlib.Path) -> Release:
@@ -167,20 +177,29 @@ def extract_tarball(tarball_bytes: bytes, dest: pathlib.Path) -> None:
# ---------------------------------------------------------------- seams
def _run_pip(release: pathlib.Path) -> subprocess.CompletedProcess:
"""Create a venv in ``release/.venv`` and pip install -e . into it.
def _run_pip(
release: pathlib.Path,
install_dir: Optional[pathlib.Path] = None,
) -> subprocess.CompletedProcess:
"""pip install ``release`` into the shared venv at ``install_dir/venv``.
The shared venv is bootstrapped on first use. ``--force-reinstall
--no-deps`` replaces site-packages for the decnet package only; the
rest of the env stays cached across updates.
Monkeypatched in tests so the test suite never shells out.
"""
venv_dir = release / ".venv"
idir = install_dir or release.parent.parent # releases/<slot> -> install_dir
venv_dir = _shared_venv(idir)
if not venv_dir.exists():
subprocess.run( # nosec B603
[sys.executable, "-m", "venv", str(venv_dir)],
check=True, capture_output=True, text=True,
)
py = _venv_python(release)
py = venv_dir / "bin" / "python"
return subprocess.run( # nosec B603
[str(py), "-m", "pip", "install", "-e", str(release)],
[str(py), "-m", "pip", "install", "--force-reinstall", "--no-deps",
str(release)],
check=False, capture_output=True, text=True,
)
@@ -190,41 +209,97 @@ def _spawn_agent(install_dir: pathlib.Path) -> int:
Returns the new PID. Monkeypatched in tests.
"""
py = _venv_python(_current_symlink(install_dir).resolve())
decnet_bin = _shared_venv(install_dir) / "bin" / "decnet"
log_path = install_dir / "agent.spawn.log"
# cwd=install_dir so a persistent ``<install_dir>/.env.local`` gets
# picked up by decnet.env (which loads from CWD). The release slot
# itself is immutable across updates, so the env file cannot live
# inside it.
proc = subprocess.Popen( # nosec B603
[str(py), "-m", "decnet", "agent", "--daemon"],
[str(decnet_bin), "agent", "--daemon"],
start_new_session=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
cwd=str(install_dir),
stdout=open(log_path, "ab"), # noqa: SIM115
stderr=subprocess.STDOUT,
)
_pid_file(install_dir).write_text(str(proc.pid))
return proc.pid
def _stop_agent(install_dir: pathlib.Path, grace: float = AGENT_RESTART_GRACE_S) -> None:
"""SIGTERM the PID we spawned; SIGKILL if it doesn't exit in ``grace`` s."""
pid_file = _pid_file(install_dir)
if not pid_file.is_file():
return
try:
pid = int(pid_file.read_text().strip())
except (ValueError, OSError):
return
try:
os.kill(pid, signal.SIGTERM)
except ProcessLookupError:
return
deadline = time.monotonic() + grace
while time.monotonic() < deadline:
def _looks_like_agent(argv: list[bytes]) -> bool:
    """Return True when ``argv`` looks like a ``decnet agent`` command line.

    ``argv`` is the NUL-split content of ``/proc/<pid>/cmdline`` with empty
    fields removed. A match requires all of:

    * at least two arguments;
    * ``argv[0]`` is a Python interpreter (its basename contains
      ``python``, which covers ``python``, ``python3``, ``python3.11``...)
      or the console script invoked directly (path ends with ``/decnet``);
    * ``decnet`` appears somewhere in the command line and one argument is
      exactly ``agent``.
    """
    if len(argv) < 2:
        return False
    exe = argv[0]
    via_python = b"python" in os.path.basename(exe.rstrip(b"/"))
    if not (via_python or exe.endswith(b"/decnet")):
        return False
    return b"decnet" in b" ".join(argv) and b"agent" in argv


def _discover_agent_pids() -> list[int]:
    """Scan /proc for any running ``decnet agent`` process.

    Used as a fallback when agent.pid is missing (e.g., the agent was started
    by hand rather than by the updater) so an update still produces a clean
    restart instead of leaving the old in-memory code serving requests.

    Returns:
        PIDs of matching processes, excluding the calling process. Empty
        when nothing matches or when ``/proc`` cannot be listed.
    """
    try:
        # Snapshot the listing up front; a missing or unreadable /proc means
        # "nothing discovered" rather than a crash in the middle of an update.
        entries = list(pathlib.Path("/proc").iterdir())
    except OSError:
        return []

    self_pid = os.getpid()
    pids: list[int] = []
    for entry in entries:
        if not entry.name.isdigit():
            continue
        pid = int(entry.name)
        if pid == self_pid:
            continue
        try:
            # Signal 0 delivers nothing; it only checks that the process
            # exists and that we are allowed to signal it.
            os.kill(pid, 0)
            raw = (entry / "cmdline").read_bytes()
        except OSError:
            # Covers FileNotFoundError / ProcessLookupError (process exited
            # mid-scan) and PermissionError (not ours to signal or read).
            continue
        argv = [arg for arg in raw.split(b"\x00") if arg]
        if _looks_like_agent(argv):
            pids.append(pid)
    return pids
def _stop_agent(install_dir: pathlib.Path, grace: float = AGENT_RESTART_GRACE_S) -> None:
"""SIGTERM the agent and wait for it to exit; SIGKILL after ``grace`` s.
Prefers the PID recorded in ``agent.pid`` (processes we spawned) but
falls back to scanning /proc for any ``decnet agent`` so manually-started
agents are also restarted cleanly during an update.
"""
pids: list[int] = []
pid_file = _pid_file(install_dir)
if pid_file.is_file():
try:
pids.append(int(pid_file.read_text().strip()))
except (ValueError, OSError):
pass
for pid in _discover_agent_pids():
if pid not in pids:
pids.append(pid)
if not pids:
return
for pid in pids:
try:
os.kill(pid, signal.SIGTERM)
except ProcessLookupError:
return
time.sleep(0.2)
continue
deadline = time.monotonic() + grace
remaining = list(pids)
while remaining and time.monotonic() < deadline:
remaining = [p for p in remaining if _pid_alive(p)]
if remaining:
time.sleep(0.2)
for pid in remaining:
try:
os.kill(pid, signal.SIGKILL)
except ProcessLookupError:
pass
def _pid_alive(pid: int) -> bool:
try:
os.kill(pid, signal.SIGKILL)
os.kill(pid, 0)
return True
except ProcessLookupError:
pass
return False
def _probe_agent(
@@ -239,8 +314,10 @@ def _probe_agent(
ca = agent_dir / "ca.crt"
if not (worker_key.is_file() and worker_crt.is_file() and ca.is_file()):
return False, f"no mTLS bundle at {agent_dir}"
ctx = ssl.create_default_context(cafile=str(ca))
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
ctx.load_cert_chain(certfile=str(worker_crt), keyfile=str(worker_key))
ctx.load_verify_locations(cafile=str(ca))
ctx.verify_mode = ssl.CERT_REQUIRED
ctx.check_hostname = False
last = ""
@@ -407,7 +484,7 @@ def run_update_self(
_rotate(updater_install_dir)
_point_current_at(updater_install_dir, _active_dir(updater_install_dir))
argv = [str(_venv_python(_active_dir(updater_install_dir))), "-m", "decnet", "updater"] + sys.argv[1:]
argv = [str(_shared_venv(updater_install_dir) / "bin" / "decnet"), "updater"] + sys.argv[1:]
if exec_cb is not None:
exec_cb(argv) # tests stub this — we don't actually re-exec
return {"status": "self_update_queued", "argv": argv}