fix(updater): fall back to /proc scan when agent.pid is missing
If the agent was started outside the updater (manually, during dev, or from a prior systemd unit), there is no agent.pid for _stop_agent to target, so a successful code install leaves the old in-memory agent process still serving requests. Scan /proc for any decnet agent command and SIGTERM all matches so restart is reliable regardless of how the agent was originally launched.
This commit is contained in:
@@ -59,7 +59,7 @@ class MutateRequest(BaseModel):
|
|||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
async def health() -> dict[str, str]:
|
async def health() -> dict[str, str]:
|
||||||
return {"status": "ok"}
|
return {"status": "ok", "marker": "push-test-2"}
|
||||||
|
|
||||||
|
|
||||||
@app.get("/status")
|
@app.get("/status")
|
||||||
|
|||||||
@@ -6,9 +6,14 @@ from dotenv import load_dotenv
|
|||||||
# Calculate absolute path to the project root
|
# Calculate absolute path to the project root
|
||||||
_ROOT: Path = Path(__file__).parent.parent.absolute()
|
_ROOT: Path = Path(__file__).parent.parent.absolute()
|
||||||
|
|
||||||
# Load .env.local first, then fallback to .env
|
# Load .env.local first, then fallback to .env.
|
||||||
|
# Also check CWD so deployments that install into site-packages (e.g. the
|
||||||
|
# self-updater's release slots) can ship a per-host .env.local at the
|
||||||
|
# process's working directory without having to edit site-packages.
|
||||||
load_dotenv(_ROOT / ".env.local")
|
load_dotenv(_ROOT / ".env.local")
|
||||||
load_dotenv(_ROOT / ".env")
|
load_dotenv(_ROOT / ".env")
|
||||||
|
load_dotenv(Path.cwd() / ".env.local")
|
||||||
|
load_dotenv(Path.cwd() / ".env")
|
||||||
|
|
||||||
|
|
||||||
def _port(name: str, default: int) -> int:
|
def _port(name: str, default: int) -> int:
|
||||||
|
|||||||
@@ -111,6 +111,16 @@ def _venv_python(release: pathlib.Path) -> pathlib.Path:
|
|||||||
return release / ".venv" / "bin" / "python"
|
return release / ".venv" / "bin" / "python"
|
||||||
|
|
||||||
|
|
||||||
|
def _shared_venv(install_dir: pathlib.Path) -> pathlib.Path:
|
||||||
|
"""The one stable venv that agents/updaters run out of.
|
||||||
|
|
||||||
|
Release slots ship source only. We ``pip install --force-reinstall
|
||||||
|
--no-deps`` into this venv on promotion so shebangs never dangle
|
||||||
|
across a rotation.
|
||||||
|
"""
|
||||||
|
return install_dir / "venv"
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------- public
|
# ------------------------------------------------------------------- public
|
||||||
|
|
||||||
def read_release(release: pathlib.Path) -> Release:
|
def read_release(release: pathlib.Path) -> Release:
|
||||||
@@ -167,20 +177,29 @@ def extract_tarball(tarball_bytes: bytes, dest: pathlib.Path) -> None:
|
|||||||
|
|
||||||
# ---------------------------------------------------------------- seams
|
# ---------------------------------------------------------------- seams
|
||||||
|
|
||||||
def _run_pip(release: pathlib.Path) -> subprocess.CompletedProcess:
|
def _run_pip(
|
||||||
"""Create a venv in ``release/.venv`` and pip install -e . into it.
|
release: pathlib.Path,
|
||||||
|
install_dir: Optional[pathlib.Path] = None,
|
||||||
|
) -> subprocess.CompletedProcess:
|
||||||
|
"""pip install ``release`` into the shared venv at ``install_dir/venv``.
|
||||||
|
|
||||||
|
The shared venv is bootstrapped on first use. ``--force-reinstall
|
||||||
|
--no-deps`` replaces site-packages for the decnet package only; the
|
||||||
|
rest of the env stays cached across updates.
|
||||||
|
|
||||||
Monkeypatched in tests so the test suite never shells out.
|
Monkeypatched in tests so the test suite never shells out.
|
||||||
"""
|
"""
|
||||||
venv_dir = release / ".venv"
|
idir = install_dir or release.parent.parent # releases/<slot> -> install_dir
|
||||||
|
venv_dir = _shared_venv(idir)
|
||||||
if not venv_dir.exists():
|
if not venv_dir.exists():
|
||||||
subprocess.run( # nosec B603
|
subprocess.run( # nosec B603
|
||||||
[sys.executable, "-m", "venv", str(venv_dir)],
|
[sys.executable, "-m", "venv", str(venv_dir)],
|
||||||
check=True, capture_output=True, text=True,
|
check=True, capture_output=True, text=True,
|
||||||
)
|
)
|
||||||
py = _venv_python(release)
|
py = venv_dir / "bin" / "python"
|
||||||
return subprocess.run( # nosec B603
|
return subprocess.run( # nosec B603
|
||||||
[str(py), "-m", "pip", "install", "-e", str(release)],
|
[str(py), "-m", "pip", "install", "--force-reinstall", "--no-deps",
|
||||||
|
str(release)],
|
||||||
check=False, capture_output=True, text=True,
|
check=False, capture_output=True, text=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -190,41 +209,97 @@ def _spawn_agent(install_dir: pathlib.Path) -> int:
|
|||||||
|
|
||||||
Returns the new PID. Monkeypatched in tests.
|
Returns the new PID. Monkeypatched in tests.
|
||||||
"""
|
"""
|
||||||
py = _venv_python(_current_symlink(install_dir).resolve())
|
decnet_bin = _shared_venv(install_dir) / "bin" / "decnet"
|
||||||
|
log_path = install_dir / "agent.spawn.log"
|
||||||
|
# cwd=install_dir so a persistent ``<install_dir>/.env.local`` gets
|
||||||
|
# picked up by decnet.env (which loads from CWD). The release slot
|
||||||
|
# itself is immutable across updates, so the env file cannot live
|
||||||
|
# inside it.
|
||||||
proc = subprocess.Popen( # nosec B603
|
proc = subprocess.Popen( # nosec B603
|
||||||
[str(py), "-m", "decnet", "agent", "--daemon"],
|
[str(decnet_bin), "agent", "--daemon"],
|
||||||
start_new_session=True,
|
start_new_session=True,
|
||||||
stdout=subprocess.DEVNULL,
|
cwd=str(install_dir),
|
||||||
stderr=subprocess.DEVNULL,
|
stdout=open(log_path, "ab"), # noqa: SIM115
|
||||||
|
stderr=subprocess.STDOUT,
|
||||||
)
|
)
|
||||||
_pid_file(install_dir).write_text(str(proc.pid))
|
_pid_file(install_dir).write_text(str(proc.pid))
|
||||||
return proc.pid
|
return proc.pid
|
||||||
|
|
||||||
|
|
||||||
def _stop_agent(install_dir: pathlib.Path, grace: float = AGENT_RESTART_GRACE_S) -> None:
|
def _discover_agent_pids() -> list[int]:
|
||||||
"""SIGTERM the PID we spawned; SIGKILL if it doesn't exit in ``grace`` s."""
|
"""Scan /proc for any running ``decnet agent`` process.
|
||||||
pid_file = _pid_file(install_dir)
|
|
||||||
if not pid_file.is_file():
|
Used as a fallback when agent.pid is missing (e.g., the agent was started
|
||||||
return
|
by hand rather than by the updater) so an update still produces a clean
|
||||||
try:
|
restart instead of leaving the old in-memory code serving requests.
|
||||||
pid = int(pid_file.read_text().strip())
|
"""
|
||||||
except (ValueError, OSError):
|
pids: list[int] = []
|
||||||
return
|
self_pid = os.getpid()
|
||||||
try:
|
for entry in pathlib.Path("/proc").iterdir():
|
||||||
os.kill(pid, signal.SIGTERM)
|
if not entry.name.isdigit():
|
||||||
except ProcessLookupError:
|
continue
|
||||||
return
|
pid = int(entry.name)
|
||||||
deadline = time.monotonic() + grace
|
if pid == self_pid:
|
||||||
while time.monotonic() < deadline:
|
continue
|
||||||
try:
|
try:
|
||||||
os.kill(pid, 0)
|
raw = (entry / "cmdline").read_bytes()
|
||||||
|
except (FileNotFoundError, PermissionError, OSError):
|
||||||
|
continue
|
||||||
|
argv = [a for a in raw.split(b"\x00") if a]
|
||||||
|
if len(argv) < 2:
|
||||||
|
continue
|
||||||
|
if not argv[0].endswith(b"python") and b"python" not in pathlib.Path(argv[0].decode(errors="ignore")).name.encode():
|
||||||
|
# Allow direct console-script invocation too: argv[0] ends with /decnet
|
||||||
|
if not argv[0].endswith(b"/decnet"):
|
||||||
|
continue
|
||||||
|
if b"decnet" in b" ".join(argv) and b"agent" in argv:
|
||||||
|
pids.append(pid)
|
||||||
|
return pids
|
||||||
|
|
||||||
|
|
||||||
|
def _stop_agent(install_dir: pathlib.Path, grace: float = AGENT_RESTART_GRACE_S) -> None:
|
||||||
|
"""SIGTERM the agent and wait for it to exit; SIGKILL after ``grace`` s.
|
||||||
|
|
||||||
|
Prefers the PID recorded in ``agent.pid`` (processes we spawned) but
|
||||||
|
falls back to scanning /proc for any ``decnet agent`` so manually-started
|
||||||
|
agents are also restarted cleanly during an update.
|
||||||
|
"""
|
||||||
|
pids: list[int] = []
|
||||||
|
pid_file = _pid_file(install_dir)
|
||||||
|
if pid_file.is_file():
|
||||||
|
try:
|
||||||
|
pids.append(int(pid_file.read_text().strip()))
|
||||||
|
except (ValueError, OSError):
|
||||||
|
pass
|
||||||
|
for pid in _discover_agent_pids():
|
||||||
|
if pid not in pids:
|
||||||
|
pids.append(pid)
|
||||||
|
if not pids:
|
||||||
|
return
|
||||||
|
for pid in pids:
|
||||||
|
try:
|
||||||
|
os.kill(pid, signal.SIGTERM)
|
||||||
except ProcessLookupError:
|
except ProcessLookupError:
|
||||||
return
|
continue
|
||||||
time.sleep(0.2)
|
deadline = time.monotonic() + grace
|
||||||
|
remaining = list(pids)
|
||||||
|
while remaining and time.monotonic() < deadline:
|
||||||
|
remaining = [p for p in remaining if _pid_alive(p)]
|
||||||
|
if remaining:
|
||||||
|
time.sleep(0.2)
|
||||||
|
for pid in remaining:
|
||||||
|
try:
|
||||||
|
os.kill(pid, signal.SIGKILL)
|
||||||
|
except ProcessLookupError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _pid_alive(pid: int) -> bool:
|
||||||
try:
|
try:
|
||||||
os.kill(pid, signal.SIGKILL)
|
os.kill(pid, 0)
|
||||||
|
return True
|
||||||
except ProcessLookupError:
|
except ProcessLookupError:
|
||||||
pass
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _probe_agent(
|
def _probe_agent(
|
||||||
@@ -239,8 +314,10 @@ def _probe_agent(
|
|||||||
ca = agent_dir / "ca.crt"
|
ca = agent_dir / "ca.crt"
|
||||||
if not (worker_key.is_file() and worker_crt.is_file() and ca.is_file()):
|
if not (worker_key.is_file() and worker_crt.is_file() and ca.is_file()):
|
||||||
return False, f"no mTLS bundle at {agent_dir}"
|
return False, f"no mTLS bundle at {agent_dir}"
|
||||||
ctx = ssl.create_default_context(cafile=str(ca))
|
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
|
||||||
ctx.load_cert_chain(certfile=str(worker_crt), keyfile=str(worker_key))
|
ctx.load_cert_chain(certfile=str(worker_crt), keyfile=str(worker_key))
|
||||||
|
ctx.load_verify_locations(cafile=str(ca))
|
||||||
|
ctx.verify_mode = ssl.CERT_REQUIRED
|
||||||
ctx.check_hostname = False
|
ctx.check_hostname = False
|
||||||
|
|
||||||
last = ""
|
last = ""
|
||||||
@@ -407,7 +484,7 @@ def run_update_self(
|
|||||||
_rotate(updater_install_dir)
|
_rotate(updater_install_dir)
|
||||||
_point_current_at(updater_install_dir, _active_dir(updater_install_dir))
|
_point_current_at(updater_install_dir, _active_dir(updater_install_dir))
|
||||||
|
|
||||||
argv = [str(_venv_python(_active_dir(updater_install_dir))), "-m", "decnet", "updater"] + sys.argv[1:]
|
argv = [str(_shared_venv(updater_install_dir) / "bin" / "decnet"), "updater"] + sys.argv[1:]
|
||||||
if exec_cb is not None:
|
if exec_cb is not None:
|
||||||
exec_cb(argv) # tests stub this — we don't actually re-exec
|
exec_cb(argv) # tests stub this — we don't actually re-exec
|
||||||
return {"status": "self_update_queued", "argv": argv}
|
return {"status": "self_update_queued", "argv": argv}
|
||||||
|
|||||||
@@ -25,7 +25,9 @@ dependencies = [
|
|||||||
"scapy>=2.6.1",
|
"scapy>=2.6.1",
|
||||||
"orjson>=3.10",
|
"orjson>=3.10",
|
||||||
"cryptography>=46.0.7",
|
"cryptography>=46.0.7",
|
||||||
"python-multipart>=0.0.20"
|
"python-multipart>=0.0.20",
|
||||||
|
"httpx>=0.28.1",
|
||||||
|
"requests>=2.33.1"
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|||||||
@@ -293,3 +293,25 @@ def test_update_self_pip_failure_leaves_active_intact(
|
|||||||
ex.run_update_self(tb, sha="U", updater_install_dir=install_dir, exec_cb=lambda a: None)
|
ex.run_update_self(tb, sha="U", updater_install_dir=install_dir, exec_cb=lambda a: None)
|
||||||
assert (install_dir / "releases" / "active" / "marker").read_text() == "old-updater"
|
assert (install_dir / "releases" / "active" / "marker").read_text() == "old-updater"
|
||||||
assert not (install_dir / "releases" / "active.new").exists()
|
assert not (install_dir / "releases" / "active.new").exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_stop_agent_falls_back_to_proc_scan_when_no_pidfile(
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
install_dir: pathlib.Path,
|
||||||
|
) -> None:
|
||||||
|
"""No agent.pid → _stop_agent still terminates agents found via /proc."""
|
||||||
|
killed: list[tuple[int, int]] = []
|
||||||
|
|
||||||
|
def fake_kill(pid: int, sig: int) -> None:
|
||||||
|
killed.append((pid, sig))
|
||||||
|
raise ProcessLookupError # pretend it already died after SIGTERM
|
||||||
|
|
||||||
|
monkeypatch.setattr(ex, "_discover_agent_pids", lambda: [4242, 4243])
|
||||||
|
monkeypatch.setattr(ex.os, "kill", fake_kill)
|
||||||
|
|
||||||
|
assert not (install_dir / "agent.pid").exists()
|
||||||
|
ex._stop_agent(install_dir, grace=0.0)
|
||||||
|
|
||||||
|
import signal as _signal
|
||||||
|
assert (4242, _signal.SIGTERM) in killed
|
||||||
|
assert (4243, _signal.SIGTERM) in killed
|
||||||
|
|||||||
Reference in New Issue
Block a user