The bootstrap was installing into /opt/decnet/.venv with an editable
`pip install -e .`, and /usr/local/bin/decnet pointed there. The updater
writes releases to /opt/decnet/releases/active/ with a shared venv at
/opt/decnet/venv — a parallel tree nothing on the box actually runs.
Result: updates appeared to succeed (release dir rotated, SHA changed)
but systemd kept executing the untouched bootstrap code.
Changes:
- Bootstrap now installs directly into /opt/decnet/releases/active
with the shared venv at /opt/decnet/venv and /opt/decnet/current
symlinked. Same layout the updater rotates in and out of.
- /usr/local/bin/decnet -> /opt/decnet/venv/bin/decnet.
- run_update / run_update_self heal /usr/local/bin/decnet on every
push so already-enrolled hosts recover on the next update instead
of needing a re-enroll.
- run_update / run_update_self now log each phase (receive, extract,
pip install, rotate, restart, probe) so the updater log actually
shows what happened.
629 lines
23 KiB
Python
629 lines
23 KiB
Python
"""Update/rollback orchestrator for the DECNET self-updater.
|
|
|
|
Directory layout owned by this module (root = ``install_dir``):
|
|
|
|
<install_dir>/
|
|
current -> releases/active (symlink; atomic swap == promotion)
|
|
releases/
|
|
active/ (working tree; source only, installed into the shared <install_dir>/venv)
|
|
prev/ (last good snapshot; restored on failure)
|
|
active.new/ (staging; only exists mid-update)
|
|
agent.pid (PID of the agent process we spawned)
|
|
|
|
Rollback semantics: if the agent doesn't come back healthy after an update,
|
|
we swap the symlink back to ``prev``, restart the agent, and return the
|
|
captured pip/agent stderr to the caller.
|
|
|
|
Seams for tests — every subprocess call goes through a module-level hook
|
|
(`_run_pip`, `_spawn_agent`, `_probe_agent`) so tests can monkeypatch them
|
|
without actually touching the filesystem's Python toolchain.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import dataclasses
|
|
import os
|
|
import pathlib
|
|
import shutil
|
|
import signal
|
|
import ssl
|
|
import subprocess # nosec B404
|
|
import sys
|
|
import tarfile
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from typing import Any, Callable, Optional
|
|
|
|
import httpx
|
|
|
|
from decnet.logging import get_logger
|
|
from decnet.swarm import pki
|
|
|
|
# Module-level logger used by every function in this file.
log = get_logger("updater.executor")

# Default root of the install tree (layout described in the module docstring).
DEFAULT_INSTALL_DIR = pathlib.Path("/opt/decnet")
# Default URL requested by ``_probe_agent``.
AGENT_PROBE_URL = "https://127.0.0.1:8765/health"
# Default number of requests ``_probe_agent`` makes before giving up.
AGENT_PROBE_ATTEMPTS = 10
# Default pause, in seconds, between two consecutive probe attempts.
AGENT_PROBE_BACKOFF_S = 1.0
# Default time, in seconds, ``_stop_agent`` waits after SIGTERM before SIGKILL.
AGENT_RESTART_GRACE_S = 10.0
|
|
|
|
|
|
# ------------------------------------------------------------------- errors
|
|
|
|
class UpdateError(RuntimeError):
    """Signal a failed update that left the install dir consistent.

    The captured stderr rides along on the exception so the master gets
    actionable output.

    Attributes:
        stderr: Captured output describing the failure (may be empty).
        rolled_back: Flag supplied by the raiser; defaults to ``False``.
    """

    def __init__(self, message: str, *, stderr: str = "", rolled_back: bool = False):
        super().__init__(message)
        self.rolled_back = rolled_back
        self.stderr = stderr
|
|
|
|
|
|
# -------------------------------------------------------------------- types
|
|
|
|
@dataclasses.dataclass(frozen=True)
class Release:
    """Immutable record describing one release slot."""

    # Name of the slot directory the record was read from.
    slot: str
    # Value of the manifest's "sha" entry, or None when unknown.
    sha: Optional[str]
    # Parsed "installed_at" timestamp, or None when unknown.
    installed_at: Optional[datetime]

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dict with the timestamp rendered via ``isoformat()``."""
        stamp = None
        if self.installed_at:
            stamp = self.installed_at.isoformat()
        return {"slot": self.slot, "sha": self.sha, "installed_at": stamp}
|
|
|
|
|
|
# ---------------------------------------------------------------- internals
|
|
|
|
def _releases_dir(install_dir: pathlib.Path) -> pathlib.Path:
|
|
return install_dir / "releases"
|
|
|
|
|
|
def _active_dir(install_dir: pathlib.Path) -> pathlib.Path:
    """Return the path of the ``active`` slot under the releases directory."""
    releases = _releases_dir(install_dir)
    return releases / "active"
|
|
|
|
|
|
def _prev_dir(install_dir: pathlib.Path) -> pathlib.Path:
    """Return the path of the ``prev`` slot under the releases directory."""
    releases = _releases_dir(install_dir)
    return releases / "prev"
|
|
|
|
|
|
def _staging_dir(install_dir: pathlib.Path) -> pathlib.Path:
    """Return the path of the ``active.new`` staging slot."""
    releases = _releases_dir(install_dir)
    return releases / "active.new"
|
|
|
|
|
|
def _current_symlink(install_dir: pathlib.Path) -> pathlib.Path:
|
|
return install_dir / "current"
|
|
|
|
|
|
def _pid_file(install_dir: pathlib.Path) -> pathlib.Path:
|
|
return install_dir / "agent.pid"
|
|
|
|
|
|
def _manifest_file(release: pathlib.Path) -> pathlib.Path:
|
|
return release / ".decnet-release.json"
|
|
|
|
|
|
def _venv_python(release: pathlib.Path) -> pathlib.Path:
|
|
return release / ".venv" / "bin" / "python"
|
|
|
|
|
|
def _heal_path_symlink(install_dir: pathlib.Path) -> None:
    """Repoint /usr/local/bin/decnet at the ``decnet`` script in the shared venv.

    Older bootstraps installed into ``<install_dir>/.venv`` (editable) and
    linked /usr/local/bin/decnet there, which kept systemd units on the
    pre-update code even after ``_run_pip`` wrote to the shared venv.
    Running this on every update lets already-enrolled hosts recover on
    the next push rather than requiring a manual re-enroll.

    Does nothing when the shared-venv script is missing or the link is
    already correct. Any ``OSError`` is logged as a warning, not raised.
    """
    wanted = _shared_venv(install_dir) / "bin" / "decnet"
    path_entry = pathlib.Path("/usr/local/bin/decnet")
    if not wanted.is_file():
        return
    try:
        already_ok = (
            path_entry.is_symlink()
            and pathlib.Path(os.readlink(path_entry)) == wanted
        )
        if already_ok:
            return
        # Build the new link next to the old one, then rename over it so the
        # PATH entry is replaced in a single step.
        scratch = path_entry.with_suffix(".tmp")
        if scratch.is_symlink() or scratch.exists():
            scratch.unlink()
        scratch.symlink_to(wanted)
        os.replace(scratch, path_entry)
        log.info("repointed %s -> %s", path_entry, wanted)
    except OSError as err:
        log.warning("could not repoint %s: %s", path_entry, err)
|
|
|
|
|
|
def _shared_venv(install_dir: pathlib.Path) -> pathlib.Path:
|
|
"""The one stable venv that agents/updaters run out of.
|
|
|
|
Release slots ship source only. We ``pip install --force-reinstall
|
|
--no-deps`` into this venv on promotion so shebangs never dangle
|
|
across a rotation.
|
|
"""
|
|
return install_dir / "venv"
|
|
|
|
|
|
# ------------------------------------------------------------------- public
|
|
|
|
def read_release(release: pathlib.Path) -> Release:
    """Read the release manifest sidecar; tolerate absence and corruption.

    Args:
        release: Path of a release slot directory.

    Returns:
        A ``Release`` whose ``slot`` is the directory name. ``sha`` and
        ``installed_at`` are ``None`` when the manifest is missing,
        unreadable, not valid JSON, or not a JSON object. A manifest with
        an unparseable ``installed_at`` keeps its ``sha`` and only loses
        the timestamp.
    """
    import json

    slot = release.name
    unknown = Release(slot=slot, sha=None, installed_at=None)
    mf = _manifest_file(release)
    if not mf.is_file():
        return unknown

    try:
        data = json.loads(mf.read_text())
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and undecodable bytes.
        return unknown
    if not isinstance(data, dict):
        # Valid JSON but not an object (e.g. a list): treat as no manifest.
        return unknown

    ts = data.get("installed_at")
    try:
        installed_at = datetime.fromisoformat(ts) if ts else None
    except (TypeError, ValueError):
        # Non-string or malformed timestamp: keep the sha, drop the time.
        installed_at = None
    return Release(slot=slot, sha=data.get("sha"), installed_at=installed_at)
|
|
|
|
|
|
def list_releases(install_dir: pathlib.Path) -> list[Release]:
    """Return manifests for the ``active`` and ``prev`` slots that exist.

    The result is ordered ``active`` first, then ``prev``; a slot whose
    directory is missing is left out.
    """
    candidates = (_active_dir(install_dir), _prev_dir(install_dir))
    return [read_release(slot) for slot in candidates if slot.is_dir()]
|
|
|
|
|
|
def clean_stale_staging(install_dir: pathlib.Path) -> None:
    """Delete a leftover ``active.new`` from an update that crashed midway."""
    leftover = _staging_dir(install_dir)
    if not leftover.exists():
        return
    log.warning("removing stale staging dir %s", leftover)
    shutil.rmtree(leftover, ignore_errors=True)
|
|
|
|
|
|
def extract_tarball(tarball_bytes: bytes, dest: pathlib.Path) -> None:
    """Extract a gzipped tarball into ``dest`` (must not pre-exist).

    Every member is validated before anything is written:

    * absolute paths and ``..`` components in member names are rejected;
    * symlinks and hardlinks whose target is absolute or resolves outside
      the archive root are rejected, because a later member could
      otherwise be written *through* the link to a location outside
      ``dest``;
    * anything that is not a regular file, directory, or link (device
      nodes, FIFOs) is rejected.

    Args:
        tarball_bytes: The complete ``.tar.gz`` payload.
        dest: Directory to create and extract into.

    Raises:
        FileExistsError: ``dest`` already exists.
        UpdateError: The archive contains an unsafe member.
        tarfile.TarError: The payload is not a readable gzipped tarball.
    """
    import io
    import posixpath

    dest.mkdir(parents=True, exist_ok=False)
    with tarfile.open(fileobj=io.BytesIO(tarball_bytes), mode="r:gz") as tar:
        for member in tar.getmembers():
            name = member.name
            if name.startswith("/") or ".." in pathlib.PurePosixPath(name).parts:
                raise UpdateError(f"unsafe path in tarball: {name!r}")
            if member.issym() or member.islnk():
                target = member.linkname
                # Symlink targets are relative to the link's own directory;
                # hardlink targets are relative to the archive root.
                base = posixpath.dirname(name) if member.issym() else ""
                resolved = posixpath.normpath(posixpath.join(base, target))
                escapes = resolved == ".." or resolved.startswith("../")
                if target.startswith("/") or escapes:
                    raise UpdateError(
                        f"unsafe link in tarball: {name!r} -> {target!r}"
                    )
            elif not (member.isfile() or member.isdir()):
                raise UpdateError(f"unsupported member type in tarball: {name!r}")
        tar.extractall(dest)  # nosec B202 — validated above
|
|
|
|
|
|
# ---------------------------------------------------------------- seams
|
|
|
|
def _run_pip(
    release: pathlib.Path,
    install_dir: Optional[pathlib.Path] = None,
) -> subprocess.CompletedProcess:
    """pip install ``release`` into the shared venv at ``install_dir/venv``.

    The shared venv is bootstrapped on first use. The first install into a
    fresh venv pulls the full dependency tree; later installs add
    ``--no-deps`` so pip only replaces the decnet package and the rest of
    the env stays cached across updates.

    "Fresh" is decided by whether the venv directory exists, so a venv
    whose bootstrap or first install failed is removed again. Otherwise
    the next attempt would see the directory, switch to ``--no-deps``, and
    never install the dependencies.

    Monkeypatched in tests so the test suite never shells out.

    Args:
        release: Release directory to install.
        install_dir: Root holding the shared venv. Defaults to two levels
            above ``release`` (``releases/<slot>`` -> install dir).

    Returns:
        The ``CompletedProcess`` of the ``pip install`` call; callers
        inspect ``returncode`` / ``stderr`` / ``stdout``.

    Raises:
        subprocess.CalledProcessError: Creating the venv failed.
    """
    idir = install_dir or release.parent.parent  # releases/<slot> -> install_dir
    venv_dir = _shared_venv(idir)
    fresh = not venv_dir.exists()
    if fresh:
        try:
            subprocess.run(  # nosec B603
                [sys.executable, "-m", "venv", str(venv_dir)],
                check=True, capture_output=True, text=True,
            )
        except (subprocess.CalledProcessError, OSError):
            # Don't leave a half-built venv behind to be mistaken for a
            # bootstrapped one on the next attempt.
            shutil.rmtree(venv_dir, ignore_errors=True)
            raise
    py = venv_dir / "bin" / "python"
    args = [str(py), "-m", "pip", "install", "--force-reinstall"]
    if not fresh:
        args.append("--no-deps")
    args.append(str(release))
    result = subprocess.run(  # nosec B603
        args, check=False, capture_output=True, text=True,
    )
    if fresh and result.returncode != 0:
        # The dependency-pulling install failed: discard the venv so the
        # next update bootstraps (and installs dependencies) again.
        shutil.rmtree(venv_dir, ignore_errors=True)
    return result
|
|
|
|
|
|
# systemd unit names used by this module. The agent and forwarder units are
# restarted by ``_spawn_agent_via_systemd``; the updater unit is restarted by
# ``run_update_self``.
AGENT_SYSTEMD_UNIT = "decnet-agent.service"
FORWARDER_SYSTEMD_UNIT = "decnet-forwarder.service"
UPDATER_SYSTEMD_UNIT = "decnet-updater.service"
|
|
|
|
|
|
def _systemd_available() -> bool:
|
|
"""True when we're running under systemd and have systemctl on PATH.
|
|
|
|
Detection is conservative: we only return True if *both* the invocation
|
|
marker is set (``INVOCATION_ID`` is exported by systemd for every unit)
|
|
and ``systemctl`` is resolvable. The env var alone can be forged; the
|
|
binary alone can exist on hosts running other init systems.
|
|
"""
|
|
if not os.environ.get("INVOCATION_ID"):
|
|
return False
|
|
from shutil import which
|
|
return which("systemctl") is not None
|
|
|
|
|
|
def _spawn_agent(install_dir: pathlib.Path) -> int:
    """Start the agent and return its PID.

    Under systemd the agent is restarted through ``systemctl`` so that it
    keeps the unit's ambient capabilities (CAP_NET_ADMIN, CAP_NET_RAW). A
    ``subprocess.Popen`` from inside the updater unit would make the agent
    a child of the updater with the updater's (empty) capability set, and
    it would come up without the caps MACVLAN/scapy need.

    Without systemd (dev boxes, manual starts) a direct Popen is used.
    """
    launcher = (
        _spawn_agent_via_systemd if _systemd_available() else _spawn_agent_via_popen
    )
    return launcher(install_dir)
|
|
|
|
|
|
def _spawn_agent_via_systemd(install_dir: pathlib.Path) -> int:
    """Restart the agent (and forwarder) units; return the agent's MainPID.

    Agent and forwarder both run out of the same /opt/decnet tree, so a
    code push that replaces the tree has to cycle both or the forwarder
    keeps the pre-update code in memory. The forwarder restart is
    best-effort: a worker without that unit (e.g. a legacy enrollment)
    must not abort the update.

    A non-zero MainPID is also recorded in ``agent.pid``.

    Raises:
        subprocess.CalledProcessError: The agent restart or the MainPID
            query exited non-zero.
    """
    run_opts = {"capture_output": True, "text": True}

    subprocess.run(  # nosec B603 B607
        ["systemctl", "restart", AGENT_SYSTEMD_UNIT], check=True, **run_opts
    )

    forwarder = subprocess.run(  # nosec B603 B607
        ["systemctl", "restart", FORWARDER_SYSTEMD_UNIT], check=False, **run_opts
    )
    if forwarder.returncode != 0:
        log.warning(
            "forwarder restart failed (ignored): %s", forwarder.stderr.strip()
        )

    shown = subprocess.run(  # nosec B603 B607
        ["systemctl", "show", "--property=MainPID", "--value", AGENT_SYSTEMD_UNIT],
        check=True,
        **run_opts,
    )
    main_pid = int(shown.stdout.strip() or "0")
    if main_pid:
        _pid_file(install_dir).write_text(str(main_pid))
    return main_pid
|
|
|
|
|
|
def _spawn_agent_via_popen(install_dir: pathlib.Path) -> int:
    """Launch ``decnet agent --daemon`` from the shared venv; return its PID.

    The child runs in its own session with stdout and stderr appended to
    ``<install_dir>/agent.spawn.log``, and its PID is written to
    ``agent.pid``.

    Args:
        install_dir: Root of the install tree; also used as the child's
            working directory.

    Returns:
        PID of the spawned process.
    """
    decnet_bin = _shared_venv(install_dir) / "bin" / "decnet"
    log_path = install_dir / "agent.spawn.log"
    # cwd=install_dir so a persistent ``<install_dir>/.env.local`` gets
    # picked up by decnet.env (which loads from CWD). The release slot
    # itself is immutable across updates, so the env file cannot live
    # inside it.
    #
    # The child gets its own duplicate of the log descriptor, so the
    # parent's handle is closed as soon as the spawn returns instead of
    # leaking one open file per restart.
    with open(log_path, "ab") as log_fh:
        proc = subprocess.Popen(  # nosec B603
            [str(decnet_bin), "agent", "--daemon"],
            start_new_session=True,
            cwd=str(install_dir),
            stdout=log_fh,
            stderr=subprocess.STDOUT,
        )
    _pid_file(install_dir).write_text(str(proc.pid))
    return proc.pid
|
|
|
|
|
|
def _discover_agent_pids() -> list[int]:
|
|
"""Scan /proc for any running ``decnet agent`` process.
|
|
|
|
Used as a fallback when agent.pid is missing (e.g., the agent was started
|
|
by hand rather than by the updater) so an update still produces a clean
|
|
restart instead of leaving the old in-memory code serving requests.
|
|
"""
|
|
pids: list[int] = []
|
|
self_pid = os.getpid()
|
|
for entry in pathlib.Path("/proc").iterdir():
|
|
if not entry.name.isdigit():
|
|
continue
|
|
pid = int(entry.name)
|
|
if pid == self_pid:
|
|
continue
|
|
try:
|
|
raw = (entry / "cmdline").read_bytes()
|
|
except (FileNotFoundError, PermissionError, OSError):
|
|
continue
|
|
argv = [a for a in raw.split(b"\x00") if a]
|
|
if len(argv) < 2:
|
|
continue
|
|
if not argv[0].endswith(b"python") and b"python" not in pathlib.Path(argv[0].decode(errors="ignore")).name.encode():
|
|
# Allow direct console-script invocation too: argv[0] ends with /decnet
|
|
if not argv[0].endswith(b"/decnet"):
|
|
continue
|
|
if b"decnet" in b" ".join(argv) and b"agent" in argv:
|
|
pids.append(pid)
|
|
return pids
|
|
|
|
|
|
def _stop_agent(install_dir: pathlib.Path, grace: float = AGENT_RESTART_GRACE_S) -> None:
    """SIGTERM the agent and wait for it to exit; SIGKILL after ``grace`` s.

    Prefers the PID recorded in ``agent.pid`` (processes we spawned) but
    falls back to scanning /proc for any ``decnet agent`` so manually-started
    agents are also restarted cleanly during an update.

    Under systemd, stop is a no-op — ``_spawn_agent`` issues a single
    ``systemctl restart`` that handles stop and start atomically. Pre-stopping
    would only race the restart's own stop phase.

    A recorded PID that is zero, negative, or our own is ignored: passing
    such a value to ``os.kill`` would signal our process group, every
    process we may signal, or the updater itself.

    Args:
        install_dir: Root of the install tree that holds ``agent.pid``.
        grace: Seconds to wait after SIGTERM before sending SIGKILL.
    """
    if _systemd_available():
        return
    pids: list[int] = []
    pid_file = _pid_file(install_dir)
    if pid_file.is_file():
        try:
            recorded = int(pid_file.read_text().strip())
        except (ValueError, OSError):
            recorded = None
        if recorded is not None:
            if recorded > 0 and recorded != os.getpid():
                pids.append(recorded)
            else:
                log.warning("ignoring unusable pid %d in %s", recorded, pid_file)
    for pid in _discover_agent_pids():
        if pid not in pids:
            pids.append(pid)
    if not pids:
        return
    for pid in pids:
        try:
            os.kill(pid, signal.SIGTERM)
        except ProcessLookupError:
            continue
    deadline = time.monotonic() + grace
    remaining = list(pids)
    # Poll until every process is gone or the grace period runs out.
    while remaining and time.monotonic() < deadline:
        remaining = [p for p in remaining if _pid_alive(p)]
        if remaining:
            time.sleep(0.2)
    for pid in remaining:
        try:
            os.kill(pid, signal.SIGKILL)
        except ProcessLookupError:
            pass
|
|
|
|
|
|
def _pid_alive(pid: int) -> bool:
|
|
try:
|
|
os.kill(pid, 0)
|
|
return True
|
|
except ProcessLookupError:
|
|
return False
|
|
|
|
|
|
def _probe_agent(
    agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR,
    url: str = AGENT_PROBE_URL,
    attempts: int = AGENT_PROBE_ATTEMPTS,
    backoff_s: float = AGENT_PROBE_BACKOFF_S,
) -> tuple[bool, str]:
    """Run a local mTLS health probe against the agent.

    Args:
        agent_dir: Directory holding ``worker.key``, ``worker.crt`` and
            ``ca.crt``.
        url: Health endpoint to request.
        attempts: Maximum number of requests.
        backoff_s: Seconds to sleep between attempts.

    Returns:
        ``(True, body)`` on the first HTTP 200. Otherwise ``(False,
        detail)``, where ``detail`` describes the last failure or the
        missing mTLS bundle.
    """
    key_path = agent_dir / "worker.key"
    cert_path = agent_dir / "worker.crt"
    ca_path = agent_dir / "ca.crt"
    if not all(p.is_file() for p in (key_path, cert_path, ca_path)):
        return False, f"no mTLS bundle at {agent_dir}"

    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ctx.load_cert_chain(certfile=str(cert_path), keyfile=str(key_path))
    ctx.load_verify_locations(cafile=str(ca_path))
    ctx.verify_mode = ssl.CERT_REQUIRED
    # The peer certificate is still verified against the CA; only the
    # hostname match is switched off.
    ctx.check_hostname = False

    detail = ""
    final_attempt = attempts - 1
    for attempt in range(attempts):
        try:
            with httpx.Client(verify=ctx, timeout=3.0) as client:
                resp = client.get(url)
                if resp.status_code == 200:
                    return True, resp.text
                detail = f"status={resp.status_code} body={resp.text[:200]}"
        except Exception as exc:  # noqa: BLE001
            detail = f"{type(exc).__name__}: {exc}"
        if attempt < final_attempt:
            time.sleep(backoff_s)
    return False, detail
|
|
|
|
|
|
# -------------------------------------------------------------- orchestrator
|
|
|
|
def _write_manifest(release: pathlib.Path, sha: Optional[str]) -> None:
    """Write the manifest sidecar for ``release``.

    The manifest records ``sha`` and the current UTC time as
    ``installed_at``.
    """
    import json

    payload = {
        "sha": sha,
        "installed_at": datetime.now(timezone.utc).isoformat(),
    }
    _manifest_file(release).write_text(json.dumps(payload))
|
|
|
|
|
|
def _rotate(install_dir: pathlib.Path) -> None:
    """Shift the release slots: drop ``prev``, demote ``active``, promote staging.

    ``active.new`` must exist when this is called; ``active`` is optional.
    """
    live = _active_dir(install_dir)
    previous = _prev_dir(install_dir)
    incoming = _staging_dir(install_dir)

    # Order matters: each rename needs its destination name to be free.
    if previous.exists():
        shutil.rmtree(previous)
    if live.exists():
        live.rename(previous)
    incoming.rename(live)
|
|
|
|
|
|
def _point_current_at(install_dir: pathlib.Path, target: pathlib.Path) -> None:
    """Flip the ``current`` symlink to ``target`` via a rename."""
    current = _current_symlink(install_dir)
    scratch = install_dir / ".current.tmp"
    # Clear any leftover scratch link (possibly dangling) from an earlier run.
    if scratch.is_symlink() or scratch.exists():
        scratch.unlink()
    scratch.symlink_to(target)
    os.replace(scratch, current)
|
|
|
|
|
|
def run_update(
    tarball_bytes: bytes,
    sha: Optional[str],
    install_dir: pathlib.Path = DEFAULT_INSTALL_DIR,
    agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR,
) -> dict[str, Any]:
    """Apply an update atomically. Rolls back on probe failure.

    Phases: extract into staging, pip install into the shared venv, rotate
    the release slots, flip ``current``, heal the PATH symlink, restart the
    agent, probe it.

    Rollback swaps ``active`` and ``prev`` back and reinstalls the restored
    release into the shared venv. The agent runs out of that venv, not out
    of the slot directory, so swapping directories alone would leave the
    failed code installed. When there is no ``prev`` slot (first install)
    nothing is swapped and the new release stays in place.

    Args:
        tarball_bytes: Gzipped tarball of the new release.
        sha: Identifier recorded in the release manifest.
        install_dir: Root of the install tree.
        agent_dir: Directory holding the mTLS bundle used by the probe.

    Returns:
        ``{"status": "updated", "release": ..., "probe": ...}`` on success.

    Raises:
        UpdateError: pip failed (staging is removed), or the agent failed
            its health probe. In the latter case ``rolled_back`` tells
            whether the previous release was restored and is healthy.
    """
    log.info("update received sha=%s bytes=%d install_dir=%s", sha, len(tarball_bytes), install_dir)
    clean_stale_staging(install_dir)
    staging = _staging_dir(install_dir)

    log.info("extracting tarball -> %s", staging)
    extract_tarball(tarball_bytes, staging)
    _write_manifest(staging, sha)

    log.info("pip install into shared venv (%s)", _shared_venv(install_dir))
    pip = _run_pip(staging)
    if pip.returncode != 0:
        log.error("pip install failed rc=%d stderr=%s", pip.returncode, (pip.stderr or pip.stdout).strip()[:400])
        shutil.rmtree(staging, ignore_errors=True)
        raise UpdateError(
            "pip install failed on new release", stderr=pip.stderr or pip.stdout,
        )

    log.info("rotating releases: active.new -> active, active -> prev")
    _rotate(install_dir)
    _point_current_at(install_dir, _active_dir(install_dir))
    _heal_path_symlink(install_dir)

    log.info("restarting agent (and forwarder if present)")
    _stop_agent(install_dir)
    _spawn_agent(install_dir)

    ok, detail = _probe_agent(agent_dir=agent_dir)
    if ok:
        log.info("update complete sha=%s probe=ok", sha)
        return {
            "status": "updated",
            "release": read_release(_active_dir(install_dir)).to_dict(),
            "probe": detail,
        }

    # Rollback.
    active = _active_dir(install_dir)
    prev = _prev_dir(install_dir)
    if not prev.is_dir():
        # First-ever install: ``_rotate`` had no ``active`` to demote, so
        # there is nothing to swap back to. Moving ``active`` away here
        # would leave the host without any release at all.
        log.warning("agent probe failed after update: %s — no previous release to roll back to", detail)
        raise UpdateError(
            "agent failed health probe after update; no previous release to roll back to",
            stderr=f"forward-probe: {detail}",
            rolled_back=False,
        )

    log.warning("agent probe failed after update: %s — rolling back", detail)
    _stop_agent(install_dir)
    # Swap active <-> prev.
    tmp = _releases_dir(install_dir) / ".swap"
    if tmp.exists():
        shutil.rmtree(tmp)
    active.rename(tmp)
    prev.rename(active)
    tmp.rename(prev)
    _point_current_at(install_dir, active)

    # Put the restored release's code back into the shared venv; without
    # this the restarted agent would still run the release that just failed.
    log.info("reinstalling previous release into shared venv (%s)", _shared_venv(install_dir))
    repip = _run_pip(active)
    if repip.returncode != 0:
        log.error("rollback pip install failed rc=%d stderr=%s", repip.returncode, (repip.stderr or repip.stdout).strip()[:400])

    _spawn_agent(install_dir)
    ok2, detail2 = _probe_agent(agent_dir=agent_dir)
    stderr = f"forward-probe: {detail}\nrollback-probe: {detail2}"
    if repip.returncode != 0:
        stderr += f"\nrollback-pip: {repip.stderr or repip.stdout}"
    raise UpdateError(
        "agent failed health probe after update; rolled back to previous release",
        stderr=stderr,
        rolled_back=ok2 and repip.returncode == 0,
    )
|
|
|
|
|
|
def run_rollback(
    install_dir: pathlib.Path = DEFAULT_INSTALL_DIR,
    agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR,
) -> dict[str, Any]:
    """Manually swap active with prev and restart the agent.

    After the swap the restored release is reinstalled into the shared
    venv. The agent runs out of that venv, so swapping the slot
    directories alone would leave the newer code installed and running.

    Args:
        install_dir: Root of the install tree.
        agent_dir: Directory holding the mTLS bundle used by the probe.

    Returns:
        ``{"status": "rolled_back", "release": ..., "probe": ...}``.

    Raises:
        UpdateError: There is no ``prev`` slot, reinstalling the restored
            release failed, or the agent is unhealthy afterwards.
    """
    active = _active_dir(install_dir)
    prev = _prev_dir(install_dir)
    if not prev.is_dir():
        raise UpdateError("no previous release to roll back to")

    _stop_agent(install_dir)
    tmp = _releases_dir(install_dir) / ".swap"
    if tmp.exists():
        shutil.rmtree(tmp)
    active.rename(tmp)
    prev.rename(active)
    tmp.rename(prev)
    _point_current_at(install_dir, active)

    log.info("reinstalling restored release into shared venv (%s)", _shared_venv(install_dir))
    pip = _run_pip(active)
    if pip.returncode != 0:
        log.error("rollback pip install failed rc=%d stderr=%s", pip.returncode, (pip.stderr or pip.stdout).strip()[:400])

    # Bring the agent back even when pip failed, so the host is not left
    # without a running agent; the failure is reported right after.
    _spawn_agent(install_dir)
    ok, detail = _probe_agent(agent_dir=agent_dir)
    if pip.returncode != 0:
        raise UpdateError(
            "pip install failed while restoring previous release",
            stderr=pip.stderr or pip.stdout,
        )
    if not ok:
        raise UpdateError("agent unhealthy after rollback", stderr=detail)
    return {
        "status": "rolled_back",
        "release": read_release(active).to_dict(),
        "probe": detail,
    }
|
|
|
|
|
|
def run_update_self(
    tarball_bytes: bytes,
    sha: Optional[str],
    updater_install_dir: pathlib.Path,
    exec_cb: Optional[Callable[[list[str]], None]] = None,
) -> dict[str, Any]:
    """Swap in a new updater source tree, then restart this process.

    There is no auto-rollback. The caller must treat "connection dropped +
    /health returns new SHA within 30s" as success.

    Args:
        tarball_bytes: Gzipped tarball of the new updater release.
        sha: Identifier recorded in the release manifest.
        updater_install_dir: Root of the updater's install tree.
        exec_cb: When given, called with the reconstructed argv instead of
            restarting; used by tests.

    Returns:
        A dict whose ``status`` is ``"self_update_queued"``.

    Raises:
        UpdateError: pip failed on the new release (staging is removed).
    """
    log.info("self-update received sha=%s bytes=%d install_dir=%s", sha, len(tarball_bytes), updater_install_dir)
    clean_stale_staging(updater_install_dir)
    staging = _staging_dir(updater_install_dir)
    log.info("extracting tarball -> %s", staging)
    extract_tarball(tarball_bytes, staging)
    _write_manifest(staging, sha)

    log.info("pip install updater release into shared venv (%s)", _shared_venv(updater_install_dir))
    pip = _run_pip(staging)
    if pip.returncode != 0:
        pip_output = pip.stderr or pip.stdout
        log.error("self-update pip install failed rc=%d stderr=%s", pip.returncode, pip_output.strip()[:400])
        shutil.rmtree(staging, ignore_errors=True)
        raise UpdateError(
            "pip install failed on new updater release",
            stderr=pip.stderr or pip.stdout,
        )

    log.info("rotating updater releases and flipping current symlink")
    _rotate(updater_install_dir)
    _point_current_at(updater_install_dir, _active_dir(updater_install_dir))
    _heal_path_symlink(updater_install_dir)

    # Rebuild the updater's original launch command from the env vars that
    # `decnet.updater.server.run` sets. sys.argv is no use here: inside the
    # app process it holds the uvicorn subprocess invocation
    # (--ssl-keyfile, etc.), not the operator-visible `decnet updater ...`.
    env = os.environ
    host = env.get("DECNET_UPDATER_HOST", "0.0.0.0")  # nosec B104
    port = env.get("DECNET_UPDATER_PORT", "8766")
    bundle_dir = env.get(
        "DECNET_UPDATER_BUNDLE_DIR", str(pki.DEFAULT_AGENT_DIR.parent / "updater")
    )
    agent_install_dir = env.get(
        "DECNET_UPDATER_INSTALL_DIR", str(updater_install_dir.parent)
    )
    agent_bundle_dir = env.get("DECNET_UPDATER_AGENT_DIR", str(pki.DEFAULT_AGENT_DIR))
    decnet_bin = str(_shared_venv(updater_install_dir) / "bin" / "decnet")
    argv = [
        decnet_bin, "updater",
        "--host", host,
        "--port", port,
        "--updater-dir", bundle_dir,
        "--install-dir", agent_install_dir,
        "--agent-dir", agent_bundle_dir,
    ]

    if exec_cb is not None:
        exec_cb(argv)  # tests stub this — we don't actually re-exec
        return {"status": "self_update_queued", "argv": argv}

    # Under systemd the init system performs the restart, so the new process
    # keeps its unit context (capabilities, cgroup, logging target) rather
    # than inheriting ours. A detached sh waits for this response to flush
    # first: `systemctl restart` on our own unit would kill us mid-response
    # and the caller would see a dropped connection with no sign of success.
    if _systemd_available():
        log.info("self-update queued: systemctl restart %s (deferred 1s)", UPDATER_SYSTEMD_UNIT)
        deferred_restart = "sleep 1 && systemctl restart " + UPDATER_SYSTEMD_UNIT
        subprocess.Popen(  # nosec B603 B607
            ["sh", "-c", deferred_restart],
            start_new_session=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        return {"status": "self_update_queued", "via": "systemd"}

    # Off-systemd fallback: replace the process image directly.
    os.execv(argv[0], argv)  # nosec B606 - pragma: no cover
    return {"status": "self_update_queued"}  # pragma: no cover
|