feat(updater): remote self-update daemon with auto-rollback
Adds a separate `decnet updater` daemon on each worker that owns the agent's release directory and installs tarball pushes from the master over mTLS. A normal `/update` never touches the updater itself, so the updater is always a known-good rescuer if a bad agent push breaks `/health` — the release rotation is reversed and the agent is restarted against the previous release. `POST /update-self` handles updater upgrades explicitly (no auto-rollback).

- decnet/updater/: executor, FastAPI app, uvicorn launcher
- decnet/swarm/updater_client.py, tar_tree.py: master-side push
- cli: `decnet updater`, `decnet swarm update [--host|--all] [--include-self] [--dry-run]`, `--updater` on `swarm enroll`
- enrollment API issues a second cert (CN=updater@<host>) signed by the same CA; SwarmHost records updater_cert_fingerprint
- tests: executor, app, CLI, tar tree, enroll-with-updater (37 new)
- wiki: Remote-Updates page + sidebar + SWARM-Mode cross-link
This commit is contained in:
97
decnet/swarm/tar_tree.py
Normal file
97
decnet/swarm/tar_tree.py
Normal file
@@ -0,0 +1,97 @@
|
||||
"""Build a gzipped tarball of the master's working tree for pushing to workers.
|
||||
|
||||
Always excludes the obvious large / secret / churn paths: ``.venv/``,
|
||||
``__pycache__/``, ``.git/``, ``wiki-checkout/``, ``*.db*``, ``*.log``. The
|
||||
caller can supply additional exclude globs.
|
||||
|
||||
Deliberately does NOT invoke git — the tree is what the operator has on
|
||||
disk (staged + unstaged + untracked). That's the whole point; the scp
|
||||
workflow we're replacing also shipped the live tree.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import fnmatch
|
||||
import io
|
||||
import pathlib
|
||||
import tarfile
|
||||
from typing import Iterable, Optional
|
||||
|
||||
# Exclude globs applied to every push, in addition to any caller-supplied
# patterns. `_is_excluded` also matches each pattern against every leading
# subpath, so the bare directory names here catch nested occurrences too.
DEFAULT_EXCLUDES = (
    # virtualenvs
    ".venv",
    ".venv/*",
    "**/.venv/*",
    # bytecode caches
    "__pycache__",
    "**/__pycache__",
    "**/__pycache__/*",
    # version control
    ".git",
    ".git/*",
    # local wiki clone
    "wiki-checkout",
    "wiki-checkout/*",
    # compiled python
    "*.pyc",
    "*.pyo",
    # sqlite databases (plus WAL/SHM sidecars)
    "*.db",
    "*.db-wal",
    "*.db-shm",
    # logs
    "*.log",
    # tool caches / build residue
    ".pytest_cache",
    ".pytest_cache/*",
    ".mypy_cache",
    ".mypy_cache/*",
    ".tox",
    ".tox/*",
    "*.egg-info",
    "*.egg-info/*",
    # master-local state that must never ship to a worker
    "decnet-state.json",
    "master.log",
    "master.json",
    "decnet.db*",
)
|
||||
|
||||
|
||||
def _is_excluded(rel: str, patterns: Iterable[str]) -> bool:
|
||||
parts = pathlib.PurePosixPath(rel).parts
|
||||
for pat in patterns:
|
||||
if fnmatch.fnmatch(rel, pat):
|
||||
return True
|
||||
# Also match the pattern against every leading subpath — this is
|
||||
# what catches nested `.venv/...` without forcing callers to spell
|
||||
# out every `**/` glob.
|
||||
for i in range(1, len(parts) + 1):
|
||||
if fnmatch.fnmatch("/".join(parts[:i]), pat):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def tar_working_tree(
    root: pathlib.Path,
    extra_excludes: Optional[Iterable[str]] = None,
) -> bytes:
    """Return the gzipped tarball bytes of ``root``.

    Entries are added with paths relative to ``root`` (no leading ``/``,
    no ``..``). The updater rejects unsafe paths on the receiving side.

    Args:
        root: directory whose working tree is archived.
        extra_excludes: additional glob patterns, appended to
            ``DEFAULT_EXCLUDES``.
    """
    patterns = [*DEFAULT_EXCLUDES, *(extra_excludes or ())]
    sink = io.BytesIO()

    with tarfile.open(fileobj=sink, mode="w:gz") as archive:
        # Sorted walk keeps the member order deterministic run-to-run.
        for entry in sorted(root.rglob("*")):
            arcname = entry.relative_to(root).as_posix()
            if _is_excluded(arcname, patterns):
                continue
            # Symlinks are skipped (rare in a repo tree and often break
            # portability — better than shipping dangling links); plain
            # directories are skipped too, since only regular files are
            # archived (symlink check first, so a link-to-dir is treated
            # as a symlink).
            if entry.is_symlink() or entry.is_dir():
                continue
            archive.add(entry, arcname=arcname, recursive=False)

    return sink.getvalue()
|
||||
|
||||
|
||||
def detect_git_sha(root: pathlib.Path) -> str:
    """Best-effort ``HEAD`` sha. Returns ``""`` if not a git repo.

    A symbolic ``HEAD`` (``ref: refs/heads/...``) is resolved through the
    loose ref file first, then — new — through ``.git/packed-refs``, where
    ``git pack-refs`` / ``git gc`` may have moved the ref (previously that
    case silently returned ``""``). Never raises; any I/O or parse problem
    yields ``""``.
    """
    head = root / ".git" / "HEAD"
    if not head.is_file():
        return ""
    try:
        ref = head.read_text().strip()
    except OSError:
        return ""
    if not ref.startswith("ref: "):
        # Detached HEAD: the file holds the sha directly.
        return ref
    ref_name = ref[5:]
    ref_path = root / ".git" / ref_name
    if ref_path.is_file():
        try:
            return ref_path.read_text().strip()
        except OSError:
            return ""
    # Loose ref file absent — the ref may have been packed.
    packed = root / ".git" / "packed-refs"
    if packed.is_file():
        try:
            lines = packed.read_text().splitlines()
        except OSError:
            return ""
        for line in lines:
            line = line.strip()
            # Skip the header comment and peeled-tag continuation lines.
            if not line or line.startswith(("#", "^")):
                continue
            sha, _, name = line.partition(" ")
            if name == ref_name:
                return sha
    return ""
|
||||
124
decnet/swarm/updater_client.py
Normal file
124
decnet/swarm/updater_client.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""Master-side HTTP client for the worker's self-updater daemon.
|
||||
|
||||
Sibling of ``AgentClient``: same mTLS identity (same DECNET CA, same
|
||||
master client cert) but targets the updater's port (default 8766) and
|
||||
speaks the multipart upload protocol the updater's ``/update`` endpoint
|
||||
expects.
|
||||
|
||||
Kept as its own module — not a subclass of ``AgentClient`` — because the
|
||||
timeouts and failure semantics are genuinely different: pip install +
|
||||
agent probe can take a minute on a slow VM, and ``/update-self`` drops
|
||||
the connection on purpose (the updater re-execs itself mid-response).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import ssl
|
||||
from typing import Any, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import MasterIdentity, ensure_master_identity
|
||||
|
||||
log = get_logger("swarm.updater_client")

# Timeouts (seconds). Per the module docstring, /update runs a pip install
# plus an agent probe on the worker, which can take a minute on a slow VM —
# hence the long read/write budget. Control-plane RPCs stay short.
_TIMEOUT_UPDATE = httpx.Timeout(connect=10.0, read=180.0, write=120.0, pool=5.0)
_TIMEOUT_CONTROL = httpx.Timeout(connect=5.0, read=30.0, write=10.0, pool=5.0)
|
||||
|
||||
|
||||
class UpdaterClient:
    """Async client targeting a worker's ``decnet updater`` daemon.

    Must be used as an async context manager: the mTLS-backed
    ``httpx.AsyncClient`` is created on ``__aenter__`` (with the short
    ``_TIMEOUT_CONTROL`` default) and closed on ``__aexit__``. Long RPCs
    (``/update``, ``/update-self``) pass ``_TIMEOUT_UPDATE`` per request
    instead of mutating the shared client's timeout.
    """

    def __init__(
        self,
        host: dict[str, Any] | None = None,
        *,
        address: Optional[str] = None,
        updater_port: int = 8766,
        identity: Optional[MasterIdentity] = None,
    ):
        """Accept either a host record or a bare address.

        Args:
            host: mapping with ``address`` (required key) and optional
                ``name``. Takes precedence over ``address``.
            address: target address when no host mapping is available.
            updater_port: updater daemon port (default 8766).
            identity: master mTLS identity; defaults to
                ``ensure_master_identity()``.

        Raises:
            ValueError: neither ``host`` nor ``address`` was supplied.
        """
        if host is not None:
            self._address = host["address"]
            self._host_name = host.get("name")
        else:
            if address is None:
                raise ValueError("UpdaterClient requires host dict or address")
            self._address = address
            self._host_name = None
        self._port = updater_port
        self._identity = identity or ensure_master_identity()
        self._client: Optional[httpx.AsyncClient] = None

    def _build_client(self, timeout: httpx.Timeout) -> httpx.AsyncClient:
        """Build the mTLS client all RPCs go through."""
        ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
        ctx.load_cert_chain(
            str(self._identity.cert_path), str(self._identity.key_path),
        )
        ctx.load_verify_locations(cafile=str(self._identity.ca_cert_path))
        ctx.verify_mode = ssl.CERT_REQUIRED
        # Workers are addressed directly (no DNS names under the private
        # CA); trust comes from the cert chain, not the hostname.
        ctx.check_hostname = False
        return httpx.AsyncClient(
            base_url=f"https://{self._address}:{self._port}",
            verify=ctx,
            timeout=timeout,
        )

    async def __aenter__(self) -> "UpdaterClient":
        self._client = self._build_client(_TIMEOUT_CONTROL)
        return self

    async def __aexit__(self, *exc: Any) -> None:
        if self._client:
            await self._client.aclose()
            self._client = None

    def _require(self) -> httpx.AsyncClient:
        """Return the live client, or raise if used outside ``async with``."""
        if self._client is None:
            raise RuntimeError("UpdaterClient used outside `async with` block")
        return self._client

    # --------------------------------------------------------------- RPCs

    async def health(self) -> dict[str, Any]:
        """GET /health; returns the decoded JSON body, raises on non-2xx."""
        r = await self._require().get("/health")
        r.raise_for_status()
        return r.json()

    async def releases(self) -> dict[str, Any]:
        """GET /releases; returns the decoded JSON body, raises on non-2xx."""
        r = await self._require().get("/releases")
        r.raise_for_status()
        return r.json()

    async def update(self, tarball: bytes, sha: str = "") -> httpx.Response:
        """POST /update. Returns the Response so the caller can distinguish
        200 / 409 / 500 — each means something different.
        """
        # Fix: use a per-request timeout rather than the previous pattern of
        # mutating self._client.timeout and restoring it in `finally` — that
        # mutation was racy: a concurrent control RPC on the same client
        # would inherit the long update timeout.
        return await self._require().post(
            "/update",
            files={"tarball": ("tree.tgz", tarball, "application/gzip")},
            data={"sha": sha},
            timeout=_TIMEOUT_UPDATE,
        )

    async def update_self(self, tarball: bytes, sha: str = "") -> httpx.Response:
        """POST /update-self. The updater re-execs itself, so the connection
        usually drops mid-response; that's not an error. Callers should then
        poll /health until the new SHA appears.
        """
        # Per-request timeout for the same reason as update().
        return await self._require().post(
            "/update-self",
            files={"tarball": ("tree.tgz", tarball, "application/gzip")},
            data={"sha": sha, "confirm_self": "true"},
            timeout=_TIMEOUT_UPDATE,
        )

    async def rollback(self) -> httpx.Response:
        """POST /rollback; returns the raw Response (no raise_for_status)."""
        return await self._require().post("/rollback")
|
||||
Reference in New Issue
Block a user