feat(updater): remote self-update daemon with auto-rollback

Adds a separate `decnet updater` daemon on each worker that owns the
agent's release directory and installs tarball pushes from the master
over mTLS. A normal `/update` never touches the updater itself, so the
updater is always a known-good rescuer if a bad agent push breaks
/health — the rotation is reversed and the agent restarted against the
previous release. `POST /update-self` handles updater upgrades
explicitly (no auto-rollback).

- decnet/updater/: executor, FastAPI app, uvicorn launcher
- decnet/swarm/updater_client.py, tar_tree.py: master-side push
- cli: `decnet updater`, `decnet swarm update [--host|--all]
  [--include-self] [--dry-run]`, `--updater` on `swarm enroll`
- enrollment API issues a second cert (CN=updater@<host>) signed by the
  same CA; SwarmHost records updater_cert_fingerprint
- tests: executor, app, CLI, tar tree, enroll-with-updater (37 new)
- wiki: Remote-Updates page + sidebar + SWARM-Mode cross-link
This commit is contained in:
2026-04-18 21:40:21 -04:00
parent 8914c27220
commit 7765b36c50
16 changed files with 1814 additions and 4 deletions

View File

@@ -0,0 +1,192 @@
"""CLI `decnet swarm update` — target resolution, tarring, push aggregation.
The UpdaterClient is stubbed: we are testing the CLI's orchestration, not
the wire protocol (that is covered by test_updater_app.py; UpdaterClient
round-trips live in the test_swarm_api.py integration tests).
"""
from __future__ import annotations
import json
import pathlib
from typing import Any
import pytest
from typer.testing import CliRunner
from decnet import cli as cli_mod
from decnet.cli import app
runner = CliRunner()
class _FakeResp:
def __init__(self, payload: Any, status: int = 200):
self._payload = payload
self.status_code = status
self.text = json.dumps(payload) if not isinstance(payload, str) else payload
self.content = self.text.encode()
def json(self) -> Any:
return self._payload
@pytest.fixture
def http_stub(monkeypatch: pytest.MonkeyPatch) -> dict:
    """Replace ``cli_mod._http_request`` with a scripted fake.

    Returns the mutable state dict. Tests assign ``state["hosts"]`` to
    control what a ``GET`` on a URL ending in ``/swarm/hosts`` answers; any
    other request raises ``AssertionError``.
    """
    scripted: dict = {"hosts": []}

    def _fake(method, url, *, json_body=None, timeout=30.0):
        is_host_listing = method == "GET" and url.endswith("/swarm/hosts")
        if not is_host_listing:
            raise AssertionError(f"Unscripted HTTP call: {method} {url}")
        return _FakeResp(scripted["hosts"])

    monkeypatch.setattr(cli_mod, "_http_request", _fake)
    return scripted
class _StubUpdaterClient:
"""Mirrors UpdaterClient's async-context-manager surface."""
instances: list["_StubUpdaterClient"] = []
behavior: dict[str, Any] = {}
def __init__(self, host, *, updater_port: int = 8766, **_: Any):
self.host = host
self.port = updater_port
self.calls: list[str] = []
_StubUpdaterClient.instances.append(self)
async def __aenter__(self) -> "_StubUpdaterClient":
return self
async def __aexit__(self, *exc: Any) -> None:
return None
async def update(self, tarball: bytes, sha: str = "") -> _FakeResp:
self.calls.append("update")
return _StubUpdaterClient.behavior.get(
self.host.get("name"),
_FakeResp({"status": "updated", "release": {"sha": sha}}, 200),
)
async def update_self(self, tarball: bytes, sha: str = "") -> _FakeResp:
self.calls.append("update_self")
return _FakeResp({"status": "self_update_queued"}, 200)
@pytest.fixture
def stub_updater(monkeypatch: pytest.MonkeyPatch):
    """Swap ``_StubUpdaterClient`` in for the real UpdaterClient.

    Clears the stub's class-level ``instances`` and ``behavior`` first so
    state from an earlier test is not carried over, then returns the stub
    class so tests can inspect it.
    """
    for shared in (_StubUpdaterClient.instances, _StubUpdaterClient.behavior):
        shared.clear()
    monkeypatch.setattr(
        "decnet.swarm.updater_client.UpdaterClient", _StubUpdaterClient,
    )
    # NOTE(review): this import does no patching by itself. Presumably the
    # CLI looks UpdaterClient up on decnet.swarm.updater_client at call time,
    # so the setattr above is what takes effect — confirm against cli.py.
    import decnet.cli  # noqa: F401
    return _StubUpdaterClient
def _mk_source_tree(tmp_path: pathlib.Path) -> pathlib.Path:
root = tmp_path / "src"
root.mkdir()
(root / "decnet").mkdir()
(root / "decnet" / "a.py").write_text("x = 1")
return root
# ------------------------------------------------------------- arg validation
def test_update_requires_host_or_all(http_stub) -> None:
    """`swarm update` with neither --host nor --all exits with code 2."""
    result = runner.invoke(app, ["swarm", "update"])
    assert result.exit_code == 2
def test_update_host_and_all_are_mutex(http_stub) -> None:
    """Passing both --host and --all exits with code 2."""
    argv = ["swarm", "update", "--host", "w1", "--all"]
    result = runner.invoke(app, argv)
    assert result.exit_code == 2
def test_update_unknown_host_exits_1(http_stub) -> None:
    """A --host name absent from the host listing exits 1 with a message."""
    only_enrolled = {
        "uuid": "u1",
        "name": "other",
        "address": "10.0.0.1",
        "status": "active",
    }
    http_stub["hosts"] = [only_enrolled]
    result = runner.invoke(app, ["swarm", "update", "--host", "nope"])
    assert result.exit_code == 1
    assert "No enrolled worker" in result.output
# ---------------------------------------------------------------- happy paths
def test_update_single_host(http_stub, stub_updater, tmp_path: pathlib.Path) -> None:
    """`--host w1` builds a client for w1 only, though w2 is also listed."""
    http_stub["hosts"] = [
        {"uuid": uuid, "name": name, "address": address, "status": "active"}
        for uuid, name, address in (
            ("u1", "w1", "10.0.0.1"),
            ("u2", "w2", "10.0.0.2"),
        )
    ]
    src = _mk_source_tree(tmp_path)
    argv = ["swarm", "update", "--host", "w1", "--root", str(src)]
    result = runner.invoke(app, argv)
    assert result.exit_code == 0, result.output
    assert "w1" in result.output
    # Only w1 got a client; w2 is untouched.
    targeted = [stub.host["name"] for stub in stub_updater.instances]
    assert targeted == ["w1"]
def test_update_all_skips_decommissioned(http_stub, stub_updater, tmp_path: pathlib.Path) -> None:
    """`--all` targets active and enrolled hosts but not decommissioned ones."""
    http_stub["hosts"] = [
        {"uuid": uuid, "name": name, "address": address, "status": status}
        for uuid, name, address, status in (
            ("u1", "w1", "10.0.0.1", "active"),
            ("u2", "w2", "10.0.0.2", "decommissioned"),
            ("u3", "w3", "10.0.0.3", "enrolled"),
        )
    ]
    src = _mk_source_tree(tmp_path)
    result = runner.invoke(app, ["swarm", "update", "--all", "--root", str(src)])
    assert result.exit_code == 0, result.output
    # Compare sorted names so the check does not depend on push order.
    targeted = sorted(stub.host["name"] for stub in stub_updater.instances)
    assert targeted == ["w1", "w3"]
def test_update_include_self_calls_both(
    http_stub, stub_updater, tmp_path: pathlib.Path,
) -> None:
    """`--include-self` calls ``update`` and then ``update_self`` on the host."""
    http_stub["hosts"] = [{"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"}]
    root = _mk_source_tree(tmp_path)
    r = runner.invoke(app, ["swarm", "update", "--all", "--root", str(root), "--include-self"])
    # Attach the CLI output so a failing exit code is diagnosable, as the
    # other happy-path tests in this module do.
    assert r.exit_code == 0, r.output
    assert stub_updater.instances[0].calls == ["update", "update_self"]
# ------------------------------------------------------------- failure modes
def test_update_rollback_status_409_flags_failure(
    http_stub, stub_updater, tmp_path: pathlib.Path,
) -> None:
    """A 409 rolled-back response from the updater makes the CLI exit 1."""
    http_stub["hosts"] = [
        {"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"},
    ]
    rolled_back = _FakeResp(
        {"detail": {"error": "probe failed", "rolled_back": True}},
        status=409,
    )
    # Script the stub so w1's update() answers with the rollback response.
    _StubUpdaterClient.behavior["w1"] = rolled_back
    src = _mk_source_tree(tmp_path)
    result = runner.invoke(app, ["swarm", "update", "--all", "--root", str(src)])
    assert result.exit_code == 1
    assert "rolled-back" in result.output
def test_update_include_self_skipped_when_agent_update_failed(
    http_stub, stub_updater, tmp_path: pathlib.Path,
) -> None:
    """With `--include-self`, a failed agent update suppresses ``update_self``."""
    http_stub["hosts"] = [
        {"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"},
    ]
    failed = _FakeResp({"detail": {"error": "pip failed"}}, status=500)
    _StubUpdaterClient.behavior["w1"] = failed
    src = _mk_source_tree(tmp_path)
    argv = ["swarm", "update", "--all", "--root", str(src), "--include-self"]
    result = runner.invoke(app, argv)
    assert result.exit_code == 1
    # update_self must NOT have been called — agent update failed.
    recorded = stub_updater.instances[0].calls
    assert recorded == ["update"]
# --------------------------------------------------------------------- dry run
def test_update_dry_run_does_not_call_updater(
    http_stub, stub_updater, tmp_path: pathlib.Path,
) -> None:
    """`--dry-run` exits 0, builds no updater client, and says so in output."""
    http_stub["hosts"] = [{"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"}]
    root = _mk_source_tree(tmp_path)
    r = runner.invoke(app, ["swarm", "update", "--all", "--root", str(root), "--dry-run"])
    # Attach the CLI output so a failing exit code is diagnosable, as the
    # other happy-path tests in this module do.
    assert r.exit_code == 0, r.output
    assert stub_updater.instances == []
    assert "dry-run" in r.output.lower()

View File

@@ -78,6 +78,36 @@ def test_enroll_creates_host_and_returns_bundle(client: TestClient) -> None:
assert len(body["fingerprint"]) == 64 # sha256 hex
def test_enroll_with_updater_issues_second_cert(client: TestClient, ca_dir) -> None:
    """Enrolling with ``issue_updater_bundle`` returns a distinct updater cert.

    Checks the updater bundle in the response, the cert/key files under the
    CA directory, and the updater fingerprint on the host row.
    """
    enroll_request = {
        "name": "worker-upd",
        "address": "10.0.0.99",
        "agent_port": 8765,
        "issue_updater_bundle": True,
    }
    resp = client.post("/swarm/enroll", json=enroll_request)
    assert resp.status_code == 201, resp.text
    body = resp.json()
    updater = body["updater"]
    assert updater is not None
    assert updater["fingerprint"] != body["fingerprint"]
    assert "-----BEGIN CERTIFICATE-----" in updater["updater_cert_pem"]
    assert "-----BEGIN PRIVATE KEY-----" in updater["updater_key_pem"]
    # Cert bundle persisted on master.
    bundle_dir = ca_dir / "workers" / "worker-upd" / "updater"
    for filename in ("updater.crt", "updater.key"):
        assert (bundle_dir / filename).is_file()
    # DB row carries the updater fingerprint.
    host_row = client.get(f"/swarm/hosts/{body['host_uuid']}").json()
    assert host_row.get("updater_cert_fingerprint") == updater["fingerprint"]
def test_enroll_without_updater_omits_bundle(client: TestClient) -> None:
    """Enrolling without ``issue_updater_bundle`` returns ``updater: None``."""
    resp = client.post(
        "/swarm/enroll",
        json={"name": "worker-no-upd", "address": "10.0.0.98", "agent_port": 8765},
    )
    # Include the response body in the failure message, matching the
    # enroll-with-updater test, so an unexpected status is diagnosable.
    assert resp.status_code == 201, resp.text
    assert resp.json()["updater"] is None
def test_enroll_rejects_duplicate_name(client: TestClient) -> None:
payload = {"name": "worker-dup", "address": "10.0.0.6", "agent_port": 8765}
assert client.post("/swarm/enroll", json=payload).status_code == 201

View File

@@ -0,0 +1,75 @@
"""tar_working_tree: exclude filter, tarball validity, git SHA detection."""
from __future__ import annotations
import io
import pathlib
import tarfile
from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
def _tree_names(data: bytes) -> set[str]:
with tarfile.open(fileobj=io.BytesIO(data), mode="r:gz") as tar:
return {m.name for m in tar.getmembers()}
def test_tar_excludes_default_patterns(tmp_path: pathlib.Path) -> None:
    """With no extra excludes, only ``decnet/keep.py`` of the fixture survives.

    The fixture mixes one source file with a virtualenv, a git directory,
    bytecode cache, a wiki checkout, a ``.db`` file and a ``.log`` file.
    """
    # Relative path -> content, created in this order.
    fixture_files = {
        "decnet/keep.py": "x = 1",
        ".venv/pyvenv.cfg": "junk",
        ".git/HEAD": "ref: refs/heads/main\n",
        "decnet/__pycache__/keep.cpython-311.pyc": "bytecode",
        "wiki-checkout/Home.md": "# wiki",
        "run.db": "sqlite",
        "master.log": "log",
    }
    for rel_path, text in fixture_files.items():
        target = tmp_path / rel_path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(text)
    names = _tree_names(tar_working_tree(tmp_path))
    assert "decnet/keep.py" in names
    # No member path may mention any of the excluded directories.
    for excluded in (".venv", ".git", "__pycache__", "wiki-checkout"):
        assert all(excluded not in n for n in names)
    assert "run.db" not in names
    assert "master.log" not in names
def test_tar_accepts_extra_excludes(tmp_path: pathlib.Path) -> None:
    """A name passed via ``extra_excludes`` is left out; other files remain."""
    kept = tmp_path / "a.py"
    dropped = tmp_path / "secret.env"
    kept.write_text("x")
    dropped.write_text("TOKEN=abc")
    tarball = tar_working_tree(tmp_path, extra_excludes=["secret.env"])
    names = _tree_names(tarball)
    assert "a.py" in names
    assert "secret.env" not in names
def test_tar_skips_symlinks(tmp_path: pathlib.Path) -> None:
    """Symlinks are left out of the tarball while regular files are kept.

    Skipped (not passed) when the platform cannot create symlinks.
    """
    # Local import: the visible import block of this module lacks pytest.
    import pytest

    target = tmp_path / "real.txt"
    target.write_text("hi")
    try:
        (tmp_path / "link.txt").symlink_to(target)
    except (OSError, NotImplementedError):
        # Report a skip rather than a silent pass: no symlink exists, so
        # the exclusion behaviour was never exercised on this platform.
        pytest.skip("platform does not support symlinks")
    names = _tree_names(tar_working_tree(tmp_path))
    assert "real.txt" in names
    assert "link.txt" not in names
def test_detect_git_sha_from_ref(tmp_path: pathlib.Path) -> None:
    """A symbolic HEAD is resolved through the ref file it points at."""
    git_dir = tmp_path / ".git"
    heads_dir = git_dir / "refs" / "heads"
    heads_dir.mkdir(parents=True)
    (heads_dir / "main").write_text("deadbeef" * 5 + "\n")
    (git_dir / "HEAD").write_text("ref: refs/heads/main\n")
    sha = detect_git_sha(tmp_path)
    assert sha.startswith("deadbeef")
def test_detect_git_sha_detached(tmp_path: pathlib.Path) -> None:
    """A HEAD file holding a raw hash (detached HEAD) is read directly."""
    git_dir = tmp_path / ".git"
    git_dir.mkdir()
    (git_dir / "HEAD").write_text("f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0\n")
    sha = detect_git_sha(tmp_path)
    assert sha.startswith("f0f0")
def test_detect_git_sha_none_when_not_repo(tmp_path: pathlib.Path) -> None:
    """A directory with no ``.git`` inside yields an empty string."""
    sha = detect_git_sha(tmp_path)
    assert sha == ""