feat(updater): remote self-update daemon with auto-rollback

Adds a separate `decnet updater` daemon on each worker that owns the agent's release directory and installs tarball pushes from the master over mTLS. A normal `/update` never touches the updater itself, so the updater is always a known-good rescuer if a bad agent push breaks /health — the rotation is reversed and the agent is restarted against the previous release. `POST /update-self` handles updater upgrades explicitly (no auto-rollback).

- decnet/updater/: executor, FastAPI app, uvicorn launcher
- decnet/swarm/updater_client.py, tar_tree.py: master-side push
- cli: `decnet updater`, `decnet swarm update [--host|--all] [--include-self] [--dry-run]`, `--updater` on `swarm enroll`
- enrollment API issues a second cert (CN=updater@<host>) signed by the same CA; SwarmHost records updater_cert_fingerprint
- tests: executor, app, CLI, tar tree, enroll-with-updater (37 new)
- wiki: Remote-Updates page + sidebar + SWARM-Mode cross-link
2026-04-18 21:40:21 -04:00
parent 8914c27220
commit 7765b36c50
16 changed files with 1814 additions and 4 deletions
--- a/tests/swarm/test_cli_swarm_update.py
+++ b/tests/swarm/test_cli_swarm_update.py
@@ -0,0 +1,192 @@
+"""CLI `decnet swarm update` — target resolution, tarring, push aggregation.
+
+The UpdaterClient is stubbed: we are testing the CLI's orchestration, not
+the wire protocol (covered by test_updater_app.py; UpdaterClient
+round-trips live under the test_swarm_api.py integration tests).
+"""
+from __future__ import annotations
+
+import json
+import pathlib
+from typing import Any
+
+import pytest
+from typer.testing import CliRunner
+
+from decnet import cli as cli_mod
+from decnet.cli import app
+
+
+runner = CliRunner()
+
+
+class _FakeResp:
+    def __init__(self, payload: Any, status: int = 200):
+        self._payload = payload
+        self.status_code = status
+        self.text = json.dumps(payload) if not isinstance(payload, str) else payload
+        self.content = self.text.encode()
+
+    def json(self) -> Any:
+        return self._payload
+
+
+@pytest.fixture
+def http_stub(monkeypatch: pytest.MonkeyPatch) -> dict:
+    state: dict = {"hosts": []}
+
+    def _fake(method, url, *, json_body=None, timeout=30.0):
+        if method == "GET" and url.endswith("/swarm/hosts"):
+            return _FakeResp(state["hosts"])
+        raise AssertionError(f"Unscripted HTTP call: {method} {url}")
+
+    monkeypatch.setattr(cli_mod, "_http_request", _fake)
+    return state
+
+
+class _StubUpdaterClient:
+    """Mirrors UpdaterClient's async-context-manager surface."""
+    instances: list["_StubUpdaterClient"] = []
+    behavior: dict[str, Any] = {}
+
+    def __init__(self, host, *, updater_port: int = 8766, **_: Any):
+        self.host = host
+        self.port = updater_port
+        self.calls: list[str] = []
+        _StubUpdaterClient.instances.append(self)
+
+    async def __aenter__(self) -> "_StubUpdaterClient":
+        return self
+
+    async def __aexit__(self, *exc: Any) -> None:
+        return None
+
+    async def update(self, tarball: bytes, sha: str = "") -> _FakeResp:
+        self.calls.append("update")
+        return _StubUpdaterClient.behavior.get(
+            self.host.get("name"),
+            _FakeResp({"status": "updated", "release": {"sha": sha}}, 200),
+        )
+
+    async def update_self(self, tarball: bytes, sha: str = "") -> _FakeResp:
+        self.calls.append("update_self")
+        return _FakeResp({"status": "self_update_queued"}, 200)
+
+
+@pytest.fixture
+def stub_updater(monkeypatch: pytest.MonkeyPatch):
+    """Reset the stub's shared class-level state and swap it in for UpdaterClient."""
+    _StubUpdaterClient.instances.clear()
+    _StubUpdaterClient.behavior.clear()
+    # Patched on the defining module; presumably swarm_update looks the class up there at call time — TODO confirm.
+    monkeypatch.setattr("decnet.swarm.updater_client.UpdaterClient", _StubUpdaterClient)
+    return _StubUpdaterClient
+
+
+def _mk_source_tree(tmp_path: pathlib.Path) -> pathlib.Path:
+    root = tmp_path / "src"
+    root.mkdir()
+    (root / "decnet").mkdir()
+    (root / "decnet" / "a.py").write_text("x = 1")
+    return root
+
+
+# ------------------------------------------------------------- arg validation
+
+def test_update_requires_host_or_all(http_stub) -> None:
+    r = runner.invoke(app, ["swarm", "update"])
+    assert r.exit_code == 2
+
+
+def test_update_host_and_all_are_mutex(http_stub) -> None:
+    r = runner.invoke(app, ["swarm", "update", "--host", "w1", "--all"])
+    assert r.exit_code == 2
+
+
+def test_update_unknown_host_exits_1(http_stub) -> None:
+    http_stub["hosts"] = [{"uuid": "u1", "name": "other", "address": "10.0.0.1", "status": "active"}]
+    r = runner.invoke(app, ["swarm", "update", "--host", "nope"])
+    assert r.exit_code == 1
+    assert "No enrolled worker" in r.output
+
+
+# ---------------------------------------------------------------- happy paths
+
+def test_update_single_host(http_stub, stub_updater, tmp_path: pathlib.Path) -> None:
+    http_stub["hosts"] = [
+        {"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"},
+        {"uuid": "u2", "name": "w2", "address": "10.0.0.2", "status": "active"},
+    ]
+    root = _mk_source_tree(tmp_path)
+    r = runner.invoke(app, ["swarm", "update", "--host", "w1", "--root", str(root)])
+    assert r.exit_code == 0, r.output
+    assert "w1" in r.output
+    # Only w1 got a client; w2 is untouched.
+    names = [c.host["name"] for c in stub_updater.instances]
+    assert names == ["w1"]
+
+
+def test_update_all_skips_decommissioned(http_stub, stub_updater, tmp_path: pathlib.Path) -> None:
+    http_stub["hosts"] = [
+        {"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"},
+        {"uuid": "u2", "name": "w2", "address": "10.0.0.2", "status": "decommissioned"},
+        {"uuid": "u3", "name": "w3", "address": "10.0.0.3", "status": "enrolled"},
+    ]
+    root = _mk_source_tree(tmp_path)
+    r = runner.invoke(app, ["swarm", "update", "--all", "--root", str(root)])
+    assert r.exit_code == 0, r.output
+    hit = sorted(c.host["name"] for c in stub_updater.instances)
+    assert hit == ["w1", "w3"]
+
+
+def test_update_include_self_calls_both(
+    http_stub, stub_updater, tmp_path: pathlib.Path,
+) -> None:
+    http_stub["hosts"] = [{"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"}]
+    root = _mk_source_tree(tmp_path)
+    r = runner.invoke(app, ["swarm", "update", "--all", "--root", str(root), "--include-self"])
+    assert r.exit_code == 0, r.output  # show CLI output on failure, as the sibling tests do
+    assert stub_updater.instances[0].calls == ["update", "update_self"]
+
+
+# ------------------------------------------------------------- failure modes
+
+def test_update_rollback_status_409_flags_failure(
+    http_stub, stub_updater, tmp_path: pathlib.Path,
+) -> None:
+    http_stub["hosts"] = [{"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"}]
+    _StubUpdaterClient.behavior["w1"] = _FakeResp(
+        {"detail": {"error": "probe failed", "rolled_back": True}},
+        status=409,
+    )
+    root = _mk_source_tree(tmp_path)
+    r = runner.invoke(app, ["swarm", "update", "--all", "--root", str(root)])
+    assert r.exit_code == 1
+    assert "rolled-back" in r.output
+
+
+def test_update_include_self_skipped_when_agent_update_failed(
+    http_stub, stub_updater, tmp_path: pathlib.Path,
+) -> None:
+    http_stub["hosts"] = [{"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"}]
+    _StubUpdaterClient.behavior["w1"] = _FakeResp(
+        {"detail": {"error": "pip failed"}}, status=500,
+    )
+    root = _mk_source_tree(tmp_path)
+    r = runner.invoke(app, ["swarm", "update", "--all", "--root", str(root), "--include-self"])
+    assert r.exit_code == 1
+    # update_self must NOT have been called — agent update failed.
+    assert stub_updater.instances[0].calls == ["update"]
+
+
+# --------------------------------------------------------------------- dry run
+
+def test_update_dry_run_does_not_call_updater(
+    http_stub, stub_updater, tmp_path: pathlib.Path,
+) -> None:
+    http_stub["hosts"] = [{"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"}]
+    root = _mk_source_tree(tmp_path)
+    r = runner.invoke(app, ["swarm", "update", "--all", "--root", str(root), "--dry-run"])
+    assert r.exit_code == 0, r.output  # show CLI output on failure, as the sibling tests do
+    assert stub_updater.instances == []  # no client was ever constructed
+    assert "dry-run" in r.output.lower()
--- a/tests/swarm/test_swarm_api.py
+++ b/tests/swarm/test_swarm_api.py
@@ -78,6 +78,36 @@ def test_enroll_creates_host_and_returns_bundle(client: TestClient) -> None:
    assert len(body["fingerprint"]) == 64  # sha256 hex


+def test_enroll_with_updater_issues_second_cert(client: TestClient, ca_dir) -> None:
+    resp = client.post(
+        "/swarm/enroll",
+        json={"name": "worker-upd", "address": "10.0.0.99", "agent_port": 8765,
+              "issue_updater_bundle": True},
+    )
+    assert resp.status_code == 201, resp.text
+    body = resp.json()
+    assert body["updater"] is not None
+    assert body["updater"]["fingerprint"] != body["fingerprint"]
+    assert "-----BEGIN CERTIFICATE-----" in body["updater"]["updater_cert_pem"]
+    assert "-----BEGIN PRIVATE KEY-----" in body["updater"]["updater_key_pem"]
+    # Cert bundle persisted on master.
+    upd_bundle = ca_dir / "workers" / "worker-upd" / "updater"
+    assert (upd_bundle / "updater.crt").is_file()
+    assert (upd_bundle / "updater.key").is_file()
+    # DB row carries the updater fingerprint.
+    row = client.get(f"/swarm/hosts/{body['host_uuid']}").json()
+    assert row.get("updater_cert_fingerprint") == body["updater"]["fingerprint"]
+
+
+def test_enroll_without_updater_omits_bundle(client: TestClient) -> None:
+    resp = client.post(
+        "/swarm/enroll",
+        json={"name": "worker-no-upd", "address": "10.0.0.98", "agent_port": 8765},
+    )
+    assert resp.status_code == 201
+    assert resp.json()["updater"] is None
+
+
 def test_enroll_rejects_duplicate_name(client: TestClient) -> None:
    payload = {"name": "worker-dup", "address": "10.0.0.6", "agent_port": 8765}
    assert client.post("/swarm/enroll", json=payload).status_code == 201
--- a/tests/swarm/test_tar_tree.py
+++ b/tests/swarm/test_tar_tree.py
@@ -0,0 +1,75 @@
+"""tar_working_tree: exclude filter, tarball validity, git SHA detection."""
+from __future__ import annotations
+
+import io
+import pathlib
+import tarfile
+
+from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
+
+
+def _tree_names(data: bytes) -> set[str]:
+    with tarfile.open(mode="r:gz", fileobj=io.BytesIO(data)) as archive:
+        return set(archive.getnames())  # member names of the gzipped tarball
+
+
+def test_tar_excludes_default_patterns(tmp_path: pathlib.Path) -> None:
+    (tmp_path / "decnet").mkdir()
+    (tmp_path / "decnet" / "keep.py").write_text("x = 1")
+    (tmp_path / ".venv").mkdir()
+    (tmp_path / ".venv" / "pyvenv.cfg").write_text("junk")
+    (tmp_path / ".git").mkdir()
+    (tmp_path / ".git" / "HEAD").write_text("ref: refs/heads/main\n")
+    (tmp_path / "decnet" / "__pycache__").mkdir()
+    (tmp_path / "decnet" / "__pycache__" / "keep.cpython-311.pyc").write_text("bytecode")
+    (tmp_path / "wiki-checkout").mkdir()
+    (tmp_path / "wiki-checkout" / "Home.md").write_text("# wiki")
+    (tmp_path / "run.db").write_text("sqlite")
+    (tmp_path / "master.log").write_text("log")
+
+    data = tar_working_tree(tmp_path)
+    names = _tree_names(data)
+    assert "decnet/keep.py" in names
+    assert all(".venv" not in n for n in names)
+    assert all(".git" not in n for n in names)
+    assert all("__pycache__" not in n for n in names)
+    assert all("wiki-checkout" not in n for n in names)
+    assert "run.db" not in names
+    assert "master.log" not in names
+
+
+def test_tar_accepts_extra_excludes(tmp_path: pathlib.Path) -> None:
+    (tmp_path / "a.py").write_text("x")
+    (tmp_path / "secret.env").write_text("TOKEN=abc")
+    data = tar_working_tree(tmp_path, extra_excludes=["secret.env"])
+    names = _tree_names(data)
+    assert "a.py" in names
+    assert "secret.env" not in names
+
+
+def test_tar_skips_symlinks(tmp_path: pathlib.Path) -> None:
+    import pytest  # local import: this module does not import pytest at top level
+    (tmp_path / "real.txt").write_text("hi")
+    try:
+        (tmp_path / "link.txt").symlink_to(tmp_path / "real.txt")
+    except (OSError, NotImplementedError):
+        pytest.skip("platform doesn't support symlinks")  # report as skipped, not passed
+    names = _tree_names(tar_working_tree(tmp_path))
+    assert "real.txt" in names and "link.txt" not in names
+
+
+def test_detect_git_sha_from_ref(tmp_path: pathlib.Path) -> None:
+    (tmp_path / ".git" / "refs" / "heads").mkdir(parents=True)
+    (tmp_path / ".git" / "refs" / "heads" / "main").write_text("deadbeef" * 5 + "\n")
+    (tmp_path / ".git" / "HEAD").write_text("ref: refs/heads/main\n")
+    assert detect_git_sha(tmp_path).startswith("deadbeef")
+
+
+def test_detect_git_sha_detached(tmp_path: pathlib.Path) -> None:
+    (tmp_path / ".git").mkdir()
+    (tmp_path / ".git" / "HEAD").write_text("f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0\n")
+    assert detect_git_sha(tmp_path).startswith("f0f0")
+
+
+def test_detect_git_sha_none_when_not_repo(tmp_path: pathlib.Path) -> None:
+    assert detect_git_sha(tmp_path) == ""
--- a/tests/updater/__init__.py
+++ b/tests/updater/__init__.py
--- a/tests/updater/test_updater_app.py
+++ b/tests/updater/test_updater_app.py
@@ -0,0 +1,138 @@
+"""HTTP contract for the updater app.
+
+Executor functions are monkeypatched — we're testing wire format, not
+the rotation logic (that has test_updater_executor.py).
+"""
+from __future__ import annotations
+
+import io
+import pathlib
+import tarfile
+
+import pytest
+from fastapi.testclient import TestClient
+
+from decnet.updater import app as app_mod
+from decnet.updater import executor as ex
+
+
+def _tarball(files: dict[str, str] | None = None) -> bytes:
+    buf = io.BytesIO()
+    with tarfile.open(fileobj=buf, mode="w:gz") as tar:
+        for name, content in (files or {"a": "b"}).items():
+            data = content.encode()
+            info = tarfile.TarInfo(name=name)
+            info.size = len(data)
+            tar.addfile(info, io.BytesIO(data))
+    return buf.getvalue()
+
+
+@pytest.fixture
+def client(tmp_path: pathlib.Path) -> TestClient:
+    app_mod.configure(
+        install_dir=tmp_path / "install",
+        updater_install_dir=tmp_path / "install" / "updater",
+        agent_dir=tmp_path / "agent",
+    )
+    (tmp_path / "install" / "releases").mkdir(parents=True)
+    return TestClient(app_mod.app)
+
+
+def test_health_returns_role_and_releases(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(ex, "list_releases", lambda d: [])
+    r = client.get("/health")
+    assert r.status_code == 200
+    body = r.json()
+    assert body["status"] == "ok"
+    assert body["role"] == "updater"
+    assert body["releases"] == []
+
+
+def test_update_happy_path(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(
+        ex, "run_update",
+        lambda data, sha, install_dir, agent_dir: {"status": "updated", "release": {"slot": "active", "sha": sha}, "probe": "ok"},
+    )
+    r = client.post(
+        "/update",
+        files={"tarball": ("tree.tgz", _tarball(), "application/gzip")},
+        data={"sha": "ABC123"},
+    )
+    assert r.status_code == 200, r.text
+    assert r.json()["release"]["sha"] == "ABC123"
+
+
+def test_update_rollback_returns_409(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None:
+    def _boom(*a, **kw):
+        raise ex.UpdateError("probe failed; rolled back", stderr="connection refused", rolled_back=True)
+    monkeypatch.setattr(ex, "run_update", _boom)
+
+    r = client.post(
+        "/update",
+        files={"tarball": ("t.tgz", _tarball(), "application/gzip")},
+        data={"sha": ""},
+    )
+    assert r.status_code == 409, r.text
+    detail = r.json()["detail"]
+    assert detail["rolled_back"] is True
+    assert "connection refused" in detail["stderr"]
+
+
+def test_update_hard_failure_returns_500(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None:
+    def _boom(*a, **kw):
+        raise ex.UpdateError("pip install failed", stderr="resolver error")
+    monkeypatch.setattr(ex, "run_update", _boom)
+
+    r = client.post("/update", files={"tarball": ("t.tgz", _tarball(), "application/gzip")})
+    assert r.status_code == 500
+    assert r.json()["detail"]["rolled_back"] is False
+
+
+def test_update_self_requires_confirm(client: TestClient) -> None:
+    r = client.post("/update-self", files={"tarball": ("t.tgz", _tarball(), "application/gzip")})
+    assert r.status_code == 400
+    assert "confirm_self" in r.json()["detail"]
+
+
+def test_update_self_happy_path(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(
+        ex, "run_update_self",
+        lambda data, sha, updater_install_dir: {"status": "self_update_queued", "argv": ["python", "-m", "decnet", "updater"]},
+    )
+    r = client.post(
+        "/update-self",
+        files={"tarball": ("t.tgz", _tarball(), "application/gzip")},
+        data={"sha": "S", "confirm_self": "true"},
+    )
+    assert r.status_code == 200
+    assert r.json()["status"] == "self_update_queued"
+
+
+def test_rollback_happy(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(
+        ex, "run_rollback",
+        lambda install_dir, agent_dir: {"status": "rolled_back", "release": {"slot": "active", "sha": "O"}, "probe": "ok"},
+    )
+    r = client.post("/rollback")
+    assert r.status_code == 200
+    assert r.json()["status"] == "rolled_back"
+
+
+def test_rollback_missing_prev_returns_404(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None:
+    def _boom(*a, **kw):  # accept positional or keyword calls, like the other _boom stubs
+        raise ex.UpdateError("no previous release to roll back to")
+    monkeypatch.setattr(ex, "run_rollback", _boom)
+    r = client.post("/rollback")
+    assert r.status_code == 404, r.text
+
+
+def test_releases_lists_slots(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(
+        ex, "list_releases",
+        lambda d: [ex.Release(slot="active", sha="A", installed_at=None),
+                   ex.Release(slot="prev", sha="B", installed_at=None)],
+    )
+    r = client.get("/releases")
+    assert r.status_code == 200
+    slots = [rel["slot"] for rel in r.json()["releases"]]
+    assert slots == ["active", "prev"]
--- a/tests/updater/test_updater_executor.py
+++ b/tests/updater/test_updater_executor.py
@@ -0,0 +1,295 @@
+"""Updater executor: directory rotation, probe-driven rollback, safety checks.
+
+All four real seams (`_run_pip`, `_spawn_agent`, `_stop_agent`,
+`_probe_agent`) are monkeypatched so these tests never shell out or
+touch a real Python venv. The rotation/symlink/extract logic is exercised
+against a ``tmp_path`` install dir.
+"""
+from __future__ import annotations
+
+import io
+import pathlib
+import subprocess
+import tarfile
+from typing import Any
+
+import pytest
+
+from decnet.updater import executor as ex
+
+
+# ------------------------------------------------------------------ helpers
+
+def _make_tarball(files: dict[str, str]) -> bytes:
+    buf = io.BytesIO()
+    with tarfile.open(fileobj=buf, mode="w:gz") as tar:
+        for name, content in files.items():
+            data = content.encode()
+            info = tarfile.TarInfo(name=name)
+            info.size = len(data)
+            tar.addfile(info, io.BytesIO(data))
+    return buf.getvalue()
+
+
+class _PipOK:
+    returncode = 0
+    stdout = ""
+    stderr = ""
+
+
+class _PipFail:
+    returncode = 1
+    stdout = ""
+    stderr = "resolver error: Could not find a version that satisfies ..."
+
+
+@pytest.fixture
+def install_dir(tmp_path: pathlib.Path) -> pathlib.Path:
+    d = tmp_path / "decnet"
+    d.mkdir()
+    (d / "releases").mkdir()
+    return d
+
+
+@pytest.fixture
+def agent_dir(tmp_path: pathlib.Path) -> pathlib.Path:
+    d = tmp_path / "agent"
+    d.mkdir()
+    # executor._probe_agent checks these exist before constructing SSL ctx,
+    # but the probe seam is monkeypatched in every test so content doesn't
+    # matter — still create them so the non-stubbed path is representative.
+    (d / "ca.crt").write_bytes(b"-----BEGIN CERTIFICATE-----\nstub\n-----END CERTIFICATE-----\n")
+    (d / "worker.crt").write_bytes(b"-----BEGIN CERTIFICATE-----\nstub\n-----END CERTIFICATE-----\n")
+    (d / "worker.key").write_bytes(b"-----BEGIN PRIVATE KEY-----\nstub\n-----END PRIVATE KEY-----\n")
+    return d
+
+
+@pytest.fixture
+def seed_existing_release(install_dir: pathlib.Path) -> None:
+    """Pretend an install is already live: create releases/active with a marker."""
+    active = install_dir / "releases" / "active"
+    active.mkdir()
+    (active / "marker.txt").write_text("old")
+    ex._write_manifest(active, sha="OLDSHA")
+    # current -> active
+    ex._point_current_at(install_dir, active)
+
+
+# --------------------------------------------------------- extract + safety
+
+def test_extract_rejects_path_traversal(tmp_path: pathlib.Path) -> None:
+    evil = _make_tarball({"../escape.txt": "pwned"})
+    with pytest.raises(ex.UpdateError, match="unsafe path"):
+        ex.extract_tarball(evil, tmp_path / "out")
+
+
+def test_extract_rejects_absolute_paths(tmp_path: pathlib.Path) -> None:
+    evil = _make_tarball({"/etc/passwd": "root:x:0:0"})
+    with pytest.raises(ex.UpdateError, match="unsafe path"):
+        ex.extract_tarball(evil, tmp_path / "out")
+
+
+def test_extract_happy_path(tmp_path: pathlib.Path) -> None:
+    tb = _make_tarball({"a/b.txt": "hello"})
+    out = tmp_path / "out"
+    ex.extract_tarball(tb, out)
+    assert (out / "a" / "b.txt").read_text() == "hello"
+
+
+def test_clean_stale_staging(install_dir: pathlib.Path) -> None:
+    staging = install_dir / "releases" / "active.new"
+    staging.mkdir()
+    (staging / "junk").write_text("left from a crash")
+    ex.clean_stale_staging(install_dir)
+    assert not staging.exists()
+
+
+# ---------------------------------------------------------------- happy path
+
+def test_update_rotates_and_probes(
+    monkeypatch: pytest.MonkeyPatch,
+    install_dir: pathlib.Path,
+    agent_dir: pathlib.Path,
+    seed_existing_release: None,
+) -> None:
+    monkeypatch.setattr(ex, "_run_pip", lambda release: _PipOK())
+    monkeypatch.setattr(ex, "_stop_agent", lambda *a, **k: None)
+    monkeypatch.setattr(ex, "_spawn_agent", lambda *a, **k: 42)
+    monkeypatch.setattr(ex, "_probe_agent", lambda **_: (True, "ok"))
+
+    tb = _make_tarball({"marker.txt": "new"})
+    result = ex.run_update(tb, sha="NEWSHA", install_dir=install_dir, agent_dir=agent_dir)
+
+    assert result["status"] == "updated"
+    assert result["release"]["sha"] == "NEWSHA"
+    assert (install_dir / "releases" / "active" / "marker.txt").read_text() == "new"
+    # Old release demoted, not deleted.
+    assert (install_dir / "releases" / "prev" / "marker.txt").read_text() == "old"
+    # Current symlink points at the new active.
+    assert (install_dir / "current").resolve() == (install_dir / "releases" / "active").resolve()
+
+
+def test_update_first_install_without_previous(
+    monkeypatch: pytest.MonkeyPatch,
+    install_dir: pathlib.Path,
+    agent_dir: pathlib.Path,
+) -> None:
+    """No existing active/ dir — first real install via the updater."""
+    monkeypatch.setattr(ex, "_run_pip", lambda release: _PipOK())
+    monkeypatch.setattr(ex, "_stop_agent", lambda *a, **k: None)
+    monkeypatch.setattr(ex, "_spawn_agent", lambda *a, **k: 1)
+    monkeypatch.setattr(ex, "_probe_agent", lambda **_: (True, "ok"))
+
+    tb = _make_tarball({"marker.txt": "first"})
+    result = ex.run_update(tb, sha="S1", install_dir=install_dir, agent_dir=agent_dir)
+    assert result["status"] == "updated"
+    assert not (install_dir / "releases" / "prev").exists()
+
+
+# ------------------------------------------------------------ pip failure
+
+def test_update_pip_failure_aborts_before_rotation(
+    monkeypatch: pytest.MonkeyPatch,
+    install_dir: pathlib.Path,
+    agent_dir: pathlib.Path,
+    seed_existing_release: None,
+) -> None:
+    monkeypatch.setattr(ex, "_run_pip", lambda release: _PipFail())
+    stop_called: list[bool] = []
+    monkeypatch.setattr(ex, "_stop_agent", lambda *a, **k: stop_called.append(True))
+    monkeypatch.setattr(ex, "_spawn_agent", lambda *a, **k: 1)
+    monkeypatch.setattr(ex, "_probe_agent", lambda **_: (True, "ok"))
+
+    tb = _make_tarball({"marker.txt": "new"})
+    with pytest.raises(ex.UpdateError, match="pip install failed") as ei:
+        ex.run_update(tb, sha="S", install_dir=install_dir, agent_dir=agent_dir)
+    assert "resolver error" in ei.value.stderr
+
+    # Nothing rotated — old active still live, no prev created.
+    assert (install_dir / "releases" / "active" / "marker.txt").read_text() == "old"
+    assert not (install_dir / "releases" / "prev").exists()
+    # Agent never touched.
+    assert stop_called == []
+    # Staging cleaned up.
+    assert not (install_dir / "releases" / "active.new").exists()
+
+
+# ------------------------------------------------------------ probe failure
+
+def test_update_probe_failure_rolls_back(
+    monkeypatch: pytest.MonkeyPatch,
+    install_dir: pathlib.Path,
+    agent_dir: pathlib.Path,
+    seed_existing_release: None,
+) -> None:
+    monkeypatch.setattr(ex, "_run_pip", lambda release: _PipOK())
+    monkeypatch.setattr(ex, "_stop_agent", lambda *a, **k: None)
+    monkeypatch.setattr(ex, "_spawn_agent", lambda *a, **k: 1)
+
+    calls: list[int] = [0]
+
+    def _probe(**_: Any) -> tuple[bool, str]:
+        calls[0] += 1
+        if calls[0] == 1:
+            return False, "connection refused"
+        return True, "ok"  # rollback probe succeeds
+
+    monkeypatch.setattr(ex, "_probe_agent", _probe)
+
+    tb = _make_tarball({"marker.txt": "new"})
+    with pytest.raises(ex.UpdateError, match="health probe") as ei:
+        ex.run_update(tb, sha="NEWSHA", install_dir=install_dir, agent_dir=agent_dir)
+    assert ei.value.rolled_back is True
+    assert "connection refused" in ei.value.stderr
+
+    # Rolled back: active has the old marker again.
+    assert (install_dir / "releases" / "active" / "marker.txt").read_text() == "old"
+    # Prev now holds what would have been the new release.
+    assert (install_dir / "releases" / "prev" / "marker.txt").read_text() == "new"
+    # Current symlink points back at active.
+    assert (install_dir / "current").resolve() == (install_dir / "releases" / "active").resolve()
+
+
+# ------------------------------------------------------------ manual rollback
+
+def test_manual_rollback_swaps(
+    monkeypatch: pytest.MonkeyPatch,
+    install_dir: pathlib.Path,
+    agent_dir: pathlib.Path,
+    seed_existing_release: None,
+) -> None:
+    # Seed a prev/ so rollback has somewhere to go.
+    prev = install_dir / "releases" / "prev"
+    prev.mkdir()
+    (prev / "marker.txt").write_text("older")
+    ex._write_manifest(prev, sha="OLDERSHA")
+
+    monkeypatch.setattr(ex, "_stop_agent", lambda *a, **k: None)
+    monkeypatch.setattr(ex, "_spawn_agent", lambda *a, **k: 1)
+    monkeypatch.setattr(ex, "_probe_agent", lambda **_: (True, "ok"))
+
+    result = ex.run_rollback(install_dir=install_dir, agent_dir=agent_dir)
+    assert result["status"] == "rolled_back"
+    assert (install_dir / "releases" / "active" / "marker.txt").read_text() == "older"
+    assert (install_dir / "releases" / "prev" / "marker.txt").read_text() == "old"
+
+
+def test_manual_rollback_refuses_without_prev(
+    install_dir: pathlib.Path,
+    seed_existing_release: None,
+) -> None:
+    with pytest.raises(ex.UpdateError, match="no previous release"):
+        ex.run_rollback(install_dir=install_dir)
+
+
+# ---------------------------------------------------------------- releases
+
+def test_list_releases_includes_only_existing_slots(
+    install_dir: pathlib.Path,
+    seed_existing_release: None,
+) -> None:
+    rs = ex.list_releases(install_dir)
+    assert [r.slot for r in rs] == ["active"]
+    assert rs[0].sha == "OLDSHA"
+
+
+# ---------------------------------------------------------------- self-update
+
+def test_update_self_rotates_and_calls_exec_cb(
+    monkeypatch: pytest.MonkeyPatch,
+    install_dir: pathlib.Path,
+) -> None:
+    # Seed a stand-in "active" for the updater itself.
+    active = install_dir / "releases" / "active"
+    active.mkdir()
+    (active / "marker").write_text("old-updater")
+
+    monkeypatch.setattr(ex, "_run_pip", lambda release: _PipOK())
+    seen_argv: list[list[str]] = []
+
+    tb = _make_tarball({"marker": "new-updater"})
+    result = ex.run_update_self(
+        tb, sha="USHA", updater_install_dir=install_dir,
+        exec_cb=lambda argv: seen_argv.append(argv),
+    )
+    assert result["status"] == "self_update_queued"
+    assert (install_dir / "releases" / "active" / "marker").read_text() == "new-updater"
+    assert (install_dir / "releases" / "prev" / "marker").read_text() == "old-updater"
+    assert len(seen_argv) == 1
+    assert "updater" in seen_argv[0]
+
+
+def test_update_self_pip_failure_leaves_active_intact(
+    monkeypatch: pytest.MonkeyPatch,
+    install_dir: pathlib.Path,
+) -> None:
+    active = install_dir / "releases" / "active"
+    active.mkdir()
+    (active / "marker").write_text("old-updater")
+    monkeypatch.setattr(ex, "_run_pip", lambda release: _PipFail())
+
+    tb = _make_tarball({"marker": "new-updater"})
+    with pytest.raises(ex.UpdateError, match="pip install failed"):
+        ex.run_update_self(tb, sha="U", updater_install_dir=install_dir, exec_cb=lambda a: None)
+    assert (install_dir / "releases" / "active" / "marker").read_text() == "old-updater"
+    assert not (install_dir / "releases" / "active.new").exists()