feat(web): Remote Updates API — dashboard endpoints for pushing code to workers

Adds /api/v1/swarm-updates/{hosts,push,push-self,rollback} behind
require_admin. Reuses the existing UpdaterClient + tar_working_tree + the
per-host asyncio.gather pattern from api_deploy_swarm.py; tarball is
built exactly once per /push request and fanned out to every selected
worker. /hosts filters out decommissioned hosts and agent-only
enrollments (no updater bundle = not a target).

Connection drops during /update-self are treated as success — the
updater re-execs itself mid-response, so httpx always raises.

Pydantic models live in decnet/web/db/models.py (single source of
truth). 24 tests cover happy paths, rollback, transport failures,
include_self ordering (skip on rolled-back agents), validation, and
RBAC gating.
This commit is contained in:
2026-04-19 01:01:09 -04:00
parent f5a5fec607
commit a266d6b17e
13 changed files with 1041 additions and 0 deletions

View File

@@ -0,0 +1,23 @@
"""Remote Updates — master dashboard's surface for pushing code to workers.
These are *not* the swarm-controller's /swarm routes (those run on a
separate process, auth-free, internal-only). They live on the main web
API, go through ``require_admin``, and are the interface the React
dashboard calls to fan updates out to worker ``decnet updater`` daemons
via ``UpdaterClient``.
Mounted under ``/api/v1/swarm-updates`` by the main api router.
"""
from fastapi import APIRouter
from .api_list_host_releases import router as list_host_releases_router
from .api_push_update import router as push_update_router
from .api_push_update_self import router as push_update_self_router
from .api_rollback_host import router as rollback_host_router
swarm_updates_router = APIRouter(prefix="/swarm-updates")
swarm_updates_router.include_router(list_host_releases_router)
swarm_updates_router.include_router(push_update_router)
swarm_updates_router.include_router(push_update_self_router)
swarm_updates_router.include_router(rollback_host_router)

View File

@@ -0,0 +1,82 @@
"""GET /swarm-updates/hosts — per-host updater health + release slots.
Fans out an ``UpdaterClient.health()`` probe to every enrolled host that
has an updater bundle. Each probe is isolated: a single unreachable host
never fails the whole list (that's normal partial-failure behaviour for
a fleet view).
"""
from __future__ import annotations
import asyncio
from typing import Any
from fastapi import APIRouter, Depends
from decnet.logging import get_logger
from decnet.swarm.updater_client import UpdaterClient
from decnet.web.db.models import HostReleaseInfo, HostReleasesResponse
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm_updates.list")
router = APIRouter()
def _extract_shas(releases: list[dict[str, Any]]) -> tuple[str | None, str | None]:
"""Pick the (current, previous) SHA from the updater's releases list.
The updater reports releases as ``[{"slot": "active"|"prev", "sha": ...,
...}]`` in no guaranteed order, so pull by slot name rather than index.
"""
current = next((r.get("sha") for r in releases if r.get("slot") == "active"), None)
previous = next((r.get("sha") for r in releases if r.get("slot") == "prev"), None)
return current, previous
async def _probe_host(host: dict[str, Any]) -> HostReleaseInfo:
try:
async with UpdaterClient(host=host) as u:
body = await u.health()
except Exception as exc: # noqa: BLE001
return HostReleaseInfo(
host_uuid=host["uuid"],
host_name=host["name"],
address=host["address"],
reachable=False,
detail=f"{type(exc).__name__}: {exc}",
)
releases = body.get("releases") or []
current, previous = _extract_shas(releases)
return HostReleaseInfo(
host_uuid=host["uuid"],
host_name=host["name"],
address=host["address"],
reachable=True,
agent_status=body.get("agent_status") or body.get("status"),
current_sha=current,
previous_sha=previous,
releases=releases,
)
@router.get(
"/hosts",
response_model=HostReleasesResponse,
tags=["Swarm Updates"],
)
async def api_list_host_releases(
admin: dict = Depends(require_admin),
repo: BaseRepository = Depends(get_repo),
) -> HostReleasesResponse:
rows = await repo.list_swarm_hosts()
# Only hosts actually capable of receiving updates — decommissioned
# hosts and agent-only enrollments are filtered out.
targets = [
r for r in rows
if r.get("status") != "decommissioned" and r.get("updater_cert_fingerprint")
]
if not targets:
return HostReleasesResponse(hosts=[])
results = await asyncio.gather(*(_probe_host(h) for h in targets))
return HostReleasesResponse(hosts=list(results))

View File

@@ -0,0 +1,152 @@
"""POST /swarm-updates/push — fan a tarball of the master's tree to workers.
Mirrors the ``decnet swarm update`` CLI flow: build the tarball once,
dispatch concurrently, collect per-host statuses. Returns HTTP 200 even
when individual hosts failed — the operator reads per-host ``status``.
"""
from __future__ import annotations
import asyncio
import pathlib
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.logging import get_logger
from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
from decnet.swarm.updater_client import UpdaterClient
from decnet.web.db.models import PushUpdateRequest, PushUpdateResponse, PushUpdateResult
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm_updates.push")
router = APIRouter()
def _master_tree_root() -> pathlib.Path:
"""Resolve the master's install tree to tar.
Walks up from this file: ``decnet/web/router/swarm_updates/`` → 3 parents
lands on the repo root. Matches the layout shipped via ``pip install -e .``
and the dev checkout at ``~/Tools/DECNET``.
"""
return pathlib.Path(__file__).resolve().parents[4]
def _classify_update(status_code: int) -> str:
if status_code == 200:
return "updated"
if status_code == 409:
return "rolled-back"
return "failed"
async def _resolve_targets(
repo: BaseRepository,
req: PushUpdateRequest,
) -> list[dict[str, Any]]:
if req.all == bool(req.host_uuids):
raise HTTPException(
status_code=400,
detail="Specify exactly one of host_uuids or all=true.",
)
rows = await repo.list_swarm_hosts()
rows = [r for r in rows if r.get("updater_cert_fingerprint")]
if req.all:
targets = [r for r in rows if r.get("status") != "decommissioned"]
else:
wanted = set(req.host_uuids or [])
targets = [r for r in rows if r["uuid"] in wanted]
missing = wanted - {r["uuid"] for r in targets}
if missing:
raise HTTPException(
status_code=404,
detail=f"Unknown or updater-less host(s): {sorted(missing)}",
)
if not targets:
raise HTTPException(
status_code=404,
detail="No targets: no enrolled hosts have an updater bundle.",
)
return targets
async def _push_one(
host: dict[str, Any],
tarball: bytes,
sha: str,
include_self: bool,
) -> PushUpdateResult:
try:
async with UpdaterClient(host=host) as u:
r = await u.update(tarball, sha=sha)
body = r.json() if r.content else {}
status = _classify_update(r.status_code)
stderr = body.get("stderr") if isinstance(body, dict) else None
if include_self and r.status_code == 200:
# Agent first, updater second — a broken updater push must never
# strand the fleet on an old agent.
try:
rs = await u.update_self(tarball, sha=sha)
self_ok = rs.status_code in (200, 0) # 0 = connection dropped (expected)
except Exception as exc: # noqa: BLE001
# Connection drop on update-self is expected and not an error.
self_ok = _is_expected_connection_drop(exc)
if not self_ok:
return PushUpdateResult(
host_uuid=host["uuid"], host_name=host["name"],
status="self-failed", http_status=r.status_code, sha=sha,
detail=f"agent updated OK but self-update failed: {exc}",
stderr=stderr,
)
status = "self-updated" if self_ok else "self-failed"
return PushUpdateResult(
host_uuid=host["uuid"], host_name=host["name"],
status=status, http_status=r.status_code, sha=sha,
detail=body.get("error") or body.get("probe") if isinstance(body, dict) else None,
stderr=stderr,
)
except Exception as exc: # noqa: BLE001
log.exception("swarm_updates.push failed host=%s", host.get("name"))
return PushUpdateResult(
host_uuid=host["uuid"], host_name=host["name"],
status="failed",
detail=f"{type(exc).__name__}: {exc}",
)
def _is_expected_connection_drop(exc: BaseException) -> bool:
"""update-self re-execs the updater mid-response; httpx raises on the drop."""
import httpx
return isinstance(exc, (httpx.RemoteProtocolError, httpx.ReadError, httpx.ConnectError))
@router.post(
"/push",
response_model=PushUpdateResponse,
tags=["Swarm Updates"],
)
async def api_push_update(
req: PushUpdateRequest,
admin: dict = Depends(require_admin),
repo: BaseRepository = Depends(get_repo),
) -> PushUpdateResponse:
targets = await _resolve_targets(repo, req)
tree_root = _master_tree_root()
sha = detect_git_sha(tree_root)
tarball = tar_working_tree(tree_root, extra_excludes=req.exclude)
log.info(
"swarm_updates.push sha=%s tarball=%d hosts=%d include_self=%s",
sha or "(not a git repo)", len(tarball), len(targets), req.include_self,
)
results = await asyncio.gather(
*(_push_one(h, tarball, sha, req.include_self) for h in targets)
)
return PushUpdateResponse(
sha=sha,
tarball_bytes=len(tarball),
results=list(results),
)

View File

@@ -0,0 +1,92 @@
"""POST /swarm-updates/push-self — push only to workers' /update-self.
Use case: the agent is fine but the updater itself needs an upgrade (e.g.
a fix to ``executor.py``). Uploading only ``/update-self`` avoids a
redundant agent restart on healthy workers.
No auto-rollback: the updater re-execs itself on success, so a broken
push leaves the worker on the old code — verified by polling ``/health``
after the request returns.
"""
from __future__ import annotations
import asyncio
from typing import Any
from fastapi import APIRouter, Depends
from decnet.logging import get_logger
from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
from decnet.swarm.updater_client import UpdaterClient
from decnet.web.db.models import PushUpdateRequest, PushUpdateResponse, PushUpdateResult
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
from .api_push_update import _is_expected_connection_drop, _master_tree_root, _resolve_targets
log = get_logger("swarm_updates.push_self")
router = APIRouter()
async def _push_self_one(host: dict[str, Any], tarball: bytes, sha: str) -> PushUpdateResult:
try:
async with UpdaterClient(host=host) as u:
try:
r = await u.update_self(tarball, sha=sha)
http_status = r.status_code
body = r.json() if r.content else {}
ok = http_status == 200
detail = (body.get("error") or body.get("probe")) if isinstance(body, dict) else None
stderr = body.get("stderr") if isinstance(body, dict) else None
except Exception as exc: # noqa: BLE001
# Connection drops during self-update are expected — the updater
# re-execs itself mid-response.
if _is_expected_connection_drop(exc):
return PushUpdateResult(
host_uuid=host["uuid"], host_name=host["name"],
status="self-updated", sha=sha,
detail="updater re-exec dropped connection (expected)",
)
raise
return PushUpdateResult(
host_uuid=host["uuid"], host_name=host["name"],
status="self-updated" if ok else "self-failed",
http_status=http_status, sha=sha,
detail=detail, stderr=stderr,
)
except Exception as exc: # noqa: BLE001
log.exception("swarm_updates.push_self failed host=%s", host.get("name"))
return PushUpdateResult(
host_uuid=host["uuid"], host_name=host["name"],
status="self-failed",
detail=f"{type(exc).__name__}: {exc}",
)
@router.post(
"/push-self",
response_model=PushUpdateResponse,
tags=["Swarm Updates"],
)
async def api_push_update_self(
req: PushUpdateRequest,
admin: dict = Depends(require_admin),
repo: BaseRepository = Depends(get_repo),
) -> PushUpdateResponse:
targets = await _resolve_targets(repo, req)
tree_root = _master_tree_root()
sha = detect_git_sha(tree_root)
tarball = tar_working_tree(tree_root, extra_excludes=req.exclude)
log.info(
"swarm_updates.push_self sha=%s tarball=%d hosts=%d",
sha or "(not a git repo)", len(tarball), len(targets),
)
results = await asyncio.gather(
*(_push_self_one(h, tarball, sha) for h in targets)
)
return PushUpdateResponse(
sha=sha,
tarball_bytes=len(tarball),
results=list(results),
)

View File

@@ -0,0 +1,70 @@
"""POST /swarm-updates/rollback — manual rollback on a single host.
Calls the worker updater's ``/rollback`` which swaps the ``current``
symlink back to ``releases/prev``. Fails with 404 if the target has no
previous release slot.
"""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException
from decnet.logging import get_logger
from decnet.swarm.updater_client import UpdaterClient
from decnet.web.db.models import RollbackRequest, RollbackResponse
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm_updates.rollback")
router = APIRouter()
@router.post(
"/rollback",
response_model=RollbackResponse,
tags=["Swarm Updates"],
)
async def api_rollback_host(
req: RollbackRequest,
admin: dict = Depends(require_admin),
repo: BaseRepository = Depends(get_repo),
) -> RollbackResponse:
host = await repo.get_swarm_host_by_uuid(req.host_uuid)
if host is None:
raise HTTPException(status_code=404, detail=f"Unknown host: {req.host_uuid}")
if not host.get("updater_cert_fingerprint"):
raise HTTPException(
status_code=400,
detail=f"Host '{host['name']}' has no updater bundle — nothing to roll back.",
)
try:
async with UpdaterClient(host=host) as u:
r = await u.rollback()
except Exception as exc: # noqa: BLE001
log.exception("swarm_updates.rollback transport failure host=%s", host["name"])
return RollbackResponse(
host_uuid=host["uuid"], host_name=host["name"],
status="failed",
detail=f"{type(exc).__name__}: {exc}",
)
body = r.json() if r.content else {}
if r.status_code == 404:
# No previous release — surface as 404 so the UI can render the
# "nothing to roll back" state distinctly from a transport error.
raise HTTPException(
status_code=404,
detail=body.get("detail") if isinstance(body, dict) else "No previous release on worker.",
)
if r.status_code != 200:
return RollbackResponse(
host_uuid=host["uuid"], host_name=host["name"],
status="failed", http_status=r.status_code,
detail=(body.get("error") or body.get("detail")) if isinstance(body, dict) else None,
)
return RollbackResponse(
host_uuid=host["uuid"], host_name=host["name"],
status="rolled-back", http_status=r.status_code,
detail=body.get("status") if isinstance(body, dict) else None,
)