feat(web): Remote Updates API — dashboard endpoints for pushing code to workers
Adds /api/v1/swarm-updates/{hosts,push,push-self,rollback} behind
require_admin. Reuses the existing UpdaterClient + tar_working_tree + the
per-host asyncio.gather pattern from api_deploy_swarm.py; tarball is
built exactly once per /push request and fanned out to every selected
worker. /hosts filters out decommissioned hosts and agent-only
enrollments (no updater bundle = not a target).
Connection drops during /update-self are treated as success — the
updater re-execs itself mid-response, so httpx always raises.
Pydantic models live in decnet/web/db/models.py (single source of
truth). 24 tests cover happy paths, rollback, transport failures,
include_self ordering (skip on rolled-back agents), validation, and
RBAC gating.
This commit is contained in:
@@ -373,3 +373,72 @@ class SwarmHostHealth(BaseModel):
|
||||
|
||||
class SwarmCheckResponse(BaseModel):
|
||||
results: list[SwarmHostHealth]
|
||||
|
||||
|
||||
# --- Remote Updates (master → worker /updater) DTOs ---
|
||||
# Powers the dashboard's Remote Updates page. The master dashboard calls
|
||||
# these (auth-gated) endpoints; internally they fan out to each worker's
|
||||
# updater daemon over mTLS via UpdaterClient.
|
||||
|
||||
class HostReleaseInfo(BaseModel):
|
||||
host_uuid: str
|
||||
host_name: str
|
||||
address: str
|
||||
reachable: bool
|
||||
# These fields mirror the updater's /health payload when reachable; they
|
||||
# are all Optional so an unreachable host still serializes cleanly.
|
||||
agent_status: Optional[str] = None
|
||||
current_sha: Optional[str] = None
|
||||
previous_sha: Optional[str] = None
|
||||
releases: list[dict[str, Any]] = PydanticField(default_factory=list)
|
||||
detail: Optional[str] = None # populated when unreachable
|
||||
|
||||
|
||||
class HostReleasesResponse(BaseModel):
|
||||
hosts: list[HostReleaseInfo]
|
||||
|
||||
|
||||
class PushUpdateRequest(BaseModel):
|
||||
host_uuids: Optional[list[str]] = PydanticField(
|
||||
default=None,
|
||||
description="Target specific hosts; mutually exclusive with 'all'.",
|
||||
)
|
||||
all: bool = PydanticField(default=False, description="Target every non-decommissioned host with an updater bundle.")
|
||||
include_self: bool = PydanticField(
|
||||
default=False,
|
||||
description="After a successful /update, also push /update-self to upgrade the updater itself.",
|
||||
)
|
||||
exclude: list[str] = PydanticField(
|
||||
default_factory=list,
|
||||
description="Additional tarball exclude globs (on top of the built-in defaults).",
|
||||
)
|
||||
|
||||
|
||||
class PushUpdateResult(BaseModel):
|
||||
host_uuid: str
|
||||
host_name: str
|
||||
# updated = /update 200. rolled-back = /update 409 (auto-recovered).
|
||||
# failed = transport error or non-200/409 response. self-updated = /update-self succeeded.
|
||||
status: Literal["updated", "rolled-back", "failed", "self-updated", "self-failed"]
|
||||
http_status: Optional[int] = None
|
||||
sha: Optional[str] = None
|
||||
detail: Optional[str] = None
|
||||
stderr: Optional[str] = None
|
||||
|
||||
|
||||
class PushUpdateResponse(BaseModel):
|
||||
sha: str
|
||||
tarball_bytes: int
|
||||
results: list[PushUpdateResult]
|
||||
|
||||
|
||||
class RollbackRequest(BaseModel):
|
||||
host_uuid: str = PydanticField(..., description="Host to roll back to its previous release slot.")
|
||||
|
||||
|
||||
class RollbackResponse(BaseModel):
|
||||
host_uuid: str
|
||||
host_name: str
|
||||
status: Literal["rolled-back", "failed"]
|
||||
http_status: Optional[int] = None
|
||||
detail: Optional[str] = None
|
||||
|
||||
@@ -21,6 +21,7 @@ from .config.api_manage_users import router as config_users_router
|
||||
from .config.api_reinit import router as config_reinit_router
|
||||
from .health.api_get_health import router as health_router
|
||||
from .artifacts.api_get_artifact import router as artifacts_router
|
||||
from .swarm_updates import swarm_updates_router
|
||||
|
||||
api_router = APIRouter()
|
||||
|
||||
@@ -60,3 +61,6 @@ api_router.include_router(config_reinit_router)
|
||||
|
||||
# Artifacts (captured attacker file drops)
|
||||
api_router.include_router(artifacts_router)
|
||||
|
||||
# Remote Updates (dashboard → worker updater daemons)
|
||||
api_router.include_router(swarm_updates_router)
|
||||
|
||||
23
decnet/web/router/swarm_updates/__init__.py
Normal file
23
decnet/web/router/swarm_updates/__init__.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""Remote Updates — master dashboard's surface for pushing code to workers.
|
||||
|
||||
These are *not* the swarm-controller's /swarm routes (those run on a
|
||||
separate process, auth-free, internal-only). They live on the main web
|
||||
API, go through ``require_admin``, and are the interface the React
|
||||
dashboard calls to fan updates out to worker ``decnet updater`` daemons
|
||||
via ``UpdaterClient``.
|
||||
|
||||
Mounted under ``/api/v1/swarm-updates`` by the main api router.
|
||||
"""
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .api_list_host_releases import router as list_host_releases_router
|
||||
from .api_push_update import router as push_update_router
|
||||
from .api_push_update_self import router as push_update_self_router
|
||||
from .api_rollback_host import router as rollback_host_router
|
||||
|
||||
swarm_updates_router = APIRouter(prefix="/swarm-updates")
|
||||
|
||||
swarm_updates_router.include_router(list_host_releases_router)
|
||||
swarm_updates_router.include_router(push_update_router)
|
||||
swarm_updates_router.include_router(push_update_self_router)
|
||||
swarm_updates_router.include_router(rollback_host_router)
|
||||
82
decnet/web/router/swarm_updates/api_list_host_releases.py
Normal file
82
decnet/web/router/swarm_updates/api_list_host_releases.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""GET /swarm-updates/hosts — per-host updater health + release slots.
|
||||
|
||||
Fans out an ``UpdaterClient.health()`` probe to every enrolled host that
|
||||
has an updater bundle. Each probe is isolated: a single unreachable host
|
||||
never fails the whole list (that's normal partial-failure behaviour for
|
||||
a fleet view).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.updater_client import UpdaterClient
|
||||
from decnet.web.db.models import HostReleaseInfo, HostReleasesResponse
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo, require_admin
|
||||
|
||||
log = get_logger("swarm_updates.list")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _extract_shas(releases: list[dict[str, Any]]) -> tuple[str | None, str | None]:
|
||||
"""Pick the (current, previous) SHA from the updater's releases list.
|
||||
|
||||
The updater reports releases as ``[{"slot": "active"|"prev", "sha": ...,
|
||||
...}]`` in no guaranteed order, so pull by slot name rather than index.
|
||||
"""
|
||||
current = next((r.get("sha") for r in releases if r.get("slot") == "active"), None)
|
||||
previous = next((r.get("sha") for r in releases if r.get("slot") == "prev"), None)
|
||||
return current, previous
|
||||
|
||||
|
||||
async def _probe_host(host: dict[str, Any]) -> HostReleaseInfo:
|
||||
try:
|
||||
async with UpdaterClient(host=host) as u:
|
||||
body = await u.health()
|
||||
except Exception as exc: # noqa: BLE001
|
||||
return HostReleaseInfo(
|
||||
host_uuid=host["uuid"],
|
||||
host_name=host["name"],
|
||||
address=host["address"],
|
||||
reachable=False,
|
||||
detail=f"{type(exc).__name__}: {exc}",
|
||||
)
|
||||
releases = body.get("releases") or []
|
||||
current, previous = _extract_shas(releases)
|
||||
return HostReleaseInfo(
|
||||
host_uuid=host["uuid"],
|
||||
host_name=host["name"],
|
||||
address=host["address"],
|
||||
reachable=True,
|
||||
agent_status=body.get("agent_status") or body.get("status"),
|
||||
current_sha=current,
|
||||
previous_sha=previous,
|
||||
releases=releases,
|
||||
)
|
||||
|
||||
|
||||
@router.get(
|
||||
"/hosts",
|
||||
response_model=HostReleasesResponse,
|
||||
tags=["Swarm Updates"],
|
||||
)
|
||||
async def api_list_host_releases(
|
||||
admin: dict = Depends(require_admin),
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
) -> HostReleasesResponse:
|
||||
rows = await repo.list_swarm_hosts()
|
||||
# Only hosts actually capable of receiving updates — decommissioned
|
||||
# hosts and agent-only enrollments are filtered out.
|
||||
targets = [
|
||||
r for r in rows
|
||||
if r.get("status") != "decommissioned" and r.get("updater_cert_fingerprint")
|
||||
]
|
||||
if not targets:
|
||||
return HostReleasesResponse(hosts=[])
|
||||
results = await asyncio.gather(*(_probe_host(h) for h in targets))
|
||||
return HostReleasesResponse(hosts=list(results))
|
||||
152
decnet/web/router/swarm_updates/api_push_update.py
Normal file
152
decnet/web/router/swarm_updates/api_push_update.py
Normal file
@@ -0,0 +1,152 @@
|
||||
"""POST /swarm-updates/push — fan a tarball of the master's tree to workers.
|
||||
|
||||
Mirrors the ``decnet swarm update`` CLI flow: build the tarball once,
|
||||
dispatch concurrently, collect per-host statuses. Returns HTTP 200 even
|
||||
when individual hosts failed — the operator reads per-host ``status``.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import pathlib
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
|
||||
from decnet.swarm.updater_client import UpdaterClient
|
||||
from decnet.web.db.models import PushUpdateRequest, PushUpdateResponse, PushUpdateResult
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo, require_admin
|
||||
|
||||
log = get_logger("swarm_updates.push")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _master_tree_root() -> pathlib.Path:
|
||||
"""Resolve the master's install tree to tar.
|
||||
|
||||
Walks up from this file: ``decnet/web/router/swarm_updates/`` → 3 parents
|
||||
lands on the repo root. Matches the layout shipped via ``pip install -e .``
|
||||
and the dev checkout at ``~/Tools/DECNET``.
|
||||
"""
|
||||
return pathlib.Path(__file__).resolve().parents[4]
|
||||
|
||||
|
||||
def _classify_update(status_code: int) -> str:
|
||||
if status_code == 200:
|
||||
return "updated"
|
||||
if status_code == 409:
|
||||
return "rolled-back"
|
||||
return "failed"
|
||||
|
||||
|
||||
async def _resolve_targets(
|
||||
repo: BaseRepository,
|
||||
req: PushUpdateRequest,
|
||||
) -> list[dict[str, Any]]:
|
||||
if req.all == bool(req.host_uuids):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Specify exactly one of host_uuids or all=true.",
|
||||
)
|
||||
rows = await repo.list_swarm_hosts()
|
||||
rows = [r for r in rows if r.get("updater_cert_fingerprint")]
|
||||
if req.all:
|
||||
targets = [r for r in rows if r.get("status") != "decommissioned"]
|
||||
else:
|
||||
wanted = set(req.host_uuids or [])
|
||||
targets = [r for r in rows if r["uuid"] in wanted]
|
||||
missing = wanted - {r["uuid"] for r in targets}
|
||||
if missing:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail=f"Unknown or updater-less host(s): {sorted(missing)}",
|
||||
)
|
||||
if not targets:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail="No targets: no enrolled hosts have an updater bundle.",
|
||||
)
|
||||
return targets
|
||||
|
||||
|
||||
async def _push_one(
|
||||
host: dict[str, Any],
|
||||
tarball: bytes,
|
||||
sha: str,
|
||||
include_self: bool,
|
||||
) -> PushUpdateResult:
|
||||
try:
|
||||
async with UpdaterClient(host=host) as u:
|
||||
r = await u.update(tarball, sha=sha)
|
||||
body = r.json() if r.content else {}
|
||||
status = _classify_update(r.status_code)
|
||||
stderr = body.get("stderr") if isinstance(body, dict) else None
|
||||
|
||||
if include_self and r.status_code == 200:
|
||||
# Agent first, updater second — a broken updater push must never
|
||||
# strand the fleet on an old agent.
|
||||
try:
|
||||
rs = await u.update_self(tarball, sha=sha)
|
||||
self_ok = rs.status_code in (200, 0) # 0 = connection dropped (expected)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
# Connection drop on update-self is expected and not an error.
|
||||
self_ok = _is_expected_connection_drop(exc)
|
||||
if not self_ok:
|
||||
return PushUpdateResult(
|
||||
host_uuid=host["uuid"], host_name=host["name"],
|
||||
status="self-failed", http_status=r.status_code, sha=sha,
|
||||
detail=f"agent updated OK but self-update failed: {exc}",
|
||||
stderr=stderr,
|
||||
)
|
||||
status = "self-updated" if self_ok else "self-failed"
|
||||
|
||||
return PushUpdateResult(
|
||||
host_uuid=host["uuid"], host_name=host["name"],
|
||||
status=status, http_status=r.status_code, sha=sha,
|
||||
detail=body.get("error") or body.get("probe") if isinstance(body, dict) else None,
|
||||
stderr=stderr,
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.exception("swarm_updates.push failed host=%s", host.get("name"))
|
||||
return PushUpdateResult(
|
||||
host_uuid=host["uuid"], host_name=host["name"],
|
||||
status="failed",
|
||||
detail=f"{type(exc).__name__}: {exc}",
|
||||
)
|
||||
|
||||
|
||||
def _is_expected_connection_drop(exc: BaseException) -> bool:
|
||||
"""update-self re-execs the updater mid-response; httpx raises on the drop."""
|
||||
import httpx
|
||||
return isinstance(exc, (httpx.RemoteProtocolError, httpx.ReadError, httpx.ConnectError))
|
||||
|
||||
|
||||
@router.post(
|
||||
"/push",
|
||||
response_model=PushUpdateResponse,
|
||||
tags=["Swarm Updates"],
|
||||
)
|
||||
async def api_push_update(
|
||||
req: PushUpdateRequest,
|
||||
admin: dict = Depends(require_admin),
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
) -> PushUpdateResponse:
|
||||
targets = await _resolve_targets(repo, req)
|
||||
tree_root = _master_tree_root()
|
||||
sha = detect_git_sha(tree_root)
|
||||
tarball = tar_working_tree(tree_root, extra_excludes=req.exclude)
|
||||
log.info(
|
||||
"swarm_updates.push sha=%s tarball=%d hosts=%d include_self=%s",
|
||||
sha or "(not a git repo)", len(tarball), len(targets), req.include_self,
|
||||
)
|
||||
results = await asyncio.gather(
|
||||
*(_push_one(h, tarball, sha, req.include_self) for h in targets)
|
||||
)
|
||||
return PushUpdateResponse(
|
||||
sha=sha,
|
||||
tarball_bytes=len(tarball),
|
||||
results=list(results),
|
||||
)
|
||||
92
decnet/web/router/swarm_updates/api_push_update_self.py
Normal file
92
decnet/web/router/swarm_updates/api_push_update_self.py
Normal file
@@ -0,0 +1,92 @@
|
||||
"""POST /swarm-updates/push-self — push only to workers' /update-self.
|
||||
|
||||
Use case: the agent is fine but the updater itself needs an upgrade (e.g.
|
||||
a fix to ``executor.py``). Uploading only ``/update-self`` avoids a
|
||||
redundant agent restart on healthy workers.
|
||||
|
||||
No auto-rollback: the updater re-execs itself on success, so a broken
|
||||
push leaves the worker on the old code — verified by polling ``/health``
|
||||
after the request returns.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
|
||||
from decnet.swarm.updater_client import UpdaterClient
|
||||
from decnet.web.db.models import PushUpdateRequest, PushUpdateResponse, PushUpdateResult
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo, require_admin
|
||||
|
||||
from .api_push_update import _is_expected_connection_drop, _master_tree_root, _resolve_targets
|
||||
|
||||
log = get_logger("swarm_updates.push_self")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
async def _push_self_one(host: dict[str, Any], tarball: bytes, sha: str) -> PushUpdateResult:
|
||||
try:
|
||||
async with UpdaterClient(host=host) as u:
|
||||
try:
|
||||
r = await u.update_self(tarball, sha=sha)
|
||||
http_status = r.status_code
|
||||
body = r.json() if r.content else {}
|
||||
ok = http_status == 200
|
||||
detail = (body.get("error") or body.get("probe")) if isinstance(body, dict) else None
|
||||
stderr = body.get("stderr") if isinstance(body, dict) else None
|
||||
except Exception as exc: # noqa: BLE001
|
||||
# Connection drops during self-update are expected — the updater
|
||||
# re-execs itself mid-response.
|
||||
if _is_expected_connection_drop(exc):
|
||||
return PushUpdateResult(
|
||||
host_uuid=host["uuid"], host_name=host["name"],
|
||||
status="self-updated", sha=sha,
|
||||
detail="updater re-exec dropped connection (expected)",
|
||||
)
|
||||
raise
|
||||
return PushUpdateResult(
|
||||
host_uuid=host["uuid"], host_name=host["name"],
|
||||
status="self-updated" if ok else "self-failed",
|
||||
http_status=http_status, sha=sha,
|
||||
detail=detail, stderr=stderr,
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.exception("swarm_updates.push_self failed host=%s", host.get("name"))
|
||||
return PushUpdateResult(
|
||||
host_uuid=host["uuid"], host_name=host["name"],
|
||||
status="self-failed",
|
||||
detail=f"{type(exc).__name__}: {exc}",
|
||||
)
|
||||
|
||||
|
||||
@router.post(
|
||||
"/push-self",
|
||||
response_model=PushUpdateResponse,
|
||||
tags=["Swarm Updates"],
|
||||
)
|
||||
async def api_push_update_self(
|
||||
req: PushUpdateRequest,
|
||||
admin: dict = Depends(require_admin),
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
) -> PushUpdateResponse:
|
||||
targets = await _resolve_targets(repo, req)
|
||||
tree_root = _master_tree_root()
|
||||
sha = detect_git_sha(tree_root)
|
||||
tarball = tar_working_tree(tree_root, extra_excludes=req.exclude)
|
||||
log.info(
|
||||
"swarm_updates.push_self sha=%s tarball=%d hosts=%d",
|
||||
sha or "(not a git repo)", len(tarball), len(targets),
|
||||
)
|
||||
results = await asyncio.gather(
|
||||
*(_push_self_one(h, tarball, sha) for h in targets)
|
||||
)
|
||||
return PushUpdateResponse(
|
||||
sha=sha,
|
||||
tarball_bytes=len(tarball),
|
||||
results=list(results),
|
||||
)
|
||||
70
decnet/web/router/swarm_updates/api_rollback_host.py
Normal file
70
decnet/web/router/swarm_updates/api_rollback_host.py
Normal file
@@ -0,0 +1,70 @@
|
||||
"""POST /swarm-updates/rollback — manual rollback on a single host.
|
||||
|
||||
Calls the worker updater's ``/rollback`` which swaps the ``current``
|
||||
symlink back to ``releases/prev``. Fails with 404 if the target has no
|
||||
previous release slot.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.updater_client import UpdaterClient
|
||||
from decnet.web.db.models import RollbackRequest, RollbackResponse
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo, require_admin
|
||||
|
||||
log = get_logger("swarm_updates.rollback")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
|
||||
"/rollback",
|
||||
response_model=RollbackResponse,
|
||||
tags=["Swarm Updates"],
|
||||
)
|
||||
async def api_rollback_host(
|
||||
req: RollbackRequest,
|
||||
admin: dict = Depends(require_admin),
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
) -> RollbackResponse:
|
||||
host = await repo.get_swarm_host_by_uuid(req.host_uuid)
|
||||
if host is None:
|
||||
raise HTTPException(status_code=404, detail=f"Unknown host: {req.host_uuid}")
|
||||
if not host.get("updater_cert_fingerprint"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Host '{host['name']}' has no updater bundle — nothing to roll back.",
|
||||
)
|
||||
|
||||
try:
|
||||
async with UpdaterClient(host=host) as u:
|
||||
r = await u.rollback()
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.exception("swarm_updates.rollback transport failure host=%s", host["name"])
|
||||
return RollbackResponse(
|
||||
host_uuid=host["uuid"], host_name=host["name"],
|
||||
status="failed",
|
||||
detail=f"{type(exc).__name__}: {exc}",
|
||||
)
|
||||
|
||||
body = r.json() if r.content else {}
|
||||
if r.status_code == 404:
|
||||
# No previous release — surface as 404 so the UI can render the
|
||||
# "nothing to roll back" state distinctly from a transport error.
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail=body.get("detail") if isinstance(body, dict) else "No previous release on worker.",
|
||||
)
|
||||
if r.status_code != 200:
|
||||
return RollbackResponse(
|
||||
host_uuid=host["uuid"], host_name=host["name"],
|
||||
status="failed", http_status=r.status_code,
|
||||
detail=(body.get("error") or body.get("detail")) if isinstance(body, dict) else None,
|
||||
)
|
||||
return RollbackResponse(
|
||||
host_uuid=host["uuid"], host_name=host["name"],
|
||||
status="rolled-back", http_status=r.status_code,
|
||||
detail=body.get("status") if isinstance(body, dict) else None,
|
||||
)
|
||||
Reference in New Issue
Block a user