Adds /api/v1/swarm-updates/{hosts,push,push-self,rollback} behind
require_admin. Reuses the existing UpdaterClient + tar_working_tree + the
per-host asyncio.gather pattern from api_deploy_swarm.py; tarball is
built exactly once per /push request and fanned out to every selected
worker. /hosts filters out decommissioned hosts and agent-only
enrollments (no updater bundle = not a target).
Connection drops during /update-self are treated as success — the
updater re-execs itself mid-response, so httpx always raises.
Pydantic models live in decnet/web/db/models.py (single source of
truth). 24 tests cover happy paths, rollback, transport failures,
include_self ordering (skip on rolled-back agents), validation, and
RBAC gating.
153 lines
5.5 KiB
Python
153 lines
5.5 KiB
Python
"""POST /swarm-updates/push — fan a tarball of the master's tree to workers.
|
|
|
|
Mirrors the ``decnet swarm update`` CLI flow: build the tarball once,
|
|
dispatch concurrently, collect per-host statuses. Returns HTTP 200 even
|
|
when individual hosts failed — the operator reads per-host ``status``.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import pathlib
|
|
from typing import Any
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException
|
|
|
|
from decnet.logging import get_logger
|
|
from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
|
|
from decnet.swarm.updater_client import UpdaterClient
|
|
from decnet.web.db.models import PushUpdateRequest, PushUpdateResponse, PushUpdateResult
|
|
from decnet.web.db.repository import BaseRepository
|
|
from decnet.web.dependencies import get_repo, require_admin
|
|
|
|
log = get_logger("swarm_updates.push")
|
|
|
|
router = APIRouter()
|
|
|
|
|
|
def _master_tree_root() -> pathlib.Path:
|
|
"""Resolve the master's install tree to tar.
|
|
|
|
Walks up from this file: ``decnet/web/router/swarm_updates/`` → 3 parents
|
|
lands on the repo root. Matches the layout shipped via ``pip install -e .``
|
|
and the dev checkout at ``~/Tools/DECNET``.
|
|
"""
|
|
return pathlib.Path(__file__).resolve().parents[4]
|
|
|
|
|
|
def _classify_update(status_code: int) -> str:
|
|
if status_code == 200:
|
|
return "updated"
|
|
if status_code == 409:
|
|
return "rolled-back"
|
|
return "failed"
|
|
|
|
|
|
async def _resolve_targets(
|
|
repo: BaseRepository,
|
|
req: PushUpdateRequest,
|
|
) -> list[dict[str, Any]]:
|
|
if req.all == bool(req.host_uuids):
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail="Specify exactly one of host_uuids or all=true.",
|
|
)
|
|
rows = await repo.list_swarm_hosts()
|
|
rows = [r for r in rows if r.get("updater_cert_fingerprint")]
|
|
if req.all:
|
|
targets = [r for r in rows if r.get("status") != "decommissioned"]
|
|
else:
|
|
wanted = set(req.host_uuids or [])
|
|
targets = [r for r in rows if r["uuid"] in wanted]
|
|
missing = wanted - {r["uuid"] for r in targets}
|
|
if missing:
|
|
raise HTTPException(
|
|
status_code=404,
|
|
detail=f"Unknown or updater-less host(s): {sorted(missing)}",
|
|
)
|
|
if not targets:
|
|
raise HTTPException(
|
|
status_code=404,
|
|
detail="No targets: no enrolled hosts have an updater bundle.",
|
|
)
|
|
return targets
|
|
|
|
|
|
async def _push_one(
|
|
host: dict[str, Any],
|
|
tarball: bytes,
|
|
sha: str,
|
|
include_self: bool,
|
|
) -> PushUpdateResult:
|
|
try:
|
|
async with UpdaterClient(host=host) as u:
|
|
r = await u.update(tarball, sha=sha)
|
|
body = r.json() if r.content else {}
|
|
status = _classify_update(r.status_code)
|
|
stderr = body.get("stderr") if isinstance(body, dict) else None
|
|
|
|
if include_self and r.status_code == 200:
|
|
# Agent first, updater second — a broken updater push must never
|
|
# strand the fleet on an old agent.
|
|
try:
|
|
rs = await u.update_self(tarball, sha=sha)
|
|
self_ok = rs.status_code in (200, 0) # 0 = connection dropped (expected)
|
|
except Exception as exc: # noqa: BLE001
|
|
# Connection drop on update-self is expected and not an error.
|
|
self_ok = _is_expected_connection_drop(exc)
|
|
if not self_ok:
|
|
return PushUpdateResult(
|
|
host_uuid=host["uuid"], host_name=host["name"],
|
|
status="self-failed", http_status=r.status_code, sha=sha,
|
|
detail=f"agent updated OK but self-update failed: {exc}",
|
|
stderr=stderr,
|
|
)
|
|
status = "self-updated" if self_ok else "self-failed"
|
|
|
|
return PushUpdateResult(
|
|
host_uuid=host["uuid"], host_name=host["name"],
|
|
status=status, http_status=r.status_code, sha=sha,
|
|
detail=body.get("error") or body.get("probe") if isinstance(body, dict) else None,
|
|
stderr=stderr,
|
|
)
|
|
except Exception as exc: # noqa: BLE001
|
|
log.exception("swarm_updates.push failed host=%s", host.get("name"))
|
|
return PushUpdateResult(
|
|
host_uuid=host["uuid"], host_name=host["name"],
|
|
status="failed",
|
|
detail=f"{type(exc).__name__}: {exc}",
|
|
)
|
|
|
|
|
|
def _is_expected_connection_drop(exc: BaseException) -> bool:
|
|
"""update-self re-execs the updater mid-response; httpx raises on the drop."""
|
|
import httpx
|
|
return isinstance(exc, (httpx.RemoteProtocolError, httpx.ReadError, httpx.ConnectError))
|
|
|
|
|
|
@router.post(
|
|
"/push",
|
|
response_model=PushUpdateResponse,
|
|
tags=["Swarm Updates"],
|
|
)
|
|
async def api_push_update(
|
|
req: PushUpdateRequest,
|
|
admin: dict = Depends(require_admin),
|
|
repo: BaseRepository = Depends(get_repo),
|
|
) -> PushUpdateResponse:
|
|
targets = await _resolve_targets(repo, req)
|
|
tree_root = _master_tree_root()
|
|
sha = detect_git_sha(tree_root)
|
|
tarball = tar_working_tree(tree_root, extra_excludes=req.exclude)
|
|
log.info(
|
|
"swarm_updates.push sha=%s tarball=%d hosts=%d include_self=%s",
|
|
sha or "(not a git repo)", len(tarball), len(targets), req.include_self,
|
|
)
|
|
results = await asyncio.gather(
|
|
*(_push_one(h, tarball, sha, req.include_self) for h in targets)
|
|
)
|
|
return PushUpdateResponse(
|
|
sha=sha,
|
|
tarball_bytes=len(tarball),
|
|
results=list(results),
|
|
)
|