merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

@@ -0,0 +1,47 @@
"""Swarm controller routers.
One file per endpoint, aggregated under the ``/swarm`` prefix. Mounted
onto the swarm-api FastAPI app (``decnet/web/swarm_api.py``), a separate
process from the main DECNET API so swarm failures cannot cascade into
log ingestion / dashboard serving.
"""
from fastapi import APIRouter
from .api_enroll_host import router as enroll_host_router
from .api_list_hosts import router as list_hosts_router
from .api_get_host import router as get_host_router
from .api_decommission_host import router as decommission_host_router
from .api_deploy_swarm import router as deploy_swarm_router
from .api_teardown_swarm import router as teardown_swarm_router
from .api_get_swarm_health import router as get_swarm_health_router
from .api_check_hosts import router as check_hosts_router
from .api_heartbeat import router as heartbeat_router
from .api_list_deckies import router as list_deckies_router
# Parent router for the swarm controller: every per-endpoint router imported
# above is mounted beneath the ``/swarm`` prefix.
swarm_router = APIRouter(
    prefix="/swarm",
    # Baseline error responses documented for all swarm routes. A route can
    # still pass its own ``responses=`` to add to or replace these entries
    # (for example the 409 declared by /enroll).
    responses={
        400: {"description": "Malformed request"},
        403: {"description": "Peer cert missing or fingerprint mismatch"},
        404: {"description": "Referenced host does not exist"},
    },
)

# Registration order is preserved: hosts, then deployments, then health.
for _child_router in (
    # Hosts
    enroll_host_router,
    list_hosts_router,
    get_host_router,
    decommission_host_router,
    # Deployments
    deploy_swarm_router,
    teardown_swarm_router,
    list_deckies_router,
    # Health
    get_swarm_health_router,
    check_hosts_router,
    heartbeat_router,
):
    swarm_router.include_router(_child_router)
del _child_router  # keep the loop variable out of the package namespace

View File

@@ -0,0 +1,61 @@
"""POST /swarm/check — active mTLS probe of every enrolled worker.
Updates ``SwarmHost.status`` and ``last_heartbeat`` for each host based
on the outcome of the probe.
"""
from __future__ import annotations
import asyncio
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, Depends
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import SwarmCheckResponse, SwarmHostHealth
log = get_logger("swarm.check")
router = APIRouter()
@router.post("/check", response_model=SwarmCheckResponse, tags=["Swarm Health"])
async def api_check_hosts(
    repo: BaseRepository = Depends(get_repo),
) -> SwarmCheckResponse:
    """Probe every enrolled worker concurrently and persist the outcome.

    For each host returned by ``repo.list_swarm_hosts()`` an ``AgentClient``
    is opened and ``health()`` is awaited. On success the host row is set to
    ``active`` with a fresh ``last_heartbeat``; on any exception it is set to
    ``unreachable`` and the exception text is returned as the detail.

    Returns:
        A ``SwarmCheckResponse`` holding one ``SwarmHostHealth`` per host.
    """

    async def _probe_one(worker: dict[str, Any]) -> SwarmHostHealth:
        """Probe a single worker and translate the result into a health row."""
        try:
            async with AgentClient(host=worker) as client:
                payload = await client.health()
            success_update = {
                "status": "active",
                "last_heartbeat": datetime.now(timezone.utc),
            }
            await repo.update_swarm_host(worker["uuid"], success_update)
            return SwarmHostHealth(
                host_uuid=worker["uuid"],
                name=worker["name"],
                address=worker["address"],
                reachable=True,
                detail=payload,
            )
        except Exception as err:
            # Any failure in the probe path is reported as "unreachable"
            # rather than aborting the whole sweep.
            log.warning("swarm.check unreachable host=%s err=%s", worker["name"], err)
            await repo.update_swarm_host(worker["uuid"], {"status": "unreachable"})
            return SwarmHostHealth(
                host_uuid=worker["uuid"],
                name=worker["name"],
                address=worker["address"],
                reachable=False,
                detail=str(err),
            )

    workers = await repo.list_swarm_hosts()
    probes = [_probe_one(worker) for worker in workers]
    outcomes = await asyncio.gather(*probes)
    return SwarmCheckResponse(results=list(outcomes))

View File

@@ -0,0 +1,63 @@
"""DELETE /swarm/hosts/{uuid} — decommission a worker.
Removes the DeckyShard rows bound to the host (portable cascade — MySQL
and SQLite both honor it via the repo layer), deletes the SwarmHost row,
and best-effort-cleans the per-worker bundle directory on the master.
Also asks the worker agent to wipe its own install (keeping logs). A
dead/unreachable worker does not block master-side cleanup.
"""
from __future__ import annotations
import pathlib
from fastapi import APIRouter, Depends, HTTPException, status
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
log = get_logger("swarm.decommission")
router = APIRouter()
@router.delete(
    "/hosts/{uuid}",
    status_code=status.HTTP_204_NO_CONTENT,
    tags=["Swarm Hosts"],
    responses={404: {"description": "No host with this UUID is enrolled"}},
)
async def api_decommission_host(
    uuid: str,
    repo: BaseRepository = Depends(get_repo),
) -> None:
    """Decommission the worker identified by *uuid*.

    Steps, in order:

    1. Ask the worker agent to self-destruct. Any failure is logged and
       ignored so an unreachable worker never blocks master-side cleanup.
    2. Delete the host's decky shard rows, then the host row itself.
    3. Best-effort removal of the on-disk bundle directory recorded in
       ``cert_bundle_path``.

    Raises:
        HTTPException: 404 when no host with this UUID is enrolled.
    """
    # Local import: only this cleanup path needs shutil.
    import shutil

    row = await repo.get_swarm_host_by_uuid(uuid)
    if row is None:
        raise HTTPException(status_code=404, detail="host not found")

    try:
        async with AgentClient(host=row) as agent:
            await agent.self_destruct()
    except Exception:
        log.exception(
            "decommission: self-destruct dispatch failed host=%s; "
            "proceeding with master-side cleanup anyway",
            row.get("name"),
        )

    await repo.delete_decky_shards_for_host(uuid)
    await repo.delete_swarm_host(uuid)

    # Best-effort bundle cleanup; if the dir was moved manually, don't fail.
    bundle_path = row.get("cert_bundle_path")
    if not bundle_path:
        # An empty path must never be turned into Path(""), which is the
        # current working directory; there is simply nothing to clean up.
        return
    bundle_dir = pathlib.Path(bundle_path)
    if bundle_dir.is_dir():
        # Recursive removal so nested material (e.g. the ``updater/``
        # subdirectory holding the updater key) does not survive.
        shutil.rmtree(bundle_dir, ignore_errors=True)

View File

@@ -0,0 +1,155 @@
"""POST /swarm/deploy — shard a DecnetConfig across enrolled workers.
Per worker we build a filtered copy containing only the deckies assigned
to that worker (via ``host_uuid``), then POST it to the worker agent.
The caller is expected to have already set ``host_uuid`` on every decky;
if any decky arrives without one, we fail fast. Auto-sharding lives in
the CLI layer, not here.
"""
from __future__ import annotations
import asyncio
import json
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.config import DecnetConfig, DeckyConfig
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import (
SwarmDeployRequest,
SwarmDeployResponse,
SwarmHostResult,
)
log = get_logger("swarm.deploy")
router = APIRouter()
def _shard_by_host(config: DecnetConfig) -> dict[str, list[DeckyConfig]]:
    """Group ``config.deckies`` by their ``host_uuid``.

    Returns:
        A mapping of host UUID to the deckies assigned to that host, in the
        order they appear in the config.

    Raises:
        HTTPException: 400 as soon as a decky without a ``host_uuid`` is met.
    """
    by_host: dict[str, list[DeckyConfig]] = {}
    for decky in config.deckies:
        if decky.host_uuid:
            by_host.setdefault(decky.host_uuid, []).append(decky)
            continue
        # Unassigned decky: sharding is the caller's job, so refuse outright.
        raise HTTPException(
            status_code=400,
            detail=f"decky '{decky.name}' has no host_uuid — caller must shard before dispatch",
        )
    return by_host
def _worker_config(
    base: DecnetConfig,
    shard: list[DeckyConfig],
    host: dict[str, Any],
) -> DecnetConfig:
    """Build the config copy that is sent to one worker.

    The copy carries only ``shard`` as its deckies. When the host row has a
    truthy ``use_ipvlan`` flag, ``ipvlan`` is forced on; otherwise whatever
    ``base`` already specifies is left untouched, so a deploy-level ipvlan
    choice is never downgraded.
    """
    overrides: dict[str, Any] = {"deckies": shard}
    force_ipvlan = host.get("use_ipvlan")
    if force_ipvlan:
        # Per-host opt-in (see SwarmHost.use_ipvlan).
        overrides.update(ipvlan=True)
    return base.model_copy(update=overrides)
async def dispatch_decnet_config(
    config: DecnetConfig,
    repo: BaseRepository,
    dry_run: bool = False,
    no_cache: bool = False,
) -> SwarmDeployResponse:
    """Split ``config`` by ``host_uuid`` and push each slice to its worker.

    Every referenced host is resolved first; the per-host dispatches then run
    concurrently. Shared between POST /swarm/deploy (explicit swarm call) and
    the auto-swarm branch of POST /deckies/deploy.

    Returns:
        A ``SwarmDeployResponse`` with one ``SwarmHostResult`` per host.

    Raises:
        HTTPException: 400 if a decky has no ``host_uuid`` (raised by
            ``_shard_by_host``); 404 if a ``host_uuid`` is not enrolled.
    """

    def _shard_row(decky: DeckyConfig, owner: str, state: str, error: Any) -> dict[str, Any]:
        """Assemble the DeckyShard upsert payload for one decky."""
        return {
            "decky_name": decky.name,
            "host_uuid": owner,
            "services": json.dumps(decky.services),
            "decky_config": decky.model_dump_json(),
            "decky_ip": decky.ip,
            "state": state,
            "last_error": error,
            "updated_at": datetime.now(timezone.utc),
        }

    buckets = _shard_by_host(config)

    # Resolve every target up front so an unknown host fails the request
    # before anything is dispatched.
    hosts: dict[str, dict[str, Any]] = {}
    for host_uuid in buckets:
        found = await repo.get_swarm_host_by_uuid(host_uuid)
        if found is None:
            raise HTTPException(status_code=404, detail=f"unknown host_uuid: {host_uuid}")
        hosts[host_uuid] = found

    async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> SwarmHostResult:
        """Deploy one shard to its worker and record per-decky state."""
        host = hosts[host_uuid]
        cfg = _worker_config(config, shard, host)
        try:
            async with AgentClient(host=host) as agent:
                body = await agent.deploy(cfg, dry_run=dry_run, no_cache=no_cache)
            ok_state = "pending" if dry_run else "running"
            for decky in shard:
                await repo.upsert_decky_shard(_shard_row(decky, host_uuid, ok_state, None))
            await repo.update_swarm_host(host_uuid, {"status": "active"})
            return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
        except Exception as exc:
            log.exception("swarm.deploy dispatch failed host=%s", host["name"])
            # Compose-up tolerates partial success: a decky that failed to
            # build does not undo the ones already running. Query the agent
            # for what actually exists before marking the whole shard failed,
            # so live deckies are not painted red.
            runtime: dict[str, Any] = {}
            try:
                async with AgentClient(host=host) as probe:
                    snapshot = await probe.status()
                runtime = snapshot.get("runtime") or {}
            except Exception:
                log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", host["name"])
            for decky in shard:
                observed = runtime.get(decky.name) or {}
                alive = bool(observed.get("running"))
                await repo.upsert_decky_shard(
                    _shard_row(
                        decky,
                        host_uuid,
                        "running" if alive else "failed",
                        None if alive else str(exc)[:512],
                    )
                )
            return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=False, detail=str(exc))

    results = await asyncio.gather(
        *(_dispatch(uuid_, shard) for uuid_, shard in buckets.items())
    )
    return SwarmDeployResponse(results=list(results))
@router.post(
    "/deploy",
    response_model=SwarmDeployResponse,
    tags=["Swarm Deployments"],
    responses={
        400: {"description": "Deployment mode must be 'swarm'"},
        404: {"description": "A referenced host_uuid is not enrolled"},
    },
)
async def api_deploy_swarm(
    req: SwarmDeployRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmDeployResponse:
    """Validate the deployment mode, then hand off to the shared dispatcher.

    Raises:
        HTTPException: 400 when ``req.config.mode`` is anything but ``"swarm"``.
    """
    swarm_config = req.config
    if swarm_config.mode != "swarm":
        raise HTTPException(status_code=400, detail="mode must be 'swarm'")
    response = await dispatch_decnet_config(
        swarm_config,
        repo,
        dry_run=req.dry_run,
        no_cache=req.no_cache,
    )
    return response

View File

@@ -0,0 +1,100 @@
"""POST /swarm/enroll — issue a worker cert bundle and register the host.
Enrollment is master-driven: the controller holds the CA private key,
generates a fresh worker keypair + CA-signed cert, and returns the full
bundle to the operator. Bundle delivery to the worker (scp/sshpass/etc.)
is outside this process's trust boundary.
Rationale: the worker agent speaks ONLY mTLS; there is no pre-auth
bootstrap endpoint, so nothing to attack before the worker is enrolled.
"""
from __future__ import annotations
import uuid as _uuid
from datetime import datetime, timezone
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, status
from decnet.swarm import pki
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import SwarmEnrolledBundle, SwarmEnrollRequest, SwarmUpdaterBundle
router = APIRouter()
@router.post(
    "/enroll",
    response_model=SwarmEnrolledBundle,
    status_code=status.HTTP_201_CREATED,
    tags=["Swarm Hosts"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        409: {"description": "A worker with this name is already enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def api_enroll_host(
    req: SwarmEnrollRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmEnrolledBundle:
    """Issue a CA-signed worker bundle and register the host.

    The worker cert is issued for the requested SANs plus the host's address
    and name, written to the per-worker bundle directory, and the host row is
    inserted with status ``enrolled``. When ``req.issue_updater_bundle`` is
    set, a second cert with CN ``updater@<name>`` is issued and stored in an
    ``updater`` subdirectory of the bundle directory.

    Returns:
        The full bundle (CA cert, worker cert and key, optional updater
        bundle) for the operator to deliver to the worker.

    Raises:
        HTTPException: 409 when a worker with this name is already enrolled.
    """
    existing = await repo.get_swarm_host_by_name(req.name)
    if existing is not None:
        raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled")

    ca = pki.ensure_ca()
    # Set-based de-duplication: address and name are always included as SANs.
    sans = list({*req.sans, req.address, req.name})
    issued = pki.issue_worker_cert(ca, req.name, sans)

    # Persist the bundle under ~/.decnet/ca/workers/<name>/ so the master
    # can replay it if the operator loses the original delivery.
    bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
    pki.write_worker_bundle(issued, bundle_dir)

    updater_view: Optional[SwarmUpdaterBundle] = None
    updater_fp: Optional[str] = None
    if req.issue_updater_bundle:
        import os as _os

        updater_cn = f"updater@{req.name}"
        updater_sans = list({*sans, updater_cn, "127.0.0.1"})
        updater_issued = pki.issue_worker_cert(ca, updater_cn, updater_sans)

        # Persist alongside the worker bundle for replay.
        updater_dir = bundle_dir / "updater"
        updater_dir.mkdir(parents=True, exist_ok=True)
        (updater_dir / "updater.crt").write_bytes(updater_issued.cert_pem)

        # Restrict the key file to 0600 *before* any key material is written:
        # create (or truncate) it empty, tighten the mode, then write the key.
        # Writing first and chmod-ing afterwards would leave the private key
        # readable under the default umask for a short window.
        key_path = updater_dir / "updater.key"
        key_path.write_bytes(b"")
        _os.chmod(key_path, 0o600)
        key_path.write_bytes(updater_issued.key_pem)

        updater_fp = updater_issued.fingerprint_sha256
        updater_view = SwarmUpdaterBundle(
            fingerprint=updater_fp,
            updater_cert_pem=updater_issued.cert_pem.decode(),
            updater_key_pem=updater_issued.key_pem.decode(),
        )

    host_uuid = str(_uuid.uuid4())
    await repo.add_swarm_host(
        {
            "uuid": host_uuid,
            "name": req.name,
            "address": req.address,
            "agent_port": req.agent_port,
            "status": "enrolled",
            "client_cert_fingerprint": issued.fingerprint_sha256,
            "updater_cert_fingerprint": updater_fp,
            "cert_bundle_path": str(bundle_dir),
            "enrolled_at": datetime.now(timezone.utc),
            "notes": req.notes,
        }
    )
    return SwarmEnrolledBundle(
        host_uuid=host_uuid,
        name=req.name,
        address=req.address,
        agent_port=req.agent_port,
        fingerprint=issued.fingerprint_sha256,
        ca_cert_pem=issued.ca_cert_pem.decode(),
        worker_cert_pem=issued.cert_pem.decode(),
        worker_key_pem=issued.key_pem.decode(),
        updater=updater_view,
    )

View File

@@ -0,0 +1,26 @@
"""GET /swarm/hosts/{uuid} — fetch a single worker by UUID."""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import SwarmHostView
router = APIRouter()
@router.get(
    "/hosts/{uuid}",
    response_model=SwarmHostView,
    tags=["Swarm Hosts"],
    responses={404: {"description": "No host with this UUID is enrolled"}},
)
async def api_get_host(
    uuid: str,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmHostView:
    """Look up one enrolled worker by UUID.

    Raises:
        HTTPException: 404 when the repository has no host with this UUID.
    """
    host_row = await repo.get_swarm_host_by_uuid(uuid)
    if host_row is not None:
        return SwarmHostView(**host_row)
    raise HTTPException(status_code=404, detail="host not found")

View File

@@ -0,0 +1,11 @@
"""GET /swarm/health — controller liveness (no I/O)."""
from __future__ import annotations
from fastapi import APIRouter
router = APIRouter()
@router.get("/health", tags=["Swarm Health"])
async def api_get_swarm_health() -> dict[str, str]:
    """Report controller liveness with a constant payload; performs no I/O."""
    liveness = dict(status="ok", role="swarm-controller")
    return liveness

View File

@@ -0,0 +1,212 @@
"""POST /swarm/heartbeat — agent→master liveness + decky snapshot refresh.
Workers call this every ~30 s with the output of ``executor.status()``.
The master bumps ``SwarmHost.last_heartbeat`` and re-upserts each
``DeckyShard`` with the fresh ``DeckyConfig`` snapshot + runtime-derived
state so the dashboard stays current without a master-pull probe.
Security: CA-signed mTLS is necessary but not sufficient — a
decommissioned worker's still-valid cert must not resurrect ghost
shards. We pin the presented peer cert's SHA-256 to the
``client_cert_fingerprint`` stored for the claimed ``host_uuid``.
Fingerprint mismatch (or no peer cert) → 403. A ``host_uuid`` with no
enrolled row (e.g. after decommission deleted it) → 404.
"""
from __future__ import annotations
import hashlib
import json
from datetime import datetime, timezone
from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, Request
from pydantic import BaseModel
from decnet.config import DeckyConfig
from decnet.logging import get_logger
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
log = get_logger("swarm.heartbeat")
router = APIRouter()
class HeartbeatRequest(BaseModel):
    """Request body accepted by POST /swarm/heartbeat."""

    # UUID the caller claims to be; the handler checks it against the pinned
    # client-cert fingerprint before trusting anything else in the body.
    host_uuid: str
    # Optional; not read by the heartbeat handler in this module.
    agent_version: Optional[str] = None
    # Status snapshot; the handler reads its "deployed", "runtime" and
    # "deckies" keys.
    status: dict[str, Any]
    # Optional topology report; the reconcile step reads "topology_id" and
    # "applied_version_hash" from it.
    topology: Optional[dict[str, Any]] = None
def _extract_peer_fingerprint(scope: dict[str, Any]) -> Optional[str]:
    """Return the SHA-256 fingerprint of the TLS peer cert found in *scope*.

    Two extraction paths are tried, in order:

    1. Primary: ``scope["extensions"]["tls"]["client_cert_chain"][0]``.
       The entry is accepted either as DER bytes or as a PEM string (the
       form the ASGI TLS extension specifies); PEM is converted to DER so
       both encodings of the same cert yield the same fingerprint.
    2. Fallback: ``scope["transport"].get_extra_info("ssl_object")`` followed
       by ``getpeercert(binary_form=True)``.

    Returns:
        The lowercase hex SHA-256 of the DER-encoded cert, or ``None`` when
        neither path yields usable certificate bytes. Callers are expected to
        fail closed on ``None``.
    """
    # Local import: only needed for the PEM -> DER conversion below.
    import ssl

    peer_der: Any = None
    source = "none"
    try:
        chain = scope.get("extensions", {}).get("tls", {}).get("client_cert_chain")
        if chain:
            first = chain[0]
            if isinstance(first, str):
                # hashlib rejects str; hashing the PEM text would also never
                # match a fingerprint computed over the DER form.
                first = ssl.PEM_cert_to_DER_cert(first)
            peer_der = first
            source = "primary"
    except Exception:
        # Malformed extension data (or an invalid PEM): try the fallback.
        peer_der = None
    if peer_der is None:
        transport = scope.get("transport")
        try:
            ssl_obj = transport.get_extra_info("ssl_object") if transport else None
            if ssl_obj is not None:
                peer_der = ssl_obj.getpeercert(binary_form=True)
                if peer_der:
                    source = "fallback"
        except Exception:
            peer_der = None
    if not peer_der:
        log.debug("heartbeat: peer cert extraction failed via none")
        return None
    try:
        # hexdigest() is already lowercase.
        fingerprint = hashlib.sha256(peer_der).hexdigest()
    except TypeError:
        # Not bytes-like: fail closed instead of surfacing a 500.
        log.debug("heartbeat: peer cert from %s is not bytes-like", source)
        return None
    log.debug("heartbeat: peer cert extraction succeeded via %s", source)
    return fingerprint
async def _verify_peer_matches_host(
    request: Request, host_uuid: str, repo: BaseRepository
) -> dict[str, Any]:
    """Check that the caller's TLS cert is the one pinned for *host_uuid*.

    Returns:
        The host row for *host_uuid* when the presented fingerprint equals
        the stored ``client_cert_fingerprint``.

    Raises:
        HTTPException: 404 if the host is unknown; 403 if no peer cert could
            be extracted, no fingerprint is stored, or the two differ.
    """
    host_row = await repo.get_swarm_host_by_uuid(host_uuid)
    if host_row is None:
        raise HTTPException(status_code=404, detail="unknown host")

    presented = _extract_peer_fingerprint(request.scope)
    if presented is None:
        raise HTTPException(status_code=403, detail="peer cert unavailable")

    # An empty or missing stored fingerprint never matches.
    pinned = (host_row.get("client_cert_fingerprint") or "").lower()
    if pinned and presented == pinned:
        return host_row
    raise HTTPException(status_code=403, detail="cert fingerprint mismatch")
async def _reconcile_topology_report(
    repo: BaseRepository,
    host_uuid: str,
    reported: Optional[dict[str, Any]],
) -> None:
    """Flag ACTIVE topologies pinned to *host_uuid* that the agent is out of
    sync with.

    A topology gets ``needs_resync=True`` when the agent reports a different
    ``applied_version_hash`` for it, or reports no topology / another
    topology. Topologies already flagged are skipped, as are ones that cannot
    be hydrated. Only the flag is set here; the re-push itself is left to the
    mutator reconcile loop so this endpoint stays cheap. Repository and
    hydration errors are logged, never raised.
    """
    from decnet.topology.hashing import canonical_hash
    from decnet.topology.persistence import hydrate
    from decnet.topology.status import TopologyStatus

    try:
        active = await repo.list_topologies(status=TopologyStatus.ACTIVE)
    except Exception:
        log.exception("heartbeat: could not list active topologies")
        return

    pinned_here = [t for t in active if t.get("target_host_uuid") == host_uuid]
    if not pinned_here:
        return

    report = reported or {}
    reported_id = report.get("topology_id")
    reported_hash = report.get("applied_version_hash")

    for topo in pinned_here:
        tid = topo["id"]
        if topo.get("needs_resync"):
            # Already queued for a re-push; nothing to add.
            continue

        expected: Optional[str] = None
        if reported_id == tid and reported_hash:
            # The agent claims this topology: compare against master's hash.
            try:
                hydrated = await hydrate(repo, tid)
            except Exception:
                log.exception("heartbeat: hydrate failed tid=%s", tid)
                continue
            if hydrated is None:
                continue
            expected = canonical_hash(hydrated)
            if expected == reported_hash:
                continue

        # Reaching here means a hash mismatch, or the agent reported no
        # topology or a different one.
        try:
            await repo.set_topology_resync(tid, True)
            log.info(
                "heartbeat: flagged topology %s for resync (host=%s "
                "reported_id=%s reported_hash=%s expected=%s)",
                tid, host_uuid, reported_id, reported_hash, expected,
            )
        except Exception:
            log.exception("heartbeat: failed to flag resync tid=%s", tid)
@router.post(
    "/heartbeat",
    status_code=204,
    tags=["Swarm Health"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        403: {"description": "Peer cert missing, or its fingerprint does not match the host's pinned cert"},
        404: {"description": "host_uuid is not enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def heartbeat(
    req: HeartbeatRequest,
    request: Request,
    repo: BaseRepository = Depends(get_repo),
) -> None:
    """Record a worker heartbeat and refresh its decky shard rows.

    After the peer cert is verified against the claimed host, the host is
    marked ``active`` with a fresh ``last_heartbeat`` and the topology report
    is reconciled. If the status body reports ``deployed``, every decky in it
    is upserted as ``running`` or ``degraded`` according to the reported
    runtime; decky payloads that fail to parse are logged and skipped.

    Raises:
        HTTPException: 404 / 403 from the peer verification step.
    """
    await _verify_peer_matches_host(request, req.host_uuid, repo)

    now = datetime.now(timezone.utc)
    await repo.update_swarm_host(
        req.host_uuid,
        dict(status="active", last_heartbeat=now),
    )

    await _reconcile_topology_report(repo, req.host_uuid, req.topology)

    snapshot = req.status or {}
    if snapshot.get("deployed"):
        runtime = snapshot.get("runtime") or {}
        reported_deckies = snapshot.get("deckies") or []
        for raw_decky in reported_deckies:
            try:
                decky = DeckyConfig(**raw_decky)
            except Exception:
                log.exception("heartbeat: skipping malformed decky payload host=%s", req.host_uuid)
                continue
            observed = runtime.get(decky.name) or {}
            shard_state = "running" if bool(observed.get("running")) else "degraded"
            await repo.upsert_decky_shard(
                {
                    "decky_name": decky.name,
                    "host_uuid": req.host_uuid,
                    "services": json.dumps(decky.services),
                    "decky_config": decky.model_dump_json(),
                    "decky_ip": decky.ip,
                    "state": shard_state,
                    "last_error": None,
                    "last_seen": now,
                    "updated_at": now,
                }
            )

View File

@@ -0,0 +1,55 @@
"""GET /swarm/deckies — list decky shards with their worker host's identity.
The DeckyShard table maps decky_name → host_uuid; users want to see which
deckies are running and *where*, so we enrich each shard with the owning
host's name/address/status from SwarmHost rather than making callers do
the join themselves.
"""
from __future__ import annotations
from typing import Optional
from fastapi import APIRouter, Depends
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import DeckyShardView
router = APIRouter()
@router.get("/deckies", response_model=list[DeckyShardView], tags=["Swarm Deckies"])
async def api_list_deckies(
    host_uuid: Optional[str] = None,
    state: Optional[str] = None,
    repo: BaseRepository = Depends(get_repo),
) -> list[DeckyShardView]:
    """List decky shards, each enriched with its owning host's identity.

    Args:
        host_uuid: Passed through to ``repo.list_decky_shards``.
        state: When given, only shards whose ``state`` equals it are kept.

    Returns:
        One ``DeckyShardView`` per kept shard. Shards whose host is not in
        the host table get placeholder host fields.
    """
    shard_rows = await repo.list_decky_shards(host_uuid)

    host_by_uuid = {}
    for host_row in await repo.list_swarm_hosts():
        host_by_uuid[host_row["uuid"]] = host_row

    def _as_view(shard) -> DeckyShardView:
        """Join one shard row with its host row (if any)."""
        owner = host_by_uuid.get(shard["host_uuid"], {})
        return DeckyShardView(
            decky_name=shard["decky_name"],
            decky_ip=shard.get("decky_ip"),
            host_uuid=shard["host_uuid"],
            host_name=owner.get("name") or "<unknown>",
            host_address=owner.get("address") or "",
            host_status=owner.get("status") or "unknown",
            services=shard.get("services") or [],
            state=shard.get("state") or "pending",
            last_error=shard.get("last_error"),
            compose_hash=shard.get("compose_hash"),
            updated_at=shard["updated_at"],
            hostname=shard.get("hostname"),
            distro=shard.get("distro"),
            archetype=shard.get("archetype"),
            service_config=shard.get("service_config") or {},
            mutate_interval=shard.get("mutate_interval"),
            last_mutated=shard.get("last_mutated") or 0.0,
            last_seen=shard.get("last_seen"),
        )

    return [
        _as_view(shard)
        for shard in shard_rows
        if not state or shard.get("state") == state
    ]

View File

@@ -0,0 +1,21 @@
"""GET /swarm/hosts — list enrolled workers, optionally filtered by status."""
from __future__ import annotations
from typing import Optional
from fastapi import APIRouter, Depends
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import SwarmHostView
router = APIRouter()
@router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Hosts"])
async def api_list_hosts(
    host_status: Optional[str] = None,
    repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]:
    """Return enrolled workers as views; ``host_status`` is forwarded to the
    repository as its filter argument."""
    views: list[SwarmHostView] = []
    for host_row in await repo.list_swarm_hosts(host_status):
        views.append(SwarmHostView(**host_row))
    return views

View File

@@ -0,0 +1,60 @@
"""POST /swarm/teardown — tear down one or all enrolled workers."""
from __future__ import annotations
import asyncio
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import (
SwarmDeployResponse,
SwarmHostResult,
SwarmTeardownRequest,
)
log = get_logger("swarm.teardown")
router = APIRouter()
@router.post(
    "/teardown",
    response_model=SwarmDeployResponse,
    tags=["Swarm Deployments"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        404: {"description": "A targeted host does not exist"},
        422: {"description": "Request body validation error"},
    },
)
async def api_teardown_swarm(
    req: SwarmTeardownRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmDeployResponse:
    """Tear down deckies on one worker, or on every enrolled worker.

    With ``req.host_uuid`` set only that host is targeted; otherwise all
    hosts from the repository are. Hosts are contacted concurrently. Shard
    rows are deleted only for a whole-host teardown (``req.decky_id`` is
    ``None``). A failing host yields an ``ok=False`` result instead of
    aborting the request.

    Raises:
        HTTPException: 404 when ``req.host_uuid`` names an unknown host.
    """
    if req.host_uuid is None:
        targets = await repo.list_swarm_hosts()
    else:
        single = await repo.get_swarm_host_by_uuid(req.host_uuid)
        if single is None:
            raise HTTPException(status_code=404, detail="host not found")
        targets = [single]

    async def _teardown_one(worker: dict[str, Any]) -> SwarmHostResult:
        """Run the teardown against one worker and summarise the outcome."""
        try:
            async with AgentClient(host=worker) as client:
                payload = await client.teardown(req.decky_id)
            whole_host = req.decky_id is None
            if whole_host:
                await repo.delete_decky_shards_for_host(worker["uuid"])
            return SwarmHostResult(
                host_uuid=worker["uuid"], host_name=worker["name"], ok=True, detail=payload
            )
        except Exception as err:
            log.exception("swarm.teardown failed host=%s", worker["name"])
            return SwarmHostResult(
                host_uuid=worker["uuid"], host_name=worker["name"], ok=False, detail=str(err)
            )

    pending = [_teardown_one(worker) for worker in targets]
    outcomes = await asyncio.gather(*pending)
    return SwarmDeployResponse(results=list(outcomes))