Files
DECNET/decnet/web/router/swarm/health.py
anti 63b0a58527 feat(swarm): master-side SWARM controller (swarmctl) + agent CLI
Adds decnet/web/swarm_api.py as an independent FastAPI app with routers
for host enrollment, deployment dispatch (sharding DecnetConfig across
enrolled workers via AgentClient), and active health probing. Runs as
its own uvicorn subprocess via 'decnet swarmctl', mirroring the isolation
pattern used by 'decnet api'. Also wires up 'decnet agent' CLI entry for
the worker side.

29 tests added under tests/swarm/test_swarm_api.py cover enrollment
(including bundle generation + duplicate rejection), host CRUD, sharding
correctness, non-swarm-mode rejection, teardown, and health probes with
a stubbed AgentClient.
2026-04-18 19:18:33 -04:00

80 lines
2.3 KiB
Python

"""Health endpoints for the swarm controller.
* ``GET /swarm/health`` — liveness of the controller itself (no I/O).
* ``POST /swarm/check`` — active probe of every enrolled worker over mTLS.
Updates ``SwarmHost.status`` and ``last_heartbeat``.
"""
from __future__ import annotations
import asyncio
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
log = get_logger("swarm.health")
router = APIRouter(tags=["swarm-health"])
class HostHealth(BaseModel):
host_uuid: str
name: str
address: str
reachable: bool
detail: Any | None = None
class CheckResponse(BaseModel):
results: list[HostHealth]
@router.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok", "role": "swarm-controller"}
@router.post("/check", response_model=CheckResponse)
async def check(
repo: BaseRepository = Depends(get_repo),
) -> CheckResponse:
hosts = await repo.list_swarm_hosts()
async def _probe(host: dict[str, Any]) -> HostHealth:
try:
async with AgentClient(host=host) as agent:
body = await agent.health()
await repo.update_swarm_host(
host["uuid"],
{
"status": "active",
"last_heartbeat": datetime.now(timezone.utc),
},
)
return HostHealth(
host_uuid=host["uuid"],
name=host["name"],
address=host["address"],
reachable=True,
detail=body,
)
except Exception as exc:
log.warning("swarm.check unreachable host=%s err=%s", host["name"], exc)
await repo.update_swarm_host(host["uuid"], {"status": "unreachable"})
return HostHealth(
host_uuid=host["uuid"],
name=host["name"],
address=host["address"],
reachable=False,
detail=str(exc),
)
results = await asyncio.gather(*(_probe(h) for h in hosts))
return CheckResponse(results=list(results))