feat(swarm-mgmt): probe-on-read for GET /swarm/hosts heartbeat + status

This commit is contained in:
2026-04-19 05:26:35 -04:00
parent ff4c993617
commit bc5f43c3f7

View File

@@ -1,21 +1,54 @@
"""GET /swarm/hosts — admin-gated list of enrolled workers for the dashboard. """GET /swarm/hosts — admin-gated list of enrolled workers for the dashboard.
Thin wrapper over ``repo.list_swarm_hosts()`` — same shape as the Fans out an ``AgentClient.health()`` probe to each host on every call and
unauth'd controller route, but behind ``require_admin``. updates ``status`` / ``last_heartbeat`` as a side effect. This mirrors how
``/swarm-updates/hosts`` probes the updater daemon — the SwarmHosts page
polls this endpoint, so probe-on-read is what drives heartbeat freshness
in the UI. No separate scheduler needed.
""" """
from __future__ import annotations from __future__ import annotations
from typing import Optional import asyncio
from datetime import datetime, timezone
from typing import Any, Optional
from fastapi import APIRouter, Depends from fastapi import APIRouter, Depends
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.models import SwarmHostView from decnet.web.db.models import SwarmHostView
from decnet.web.db.repository import BaseRepository from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm_mgmt.list_hosts")
router = APIRouter() router = APIRouter()
async def _probe_and_update(
host: dict[str, Any], repo: BaseRepository
) -> dict[str, Any]:
"""Best-effort mTLS probe. Skips hosts with no address yet (pending first
connect-back) so we don't pollute the DB with 'unreachable' on fresh
enrollments that haven't fetched the tarball."""
if not host.get("address"):
return host
try:
async with AgentClient(host=host) as agent:
await agent.health()
patch = {"status": "active", "last_heartbeat": datetime.now(timezone.utc)}
except Exception as exc: # noqa: BLE001
log.debug("swarm/hosts probe unreachable host=%s err=%s", host.get("name"), exc)
patch = {"status": "unreachable"}
try:
await repo.update_swarm_host(host["uuid"], patch)
except Exception as exc: # noqa: BLE001
log.warning("swarm/hosts could not persist probe result host=%s err=%s", host.get("name"), exc)
return host
host.update(patch)
return host
@router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Management"]) @router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Management"])
async def list_hosts( async def list_hosts(
host_status: Optional[str] = None, host_status: Optional[str] = None,
@@ -23,4 +56,5 @@ async def list_hosts(
repo: BaseRepository = Depends(get_repo), repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]: ) -> list[SwarmHostView]:
rows = await repo.list_swarm_hosts(host_status) rows = await repo.list_swarm_hosts(host_status)
return [SwarmHostView(**r) for r in rows] probed = await asyncio.gather(*(_probe_and_update(r, repo) for r in rows))
return [SwarmHostView(**r) for r in probed]