New POST /swarm/heartbeat on the swarm controller. Workers post every
~30s with the output of executor.status(); the master bumps
SwarmHost.last_heartbeat and re-upserts each DeckyShard with a fresh
DeckyConfig snapshot and runtime-derived state (running/degraded).
Security: CA-signed mTLS alone is not sufficient — a decommissioned
worker's still-valid cert could resurrect ghost shards. The endpoint
extracts the presented peer cert (primary: scope["extensions"]["tls"],
fallback: transport.get_extra_info("ssl_object")) and SHA-256-pins it
to the SwarmHost.client_cert_fingerprint stored for the claimed
host_uuid. Extraction is factored into _extract_peer_fingerprint so
tests can exercise both uvicorn scope shapes and the both-unavailable
fail-closed path without mocking uvicorn's TLS pipeline.
Adds get_swarm_host_by_fingerprint to the repo interface (SQLModel
impl reuses the indexed client_cert_fingerprint column).
139 lines
4.7 KiB
Python
139 lines
4.7 KiB
Python
"""POST /swarm/heartbeat — agent→master liveness + decky snapshot refresh.
|
|
|
|
Workers call this every ~30 s with the output of ``executor.status()``.
|
|
The master bumps ``SwarmHost.last_heartbeat`` and re-upserts each
|
|
``DeckyShard`` with the fresh ``DeckyConfig`` snapshot + runtime-derived
|
|
state so the dashboard stays current without a master-pull probe.
|
|
|
|
Security: CA-signed mTLS is necessary but not sufficient — a
|
|
decommissioned worker's still-valid cert must not resurrect ghost
|
|
shards. We pin the presented peer cert's SHA-256 to the
|
|
``client_cert_fingerprint`` stored for the claimed ``host_uuid``.
|
|
Mismatch (or decommissioned host) → 403.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import json
|
|
from datetime import datetime, timezone
|
|
from typing import Any, Optional
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException, Request
|
|
from pydantic import BaseModel
|
|
|
|
from decnet.config import DeckyConfig
|
|
from decnet.logging import get_logger
|
|
from decnet.web.db.repository import BaseRepository
|
|
from decnet.web.dependencies import get_repo
|
|
|
|
log = get_logger("swarm.heartbeat")
|
|
|
|
router = APIRouter()
|
|
|
|
|
|
class HeartbeatRequest(BaseModel):
|
|
host_uuid: str
|
|
agent_version: Optional[str] = None
|
|
status: dict[str, Any]
|
|
|
|
|
|
def _extract_peer_fingerprint(scope: dict[str, Any]) -> Optional[str]:
|
|
"""Pull the peer cert's SHA-256 fingerprint from an ASGI scope.
|
|
|
|
Tries two extraction paths because uvicorn has historically stashed
|
|
the TLS peer cert in different scope keys across versions:
|
|
|
|
1. Primary: ``scope["extensions"]["tls"]["client_cert_chain"][0]``
|
|
(uvicorn ≥ 0.30 ASGI TLS extension).
|
|
2. Fallback: the transport object's ``ssl_object.getpeercert(binary_form=True)``
|
|
(older uvicorn builds + some other servers).
|
|
|
|
Returns the lowercase hex SHA-256 of the DER-encoded cert, or None
|
|
when neither path yields bytes. The endpoint fails closed on None.
|
|
"""
|
|
peer_der: Optional[bytes] = None
|
|
source = "none"
|
|
|
|
try:
|
|
chain = scope.get("extensions", {}).get("tls", {}).get("client_cert_chain")
|
|
if chain:
|
|
peer_der = chain[0]
|
|
source = "primary"
|
|
except Exception:
|
|
peer_der = None
|
|
|
|
if peer_der is None:
|
|
transport = scope.get("transport")
|
|
try:
|
|
ssl_obj = transport.get_extra_info("ssl_object") if transport else None
|
|
if ssl_obj is not None:
|
|
peer_der = ssl_obj.getpeercert(binary_form=True)
|
|
if peer_der:
|
|
source = "fallback"
|
|
except Exception:
|
|
peer_der = None
|
|
|
|
if not peer_der:
|
|
log.debug("heartbeat: peer cert extraction failed via none")
|
|
return None
|
|
|
|
log.debug("heartbeat: peer cert extraction succeeded via %s", source)
|
|
return hashlib.sha256(peer_der).hexdigest().lower()
|
|
|
|
|
|
async def _verify_peer_matches_host(
|
|
request: Request, host_uuid: str, repo: BaseRepository
|
|
) -> dict[str, Any]:
|
|
host = await repo.get_swarm_host_by_uuid(host_uuid)
|
|
if host is None:
|
|
raise HTTPException(status_code=404, detail="unknown host")
|
|
fp = _extract_peer_fingerprint(request.scope)
|
|
if fp is None:
|
|
raise HTTPException(status_code=403, detail="peer cert unavailable")
|
|
expected = (host.get("client_cert_fingerprint") or "").lower()
|
|
if not expected or fp != expected:
|
|
raise HTTPException(status_code=403, detail="cert fingerprint mismatch")
|
|
return host
|
|
|
|
|
|
@router.post("/heartbeat", status_code=204, tags=["Swarm Health"])
|
|
async def heartbeat(
|
|
req: HeartbeatRequest,
|
|
request: Request,
|
|
repo: BaseRepository = Depends(get_repo),
|
|
) -> None:
|
|
await _verify_peer_matches_host(request, req.host_uuid, repo)
|
|
|
|
now = datetime.now(timezone.utc)
|
|
await repo.update_swarm_host(
|
|
req.host_uuid,
|
|
{"status": "active", "last_heartbeat": now},
|
|
)
|
|
|
|
status_body = req.status or {}
|
|
if not status_body.get("deployed"):
|
|
return
|
|
|
|
runtime = status_body.get("runtime") or {}
|
|
for decky_dict in status_body.get("deckies") or []:
|
|
try:
|
|
d = DeckyConfig(**decky_dict)
|
|
except Exception:
|
|
log.exception("heartbeat: skipping malformed decky payload host=%s", req.host_uuid)
|
|
continue
|
|
rstate = runtime.get(d.name) or {}
|
|
is_up = bool(rstate.get("running"))
|
|
await repo.upsert_decky_shard(
|
|
{
|
|
"decky_name": d.name,
|
|
"host_uuid": req.host_uuid,
|
|
"services": json.dumps(d.services),
|
|
"decky_config": d.model_dump_json(),
|
|
"decky_ip": d.ip,
|
|
"state": "running" if is_up else "degraded",
|
|
"last_error": None,
|
|
"last_seen": now,
|
|
"updated_at": now,
|
|
}
|
|
)
|