merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
47
decnet/web/router/swarm/__init__.py
Normal file
47
decnet/web/router/swarm/__init__.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""Swarm controller routers.
|
||||
|
||||
One file per endpoint, aggregated under the ``/swarm`` prefix. Mounted
|
||||
onto the swarm-api FastAPI app (``decnet/web/swarm_api.py``), a separate
|
||||
process from the main DECNET API so swarm failures cannot cascade into
|
||||
log ingestion / dashboard serving.
|
||||
"""
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .api_enroll_host import router as enroll_host_router
|
||||
from .api_list_hosts import router as list_hosts_router
|
||||
from .api_get_host import router as get_host_router
|
||||
from .api_decommission_host import router as decommission_host_router
|
||||
from .api_deploy_swarm import router as deploy_swarm_router
|
||||
from .api_teardown_swarm import router as teardown_swarm_router
|
||||
from .api_get_swarm_health import router as get_swarm_health_router
|
||||
from .api_check_hosts import router as check_hosts_router
|
||||
from .api_heartbeat import router as heartbeat_router
|
||||
from .api_list_deckies import router as list_deckies_router
|
||||
|
||||
# Aggregate router for the swarm controller. The `responses=` mapping lists
# error responses that every swarm route can surface; route-level
# `responses=` entries still override/extend these for route-specific
# codes (e.g. 409 on /enroll).
swarm_router = APIRouter(
    prefix="/swarm",
    responses={
        400: {"description": "Malformed request"},
        403: {"description": "Peer cert missing or fingerprint mismatch"},
        404: {"description": "Referenced host does not exist"},
    },
)

# Sub-routers are mounted in a fixed order: hosts, deployments, health.
for _endpoint_router in (
    # Hosts
    enroll_host_router,
    list_hosts_router,
    get_host_router,
    decommission_host_router,
    # Deployments
    deploy_swarm_router,
    teardown_swarm_router,
    list_deckies_router,
    # Health
    get_swarm_health_router,
    check_hosts_router,
    heartbeat_router,
):
    swarm_router.include_router(_endpoint_router)
# Do not leave the loop variable behind as a module attribute.
del _endpoint_router
|
||||
61
decnet/web/router/swarm/api_check_hosts.py
Normal file
61
decnet/web/router/swarm/api_check_hosts.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""POST /swarm/check — active mTLS probe of every enrolled worker.
|
||||
|
||||
Updates ``SwarmHost.status`` and ``last_heartbeat`` for each host based
|
||||
on the outcome of the probe.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import SwarmCheckResponse, SwarmHostHealth
|
||||
|
||||
log = get_logger("swarm.check")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post("/check", response_model=SwarmCheckResponse, tags=["Swarm Health"])
async def api_check_hosts(
    repo: BaseRepository = Depends(get_repo),
) -> SwarmCheckResponse:
    """Actively probe every enrolled worker and record the outcome.

    Every host returned by ``repo.list_swarm_hosts()`` is probed
    concurrently through ``AgentClient.health()``. A host that answers is
    stored as ``"active"`` with a fresh ``last_heartbeat``; a host whose
    probe raises is stored as ``"unreachable"``.

    Returns:
        A ``SwarmCheckResponse`` holding one ``SwarmHostHealth`` per host.
    """
    hosts = await repo.list_swarm_hosts()

    async def _probe(host: dict[str, Any]) -> SwarmHostHealth:
        # Guard ONLY the network round-trip. A repository error while
        # recording a successful probe must not be reported (and persisted)
        # as the worker being unreachable.
        try:
            async with AgentClient(host=host) as agent:
                body = await agent.health()
        except Exception as exc:
            log.warning("swarm.check unreachable host=%s err=%s", host["name"], exc)
            await repo.update_swarm_host(host["uuid"], {"status": "unreachable"})
            return SwarmHostHealth(
                host_uuid=host["uuid"],
                name=host["name"],
                address=host["address"],
                reachable=False,
                detail=str(exc),
            )

        await repo.update_swarm_host(
            host["uuid"],
            {
                "status": "active",
                "last_heartbeat": datetime.now(timezone.utc),
            },
        )
        return SwarmHostHealth(
            host_uuid=host["uuid"],
            name=host["name"],
            address=host["address"],
            reachable=True,
            detail=body,
        )

    results = await asyncio.gather(*(_probe(h) for h in hosts))
    return SwarmCheckResponse(results=list(results))
|
||||
63
decnet/web/router/swarm/api_decommission_host.py
Normal file
63
decnet/web/router/swarm/api_decommission_host.py
Normal file
@@ -0,0 +1,63 @@
|
||||
"""DELETE /swarm/hosts/{uuid} — decommission a worker.
|
||||
|
||||
Removes the DeckyShard rows bound to the host (portable cascade — MySQL
|
||||
and SQLite both honor it via the repo layer), deletes the SwarmHost row,
|
||||
and best-effort-cleans the per-worker bundle directory on the master.
|
||||
|
||||
Also asks the worker agent to wipe its own install (keeping logs). A
|
||||
dead/unreachable worker does not block master-side cleanup.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
|
||||
log = get_logger("swarm.decommission")
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.delete(
    "/hosts/{uuid}",
    status_code=status.HTTP_204_NO_CONTENT,
    tags=["Swarm Hosts"],
    responses={404: {"description": "No host with this UUID is enrolled"}},
)
async def api_decommission_host(
    uuid: str,
    repo: BaseRepository = Depends(get_repo),
) -> None:
    """Decommission the worker identified by ``uuid``.

    Steps, in order:

    1. Look the host up; respond 404 if it is not enrolled.
    2. Ask the worker agent to self-destruct. Any failure is logged and
       ignored so a dead worker cannot block master-side cleanup.
    3. Delete the host's decky shards, then the host row itself.
    4. Best-effort removal of the on-disk cert bundle directory.

    Raises:
        HTTPException: 404 when no host with this UUID exists.
    """
    import shutil

    row = await repo.get_swarm_host_by_uuid(uuid)
    if row is None:
        raise HTTPException(status_code=404, detail="host not found")

    try:
        async with AgentClient(host=row) as agent:
            await agent.self_destruct()
    except Exception:
        log.exception(
            "decommission: self-destruct dispatch failed host=%s — "
            "proceeding with master-side cleanup anyway",
            row.get("name"),
        )

    await repo.delete_decky_shards_for_host(uuid)
    await repo.delete_swarm_host(uuid)

    # Best-effort bundle cleanup; if the dir was moved manually, don't fail.
    bundle_path = row.get("cert_bundle_path")
    if not bundle_path:
        # An empty path must be skipped explicitly: pathlib.Path("") is the
        # current working directory, and cleaning "." would delete
        # unrelated files.
        return
    bundle_dir = pathlib.Path(bundle_path)
    if bundle_dir.is_dir():
        # Remove recursively so nested material (e.g. an "updater"
        # subdirectory holding a private key) does not survive the
        # decommission. Errors are ignored to keep this best-effort.
        shutil.rmtree(bundle_dir, ignore_errors=True)
|
||||
155
decnet/web/router/swarm/api_deploy_swarm.py
Normal file
155
decnet/web/router/swarm/api_deploy_swarm.py
Normal file
@@ -0,0 +1,155 @@
|
||||
"""POST /swarm/deploy — shard a DecnetConfig across enrolled workers.
|
||||
|
||||
Per worker we build a filtered copy containing only the deckies assigned
|
||||
to that worker (via ``host_uuid``), then POST it to the worker agent.
|
||||
The caller is expected to have already set ``host_uuid`` on every decky;
|
||||
if any decky arrives without one, we fail fast. Auto-sharding lives in
|
||||
the CLI layer, not here.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.config import DecnetConfig, DeckyConfig
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import (
|
||||
SwarmDeployRequest,
|
||||
SwarmDeployResponse,
|
||||
SwarmHostResult,
|
||||
)
|
||||
|
||||
log = get_logger("swarm.deploy")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _shard_by_host(config: DecnetConfig) -> dict[str, list[DeckyConfig]]:
    """Group ``config.deckies`` by their ``host_uuid``.

    Returns:
        Mapping of host UUID to the deckies assigned to it, in the order
        they appear in ``config.deckies``.

    Raises:
        HTTPException: 400 as soon as a decky with a falsy ``host_uuid`` is
            encountered.
    """
    grouped: dict[str, list[DeckyConfig]] = {}
    for d in config.deckies:
        if d.host_uuid:
            if d.host_uuid not in grouped:
                grouped[d.host_uuid] = []
            grouped[d.host_uuid].append(d)
            continue
        # Fail fast: assigning deckies to hosts is the caller's job.
        raise HTTPException(
            status_code=400,
            detail=f"decky '{d.name}' has no host_uuid — caller must shard before dispatch",
        )
    return grouped
|
||||
|
||||
|
||||
def _worker_config(
    base: DecnetConfig,
    shard: list[DeckyConfig],
    host: dict[str, Any],
) -> DecnetConfig:
    """Return a copy of ``base`` restricted to one worker's deckies.

    The copy's ``deckies`` field is replaced by ``shard``. When the host row
    has a truthy ``use_ipvlan`` flag the copy also gets ``ipvlan=True``.
    The flag is only ever switched on here, never off, so an ipvlan choice
    already present on ``base`` is kept regardless of the per-host flag.
    """
    host_wants_ipvlan = bool(host.get("use_ipvlan"))
    overrides: dict[str, Any] = {
        "deckies": shard,
        **({"ipvlan": True} if host_wants_ipvlan else {}),
    }
    return base.model_copy(update=overrides)
|
||||
|
||||
|
||||
async def dispatch_decnet_config(
    config: DecnetConfig,
    repo: BaseRepository,
    dry_run: bool = False,
    no_cache: bool = False,
) -> SwarmDeployResponse:
    """Split ``config`` per ``host_uuid`` and deploy every shard in parallel.

    Used by POST /swarm/deploy (explicit swarm call) and by the auto-swarm
    branch of POST /deckies/deploy.

    Raises:
        HTTPException: 400 from ``_shard_by_host`` when a decky has no
            ``host_uuid``; 404 when a referenced host is not enrolled.
            Both are raised before anything is dispatched.

    Returns:
        A ``SwarmDeployResponse`` with one ``SwarmHostResult`` per host.
        Per-host dispatch errors are reported as ``ok=False`` rather than
        raised.
    """
    buckets = _shard_by_host(config)

    # Resolve every target host first so an unknown UUID aborts the request
    # before any worker is contacted.
    hosts: dict[str, dict[str, Any]] = {}
    for host_uuid in buckets:
        row = await repo.get_swarm_host_by_uuid(host_uuid)
        if row is None:
            raise HTTPException(status_code=404, detail=f"unknown host_uuid: {host_uuid}")
        hosts[host_uuid] = row

    async def _record_shard(
        d: DeckyConfig,
        owner_uuid: str,
        state: str,
        last_error: Any,
    ) -> None:
        # Single place that shapes the DeckyShard row for both outcomes.
        await repo.upsert_decky_shard(
            {
                "decky_name": d.name,
                "host_uuid": owner_uuid,
                "services": json.dumps(d.services),
                "decky_config": d.model_dump_json(),
                "decky_ip": d.ip,
                "state": state,
                "last_error": last_error,
                "updated_at": datetime.now(timezone.utc),
            }
        )

    async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> SwarmHostResult:
        host = hosts[host_uuid]
        cfg = _worker_config(config, shard, host)
        try:
            async with AgentClient(host=host) as agent:
                body = await agent.deploy(cfg, dry_run=dry_run, no_cache=no_cache)
            success_state = "pending" if dry_run else "running"
            for d in shard:
                await _record_shard(d, host_uuid, success_state, None)
            await repo.update_swarm_host(host_uuid, {"status": "active"})
            return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
        except Exception as exc:
            log.exception("swarm.deploy dispatch failed host=%s", host["name"])
            # Compose-up tolerates partial success: a decky that fails to
            # build does not roll back the ones already started. Ask the
            # agent what is actually running before marking the entire
            # shard failed, so live deckies are not shown as "failed".
            runtime: dict[str, Any] = {}
            try:
                async with AgentClient(host=host) as probe:
                    snap = await probe.status()
                runtime = snap.get("runtime") or {}
            except Exception:
                log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", host["name"])
            failure_text = str(exc)
            for d in shard:
                rstate = runtime.get(d.name) or {}
                if bool(rstate.get("running")):
                    await _record_shard(d, host_uuid, "running", None)
                else:
                    await _record_shard(d, host_uuid, "failed", failure_text[:512])
            return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=False, detail=failure_text)

    pending = [_dispatch(uuid_, shard) for uuid_, shard in buckets.items()]
    results = await asyncio.gather(*pending)
    return SwarmDeployResponse(results=list(results))
|
||||
|
||||
|
||||
@router.post(
    "/deploy",
    response_model=SwarmDeployResponse,
    tags=["Swarm Deployments"],
    responses={
        400: {"description": "Deployment mode must be 'swarm'"},
        404: {"description": "A referenced host_uuid is not enrolled"},
    },
)
async def api_deploy_swarm(
    req: SwarmDeployRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmDeployResponse:
    """Handle POST /swarm/deploy.

    Accepts only configs whose ``mode`` is ``"swarm"`` and hands them to
    ``dispatch_decnet_config`` together with the request's ``dry_run`` and
    ``no_cache`` flags.

    Raises:
        HTTPException: 400 when ``req.config.mode`` is not ``"swarm"``.
    """
    if req.config.mode == "swarm":
        return await dispatch_decnet_config(
            req.config,
            repo,
            dry_run=req.dry_run,
            no_cache=req.no_cache,
        )
    raise HTTPException(status_code=400, detail="mode must be 'swarm'")
|
||||
100
decnet/web/router/swarm/api_enroll_host.py
Normal file
100
decnet/web/router/swarm/api_enroll_host.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""POST /swarm/enroll — issue a worker cert bundle and register the host.
|
||||
|
||||
Enrollment is master-driven: the controller holds the CA private key,
|
||||
generates a fresh worker keypair + CA-signed cert, and returns the full
|
||||
bundle to the operator. Bundle delivery to the worker (scp/sshpass/etc.)
|
||||
is outside this process's trust boundary.
|
||||
|
||||
Rationale: the worker agent speaks ONLY mTLS; there is no pre-auth
|
||||
bootstrap endpoint, so nothing to attack before the worker is enrolled.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid as _uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
|
||||
from decnet.swarm import pki
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import SwarmEnrolledBundle, SwarmEnrollRequest, SwarmUpdaterBundle
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/enroll",
    response_model=SwarmEnrolledBundle,
    status_code=status.HTTP_201_CREATED,
    tags=["Swarm Hosts"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        409: {"description": "A worker with this name is already enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def api_enroll_host(
    req: SwarmEnrollRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmEnrolledBundle:
    """Issue a worker cert bundle and register the host.

    Steps, in order:

    1. Reject the request with 409 if a host named ``req.name`` exists.
    2. Issue a worker cert whose SANs are ``req.sans`` plus the address and
       the name, and persist it under ``<CA dir>/workers/<name>/``.
    3. If ``req.issue_updater_bundle`` is set, issue a second cert with CN
       ``updater@<name>`` and persist it in an ``updater`` subdirectory.
    4. Store the host row (status ``"enrolled"``) and return the PEM
       material to the caller.

    Raises:
        HTTPException: 409 when the worker name is already enrolled.
    """
    import os as _os

    existing = await repo.get_swarm_host_by_name(req.name)
    if existing is not None:
        raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled")

    ca = pki.ensure_ca()
    # The set literal de-duplicates SANs the operator may have repeated.
    sans = list({*req.sans, req.address, req.name})
    issued = pki.issue_worker_cert(ca, req.name, sans)

    # Persist the bundle under ~/.decnet/ca/workers/<name>/ so the master
    # can replay it if the operator loses the original delivery.
    bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
    pki.write_worker_bundle(issued, bundle_dir)

    updater_view: Optional[SwarmUpdaterBundle] = None
    updater_fp: Optional[str] = None
    if req.issue_updater_bundle:
        updater_cn = f"updater@{req.name}"
        updater_sans = list({*sans, updater_cn, "127.0.0.1"})
        updater_issued = pki.issue_worker_cert(ca, updater_cn, updater_sans)
        # Persist alongside the worker bundle for replay.
        updater_dir = bundle_dir / "updater"
        updater_dir.mkdir(parents=True, exist_ok=True)
        (updater_dir / "updater.crt").write_bytes(updater_issued.cert_pem)
        # Create the key file with 0600 from the very first byte instead of
        # writing it with default permissions and tightening afterwards,
        # which would leave a window where the key is readable by others.
        key_path = updater_dir / "updater.key"
        key_fd = _os.open(
            key_path,
            _os.O_WRONLY | _os.O_CREAT | _os.O_TRUNC,
            0o600,
        )
        with _os.fdopen(key_fd, "wb") as key_file:
            key_file.write(updater_issued.key_pem)
        # The mode passed to os.open only applies to newly created files;
        # enforce it as well when an older key file was overwritten.
        _os.chmod(key_path, 0o600)
        updater_fp = updater_issued.fingerprint_sha256
        updater_view = SwarmUpdaterBundle(
            fingerprint=updater_fp,
            updater_cert_pem=updater_issued.cert_pem.decode(),
            updater_key_pem=updater_issued.key_pem.decode(),
        )

    host_uuid = str(_uuid.uuid4())
    await repo.add_swarm_host(
        {
            "uuid": host_uuid,
            "name": req.name,
            "address": req.address,
            "agent_port": req.agent_port,
            "status": "enrolled",
            "client_cert_fingerprint": issued.fingerprint_sha256,
            "updater_cert_fingerprint": updater_fp,
            "cert_bundle_path": str(bundle_dir),
            "enrolled_at": datetime.now(timezone.utc),
            "notes": req.notes,
        }
    )
    return SwarmEnrolledBundle(
        host_uuid=host_uuid,
        name=req.name,
        address=req.address,
        agent_port=req.agent_port,
        fingerprint=issued.fingerprint_sha256,
        ca_cert_pem=issued.ca_cert_pem.decode(),
        worker_cert_pem=issued.cert_pem.decode(),
        worker_key_pem=issued.key_pem.decode(),
        updater=updater_view,
    )
|
||||
26
decnet/web/router/swarm/api_get_host.py
Normal file
26
decnet/web/router/swarm/api_get_host.py
Normal file
@@ -0,0 +1,26 @@
|
||||
"""GET /swarm/hosts/{uuid} — fetch a single worker by UUID."""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import SwarmHostView
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/hosts/{uuid}",
    response_model=SwarmHostView,
    tags=["Swarm Hosts"],
    responses={404: {"description": "No host with this UUID is enrolled"}},
)
async def api_get_host(
    uuid: str,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmHostView:
    """Return the enrolled worker identified by ``uuid``.

    Raises:
        HTTPException: 404 when the repository has no such host.
    """
    record = await repo.get_swarm_host_by_uuid(uuid)
    if record is not None:
        return SwarmHostView(**record)
    raise HTTPException(status_code=404, detail="host not found")
|
||||
11
decnet/web/router/swarm/api_get_swarm_health.py
Normal file
11
decnet/web/router/swarm/api_get_swarm_health.py
Normal file
@@ -0,0 +1,11 @@
|
||||
"""GET /swarm/health — controller liveness (no I/O)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/health", tags=["Swarm Health"])
async def api_get_swarm_health() -> dict[str, str]:
    """Report controller liveness with a constant payload (no I/O)."""
    liveness: dict[str, str] = {
        "status": "ok",
        "role": "swarm-controller",
    }
    return liveness
|
||||
212
decnet/web/router/swarm/api_heartbeat.py
Normal file
212
decnet/web/router/swarm/api_heartbeat.py
Normal file
@@ -0,0 +1,212 @@
|
||||
"""POST /swarm/heartbeat — agent→master liveness + decky snapshot refresh.
|
||||
|
||||
Workers call this every ~30 s with the output of ``executor.status()``.
|
||||
The master bumps ``SwarmHost.last_heartbeat`` and re-upserts each
|
||||
``DeckyShard`` with the fresh ``DeckyConfig`` snapshot + runtime-derived
|
||||
state so the dashboard stays current without a master-pull probe.
|
||||
|
||||
Security: CA-signed mTLS is necessary but not sufficient — a
|
||||
decommissioned worker's still-valid cert must not resurrect ghost
|
||||
shards. We pin the presented peer cert's SHA-256 to the
|
||||
``client_cert_fingerprint`` stored for the claimed ``host_uuid``.
|
||||
Mismatch (or decommissioned host) → 403.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||
from pydantic import BaseModel
|
||||
|
||||
from decnet.config import DeckyConfig
|
||||
from decnet.logging import get_logger
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
|
||||
log = get_logger("swarm.heartbeat")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class HeartbeatRequest(BaseModel):
    """Request body of POST /swarm/heartbeat."""

    # UUID of the host the sender claims to be; checked against the pinned
    # client certificate fingerprint before anything is written.
    host_uuid: str
    # Optional version string reported by the agent.
    agent_version: Optional[str] = None
    # Agent status snapshot; the endpoint reads its "deployed", "runtime"
    # and "deckies" keys.
    status: dict[str, Any]
    # Optional topology report; the reconcile step reads its "topology_id"
    # and "applied_version_hash" keys.
    topology: Optional[dict[str, Any]] = None
|
||||
|
||||
|
||||
def _extract_peer_fingerprint(scope: dict[str, Any]) -> Optional[str]:
    """Pull the peer cert's SHA-256 fingerprint from an ASGI scope.

    Two extraction paths are tried, in order:

    1. Primary: the first entry of
       ``scope["extensions"]["tls"]["client_cert_chain"]`` (ASGI TLS
       extension). The extension defines the entries as PEM-encoded
       strings, so a ``str`` entry is converted to DER before hashing;
       raw DER bytes are accepted unchanged.
    2. Fallback: ``ssl_object.getpeercert(binary_form=True)`` taken from
       ``scope["transport"]``.

    Args:
        scope: The ASGI connection scope of the request.

    Returns:
        The lowercase hex SHA-256 of the DER-encoded cert, or ``None`` when
        neither path yields certificate bytes. Callers are expected to fail
        closed on ``None``.
    """
    import ssl

    peer_der: Optional[bytes] = None
    source = "none"

    try:
        chain = scope.get("extensions", {}).get("tls", {}).get("client_cert_chain")
        # The chain is only guaranteed to be iterable, not indexable.
        leaf = next(iter(chain), None) if chain else None
        if isinstance(leaf, str):
            # Hash the DER form so the digest is comparable with the
            # fingerprint produced by the fallback path.
            leaf = ssl.PEM_cert_to_DER_cert(leaf.strip())
        if isinstance(leaf, (bytes, bytearray, memoryview)) and len(leaf) > 0:
            peer_der = bytes(leaf)
            source = "primary"
    except Exception:
        # Malformed extension data: treat as "not provided" and fall back.
        peer_der = None

    if peer_der is None:
        transport = scope.get("transport")
        try:
            ssl_obj = transport.get_extra_info("ssl_object") if transport else None
            if ssl_obj is not None:
                candidate = ssl_obj.getpeercert(binary_form=True)
                if candidate:
                    peer_der = candidate
                    source = "fallback"
        except Exception:
            peer_der = None

    if not peer_der:
        log.debug("heartbeat: peer cert extraction failed via none")
        return None

    log.debug("heartbeat: peer cert extraction succeeded via %s", source)
    return hashlib.sha256(peer_der).hexdigest().lower()
|
||||
|
||||
|
||||
async def _verify_peer_matches_host(
    request: Request, host_uuid: str, repo: BaseRepository
) -> dict[str, Any]:
    """Check that the caller's TLS cert is the one pinned for ``host_uuid``.

    Returns:
        The stored host row when the presented fingerprint matches.

    Raises:
        HTTPException: 404 when the host is unknown; 403 when no peer cert
            can be extracted, when the host has no stored fingerprint, or
            when the fingerprints differ.
    """
    host = await repo.get_swarm_host_by_uuid(host_uuid)
    if host is None:
        raise HTTPException(status_code=404, detail="unknown host")

    presented = _extract_peer_fingerprint(request.scope)
    if presented is None:
        raise HTTPException(status_code=403, detail="peer cert unavailable")

    # An empty pinned fingerprint can never match: fail closed.
    pinned = (host.get("client_cert_fingerprint") or "").lower()
    if pinned and presented == pinned:
        return host
    raise HTTPException(status_code=403, detail="cert fingerprint mismatch")
|
||||
|
||||
|
||||
async def _reconcile_topology_report(
    repo: BaseRepository,
    host_uuid: str,
    reported: Optional[dict[str, Any]],
) -> None:
    """Flag ACTIVE topologies on *host_uuid* whose agent state looks stale.

    For every ACTIVE topology targeted at this host that is not already
    flagged, ``needs_resync`` is set to ``True`` when:

    - the agent reports this topology but with a different
      ``applied_version_hash`` than the master computes, OR
    - the agent reports another topology, no hash, or nothing at all.

    This function only sets the flag; it does not push anything to the
    agent. Every failure (listing, hydrating, flagging) is logged and
    swallowed so it cannot fail the heartbeat.
    """
    from decnet.topology.hashing import canonical_hash
    from decnet.topology.persistence import hydrate
    from decnet.topology.status import TopologyStatus

    try:
        active = await repo.list_topologies(status=TopologyStatus.ACTIVE)
    except Exception:
        log.exception("heartbeat: could not list active topologies")
        return

    mine = []
    for candidate in active:
        if candidate.get("target_host_uuid") == host_uuid:
            mine.append(candidate)
    if len(mine) == 0:
        return

    report = reported or {}
    reported_id = report.get("topology_id")
    reported_hash = report.get("applied_version_hash")

    for topo in mine:
        tid = topo["id"]
        if topo.get("needs_resync"):
            # Already flagged; nothing more to do here.
            continue

        expected: Optional[str] = None
        agent_reports_this_topology = reported_id == tid and bool(reported_hash)
        if agent_reports_this_topology:
            try:
                hydrated = await hydrate(repo, tid)
            except Exception:
                log.exception("heartbeat: hydrate failed tid=%s", tid)
                continue
            if hydrated is None:
                continue
            expected = canonical_hash(hydrated)
            if expected == reported_hash:
                # Agent and master agree on this topology.
                continue

        # Reaching this point means a hash mismatch, or the agent reported
        # no/another topology: mark it for resync.
        try:
            await repo.set_topology_resync(tid, True)
            log.info(
                "heartbeat: flagged topology %s for resync (host=%s "
                "reported_id=%s reported_hash=%s expected=%s)",
                tid, host_uuid, reported_id, reported_hash, expected,
            )
        except Exception:
            log.exception("heartbeat: failed to flag resync tid=%s", tid)
|
||||
|
||||
|
||||
@router.post(
    "/heartbeat",
    status_code=204,
    tags=["Swarm Health"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        403: {"description": "Peer cert missing, or its fingerprint does not match the host's pinned cert"},
        404: {"description": "host_uuid is not enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def heartbeat(
    req: HeartbeatRequest,
    request: Request,
    repo: BaseRepository = Depends(get_repo),
) -> None:
    """Handle an agent heartbeat.

    After the peer certificate has been verified against the claimed host,
    the host is marked ``"active"`` with a fresh ``last_heartbeat`` and the
    topology report is reconciled. If the status snapshot has a truthy
    ``"deployed"`` entry, every decky in its ``"deckies"`` list is
    re-upserted as ``"running"`` or ``"degraded"`` according to the
    ``"runtime"`` map. Decky payloads that fail to parse are logged and
    skipped.

    Raises:
        HTTPException: 404 / 403 propagated from
            ``_verify_peer_matches_host``.
    """
    await _verify_peer_matches_host(request, req.host_uuid, repo)

    # One timestamp is shared by the host row and every shard row.
    now = datetime.now(timezone.utc)
    host_patch = {"status": "active", "last_heartbeat": now}
    await repo.update_swarm_host(req.host_uuid, host_patch)

    await _reconcile_topology_report(repo, req.host_uuid, req.topology)

    status_body = req.status or {}
    if not status_body.get("deployed"):
        return

    runtime = status_body.get("runtime") or {}
    reported_deckies = status_body.get("deckies") or []
    for decky_dict in reported_deckies:
        try:
            d = DeckyConfig(**decky_dict)
        except Exception:
            log.exception("heartbeat: skipping malformed decky payload host=%s", req.host_uuid)
            continue

        rstate = runtime.get(d.name) or {}
        shard_state = "running" if bool(rstate.get("running")) else "degraded"
        await repo.upsert_decky_shard(
            {
                "decky_name": d.name,
                "host_uuid": req.host_uuid,
                "services": json.dumps(d.services),
                "decky_config": d.model_dump_json(),
                "decky_ip": d.ip,
                "state": shard_state,
                "last_error": None,
                "last_seen": now,
                "updated_at": now,
            }
        )
|
||||
55
decnet/web/router/swarm/api_list_deckies.py
Normal file
55
decnet/web/router/swarm/api_list_deckies.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""GET /swarm/deckies — list decky shards with their worker host's identity.
|
||||
|
||||
The DeckyShard table maps decky_name → host_uuid; users want to see which
|
||||
deckies are running and *where*, so we enrich each shard with the owning
|
||||
host's name/address/status from SwarmHost rather than making callers do
|
||||
the join themselves.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import DeckyShardView
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/deckies", response_model=list[DeckyShardView], tags=["Swarm Deckies"])
async def api_list_deckies(
    host_uuid: Optional[str] = None,
    state: Optional[str] = None,
    repo: BaseRepository = Depends(get_repo),
) -> list[DeckyShardView]:
    """List decky shards enriched with their owning host's identity.

    Args:
        host_uuid: Passed to ``repo.list_decky_shards`` as the host filter.
        state: When given, only shards whose ``state`` equals it are kept.

    Returns:
        One ``DeckyShardView`` per matching shard. Shards whose host row is
        missing get placeholder host fields (``"<unknown>"``, ``""``,
        ``"unknown"``).
    """
    shards = await repo.list_decky_shards(host_uuid)
    hosts_by_uuid = {}
    for host_row in await repo.list_swarm_hosts():
        hosts_by_uuid[host_row["uuid"]] = host_row

    def _to_view(s):
        # Merge one shard row with its (possibly missing) host row.
        host = hosts_by_uuid.get(s["host_uuid"], {})
        return DeckyShardView(
            decky_name=s["decky_name"],
            decky_ip=s.get("decky_ip"),
            host_uuid=s["host_uuid"],
            host_name=host.get("name") or "<unknown>",
            host_address=host.get("address") or "",
            host_status=host.get("status") or "unknown",
            services=s.get("services") or [],
            state=s.get("state") or "pending",
            last_error=s.get("last_error"),
            compose_hash=s.get("compose_hash"),
            updated_at=s["updated_at"],
            hostname=s.get("hostname"),
            distro=s.get("distro"),
            archetype=s.get("archetype"),
            service_config=s.get("service_config") or {},
            mutate_interval=s.get("mutate_interval"),
            last_mutated=s.get("last_mutated") or 0.0,
            last_seen=s.get("last_seen"),
        )

    return [
        _to_view(s)
        for s in shards
        if not state or s.get("state") == state
    ]
|
||||
21
decnet/web/router/swarm/api_list_hosts.py
Normal file
21
decnet/web/router/swarm/api_list_hosts.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""GET /swarm/hosts — list enrolled workers, optionally filtered by status."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import SwarmHostView
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Hosts"])
async def api_list_hosts(
    host_status: Optional[str] = None,
    repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]:
    """List enrolled workers.

    ``host_status`` is forwarded unchanged to ``repo.list_swarm_hosts``;
    each returned row is wrapped in a ``SwarmHostView``.
    """
    views: list[SwarmHostView] = []
    for row in await repo.list_swarm_hosts(host_status):
        views.append(SwarmHostView(**row))
    return views
|
||||
60
decnet/web/router/swarm/api_teardown_swarm.py
Normal file
60
decnet/web/router/swarm/api_teardown_swarm.py
Normal file
@@ -0,0 +1,60 @@
|
||||
"""POST /swarm/teardown — tear down one or all enrolled workers."""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import (
|
||||
SwarmDeployResponse,
|
||||
SwarmHostResult,
|
||||
SwarmTeardownRequest,
|
||||
)
|
||||
|
||||
log = get_logger("swarm.teardown")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/teardown",
    response_model=SwarmDeployResponse,
    tags=["Swarm Deployments"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        404: {"description": "A targeted host does not exist"},
        422: {"description": "Request body validation error"},
    },
)
async def api_teardown_swarm(
    req: SwarmTeardownRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmDeployResponse:
    """Tear down deckies on one worker or on every enrolled worker.

    With ``req.host_uuid`` set only that host is targeted; otherwise all
    hosts from ``repo.list_swarm_hosts()`` are. Each target's agent gets
    ``teardown(req.decky_id)``; when ``req.decky_id`` is ``None`` the host's
    shard rows are deleted afterwards. Targets are processed concurrently
    and a per-host failure is reported as ``ok=False`` instead of raised.

    Raises:
        HTTPException: 404 when ``req.host_uuid`` names an unknown host.
    """
    if req.host_uuid is None:
        targets = await repo.list_swarm_hosts()
    else:
        row = await repo.get_swarm_host_by_uuid(req.host_uuid)
        if row is None:
            raise HTTPException(status_code=404, detail="host not found")
        targets = [row]

    async def _call(host: dict[str, Any]) -> SwarmHostResult:
        try:
            async with AgentClient(host=host) as agent:
                body = await agent.teardown(req.decky_id)
            whole_host = req.decky_id is None
            if whole_host:
                # Everything on the worker is gone: drop its shard rows.
                await repo.delete_decky_shards_for_host(host["uuid"])
            return SwarmHostResult(host_uuid=host["uuid"], host_name=host["name"], ok=True, detail=body)
        except Exception as exc:
            log.exception("swarm.teardown failed host=%s", host["name"])
            return SwarmHostResult(
                host_uuid=host["uuid"],
                host_name=host["name"],
                ok=False,
                detail=str(exc),
            )

    calls = [_call(h) for h in targets]
    results = await asyncio.gather(*calls)
    return SwarmDeployResponse(results=list(results))
|
||||
Reference in New Issue
Block a user