feat(swarm): master-side SWARM controller (swarmctl) + agent CLI

Adds decnet/web/swarm_api.py as an independent FastAPI app with routers
for host enrollment, deployment dispatch (sharding DecnetConfig across
enrolled workers via AgentClient), and active health probing. Runs as
its own uvicorn subprocess via 'decnet swarmctl', mirroring the isolation
pattern used by 'decnet api'. Also wires up 'decnet agent' CLI entry for
the worker side.

29 tests added under tests/swarm/test_swarm_api.py cover enrollment
(including bundle generation + duplicate rejection), host CRUD, sharding
correctness, non-swarm-mode rejection, teardown, and health probes with
a stubbed AgentClient.
This commit is contained in:
2026-04-18 19:18:33 -04:00
parent cd0057c129
commit 63b0a58527
7 changed files with 838 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
"""Swarm controller routers.
Mounted onto the swarm-api FastAPI app under the ``/swarm`` prefix. The
controller is a separate process from the main DECNET API so swarm
failures cannot cascade into log ingestion / dashboard serving.
"""
from fastapi import APIRouter
from .hosts import router as hosts_router
from .deployments import router as deployments_router
from .health import router as health_router
swarm_router = APIRouter(prefix="/swarm")
swarm_router.include_router(hosts_router)
swarm_router.include_router(deployments_router)
swarm_router.include_router(health_router)

View File

@@ -0,0 +1,164 @@
"""Deployment dispatch: shard deckies across enrolled workers and push.
The master owns the DecnetConfig. Per worker we build a filtered copy
containing only the deckies assigned to that worker (via ``host_uuid``),
then POST it to the worker agent. Sharding strategy is explicit: the
caller is expected to have already set ``host_uuid`` on every decky. If
any decky arrives without one, we fail fast — auto-sharding lives in the
CLI layer (task #7), not here.
"""
from __future__ import annotations
import asyncio
import json
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel, Field
from decnet.config import DecnetConfig, DeckyConfig
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
log = get_logger("swarm.deployments")
router = APIRouter(tags=["swarm-deployments"])
class DeployRequest(BaseModel):
config: DecnetConfig
dry_run: bool = False
no_cache: bool = False
class TeardownRequest(BaseModel):
host_uuid: str | None = Field(
default=None,
description="If set, tear down only this worker; otherwise tear down all hosts",
)
decky_id: str | None = None
class HostResult(BaseModel):
host_uuid: str
host_name: str
ok: bool
detail: Any | None = None
class DeployResponse(BaseModel):
results: list[HostResult]
# ----------------------------------------------------------------- helpers
def _shard_by_host(config: DecnetConfig) -> dict[str, list[DeckyConfig]]:
buckets: dict[str, list[DeckyConfig]] = {}
for d in config.deckies:
if not d.host_uuid:
raise HTTPException(
status_code=400,
detail=f"decky '{d.name}' has no host_uuid — caller must shard before dispatch",
)
buckets.setdefault(d.host_uuid, []).append(d)
return buckets
def _worker_config(base: DecnetConfig, shard: list[DeckyConfig]) -> DecnetConfig:
return base.model_copy(update={"deckies": shard})
# ------------------------------------------------------------------ routes
@router.post("/deploy", response_model=DeployResponse)
async def deploy(
req: DeployRequest,
repo: BaseRepository = Depends(get_repo),
) -> DeployResponse:
if req.config.mode != "swarm":
raise HTTPException(status_code=400, detail="mode must be 'swarm'")
buckets = _shard_by_host(req.config)
# Resolve host rows in one query-per-host pass; fail fast on unknown uuids.
hosts: dict[str, dict[str, Any]] = {}
for host_uuid in buckets:
row = await repo.get_swarm_host_by_uuid(host_uuid)
if row is None:
raise HTTPException(status_code=404, detail=f"unknown host_uuid: {host_uuid}")
hosts[host_uuid] = row
async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> HostResult:
host = hosts[host_uuid]
cfg = _worker_config(req.config, shard)
try:
async with AgentClient(host=host) as agent:
body = await agent.deploy(cfg, dry_run=req.dry_run, no_cache=req.no_cache)
# Persist a DeckyShard row per decky for status lookups.
for d in shard:
await repo.upsert_decky_shard(
{
"decky_name": d.name,
"host_uuid": host_uuid,
"services": json.dumps(d.services),
"state": "running" if not req.dry_run else "pending",
"last_error": None,
"updated_at": datetime.now(timezone.utc),
}
)
await repo.update_swarm_host(host_uuid, {"status": "active"})
return HostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
except Exception as exc:
log.exception("swarm.deploy dispatch failed host=%s", host["name"])
for d in shard:
await repo.upsert_decky_shard(
{
"decky_name": d.name,
"host_uuid": host_uuid,
"services": json.dumps(d.services),
"state": "failed",
"last_error": str(exc)[:512],
"updated_at": datetime.now(timezone.utc),
}
)
return HostResult(host_uuid=host_uuid, host_name=host["name"], ok=False, detail=str(exc))
results = await asyncio.gather(
*(_dispatch(uuid_, shard) for uuid_, shard in buckets.items())
)
return DeployResponse(results=list(results))
@router.post("/teardown", response_model=DeployResponse)
async def teardown(
req: TeardownRequest,
repo: BaseRepository = Depends(get_repo),
) -> DeployResponse:
if req.host_uuid is not None:
row = await repo.get_swarm_host_by_uuid(req.host_uuid)
if row is None:
raise HTTPException(status_code=404, detail="host not found")
targets = [row]
else:
targets = await repo.list_swarm_hosts()
async def _call(host: dict[str, Any]) -> HostResult:
try:
async with AgentClient(host=host) as agent:
body = await agent.teardown(req.decky_id)
if req.decky_id is None:
await repo.delete_decky_shards_for_host(host["uuid"])
return HostResult(host_uuid=host["uuid"], host_name=host["name"], ok=True, detail=body)
except Exception as exc:
log.exception("swarm.teardown failed host=%s", host["name"])
return HostResult(
host_uuid=host["uuid"], host_name=host["name"], ok=False, detail=str(exc)
)
results = await asyncio.gather(*(_call(h) for h in targets))
return DeployResponse(results=list(results))

View File

@@ -0,0 +1,79 @@
"""Health endpoints for the swarm controller.
* ``GET /swarm/health`` — liveness of the controller itself (no I/O).
* ``POST /swarm/check`` — active probe of every enrolled worker over mTLS.
Updates ``SwarmHost.status`` and ``last_heartbeat``.
"""
from __future__ import annotations
import asyncio
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
log = get_logger("swarm.health")
router = APIRouter(tags=["swarm-health"])
class HostHealth(BaseModel):
host_uuid: str
name: str
address: str
reachable: bool
detail: Any | None = None
class CheckResponse(BaseModel):
results: list[HostHealth]
@router.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok", "role": "swarm-controller"}
@router.post("/check", response_model=CheckResponse)
async def check(
repo: BaseRepository = Depends(get_repo),
) -> CheckResponse:
hosts = await repo.list_swarm_hosts()
async def _probe(host: dict[str, Any]) -> HostHealth:
try:
async with AgentClient(host=host) as agent:
body = await agent.health()
await repo.update_swarm_host(
host["uuid"],
{
"status": "active",
"last_heartbeat": datetime.now(timezone.utc),
},
)
return HostHealth(
host_uuid=host["uuid"],
name=host["name"],
address=host["address"],
reachable=True,
detail=body,
)
except Exception as exc:
log.warning("swarm.check unreachable host=%s err=%s", host["name"], exc)
await repo.update_swarm_host(host["uuid"], {"status": "unreachable"})
return HostHealth(
host_uuid=host["uuid"],
name=host["name"],
address=host["address"],
reachable=False,
detail=str(exc),
)
results = await asyncio.gather(*(_probe(h) for h in hosts))
return CheckResponse(results=list(results))

View File

@@ -0,0 +1,162 @@
"""Swarm host lifecycle endpoints: enroll, list, decommission.
Enrollment design
-----------------
The master controller holds the CA private key. On ``POST /swarm/enroll``
it generates a fresh worker keypair + cert (signed by the master CA) and
returns the full bundle to the operator. The operator is responsible for
delivering that bundle to the worker's ``~/.decnet/agent/`` directory
(scp/sshpass/ansible — outside this process's trust boundary).
Rationale: the worker agent speaks ONLY mTLS. There is no pre-auth
bootstrap endpoint, so there is nothing to attack before the worker is
enrolled. The bundle-delivery step is explicit and auditable.
"""
from __future__ import annotations
import pathlib
import uuid as _uuid
from datetime import datetime, timezone
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, Field
from decnet.swarm import pki
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
router = APIRouter(tags=["swarm-hosts"])
# ------------------------------------------------------------------- schemas
class EnrollRequest(BaseModel):
name: str = Field(..., min_length=1, max_length=128)
address: str = Field(..., description="IP or DNS the master uses to reach the worker")
agent_port: int = Field(default=8765, ge=1, le=65535)
sans: list[str] = Field(
default_factory=list,
description="Extra SANs (IPs / hostnames) to embed in the worker cert",
)
notes: Optional[str] = None
class EnrolledBundle(BaseModel):
"""Cert bundle returned to the operator — must be delivered to the worker."""
host_uuid: str
name: str
address: str
agent_port: int
fingerprint: str
ca_cert_pem: str
worker_cert_pem: str
worker_key_pem: str
class SwarmHostView(BaseModel):
uuid: str
name: str
address: str
agent_port: int
status: str
last_heartbeat: Optional[datetime] = None
client_cert_fingerprint: str
enrolled_at: datetime
notes: Optional[str] = None
# ------------------------------------------------------------------- routes
@router.post("/enroll", response_model=EnrolledBundle, status_code=status.HTTP_201_CREATED)
async def enroll(
req: EnrollRequest,
repo: BaseRepository = Depends(get_repo),
) -> EnrolledBundle:
existing = await repo.get_swarm_host_by_name(req.name)
if existing is not None:
raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled")
ca = pki.ensure_ca()
sans = list({*req.sans, req.address, req.name})
issued = pki.issue_worker_cert(ca, req.name, sans)
# Persist the bundle under ~/.decnet/ca/workers/<name>/ so the master
# can replay it if the operator loses the original delivery.
bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
pki.write_worker_bundle(issued, bundle_dir)
host_uuid = str(_uuid.uuid4())
await repo.add_swarm_host(
{
"uuid": host_uuid,
"name": req.name,
"address": req.address,
"agent_port": req.agent_port,
"status": "enrolled",
"client_cert_fingerprint": issued.fingerprint_sha256,
"cert_bundle_path": str(bundle_dir),
"enrolled_at": datetime.now(timezone.utc),
"notes": req.notes,
}
)
return EnrolledBundle(
host_uuid=host_uuid,
name=req.name,
address=req.address,
agent_port=req.agent_port,
fingerprint=issued.fingerprint_sha256,
ca_cert_pem=issued.ca_cert_pem.decode(),
worker_cert_pem=issued.cert_pem.decode(),
worker_key_pem=issued.key_pem.decode(),
)
@router.get("/hosts", response_model=list[SwarmHostView])
async def list_hosts(
host_status: Optional[str] = None,
repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]:
rows = await repo.list_swarm_hosts(host_status)
return [SwarmHostView(**r) for r in rows]
@router.get("/hosts/{uuid}", response_model=SwarmHostView)
async def get_host(
uuid: str,
repo: BaseRepository = Depends(get_repo),
) -> SwarmHostView:
row = await repo.get_swarm_host_by_uuid(uuid)
if row is None:
raise HTTPException(status_code=404, detail="host not found")
return SwarmHostView(**row)
@router.delete("/hosts/{uuid}", status_code=status.HTTP_204_NO_CONTENT)
async def decommission(
uuid: str,
repo: BaseRepository = Depends(get_repo),
) -> None:
row = await repo.get_swarm_host_by_uuid(uuid)
if row is None:
raise HTTPException(status_code=404, detail="host not found")
# Remove shard rows first (we own them; cascade is portable via the repo).
await repo.delete_decky_shards_for_host(uuid)
await repo.delete_swarm_host(uuid)
# Best-effort bundle cleanup; if the dir was moved manually, don't fail.
bundle_dir = pathlib.Path(row.get("cert_bundle_path") or "")
if bundle_dir.is_dir():
for child in bundle_dir.iterdir():
try:
child.unlink()
except OSError:
pass
try:
bundle_dir.rmdir()
except OSError:
pass