feat(swarm): master-side SWARM controller (swarmctl) + agent CLI

Adds decnet/web/swarm_api.py as an independent FastAPI app with routers
for host enrollment, deployment dispatch (sharding DecnetConfig across
enrolled workers via AgentClient), and active health probing. Runs as
its own uvicorn subprocess via 'decnet swarmctl', mirroring the isolation
pattern used by 'decnet api'. Also wires up 'decnet agent' CLI entry for
the worker side.

29 tests added under tests/swarm/test_swarm_api.py cover enrollment
(including bundle generation + duplicate rejection), host CRUD, sharding
correctness, non-swarm-mode rejection, teardown, and health probes with
a stubbed AgentClient.
This commit is contained in:
2026-04-18 19:18:33 -04:00
parent cd0057c129
commit 63b0a58527
7 changed files with 838 additions and 0 deletions

View File

@@ -124,6 +124,64 @@ def api(
console.print("[red]Failed to start API. Ensure 'uvicorn' is installed in the current environment.[/]")
@app.command()
def swarmctl(
port: int = typer.Option(8770, "--port", help="Port for the swarm controller"),
host: str = typer.Option("127.0.0.1", "--host", help="Bind address for the swarm controller"),
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
) -> None:
"""Run the DECNET SWARM controller (master-side, separate process from `decnet api`)."""
import subprocess # nosec B404
import sys
import os
import signal
if daemon:
log.info("swarmctl daemonizing host=%s port=%d", host, port)
_daemonize()
log.info("swarmctl command invoked host=%s port=%d", host, port)
console.print(f"[green]Starting DECNET SWARM controller on {host}:{port}...[/]")
_cmd = [sys.executable, "-m", "uvicorn", "decnet.web.swarm_api:app",
"--host", host, "--port", str(port)]
try:
proc = subprocess.Popen(_cmd, start_new_session=True) # nosec B603 B404
try:
proc.wait()
except KeyboardInterrupt:
try:
os.killpg(proc.pid, signal.SIGTERM)
try:
proc.wait(timeout=10)
except subprocess.TimeoutExpired:
os.killpg(proc.pid, signal.SIGKILL)
proc.wait()
except ProcessLookupError:
pass
except (FileNotFoundError, subprocess.SubprocessError):
console.print("[red]Failed to start swarmctl. Ensure 'uvicorn' is installed in the current environment.[/]")
@app.command()
def agent(
port: int = typer.Option(8765, "--port", help="Port for the worker agent"),
host: str = typer.Option("0.0.0.0", "--host", help="Bind address for the worker agent"), # nosec B104
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
) -> None:
"""Run the DECNET SWARM worker agent (requires a cert bundle in ~/.decnet/agent/)."""
from decnet.agent import server as _agent_server
if daemon:
log.info("agent daemonizing host=%s port=%d", host, port)
_daemonize()
log.info("agent command invoked host=%s port=%d", host, port)
console.print(f"[green]Starting DECNET worker agent on {host}:{port} (mTLS)...[/]")
rc = _agent_server.run(host, port)
if rc != 0:
raise typer.Exit(rc)
@app.command()
def deploy(
mode: str = typer.Option("unihost", "--mode", "-m", help="Deployment mode: unihost | swarm"),

View File

@@ -0,0 +1,16 @@
"""Swarm controller routers.
Mounted onto the swarm-api FastAPI app under the ``/swarm`` prefix. The
controller is a separate process from the main DECNET API so swarm
failures cannot cascade into log ingestion / dashboard serving.
"""
from fastapi import APIRouter
from .hosts import router as hosts_router
from .deployments import router as deployments_router
from .health import router as health_router
swarm_router = APIRouter(prefix="/swarm")
swarm_router.include_router(hosts_router)
swarm_router.include_router(deployments_router)
swarm_router.include_router(health_router)

View File

@@ -0,0 +1,164 @@
"""Deployment dispatch: shard deckies across enrolled workers and push.
The master owns the DecnetConfig. Per worker we build a filtered copy
containing only the deckies assigned to that worker (via ``host_uuid``),
then POST it to the worker agent. Sharding strategy is explicit: the
caller is expected to have already set ``host_uuid`` on every decky. If
any decky arrives without one, we fail fast — auto-sharding lives in the
CLI layer (task #7), not here.
"""
from __future__ import annotations
import asyncio
import json
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel, Field
from decnet.config import DecnetConfig, DeckyConfig
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
log = get_logger("swarm.deployments")
router = APIRouter(tags=["swarm-deployments"])
class DeployRequest(BaseModel):
config: DecnetConfig
dry_run: bool = False
no_cache: bool = False
class TeardownRequest(BaseModel):
host_uuid: str | None = Field(
default=None,
description="If set, tear down only this worker; otherwise tear down all hosts",
)
decky_id: str | None = None
class HostResult(BaseModel):
host_uuid: str
host_name: str
ok: bool
detail: Any | None = None
class DeployResponse(BaseModel):
results: list[HostResult]
# ----------------------------------------------------------------- helpers
def _shard_by_host(config: DecnetConfig) -> dict[str, list[DeckyConfig]]:
buckets: dict[str, list[DeckyConfig]] = {}
for d in config.deckies:
if not d.host_uuid:
raise HTTPException(
status_code=400,
detail=f"decky '{d.name}' has no host_uuid — caller must shard before dispatch",
)
buckets.setdefault(d.host_uuid, []).append(d)
return buckets
def _worker_config(base: DecnetConfig, shard: list[DeckyConfig]) -> DecnetConfig:
return base.model_copy(update={"deckies": shard})
# ------------------------------------------------------------------ routes
@router.post("/deploy", response_model=DeployResponse)
async def deploy(
req: DeployRequest,
repo: BaseRepository = Depends(get_repo),
) -> DeployResponse:
if req.config.mode != "swarm":
raise HTTPException(status_code=400, detail="mode must be 'swarm'")
buckets = _shard_by_host(req.config)
# Resolve host rows in one query-per-host pass; fail fast on unknown uuids.
hosts: dict[str, dict[str, Any]] = {}
for host_uuid in buckets:
row = await repo.get_swarm_host_by_uuid(host_uuid)
if row is None:
raise HTTPException(status_code=404, detail=f"unknown host_uuid: {host_uuid}")
hosts[host_uuid] = row
async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> HostResult:
host = hosts[host_uuid]
cfg = _worker_config(req.config, shard)
try:
async with AgentClient(host=host) as agent:
body = await agent.deploy(cfg, dry_run=req.dry_run, no_cache=req.no_cache)
# Persist a DeckyShard row per decky for status lookups.
for d in shard:
await repo.upsert_decky_shard(
{
"decky_name": d.name,
"host_uuid": host_uuid,
"services": json.dumps(d.services),
"state": "running" if not req.dry_run else "pending",
"last_error": None,
"updated_at": datetime.now(timezone.utc),
}
)
await repo.update_swarm_host(host_uuid, {"status": "active"})
return HostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
except Exception as exc:
log.exception("swarm.deploy dispatch failed host=%s", host["name"])
for d in shard:
await repo.upsert_decky_shard(
{
"decky_name": d.name,
"host_uuid": host_uuid,
"services": json.dumps(d.services),
"state": "failed",
"last_error": str(exc)[:512],
"updated_at": datetime.now(timezone.utc),
}
)
return HostResult(host_uuid=host_uuid, host_name=host["name"], ok=False, detail=str(exc))
results = await asyncio.gather(
*(_dispatch(uuid_, shard) for uuid_, shard in buckets.items())
)
return DeployResponse(results=list(results))
@router.post("/teardown", response_model=DeployResponse)
async def teardown(
req: TeardownRequest,
repo: BaseRepository = Depends(get_repo),
) -> DeployResponse:
if req.host_uuid is not None:
row = await repo.get_swarm_host_by_uuid(req.host_uuid)
if row is None:
raise HTTPException(status_code=404, detail="host not found")
targets = [row]
else:
targets = await repo.list_swarm_hosts()
async def _call(host: dict[str, Any]) -> HostResult:
try:
async with AgentClient(host=host) as agent:
body = await agent.teardown(req.decky_id)
if req.decky_id is None:
await repo.delete_decky_shards_for_host(host["uuid"])
return HostResult(host_uuid=host["uuid"], host_name=host["name"], ok=True, detail=body)
except Exception as exc:
log.exception("swarm.teardown failed host=%s", host["name"])
return HostResult(
host_uuid=host["uuid"], host_name=host["name"], ok=False, detail=str(exc)
)
results = await asyncio.gather(*(_call(h) for h in targets))
return DeployResponse(results=list(results))

View File

@@ -0,0 +1,79 @@
"""Health endpoints for the swarm controller.
* ``GET /swarm/health`` — liveness of the controller itself (no I/O).
* ``POST /swarm/check`` — active probe of every enrolled worker over mTLS.
Updates ``SwarmHost.status`` and ``last_heartbeat``.
"""
from __future__ import annotations
import asyncio
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
log = get_logger("swarm.health")
router = APIRouter(tags=["swarm-health"])
class HostHealth(BaseModel):
host_uuid: str
name: str
address: str
reachable: bool
detail: Any | None = None
class CheckResponse(BaseModel):
results: list[HostHealth]
@router.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok", "role": "swarm-controller"}
@router.post("/check", response_model=CheckResponse)
async def check(
repo: BaseRepository = Depends(get_repo),
) -> CheckResponse:
hosts = await repo.list_swarm_hosts()
async def _probe(host: dict[str, Any]) -> HostHealth:
try:
async with AgentClient(host=host) as agent:
body = await agent.health()
await repo.update_swarm_host(
host["uuid"],
{
"status": "active",
"last_heartbeat": datetime.now(timezone.utc),
},
)
return HostHealth(
host_uuid=host["uuid"],
name=host["name"],
address=host["address"],
reachable=True,
detail=body,
)
except Exception as exc:
log.warning("swarm.check unreachable host=%s err=%s", host["name"], exc)
await repo.update_swarm_host(host["uuid"], {"status": "unreachable"})
return HostHealth(
host_uuid=host["uuid"],
name=host["name"],
address=host["address"],
reachable=False,
detail=str(exc),
)
results = await asyncio.gather(*(_probe(h) for h in hosts))
return CheckResponse(results=list(results))

View File

@@ -0,0 +1,162 @@
"""Swarm host lifecycle endpoints: enroll, list, decommission.
Enrollment design
-----------------
The master controller holds the CA private key. On ``POST /swarm/enroll``
it generates a fresh worker keypair + cert (signed by the master CA) and
returns the full bundle to the operator. The operator is responsible for
delivering that bundle to the worker's ``~/.decnet/agent/`` directory
(scp/sshpass/ansible — outside this process's trust boundary).
Rationale: the worker agent speaks ONLY mTLS. There is no pre-auth
bootstrap endpoint, so there is nothing to attack before the worker is
enrolled. The bundle-delivery step is explicit and auditable.
"""
from __future__ import annotations
import pathlib
import uuid as _uuid
from datetime import datetime, timezone
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, Field
from decnet.swarm import pki
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
router = APIRouter(tags=["swarm-hosts"])
# ------------------------------------------------------------------- schemas
class EnrollRequest(BaseModel):
name: str = Field(..., min_length=1, max_length=128)
address: str = Field(..., description="IP or DNS the master uses to reach the worker")
agent_port: int = Field(default=8765, ge=1, le=65535)
sans: list[str] = Field(
default_factory=list,
description="Extra SANs (IPs / hostnames) to embed in the worker cert",
)
notes: Optional[str] = None
class EnrolledBundle(BaseModel):
"""Cert bundle returned to the operator — must be delivered to the worker."""
host_uuid: str
name: str
address: str
agent_port: int
fingerprint: str
ca_cert_pem: str
worker_cert_pem: str
worker_key_pem: str
class SwarmHostView(BaseModel):
uuid: str
name: str
address: str
agent_port: int
status: str
last_heartbeat: Optional[datetime] = None
client_cert_fingerprint: str
enrolled_at: datetime
notes: Optional[str] = None
# ------------------------------------------------------------------- routes
@router.post("/enroll", response_model=EnrolledBundle, status_code=status.HTTP_201_CREATED)
async def enroll(
req: EnrollRequest,
repo: BaseRepository = Depends(get_repo),
) -> EnrolledBundle:
existing = await repo.get_swarm_host_by_name(req.name)
if existing is not None:
raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled")
ca = pki.ensure_ca()
sans = list({*req.sans, req.address, req.name})
issued = pki.issue_worker_cert(ca, req.name, sans)
# Persist the bundle under ~/.decnet/ca/workers/<name>/ so the master
# can replay it if the operator loses the original delivery.
bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
pki.write_worker_bundle(issued, bundle_dir)
host_uuid = str(_uuid.uuid4())
await repo.add_swarm_host(
{
"uuid": host_uuid,
"name": req.name,
"address": req.address,
"agent_port": req.agent_port,
"status": "enrolled",
"client_cert_fingerprint": issued.fingerprint_sha256,
"cert_bundle_path": str(bundle_dir),
"enrolled_at": datetime.now(timezone.utc),
"notes": req.notes,
}
)
return EnrolledBundle(
host_uuid=host_uuid,
name=req.name,
address=req.address,
agent_port=req.agent_port,
fingerprint=issued.fingerprint_sha256,
ca_cert_pem=issued.ca_cert_pem.decode(),
worker_cert_pem=issued.cert_pem.decode(),
worker_key_pem=issued.key_pem.decode(),
)
@router.get("/hosts", response_model=list[SwarmHostView])
async def list_hosts(
host_status: Optional[str] = None,
repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]:
rows = await repo.list_swarm_hosts(host_status)
return [SwarmHostView(**r) for r in rows]
@router.get("/hosts/{uuid}", response_model=SwarmHostView)
async def get_host(
uuid: str,
repo: BaseRepository = Depends(get_repo),
) -> SwarmHostView:
row = await repo.get_swarm_host_by_uuid(uuid)
if row is None:
raise HTTPException(status_code=404, detail="host not found")
return SwarmHostView(**row)
@router.delete("/hosts/{uuid}", status_code=status.HTTP_204_NO_CONTENT)
async def decommission(
uuid: str,
repo: BaseRepository = Depends(get_repo),
) -> None:
row = await repo.get_swarm_host_by_uuid(uuid)
if row is None:
raise HTTPException(status_code=404, detail="host not found")
# Remove shard rows first (we own them; cascade is portable via the repo).
await repo.delete_decky_shards_for_host(uuid)
await repo.delete_swarm_host(uuid)
# Best-effort bundle cleanup; if the dir was moved manually, don't fail.
bundle_dir = pathlib.Path(row.get("cert_bundle_path") or "")
if bundle_dir.is_dir():
for child in bundle_dir.iterdir():
try:
child.unlink()
except OSError:
pass
try:
bundle_dir.rmdir()
except OSError:
pass

65
decnet/web/swarm_api.py Normal file
View File

@@ -0,0 +1,65 @@
"""DECNET SWARM Controller — master-side control plane.
Runs as an independent FastAPI/uvicorn process. Isolated from
``decnet.web.api`` so controller failure cannot cascade to the main API,
ingester, or dashboard (mirrors the existing pattern used by
``decnet api`` with ``start_new_session=True``).
Responsibilities:
* host enrollment (issues CA-signed worker bundles);
* dispatching DecnetConfig shards to worker agents over mTLS;
* active health probes of enrolled workers.
The controller *reuses* the same ``get_repo`` dependency as the main API,
so SwarmHost / DeckyShard state is visible to both processes via the
shared DB.
"""
from __future__ import annotations
from contextlib import asynccontextmanager
from typing import AsyncGenerator
from fastapi import FastAPI
from fastapi.responses import ORJSONResponse
from decnet.logging import get_logger
from decnet.swarm import pki
from decnet.swarm.client import ensure_master_identity
from decnet.web.dependencies import repo
from decnet.web.router.swarm import swarm_router
log = get_logger("swarm_api")
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
log.info("swarm-controller starting up")
# Make sure the CA and master client cert exist before we accept any
# request — enrollment needs them and AgentClient needs them.
pki.ensure_ca()
ensure_master_identity()
await repo.initialize()
log.info("swarm-controller ready")
yield
log.info("swarm-controller shutdown")
app: FastAPI = FastAPI(
title="DECNET SWARM Controller",
version="0.1.0",
lifespan=lifespan,
default_response_class=ORJSONResponse,
# No interactive docs: the controller is an internal management plane,
# not a public surface. Enable explicitly in dev if needed.
docs_url=None,
redoc_url=None,
openapi_url=None,
)
app.include_router(swarm_router)
@app.get("/health")
async def root_health() -> dict[str, str]:
"""Top-level liveness probe (no DB I/O)."""
return {"status": "ok", "role": "swarm-controller"}