Files
DECNET/decnet/web/router/swarm/hosts.py
anti 63b0a58527 feat(swarm): master-side SWARM controller (swarmctl) + agent CLI
Adds decnet/web/swarm_api.py as an independent FastAPI app with routers
for host enrollment, deployment dispatch (sharding DecnetConfig across
enrolled workers via AgentClient), and active health probing. Runs as
its own uvicorn subprocess via 'decnet swarmctl', mirroring the isolation
pattern used by 'decnet api'. Also wires up 'decnet agent' CLI entry for
the worker side.

29 tests added under tests/swarm/test_swarm_api.py cover enrollment
(including bundle generation + duplicate rejection), host CRUD, sharding
correctness, non-swarm-mode rejection, teardown, and health probes with
a stubbed AgentClient.
2026-04-18 19:18:33 -04:00

163 lines
5.1 KiB
Python

"""Swarm host lifecycle endpoints: enroll, list, decommission.
Enrollment design
-----------------
The master controller holds the CA private key. On ``POST /swarm/enroll``
it generates a fresh worker keypair + cert (signed by the master CA) and
returns the full bundle to the operator. The operator is responsible for
delivering that bundle to the worker's ``~/.decnet/agent/`` directory
(scp/sshpass/ansible — outside this process's trust boundary).
Rationale: the worker agent speaks ONLY mTLS. There is no pre-auth
bootstrap endpoint, so there is nothing to attack before the worker is
enrolled. The bundle-delivery step is explicit and auditable.
"""
from __future__ import annotations
import pathlib
import uuid as _uuid
from datetime import datetime, timezone
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, Field
from decnet.swarm import pki
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
router = APIRouter(tags=["swarm-hosts"])
# ------------------------------------------------------------------- schemas
class EnrollRequest(BaseModel):
name: str = Field(..., min_length=1, max_length=128)
address: str = Field(..., description="IP or DNS the master uses to reach the worker")
agent_port: int = Field(default=8765, ge=1, le=65535)
sans: list[str] = Field(
default_factory=list,
description="Extra SANs (IPs / hostnames) to embed in the worker cert",
)
notes: Optional[str] = None
class EnrolledBundle(BaseModel):
"""Cert bundle returned to the operator — must be delivered to the worker."""
host_uuid: str
name: str
address: str
agent_port: int
fingerprint: str
ca_cert_pem: str
worker_cert_pem: str
worker_key_pem: str
class SwarmHostView(BaseModel):
uuid: str
name: str
address: str
agent_port: int
status: str
last_heartbeat: Optional[datetime] = None
client_cert_fingerprint: str
enrolled_at: datetime
notes: Optional[str] = None
# ------------------------------------------------------------------- routes
@router.post("/enroll", response_model=EnrolledBundle, status_code=status.HTTP_201_CREATED)
async def enroll(
req: EnrollRequest,
repo: BaseRepository = Depends(get_repo),
) -> EnrolledBundle:
existing = await repo.get_swarm_host_by_name(req.name)
if existing is not None:
raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled")
ca = pki.ensure_ca()
sans = list({*req.sans, req.address, req.name})
issued = pki.issue_worker_cert(ca, req.name, sans)
# Persist the bundle under ~/.decnet/ca/workers/<name>/ so the master
# can replay it if the operator loses the original delivery.
bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
pki.write_worker_bundle(issued, bundle_dir)
host_uuid = str(_uuid.uuid4())
await repo.add_swarm_host(
{
"uuid": host_uuid,
"name": req.name,
"address": req.address,
"agent_port": req.agent_port,
"status": "enrolled",
"client_cert_fingerprint": issued.fingerprint_sha256,
"cert_bundle_path": str(bundle_dir),
"enrolled_at": datetime.now(timezone.utc),
"notes": req.notes,
}
)
return EnrolledBundle(
host_uuid=host_uuid,
name=req.name,
address=req.address,
agent_port=req.agent_port,
fingerprint=issued.fingerprint_sha256,
ca_cert_pem=issued.ca_cert_pem.decode(),
worker_cert_pem=issued.cert_pem.decode(),
worker_key_pem=issued.key_pem.decode(),
)
@router.get("/hosts", response_model=list[SwarmHostView])
async def list_hosts(
host_status: Optional[str] = None,
repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]:
rows = await repo.list_swarm_hosts(host_status)
return [SwarmHostView(**r) for r in rows]
@router.get("/hosts/{uuid}", response_model=SwarmHostView)
async def get_host(
uuid: str,
repo: BaseRepository = Depends(get_repo),
) -> SwarmHostView:
row = await repo.get_swarm_host_by_uuid(uuid)
if row is None:
raise HTTPException(status_code=404, detail="host not found")
return SwarmHostView(**row)
@router.delete("/hosts/{uuid}", status_code=status.HTTP_204_NO_CONTENT)
async def decommission(
uuid: str,
repo: BaseRepository = Depends(get_repo),
) -> None:
row = await repo.get_swarm_host_by_uuid(uuid)
if row is None:
raise HTTPException(status_code=404, detail="host not found")
# Remove shard rows first (we own them; cascade is portable via the repo).
await repo.delete_decky_shards_for_host(uuid)
await repo.delete_swarm_host(uuid)
# Best-effort bundle cleanup; if the dir was moved manually, don't fail.
bundle_dir = pathlib.Path(row.get("cert_bundle_path") or "")
if bundle_dir.is_dir():
for child in bundle_dir.iterdir():
try:
child.unlink()
except OSError:
pass
try:
bundle_dir.rmdir()
except OSError:
pass