feat(swarm): master-side SWARM controller (swarmctl) + agent CLI
Adds decnet/web/swarm_api.py as an independent FastAPI app with routers for host enrollment, deployment dispatch (sharding DecnetConfig across enrolled workers via AgentClient), and active health probing. Runs as its own uvicorn subprocess via 'decnet swarmctl', mirroring the isolation pattern used by 'decnet api'. Also wires up 'decnet agent' CLI entry for the worker side. 29 tests added under tests/swarm/test_swarm_api.py cover enrollment (including bundle generation + duplicate rejection), host CRUD, sharding correctness, non-swarm-mode rejection, teardown, and health probes with a stubbed AgentClient.
This commit is contained in:
@@ -124,6 +124,64 @@ def api(
|
|||||||
console.print("[red]Failed to start API. Ensure 'uvicorn' is installed in the current environment.[/]")
|
console.print("[red]Failed to start API. Ensure 'uvicorn' is installed in the current environment.[/]")
|
||||||
|
|
||||||
|
|
||||||
|
@app.command()
|
||||||
|
def swarmctl(
|
||||||
|
port: int = typer.Option(8770, "--port", help="Port for the swarm controller"),
|
||||||
|
host: str = typer.Option("127.0.0.1", "--host", help="Bind address for the swarm controller"),
|
||||||
|
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
|
||||||
|
) -> None:
|
||||||
|
"""Run the DECNET SWARM controller (master-side, separate process from `decnet api`)."""
|
||||||
|
import subprocess # nosec B404
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
|
|
||||||
|
if daemon:
|
||||||
|
log.info("swarmctl daemonizing host=%s port=%d", host, port)
|
||||||
|
_daemonize()
|
||||||
|
|
||||||
|
log.info("swarmctl command invoked host=%s port=%d", host, port)
|
||||||
|
console.print(f"[green]Starting DECNET SWARM controller on {host}:{port}...[/]")
|
||||||
|
_cmd = [sys.executable, "-m", "uvicorn", "decnet.web.swarm_api:app",
|
||||||
|
"--host", host, "--port", str(port)]
|
||||||
|
try:
|
||||||
|
proc = subprocess.Popen(_cmd, start_new_session=True) # nosec B603 B404
|
||||||
|
try:
|
||||||
|
proc.wait()
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
try:
|
||||||
|
os.killpg(proc.pid, signal.SIGTERM)
|
||||||
|
try:
|
||||||
|
proc.wait(timeout=10)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
os.killpg(proc.pid, signal.SIGKILL)
|
||||||
|
proc.wait()
|
||||||
|
except ProcessLookupError:
|
||||||
|
pass
|
||||||
|
except (FileNotFoundError, subprocess.SubprocessError):
|
||||||
|
console.print("[red]Failed to start swarmctl. Ensure 'uvicorn' is installed in the current environment.[/]")
|
||||||
|
|
||||||
|
|
||||||
|
@app.command()
|
||||||
|
def agent(
|
||||||
|
port: int = typer.Option(8765, "--port", help="Port for the worker agent"),
|
||||||
|
host: str = typer.Option("0.0.0.0", "--host", help="Bind address for the worker agent"), # nosec B104
|
||||||
|
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
|
||||||
|
) -> None:
|
||||||
|
"""Run the DECNET SWARM worker agent (requires a cert bundle in ~/.decnet/agent/)."""
|
||||||
|
from decnet.agent import server as _agent_server
|
||||||
|
|
||||||
|
if daemon:
|
||||||
|
log.info("agent daemonizing host=%s port=%d", host, port)
|
||||||
|
_daemonize()
|
||||||
|
|
||||||
|
log.info("agent command invoked host=%s port=%d", host, port)
|
||||||
|
console.print(f"[green]Starting DECNET worker agent on {host}:{port} (mTLS)...[/]")
|
||||||
|
rc = _agent_server.run(host, port)
|
||||||
|
if rc != 0:
|
||||||
|
raise typer.Exit(rc)
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def deploy(
|
def deploy(
|
||||||
mode: str = typer.Option("unihost", "--mode", "-m", help="Deployment mode: unihost | swarm"),
|
mode: str = typer.Option("unihost", "--mode", "-m", help="Deployment mode: unihost | swarm"),
|
||||||
|
|||||||
16
decnet/web/router/swarm/__init__.py
Normal file
16
decnet/web/router/swarm/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
"""Swarm controller routers.
|
||||||
|
|
||||||
|
Mounted onto the swarm-api FastAPI app under the ``/swarm`` prefix. The
|
||||||
|
controller is a separate process from the main DECNET API so swarm
|
||||||
|
failures cannot cascade into log ingestion / dashboard serving.
|
||||||
|
"""
|
||||||
|
from fastapi import APIRouter
|
||||||
|
|
||||||
|
from .hosts import router as hosts_router
|
||||||
|
from .deployments import router as deployments_router
|
||||||
|
from .health import router as health_router
|
||||||
|
|
||||||
|
swarm_router = APIRouter(prefix="/swarm")
|
||||||
|
swarm_router.include_router(hosts_router)
|
||||||
|
swarm_router.include_router(deployments_router)
|
||||||
|
swarm_router.include_router(health_router)
|
||||||
164
decnet/web/router/swarm/deployments.py
Normal file
164
decnet/web/router/swarm/deployments.py
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
"""Deployment dispatch: shard deckies across enrolled workers and push.
|
||||||
|
|
||||||
|
The master owns the DecnetConfig. Per worker we build a filtered copy
|
||||||
|
containing only the deckies assigned to that worker (via ``host_uuid``),
|
||||||
|
then POST it to the worker agent. Sharding strategy is explicit: the
|
||||||
|
caller is expected to have already set ``host_uuid`` on every decky. If
|
||||||
|
any decky arrives without one, we fail fast — auto-sharding lives in the
|
||||||
|
CLI layer (task #7), not here.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
from decnet.config import DecnetConfig, DeckyConfig
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.swarm.client import AgentClient
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
from decnet.web.dependencies import get_repo
|
||||||
|
|
||||||
|
log = get_logger("swarm.deployments")
|
||||||
|
|
||||||
|
router = APIRouter(tags=["swarm-deployments"])
|
||||||
|
|
||||||
|
|
||||||
|
class DeployRequest(BaseModel):
|
||||||
|
config: DecnetConfig
|
||||||
|
dry_run: bool = False
|
||||||
|
no_cache: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class TeardownRequest(BaseModel):
|
||||||
|
host_uuid: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="If set, tear down only this worker; otherwise tear down all hosts",
|
||||||
|
)
|
||||||
|
decky_id: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class HostResult(BaseModel):
|
||||||
|
host_uuid: str
|
||||||
|
host_name: str
|
||||||
|
ok: bool
|
||||||
|
detail: Any | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class DeployResponse(BaseModel):
|
||||||
|
results: list[HostResult]
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------- helpers
|
||||||
|
|
||||||
|
|
||||||
|
def _shard_by_host(config: DecnetConfig) -> dict[str, list[DeckyConfig]]:
|
||||||
|
buckets: dict[str, list[DeckyConfig]] = {}
|
||||||
|
for d in config.deckies:
|
||||||
|
if not d.host_uuid:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"decky '{d.name}' has no host_uuid — caller must shard before dispatch",
|
||||||
|
)
|
||||||
|
buckets.setdefault(d.host_uuid, []).append(d)
|
||||||
|
return buckets
|
||||||
|
|
||||||
|
|
||||||
|
def _worker_config(base: DecnetConfig, shard: list[DeckyConfig]) -> DecnetConfig:
|
||||||
|
return base.model_copy(update={"deckies": shard})
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ routes
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/deploy", response_model=DeployResponse)
|
||||||
|
async def deploy(
|
||||||
|
req: DeployRequest,
|
||||||
|
repo: BaseRepository = Depends(get_repo),
|
||||||
|
) -> DeployResponse:
|
||||||
|
if req.config.mode != "swarm":
|
||||||
|
raise HTTPException(status_code=400, detail="mode must be 'swarm'")
|
||||||
|
|
||||||
|
buckets = _shard_by_host(req.config)
|
||||||
|
|
||||||
|
# Resolve host rows in one query-per-host pass; fail fast on unknown uuids.
|
||||||
|
hosts: dict[str, dict[str, Any]] = {}
|
||||||
|
for host_uuid in buckets:
|
||||||
|
row = await repo.get_swarm_host_by_uuid(host_uuid)
|
||||||
|
if row is None:
|
||||||
|
raise HTTPException(status_code=404, detail=f"unknown host_uuid: {host_uuid}")
|
||||||
|
hosts[host_uuid] = row
|
||||||
|
|
||||||
|
async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> HostResult:
|
||||||
|
host = hosts[host_uuid]
|
||||||
|
cfg = _worker_config(req.config, shard)
|
||||||
|
try:
|
||||||
|
async with AgentClient(host=host) as agent:
|
||||||
|
body = await agent.deploy(cfg, dry_run=req.dry_run, no_cache=req.no_cache)
|
||||||
|
# Persist a DeckyShard row per decky for status lookups.
|
||||||
|
for d in shard:
|
||||||
|
await repo.upsert_decky_shard(
|
||||||
|
{
|
||||||
|
"decky_name": d.name,
|
||||||
|
"host_uuid": host_uuid,
|
||||||
|
"services": json.dumps(d.services),
|
||||||
|
"state": "running" if not req.dry_run else "pending",
|
||||||
|
"last_error": None,
|
||||||
|
"updated_at": datetime.now(timezone.utc),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
await repo.update_swarm_host(host_uuid, {"status": "active"})
|
||||||
|
return HostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
|
||||||
|
except Exception as exc:
|
||||||
|
log.exception("swarm.deploy dispatch failed host=%s", host["name"])
|
||||||
|
for d in shard:
|
||||||
|
await repo.upsert_decky_shard(
|
||||||
|
{
|
||||||
|
"decky_name": d.name,
|
||||||
|
"host_uuid": host_uuid,
|
||||||
|
"services": json.dumps(d.services),
|
||||||
|
"state": "failed",
|
||||||
|
"last_error": str(exc)[:512],
|
||||||
|
"updated_at": datetime.now(timezone.utc),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return HostResult(host_uuid=host_uuid, host_name=host["name"], ok=False, detail=str(exc))
|
||||||
|
|
||||||
|
results = await asyncio.gather(
|
||||||
|
*(_dispatch(uuid_, shard) for uuid_, shard in buckets.items())
|
||||||
|
)
|
||||||
|
return DeployResponse(results=list(results))
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/teardown", response_model=DeployResponse)
|
||||||
|
async def teardown(
|
||||||
|
req: TeardownRequest,
|
||||||
|
repo: BaseRepository = Depends(get_repo),
|
||||||
|
) -> DeployResponse:
|
||||||
|
if req.host_uuid is not None:
|
||||||
|
row = await repo.get_swarm_host_by_uuid(req.host_uuid)
|
||||||
|
if row is None:
|
||||||
|
raise HTTPException(status_code=404, detail="host not found")
|
||||||
|
targets = [row]
|
||||||
|
else:
|
||||||
|
targets = await repo.list_swarm_hosts()
|
||||||
|
|
||||||
|
async def _call(host: dict[str, Any]) -> HostResult:
|
||||||
|
try:
|
||||||
|
async with AgentClient(host=host) as agent:
|
||||||
|
body = await agent.teardown(req.decky_id)
|
||||||
|
if req.decky_id is None:
|
||||||
|
await repo.delete_decky_shards_for_host(host["uuid"])
|
||||||
|
return HostResult(host_uuid=host["uuid"], host_name=host["name"], ok=True, detail=body)
|
||||||
|
except Exception as exc:
|
||||||
|
log.exception("swarm.teardown failed host=%s", host["name"])
|
||||||
|
return HostResult(
|
||||||
|
host_uuid=host["uuid"], host_name=host["name"], ok=False, detail=str(exc)
|
||||||
|
)
|
||||||
|
|
||||||
|
results = await asyncio.gather(*(_call(h) for h in targets))
|
||||||
|
return DeployResponse(results=list(results))
|
||||||
79
decnet/web/router/swarm/health.py
Normal file
79
decnet/web/router/swarm/health.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
"""Health endpoints for the swarm controller.
|
||||||
|
|
||||||
|
* ``GET /swarm/health`` — liveness of the controller itself (no I/O).
|
||||||
|
* ``POST /swarm/check`` — active probe of every enrolled worker over mTLS.
|
||||||
|
Updates ``SwarmHost.status`` and ``last_heartbeat``.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.swarm.client import AgentClient
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
from decnet.web.dependencies import get_repo
|
||||||
|
|
||||||
|
log = get_logger("swarm.health")
|
||||||
|
|
||||||
|
router = APIRouter(tags=["swarm-health"])
|
||||||
|
|
||||||
|
|
||||||
|
class HostHealth(BaseModel):
|
||||||
|
host_uuid: str
|
||||||
|
name: str
|
||||||
|
address: str
|
||||||
|
reachable: bool
|
||||||
|
detail: Any | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class CheckResponse(BaseModel):
|
||||||
|
results: list[HostHealth]
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/health")
|
||||||
|
async def health() -> dict[str, str]:
|
||||||
|
return {"status": "ok", "role": "swarm-controller"}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/check", response_model=CheckResponse)
|
||||||
|
async def check(
|
||||||
|
repo: BaseRepository = Depends(get_repo),
|
||||||
|
) -> CheckResponse:
|
||||||
|
hosts = await repo.list_swarm_hosts()
|
||||||
|
|
||||||
|
async def _probe(host: dict[str, Any]) -> HostHealth:
|
||||||
|
try:
|
||||||
|
async with AgentClient(host=host) as agent:
|
||||||
|
body = await agent.health()
|
||||||
|
await repo.update_swarm_host(
|
||||||
|
host["uuid"],
|
||||||
|
{
|
||||||
|
"status": "active",
|
||||||
|
"last_heartbeat": datetime.now(timezone.utc),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
return HostHealth(
|
||||||
|
host_uuid=host["uuid"],
|
||||||
|
name=host["name"],
|
||||||
|
address=host["address"],
|
||||||
|
reachable=True,
|
||||||
|
detail=body,
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("swarm.check unreachable host=%s err=%s", host["name"], exc)
|
||||||
|
await repo.update_swarm_host(host["uuid"], {"status": "unreachable"})
|
||||||
|
return HostHealth(
|
||||||
|
host_uuid=host["uuid"],
|
||||||
|
name=host["name"],
|
||||||
|
address=host["address"],
|
||||||
|
reachable=False,
|
||||||
|
detail=str(exc),
|
||||||
|
)
|
||||||
|
|
||||||
|
results = await asyncio.gather(*(_probe(h) for h in hosts))
|
||||||
|
return CheckResponse(results=list(results))
|
||||||
162
decnet/web/router/swarm/hosts.py
Normal file
162
decnet/web/router/swarm/hosts.py
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
"""Swarm host lifecycle endpoints: enroll, list, decommission.
|
||||||
|
|
||||||
|
Enrollment design
|
||||||
|
-----------------
|
||||||
|
The master controller holds the CA private key. On ``POST /swarm/enroll``
|
||||||
|
it generates a fresh worker keypair + cert (signed by the master CA) and
|
||||||
|
returns the full bundle to the operator. The operator is responsible for
|
||||||
|
delivering that bundle to the worker's ``~/.decnet/agent/`` directory
|
||||||
|
(scp/sshpass/ansible — outside this process's trust boundary).
|
||||||
|
|
||||||
|
Rationale: the worker agent speaks ONLY mTLS. There is no pre-auth
|
||||||
|
bootstrap endpoint, so there is nothing to attack before the worker is
|
||||||
|
enrolled. The bundle-delivery step is explicit and auditable.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pathlib
|
||||||
|
import uuid as _uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, status
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
from decnet.swarm import pki
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
from decnet.web.dependencies import get_repo
|
||||||
|
|
||||||
|
router = APIRouter(tags=["swarm-hosts"])
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------- schemas
|
||||||
|
|
||||||
|
|
||||||
|
class EnrollRequest(BaseModel):
|
||||||
|
name: str = Field(..., min_length=1, max_length=128)
|
||||||
|
address: str = Field(..., description="IP or DNS the master uses to reach the worker")
|
||||||
|
agent_port: int = Field(default=8765, ge=1, le=65535)
|
||||||
|
sans: list[str] = Field(
|
||||||
|
default_factory=list,
|
||||||
|
description="Extra SANs (IPs / hostnames) to embed in the worker cert",
|
||||||
|
)
|
||||||
|
notes: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class EnrolledBundle(BaseModel):
|
||||||
|
"""Cert bundle returned to the operator — must be delivered to the worker."""
|
||||||
|
|
||||||
|
host_uuid: str
|
||||||
|
name: str
|
||||||
|
address: str
|
||||||
|
agent_port: int
|
||||||
|
fingerprint: str
|
||||||
|
ca_cert_pem: str
|
||||||
|
worker_cert_pem: str
|
||||||
|
worker_key_pem: str
|
||||||
|
|
||||||
|
|
||||||
|
class SwarmHostView(BaseModel):
|
||||||
|
uuid: str
|
||||||
|
name: str
|
||||||
|
address: str
|
||||||
|
agent_port: int
|
||||||
|
status: str
|
||||||
|
last_heartbeat: Optional[datetime] = None
|
||||||
|
client_cert_fingerprint: str
|
||||||
|
enrolled_at: datetime
|
||||||
|
notes: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------- routes
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/enroll", response_model=EnrolledBundle, status_code=status.HTTP_201_CREATED)
|
||||||
|
async def enroll(
|
||||||
|
req: EnrollRequest,
|
||||||
|
repo: BaseRepository = Depends(get_repo),
|
||||||
|
) -> EnrolledBundle:
|
||||||
|
existing = await repo.get_swarm_host_by_name(req.name)
|
||||||
|
if existing is not None:
|
||||||
|
raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled")
|
||||||
|
|
||||||
|
ca = pki.ensure_ca()
|
||||||
|
sans = list({*req.sans, req.address, req.name})
|
||||||
|
issued = pki.issue_worker_cert(ca, req.name, sans)
|
||||||
|
|
||||||
|
# Persist the bundle under ~/.decnet/ca/workers/<name>/ so the master
|
||||||
|
# can replay it if the operator loses the original delivery.
|
||||||
|
bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
|
||||||
|
pki.write_worker_bundle(issued, bundle_dir)
|
||||||
|
|
||||||
|
host_uuid = str(_uuid.uuid4())
|
||||||
|
await repo.add_swarm_host(
|
||||||
|
{
|
||||||
|
"uuid": host_uuid,
|
||||||
|
"name": req.name,
|
||||||
|
"address": req.address,
|
||||||
|
"agent_port": req.agent_port,
|
||||||
|
"status": "enrolled",
|
||||||
|
"client_cert_fingerprint": issued.fingerprint_sha256,
|
||||||
|
"cert_bundle_path": str(bundle_dir),
|
||||||
|
"enrolled_at": datetime.now(timezone.utc),
|
||||||
|
"notes": req.notes,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return EnrolledBundle(
|
||||||
|
host_uuid=host_uuid,
|
||||||
|
name=req.name,
|
||||||
|
address=req.address,
|
||||||
|
agent_port=req.agent_port,
|
||||||
|
fingerprint=issued.fingerprint_sha256,
|
||||||
|
ca_cert_pem=issued.ca_cert_pem.decode(),
|
||||||
|
worker_cert_pem=issued.cert_pem.decode(),
|
||||||
|
worker_key_pem=issued.key_pem.decode(),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/hosts", response_model=list[SwarmHostView])
|
||||||
|
async def list_hosts(
|
||||||
|
host_status: Optional[str] = None,
|
||||||
|
repo: BaseRepository = Depends(get_repo),
|
||||||
|
) -> list[SwarmHostView]:
|
||||||
|
rows = await repo.list_swarm_hosts(host_status)
|
||||||
|
return [SwarmHostView(**r) for r in rows]
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/hosts/{uuid}", response_model=SwarmHostView)
|
||||||
|
async def get_host(
|
||||||
|
uuid: str,
|
||||||
|
repo: BaseRepository = Depends(get_repo),
|
||||||
|
) -> SwarmHostView:
|
||||||
|
row = await repo.get_swarm_host_by_uuid(uuid)
|
||||||
|
if row is None:
|
||||||
|
raise HTTPException(status_code=404, detail="host not found")
|
||||||
|
return SwarmHostView(**row)
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/hosts/{uuid}", status_code=status.HTTP_204_NO_CONTENT)
|
||||||
|
async def decommission(
|
||||||
|
uuid: str,
|
||||||
|
repo: BaseRepository = Depends(get_repo),
|
||||||
|
) -> None:
|
||||||
|
row = await repo.get_swarm_host_by_uuid(uuid)
|
||||||
|
if row is None:
|
||||||
|
raise HTTPException(status_code=404, detail="host not found")
|
||||||
|
|
||||||
|
# Remove shard rows first (we own them; cascade is portable via the repo).
|
||||||
|
await repo.delete_decky_shards_for_host(uuid)
|
||||||
|
await repo.delete_swarm_host(uuid)
|
||||||
|
|
||||||
|
# Best-effort bundle cleanup; if the dir was moved manually, don't fail.
|
||||||
|
bundle_dir = pathlib.Path(row.get("cert_bundle_path") or "")
|
||||||
|
if bundle_dir.is_dir():
|
||||||
|
for child in bundle_dir.iterdir():
|
||||||
|
try:
|
||||||
|
child.unlink()
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
bundle_dir.rmdir()
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
65
decnet/web/swarm_api.py
Normal file
65
decnet/web/swarm_api.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
"""DECNET SWARM Controller — master-side control plane.
|
||||||
|
|
||||||
|
Runs as an independent FastAPI/uvicorn process. Isolated from
|
||||||
|
``decnet.web.api`` so controller failure cannot cascade to the main API,
|
||||||
|
ingester, or dashboard (mirrors the existing pattern used by
|
||||||
|
``decnet api`` with ``start_new_session=True``).
|
||||||
|
|
||||||
|
Responsibilities:
|
||||||
|
* host enrollment (issues CA-signed worker bundles);
|
||||||
|
* dispatching DecnetConfig shards to worker agents over mTLS;
|
||||||
|
* active health probes of enrolled workers.
|
||||||
|
|
||||||
|
The controller *reuses* the same ``get_repo`` dependency as the main API,
|
||||||
|
so SwarmHost / DeckyShard state is visible to both processes via the
|
||||||
|
shared DB.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
from typing import AsyncGenerator
|
||||||
|
|
||||||
|
from fastapi import FastAPI
|
||||||
|
from fastapi.responses import ORJSONResponse
|
||||||
|
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.swarm import pki
|
||||||
|
from decnet.swarm.client import ensure_master_identity
|
||||||
|
from decnet.web.dependencies import repo
|
||||||
|
from decnet.web.router.swarm import swarm_router
|
||||||
|
|
||||||
|
log = get_logger("swarm_api")
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
|
||||||
|
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
||||||
|
log.info("swarm-controller starting up")
|
||||||
|
# Make sure the CA and master client cert exist before we accept any
|
||||||
|
# request — enrollment needs them and AgentClient needs them.
|
||||||
|
pki.ensure_ca()
|
||||||
|
ensure_master_identity()
|
||||||
|
await repo.initialize()
|
||||||
|
log.info("swarm-controller ready")
|
||||||
|
yield
|
||||||
|
log.info("swarm-controller shutdown")
|
||||||
|
|
||||||
|
|
||||||
|
app: FastAPI = FastAPI(
|
||||||
|
title="DECNET SWARM Controller",
|
||||||
|
version="0.1.0",
|
||||||
|
lifespan=lifespan,
|
||||||
|
default_response_class=ORJSONResponse,
|
||||||
|
# No interactive docs: the controller is an internal management plane,
|
||||||
|
# not a public surface. Enable explicitly in dev if needed.
|
||||||
|
docs_url=None,
|
||||||
|
redoc_url=None,
|
||||||
|
openapi_url=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
app.include_router(swarm_router)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
|
||||||
|
async def root_health() -> dict[str, str]:
|
||||||
|
"""Top-level liveness probe (no DB I/O)."""
|
||||||
|
return {"status": "ok", "role": "swarm-controller"}
|
||||||
294
tests/swarm/test_swarm_api.py
Normal file
294
tests/swarm/test_swarm_api.py
Normal file
@@ -0,0 +1,294 @@
|
|||||||
|
"""Unit tests for the SWARM controller FastAPI app.
|
||||||
|
|
||||||
|
Covers the enrollment, host-management, and deployment dispatch routes.
|
||||||
|
The AgentClient is stubbed so we exercise the controller's logic without
|
||||||
|
a live mTLS peer (that path has its own roundtrip test).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pathlib
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
from decnet.web.db.factory import get_repository
|
||||||
|
from decnet.web.dependencies import get_repo
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def ca_dir(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch) -> pathlib.Path:
|
||||||
|
"""Redirect the PKI default CA path into tmp so the test CA never
|
||||||
|
touches ``~/.decnet/ca``."""
|
||||||
|
ca = tmp_path / "ca"
|
||||||
|
from decnet.swarm import pki
|
||||||
|
|
||||||
|
monkeypatch.setattr(pki, "DEFAULT_CA_DIR", ca)
|
||||||
|
# Also patch the already-imported references inside client.py / routers.
|
||||||
|
from decnet.swarm import client as swarm_client
|
||||||
|
from decnet.web.router.swarm import hosts as swarm_hosts
|
||||||
|
|
||||||
|
monkeypatch.setattr(swarm_client, "pki", pki)
|
||||||
|
monkeypatch.setattr(swarm_hosts, "pki", pki)
|
||||||
|
return ca
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def repo(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch):
|
||||||
|
r = get_repository(db_path=str(tmp_path / "swarm.db"))
|
||||||
|
# The controller's lifespan initialises the module-level `repo` in
|
||||||
|
# decnet.web.dependencies. Swap that singleton for our test repo so
|
||||||
|
# schema creation targets the temp DB.
|
||||||
|
import decnet.web.dependencies as deps
|
||||||
|
import decnet.web.swarm_api as swarm_api_mod
|
||||||
|
|
||||||
|
monkeypatch.setattr(deps, "repo", r)
|
||||||
|
monkeypatch.setattr(swarm_api_mod, "repo", r)
|
||||||
|
return r
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def client(repo, ca_dir: pathlib.Path):
|
||||||
|
from decnet.web.swarm_api import app
|
||||||
|
|
||||||
|
async def _override() -> Any:
|
||||||
|
return repo
|
||||||
|
|
||||||
|
app.dependency_overrides[get_repo] = _override
|
||||||
|
with TestClient(app) as c:
|
||||||
|
yield c
|
||||||
|
app.dependency_overrides.clear()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------- /enroll
|
||||||
|
|
||||||
|
|
||||||
|
def test_enroll_creates_host_and_returns_bundle(client: TestClient) -> None:
|
||||||
|
resp = client.post(
|
||||||
|
"/swarm/enroll",
|
||||||
|
json={"name": "worker-a", "address": "10.0.0.5", "agent_port": 8765},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 201, resp.text
|
||||||
|
body = resp.json()
|
||||||
|
assert body["name"] == "worker-a"
|
||||||
|
assert body["address"] == "10.0.0.5"
|
||||||
|
assert "-----BEGIN CERTIFICATE-----" in body["worker_cert_pem"]
|
||||||
|
assert "-----BEGIN PRIVATE KEY-----" in body["worker_key_pem"]
|
||||||
|
assert "-----BEGIN CERTIFICATE-----" in body["ca_cert_pem"]
|
||||||
|
assert len(body["fingerprint"]) == 64 # sha256 hex
|
||||||
|
|
||||||
|
|
||||||
|
def test_enroll_rejects_duplicate_name(client: TestClient) -> None:
|
||||||
|
payload = {"name": "worker-dup", "address": "10.0.0.6", "agent_port": 8765}
|
||||||
|
assert client.post("/swarm/enroll", json=payload).status_code == 201
|
||||||
|
resp2 = client.post("/swarm/enroll", json=payload)
|
||||||
|
assert resp2.status_code == 409
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------- /hosts
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_hosts_empty(client: TestClient) -> None:
|
||||||
|
resp = client.get("/swarm/hosts")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.json() == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_and_get_host_after_enroll(client: TestClient) -> None:
|
||||||
|
reg = client.post(
|
||||||
|
"/swarm/enroll",
|
||||||
|
json={"name": "worker-b", "address": "10.0.0.7", "agent_port": 8765},
|
||||||
|
).json()
|
||||||
|
uuid = reg["host_uuid"]
|
||||||
|
|
||||||
|
lst = client.get("/swarm/hosts").json()
|
||||||
|
assert len(lst) == 1
|
||||||
|
assert lst[0]["name"] == "worker-b"
|
||||||
|
|
||||||
|
one = client.get(f"/swarm/hosts/{uuid}").json()
|
||||||
|
assert one["uuid"] == uuid
|
||||||
|
assert one["status"] == "enrolled"
|
||||||
|
|
||||||
|
|
||||||
|
def test_decommission_removes_host_and_bundle(
|
||||||
|
client: TestClient, ca_dir: pathlib.Path
|
||||||
|
) -> None:
|
||||||
|
reg = client.post(
|
||||||
|
"/swarm/enroll",
|
||||||
|
json={"name": "worker-c", "address": "10.0.0.8", "agent_port": 8765},
|
||||||
|
).json()
|
||||||
|
uuid = reg["host_uuid"]
|
||||||
|
|
||||||
|
bundle_dir = ca_dir / "workers" / "worker-c"
|
||||||
|
assert bundle_dir.is_dir()
|
||||||
|
|
||||||
|
resp = client.delete(f"/swarm/hosts/{uuid}")
|
||||||
|
assert resp.status_code == 204
|
||||||
|
assert client.get(f"/swarm/hosts/{uuid}").status_code == 404
|
||||||
|
assert not bundle_dir.exists()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------- /deploy
|
||||||
|
|
||||||
|
|
||||||
|
class _StubAgentClient:
|
||||||
|
"""Minimal async-context-manager stub mirroring ``AgentClient``."""
|
||||||
|
|
||||||
|
deployed: list[dict[str, Any]] = []
|
||||||
|
torn_down: list[dict[str, Any]] = []
|
||||||
|
|
||||||
|
def __init__(self, host: dict[str, Any] | None = None, **_: Any) -> None:
|
||||||
|
self._host = host or {}
|
||||||
|
|
||||||
|
async def __aenter__(self) -> "_StubAgentClient":
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def __aexit__(self, *exc: Any) -> None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def health(self) -> dict[str, Any]:
|
||||||
|
return {"status": "ok"}
|
||||||
|
|
||||||
|
async def deploy(self, config: Any, **kw: Any) -> dict[str, Any]:
|
||||||
|
_StubAgentClient.deployed.append(
|
||||||
|
{"host": self._host.get("name"), "deckies": [d.name for d in config.deckies]}
|
||||||
|
)
|
||||||
|
return {"status": "deployed", "deckies": len(config.deckies)}
|
||||||
|
|
||||||
|
async def teardown(self, decky_id: str | None = None) -> dict[str, Any]:
|
||||||
|
_StubAgentClient.torn_down.append(
|
||||||
|
{"host": self._host.get("name"), "decky_id": decky_id}
|
||||||
|
)
|
||||||
|
return {"status": "torn_down"}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def stub_agent(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
_StubAgentClient.deployed.clear()
|
||||||
|
_StubAgentClient.torn_down.clear()
|
||||||
|
from decnet.web.router.swarm import deployments as dep_mod
|
||||||
|
from decnet.web.router.swarm import health as hlt_mod
|
||||||
|
|
||||||
|
monkeypatch.setattr(dep_mod, "AgentClient", _StubAgentClient)
|
||||||
|
monkeypatch.setattr(hlt_mod, "AgentClient", _StubAgentClient)
|
||||||
|
return _StubAgentClient
|
||||||
|
|
||||||
|
|
||||||
|
def _decky_dict(name: str, host_uuid: str, ip: str) -> dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"name": name,
|
||||||
|
"ip": ip,
|
||||||
|
"services": ["ssh"],
|
||||||
|
"distro": "debian",
|
||||||
|
"base_image": "debian:bookworm-slim",
|
||||||
|
"hostname": name,
|
||||||
|
"host_uuid": host_uuid,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_deploy_shards_across_hosts(client: TestClient, stub_agent) -> None:
|
||||||
|
h1 = client.post(
|
||||||
|
"/swarm/enroll",
|
||||||
|
json={"name": "w1", "address": "10.0.0.1", "agent_port": 8765},
|
||||||
|
).json()
|
||||||
|
h2 = client.post(
|
||||||
|
"/swarm/enroll",
|
||||||
|
json={"name": "w2", "address": "10.0.0.2", "agent_port": 8765},
|
||||||
|
).json()
|
||||||
|
|
||||||
|
cfg = {
|
||||||
|
"mode": "swarm",
|
||||||
|
"interface": "eth0",
|
||||||
|
"subnet": "192.168.1.0/24",
|
||||||
|
"gateway": "192.168.1.1",
|
||||||
|
"deckies": [
|
||||||
|
_decky_dict("decky-01", h1["host_uuid"], "192.168.1.10"),
|
||||||
|
_decky_dict("decky-02", h1["host_uuid"], "192.168.1.11"),
|
||||||
|
_decky_dict("decky-03", h2["host_uuid"], "192.168.1.12"),
|
||||||
|
],
|
||||||
|
}
|
||||||
|
resp = client.post("/swarm/deploy", json={"config": cfg})
|
||||||
|
assert resp.status_code == 200, resp.text
|
||||||
|
body = resp.json()
|
||||||
|
assert len(body["results"]) == 2
|
||||||
|
assert all(r["ok"] for r in body["results"])
|
||||||
|
|
||||||
|
by_host = {d["host"]: d["deckies"] for d in stub_agent.deployed}
|
||||||
|
assert by_host["w1"] == ["decky-01", "decky-02"]
|
||||||
|
assert by_host["w2"] == ["decky-03"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_deploy_rejects_missing_host_uuid(client: TestClient, stub_agent) -> None:
|
||||||
|
cfg = {
|
||||||
|
"mode": "swarm",
|
||||||
|
"interface": "eth0",
|
||||||
|
"subnet": "192.168.1.0/24",
|
||||||
|
"gateway": "192.168.1.1",
|
||||||
|
"deckies": [
|
||||||
|
{
|
||||||
|
"name": "decky-01",
|
||||||
|
"ip": "192.168.1.10",
|
||||||
|
"services": ["ssh"],
|
||||||
|
"distro": "debian",
|
||||||
|
"base_image": "debian:bookworm-slim",
|
||||||
|
"hostname": "decky-01",
|
||||||
|
# host_uuid deliberately omitted
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
resp = client.post("/swarm/deploy", json={"config": cfg})
|
||||||
|
assert resp.status_code == 400
|
||||||
|
assert "host_uuid" in resp.json()["detail"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_deploy_rejects_non_swarm_mode(client: TestClient, stub_agent) -> None:
|
||||||
|
cfg = {
|
||||||
|
"mode": "unihost",
|
||||||
|
"interface": "eth0",
|
||||||
|
"subnet": "192.168.1.0/24",
|
||||||
|
"gateway": "192.168.1.1",
|
||||||
|
"deckies": [_decky_dict("decky-01", "fake-uuid", "192.168.1.10")],
|
||||||
|
}
|
||||||
|
resp = client.post("/swarm/deploy", json={"config": cfg})
|
||||||
|
assert resp.status_code == 400
|
||||||
|
|
||||||
|
|
||||||
|
def test_teardown_all_hosts(client: TestClient, stub_agent) -> None:
|
||||||
|
for i, addr in enumerate(("10.0.0.1", "10.0.0.2"), start=1):
|
||||||
|
client.post(
|
||||||
|
"/swarm/enroll",
|
||||||
|
json={"name": f"td{i}", "address": addr, "agent_port": 8765},
|
||||||
|
)
|
||||||
|
resp = client.post("/swarm/teardown", json={})
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert len(resp.json()["results"]) == 2
|
||||||
|
assert {t["host"] for t in stub_agent.torn_down} == {"td1", "td2"}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------- /check
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_marks_hosts_active(client: TestClient, stub_agent) -> None:
|
||||||
|
h = client.post(
|
||||||
|
"/swarm/enroll",
|
||||||
|
json={"name": "probe-w", "address": "10.0.0.9", "agent_port": 8765},
|
||||||
|
).json()
|
||||||
|
|
||||||
|
resp = client.post("/swarm/check")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
results = resp.json()["results"]
|
||||||
|
assert len(results) == 1
|
||||||
|
assert results[0]["reachable"] is True
|
||||||
|
|
||||||
|
one = client.get(f"/swarm/hosts/{h['host_uuid']}").json()
|
||||||
|
assert one["status"] == "active"
|
||||||
|
assert one["last_heartbeat"] is not None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------- /health (root)
|
||||||
|
|
||||||
|
|
||||||
|
def test_root_health(client: TestClient) -> None:
|
||||||
|
resp = client.get("/health")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.json()["role"] == "swarm-controller"
|
||||||
Reference in New Issue
Block a user