refactor(swarm): one file per endpoint, matching existing router layout
Splits the three grouped router files into eight api_<verb>_<resource>.py modules under decnet/web/router/swarm/ to match the convention used by router/fleet/ and router/config/. Shared request/response models live in _schemas.py. Keeps each endpoint easy to locate and modify without stepping on siblings.
This commit is contained in:
@@ -1,16 +1,33 @@
|
|||||||
"""Swarm controller routers.
|
"""Swarm controller routers.
|
||||||
|
|
||||||
Mounted onto the swarm-api FastAPI app under the ``/swarm`` prefix. The
|
One file per endpoint, aggregated under the ``/swarm`` prefix. Mounted
|
||||||
controller is a separate process from the main DECNET API so swarm
|
onto the swarm-api FastAPI app (``decnet/web/swarm_api.py``), a separate
|
||||||
failures cannot cascade into log ingestion / dashboard serving.
|
process from the main DECNET API so swarm failures cannot cascade into
|
||||||
|
log ingestion / dashboard serving.
|
||||||
"""
|
"""
|
||||||
from fastapi import APIRouter
|
from fastapi import APIRouter
|
||||||
|
|
||||||
from .hosts import router as hosts_router
|
from .api_enroll_host import router as enroll_host_router
|
||||||
from .deployments import router as deployments_router
|
from .api_list_hosts import router as list_hosts_router
|
||||||
from .health import router as health_router
|
from .api_get_host import router as get_host_router
|
||||||
|
from .api_decommission_host import router as decommission_host_router
|
||||||
|
from .api_deploy_swarm import router as deploy_swarm_router
|
||||||
|
from .api_teardown_swarm import router as teardown_swarm_router
|
||||||
|
from .api_get_swarm_health import router as get_swarm_health_router
|
||||||
|
from .api_check_hosts import router as check_hosts_router
|
||||||
|
|
||||||
swarm_router = APIRouter(prefix="/swarm")
|
swarm_router = APIRouter(prefix="/swarm")
|
||||||
swarm_router.include_router(hosts_router)
|
|
||||||
swarm_router.include_router(deployments_router)
|
# Hosts
|
||||||
swarm_router.include_router(health_router)
|
swarm_router.include_router(enroll_host_router)
|
||||||
|
swarm_router.include_router(list_hosts_router)
|
||||||
|
swarm_router.include_router(get_host_router)
|
||||||
|
swarm_router.include_router(decommission_host_router)
|
||||||
|
|
||||||
|
# Deployments
|
||||||
|
swarm_router.include_router(deploy_swarm_router)
|
||||||
|
swarm_router.include_router(teardown_swarm_router)
|
||||||
|
|
||||||
|
# Health
|
||||||
|
swarm_router.include_router(get_swarm_health_router)
|
||||||
|
swarm_router.include_router(check_hosts_router)
|
||||||
|
|||||||
82
decnet/web/router/swarm/_schemas.py
Normal file
82
decnet/web/router/swarm/_schemas.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
"""Request/response models shared across the swarm router endpoints."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
from decnet.config import DecnetConfig
|
||||||
|
|
||||||
|
|
||||||
|
# Request body for POST /swarm/enroll: identifies the worker to register and
# any extra subject-alternative names to embed in its certificate.
class EnrollRequest(BaseModel):
    # Worker name; 1-128 characters.
    name: str = Field(
        ...,
        min_length=1,
        max_length=128,
    )
    address: str = Field(
        ...,
        description="IP or DNS the master uses to reach the worker",
    )
    # Port of the worker agent; must be a valid port number.
    agent_port: int = Field(default=8765, ge=1, le=65535)
    sans: list[str] = Field(
        description="Extra SANs (IPs / hostnames) to embed in the worker cert",
        default_factory=list,
    )
    # Free-form operator notes stored with the host row.
    notes: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class EnrolledBundle(BaseModel):
    """Cert bundle returned to the operator — must be delivered to the worker."""

    # Identity of the newly registered host.
    host_uuid: str
    name: str

    # How the master reaches the worker agent.
    address: str
    agent_port: int

    # Certificate material; all PEM fields are text.
    fingerprint: str
    ca_cert_pem: str
    worker_cert_pem: str
    worker_key_pem: str
|
||||||
|
|
||||||
|
|
||||||
|
# Read-only projection of a swarm host row as returned by the hosts endpoints.
class SwarmHostView(BaseModel):
    uuid: str
    name: str
    address: str
    agent_port: int
    status: str
    # Unset until a heartbeat has been recorded for the host.
    last_heartbeat: datetime | None = None
    client_cert_fingerprint: str
    enrolled_at: datetime
    notes: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# Request body for POST /swarm/deploy.
class DeployRequest(BaseModel):
    # Full configuration to shard across workers.
    config: DecnetConfig

    # Both flags are forwarded unchanged to each worker agent's deploy call.
    dry_run: bool = False
    no_cache: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
# Request body for POST /swarm/teardown.
class TeardownRequest(BaseModel):
    host_uuid: Optional[str] = Field(
        description="If set, tear down only this worker; otherwise tear down all hosts",
        default=None,
    )
    # Forwarded to the worker agent's teardown call; None means no decky filter.
    decky_id: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
# Outcome of one deploy/teardown dispatch to a single worker.
class HostResult(BaseModel):
    host_uuid: str
    host_name: str
    # True when the worker call completed without raising.
    ok: bool
    # Worker response body on success, or the error text on failure.
    detail: Optional[Any] = None
|
||||||
|
|
||||||
|
|
||||||
|
# Envelope returned by the deploy and teardown endpoints: one entry per
# worker that was contacted.
class DeployResponse(BaseModel):
    results: list[HostResult]
|
||||||
|
|
||||||
|
|
||||||
|
# Result of probing a single enrolled worker.
class HostHealth(BaseModel):
    host_uuid: str
    name: str
    address: str
    reachable: bool
    # Extra probe information, if any.
    detail: Optional[Any] = None
|
||||||
|
|
||||||
|
|
||||||
|
# Envelope returned by POST /swarm/check: one HostHealth per probed worker.
class CheckResponse(BaseModel):
    results: list[HostHealth]
|
||||||
@@ -1,8 +1,7 @@
|
|||||||
"""Health endpoints for the swarm controller.
|
"""POST /swarm/check — active mTLS probe of every enrolled worker.
|
||||||
|
|
||||||
* ``GET /swarm/health`` — liveness of the controller itself (no I/O).
|
Updates ``SwarmHost.status`` and ``last_heartbeat`` for each host based
|
||||||
* ``POST /swarm/check`` — active probe of every enrolled worker over mTLS.
|
on the outcome of the probe.
|
||||||
Updates ``SwarmHost.status`` and ``last_heartbeat``.
|
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
@@ -11,37 +10,20 @@ from datetime import datetime, timezone
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends
|
from fastapi import APIRouter, Depends
|
||||||
from pydantic import BaseModel
|
|
||||||
|
|
||||||
from decnet.logging import get_logger
|
from decnet.logging import get_logger
|
||||||
from decnet.swarm.client import AgentClient
|
from decnet.swarm.client import AgentClient
|
||||||
from decnet.web.db.repository import BaseRepository
|
from decnet.web.db.repository import BaseRepository
|
||||||
from decnet.web.dependencies import get_repo
|
from decnet.web.dependencies import get_repo
|
||||||
|
from decnet.web.router.swarm._schemas import CheckResponse, HostHealth
|
||||||
|
|
||||||
log = get_logger("swarm.health")
|
log = get_logger("swarm.check")
|
||||||
|
|
||||||
router = APIRouter(tags=["swarm-health"])
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
class HostHealth(BaseModel):
|
@router.post("/check", response_model=CheckResponse, tags=["Swarm Health"])
|
||||||
host_uuid: str
|
async def api_check_hosts(
|
||||||
name: str
|
|
||||||
address: str
|
|
||||||
reachable: bool
|
|
||||||
detail: Any | None = None
|
|
||||||
|
|
||||||
|
|
||||||
class CheckResponse(BaseModel):
|
|
||||||
results: list[HostHealth]
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/health")
|
|
||||||
async def health() -> dict[str, str]:
|
|
||||||
return {"status": "ok", "role": "swarm-controller"}
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/check", response_model=CheckResponse)
|
|
||||||
async def check(
|
|
||||||
repo: BaseRepository = Depends(get_repo),
|
repo: BaseRepository = Depends(get_repo),
|
||||||
) -> CheckResponse:
|
) -> CheckResponse:
|
||||||
hosts = await repo.list_swarm_hosts()
|
hosts = await repo.list_swarm_hosts()
|
||||||
46
decnet/web/router/swarm/api_decommission_host.py
Normal file
46
decnet/web/router/swarm/api_decommission_host.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
"""DELETE /swarm/hosts/{uuid} — decommission a worker.
|
||||||
|
|
||||||
|
Removes the DeckyShard rows bound to the host (portable cascade — MySQL
|
||||||
|
and SQLite both honor it via the repo layer), deletes the SwarmHost row,
|
||||||
|
and best-effort-cleans the per-worker bundle directory on the master.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pathlib
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, status
|
||||||
|
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
from decnet.web.dependencies import get_repo
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete(
    "/hosts/{uuid}",
    status_code=status.HTTP_204_NO_CONTENT,
    tags=["Swarm Hosts"],
)
async def api_decommission_host(
    uuid: str,
    repo: BaseRepository = Depends(get_repo),
) -> None:
    """Decommission the worker identified by ``uuid``.

    Deletes the host's DeckyShard rows, then the SwarmHost row, and finally
    makes a best-effort attempt to remove the host's cert bundle directory.
    Filesystem errors during that cleanup never fail the request.

    Raises:
        HTTPException: 404 when no host with that UUID exists.
    """
    row = await repo.get_swarm_host_by_uuid(uuid)
    if row is None:
        raise HTTPException(status_code=404, detail="host not found")

    await repo.delete_decky_shards_for_host(uuid)
    await repo.delete_swarm_host(uuid)

    # A missing/empty path must be skipped outright: pathlib.Path("") resolves
    # to the current working directory, so treating it as a bundle directory
    # would unlink every file next to the running process.
    bundle_path = row.get("cert_bundle_path")
    if not bundle_path:
        return

    bundle_dir = pathlib.Path(bundle_path)
    if not bundle_dir.is_dir():
        # Directory was moved or removed manually; nothing to clean up.
        return

    # The DB rows are already gone, so a listing failure must not turn the
    # request into a 500 — cleanup is best-effort only.
    try:
        children = list(bundle_dir.iterdir())
    except OSError:
        return

    for child in children:
        try:
            child.unlink()
        except OSError:
            # Best-effort: leave behind whatever cannot be removed.
            pass
    try:
        bundle_dir.rmdir()
    except OSError:
        # Directory not empty or not removable; deliberately ignored.
        pass
|
||||||
@@ -1,11 +1,10 @@
|
|||||||
"""Deployment dispatch: shard deckies across enrolled workers and push.
|
"""POST /swarm/deploy — shard a DecnetConfig across enrolled workers.
|
||||||
|
|
||||||
The master owns the DecnetConfig. Per worker we build a filtered copy
|
Per worker we build a filtered copy containing only the deckies assigned
|
||||||
containing only the deckies assigned to that worker (via ``host_uuid``),
|
to that worker (via ``host_uuid``), then POST it to the worker agent.
|
||||||
then POST it to the worker agent. Sharding strategy is explicit: the
|
The caller is expected to have already set ``host_uuid`` on every decky;
|
||||||
caller is expected to have already set ``host_uuid`` on every decky. If
|
if any decky arrives without one, we fail fast. Auto-sharding lives in
|
||||||
any decky arrives without one, we fail fast — auto-sharding lives in the
|
the CLI layer, not here.
|
||||||
CLI layer (task #7), not here.
|
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
@@ -15,45 +14,21 @@ from datetime import datetime, timezone
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
from pydantic import BaseModel, Field
|
|
||||||
|
|
||||||
from decnet.config import DecnetConfig, DeckyConfig
|
from decnet.config import DecnetConfig, DeckyConfig
|
||||||
from decnet.logging import get_logger
|
from decnet.logging import get_logger
|
||||||
from decnet.swarm.client import AgentClient
|
from decnet.swarm.client import AgentClient
|
||||||
from decnet.web.db.repository import BaseRepository
|
from decnet.web.db.repository import BaseRepository
|
||||||
from decnet.web.dependencies import get_repo
|
from decnet.web.dependencies import get_repo
|
||||||
|
from decnet.web.router.swarm._schemas import (
|
||||||
log = get_logger("swarm.deployments")
|
DeployRequest,
|
||||||
|
DeployResponse,
|
||||||
router = APIRouter(tags=["swarm-deployments"])
|
HostResult,
|
||||||
|
|
||||||
|
|
||||||
class DeployRequest(BaseModel):
|
|
||||||
config: DecnetConfig
|
|
||||||
dry_run: bool = False
|
|
||||||
no_cache: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
class TeardownRequest(BaseModel):
|
|
||||||
host_uuid: str | None = Field(
|
|
||||||
default=None,
|
|
||||||
description="If set, tear down only this worker; otherwise tear down all hosts",
|
|
||||||
)
|
)
|
||||||
decky_id: str | None = None
|
|
||||||
|
|
||||||
|
log = get_logger("swarm.deploy")
|
||||||
|
|
||||||
class HostResult(BaseModel):
|
router = APIRouter()
|
||||||
host_uuid: str
|
|
||||||
host_name: str
|
|
||||||
ok: bool
|
|
||||||
detail: Any | None = None
|
|
||||||
|
|
||||||
|
|
||||||
class DeployResponse(BaseModel):
|
|
||||||
results: list[HostResult]
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------- helpers
|
|
||||||
|
|
||||||
|
|
||||||
def _shard_by_host(config: DecnetConfig) -> dict[str, list[DeckyConfig]]:
|
def _shard_by_host(config: DecnetConfig) -> dict[str, list[DeckyConfig]]:
|
||||||
@@ -72,11 +47,8 @@ def _worker_config(base: DecnetConfig, shard: list[DeckyConfig]) -> DecnetConfig
|
|||||||
return base.model_copy(update={"deckies": shard})
|
return base.model_copy(update={"deckies": shard})
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------ routes
|
@router.post("/deploy", response_model=DeployResponse, tags=["Swarm Deployments"])
|
||||||
|
async def api_deploy_swarm(
|
||||||
|
|
||||||
@router.post("/deploy", response_model=DeployResponse)
|
|
||||||
async def deploy(
|
|
||||||
req: DeployRequest,
|
req: DeployRequest,
|
||||||
repo: BaseRepository = Depends(get_repo),
|
repo: BaseRepository = Depends(get_repo),
|
||||||
) -> DeployResponse:
|
) -> DeployResponse:
|
||||||
@@ -85,7 +57,6 @@ async def deploy(
|
|||||||
|
|
||||||
buckets = _shard_by_host(req.config)
|
buckets = _shard_by_host(req.config)
|
||||||
|
|
||||||
# Resolve host rows in one query-per-host pass; fail fast on unknown uuids.
|
|
||||||
hosts: dict[str, dict[str, Any]] = {}
|
hosts: dict[str, dict[str, Any]] = {}
|
||||||
for host_uuid in buckets:
|
for host_uuid in buckets:
|
||||||
row = await repo.get_swarm_host_by_uuid(host_uuid)
|
row = await repo.get_swarm_host_by_uuid(host_uuid)
|
||||||
@@ -99,7 +70,6 @@ async def deploy(
|
|||||||
try:
|
try:
|
||||||
async with AgentClient(host=host) as agent:
|
async with AgentClient(host=host) as agent:
|
||||||
body = await agent.deploy(cfg, dry_run=req.dry_run, no_cache=req.no_cache)
|
body = await agent.deploy(cfg, dry_run=req.dry_run, no_cache=req.no_cache)
|
||||||
# Persist a DeckyShard row per decky for status lookups.
|
|
||||||
for d in shard:
|
for d in shard:
|
||||||
await repo.upsert_decky_shard(
|
await repo.upsert_decky_shard(
|
||||||
{
|
{
|
||||||
@@ -132,33 +102,3 @@ async def deploy(
|
|||||||
*(_dispatch(uuid_, shard) for uuid_, shard in buckets.items())
|
*(_dispatch(uuid_, shard) for uuid_, shard in buckets.items())
|
||||||
)
|
)
|
||||||
return DeployResponse(results=list(results))
|
return DeployResponse(results=list(results))
|
||||||
|
|
||||||
|
|
||||||
@router.post("/teardown", response_model=DeployResponse)
|
|
||||||
async def teardown(
|
|
||||||
req: TeardownRequest,
|
|
||||||
repo: BaseRepository = Depends(get_repo),
|
|
||||||
) -> DeployResponse:
|
|
||||||
if req.host_uuid is not None:
|
|
||||||
row = await repo.get_swarm_host_by_uuid(req.host_uuid)
|
|
||||||
if row is None:
|
|
||||||
raise HTTPException(status_code=404, detail="host not found")
|
|
||||||
targets = [row]
|
|
||||||
else:
|
|
||||||
targets = await repo.list_swarm_hosts()
|
|
||||||
|
|
||||||
async def _call(host: dict[str, Any]) -> HostResult:
|
|
||||||
try:
|
|
||||||
async with AgentClient(host=host) as agent:
|
|
||||||
body = await agent.teardown(req.decky_id)
|
|
||||||
if req.decky_id is None:
|
|
||||||
await repo.delete_decky_shards_for_host(host["uuid"])
|
|
||||||
return HostResult(host_uuid=host["uuid"], host_name=host["name"], ok=True, detail=body)
|
|
||||||
except Exception as exc:
|
|
||||||
log.exception("swarm.teardown failed host=%s", host["name"])
|
|
||||||
return HostResult(
|
|
||||||
host_uuid=host["uuid"], host_name=host["name"], ok=False, detail=str(exc)
|
|
||||||
)
|
|
||||||
|
|
||||||
results = await asyncio.gather(*(_call(h) for h in targets))
|
|
||||||
return DeployResponse(results=list(results))
|
|
||||||
72
decnet/web/router/swarm/api_enroll_host.py
Normal file
72
decnet/web/router/swarm/api_enroll_host.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
"""POST /swarm/enroll — issue a worker cert bundle and register the host.
|
||||||
|
|
||||||
|
Enrollment is master-driven: the controller holds the CA private key,
|
||||||
|
generates a fresh worker keypair + CA-signed cert, and returns the full
|
||||||
|
bundle to the operator. Bundle delivery to the worker (scp/sshpass/etc.)
|
||||||
|
is outside this process's trust boundary.
|
||||||
|
|
||||||
|
Rationale: the worker agent speaks ONLY mTLS; there is no pre-auth
|
||||||
|
bootstrap endpoint, so nothing to attack before the worker is enrolled.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid as _uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, status
|
||||||
|
|
||||||
|
from decnet.swarm import pki
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
from decnet.web.dependencies import get_repo
|
||||||
|
from decnet.web.router.swarm._schemas import EnrolledBundle, EnrollRequest
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/enroll",
    response_model=EnrolledBundle,
    status_code=status.HTTP_201_CREATED,
    tags=["Swarm Hosts"],
)
async def api_enroll_host(
    req: EnrollRequest,
    repo: BaseRepository = Depends(get_repo),
) -> EnrolledBundle:
    """Issue a worker cert bundle and register the host.

    Generates a CA-signed worker certificate whose SANs are the requested
    extras plus the worker's address and name, stores the bundle on the
    master, records the host with status ``enrolled``, and returns the
    bundle (including the worker private key) to the caller.

    Raises:
        HTTPException: 400 when the name is not usable as a single directory
            name; 409 when a worker with the same name is already enrolled.
    """
    # req.name becomes a directory name under the CA dir below. Reject
    # anything that could escape it: path separators, the "." / ".."
    # components, and NUL. An absolute name would otherwise replace the whole
    # base path when joined with pathlib's "/" operator.
    if req.name in {".", ".."} or any(ch in req.name for ch in ("/", "\\", "\x00")):
        raise HTTPException(
            status_code=400,
            detail="Worker name must not contain path separators or be '.' / '..'",
        )

    existing = await repo.get_swarm_host_by_name(req.name)
    if existing is not None:
        raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled")

    ca = pki.ensure_ca()
    # De-duplicate: the address and name are always embedded alongside any
    # caller-provided SANs.
    sans = list({*req.sans, req.address, req.name})
    issued = pki.issue_worker_cert(ca, req.name, sans)

    # Persist the bundle under ~/.decnet/ca/workers/<name>/ so the master
    # can replay it if the operator loses the original delivery.
    bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
    pki.write_worker_bundle(issued, bundle_dir)

    host_uuid = str(_uuid.uuid4())
    await repo.add_swarm_host(
        {
            "uuid": host_uuid,
            "name": req.name,
            "address": req.address,
            "agent_port": req.agent_port,
            "status": "enrolled",
            "client_cert_fingerprint": issued.fingerprint_sha256,
            "cert_bundle_path": str(bundle_dir),
            "enrolled_at": datetime.now(timezone.utc),
            "notes": req.notes,
        }
    )
    return EnrolledBundle(
        host_uuid=host_uuid,
        name=req.name,
        address=req.address,
        agent_port=req.agent_port,
        fingerprint=issued.fingerprint_sha256,
        ca_cert_pem=issued.ca_cert_pem.decode(),
        worker_cert_pem=issued.cert_pem.decode(),
        worker_key_pem=issued.key_pem.decode(),
    )
|
||||||
21
decnet/web/router/swarm/api_get_host.py
Normal file
21
decnet/web/router/swarm/api_get_host.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
"""GET /swarm/hosts/{uuid} — fetch a single worker by UUID."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
|
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
from decnet.web.dependencies import get_repo
|
||||||
|
from decnet.web.router.swarm._schemas import SwarmHostView
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/hosts/{uuid}", response_model=SwarmHostView, tags=["Swarm Hosts"])
async def api_get_host(
    uuid: str,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmHostView:
    # Look the host up by UUID; an unknown UUID is reported as a 404.
    host_row = await repo.get_swarm_host_by_uuid(uuid)
    if host_row is not None:
        return SwarmHostView(**host_row)
    raise HTTPException(status_code=404, detail="host not found")
|
||||||
11
decnet/web/router/swarm/api_get_swarm_health.py
Normal file
11
decnet/web/router/swarm/api_get_swarm_health.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
"""GET /swarm/health — controller liveness (no I/O)."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import APIRouter
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/health", tags=["Swarm Health"])
async def api_get_swarm_health() -> dict[str, str]:
    # Static liveness payload; no repository or network access involved.
    payload = {"status": "ok", "role": "swarm-controller"}
    return payload
|
||||||
21
decnet/web/router/swarm/api_list_hosts.py
Normal file
21
decnet/web/router/swarm/api_list_hosts.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
"""GET /swarm/hosts — list enrolled workers, optionally filtered by status."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends
|
||||||
|
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
from decnet.web.dependencies import get_repo
|
||||||
|
from decnet.web.router.swarm._schemas import SwarmHostView
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Hosts"])
async def api_list_hosts(
    host_status: Optional[str] = None,
    repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]:
    # ``host_status`` is passed straight to the repository as the filter;
    # None means no filter was requested.
    host_rows = await repo.list_swarm_hosts(host_status)
    return [SwarmHostView(**host_row) for host_row in host_rows]
|
||||||
51
decnet/web/router/swarm/api_teardown_swarm.py
Normal file
51
decnet/web/router/swarm/api_teardown_swarm.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
"""POST /swarm/teardown — tear down deckies on one or all enrolled workers, optionally limited to a single decky."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
|
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.swarm.client import AgentClient
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
from decnet.web.dependencies import get_repo
|
||||||
|
from decnet.web.router.swarm._schemas import (
|
||||||
|
DeployResponse,
|
||||||
|
HostResult,
|
||||||
|
TeardownRequest,
|
||||||
|
)
|
||||||
|
|
||||||
|
log = get_logger("swarm.teardown")
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/teardown", response_model=DeployResponse, tags=["Swarm Deployments"])
async def api_teardown_swarm(
    req: TeardownRequest,
    repo: BaseRepository = Depends(get_repo),
) -> DeployResponse:
    # Select the hosts to contact: every enrolled host when no host_uuid is
    # given, otherwise exactly the requested one (404 if it is unknown).
    if req.host_uuid is None:
        targets = await repo.list_swarm_hosts()
    else:
        requested = await repo.get_swarm_host_by_uuid(req.host_uuid)
        if requested is None:
            raise HTTPException(status_code=404, detail="host not found")
        targets = [requested]

    async def _call(host: dict[str, Any]) -> HostResult:
        # Any failure for one host is logged and reported in that host's
        # result instead of aborting the whole fan-out.
        try:
            async with AgentClient(host=host) as agent:
                body = await agent.teardown(req.decky_id)
            # Shard rows are dropped only when no decky_id filter was given.
            if req.decky_id is None:
                await repo.delete_decky_shards_for_host(host["uuid"])
            return HostResult(
                host_uuid=host["uuid"],
                host_name=host["name"],
                ok=True,
                detail=body,
            )
        except Exception as exc:
            log.exception("swarm.teardown failed host=%s", host["name"])
            return HostResult(
                host_uuid=host["uuid"],
                host_name=host["name"],
                ok=False,
                detail=str(exc),
            )

    # Contact all targets concurrently; gather preserves target order.
    pending = [_call(target) for target in targets]
    outcomes = await asyncio.gather(*pending)
    return DeployResponse(results=list(outcomes))
|
||||||
@@ -1,162 +0,0 @@
|
|||||||
"""Swarm host lifecycle endpoints: enroll, list, decommission.
|
|
||||||
|
|
||||||
Enrollment design
|
|
||||||
-----------------
|
|
||||||
The master controller holds the CA private key. On ``POST /swarm/enroll``
|
|
||||||
it generates a fresh worker keypair + cert (signed by the master CA) and
|
|
||||||
returns the full bundle to the operator. The operator is responsible for
|
|
||||||
delivering that bundle to the worker's ``~/.decnet/agent/`` directory
|
|
||||||
(scp/sshpass/ansible — outside this process's trust boundary).
|
|
||||||
|
|
||||||
Rationale: the worker agent speaks ONLY mTLS. There is no pre-auth
|
|
||||||
bootstrap endpoint, so there is nothing to attack before the worker is
|
|
||||||
enrolled. The bundle-delivery step is explicit and auditable.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import pathlib
|
|
||||||
import uuid as _uuid
|
|
||||||
from datetime import datetime, timezone
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException, status
|
|
||||||
from pydantic import BaseModel, Field
|
|
||||||
|
|
||||||
from decnet.swarm import pki
|
|
||||||
from decnet.web.db.repository import BaseRepository
|
|
||||||
from decnet.web.dependencies import get_repo
|
|
||||||
|
|
||||||
router = APIRouter(tags=["swarm-hosts"])
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------- schemas
|
|
||||||
|
|
||||||
|
|
||||||
class EnrollRequest(BaseModel):
|
|
||||||
name: str = Field(..., min_length=1, max_length=128)
|
|
||||||
address: str = Field(..., description="IP or DNS the master uses to reach the worker")
|
|
||||||
agent_port: int = Field(default=8765, ge=1, le=65535)
|
|
||||||
sans: list[str] = Field(
|
|
||||||
default_factory=list,
|
|
||||||
description="Extra SANs (IPs / hostnames) to embed in the worker cert",
|
|
||||||
)
|
|
||||||
notes: Optional[str] = None
|
|
||||||
|
|
||||||
|
|
||||||
class EnrolledBundle(BaseModel):
|
|
||||||
"""Cert bundle returned to the operator — must be delivered to the worker."""
|
|
||||||
|
|
||||||
host_uuid: str
|
|
||||||
name: str
|
|
||||||
address: str
|
|
||||||
agent_port: int
|
|
||||||
fingerprint: str
|
|
||||||
ca_cert_pem: str
|
|
||||||
worker_cert_pem: str
|
|
||||||
worker_key_pem: str
|
|
||||||
|
|
||||||
|
|
||||||
class SwarmHostView(BaseModel):
|
|
||||||
uuid: str
|
|
||||||
name: str
|
|
||||||
address: str
|
|
||||||
agent_port: int
|
|
||||||
status: str
|
|
||||||
last_heartbeat: Optional[datetime] = None
|
|
||||||
client_cert_fingerprint: str
|
|
||||||
enrolled_at: datetime
|
|
||||||
notes: Optional[str] = None
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------- routes
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/enroll", response_model=EnrolledBundle, status_code=status.HTTP_201_CREATED)
|
|
||||||
async def enroll(
|
|
||||||
req: EnrollRequest,
|
|
||||||
repo: BaseRepository = Depends(get_repo),
|
|
||||||
) -> EnrolledBundle:
|
|
||||||
existing = await repo.get_swarm_host_by_name(req.name)
|
|
||||||
if existing is not None:
|
|
||||||
raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled")
|
|
||||||
|
|
||||||
ca = pki.ensure_ca()
|
|
||||||
sans = list({*req.sans, req.address, req.name})
|
|
||||||
issued = pki.issue_worker_cert(ca, req.name, sans)
|
|
||||||
|
|
||||||
# Persist the bundle under ~/.decnet/ca/workers/<name>/ so the master
|
|
||||||
# can replay it if the operator loses the original delivery.
|
|
||||||
bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
|
|
||||||
pki.write_worker_bundle(issued, bundle_dir)
|
|
||||||
|
|
||||||
host_uuid = str(_uuid.uuid4())
|
|
||||||
await repo.add_swarm_host(
|
|
||||||
{
|
|
||||||
"uuid": host_uuid,
|
|
||||||
"name": req.name,
|
|
||||||
"address": req.address,
|
|
||||||
"agent_port": req.agent_port,
|
|
||||||
"status": "enrolled",
|
|
||||||
"client_cert_fingerprint": issued.fingerprint_sha256,
|
|
||||||
"cert_bundle_path": str(bundle_dir),
|
|
||||||
"enrolled_at": datetime.now(timezone.utc),
|
|
||||||
"notes": req.notes,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
return EnrolledBundle(
|
|
||||||
host_uuid=host_uuid,
|
|
||||||
name=req.name,
|
|
||||||
address=req.address,
|
|
||||||
agent_port=req.agent_port,
|
|
||||||
fingerprint=issued.fingerprint_sha256,
|
|
||||||
ca_cert_pem=issued.ca_cert_pem.decode(),
|
|
||||||
worker_cert_pem=issued.cert_pem.decode(),
|
|
||||||
worker_key_pem=issued.key_pem.decode(),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/hosts", response_model=list[SwarmHostView])
|
|
||||||
async def list_hosts(
|
|
||||||
host_status: Optional[str] = None,
|
|
||||||
repo: BaseRepository = Depends(get_repo),
|
|
||||||
) -> list[SwarmHostView]:
|
|
||||||
rows = await repo.list_swarm_hosts(host_status)
|
|
||||||
return [SwarmHostView(**r) for r in rows]
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/hosts/{uuid}", response_model=SwarmHostView)
|
|
||||||
async def get_host(
|
|
||||||
uuid: str,
|
|
||||||
repo: BaseRepository = Depends(get_repo),
|
|
||||||
) -> SwarmHostView:
|
|
||||||
row = await repo.get_swarm_host_by_uuid(uuid)
|
|
||||||
if row is None:
|
|
||||||
raise HTTPException(status_code=404, detail="host not found")
|
|
||||||
return SwarmHostView(**row)
|
|
||||||
|
|
||||||
|
|
||||||
@router.delete("/hosts/{uuid}", status_code=status.HTTP_204_NO_CONTENT)
|
|
||||||
async def decommission(
|
|
||||||
uuid: str,
|
|
||||||
repo: BaseRepository = Depends(get_repo),
|
|
||||||
) -> None:
|
|
||||||
row = await repo.get_swarm_host_by_uuid(uuid)
|
|
||||||
if row is None:
|
|
||||||
raise HTTPException(status_code=404, detail="host not found")
|
|
||||||
|
|
||||||
# Remove shard rows first (we own them; cascade is portable via the repo).
|
|
||||||
await repo.delete_decky_shards_for_host(uuid)
|
|
||||||
await repo.delete_swarm_host(uuid)
|
|
||||||
|
|
||||||
# Best-effort bundle cleanup; if the dir was moved manually, don't fail.
|
|
||||||
bundle_dir = pathlib.Path(row.get("cert_bundle_path") or "")
|
|
||||||
if bundle_dir.is_dir():
|
|
||||||
for child in bundle_dir.iterdir():
|
|
||||||
try:
|
|
||||||
child.unlink()
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
try:
|
|
||||||
bundle_dir.rmdir()
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
@@ -26,10 +26,10 @@ def ca_dir(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch) -> pathlib.P
|
|||||||
monkeypatch.setattr(pki, "DEFAULT_CA_DIR", ca)
|
monkeypatch.setattr(pki, "DEFAULT_CA_DIR", ca)
|
||||||
# Also patch the already-imported references inside client.py / routers.
|
# Also patch the already-imported references inside client.py / routers.
|
||||||
from decnet.swarm import client as swarm_client
|
from decnet.swarm import client as swarm_client
|
||||||
from decnet.web.router.swarm import hosts as swarm_hosts
|
from decnet.web.router.swarm import api_enroll_host as enroll_mod
|
||||||
|
|
||||||
monkeypatch.setattr(swarm_client, "pki", pki)
|
monkeypatch.setattr(swarm_client, "pki", pki)
|
||||||
monkeypatch.setattr(swarm_hosts, "pki", pki)
|
monkeypatch.setattr(enroll_mod, "pki", pki)
|
||||||
return ca
|
return ca
|
||||||
|
|
||||||
|
|
||||||
@@ -166,11 +166,13 @@ class _StubAgentClient:
|
|||||||
def stub_agent(monkeypatch: pytest.MonkeyPatch):
|
def stub_agent(monkeypatch: pytest.MonkeyPatch):
|
||||||
_StubAgentClient.deployed.clear()
|
_StubAgentClient.deployed.clear()
|
||||||
_StubAgentClient.torn_down.clear()
|
_StubAgentClient.torn_down.clear()
|
||||||
from decnet.web.router.swarm import deployments as dep_mod
|
from decnet.web.router.swarm import api_deploy_swarm as deploy_mod
|
||||||
from decnet.web.router.swarm import health as hlt_mod
|
from decnet.web.router.swarm import api_teardown_swarm as teardown_mod
|
||||||
|
from decnet.web.router.swarm import api_check_hosts as check_mod
|
||||||
|
|
||||||
monkeypatch.setattr(dep_mod, "AgentClient", _StubAgentClient)
|
monkeypatch.setattr(deploy_mod, "AgentClient", _StubAgentClient)
|
||||||
monkeypatch.setattr(hlt_mod, "AgentClient", _StubAgentClient)
|
monkeypatch.setattr(teardown_mod, "AgentClient", _StubAgentClient)
|
||||||
|
monkeypatch.setattr(check_mod, "AgentClient", _StubAgentClient)
|
||||||
return _StubAgentClient
|
return _StubAgentClient
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user