refactor(swarm): one file per endpoint, matching existing router layout
Splits the three grouped router files into eight api_<verb>_<resource>.py modules under decnet/web/router/swarm/ to match the convention used by router/fleet/ and router/config/. Shared request/response models live in _schemas.py. Keeps each endpoint easy to locate and modify without stepping on siblings.
This commit is contained in:
@@ -1,16 +1,33 @@
|
||||
"""Swarm controller routers.
|
||||
|
||||
Mounted onto the swarm-api FastAPI app under the ``/swarm`` prefix. The
|
||||
controller is a separate process from the main DECNET API so swarm
|
||||
failures cannot cascade into log ingestion / dashboard serving.
|
||||
One file per endpoint, aggregated under the ``/swarm`` prefix. Mounted
|
||||
onto the swarm-api FastAPI app (``decnet/web/swarm_api.py``), a separate
|
||||
process from the main DECNET API so swarm failures cannot cascade into
|
||||
log ingestion / dashboard serving.
|
||||
"""
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .hosts import router as hosts_router
|
||||
from .deployments import router as deployments_router
|
||||
from .health import router as health_router
|
||||
from .api_enroll_host import router as enroll_host_router
|
||||
from .api_list_hosts import router as list_hosts_router
|
||||
from .api_get_host import router as get_host_router
|
||||
from .api_decommission_host import router as decommission_host_router
|
||||
from .api_deploy_swarm import router as deploy_swarm_router
|
||||
from .api_teardown_swarm import router as teardown_swarm_router
|
||||
from .api_get_swarm_health import router as get_swarm_health_router
|
||||
from .api_check_hosts import router as check_hosts_router
|
||||
|
||||
swarm_router = APIRouter(prefix="/swarm")
|
||||
swarm_router.include_router(hosts_router)
|
||||
swarm_router.include_router(deployments_router)
|
||||
swarm_router.include_router(health_router)
|
||||
|
||||
# Hosts
|
||||
swarm_router.include_router(enroll_host_router)
|
||||
swarm_router.include_router(list_hosts_router)
|
||||
swarm_router.include_router(get_host_router)
|
||||
swarm_router.include_router(decommission_host_router)
|
||||
|
||||
# Deployments
|
||||
swarm_router.include_router(deploy_swarm_router)
|
||||
swarm_router.include_router(teardown_swarm_router)
|
||||
|
||||
# Health
|
||||
swarm_router.include_router(get_swarm_health_router)
|
||||
swarm_router.include_router(check_hosts_router)
|
||||
|
||||
82
decnet/web/router/swarm/_schemas.py
Normal file
82
decnet/web/router/swarm/_schemas.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""Request/response models shared across the swarm router endpoints."""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any, Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from decnet.config import DecnetConfig
|
||||
|
||||
|
||||
class EnrollRequest(BaseModel):
    """Request body for ``POST /swarm/enroll``: the worker to register."""

    # Worker name, 1-128 characters.
    name: str = Field(..., min_length=1, max_length=128)
    address: str = Field(
        ...,
        description="IP or DNS the master uses to reach the worker",
    )
    # TCP port of the worker agent; defaults to 8765, must be a valid port.
    agent_port: int = Field(default=8765, ge=1, le=65535)
    sans: list[str] = Field(
        default_factory=list,
        description="Extra SANs (IPs / hostnames) to embed in the worker cert",
    )
    # Optional free-form operator notes stored alongside the host.
    notes: str | None = None
|
||||
|
||||
|
||||
class EnrolledBundle(BaseModel):
    """Cert bundle returned to the operator — must be delivered to the worker."""

    # Identity of the newly registered host (echoes the enroll request).
    host_uuid: str
    name: str
    address: str
    agent_port: int

    # Fingerprint of the issued worker certificate.
    fingerprint: str

    # PEM material as text: CA certificate, worker certificate, worker key.
    ca_cert_pem: str
    worker_cert_pem: str
    worker_key_pem: str
|
||||
|
||||
|
||||
class SwarmHostView(BaseModel):
    """Public view of an enrolled worker, built from a repository host row."""

    uuid: str
    name: str
    address: str
    agent_port: int
    status: str
    # Unset when no heartbeat has been recorded for the host.
    last_heartbeat: datetime | None = None
    client_cert_fingerprint: str
    enrolled_at: datetime
    notes: str | None = None
|
||||
|
||||
|
||||
class DeployRequest(BaseModel):
    """Request body for ``POST /swarm/deploy``."""

    # Full configuration to shard across the enrolled workers.
    config: DecnetConfig
    # Both flags are forwarded unchanged to each worker agent's deploy call.
    dry_run: bool = False
    no_cache: bool = False
|
||||
|
||||
|
||||
class TeardownRequest(BaseModel):
    """Request body for ``POST /swarm/teardown``."""

    host_uuid: str | None = Field(
        default=None,
        description="If set, tear down only this worker; otherwise tear down all hosts",
    )
    # Passed through to the worker agent's teardown call; ``None`` is the
    # case in which the host's shard rows are also removed on the master.
    decky_id: str | None = None
|
||||
|
||||
|
||||
class HostResult(BaseModel):
    """Outcome of one per-host operation (deploy or teardown)."""

    host_uuid: str
    host_name: str
    # True when the call to the worker agent completed without raising.
    ok: bool
    # Agent response body on success, or the stringified error on failure.
    detail: Any | None = None
|
||||
|
||||
|
||||
class DeployResponse(BaseModel):
    """Response body for deploy and teardown: one result per targeted host."""

    results: list[HostResult]
|
||||
|
||||
|
||||
class HostHealth(BaseModel):
    """Probe outcome for a single enrolled worker."""

    host_uuid: str
    name: str
    address: str
    # Presumably True when the probe of the worker succeeded; the probing
    # code is not part of this module — confirm against the check endpoint.
    reachable: bool
    detail: Any | None = None
|
||||
|
||||
|
||||
class CheckResponse(BaseModel):
    """Response body for ``POST /swarm/check``: one entry per host."""

    results: list[HostHealth]
|
||||
@@ -1,8 +1,7 @@
|
||||
"""Health endpoints for the swarm controller.
|
||||
"""POST /swarm/check — active mTLS probe of every enrolled worker.
|
||||
|
||||
* ``GET /swarm/health`` — liveness of the controller itself (no I/O).
|
||||
* ``POST /swarm/check`` — active probe of every enrolled worker over mTLS.
|
||||
Updates ``SwarmHost.status`` and ``last_heartbeat``.
|
||||
Updates ``SwarmHost.status`` and ``last_heartbeat`` for each host based
|
||||
on the outcome of the probe.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -11,37 +10,20 @@ from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
from pydantic import BaseModel
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._schemas import CheckResponse, HostHealth
|
||||
|
||||
log = get_logger("swarm.health")
|
||||
log = get_logger("swarm.check")
|
||||
|
||||
router = APIRouter(tags=["swarm-health"])
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class HostHealth(BaseModel):
|
||||
host_uuid: str
|
||||
name: str
|
||||
address: str
|
||||
reachable: bool
|
||||
detail: Any | None = None
|
||||
|
||||
|
||||
class CheckResponse(BaseModel):
|
||||
results: list[HostHealth]
|
||||
|
||||
|
||||
@router.get("/health")
|
||||
async def health() -> dict[str, str]:
|
||||
return {"status": "ok", "role": "swarm-controller"}
|
||||
|
||||
|
||||
@router.post("/check", response_model=CheckResponse)
|
||||
async def check(
|
||||
@router.post("/check", response_model=CheckResponse, tags=["Swarm Health"])
|
||||
async def api_check_hosts(
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
) -> CheckResponse:
|
||||
hosts = await repo.list_swarm_hosts()
|
||||
46
decnet/web/router/swarm/api_decommission_host.py
Normal file
46
decnet/web/router/swarm/api_decommission_host.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""DELETE /swarm/hosts/{uuid} — decommission a worker.
|
||||
|
||||
Removes the DeckyShard rows bound to the host (portable cascade — MySQL
|
||||
and SQLite both honor it via the repo layer), deletes the SwarmHost row,
|
||||
and best-effort-cleans the per-worker bundle directory on the master.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _remove_bundle_dir(bundle_path: str | None) -> None:
    """Best-effort removal of a worker's cert bundle directory.

    Does nothing when ``bundle_path`` is empty or ``None``. That guard is
    essential: ``pathlib.Path("")`` is the current working directory, so an
    unguarded cleanup would unlink every file in the process's CWD.

    Filesystem errors are swallowed on purpose — the directory may have been
    moved or removed manually, and that must not fail the decommission.
    """
    if not bundle_path:
        return

    bundle_dir = pathlib.Path(bundle_path)
    try:
        if not bundle_dir.is_dir():
            return
        children = list(bundle_dir.iterdir())
    except OSError:
        return

    for child in children:
        try:
            child.unlink()
        except OSError:
            pass
    try:
        # Only succeeds if every child could be unlinked.
        bundle_dir.rmdir()
    except OSError:
        pass


@router.delete(
    "/hosts/{uuid}",
    status_code=status.HTTP_204_NO_CONTENT,
    tags=["Swarm Hosts"],
)
async def api_decommission_host(
    uuid: str,
    repo: BaseRepository = Depends(get_repo),
) -> None:
    """Decommission the worker identified by ``uuid``.

    Deletes the host's DeckyShard rows, then the SwarmHost row, then makes a
    best-effort attempt to remove the bundle directory recorded in the
    row's ``cert_bundle_path``.

    Args:
        uuid: UUID of the host to remove.
        repo: Repository used to look up and delete the rows.

    Raises:
        HTTPException: 404 when no host with that UUID exists.
    """
    row = await repo.get_swarm_host_by_uuid(uuid)
    if row is None:
        raise HTTPException(status_code=404, detail="host not found")

    # Shards first, then the host they reference.
    await repo.delete_decky_shards_for_host(uuid)
    await repo.delete_swarm_host(uuid)

    # The DB rows are already gone; cleanup failures must not surface.
    _remove_bundle_dir(row.get("cert_bundle_path"))
|
||||
@@ -1,11 +1,10 @@
|
||||
"""Deployment dispatch: shard deckies across enrolled workers and push.
|
||||
"""POST /swarm/deploy — shard a DecnetConfig across enrolled workers.
|
||||
|
||||
The master owns the DecnetConfig. Per worker we build a filtered copy
|
||||
containing only the deckies assigned to that worker (via ``host_uuid``),
|
||||
then POST it to the worker agent. Sharding strategy is explicit: the
|
||||
caller is expected to have already set ``host_uuid`` on every decky. If
|
||||
any decky arrives without one, we fail fast — auto-sharding lives in the
|
||||
CLI layer (task #7), not here.
|
||||
Per worker we build a filtered copy containing only the deckies assigned
|
||||
to that worker (via ``host_uuid``), then POST it to the worker agent.
|
||||
The caller is expected to have already set ``host_uuid`` on every decky;
|
||||
if any decky arrives without one, we fail fast. Auto-sharding lives in
|
||||
the CLI layer, not here.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -15,45 +14,21 @@ from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from decnet.config import DecnetConfig, DeckyConfig
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._schemas import (
|
||||
DeployRequest,
|
||||
DeployResponse,
|
||||
HostResult,
|
||||
)
|
||||
|
||||
log = get_logger("swarm.deployments")
|
||||
log = get_logger("swarm.deploy")
|
||||
|
||||
router = APIRouter(tags=["swarm-deployments"])
|
||||
|
||||
|
||||
class DeployRequest(BaseModel):
|
||||
config: DecnetConfig
|
||||
dry_run: bool = False
|
||||
no_cache: bool = False
|
||||
|
||||
|
||||
class TeardownRequest(BaseModel):
|
||||
host_uuid: str | None = Field(
|
||||
default=None,
|
||||
description="If set, tear down only this worker; otherwise tear down all hosts",
|
||||
)
|
||||
decky_id: str | None = None
|
||||
|
||||
|
||||
class HostResult(BaseModel):
|
||||
host_uuid: str
|
||||
host_name: str
|
||||
ok: bool
|
||||
detail: Any | None = None
|
||||
|
||||
|
||||
class DeployResponse(BaseModel):
|
||||
results: list[HostResult]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------- helpers
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _shard_by_host(config: DecnetConfig) -> dict[str, list[DeckyConfig]]:
|
||||
@@ -72,11 +47,8 @@ def _worker_config(base: DecnetConfig, shard: list[DeckyConfig]) -> DecnetConfig
|
||||
return base.model_copy(update={"deckies": shard})
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ routes
|
||||
|
||||
|
||||
@router.post("/deploy", response_model=DeployResponse)
|
||||
async def deploy(
|
||||
@router.post("/deploy", response_model=DeployResponse, tags=["Swarm Deployments"])
|
||||
async def api_deploy_swarm(
|
||||
req: DeployRequest,
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
) -> DeployResponse:
|
||||
@@ -85,7 +57,6 @@ async def deploy(
|
||||
|
||||
buckets = _shard_by_host(req.config)
|
||||
|
||||
# Resolve host rows in one query-per-host pass; fail fast on unknown uuids.
|
||||
hosts: dict[str, dict[str, Any]] = {}
|
||||
for host_uuid in buckets:
|
||||
row = await repo.get_swarm_host_by_uuid(host_uuid)
|
||||
@@ -99,7 +70,6 @@ async def deploy(
|
||||
try:
|
||||
async with AgentClient(host=host) as agent:
|
||||
body = await agent.deploy(cfg, dry_run=req.dry_run, no_cache=req.no_cache)
|
||||
# Persist a DeckyShard row per decky for status lookups.
|
||||
for d in shard:
|
||||
await repo.upsert_decky_shard(
|
||||
{
|
||||
@@ -132,33 +102,3 @@ async def deploy(
|
||||
*(_dispatch(uuid_, shard) for uuid_, shard in buckets.items())
|
||||
)
|
||||
return DeployResponse(results=list(results))
|
||||
|
||||
|
||||
@router.post("/teardown", response_model=DeployResponse)
|
||||
async def teardown(
|
||||
req: TeardownRequest,
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
) -> DeployResponse:
|
||||
if req.host_uuid is not None:
|
||||
row = await repo.get_swarm_host_by_uuid(req.host_uuid)
|
||||
if row is None:
|
||||
raise HTTPException(status_code=404, detail="host not found")
|
||||
targets = [row]
|
||||
else:
|
||||
targets = await repo.list_swarm_hosts()
|
||||
|
||||
async def _call(host: dict[str, Any]) -> HostResult:
|
||||
try:
|
||||
async with AgentClient(host=host) as agent:
|
||||
body = await agent.teardown(req.decky_id)
|
||||
if req.decky_id is None:
|
||||
await repo.delete_decky_shards_for_host(host["uuid"])
|
||||
return HostResult(host_uuid=host["uuid"], host_name=host["name"], ok=True, detail=body)
|
||||
except Exception as exc:
|
||||
log.exception("swarm.teardown failed host=%s", host["name"])
|
||||
return HostResult(
|
||||
host_uuid=host["uuid"], host_name=host["name"], ok=False, detail=str(exc)
|
||||
)
|
||||
|
||||
results = await asyncio.gather(*(_call(h) for h in targets))
|
||||
return DeployResponse(results=list(results))
|
||||
72
decnet/web/router/swarm/api_enroll_host.py
Normal file
72
decnet/web/router/swarm/api_enroll_host.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""POST /swarm/enroll — issue a worker cert bundle and register the host.
|
||||
|
||||
Enrollment is master-driven: the controller holds the CA private key,
|
||||
generates a fresh worker keypair + CA-signed cert, and returns the full
|
||||
bundle to the operator. Bundle delivery to the worker (scp/sshpass/etc.)
|
||||
is outside this process's trust boundary.
|
||||
|
||||
Rationale: the worker agent speaks ONLY mTLS; there is no pre-auth
|
||||
bootstrap endpoint, so nothing to attack before the worker is enrolled.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid as _uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
|
||||
from decnet.swarm import pki
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._schemas import EnrolledBundle, EnrollRequest
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/enroll",
    response_model=EnrolledBundle,
    status_code=status.HTTP_201_CREATED,
    tags=["Swarm Hosts"],
)
async def api_enroll_host(
    req: EnrollRequest,
    repo: BaseRepository = Depends(get_repo),
) -> EnrolledBundle:
    """Issue a worker cert bundle and register the worker as a swarm host.

    The bundle is written under ``pki.DEFAULT_CA_DIR / "workers" / <name>``
    before the host row is inserted, and the same material is returned to
    the caller.

    Args:
        req: Worker name, address, agent port, extra SANs and notes.
        repo: Repository used to check for duplicates and store the host.

    Returns:
        The new host UUID, the cert fingerprint, and the CA cert, worker
        cert and worker key as PEM text.

    Raises:
        HTTPException: 422 when ``req.name`` cannot be used as a single
            directory component; 409 when a host with that name already
            exists.
    """
    # The name becomes a directory component below, so a path separator or
    # a "." / ".." name would let the bundle be written outside the workers
    # directory. Reject it before doing any I/O.
    name_escapes_dir = (
        req.name in {".", ".."}
        or "/" in req.name
        or "\\" in req.name
        or "\x00" in req.name
    )
    if name_escapes_dir:
        raise HTTPException(
            status_code=422,
            detail="Worker name must not contain path separators",
        )

    existing = await repo.get_swarm_host_by_name(req.name)
    if existing is not None:
        raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled")

    ca = pki.ensure_ca()
    # De-duplicate, then sort so the SAN order does not depend on set
    # iteration order.
    sans = sorted({*req.sans, req.address, req.name})
    issued = pki.issue_worker_cert(ca, req.name, sans)

    # Persist the bundle under ~/.decnet/ca/workers/<name>/ so the master
    # can replay it if the operator loses the original delivery.
    bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
    pki.write_worker_bundle(issued, bundle_dir)

    host_uuid = str(_uuid.uuid4())
    await repo.add_swarm_host(
        {
            "uuid": host_uuid,
            "name": req.name,
            "address": req.address,
            "agent_port": req.agent_port,
            "status": "enrolled",
            "client_cert_fingerprint": issued.fingerprint_sha256,
            "cert_bundle_path": str(bundle_dir),
            "enrolled_at": datetime.now(timezone.utc),
            "notes": req.notes,
        }
    )
    return EnrolledBundle(
        host_uuid=host_uuid,
        name=req.name,
        address=req.address,
        agent_port=req.agent_port,
        fingerprint=issued.fingerprint_sha256,
        ca_cert_pem=issued.ca_cert_pem.decode(),
        worker_cert_pem=issued.cert_pem.decode(),
        worker_key_pem=issued.key_pem.decode(),
    )
|
||||
21
decnet/web/router/swarm/api_get_host.py
Normal file
21
decnet/web/router/swarm/api_get_host.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""GET /swarm/hosts/{uuid} — fetch a single worker by UUID."""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._schemas import SwarmHostView
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/hosts/{uuid}", response_model=SwarmHostView, tags=["Swarm Hosts"])
async def api_get_host(
    uuid: str,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmHostView:
    """Return the enrolled worker identified by ``uuid``.

    Raises:
        HTTPException: 404 when the repository has no host with that UUID.
    """
    host_row = await repo.get_swarm_host_by_uuid(uuid)
    if host_row is not None:
        return SwarmHostView(**host_row)
    raise HTTPException(status_code=404, detail="host not found")
|
||||
11
decnet/web/router/swarm/api_get_swarm_health.py
Normal file
11
decnet/web/router/swarm/api_get_swarm_health.py
Normal file
@@ -0,0 +1,11 @@
|
||||
"""GET /swarm/health — controller liveness (no I/O)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/health", tags=["Swarm Health"])
async def api_get_swarm_health() -> dict[str, str]:
    """Report that the swarm controller is alive.

    Returns a constant payload; neither the repository nor any worker is
    contacted.
    """
    liveness = {"status": "ok", "role": "swarm-controller"}
    return liveness
|
||||
21
decnet/web/router/swarm/api_list_hosts.py
Normal file
21
decnet/web/router/swarm/api_list_hosts.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""GET /swarm/hosts — list enrolled workers, optionally filtered by status."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._schemas import SwarmHostView
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Hosts"])
async def api_list_hosts(
    host_status: Optional[str] = None,
    repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]:
    """List enrolled workers.

    Args:
        host_status: Optional status filter, passed straight to the
            repository; ``None`` applies no filter value.
        repo: Repository that supplies the host rows.

    Returns:
        One ``SwarmHostView`` per row returned by the repository.
    """
    views: list[SwarmHostView] = []
    for host_row in await repo.list_swarm_hosts(host_status):
        views.append(SwarmHostView(**host_row))
    return views
|
||||
51
decnet/web/router/swarm/api_teardown_swarm.py
Normal file
51
decnet/web/router/swarm/api_teardown_swarm.py
Normal file
@@ -0,0 +1,51 @@
|
||||
"""POST /swarm/teardown — tear down one or all enrolled workers."""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._schemas import (
|
||||
DeployResponse,
|
||||
HostResult,
|
||||
TeardownRequest,
|
||||
)
|
||||
|
||||
log = get_logger("swarm.teardown")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post("/teardown", response_model=DeployResponse, tags=["Swarm Deployments"])
async def api_teardown_swarm(
    req: TeardownRequest,
    repo: BaseRepository = Depends(get_repo),
) -> DeployResponse:
    """Tear down one worker, or every enrolled worker, concurrently.

    Args:
        req: ``host_uuid`` selects a single worker (all workers when
            ``None``); ``decky_id`` is forwarded to each agent's teardown.
        repo: Repository used to resolve hosts and delete shard rows.

    Returns:
        One ``HostResult`` per targeted host. A failure on one host is
        reported as ``ok=False`` with the error text, not raised.

    Raises:
        HTTPException: 404 when ``host_uuid`` is given but unknown.
    """
    if req.host_uuid is None:
        targets = await repo.list_swarm_hosts()
    else:
        single = await repo.get_swarm_host_by_uuid(req.host_uuid)
        if single is None:
            raise HTTPException(status_code=404, detail="host not found")
        targets = [single]

    async def _teardown_host(host: dict[str, Any]) -> HostResult:
        # Any failure is captured per host so the other hosts still run.
        try:
            async with AgentClient(host=host) as agent:
                body = await agent.teardown(req.decky_id)
            # Shard rows are dropped only for a whole-host teardown.
            if req.decky_id is None:
                await repo.delete_decky_shards_for_host(host["uuid"])
            return HostResult(
                host_uuid=host["uuid"],
                host_name=host["name"],
                ok=True,
                detail=body,
            )
        except Exception as exc:
            log.exception("swarm.teardown failed host=%s", host["name"])
            return HostResult(
                host_uuid=host["uuid"],
                host_name=host["name"],
                ok=False,
                detail=str(exc),
            )

    outcomes = await asyncio.gather(*[_teardown_host(h) for h in targets])
    return DeployResponse(results=list(outcomes))
|
||||
@@ -1,162 +0,0 @@
|
||||
"""Swarm host lifecycle endpoints: enroll, list, decommission.
|
||||
|
||||
Enrollment design
|
||||
-----------------
|
||||
The master controller holds the CA private key. On ``POST /swarm/enroll``
|
||||
it generates a fresh worker keypair + cert (signed by the master CA) and
|
||||
returns the full bundle to the operator. The operator is responsible for
|
||||
delivering that bundle to the worker's ``~/.decnet/agent/`` directory
|
||||
(scp/sshpass/ansible — outside this process's trust boundary).
|
||||
|
||||
Rationale: the worker agent speaks ONLY mTLS. There is no pre-auth
|
||||
bootstrap endpoint, so there is nothing to attack before the worker is
|
||||
enrolled. The bundle-delivery step is explicit and auditable.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
import uuid as _uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from decnet.swarm import pki
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
|
||||
router = APIRouter(tags=["swarm-hosts"])
|
||||
|
||||
|
||||
# ------------------------------------------------------------------- schemas
|
||||
|
||||
|
||||
class EnrollRequest(BaseModel):
|
||||
name: str = Field(..., min_length=1, max_length=128)
|
||||
address: str = Field(..., description="IP or DNS the master uses to reach the worker")
|
||||
agent_port: int = Field(default=8765, ge=1, le=65535)
|
||||
sans: list[str] = Field(
|
||||
default_factory=list,
|
||||
description="Extra SANs (IPs / hostnames) to embed in the worker cert",
|
||||
)
|
||||
notes: Optional[str] = None
|
||||
|
||||
|
||||
class EnrolledBundle(BaseModel):
|
||||
"""Cert bundle returned to the operator — must be delivered to the worker."""
|
||||
|
||||
host_uuid: str
|
||||
name: str
|
||||
address: str
|
||||
agent_port: int
|
||||
fingerprint: str
|
||||
ca_cert_pem: str
|
||||
worker_cert_pem: str
|
||||
worker_key_pem: str
|
||||
|
||||
|
||||
class SwarmHostView(BaseModel):
|
||||
uuid: str
|
||||
name: str
|
||||
address: str
|
||||
agent_port: int
|
||||
status: str
|
||||
last_heartbeat: Optional[datetime] = None
|
||||
client_cert_fingerprint: str
|
||||
enrolled_at: datetime
|
||||
notes: Optional[str] = None
|
||||
|
||||
|
||||
# ------------------------------------------------------------------- routes
|
||||
|
||||
|
||||
@router.post("/enroll", response_model=EnrolledBundle, status_code=status.HTTP_201_CREATED)
|
||||
async def enroll(
|
||||
req: EnrollRequest,
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
) -> EnrolledBundle:
|
||||
existing = await repo.get_swarm_host_by_name(req.name)
|
||||
if existing is not None:
|
||||
raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled")
|
||||
|
||||
ca = pki.ensure_ca()
|
||||
sans = list({*req.sans, req.address, req.name})
|
||||
issued = pki.issue_worker_cert(ca, req.name, sans)
|
||||
|
||||
# Persist the bundle under ~/.decnet/ca/workers/<name>/ so the master
|
||||
# can replay it if the operator loses the original delivery.
|
||||
bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
|
||||
pki.write_worker_bundle(issued, bundle_dir)
|
||||
|
||||
host_uuid = str(_uuid.uuid4())
|
||||
await repo.add_swarm_host(
|
||||
{
|
||||
"uuid": host_uuid,
|
||||
"name": req.name,
|
||||
"address": req.address,
|
||||
"agent_port": req.agent_port,
|
||||
"status": "enrolled",
|
||||
"client_cert_fingerprint": issued.fingerprint_sha256,
|
||||
"cert_bundle_path": str(bundle_dir),
|
||||
"enrolled_at": datetime.now(timezone.utc),
|
||||
"notes": req.notes,
|
||||
}
|
||||
)
|
||||
return EnrolledBundle(
|
||||
host_uuid=host_uuid,
|
||||
name=req.name,
|
||||
address=req.address,
|
||||
agent_port=req.agent_port,
|
||||
fingerprint=issued.fingerprint_sha256,
|
||||
ca_cert_pem=issued.ca_cert_pem.decode(),
|
||||
worker_cert_pem=issued.cert_pem.decode(),
|
||||
worker_key_pem=issued.key_pem.decode(),
|
||||
)
|
||||
|
||||
|
||||
@router.get("/hosts", response_model=list[SwarmHostView])
|
||||
async def list_hosts(
|
||||
host_status: Optional[str] = None,
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
) -> list[SwarmHostView]:
|
||||
rows = await repo.list_swarm_hosts(host_status)
|
||||
return [SwarmHostView(**r) for r in rows]
|
||||
|
||||
|
||||
@router.get("/hosts/{uuid}", response_model=SwarmHostView)
|
||||
async def get_host(
|
||||
uuid: str,
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
) -> SwarmHostView:
|
||||
row = await repo.get_swarm_host_by_uuid(uuid)
|
||||
if row is None:
|
||||
raise HTTPException(status_code=404, detail="host not found")
|
||||
return SwarmHostView(**row)
|
||||
|
||||
|
||||
@router.delete("/hosts/{uuid}", status_code=status.HTTP_204_NO_CONTENT)
|
||||
async def decommission(
|
||||
uuid: str,
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
) -> None:
|
||||
row = await repo.get_swarm_host_by_uuid(uuid)
|
||||
if row is None:
|
||||
raise HTTPException(status_code=404, detail="host not found")
|
||||
|
||||
# Remove shard rows first (we own them; cascade is portable via the repo).
|
||||
await repo.delete_decky_shards_for_host(uuid)
|
||||
await repo.delete_swarm_host(uuid)
|
||||
|
||||
# Best-effort bundle cleanup; if the dir was moved manually, don't fail.
|
||||
bundle_dir = pathlib.Path(row.get("cert_bundle_path") or "")
|
||||
if bundle_dir.is_dir():
|
||||
for child in bundle_dir.iterdir():
|
||||
try:
|
||||
child.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
try:
|
||||
bundle_dir.rmdir()
|
||||
except OSError:
|
||||
pass
|
||||
Reference in New Issue
Block a user