Files
DECNET/decnet/web/router/swarm/api_deploy_swarm.py
anti 8d18c59201 fix(swarm): require admin JWT on all swarm operator endpoints
Gate all 8 swarm-controller operator routes (enroll, list/get/decommission
hosts, deploy, teardown, check, list deckies) with the centralized
require_admin RBAC dependency alongside require_operator_cert; mTLS becomes
defense-in-depth instead of the only gate. /heartbeat stays cert-fingerprint
pinned (worker-facing) and /swarm/health stays open (liveness only).

CLI swarm commands now send Authorization: Bearer $DECNET_API_TOKEN with a
401/403 hint covering the must_change_password bootstrap flow.

Bump pyjwt to 2.13.0 and pip to 26.1.2 (pip-audit PYSEC-2026-175/177/178/179,
PYSEC-2026-196); authz suite re-verified on the new pyjwt.

Closes ASVS_L2_AUDIT.md V4.1.1a and V4.1.1b (CRITICAL).
2026-06-09 17:08:10 -04:00

174 lines
6.1 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
"""POST /swarm/deploy — shard a DecnetConfig across enrolled workers.
Per worker we build a filtered copy containing only the deckies assigned
to that worker (via ``host_uuid``), then POST it to the worker agent.
The caller is expected to have already set ``host_uuid`` on every decky;
if any decky arrives without one, we fail fast. Auto-sharding lives in
the CLI layer, not here.
"""
from __future__ import annotations
import asyncio
import json
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.config import DecnetConfig, DeckyConfig
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import (
SwarmDeployRequest,
SwarmDeployResponse,
SwarmHostResult,
)
# Module-level logger, requested from the project's get_logger helper under
# the name "swarm.deploy".
log = get_logger("swarm.deploy")
# Router object that this module's POST /deploy route is registered on.
router = APIRouter()
def _shard_by_host(config: DecnetConfig) -> dict[str, list[DeckyConfig]]:
    """Group the deckies of ``config`` by the ``host_uuid`` each one carries.

    Returns:
        A mapping ``host_uuid -> [deckies]``. Hosts appear in the order they
        are first seen; deckies keep their relative order within a host.

    Raises:
        HTTPException: 400 on the first decky whose ``host_uuid`` is falsy.
            This function never assigns hosts itself.
    """
    by_host: dict[str, list[DeckyConfig]] = {}
    for decky in config.deckies:
        target = decky.host_uuid
        if target:
            by_host.setdefault(target, []).append(decky)
            continue
        # Unassigned decky: fail fast instead of guessing a worker for it.
        raise HTTPException(
            status_code=400,
            detail=f"decky '{decky.name}' has no host_uuid — caller must shard before dispatch",
        )
    return by_host
def _worker_config(
    base: DecnetConfig,
    shard: list[DeckyConfig],
    host: dict[str, Any],
) -> DecnetConfig:
    """Derive the config sent to a single worker from the deploy-wide ``base``.

    The returned copy carries only ``shard`` as its ``deckies``. When the host
    row has a truthy ``use_ipvlan`` the copy also gets ``ipvlan=True``.
    Otherwise ``ipvlan`` is not overridden at all, so an ipvlan choice made at
    the deploy level is never downgraded by a per-host flag.
    """
    # Per-host driver opt-in (Wi-Fi-bridged VMs can't use macvlan — see
    # SwarmHost.use_ipvlan).
    wants_ipvlan = bool(host.get("use_ipvlan"))
    overrides: dict[str, Any] = (
        {"deckies": shard, "ipvlan": True} if wants_ipvlan else {"deckies": shard}
    )
    return base.model_copy(update=overrides)
def _shard_payload(
    d: DeckyConfig,
    host_uuid: str,
    state: str,
    error: str | None,
) -> dict[str, Any]:
    """Build the dict handed to ``repo.upsert_decky_shard`` for one decky.

    ``services`` is stored as a JSON string of ``d.services`` and
    ``decky_config`` as the decky's own ``model_dump_json()`` output.
    ``updated_at`` is stamped with the current timezone-aware UTC time.
    ``state`` and ``error`` are passed through unchanged (``error`` lands in
    ``last_error``).
    """
    services_json = json.dumps(d.services)
    config_json = d.model_dump_json()
    stamped_at = datetime.now(timezone.utc)
    return {
        "decky_name": d.name,
        "host_uuid": host_uuid,
        "services": services_json,
        "decky_config": config_json,
        "decky_ip": d.ip,
        "state": state,
        "last_error": error,
        "updated_at": stamped_at,
    }
async def _dispatch(
    host_uuid: str,
    shard: list[DeckyConfig],
    hosts: dict[str, dict[str, Any]],
    config: DecnetConfig,
    repo: BaseRepository,
    dry_run: bool,
    no_cache: bool,
) -> SwarmHostResult:
    """Deploy one host's shard through its worker agent and record the outcome.

    Args:
        host_uuid: Key into ``hosts`` and the host the shard rows are
            recorded against.
        shard: Deckies assigned to this host.
        hosts: Host rows keyed by ``host_uuid``. The row is handed to
            ``AgentClient`` and its ``"name"`` is used in logs and results.
        config: Deploy-wide config the per-worker copy is derived from.
        repo: Repository used to upsert shard rows and update the host row.
        dry_run: Forwarded to the agent. On success the shard rows are
            recorded as ``"pending"`` instead of ``"running"``.
        no_cache: Forwarded to the agent's deploy call.

    Returns:
        ``SwarmHostResult`` with ``ok=True`` and the agent's response as
        ``detail`` on success. Exceptions raised inside the deploy attempt
        (agent call plus success bookkeeping) are caught and reported as
        ``ok=False`` with the error text as ``detail``.
    """
    host = hosts[host_uuid]
    cfg = _worker_config(config, shard, host)
    try:
        async with AgentClient(host=host) as agent:
            body = await agent.deploy(cfg, dry_run=dry_run, no_cache=no_cache)
        # Same state for every decky of the shard, so compute it once.
        success_state = "running" if not dry_run else "pending"
        for d in shard:
            await repo.upsert_decky_shard(
                _shard_payload(d, host_uuid, success_state, None)
            )
        await repo.update_swarm_host(host_uuid, {"status": "active"})
        return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
    except Exception as exc:
        log.exception("swarm.deploy dispatch failed host=%s", host["name"])
        # str(exc) is "" for exceptions raised without a message (a bare
        # TimeoutError(), for instance). Fall back to the class name so the
        # recorded last_error and the returned detail are never blank.
        reason = str(exc) or type(exc).__name__
        # Compose-up is partial-success-friendly: one decky failing to
        # build doesn't roll back the ones that already came up. Ask the
        # agent which containers actually exist before painting the whole
        # shard red — otherwise decky1 and decky2 look "failed" even
        # though they're live on the worker.
        runtime: dict[str, Any] = {}
        try:
            async with AgentClient(host=host) as probe:
                snap = await probe.status()
            runtime = snap.get("runtime") or {}
        except Exception:
            # Best effort only: with no snapshot, every decky below is
            # treated as not running and therefore recorded as failed.
            log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", host["name"])
        for d in shard:
            rstate = runtime.get(d.name) or {}
            is_up = bool(rstate.get("running"))
            await repo.upsert_decky_shard(
                _shard_payload(
                    d,
                    host_uuid,
                    "running" if is_up else "failed",
                    # Deckies that are actually up carry no error; the rest
                    # get the failure reason, capped at 512 characters.
                    None if is_up else reason[:512],
                )
            )
        return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=False, detail=reason)
async def dispatch_decnet_config(
    config: DecnetConfig,
    repo: BaseRepository,
    dry_run: bool = False,
    no_cache: bool = False,
) -> SwarmDeployResponse:
    """Split ``config`` per ``host_uuid`` and deploy every shard concurrently.

    Shared between POST /swarm/deploy (explicit swarm call) and the auto-swarm
    branch of POST /deckies/deploy.

    Raises:
        HTTPException: 404 when a referenced ``host_uuid`` has no host row.
            All hosts are looked up before any per-host dispatch is started.
    """
    shards = _shard_by_host(config)

    # Look up every target host first, one at a time, so an unknown host
    # aborts the request before any dispatch coroutine is created.
    host_rows: dict[str, dict[str, Any]] = {}
    for target in shards:
        found = await repo.get_swarm_host_by_uuid(target)
        if found is not None:
            host_rows[target] = found
            continue
        raise HTTPException(status_code=404, detail=f"unknown host_uuid: {target}")

    pending = [
        _dispatch(target, deckies, host_rows, config, repo, dry_run, no_cache)
        for target, deckies in shards.items()
    ]
    outcomes = await asyncio.gather(*pending)
    return SwarmDeployResponse(results=list(outcomes))
@router.post(
    "/deploy",
    response_model=SwarmDeployResponse,
    tags=["Swarm Deployments"],
    responses={
        400: {"description": "Deployment mode must be 'swarm'"},
        401: {"description": "Missing or invalid admin JWT"},
        403: {"description": "Authenticated user is not an admin, or operator cert missing"},
        404: {"description": "A referenced host_uuid is not enrolled"},
    },
)
async def api_deploy_swarm(
    req: SwarmDeployRequest,
    repo: BaseRepository = Depends(get_repo),
    _admin: dict = Depends(require_admin),
    _operator: PeerCert = Depends(require_operator_cert),
) -> SwarmDeployResponse:
    # POST /deploy handler: rejects any config whose mode is not "swarm" with
    # a 400, otherwise hands the config to dispatch_decnet_config.
    # `_admin` and `_operator` are declared only so their dependencies run;
    # their values are not used in the body.
    # Kept as comments rather than a docstring because FastAPI would publish
    # a docstring as the route description.
    mode = req.config.mode
    if mode == "swarm":
        return await dispatch_decnet_config(
            req.config,
            repo,
            dry_run=req.dry_run,
            no_cache=req.no_cache,
        )
    raise HTTPException(status_code=400, detail="mode must be 'swarm'")