feat(fleet): auto-swarm deploy — shard across enrolled workers when master

POST /deckies/deploy now branches on DECNET_MODE + enrolled host presence:
when the caller is a master with at least one reachable swarm host, round-
robin host_uuids are assigned over new deckies and the config is dispatched
via AgentClient. Falls back to local docker-compose otherwise.

Extracts the dispatch loop from api_deploy_swarm into dispatch_decnet_config
so both endpoints share the same shard/dispatch/persist path. Adds
GET /system/deployment-mode for the UI to show 'will shard across N hosts'
vs 'will deploy locally' before the operator clicks deploy.
This commit is contained in:
2026-04-19 06:09:08 -04:00
parent cb1a1d1270
commit 79db999030
7 changed files with 245 additions and 14 deletions

View File

@@ -23,6 +23,7 @@ from .health.api_get_health import router as health_router
from .artifacts.api_get_artifact import router as artifacts_router
from .swarm_updates import swarm_updates_router
from .swarm_mgmt import swarm_mgmt_router
from .system import system_router
api_router = APIRouter()
@@ -68,3 +69,6 @@ api_router.include_router(swarm_updates_router)
# Swarm Management (dashboard: hosts, deckies, agent enrollment bundles)
api_router.include_router(swarm_mgmt_router)
# System info (deployment-mode auto-detection, etc.)
api_router.include_router(system_router)

View File

@@ -10,6 +10,7 @@ from decnet.ini_loader import load_ini_from_string
from decnet.network import detect_interface, detect_subnet, get_host_ip
from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import DeployIniRequest
from decnet.web.router.swarm.api_deploy_swarm import dispatch_decnet_config
log = get_logger("api")
@@ -109,12 +110,51 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
config.deckies = list(existing_deckies_map.values())
# We call deploy(config) which regenerates docker-compose and runs `up -d --remove-orphans`.
# Auto-mode: if we're a master with at least one enrolled/active SWARM
# host, shard the deckies across those workers instead of spawning docker
# containers on the master itself. Round-robin assignment over deckies
# that don't already carry a host_uuid (state from a prior swarm deploy
# keeps its original assignment).
swarm_hosts: list[dict] = []
if os.environ.get("DECNET_MODE", "master").lower() == "master":
swarm_hosts = [
h for h in await repo.list_swarm_hosts()
if h.get("status") in ("active", "enrolled") and h.get("address")
]
if swarm_hosts:
unassigned = [d for d in config.deckies if not d.host_uuid]
for i, d in enumerate(unassigned):
d.host_uuid = swarm_hosts[i % len(swarm_hosts)]["uuid"]
config = config.model_copy(update={"mode": "swarm"})
try:
result = await dispatch_decnet_config(config, repo, dry_run=False, no_cache=False)
except HTTPException:
raise
except Exception as e:
log.exception("swarm-auto deploy dispatch failed: %s", e)
raise HTTPException(status_code=500, detail="Swarm dispatch failed. Check server logs.")
await repo.set_state("deployment", {
"config": config.model_dump(),
"compose_path": state_dict["compose_path"] if state_dict else "",
})
failed = [r for r in result.results if not r.ok]
if failed:
detail = "; ".join(f"{r.host_name}: {r.detail}" for r in failed)
raise HTTPException(status_code=502, detail=f"Partial swarm deploy failure — {detail}")
return {
"message": f"Deckies deployed across {len(result.results)} swarm host(s)",
"mode": "swarm",
}
# Unihost path — docker-compose on the master itself.
try:
if os.environ.get("DECNET_CONTRACT_TEST") != "true":
_deploy(config)
# Persist new state to DB
new_state_payload = {
"config": config.model_dump(),
"compose_path": str(_ROOT / "docker-compose.yml") if not state_dict else state_dict["compose_path"]
@@ -124,4 +164,4 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
log.exception("Deployment failed: %s", e)
raise HTTPException(status_code=500, detail="Deployment failed. Check server logs for details.")
return {"message": "Deckies deployed successfully"}
return {"message": "Deckies deployed successfully", "mode": "unihost"}

View File

@@ -47,15 +47,18 @@ def _worker_config(base: DecnetConfig, shard: list[DeckyConfig]) -> DecnetConfig
return base.model_copy(update={"deckies": shard})
@router.post("/deploy", response_model=SwarmDeployResponse, tags=["Swarm Deployments"])
async def api_deploy_swarm(
req: SwarmDeployRequest,
repo: BaseRepository = Depends(get_repo),
async def dispatch_decnet_config(
config: DecnetConfig,
repo: BaseRepository,
dry_run: bool = False,
no_cache: bool = False,
) -> SwarmDeployResponse:
if req.config.mode != "swarm":
raise HTTPException(status_code=400, detail="mode must be 'swarm'")
"""Shard ``config`` by ``host_uuid`` and dispatch to each worker in parallel.
buckets = _shard_by_host(req.config)
Shared between POST /swarm/deploy (explicit swarm call) and the auto-swarm
branch of POST /deckies/deploy.
"""
buckets = _shard_by_host(config)
hosts: dict[str, dict[str, Any]] = {}
for host_uuid in buckets:
@@ -66,17 +69,17 @@ async def api_deploy_swarm(
async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> SwarmHostResult:
host = hosts[host_uuid]
cfg = _worker_config(req.config, shard)
cfg = _worker_config(config, shard)
try:
async with AgentClient(host=host) as agent:
body = await agent.deploy(cfg, dry_run=req.dry_run, no_cache=req.no_cache)
body = await agent.deploy(cfg, dry_run=dry_run, no_cache=no_cache)
for d in shard:
await repo.upsert_decky_shard(
{
"decky_name": d.name,
"host_uuid": host_uuid,
"services": json.dumps(d.services),
"state": "running" if not req.dry_run else "pending",
"state": "running" if not dry_run else "pending",
"last_error": None,
"updated_at": datetime.now(timezone.utc),
}
@@ -102,3 +105,15 @@ async def api_deploy_swarm(
*(_dispatch(uuid_, shard) for uuid_, shard in buckets.items())
)
return SwarmDeployResponse(results=list(results))
@router.post("/deploy", response_model=SwarmDeployResponse, tags=["Swarm Deployments"])
async def api_deploy_swarm(
    req: SwarmDeployRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmDeployResponse:
    """Validate an explicit swarm deploy request and hand it to the dispatcher.

    Args:
        req: Request carrying the config plus the ``dry_run`` and
            ``no_cache`` flags that are forwarded unchanged.
        repo: Repository injected through ``get_repo``.

    Returns:
        Whatever ``dispatch_decnet_config`` returns for this config.

    Raises:
        HTTPException: 400 when ``req.config.mode`` is not ``"swarm"``.
    """
    swarm_config = req.config
    if swarm_config.mode != "swarm":
        raise HTTPException(status_code=400, detail="mode must be 'swarm'")

    # All dispatch work is delegated; this endpoint only enforces the
    # mode check.
    response = await dispatch_decnet_config(
        swarm_config,
        repo,
        dry_run=req.dry_run,
        no_cache=req.no_cache,
    )
    return response

View File

@@ -0,0 +1,6 @@
from fastapi import APIRouter
from .api_deployment_mode import router as deployment_mode_router
# Parent router for system-info endpoints; created with the /system prefix
# and the "System" tag, then extended with the deployment-mode routes.
system_router = APIRouter(tags=["System"], prefix="/system")
system_router.include_router(deployment_mode_router)

View File

@@ -0,0 +1,41 @@
"""GET /system/deployment-mode — tells the UI whether a deploy will shard
across SWARM workers or land on the master itself.
Logic mirrors the auto-mode branch in ``api_deploy_deckies``: master role
plus at least one swarm host whose status is "active" or "enrolled" and that
has an address = swarm; otherwise unihost.
"""
from __future__ import annotations
import os
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
router = APIRouter()
class DeploymentModeResponse(BaseModel):
    """Response body for the deployment-mode endpoint."""

    # "swarm" or "unihost"
    mode: str
    # "master" or "agent"
    role: str
    # Number of swarm hosts that qualified as deploy targets.
    swarm_host_count: int
@router.get("/deployment-mode", response_model=DeploymentModeResponse)
async def get_deployment_mode(
    repo: BaseRepository = Depends(get_repo),
) -> DeploymentModeResponse:
    """Report whether a deploy would be treated as swarm or unihost.

    The role is the lower-cased ``DECNET_MODE`` environment variable,
    defaulting to ``"master"``. Hosts are only counted when the role is
    ``"master"``; a host counts when its status is ``"active"`` or
    ``"enrolled"`` and its address is truthy.

    Args:
        repo: Repository injected through ``get_repo``; queried for the
            swarm host list.

    Returns:
        A response whose mode is ``"swarm"`` when at least one host
        counted and ``"unihost"`` otherwise, together with the role and
        the host count.
    """
    role = os.environ.get("DECNET_MODE", "master").lower()

    eligible_count = 0
    if role == "master":
        for host in await repo.list_swarm_hosts():
            if host.get("status") not in ("active", "enrolled"):
                continue
            if not host.get("address"):
                continue
            eligible_count += 1

    if eligible_count > 0:
        mode = "swarm"
    else:
        mode = "unihost"

    return DeploymentModeResponse(
        mode=mode,
        role=role,
        swarm_host_count=eligible_count,
    )