From 79db999030acce345619096e34fc7a5c7f7beeb7 Mon Sep 17 00:00:00 2001 From: anti Date: Sun, 19 Apr 2026 06:09:08 -0400 Subject: [PATCH] =?UTF-8?q?feat(fleet):=20auto-swarm=20deploy=20=E2=80=94?= =?UTF-8?q?=20shard=20across=20enrolled=20workers=20when=20master?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit POST /deckies/deploy now branches on DECNET_MODE + enrolled host presence: when the caller is a master with at least one reachable swarm host, round-robin host_uuids are assigned over new deckies and the config is dispatched via AgentClient. Falls back to local docker-compose otherwise. Extracts the dispatch loop from api_deploy_swarm into dispatch_decnet_config so both endpoints share the same shard/dispatch/persist path. Adds GET /system/deployment-mode for the UI to show 'will shard across N hosts' vs 'will deploy locally' before the operator clicks deploy. --- decnet/web/router/__init__.py | 4 + decnet/web/router/fleet/api_deploy_deckies.py | 46 +++++++- decnet/web/router/swarm/api_deploy_swarm.py | 35 ++++-- decnet/web/router/system/__init__.py | 6 + .../web/router/system/api_deployment_mode.py | 41 +++++++ decnet_web/src/components/DeckyFleet.tsx | 22 +++- tests/api/fleet/test_deploy_automode.py | 105 ++++++++++++++++++ 7 files changed, 245 insertions(+), 14 deletions(-) create mode 100644 decnet/web/router/system/__init__.py create mode 100644 decnet/web/router/system/api_deployment_mode.py create mode 100644 tests/api/fleet/test_deploy_automode.py diff --git a/decnet/web/router/__init__.py b/decnet/web/router/__init__.py index ac92e7c..3061797 100644 --- a/decnet/web/router/__init__.py +++ b/decnet/web/router/__init__.py @@ -23,6 +23,7 @@ from .health.api_get_health import router as health_router from .artifacts.api_get_artifact import router as artifacts_router from .swarm_updates import swarm_updates_router from .swarm_mgmt import swarm_mgmt_router +from .system import system_router api_router = APIRouter() 
@@ -68,3 +69,6 @@ api_router.include_router(swarm_updates_router) # Swarm Management (dashboard: hosts, deckies, agent enrollment bundles) api_router.include_router(swarm_mgmt_router) + +# System info (deployment-mode auto-detection, etc.) +api_router.include_router(system_router) diff --git a/decnet/web/router/fleet/api_deploy_deckies.py b/decnet/web/router/fleet/api_deploy_deckies.py index 0f872c4..4e8ef2c 100644 --- a/decnet/web/router/fleet/api_deploy_deckies.py +++ b/decnet/web/router/fleet/api_deploy_deckies.py @@ -10,6 +10,7 @@ from decnet.ini_loader import load_ini_from_string from decnet.network import detect_interface, detect_subnet, get_host_ip from decnet.web.dependencies import require_admin, repo from decnet.web.db.models import DeployIniRequest +from decnet.web.router.swarm.api_deploy_swarm import dispatch_decnet_config log = get_logger("api") @@ -109,12 +110,51 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir config.deckies = list(existing_deckies_map.values()) - # We call deploy(config) which regenerates docker-compose and runs `up -d --remove-orphans`. + # Auto-mode: if we're a master with at least one enrolled/active SWARM + # host, shard the deckies across those workers instead of spawning docker + # containers on the master itself. Round-robin assignment over deckies + # that don't already carry a host_uuid (state from a prior swarm deploy + # keeps its original assignment). 
+ swarm_hosts: list[dict] = [] + if os.environ.get("DECNET_MODE", "master").lower() == "master": + swarm_hosts = [ + h for h in await repo.list_swarm_hosts() + if h.get("status") in ("active", "enrolled") and h.get("address") + ] + + if swarm_hosts: + unassigned = [d for d in config.deckies if not d.host_uuid] + for i, d in enumerate(unassigned): + d.host_uuid = swarm_hosts[i % len(swarm_hosts)]["uuid"] + config = config.model_copy(update={"mode": "swarm"}) + + try: + result = await dispatch_decnet_config(config, repo, dry_run=False, no_cache=False) + except HTTPException: + raise + except Exception as e: + log.exception("swarm-auto deploy dispatch failed: %s", e) + raise HTTPException(status_code=500, detail="Swarm dispatch failed. Check server logs.") + + await repo.set_state("deployment", { + "config": config.model_dump(), + "compose_path": state_dict["compose_path"] if state_dict else "", + }) + + failed = [r for r in result.results if not r.ok] + if failed: + detail = "; ".join(f"{r.host_name}: {r.detail}" for r in failed) + raise HTTPException(status_code=502, detail=f"Partial swarm deploy failure — {detail}") + return { + "message": f"Deckies deployed across {len(result.results)} swarm host(s)", + "mode": "swarm", + } + + # Unihost path — docker-compose on the master itself. try: if os.environ.get("DECNET_CONTRACT_TEST") != "true": _deploy(config) - # Persist new state to DB new_state_payload = { "config": config.model_dump(), "compose_path": str(_ROOT / "docker-compose.yml") if not state_dict else state_dict["compose_path"] @@ -124,4 +164,4 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir log.exception("Deployment failed: %s", e) raise HTTPException(status_code=500, detail="Deployment failed. 
Check server logs for details.") - return {"message": "Deckies deployed successfully"} + return {"message": "Deckies deployed successfully", "mode": "unihost"} diff --git a/decnet/web/router/swarm/api_deploy_swarm.py b/decnet/web/router/swarm/api_deploy_swarm.py index c55a844..f2b7b2e 100644 --- a/decnet/web/router/swarm/api_deploy_swarm.py +++ b/decnet/web/router/swarm/api_deploy_swarm.py @@ -47,15 +47,18 @@ def _worker_config(base: DecnetConfig, shard: list[DeckyConfig]) -> DecnetConfig return base.model_copy(update={"deckies": shard}) -@router.post("/deploy", response_model=SwarmDeployResponse, tags=["Swarm Deployments"]) -async def api_deploy_swarm( - req: SwarmDeployRequest, - repo: BaseRepository = Depends(get_repo), +async def dispatch_decnet_config( + config: DecnetConfig, + repo: BaseRepository, + dry_run: bool = False, + no_cache: bool = False, ) -> SwarmDeployResponse: - if req.config.mode != "swarm": - raise HTTPException(status_code=400, detail="mode must be 'swarm'") + """Shard ``config`` by ``host_uuid`` and dispatch to each worker in parallel. - buckets = _shard_by_host(req.config) + Shared between POST /swarm/deploy (explicit swarm call) and the auto-swarm + branch of POST /deckies/deploy. 
+ """ + buckets = _shard_by_host(config) hosts: dict[str, dict[str, Any]] = {} for host_uuid in buckets: @@ -66,17 +69,17 @@ async def api_deploy_swarm( async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> SwarmHostResult: host = hosts[host_uuid] - cfg = _worker_config(req.config, shard) + cfg = _worker_config(config, shard) try: async with AgentClient(host=host) as agent: - body = await agent.deploy(cfg, dry_run=req.dry_run, no_cache=req.no_cache) + body = await agent.deploy(cfg, dry_run=dry_run, no_cache=no_cache) for d in shard: await repo.upsert_decky_shard( { "decky_name": d.name, "host_uuid": host_uuid, "services": json.dumps(d.services), - "state": "running" if not req.dry_run else "pending", + "state": "running" if not dry_run else "pending", "last_error": None, "updated_at": datetime.now(timezone.utc), } @@ -102,3 +105,15 @@ async def api_deploy_swarm( *(_dispatch(uuid_, shard) for uuid_, shard in buckets.items()) ) return SwarmDeployResponse(results=list(results)) + + +@router.post("/deploy", response_model=SwarmDeployResponse, tags=["Swarm Deployments"]) +async def api_deploy_swarm( + req: SwarmDeployRequest, + repo: BaseRepository = Depends(get_repo), +) -> SwarmDeployResponse: + if req.config.mode != "swarm": + raise HTTPException(status_code=400, detail="mode must be 'swarm'") + return await dispatch_decnet_config( + req.config, repo, dry_run=req.dry_run, no_cache=req.no_cache + ) diff --git a/decnet/web/router/system/__init__.py b/decnet/web/router/system/__init__.py new file mode 100644 index 0000000..fdc0c05 --- /dev/null +++ b/decnet/web/router/system/__init__.py @@ -0,0 +1,6 @@ +from fastapi import APIRouter + +from .api_deployment_mode import router as deployment_mode_router + +system_router = APIRouter(prefix="/system", tags=["System"]) +system_router.include_router(deployment_mode_router) diff --git a/decnet/web/router/system/api_deployment_mode.py b/decnet/web/router/system/api_deployment_mode.py new file mode 100644 index 
0000000..18cb3b0 --- /dev/null +++ b/decnet/web/router/system/api_deployment_mode.py @@ -0,0 +1,41 @@ +"""GET /system/deployment-mode — tells the UI whether a deploy will shard +across SWARM workers or land on the master itself. + +Logic mirrors the auto-mode branch in ``api_deploy_deckies``: master role +plus at least one reachable enrolled worker = swarm; otherwise unihost. +""" +from __future__ import annotations + +import os + +from fastapi import APIRouter, Depends +from pydantic import BaseModel + +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo + +router = APIRouter() + + +class DeploymentModeResponse(BaseModel): + mode: str # "swarm" or "unihost" + role: str # "master" or "agent" + swarm_host_count: int + + +@router.get("/deployment-mode", response_model=DeploymentModeResponse) +async def get_deployment_mode( + repo: BaseRepository = Depends(get_repo), +) -> DeploymentModeResponse: + role = os.environ.get("DECNET_MODE", "master").lower() + hosts = 0 + if role == "master": + hosts = sum( + 1 for h in await repo.list_swarm_hosts() + if h.get("status") in ("active", "enrolled") and h.get("address") + ) + return DeploymentModeResponse( + mode="swarm" if hosts > 0 else "unihost", + role=role, + swarm_host_count=hosts, + ) diff --git a/decnet_web/src/components/DeckyFleet.tsx b/decnet_web/src/components/DeckyFleet.tsx index de3a972..68d76bd 100644 --- a/decnet_web/src/components/DeckyFleet.tsx +++ b/decnet_web/src/components/DeckyFleet.tsx @@ -23,6 +23,7 @@ const DeckyFleet: React.FC = () => { const [iniContent, setIniContent] = useState(''); const [deploying, setDeploying] = useState(false); const [isAdmin, setIsAdmin] = useState(false); + const [deployMode, setDeployMode] = useState<{ mode: string; swarm_host_count: number } | null>(null); const fetchDeckies = async () => { try { @@ -102,9 +103,19 @@ const DeckyFleet: React.FC = () => { reader.readAsText(file); }; + const fetchDeployMode = async () => { + try { 
+ const res = await api.get('/system/deployment-mode'); + setDeployMode({ mode: res.data.mode, swarm_host_count: res.data.swarm_host_count }); + } catch { + setDeployMode(null); + } + }; + useEffect(() => { fetchDeckies(); fetchRole(); + fetchDeployMode(); const _interval = setInterval(fetchDeckies, 10000); // Fleet state updates less frequently than logs return () => clearInterval(_interval); }, []); @@ -131,7 +142,16 @@ const DeckyFleet: React.FC = () => { {showDeploy && (
-

Deploy via INI Configuration

+

+ Deploy via INI Configuration + {deployMode && ( + + {deployMode.mode === 'swarm' + ? `→ will shard across ${deployMode.swarm_host_count} SWARM host(s)` + : '→ will deploy locally (UNIHOST)'} + + )} +