feat(fleet): auto-swarm deploy — shard across enrolled workers when master
POST /deckies/deploy now branches on DECNET_MODE + enrolled host presence: when the caller is a master with at least one reachable swarm host, host_uuids are assigned round-robin to the new deckies and the config is dispatched via AgentClient. Falls back to local docker-compose otherwise. Extracts the dispatch loop from api_deploy_swarm into dispatch_decnet_config so both endpoints share the same shard/dispatch/persist path. Adds GET /system/deployment-mode for the UI to show 'will shard across N hosts' vs 'will deploy locally' before the operator clicks deploy.
This commit is contained in:
@@ -23,6 +23,7 @@ from .health.api_get_health import router as health_router
|
|||||||
from .artifacts.api_get_artifact import router as artifacts_router
|
from .artifacts.api_get_artifact import router as artifacts_router
|
||||||
from .swarm_updates import swarm_updates_router
|
from .swarm_updates import swarm_updates_router
|
||||||
from .swarm_mgmt import swarm_mgmt_router
|
from .swarm_mgmt import swarm_mgmt_router
|
||||||
|
from .system import system_router
|
||||||
|
|
||||||
api_router = APIRouter()
|
api_router = APIRouter()
|
||||||
|
|
||||||
@@ -68,3 +69,6 @@ api_router.include_router(swarm_updates_router)
|
|||||||
|
|
||||||
# Swarm Management (dashboard: hosts, deckies, agent enrollment bundles)
|
# Swarm Management (dashboard: hosts, deckies, agent enrollment bundles)
|
||||||
api_router.include_router(swarm_mgmt_router)
|
api_router.include_router(swarm_mgmt_router)
|
||||||
|
|
||||||
|
# System info (deployment-mode auto-detection, etc.)
|
||||||
|
api_router.include_router(system_router)
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from decnet.ini_loader import load_ini_from_string
|
|||||||
from decnet.network import detect_interface, detect_subnet, get_host_ip
|
from decnet.network import detect_interface, detect_subnet, get_host_ip
|
||||||
from decnet.web.dependencies import require_admin, repo
|
from decnet.web.dependencies import require_admin, repo
|
||||||
from decnet.web.db.models import DeployIniRequest
|
from decnet.web.db.models import DeployIniRequest
|
||||||
|
from decnet.web.router.swarm.api_deploy_swarm import dispatch_decnet_config
|
||||||
|
|
||||||
log = get_logger("api")
|
log = get_logger("api")
|
||||||
|
|
||||||
@@ -109,12 +110,51 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
|
|||||||
|
|
||||||
config.deckies = list(existing_deckies_map.values())
|
config.deckies = list(existing_deckies_map.values())
|
||||||
|
|
||||||
# We call deploy(config) which regenerates docker-compose and runs `up -d --remove-orphans`.
|
# Auto-mode: if we're a master with at least one enrolled/active SWARM
|
||||||
|
# host, shard the deckies across those workers instead of spawning docker
|
||||||
|
# containers on the master itself. Round-robin assignment over deckies
|
||||||
|
# that don't already carry a host_uuid (state from a prior swarm deploy
|
||||||
|
# keeps its original assignment).
|
||||||
|
swarm_hosts: list[dict] = []
|
||||||
|
if os.environ.get("DECNET_MODE", "master").lower() == "master":
|
||||||
|
swarm_hosts = [
|
||||||
|
h for h in await repo.list_swarm_hosts()
|
||||||
|
if h.get("status") in ("active", "enrolled") and h.get("address")
|
||||||
|
]
|
||||||
|
|
||||||
|
if swarm_hosts:
|
||||||
|
unassigned = [d for d in config.deckies if not d.host_uuid]
|
||||||
|
for i, d in enumerate(unassigned):
|
||||||
|
d.host_uuid = swarm_hosts[i % len(swarm_hosts)]["uuid"]
|
||||||
|
config = config.model_copy(update={"mode": "swarm"})
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = await dispatch_decnet_config(config, repo, dry_run=False, no_cache=False)
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
log.exception("swarm-auto deploy dispatch failed: %s", e)
|
||||||
|
raise HTTPException(status_code=500, detail="Swarm dispatch failed. Check server logs.")
|
||||||
|
|
||||||
|
await repo.set_state("deployment", {
|
||||||
|
"config": config.model_dump(),
|
||||||
|
"compose_path": state_dict["compose_path"] if state_dict else "",
|
||||||
|
})
|
||||||
|
|
||||||
|
failed = [r for r in result.results if not r.ok]
|
||||||
|
if failed:
|
||||||
|
detail = "; ".join(f"{r.host_name}: {r.detail}" for r in failed)
|
||||||
|
raise HTTPException(status_code=502, detail=f"Partial swarm deploy failure — {detail}")
|
||||||
|
return {
|
||||||
|
"message": f"Deckies deployed across {len(result.results)} swarm host(s)",
|
||||||
|
"mode": "swarm",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Unihost path — docker-compose on the master itself.
|
||||||
try:
|
try:
|
||||||
if os.environ.get("DECNET_CONTRACT_TEST") != "true":
|
if os.environ.get("DECNET_CONTRACT_TEST") != "true":
|
||||||
_deploy(config)
|
_deploy(config)
|
||||||
|
|
||||||
# Persist new state to DB
|
|
||||||
new_state_payload = {
|
new_state_payload = {
|
||||||
"config": config.model_dump(),
|
"config": config.model_dump(),
|
||||||
"compose_path": str(_ROOT / "docker-compose.yml") if not state_dict else state_dict["compose_path"]
|
"compose_path": str(_ROOT / "docker-compose.yml") if not state_dict else state_dict["compose_path"]
|
||||||
@@ -124,4 +164,4 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
|
|||||||
log.exception("Deployment failed: %s", e)
|
log.exception("Deployment failed: %s", e)
|
||||||
raise HTTPException(status_code=500, detail="Deployment failed. Check server logs for details.")
|
raise HTTPException(status_code=500, detail="Deployment failed. Check server logs for details.")
|
||||||
|
|
||||||
return {"message": "Deckies deployed successfully"}
|
return {"message": "Deckies deployed successfully", "mode": "unihost"}
|
||||||
|
|||||||
@@ -47,15 +47,18 @@ def _worker_config(base: DecnetConfig, shard: list[DeckyConfig]) -> DecnetConfig
|
|||||||
return base.model_copy(update={"deckies": shard})
|
return base.model_copy(update={"deckies": shard})
|
||||||
|
|
||||||
|
|
||||||
@router.post("/deploy", response_model=SwarmDeployResponse, tags=["Swarm Deployments"])
|
async def dispatch_decnet_config(
|
||||||
async def api_deploy_swarm(
|
config: DecnetConfig,
|
||||||
req: SwarmDeployRequest,
|
repo: BaseRepository,
|
||||||
repo: BaseRepository = Depends(get_repo),
|
dry_run: bool = False,
|
||||||
|
no_cache: bool = False,
|
||||||
) -> SwarmDeployResponse:
|
) -> SwarmDeployResponse:
|
||||||
if req.config.mode != "swarm":
|
"""Shard ``config`` by ``host_uuid`` and dispatch to each worker in parallel.
|
||||||
raise HTTPException(status_code=400, detail="mode must be 'swarm'")
|
|
||||||
|
|
||||||
buckets = _shard_by_host(req.config)
|
Shared between POST /swarm/deploy (explicit swarm call) and the auto-swarm
|
||||||
|
branch of POST /deckies/deploy.
|
||||||
|
"""
|
||||||
|
buckets = _shard_by_host(config)
|
||||||
|
|
||||||
hosts: dict[str, dict[str, Any]] = {}
|
hosts: dict[str, dict[str, Any]] = {}
|
||||||
for host_uuid in buckets:
|
for host_uuid in buckets:
|
||||||
@@ -66,17 +69,17 @@ async def api_deploy_swarm(
|
|||||||
|
|
||||||
async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> SwarmHostResult:
|
async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> SwarmHostResult:
|
||||||
host = hosts[host_uuid]
|
host = hosts[host_uuid]
|
||||||
cfg = _worker_config(req.config, shard)
|
cfg = _worker_config(config, shard)
|
||||||
try:
|
try:
|
||||||
async with AgentClient(host=host) as agent:
|
async with AgentClient(host=host) as agent:
|
||||||
body = await agent.deploy(cfg, dry_run=req.dry_run, no_cache=req.no_cache)
|
body = await agent.deploy(cfg, dry_run=dry_run, no_cache=no_cache)
|
||||||
for d in shard:
|
for d in shard:
|
||||||
await repo.upsert_decky_shard(
|
await repo.upsert_decky_shard(
|
||||||
{
|
{
|
||||||
"decky_name": d.name,
|
"decky_name": d.name,
|
||||||
"host_uuid": host_uuid,
|
"host_uuid": host_uuid,
|
||||||
"services": json.dumps(d.services),
|
"services": json.dumps(d.services),
|
||||||
"state": "running" if not req.dry_run else "pending",
|
"state": "running" if not dry_run else "pending",
|
||||||
"last_error": None,
|
"last_error": None,
|
||||||
"updated_at": datetime.now(timezone.utc),
|
"updated_at": datetime.now(timezone.utc),
|
||||||
}
|
}
|
||||||
@@ -102,3 +105,15 @@ async def api_deploy_swarm(
|
|||||||
*(_dispatch(uuid_, shard) for uuid_, shard in buckets.items())
|
*(_dispatch(uuid_, shard) for uuid_, shard in buckets.items())
|
||||||
)
|
)
|
||||||
return SwarmDeployResponse(results=list(results))
|
return SwarmDeployResponse(results=list(results))
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/deploy", response_model=SwarmDeployResponse, tags=["Swarm Deployments"])
async def api_deploy_swarm(
    req: SwarmDeployRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmDeployResponse:
    """Handle the explicit swarm deploy call by delegating to the shared dispatcher.

    Rejects any config whose ``mode`` is not ``"swarm"`` with HTTP 400.
    Otherwise the config and the request's ``dry_run`` / ``no_cache`` flags
    are forwarded to ``dispatch_decnet_config`` and its response is returned
    unchanged.
    """
    swarm_config = req.config
    if swarm_config.mode != "swarm":
        raise HTTPException(status_code=400, detail="mode must be 'swarm'")

    response = await dispatch_decnet_config(
        swarm_config,
        repo,
        dry_run=req.dry_run,
        no_cache=req.no_cache,
    )
    return response
|
||||||
|
|||||||
6
decnet/web/router/system/__init__.py
Normal file
6
decnet/web/router/system/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
from fastapi import APIRouter
|
||||||
|
|
||||||
|
from .api_deployment_mode import router as deployment_mode_router
|
||||||
|
|
||||||
|
system_router = APIRouter(prefix="/system", tags=["System"])
|
||||||
|
system_router.include_router(deployment_mode_router)
|
||||||
41
decnet/web/router/system/api_deployment_mode.py
Normal file
41
decnet/web/router/system/api_deployment_mode.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
"""GET /system/deployment-mode — tells the UI whether a deploy will shard
|
||||||
|
across SWARM workers or land on the master itself.
|
||||||
|
|
||||||
|
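Logic mirrors the auto-mode branch in ``api_deploy_deckies``: master role
plus at least one swarm host whose stored status is ``active`` or
``enrolled`` and that has an address on record = swarm; otherwise unihost.
No live reachability probe is performed here.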
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
from decnet.web.dependencies import get_repo
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
class DeploymentModeResponse(BaseModel):
    """Response body returned by GET /deployment-mode."""

    # Where a deploy would land: "swarm" or "unihost".
    mode: str
    # Role of this process: "master" or "agent".
    role: str
    # How many swarm hosts were counted as usable deploy targets.
    swarm_host_count: int
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/deployment-mode", response_model=DeploymentModeResponse)
async def get_deployment_mode(
    repo: BaseRepository = Depends(get_repo),
) -> DeploymentModeResponse:
    """Report whether a deploy would be sharded across swarm hosts or run locally.

    The role is the lower-cased ``DECNET_MODE`` environment variable
    (defaulting to ``"master"``). Only a master consults the repository; a
    host counts when its ``status`` is ``"active"`` or ``"enrolled"`` and its
    ``address`` is truthy. At least one such host yields ``"swarm"``,
    otherwise ``"unihost"``.
    """
    role = os.environ.get("DECNET_MODE", "master").lower()

    usable_hosts = []
    if role == "master":
        for host in await repo.list_swarm_hosts():
            if host.get("status") not in ("active", "enrolled"):
                continue
            if not host.get("address"):
                continue
            usable_hosts.append(host)

    host_count = len(usable_hosts)
    mode = "swarm" if host_count else "unihost"
    return DeploymentModeResponse(mode=mode, role=role, swarm_host_count=host_count)
|
||||||
@@ -23,6 +23,7 @@ const DeckyFleet: React.FC = () => {
|
|||||||
const [iniContent, setIniContent] = useState('');
|
const [iniContent, setIniContent] = useState('');
|
||||||
const [deploying, setDeploying] = useState(false);
|
const [deploying, setDeploying] = useState(false);
|
||||||
const [isAdmin, setIsAdmin] = useState(false);
|
const [isAdmin, setIsAdmin] = useState(false);
|
||||||
|
const [deployMode, setDeployMode] = useState<{ mode: string; swarm_host_count: number } | null>(null);
|
||||||
|
|
||||||
const fetchDeckies = async () => {
|
const fetchDeckies = async () => {
|
||||||
try {
|
try {
|
||||||
@@ -102,9 +103,19 @@ const DeckyFleet: React.FC = () => {
|
|||||||
reader.readAsText(file);
|
reader.readAsText(file);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Ask the backend which deploy path (swarm vs. unihost) the next deploy will
// take, so the panel can show a hint. Any failure clears the hint.
const fetchDeployMode = async () => {
  try {
    const { data } = await api.get('/system/deployment-mode');
    const { mode, swarm_host_count } = data;
    setDeployMode({ mode, swarm_host_count });
  } catch {
    setDeployMode(null);
  }
};
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
fetchDeckies();
|
fetchDeckies();
|
||||||
fetchRole();
|
fetchRole();
|
||||||
|
fetchDeployMode();
|
||||||
const _interval = setInterval(fetchDeckies, 10000); // Fleet state updates less frequently than logs
|
const _interval = setInterval(fetchDeckies, 10000); // Fleet state updates less frequently than logs
|
||||||
return () => clearInterval(_interval);
|
return () => clearInterval(_interval);
|
||||||
}, []);
|
}, []);
|
||||||
@@ -131,7 +142,16 @@ const DeckyFleet: React.FC = () => {
|
|||||||
{showDeploy && (
|
{showDeploy && (
|
||||||
<div style={{ marginBottom: '24px', padding: '24px', backgroundColor: 'var(--secondary-color)', border: '1px solid var(--accent-color)', display: 'flex', flexDirection: 'column', gap: '16px' }}>
|
<div style={{ marginBottom: '24px', padding: '24px', backgroundColor: 'var(--secondary-color)', border: '1px solid var(--accent-color)', display: 'flex', flexDirection: 'column', gap: '16px' }}>
|
||||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center' }}>
|
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center' }}>
|
||||||
<h3 style={{ fontSize: '1rem', color: 'var(--text-color)' }}>Deploy via INI Configuration</h3>
|
<h3 style={{ fontSize: '1rem', color: 'var(--text-color)' }}>
|
||||||
|
Deploy via INI Configuration
|
||||||
|
{deployMode && (
|
||||||
|
<span style={{ marginLeft: 12, fontSize: '0.75rem', color: 'var(--dim-color)', fontWeight: 'normal' }}>
|
||||||
|
{deployMode.mode === 'swarm'
|
||||||
|
? `→ will shard across ${deployMode.swarm_host_count} SWARM host(s)`
|
||||||
|
: '→ will deploy locally (UNIHOST)'}
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</h3>
|
||||||
<div>
|
<div>
|
||||||
<input
|
<input
|
||||||
type="file"
|
type="file"
|
||||||
|
|||||||
105
tests/api/fleet/test_deploy_automode.py
Normal file
105
tests/api/fleet/test_deploy_automode.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
"""POST /deckies/deploy auto-mode: master + swarm hosts → shard to workers."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from unittest.mock import patch, AsyncMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from decnet.web.dependencies import repo
|
||||||
|
from decnet.web.db.models import SwarmDeployResponse, SwarmHostResult
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def contract_test_mode(monkeypatch):
    """Set DECNET_CONTRACT_TEST=true for every test in this module.

    ``monkeypatch`` restores the previous environment when the test ends.
    """
    monkeypatch.setenv("DECNET_CONTRACT_TEST", "true")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def mock_network():
    """Stub the network-detection helpers used by the deploy endpoint.

    Patches are applied to the names as imported into ``api_deploy_deckies``
    and stay active for the duration of each test.
    """
    target = "decnet.web.router.fleet.api_deploy_deckies"
    host_ip = patch(f"{target}.get_host_ip", return_value="192.168.1.100")
    interface = patch(f"{target}.detect_interface", return_value="eth0")
    subnet = patch(f"{target}.detect_subnet", return_value=("192.168.1.0/24", "192.168.1.1"))

    # Entered in the same order as the original nesting: ip, interface, subnet.
    with host_ip, interface, subnet:
        yield
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.anyio
async def test_deploy_automode_unihost_when_no_swarm_hosts(client, auth_token, monkeypatch):
    """No swarm hosts enrolled → local unihost deploy."""
    monkeypatch.setenv("DECNET_MODE", "master")

    # Start from a clean slate: no enrolled hosts, no prior deployment state.
    stale_hosts = await repo.list_swarm_hosts()
    for host in stale_hosts:
        await repo.delete_swarm_host(host["uuid"])
    await repo.set_state("deployment", None)

    payload = {"ini_content": "[decky-solo]\nservices = ssh\n"}
    auth_header = {"Authorization": f"Bearer {auth_token}"}
    resp = await client.post("/api/v1/deckies/deploy", json=payload, headers=auth_header)

    assert resp.status_code == 200, resp.text
    assert resp.json()["mode"] == "unihost"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.anyio
async def test_deploy_automode_shards_when_swarm_host_enrolled(client, auth_token, monkeypatch):
    """Master + one active swarm host → swarm mode, dispatch invoked.

    The enrolled host is removed in a ``finally`` block so that a failing
    assertion cannot leak ``host-A`` into the shared repository and push
    later tests onto the swarm path.
    """
    from datetime import datetime, timezone

    monkeypatch.setenv("DECNET_MODE", "master")
    await repo.set_state("deployment", None)

    # Remove any hosts left behind by earlier tests so round-robin only sees host-A.
    for row in await repo.list_swarm_hosts():
        await repo.delete_swarm_host(row["uuid"])

    await repo.add_swarm_host({
        "uuid": "host-A",
        "name": "worker-a",
        "address": "10.0.0.50",
        "agent_port": 8765,
        "status": "active",
        "client_cert_fingerprint": "x" * 64,
        "updater_cert_fingerprint": None,
        "cert_bundle_path": "/tmp/worker-a",
        "enrolled_at": datetime.now(timezone.utc),
        "notes": "",
    })

    try:
        fake_response = SwarmDeployResponse(results=[
            SwarmHostResult(host_uuid="host-A", host_name="worker-a", ok=True, detail={})
        ])

        # Patch the name as imported into the fleet router so no agent is contacted.
        with patch(
            "decnet.web.router.fleet.api_deploy_deckies.dispatch_decnet_config",
            new=AsyncMock(return_value=fake_response),
        ) as mock_dispatch:
            ini = "[decky-01]\nservices = ssh\n[decky-02]\nservices = http\n"
            resp = await client.post(
                "/api/v1/deckies/deploy",
                json={"ini_content": ini},
                headers={"Authorization": f"Bearer {auth_token}"},
            )

        assert resp.status_code == 200, resp.text
        assert resp.json()["mode"] == "swarm"
        assert mock_dispatch.await_count == 1

        dispatched_config = mock_dispatch.await_args.args[0]
        assert dispatched_config.mode == "swarm"
        # Guard against a vacuous pass: all() is True for an empty decky list.
        assert dispatched_config.deckies, "expected at least one decky to be dispatched"
        assert all(d.host_uuid == "host-A" for d in dispatched_config.deckies)
    finally:
        await repo.delete_swarm_host("host-A")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.anyio
async def test_deployment_mode_endpoint(client, auth_token, monkeypatch):
    """With no swarm hosts, the endpoint reports master role and unihost mode."""
    monkeypatch.setenv("DECNET_MODE", "master")

    # Empty the swarm host table so the count is deterministically zero.
    leftover_hosts = await repo.list_swarm_hosts()
    for host in leftover_hosts:
        await repo.delete_swarm_host(host["uuid"])

    auth_header = {"Authorization": f"Bearer {auth_token}"}
    resp = await client.get("/api/v1/system/deployment-mode", headers=auth_header)
    assert resp.status_code == 200

    body = resp.json()
    assert body["role"] == "master"
    assert body["mode"] == "unihost"
    assert body["swarm_host_count"] == 0
|
||||||
Reference in New Issue
Block a user