feat(api): /deckies/deploy and /mutate become 202 fire-and-forget
This unblocks the wizard hang. Both endpoints used to run docker
compose synchronously inside the HTTP handler -- directly on the
master (unihost), or via an asyncio.gather of worker /deploy POSTs with
a 600s timeout each (swarm) -- blocking every other API request.
New flow:
1. Commit the new config shape to repo state (fast).
2. Create one DeckyLifecycle row per decky (status=pending).
3. Spawn asyncio.create_task(run_deploy / run_mutate) -- the
lifecycle runner drives rows through running -> succeeded|failed
and emits decky.<name>.lifecycle on the bus.
4. Return 202 with {lifecycle_ids: [...]}. Wizard polls
GET /deckies/lifecycle?ids=... (next commit).
mutator/engine.py gains pick_new_services() -- shared between the
async API path and the watch-loop's synchronous mutate_decky().
DeployResponse grows lifecycle_ids[]. The old dispatch_decnet_config
helper still exists for the CLI swarm-deploy command path; it just
isn't called from the API handler anymore.
Test changes: 200 -> 202, drop dispatch_decnet_config mocks (handler
no longer calls it), assert lifecycle_ids in response + committed
state matches expectations.
This commit is contained in:
@@ -37,6 +37,37 @@ log = get_logger("mutator")
|
||||
console = Console()
|
||||
|
||||
|
||||
def pick_new_services(decky: DeckyConfig) -> list[str] | None:
    """Draw a replacement service list for *decky*.

    The candidate pool is the archetype's services when ``decky.archetype``
    is set and ``get_archetype`` resolves it; when no archetype is set, or
    the lookup raises ``ValueError``, the pool is ``all_service_names()``.

    Between one and three services are sampled (fewer if the pool is
    smaller). The draw is repeated until it differs from the decky's
    current services, giving up after 21 draws -- in which case the last
    draw is returned even though it matches the current set.

    Returns ``None`` when the pool is empty, otherwise the chosen service
    names as a list (in set iteration order).

    Performs no repo, file-system, or docker access itself, so it can be
    shared by the mutator watch loop and the async API handler.
    """
    pool = None
    if decky.archetype:
        try:
            pool = list(get_archetype(decky.archetype).services)
        except ValueError:
            # Unknown archetype: fall through to the global pool below.
            pool = None
    if pool is None:
        pool = all_service_names()

    if not pool:
        return None

    existing = set(decky.services)
    max_pick = min(3, len(pool))
    picked: set[str] = set()
    # At most 21 draws; stop early as soon as the draw differs from the
    # services the decky is already running.
    for _ in range(21):
        size = random.randint(1, max_pick)  # nosec B311
        picked = set(random.sample(pool, size))  # nosec B311
        if picked != existing:
            break
    return list(picked)
|
||||
|
||||
|
||||
@_traced("mutator.mutate_decky")
|
||||
async def mutate_decky(
|
||||
decky_name: str,
|
||||
|
||||
@@ -20,8 +20,12 @@ class DeployIniRequest(BaseModel):
|
||||
|
||||
|
||||
class DeployResponse(BaseModel):
    """Response body for a 202-Accepted deploy.

    The deploy itself is spawned in the background; the client polls
    ``GET /deckies/lifecycle?ids=...`` until each row reaches a terminal
    status.
    """

    message: str
    mode: str
    # Ids of the lifecycle rows to poll; empty list when none are supplied.
    lifecycle_ids: list[str] = PydanticField(default_factory=list)
|
||||
|
||||
|
||||
class PurgeResponse(BaseModel):
|
||||
|
||||
@@ -1,16 +1,17 @@
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
|
||||
from decnet.bus.factory import get_bus
|
||||
from decnet.lifecycle.runner import run_deploy
|
||||
from decnet.logging import get_logger
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT
|
||||
from decnet.engine import deploy as _deploy
|
||||
from decnet.ini_loader import load_ini_from_string
|
||||
from decnet.network import detect_interface, detect_subnet, get_host_ip
|
||||
from decnet.web.dependencies import require_admin, repo
|
||||
from decnet.web.db.models import DeployIniRequest, DeployResponse
|
||||
from decnet.web.router.swarm.api_deploy_swarm import dispatch_decnet_config
|
||||
|
||||
log = get_logger("api")
|
||||
|
||||
@@ -20,19 +21,19 @@ router = APIRouter()
|
||||
@router.post(
|
||||
"/deckies/deploy",
|
||||
tags=["Fleet Management"],
|
||||
status_code=status.HTTP_202_ACCEPTED,
|
||||
response_model=DeployResponse,
|
||||
responses={
|
||||
202: {"description": "Deploy accepted; poll GET /deckies/lifecycle?ids=... for terminal status"},
|
||||
400: {"description": "Bad Request (e.g. malformed JSON)"},
|
||||
401: {"description": "Could not validate credentials"},
|
||||
403: {"description": "Insufficient permissions"},
|
||||
409: {"description": "Configuration conflict (e.g. invalid IP allocation or network mismatch)"},
|
||||
422: {"description": "Invalid INI config or schema validation error"},
|
||||
500: {"description": "Deployment failed"},
|
||||
502: {"description": "Partial swarm deploy failure — one or more worker hosts returned an error"},
|
||||
}
|
||||
)
|
||||
@_traced("api.deploy_deckies")
|
||||
async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(require_admin)) -> dict[str, str]:
|
||||
async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(require_admin)) -> dict:
|
||||
from decnet.fleet import build_deckies_from_ini
|
||||
|
||||
try:
|
||||
@@ -136,46 +137,46 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
|
||||
for i, d in enumerate(unassigned):
|
||||
d.host_uuid = swarm_hosts[i % len(swarm_hosts)]["uuid"]
|
||||
config = config.model_copy(update={"mode": "swarm"})
|
||||
mode = "swarm"
|
||||
else:
|
||||
mode = "unihost"
|
||||
|
||||
try:
|
||||
result = await dispatch_decnet_config(config, repo, dry_run=False, no_cache=False)
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
log.exception("swarm-auto deploy dispatch failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail="Swarm dispatch failed. Check server logs.")
|
||||
# Commit the new shape before spawning so the wizard / dashboard
|
||||
# observe the intended fleet immediately; lifecycle rows track the
|
||||
# operation's progress separately.
|
||||
new_state_payload = {
|
||||
"config": config.model_dump(),
|
||||
"compose_path": state_dict["compose_path"] if state_dict else str(
|
||||
_ROOT / "docker-compose.yml",
|
||||
),
|
||||
}
|
||||
await repo.set_state("deployment", new_state_payload)
|
||||
|
||||
await repo.set_state("deployment", {
|
||||
"config": config.model_dump(),
|
||||
"compose_path": state_dict["compose_path"] if state_dict else "",
|
||||
lifecycle_ids: dict[str, str] = {}
|
||||
for d in config.deckies:
|
||||
lid = await repo.create_lifecycle({
|
||||
"decky_name": d.name,
|
||||
"host_uuid": d.host_uuid,
|
||||
"operation": "deploy",
|
||||
})
|
||||
lifecycle_ids[d.name] = lid
|
||||
|
||||
failed = [r for r in result.results if not r.ok]
|
||||
if failed:
|
||||
detail = "; ".join(f"{r.host_name}: {r.detail}" for r in failed)
|
||||
raise HTTPException(status_code=502, detail=f"Partial swarm deploy failure — {detail}")
|
||||
return {
|
||||
"message": f"Deckies deployed across {len(result.results)} swarm host(s)",
|
||||
"mode": "swarm",
|
||||
}
|
||||
|
||||
# Unihost path — docker-compose on the master itself.
|
||||
# NB: the JSON state file (decnet-state.json) and fleet_deckies DB rows
|
||||
# are both written *inside* _deploy(config) — engine.deployer is the
|
||||
# single shared sink for every fleet-creation path (CLI deploy, this
|
||||
# unihost API path, and per-worker SWARM agent deploys). Do not
|
||||
# duplicate save_state / fleet upserts here.
|
||||
try:
|
||||
if os.environ.get("DECNET_CONTRACT_TEST") != "true":
|
||||
_deploy(config)
|
||||
bus = get_bus(client_name="api.deploy")
|
||||
except Exception:
|
||||
bus = None
|
||||
|
||||
new_state_payload = {
|
||||
"config": config.model_dump(),
|
||||
"compose_path": str(_ROOT / "docker-compose.yml") if not state_dict else state_dict["compose_path"]
|
||||
}
|
||||
await repo.set_state("deployment", new_state_payload)
|
||||
except Exception as e:
|
||||
log.exception("Deployment failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail="Deployment failed. Check server logs for details.")
|
||||
if os.environ.get("DECNET_CONTRACT_TEST") != "true":
|
||||
asyncio.create_task(
|
||||
run_deploy(repo, bus, lifecycle_ids=lifecycle_ids, config=config),
|
||||
name=f"deploy-{mode}-{len(config.deckies)}",
|
||||
)
|
||||
|
||||
return {"message": "Deckies deployed successfully", "mode": "unihost"}
|
||||
return {
|
||||
"message": (
|
||||
f"Deploy accepted ({len(config.deckies)} decky/ies, mode={mode}). "
|
||||
f"Poll /deckies/lifecycle?ids=... for completion."
|
||||
),
|
||||
"mode": mode,
|
||||
"lifecycle_ids": list(lifecycle_ids.values()),
|
||||
}
|
||||
|
||||
@@ -1,34 +1,101 @@
|
||||
import os
|
||||
from fastapi import APIRouter, Depends, HTTPException, Path
|
||||
"""POST /deckies/{name}/mutate — operator-triggered single-decky mutate.
|
||||
|
||||
Returns 202 Accepted with one ``lifecycle_id`` per mutated decky. The
|
||||
real compose work runs in an ``asyncio.create_task``; the wizard polls
|
||||
``GET /deckies/lifecycle?ids=...`` until terminal.
|
||||
|
||||
Auto-mutate (the watch-loop path) still goes through
|
||||
``decnet.mutator.mutate_decky`` and is synchronous within that loop —
|
||||
it's a background process, not an HTTP request, so it doesn't need
|
||||
fire-and-forget.
|
||||
"""
|
||||
import asyncio
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Path as PathParam, status
|
||||
|
||||
from decnet.bus.factory import get_bus
|
||||
from decnet.config import DecnetConfig
|
||||
from decnet.lifecycle.runner import run_mutate
|
||||
from decnet.logging import get_logger
|
||||
from decnet.mutator.engine import pick_new_services
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.mutator import mutate_decky
|
||||
from decnet.web.db.models import MessageResponse
|
||||
from decnet.web.db.models import LifecycleAcceptedResponse
|
||||
from decnet.web.dependencies import require_admin, repo
|
||||
|
||||
log = get_logger("api.mutate")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
# Strong references to in-flight mutate tasks. The event loop keeps only
# weak references to tasks, so a task whose handle is dropped can be
# garbage-collected before it finishes.
_background_tasks: set[asyncio.Task] = set()


@router.post(
    "/deckies/{decky_name}/mutate",
    tags=["Fleet Management"],
    status_code=status.HTTP_202_ACCEPTED,
    response_model=LifecycleAcceptedResponse,
    responses={
        202: {"description": "Mutate accepted; poll GET /deckies/lifecycle?ids=..."},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "No active deployment, or decky not found, or no services available"},
        422: {"description": "Path parameter validation error (decky_name must match ^[a-z0-9\\-]{1,64}$)"},
    },
)
@_traced("api.mutate_decky")
async def api_mutate_decky(
    decky_name: str = PathParam(..., pattern=r"^[a-z0-9\-]{1,64}$"),
    admin: dict = Depends(require_admin),
) -> dict:
    """Accept a single-decky mutate and run it in the background.

    Steps:
      1. Load the ``deployment`` state and locate the decky by name.
      2. Pick a new service list with ``pick_new_services``.
      3. Persist the updated config, then create a lifecycle row with
         ``operation="mutate"``.
      4. Spawn ``run_mutate`` as an asyncio task and return immediately.

    Returns:
        ``{"lifecycle_ids": [<id>]}`` with the id of the created lifecycle
        row. In contract-test mode (``DECNET_CONTRACT_TEST == "true"``) a
        fixed placeholder id is returned and nothing is read or written.

    Raises:
        HTTPException: 404 when there is no deployment state, the decky is
            not in the stored config, or no services are available.
    """
    if os.environ.get("DECNET_CONTRACT_TEST") == "true":
        return {"lifecycle_ids": ["contract-test"]}

    state_dict = await repo.get_state("deployment")
    if state_dict is None:
        raise HTTPException(status_code=404, detail="No active deployment")
    config = DecnetConfig(**state_dict["config"])
    compose_path = Path(state_dict["compose_path"])
    decky = next((d for d in config.deckies if d.name == decky_name), None)
    if decky is None:
        raise HTTPException(status_code=404, detail=f"Decky {decky_name} not found")

    new_services = pick_new_services(decky)
    if new_services is None:
        raise HTTPException(
            status_code=404,
            detail=f"No services available to mutate {decky_name}",
        )

    # Commit the new shape to the DB before spawning, so observers
    # don't see a half-applied mutation if the master crashes mid-task.
    decky.services = list(new_services)
    decky.last_mutated = time.time()
    await repo.set_state(
        "deployment",
        {"config": config.model_dump(), "compose_path": str(compose_path)},
    )

    lifecycle_id = await repo.create_lifecycle({
        "decky_name": decky.name,
        "host_uuid": decky.host_uuid,
        "operation": "mutate",
    })

    # The bus is best-effort: the mutate still runs with bus=None, but the
    # failure is logged instead of being swallowed silently.
    try:
        bus = get_bus(client_name="api.mutate")
    except Exception:
        log.warning(
            "could not acquire bus for mutate of %s; continuing without it",
            decky.name,
            exc_info=True,
        )
        bus = None

    task = asyncio.create_task(
        run_mutate(
            repo, bus,
            lifecycle_id=lifecycle_id,
            decky=decky,
            services=list(new_services),
            full_config=config,
            compose_path=compose_path,
        ),
        name=f"mutate-{decky.name}",
    )
    # Hold the task until it completes, then drop the reference.
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)

    return {"lifecycle_ids": [lifecycle_id]}
|
||||
|
||||
Reference in New Issue
Block a user