feat(agent): /deploy and /mutate become 202 fire-and-forget

The wizard API used to hang because /deckies/deploy ran docker compose
build && up -d synchronously, holding the request thread for minutes.
The worker side of that pipeline now returns 202 Accepted immediately
and runs the deploy in an asyncio.create_task.

On task completion (success or failure) the worker pushes a one-off
heartbeat carrying a lifecycle delta per decky:
  {decky_name, operation, status: succeeded|failed, error?, completed_at}

Master pivots these onto open DeckyLifecycle rows in the heartbeat
handler (next commit).  The scheduled 30s heartbeat tick is the
fallback if the immediate push drops.

- decnet/agent/app.py: /deploy and /mutate return 202; dry_run mutate
  still validates synchronously and returns 200.
- decnet/agent/executor.py: deploy_async + mutate_async wrap the work
  and push the completion delta.
- decnet/agent/heartbeat.py: push_lifecycle_delta() helper builds a
  one-off body and POSTs with the same mTLS context as the loop.
- decnet/swarm/client.py: revert deploy/mutate to control timeout
  (master no longer holds the HTTP request open for compose work).

Worker state.json gains no lifecycle field -- master DeckyLifecycle is
the source of truth; the master sweep handles crashed-mid-deploy
recovery.
This commit is contained in:
2026-05-22 16:31:23 -04:00
parent c0ad380020
commit d1ca96b2f4
6 changed files with 292 additions and 122 deletions

View File

@@ -25,6 +25,7 @@ from contextlib import asynccontextmanager
from typing import Any, Optional
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field
import contextlib
@@ -198,15 +199,22 @@ async def status() -> dict:
@app.post(
"/deploy",
responses={500: {"description": "Deployer raised an exception materialising the config"}},
status_code=202,
responses={202: {"description": "Deploy accepted; runs in background; lifecycle deltas pushed via heartbeat"}},
)
async def deploy(req: DeployRequest) -> dict:
try:
await _exec.deploy(req.config, dry_run=req.dry_run, no_cache=req.no_cache)
except Exception as exc:
log.exception("agent.deploy failed")
raise HTTPException(status_code=500, detail=str(exc)) from exc
return {"status": "deployed", "deckies": len(req.config.deckies)}
"""Spawn the deploy in the background and return 202 immediately.
The master tracks per-decky completion via lifecycle deltas pushed on
the next heartbeat (one immediate push on completion, plus the
scheduled 30 s ticks as a fallback). Holding the request open across
a multi-minute compose build was the previous source of the wizard
API-hang."""
asyncio.create_task(
_exec.deploy_async(req.config, dry_run=req.dry_run, no_cache=req.no_cache),
name=f"deploy-{id(req)}",
)
return {"status": "accepted", "deckies": [d.name for d in req.config.deckies]}
@app.post(
@@ -308,51 +316,50 @@ async def topology_state() -> dict:
@app.post(
"/mutate",
status_code=202,
responses={
404: {"description": "No active deployment, or unknown decky_id"},
500: {"description": "Compose rewrite or container restart failed"},
202: {"description": "Mutate accepted; runs in background; lifecycle delta pushed via heartbeat"},
404: {"description": "No active deployment, or unknown decky_id (dry_run validation only)"},
},
)
async def mutate(req: MutateRequest) -> dict:
import time
from decnet.composer import write_compose
from decnet.config import load_state, save_state
from decnet.engine import _compose_with_retry
state = load_state()
if state is None:
raise HTTPException(status_code=404, detail="no active deployment on this worker")
cfg, compose_path = state
decky = next((d for d in cfg.deckies if d.name == req.decky_id), None)
if decky is None:
raise HTTPException(
status_code=404, detail=f"decky {req.decky_id!r} not found in worker state",
)
decky.services = list(req.services)
decky.last_mutated = time.time()
async def mutate(req: MutateRequest) -> Any:
"""Spawn the mutate in the background and return 202 immediately.
Master tracks completion via a lifecycle delta pushed on the next
heartbeat (immediate push on completion). ``dry_run`` is still
synchronous — it validates against the worker's current state and
returns the would-be services without spawning a task or touching
docker, so the wizard's preview path stays cheap."""
if req.dry_run:
return {
"status": "dry_run",
"decky_id": decky.name,
"services": list(decky.services),
}
try:
save_state(cfg, compose_path)
write_compose(cfg, compose_path)
await asyncio.to_thread(
_compose_with_retry, "up", "-d", "--remove-orphans",
compose_file=compose_path,
from decnet.config import load_state
state = load_state()
if state is None:
raise HTTPException(
status_code=404,
detail="no active deployment on this worker",
)
cfg, _ = state
decky = next((d for d in cfg.deckies if d.name == req.decky_id), None)
if decky is None:
raise HTTPException(
status_code=404,
detail=f"decky {req.decky_id!r} not found in worker state",
)
return JSONResponse(
status_code=200,
content={
"status": "dry_run",
"decky_id": req.decky_id,
"services": list(req.services),
},
)
except Exception as exc:
log.exception("agent.mutate failed")
raise HTTPException(status_code=500, detail=str(exc)) from exc
asyncio.create_task(
_exec.mutate_async(req.decky_id, list(req.services)),
name=f"mutate-{req.decky_id}",
)
return {
"status": "mutated",
"decky_id": decky.name,
"services": list(decky.services),
"status": "accepted",
"decky_id": req.decky_id,
"services": list(req.services),
}