feat(agent): /deploy and /mutate become 202 fire-and-forget

The wizard API used to hang because /deckies/deploy ran docker compose
build && up -d synchronously, holding the request thread for minutes.
The worker side of that pipeline now returns 202 Accepted immediately
and runs the deploy in an asyncio.create_task.

On task completion (success or failure) the worker pushes a one-off
heartbeat carrying a lifecycle delta per decky:
  {decky_name, operation, status: succeeded|failed, error?, completed_at}

Master pivots these onto open DeckyLifecycle rows in the heartbeat
handler (next commit).  The scheduled 30s heartbeat tick is the
fallback if the immediate push drops.

- decnet/agent/app.py: /deploy and /mutate return 202; dry_run mutate
  still validates synchronously and returns 200.
- decnet/agent/executor.py: deploy_async + mutate_async wrap the work
  and push the completion delta.
- decnet/agent/heartbeat.py: push_lifecycle_delta() helper builds a
  one-off body and POSTs with the same mTLS context as the loop.
- decnet/swarm/client.py: revert deploy/mutate to control timeout
  (master no longer holds the HTTP request open for compose work).

Worker state.json gains no lifecycle field -- master DeckyLifecycle is
the source of truth; the master sweep handles crashed-mid-deploy
recovery.
This commit is contained in:
2026-05-22 16:31:23 -04:00
parent c0ad380020
commit d1ca96b2f4
6 changed files with 292 additions and 122 deletions

View File

@@ -25,6 +25,7 @@ from contextlib import asynccontextmanager
from typing import Any, Optional from typing import Any, Optional
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
import contextlib import contextlib
@@ -198,15 +199,22 @@ async def status() -> dict:
@app.post( @app.post(
"/deploy", "/deploy",
responses={500: {"description": "Deployer raised an exception materialising the config"}}, status_code=202,
responses={202: {"description": "Deploy accepted; runs in background; lifecycle deltas pushed via heartbeat"}},
) )
async def deploy(req: DeployRequest) -> dict: async def deploy(req: DeployRequest) -> dict:
try: """Spawn the deploy in the background and return 202 immediately.
await _exec.deploy(req.config, dry_run=req.dry_run, no_cache=req.no_cache)
except Exception as exc: The master tracks per-decky completion via lifecycle deltas pushed on
log.exception("agent.deploy failed") the next heartbeat (one immediate push on completion, plus the
raise HTTPException(status_code=500, detail=str(exc)) from exc scheduled 30 s ticks as a fallback). Holding the request open across
return {"status": "deployed", "deckies": len(req.config.deckies)} a multi-minute compose build was the previous source of the wizard
API-hang."""
asyncio.create_task(
_exec.deploy_async(req.config, dry_run=req.dry_run, no_cache=req.no_cache),
name=f"deploy-{id(req)}",
)
return {"status": "accepted", "deckies": [d.name for d in req.config.deckies]}
@app.post( @app.post(
@@ -308,51 +316,50 @@ async def topology_state() -> dict:
@app.post( @app.post(
"/mutate", "/mutate",
status_code=202,
responses={ responses={
404: {"description": "No active deployment, or unknown decky_id"}, 202: {"description": "Mutate accepted; runs in background; lifecycle delta pushed via heartbeat"},
500: {"description": "Compose rewrite or container restart failed"}, 404: {"description": "No active deployment, or unknown decky_id (dry_run validation only)"},
}, },
) )
async def mutate(req: MutateRequest) -> dict: async def mutate(req: MutateRequest) -> Any:
import time """Spawn the mutate in the background and return 202 immediately.
from decnet.composer import write_compose
from decnet.config import load_state, save_state
from decnet.engine import _compose_with_retry
state = load_state()
if state is None:
raise HTTPException(status_code=404, detail="no active deployment on this worker")
cfg, compose_path = state
decky = next((d for d in cfg.deckies if d.name == req.decky_id), None)
if decky is None:
raise HTTPException(
status_code=404, detail=f"decky {req.decky_id!r} not found in worker state",
)
decky.services = list(req.services)
decky.last_mutated = time.time()
Master tracks completion via a lifecycle delta pushed on the next
heartbeat (immediate push on completion). ``dry_run`` is still
synchronous — it validates against the worker's current state and
returns the would-be services without spawning a task or touching
docker, so the wizard's preview path stays cheap."""
if req.dry_run: if req.dry_run:
return { from decnet.config import load_state
"status": "dry_run", state = load_state()
"decky_id": decky.name, if state is None:
"services": list(decky.services), raise HTTPException(
} status_code=404,
detail="no active deployment on this worker",
try: )
save_state(cfg, compose_path) cfg, _ = state
write_compose(cfg, compose_path) decky = next((d for d in cfg.deckies if d.name == req.decky_id), None)
await asyncio.to_thread( if decky is None:
_compose_with_retry, "up", "-d", "--remove-orphans", raise HTTPException(
compose_file=compose_path, status_code=404,
detail=f"decky {req.decky_id!r} not found in worker state",
)
return JSONResponse(
status_code=200,
content={
"status": "dry_run",
"decky_id": req.decky_id,
"services": list(req.services),
},
) )
except Exception as exc:
log.exception("agent.mutate failed")
raise HTTPException(status_code=500, detail=str(exc)) from exc
asyncio.create_task(
_exec.mutate_async(req.decky_id, list(req.services)),
name=f"mutate-{req.decky_id}",
)
return { return {
"status": "mutated", "status": "accepted",
"decky_id": decky.name, "decky_id": req.decky_id,
"services": list(decky.services), "services": list(req.services),
} }

View File

@@ -80,6 +80,99 @@ async def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = F
await asyncio.to_thread(_deployer.deploy, config, dry_run, no_cache, False) await asyncio.to_thread(_deployer.deploy, config, dry_run, no_cache, False)
async def deploy_async(
config: DecnetConfig, *, dry_run: bool = False, no_cache: bool = False,
) -> None:
"""Background-task body for /deploy: run the deploy, then push a
lifecycle delta to the master so it observes terminal transitions
immediately rather than waiting for the next scheduled heartbeat.
Per-decky lifecycle deltas — master pivots them onto the matching
open DeckyLifecycle rows via the heartbeat handler. Errors are
captured and pushed as ``failed`` deltas; the task itself never
raises (a crashed task would just leave master rows wedged).
"""
from datetime import datetime, timezone
from decnet.agent.heartbeat import push_lifecycle_delta
decky_names = [d.name for d in config.deckies]
try:
await deploy(config, dry_run=dry_run, no_cache=no_cache)
except Exception as exc: # noqa: BLE001
log.exception("agent.deploy_async failed")
err = f"{type(exc).__name__}: {exc}"
deltas = [
{
"decky_name": name, "operation": "deploy",
"status": "failed", "error": err[:2000],
"completed_at": datetime.now(timezone.utc).isoformat(),
}
for name in decky_names
]
await push_lifecycle_delta(deltas)
return
deltas = [
{
"decky_name": name, "operation": "deploy",
"status": "succeeded",
"completed_at": datetime.now(timezone.utc).isoformat(),
}
for name in decky_names
]
await push_lifecycle_delta(deltas)
async def mutate_async(decky_id: str, services: list[str]) -> None:
"""Background-task body for /mutate. Same shape as deploy_async:
perform the work, then push a single lifecycle delta on
completion (success or failure)."""
import time
from datetime import datetime, timezone
from decnet.composer import write_compose
from decnet.config import load_state, save_state
from decnet.engine import _compose_with_retry
from decnet.agent.heartbeat import push_lifecycle_delta
def _delta(status: str, error: str | None = None) -> dict:
out = {
"decky_name": decky_id, "operation": "mutate",
"status": status,
"completed_at": datetime.now(timezone.utc).isoformat(),
}
if error is not None:
out["error"] = error[:2000]
return out
try:
state = load_state()
if state is None:
await push_lifecycle_delta(
[_delta("failed", "no active deployment on this worker")],
)
return
cfg, compose_path = state
decky = next((d for d in cfg.deckies if d.name == decky_id), None)
if decky is None:
await push_lifecycle_delta(
[_delta("failed", f"decky {decky_id!r} not found in worker state")],
)
return
decky.services = list(services)
decky.last_mutated = time.time()
save_state(cfg, compose_path)
write_compose(cfg, compose_path)
await asyncio.to_thread(
_compose_with_retry, "up", "-d", "--remove-orphans",
compose_file=compose_path,
)
except Exception as exc: # noqa: BLE001
log.exception("agent.mutate_async failed decky=%s", decky_id)
err = f"{type(exc).__name__}: {exc}"
await push_lifecycle_delta([_delta("failed", err)])
return
await push_lifecycle_delta([_delta("succeeded")])
async def teardown(decky_id: str | None = None) -> None: async def teardown(decky_id: str | None = None) -> None:
log.info("agent.teardown decky_id=%s", decky_id) log.info("agent.teardown decky_id=%s", decky_id)
await asyncio.to_thread(_deployer.teardown, decky_id) await asyncio.to_thread(_deployer.teardown, decky_id)

View File

@@ -50,7 +50,11 @@ def _resolve_agent_dir() -> pathlib.Path:
return pki.DEFAULT_AGENT_DIR return pki.DEFAULT_AGENT_DIR
async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_version: str) -> None: async def _build_body(
host_uuid: str,
agent_version: str,
lifecycle: Optional[list[dict]] = None,
) -> dict:
snap = await _exec.status() snap = await _exec.status()
body: dict = { body: dict = {
"host_uuid": host_uuid, "host_uuid": host_uuid,
@@ -70,7 +74,13 @@ async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_versi
store.close() store.close()
except Exception: except Exception:
log.debug("heartbeat: topology state unavailable", exc_info=True) log.debug("heartbeat: topology state unavailable", exc_info=True)
if lifecycle:
body["lifecycle"] = lifecycle
return body
async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_version: str) -> None:
body = await _build_body(host_uuid, agent_version)
resp = await client.post(url, json=body) resp = await client.post(url, json=body)
# 403 / 404 are terminal-ish — we still keep looping because an # 403 / 404 are terminal-ish — we still keep looping because an
# operator may re-enrol the host mid-session, but we log loudly so # operator may re-enrol the host mid-session, but we log loudly so
@@ -134,6 +144,59 @@ def start() -> Optional[asyncio.Task]:
return _task return _task
async def push_lifecycle_delta(deltas: list[dict]) -> None:
"""Fire a one-off heartbeat POST carrying *deltas* in the
``lifecycle`` field. Each delta: ``{decky_name, operation, status,
error?, completed_at?}``.
Called by the agent executor on /deploy and /mutate completion so
the master observes the terminal transition immediately rather than
waiting up to ``INTERVAL_S`` for the next scheduled tick. Failures
are logged and swallowed; the next scheduled heartbeat carries the
same deltas via DB-side reconciliation, since the worker has no
durable per-row state to lose.
"""
from decnet.env import (
DECNET_HOST_UUID,
DECNET_MASTER_HOST,
DECNET_SWARMCTL_PORT,
)
if not deltas:
return
if not DECNET_HOST_UUID or not DECNET_MASTER_HOST:
log.debug("push_lifecycle_delta: identity unconfigured — skipping")
return
agent_dir = _resolve_agent_dir()
try:
ssl_ctx = build_worker_ssl_context(agent_dir)
except Exception:
log.exception("push_lifecycle_delta: SSL context unavailable")
return
try:
from decnet import __version__ as _v # type: ignore[attr-defined]
agent_version = _v
except Exception:
agent_version = "unknown"
url = f"https://{DECNET_MASTER_HOST}:{DECNET_SWARMCTL_PORT}/swarm/heartbeat"
try:
async with httpx.AsyncClient(verify=ssl_ctx, timeout=_TIMEOUT) as client:
body = await _build_body(
DECNET_HOST_UUID, agent_version, lifecycle=deltas,
)
resp = await client.post(url, json=body)
if resp.status_code not in (200, 204):
log.warning(
"lifecycle delta push rejected status=%d body=%s",
resp.status_code, resp.text[:200],
)
except Exception:
log.exception("push_lifecycle_delta failed — next scheduled tick will retry")
async def stop() -> None: async def stop() -> None:
global _task global _task
if _task is None: if _task is None:

View File

@@ -246,13 +246,11 @@ class AgentClient:
"dry_run": dry_run, "dry_run": dry_run,
"no_cache": no_cache, "no_cache": no_cache,
} }
# Swap in a long-deploy timeout for this call only. # Worker /deploy is async (202 fire-and-forget): the response only
old = self._require_client().timeout # acks acceptance; the real work runs in the agent's event loop
self._require_client().timeout = _TIMEOUT_DEPLOY # and reports terminal state via heartbeat lifecycle deltas. No
try: # need for the long deploy timeout here.
resp = await self._require_client().post("/deploy", json=body) resp = await self._require_client().post("/deploy", json=body)
finally:
self._require_client().timeout = old
resp.raise_for_status() resp.raise_for_status()
return resp.json() return resp.json()
@@ -268,14 +266,8 @@ class AgentClient:
"services": list(services), "services": list(services),
"dry_run": dry_run, "dry_run": dry_run,
} }
# Worker /mutate runs `compose up -d` which can pull/build; same # Worker /mutate is async (202): control-timeout is right.
# long-tail latency as /deploy. Swap the deploy timeout in. resp = await self._require_client().post("/mutate", json=body)
old = self._require_client().timeout
self._require_client().timeout = _TIMEOUT_DEPLOY
try:
resp = await self._require_client().post("/mutate", json=body)
finally:
self._require_client().timeout = old
resp.raise_for_status() resp.raise_for_status()
return resp.json() return resp.json()

View File

@@ -65,80 +65,67 @@ def _seed_state(monkeypatch, tmp_path):
return cell return cell
def test_mutate_success(monkeypatch, tmp_path) -> None: def test_mutate_returns_202_and_spawns_task(monkeypatch, tmp_path) -> None:
cell = _seed_state(monkeypatch, tmp_path) _seed_state(monkeypatch, tmp_path)
compose_calls: list[tuple] = [] spawned: list = []
write_compose_calls: list[tuple] = [] real_create_task = __import__("asyncio").create_task
monkeypatch.setattr( def _capture_create_task(coro, **kw):
"decnet.composer.write_compose", spawned.append(kw.get("name", ""))
lambda c, p: write_compose_calls.append((c, p)) or p, # Run the coro so it doesn't leak as a never-awaited warning,
) # but swap its body out for a no-op.
monkeypatch.setattr( coro.close()
"decnet.engine._compose_with_retry", # Return something task-like for the handler.
lambda *a, **kw: compose_calls.append((a, kw)), async def _noop():
) return None
return real_create_task(_noop())
monkeypatch.setattr("decnet.agent.app.asyncio.create_task", _capture_create_task)
client = TestClient(app) client = TestClient(app)
resp = client.post( resp = client.post(
"/mutate", "/mutate",
json={"decky_id": "decky-01", "services": ["http", "ftp"]}, json={"decky_id": "decky-01", "services": ["http", "ftp"]},
) )
assert resp.status_code == 200, resp.text assert resp.status_code == 202, resp.text
body = resp.json() body = resp.json()
assert body == {"status": "mutated", "decky_id": "decky-01", "services": ["http", "ftp"]} assert body == {
assert cell["cfg"].deckies[0].services == ["http", "ftp"] "status": "accepted",
assert cell["cfg"].deckies[0].last_mutated > 0 "decky_id": "decky-01",
assert len(write_compose_calls) == 1 "services": ["http", "ftp"],
assert len(compose_calls) == 1 }
assert compose_calls[0][0] == ("up", "-d", "--remove-orphans") assert spawned and spawned[0].startswith("mutate-")
def test_mutate_unknown_decky_returns_404(monkeypatch, tmp_path) -> None: def test_mutate_dry_run_404_when_no_state(monkeypatch) -> None:
_seed_state(monkeypatch, tmp_path)
compose_calls: list = []
monkeypatch.setattr(
"decnet.engine._compose_with_retry",
lambda *a, **kw: compose_calls.append((a, kw)),
)
client = TestClient(app)
resp = client.post(
"/mutate", json={"decky_id": "ghost", "services": ["ssh"]},
)
assert resp.status_code == 404
assert compose_calls == []
def test_mutate_no_state_returns_404(monkeypatch) -> None:
monkeypatch.setattr("decnet.config.load_state", lambda: None) monkeypatch.setattr("decnet.config.load_state", lambda: None)
client = TestClient(app) client = TestClient(app)
resp = client.post( resp = client.post(
"/mutate", json={"decky_id": "decky-01", "services": ["ssh"]}, "/mutate",
json={"decky_id": "decky-01", "services": ["ssh"], "dry_run": True},
) )
assert resp.status_code == 404 assert resp.status_code == 404
def test_mutate_dry_run_does_not_touch_docker_or_state(monkeypatch, tmp_path) -> None: def test_mutate_dry_run_404_for_unknown_decky(monkeypatch, tmp_path) -> None:
cell = _seed_state(monkeypatch, tmp_path) _seed_state(monkeypatch, tmp_path)
saved: list = [] client = TestClient(app)
written: list = [] resp = client.post(
composed: list = [] "/mutate",
json={"decky_id": "ghost", "services": ["ssh"], "dry_run": True},
)
assert resp.status_code == 404
monkeypatch.setattr(
"decnet.config.save_state", def test_mutate_dry_run_returns_services_without_touching_docker(
lambda c, p: saved.append((c, p)), monkeypatch, tmp_path,
) ) -> None:
monkeypatch.setattr( _seed_state(monkeypatch, tmp_path)
"decnet.composer.write_compose", composed: list = []
lambda c, p: written.append((c, p)),
)
monkeypatch.setattr( monkeypatch.setattr(
"decnet.engine._compose_with_retry", "decnet.engine._compose_with_retry",
lambda *a, **kw: composed.append((a, kw)), lambda *a, **kw: composed.append((a, kw)),
) )
original_services = list(cell["cfg"].deckies[0].services)
client = TestClient(app) client = TestClient(app)
resp = client.post( resp = client.post(
"/mutate", "/mutate",
@@ -146,14 +133,39 @@ def test_mutate_dry_run_does_not_touch_docker_or_state(monkeypatch, tmp_path) ->
) )
assert resp.status_code == 200 assert resp.status_code == 200
assert resp.json()["status"] == "dry_run" assert resp.json()["status"] == "dry_run"
# No persistence, no compose render, no docker.
assert saved == []
assert written == []
assert composed == [] assert composed == []
# State on the in-memory cell was touched (handler mutated the loaded
# DeckyConfig) but never persisted — load_state is shared by reference,
# so we only assert that no save/render happened above. def test_deploy_returns_202_and_spawns_task(monkeypatch) -> None:
del original_services from decnet.config import DecnetConfig, DeckyConfig
cfg = DecnetConfig(
mode="unihost", interface="eth0",
subnet="10.66.0.0/24", gateway="10.66.0.1",
deckies=[DeckyConfig(
name="decky-01", ip="10.66.0.10",
services=["ssh"], distro="debian",
base_image="debian:bookworm-slim", hostname="d01",
)],
)
spawned: list = []
real_create_task = __import__("asyncio").create_task
def _capture_create_task(coro, **kw):
spawned.append(kw.get("name", ""))
coro.close()
async def _noop():
return None
return real_create_task(_noop())
monkeypatch.setattr("decnet.agent.app.asyncio.create_task", _capture_create_task)
client = TestClient(app)
resp = client.post("/deploy", json={"config": cfg.model_dump(mode="json")})
assert resp.status_code == 202, resp.text
body = resp.json()
assert body["status"] == "accepted"
assert body["deckies"] == ["decky-01"]
assert spawned and spawned[0].startswith("deploy-")
def test_deploy_rejects_malformed_body() -> None: def test_deploy_rejects_malformed_body() -> None:

View File

@@ -142,8 +142,11 @@ async def test_client_mutate_unknown_decky_404(
async with swarm_client.AgentClient( async with swarm_client.AgentClient(
address="127.0.0.1", agent_port=port, identity=master_id, address="127.0.0.1", agent_port=port, identity=master_id,
) as agent: ) as agent:
# Only dry_run can surface 404 synchronously; the live path is
# 202 fire-and-forget and would surface failure via the
# heartbeat lifecycle delta.
with pytest.raises(httpx.HTTPStatusError) as ei: with pytest.raises(httpx.HTTPStatusError) as ei:
await agent.mutate("ghost", ["ssh"]) await agent.mutate("ghost", ["ssh"], dry_run=True)
assert ei.value.response.status_code == 404 assert ei.value.response.status_code == 404
finally: finally:
server.should_exit = True server.should_exit = True