feat(agent): /deploy and /mutate become 202 fire-and-forget

The wizard API used to hang because /deckies/deploy ran docker compose
build && up -d synchronously, holding the request thread for minutes.
The worker side of that pipeline now returns 202 Accepted immediately
and runs the deploy in an asyncio.create_task.

On task completion (success or failure) the worker pushes a one-off
heartbeat carrying a lifecycle delta per decky:
  {decky_name, operation, status: succeeded|failed, error?, completed_at}

Master pivots these onto open DeckyLifecycle rows in the heartbeat
handler (next commit).  The scheduled 30s heartbeat tick is the
fallback if the immediate push drops.

- decnet/agent/app.py: /deploy and /mutate return 202; dry_run mutate
  still validates synchronously and returns 200.
- decnet/agent/executor.py: deploy_async + mutate_async wrap the work
  and push the completion delta.
- decnet/agent/heartbeat.py: push_lifecycle_delta() helper builds a
  one-off body and POSTs with the same mTLS context as the loop.
- decnet/swarm/client.py: revert deploy/mutate to control timeout
  (master no longer holds the HTTP request open for compose work).

Worker state.json gains no lifecycle field -- master DeckyLifecycle is
the source of truth; the master sweep handles crashed-mid-deploy
recovery.
This commit is contained in:
2026-05-22 16:31:23 -04:00
parent c0ad380020
commit d1ca96b2f4
6 changed files with 292 additions and 122 deletions

View File

@@ -80,6 +80,99 @@ async def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = F
await asyncio.to_thread(_deployer.deploy, config, dry_run, no_cache, False)
async def deploy_async(
config: DecnetConfig, *, dry_run: bool = False, no_cache: bool = False,
) -> None:
"""Background-task body for /deploy: run the deploy, then push a
lifecycle delta to the master so it observes terminal transitions
immediately rather than waiting for the next scheduled heartbeat.
Per-decky lifecycle deltas — master pivots them onto the matching
open DeckyLifecycle rows via the heartbeat handler. Errors are
captured and pushed as ``failed`` deltas; the task itself never
raises (a crashed task would just leave master rows wedged).
"""
from datetime import datetime, timezone
from decnet.agent.heartbeat import push_lifecycle_delta
decky_names = [d.name for d in config.deckies]
try:
await deploy(config, dry_run=dry_run, no_cache=no_cache)
except Exception as exc: # noqa: BLE001
log.exception("agent.deploy_async failed")
err = f"{type(exc).__name__}: {exc}"
deltas = [
{
"decky_name": name, "operation": "deploy",
"status": "failed", "error": err[:2000],
"completed_at": datetime.now(timezone.utc).isoformat(),
}
for name in decky_names
]
await push_lifecycle_delta(deltas)
return
deltas = [
{
"decky_name": name, "operation": "deploy",
"status": "succeeded",
"completed_at": datetime.now(timezone.utc).isoformat(),
}
for name in decky_names
]
await push_lifecycle_delta(deltas)
async def mutate_async(decky_id: str, services: list[str]) -> None:
"""Background-task body for /mutate. Same shape as deploy_async:
perform the work, then push a single lifecycle delta on
completion (success or failure)."""
import time
from datetime import datetime, timezone
from decnet.composer import write_compose
from decnet.config import load_state, save_state
from decnet.engine import _compose_with_retry
from decnet.agent.heartbeat import push_lifecycle_delta
def _delta(status: str, error: str | None = None) -> dict:
out = {
"decky_name": decky_id, "operation": "mutate",
"status": status,
"completed_at": datetime.now(timezone.utc).isoformat(),
}
if error is not None:
out["error"] = error[:2000]
return out
try:
state = load_state()
if state is None:
await push_lifecycle_delta(
[_delta("failed", "no active deployment on this worker")],
)
return
cfg, compose_path = state
decky = next((d for d in cfg.deckies if d.name == decky_id), None)
if decky is None:
await push_lifecycle_delta(
[_delta("failed", f"decky {decky_id!r} not found in worker state")],
)
return
decky.services = list(services)
decky.last_mutated = time.time()
save_state(cfg, compose_path)
write_compose(cfg, compose_path)
await asyncio.to_thread(
_compose_with_retry, "up", "-d", "--remove-orphans",
compose_file=compose_path,
)
except Exception as exc: # noqa: BLE001
log.exception("agent.mutate_async failed decky=%s", decky_id)
err = f"{type(exc).__name__}: {exc}"
await push_lifecycle_delta([_delta("failed", err)])
return
await push_lifecycle_delta([_delta("succeeded")])
async def teardown(decky_id: str | None = None) -> None:
log.info("agent.teardown decky_id=%s", decky_id)
await asyncio.to_thread(_deployer.teardown, decky_id)