feat(web): async teardowns — 202 + background task, UI allows parallel queue

Teardowns were synchronous all the way through: POST blocked on the
worker's docker-compose-down cycle (seconds to minutes), the frontend
locked tearingDown to a single string so only one button could be armed
at a time, and operators couldn't queue a second teardown until the
first returned. On a slow or flaky worker that meant staring at a
spinner for the full duration of the remote teardown.

Backend: POST /swarm/hosts/{uuid}/teardown returns 202 the instant the
request is validated. Affected shards flip to state='tearing_down'
synchronously before the response so the UI reflects progress
immediately, then the actual AgentClient call + DB cleanup run in an
asyncio.create_task (tracked in a module-level set to survive GC and
to be drainable by tests). On failure the shard flips to
'teardown_failed' with the error recorded — nothing is re-raised,
since there's no caller to catch it.

Frontend: swap tearingDown / decommissioning from 'string | null' to
'Set<string>'. Each button tracks its own in-flight state; the poll
loop picks up the final shard state from the backend. Multiple
teardowns can now be queued without blocking each other.
This commit is contained in:
2026-04-19 20:30:56 -04:00
parent 07ec4bc269
commit 9d68bb45c7
4 changed files with 179 additions and 44 deletions

View File

@@ -2,14 +2,19 @@
Body: ``{"decky_id": "..."}`` (optional). When ``decky_id`` is null/omitted
the agent tears down the entire host (all deckies + network); otherwise it
tears down that single decky. Mirrors the arguments of the local
``decnet teardown`` CLI command.
tears down that single decky.
Async-by-default: the endpoint returns 202 the moment the request is
accepted and runs the actual agent call + DB cleanup in a background task.
That lets the operator queue multiple teardowns in parallel without
blocking on slow docker-compose-down cycles on the worker.
"""
from __future__ import annotations
from typing import Optional
import asyncio
from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
from decnet.logging import get_logger
@@ -20,6 +25,23 @@ from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm.teardown")
router = APIRouter()
# Strong references to every in-flight background task. This registry
# (a) keeps tasks from being garbage-collected mid-flight and (b) lets
# tests wait for completion deterministically via ``await drain_pending()``.
_PENDING: "set[asyncio.Task]" = set()


def _spawn(coro) -> asyncio.Task:
    """Schedule *coro* as a tracked background task and return the task."""
    t = asyncio.create_task(coro)
    _PENDING.add(t)
    # Drop the reference automatically once the task finishes.
    t.add_done_callback(_PENDING.discard)
    return t


async def drain_pending() -> None:
    """Await all outstanding teardown tasks. Used by tests."""
    # Loop: a running task may itself spawn further tracked tasks.
    while _PENDING:
        batch = tuple(_PENDING)
        await asyncio.gather(*batch, return_exceptions=True)
class TeardownHostRequest(BaseModel):
    """Request body for ``POST /swarm/hosts/{uuid}/teardown``."""

    # None/omitted ⇒ tear down the entire host (all deckies + network);
    # otherwise only the named decky is torn down.
    decky_id: Optional[str] = None
@@ -29,13 +51,69 @@ class TeardownHostResponse(BaseModel):
host_uuid: str
host_name: str
decky_id: Optional[str] = None
ok: bool
accepted: bool
detail: str
async def _mark_tearing_down(
    repo: BaseRepository, host_uuid: str, decky_id: Optional[str]
) -> None:
    """Flip every affected shard to state='tearing_down'.

    Runs synchronously before the 202 response so the UI reflects
    progress immediately while the background task does the real work.
    """
    for shard in await repo.list_decky_shards(host_uuid):
        # When a single decky was requested, leave unrelated shards alone.
        if decky_id and shard.get("decky_name") != decky_id:
            continue
        updated = dict(shard)
        updated["state"] = "tearing_down"
        updated["last_error"] = None
        await repo.upsert_decky_shard(updated)
async def _run_teardown(
    host: dict[str, Any], repo: BaseRepository, decky_id: Optional[str]
) -> None:
    """Execute the remote teardown, then clean up the DB.

    Runs as a detached background task: nothing awaits us, so failures
    are logged and written onto the affected shard(s) rather than
    re-raised.
    """
    try:
        async with AgentClient(host=host) as agent:
            await agent.teardown(decky_id)
    except Exception as exc:
        log.exception(
            "swarm.teardown background task failed host=%s decky=%s",
            host.get("name"), decky_id,
        )
        # On failure the shard rows stay put — the operator needs the
        # error surfaced in the UI so they can inspect and retry.
        try:
            for shard in await repo.list_decky_shards(host["uuid"]):
                if decky_id and shard.get("decky_name") != decky_id:
                    continue
                failed = dict(shard)
                failed["state"] = "teardown_failed"
                failed["last_error"] = str(exc)[:512]
                await repo.upsert_decky_shard(failed)
        except Exception:
            log.exception("swarm.teardown failed to record shard failure")
        return

    # Agent call succeeded: drop the shard row(s) that were torn down.
    try:
        if decky_id:
            await repo.delete_decky_shard(decky_id)
        else:
            await repo.delete_decky_shards_for_host(host["uuid"])
    except Exception:
        log.exception("swarm.teardown DB cleanup failed (agent call succeeded)")
@router.post(
"/hosts/{uuid}/teardown",
response_model=TeardownHostResponse,
status_code=status.HTTP_202_ACCEPTED,
tags=["Swarm Management"],
)
async def teardown_host(
@@ -48,23 +126,18 @@ async def teardown_host(
if host is None:
raise HTTPException(status_code=404, detail="host not found")
try:
async with AgentClient(host=host) as agent:
body = await agent.teardown(req.decky_id)
except Exception as exc:
log.exception("swarm.teardown dispatch failed host=%s decky=%s",
host.get("name"), req.decky_id)
raise HTTPException(status_code=502, detail=str(exc)) from exc
await _mark_tearing_down(repo, uuid, req.decky_id)
if req.decky_id:
await repo.delete_decky_shard(req.decky_id)
else:
await repo.delete_decky_shards_for_host(uuid)
# Fire-and-forget: asyncio.create_task (not BackgroundTasks) so the
# task runs independently of this request's lifecycle — the operator
# can queue another teardown the moment this one returns 202 without
# waiting for any per-request cleanup phase.
_spawn(_run_teardown(host, repo, req.decky_id))
return TeardownHostResponse(
host_uuid=uuid,
host_name=host.get("name") or "",
decky_id=req.decky_id,
ok=True,
detail=str(body),
accepted=True,
detail="teardown queued",
)