feat(web): async teardowns — 202 + background task, UI allows parallel queue

Teardowns were synchronous all the way through: POST blocked on the
worker's docker-compose-down cycle (seconds to minutes), the frontend
locked tearingDown to a single string so only one button could be armed
at a time, and operators couldn't queue a second teardown until the
first returned. On a slow or flaky worker that meant staring at a
spinner for the full duration of the remote teardown.

Backend: POST /swarm/hosts/{uuid}/teardown returns 202 the instant the
request is validated. Affected shards flip to state='tearing_down'
synchronously before the response so the UI reflects progress
immediately, then the actual AgentClient call + DB cleanup run in an
asyncio.create_task (tracked in a module-level set to survive GC and
to be drainable by tests). On failure the shard flips to
'teardown_failed' with the error recorded — nothing is re-raised,
since there's no caller to catch it.

Frontend: swap tearingDown / decommissioning from 'string | null' to
'Set<string>'. Each button tracks its own in-flight state; the poll
loop picks up the final shard state from the backend. Multiple
teardowns can now be queued without blocking each other.
This commit is contained in:
2026-04-19 20:30:56 -04:00
parent 07ec4bc269
commit 9d68bb45c7
4 changed files with 179 additions and 44 deletions

View File

@@ -2,14 +2,19 @@
Body: ``{"decky_id": "..."}`` (optional). When ``decky_id`` is null/omitted
the agent tears down the entire host (all deckies + network); otherwise it
tears down that single decky. Mirrors the arguments of the local
``decnet teardown`` CLI command.
tears down that single decky.
Async-by-default: the endpoint returns 202 the moment the request is
accepted and runs the actual agent call + DB cleanup in a background task.
That lets the operator queue multiple teardowns in parallel without
blocking on slow docker-compose-down cycles on the worker.
"""
from __future__ import annotations
from typing import Optional
import asyncio
from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
from decnet.logging import get_logger
@@ -20,6 +25,23 @@ from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm.teardown")
router = APIRouter()
# Strong references to every in-flight background task. This registry
# (a) keeps tasks from being garbage-collected mid-flight and (b) lets
# tests wait for completion deterministically via ``await drain_pending()``.
_PENDING: "set[asyncio.Task]" = set()


def _spawn(coro) -> asyncio.Task:
    """Schedule *coro* as a tracked background task and return the task."""
    t = asyncio.create_task(coro)
    _PENDING.add(t)
    # Drop the reference automatically once the task finishes.
    t.add_done_callback(_PENDING.discard)
    return t


async def drain_pending() -> None:
    """Await all outstanding teardown tasks. Used by tests."""
    # Loop: a running task may itself spawn further tracked tasks.
    while _PENDING:
        batch = tuple(_PENDING)
        await asyncio.gather(*batch, return_exceptions=True)
class TeardownHostRequest(BaseModel):
    """Request body for ``POST /swarm/hosts/{uuid}/teardown``."""

    # None/omitted ⇒ tear down the entire host (all deckies + network);
    # otherwise only the named decky is torn down.
    decky_id: Optional[str] = None
@@ -29,13 +51,69 @@ class TeardownHostResponse(BaseModel):
host_uuid: str
host_name: str
decky_id: Optional[str] = None
ok: bool
accepted: bool
detail: str
async def _mark_tearing_down(
    repo: BaseRepository, host_uuid: str, decky_id: Optional[str]
) -> None:
    """Flip every affected shard to state='tearing_down'.

    Runs synchronously before the 202 response so the UI reflects
    progress immediately while the background task does the real work.
    """
    for shard in await repo.list_decky_shards(host_uuid):
        # When a single decky was requested, leave unrelated shards alone.
        if decky_id and shard.get("decky_name") != decky_id:
            continue
        updated = dict(shard)
        updated["state"] = "tearing_down"
        updated["last_error"] = None
        await repo.upsert_decky_shard(updated)
async def _run_teardown(
    host: dict[str, Any], repo: BaseRepository, decky_id: Optional[str]
) -> None:
    """Execute the remote teardown, then clean up the DB.

    Runs as a detached background task: nothing awaits us, so failures
    are logged and written onto the affected shard(s) rather than
    re-raised.
    """
    try:
        async with AgentClient(host=host) as agent:
            await agent.teardown(decky_id)
    except Exception as exc:
        log.exception(
            "swarm.teardown background task failed host=%s decky=%s",
            host.get("name"), decky_id,
        )
        # On failure the shard rows stay put — the operator needs the
        # error surfaced in the UI so they can inspect and retry.
        try:
            for shard in await repo.list_decky_shards(host["uuid"]):
                if decky_id and shard.get("decky_name") != decky_id:
                    continue
                failed = dict(shard)
                failed["state"] = "teardown_failed"
                failed["last_error"] = str(exc)[:512]
                await repo.upsert_decky_shard(failed)
        except Exception:
            log.exception("swarm.teardown failed to record shard failure")
        return

    # Agent call succeeded: drop the shard row(s) that were torn down.
    try:
        if decky_id:
            await repo.delete_decky_shard(decky_id)
        else:
            await repo.delete_decky_shards_for_host(host["uuid"])
    except Exception:
        log.exception("swarm.teardown DB cleanup failed (agent call succeeded)")
@router.post(
"/hosts/{uuid}/teardown",
response_model=TeardownHostResponse,
status_code=status.HTTP_202_ACCEPTED,
tags=["Swarm Management"],
)
async def teardown_host(
@@ -48,23 +126,18 @@ async def teardown_host(
if host is None:
raise HTTPException(status_code=404, detail="host not found")
try:
async with AgentClient(host=host) as agent:
body = await agent.teardown(req.decky_id)
except Exception as exc:
log.exception("swarm.teardown dispatch failed host=%s decky=%s",
host.get("name"), req.decky_id)
raise HTTPException(status_code=502, detail=str(exc)) from exc
await _mark_tearing_down(repo, uuid, req.decky_id)
if req.decky_id:
await repo.delete_decky_shard(req.decky_id)
else:
await repo.delete_decky_shards_for_host(uuid)
# Fire-and-forget: asyncio.create_task (not BackgroundTasks) so the
# task runs independently of this request's lifecycle — the operator
# can queue another teardown the moment this one returns 202 without
# waiting for any per-request cleanup phase.
_spawn(_run_teardown(host, repo, req.decky_id))
return TeardownHostResponse(
host_uuid=uuid,
host_name=host.get("name") or "",
decky_id=req.decky_id,
ok=True,
detail=str(body),
accepted=True,
detail="teardown queued",
)