From 9d68bb45c70d9b80d8405e323e38241faf5cc90c Mon Sep 17 00:00:00 2001 From: anti Date: Sun, 19 Apr 2026 20:30:56 -0400 Subject: [PATCH] =?UTF-8?q?feat(web):=20async=20teardowns=20=E2=80=94=2020?= =?UTF-8?q?2=20+=20background=20task,=20UI=20allows=20parallel=20queue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Teardowns were synchronous all the way through: POST blocked on the worker's docker-compose-down cycle (seconds to minutes), the frontend locked tearingDown to a single string so only one button could be armed at a time, and operators couldn't queue a second teardown until the first returned. On a flaky worker that meant staring at a spinner for the whole RTT. Backend: POST /swarm/hosts/{uuid}/teardown returns 202 the instant the request is validated. Affected shards flip to state='tearing_down' synchronously before the response so the UI reflects progress immediately, then the actual AgentClient call + DB cleanup run in an asyncio.create_task (tracked in a module-level set to survive GC and to be drainable by tests). On failure the shard flips to 'teardown_failed' with the error recorded — nothing is re-raised, since there's no caller to catch it. Frontend: swap tearingDown / decommissioning from 'string | null' to 'Set<string>'. Each button tracks its own in-flight state; the poll loop picks up the final shard state from the backend. Multiple teardowns can now be queued without blocking each other. 
--- .../router/swarm_mgmt/api_teardown_host.py | 109 +++++++++++++++--- decnet_web/src/components/SwarmDeckies.tsx | 18 ++- decnet_web/src/components/SwarmHosts.tsx | 24 ++-- tests/api/swarm_mgmt/test_teardown_host.py | 72 ++++++++++-- 4 files changed, 179 insertions(+), 44 deletions(-) diff --git a/decnet/web/router/swarm_mgmt/api_teardown_host.py b/decnet/web/router/swarm_mgmt/api_teardown_host.py index e648be5..8cc0732 100644 --- a/decnet/web/router/swarm_mgmt/api_teardown_host.py +++ b/decnet/web/router/swarm_mgmt/api_teardown_host.py @@ -2,14 +2,19 @@ Body: ``{"decky_id": "..."}`` (optional). When ``decky_id`` is null/omitted the agent tears down the entire host (all deckies + network); otherwise it -tears down that single decky. Mirrors the arguments of the local -``decnet teardown`` CLI command. +tears down that single decky. + +Async-by-default: the endpoint returns 202 the moment the request is +accepted and runs the actual agent call + DB cleanup in a background task. +That lets the operator queue multiple teardowns in parallel without +blocking on slow docker-compose-down cycles on the worker. """ from __future__ import annotations -from typing import Optional +import asyncio +from typing import Any, Optional -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, HTTPException, status from pydantic import BaseModel from decnet.logging import get_logger @@ -20,6 +25,23 @@ from decnet.web.dependencies import get_repo, require_admin log = get_logger("swarm.teardown") router = APIRouter() +# Track spawned background tasks so (a) they're not GC'd mid-flight and +# (b) tests can drain them deterministically via ``await drain_pending()``. +_PENDING: "set[asyncio.Task]" = set() + + +def _spawn(coro) -> asyncio.Task: + task = asyncio.create_task(coro) + _PENDING.add(task) + task.add_done_callback(_PENDING.discard) + return task + + +async def drain_pending() -> None: + """Await all outstanding teardown tasks. 
Used by tests.""" + while _PENDING: + await asyncio.gather(*list(_PENDING), return_exceptions=True) + class TeardownHostRequest(BaseModel): decky_id: Optional[str] = None @@ -29,13 +51,69 @@ class TeardownHostResponse(BaseModel): host_uuid: str host_name: str decky_id: Optional[str] = None - ok: bool + accepted: bool detail: str +async def _mark_tearing_down( + repo: BaseRepository, host_uuid: str, decky_id: Optional[str] +) -> None: + """Flip affected shards to state='tearing_down' so the UI can show + progress immediately while the background task runs.""" + shards = await repo.list_decky_shards(host_uuid) + for s in shards: + if decky_id and s.get("decky_name") != decky_id: + continue + await repo.upsert_decky_shard({ + **s, + "state": "tearing_down", + "last_error": None, + }) + + +async def _run_teardown( + host: dict[str, Any], repo: BaseRepository, decky_id: Optional[str] +) -> None: + """Fire the remote teardown + DB cleanup. Exceptions are logged and + reflected on the shard so the UI surfaces them — never re-raised, + since nothing is awaiting us.""" + try: + async with AgentClient(host=host) as agent: + await agent.teardown(decky_id) + except Exception as exc: + log.exception( + "swarm.teardown background task failed host=%s decky=%s", + host.get("name"), decky_id, + ) + # Reflect the failure on the shard(s) — don't delete on failure, + # the operator needs to see what went wrong and retry. 
+ try: + shards = await repo.list_decky_shards(host["uuid"]) + for s in shards: + if decky_id and s.get("decky_name") != decky_id: + continue + await repo.upsert_decky_shard({ + **s, + "state": "teardown_failed", + "last_error": str(exc)[:512], + }) + except Exception: + log.exception("swarm.teardown failed to record shard failure") + return + + try: + if decky_id: + await repo.delete_decky_shard(decky_id) + else: + await repo.delete_decky_shards_for_host(host["uuid"]) + except Exception: + log.exception("swarm.teardown DB cleanup failed (agent call succeeded)") + + @router.post( "/hosts/{uuid}/teardown", response_model=TeardownHostResponse, + status_code=status.HTTP_202_ACCEPTED, tags=["Swarm Management"], ) async def teardown_host( @@ -48,23 +126,18 @@ async def teardown_host( if host is None: raise HTTPException(status_code=404, detail="host not found") - try: - async with AgentClient(host=host) as agent: - body = await agent.teardown(req.decky_id) - except Exception as exc: - log.exception("swarm.teardown dispatch failed host=%s decky=%s", - host.get("name"), req.decky_id) - raise HTTPException(status_code=502, detail=str(exc)) from exc + await _mark_tearing_down(repo, uuid, req.decky_id) - if req.decky_id: - await repo.delete_decky_shard(req.decky_id) - else: - await repo.delete_decky_shards_for_host(uuid) + # Fire-and-forget: asyncio.create_task (not BackgroundTasks) so the + # task runs independently of this request's lifecycle — the operator + # can queue another teardown the moment this one returns 202 without + # waiting for any per-request cleanup phase. 
+ _spawn(_run_teardown(host, repo, req.decky_id)) return TeardownHostResponse( host_uuid=uuid, host_name=host.get("name") or "", decky_id=req.decky_id, - ok=True, - detail=str(body), + accepted=True, + detail="teardown queued", ) diff --git a/decnet_web/src/components/SwarmDeckies.tsx b/decnet_web/src/components/SwarmDeckies.tsx index ce3b2a3..93ebcf3 100644 --- a/decnet_web/src/components/SwarmDeckies.tsx +++ b/decnet_web/src/components/SwarmDeckies.tsx @@ -21,7 +21,7 @@ interface DeckyShard { const SwarmDeckies: React.FC = () => { const [shards, setShards] = useState([]); const [loading, setLoading] = useState(true); - const [tearingDown, setTearingDown] = useState(null); + const [tearingDown, setTearingDown] = useState>(new Set()); const [error, setError] = useState(null); // Two-click arm/commit replaces window.confirm() — browsers silently // suppress confirm() after the "prevent additional dialogs" opt-out. @@ -53,14 +53,22 @@ const SwarmDeckies: React.FC = () => { const key = `td:${s.host_uuid}:${s.decky_name}`; if (armed !== key) { arm(key); return; } setArmed(null); - setTearingDown(s.decky_name); + setTearingDown((prev) => new Set(prev).add(s.decky_name)); try { + // Endpoint returns 202 immediately; the actual teardown runs in the + // background on the backend. Shard state flips to 'tearing_down' and + // the 10s poll picks up the final state (gone on success, or + // 'teardown_failed' with an error). await api.post(`/swarm/hosts/${s.host_uuid}/teardown`, { decky_id: s.decky_name }); await fetch(); } catch (err: any) { alert(err?.response?.data?.detail || 'Teardown failed'); } finally { - setTearingDown(null); + setTearingDown((prev) => { + const next = new Set(prev); + next.delete(s.decky_name); + return next; + }); } }; @@ -115,12 +123,12 @@ const SwarmDeckies: React.FC = () => {