All checks were successful
CI / Lint (ruff) (push) Successful in 16s
CI / SAST (bandit) (push) Successful in 18s
CI / Dependency audit (pip-audit) (push) Successful in 26s
CI / Test (Standard) (3.11) (push) Successful in 2m41s
CI / Test (Live) (3.11) (push) Successful in 1m6s
CI / Test (Fuzz) (3.11) (push) Successful in 1h9m14s
CI / Finalize Merge to Main (push) Has been skipped
CI / Merge dev → testing (push) Successful in 12s
CI / Prepare Merge to Main (push) Has been skipped
Schemathesis was failing CI on routes that returned status codes not declared in their OpenAPI responses= dicts. Adds the missing codes across swarm_updates, swarm_mgmt, swarm, fleet and attackers routers. Also adds 400 to every POST/PUT/PATCH that accepts a JSON body — Starlette returns 400 on malformed/non-UTF8 bodies before FastAPI's 422 validation runs, which schemathesis fuzzing trips every time. No handler logic changed.
151 lines
5.0 KiB
Python
151 lines
5.0 KiB
Python
"""POST /swarm/hosts/{uuid}/teardown — remote teardown on a swarm worker.
|
|
|
|
Body: ``{"decky_id": "..."}`` (optional). When ``decky_id`` is null/omitted
|
|
the agent tears down the entire host (all deckies + network); otherwise it
|
|
tears down that single decky.
|
|
|
|
Async-by-default: the endpoint returns 202 the moment the request is
|
|
accepted and runs the actual agent call + DB cleanup in a background task.
|
|
That lets the operator queue multiple teardowns in parallel without
|
|
blocking on slow docker-compose-down cycles on the worker.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from typing import Any, Optional
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException, status
|
|
from pydantic import BaseModel
|
|
|
|
from decnet.logging import get_logger
|
|
from decnet.swarm.client import AgentClient
|
|
from decnet.web.db.repository import BaseRepository
|
|
from decnet.web.dependencies import get_repo, require_admin
|
|
|
|
# Module-scoped logger; records from this router are emitted under the
# "swarm.teardown" name.
log = get_logger("swarm.teardown")
# Router that carries the teardown endpoint. No prefix is set here —
# presumably the including app mounts it under /swarm; confirm against
# the application wiring.
router = APIRouter()

# Track spawned background tasks so (a) they're not GC'd mid-flight and
# (b) tests can drain them deterministically via ``await drain_pending()``.
# Entries are added by ``_spawn`` and removed by each task's done-callback.
_PENDING: "set[asyncio.Task]" = set()
|
|
|
|
|
|
def _spawn(coro) -> asyncio.Task:
    """Schedule *coro* as a task and register it in ``_PENDING``.

    The set holds a strong reference while the task is in flight; a
    done-callback removes the task from the set once it has finished.
    Returns the created task.
    """
    background = asyncio.create_task(coro)
    # Done-callbacks are only ever scheduled on the loop, never invoked
    # inline, so registering the callback before the add is equivalent.
    background.add_done_callback(_PENDING.discard)
    _PENDING.add(background)
    return background
|
|
|
|
|
|
async def drain_pending() -> None:
    """Wait until no teardown task is left in ``_PENDING``. Used by tests.

    Re-checks the set after every batch, so tasks registered while an
    earlier batch was being awaited are awaited as well. Exceptions
    raised by the tasks are returned by ``gather`` as results rather
    than raised here.
    """
    while _PENDING:
        # Snapshot first: done-callbacks mutate the set while we wait.
        batch = tuple(_PENDING)
        await asyncio.gather(*batch, return_exceptions=True)
|
|
|
|
|
|
# Request body accepted by the teardown endpoint.
class TeardownHostRequest(BaseModel):
    # Decky to tear down. ``None`` (the default, also used when the key
    # is omitted) requests teardown of the entire host.
    decky_id: Optional[str] = None
|
|
|
|
|
|
# Body returned by the teardown endpoint alongside its 202 status.
class TeardownHostResponse(BaseModel):
    # UUID of the host, echoed from the request path.
    host_uuid: str
    # Stored host name; the endpoint substitutes "" when it is missing
    # or empty.
    host_name: str
    # Decky targeted by the request, or None for a whole-host teardown.
    decky_id: Optional[str] = None
    # The endpoint sets this to True whenever it queues a teardown.
    accepted: bool
    # Human-readable status; the endpoint sends "teardown queued".
    detail: str
|
|
|
|
|
|
async def _mark_tearing_down(
    repo: BaseRepository, host_uuid: str, decky_id: Optional[str]
) -> None:
    """Set state='tearing_down' on the shards a teardown is about to hit.

    Lets the UI show progress immediately while the background task
    runs. With a truthy ``decky_id`` only shards whose ``decky_name``
    equals it are rewritten; otherwise every shard listed for
    ``host_uuid`` is. ``last_error`` is reset to None on each rewritten
    shard; all other keys are carried over unchanged.
    """
    for shard in await repo.list_decky_shards(host_uuid):
        in_scope = not decky_id or shard.get("decky_name") == decky_id
        if in_scope:
            await repo.upsert_decky_shard(
                dict(shard, state="tearing_down", last_error=None)
            )
|
|
|
|
|
|
async def _run_teardown(
    host: dict[str, Any], repo: BaseRepository, decky_id: Optional[str]
) -> None:
    """Fire the remote teardown, then clean up the shard rows.

    Runs as a detached background task, so nothing awaits the result:
    every failure is logged and, where possible, reflected on the shard
    so the UI surfaces it. Exceptions are never re-raised.

    Args:
        host: Swarm host row; ``host["uuid"]`` and ``host.get("name")``
            are read here and the whole dict is handed to ``AgentClient``.
        repo: Repository used to update or delete shard rows.
            NOTE(review): this object comes from a request dependency
            but is used after the response is sent — confirm it stays
            usable outside the request lifecycle.
        decky_id: Decky to tear down; a falsy value means the whole host.

    Outcomes:
        * Agent call fails: affected shards are set to
          state='teardown_failed' with ``last_error`` filled in; nothing
          is deleted, so the operator can see what went wrong and retry.
        * Agent call succeeds: the single shard (``decky_id`` given) or
          all of the host's shards are deleted.
    """
    try:
        async with AgentClient(host=host) as agent:
            await agent.teardown(decky_id)
    except Exception as exc:
        log.exception(
            "swarm.teardown background task failed host=%s decky=%s",
            host.get("name"), decky_id,
        )
        # Exceptions raised without a message (bare timeouts, some
        # transport errors) stringify to "". Fall back to the class name
        # so a failed shard never carries a blank error.
        reason = (str(exc) or type(exc).__name__)[:512]
        # Reflect the failure on the shard(s) — don't delete on failure,
        # the operator needs to see what went wrong and retry.
        try:
            shards = await repo.list_decky_shards(host["uuid"])
            for shard in shards:
                if decky_id and shard.get("decky_name") != decky_id:
                    continue
                await repo.upsert_decky_shard({
                    **shard,
                    "state": "teardown_failed",
                    "last_error": reason,
                })
        except Exception:
            # Best effort: the original failure is already in the log.
            log.exception("swarm.teardown failed to record shard failure")
        return

    try:
        if decky_id:
            await repo.delete_decky_shard(decky_id)
        else:
            await repo.delete_decky_shards_for_host(host["uuid"])
    except Exception:
        # The worker is already torn down; only the local rows are stale.
        log.exception("swarm.teardown DB cleanup failed (agent call succeeded)")
|
|
|
|
|
|
@router.post(
    "/hosts/{uuid}/teardown",
    response_model=TeardownHostResponse,
    status_code=status.HTTP_202_ACCEPTED,
    tags=["Swarm Management"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Host not found"},
        422: {"description": "Request body or path parameter validation error"},
    },
)
async def teardown_host(
    uuid: str,
    req: TeardownHostRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> TeardownHostResponse:
    # Notes are kept as comments on purpose: FastAPI would publish a
    # docstring as the operation description in the OpenAPI schema.
    #
    # Flow: look the host up (404 if unknown), flag the affected shards
    # as tearing down, queue the real work, answer 202 straight away.
    # ``admin`` is unused in the body; the dependency is declared so
    # ``require_admin`` runs for every request.
    target = req.decky_id

    host_row = await repo.get_swarm_host_by_uuid(uuid)
    if host_row is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail="host not found"
        )

    await _mark_tearing_down(repo, uuid, target)

    # Fire-and-forget via asyncio.create_task (not BackgroundTasks): the
    # task runs independently of this request's lifecycle, so the
    # operator can queue another teardown the moment this one returns
    # 202 without waiting for any per-request cleanup phase.
    _spawn(_run_teardown(host_row, repo, target))

    display_name = host_row.get("name") or ""
    return TeardownHostResponse(
        host_uuid=uuid,
        host_name=display_name,
        decky_id=target,
        accepted=True,
        detail="teardown queued",
    )
|