feat(swarm): remote teardown API + UI (per-decky and per-host)

Agents already exposed POST /teardown; the master was missing the plumbing
to reach it. Add:

- POST /api/v1/swarm/hosts/{uuid}/teardown — admin-gated. Body
  {decky_id: str|null}: null tears down the whole host; a value tears down
  only that decky. On worker failure the master returns 502 and leaves the
  DB shards intact so master and agent stay aligned.
- BaseRepository.delete_decky_shard(name) + sqlmodel impl for per-decky
  cleanup after a single-decky teardown.
- SwarmHosts page: "Teardown all" button (keeps host enrolled).
- SwarmDeckies page: per-row "Teardown" button.

Also exclude setuptools' build/ staging dir from the enrollment tarball —
`pip install -e` on the master generates build/lib/decnet_web/node_modules
and the bundle walker was leaking it to agents. Align pyproject's bandit
exclude with the git-hook invocation so both skip decnet/templates/.
This commit is contained in:
2026-04-19 19:39:28 -04:00
parent 6708f26e6b
commit 5dad1bb315
9 changed files with 305 additions and 3 deletions

View File

@@ -15,6 +15,7 @@ from .api_list_hosts import router as list_hosts_router
from .api_decommission_host import router as decommission_host_router
from .api_list_deckies import router as list_deckies_router
from .api_enroll_bundle import router as enroll_bundle_router
from .api_teardown_host import router as teardown_host_router
swarm_mgmt_router = APIRouter(prefix="/swarm")
@@ -22,3 +23,4 @@ swarm_mgmt_router.include_router(list_hosts_router)
swarm_mgmt_router.include_router(decommission_host_router)
swarm_mgmt_router.include_router(list_deckies_router)
swarm_mgmt_router.include_router(enroll_bundle_router)
swarm_mgmt_router.include_router(teardown_host_router)

View File

@@ -55,6 +55,9 @@ _EXCLUDES: tuple[str, ...] = (
".pytest_cache", ".pytest_cache/*",
".mypy_cache", ".mypy_cache/*",
"*.egg-info", "*.egg-info/*",
# setuptools build/ staging dir — created by `pip install` and leaks a
# nested decnet_web/node_modules/ copy into the bundle otherwise.
"build", "build/*", "build/**",
"*.pyc", "*.pyo",
"*.db", "*.db-wal", "*.db-shm", "decnet.db*",
"*.log",

View File

@@ -0,0 +1,70 @@
"""POST /swarm/hosts/{uuid}/teardown — remote teardown on a swarm worker.
Body: ``{"decky_id": "..."}`` (optional). When ``decky_id`` is null/omitted
the agent tears down the entire host (all deckies + network); otherwise it
tears down that single decky. Mirrors the arguments of the local
``decnet teardown`` CLI command.
"""
from __future__ import annotations
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
# Module logger, obtained under the "swarm.teardown" name.
log = get_logger("swarm.teardown")
# Router that carries the teardown route declared below; it has no prefix of
# its own, so the final URL depends on where this router is included.
router = APIRouter()
class TeardownHostRequest(BaseModel):
    """Request body accepted by the teardown endpoint."""

    # Id of the single decky to tear down. Defaults to None, which the
    # handler treats as "tear down the whole host".
    decky_id: Optional[str] = None
class TeardownHostResponse(BaseModel):
    """Response body returned by the teardown endpoint when it succeeds."""

    # UUID of the targeted host, as given in the URL path.
    host_uuid: str
    # The host record's "name" value, or "" when that value is missing/empty.
    host_name: str
    # Decky the teardown was scoped to; None for a whole-host teardown.
    decky_id: Optional[str] = None
    # The handler only builds a response on success and sets this to True;
    # failures are reported through HTTP errors instead.
    ok: bool
    # String form of whatever the agent's teardown call returned.
    detail: str
@router.post(
    "/hosts/{uuid}/teardown",
    response_model=TeardownHostResponse,
    tags=["Swarm Management"],
)
async def teardown_host(
    uuid: str,
    req: TeardownHostRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> TeardownHostResponse:
    """Dispatch a teardown to a swarm worker, then drop the matching shards.

    Args:
        uuid: UUID of the swarm host, taken from the URL path.
        req: Request body. A non-empty ``decky_id`` scopes the teardown to
            that decky; a null, omitted or empty value targets the whole
            host.
        admin: Result of the ``require_admin`` dependency. It is not read
            here; declaring it makes the dependency run for this route.
        repo: Repository used to look up the host and delete shard rows.

    Returns:
        A ``TeardownHostResponse`` with ``ok=True`` and the agent's reply
        stringified into ``detail``.

    Raises:
        HTTPException: 404 when no host with ``uuid`` is found; 502 when the
            agent call raises, in which case no shard rows are deleted.
    """
    host = await repo.get_swarm_host_by_uuid(uuid)
    if host is None:
        raise HTTPException(status_code=404, detail="host not found")

    # Decide the scope exactly once. The shard cleanup below has always
    # treated an empty id as "whole host"; collapsing "" to None up front
    # makes the agent receive that same decision instead of a non-null id,
    # so the worker and the database cannot disagree on what was torn down.
    decky_id = req.decky_id or None

    try:
        async with AgentClient(host=host) as agent:
            body = await agent.teardown(decky_id)
    except Exception as exc:
        # Any failure while reaching or running the agent is surfaced as a
        # bad-gateway error; shards are left untouched on this path.
        log.exception("swarm.teardown dispatch failed host=%s decky=%s",
                      host.get("name"), req.decky_id)
        raise HTTPException(status_code=502, detail=str(exc)) from exc

    # The agent call returned without raising: mirror the same scope in
    # the DB.
    if decky_id is not None:
        await repo.delete_decky_shard(decky_id)
    else:
        await repo.delete_decky_shards_for_host(uuid)

    return TeardownHostResponse(
        host_uuid=uuid,
        host_name=host.get("name") or "",
        decky_id=decky_id,
        ok=True,
        detail=str(body),
    )