feat(swarm): remote teardown API + UI (per-decky and per-host)

Agents already exposed POST /teardown; the master was missing the plumbing
to reach it. Add:

- POST /api/v1/swarm/hosts/{uuid}/teardown — admin-gated. Body
  {decky_id: str|null}: null tears down the whole host; a value tears down
  a single decky. On worker failure the master returns 502 and leaves DB
  shards intact so master and agent stay aligned.
- BaseRepository.delete_decky_shard(name) + sqlmodel impl for per-decky
  cleanup after a single-decky teardown.
- SwarmHosts page: "Teardown all" button (keeps host enrolled).
- SwarmDeckies page: per-row "Teardown" button.

Also exclude setuptools' build/ staging dir from the enrollment tarball —
`pip install -e` on the master generates build/lib/decnet_web/node_modules
and the bundle walker was leaking it to agents. Align pyproject's bandit
exclude with the git-hook invocation so both skip decnet/templates/.
This commit is contained in:
2026-04-19 19:39:28 -04:00
parent 6708f26e6b
commit 5dad1bb315
9 changed files with 305 additions and 3 deletions

View File

@@ -228,3 +228,6 @@ class BaseRepository(ABC):
async def delete_decky_shards_for_host(self, host_uuid: str) -> int: async def delete_decky_shards_for_host(self, host_uuid: str) -> int:
raise NotImplementedError raise NotImplementedError
async def delete_decky_shard(self, decky_name: str) -> bool:
    """Delete the shard record for ``decky_name``.

    Base-class placeholder: it always raises ``NotImplementedError`` here.
    A concrete repository is expected to override it and report whether a
    row was removed.
    """
    raise NotImplementedError

View File

@@ -861,3 +861,12 @@ class SQLModelRepository(BaseRepository):
) )
await session.commit() await session.commit()
return result.rowcount or 0 return result.rowcount or 0
async def delete_decky_shard(self, decky_name: str) -> bool:
    """Remove the ``decky_shards`` row(s) named ``decky_name``.

    Returns ``True`` when the DELETE reports a truthy rowcount, ``False``
    otherwise. The change is committed before returning.
    """
    stmt = text("DELETE FROM decky_shards WHERE decky_name = :n")
    params = {"n": decky_name}
    async with self._session() as session:
        outcome = await session.execute(stmt, params)
        await session.commit()
        # Read rowcount while the session is still open, as the result
        # object belongs to it.
        removed = bool(outcome.rowcount)
        return removed

View File

@@ -15,6 +15,7 @@ from .api_list_hosts import router as list_hosts_router
from .api_decommission_host import router as decommission_host_router from .api_decommission_host import router as decommission_host_router
from .api_list_deckies import router as list_deckies_router from .api_list_deckies import router as list_deckies_router
from .api_enroll_bundle import router as enroll_bundle_router from .api_enroll_bundle import router as enroll_bundle_router
from .api_teardown_host import router as teardown_host_router
swarm_mgmt_router = APIRouter(prefix="/swarm") swarm_mgmt_router = APIRouter(prefix="/swarm")
@@ -22,3 +23,4 @@ swarm_mgmt_router.include_router(list_hosts_router)
swarm_mgmt_router.include_router(decommission_host_router) swarm_mgmt_router.include_router(decommission_host_router)
swarm_mgmt_router.include_router(list_deckies_router) swarm_mgmt_router.include_router(list_deckies_router)
swarm_mgmt_router.include_router(enroll_bundle_router) swarm_mgmt_router.include_router(enroll_bundle_router)
swarm_mgmt_router.include_router(teardown_host_router)

View File

@@ -55,6 +55,9 @@ _EXCLUDES: tuple[str, ...] = (
".pytest_cache", ".pytest_cache/*", ".pytest_cache", ".pytest_cache/*",
".mypy_cache", ".mypy_cache/*", ".mypy_cache", ".mypy_cache/*",
"*.egg-info", "*.egg-info/*", "*.egg-info", "*.egg-info/*",
# setuptools build/ staging dir — created by `pip install` and leaks a
# nested decnet_web/node_modules/ copy into the bundle otherwise.
"build", "build/*", "build/**",
"*.pyc", "*.pyo", "*.pyc", "*.pyo",
"*.db", "*.db-wal", "*.db-shm", "decnet.db*", "*.db", "*.db-wal", "*.db-shm", "decnet.db*",
"*.log", "*.log",

View File

@@ -0,0 +1,70 @@
"""POST /swarm/hosts/{uuid}/teardown — remote teardown on a swarm worker.
Body: ``{"decky_id": "..."}`` (optional). When ``decky_id`` is null/omitted
the agent tears down the entire host (all deckies + network); otherwise it
tears down that single decky. Mirrors the arguments of the local
``decnet teardown`` CLI command.
"""
from __future__ import annotations
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
# Module-level logger, obtained under the name "swarm.teardown".
log = get_logger("swarm.teardown")
# Router carrying the teardown route; the swarm_mgmt package router includes it.
router = APIRouter()
# Request body for POST /hosts/{uuid}/teardown.
class TeardownHostRequest(BaseModel):
    # Decky to tear down; None (or omitted) asks for a whole-host teardown.
    decky_id: Optional[str] = None
# Response body returned by the teardown endpoint on success.
class TeardownHostResponse(BaseModel):
    # UUID of the host the teardown was dispatched to (the path parameter).
    host_uuid: str
    # Host's "name" field, or "" when the host record has none.
    host_name: str
    # Echo of the requested decky; None for a whole-host teardown.
    decky_id: Optional[str] = None
    # The endpoint sets this to True on its success path.
    ok: bool
    # str() of whatever the agent's teardown call returned.
    detail: str
@router.post(
    "/hosts/{uuid}/teardown",
    response_model=TeardownHostResponse,
    tags=["Swarm Management"],
)
async def teardown_host(
    uuid: str,
    req: TeardownHostRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> TeardownHostResponse:
    """Tear down one decky, or every decky, on a swarm worker.

    The request is forwarded to the worker's agent first. The master's
    shard rows are removed only after the agent call returns, so a failed
    dispatch leaves the database untouched.

    Raises:
        HTTPException: 404 when no host has this ``uuid``; 502 when the
            agent call raises for any reason.
    """
    host = await repo.get_swarm_host_by_uuid(uuid)
    if host is None:
        raise HTTPException(status_code=404, detail="host not found")

    # Normalise once: an empty string is treated like "no decky given".
    # Previously "" was forwarded to the agent as a decky id while the DB
    # cleanup below took the whole-host branch, so the two sides could act
    # on different scopes.
    decky_id = req.decky_id or None

    try:
        async with AgentClient(host=host) as agent:
            body = await agent.teardown(decky_id)
    except Exception as exc:
        # Boundary handler: log with traceback, surface as Bad Gateway, and
        # deliberately skip the shard cleanup.
        log.exception("swarm.teardown dispatch failed host=%s decky=%s",
                      host.get("name"), decky_id)
        raise HTTPException(status_code=502, detail=str(exc)) from exc

    # The agent succeeded: mirror exactly the scope that was dispatched.
    if decky_id is not None:
        await repo.delete_decky_shard(decky_id)
    else:
        await repo.delete_decky_shards_for_host(uuid)

    return TeardownHostResponse(
        host_uuid=uuid,
        host_name=host.get("name") or "",
        decky_id=decky_id,
        ok=True,
        detail=str(body),
    )

View File

@@ -2,7 +2,7 @@ import React, { useEffect, useState } from 'react';
import api from '../utils/api'; import api from '../utils/api';
import './Dashboard.css'; import './Dashboard.css';
import './Swarm.css'; import './Swarm.css';
import { Boxes, RefreshCw } from 'lucide-react'; import { Boxes, PowerOff, RefreshCw } from 'lucide-react';
interface DeckyShard { interface DeckyShard {
decky_name: string; decky_name: string;
@@ -20,6 +20,7 @@ interface DeckyShard {
const SwarmDeckies: React.FC = () => { const SwarmDeckies: React.FC = () => {
const [shards, setShards] = useState<DeckyShard[]>([]); const [shards, setShards] = useState<DeckyShard[]>([]);
const [loading, setLoading] = useState(true); const [loading, setLoading] = useState(true);
const [tearingDown, setTearingDown] = useState<string | null>(null);
const [error, setError] = useState<string | null>(null); const [error, setError] = useState<string | null>(null);
const fetch = async () => { const fetch = async () => {
@@ -40,6 +41,19 @@ const SwarmDeckies: React.FC = () => {
return () => clearInterval(t); return () => clearInterval(t);
}, []); }, []);
// Tear down a single decky on its host, after an explicit confirmation.
const handleTeardown = async (s: DeckyShard) => {
  const confirmed = window.confirm(`Tear down decky ${s.decky_name} on ${s.host_name}?`);
  if (!confirmed) {
    return;
  }

  // Mark this decky as busy so its button is disabled while the request runs.
  setTearingDown(s.decky_name);
  try {
    const url = `/swarm/hosts/${s.host_uuid}/teardown`;
    const payload = { decky_id: s.decky_name };
    await api.post(url, payload);
    // Reload the shard list through the component's own fetch().
    await fetch();
  } catch (err: any) {
    const detail = err?.response?.data?.detail;
    alert(detail || 'Teardown failed');
  } finally {
    setTearingDown(null);
  }
};
const byHost: Record<string, { name: string; address: string; status: string; shards: DeckyShard[] }> = {}; const byHost: Record<string, { name: string; address: string; status: string; shards: DeckyShard[] }> = {};
for (const s of shards) { for (const s of shards) {
if (!byHost[s.host_uuid]) { if (!byHost[s.host_uuid]) {
@@ -77,6 +91,7 @@ const SwarmDeckies: React.FC = () => {
<th>Services</th> <th>Services</th>
<th>Compose</th> <th>Compose</th>
<th>Updated</th> <th>Updated</th>
<th></th>
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
@@ -87,6 +102,16 @@ const SwarmDeckies: React.FC = () => {
<td>{s.services.join(', ')}</td> <td>{s.services.join(', ')}</td>
<td><code>{s.compose_hash ? s.compose_hash.slice(0, 8) : '—'}</code></td> <td><code>{s.compose_hash ? s.compose_hash.slice(0, 8) : '—'}</code></td>
<td>{new Date(s.updated_at).toLocaleString()}</td> <td>{new Date(s.updated_at).toLocaleString()}</td>
<td>
<button
className="control-btn danger"
disabled={tearingDown === s.decky_name}
onClick={() => handleTeardown(s)}
title="Stop this decky on its host"
>
<PowerOff size={14} /> {tearingDown === s.decky_name ? 'Tearing down…' : 'Teardown'}
</button>
</td>
</tr> </tr>
))} ))}
</tbody> </tbody>

View File

@@ -2,7 +2,7 @@ import React, { useEffect, useState } from 'react';
import api from '../utils/api'; import api from '../utils/api';
import './Dashboard.css'; import './Dashboard.css';
import './Swarm.css'; import './Swarm.css';
import { HardDrive, RefreshCw, Trash2, Wifi, WifiOff } from 'lucide-react'; import { HardDrive, PowerOff, RefreshCw, Trash2, Wifi, WifiOff } from 'lucide-react';
interface SwarmHost { interface SwarmHost {
uuid: string; uuid: string;
@@ -23,6 +23,7 @@ const SwarmHosts: React.FC = () => {
const [hosts, setHosts] = useState<SwarmHost[]>([]); const [hosts, setHosts] = useState<SwarmHost[]>([]);
const [loading, setLoading] = useState(true); const [loading, setLoading] = useState(true);
const [decommissioning, setDecommissioning] = useState<string | null>(null); const [decommissioning, setDecommissioning] = useState<string | null>(null);
const [tearingDown, setTearingDown] = useState<string | null>(null);
const [error, setError] = useState<string | null>(null); const [error, setError] = useState<string | null>(null);
const fetchHosts = async () => { const fetchHosts = async () => {
@@ -43,6 +44,19 @@ const SwarmHosts: React.FC = () => {
return () => clearInterval(t); return () => clearInterval(t);
}, []); }, []);
// Tear down every decky on a host; the host itself stays enrolled.
const handleTeardownAll = async (host: SwarmHost) => {
  const confirmed = window.confirm(`Tear down ALL deckies on ${host.name}? The host stays enrolled.`);
  if (!confirmed) {
    return;
  }

  // Mark this host as busy so its button is disabled while the request runs.
  setTearingDown(host.uuid);
  try {
    const url = `/swarm/hosts/${host.uuid}/teardown`;
    // An empty body carries no decky_id, i.e. a whole-host teardown.
    await api.post(url, {});
    await fetchHosts();
  } catch (err: any) {
    const detail = err?.response?.data?.detail;
    alert(detail || 'Teardown failed');
  } finally {
    setTearingDown(null);
  }
};
const handleDecommission = async (host: SwarmHost) => { const handleDecommission = async (host: SwarmHost) => {
if (!window.confirm(`Decommission ${host.name} (${host.address})? This removes certs and decky mappings.`)) return; if (!window.confirm(`Decommission ${host.name} (${host.address})? This removes certs and decky mappings.`)) return;
setDecommissioning(host.uuid); setDecommissioning(host.uuid);
@@ -97,6 +111,14 @@ const SwarmHosts: React.FC = () => {
<td title={h.client_cert_fingerprint}><code>{shortFp(h.client_cert_fingerprint)}</code></td> <td title={h.client_cert_fingerprint}><code>{shortFp(h.client_cert_fingerprint)}</code></td>
<td>{new Date(h.enrolled_at).toLocaleString()}</td> <td>{new Date(h.enrolled_at).toLocaleString()}</td>
<td> <td>
<button
className="control-btn"
disabled={tearingDown === h.uuid || h.status !== 'active'}
onClick={() => handleTeardownAll(h)}
title="Stop all deckies on this host (keeps it enrolled)"
>
<PowerOff size={14} /> {tearingDown === h.uuid ? 'Tearing down…' : 'Teardown all'}
</button>
<button <button
className="control-btn danger" className="control-btn danger"
disabled={decommissioning === h.uuid} disabled={decommissioning === h.uuid}

View File

@@ -111,6 +111,9 @@ include = ["decnet*"]
decnet = ["templates/**/*"] decnet = ["templates/**/*"]
[tool.bandit] [tool.bandit]
# Docker build contexts — code runs inside decoy containers, not in the
# master/agent process. Skipping keeps honeypot service scripts (hashlib.md5
# for JA3, /tmp/build in Dockerfile helpers, etc.) out of the scan.
exclude_dirs = [ exclude_dirs = [
"decnet/templates/syslog_bridge.py", "decnet/templates",
] ]

View File

@@ -0,0 +1,165 @@
"""POST /swarm/hosts/{uuid}/teardown — per-host and per-decky remote teardown."""
from __future__ import annotations
import json
from datetime import datetime, timezone
from typing import Optional
import pytest
from decnet.web.router.swarm_mgmt import api_teardown_host as mod
class _FakeAgent:
def __init__(self, *a, **kw):
_FakeAgent.calls.append(("init", kw.get("host", a[0] if a else None)))
self._host = kw.get("host", a[0] if a else None)
async def __aenter__(self):
return self
async def __aexit__(self, *exc):
return None
async def teardown(self, decky_id: Optional[str] = None) -> dict:
_FakeAgent.calls.append(("teardown", decky_id))
return {"status": "torn_down", "decky_id": decky_id}
class _FailingAgent(_FakeAgent):
    """Fake agent whose ``teardown`` always fails, simulating an unreachable worker."""

    async def teardown(self, decky_id: Optional[str] = None) -> dict:
        # Raised unconditionally; the endpoint is expected to turn this into a 502.
        raise RuntimeError("network unreachable")
@pytest.fixture
def fake_agent(monkeypatch):
    """Swap the endpoint module's ``AgentClient`` for ``_FakeAgent``.

    Resets ``_FakeAgent.calls`` to an empty list first and returns the class
    so tests can inspect the recorded calls.
    """
    _FakeAgent.calls = []
    monkeypatch.setattr(mod, "AgentClient", _FakeAgent)
    return _FakeAgent
@pytest.fixture
def failing_agent(monkeypatch):
    """Swap the endpoint module's ``AgentClient`` for ``_FailingAgent``.

    Resets ``_FailingAgent.calls`` to an empty list first and returns the
    class.
    """
    _FailingAgent.calls = []
    monkeypatch.setattr(mod, "AgentClient", _FailingAgent)
    return _FailingAgent
async def _seed_host(repo, *, name="worker-a", uuid="h-1") -> str:
await repo.add_swarm_host({
"uuid": uuid,
"name": name,
"address": "10.0.0.9",
"agent_port": 8765,
"status": "active",
"client_cert_fingerprint": "f" * 64,
"cert_bundle_path": "",
"use_ipvlan": False,
"enrolled_at": datetime.now(timezone.utc),
"last_heartbeat": None,
})
return uuid
async def _seed_shard(repo, *, host_uuid: str, decky_name: str) -> None:
await repo.upsert_decky_shard({
"decky_name": decky_name,
"host_uuid": host_uuid,
"services": json.dumps(["ssh"]),
"state": "running",
"last_error": None,
"updated_at": datetime.now(timezone.utc),
})
@pytest.mark.anyio
async def test_teardown_all_deckies_on_host(client, auth_token, fake_agent):
    """An empty body tears down the whole host and clears all of its shards."""
    from decnet.web.dependencies import repo

    host_uuid = await _seed_host(repo, name="tear-all", uuid="tear-all-uuid")
    for decky in ("decky1", "decky2"):
        await _seed_shard(repo, host_uuid=host_uuid, decky_name=decky)

    response = await client.post(
        f"/api/v1/swarm/hosts/{host_uuid}/teardown",
        headers={"Authorization": f"Bearer {auth_token}"},
        json={},
    )

    assert response.status_code == 200, response.text
    payload = response.json()
    assert payload["ok"] is True
    assert payload["decky_id"] is None
    # The agent must have been asked for a whole-host teardown.
    assert ("teardown", None) in fake_agent.calls
    assert await repo.list_decky_shards(host_uuid) == []
@pytest.mark.anyio
async def test_teardown_single_decky(client, auth_token, fake_agent):
    """Naming a decky removes only that decky's shard; siblings survive."""
    from decnet.web.dependencies import repo

    host_uuid = await _seed_host(repo, name="tear-one", uuid="tear-one-uuid")
    for decky in ("decky-keep", "decky-drop"):
        await _seed_shard(repo, host_uuid=host_uuid, decky_name=decky)

    response = await client.post(
        f"/api/v1/swarm/hosts/{host_uuid}/teardown",
        headers={"Authorization": f"Bearer {auth_token}"},
        json={"decky_id": "decky-drop"},
    )

    assert response.status_code == 200, response.text
    payload = response.json()
    assert payload["decky_id"] == "decky-drop"
    assert ("teardown", "decky-drop") in fake_agent.calls
    shards = await repo.list_decky_shards(host_uuid)
    surviving = {row["decky_name"] for row in shards}
    assert surviving == {"decky-keep"}
@pytest.mark.anyio
async def test_teardown_unknown_host_404(client, auth_token, fake_agent):
    """A uuid that matches no enrolled host yields 404."""
    auth_headers = {"Authorization": f"Bearer {auth_token}"}
    response = await client.post(
        "/api/v1/swarm/hosts/does-not-exist/teardown",
        headers=auth_headers,
        json={},
    )
    assert response.status_code == 404
@pytest.mark.anyio
async def test_teardown_agent_failure_502(client, auth_token, failing_agent):
    """A failed agent call returns 502 and must leave the host's shards in
    the DB, so the master's view does not drift from the worker's state."""
    from decnet.web.dependencies import repo

    host_uuid = await _seed_host(repo, name="tear-fail", uuid="tear-fail-uuid")
    await _seed_shard(repo, host_uuid=host_uuid, decky_name="survivor")

    response = await client.post(
        f"/api/v1/swarm/hosts/{host_uuid}/teardown",
        headers={"Authorization": f"Bearer {auth_token}"},
        json={},
    )

    assert response.status_code == 502
    shards = await repo.list_decky_shards(host_uuid)
    surviving = {row["decky_name"] for row in shards}
    assert surviving == {"survivor"}
@pytest.mark.anyio
async def test_teardown_non_admin_forbidden(client, viewer_token, fake_agent):
    """A viewer-level token is rejected with 403, even for a real host."""
    from decnet.web.dependencies import repo

    host_uuid = await _seed_host(repo, name="tear-guard", uuid="tear-guard-uuid")
    viewer_headers = {"Authorization": f"Bearer {viewer_token}"}
    response = await client.post(
        f"/api/v1/swarm/hosts/{host_uuid}/teardown",
        headers=viewer_headers,
        json={},
    )
    assert response.status_code == 403
@pytest.mark.anyio
async def test_teardown_no_auth_401(client, fake_agent):
    """A request without an Authorization header is rejected with 401."""
    endpoint = "/api/v1/swarm/hosts/whatever/teardown"
    response = await client.post(endpoint, json={})
    assert response.status_code == 401