Files
DECNET/tests/api/swarm_mgmt/test_teardown_host.py
anti 9d68bb45c7 feat(web): async teardowns — 202 + background task, UI allows parallel queue
Teardowns were synchronous all the way through: POST blocked on the
worker's docker-compose-down cycle (seconds to minutes), the frontend
locked tearingDown to a single string so only one button could be armed
at a time, and operators couldn't queue a second teardown until the
first returned. On a flaky worker that meant staring at a spinner for
the whole RTT.

Backend: POST /swarm/hosts/{uuid}/teardown returns 202 the instant the
request is validated. Affected shards flip to state='tearing_down'
synchronously before the response so the UI reflects progress
immediately, then the actual AgentClient call + DB cleanup run in an
asyncio.create_task (tracked in a module-level set to survive GC and
to be drainable by tests). On failure the shard flips to
'teardown_failed' with the error recorded — nothing is re-raised,
since there's no caller to catch it.

Frontend: swap tearingDown / decommissioning from 'string | null' to
'Set<string>'. Each button tracks its own in-flight state; the poll
loop picks up the final shard state from the backend. Multiple
teardowns can now be queued without blocking each other.
2026-04-19 20:30:56 -04:00

216 lines
6.8 KiB
Python

"""POST /swarm/hosts/{uuid}/teardown — per-host and per-decky remote teardown."""
from __future__ import annotations
import json
from datetime import datetime, timezone
from typing import Optional
import pytest
from decnet.web.router.swarm_mgmt import api_teardown_host as mod
class _FakeAgent:
    """Stand-in for ``AgentClient`` that records calls instead of doing I/O.

    Each construction and each ``teardown`` call is appended to the
    class-level ``calls`` list as a ``(kind, value)`` tuple, so tests can
    assert on what was requested of the agent. The class also works as an
    async context manager that yields itself.
    """

    # Shared call log. Defined on the class so that it exists even when no
    # fixture has reset it yet (subclasses reuse ``__init__`` and would
    # otherwise hit AttributeError); fixtures rebind it to a fresh list.
    calls: list = []

    def __init__(self, *a, **kw):
        # The host may be passed as ``host=...`` or as the first positional
        # argument; fall back to None when neither is given.
        host = kw.get("host", a[0] if a else None)
        _FakeAgent.calls.append(("init", host))
        self._host = host

    async def __aenter__(self):
        return self

    async def __aexit__(self, *exc):
        # Returning None (falsy) lets exceptions from the ``async with`` body
        # propagate to the caller.
        return None

    async def teardown(self, decky_id: Optional[str] = None) -> dict:
        """Record the teardown request and report success for ``decky_id``."""
        _FakeAgent.calls.append(("teardown", decky_id))
        return {"status": "torn_down", "decky_id": decky_id}
class _FailingAgent(_FakeAgent):
    """Fake agent whose ``teardown`` always raises.

    Construction and context-manager behaviour are inherited from
    ``_FakeAgent``; only the teardown call is replaced with a failure.
    """

    async def teardown(self, decky_id: Optional[str] = None) -> dict:
        """Raise ``RuntimeError`` regardless of ``decky_id``."""
        raise RuntimeError("network unreachable")
@pytest.fixture
def fake_agent(monkeypatch):
    """Install ``_FakeAgent`` as the module's ``AgentClient`` with an empty log.

    Returns the ``_FakeAgent`` class so tests can inspect ``calls``.
    """
    agent_cls = _FakeAgent
    monkeypatch.setattr(mod, "AgentClient", agent_cls)
    # Start every test from a clean call log.
    agent_cls.calls = []
    return agent_cls
@pytest.fixture
def failing_agent(monkeypatch):
    """Install ``_FailingAgent`` as the module's ``AgentClient``.

    Returns the ``_FailingAgent`` class. Its ``teardown`` always raises, so
    tests using this fixture exercise the failure path.
    """
    # ``_FailingAgent`` inherits ``__init__`` from ``_FakeAgent``, which
    # appends to ``_FakeAgent.calls``. Reset that list: assigning ``calls`` on
    # the subclass would only create a shadow attribute nothing writes to,
    # and would leave the real log stale (or missing when this fixture is the
    # first one to run).
    _FakeAgent.calls = []
    monkeypatch.setattr(mod, "AgentClient", _FailingAgent)
    return _FailingAgent
async def _seed_host(repo, *, name="worker-a", uuid="h-1") -> str:
    """Add a swarm host row via ``repo.add_swarm_host`` and return its UUID.

    Only ``name`` and ``uuid`` vary between callers; every other field is a
    fixed placeholder value, with ``enrolled_at`` set to the current UTC time.
    """
    host_row = {
        "uuid": uuid,
        "name": name,
        "address": "10.0.0.9",
        "agent_port": 8765,
        "status": "active",
        "client_cert_fingerprint": "f" * 64,
        "cert_bundle_path": "",
        "use_ipvlan": False,
        "enrolled_at": datetime.now(timezone.utc),
        "last_heartbeat": None,
    }
    await repo.add_swarm_host(host_row)
    return uuid
async def _seed_shard(repo, *, host_uuid: str, decky_name: str) -> None:
    """Upsert a shard for ``decky_name`` on ``host_uuid`` in state ``running``.

    The shard carries a single JSON-encoded ``ssh`` service, no recorded
    error, and an ``updated_at`` of the current UTC time.
    """
    shard_row = {
        "decky_name": decky_name,
        "host_uuid": host_uuid,
        "services": json.dumps(["ssh"]),
        "state": "running",
        "last_error": None,
        "updated_at": datetime.now(timezone.utc),
    }
    await repo.upsert_decky_shard(shard_row)
@pytest.mark.anyio
async def test_teardown_all_deckies_on_host(client, auth_token, fake_agent):
    """An empty request body tears down every decky on the host.

    Expects a 202 with ``decky_id`` unset, an agent ``teardown`` call with
    ``None``, and no shards left for the host once pending work has drained.
    """
    from decnet.web.dependencies import repo

    uuid = await _seed_host(repo, name="tear-all", uuid="tear-all-uuid")
    for decky in ("decky1", "decky2"):
        await _seed_shard(repo, host_uuid=uuid, decky_name=decky)

    auth = {"Authorization": f"Bearer {auth_token}"}
    response = await client.post(
        f"/api/v1/swarm/hosts/{uuid}/teardown", headers=auth, json={}
    )
    assert response.status_code == 202, response.text
    payload = response.json()
    assert payload["accepted"] is True
    assert payload["decky_id"] is None

    # Let the background teardown finish before inspecting its effects.
    await mod.drain_pending()
    assert ("teardown", None) in fake_agent.calls
    assert await repo.list_decky_shards(uuid) == []
@pytest.mark.anyio
async def test_teardown_single_decky(client, auth_token, fake_agent):
    """Passing ``decky_id`` removes only that decky's shard.

    The other shard seeded on the same host must still be listed afterwards.
    """
    from decnet.web.dependencies import repo

    uuid = await _seed_host(repo, name="tear-one", uuid="tear-one-uuid")
    for decky in ("decky-keep", "decky-drop"):
        await _seed_shard(repo, host_uuid=uuid, decky_name=decky)

    auth = {"Authorization": f"Bearer {auth_token}"}
    response = await client.post(
        f"/api/v1/swarm/hosts/{uuid}/teardown",
        headers=auth,
        json={"decky_id": "decky-drop"},
    )
    assert response.status_code == 202, response.text
    assert response.json()["decky_id"] == "decky-drop"

    # Let the background teardown finish before inspecting its effects.
    await mod.drain_pending()
    assert ("teardown", "decky-drop") in fake_agent.calls
    shard_rows = await repo.list_decky_shards(uuid)
    surviving_names = {row["decky_name"] for row in shard_rows}
    assert surviving_names == {"decky-keep"}
@pytest.mark.anyio
async def test_teardown_returns_immediately_and_marks_tearing_down(
    client, auth_token, monkeypatch
):
    """The 202 must fire before the background agent call completes —
    otherwise multiple queued teardowns still serialize on the UI.

    A gated agent keeps the teardown call blocked so the shard can be
    observed in ``tearing_down`` right after the response; releasing the gate
    lets the teardown finish, after which the shard must be gone.
    """
    import asyncio as _asyncio
    from decnet.web.dependencies import repo

    gate = _asyncio.Event()

    class _SlowAgent(_FakeAgent):
        async def teardown(self, decky_id=None):
            # Block until the test explicitly releases the gate.
            await gate.wait()
            return {"status": "torn_down"}

    # _SlowAgent reuses _FakeAgent.__init__, which appends to
    # _FakeAgent.calls. This test does not request the fake_agent fixture, so
    # set the log up here instead of relying on an earlier test having done so.
    _FakeAgent.calls = []
    monkeypatch.setattr(mod, "AgentClient", _SlowAgent)
    uuid = await _seed_host(repo, name="slow", uuid="slow-uuid")
    await _seed_shard(repo, host_uuid=uuid, decky_name="decky-slow")
    try:
        resp = await client.post(
            f"/api/v1/swarm/hosts/{uuid}/teardown",
            headers={"Authorization": f"Bearer {auth_token}"},
            json={"decky_id": "decky-slow"},
        )
        assert resp.status_code == 202
        # Agent is still blocked — shard should be in 'tearing_down', not gone.
        shards = {s["decky_name"]: s for s in await repo.list_decky_shards(uuid)}
        assert shards["decky-slow"]["state"] == "tearing_down"
    finally:
        # Always release the gate and drain, even if an assertion above
        # failed; otherwise the blocked background task stays pending and can
        # leak into later tests.
        gate.set()
        await mod.drain_pending()
    remaining = {s["decky_name"] for s in await repo.list_decky_shards(uuid)}
    assert remaining == set()
@pytest.mark.anyio
async def test_teardown_unknown_host_404(client, auth_token, fake_agent):
    """An authenticated teardown for a host UUID that was never seeded is a 404."""
    auth = {"Authorization": f"Bearer {auth_token}"}
    response = await client.post(
        "/api/v1/swarm/hosts/does-not-exist/teardown", headers=auth, json={}
    )
    assert response.status_code == 404
@pytest.mark.anyio
async def test_teardown_agent_failure_marks_shard_failed(
    client, auth_token, failing_agent
):
    """A failing agent call keeps the shard and records the failure.

    The request is still accepted with 202. After pending work drains, the
    shard must remain listed in state ``teardown_failed`` with the agent's
    error message in ``last_error``.
    """
    from decnet.web.dependencies import repo

    uuid = await _seed_host(repo, name="tear-fail", uuid="tear-fail-uuid")
    await _seed_shard(repo, host_uuid=uuid, decky_name="survivor")

    auth = {"Authorization": f"Bearer {auth_token}"}
    response = await client.post(
        f"/api/v1/swarm/hosts/{uuid}/teardown", headers=auth, json={}
    )
    # The failure only happens in the background, so acceptance is unconditional.
    assert response.status_code == 202

    await mod.drain_pending()
    shard_rows = await repo.list_decky_shards(uuid)
    by_name = {row["decky_name"]: row for row in shard_rows}
    assert "survivor" in by_name
    survivor = by_name["survivor"]
    assert survivor["state"] == "teardown_failed"
    # last_error may be None; treat that as an empty message.
    assert "network unreachable" in (survivor["last_error"] or "")
@pytest.mark.anyio
async def test_teardown_non_admin_forbidden(client, viewer_token, fake_agent):
    """A request carrying the viewer token is rejected with 403."""
    from decnet.web.dependencies import repo

    uuid = await _seed_host(repo, name="tear-guard", uuid="tear-guard-uuid")
    auth = {"Authorization": f"Bearer {viewer_token}"}
    response = await client.post(
        f"/api/v1/swarm/hosts/{uuid}/teardown", headers=auth, json={}
    )
    assert response.status_code == 403
@pytest.mark.anyio
async def test_teardown_no_auth_401(client, fake_agent):
    """A request without an Authorization header is rejected with 401."""
    target = "/api/v1/swarm/hosts/whatever/teardown"
    response = await client.post(target, json={})
    assert response.status_code == 401