merge testing->tomerge/main #7

Open
anti wants to merge 242 commits from testing into tomerge/main
4 changed files with 179 additions and 44 deletions
Showing only changes of commit 9d68bb45c7 - Show all commits

View File

@@ -2,14 +2,19 @@
Body: ``{"decky_id": "..."}`` (optional). When ``decky_id`` is null/omitted
the agent tears down the entire host (all deckies + network); otherwise it
tears down that single decky. Mirrors the arguments of the local
``decnet teardown`` CLI command.
tears down that single decky.
Async-by-default: the endpoint returns 202 the moment the request is
accepted and runs the actual agent call + DB cleanup in a background task.
That lets the operator queue multiple teardowns in parallel without
blocking on slow docker-compose-down cycles on the worker.
"""
from __future__ import annotations
from typing import Optional
import asyncio
from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
from decnet.logging import get_logger
@@ -20,6 +25,23 @@ from decnet.web.dependencies import get_repo, require_admin
# Module-level logger; records are emitted under the "swarm.teardown" name.
log = get_logger("swarm.teardown")
# Router that the teardown route in this module is registered on.
router = APIRouter()
# Track spawned background tasks so (a) they're not GC'd mid-flight and
# (b) tests can drain them deterministically via ``await drain_pending()``.
_PENDING: "set[asyncio.Task]" = set()
def _spawn(coro) -> asyncio.Task:
    """Schedule ``coro`` as a task and track it until it completes.

    The task is held in ``_PENDING`` so a strong reference exists while it
    runs; a done-callback removes it from the set again once it finishes.
    Returns the created task.
    """
    spawned = asyncio.create_task(coro)
    _PENDING.add(spawned)
    # discard() accepts the finished task as its only argument, so the bound
    # method can be used directly as the done-callback.
    spawned.add_done_callback(_PENDING.discard)
    return spawned
async def drain_pending() -> None:
    """Wait until no tracked teardown task is outstanding. Used by tests.

    Loops because tasks may be added to ``_PENDING`` while earlier ones are
    being awaited. Exceptions raised by the tasks are collected rather than
    propagated to the caller.
    """
    while _PENDING:
        # Snapshot first: done-callbacks mutate the set while we await.
        in_flight = tuple(_PENDING)
        await asyncio.gather(*in_flight, return_exceptions=True)
class TeardownHostRequest(BaseModel):
decky_id: Optional[str] = None
@@ -29,13 +51,69 @@ class TeardownHostResponse(BaseModel):
host_uuid: str
host_name: str
decky_id: Optional[str] = None
ok: bool
accepted: bool
detail: str
async def _mark_tearing_down(
    repo: BaseRepository, host_uuid: str, decky_id: Optional[str]
) -> None:
    """Set the affected shards of ``host_uuid`` to state ``'tearing_down'``.

    When ``decky_id`` is truthy only the shard whose ``decky_name`` equals it
    is updated; otherwise every shard listed for the host is updated. Any
    previous ``last_error`` is cleared. This lets the UI show progress
    immediately while the background task runs.
    """
    for shard in await repo.list_decky_shards(host_uuid):
        is_target = not decky_id or shard.get("decky_name") == decky_id
        if not is_target:
            continue
        updated = {**shard, "state": "tearing_down", "last_error": None}
        await repo.upsert_decky_shard(updated)
async def _run_teardown(
    host: dict[str, Any], repo: BaseRepository, decky_id: Optional[str]
) -> None:
    """Run the remote teardown for ``host``, then clean up its shard rows.

    Meant to run as a detached background task: exceptions are logged and
    reflected on the shard rows, never re-raised, since nothing awaits us.

    Args:
        host: Host record; ``host["uuid"]`` and ``host.get("name")`` are read.
        repo: Repository used to list, upsert and delete decky shards.
        decky_id: ``decky_name`` of the single decky to tear down, or a falsy
            value to tear down every decky on the host.

    Outcomes:
        * Agent call fails: the matching shards are kept and set to state
          ``'teardown_failed'`` with ``last_error`` holding the error text
          (at most 512 characters).
        * Agent call succeeds: the matching shard rows are deleted; a failure
          during that deletion is only logged.
    """
    try:
        async with AgentClient(host=host) as agent:
            await agent.teardown(decky_id)
    except Exception as exc:
        log.exception(
            "swarm.teardown background task failed host=%s decky=%s",
            host.get("name"), decky_id,
        )
        # Exceptions raised without a message (e.g. a bare TimeoutError())
        # stringify to "", which would leave the operator with a failed
        # shard and no reason. Fall back to repr() so it is never blank.
        error_text = (str(exc) or repr(exc))[:512]
        # Reflect the failure on the shard(s) — don't delete on failure,
        # the operator needs to see what went wrong and retry.
        try:
            for shard in await repo.list_decky_shards(host["uuid"]):
                if decky_id and shard.get("decky_name") != decky_id:
                    continue
                await repo.upsert_decky_shard({
                    **shard,
                    "state": "teardown_failed",
                    "last_error": error_text,
                })
        except Exception:
            log.exception("swarm.teardown failed to record shard failure")
        return

    # The agent call succeeded; drop the rows for whatever was torn down.
    try:
        if decky_id:
            await repo.delete_decky_shard(decky_id)
        else:
            await repo.delete_decky_shards_for_host(host["uuid"])
    except Exception:
        log.exception("swarm.teardown DB cleanup failed (agent call succeeded)")
@router.post(
"/hosts/{uuid}/teardown",
response_model=TeardownHostResponse,
status_code=status.HTTP_202_ACCEPTED,
tags=["Swarm Management"],
)
async def teardown_host(
@@ -48,23 +126,18 @@ async def teardown_host(
if host is None:
raise HTTPException(status_code=404, detail="host not found")
try:
async with AgentClient(host=host) as agent:
body = await agent.teardown(req.decky_id)
except Exception as exc:
log.exception("swarm.teardown dispatch failed host=%s decky=%s",
host.get("name"), req.decky_id)
raise HTTPException(status_code=502, detail=str(exc)) from exc
await _mark_tearing_down(repo, uuid, req.decky_id)
if req.decky_id:
await repo.delete_decky_shard(req.decky_id)
else:
await repo.delete_decky_shards_for_host(uuid)
# Fire-and-forget: asyncio.create_task (not BackgroundTasks) so the
# task runs independently of this request's lifecycle — the operator
# can queue another teardown the moment this one returns 202 without
# waiting for any per-request cleanup phase.
_spawn(_run_teardown(host, repo, req.decky_id))
return TeardownHostResponse(
host_uuid=uuid,
host_name=host.get("name") or "",
decky_id=req.decky_id,
ok=True,
detail=str(body),
accepted=True,
detail="teardown queued",
)

View File

@@ -21,7 +21,7 @@ interface DeckyShard {
const SwarmDeckies: React.FC = () => {
const [shards, setShards] = useState<DeckyShard[]>([]);
const [loading, setLoading] = useState(true);
const [tearingDown, setTearingDown] = useState<string | null>(null);
const [tearingDown, setTearingDown] = useState<Set<string>>(new Set());
const [error, setError] = useState<string | null>(null);
// Two-click arm/commit replaces window.confirm() — browsers silently
// suppress confirm() after the "prevent additional dialogs" opt-out.
@@ -53,14 +53,22 @@ const SwarmDeckies: React.FC = () => {
const key = `td:${s.host_uuid}:${s.decky_name}`;
if (armed !== key) { arm(key); return; }
setArmed(null);
setTearingDown(s.decky_name);
setTearingDown((prev) => new Set(prev).add(s.decky_name));
try {
// Endpoint returns 202 immediately; the actual teardown runs in the
// background on the backend. Shard state flips to 'tearing_down' and
// the 10s poll picks up the final state (gone on success, or
// 'teardown_failed' with an error).
await api.post(`/swarm/hosts/${s.host_uuid}/teardown`, { decky_id: s.decky_name });
await fetch();
} catch (err: any) {
alert(err?.response?.data?.detail || 'Teardown failed');
} finally {
setTearingDown(null);
setTearingDown((prev) => {
const next = new Set(prev);
next.delete(s.decky_name);
return next;
});
}
};
@@ -115,12 +123,12 @@ const SwarmDeckies: React.FC = () => {
<td>
<button
className="control-btn danger"
disabled={tearingDown === s.decky_name}
disabled={tearingDown.has(s.decky_name) || s.state === 'tearing_down'}
onClick={() => handleTeardown(s)}
title="Stop this decky on its host"
>
<PowerOff size={14} />{' '}
{tearingDown === s.decky_name
{tearingDown.has(s.decky_name) || s.state === 'tearing_down'
? 'Tearing down…'
: armed === `td:${s.host_uuid}:${s.decky_name}`
? 'Click again to confirm'

View File

@@ -22,8 +22,8 @@ const shortFp = (fp: string): string => (fp ? fp.slice(0, 16) + '…' : '—');
const SwarmHosts: React.FC = () => {
const [hosts, setHosts] = useState<SwarmHost[]>([]);
const [loading, setLoading] = useState(true);
const [decommissioning, setDecommissioning] = useState<string | null>(null);
const [tearingDown, setTearingDown] = useState<string | null>(null);
const [decommissioning, setDecommissioning] = useState<Set<string>>(new Set());
const [tearingDown, setTearingDown] = useState<Set<string>>(new Set());
const [error, setError] = useState<string | null>(null);
// Two-click arm/commit replaces window.confirm(). Browsers silently
// suppress confirm() after the "prevent additional dialogs" opt-out,
@@ -53,18 +53,22 @@ const SwarmHosts: React.FC = () => {
return () => clearInterval(t);
}, []);
// Return a copy of `current` that also contains `id`; the input set is left untouched.
const addTo = (current: Set<string>, id: string): Set<string> => new Set(current).add(id);
// Return a copy of `current` without `id`; the input set is left untouched.
const removeFrom = (current: Set<string>, id: string): Set<string> =>
  new Set(Array.from(current).filter((member) => member !== id));
const handleTeardownAll = async (host: SwarmHost) => {
const key = `teardown:${host.uuid}`;
if (armed !== key) { arm(key); return; }
setArmed(null);
setTearingDown(host.uuid);
setTearingDown((s) => addTo(s, host.uuid));
try {
// 202 Accepted — teardown runs async on the backend.
await api.post(`/swarm/hosts/${host.uuid}/teardown`, {});
await fetchHosts();
} catch (err: any) {
alert(err?.response?.data?.detail || 'Teardown failed');
} finally {
setTearingDown(null);
setTearingDown((s) => removeFrom(s, host.uuid));
}
};
@@ -72,14 +76,14 @@ const SwarmHosts: React.FC = () => {
const key = `decom:${host.uuid}`;
if (armed !== key) { arm(key); return; }
setArmed(null);
setDecommissioning(host.uuid);
setDecommissioning((s) => addTo(s, host.uuid));
try {
await api.delete(`/swarm/hosts/${host.uuid}`);
await fetchHosts();
} catch (err: any) {
alert(err?.response?.data?.detail || 'Decommission failed');
} finally {
setDecommissioning(null);
setDecommissioning((s) => removeFrom(s, host.uuid));
}
};
@@ -126,12 +130,12 @@ const SwarmHosts: React.FC = () => {
<td>
<button
className={`control-btn${armed === `teardown:${h.uuid}` ? ' danger' : ''}`}
disabled={tearingDown === h.uuid || h.status !== 'active'}
disabled={tearingDown.has(h.uuid) || h.status !== 'active'}
onClick={() => handleTeardownAll(h)}
title="Stop all deckies on this host (keeps it enrolled)"
>
<PowerOff size={14} />{' '}
{tearingDown === h.uuid
{tearingDown.has(h.uuid)
? 'Tearing down…'
: armed === `teardown:${h.uuid}`
? 'Click again to confirm'
@@ -139,11 +143,11 @@ const SwarmHosts: React.FC = () => {
</button>
<button
className="control-btn danger"
disabled={decommissioning === h.uuid}
disabled={decommissioning.has(h.uuid)}
onClick={() => handleDecommission(h)}
>
<Trash2 size={14} />{' '}
{decommissioning === h.uuid
{decommissioning.has(h.uuid)
? 'Decommissioning…'
: armed === `decom:${h.uuid}`
? 'Click again to confirm'

View File

@@ -84,11 +84,13 @@ async def test_teardown_all_deckies_on_host(client, auth_token, fake_agent):
headers={"Authorization": f"Bearer {auth_token}"},
json={},
)
assert resp.status_code == 200, resp.text
assert resp.status_code == 202, resp.text
body = resp.json()
assert body["ok"] is True
assert body["accepted"] is True
assert body["decky_id"] is None
await mod.drain_pending()
assert ("teardown", None) in fake_agent.calls
remaining = await repo.list_decky_shards(uuid)
assert remaining == []
@@ -106,15 +108,55 @@ async def test_teardown_single_decky(client, auth_token, fake_agent):
headers={"Authorization": f"Bearer {auth_token}"},
json={"decky_id": "decky-drop"},
)
assert resp.status_code == 200, resp.text
body = resp.json()
assert body["decky_id"] == "decky-drop"
assert resp.status_code == 202, resp.text
assert resp.json()["decky_id"] == "decky-drop"
await mod.drain_pending()
assert ("teardown", "decky-drop") in fake_agent.calls
remaining = {s["decky_name"] for s in await repo.list_decky_shards(uuid)}
assert remaining == {"decky-keep"}
@pytest.mark.anyio
async def test_teardown_returns_immediately_and_marks_tearing_down(
    client, auth_token, monkeypatch
):
    """The endpoint must answer 202 while the agent call is still running.

    If the response waited for the background agent call, several queued
    teardowns would still serialize from the UI's point of view.
    """
    import asyncio as _asyncio
    from decnet.web.dependencies import repo

    # The fake agent blocks on this event until the test releases it.
    release = _asyncio.Event()

    class _BlockedAgent(_FakeAgent):
        async def teardown(self, decky_id=None):
            await release.wait()
            return {"status": "torn_down"}

    monkeypatch.setattr(mod, "AgentClient", _BlockedAgent)

    uuid = await _seed_host(repo, name="slow", uuid="slow-uuid")
    await _seed_shard(repo, host_uuid=uuid, decky_name="decky-slow")

    resp = await client.post(
        f"/api/v1/swarm/hosts/{uuid}/teardown",
        headers={"Authorization": f"Bearer {auth_token}"},
        json={"decky_id": "decky-slow"},
    )
    assert resp.status_code == 202

    # Agent is still blocked — shard should be in 'tearing_down', not gone.
    by_name = {row["decky_name"]: row for row in await repo.list_decky_shards(uuid)}
    assert by_name["decky-slow"]["state"] == "tearing_down"

    # Unblock the agent and let the background task run to completion.
    release.set()
    await mod.drain_pending()

    remaining = {row["decky_name"] for row in await repo.list_decky_shards(uuid)}
    assert remaining == set()
@pytest.mark.anyio
async def test_teardown_unknown_host_404(client, auth_token, fake_agent):
resp = await client.post(
@@ -126,9 +168,12 @@ async def test_teardown_unknown_host_404(client, auth_token, fake_agent):
@pytest.mark.anyio
async def test_teardown_agent_failure_502(client, auth_token, failing_agent):
"""When the worker is unreachable the DB shards MUST NOT be deleted —
otherwise the master's view diverges from reality."""
async def test_teardown_agent_failure_marks_shard_failed(
client, auth_token, failing_agent
):
"""Background-task failure: the shard must NOT be deleted and its
state flips to teardown_failed with the error recorded so the UI
surfaces it."""
from decnet.web.dependencies import repo
uuid = await _seed_host(repo, name="tear-fail", uuid="tear-fail-uuid")
await _seed_shard(repo, host_uuid=uuid, decky_name="survivor")
@@ -138,10 +183,15 @@ async def test_teardown_agent_failure_502(client, auth_token, failing_agent):
headers={"Authorization": f"Bearer {auth_token}"},
json={},
)
assert resp.status_code == 502
# Acceptance is unconditional — the failure happens in the background.
assert resp.status_code == 202
remaining = {s["decky_name"] for s in await repo.list_decky_shards(uuid)}
assert remaining == {"survivor"}
await mod.drain_pending()
shards = {s["decky_name"]: s for s in await repo.list_decky_shards(uuid)}
assert "survivor" in shards
assert shards["survivor"]["state"] == "teardown_failed"
assert "network unreachable" in (shards["survivor"]["last_error"] or "")
@pytest.mark.anyio