merge testing->tomerge/main #7

Open
anti wants to merge 242 commits from testing into tomerge/main
4 changed files with 179 additions and 44 deletions
Showing only changes of commit 9d68bb45c7 - Show all commits

View File

@@ -2,14 +2,19 @@
Body: ``{"decky_id": "..."}`` (optional). When ``decky_id`` is null/omitted Body: ``{"decky_id": "..."}`` (optional). When ``decky_id`` is null/omitted
the agent tears down the entire host (all deckies + network); otherwise it the agent tears down the entire host (all deckies + network); otherwise it
tears down that single decky. Mirrors the arguments of the local tears down that single decky.
``decnet teardown`` CLI command.
Async-by-default: the endpoint returns 202 the moment the request is
accepted and runs the actual agent call + DB cleanup in a background task.
That lets the operator queue multiple teardowns in parallel without
blocking on slow docker-compose-down cycles on the worker.
""" """
from __future__ import annotations from __future__ import annotations
from typing import Optional import asyncio
from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel from pydantic import BaseModel
from decnet.logging import get_logger from decnet.logging import get_logger
@@ -20,6 +25,23 @@ from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm.teardown") log = get_logger("swarm.teardown")
router = APIRouter() router = APIRouter()
# Registry of in-flight teardown tasks. Holding a strong reference here keeps
# a task from being garbage-collected before it finishes, and gives tests a
# deterministic way to wait for them via ``await drain_pending()``.
_PENDING: "set[asyncio.Task]" = set()


def _spawn(coro) -> asyncio.Task:
    """Schedule *coro* as a task and track it in ``_PENDING`` until done.

    The task removes itself from the registry through a done-callback, so
    the set only ever holds tasks that have not completed yet.
    """
    scheduled = asyncio.create_task(coro)
    scheduled.add_done_callback(_PENDING.discard)
    _PENDING.add(scheduled)
    return scheduled


async def drain_pending() -> None:
    """Wait until no teardown task is outstanding. Used by tests.

    Exceptions raised by the tasks are collected by ``gather`` rather than
    propagated. The registry is re-checked after every wait, so tasks that
    were spawned in the meantime are awaited as well.
    """
    while True:
        outstanding = tuple(_PENDING)
        if not outstanding:
            return
        await asyncio.gather(*outstanding, return_exceptions=True)
class TeardownHostRequest(BaseModel): class TeardownHostRequest(BaseModel):
decky_id: Optional[str] = None decky_id: Optional[str] = None
@@ -29,13 +51,69 @@ class TeardownHostResponse(BaseModel):
host_uuid: str host_uuid: str
host_name: str host_name: str
decky_id: Optional[str] = None decky_id: Optional[str] = None
ok: bool accepted: bool
detail: str detail: str
async def _mark_tearing_down(
    repo: BaseRepository, host_uuid: str, decky_id: Optional[str]
) -> None:
    """Set state='tearing_down' on the shards a teardown request targets.

    When ``decky_id`` is falsy every shard listed for ``host_uuid`` is
    rewritten; otherwise only shards whose ``decky_name`` equals
    ``decky_id``. ``last_error`` is reset to ``None`` on each rewritten
    shard. This lets the UI show progress immediately while the background
    task runs.
    """
    for shard in await repo.list_decky_shards(host_uuid):
        targeted = not decky_id or shard.get("decky_name") == decky_id
        if targeted:
            await repo.upsert_decky_shard(
                {**shard, "state": "tearing_down", "last_error": None}
            )
async def _run_teardown(
    host: dict[str, Any], repo: BaseRepository, decky_id: Optional[str]
) -> None:
    """Run the remote teardown, then reconcile the shard rows with the result.

    Args:
        host: Host record; ``host["uuid"]`` and ``host.get("name")`` are read
            here and the whole dict is handed to ``AgentClient``.
        repo: Repository used to list, upsert and delete decky shards.
        decky_id: Name of the single decky to tear down, or a falsy value to
            tear down every decky on the host.

    On agent failure the targeted shards are kept and flipped to
    state='teardown_failed' with the error text in ``last_error`` so the
    operator can see what went wrong and retry. On success the targeted
    shards are deleted. Exceptions are logged and never re-raised, since
    nothing is awaiting this coroutine.
    """
    try:
        async with AgentClient(host=host) as agent:
            await agent.teardown(decky_id)
    except Exception as exc:
        log.exception(
            "swarm.teardown background task failed host=%s decky=%s",
            host.get("name"), decky_id,
        )
        # Exceptions constructed without a message (bare timeouts, for
        # example) stringify to "", which would leave the shard marked as
        # failed with no visible reason. Fall back to repr() in that case.
        reason = (str(exc) or repr(exc))[:512]
        # Reflect the failure on the shard(s) — don't delete on failure,
        # the operator needs to see what went wrong and retry.
        try:
            for shard in await repo.list_decky_shards(host["uuid"]):
                if decky_id and shard.get("decky_name") != decky_id:
                    continue
                await repo.upsert_decky_shard({
                    **shard,
                    "state": "teardown_failed",
                    "last_error": reason,
                })
        except Exception:
            log.exception("swarm.teardown failed to record shard failure")
        return

    # The agent call succeeded: drop the rows that described what was torn down.
    try:
        if decky_id:
            await repo.delete_decky_shard(decky_id)
        else:
            await repo.delete_decky_shards_for_host(host["uuid"])
    except Exception:
        log.exception("swarm.teardown DB cleanup failed (agent call succeeded)")
@router.post( @router.post(
"/hosts/{uuid}/teardown", "/hosts/{uuid}/teardown",
response_model=TeardownHostResponse, response_model=TeardownHostResponse,
status_code=status.HTTP_202_ACCEPTED,
tags=["Swarm Management"], tags=["Swarm Management"],
) )
async def teardown_host( async def teardown_host(
@@ -48,23 +126,18 @@ async def teardown_host(
if host is None: if host is None:
raise HTTPException(status_code=404, detail="host not found") raise HTTPException(status_code=404, detail="host not found")
try: await _mark_tearing_down(repo, uuid, req.decky_id)
async with AgentClient(host=host) as agent:
body = await agent.teardown(req.decky_id)
except Exception as exc:
log.exception("swarm.teardown dispatch failed host=%s decky=%s",
host.get("name"), req.decky_id)
raise HTTPException(status_code=502, detail=str(exc)) from exc
if req.decky_id: # Fire-and-forget: asyncio.create_task (not BackgroundTasks) so the
await repo.delete_decky_shard(req.decky_id) # task runs independently of this request's lifecycle — the operator
else: # can queue another teardown the moment this one returns 202 without
await repo.delete_decky_shards_for_host(uuid) # waiting for any per-request cleanup phase.
_spawn(_run_teardown(host, repo, req.decky_id))
return TeardownHostResponse( return TeardownHostResponse(
host_uuid=uuid, host_uuid=uuid,
host_name=host.get("name") or "", host_name=host.get("name") or "",
decky_id=req.decky_id, decky_id=req.decky_id,
ok=True, accepted=True,
detail=str(body), detail="teardown queued",
) )

View File

@@ -21,7 +21,7 @@ interface DeckyShard {
const SwarmDeckies: React.FC = () => { const SwarmDeckies: React.FC = () => {
const [shards, setShards] = useState<DeckyShard[]>([]); const [shards, setShards] = useState<DeckyShard[]>([]);
const [loading, setLoading] = useState(true); const [loading, setLoading] = useState(true);
const [tearingDown, setTearingDown] = useState<string | null>(null); const [tearingDown, setTearingDown] = useState<Set<string>>(new Set());
const [error, setError] = useState<string | null>(null); const [error, setError] = useState<string | null>(null);
// Two-click arm/commit replaces window.confirm() — browsers silently // Two-click arm/commit replaces window.confirm() — browsers silently
// suppress confirm() after the "prevent additional dialogs" opt-out. // suppress confirm() after the "prevent additional dialogs" opt-out.
@@ -53,14 +53,22 @@ const SwarmDeckies: React.FC = () => {
const key = `td:${s.host_uuid}:${s.decky_name}`; const key = `td:${s.host_uuid}:${s.decky_name}`;
if (armed !== key) { arm(key); return; } if (armed !== key) { arm(key); return; }
setArmed(null); setArmed(null);
setTearingDown(s.decky_name); setTearingDown((prev) => new Set(prev).add(s.decky_name));
try { try {
// Endpoint returns 202 immediately; the actual teardown runs in the
// background on the backend. Shard state flips to 'tearing_down' and
// the 10s poll picks up the final state (gone on success, or
// 'teardown_failed' with an error).
await api.post(`/swarm/hosts/${s.host_uuid}/teardown`, { decky_id: s.decky_name }); await api.post(`/swarm/hosts/${s.host_uuid}/teardown`, { decky_id: s.decky_name });
await fetch(); await fetch();
} catch (err: any) { } catch (err: any) {
alert(err?.response?.data?.detail || 'Teardown failed'); alert(err?.response?.data?.detail || 'Teardown failed');
} finally { } finally {
setTearingDown(null); setTearingDown((prev) => {
const next = new Set(prev);
next.delete(s.decky_name);
return next;
});
} }
}; };
@@ -115,12 +123,12 @@ const SwarmDeckies: React.FC = () => {
<td> <td>
<button <button
className="control-btn danger" className="control-btn danger"
disabled={tearingDown === s.decky_name} disabled={tearingDown.has(s.decky_name) || s.state === 'tearing_down'}
onClick={() => handleTeardown(s)} onClick={() => handleTeardown(s)}
title="Stop this decky on its host" title="Stop this decky on its host"
> >
<PowerOff size={14} />{' '} <PowerOff size={14} />{' '}
{tearingDown === s.decky_name {tearingDown.has(s.decky_name) || s.state === 'tearing_down'
? 'Tearing down…' ? 'Tearing down…'
: armed === `td:${s.host_uuid}:${s.decky_name}` : armed === `td:${s.host_uuid}:${s.decky_name}`
? 'Click again to confirm' ? 'Click again to confirm'

View File

@@ -22,8 +22,8 @@ const shortFp = (fp: string): string => (fp ? fp.slice(0, 16) + '…' : '—');
const SwarmHosts: React.FC = () => { const SwarmHosts: React.FC = () => {
const [hosts, setHosts] = useState<SwarmHost[]>([]); const [hosts, setHosts] = useState<SwarmHost[]>([]);
const [loading, setLoading] = useState(true); const [loading, setLoading] = useState(true);
const [decommissioning, setDecommissioning] = useState<string | null>(null); const [decommissioning, setDecommissioning] = useState<Set<string>>(new Set());
const [tearingDown, setTearingDown] = useState<string | null>(null); const [tearingDown, setTearingDown] = useState<Set<string>>(new Set());
const [error, setError] = useState<string | null>(null); const [error, setError] = useState<string | null>(null);
// Two-click arm/commit replaces window.confirm(). Browsers silently // Two-click arm/commit replaces window.confirm(). Browsers silently
// suppress confirm() after the "prevent additional dialogs" opt-out, // suppress confirm() after the "prevent additional dialogs" opt-out,
@@ -53,18 +53,22 @@ const SwarmHosts: React.FC = () => {
return () => clearInterval(t); return () => clearInterval(t);
}, []); }, []);
const addTo = (set: Set<string>, id: string) => { const n = new Set(set); n.add(id); return n; };
const removeFrom = (set: Set<string>, id: string) => { const n = new Set(set); n.delete(id); return n; };
const handleTeardownAll = async (host: SwarmHost) => { const handleTeardownAll = async (host: SwarmHost) => {
const key = `teardown:${host.uuid}`; const key = `teardown:${host.uuid}`;
if (armed !== key) { arm(key); return; } if (armed !== key) { arm(key); return; }
setArmed(null); setArmed(null);
setTearingDown(host.uuid); setTearingDown((s) => addTo(s, host.uuid));
try { try {
// 202 Accepted — teardown runs async on the backend.
await api.post(`/swarm/hosts/${host.uuid}/teardown`, {}); await api.post(`/swarm/hosts/${host.uuid}/teardown`, {});
await fetchHosts(); await fetchHosts();
} catch (err: any) { } catch (err: any) {
alert(err?.response?.data?.detail || 'Teardown failed'); alert(err?.response?.data?.detail || 'Teardown failed');
} finally { } finally {
setTearingDown(null); setTearingDown((s) => removeFrom(s, host.uuid));
} }
}; };
@@ -72,14 +76,14 @@ const SwarmHosts: React.FC = () => {
const key = `decom:${host.uuid}`; const key = `decom:${host.uuid}`;
if (armed !== key) { arm(key); return; } if (armed !== key) { arm(key); return; }
setArmed(null); setArmed(null);
setDecommissioning(host.uuid); setDecommissioning((s) => addTo(s, host.uuid));
try { try {
await api.delete(`/swarm/hosts/${host.uuid}`); await api.delete(`/swarm/hosts/${host.uuid}`);
await fetchHosts(); await fetchHosts();
} catch (err: any) { } catch (err: any) {
alert(err?.response?.data?.detail || 'Decommission failed'); alert(err?.response?.data?.detail || 'Decommission failed');
} finally { } finally {
setDecommissioning(null); setDecommissioning((s) => removeFrom(s, host.uuid));
} }
}; };
@@ -126,12 +130,12 @@ const SwarmHosts: React.FC = () => {
<td> <td>
<button <button
className={`control-btn${armed === `teardown:${h.uuid}` ? ' danger' : ''}`} className={`control-btn${armed === `teardown:${h.uuid}` ? ' danger' : ''}`}
disabled={tearingDown === h.uuid || h.status !== 'active'} disabled={tearingDown.has(h.uuid) || h.status !== 'active'}
onClick={() => handleTeardownAll(h)} onClick={() => handleTeardownAll(h)}
title="Stop all deckies on this host (keeps it enrolled)" title="Stop all deckies on this host (keeps it enrolled)"
> >
<PowerOff size={14} />{' '} <PowerOff size={14} />{' '}
{tearingDown === h.uuid {tearingDown.has(h.uuid)
? 'Tearing down…' ? 'Tearing down…'
: armed === `teardown:${h.uuid}` : armed === `teardown:${h.uuid}`
? 'Click again to confirm' ? 'Click again to confirm'
@@ -139,11 +143,11 @@ const SwarmHosts: React.FC = () => {
</button> </button>
<button <button
className="control-btn danger" className="control-btn danger"
disabled={decommissioning === h.uuid} disabled={decommissioning.has(h.uuid)}
onClick={() => handleDecommission(h)} onClick={() => handleDecommission(h)}
> >
<Trash2 size={14} />{' '} <Trash2 size={14} />{' '}
{decommissioning === h.uuid {decommissioning.has(h.uuid)
? 'Decommissioning…' ? 'Decommissioning…'
: armed === `decom:${h.uuid}` : armed === `decom:${h.uuid}`
? 'Click again to confirm' ? 'Click again to confirm'

View File

@@ -84,11 +84,13 @@ async def test_teardown_all_deckies_on_host(client, auth_token, fake_agent):
headers={"Authorization": f"Bearer {auth_token}"}, headers={"Authorization": f"Bearer {auth_token}"},
json={}, json={},
) )
assert resp.status_code == 200, resp.text assert resp.status_code == 202, resp.text
body = resp.json() body = resp.json()
assert body["ok"] is True assert body["accepted"] is True
assert body["decky_id"] is None assert body["decky_id"] is None
await mod.drain_pending()
assert ("teardown", None) in fake_agent.calls assert ("teardown", None) in fake_agent.calls
remaining = await repo.list_decky_shards(uuid) remaining = await repo.list_decky_shards(uuid)
assert remaining == [] assert remaining == []
@@ -106,15 +108,55 @@ async def test_teardown_single_decky(client, auth_token, fake_agent):
headers={"Authorization": f"Bearer {auth_token}"}, headers={"Authorization": f"Bearer {auth_token}"},
json={"decky_id": "decky-drop"}, json={"decky_id": "decky-drop"},
) )
assert resp.status_code == 200, resp.text assert resp.status_code == 202, resp.text
body = resp.json() assert resp.json()["decky_id"] == "decky-drop"
assert body["decky_id"] == "decky-drop"
await mod.drain_pending()
assert ("teardown", "decky-drop") in fake_agent.calls assert ("teardown", "decky-drop") in fake_agent.calls
remaining = {s["decky_name"] for s in await repo.list_decky_shards(uuid)} remaining = {s["decky_name"] for s in await repo.list_decky_shards(uuid)}
assert remaining == {"decky-keep"} assert remaining == {"decky-keep"}
@pytest.mark.anyio
async def test_teardown_returns_immediately_and_marks_tearing_down(
    client, auth_token, monkeypatch
):
    """The 202 must fire before the background agent call completes —
    otherwise multiple queued teardowns still serialize on the UI."""
    import asyncio as _asyncio
    from decnet.web.dependencies import repo

    # The fake agent parks on this event so the test can observe the
    # in-between state while the background task is still running.
    gate = _asyncio.Event()

    class _SlowAgent(_FakeAgent):
        async def teardown(self, decky_id=None):
            await gate.wait()
            return {"status": "torn_down"}

    monkeypatch.setattr(mod, "AgentClient", _SlowAgent)

    uuid = await _seed_host(repo, name="slow", uuid="slow-uuid")
    await _seed_shard(repo, host_uuid=uuid, decky_name="decky-slow")

    try:
        resp = await client.post(
            f"/api/v1/swarm/hosts/{uuid}/teardown",
            headers={"Authorization": f"Bearer {auth_token}"},
            json={"decky_id": "decky-slow"},
        )
        assert resp.status_code == 202

        # Agent is still blocked — shard should be in 'tearing_down', not gone.
        shards = {s["decky_name"]: s for s in await repo.list_decky_shards(uuid)}
        assert shards["decky-slow"]["state"] == "tearing_down"
    finally:
        # Always release the agent and drain, even when an assertion above
        # fails; otherwise the blocked task stays registered and any later
        # drain_pending() call would wait on it forever.
        gate.set()
        await mod.drain_pending()

    remaining = {s["decky_name"] for s in await repo.list_decky_shards(uuid)}
    assert remaining == set()
@pytest.mark.anyio @pytest.mark.anyio
async def test_teardown_unknown_host_404(client, auth_token, fake_agent): async def test_teardown_unknown_host_404(client, auth_token, fake_agent):
resp = await client.post( resp = await client.post(
@@ -126,9 +168,12 @@ async def test_teardown_unknown_host_404(client, auth_token, fake_agent):
@pytest.mark.anyio @pytest.mark.anyio
async def test_teardown_agent_failure_502(client, auth_token, failing_agent): async def test_teardown_agent_failure_marks_shard_failed(
"""When the worker is unreachable the DB shards MUST NOT be deleted — client, auth_token, failing_agent
otherwise the master's view diverges from reality.""" ):
"""Background-task failure: the shard must NOT be deleted and its
state flips to teardown_failed with the error recorded so the UI
surfaces it."""
from decnet.web.dependencies import repo from decnet.web.dependencies import repo
uuid = await _seed_host(repo, name="tear-fail", uuid="tear-fail-uuid") uuid = await _seed_host(repo, name="tear-fail", uuid="tear-fail-uuid")
await _seed_shard(repo, host_uuid=uuid, decky_name="survivor") await _seed_shard(repo, host_uuid=uuid, decky_name="survivor")
@@ -138,10 +183,15 @@ async def test_teardown_agent_failure_502(client, auth_token, failing_agent):
headers={"Authorization": f"Bearer {auth_token}"}, headers={"Authorization": f"Bearer {auth_token}"},
json={}, json={},
) )
assert resp.status_code == 502 # Acceptance is unconditional — the failure happens in the background.
assert resp.status_code == 202
remaining = {s["decky_name"] for s in await repo.list_decky_shards(uuid)} await mod.drain_pending()
assert remaining == {"survivor"}
shards = {s["decky_name"]: s for s in await repo.list_decky_shards(uuid)}
assert "survivor" in shards
assert shards["survivor"]["state"] == "teardown_failed"
assert "network unreachable" in (shards["survivor"]["last_error"] or "")
@pytest.mark.anyio @pytest.mark.anyio