From df18cb44ccf96483728bf9fca912ec3dd1c360f1 Mon Sep 17 00:00:00 2001
From: anti
Date: Sun, 19 Apr 2026 20:11:08 -0400
Subject: [PATCH] fix(swarm): don't paint healthy deckies as failed when a shard-sibling fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

docker compose up is partial-success-friendly — a build failure on one
service doesn't roll back the others. But the master was catching the
agent's 500 and tagging every decky in the shard as 'failed' with the
same error message. From the UI that looked like all three deckies died
even though two were live on the worker.

On dispatch exception, probe the agent's /status to learn which deckies
actually have running containers, and upsert per-decky state
accordingly. Only fall back to marking the whole shard failed if the
status probe itself is unreachable.

Enhance agent.executor.status() to include a 'runtime' map keyed by
decky name with per-service container state, so the master has
something concrete to consult.
---
 decnet/agent/executor.py                    | 30 ++++++++++
 decnet/web/router/swarm/api_deploy_swarm.py | 18 +++++-
 tests/swarm/test_swarm_api.py               | 62 +++++++++++++++++++++
 3 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/decnet/agent/executor.py b/decnet/agent/executor.py
index 3c1030f..76eaff2 100644
--- a/decnet/agent/executor.py
+++ b/decnet/agent/executor.py
@@ -87,14 +87,44 @@ async def teardown(decky_id: str | None = None) -> None:
     await asyncio.to_thread(clear_state)
 
 
+def _decky_runtime_states(config: DecnetConfig) -> dict[str, dict[str, Any]]:
+    """Map decky_name → {"running": bool, "services": {svc: container_state}}.
+
+    Queried so the master can tell, after a partial-failure deploy, which
+    deckies actually came up instead of tainting the whole shard as failed.
+    Best-effort: a docker error returns an empty map, not an exception.
+    """
+    try:
+        import docker  # local import — agent-only path
+        client = docker.from_env()
+        live = {c.name: c.status for c in client.containers.list(all=True, ignore_removed=True)}
+    except Exception:  # pragma: no cover — defensive
+        log.exception("_decky_runtime_states: docker query failed")
+        return {}
+
+    out: dict[str, dict[str, Any]] = {}
+    for d in config.deckies:
+        svc_states = {
+            svc: live.get(f"{d.name}-{svc.replace('_', '-')}", "absent")
+            for svc in d.services
+        }
+        out[d.name] = {
+            "running": bool(svc_states) and all(s == "running" for s in svc_states.values()),
+            "services": svc_states,
+        }
+    return out
+
+
 async def status() -> dict[str, Any]:
     state = await asyncio.to_thread(load_state)
     if state is None:
         return {"deployed": False, "deckies": []}
     config, _compose_path = state
+    runtime = await asyncio.to_thread(_decky_runtime_states, config)
     return {
         "deployed": True,
         "mode": config.mode,
         "compose_path": str(_compose_path),
         "deckies": [d.model_dump() for d in config.deckies],
+        "runtime": runtime,
     }
diff --git a/decnet/web/router/swarm/api_deploy_swarm.py b/decnet/web/router/swarm/api_deploy_swarm.py
index b15f4b7..2b19ebc 100644
--- a/decnet/web/router/swarm/api_deploy_swarm.py
+++ b/decnet/web/router/swarm/api_deploy_swarm.py
@@ -98,14 +98,28 @@ async def dispatch_decnet_config(
             return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
         except Exception as exc:
             log.exception("swarm.deploy dispatch failed host=%s", host["name"])
+            # Compose-up is partial-success-friendly: one decky failing to
+            # build doesn't roll back the ones that already came up. Ask the
+            # agent which containers actually exist before painting the whole
+            # shard red — otherwise decky1 and decky2 look "failed" even
+            # though they're live on the worker.
+            runtime: dict[str, Any] = {}
+            try:
+                async with AgentClient(host=host) as probe:
+                    snap = await probe.status()
+                    runtime = snap.get("runtime") or {}
+            except Exception:
+                log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", host["name"])
             for d in shard:
+                rstate = runtime.get(d.name) or {}
+                is_up = bool(rstate.get("running"))
                 await repo.upsert_decky_shard(
                     {
                         "decky_name": d.name,
                         "host_uuid": host_uuid,
                         "services": json.dumps(d.services),
-                        "state": "failed",
-                        "last_error": str(exc)[:512],
+                        "state": "running" if is_up else "failed",
+                        "last_error": None if is_up else str(exc)[:512],
                         "updated_at": datetime.now(timezone.utc),
                     }
                 )
diff --git a/tests/swarm/test_swarm_api.py b/tests/swarm/test_swarm_api.py
index 02f0759..77629d8 100644
--- a/tests/swarm/test_swarm_api.py
+++ b/tests/swarm/test_swarm_api.py
@@ -273,6 +273,68 @@ def test_deploy_rejects_missing_host_uuid(client: TestClient, stub_agent) -> Non
     assert "host_uuid" in resp.json()["detail"]
 
 
+def test_deploy_partial_failure_only_marks_actually_failed_decky(
+    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """docker compose up is partial-success-friendly: one failed service
+    doesn't roll back the ones already up. The master must probe /status
+    after a dispatch exception so healthy deckies aren't painted red just
+    because a sibling in the same shard failed."""
+
+    class _PartialFailAgent:
+        def __init__(self, host=None, **_):
+            self._host = host or {}
+
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, *exc):
+            return None
+
+        async def deploy(self, config, **kw):
+            raise RuntimeError("Server error '500 Internal Server Error'")
+
+        async def status(self):
+            return {
+                "deployed": True,
+                "runtime": {
+                    "decky1": {"running": True, "services": {"ssh": "running"}},
+                    "decky2": {"running": True, "services": {"ssh": "running"}},
+                    "decky3": {"running": False, "services": {"ssh": "absent"}},
+                },
+            }
+
+    from decnet.web.router.swarm import api_deploy_swarm as deploy_mod
+    monkeypatch.setattr(deploy_mod, "AgentClient", _PartialFailAgent)
+
+    h1 = client.post(
+        "/swarm/enroll",
+        json={"name": "decktest", "address": "192.168.1.47", "agent_port": 8765},
+    ).json()
+
+    cfg = {
+        "mode": "swarm",
+        "interface": "eth0",
+        "subnet": "192.168.1.0/24",
+        "gateway": "192.168.1.1",
+        "deckies": [
+            _decky_dict("decky1", h1["host_uuid"], "192.168.1.2"),
+            _decky_dict("decky2", h1["host_uuid"], "192.168.1.3"),
+            _decky_dict("decky3", h1["host_uuid"], "192.168.1.4"),
+        ],
+    }
+    resp = client.post("/swarm/deploy", json={"config": cfg})
+    assert resp.status_code == 200
+    assert resp.json()["results"][0]["ok"] is False
+
+    shards = {s["decky_name"]: s for s in client.get("/swarm/deckies").json()}
+    assert shards["decky1"]["state"] == "running"
+    assert shards["decky1"]["last_error"] is None
+    assert shards["decky2"]["state"] == "running"
+    assert shards["decky3"]["state"] == "failed"
+    assert "500" in (shards["decky3"]["last_error"] or "")
+
+
 def test_deploy_rejects_non_swarm_mode(client: TestClient, stub_agent) -> None:
     cfg = {
         "mode": "unihost",