def _decky_runtime_states(config: DecnetConfig) -> dict[str, dict[str, Any]]:
    """Map decky_name -> {"running": bool, "services": {svc: container_state}}.

    Queried so the master can tell, after a partial-failure deploy, which
    deckies actually came up instead of tainting the whole shard as failed.

    A decky counts as running only when it declares at least one service
    and every one of its service containers reports the ``"running"``
    status. A service whose container is not listed is reported as
    ``"absent"``.

    Best-effort: any docker error (including the SDK being unimportable)
    is logged and yields an empty map, never an exception.
    """
    try:
        import docker  # local import — agent-only path

        client = docker.from_env()
        try:
            live = {
                c.name: c.status
                for c in client.containers.list(all=True, ignore_removed=True)
            }
        finally:
            # Each from_env() call opens its own connection pool; release it
            # so repeated status polls don't leak sockets to the daemon.
            client.close()
    except Exception:  # pragma: no cover — defensive
        log.exception("_decky_runtime_states: docker query failed")
        return {}

    out: dict[str, dict[str, Any]] = {}
    for d in config.deckies:
        # Container names are looked up as "<decky>-<service>", with
        # underscores in the service name replaced by hyphens.
        svc_states = {
            svc: live.get(f"{d.name}-{svc.replace('_', '-')}", "absent")
            for svc in d.services
        }
        out[d.name] = {
            # bool(...) guard: all() over an empty dict is True, and a decky
            # with no services must not be reported as running.
            "running": bool(svc_states) and all(s == "running" for s in svc_states.values()),
            "services": svc_states,
        }
    return out


async def status() -> dict[str, Any]:
    """Return the agent's deployment snapshot.

    When no state is stored, returns ``{"deployed": False, "deckies": []}``.
    Otherwise returns the stored mode, compose path and decky configs, plus
    a ``"runtime"`` map built by ``_decky_runtime_states``. Both the state
    load and the runtime query are run via ``asyncio.to_thread``.
    """
    state = await asyncio.to_thread(load_state)
    if state is None:
        return {"deployed": False, "deckies": []}
    config, _compose_path = state
    runtime = await asyncio.to_thread(_decky_runtime_states, config)
    return {
        "deployed": True,
        "mode": config.mode,
        "compose_path": str(_compose_path),
        "deckies": [d.model_dump() for d in config.deckies],
        "runtime": runtime,
    }
def test_deploy_partial_failure_only_marks_actually_failed_decky(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A failed dispatch must not mark every decky in the shard as failed.

    The stub agent raises on ``deploy`` but its ``status`` reports decky1
    and decky2 as running and decky3 as absent. After the deploy call the
    stored shards must show the first two as running and only decky3 as
    failed, carrying the dispatch error text.
    """
    from decnet.web.router.swarm import api_deploy_swarm as deploy_mod

    class _PartialFailAgent:
        """Agent stub: deploy always fails, status reports a mixed shard."""

        def __init__(self, host=None, **_):
            self._host = host or {}

        async def __aenter__(self):
            return self

        async def __aexit__(self, *exc):
            return None

        async def deploy(self, config, **kw):
            raise RuntimeError("Server error '500 Internal Server Error'")

        async def status(self):
            runtime_view = {
                "decky1": {"running": True, "services": {"ssh": "running"}},
                "decky2": {"running": True, "services": {"ssh": "running"}},
                "decky3": {"running": False, "services": {"ssh": "absent"}},
            }
            return {"deployed": True, "runtime": runtime_view}

    monkeypatch.setattr(deploy_mod, "AgentClient", _PartialFailAgent)

    enroll_payload = {"name": "decktest", "address": "192.168.1.47", "agent_port": 8765}
    enrolled = client.post("/swarm/enroll", json=enroll_payload).json()
    worker_uuid = enrolled["host_uuid"]

    # All three deckies land on the same worker, i.e. in one shard.
    decky_ips = {
        "decky1": "192.168.1.2",
        "decky2": "192.168.1.3",
        "decky3": "192.168.1.4",
    }
    swarm_cfg = {
        "mode": "swarm",
        "interface": "eth0",
        "subnet": "192.168.1.0/24",
        "gateway": "192.168.1.1",
        "deckies": [
            _decky_dict(decky_name, worker_uuid, ip)
            for decky_name, ip in decky_ips.items()
        ],
    }

    deploy_resp = client.post("/swarm/deploy", json={"config": swarm_cfg})
    assert deploy_resp.status_code == 200
    assert deploy_resp.json()["results"][0]["ok"] is False

    by_name = {row["decky_name"]: row for row in client.get("/swarm/deckies").json()}

    healthy_one = by_name["decky1"]
    assert healthy_one["state"] == "running"
    assert healthy_one["last_error"] is None

    assert by_name["decky2"]["state"] == "running"

    broken = by_name["decky3"]
    assert broken["state"] == "failed"
    assert "500" in (broken["last_error"] or "")