merge testing->tomerge/main #7

Open
anti wants to merge 242 commits from testing into tomerge/main
3 changed files with 108 additions and 2 deletions
Showing only changes of commit df18cb44cc - Show all commits

View File

@@ -87,14 +87,44 @@ async def teardown(decky_id: str | None = None) -> None:
await asyncio.to_thread(clear_state)
def _decky_runtime_states(config: DecnetConfig) -> dict[str, dict[str, Any]]:
    """Map decky_name → {"running": bool, "services": {svc: container_state}}.

    Queried so the master can tell, after a partial-failure deploy, which
    deckies actually came up instead of tainting the whole shard as failed.

    Containers are matched by name using ``<decky name>-<service>``, with
    underscores in the service name replaced by hyphens. A service with no
    matching container is reported as ``"absent"``. A decky counts as
    running only when it has at least one service and every one of them is
    in the ``"running"`` state.

    Best-effort: a docker error returns an empty map, not an exception.
    """
    try:
        import docker  # local import — agent-only path

        client = docker.from_env()
        try:
            live = {
                c.name: c.status
                for c in client.containers.list(all=True, ignore_removed=True)
            }
        finally:
            # Release the client's HTTP session; this helper runs on every
            # status call, so an unclosed client would pile up connections.
            try:
                client.close()
            except Exception:  # pragma: no cover — defensive
                # A failed close must not discard a snapshot we already have.
                log.debug("_decky_runtime_states: docker client close failed", exc_info=True)
    except Exception:  # pragma: no cover — defensive
        log.exception("_decky_runtime_states: docker query failed")
        return {}

    out: dict[str, dict[str, Any]] = {}
    for d in config.deckies:
        svc_states = {
            svc: live.get(f"{d.name}-{svc.replace('_', '-')}", "absent")
            for svc in d.services
        }
        out[d.name] = {
            # bool(...) guards the vacuous case: all() over no services is True.
            "running": bool(svc_states) and all(s == "running" for s in svc_states.values()),
            "services": svc_states,
        }
    return out
async def status() -> dict[str, Any]:
    """Report the persisted deployment together with live container state.

    Returns ``{"deployed": False, "deckies": []}`` when ``load_state`` yields
    ``None``. Otherwise returns the deployment mode, the compose path as a
    string, each decky dumped via ``model_dump()``, and the per-decky runtime
    map produced by ``_decky_runtime_states``.

    Both the state load and the runtime query are run through
    ``asyncio.to_thread``.
    """
    loaded = await asyncio.to_thread(load_state)
    if loaded is None:
        return {"deployed": False, "deckies": []}

    cfg, compose_file = loaded
    runtime_map = await asyncio.to_thread(_decky_runtime_states, cfg)

    report: dict[str, Any] = {"deployed": True}
    report["mode"] = cfg.mode
    report["compose_path"] = str(compose_file)
    report["deckies"] = [decky.model_dump() for decky in cfg.deckies]
    report["runtime"] = runtime_map
    return report

View File

@@ -98,14 +98,28 @@ async def dispatch_decnet_config(
return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
except Exception as exc:
log.exception("swarm.deploy dispatch failed host=%s", host["name"])
# Compose-up is partial-success-friendly: one decky failing to
# build doesn't roll back the ones that already came up. Ask the
# agent which containers actually exist before painting the whole
# shard red — otherwise decky1 and decky2 look "failed" even
# though they're live on the worker.
runtime: dict[str, Any] = {}
try:
async with AgentClient(host=host) as probe:
snap = await probe.status()
runtime = snap.get("runtime") or {}
except Exception:
log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", host["name"])
for d in shard:
rstate = runtime.get(d.name) or {}
is_up = bool(rstate.get("running"))
await repo.upsert_decky_shard(
{
"decky_name": d.name,
"host_uuid": host_uuid,
"services": json.dumps(d.services),
"state": "failed",
"last_error": str(exc)[:512],
"state": "running" if is_up else "failed",
"last_error": None if is_up else str(exc)[:512],
"updated_at": datetime.now(timezone.utc),
}
)

View File

@@ -273,6 +273,68 @@ def test_deploy_rejects_missing_host_uuid(client: TestClient, stub_agent) -> Non
assert "host_uuid" in resp.json()["detail"]
def test_deploy_partial_failure_only_marks_actually_failed_decky(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A failed dispatch must not mark deckies that are actually up as failed.

    The stand-in agent raises from ``deploy`` but its ``status`` reports
    decky1 and decky2 running and decky3 absent. After the deploy call, only
    decky3 may be recorded as failed (carrying the dispatch error); the other
    two must be recorded as running.
    """
    from decnet.web.router.swarm import api_deploy_swarm as deploy_mod

    def _runtime_entry(up: bool) -> dict:
        # One per-decky runtime record in the shape the status payload uses.
        return {"running": up, "services": {"ssh": "running" if up else "absent"}}

    class _PartialFailAgent:
        """Agent double: deploy always raises, status shows a partial bring-up."""

        def __init__(self, host=None, **_):
            self._host = host or {}

        async def __aenter__(self):
            return self

        async def __aexit__(self, *exc):
            return None

        async def deploy(self, config, **kw):
            raise RuntimeError("Server error '500 Internal Server Error'")

        async def status(self):
            return {
                "deployed": True,
                "runtime": {
                    "decky1": _runtime_entry(True),
                    "decky2": _runtime_entry(True),
                    "decky3": _runtime_entry(False),
                },
            }

    monkeypatch.setattr(deploy_mod, "AgentClient", _PartialFailAgent)

    enroll_body = {"name": "decktest", "address": "192.168.1.47", "agent_port": 8765}
    h1 = client.post("/swarm/enroll", json=enroll_body).json()

    decky_addresses = (
        ("decky1", "192.168.1.2"),
        ("decky2", "192.168.1.3"),
        ("decky3", "192.168.1.4"),
    )
    cfg = {
        "mode": "swarm",
        "interface": "eth0",
        "subnet": "192.168.1.0/24",
        "gateway": "192.168.1.1",
        "deckies": [
            _decky_dict(name, h1["host_uuid"], ip) for name, ip in decky_addresses
        ],
    }

    resp = client.post("/swarm/deploy", json={"config": cfg})
    assert resp.status_code == 200
    # The dispatch itself still counts as failed for the host as a whole.
    assert resp.json()["results"][0]["ok"] is False

    by_name = {row["decky_name"]: row for row in client.get("/swarm/deckies").json()}

    for healthy in ("decky1", "decky2"):
        assert by_name[healthy]["state"] == "running"
    assert by_name["decky1"]["last_error"] is None

    broken = by_name["decky3"]
    assert broken["state"] == "failed"
    assert "500" in (broken["last_error"] or "")
def test_deploy_rejects_non_swarm_mode(client: TestClient, stub_agent) -> None:
cfg = {
"mode": "unihost",