merge testing->tomerge/main #7
@@ -87,14 +87,44 @@ async def teardown(decky_id: str | None = None) -> None:
|
||||
await asyncio.to_thread(clear_state)
|
||||
|
||||
|
||||
def _decky_runtime_states(config: DecnetConfig) -> dict[str, dict[str, Any]]:
    """Return per-decky runtime info keyed by decky name.

    Each value has the shape {"running": bool, "services": {svc: state}}.
    After a partial-failure deploy the master uses this to tell which
    deckies actually came up, instead of tainting the whole shard as
    failed. Best-effort: any docker error yields an empty mapping rather
    than propagating an exception.
    """
    try:
        import docker  # local import — agent-only path

        # Snapshot every container's status, keyed by container name.
        container_status = {
            c.name: c.status
            for c in docker.from_env().containers.list(all=True, ignore_removed=True)
        }
    except Exception:  # pragma: no cover — defensive
        log.exception("_decky_runtime_states: docker query failed")
        return {}

    states: dict[str, dict[str, Any]] = {}
    for decky in config.deckies:
        # Container naming convention: "<decky>-<service>" with '_' → '-'.
        per_service: dict[str, str] = {}
        for svc in decky.services:
            container_name = f"{decky.name}-{svc.replace('_', '-')}"
            per_service[svc] = container_status.get(container_name, "absent")
        # A decky counts as running only when it has services and every
        # one of them reports "running".
        all_running = bool(per_service) and all(
            state == "running" for state in per_service.values()
        )
        states[decky.name] = {"running": all_running, "services": per_service}
    return states
|
||||
|
||||
|
||||
async def status() -> dict[str, Any]:
    """Report the agent's deployment status as a JSON-serialisable dict.

    Both state loading and the docker runtime probe are blocking calls,
    so each runs in a worker thread to keep the event loop responsive.
    Returns ``{"deployed": False, "deckies": []}`` when nothing has been
    deployed yet.
    """
    loaded = await asyncio.to_thread(load_state)
    if loaded is None:
        return {"deployed": False, "deckies": []}

    config, compose_path = loaded
    runtime = await asyncio.to_thread(_decky_runtime_states, config)
    report: dict[str, Any] = {
        "deployed": True,
        "mode": config.mode,
        "compose_path": str(compose_path),
        "deckies": [decky.model_dump() for decky in config.deckies],
        "runtime": runtime,
    }
    return report
|
||||
|
||||
@@ -98,14 +98,28 @@ async def dispatch_decnet_config(
|
||||
return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
|
||||
except Exception as exc:
|
||||
log.exception("swarm.deploy dispatch failed host=%s", host["name"])
|
||||
# Compose-up is partial-success-friendly: one decky failing to
|
||||
# build doesn't roll back the ones that already came up. Ask the
|
||||
# agent which containers actually exist before painting the whole
|
||||
# shard red — otherwise decky1 and decky2 look "failed" even
|
||||
# though they're live on the worker.
|
||||
runtime: dict[str, Any] = {}
|
||||
try:
|
||||
async with AgentClient(host=host) as probe:
|
||||
snap = await probe.status()
|
||||
runtime = snap.get("runtime") or {}
|
||||
except Exception:
|
||||
log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", host["name"])
|
||||
for d in shard:
|
||||
rstate = runtime.get(d.name) or {}
|
||||
is_up = bool(rstate.get("running"))
|
||||
await repo.upsert_decky_shard(
|
||||
{
|
||||
"decky_name": d.name,
|
||||
"host_uuid": host_uuid,
|
||||
"services": json.dumps(d.services),
|
||||
"state": "failed",
|
||||
"last_error": str(exc)[:512],
|
||||
"state": "running" if is_up else "failed",
|
||||
"last_error": None if is_up else str(exc)[:512],
|
||||
"updated_at": datetime.now(timezone.utc),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -273,6 +273,68 @@ def test_deploy_rejects_missing_host_uuid(client: TestClient, stub_agent) -> Non
|
||||
assert "host_uuid" in resp.json()["detail"]
|
||||
|
||||
|
||||
def test_deploy_partial_failure_only_marks_actually_failed_decky(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """docker compose up is partial-success-friendly: one failed service
    doesn't roll back the ones already up. The master must probe /status
    after a dispatch exception so healthy deckies aren't painted red just
    because a sibling in the same shard failed."""

    class _PartialFailAgent:
        """Agent stub: deploy always raises, but the follow-up status
        probe reports decky1/decky2 running and decky3 down."""

        def __init__(self, host=None, **_):
            self._host = host or {}

        async def __aenter__(self):
            return self

        async def __aexit__(self, *exc):
            return None

        async def deploy(self, config, **kw):
            # Simulate the dispatch blowing up after compose partially
            # brought the shard online.
            raise RuntimeError("Server error '500 Internal Server Error'")

        async def status(self):
            return {
                "deployed": True,
                "runtime": {
                    "decky1": {"running": True, "services": {"ssh": "running"}},
                    "decky2": {"running": True, "services": {"ssh": "running"}},
                    "decky3": {"running": False, "services": {"ssh": "absent"}},
                },
            }

    from decnet.web.router.swarm import api_deploy_swarm as deploy_mod
    monkeypatch.setattr(deploy_mod, "AgentClient", _PartialFailAgent)

    h1 = client.post(
        "/swarm/enroll",
        json={"name": "decktest", "address": "192.168.1.47", "agent_port": 8765},
    ).json()

    cfg = {
        "mode": "swarm",
        "interface": "eth0",
        "subnet": "192.168.1.0/24",
        "gateway": "192.168.1.1",
        "deckies": [
            _decky_dict("decky1", h1["host_uuid"], "192.168.1.2"),
            _decky_dict("decky2", h1["host_uuid"], "192.168.1.3"),
            _decky_dict("decky3", h1["host_uuid"], "192.168.1.4"),
        ],
    }
    resp = client.post("/swarm/deploy", json={"config": cfg})
    assert resp.status_code == 200
    # The host-level result still reports failure — the dispatch did raise.
    assert resp.json()["results"][0]["ok"] is False

    shards = {s["decky_name"]: s for s in client.get("/swarm/deckies").json()}
    # Deckies the runtime probe saw as up stay "running" with no error
    # recorded; only the decky that never came up is marked failed.
    assert shards["decky1"]["state"] == "running"
    assert shards["decky1"]["last_error"] is None
    assert shards["decky2"]["state"] == "running"
    # Fix: also assert decky2's last_error — previously only decky1's was
    # checked, so a regression stamping the error on decky2 would pass.
    assert shards["decky2"]["last_error"] is None
    assert shards["decky3"]["state"] == "failed"
    assert "500" in (shards["decky3"]["last_error"] or "")
|
||||
|
||||
|
||||
def test_deploy_rejects_non_swarm_mode(client: TestClient, stub_agent) -> None:
|
||||
cfg = {
|
||||
"mode": "unihost",
|
||||
|
||||
Reference in New Issue
Block a user