merge testing->tomerge/main #7
@@ -87,14 +87,44 @@ async def teardown(decky_id: str | None = None) -> None:
|
|||||||
await asyncio.to_thread(clear_state)
|
await asyncio.to_thread(clear_state)
|
||||||
|
|
||||||
|
|
||||||
|
def _decky_runtime_states(config: DecnetConfig) -> dict[str, dict[str, Any]]:
    """Map decky_name → {"running": bool, "services": {svc: container_state}}.

    Queried so the master can tell, after a partial-failure deploy, which
    deckies actually came up instead of tainting the whole shard as failed.

    Each service is looked up by container name ``<decky name>-<service>``,
    with underscores in the service name replaced by hyphens. A service with
    no matching container is reported as ``"absent"``. A decky counts as
    running only when it has at least one service and every one of its
    services reports ``"running"``.

    Best-effort: a docker error returns an empty map, not an exception.
    """
    try:
        import docker  # local import — agent-only path

        client = docker.from_env()
        try:
            # all=True so stopped/exited containers are reported with their
            # real status rather than showing up as "absent".
            live = {
                c.name: c.status
                for c in client.containers.list(all=True, ignore_removed=True)
            }
        finally:
            # Close the client so repeated status calls don't leave the
            # underlying HTTP session open. A failure here is treated like
            # any other docker error by the handler below.
            client.close()
    except Exception:  # pragma: no cover — defensive
        log.exception("_decky_runtime_states: docker query failed")
        return {}

    out: dict[str, dict[str, Any]] = {}
    for d in config.deckies:
        svc_states = {
            svc: live.get(f"{d.name}-{svc.replace('_', '-')}", "absent")
            for svc in d.services
        }
        out[d.name] = {
            # bool(svc_states) guards against all() being vacuously True for
            # a decky that declares no services.
            "running": bool(svc_states) and all(s == "running" for s in svc_states.values()),
            "services": svc_states,
        }
    return out
|
||||||
|
|
||||||
|
|
||||||
async def status() -> dict[str, Any]:
    """Report the persisted deployment together with live container state.

    Returns ``{"deployed": False, "deckies": []}`` when no state has been
    saved. Otherwise returns the stored mode, compose path and decky
    definitions, plus a ``"runtime"`` map produced by
    ``_decky_runtime_states``. Both the state load and the runtime query are
    run through ``asyncio.to_thread``.
    """
    loaded = await asyncio.to_thread(load_state)
    if loaded is None:
        return {"deployed": False, "deckies": []}

    cfg, compose_file = loaded
    live_states = await asyncio.to_thread(_decky_runtime_states, cfg)

    payload: dict[str, Any] = {
        "deployed": True,
        "mode": cfg.mode,
        "compose_path": str(compose_file),
        "deckies": [decky.model_dump() for decky in cfg.deckies],
        "runtime": live_states,
    }
    return payload
|
||||||
|
|||||||
@@ -98,14 +98,28 @@ async def dispatch_decnet_config(
|
|||||||
return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
|
return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
log.exception("swarm.deploy dispatch failed host=%s", host["name"])
|
log.exception("swarm.deploy dispatch failed host=%s", host["name"])
|
||||||
|
# Compose-up is partial-success-friendly: one decky failing to
|
||||||
|
# build doesn't roll back the ones that already came up. Ask the
|
||||||
|
# agent which containers actually exist before painting the whole
|
||||||
|
# shard red — otherwise decky1 and decky2 look "failed" even
|
||||||
|
# though they're live on the worker.
|
||||||
|
runtime: dict[str, Any] = {}
|
||||||
|
try:
|
||||||
|
async with AgentClient(host=host) as probe:
|
||||||
|
snap = await probe.status()
|
||||||
|
runtime = snap.get("runtime") or {}
|
||||||
|
except Exception:
|
||||||
|
log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", host["name"])
|
||||||
for d in shard:
|
for d in shard:
|
||||||
|
rstate = runtime.get(d.name) or {}
|
||||||
|
is_up = bool(rstate.get("running"))
|
||||||
await repo.upsert_decky_shard(
|
await repo.upsert_decky_shard(
|
||||||
{
|
{
|
||||||
"decky_name": d.name,
|
"decky_name": d.name,
|
||||||
"host_uuid": host_uuid,
|
"host_uuid": host_uuid,
|
||||||
"services": json.dumps(d.services),
|
"services": json.dumps(d.services),
|
||||||
"state": "failed",
|
"state": "running" if is_up else "failed",
|
||||||
"last_error": str(exc)[:512],
|
"last_error": None if is_up else str(exc)[:512],
|
||||||
"updated_at": datetime.now(timezone.utc),
|
"updated_at": datetime.now(timezone.utc),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -273,6 +273,68 @@ def test_deploy_rejects_missing_host_uuid(client: TestClient, stub_agent) -> Non
|
|||||||
assert "host_uuid" in resp.json()["detail"]
|
assert "host_uuid" in resp.json()["detail"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_deploy_partial_failure_only_marks_actually_failed_decky(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A dispatch error must not taint deckies the agent reports as running.

    docker compose up is partial-success-friendly: one failed service does
    not roll back the ones already up. After a dispatch exception the master
    is expected to probe the agent's status so that healthy deckies are not
    painted red just because a sibling in the same shard failed.
    """
    from decnet.web.router.swarm import api_deploy_swarm as deploy_mod

    class _PartialFailAgent:
        """Agent stub: deploy always raises, status reports two of three up."""

        def __init__(self, host=None, **_):
            self._host = host or {}

        async def __aenter__(self):
            return self

        async def __aexit__(self, *exc):
            return None

        async def deploy(self, config, **kw):
            raise RuntimeError("Server error '500 Internal Server Error'")

        async def status(self):
            # decky1 and decky2 came up; decky3 never got a container.
            liveness = (("decky1", True), ("decky2", True), ("decky3", False))
            runtime = {
                name: {
                    "running": alive,
                    "services": {"ssh": "running" if alive else "absent"},
                }
                for name, alive in liveness
            }
            return {"deployed": True, "runtime": runtime}

    monkeypatch.setattr(deploy_mod, "AgentClient", _PartialFailAgent)

    enroll_payload = {"name": "decktest", "address": "192.168.1.47", "agent_port": 8765}
    h1 = client.post("/swarm/enroll", json=enroll_payload).json()

    # All three deckies land on the same host, i.e. the same shard.
    placements = (
        ("decky1", "192.168.1.2"),
        ("decky2", "192.168.1.3"),
        ("decky3", "192.168.1.4"),
    )
    cfg = {
        "mode": "swarm",
        "interface": "eth0",
        "subnet": "192.168.1.0/24",
        "gateway": "192.168.1.1",
        "deckies": [
            _decky_dict(decky_name, h1["host_uuid"], ip_addr)
            for decky_name, ip_addr in placements
        ],
    }

    resp = client.post("/swarm/deploy", json={"config": cfg})
    assert resp.status_code == 200
    # The host-level result still reports the dispatch failure.
    first_result = resp.json()["results"][0]
    assert first_result["ok"] is False

    listing = client.get("/swarm/deckies").json()
    shards = {}
    for row in listing:
        shards[row["decky_name"]] = row

    assert shards["decky1"]["state"] == "running"
    assert shards["decky1"]["last_error"] is None
    assert shards["decky2"]["state"] == "running"
    assert shards["decky3"]["state"] == "failed"
    assert "500" in (shards["decky3"]["last_error"] or "")
|
||||||
|
|
||||||
|
|
||||||
def test_deploy_rejects_non_swarm_mode(client: TestClient, stub_agent) -> None:
|
def test_deploy_rejects_non_swarm_mode(client: TestClient, stub_agent) -> None:
|
||||||
cfg = {
|
cfg = {
|
||||||
"mode": "unihost",
|
"mode": "unihost",
|
||||||
|
|||||||
Reference in New Issue
Block a user