refactor(swarm): extract _shard_payload helper and promote _dispatch to module-level

This commit is contained in:
2026-04-30 20:25:38 -04:00
parent c648d8b04e
commit e124f9e296

View File

@@ -57,6 +57,67 @@ def _worker_config(
return base.model_copy(update=updates)
def _shard_payload(
    d: DeckyConfig,
    host_uuid: str,
    state: str,
    error: str | None,
) -> dict[str, Any]:
    """Assemble the row dict describing one decky's placement on a host.

    Args:
        d: Decky whose name, services, IP and serialized config are recorded.
        host_uuid: Identifier of the host the decky is assigned to.
        state: Value stored under ``"state"``.
        error: Value stored under ``"last_error"``; ``None`` when there is none.

    Returns:
        A new dict with the keys ``decky_name``, ``host_uuid``, ``services``
        (JSON text), ``decky_config`` (JSON text), ``decky_ip``, ``state``,
        ``last_error`` and ``updated_at`` (timezone-aware UTC timestamp taken
        at call time).
    """
    payload: dict[str, Any] = {"decky_name": d.name, "host_uuid": host_uuid}
    # Both the service list and the full config are stored as JSON text.
    payload["services"] = json.dumps(d.services)
    payload["decky_config"] = d.model_dump_json()
    payload["decky_ip"] = d.ip
    payload["state"] = state
    payload["last_error"] = error
    payload["updated_at"] = datetime.now(timezone.utc)
    return payload
async def _dispatch(
    host_uuid: str,
    shard: list[DeckyConfig],
    hosts: dict[str, dict[str, Any]],
    config: DecnetConfig,
    repo: BaseRepository,
    dry_run: bool,
    no_cache: bool,
) -> SwarmHostResult:
    """Deploy one host's shard through its agent and persist per-decky state.

    On success every decky in ``shard`` is upserted as ``"running"``
    (``"pending"`` for a dry run) and the host row is set to ``"active"``.
    On any exception the agent is asked for its status so that deckies it
    reports as running keep the ``"running"`` state; all others are upserted
    as ``"failed"`` with the error text truncated to 512 characters.

    Args:
        host_uuid: Key into ``hosts`` identifying the target host.
        shard: Deckies assigned to that host.
        hosts: Host rows keyed by uuid; the row's ``"name"`` is used for
            logging and in the result.
        config: Base configuration passed to ``_worker_config``.
        repo: Repository receiving the shard and host updates.
        dry_run: Forwarded to the agent's ``deploy`` call.
        no_cache: Forwarded to the agent's ``deploy`` call.

    Returns:
        A ``SwarmHostResult`` with ``ok=True`` and the agent's response as
        ``detail`` on success, or ``ok=False`` and the error text on failure.

    Raises:
        KeyError: If ``host_uuid`` is not present in ``hosts``.
    """
    host = hosts[host_uuid]
    cfg = _worker_config(config, shard, host)
    try:
        async with AgentClient(host=host) as agent:
            body = await agent.deploy(cfg, dry_run=dry_run, no_cache=no_cache)
        # The success state is the same for every decky in the shard.
        state = "pending" if dry_run else "running"
        for d in shard:
            await repo.upsert_decky_shard(_shard_payload(d, host_uuid, state, None))
        await repo.update_swarm_host(host_uuid, {"status": "active"})
        return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
    except Exception as exc:
        log.exception("swarm.deploy dispatch failed host=%s", host["name"])
        # Exceptions raised without a message (a bare TimeoutError(), for
        # instance) stringify to "". Fall back to the class name so the
        # stored last_error and the returned detail are never blank.
        reason = str(exc) or type(exc).__name__
        # Compose-up is partial-success-friendly: one decky failing to
        # build doesn't roll back the ones that already came up. Ask the
        # agent which containers actually exist before painting the whole
        # shard red — otherwise decky1 and decky2 look "failed" even
        # though they're live on the worker.
        runtime: dict[str, Any] = {}
        try:
            async with AgentClient(host=host) as probe:
                snap = await probe.status()
                runtime = snap.get("runtime") or {}
        except Exception:
            # Best effort: with no snapshot, every decky falls through to "failed".
            log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", host["name"])
        for d in shard:
            rstate = runtime.get(d.name) or {}
            is_up = bool(rstate.get("running"))
            await repo.upsert_decky_shard(
                _shard_payload(d, host_uuid, "running" if is_up else "failed", None if is_up else reason[:512])
            )
        return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=False, detail=reason)
async def dispatch_decnet_config(
config: DecnetConfig,
repo: BaseRepository,
@@ -77,60 +138,11 @@ async def dispatch_decnet_config(
raise HTTPException(status_code=404, detail=f"unknown host_uuid: {host_uuid}")
hosts[host_uuid] = row
async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> SwarmHostResult:
    """Deploy ``shard`` to the host ``host_uuid`` and record each decky's state.

    Reads ``hosts``, ``config``, ``repo``, ``dry_run`` and ``no_cache`` from
    the enclosing scope. Returns a ``SwarmHostResult`` with ``ok=True`` and
    the agent's response on success, or ``ok=False`` and the error text when
    anything in the deploy path raised.
    """

    def _row(decky: DeckyConfig, state: str, last_error: str | None) -> dict[str, Any]:
        # One shard row; services and config are stored as JSON text.
        return dict(
            decky_name=decky.name,
            host_uuid=host_uuid,
            services=json.dumps(decky.services),
            decky_config=decky.model_dump_json(),
            decky_ip=decky.ip,
            state=state,
            last_error=last_error,
            updated_at=datetime.now(timezone.utc),
        )

    target = hosts[host_uuid]
    worker_cfg = _worker_config(config, shard, target)
    try:
        async with AgentClient(host=target) as agent:
            reply = await agent.deploy(worker_cfg, dry_run=dry_run, no_cache=no_cache)
        ok_state = "pending" if dry_run else "running"
        for decky in shard:
            await repo.upsert_decky_shard(_row(decky, ok_state, None))
        await repo.update_swarm_host(host_uuid, {"status": "active"})
        return SwarmHostResult(host_uuid=host_uuid, host_name=target["name"], ok=True, detail=reply)
    except Exception as exc:
        log.exception("swarm.deploy dispatch failed host=%s", target["name"])
        # A failed compose-up can still leave some deckies running on the
        # worker, so ask the agent what is actually up before marking the
        # whole shard as failed.
        live: dict[str, Any] = {}
        try:
            async with AgentClient(host=target) as probe:
                snapshot = await probe.status()
                live = snapshot.get("runtime") or {}
        except Exception:
            log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", target["name"])
        for decky in shard:
            if (live.get(decky.name) or {}).get("running"):
                row = _row(decky, "running", None)
            else:
                row = _row(decky, "failed", str(exc)[:512])
            await repo.upsert_decky_shard(row)
        return SwarmHostResult(host_uuid=host_uuid, host_name=target["name"], ok=False, detail=str(exc))
results = await asyncio.gather(
*(_dispatch(uuid_, shard) for uuid_, shard in buckets.items())
*(
_dispatch(uuid_, shard, hosts, config, repo, dry_run, no_cache)
for uuid_, shard in buckets.items()
)
)
return SwarmDeployResponse(results=list(results))