refactor(swarm): extract _shard_payload helper and promote _dispatch to module-level
This commit is contained in:
@@ -57,6 +57,67 @@ def _worker_config(
|
|||||||
return base.model_copy(update=updates)
|
return base.model_copy(update=updates)
|
||||||
|
|
||||||
|
|
||||||
|
def _shard_payload(
|
||||||
|
d: DeckyConfig,
|
||||||
|
host_uuid: str,
|
||||||
|
state: str,
|
||||||
|
error: str | None,
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"decky_name": d.name,
|
||||||
|
"host_uuid": host_uuid,
|
||||||
|
"services": json.dumps(d.services),
|
||||||
|
"decky_config": d.model_dump_json(),
|
||||||
|
"decky_ip": d.ip,
|
||||||
|
"state": state,
|
||||||
|
"last_error": error,
|
||||||
|
"updated_at": datetime.now(timezone.utc),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def _dispatch(
    host_uuid: str,
    shard: list[DeckyConfig],
    hosts: dict[str, dict[str, Any]],
    config: DecnetConfig,
    repo: BaseRepository,
    dry_run: bool,
    no_cache: bool,
) -> SwarmHostResult:
    """Deploy one host's shard of deckies and record each decky's outcome.

    Args:
        host_uuid: Key into ``hosts`` identifying the target host.
        shard: Deckies assigned to that host.
        hosts: Host rows keyed by UUID; the row's ``"name"`` is used for
            logging and in the result.
        config: Base configuration passed to ``_worker_config`` together
            with the shard and host row.
        repo: Repository used to upsert decky shard rows and update the
            host's status.
        dry_run: Forwarded to ``agent.deploy``; when true, shard rows are
            stored as "pending" instead of "running".
        no_cache: Forwarded to ``agent.deploy``.

    Returns:
        ``SwarmHostResult`` with ``ok=True`` and the agent's deploy response
        as ``detail`` on success, or ``ok=False`` and the error text on
        failure. Failures in the deploy/record path are caught and reported
        in the result rather than raised.
    """
    host = hosts[host_uuid]
    cfg = _worker_config(config, shard, host)
    try:
        async with AgentClient(host=host) as agent:
            body = await agent.deploy(cfg, dry_run=dry_run, no_cache=no_cache)
        deployed_state = "running" if not dry_run else "pending"
        for d in shard:
            await repo.upsert_decky_shard(
                _shard_payload(d, host_uuid, deployed_state, None)
            )
        await repo.update_swarm_host(host_uuid, {"status": "active"})
        return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
    except Exception as exc:
        log.exception("swarm.deploy dispatch failed host=%s", host["name"])
        # Some exceptions (e.g. a bare TimeoutError) stringify to "", which
        # would leave a failed shard with no diagnostic at all. Fall back to
        # the exception class name so the stored error is never empty.
        reason = str(exc) or type(exc).__name__
        # Compose-up is partial-success-friendly: one decky failing to
        # build doesn't roll back the ones that already came up. Ask the
        # agent which containers actually exist before painting the whole
        # shard red — otherwise decky1 and decky2 look "failed" even
        # though they're live on the worker.
        runtime: dict[str, Any] = {}
        try:
            async with AgentClient(host=host) as probe:
                snap = await probe.status()
            runtime = snap.get("runtime") or {}
        except Exception:
            # Best-effort probe: with no runtime data every decky in the
            # shard is recorded as failed. Keep the traceback so the probe
            # failure itself can be diagnosed.
            log.warning(
                "swarm.deploy: runtime probe failed host=%s — marking shard failed",
                host["name"],
                exc_info=True,
            )
        for d in shard:
            rstate = runtime.get(d.name) or {}
            is_up = bool(rstate.get("running"))
            await repo.upsert_decky_shard(
                _shard_payload(
                    d,
                    host_uuid,
                    "running" if is_up else "failed",
                    # Stored error text is capped at 512 characters.
                    None if is_up else reason[:512],
                )
            )
        return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=False, detail=reason)
|
||||||
|
|
||||||
|
|
||||||
async def dispatch_decnet_config(
|
async def dispatch_decnet_config(
|
||||||
config: DecnetConfig,
|
config: DecnetConfig,
|
||||||
repo: BaseRepository,
|
repo: BaseRepository,
|
||||||
@@ -77,60 +138,11 @@ async def dispatch_decnet_config(
|
|||||||
raise HTTPException(status_code=404, detail=f"unknown host_uuid: {host_uuid}")
|
raise HTTPException(status_code=404, detail=f"unknown host_uuid: {host_uuid}")
|
||||||
hosts[host_uuid] = row
|
hosts[host_uuid] = row
|
||||||
|
|
||||||
async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> SwarmHostResult:
|
|
||||||
host = hosts[host_uuid]
|
|
||||||
cfg = _worker_config(config, shard, host)
|
|
||||||
try:
|
|
||||||
async with AgentClient(host=host) as agent:
|
|
||||||
body = await agent.deploy(cfg, dry_run=dry_run, no_cache=no_cache)
|
|
||||||
for d in shard:
|
|
||||||
await repo.upsert_decky_shard(
|
|
||||||
{
|
|
||||||
"decky_name": d.name,
|
|
||||||
"host_uuid": host_uuid,
|
|
||||||
"services": json.dumps(d.services),
|
|
||||||
"decky_config": d.model_dump_json(),
|
|
||||||
"decky_ip": d.ip,
|
|
||||||
"state": "running" if not dry_run else "pending",
|
|
||||||
"last_error": None,
|
|
||||||
"updated_at": datetime.now(timezone.utc),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
await repo.update_swarm_host(host_uuid, {"status": "active"})
|
|
||||||
return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
|
|
||||||
except Exception as exc:
|
|
||||||
log.exception("swarm.deploy dispatch failed host=%s", host["name"])
|
|
||||||
# Compose-up is partial-success-friendly: one decky failing to
|
|
||||||
# build doesn't roll back the ones that already came up. Ask the
|
|
||||||
# agent which containers actually exist before painting the whole
|
|
||||||
# shard red — otherwise decky1 and decky2 look "failed" even
|
|
||||||
# though they're live on the worker.
|
|
||||||
runtime: dict[str, Any] = {}
|
|
||||||
try:
|
|
||||||
async with AgentClient(host=host) as probe:
|
|
||||||
snap = await probe.status()
|
|
||||||
runtime = snap.get("runtime") or {}
|
|
||||||
except Exception:
|
|
||||||
log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", host["name"])
|
|
||||||
for d in shard:
|
|
||||||
rstate = runtime.get(d.name) or {}
|
|
||||||
is_up = bool(rstate.get("running"))
|
|
||||||
await repo.upsert_decky_shard(
|
|
||||||
{
|
|
||||||
"decky_name": d.name,
|
|
||||||
"host_uuid": host_uuid,
|
|
||||||
"services": json.dumps(d.services),
|
|
||||||
"decky_config": d.model_dump_json(),
|
|
||||||
"decky_ip": d.ip,
|
|
||||||
"state": "running" if is_up else "failed",
|
|
||||||
"last_error": None if is_up else str(exc)[:512],
|
|
||||||
"updated_at": datetime.now(timezone.utc),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=False, detail=str(exc))
|
|
||||||
|
|
||||||
results = await asyncio.gather(
|
results = await asyncio.gather(
|
||||||
*(_dispatch(uuid_, shard) for uuid_, shard in buckets.items())
|
*(
|
||||||
|
_dispatch(uuid_, shard, hosts, config, repo, dry_run, no_cache)
|
||||||
|
for uuid_, shard in buckets.items()
|
||||||
|
)
|
||||||
)
|
)
|
||||||
return SwarmDeployResponse(results=list(results))
|
return SwarmDeployResponse(results=list(results))
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user