feat(agent): real worker-side /mutate with master swarm dispatch
- Implement /mutate handler: load_state, update services + last_mutated, save_state, write_compose, compose up -d via asyncio.to_thread. 404 for missing state / unknown decky_id. dry_run short-circuits before any side effect. - Add AgentClient.mutate(decky_id, services, *, dry_run=False) using _TIMEOUT_DEPLOY (compose up can pull/build, exceeds control timeout). - mutator/engine.py: in swarm mode with decky.host_uuid set, resolve worker via _resolve_swarm_host and dispatch through AgentClient.mutate instead of writing a compose file on master. Master-resident deckies (unihost mode, or swarm with host_uuid=None) keep the local path.
This commit is contained in:
@@ -181,6 +181,7 @@ class TeardownRequest(BaseModel):
|
||||
class MutateRequest(BaseModel):
|
||||
decky_id: str
|
||||
services: list[str]
|
||||
dry_run: bool = False
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ routes
|
||||
@@ -307,14 +308,51 @@ async def topology_state() -> dict:
|
||||
|
||||
@app.post(
|
||||
"/mutate",
|
||||
responses={501: {"description": "Worker-side mutate not yet implemented"}},
|
||||
responses={
|
||||
404: {"description": "No active deployment, or unknown decky_id"},
|
||||
500: {"description": "Compose rewrite or container restart failed"},
|
||||
},
|
||||
)
|
||||
async def mutate(req: MutateRequest) -> dict:
|
||||
# TODO: implement worker-side mutate. Currently the master performs
|
||||
# mutation by re-sending a full /deploy with the updated DecnetConfig;
|
||||
# this avoids duplicating mutation logic on the worker for v1. When
|
||||
# ready, replace the 501 with a real redeploy-of-a-single-decky path.
|
||||
raise HTTPException(
|
||||
status_code=501,
|
||||
detail="Per-decky mutate is performed via /deploy with updated services",
|
||||
)
|
||||
import time
|
||||
from decnet.composer import write_compose
|
||||
from decnet.config import load_state, save_state
|
||||
from decnet.engine import _compose_with_retry
|
||||
|
||||
state = load_state()
|
||||
if state is None:
|
||||
raise HTTPException(status_code=404, detail="no active deployment on this worker")
|
||||
cfg, compose_path = state
|
||||
|
||||
decky = next((d for d in cfg.deckies if d.name == req.decky_id), None)
|
||||
if decky is None:
|
||||
raise HTTPException(
|
||||
status_code=404, detail=f"decky {req.decky_id!r} not found in worker state",
|
||||
)
|
||||
|
||||
decky.services = list(req.services)
|
||||
decky.last_mutated = time.time()
|
||||
|
||||
if req.dry_run:
|
||||
return {
|
||||
"status": "dry_run",
|
||||
"decky_id": decky.name,
|
||||
"services": list(decky.services),
|
||||
}
|
||||
|
||||
try:
|
||||
save_state(cfg, compose_path)
|
||||
write_compose(cfg, compose_path)
|
||||
await asyncio.to_thread(
|
||||
_compose_with_retry, "up", "-d", "--remove-orphans",
|
||||
compose_file=compose_path,
|
||||
)
|
||||
except Exception as exc:
|
||||
log.exception("agent.mutate failed")
|
||||
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
||||
|
||||
return {
|
||||
"status": "mutated",
|
||||
"decky_id": decky.name,
|
||||
"services": list(decky.services),
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user