feat(agent): real worker-side /mutate with master swarm dispatch

- Implement /mutate handler: load_state, update services + last_mutated,
  save_state, write_compose, compose up -d via asyncio.to_thread. 404
  for missing state / unknown decky_id. dry_run short-circuits before
  any side effect.
- Add AgentClient.mutate(decky_id, services, *, dry_run=False) using
  _TIMEOUT_DEPLOY (compose up can pull/build, exceeds control timeout).
- mutator/engine.py: in swarm mode with decky.host_uuid set, resolve
  worker via _resolve_swarm_host and dispatch through AgentClient.mutate
  instead of writing a compose file on master. Master-resident deckies
  (unihost mode, or swarm with host_uuid=None) keep the local path.
This commit is contained in:
2026-05-22 16:14:46 -04:00
parent 418245f9b4
commit ade8bbe30a
6 changed files with 434 additions and 25 deletions

View File

@@ -181,6 +181,7 @@ class TeardownRequest(BaseModel):
class MutateRequest(BaseModel):
decky_id: str
services: list[str]
dry_run: bool = False
# ------------------------------------------------------------------ routes
@@ -307,14 +308,51 @@ async def topology_state() -> dict:
@app.post(
"/mutate",
responses={501: {"description": "Worker-side mutate not yet implemented"}},
responses={
404: {"description": "No active deployment, or unknown decky_id"},
500: {"description": "Compose rewrite or container restart failed"},
},
)
async def mutate(req: MutateRequest) -> dict:
# TODO: implement worker-side mutate. Currently the master performs
# mutation by re-sending a full /deploy with the updated DecnetConfig;
# this avoids duplicating mutation logic on the worker for v1. When
# ready, replace the 501 with a real redeploy-of-a-single-decky path.
raise HTTPException(
status_code=501,
detail="Per-decky mutate is performed via /deploy with updated services",
)
import time
from decnet.composer import write_compose
from decnet.config import load_state, save_state
from decnet.engine import _compose_with_retry
state = load_state()
if state is None:
raise HTTPException(status_code=404, detail="no active deployment on this worker")
cfg, compose_path = state
decky = next((d for d in cfg.deckies if d.name == req.decky_id), None)
if decky is None:
raise HTTPException(
status_code=404, detail=f"decky {req.decky_id!r} not found in worker state",
)
decky.services = list(req.services)
decky.last_mutated = time.time()
if req.dry_run:
return {
"status": "dry_run",
"decky_id": decky.name,
"services": list(decky.services),
}
try:
save_state(cfg, compose_path)
write_compose(cfg, compose_path)
await asyncio.to_thread(
_compose_with_retry, "up", "-d", "--remove-orphans",
compose_file=compose_path,
)
except Exception as exc:
log.exception("agent.mutate failed")
raise HTTPException(status_code=500, detail=str(exc)) from exc
return {
"status": "mutated",
"decky_id": decky.name,
"services": list(decky.services),
}

View File

@@ -93,22 +93,37 @@ async def mutate_decky(
# Save to DB
await repo.set_state("deployment", {"config": config.model_dump(), "compose_path": str(compose_path)})
# Still writes files for Docker to use
write_compose(config, compose_path)
log.info("mutation applied decky=%s services=%s", decky_name, ",".join(decky.services))
console.print(f"[cyan]Mutating '{decky_name}' to services: {', '.join(decky.services)}[/]")
try:
# Wrap blocking call in thread
cp = compose_path
await anyio.to_thread.run_sync(
lambda: _compose_with_retry("up", "-d", "--remove-orphans", compose_file=cp)
)
except Exception as e:
log.error("mutation failed decky=%s error=%s", decky_name, e)
console.print(f"[red]Failed to mutate '{decky_name}': {e}[/]")
return False
# Swarm-resident deckies are reified on a remote worker; dispatch to its
# agent /mutate rather than scribbling a compose file on the master.
# Master-resident deckies (host_uuid is None, or unihost mode) keep the
# local docker path.
if config.mode == "swarm" and decky.host_uuid:
try:
from decnet.engine.deployer import _resolve_swarm_host
from decnet.swarm.client import AgentClient
host = await _resolve_swarm_host(repo, decky.host_uuid)
async with AgentClient(host=host) as agent:
await agent.mutate(decky.name, list(decky.services))
except Exception as e:
log.error("mutation failed decky=%s error=%s", decky_name, e)
console.print(f"[red]Failed to mutate '{decky_name}': {e}[/]")
return False
else:
# Still writes files for Docker to use
write_compose(config, compose_path)
try:
cp = compose_path
await anyio.to_thread.run_sync(
lambda: _compose_with_retry("up", "-d", "--remove-orphans", compose_file=cp)
)
except Exception as e:
log.error("mutation failed decky=%s error=%s", decky_name, e)
console.print(f"[red]Failed to mutate '{decky_name}': {e}[/]")
return False
await emit_decky_mutated(
bus,

View File

@@ -256,6 +256,29 @@ class AgentClient:
resp.raise_for_status()
return resp.json()
async def mutate(
self,
decky_id: str,
services: list[str],
*,
dry_run: bool = False,
) -> dict[str, Any]:
body = {
"decky_id": decky_id,
"services": list(services),
"dry_run": dry_run,
}
# Worker /mutate runs `compose up -d` which can pull/build; same
# long-tail latency as /deploy. Swap the deploy timeout in.
old = self._require_client().timeout
self._require_client().timeout = _TIMEOUT_DEPLOY
try:
resp = await self._require_client().post("/mutate", json=body)
finally:
self._require_client().timeout = old
resp.raise_for_status()
return resp.json()
async def teardown(self, decky_id: Optional[str] = None) -> dict[str, Any]:
resp = await self._require_client().post(
"/teardown", json={"decky_id": decky_id}