feat(agent): real worker-side /mutate with master swarm dispatch
- Implement /mutate handler: load_state, update services + last_mutated, save_state, write_compose, compose up -d via asyncio.to_thread. 404 for missing state / unknown decky_id. dry_run short-circuits before any side effect. - Add AgentClient.mutate(decky_id, services, *, dry_run=False) using _TIMEOUT_DEPLOY (compose up can pull/build, exceeds control timeout). - mutator/engine.py: in swarm mode with decky.host_uuid set, resolve worker via _resolve_swarm_host and dispatch through AgentClient.mutate instead of writing a compose file on master. Master-resident deckies (unihost mode, or swarm with host_uuid=None) keep the local path.
This commit is contained in:
@@ -181,6 +181,7 @@ class TeardownRequest(BaseModel):
|
||||
class MutateRequest(BaseModel):
|
||||
decky_id: str
|
||||
services: list[str]
|
||||
dry_run: bool = False
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ routes
|
||||
@@ -307,14 +308,51 @@ async def topology_state() -> dict:
|
||||
|
||||
@app.post(
|
||||
"/mutate",
|
||||
responses={501: {"description": "Worker-side mutate not yet implemented"}},
|
||||
responses={
|
||||
404: {"description": "No active deployment, or unknown decky_id"},
|
||||
500: {"description": "Compose rewrite or container restart failed"},
|
||||
},
|
||||
)
|
||||
async def mutate(req: MutateRequest) -> dict:
|
||||
# TODO: implement worker-side mutate. Currently the master performs
|
||||
# mutation by re-sending a full /deploy with the updated DecnetConfig;
|
||||
# this avoids duplicating mutation logic on the worker for v1. When
|
||||
# ready, replace the 501 with a real redeploy-of-a-single-decky path.
|
||||
raise HTTPException(
|
||||
status_code=501,
|
||||
detail="Per-decky mutate is performed via /deploy with updated services",
|
||||
)
|
||||
import time
|
||||
from decnet.composer import write_compose
|
||||
from decnet.config import load_state, save_state
|
||||
from decnet.engine import _compose_with_retry
|
||||
|
||||
state = load_state()
|
||||
if state is None:
|
||||
raise HTTPException(status_code=404, detail="no active deployment on this worker")
|
||||
cfg, compose_path = state
|
||||
|
||||
decky = next((d for d in cfg.deckies if d.name == req.decky_id), None)
|
||||
if decky is None:
|
||||
raise HTTPException(
|
||||
status_code=404, detail=f"decky {req.decky_id!r} not found in worker state",
|
||||
)
|
||||
|
||||
decky.services = list(req.services)
|
||||
decky.last_mutated = time.time()
|
||||
|
||||
if req.dry_run:
|
||||
return {
|
||||
"status": "dry_run",
|
||||
"decky_id": decky.name,
|
||||
"services": list(decky.services),
|
||||
}
|
||||
|
||||
try:
|
||||
save_state(cfg, compose_path)
|
||||
write_compose(cfg, compose_path)
|
||||
await asyncio.to_thread(
|
||||
_compose_with_retry, "up", "-d", "--remove-orphans",
|
||||
compose_file=compose_path,
|
||||
)
|
||||
except Exception as exc:
|
||||
log.exception("agent.mutate failed")
|
||||
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
||||
|
||||
return {
|
||||
"status": "mutated",
|
||||
"decky_id": decky.name,
|
||||
"services": list(decky.services),
|
||||
}
|
||||
|
||||
@@ -93,22 +93,37 @@ async def mutate_decky(
|
||||
# Save to DB
|
||||
await repo.set_state("deployment", {"config": config.model_dump(), "compose_path": str(compose_path)})
|
||||
|
||||
# Still writes files for Docker to use
|
||||
write_compose(config, compose_path)
|
||||
|
||||
log.info("mutation applied decky=%s services=%s", decky_name, ",".join(decky.services))
|
||||
console.print(f"[cyan]Mutating '{decky_name}' to services: {', '.join(decky.services)}[/]")
|
||||
|
||||
try:
|
||||
# Wrap blocking call in thread
|
||||
cp = compose_path
|
||||
await anyio.to_thread.run_sync(
|
||||
lambda: _compose_with_retry("up", "-d", "--remove-orphans", compose_file=cp)
|
||||
)
|
||||
except Exception as e:
|
||||
log.error("mutation failed decky=%s error=%s", decky_name, e)
|
||||
console.print(f"[red]Failed to mutate '{decky_name}': {e}[/]")
|
||||
return False
|
||||
# Swarm-resident deckies are reified on a remote worker; dispatch to its
|
||||
# agent /mutate rather than scribbling a compose file on the master.
|
||||
# Master-resident deckies (host_uuid is None, or unihost mode) keep the
|
||||
# local docker path.
|
||||
if config.mode == "swarm" and decky.host_uuid:
|
||||
try:
|
||||
from decnet.engine.deployer import _resolve_swarm_host
|
||||
from decnet.swarm.client import AgentClient
|
||||
|
||||
host = await _resolve_swarm_host(repo, decky.host_uuid)
|
||||
async with AgentClient(host=host) as agent:
|
||||
await agent.mutate(decky.name, list(decky.services))
|
||||
except Exception as e:
|
||||
log.error("mutation failed decky=%s error=%s", decky_name, e)
|
||||
console.print(f"[red]Failed to mutate '{decky_name}': {e}[/]")
|
||||
return False
|
||||
else:
|
||||
# Still writes files for Docker to use
|
||||
write_compose(config, compose_path)
|
||||
try:
|
||||
cp = compose_path
|
||||
await anyio.to_thread.run_sync(
|
||||
lambda: _compose_with_retry("up", "-d", "--remove-orphans", compose_file=cp)
|
||||
)
|
||||
except Exception as e:
|
||||
log.error("mutation failed decky=%s error=%s", decky_name, e)
|
||||
console.print(f"[red]Failed to mutate '{decky_name}': {e}[/]")
|
||||
return False
|
||||
|
||||
await emit_decky_mutated(
|
||||
bus,
|
||||
|
||||
@@ -256,6 +256,29 @@ class AgentClient:
|
||||
resp.raise_for_status()
|
||||
return resp.json()
|
||||
|
||||
async def mutate(
|
||||
self,
|
||||
decky_id: str,
|
||||
services: list[str],
|
||||
*,
|
||||
dry_run: bool = False,
|
||||
) -> dict[str, Any]:
|
||||
body = {
|
||||
"decky_id": decky_id,
|
||||
"services": list(services),
|
||||
"dry_run": dry_run,
|
||||
}
|
||||
# Worker /mutate runs `compose up -d` which can pull/build; same
|
||||
# long-tail latency as /deploy. Swap the deploy timeout in.
|
||||
old = self._require_client().timeout
|
||||
self._require_client().timeout = _TIMEOUT_DEPLOY
|
||||
try:
|
||||
resp = await self._require_client().post("/mutate", json=body)
|
||||
finally:
|
||||
self._require_client().timeout = old
|
||||
resp.raise_for_status()
|
||||
return resp.json()
|
||||
|
||||
async def teardown(self, decky_id: Optional[str] = None) -> dict[str, Any]:
|
||||
resp = await self._require_client().post(
|
||||
"/teardown", json={"decky_id": decky_id}
|
||||
|
||||
Reference in New Issue
Block a user