feat(agent): /deploy and /mutate become 202 fire-and-forget
The wizard API used to hang because /deckies/deploy ran docker compose
build && up -d synchronously, holding the request thread for minutes.
The worker side of that pipeline now returns 202 Accepted immediately
and runs the deploy as a background task started with asyncio.create_task.
On task completion (success or failure) the worker pushes a one-off
heartbeat carrying a lifecycle delta per decky:
{decky_name, operation, status: succeeded|failed, error?, completed_at}
Master pivots these onto open DeckyLifecycle rows in the heartbeat
handler (next commit). The scheduled 30s heartbeat tick is the
fallback if the immediate push is dropped.
- decnet/agent/app.py: /deploy and /mutate return 202; dry_run mutate
still validates synchronously and returns 200.
- decnet/agent/executor.py: deploy_async + mutate_async wrap the work
and push the completion delta.
- decnet/agent/heartbeat.py: push_lifecycle_delta() helper builds a
one-off body and POSTs with the same mTLS context as the loop.
- decnet/swarm/client.py: revert deploy/mutate to the control timeout
(master no longer holds the HTTP request open for compose work).
Worker state.json gains no lifecycle field -- master DeckyLifecycle is
the source of truth; the master sweep handles crashed-mid-deploy
recovery.
This commit is contained in:
@@ -65,80 +65,67 @@ def _seed_state(monkeypatch, tmp_path):
|
||||
return cell
|
||||
|
||||
|
||||
def test_mutate_success(monkeypatch, tmp_path) -> None:
|
||||
cell = _seed_state(monkeypatch, tmp_path)
|
||||
compose_calls: list[tuple] = []
|
||||
write_compose_calls: list[tuple] = []
|
||||
def test_mutate_returns_202_and_spawns_task(monkeypatch, tmp_path) -> None:
|
||||
_seed_state(monkeypatch, tmp_path)
|
||||
spawned: list = []
|
||||
real_create_task = __import__("asyncio").create_task
|
||||
|
||||
monkeypatch.setattr(
|
||||
"decnet.composer.write_compose",
|
||||
lambda c, p: write_compose_calls.append((c, p)) or p,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"decnet.engine._compose_with_retry",
|
||||
lambda *a, **kw: compose_calls.append((a, kw)),
|
||||
)
|
||||
def _capture_create_task(coro, **kw):
|
||||
spawned.append(kw.get("name", ""))
|
||||
# Close the coro without running it so it doesn't leak as a
|
||||
# never-awaited warning; a no-op task is returned in its place.
|
||||
coro.close()
|
||||
# Return something task-like for the handler.
|
||||
async def _noop():
|
||||
return None
|
||||
return real_create_task(_noop())
|
||||
|
||||
monkeypatch.setattr("decnet.agent.app.asyncio.create_task", _capture_create_task)
|
||||
|
||||
client = TestClient(app)
|
||||
resp = client.post(
|
||||
"/mutate",
|
||||
json={"decky_id": "decky-01", "services": ["http", "ftp"]},
|
||||
)
|
||||
assert resp.status_code == 200, resp.text
|
||||
assert resp.status_code == 202, resp.text
|
||||
body = resp.json()
|
||||
assert body == {"status": "mutated", "decky_id": "decky-01", "services": ["http", "ftp"]}
|
||||
assert cell["cfg"].deckies[0].services == ["http", "ftp"]
|
||||
assert cell["cfg"].deckies[0].last_mutated > 0
|
||||
assert len(write_compose_calls) == 1
|
||||
assert len(compose_calls) == 1
|
||||
assert compose_calls[0][0] == ("up", "-d", "--remove-orphans")
|
||||
assert body == {
|
||||
"status": "accepted",
|
||||
"decky_id": "decky-01",
|
||||
"services": ["http", "ftp"],
|
||||
}
|
||||
assert spawned and spawned[0].startswith("mutate-")
|
||||
|
||||
|
||||
def test_mutate_unknown_decky_returns_404(monkeypatch, tmp_path) -> None:
|
||||
_seed_state(monkeypatch, tmp_path)
|
||||
compose_calls: list = []
|
||||
monkeypatch.setattr(
|
||||
"decnet.engine._compose_with_retry",
|
||||
lambda *a, **kw: compose_calls.append((a, kw)),
|
||||
)
|
||||
|
||||
client = TestClient(app)
|
||||
resp = client.post(
|
||||
"/mutate", json={"decky_id": "ghost", "services": ["ssh"]},
|
||||
)
|
||||
assert resp.status_code == 404
|
||||
assert compose_calls == []
|
||||
|
||||
|
||||
def test_mutate_no_state_returns_404(monkeypatch) -> None:
|
||||
def test_mutate_dry_run_404_when_no_state(monkeypatch) -> None:
|
||||
monkeypatch.setattr("decnet.config.load_state", lambda: None)
|
||||
client = TestClient(app)
|
||||
resp = client.post(
|
||||
"/mutate", json={"decky_id": "decky-01", "services": ["ssh"]},
|
||||
"/mutate",
|
||||
json={"decky_id": "decky-01", "services": ["ssh"], "dry_run": True},
|
||||
)
|
||||
assert resp.status_code == 404
|
||||
|
||||
|
||||
def test_mutate_dry_run_does_not_touch_docker_or_state(monkeypatch, tmp_path) -> None:
|
||||
cell = _seed_state(monkeypatch, tmp_path)
|
||||
saved: list = []
|
||||
written: list = []
|
||||
composed: list = []
|
||||
def test_mutate_dry_run_404_for_unknown_decky(monkeypatch, tmp_path) -> None:
|
||||
_seed_state(monkeypatch, tmp_path)
|
||||
client = TestClient(app)
|
||||
resp = client.post(
|
||||
"/mutate",
|
||||
json={"decky_id": "ghost", "services": ["ssh"], "dry_run": True},
|
||||
)
|
||||
assert resp.status_code == 404
|
||||
|
||||
monkeypatch.setattr(
|
||||
"decnet.config.save_state",
|
||||
lambda c, p: saved.append((c, p)),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"decnet.composer.write_compose",
|
||||
lambda c, p: written.append((c, p)),
|
||||
)
|
||||
|
||||
def test_mutate_dry_run_returns_services_without_touching_docker(
|
||||
monkeypatch, tmp_path,
|
||||
) -> None:
|
||||
_seed_state(monkeypatch, tmp_path)
|
||||
composed: list = []
|
||||
monkeypatch.setattr(
|
||||
"decnet.engine._compose_with_retry",
|
||||
lambda *a, **kw: composed.append((a, kw)),
|
||||
)
|
||||
|
||||
original_services = list(cell["cfg"].deckies[0].services)
|
||||
client = TestClient(app)
|
||||
resp = client.post(
|
||||
"/mutate",
|
||||
@@ -146,14 +133,39 @@ def test_mutate_dry_run_does_not_touch_docker_or_state(monkeypatch, tmp_path) ->
|
||||
)
|
||||
assert resp.status_code == 200
|
||||
assert resp.json()["status"] == "dry_run"
|
||||
# No persistence, no compose render, no docker.
|
||||
assert saved == []
|
||||
assert written == []
|
||||
assert composed == []
|
||||
# State on the in-memory cell was touched (handler mutated the loaded
|
||||
# DeckyConfig) but never persisted — load_state is shared by reference,
|
||||
# so we only assert that no save/render happened above.
|
||||
del original_services
|
||||
|
||||
|
||||
def test_deploy_returns_202_and_spawns_task(monkeypatch) -> None:
|
||||
from decnet.config import DecnetConfig, DeckyConfig
|
||||
cfg = DecnetConfig(
|
||||
mode="unihost", interface="eth0",
|
||||
subnet="10.66.0.0/24", gateway="10.66.0.1",
|
||||
deckies=[DeckyConfig(
|
||||
name="decky-01", ip="10.66.0.10",
|
||||
services=["ssh"], distro="debian",
|
||||
base_image="debian:bookworm-slim", hostname="d01",
|
||||
)],
|
||||
)
|
||||
spawned: list = []
|
||||
real_create_task = __import__("asyncio").create_task
|
||||
|
||||
def _capture_create_task(coro, **kw):
|
||||
spawned.append(kw.get("name", ""))
|
||||
coro.close()
|
||||
async def _noop():
|
||||
return None
|
||||
return real_create_task(_noop())
|
||||
|
||||
monkeypatch.setattr("decnet.agent.app.asyncio.create_task", _capture_create_task)
|
||||
|
||||
client = TestClient(app)
|
||||
resp = client.post("/deploy", json={"config": cfg.model_dump(mode="json")})
|
||||
assert resp.status_code == 202, resp.text
|
||||
body = resp.json()
|
||||
assert body["status"] == "accepted"
|
||||
assert body["deckies"] == ["decky-01"]
|
||||
assert spawned and spawned[0].startswith("deploy-")
|
||||
|
||||
|
||||
def test_deploy_rejects_malformed_body() -> None:
|
||||
|
||||
@@ -142,8 +142,11 @@ async def test_client_mutate_unknown_decky_404(
|
||||
async with swarm_client.AgentClient(
|
||||
address="127.0.0.1", agent_port=port, identity=master_id,
|
||||
) as agent:
|
||||
# Only dry_run can surface 404 synchronously; the live path is
|
||||
# 202 fire-and-forget and would surface failure via the
|
||||
# heartbeat lifecycle delta.
|
||||
with pytest.raises(httpx.HTTPStatusError) as ei:
|
||||
await agent.mutate("ghost", ["ssh"])
|
||||
await agent.mutate("ghost", ["ssh"], dry_run=True)
|
||||
assert ei.value.response.status_code == 404
|
||||
finally:
|
||||
server.should_exit = True
|
||||
|
||||
Reference in New Issue
Block a user