diff --git a/decnet/web/router/fleet/api_deploy_deckies.py b/decnet/web/router/fleet/api_deploy_deckies.py
index 4e8ef2c..70fb692 100644
--- a/decnet/web/router/fleet/api_deploy_deckies.py
+++ b/decnet/web/router/fleet/api_deploy_deckies.py
@@ -123,6 +123,14 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
     ]
 
     if swarm_hosts:
+        # Carry-over from a prior deployment may reference a host_uuid that's
+        # since been decommissioned / re-enrolled at a new uuid. Drop any
+        # assignment that isn't in the currently-reachable set, then round-
+        # robin-fill the blanks — otherwise dispatch 404s on a dead uuid.
+        live_uuids = {h["uuid"] for h in swarm_hosts}
+        for d in config.deckies:
+            if d.host_uuid and d.host_uuid not in live_uuids:
+                d.host_uuid = None
         unassigned = [d for d in config.deckies if not d.host_uuid]
         for i, d in enumerate(unassigned):
             d.host_uuid = swarm_hosts[i % len(swarm_hosts)]["uuid"]
diff --git a/tests/api/fleet/test_deploy_automode.py b/tests/api/fleet/test_deploy_automode.py
index 06a2765..6b4d668 100644
--- a/tests/api/fleet/test_deploy_automode.py
+++ b/tests/api/fleet/test_deploy_automode.py
@@ -88,6 +88,80 @@ async def test_deploy_automode_shards_when_swarm_host_enrolled(client, auth_toke
     await repo.delete_swarm_host("host-A")
 
 
+@pytest.mark.anyio
+async def test_deploy_automode_resets_stale_host_uuid(client, auth_token, monkeypatch):
+    """Deckies carried over from prior state must not be dispatched to a host
+    uuid that no longer exists — reset + round-robin against live hosts.
+
+    Seeds one live swarm host and a saved deployment whose only decky points
+    at "ghost-uuid", deploys a new decky with the dispatcher mocked out, and
+    checks that every dispatched decky targets the live host.
+    """
+    monkeypatch.setenv("DECNET_MODE", "master")
+    for row in await repo.list_swarm_hosts():
+        await repo.delete_swarm_host(row["uuid"])
+
+    from datetime import datetime, timezone
+    await repo.add_swarm_host({
+        "uuid": "host-LIVE",
+        "name": "live",
+        "address": "10.0.0.60",
+        "agent_port": 8765,
+        "status": "active",
+        "client_cert_fingerprint": "a" * 64,
+        "updater_cert_fingerprint": None,
+        "cert_bundle_path": "/tmp/live",
+        "enrolled_at": datetime.now(timezone.utc),
+        "notes": "",
+    })
+
+    # Everything below runs under try/finally so a failed assertion cannot
+    # leak host-LIVE or the seeded deployment state into later tests.
+    try:
+        # Prior state: decky-old is assigned to a now-decommissioned host.
+        await repo.set_state("deployment", {
+            "config": {
+                "mode": "swarm",
+                "interface": "eth0",
+                "subnet": "192.168.1.0/24",
+                "gateway": "192.168.1.1",
+                "deckies": [{
+                    "name": "decky-old",
+                    "ip": "192.168.1.50",
+                    "services": ["ssh"],
+                    "distro": "debian",
+                    "base_image": "debian:bookworm-slim",
+                    "hostname": "decky-old",
+                    "host_uuid": "ghost-uuid",
+                }],
+            },
+            "compose_path": "",
+        })
+
+        fake_response = SwarmDeployResponse(results=[
+            SwarmHostResult(host_uuid="host-LIVE", host_name="live", ok=True, detail={})
+        ])
+
+        with patch(
+            "decnet.web.router.fleet.api_deploy_deckies.dispatch_decnet_config",
+            new=AsyncMock(return_value=fake_response),
+        ) as mock_dispatch:
+            ini = "[decky-new]\nservices = ssh\n"
+            resp = await client.post(
+                "/api/v1/deckies/deploy",
+                json={"ini_content": ini},
+                headers={"Authorization": f"Bearer {auth_token}"},
+            )
+
+        assert resp.status_code == 200, resp.text
+        dispatched = mock_dispatch.await_args.args[0]
+        # Both the carried-over decky and the new one must point at the live host.
+        assert {d.host_uuid for d in dispatched.deckies} == {"host-LIVE"}
+    finally:
+        await repo.delete_swarm_host("host-LIVE")
+        await repo.set_state("deployment", None)
+
+
 @pytest.mark.anyio
 async def test_deployment_mode_endpoint(client, auth_token, monkeypatch):
     monkeypatch.setenv("DECNET_MODE", "master")