fix(fleet): reset stale host_uuid on carried-over deckies before dispatch

Deckies merged in from a prior deployment's saved state kept their
original host_uuid. If that host had since been decommissioned or
re-enrolled under a different uuid, dispatch_decnet_config returned a
404 for it. Before round-robin assignment, clear any host_uuid that is
not in the live swarm_hosts set, so orphaned deckies are reassigned to
a live host instead of failing with 'unknown host_uuid'.
This commit is contained in:
2026-04-19 06:27:34 -04:00
parent dbaccde143
commit 6d7567b6bb
2 changed files with 74 additions and 0 deletions

View File

@@ -123,6 +123,14 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
]
if swarm_hosts:
# Carry-over from a prior deployment may reference a host_uuid that's
# since been decommissioned / re-enrolled at a new uuid. Drop any
# assignment that isn't in the currently-reachable set, then round-
# robin-fill the blanks — otherwise dispatch 404s on a dead uuid.
live_uuids = {h["uuid"] for h in swarm_hosts}
for d in config.deckies:
if d.host_uuid and d.host_uuid not in live_uuids:
d.host_uuid = None
unassigned = [d for d in config.deckies if not d.host_uuid]
for i, d in enumerate(unassigned):
d.host_uuid = swarm_hosts[i % len(swarm_hosts)]["uuid"]

View File

@@ -88,6 +88,72 @@ async def test_deploy_automode_shards_when_swarm_host_enrolled(client, auth_toke
await repo.delete_swarm_host("host-A")
@pytest.mark.anyio
async def test_deploy_automode_resets_stale_host_uuid(client, auth_token, monkeypatch):
    """Deckies carried over from prior state must not be dispatched to a host
    uuid that no longer exists — reset + round-robin against live hosts.

    Setup: exactly one enrolled swarm host ("host-LIVE") and a saved
    deployment whose only decky points at "ghost-uuid", which is not
    enrolled. The deploy endpoint is then called with an INI that adds
    "decky-new", and the config handed to the (mocked) dispatcher is checked
    to reference only the live host.
    """
    from datetime import datetime, timezone

    monkeypatch.setenv("DECNET_MODE", "master")

    # Start from an empty swarm so "host-LIVE" is the only candidate host.
    for row in await repo.list_swarm_hosts():
        await repo.delete_swarm_host(row["uuid"])

    await repo.add_swarm_host({
        "uuid": "host-LIVE",
        "name": "live",
        "address": "10.0.0.60",
        "agent_port": 8765,
        "status": "active",
        "client_cert_fingerprint": "a" * 64,
        "updater_cert_fingerprint": None,
        "cert_bundle_path": "/tmp/live",
        "enrolled_at": datetime.now(timezone.utc),
        "notes": "",
    })

    # Everything past this point runs under try/finally so a failing
    # assertion cannot leave the fake host or the stale deployment state
    # behind for the tests that follow.
    try:
        # Prior state: decky-old is assigned to a now-decommissioned host.
        await repo.set_state("deployment", {
            "config": {
                "mode": "swarm",
                "interface": "eth0",
                "subnet": "192.168.1.0/24",
                "gateway": "192.168.1.1",
                "deckies": [{
                    "name": "decky-old",
                    "ip": "192.168.1.50",
                    "services": ["ssh"],
                    "distro": "debian",
                    "base_image": "debian:bookworm-slim",
                    "hostname": "decky-old",
                    "host_uuid": "ghost-uuid",
                }],
            },
            "compose_path": "",
        })

        fake_response = SwarmDeployResponse(results=[
            SwarmHostResult(host_uuid="host-LIVE", host_name="live", ok=True, detail={})
        ])
        with patch(
            "decnet.web.router.fleet.api_deploy_deckies.dispatch_decnet_config",
            new=AsyncMock(return_value=fake_response),
        ) as mock_dispatch:
            ini = "[decky-new]\nservices = ssh\n"
            resp = await client.post(
                "/api/v1/deckies/deploy",
                json={"ini_content": ini},
                headers={"Authorization": f"Bearer {auth_token}"},
            )

        assert resp.status_code == 200, resp.text

        # Fail with a readable message (instead of AttributeError on None)
        # if the endpoint never reached the dispatcher.
        mock_dispatch.assert_awaited()
        dispatched = mock_dispatch.await_args.args[0]

        # Both the carried-over decky and the new one must point at the live host.
        assert {d.host_uuid for d in dispatched.deckies} == {"host-LIVE"}
    finally:
        await repo.delete_swarm_host("host-LIVE")
        await repo.set_state("deployment", None)
@pytest.mark.anyio
async def test_deployment_mode_endpoint(client, auth_token, monkeypatch):
monkeypatch.setenv("DECNET_MODE", "master")