fix(fleet): reset stale host_uuid on carried-over deckies before dispatch
Deckies merged in from a prior deployment's saved state kept their original host_uuid, which caused dispatch_decnet_config to fail with a 404 whenever that host had since been decommissioned or re-enrolled at a different uuid. Before round-robin assignment, drop any host_uuid that isn't in the live swarm_hosts set so orphaned entries get reassigned instead of exploding with 'unknown host_uuid'.
This commit is contained in:
@@ -123,6 +123,14 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
|
||||
]
|
||||
|
||||
if swarm_hosts:
|
||||
# Carry-over from a prior deployment may reference a host_uuid that's
|
||||
# since been decommissioned / re-enrolled at a new uuid. Drop any
|
||||
# assignment that isn't in the currently-reachable set, then round-
|
||||
# robin-fill the blanks — otherwise dispatch 404s on a dead uuid.
|
||||
live_uuids = {h["uuid"] for h in swarm_hosts}
|
||||
for d in config.deckies:
|
||||
if d.host_uuid and d.host_uuid not in live_uuids:
|
||||
d.host_uuid = None
|
||||
unassigned = [d for d in config.deckies if not d.host_uuid]
|
||||
for i, d in enumerate(unassigned):
|
||||
d.host_uuid = swarm_hosts[i % len(swarm_hosts)]["uuid"]
|
||||
|
||||
@@ -88,6 +88,72 @@ async def test_deploy_automode_shards_when_swarm_host_enrolled(client, auth_toke
|
||||
await repo.delete_swarm_host("host-A")
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_deploy_automode_resets_stale_host_uuid(client, auth_token, monkeypatch):
    """Deckies carried over from prior state must not be dispatched to a host
    uuid that no longer exists — reset + round-robin against live hosts.

    Scenario: exactly one swarm host ("host-LIVE") is enrolled, while the saved
    deployment state contains a decky pinned to "ghost-uuid". After deploying
    an INI that adds one new decky, every decky handed to the (mocked)
    dispatcher must carry host_uuid == "host-LIVE".
    """
    monkeypatch.setenv("DECNET_MODE", "master")

    # Start from a clean slate so "host-LIVE" is the only enrolled host.
    for row in await repo.list_swarm_hosts():
        await repo.delete_swarm_host(row["uuid"])

    from datetime import datetime, timezone

    await repo.add_swarm_host({
        "uuid": "host-LIVE",
        "name": "live",
        "address": "10.0.0.60",
        "agent_port": 8765,
        "status": "active",
        "client_cert_fingerprint": "a" * 64,
        "updater_cert_fingerprint": None,
        "cert_bundle_path": "/tmp/live",
        "enrolled_at": datetime.now(timezone.utc),
        "notes": "",
    })

    # Everything after seeding runs under try/finally so that a failing
    # assertion cannot leak the host row or deployment state into later tests.
    try:
        # Prior state: decky-old is assigned to a now-decommissioned host.
        await repo.set_state("deployment", {
            "config": {
                "mode": "swarm",
                "interface": "eth0",
                "subnet": "192.168.1.0/24",
                "gateway": "192.168.1.1",
                "deckies": [{
                    "name": "decky-old",
                    "ip": "192.168.1.50",
                    "services": ["ssh"],
                    "distro": "debian",
                    "base_image": "debian:bookworm-slim",
                    "hostname": "decky-old",
                    "host_uuid": "ghost-uuid",
                }],
            },
            "compose_path": "",
        })

        fake_response = SwarmDeployResponse(results=[
            SwarmHostResult(host_uuid="host-LIVE", host_name="live", ok=True, detail={})
        ])

        with patch(
            "decnet.web.router.fleet.api_deploy_deckies.dispatch_decnet_config",
            new=AsyncMock(return_value=fake_response),
        ) as mock_dispatch:
            ini = "[decky-new]\nservices = ssh\n"
            resp = await client.post(
                "/api/v1/deckies/deploy",
                json={"ini_content": ini},
                headers={"Authorization": f"Bearer {auth_token}"},
            )

        assert resp.status_code == 200, resp.text
        # Fail with a clear message (instead of AttributeError on None) if the
        # endpoint never reached the dispatcher.
        mock_dispatch.assert_awaited()
        dispatched = mock_dispatch.await_args.args[0]
        # Both the carried-over decky and the new one must point at the live host.
        assert {d.host_uuid for d in dispatched.deckies} == {"host-LIVE"}
    finally:
        await repo.delete_swarm_host("host-LIVE")
        await repo.set_state("deployment", None)
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_deployment_mode_endpoint(client, auth_token, monkeypatch):
|
||||
monkeypatch.setenv("DECNET_MODE", "master")
|
||||
|
||||
Reference in New Issue
Block a user