From 07ec4bc26970b1291d7015991985e927f1e5b470 Mon Sep 17 00:00:00 2001 From: anti Date: Sun, 19 Apr 2026 20:24:29 -0400 Subject: [PATCH] fix(fleet): INI fully replaces prior decky state on redeploy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Submitting an INI with a single [decky1] was silently redeploying the deckies from the *previous* deploy too. POST /deckies/deploy merged the new INI into the stored DecnetConfig by name, so a 1-decky INI on top of a prior 3-decky run still pushed 3 deckies to the worker. Those stale decky2/decky3 kept their old IPs and collided on the parent NIC, so the agent failed with 'Address already in use' on a deploy the operator never asked for. The INI is the source of truth for which deckies exist this deploy. Full replace: config.deckies = list(new_decky_configs). Operators who want to add more deckies should list them all in the INI. Update the deploy-limit test to reflect the new replace semantics, and add a regression test asserting prior state is dropped. --- decnet/web/router/fleet/api_deploy_deckies.py | 17 +++++----- tests/api/config/test_deploy_limit.py | 33 +++++++++++++++++-- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/decnet/web/router/fleet/api_deploy_deckies.py b/decnet/web/router/fleet/api_deploy_deckies.py index 70fb692..7c75ca8 100644 --- a/decnet/web/router/fleet/api_deploy_deckies.py +++ b/decnet/web/router/fleet/api_deploy_deckies.py @@ -93,23 +93,22 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir mutate_interval=ini.mutate_interval or DEFAULT_MUTATE_INTERVAL, ) - # Merge deckies - existing_deckies_map = {d.name: d for d in config.deckies} - for new_decky in new_decky_configs: - existing_deckies_map[new_decky.name] = new_decky + # The INI is the source of truth for *which* deckies exist this deploy. 
+ # The old "merge with prior state" behaviour meant submitting `[decky1]` + # after a 3-decky run silently redeployed decky2/decky3 too — and then + # collided on their stale IPs ("Address already in use"). Full replace + # matches what the operator sees in the submitted config. + config.deckies = list(new_decky_configs) - # Enforce deployment limit limits_state = await repo.get_state("config_limits") deployment_limit = limits_state.get("deployment_limit", 10) if limits_state else 10 - if len(existing_deckies_map) > deployment_limit: + if len(config.deckies) > deployment_limit: raise HTTPException( status_code=409, - detail=f"Deployment would result in {len(existing_deckies_map)} deckies, " + detail=f"Deployment would result in {len(config.deckies)} deckies, " f"exceeding the configured limit of {deployment_limit}", ) - config.deckies = list(existing_deckies_map.values()) - # Auto-mode: if we're a master with at least one enrolled/active SWARM # host, shard the deckies across those workers instead of spawning docker # containers on the master itself. Round-robin assignment over deckies diff --git a/tests/api/config/test_deploy_limit.py b/tests/api/config/test_deploy_limit.py index 82e5f0a..26481b9 100644 --- a/tests/api/config/test_deploy_limit.py +++ b/tests/api/config/test_deploy_limit.py @@ -19,11 +19,16 @@ def mock_network(): @pytest.mark.anyio async def test_deploy_respects_limit(client, auth_token, mock_state_file): - """Deploy should reject if total deckies would exceed limit.""" + """Deploy should reject if the *submitted* INI exceeds the limit. 
+ The INI is the source of truth — prior state is fully replaced — so the + check runs on the new decky count alone.""" await repo.set_state("config_limits", {"deployment_limit": 1}) await repo.set_state("deployment", mock_state_file) - ini = """[decky-new] + ini = """[decky-a] +services = ssh + +[decky-b] services = ssh """ resp = await client.post( @@ -31,11 +36,33 @@ services = ssh json={"ini_content": ini}, headers={"Authorization": f"Bearer {auth_token}"}, ) - # 2 existing + 1 new = 3 > limit of 1 + # 2 new deckies > limit of 1 assert resp.status_code == 409 assert "limit" in resp.json()["detail"].lower() +@pytest.mark.anyio +async def test_deploy_replaces_prior_state(client, auth_token, mock_state_file): + """Submitting an INI with 1 decky must not silently re-include the 2 + deckies from prior state (that caused the 'Address already in use' + regression when stale decky2/decky3 redeployed on stale IPs).""" + await repo.set_state("config_limits", {"deployment_limit": 10}) + await repo.set_state("deployment", mock_state_file) + + ini = """[only-decky] +services = ssh +""" + resp = await client.post( + "/api/v1/deckies/deploy", + json={"ini_content": ini}, + headers={"Authorization": f"Bearer {auth_token}"}, + ) + assert resp.status_code == 200 + persisted = await repo.get_state("deployment") + names = [d["name"] for d in persisted["config"]["deckies"]] + assert names == ["only-decky"] + + @pytest.mark.anyio async def test_deploy_within_limit(client, auth_token, mock_state_file): """Deploy should succeed when within limit."""