From 4db9c7464c8d161c686bbd686f888586a31276ff Mon Sep 17 00:00:00 2001 From: anti Date: Sat, 18 Apr 2026 20:41:21 -0400 Subject: [PATCH] fix(swarm): relocalize master-built config on worker before deploy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit deploy --mode swarm was failing on every heterogeneous fleet: the master populates config.interface from its own box (detect_interface() → its default NIC), then ships that verbatim. The worker's deployer then calls get_host_ip(config.interface), hits 'ip addr show wlp6s0' on a VM whose NIC is enp0s3, and 500s. Fix: agent.executor._relocalize() runs on every swarm-mode deploy. Re-detects the worker's interface/subnet/gateway/host_ip locally and swaps interface/subnet/gateway into the config before calling deployer.deploy(); the local host_ip is only passed to allocate_ips(). When the worker's subnet doesn't match the master's, decky IPs are re-allocated from the worker's subnet via allocate_ips() so they're reachable. Unihost-mode configs are left untouched — they're already built against the local box and second-guessing them would be wrong. Validated against anti@192.168.1.13: master dispatched interface=wlp6s0, agent logged 'relocalized interface=enp0s3', deployer ran successfully, dry-run returned ok=deployed. 4 new tests cover both branches (matching-subnet preserves decky IPs; mismatch re-allocates), the end-to-end executor.deploy() path, and the unihost short-circuit. --- decnet/agent/executor.py | 54 +++++++++++- tests/swarm/test_agent_relocalize.py | 118 +++++++++++++++++++++++++++ 2 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 tests/swarm/test_agent_relocalize.py diff --git a/decnet/agent/executor.py b/decnet/agent/executor.py index 9e4ba5f..3c1030f 100644 --- a/decnet/agent/executor.py +++ b/decnet/agent/executor.py @@ -9,22 +9,74 @@ blocking) so the FastAPI event loop stays responsive. 
from __future__ import annotations import asyncio +from ipaddress import IPv4Network from typing import Any from decnet.engine import deployer as _deployer from decnet.config import DecnetConfig, load_state, clear_state from decnet.logging import get_logger +from decnet.network import ( + allocate_ips, + detect_interface, + detect_subnet, + get_host_ip, +) log = get_logger("agent.executor") +def _relocalize(config: DecnetConfig) -> DecnetConfig: + """Rewrite a master-built config to the worker's local network reality. + + The master populates ``interface``/``subnet``/``gateway`` from its own + box before dispatching, which blows up the deployer on any worker whose + NIC name differs (common in heterogeneous fleets — master on ``wlp6s0``, + worker on ``enp0s3``). We always re-detect locally; if the worker sits + on a different subnet than the master, decky IPs are re-allocated from + the worker's subnet so they're actually reachable. + """ + local_iface = detect_interface() + local_subnet, local_gateway = detect_subnet(local_iface) + local_host_ip = get_host_ip(local_iface) + + updates: dict[str, Any] = { + "interface": local_iface, + "subnet": local_subnet, + "gateway": local_gateway, + } + + master_net = IPv4Network(config.subnet, strict=False) if config.subnet else None + local_net = IPv4Network(local_subnet, strict=False) + if master_net is None or master_net != local_net: + log.info( + "agent.deploy subnet mismatch master=%s local=%s — re-allocating decky IPs", + config.subnet, local_subnet, + ) + fresh_ips = allocate_ips( + subnet=local_subnet, + gateway=local_gateway, + host_ip=local_host_ip, + count=len(config.deckies), + ) + new_deckies = [d.model_copy(update={"ip": ip}) for d, ip in zip(config.deckies, fresh_ips)] + updates["deckies"] = new_deckies + + return config.model_copy(update=updates) + + async def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False) -> None: """Run the blocking deployer off-loop. 
The deployer itself calls save_state() internally once the compose file is materialised.""" log.info( - "agent.deploy mode=%s deckies=%d interface=%s", + "agent.deploy mode=%s deckies=%d interface=%s (incoming)", config.mode, len(config.deckies), config.interface, ) + if config.mode == "swarm": + config = _relocalize(config) + log.info( + "agent.deploy relocalized interface=%s subnet=%s gateway=%s", + config.interface, config.subnet, config.gateway, + ) await asyncio.to_thread(_deployer.deploy, config, dry_run, no_cache, False) diff --git a/tests/swarm/test_agent_relocalize.py b/tests/swarm/test_agent_relocalize.py new file mode 100644 index 0000000..991b545 --- /dev/null +++ b/tests/swarm/test_agent_relocalize.py @@ -0,0 +1,118 @@ +"""Worker agent re-localizes master-built configs to its own NIC/subnet. + +The master ships a DecnetConfig populated from *its own* network (master +NIC name, master subnet, master-chosen decky IPs). The worker cannot run +the deployer against that as-is: `ip addr show <master-iface>` blows up on +any worker whose NIC differs from the master's, which is ~always the +case in a heterogeneous fleet. + +The agent's executor overrides interface/subnet/gateway with +locally-detected values (host_ip is detected too, for IP allocation) before calling into the deployer, and if the +subnet doesn't match, it re-allocates decky IPs from the local subnet. 
+""" +from __future__ import annotations + +import pytest + +from decnet.agent import executor +from decnet.models import DecnetConfig, DeckyConfig + + +def _cfg(subnet: str, interface: str = "wlp6s0") -> DecnetConfig: + return DecnetConfig( + mode="swarm", + interface=interface, + subnet=subnet, + gateway=subnet.rsplit(".", 1)[0] + ".1", + deckies=[ + DeckyConfig( + name=f"decky-0{i}", + ip=subnet.rsplit(".", 1)[0] + f".{10 + i}", + services=["ssh"], + distro="debian", + base_image="debian:bookworm-slim", + hostname=f"decky-0{i}", + ) + for i in range(1, 3) + ], + ) + + +def test_relocalize_swaps_interface_and_subnet(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(executor, "detect_interface", lambda: "enp0s3") + monkeypatch.setattr(executor, "detect_subnet", lambda _i: ("10.0.0.0/24", "10.0.0.1")) + monkeypatch.setattr(executor, "get_host_ip", lambda _i: "10.0.0.99") + monkeypatch.setattr( + executor, "allocate_ips", + lambda **kw: [f"10.0.0.{20 + i}" for i in range(kw["count"])], + ) + + incoming = _cfg("192.168.1.0/24") + out = executor._relocalize(incoming) + + assert out.interface == "enp0s3" + assert out.subnet == "10.0.0.0/24" + assert out.gateway == "10.0.0.1" + # Subnet changed → IPs re-allocated from the worker's subnet. + assert [d.ip for d in out.deckies] == ["10.0.0.20", "10.0.0.21"] + # Non-network fields survive. + assert [d.name for d in out.deckies] == ["decky-01", "decky-02"] + assert [d.services for d in out.deckies] == [["ssh"], ["ssh"]] + + +def test_relocalize_keeps_ips_when_subnet_matches(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(executor, "detect_interface", lambda: "enp0s3") + monkeypatch.setattr(executor, "detect_subnet", lambda _i: ("192.168.1.0/24", "192.168.1.1")) + monkeypatch.setattr(executor, "get_host_ip", lambda _i: "192.168.1.50") + # allocate_ips should NOT be called in the matching-subnet branch. 
+ def _fail(**_kw): # pragma: no cover + raise AssertionError("allocate_ips must not be called when subnets match") + monkeypatch.setattr(executor, "allocate_ips", _fail) + + incoming = _cfg("192.168.1.0/24") + out = executor._relocalize(incoming) + + assert out.interface == "enp0s3" + assert out.subnet == "192.168.1.0/24" + # Decky IPs preserved verbatim. + assert [d.ip for d in out.deckies] == ["192.168.1.11", "192.168.1.12"] + + +@pytest.mark.asyncio +async def test_deploy_relocalizes_before_calling_deployer(monkeypatch: pytest.MonkeyPatch) -> None: + """End-to-end: agent.deploy(..) must not pass the master's interface + through to the blocking deployer.""" + monkeypatch.setattr(executor, "detect_interface", lambda: "enp0s3") + monkeypatch.setattr(executor, "detect_subnet", lambda _i: ("192.168.1.0/24", "192.168.1.1")) + monkeypatch.setattr(executor, "get_host_ip", lambda _i: "192.168.1.50") + + seen: dict = {} + + def _fake_deploy(cfg, dry_run, no_cache, parallel): + seen["interface"] = cfg.interface + seen["subnet"] = cfg.subnet + + monkeypatch.setattr(executor._deployer, "deploy", _fake_deploy) + + await executor.deploy(_cfg("192.168.1.0/24", interface="wlp6s0-master"), dry_run=True) + assert seen == {"interface": "enp0s3", "subnet": "192.168.1.0/24"} + + +@pytest.mark.asyncio +async def test_deploy_unihost_mode_skips_relocalize(monkeypatch: pytest.MonkeyPatch) -> None: + """Unihost configs have already been built against the local box — we + must not second-guess them.""" + def _fail(*_a, **_kw): # pragma: no cover + raise AssertionError("detect_interface must not be called for unihost") + monkeypatch.setattr(executor, "detect_interface", _fail) + + captured: dict = {} + + def _fake_deploy(cfg, dry_run, no_cache, parallel): + captured["interface"] = cfg.interface + + monkeypatch.setattr(executor._deployer, "deploy", _fake_deploy) + + cfg = _cfg("192.168.1.0/24", interface="eth0").model_copy(update={"mode": "unihost"}) + await executor.deploy(cfg, 
dry_run=True) + assert captured["interface"] == "eth0"