fix(swarm): relocalize master-built config on worker before deploy

deploy --mode swarm was failing on every heterogeneous fleet: the master
populates config.interface from its own box (detect_interface() → its
default NIC), then ships that verbatim. The worker's deployer then calls
get_host_ip(config.interface), hits 'ip addr show wlp6s0' on a VM whose
NIC is enp0s3, and 500s.

Fix: agent.executor._relocalize() runs on every swarm-mode deploy.
Re-detects the worker's interface/subnet/gateway locally and swaps them
into the config before calling deployer.deploy(); the worker's host_ip is
re-detected as well, but only as an input to IP allocation. When the
worker's subnet doesn't match the master's, decky IPs are re-allocated
from the worker's subnet via allocate_ips() so they're reachable.

Unihost-mode configs are left untouched — they're already built against
the local box and second-guessing them would be wrong.

Validated against anti@192.168.1.13: master dispatched interface=wlp6s0,
agent logged 'relocalized interface=enp0s3', deployer ran successfully,
dry-run returned ok=deployed.

4 new tests cover both branches (matching-subnet preserves decky IPs;
mismatch re-allocates), the end-to-end executor.deploy() path, and the
unihost short-circuit.
This commit is contained in:
2026-04-18 20:41:21 -04:00
parent 411a797120
commit 4db9c7464c
2 changed files with 171 additions and 1 deletions

View File

@@ -9,22 +9,74 @@ blocking) so the FastAPI event loop stays responsive.
from __future__ import annotations from __future__ import annotations
import asyncio import asyncio
from ipaddress import IPv4Network
from typing import Any from typing import Any
from decnet.engine import deployer as _deployer from decnet.engine import deployer as _deployer
from decnet.config import DecnetConfig, load_state, clear_state from decnet.config import DecnetConfig, load_state, clear_state
from decnet.logging import get_logger from decnet.logging import get_logger
from decnet.network import (
allocate_ips,
detect_interface,
detect_subnet,
get_host_ip,
)
log = get_logger("agent.executor") log = get_logger("agent.executor")
def _relocalize(config: DecnetConfig) -> DecnetConfig:
    """Rewrite a master-built config to the worker's local network reality.

    The master populates ``interface``/``subnet``/``gateway`` from its own
    box before dispatching, which blows up the deployer on any worker whose
    NIC name differs (common in heterogeneous fleets — master on ``wlp6s0``,
    worker on ``enp0s3``). We always re-detect locally; if the worker sits
    on a different subnet than the master, decky IPs are re-allocated from
    the worker's subnet so they're actually reachable.

    Args:
        config: The config as received from the master. It is not mutated.

    Returns:
        A copy of ``config`` with ``interface``, ``subnet`` and ``gateway``
        replaced by locally detected values and, when the subnets differ
        (or the master sent no subnet), ``deckies`` re-addressed.

    Raises:
        RuntimeError: If ``allocate_ips()`` hands back a different number of
            addresses than there are deckies.
    """
    local_iface = detect_interface()
    local_subnet, local_gateway = detect_subnet(local_iface)
    local_host_ip = get_host_ip(local_iface)

    updates: dict[str, Any] = {
        "interface": local_iface,
        "subnet": local_subnet,
        "gateway": local_gateway,
    }

    # strict=False so a subnet written with host bits set still compares
    # equal to its network form.
    master_net = IPv4Network(config.subnet, strict=False) if config.subnet else None
    local_net = IPv4Network(local_subnet, strict=False)
    if master_net is None or master_net != local_net:
        log.info(
            "agent.deploy subnet mismatch master=%s local=%s — re-allocating decky IPs",
            config.subnet, local_subnet,
        )
        fresh_ips = list(
            allocate_ips(
                subnet=local_subnet,
                gateway=local_gateway,
                host_ip=local_host_ip,
                count=len(config.deckies),
            )
        )
        # zip() stops at the shorter input, so a short allocation would
        # silently drop deckies from the deployment. Fail loudly instead.
        if len(fresh_ips) != len(config.deckies):
            raise RuntimeError(
                f"allocate_ips returned {len(fresh_ips)} address(es) for "
                f"{len(config.deckies)} deckies on {local_subnet}"
            )
        updates["deckies"] = [
            d.model_copy(update={"ip": ip})
            for d, ip in zip(config.deckies, fresh_ips)
        ]
    return config.model_copy(update=updates)
async def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False) -> None:
    """Run the blocking deployer off-loop. The deployer itself calls
    save_state() internally once the compose file is materialised.

    Swarm-mode configs are first rewritten by ``_relocalize()``; configs in
    any other mode are handed to the deployer unchanged.

    Args:
        config: The deployment config as received by the agent.
        dry_run: Forwarded positionally to ``_deployer.deploy``.
        no_cache: Forwarded positionally to ``_deployer.deploy``.
    """
    log.info(
        "agent.deploy mode=%s deckies=%d interface=%s (incoming)",
        config.mode, len(config.deckies), config.interface,
    )
    if config.mode == "swarm":
        # _relocalize() probes the local NIC/subnet synchronously, so it is
        # run in a worker thread like the deployer rather than on the loop.
        config = await asyncio.to_thread(_relocalize, config)
        log.info(
            "agent.deploy relocalized interface=%s subnet=%s gateway=%s",
            config.interface, config.subnet, config.gateway,
        )
    await asyncio.to_thread(_deployer.deploy, config, dry_run, no_cache, False)

View File

@@ -0,0 +1,118 @@
"""Worker agent re-localizes master-built configs to its own NIC/subnet.
The master ships a DecnetConfig populated from *its own* network (master
NIC name, master subnet, master-chosen decky IPs). The worker cannot run
the deployer against that as-is: `ip addr show <master-nic>` blows up on
any worker whose NIC differs from the master's, which is ~always the
case in a heterogeneous fleet.
The agent's executor overrides interface/subnet/gateway with
locally-detected values before calling into the deployer, and if the
subnet doesn't match, it re-allocates decky IPs from the local subnet
(the locally-detected host IP is passed to the allocator).
"""
from __future__ import annotations
import pytest
from decnet.agent import executor
from decnet.models import DecnetConfig, DeckyConfig
def _cfg(subnet: str, interface: str = "wlp6s0") -> DecnetConfig:
    """Build a swarm-mode config with two deckies for the given subnet.

    The gateway is ``<prefix>.1`` and the deckies get ``<prefix>.11`` and
    ``<prefix>.12``, where ``<prefix>`` is the subnet string up to its last
    dot.
    """
    prefix = subnet.rsplit(".", 1)[0]

    deckies = []
    for index in (1, 2):
        label = f"decky-0{index}"
        deckies.append(
            DeckyConfig(
                name=label,
                ip=f"{prefix}.{10 + index}",
                services=["ssh"],
                distro="debian",
                base_image="debian:bookworm-slim",
                hostname=label,
            )
        )

    return DecnetConfig(
        mode="swarm",
        interface=interface,
        subnet=subnet,
        gateway=f"{prefix}.1",
        deckies=deckies,
    )
def test_relocalize_swaps_interface_and_subnet(monkeypatch: pytest.MonkeyPatch) -> None:
    """A subnet mismatch replaces the network fields and re-addresses deckies."""

    def _fake_allocate(**kwargs):
        return [f"10.0.0.{20 + offset}" for offset in range(kwargs["count"])]

    # Worker lives on 10.0.0.0/24 behind enp0s3.
    monkeypatch.setattr(executor, "detect_interface", lambda: "enp0s3")
    monkeypatch.setattr(executor, "detect_subnet", lambda _i: ("10.0.0.0/24", "10.0.0.1"))
    monkeypatch.setattr(executor, "get_host_ip", lambda _i: "10.0.0.99")
    monkeypatch.setattr(executor, "allocate_ips", _fake_allocate)

    # Master built the config against 192.168.1.0/24.
    result = executor._relocalize(_cfg("192.168.1.0/24"))

    assert (result.interface, result.subnet, result.gateway) == (
        "enp0s3",
        "10.0.0.0/24",
        "10.0.0.1",
    )

    # Subnet changed, so the IPs come from the worker's subnet.
    ips = [decky.ip for decky in result.deckies]
    assert ips == ["10.0.0.20", "10.0.0.21"]

    # Fields unrelated to networking are carried over.
    names = [decky.name for decky in result.deckies]
    services = [decky.services for decky in result.deckies]
    assert names == ["decky-01", "decky-02"]
    assert services == [["ssh"], ["ssh"]]
def test_relocalize_keeps_ips_when_subnet_matches(monkeypatch: pytest.MonkeyPatch) -> None:
    """With identical subnets only the interface changes; decky IPs stay put."""

    # Tripwire: the matching-subnet branch must never reach the allocator.
    def _fail(**_kw):  # pragma: no cover
        raise AssertionError("allocate_ips must not be called when subnets match")

    shared_subnet = "192.168.1.0/24"
    monkeypatch.setattr(executor, "allocate_ips", _fail)
    monkeypatch.setattr(executor, "detect_interface", lambda: "enp0s3")
    monkeypatch.setattr(executor, "detect_subnet", lambda _i: (shared_subnet, "192.168.1.1"))
    monkeypatch.setattr(executor, "get_host_ip", lambda _i: "192.168.1.50")

    result = executor._relocalize(_cfg(shared_subnet))

    assert result.interface == "enp0s3"
    assert result.subnet == "192.168.1.0/24"
    # The master's decky addresses pass through untouched.
    kept_ips = [decky.ip for decky in result.deckies]
    assert kept_ips == ["192.168.1.11", "192.168.1.12"]
@pytest.mark.asyncio
async def test_deploy_relocalizes_before_calling_deployer(monkeypatch: pytest.MonkeyPatch) -> None:
    """End-to-end: the deployer must be given the worker's interface, never
    the one the master put in the config."""
    observed: dict = {}

    def _fake_deploy(cfg, dry_run, no_cache, parallel):
        # Record only the fields this test cares about.
        observed["interface"] = cfg.interface
        observed["subnet"] = cfg.subnet

    monkeypatch.setattr(executor._deployer, "deploy", _fake_deploy)
    monkeypatch.setattr(executor, "detect_interface", lambda: "enp0s3")
    monkeypatch.setattr(executor, "detect_subnet", lambda _i: ("192.168.1.0/24", "192.168.1.1"))
    monkeypatch.setattr(executor, "get_host_ip", lambda _i: "192.168.1.50")

    incoming = _cfg("192.168.1.0/24", interface="wlp6s0-master")
    await executor.deploy(incoming, dry_run=True)

    assert observed == {"interface": "enp0s3", "subnet": "192.168.1.0/24"}
@pytest.mark.asyncio
async def test_deploy_unihost_mode_skips_relocalize(monkeypatch: pytest.MonkeyPatch) -> None:
    """Unihost configs were built against this very box, so deploy() must
    pass them through without re-detecting anything."""
    captured: dict = {}

    def _fake_deploy(cfg, dry_run, no_cache, parallel):
        captured["interface"] = cfg.interface

    # Tripwire: any attempt at interface detection fails the test.
    def _fail(*_a, **_kw):  # pragma: no cover
        raise AssertionError("detect_interface must not be called for unihost")

    monkeypatch.setattr(executor._deployer, "deploy", _fake_deploy)
    monkeypatch.setattr(executor, "detect_interface", _fail)

    swarm_cfg = _cfg("192.168.1.0/24", interface="eth0")
    unihost_cfg = swarm_cfg.model_copy(update={"mode": "unihost"})
    await executor.deploy(unihost_cfg, dry_run=True)

    assert captured["interface"] == "eth0"