fix(deploy): prevent 'Address already in use' from stale IPAM and half-torn-down containers

Two compounding root causes produced the recurring 'Address already in use'
error on redeploy:

1. _ensure_network only compared driver+name; if a prior deploy's IPAM
   pool drifted (different subnet/gateway/range), the stale network was
   reused, so Docker kept handing out addresses from the old pool and
   raced the real LAN. It now also compares Subnet/Gateway/IPRange
   against the requested values and rebuilds the network on drift.

2. A prior half-failed 'up' could leave containers still holding the IPs
   and ports the new run wants. deploy() now runs 'compose down
   --remove-orphans' as a best-effort pre-up cleanup (a failed cleanup
   is logged and otherwise ignored) so IPAM starts from a clean state.

Also surface docker compose stderr to the structured log on failure so
the agent's journal captures Docker's actual message (which IP, which
port) instead of just the exit code.
This commit is contained in:
2026-04-19 19:59:06 -04:00
parent e8e11b2896
commit 91549e6936
4 changed files with 74 additions and 8 deletions

View File

@@ -60,7 +60,19 @@ def _compose(*args: str, compose_file: Path = COMPOSE_FILE, env: dict | None = N
# "project name must not be empty". # "project name must not be empty".
cmd = ["docker", "compose", "-p", "decnet", "-f", str(compose_file), *args] cmd = ["docker", "compose", "-p", "decnet", "-f", str(compose_file), *args]
merged = {**os.environ, **(env or {})} merged = {**os.environ, **(env or {})}
subprocess.run(cmd, check=True, env=merged) # nosec B603 result = subprocess.run(cmd, capture_output=True, text=True, env=merged) # nosec B603
if result.stdout:
print(result.stdout, end="")
if result.returncode != 0:
# Docker emits the useful detail ("Address already in use", which IP,
# which port) on stderr. Surface it to the structured log so the
# agent's journal carries it — without this the upstream traceback
# just shows the exit code.
if result.stderr:
log.error("docker compose %s failed: %s", " ".join(args), result.stderr.strip())
raise subprocess.CalledProcessError(
result.returncode, cmd, result.stdout, result.stderr
)
_PERMANENT_ERRORS = ( _PERMANENT_ERRORS = (
@@ -114,6 +126,8 @@ def _compose_with_retry(
else: else:
if result.stderr: if result.stderr:
console.print(f"[red]{result.stderr.strip()}[/]") console.print(f"[red]{result.stderr.strip()}[/]")
log.error("docker compose %s failed after %d attempts: %s",
" ".join(args), retries, result.stderr.strip())
raise last_exc raise last_exc
@@ -162,6 +176,15 @@ def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False,
save_state(config, compose_path) save_state(config, compose_path)
# Pre-up cleanup: a prior half-failed `up` can leave containers still
# holding the IPs/ports this run wants, which surfaces as the recurring
# "Address already in use" from Docker's IPAM. Best-effort — ignore
# failure (e.g. nothing to tear down on a clean host).
try:
_compose("down", "--remove-orphans", compose_file=compose_path)
except subprocess.CalledProcessError:
log.debug("pre-up cleanup: compose down failed (likely nothing to remove)")
build_env = {"DOCKER_BUILDKIT": "1"} if parallel else {} build_env = {"DOCKER_BUILDKIT": "1"} if parallel else {}
console.print("[bold cyan]Building images and starting deckies...[/]") console.print("[bold cyan]Building images and starting deckies...[/]")

View File

@@ -152,9 +152,20 @@ def _ensure_network(
for net in client.networks.list(names=[MACVLAN_NETWORK_NAME]): for net in client.networks.list(names=[MACVLAN_NETWORK_NAME]):
if net.attrs.get("Driver") == driver: if net.attrs.get("Driver") == driver:
return # right driver, leave it alone # Same driver — but if the IPAM pool drifted (different subnet,
# Wrong driver — tear it down. Disconnect any live containers first # gateway, or ip-range than this deploy asks for), reusing it
# so `remove()` doesn't refuse with ErrNetworkInUse. # hands out addresses from the old pool and we race the real LAN.
# Compare and rebuild on mismatch.
pools = (net.attrs.get("IPAM") or {}).get("Config") or []
cur = pools[0] if pools else {}
if (
cur.get("Subnet") == subnet
and cur.get("Gateway") == gateway
and cur.get("IPRange") == ip_range
):
return # right driver AND matching pool, leave it alone
# Driver mismatch OR IPAM drift — tear it down. Disconnect any live
# containers first so `remove()` doesn't refuse with ErrNetworkInUse.
for cid in (net.attrs.get("Containers") or {}): for cid in (net.attrs.get("Containers") or {}):
try: try:
net.disconnect(cid, force=True) net.disconnect(cid, force=True)

View File

@@ -40,6 +40,7 @@ class TestCompose:
@patch("decnet.engine.deployer.subprocess.run") @patch("decnet.engine.deployer.subprocess.run")
def test_compose_constructs_correct_command(self, mock_run): def test_compose_constructs_correct_command(self, mock_run):
from decnet.engine.deployer import _compose from decnet.engine.deployer import _compose
mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
_compose("up", "-d", compose_file=Path("test.yml")) _compose("up", "-d", compose_file=Path("test.yml"))
mock_run.assert_called_once() mock_run.assert_called_once()
cmd = mock_run.call_args[0][0] cmd = mock_run.call_args[0][0]
@@ -50,6 +51,7 @@ class TestCompose:
@patch("decnet.engine.deployer.subprocess.run") @patch("decnet.engine.deployer.subprocess.run")
def test_compose_passes_env(self, mock_run): def test_compose_passes_env(self, mock_run):
from decnet.engine.deployer import _compose from decnet.engine.deployer import _compose
mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
_compose("build", env={"DOCKER_BUILDKIT": "1"}) _compose("build", env={"DOCKER_BUILDKIT": "1"})
_, kwargs = mock_run.call_args _, kwargs = mock_run.call_args
assert "DOCKER_BUILDKIT" in kwargs["env"] assert "DOCKER_BUILDKIT" in kwargs["env"]

View File

@@ -76,12 +76,22 @@ class TestIpsToRange:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
class TestCreateMacvlanNetwork: class TestCreateMacvlanNetwork:
def _make_client(self, existing=None, existing_driver="macvlan"): def _make_client(self, existing=None, existing_driver="macvlan",
ipam_subnet="192.168.1.0/24", ipam_gateway="192.168.1.1",
ipam_range="192.168.1.96/27"):
client = MagicMock() client = MagicMock()
nets = [MagicMock(name=n) for n in (existing or [])] nets = [MagicMock(name=n) for n in (existing or [])]
for net, n in zip(nets, (existing or [])): for net, n in zip(nets, (existing or [])):
net.name = n net.name = n
net.attrs = {"Driver": existing_driver, "Containers": {}} net.attrs = {
"Driver": existing_driver,
"Containers": {},
"IPAM": {"Config": [{
"Subnet": ipam_subnet,
"Gateway": ipam_gateway,
"IPRange": ipam_range,
}]},
}
client.networks.list.return_value = nets client.networks.list.return_value = nets
return client return client
@@ -99,18 +109,38 @@ class TestCreateMacvlanNetwork:
create_macvlan_network(client, "eth0", "192.168.1.0/24", "192.168.1.1", "192.168.1.96/27") create_macvlan_network(client, "eth0", "192.168.1.0/24", "192.168.1.1", "192.168.1.96/27")
client.networks.create.assert_not_called() client.networks.create.assert_not_called()
def test_rebuilds_when_ipam_subnet_drifted(self):
"""Existing net matches driver+name, but IPAM pool is stale. Reusing it
hands out addresses from the old pool — surfaces as 'Address already in
use'. Must tear down + recreate."""
client = self._make_client([MACVLAN_NETWORK_NAME], ipam_subnet="10.0.0.0/24")
old_net = client.networks.list.return_value[0]
create_macvlan_network(client, "eth0", "192.168.1.0/24", "192.168.1.1", "192.168.1.96/27")
old_net.remove.assert_called_once()
client.networks.create.assert_called_once()
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# create_ipvlan_network # create_ipvlan_network
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
class TestCreateIpvlanNetwork: class TestCreateIpvlanNetwork:
def _make_client(self, existing=None, existing_driver="ipvlan"): def _make_client(self, existing=None, existing_driver="ipvlan",
ipam_subnet="192.168.1.0/24", ipam_gateway="192.168.1.1",
ipam_range="192.168.1.96/27"):
client = MagicMock() client = MagicMock()
nets = [MagicMock(name=n) for n in (existing or [])] nets = [MagicMock(name=n) for n in (existing or [])]
for net, n in zip(nets, (existing or [])): for net, n in zip(nets, (existing or [])):
net.name = n net.name = n
net.attrs = {"Driver": existing_driver, "Containers": {}} net.attrs = {
"Driver": existing_driver,
"Containers": {},
"IPAM": {"Config": [{
"Subnet": ipam_subnet,
"Gateway": ipam_gateway,
"IPRange": ipam_range,
}]},
}
client.networks.list.return_value = nets client.networks.list.return_value = nets
return client return client