fix(deploy): prevent 'Address already in use' from stale IPAM and half-torn-down containers
Two compounding root causes produced the recurring 'Address already in use' error on redeploy:
1. _ensure_network only compared driver and name. If a prior deploy's IPAM pool had drifted (different subnet, gateway, or IP range), the stale network was reused, so Docker kept handing out addresses from the old pool and raced the real LAN. It now also compares Subnet/Gateway/IPRange and rebuilds the network on drift.
2. A prior half-failed 'up' could leave containers still holding the IPs and ports the new run wants. The deploy now runs 'compose down --remove-orphans' as a best-effort pre-up cleanup so IPAM starts from a clean state.
Additionally, docker compose stderr is now surfaced to the structured log on failure, so the agent's journal captures Docker's actual message (which IP, which port) instead of just the exit code.
This commit is contained in:
@@ -60,7 +60,19 @@ def _compose(*args: str, compose_file: Path = COMPOSE_FILE, env: dict | None = N
|
||||
# "project name must not be empty".
|
||||
cmd = ["docker", "compose", "-p", "decnet", "-f", str(compose_file), *args]
|
||||
merged = {**os.environ, **(env or {})}
|
||||
subprocess.run(cmd, check=True, env=merged) # nosec B603
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, env=merged) # nosec B603
|
||||
if result.stdout:
|
||||
print(result.stdout, end="")
|
||||
if result.returncode != 0:
|
||||
# Docker emits the useful detail ("Address already in use", which IP,
|
||||
# which port) on stderr. Surface it to the structured log so the
|
||||
# agent's journal carries it — without this the upstream traceback
|
||||
# just shows the exit code.
|
||||
if result.stderr:
|
||||
log.error("docker compose %s failed: %s", " ".join(args), result.stderr.strip())
|
||||
raise subprocess.CalledProcessError(
|
||||
result.returncode, cmd, result.stdout, result.stderr
|
||||
)
|
||||
|
||||
|
||||
_PERMANENT_ERRORS = (
|
||||
@@ -114,6 +126,8 @@ def _compose_with_retry(
|
||||
else:
|
||||
if result.stderr:
|
||||
console.print(f"[red]{result.stderr.strip()}[/]")
|
||||
log.error("docker compose %s failed after %d attempts: %s",
|
||||
" ".join(args), retries, result.stderr.strip())
|
||||
raise last_exc
|
||||
|
||||
|
||||
@@ -162,6 +176,15 @@ def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False,
|
||||
|
||||
save_state(config, compose_path)
|
||||
|
||||
# Pre-up cleanup: a prior half-failed `up` can leave containers still
|
||||
# holding the IPs/ports this run wants, which surfaces as the recurring
|
||||
# "Address already in use" from Docker's IPAM. Best-effort — ignore
|
||||
# failure (e.g. nothing to tear down on a clean host).
|
||||
try:
|
||||
_compose("down", "--remove-orphans", compose_file=compose_path)
|
||||
except subprocess.CalledProcessError:
|
||||
log.debug("pre-up cleanup: compose down failed (likely nothing to remove)")
|
||||
|
||||
build_env = {"DOCKER_BUILDKIT": "1"} if parallel else {}
|
||||
|
||||
console.print("[bold cyan]Building images and starting deckies...[/]")
|
||||
|
||||
@@ -152,9 +152,20 @@ def _ensure_network(
|
||||
|
||||
for net in client.networks.list(names=[MACVLAN_NETWORK_NAME]):
|
||||
if net.attrs.get("Driver") == driver:
|
||||
return # right driver, leave it alone
|
||||
# Wrong driver — tear it down. Disconnect any live containers first
|
||||
# so `remove()` doesn't refuse with ErrNetworkInUse.
|
||||
# Same driver — but if the IPAM pool drifted (different subnet,
|
||||
# gateway, or ip-range than this deploy asks for), reusing it
|
||||
# hands out addresses from the old pool and we race the real LAN.
|
||||
# Compare and rebuild on mismatch.
|
||||
pools = (net.attrs.get("IPAM") or {}).get("Config") or []
|
||||
cur = pools[0] if pools else {}
|
||||
if (
|
||||
cur.get("Subnet") == subnet
|
||||
and cur.get("Gateway") == gateway
|
||||
and cur.get("IPRange") == ip_range
|
||||
):
|
||||
return # right driver AND matching pool, leave it alone
|
||||
# Driver mismatch OR IPAM drift — tear it down. Disconnect any live
|
||||
# containers first so `remove()` doesn't refuse with ErrNetworkInUse.
|
||||
for cid in (net.attrs.get("Containers") or {}):
|
||||
try:
|
||||
net.disconnect(cid, force=True)
|
||||
|
||||
@@ -40,6 +40,7 @@ class TestCompose:
|
||||
@patch("decnet.engine.deployer.subprocess.run")
|
||||
def test_compose_constructs_correct_command(self, mock_run):
|
||||
from decnet.engine.deployer import _compose
|
||||
mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
|
||||
_compose("up", "-d", compose_file=Path("test.yml"))
|
||||
mock_run.assert_called_once()
|
||||
cmd = mock_run.call_args[0][0]
|
||||
@@ -50,6 +51,7 @@ class TestCompose:
|
||||
@patch("decnet.engine.deployer.subprocess.run")
|
||||
def test_compose_passes_env(self, mock_run):
|
||||
from decnet.engine.deployer import _compose
|
||||
mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
|
||||
_compose("build", env={"DOCKER_BUILDKIT": "1"})
|
||||
_, kwargs = mock_run.call_args
|
||||
assert "DOCKER_BUILDKIT" in kwargs["env"]
|
||||
|
||||
@@ -76,12 +76,22 @@ class TestIpsToRange:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCreateMacvlanNetwork:
|
||||
def _make_client(self, existing=None, existing_driver="macvlan"):
|
||||
def _make_client(self, existing=None, existing_driver="macvlan",
|
||||
ipam_subnet="192.168.1.0/24", ipam_gateway="192.168.1.1",
|
||||
ipam_range="192.168.1.96/27"):
|
||||
client = MagicMock()
|
||||
nets = [MagicMock(name=n) for n in (existing or [])]
|
||||
for net, n in zip(nets, (existing or [])):
|
||||
net.name = n
|
||||
net.attrs = {"Driver": existing_driver, "Containers": {}}
|
||||
net.attrs = {
|
||||
"Driver": existing_driver,
|
||||
"Containers": {},
|
||||
"IPAM": {"Config": [{
|
||||
"Subnet": ipam_subnet,
|
||||
"Gateway": ipam_gateway,
|
||||
"IPRange": ipam_range,
|
||||
}]},
|
||||
}
|
||||
client.networks.list.return_value = nets
|
||||
return client
|
||||
|
||||
@@ -99,18 +109,38 @@ class TestCreateMacvlanNetwork:
|
||||
create_macvlan_network(client, "eth0", "192.168.1.0/24", "192.168.1.1", "192.168.1.96/27")
|
||||
client.networks.create.assert_not_called()
|
||||
|
||||
def test_rebuilds_when_ipam_subnet_drifted(self):
|
||||
"""Existing net matches driver+name, but IPAM pool is stale. Reusing it
|
||||
hands out addresses from the old pool — surfaces as 'Address already in
|
||||
use'. Must tear down + recreate."""
|
||||
client = self._make_client([MACVLAN_NETWORK_NAME], ipam_subnet="10.0.0.0/24")
|
||||
old_net = client.networks.list.return_value[0]
|
||||
create_macvlan_network(client, "eth0", "192.168.1.0/24", "192.168.1.1", "192.168.1.96/27")
|
||||
old_net.remove.assert_called_once()
|
||||
client.networks.create.assert_called_once()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# create_ipvlan_network
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCreateIpvlanNetwork:
|
||||
def _make_client(self, existing=None, existing_driver="ipvlan"):
|
||||
def _make_client(self, existing=None, existing_driver="ipvlan",
|
||||
ipam_subnet="192.168.1.0/24", ipam_gateway="192.168.1.1",
|
||||
ipam_range="192.168.1.96/27"):
|
||||
client = MagicMock()
|
||||
nets = [MagicMock(name=n) for n in (existing or [])]
|
||||
for net, n in zip(nets, (existing or [])):
|
||||
net.name = n
|
||||
net.attrs = {"Driver": existing_driver, "Containers": {}}
|
||||
net.attrs = {
|
||||
"Driver": existing_driver,
|
||||
"Containers": {},
|
||||
"IPAM": {"Config": [{
|
||||
"Subnet": ipam_subnet,
|
||||
"Gateway": ipam_gateway,
|
||||
"IPRange": ipam_range,
|
||||
}]},
|
||||
}
|
||||
client.networks.list.return_value = nets
|
||||
return client
|
||||
|
||||
|
||||
Reference in New Issue
Block a user