From 85bb0e2f658a58934de017402d7cde24b3f9fa30 Mon Sep 17 00:00:00 2001 From: anti Date: Tue, 21 Apr 2026 20:23:03 -0400 Subject: [PATCH] fix(engine): roll back partial Docker state on deploy failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When create_bridge_network or compose-up raised mid-deploy, the deployer marked the topology FAILED and re-raised — but left every network it had already created alive. The next deploy attempt tripped over the orphans with 'Pool overlaps with other one on this address space' (IPAM conflict). Track networks created in the current attempt; on exception, tear down the started compose stack (if any), remove the networks in reverse order, and delete the compose file before marking FAILED. Rollback errors are logged but never mask the original failure. Covered by a new regression test that drives a docker client which succeeds once then raises, and asserts every created network is also removed. --- decnet/engine/deployer.py | 31 +++++++++++++++++++ tests/topology/test_deploy.py | 58 +++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/decnet/engine/deployer.py b/decnet/engine/deployer.py index 52fb9881..30410210 100644 --- a/decnet/engine/deployer.py +++ b/decnet/engine/deployer.py @@ -540,6 +540,8 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N await transition_status(repo, topology_id, TopologyStatus.DEPLOYING) client = docker.from_env() + created_networks: list[str] = [] + compose_started = False try: for lan in lans: net_name = _topology_network_name(topology_id, lan["name"]) @@ -549,13 +551,42 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N create_bridge_network( client, net_name, lan["subnet"], internal=internal ) + created_networks.append(net_name) write_topology_compose(hydrated, compose_path) console.print( f"[bold cyan]Topology compose file written[/] → {compose_path}" ) _compose_with_retry("up", "--build", "-d", compose_file=compose_path) + compose_started = True except Exception as exc: log.error("topology %s deploy failed: %s", topology_id, exc) + # Roll back any Docker state we created in this attempt so the + # next deploy doesn't trip over orphan networks or half-started + # containers. Best-effort: rollback errors must not mask the + # original deploy failure. + if compose_started or compose_path.exists(): + try: + _compose( + "down", "--remove-orphans", compose_file=compose_path + ) + except Exception as rb_exc: # pragma: no cover + log.warning( + "topology %s rollback compose-down failed: %s", + topology_id, rb_exc, + ) + for net_name in reversed(created_networks): + try: + remove_bridge_network(client, net_name) + except Exception as rb_exc: # pragma: no cover + log.warning( + "topology %s rollback network %s removal failed: %s", + topology_id, net_name, rb_exc, + ) + if compose_path.exists(): + try: + compose_path.unlink() + except OSError: # pragma: no cover + pass await transition_status( repo, topology_id, TopologyStatus.FAILED, reason=str(exc) ) diff --git a/tests/topology/test_deploy.py b/tests/topology/test_deploy.py index ec8357a5..00af9eca 100644 --- a/tests/topology/test_deploy.py +++ b/tests/topology/test_deploy.py @@ -94,6 +94,64 @@ async def test_deploy_failure_transitions_to_failed(repo, tmp_path, monkeypatch) assert "boom" in (last["reason"] or "") +@pytest.mark.anyio +async def test_deploy_failure_rolls_back_created_networks(repo, tmp_path, monkeypatch): + """Networks created before the failing op must be removed on rollback. + + Reproduces the ``Pool overlaps`` regression: a failed deploy left + partial networks alive and the next deploy hit an IPAM conflict.""" + monkeypatch.chdir(tmp_path) + plan = generate(_cfg()) + tid = await persist(repo, plan) + + class _PartialClient: + def __init__(self): + self.networks = self + self.created: list[str] = [] + self.removed: list[str] = [] + self._call = 0 + self._created_objs: dict[str, _FakeNet] = {} + def list(self, names=None, filters=None): # noqa: ARG002 + if not names: + return [] + return [self._created_objs[n] for n in names if n in self._created_objs] + def create(self, name, *a, **kw): # noqa: ARG002 + self._call += 1 + # Succeed on the first N-1 creates, blow up on the last. + if self._call >= 2: + raise RuntimeError("boom: pool overlap") + self.created.append(name) + obj = _FakeNet(name, self) + self._created_objs[name] = obj + return obj + + class _FakeNet: + def __init__(self, name, client): + self.name = name + self.id = f"id-{name}" + self.attrs = {"Containers": {}} + self._client = client + def remove(self): + self._client.removed.append(self.name) + self._client._created_objs.pop(self.name, None) + + fake = _PartialClient() + with patch("decnet.engine.deployer.docker.from_env", return_value=fake): + with patch("decnet.engine.deployer._compose") as mock_down: + with pytest.raises(RuntimeError, match="boom"): + await deploy_topology(repo, tid) + # compose down is invoked only when compose was actually started + # OR a partial compose file exists; create_bridge_network failed + # before write_topology_compose, so _compose should not have run. + mock_down.assert_not_called() + + # Every network created this attempt must have been removed on rollback. + assert set(fake.removed) == set(fake.created) + + topo = await repo.get_topology(tid) + assert topo["status"] == TopologyStatus.FAILED + + @pytest.mark.anyio async def test_teardown_from_failed_marks_torn_down(repo, tmp_path, monkeypatch): monkeypatch.chdir(tmp_path)