fix(engine): roll back partial Docker state on deploy failure
When create_bridge_network or compose-up raised mid-deploy, the deployer marked the topology FAILED and re-raised — but left every network it had already created alive. The next deploy attempt tripped over the orphans with 'Pool overlaps with other one on this address space' (IPAM conflict). Track networks created in the current attempt; on exception, tear down the started compose stack (if any), remove the networks in reverse order, and delete the compose file before marking FAILED. Rollback errors are logged but never mask the original failure. Covered by a new regression test that drives a docker client which succeeds once then raises, and asserts every created network is also removed.
This commit is contained in:
@@ -540,6 +540,8 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
|
||||
await transition_status(repo, topology_id, TopologyStatus.DEPLOYING)
|
||||
|
||||
client = docker.from_env()
|
||||
created_networks: list[str] = []
|
||||
compose_started = False
|
||||
try:
|
||||
for lan in lans:
|
||||
net_name = _topology_network_name(topology_id, lan["name"])
|
||||
@@ -549,13 +551,42 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
|
||||
create_bridge_network(
|
||||
client, net_name, lan["subnet"], internal=internal
|
||||
)
|
||||
created_networks.append(net_name)
|
||||
write_topology_compose(hydrated, compose_path)
|
||||
console.print(
|
||||
f"[bold cyan]Topology compose file written[/] → {compose_path}"
|
||||
)
|
||||
_compose_with_retry("up", "--build", "-d", compose_file=compose_path)
|
||||
compose_started = True
|
||||
except Exception as exc:
|
||||
log.error("topology %s deploy failed: %s", topology_id, exc)
|
||||
# Roll back any Docker state we created in this attempt so the
|
||||
# next deploy doesn't trip over orphan networks or half-started
|
||||
# containers. Best-effort: rollback errors must not mask the
|
||||
# original deploy failure.
|
||||
if compose_started or compose_path.exists():
|
||||
try:
|
||||
_compose(
|
||||
"down", "--remove-orphans", compose_file=compose_path
|
||||
)
|
||||
except Exception as rb_exc: # pragma: no cover
|
||||
log.warning(
|
||||
"topology %s rollback compose-down failed: %s",
|
||||
topology_id, rb_exc,
|
||||
)
|
||||
for net_name in reversed(created_networks):
|
||||
try:
|
||||
remove_bridge_network(client, net_name)
|
||||
except Exception as rb_exc: # pragma: no cover
|
||||
log.warning(
|
||||
"topology %s rollback network %s removal failed: %s",
|
||||
topology_id, net_name, rb_exc,
|
||||
)
|
||||
if compose_path.exists():
|
||||
try:
|
||||
compose_path.unlink()
|
||||
except OSError: # pragma: no cover
|
||||
pass
|
||||
await transition_status(
|
||||
repo, topology_id, TopologyStatus.FAILED, reason=str(exc)
|
||||
)
|
||||
|
||||
@@ -94,6 +94,64 @@ async def test_deploy_failure_transitions_to_failed(repo, tmp_path, monkeypatch)
|
||||
assert "boom" in (last["reason"] or "")
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_deploy_failure_rolls_back_created_networks(repo, tmp_path, monkeypatch):
|
||||
"""Networks created before the failing op must be removed on rollback.
|
||||
|
||||
Reproduces the ``Pool overlaps`` regression: a failed deploy left
|
||||
partial networks alive and the next deploy hit an IPAM conflict."""
|
||||
monkeypatch.chdir(tmp_path)
|
||||
plan = generate(_cfg())
|
||||
tid = await persist(repo, plan)
|
||||
|
||||
class _PartialClient:
|
||||
def __init__(self):
|
||||
self.networks = self
|
||||
self.created: list[str] = []
|
||||
self.removed: list[str] = []
|
||||
self._call = 0
|
||||
self._created_objs: dict[str, _FakeNet] = {}
|
||||
def list(self, names=None, filters=None): # noqa: ARG002
|
||||
if not names:
|
||||
return []
|
||||
return [self._created_objs[n] for n in names if n in self._created_objs]
|
||||
def create(self, name, *a, **kw): # noqa: ARG002
|
||||
self._call += 1
|
||||
# Succeed on the first N-1 creates, blow up on the last.
|
||||
if self._call >= 2:
|
||||
raise RuntimeError("boom: pool overlap")
|
||||
self.created.append(name)
|
||||
obj = _FakeNet(name, self)
|
||||
self._created_objs[name] = obj
|
||||
return obj
|
||||
|
||||
class _FakeNet:
|
||||
def __init__(self, name, client):
|
||||
self.name = name
|
||||
self.id = f"id-{name}"
|
||||
self.attrs = {"Containers": {}}
|
||||
self._client = client
|
||||
def remove(self):
|
||||
self._client.removed.append(self.name)
|
||||
self._client._created_objs.pop(self.name, None)
|
||||
|
||||
fake = _PartialClient()
|
||||
with patch("decnet.engine.deployer.docker.from_env", return_value=fake):
|
||||
with patch("decnet.engine.deployer._compose") as mock_down:
|
||||
with pytest.raises(RuntimeError, match="boom"):
|
||||
await deploy_topology(repo, tid)
|
||||
# compose down is invoked only when compose was actually started
|
||||
# OR a partial compose file exists; create_bridge_network failed
|
||||
# before write_topology_compose, so _compose should not have run.
|
||||
mock_down.assert_not_called()
|
||||
|
||||
# Every network created this attempt must have been removed on rollback.
|
||||
assert set(fake.removed) == set(fake.created)
|
||||
|
||||
topo = await repo.get_topology(tid)
|
||||
assert topo["status"] == TopologyStatus.FAILED
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_teardown_from_failed_marks_torn_down(repo, tmp_path, monkeypatch):
|
||||
monkeypatch.chdir(tmp_path)
|
||||
|
||||
Reference in New Issue
Block a user