fix(engine): roll back partial Docker state on deploy failure

When create_bridge_network or compose-up raised mid-deploy, the
deployer marked the topology FAILED and re-raised — but left every
network it had already created alive. The next deploy attempt tripped
over the orphans with 'Pool overlaps with other one on this address
space' (IPAM conflict).

Track networks created in the current attempt; on exception, tear down
the started compose stack (if any), remove the networks in reverse
order, and delete the compose file before marking FAILED. Rollback
errors are logged but never mask the original failure.

Covered by a new regression test that drives a docker client which
succeeds once then raises, and asserts every created network is also
removed.
This commit is contained in:
2026-04-21 20:23:03 -04:00
parent 9ea0abc321
commit 85bb0e2f65
2 changed files with 89 additions and 0 deletions

View File

@@ -540,6 +540,8 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
await transition_status(repo, topology_id, TopologyStatus.DEPLOYING)
client = docker.from_env()
created_networks: list[str] = []
compose_started = False
try:
for lan in lans:
net_name = _topology_network_name(topology_id, lan["name"])
@@ -549,13 +551,42 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
create_bridge_network(
client, net_name, lan["subnet"], internal=internal
)
created_networks.append(net_name)
write_topology_compose(hydrated, compose_path)
console.print(
f"[bold cyan]Topology compose file written[/] → {compose_path}"
)
_compose_with_retry("up", "--build", "-d", compose_file=compose_path)
compose_started = True
except Exception as exc:
log.error("topology %s deploy failed: %s", topology_id, exc)
# Roll back any Docker state we created in this attempt so the
# next deploy doesn't trip over orphan networks or half-started
# containers. Best-effort: rollback errors must not mask the
# original deploy failure.
if compose_started or compose_path.exists():
try:
_compose(
"down", "--remove-orphans", compose_file=compose_path
)
except Exception as rb_exc: # pragma: no cover
log.warning(
"topology %s rollback compose-down failed: %s",
topology_id, rb_exc,
)
for net_name in reversed(created_networks):
try:
remove_bridge_network(client, net_name)
except Exception as rb_exc: # pragma: no cover
log.warning(
"topology %s rollback network %s removal failed: %s",
topology_id, net_name, rb_exc,
)
if compose_path.exists():
try:
compose_path.unlink()
except OSError: # pragma: no cover
pass
await transition_status(
repo, topology_id, TopologyStatus.FAILED, reason=str(exc)
)