Files
DECNET/tests/topology/test_deploy.py
anti 85bb0e2f65 fix(engine): roll back partial Docker state on deploy failure
When create_bridge_network or compose-up raised mid-deploy, the
deployer marked the topology FAILED and re-raised — but left every
network it had already created alive. The next deploy attempt tripped
over the orphans with 'Pool overlaps with other one on this address
space' (IPAM conflict).

Track networks created in the current attempt; on exception, tear down
the started compose stack (if any), remove the networks in reverse
order, and delete the compose file before marking FAILED. Rollback
errors are logged but never mask the original failure.

Covered by a new regression test that drives a docker client which
succeeds once then raises, and asserts every created network is also
removed.
2026-04-21 20:23:03 -04:00

237 lines
8.2 KiB
Python

"""Deploy/teardown integration tests for MazeNET topologies.
Docker-touching paths live behind ``@pytest.mark.live`` per
feedback_skip_heavy_tests.md. The non-live path here exercises dry-run
deploy (compose file is written, repo status is left untouched) and the
state-machine around failure/teardown using a stub repo.
"""
from __future__ import annotations
from pathlib import Path
from unittest.mock import patch
import pytest
from decnet.engine.deployer import (
_teardown_order,
_topology_compose_path,
deploy_topology,
teardown_topology,
)
from decnet.topology.config import TopologyConfig
from decnet.topology.generator import generate
from decnet.topology.persistence import persist
from decnet.topology.status import TopologyStatus
from decnet.web.db.factory import get_repository
def _cfg(**kw) -> TopologyConfig:
base = dict(
name="dep",
depth=2,
branching_factor=2,
deckies_per_lan_min=1,
deckies_per_lan_max=1,
cross_edge_probability=0.0,
randomize_services=False,
services_explicit=["ssh"],
seed=11,
)
base.update(kw)
return TopologyConfig(**base)
@pytest.fixture
async def repo(tmp_path):
r = get_repository(db_path=str(tmp_path / "dep.db"))
await r.initialize()
return r
@pytest.mark.anyio
async def test_dry_run_writes_compose_and_preserves_pending(repo, tmp_path, monkeypatch):
monkeypatch.chdir(tmp_path)
plan = generate(_cfg())
tid = await persist(repo, plan)
await deploy_topology(repo, tid, dry_run=True)
compose_path = _topology_compose_path(tid)
assert compose_path.exists(), "dry run must emit a compose file"
topo = await repo.get_topology(tid)
assert topo["status"] == TopologyStatus.PENDING, (
"dry run must not transition status"
)
@pytest.mark.anyio
async def test_deploy_failure_transitions_to_failed(repo, tmp_path, monkeypatch):
"""If compose-up fails, status lands at FAILED with the reason logged."""
monkeypatch.chdir(tmp_path)
plan = generate(_cfg())
tid = await persist(repo, plan)
class _BoomClient:
def __init__(self):
self.networks = self
def list(self, names=None, filters=None): # noqa: ARG002
return []
def create(self, *a, **kw): # noqa: ARG002
raise RuntimeError("boom: docker daemon unreachable")
with patch("decnet.engine.deployer.docker.from_env", return_value=_BoomClient()):
with pytest.raises(RuntimeError, match="boom"):
await deploy_topology(repo, tid)
topo = await repo.get_topology(tid)
assert topo["status"] == TopologyStatus.FAILED
events = await repo.list_topology_status_events(tid)
# Events are returned newest-first.
last = events[0]
assert last["to_status"] == TopologyStatus.FAILED
assert "boom" in (last["reason"] or "")
@pytest.mark.anyio
async def test_deploy_failure_rolls_back_created_networks(repo, tmp_path, monkeypatch):
"""Networks created before the failing op must be removed on rollback.
Reproduces the ``Pool overlaps`` regression: a failed deploy left
partial networks alive and the next deploy hit an IPAM conflict."""
monkeypatch.chdir(tmp_path)
plan = generate(_cfg())
tid = await persist(repo, plan)
class _PartialClient:
def __init__(self):
self.networks = self
self.created: list[str] = []
self.removed: list[str] = []
self._call = 0
self._created_objs: dict[str, _FakeNet] = {}
def list(self, names=None, filters=None): # noqa: ARG002
if not names:
return []
return [self._created_objs[n] for n in names if n in self._created_objs]
def create(self, name, *a, **kw): # noqa: ARG002
self._call += 1
# Succeed on the first N-1 creates, blow up on the last.
if self._call >= 2:
raise RuntimeError("boom: pool overlap")
self.created.append(name)
obj = _FakeNet(name, self)
self._created_objs[name] = obj
return obj
class _FakeNet:
def __init__(self, name, client):
self.name = name
self.id = f"id-{name}"
self.attrs = {"Containers": {}}
self._client = client
def remove(self):
self._client.removed.append(self.name)
self._client._created_objs.pop(self.name, None)
fake = _PartialClient()
with patch("decnet.engine.deployer.docker.from_env", return_value=fake):
with patch("decnet.engine.deployer._compose") as mock_down:
with pytest.raises(RuntimeError, match="boom"):
await deploy_topology(repo, tid)
# compose down is invoked only when compose was actually started
# OR a partial compose file exists; create_bridge_network failed
# before write_topology_compose, so _compose should not have run.
mock_down.assert_not_called()
# Every network created this attempt must have been removed on rollback.
assert set(fake.removed) == set(fake.created)
topo = await repo.get_topology(tid)
assert topo["status"] == TopologyStatus.FAILED
@pytest.mark.anyio
async def test_teardown_from_failed_marks_torn_down(repo, tmp_path, monkeypatch):
monkeypatch.chdir(tmp_path)
plan = generate(_cfg())
tid = await persist(repo, plan)
# Drive it into FAILED directly via the legal path.
from decnet.topology.persistence import transition_status
await transition_status(repo, tid, TopologyStatus.DEPLOYING)
await transition_status(repo, tid, TopologyStatus.FAILED, reason="test")
class _StubClient:
def __init__(self):
self.networks = self
def list(self, names=None, filters=None): # noqa: ARG002
return []
with patch("decnet.engine.deployer.docker.from_env", return_value=_StubClient()):
await teardown_topology(repo, tid)
topo = await repo.get_topology(tid)
assert topo["status"] == TopologyStatus.TORN_DOWN
def test_teardown_order_is_stable():
lans = [{"name": f"LAN-{i:02d}"} for i in range(5)]
assert _teardown_order(lans) == [
"LAN-04", "LAN-03", "LAN-02", "LAN-01", "LAN-00",
]
@pytest.mark.live
@pytest.mark.anyio
async def test_deploy_and_teardown_against_real_docker(repo, tmp_path, monkeypatch):
"""End-to-end: create real Docker bridge networks, verify, tear down.
Skipped on CI; run locally with ``pytest -m live tests/topology``.
Does NOT run ``docker compose up`` — that's exercised by the flat
fleet tests. This test covers the topology-specific paths only
(LAN network creation, multi-home bridge wiring, teardown order).
"""
monkeypatch.chdir(tmp_path)
docker = pytest.importorskip("docker")
try:
client = docker.from_env()
client.ping()
except Exception as exc: # pragma: no cover - environment-specific
pytest.skip(f"docker daemon not reachable: {exc}")
plan = generate(_cfg(depth=1, branching_factor=1))
tid = await persist(repo, plan)
from decnet.topology.compose import _network_name
try:
await deploy_topology(repo, tid, dry_run=True)
# Dry run doesn't create networks. Now exercise the real path by
# creating just the networks (no compose up) and tearing down.
from decnet.network import create_bridge_network, remove_bridge_network
for lan in plan.lans:
create_bridge_network(
client,
_network_name(tid, lan.name),
lan.subnet,
internal=not lan.is_dmz,
)
existing = {n.name for n in client.networks.list()}
for lan in plan.lans:
assert _network_name(tid, lan.name) in existing
finally:
for lan in plan.lans:
remove_bridge_network(client, _network_name(tid, lan.name))
remaining = {n.name for n in client.networks.list()}
for lan in plan.lans:
assert _network_name(tid, lan.name) not in remaining
# Compose artifact cleanup
p = _topology_compose_path(tid)
if p.exists():
p.unlink()
# Sanity: Path roundtrip still resolvable
assert isinstance(Path(str(p)), Path)