When create_bridge_network or compose-up raised mid-deploy, the deployer marked the topology FAILED and re-raised — but left every network it had already created alive. The next deploy attempt tripped over the orphans with 'Pool overlaps with other one on this address space' (IPAM conflict). Track networks created in the current attempt; on exception, tear down the started compose stack (if any), remove the networks in reverse order, and delete the compose file before marking FAILED. Rollback errors are logged but never mask the original failure. Covered by a new regression test that drives a docker client which succeeds once then raises, and asserts every created network is also removed.
237 lines
8.2 KiB
Python
237 lines
8.2 KiB
Python
"""Deploy/teardown integration tests for MazeNET topologies.
|
|
|
|
Docker-touching paths live behind ``@pytest.mark.live`` per
|
|
feedback_skip_heavy_tests.md. The non-live path here exercises dry-run
|
|
deploy (compose file is written, repo status is left untouched) and the
|
|
state-machine around failure/teardown using a tmp-path-backed repo and stub Docker clients.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from decnet.engine.deployer import (
|
|
_teardown_order,
|
|
_topology_compose_path,
|
|
deploy_topology,
|
|
teardown_topology,
|
|
)
|
|
from decnet.topology.config import TopologyConfig
|
|
from decnet.topology.generator import generate
|
|
from decnet.topology.persistence import persist
|
|
from decnet.topology.status import TopologyStatus
|
|
from decnet.web.db.factory import get_repository
|
|
|
|
|
|
def _cfg(**kw) -> TopologyConfig:
    """Build the baseline ``TopologyConfig`` shared by these tests.

    The baseline uses a fixed seed, a single explicit ``ssh`` service and
    exactly one decky per LAN. Any keyword argument overrides the
    corresponding baseline field.
    """
    defaults = {
        "name": "dep",
        "depth": 2,
        "branching_factor": 2,
        "deckies_per_lan_min": 1,
        "deckies_per_lan_max": 1,
        "cross_edge_probability": 0.0,
        "randomize_services": False,
        "services_explicit": ["ssh"],
        "seed": 11,
    }
    # Caller-supplied values win over the defaults.
    return TopologyConfig(**{**defaults, **kw})
|
|
|
|
|
|
@pytest.fixture
async def repo(tmp_path):
    """Return an initialized repository whose DB file lives under ``tmp_path``."""
    db_file = tmp_path / "dep.db"
    repository = get_repository(db_path=str(db_file))
    await repository.initialize()
    return repository
|
|
|
|
|
|
@pytest.mark.anyio
async def test_dry_run_writes_compose_and_preserves_pending(repo, tmp_path, monkeypatch):
    """A dry-run deploy emits the compose file and leaves the status PENDING."""
    # Work from the temp dir for the duration of the test.
    monkeypatch.chdir(tmp_path)
    topology_id = await persist(repo, generate(_cfg()))

    await deploy_topology(repo, topology_id, dry_run=True)

    compose_file = _topology_compose_path(topology_id)
    assert compose_file.exists(), "dry run must emit a compose file"

    record = await repo.get_topology(topology_id)
    current_status = record["status"]
    assert current_status == TopologyStatus.PENDING, "dry run must not transition status"
|
|
|
|
|
|
@pytest.mark.anyio
async def test_deploy_failure_transitions_to_failed(repo, tmp_path, monkeypatch):
    """A deploy that raises must land at FAILED with the error in the reason.

    The failure is injected through a stub Docker client whose network
    ``create`` always raises ``RuntimeError("boom: ...")``.
    """
    monkeypatch.chdir(tmp_path)
    topology_id = await persist(repo, generate(_cfg()))

    class _UnreachableDocker:
        """Docker client stand-in: lists no networks, fails every create."""

        def __init__(self):
            # ``client.networks.<op>`` resolves back to this same object.
            self.networks = self

        def list(self, names=None, filters=None):  # noqa: ARG002
            return []

        def create(self, *a, **kw):  # noqa: ARG002
            raise RuntimeError("boom: docker daemon unreachable")

    docker_patch = patch(
        "decnet.engine.deployer.docker.from_env",
        return_value=_UnreachableDocker(),
    )
    with docker_patch, pytest.raises(RuntimeError, match="boom"):
        await deploy_topology(repo, topology_id)

    record = await repo.get_topology(topology_id)
    assert record["status"] == TopologyStatus.FAILED

    history = await repo.list_topology_status_events(topology_id)
    # Events are returned newest-first.
    newest = history[0]
    assert newest["to_status"] == TopologyStatus.FAILED
    assert "boom" in (newest["reason"] or "")
|
|
|
|
|
|
@pytest.mark.anyio
async def test_deploy_failure_rolls_back_created_networks(repo, tmp_path, monkeypatch):
    """Networks created before the failing op must be removed on rollback.

    Reproduces the ``Pool overlaps`` regression: a failed deploy left
    partial networks alive and the next deploy hit an IPAM conflict.

    The stub Docker client lets exactly one network ``create`` succeed and
    raises on every later call; the test then checks that each network it
    handed out was removed again and that the topology ends up FAILED.
    """
    monkeypatch.chdir(tmp_path)
    plan = generate(_cfg())
    tid = await persist(repo, plan)

    class _FakeNet:
        """Minimal network object that records its own removal on the client."""

        def __init__(self, name, client):
            self.name = name
            self.id = f"id-{name}"
            self.attrs = {"Containers": {}}
            self._client = client

        def remove(self):
            self._client.removed.append(self.name)
            # Once removed, the network must no longer show up in ``list``.
            self._client._created_objs.pop(self.name, None)

    class _PartialClient:
        """Docker client stand-in: first create succeeds, later ones raise."""

        def __init__(self):
            # ``client.networks.<op>`` resolves back to this same object.
            self.networks = self
            self.created: list[str] = []
            self.removed: list[str] = []
            self._call = 0
            self._created_objs: dict[str, _FakeNet] = {}

        def list(self, names=None, filters=None):  # noqa: ARG002
            if not names:
                return []
            return [self._created_objs[n] for n in names if n in self._created_objs]

        def create(self, name, *a, **kw):  # noqa: ARG002
            self._call += 1
            # Only the very first create succeeds; the second (and any
            # later) call raises to simulate a mid-deploy failure.
            if self._call >= 2:
                raise RuntimeError("boom: pool overlap")
            self.created.append(name)
            obj = _FakeNet(name, self)
            self._created_objs[name] = obj
            return obj

    fake = _PartialClient()
    with patch("decnet.engine.deployer.docker.from_env", return_value=fake):
        with patch("decnet.engine.deployer._compose") as mock_down:
            with pytest.raises(RuntimeError, match="boom"):
                await deploy_topology(repo, tid)
            # compose down is invoked only when compose was actually started
            # OR a partial compose file exists; create_bridge_network failed
            # before write_topology_compose, so _compose should not have run.
            mock_down.assert_not_called()

    # Without at least one successful create the rollback check below would
    # pass vacuously (empty set == empty set) and prove nothing.
    assert fake.created, "stub client never created a network; rollback untested"

    # Every network created this attempt must have been removed on rollback.
    assert set(fake.removed) == set(fake.created)

    topo = await repo.get_topology(tid)
    assert topo["status"] == TopologyStatus.FAILED
|
|
|
|
|
|
@pytest.mark.anyio
async def test_teardown_from_failed_marks_torn_down(repo, tmp_path, monkeypatch):
    """Tearing down a topology that is in FAILED must end in TORN_DOWN."""
    from decnet.topology.persistence import transition_status

    monkeypatch.chdir(tmp_path)
    topology_id = await persist(repo, generate(_cfg()))

    # Drive it into FAILED directly via the legal path.
    steps = (
        (TopologyStatus.DEPLOYING, {}),
        (TopologyStatus.FAILED, {"reason": "test"}),
    )
    for target_status, extra in steps:
        await transition_status(repo, topology_id, target_status, **extra)

    class _EmptyDocker:
        """Docker client stand-in that reports no networks at all."""

        def __init__(self):
            # ``client.networks.<op>`` resolves back to this same object.
            self.networks = self

        def list(self, names=None, filters=None):  # noqa: ARG002
            return []

    docker_patch = patch(
        "decnet.engine.deployer.docker.from_env",
        return_value=_EmptyDocker(),
    )
    with docker_patch:
        await teardown_topology(repo, topology_id)

    record = await repo.get_topology(topology_id)
    assert record["status"] == TopologyStatus.TORN_DOWN
|
|
|
|
|
|
def test_teardown_order_is_stable():
    """For LAN-00..LAN-04 given in ascending order, names come back reversed."""
    ascending = ["LAN-00", "LAN-01", "LAN-02", "LAN-03", "LAN-04"]
    lans = [{"name": lan_name} for lan_name in ascending]
    expected = ascending[::-1]
    assert _teardown_order(lans) == expected
|
|
|
|
|
|
@pytest.mark.live
@pytest.mark.anyio
async def test_deploy_and_teardown_against_real_docker(repo, tmp_path, monkeypatch):
    """End-to-end: create real Docker bridge networks, verify, tear down.

    Skipped on CI; run locally with ``pytest -m live tests/topology``.
    Does NOT run ``docker compose up`` — that's exercised by the flat
    fleet tests. This test covers the topology-specific paths only
    (LAN network creation, multi-home bridge wiring, teardown order).

    The test is skipped when the ``docker`` package is missing or the
    daemon does not answer ``ping``.
    """
    monkeypatch.chdir(tmp_path)
    docker = pytest.importorskip("docker")
    try:
        client = docker.from_env()
        client.ping()
    except Exception as exc:  # pragma: no cover - environment-specific
        pytest.skip(f"docker daemon not reachable: {exc}")

    plan = generate(_cfg(depth=1, branching_factor=1))
    tid = await persist(repo, plan)

    # Import everything the ``finally`` cleanup needs BEFORE entering the
    # ``try``: if the first statement inside it raises, the cleanup must
    # still be able to resolve ``remove_bridge_network`` instead of failing
    # with an unbound-name error that would mask the real failure.
    from decnet.network import create_bridge_network, remove_bridge_network
    from decnet.topology.compose import _network_name

    try:
        await deploy_topology(repo, tid, dry_run=True)
        # Dry run doesn't create networks. Now exercise the real path by
        # creating just the networks (no compose up) and tearing down.
        for lan in plan.lans:
            create_bridge_network(
                client,
                _network_name(tid, lan.name),
                lan.subnet,
                internal=not lan.is_dmz,
            )
        existing = {n.name for n in client.networks.list()}
        for lan in plan.lans:
            assert _network_name(tid, lan.name) in existing
    finally:
        # Always attempt removal so a failed run does not leave networks behind.
        for lan in plan.lans:
            remove_bridge_network(client, _network_name(tid, lan.name))

    remaining = {n.name for n in client.networks.list()}
    for lan in plan.lans:
        assert _network_name(tid, lan.name) not in remaining

    # Compose artifact cleanup
    p = _topology_compose_path(tid)
    if p.exists():
        p.unlink()
    # Sanity: Path roundtrip still resolvable
    assert isinstance(Path(str(p)), Path)
|