feat(topology): add compose generator and deployer integration

Adds per-topology compose generation (one Docker bridge network per
LAN, multi-homed bridge deckies, ip_forward sysctl for L3 forwarders)
plus async deploy_topology/teardown_topology in the engine. Teardown is
leaf-first: LANs are named in BFS order, so their networks are removed in
reverse name order. A failed deploy leaves its partial state in place for
a later teardown.
This commit is contained in:
2026-04-20 16:54:40 -04:00
parent 33f139ecfa
commit 2a030bf3a9
4 changed files with 394 additions and 0 deletions

View File

@@ -17,16 +17,24 @@ from decnet.config import DecnetConfig, clear_state, load_state, save_state
from decnet.composer import write_compose from decnet.composer import write_compose
from decnet.network import ( from decnet.network import (
MACVLAN_NETWORK_NAME, MACVLAN_NETWORK_NAME,
create_bridge_network,
create_ipvlan_network, create_ipvlan_network,
create_macvlan_network, create_macvlan_network,
get_host_ip, get_host_ip,
ips_to_range, ips_to_range,
remove_bridge_network,
remove_macvlan_network, remove_macvlan_network,
setup_host_ipvlan, setup_host_ipvlan,
setup_host_macvlan, setup_host_macvlan,
teardown_host_ipvlan, teardown_host_ipvlan,
teardown_host_macvlan, teardown_host_macvlan,
) )
from decnet.topology.compose import (
_network_name as _topology_network_name,
write_topology_compose,
)
from decnet.topology.persistence import hydrate, transition_status
from decnet.topology.status import TopologyStatus
log = get_logger("engine") log = get_logger("engine")
console = Console() console = Console()
@@ -281,6 +289,106 @@ def status() -> None:
console.print(table) console.print(table)
def _teardown_order(lans: list[dict]) -> list[str]:
"""Return LAN names in leaf-first (DMZ-last) teardown order.
The generator names LANs in BFS order (``LAN-00`` = DMZ root,
then children, then grandchildren), so reverse-name order is a
correct leaf-first topological sort for the tree. Cross-edges
are membership-only — they don't introduce parent/child
relationships, so the BFS numbering remains valid.
"""
return sorted((lan["name"] for lan in lans), reverse=True)
def _topology_compose_path(topology_id: str) -> Path:
return Path(f"decnet-topology-{topology_id[:8]}-compose.yml")
@_traced("engine.deploy_topology")
async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> None:
    """Deploy a persisted MazeNET topology.

    Assumes ``repo`` has the topology in ``pending`` state. Creates one
    Docker bridge network per LAN, writes a per-topology compose file,
    and brings all deckies up. Marks ``active`` on success, ``failed``
    on exception (partial state left for later teardown).

    Args:
        repo: Repository handle passed through to ``hydrate`` and
            ``transition_status``.
        topology_id: Identifier of the topology to deploy.
        dry_run: When true, stop after the networks are created and the
            compose file is written; ``docker compose up`` is not run.

    Raises:
        ValueError: If the topology cannot be hydrated from ``repo``.
        Exception: Any failure while talking to Docker or running compose
            is re-raised after the topology has been marked ``failed``.
    """
    hydrated = await hydrate(repo, topology_id)
    if hydrated is None:
        raise ValueError(f"topology {topology_id!r} not found")

    await transition_status(repo, topology_id, TopologyStatus.DEPLOYING)

    lans = hydrated["lans"]
    compose_path = _topology_compose_path(topology_id)
    try:
        # Build the Docker client inside the guarded region: the status is
        # already DEPLOYING, so a client failure must be recorded as FAILED
        # rather than leaving the topology stuck in DEPLOYING.
        client = docker.from_env()
        for lan in lans:
            net_name = _topology_network_name(topology_id, lan["name"])
            # DMZ LAN is publicly routable; internal LANs are isolated
            # from the host's default egress.
            internal = not lan["is_dmz"]
            create_bridge_network(
                client, net_name, lan["subnet"], internal=internal
            )

        write_topology_compose(hydrated, compose_path)
        console.print(
            f"[bold cyan]Topology compose file written[/] → {compose_path}"
        )

        if dry_run:
            # NOTE(review): a dry run returns with the bridge networks
            # created and the status still DEPLOYING — confirm intended.
            log.info("topology %s dry-run complete", topology_id)
            return

        _compose_with_retry("up", "--build", "-d", compose_file=compose_path)
    except Exception as exc:
        log.error("topology %s deploy failed: %s", topology_id, exc)
        await transition_status(
            repo, topology_id, TopologyStatus.FAILED, reason=str(exc)
        )
        raise

    await transition_status(repo, topology_id, TopologyStatus.ACTIVE)
    log.info("topology %s deployed n_lans=%d", topology_id, len(lans))
@_traced("engine.teardown_topology")
async def teardown_topology(repo, topology_id: str) -> None:
    """Tear down a persisted MazeNET topology.

    Intended to be legal from ``active|degraded|failed|deploying``. The
    steps are: mark ``tearing_down``, run ``compose down`` if the compose
    file is present (a failing ``down`` is logged and ignored), remove each
    LAN's bridge network leaf-first, delete the compose file, and finally
    mark ``torn_down``.

    Raises:
        ValueError: If the topology cannot be hydrated from ``repo``.
    """
    topology = await hydrate(repo, topology_id)
    if topology is None:
        raise ValueError(f"topology {topology_id!r} not found")

    await transition_status(repo, topology_id, TopologyStatus.TEARING_DOWN)

    docker_client = docker.from_env()
    compose_file = _topology_compose_path(topology_id)

    if compose_file.exists():
        try:
            _compose("down", "--remove-orphans", compose_file=compose_file)
        except subprocess.CalledProcessError as err:
            # Keep going: the networks below should still be cleaned up.
            log.warning(
                "topology %s compose down failed (continuing): %s",
                topology_id, err,
            )

    # Leaves first, DMZ last.
    for lan_name in _teardown_order(topology["lans"]):
        remove_bridge_network(
            docker_client, _topology_network_name(topology_id, lan_name)
        )

    if compose_file.exists():
        compose_file.unlink()

    await transition_status(repo, topology_id, TopologyStatus.TORN_DOWN)
    log.info("topology %s torn down", topology_id)
def _print_status(config: DecnetConfig) -> None: def _print_status(config: DecnetConfig) -> None:
table = Table(title="Deployed Deckies", show_lines=True) table = Table(title="Deployed Deckies", show_lines=True)
table.add_column("Decky") table.add_column("Decky")

View File

@@ -227,6 +227,60 @@ def remove_macvlan_network(client: docker.DockerClient) -> None:
n.remove() n.remove()
# ---------------------------------------------------------------------------
# Plain Docker bridge networks (MazeNET topologies — one per LAN)
# ---------------------------------------------------------------------------
def create_bridge_network(
    client: docker.DockerClient,
    name: str,
    subnet: str,
    *,
    internal: bool = False,
) -> str:
    """Create (or reuse) a plain Docker bridge network and return its id.

    ``internal=True`` blocks outbound routing via the host — used for
    non-DMZ MazeNET LANs so deckies can only reach what the bridge
    deckies let them reach.

    An existing network called exactly ``name`` is reused only when its
    driver, first IPAM subnet and ``Internal`` flag all match the request.
    Otherwise its containers are disconnected and it is removed and
    recreated.

    Args:
        client: Docker SDK client.
        name: Exact Docker network name.
        subnet: Subnet in CIDR notation for the network's IPAM pool.
        internal: Whether the network is created as an internal network.

    Returns:
        The id of the reused or newly created network.

    Raises:
        docker.errors.APIError: If a stale network cannot be removed or the
            new network cannot be created.
    """
    # greedy=True makes the SDK inspect each network individually; the
    # plain list response does not carry the attached containers.
    for net in client.networks.list(names=[name], greedy=True):
        # The name filter also matches partial names, so skip anything that
        # merely contains ``name`` — it belongs to someone else.
        if net.name != name:
            continue
        pools = (net.attrs.get("IPAM") or {}).get("Config") or []
        cur = pools[0] if pools else {}
        if (
            net.attrs.get("Driver") == "bridge"
            and cur.get("Subnet") == subnet
            # Reusing a network with the wrong flag would silently drop
            # (or add) the isolation the caller asked for.
            and bool(net.attrs.get("Internal")) == bool(internal)
        ):
            return net.id
        for cid in (net.attrs.get("Containers") or {}):
            try:
                net.disconnect(cid, force=True)
            except docker.errors.APIError:
                # Best effort: a failed disconnect surfaces via remove().
                pass
        net.remove()

    net = client.networks.create(
        name=name,
        driver="bridge",
        internal=internal,
        ipam=docker.types.IPAMConfig(
            driver="default",
            pool_configs=[docker.types.IPAMPool(subnet=subnet)],
        ),
    )
    return net.id
def remove_bridge_network(client: docker.DockerClient, name: str) -> None:
    """Best-effort removal of the Docker network called exactly ``name``.

    Attached containers are force-disconnected first. Docker API errors
    from disconnecting or removing are swallowed so teardown can proceed
    with the remaining networks; nothing happens if the network is absent.

    Args:
        client: Docker SDK client.
        name: Exact Docker network name.
    """
    # greedy=True makes the SDK inspect each network individually; the
    # plain list response does not carry the attached containers.
    for net in client.networks.list(names=[name], greedy=True):
        # The name filter also matches partial names (``…_lan-10`` would
        # match ``…_lan-100``); never touch a network we were not asked
        # to remove.
        if net.name != name:
            continue
        for cid in (net.attrs.get("Containers") or {}):
            try:
                net.disconnect(cid, force=True)
            except docker.errors.APIError:
                # Deliberately ignored: removal below is still attempted.
                pass
        try:
            net.remove()
        except docker.errors.APIError:
            # Deliberately ignored: teardown is best-effort per network.
            pass
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Host-side macvlan interface (hairpin fix) # Host-side macvlan interface (hairpin fix)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------

130
decnet/topology/compose.py Normal file
View File

@@ -0,0 +1,130 @@
"""Compose-file generator for a MazeNET topology.
Produces a ``docker-compose.yml`` dict given a hydrated topology
(the output of :func:`decnet.topology.persistence.hydrate`). The
compose file references each LAN as an ``external: true`` network —
the deployer creates the Docker bridge networks via the SDK before
invoking ``docker compose up``.
Layout:
* Each decky has a "base" container holding the LAN IPs. Multi-homed
(bridge) deckies list every LAN they belong to under ``networks``
with the per-LAN ``ipv4_address``.
* Bridge deckies with ``forwards_l3=True`` get ``net.ipv4.ip_forward=1``
baked in via compose ``sysctls``. Every base container is granted
``NET_ADMIN`` in ``cap_add``.
* Service containers share the base namespace via
``network_mode: service:<base>``, matching the flat composer.
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
import yaml
from decnet.services.registry import get_service
_DEFAULT_BASE_IMAGE = "debian:bookworm-slim"
_DOCKER_LOGGING = {
"driver": "json-file",
"options": {"max-size": "10m", "max-file": "5"},
}
def _network_name(topology_id: str, lan_name: str) -> str:
"""Docker network name for a given (topology, LAN) pair."""
return f"decnet_t_{topology_id[:8]}_{lan_name.lower()}"
def _container_name(topology_id: str, decky_name: str) -> str:
"""Container name for a decky base in a topology."""
return f"decnet_t_{topology_id[:8]}_{decky_name}"
def generate_topology_compose(hydrated: dict[str, Any]) -> dict:
    """Assemble the compose document for a hydrated topology.

    ``hydrated`` is expected to have the shape returned by
    :func:`decnet.topology.persistence.hydrate`; the keys read here are
    ``topology`` (with ``id``), ``lans`` and ``deckies``.

    Each decky yields one base service attached to all of its LANs, plus
    one service per registered, non-singleton service name that shares the
    base container's network namespace. Every LAN is declared as an
    external network.

    Raises:
        ValueError: If a decky lists an IP for a LAN that is not in
            ``hydrated["lans"]``.
    """
    topology_id = hydrated["topology"]["id"]
    lans = hydrated["lans"]
    deckies = hydrated["deckies"]

    known_lans = {lan["name"] for lan in lans}

    services: dict[str, dict] = {}

    for entry in deckies:
        decky_cfg = entry["decky_config"]
        name = decky_cfg["name"]
        lan_ips: dict[str, str] = decky_cfg["ips_by_lan"]
        is_forwarder: bool = decky_cfg.get("forwards_l3", False)
        wanted_services: list[str] = entry["services"]

        base_key = name

        # One attachment per LAN, each pinned to the planned address.
        attachments: dict[str, dict] = {}
        for lan_name, address in lan_ips.items():
            if lan_name in known_lans:
                attachments[_network_name(topology_id, lan_name)] = {
                    "ipv4_address": address
                }
                continue
            raise ValueError(
                f"decky {name!r} references unknown LAN {lan_name!r}"
            )

        base_service: dict = {
            "image": _DEFAULT_BASE_IMAGE,
            "container_name": _container_name(topology_id, name),
            "hostname": name,
            "command": ["sleep", "infinity"],
            "restart": "unless-stopped",
            "networks": attachments,
            "cap_add": ["NET_ADMIN"],
            "logging": _DOCKER_LOGGING,
        }
        if is_forwarder:
            base_service["sysctls"] = {"net.ipv4.ip_forward": 1}
        services[base_key] = base_service

        for svc_name in wanted_services:
            service = get_service(svc_name)
            if service is not None and not service.fleet_singleton:
                fragment = service.compose_fragment(name, service_cfg={})
                if "build" in fragment:
                    build_args = fragment["build"].setdefault("args", {})
                    build_args.setdefault("BASE_IMAGE", _DEFAULT_BASE_IMAGE)
                environment = fragment.setdefault("environment", {})
                environment["HOSTNAME"] = name
                # The service lives in the base container's namespace, so
                # it must not carry its own hostname or network settings.
                fragment["network_mode"] = f"service:{base_key}"
                fragment["depends_on"] = [base_key]
                for dropped in ("hostname", "networks"):
                    fragment.pop(dropped, None)
                fragment["logging"] = _DOCKER_LOGGING
                services[f"{name}-{svc_name}"] = fragment

    networks: dict[str, dict] = {}
    for lan in lans:
        net_name = _network_name(topology_id, lan["name"])
        networks[net_name] = {"external": True, "name": net_name}

    return {
        "version": "3.8",
        "services": services,
        "networks": networks,
    }
def write_topology_compose(hydrated: dict[str, Any], output_path: Path) -> Path:
    """Render the topology's compose document as YAML into ``output_path``.

    Returns ``output_path`` so the call can be chained.
    """
    compose_doc = generate_topology_compose(hydrated)
    # Block style, with keys kept in insertion order.
    rendered = yaml.dump(compose_doc, default_flow_style=False, sort_keys=False)
    output_path.write_text(rendered)
    return output_path

View File

@@ -0,0 +1,102 @@
"""MazeNET compose-generator + teardown-order tests."""
from __future__ import annotations
import pytest
from decnet.engine.deployer import _teardown_order
from decnet.topology.compose import (
_container_name,
_network_name,
generate_topology_compose,
)
from decnet.topology.config import TopologyConfig
from decnet.topology.generator import generate
from decnet.topology.persistence import hydrate, persist
from decnet.web.db.factory import get_repository
def _cfg(**kw) -> TopologyConfig:
    """Build a small seeded ``TopologyConfig``; ``kw`` overrides the defaults."""
    defaults = {
        "name": "cmp",
        "depth": 2,
        "branching_factor": 2,
        "deckies_per_lan_min": 1,
        "deckies_per_lan_max": 1,
        "cross_edge_probability": 0.0,
        "randomize_services": False,
        "services_explicit": ["ssh"],
        "seed": 9,
    }
    return TopologyConfig(**{**defaults, **kw})
@pytest.fixture
async def repo(tmp_path):
    """Provide an initialized repository backed by a database under ``tmp_path``."""
    db_file = tmp_path / "compose.db"
    repository = get_repository(db_path=str(db_file))
    await repository.initialize()
    return repository
@pytest.mark.anyio
async def test_compose_has_one_network_per_lan(repo):
    """The compose document declares exactly one external network per planned LAN."""
    plan = generate(_cfg())
    tid = await persist(repo, plan)
    compose = generate_topology_compose(await hydrate(repo, tid))

    declared = compose["networks"]
    expected_names = {_network_name(tid, lan.name) for lan in plan.lans}
    assert set(declared.keys()) == expected_names
    for definition in declared.values():
        assert definition["external"] is True
@pytest.mark.anyio
async def test_compose_multi_home_bridge_decky(repo):
    """Each decky's base service is attached to exactly the LANs it has IPs on."""
    plan = generate(_cfg())
    tid = await persist(repo, plan)
    hydrated = await hydrate(repo, tid)
    compose = generate_topology_compose(hydrated)

    # Checked for every decky: the base service carries one network entry
    # per LAN in ips_by_lan (so multi-homed bridge deckies get several),
    # each pinned to the planned address.
    for decky in hydrated["deckies"]:
        decky_cfg = decky["decky_config"]
        decky_name = decky_cfg["name"]
        base = compose["services"][decky_name]
        assert base["container_name"] == _container_name(tid, decky_name)
        assert len(base["networks"]) == len(decky_cfg["ips_by_lan"])
        for lan_name, ip in decky_cfg["ips_by_lan"].items():
            attachment = base["networks"][_network_name(tid, lan_name)]
            assert attachment["ipv4_address"] == ip
@pytest.mark.anyio
async def test_compose_forwards_l3_sets_sysctl(repo):
    """Forwarding deckies get the ip_forward sysctl and NET_ADMIN on their base."""
    # A forward probability of 1.0 should make every bridge an L3 forwarder.
    plan = generate(_cfg(bridge_forward_probability=1.0))
    tid = await persist(repo, plan)
    hydrated = await hydrate(repo, tid)
    compose = generate_topology_compose(hydrated)

    forwarder_names = []
    for decky in hydrated["deckies"]:
        if decky["decky_config"].get("forwards_l3"):
            forwarder_names.append(decky["decky_config"]["name"])
    assert forwarder_names, "expected at least one forwarding bridge decky"

    for decky_name in forwarder_names:
        base = compose["services"][decky_name]
        assert base["sysctls"]["net.ipv4.ip_forward"] == 1
        assert "NET_ADMIN" in base["cap_add"]
def test_teardown_order_is_leaf_first():
    """LANs come back highest index first, with the DMZ root at the end."""
    bfs_names = ["LAN-00", "LAN-01", "LAN-02", "LAN-03"]
    order = _teardown_order([{"name": lan_name} for lan_name in bfs_names])
    assert order == ["LAN-03", "LAN-02", "LAN-01", "LAN-00"]
    # The DMZ (LAN-00) must be the final entry so it is torn down last.
    assert order[-1] == "LAN-00"