feat(engine,api): add orphan topology resource reaper

Topology rows deleted without a proper teardown leave Docker containers
and bridge networks behind, holding IPAM pools that cause 403 "Pool
overlaps" on the next deploy at the same subnet.

- engine/reaper.py walks the local Docker daemon, extracts the 8-char
  topology prefix from every decnet_t_* resource, and force-removes
  containers + networks whose prefix is not in the repo.
- POST /api/v1/topologies/reap-orphans (admin-only) returns a report
  of live/orphan prefixes and what was removed.
- Resources belonging to live topologies are never touched; per-resource
  errors are captured without aborting the sweep.
This commit is contained in:
2026-04-21 22:13:44 -04:00
parent 85bb0e2f65
commit 8f25ff677f
4 changed files with 447 additions and 0 deletions

171
decnet/engine/reaper.py Normal file
View File

@@ -0,0 +1,171 @@
"""Orphan Docker resource reaper for MazeNET topologies.
Every topology's Docker resources carry the fixed prefix
``decnet_t_<first-8-of-topology-uuid>_`` (see
:func:`decnet.topology.compose._network_name`). When a topology row is
deleted from the DB without a proper teardown — operator error, crashed
master, straight ``DELETE FROM topologies`` — the containers and
networks linger and steal IPAM pools.
This module walks the local Docker daemon, extracts the 8-char prefix
from every matching container/network, compares against the set of
prefixes that *do* map to a known topology, and removes the rest.
It never touches resources whose prefix matches a live topology, and it
never touches non-DECNET resources.
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from typing import Any, Iterable, Optional
import docker
from decnet.logging import get_logger
from decnet.network import remove_bridge_network
log = get_logger("engine.reaper")
# decnet_t_<8hex>_<anything>. The 8-char prefix is sliced from the
# topology UUID in decnet.topology.compose._network_name. Tolerate any
# suffix (network name, decky name) after the second underscore.
_RESOURCE_NAME_RE = re.compile(r"^decnet_t_([0-9a-f]{8})_")
@dataclass
class ReapReport:
"""Outcome of a reap pass — what was found and what was removed."""
live_prefixes: list[str] = field(default_factory=list)
orphan_prefixes: list[str] = field(default_factory=list)
containers_removed: list[str] = field(default_factory=list)
networks_removed: list[str] = field(default_factory=list)
errors: list[str] = field(default_factory=list)
def to_dict(self) -> dict[str, Any]:
return {
"live_prefixes": self.live_prefixes,
"orphan_prefixes": self.orphan_prefixes,
"containers_removed": self.containers_removed,
"networks_removed": self.networks_removed,
"errors": self.errors,
}
def _prefix_of(name: str) -> Optional[str]:
m = _RESOURCE_NAME_RE.match(name)
return m.group(1) if m else None
async def _live_prefixes(repo: Any) -> set[str]:
"""Every topology-id prefix the DB still knows about.
Tearing down only marks ``torn_down``; the row stays around for
audit. We consider *every* persisted topology to be live for the
reaper so we never race a concurrent teardown / redeploy by nuking
its networks mid-flight. Operators who want those resources gone
should delete the topology row (which cascades) or run teardown.
"""
rows = await repo.list_topologies()
return {r["id"][:8] for r in rows}
def _orphan_prefixes(
container_names: Iterable[str],
network_names: Iterable[str],
live: set[str],
) -> tuple[set[str], list[str], list[str]]:
"""Return (orphan_prefixes, decnet_containers, decnet_networks).
Pure function — no Docker / repo I/O. Kept separate so the test
suite can drive it without mocking the docker SDK."""
c_decnet = [n for n in container_names if _prefix_of(n) is not None]
n_decnet = [n for n in network_names if _prefix_of(n) is not None]
orphans = {
_prefix_of(n) for n in (*c_decnet, *n_decnet)
} - live
orphans.discard(None)
return orphans, c_decnet, n_decnet # type: ignore[return-value]
async def reap_orphan_topology_resources(
repo: Any,
client: Optional[docker.DockerClient] = None,
) -> ReapReport:
"""Remove Docker containers + networks whose topology id is gone.
* Enumerates every container and network whose name matches the
DECNET topology pattern.
* Computes the set of prefixes still referenced in the DB.
* Force-removes every container (so networks can drop their
endpoints), then removes the networks in a second pass.
* Errors on any single resource are captured into the report but
never abort the sweep — one stuck container should not block the
other nineteen from being cleaned up.
"""
if client is None:
client = docker.from_env()
live = await _live_prefixes(repo)
report = ReapReport(live_prefixes=sorted(live))
try:
containers = client.containers.list(all=True)
networks = client.networks.list()
except Exception as exc: # noqa: BLE001
report.errors.append(f"docker list failed: {exc}")
return report
container_names = [c.name for c in containers]
network_names = [n.name for n in networks]
orphans, decnet_containers, decnet_networks = _orphan_prefixes(
container_names, network_names, live
)
report.orphan_prefixes = sorted(orphans)
if not orphans:
log.info(
"reaper: no orphans (decnet containers=%d, networks=%d, live=%d)",
len(decnet_containers), len(decnet_networks), len(live),
)
return report
# Pass 1: containers. Force-remove so we don't hang on a stopped
# container whose network is about to be killed.
for c in containers:
prefix = _prefix_of(c.name)
if prefix is None or prefix not in orphans:
continue
try:
c.remove(force=True)
report.containers_removed.append(c.name)
except Exception as exc: # noqa: BLE001
report.errors.append(f"container {c.name!r}: {exc}")
log.warning("reaper: container %s remove failed: %s", c.name, exc)
# Pass 2: networks.
for n in networks:
prefix = _prefix_of(n.name)
if prefix is None or prefix not in orphans:
continue
try:
remove_bridge_network(client, n.name)
report.networks_removed.append(n.name)
except Exception as exc: # noqa: BLE001
report.errors.append(f"network {n.name!r}: {exc}")
log.warning("reaper: network %s remove failed: %s", n.name, exc)
log.info(
"reaper: removed %d container(s), %d network(s) across %d orphan prefix(es)",
len(report.containers_removed),
len(report.networks_removed),
len(report.orphan_prefixes),
)
return report
__all__ = [
"ReapReport",
"reap_orphan_topology_resources",
]

View File

@@ -21,6 +21,7 @@ from .api_get_topology import router as _get_router
from .api_lan_crud import router as _lan_router
from .api_list_topologies import router as _list_router
from .api_mutations import router as _mutations_router
from .api_reap_orphans import router as _reap_router
from .api_teardown_topology import router as _teardown_router
topology_router = APIRouter(prefix="/topologies", tags=["topologies"])
@@ -34,6 +35,7 @@ topology_router.include_router(_catalog_router)
topology_router.include_router(_list_router)
topology_router.include_router(_create_blank_router)
topology_router.include_router(_create_router)
topology_router.include_router(_reap_router)
topology_router.include_router(_deploy_router)
topology_router.include_router(_teardown_router)
topology_router.include_router(_delete_router)

View File

@@ -0,0 +1,46 @@
"""POST /topologies/reap-orphans — remove Docker resources for topology
ids the DB no longer knows about.
A topology row deleted outside the teardown flow (operator error,
crashed master, direct DB edit) leaves its containers and bridge
networks behind. The orphan networks keep their IPAM pools, so the
next deploy at the same subnet hits a 403 ``Pool overlaps`` from the
Docker daemon.
This endpoint walks the local Docker daemon, computes the set of
topology prefixes still known to the repo, and force-removes every
container + network whose prefix is orphaned. Resources belonging to
live topologies are never touched.
"""
from __future__ import annotations
from fastapi import APIRouter, Depends
from decnet.engine.reaper import reap_orphan_topology_resources
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_admin
router = APIRouter()
@router.post(
"/reap-orphans",
tags=["MazeNET Topologies"],
responses={
401: {"description": "Missing or invalid credentials"},
403: {"description": "Insufficient permissions"},
},
)
@_traced("api.topology.reap_orphans")
async def api_reap_orphans(
_admin: dict = Depends(require_admin),
) -> dict:
"""Reap Docker resources whose topology id is absent from the DB.
Returns a report with the live prefixes, the orphan prefixes that
were identified, every container + network actually removed, and
any per-resource errors encountered. Errors are non-fatal — a
single stuck resource does not abort the sweep.
"""
report = await reap_orphan_topology_resources(repo)
return report.to_dict()