"""Orphan Docker resource reaper for MazeNET topologies.

Every topology's Docker resources carry the fixed prefix
``decnet_t_<first-8-of-topology-uuid>_`` (see
:func:`decnet.topology.compose._network_name`). When a topology row is
deleted from the DB without a proper teardown — operator error, crashed
master, straight ``DELETE FROM topologies`` — the containers and
networks linger and steal IPAM pools.

This module walks the local Docker daemon, extracts the 8-char prefix
from every matching container/network, compares against the set of
prefixes that *do* map to a known topology, and removes the rest.

It never touches resources whose prefix matches a live topology, and it
never touches non-DECNET resources.
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from typing import Any, Iterable, Optional
|
|
|
|
import docker
|
|
|
|
from decnet.logging import get_logger
|
|
from decnet.network import remove_bridge_network
|
|
|
|
# Module-level logger; every reaper message is emitted under "engine.reaper".
log = get_logger("engine.reaper")
|
|
|
|
# Matches ``decnet_t_<8hex>_<anything>`` and captures the 8 lowercase-hex
# characters as group 1. That 8-char prefix is sliced from the topology
# UUID in decnet.topology.compose._network_name. Any suffix (network name,
# decky name) is tolerated after the underscore that follows the prefix;
# the pattern is anchored at the start of the name only.
_RESOURCE_NAME_RE = re.compile(r"^decnet_t_([0-9a-f]{8})_")
|
|
|
|
|
|
@dataclass
class ReapReport:
    """Summary of one reap pass: what was seen and what got removed."""

    # Topology prefixes the DB still knows about.
    live_prefixes: list[str] = field(default_factory=list)
    # Prefixes found on Docker resources that have no matching topology.
    orphan_prefixes: list[str] = field(default_factory=list)
    # Names of the containers that were successfully removed.
    containers_removed: list[str] = field(default_factory=list)
    # Names of the networks that were successfully removed.
    networks_removed: list[str] = field(default_factory=list)
    # Human-readable descriptions of every failure hit during the pass.
    errors: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the report as a plain dict keyed by field name.

        The values are the report's own list objects, not copies.
        """
        return dict(
            live_prefixes=self.live_prefixes,
            orphan_prefixes=self.orphan_prefixes,
            containers_removed=self.containers_removed,
            networks_removed=self.networks_removed,
            errors=self.errors,
        )
|
|
|
|
|
|
def _prefix_of(name: str) -> Optional[str]:
    """Extract the 8-hex topology prefix from a Docker resource name.

    Returns ``None`` when *name* does not start with
    ``decnet_t_<8hex>_``.
    """
    match = _RESOURCE_NAME_RE.match(name)
    if match is None:
        return None
    return match.group(1)
|
|
|
|
|
|
async def _live_prefixes(repo: Any) -> set[str]:
    """Collect the 8-char id prefix of every topology the DB still holds.

    A teardown only flags the row as ``torn_down``; the row itself is
    kept for audit. The reaper therefore treats *every* persisted
    topology as live, so it can never race a concurrent teardown /
    redeploy by nuking networks mid-flight. Operators who want those
    resources gone should delete the topology row (which cascades) or
    run teardown.
    """
    prefixes: set[str] = set()
    for row in await repo.list_topologies():
        # Same slice the resource names are built from: first 8 chars of id.
        prefixes.add(row["id"][:8])
    return prefixes
|
|
|
|
|
|
def _orphan_prefixes(
    container_names: Iterable[str],
    network_names: Iterable[str],
    live: set[str],
) -> tuple[set[str], list[str], list[str]]:
    """Return (orphan_prefixes, decnet_containers, decnet_networks).

    ``decnet_containers`` / ``decnet_networks`` are the input names that
    match the DECNET topology pattern, in input order. ``orphan_prefixes``
    is every prefix seen on those names that is absent from *live*.

    Pure function — no Docker / repo I/O — so the test suite can drive it
    without mocking the docker SDK.
    """
    seen: set[str] = set()

    def _collect(names: Iterable[str]) -> list[str]:
        # Keep only DECNET-patterned names, recording each prefix on the way.
        matched: list[str] = []
        for name in names:
            prefix = _prefix_of(name)
            if prefix is not None:
                matched.append(name)
                seen.add(prefix)
        return matched

    decnet_containers = _collect(container_names)
    decnet_networks = _collect(network_names)
    return seen - live, decnet_containers, decnet_networks
|
|
|
|
|
|
async def reap_orphan_topology_resources(
    repo: Any,
    client: Optional[docker.DockerClient] = None,
) -> ReapReport:
    """Remove Docker containers + networks whose topology id is gone.

    * Enumerates every container and network whose name matches the
      DECNET topology pattern.
    * Computes the set of prefixes still referenced in the DB.
    * Force-removes every orphan container (so networks can drop their
      endpoints), then removes the orphan networks in a second pass.
    * Errors on any single resource are captured into the report but
      never abort the sweep — one stuck container should not block the
      other nineteen from being cleaned up.

    Args:
        repo: Object exposing an awaitable ``list_topologies()``.
        client: Docker client to use. When omitted, one is created with
            ``docker.from_env()`` and closed again before returning. A
            caller-supplied client is never closed here.

    Returns:
        A :class:`ReapReport` listing live/orphan prefixes, the names of
        removed containers and networks, and any per-resource errors.

    Raises:
        Whatever ``docker.from_env()`` or ``repo.list_topologies()``
        raise. Docker listing and removal failures are folded into the
        report instead of being raised.
    """
    owns_client = client is None
    if client is None:
        client = docker.from_env()
    try:
        return await _reap_with_client(repo, client)
    finally:
        # Only release what we created; otherwise every call made without
        # an explicit client would leak its connection pool.
        if owns_client:
            try:
                client.close()
            except Exception as exc:  # noqa: BLE001
                log.warning("reaper: docker client close failed: %s", exc)


async def _reap_with_client(repo: Any, client: docker.DockerClient) -> ReapReport:
    """Run one reap pass against an already-open Docker client."""
    # Snapshot Docker *before* reading the DB. With the opposite order, a
    # topology created and deployed between the two reads shows up in the
    # Docker listing but not in the live set, and would be reaped as an
    # orphan. NOTE(review): full safety assumes the topology row is
    # persisted before its Docker resources are created — confirm against
    # the deploy path. Either way this order is never worse than DB-first.
    list_error: Optional[Exception] = None
    containers: list[Any] = []
    networks: list[Any] = []
    try:
        containers = client.containers.list(all=True)
        networks = client.networks.list()
    except Exception as exc:  # noqa: BLE001
        list_error = exc

    live = await _live_prefixes(repo)
    report = ReapReport(live_prefixes=sorted(live))

    if list_error is not None:
        report.errors.append(f"docker list failed: {list_error}")
        return report

    container_names = [c.name for c in containers]
    network_names = [n.name for n in networks]
    orphans, decnet_containers, decnet_networks = _orphan_prefixes(
        container_names, network_names, live
    )
    report.orphan_prefixes = sorted(orphans)

    if not orphans:
        log.info(
            "reaper: no orphans (decnet containers=%d, networks=%d, live=%d)",
            len(decnet_containers), len(decnet_networks), len(live),
        )
        return report

    # Pass 1: containers. Force-remove so we don't hang on a stopped
    # container whose network is about to be killed.
    for c in containers:
        prefix = _prefix_of(c.name)
        if prefix is None or prefix not in orphans:
            continue
        try:
            c.remove(force=True)
            report.containers_removed.append(c.name)
        except Exception as exc:  # noqa: BLE001
            report.errors.append(f"container {c.name!r}: {exc}")
            log.warning("reaper: container %s remove failed: %s", c.name, exc)

    # Pass 2: networks. Runs after the containers so their endpoints are
    # already gone by the time each network is removed.
    for n in networks:
        prefix = _prefix_of(n.name)
        if prefix is None or prefix not in orphans:
            continue
        try:
            remove_bridge_network(client, n.name)
            report.networks_removed.append(n.name)
        except Exception as exc:  # noqa: BLE001
            report.errors.append(f"network {n.name!r}: {exc}")
            log.warning("reaper: network %s remove failed: %s", n.name, exc)

    log.info(
        "reaper: removed %d container(s), %d network(s) across %d orphan prefix(es)",
        len(report.containers_removed),
        len(report.networks_removed),
        len(report.orphan_prefixes),
    )
    return report
|
|
|
|
|
|
# Public API of this module; the underscore-prefixed helpers are internal.
__all__ = [
    "ReapReport",
    "reap_orphan_topology_resources",
]
|