feat(mazenet): host port-collision warning at deploy time

Add check_no_host_port_collision: enumerate the ports the topology's
gateways will publish (forwards_l3=True × svc.ports), probe live
listeners via psutil, emit a 'warning'-severity PORT_COLLISION
issue per overlap. Live-only — invoked from deploy_topology just
after dry-run branching, so unit tests that exercise validate()
stay hermetic.

Warning rather than error because docker-compose up will hard-fail
on a real collision anyway; this just gives operators a cleaner log
line ahead of the compose failure.
This commit is contained in:
2026-04-20 23:07:31 -04:00
parent be4e1b1891
commit 2c35d60d45
2 changed files with 62 additions and 1 deletions

View File

@@ -35,7 +35,12 @@ from decnet.topology.compose import (
)
from decnet.topology.persistence import hydrate, transition_status
from decnet.topology.status import TopologyStatus
from decnet.topology.validate import ValidationError, errors as _validation_errors, validate as _validate_topology
from decnet.topology.validate import (
ValidationError,
check_no_host_port_collision,
errors as _validation_errors,
validate as _validate_topology,
)
log = get_logger("engine")
console = Console()
@@ -338,6 +343,12 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
log.info("topology %s dry-run complete", topology_id)
return
# Host-state precheck: PORT_COLLISION is a warning (docker-compose
# will hard-fail if the port is actually unavailable; we just want
# the clearer log line up-front). Only runs at live deploy.
for w in check_no_host_port_collision(hydrated):
log.warning("[%s] %s", w.code, w.message)
await transition_status(repo, topology_id, TopologyStatus.DEPLOYING)
client = docker.from_env()

View File

@@ -16,6 +16,7 @@ from ipaddress import IPv4Address, IPv4Network
from typing import Any, Callable, Literal
from decnet.fleet import all_service_names
from decnet.services.registry import get_service
Severity = Literal["error", "warning"]
@@ -282,6 +283,55 @@ def check_service_config_shape(h: dict[str, Any]) -> list[ValidationIssue]:
return issues
def check_no_host_port_collision(h: dict[str, Any]) -> list[ValidationIssue]:
    """Flag gateway service ports that are already bound on the host.

    Only gateway deckies (``forwards_l3=True`` in decky_config) publish
    ports (see decnet/topology/compose.py). Best-effort: if ``psutil``
    isn't importable or probing fails, returns no issues.

    Args:
        h: Hydrated topology mapping. Must contain a ``"deckies"`` list;
            each decky is read for ``"name"``, ``"decky_config"`` and
            ``"services"``.

    Returns:
        One ``"warning"``-severity ``PORT_COLLISION`` issue per wanted
        port that currently has a listening socket on the host. Empty
        when no gateway wants any port or when the host cannot be probed.
    """
    wanted: dict[int, str] = {}  # host_port → gateway decky name
    for d in h["deckies"]:
        cfg = d.get("decky_config") or {}
        if not cfg.get("forwards_l3"):
            continue
        # ``or []`` (rather than a .get default) also tolerates an explicit
        # ``"services": None``, matching how decky_config and svc.ports are
        # guarded in this function.
        for svc_name in d.get("services") or []:
            svc = get_service(svc_name)
            if svc is None or getattr(svc, "fleet_singleton", False):
                continue
            for port in getattr(svc, "ports", []) or []:
                # setdefault: the first gateway to claim a port is the one
                # named in the warning.
                wanted.setdefault(int(port), d["name"])
    if not wanted:
        return []

    try:
        import psutil  # type: ignore

        bound = {
            c.laddr.port
            for c in psutil.net_connections(kind="inet")
            if c.status == psutil.CONN_LISTEN and c.laddr
        }
    except Exception:
        # Deliberately broad: this precheck is advisory only, so a missing
        # psutil or a failed probe must never block the caller.
        return []

    return [
        ValidationIssue(
            "warning",
            "PORT_COLLISION",
            f"host port {port} is already bound; "
            f"gateway {decky_name!r} may fail to publish it",
            target={"decky": decky_name, "port": port},
        )
        for port, decky_name in wanted.items()
        if port in bound
    ]
# Pure-data rules. Host-state rules (like PORT_COLLISION) are
# *not* listed here — they're called separately by the live deployer
# so that unit tests exercising validate() stay hermetic.
_RULES: list[Callable[[dict[str, Any]], list[ValidationIssue]]] = [
check_exactly_one_dmz,
check_all_lans_connected_to_dmz,