merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

@@ -0,0 +1,23 @@
"""MazeNET — nested deception topologies.
A topology is an arbitrary-depth DAG of LANs, connected by multi-homed
"bridge deckies" that optionally forward L3 between segments. One LAN
is marked as the DMZ (Internet-facing). Persisted via the repo pattern;
deployed via :mod:`decnet.engine.deployer`.
"""
from decnet.topology.config import TopologyConfig, GeneratedTopology
from decnet.topology.generator import generate
from decnet.topology.status import (
TopologyStatus,
assert_transition,
TopologyStatusError,
)
# Public API of the package: every name listed here is imported above
# from one of the config / generator / status submodules.
__all__ = [
    "TopologyConfig",
    "GeneratedTopology",
    "generate",
    "TopologyStatus",
    "assert_transition",
    "TopologyStatusError",
]

View File

@@ -0,0 +1,160 @@
"""IP and subnet allocators for MazeNET topologies.
Extracted from :mod:`decnet.topology.generator` so the same primitives
can be reused by the generator, the pre-deploy editor (REST), and the
mutator reconciler. The allocators are pure — persistence lives in the
repo; these objects hold in-memory state for a single planning pass.
``reserved_subnets`` queries the repo for every subnet currently claimed
by a non-``torn_down`` topology so a new draft cannot collide with an
open one.
"""
from __future__ import annotations
from ipaddress import IPv4Network
from typing import Any, Iterable
from decnet.topology.status import TopologyStatus
class AllocatorExhausted(RuntimeError):
    """Signal that an allocator has no further value left to hand out."""

    pass
class IPAllocator:
    """Hands out host IPs within a single LAN subnet.

    The first usable host of the subnet is treated as the gateway and
    is never handed out (for an aligned ``/24`` that is the ``.1``
    address). Callers may pre-seed taken IPs via :meth:`reserve` before
    requesting :meth:`next_free`.
    """

    def __init__(self, subnet: str) -> None:
        """Build the allocator for ``subnet`` (CIDR; host bits are tolerated)."""
        self._net = IPv4Network(subnet, strict=False)
        self._gateway = str(next(self._net.hosts()))
        # Allocatable addresses in ascending order, gateway excluded.
        self._pool: list[str] = [
            str(ip) for ip in self._net.hosts() if str(ip) != self._gateway
        ]
        # Same addresses as a set so membership checks in reserve() and
        # is_free() do not have to re-enumerate the subnet on every call.
        self._pool_set: frozenset[str] = frozenset(self._pool)
        self._taken: set[str] = set()
        self._cursor = 0

    def next_free(self) -> str:
        """Return the next unclaimed IP and mark it taken.

        Raises :class:`AllocatorExhausted` when every pool address is
        already taken.
        """
        while self._cursor < len(self._pool):
            ip = self._pool[self._cursor]
            self._cursor += 1
            if ip not in self._taken:
                self._taken.add(ip)
                return ip
        # Cursor past the end — fall back to a linear scan in case
        # releases opened up earlier slots.
        for ip in self._pool:
            if ip not in self._taken:
                self._taken.add(ip)
                return ip
        raise AllocatorExhausted(
            f"no free IPs left in {self._net.with_prefixlen}"
        )

    def reserve(self, ip: str) -> None:
        """Mark ``ip`` as taken.

        Raises ``ValueError`` if ``ip`` is the gateway or is not a host
        address of this subnet. Reserving an already-taken IP is a no-op.
        """
        if ip == self._gateway:
            raise ValueError(f"{ip} is the gateway of {self._net.with_prefixlen}")
        if ip not in self._pool_set:
            raise ValueError(f"{ip} not in {self._net.with_prefixlen}")
        self._taken.add(ip)

    def release(self, ip: str) -> None:
        """Return ``ip`` to the pool; unknown or free IPs are ignored."""
        self._taken.discard(ip)

    def is_free(self, ip: str) -> bool:
        """Return True when ``ip`` is an allocatable host that is not taken."""
        return ip in self._pool_set and ip not in self._taken
class SubnetAllocator:
    """Allocate ``/24`` subnets carved out of one parent network.

    ``base_prefix`` may be given as:

    * a full CIDR such as ``"172.16.0.0/12"`` (4096 ``/24`` slots), or
    * the legacy two-octet shorthand ``"172.20"``, which is lifted to
      ``"172.20.0.0/16"`` (256 slots) for configs written before
      mass-scale topologies existed.

    The parent's prefix length must be 24 or shorter. A ``/24`` parent
    yields exactly one slot; any larger network yields more.
    """

    def __init__(
        self,
        base_prefix: str,
        reserved: Iterable[str] = (),
    ) -> None:
        network = _parse_base(base_prefix)
        if network.prefixlen > 24:
            raise ValueError(
                f"subnet base {network.with_prefixlen} is narrower than /24; "
                "cannot carve /24 children out of it"
            )
        self._parent = network
        # ipaddress yields the /24 children in ascending order, which
        # keeps the legacy "sequential third octet" behaviour for /16
        # bases; a /12 base sweeps second and third octets. The iterator
        # is consumed once and never rewound.
        if network.prefixlen < 24:
            candidates = network.subnets(new_prefix=24)
        else:
            candidates = iter([network])
        self._iter = candidates
        self._reserved: set[str] = set(reserved)

    def next_free(self) -> str:
        """Return the next unreserved ``/24`` (as CIDR text) and reserve it."""
        for candidate in self._iter:
            cidr = candidate.with_prefixlen
            if cidr in self._reserved:
                continue
            self._reserved.add(cidr)
            return cidr
        raise AllocatorExhausted(
            f"no free /24s left under {self._parent.with_prefixlen}"
        )

    def reserve(self, subnet: str) -> None:
        """Record ``subnet`` as claimed so it is never handed out."""
        self._reserved.add(subnet)

    def is_free(self, subnet: str) -> bool:
        """Return True when ``subnet`` has not been reserved or handed out."""
        return subnet not in self._reserved
def _parse_base(base_prefix: str) -> IPv4Network:
"""Accept either ``'a.b.c.d/n'`` or legacy ``'a.b'`` shorthand."""
stripped = base_prefix.strip().rstrip(".")
if "/" in stripped:
return IPv4Network(stripped, strict=False)
octets = stripped.split(".")
if len(octets) == 2:
return IPv4Network(f"{stripped}.0.0/16", strict=False)
if len(octets) == 4:
return IPv4Network(f"{stripped}/24", strict=False)
raise ValueError(
f"unrecognised subnet base {base_prefix!r}; expected 'x.y' or CIDR"
)
# Topology statuses whose LANs still claim subnets. torn_down is the
# only state that releases its networks back to the pool, so every
# other status — including failed and tearing_down — is listed here.
_SUBNET_CLAIMING_STATES: frozenset[str] = frozenset(
    {
        TopologyStatus.PENDING,
        TopologyStatus.DEPLOYING,
        TopologyStatus.ACTIVE,
        TopologyStatus.DEGRADED,
        TopologyStatus.FAILED,
        TopologyStatus.TEARING_DOWN,
    }
)
async def reserved_subnets(repo: Any) -> set[str]:
    """Collect the subnets of every LAN owned by a subnet-claiming topology.

    Queries ``repo`` once per claiming status, then once per topology
    for its LANs. LAN rows without a (truthy) ``subnet`` are ignored.
    """
    claimed: set[str] = set()
    for state in _SUBNET_CLAIMING_STATES:
        topologies = await repo.list_topologies(status=state)
        for topology in topologies:
            lan_rows = await repo.list_lans_for_topology(topology["id"])
            subnets = (row.get("subnet") for row in lan_rows)
            claimed.update(cidr for cidr in subnets if cidr)
    return claimed

165
decnet/topology/compose.py Normal file
View File

@@ -0,0 +1,165 @@
"""Compose-file generator for a MazeNET topology.
Produces a ``docker-compose.yml`` dict given a hydrated topology
(the output of :func:`decnet.topology.persistence.hydrate`). The
compose file references each LAN as an ``external: true`` network —
the deployer creates the Docker bridge networks via the SDK before
invoking ``docker compose up``.
Layout:
* Each decky has a "base" container holding the LAN IPs. Multi-homed
(bridge) deckies list every LAN they belong to under ``networks``
with the per-LAN ``ipv4_address``.
* Bridge deckies with ``forwards_l3=True`` get ``net.ipv4.ip_forward=1``
baked in via compose ``sysctls`` plus ``NET_ADMIN`` in ``cap_add``.
* Service containers share the base namespace via
``network_mode: service:<base>``, matching the flat composer.
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
import yaml
from decnet.services.registry import get_service
# Image used for every decky base container; also offered as the default
# ``BASE_IMAGE`` build arg for service fragments that carry a ``build`` key.
_DEFAULT_BASE_IMAGE = "debian:bookworm-slim"
# Logging stanza attached verbatim to every generated container (base and
# service alike).
_DOCKER_LOGGING = {
    "driver": "json-file",
    "options": {"max-size": "10m", "max-file": "5"},
}
def _network_name(topology_id: str, lan_name: str) -> str:
"""Docker network name for a given (topology, LAN) pair."""
return f"decnet_t_{topology_id[:8]}_{lan_name.lower()}"
def _container_name(topology_id: str, decky_name: str) -> str:
"""Container name for a decky base in a topology."""
return f"decnet_t_{topology_id[:8]}_{decky_name}"
def generate_topology_compose(hydrated: dict[str, Any]) -> dict:
    """Build the compose dict for a hydrated topology.

    ``hydrated`` is the shape returned by
    :func:`decnet.topology.persistence.hydrate`. This function reads
    ``hydrated["topology"]["id"]``, every LAN's ``name``, and each
    decky's ``services`` list and ``decky_config`` (``name``,
    ``ips_by_lan``, optional ``forwards_l3`` and ``service_config``).

    Returns a dict with ``version``, ``services`` and ``networks`` keys.
    Each decky contributes one base service keyed by its name plus one
    ``"<decky>-<service>"`` entry per resolvable service.

    Raises ``ValueError`` when a decky's ``ips_by_lan`` names a LAN that
    is not part of the topology.
    """
    topology = hydrated["topology"]
    topology_id = topology["id"]
    lans = hydrated["lans"]
    deckies = hydrated["deckies"]
    # Used only to validate the LAN names referenced by deckies.
    lan_by_name = {lan["name"]: lan for lan in lans}
    services: dict[str, dict] = {}
    for decky in deckies:
        cfg = decky["decky_config"]
        name = cfg["name"]
        ips_by_lan: dict[str, str] = cfg["ips_by_lan"]
        forwards_l3: bool = cfg.get("forwards_l3", False)
        service_config: dict[str, dict] = cfg.get("service_config", {}) or {}
        svc_names: list[str] = decky["services"]
        base_key = name
        # One compose network attachment per LAN the decky sits on.
        nets: dict[str, dict] = {}
        for lan_name, ip in ips_by_lan.items():
            if lan_name not in lan_by_name:
                raise ValueError(
                    f"decky {name!r} references unknown LAN {lan_name!r}"
                )
            nets[_network_name(topology_id, lan_name)] = {"ipv4_address": ip}
        base: dict = {
            "image": _DEFAULT_BASE_IMAGE,
            "container_name": _container_name(topology_id, name),
            "hostname": name,
            "command": ["sleep", "infinity"],
            "restart": "unless-stopped",
            "networks": nets,
            "cap_add": ["NET_ADMIN"],
            "logging": _DOCKER_LOGGING,
            # Labels let the host collector discover topology containers
            # without consulting decnet-state.json (which only knows about
            # legacy fleet deckies). See decnet/collector/worker.py.
            "labels": {
                "decnet.topology.id": topology_id,
                "decnet.topology.decky": name,
                "decnet.topology.role": "base",
            },
        }
        # NOTE(review): the reviewed text had lost its indentation. The
        # port-publishing section below is placed inside this branch
        # because its own comment describes it as gateway-only — confirm
        # the nesting against the original file.
        if forwards_l3:
            base["sysctls"] = {"net.ipv4.ip_forward": 1}
            # Gateway decky — publish its service ports on the host so
            # attackers can reach the DMZ via the host's public IP.
            # Service containers share this base's namespace (see below),
            # so ports declared here expose every service's listener.
            published: list[str] = []
            for svc_name in svc_names:
                svc = get_service(svc_name)
                # Unknown services and fleet singletons publish nothing.
                if svc is None or svc.fleet_singleton:
                    continue
                for port in svc.ports:
                    published.append(f"{port}:{port}")
            if published:
                base["ports"] = published
        services[base_key] = base
        for svc_name in svc_names:
            svc = get_service(svc_name)
            # Unknown services and fleet singletons get no container here.
            if svc is None or svc.fleet_singleton:
                continue
            fragment = svc.compose_fragment(
                name, service_cfg=service_config.get(svc_name, {})
            )
            if "build" in fragment:
                # Only fills in BASE_IMAGE when the fragment did not set one.
                fragment["build"].setdefault("args", {}).setdefault(
                    "BASE_IMAGE", _DEFAULT_BASE_IMAGE
                )
            fragment.setdefault("environment", {})
            fragment["environment"]["HOSTNAME"] = name
            # Join the base container's network namespace; hostname and
            # networks are therefore dropped from the fragment.
            fragment["network_mode"] = f"service:{base_key}"
            fragment["depends_on"] = [base_key]
            fragment.pop("hostname", None)
            fragment.pop("networks", None)
            fragment["logging"] = _DOCKER_LOGGING
            # ``decnet.topology.service=true`` is the marker the collector
            # filters on — without it, log streams for this container are
            # never attached.
            labels = dict(fragment.get("labels") or {})
            labels.update({
                "decnet.topology.id": topology_id,
                "decnet.topology.decky": name,
                "decnet.topology.service_name": svc_name,
                "decnet.topology.service": "true",
            })
            fragment["labels"] = labels
            services[f"{name}-{svc_name}"] = fragment
    # Every LAN is declared as a pre-existing (external) Docker network.
    networks: dict[str, dict] = {
        _network_name(topology_id, lan["name"]): {
            "external": True,
            "name": _network_name(topology_id, lan["name"]),
        }
        for lan in lans
    }
    return {
        "version": "3.8",
        "services": services,
        "networks": networks,
    }
def write_topology_compose(hydrated: dict[str, Any], output_path: Path) -> Path:
    """Render the topology's compose dict as YAML into ``output_path``.

    Key order of the generated dict is kept (``sort_keys=False``) and
    block style is used throughout. Returns ``output_path`` unchanged.
    """
    compose = generate_topology_compose(hydrated)
    rendered = yaml.dump(compose, default_flow_style=False, sort_keys=False)
    output_path.write_text(rendered)
    return output_path

113
decnet/topology/config.py Normal file
View File

@@ -0,0 +1,113 @@
"""MazeNET topology config + in-memory generation output."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Optional
from pydantic import BaseModel, Field, model_validator
class TopologyConfig(BaseModel):
    """Parameters driving :func:`decnet.topology.generator.generate`.

    Field constraints are enforced by the ``Field`` declarations below;
    the cross-field rules live in :meth:`_check_min_max`.
    """

    name: str = Field(..., min_length=1, max_length=64)
    # Only the two literal values in the pattern are accepted.
    mode: str = Field(default="unihost", pattern=r"^(unihost|agent)$")
    # Topology shape
    depth: int = Field(..., ge=1, le=16, description="Max depth from DMZ")
    branching_factor: int = Field(..., ge=1, le=8, description="Max child LANs per LAN")
    deckies_per_lan_min: int = Field(default=1, ge=0, le=32)
    deckies_per_lan_max: int = Field(default=3, ge=1, le=32)
    # Probability a given non-DMZ LAN's connection to its parent uses a
    # bridge decky that forwards L3 (enables attacker pivot). Bridge
    # existence between parent/child is implicit — every non-DMZ LAN
    # has exactly one parent bridge. This controls *forwarding*, not
    # the existence of the bridge.
    bridge_forward_probability: float = Field(default=1.0, ge=0.0, le=1.0)
    # Probability of injecting a DAG cross-edge: a decky also bridged
    # from its LAN to a non-parent, non-child LAN. 0.0 yields a tree.
    cross_edge_probability: float = Field(default=0.0, ge=0.0, le=1.0)
    # IP allocation base. LANs get sequential /24s carved out of this
    # network. Accepts either a full CIDR (e.g. ``172.16.0.0/12`` for
    # 4096 slots) or the legacy two-octet shorthand ``172.20`` which
    # auto-lifts to ``172.20.0.0/16`` (256 slots). Default is a /12
    # so mass-scale topologies (depth/branching trees with >256 LANs)
    # don't exhaust the pool on first generation.
    # The pattern only checks the textual shape (digit counts); it does
    # not range-check octets or the prefix length.
    subnet_base_prefix: str = Field(
        default="172.16.0.0/12",
        pattern=r"^\d{1,3}\.\d{1,3}(\.\d{1,3}\.\d{1,3}/\d{1,2})?$",
    )
    # Service selection — reuses decnet.fleet.build_deckies' randomizer.
    randomize_services: bool = Field(default=True)
    services_explicit: Optional[list[str]] = None
    # RNG seed for generation; None leaves the generator unseeded.
    seed: Optional[int] = Field(default=None, ge=0)

    @model_validator(mode="after")
    def _check_min_max(self) -> "TopologyConfig":
        """Enforce the cross-field rules after per-field validation.

        Raises ``ValueError`` when the per-LAN decky minimum exceeds the
        maximum, or when service randomisation is off and no explicit
        service list was given.
        """
        if self.deckies_per_lan_min > self.deckies_per_lan_max:
            raise ValueError(
                "deckies_per_lan_min must be <= deckies_per_lan_max"
            )
        if not self.randomize_services and not self.services_explicit:
            raise ValueError(
                "either randomize_services=True or services_explicit must be set"
            )
        return self
@dataclass
class _PlannedLAN:
    """In-memory LAN record emitted by the generator."""

    # Name, unique within the topology.
    name: str
    # The LAN's subnet in CIDR text form.
    subnet: str
    # True for the DMZ LAN.
    is_dmz: bool
    parent: Optional[str]  # name of parent LAN, None for DMZ
    # Canvas coordinates — generator leaves them None; the web editor
    # (or a future auto-layouter) fills them in.
    x: Optional[float] = None
    y: Optional[float] = None
@dataclass
class _PlannedDecky:
    """In-memory decky record emitted by the generator."""

    # Name, unique within the topology.
    name: str
    # Names of the services assigned to this decky.
    services: list[str]
    # Mapping LAN-name → assigned IP within that LAN's subnet.
    ips_by_lan: dict[str, str] = field(default_factory=dict)
    forwards_l3: bool = False  # only meaningful when present on ≥2 LANs
    # Per-service config overrides: {service_name: {field: value}}.
    # Mirrors ``DeckyConfig.service_config`` from the flat-fleet path;
    # services read these via ``compose_fragment(service_cfg=...)``.
    service_config: dict[str, dict] = field(default_factory=dict)
    # Canvas coordinates — see _PlannedLAN.x/y.
    x: Optional[float] = None
    y: Optional[float] = None
@dataclass
class _PlannedEdge:
    """In-memory (decky, LAN) membership edge."""

    # Name of the decky that sits on the LAN.
    decky_name: str
    # Name of the LAN the decky is attached to.
    lan_name: str
    # True once the decky is attached to more than one LAN.
    is_bridge: bool
    # Copy of the owning decky's L3-forwarding flag.
    forwards_l3: bool
@dataclass
class GeneratedTopology:
    """Full in-memory output of :func:`decnet.topology.generator.generate`.

    Names are unique within the topology. No UUIDs are assigned here —
    those are minted by :mod:`decnet.topology.persistence` when the
    topology is written to the repo.
    """

    # The config the plan was generated from.
    config: TopologyConfig
    # Planned LANs, deckies, and (decky, LAN) membership edges.
    lans: list[_PlannedLAN]
    deckies: list[_PlannedDecky]
    edges: list[_PlannedEdge]

View File

@@ -0,0 +1,237 @@
"""MazeNET topology generator.
Produces a :class:`GeneratedTopology` — an in-memory DAG of LANs and
multi-homed deckies. Deterministic under ``config.seed``: the same seed
always yields the same structure, service assignments, and IP layout.
The generator only plans the structure. Persisting UUIDs to the repo
is :mod:`decnet.topology.persistence`; spawning Docker networks and
containers is :mod:`decnet.engine.deployer`.
"""
from __future__ import annotations
import random
from typing import Optional
from decnet.fleet import all_service_names
from decnet.topology.allocator import IPAllocator, SubnetAllocator
from decnet.topology.config import (
GeneratedTopology,
TopologyConfig,
_PlannedDecky,
_PlannedEdge,
_PlannedLAN,
)
# Range of services per randomly assigned decky (matches decnet.fleet).
# Both bounds are inclusive; the upper bound is additionally capped by
# the size of the service pool at pick time.
_SVC_MIN = 1
_SVC_MAX = 3
def _plan_lans(
    config: TopologyConfig,
    rng: random.Random,
    subnets: SubnetAllocator,
) -> list[_PlannedLAN]:
    """Lay out the LAN tree, breadth first, ``config.depth`` levels deep.

    The DMZ root is ``LAN-00``. On every level each LAN of the previous
    level receives between 1 and ``config.branching_factor`` children.
    Names follow creation order and each LAN takes the next free subnet
    from ``subnets``, which the caller may have pre-seeded with
    reservations from other topologies.
    """
    root = _PlannedLAN(
        name="LAN-00", subnet=subnets.next_free(), is_dmz=True, parent=None
    )
    planned: list[_PlannedLAN] = [root]
    current_level: list[_PlannedLAN] = [root]

    for _level in range(1, config.depth + 1):
        spawned: list[_PlannedLAN] = []
        for parent_lan in current_level:
            fanout = rng.randint(1, config.branching_factor)  # nosec B311
            for _ in range(fanout):
                # The running LAN count doubles as the name suffix.
                child = _PlannedLAN(
                    name=f"LAN-{len(planned):02d}",
                    subnet=subnets.next_free(),
                    is_dmz=False,
                    parent=parent_lan.name,
                )
                planned.append(child)
                spawned.append(child)
        current_level = spawned
        if not current_level:
            break
    return planned
def _pick_services(
rng: random.Random,
services_explicit: Optional[list[str]],
pool: list[str],
used_combos: set[frozenset],
) -> list[str]:
if services_explicit:
return list(services_explicit)
if not pool:
return []
attempts = 0
while True:
count = rng.randint(_SVC_MIN, min(_SVC_MAX, len(pool))) # nosec B311
chosen = frozenset(rng.sample(pool, count)) # nosec B311
attempts += 1
if chosen not in used_combos or attempts > 20:
break
used_combos.add(chosen)
return list(chosen)
def generate(
    config: TopologyConfig,
    *,
    reserved_subnets: Optional[set[str]] = None,
) -> GeneratedTopology:
    """Generate a topology plan deterministically under ``config.seed``.

    The caller is responsible for persisting the plan via
    :mod:`decnet.topology.persistence` and then deploying it.

    ``reserved_subnets`` (optional): /24s already claimed by other
    topologies. The subnet allocator skips these so two concurrent
    drafts can't collide. Populate via
    :func:`decnet.topology.allocator.reserved_subnets`.

    Planning happens in four steps that share one RNG, so their order
    matters for reproducibility:

    1. plan the LAN tree;
    2. populate every LAN with its home deckies;
    3. multi-home one decky of each non-DMZ LAN into the parent LAN;
    4. optionally add cross-edges to unrelated LANs.

    Allocator exhaustion (subnets or per-LAN IPs) propagates to the
    caller as raised by the allocators.
    """
    rng = random.Random(config.seed)  # nosec B311
    svc_pool = all_service_names() if config.randomize_services else []
    used_combos: set[frozenset] = set()

    subnets = SubnetAllocator(
        config.subnet_base_prefix, reserved=reserved_subnets or set()
    )
    lans = _plan_lans(config, rng, subnets)

    # Per-LAN IP allocators for deterministic assignment.
    ip_allocs: dict[str, IPAllocator] = {
        lan.name: IPAllocator(lan.subnet) for lan in lans
    }

    deckies: list[_PlannedDecky] = []
    edges: list[_PlannedEdge] = []
    # Home deckies of each LAN, in creation order. Bridges and
    # cross-edges pick their candidates from here.
    deckies_by_lan: dict[str, list[_PlannedDecky]] = {lan.name: [] for lan in lans}
    decky_counter = 0

    def _take_ip(lan_name: str) -> str:
        return ip_allocs[lan_name].next_free()

    def _new_decky(home_lan: str) -> _PlannedDecky:
        """Create a decky with services and an IP on its home LAN."""
        nonlocal decky_counter
        decky_counter += 1
        name = f"decky-{decky_counter:03d}"
        services = _pick_services(
            rng, config.services_explicit, svc_pool, used_combos
        )
        decky = _PlannedDecky(
            name=name,
            services=services,
            ips_by_lan={home_lan: _take_ip(home_lan)},
        )
        deckies.append(decky)
        return decky

    def _bridge_into(decky: _PlannedDecky, target_lan: str) -> None:
        """Multi-home ``decky`` onto ``target_lan`` and record the edge."""
        decky.ips_by_lan[target_lan] = _take_ip(target_lan)
        forwards = rng.random() < config.bridge_forward_probability  # nosec B311
        # Forwarding is sticky: once a decky forwards it keeps doing so.
        decky.forwards_l3 = decky.forwards_l3 or forwards
        # Mark the decky's existing edges as bridge edges, then add the
        # new edge connecting it to the target LAN.
        for e in edges:
            if e.decky_name == decky.name:
                e.is_bridge = True
                e.forwards_l3 = decky.forwards_l3
        edges.append(
            _PlannedEdge(
                decky_name=decky.name,
                lan_name=target_lan,
                is_bridge=True,
                forwards_l3=decky.forwards_l3,
            )
        )

    # Populate each LAN with its own deckies.
    for lan in lans:
        if lan.is_dmz:
            count = 1  # single DMZ decky (deaddeck)
        else:
            count = rng.randint(  # nosec B311
                config.deckies_per_lan_min, config.deckies_per_lan_max
            )
            if count < 1:
                count = 1  # every LAN needs ≥1 decky to host the bridge
        for _ in range(count):
            decky = _new_decky(lan.name)
            deckies_by_lan[lan.name].append(decky)
            edges.append(
                _PlannedEdge(
                    decky_name=decky.name,
                    lan_name=lan.name,
                    is_bridge=False,
                    forwards_l3=False,
                )
            )

    # Parent↔child bridges. For every non-DMZ LAN, pick one of its
    # deckies and multi-home it to the parent LAN. This decky becomes
    # the bridge between the two segments.
    for lan in lans:
        if lan.is_dmz or lan.parent is None:
            continue
        bridge = rng.choice(deckies_by_lan[lan.name])  # nosec B311
        _bridge_into(bridge, lan.parent)

    # Cross-edges: with probability p, pick a non-parent, non-child,
    # non-self LAN and attach a random decky to it too. Turns the tree
    # into a DAG. Only rolls on non-DMZ LANs with ≥1 candidate peer.
    if config.cross_edge_probability > 0:
        for lan in lans:
            if lan.is_dmz:
                continue
            if rng.random() >= config.cross_edge_probability:  # nosec B311
                continue
            forbidden = {lan.name, lan.parent}
            forbidden |= {c.name for c in lans if c.parent == lan.name}
            peers = [p for p in lans if p.name not in forbidden]
            if not peers:
                continue
            peer = rng.choice(peers)  # nosec B311
            decky = rng.choice(deckies_by_lan[lan.name])  # nosec B311
            if peer.name in decky.ips_by_lan:
                continue  # already connected, skip
            _bridge_into(decky, peer.name)

    return GeneratedTopology(
        config=config, lans=lans, deckies=deckies, edges=edges
    )

View File

@@ -0,0 +1,65 @@
"""Canonical hash of a hydrated topology dict.
Both master and agent need to agree on "is the applied state the one
the master intends?". We answer that by hashing the hydrated topology
blob on both sides and comparing the hex digests. The function has to
be **pure** and **deterministic**: same logical state → same hash, no
matter the dict-key order, no matter the timezone of a ``created_at``.
Normalisation rules (applied to a deep copy — input is never mutated):
- Drop fields that change on every read but don't change behaviour:
``created_at``, ``status_changed_at``, ``updated_at``, ``last_seen``,
``status``, ``version``, ``last_error``.
- Drop purely-cosmetic canvas positions (``x``, ``y``, ``w``, ``h``)
everywhere — they're client-side layout, not deployment state.
- Leave everything else alone; sort-keys=True + ``separators``
collapse whitespace and fix ordering.
"""
from __future__ import annotations
import hashlib
import json
from typing import Any
# Fields that vary over time or come from layout and must NOT feed the
# applied-state hash. Dropped at every nesting level, i.e. a key with
# one of these names is removed wherever it appears in the blob.
_VOLATILE_KEYS = frozenset(
    {
        "created_at",
        "status_changed_at",
        "updated_at",
        "last_seen",
        "status",
        "version",
        "last_error",
        "x",
        "y",
        "w",
        "h",
    }
)
def _strip(value: Any) -> Any:
"""Return a deep copy of *value* with volatile keys removed."""
if isinstance(value, dict):
return {k: _strip(v) for k, v in value.items() if k not in _VOLATILE_KEYS}
if isinstance(value, list):
return [_strip(v) for v in value]
return value
def canonical_hash(hydrated: dict) -> str:
    """Hash the canonical JSON form of *hydrated* with SHA-256.

    Volatile keys are stripped first. The remainder is serialised with
    sorted keys and no insignificant whitespace; values JSON cannot
    encode natively are passed through ``str``. Returns the hex digest.
    """
    canonical = json.dumps(
        _strip(hydrated),
        sort_keys=True,
        separators=(",", ":"),
        default=str,
    )
    digest = hashlib.sha256(canonical.encode("utf-8"))
    return digest.hexdigest()
__all__ = ["canonical_hash"]

View File

@@ -0,0 +1,218 @@
"""Adapter between :class:`GeneratedTopology` and the repository layer."""
from __future__ import annotations
from ipaddress import IPv4Address, IPv4Network
from typing import Any
from decnet.topology.allocator import IPAllocator
from decnet.topology.config import GeneratedTopology
from decnet.topology.status import TopologyStatus, assert_transition
async def persist(
    repo: Any,
    plan: GeneratedTopology,
    *,
    target_host_uuid: str | None = None,
) -> str:
    """Write a generated plan to the repo as a ``pending`` topology.

    Returns the newly created topology id.

    The rows are written by a sequence of individual repo calls
    (topology, LANs, deckies, edges — in that order), not in one
    transaction. According to the original author's note each repo
    method commits on its own, which was judged good enough for initial
    create because the whole chain runs before any external side
    effects. To avoid leaving a half-written topology behind, the plan
    is validated before the first write.

    ``target_host_uuid`` — pin the topology to a specific swarm agent.
    Only meaningful when ``plan.config.mode == "agent"`` (caller
    validates; this function just stores what it's told).

    Raises ``ValueError`` if a planned decky has no LAN membership.
    """
    # Every decky needs at least one LAN to derive its primary IP from.
    # Checking up front gives a clear error; otherwise ``next()`` on an
    # empty mapping would raise StopIteration inside this coroutine,
    # which Python re-raises as an opaque RuntimeError — after rows have
    # already been written.
    for decky in plan.deckies:
        if not decky.ips_by_lan:
            raise ValueError(
                f"decky {decky.name!r} has no LAN membership (empty ips_by_lan)"
            )

    topology_id = await repo.create_topology(
        {
            "name": plan.config.name,
            "mode": plan.config.mode,
            "target_host_uuid": target_host_uuid,
            "config_snapshot": plan.config.model_dump(),
        }
    )

    # LAN name → repo id, needed to resolve edges below.
    lan_ids: dict[str, str] = {}
    for lan in plan.lans:
        lan_id = await repo.add_lan(
            {
                "topology_id": topology_id,
                "name": lan.name,
                "subnet": lan.subnet,
                "is_dmz": lan.is_dmz,
                "x": lan.x,
                "y": lan.y,
            }
        )
        lan_ids[lan.name] = lan_id

    # Decky name → repo uuid, needed to resolve edges below.
    decky_ids: dict[str, str] = {}
    for decky in plan.deckies:
        # Primary IP: the first LAN the decky was assigned to (insertion
        # order of ips_by_lan, which reflects generator ordering —
        # home LAN first, then any bridge targets).
        primary_lan = next(iter(decky.ips_by_lan))
        primary_ip = decky.ips_by_lan[primary_lan]
        decky_uuid = await repo.add_topology_decky(
            {
                "topology_id": topology_id,
                "name": decky.name,
                "services": decky.services,
                "decky_config": {
                    "name": decky.name,
                    "services": decky.services,
                    "ips_by_lan": decky.ips_by_lan,
                    "forwards_l3": decky.forwards_l3,
                    "service_config": decky.service_config,
                },
                "ip": primary_ip,
                "x": decky.x,
                "y": decky.y,
            }
        )
        decky_ids[decky.name] = decky_uuid

    for edge in plan.edges:
        await repo.add_topology_edge(
            {
                "topology_id": topology_id,
                "decky_uuid": decky_ids[edge.decky_name],
                "lan_id": lan_ids[edge.lan_name],
                "is_bridge": edge.is_bridge,
                "forwards_l3": edge.forwards_l3,
            }
        )
    return topology_id
async def transition_status(
    repo: Any,
    topology_id: str,
    new_status: str,
    reason: str | None = None,
) -> None:
    """Move a topology to ``new_status`` if the state machine allows it.

    Raises ``ValueError`` when the topology does not exist, and
    :class:`decnet.topology.status.TopologyStatusError` when its current
    status cannot legally transition to ``new_status``. The status is
    only written after the transition has been validated.
    """
    row = await repo.get_topology(topology_id)
    if row is None:
        raise ValueError(f"topology {topology_id!r} not found")
    current_status = row["status"]
    assert_transition(current_status, new_status)
    await repo.update_topology_status(topology_id, new_status, reason=reason)
async def hydrate(repo: Any, topology_id: str) -> dict[str, Any] | None:
    """Load a topology and its child rows into one dict.

    Shape::

        {
            "topology": { ...row... },
            "lans": [ {...}, ... ],
            "deckies": [ {...}, ... ],
            "edges": [ {...}, ... ],
        }

    Decky rows have their ``decky_config`` normalised in place before
    being returned. Returns ``None`` if the topology does not exist.
    """
    topology_row = await repo.get_topology(topology_id)
    if topology_row is None:
        return None

    lan_rows = await repo.list_lans_for_topology(topology_id)
    decky_rows = await repo.list_topology_deckies(topology_id)
    edge_rows = await repo.list_topology_edges(topology_id)
    _backfill_decky_configs(lan_rows, decky_rows, edge_rows)

    result: dict[str, Any] = {"topology": topology_row}
    result["lans"] = lan_rows
    result["deckies"] = decky_rows
    result["edges"] = edge_rows
    return result
def _backfill_decky_configs(
lans: list[dict[str, Any]],
deckies: list[dict[str, Any]],
edges: list[dict[str, Any]],
) -> None:
"""Fill in ``decky_config['name']`` and ``ips_by_lan`` for UI-created rows.
The generator path writes these fields at persist-time; the REST
CRUD path writes whatever the client sends (often just archetype
flags). Compose generation requires both, so we normalise here so
every write path feeds the same shape downstream.
"""
lans_by_id = {lan["id"]: lan for lan in lans}
allocators: dict[str, IPAllocator] = {}
def _alloc(lan_id: str) -> IPAllocator | None:
lan = lans_by_id.get(lan_id)
if lan is None or not lan.get("subnet"):
return None
if lan_id not in allocators:
allocators[lan_id] = IPAllocator(lan["subnet"])
return allocators[lan_id]
decky_edges: dict[str, list[str]] = {}
for e in edges:
decky_edges.setdefault(e["decky_uuid"], []).append(e["lan_id"])
ordered = sorted(deckies, key=lambda d: (d.get("name", ""), d["uuid"]))
# Pass 1: reserve IPs already declared in decky_config.
for decky in ordered:
cfg = decky.get("decky_config") or {}
existing = cfg.get("ips_by_lan") or {}
for lan_id in decky_edges.get(decky["uuid"], []):
lan = lans_by_id.get(lan_id)
if lan is None:
continue
alloc = _alloc(lan_id)
if alloc is None:
continue
ip = existing.get(lan["name"])
if ip and alloc.is_free(ip):
alloc.reserve(ip)
# Pass 2: fill gaps; rewrite decky_config.
for decky in ordered:
cfg = dict(decky.get("decky_config") or {})
cfg.setdefault("name", decky.get("name"))
ips_by_lan: dict[str, str] = dict(cfg.get("ips_by_lan") or {})
primary_ip = decky.get("ip")
for lan_id in decky_edges.get(decky["uuid"], []):
lan = lans_by_id.get(lan_id)
if lan is None:
continue
if lan["name"] in ips_by_lan:
continue
alloc = _alloc(lan_id)
if alloc is None:
continue
ip: str | None = None
if primary_ip:
try:
if (
IPv4Address(primary_ip) in IPv4Network(lan["subnet"])
and alloc.is_free(primary_ip)
):
ip = primary_ip
alloc.reserve(ip)
except (ValueError, TypeError):
pass
if ip is None:
ip = alloc.next_free()
ips_by_lan[lan["name"]] = ip
cfg["ips_by_lan"] = ips_by_lan
decky["decky_config"] = cfg
# Re-export the status constants so callers can ``from decnet.topology.persistence
# import TopologyStatus`` without chasing modules.
__all__ = ["persist", "transition_status", "hydrate", "TopologyStatus"]

106
decnet/topology/status.py Normal file
View File

@@ -0,0 +1,106 @@
"""MazeNET topology status state machine.
Seven states — six active in v1. ``degraded`` is schema-reserved for the
future Healer worker and has no transitions into it from v1 code paths.
"""
from __future__ import annotations
class TopologyStatus:
    """String constants naming each topology lifecycle state."""

    PENDING = "pending"
    DEPLOYING = "deploying"
    ACTIVE = "active"
    DEGRADED = "degraded"
    FAILED = "failed"
    TEARING_DOWN = "tearing_down"
    TORN_DOWN = "torn_down"
    # Every known status; used to reject unknown status strings.
    ALL: frozenset[str] = frozenset(
        {PENDING, DEPLOYING, ACTIVE, DEGRADED, FAILED, TEARING_DOWN, TORN_DOWN}
    )
# Directed transitions. torn_down is terminal. degraded is unreachable
# in v1 (Healer would be the only writer), but its outbound edges stay
# defined so when Healer lands the state machine already accepts them.
# Maps each status to the set of statuses it may move to next; an empty
# set means no further transition is allowed.
_LEGAL: dict[str, frozenset[str]] = {
    TopologyStatus.PENDING: frozenset(
        {TopologyStatus.DEPLOYING, TopologyStatus.TORN_DOWN}
    ),
    TopologyStatus.DEPLOYING: frozenset(
        {
            TopologyStatus.ACTIVE,
            TopologyStatus.FAILED,
            TopologyStatus.DEGRADED,
            TopologyStatus.TEARING_DOWN,
        }
    ),
    TopologyStatus.ACTIVE: frozenset(
        {TopologyStatus.DEGRADED, TopologyStatus.TEARING_DOWN}
    ),
    TopologyStatus.DEGRADED: frozenset(
        {TopologyStatus.ACTIVE, TopologyStatus.TEARING_DOWN}
    ),
    # A failed topology can only be torn down.
    TopologyStatus.FAILED: frozenset({TopologyStatus.TEARING_DOWN}),
    TopologyStatus.TEARING_DOWN: frozenset(
        {TopologyStatus.TORN_DOWN, TopologyStatus.DEGRADED}
    ),
    TopologyStatus.TORN_DOWN: frozenset(),
}
class TopologyStatusError(ValueError):
    """Signal an attempted topology status transition that is not legal."""

    pass
class TopologyNotEditable(RuntimeError):
    """Signal a pending-only mutation aimed at a non-pending topology.

    Pre-deploy edits (update_lan, delete_lan, update/delete decky,
    delete_edge) are legal only while the topology is ``pending``.
    Once deployed, the mutator's reconciler and the topology_mutations
    table take over.

    The offending ``status`` and the optional ``reason`` are kept as
    attributes; the reason is appended to the message when non-empty.
    """

    def __init__(self, *, status: str, reason: str = "") -> None:
        self.status = status
        self.reason = reason
        message = f"topology not editable (status={status!r})"
        if reason:
            message += f": {reason}"
        super().__init__(message)
class VersionConflict(RuntimeError):
    """Signal a topology write made with a stale ``expected_version``.

    This is the optimistic-concurrency guard: the caller supplied the
    version it last saw, but someone else has mutated the topology
    since. The caller is expected to re-read and retry.

    Both version numbers are kept as the ``current`` and ``expected``
    attributes.
    """

    def __init__(self, *, current: int, expected: int) -> None:
        self.current = current
        self.expected = expected
        message = (
            f"topology version conflict: expected {expected}, current is {current}"
        )
        super().__init__(message)
def assert_transition(current: str, new: str) -> None:
    """Validate ``current → new`` or raise :class:`TopologyStatusError`.

    Both statuses must be known values of :class:`TopologyStatus`, and
    ``new`` must be one of the legal successors of ``current``. Returns
    ``None`` when the transition is allowed.
    """
    if current not in TopologyStatus.ALL:
        raise TopologyStatusError(f"unknown current status: {current!r}")
    if new not in TopologyStatus.ALL:
        raise TopologyStatusError(f"unknown new status: {new!r}")
    if new not in _LEGAL[current]:
        # Separate the two statuses; without the arrow the message read
        # as one run-together token such as 'pending''active'.
        raise TopologyStatusError(
            f"illegal transition: {current!r} -> {new!r}"
        )
def legal_next(current: str) -> frozenset[str]:
    """Look up the statuses that may legally follow ``current``.

    Raises:
        TopologyStatusError: if ``current`` has no entry in ``_LEGAL``.
    """
    if current in _LEGAL:
        return _LEGAL[current]
    raise TopologyStatusError(f"unknown status: {current!r}")

356
decnet/topology/validate.py Normal file
View File

@@ -0,0 +1,356 @@
"""Pre-deploy validator for MazeNET topologies.
Consumes a hydrated dict (output of
:func:`decnet.topology.persistence.hydrate`) and returns a list of
:class:`ValidationIssue` records. The deployer calls :func:`validate`
before transitioning to ``DEPLOYING`` and refuses to proceed if any
issue has ``severity=="error"``.
Rules are independent functions so the web editor can surface them as
inline diagnostics without running the full list.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from ipaddress import IPv4Address, IPv4Network
from typing import Any, Callable, Literal
from decnet.fleet import all_service_names
from decnet.services.registry import get_service
# Closed set of severities a rule may report.
Severity = Literal["error", "warning"]


@dataclass
class ValidationIssue:
    """One finding produced by a validation rule."""

    # "error" or "warning"; see ``Severity``.
    severity: Severity
    # Stable machine-readable identifier, e.g. "DMZ_MISSING".
    code: str
    # Human-readable description of the finding.
    message: str
    # Optional pointer to the offending object(s); a fresh dict per instance.
    target: dict = field(default_factory=dict)
class ValidationError(Exception):
    """Raised by the deployer when a topology fails pre-deploy checks.

    The full issue list (warnings included) is kept on ``issues``; the
    exception message counts and summarises only the ``"error"`` ones.
    """

    def __init__(self, issues: list[ValidationIssue]) -> None:
        self.issues = issues
        fatal = []
        for issue in issues:
            if issue.severity == "error":
                fatal.append(issue)
        summary = "; ".join(f"[{i.code}] {i.message}" for i in fatal)
        super().__init__(
            f"{len(fatal)} topology validation error(s): " + summary
        )
# --------------------------------------------------------------------- rules
def check_exactly_one_dmz(h: dict[str, Any]) -> list[ValidationIssue]:
    """Require that precisely one LAN carries a truthy ``is_dmz`` flag.

    Returns an empty list when exactly one such LAN exists, a single
    ``DMZ_MISSING`` error when there is none, and a single
    ``DMZ_MULTIPLE`` error (listing the LAN names in ``target``) when
    there are several.
    """
    flagged = [entry for entry in h["lans"] if entry.get("is_dmz")]
    count = len(flagged)
    if count == 0:
        return [
            ValidationIssue("error", "DMZ_MISSING", "no LAN is marked is_dmz=True")
        ]
    if count > 1:
        names = [entry["name"] for entry in flagged]
        return [
            ValidationIssue(
                "error",
                "DMZ_MULTIPLE",
                f"{count} LANs marked is_dmz=True; exactly one allowed",
                target={"lans": names},
            )
        ]
    return []
def check_all_lans_connected_to_dmz(
    h: dict[str, Any],
) -> list[ValidationIssue]:
    """Require every LAN to be reachable from the DMZ through bridge deckies.

    Two LANs are adjacent when at least one decky has an edge to both.
    Starting from the first LAN flagged ``is_dmz``, a breadth-first walk
    collects every reachable LAN; any LAN left over is reported in a
    single ``DMZ_ORPHAN`` error.

    Returns an empty list when there are no LANs, when no DMZ is flagged
    (that case belongs to :func:`check_exactly_one_dmz`), or when every
    LAN is reachable.

    Edges whose ``lan_id`` does not match any LAN in ``h["lans"]`` are
    ignored: they cannot contribute to reachability between known LANs,
    and indexing the adjacency map with them would raise ``KeyError``.
    """
    lans = {lan["id"]: lan for lan in h["lans"]}
    if not lans:
        return []
    dmz = next((lan for lan in h["lans"] if lan.get("is_dmz")), None)
    if dmz is None:
        return []  # covered by check_exactly_one_dmz

    # Group the known LANs each decky is attached to.
    decky_lans: dict[str, set[str]] = {}
    for edge in h["edges"]:
        lan_id = edge["lan_id"]
        if lan_id not in lans:
            continue  # dangling edge; see docstring
        decky_lans.setdefault(edge["decky_uuid"], set()).add(lan_id)

    # Adjacency: LANs share an edge if ≥1 bridge decky is attached to both.
    adj: dict[str, set[str]] = {lid: set() for lid in lans}
    for lan_ids in decky_lans.values():
        for lid in lan_ids:
            adj[lid] |= lan_ids - {lid}

    # Level-by-level BFS outward from the DMZ.
    reachable = {dmz["id"]}
    frontier = [dmz["id"]]
    while frontier:
        nxt: list[str] = []
        for lid in frontier:
            for peer in adj[lid]:
                if peer not in reachable:
                    reachable.add(peer)
                    nxt.append(peer)
        frontier = nxt

    orphans = [lans[lid]["name"] for lid in lans if lid not in reachable]
    if not orphans:
        return []
    return [
        ValidationIssue(
            "error",
            "DMZ_ORPHAN",
            f"LAN(s) have no bridge path to the DMZ: {', '.join(orphans)}",
            target={"lans": orphans},
        )
    ]
def check_no_orphan_deckies(h: dict[str, Any]) -> list[ValidationIssue]:
    """Report every decky that does not appear in any edge.

    One ``DECKY_ORPHAN`` error is produced per decky whose ``uuid`` is
    absent from the ``decky_uuid`` values of ``h["edges"]``, in the order
    the deckies are listed.
    """
    wired: set[str] = set()
    for edge in h["edges"]:
        wired.add(edge["decky_uuid"])
    return [
        ValidationIssue(
            "error",
            "DECKY_ORPHAN",
            f"decky {decky['name']!r} has no LAN edges",
            target={"decky": decky["name"]},
        )
        for decky in h["deckies"]
        if decky["uuid"] not in wired
    ]
def check_names_unique(h: dict[str, Any]) -> list[ValidationIssue]:
    """Report repeated LAN names and repeated decky names.

    LANs and deckies are checked separately. Each
    occurrence of a name after its first yields one error
    (``LAN_NAME_DUP`` or ``DECKY_NAME_DUP``); LAN findings precede decky
    findings in the returned list.
    """
    found: list[ValidationIssue] = []

    lan_names: set[str] = set()
    for lan in h["lans"]:
        if lan["name"] not in lan_names:
            lan_names.add(lan["name"])
            continue
        found.append(
            ValidationIssue(
                "error",
                "LAN_NAME_DUP",
                f"duplicate LAN name {lan['name']!r}",
                target={"lan": lan["name"]},
            )
        )

    decky_names: set[str] = set()
    for decky in h["deckies"]:
        if decky["name"] not in decky_names:
            decky_names.add(decky["name"])
            continue
        found.append(
            ValidationIssue(
                "error",
                "DECKY_NAME_DUP",
                f"duplicate decky name {decky['name']!r}",
                target={"decky": decky["name"]},
            )
        )
    return found
def check_no_ip_collisions(h: dict[str, Any]) -> list[ValidationIssue]:
    """Check each decky's per-LAN IP claims.

    For every ``lan_name → ip`` entry in a decky's
    ``decky_config["ips_by_lan"]`` the rule reports:

    * ``IP_UNKNOWN_LAN`` when no LAN has that name;
    * ``IP_MALFORMED`` when the IP itself cannot be parsed;
    * ``IP_OUT_OF_SUBNET`` when the IP lies outside the LAN's subnet;
    * ``IP_COLLISION`` when another decky already claimed the same IP
      string in the same LAN.

    A LAN whose subnet cannot be parsed is reported by
    :func:`check_no_subnet_overlap` as ``SUBNET_MALFORMED``. This rule
    only skips the containment test for such a LAN, so a valid IP is not
    misreported as malformed and collisions inside the LAN are still
    detected. A missing or ``None`` ``decky_config`` / ``ips_by_lan`` is
    treated as "no claims".
    """
    lans_by_name = {lan["name"]: lan for lan in h["lans"]}
    # lan_name → parsed subnet, or None when the subnet string is unparseable.
    # Cached so each LAN's subnet is parsed at most once.
    parsed_nets: dict[str, IPv4Network | None] = {}
    per_lan_ips: dict[str, dict[str, str]] = {}  # lan_name → {ip: decky_name}
    issues: list[ValidationIssue] = []
    for d in h["deckies"]:
        ips_by_lan: dict[str, str] = (
            (d.get("decky_config") or {}).get("ips_by_lan") or {}
        )
        for lan_name, ip in ips_by_lan.items():
            lan = lans_by_name.get(lan_name)
            if lan is None:
                issues.append(
                    ValidationIssue(
                        "error",
                        "IP_UNKNOWN_LAN",
                        f"decky {d['name']!r} claims IP in unknown LAN "
                        f"{lan_name!r}",
                        target={"decky": d["name"], "lan": lan_name},
                    )
                )
                continue

            # Parse the IP on its own so only a bad IP yields IP_MALFORMED.
            try:
                addr = IPv4Address(ip)
            except (ValueError, TypeError):
                issues.append(
                    ValidationIssue(
                        "error",
                        "IP_MALFORMED",
                        f"decky {d['name']!r}: malformed IP {ip!r}",
                        target={"decky": d["name"], "ip": ip},
                    )
                )
                continue

            if lan_name not in parsed_nets:
                try:
                    parsed_nets[lan_name] = IPv4Network(lan["subnet"])
                except (ValueError, TypeError):
                    parsed_nets[lan_name] = None
            net = parsed_nets[lan_name]

            # Out-of-subnet check (skipped when the subnet is unparseable).
            if net is not None and addr not in net:
                issues.append(
                    ValidationIssue(
                        "error",
                        "IP_OUT_OF_SUBNET",
                        f"{ip} not inside {lan['subnet']} "
                        f"(decky {d['name']!r}, LAN {lan_name!r})",
                        target={"decky": d["name"], "lan": lan_name, "ip": ip},
                    )
                )

            bucket = per_lan_ips.setdefault(lan_name, {})
            if ip in bucket:
                issues.append(
                    ValidationIssue(
                        "error",
                        "IP_COLLISION",
                        f"IP {ip} claimed by both {bucket[ip]!r} and "
                        f"{d['name']!r} in LAN {lan_name!r}",
                        target={
                            "lan": lan_name,
                            "ip": ip,
                            "deckies": [bucket[ip], d["name"]],
                        },
                    )
                )
            else:
                bucket[ip] = d["name"]
    return issues
def check_no_subnet_overlap(h: dict[str, Any]) -> list[ValidationIssue]:
    """Report unparseable LAN subnets and pairwise overlaps between LANs.

    Each LAN whose ``subnet`` fails to parse yields ``SUBNET_MALFORMED``
    and is left out of the overlap comparison. Every unordered pair of
    the remaining LANs whose networks overlap yields one
    ``SUBNET_OVERLAP``.
    """
    problems: list[ValidationIssue] = []
    parsed: list[tuple[str, IPv4Network]] = []
    for lan in h["lans"]:
        lan_name = lan["name"]
        try:
            network = IPv4Network(lan["subnet"])
        except ValueError:
            problems.append(
                ValidationIssue(
                    "error",
                    "SUBNET_MALFORMED",
                    f"LAN {lan['name']!r}: malformed subnet {lan['subnet']!r}",
                    target={"lan": lan["name"]},
                )
            )
        else:
            parsed.append((lan_name, network))

    # Compare each network only with those after it, so a pair is seen once.
    for following, (first_name, first_net) in enumerate(parsed, start=1):
        for second_name, second_net in parsed[following:]:
            if not first_net.overlaps(second_net):
                continue
            problems.append(
                ValidationIssue(
                    "error",
                    "SUBNET_OVERLAP",
                    f"LAN {first_name!r} ({first_net}) overlaps LAN {second_name!r} ({second_net})",
                    target={"lans": [first_name, second_name]},
                )
            )
    return problems
def check_services_known(h: dict[str, Any]) -> list[ValidationIssue]:
    """Report every service name on a decky that the fleet does not know.

    The set of known names comes from ``all_service_names()``. One
    ``UNKNOWN_SERVICE`` error is produced per unknown entry in each
    decky's ``services`` list; a decky without that key has nothing to
    check.
    """
    registry = set(all_service_names())
    return [
        ValidationIssue(
            "error",
            "UNKNOWN_SERVICE",
            f"decky {decky['name']!r}: unknown service {service!r}",
            target={"decky": decky["name"], "service": service},
        )
        for decky in h["deckies"]
        for service in decky.get("services", [])
        if service not in registry
    ]
def check_service_config_shape(h: dict[str, Any]) -> list[ValidationIssue]:
    """Report ``service_config`` entries for services a decky does not declare.

    For each decky, every key of ``decky_config["service_config"]`` must
    also appear in the decky's ``services`` list; otherwise one
    ``SERVICE_CFG_UNDECLARED`` error is produced. A missing or falsy
    ``decky_config`` / ``service_config`` means there is nothing to check.
    """
    found: list[ValidationIssue] = []
    for decky in h["deckies"]:
        decky_cfg = decky.get("decky_config") or {}
        configured = decky_cfg.get("service_config") or {}
        enabled = set(decky.get("services", []))
        found.extend(
            ValidationIssue(
                "error",
                "SERVICE_CFG_UNDECLARED",
                f"decky {decky['name']!r}: service_config for "
                f"{name!r} but service not in services list",
                target={"decky": decky["name"], "service": name},
            )
            for name in configured
            if name not in enabled
        )
    return found
def check_no_host_port_collision(h: dict[str, Any]) -> list[ValidationIssue]:
    """Warn about gateway service ports that are already listening on the host.

    Only gateway deckies (``forwards_l3=True`` in decky_config) publish
    ports (see decnet/topology/compose.py), so only their services are
    considered; services that are unknown to the registry or flagged
    ``fleet_singleton`` are skipped. When several gateways want the same
    port, the first one encountered is named in the warning.

    Best-effort: if ``psutil`` cannot be imported or probing the host
    fails for any reason, no issues are returned.
    """
    wanted: dict[int, str] = {}  # host_port → gateway decky name
    for decky in h["deckies"]:
        decky_cfg = decky.get("decky_config") or {}
        if not decky_cfg.get("forwards_l3"):
            continue
        for service_name in decky.get("services", []):
            service = get_service(service_name)
            if service is None:
                continue
            if getattr(service, "fleet_singleton", False):
                continue
            ports = getattr(service, "ports", []) or []
            for port in ports:
                wanted.setdefault(int(port), decky["name"])
    if not wanted:
        return []

    # Imported lazily and guarded broadly on purpose: this rule must never
    # break validation on hosts where psutil is absent or probing is denied.
    try:
        import psutil  # type: ignore

        listening = {
            conn.laddr.port
            for conn in psutil.net_connections(kind="inet")
            if conn.status == psutil.CONN_LISTEN and conn.laddr
        }
    except Exception:
        return []

    return [
        ValidationIssue(
            "warning",
            "PORT_COLLISION",
            f"host port {port} is already bound; "
            f"gateway {decky_name!r} may fail to publish it",
            target={"decky": decky_name, "port": port},
        )
        for port, decky_name in wanted.items()
        if port in listening
    ]
# Pure-data rules, applied by validate() in the order listed here (which
# is therefore also the order of the returned issues). Host-state rules
# (like PORT_COLLISION) are *not* listed here — they're called separately
# by the live deployer so that unit tests exercising validate() stay
# hermetic.
_RULES: list[Callable[[dict[str, Any]], list[ValidationIssue]]] = [
    check_exactly_one_dmz,
    check_all_lans_connected_to_dmz,
    check_no_orphan_deckies,
    check_names_unique,
    check_no_ip_collisions,
    check_no_subnet_overlap,
    check_services_known,
    check_service_config_shape,
]
def validate(hydrated: dict[str, Any]) -> list[ValidationIssue]:
    """Apply each rule in ``_RULES`` and concatenate their findings.

    The result is a flat list in rule order; it is empty when no rule
    reports anything.
    """
    return [issue for rule in _RULES for issue in rule(hydrated)]
def errors(issues: list[ValidationIssue]) -> list[ValidationIssue]:
    """Return only the issues whose ``severity`` is ``"error"``, in order."""
    blocking: list[ValidationIssue] = []
    for issue in issues:
        if issue.severity == "error":
            blocking.append(issue)
    return blocking