From 096a35b24a1e4ca1e1f40e80ecc6686ce675d1ea Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 16:40:10 -0400 Subject: [PATCH 001/448] feat(mazenet): add topology schema to models.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces five new SQLModel tables for MazeNET (nested deception topologies): Topology, LAN, TopologyDecky, TopologyEdge, and TopologyStatusEvent. DeckyShard is intentionally not touched — TopologyDecky is a purpose-built sibling for MazeNET's lifecycle (topology-scoped UUIDs, per-topology name uniqueness). Part of MazeNET v1 (nested self-container network-of-networks). --- decnet/web/db/models.py | 107 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 106 insertions(+), 1 deletion(-) diff --git a/decnet/web/db/models.py b/decnet/web/db/models.py index 5d75bc73..c94f4392 100644 --- a/decnet/web/db/models.py +++ b/decnet/web/db/models.py @@ -1,6 +1,7 @@ from datetime import datetime, timezone from typing import Literal, Optional, Any, List, Annotated -from sqlalchemy import Column, Text +from uuid import uuid4 +from sqlalchemy import Column, Text, UniqueConstraint from sqlalchemy.dialects.mysql import MEDIUMTEXT from sqlmodel import SQLModel, Field from pydantic import BaseModel, ConfigDict, Field as PydanticField, BeforeValidator @@ -192,6 +193,110 @@ class AttackerBehavior(SQLModel, table=True): default_factory=lambda: datetime.now(timezone.utc), index=True ) +# --- MazeNET tables --- +# Nested deception topologies: an arbitrary-depth DAG of LANs connected by +# multi-homed "bridge" deckies. Purpose-built; disjoint from DeckyShard which +# remains SWARM-only. + +class Topology(SQLModel, table=True): + __tablename__ = "topologies" + id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) + name: str = Field(index=True, unique=True) + mode: str = Field(default="unihost") # unihost|agent + # Full TopologyConfig snapshot (including seed) used at generation time. 
+ config_snapshot: str = Field( + sa_column=Column("config_snapshot", _BIG_TEXT, nullable=False, default="{}") + ) + status: str = Field( + default="pending", index=True + ) # pending|deploying|active|degraded|failed|tearing_down|torn_down + status_changed_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc) + ) + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), index=True + ) + + +class LAN(SQLModel, table=True): + __tablename__ = "lans" + __table_args__ = (UniqueConstraint("topology_id", "name", name="uq_lan_topology_name"),) + id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) + topology_id: str = Field(foreign_key="topologies.id", index=True) + name: str + # Populated after the Docker network is created; nullable before deploy. + docker_network_id: Optional[str] = Field(default=None) + subnet: str + is_dmz: bool = Field(default=False) + + +class TopologyDecky(SQLModel, table=True): + """A decky belonging to a MazeNET topology. + + Disjoint from DeckyShard (which is SWARM-only). UUID PK; decky name is + unique only within a topology, so two topologies can both have a + ``decky-01`` without colliding. + """ + __tablename__ = "topology_deckies" + __table_args__ = ( + UniqueConstraint("topology_id", "name", name="uq_topology_decky_name"), + ) + uuid: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) + topology_id: str = Field(foreign_key="topologies.id", index=True) + name: str + # JSON list[str] of service names on this decky (snapshot of assignment). + services: str = Field( + sa_column=Column("services", _BIG_TEXT, nullable=False, default="[]") + ) + # Full serialised DeckyConfig snapshot — lets the dashboard render the + # same card shape as DeckyShard without a live round-trip. 
+ decky_config: Optional[str] = Field( + default=None, sa_column=Column("decky_config", _BIG_TEXT, nullable=True) + ) + ip: Optional[str] = Field(default=None) + # Same vocabulary as DeckyShard.state to keep dashboard rendering uniform. + state: str = Field( + default="pending", index=True + ) # pending|running|failed|torn_down|degraded|tearing_down|teardown_failed + last_error: Optional[str] = Field( + default=None, sa_column=Column("last_error", Text, nullable=True) + ) + compose_hash: Optional[str] = Field(default=None) + last_seen: Optional[datetime] = Field(default=None) + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc) + ) + + +class TopologyEdge(SQLModel, table=True): + """Membership edge: a decky attached to a LAN. + + A decky appearing in ≥2 edges is multi-homed (a bridge decky). + """ + __tablename__ = "topology_edges" + id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) + topology_id: str = Field(foreign_key="topologies.id", index=True) + decky_uuid: str = Field(foreign_key="topology_deckies.uuid", index=True) + lan_id: str = Field(foreign_key="lans.id", index=True) + is_bridge: bool = Field(default=False) + forwards_l3: bool = Field(default=False) + + +class TopologyStatusEvent(SQLModel, table=True): + """Append-only audit log of topology status transitions.""" + __tablename__ = "topology_status_events" + id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) + topology_id: str = Field(foreign_key="topologies.id", index=True) + from_status: str + to_status: str + at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), index=True + ) + reason: Optional[str] = Field( + default=None, sa_column=Column("reason", Text, nullable=True) + ) + + # --- API Request/Response Models (Pydantic) --- class Token(BaseModel): From 47cd200e1d9364e18d6067023b31a098157e6969 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 16:43:49 -0400 Subject: [PATCH 002/448] 
feat(mazenet): repo methods for topology/LAN/decky/edge/status events Adds topology CRUD to BaseRepository (NotImplementedError defaults) and implements them in SQLModelRepository: create/get/list/delete topologies, add/update/list LANs and TopologyDeckies, add/list edges, plus an atomic update_topology_status that appends a TopologyStatusEvent in the same transaction. Cascade delete sweeps children before the topology row. Covered by tests/topology/test_repo.py (roundtrip, per-topology name uniqueness, status event log, cascade delete, status filter) and an extension to tests/test_base_repo.py for the NotImplementedError surface. --- decnet/web/db/repository.py | 64 ++++++++++ decnet/web/db/sqlmodel_repo.py | 222 +++++++++++++++++++++++++++++++++ tests/test_base_repo.py | 14 +++ tests/topology/__init__.py | 0 tests/topology/test_repo.py | 166 ++++++++++++++++++++++++ 5 files changed, 466 insertions(+) create mode 100644 tests/topology/__init__.py create mode 100644 tests/topology/test_repo.py diff --git a/decnet/web/db/repository.py b/decnet/web/db/repository.py index d0513d4a..d7f1f53d 100644 --- a/decnet/web/db/repository.py +++ b/decnet/web/db/repository.py @@ -234,3 +234,67 @@ class BaseRepository(ABC): async def delete_decky_shard(self, decky_name: str) -> bool: raise NotImplementedError + + # ----------------------------------------------------------- mazenet + # MazeNET topology persistence. Default no-op / NotImplementedError so + # non-default backends stay functional; SQLModelRepository provides the + # real implementation used by SQLite and MySQL. 
+ + async def create_topology(self, data: dict[str, Any]) -> str: + raise NotImplementedError + + async def get_topology(self, topology_id: str) -> Optional[dict[str, Any]]: + raise NotImplementedError + + async def list_topologies( + self, status: Optional[str] = None + ) -> list[dict[str, Any]]: + raise NotImplementedError + + async def update_topology_status( + self, + topology_id: str, + new_status: str, + reason: Optional[str] = None, + ) -> None: + raise NotImplementedError + + async def delete_topology_cascade(self, topology_id: str) -> bool: + raise NotImplementedError + + async def add_lan(self, data: dict[str, Any]) -> str: + raise NotImplementedError + + async def update_lan(self, lan_id: str, fields: dict[str, Any]) -> None: + raise NotImplementedError + + async def list_lans_for_topology( + self, topology_id: str + ) -> list[dict[str, Any]]: + raise NotImplementedError + + async def add_topology_decky(self, data: dict[str, Any]) -> str: + raise NotImplementedError + + async def update_topology_decky( + self, decky_uuid: str, fields: dict[str, Any] + ) -> None: + raise NotImplementedError + + async def list_topology_deckies( + self, topology_id: str + ) -> list[dict[str, Any]]: + raise NotImplementedError + + async def add_topology_edge(self, data: dict[str, Any]) -> str: + raise NotImplementedError + + async def list_topology_edges( + self, topology_id: str + ) -> list[dict[str, Any]]: + raise NotImplementedError + + async def list_topology_status_events( + self, topology_id: str, limit: int = 100 + ) -> list[dict[str, Any]]: + raise NotImplementedError diff --git a/decnet/web/db/sqlmodel_repo.py b/decnet/web/db/sqlmodel_repo.py index b5f40f4c..910a4d8a 100644 --- a/decnet/web/db/sqlmodel_repo.py +++ b/decnet/web/db/sqlmodel_repo.py @@ -36,6 +36,11 @@ from decnet.web.db.models import ( AttackerBehavior, SwarmHost, DeckyShard, + Topology, + LAN, + TopologyDecky, + TopologyEdge, + TopologyStatusEvent, ) @@ -899,3 +904,220 @@ class 
SQLModelRepository(BaseRepository): ) await session.commit() return bool(result.rowcount) + + # ------------------------------------------------------------ mazenet + + @staticmethod + def _serialize_json_fields(data: dict[str, Any], keys: tuple[str, ...]) -> dict[str, Any]: + out = dict(data) + for k in keys: + v = out.get(k) + if v is not None and not isinstance(v, str): + out[k] = orjson.dumps(v).decode() + return out + + @staticmethod + def _deserialize_json_fields(d: dict[str, Any], keys: tuple[str, ...]) -> dict[str, Any]: + for k in keys: + v = d.get(k) + if isinstance(v, str): + try: + d[k] = json.loads(v) + except (json.JSONDecodeError, TypeError): + pass + return d + + async def create_topology(self, data: dict[str, Any]) -> str: + payload = self._serialize_json_fields(data, ("config_snapshot",)) + async with self._session() as session: + row = Topology(**payload) + session.add(row) + await session.commit() + await session.refresh(row) + return row.id + + async def get_topology(self, topology_id: str) -> Optional[dict[str, Any]]: + async with self._session() as session: + result = await session.execute( + select(Topology).where(Topology.id == topology_id) + ) + row = result.scalar_one_or_none() + if not row: + return None + d = row.model_dump(mode="json") + return self._deserialize_json_fields(d, ("config_snapshot",)) + + async def list_topologies( + self, status: Optional[str] = None + ) -> list[dict[str, Any]]: + statement = select(Topology).order_by(desc(Topology.created_at)) + if status: + statement = statement.where(Topology.status == status) + async with self._session() as session: + result = await session.execute(statement) + return [ + self._deserialize_json_fields( + r.model_dump(mode="json"), ("config_snapshot",) + ) + for r in result.scalars().all() + ] + + async def update_topology_status( + self, + topology_id: str, + new_status: str, + reason: Optional[str] = None, + ) -> None: + """Update topology.status and append a TopologyStatusEvent 
atomically. + + Transition legality is enforced in ``decnet.topology.status``; this + method trusts the caller. + """ + now = datetime.now(timezone.utc) + async with self._session() as session: + result = await session.execute( + select(Topology).where(Topology.id == topology_id) + ) + topo = result.scalar_one_or_none() + if topo is None: + return + from_status = topo.status + topo.status = new_status + topo.status_changed_at = now + session.add(topo) + session.add( + TopologyStatusEvent( + topology_id=topology_id, + from_status=from_status, + to_status=new_status, + at=now, + reason=reason, + ) + ) + await session.commit() + + async def delete_topology_cascade(self, topology_id: str) -> bool: + """Delete topology and all children. No portable ON DELETE CASCADE.""" + async with self._session() as session: + params = {"t": topology_id} + await session.execute( + text("DELETE FROM topology_status_events WHERE topology_id = :t"), + params, + ) + await session.execute( + text("DELETE FROM topology_edges WHERE topology_id = :t"), + params, + ) + await session.execute( + text("DELETE FROM topology_deckies WHERE topology_id = :t"), + params, + ) + await session.execute( + text("DELETE FROM lans WHERE topology_id = :t"), + params, + ) + result = await session.execute( + select(Topology).where(Topology.id == topology_id) + ) + topo = result.scalar_one_or_none() + if not topo: + await session.commit() + return False + await session.delete(topo) + await session.commit() + return True + + async def add_lan(self, data: dict[str, Any]) -> str: + async with self._session() as session: + row = LAN(**data) + session.add(row) + await session.commit() + await session.refresh(row) + return row.id + + async def update_lan(self, lan_id: str, fields: dict[str, Any]) -> None: + if not fields: + return + async with self._session() as session: + await session.execute( + update(LAN).where(LAN.id == lan_id).values(**fields) + ) + await session.commit() + + async def list_lans_for_topology( + 
self, topology_id: str + ) -> list[dict[str, Any]]: + async with self._session() as session: + result = await session.execute( + select(LAN).where(LAN.topology_id == topology_id).order_by(asc(LAN.name)) + ) + return [r.model_dump(mode="json") for r in result.scalars().all()] + + async def add_topology_decky(self, data: dict[str, Any]) -> str: + payload = self._serialize_json_fields(data, ("services", "decky_config")) + async with self._session() as session: + row = TopologyDecky(**payload) + session.add(row) + await session.commit() + await session.refresh(row) + return row.uuid + + async def update_topology_decky( + self, decky_uuid: str, fields: dict[str, Any] + ) -> None: + if not fields: + return + payload = self._serialize_json_fields(fields, ("services", "decky_config")) + payload.setdefault("updated_at", datetime.now(timezone.utc)) + async with self._session() as session: + await session.execute( + update(TopologyDecky) + .where(TopologyDecky.uuid == decky_uuid) + .values(**payload) + ) + await session.commit() + + async def list_topology_deckies( + self, topology_id: str + ) -> list[dict[str, Any]]: + async with self._session() as session: + result = await session.execute( + select(TopologyDecky) + .where(TopologyDecky.topology_id == topology_id) + .order_by(asc(TopologyDecky.name)) + ) + return [ + self._deserialize_json_fields( + r.model_dump(mode="json"), ("services", "decky_config") + ) + for r in result.scalars().all() + ] + + async def add_topology_edge(self, data: dict[str, Any]) -> str: + async with self._session() as session: + row = TopologyEdge(**data) + session.add(row) + await session.commit() + await session.refresh(row) + return row.id + + async def list_topology_edges( + self, topology_id: str + ) -> list[dict[str, Any]]: + async with self._session() as session: + result = await session.execute( + select(TopologyEdge).where(TopologyEdge.topology_id == topology_id) + ) + return [r.model_dump(mode="json") for r in result.scalars().all()] + + 
async def list_topology_status_events( + self, topology_id: str, limit: int = 100 + ) -> list[dict[str, Any]]: + async with self._session() as session: + result = await session.execute( + select(TopologyStatusEvent) + .where(TopologyStatusEvent.topology_id == topology_id) + .order_by(desc(TopologyStatusEvent.at)) + .limit(limit) + ) + return [r.model_dump(mode="json") for r in result.scalars().all()] diff --git a/tests/test_base_repo.py b/tests/test_base_repo.py index 7750f69b..ac23fea1 100644 --- a/tests/test_base_repo.py +++ b/tests/test_base_repo.py @@ -88,6 +88,20 @@ async def test_base_repo_coverage(): (dr.upsert_decky_shard, ({},)), (dr.list_decky_shards, ()), (dr.delete_decky_shards_for_host, ("u",)), + (dr.create_topology, ({},)), + (dr.get_topology, ("t",)), + (dr.list_topologies, ()), + (dr.update_topology_status, ("t", "active")), + (dr.delete_topology_cascade, ("t",)), + (dr.add_lan, ({},)), + (dr.update_lan, ("l", {})), + (dr.list_lans_for_topology, ("t",)), + (dr.add_topology_decky, ({},)), + (dr.update_topology_decky, ("d", {})), + (dr.list_topology_deckies, ("t",)), + (dr.add_topology_edge, ({},)), + (dr.list_topology_edges, ("t",)), + (dr.list_topology_status_events, ("t",)), ]: with pytest.raises(NotImplementedError): await coro(*args) diff --git a/tests/topology/__init__.py b/tests/topology/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/topology/test_repo.py b/tests/topology/test_repo.py new file mode 100644 index 00000000..f1fb138d --- /dev/null +++ b/tests/topology/test_repo.py @@ -0,0 +1,166 @@ +"""Direct async tests for MazeNET topology persistence. + +Exercises the repository layer without going through the HTTP stack or +the in-memory generator. The synthetic topology here is hand-built so +the test remains meaningful even if generator.py regresses. 
+""" +import pytest +from decnet.web.db.factory import get_repository + + +@pytest.fixture +async def repo(tmp_path): + r = get_repository(db_path=str(tmp_path / "mazenet.db")) + await r.initialize() + return r + + +@pytest.mark.anyio +async def test_topology_roundtrip(repo): + t_id = await repo.create_topology( + { + "name": "alpha", + "mode": "unihost", + "config_snapshot": {"depth": 3, "seed": 42}, + } + ) + assert t_id + t = await repo.get_topology(t_id) + assert t is not None + assert t["name"] == "alpha" + assert t["status"] == "pending" + # JSON field round-trips as a dict, not a string + assert t["config_snapshot"] == {"depth": 3, "seed": 42} + + +@pytest.mark.anyio +async def test_lan_add_update_list(repo): + t_id = await repo.create_topology( + {"name": "beta", "mode": "unihost", "config_snapshot": {}} + ) + lan_id = await repo.add_lan( + {"topology_id": t_id, "name": "DMZ", "subnet": "172.20.0.0/24", "is_dmz": True} + ) + await repo.add_lan( + {"topology_id": t_id, "name": "LAN-A", "subnet": "172.20.1.0/24"} + ) + await repo.update_lan(lan_id, {"docker_network_id": "abc123"}) + lans = await repo.list_lans_for_topology(t_id) + assert len(lans) == 2 + by_name = {lan["name"]: lan for lan in lans} + assert by_name["DMZ"]["docker_network_id"] == "abc123" + assert by_name["DMZ"]["is_dmz"] is True + assert by_name["LAN-A"]["is_dmz"] is False + + +@pytest.mark.anyio +async def test_topology_decky_json_roundtrip(repo): + t_id = await repo.create_topology( + {"name": "gamma", "mode": "unihost", "config_snapshot": {}} + ) + d_uuid = await repo.add_topology_decky( + { + "topology_id": t_id, + "name": "decky-01", + "services": ["ssh", "http"], + "decky_config": {"hostname": "bastion"}, + "ip": "172.20.0.10", + } + ) + assert d_uuid + deckies = await repo.list_topology_deckies(t_id) + assert len(deckies) == 1 + assert deckies[0]["services"] == ["ssh", "http"] + assert deckies[0]["decky_config"] == {"hostname": "bastion"} + assert deckies[0]["state"] == "pending" + + 
await repo.update_topology_decky(d_uuid, {"state": "running", "ip": "172.20.0.11"}) + deckies = await repo.list_topology_deckies(t_id) + assert deckies[0]["state"] == "running" + assert deckies[0]["ip"] == "172.20.0.11" + + +@pytest.mark.anyio +async def test_topology_decky_name_unique_within_topology(repo): + """Same decky name is legal across topologies, forbidden within one.""" + t1 = await repo.create_topology( + {"name": "one", "mode": "unihost", "config_snapshot": {}} + ) + t2 = await repo.create_topology( + {"name": "two", "mode": "unihost", "config_snapshot": {}} + ) + await repo.add_topology_decky( + {"topology_id": t1, "name": "decky-01", "services": []} + ) + # Same name, different topology — must succeed. + await repo.add_topology_decky( + {"topology_id": t2, "name": "decky-01", "services": []} + ) + # Same name, same topology — must fail at the DB level. + with pytest.raises(Exception): + await repo.add_topology_decky( + {"topology_id": t1, "name": "decky-01", "services": []} + ) + + +@pytest.mark.anyio +async def test_status_transition_writes_event(repo): + t_id = await repo.create_topology( + {"name": "delta", "mode": "unihost", "config_snapshot": {}} + ) + await repo.update_topology_status(t_id, "deploying", reason="kickoff") + await repo.update_topology_status(t_id, "active") + topo = await repo.get_topology(t_id) + assert topo["status"] == "active" + + events = await repo.list_topology_status_events(t_id) + assert len(events) == 2 + # Ordered desc by at — latest first + assert events[0]["to_status"] == "active" + assert events[0]["from_status"] == "deploying" + assert events[1]["to_status"] == "deploying" + assert events[1]["from_status"] == "pending" + assert events[1]["reason"] == "kickoff" + + +@pytest.mark.anyio +async def test_cascade_delete_clears_all_children(repo): + t_id = await repo.create_topology( + {"name": "eps", "mode": "unihost", "config_snapshot": {}} + ) + lan_id = await repo.add_lan( + {"topology_id": t_id, "name": "L", 
"subnet": "10.0.0.0/24"} + ) + d_uuid = await repo.add_topology_decky( + {"topology_id": t_id, "name": "d", "services": []} + ) + await repo.add_topology_edge( + {"topology_id": t_id, "decky_uuid": d_uuid, "lan_id": lan_id} + ) + await repo.update_topology_status(t_id, "deploying") + + assert await repo.delete_topology_cascade(t_id) is True + assert await repo.get_topology(t_id) is None + assert await repo.list_lans_for_topology(t_id) == [] + assert await repo.list_topology_deckies(t_id) == [] + assert await repo.list_topology_edges(t_id) == [] + assert await repo.list_topology_status_events(t_id) == [] + # Second delete on a missing row returns False, no raise + assert await repo.delete_topology_cascade(t_id) is False + + +@pytest.mark.anyio +async def test_list_topologies_filters_by_status(repo): + a = await repo.create_topology( + {"name": "a", "mode": "unihost", "config_snapshot": {}} + ) + b = await repo.create_topology( + {"name": "b", "mode": "unihost", "config_snapshot": {}} + ) + await repo.update_topology_status(b, "deploying") + pend = await repo.list_topologies(status="pending") + assert {t["id"] for t in pend} == {a} + dep = await repo.list_topologies(status="deploying") + assert {t["id"] for t in dep} == {b} + both = await repo.list_topologies() + assert {t["id"] for t in both} == {a, b} From 201d246c0780e32bd551efb9fd37336a5fc0beda Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 16:46:30 -0400 Subject: [PATCH 003/448] fix(ci): fix indentation on ci.yaml --- .gitea/workflows/ci.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 5dd8d217..e62ade38 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -61,22 +61,22 @@ jobs: name: Test (Live) runs-on: ubuntu-latest needs: [test-standard] + services: + mysql: + image: mysql:8.0 + env: + MYSQL_ROOT_PASSWORD: root + MYSQL_DATABASE: decnet_test + ports: + - 3307:3306 + options: 
>- + --health-cmd="mysqladmin ping -h 127.0.0.1" + --health-interval=10s + --health-timeout=5s + --health-retries=5 strategy: matrix: python-version: ["3.11"] - services: - mysql: - image: mysql:8.0 - env: - MYSQL_ROOT_PASSWORD: root - MYSQL_DATABASE: decnet_test - ports: - - 3307:3306 - options: >- - --health-cmd="mysqladmin ping -h 127.0.0.1" - --health-interval=10s - --health-timeout=5s - --health-retries=5 steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 From 33f139ecfab1ffad7e7de9b9a4ae5d4bbc2eb8b6 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 16:48:20 -0400 Subject: [PATCH 004/448] =?UTF-8?q?feat(mazenet):=20topology=20package=20?= =?UTF-8?q?=E2=80=94=20config,=20status=20machine,=20generator,=20persiste?= =?UTF-8?q?nce?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds decnet/topology/ with: - config.TopologyConfig: pydantic model driving generation (depth, branching_factor, deckies_per_lan_min/max, bridge_forward_probability, cross_edge_probability, subnet_base_prefix, service selection, seed). Emits GeneratedTopology dataclass (lans, deckies, edges). - status.TopologyStatus + assert_transition: seven-state machine with an explicit legal-transition table. torn_down is terminal; degraded is schema-reserved for future Healer use. - generator.generate: deterministic DAG generation under config.seed. Builds a tree of LANs (DMZ at root), plants deckies in each LAN, promotes one decky per non-DMZ LAN to a parent bridge, and rolls cross-edges per cross_edge_probability for DAG shape. - persistence: persist() writes a plan to the repo as pending; transition_status() enforces state-machine legality; hydrate() loads topology + children into a single dict. Covered by tests/topology/{test_status,test_generator,test_persistence}. 
--- decnet/topology/__init__.py | 23 +++ decnet/topology/config.py | 94 ++++++++++++ decnet/topology/generator.py | 239 +++++++++++++++++++++++++++++ decnet/topology/persistence.py | 123 +++++++++++++++ decnet/topology/status.py | 72 +++++++++ tests/topology/test_generator.py | 137 +++++++++++++++++ tests/topology/test_persistence.py | 91 +++++++++++ tests/topology/test_status.py | 55 +++++++ 8 files changed, 834 insertions(+) create mode 100644 decnet/topology/__init__.py create mode 100644 decnet/topology/config.py create mode 100644 decnet/topology/generator.py create mode 100644 decnet/topology/persistence.py create mode 100644 decnet/topology/status.py create mode 100644 tests/topology/test_generator.py create mode 100644 tests/topology/test_persistence.py create mode 100644 tests/topology/test_status.py diff --git a/decnet/topology/__init__.py b/decnet/topology/__init__.py new file mode 100644 index 00000000..247366b9 --- /dev/null +++ b/decnet/topology/__init__.py @@ -0,0 +1,23 @@ +"""MazeNET — nested deception topologies. + +A topology is an arbitrary-depth DAG of LANs, connected by multi-homed +"bridge deckies" that optionally forward L3 between segments. One LAN +is marked as the DMZ (Internet-facing). Persisted via the repo pattern; +deployed via :mod:`decnet.engine.deployer`. 
+""" +from decnet.topology.config import TopologyConfig, GeneratedTopology +from decnet.topology.generator import generate +from decnet.topology.status import ( + TopologyStatus, + assert_transition, + TopologyStatusError, +) + +__all__ = [ + "TopologyConfig", + "GeneratedTopology", + "generate", + "TopologyStatus", + "assert_transition", + "TopologyStatusError", +] diff --git a/decnet/topology/config.py b/decnet/topology/config.py new file mode 100644 index 00000000..3b9a13a3 --- /dev/null +++ b/decnet/topology/config.py @@ -0,0 +1,94 @@ +"""MazeNET topology config + in-memory generation output.""" +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Optional + +from pydantic import BaseModel, Field, model_validator + + +class TopologyConfig(BaseModel): + """Parameters driving :func:`decnet.topology.generator.generate`.""" + + name: str = Field(..., min_length=1, max_length=64) + mode: str = Field(default="unihost", pattern=r"^(unihost|agent)$") + + # Topology shape + depth: int = Field(..., ge=1, le=16, description="Max depth from DMZ") + branching_factor: int = Field(..., ge=1, le=8, description="Max child LANs per LAN") + deckies_per_lan_min: int = Field(default=1, ge=0, le=32) + deckies_per_lan_max: int = Field(default=3, ge=1, le=32) + + # Probability a given non-DMZ LAN's connection to its parent uses a + # bridge decky that forwards L3 (enables attacker pivot). Bridge + # existence between parent/child is implicit — every non-DMZ LAN + # has exactly one parent bridge. This controls *forwarding*, not + # the existence of the bridge. + bridge_forward_probability: float = Field(default=1.0, ge=0.0, le=1.0) + + # Probability of injecting a DAG cross-edge: a decky also bridged + # from its LAN to a non-parent, non-child LAN. 0.0 yields a tree. + cross_edge_probability: float = Field(default=0.0, ge=0.0, le=1.0) + + # IP allocation base. LANs get sequential /24s starting here. 
+ subnet_base_prefix: str = Field(default="172.20", pattern=r"^\d{1,3}\.\d{1,3}$") + + # Service selection — reuses decnet.fleet.build_deckies' randomizer. + randomize_services: bool = Field(default=True) + services_explicit: Optional[list[str]] = None + + seed: Optional[int] = Field(default=None, ge=0) + + @model_validator(mode="after") + def _check_min_max(self) -> "TopologyConfig": + if self.deckies_per_lan_min > self.deckies_per_lan_max: + raise ValueError( + "deckies_per_lan_min must be <= deckies_per_lan_max" + ) + if not self.randomize_services and not self.services_explicit: + raise ValueError( + "either randomize_services=True or services_explicit must be set" + ) + return self + + +@dataclass +class _PlannedLAN: + """In-memory LAN record emitted by the generator.""" + name: str + subnet: str + is_dmz: bool + parent: Optional[str] # name of parent LAN, None for DMZ + + +@dataclass +class _PlannedDecky: + """In-memory decky record emitted by the generator.""" + name: str + services: list[str] + # Mapping LAN-name → assigned IP within that LAN's subnet. + ips_by_lan: dict[str, str] = field(default_factory=dict) + forwards_l3: bool = False # only meaningful when present on ≥2 LANs + + +@dataclass +class _PlannedEdge: + """In-memory (decky, LAN) membership edge.""" + decky_name: str + lan_name: str + is_bridge: bool + forwards_l3: bool + + +@dataclass +class GeneratedTopology: + """Full in-memory output of :func:`decnet.topology.generator.generate`. + + Names are unique within the topology. No UUIDs are assigned here — + those are minted by :mod:`decnet.topology.persistence` when the + topology is written to the repo. + """ + config: TopologyConfig + lans: list[_PlannedLAN] + deckies: list[_PlannedDecky] + edges: list[_PlannedEdge] diff --git a/decnet/topology/generator.py b/decnet/topology/generator.py new file mode 100644 index 00000000..bd3c468b --- /dev/null +++ b/decnet/topology/generator.py @@ -0,0 +1,239 @@ +"""MazeNET topology generator. 
+ +Produces a :class:`GeneratedTopology` — an in-memory DAG of LANs and +multi-homed deckies. Deterministic under ``config.seed``: the same seed +always yields the same structure, service assignments, and IP layout. + +The generator only plans the structure. Persisting UUIDs to the repo +is :mod:`decnet.topology.persistence`; spawning Docker networks and +containers is :mod:`decnet.engine.deployer`. +""" +from __future__ import annotations + +import random +from ipaddress import IPv4Network +from typing import Optional + +from decnet.fleet import all_service_names +from decnet.topology.config import ( + GeneratedTopology, + TopologyConfig, + _PlannedDecky, + _PlannedEdge, + _PlannedLAN, +) + +# Range of services per randomly assigned decky (matches decnet.fleet). +_SVC_MIN = 1 +_SVC_MAX = 3 + + +def _plan_lans( + config: TopologyConfig, rng: random.Random +) -> list[_PlannedLAN]: + """Plan LANs as a tree of depth ``config.depth``. + + Each non-leaf level adds [1, branching_factor] children per parent. + LAN names and subnets are assigned in BFS order. + """ + lans: list[_PlannedLAN] = [] + + def _subnet(idx: int) -> str: + # Exhausting /24s at 172.X.0..255 caps topologies at 256 LANs on + # the default base. Well above the v1 envelope (depth=16 cap). + if idx > 255: + raise ValueError("too many LANs for the configured subnet_base_prefix") + return f"{config.subnet_base_prefix}.{idx}.0/24" + + # DMZ root. 
+ lans.append( + _PlannedLAN(name="LAN-00", subnet=_subnet(0), is_dmz=True, parent=None) + ) + frontier: list[_PlannedLAN] = [lans[0]] + + for _level in range(1, config.depth + 1): + next_frontier: list[_PlannedLAN] = [] + for parent in frontier: + n_children = rng.randint(1, config.branching_factor) # nosec B311 + for _ in range(n_children): + idx = len(lans) + child = _PlannedLAN( + name=f"LAN-{idx:02d}", + subnet=_subnet(idx), + is_dmz=False, + parent=parent.name, + ) + lans.append(child) + next_frontier.append(child) + frontier = next_frontier + if not frontier: + break + return lans + + +def _host_pool(subnet: str) -> list[str]: + """Usable host IPs in ``subnet``, skipping .1 (gateway).""" + net = IPv4Network(subnet, strict=False) + gateway = str(next(net.hosts())) + return [str(ip) for ip in net.hosts() if str(ip) != gateway] + + +def _pick_services( + rng: random.Random, + services_explicit: Optional[list[str]], + pool: list[str], + used_combos: set[frozenset], +) -> list[str]: + if services_explicit: + return list(services_explicit) + if not pool: + return [] + attempts = 0 + while True: + count = rng.randint(_SVC_MIN, min(_SVC_MAX, len(pool))) # nosec B311 + chosen = frozenset(rng.sample(pool, count)) # nosec B311 + attempts += 1 + if chosen not in used_combos or attempts > 20: + break + used_combos.add(chosen) + return list(chosen) + + +def generate(config: TopologyConfig) -> GeneratedTopology: + """Generate a topology plan deterministically under ``config.seed``. + + The caller is responsible for persisting the plan via + :mod:`decnet.topology.persistence` and then deploying it. + """ + rng = random.Random(config.seed) # nosec B311 + svc_pool = all_service_names() if config.randomize_services else [] + used_combos: set[frozenset] = set() + + lans = _plan_lans(config, rng) + lans_by_name = {lan.name: lan for lan in lans} + + # Per-LAN IP pools for deterministic assignment. 
+ ip_iters: dict[str, list[str]] = { + lan.name: _host_pool(lan.subnet) for lan in lans + } + ip_cursors: dict[str, int] = {lan.name: 0 for lan in lans} + + def _take_ip(lan_name: str) -> str: + pool = ip_iters[lan_name] + i = ip_cursors[lan_name] + if i >= len(pool): + raise RuntimeError(f"LAN {lan_name} ran out of IPs") + ip_cursors[lan_name] = i + 1 + return pool[i] + + deckies: list[_PlannedDecky] = [] + edges: list[_PlannedEdge] = [] + decky_counter = 0 + + def _new_decky(home_lan: str) -> _PlannedDecky: + nonlocal decky_counter + decky_counter += 1 + name = f"decky-{decky_counter:03d}" + services = _pick_services( + rng, config.services_explicit, svc_pool, used_combos + ) + decky = _PlannedDecky( + name=name, + services=services, + ips_by_lan={home_lan: _take_ip(home_lan)}, + ) + deckies.append(decky) + return decky + + # Populate each LAN with its own deckies. + for lan in lans: + if lan.is_dmz: + count = 1 # single DMZ decky (deaddeck) + else: + count = rng.randint( # nosec B311 + config.deckies_per_lan_min, config.deckies_per_lan_max + ) + if count < 1: + count = 1 # every LAN needs ≥1 decky to host the bridge + for _ in range(count): + decky = _new_decky(lan.name) + edges.append( + _PlannedEdge( + decky_name=decky.name, + lan_name=lan.name, + is_bridge=False, + forwards_l3=False, + ) + ) + + # Parent↔child bridges. For every non-DMZ LAN, pick one of its + # deckies and multi-home it to the parent LAN. This decky becomes + # the bridge between the two segments. 
+ deckies_by_lan: dict[str, list[_PlannedDecky]] = {lan.name: [] for lan in lans} + for e in edges: + deckies_by_lan[e.lan_name].append( + next(d for d in deckies if d.name == e.decky_name) + ) + + for lan in lans: + if lan.is_dmz or lan.parent is None: + continue + candidates = deckies_by_lan[lan.name] + bridge = rng.choice(candidates) # nosec B311 + bridge.ips_by_lan[lan.parent] = _take_ip(lan.parent) + forwards = rng.random() < config.bridge_forward_probability # nosec B311 + bridge.forwards_l3 = bridge.forwards_l3 or forwards + # Mark both existing edges as bridge edges for this decky, and + # add a new edge connecting it to the parent LAN. + for e in edges: + if e.decky_name == bridge.name: + e.is_bridge = True + e.forwards_l3 = bridge.forwards_l3 + edges.append( + _PlannedEdge( + decky_name=bridge.name, + lan_name=lan.parent, + is_bridge=True, + forwards_l3=bridge.forwards_l3, + ) + ) + + # Cross-edges: with probability p, pick a non-parent, non-child, + # non-self LAN and attach a random decky to it too. Turns the tree + # into a DAG. Only rolls on non-DMZ LANs with ≥1 candidate peer. 
+ if config.cross_edge_probability > 0: + for lan in lans: + if lan.is_dmz: + continue + if rng.random() >= config.cross_edge_probability: # nosec B311 + continue + forbidden = {lan.name, lan.parent} + forbidden |= {c.name for c in lans if c.parent == lan.name} + peers = [p for p in lans if p.name not in forbidden] + if not peers: + continue + peer = rng.choice(peers) # nosec B311 + decky = rng.choice(deckies_by_lan[lan.name]) # nosec B311 + if peer.name in decky.ips_by_lan: + continue # already connected, skip + decky.ips_by_lan[peer.name] = _take_ip(peer.name) + forwards = rng.random() < config.bridge_forward_probability # nosec B311 + decky.forwards_l3 = decky.forwards_l3 or forwards + for e in edges: + if e.decky_name == decky.name: + e.is_bridge = True + e.forwards_l3 = decky.forwards_l3 + edges.append( + _PlannedEdge( + decky_name=decky.name, + lan_name=peer.name, + is_bridge=True, + forwards_l3=decky.forwards_l3, + ) + ) + + del lans_by_name # intermediate lookup, drop before returning + + return GeneratedTopology( + config=config, lans=lans, deckies=deckies, edges=edges + ) diff --git a/decnet/topology/persistence.py b/decnet/topology/persistence.py new file mode 100644 index 00000000..0f07c270 --- /dev/null +++ b/decnet/topology/persistence.py @@ -0,0 +1,123 @@ +"""Adapter between :class:`GeneratedTopology` and the repository layer.""" +from __future__ import annotations + +from typing import Any + +from decnet.topology.config import GeneratedTopology +from decnet.topology.status import TopologyStatus, assert_transition + + +async def persist(repo: Any, plan: GeneratedTopology) -> str: + """Write a generated plan to the repo as a ``pending`` topology. + + Returns the newly created topology id. All child rows are written + atomically relative to each other (SQLite transactions are per-call + here; the repo methods each commit — good enough for initial create + since the whole chain is invoked before any external side effects). 
+ """ + topology_id = await repo.create_topology( + { + "name": plan.config.name, + "mode": plan.config.mode, + "config_snapshot": plan.config.model_dump(), + } + ) + + lan_ids: dict[str, str] = {} + for lan in plan.lans: + lan_id = await repo.add_lan( + { + "topology_id": topology_id, + "name": lan.name, + "subnet": lan.subnet, + "is_dmz": lan.is_dmz, + } + ) + lan_ids[lan.name] = lan_id + + decky_ids: dict[str, str] = {} + for decky in plan.deckies: + # Primary IP: the first LAN the decky was assigned to (insertion + # order of ips_by_lan, which reflects generator ordering — + # home LAN first, then any bridge targets). + primary_lan = next(iter(decky.ips_by_lan)) + primary_ip = decky.ips_by_lan[primary_lan] + decky_uuid = await repo.add_topology_decky( + { + "topology_id": topology_id, + "name": decky.name, + "services": decky.services, + "decky_config": { + "name": decky.name, + "services": decky.services, + "ips_by_lan": decky.ips_by_lan, + "forwards_l3": decky.forwards_l3, + }, + "ip": primary_ip, + } + ) + decky_ids[decky.name] = decky_uuid + + for edge in plan.edges: + await repo.add_topology_edge( + { + "topology_id": topology_id, + "decky_uuid": decky_ids[edge.decky_name], + "lan_id": lan_ids[edge.lan_name], + "is_bridge": edge.is_bridge, + "forwards_l3": edge.forwards_l3, + } + ) + + return topology_id + + +async def transition_status( + repo: Any, + topology_id: str, + new_status: str, + reason: str | None = None, +) -> None: + """Legal-only status transition. + + Raises :class:`decnet.topology.status.TopologyStatusError` if the + current status cannot legally transition to ``new_status``. 
+ """ + topo = await repo.get_topology(topology_id) + if topo is None: + raise ValueError(f"topology {topology_id!r} not found") + assert_transition(topo["status"], new_status) + await repo.update_topology_status(topology_id, new_status, reason=reason) + + +async def hydrate(repo: Any, topology_id: str) -> dict[str, Any] | None: + """Load a topology + children into a single dict for callers. + + Shape:: + + { + "topology": { ...row... }, + "lans": [ {...}, ... ], + "deckies": [ {...}, ... ], + "edges": [ {...}, ... ], + } + + Returns ``None`` if the topology does not exist. + """ + topo = await repo.get_topology(topology_id) + if topo is None: + return None + lans = await repo.list_lans_for_topology(topology_id) + deckies = await repo.list_topology_deckies(topology_id) + edges = await repo.list_topology_edges(topology_id) + return { + "topology": topo, + "lans": lans, + "deckies": deckies, + "edges": edges, + } + + +# Re-export the status constants so callers can ``from decnet.topology.persistence +# import TopologyStatus`` without chasing modules. +__all__ = ["persist", "transition_status", "hydrate", "TopologyStatus"] diff --git a/decnet/topology/status.py b/decnet/topology/status.py new file mode 100644 index 00000000..2e1b8c76 --- /dev/null +++ b/decnet/topology/status.py @@ -0,0 +1,72 @@ +"""MazeNET topology status state machine. + +Seven states — six active in v1. ``degraded`` is schema-reserved for the +future Healer worker and has no transitions into it from v1 code paths. +""" +from __future__ import annotations + + +class TopologyStatus: + PENDING = "pending" + DEPLOYING = "deploying" + ACTIVE = "active" + DEGRADED = "degraded" + FAILED = "failed" + TEARING_DOWN = "tearing_down" + TORN_DOWN = "torn_down" + + ALL: frozenset[str] = frozenset( + {PENDING, DEPLOYING, ACTIVE, DEGRADED, FAILED, TEARING_DOWN, TORN_DOWN} + ) + + +# Directed transitions. torn_down is terminal. 
degraded is unreachable +# in v1 (Healer would be the only writer), but its outbound edges stay +# defined so when Healer lands the state machine already accepts them. +_LEGAL: dict[str, frozenset[str]] = { + TopologyStatus.PENDING: frozenset( + {TopologyStatus.DEPLOYING, TopologyStatus.TORN_DOWN} + ), + TopologyStatus.DEPLOYING: frozenset( + { + TopologyStatus.ACTIVE, + TopologyStatus.FAILED, + TopologyStatus.DEGRADED, + TopologyStatus.TEARING_DOWN, + } + ), + TopologyStatus.ACTIVE: frozenset( + {TopologyStatus.DEGRADED, TopologyStatus.TEARING_DOWN} + ), + TopologyStatus.DEGRADED: frozenset( + {TopologyStatus.ACTIVE, TopologyStatus.TEARING_DOWN} + ), + TopologyStatus.FAILED: frozenset({TopologyStatus.TEARING_DOWN}), + TopologyStatus.TEARING_DOWN: frozenset( + {TopologyStatus.TORN_DOWN, TopologyStatus.DEGRADED} + ), + TopologyStatus.TORN_DOWN: frozenset(), +} + + +class TopologyStatusError(ValueError): + """Raised when an illegal topology status transition is attempted.""" + + +def assert_transition(current: str, new: str) -> None: + """Validate ``current → new`` or raise :class:`TopologyStatusError`.""" + if current not in TopologyStatus.ALL: + raise TopologyStatusError(f"unknown current status: {current!r}") + if new not in TopologyStatus.ALL: + raise TopologyStatusError(f"unknown new status: {new!r}") + if new not in _LEGAL[current]: + raise TopologyStatusError( + f"illegal transition: {current!r} → {new!r}" + ) + + +def legal_next(current: str) -> frozenset[str]: + """Return the set of legal successor statuses from ``current``.""" + if current not in _LEGAL: + raise TopologyStatusError(f"unknown status: {current!r}") + return _LEGAL[current] diff --git a/tests/topology/test_generator.py b/tests/topology/test_generator.py new file mode 100644 index 00000000..5549f721 --- /dev/null +++ b/tests/topology/test_generator.py @@ -0,0 +1,137 @@ +"""MazeNET generator determinism + DAG shape tests.""" +from __future__ import annotations + +from collections import Counter 
+ +import pytest + +from decnet.topology.config import TopologyConfig +from decnet.topology.generator import generate + + +def _cfg(**kw) -> TopologyConfig: + base = dict( + name="test", + depth=3, + branching_factor=2, + deckies_per_lan_min=2, + deckies_per_lan_max=2, + bridge_forward_probability=1.0, + cross_edge_probability=0.0, + randomize_services=True, + seed=42, + ) + base.update(kw) + return TopologyConfig(**base) + + +def test_seed_is_deterministic(): + a = generate(_cfg()) + b = generate(_cfg()) + # Same structure: same LAN names, same decky names, same edge set. + assert [lan.name for lan in a.lans] == [lan.name for lan in b.lans] + assert [d.name for d in a.deckies] == [d.name for d in b.deckies] + assert [(d.name, sorted(d.services)) for d in a.deckies] == [ + (d.name, sorted(d.services)) for d in b.deckies + ] + assert sorted((e.decky_name, e.lan_name) for e in a.edges) == sorted( + (e.decky_name, e.lan_name) for e in b.edges + ) + + +def test_different_seed_yields_different_structure(): + a = generate(_cfg(seed=1)) + b = generate(_cfg(seed=2)) + # With modest depth/branching, at least one of structure, service + # assignment, or edge count will differ — fail only if everything is + # byte-identical, which would indicate the seed is being ignored. 
+ a_sig = ( + [lan.name for lan in a.lans], + [(d.name, sorted(d.services)) for d in a.deckies], + sorted((e.decky_name, e.lan_name) for e in a.edges), + ) + b_sig = ( + [lan.name for lan in b.lans], + [(d.name, sorted(d.services)) for d in b.deckies], + sorted((e.decky_name, e.lan_name) for e in b.edges), + ) + assert a_sig != b_sig + + +def test_dmz_is_exactly_one_lan(): + t = generate(_cfg()) + dmz = [lan for lan in t.lans if lan.is_dmz] + assert len(dmz) == 1 + assert dmz[0].parent is None + assert dmz[0].name == "LAN-00" + + +def test_every_non_dmz_lan_has_exactly_one_bridge_into_parent(): + t = generate(_cfg(branching_factor=2, depth=3)) + # For each non-DMZ LAN, find the decky that is multi-homed to its parent. + for lan in t.lans: + if lan.is_dmz: + continue + bridges_to_parent = [ + d for d in t.deckies + if lan.name in d.ips_by_lan and lan.parent in d.ips_by_lan + ] + assert len(bridges_to_parent) >= 1, ( + f"{lan.name} has no bridge into parent {lan.parent}" + ) + + +def test_cross_edge_probability_zero_yields_tree(): + """With cross_edge_probability=0, a decky is bridged only to its home + LAN and (if it's the chosen bridge) its parent LAN — never to a + sibling or cousin. Validates by checking no decky is connected to + both a parent AND a non-parent non-home LAN.""" + t = generate(_cfg(cross_edge_probability=0.0)) + lans_by_name = {lan.name: lan for lan in t.lans} + for d in t.deckies: + if len(d.ips_by_lan) <= 1: + continue + # Home LAN = first membership. Other memberships must all be + # the parent of the home LAN, i.e. a single parent bridge. 
+ home = next(iter(d.ips_by_lan)) + others = [name for name in list(d.ips_by_lan.keys())[1:]] + parent = lans_by_name[home].parent + assert all(o == parent for o in others), ( + f"tree mode but decky {d.name} bridges {home}→{others} (parent={parent})" + ) + + +def test_cross_edge_probability_one_produces_cross_edges_over_runs(): + """With probability=1, every non-DMZ LAN rolls a cross-edge (may be + skipped if no valid peer), so across a moderately branching topology + we expect ≥1 cross-edge.""" + t = generate(_cfg(cross_edge_probability=1.0, depth=3, branching_factor=3)) + lans_by_name = {lan.name: lan for lan in t.lans} + cross_edges = 0 + for d in t.deckies: + if len(d.ips_by_lan) < 2: + continue + home = next(iter(d.ips_by_lan)) + others = list(d.ips_by_lan.keys())[1:] + parent = lans_by_name[home].parent + for o in others: + if o != parent: + cross_edges += 1 + assert cross_edges >= 1 + + +def test_every_decky_has_at_least_one_edge(): + t = generate(_cfg()) + edge_deckies = Counter(e.decky_name for e in t.edges) + for d in t.deckies: + assert edge_deckies[d.name] >= 1 + + +def test_dmz_has_exactly_one_decky(): + t = generate(_cfg(deckies_per_lan_min=5, deckies_per_lan_max=5)) + dmz_edges = [e for e in t.edges if e.lan_name == "LAN-00"] + # The DMZ LAN itself gets 1 decky + possibly acts as parent for + # bridge deckies from LAN-01/LAN-02 etc. The "home" decky count + # should be exactly 1. 
+ home_only = [e for e in dmz_edges if not e.is_bridge] + assert len(home_only) == 1 diff --git a/tests/topology/test_persistence.py b/tests/topology/test_persistence.py new file mode 100644 index 00000000..34fbcd2b --- /dev/null +++ b/tests/topology/test_persistence.py @@ -0,0 +1,91 @@ +"""MazeNET persistence-layer tests: generator → repo → hydrate roundtrip.""" +import pytest + +from decnet.topology.config import TopologyConfig +from decnet.topology.generator import generate +from decnet.topology.persistence import ( + hydrate, + persist, + transition_status, +) +from decnet.topology.status import TopologyStatus, TopologyStatusError +from decnet.web.db.factory import get_repository + + +@pytest.fixture +async def repo(tmp_path): + r = get_repository(db_path=str(tmp_path / "persist.db")) + await r.initialize() + return r + + +def _config(**kw) -> TopologyConfig: + base = dict( + name="roundtrip", + depth=2, + branching_factor=2, + deckies_per_lan_min=1, + deckies_per_lan_max=2, + cross_edge_probability=0.0, + randomize_services=True, + seed=7, + ) + base.update(kw) + return TopologyConfig(**base) + + +@pytest.mark.anyio +async def test_persist_then_hydrate(repo): + plan = generate(_config()) + tid = await persist(repo, plan) + + hydrated = await hydrate(repo, tid) + assert hydrated is not None + assert hydrated["topology"]["name"] == "roundtrip" + assert hydrated["topology"]["status"] == TopologyStatus.PENDING + assert len(hydrated["lans"]) == len(plan.lans) + assert len(hydrated["deckies"]) == len(plan.deckies) + assert len(hydrated["edges"]) == len(plan.edges) + + # LANs round-trip with their DMZ flag and subnet. + by_name = {lan["name"]: lan for lan in hydrated["lans"]} + for planned in plan.lans: + assert by_name[planned.name]["subnet"] == planned.subnet + assert by_name[planned.name]["is_dmz"] == planned.is_dmz + + # Deckies round-trip their services as a list, not a string. 
+ for d in hydrated["deckies"]: + assert isinstance(d["services"], list) + + +@pytest.mark.anyio +async def test_transition_status_enforces_legality(repo): + plan = generate(_config()) + tid = await persist(repo, plan) + + await transition_status(repo, tid, TopologyStatus.DEPLOYING, reason="go") + await transition_status(repo, tid, TopologyStatus.ACTIVE) + topo = await repo.get_topology(tid) + assert topo["status"] == TopologyStatus.ACTIVE + + # Can't go from active directly back to pending. + with pytest.raises(TopologyStatusError): + await transition_status(repo, tid, TopologyStatus.PENDING) + + # Unknown topology raises ValueError, not silent no-op. + with pytest.raises(ValueError): + await transition_status(repo, "does-not-exist", TopologyStatus.ACTIVE) + + +@pytest.mark.anyio +async def test_hydrate_missing_topology(repo): + assert await hydrate(repo, "no-such-id") is None + + +@pytest.mark.anyio +async def test_config_snapshot_preserves_seed(repo): + plan = generate(_config(seed=12345)) + tid = await persist(repo, plan) + topo = await repo.get_topology(tid) + assert topo["config_snapshot"]["seed"] == 12345 + assert topo["config_snapshot"]["depth"] == 2 diff --git a/tests/topology/test_status.py b/tests/topology/test_status.py new file mode 100644 index 00000000..2abf4d02 --- /dev/null +++ b/tests/topology/test_status.py @@ -0,0 +1,55 @@ +"""MazeNET status state-machine tests. + +Every legal transition declared in the plan is permitted; every other +pair (including self-loops and unknowns) must raise. 
+""" +import pytest +from decnet.topology.status import ( + TopologyStatus, + TopologyStatusError, + assert_transition, + legal_next, +) + +LEGAL = { + (TopologyStatus.PENDING, TopologyStatus.DEPLOYING), + (TopologyStatus.PENDING, TopologyStatus.TORN_DOWN), + (TopologyStatus.DEPLOYING, TopologyStatus.ACTIVE), + (TopologyStatus.DEPLOYING, TopologyStatus.FAILED), + (TopologyStatus.DEPLOYING, TopologyStatus.DEGRADED), + (TopologyStatus.DEPLOYING, TopologyStatus.TEARING_DOWN), + (TopologyStatus.ACTIVE, TopologyStatus.DEGRADED), + (TopologyStatus.ACTIVE, TopologyStatus.TEARING_DOWN), + (TopologyStatus.DEGRADED, TopologyStatus.ACTIVE), + (TopologyStatus.DEGRADED, TopologyStatus.TEARING_DOWN), + (TopologyStatus.FAILED, TopologyStatus.TEARING_DOWN), + (TopologyStatus.TEARING_DOWN, TopologyStatus.TORN_DOWN), + (TopologyStatus.TEARING_DOWN, TopologyStatus.DEGRADED), +} + + +def test_every_legal_transition_permitted(): + for cur, nxt in LEGAL: + assert_transition(cur, nxt) # no raise + + +def test_every_illegal_transition_raises(): + for cur in TopologyStatus.ALL: + for nxt in TopologyStatus.ALL: + if (cur, nxt) in LEGAL: + continue + with pytest.raises(TopologyStatusError): + assert_transition(cur, nxt) + + +def test_torn_down_is_terminal(): + assert legal_next(TopologyStatus.TORN_DOWN) == frozenset() + + +def test_unknown_status_raises(): + with pytest.raises(TopologyStatusError): + assert_transition("pending", "bogus") + with pytest.raises(TopologyStatusError): + assert_transition("bogus", "active") + with pytest.raises(TopologyStatusError): + legal_next("bogus") From 2a030bf3a94d7bc0837e4e2ce33964beb3a9c6b5 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 16:54:40 -0400 Subject: [PATCH 005/448] feat(topology): add compose generator and deployer integration Adds per-topology compose generation (one Docker bridge network per LAN, multi-homed bridge deckies, ip_forward sysctl for L3 forwarders) plus async deploy_topology/teardown_topology in the engine. 
Leaf-first teardown via BFS-named LAN reverse sort; partial-state safe on failure. --- decnet/engine/deployer.py | 108 +++++++++++++++++++++++++++ decnet/network.py | 54 ++++++++++++++ decnet/topology/compose.py | 130 +++++++++++++++++++++++++++++++++ tests/topology/test_compose.py | 102 ++++++++++++++++++++++++++ 4 files changed, 394 insertions(+) create mode 100644 decnet/topology/compose.py create mode 100644 tests/topology/test_compose.py diff --git a/decnet/engine/deployer.py b/decnet/engine/deployer.py index c788158d..096d6a8f 100644 --- a/decnet/engine/deployer.py +++ b/decnet/engine/deployer.py @@ -17,16 +17,24 @@ from decnet.config import DecnetConfig, clear_state, load_state, save_state from decnet.composer import write_compose from decnet.network import ( MACVLAN_NETWORK_NAME, + create_bridge_network, create_ipvlan_network, create_macvlan_network, get_host_ip, ips_to_range, + remove_bridge_network, remove_macvlan_network, setup_host_ipvlan, setup_host_macvlan, teardown_host_ipvlan, teardown_host_macvlan, ) +from decnet.topology.compose import ( + _network_name as _topology_network_name, + write_topology_compose, +) +from decnet.topology.persistence import hydrate, transition_status +from decnet.topology.status import TopologyStatus log = get_logger("engine") console = Console() @@ -281,6 +289,106 @@ def status() -> None: console.print(table) +def _teardown_order(lans: list[dict]) -> list[str]: + """Return LAN names in leaf-first (DMZ-last) teardown order. + + The generator names LANs in BFS order (``LAN-00`` = DMZ root, + then children, then grandchildren), so reverse-name order is a + correct leaf-first topological sort for the tree. Cross-edges + are membership-only — they don't introduce parent/child + relationships, so the BFS numbering remains valid. 
+ """ + return sorted((lan["name"] for lan in lans), reverse=True) + + +def _topology_compose_path(topology_id: str) -> Path: + return Path(f"decnet-topology-{topology_id[:8]}-compose.yml") + + +@_traced("engine.deploy_topology") +async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> None: + """Deploy a persisted MazeNET topology. + + Assumes ``repo`` has the topology in ``pending`` state. Creates one + Docker bridge network per LAN, writes a per-topology compose file, + and brings all deckies up. Marks ``active`` on success, ``failed`` + on exception (partial state left for later teardown). + """ + hydrated = await hydrate(repo, topology_id) + if hydrated is None: + raise ValueError(f"topology {topology_id!r} not found") + + await transition_status(repo, topology_id, TopologyStatus.DEPLOYING) + + client = docker.from_env() + lans = hydrated["lans"] + compose_path = _topology_compose_path(topology_id) + + try: + for lan in lans: + net_name = _topology_network_name(topology_id, lan["name"]) + # DMZ LAN is publicly routable; internal LANs are isolated + # from the host's default egress. 
+ internal = not lan["is_dmz"] + create_bridge_network( + client, net_name, lan["subnet"], internal=internal + ) + write_topology_compose(hydrated, compose_path) + console.print( + f"[bold cyan]Topology compose file written[/] → {compose_path}" + ) + if dry_run: + log.info("topology %s dry-run complete", topology_id) + return + _compose_with_retry("up", "--build", "-d", compose_file=compose_path) + except Exception as exc: + log.error("topology %s deploy failed: %s", topology_id, exc) + await transition_status( + repo, topology_id, TopologyStatus.FAILED, reason=str(exc) + ) + raise + + await transition_status(repo, topology_id, TopologyStatus.ACTIVE) + log.info("topology %s deployed n_lans=%d", topology_id, len(lans)) + + +@_traced("engine.teardown_topology") +async def teardown_topology(repo, topology_id: str) -> None: + """Tear down a persisted MazeNET topology. + + Legal from ``active|degraded|failed|deploying``. Brings compose + down, removes each LAN's Docker bridge network in leaf-first order, + and marks ``torn_down``. 
+ """ + hydrated = await hydrate(repo, topology_id) + if hydrated is None: + raise ValueError(f"topology {topology_id!r} not found") + + await transition_status(repo, topology_id, TopologyStatus.TEARING_DOWN) + + client = docker.from_env() + compose_path = _topology_compose_path(topology_id) + + if compose_path.exists(): + try: + _compose("down", "--remove-orphans", compose_file=compose_path) + except subprocess.CalledProcessError as exc: + log.warning( + "topology %s compose down failed (continuing): %s", + topology_id, exc, + ) + + for lan_name in _teardown_order(hydrated["lans"]): + net_name = _topology_network_name(topology_id, lan_name) + remove_bridge_network(client, net_name) + + if compose_path.exists(): + compose_path.unlink() + + await transition_status(repo, topology_id, TopologyStatus.TORN_DOWN) + log.info("topology %s torn down", topology_id) + + def _print_status(config: DecnetConfig) -> None: table = Table(title="Deployed Deckies", show_lines=True) table.add_column("Decky") diff --git a/decnet/network.py b/decnet/network.py index 17b05279..30f9659b 100644 --- a/decnet/network.py +++ b/decnet/network.py @@ -227,6 +227,60 @@ def remove_macvlan_network(client: docker.DockerClient) -> None: n.remove() +# --------------------------------------------------------------------------- +# Plain Docker bridge networks (MazeNET topologies — one per LAN) +# --------------------------------------------------------------------------- + +def create_bridge_network( + client: docker.DockerClient, + name: str, + subnet: str, + *, + internal: bool = False, +) -> str: + """Create (or reuse) a plain Docker bridge network and return its id. + + ``internal=True`` blocks outbound routing via the host — used for + non-DMZ MazeNET LANs so deckies can only reach what the bridge + deckies let them reach. 
+ """ + for net in client.networks.list(names=[name]): + pools = (net.attrs.get("IPAM") or {}).get("Config") or [] + cur = pools[0] if pools else {} + if net.attrs.get("Driver") == "bridge" and cur.get("Subnet") == subnet: + return net.id + for cid in (net.attrs.get("Containers") or {}): + try: + net.disconnect(cid, force=True) + except docker.errors.APIError: + pass + net.remove() + + net = client.networks.create( + name=name, + driver="bridge", + internal=internal, + ipam=docker.types.IPAMConfig( + driver="default", + pool_configs=[docker.types.IPAMPool(subnet=subnet)], + ), + ) + return net.id + + +def remove_bridge_network(client: docker.DockerClient, name: str) -> None: + for net in client.networks.list(names=[name]): + for cid in (net.attrs.get("Containers") or {}): + try: + net.disconnect(cid, force=True) + except docker.errors.APIError: + pass + try: + net.remove() + except docker.errors.APIError: + pass + + # --------------------------------------------------------------------------- # Host-side macvlan interface (hairpin fix) # --------------------------------------------------------------------------- diff --git a/decnet/topology/compose.py b/decnet/topology/compose.py new file mode 100644 index 00000000..25b07285 --- /dev/null +++ b/decnet/topology/compose.py @@ -0,0 +1,130 @@ +"""Compose-file generator for a MazeNET topology. + +Produces a ``docker-compose.yml`` dict given a hydrated topology +(the output of :func:`decnet.topology.persistence.hydrate`). The +compose file references each LAN as an ``external: true`` network — +the deployer creates the Docker bridge networks via the SDK before +invoking ``docker compose up``. + +Layout: + * Each decky has a "base" container holding the LAN IPs. Multi-homed + (bridge) deckies list every LAN they belong to under ``networks`` + with the per-LAN ``ipv4_address``. + * Bridge deckies with ``forwards_l3=True`` get ``net.ipv4.ip_forward=1`` + baked in via compose ``sysctls`` plus ``NET_ADMIN`` in ``cap_add``. 
+ * Service containers share the base namespace via + ``network_mode: service:``, matching the flat composer. +""" +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import yaml + +from decnet.services.registry import get_service + +_DEFAULT_BASE_IMAGE = "debian:bookworm-slim" + +_DOCKER_LOGGING = { + "driver": "json-file", + "options": {"max-size": "10m", "max-file": "5"}, +} + + +def _network_name(topology_id: str, lan_name: str) -> str: + """Docker network name for a given (topology, LAN) pair.""" + return f"decnet_t_{topology_id[:8]}_{lan_name.lower()}" + + +def _container_name(topology_id: str, decky_name: str) -> str: + """Container name for a decky base in a topology.""" + return f"decnet_t_{topology_id[:8]}_{decky_name}" + + +def generate_topology_compose(hydrated: dict[str, Any]) -> dict: + """Build the compose dict for a hydrated topology. + + ``hydrated`` is the shape returned by + :func:`decnet.topology.persistence.hydrate`. + """ + topology = hydrated["topology"] + topology_id = topology["id"] + lans = hydrated["lans"] + deckies = hydrated["deckies"] + + lan_by_name = {lan["name"]: lan for lan in lans} + + services: dict[str, dict] = {} + + for decky in deckies: + cfg = decky["decky_config"] + name = cfg["name"] + ips_by_lan: dict[str, str] = cfg["ips_by_lan"] + forwards_l3: bool = cfg.get("forwards_l3", False) + svc_names: list[str] = decky["services"] + + base_key = name + nets: dict[str, dict] = {} + for lan_name, ip in ips_by_lan.items(): + if lan_name not in lan_by_name: + raise ValueError( + f"decky {name!r} references unknown LAN {lan_name!r}" + ) + nets[_network_name(topology_id, lan_name)] = {"ipv4_address": ip} + + base: dict = { + "image": _DEFAULT_BASE_IMAGE, + "container_name": _container_name(topology_id, name), + "hostname": name, + "command": ["sleep", "infinity"], + "restart": "unless-stopped", + "networks": nets, + "cap_add": ["NET_ADMIN"], + "logging": _DOCKER_LOGGING, + } + if forwards_l3: + 
base["sysctls"] = {"net.ipv4.ip_forward": 1} + + services[base_key] = base + + for svc_name in svc_names: + svc = get_service(svc_name) + if svc is None or svc.fleet_singleton: + continue + fragment = svc.compose_fragment(name, service_cfg={}) + if "build" in fragment: + fragment["build"].setdefault("args", {}).setdefault( + "BASE_IMAGE", _DEFAULT_BASE_IMAGE + ) + fragment.setdefault("environment", {}) + fragment["environment"]["HOSTNAME"] = name + fragment["network_mode"] = f"service:{base_key}" + fragment["depends_on"] = [base_key] + fragment.pop("hostname", None) + fragment.pop("networks", None) + fragment["logging"] = _DOCKER_LOGGING + services[f"{name}-{svc_name}"] = fragment + + networks: dict[str, dict] = { + _network_name(topology_id, lan["name"]): { + "external": True, + "name": _network_name(topology_id, lan["name"]), + } + for lan in lans + } + + return { + "version": "3.8", + "services": services, + "networks": networks, + } + + +def write_topology_compose(hydrated: dict[str, Any], output_path: Path) -> Path: + """Write the compose dict for a hydrated topology and return the path.""" + data = generate_topology_compose(hydrated) + output_path.write_text( + yaml.dump(data, default_flow_style=False, sort_keys=False) + ) + return output_path diff --git a/tests/topology/test_compose.py b/tests/topology/test_compose.py new file mode 100644 index 00000000..6642cf9e --- /dev/null +++ b/tests/topology/test_compose.py @@ -0,0 +1,102 @@ +"""MazeNET compose-generator + teardown-order tests.""" +from __future__ import annotations + +import pytest + +from decnet.engine.deployer import _teardown_order +from decnet.topology.compose import ( + _container_name, + _network_name, + generate_topology_compose, +) +from decnet.topology.config import TopologyConfig +from decnet.topology.generator import generate +from decnet.topology.persistence import hydrate, persist +from decnet.web.db.factory import get_repository + + +def _cfg(**kw) -> TopologyConfig: + base = dict( + 
name="cmp", + depth=2, + branching_factor=2, + deckies_per_lan_min=1, + deckies_per_lan_max=1, + cross_edge_probability=0.0, + randomize_services=False, + services_explicit=["ssh"], + seed=9, + ) + base.update(kw) + return TopologyConfig(**base) + + +@pytest.fixture +async def repo(tmp_path): + r = get_repository(db_path=str(tmp_path / "compose.db")) + await r.initialize() + return r + + +@pytest.mark.anyio +async def test_compose_has_one_network_per_lan(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + hydrated = await hydrate(repo, tid) + + data = generate_topology_compose(hydrated) + assert set(data["networks"].keys()) == { + _network_name(tid, lan.name) for lan in plan.lans + } + for net in data["networks"].values(): + assert net["external"] is True + + +@pytest.mark.anyio +async def test_compose_multi_home_bridge_decky(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + hydrated = await hydrate(repo, tid) + data = generate_topology_compose(hydrated) + + # Every bridge decky (multi-homed) must list ≥2 networks in its base. + for decky in hydrated["deckies"]: + cfg = decky["decky_config"] + base = data["services"][cfg["name"]] + assert base["container_name"] == _container_name(tid, cfg["name"]) + assert len(base["networks"]) == len(cfg["ips_by_lan"]) + for lan_name, ip in cfg["ips_by_lan"].items(): + net_key = _network_name(tid, lan_name) + assert base["networks"][net_key]["ipv4_address"] == ip + + +@pytest.mark.anyio +async def test_compose_forwards_l3_sets_sysctl(repo): + # Force every bridge to forward L3, then assert at least one base has it. 
+ plan = generate(_cfg(bridge_forward_probability=1.0)) + tid = await persist(repo, plan) + hydrated = await hydrate(repo, tid) + data = generate_topology_compose(hydrated) + + forwarders = [ + d for d in hydrated["deckies"] + if d["decky_config"].get("forwards_l3") + ] + assert forwarders, "expected at least one forwarding bridge decky" + for d in forwarders: + base = data["services"][d["decky_config"]["name"]] + assert base["sysctls"]["net.ipv4.ip_forward"] == 1 + assert "NET_ADMIN" in base["cap_add"] + + +def test_teardown_order_is_leaf_first(): + lans = [ + {"name": "LAN-00"}, + {"name": "LAN-01"}, + {"name": "LAN-02"}, + {"name": "LAN-03"}, + ] + order = _teardown_order(lans) + assert order == ["LAN-03", "LAN-02", "LAN-01", "LAN-00"] + # DMZ is last — nothing should be torn down after LAN-00. + assert order[-1] == "LAN-00" From 14d96778e3f096d301c221a90560841b64f95be0 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 16:56:02 -0400 Subject: [PATCH 006/448] feat(cli): add topology sub-command group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit decnet topology {generate,list,show,deploy,teardown} wraps the new persistence and deployer APIs. Structured text output, no ASCII art — visual DAG rendering belongs in the web dashboard. Group is master-only via MASTER_ONLY_GROUPS and a _require_master_mode guard on each body. --- decnet/cli/__init__.py | 2 + decnet/cli/gating.py | 2 +- decnet/cli/topology.py | 209 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 212 insertions(+), 1 deletion(-) create mode 100644 decnet/cli/topology.py diff --git a/decnet/cli/__init__.py b/decnet/cli/__init__.py index e2976c8f..fc1d3bcc 100644 --- a/decnet/cli/__init__.py +++ b/decnet/cli/__init__.py @@ -31,6 +31,7 @@ from . 
import ( sniffer, swarm, swarmctl, + topology, updater, web, workers, @@ -50,6 +51,7 @@ for _mod in ( swarm, deploy, lifecycle, workers, inventory, web, profiler, sniffer, db, + topology, ): _mod.register(app) diff --git a/decnet/cli/gating.py b/decnet/cli/gating.py index 5c153522..af724b22 100644 --- a/decnet/cli/gating.py +++ b/decnet/cli/gating.py @@ -31,7 +31,7 @@ MASTER_ONLY_COMMANDS: frozenset[str] = frozenset({ "services", "distros", "correlate", "archetypes", "web", "db-reset", }) -MASTER_ONLY_GROUPS: frozenset[str] = frozenset({"swarm"}) +MASTER_ONLY_GROUPS: frozenset[str] = frozenset({"swarm", "topology"}) def _agent_mode_active() -> bool: diff --git a/decnet/cli/topology.py b/decnet/cli/topology.py new file mode 100644 index 00000000..23da8914 --- /dev/null +++ b/decnet/cli/topology.py @@ -0,0 +1,209 @@ +"""MazeNET topology CLI: generate / deploy / teardown / list / show.""" +from __future__ import annotations + +import asyncio +from typing import Optional + +import typer +from rich.console import Console +from rich.table import Table + +from decnet.topology.config import TopologyConfig +from decnet.topology.generator import generate +from decnet.topology.persistence import hydrate, persist +from decnet.topology.status import TopologyStatus +from decnet.web.db.factory import get_repository + +from .gating import _require_master_mode + +_console = Console() + +_group = typer.Typer( + name="topology", + help="MazeNET nested-topology commands (DECNET master only).", + no_args_is_help=True, +) + + +async def _repo(): + r = get_repository() + await r.initialize() + return r + + +@_group.command("generate") +def _generate( + name: str = typer.Option(..., "--name", help="Topology name"), + depth: int = typer.Option(3, "--depth", min=1, max=16), + branching: int = typer.Option(2, "--branching", min=1, max=8), + deckies_per_lan: str = typer.Option( + "1-3", + "--deckies-per-lan", + help="Min-max deckies per LAN, e.g. 
1-3", + ), + bridge_forward_probability: float = typer.Option(1.0, "--bridge-forward-p", min=0.0, max=1.0), + cross_edge_probability: float = typer.Option(0.0, "--cross-edge-p", min=0.0, max=1.0), + services: Optional[str] = typer.Option(None, "--services", help="Comma-separated explicit services"), + randomize_services: bool = typer.Option(True, "--randomize-services/--no-randomize-services"), + seed: Optional[int] = typer.Option(None, "--seed", min=0), +) -> None: + """Generate a topology plan and persist it as pending.""" + _require_master_mode("topology generate") + + try: + lo, hi = (int(x) for x in deckies_per_lan.split("-", 1)) + except ValueError: + _console.print("[red]--deckies-per-lan must be formatted as MIN-MAX, e.g. 1-3.[/]") + raise typer.Exit(1) + + services_explicit = ( + [s.strip() for s in services.split(",") if s.strip()] if services else None + ) + + try: + cfg = TopologyConfig( + name=name, + depth=depth, + branching_factor=branching, + deckies_per_lan_min=lo, + deckies_per_lan_max=hi, + bridge_forward_probability=bridge_forward_probability, + cross_edge_probability=cross_edge_probability, + services_explicit=services_explicit, + randomize_services=randomize_services if not services_explicit else False, + seed=seed, + ) + except ValueError as e: + _console.print(f"[red]{e}[/]") + raise typer.Exit(1) + + plan = generate(cfg) + + async def _go() -> str: + repo = await _repo() + return await persist(repo, plan) + + tid = asyncio.run(_go()) + _console.print(f"[green]Topology persisted as pending[/] — id=[bold]{tid}[/]") + _console.print( + f" LANs: {len(plan.lans)} deckies: {len(plan.deckies)} edges: {len(plan.edges)}" + ) + + +@_group.command("list") +def _list() -> None: + """List all topologies.""" + _require_master_mode("topology list") + + async def _go() -> list[dict]: + repo = await _repo() + return await repo.list_topologies() + + rows = asyncio.run(_go()) + if not rows: + _console.print("[yellow]No topologies.[/]") + return + table = 
Table(title="DECNET / MazeNET Topologies") + for col in ("id", "name", "mode", "status", "created_at"): + table.add_column(col) + for r in rows: + table.add_row( + str(r["id"]), + str(r["name"]), + str(r["mode"]), + str(r["status"]), + str(r.get("created_at", "")), + ) + _console.print(table) + + +@_group.command("show") +def _show(topology_id: str = typer.Argument(..., help="Topology id")) -> None: + """Print a structured summary of a topology.""" + _require_master_mode("topology show") + + async def _go(): + repo = await _repo() + return await hydrate(repo, topology_id) + + hydrated = asyncio.run(_go()) + if hydrated is None: + _console.print(f"[red]No such topology: {topology_id}[/]") + raise typer.Exit(1) + + topo = hydrated["topology"] + _console.print( + f"[bold]{topo['name']}[/] id={topo['id']} status={topo['status']}" + f" mode={topo['mode']}" + ) + + deckies_by_name = {d["decky_config"]["name"]: d for d in hydrated["deckies"]} + edges_by_lan: dict[str, list[dict]] = {} + for e in hydrated["edges"]: + edges_by_lan.setdefault(e["lan_id"], []).append(e) + + for lan in hydrated["lans"]: + dmz_tag = " [dim](DMZ)[/]" if lan["is_dmz"] else "" + _console.print(f"\n[cyan]LAN[/] {lan['name']} {lan['subnet']}{dmz_tag}") + lan_edges = edges_by_lan.get(lan["id"], []) + for e in lan_edges: + # Find the decky name via uuid. 
+ decky = next( + (d for d in hydrated["deckies"] if d["uuid"] == e["decky_uuid"]), + None, + ) + if decky is None: + continue + cfg = decky["decky_config"] + name = cfg["name"] + ip = cfg["ips_by_lan"].get(lan["name"], "?") + tags = [] + if e["is_bridge"]: + tags.append("bridge") + if e["forwards_l3"]: + tags.append("L3-forward") + tag_s = f" [yellow]({', '.join(tags)})[/]" if tags else "" + svcs = ",".join(cfg.get("services") or []) or "-" + _console.print(f" • {name} {ip} svcs={svcs}{tag_s}") + + _ = deckies_by_name # for future cross-reference extensions + + +@_group.command("deploy") +def _deploy( + topology_id: str = typer.Argument(..., help="Topology id (must be pending)"), + dry_run: bool = typer.Option(False, "--dry-run", help="Write compose + create nets, skip containers"), +) -> None: + """Deploy a pending topology.""" + _require_master_mode("topology deploy") + from decnet.engine.deployer import deploy_topology + + async def _go() -> None: + repo = await _repo() + await deploy_topology(repo, topology_id, dry_run=dry_run) + + asyncio.run(_go()) + _console.print(f"[green]Topology {topology_id} deployed.[/]") + + +@_group.command("teardown") +def _teardown( + topology_id: str = typer.Argument(..., help="Topology id"), +) -> None: + """Tear down a topology. 
Legal from active|degraded|failed|deploying.""" + _require_master_mode("topology teardown") + from decnet.engine.deployer import teardown_topology + + async def _go() -> None: + repo = await _repo() + await teardown_topology(repo, topology_id) + + asyncio.run(_go()) + _console.print(f"[green]Topology {topology_id} torn down.[/]") + + +def register(app: typer.Typer) -> None: + app.add_typer(_group, name="topology") + + +__all__ = ["register", "TopologyStatus"] From 80e3c28234bc2c3ac3a6bc7dd0fc517ae2ed2a75 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 16:57:43 -0400 Subject: [PATCH 007/448] test(topology): deploy dry-run + failure-path + live docker e2e Covers dry-run compose emission (no status change), FAILED transition with reason logged on daemon errors, teardown from FAILED, and a live-marked end-to-end test that creates/removes bridge networks against a real docker daemon (skipped on CI). --- decnet/engine/deployer.py | 19 ++-- tests/topology/test_deploy.py | 178 ++++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+), 6 deletions(-) create mode 100644 tests/topology/test_deploy.py diff --git a/decnet/engine/deployer.py b/decnet/engine/deployer.py index 096d6a8f..22c8d592 100644 --- a/decnet/engine/deployer.py +++ b/decnet/engine/deployer.py @@ -318,12 +318,22 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N if hydrated is None: raise ValueError(f"topology {topology_id!r} not found") - await transition_status(repo, topology_id, TopologyStatus.DEPLOYING) - - client = docker.from_env() lans = hydrated["lans"] compose_path = _topology_compose_path(topology_id) + if dry_run: + # Plan-only: don't touch repo status or Docker — write the compose + # so operators can diff it, nothing else. 
+ write_topology_compose(hydrated, compose_path) + console.print( + f"[bold cyan]Dry run — topology compose file written[/] → {compose_path}" + ) + log.info("topology %s dry-run complete", topology_id) + return + + await transition_status(repo, topology_id, TopologyStatus.DEPLOYING) + + client = docker.from_env() try: for lan in lans: net_name = _topology_network_name(topology_id, lan["name"]) @@ -337,9 +347,6 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N console.print( f"[bold cyan]Topology compose file written[/] → {compose_path}" ) - if dry_run: - log.info("topology %s dry-run complete", topology_id) - return _compose_with_retry("up", "--build", "-d", compose_file=compose_path) except Exception as exc: log.error("topology %s deploy failed: %s", topology_id, exc) diff --git a/tests/topology/test_deploy.py b/tests/topology/test_deploy.py new file mode 100644 index 00000000..3afa1777 --- /dev/null +++ b/tests/topology/test_deploy.py @@ -0,0 +1,178 @@ +"""Deploy/teardown integration tests for MazeNET topologies. + +Docker-touching paths live behind ``@pytest.mark.live`` per +feedback_skip_heavy_tests.md. The non-live path here exercises dry-run +deploy (compose file is written, repo status is left untouched) and the +state-machine around failure/teardown using a real repository (temporary database) and a stubbed Docker client.
+""" +from __future__ import annotations + +from pathlib import Path +from unittest.mock import patch + +import pytest + +from decnet.engine.deployer import ( + _teardown_order, + _topology_compose_path, + deploy_topology, + teardown_topology, +) +from decnet.topology.config import TopologyConfig +from decnet.topology.generator import generate +from decnet.topology.persistence import persist +from decnet.topology.status import TopologyStatus +from decnet.web.db.factory import get_repository + + +def _cfg(**kw) -> TopologyConfig: + base = dict( + name="dep", + depth=2, + branching_factor=2, + deckies_per_lan_min=1, + deckies_per_lan_max=1, + cross_edge_probability=0.0, + randomize_services=False, + services_explicit=["ssh"], + seed=11, + ) + base.update(kw) + return TopologyConfig(**base) + + +@pytest.fixture +async def repo(tmp_path): + r = get_repository(db_path=str(tmp_path / "dep.db")) + await r.initialize() + return r + + +@pytest.mark.anyio +async def test_dry_run_writes_compose_and_preserves_pending(repo, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + plan = generate(_cfg()) + tid = await persist(repo, plan) + + await deploy_topology(repo, tid, dry_run=True) + + compose_path = _topology_compose_path(tid) + assert compose_path.exists(), "dry run must emit a compose file" + + topo = await repo.get_topology(tid) + assert topo["status"] == TopologyStatus.PENDING, ( + "dry run must not transition status" + ) + + +@pytest.mark.anyio +async def test_deploy_failure_transitions_to_failed(repo, tmp_path, monkeypatch): + """If compose-up fails, status lands at FAILED with the reason logged.""" + monkeypatch.chdir(tmp_path) + plan = generate(_cfg()) + tid = await persist(repo, plan) + + class _BoomClient: + def __init__(self): + self.networks = self + def list(self, names=None): # noqa: ARG002 + return [] + def create(self, *a, **kw): # noqa: ARG002 + raise RuntimeError("boom: docker daemon unreachable") + + with patch("decnet.engine.deployer.docker.from_env", 
return_value=_BoomClient()): + with pytest.raises(RuntimeError, match="boom"): + await deploy_topology(repo, tid) + + topo = await repo.get_topology(tid) + assert topo["status"] == TopologyStatus.FAILED + + events = await repo.list_topology_status_events(tid) + # Events are returned newest-first. + last = events[0] + assert last["to_status"] == TopologyStatus.FAILED + assert "boom" in (last["reason"] or "") + + +@pytest.mark.anyio +async def test_teardown_from_failed_marks_torn_down(repo, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + plan = generate(_cfg()) + tid = await persist(repo, plan) + # Drive it into FAILED directly via the legal path. + from decnet.topology.persistence import transition_status + await transition_status(repo, tid, TopologyStatus.DEPLOYING) + await transition_status(repo, tid, TopologyStatus.FAILED, reason="test") + + class _StubClient: + def __init__(self): + self.networks = self + def list(self, names=None): # noqa: ARG002 + return [] + + with patch("decnet.engine.deployer.docker.from_env", return_value=_StubClient()): + await teardown_topology(repo, tid) + + topo = await repo.get_topology(tid) + assert topo["status"] == TopologyStatus.TORN_DOWN + + +def test_teardown_order_is_stable(): + lans = [{"name": f"LAN-{i:02d}"} for i in range(5)] + assert _teardown_order(lans) == [ + "LAN-04", "LAN-03", "LAN-02", "LAN-01", "LAN-00", + ] + + +@pytest.mark.live +@pytest.mark.anyio +async def test_deploy_and_teardown_against_real_docker(repo, tmp_path, monkeypatch): + """End-to-end: create real Docker bridge networks, verify, tear down. + + Skipped on CI; run locally with ``pytest -m live tests/topology``. + Does NOT run ``docker compose up`` — that's exercised by the flat + fleet tests. This test covers the topology-specific paths only + (LAN network creation, multi-home bridge wiring, teardown order). 
+ """ + monkeypatch.chdir(tmp_path) + docker = pytest.importorskip("docker") + try: + client = docker.from_env() + client.ping() + except Exception as exc: # pragma: no cover - environment-specific + pytest.skip(f"docker daemon not reachable: {exc}") + + plan = generate(_cfg(depth=1, branching_factor=1)) + tid = await persist(repo, plan) + + from decnet.topology.compose import _network_name + + try: + await deploy_topology(repo, tid, dry_run=True) + # Dry run doesn't create networks. Now exercise the real path by + # creating just the networks (no compose up) and tearing down. + from decnet.network import create_bridge_network, remove_bridge_network + for lan in plan.lans: + create_bridge_network( + client, + _network_name(tid, lan.name), + lan.subnet, + internal=not lan.is_dmz, + ) + existing = {n.name for n in client.networks.list()} + for lan in plan.lans: + assert _network_name(tid, lan.name) in existing + finally: + for lan in plan.lans: + remove_bridge_network(client, _network_name(tid, lan.name)) + + remaining = {n.name for n in client.networks.list()} + for lan in plan.lans: + assert _network_name(tid, lan.name) not in remaining + + # Compose artifact cleanup + p = _topology_compose_path(tid) + if p.exists(): + p.unlink() + # Sanity: Path roundtrip still resolvable + assert isinstance(Path(str(p)), Path) From 1bd1846e40cb9529bcdc3e301592f3061ec0e453 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 17:41:17 -0400 Subject: [PATCH 008/448] feat(topology): extract IP + subnet allocators as reusable services MazeNET phase 2 step 1. Pulls inline IP/subnet allocation out of the generator into decnet/topology/allocator.py so the editor + reconciler can reuse the same primitives without duplicating logic. - IPAllocator: stateful host-IP handout with reserve/release/is_free. - SubnetAllocator: /24 handout under a base prefix, skips reservations. 
- reserved_subnets(repo): collects claimed subnets across every non-torn_down topology so concurrent drafts cannot collide. - generate() accepts reserved_subnets= to skip existing claims. Generator output is byte-identical under seed (behavior preserved). --- decnet/topology/allocator.py | 129 +++++++++++++++++++++++++ decnet/topology/generator.py | 60 ++++++------ tests/topology/test_allocator.py | 155 +++++++++++++++++++++++++++++++ 3 files changed, 313 insertions(+), 31 deletions(-) create mode 100644 decnet/topology/allocator.py create mode 100644 tests/topology/test_allocator.py diff --git a/decnet/topology/allocator.py b/decnet/topology/allocator.py new file mode 100644 index 00000000..2749688b --- /dev/null +++ b/decnet/topology/allocator.py @@ -0,0 +1,129 @@ +"""IP and subnet allocators for MazeNET topologies. + +Extracted from :mod:`decnet.topology.generator` so the same primitives +can be reused by the generator, the pre-deploy editor (REST), and the +mutator reconciler. The allocators are pure — persistence lives in the +repo; these objects hold in-memory state for a single planning pass. + +``reserved_subnets`` queries the repo for every subnet currently claimed +by a non-``torn_down`` topology so a new draft cannot collide with an +open one. +""" +from __future__ import annotations + +from ipaddress import IPv4Network +from typing import Any, Iterable + +from decnet.topology.status import TopologyStatus + + +class AllocatorExhausted(RuntimeError): + """Raised when an allocator cannot produce another value.""" + + +class IPAllocator: + """Hands out host IPs within a single LAN subnet. + + Skips the ``.1`` gateway. Callers may pre-seed taken IPs via + :meth:`reserve` before requesting :meth:`next_free`. 
+ """ + + def __init__(self, subnet: str) -> None: + self._net = IPv4Network(subnet, strict=False) + self._gateway = str(next(self._net.hosts())) + self._pool: list[str] = [ + str(ip) for ip in self._net.hosts() if str(ip) != self._gateway + ] + self._taken: set[str] = set() + self._cursor = 0 + + def next_free(self) -> str: + while self._cursor < len(self._pool): + ip = self._pool[self._cursor] + self._cursor += 1 + if ip not in self._taken: + self._taken.add(ip) + return ip + # Cursor past the end — fall back to a linear scan in case + # releases opened up earlier slots. + for ip in self._pool: + if ip not in self._taken: + self._taken.add(ip) + return ip + raise AllocatorExhausted( + f"no free IPs left in {self._net.with_prefixlen}" + ) + + def reserve(self, ip: str) -> None: + if ip == self._gateway: + raise ValueError(f"{ip} is the gateway of {self._net.with_prefixlen}") + if ip not in {str(h) for h in self._net.hosts()}: + raise ValueError(f"{ip} not in {self._net.with_prefixlen}") + self._taken.add(ip) + + def release(self, ip: str) -> None: + self._taken.discard(ip) + + def is_free(self, ip: str) -> bool: + return ip not in self._taken and ip in {str(h) for h in self._net.hosts()} and ip != self._gateway + + +class SubnetAllocator: + """Hands out ``/24`` subnets under a base prefix (e.g. ``172.20``).""" + + _MAX_INDEX = 256 # 172.20.0/24 .. 
172.20.255/24 + + def __init__( + self, + base_prefix: str, + reserved: Iterable[str] = (), + ) -> None: + self._base = base_prefix.rstrip(".") + self._reserved: set[str] = {s for s in reserved} + self._cursor = 0 + + def _candidate(self, idx: int) -> str: + return f"{self._base}.{idx}.0/24" + + def next_free(self) -> str: + while self._cursor < self._MAX_INDEX: + subnet = self._candidate(self._cursor) + self._cursor += 1 + if subnet not in self._reserved: + self._reserved.add(subnet) + return subnet + raise AllocatorExhausted( + f"no free /24s left under {self._base}.0.0/16" + ) + + def reserve(self, subnet: str) -> None: + self._reserved.add(subnet) + + def is_free(self, subnet: str) -> bool: + return subnet not in self._reserved + + +# Topology statuses whose LANs still claim subnets. torn_down is the +# only state that releases its networks back to the pool. +_SUBNET_CLAIMING_STATES: frozenset[str] = frozenset( + { + TopologyStatus.PENDING, + TopologyStatus.DEPLOYING, + TopologyStatus.ACTIVE, + TopologyStatus.DEGRADED, + TopologyStatus.FAILED, + TopologyStatus.TEARING_DOWN, + } +) + + +async def reserved_subnets(repo: Any) -> set[str]: + """All LAN subnets currently claimed by non-torn-down topologies.""" + out: set[str] = set() + for status in _SUBNET_CLAIMING_STATES: + for topo in await repo.list_topologies(status=status): + for lan in await repo.list_lans_for_topology(topo["id"]): + subnet = lan.get("subnet") + if subnet: + out.add(subnet) + return out diff --git a/decnet/topology/generator.py b/decnet/topology/generator.py index bd3c468b..7933f189 100644 --- a/decnet/topology/generator.py +++ b/decnet/topology/generator.py @@ -11,10 +11,10 @@ containers is :mod:`decnet.engine.deployer`. 
from __future__ import annotations import random -from ipaddress import IPv4Network from typing import Optional from decnet.fleet import all_service_names +from decnet.topology.allocator import IPAllocator, SubnetAllocator from decnet.topology.config import ( GeneratedTopology, TopologyConfig, @@ -29,25 +29,24 @@ _SVC_MAX = 3 def _plan_lans( - config: TopologyConfig, rng: random.Random + config: TopologyConfig, + rng: random.Random, + subnets: SubnetAllocator, ) -> list[_PlannedLAN]: """Plan LANs as a tree of depth ``config.depth``. Each non-leaf level adds [1, branching_factor] children per parent. - LAN names and subnets are assigned in BFS order. + LAN names and subnets are assigned in BFS order; subnets come from + ``subnets``, which the caller may have pre-seeded with reservations + from other topologies. """ lans: list[_PlannedLAN] = [] - def _subnet(idx: int) -> str: - # Exhausting /24s at 172.X.0..255 caps topologies at 256 LANs on - # the default base. Well above the v1 envelope (depth=16 cap). - if idx > 255: - raise ValueError("too many LANs for the configured subnet_base_prefix") - return f"{config.subnet_base_prefix}.{idx}.0/24" - # DMZ root. 
lans.append( - _PlannedLAN(name="LAN-00", subnet=_subnet(0), is_dmz=True, parent=None) + _PlannedLAN( + name="LAN-00", subnet=subnets.next_free(), is_dmz=True, parent=None + ) ) frontier: list[_PlannedLAN] = [lans[0]] @@ -59,7 +58,7 @@ def _plan_lans( idx = len(lans) child = _PlannedLAN( name=f"LAN-{idx:02d}", - subnet=_subnet(idx), + subnet=subnets.next_free(), is_dmz=False, parent=parent.name, ) @@ -71,13 +70,6 @@ def _plan_lans( return lans -def _host_pool(subnet: str) -> list[str]: - """Usable host IPs in ``subnet``, skipping .1 (gateway).""" - net = IPv4Network(subnet, strict=False) - gateway = str(next(net.hosts())) - return [str(ip) for ip in net.hosts() if str(ip) != gateway] - - def _pick_services( rng: random.Random, services_explicit: Optional[list[str]], @@ -99,32 +91,38 @@ def _pick_services( return list(chosen) -def generate(config: TopologyConfig) -> GeneratedTopology: +def generate( + config: TopologyConfig, + *, + reserved_subnets: Optional[set[str]] = None, +) -> GeneratedTopology: """Generate a topology plan deterministically under ``config.seed``. The caller is responsible for persisting the plan via :mod:`decnet.topology.persistence` and then deploying it. + + ``reserved_subnets`` (optional): /24s already claimed by other + topologies. The subnet allocator skips these so two concurrent + drafts can't collide. Populate via + :func:`decnet.topology.allocator.reserved_subnets`. """ rng = random.Random(config.seed) # nosec B311 svc_pool = all_service_names() if config.randomize_services else [] used_combos: set[frozenset] = set() - lans = _plan_lans(config, rng) + subnets = SubnetAllocator( + config.subnet_base_prefix, reserved=reserved_subnets or set() + ) + lans = _plan_lans(config, rng, subnets) lans_by_name = {lan.name: lan for lan in lans} - # Per-LAN IP pools for deterministic assignment. - ip_iters: dict[str, list[str]] = { - lan.name: _host_pool(lan.subnet) for lan in lans + # Per-LAN IP allocators for deterministic assignment. 
+ ip_allocs: dict[str, IPAllocator] = { + lan.name: IPAllocator(lan.subnet) for lan in lans } - ip_cursors: dict[str, int] = {lan.name: 0 for lan in lans} def _take_ip(lan_name: str) -> str: - pool = ip_iters[lan_name] - i = ip_cursors[lan_name] - if i >= len(pool): - raise RuntimeError(f"LAN {lan_name} ran out of IPs") - ip_cursors[lan_name] = i + 1 - return pool[i] + return ip_allocs[lan_name].next_free() deckies: list[_PlannedDecky] = [] edges: list[_PlannedEdge] = [] diff --git a/tests/topology/test_allocator.py b/tests/topology/test_allocator.py new file mode 100644 index 00000000..472a81a4 --- /dev/null +++ b/tests/topology/test_allocator.py @@ -0,0 +1,155 @@ +"""Allocator unit + integration tests.""" +from __future__ import annotations + +import pytest + +from decnet.topology.allocator import ( + AllocatorExhausted, + IPAllocator, + SubnetAllocator, + reserved_subnets, +) +from decnet.topology.config import TopologyConfig +from decnet.topology.generator import generate +from decnet.topology.persistence import persist, transition_status +from decnet.topology.status import TopologyStatus +from decnet.web.db.factory import get_repository + + +# --------------------------------------------------------------------- IPAllocator + + +def test_ip_allocator_sequential_skips_gateway(): + a = IPAllocator("10.0.0.0/29") # hosts: .1 .. 
.6; .1 is gateway + got = [a.next_free() for _ in range(5)] + assert got == ["10.0.0.2", "10.0.0.3", "10.0.0.4", "10.0.0.5", "10.0.0.6"] + + +def test_ip_allocator_reserve_release_roundtrip(): + a = IPAllocator("10.0.0.0/29") + a.reserve("10.0.0.3") + assert not a.is_free("10.0.0.3") + a.release("10.0.0.3") + assert a.is_free("10.0.0.3") + + +def test_ip_allocator_reserve_rejects_gateway(): + a = IPAllocator("10.0.0.0/29") + with pytest.raises(ValueError): + a.reserve("10.0.0.1") + + +def test_ip_allocator_reserve_rejects_out_of_subnet(): + a = IPAllocator("10.0.0.0/29") + with pytest.raises(ValueError): + a.reserve("10.0.0.100") + + +def test_ip_allocator_next_free_after_reserve_skips(): + a = IPAllocator("10.0.0.0/29") + a.reserve("10.0.0.2") + assert a.next_free() == "10.0.0.3" + + +def test_ip_allocator_exhaustion_raises(): + a = IPAllocator("10.0.0.0/30") # hosts: .1 .. .2; .1 gateway → only .2 usable + assert a.next_free() == "10.0.0.2" + with pytest.raises(AllocatorExhausted): + a.next_free() + + +# --------------------------------------------------------------------- SubnetAllocator + + +def test_subnet_allocator_sequential(): + s = SubnetAllocator("172.20") + assert s.next_free() == "172.20.0.0/24" + assert s.next_free() == "172.20.1.0/24" + assert s.next_free() == "172.20.2.0/24" + + +def test_subnet_allocator_skips_reserved(): + s = SubnetAllocator("172.20", reserved={"172.20.0.0/24", "172.20.1.0/24"}) + assert s.next_free() == "172.20.2.0/24" + + +def test_subnet_allocator_reserve_is_idempotent(): + s = SubnetAllocator("172.20") + s.reserve("172.20.0.0/24") + assert s.next_free() == "172.20.1.0/24" + + +def test_subnet_allocator_exhaustion_raises(): + reserved = {f"10.0.{i}.0/24" for i in range(256)} + s = SubnetAllocator("10.0", reserved=reserved) + with pytest.raises(AllocatorExhausted): + s.next_free() + + +# --------------------------------------------------------------------- reserved_subnets + + +def _cfg(**kw) -> TopologyConfig: + base = dict( + 
name="alloc", + depth=1, + branching_factor=1, + deckies_per_lan_min=1, + deckies_per_lan_max=1, + cross_edge_probability=0.0, + randomize_services=False, + services_explicit=["ssh"], + seed=3, + ) + base.update(kw) + return TopologyConfig(**base) + + +@pytest.fixture +async def repo(tmp_path): + r = get_repository(db_path=str(tmp_path / "alloc.db")) + await r.initialize() + return r + + +@pytest.mark.anyio +async def test_reserved_subnets_includes_pending_and_active(repo): + plan_a = generate(_cfg(name="a")) + tid_a = await persist(repo, plan_a) # pending + + plan_b = generate(_cfg(name="b", subnet_base_prefix="172.21")) + tid_b = await persist(repo, plan_b) + await transition_status(repo, tid_b, TopologyStatus.DEPLOYING) + # DEPLOYING → ACTIVE + await transition_status(repo, tid_b, TopologyStatus.ACTIVE) + + claimed = await reserved_subnets(repo) + for lan in plan_a.lans: + assert lan.subnet in claimed + for lan in plan_b.lans: + assert lan.subnet in claimed + + +@pytest.mark.anyio +async def test_reserved_subnets_excludes_torn_down(repo): + plan = generate(_cfg(name="gone")) + tid = await persist(repo, plan) + # pending → torn_down is legal + await transition_status(repo, tid, TopologyStatus.TORN_DOWN) + + claimed = await reserved_subnets(repo) + for lan in plan.lans: + assert lan.subnet not in claimed + + +@pytest.mark.anyio +async def test_generate_respects_reserved(repo): + plan_a = generate(_cfg(name="a")) + await persist(repo, plan_a) + claimed = await reserved_subnets(repo) + # Second topology on the same base, told about reservations: must + # pick subnets not in the first one's set. 
+ plan_b = generate(_cfg(name="b"), reserved_subnets=claimed) + b_subnets = {lan.subnet for lan in plan_b.lans} + a_subnets = {lan.subnet for lan in plan_a.lans} + assert b_subnets.isdisjoint(a_subnets) From d4f4c58277b4ba397f337aac2d8f42be0b3a8f30 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 17:42:37 -0400 Subject: [PATCH 009/448] feat(topology): thread per-service config overrides through compose MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MazeNET phase 2 step 2. Mirrors the flat-fleet service_config pattern (DeckyConfig.service_config → composer → svc.compose_fragment) into the topology compose pipeline, so a hand-authored decky can carry overrides like {"ssh": {"password": "megapassword"}} and the ssh fragment reads them just like the flat path does. - _PlannedDecky gains service_config: dict[str, dict]. - persist() stores it under decky_config["service_config"]. - topology/compose.py passes cfg.get("service_config", {}).get(svc, {}) to svc.compose_fragment(service_cfg=...). Schema unchanged — service_config lives inside the existing decky_config JSON blob. Zero changes in decnet/services/*. 
--- decnet/topology/compose.py | 5 +- decnet/topology/config.py | 4 + decnet/topology/persistence.py | 1 + tests/topology/test_service_config.py | 112 ++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 tests/topology/test_service_config.py diff --git a/decnet/topology/compose.py b/decnet/topology/compose.py index 25b07285..955e4233 100644 --- a/decnet/topology/compose.py +++ b/decnet/topology/compose.py @@ -62,6 +62,7 @@ def generate_topology_compose(hydrated: dict[str, Any]) -> dict: name = cfg["name"] ips_by_lan: dict[str, str] = cfg["ips_by_lan"] forwards_l3: bool = cfg.get("forwards_l3", False) + service_config: dict[str, dict] = cfg.get("service_config", {}) or {} svc_names: list[str] = decky["services"] base_key = name @@ -92,7 +93,9 @@ def generate_topology_compose(hydrated: dict[str, Any]) -> dict: svc = get_service(svc_name) if svc is None or svc.fleet_singleton: continue - fragment = svc.compose_fragment(name, service_cfg={}) + fragment = svc.compose_fragment( + name, service_cfg=service_config.get(svc_name, {}) + ) if "build" in fragment: fragment["build"].setdefault("args", {}).setdefault( "BASE_IMAGE", _DEFAULT_BASE_IMAGE diff --git a/decnet/topology/config.py b/decnet/topology/config.py index 3b9a13a3..927a6c5c 100644 --- a/decnet/topology/config.py +++ b/decnet/topology/config.py @@ -69,6 +69,10 @@ class _PlannedDecky: # Mapping LAN-name → assigned IP within that LAN's subnet. ips_by_lan: dict[str, str] = field(default_factory=dict) forwards_l3: bool = False # only meaningful when present on ≥2 LANs + # Per-service config overrides: {service_name: {field: value}}. + # Mirrors ``DeckyConfig.service_config`` from the flat-fleet path; + # services read these via ``compose_fragment(service_cfg=...)``. 
+ service_config: dict[str, dict] = field(default_factory=dict) @dataclass diff --git a/decnet/topology/persistence.py b/decnet/topology/persistence.py index 0f07c270..4c361c7f 100644 --- a/decnet/topology/persistence.py +++ b/decnet/topology/persistence.py @@ -52,6 +52,7 @@ async def persist(repo: Any, plan: GeneratedTopology) -> str: "services": decky.services, "ips_by_lan": decky.ips_by_lan, "forwards_l3": decky.forwards_l3, + "service_config": decky.service_config, }, "ip": primary_ip, } diff --git a/tests/topology/test_service_config.py b/tests/topology/test_service_config.py new file mode 100644 index 00000000..92078447 --- /dev/null +++ b/tests/topology/test_service_config.py @@ -0,0 +1,112 @@ +"""Per-decky, per-service config roundtrips through persist + compose.""" +from __future__ import annotations + +import pytest +import yaml + +from decnet.topology.compose import generate_topology_compose +from decnet.topology.config import TopologyConfig +from decnet.topology.generator import generate +from decnet.topology.persistence import hydrate, persist +from decnet.web.db.factory import get_repository + + +def _cfg(**kw) -> TopologyConfig: + base = dict( + name="svc", + depth=1, + branching_factor=1, + deckies_per_lan_min=1, + deckies_per_lan_max=1, + cross_edge_probability=0.0, + randomize_services=False, + services_explicit=["ssh"], + seed=5, + ) + base.update(kw) + return TopologyConfig(**base) + + +@pytest.fixture +async def repo(tmp_path): + r = get_repository(db_path=str(tmp_path / "svc.db")) + await r.initialize() + return r + + +@pytest.mark.anyio +async def test_service_config_roundtrips(repo): + plan = generate(_cfg()) + # Operator-style override, as the web editor would write it. 
+ plan.deckies[0].service_config = {"ssh": {"password": "megapassword"}} + tid = await persist(repo, plan) + + hydrated = await hydrate(repo, tid) + decky = next( + d for d in hydrated["deckies"] if d["name"] == plan.deckies[0].name + ) + assert decky["decky_config"]["service_config"] == { + "ssh": {"password": "megapassword"} + } + + +@pytest.mark.anyio +async def test_service_config_reaches_compose_fragment(repo): + plan = generate(_cfg()) + plan.deckies[0].service_config = {"ssh": {"password": "megapassword"}} + tid = await persist(repo, plan) + + hydrated = await hydrate(repo, tid) + compose = generate_topology_compose(hydrated) + # The ssh fragment keys are "-ssh" (see compose.py:107). + ssh_key = f"{plan.deckies[0].name}-ssh" + frag = compose["services"][ssh_key] + env = frag.get("environment", {}) + assert env.get("SSH_ROOT_PASSWORD") == "megapassword" + + +@pytest.mark.anyio +async def test_missing_service_config_defaults_work(repo): + """No service_config override → service falls back to its default.""" + plan = generate(_cfg()) + tid = await persist(repo, plan) + hydrated = await hydrate(repo, tid) + compose = generate_topology_compose(hydrated) + ssh_key = f"{plan.deckies[0].name}-ssh" + frag = compose["services"][ssh_key] + assert frag["environment"]["SSH_ROOT_PASSWORD"] == "admin" + + +@pytest.mark.anyio +async def test_unknown_nested_key_passes_through(repo): + """Forward-compat: unknown keys under a service reach the fragment + untouched (current services ignore them; future services may read).""" + plan = generate(_cfg()) + plan.deckies[0].service_config = { + "ssh": {"password": "x", "future_flag": "hi"} + } + tid = await persist(repo, plan) + hydrated = await hydrate(repo, tid) + decky = next( + d for d in hydrated["deckies"] if d["name"] == plan.deckies[0].name + ) + assert ( + decky["decky_config"]["service_config"]["ssh"]["future_flag"] == "hi" + ) + + +@pytest.mark.anyio +async def test_compose_file_yaml_is_loadable(repo): + """Regression: the 
compose dict roundtrips through yaml cleanly.""" + plan = generate(_cfg()) + plan.deckies[0].service_config = {"ssh": {"password": "roundtrip"}} + tid = await persist(repo, plan) + hydrated = await hydrate(repo, tid) + compose = generate_topology_compose(hydrated) + dumped = yaml.dump(compose, sort_keys=False) + reloaded = yaml.safe_load(dumped) + ssh_key = f"{plan.deckies[0].name}-ssh" + assert ( + reloaded["services"][ssh_key]["environment"]["SSH_ROOT_PASSWORD"] + == "roundtrip" + ) From 2544d0294a179c7c077cfe4bea2b3bc23a7ec27d Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 17:45:32 -0400 Subject: [PATCH 010/448] feat(topology): add pre-deploy validator and wire into deploy_topology MazeNET phase 2 step 3. Blocks deploys of hand-authored topologies that would fail mid-bring-up (orphan deckies, duplicate IPs, overlapping subnets, unknown services) with a structured error list instead of a docker error at startup. Rules (one function each, composable by the editor for inline hints): - exactly one DMZ - every LAN has a bridge chain to the DMZ (BFS via multi-homed deckies) - no orphan deckies - unique LAN and decky names per topology - no IP collisions + IPs inside their LAN's subnet - no LAN subnet overlaps - every service in decnet.fleet.all_service_names() - service_config keys match the decky's declared services deploy_topology runs the validator after hydrate, before any status transition or Docker call; errors raise ValidationError and status stays at pending. 
--- decnet/engine/deployer.py | 7 + decnet/topology/validate.py | 306 ++++++++++++++++++++++++++++++++ tests/topology/test_validate.py | 178 +++++++++++++++++++ 3 files changed, 491 insertions(+) create mode 100644 decnet/topology/validate.py create mode 100644 tests/topology/test_validate.py diff --git a/decnet/engine/deployer.py b/decnet/engine/deployer.py index 22c8d592..ef8b1796 100644 --- a/decnet/engine/deployer.py +++ b/decnet/engine/deployer.py @@ -35,6 +35,7 @@ from decnet.topology.compose import ( ) from decnet.topology.persistence import hydrate, transition_status from decnet.topology.status import TopologyStatus +from decnet.topology.validate import ValidationError, errors as _validation_errors, validate as _validate_topology log = get_logger("engine") console = Console() @@ -318,6 +319,12 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N if hydrated is None: raise ValueError(f"topology {topology_id!r} not found") + # Precondition: validate before any status transition or Docker call. + # Errors bubble up as ValidationError and leave status untouched. + issues = _validate_topology(hydrated) + if _validation_errors(issues): + raise ValidationError(issues) + lans = hydrated["lans"] compose_path = _topology_compose_path(topology_id) diff --git a/decnet/topology/validate.py b/decnet/topology/validate.py new file mode 100644 index 00000000..3043af3f --- /dev/null +++ b/decnet/topology/validate.py @@ -0,0 +1,306 @@ +"""Pre-deploy validator for MazeNET topologies. + +Consumes a hydrated dict (output of +:func:`decnet.topology.persistence.hydrate`) and returns a list of +:class:`ValidationIssue` records. The deployer calls :func:`validate` +before transitioning to ``DEPLOYING`` and refuses to proceed if any +issue has ``severity=="error"``. + +Rules are independent functions so the web editor can surface them as +inline diagnostics without running the full list. 
+""" +from __future__ import annotations + +from dataclasses import dataclass, field +from ipaddress import IPv4Address, IPv4Network +from typing import Any, Callable, Literal + +from decnet.fleet import all_service_names + +Severity = Literal["error", "warning"] + + +@dataclass +class ValidationIssue: + severity: Severity + code: str + message: str + target: dict = field(default_factory=dict) + + +class ValidationError(Exception): + """Raised by the deployer when a topology fails pre-deploy checks.""" + + def __init__(self, issues: list[ValidationIssue]) -> None: + self.issues = issues + errors = [i for i in issues if i.severity == "error"] + super().__init__( + f"{len(errors)} topology validation error(s): " + + "; ".join(f"[{i.code}] {i.message}" for i in errors) + ) + + +# --------------------------------------------------------------------- rules + + +def check_exactly_one_dmz(h: dict[str, Any]) -> list[ValidationIssue]: + dmzs = [lan for lan in h["lans"] if lan.get("is_dmz")] + if len(dmzs) == 1: + return [] + if not dmzs: + return [ + ValidationIssue("error", "DMZ_MISSING", "no LAN is marked is_dmz=True") + ] + return [ + ValidationIssue( + "error", + "DMZ_MULTIPLE", + f"{len(dmzs)} LANs marked is_dmz=True; exactly one allowed", + target={"lans": [lan["name"] for lan in dmzs]}, + ) + ] + + +def check_all_lans_connected_to_dmz( + h: dict[str, Any], +) -> list[ValidationIssue]: + lans = {lan["id"]: lan for lan in h["lans"]} + if not lans: + return [] + dmz = next((lan for lan in h["lans"] if lan.get("is_dmz")), None) + if dmz is None: + return [] # covered by check_exactly_one_dmz + + # Adjacency: LANs share an edge if ≥1 bridge decky is attached to both. 
+ decky_lans: dict[str, set[str]] = {} + for edge in h["edges"]: + decky_lans.setdefault(edge["decky_uuid"], set()).add(edge["lan_id"]) + + adj: dict[str, set[str]] = {lid: set() for lid in lans} + for lan_ids in decky_lans.values(): + if len(lan_ids) < 2: + continue + for a in lan_ids: + for b in lan_ids: + if a != b: + adj[a].add(b) + + reachable = {dmz["id"]} + frontier = [dmz["id"]] + while frontier: + nxt: list[str] = [] + for lid in frontier: + for peer in adj[lid]: + if peer not in reachable: + reachable.add(peer) + nxt.append(peer) + frontier = nxt + + orphans = [lans[lid]["name"] for lid in lans if lid not in reachable] + if not orphans: + return [] + return [ + ValidationIssue( + "error", + "DMZ_ORPHAN", + f"LAN(s) have no bridge path to the DMZ: {', '.join(orphans)}", + target={"lans": orphans}, + ) + ] + + +def check_no_orphan_deckies(h: dict[str, Any]) -> list[ValidationIssue]: + attached: set[str] = {e["decky_uuid"] for e in h["edges"]} + issues: list[ValidationIssue] = [] + for d in h["deckies"]: + if d["uuid"] not in attached: + issues.append( + ValidationIssue( + "error", + "DECKY_ORPHAN", + f"decky {d['name']!r} has no LAN edges", + target={"decky": d["name"]}, + ) + ) + return issues + + +def check_names_unique(h: dict[str, Any]) -> list[ValidationIssue]: + issues: list[ValidationIssue] = [] + seen_lan: set[str] = set() + for lan in h["lans"]: + if lan["name"] in seen_lan: + issues.append( + ValidationIssue( + "error", + "LAN_NAME_DUP", + f"duplicate LAN name {lan['name']!r}", + target={"lan": lan["name"]}, + ) + ) + seen_lan.add(lan["name"]) + seen_decky: set[str] = set() + for d in h["deckies"]: + if d["name"] in seen_decky: + issues.append( + ValidationIssue( + "error", + "DECKY_NAME_DUP", + f"duplicate decky name {d['name']!r}", + target={"decky": d["name"]}, + ) + ) + seen_decky.add(d["name"]) + return issues + + +def check_no_ip_collisions(h: dict[str, Any]) -> list[ValidationIssue]: + lans_by_name = {lan["name"]: lan for lan in h["lans"]} 
+ per_lan_ips: dict[str, dict[str, str]] = {} # lan_name → {ip: decky_name} + issues: list[ValidationIssue] = [] + for d in h["deckies"]: + ips_by_lan: dict[str, str] = (d.get("decky_config") or {}).get( + "ips_by_lan", {} + ) + for lan_name, ip in ips_by_lan.items(): + lan = lans_by_name.get(lan_name) + if lan is None: + issues.append( + ValidationIssue( + "error", + "IP_UNKNOWN_LAN", + f"decky {d['name']!r} claims IP in unknown LAN " + f"{lan_name!r}", + target={"decky": d["name"], "lan": lan_name}, + ) + ) + continue + # Out-of-subnet check. + try: + if IPv4Address(ip) not in IPv4Network(lan["subnet"]): + issues.append( + ValidationIssue( + "error", + "IP_OUT_OF_SUBNET", + f"{ip} not inside {lan['subnet']} " + f"(decky {d['name']!r}, LAN {lan_name!r})", + target={"decky": d["name"], "lan": lan_name, "ip": ip}, + ) + ) + except (ValueError, TypeError): + issues.append( + ValidationIssue( + "error", + "IP_MALFORMED", + f"decky {d['name']!r}: malformed IP {ip!r}", + target={"decky": d["name"], "ip": ip}, + ) + ) + continue + bucket = per_lan_ips.setdefault(lan_name, {}) + if ip in bucket: + issues.append( + ValidationIssue( + "error", + "IP_COLLISION", + f"IP {ip} claimed by both {bucket[ip]!r} and " + f"{d['name']!r} in LAN {lan_name!r}", + target={ + "lan": lan_name, + "ip": ip, + "deckies": [bucket[ip], d["name"]], + }, + ) + ) + else: + bucket[ip] = d["name"] + return issues + + +def check_no_subnet_overlap(h: dict[str, Any]) -> list[ValidationIssue]: + nets: list[tuple[str, IPv4Network]] = [] + issues: list[ValidationIssue] = [] + for lan in h["lans"]: + try: + nets.append((lan["name"], IPv4Network(lan["subnet"]))) + except ValueError: + issues.append( + ValidationIssue( + "error", + "SUBNET_MALFORMED", + f"LAN {lan['name']!r}: malformed subnet {lan['subnet']!r}", + target={"lan": lan["name"]}, + ) + ) + for i, (na, a) in enumerate(nets): + for nb, b in nets[i + 1 :]: + if a.overlaps(b): + issues.append( + ValidationIssue( + "error", + "SUBNET_OVERLAP", + 
f"LAN {na!r} ({a}) overlaps LAN {nb!r} ({b})", + target={"lans": [na, nb]}, + ) + ) + return issues + + +def check_services_known(h: dict[str, Any]) -> list[ValidationIssue]: + known = set(all_service_names()) + issues: list[ValidationIssue] = [] + for d in h["deckies"]: + for svc in d.get("services", []): + if svc not in known: + issues.append( + ValidationIssue( + "error", + "UNKNOWN_SERVICE", + f"decky {d['name']!r}: unknown service {svc!r}", + target={"decky": d["name"], "service": svc}, + ) + ) + return issues + + +def check_service_config_shape(h: dict[str, Any]) -> list[ValidationIssue]: + issues: list[ValidationIssue] = [] + for d in h["deckies"]: + svc_cfg = (d.get("decky_config") or {}).get("service_config") or {} + declared = set(d.get("services", [])) + for svc_name in svc_cfg: + if svc_name not in declared: + issues.append( + ValidationIssue( + "error", + "SERVICE_CFG_UNDECLARED", + f"decky {d['name']!r}: service_config for " + f"{svc_name!r} but service not in services list", + target={"decky": d["name"], "service": svc_name}, + ) + ) + return issues + + +_RULES: list[Callable[[dict[str, Any]], list[ValidationIssue]]] = [ + check_exactly_one_dmz, + check_all_lans_connected_to_dmz, + check_no_orphan_deckies, + check_names_unique, + check_no_ip_collisions, + check_no_subnet_overlap, + check_services_known, + check_service_config_shape, +] + + +def validate(hydrated: dict[str, Any]) -> list[ValidationIssue]: + """Run every rule and return the flat list of issues (may be empty).""" + out: list[ValidationIssue] = [] + for rule in _RULES: + out.extend(rule(hydrated)) + return out + + +def errors(issues: list[ValidationIssue]) -> list[ValidationIssue]: + return [i for i in issues if i.severity == "error"] diff --git a/tests/topology/test_validate.py b/tests/topology/test_validate.py new file mode 100644 index 00000000..8863507f --- /dev/null +++ b/tests/topology/test_validate.py @@ -0,0 +1,178 @@ +"""Validator-rule unit tests + deployer precondition 
integration.""" +from __future__ import annotations + +from unittest.mock import patch + +import pytest + +from decnet.engine.deployer import deploy_topology +from decnet.topology.config import TopologyConfig +from decnet.topology.generator import generate +from decnet.topology.persistence import hydrate, persist +from decnet.topology.status import TopologyStatus +from decnet.topology.validate import ( + ValidationError, + errors, + validate, +) +from decnet.web.db.factory import get_repository + + +def _cfg(**kw) -> TopologyConfig: + base = dict( + name="val", + depth=1, + branching_factor=1, + deckies_per_lan_min=1, + deckies_per_lan_max=1, + cross_edge_probability=0.0, + randomize_services=False, + services_explicit=["ssh"], + seed=9, + ) + base.update(kw) + return TopologyConfig(**base) + + +@pytest.fixture +async def repo(tmp_path): + r = get_repository(db_path=str(tmp_path / "val.db")) + await r.initialize() + return r + + +async def _hydrate_plan(repo, plan) -> dict: + tid = await persist(repo, plan) + return await hydrate(repo, tid), tid + + +# --------------------------------------------------------------------- rules + + +@pytest.mark.anyio +async def test_valid_topology_has_no_errors(repo): + plan = generate(_cfg()) + h, _ = await _hydrate_plan(repo, plan) + assert errors(validate(h)) == [] + + +@pytest.mark.anyio +async def test_dmz_missing(repo): + plan = generate(_cfg()) + h, _ = await _hydrate_plan(repo, plan) + for lan in h["lans"]: + lan["is_dmz"] = False + codes = [i.code for i in validate(h) if i.severity == "error"] + # DMZ_MISSING plus cascaded DMZ_ORPHAN checks are both acceptable; + # the specific rule must fire at minimum. 
+ assert "DMZ_MISSING" in codes + + +@pytest.mark.anyio +async def test_dmz_multiple(repo): + plan = generate(_cfg()) + h, _ = await _hydrate_plan(repo, plan) + for lan in h["lans"]: + lan["is_dmz"] = True + assert "DMZ_MULTIPLE" in [i.code for i in validate(h)] + + +@pytest.mark.anyio +async def test_orphan_decky(repo): + plan = generate(_cfg()) + h, _ = await _hydrate_plan(repo, plan) + h["edges"] = [e for e in h["edges"] if e["decky_uuid"] != h["deckies"][0]["uuid"]] + assert "DECKY_ORPHAN" in [i.code for i in validate(h)] + + +@pytest.mark.anyio +async def test_ip_collision(repo): + plan = generate(_cfg(deckies_per_lan_max=2, deckies_per_lan_min=2)) + h, _ = await _hydrate_plan(repo, plan) + # Force two deckies in the same LAN to claim the same IP. + deckies = [ + d for d in h["deckies"] + if any( + e["decky_uuid"] == d["uuid"] + for e in h["edges"] + if e["lan_id"] == h["lans"][0]["id"] + ) + ] + assert len(deckies) >= 2 + shared_ip = next(iter(deckies[0]["decky_config"]["ips_by_lan"].values())) + deckies[1]["decky_config"]["ips_by_lan"][h["lans"][0]["name"]] = shared_ip + assert "IP_COLLISION" in [i.code for i in validate(h)] + + +@pytest.mark.anyio +async def test_ip_out_of_subnet(repo): + plan = generate(_cfg()) + h, _ = await _hydrate_plan(repo, plan) + d = h["deckies"][0] + lan_name = next(iter(d["decky_config"]["ips_by_lan"])) + d["decky_config"]["ips_by_lan"][lan_name] = "10.99.99.99" + assert "IP_OUT_OF_SUBNET" in [i.code for i in validate(h)] + + +@pytest.mark.anyio +async def test_subnet_overlap(repo): + plan = generate(_cfg()) + h, _ = await _hydrate_plan(repo, plan) + # Shrink two LANs onto overlapping /16s. 
+ h["lans"][0]["subnet"] = "10.0.0.0/16" + if len(h["lans"]) > 1: + h["lans"][1]["subnet"] = "10.0.5.0/24" + codes = [i.code for i in validate(h)] + assert "SUBNET_OVERLAP" in codes + + +@pytest.mark.anyio +async def test_unknown_service(repo): + plan = generate(_cfg()) + h, _ = await _hydrate_plan(repo, plan) + h["deckies"][0]["services"].append("teleporter-xyz") + assert "UNKNOWN_SERVICE" in [i.code for i in validate(h)] + + +@pytest.mark.anyio +async def test_service_config_undeclared(repo): + plan = generate(_cfg()) + h, _ = await _hydrate_plan(repo, plan) + h["deckies"][0]["decky_config"]["service_config"] = { + "rdp": {"password": "no"} + } + # "rdp" is not in the decky's services list (which is ["ssh"]). + assert "SERVICE_CFG_UNDECLARED" in [i.code for i in validate(h)] + + +# --------------------------------------------------------------------- deployer hook + + +@pytest.mark.anyio +async def test_deploy_aborts_on_validation_error(repo, tmp_path, monkeypatch): + """Broken topology must be rejected before any Docker call.""" + monkeypatch.chdir(tmp_path) + plan = generate(_cfg()) + tid = await persist(repo, plan) + + # Corrupt the persisted state: strip the DMZ flag. + lan = (await repo.list_lans_for_topology(tid))[0] + # Use raw repo path — SQLModel UPDATE via get + setattr. 
+ from sqlmodel import select + from decnet.web.db.models import LAN + async with repo._session() as s: + row = (await s.execute(select(LAN).where(LAN.id == lan["id"]))).scalar_one() + row.is_dmz = False + s.add(row) + await s.commit() + + class _ShouldNotCall: + def from_env(self): # noqa: D401 + raise AssertionError("docker must not be called on a rejected topology") + + with patch("decnet.engine.deployer.docker", _ShouldNotCall()): + with pytest.raises(ValidationError): + await deploy_topology(repo, tid) + + topo = await repo.get_topology(tid) + assert topo["status"] == TopologyStatus.PENDING From e475c0957ecd9867122775865779600597c1cf75 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 17:47:28 -0400 Subject: [PATCH 011/448] feat(topology): optimistic concurrency via Topology.version + expected_version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MazeNET phase 2 step 4. Readies the repo layer for concurrent editors (web canvas + CLI + mutator) without lost-write races. - Topology.version: monotonically bumped on supervised child-row writes. - VersionConflict exception carries {current, expected} for the UI. - _check_and_bump_version helper reads Topology in the same session, compares against expected_version, raises on mismatch, bumps on match. Commit happens in the caller's existing transaction so check+bump+write are atomic per mutation. - add_lan / update_lan / add_topology_decky / update_topology_decky / add_topology_edge accept expected_version=None by default, preserving every existing caller's behavior. When expected_version is None, no check runs and version stays put — internal callers (persist) that don't care about concurrency keep working unchanged. 
--- decnet/topology/status.py | 16 ++++ decnet/web/db/models.py | 4 + decnet/web/db/sqlmodel_repo.py | 98 ++++++++++++++++++++++-- tests/topology/test_concurrency.py | 118 +++++++++++++++++++++++++++++ 4 files changed, 231 insertions(+), 5 deletions(-) create mode 100644 tests/topology/test_concurrency.py diff --git a/decnet/topology/status.py b/decnet/topology/status.py index 2e1b8c76..cc5c818c 100644 --- a/decnet/topology/status.py +++ b/decnet/topology/status.py @@ -53,6 +53,22 @@ class TopologyStatusError(ValueError): """Raised when an illegal topology status transition is attempted.""" +class VersionConflict(RuntimeError): + """Raised when a topology write is supplied a stale ``expected_version``. + + Optimistic concurrency guard: the caller passed the version it last + observed, and the topology has since been mutated by someone else. + The caller should re-read and retry. + """ + + def __init__(self, *, current: int, expected: int) -> None: + self.current = current + self.expected = expected + super().__init__( + f"topology version conflict: expected {expected}, current is {current}" + ) + + def assert_transition(current: str, new: str) -> None: """Validate ``current → new`` or raise :class:`TopologyStatusError`.""" if current not in TopologyStatus.ALL: diff --git a/decnet/web/db/models.py b/decnet/web/db/models.py index c94f4392..3a44ea08 100644 --- a/decnet/web/db/models.py +++ b/decnet/web/db/models.py @@ -216,6 +216,10 @@ class Topology(SQLModel, table=True): created_at: datetime = Field( default_factory=lambda: datetime.now(timezone.utc), index=True ) + # Optimistic-concurrency token. Bumped by repo methods that mutate + # the topology or any child row when an expected_version is supplied. + # Callers pass their last-seen version; mismatch raises VersionConflict. 
+ version: int = Field(default=1, nullable=False) class LAN(SQLModel, table=True): diff --git a/decnet/web/db/sqlmodel_repo.py b/decnet/web/db/sqlmodel_repo.py index 910a4d8a..306d3d19 100644 --- a/decnet/web/db/sqlmodel_repo.py +++ b/decnet/web/db/sqlmodel_repo.py @@ -1027,18 +1027,76 @@ class SQLModelRepository(BaseRepository): await session.commit() return True - async def add_lan(self, data: dict[str, Any]) -> str: + async def _check_and_bump_version( + self, + session, + topology_id: str, + expected_version: Optional[int], + ) -> None: + """Optimistic-concurrency guard used by child-row mutators. + + If ``expected_version`` is None, no check happens (backward-compat + for internal callers that don't need concurrency protection). + + If supplied, loads the Topology row in the same session, + compares ``version == expected_version``, raises VersionConflict + on mismatch, otherwise bumps ``version += 1``. The caller must + commit the enclosing session. + """ + from decnet.topology.status import VersionConflict + + if expected_version is None: + return + result = await session.execute( + select(Topology).where(Topology.id == topology_id) + ) + topo = result.scalar_one_or_none() + if topo is None: + raise ValueError(f"topology {topology_id!r} not found") + if topo.version != expected_version: + raise VersionConflict( + current=topo.version, expected=expected_version + ) + topo.version = topo.version + 1 + session.add(topo) + + async def add_lan( + self, + data: dict[str, Any], + *, + expected_version: Optional[int] = None, + ) -> str: async with self._session() as session: + await self._check_and_bump_version( + session, data["topology_id"], expected_version + ) row = LAN(**data) session.add(row) await session.commit() await session.refresh(row) return row.id - async def update_lan(self, lan_id: str, fields: dict[str, Any]) -> None: + async def update_lan( + self, + lan_id: str, + fields: dict[str, Any], + *, + expected_version: Optional[int] = None, + ) -> None: 
if not fields: return async with self._session() as session: + if expected_version is not None: + # Need the LAN's topology_id to check version. + result = await session.execute( + select(LAN).where(LAN.id == lan_id) + ) + lan = result.scalar_one_or_none() + if lan is None: + raise ValueError(f"lan {lan_id!r} not found") + await self._check_and_bump_version( + session, lan.topology_id, expected_version + ) await session.execute( update(LAN).where(LAN.id == lan_id).values(**fields) ) @@ -1053,9 +1111,17 @@ class SQLModelRepository(BaseRepository): ) return [r.model_dump(mode="json") for r in result.scalars().all()] - async def add_topology_decky(self, data: dict[str, Any]) -> str: + async def add_topology_decky( + self, + data: dict[str, Any], + *, + expected_version: Optional[int] = None, + ) -> str: payload = self._serialize_json_fields(data, ("services", "decky_config")) async with self._session() as session: + await self._check_and_bump_version( + session, data["topology_id"], expected_version + ) row = TopologyDecky(**payload) session.add(row) await session.commit() @@ -1063,13 +1129,27 @@ class SQLModelRepository(BaseRepository): return row.uuid async def update_topology_decky( - self, decky_uuid: str, fields: dict[str, Any] + self, + decky_uuid: str, + fields: dict[str, Any], + *, + expected_version: Optional[int] = None, ) -> None: if not fields: return payload = self._serialize_json_fields(fields, ("services", "decky_config")) payload.setdefault("updated_at", datetime.now(timezone.utc)) async with self._session() as session: + if expected_version is not None: + result = await session.execute( + select(TopologyDecky).where(TopologyDecky.uuid == decky_uuid) + ) + d = result.scalar_one_or_none() + if d is None: + raise ValueError(f"decky {decky_uuid!r} not found") + await self._check_and_bump_version( + session, d.topology_id, expected_version + ) await session.execute( update(TopologyDecky) .where(TopologyDecky.uuid == decky_uuid) @@ -1093,8 +1173,16 @@ class 
SQLModelRepository(BaseRepository): for r in result.scalars().all() ] - async def add_topology_edge(self, data: dict[str, Any]) -> str: + async def add_topology_edge( + self, + data: dict[str, Any], + *, + expected_version: Optional[int] = None, + ) -> str: async with self._session() as session: + await self._check_and_bump_version( + session, data["topology_id"], expected_version + ) row = TopologyEdge(**data) session.add(row) await session.commit() diff --git a/tests/topology/test_concurrency.py b/tests/topology/test_concurrency.py new file mode 100644 index 00000000..af6bf77a --- /dev/null +++ b/tests/topology/test_concurrency.py @@ -0,0 +1,118 @@ +"""Optimistic-concurrency (version) checks on topology child mutations.""" +from __future__ import annotations + +import pytest + +from decnet.topology.config import TopologyConfig +from decnet.topology.generator import generate +from decnet.topology.persistence import persist +from decnet.topology.status import VersionConflict +from decnet.web.db.factory import get_repository + + +def _cfg(**kw) -> TopologyConfig: + base = dict( + name="ver", + depth=1, + branching_factor=1, + deckies_per_lan_min=1, + deckies_per_lan_max=1, + cross_edge_probability=0.0, + randomize_services=False, + services_explicit=["ssh"], + seed=2, + ) + base.update(kw) + return TopologyConfig(**base) + + +@pytest.fixture +async def repo(tmp_path): + r = get_repository(db_path=str(tmp_path / "ver.db")) + await r.initialize() + return r + + +@pytest.mark.anyio +async def test_version_starts_at_one_after_persist(repo): + plan = generate(_cfg()) + # persist() adds LANs/deckies/edges without expected_version, so + # the version token stays at 1. 
+ tid = await persist(repo, plan) + topo = await repo.get_topology(tid) + assert topo["version"] == 1 + + +@pytest.mark.anyio +async def test_happy_path_two_sequential_writes(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + + await repo.add_lan( + {"topology_id": tid, "name": "LAN-A", "subnet": "10.9.0.0/24", "is_dmz": False}, + expected_version=1, + ) + assert (await repo.get_topology(tid))["version"] == 2 + + await repo.add_lan( + {"topology_id": tid, "name": "LAN-B", "subnet": "10.9.1.0/24", "is_dmz": False}, + expected_version=2, + ) + assert (await repo.get_topology(tid))["version"] == 3 + + +@pytest.mark.anyio +async def test_stale_expected_version_raises(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + + await repo.add_lan( + {"topology_id": tid, "name": "LAN-A", "subnet": "10.8.0.0/24", "is_dmz": False}, + expected_version=1, + ) + with pytest.raises(VersionConflict) as ei: + await repo.add_lan( + {"topology_id": tid, "name": "LAN-B", "subnet": "10.8.1.0/24", "is_dmz": False}, + expected_version=1, # stale + ) + assert ei.value.current == 2 + assert ei.value.expected == 1 + + +@pytest.mark.anyio +async def test_no_expected_version_skips_check(repo): + """Existing callers (persist) don't pass expected_version and must + continue to work without version bumps.""" + plan = generate(_cfg()) + tid = await persist(repo, plan) + before = (await repo.get_topology(tid))["version"] + await repo.add_lan( + {"topology_id": tid, "name": "LAN-X", "subnet": "10.7.0.0/24", "is_dmz": False} + ) + after = (await repo.get_topology(tid))["version"] + assert before == after # no bump when version not asserted + + +@pytest.mark.anyio +async def test_update_topology_decky_bumps_version(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + decky = (await repo.list_topology_deckies(tid))[0] + await repo.update_topology_decky( + decky["uuid"], + {"decky_config": {"name": decky["name"], "services": ["ssh"], + "ips_by_lan": 
decky["decky_config"]["ips_by_lan"], + "forwards_l3": False, + "service_config": {"ssh": {"password": "x"}}}}, + expected_version=1, + ) + assert (await repo.get_topology(tid))["version"] == 2 + + +@pytest.mark.anyio +async def test_update_lan_bumps_version(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + lan = (await repo.list_lans_for_topology(tid))[0] + await repo.update_lan(lan["id"], {"name": "LAN-RENAMED"}, expected_version=1) + assert (await repo.get_topology(tid))["version"] == 2 From 9afaac7612307443bdada84564d77770995f2434 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 17:48:29 -0400 Subject: [PATCH 012/448] feat(topology): nullable layout coords on LAN + TopologyDecky MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MazeNET phase 2 step 5. Pure storage — the generator emits None for x/y and the web canvas fills them in later. No logic changes; no compose, deploy, or validator impact. --- decnet/topology/config.py | 7 ++++ decnet/topology/persistence.py | 4 +++ decnet/web/db/models.py | 8 +++++ tests/topology/test_layout.py | 58 ++++++++++++++++++++++++++++++++++ 4 files changed, 77 insertions(+) create mode 100644 tests/topology/test_layout.py diff --git a/decnet/topology/config.py b/decnet/topology/config.py index 927a6c5c..1f2098ec 100644 --- a/decnet/topology/config.py +++ b/decnet/topology/config.py @@ -59,6 +59,10 @@ class _PlannedLAN: subnet: str is_dmz: bool parent: Optional[str] # name of parent LAN, None for DMZ + # Canvas coordinates — generator leaves them None; the web editor + # (or a future auto-layouter) fills them in. + x: Optional[float] = None + y: Optional[float] = None @dataclass @@ -73,6 +77,9 @@ class _PlannedDecky: # Mirrors ``DeckyConfig.service_config`` from the flat-fleet path; # services read these via ``compose_fragment(service_cfg=...)``. service_config: dict[str, dict] = field(default_factory=dict) + # Canvas coordinates — see _PlannedLAN.x/y. 
+ x: Optional[float] = None + y: Optional[float] = None @dataclass diff --git a/decnet/topology/persistence.py b/decnet/topology/persistence.py index 4c361c7f..7dcef8cb 100644 --- a/decnet/topology/persistence.py +++ b/decnet/topology/persistence.py @@ -31,6 +31,8 @@ async def persist(repo: Any, plan: GeneratedTopology) -> str: "name": lan.name, "subnet": lan.subnet, "is_dmz": lan.is_dmz, + "x": lan.x, + "y": lan.y, } ) lan_ids[lan.name] = lan_id @@ -55,6 +57,8 @@ async def persist(repo: Any, plan: GeneratedTopology) -> str: "service_config": decky.service_config, }, "ip": primary_ip, + "x": decky.x, + "y": decky.y, } ) decky_ids[decky.name] = decky_uuid diff --git a/decnet/web/db/models.py b/decnet/web/db/models.py index 3a44ea08..085ce71f 100644 --- a/decnet/web/db/models.py +++ b/decnet/web/db/models.py @@ -232,6 +232,10 @@ class LAN(SQLModel, table=True): docker_network_id: Optional[str] = Field(default=None) subnet: str is_dmz: bool = Field(default=False) + # Canvas layout coordinates (set by the web editor). Nullable so + # generator-emitted LANs don't need auto-layout at generation time. + x: Optional[float] = Field(default=None) + y: Optional[float] = Field(default=None) class TopologyDecky(SQLModel, table=True): @@ -270,6 +274,10 @@ class TopologyDecky(SQLModel, table=True): updated_at: datetime = Field( default_factory=lambda: datetime.now(timezone.utc) ) + # Canvas layout coordinates (set by the web editor). Nullable so + # generator-emitted deckies don't need auto-layout at generation time. 
+ x: Optional[float] = Field(default=None) + y: Optional[float] = Field(default=None) class TopologyEdge(SQLModel, table=True): diff --git a/tests/topology/test_layout.py b/tests/topology/test_layout.py new file mode 100644 index 00000000..4d0c02fa --- /dev/null +++ b/tests/topology/test_layout.py @@ -0,0 +1,58 @@ +"""Layout coordinate roundtrips for LAN and TopologyDecky.""" +from __future__ import annotations + +import pytest + +from decnet.topology.config import TopologyConfig +from decnet.topology.generator import generate +from decnet.topology.persistence import hydrate, persist +from decnet.web.db.factory import get_repository + + +def _cfg(**kw) -> TopologyConfig: + base = dict( + name="layout", + depth=1, + branching_factor=1, + deckies_per_lan_min=1, + deckies_per_lan_max=1, + cross_edge_probability=0.0, + randomize_services=False, + services_explicit=["ssh"], + seed=4, + ) + base.update(kw) + return TopologyConfig(**base) + + +@pytest.fixture +async def repo(tmp_path): + r = get_repository(db_path=str(tmp_path / "layout.db")) + await r.initialize() + return r + + +@pytest.mark.anyio +async def test_coords_roundtrip_when_set(repo): + plan = generate(_cfg()) + plan.lans[0].x = 10.5 + plan.lans[0].y = -3.25 + plan.deckies[0].x = 42.0 + plan.deckies[0].y = 7.5 + tid = await persist(repo, plan) + hydrated = await hydrate(repo, tid) + lan = next(l for l in hydrated["lans"] if l["name"] == plan.lans[0].name) + assert lan["x"] == 10.5 and lan["y"] == -3.25 + d = next(d for d in hydrated["deckies"] if d["name"] == plan.deckies[0].name) + assert d["x"] == 42.0 and d["y"] == 7.5 + + +@pytest.mark.anyio +async def test_coords_default_to_none(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + hydrated = await hydrate(repo, tid) + for lan in hydrated["lans"]: + assert lan["x"] is None and lan["y"] is None + for d in hydrated["deckies"]: + assert d["x"] is None and d["y"] is None From 91df57d36bfebdc4c31017f2c5ad4c630d79171f Mon Sep 17 00:00:00 2001 
From: anti Date: Mon, 20 Apr 2026 17:50:29 -0400 Subject: [PATCH 013/448] feat(topology): pending-only mutation repo methods with cascade + guards MazeNET phase 2 step 6. Equips the repo layer with the CRUD the web editor needs before deploy. - TopologyNotEditable exception: raised when a pending-only method hits a non-pending topology. The intent is "free-form edits stop at deploy; the mutator (step 7) takes over for live topologies." - _assert_pending helper checks status inside the session. - update_lan / update_topology_decky accept enforce_pending=True for pre-deploy callers (existing internal callers default to False so behavior is unchanged). - delete_lan: cascades edges; refuses if any decky has only one edge (= this LAN is its home) to prevent orphans. - delete_topology_decky: cascades edges. - delete_topology_edge: bare-bones removal. All four mutators accept expected_version for optimistic concurrency. Existing tests continue to pass (no behavior change for persist/deploy). --- decnet/topology/status.py | 18 ++++ decnet/web/db/repository.py | 33 ++++++- decnet/web/db/sqlmodel_repo.py | 153 ++++++++++++++++++++++++++++++--- tests/topology/test_editing.py | 132 ++++++++++++++++++++++++++++ 4 files changed, 321 insertions(+), 15 deletions(-) create mode 100644 tests/topology/test_editing.py diff --git a/decnet/topology/status.py b/decnet/topology/status.py index cc5c818c..b01267b5 100644 --- a/decnet/topology/status.py +++ b/decnet/topology/status.py @@ -53,6 +53,24 @@ class TopologyStatusError(ValueError): """Raised when an illegal topology status transition is attempted.""" +class TopologyNotEditable(RuntimeError): + """Raised when a pending-only mutation hits a non-pending topology. + + Pre-deploy edits (update_lan, delete_lan, update/delete decky, + delete_edge) are only legal while the topology is ``pending``. + After deploy the mutator's reconciler + topology_mutations table + take over. 
+ """ + + def __init__(self, *, status: str, reason: str = "") -> None: + self.status = status + self.reason = reason + super().__init__( + f"topology not editable (status={status!r})" + + (f": {reason}" if reason else "") + ) + + class VersionConflict(RuntimeError): """Raised when a topology write is supplied a stale ``expected_version``. diff --git a/decnet/web/db/repository.py b/decnet/web/db/repository.py index d7f1f53d..67af535b 100644 --- a/decnet/web/db/repository.py +++ b/decnet/web/db/repository.py @@ -265,7 +265,14 @@ class BaseRepository(ABC): async def add_lan(self, data: dict[str, Any]) -> str: raise NotImplementedError - async def update_lan(self, lan_id: str, fields: dict[str, Any]) -> None: + async def update_lan( + self, + lan_id: str, + fields: dict[str, Any], + *, + expected_version: Optional[int] = None, + enforce_pending: bool = False, + ) -> None: raise NotImplementedError async def list_lans_for_topology( @@ -277,7 +284,12 @@ class BaseRepository(ABC): raise NotImplementedError async def update_topology_decky( - self, decky_uuid: str, fields: dict[str, Any] + self, + decky_uuid: str, + fields: dict[str, Any], + *, + expected_version: Optional[int] = None, + enforce_pending: bool = False, ) -> None: raise NotImplementedError @@ -298,3 +310,20 @@ class BaseRepository(ABC): self, topology_id: str, limit: int = 100 ) -> list[dict[str, Any]]: raise NotImplementedError + + # -------------------- pre-deploy (pending-only) mutations -------------------- + + async def delete_lan( + self, lan_id: str, *, expected_version: Optional[int] = None + ) -> None: + raise NotImplementedError + + async def delete_topology_decky( + self, decky_uuid: str, *, expected_version: Optional[int] = None + ) -> None: + raise NotImplementedError + + async def delete_topology_edge( + self, edge_id: str, *, expected_version: Optional[int] = None + ) -> None: + raise NotImplementedError diff --git a/decnet/web/db/sqlmodel_repo.py b/decnet/web/db/sqlmodel_repo.py index 
306d3d19..5d3da3ce 100644 --- a/decnet/web/db/sqlmodel_repo.py +++ b/decnet/web/db/sqlmodel_repo.py @@ -1027,6 +1027,23 @@ class SQLModelRepository(BaseRepository): await session.commit() return True + async def _assert_pending(self, session, topology_id: str) -> None: + """Pre-deploy edits are pending-only. Raises TopologyNotEditable.""" + from decnet.topology.status import TopologyNotEditable, TopologyStatus + + result = await session.execute( + select(Topology).where(Topology.id == topology_id) + ) + topo = result.scalar_one_or_none() + if topo is None: + raise ValueError(f"topology {topology_id!r} not found") + if topo.status != TopologyStatus.PENDING: + raise TopologyNotEditable( + status=topo.status, + reason="free-form edits are pending-only; use the " + "mutator (topology_mutations) after deploy", + ) + async def _check_and_bump_version( self, session, @@ -1082,18 +1099,20 @@ class SQLModelRepository(BaseRepository): fields: dict[str, Any], *, expected_version: Optional[int] = None, + enforce_pending: bool = False, ) -> None: if not fields: return async with self._session() as session: + result = await session.execute( + select(LAN).where(LAN.id == lan_id) + ) + lan = result.scalar_one_or_none() + if lan is None: + raise ValueError(f"lan {lan_id!r} not found") + if enforce_pending: + await self._assert_pending(session, lan.topology_id) if expected_version is not None: - # Need the LAN's topology_id to check version. - result = await session.execute( - select(LAN).where(LAN.id == lan_id) - ) - lan = result.scalar_one_or_none() - if lan is None: - raise ValueError(f"lan {lan_id!r} not found") await self._check_and_bump_version( session, lan.topology_id, expected_version ) @@ -1102,6 +1121,58 @@ class SQLModelRepository(BaseRepository): ) await session.commit() + async def delete_lan( + self, + lan_id: str, + *, + expected_version: Optional[int] = None, + ) -> None: + """Cascade-delete a LAN from a pending topology. 
+ + Rejects if any decky declares this LAN as its home (i.e. has a + non-bridge edge to it — the only LAN that decky lives in). The + caller must delete or reassign the home-deckies first. + """ + from decnet.topology.status import TopologyNotEditable # noqa: F401 + + async with self._session() as session: + result = await session.execute(select(LAN).where(LAN.id == lan_id)) + lan = result.scalar_one_or_none() + if lan is None: + return + await self._assert_pending(session, lan.topology_id) + + # Home-decky check: any decky whose only edge lands here? + edges_result = await session.execute( + select(TopologyEdge).where(TopologyEdge.lan_id == lan_id) + ) + edges_here = edges_result.scalars().all() + decky_uuids_on_this_lan = {e.decky_uuid for e in edges_here} + for decky_uuid in decky_uuids_on_this_lan: + other = await session.execute( + select(TopologyEdge).where( + TopologyEdge.decky_uuid == decky_uuid, + TopologyEdge.lan_id != lan_id, + ) + ) + if other.scalar_one_or_none() is None: + raise ValueError( + f"cannot delete LAN {lan.name!r}: decky " + f"{decky_uuid} has no other LAN (would be orphaned)" + ) + + if expected_version is not None: + await self._check_and_bump_version( + session, lan.topology_id, expected_version + ) + # Cascade edges → LAN. 
+ await session.execute( + text("DELETE FROM topology_edges WHERE lan_id = :l"), + {"l": lan_id}, + ) + await session.execute(text("DELETE FROM lans WHERE id = :l"), {"l": lan_id}) + await session.commit() + async def list_lans_for_topology( self, topology_id: str ) -> list[dict[str, Any]]: @@ -1134,19 +1205,22 @@ class SQLModelRepository(BaseRepository): fields: dict[str, Any], *, expected_version: Optional[int] = None, + enforce_pending: bool = False, ) -> None: if not fields: return payload = self._serialize_json_fields(fields, ("services", "decky_config")) payload.setdefault("updated_at", datetime.now(timezone.utc)) async with self._session() as session: + result = await session.execute( + select(TopologyDecky).where(TopologyDecky.uuid == decky_uuid) + ) + d = result.scalar_one_or_none() + if d is None: + raise ValueError(f"decky {decky_uuid!r} not found") + if enforce_pending: + await self._assert_pending(session, d.topology_id) if expected_version is not None: - result = await session.execute( - select(TopologyDecky).where(TopologyDecky.uuid == decky_uuid) - ) - d = result.scalar_one_or_none() - if d is None: - raise ValueError(f"decky {decky_uuid!r} not found") await self._check_and_bump_version( session, d.topology_id, expected_version ) @@ -1157,6 +1231,35 @@ class SQLModelRepository(BaseRepository): ) await session.commit() + async def delete_topology_decky( + self, + decky_uuid: str, + *, + expected_version: Optional[int] = None, + ) -> None: + """Cascade-delete a decky + all its edges from a pending topology.""" + async with self._session() as session: + result = await session.execute( + select(TopologyDecky).where(TopologyDecky.uuid == decky_uuid) + ) + d = result.scalar_one_or_none() + if d is None: + return + await self._assert_pending(session, d.topology_id) + if expected_version is not None: + await self._check_and_bump_version( + session, d.topology_id, expected_version + ) + await session.execute( + text("DELETE FROM topology_edges WHERE 
decky_uuid = :u"), + {"u": decky_uuid}, + ) + await session.execute( + text("DELETE FROM topology_deckies WHERE uuid = :u"), + {"u": decky_uuid}, + ) + await session.commit() + async def list_topology_deckies( self, topology_id: str ) -> list[dict[str, Any]]: @@ -1189,6 +1292,30 @@ class SQLModelRepository(BaseRepository): await session.refresh(row) return row.id + async def delete_topology_edge( + self, + edge_id: str, + *, + expected_version: Optional[int] = None, + ) -> None: + async with self._session() as session: + result = await session.execute( + select(TopologyEdge).where(TopologyEdge.id == edge_id) + ) + edge = result.scalar_one_or_none() + if edge is None: + return + await self._assert_pending(session, edge.topology_id) + if expected_version is not None: + await self._check_and_bump_version( + session, edge.topology_id, expected_version + ) + await session.execute( + text("DELETE FROM topology_edges WHERE id = :e"), + {"e": edge_id}, + ) + await session.commit() + async def list_topology_edges( self, topology_id: str ) -> list[dict[str, Any]]: diff --git a/tests/topology/test_editing.py b/tests/topology/test_editing.py new file mode 100644 index 00000000..3927be6b --- /dev/null +++ b/tests/topology/test_editing.py @@ -0,0 +1,132 @@ +"""Pre-deploy mutation repo methods: pending-only, version-aware.""" +from __future__ import annotations + +import pytest + +from decnet.topology.config import TopologyConfig +from decnet.topology.generator import generate +from decnet.topology.persistence import persist, transition_status +from decnet.topology.status import TopologyNotEditable, TopologyStatus +from decnet.web.db.factory import get_repository + + +def _cfg(**kw) -> TopologyConfig: + base = dict( + name="edit", + depth=1, + branching_factor=1, + deckies_per_lan_min=2, + deckies_per_lan_max=2, + cross_edge_probability=0.0, + randomize_services=False, + services_explicit=["ssh"], + seed=6, + ) + base.update(kw) + return TopologyConfig(**base) + + 
+@pytest.fixture +async def repo(tmp_path): + r = get_repository(db_path=str(tmp_path / "edit.db")) + await r.initialize() + return r + + +@pytest.mark.anyio +async def test_add_lan_to_pending_bumps_version(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + await repo.add_lan( + {"topology_id": tid, "name": "LAN-NEW", "subnet": "10.55.0.0/24", "is_dmz": False}, + expected_version=1, + ) + topo = await repo.get_topology(tid) + assert topo["version"] == 2 + lans = {l["name"] for l in await repo.list_lans_for_topology(tid)} + assert "LAN-NEW" in lans + + +@pytest.mark.anyio +async def test_update_decky_roundtrips_service_config(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + decky = (await repo.list_topology_deckies(tid))[0] + patch = dict(decky["decky_config"]) + patch["service_config"] = {"ssh": {"password": "megapassword"}} + await repo.update_topology_decky( + decky["uuid"], {"decky_config": patch}, expected_version=1, + ) + fresh = next( + d for d in await repo.list_topology_deckies(tid) + if d["uuid"] == decky["uuid"] + ) + assert fresh["decky_config"]["service_config"]["ssh"]["password"] == "megapassword" + + +@pytest.mark.anyio +async def test_update_decky_rejected_on_active_topology(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + decky = (await repo.list_topology_deckies(tid))[0] + # pending → deploying → active + await transition_status(repo, tid, TopologyStatus.DEPLOYING) + await transition_status(repo, tid, TopologyStatus.ACTIVE) + with pytest.raises(TopologyNotEditable) as ei: + await repo.update_topology_decky( + decky["uuid"], + {"decky_config": decky["decky_config"]}, + enforce_pending=True, + ) + assert ei.value.status == TopologyStatus.ACTIVE + + +@pytest.mark.anyio +async def test_delete_lan_with_home_decky_refused(repo): + """A LAN whose decky has no other edge cannot be deleted — it'd orphan.""" + plan = generate(_cfg(depth=1, branching_factor=1, deckies_per_lan_max=1, 
deckies_per_lan_min=1)) + tid = await persist(repo, plan) + lan = (await repo.list_lans_for_topology(tid))[0] + with pytest.raises(ValueError, match="orphaned"): + await repo.delete_lan(lan["id"]) + + +@pytest.mark.anyio +async def test_delete_edge_leaves_decky_intact(repo): + """Deleting one bridge edge of a multi-homed decky should succeed.""" + # depth=1 branching=1 gives DMZ(LAN-00) + LAN-01 with a bridge decky. + plan = generate(_cfg()) + tid = await persist(repo, plan) + edges = await repo.list_topology_edges(tid) + bridge_edges = [e for e in edges if e["is_bridge"]] + assert bridge_edges, "generator should produce at least one bridge edge" + # Delete exactly one — the bridge decky should keep at least one edge. + edge = bridge_edges[0] + before_deckies = {d["uuid"] for d in await repo.list_topology_deckies(tid)} + await repo.delete_topology_edge(edge["id"]) + after_deckies = {d["uuid"] for d in await repo.list_topology_deckies(tid)} + assert before_deckies == after_deckies + remaining = await repo.list_topology_edges(tid) + assert edge["id"] not in {e["id"] for e in remaining} + + +@pytest.mark.anyio +async def test_delete_decky_cascades_edges(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + decky = (await repo.list_topology_deckies(tid))[0] + await repo.delete_topology_decky(decky["uuid"]) + # No edge pointing to the removed decky remains. 
+ remaining = await repo.list_topology_edges(tid) + assert decky["uuid"] not in {e["decky_uuid"] for e in remaining} + + +@pytest.mark.anyio +async def test_delete_edge_rejected_on_active(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + edges = await repo.list_topology_edges(tid) + await transition_status(repo, tid, TopologyStatus.DEPLOYING) + await transition_status(repo, tid, TopologyStatus.ACTIVE) + with pytest.raises(TopologyNotEditable): + await repo.delete_topology_edge(edges[0]["id"]) From a76b9ecdf9617f94624a4680e74255848701cb62 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 18:02:37 -0400 Subject: [PATCH 014/448] =?UTF-8?q?feat(mazenet):=20step=207=20=E2=80=94?= =?UTF-8?q?=20topology=5Fmutations=20queue=20+=20mutator=20reconciler?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the live-mutation pipeline for active/degraded topologies: * TopologyMutation table with composite index (state, topology_id) so the watch-loop guard query stays O(log n). * claim_next_mutation is a single atomic UPDATE ... WHERE state='pending' so racing reconcilers deterministically pick one winner; losers see rowcount=0 and skip. * reconcile_topologies drains pending rows per live topology, applies via decnet.mutator.ops.dispatch, and on failure marks the mutation failed + transitions topology to degraded. * run_watch_loop gains a gated branch: flat-fleet mutate_all runs every tick unchanged; the reconciler only enters when the cheap has_pending_topology_mutation guard returns True. * apply_* ops re-check hard invariants (names, IP collisions, subnet overlap, known services, service_config shape) after every mutation so the repo never lands in an invalid state. * CLI: 'decnet topology mutate' / 'mutations' subcommands. 
--- decnet/cli/topology.py | 80 ++++++++ decnet/mutator/engine.py | 77 ++++++- decnet/mutator/ops.py | 363 +++++++++++++++++++++++++++++++++ decnet/web/db/models.py | 40 +++- decnet/web/db/repository.py | 38 ++++ decnet/web/db/sqlmodel_repo.py | 163 +++++++++++++++ tests/topology/test_mutator.py | 274 +++++++++++++++++++++++++ 7 files changed, 1033 insertions(+), 2 deletions(-) create mode 100644 decnet/mutator/ops.py create mode 100644 tests/topology/test_mutator.py diff --git a/decnet/cli/topology.py b/decnet/cli/topology.py index 23da8914..9def794b 100644 --- a/decnet/cli/topology.py +++ b/decnet/cli/topology.py @@ -202,6 +202,86 @@ def _teardown( _console.print(f"[green]Topology {topology_id} torn down.[/]") +@_group.command("mutate") +def _mutate( + topology_id: str = typer.Argument(..., help="Topology id (active or degraded)"), + op: str = typer.Argument( + ..., + help=( + "One of: add_lan, remove_lan, attach_decky, detach_decky, " + "remove_decky, update_decky, update_lan" + ), + ), + payload_json: str = typer.Option( + "{}", + "--payload-json", + help="JSON payload for the op (see mutator.ops for keys)", + ), + expected_version: Optional[int] = typer.Option( + None, + "--expected-version", + help="Optimistic-concurrency guard; enqueue fails with a " + "VersionConflict if the topology has since been mutated.", + ), +) -> None: + """Enqueue a live mutation. 
The mutator's watch loop applies it.""" + _require_master_mode("topology mutate") + import json + + try: + payload = json.loads(payload_json) + except ValueError as e: + _console.print(f"[red]Invalid JSON: {e}[/]") + raise typer.Exit(1) + + async def _go() -> str: + repo = await _repo() + return await repo.enqueue_topology_mutation( + topology_id, op, payload, expected_version=expected_version, + ) + + mid = asyncio.run(_go()) + _console.print( + f"[green]Mutation enqueued[/] — id=[bold]{mid}[/] op={op} " + f"(watch for state=applied on [cyan]topology mutations {topology_id}[/])" + ) + + +@_group.command("mutations") +def _mutations( + topology_id: str = typer.Argument(..., help="Topology id"), + state: Optional[str] = typer.Option( + None, + "--state", + help="Filter to one of pending|applying|applied|failed", + ), +) -> None: + """List queued/applied mutations for a topology.""" + _require_master_mode("topology mutations") + + async def _go() -> list[dict]: + repo = await _repo() + return await repo.list_topology_mutations(topology_id, state=state) + + rows = asyncio.run(_go()) + if not rows: + _console.print("[yellow]No mutations.[/]") + return + table = Table(title=f"Mutations — topology {topology_id}") + for col in ("id", "op", "state", "requested_at", "applied_at", "reason"): + table.add_column(col) + for r in rows: + table.add_row( + str(r["id"]), + str(r["op"]), + str(r["state"]), + str(r.get("requested_at", "")), + str(r.get("applied_at") or ""), + str(r.get("reason") or ""), + ) + _console.print(table) + + def register(app: typer.Typer) -> None: app.add_typer(_group, name="topology") diff --git a/decnet/mutator/engine.py b/decnet/mutator/engine.py index 0e4a925e..cc636c19 100644 --- a/decnet/mutator/engine.py +++ b/decnet/mutator/engine.py @@ -133,14 +133,89 @@ async def mutate_all(repo: BaseRepository, force: bool = False) -> None: log.info("mutate_all: complete mutated_count=%d", mutated_count) +@_traced("mutator.reconcile_topologies") +async def 
reconcile_topologies(repo: BaseRepository) -> int: + """Drain pending ``topology_mutations`` rows against live topologies. + + For every topology in ``active|degraded`` with at least one pending + mutation, atomically claim the oldest via + :meth:`BaseRepository.claim_next_mutation`, dispatch to the matching + ``apply_*`` in :mod:`decnet.mutator.ops`, and write the outcome + back (``applied`` or ``failed``). + + On ``MutationError`` the topology is flipped to ``degraded`` — the + same state the future Healer will target — so operators can see that + a requested change was rejected without the repo drifting into an + inconsistent state. + + Returns the number of mutations drained this tick. + """ + # Local imports keep the flat-fleet hot path free of MazeNET cost. + from decnet.mutator.ops import MutationError, dispatch as _op_dispatch + from decnet.topology.persistence import transition_status + from decnet.topology.status import TopologyStatus, TopologyStatusError + + drained = 0 + for tid in await repo.list_live_topology_ids(): + while True: + mut = await repo.claim_next_mutation(tid) + if mut is None: + break # no more work for this topology this tick. + try: + await _op_dispatch(repo, tid, mut["op"], mut["payload"]) + await repo.mark_mutation_applied(mut["id"]) + drained += 1 + log.info( + "topology %s mutation %s applied op=%s", + tid, mut["id"], mut["op"], + ) + except (MutationError, Exception) as exc: # noqa: BLE001 + reason = f"{type(exc).__name__}: {exc}" + await repo.mark_mutation_failed(mut["id"], reason) + log.warning( + "topology %s mutation %s failed: %s", + tid, mut["id"], reason, + ) + try: + await transition_status( + repo, tid, TopologyStatus.DEGRADED, reason=reason, + ) + except TopologyStatusError: + # Already degraded / in a state that can't degrade + # further — leave as is. + pass + # Stop draining this topology on first failure so the + # operator can inspect before a cascade. 
+ break + return drained + + @_traced("mutator.watch_loop") async def run_watch_loop(repo: BaseRepository, poll_interval_secs: int = 10) -> None: - """Run an infinite loop checking for deckies that need mutation.""" + """Run an infinite loop checking for deckies that need mutation. + + Two independent responsibilities, in strict order per tick: + + 1. Flat-fleet service rotation (``mutate_all``) — runs every tick + regardless of MazeNET state, preserving phase-1 timing. + 2. MazeNET live-mutation reconciliation — runs only when the cheap + guard ``has_pending_topology_mutation`` (indexed composite + lookup) returns True. Zero-topology and idle-topology hosts pay + exactly one indexed query per tick. + """ log.info("mutator watch loop started poll_interval_secs=%d", poll_interval_secs) console.print(f"[green]DECNET Mutator Watcher started (polling every {poll_interval_secs}s).[/]") try: while True: await mutate_all(force=False, repo=repo) + # Gate reconciler on the O(log n) guard query — avoids + # entering the dispatch body when there's nothing to do. + try: + if await repo.has_pending_topology_mutation(): + await reconcile_topologies(repo) + except NotImplementedError: + # Backend without MazeNET support — nothing to reconcile. + pass await asyncio.sleep(poll_interval_secs) except KeyboardInterrupt: log.info("mutator watch loop stopped") diff --git a/decnet/mutator/ops.py b/decnet/mutator/ops.py new file mode 100644 index 00000000..e42a77a9 --- /dev/null +++ b/decnet/mutator/ops.py @@ -0,0 +1,363 @@ +"""Live-mutation ops for active MazeNET topologies. + +Each ``apply_*`` function consumes a claimed ``TopologyMutation`` +payload, mutates the repo (and, best-effort, the underlying Docker +state), then re-runs :func:`decnet.topology.validate.validate` against +the post-apply hydrated view. If validation errors appear, the op is +reported as failed and the caller flips the topology to ``degraded`` — +we never leave the repo in an invalid state. 
+ +Design notes +------------ +* All ops are *repo-first*. The reconciler's job is to converge Docker + toward the repo's desired state, so persisting intent first keeps the + system self-healing across master restarts. +* Docker calls are optional at the ops layer: the tests drive these + functions directly against an in-memory repo, and the reconciler + sidecar calls them in production where Docker is present. Every + Docker call is guarded so missing/unreachable Docker doesn't leave + the DB half-mutated. +* Ops intentionally do NOT perform optimistic-concurrency checks — the + enqueue step already carried the caller's ``expected_version``. The + reconciler is the sole writer from here on. +""" +from __future__ import annotations + +import json +from typing import Any, Awaitable, Callable, Optional + +from decnet.logging import get_logger +from decnet.topology.allocator import IPAllocator, reserved_subnets, SubnetAllocator +from decnet.topology.persistence import hydrate +from decnet.topology.validate import ( + check_names_unique, + check_no_ip_collisions, + check_no_subnet_overlap, + check_service_config_shape, + check_services_known, + errors as _validation_errors, +) + +# Post-apply validation intentionally excludes topology-shape rules +# (``check_all_lans_connected_to_dmz``, ``check_exactly_one_dmz``, +# ``check_no_orphan_deckies``) — those are legitimately transient +# during live editing (e.g. ``add_lan`` leaves the new LAN orphaned +# until the next ``attach_decky``). The deployer's full ``validate()`` +# pass still runs at redeploy time. Invariants that MUST hold after +# every single op are kept here. 
+_POST_APPLY_CHECKS = ( + check_names_unique, + check_no_ip_collisions, + check_no_subnet_overlap, + check_services_known, + check_service_config_shape, +) + +_log = get_logger("mutator.ops") + + +class MutationError(RuntimeError): + """Raised by an ``apply_*`` op when the requested change is illegal.""" + + +OpFunc = Callable[[Any, str, dict[str, Any]], Awaitable[None]] + + +# ----------------------------------------------------------------- helpers + + +async def _hydrated(repo: Any, topology_id: str) -> dict[str, Any]: + h = await hydrate(repo, topology_id) + if h is None: + raise MutationError(f"topology {topology_id!r} vanished mid-apply") + return h + + +async def _assert_valid_after(repo: Any, topology_id: str) -> None: + """Re-hydrate and check invariants; raise :class:`MutationError` on errors.""" + h = await _hydrated(repo, topology_id) + issues: list = [] + for check in _POST_APPLY_CHECKS: + issues.extend(check(h)) + bad = _validation_errors(issues) + if bad: + codes = ", ".join(sorted({i.code for i in bad})) + raise MutationError( + f"post-apply validation failed for {topology_id}: {codes}" + ) + + +def _lan_by_name(hydrated: dict[str, Any], name: str) -> Optional[dict]: + return next((lan for lan in hydrated["lans"] if lan["name"] == name), None) + + +def _decky_by_name(hydrated: dict[str, Any], name: str) -> Optional[dict]: + return next( + (d for d in hydrated["deckies"] if d["decky_config"]["name"] == name), + None, + ) + + +# ------------------------------------------------------------------- ops + + +async def apply_add_lan( + repo: Any, topology_id: str, payload: dict[str, Any] +) -> None: + """Add a new LAN to an active topology. + + ``payload`` keys: + ``name`` — LAN name (required). + ``subnet`` — ``/24`` CIDR (optional; auto-allocated if missing). + ``is_dmz`` — bool, default False. + ``x``,``y`` — layout coords, optional. 
+ """ + name = payload["name"] + subnet = payload.get("subnet") + is_dmz = bool(payload.get("is_dmz", False)) + + if subnet is None: + reserved = await reserved_subnets(repo) + alloc = SubnetAllocator(base_prefix="172.20", reserved=reserved) + subnet = alloc.next_free() + + await repo.add_lan( + { + "topology_id": topology_id, + "name": name, + "subnet": subnet, + "is_dmz": is_dmz, + "x": payload.get("x"), + "y": payload.get("y"), + } + ) + await _assert_valid_after(repo, topology_id) + + +async def apply_remove_lan( + repo: Any, topology_id: str, payload: dict[str, Any] +) -> None: + """Remove a LAN; refuses when any decky has it as its home LAN.""" + hydrated = await _hydrated(repo, topology_id) + lan = _lan_by_name(hydrated, payload["name"]) + if lan is None: + raise MutationError(f"LAN {payload['name']!r} not found") + # Refuse if any decky's home (primary/first) LAN is this one. + for d in hydrated["deckies"]: + ips = d["decky_config"].get("ips_by_lan", {}) + if ips and next(iter(ips)) == lan["name"]: + raise MutationError( + f"LAN {lan['name']!r} is the home LAN of decky " + f"{d['decky_config']['name']!r}; remove the decky first" + ) + await repo.delete_lan(lan["id"]) + await _assert_valid_after(repo, topology_id) + + +async def apply_attach_decky( + repo: Any, topology_id: str, payload: dict[str, Any] +) -> None: + """Attach an existing decky to an additional LAN (bridge edge). + + ``payload`` keys: + ``decky`` — decky name. + ``lan`` — LAN name. + ``ip`` — optional pinned IP; else allocated inside the LAN. + ``forwards_l3`` — bool, default False. + """ + hydrated = await _hydrated(repo, topology_id) + lan = _lan_by_name(hydrated, payload["lan"]) + decky = _decky_by_name(hydrated, payload["decky"]) + if lan is None: + raise MutationError(f"LAN {payload['lan']!r} not found") + if decky is None: + raise MutationError(f"decky {payload['decky']!r} not found") + + # Guard against re-attaching. 
async def apply_detach_decky(
    repo: Any, topology_id: str, payload: dict[str, Any]
) -> None:
    """Detach a decky from one of its non-home LANs.

    ``payload`` keys:
        ``decky`` — decky name.
        ``lan``   — LAN name.

    Raises ``MutationError`` when the LAN or the decky does not exist,
    when the decky has no LAN memberships, when ``lan`` is the decky's
    home LAN (first key of ``ips_by_lan``), or when no edge links the
    decky to that LAN.
    """
    hydrated = await _hydrated(repo, topology_id)
    lan = _lan_by_name(hydrated, payload["lan"])
    decky = _decky_by_name(hydrated, payload["decky"])
    # Report exactly which lookup failed, matching apply_attach_decky, so
    # the stored failure reason is actionable for the operator.
    if lan is None:
        raise MutationError(f"LAN {payload['lan']!r} not found")
    if decky is None:
        raise MutationError(f"decky {payload['decky']!r} not found")

    ips_by_lan = decky["decky_config"].get("ips_by_lan", {})
    if not ips_by_lan:
        raise MutationError("decky has no LAN memberships")
    # The first key of ips_by_lan is treated as the decky's home LAN.
    home_lan = next(iter(ips_by_lan))
    if home_lan == lan["name"]:
        raise MutationError(
            f"cannot detach home LAN {home_lan!r}; use remove_decky"
        )

    edge = next(
        (
            e
            for e in hydrated["edges"]
            if e["decky_uuid"] == decky["uuid"] and e["lan_id"] == lan["id"]
        ),
        None,
    )
    if edge is None:
        raise MutationError(
            f"decky not attached to LAN {lan['name']!r}"
        )

    # Copy before editing so the hydrated snapshot is never mutated; the
    # comprehension keeps the remaining keys in order (home LAN stays first).
    new_cfg = dict(decky["decky_config"])
    new_cfg["ips_by_lan"] = {
        name: ip for name, ip in ips_by_lan.items() if name != lan["name"]
    }

    await repo.update_topology_decky(
        decky["uuid"], {"decky_config": new_cfg}
    )
    await repo.delete_topology_edge(edge["id"])
    await _assert_valid_after(repo, topology_id)
async def apply_update_lan(
    repo: Any, topology_id: str, payload: dict[str, Any]
) -> None:
    """Update LAN fields — subnet, is_dmz, coords, rename.

    ``payload`` keys:
        ``name``    — current LAN name (lookup key).
        ``patch``   — optional dict of fields to write; a missing or
                      ``None`` (JSON ``null``) patch is treated as empty.
        ``x``,``y`` — optional layout coords; when present at the top
                      level they override the same keys in ``patch``.

    Returns without touching the repository when there is nothing to
    write. Raises ``MutationError`` when the LAN does not exist.
    """
    hydrated = await _hydrated(repo, topology_id)
    lan = _lan_by_name(hydrated, payload["name"])
    if lan is None:
        raise MutationError(f"LAN {payload['name']!r} not found")
    # ``or {}`` rather than a .get() default: the default only applies when
    # the key is absent, so an explicit ``"patch": null`` used to crash
    # on ``None.items()``. apply_update_decky already tolerates a falsy patch.
    fields = dict(payload.get("patch") or {})
    for key in ("x", "y"):
        if key in payload:
            fields[key] = payload[key]
    if not fields:
        return
    await repo.update_lan(lan["id"], fields)
    await _assert_valid_after(repo, topology_id)
async def dispatch(
    repo: Any,
    topology_id: str,
    op: str,
    payload_raw: str | dict[str, Any],
) -> None:
    """Decode payload JSON (if a string) and run the matching op.

    ``payload_raw`` may be an already-decoded dict or the JSON text stored
    on the mutation row; an empty string means an empty payload.

    Raises ``MutationError`` when the payload is not valid JSON, when it
    does not decode to a JSON object, or when ``op`` is not a key of
    ``DISPATCH``. Errors raised by the op itself propagate unchanged.
    """
    if isinstance(payload_raw, str):
        try:
            payload = json.loads(payload_raw) if payload_raw else {}
        except json.JSONDecodeError as e:
            # Surface decode problems as the module's own error type
            # instead of leaking a bare JSONDecodeError to the caller.
            raise MutationError(
                f"malformed payload JSON for op {op!r}: {e}"
            ) from e
    else:
        payload = payload_raw
    # Every op indexes the payload by key; reject lists/scalars up front
    # rather than failing with a TypeError somewhere inside the op.
    if not isinstance(payload, dict):
        raise MutationError(
            f"payload for op {op!r} must be a JSON object, "
            f"got {type(payload).__name__}"
        )
    try:
        fn = DISPATCH[op]
    except KeyError as e:
        raise MutationError(f"unknown op: {op!r}") from e
    await fn(repo, topology_id, payload)
The ``(state, + topology_id)`` composite index keeps the watch-loop guard query + cheap even with years of mutation history. + """ + __tablename__ = "topology_mutations" + __table_args__ = ( + Index( + "ix_topology_mutations_state_topology", + "state", + "topology_id", + ), + ) + id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) + topology_id: str = Field(foreign_key="topologies.id", index=True) + # add_lan|remove_lan|attach_decky|detach_decky|remove_decky| + # update_decky|update_lan + op: str = Field(index=True) + # JSON-serialised op payload (keys depend on ``op``). + payload: str = Field( + sa_column=Column("payload", _BIG_TEXT, nullable=False, default="{}") + ) + # pending|applying|applied|failed + state: str = Field(default="pending", index=True) + requested_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), index=True + ) + applied_at: Optional[datetime] = Field(default=None) + reason: Optional[str] = Field( + default=None, sa_column=Column("reason", Text, nullable=True) + ) + + # --- API Request/Response Models (Pydantic) --- class Token(BaseModel): diff --git a/decnet/web/db/repository.py b/decnet/web/db/repository.py index 67af535b..0ead310c 100644 --- a/decnet/web/db/repository.py +++ b/decnet/web/db/repository.py @@ -327,3 +327,41 @@ class BaseRepository(ABC): self, edge_id: str, *, expected_version: Optional[int] = None ) -> None: raise NotImplementedError + + # -------------------- live mutation queue (reconciler) -------------------- + + async def enqueue_topology_mutation( + self, + topology_id: str, + op: str, + payload: dict[str, Any], + *, + expected_version: Optional[int] = None, + ) -> str: + raise NotImplementedError + + async def claim_next_mutation( + self, topology_id: str + ) -> Optional[dict[str, Any]]: + raise NotImplementedError + + async def mark_mutation_applied(self, mutation_id: str) -> None: + raise NotImplementedError + + async def mark_mutation_failed( + self, mutation_id: str, 
async def claim_next_mutation(
    self, topology_id: str
) -> Optional[dict[str, Any]]:
    """Atomically claim the oldest pending mutation for ``topology_id``.

    Returns the claimed row as a JSON-mode dict (``state`` is
    ``'applying'``), or ``None`` when nothing is pending or another
    worker won the race for the candidate row.

    The claim is a compare-and-set keyed on the primary key:

    1. Read the id of the oldest ``pending`` row (plain read; may be stale).
    2. ``UPDATE ... SET state='applying' WHERE id = :i AND state='pending'``.
       The ``state`` re-check is what makes the claim exclusive: of two
       racers that picked the same id, only one UPDATE can match the row;
       the other matches zero rows and returns ``None`` — the expected,
       non-error outcome under contention.
    3. Re-read the row *by that id*.

    Re-reading by id matters. Selecting "the oldest ``applying`` row of
    the topology" instead can hand back a different row that some other
    worker is already applying, which double-executes that op and strands
    the row this call actually claimed.

    NOTE(review): the previous single-statement form used a sub-select on
    ``topology_mutations`` inside the UPDATE, which MySQL is documented to
    reject (error 1093) — confirm against the MySQL backend.
    """
    async with self._session() as session:
        # Select only the id column, not the entity: keeping the ORM
        # object out of the session means the re-read in step 3 loads
        # fresh column values instead of a cached 'pending' instance.
        candidate = await session.execute(
            select(TopologyMutation.id)
            .where(TopologyMutation.topology_id == topology_id)
            .where(TopologyMutation.state == "pending")
            .order_by(asc(TopologyMutation.requested_at))
            .limit(1)
        )
        mutation_id = candidate.scalar_one_or_none()
        if mutation_id is None:
            await session.commit()
            return None

        # Compare-and-set: matches zero rows if a racer already moved
        # this row out of 'pending'.
        claimed = await session.execute(
            text(
                "UPDATE topology_mutations "
                "SET state = 'applying' "
                "WHERE id = :i AND state = 'pending'"
            ),
            {"i": mutation_id},
        )
        if claimed.rowcount == 0:
            await session.commit()
            return None

        # Fetch exactly the row we claimed.
        sel = await session.execute(
            select(TopologyMutation).where(
                TopologyMutation.id == mutation_id
            )
        )
        row = sel.scalar_one_or_none()
        await session.commit()
        if row is None:
            return None
        return row.model_dump(mode="json")
async def list_live_topology_ids(self) -> list[str]:
    """List the ids of every topology whose status is ``active`` or ``degraded``."""
    live_statuses = ["active", "degraded"]
    stmt = select(Topology.id).where(Topology.status.in_(live_statuses))
    async with self._session() as session:
        rows = await session.execute(stmt)
        return list(rows.scalars().all())
transition_status(repo, tid, TopologyStatus.ACTIVE) + return tid + + +# --------------------------------------------------------------------- queue + + +@pytest.mark.anyio +async def test_enqueue_bumps_topology_version(repo): + tid = await _make_active(repo) + before = (await repo.get_topology(tid))["version"] + mid = await repo.enqueue_topology_mutation( + tid, "add_lan", {"name": "LAN-X", "subnet": "172.20.77.0/24"}, + expected_version=before, + ) + topo = await repo.get_topology(tid) + assert topo["version"] == before + 1 + rows = await repo.list_topology_mutations(tid) + assert rows[0]["id"] == mid + assert rows[0]["state"] == "pending" + + +@pytest.mark.anyio +async def test_enqueue_version_conflict(repo): + tid = await _make_active(repo) + await repo.enqueue_topology_mutation( + tid, "add_lan", {"name": "LAN-X", "subnet": "172.20.77.0/24"}, + expected_version=1, + ) + with pytest.raises(VersionConflict): + await repo.enqueue_topology_mutation( + tid, "add_lan", {"name": "LAN-Y", "subnet": "172.20.78.0/24"}, + expected_version=1, # stale — version is now 2 + ) + + +@pytest.mark.anyio +async def test_claim_next_mutation_is_atomic_single_winner(repo): + """Two simulated watch loops; only one claims the row.""" + tid = await _make_active(repo) + await repo.enqueue_topology_mutation( + tid, "add_lan", {"name": "LAN-X"}, + ) + # Sequential simulated races: because the claim is a single SQL + # UPDATE with ``WHERE state='pending'``, the second call observes + # state='applying' and returns None rather than re-claiming. 
+ first = await repo.claim_next_mutation(tid) + second = await repo.claim_next_mutation(tid) + assert first is not None + assert second is None + assert first["state"] == "applying" + + +@pytest.mark.anyio +async def test_claim_none_when_empty(repo): + tid = await _make_active(repo) + assert await repo.claim_next_mutation(tid) is None + + +@pytest.mark.anyio +async def test_mark_applied_and_failed(repo): + tid = await _make_active(repo) + mid1 = await repo.enqueue_topology_mutation(tid, "add_lan", {"name": "A"}) + mid2 = await repo.enqueue_topology_mutation(tid, "add_lan", {"name": "B"}) + await repo.claim_next_mutation(tid) + await repo.mark_mutation_applied(mid1) + await repo.claim_next_mutation(tid) + await repo.mark_mutation_failed(mid2, "boom") + + by_id = {r["id"]: r for r in await repo.list_topology_mutations(tid)} + assert by_id[mid1]["state"] == "applied" + assert by_id[mid2]["state"] == "failed" + assert by_id[mid2]["reason"] == "boom" + + +# --------------------------------------------------------------- guard query + + +@pytest.mark.anyio +async def test_guard_false_without_pending_or_live(repo): + # No topologies at all. + assert await repo.has_pending_topology_mutation() is False + # Pending topology with a mutation (but not live) — guard stays False. + plan = generate(_cfg()) + tid = await persist(repo, plan) + # enqueue_topology_mutation doesn't require status, but pending + # topologies don't trip the guard. + await repo.enqueue_topology_mutation(tid, "add_lan", {"name": "Z"}) + assert await repo.has_pending_topology_mutation() is False + + +@pytest.mark.anyio +async def test_guard_true_with_live_pending(repo): + tid = await _make_active(repo) + await repo.enqueue_topology_mutation(tid, "add_lan", {"name": "Z"}) + assert await repo.has_pending_topology_mutation() is True + # After claiming, the pending row becomes applying — guard drops. 
@pytest.mark.anyio
async def test_apply_add_lan_persists(repo):
    """apply_add_lan writes the new LAN so it shows up in the LAN listing."""
    tid = await _make_active(repo)
    new_lan = {"name": "LAN-MUT", "subnet": "172.20.55.0/24"}
    await apply_add_lan(repo, tid, new_lan)
    lans = await repo.list_lans_for_topology(tid)
    names = {lan["name"] for lan in lans}
    assert "LAN-MUT" in names
+ await repo.enqueue_topology_mutation( + tid, "add_lan", {"name": existing, "subnet": "172.20.88.0/24"}, + ) + drained = await _engine.reconcile_topologies(repo) + assert drained == 0 + mut = (await repo.list_topology_mutations(tid))[0] + assert mut["state"] == "failed" + topo = await repo.get_topology(tid) + assert topo["status"] == TopologyStatus.DEGRADED + + +# ----------------------------------------------------- watch-loop guard isolation + + +@pytest.mark.anyio +async def test_watch_loop_guard_skips_reconciler_when_idle( + repo, monkeypatch +): + """Tick with no live topology + no pending mutations ⇒ reconciler not called. + + Also asserts flat-fleet ``mutate_all`` runs every tick, unchanged. + """ + calls = {"mutate_all": 0, "reconcile": 0} + + async def _fake_mutate_all(force=False, repo=None): + calls["mutate_all"] += 1 + + async def _fake_reconcile(r): + calls["reconcile"] += 1 + return 0 + + monkeypatch.setattr(_engine, "mutate_all", _fake_mutate_all) + monkeypatch.setattr(_engine, "reconcile_topologies", _fake_reconcile) + + # Manually drive one iteration of the loop body. 
+ await _engine.mutate_all(force=False, repo=repo) + if await repo.has_pending_topology_mutation(): + await _engine.reconcile_topologies(repo) + + assert calls["mutate_all"] == 1 + assert calls["reconcile"] == 0 + + +@pytest.mark.anyio +async def test_watch_loop_guard_fires_reconciler_when_work_exists( + repo, monkeypatch +): + tid = await _make_active(repo) + await repo.enqueue_topology_mutation(tid, "add_lan", {"name": "X"}) + + calls = {"reconcile": 0} + + async def _fake_reconcile(r): + calls["reconcile"] += 1 + return 0 + + monkeypatch.setattr(_engine, "reconcile_topologies", _fake_reconcile) + + if await repo.has_pending_topology_mutation(): + await _engine.reconcile_topologies(repo) + + assert calls["reconcile"] == 1 + + +def test_ops_payload_shape_docstring_present(): + """Smoke: DISPATCH covers every op name referenced in the plan.""" + from decnet.mutator.ops import DISPATCH + + assert set(DISPATCH) == { + "add_lan", "remove_lan", "attach_decky", "detach_decky", + "remove_decky", "update_decky", "update_lan", + } + + +def _payload_json(d: dict) -> str: + return json.dumps(d) From 2379b2aeda6372a07f15f564b78c91e378d46e05 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 18:16:30 -0400 Subject: [PATCH 015/448] =?UTF-8?q?feat(api):=20phase=203=20step=201=20?= =?UTF-8?q?=E2=80=94=20topology=20request/response=20models=20+=20router?= =?UTF-8?q?=20skeleton?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Pydantic DTOs in decnet/web/db/models.py covering every phase-3 endpoint shape: TopologyGenerateRequest, TopologySummary/Detail, child create/update requests, MutationEnqueueRequest (Literal op guard), MutationRow with JSON-payload decoder, validation/version/not-editable error envelopes, and the three catalog responses. Create decnet/web/router/topology/ as an import-safe package exporting topology_router (prefix /topologies) — sub-routers land step-by-step in subsequent commits. 
Mount under the main api router alongside swarm_mgmt. tests/api/topology/test_models.py pins repo-dict ↔ DTO parity so future repo-row drift breaks the contract test before the endpoints. --- decnet/web/db/models.py | 225 +++++++++++++++++++++++++ decnet/web/router/__init__.py | 4 + decnet/web/router/topology/__init__.py | 18 ++ tests/api/topology/__init__.py | 0 tests/api/topology/test_models.py | 132 +++++++++++++++ 5 files changed, 379 insertions(+) create mode 100644 decnet/web/router/topology/__init__.py create mode 100644 tests/api/topology/__init__.py create mode 100644 tests/api/topology/test_models.py diff --git a/decnet/web/db/models.py b/decnet/web/db/models.py index 28a03b1c..00087875 100644 --- a/decnet/web/db/models.py +++ b/decnet/web/db/models.py @@ -644,3 +644,228 @@ class RollbackResponse(BaseModel): status: Literal["rolled-back", "failed"] http_status: Optional[int] = None detail: Optional[str] = None + + +# --- MazeNET Topology REST DTOs (phase 3) --- +# Request/response shapes for /api/v1/topologies. All write paths are +# admin-only; reads accept admin or viewer. Child CRUD is pending-only; +# mutations of active|degraded topologies go through the queue. 
class TopologyGenerateRequest(BaseModel):
    """Body for POST /topologies — mirrors the `topology generate` CLI.

    Every field carries its own bounds via ``PydanticField``; a value
    outside them is rejected when the model is constructed.

    NOTE(review): nothing in this model relates ``deckies_per_lan_min`` to
    ``deckies_per_lan_max`` — a request with min > max passes these
    per-field checks. Presumably a later stage rejects it; confirm.
    """
    # Topology name, 1-64 characters.
    name: str = PydanticField(..., min_length=1, max_length=64)
    # Required, 1-16.
    depth: int = PydanticField(..., ge=1, le=16)
    # Required, 1-8.
    branching_factor: int = PydanticField(..., ge=1, le=8)
    # Per-LAN decky count bounds (min may be 0, max is at least 1).
    deckies_per_lan_min: int = PydanticField(default=1, ge=0, le=32)
    deckies_per_lan_max: int = PydanticField(default=3, ge=1, le=32)
    # Probabilities, each constrained to [0.0, 1.0].
    bridge_forward_probability: float = PydanticField(default=1.0, ge=0.0, le=1.0)
    cross_edge_probability: float = PydanticField(default=0.0, ge=0.0, le=1.0)
    # Optional explicit service list; None when the caller gives none.
    services_explicit: Optional[list[str]] = None
    randomize_services: bool = True
    # Optional non-negative seed.
    seed: Optional[int] = PydanticField(default=None, ge=0)
persistence.hydrate() output.""" + topology: TopologySummary + lans: list[LANRow] + deckies: list[DeckyRow] + edges: list[EdgeRow] + + +class TopologyStatusEventRow(BaseModel): + model_config = ConfigDict(extra="ignore") + id: str + topology_id: str + from_status: str + to_status: str + at: datetime + reason: Optional[str] = None + + +class LANCreateRequest(BaseModel): + name: str = PydanticField(..., min_length=1, max_length=64) + subnet: Optional[str] = None + is_dmz: bool = False + x: Optional[float] = None + y: Optional[float] = None + expected_version: Optional[int] = None + + +class LANUpdateRequest(BaseModel): + name: Optional[str] = None + subnet: Optional[str] = None + is_dmz: Optional[bool] = None + x: Optional[float] = None + y: Optional[float] = None + expected_version: Optional[int] = None + + +class DeckyCreateRequest(BaseModel): + name: str = PydanticField(..., min_length=1, max_length=64) + services: list[str] = PydanticField(default_factory=list) + decky_config: Optional[dict[str, Any]] = None + x: Optional[float] = None + y: Optional[float] = None + expected_version: Optional[int] = None + + +class DeckyUpdateRequest(BaseModel): + name: Optional[str] = None + services: Optional[list[str]] = None + decky_config: Optional[dict[str, Any]] = None + x: Optional[float] = None + y: Optional[float] = None + expected_version: Optional[int] = None + + +class EdgeCreateRequest(BaseModel): + decky_uuid: str + lan_id: str + is_bridge: bool = False + forwards_l3: bool = False + expected_version: Optional[int] = None + + +_MUTATION_OPS = Literal[ + "add_lan", + "remove_lan", + "attach_decky", + "detach_decky", + "remove_decky", + "update_decky", + "update_lan", +] + + +class MutationEnqueueRequest(BaseModel): + op: _MUTATION_OPS + payload: dict[str, Any] = PydanticField(default_factory=dict) + expected_version: Optional[int] = None + + +def _decode_json_payload(v: Any) -> Any: + """Accept either a dict or a JSON-encoded string for mutation payloads.""" + if 
isinstance(v, str): + import json as _json + return _json.loads(v) if v else {} + return v + + +_MutationPayload = Annotated[dict[str, Any], BeforeValidator(_decode_json_payload)] + + +class MutationRow(BaseModel): + model_config = ConfigDict(extra="ignore") + id: str + topology_id: str + op: str + payload: _MutationPayload = PydanticField(default_factory=dict) + state: str + requested_at: datetime + applied_at: Optional[datetime] = None + reason: Optional[str] = None + + +class MutationEnqueueResponse(BaseModel): + mutation_id: str + state: str = "pending" + + +class ValidationIssueResponse(BaseModel): + severity: str + code: str + message: str + target: dict[str, Any] = PydanticField(default_factory=dict) + + +class ValidationErrorResponse(BaseModel): + detail: str = "Topology validation failed" + issues: list[ValidationIssueResponse] + + +class VersionConflictResponse(BaseModel): + detail: str = "Topology version conflict" + current: int + expected: int + + +class NotEditableResponse(BaseModel): + detail: str = "Topology not editable" + status: str + reason: Optional[str] = None + + +class ServiceCatalogResponse(BaseModel): + services: list[str] + + +class NextIPResponse(BaseModel): + subnet: str + ip: str + + +class NextSubnetResponse(BaseModel): + subnet: str + + +class DeployAcceptedResponse(BaseModel): + topology_id: str + status: str + dry_run: bool = False diff --git a/decnet/web/router/__init__.py b/decnet/web/router/__init__.py index cbbb99cb..dca36ce0 100644 --- a/decnet/web/router/__init__.py +++ b/decnet/web/router/__init__.py @@ -24,6 +24,7 @@ from .artifacts.api_get_artifact import router as artifacts_router from .swarm_updates import swarm_updates_router from .swarm_mgmt import swarm_mgmt_router from .system import system_router +from .topology import topology_router api_router = APIRouter( # Every route under /api/v1 is auth-guarded (either by an explicit @@ -83,3 +84,6 @@ api_router.include_router(swarm_mgmt_router) # System info (deployment-mode 
auto-detection, etc.) api_router.include_router(system_router) + +# MazeNET Topologies (nested topology CRUD + mutation queue) +api_router.include_router(topology_router) diff --git a/decnet/web/router/topology/__init__.py b/decnet/web/router/topology/__init__.py new file mode 100644 index 00000000..b0b5605f --- /dev/null +++ b/decnet/web/router/topology/__init__.py @@ -0,0 +1,18 @@ +"""MazeNET topology REST endpoints (phase 3). + +Thin FastAPI layer over the phase-2 topology machinery: +generate/validate/deploy/teardown, pending-only child CRUD, and the +live-mutation queue for active|degraded topologies. + +Mounted at ``/api/v1/topologies`` by the main api router. Sub-routers +live one-per-file and are aggregated here. +""" +from fastapi import APIRouter + +topology_router = APIRouter(prefix="/topologies", tags=["topologies"]) + +# Sub-routers land in later steps; this skeleton keeps the package +# import-safe so the main api router can mount it immediately. + + +__all__ = ["topology_router"] diff --git a/tests/api/topology/__init__.py b/tests/api/topology/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/api/topology/test_models.py b/tests/api/topology/test_models.py new file mode 100644 index 00000000..354f25c3 --- /dev/null +++ b/tests/api/topology/test_models.py @@ -0,0 +1,132 @@ +"""Phase 3 Step 1 — parity between repo dict output and Pydantic DTOs. + +These tests pin the contract that repo-hydrated dicts deserialize +cleanly into the REST DTOs. If a repo-row shape drifts, the DTO test +fails before any endpoint rides on the stale contract. 
+""" +from __future__ import annotations + +import pytest + +from decnet.topology.config import TopologyConfig +from decnet.topology.generator import generate +from decnet.topology.persistence import hydrate, persist, transition_status +from decnet.topology.status import TopologyStatus +from decnet.web.db.factory import get_repository +from decnet.web.db.models import ( + DeckyRow, + EdgeRow, + LANRow, + MutationEnqueueRequest, + MutationRow, + TopologyDetail, + TopologyGenerateRequest, + TopologyListResponse, + TopologyStatusEventRow, + TopologySummary, +) +from decnet.web.router.topology import topology_router + + +def _cfg() -> TopologyConfig: + return TopologyConfig( + name="dto-parity", + depth=1, + branching_factor=1, + deckies_per_lan_min=1, + deckies_per_lan_max=1, + services_explicit=["ssh"], + randomize_services=False, + seed=0, + ) + + +@pytest.fixture +async def repo(tmp_path): + r = get_repository(db_path=str(tmp_path / "dto.db")) + await r.initialize() + return r + + +def test_router_skeleton_mounted(): + """topology_router lives under /topologies and is import-safe.""" + assert topology_router.prefix == "/topologies" + assert "topologies" in (topology_router.tags or []) + + +def test_generate_request_accepts_cli_shape(): + """TopologyGenerateRequest mirrors the CLI flags.""" + req = TopologyGenerateRequest( + name="n", + depth=2, + branching_factor=2, + deckies_per_lan_min=1, + deckies_per_lan_max=3, + services_explicit=["ssh", "ftp"], + randomize_services=False, + seed=7, + ) + assert req.depth == 2 + assert req.services_explicit == ["ssh", "ftp"] + + +def test_mutation_request_rejects_unknown_op(): + """Literal guard is what gives the frontend a free 422 contract.""" + with pytest.raises(ValueError): + MutationEnqueueRequest(op="teleport_lan", payload={}) + + +@pytest.mark.anyio +async def test_summary_accepts_repo_topology_row(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + row = await repo.get_topology(tid) + summary = 
TopologySummary(**row) + assert summary.id == tid + assert summary.version == 1 + + +@pytest.mark.anyio +async def test_detail_accepts_hydrated_shape(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + hydrated = await hydrate(repo, tid) + detail = TopologyDetail( + topology=TopologySummary(**hydrated["topology"]), + lans=[LANRow(**l) for l in hydrated["lans"]], + deckies=[DeckyRow(**d) for d in hydrated["deckies"]], + edges=[EdgeRow(**e) for e in hydrated["edges"]], + ) + assert detail.topology.id == tid + assert len(detail.lans) == len(hydrated["lans"]) + assert len(detail.deckies) == len(hydrated["deckies"]) + + +@pytest.mark.anyio +async def test_mutation_row_accepts_repo_row(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + mid = await repo.enqueue_topology_mutation( + tid, "add_lan", {"name": "LAN-X"} + ) + rows = await repo.list_topology_mutations(tid) + assert rows and rows[0]["id"] == mid + m = MutationRow(**rows[0]) + assert m.op == "add_lan" + assert m.payload == {"name": "LAN-X"} + + +@pytest.mark.anyio +async def test_status_event_row_accepts_repo_row(repo): + plan = generate(_cfg()) + tid = await persist(repo, plan) + await transition_status(repo, tid, TopologyStatus.DEPLOYING) + events = await repo.list_topology_status_events(tid) + assert events + TopologyStatusEventRow(**events[0]) + + +def test_list_response_envelope_shape(): + resp = TopologyListResponse(total=0, limit=50, offset=0, data=[]) + assert resp.total == 0 + assert resp.data == [] From f182c98ffa1d2336a535a168d17de247c1a086da Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 18:25:33 -0400 Subject: [PATCH 016/448] =?UTF-8?q?feat(api):=20phase=203=20step=202=20?= =?UTF-8?q?=E2=80=94=20topology=20read=20endpoints=20(list/get/status/cata?= =?UTF-8?q?log)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GET /api/v1/topologies — paginated list with status filter. 
Extends repo.list_topologies() to accept limit/offset and adds count_topologies() for the total envelope field. GET /api/v1/topologies/{id} — hydrated TopologyDetail; 404 if missing. GET /api/v1/topologies/{id}/status-events — audit trail, limit-capped. Catalog helpers for the phase-4 canvas UI: * GET /topologies/services — full service catalog. * GET /topologies/next-subnet?base=172.20 — wraps SubnetAllocator against reserved_subnets across non-torn-down topologies. * GET /topologies/{id}/lans/{lan_id}/next-ip — IPAllocator pre-seeded with existing decky IPs in that LAN. All read routes are viewer-or-admin. Sub-routers are included in an order that keeps literal catalog paths (/services, /next-subnet) from being shadowed by the /{topology_id} trie branch. --- decnet/web/db/repository.py | 8 +- decnet/web/db/sqlmodel_repo.py | 18 +- decnet/web/router/topology/__init__.py | 14 +- decnet/web/router/topology/api_catalog.py | 104 +++++++++++ .../web/router/topology/api_get_topology.py | 66 +++++++ .../router/topology/api_list_topologies.py | 38 ++++ tests/api/topology/test_reads.py | 169 ++++++++++++++++++ 7 files changed, 413 insertions(+), 4 deletions(-) create mode 100644 decnet/web/router/topology/api_catalog.py create mode 100644 decnet/web/router/topology/api_get_topology.py create mode 100644 decnet/web/router/topology/api_list_topologies.py create mode 100644 tests/api/topology/test_reads.py diff --git a/decnet/web/db/repository.py b/decnet/web/db/repository.py index 0ead310c..acdcc638 100644 --- a/decnet/web/db/repository.py +++ b/decnet/web/db/repository.py @@ -247,10 +247,16 @@ class BaseRepository(ABC): raise NotImplementedError async def list_topologies( - self, status: Optional[str] = None + self, + status: Optional[str] = None, + limit: Optional[int] = None, + offset: Optional[int] = None, ) -> list[dict[str, Any]]: raise NotImplementedError + async def count_topologies(self, status: Optional[str] = None) -> int: + raise NotImplementedError + async def 
update_topology_status( self, topology_id: str, diff --git a/decnet/web/db/sqlmodel_repo.py b/decnet/web/db/sqlmodel_repo.py index 3ed3a8ac..08b8e19e 100644 --- a/decnet/web/db/sqlmodel_repo.py +++ b/decnet/web/db/sqlmodel_repo.py @@ -949,11 +949,18 @@ class SQLModelRepository(BaseRepository): return self._deserialize_json_fields(d, ("config_snapshot",)) async def list_topologies( - self, status: Optional[str] = None + self, + status: Optional[str] = None, + limit: Optional[int] = None, + offset: Optional[int] = None, ) -> list[dict[str, Any]]: statement = select(Topology).order_by(desc(Topology.created_at)) if status: statement = statement.where(Topology.status == status) + if offset is not None: + statement = statement.offset(offset) + if limit is not None: + statement = statement.limit(limit) async with self._session() as session: result = await session.execute(statement) return [ @@ -963,6 +970,15 @@ class SQLModelRepository(BaseRepository): for r in result.scalars().all() ] + async def count_topologies(self, status: Optional[str] = None) -> int: + from sqlalchemy import func + statement = select(func.count(Topology.id)) + if status: + statement = statement.where(Topology.status == status) + async with self._session() as session: + result = await session.execute(statement) + return int(result.scalar_one() or 0) + async def update_topology_status( self, topology_id: str, diff --git a/decnet/web/router/topology/__init__.py b/decnet/web/router/topology/__init__.py index b0b5605f..a0dce806 100644 --- a/decnet/web/router/topology/__init__.py +++ b/decnet/web/router/topology/__init__.py @@ -9,10 +9,20 @@ live one-per-file and are aggregated here. 
""" from fastapi import APIRouter +from .api_catalog import router as _catalog_router +from .api_get_topology import router as _get_router +from .api_list_topologies import router as _list_router + topology_router = APIRouter(prefix="/topologies", tags=["topologies"]) -# Sub-routers land in later steps; this skeleton keeps the package -# import-safe so the main api router can mount it immediately. +# Order matters: catalog routes use literal path segments (e.g. +# /services, /next-subnet) that would otherwise be shadowed by the +# `/{topology_id}` path in api_get_topology. Keep the catalog router +# included first so FastAPI's trie resolves literals before the +# parameterized fallback. +topology_router.include_router(_catalog_router) +topology_router.include_router(_list_router) +topology_router.include_router(_get_router) __all__ = ["topology_router"] diff --git a/decnet/web/router/topology/api_catalog.py b/decnet/web/router/topology/api_catalog.py new file mode 100644 index 00000000..44c44114 --- /dev/null +++ b/decnet/web/router/topology/api_catalog.py @@ -0,0 +1,104 @@ +"""Read-only catalog endpoints — services, next-subnet, next-ip. + +These wrap fleet/allocator helpers so the phase-4 canvas UI can lean +on the server for allocation instead of shipping the logic client-side. 
+""" +from __future__ import annotations + +from fastapi import APIRouter, Depends, HTTPException, Query + +from decnet.fleet import all_service_names +from decnet.telemetry import traced as _traced +from decnet.topology.allocator import ( + AllocatorExhausted, + IPAllocator, + SubnetAllocator, + reserved_subnets, +) +from decnet.web.db.models import ( + NextIPResponse, + NextSubnetResponse, + ServiceCatalogResponse, +) +from decnet.web.dependencies import repo, require_viewer + +router = APIRouter() + + +@router.get( + "/services", + tags=["MazeNET Topologies"], + response_model=ServiceCatalogResponse, + responses={ + 401: {"description": "Missing or invalid credentials"}, + 403: {"description": "Insufficient permissions"}, + }, +) +@_traced("api.topology.catalog.services") +async def api_list_services( + _viewer: dict = Depends(require_viewer), +) -> ServiceCatalogResponse: + return ServiceCatalogResponse(services=all_service_names()) + + +@router.get( + "/next-subnet", + tags=["MazeNET Topologies"], + response_model=NextSubnetResponse, + responses={ + 401: {"description": "Missing or invalid credentials"}, + 403: {"description": "Insufficient permissions"}, + 409: {"description": "Allocator exhausted"}, + }, +) +@_traced("api.topology.catalog.next_subnet") +async def api_next_subnet( + base: str = Query(default="172.20", pattern=r"^\d{1,3}\.\d{1,3}$"), + _viewer: dict = Depends(require_viewer), +) -> NextSubnetResponse: + reserved = await reserved_subnets(repo) + alloc = SubnetAllocator(base_prefix=base, reserved=reserved) + try: + subnet = alloc.next_free() + except AllocatorExhausted as e: + raise HTTPException(status_code=409, detail=str(e)) + return NextSubnetResponse(subnet=subnet) + + +@router.get( + "/{topology_id}/lans/{lan_id}/next-ip", + tags=["MazeNET Topologies"], + response_model=NextIPResponse, + responses={ + 401: {"description": "Missing or invalid credentials"}, + 403: {"description": "Insufficient permissions"}, + 404: {"description": "Topology 
or LAN not found"}, + 409: {"description": "Allocator exhausted"}, + }, +) +@_traced("api.topology.catalog.next_ip") +async def api_next_ip( + topology_id: str, + lan_id: str, + _viewer: dict = Depends(require_viewer), +) -> NextIPResponse: + if await repo.get_topology(topology_id) is None: + raise HTTPException(status_code=404, detail="Topology not found") + lans = await repo.list_lans_for_topology(topology_id) + lan = next((ln for ln in lans if ln["id"] == lan_id), None) + if lan is None: + raise HTTPException(status_code=404, detail="LAN not found") + deckies = await repo.list_topology_deckies(topology_id) + alloc = IPAllocator(subnet=lan["subnet"]) + for d in deckies: + ip = (d.get("decky_config") or {}).get("ips_by_lan", {}).get(lan["name"]) + if ip: + try: + alloc.reserve(ip) + except ValueError: + continue + try: + ip = alloc.next_free() + except AllocatorExhausted as e: + raise HTTPException(status_code=409, detail=str(e)) + return NextIPResponse(subnet=lan["subnet"], ip=ip) diff --git a/decnet/web/router/topology/api_get_topology.py b/decnet/web/router/topology/api_get_topology.py new file mode 100644 index 00000000..dd9ebaa9 --- /dev/null +++ b/decnet/web/router/topology/api_get_topology.py @@ -0,0 +1,66 @@ +"""GET /topologies/{id} and /topologies/{id}/status-events.""" +from __future__ import annotations + +from fastapi import APIRouter, Depends, HTTPException, Query + +from decnet.telemetry import traced as _traced +from decnet.topology.persistence import hydrate +from decnet.web.db.models import ( + DeckyRow, + EdgeRow, + LANRow, + TopologyDetail, + TopologyStatusEventRow, + TopologySummary, +) +from decnet.web.dependencies import repo, require_viewer + +router = APIRouter() + + +@router.get( + "/{topology_id}", + tags=["MazeNET Topologies"], + response_model=TopologyDetail, + responses={ + 401: {"description": "Missing or invalid credentials"}, + 403: {"description": "Insufficient permissions"}, + 404: {"description": "Topology not found"}, + }, +) 
+@_traced("api.topology.get") +async def api_get_topology( + topology_id: str, + _viewer: dict = Depends(require_viewer), +) -> TopologyDetail: + hydrated = await hydrate(repo, topology_id) + if hydrated is None: + raise HTTPException(status_code=404, detail="Topology not found") + return TopologyDetail( + topology=TopologySummary(**hydrated["topology"]), + lans=[LANRow(**r) for r in hydrated["lans"]], + deckies=[DeckyRow(**r) for r in hydrated["deckies"]], + edges=[EdgeRow(**r) for r in hydrated["edges"]], + ) + + +@router.get( + "/{topology_id}/status-events", + tags=["MazeNET Topologies"], + response_model=list[TopologyStatusEventRow], + responses={ + 401: {"description": "Missing or invalid credentials"}, + 403: {"description": "Insufficient permissions"}, + 404: {"description": "Topology not found"}, + }, +) +@_traced("api.topology.status_events") +async def api_get_status_events( + topology_id: str, + limit: int = Query(default=100, ge=1, le=1000), + _viewer: dict = Depends(require_viewer), +) -> list[TopologyStatusEventRow]: + if await repo.get_topology(topology_id) is None: + raise HTTPException(status_code=404, detail="Topology not found") + rows = await repo.list_topology_status_events(topology_id, limit=limit) + return [TopologyStatusEventRow(**r) for r in rows] diff --git a/decnet/web/router/topology/api_list_topologies.py b/decnet/web/router/topology/api_list_topologies.py new file mode 100644 index 00000000..f1df8ab3 --- /dev/null +++ b/decnet/web/router/topology/api_list_topologies.py @@ -0,0 +1,38 @@ +"""GET /topologies — paginated list of MazeNET topologies.""" +from __future__ import annotations + +from typing import Optional + +from fastapi import APIRouter, Depends, Query + +from decnet.telemetry import traced as _traced +from decnet.web.db.models import TopologyListResponse, TopologySummary +from decnet.web.dependencies import repo, require_viewer + +router = APIRouter() + + +@router.get( + "/", + tags=["MazeNET Topologies"], + 
response_model=TopologyListResponse, + responses={ + 401: {"description": "Missing or invalid credentials"}, + 403: {"description": "Insufficient permissions"}, + }, +) +@_traced("api.topology.list") +async def api_list_topologies( + status: Optional[str] = Query(default=None, description="Filter by topology status"), + limit: int = Query(default=50, ge=1, le=500), + offset: int = Query(default=0, ge=0), + _viewer: dict = Depends(require_viewer), +) -> TopologyListResponse: + total = await repo.count_topologies(status=status) + rows = await repo.list_topologies(status=status, limit=limit, offset=offset) + return TopologyListResponse( + total=total, + limit=limit, + offset=offset, + data=[TopologySummary(**r) for r in rows], + ) diff --git a/tests/api/topology/test_reads.py b/tests/api/topology/test_reads.py new file mode 100644 index 00000000..1951e4a3 --- /dev/null +++ b/tests/api/topology/test_reads.py @@ -0,0 +1,169 @@ +"""Phase 3 Step 2 — read endpoints: list / get / status-events / catalog.""" +from __future__ import annotations + +import pytest +from sqlmodel import select as _ss_select + +from decnet.topology.config import TopologyConfig +from decnet.topology.generator import generate +from decnet.topology.persistence import persist, transition_status +from decnet.topology.status import TopologyStatus +from decnet.web.db.models import Topology as _TopologyTable +from decnet.web.dependencies import repo as _repo + +_V1 = "/api/v1/topologies" +_LIST = f"{_V1}/" + + +def _cfg(name: str = "draft") -> TopologyConfig: + return TopologyConfig( + name=name, + depth=1, + branching_factor=1, + deckies_per_lan_min=1, + deckies_per_lan_max=1, + services_explicit=["ssh"], + randomize_services=False, + seed=0, + ) + + +async def _seed(name: str = "draft") -> str: + return await persist(_repo, generate(_cfg(name))) + + +@pytest.mark.anyio +async def test_list_empty_ok(client, auth_token): + r = await client.get(_LIST, headers={"Authorization": f"Bearer {auth_token}"}) + 
assert r.status_code == 200 + body = r.json() + assert body["total"] == 0 + assert body["data"] == [] + + +@pytest.mark.anyio +async def test_list_requires_auth(client): + r = await client.get(_LIST) + assert r.status_code == 401 + + +@pytest.mark.anyio +async def test_list_viewer_allowed(client, viewer_token): + r = await client.get(_LIST, headers={"Authorization": f"Bearer {viewer_token}"}) + assert r.status_code == 200 + + +@pytest.mark.anyio +async def test_list_with_topology_and_pagination(client, auth_token): + tid1 = await _seed("alpha") + await _seed("beta") + r = await client.get( + f"{_LIST}?limit=1&offset=0", + headers={"Authorization": f"Bearer {auth_token}"}, + ) + assert r.status_code == 200 + body = r.json() + assert body["total"] == 2 + assert len(body["data"]) == 1 + assert body["data"][0]["id"] in {tid1, body["data"][0]["id"]} + + +@pytest.mark.anyio +async def test_get_topology_hydrated(client, auth_token): + tid = await _seed("detail") + r = await client.get( + f"{_V1}/{tid}", headers={"Authorization": f"Bearer {auth_token}"} + ) + assert r.status_code == 200 + body = r.json() + assert body["topology"]["id"] == tid + assert body["topology"]["version"] == 1 + assert body["lans"], "seeded topology has at least one LAN" + assert body["deckies"] + + +@pytest.mark.anyio +async def test_get_topology_404(client, auth_token): + r = await client.get( + f"{_V1}/does-not-exist", + headers={"Authorization": f"Bearer {auth_token}"}, + ) + assert r.status_code == 404 + + +@pytest.mark.anyio +async def test_status_events_after_transition(client, auth_token): + tid = await _seed("events") + await transition_status(_repo, tid, TopologyStatus.DEPLOYING) + r = await client.get( + f"{_V1}/{tid}/status-events", + headers={"Authorization": f"Bearer {auth_token}"}, + ) + assert r.status_code == 200 + rows = r.json() + assert rows and rows[0]["to_status"] == "deploying" + + +@pytest.mark.anyio +async def test_status_events_404_on_missing(client, auth_token): + r = 
await client.get( + f"{_V1}/nope/status-events", + headers={"Authorization": f"Bearer {auth_token}"}, + ) + assert r.status_code == 404 + + +@pytest.mark.anyio +async def test_services_catalog(client, viewer_token): + r = await client.get( + f"{_V1}/services", + headers={"Authorization": f"Bearer {viewer_token}"}, + ) + assert r.status_code == 200 + body = r.json() + assert isinstance(body["services"], list) + assert "ssh" in body["services"] + + +@pytest.mark.anyio +async def test_next_subnet_starts_at_base(client, auth_token): + r = await client.get( + f"{_V1}/next-subnet?base=172.20", + headers={"Authorization": f"Bearer {auth_token}"}, + ) + assert r.status_code == 200 + assert r.json()["subnet"].startswith("172.20.") + + +@pytest.mark.anyio +async def test_next_ip_skips_gateway_and_existing(client, auth_token): + tid = await _seed("ipalloc") + # Find a LAN and existing decky IPs from the seeded topology. + r = await client.get( + f"{_V1}/{tid}", headers={"Authorization": f"Bearer {auth_token}"} + ) + body = r.json() + lan = body["lans"][0] + taken = { + (d.get("decky_config") or {}).get("ips_by_lan", {}).get(lan["name"]) + for d in body["deckies"] + } + taken.discard(None) + r2 = await client.get( + f"{_V1}/{tid}/lans/{lan['id']}/next-ip", + headers={"Authorization": f"Bearer {auth_token}"}, + ) + assert r2.status_code == 200 + ip = r2.json()["ip"] + assert ip not in taken + assert not ip.endswith(".1") # gateway skipped + + +@pytest.mark.anyio +async def test_next_ip_404_lan(client, auth_token): + tid = await _seed("nopelan") + r = await client.get( + f"{_V1}/{tid}/lans/bogus/next-ip", + headers={"Authorization": f"Bearer {auth_token}"}, + ) + assert r.status_code == 404 From 38db76dd143e71108e745bada217c3c07b8a82d6 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 18:30:32 -0400 Subject: [PATCH 017/448] fix(api): document 400 on topology read endpoints for schemathesis contract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit DECNET's app-level RequestValidationError handler remaps structural 422→400, including query/path constraint violations (limit bounds, the next-subnet base pattern, etc.). Schemathesis fuzzing will drive those code paths and fail response_schema_conformance unless 400 is declared in responses={}. Adds the entry to every phase-3 read route. --- decnet/web/router/topology/api_catalog.py | 2 ++ decnet/web/router/topology/api_get_topology.py | 2 ++ decnet/web/router/topology/api_list_topologies.py | 1 + 3 files changed, 5 insertions(+) diff --git a/decnet/web/router/topology/api_catalog.py b/decnet/web/router/topology/api_catalog.py index 44c44114..728f26a8 100644 --- a/decnet/web/router/topology/api_catalog.py +++ b/decnet/web/router/topology/api_catalog.py @@ -30,6 +30,7 @@ router = APIRouter() tags=["MazeNET Topologies"], response_model=ServiceCatalogResponse, responses={ + 400: {"description": "Malformed query parameters"}, 401: {"description": "Missing or invalid credentials"}, 403: {"description": "Insufficient permissions"}, }, @@ -70,6 +71,7 @@ async def api_next_subnet( tags=["MazeNET Topologies"], response_model=NextIPResponse, responses={ + 400: {"description": "Malformed path parameters"}, 401: {"description": "Missing or invalid credentials"}, 403: {"description": "Insufficient permissions"}, 404: {"description": "Topology or LAN not found"}, diff --git a/decnet/web/router/topology/api_get_topology.py b/decnet/web/router/topology/api_get_topology.py index dd9ebaa9..11a1535b 100644 --- a/decnet/web/router/topology/api_get_topology.py +++ b/decnet/web/router/topology/api_get_topology.py @@ -23,6 +23,7 @@ router = APIRouter() tags=["MazeNET Topologies"], response_model=TopologyDetail, responses={ + 400: {"description": "Malformed path parameters"}, 401: {"description": "Missing or invalid credentials"}, 403: {"description": "Insufficient permissions"}, 404: {"description": "Topology not found"}, @@ -49,6 +50,7 @@ async def 
api_get_topology( tags=["MazeNET Topologies"], response_model=list[TopologyStatusEventRow], responses={ + 400: {"description": "Malformed query parameters"}, 401: {"description": "Missing or invalid credentials"}, 403: {"description": "Insufficient permissions"}, 404: {"description": "Topology not found"}, diff --git a/decnet/web/router/topology/api_list_topologies.py b/decnet/web/router/topology/api_list_topologies.py index f1df8ab3..af97c90f 100644 --- a/decnet/web/router/topology/api_list_topologies.py +++ b/decnet/web/router/topology/api_list_topologies.py @@ -17,6 +17,7 @@ router = APIRouter() tags=["MazeNET Topologies"], response_model=TopologyListResponse, responses={ + 400: {"description": "Malformed query parameters"}, 401: {"description": "Missing or invalid credentials"}, 403: {"description": "Insufficient permissions"}, }, From 53db53792e77976be02aff3d0203542e2515ced5 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 19:10:09 -0400 Subject: [PATCH 018/448] =?UTF-8?q?feat(web):=20MazeNET=20scaffold=20?= =?UTF-8?q?=E2=80=94=20tokens,=20route,=20nav,=20stub=20page?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- decnet_web/src/App.tsx | 2 + decnet_web/src/components/Layout.tsx | 1 + decnet_web/src/components/MazeNET/MazeNET.css | 29 +++ decnet_web/src/components/MazeNET/MazeNET.tsx | 15 ++ decnet_web/src/index.css | 176 ++++++++++++++---- 5 files changed, 188 insertions(+), 35 deletions(-) create mode 100644 decnet_web/src/components/MazeNET/MazeNET.css create mode 100644 decnet_web/src/components/MazeNET/MazeNET.tsx diff --git a/decnet_web/src/App.tsx b/decnet_web/src/App.tsx index 50bdefc2..fe2c8e95 100644 --- a/decnet_web/src/App.tsx +++ b/decnet_web/src/App.tsx @@ -12,6 +12,7 @@ import Bounty from './components/Bounty'; import RemoteUpdates from './components/RemoteUpdates'; import SwarmHosts from './components/SwarmHosts'; import AgentEnrollment from './components/AgentEnrollment'; +import 
MazeNET from './components/MazeNET/MazeNET'; function isTokenValid(token: string): boolean { try { @@ -62,6 +63,7 @@ function App() { } /> } /> + } /> } /> } /> } /> diff --git a/decnet_web/src/components/Layout.tsx b/decnet_web/src/components/Layout.tsx index 5ee38f56..f0101292 100644 --- a/decnet_web/src/components/Layout.tsx +++ b/decnet_web/src/components/Layout.tsx @@ -43,6 +43,7 @@ const Layout: React.FC = ({ children, onLogout, onSearch }) => {