feat(mazenet): step 7 — topology_mutations queue + mutator reconciler
Adds the live-mutation pipeline for active/degraded topologies:

* TopologyMutation table with composite index (state, topology_id) so the watch-loop guard query stays O(log n).
* claim_next_mutation is a single atomic UPDATE ... WHERE state='pending' so racing reconcilers deterministically pick one winner; losers see rowcount=0 and skip.
* reconcile_topologies drains pending rows per live topology, applies via decnet.mutator.ops.dispatch, and on failure marks the mutation failed + transitions topology to degraded.
* run_watch_loop gains a gated branch: flat-fleet mutate_all runs every tick unchanged; the reconciler only enters when the cheap has_pending_topology_mutation guard returns True.
* apply_* ops re-check hard invariants (names, IP collisions, subnet overlap, known services, service_config shape) after every mutation so the repo never lands in an invalid state.
* CLI: 'decnet topology mutate' / 'mutations' subcommands.
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
from datetime import datetime, timezone
|
||||
from typing import Literal, Optional, Any, List, Annotated
|
||||
from uuid import uuid4
|
||||
from sqlalchemy import Column, Text, UniqueConstraint
|
||||
from sqlalchemy import Column, Index, Text, UniqueConstraint
|
||||
from sqlalchemy.dialects.mysql import MEDIUMTEXT
|
||||
from sqlmodel import SQLModel, Field
|
||||
from pydantic import BaseModel, ConfigDict, Field as PydanticField, BeforeValidator
|
||||
@@ -309,6 +309,44 @@ class TopologyStatusEvent(SQLModel, table=True):
|
||||
)
|
||||
|
||||
|
||||
class TopologyMutation(SQLModel, table=True):
    """One queued live-change request against a MazeNET topology.

    A row captures a single operator intent: the ``op`` name plus its
    JSON ``payload``.  ``state`` tracks the row through its lifecycle
    (``pending`` -> ``applying`` -> ``applied`` or ``failed``); the
    reconciler claims ``pending`` rows through
    ``SQLModelRepository.claim_next_mutation`` and records the outcome.

    The table carries a composite ``(state, topology_id)`` index named
    ``ix_topology_mutations_state_topology``, intended to keep the
    watch-loop "is anything pending?" query cheap as history grows.
    """

    __tablename__ = "topology_mutations"
    __table_args__ = (
        Index("ix_topology_mutations_state_topology", "state", "topology_id"),
    )

    # Primary key: a random UUID4 rendered as a string, generated client-side.
    id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True)

    # Owning topology.
    topology_id: str = Field(foreign_key="topologies.id", index=True)

    # Operation name.  Known values:
    #   add_lan | remove_lan | attach_decky | detach_decky | remove_decky |
    #   update_decky | update_lan
    op: str = Field(index=True)

    # JSON-serialised arguments for ``op``; the expected keys depend on the op.
    payload: str = Field(
        sa_column=Column("payload", _BIG_TEXT, nullable=False, default="{}")
    )

    # Lifecycle marker: pending | applying | applied | failed.
    state: str = Field(default="pending", index=True)

    # UTC timestamp taken when the row object is created.
    requested_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc), index=True
    )

    # Set when the mutation reaches a terminal state; NULL until then.
    applied_at: Optional[datetime] = Field(default=None)

    # Free-text explanation, written when a mutation is marked failed.
    reason: Optional[str] = Field(
        default=None, sa_column=Column("reason", Text, nullable=True)
    )
|
||||
|
||||
|
||||
# --- API Request/Response Models (Pydantic) ---
|
||||
|
||||
class Token(BaseModel):
|
||||
|
||||
@@ -327,3 +327,41 @@ class BaseRepository(ABC):
|
||||
self, edge_id: str, *, expected_version: Optional[int] = None
|
||||
) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
# -------------------- live mutation queue (reconciler) --------------------
|
||||
|
||||
async def enqueue_topology_mutation(
    self,
    topology_id: str,
    op: str,
    payload: dict[str, Any],
    *,
    expected_version: Optional[int] = None,
) -> str:
    """Queue a live mutation for ``topology_id`` and return its id.

    Abstract hook: this base implementation always raises
    ``NotImplementedError``; concrete repositories must override it.
    """
    raise NotImplementedError
|
||||
|
||||
async def claim_next_mutation(self, topology_id: str) -> Optional[dict[str, Any]]:
    """Claim the next queued mutation of ``topology_id``, if any.

    Abstract hook: this base implementation always raises
    ``NotImplementedError``; concrete repositories must override it.
    """
    raise NotImplementedError
|
||||
|
||||
async def mark_mutation_applied(self, mutation_id: str) -> None:
    """Record that mutation ``mutation_id`` was applied.

    Abstract hook: this base implementation always raises
    ``NotImplementedError``; concrete repositories must override it.
    """
    raise NotImplementedError
|
||||
|
||||
async def mark_mutation_failed(self, mutation_id: str, reason: str) -> None:
    """Record that mutation ``mutation_id`` failed, with a ``reason``.

    Abstract hook: this base implementation always raises
    ``NotImplementedError``; concrete repositories must override it.
    """
    raise NotImplementedError
|
||||
|
||||
async def list_topology_mutations(
    self, topology_id: str, state: Optional[str] = None
) -> list[dict[str, Any]]:
    """List the mutations of ``topology_id``, optionally filtered by ``state``.

    Abstract hook: this base implementation always raises
    ``NotImplementedError``; concrete repositories must override it.
    """
    raise NotImplementedError
|
||||
|
||||
async def has_pending_topology_mutation(self) -> bool:
    """Tell whether any mutation is waiting to be reconciled.

    Unlike the other queue hooks this one has a working default: it
    reports ``False``, i.e. "nothing pending", unless overridden.
    """
    return False
|
||||
|
||||
async def list_live_topology_ids(self) -> list[str]:
    """Return the ids of live topologies.

    Default implementation: an empty list, i.e. "no live topologies",
    unless overridden.  A fresh list is returned on every call.
    """
    return []
|
||||
|
||||
@@ -41,6 +41,7 @@ from decnet.web.db.models import (
|
||||
TopologyDecky,
|
||||
TopologyEdge,
|
||||
TopologyStatusEvent,
|
||||
TopologyMutation,
|
||||
)
|
||||
|
||||
|
||||
@@ -1336,3 +1337,165 @@ class SQLModelRepository(BaseRepository):
|
||||
.limit(limit)
|
||||
)
|
||||
return [r.model_dump(mode="json") for r in result.scalars().all()]
|
||||
|
||||
# ---------------- topology_mutations (live reconciler queue) ----------------
|
||||
|
||||
async def enqueue_topology_mutation(
    self,
    topology_id: str,
    op: str,
    payload: dict[str, Any],
    *,
    expected_version: Optional[int] = None,
) -> str:
    """Insert a ``pending`` mutation row and return its id.

    Within one session this first runs ``_check_and_bump_version`` for
    the topology (passing ``expected_version`` through), then stores a
    new ``TopologyMutation`` whose payload is the JSON encoding of
    ``payload``.  Both changes are committed together.

    Meant for topologies in ``active|degraded``; the queued row is what
    the reconciler later claims.
    """
    async with self._session() as session:
        await self._check_and_bump_version(session, topology_id, expected_version)

        encoded_payload = orjson.dumps(payload).decode()
        mutation = TopologyMutation(
            topology_id=topology_id, op=op, payload=encoded_payload
        )
        session.add(mutation)

        await session.commit()
        # Reload after the commit so reading ``id`` below does not depend
        # on the instance still holding its attribute values.
        await session.refresh(mutation)
        return mutation.id
|
||||
|
||||
async def claim_next_mutation(
    self, topology_id: str
) -> Optional[dict[str, Any]]:
    """Claim the oldest ``pending`` mutation of ``topology_id`` for this caller.

    The claim is a compare-and-swap on the row's ``state``:

    1. pick the id of the oldest ``pending`` row (ties broken by id);
    2. ``UPDATE ... SET state = 'applying' WHERE id = :i AND state =
       'pending'``.  The state re-check lives inside the UPDATE itself,
       so when several reconcilers pick the same candidate only one of
       them matches the row; every other caller sees ``rowcount == 0``;
    3. load the claimed row *by primary key*.

    Step 3 is what makes the return value trustworthy: looking the row
    up as "oldest ``applying`` row of this topology" could hand back a
    different mutation that some other reconciler (or an earlier,
    crashed run) had already moved to ``applying``, which would execute
    that op twice and leave the row claimed here stuck.

    Returns:
        The claimed row as a JSON-mode ``model_dump`` dict, or ``None``
        when the topology has no pending mutation or another caller won
        the race.  ``None`` is the expected, non-error outcome under
        contention.
    """
    async with self._session() as session:
        # NOTE(review): the candidate lookup is a separate statement rather
        # than a same-table subquery inside the UPDATE; MySQL is documented
        # to reject that form -- confirm against the deployed backends.
        candidate = await session.execute(
            text(
                "SELECT id FROM topology_mutations "
                "WHERE topology_id = :t AND state = 'pending' "
                "ORDER BY requested_at ASC, id ASC "
                "LIMIT 1"
            ),
            {"t": topology_id},
        )
        mutation_id = candidate.scalar_one_or_none()
        if mutation_id is None:
            await session.commit()
            return None

        # Atomic claim: succeeds only if the row is still 'pending'.
        claim = await session.execute(
            text(
                "UPDATE topology_mutations "
                "SET state = 'applying' "
                "WHERE id = :i AND state = 'pending'"
            ),
            {"i": mutation_id},
        )
        if claim.rowcount == 0:
            # Another reconciler claimed this row first.
            await session.commit()
            return None

        # Read back exactly the row we claimed.
        claimed = await session.execute(
            select(TopologyMutation).where(TopologyMutation.id == mutation_id)
        )
        row = claimed.scalar_one_or_none()
        # Serialise before committing so the dump never depends on how the
        # session treats loaded attributes after a commit.
        data = None if row is None else row.model_dump(mode="json")
        await session.commit()
        return data
|
||||
|
||||
async def mark_mutation_applied(self, mutation_id: str) -> None:
    """Set mutation ``mutation_id`` to ``applied`` and stamp ``applied_at``.

    The timestamp is the current UTC time, bound as an ISO-8601 string.
    The UPDATE is keyed on the id only; the row's current state is not
    checked.
    """
    async with self._session() as session:
        statement = text(
            "UPDATE topology_mutations "
            "SET state = 'applied', applied_at = :at "
            "WHERE id = :i"
        )
        params = {
            "at": datetime.now(timezone.utc).isoformat(),
            "i": mutation_id,
        }
        await session.execute(statement, params)
        await session.commit()
|
||||
|
||||
async def mark_mutation_failed(self, mutation_id: str, reason: str) -> None:
    """Set mutation ``mutation_id`` to ``failed`` and store ``reason``.

    ``applied_at`` is stamped with the current UTC time (bound as an
    ISO-8601 string), so for failed rows it records when the failure was
    written.  The UPDATE is keyed on the id only; the row's current
    state is not checked.
    """
    async with self._session() as session:
        statement = text(
            "UPDATE topology_mutations "
            "SET state = 'failed', applied_at = :at, reason = :r "
            "WHERE id = :i"
        )
        params = {
            "at": datetime.now(timezone.utc).isoformat(),
            "r": reason,
            "i": mutation_id,
        }
        await session.execute(statement, params)
        await session.commit()
|
||||
|
||||
async def list_topology_mutations(
    self,
    topology_id: str,
    state: Optional[str] = None,
) -> list[dict[str, Any]]:
    """Return the mutations of ``topology_id``, newest request first.

    Args:
        topology_id: Topology whose queue is listed.
        state: When given, only rows in exactly this state are returned.

    Returns:
        One JSON-mode ``model_dump`` dict per row, ordered by
        ``requested_at`` descending.
    """
    query = select(TopologyMutation).where(
        TopologyMutation.topology_id == topology_id
    )
    if state is not None:
        query = query.where(TopologyMutation.state == state)
    query = query.order_by(desc(TopologyMutation.requested_at))

    async with self._session() as session:
        mutations = (await session.execute(query)).scalars().all()
        return [mutation.model_dump(mode="json") for mutation in mutations]
|
||||
|
||||
async def has_pending_topology_mutation(self) -> bool:
    """Report whether any live topology has a ``pending`` mutation.

    "Live" means the topology's status is ``active`` or ``degraded``.
    The query stops at the first matching row (``LIMIT 1``) and is
    meant to be served by the ``ix_topology_mutations_state_topology``
    index, so the watch loop can call it on every tick and skip the
    reconciler when it returns ``False``.
    """
    probe = text(
        "SELECT 1 FROM topology_mutations "
        "WHERE state = 'pending' "
        "AND topology_id IN ("
        " SELECT id FROM topologies "
        " WHERE status IN ('active', 'degraded')"
        ") LIMIT 1"
    )
    async with self._session() as session:
        hit = (await session.execute(probe)).first()
        return hit is not None
|
||||
|
||||
async def list_live_topology_ids(self) -> list[str]:
    """Return the ids of every topology whose status is ``active`` or ``degraded``."""
    live_statuses = ["active", "degraded"]
    query = select(Topology.id).where(Topology.status.in_(live_statuses))
    async with self._session() as session:
        found = await session.execute(query)
        return list(found.scalars().all())
|
||||
|
||||
Reference in New Issue
Block a user