feat(swarm): heartbeat-driven topology resync for agent-pinned deployments

Agent heartbeats now carry an applied-topology snapshot. The master
heartbeat handler compares the reported version_hash against what
canonical_hash yields for the hydrated topology pinned to that host
and flags Topology.needs_resync on divergence (or when the agent
reports no topology at all while master expects one).

The mutator watch loop gains reconcile_agent_resyncs, which re-pushes
the current hydrated blob via AgentClient.apply_topology without
touching status, then clears the flag on success. Push failures leave
the flag set so the next tick retries.
This commit is contained in:
2026-04-21 01:35:12 -04:00
parent 05d1ebbaaa
commit e8f9c955b3
9 changed files with 581 additions and 8 deletions

View File

@@ -225,6 +225,11 @@ class Topology(SQLModel, table=True):
# the topology or any child row when an expected_version is supplied.
# Callers pass their last-seen version; mismatch raises VersionConflict.
version: int = Field(default=1, nullable=False)
# Set by the heartbeat handler when an agent's reported
# ``applied_version_hash`` diverges from what we expect it to be
# running. Drained by the mutator watch loop, which re-pushes via
# AgentClient and clears the flag. NULL for unihost topologies.
needs_resync: bool = Field(default=False, nullable=False)
class LAN(SQLModel, table=True):

View File

@@ -268,6 +268,12 @@ class BaseRepository(ABC):
async def delete_topology_cascade(self, topology_id: str) -> bool:
raise NotImplementedError
async def set_topology_resync(self, topology_id: str, value: bool) -> None:
raise NotImplementedError
async def list_topologies_needing_resync(self) -> list[dict[str, Any]]:
raise NotImplementedError
async def add_lan(self, data: dict[str, Any]) -> str:
raise NotImplementedError

View File

@@ -1013,6 +1013,30 @@ class SQLModelRepository(BaseRepository):
)
await session.commit()
async def set_topology_resync(self, topology_id: str, value: bool) -> None:
async with self._session() as session:
result = await session.execute(
select(Topology).where(Topology.id == topology_id)
)
topo = result.scalar_one_or_none()
if topo is None:
return
topo.needs_resync = bool(value)
session.add(topo)
await session.commit()
async def list_topologies_needing_resync(self) -> list[dict[str, Any]]:
async with self._session() as session:
result = await session.execute(
select(Topology).where(Topology.needs_resync == True) # noqa: E712
)
return [
self._deserialize_json_fields(
r.model_dump(mode="json"), ("config_snapshot",)
)
for r in result.scalars().all()
]
async def delete_topology_cascade(self, topology_id: str) -> bool:
"""Delete topology and all children. No portable ON DELETE CASCADE."""
async with self._session() as session: