feat(swarm): heartbeat-driven topology resync for agent-pinned deployments
Agent heartbeats now carry an applied-topology snapshot. The master heartbeat handler compares the reported version_hash against what canonical_hash yields for the hydrated topology pinned to that host and flags Topology.needs_resync on divergence (or when the agent reports no topology at all while master expects one). The mutator watch loop gains reconcile_agent_resyncs, which re-pushes the current hydrated blob via AgentClient.apply_topology without touching status, then clears the flag on success. Push failures leave the flag set so the next tick retries.
This commit is contained in:
@@ -1013,6 +1013,30 @@ class SQLModelRepository(BaseRepository):
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
async def set_topology_resync(self, topology_id: str, value: bool) -> None:
|
||||
async with self._session() as session:
|
||||
result = await session.execute(
|
||||
select(Topology).where(Topology.id == topology_id)
|
||||
)
|
||||
topo = result.scalar_one_or_none()
|
||||
if topo is None:
|
||||
return
|
||||
topo.needs_resync = bool(value)
|
||||
session.add(topo)
|
||||
await session.commit()
|
||||
|
||||
async def list_topologies_needing_resync(self) -> list[dict[str, Any]]:
|
||||
async with self._session() as session:
|
||||
result = await session.execute(
|
||||
select(Topology).where(Topology.needs_resync == True) # noqa: E712
|
||||
)
|
||||
return [
|
||||
self._deserialize_json_fields(
|
||||
r.model_dump(mode="json"), ("config_snapshot",)
|
||||
)
|
||||
for r in result.scalars().all()
|
||||
]
|
||||
|
||||
async def delete_topology_cascade(self, topology_id: str) -> bool:
|
||||
"""Delete topology and all children. No portable ON DELETE CASCADE."""
|
||||
async with self._session() as session:
|
||||
|
||||
Reference in New Issue
Block a user