feat(swarm): heartbeat-driven topology resync for agent-pinned deployments

Agent heartbeats now carry an applied-topology snapshot. The master heartbeat handler compares the reported version_hash against the value canonical_hash yields for the hydrated topology pinned to that host, and sets Topology.needs_resync when the two diverge or when the agent reports no topology at all while master expects one. The mutator watch loop gains reconcile_agent_resyncs, which re-pushes the current hydrated blob via AgentClient.apply_topology without touching status and clears the flag on success. A failed push leaves the flag set, so the next tick retries.
2026-04-21 01:35:12 -04:00
parent 05d1ebbaaa
commit e8f9c955b3
9 changed files with 581 additions and 8 deletions
--- a/decnet/agent/heartbeat.py
+++ b/decnet/agent/heartbeat.py
@@ -52,14 +52,26 @@ def _resolve_agent_dir() -> pathlib.Path:

 async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_version: str) -> None:
    snap = await _exec.status()
-    resp = await client.post(
-        url,
-        json={
-            "host_uuid": host_uuid,
-            "agent_version": agent_version,
-            "status": snap,
-        },
-    )
+    body: dict = {
+        "host_uuid": host_uuid,
+        "agent_version": agent_version,
+        "status": snap,
+    }
+    # Best-effort: fold in applied-topology snapshot. Failures must never
+    # wedge the heartbeat loop — master will fall back to "no topology
+    # reported" which triggers a resync if it expected one.
+    try:
+        from decnet.agent import topology_ops as _topo_ops
+        from decnet.agent.topology_store import TopologyStore
+        store = TopologyStore(_resolve_agent_dir() / "topology.db")
+        try:
+            body["topology"] = _topo_ops.state(store)
+        finally:
+            store.close()
+    except Exception:
+        log.debug("heartbeat: topology state unavailable", exc_info=True)
+
+    resp = await client.post(url, json=body)
    # 403 / 404 are terminal-ish — we still keep looping because an
    # operator may re-enrol the host mid-session, but we log loudly so
    # prod ops can spot cert-pinning drift.