feat(swarm): heartbeat-driven topology resync for agent-pinned deployments

Agent heartbeats now carry an applied-topology snapshot. The master
heartbeat handler compares the reported version_hash against what
canonical_hash yields for the hydrated topology pinned to that host
and flags Topology.needs_resync on divergence (or when the agent
reports no topology at all while master expects one).

The mutator watch loop gains reconcile_agent_resyncs, which re-pushes
the current hydrated blob via AgentClient.apply_topology without
touching status, then clears the flag on success. Push failures leave
the flag set so the next tick retries.
This commit is contained in:
2026-04-21 01:35:12 -04:00
parent 05d1ebbaaa
commit e8f9c955b3
9 changed files with 581 additions and 8 deletions

View File

@@ -52,14 +52,26 @@ def _resolve_agent_dir() -> pathlib.Path:
async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_version: str) -> None:
snap = await _exec.status()
resp = await client.post(
url,
json={
"host_uuid": host_uuid,
"agent_version": agent_version,
"status": snap,
},
)
body: dict = {
"host_uuid": host_uuid,
"agent_version": agent_version,
"status": snap,
}
# Best-effort: fold in applied-topology snapshot. Failures must never
# wedge the heartbeat loop — master will fall back to "no topology
# reported" which triggers a resync if it expected one.
try:
from decnet.agent import topology_ops as _topo_ops
from decnet.agent.topology_store import TopologyStore
store = TopologyStore(_resolve_agent_dir() / "topology.db")
try:
body["topology"] = _topo_ops.state(store)
finally:
store.close()
except Exception:
log.debug("heartbeat: topology state unavailable", exc_info=True)
resp = await client.post(url, json=body)
# 403 / 404 are terminal-ish — we still keep looping because an
# operator may re-enrol the host mid-session, but we log loudly so
# prod ops can spot cert-pinning drift.