feat(swarm): heartbeat-driven topology resync for agent-pinned deployments
Agent heartbeats now carry an applied-topology snapshot. The master heartbeat handler compares the reported version_hash against what canonical_hash yields for the hydrated topology pinned to that host and flags Topology.needs_resync on divergence (or when the agent reports no topology at all while master expects one). The mutator watch loop gains reconcile_agent_resyncs, which re-pushes the current hydrated blob via AgentClient.apply_topology without touching status, then clears the flag on success. Push failures leave the flag set so the next tick retries.
This commit is contained in:
@@ -52,14 +52,26 @@ def _resolve_agent_dir() -> pathlib.Path:
|
||||
|
||||
async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_version: str) -> None:
|
||||
snap = await _exec.status()
|
||||
resp = await client.post(
|
||||
url,
|
||||
json={
|
||||
"host_uuid": host_uuid,
|
||||
"agent_version": agent_version,
|
||||
"status": snap,
|
||||
},
|
||||
)
|
||||
body: dict = {
|
||||
"host_uuid": host_uuid,
|
||||
"agent_version": agent_version,
|
||||
"status": snap,
|
||||
}
|
||||
# Best-effort: fold in applied-topology snapshot. Failures must never
|
||||
# wedge the heartbeat loop — master will fall back to "no topology
|
||||
# reported" which triggers a resync if it expected one.
|
||||
try:
|
||||
from decnet.agent import topology_ops as _topo_ops
|
||||
from decnet.agent.topology_store import TopologyStore
|
||||
store = TopologyStore(_resolve_agent_dir() / "topology.db")
|
||||
try:
|
||||
body["topology"] = _topo_ops.state(store)
|
||||
finally:
|
||||
store.close()
|
||||
except Exception:
|
||||
log.debug("heartbeat: topology state unavailable", exc_info=True)
|
||||
|
||||
resp = await client.post(url, json=body)
|
||||
# 403 / 404 are terminal-ish — we still keep looping because an
|
||||
# operator may re-enrol the host mid-session, but we log loudly so
|
||||
# prod ops can spot cert-pinning drift.
|
||||
|
||||
Reference in New Issue
Block a user