feat(swarm): heartbeat-driven topology resync for agent-pinned deployments
Agent heartbeats now carry an applied-topology snapshot. The master heartbeat handler compares the reported version_hash against what canonical_hash yields for the hydrated topology pinned to that host and flags Topology.needs_resync on divergence (or when the agent reports no topology at all while master expects one). The mutator watch loop gains reconcile_agent_resyncs, which re-pushes the current hydrated blob via AgentClient.apply_topology without touching status, then clears the flag on success. Push failures leave the flag set so the next tick retries.
This commit is contained in:
224
tests/swarm/test_heartbeat_topology_resync.py
Normal file
224
tests/swarm/test_heartbeat_topology_resync.py
Normal file
@@ -0,0 +1,224 @@
|
||||
"""Heartbeat-driven topology resync: master flags divergent agents.
|
||||
|
||||
When an agent reports an applied_version_hash that differs from what
|
||||
master computed for the topology pinned to that host (or reports no
|
||||
topology at all while master expects one), the heartbeat handler must
|
||||
set ``needs_resync=True`` on the topology row. The mutator reconcile
|
||||
loop picks it up later — tested separately.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from decnet.topology.config import TopologyConfig
|
||||
from decnet.topology.generator import generate
|
||||
from decnet.topology.hashing import canonical_hash
|
||||
from decnet.topology.persistence import hydrate, persist, transition_status
|
||||
from decnet.topology.status import TopologyStatus
|
||||
from decnet.web.db.factory import get_repository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm import api_heartbeat as hb_mod
|
||||
|
||||
|
||||
@pytest.fixture
def ca_dir(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch) -> pathlib.Path:
    """Redirect the swarm PKI to a per-test CA directory.

    Patches ``pki.DEFAULT_CA_DIR`` to ``tmp_path / "ca"`` and rebinds the
    ``pki`` attribute on the swarm client and enroll-host router modules to
    that same ``pki`` module object.

    Returns:
        The patched CA directory path.
    """
    from decnet.swarm import pki
    from decnet.swarm import client as swarm_client
    from decnet.web.router.swarm import api_enroll_host as enroll_mod

    ca_root = tmp_path / "ca"
    monkeypatch.setattr(pki, "DEFAULT_CA_DIR", ca_root)
    # Both consumer modules get their ``pki`` attribute rebound to the
    # module whose default directory was just patched.
    for consumer in (swarm_client, enroll_mod):
        monkeypatch.setattr(consumer, "pki", pki)
    return ca_root
|
||||
|
||||
|
||||
@pytest.fixture
def repo(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch):
    """Create a repository backed by a database file under ``tmp_path``.

    The new repository is also installed as the ``repo`` attribute of
    ``decnet.web.dependencies`` and ``decnet.web.swarm_api`` for the
    duration of the test.
    """
    fresh_repo = get_repository(db_path=str(tmp_path / "hb-resync.db"))

    import decnet.web.dependencies as deps
    import decnet.web.swarm_api as swarm_api_mod

    for module in (deps, swarm_api_mod):
        monkeypatch.setattr(module, "repo", fresh_repo)
    return fresh_repo
|
||||
|
||||
|
||||
@pytest.fixture
def client(repo, ca_dir):
    """Yield a ``TestClient`` for the swarm API wired to the test repository.

    The ``get_repo`` dependency is overridden so request handlers receive
    the ``repo`` fixture. ``ca_dir`` is requested only for its patching
    side effects.

    The overrides are cleared in a ``finally`` block so that a failure while
    entering or leaving the ``TestClient`` context cannot leak the override
    into later tests.
    """
    from decnet.web.swarm_api import app

    async def _override() -> Any:
        return repo

    app.dependency_overrides[get_repo] = _override
    try:
        with TestClient(app) as c:
            yield c
    finally:
        app.dependency_overrides.clear()
|
||||
|
||||
|
||||
def _enroll(c: TestClient, name: str) -> dict:
    """Enroll a worker called *name* through ``POST /swarm/enroll``.

    Asserts that the endpoint answered 201 and returns the decoded JSON
    body of the response.
    """
    payload = {"name": name, "address": "10.0.0.5", "agent_port": 8765}
    response = c.post("/swarm/enroll", json=payload)
    assert response.status_code == 201, response.text
    return response.json()
|
||||
|
||||
|
||||
def _cfg(**kw) -> TopologyConfig:
    """Build the ``TopologyConfig`` used by these tests.

    Keyword arguments override (or extend) the defaults listed below.
    """
    defaults = {
        "name": "hb-resync",
        "mode": "agent",
        "depth": 1,
        "branching_factor": 1,
        "deckies_per_lan_min": 1,
        "deckies_per_lan_max": 1,
        "cross_edge_probability": 0.0,
        "randomize_services": False,
        "services_explicit": ["ssh"],
        "seed": 3,
    }
    # Caller-supplied values win over the defaults.
    return TopologyConfig(**{**defaults, **kw})
|
||||
|
||||
|
||||
async def _persist_active(repo, host_uuid: str) -> tuple[str, str]:
    """Persist a generated topology pinned to *host_uuid* and make it ACTIVE.

    The topology is moved through DEPLOYING and then ACTIVE.

    Returns:
        ``(topology_id, hash)`` where ``hash`` is ``canonical_hash`` of the
        hydrated topology.
    """
    topology_id = await persist(repo, generate(_cfg()), target_host_uuid=host_uuid)
    for status in (TopologyStatus.DEPLOYING, TopologyStatus.ACTIVE):
        await transition_status(repo, topology_id, status)
    return topology_id, canonical_hash(await hydrate(repo, topology_id))
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_heartbeat_matching_hash_does_not_flag(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A heartbeat reporting master's own hash leaves ``needs_resync`` False."""
    worker = _enroll(client, "worker-match")
    # Stub the heartbeat module's fingerprint extractor so it returns the
    # fingerprint of the worker enrolled above.
    monkeypatch.setattr(hb_mod, "_extract_peer_fingerprint", lambda s: worker["fingerprint"])
    topology_id, master_hash = await _persist_active(repo, worker["host_uuid"])

    heartbeat = {
        "host_uuid": worker["host_uuid"],
        "status": {"deployed": False},
        "topology": {
            "topology_id": topology_id,
            "applied_version_hash": master_hash,
            "observed": {"bridges": [], "containers": []},
        },
    }
    resp = client.post("/swarm/heartbeat", json=heartbeat)
    assert resp.status_code == 204, resp.text

    stored = await repo.get_topology(topology_id)
    assert stored["needs_resync"] is False
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_heartbeat_hash_mismatch_flags_resync(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A heartbeat carrying a different hash sets ``needs_resync`` to True."""
    worker = _enroll(client, "worker-drift")
    # Stub the heartbeat module's fingerprint extractor so it returns the
    # fingerprint of the worker enrolled above.
    monkeypatch.setattr(hb_mod, "_extract_peer_fingerprint", lambda s: worker["fingerprint"])
    topology_id, _ = await _persist_active(repo, worker["host_uuid"])

    # A hash value that is not the one computed for the persisted topology.
    reported_hash = "stale-hash-" + "0" * 40
    heartbeat = {
        "host_uuid": worker["host_uuid"],
        "status": {"deployed": False},
        "topology": {
            "topology_id": topology_id,
            "applied_version_hash": reported_hash,
            "observed": {"bridges": [], "containers": []},
        },
    }
    resp = client.post("/swarm/heartbeat", json=heartbeat)
    assert resp.status_code == 204, resp.text

    stored = await repo.get_topology(topology_id)
    assert stored["needs_resync"] is True
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_heartbeat_agent_reports_no_topology_flags_resync(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Fresh-boot / wiped-cache case.

    The agent reports a null topology id and hash while master holds an
    ACTIVE topology pinned to that host, so the topology must be flagged
    for re-push.
    """
    worker = _enroll(client, "worker-fresh")
    # Stub the heartbeat module's fingerprint extractor so it returns the
    # fingerprint of the worker enrolled above.
    monkeypatch.setattr(hb_mod, "_extract_peer_fingerprint", lambda s: worker["fingerprint"])
    topology_id, _ = await _persist_active(repo, worker["host_uuid"])

    heartbeat = {
        "host_uuid": worker["host_uuid"],
        "status": {"deployed": False},
        "topology": {
            "topology_id": None,
            "applied_version_hash": None,
            "observed": {"bridges": [], "containers": []},
        },
    }
    resp = client.post("/swarm/heartbeat", json=heartbeat)
    assert resp.status_code == 204, resp.text

    stored = await repo.get_topology(topology_id)
    assert stored["needs_resync"] is True
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_heartbeat_without_topology_block_is_noop_for_resync(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A heartbeat with no ``topology`` key is accepted and still flags.

    The request body carries only ``host_uuid`` and ``status``. The handler
    must answer 204, and because an ACTIVE topology is pinned to the host,
    the missing block is treated as "no topology reported" and the topology
    is flagged.

    Note: despite ``is_noop`` in the test name, the assertion below expects
    ``needs_resync`` to be True.
    """
    host = _enroll(client, "worker-legacy")
    # Stub the heartbeat module's fingerprint extractor so it returns the
    # fingerprint of the worker enrolled above.
    monkeypatch.setattr(hb_mod, "_extract_peer_fingerprint", lambda s: host["fingerprint"])
    tid, _ = await _persist_active(repo, host["host_uuid"])

    resp = client.post(
        "/swarm/heartbeat",
        json={"host_uuid": host["host_uuid"], "status": {"deployed": False}},
    )
    assert resp.status_code == 204, resp.text

    row = await repo.get_topology(tid)
    # Absence of the topology block means the agent reported nothing, which
    # is handled like "no topology reported" and therefore flags.
    assert row["needs_resync"] is True
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_heartbeat_other_host_topology_unaffected(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Reports from one host must not flip resync flags on another host.

    Host A owns the ACTIVE topology. Host B sends a heartbeat that reports
    no topology. Host A's topology must stay unflagged.
    """
    host_a = _enroll(client, "worker-a")

    # Host B needs a different address, so it is enrolled directly rather
    # than through ``_enroll``. The status check mirrors that helper so a
    # failed enrollment fails here instead of as a KeyError further down.
    enroll_b = client.post(
        "/swarm/enroll",
        json={"name": "worker-b", "address": "10.0.0.6", "agent_port": 8765},
    )
    assert enroll_b.status_code == 201, enroll_b.text
    host_b = enroll_b.json()

    # Stub the heartbeat module's fingerprint extractor so it returns
    # host B's fingerprint.
    monkeypatch.setattr(hb_mod, "_extract_peer_fingerprint", lambda s: host_b["fingerprint"])
    tid_a, hash_a = await _persist_active(repo, host_a["host_uuid"])

    resp = client.post(
        "/swarm/heartbeat",
        json={
            "host_uuid": host_b["host_uuid"],
            "status": {"deployed": False},
            "topology": {
                "topology_id": None,
                "applied_version_hash": None,
                "observed": {},
            },
        },
    )
    assert resp.status_code == 204, resp.text

    row = await repo.get_topology(tid_a)
    assert row["needs_resync"] is False
|
||||
168
tests/topology/test_resync_reconcile.py
Normal file
168
tests/topology/test_resync_reconcile.py
Normal file
@@ -0,0 +1,168 @@
|
||||
"""Mutator reconcile loop + deployer.resync_agent_topology.
|
||||
|
||||
Covers the last mile of Step 7: once the heartbeat handler flags a
|
||||
topology as ``needs_resync``, the mutator's ``reconcile_agent_resyncs``
|
||||
pass must pick it up, re-push via AgentClient, and clear the flag.
|
||||
Failures must leave the flag set so the next tick retries.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from decnet.engine import deployer as _deployer
|
||||
from decnet.mutator import engine as _mut_engine
|
||||
from decnet.topology.config import TopologyConfig
|
||||
from decnet.topology.generator import generate
|
||||
from decnet.topology.hashing import canonical_hash
|
||||
from decnet.topology.persistence import hydrate, persist, transition_status
|
||||
from decnet.topology.status import TopologyStatus
|
||||
from decnet.web.db.factory import get_repository
|
||||
|
||||
|
||||
def _cfg(**kw) -> TopologyConfig:
    """Build the ``TopologyConfig`` used by these tests.

    Keyword arguments override (or extend) the defaults listed below.
    """
    defaults = {
        "name": "resync",
        "mode": "agent",
        "depth": 1,
        "branching_factor": 1,
        "deckies_per_lan_min": 1,
        "deckies_per_lan_max": 1,
        "cross_edge_probability": 0.0,
        "randomize_services": False,
        "services_explicit": ["ssh"],
        "seed": 9,
    }
    # Caller-supplied values win over the defaults.
    return TopologyConfig(**{**defaults, **kw})
|
||||
|
||||
|
||||
@pytest.fixture
async def repo(tmp_path):
    """Return an initialized repository stored in a file under ``tmp_path``."""
    db_file = tmp_path / "resync.db"
    repository = get_repository(db_path=str(db_file))
    await repository.initialize()
    return repository
|
||||
|
||||
|
||||
async def _seed_host(repo, uuid_: str) -> None:
    """Insert a swarm host row with uuid *uuid_* and fixed placeholder fields."""
    host_row = {
        "uuid": uuid_,
        "name": f"host-{uuid_}",
        "address": "10.9.9.9",
        "agent_port": 8765,
        "status": "active",
        "client_cert_fingerprint": "a" * 64,
        "cert_bundle_path": "/tmp/ignored",
    }
    await repo.add_swarm_host(host_row)
|
||||
|
||||
|
||||
class _FakeAgentClient:
    """In-memory stand-in for the agent client.

    Every constructed object is appended to the class-level ``instances``
    list, and each ``apply_topology`` call is recorded on the instance in
    ``calls``.
    """

    # Shared across all instances; the ``fake_agent`` fixture empties it.
    instances: list["_FakeAgentClient"] = []

    def __init__(self, *, host: dict[str, Any]) -> None:
        """Store *host*, start an empty call log, and register the instance."""
        _FakeAgentClient.instances.append(self)
        self.calls: list[tuple[str, tuple]] = []
        self.host = host

    async def __aenter__(self) -> "_FakeAgentClient":
        """Enter the async context; yields the client itself."""
        return self

    async def __aexit__(self, *exc_info) -> None:
        """Leave the async context without suppressing any exception."""
        return None

    async def apply_topology(self, hydrated, version_hash):
        """Record the call and report it as applied with the same hash."""
        record = ("apply", (hydrated, version_hash))
        self.calls.append(record)
        return dict(status="applied", version_hash=version_hash)
|
||||
|
||||
|
||||
@pytest.fixture
def fake_agent(monkeypatch: pytest.MonkeyPatch):
    """Swap ``decnet.swarm.client.AgentClient`` for ``_FakeAgentClient``.

    The recorded-instances list is emptied first. The fake class itself is
    returned so tests can inspect ``instances``.
    """
    _FakeAgentClient.instances.clear()

    import decnet.swarm.client as _swarm_client

    fake_cls = _FakeAgentClient
    monkeypatch.setattr(_swarm_client, "AgentClient", fake_cls)
    return fake_cls
|
||||
|
||||
|
||||
async def _active_topology(repo, host_uuid: str) -> tuple[str, str]:
    """Persist a generated topology pinned to *host_uuid* and make it ACTIVE.

    The topology is moved through DEPLOYING and then ACTIVE.

    Returns:
        ``(topology_id, hash)`` where ``hash`` is ``canonical_hash`` of the
        hydrated topology.
    """
    topology_id = await persist(repo, generate(_cfg()), target_host_uuid=host_uuid)
    for status in (TopologyStatus.DEPLOYING, TopologyStatus.ACTIVE):
        await transition_status(repo, topology_id, status)
    return topology_id, canonical_hash(await hydrate(repo, topology_id))
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_resync_agent_topology_pushes_current_hash(repo, fake_agent) -> None:
    """``resync_agent_topology`` pushes the current blob and hash, status untouched."""
    await _seed_host(repo, "h-sync")
    topology_id, master_hash = await _active_topology(repo, "h-sync")

    await _deployer.resync_agent_topology(repo, topology_id)

    # Exactly one client was constructed, and its first call is the push.
    assert len(fake_agent.instances) == 1
    first_call = fake_agent.instances[0].calls[0]
    assert first_call[0] == "apply"
    _, (pushed_blob, pushed_hash) = first_call
    assert pushed_hash == master_hash
    assert pushed_blob["topology"]["id"] == topology_id

    stored = await repo.get_topology(topology_id)
    assert stored["status"] == TopologyStatus.ACTIVE  # unchanged
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_resync_rejects_master_local_topology(repo) -> None:
    """A topology persisted without a target host makes the resync raise ``ValueError``."""
    unpinned_plan = generate(_cfg(mode="unihost"))
    topology_id = await persist(repo, unpinned_plan, target_host_uuid=None)
    for status in (TopologyStatus.DEPLOYING, TopologyStatus.ACTIVE):
        await transition_status(repo, topology_id, status)

    with pytest.raises(ValueError, match="no target_host_uuid"):
        await _deployer.resync_agent_topology(repo, topology_id)
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_reconcile_agent_resyncs_drains_flag(repo, fake_agent) -> None:
    """One flagged topology: reconcile reports 1, clears the flag, builds one client."""
    await _seed_host(repo, "h-drain")
    topology_id, _ = await _active_topology(repo, "h-drain")
    await repo.set_topology_resync(topology_id, True)

    drained_count = await _mut_engine.reconcile_agent_resyncs(repo)
    assert drained_count == 1

    stored = await repo.get_topology(topology_id)
    assert stored["needs_resync"] is False
    assert len(fake_agent.instances) == 1
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_reconcile_retains_flag_on_push_failure(repo, monkeypatch) -> None:
    """When the push raises, reconcile reports 0 and the flag stays set."""
    await _seed_host(repo, "h-boom")
    topology_id, _ = await _active_topology(repo, "h-boom")
    await repo.set_topology_resync(topology_id, True)

    class _Boom:
        """Agent-client stand-in whose ``apply_topology`` always raises."""

        def __init__(self, *, host):
            pass

        async def __aenter__(self):
            return self

        async def __aexit__(self, *_):
            return None

        async def apply_topology(self, *_a, **_k):
            raise RuntimeError("agent unreachable")

    import decnet.swarm.client as _swarm_client

    monkeypatch.setattr(_swarm_client, "AgentClient", _Boom)

    drained_count = await _mut_engine.reconcile_agent_resyncs(repo)
    assert drained_count == 0

    stored = await repo.get_topology(topology_id)
    assert stored["needs_resync"] is True  # still flagged — next tick retries
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_reconcile_noop_when_nothing_flagged(repo, fake_agent) -> None:
    """With no topology flagged, reconcile reports 0 and builds no client."""
    await _seed_host(repo, "h-idle")
    await _active_topology(repo, "h-idle")

    drained_count = await _mut_engine.reconcile_agent_resyncs(repo)

    assert drained_count == 0
    assert fake_agent.instances == []
|
||||
Reference in New Issue
Block a user