feat(fleet): systemd unit + bus signal for fleet reconciler
Two pieces, one PR because they share a deployment surface: 1. systemd. decnet-reconciler.service.j2 mirrors the orchestrator unit shape (docker group, hardened sandbox, append-logs). Read-only /var/lib/decnet so it can read decnet-state.json without write access. Auto-discovered by `decnet init` via the existing decnet-*.service.j2 glob — no init.py change needed. Added to decnet.target so `systemctl start decnet.target` brings it up alongside collector / sniffer / mutator / etc. Also added to the agent reaper script so self-destruct cleans it up on workers. 2. Bus signal. reconcile_once now publishes `decky.<host_uuid:name>.state` on every insert / delete / state-changed transition. Reuses the existing DECKY_STATE topic family (no bus/topics.py change → no wiki update needed per the bus-signals doc rule). Composite host_uuid:name segment keeps fleet rows distinguishable from MazeNET TopologyDecky rows whose ids are bare UUIDs. Quiet ticks publish nothing — convergence means silence. Bus is plumbed through the worker, defaults to None for unit-test callers. publish_safely keeps the source-of-truth contract: DB write is authoritative, the publish is best-effort notification. Captures previous_state into a local before update_fleet_decky_state runs — a fake repo that mutates rows in-place would otherwise see the post-update state and report previous == current. Real repos don't have this concern but the fix is cheap and makes the function less order-dependent.
This commit is contained in:
@@ -255,6 +255,103 @@ class TestReconcileOnce:
|
||||
# host-b's row survives
|
||||
assert any(r["host_uuid"] == "host-b" for r in repo.rows)
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_publishes_decky_state_on_transitions(self):
|
||||
"""When *bus* is provided, every insert/delete/state-change must
|
||||
publish on ``decky.<host_uuid:name>.state``."""
|
||||
from decnet.bus.fake import FakeBus
|
||||
bus = FakeBus()
|
||||
await bus.connect()
|
||||
|
||||
published: list = []
|
||||
|
||||
async def collect():
|
||||
async with bus.subscribe("decky.>") as sub:
|
||||
async for ev in sub:
|
||||
published.append(ev)
|
||||
if len(published) >= 3:
|
||||
return
|
||||
|
||||
try:
|
||||
collector = asyncio.create_task(collect())
|
||||
await asyncio.sleep(0) # let subscription register
|
||||
|
||||
repo = FakeRepo([
|
||||
# An existing row that will be deleted (not in JSON).
|
||||
{"host_uuid": "local", "name": "ghost", "services": ["ssh"],
|
||||
"state": "running", "decky_ip": "10.0.0.99"},
|
||||
# An existing row whose state will flip running → failed.
|
||||
{"host_uuid": "local", "name": "d-flip", "services": ["ssh"],
|
||||
"state": "running", "decky_ip": "10.0.0.20"},
|
||||
])
|
||||
json_deckies = [
|
||||
_decky(name="d-new", ip="10.0.0.30", services=["http"]),
|
||||
_decky(name="d-flip", ip="10.0.0.20", services=["ssh"]),
|
||||
]
|
||||
await reconcile_once(
|
||||
repo,
|
||||
load_state_fn=_state_loader(json_deckies),
|
||||
docker_client_factory=_docker_factory({
|
||||
"d-new-http": "running",
|
||||
"d-flip-ssh": "exited",
|
||||
}),
|
||||
bus=bus,
|
||||
)
|
||||
await asyncio.wait_for(collector, timeout=2.0)
|
||||
finally:
|
||||
await bus.close()
|
||||
|
||||
topics = sorted(ev.topic for ev in published)
|
||||
assert topics == [
|
||||
"decky.local:d-flip.state",
|
||||
"decky.local:d-new.state",
|
||||
"decky.local:ghost.state",
|
||||
]
|
||||
by_name = {ev.payload["name"]: ev.payload for ev in published}
|
||||
assert by_name["d-new"]["transition"] == "inserted"
|
||||
assert by_name["d-new"]["state"] == "running"
|
||||
assert by_name["ghost"]["transition"] == "deleted"
|
||||
assert by_name["ghost"]["state"] == "torn_down"
|
||||
assert by_name["d-flip"]["transition"] == "state_changed"
|
||||
assert by_name["d-flip"]["state"] == "failed"
|
||||
assert by_name["d-flip"]["previous_state"] == "running"
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_no_bus_publish_when_already_converged(self):
|
||||
"""Quiet ticks must not publish — otherwise every 30s the bus
|
||||
floods with no-op events."""
|
||||
from decnet.bus.fake import FakeBus
|
||||
bus = FakeBus()
|
||||
await bus.connect()
|
||||
try:
|
||||
published: list = []
|
||||
|
||||
async def collect():
|
||||
async with bus.subscribe("decky.>") as sub:
|
||||
async for ev in sub:
|
||||
published.append(ev)
|
||||
|
||||
collector = asyncio.create_task(collect())
|
||||
await asyncio.sleep(0)
|
||||
|
||||
repo = FakeRepo([
|
||||
{"host_uuid": "local", "name": "d1", "services": ["ssh"],
|
||||
"state": "running", "decky_ip": "10.0.0.10"},
|
||||
])
|
||||
d = _decky(name="d1", services=["ssh"])
|
||||
await reconcile_once(
|
||||
repo,
|
||||
load_state_fn=_state_loader([d]),
|
||||
docker_client_factory=_docker_factory({"d1-ssh": "running"}),
|
||||
bus=bus,
|
||||
)
|
||||
await asyncio.sleep(0.1) # give the bus a window to deliver
|
||||
collector.cancel()
|
||||
finally:
|
||||
await bus.close()
|
||||
|
||||
assert published == []
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_combined_drift_in_one_pass(self):
|
||||
"""JSON has new decky AND DB has stale decky AND third decky's
|
||||
|
||||
Reference in New Issue
Block a user