feat(agent): periodic heartbeat loop posting status to swarmctl

New decnet.agent.heartbeat asyncio loop wired into the agent FastAPI
lifespan. Every 30 s the worker POSTs executor.status() to the master's
/swarm/heartbeat with its DECNET_HOST_UUID for self-identity; the
existing agent mTLS bundle provides the client cert the master pins
against SwarmHost.client_cert_fingerprint.

start() is a silent no-op when identity env (HOST_UUID, MASTER_HOST) is
unset or the worker bundle is missing, so dev runs and un-enrolled hosts
don't crash the agent app. On non-204 responses the loop logs loudly but
keeps ticking — an operator may re-enrol mid-session, and fail-closed
pinning shouldn't be self-silencing.
This commit is contained in:
2026-04-19 21:49:34 -04:00
parent 62f7c88b90
commit bf01804736
3 changed files with 271 additions and 0 deletions

View File

@@ -0,0 +1,122 @@
"""Tests for the worker-side heartbeat loop (decnet.agent.heartbeat)."""
from __future__ import annotations
import asyncio
from typing import Any
import httpx
import pytest
from decnet.agent import heartbeat as hb
@pytest.fixture(autouse=True)
def _reset_module_task(monkeypatch: pytest.MonkeyPatch):
# Each test gets a fresh _task slot so start()/stop() state doesn't
# leak between cases.
monkeypatch.setattr(hb, "_task", None)
yield
monkeypatch.setattr(hb, "_task", None)
class _StubTransport(httpx.AsyncBaseTransport):
"""Record each POST and respond according to ``responder(req)``."""
def __init__(self, responder):
self.calls: list[dict[str, Any]] = []
self._responder = responder
async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
body = request.read()
self.calls.append({"url": str(request.url), "body": body})
return self._responder(request)
@pytest.mark.asyncio
async def test_tick_posts_status_snapshot_and_accepts_204(monkeypatch) -> None:
async def fake_status() -> dict:
return {"deployed": False, "deckies": []}
monkeypatch.setattr(hb._exec, "status", fake_status)
transport = _StubTransport(lambda req: httpx.Response(204))
async with httpx.AsyncClient(transport=transport) as client:
await hb._tick(client, "https://m/swarm/heartbeat", "uuid-a", "1.2.3")
assert len(transport.calls) == 1
import json
payload = json.loads(transport.calls[0]["body"])
assert payload["host_uuid"] == "uuid-a"
assert payload["agent_version"] == "1.2.3"
assert payload["status"]["deployed"] is False
@pytest.mark.asyncio
async def test_tick_logs_on_non_204_response(monkeypatch, caplog) -> None:
async def fake_status() -> dict:
return {"deployed": False}
monkeypatch.setattr(hb._exec, "status", fake_status)
transport = _StubTransport(lambda req: httpx.Response(403, text="mismatch"))
async with httpx.AsyncClient(transport=transport) as client:
with caplog.at_level("WARNING", logger="agent.heartbeat"):
await hb._tick(client, "https://m/swarm/heartbeat", "uuid-a", "1.2.3")
assert any("rejected" in rec.getMessage() for rec in caplog.records)
def test_start_is_noop_when_identity_missing(monkeypatch) -> None:
# Neither DECNET_HOST_UUID nor DECNET_MASTER_HOST set → start() must
# return None, never raise. Dev runs exercise this path every time.
import decnet.env as env
monkeypatch.setattr(env, "DECNET_HOST_UUID", None)
monkeypatch.setattr(env, "DECNET_MASTER_HOST", None)
assert hb.start() is None
assert hb._task is None
@pytest.mark.asyncio
async def test_start_is_noop_when_ssl_context_unavailable(
monkeypatch, tmp_path
) -> None:
# Identity plumbed, but worker bundle missing on disk → start() logs
# and bails instead of crashing the FastAPI app.
import decnet.env as env
monkeypatch.setattr(env, "DECNET_HOST_UUID", "uuid-a")
monkeypatch.setattr(env, "DECNET_MASTER_HOST", "master.lan")
monkeypatch.setattr(env, "DECNET_SWARMCTL_PORT", 8770)
monkeypatch.setenv("DECNET_AGENT_DIR", str(tmp_path / "empty"))
assert hb.start() is None
@pytest.mark.asyncio
async def test_loop_keeps_ticking_after_5xx_failures(monkeypatch) -> None:
# Simulates a flapping master: first two ticks raise/5xx, third succeeds.
# The loop must not crash — it must sleep and retry.
call_count = {"n": 0}
def _responder(req):
call_count["n"] += 1
if call_count["n"] < 3:
return httpx.Response(503, text="unavailable")
return httpx.Response(204)
async def fake_status() -> dict:
return {"deployed": False}
monkeypatch.setattr(hb._exec, "status", fake_status)
monkeypatch.setattr(hb, "INTERVAL_S", 0.01) # fast-forward the sleep
transport = _StubTransport(_responder)
async def _run():
async with httpx.AsyncClient(transport=transport) as client:
while call_count["n"] < 3:
try:
await hb._tick(client, "https://m/swarm/heartbeat", "uuid-a", "1.2.3")
except Exception:
pass
await asyncio.sleep(0.01)
await asyncio.wait_for(_run(), timeout=2.0)
assert call_count["n"] >= 3