merge testing->tomerge/main #7
@@ -18,23 +18,38 @@ Endpoints mirror the existing unihost CLI verbs:
|
|||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from fastapi import FastAPI, HTTPException
|
from fastapi import FastAPI, HTTPException
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
from decnet.agent import executor as _exec
|
from decnet.agent import executor as _exec
|
||||||
|
from decnet.agent import heartbeat as _heartbeat
|
||||||
from decnet.config import DecnetConfig
|
from decnet.config import DecnetConfig
|
||||||
from decnet.logging import get_logger
|
from decnet.logging import get_logger
|
||||||
|
|
||||||
log = get_logger("agent.app")
|
log = get_logger("agent.app")
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
|
||||||
|
async def _lifespan(app: FastAPI):
|
||||||
|
# Best-effort: if identity/bundle plumbing isn't configured (e.g. dev
|
||||||
|
# runs or non-enrolled hosts), heartbeat.start() is a silent no-op.
|
||||||
|
_heartbeat.start()
|
||||||
|
try:
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
await _heartbeat.stop()
|
||||||
|
|
||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="DECNET SWARM Agent",
|
title="DECNET SWARM Agent",
|
||||||
version="0.1.0",
|
version="0.1.0",
|
||||||
docs_url=None, # no interactive docs on worker — narrow attack surface
|
docs_url=None, # no interactive docs on worker — narrow attack surface
|
||||||
redoc_url=None,
|
redoc_url=None,
|
||||||
openapi_url=None,
|
openapi_url=None,
|
||||||
|
lifespan=_lifespan,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
134
decnet/agent/heartbeat.py
Normal file
134
decnet/agent/heartbeat.py
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
"""Agent → master liveness heartbeat loop.
|
||||||
|
|
||||||
|
Every ``INTERVAL_S`` seconds the worker posts ``executor.status()`` to
|
||||||
|
``POST <master>/swarm/heartbeat`` over mTLS. The master pins the
|
||||||
|
presented client cert's SHA-256 against the ``SwarmHost`` row for the
|
||||||
|
claimed ``host_uuid``; a match refreshes ``last_heartbeat`` + each
|
||||||
|
``DeckyShard``'s snapshot + runtime state.
|
||||||
|
|
||||||
|
Identity comes from ``/etc/decnet/decnet.ini`` (seeded by the enroll
|
||||||
|
bundle) — specifically ``DECNET_HOST_UUID`` and ``DECNET_MASTER_HOST``.
|
||||||
|
The worker's existing ``~/.decnet/agent/`` bundle (or
|
||||||
|
``/etc/decnet/agent/``) provides the mTLS client cert.
|
||||||
|
|
||||||
|
Started/stopped via the agent FastAPI app's lifespan. If identity
|
||||||
|
plumbing is missing (pre-enrollment dev runs) the loop logs at DEBUG and
|
||||||
|
declines to start — callers don't have to guard it.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import pathlib
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from decnet.agent import executor as _exec
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.swarm import pki
|
||||||
|
from decnet.swarm.log_forwarder import build_worker_ssl_context
|
||||||
|
|
||||||
|
log = get_logger("agent.heartbeat")
|
||||||
|
|
||||||
|
INTERVAL_S = 30.0
|
||||||
|
_TIMEOUT = httpx.Timeout(connect=5.0, read=10.0, write=5.0, pool=5.0)
|
||||||
|
|
||||||
|
_task: Optional[asyncio.Task] = None
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_agent_dir() -> pathlib.Path:
|
||||||
|
"""Match the agent-dir resolution order used by the agent server:
|
||||||
|
DECNET_AGENT_DIR env, else /etc/decnet/agent (production install),
|
||||||
|
else ~/.decnet/agent (dev)."""
|
||||||
|
import os
|
||||||
|
env = os.environ.get("DECNET_AGENT_DIR")
|
||||||
|
if env:
|
||||||
|
return pathlib.Path(env)
|
||||||
|
system = pathlib.Path("/etc/decnet/agent")
|
||||||
|
if system.exists():
|
||||||
|
return system
|
||||||
|
return pki.DEFAULT_AGENT_DIR
|
||||||
|
|
||||||
|
|
||||||
|
async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_version: str) -> None:
|
||||||
|
snap = await _exec.status()
|
||||||
|
resp = await client.post(
|
||||||
|
url,
|
||||||
|
json={
|
||||||
|
"host_uuid": host_uuid,
|
||||||
|
"agent_version": agent_version,
|
||||||
|
"status": snap,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
# 403 / 404 are terminal-ish — we still keep looping because an
|
||||||
|
# operator may re-enrol the host mid-session, but we log loudly so
|
||||||
|
# prod ops can spot cert-pinning drift.
|
||||||
|
if resp.status_code == 204:
|
||||||
|
return
|
||||||
|
log.warning(
|
||||||
|
"heartbeat rejected status=%d body=%s",
|
||||||
|
resp.status_code, resp.text[:200],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _loop(url: str, host_uuid: str, agent_version: str, ssl_ctx) -> None:
|
||||||
|
log.info("heartbeat loop starting url=%s host_uuid=%s interval=%ss",
|
||||||
|
url, host_uuid, INTERVAL_S)
|
||||||
|
async with httpx.AsyncClient(verify=ssl_ctx, timeout=_TIMEOUT) as client:
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
await _tick(client, url, host_uuid, agent_version)
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
|
except Exception:
|
||||||
|
log.exception("heartbeat tick failed — will retry in %ss", INTERVAL_S)
|
||||||
|
await asyncio.sleep(INTERVAL_S)
|
||||||
|
|
||||||
|
|
||||||
|
def start() -> Optional[asyncio.Task]:
|
||||||
|
"""Kick off the background heartbeat task. No-op if identity is
|
||||||
|
unconfigured (dev mode) — the caller doesn't need to check."""
|
||||||
|
global _task
|
||||||
|
from decnet.env import (
|
||||||
|
DECNET_HOST_UUID,
|
||||||
|
DECNET_MASTER_HOST,
|
||||||
|
DECNET_SWARMCTL_PORT,
|
||||||
|
)
|
||||||
|
|
||||||
|
if _task is not None and not _task.done():
|
||||||
|
return _task
|
||||||
|
if not DECNET_HOST_UUID or not DECNET_MASTER_HOST:
|
||||||
|
log.debug("heartbeat not starting — DECNET_HOST_UUID or DECNET_MASTER_HOST unset")
|
||||||
|
return None
|
||||||
|
|
||||||
|
agent_dir = _resolve_agent_dir()
|
||||||
|
try:
|
||||||
|
ssl_ctx = build_worker_ssl_context(agent_dir)
|
||||||
|
except Exception:
|
||||||
|
log.exception("heartbeat not starting — worker SSL context unavailable at %s", agent_dir)
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
from decnet import __version__ as _v
|
||||||
|
agent_version = _v
|
||||||
|
except Exception:
|
||||||
|
agent_version = "unknown"
|
||||||
|
|
||||||
|
url = f"https://{DECNET_MASTER_HOST}:{DECNET_SWARMCTL_PORT}/swarm/heartbeat"
|
||||||
|
_task = asyncio.create_task(
|
||||||
|
_loop(url, DECNET_HOST_UUID, agent_version, ssl_ctx),
|
||||||
|
name="agent-heartbeat",
|
||||||
|
)
|
||||||
|
return _task
|
||||||
|
|
||||||
|
|
||||||
|
async def stop() -> None:
|
||||||
|
global _task
|
||||||
|
if _task is None:
|
||||||
|
return
|
||||||
|
_task.cancel()
|
||||||
|
try:
|
||||||
|
await _task
|
||||||
|
except (asyncio.CancelledError, Exception):
|
||||||
|
pass
|
||||||
|
_task = None
|
||||||
122
tests/swarm/test_agent_heartbeat.py
Normal file
122
tests/swarm/test_agent_heartbeat.py
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
"""Tests for the worker-side heartbeat loop (decnet.agent.heartbeat)."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from decnet.agent import heartbeat as hb
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def _reset_module_task(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
# Each test gets a fresh _task slot so start()/stop() state doesn't
|
||||||
|
# leak between cases.
|
||||||
|
monkeypatch.setattr(hb, "_task", None)
|
||||||
|
yield
|
||||||
|
monkeypatch.setattr(hb, "_task", None)
|
||||||
|
|
||||||
|
|
||||||
|
class _StubTransport(httpx.AsyncBaseTransport):
|
||||||
|
"""Record each POST and respond according to ``responder(req)``."""
|
||||||
|
def __init__(self, responder):
|
||||||
|
self.calls: list[dict[str, Any]] = []
|
||||||
|
self._responder = responder
|
||||||
|
|
||||||
|
async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
|
||||||
|
body = request.read()
|
||||||
|
self.calls.append({"url": str(request.url), "body": body})
|
||||||
|
return self._responder(request)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_tick_posts_status_snapshot_and_accepts_204(monkeypatch) -> None:
|
||||||
|
async def fake_status() -> dict:
|
||||||
|
return {"deployed": False, "deckies": []}
|
||||||
|
|
||||||
|
monkeypatch.setattr(hb._exec, "status", fake_status)
|
||||||
|
|
||||||
|
transport = _StubTransport(lambda req: httpx.Response(204))
|
||||||
|
async with httpx.AsyncClient(transport=transport) as client:
|
||||||
|
await hb._tick(client, "https://m/swarm/heartbeat", "uuid-a", "1.2.3")
|
||||||
|
|
||||||
|
assert len(transport.calls) == 1
|
||||||
|
import json
|
||||||
|
payload = json.loads(transport.calls[0]["body"])
|
||||||
|
assert payload["host_uuid"] == "uuid-a"
|
||||||
|
assert payload["agent_version"] == "1.2.3"
|
||||||
|
assert payload["status"]["deployed"] is False
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_tick_logs_on_non_204_response(monkeypatch, caplog) -> None:
|
||||||
|
async def fake_status() -> dict:
|
||||||
|
return {"deployed": False}
|
||||||
|
|
||||||
|
monkeypatch.setattr(hb._exec, "status", fake_status)
|
||||||
|
transport = _StubTransport(lambda req: httpx.Response(403, text="mismatch"))
|
||||||
|
|
||||||
|
async with httpx.AsyncClient(transport=transport) as client:
|
||||||
|
with caplog.at_level("WARNING", logger="agent.heartbeat"):
|
||||||
|
await hb._tick(client, "https://m/swarm/heartbeat", "uuid-a", "1.2.3")
|
||||||
|
|
||||||
|
assert any("rejected" in rec.getMessage() for rec in caplog.records)
|
||||||
|
|
||||||
|
|
||||||
|
def test_start_is_noop_when_identity_missing(monkeypatch) -> None:
|
||||||
|
# Neither DECNET_HOST_UUID nor DECNET_MASTER_HOST set → start() must
|
||||||
|
# return None, never raise. Dev runs exercise this path every time.
|
||||||
|
import decnet.env as env
|
||||||
|
monkeypatch.setattr(env, "DECNET_HOST_UUID", None)
|
||||||
|
monkeypatch.setattr(env, "DECNET_MASTER_HOST", None)
|
||||||
|
assert hb.start() is None
|
||||||
|
assert hb._task is None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_start_is_noop_when_ssl_context_unavailable(
|
||||||
|
monkeypatch, tmp_path
|
||||||
|
) -> None:
|
||||||
|
# Identity plumbed, but worker bundle missing on disk → start() logs
|
||||||
|
# and bails instead of crashing the FastAPI app.
|
||||||
|
import decnet.env as env
|
||||||
|
monkeypatch.setattr(env, "DECNET_HOST_UUID", "uuid-a")
|
||||||
|
monkeypatch.setattr(env, "DECNET_MASTER_HOST", "master.lan")
|
||||||
|
monkeypatch.setattr(env, "DECNET_SWARMCTL_PORT", 8770)
|
||||||
|
monkeypatch.setenv("DECNET_AGENT_DIR", str(tmp_path / "empty"))
|
||||||
|
assert hb.start() is None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_loop_keeps_ticking_after_5xx_failures(monkeypatch) -> None:
|
||||||
|
# Simulates a flapping master: first two ticks raise/5xx, third succeeds.
|
||||||
|
# The loop must not crash — it must sleep and retry.
|
||||||
|
call_count = {"n": 0}
|
||||||
|
|
||||||
|
def _responder(req):
|
||||||
|
call_count["n"] += 1
|
||||||
|
if call_count["n"] < 3:
|
||||||
|
return httpx.Response(503, text="unavailable")
|
||||||
|
return httpx.Response(204)
|
||||||
|
|
||||||
|
async def fake_status() -> dict:
|
||||||
|
return {"deployed": False}
|
||||||
|
|
||||||
|
monkeypatch.setattr(hb._exec, "status", fake_status)
|
||||||
|
monkeypatch.setattr(hb, "INTERVAL_S", 0.01) # fast-forward the sleep
|
||||||
|
|
||||||
|
transport = _StubTransport(_responder)
|
||||||
|
|
||||||
|
async def _run():
|
||||||
|
async with httpx.AsyncClient(transport=transport) as client:
|
||||||
|
while call_count["n"] < 3:
|
||||||
|
try:
|
||||||
|
await hb._tick(client, "https://m/swarm/heartbeat", "uuid-a", "1.2.3")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
await asyncio.sleep(0.01)
|
||||||
|
|
||||||
|
await asyncio.wait_for(_run(), timeout=2.0)
|
||||||
|
assert call_count["n"] >= 3
|
||||||
Reference in New Issue
Block a user