Replaces LICENSE (GPLv3 -> AGPLv3) and prepends `SPDX-License-Identifier: AGPL-3.0-or-later` to every source file across decnet/, decnet_web/, tests/, scripts/, and tools/. Rationale: closes the GPLv3 ASP loophole so any party operating a modified DECNET as a network service must offer their modified source. Personal copyright (Samuel Paschuan) + inbound=outbound contributions make a future unilateral relicense infeasible. - LICENSE: full AGPL-3.0 text (gnu.org/licenses/agpl-3.0.txt) - COPYRIGHT: project copyright notice - tools/add_spdx_headers.py: idempotent header injector (shebang- and PEP 263-aware) Touches 1565 source files (.py, .ts, .tsx, .js, .jsx, .css, .sh). No behavior change; comments only.
211 lines
7.1 KiB
Python
211 lines
7.1 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
"""Agent → master liveness heartbeat loop.
|
|
|
|
Every ``INTERVAL_S`` seconds the worker posts ``executor.status()`` to
|
|
``POST <master>/swarm/heartbeat`` over mTLS. The master pins the
|
|
presented client cert's SHA-256 against the ``SwarmHost`` row for the
|
|
claimed ``host_uuid``; a match refreshes ``last_heartbeat`` + each
|
|
``DeckyShard``'s snapshot + runtime state.
|
|
|
|
Identity comes from ``/etc/decnet/decnet.ini`` (seeded by the enroll
|
|
bundle) — specifically ``DECNET_HOST_UUID`` and ``DECNET_MASTER_HOST``.
|
|
The worker's existing ``~/.decnet/agent/`` bundle (or
|
|
``/etc/decnet/agent/``) provides the mTLS client cert.
|
|
|
|
Started/stopped via the agent FastAPI app's lifespan. If identity
|
|
plumbing is missing (pre-enrollment dev runs) the loop logs at DEBUG and
|
|
declines to start — callers don't have to guard it.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import pathlib
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
|
|
from decnet.agent import executor as _exec
|
|
from decnet.logging import get_logger
|
|
from decnet.swarm import pki
|
|
from decnet.swarm.log_forwarder import build_worker_ssl_context
|
|
|
|
# Module-level logger shared by every function in the heartbeat subsystem.
log = get_logger("agent.heartbeat")

# Seconds slept between two scheduled heartbeat ticks.
INTERVAL_S = 30.0
# HTTP timeouts applied to every heartbeat request (scheduled and one-off).
_TIMEOUT = httpx.Timeout(connect=5.0, read=10.0, write=5.0, pool=5.0)

# Handle to the background heartbeat task: assigned by start(), reset to
# None by stop(). None means no task has been started.
_task: Optional[asyncio.Task] = None
|
|
|
|
|
|
def _resolve_agent_dir() -> pathlib.Path:
    """Locate the agent bundle directory.

    Mirrors the resolution order used by the agent server:

    1. ``DECNET_AGENT_DIR`` from the environment, when set and non-empty;
    2. ``/etc/decnet/agent`` when that path exists (production install);
    3. ``pki.DEFAULT_AGENT_DIR`` otherwise (dev).
    """
    from os import environ

    override = environ.get("DECNET_AGENT_DIR")
    if override:
        return pathlib.Path(override)

    production_dir = pathlib.Path("/etc/decnet/agent")
    return production_dir if production_dir.exists() else pki.DEFAULT_AGENT_DIR
|
|
|
|
|
|
async def _build_body(
    host_uuid: str,
    agent_version: str,
    lifecycle: Optional[list[dict]] = None,
) -> dict:
    """Assemble the JSON payload for one heartbeat POST.

    The payload always carries ``host_uuid``, ``agent_version`` and
    ``status`` (the awaited result of ``executor.status()``). Two optional
    keys may be added:

    * ``topology`` — result of ``topology_ops.state()`` for the store at
      ``<agent dir>/topology.db``; omitted if anything in that step raises.
    * ``lifecycle`` — the *lifecycle* argument, only when it is truthy.
    """
    payload: dict = {
        "host_uuid": host_uuid,
        "agent_version": agent_version,
        "status": await _exec.status(),
    }

    # Topology is strictly best-effort: a failure here must never wedge
    # the heartbeat loop. Without the key the master falls back to "no
    # topology reported", which triggers a resync if it expected one.
    try:
        from decnet.agent import topology_ops as _topo_ops
        from decnet.agent.topology_store import TopologyStore

        topo_store = TopologyStore(_resolve_agent_dir() / "topology.db")
        try:
            payload["topology"] = _topo_ops.state(topo_store)
        finally:
            # Always release the store, even when reading its state failed.
            topo_store.close()
    except Exception:
        log.debug("heartbeat: topology state unavailable", exc_info=True)

    if not lifecycle:
        return payload

    payload["lifecycle"] = lifecycle
    return payload
|
|
|
|
|
|
async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_version: str) -> None:
    """Send a single scheduled heartbeat and log any non-204 answer.

    Args:
        client: open HTTP client used for the POST.
        url: heartbeat endpoint to post to.
        host_uuid: identity placed in the payload.
        agent_version: version string placed in the payload.

    Transport errors are not handled here; they propagate to the caller.
    """
    payload = await _build_body(host_uuid, agent_version)
    response = await client.post(url, json=payload)

    # Anything other than 204 is only logged, never raised. 403 / 404 are
    # terminal-ish, but the loop keeps running because an operator may
    # re-enrol the host mid-session; the loud warning lets prod ops spot
    # cert-pinning drift.
    if response.status_code != 204:
        log.warning(
            "heartbeat rejected status=%d body=%s",
            response.status_code, response.text[:200],
        )
|
|
|
|
|
|
async def _loop(url: str, host_uuid: str, agent_version: str, ssl_ctx) -> None:
    """Run the heartbeat forever: tick, sleep ``INTERVAL_S``, repeat.

    One HTTP client (built with *ssl_ctx* and the module timeouts) is
    shared by every tick. The coroutine only ends when it is cancelled.
    """
    log.info(
        "heartbeat loop starting url=%s host_uuid=%s interval=%ss",
        url, host_uuid, INTERVAL_S,
    )
    http_client = httpx.AsyncClient(verify=ssl_ctx, timeout=_TIMEOUT)
    async with http_client:
        while True:
            try:
                await _tick(http_client, url, host_uuid, agent_version)
            except asyncio.CancelledError:
                # Cancellation is how the loop is shut down — never swallow it.
                raise
            except Exception:
                # Any other failure is logged and retried on the next tick.
                log.exception("heartbeat tick failed — will retry in %ss", INTERVAL_S)
            await asyncio.sleep(INTERVAL_S)
|
|
|
|
|
|
def start() -> Optional[asyncio.Task]:
    """Launch the background heartbeat task.

    Returns:
        The already-running task if there is one, the newly created task
        otherwise, or ``None`` when the heartbeat cannot start: identity is
        unconfigured (dev mode) or the worker SSL context cannot be built.
        Callers do not need to guard the call themselves.
    """
    global _task
    from decnet.env import (
        DECNET_HOST_UUID,
        DECNET_MASTER_HOST,
        DECNET_SWARMCTL_PORT,
    )

    # Reuse a task that is still alive instead of spawning a second loop.
    current = _task
    if current is not None and not current.done():
        return current

    if not (DECNET_HOST_UUID and DECNET_MASTER_HOST):
        log.debug("heartbeat not starting — DECNET_HOST_UUID or DECNET_MASTER_HOST unset")
        return None

    agent_dir = _resolve_agent_dir()
    try:
        ssl_ctx = build_worker_ssl_context(agent_dir)
    except Exception:
        log.exception("heartbeat not starting — worker SSL context unavailable at %s", agent_dir)
        return None

    # The package version is informational only; fall back rather than fail.
    try:
        from decnet import __version__ as agent_version  # type: ignore[attr-defined]
    except Exception:
        agent_version = "unknown"

    endpoint = f"https://{DECNET_MASTER_HOST}:{DECNET_SWARMCTL_PORT}/swarm/heartbeat"
    heartbeat = _loop(endpoint, DECNET_HOST_UUID, agent_version, ssl_ctx)
    _task = asyncio.create_task(heartbeat, name="agent-heartbeat")
    return _task
|
|
|
|
|
|
async def push_lifecycle_delta(deltas: list[dict]) -> None:
    """Send an immediate, one-off heartbeat carrying *deltas*.

    The deltas travel in the payload's ``lifecycle`` field. Each delta:
    ``{decky_name, operation, status, error?, completed_at?}``.

    The agent executor calls this when /deploy and /mutate complete, so
    the master sees the terminal transition right away instead of waiting
    up to ``INTERVAL_S`` for the next scheduled tick. Failures are logged
    and swallowed; the next scheduled heartbeat carries the same deltas
    via DB-side reconciliation, since the worker has no durable per-row
    state to lose.

    Nothing is sent when *deltas* is empty, when identity is unconfigured,
    or when the worker SSL context cannot be built.
    """
    from decnet.env import (
        DECNET_HOST_UUID,
        DECNET_MASTER_HOST,
        DECNET_SWARMCTL_PORT,
    )

    if not deltas:
        return
    if not (DECNET_HOST_UUID and DECNET_MASTER_HOST):
        log.debug("push_lifecycle_delta: identity unconfigured — skipping")
        return

    bundle_dir = _resolve_agent_dir()
    try:
        ssl_ctx = build_worker_ssl_context(bundle_dir)
    except Exception:
        log.exception("push_lifecycle_delta: SSL context unavailable")
        return

    # The package version is informational only; fall back rather than fail.
    try:
        from decnet import __version__ as agent_version  # type: ignore[attr-defined]
    except Exception:
        agent_version = "unknown"

    endpoint = f"https://{DECNET_MASTER_HOST}:{DECNET_SWARMCTL_PORT}/swarm/heartbeat"
    try:
        async with httpx.AsyncClient(verify=ssl_ctx, timeout=_TIMEOUT) as http_client:
            payload = await _build_body(
                DECNET_HOST_UUID, agent_version, lifecycle=deltas,
            )
            response = await http_client.post(endpoint, json=payload)
            if response.status_code in (200, 204):
                return
            log.warning(
                "lifecycle delta push rejected status=%d body=%s",
                response.status_code, response.text[:200],
            )
    except Exception:
        # Best-effort push: never let a failure reach the executor.
        log.exception("push_lifecycle_delta failed — next scheduled tick will retry")
|
|
|
|
|
|
async def stop() -> None:
    """Cancel the background heartbeat task and wait for it to finish.

    Does nothing when no task has been started. This coroutine never
    raises because of the heartbeat task: the expected ``CancelledError``
    is absorbed, and any other exception the task ended with is logged at
    DEBUG instead of being silently discarded. The module-level task
    handle is reset to ``None`` in all cases.
    """
    global _task
    task = _task
    if task is None:
        return

    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        # Normal outcome of the cancel() request above.
        pass
    except Exception:
        # The task had already died with a real error; shutdown must still
        # proceed, but leave a trace so the failure is not invisible.
        log.debug("heartbeat task ended with an error during stop", exc_info=True)
    finally:
        _task = None
|