merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
7
decnet/agent/__init__.py
Normal file
7
decnet/agent/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
"""DECNET worker agent — runs on every SWARM worker host.
|
||||
|
||||
Exposes an mTLS-protected FastAPI service the master's SWARM controller
|
||||
calls to deploy, mutate, and tear down deckies locally. The agent reuses
|
||||
the existing `decnet.engine.deployer` code path unchanged, so a worker runs
|
||||
deckies the same way `decnet deploy --mode unihost` does today.
|
||||
"""
|
||||
320
decnet/agent/app.py
Normal file
320
decnet/agent/app.py
Normal file
@@ -0,0 +1,320 @@
|
||||
"""Worker-side FastAPI app.
|
||||
|
||||
Protected by mTLS at the ASGI/uvicorn transport layer: uvicorn is started
|
||||
with ``--ssl-ca-certs`` + ``--ssl-cert-reqs 2`` (CERT_REQUIRED), so any
|
||||
client that cannot prove a cert signed by the DECNET CA is rejected before
|
||||
reaching a handler. Once past the TLS handshake, all peers are trusted
|
||||
equally (the only entity holding a CA-signed cert is the master
|
||||
controller).
|
||||
|
||||
Endpoints mirror the existing unihost CLI verbs:
|
||||
|
||||
* ``POST /deploy`` — body: serialized ``DecnetConfig``
|
||||
* ``POST /teardown`` — body: optional ``{"decky_id": "..."}``
|
||||
* ``POST /mutate`` — body: ``{"decky_id": "...", "services": [...]}``
|
||||
* ``GET /status`` — deployment snapshot
|
||||
* ``GET /health`` — liveness probe. mTLS is still required: the master
  pings it with its own client cert.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import pathlib
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
import contextlib
|
||||
|
||||
from decnet.agent import executor as _exec
|
||||
from decnet.agent import heartbeat as _heartbeat
|
||||
from decnet.agent import topology_ops as _topology_ops
|
||||
from decnet.bus.factory import get_bus
|
||||
from decnet.bus.publish import run_health_heartbeat
|
||||
from decnet.swarm.pki import DEFAULT_AGENT_DIR
|
||||
from decnet.agent.topology_store import AlreadyApplied, TopologyStore
|
||||
from decnet.config import DecnetConfig
|
||||
from decnet.logging import get_logger
|
||||
from decnet.topology.validate import ValidationError
|
||||
|
||||
log = get_logger("agent.app")
|
||||
|
||||
|
||||
def _resolve_agent_dir() -> pathlib.Path:
|
||||
env = os.environ.get("DECNET_AGENT_DIR")
|
||||
if env:
|
||||
return pathlib.Path(env)
|
||||
system = pathlib.Path("/etc/decnet/agent")
|
||||
if system.exists():
|
||||
return system
|
||||
return DEFAULT_AGENT_DIR
|
||||
|
||||
|
||||
# Module-level singleton. Created lazily on first use so tests can
|
||||
# monkeypatch DECNET_AGENT_DIR before the store binds to a path.
|
||||
_topology_store: Optional[TopologyStore] = None
|
||||
|
||||
|
||||
def _store() -> TopologyStore:
    """Return the module-wide ``TopologyStore``, building it on first call.

    The store is bound to ``topology.db`` inside the directory chosen by
    ``_resolve_agent_dir()``. Because construction is deferred to the first
    call, the directory is resolved at call time rather than import time.
    """
    global _topology_store
    if _topology_store is not None:
        return _topology_store
    db_path = _resolve_agent_dir() / "topology.db"
    _topology_store = TopologyStore(db_path)
    return _topology_store
|
||||
|
||||
|
||||
_collector_task: Optional[asyncio.Task] = None
|
||||
|
||||
|
||||
def _ensure_collector_started() -> None:
    """Start the log collector task if one is not already running.

    Invoked by the ``/topology/apply`` handler after a successful apply.
    It is deliberately NOT started from the lifespan hook: the agent's boot
    invariant is "never touch docker until master tells us to" (see
    tests/swarm/test_agent_no_auto_restore.py).

    The collector watches ``decnet.topology.service=true`` labels via docker
    events and writes RFC 5424 lines to ``DECNET_AGENT_LOG_FILE``, which the
    forwarder ships to the master over syslog-TLS.

    Idempotent: while a previously spawned task is still running, further
    calls return without doing anything. If the collector worker cannot be
    imported, a warning is logged and no task is created.
    """
    global _collector_task
    still_running = _collector_task is not None and not _collector_task.done()
    if still_running:
        return

    from decnet.env import DECNET_AGENT_LOG_FILE

    try:
        from decnet.collector.worker import log_collector_worker
    except Exception:  # noqa: BLE001 — docker may be unavailable on dev
        log.warning(
            "agent log collector not starting — collector worker import failed",
            exc_info=True,
        )
        return

    collector = log_collector_worker(DECNET_AGENT_LOG_FILE)
    _collector_task = asyncio.create_task(collector, name="agent-log-collector")
    log.info("agent log collector started log_file=%s", DECNET_AGENT_LOG_FILE)
|
||||
|
||||
|
||||
_bus_heartbeat_task: Optional[asyncio.Task] = None
|
||||
|
||||
|
||||
@asynccontextmanager
async def _lifespan(app: FastAPI):
    """FastAPI lifespan hook: start background work, then unwind it on exit.

    Startup:
      * starts the master-facing heartbeat (``_heartbeat.start()``), which is
        best-effort and silently declines when identity/bundle plumbing is
        not configured;
      * tries to connect the host-local bus and spawns the
        ``system.agent.health`` heartbeat task. A bus failure is logged and
        the heartbeat runs with ``bus=None``.

    Shutdown (always runs): stops the master heartbeat, cancels the bus
    heartbeat task, closes the bus, cancels a still-running log collector,
    and closes and forgets the topology store singleton.
    """
    global _bus_heartbeat_task, _collector_task, _topology_store

    _heartbeat.start()

    # Host-local bus heartbeat. This is a separate channel from the mTLS
    # heartbeat above: it lets peers on the same host (dashboard, updater)
    # see the agent is alive without hitting its HTTPS endpoint.
    bus = None
    try:
        bus = get_bus(client_name="agent")
        await bus.connect()
    except Exception as exc:  # noqa: BLE001
        log.warning("agent: bus unavailable, skipping health heartbeat: %s", exc)
        bus = None

    heartbeat_coro = run_health_heartbeat(bus, "agent")
    _bus_heartbeat_task = asyncio.create_task(heartbeat_coro, name="agent-bus-heartbeat")

    try:
        yield
    finally:
        await _heartbeat.stop()

        if _bus_heartbeat_task is not None:
            _bus_heartbeat_task.cancel()
            with contextlib.suppress(asyncio.CancelledError, Exception):
                await _bus_heartbeat_task
            _bus_heartbeat_task = None

        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()

        # The collector only exists if a topology was applied during this run.
        if _collector_task is not None and not _collector_task.done():
            _collector_task.cancel()
            with contextlib.suppress(asyncio.CancelledError, Exception):
                await _collector_task
        _collector_task = None

        if _topology_store is not None:
            _topology_store.close()
            _topology_store = None
|
||||
|
||||
|
||||
# The ASGI application served by the worker agent. All three documentation
# routes (Swagger UI, ReDoc and the raw OpenAPI schema) are switched off, and
# startup/shutdown work is delegated to ``_lifespan``.
app = FastAPI(
    title="DECNET SWARM Agent",
    version="0.1.0",
    docs_url=None,  # no interactive docs on worker — narrow attack surface
    redoc_url=None,
    openapi_url=None,
    lifespan=_lifespan,
    # App-wide response descriptions; individual routes add or override
    # entries through their own ``responses=`` argument.
    responses={
        400: {"description": "Malformed request body"},
        500: {"description": "Executor error"},
    },
)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ schemas
|
||||
|
||||
class DeployRequest(BaseModel):
    # Request body for ``POST /deploy``.
    # ``config`` is required; the two flags default to False and are passed
    # through to the executor's ``deploy()`` as ``dry_run`` / ``no_cache``.
    config: DecnetConfig = Field(..., description="Full DecnetConfig to materialise on this worker")
    dry_run: bool = False
    no_cache: bool = False
|
||||
|
||||
|
||||
class TeardownRequest(BaseModel):
    # Request body for ``POST /teardown``.
    # ``decky_id`` is optional; the handler forwards it (or None) to the
    # executor's ``teardown()``.
    decky_id: Optional[str] = None
|
||||
|
||||
|
||||
class MutateRequest(BaseModel):
    # Request body for ``POST /mutate``. Both fields are required, although
    # the current handler rejects every request with HTTP 501 and does not
    # read them.
    decky_id: str
    services: list[str]
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ routes
|
||||
|
||||
@app.get("/health")
async def health() -> dict[str, str]:
    """Liveness probe: always answers with a static ``{"status": "ok"}``."""
    payload = {"status": "ok"}
    return payload
|
||||
|
||||
|
||||
@app.get("/status")
async def status() -> dict:
    """Return the executor's deployment snapshot unchanged."""
    snapshot = await _exec.status()
    return snapshot
|
||||
|
||||
|
||||
@app.post(
    "/deploy",
    responses={500: {"description": "Deployer raised an exception materialising the config"}},
)
async def deploy(req: DeployRequest) -> dict:
    """Hand the received config to the executor and report the outcome.

    Returns ``{"status": "deployed", "deckies": <count>}`` where the count is
    taken from the config as received in the request.

    Raises:
        HTTPException: status 500 carrying ``str(exc)`` for any exception
            raised by the executor; the traceback is logged first.
    """
    cfg = req.config
    try:
        await _exec.deploy(cfg, dry_run=req.dry_run, no_cache=req.no_cache)
    except Exception as err:
        log.exception("agent.deploy failed")
        raise HTTPException(status_code=500, detail=str(err)) from err
    decky_count = len(cfg.deckies)
    return {"status": "deployed", "deckies": decky_count}
|
||||
|
||||
|
||||
@app.post(
    "/teardown",
    responses={500: {"description": "Teardown raised an exception"}},
)
async def teardown(req: TeardownRequest) -> dict:
    """Ask the executor to tear down one decky, or everything when no id is given.

    Returns ``{"status": "torn_down", "decky_id": <id or None>}``.

    Raises:
        HTTPException: status 500 carrying ``str(exc)`` for any exception
            raised by the executor; the traceback is logged first.
    """
    target = req.decky_id
    try:
        await _exec.teardown(target)
    except Exception as err:
        log.exception("agent.teardown failed")
        raise HTTPException(status_code=500, detail=str(err)) from err
    return {"status": "torn_down", "decky_id": target}
|
||||
|
||||
|
||||
@app.post(
    "/self-destruct",
    responses={500: {"description": "Reaper could not be scheduled"}},
)
async def self_destruct() -> dict:
    """Stop all DECNET services on this worker and delete the install
    footprint. Called by the master during decommission. Logs under
    /var/log/decnet* are preserved.

    Fire-and-forget: the handler responds as soon as the executor has
    scheduled the reaper, before the reaper starts deleting files. The route
    sets no explicit ``status_code``, so success is reported with the
    framework's default status rather than 202.

    Raises:
        HTTPException: status 500 carrying ``str(exc)`` if the executor
            raises; the traceback is logged first.
    """
    try:
        await _exec.self_destruct()
    except Exception as err:
        log.exception("agent.self_destruct failed")
        raise HTTPException(status_code=500, detail=str(err)) from err
    else:
        return {"status": "self_destruct_scheduled"}
|
||||
|
||||
|
||||
# ------------------------------------------------------- topology endpoints
|
||||
|
||||
|
||||
class ApplyTopologyRequest(BaseModel):
    # Request body for ``POST /topology/apply``. Both fields are required and
    # are passed to ``topology_ops.apply()`` together with the local store.
    hydrated: dict[str, Any] = Field(
        ..., description="Hydrated topology dict from master.persistence.hydrate()"
    )
    # Compared by ``topology_ops.apply()`` against a locally computed hash;
    # a mismatch surfaces as HTTP 400 from the endpoint.
    version_hash: str = Field(
        ..., description="Master's canonical_hash(hydrated); must match ours"
    )
|
||||
|
||||
|
||||
class TeardownTopologyRequest(BaseModel):
    # Request body for ``POST /topology/teardown``; the id is forwarded
    # verbatim to ``topology_ops.teardown()``.
    topology_id: str = Field(..., description="Topology UUID to dismantle")
|
||||
|
||||
|
||||
@app.post(
    "/topology/apply",
    responses={
        400: {"description": "Malformed hydrated topology or hash mismatch"},
        409: {"description": "A different topology is already applied"},
        500: {"description": "Docker or compose raised while applying"},
    },
)
async def topology_apply(req: ApplyTopologyRequest) -> dict:
    """Apply the hydrated topology from the master on this agent.

    On success the log collector is started (if not already running) and
    ``{"status": "applied", "version_hash": ...}`` is returned.

    Raises:
        HTTPException: 400 for ``HashMismatch`` or ``ValidationError``, 409
            for ``AlreadyApplied``, and 500 for any other failure. For the
            500 case the error text (truncated to 500 characters) is also
            recorded on the local store when a topology id is available.
    """
    store = _store()
    try:
        await _topology_ops.apply(req.hydrated, req.version_hash, store)
    except (_topology_ops.HashMismatch, ValidationError) as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except AlreadyApplied as exc:
        raise HTTPException(status_code=409, detail=str(exc)) from exc
    except Exception as exc:
        log.exception("agent.topology_apply failed")
        # Everything that merely *records* the failure lives inside one guard,
        # including the id lookup: a malformed payload (e.g. a non-dict
        # ``topology`` value) must not replace the original error with an
        # AttributeError raised from this handler.
        try:
            topology = req.hydrated.get("topology")
            topology_id = topology.get("id") if isinstance(topology, dict) else None
            if topology_id:
                store.record_error(
                    str(topology_id), str(exc)[:500], hydrated=req.hydrated,
                )
        except Exception:  # noqa: BLE001 — don't mask original failure
            log.exception("failed to record apply error")
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    _ensure_collector_started()
    return {"status": "applied", "version_hash": req.version_hash}
|
||||
|
||||
|
||||
@app.post(
    "/topology/teardown",
    responses={500: {"description": "Docker or compose raised while tearing down"}},
)
async def topology_teardown(req: TeardownTopologyRequest) -> dict:
    """Dismantle the named topology through ``topology_ops.teardown``.

    Returns ``{"status": "torn_down", "topology_id": <id>}``.

    Raises:
        HTTPException: status 500 carrying ``str(exc)`` for any exception
            raised while obtaining the store or tearing down; the traceback
            is logged first.
    """
    topology_id = req.topology_id
    try:
        await _topology_ops.teardown(topology_id, _store())
    except Exception as err:
        log.exception("agent.topology_teardown failed")
        raise HTTPException(status_code=500, detail=str(err)) from err
    return {"status": "torn_down", "topology_id": topology_id}
|
||||
|
||||
|
||||
@app.get("/topology/state")
async def topology_state() -> dict:
    """Return whatever ``topology_ops.state`` reports for the local store."""
    store = _store()
    return _topology_ops.state(store)
|
||||
|
||||
|
||||
@app.post(
    "/mutate",
    responses={501: {"description": "Worker-side mutate not yet implemented"}},
)
async def mutate(req: MutateRequest) -> dict:
    """Placeholder for per-decky mutation; always responds with HTTP 501.

    The body is validated against ``MutateRequest`` but is otherwise unused.
    """
    # TODO: implement worker-side mutate. Currently the master performs
    # mutation by re-sending a full /deploy with the updated DecnetConfig;
    # this avoids duplicating mutation logic on the worker for v1. When
    # ready, replace the 501 with a real redeploy-of-a-single-decky path.
    raise HTTPException(
        status_code=501,
        detail="Per-decky mutate is performed via /deploy with updated services",
    )
|
||||
223
decnet/agent/executor.py
Normal file
223
decnet/agent/executor.py
Normal file
@@ -0,0 +1,223 @@
|
||||
"""Thin adapter between the agent's HTTP endpoints and the existing
|
||||
``decnet.engine.deployer`` code path.
|
||||
|
||||
Kept deliberately small: the agent does not re-implement deployment logic,
|
||||
it only translates a master RPC into the same function calls the unihost
|
||||
CLI already uses. Everything runs in a worker thread (the deployer is
|
||||
blocking) so the FastAPI event loop stays responsive.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from ipaddress import IPv4Network
|
||||
from typing import Any
|
||||
|
||||
from decnet.engine import deployer as _deployer
|
||||
from decnet.config import DecnetConfig, load_state, clear_state
|
||||
from decnet.logging import get_logger
|
||||
from decnet.network import (
|
||||
allocate_ips,
|
||||
detect_interface,
|
||||
detect_subnet,
|
||||
get_host_ip,
|
||||
)
|
||||
|
||||
log = get_logger("agent.executor")
|
||||
|
||||
|
||||
def _relocalize(config: DecnetConfig) -> DecnetConfig:
    """Rewrite a master-built config to the worker's local network reality.

    The master populates ``interface``/``subnet``/``gateway`` from its own
    box before dispatching, which blows up the deployer on any worker whose
    NIC name differs (common in heterogeneous fleets — master on ``wlp6s0``,
    worker on ``enp0s3``). Interface, subnet and gateway are therefore always
    re-detected locally. If the incoming config has no subnet, or its subnet
    differs from the local one, decky IPs are re-allocated from the local
    subnet as well.

    Returns:
        A copy of *config* with the local values applied; *config* itself is
        not modified.

    Raises:
        RuntimeError: the allocator returned fewer addresses than there are
            deckies. Pairing them up regardless would silently drop the
            deckies left without an address.
    """
    local_iface = detect_interface()
    local_subnet, local_gateway = detect_subnet(local_iface)
    local_host_ip = get_host_ip(local_iface)

    updates: dict[str, Any] = {
        "interface": local_iface,
        "subnet": local_subnet,
        "gateway": local_gateway,
    }

    # strict=False: accept a host address with a prefix, not only a network address.
    master_net = IPv4Network(config.subnet, strict=False) if config.subnet else None
    local_net = IPv4Network(local_subnet, strict=False)
    if master_net is None or master_net != local_net:
        log.info(
            "agent.deploy subnet mismatch master=%s local=%s — re-allocating decky IPs",
            config.subnet, local_subnet,
        )
        fresh_ips = list(
            allocate_ips(
                subnet=local_subnet,
                gateway=local_gateway,
                host_ip=local_host_ip,
                count=len(config.deckies),
            )
        )
        if len(fresh_ips) < len(config.deckies):
            # zip() below stops at the shorter input, so a short allocation
            # would otherwise deploy fewer deckies than the master asked for.
            raise RuntimeError(
                f"allocated {len(fresh_ips)} IPs in {local_subnet} "
                f"for {len(config.deckies)} deckies"
            )
        updates["deckies"] = [
            decky.model_copy(update={"ip": ip})
            for decky, ip in zip(config.deckies, fresh_ips)
        ]

    return config.model_copy(update=updates)
|
||||
|
||||
|
||||
async def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False) -> None:
    """Run the blocking deployer in a worker thread.

    Configs whose ``mode`` is ``"swarm"`` are first passed through
    ``_relocalize`` so they carry this host's interface, subnet and gateway.
    The deployer itself calls save_state() internally once the compose file
    is materialised.
    """
    log.info(
        "agent.deploy mode=%s deckies=%d interface=%s (incoming)",
        config.mode, len(config.deckies), config.interface,
    )
    is_swarm = config.mode == "swarm"
    if is_swarm:
        config = _relocalize(config)
        log.info(
            "agent.deploy relocalized interface=%s subnet=%s gateway=%s",
            config.interface, config.subnet, config.gateway,
        )
    deployer_args = (config, dry_run, no_cache, False)
    await asyncio.to_thread(_deployer.deploy, *deployer_args)
|
||||
|
||||
|
||||
async def teardown(decky_id: str | None = None) -> None:
    """Run the deployer's teardown in a worker thread.

    When *decky_id* is ``None`` (tear down everything), the saved state is
    cleared afterwards as well; for a single decky it is left in place.
    """
    log.info("agent.teardown decky_id=%s", decky_id)
    await asyncio.to_thread(_deployer.teardown, decky_id)
    if decky_id is not None:
        return
    await asyncio.to_thread(clear_state)
|
||||
|
||||
|
||||
def _decky_runtime_states(config: DecnetConfig) -> dict[str, dict[str, Any]]:
    """Map decky_name → {"running": bool, "services": {svc: container_state}}.

    Queried so the master can tell, after a partial-failure deploy, which
    deckies actually came up instead of tainting the whole shard as failed.

    Container names are looked up as ``<decky name>-<service>`` with
    underscores in the service name replaced by hyphens; a container that is
    not found is reported as ``"absent"``. A decky counts as running only if
    it has at least one service and every service container is ``"running"``.

    Best-effort: a docker error returns an empty map, not an exception.
    """
    try:
        import docker  # local import — agent-only path

        client = docker.from_env()
        try:
            live = {
                c.name: c.status
                for c in client.containers.list(all=True, ignore_removed=True)
            }
        finally:
            # This runs on every status poll and heartbeat, so the client's
            # connections must be released rather than left for the GC. A
            # failing close() must not discard data that was read fine.
            try:
                client.close()
            except Exception:  # noqa: BLE001
                log.debug("_decky_runtime_states: docker client close failed", exc_info=True)
    except Exception:  # pragma: no cover — defensive
        log.exception("_decky_runtime_states: docker query failed")
        return {}

    out: dict[str, dict[str, Any]] = {}
    for d in config.deckies:
        svc_states = {
            svc: live.get(f"{d.name}-{svc.replace('_', '-')}", "absent")
            for svc in d.services
        }
        out[d.name] = {
            # bool(svc_states): a decky with no services is never "running".
            "running": bool(svc_states) and all(s == "running" for s in svc_states.values()),
            "services": svc_states,
        }
    return out
|
||||
|
||||
|
||||
_REAPER_SCRIPT = r"""#!/bin/bash
|
||||
# DECNET agent self-destruct reaper.
|
||||
# Runs detached from the agent process so it survives the agent's death.
|
||||
# Waits briefly for the HTTP response to drain, then stops services,
|
||||
# wipes install paths, and preserves logs.
|
||||
set +e
|
||||
|
||||
sleep 3
|
||||
|
||||
# Stop decky containers started by the local deployer (best-effort).
|
||||
if command -v docker >/dev/null 2>&1; then
|
||||
docker ps -q --filter "label=com.docker.compose.project=decnet" | xargs -r docker stop
|
||||
docker ps -aq --filter "label=com.docker.compose.project=decnet" | xargs -r docker rm -f
|
||||
docker network rm decnet_lan 2>/dev/null
|
||||
fi
|
||||
|
||||
# Stop+disable every systemd unit the installer may have dropped.
|
||||
for unit in decnet-agent decnet-engine decnet-collector decnet-forwarder decnet-prober decnet-reconciler decnet-sniffer decnet-updater; do
|
||||
systemctl stop "$unit" 2>/dev/null
|
||||
systemctl disable "$unit" 2>/dev/null
|
||||
done
|
||||
|
||||
# Nuke install paths. Logs under /var/log/decnet* are intentionally
|
||||
# preserved — the operator typically wants them for forensic review.
|
||||
rm -rf /opt/decnet* /var/lib/decnet/* /usr/local/bin/decnet* /etc/decnet
|
||||
rm -f /etc/systemd/system/decnet-*.service /etc/systemd/system/decnet-*.timer
|
||||
|
||||
systemctl daemon-reload 2>/dev/null
|
||||
rm -f "$0"
|
||||
"""
|
||||
|
||||
|
||||
async def self_destruct() -> None:
    """Tear down deckies, then spawn a detached reaper that wipes the
    install footprint. Returns immediately so the HTTP response can drain
    before the reaper starts deleting files out from under the agent.

    Raises:
        OSError: the reaper script could not be written or spawned. In that
            case the temporary script is removed again before re-raising.
    """
    import os
    import shutil
    import subprocess  # nosec B404
    import tempfile

    # Best-effort teardown first — the reaper also runs docker stop, but
    # going through the deployer gives the host-macvlan/ipvlan helper a
    # chance to clean up routes cleanly.
    try:
        await asyncio.to_thread(_deployer.teardown, None)
        await asyncio.to_thread(clear_state)
    except Exception:
        log.exception("self_destruct: pre-reap teardown failed — reaper will force-stop containers")

    # Reaper lives under /tmp so it survives rm -rf /opt/decnet*.
    fd, path = tempfile.mkstemp(prefix="decnet-reaper-", suffix=".sh", dir="/tmp")  # nosec B108 — reaper must outlive /opt/decnet removal
    try:
        # The file object owns the descriptor from here on: it is closed by
        # the ``with`` block, and write() keeps going until every byte is out
        # (a bare os.write() may stop short).
        with os.fdopen(fd, "wb") as script:
            script.write(_REAPER_SCRIPT.encode())
        os.chmod(path, 0o700)  # nosec B103 — root-owned reaper, needs exec

        # The reaper MUST run outside decnet-agent.service's cgroup — otherwise
        # `systemctl stop decnet-agent` SIGTERMs the whole cgroup (reaper
        # included) before rm -rf completes. `start_new_session=True` gets us a
        # fresh POSIX session but does NOT escape the systemd cgroup. So when
        # `systemd-run` is available the script is launched through it as a
        # transient unit of its own (no `--scope` is passed), falling back to
        # a bare Popen if systemd-run is unavailable (non-systemd host /
        # container).
        systemd_run = shutil.which("systemd-run")
        if systemd_run:
            argv = [
                systemd_run,
                "--collect",
                "--unit", f"decnet-reaper-{os.getpid()}",
                "--description", "DECNET agent self-destruct reaper",
                "/bin/bash", path,
            ]
        else:
            argv = ["/bin/bash", path]

        subprocess.Popen(  # nosec B603
            argv,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            close_fds=True,
            start_new_session=True,
        )
    except Exception:
        # Nothing will ever run (and self-delete) the script now, so don't
        # leave an executable reaper behind in /tmp.
        try:
            os.unlink(path)
        except OSError:
            pass
        raise

    log.warning(
        "self_destruct: reaper spawned path=%s via=%s — agent will die in ~3s",
        path, "systemd-run" if systemd_run else "popen",
    )
|
||||
|
||||
|
||||
async def status() -> dict[str, Any]:
    """Build the deployment snapshot for this worker.

    With no saved state the result is ``{"deployed": False, "deckies": []}``.
    Otherwise it contains the saved config's mode, the compose path as a
    string, every decky dumped via ``model_dump()``, and the per-decky
    runtime map from ``_decky_runtime_states``. Both the state load and the
    runtime query run in worker threads.
    """
    loaded = await asyncio.to_thread(load_state)
    if loaded is None:
        return {"deployed": False, "deckies": []}

    config, compose_path = loaded
    runtime = await asyncio.to_thread(_decky_runtime_states, config)
    decky_dumps = [decky.model_dump() for decky in config.deckies]
    return {
        "deployed": True,
        "mode": config.mode,
        "compose_path": str(compose_path),
        "deckies": decky_dumps,
        "runtime": runtime,
    }
|
||||
146
decnet/agent/heartbeat.py
Normal file
146
decnet/agent/heartbeat.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""Agent → master liveness heartbeat loop.
|
||||
|
||||
Every ``INTERVAL_S`` seconds the worker posts ``executor.status()`` to
|
||||
``POST <master>/swarm/heartbeat`` over mTLS. The master pins the
|
||||
presented client cert's SHA-256 against the ``SwarmHost`` row for the
|
||||
claimed ``host_uuid``; a match refreshes ``last_heartbeat`` + each
|
||||
``DeckyShard``'s snapshot + runtime state.
|
||||
|
||||
Identity comes from ``/etc/decnet/decnet.ini`` (seeded by the enroll
|
||||
bundle) — specifically ``DECNET_HOST_UUID`` and ``DECNET_MASTER_HOST``.
|
||||
The worker's existing ``~/.decnet/agent/`` bundle (or
|
||||
``/etc/decnet/agent/``) provides the mTLS client cert.
|
||||
|
||||
Started/stopped via the agent FastAPI app's lifespan. If identity
|
||||
plumbing is missing (pre-enrollment dev runs) the loop logs at DEBUG and
|
||||
declines to start — callers don't have to guard it.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import pathlib
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from decnet.agent import executor as _exec
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm import pki
|
||||
from decnet.swarm.log_forwarder import build_worker_ssl_context
|
||||
|
||||
log = get_logger("agent.heartbeat")
|
||||
|
||||
INTERVAL_S = 30.0
|
||||
_TIMEOUT = httpx.Timeout(connect=5.0, read=10.0, write=5.0, pool=5.0)
|
||||
|
||||
_task: Optional[asyncio.Task] = None
|
||||
|
||||
|
||||
def _resolve_agent_dir() -> pathlib.Path:
|
||||
"""Match the agent-dir resolution order used by the agent server:
|
||||
DECNET_AGENT_DIR env, else /etc/decnet/agent (production install),
|
||||
else ~/.decnet/agent (dev)."""
|
||||
import os
|
||||
env = os.environ.get("DECNET_AGENT_DIR")
|
||||
if env:
|
||||
return pathlib.Path(env)
|
||||
system = pathlib.Path("/etc/decnet/agent")
|
||||
if system.exists():
|
||||
return system
|
||||
return pki.DEFAULT_AGENT_DIR
|
||||
|
||||
|
||||
async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_version: str) -> None:
    """Send one heartbeat to the master.

    The JSON body carries ``host_uuid``, ``agent_version`` and the executor's
    status snapshot; a ``topology`` entry is added when the local topology
    state can be read. Any response other than 204 is logged as a rejection
    together with the first 200 characters of its body. Exceptions from the
    status query or the POST propagate to the caller.
    """
    payload: dict = {
        "host_uuid": host_uuid,
        "agent_version": agent_version,
        "status": await _exec.status(),
    }

    # Best-effort: fold in applied-topology snapshot. Failures must never
    # wedge the heartbeat loop — master will fall back to "no topology
    # reported" which triggers a resync if it expected one.
    try:
        from decnet.agent import topology_ops as _topo_ops
        from decnet.agent.topology_store import TopologyStore

        topo_store = TopologyStore(_resolve_agent_dir() / "topology.db")
        try:
            payload["topology"] = _topo_ops.state(topo_store)
        finally:
            topo_store.close()
    except Exception:
        log.debug("heartbeat: topology state unavailable", exc_info=True)

    response = await client.post(url, json=payload)
    # 403 / 404 are terminal-ish — we still keep looping because an
    # operator may re-enrol the host mid-session, but we log loudly so
    # prod ops can spot cert-pinning drift.
    if response.status_code != 204:
        log.warning(
            "heartbeat rejected status=%d body=%s",
            response.status_code, response.text[:200],
        )
|
||||
|
||||
|
||||
async def _loop(url: str, host_uuid: str, agent_version: str, ssl_ctx) -> None:
    """Send a heartbeat every ``INTERVAL_S`` seconds until cancelled.

    One ``httpx.AsyncClient`` (verifying with *ssl_ctx*, using the module
    timeout) is shared by all ticks. A failing tick is logged with its
    traceback and retried after the normal interval; cancellation is
    re-raised so the task can be stopped.
    """
    log.info(
        "heartbeat loop starting url=%s host_uuid=%s interval=%ss",
        url, host_uuid, INTERVAL_S,
    )
    async with httpx.AsyncClient(verify=ssl_ctx, timeout=_TIMEOUT) as http:
        while True:
            try:
                await _tick(http, url, host_uuid, agent_version)
            except asyncio.CancelledError:
                # Never swallow cancellation — stop() relies on it.
                raise
            except Exception:
                log.exception("heartbeat tick failed — will retry in %ss", INTERVAL_S)
            await asyncio.sleep(INTERVAL_S)
|
||||
|
||||
|
||||
def start() -> Optional[asyncio.Task]:
    """Kick off the background heartbeat task.

    Returns the running task: the existing one if it has not finished yet,
    otherwise a newly created one. Returns ``None`` without starting anything
    when ``DECNET_HOST_UUID`` or ``DECNET_MASTER_HOST`` is unset, or when the
    worker SSL context cannot be built, so the caller doesn't need to check
    for dev mode itself.
    """
    global _task
    from decnet.env import (
        DECNET_HOST_UUID,
        DECNET_MASTER_HOST,
        DECNET_SWARMCTL_PORT,
    )

    if _task is not None and not _task.done():
        return _task

    if not (DECNET_HOST_UUID and DECNET_MASTER_HOST):
        log.debug("heartbeat not starting — DECNET_HOST_UUID or DECNET_MASTER_HOST unset")
        return None

    agent_dir = _resolve_agent_dir()
    try:
        ssl_ctx = build_worker_ssl_context(agent_dir)
    except Exception:
        log.exception("heartbeat not starting — worker SSL context unavailable at %s", agent_dir)
        return None

    # The version is informational only; fall back rather than fail.
    try:
        from decnet import __version__ as agent_version
    except Exception:
        agent_version = "unknown"

    url = f"https://{DECNET_MASTER_HOST}:{DECNET_SWARMCTL_PORT}/swarm/heartbeat"
    heartbeat_coro = _loop(url, DECNET_HOST_UUID, agent_version, ssl_ctx)
    _task = asyncio.create_task(heartbeat_coro, name="agent-heartbeat")
    return _task
|
||||
|
||||
|
||||
async def stop() -> None:
    """Cancel the heartbeat task, wait for it to finish, and forget it.

    Does nothing when no task was ever started. Whatever the task raises
    while winding down (including the cancellation itself) is discarded.
    """
    global _task
    task = _task
    if task is None:
        return
    task.cancel()
    try:
        await task
    except (asyncio.CancelledError, Exception):
        # Expected: the task ends by raising CancelledError.
        pass
    _task = None
|
||||
70
decnet/agent/server.py
Normal file
70
decnet/agent/server.py
Normal file
@@ -0,0 +1,70 @@
|
||||
"""Worker-agent uvicorn launcher.
|
||||
|
||||
Starts ``decnet.agent.app:app`` over HTTPS with mTLS enforcement. The
|
||||
worker must already have a bundle in ``~/.decnet/agent/`` (delivered by
|
||||
``decnet swarm enroll`` from the master); if it does not, we refuse to
|
||||
start — unauthenticated agents are not a supported mode.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
import signal
|
||||
import subprocess # nosec B404
|
||||
import sys
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm import pki
|
||||
|
||||
log = get_logger("agent.server")
|
||||
|
||||
|
||||
def run(host: str, port: int, agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR) -> int:
    """Launch uvicorn for the agent app with mTLS and wait for it to exit.

    Args:
        host: address passed to uvicorn's ``--host``.
        port: port passed to uvicorn's ``--port``.
        agent_dir: directory expected to contain ``worker.key``,
            ``worker.crt`` and ``ca.crt``.

    Returns:
        ``2`` if no worker bundle can be loaded from *agent_dir*; otherwise
        the uvicorn process's exit status, or ``0`` if the process group was
        already gone when a Ctrl+C was being forwarded.
    """
    if pki.load_worker_bundle(agent_dir) is None:
        print(
            f"[agent] No cert bundle at {agent_dir}. "
            f"Run `decnet swarm enroll` from the master first.",
            file=sys.stderr,
        )
        return 2

    tls_args = [
        "--ssl-keyfile", str(agent_dir / "worker.key"),
        "--ssl-certfile", str(agent_dir / "worker.crt"),
        "--ssl-ca-certs", str(agent_dir / "ca.crt"),
        # 2 == ssl.CERT_REQUIRED — clients MUST present a CA-signed cert.
        "--ssl-cert-reqs", "2",
    ]
    cmd = [
        sys.executable, "-m", "uvicorn", "decnet.agent.app:app",
        "--host", host,
        "--port", str(port),
        *tls_args,
    ]
    log.info("agent starting host=%s port=%d bundle=%s", host, port, agent_dir)

    # Own process group for clean Ctrl+C / SIGTERM propagation to uvicorn
    # workers (same pattern as `decnet api`).
    child = subprocess.Popen(cmd, start_new_session=True)  # nosec B603
    try:
        return child.wait()
    except KeyboardInterrupt:
        # The child leads its own session, so its pid doubles as the process
        # group id: ask the group to terminate, then kill after 10 seconds.
        try:
            os.killpg(child.pid, signal.SIGTERM)
            try:
                return child.wait(timeout=10)
            except subprocess.TimeoutExpired:
                os.killpg(child.pid, signal.SIGKILL)
                return child.wait()
        except ProcessLookupError:
            return 0
|
||||
208
decnet/agent/topology_ops.py
Normal file
208
decnet/agent/topology_ops.py
Normal file
@@ -0,0 +1,208 @@
|
||||
"""Agent-side topology apply/teardown/state primitives.
|
||||
|
||||
Wraps the compose + bridge machinery from :mod:`decnet.engine.deployer`
|
||||
so the agent can drive a topology without ever touching the master's
|
||||
sqlmodel repo. The master-side ``deploy_topology`` always calls
|
||||
``transition_status(repo, …)`` which is useless (and unreachable) on
|
||||
an agent — here we operate purely on a hydrated dict + the local
|
||||
:class:`TopologyStore`.
|
||||
|
||||
v1 constraint: one topology per agent. A second apply for a different
|
||||
``topology_id`` triggers an on-the-spot teardown of the predecessor
|
||||
before the new apply proceeds — master is authoritative.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import subprocess # nosec B404
|
||||
from typing import Any
|
||||
|
||||
import docker
|
||||
|
||||
from decnet.agent.topology_store import (
|
||||
TopologyStore,
|
||||
observed,
|
||||
)
|
||||
from decnet.engine.deployer import (
|
||||
_compose,
|
||||
_compose_with_retry,
|
||||
_teardown_order,
|
||||
_topology_compose_path,
|
||||
)
|
||||
from decnet.logging import get_logger
|
||||
from decnet.network import create_bridge_network, remove_bridge_network
|
||||
from decnet.topology.compose import (
|
||||
_network_name as _topology_network_name,
|
||||
write_topology_compose,
|
||||
)
|
||||
from decnet.topology.hashing import canonical_hash
|
||||
from decnet.topology.validate import (
|
||||
ValidationError,
|
||||
errors as _validation_errors,
|
||||
validate as _validate_topology,
|
||||
)
|
||||
|
||||
log = get_logger("agent.topology_ops")
|
||||
|
||||
|
||||
class HashMismatch(RuntimeError):
    """Raised when the master-provided version_hash doesn't match what we
    hash locally — suggests serialisation drift. We fail loudly rather
    than silently papering over a schema mismatch.

    ``apply()`` raises it before any other work is done on the topology; the
    message includes both hashes.
    """
|
||||
|
||||
|
||||
def _topology_id(hydrated: dict[str, Any]) -> str:
|
||||
topo = hydrated.get("topology") or {}
|
||||
tid = topo.get("id")
|
||||
if not tid:
|
||||
raise ValueError("hydrated topology missing topology.id")
|
||||
return str(tid)
|
||||
|
||||
|
||||
async def apply(
    hydrated: dict[str, Any],
    version_hash: str,
    store: TopologyStore,
) -> None:
    """Bring *hydrated* up on this agent, then record it in *store*.

    Steps, in order: verify the canonical hash, validate the topology,
    supersede any different topology currently pinned in *store*, create
    one bridge network per LAN, write the compose file, run
    ``compose up``, and finally ``store.put`` the result.

    Raises:
        HashMismatch: the locally computed canonical hash differs from
            *version_hash*. Raised before docker is touched.
        ValidationError: the topology has validation errors.
        Any docker / compose error propagates to the caller.
    """
    computed = canonical_hash(hydrated)
    if computed != version_hash:
        raise HashMismatch(
            f"master hash {version_hash!r} does not match agent hash "
            f"{computed!r} — refusing to apply"
        )

    findings = _validate_topology(hydrated)
    if _validation_errors(findings):
        raise ValidationError(findings)

    topology_id = _topology_id(hydrated)

    # The master is authoritative: whatever state a *different* pinned
    # topology is in (fully applied, partially applied, drifted), it is
    # removed before the new one is accepted, so the agent never ends up
    # in a state only a human could resolve.
    pinned = store.current()
    if pinned is not None and pinned.topology_id != topology_id:
        await _supersede_predecessor(pinned.topology_id, topology_id, store)

    lans = hydrated["lans"]
    compose_path = _topology_compose_path(topology_id)
    client = docker.from_env()

    def _materialise() -> None:
        # One bridge per LAN; only DMZ LANs are created non-internal.
        for lan in lans:
            bridge = _topology_network_name(topology_id, lan["name"])
            is_internal = not lan["is_dmz"]
            create_bridge_network(
                client, bridge, lan["subnet"], internal=is_internal
            )
        write_topology_compose(hydrated, compose_path)
        # Every decky service joins its base container's netns through
        # ``network_mode: container:<base>``, and that share is bound
        # when the service starts. If a base is recreated (for example
        # after ``ports:`` changes when ``forwards_l3`` is toggled) while
        # compose considers the services unchanged, the services keep a
        # stale netns FD for the destroyed base: they sit in an empty
        # namespace with only ``lo`` and external traffic hits a closed
        # port on the live base. ``--always-recreate-deps`` recreates
        # dependents together with the base, which rules that race out.
        _compose_with_retry(
            "up", "--build", "-d", "--always-recreate-deps",
            compose_file=compose_path,
        )

    # Bridge creation and compose are blocking calls; run them on a
    # worker thread so a slow docker daemon cannot stall the event loop.
    await asyncio.to_thread(_materialise)

    store.put(topology_id, version_hash, hydrated)
    log.info(
        "topology %s applied on agent (%d LANs)", topology_id, len(lans)
    )


async def _supersede_predecessor(
    previous_id: str,
    topology_id: str,
    store: TopologyStore,
) -> None:
    """Best-effort removal of *previous_id* before *topology_id* is applied."""
    log.info(
        "superseding topology %s with %s on master authority",
        previous_id, topology_id,
    )
    try:
        await teardown(previous_id, store)
    except Exception as exc:  # noqa: BLE001 — the new apply must still be attempted
        log.warning(
            "best-effort teardown of superseded topology %s failed: %s",
            previous_id, exc,
        )
        # Drop the row regardless, so a half-torn-down predecessor
        # cannot block the new apply. Leftover docker objects show up
        # in the next heartbeat's observed block.
        store.clear(previous_id)
|
||||
|
||||
|
||||
async def teardown(
    topology_id: str,
    store: TopologyStore,
) -> None:
    """Tear down *topology_id* on this agent and drop its store row.

    Idempotent: with no matching row and no compose file this does no
    docker work beyond creating a client, and still returns cleanly.

    The LAN list used to remove bridges comes from the stored hydrated
    blob (it is what the apply ran with). A row written by an error
    path may hold an empty or partial blob without a ``lans`` entry; in
    that case bridge removal is skipped instead of raising, so the
    compose file and the store row are still cleaned up.

    Args:
        topology_id: Identifier of the topology to remove.
        store: Agent-local cache holding the applied-topology row.
    """
    row = store.current()
    # Only trust the blob when it belongs to the topology being removed.
    hydrated = row.hydrated if row and row.topology_id == topology_id else None
    # ``hydrated`` may be ``{}`` or lack "lans" (error-marker rows), so
    # read the list defensively rather than indexing.
    lans = hydrated.get("lans") if hydrated else None
    compose_path = _topology_compose_path(topology_id)
    client = docker.from_env()

    def _dismantle() -> None:
        if compose_path.exists():
            try:
                _compose("down", "--remove-orphans", compose_file=compose_path)
            except subprocess.CalledProcessError as exc:
                # A failed compose-down must not stop bridge/file cleanup.
                log.warning(
                    "topology %s compose down failed (continuing): %s",
                    topology_id, exc,
                )
        if lans:
            for lan_name in _teardown_order(lans):
                net_name = _topology_network_name(topology_id, lan_name)
                remove_bridge_network(client, net_name)
        elif hydrated is not None:
            log.warning(
                "topology %s has no recorded LAN list; skipping bridge removal",
                topology_id,
            )
        if compose_path.exists():
            compose_path.unlink()

    # Compose and docker calls block; keep them off the event loop.
    await asyncio.to_thread(_dismantle)
    store.clear(topology_id)
    log.info("topology %s torn down on agent", topology_id)
|
||||
|
||||
|
||||
def state(store: TopologyStore) -> dict[str, Any]:
    """Return the stored row's fields plus a live docker observation.

    The result always has the keys ``topology_id``,
    ``applied_version_hash``, ``applied_at``, ``last_error`` and
    ``observed``. The first four are ``None`` when the store is idle.
    If the observation cannot be obtained, ``observed`` is
    ``{"error": <message truncated to 200 chars>}``.
    """
    applied = store.current()
    try:
        live = observed(docker.from_env())
    except Exception as exc:  # noqa: BLE001 — docker socket may be gone
        live = {"error": str(exc)[:200]}

    row_fields = (
        "topology_id",
        "applied_version_hash",
        "applied_at",
        "last_error",
    )
    snapshot: dict[str, Any] = {
        name: None if applied is None else getattr(applied, name)
        for name in row_fields
    }
    snapshot["observed"] = live
    return snapshot
|
||||
|
||||
|
||||
__all__ = ["apply", "teardown", "state", "HashMismatch"]
|
||||
213
decnet/agent/topology_store.py
Normal file
213
decnet/agent/topology_store.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""Agent-side sqlite cache of the currently-applied topology.
|
||||
|
||||
**This is a cache, not a source of truth.** The master is the only
|
||||
authority for what the agent should be running. This store exists so
|
||||
the agent can answer two questions quickly and offline:
|
||||
|
||||
1. What topology did I last apply, and with what version hash?
|
||||
2. Is what docker is currently doing consistent with that?
|
||||
|
||||
The hash goes out on every heartbeat; the master compares it to what
|
||||
it thinks this host should be running and schedules a re-push on
|
||||
mismatch.
|
||||
|
||||
Why sqlite when the blob is JSON? Consistent with
|
||||
:mod:`decnet.swarm.log_forwarder._OffsetStore` — single-row sqlite is
|
||||
the project-wide pattern for agent-local persistent state. Keeps
|
||||
operational mental model small: "one state.db per thing".
|
||||
|
||||
Design choices worth calling out:
|
||||
|
||||
- **One row, one topology.** v1 only supports a single topology per
  agent. Attempting to :meth:`put` a different ``topology_id`` while
  a row already exists raises :class:`AlreadyApplied`, so the old row
  must be cleared first. The agent's ``topology_ops.apply`` does this
  itself: because the master is authoritative, it tears down (or
  hard-clears) a differing predecessor before recording the new
  topology, instead of rejecting the apply with 409.
|
||||
- **No auto-restore on boot.** The agent does NOT read this db at
|
||||
startup and try to re-apply. Whatever docker has after a restart
|
||||
is what it has; the next heartbeat reports the truth and the
|
||||
master decides whether to re-push. Same reason we don't sync
|
||||
mutations from agent → master anywhere else: split-brain is worse
|
||||
than temporary drift.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import pathlib
|
||||
import sqlite3
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Optional
|
||||
|
||||
|
||||
class AlreadyApplied(RuntimeError):
    """A row for a different topology is already pinned to this agent.

    Raised by :meth:`TopologyStore.put` when the stored ``topology_id``
    differs from the one being recorded.
    """
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class AppliedRow:
    """Immutable, decoded view of one ``applied_topology`` row."""

    # Primary key: the topology this row describes.
    topology_id: str
    # Version hash stored with the row ("" on rows first created by
    # ``TopologyStore.record_error``).
    applied_version_hash: str
    # The hydrated topology blob, already JSON-decoded.
    hydrated: dict[str, Any]
    # Unix time in whole seconds written by ``TopologyStore.put`` (0 on
    # rows first created by ``TopologyStore.record_error``).
    applied_at: int
    # Most recent error message, or None when no error is recorded.
    last_error: Optional[str]
|
||||
|
||||
|
||||
class TopologyStore:
    """Single-row sqlite cache. Stdlib only, sync (called from endpoints)."""

    _CREATE_SQL = (
        "CREATE TABLE IF NOT EXISTS applied_topology ("
        " topology_id TEXT PRIMARY KEY,"
        " applied_version_hash TEXT NOT NULL,"
        " hydrated_blob_json TEXT NOT NULL,"
        " applied_at INTEGER NOT NULL,"
        " last_error TEXT)"
    )
    _SELECT_SQL = (
        "SELECT topology_id, applied_version_hash, hydrated_blob_json,"
        " applied_at, last_error FROM applied_topology LIMIT 1"
    )
    _PUT_SQL = (
        "INSERT INTO applied_topology"
        " (topology_id, applied_version_hash, hydrated_blob_json,"
        " applied_at, last_error)"
        " VALUES (?, ?, ?, ?, NULL)"
        " ON CONFLICT(topology_id) DO UPDATE SET"
        " applied_version_hash=excluded.applied_version_hash,"
        " hydrated_blob_json=excluded.hydrated_blob_json,"
        " applied_at=excluded.applied_at,"
        " last_error=NULL"
    )
    _ERROR_SQL = (
        "INSERT INTO applied_topology"
        " (topology_id, applied_version_hash, hydrated_blob_json,"
        " applied_at, last_error)"
        " VALUES (?, '', ?, 0, ?)"
        " ON CONFLICT(topology_id) DO UPDATE SET"
        " last_error=excluded.last_error,"
        " hydrated_blob_json=CASE"
        " WHEN applied_topology.hydrated_blob_json='{}'"
        " THEN excluded.hydrated_blob_json"
        " ELSE applied_topology.hydrated_blob_json END"
    )
    _DELETE_SQL = "DELETE FROM applied_topology WHERE topology_id=?"

    def __init__(self, db_path: pathlib.Path) -> None:
        """Open (creating if needed) the sqlite file at *db_path*.

        The parent directory is created when missing and the
        ``applied_topology`` table is created if it does not exist.
        """
        db_path.parent.mkdir(parents=True, exist_ok=True)
        # check_same_thread=False because Starlette/FastAPI executes sync
        # endpoint bodies on a worker thread, not the thread that
        # imported `app`. The agent is a single process, so sqlite's own
        # connection lock is considered sufficient here.
        self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
        self._write(self._CREATE_SQL)

    def _write(self, sql: str, params: tuple[Any, ...] = ()) -> None:
        """Execute one statement on the shared connection and commit it."""
        self._conn.execute(sql, params)
        self._conn.commit()

    # ----------------------------------------------------------------- reads

    def current(self) -> Optional[AppliedRow]:
        """Return the stored row as an :class:`AppliedRow`, or ``None`` if idle."""
        record = self._conn.execute(self._SELECT_SQL).fetchone()
        if record is None:
            return None
        topology_id, version_hash, blob, applied_at, last_error = record
        return AppliedRow(
            topology_id=topology_id,
            applied_version_hash=version_hash,
            hydrated=json.loads(blob),
            applied_at=int(applied_at),
            last_error=last_error,
        )

    # ---------------------------------------------------------------- writes

    def put(
        self,
        topology_id: str,
        applied_version_hash: str,
        hydrated: dict[str, Any],
    ) -> None:
        """Record *topology_id* as applied, clearing any stored error.

        Recording the same ``topology_id`` again overwrites the hash,
        blob and timestamp (idempotent re-push).

        Raises:
            AlreadyApplied: a row for a *different* topology exists.
        """
        pinned = self.current()
        if pinned is not None and pinned.topology_id != topology_id:
            raise AlreadyApplied(
                f"agent already has topology {pinned.topology_id!r}; "
                f"cannot apply {topology_id!r}"
            )
        params = (
            topology_id,
            applied_version_hash,
            json.dumps(hydrated, sort_keys=True),
            int(time.time()),
        )
        self._write(self._PUT_SQL, params)

    def record_error(
        self,
        topology_id: str,
        message: str,
        hydrated: Optional[dict[str, Any]] = None,
    ) -> None:
        """Store *message* as the last error for *topology_id*.

        When no row exists yet (a failure *during* the first materialise,
        before :meth:`put` was reached) a marker row is inserted so the
        failure still shows up via GET /topology/state and the next
        heartbeat. The marker row has an empty ``applied_version_hash``
        so the master's heartbeat check sees a hash mismatch and
        schedules a resync.

        If *hydrated* is given it is stored, so a later teardown can
        still walk the LAN list; otherwise a partial deploy strands
        containers + bridges with no breadcrumb back to them. On an
        existing row the blob is only replaced when the stored one is
        ``{}``.
        """
        blob = json.dumps(hydrated, sort_keys=True) if hydrated else "{}"
        self._write(self._ERROR_SQL, (topology_id, blob, message))

    def clear(self, topology_id: str) -> None:
        """Delete the row for *topology_id* (post-teardown).

        Does nothing when no such row exists, which keeps teardown
        idempotent.
        """
        self._write(self._DELETE_SQL, (topology_id,))

    def close(self) -> None:
        """Close the underlying sqlite connection."""
        self._conn.close()
|
||||
|
||||
|
||||
# --------------------------------------------------- live docker observation
|
||||
|
||||
|
||||
def observed(docker_client: Any) -> dict[str, Any]:
    """Report the DECNET bridges and containers docker currently shows.

    Returns ``{"bridges": [...], "containers": [...]}``, both sorted:
    bridges are networks whose ``Driver`` attribute is ``"bridge"`` and
    whose name starts with ``decnet-topology-``; containers are those
    returned by ``containers.list(all=False)`` whose name starts with
    ``decnet-``.

    Best-effort: any exception is converted into
    ``{"error": <message truncated to 200 chars>}`` instead of being
    raised, so the heartbeat can still go out and the master can treat
    ``error`` as "unknown, re-push".
    """
    try:
        bridge_names = sorted(
            net.name
            for net in docker_client.networks.list()
            if net.attrs.get("Driver") == "bridge"
            and net.name.startswith("decnet-topology-")
        )
        container_names = sorted(
            ctr.name
            for ctr in docker_client.containers.list(all=False)
            if ctr.name.startswith("decnet-")
        )
    except Exception as exc:  # noqa: BLE001 — best-effort observation
        return {"error": str(exc)[:200]}
    return {"bridges": bridge_names, "containers": container_names}
|
||||
|
||||
|
||||
__all__ = ["TopologyStore", "AppliedRow", "AlreadyApplied", "observed"]
|
||||
Reference in New Issue
Block a user