merge: testing → main (reconcile 2-week divergence)

2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions
--- a/decnet/agent/init.py
+++ b/decnet/agent/init.py
@@ -0,0 +1,7 @@
+"""DECNET worker agent — runs on every SWARM worker host.
+
+Exposes an mTLS-protected FastAPI service the master's SWARM controller
+calls to deploy, mutate, and tear down deckies locally.  The agent reuses
+the existing `decnet.engine.deployer` code path unchanged, so a worker runs
+deckies the same way `decnet deploy --mode unihost` does today.
+"""
--- a/decnet/agent/app.py
+++ b/decnet/agent/app.py
@@ -0,0 +1,320 @@
+"""Worker-side FastAPI app.
+
+Protected by mTLS at the ASGI/uvicorn transport layer: uvicorn is started
+with ``--ssl-ca-certs`` + ``--ssl-cert-reqs 2`` (CERT_REQUIRED), so any
+client that cannot prove a cert signed by the DECNET CA is rejected before
+reaching a handler.  Once past the TLS handshake, all peers are trusted
+equally (the only entity holding a CA-signed cert is the master
+controller).
+
+Endpoints mirror the existing unihost CLI verbs:
+
+* ``POST /deploy``   — body: serialized ``DecnetConfig``
+* ``POST /teardown`` — body: optional ``{"decky_id": "..."}``
+* ``POST /mutate``   — body: ``{"decky_id": "...", "services": [...]}``
+* ``GET  /status``   — deployment snapshot
+* ``GET  /health``   — liveness probe; mTLS is still required (the
+  master pings it with its own cert).
+"""
+from __future__ import annotations
+
+import asyncio
+import os
+import pathlib
+from contextlib import asynccontextmanager
+from typing import Any, Optional
+
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, Field
+
+import contextlib
+
+from decnet.agent import executor as _exec
+from decnet.agent import heartbeat as _heartbeat
+from decnet.agent import topology_ops as _topology_ops
+from decnet.bus.factory import get_bus
+from decnet.bus.publish import run_health_heartbeat
+from decnet.swarm.pki import DEFAULT_AGENT_DIR
+from decnet.agent.topology_store import AlreadyApplied, TopologyStore
+from decnet.config import DecnetConfig
+from decnet.logging import get_logger
+from decnet.topology.validate import ValidationError
+
+log = get_logger("agent.app")
+
+
+def _resolve_agent_dir() -> pathlib.Path:
+    """Pick the agent directory.
+
+    Resolution order: the ``DECNET_AGENT_DIR`` environment variable when it
+    is set to a non-empty value, then ``/etc/decnet/agent`` if that path
+    exists, and finally ``DEFAULT_AGENT_DIR``.
+    """
+    override = os.environ.get("DECNET_AGENT_DIR")
+    if not override:
+        system_dir = pathlib.Path("/etc/decnet/agent")
+        return system_dir if system_dir.exists() else DEFAULT_AGENT_DIR
+    return pathlib.Path(override)
+
+
+# Module-level singleton.  Created lazily on first use so tests can
+# monkeypatch DECNET_AGENT_DIR before the store binds to a path.
+_topology_store: Optional[TopologyStore] = None
+
+
+def _store() -> TopologyStore:
+    """Return the module-level ``TopologyStore``, creating it on first use.
+
+    The store is bound to ``topology.db`` inside the directory returned by
+    ``_resolve_agent_dir()`` at the moment of the first call.
+    """
+    global _topology_store
+    current = _topology_store
+    if current is None:
+        current = TopologyStore(_resolve_agent_dir() / "topology.db")
+        _topology_store = current
+    return current
+
+
+_collector_task: Optional[asyncio.Task] = None
+
+
+def _ensure_collector_started() -> None:
+    """Spawn the log collector on demand — called from /topology/apply
+    after a successful materialise.  We must NOT start this in the
+    lifespan hook: the agent's boot invariant is "never touch docker
+    until master tells us to" (see tests/swarm/test_agent_no_auto_restore.py).
+
+    The collector watches ``decnet.topology.service=true`` labels via
+    docker events, writing RFC 5424 lines to ``DECNET_AGENT_LOG_FILE``
+    which the forwarder ships to the master over syslog-TLS.  Idempotent:
+    subsequent calls while the task is still running are no-ops.
+    """
+    global _collector_task
+    # A task that exists and has not finished means there is nothing to do.
+    # A finished (or crashed) task falls through and is replaced below.
+    if _collector_task is not None and not _collector_task.done():
+        return
+    # Imported inside the function rather than at module import time.
+    from decnet.env import DECNET_AGENT_LOG_FILE
+
+    try:
+        from decnet.collector.worker import log_collector_worker
+    except Exception:  # noqa: BLE001 — docker may be unavailable on dev
+        log.warning(
+            "agent log collector not starting — collector worker import failed",
+            exc_info=True,
+        )
+        return
+    # The handle is kept at module level so the lifespan shutdown path can
+    # cancel and await it.
+    _collector_task = asyncio.create_task(
+        log_collector_worker(DECNET_AGENT_LOG_FILE),
+        name="agent-log-collector",
+    )
+    log.info("agent log collector started log_file=%s", DECNET_AGENT_LOG_FILE)
+
+
+_bus_heartbeat_task: Optional[asyncio.Task] = None
+
+
+@asynccontextmanager
+async def _lifespan(app: FastAPI):
+    """FastAPI lifespan hook for the worker agent.
+
+    Startup: starts the master-facing heartbeat, tries to connect the
+    host-local bus (failure is logged and tolerated), and spawns the bus
+    health-heartbeat task.
+
+    Shutdown (always runs, via ``finally``): stops the heartbeat, cancels
+    the bus heartbeat task, closes the bus if one was connected, cancels
+    the log collector task if it is still running, and closes the
+    module-level topology store if it was ever created.
+    """
+    # Best-effort: if identity/bundle plumbing isn't configured (e.g. dev
+    # runs or non-enrolled hosts), heartbeat.start() is a silent no-op.
+    _heartbeat.start()
+
+    # Host-local bus heartbeat (system.agent.health).  Separate channel
+    # from the mTLS master-facing heartbeat above; this one lets peers on
+    # the same host (dashboard, updater) see the agent is alive without
+    # hitting its HTTPS endpoint.  Bus-disabled path is a no-op loop.
+    bus = None
+    try:
+        bus = get_bus(client_name="agent")
+        await bus.connect()
+    except Exception as exc:  # noqa: BLE001
+        log.warning("agent: bus unavailable, skipping health heartbeat: %s", exc)
+        # Reset so a bus object whose connect() failed is never used or closed.
+        bus = None
+
+    global _bus_heartbeat_task
+    # The task is spawned even when ``bus`` is None; run_health_heartbeat
+    # receives None in that case.
+    _bus_heartbeat_task = asyncio.create_task(
+        run_health_heartbeat(bus, "agent"),
+        name="agent-bus-heartbeat",
+    )
+
+    try:
+        yield
+    finally:
+        await _heartbeat.stop()
+        if _bus_heartbeat_task is not None:
+            _bus_heartbeat_task.cancel()
+            # Await the cancelled task so it finishes before the bus closes;
+            # any error it raises while unwinding is ignored.
+            with contextlib.suppress(asyncio.CancelledError, Exception):
+                await _bus_heartbeat_task
+            _bus_heartbeat_task = None
+        if bus is not None:
+            with contextlib.suppress(Exception):
+                await bus.close()
+        global _collector_task
+        # The collector only exists if /topology/apply spawned it.
+        if _collector_task is not None and not _collector_task.done():
+            _collector_task.cancel()
+            try:
+                await _collector_task
+            except (asyncio.CancelledError, Exception):  # noqa: BLE001
+                pass
+        _collector_task = None
+        global _topology_store
+        # Drop the singleton so a later _store() call builds a fresh one.
+        if _topology_store is not None:
+            _topology_store.close()
+            _topology_store = None
+
+
+# Module-level ASGI application.  Interactive docs and the OpenAPI schema
+# endpoint are all disabled; startup/shutdown work lives in ``_lifespan``.
+# The ``responses`` mapping supplies default 400/500 descriptions.
+app = FastAPI(
+    title="DECNET SWARM Agent",
+    version="0.1.0",
+    docs_url=None,    # no interactive docs on worker — narrow attack surface
+    redoc_url=None,
+    openapi_url=None,
+    lifespan=_lifespan,
+    responses={
+        400: {"description": "Malformed request body"},
+        500: {"description": "Executor error"},
+    },
+)
+
+
+# ------------------------------------------------------------------ schemas
+
+# Request body for ``POST /deploy``: the config to materialise plus two
+# flags that are passed straight through to the executor's deploy().
+class DeployRequest(BaseModel):
+    config: DecnetConfig = Field(..., description="Full DecnetConfig to materialise on this worker")
+    dry_run: bool = False
+    no_cache: bool = False
+
+
+# Request body for ``POST /teardown``.  ``decky_id`` is optional; when it is
+# None the executor also clears the saved deployment state.
+class TeardownRequest(BaseModel):
+    decky_id: Optional[str] = None
+
+
+# Request body for ``POST /mutate``.  The handler currently answers 501
+# without reading either field.
+class MutateRequest(BaseModel):
+    decky_id: str
+    services: list[str]
+
+
+# ------------------------------------------------------------------ routes
+
+@app.get("/health")
+async def health() -> dict[str, str]:
+    """Liveness probe: always answers ``{"status": "ok"}``."""
+    return {"status": "ok"}
+
+
+@app.get("/status")
+async def status() -> dict:
+    """Return the executor's deployment snapshot unchanged."""
+    return await _exec.status()
+
+
+@app.post(
+    "/deploy",
+    responses={500: {"description": "Deployer raised an exception materialising the config"}},
+)
+async def deploy(req: DeployRequest) -> dict:
+    """Materialise ``req.config`` on this worker through the executor.
+
+    Any exception raised by the executor is logged with its traceback and
+    surfaced as HTTP 500 with the exception text as ``detail``.  On success
+    the reply carries the number of deckies in the submitted config.
+    """
+    cfg = req.config
+    try:
+        await _exec.deploy(cfg, dry_run=req.dry_run, no_cache=req.no_cache)
+    except Exception as err:
+        log.exception("agent.deploy failed")
+        raise HTTPException(status_code=500, detail=str(err)) from err
+    else:
+        return {"status": "deployed", "deckies": len(cfg.deckies)}
+
+
+@app.post(
+    "/teardown",
+    responses={500: {"description": "Teardown raised an exception"}},
+)
+async def teardown(req: TeardownRequest) -> dict:
+    """Tear down via the executor, passing ``req.decky_id`` (may be None).
+
+    Executor failures are logged and reported as HTTP 500; on success the
+    requested ``decky_id`` is echoed back.
+    """
+    target = req.decky_id
+    try:
+        await _exec.teardown(target)
+    except Exception as err:
+        log.exception("agent.teardown failed")
+        raise HTTPException(status_code=500, detail=str(err)) from err
+    return {"status": "torn_down", "decky_id": target}
+
+
+@app.post(
+    "/self-destruct",
+    responses={500: {"description": "Reaper could not be scheduled"}},
+)
+async def self_destruct() -> dict:
+    """Stop all DECNET services on this worker and delete the install
+    footprint. Called by the master during decommission. Logs under
+    /var/log/decnet* are preserved. Fire-and-forget — the response is
+    returned before the reaper starts deleting files.
+
+    NOTE: the route declares no ``status_code``, so a successful call is
+    answered with FastAPI's default status (200), not 202. Failures while
+    scheduling the reaper are logged and reported as HTTP 500."""
+    try:
+        await _exec.self_destruct()
+    except Exception as exc:
+        log.exception("agent.self_destruct failed")
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+    return {"status": "self_destruct_scheduled"}
+
+
+# ------------------------------------------------------- topology endpoints
+
+
+# Request body for ``POST /topology/apply``: the hydrated topology dict and
+# the hash the master computed for it.  Both fields are required.
+class ApplyTopologyRequest(BaseModel):
+    hydrated: dict[str, Any] = Field(
+        ..., description="Hydrated topology dict from master.persistence.hydrate()"
+    )
+    version_hash: str = Field(
+        ..., description="Master's canonical_hash(hydrated); must match ours"
+    )
+
+
+# Request body for ``POST /topology/teardown``; ``topology_id`` is required.
+class TeardownTopologyRequest(BaseModel):
+    topology_id: str = Field(..., description="Topology UUID to dismantle")
+
+
+@app.post(
+    "/topology/apply",
+    responses={
+        400: {"description": "Malformed hydrated topology or hash mismatch"},
+        409: {"description": "A different topology is already applied"},
+        500: {"description": "Docker or compose raised while applying"},
+    },
+)
+async def topology_apply(req: ApplyTopologyRequest) -> dict:
+    """Apply a hydrated topology on this worker.
+
+    Error mapping: ``HashMismatch`` and ``ValidationError`` become HTTP 400,
+    ``AlreadyApplied`` becomes 409, and anything else becomes 500.  For the
+    500 case the error text (first 500 characters) is also recorded in the
+    local topology store when the payload carries a topology id.  After a
+    successful apply the log collector is started if it is not running.
+    """
+    store = _store()
+    try:
+        await _topology_ops.apply(req.hydrated, req.version_hash, store)
+    except _topology_ops.HashMismatch as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+    except ValidationError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+    except AlreadyApplied as exc:
+        raise HTTPException(status_code=409, detail=str(exc)) from exc
+    except Exception as exc:
+        log.exception("agent.topology_apply failed")
+        # Tolerates a missing or falsy "topology" entry in the payload.
+        topology_id = (req.hydrated.get("topology") or {}).get("id")
+        if topology_id:
+            try:
+                store.record_error(
+                    str(topology_id), str(exc)[:500], hydrated=req.hydrated,
+                )
+            except Exception:  # noqa: BLE001 — don't mask original failure
+                log.exception("failed to record apply error")
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+    # Only reached on success: the collector is never started for a failed apply.
+    _ensure_collector_started()
+    return {"status": "applied", "version_hash": req.version_hash}
+
+
+@app.post(
+    "/topology/teardown",
+    responses={500: {"description": "Docker or compose raised while tearing down"}},
+)
+async def topology_teardown(req: TeardownTopologyRequest) -> dict:
+    """Dismantle the topology named by ``req.topology_id``.
+
+    Any failure (including obtaining the store) is logged and reported as
+    HTTP 500; on success the topology id is echoed back.
+    """
+    topology_id = req.topology_id
+    try:
+        await _topology_ops.teardown(topology_id, _store())
+    except Exception as err:
+        log.exception("agent.topology_teardown failed")
+        raise HTTPException(status_code=500, detail=str(err)) from err
+    return {"status": "torn_down", "topology_id": topology_id}
+
+
+@app.get("/topology/state")
+async def topology_state() -> dict:
+    """Return ``_topology_ops.state()`` for the local topology store.
+
+    No exception handling is done here, so errors propagate to FastAPI.
+    """
+    return _topology_ops.state(_store())
+
+
+@app.post(
+    "/mutate",
+    responses={501: {"description": "Worker-side mutate not yet implemented"}},
+)
+async def mutate(req: MutateRequest) -> dict:
+    """Placeholder endpoint: always raises HTTP 501.
+
+    The request body is validated against ``MutateRequest`` but ``req`` is
+    otherwise unused.
+    """
+    # TODO: implement worker-side mutate. Currently the master performs
+    # mutation by re-sending a full /deploy with the updated DecnetConfig;
+    # this avoids duplicating mutation logic on the worker for v1. When
+    # ready, replace the 501 with a real redeploy-of-a-single-decky path.
+    raise HTTPException(
+        status_code=501,
+        detail="Per-decky mutate is performed via /deploy with updated services",
+    )
--- a/decnet/agent/executor.py
+++ b/decnet/agent/executor.py
@@ -0,0 +1,223 @@
+"""Thin adapter between the agent's HTTP endpoints and the existing
+``decnet.engine.deployer`` code path.
+
+Kept deliberately small: the agent does not re-implement deployment logic,
+it only translates a master RPC into the same function calls the unihost
+CLI already uses.  Everything runs in a worker thread (the deployer is
+blocking) so the FastAPI event loop stays responsive.
+"""
+from __future__ import annotations
+
+import asyncio
+from ipaddress import IPv4Network
+from typing import Any
+
+from decnet.engine import deployer as _deployer
+from decnet.config import DecnetConfig, load_state, clear_state
+from decnet.logging import get_logger
+from decnet.network import (
+    allocate_ips,
+    detect_interface,
+    detect_subnet,
+    get_host_ip,
+)
+
+log = get_logger("agent.executor")
+
+
+def _relocalize(config: DecnetConfig) -> DecnetConfig:
+    """Rewrite a master-built config to the worker's local network reality.
+
+    The master populates ``interface``/``subnet``/``gateway`` from its own
+    box before dispatching, which blows up the deployer on any worker whose
+    NIC name differs (common in heterogeneous fleets — master on ``wlp6s0``,
+    worker on ``enp0s3``). We always re-detect locally; if the worker sits
+    on a different subnet than the master, decky IPs are re-allocated from
+    the worker's subnet so they're actually reachable.
+
+    Returns a copy of ``config``; the input object is not modified.
+
+    Raises:
+        RuntimeError: if re-allocation yields fewer IPs than there are
+            deckies.  Pairing them up anyway would silently drop the
+            surplus deckies from the deployed config.
+    """
+    local_iface = detect_interface()
+    local_subnet, local_gateway = detect_subnet(local_iface)
+    local_host_ip = get_host_ip(local_iface)
+
+    updates: dict[str, Any] = {
+        "interface": local_iface,
+        "subnet": local_subnet,
+        "gateway": local_gateway,
+    }
+
+    # strict=False lets a host address with a prefix length stand in for
+    # its containing network.
+    master_net = IPv4Network(config.subnet, strict=False) if config.subnet else None
+    local_net = IPv4Network(local_subnet, strict=False)
+    if master_net is None or master_net != local_net:
+        log.info(
+            "agent.deploy subnet mismatch master=%s local=%s — re-allocating decky IPs",
+            config.subnet, local_subnet,
+        )
+        fresh_ips = list(
+            allocate_ips(
+                subnet=local_subnet,
+                gateway=local_gateway,
+                host_ip=local_host_ip,
+                count=len(config.deckies),
+            )
+        )
+        # zip() stops at the shorter input, so a short allocation would
+        # otherwise shrink the decky list without any error.
+        if len(fresh_ips) < len(config.deckies):
+            raise RuntimeError(
+                f"allocated only {len(fresh_ips)} IPs for "
+                f"{len(config.deckies)} deckies on {local_subnet}"
+            )
+        new_deckies = [d.model_copy(update={"ip": ip}) for d, ip in zip(config.deckies, fresh_ips)]
+        updates["deckies"] = new_deckies
+
+    return config.model_copy(update=updates)
+
+
+async def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False) -> None:
+    """Hand ``config`` to the blocking deployer on a worker thread.
+
+    Configs whose ``mode`` is ``"swarm"`` are first rewritten by
+    ``_relocalize`` so they use this host's interface/subnet/gateway.
+    State persistence is left to the deployer (it is expected to call
+    ``save_state()`` itself — that happens outside this module).
+    """
+    log.info(
+        "agent.deploy mode=%s deckies=%d interface=%s (incoming)",
+        config.mode, len(config.deckies), config.interface,
+    )
+    effective = config
+    if config.mode == "swarm":
+        effective = _relocalize(config)
+        log.info(
+            "agent.deploy relocalized interface=%s subnet=%s gateway=%s",
+            effective.interface, effective.subnet, effective.gateway,
+        )
+    # The trailing False is the deployer's fourth positional parameter —
+    # TODO confirm its meaning against decnet.engine.deployer.deploy.
+    await asyncio.to_thread(_deployer.deploy, effective, dry_run, no_cache, False)
+
+
+async def teardown(decky_id: str | None = None) -> None:
+    """Run the blocking deployer teardown on a worker thread.
+
+    When ``decky_id`` is None, the saved deployment state is cleared as
+    well after the teardown returns.
+    """
+    log.info("agent.teardown decky_id=%s", decky_id)
+    await asyncio.to_thread(_deployer.teardown, decky_id)
+    if decky_id is not None:
+        return
+    await asyncio.to_thread(clear_state)
+
+
+def _decky_runtime_states(config: DecnetConfig) -> dict[str, dict[str, Any]]:
+    """Map decky_name → {"running": bool, "services": {svc: container_state}}.
+
+    Queried so the master can tell, after a partial-failure deploy, which
+    deckies actually came up instead of tainting the whole shard as failed.
+    Best-effort: a docker error returns an empty map, not an exception.
+
+    A service's container is looked up by the name
+    ``<decky name>-<service with "_" replaced by "-">``; a missing
+    container is reported as ``"absent"``.  A decky counts as running only
+    if it has at least one service and every service is ``"running"``.
+    """
+    try:
+        import docker  # local import — agent-only path
+        client = docker.from_env()
+        try:
+            live = {c.name: c.status for c in client.containers.list(all=True, ignore_removed=True)}
+        finally:
+            # This runs on every status/heartbeat call, so release the
+            # client's connections instead of waiting for garbage collection.
+            try:
+                client.close()
+            except Exception:  # noqa: BLE001 — closing is best-effort
+                log.debug("_decky_runtime_states: docker client close failed", exc_info=True)
+    except Exception:  # pragma: no cover — defensive
+        log.exception("_decky_runtime_states: docker query failed")
+        return {}
+
+    out: dict[str, dict[str, Any]] = {}
+    for d in config.deckies:
+        svc_states = {
+            svc: live.get(f"{d.name}-{svc.replace('_', '-')}", "absent")
+            for svc in d.services
+        }
+        out[d.name] = {
+            "running": bool(svc_states) and all(s == "running" for s in svc_states.values()),
+            "services": svc_states,
+        }
+    return out
+
+
+# Bash source of the self-destruct reaper.  ``self_destruct`` writes this
+# text to a temp file under /tmp and runs it with /bin/bash.  The script
+# body is runtime data, so it is kept verbatim.
+_REAPER_SCRIPT = r"""#!/bin/bash
+# DECNET agent self-destruct reaper.
+# Runs detached from the agent process so it survives the agent's death.
+# Waits briefly for the HTTP response to drain, then stops services,
+# wipes install paths, and preserves logs.
+set +e
+
+sleep 3
+
+# Stop decky containers started by the local deployer (best-effort).
+if command -v docker >/dev/null 2>&1; then
+    docker ps -q --filter "label=com.docker.compose.project=decnet" | xargs -r docker stop
+    docker ps -aq --filter "label=com.docker.compose.project=decnet" | xargs -r docker rm -f
+    docker network rm decnet_lan 2>/dev/null
+fi
+
+# Stop+disable every systemd unit the installer may have dropped.
+for unit in decnet-agent decnet-engine decnet-collector decnet-forwarder decnet-prober decnet-reconciler decnet-sniffer decnet-updater; do
+    systemctl stop "$unit" 2>/dev/null
+    systemctl disable "$unit" 2>/dev/null
+done
+
+# Nuke install paths. Logs under /var/log/decnet* are intentionally
+# preserved — the operator typically wants them for forensic review.
+rm -rf /opt/decnet* /var/lib/decnet/* /usr/local/bin/decnet* /etc/decnet
+rm -f /etc/systemd/system/decnet-*.service /etc/systemd/system/decnet-*.timer
+
+systemctl daemon-reload 2>/dev/null
+rm -f "$0"
+"""
+
+
+async def self_destruct() -> None:
+    """Tear down deckies, then spawn a detached reaper that wipes the
+    install footprint. Returns immediately so the HTTP response can drain
+    before the reaper starts deleting files out from under the agent.
+
+    A failing pre-reap teardown is logged and does not stop the reaper
+    from being spawned. Errors while writing or launching the reaper
+    script propagate to the caller."""
+    import os
+    import shutil
+    import subprocess  # nosec B404
+    import tempfile
+
+    # Best-effort teardown first — the reaper also runs docker stop, but
+    # going through the deployer gives the host-macvlan/ipvlan helper a
+    # chance to clean up routes cleanly.
+    try:
+        await asyncio.to_thread(_deployer.teardown, None)
+        await asyncio.to_thread(clear_state)
+    except Exception:
+        log.exception("self_destruct: pre-reap teardown failed — reaper will force-stop containers")
+
+    # Reaper lives under /tmp so it survives rm -rf /opt/decnet*.
+    fd, path = tempfile.mkstemp(prefix="decnet-reaper-", suffix=".sh", dir="/tmp")  # nosec B108 — reaper must outlive /opt/decnet removal
+    try:
+        os.write(fd, _REAPER_SCRIPT.encode())
+    finally:
+        os.close(fd)
+    os.chmod(path, 0o700)  # nosec B103 — root-owned reaper, needs exec
+
+    # The reaper MUST run outside decnet-agent.service's cgroup — otherwise
+    # `systemctl stop decnet-agent` SIGTERMs the whole cgroup (reaper included)
+    # before rm -rf completes. `start_new_session=True` gets us a fresh POSIX
+    # session but does NOT escape the systemd cgroup. So we prefer
+    # `systemd-run`, falling back to a bare Popen if systemd-run is
+    # unavailable (non-systemd host / container).
+    # NOTE: no `--scope` flag is passed below, so per systemd-run(1) the
+    # script is started as a transient *service* unit by the service
+    # manager, not as a scope.
+    systemd_run = shutil.which("systemd-run")
+    if systemd_run:
+        argv = [
+            systemd_run,
+            "--collect",
+            "--unit", f"decnet-reaper-{os.getpid()}",
+            "--description", "DECNET agent self-destruct reaper",
+            "/bin/bash", path,
+        ]
+        spawn_kwargs = {"start_new_session": True}
+    else:
+        argv = ["/bin/bash", path]
+        # Same kwargs as the systemd-run branch; only argv differs.
+        spawn_kwargs = {"start_new_session": True}
+
+    # Fire-and-forget: the Popen handle is not kept and never waited on.
+    subprocess.Popen(  # nosec B603
+        argv,
+        stdin=subprocess.DEVNULL,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+        close_fds=True,
+        **spawn_kwargs,
+    )
+    log.warning(
+        "self_destruct: reaper spawned path=%s via=%s — agent will die in ~3s",
+        path, "systemd-run" if systemd_run else "popen",
+    )
+
+
+async def status() -> dict[str, Any]:
+    """Build the deployment snapshot reported to the master.
+
+    With no saved state the result is ``{"deployed": False, "deckies": []}``.
+    Otherwise it carries the mode, compose path, each decky's
+    ``model_dump()`` and the per-decky runtime map.  Both the state load
+    and the docker query run on worker threads.
+    """
+    loaded = await asyncio.to_thread(load_state)
+    if loaded is None:
+        return {"deployed": False, "deckies": []}
+
+    config, compose_path = loaded
+    runtime = await asyncio.to_thread(_decky_runtime_states, config)
+
+    snapshot: dict[str, Any] = {"deployed": True}
+    snapshot["mode"] = config.mode
+    snapshot["compose_path"] = str(compose_path)
+    snapshot["deckies"] = [decky.model_dump() for decky in config.deckies]
+    snapshot["runtime"] = runtime
+    return snapshot
--- a/decnet/agent/heartbeat.py
+++ b/decnet/agent/heartbeat.py
@@ -0,0 +1,146 @@
+"""Agent → master liveness heartbeat loop.
+
+Every ``INTERVAL_S`` seconds the worker posts ``executor.status()`` to
+``POST <master>/swarm/heartbeat`` over mTLS. The master pins the
+presented client cert's SHA-256 against the ``SwarmHost`` row for the
+claimed ``host_uuid``; a match refreshes ``last_heartbeat`` + each
+``DeckyShard``'s snapshot + runtime state.
+
+Identity comes from ``/etc/decnet/decnet.ini`` (seeded by the enroll
+bundle) — specifically ``DECNET_HOST_UUID`` and ``DECNET_MASTER_HOST``.
+The worker's existing ``~/.decnet/agent/`` bundle (or
+``/etc/decnet/agent/``) provides the mTLS client cert.
+
+Started/stopped via the agent FastAPI app's lifespan. If identity
+plumbing is missing (pre-enrollment dev runs) the loop logs at DEBUG and
+declines to start — callers don't have to guard it.
+"""
+from __future__ import annotations
+
+import asyncio
+import pathlib
+from typing import Optional
+
+import httpx
+
+from decnet.agent import executor as _exec
+from decnet.logging import get_logger
+from decnet.swarm import pki
+from decnet.swarm.log_forwarder import build_worker_ssl_context
+
+log = get_logger("agent.heartbeat")
+
+INTERVAL_S = 30.0
+_TIMEOUT = httpx.Timeout(connect=5.0, read=10.0, write=5.0, pool=5.0)
+
+_task: Optional[asyncio.Task] = None
+
+
+def _resolve_agent_dir() -> pathlib.Path:
+    """Resolve the agent directory in the same order as the agent server:
+    ``DECNET_AGENT_DIR`` from the environment, then ``/etc/decnet/agent``
+    (production install) if it exists, then ``pki.DEFAULT_AGENT_DIR``
+    (``~/.decnet/agent``, dev)."""
+    import os
+
+    override = os.environ.get("DECNET_AGENT_DIR")
+    if not override:
+        system_dir = pathlib.Path("/etc/decnet/agent")
+        return system_dir if system_dir.exists() else pki.DEFAULT_AGENT_DIR
+    return pathlib.Path(override)
+
+
+async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_version: str) -> None:
+    """Send one heartbeat: POST the executor status (plus, when available,
+    the applied-topology state) as JSON to ``url``.
+
+    Only HTTP 204 is treated as success; any other status is logged at
+    WARNING with the first 200 characters of the response body.  Errors
+    from ``_exec.status()`` or the POST itself propagate to the caller.
+    """
+    snap = await _exec.status()
+    body: dict = {
+        "host_uuid": host_uuid,
+        "agent_version": agent_version,
+        "status": snap,
+    }
+    # Best-effort: fold in applied-topology snapshot. Failures must never
+    # wedge the heartbeat loop — master will fall back to "no topology
+    # reported" which triggers a resync if it expected one.
+    try:
+        from decnet.agent import topology_ops as _topo_ops
+        from decnet.agent.topology_store import TopologyStore
+        # A fresh store is opened and closed on every tick.
+        store = TopologyStore(_resolve_agent_dir() / "topology.db")
+        try:
+            body["topology"] = _topo_ops.state(store)
+        finally:
+            store.close()
+    except Exception:
+        log.debug("heartbeat: topology state unavailable", exc_info=True)
+
+    resp = await client.post(url, json=body)
+    # 403 / 404 are terminal-ish — we still keep looping because an
+    # operator may re-enrol the host mid-session, but we log loudly so
+    # prod ops can spot cert-pinning drift.
+    if resp.status_code == 204:
+        return
+    log.warning(
+        "heartbeat rejected status=%d body=%s",
+        resp.status_code, resp.text[:200],
+    )
+
+
+async def _loop(url: str, host_uuid: str, agent_version: str, ssl_ctx) -> None:
+    """Run ``_tick`` forever, sleeping ``INTERVAL_S`` seconds between ticks.
+
+    One ``httpx.AsyncClient`` (verifying with ``ssl_ctx``) is shared by all
+    ticks.  A failing tick is logged and retried after the normal interval;
+    only cancellation ends the loop.
+    """
+    log.info("heartbeat loop starting url=%s host_uuid=%s interval=%ss",
+             url, host_uuid, INTERVAL_S)
+    async with httpx.AsyncClient(verify=ssl_ctx, timeout=_TIMEOUT) as client:
+        while True:
+            try:
+                await _tick(client, url, host_uuid, agent_version)
+            except asyncio.CancelledError:
+                # Re-raised explicitly so the broad handler below can never
+                # swallow a cancellation.
+                raise
+            except Exception:
+                log.exception("heartbeat tick failed — will retry in %ss", INTERVAL_S)
+            await asyncio.sleep(INTERVAL_S)
+
+
+def start() -> Optional[asyncio.Task]:
+    """Kick off the background heartbeat task. No-op if identity is
+    unconfigured (dev mode) — the caller doesn't need to check.
+
+    Returns the running task (an already-running one is reused), or None
+    when the heartbeat was not started because ``DECNET_HOST_UUID`` /
+    ``DECNET_MASTER_HOST`` is unset or the worker SSL context could not be
+    built.  Calls ``asyncio.create_task``, so it needs a running event loop
+    whenever a task is actually created."""
+    global _task
+    from decnet.env import (
+        DECNET_HOST_UUID,
+        DECNET_MASTER_HOST,
+        DECNET_SWARMCTL_PORT,
+    )
+
+    # Idempotent: reuse a task that is still alive.
+    if _task is not None and not _task.done():
+        return _task
+    if not DECNET_HOST_UUID or not DECNET_MASTER_HOST:
+        log.debug("heartbeat not starting — DECNET_HOST_UUID or DECNET_MASTER_HOST unset")
+        return None
+
+    agent_dir = _resolve_agent_dir()
+    try:
+        ssl_ctx = build_worker_ssl_context(agent_dir)
+    except Exception:
+        log.exception("heartbeat not starting — worker SSL context unavailable at %s", agent_dir)
+        return None
+
+    # The version is informational only; fall back rather than fail.
+    try:
+        from decnet import __version__ as _v
+        agent_version = _v
+    except Exception:
+        agent_version = "unknown"
+
+    url = f"https://{DECNET_MASTER_HOST}:{DECNET_SWARMCTL_PORT}/swarm/heartbeat"
+    _task = asyncio.create_task(
+        _loop(url, DECNET_HOST_UUID, agent_version, ssl_ctx),
+        name="agent-heartbeat",
+    )
+    return _task
+
+
+async def stop() -> None:
+    """Cancel the heartbeat task, if one exists, and wait for it to end.
+
+    Whatever the task raises while unwinding (including the cancellation
+    itself) is swallowed; afterwards the module-level handle is cleared.
+    """
+    global _task
+    running = _task
+    if running is None:
+        return
+    running.cancel()
+    try:
+        await running
+    except (asyncio.CancelledError, Exception):
+        pass
+    _task = None
--- a/decnet/agent/server.py
+++ b/decnet/agent/server.py
@@ -0,0 +1,70 @@
+"""Worker-agent uvicorn launcher.
+
+Starts ``decnet.agent.app:app`` over HTTPS with mTLS enforcement.  The
+worker must already have a bundle in ``~/.decnet/agent/`` (delivered by
+``decnet swarm enroll`` from the master); if it does not, we refuse to
+start — unauthenticated agents are not a supported mode.
+"""
+from __future__ import annotations
+
+import os
+import pathlib
+import signal
+import subprocess  # nosec B404
+import sys
+
+from decnet.logging import get_logger
+from decnet.swarm import pki
+
+log = get_logger("agent.server")
+
+
+def run(host: str, port: int, agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR) -> int:
+    """Launch uvicorn for ``decnet.agent.app:app`` with mTLS and wait for it.
+
+    Args:
+        host: address passed to uvicorn's ``--host``.
+        port: port passed to uvicorn's ``--port``.
+        agent_dir: directory expected to hold ``worker.key``,
+            ``worker.crt`` and ``ca.crt``.
+
+    Returns:
+        2 if no bundle could be loaded from ``agent_dir``; otherwise the
+        uvicorn child's exit code, or 0 if the child's process group was
+        already gone when handling Ctrl+C.
+    """
+    # The bundle is loaded only as an existence check; uvicorn is given
+    # file paths below rather than the loaded object.
+    bundle = pki.load_worker_bundle(agent_dir)
+    if bundle is None:
+        print(
+            f"[agent] No cert bundle at {agent_dir}. "
+            f"Run `decnet swarm enroll` from the master first.",
+            file=sys.stderr,
+        )
+        return 2
+
+    keyfile = agent_dir / "worker.key"
+    certfile = agent_dir / "worker.crt"
+    cafile = agent_dir / "ca.crt"
+
+    cmd = [
+        sys.executable,
+        "-m",
+        "uvicorn",
+        "decnet.agent.app:app",
+        "--host",
+        host,
+        "--port",
+        str(port),
+        "--ssl-keyfile",
+        str(keyfile),
+        "--ssl-certfile",
+        str(certfile),
+        "--ssl-ca-certs",
+        str(cafile),
+        # 2 == ssl.CERT_REQUIRED — clients MUST present a CA-signed cert.
+        "--ssl-cert-reqs",
+        "2",
+    ]
+    log.info("agent starting host=%s port=%d bundle=%s", host, port, agent_dir)
+    # Own process group for clean Ctrl+C / SIGTERM propagation to uvicorn
+    # workers (same pattern as `decnet api`).
+    proc = subprocess.Popen(cmd, start_new_session=True)  # nosec B603
+    try:
+        return proc.wait()
+    except KeyboardInterrupt:
+        try:
+            # start_new_session=True makes the child a group leader, so its
+            # pid doubles as the process-group id for killpg.
+            os.killpg(proc.pid, signal.SIGTERM)
+            try:
+                return proc.wait(timeout=10)
+            except subprocess.TimeoutExpired:
+                # Grace period elapsed: escalate to SIGKILL.
+                os.killpg(proc.pid, signal.SIGKILL)
+                return proc.wait()
+        except ProcessLookupError:
+            return 0
--- a/decnet/agent/topology_ops.py
+++ b/decnet/agent/topology_ops.py
@@ -0,0 +1,208 @@
+"""Agent-side topology apply/teardown/state primitives.
+
+Wraps the compose + bridge machinery from :mod:`decnet.engine.deployer`
+so the agent can drive a topology without ever touching the master's
+sqlmodel repo.  The master-side ``deploy_topology`` always calls
+``transition_status(repo, …)`` which is useless (and unreachable) on
+an agent — here we operate purely on a hydrated dict + the local
+:class:`TopologyStore`.
+
+v1 constraint: one topology per agent.  A second apply for a different
+``topology_id`` triggers an on-the-spot teardown of the predecessor
+before the new apply proceeds — master is authoritative.
+"""
+from __future__ import annotations
+
+import asyncio
+import subprocess  # nosec B404
+from typing import Any
+
+import docker
+
+from decnet.agent.topology_store import (
+    TopologyStore,
+    observed,
+)
+from decnet.engine.deployer import (
+    _compose,
+    _compose_with_retry,
+    _teardown_order,
+    _topology_compose_path,
+)
+from decnet.logging import get_logger
+from decnet.network import create_bridge_network, remove_bridge_network
+from decnet.topology.compose import (
+    _network_name as _topology_network_name,
+    write_topology_compose,
+)
+from decnet.topology.hashing import canonical_hash
+from decnet.topology.validate import (
+    ValidationError,
+    errors as _validation_errors,
+    validate as _validate_topology,
+)
+
+log = get_logger("agent.topology_ops")
+
+
+class HashMismatch(RuntimeError):
+    """Master-provided ``version_hash`` differs from the locally computed one.
+
+    Raised by :func:`apply` when ``canonical_hash(hydrated)`` does not
+    equal the hash the master sent — suggests serialisation drift.  We
+    fail loudly rather than silently papering over a schema mismatch.
+    """
+
+
+def _topology_id(hydrated: dict[str, Any]) -> str:
+    """Return ``hydrated["topology"]["id"]`` as a string.
+
+    Raises:
+      ValueError: the ``topology`` entry or its ``id`` is missing or falsy.
+    """
+    section = hydrated.get("topology")
+    identifier = section.get("id") if section else None
+    if identifier:
+        return str(identifier)
+    raise ValueError("hydrated topology missing topology.id")
+
+
+async def apply(
+    hydrated: dict[str, Any],
+    version_hash: str,
+    store: TopologyStore,
+) -> None:
+    """Bring *hydrated* up on this agent, then record it in *store*.
+
+    Steps, in order: verify the hash, validate the topology, supersede
+    any different topology currently pinned in *store*, create the LAN
+    bridges, write the compose file and ``compose up``, and finally
+    ``store.put`` the result.
+
+    Raises:
+      HashMismatch: master and agent disagree on the canonical hash —
+        nothing in docker is touched.
+      ValidationError: topology fails structural validation.
+      Any docker / compose error propagates up; the endpoint maps it
+        to 500 and records the message on the store row.
+    """
+    computed_hash = canonical_hash(hydrated)
+    if computed_hash != version_hash:
+        raise HashMismatch(
+            f"master hash {version_hash!r} does not match agent hash "
+            f"{computed_hash!r} — refusing to apply"
+        )
+
+    findings = _validate_topology(hydrated)
+    if _validation_errors(findings):
+        raise ValidationError(findings)
+
+    topology_id = _topology_id(hydrated)
+
+    # The master is authoritative.  Whatever state a *different* pinned
+    # topology is in — fully applied, partially applied (failure marker
+    # row + orphan containers), or drifted — it is torn down before the
+    # new one is accepted.  Answering 409 instead would leave the agent
+    # stuck until a human intervened.
+    pinned = store.current()
+    must_supersede = pinned is not None and pinned.topology_id != topology_id
+    if must_supersede:
+        log.info(
+            "superseding topology %s with %s on master authority",
+            pinned.topology_id, topology_id,
+        )
+        try:
+            await teardown(pinned.topology_id, store)
+        except Exception as exc:  # noqa: BLE001 — we still want to try applying
+            log.warning(
+                "best-effort teardown of superseded topology %s failed: %s",
+                pinned.topology_id, exc,
+            )
+            # Drop the row regardless so a half-torn-down predecessor
+            # cannot block the new apply.  Any docker leftovers show up
+            # in the next heartbeat's observed block.
+            store.clear(pinned.topology_id)
+
+    lans = hydrated["lans"]
+    compose_path = _topology_compose_path(topology_id)
+    client = docker.from_env()
+
+    # ``--always-recreate-deps`` keeps the services' netns shares fresh.
+    # Each decky service joins its base's netns through
+    # ``network_mode: container:<base>``, bound when the service starts.
+    # When a base is recreated (e.g. ``ports:`` changes after toggling
+    # ``forwards_l3``) while compose considers the services unchanged,
+    # they keep a stale netns FD to the destroyed base: an empty
+    # namespace with only ``lo``, while external traffic hits a closed
+    # port on the live base.  Recreating dependents together with the
+    # base is the cheapest way to rule that race out.
+    up_args = ("up", "--build", "-d", "--always-recreate-deps")
+
+    def _materialise() -> None:
+        """Blocking half of the apply: bridges, compose file, compose up."""
+        for lan in lans:
+            bridge = _topology_network_name(topology_id, lan["name"])
+            is_internal = not lan["is_dmz"]
+            create_bridge_network(
+                client, bridge, lan["subnet"], internal=is_internal
+            )
+        write_topology_compose(hydrated, compose_path)
+        _compose_with_retry(*up_args, compose_file=compose_path)
+
+    # Docker and compose calls block; run them on a worker thread so a
+    # slow daemon does not stall the event loop.
+    await asyncio.to_thread(_materialise)
+
+    store.put(topology_id, version_hash, hydrated)
+    log.info(
+        "topology %s applied on agent (%d LANs)", topology_id, len(lans)
+    )
+
+
+async def teardown(
+    topology_id: str,
+    store: TopologyStore,
+) -> None:
+    """Tear down *topology_id* on this agent.
+
+    Idempotent: with no matching store row and no compose file this is
+    a no-op that still returns cleanly.
+
+    Steps: ``compose down`` if the compose file exists (a failure is
+    logged and ignored), remove the per-LAN bridges when the stored blob
+    lists them, delete the compose file, then clear the store row.
+
+    Args:
+      topology_id: topology to remove.
+      store: local cache; its row for *topology_id* is cleared at the end.
+    """
+    row = store.current()
+    # Prefer the stored hydrated blob — it's what we applied with.  When
+    # there is no matching row we can still compose-down and delete the
+    # compose file, but we have no LAN list to remove bridges from.
+    hydrated = row.hydrated if row and row.topology_id == topology_id else None
+    # A failure-marker row recorded without a blob hydrates to ``{}``.
+    # Indexing ``["lans"]`` on it would raise KeyError after compose-down,
+    # leaving the compose file and the store row behind, so treat an
+    # empty or LAN-less blob the same as "no blob".
+    lans = hydrated.get("lans") if hydrated else None
+    compose_path = _topology_compose_path(topology_id)
+    client = docker.from_env()
+
+    def _dismantle() -> None:
+        """Blocking half of the teardown; runs on a worker thread."""
+        if compose_path.exists():
+            try:
+                _compose("down", "--remove-orphans", compose_file=compose_path)
+            except subprocess.CalledProcessError as exc:
+                log.warning(
+                    "topology %s compose down failed (continuing): %s",
+                    topology_id, exc,
+                )
+        if lans is not None:
+            for lan_name in _teardown_order(lans):
+                net_name = _topology_network_name(topology_id, lan_name)
+                remove_bridge_network(client, net_name)
+        # missing_ok avoids the exists()/unlink() race and the second stat.
+        compose_path.unlink(missing_ok=True)
+
+    await asyncio.to_thread(_dismantle)
+    store.clear(topology_id)
+    log.info("topology %s torn down on agent", topology_id)
+
+
+def state(store: TopologyStore) -> dict[str, Any]:
+    """Combine the stored row with a live docker observation.
+
+    This is the shape the heartbeat embeds.  The four row-derived keys
+    are ``None`` when the store is idle; ``observed`` is always present
+    and becomes ``{"error": ...}`` (truncated to 200 chars) if the
+    docker client cannot be created.
+    """
+    row = store.current()
+    try:
+        live = observed(docker.from_env())
+    except Exception as exc:  # noqa: BLE001 — docker socket may be gone
+        live = {"error": str(exc)[:200]}
+
+    row_fields = (
+        "topology_id",
+        "applied_version_hash",
+        "applied_at",
+        "last_error",
+    )
+    snapshot: dict[str, Any] = {
+        name: (None if row is None else getattr(row, name))
+        for name in row_fields
+    }
+    snapshot["observed"] = live
+    return snapshot
+
+
+__all__ = ["apply", "teardown", "state", "HashMismatch"]
--- a/decnet/agent/topology_store.py
+++ b/decnet/agent/topology_store.py
@@ -0,0 +1,213 @@
+"""Agent-side sqlite cache of the currently-applied topology.
+
+**This is a cache, not a source of truth.**  The master is the only
+authority for what the agent should be running.  This store exists so
+the agent can answer two questions quickly and offline:
+
+1. What topology did I last apply, and with what version hash?
+2. Is what docker is currently doing consistent with that?
+
+The hash goes out on every heartbeat; the master compares it to what
+it thinks this host should be running and schedules a re-push on
+mismatch.
+
+Why sqlite when the blob is JSON?  Consistent with
+:mod:`decnet.swarm.log_forwarder._OffsetStore` — single-row sqlite is
+the project-wide pattern for agent-local persistent state.  Keeps
+operational mental model small: "one state.db per thing".
+
+Design choices worth calling out:
+
+- **One row, one topology.**  v1 only supports a single topology per
+  agent.  Attempting to :meth:`put` a different ``topology_id`` while
+  a row already exists raises :class:`AlreadyApplied`.  The apply path
+  in :mod:`decnet.agent.topology_ops` tears down (or hard-clears) the
+  predecessor itself before it reaches :meth:`put`.
+- **No auto-restore on boot.**  The agent does NOT read this db at
+  startup and try to re-apply.  Whatever docker has after a restart
+  is what it has; the next heartbeat reports the truth and the
+  master decides whether to re-push.  Same reason we don't sync
+  mutations from agent → master anywhere else: split-brain is worse
+  than temporary drift.
+"""
+from __future__ import annotations
+
+import json
+import pathlib
+import sqlite3
+import time
+from dataclasses import dataclass
+from typing import Any, Optional
+
+
+class AlreadyApplied(RuntimeError):
+    """Raised when a different topology is already pinned to this agent.
+
+    :meth:`TopologyStore.put` raises it when the stored row's
+    ``topology_id`` differs from the one being recorded.
+    """
+
+
+@dataclass(frozen=True)
+class AppliedRow:
+    """Immutable view of one ``applied_topology`` row, as built by
+    :meth:`TopologyStore.current`."""
+
+    # Primary key of the row.
+    topology_id: str
+    # Hash recorded by put(); '' on a failure-marker row from record_error().
+    applied_version_hash: str
+    # Decoded ``hydrated_blob_json``; ``{}`` on a marker row stored without a blob.
+    hydrated: dict[str, Any]
+    # ``int(time.time())`` at put(); 0 on a failure-marker row.
+    applied_at: int
+    # Message from record_error(); reset to None by a successful put().
+    last_error: Optional[str]
+
+
+class TopologyStore:
+    """Single-row sqlite cache. Stdlib only, sync (called from endpoints).
+
+    Every write runs inside the connection's context manager, so it is
+    committed on success and rolled back if the statement raises.
+    """
+
+    def __init__(self, db_path: pathlib.Path) -> None:
+        """Open (creating if needed) the db at *db_path* and its table."""
+        db_path.parent.mkdir(parents=True, exist_ok=True)
+        # check_same_thread=False: Starlette/FastAPI runs sync endpoint
+        # bodies on a worker thread distinct from where `app` is imported.
+        # The agent is single-process, so there's no real contention —
+        # sqlite's own connection lock is enough.
+        self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
+        with self._conn:
+            self._conn.execute(
+                "CREATE TABLE IF NOT EXISTS applied_topology ("
+                " topology_id TEXT PRIMARY KEY,"
+                " applied_version_hash TEXT NOT NULL,"
+                " hydrated_blob_json TEXT NOT NULL,"
+                " applied_at INTEGER NOT NULL,"
+                " last_error TEXT)"
+            )
+
+    # ----------------------------------------------------------------- reads
+
+    def current(self) -> Optional[AppliedRow]:
+        """Return the single applied topology, or ``None`` if idle."""
+        row = self._conn.execute(
+            "SELECT topology_id, applied_version_hash, hydrated_blob_json,"
+            " applied_at, last_error FROM applied_topology LIMIT 1"
+        ).fetchone()
+        if row is None:
+            return None
+        return AppliedRow(
+            topology_id=row[0],
+            applied_version_hash=row[1],
+            hydrated=json.loads(row[2]),
+            applied_at=int(row[3]),
+            last_error=row[4],
+        )
+
+    # ---------------------------------------------------------------- writes
+
+    def put(
+        self,
+        topology_id: str,
+        applied_version_hash: str,
+        hydrated: dict[str, Any],
+    ) -> None:
+        """Record an applied topology.
+
+        If a *different* topology is already recorded, raises
+        :class:`AlreadyApplied`.  Re-applying the same ``topology_id``
+        just updates the hash + blob and clears ``last_error``
+        (idempotent re-push).
+        """
+        existing = self.current()
+        if existing is not None and existing.topology_id != topology_id:
+            raise AlreadyApplied(
+                f"agent already has topology {existing.topology_id!r}; "
+                f"cannot apply {topology_id!r}"
+            )
+        with self._conn:
+            self._conn.execute(
+                "INSERT INTO applied_topology"
+                " (topology_id, applied_version_hash, hydrated_blob_json,"
+                "  applied_at, last_error)"
+                " VALUES (?, ?, ?, ?, NULL)"
+                " ON CONFLICT(topology_id) DO UPDATE SET"
+                "  applied_version_hash=excluded.applied_version_hash,"
+                "  hydrated_blob_json=excluded.hydrated_blob_json,"
+                "  applied_at=excluded.applied_at,"
+                "  last_error=NULL",
+                (
+                    topology_id,
+                    applied_version_hash,
+                    json.dumps(hydrated, sort_keys=True),
+                    int(time.time()),
+                ),
+            )
+
+    def record_error(
+        self,
+        topology_id: str,
+        message: str,
+        hydrated: Optional[dict[str, Any]] = None,
+    ) -> None:
+        """Attach a last-error message for *topology_id*.
+
+        Upserts a marker row when no apply has yet succeeded for this
+        topology — that way a failure *during* the first materialise
+        (put() hasn't been reached) still surfaces via GET
+        /topology/state and the next heartbeat.  The marker row uses an
+        empty ``applied_version_hash`` so master's heartbeat check sees
+        the hash mismatch and schedules a resync.
+
+        If *hydrated* is provided it is stored so a later teardown can
+        still walk the LAN list — otherwise a partial deploy strands
+        containers + bridges with no breadcrumb back to them.  An
+        existing non-empty blob is never overwritten here.
+        """
+        blob = json.dumps(hydrated, sort_keys=True) if hydrated else "{}"
+        with self._conn:
+            self._conn.execute(
+                "INSERT INTO applied_topology"
+                " (topology_id, applied_version_hash, hydrated_blob_json,"
+                "  applied_at, last_error)"
+                " VALUES (?, '', ?, 0, ?)"
+                " ON CONFLICT(topology_id) DO UPDATE SET"
+                "  last_error=excluded.last_error,"
+                "  hydrated_blob_json=CASE"
+                "   WHEN applied_topology.hydrated_blob_json='{}'"
+                "   THEN excluded.hydrated_blob_json"
+                "   ELSE applied_topology.hydrated_blob_json END",
+                (topology_id, blob, message),
+            )
+
+    def clear(self, topology_id: str) -> None:
+        """Remove the row for *topology_id* (post-teardown).
+
+        No-op if the row doesn't exist — makes teardown idempotent.
+        """
+        with self._conn:
+            self._conn.execute(
+                "DELETE FROM applied_topology WHERE topology_id=?",
+                (topology_id,),
+            )
+
+    def close(self) -> None:
+        """Close the underlying sqlite connection."""
+        self._conn.close()
+
+
+# --------------------------------------------------- live docker observation
+
+
+def observed(docker_client: Any) -> dict[str, Any]:
+    """Report which DECNET bridges and containers docker currently has.
+
+    The result is a compact dict for the heartbeat, letting the master
+    cross-check ``applied_version_hash`` against reality (a matching
+    hash with missing bridges is still drift).
+
+    Returns:
+      ``{"bridges": [...], "containers": [...]}`` with both lists
+      sorted: bridge-driver networks named ``decnet-topology-*`` and
+      running containers named ``decnet-*``.  Best-effort: on any
+      exception the result is ``{"error": <message, max 200 chars>}``
+      instead of a raise — the agent still needs to heartbeat, and the
+      master can treat ``error`` as "unknown, re-push".
+    """
+    try:
+        bridge_names = []
+        for net in docker_client.networks.list():
+            if net.attrs.get("Driver") != "bridge":
+                continue
+            if net.name.startswith("decnet-topology-"):
+                bridge_names.append(net.name)
+
+        container_names = []
+        for ctr in docker_client.containers.list(all=False):
+            if ctr.name.startswith("decnet-"):
+                container_names.append(ctr.name)
+
+        bridge_names.sort()
+        container_names.sort()
+        return {"bridges": bridge_names, "containers": container_names}
+    except Exception as exc:  # noqa: BLE001 — best-effort observation
+        return {"error": str(exc)[:200]}
+
+
+__all__ = ["TopologyStore", "AppliedRow", "AlreadyApplied", "observed"]