Files
DECNET/decnet/agent/topology_ops.py
anti 0cdcfe2653 feat(agent/collector): topology-label discovery and master-authoritative supersede
Legacy fleet deckies live in decnet-state.json; MazeNET topology
containers don't. Tag them at compose-time with
decnet.topology.service=true and let the collector match on that label.
Spin up the agent's log collector on the first successful /topology/apply
(not in the lifespan — that would break the no-docker-on-boot invariant)
and tear it down with the app. Land log lines in DECNET_AGENT_LOG_FILE,
separate from master-side DECNET_INGEST_LOG_FILE, so a dev box running
both roles can't forward its own ingest back to itself.

When master pushes a topology that differs from whatever is pinned
locally, tear down the predecessor and accept the new one. Refusing with
409 left the agent stranded after partial deploys. record_error now
persists the hydrated blob so a later teardown can still walk the LAN
list — otherwise a half-failed apply strands containers + bridges with
no breadcrumb back to them.
2026-04-21 10:23:10 -04:00

196 lines
6.9 KiB
Python

"""Agent-side topology apply/teardown/state primitives.
Wraps the compose + bridge machinery from :mod:`decnet.engine.deployer`
so the agent can drive a topology without ever touching the master's
sqlmodel repo. The master-side ``deploy_topology`` always calls
``transition_status(repo, …)`` which is useless (and unreachable) on
an agent — here we operate purely on a hydrated dict + the local
:class:`TopologyStore`.
v1 constraint: one topology per agent. A second apply for a different
``topology_id`` triggers an on-the-spot teardown of the predecessor
before the new apply proceeds — master is authoritative.
"""
from __future__ import annotations
import asyncio
import subprocess # nosec B404
from typing import Any
import docker
from decnet.agent.topology_store import (
TopologyStore,
observed,
)
from decnet.engine.deployer import (
_compose,
_compose_with_retry,
_teardown_order,
_topology_compose_path,
)
from decnet.logging import get_logger
from decnet.network import create_bridge_network, remove_bridge_network
from decnet.topology.compose import (
_network_name as _topology_network_name,
write_topology_compose,
)
from decnet.topology.hashing import canonical_hash
from decnet.topology.validate import (
ValidationError,
errors as _validation_errors,
validate as _validate_topology,
)
log = get_logger("agent.topology_ops")
class HashMismatch(RuntimeError):
    """Signal that master and agent hashed the same topology differently.

    Raised when the ``version_hash`` supplied by the master is not the
    hash computed locally. That points at serialisation drift, so the
    failure is surfaced loudly instead of silently papering over a
    schema mismatch.
    """
def _topology_id(hydrated: dict[str, Any]) -> str:
topo = hydrated.get("topology") or {}
tid = topo.get("id")
if not tid:
raise ValueError("hydrated topology missing topology.id")
return str(tid)
async def apply(
    hydrated: dict[str, Any],
    version_hash: str,
    store: TopologyStore,
) -> None:
    """Materialise *hydrated* on this agent and record it in *store*.

    Steps, in order: verify the master-provided hash, validate the
    topology, tear down any *different* topology currently pinned in
    *store*, create one bridge network per LAN, write the compose file,
    run ``compose up --build -d``, and finally persist the applied
    topology in *store*.

    Args:
        hydrated: hydrated topology dict. Must carry ``topology.id`` and
            a ``lans`` list whose entries have ``name``, ``subnet`` and
            ``is_dmz``.
        version_hash: canonical hash the master computed for *hydrated*.
        store: local topology store; read for the currently pinned row
            and updated once the apply succeeded.

    Raises:
        HashMismatch: master and agent disagree on the canonical hash —
            don't touch docker, fail the apply.
        ValidationError: topology fails structural validation.
        ValueError: *hydrated* carries no ``topology.id``.
        Any docker / compose error propagates up; the endpoint maps it
        to 500 and records the message on the store row.
    """
    local_hash = canonical_hash(hydrated)
    if local_hash != version_hash:
        raise HashMismatch(
            f"master hash {version_hash!r} does not match agent hash "
            f"{local_hash!r} — refusing to apply"
        )

    issues = _validate_topology(hydrated)
    if _validation_errors(issues):
        raise ValidationError(issues)

    topology_id = _topology_id(hydrated)

    # Master is authoritative. If a different topology is pinned here
    # — whether it fully applied, only partially applied (failure
    # marker row + orphan containers), or drifted — tear it down first,
    # then accept the new one. Refusing with 409 would leave the
    # agent stuck in a state only a human could resolve.
    existing = store.current()
    if existing is not None and existing.topology_id != topology_id:
        log.info(
            "superseding topology %s with %s on master authority",
            existing.topology_id, topology_id,
        )
        try:
            await teardown(existing.topology_id, store)
        except Exception as exc:  # noqa: BLE001 — we still want to try applying
            log.warning(
                "best-effort teardown of superseded topology %s failed: %s",
                existing.topology_id, exc,
            )
            # Hard-clear the store row so the new apply isn't blocked
            # by a half-torn-down predecessor. Leftover docker objects
            # will surface via the next heartbeat's observed block.
            store.clear(existing.topology_id)

    lans = hydrated["lans"]
    compose_path = _topology_compose_path(topology_id)

    # Bridges + compose are sync/blocking; hop to a thread so we don't
    # stall the event loop on a slow docker daemon.
    def _materialise() -> None:
        # The client is built inside the worker thread too: depending on
        # the SDK version ``docker.from_env`` may itself contact the
        # daemon (API version negotiation), which must not happen on the
        # event loop either.
        client = docker.from_env()
        try:
            for lan in lans:
                net_name = _topology_network_name(topology_id, lan["name"])
                # Only DMZ LANs get a non-internal bridge.
                internal = not lan["is_dmz"]
                create_bridge_network(
                    client, net_name, lan["subnet"], internal=internal
                )
        finally:
            # Release the client's connection pool even when a bridge
            # could not be created; compose below does not use it.
            client.close()
        write_topology_compose(hydrated, compose_path)
        _compose_with_retry("up", "--build", "-d", compose_file=compose_path)

    await asyncio.to_thread(_materialise)

    # Only a fully successful materialisation is recorded as applied.
    store.put(topology_id, version_hash, hydrated)
    log.info(
        "topology %s applied on agent (%d LANs)", topology_id, len(lans)
    )
async def teardown(
    topology_id: str,
    store: TopologyStore,
) -> None:
    """Tear down *topology_id* on this agent.

    Idempotent: when there is neither a matching store row nor a compose
    file, no compose command is run and no docker client is created; the
    call still clears the store entry and returns cleanly.

    Args:
        topology_id: id of the topology to dismantle.
        store: local topology store; its current row supplies the LAN
            list and the entry for *topology_id* is cleared at the end.

    Raises:
        Errors from bridge removal or from creating the docker client
        propagate; in that case the compose file and the store row are
        left in place. A failing ``compose down`` is only logged.
    """
    row = store.current()
    # The stored hydrated blob is the only source for the LAN list. If
    # it's gone (db wiped) or belongs to another topology while the
    # compose file lingers, we can still compose-down, but there is
    # nothing to derive the bridge names from, so bridge removal is
    # skipped.
    hydrated = row.hydrated if row and row.topology_id == topology_id else None
    compose_path = _topology_compose_path(topology_id)

    # Compose + bridge removal are blocking; run them off the event loop.
    def _dismantle() -> None:
        if compose_path.exists():
            try:
                _compose("down", "--remove-orphans", compose_file=compose_path)
            except subprocess.CalledProcessError as exc:
                log.warning(
                    "topology %s compose down failed (continuing): %s",
                    topology_id, exc,
                )
        if hydrated is not None:
            # Create the client lazily and inside the worker thread so
            # the no-op path never needs a docker daemon, and always
            # release its connections afterwards.
            client = docker.from_env()
            try:
                for lan_name in _teardown_order(hydrated["lans"]):
                    net_name = _topology_network_name(topology_id, lan_name)
                    remove_bridge_network(client, net_name)
            finally:
                client.close()
        # missing_ok avoids the exists()/unlink() race.
        compose_path.unlink(missing_ok=True)

    await asyncio.to_thread(_dismantle)
    store.clear(topology_id)
    log.info("topology %s torn down on agent", topology_id)
def state(store: TopologyStore) -> dict[str, Any]:
    """Snapshot-plus-live-observation — the shape the heartbeat embeds.

    Args:
        store: local topology store whose current row (if any) supplies
            the snapshot fields.

    Returns:
        A dict with ``topology_id``, ``applied_version_hash``,
        ``applied_at`` and ``last_error`` (all ``None`` when nothing is
        pinned) plus ``observed``: the result of ``observed(client)``,
        or ``{"error": <message, at most 200 chars>}`` when docker could
        not be queried.
    """
    row = store.current()

    client = None
    try:
        client = docker.from_env()
        obs = observed(client)
    except Exception as exc:  # noqa: BLE001 — docker socket may be gone
        obs = {"error": str(exc)[:200]}
    finally:
        # This runs on every heartbeat, so release the client's
        # connections each time. Closing is best-effort: a failure here
        # must not replace an observation we already have.
        if client is not None:
            try:
                client.close()
            except Exception as exc:  # noqa: BLE001 — nothing useful to do
                log.warning("closing docker client failed: %s", exc)

    if row is None:
        return {
            "topology_id": None,
            "applied_version_hash": None,
            "applied_at": None,
            "last_error": None,
            "observed": obs,
        }
    return {
        "topology_id": row.topology_id,
        "applied_version_hash": row.applied_version_hash,
        "applied_at": row.applied_at,
        "last_error": row.last_error,
        "observed": obs,
    }
# Names exported by ``from decnet.agent.topology_ops import *``.
__all__ = [
    "apply",
    "teardown",
    "state",
    "HashMismatch",
]