feat(agent/collector): topology-label discovery and master-authoritative supersede
Legacy fleet deckies live in decnet-state.json; MazeNET topology containers don't. Tag them at compose-time with decnet.topology.service=true and let the collector match on that label. Spin up the agent's log collector on the first successful /topology/apply (not in the lifespan — that would break the no-docker-on-boot invariant) and tear it down with the app. Land log lines in DECNET_AGENT_LOG_FILE, separate from master-side DECNET_INGEST_LOG_FILE, so a dev box running both roles can't forward its own ingest back to itself. When master pushes a topology that differs from whatever is pinned locally, tear down the predecessor and accept the new one. Refusing with 409 left the agent stranded after partial deploys. record_error now persists the hydrated blob so a later teardown can still walk the LAN list — otherwise a half-failed apply strands containers + bridges with no breadcrumb back to them.
This commit is contained in:
@@ -210,16 +210,46 @@ def _load_service_container_names() -> set[str]:
|
||||
return names
|
||||
|
||||
|
||||
_TOPOLOGY_SERVICE_LABEL = "decnet.topology.service"
|
||||
|
||||
|
||||
def _has_topology_service_label(labels: Optional[dict]) -> bool:
|
||||
"""MazeNET topology containers are tagged at compose-time (see
|
||||
``decnet/topology/compose.py``) so the collector can discover them
|
||||
without consulting ``decnet-state.json`` — that state file only
|
||||
knows about legacy fleet deckies."""
|
||||
if not labels:
|
||||
return False
|
||||
return labels.get(_TOPOLOGY_SERVICE_LABEL) == "true"
|
||||
|
||||
|
||||
def is_service_container(container) -> bool:
    """Return True if this Docker container is a known DECNET service container.

    A container qualifies when its name is in the set returned by
    ``_load_service_container_names()`` or, for container objects, when
    it carries the MazeNET topology service label.

    Args:
        container: Either a container name (``str``) or a container
            object exposing ``.name`` and, optionally, ``.attrs`` and/or
            ``.labels``.

    Returns:
        True when the container is recognised by name or by label.
    """
    if isinstance(container, str):
        # A bare name carries no labels, so only the name lookup applies.
        return container.lstrip("/") in _load_service_container_names()

    name = container.name.lstrip("/")
    if name in _load_service_container_names():
        return True

    # MazeNET topology containers aren't in decnet-state.json — discover
    # them via compose-time labels instead. Tolerant to stub objects
    # that don't expose .attrs/.labels (unit tests).
    labels: Optional[dict] = None
    attrs = getattr(container, "attrs", None)
    if isinstance(attrs, dict):
        # "Config" may be absent or None; fall back to an empty mapping.
        labels = (attrs.get("Config") or {}).get("Labels")
    if labels is None:
        labels = getattr(container, "labels", None)
    return _has_topology_service_label(labels)
|
||||
|
||||
|
||||
def is_service_event(attrs: dict) -> bool:
    """Return True if a Docker start event is for a known DECNET service container.

    Args:
        attrs: The event's attribute mapping. It is read for a ``name``
            entry and for the topology service label key.

    Returns:
        True when the event's container name is in the set returned by
        ``_load_service_container_names()``, or when the attributes carry
        the topology service label with the value ``"true"``.
    """
    # ``or ""`` also covers a "name" key that is present but None.
    name = (attrs.get("name") or "").lstrip("/")
    if name in _load_service_container_names():
        return True

    # Docker start-event attrs contains every container label flat alongside
    # 'name' / 'image' — no separate 'labels' sub-dict.
    return attrs.get(_TOPOLOGY_SERVICE_LABEL) == "true"
|
||||
|
||||
|
||||
# ─── Blocking stream worker (runs in a thread) ────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user