fix(collector): label-based fleet container discovery
The events watcher's start-event filter previously called
_load_service_container_names(), which reads decnet-state.json on
every event. `decnet deploy` writes that state file as a separate
step from `docker compose up`, so a container's start event could
arrive before the state was committed — the watcher then silently
dropped the event and never tailed the container's stdout. The
visible symptom was empty Credentials, Logs, and Bounty views
after a fresh deploy until the collector was manually restarted.
Fix: stamp decnet.fleet.{service,decky,service_name} labels on
every fleet service container at compose-time, and let the
collector recognize a container by its marker label
(decnet.fleet.service=true or decnet.topology.service=true) without
touching the state file. The state-file name match remains as a
fallback for legacy containers that predate the new labels.
This commit is contained in:
@@ -224,45 +224,68 @@ def _load_service_container_names() -> set[str]:
|
|||||||
|
|
||||||
|
|
||||||
_TOPOLOGY_SERVICE_LABEL = "decnet.topology.service"
|
_TOPOLOGY_SERVICE_LABEL = "decnet.topology.service"
|
||||||
|
_FLEET_SERVICE_LABEL = "decnet.fleet.service"
|
||||||
|
|
||||||
|
|
||||||
def _has_topology_service_label(labels: Optional[dict]) -> bool:
|
def _has_decnet_service_label(labels: Optional[dict]) -> bool:
|
||||||
"""MazeNET topology containers are tagged at compose-time (see
|
"""Recognize both fleet (``decnet.fleet.service``, set by
|
||||||
``decnet/topology/compose.py``) so the collector can discover them
|
``decnet/composer.py``) and MazeNET topology (``decnet.topology.service``,
|
||||||
without consulting ``decnet-state.json`` — that state file only
|
set by ``decnet/topology/compose.py``) containers.
|
||||||
knows about legacy fleet deckies."""
|
|
||||||
|
Label-based detection is the canonical path: it's stateless and avoids
|
||||||
|
the race between ``docker compose up`` and the ``decnet-state.json``
|
||||||
|
write that previously caused freshly-deployed fleet containers to be
|
||||||
|
silently dropped by the docker-events watcher.
|
||||||
|
"""
|
||||||
if not labels:
|
if not labels:
|
||||||
return False
|
return False
|
||||||
return labels.get(_TOPOLOGY_SERVICE_LABEL) == "true"
|
return (
|
||||||
|
labels.get(_TOPOLOGY_SERVICE_LABEL) == "true"
|
||||||
|
or labels.get(_FLEET_SERVICE_LABEL) == "true"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def is_service_container(container) -> bool:
|
def is_service_container(container) -> bool:
|
||||||
"""Return True if this Docker container is a known DECNET service container."""
|
"""Return True if this Docker container is a known DECNET service container.
|
||||||
|
|
||||||
|
Label-based detection is preferred (works for both fleet and MazeNET
|
||||||
|
topology containers without touching decnet-state.json). The
|
||||||
|
state-file name match remains as a fallback so containers built from
|
||||||
|
older composes — which predate the ``decnet.fleet.service`` label —
|
||||||
|
are still picked up.
|
||||||
|
"""
|
||||||
if isinstance(container, str):
|
if isinstance(container, str):
|
||||||
return container.lstrip("/") in _load_service_container_names()
|
return container.lstrip("/") in _load_service_container_names()
|
||||||
name = container.name.lstrip("/")
|
|
||||||
if name in _load_service_container_names():
|
|
||||||
return True
|
|
||||||
# MazeNET topology containers aren't in decnet-state.json — discover
|
|
||||||
# them via compose-time labels instead. Tolerant to stub objects
|
|
||||||
# that don't expose .attrs/.labels (unit tests).
|
|
||||||
labels: Optional[dict] = None
|
labels: Optional[dict] = None
|
||||||
attrs = getattr(container, "attrs", None)
|
attrs = getattr(container, "attrs", None)
|
||||||
if isinstance(attrs, dict):
|
if isinstance(attrs, dict):
|
||||||
labels = (attrs.get("Config") or {}).get("Labels")
|
labels = (attrs.get("Config") or {}).get("Labels")
|
||||||
if labels is None:
|
if labels is None:
|
||||||
labels = getattr(container, "labels", None)
|
labels = getattr(container, "labels", None)
|
||||||
return _has_topology_service_label(labels)
|
if _has_decnet_service_label(labels):
|
||||||
|
return True
|
||||||
|
# Fallback: legacy containers without labels still match by name.
|
||||||
|
name = container.name.lstrip("/")
|
||||||
|
return name in _load_service_container_names()
|
||||||
|
|
||||||
|
|
||||||
def is_service_event(attrs: dict) -> bool:
|
def is_service_event(attrs: dict) -> bool:
|
||||||
"""Return True if a Docker start event is for a known DECNET service container."""
|
"""Return True if a Docker start event is for a known DECNET service container.
|
||||||
name = attrs.get("name", "").lstrip("/")
|
|
||||||
if name in _load_service_container_names():
|
Docker start-event attrs flatten every container label alongside the
|
||||||
|
``name``/``image`` keys — no separate ``labels`` sub-dict — so label
|
||||||
|
detection happens directly on ``attrs``.
|
||||||
|
|
||||||
|
Prefer the label path because it's race-free with respect to the
|
||||||
|
``decnet-state.json`` write that ``decnet deploy`` performs around
|
||||||
|
``docker compose up``: a freshly-started container's start event can
|
||||||
|
arrive before the state file has been updated, and the legacy
|
||||||
|
name-based fallback would then drop the event.
|
||||||
|
"""
|
||||||
|
if _has_decnet_service_label(attrs):
|
||||||
return True
|
return True
|
||||||
# Docker start-event attrs contains every container label flat alongside
|
name = attrs.get("name", "").lstrip("/")
|
||||||
# 'name' / 'image' — no separate 'labels' sub-dict.
|
return name in _load_service_container_names()
|
||||||
return attrs.get(_TOPOLOGY_SERVICE_LABEL) == "true"
|
|
||||||
|
|
||||||
|
|
||||||
# ─── Blocking stream worker (runs in a thread) ────────────────────────────────
|
# ─── Blocking stream worker (runs in a thread) ────────────────────────────────
|
||||||
|
|||||||
@@ -91,6 +91,19 @@ def generate_compose(config: DecnetConfig) -> dict:
|
|||||||
# Rotate Docker logs so disk usage is bounded
|
# Rotate Docker logs so disk usage is bounded
|
||||||
fragment["logging"] = _DOCKER_LOGGING
|
fragment["logging"] = _DOCKER_LOGGING
|
||||||
|
|
||||||
|
# Stamp DECNET ownership labels so the collector's docker-events
|
||||||
|
# watcher can identify newly-started containers without consulting
|
||||||
|
# decnet-state.json (which is written and read out-of-band with
|
||||||
|
# `docker compose up`, leaving a race window where freshly started
|
||||||
|
# containers were silently ignored).
|
||||||
|
labels = dict(fragment.get("labels") or {})
|
||||||
|
labels.update({
|
||||||
|
"decnet.fleet.service": "true",
|
||||||
|
"decnet.fleet.decky": decky.name,
|
||||||
|
"decnet.fleet.service_name": svc_name,
|
||||||
|
})
|
||||||
|
fragment["labels"] = labels
|
||||||
|
|
||||||
services[f"{decky.name}-{svc_name}"] = fragment
|
services[f"{decky.name}-{svc_name}"] = fragment
|
||||||
|
|
||||||
# Network definitions
|
# Network definitions
|
||||||
|
|||||||
@@ -303,6 +303,54 @@ class TestTopologyLabelDiscovery:
|
|||||||
assert is_service_container(c) is True
|
assert is_service_container(c) is True
|
||||||
|
|
||||||
|
|
||||||
|
class TestFleetLabelDiscovery:
|
||||||
|
"""Fleet (legacy) containers stamped with ``decnet.fleet.service=true``
|
||||||
|
by ``decnet/composer.py`` must be picked up by the events watcher even
|
||||||
|
when ``decnet-state.json`` hasn't been refreshed yet — that's the race
|
||||||
|
that previously caused freshly-deployed containers to be silently
|
||||||
|
ignored."""
|
||||||
|
|
||||||
|
def _labelled(self, name: str, labels: dict):
|
||||||
|
return SimpleNamespace(
|
||||||
|
name=name,
|
||||||
|
attrs={"Config": {"Labels": labels}},
|
||||||
|
labels=labels,
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_fleet_labelled_container_matches_without_state(self):
|
||||||
|
with patch("decnet.collector.worker._load_service_container_names", return_value=set()):
|
||||||
|
c = self._labelled(
|
||||||
|
"omega-decky-ssh",
|
||||||
|
{"decnet.fleet.service": "true", "decnet.fleet.decky": "omega-decky"},
|
||||||
|
)
|
||||||
|
assert is_service_container(c) is True
|
||||||
|
|
||||||
|
def test_fleet_labelled_event_matches_without_state(self):
|
||||||
|
with patch("decnet.collector.worker._load_service_container_names", return_value=set()):
|
||||||
|
attrs = {
|
||||||
|
"name": "omega-decky-ssh",
|
||||||
|
"decnet.fleet.service": "true",
|
||||||
|
"decnet.fleet.decky": "omega-decky",
|
||||||
|
}
|
||||||
|
assert is_service_event(attrs) is True
|
||||||
|
|
||||||
|
def test_unlabelled_event_falls_back_to_state(self):
|
||||||
|
"""Containers built before this label landed still match by name."""
|
||||||
|
with patch("decnet.collector.worker._load_service_container_names", return_value=_KNOWN_NAMES):
|
||||||
|
assert is_service_event({"name": "omega-decky-http"}) is True
|
||||||
|
|
||||||
|
def test_unrelated_label_does_not_match(self):
|
||||||
|
with patch("decnet.collector.worker._load_service_container_names", return_value=set()):
|
||||||
|
c = self._labelled(
|
||||||
|
"redis",
|
||||||
|
{"com.docker.compose.project": "redis", "decnet.fleet.service": "false"},
|
||||||
|
)
|
||||||
|
assert is_service_container(c) is False
|
||||||
|
assert is_service_event(
|
||||||
|
{"name": "redis", "decnet.fleet.service": "false"}
|
||||||
|
) is False
|
||||||
|
|
||||||
|
|
||||||
class TestLoadServiceContainerNames:
|
class TestLoadServiceContainerNames:
|
||||||
def test_with_valid_state(self, tmp_path, monkeypatch):
|
def test_with_valid_state(self, tmp_path, monkeypatch):
|
||||||
import decnet.config
|
import decnet.config
|
||||||
|
|||||||
@@ -242,3 +242,28 @@ def test_multiple_deckies_different_build_bases():
|
|||||||
assert base_img_01 == "debian:bookworm-slim"
|
assert base_img_01 == "debian:bookworm-slim"
|
||||||
assert base_img_02 == "ubuntu:22.04"
|
assert base_img_02 == "ubuntu:22.04"
|
||||||
assert base_img_01 != base_img_02
|
assert base_img_01 != base_img_02
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fleet ownership labels — collector keys off these to recognize freshly-
|
||||||
|
# deployed containers without consulting decnet-state.json (the previous
|
||||||
|
# state-file lookup race silently dropped containers whose Docker start
|
||||||
|
# event arrived before the state write completed).
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_service_container_carries_fleet_labels():
|
||||||
|
config = _make_config(["http"], distro="debian")
|
||||||
|
compose = generate_compose(config)
|
||||||
|
labels = compose["services"]["decky-01-http"]["labels"]
|
||||||
|
assert labels["decnet.fleet.service"] == "true"
|
||||||
|
assert labels["decnet.fleet.decky"] == "decky-01"
|
||||||
|
assert labels["decnet.fleet.service_name"] == "http"
|
||||||
|
|
||||||
|
|
||||||
|
def test_base_container_does_not_carry_service_label():
|
||||||
|
"""Base containers run sleep — they don't emit logs and must NOT be
|
||||||
|
streamed by the collector, so the service marker stays off them."""
|
||||||
|
config = _make_config(["http"], distro="debian")
|
||||||
|
compose = generate_compose(config)
|
||||||
|
base = compose["services"]["decky-01"]
|
||||||
|
assert "decnet.fleet.service" not in (base.get("labels") or {})
|
||||||
|
|||||||
Reference in New Issue
Block a user