diff --git a/decnet/collector/worker.py b/decnet/collector/worker.py index 633f2f7f..9fb46b1e 100644 --- a/decnet/collector/worker.py +++ b/decnet/collector/worker.py @@ -224,45 +224,68 @@ def _load_service_container_names() -> set[str]: _TOPOLOGY_SERVICE_LABEL = "decnet.topology.service" +_FLEET_SERVICE_LABEL = "decnet.fleet.service" -def _has_topology_service_label(labels: Optional[dict]) -> bool: - """MazeNET topology containers are tagged at compose-time (see - ``decnet/topology/compose.py``) so the collector can discover them - without consulting ``decnet-state.json`` — that state file only - knows about legacy fleet deckies.""" +def _has_decnet_service_label(labels: Optional[dict]) -> bool: + """Recognize both fleet (``decnet.fleet.service``, set by + ``decnet/composer.py``) and MazeNET topology (``decnet.topology.service``, + set by ``decnet/topology/compose.py``) containers. + + Label-based detection is the canonical path: it's stateless and avoids + the race between ``docker compose up`` and the ``decnet-state.json`` + write that previously caused freshly-deployed fleet containers to be + silently dropped by the docker-events watcher. + """ if not labels: return False - return labels.get(_TOPOLOGY_SERVICE_LABEL) == "true" + return ( + labels.get(_TOPOLOGY_SERVICE_LABEL) == "true" + or labels.get(_FLEET_SERVICE_LABEL) == "true" + ) def is_service_container(container) -> bool: - """Return True if this Docker container is a known DECNET service container.""" + """Return True if this Docker container is a known DECNET service container. + + Label-based detection is preferred (works for both fleet and MazeNET + topology containers without touching decnet-state.json). The + state-file name match remains as a fallback so containers built from + older composes — which predate the ``decnet.fleet.service`` label — + are still picked up. + """ if isinstance(container, str): return container.lstrip("/") in _load_service_container_names() - name = container.name.lstrip("/") - if name in _load_service_container_names(): - return True - # MazeNET topology containers aren't in decnet-state.json — discover - # them via compose-time labels instead. Tolerant to stub objects - # that don't expose .attrs/.labels (unit tests). labels: Optional[dict] = None attrs = getattr(container, "attrs", None) if isinstance(attrs, dict): labels = (attrs.get("Config") or {}).get("Labels") if labels is None: labels = getattr(container, "labels", None) - return _has_topology_service_label(labels) + if _has_decnet_service_label(labels): + return True + # Fallback: legacy containers without labels still match by name. + name = container.name.lstrip("/") + return name in _load_service_container_names() def is_service_event(attrs: dict) -> bool: - """Return True if a Docker start event is for a known DECNET service container.""" - name = attrs.get("name", "").lstrip("/") - if name in _load_service_container_names(): + """Return True if a Docker start event is for a known DECNET service container. + + Docker start-event attrs flatten every container label alongside the + ``name``/``image`` keys — no separate ``labels`` sub-dict — so label + detection happens directly on ``attrs``. + + Prefer the label path because it's race-free with respect to the + ``decnet-state.json`` write that ``decnet deploy`` performs around + ``docker compose up``: a freshly-started container's start event can + arrive before the state file has been updated, and the legacy + name-based fallback would then drop the event. + """ + if _has_decnet_service_label(attrs): return True - # Docker start-event attrs contains every container label flat alongside - # 'name' / 'image' — no separate 'labels' sub-dict. - return attrs.get(_TOPOLOGY_SERVICE_LABEL) == "true" + name = attrs.get("name", "").lstrip("/") + return name in _load_service_container_names() # ─── Blocking stream worker (runs in a thread) ──────────────────────────────── diff --git a/decnet/composer.py b/decnet/composer.py index d7896156..34fbe9af 100644 --- a/decnet/composer.py +++ b/decnet/composer.py @@ -91,6 +91,19 @@ def generate_compose(config: DecnetConfig) -> dict: # Rotate Docker logs so disk usage is bounded fragment["logging"] = _DOCKER_LOGGING + # Stamp DECNET ownership labels so the collector's docker-events + # watcher can identify newly-started containers without consulting + # decnet-state.json (which is written and read out-of-band with + # `docker compose up`, leaving a race window where freshly started + # containers were silently ignored). + labels = dict(fragment.get("labels") or {}) + labels.update({ + "decnet.fleet.service": "true", + "decnet.fleet.decky": decky.name, + "decnet.fleet.service_name": svc_name, + }) + fragment["labels"] = labels + services[f"{decky.name}-{svc_name}"] = fragment # Network definitions diff --git a/tests/collector/test_collector.py b/tests/collector/test_collector.py index 0d0c6594..cbeb453b 100644 --- a/tests/collector/test_collector.py +++ b/tests/collector/test_collector.py @@ -303,6 +303,54 @@ class TestTopologyLabelDiscovery: assert is_service_container(c) is True +class TestFleetLabelDiscovery: + """Fleet (legacy) containers stamped with ``decnet.fleet.service=true`` + by ``decnet/composer.py`` must be picked up by the events watcher even + when ``decnet-state.json`` hasn't been refreshed yet — that's the race + that previously caused freshly-deployed containers to be silently + ignored.""" + + def _labelled(self, name: str, labels: dict): + return SimpleNamespace( + name=name, + attrs={"Config": {"Labels": labels}}, + labels=labels, + ) + + def test_fleet_labelled_container_matches_without_state(self): + with patch("decnet.collector.worker._load_service_container_names", return_value=set()): + c = self._labelled( + "omega-decky-ssh", + {"decnet.fleet.service": "true", "decnet.fleet.decky": "omega-decky"}, + ) + assert is_service_container(c) is True + + def test_fleet_labelled_event_matches_without_state(self): + with patch("decnet.collector.worker._load_service_container_names", return_value=set()): + attrs = { + "name": "omega-decky-ssh", + "decnet.fleet.service": "true", + "decnet.fleet.decky": "omega-decky", + } + assert is_service_event(attrs) is True + + def test_unlabelled_event_falls_back_to_state(self): + """Containers built before this label landed still match by name.""" + with patch("decnet.collector.worker._load_service_container_names", return_value=_KNOWN_NAMES): + assert is_service_event({"name": "omega-decky-http"}) is True + + def test_unrelated_label_does_not_match(self): + with patch("decnet.collector.worker._load_service_container_names", return_value=set()): + c = self._labelled( + "redis", + {"com.docker.compose.project": "redis", "decnet.fleet.service": "false"}, + ) + assert is_service_container(c) is False + assert is_service_event( + {"name": "redis", "decnet.fleet.service": "false"} + ) is False + + class TestLoadServiceContainerNames: def test_with_valid_state(self, tmp_path, monkeypatch): import decnet.config diff --git a/tests/fleet/test_composer.py b/tests/fleet/test_composer.py index 9de96cd9..94a07e6d 100644 --- a/tests/fleet/test_composer.py +++ b/tests/fleet/test_composer.py @@ -242,3 +242,28 @@ def test_multiple_deckies_different_build_bases(): assert base_img_01 == "debian:bookworm-slim" assert base_img_02 == "ubuntu:22.04" assert base_img_01 != base_img_02 + + +# --------------------------------------------------------------------------- +# Fleet ownership labels — collector keys off these to recognize freshly- +# deployed containers without consulting decnet-state.json (the previous +# state-file lookup race silently dropped containers whose Docker start +# event arrived before the state write completed). +# --------------------------------------------------------------------------- + +def test_service_container_carries_fleet_labels(): + config = _make_config(["http"], distro="debian") + compose = generate_compose(config) + labels = compose["services"]["decky-01-http"]["labels"] + assert labels["decnet.fleet.service"] == "true" + assert labels["decnet.fleet.decky"] == "decky-01" + assert labels["decnet.fleet.service_name"] == "http" + + +def test_base_container_does_not_carry_service_label(): + """Base containers run sleep — they don't emit logs and must NOT be + streamed by the collector, so the service marker stays off them.""" + config = _make_config(["http"], distro="debian") + compose = generate_compose(config) + base = compose["services"]["decky-01"] + assert "decnet.fleet.service" not in (base.get("labels") or {})