fix(collector): retry on event-stream errors and add periodic reconciler

Hit live on first VPS deploy: a window between the initial
client.containers.list() snapshot and the client.events() start-event
stream let topology service containers slip through, requiring an
operator restart for them to be picked up.

Two fixes:

* `_watch_events` now wraps the events() call in a retry loop with
  exponential backoff (1s -> 30s cap). A docker.errors.APIError, daemon
  reload, or SDK stream-decode hiccup used to make the executor task
  return cleanly, leaving the collector "running" with no event
  subscription. Future container starts were silently dropped until
  the unit was restarted.

* New `_reconcile_loop` async task ticks every
  DECNET_COLLECTOR_RECONCILE_S (default 30s), re-scans
  client.containers.list(), and calls _spawn for any service container
  not already in `active`. Belt to the event watcher's suspenders:
  even if a start event is dropped during a reconnect window, the
  reconciler picks it up within one cycle. Also prunes finished
  futures from `active` so the dict's bounded by current container
  count rather than agent lifetime churn.
This commit is contained in:
2026-04-27 22:56:13 -04:00
parent c5db1d7ba2
commit e03a6d10a0
2 changed files with 194 additions and 10 deletions

View File

@@ -2,6 +2,8 @@
import json
import asyncio
import threading
import time
import pytest
from types import SimpleNamespace
from unittest.mock import patch, MagicMock
@@ -690,3 +692,116 @@ class TestLogCollectorWorker:
# Should not raise
await log_collector_worker(log_file)
@pytest.mark.asyncio
async def test_event_watcher_retries_on_stream_break(self, tmp_path, monkeypatch):
    """The event watcher must reconnect after the event stream breaks.

    A docker stream-decode hiccup must not silently end the watcher:
    before the retry loop existed, the executor task would return and
    future container starts would be dropped until an operator
    restarted the unit. The retry loop is what keeps the collector
    honest across daemon reloads.

    The retry contract is verified purely by counting how many times
    ``client.events()`` is invoked: one failing call plus at least one
    reconnect.
    """
    log_file = str(tmp_path / "decnet.log")

    # Neutralise the retry backoff so the test stays under its time
    # budget.
    # NOTE(review): assuming the worker does ``import time``, this dotted
    # path resolves to the shared ``time`` module, so ``sleep`` is stubbed
    # process-wide (not just for the worker) until monkeypatch restores
    # it at teardown — confirm no other thread relies on real sleeps.
    monkeypatch.setattr("decnet.collector.worker.time.sleep", lambda *_: None)

    # We don't try to assert _spawn was called: the dispatch path
    # uses ``loop.call_soon_threadsafe(_spawn, ...)`` and patching
    # the abstract loop method doesn't reach the concrete loop.
    events_calls = {"n": 0}

    def _events(**_kw):
        """Fail on the first subscription, then hand back an empty stream."""
        events_calls["n"] += 1
        if events_calls["n"] == 1:
            # Transient failure the watcher is expected to survive.
            raise RuntimeError("stream decode error")
        # Reaching this point at all proves the retry: without it the
        # RuntimeError would have ended the watcher and events() would
        # never be called a second time.
        return iter([])

    mock_client = MagicMock()
    mock_client.containers.list.return_value = []
    mock_client.events.side_effect = _events

    with patch("docker.from_env", return_value=mock_client), \
            patch("decnet.collector.worker.is_service_event", return_value=True):
        try:
            # The timeout is a safety net in case the worker keeps
            # running after the stream ends.
            await asyncio.wait_for(log_collector_worker(log_file), timeout=2.0)
        except (asyncio.TimeoutError, StopIteration, SystemExit):
            pass

    assert events_calls["n"] >= 2, (
        "expected >=2 events() calls (one failure + one reconnect) "
        f"proving the retry loop, got {events_calls['n']}"
    )
@pytest.mark.asyncio
async def test_reconciler_picks_up_missed_container(self, tmp_path, monkeypatch):
    """The reconciler must keep re-scanning even if the event watcher wedges.

    Simulates the first-VPS-deploy bug: the event stream never delivers
    a start event, but a service container shows up in
    ``containers.list()`` after the initial scan — the worker used to
    need a restart to pick it up. The reconciler is expected to notice
    it within one reconcile interval (``DECNET_COLLECTOR_RECONCILE_S``).

    Note that the assertion only proves the reconciler re-scanned
    ``containers.list()`` after the initial snapshot; it does not verify
    that a collector was actually spawned for ``c-missed``.
    """
    log_file = str(tmp_path / "decnet.log")
    # Shrink the reconcile period so several ticks fit inside the test.
    monkeypatch.setenv("DECNET_COLLECTOR_RECONCILE_S", "0.05")

    missed_container = MagicMock()
    missed_container.id = "c-missed"
    missed_container.name = "/missed-svc"

    list_calls = {"n": 0}

    def _list(*_args, **_kw):
        """Return nothing on the initial scan, then expose the container.

        Accepts and ignores any arguments so the mock keeps working if
        the worker passes filters to ``containers.list(...)``.
        """
        list_calls["n"] += 1
        # First call (initial scan): empty. Subsequent (reconciler): one container.
        if list_calls["n"] == 1:
            return []
        return [missed_container]

    mock_client = MagicMock()
    mock_client.containers.list.side_effect = _list

    # First events() call raises a transient error that the
    # watcher catches → triggers its real 1s backoff sleep (time.sleep
    # is deliberately NOT patched here). During that sleep the asyncio
    # loop runs and the reconciler (ticking every 0.05s) gets ~20
    # chances to discover ``c-missed``.
    # Second call returns an empty iterator → watcher exits
    # cleanly so the test can unwind without a lingering thread.
    events_calls = {"n": 0}

    def _events_seq(**_kw):
        """Raise once to force a backoff, then return an empty stream."""
        events_calls["n"] += 1
        if events_calls["n"] == 1:
            raise RuntimeError("test: trigger backoff so reconciler can run")
        return iter([])

    mock_client.events.side_effect = _events_seq

    with patch("docker.from_env", return_value=mock_client), \
            patch("decnet.collector.worker.is_service_container", return_value=True):
        try:
            # The timeout is a safety net in case the worker keeps
            # running after the stream ends.
            await asyncio.wait_for(log_collector_worker(log_file), timeout=2.0)
        except (asyncio.TimeoutError, StopIteration, SystemExit):
            pass

    assert list_calls["n"] >= 2, (
        "reconciler should have run at least once after the initial scan; "
        f"got {list_calls['n']} calls to containers.list()"
    )