merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

View File

@@ -0,0 +1,840 @@
"""Tests for the host-side Docker log collector."""
import json
import asyncio
import threading
import time
import pytest
from types import SimpleNamespace
from unittest.mock import patch, MagicMock
from decnet.collector import parse_rfc5424, is_service_container, is_service_event
from decnet.collector.worker import (
_stream_container,
_load_service_container_names,
_should_ingest,
_reset_rate_limiter,
log_collector_worker,
)
# Container names handed back by the patched ``_load_service_container_names``
# in the fleet-state tests of this module.
_KNOWN_NAMES = {"omega-decky-http", "omega-decky-smtp", "relay-decky-ftp"}
def _make_container(name="omega-decky-http"):
return SimpleNamespace(name=name)
class TestParseRfc5424:
    """Tests for ``parse_rfc5424``.

    Exercises header fields, structured-data params, the free-form message
    and the attacker-IP extraction paths asserted below.
    """

    def _make_line(self, fields_str="", msg=""):
        """Build a ``decky-01`` / ``http`` / ``request`` syslog line.

        ``fields_str`` becomes the params of a ``relay@55555`` SD element
        (``-`` when empty); ``msg`` is appended after a single space.
        """
        if fields_str:
            sd = f"[relay@55555 {fields_str}]"
        else:
            sd = "-"
        if msg:
            suffix = f" {msg}"
        else:
            suffix = ""
        return f"<134>1 2024-01-15T12:00:00+00:00 decky-01 http - request {sd}{suffix}"

    def test_returns_none_for_non_decnet_line(self):
        parsed = parse_rfc5424("not a syslog line")
        assert parsed is None

    def test_returns_none_for_empty_line(self):
        parsed = parse_rfc5424("")
        assert parsed is None

    def test_parses_basic_fields(self):
        parsed = parse_rfc5424(self._make_line())
        assert parsed is not None
        expected = {"decky": "decky-01", "service": "http", "event_type": "request"}
        for key, value in expected.items():
            assert parsed[key] == value

    def test_parses_structured_data_fields(self):
        parsed = parse_rfc5424(
            self._make_line('src_ip="1.2.3.4" method="GET" path="/login"')
        )
        assert parsed is not None
        fields = parsed["fields"]
        assert fields["src_ip"] == "1.2.3.4"
        assert fields["method"] == "GET"
        assert fields["path"] == "/login"

    def test_extracts_attacker_ip_from_src_ip(self):
        parsed = parse_rfc5424(self._make_line('src_ip="10.0.0.5"'))
        assert parsed["attacker_ip"] == "10.0.0.5"

    def test_extracts_attacker_ip_from_src(self):
        parsed = parse_rfc5424(self._make_line('src="10.0.0.5"'))
        assert parsed["attacker_ip"] == "10.0.0.5"

    def test_extracts_attacker_ip_from_client_ip(self):
        parsed = parse_rfc5424(self._make_line('client_ip="10.0.0.7"'))
        assert parsed["attacker_ip"] == "10.0.0.7"

    def test_extracts_attacker_ip_from_remote_ip(self):
        parsed = parse_rfc5424(self._make_line('remote_ip="10.0.0.8"'))
        assert parsed["attacker_ip"] == "10.0.0.8"

    def test_extracts_attacker_ip_from_ip(self):
        parsed = parse_rfc5424(self._make_line('ip="10.0.0.9"'))
        assert parsed["attacker_ip"] == "10.0.0.9"

    def test_attacker_ip_defaults_to_unknown(self):
        parsed = parse_rfc5424(self._make_line('user="admin"'))
        assert parsed["attacker_ip"] == "Unknown"

    def test_parses_line_with_real_procid(self):
        """sshd/sudo log through native syslog, so rsyslog puts the real PID
        in PROCID rather than NILVALUE. Both forms must parse."""
        native_line = (
            "<38>1 2026-04-18T08:27:21.862365+00:00 omega-decky sshd 940 - - "
            "Accepted password for root from 192.168.1.5 port 43210 ssh2"
        )
        parsed = parse_rfc5424(native_line)
        assert parsed is not None
        assert parsed["decky"] == "omega-decky"
        assert parsed["service"] == "sshd"
        assert "Accepted password" in parsed["msg"]
        # No key=value pairs in native sshd output; the prose fallback has to
        # lift the IP out of "from <ip>".
        assert parsed["attacker_ip"] == "192.168.1.5"

    def test_extracts_attacker_ip_from_sshd_prose(self):
        """sshd relayed by rsyslog emits plain prose: no SD block, no
        key=value pairs. The remote IP must still be recovered."""
        samples = [
            (
                "<38>1 2026-04-27T03:08:48+00:00 dmz-gateway sshd 940 - - "
                "Failed password for root from 157.66.144.16 port 42772 ssh2",
                "157.66.144.16",
            ),
            (
                "<38>1 2026-04-27T03:08:45+00:00 dmz-gateway sshd 940 - - "
                "Connection from 157.66.144.16 port 42772 on 10.0.0.2 port 22 rdomain \"\"",
                "157.66.144.16",  # must beat the local listener 10.0.0.2
            ),
            (
                "<38>1 2026-04-27T03:08:49+00:00 dmz-gateway sshd 940 - - "
                "Connection closed by authenticating user root 157.66.144.16 port 42772 [preauth]",
                "157.66.144.16",
            ),
            (
                "<38>1 2026-04-27T03:08:46+00:00 dmz-gateway sshd 940 - - "
                "pam_unix(sshd:auth): authentication failure; "
                "logname= uid=0 euid=0 tty=ssh ruser= rhost=157.66.144.16 user=root",
                "157.66.144.16",
            ),
        ]
        for raw, want_ip in samples:
            parsed = parse_rfc5424(raw)
            assert parsed is not None, raw
            assert parsed["attacker_ip"] == want_ip, (raw, parsed["attacker_ip"])

    def test_extracts_attacker_ip_from_msg_body_kv(self):
        """The SSH container's bash PROMPT_COMMAND runs
        `logger -t bash "CMD ... src=IP ..."`, yielding an RFC 5424 line with
        NILVALUE SD: the IP sits in the free-form msg, not in SD params. The
        collector should still pick it up."""
        bash_line = (
            "<134>1 2024-01-15T12:00:00+00:00 decky-01 bash - - - "
            "CMD uid=0 user=root src=198.51.100.7 pwd=/root cmd=ls -la"
        )
        parsed = parse_rfc5424(bash_line)
        assert parsed is not None
        assert parsed["attacker_ip"] == "198.51.100.7"
        # `fields` must remain empty: the frontend's parseEventBody renders
        # kv pairs directly from msg, and duplicate pills are unwanted.
        assert parsed["fields"] == {}
        assert "CMD uid=0" in parsed["msg"]

    def test_sd_ip_wins_over_msg_body(self):
        """An IP carried in SD params must not be overwritten by the
        msg-body fallback."""
        mixed_line = (
            '<134>1 2024-01-15T12:00:00+00:00 decky-01 ssh - login '
            '[relay@55555 src_ip="1.2.3.4"] rogue src=9.9.9.9 entry'
        )
        parsed = parse_rfc5424(mixed_line)
        assert parsed["attacker_ip"] == "1.2.3.4"
        # SD takes precedence; the msg's `src=` is not merged into fields
        # (it stays in msg).
        fields = parsed["fields"]
        assert fields["src_ip"] == "1.2.3.4"
        assert "src" not in fields

    def test_parses_msg(self):
        parsed = parse_rfc5424(self._make_line(msg="hello world"))
        assert parsed["msg"] == "hello world"

    def test_nilvalue_sd_with_msg(self):
        parsed = parse_rfc5424(
            "<134>1 2024-01-15T12:00:00+00:00 decky-01 http - request - some message"
        )
        assert parsed is not None
        assert parsed["msg"] == "some message"
        assert parsed["fields"] == {}

    def test_raw_line_preserved(self):
        raw = self._make_line('src_ip="1.2.3.4"')
        parsed = parse_rfc5424(raw)
        assert parsed["raw_line"] == raw

    def test_timestamp_formatted(self):
        parsed = parse_rfc5424(self._make_line())
        assert parsed["timestamp"] == "2024-01-15 12:00:00"

    def test_unescapes_sd_values(self):
        parsed = parse_rfc5424(self._make_line(r'path="/foo\"bar"'))
        assert parsed["fields"]["path"] == '/foo"bar'

    def test_result_json_serializable(self):
        parsed = parse_rfc5424(
            self._make_line('src_ip="1.2.3.4" username="admin" password="s3cr3t"')
        )
        # Serialization must succeed without raising.
        json.dumps(parsed)

    def test_invalid_timestamp_preserved_as_is(self):
        parsed = parse_rfc5424("<134>1 not-a-date decky-01 http - request -")
        assert parsed is not None
        assert parsed["timestamp"] == "not-a-date"

    def test_sd_rest_is_plain_text(self):
        # An SD position that starts with neither '-' nor '[' is treated as msg.
        parsed = parse_rfc5424(
            "<134>1 2024-01-15T12:00:00+00:00 decky-01 http - request hello world"
        )
        assert parsed is not None
        assert parsed["msg"] == "hello world"

    def test_sd_with_msg_after_bracket(self):
        parsed = parse_rfc5424(
            '<134>1 2024-01-15T12:00:00+00:00 decky-01 http - request [relay@55555 src_ip="1.2.3.4"] login attempt'
        )
        assert parsed is not None
        assert parsed["fields"]["src_ip"] == "1.2.3.4"
        assert parsed["msg"] == "login attempt"
class TestIsServiceContainer:
    """``is_service_container`` matched against names loaded from state."""

    @staticmethod
    def _state(names):
        """Patch the state loader so it reports ``names``."""
        return patch(
            "decnet.collector.worker._load_service_container_names",
            return_value=names,
        )

    def test_known_container_returns_true(self):
        with self._state(_KNOWN_NAMES):
            for name in ("omega-decky-http", "omega-decky-smtp", "relay-decky-ftp"):
                assert is_service_container(_make_container(name)) is True

    def test_base_container_returns_false(self):
        base = _make_container("omega-decky")
        with self._state(_KNOWN_NAMES):
            assert is_service_container(base) is False

    def test_unrelated_container_returns_false(self):
        unrelated = _make_container("nginx")
        with self._state(_KNOWN_NAMES):
            assert is_service_container(unrelated) is False

    def test_strips_leading_slash(self):
        with self._state(_KNOWN_NAMES):
            service = _make_container("/omega-decky-http")
            assert is_service_container(service) is True
            base = _make_container("/omega-decky")
            assert is_service_container(base) is False

    def test_no_state_returns_false(self):
        with self._state(set()):
            assert is_service_container(_make_container("omega-decky-http")) is False

    def test_string_argument(self):
        expectations = (
            ("omega-decky-http", True),
            ("/omega-decky-http", True),
            ("nginx", False),
        )
        with self._state(_KNOWN_NAMES):
            for name, verdict in expectations:
                assert is_service_container(name) is verdict
class TestIsServiceEvent:
    """``is_service_event`` matched against names loaded from state."""

    @staticmethod
    def _state(names):
        """Patch the state loader so it reports ``names``."""
        return patch(
            "decnet.collector.worker._load_service_container_names",
            return_value=names,
        )

    def test_known_service_event_returns_true(self):
        attrs = {"name": "omega-decky-smtp"}
        with self._state(_KNOWN_NAMES):
            assert is_service_event(attrs) is True

    def test_base_event_returns_false(self):
        attrs = {"name": "omega-decky"}
        with self._state(_KNOWN_NAMES):
            assert is_service_event(attrs) is False

    def test_unrelated_event_returns_false(self):
        attrs = {"name": "nginx"}
        with self._state(_KNOWN_NAMES):
            assert is_service_event(attrs) is False

    def test_no_state_returns_false(self):
        attrs = {"name": "omega-decky-smtp"}
        with self._state(set()):
            assert is_service_event(attrs) is False

    def test_strips_leading_slash(self):
        attrs = {"name": "/omega-decky-smtp"}
        with self._state(_KNOWN_NAMES):
            assert is_service_event(attrs) is True

    def test_empty_name(self):
        with self._state(_KNOWN_NAMES):
            for attrs in ({"name": ""}, {}):
                assert is_service_event(attrs) is False
class TestTopologyLabelDiscovery:
    """MazeNET topology containers are absent from decnet-state.json; the
    collector finds them through compose-time labels instead."""

    @staticmethod
    def _state(names):
        """Patch the state loader so it reports ``names``."""
        return patch(
            "decnet.collector.worker._load_service_container_names",
            return_value=names,
        )

    def _labelled(self, name: str, labels: dict):
        """Stub a container exposing ``labels`` directly and under ``attrs``."""
        attrs = {"Config": {"Labels": labels}}
        return SimpleNamespace(name=name, attrs=attrs, labels=labels)

    def test_topology_labelled_container_matches(self):
        """Unknown name + decnet.topology.service=true label → True."""
        with self._state(set()):
            labels = {"decnet.topology.service": "true", "decnet.topology.id": "abc"}
            container = self._labelled("decky-2966-ssh", labels)
            assert is_service_container(container) is True

    def test_base_container_label_does_not_match(self):
        """Base containers have decnet.topology.role=base but lack the
        service marker. The collector must skip them, otherwise the sshd auth
        stream is captured twice (base and service share)."""
        with self._state(set()):
            labels = {"decnet.topology.role": "base", "decnet.topology.id": "abc"}
            container = self._labelled("decnet_t_af22dae8_decky-2966", labels)
            assert is_service_container(container) is False

    def test_unrelated_container_with_labels_does_not_match(self):
        with self._state(set()):
            container = self._labelled(
                "portainer", {"com.docker.compose.project": "portainer"}
            )
            assert is_service_container(container) is False

    def test_topology_event_matches_via_label(self):
        """Docker start events flatten labels next to 'name' in attrs;
        is_service_event has to recognise that shape."""
        with self._state(set()):
            event_attrs = {
                "name": "decky-2966-ssh",
                "decnet.topology.service": "true",
                "decnet.topology.id": "abc",
            }
            assert is_service_event(event_attrs) is True

    def test_fleet_and_topology_coexist(self):
        """A name present in state matches as fleet; the topology label
        covers containers that are not in state."""
        with self._state(_KNOWN_NAMES):
            from_fleet = _make_container("omega-decky-http")
            from_topology = self._labelled(
                "decky-2966-ssh", {"decnet.topology.service": "true"}
            )
            assert is_service_container(from_fleet) is True
            assert is_service_container(from_topology) is True

    def test_stub_without_attrs_still_works_via_labels(self):
        """Older stubs are SimpleNamespace(name=…) without .attrs; the
        fallback to .labels should still evaluate."""
        with self._state(set()):
            container = SimpleNamespace(
                name="decky-2966-ssh",
                labels={"decnet.topology.service": "true"},
            )
            assert is_service_container(container) is True
class TestFleetLabelDiscovery:
    """Fleet (legacy) containers that ``decnet/composer.py`` stamps with
    ``decnet.fleet.service=true`` must be seen by the events watcher even
    before ``decnet-state.json`` has been refreshed. That race used to leave
    freshly-deployed containers silently ignored."""

    @staticmethod
    def _state(names):
        """Patch the state loader so it reports ``names``."""
        return patch(
            "decnet.collector.worker._load_service_container_names",
            return_value=names,
        )

    def _labelled(self, name: str, labels: dict):
        """Stub a container exposing ``labels`` directly and under ``attrs``."""
        attrs = {"Config": {"Labels": labels}}
        return SimpleNamespace(name=name, attrs=attrs, labels=labels)

    def test_fleet_labelled_container_matches_without_state(self):
        with self._state(set()):
            labels = {"decnet.fleet.service": "true", "decnet.fleet.decky": "omega-decky"}
            container = self._labelled("omega-decky-ssh", labels)
            assert is_service_container(container) is True

    def test_fleet_labelled_event_matches_without_state(self):
        with self._state(set()):
            event_attrs = {
                "name": "omega-decky-ssh",
                "decnet.fleet.service": "true",
                "decnet.fleet.decky": "omega-decky",
            }
            assert is_service_event(event_attrs) is True

    def test_unlabelled_event_falls_back_to_state(self):
        """Containers built before this label existed still match by name."""
        with self._state(_KNOWN_NAMES):
            assert is_service_event({"name": "omega-decky-http"}) is True

    def test_unrelated_label_does_not_match(self):
        with self._state(set()):
            labels = {"com.docker.compose.project": "redis", "decnet.fleet.service": "false"}
            container = self._labelled("redis", labels)
            assert is_service_container(container) is False
            event_attrs = {"name": "redis", "decnet.fleet.service": "false"}
            assert is_service_event(event_attrs) is False
class TestLoadServiceContainerNames:
    """``_load_service_container_names`` reading ``decnet.config.STATE_FILE``."""

    def test_with_valid_state(self, tmp_path, monkeypatch):
        import decnet.config
        from decnet.config import DeckyConfig, DecnetConfig

        decky = DeckyConfig(
            name="decky-01",
            ip="192.168.1.10",
            services=["ssh", "http"],
            distro="debian",
            base_image="debian",
            hostname="test",
            build_base="debian:bookworm-slim",
        )
        config = DecnetConfig(
            mode="unihost",
            interface="eth0",
            subnet="192.168.1.0/24",
            gateway="192.168.1.1",
            deckies=[decky],
        )
        state = {"config": config.model_dump(), "compose_path": "test.yml"}
        state_file = tmp_path / "state.json"
        state_file.write_text(json.dumps(state))
        monkeypatch.setattr(decnet.config, "STATE_FILE", state_file)

        # One name per (decky, service) pair.
        assert _load_service_container_names() == {"decky-01-ssh", "decky-01-http"}

    def test_no_state(self, tmp_path, monkeypatch):
        import decnet.config

        missing = tmp_path / "nonexistent.json"
        monkeypatch.setattr(decnet.config, "STATE_FILE", missing)
        assert _load_service_container_names() == set()
class TestStreamContainer:
    """``_stream_container`` fed by a mocked docker client."""

    @staticmethod
    def _docker_client(chunks):
        """Return a mock docker client whose container's ``logs()`` yields ``chunks``."""
        container = MagicMock()
        container.logs.return_value = chunks
        client = MagicMock()
        client.containers.get.return_value = container
        return client

    def test_streams_rfc5424_lines(self, tmp_path):
        log_path = tmp_path / "test.log"
        json_path = tmp_path / "test.json"
        rfc_line = '<134>1 2024-01-15T12:00:00+00:00 decky-01 ssh - auth [relay@55555 src_ip="1.2.3.4"] login\n'
        client = self._docker_client([rfc_line.encode("utf-8")])

        with patch("docker.from_env", return_value=client):
            _stream_container("test-id", log_path, json_path)

        assert log_path.exists()
        assert "decky-01" in log_path.read_text()
        assert json_path.exists()
        record = json.loads(json_path.read_text().strip())
        assert record["service"] == "ssh"

    def test_handles_non_rfc5424_lines(self, tmp_path):
        log_path = tmp_path / "test.log"
        json_path = tmp_path / "test.json"
        client = self._docker_client([b"just a plain log line\n"])

        with patch("docker.from_env", return_value=client):
            _stream_container("test-id", log_path, json_path)

        assert log_path.exists()
        # The JSON file only appears once an RFC5424 line is parsed; plain
        # lines never create it.
        assert not json_path.exists() or json_path.read_text() == ""

    def test_handles_docker_error(self, tmp_path):
        log_path = tmp_path / "test.log"
        json_path = tmp_path / "test.json"
        client = MagicMock()
        client.containers.get.side_effect = Exception("Container not found")

        # Must return normally: the error is logged, not raised.
        with patch("docker.from_env", return_value=client):
            _stream_container("bad-id", log_path, json_path)

    def test_skips_empty_lines(self, tmp_path):
        log_path = tmp_path / "test.log"
        json_path = tmp_path / "test.json"
        client = self._docker_client([b"\n\n\n"])

        with patch("docker.from_env", return_value=client):
            _stream_container("test-id", log_path, json_path)

        # Every line was empty, so no file is created (lazy open).
        assert not log_path.exists() or log_path.read_text() == ""

    def test_log_file_recreated_after_deletion(self, tmp_path):
        log_path = tmp_path / "test.log"
        json_path = tmp_path / "test.json"
        first = b"first line\n"
        second = b"second line\n"

        def _chunks():
            yield first
            log_path.unlink()  # simulate deletion between writes
            yield second

        client = self._docker_client(_chunks())
        with patch("docker.from_env", return_value=client):
            _stream_container("test-id", log_path, json_path)

        assert log_path.exists(), "log file must be recreated after deletion"
        assert "second line" in log_path.read_text()

    def test_json_file_recreated_after_deletion(self, tmp_path):
        log_path = tmp_path / "test.log"
        json_path = tmp_path / "test.json"
        rfc_line = (
            '<134>1 2024-01-15T12:00:00+00:00 decky-01 ssh - auth '
            '[relay@55555 src_ip="1.2.3.4"] login\n'
        )
        encoded = rfc_line.encode("utf-8")

        def _chunks():
            yield encoded
            # Drop the json file between writes so the second RFC line has
            # to open a fresh one.
            if json_path.exists():
                json_path.unlink()
            yield encoded

        client = self._docker_client(_chunks())
        with patch("docker.from_env", return_value=client):
            _stream_container("test-id", log_path, json_path)

        assert json_path.exists(), "json file must be recreated after deletion"
        records = [
            entry for entry in json_path.read_text().splitlines() if entry.strip()
        ]
        assert len(records) >= 1

    def test_rotated_file_detected(self, tmp_path):
        """Simulate logrotate: the old file is renamed away and the next
        write should land in a fresh file."""
        log_path = tmp_path / "test.log"
        json_path = tmp_path / "test.json"
        rotated = tmp_path / "test.log.1"
        before = b"before rotation\n"
        after = b"after rotation\n"

        def _chunks():
            yield before
            log_path.rename(rotated)  # logrotate renames old file
            yield after

        client = self._docker_client(_chunks())
        with patch("docker.from_env", return_value=client):
            _stream_container("test-id", log_path, json_path)

        assert log_path.exists(), "new log file must be created after rotation"
        assert "after rotation" in log_path.read_text()
        assert "before rotation" in rotated.read_text()
class TestIngestRateLimiter:
    """``_should_ingest`` dedupe behaviour; limiter state is reset per test."""

    def setup_method(self):
        _reset_rate_limiter()

    def _event(self, event_type="connect", attacker_ip="1.2.3.4",
               decky="decky-01", service="ssh"):
        """Build the four-key event dict passed to ``_should_ingest``."""
        return dict(
            event_type=event_type,
            attacker_ip=attacker_ip,
            decky=decky,
            service=service,
        )

    def test_non_limited_event_types_always_pass(self):
        # login_attempt / request / etc. carry distinguishing payload and are
        # never deduped.
        for _ in range(5):
            for kind in ("login_attempt", "request"):
                assert _should_ingest(self._event(event_type=kind)) is True

    def test_first_connect_passes(self):
        assert _should_ingest(self._event()) is True

    def test_duplicate_connect_within_window_is_dropped(self):
        for verdict in (True, False, False):
            assert _should_ingest(self._event()) is verdict

    def test_different_attackers_tracked_independently(self):
        for ip in ("1.1.1.1", "2.2.2.2"):
            assert _should_ingest(self._event(attacker_ip=ip)) is True

    def test_different_deckies_tracked_independently(self):
        for decky in ("a", "b"):
            assert _should_ingest(self._event(decky=decky)) is True

    def test_different_services_tracked_independently(self):
        for service in ("ssh", "http"):
            assert _should_ingest(self._event(service=service)) is True

    def test_disconnect_and_connect_tracked_independently(self):
        for kind in ("connect", "disconnect"):
            assert _should_ingest(self._event(event_type=kind)) is True

    def test_window_expiry_allows_next_event(self, monkeypatch):
        import decnet.collector.worker as worker

        clock = {"now": 1000.0}
        monkeypatch.setattr(worker.time, "monotonic", lambda: clock["now"])
        assert _should_ingest(self._event()) is True
        assert _should_ingest(self._event()) is False
        # Move past the 1-second window.
        clock["now"] += 1.5
        assert _should_ingest(self._event()) is True

    def test_window_zero_disables_limiter(self, monkeypatch):
        import decnet.collector.worker as worker

        monkeypatch.setattr(worker, "_RL_WINDOW_SEC", 0.0)
        for _ in range(10):
            assert _should_ingest(self._event()) is True

    def test_raw_log_gets_all_lines_json_dedupes(self, tmp_path):
        """End-to-end: duplicates reach the .log file but NOT the .json stream."""
        log_path = tmp_path / "test.log"
        json_path = tmp_path / "test.json"
        line = (
            '<134>1 2024-01-15T12:00:00+00:00 decky-01 ssh - connect '
            '[relay@55555 src_ip="1.2.3.4"]\n'
        )
        payload = (line * 5).encode("utf-8")
        container = MagicMock()
        container.logs.return_value = [payload]
        client = MagicMock()
        client.containers.get.return_value = container

        with patch("docker.from_env", return_value=client):
            _stream_container("test-id", log_path, json_path)

        # Raw log keeps all 5 lines (forensic fidelity).
        assert log_path.read_text().count("\n") == 5
        # JSON ingest keeps only the first; the limiter drops the other 4.
        records = [
            entry for entry in json_path.read_text().splitlines() if entry.strip()
        ]
        assert len(records) == 1

    def test_gc_trims_oversized_map(self, monkeypatch):
        import decnet.collector.worker as worker

        # Fill the map with entries that will go stale, then exceed the cap.
        monkeypatch.setattr(worker, "_RL_MAX_ENTRIES", 10)
        clock = {"now": 1000.0}
        monkeypatch.setattr(worker.time, "monotonic", lambda: clock["now"])
        for i in range(9):
            assert _should_ingest(self._event(attacker_ip=f"10.0.0.{i}")) is True
        # Jump well past 60 windows so the earlier entries are stale.
        clock["now"] += 1000.0
        # This insert brings len to 10; GC fires on >10, so it stays.
        assert _should_ingest(self._event(attacker_ip="10.0.0.99")) is True
        assert _should_ingest(self._event(attacker_ip="10.0.0.100")) is True
        # Once the map is over the cap, stale entries must be gone.
        assert len(worker._rl_last) < 10
class TestLogCollectorWorker:
    """Async tests for ``log_collector_worker`` driven by a mocked docker client.

    Each test patches ``docker.from_env`` and bounds the worker with
    ``asyncio.wait_for`` because the worker loops.
    """

    @pytest.mark.asyncio
    async def test_worker_initial_discovery(self, tmp_path):
        """The worker lists containers once during its initial scan."""
        log_file = str(tmp_path / "decnet.log")
        mock_container = MagicMock()
        mock_container.id = "c1"
        mock_container.name = "/s-1"
        # Mock labels to satisfy is_service_container
        mock_container.labels = {"com.docker.compose.project": "decnet"}
        mock_client = MagicMock()
        mock_client.containers.list.return_value = [mock_container]
        # Make events return an empty generator/iterator immediately
        mock_client.events.return_value = iter([])
        with patch("docker.from_env", return_value=mock_client), \
                patch("decnet.collector.worker.is_service_container", return_value=True):
            # Run with a short task timeout because it loops
            try:
                await asyncio.wait_for(log_collector_worker(log_file), timeout=0.1)
            except (asyncio.TimeoutError, StopIteration):
                pass
        # Should have tried to list and watch events
        mock_client.containers.list.assert_called_once()

    @pytest.mark.asyncio
    async def test_worker_handles_events(self, tmp_path):
        """The worker subscribes to the docker event stream exactly once."""
        log_file = str(tmp_path / "decnet.log")
        mock_client = MagicMock()
        mock_client.containers.list.return_value = []
        event = {
            "id": "c2",
            "Actor": {"Attributes": {"name": "s-2", "com.docker.compose.project": "decnet"}}
        }
        mock_client.events.return_value = iter([event])
        with patch("docker.from_env", return_value=mock_client), \
                patch("decnet.collector.worker.is_service_event", return_value=True):
            try:
                await asyncio.wait_for(log_collector_worker(log_file), timeout=0.1)
            except (asyncio.TimeoutError, StopIteration):
                pass
        mock_client.events.assert_called_once()

    @pytest.mark.asyncio
    async def test_worker_exception_handling(self, tmp_path):
        """A failing ``containers.list()`` must not escape the worker."""
        log_file = str(tmp_path / "decnet.log")
        mock_client = MagicMock()
        mock_client.containers.list.side_effect = Exception("Docker down")
        with patch("docker.from_env", return_value=mock_client):
            # Should not raise
            await log_collector_worker(log_file)

    @pytest.mark.asyncio
    async def test_event_watcher_retries_on_stream_break(self, tmp_path, monkeypatch):
        """A docker stream-decode hiccup must not silently end the
        watcher: today the executor task would return cleanly and
        future container starts would be dropped until an operator
        restarted the unit. The retry loop is what keeps the collector
        honest across daemon reloads."""
        log_file = str(tmp_path / "decnet.log")
        # Patch time.sleep inside the worker so the retry's backoff
        # doesn't actually wait — keeps the test under the budget.
        monkeypatch.setattr("decnet.collector.worker.time.sleep", lambda *_: None)
        # Sequence: the first events() call raises (transient error), the
        # second returns an empty iterator so the watcher can finish.
        # We don't try to assert _spawn was called: the dispatch path
        # uses ``loop.call_soon_threadsafe(_spawn, ...)`` and patching
        # the abstract loop method doesn't reach the concrete loop.
        # The retry contract is fully verified by counting reconnect
        # attempts.
        events_calls = {"n": 0}

        def _events(**_kw):
            events_calls["n"] += 1
            if events_calls["n"] == 1:
                raise RuntimeError("stream decode error")
            # Second call: clean exit. Watcher's retry means call #2
            # happens at all; without retry, the RuntimeError would
            # propagate out of the executor and the watcher would
            # never call events() again.
            return iter([])

        mock_client = MagicMock()
        mock_client.containers.list.return_value = []
        mock_client.events.side_effect = _events
        with patch("docker.from_env", return_value=mock_client), \
                patch("decnet.collector.worker.is_service_event", return_value=True):
            try:
                await asyncio.wait_for(log_collector_worker(log_file), timeout=2.0)
            except (asyncio.TimeoutError, StopIteration, SystemExit):
                pass
        assert events_calls["n"] >= 2, (
            f"expected >=2 events() calls (one failure + one reconnect) "
            f"proving the retry loop, got {events_calls['n']}"
        )

    @pytest.mark.asyncio
    async def test_reconciler_picks_up_missed_container(self, tmp_path, monkeypatch):
        """Even if the event watcher wedges, the reconciler must catch
        any service container that's already running. Simulates the
        first-VPS-deploy bug: events() never yields, but a service
        container exists in containers.list() — the worker had to be
        restarted to pick it up. Now the reconciler does it
        within RECONCILE_INTERVAL_S."""
        log_file = str(tmp_path / "decnet.log")
        monkeypatch.setenv("DECNET_COLLECTOR_RECONCILE_S", "0.05")
        missed_container = MagicMock()
        missed_container.id = "c-missed"
        missed_container.name = "/missed-svc"
        list_calls = {"n": 0}

        def _list():
            list_calls["n"] += 1
            # First call (initial scan): empty. Subsequent (reconciler): one container.
            if list_calls["n"] == 1:
                return []
            return [missed_container]

        mock_client = MagicMock()
        mock_client.containers.list.side_effect = _list
        # First events() call raises a transient error that the
        # watcher catches → triggers its real 1s backoff sleep. During
        # that sleep the asyncio loop runs and the reconciler (ticking
        # every 0.05s) gets ~20 chances to discover ``c-missed``.
        # Second call returns an empty iterator → watcher exits
        # cleanly so the test can unwind without a lingering thread.
        events_calls = {"n": 0}

        def _events_seq(**_kw):
            events_calls["n"] += 1
            if events_calls["n"] == 1:
                raise RuntimeError("test: trigger backoff so reconciler can run")
            return iter([])

        mock_client.events.side_effect = _events_seq
        with patch("docker.from_env", return_value=mock_client), \
                patch("decnet.collector.worker.is_service_container", return_value=True):
            try:
                await asyncio.wait_for(log_collector_worker(log_file), timeout=2.0)
            except (asyncio.TimeoutError, StopIteration, SystemExit):
                pass
        assert list_calls["n"] >= 2, (
            "reconciler should have run at least once after the initial scan; "
            f"got {list_calls['n']} calls to containers.list()"
        )

View File

@@ -0,0 +1,185 @@
"""Bus wiring for the collector (DEBT-031, worker 5).
Collector streams logs from Docker containers in a thread pool — can't be
exercised cleanly under pytest. These tests pin the two things that
actually carry the contract:
1. ``_stream_container`` invokes ``publish_fn(parsed)`` right after writing
the JSON record, and skips publish when the hook is absent.
2. ``_make_system_log_publisher`` routes under ``system.log`` with the
expected compact payload shape.
"""
from __future__ import annotations
import asyncio
import json
import pytest
import pytest_asyncio
from decnet.bus.fake import FakeBus
from decnet.collector.worker import (
_make_system_log_publisher,
_stream_container,
)
@pytest_asyncio.fixture
async def bus() -> FakeBus:
    """Yield a connected ``FakeBus`` and close it once the test is done."""
    fake_bus = FakeBus()
    await fake_bus.connect()
    yield fake_bus
    await fake_bus.close()
# ─── Thread-safe publisher factory ───────────────────────────────────────────
@pytest.mark.asyncio
async def test_publisher_routes_under_system_log(bus: FakeBus) -> None:
    """A published record shows up on ``system.log`` with the compact payload."""
    record = {
        "timestamp": "2026-04-21 10:00:00",
        "decky": "decky-a",
        "service": "ssh",
        "event_type": "auth_fail",
        "attacker_ip": "1.2.3.4",
    }
    publish = _make_system_log_publisher(bus, asyncio.get_running_loop())
    subscription = bus.subscribe("system.log")
    async with subscription:
        publish(record)
        # Bounded wait so a missing event fails the test instead of hanging.
        event = await asyncio.wait_for(subscription.__anext__(), timeout=2.0)

    assert event.topic == "system.log"
    assert event.type == "auth_fail"
    assert event.payload == {
        "decky": "decky-a",
        "service": "ssh",
        "event_type": "auth_fail",
        "attacker_ip": "1.2.3.4",
        "timestamp": "2026-04-21 10:00:00",
    }
@pytest.mark.asyncio
async def test_publisher_no_bus_is_noop() -> None:
    """With no bus, the factory still returns something safe to call."""
    # The get_bus() failure path yields None → publisher is a no-op callable.
    running_loop = asyncio.get_running_loop()
    noop_publish = _make_system_log_publisher(None, running_loop)
    # Calling it must neither raise nor hang.
    noop_publish({"event_type": "anything"})
# ─── Stream-thread integration: publish_fn wiring ────────────────────────────
class _FakeContainer:
"""Minimal duck-typed stand-in for docker.Container.logs(stream=True)."""
def __init__(self, lines: list[bytes]) -> None:
self._lines = lines
def logs(self, stream=True, follow=True, stdout=True, stderr=False):
yield from self._lines
class _FakeDockerClient:
def __init__(self, container: _FakeContainer) -> None:
self.containers = self # so .get() lookup below works
self._container = container
def get(self, _container_id: str) -> _FakeContainer:
return self._container
def _make_rfc5424_line() -> str:
# Crafted to pass _RFC5424_RE in collector.worker.
return (
"<134>1 2026-04-21T10:00:00+00:00 decky-a ssh - auth_fail "
"[decnet@32473 src_ip=\"1.2.3.4\"] failed password"
)
def test_stream_container_invokes_publish_fn(monkeypatch, tmp_path):
    """One parseable line yields one ``publish_fn`` call and one JSON record."""
    raw_line = _make_rfc5424_line().encode() + b"\n"
    docker_client = _FakeDockerClient(_FakeContainer([raw_line]))

    import docker as _docker_mod

    # Hand the fake client to whoever calls docker.from_env().
    monkeypatch.setattr(_docker_mod, "from_env", lambda: docker_client)

    published: list[dict] = []

    def _record(parsed):
        published.append(parsed)

    json_path = tmp_path / "decnet.json"
    _stream_container(
        "cid-xyz",
        tmp_path / "decnet.log",
        json_path,
        publish_fn=_record,
    )

    # One parseable line → one publish call with the parsed dict.
    assert len(published) == 1
    event = published[0]
    assert event["decky"] == "decky-a"
    assert event["service"] == "ssh"
    assert event["event_type"] == "auth_fail"

    # JSON file still written — bus publishing is additive, not a replacement.
    json_records = json_path.read_text().strip().splitlines()
    assert len(json_records) == 1
    assert json.loads(json_records[0])["event_type"] == "auth_fail"
def test_stream_container_runs_without_publish_fn(monkeypatch, tmp_path):
    """Without a ``publish_fn`` the stream still runs and writes the JSON record."""
    # Pre-bus behavior: no publish_fn, no crash, JSON still written.
    raw_line = _make_rfc5424_line().encode() + b"\n"
    docker_client = _FakeDockerClient(_FakeContainer([raw_line]))

    import docker as _docker_mod

    monkeypatch.setattr(_docker_mod, "from_env", lambda: docker_client)

    json_path = tmp_path / "decnet.json"
    _stream_container("cid-xyz", tmp_path / "decnet.log", json_path)

    json_records = json_path.read_text().strip().splitlines()
    assert len(json_records) == 1
def test_stream_container_swallows_publish_failures(monkeypatch, tmp_path):
    """A raising ``publish_fn`` must not abort streaming or the JSON write."""

    def _boom(_parsed):
        raise RuntimeError("transport exploded")

    raw_line = _make_rfc5424_line().encode() + b"\n"
    docker_client = _FakeDockerClient(_FakeContainer([raw_line]))

    import docker as _docker_mod

    monkeypatch.setattr(_docker_mod, "from_env", lambda: docker_client)

    json_path = tmp_path / "decnet.json"
    # Hook failure must not abort the stream thread, so this call must not raise.
    _stream_container(
        "cid-xyz",
        tmp_path / "decnet.log",
        json_path,
        publish_fn=_boom,
    )

    json_records = json_path.read_text().strip().splitlines()
    assert len(json_records) == 1
# ─── Bus-disabled escape hatch ───────────────────────────────────────────────
@pytest.mark.asyncio
async def test_collector_degrades_cleanly_when_bus_disabled(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With ``DECNET_BUS_ENABLED=false`` the bus lifecycle must run without raising.

    There are no explicit assertions: connect, publish and close completing
    without an exception is the pass condition.
    """
    from decnet.bus.factory import get_bus

    monkeypatch.setenv("DECNET_BUS_ENABLED", "false")

    disabled_bus = get_bus(client_name="collector")
    await disabled_bus.connect()
    await disabled_bus.publish(
        "system.log",
        {"event_type": "auth_fail"},
        event_type="auth_fail",
    )
    await disabled_bus.close()

View File

@@ -0,0 +1,94 @@
"""Verify that the collector and sniffer use dedicated thread pools
instead of the default asyncio executor — preventing starvation of
short-lived ``asyncio.to_thread`` calls in the web API layer."""
import asyncio
from concurrent.futures import ThreadPoolExecutor
from unittest.mock import patch, MagicMock, AsyncMock
import pytest
from decnet.collector.worker import log_collector_worker
from decnet.sniffer.worker import sniffer_worker
class TestCollectorDedicatedPool:
    """Collector log streams must NOT use the default asyncio executor."""

    @pytest.mark.asyncio
    async def test_stream_containers_use_dedicated_pool(self, tmp_path):
        """Spawning container log threads should go through a dedicated
        ThreadPoolExecutor, not the default loop executor.

        ``loop.run_in_executor`` is replaced by a spy that records the
        executor argument and raises ``CancelledError`` at once, so the
        worker exits before any blocking stream function runs.
        """
        log_file = str(tmp_path / "decnet.log")
        captured_executors: list[ThreadPoolExecutor | None] = []

        async def _spy_run_in_executor(executor, func, *args):
            captured_executors.append(executor)
            # Don't actually run the blocking function — raise to exit.
            raise asyncio.CancelledError

        # A single fake container for the worker to discover.
        fake_container = MagicMock()
        fake_container.id = "abc123"
        fake_container.name = "/omega-decky-http"

        fake_client = MagicMock()
        fake_client.containers.list.return_value = [fake_container]

        # Stand-in for the ``docker`` module, injected through sys.modules.
        mock_docker = MagicMock()
        mock_docker.from_env.return_value = fake_client

        with (
            patch.dict("sys.modules", {"docker": mock_docker}),
            patch(
                "decnet.collector.worker.is_service_container",
                return_value=True,
            ),
        ):
            loop = asyncio.get_running_loop()
            with patch.object(loop, "run_in_executor", side_effect=_spy_run_in_executor):
                with pytest.raises(asyncio.CancelledError):
                    await log_collector_worker(log_file)

        # The executor passed should be a dedicated pool, not None (default).
        assert len(captured_executors) >= 1
        for executor in captured_executors:
            assert executor is not None, (
                "Collector used default executor (None) — must use a dedicated pool"
            )
            assert isinstance(executor, ThreadPoolExecutor)
class TestSnifferDedicatedPool:
    """Sniffer sniff loop must NOT use the default asyncio executor."""

    @pytest.mark.asyncio
    async def test_sniff_loop_uses_dedicated_pool(self, tmp_path):
        """Every executor handed to ``run_in_executor`` must be a real pool.

        The spy records the executor argument and raises ``CancelledError``
        straight away, so the worker returns without running a sniff loop.
        """
        seen_executors: list[ThreadPoolExecutor | None] = []

        async def _spy_run_in_executor(executor, func, *args):
            seen_executors.append(executor)
            raise asyncio.CancelledError

        log_file = str(tmp_path / "decnet.log")
        loop = asyncio.get_running_loop()

        with (
            patch("decnet.sniffer.worker._interface_exists", return_value=True),
            patch.dict("os.environ", {"DECNET_SNIFFER_IFACE": "eth0"}),
            patch.object(loop, "run_in_executor", side_effect=_spy_run_in_executor),
            pytest.raises(asyncio.CancelledError),
        ):
            await sniffer_worker(log_file)

        assert len(seen_executors) >= 1
        for pool in seen_executors:
            assert pool is not None, (
                "Sniffer used default executor (None) — must use a dedicated pool"
            )
            assert isinstance(pool, ThreadPoolExecutor)