Files
DECNET/tests/clustering/test_clusterer_worker.py
anti 245975a6dd fix(security): close LOW ASVS findings — env bypass, SSE/deployment authz, CN fail-close, password byte-limit, exception leaks, BUG-12..16
Auth/session (V2.1.7, V4.1.5, V4.1.6, V2.1.4/V2.1.5):
- env secret validation no longer bypassed by attacker-injectable PYTEST* env;
  gated on explicit DECNET_TESTING=1 (set only in conftest).
- must_change_password now enforced on the SSE header-JWT path, not just ticket mint.
- GET /system/deployment-mode requires viewer auth (was leaking role + topology size).
- CreateUser/ResetUser passwords min_length=12; passwords >72 bytes rejected
  explicitly instead of bcrypt silently truncating.

Swarm ingestion (V9.1.3, BUG-16):
- Log listener hard-rejects peers with unparseable/empty cert CN (fail closed,
  ingests nothing) instead of tagging 'unknown'.
- Shutdown handlers no longer swallow real errors (narrowed to CancelledError).

Info leakage (V7.1.2, V14.1.2):
- Exception text sanitized on swarm-update, health, tarpit, realism, file-drop,
  blank-topology endpoints (raw tc/docker stderr, DB/Docker errors logged
  server-side, generic detail returned). pyproject license corrected to AGPL-3.0.

Correctness (BUG-12..16):
- BUG-12 atomic credential upsert (UNIQUE constraint + IntegrityError retry,
  consistent principal_key canonicalization).
- BUG-13 rule-tail watermark uses >= with seen-id dedup (no same-second drop).
- BUG-14 worker wake cleared before wait (no lost wake during tick).
- BUG-15 intel gather tolerates an unexpected provider raise.
- BUG-16 see above.

Already-closed (verified, no change): V2.1.6, V5.1.3, V9.1.2. Accept-risk +
documented: V2.1.8 cache window, V3.1.3 idle timeout. Tests added for every fix;
unanimous adversarial review after two refute-fix rounds.
2026-06-10 13:27:14 -04:00

298 lines
10 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
"""End-to-end tests for the clusterer worker shell.
The skeleton clusterer is a no-op; these tests cover the shell:
* exits cleanly on shutdown signal (and via cancel)
* invokes ``tick`` on each loop iteration
* publishes :class:`ClusterResult` side-effects on the right topics
* a clusterer raising from ``tick`` is logged and does not crash the loop
"""
from __future__ import annotations
import asyncio
import pytest
from decnet.bus import topics as _topics
from decnet.clustering.base import Clusterer, ClusterResult
from decnet.clustering.impl.connected_components import ConnectedComponentsClusterer
from decnet.clustering.worker import run_clusterer_loop
from decnet.web.db.factory import get_repository
@pytest.fixture
async def repo(tmp_path):
r = get_repository(db_path=str(tmp_path / "clusterer.db"))
await r.initialize()
return r
@pytest.fixture(autouse=True)
def _no_bus(monkeypatch):
"""Run workers in poll-only mode — no real Unix socket."""
monkeypatch.setenv("DECNET_BUS_ENABLED", "false")
class _FakeClusterer(Clusterer):
"""Test double: returns canned :class:`ClusterResult` per call."""
name = "fake"
def __init__(self, results: list[ClusterResult] | None = None) -> None:
self._results = list(results or [])
self.calls = 0
async def tick(self, repo) -> ClusterResult:
self.calls += 1
if self._results:
return self._results.pop(0)
return ClusterResult()
class _RaisingClusterer(Clusterer):
name = "raising"
def __init__(self) -> None:
self.calls = 0
async def tick(self, repo) -> ClusterResult:
self.calls += 1
raise RuntimeError("boom")
@pytest.mark.anyio
async def test_loop_exits_on_shutdown_signal(repo):
shutdown = asyncio.Event()
clusterer = _FakeClusterer()
task = asyncio.create_task(
run_clusterer_loop(
repo,
poll_interval_secs=0.05,
clusterer=clusterer,
shutdown=shutdown,
)
)
await asyncio.sleep(0.12)
shutdown.set()
await asyncio.wait_for(task, timeout=2.0)
assert clusterer.calls >= 1
@pytest.mark.anyio
async def test_loop_exits_on_cancel(repo):
clusterer = _FakeClusterer()
task = asyncio.create_task(
run_clusterer_loop(
repo,
poll_interval_secs=0.05,
clusterer=clusterer,
)
)
await asyncio.sleep(0.1)
task.cancel()
# The loop catches CancelledError and exits cleanly, mirroring the
# intel + reuse worker shells.
await asyncio.wait_for(task, timeout=2.0)
assert clusterer.calls >= 1
@pytest.mark.anyio
async def test_tick_failure_does_not_crash_loop(repo):
"""A clusterer raising from tick must be logged, not propagated."""
shutdown = asyncio.Event()
clusterer = _RaisingClusterer()
task = asyncio.create_task(
run_clusterer_loop(
repo,
poll_interval_secs=0.05,
clusterer=clusterer,
shutdown=shutdown,
)
)
await asyncio.sleep(0.2)
shutdown.set()
await asyncio.wait_for(task, timeout=2.0)
# Loop kept ticking despite the raise.
assert clusterer.calls >= 2
@pytest.mark.anyio
async def test_skeleton_clusterer_returns_empty_result(repo):
"""The connected-components skeleton produces no side-effects yet."""
c = ConnectedComponentsClusterer()
result = await c.tick(repo)
assert result.identities_formed == []
assert result.observations_linked == []
assert result.identities_merged == []
assert result.identities_unmerged == []
@pytest.mark.anyio
async def test_publishes_cluster_result_on_bus(monkeypatch, repo):
"""Every entry in ClusterResult fans out to the correct topic."""
published: list[tuple[str, dict, str]] = []
async def _fake_publish(bus, topic, payload, event_type=""):
published.append((topic, payload, event_type))
monkeypatch.setattr(
"decnet.clustering.worker.publish_safely", _fake_publish,
)
result = ClusterResult(
identities_formed=[
{"identity_uuid": "id-1", "observation_uuids": ["obs-1", "obs-2"]},
],
observations_linked=[
{"identity_uuid": "id-1", "observation_uuid": "obs-3"},
],
identities_merged=[
{"winner_uuid": "id-1", "loser_uuid": "id-2"},
],
identities_unmerged=[
{"resurrected_uuid": "id-2", "former_winner_uuid": "id-1"},
],
)
clusterer = _FakeClusterer(results=[result])
shutdown = asyncio.Event()
task = asyncio.create_task(
run_clusterer_loop(
repo,
poll_interval_secs=0.05,
clusterer=clusterer,
shutdown=shutdown,
)
)
await asyncio.sleep(0.1)
shutdown.set()
await asyncio.wait_for(task, timeout=2.0)
topics_seen = {t for t, _, _ in published}
assert _topics.identity(_topics.IDENTITY_FORMED) in topics_seen
assert _topics.identity(_topics.IDENTITY_OBSERVATION_LINKED) in topics_seen
assert _topics.identity(_topics.IDENTITY_MERGED) in topics_seen
assert _topics.identity(_topics.IDENTITY_UNMERGED) in topics_seen
@pytest.mark.anyio
async def test_clusterer_registered_in_cli():
"""`decnet clusterer` is registered as a master-only command."""
from decnet.cli.gating import MASTER_ONLY_COMMANDS
assert "clusterer" in MASTER_ONLY_COMMANDS
@pytest.mark.anyio
async def test_wake_during_tick_is_not_lost(repo):
"""BUG-14 regression: wake.clear() must run BEFORE wake.wait(), not after.
The worker loop pattern:
Fixed (after fix): tick → clear → wait ← current code
Buggy (before fix): tick → wait → clear
The race in the buggy pattern: a wake.set() could arrive from a _wake_on
background task between wait() returning and clear() executing. In asyncio
the task switch requires an ``await``; the original code had
``await _publish_result`` between wait() and clear(), providing a real
window. The fix closes this by moving clear() to run immediately after
tick (before wait), so there is no window between wait() returning and the
next clear().
**Structural test (red-before / green-after, deterministic):**
We intercept the internal ``wake`` event's ``clear()`` and ``wait()``
methods to record their invocation order, then assert that every
``clear`` call is immediately followed by a ``wait`` (never preceded by
one). Reverting the worker to the buggy ``wait → clear`` order produces
``("wait", "clear")`` consecutive pairs, which the assertion catches
deterministically without relying on wall-clock timing races.
"""
import unittest.mock as _mock
captured_wake: list[asyncio.Event] = []
call_log: list[str] = []
_orig_event_cls = asyncio.Event
class _LoggingEvent(_orig_event_cls): # type: ignore[misc]
"""Subclass captures the first event created (the wake event) and
logs clear()/wait() calls so we can verify their relative order."""
def __init__(self) -> None:
super().__init__()
if not captured_wake:
captured_wake.append(self)
def clear(self) -> None:
if captured_wake and self is captured_wake[0]:
call_log.append("clear")
super().clear()
async def wait(self) -> bool: # type: ignore[override]
if captured_wake and self is captured_wake[0]:
call_log.append("wait")
return await super().wait()
class _SimpleTicker(Clusterer):
name = "bug14_ticker"
async def tick(self, _repo) -> ClusterResult:
return ClusterResult()
shutdown = asyncio.Event()
with _mock.patch("decnet.clustering.worker.asyncio.Event", _LoggingEvent):
task = asyncio.create_task(
run_clusterer_loop(
repo,
poll_interval_secs=0.1,
clusterer=_SimpleTicker(),
shutdown=shutdown,
)
)
# Run for long enough to accumulate several clear/wait pairs.
await asyncio.sleep(0.5)
shutdown.set()
await asyncio.wait_for(task, timeout=2.0)
# Must have enough events to be meaningful.
assert len(call_log) >= 4, (
f"BUG-14: too few clear/wait calls recorded ({call_log!r}); "
"loop may not have run or event capture failed"
)
# Fixed invariant: every loop iteration runs clear() BEFORE wait().
# The resulting call_log for the fixed code is: clear, wait, clear, wait, ...
# For the buggy code (wait → clear) the log would be: wait, clear, wait, clear, ...
#
# We verify TWO conditions that together guarantee the fixed order:
#
# 1. The log starts with "clear" — the very first thing after tick is a clear.
# Buggy code starts with "wait" (wait ran first in the original loop).
#
# 2. Within each (clear, wait) pair at positions (2k, 2k+1), clear always
# precedes wait. We check that no "clear" appears at an ODD index (which
# would mean a clear followed another clear, or a wait appeared before
# the next clear).
assert call_log[0] == "clear", (
f"BUG-14 regression: first wake call was {call_log[0]!r}, expected 'clear'. "
f"Full log: {call_log!r}. The worker is using the buggy 'wait-then-clear' "
"order — wake.clear() must execute BEFORE wake.wait() each iteration."
)
# Verify the alternating pattern holds: indices 0,2,4,... should be "clear"
# and indices 1,3,5,... should be "wait".
for idx, call in enumerate(call_log):
if idx % 2 == 0:
assert call == "clear", (
f"BUG-14 regression: expected 'clear' at position {idx} but got "
f"{call!r} in log {call_log!r}. This indicates the loop is NOT "
"using the fixed 'clear → wait' order within each iteration."
)
else:
assert call == "wait", (
f"BUG-14 regression: expected 'wait' at position {idx} but got "
f"{call!r} in log {call_log!r}. This indicates the loop is NOT "
"using the fixed 'clear → wait' order within each iteration."
)