From b5d7bf818f425237946c3e40017c60c8b5c12c94 Mon Sep 17 00:00:00 2001 From: anti Date: Fri, 17 Apr 2026 17:48:42 -0400 Subject: [PATCH] feat(health): 3-tier status (healthy / degraded / unhealthy) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only database, docker, and ingestion_worker now count as critical (→ 503 unhealthy). Failures of any other component (attacker/sniffer/collector workers) drop the overall status to degraded (still HTTP 200) so the dashboard doesn't panic when a non-essential worker isn't running. --- decnet/web/router/health/api_get_health.py | 21 ++++++++------ tests/api/health/test_get_health.py | 33 +++++++++++++++++++++- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/decnet/web/router/health/api_get_health.py b/decnet/web/router/health/api_get_health.py index ad39d76..056519f 100644 --- a/decnet/web/router/health/api_get_health.py +++ b/decnet/web/router/health/api_get_health.py @@ -11,7 +11,7 @@ from decnet.web.db.models import HealthResponse, ComponentHealth router = APIRouter() -_OPTIONAL_SERVICES = {"sniffer_worker"} +_CRITICAL_SERVICES = {"database", "docker", "ingestion_worker"} # Cache Docker client and health result to avoid hammering the Docker socket _docker_client: Optional[Any] = None @@ -122,21 +122,26 @@ async def get_health(user: dict = Depends(require_viewer)) -> Any: else: components["docker"] = ComponentHealth(status="failing", detail=_docker_detail) - # Compute overall status - required_failing = any( + # Overall status tiers: + # healthy — every component ok + # degraded — only non-critical components failing (service usable, + # falls back to cache or skips non-essential work) + # unhealthy — a critical component (db, docker, ingestion) failing; + # survival depends on caches + critical_failing = any( c.status == "failing" for name, c in components.items() - if name not in _OPTIONAL_SERVICES + if name in _CRITICAL_SERVICES ) - optional_failing = any( + noncritical_failing = any( c.status == "failing" 
for name, c in components.items() - if name in _OPTIONAL_SERVICES + if name not in _CRITICAL_SERVICES ) - if required_failing: + if critical_failing: overall = "unhealthy" - elif optional_failing: + elif noncritical_failing: overall = "degraded" else: overall = "healthy" diff --git a/tests/api/health/test_get_health.py b/tests/api/health/test_get_health.py index 75f8a65..4736417 100644 --- a/tests/api/health/test_get_health.py +++ b/tests/api/health/test_get_health.py @@ -92,7 +92,7 @@ async def test_health_unhealthy_returns_503(client: httpx.AsyncClient, auth_toke with patch("decnet.web.api.get_background_tasks") as mock_tasks, \ patch("docker.from_env") as mock_docker: tasks = _make_running_tasks() - tasks["ingestion_worker"] = None # required worker down + tasks["ingestion_worker"] = None # critical worker down mock_tasks.return_value = tasks mock_docker.return_value = MagicMock() @@ -102,6 +102,37 @@ async def test_health_unhealthy_returns_503(client: httpx.AsyncClient, auth_toke assert resp.json()["status"] == "unhealthy" +@pytest.mark.anyio +async def test_health_degraded_when_attacker_down(client: httpx.AsyncClient, auth_token: str) -> None: + with patch("decnet.web.api.get_background_tasks") as mock_tasks, \ + patch("docker.from_env") as mock_docker: + tasks = _make_running_tasks() + tasks["attacker_worker"] = None # non-critical + mock_tasks.return_value = tasks + mock_docker.return_value = MagicMock() + + resp = await client.get("/api/v1/health", headers={"Authorization": f"Bearer {auth_token}"}) + + assert resp.status_code == 200 + assert resp.json()["status"] == "degraded" + assert resp.json()["components"]["attacker_worker"]["status"] == "failing" + + +@pytest.mark.anyio +async def test_health_degraded_when_collector_down(client: httpx.AsyncClient, auth_token: str) -> None: + with patch("decnet.web.api.get_background_tasks") as mock_tasks, \ + patch("docker.from_env") as mock_docker: + tasks = _make_running_tasks() + tasks["collector_worker"] = None # 
non-critical + mock_tasks.return_value = tasks + mock_docker.return_value = MagicMock() + + resp = await client.get("/api/v1/health", headers={"Authorization": f"Bearer {auth_token}"}) + + assert resp.status_code == 200 + assert resp.json()["status"] == "degraded" + + @pytest.mark.anyio async def test_health_docker_failing(client: httpx.AsyncClient, auth_token: str) -> None: with patch("decnet.web.api.get_background_tasks") as mock_tasks, \