feat(health): 3-tier status (healthy / degraded / unhealthy)
Only database, docker, and ingestion_worker now count as critical (→ 503 unhealthy). Attacker/sniffer/collector failures drop the overall status to degraded (still 200), so the dashboard doesn't panic when a non-essential worker isn't running.
This commit is contained in:
@@ -11,7 +11,7 @@ from decnet.web.db.models import HealthResponse, ComponentHealth
|
|||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
_OPTIONAL_SERVICES = {"sniffer_worker"}
|
_CRITICAL_SERVICES = {"database", "docker", "ingestion_worker"}
|
||||||
|
|
||||||
# Cache Docker client and health result to avoid hammering the Docker socket
|
# Cache Docker client and health result to avoid hammering the Docker socket
|
||||||
_docker_client: Optional[Any] = None
|
_docker_client: Optional[Any] = None
|
||||||
@@ -122,21 +122,26 @@ async def get_health(user: dict = Depends(require_viewer)) -> Any:
|
|||||||
else:
|
else:
|
||||||
components["docker"] = ComponentHealth(status="failing", detail=_docker_detail)
|
components["docker"] = ComponentHealth(status="failing", detail=_docker_detail)
|
||||||
|
|
||||||
# Compute overall status
|
# Overall status tiers:
|
||||||
required_failing = any(
|
# healthy — every component ok
|
||||||
|
# degraded — only non-critical components failing (service usable,
|
||||||
|
# falls back to cache or skips non-essential work)
|
||||||
|
# unhealthy — a critical component (db, docker, ingestion) failing;
|
||||||
|
# survival depends on caches
|
||||||
|
critical_failing = any(
|
||||||
c.status == "failing"
|
c.status == "failing"
|
||||||
for name, c in components.items()
|
for name, c in components.items()
|
||||||
if name not in _OPTIONAL_SERVICES
|
if name in _CRITICAL_SERVICES
|
||||||
)
|
)
|
||||||
optional_failing = any(
|
noncritical_failing = any(
|
||||||
c.status == "failing"
|
c.status == "failing"
|
||||||
for name, c in components.items()
|
for name, c in components.items()
|
||||||
if name in _OPTIONAL_SERVICES
|
if name not in _CRITICAL_SERVICES
|
||||||
)
|
)
|
||||||
|
|
||||||
if required_failing:
|
if critical_failing:
|
||||||
overall = "unhealthy"
|
overall = "unhealthy"
|
||||||
elif optional_failing:
|
elif noncritical_failing:
|
||||||
overall = "degraded"
|
overall = "degraded"
|
||||||
else:
|
else:
|
||||||
overall = "healthy"
|
overall = "healthy"
|
||||||
|
|||||||
@@ -92,7 +92,7 @@ async def test_health_unhealthy_returns_503(client: httpx.AsyncClient, auth_toke
|
|||||||
with patch("decnet.web.api.get_background_tasks") as mock_tasks, \
|
with patch("decnet.web.api.get_background_tasks") as mock_tasks, \
|
||||||
patch("docker.from_env") as mock_docker:
|
patch("docker.from_env") as mock_docker:
|
||||||
tasks = _make_running_tasks()
|
tasks = _make_running_tasks()
|
||||||
tasks["ingestion_worker"] = None # required worker down
|
tasks["ingestion_worker"] = None # critical worker down
|
||||||
mock_tasks.return_value = tasks
|
mock_tasks.return_value = tasks
|
||||||
mock_docker.return_value = MagicMock()
|
mock_docker.return_value = MagicMock()
|
||||||
|
|
||||||
@@ -102,6 +102,37 @@ async def test_health_unhealthy_returns_503(client: httpx.AsyncClient, auth_toke
|
|||||||
assert resp.json()["status"] == "unhealthy"
|
assert resp.json()["status"] == "unhealthy"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.anyio
|
||||||
|
async def test_health_degraded_when_attacker_down(client: httpx.AsyncClient, auth_token: str) -> None:
|
||||||
|
with patch("decnet.web.api.get_background_tasks") as mock_tasks, \
|
||||||
|
patch("docker.from_env") as mock_docker:
|
||||||
|
tasks = _make_running_tasks()
|
||||||
|
tasks["attacker_worker"] = None # non-critical
|
||||||
|
mock_tasks.return_value = tasks
|
||||||
|
mock_docker.return_value = MagicMock()
|
||||||
|
|
||||||
|
resp = await client.get("/api/v1/health", headers={"Authorization": f"Bearer {auth_token}"})
|
||||||
|
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.json()["status"] == "degraded"
|
||||||
|
assert resp.json()["components"]["attacker_worker"]["status"] == "failing"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.anyio
|
||||||
|
async def test_health_degraded_when_collector_down(client: httpx.AsyncClient, auth_token: str) -> None:
|
||||||
|
with patch("decnet.web.api.get_background_tasks") as mock_tasks, \
|
||||||
|
patch("docker.from_env") as mock_docker:
|
||||||
|
tasks = _make_running_tasks()
|
||||||
|
tasks["collector_worker"] = None # non-critical
|
||||||
|
mock_tasks.return_value = tasks
|
||||||
|
mock_docker.return_value = MagicMock()
|
||||||
|
|
||||||
|
resp = await client.get("/api/v1/health", headers={"Authorization": f"Bearer {auth_token}"})
|
||||||
|
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.json()["status"] == "degraded"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.anyio
|
@pytest.mark.anyio
|
||||||
async def test_health_docker_failing(client: httpx.AsyncClient, auth_token: str) -> None:
|
async def test_health_docker_failing(client: httpx.AsyncClient, auth_token: str) -> None:
|
||||||
with patch("decnet.web.api.get_background_tasks") as mock_tasks, \
|
with patch("decnet.web.api.get_background_tasks") as mock_tasks, \
|
||||||
|
|||||||
Reference in New Issue
Block a user