perf: 1s TTL cache for /health DB probe and /config state reads

Locust hit /health and /config on every @task(3), so each request was
firing repo.get_total_logs() and two repo.get_state() calls against
aiosqlite — filling the driver queue for data that changes on the order
of seconds, not milliseconds.

Both caches follow the shape already used by the existing Docker cache:
- asyncio.Lock with double-checked TTL so concurrent callers collapse
  into one DB hit per 1s window.
- _reset_* helpers called from tests/api/conftest.py::setup_db so the
  module-level cache can't leak across tests.

tests/test_health_config_cache.py asserts that concurrent callers (50 for
the DB probe, 30 for config state) collapse to exactly 1 repo call, and
that each cache expires after its TTL.
This commit is contained in:
2026-04-17 15:05:18 -04:00
parent 931f33fb06
commit f1e14280c0
4 changed files with 141 additions and 8 deletions

View File

@@ -1,3 +1,7 @@
import asyncio
import time
from typing import Any, Optional
from fastapi import APIRouter, Depends
from decnet.env import DECNET_DEVELOPER
@@ -10,6 +14,33 @@ router = APIRouter()
_DEFAULT_DEPLOYMENT_LIMIT = 10
_DEFAULT_MUTATION_INTERVAL = "30m"
# Cache config_limits / config_globals reads — these change on rare admin
# writes but get polled constantly by the UI and locust.
_STATE_TTL = 1.0  # seconds a cached state read stays fresh
_state_cache: dict[str, tuple[Optional[dict[str, Any]], float]] = {}
_state_locks: dict[str, asyncio.Lock] = {}


def _reset_state_cache() -> None:
    """Reset cached config state — used by tests.

    Clears the per-key locks as well as the cache: an asyncio.Lock can be
    bound to the event loop it was first awaited on (Python < 3.10), so a
    lock surviving from a previous test's loop would raise RuntimeError
    when awaited under the fresh loop pytest-asyncio creates per test.
    """
    _state_cache.clear()
    _state_locks.clear()


async def _get_state_cached(name: str) -> Optional[dict[str, Any]]:
    """Return ``repo.get_state(name)``, cached for ``_STATE_TTL`` seconds.

    Double-checked TTL under a per-key asyncio.Lock: concurrent callers in
    the same window collapse into a single ``repo.get_state`` round-trip.
    Note a ``None`` result is cached too (the ``entry is not None`` guard
    tests for a cache entry, not for the cached value).
    """
    entry = _state_cache.get(name)
    if entry is not None and time.monotonic() - entry[1] < _STATE_TTL:
        return entry[0]
    lock = _state_locks.setdefault(name, asyncio.Lock())
    async with lock:
        # Re-check: another caller may have refreshed while we waited.
        entry = _state_cache.get(name)
        if entry is not None and time.monotonic() - entry[1] < _STATE_TTL:
            return entry[0]
        value = await repo.get_state(name)
        _state_cache[name] = (value, time.monotonic())
        return value
@router.get(
"/config",
@@ -21,8 +52,8 @@ _DEFAULT_MUTATION_INTERVAL = "30m"
)
@_traced("api.get_config")
async def api_get_config(user: dict = Depends(require_viewer)) -> dict:
limits_state = await repo.get_state("config_limits")
globals_state = await repo.get_state("config_globals")
limits_state = await _get_state_cached("config_limits")
globals_state = await _get_state_cached("config_globals")
deployment_limit = (
limits_state.get("deployment_limit", _DEFAULT_DEPLOYMENT_LIMIT)

View File

@@ -1,3 +1,4 @@
import asyncio
import time
from typing import Any, Optional
@@ -19,6 +20,13 @@ _docker_detail: str = ""
_docker_last_check: float = 0.0
_DOCKER_CHECK_INTERVAL = 5.0 # seconds between actual Docker pings
# Cache DB liveness result — under load, every request was hitting
# repo.get_total_logs() and filling the aiosqlite queue.
# Module-level cache state: the last probe result, when it was taken, and a
# lock so concurrent refreshers collapse into one DB round-trip (the helper
# below re-checks freshness after acquiring the lock).
_db_component: Optional[ComponentHealth] = None  # None until the first probe runs
_db_last_check: float = 0.0  # time.monotonic() timestamp of the last real probe
_db_lock = asyncio.Lock()  # serializes cache refresh
_DB_CHECK_INTERVAL = 1.0  # seconds a probe result stays fresh
def _reset_docker_cache() -> None:
"""Reset cached Docker state — used by tests."""
@@ -29,6 +37,31 @@ def _reset_docker_cache() -> None:
_docker_last_check = 0.0
def _reset_db_cache() -> None:
    """Drop the cached DB liveness probe so the next caller re-checks.

    Called from the test fixtures to keep module-level cache state from
    leaking across tests.
    """
    global _db_component, _db_last_check
    _db_last_check = 0.0
    _db_component = None
async def _check_database_cached() -> ComponentHealth:
    """Return DB liveness, probing at most once per _DB_CHECK_INTERVAL.

    Double-checked TTL: a lock-free fast path serves the cached result,
    and the refresh happens under _db_lock so concurrent expired callers
    collapse into a single repo.get_total_logs() round-trip. Note a
    failing probe is cached for the full interval too, so /health can
    report stale "failing" for up to _DB_CHECK_INTERVAL after recovery.
    """
    global _db_component, _db_last_check
    # Fast path: no lock needed — asyncio tasks aren't preempted between
    # these reads, so the (component, timestamp) pair is seen consistently.
    now = time.monotonic()
    if _db_component is not None and now - _db_last_check < _DB_CHECK_INTERVAL:
        return _db_component
    async with _db_lock:
        # Re-check: another task may have refreshed while we awaited the lock.
        now = time.monotonic()
        if _db_component is not None and now - _db_last_check < _DB_CHECK_INTERVAL:
            return _db_component
        try:
            # Cheapest available query that proves the driver queue is alive.
            await repo.get_total_logs()
            _db_component = ComponentHealth(status="ok")
        except Exception as exc:
            # Broad catch is deliberate: any driver/connection error means
            # "failing" for health-reporting purposes.
            _db_component = ComponentHealth(status="failing", detail=str(exc))
        _db_last_check = time.monotonic()
        return _db_component
@router.get(
"/health",
response_model=HealthResponse,
@@ -43,12 +76,8 @@ def _reset_docker_cache() -> None:
async def get_health(user: dict = Depends(require_viewer)) -> Any:
components: dict[str, ComponentHealth] = {}
# 1. Database
try:
await repo.get_total_logs()
components["database"] = ComponentHealth(status="ok")
except Exception as exc:
components["database"] = ComponentHealth(status="failing", detail=str(exc))
# 1. Database (cached — avoids a DB round-trip per request)
components["database"] = await _check_database_cached()
# 2. Background workers
from decnet.web.api import get_background_tasks

View File

@@ -53,6 +53,12 @@ async def setup_db(monkeypatch) -> AsyncGenerator[None, None]:
monkeypatch.setattr(repo, "engine", engine)
monkeypatch.setattr(repo, "session_factory", session_factory)
# Reset per-request TTL caches so they don't leak across tests
from decnet.web.router.health import api_get_health as _h
from decnet.web.router.config import api_get_config as _c
_h._reset_db_cache()
_c._reset_state_cache()
# Create schema
async with engine.begin() as conn:
await conn.run_sync(SQLModel.metadata.create_all)

View File

@@ -0,0 +1,67 @@
"""
TTL-cache contract: under concurrent load, N callers collapse to 1 repo hit
per TTL window. Tests use fake repo objects — no real DB.
"""
import asyncio
from unittest.mock import patch
import pytest
from decnet.web.router.health import api_get_health
from decnet.web.router.config import api_get_config
class _FakeRepo:
    """Stub standing in for the repo module.

    Counts how many calls actually reach the "database" so tests can
    assert that the TTL caches collapse N callers into a single hit.
    """

    def __init__(self) -> None:
        # Public counters inspected directly by the tests.
        self.total_logs_calls = 0
        self.state_calls = 0

    async def get_total_logs(self) -> int:
        """Record the hit and pretend the log table is empty."""
        self.total_logs_calls += 1
        return 0

    async def get_state(self, name: str) -> dict:
        """Record the hit and echo the requested key back."""
        self.state_calls += 1
        return {"name": name}
@pytest.mark.asyncio
async def test_db_cache_collapses_concurrent_calls():
    """50 concurrent health probes translate into exactly one DB hit."""
    api_get_health._reset_db_cache()
    fake = _FakeRepo()
    with patch.object(api_get_health, "repo", fake):
        probes = [api_get_health._check_database_cached() for _ in range(50)]
        results = await asyncio.gather(*probes)
    assert all(r.status == "ok" for r in results)
    assert fake.total_logs_calls == 1
@pytest.mark.asyncio
async def test_db_cache_expires_after_ttl(monkeypatch):
    """After the TTL window passes, the next probe hits the repo again."""
    monkeypatch.setattr(api_get_health, "_DB_CHECK_INTERVAL", 0.05)
    api_get_health._reset_db_cache()
    fake = _FakeRepo()
    with patch.object(api_get_health, "repo", fake):
        await api_get_health._check_database_cached()  # fills the cache
        await asyncio.sleep(0.1)  # let the shortened TTL lapse
        await api_get_health._check_database_cached()  # must re-probe
    assert fake.total_logs_calls == 2
@pytest.mark.asyncio
async def test_config_state_cache_collapses_concurrent_calls():
    """30 simultaneous reads of one state key reach the repo exactly once."""
    api_get_config._reset_state_cache()
    fake = _FakeRepo()
    with patch.object(api_get_config, "repo", fake):
        reads = (api_get_config._get_state_cached("config_limits") for _ in range(30))
        results = await asyncio.gather(*reads)
    expected = {"name": "config_limits"}
    assert all(r == expected for r in results)
    assert fake.state_calls == 1
@pytest.mark.asyncio
async def test_config_state_cache_per_key():
    """Distinct state keys are cached independently — one repo hit each."""
    api_get_config._reset_state_cache()
    fake = _FakeRepo()
    with patch.object(api_get_config, "repo", fake):
        for key in ("config_limits", "config_globals"):
            await api_get_config._get_state_cached(key)
    assert fake.state_calls == 2