From f1e14280c06548bf5c9b9b335a8119964f3896a9 Mon Sep 17 00:00:00 2001 From: anti Date: Fri, 17 Apr 2026 15:05:18 -0400 Subject: [PATCH] perf: 1s TTL cache for /health DB probe and /config state reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Locust hit /health and /config on every @task(3), so each request was firing repo.get_total_logs() and two repo.get_state() calls against aiosqlite — filling the driver queue for data that changes on the order of seconds, not milliseconds. Both caches follow the shape already used by the existing Docker cache: - asyncio.Lock with double-checked TTL so concurrent callers collapse into one DB hit per 1s window. - _reset_* helpers called from tests/api/conftest.py::setup_db so the module-level cache can't leak across tests. tests/test_health_config_cache.py asserts 50 concurrent callers produce exactly 1 repo call, and the cache expires after TTL. --- decnet/web/router/config/api_get_config.py | 35 ++++++++++- decnet/web/router/health/api_get_health.py | 41 +++++++++++-- tests/api/conftest.py | 6 ++ tests/test_health_config_cache.py | 67 ++++++++++++++++++++++ 4 files changed, 141 insertions(+), 8 deletions(-) create mode 100644 tests/test_health_config_cache.py diff --git a/decnet/web/router/config/api_get_config.py b/decnet/web/router/config/api_get_config.py index a0d5369..e47cceb 100644 --- a/decnet/web/router/config/api_get_config.py +++ b/decnet/web/router/config/api_get_config.py @@ -1,3 +1,7 @@ +import asyncio +import time +from typing import Any, Optional + from fastapi import APIRouter, Depends from decnet.env import DECNET_DEVELOPER @@ -10,6 +14,33 @@ router = APIRouter() _DEFAULT_DEPLOYMENT_LIMIT = 10 _DEFAULT_MUTATION_INTERVAL = "30m" +# Cache config_limits / config_globals reads — these change on rare admin +# writes but get polled constantly by the UI and locust. +_STATE_TTL = 1.0 +_state_cache: dict[str, tuple[Optional[dict[str, Any]], float]] = {} +_state_locks: dict[str, asyncio.Lock] = {} + + +def _reset_state_cache() -> None: + """Reset cached config state — used by tests.""" + _state_cache.clear() + + +async def _get_state_cached(name: str) -> Optional[dict[str, Any]]: + entry = _state_cache.get(name) + now = time.monotonic() + if entry is not None and now - entry[1] < _STATE_TTL: + return entry[0] + lock = _state_locks.setdefault(name, asyncio.Lock()) + async with lock: + entry = _state_cache.get(name) + now = time.monotonic() + if entry is not None and now - entry[1] < _STATE_TTL: + return entry[0] + value = await repo.get_state(name) + _state_cache[name] = (value, time.monotonic()) + return value + @router.get( "/config", @@ -21,8 +52,8 @@ _DEFAULT_MUTATION_INTERVAL = "30m" ) @_traced("api.get_config") async def api_get_config(user: dict = Depends(require_viewer)) -> dict: - limits_state = await repo.get_state("config_limits") - globals_state = await repo.get_state("config_globals") + limits_state = await _get_state_cached("config_limits") + globals_state = await _get_state_cached("config_globals") deployment_limit = ( limits_state.get("deployment_limit", _DEFAULT_DEPLOYMENT_LIMIT) diff --git a/decnet/web/router/health/api_get_health.py b/decnet/web/router/health/api_get_health.py index 88f603b..b741754 100644 --- a/decnet/web/router/health/api_get_health.py +++ b/decnet/web/router/health/api_get_health.py @@ -1,3 +1,4 @@ +import asyncio import time from typing import Any, Optional @@ -19,6 +20,13 @@ _docker_detail: str = "" _docker_last_check: float = 0.0 _DOCKER_CHECK_INTERVAL = 5.0 # seconds between actual Docker pings +# Cache DB liveness result — under load, every request was hitting +# repo.get_total_logs() and filling the aiosqlite queue. +_db_component: Optional[ComponentHealth] = None +_db_last_check: float = 0.0 +_db_lock = asyncio.Lock() +_DB_CHECK_INTERVAL = 1.0 # seconds + def _reset_docker_cache() -> None: """Reset cached Docker state — used by tests.""" @@ -29,6 +37,31 @@ def _reset_docker_cache() -> None: _docker_last_check = 0.0 +def _reset_db_cache() -> None: + """Reset cached DB liveness — used by tests.""" + global _db_component, _db_last_check + _db_component = None + _db_last_check = 0.0 + + +async def _check_database_cached() -> ComponentHealth: + global _db_component, _db_last_check + now = time.monotonic() + if _db_component is not None and now - _db_last_check < _DB_CHECK_INTERVAL: + return _db_component + async with _db_lock: + now = time.monotonic() + if _db_component is not None and now - _db_last_check < _DB_CHECK_INTERVAL: + return _db_component + try: + await repo.get_total_logs() + _db_component = ComponentHealth(status="ok") + except Exception as exc: + _db_component = ComponentHealth(status="failing", detail=str(exc)) + _db_last_check = time.monotonic() + return _db_component + + @router.get( "/health", response_model=HealthResponse, @@ -43,12 +76,8 @@ def _reset_docker_cache() -> None: async def get_health(user: dict = Depends(require_viewer)) -> Any: components: dict[str, ComponentHealth] = {} - # 1. Database - try: - await repo.get_total_logs() - components["database"] = ComponentHealth(status="ok") - except Exception as exc: - components["database"] = ComponentHealth(status="failing", detail=str(exc)) + # 1. Database (cached — avoids a DB round-trip per request) + components["database"] = await _check_database_cached() # 2. Background workers from decnet.web.api import get_background_tasks diff --git a/tests/api/conftest.py b/tests/api/conftest.py index 32aff91..7727f02 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -53,6 +53,12 @@ async def setup_db(monkeypatch) -> AsyncGenerator[None, None]: monkeypatch.setattr(repo, "engine", engine) monkeypatch.setattr(repo, "session_factory", session_factory) + # Reset per-request TTL caches so they don't leak across tests + from decnet.web.router.health import api_get_health as _h + from decnet.web.router.config import api_get_config as _c + _h._reset_db_cache() + _c._reset_state_cache() + # Create schema async with engine.begin() as conn: await conn.run_sync(SQLModel.metadata.create_all) diff --git a/tests/test_health_config_cache.py b/tests/test_health_config_cache.py new file mode 100644 index 0000000..a3d7b61 --- /dev/null +++ b/tests/test_health_config_cache.py @@ -0,0 +1,67 @@ +""" +TTL-cache contract: under concurrent load, N callers collapse to 1 repo hit +per TTL window. Tests use fake repo objects — no real DB. +""" +import asyncio +from unittest.mock import patch + +import pytest + +from decnet.web.router.health import api_get_health +from decnet.web.router.config import api_get_config + + +class _FakeRepo: + def __init__(self): + self.total_logs_calls = 0 + self.state_calls = 0 + + async def get_total_logs(self): + self.total_logs_calls += 1 + return 0 + + async def get_state(self, name: str): + self.state_calls += 1 + return {"name": name} + + +@pytest.mark.asyncio +async def test_db_cache_collapses_concurrent_calls(): + api_get_health._reset_db_cache() + fake = _FakeRepo() + with patch.object(api_get_health, "repo", fake): + results = await asyncio.gather(*[api_get_health._check_database_cached() for _ in range(50)]) + assert all(r.status == "ok" for r in results) + assert fake.total_logs_calls == 1 + + +@pytest.mark.asyncio +async def test_db_cache_expires_after_ttl(monkeypatch): + api_get_health._reset_db_cache() + monkeypatch.setattr(api_get_health, "_DB_CHECK_INTERVAL", 0.05) + fake = _FakeRepo() + with patch.object(api_get_health, "repo", fake): + await api_get_health._check_database_cached() + await asyncio.sleep(0.1) + await api_get_health._check_database_cached() + assert fake.total_logs_calls == 2 + + +@pytest.mark.asyncio +async def test_config_state_cache_collapses_concurrent_calls(): + api_get_config._reset_state_cache() + fake = _FakeRepo() + with patch.object(api_get_config, "repo", fake): + results = await asyncio.gather(*[api_get_config._get_state_cached("config_limits") for _ in range(30)]) + assert all(r == {"name": "config_limits"} for r in results) + assert fake.state_calls == 1 + + +@pytest.mark.asyncio +async def test_config_state_cache_per_key(): + api_get_config._reset_state_cache() + fake = _FakeRepo() + with patch.object(api_get_config, "repo", fake): + await api_get_config._get_state_cached("config_limits") + await api_get_config._get_state_cached("config_globals") + assert fake.state_calls == 2