perf: 1s TTL cache for /health DB probe and /config state reads
Locust hit /health and /config on every @task(3), so each request was firing repo.get_total_logs() and two repo.get_state() calls against aiosqlite — filling the driver queue for data that changes on the order of seconds, not milliseconds. Both caches follow the shape already used by the existing Docker cache: - asyncio.Lock with double-checked TTL so concurrent callers collapse into one DB hit per 1s window. - _reset_* helpers called from tests/api/conftest.py::setup_db so the module-level cache can't leak across tests. tests/test_health_config_cache.py asserts 50 concurrent callers produce exactly 1 repo call, and the cache expires after TTL.
This commit is contained in:
@@ -1,3 +1,7 @@
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.env import DECNET_DEVELOPER
|
||||
@@ -10,6 +14,33 @@ router = APIRouter()
|
||||
_DEFAULT_DEPLOYMENT_LIMIT = 10
|
||||
_DEFAULT_MUTATION_INTERVAL = "30m"
|
||||
|
||||
# Cache config_limits / config_globals reads — these change on rare admin
|
||||
# writes but get polled constantly by the UI and locust.
|
||||
_STATE_TTL = 1.0
|
||||
_state_cache: dict[str, tuple[Optional[dict[str, Any]], float]] = {}
|
||||
_state_locks: dict[str, asyncio.Lock] = {}
|
||||
|
||||
|
||||
def _reset_state_cache() -> None:
    """Reset cached config state — used by tests.

    Clears the per-name locks as well as the value cache: an
    asyncio.Lock binds to the event loop it is first awaited on, so a
    lock left over from a previous test's loop would raise
    "bound to a different event loop" when awaited under a new loop.
    """
    _state_cache.clear()
    _state_locks.clear()
|
||||
|
||||
|
||||
async def _get_state_cached(name: str) -> Optional[dict[str, Any]]:
    """Return ``repo.get_state(name)``, memoized for ``_STATE_TTL`` seconds.

    Concurrent callers for the same ``name`` serialize on a per-name
    asyncio.Lock and the TTL test is repeated after the lock is acquired
    (double-checked), so at most one DB read is issued per TTL window.
    """
    cached = _state_cache.get(name)
    if cached is not None and time.monotonic() - cached[1] < _STATE_TTL:
        return cached[0]

    # setdefault is atomic enough here: no await between lookup and insert.
    per_name_lock = _state_locks.setdefault(name, asyncio.Lock())
    async with per_name_lock:
        # Another coroutine may have refreshed the entry while we waited.
        cached = _state_cache.get(name)
        if cached is not None and time.monotonic() - cached[1] < _STATE_TTL:
            return cached[0]

        fresh = await repo.get_state(name)
        _state_cache[name] = (fresh, time.monotonic())
        return fresh
|
||||
|
||||
|
||||
@router.get(
|
||||
"/config",
|
||||
@@ -21,8 +52,8 @@ _DEFAULT_MUTATION_INTERVAL = "30m"
|
||||
)
|
||||
@_traced("api.get_config")
|
||||
async def api_get_config(user: dict = Depends(require_viewer)) -> dict:
|
||||
limits_state = await repo.get_state("config_limits")
|
||||
globals_state = await repo.get_state("config_globals")
|
||||
limits_state = await _get_state_cached("config_limits")
|
||||
globals_state = await _get_state_cached("config_globals")
|
||||
|
||||
deployment_limit = (
|
||||
limits_state.get("deployment_limit", _DEFAULT_DEPLOYMENT_LIMIT)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
@@ -19,6 +20,13 @@ _docker_detail: str = ""
|
||||
_docker_last_check: float = 0.0
|
||||
_DOCKER_CHECK_INTERVAL = 5.0 # seconds between actual Docker pings
|
||||
|
||||
# Cache DB liveness result — under load, every request was hitting
|
||||
# repo.get_total_logs() and filling the aiosqlite queue.
|
||||
_db_component: Optional[ComponentHealth] = None
|
||||
_db_last_check: float = 0.0
|
||||
_db_lock = asyncio.Lock()
|
||||
_DB_CHECK_INTERVAL = 1.0 # seconds
|
||||
|
||||
|
||||
def _reset_docker_cache() -> None:
|
||||
"""Reset cached Docker state — used by tests."""
|
||||
@@ -29,6 +37,31 @@ def _reset_docker_cache() -> None:
|
||||
_docker_last_check = 0.0
|
||||
|
||||
|
||||
def _reset_db_cache() -> None:
|
||||
"""Reset cached DB liveness — used by tests."""
|
||||
global _db_component, _db_last_check
|
||||
_db_component = None
|
||||
_db_last_check = 0.0
|
||||
|
||||
|
||||
async def _check_database_cached() -> ComponentHealth:
    """Return DB liveness, probing at most once per ``_DB_CHECK_INTERVAL``.

    A cheap TTL check runs lock-free first; callers that miss serialize
    on ``_db_lock`` and re-check before issuing the probe, so a burst of
    concurrent requests costs a single ``repo.get_total_logs()`` call.
    """
    global _db_component, _db_last_check

    def _still_fresh() -> bool:
        return (
            _db_component is not None
            and time.monotonic() - _db_last_check < _DB_CHECK_INTERVAL
        )

    if _still_fresh():
        return _db_component
    async with _db_lock:
        if _still_fresh():
            return _db_component
        try:
            # Any successful query proves the driver/connection is alive.
            await repo.get_total_logs()
            _db_component = ComponentHealth(status="ok")
        except Exception as exc:  # health probe must report, never raise
            _db_component = ComponentHealth(status="failing", detail=str(exc))
        _db_last_check = time.monotonic()
        return _db_component
|
||||
|
||||
|
||||
@router.get(
|
||||
"/health",
|
||||
response_model=HealthResponse,
|
||||
@@ -43,12 +76,8 @@ def _reset_docker_cache() -> None:
|
||||
async def get_health(user: dict = Depends(require_viewer)) -> Any:
|
||||
components: dict[str, ComponentHealth] = {}
|
||||
|
||||
# 1. Database
|
||||
try:
|
||||
await repo.get_total_logs()
|
||||
components["database"] = ComponentHealth(status="ok")
|
||||
except Exception as exc:
|
||||
components["database"] = ComponentHealth(status="failing", detail=str(exc))
|
||||
# 1. Database (cached — avoids a DB round-trip per request)
|
||||
components["database"] = await _check_database_cached()
|
||||
|
||||
# 2. Background workers
|
||||
from decnet.web.api import get_background_tasks
|
||||
|
||||
Reference in New Issue
Block a user