fix(stress): unblock Locust runs from login rate-limit self-DoS
Locust spawns N virtual users (default 1000), all from 127.0.0.1 and all logging in as admin. /auth/login is rate-limited to 10 requests per 5 minutes, per IP AND per username, so the 11th on_start() got a 429 and raised a RuntimeError. A @task(2) login in the task weights turned the whole run into a 429 factory even after ramp-up. And _login_with_retry treated 429 as non-retryable, so there was no graceful degradation path.

Three changes, one root cause:

- decnet/web/limiter.py: read DECNET_LIMITER_ENABLED (default true). When false, slowapi's Limiter(enabled=False) makes @limiter.limit a no-op. The default ships unchanged; nobody should ever release with this off.
- tests/stress/conftest.py: set DECNET_LIMITER_ENABLED=false in the uvicorn subprocess env. Stress tests measure throughput, not rate limiting.
- tests/stress/locustfile.py: drop the @task(2) login — it added zero coverage (every user already logs in at on_start) and only generated contention. Teach _login_with_retry to honour 429 + Retry-After so that a Locust run pointed at a limiter-enabled server degrades gracefully instead of crashing in on_start.
This commit is contained in:
@@ -20,6 +20,7 @@ we introduce a verified-proxy config.
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
from typing import Any, Awaitable, Callable
|
from typing import Any, Awaitable, Callable
|
||||||
|
|
||||||
from fastapi import Request
|
from fastapi import Request
|
||||||
@@ -27,6 +28,19 @@ from slowapi import Limiter
|
|||||||
from slowapi.util import get_remote_address
|
from slowapi.util import get_remote_address
|
||||||
|
|
||||||
|
|
||||||
|
def _limiter_enabled() -> bool:
    """Return whether the process-wide rate limiter should be active.

    Reads ``DECNET_LIMITER_ENABLED`` from the environment. The limiter is
    disabled only when the value — ignoring case and surrounding
    whitespace — is ``false``. An unset variable or any other value
    (including ``0``, ``no``, ``off``) leaves the limiter enabled, so a
    typo can never silently turn rate limiting off.

    Intended for stress / load testing, where a single Locust host
    represents thousands of virtual users but shares one source IP and
    one admin username — the real-world limits (10/5min per IP, per
    user) would otherwise cap every run at 10 successful logins. The
    default is ``true``; nobody should ever ship a release with this
    off.

    Returns:
        ``False`` only when ``DECNET_LIMITER_ENABLED`` is ``false``;
        ``True`` otherwise.
    """
    raw = os.environ.get("DECNET_LIMITER_ENABLED", "true")
    # Strip before comparing: values sourced from .env files or shell
    # exports often carry stray whitespace, and "false " silently leaving
    # the limiter on is hard to diagnose in a stress run.
    return raw.strip().lower() != "false"
|
||||||
|
|
||||||
|
|
||||||
# Single process-wide limiter. Importing modules pull this instance to
|
# Single process-wide limiter. Importing modules pull this instance to
|
||||||
# apply `@limiter.limit(...)` decorators on their routes. Default
|
# apply `@limiter.limit(...)` decorators on their routes. Default
|
||||||
# headers off: FastAPI response_model handlers return dicts, not
|
# headers off: FastAPI response_model handlers return dicts, not
|
||||||
@@ -36,6 +50,7 @@ from slowapi.util import get_remote_address
|
|||||||
limiter: Limiter = Limiter(
|
limiter: Limiter = Limiter(
|
||||||
key_func=get_remote_address,
|
key_func=get_remote_address,
|
||||||
storage_uri="memory://",
|
storage_uri="memory://",
|
||||||
|
enabled=_limiter_enabled(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -72,6 +72,12 @@ def stress_server():
|
|||||||
"DECNET_DEVELOPER_TRACING": "false",
|
"DECNET_DEVELOPER_TRACING": "false",
|
||||||
"DECNET_DB_TYPE": "sqlite",
|
"DECNET_DB_TYPE": "sqlite",
|
||||||
"DECNET_MODE": "master",
|
"DECNET_MODE": "master",
|
||||||
|
# Locust hammers /auth/login from a single host as a single
|
||||||
|
# user — the production 10/5min per-IP + per-user limits would
|
||||||
|
# kill ramp-up past the 11th virtual user. Stress tests are
|
||||||
|
# measuring throughput, not rate-limiting; disable in this
|
||||||
|
# subprocess only.
|
||||||
|
"DECNET_LIMITER_ENABLED": "false",
|
||||||
})
|
})
|
||||||
proc = subprocess.Popen(
|
proc = subprocess.Popen(
|
||||||
[
|
[
|
||||||
|
|||||||
@@ -24,7 +24,8 @@ class DecnetUser(HttpUser):
|
|||||||
wait_time = between(0.01, 0.05) # near-zero think time — max pressure
|
wait_time = between(0.01, 0.05) # near-zero think time — max pressure
|
||||||
|
|
||||||
def _login_with_retry(self):
|
def _login_with_retry(self):
|
||||||
"""Login with exponential backoff — handles connection storms.
|
"""Login with exponential backoff — handles connection storms
|
||||||
|
and (if the server still has rate limits on) 429 throttling.
|
||||||
|
|
||||||
Returns (access_token, must_change_password)."""
|
Returns (access_token, must_change_password)."""
|
||||||
for attempt in range(_MAX_LOGIN_RETRIES):
|
for attempt in range(_MAX_LOGIN_RETRIES):
|
||||||
@@ -40,6 +41,21 @@ class DecnetUser(HttpUser):
|
|||||||
if resp.status_code == 0 or resp.status_code >= 500:
|
if resp.status_code == 0 or resp.status_code >= 500:
|
||||||
time.sleep(_LOGIN_BACKOFF_BASE * (2 ** attempt))
|
time.sleep(_LOGIN_BACKOFF_BASE * (2 ** attempt))
|
||||||
continue
|
continue
|
||||||
|
# 429: the server is rate-limiting logins. In stress runs the
|
||||||
|
# fixture sets DECNET_LIMITER_ENABLED=false so we should
|
||||||
|
# never see this — but if someone points locust at a real
|
||||||
|
# server, honour Retry-After so the run degrades gracefully
|
||||||
|
# instead of crashing on_start.
|
||||||
|
if resp.status_code == 429:
|
||||||
|
retry_after = resp.headers.get("Retry-After")
|
||||||
|
delay = _LOGIN_BACKOFF_BASE * (2 ** attempt)
|
||||||
|
if retry_after:
|
||||||
|
try:
|
||||||
|
delay = max(delay, float(retry_after))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
time.sleep(delay)
|
||||||
|
continue
|
||||||
raise RuntimeError(f"Login failed (non-retryable): {resp.status_code} {resp.text}")
|
raise RuntimeError(f"Login failed (non-retryable): {resp.status_code} {resp.text}")
|
||||||
raise RuntimeError(f"Login failed after {_MAX_LOGIN_RETRIES} retries (last status: {resp.status_code})")
|
raise RuntimeError(f"Login failed after {_MAX_LOGIN_RETRIES} retries (last status: {resp.status_code})")
|
||||||
|
|
||||||
@@ -111,12 +127,12 @@ class DecnetUser(HttpUser):
|
|||||||
|
|
||||||
# --- Write / auth paths (low weight) ---
|
# --- Write / auth paths (low weight) ---
|
||||||
|
|
||||||
@task(2)
|
# N.B. a previous revision had a @task(2) login here that re-hit
|
||||||
def login(self):
|
# /auth/login during the run. Under N>10 virtual users it burned
|
||||||
self.client.post(
|
# the 10/5min per-IP + per-username limits and turned the whole
|
||||||
"/api/v1/auth/login",
|
# stress run into a 429 factory. The login hot path is already
|
||||||
json={"username": ADMIN_USER, "password": ADMIN_PASS},
|
# covered by on_start for every simulated user; re-logging in on
|
||||||
)
|
# every tick adds no coverage, just contention.
|
||||||
|
|
||||||
@task(1)
|
@task(1)
|
||||||
def stream_sse(self):
|
def stream_sse(self):
|
||||||
|
|||||||
Reference in New Issue
Block a user