From d61e143b71f3978b7c18fc217a8899ac66726a76 Mon Sep 17 00:00:00 2001 From: anti Date: Fri, 24 Apr 2026 00:13:15 -0400 Subject: [PATCH] fix(stress): unblock Locust runs from login rate-limit self-DoS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Locust spawns N virtual users (default 1000), all from 127.0.0.1 as admin. /auth/login is rate-limited 10/5min per-IP AND per-username, so the 11th on_start() got 429 and a RuntimeError. A @task(2) login in the task weights turned the whole run into a 429 factory even after ramp-up. And _login_with_retry treated 429 as non-retryable, so there was no graceful degradation path. Three changes, one root cause: - decnet/web/limiter.py: read DECNET_LIMITER_ENABLED (default true). When false, slowapi's Limiter(enabled=False) makes @limiter.limit a no-op. Default ships unchanged; nobody should ever release with this off. - tests/stress/conftest.py: set DECNET_LIMITER_ENABLED=false in the uvicorn subprocess env. Stress tests measure throughput, not rate limiting. - tests/stress/locustfile.py: drop the @task(2) login — it added zero coverage (every user already logs in at on_start) and only generated contention. Teach _login_with_retry to honour 429 + Retry-After so a Locust pointed at a limiter-enabled server degrades gracefully instead of crashing on_start. --- decnet/web/limiter.py | 15 +++++++++++++++ tests/stress/conftest.py | 6 ++++++ tests/stress/locustfile.py | 30 +++++++++++++++++++++++------- 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/decnet/web/limiter.py b/decnet/web/limiter.py index 6b47c0ad..0dce1bd4 100644 --- a/decnet/web/limiter.py +++ b/decnet/web/limiter.py @@ -20,6 +20,7 @@ we introduce a verified-proxy config. 
from __future__ import annotations import json +import os from typing import Any, Awaitable, Callable from fastapi import Request @@ -27,6 +28,19 @@ from slowapi import Limiter from slowapi.util import get_remote_address +def _limiter_enabled() -> bool: + """``DECNET_LIMITER_ENABLED=false`` disables the limiter process-wide. + + Intended for stress / load testing, where a single Locust host + represents thousands of virtual users but shares one source IP and + one admin username — the real-world limits (10/5min per IP, per + user) would otherwise cap every run at 10 successful logins. The + default is ``true``; nobody should ever ship a release with this + off. + """ + return os.environ.get("DECNET_LIMITER_ENABLED", "true").lower() != "false" + + # Single process-wide limiter. Importing modules pull this instance to # apply `@limiter.limit(...)` decorators on their routes. Default # headers off: FastAPI response_model handlers return dicts, not @@ -36,6 +50,7 @@ from slowapi.util import get_remote_address limiter: Limiter = Limiter( key_func=get_remote_address, storage_uri="memory://", + enabled=_limiter_enabled(), ) diff --git a/tests/stress/conftest.py b/tests/stress/conftest.py index 7f552ee1..7235c246 100644 --- a/tests/stress/conftest.py +++ b/tests/stress/conftest.py @@ -72,6 +72,12 @@ def stress_server(): "DECNET_DEVELOPER_TRACING": "false", "DECNET_DB_TYPE": "sqlite", "DECNET_MODE": "master", + # Locust hammers /auth/login from a single host as a single + # user — the production 10/5min per-IP + per-user limits would + # kill ramp-up past the 11th virtual user. Stress tests are + # measuring throughput, not rate-limiting; disable in this + # subprocess only. 
+ "DECNET_LIMITER_ENABLED": "false", }) proc = subprocess.Popen( [ diff --git a/tests/stress/locustfile.py b/tests/stress/locustfile.py index fae56923..586aa816 100644 --- a/tests/stress/locustfile.py +++ b/tests/stress/locustfile.py @@ -24,7 +24,8 @@ class DecnetUser(HttpUser): wait_time = between(0.01, 0.05) # near-zero think time — max pressure def _login_with_retry(self): - """Login with exponential backoff — handles connection storms. + """Login with exponential backoff — handles connection storms + and (if the server still has rate limits on) 429 throttling. Returns (access_token, must_change_password).""" for attempt in range(_MAX_LOGIN_RETRIES): @@ -40,6 +41,21 @@ class DecnetUser(HttpUser): if resp.status_code == 0 or resp.status_code >= 500: time.sleep(_LOGIN_BACKOFF_BASE * (2 ** attempt)) continue + # 429: the server is rate-limiting logins. In stress runs the + # fixture sets DECNET_LIMITER_ENABLED=false so we should + # never see this — but if someone points locust at a real + # server, honour Retry-After so the run degrades gracefully + # instead of crashing on_start. + if resp.status_code == 429: + retry_after = resp.headers.get("Retry-After") + delay = _LOGIN_BACKOFF_BASE * (2 ** attempt) + if retry_after: + try: + delay = max(delay, float(retry_after)) + except ValueError: + pass + time.sleep(delay) + continue raise RuntimeError(f"Login failed (non-retryable): {resp.status_code} {resp.text}") raise RuntimeError(f"Login failed after {_MAX_LOGIN_RETRIES} retries (last status: {resp.status_code})") @@ -111,12 +127,12 @@ class DecnetUser(HttpUser): # --- Write / auth paths (low weight) --- - @task(2) - def login(self): - self.client.post( - "/api/v1/auth/login", - json={"username": ADMIN_USER, "password": ADMIN_PASS}, - ) + # N.B. a previous revision had a @task(2) login here that re-hit + # /auth/login during the run. Under N>10 virtual users it burned + # the 10/5min per-IP + per-username limits and turned the whole + # stress run into a 429 factory. 
The login hot path is already + # covered by on_start for every simulated user; re-logging in + # during the run adds no coverage, just contention. @task(1) def stream_sse(self):