Files
DECNET/tests/stress/locustfile.py
anti cc6abf7256 fix(tests/stress): eliminate 0-request flakes in locust runs
Three independent issues conspired to make stress tests record 0 requests:

1. Every virtual user did /auth/login in on_start. With 1000 users in a
   spike window, bcrypt-bound logins never finished and on_start failed
   for all users — aggregated requests stayed at 0. Pre-fetch a single
   admin token in the fixture (cached per-host) and pass it via
   DECNET_STRESS_TOKEN so locust users skip the login storm.

2. Locust exits non-zero on any request failure by default, causing
   run_locust to throw away an otherwise valid stats CSV. Pass
   --exit-code-on-error 0 so per-test assertions are the only fail gate.

3. test_stress_sustained ran two locust subprocesses against the same
   uvicorn. Phase 1's keep-alive connections wedged phase 2 into 0
   recorded requests ~2/3 of the time. Refactored stress_server into a
   start_stress_server() context manager and gave each phase its own
   uvicorn.

Stable 3/3 on full suite, 3/3 on test_stress_sustained alone.
2026-04-28 13:01:11 -04:00

155 lines
5.4 KiB
Python

"""
Locust user class for DECNET API stress testing.
Hammers every endpoint from the OpenAPI spec with realistic traffic weights.
Can be used standalone (`locust -f tests/stress/locustfile.py`) or
programmatically via the pytest fixtures in conftest.py.
"""
import os
import random
import time
from locust import HttpUser, task, between
# Admin credentials for the login fallback; each can be overridden through
# the environment, otherwise "admin" is used.
ADMIN_USER = os.getenv("DECNET_ADMIN_USER", "admin")
ADMIN_PASS = os.getenv("DECNET_ADMIN_PASSWORD", "admin")

# Login retry policy consumed by DecnetUser._login_with_retry.
_LOGIN_BACKOFF_BASE = 0.5  # seconds, doubles each retry
_MAX_LOGIN_RETRIES = 5
class DecnetUser(HttpUser):
    """Locust user that issues weighted requests against the DECNET REST API.

    Each simulated user authenticates once in ``on_start`` (using a pre-fetched
    token when one is supplied via ``DECNET_STRESS_TOKEN``) and then runs the
    ``@task`` methods below; the integer passed to ``@task`` is the relative
    weight of that request.
    """

    wait_time = between(0.01, 0.05)  # near-zero think time — max pressure

    def _login_with_retry(self):
        """Log in as the admin user, retrying transient failures with backoff.

        Connection failures (status 0), 5xx responses and 429 responses are
        retried up to ``_MAX_LOGIN_RETRIES`` times. The wait starts at
        ``_LOGIN_BACKOFF_BASE`` seconds and doubles on every attempt; for a
        429 it is raised to the ``Retry-After`` header value when that value
        parses as a number.

        Returns:
            A ``(access_token, must_change_password)`` tuple taken from the
            JSON body of the successful login response.

        Raises:
            RuntimeError: if the server answers with a non-retryable status,
                or if every attempt failed with a retryable one.
        """
        last_status = None
        for attempt in range(_MAX_LOGIN_RETRIES):
            resp = self.client.post(
                "/api/v1/auth/login",
                json={"username": ADMIN_USER, "password": ADMIN_PASS},
                name="/api/v1/auth/login [on_start]",
            )
            last_status = resp.status_code
            if last_status == 200:
                body = resp.json()
                return body["access_token"], bool(body.get("must_change_password", False))

            delay = _LOGIN_BACKOFF_BASE * (2 ** attempt)
            if last_status == 429:
                # 429: the server is rate-limiting logins. In stress runs the
                # fixture sets DECNET_LIMITER_ENABLED=false so we should
                # never see this — but if someone points locust at a real
                # server, honour Retry-After so the run degrades gracefully
                # instead of crashing on_start.
                retry_after = resp.headers.get("Retry-After")
                if retry_after:
                    try:
                        delay = max(delay, float(retry_after))
                    except ValueError:
                        # Non-numeric Retry-After (e.g. an HTTP date): keep
                        # the plain exponential backoff.
                        pass
            elif not (last_status == 0 or last_status >= 500):
                # Anything other than status 0 (connection refused) or a 5xx
                # is treated as permanent.
                raise RuntimeError(
                    f"Login failed (non-retryable): {resp.status_code} {resp.text}"
                )

            # Sleeping after the final attempt would only delay the error.
            if attempt < _MAX_LOGIN_RETRIES - 1:
                time.sleep(delay)

        raise RuntimeError(
            f"Login failed after {_MAX_LOGIN_RETRIES} retries (last status: {last_status})"
        )

    def on_start(self):
        """Obtain a bearer token and attach it to every request of this user.

        Uses ``DECNET_STRESS_TOKEN`` from the environment when it is set;
        otherwise logs in, and if the server reports that the password must
        be changed, posts a change-password request (old and new password are
        both ``ADMIN_PASS``) and logs in again.
        """
        # Prefer the fixture-supplied token: 1000 simultaneous bcrypt logins
        # never finish inside a spike window, leaving aggregated requests at 0.
        preset = os.environ.get("DECNET_STRESS_TOKEN")
        if preset:
            self.token = preset
        else:
            token, must_change = self._login_with_retry()
            if must_change:
                self.client.post(
                    "/api/v1/auth/change-password",
                    json={"old_password": ADMIN_PASS, "new_password": ADMIN_PASS},
                    headers={"Authorization": f"Bearer {token}"},
                )
                token, _ = self._login_with_retry()
            self.token = token
        self.client.headers.update({"Authorization": f"Bearer {self.token}"})

    # --- Read-hot paths (high weight) ---

    @task(10)
    def get_stats(self):
        """GET /api/v1/stats."""
        self.client.get("/api/v1/stats")

    @task(8)
    def get_logs(self):
        """GET /api/v1/logs with limit=50."""
        self.client.get("/api/v1/logs", params={"limit": 50})

    @task(8)
    def get_attackers(self):
        """GET /api/v1/attackers."""
        self.client.get("/api/v1/attackers")

    @task(7)
    def get_deckies(self):
        """GET /api/v1/deckies."""
        self.client.get("/api/v1/deckies")

    @task(6)
    def get_bounties(self):
        """GET /api/v1/bounty."""
        self.client.get("/api/v1/bounty")

    @task(5)
    def get_logs_histogram(self):
        """GET /api/v1/logs/histogram."""
        self.client.get("/api/v1/logs/histogram")

    @task(5)
    def search_logs(self):
        """GET /api/v1/logs with search="ssh" and limit=100."""
        self.client.get("/api/v1/logs", params={"search": "ssh", "limit": 100})

    @task(4)
    def search_attackers(self):
        """GET /api/v1/attackers with search="brute", sort_by="recent"."""
        self.client.get(
            "/api/v1/attackers", params={"search": "brute", "sort_by": "recent"}
        )

    @task(4)
    def paginate_logs(self):
        """GET /api/v1/logs with limit=100 at a random offset in [0, 1000]."""
        offset = random.randint(0, 1000)
        self.client.get("/api/v1/logs", params={"limit": 100, "offset": offset})

    @task(3)
    def get_health(self):
        """GET /api/v1/health."""
        self.client.get("/api/v1/health")

    @task(3)
    def get_config(self):
        """GET /api/v1/config."""
        self.client.get("/api/v1/config")

    # --- Write / auth paths (low weight) ---
    # N.B. a previous revision had a @task(2) login here that re-hit
    # /auth/login during the run. Under N>10 virtual users it burned
    # the 10/5min per-IP + per-username limits and turned the whole
    # stress run into a 429 factory. The login path is still exercised by
    # on_start whenever no DECNET_STRESS_TOKEN is supplied; re-logging in
    # on every tick adds no coverage, just contention.

    @task(1)
    def stream_sse(self):
        """Short-lived SSE connection — read at most one chunk, then close.

        The request is recorded as a success on HTTP 200 and as a failure
        with the status code otherwise.
        """
        with self.client.get(
            "/api/v1/stream",
            params={"maxOutput": 3},
            stream=True,
            catch_response=True,
            name="/api/v1/stream",
        ) as resp:
            try:
                if resp.status_code == 200:
                    # Pull at most one chunk (<= 1 KiB) then bail — we're
                    # stress-testing connection setup, not the stream body.
                    next(resp.iter_content(chunk_size=1024), None)
                    resp.success()
                else:
                    resp.failure(f"SSE returned {resp.status_code}")
            finally:
                # With stream=True the body is not consumed, so the
                # connection stays checked out until it is closed explicitly.
                # A status-0 response has no underlying raw stream to close.
                if getattr(resp, "raw", None) is not None:
                    resp.close()