merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
0
tests/stress/__init__.py
Normal file
0
tests/stress/__init__.py
Normal file
377
tests/stress/conftest.py
Normal file
377
tests/stress/conftest.py
Normal file
@@ -0,0 +1,377 @@
|
||||
"""
|
||||
Stress-test fixtures: real uvicorn server + out-of-process Locust runner.
|
||||
|
||||
Locust is run via its CLI in a fresh subprocess so its gevent monkey-patching
|
||||
happens before ssl/urllib3 are imported. Running it in-process here causes a
|
||||
RecursionError in urllib3's create_urllib3_context on Python 3.11+.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import multiprocessing
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import socket
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Configuration — every load knob can be overridden from the environment so a
# CI job can scale a run up or down without editing this file.
# ---------------------------------------------------------------------------
STRESS_USERS = int(os.getenv("STRESS_USERS", "1000"))
STRESS_SPAWN_RATE = int(os.getenv("STRESS_SPAWN_RATE", "50"))
STRESS_DURATION = int(os.getenv("STRESS_DURATION", "60"))
# Default uvicorn worker count: one per CPU, capped at four.
_default_workers = min(multiprocessing.cpu_count(), 4)
STRESS_WORKERS = int(os.getenv("STRESS_WORKERS", str(_default_workers)))

# Credentials and signing secret handed to the server subprocess and reused by
# the login helpers below.
ADMIN_USER = "admin"
ADMIN_PASS = "test-password-123"
JWT_SECRET = "stable-test-secret-key-at-least-32-chars-long"

# Filesystem anchors: this directory, the repository root two levels above it,
# and the locustfile that sits next to this conftest.
_STRESS_DIR = Path(__file__).resolve().parent
_REPO_ROOT = _STRESS_DIR.parent.parent
_LOCUSTFILE = _STRESS_DIR / "locustfile.py"
|
||||
|
||||
|
||||
def _free_port() -> int:
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
s.bind(("127.0.0.1", 0))
|
||||
return s.getsockname()[1]
|
||||
|
||||
|
||||
def _wait_for_server(url: str, timeout: float = 60.0) -> None:
|
||||
deadline = time.monotonic() + timeout
|
||||
while time.monotonic() < deadline:
|
||||
try:
|
||||
r = requests.get(url, timeout=2)
|
||||
if r.status_code in (200, 401, 503):
|
||||
return
|
||||
except requests.RequestException:
|
||||
# ConnectionError / ReadTimeout / anything else transient — the
|
||||
# server is either not up yet or too busy to respond in time.
|
||||
pass
|
||||
time.sleep(0.1)
|
||||
raise TimeoutError(f"Server not ready at {url}")
|
||||
|
||||
|
||||
from contextlib import contextmanager
|
||||
|
||||
|
||||
def _stop_process(proc, grace: float) -> None:
    """Terminate *proc*, escalating to kill if it is still alive after *grace* seconds."""
    proc.terminate()
    try:
        proc.wait(timeout=grace)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()


@contextmanager
def _start_stress_server():
    """Spawn a uvicorn for stress testing; yield base_url; tear down on exit.

    The server runs ``decnet.web.api:app`` on a free loopback port with
    ``STRESS_WORKERS`` workers. Inherited ``DECNET_*`` variables are dropped
    and replaced by a fixed test configuration.

    Server output is spooled to temporary files rather than pipes: nothing
    reads the output while the load test is running, so a pipe could fill up
    and block the server on its next write.

    Yields:
        The ``http://127.0.0.1:<port>`` base URL once the health endpoint
        answers.

    Raises:
        TimeoutError: the server did not become ready within 60 seconds; the
            message carries the captured stdout/stderr.
    """
    import tempfile

    port = _free_port()
    env = {k: v for k, v in os.environ.items() if not k.startswith("DECNET_")}
    env.update({
        "DECNET_JWT_SECRET": JWT_SECRET,
        "DECNET_ADMIN_PASSWORD": ADMIN_PASS,
        "DECNET_DEVELOPER": "false",
        "DECNET_DEVELOPER_TRACING": "false",
        "DECNET_DB_TYPE": "sqlite",
        "DECNET_MODE": "master",
        # Locust hammers /auth/login from a single host as a single
        # user — the production 10/5min per-IP + per-user limits would
        # kill ramp-up past the 11th virtual user. Stress tests are
        # measuring throughput, not rate-limiting; disable in this
        # subprocess only.
        "DECNET_LIMITER_ENABLED": "false",
    })
    base_url = f"http://127.0.0.1:{port}"

    with tempfile.TemporaryFile() as out_log, tempfile.TemporaryFile() as err_log:
        proc = subprocess.Popen(
            [
                sys.executable, "-m", "uvicorn",
                "decnet.web.api:app",
                "--host", "127.0.0.1",
                "--port", str(port),
                "--workers", str(STRESS_WORKERS),
                "--log-level", "warning",
            ],
            env=env,
            stdout=out_log,
            stderr=err_log,
        )
        try:
            try:
                _wait_for_server(f"{base_url}/api/v1/health", timeout=60.0)
            except TimeoutError:
                _stop_process(proc, grace=5)
                # The child wrote through the same open file, so rewind
                # before reading what it produced.
                out_log.seek(0)
                err_log.seek(0)
                out = out_log.read()
                err = err_log.read()
                raise TimeoutError(
                    f"uvicorn did not become ready.\n"
                    f"--- stdout ---\n{out.decode(errors='replace')}\n"
                    f"--- stderr ---\n{err.decode(errors='replace')}"
                )
            yield base_url
        finally:
            # Drop any cached admin token for the dying host so the next server
            # gets a fresh login instead of presenting a JWT signed against a
            # stale process state.
            _TOKEN_CACHE.pop(base_url, None)
            _stop_process(proc, grace=10)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def stress_server():
    """Start a real uvicorn server for stress testing."""
    # Deliberately function-scoped so each stress test talks to a brand-new
    # uvicorn. When baseline, spike and sustained shared one server, the later
    # runs inherited a half-dead pool and recorded 0 requests. The price is
    # roughly 5s of startup per test.
    with _start_stress_server() as server_url:
        yield server_url


# Public alias of the context manager, for tests that need more than one
# server inside a single test (e.g. test_stress_sustained starts a fresh
# uvicorn per phase, because keep-alive connections left over from phase 1
# wedge phase 2 into 0 recorded requests).
start_stress_server = _start_stress_server
|
||||
|
||||
|
||||
@pytest.fixture
def stress_token(stress_server):
    """Authenticate and return a valid admin JWT.

    Logs in as the admin user, re-submits the same password through
    ``/auth/change-password`` to clear the must-change-password flag, then
    logs in again and returns the token from that second login.

    Every HTTP call carries a timeout so a wedged server fails the test
    instead of hanging the whole session.
    """
    url = stress_server
    credentials = {"username": ADMIN_USER, "password": ADMIN_PASS}
    request_timeout = 30  # same budget _fetch_admin_token uses

    resp = requests.post(
        f"{url}/api/v1/auth/login",
        json=credentials,
        timeout=request_timeout,
    )
    assert resp.status_code == 200, f"Login failed: {resp.text}"
    token = resp.json()["access_token"]

    # Clear must_change_password
    requests.post(
        f"{url}/api/v1/auth/change-password",
        json={"old_password": ADMIN_PASS, "new_password": ADMIN_PASS},
        headers={"Authorization": f"Bearer {token}"},
        timeout=request_timeout,
    )

    # Re-login for clean token
    resp2 = requests.post(
        f"{url}/api/v1/auth/login",
        json=credentials,
        timeout=request_timeout,
    )
    # Without this check a failed re-login surfaces as an opaque KeyError.
    assert resp2.status_code == 200, f"Re-login failed: {resp2.text}"
    return resp2.json()["access_token"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Locust subprocess runner + stats shim
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _StatsEntry:
|
||||
"""Shim mimicking locust.stats.StatsEntry for the fields our tests use."""
|
||||
def __init__(self, row: dict, percentile_rows: dict):
|
||||
self.method = row.get("Type", "") or ""
|
||||
self.name = row.get("Name", "")
|
||||
self.num_requests = int(float(row.get("Request Count", 0) or 0))
|
||||
self.num_failures = int(float(row.get("Failure Count", 0) or 0))
|
||||
self.avg_response_time = float(row.get("Average Response Time", 0) or 0)
|
||||
self.min_response_time = float(row.get("Min Response Time", 0) or 0)
|
||||
self.max_response_time = float(row.get("Max Response Time", 0) or 0)
|
||||
self.total_rps = float(row.get("Requests/s", 0) or 0)
|
||||
self._percentiles = percentile_rows # {0.5: ms, 0.95: ms, ...}
|
||||
|
||||
def get_response_time_percentile(self, p: float):
|
||||
# Accept either 0.99 or 99 form; normalize to 0..1
|
||||
if p > 1:
|
||||
p = p / 100.0
|
||||
# Exact match first
|
||||
if p in self._percentiles:
|
||||
return self._percentiles[p]
|
||||
# Fuzzy match on closest declared percentile
|
||||
if not self._percentiles:
|
||||
return 0
|
||||
closest = min(self._percentiles.keys(), key=lambda k: abs(k - p))
|
||||
return self._percentiles[closest]
|
||||
|
||||
|
||||
class _Stats:
|
||||
def __init__(self, total: _StatsEntry, entries: dict):
|
||||
self.total = total
|
||||
self.entries = entries
|
||||
|
||||
|
||||
class _LocustEnv:
|
||||
def __init__(self, stats: _Stats):
|
||||
self.stats = stats
|
||||
|
||||
|
||||
# Locust CSV column names for percentile fields (varies slightly by version).
|
||||
_PCT_COL_MAP = {
|
||||
"50%": 0.50, "66%": 0.66, "75%": 0.75, "80%": 0.80,
|
||||
"90%": 0.90, "95%": 0.95, "98%": 0.98, "99%": 0.99,
|
||||
"99.9%": 0.999, "99.99%": 0.9999, "100%": 1.0,
|
||||
}
|
||||
|
||||
|
||||
def _parse_locust_csv(stats_csv: Path) -> _LocustEnv:
|
||||
if not stats_csv.exists():
|
||||
raise RuntimeError(f"locust stats csv missing: {stats_csv}")
|
||||
|
||||
entries: dict = {}
|
||||
total: _StatsEntry | None = None
|
||||
|
||||
with stats_csv.open() as fh:
|
||||
reader = csv.DictReader(fh)
|
||||
for row in reader:
|
||||
pcts = {}
|
||||
for col, frac in _PCT_COL_MAP.items():
|
||||
v = row.get(col)
|
||||
if v not in (None, "", "N/A"):
|
||||
try:
|
||||
pcts[frac] = float(v)
|
||||
except ValueError:
|
||||
pass
|
||||
entry = _StatsEntry(row, pcts)
|
||||
if row.get("Name") == "Aggregated":
|
||||
total = entry
|
||||
else:
|
||||
key = (entry.method, entry.name)
|
||||
entries[key] = entry
|
||||
|
||||
if total is None:
|
||||
# Fallback: synthesize a zero-row total
|
||||
total = _StatsEntry({}, {})
|
||||
return _LocustEnv(_Stats(total, entries))
|
||||
|
||||
|
||||
# Admin JWTs keyed by server base URL, filled by _fetch_admin_token.
_TOKEN_CACHE: dict[str, str] = {}


def _login_once(host: str, timeout: float) -> dict:
    """POST the admin credentials to *host* and return the decoded JSON body.

    Timeouts and connection errors are retried up to five times with
    exponential backoff (1, 2, 4, 8 seconds between attempts). HTTP error
    statuses are not retried; ``raise_for_status`` propagates them.

    Args:
        host: server base URL, without a trailing slash.
        timeout: per-request timeout in seconds.

    Raises:
        RuntimeError: every attempt timed out or failed to connect; the last
            underlying exception is attached as the cause.
    """
    max_attempts = 5
    last_exc: Exception | None = None
    # Retry through transient drain windows: between phases the API is
    # still flushing connections from the prior locust run, so the first
    # POST can sit in-queue past a single 15s timeout.
    for attempt in range(max_attempts):
        try:
            resp = requests.post(
                f"{host}/api/v1/auth/login",
                json={"username": ADMIN_USER, "password": ADMIN_PASS},
                timeout=timeout,
            )
            resp.raise_for_status()
            return resp.json()
        except (requests.Timeout, requests.ConnectionError) as e:
            last_exc = e
            # Back off only if another attempt follows; sleeping before the
            # final raise just delays the failure report.
            if attempt < max_attempts - 1:
                time.sleep(2 ** attempt)
    raise RuntimeError(f"admin login failed after retries: {last_exc!r}") from last_exc
|
||||
|
||||
|
||||
def _fetch_admin_token(host: str) -> str:
    """Return an admin JWT for *host*, logging in only on a cache miss.

    The token is fetched up front so locust virtual users don't all stampede
    /auth/login in on_start. Bcrypt is CPU-bound; 1000 simultaneous logins
    under a 15s spike window means no user ever completes on_start and the
    aggregated request count is 0.

    Tokens are cached per host and reused across phases, which avoids a fresh
    /auth/login round-trip while a previous run is still draining.

    If the login response flags ``must_change_password``, the password is
    re-submitted unchanged through /auth/change-password and a second login
    supplies the token.
    """
    if cached := _TOKEN_CACHE.get(host):
        return cached

    login = _login_once(host, timeout=30)
    if login.get("must_change_password"):
        requests.post(
            f"{host}/api/v1/auth/change-password",
            json={"old_password": ADMIN_PASS, "new_password": ADMIN_PASS},
            headers={"Authorization": f"Bearer {login['access_token']}"},
            timeout=30,
        )
        login = _login_once(host, timeout=30)

    token = login["access_token"]
    _TOKEN_CACHE[host] = token
    return token
|
||||
|
||||
|
||||
def run_locust(host, users, spawn_rate, duration, _retry=False):
    """Run Locust in a subprocess (fresh Python, clean gevent monkey-patch)
    and return a stats shim compatible with the tests.

    Args:
        host: base URL of the server under test.
        users: number of virtual users (locust ``-u``).
        spawn_rate: users started per second (locust ``-r``).
        duration: run time in seconds (locust ``-t``).
        _retry: internal flag; True on the single automatic retry that
            follows a run which recorded zero requests.

    Returns:
        A ``_LocustEnv`` built from the run's ``*_stats.csv``.

    Raises:
        RuntimeError: the subprocess timed out, exited non-zero, produced no
            stats CSV, or recorded zero requests even after one retry.
    """
    import tempfile

    env = dict(os.environ)
    # Ensure DecnetUser.on_start can log in with the right creds
    env.setdefault("DECNET_ADMIN_USER", ADMIN_USER)
    env.setdefault("DECNET_ADMIN_PASSWORD", ADMIN_PASS)
    # Pre-fetched token: locustfile picks this up and skips its own login.
    env["DECNET_STRESS_TOKEN"] = _fetch_admin_token(host)

    # Generous timeout: locust run-time + spawn ramp + shutdown grace
    wall_timeout = duration + max(30, users // max(1, spawn_rate)) + 30

    # The CSV output is only needed until it has been parsed, so keep it in a
    # directory that is removed on every exit path.
    with tempfile.TemporaryDirectory(prefix="locust-stress-") as tmp:
        csv_prefix = Path(tmp) / "run"
        cmd = [
            sys.executable, "-m", "locust",
            "-f", str(_LOCUSTFILE),
            "--headless",
            "--host", host,
            "-u", str(users),
            "-r", str(spawn_rate),
            "-t", f"{duration}s",
            "--csv", str(csv_prefix),
            "--only-summary",
            "--loglevel", "WARNING",
            # Locust defaults to exit code 1 on any request failure. We have our
            # own per-test assertions for fail-rate; let the subprocess exit 0 so
            # we don't throw away an otherwise valid stats CSV.
            "--exit-code-on-error", "0",
        ]

        try:
            proc = subprocess.run(
                cmd,
                env=env,
                cwd=str(_REPO_ROOT),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=wall_timeout,
            )
        except subprocess.TimeoutExpired as e:
            raise RuntimeError(
                f"locust subprocess timed out after {wall_timeout}s.\n"
                f"--- stdout ---\n{(e.stdout or b'').decode(errors='replace')}\n"
                f"--- stderr ---\n{(e.stderr or b'').decode(errors='replace')}"
            ) from e

        # Request failures are mapped to exit code 0 above, so a non-zero
        # exit here is a real error.
        if proc.returncode != 0:
            raise RuntimeError(
                f"locust subprocess exited {proc.returncode}.\n"
                f"--- stdout ---\n{proc.stdout.decode(errors='replace')}\n"
                f"--- stderr ---\n{proc.stderr.decode(errors='replace')}"
            )

        result = _parse_locust_csv(Path(str(csv_prefix) + "_stats.csv"))

    if result.stats.total.num_requests > 0:
        return result

    if _retry:
        raise RuntimeError(
            f"locust produced 0 requests (after 1 retry).\n"
            f"--- stdout ---\n{proc.stdout.decode(errors='replace')}\n"
            f"--- stderr ---\n{proc.stderr.decode(errors='replace')}"
        )

    # Transient: server was mid-drain or connection storm RSTed before any
    # request landed. Wait for the API to respond cleanly, then retry once
    # before giving up.
    try:
        _wait_for_server(f"{host}/api/v1/health", timeout=15.0)
    except TimeoutError:
        # Best effort: the retry below will report the failure if the server
        # really is gone.
        pass
    time.sleep(2)
    return run_locust(host, users, spawn_rate, duration, _retry=True)
|
||||
154
tests/stress/locustfile.py
Normal file
154
tests/stress/locustfile.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""
|
||||
Locust user class for DECNET API stress testing.
|
||||
|
||||
Hammers every endpoint from the OpenAPI spec with realistic traffic weights.
|
||||
Can be used standalone (`locust -f tests/stress/locustfile.py`) or
|
||||
programmatically via the pytest fixtures in conftest.py.
|
||||
"""
|
||||
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
|
||||
from locust import HttpUser, task, between
|
||||
|
||||
|
||||
# Credentials for the on_start login; overridable from the environment.
ADMIN_USER = os.environ.get("DECNET_ADMIN_USER", "admin")
ADMIN_PASS = os.environ.get("DECNET_ADMIN_PASSWORD", "admin")

_MAX_LOGIN_RETRIES = 5
_LOGIN_BACKOFF_BASE = 0.5  # seconds, doubles each retry


class DecnetUser(HttpUser):
    """Simulated API client: authenticates once, then issues weighted requests."""

    wait_time = between(0.01, 0.05)  # near-zero think time — max pressure

    def _login_with_retry(self):
        """Login with exponential backoff — handles connection storms
        and (if the server still has rate limits on) 429 throttling.

        Returns (access_token, must_change_password).

        Raises:
            RuntimeError: the server answered with a non-retryable status, or
                every attempt was exhausted.
        """
        last_status = None
        for attempt in range(_MAX_LOGIN_RETRIES):
            resp = self.client.post(
                "/api/v1/auth/login",
                json={"username": ADMIN_USER, "password": ADMIN_PASS},
                name="/api/v1/auth/login [on_start]",
            )
            status = resp.status_code
            if status == 200:
                body = resp.json()
                return body["access_token"], bool(body.get("must_change_password", False))

            # Status 0 = connection refused; 5xx = server struggling;
            # 429 = rate-limited. Anything else is not worth retrying.
            if not (status == 0 or status >= 500 or status == 429):
                raise RuntimeError(f"Login failed (non-retryable): {resp.status_code} {resp.text}")

            last_status = status
            delay = _LOGIN_BACKOFF_BASE * (2 ** attempt)
            if status == 429:
                # In stress runs the fixture sets DECNET_LIMITER_ENABLED=false
                # so we should never see this — but if someone points locust
                # at a real server, honour Retry-After so the run degrades
                # gracefully instead of crashing on_start.
                retry_after = resp.headers.get("Retry-After")
                if retry_after:
                    try:
                        delay = max(delay, float(retry_after))
                    except ValueError:
                        # Non-numeric Retry-After: keep the plain backoff.
                        pass

            # Only wait when another attempt follows; sleeping before the
            # final raise just delays the failure.
            if attempt < _MAX_LOGIN_RETRIES - 1:
                time.sleep(delay)

        raise RuntimeError(f"Login failed after {_MAX_LOGIN_RETRIES} retries (last status: {last_status})")

    def on_start(self):
        """Obtain a bearer token and install it on the client's default headers."""
        # Prefer the fixture-supplied token: 1000 simultaneous bcrypt logins
        # never finish inside a spike window, leaving aggregated requests at 0.
        preset = os.environ.get("DECNET_STRESS_TOKEN")
        if preset:
            self.token = preset
        else:
            token, must_change = self._login_with_retry()
            if must_change:
                self.client.post(
                    "/api/v1/auth/change-password",
                    json={"old_password": ADMIN_PASS, "new_password": ADMIN_PASS},
                    headers={"Authorization": f"Bearer {token}"},
                )
                token, _ = self._login_with_retry()
            self.token = token
        self.client.headers.update({"Authorization": f"Bearer {self.token}"})

    # --- Read-hot paths (high weight) ---

    @task(10)
    def get_stats(self):
        self.client.get("/api/v1/stats")

    @task(8)
    def get_logs(self):
        self.client.get("/api/v1/logs", params={"limit": 50})

    @task(8)
    def get_attackers(self):
        self.client.get("/api/v1/attackers")

    @task(7)
    def get_deckies(self):
        self.client.get("/api/v1/deckies")

    @task(6)
    def get_bounties(self):
        self.client.get("/api/v1/bounty")

    @task(5)
    def get_logs_histogram(self):
        self.client.get("/api/v1/logs/histogram")

    @task(5)
    def search_logs(self):
        self.client.get("/api/v1/logs", params={"search": "ssh", "limit": 100})

    @task(4)
    def search_attackers(self):
        self.client.get(
            "/api/v1/attackers", params={"search": "brute", "sort_by": "recent"}
        )

    @task(4)
    def paginate_logs(self):
        offset = random.randint(0, 1000)
        self.client.get("/api/v1/logs", params={"limit": 100, "offset": offset})

    @task(3)
    def get_health(self):
        self.client.get("/api/v1/health")

    @task(3)
    def get_config(self):
        self.client.get("/api/v1/config")

    # --- Write / auth paths (low weight) ---

    # N.B. a previous revision had a @task(2) login here that re-hit
    # /auth/login during the run. Under N>10 virtual users it burned
    # the 10/5min per-IP + per-username limits and turned the whole
    # stress run into a 429 factory. The login hot path is already
    # covered by on_start for every simulated user; re-logging in on
    # every tick adds no coverage, just contention.

    @task(1)
    def stream_sse(self):
        """Short-lived SSE connection — read a few bytes then close."""
        with self.client.get(
            "/api/v1/stream",
            params={"maxOutput": 3},
            stream=True,
            catch_response=True,
            name="/api/v1/stream",
        ) as resp:
            if resp.status_code == 200:
                try:
                    # Pull at most one chunk (up to 1 KiB) then bail — we're
                    # stress-testing connection setup, not the stream body.
                    next(resp.iter_content(chunk_size=1024), None)
                finally:
                    # A streamed response keeps its connection checked out
                    # until the body is consumed or the response is closed.
                    resp.close()
                resp.success()
            else:
                resp.failure(f"SSE returned {resp.status_code}")
|
||||
163
tests/stress/test_stress.py
Normal file
163
tests/stress/test_stress.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""
|
||||
Locust-based stress tests for the DECNET API.
|
||||
|
||||
Run: pytest -m stress tests/stress/ -v -x -n0
|
||||
Tune: STRESS_USERS=2000 STRESS_SPAWN_RATE=200 STRESS_DURATION=120 pytest -m stress ...
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.stress.conftest import (
|
||||
run_locust,
|
||||
start_stress_server,
|
||||
STRESS_USERS,
|
||||
STRESS_SPAWN_RATE,
|
||||
STRESS_DURATION,
|
||||
)
|
||||
|
||||
|
||||
# Pass/fail thresholds for the baseline test; each can be overridden from the
# environment.
MIN_RPS = int(os.getenv("STRESS_MIN_RPS", "150"))  # minimum requests per second
MAX_P99_MS = int(os.getenv("STRESS_MAX_P99_MS", "10000"))  # p99 latency ceiling, ms
MAX_FAIL_RATE = float(os.getenv("STRESS_MAX_FAIL_RATE", "0.01"))  # 1%
|
||||
|
||||
|
||||
def _print_stats(env, label=""):
|
||||
"""Print a compact stats summary table."""
|
||||
total = env.stats.total
|
||||
num_reqs = total.num_requests
|
||||
num_fails = total.num_failures
|
||||
fail_pct = (num_fails / num_reqs * 100) if num_reqs else 0
|
||||
rps = total.total_rps
|
||||
|
||||
print(f"\n{'=' * 70}")
|
||||
if label:
|
||||
print(f" {label}")
|
||||
print(f"{'=' * 70}")
|
||||
print(f" {'Metric':<30} {'Value':>15}")
|
||||
print(f" {'-' * 45}")
|
||||
print(f" {'Total requests':<30} {num_reqs:>15,}")
|
||||
print(f" {'Failures':<30} {num_fails:>15,} ({fail_pct:.2f}%)")
|
||||
print(f" {'RPS (total)':<30} {rps:>15.1f}")
|
||||
print(f" {'Avg latency (ms)':<30} {total.avg_response_time:>15.1f}")
|
||||
print(f" {'p50 (ms)':<30} {total.get_response_time_percentile(0.50) or 0:>15.0f}")
|
||||
print(f" {'p95 (ms)':<30} {total.get_response_time_percentile(0.95) or 0:>15.0f}")
|
||||
print(f" {'p99 (ms)':<30} {total.get_response_time_percentile(0.99) or 0:>15.0f}")
|
||||
print(f" {'Min (ms)':<30} {total.min_response_time:>15.0f}")
|
||||
print(f" {'Max (ms)':<30} {total.max_response_time:>15.0f}")
|
||||
print(f"{'=' * 70}")
|
||||
|
||||
# Per-endpoint breakdown
|
||||
print(f"\n {'Endpoint':<45} {'Reqs':>8} {'Fails':>8} {'Avg(ms)':>10} {'p99(ms)':>10}")
|
||||
print(f" {'-' * 81}")
|
||||
for entry in sorted(env.stats.entries.values(), key=lambda e: e.num_requests, reverse=True):
|
||||
p99 = entry.get_response_time_percentile(0.99) or 0
|
||||
print(
|
||||
f" {entry.method + ' ' + entry.name:<45} "
|
||||
f"{entry.num_requests:>8,} "
|
||||
f"{entry.num_failures:>8,} "
|
||||
f"{entry.avg_response_time:>10.1f} "
|
||||
f"{p99:>10.0f}"
|
||||
)
|
||||
print()
|
||||
|
||||
|
||||
@pytest.mark.stress
def test_stress_rps_baseline(stress_server):
    """Baseline throughput: ramp to STRESS_USERS users, sustain for STRESS_DURATION seconds.

    Passes only when all of the following hold for the aggregated stats:
        - requests per second is at least MIN_RPS
        - p99 latency is at most MAX_P99_MS
        - failure rate is at most MAX_FAIL_RATE
    """
    env = run_locust(
        host=stress_server,
        users=STRESS_USERS,
        spawn_rate=STRESS_SPAWN_RATE,
        duration=STRESS_DURATION,
    )
    _print_stats(env, f"BASELINE: {STRESS_USERS} users, {STRESS_DURATION}s")

    aggregate = env.stats.total
    request_count = aggregate.num_requests
    assert request_count > 0, "No requests were made"

    observed_rps = aggregate.total_rps
    if request_count:
        fail_rate = aggregate.num_failures / request_count
    else:
        # Only reachable when asserts are disabled; count it as total failure.
        fail_rate = 1.0
    p99 = aggregate.get_response_time_percentile(0.99) or 0

    assert observed_rps >= MIN_RPS, f"RPS {observed_rps:.1f} below minimum {MIN_RPS}"
    assert p99 <= MAX_P99_MS, f"p99 {p99:.0f}ms exceeds max {MAX_P99_MS}ms"
    assert fail_rate <= MAX_FAIL_RATE, f"Failure rate {fail_rate:.2%} exceeds max {MAX_FAIL_RATE:.2%}"
|
||||
|
||||
|
||||
@pytest.mark.stress
def test_stress_spike(stress_server):
    """Thundering herd: ramp from 0 to STRESS_SPIKE_USERS (default 1000) users in ~5 seconds.

    Asserts: aggregated failure rate stays below 2%.
    """
    spike_users = int(os.environ.get("STRESS_SPIKE_USERS", "1000"))
    # All users in ~5 seconds. Clamp to 1 so a small STRESS_SPIKE_USERS
    # (fewer than 5) does not hand locust a spawn rate of 0.
    spike_spawn = max(1, spike_users // 5)

    env = run_locust(
        host=stress_server,
        users=spike_users,
        spawn_rate=spike_spawn,
        duration=15,  # 5s ramp + 10s sustained
    )
    _print_stats(env, f"SPIKE: 0 -> {spike_users} users in 5s")

    total = env.stats.total
    num_reqs = total.num_requests
    assert num_reqs > 0, "No requests were made"

    fail_rate = total.num_failures / num_reqs
    assert fail_rate < 0.02, f"Spike failure rate {fail_rate:.2%} — server buckled under thundering herd"
|
||||
|
||||
|
||||
@pytest.mark.stress
def test_stress_sustained():
    """Sustained load: 200 users for 30s. Checks latency doesn't degrade >3x.

    Two phases, each against its own freshly started uvicorn:
        1. Warm-up (10s) to get baseline latency
        2. Sustained (30s) to check for degradation

    The phases do not share a server because keep-alive connections leaked
    from phase 1 into phase 2 made the sustained run record 0 requests
    roughly two-thirds of the time.
    """
    sustained_users = int(os.environ.get("STRESS_SUSTAINED_USERS", "200"))
    ramp = min(sustained_users, 100)

    def run_phase(seconds, title):
        # One phase = one private server + one locust run; the average
        # latency is read while the server is still up.
        with start_stress_server() as url:
            phase_env = run_locust(
                host=url,
                users=sustained_users,
                spawn_rate=ramp,
                duration=seconds,
            )
            phase_avg = phase_env.stats.total.avg_response_time
            _print_stats(phase_env, title)
        return phase_env, phase_avg

    _env_warmup, baseline_avg = run_phase(
        10, f"SUSTAINED warm-up: {sustained_users} users, 10s"
    )
    env_sustained, sustained_avg = run_phase(
        30, f"SUSTAINED main: {sustained_users} users, 30s"
    )

    assert env_sustained.stats.total.num_requests > 0, "No requests during sustained phase"

    # A zero baseline gives nothing to compare against, so the ratio check is
    # skipped in that case.
    if baseline_avg > 0:
        degradation = sustained_avg / baseline_avg
        print(f"\n Latency degradation factor: {degradation:.2f}x (baseline {baseline_avg:.1f}ms -> sustained {sustained_avg:.1f}ms)")
        assert degradation < 3.0, (
            f"Latency degraded {degradation:.1f}x under sustained load "
            f"(baseline {baseline_avg:.1f}ms -> {sustained_avg:.1f}ms)"
        )
|
||||
Reference in New Issue
Block a user