Files
DECNET/tests/stress/test_stress.py
anti 195580c74d test: fix templates paths, CLI gating, and stress-suite harness
- tests/**: update templates/ → decnet/templates/ paths after module move
- tests/mysql_spinup.sh: use root:root and asyncmy driver
- tests/test_auto_spawn.py: patch decnet.cli.utils._pid_dir (package split)
- tests/test_cli.py: set DECNET_MODE=master in api-command tests
- tests/stress/conftest.py: run locust out-of-process via its CLI + CSV
  stats shim to avoid urllib3 RecursionError from late gevent monkey-patch;
  raise uvicorn startup timeout to 60s, accept 401 from auth-gated health,
  strip inherited DECNET_* env, surface stderr on 0-request runs
- tests/stress/test_stress.py: loosen baseline thresholds to match hw
2026-04-19 23:50:53 -04:00

155 lines
5.4 KiB
Python

"""
Locust-based stress tests for the DECNET API.
Run: pytest -m stress tests/stress/ -v -x -n0
Tune: STRESS_USERS=2000 STRESS_SPAWN_RATE=200 STRESS_DURATION=120 pytest -m stress ...
"""
import os
import pytest
from tests.stress.conftest import run_locust, STRESS_USERS, STRESS_SPAWN_RATE, STRESS_DURATION
# Pass/fail thresholds for the baseline test; override via environment.
MIN_RPS = int(os.getenv("STRESS_MIN_RPS", "150"))
MAX_P99_MS = int(os.getenv("STRESS_MAX_P99_MS", "10000"))
MAX_FAIL_RATE = float(os.getenv("STRESS_MAX_FAIL_RATE", "0.01"))  # 1% failures allowed
def _print_stats(env, label: str = "") -> None:
    """Print a compact stats summary table for a finished locust run.

    Args:
        env: Locust ``Environment`` after a run; only ``env.stats`` is read
            (the ``total`` aggregate plus the per-endpoint ``entries`` mapping).
        label: Optional heading printed above the table.
    """
    total = env.stats.total
    num_reqs = total.num_requests
    num_fails = total.num_failures
    # Avoid ZeroDivisionError when the run produced no traffic at all.
    fail_pct = (num_fails / num_reqs * 100) if num_reqs else 0
    rps = total.total_rps
    print(f"\n{'=' * 70}")
    if label:
        print(f" {label}")
    print(f"{'=' * 70}")
    print(f" {'Metric':<30} {'Value':>15}")
    print(f" {'-' * 45}")
    print(f" {'Total requests':<30} {num_reqs:>15,}")
    print(f" {'Failures':<30} {num_fails:>15,} ({fail_pct:.2f}%)")
    print(f" {'RPS (total)':<30} {rps:>15.1f}")
    print(f" {'Avg latency (ms)':<30} {total.avg_response_time:>15.1f}")
    # Percentiles are passed as fractions (0.50 == p50); `or 0` turns the
    # None returned for empty stats into a printable number.
    print(f" {'p50 (ms)':<30} {total.get_response_time_percentile(0.50) or 0:>15.0f}")
    print(f" {'p95 (ms)':<30} {total.get_response_time_percentile(0.95) or 0:>15.0f}")
    print(f" {'p99 (ms)':<30} {total.get_response_time_percentile(0.99) or 0:>15.0f}")
    print(f" {'Min (ms)':<30} {total.min_response_time:>15.0f}")
    print(f" {'Max (ms)':<30} {total.max_response_time:>15.0f}")
    print(f"{'=' * 70}")
    # Per-endpoint breakdown, busiest endpoint first.
    print(f"\n {'Endpoint':<45} {'Reqs':>8} {'Fails':>8} {'Avg(ms)':>10} {'p99(ms)':>10}")
    print(f" {'-' * 81}")
    for entry in sorted(env.stats.entries.values(), key=lambda e: e.num_requests, reverse=True):
        p99 = entry.get_response_time_percentile(0.99) or 0
        print(
            f" {entry.method + ' ' + entry.name:<45} "
            f"{entry.num_requests:>8,} "
            f"{entry.num_failures:>8,} "
            f"{entry.avg_response_time:>10.1f} "
            f"{p99:>10.0f}"
        )
    print()
@pytest.mark.stress
def test_stress_rps_baseline(stress_server):
    """Baseline throughput check.

    Ramps up to STRESS_USERS users and sustains the load for
    STRESS_DURATION seconds, then asserts that:
      - total RPS reaches at least MIN_RPS
      - p99 latency stays at or below MAX_P99_MS
      - the failure rate stays at or below MAX_FAIL_RATE
    """
    environment = run_locust(
        host=stress_server,
        users=STRESS_USERS,
        spawn_rate=STRESS_SPAWN_RATE,
        duration=STRESS_DURATION,
    )
    _print_stats(environment, f"BASELINE: {STRESS_USERS} users, {STRESS_DURATION}s")

    stats = environment.stats.total
    assert stats.num_requests > 0, "No requests were made"

    rps = stats.total_rps
    fail_rate = stats.num_failures / stats.num_requests if stats.num_requests else 1.0
    p99 = stats.get_response_time_percentile(0.99) or 0

    assert rps >= MIN_RPS, f"RPS {rps:.1f} below minimum {MIN_RPS}"
    assert p99 <= MAX_P99_MS, f"p99 {p99:.0f}ms exceeds max {MAX_P99_MS}ms"
    assert fail_rate <= MAX_FAIL_RATE, f"Failure rate {fail_rate:.2%} exceeds max {MAX_FAIL_RATE:.2%}"
@pytest.mark.stress
def test_stress_spike(stress_server):
    """Thundering herd: ramp from 0 to STRESS_SPIKE_USERS users in ~5 seconds.

    Asserts: no widespread 5xx errors (failure rate < 2%).
    """
    spike_users = int(os.environ.get("STRESS_SPIKE_USERS", "1000"))
    # All users in ~5 seconds. max(1, ...) keeps the spawn rate valid when
    # STRESS_SPIKE_USERS is set below 5 — a spawn rate of 0 would never
    # start any users and the run would produce no traffic.
    spike_spawn = max(1, spike_users // 5)
    env = run_locust(
        host=stress_server,
        users=spike_users,
        spawn_rate=spike_spawn,
        duration=15,  # 5s ramp + 10s sustained
    )
    _print_stats(env, f"SPIKE: 0 -> {spike_users} users in 5s")
    total = env.stats.total
    num_reqs = total.num_requests
    assert num_reqs > 0, "No requests were made"
    fail_rate = total.num_failures / num_reqs
    assert fail_rate < 0.02, f"Spike failure rate {fail_rate:.2%} — server buckled under thundering herd"
@pytest.mark.stress
def test_stress_sustained(stress_server):
    """Sustained load in two phases; checks latency doesn't degrade >3x.

    Phase 1 is a 10s warm-up that establishes a baseline average latency;
    phase 2 runs the same user count for 30s, and the average latency of
    the two phases is compared.
    """
    sustained_users = int(os.environ.get("STRESS_SUSTAINED_USERS", "200"))

    def run_phase(duration):
        # Single locust run at full user count with an instant ramp.
        return run_locust(
            host=stress_server,
            users=sustained_users,
            spawn_rate=sustained_users,
            duration=duration,
        )

    # Phase 1: warm-up baseline
    warmup = run_phase(10)
    baseline_avg = warmup.stats.total.avg_response_time
    _print_stats(warmup, f"SUSTAINED warm-up: {sustained_users} users, 10s")

    # Phase 2: sustained
    main_run = run_phase(30)
    sustained_avg = main_run.stats.total.avg_response_time
    _print_stats(main_run, f"SUSTAINED main: {sustained_users} users, 30s")

    assert main_run.stats.total.num_requests > 0, "No requests during sustained phase"

    # A zero/negative baseline means no comparable measurement — skip the ratio.
    if baseline_avg <= 0:
        return
    degradation = sustained_avg / baseline_avg
    print(f"\n Latency degradation factor: {degradation:.2f}x (baseline {baseline_avg:.1f}ms -> sustained {sustained_avg:.1f}ms)")
    assert degradation < 3.0, (
        f"Latency degraded {degradation:.1f}x under sustained load "
        f"(baseline {baseline_avg:.1f}ms -> {sustained_avg:.1f}ms)"
    )