"""Locust-based stress tests for the DECNET API.

Run: pytest -m stress tests/stress/ -v -x -n0

Tune: STRESS_USERS=2000 STRESS_SPAWN_RATE=200 STRESS_DURATION=120 pytest -m stress ...
"""
import os
import time

import pytest

from tests.stress.conftest import (
    STRESS_DURATION,
    STRESS_SPAWN_RATE,
    STRESS_USERS,
    run_locust,
)
# Pass/fail thresholds for the assertions below; each one can be overridden
# through the environment without touching the code.
MIN_RPS = int(os.getenv("STRESS_MIN_RPS", "150"))
MAX_P99_MS = int(os.getenv("STRESS_MAX_P99_MS", "10000"))
MAX_FAIL_RATE = float(os.getenv("STRESS_MAX_FAIL_RATE", "0.01"))  # i.e. 1% of requests
def _print_stats(env, label=""):
|
|
"""Print a compact stats summary table."""
|
|
total = env.stats.total
|
|
num_reqs = total.num_requests
|
|
num_fails = total.num_failures
|
|
fail_pct = (num_fails / num_reqs * 100) if num_reqs else 0
|
|
rps = total.total_rps
|
|
|
|
print(f"\n{'=' * 70}")
|
|
if label:
|
|
print(f" {label}")
|
|
print(f"{'=' * 70}")
|
|
print(f" {'Metric':<30} {'Value':>15}")
|
|
print(f" {'-' * 45}")
|
|
print(f" {'Total requests':<30} {num_reqs:>15,}")
|
|
print(f" {'Failures':<30} {num_fails:>15,} ({fail_pct:.2f}%)")
|
|
print(f" {'RPS (total)':<30} {rps:>15.1f}")
|
|
print(f" {'Avg latency (ms)':<30} {total.avg_response_time:>15.1f}")
|
|
print(f" {'p50 (ms)':<30} {total.get_response_time_percentile(0.50) or 0:>15.0f}")
|
|
print(f" {'p95 (ms)':<30} {total.get_response_time_percentile(0.95) or 0:>15.0f}")
|
|
print(f" {'p99 (ms)':<30} {total.get_response_time_percentile(0.99) or 0:>15.0f}")
|
|
print(f" {'Min (ms)':<30} {total.min_response_time:>15.0f}")
|
|
print(f" {'Max (ms)':<30} {total.max_response_time:>15.0f}")
|
|
print(f"{'=' * 70}")
|
|
|
|
# Per-endpoint breakdown
|
|
print(f"\n {'Endpoint':<45} {'Reqs':>8} {'Fails':>8} {'Avg(ms)':>10} {'p99(ms)':>10}")
|
|
print(f" {'-' * 81}")
|
|
for entry in sorted(env.stats.entries.values(), key=lambda e: e.num_requests, reverse=True):
|
|
p99 = entry.get_response_time_percentile(0.99) or 0
|
|
print(
|
|
f" {entry.method + ' ' + entry.name:<45} "
|
|
f"{entry.num_requests:>8,} "
|
|
f"{entry.num_failures:>8,} "
|
|
f"{entry.avg_response_time:>10.1f} "
|
|
f"{p99:>10.0f}"
|
|
)
|
|
print()
|
|
|
|
|
|
@pytest.mark.stress
def test_stress_rps_baseline(stress_server):
    """Baseline throughput check.

    Ramps to STRESS_USERS users and holds the load for STRESS_DURATION
    seconds, then asserts the aggregate results:
    - throughput is at least MIN_RPS
    - p99 latency stays within MAX_P99_MS
    - the failure rate stays within MAX_FAIL_RATE
    """
    env = run_locust(
        host=stress_server,
        users=STRESS_USERS,
        spawn_rate=STRESS_SPAWN_RATE,
        duration=STRESS_DURATION,
    )
    _print_stats(env, f"BASELINE: {STRESS_USERS} users, {STRESS_DURATION}s")

    stats = env.stats.total
    request_count = stats.num_requests
    assert request_count > 0, "No requests were made"

    throughput = stats.total_rps
    failure_fraction = stats.num_failures / request_count if request_count else 1.0
    latency_p99 = stats.get_response_time_percentile(0.99) or 0

    assert throughput >= MIN_RPS, f"RPS {throughput:.1f} below minimum {MIN_RPS}"
    assert latency_p99 <= MAX_P99_MS, (
        f"p99 {latency_p99:.0f}ms exceeds max {MAX_P99_MS}ms"
    )
    assert failure_fraction <= MAX_FAIL_RATE, (
        f"Failure rate {failure_fraction:.2%} exceeds max {MAX_FAIL_RATE:.2%}"
    )
@pytest.mark.stress
def test_stress_spike(stress_server):
    """Thundering-herd scenario: every user arrives within roughly 5 seconds.

    Ramps from 0 to STRESS_SPIKE_USERS (default 1000) in ~5s, then holds
    for another 10s. Asserts the failure rate stays below 2%.
    """
    herd_size = int(os.environ.get("STRESS_SPIKE_USERS", "1000"))
    ramp_rate = herd_size // 5  # all users in ~5 seconds

    env = run_locust(
        host=stress_server,
        users=herd_size,
        spawn_rate=ramp_rate,
        duration=15,  # 5s ramp + 10s sustained
    )
    _print_stats(env, f"SPIKE: 0 -> {herd_size} users in 5s")

    totals = env.stats.total
    assert totals.num_requests > 0, "No requests were made"

    failure_fraction = totals.num_failures / totals.num_requests
    assert failure_fraction < 0.02, (
        f"Spike failure rate {failure_fraction:.2%} — server buckled under thundering herd"
    )
@pytest.mark.stress
def test_stress_sustained(stress_server):
    """Sustained load: STRESS_SUSTAINED_USERS users (default 200) for 30s.

    Runs two phases:
    1. Warm-up (10s) to establish a baseline average latency.
    2. Sustained (30s) to verify average latency does not degrade more
       than 3x relative to the warm-up baseline.
    """
    sustained_users = int(os.environ.get("STRESS_SUSTAINED_USERS", "200"))

    # Cap spawn rate at 100/s — locust itself warns above that and has been
    # observed to record 0 requests when the spawn storm collides with a
    # still-draining uvicorn from a prior phase.
    ramp = min(sustained_users, 100)

    # Phase 1: warm-up baseline
    env_warmup = run_locust(
        host=stress_server,
        users=sustained_users,
        spawn_rate=ramp,
        duration=10,
    )
    baseline_avg = env_warmup.stats.total.avg_response_time
    _print_stats(env_warmup, f"SUSTAINED warm-up: {sustained_users} users, 10s")

    # Let the server drain pending work before firing the second locust run;
    # otherwise the first request in phase 2 can sit behind a queued backlog
    # and the 30s window can finish with 0 recorded requests.
    time.sleep(5)

    # Phase 2: sustained
    env_sustained = run_locust(
        host=stress_server,
        users=sustained_users,
        spawn_rate=ramp,
        duration=30,
    )
    sustained_avg = env_sustained.stats.total.avg_response_time
    _print_stats(env_sustained, f"SUSTAINED main: {sustained_users} users, 30s")

    assert env_sustained.stats.total.num_requests > 0, "No requests during sustained phase"

    # A warm-up window that recorded zero requests leaves baseline_avg at 0;
    # skip the ratio check in that case rather than dividing by zero.
    if baseline_avg > 0:
        degradation = sustained_avg / baseline_avg
        print(
            f"\n Latency degradation factor: {degradation:.2f}x "
            f"(baseline {baseline_avg:.1f}ms -> sustained {sustained_avg:.1f}ms)"
        )
        assert degradation < 3.0, (
            f"Latency degraded {degradation:.1f}x under sustained load "
            f"(baseline {baseline_avg:.1f}ms -> {sustained_avg:.1f}ms)"
        )