perf: run bcrypt on a thread so it doesn't block the event loop

verify_password / get_password_hash are CPU-bound and take ~250ms each
at rounds=12. Called directly from async endpoints, they stall every
other coroutine for that window — the biggest single-worker
bottleneck on the login path.

Adds averify_password / ahash_password that wrap the sync versions in
asyncio.to_thread. Sync versions stay put because _ensure_admin_user and
tests still use them.

Call sites updated: login, change-password, create-user, and reset-password.
tests/test_auth_async.py asserts that two parallel averify calls run
concurrently (total wall time ≈ one verify, not two sequential verifies).
This commit is contained in:
2026-04-17 14:52:22 -04:00
parent bd406090a7
commit 3945e72e11
15 changed files with 724 additions and 42 deletions

0
tests/stress/__init__.py Normal file
View File

130
tests/stress/conftest.py Normal file
View File

@@ -0,0 +1,130 @@
"""
Stress-test fixtures: real uvicorn server + programmatic Locust runner.
"""
import multiprocessing
import os
import sys
import time
import socket
import signal
import subprocess
import pytest
import requests
# ---------------------------------------------------------------------------
# Configuration (env-var driven for CI flexibility)
# ---------------------------------------------------------------------------
# Load-shape knobs — every value overridable via environment for CI flexibility.
STRESS_USERS = int(os.environ.get("STRESS_USERS", "500"))           # peak concurrent simulated users
STRESS_SPAWN_RATE = int(os.environ.get("STRESS_SPAWN_RATE", "50"))  # users spawned per second
STRESS_DURATION = int(os.environ.get("STRESS_DURATION", "60"))      # sustained-run length, seconds
# Cap uvicorn workers at 4 so the test host isn't oversubscribed on big CI boxes.
STRESS_WORKERS = int(os.environ.get("STRESS_WORKERS", str(min(multiprocessing.cpu_count(), 4))))
# Credentials/secret injected into the server subprocess env by stress_server.
ADMIN_USER = "admin"
ADMIN_PASS = "test-password-123"
JWT_SECRET = "stable-test-secret-key-at-least-32-chars-long"
def _free_port() -> int:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("127.0.0.1", 0))
return s.getsockname()[1]
def _wait_for_server(url: str, timeout: float = 15.0) -> None:
    """Poll *url* until it responds, or raise TimeoutError after *timeout* s.

    A 200 or 503 both count as "up": 503 means the app is serving requests
    but reporting itself unhealthy, which is still good enough to start
    load against.

    Fix: catch requests.RequestException rather than only ConnectionError.
    The GET carries timeout=2, so a slow, still-booting server can raise
    requests.Timeout/ReadTimeout — previously that escaped the loop and
    aborted fixture setup instead of being retried.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            r = requests.get(url, timeout=2)
            if r.status_code in (200, 503):
                return
        except requests.RequestException:
            # Not up yet (refused, reset, or too slow) — keep polling.
            pass
        time.sleep(0.1)
    raise TimeoutError(f"Server not ready at {url}")
@pytest.fixture(scope="session")
def stress_server():
    """Start a real uvicorn server subprocess for stress testing.

    Yields the base URL (http://127.0.0.1:<port>) once the health endpoint
    answers; always terminates the subprocess on teardown, escalating to
    kill() if it ignores SIGTERM for 10s.

    Fix: the original passed stdout=PIPE and stderr=PIPE but nothing ever
    read those pipes. Once the server logged enough to fill the OS pipe
    buffer (~64KB) it would block on write and hang the whole session.
    Discard stdout; let stderr inherit so startup failures still surface
    in pytest's captured output.
    """
    port = _free_port()
    env = {
        **os.environ,
        "DECNET_JWT_SECRET": JWT_SECRET,
        "DECNET_ADMIN_PASSWORD": ADMIN_PASS,
        "DECNET_DEVELOPER": "true",
        "DECNET_DEVELOPER_TRACING": "false",
        "DECNET_DB_TYPE": "sqlite",
    }
    proc = subprocess.Popen(
        [
            sys.executable, "-m", "uvicorn",
            "decnet.web.api:app",
            "--host", "127.0.0.1",
            "--port", str(port),
            "--workers", str(STRESS_WORKERS),
            "--log-level", "warning",
        ],
        env=env,
        stdout=subprocess.DEVNULL,  # never drained — must not be a PIPE
        stderr=None,                # inherit: pytest captures server errors
    )
    base_url = f"http://127.0.0.1:{port}"
    try:
        _wait_for_server(f"{base_url}/api/v1/health")
        yield base_url
    finally:
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
@pytest.fixture(scope="session")
def stress_token(stress_server):
    """Authenticate and return a valid admin JWT."""
    base = stress_server
    login_url = f"{base}/api/v1/auth/login"
    creds = {"username": ADMIN_USER, "password": ADMIN_PASS}

    first = requests.post(login_url, json=creds)
    assert first.status_code == 200, f"Login failed: {first.text}"
    bootstrap = first.json()["access_token"]

    # A no-op password change clears the must_change_password flag on the
    # seeded admin account.
    requests.post(
        f"{base}/api/v1/auth/change-password",
        json={"old_password": ADMIN_PASS, "new_password": ADMIN_PASS},
        headers={"Authorization": f"Bearer {bootstrap}"},
    )

    # Log in again so the token we hand out reflects the cleared flag.
    second = requests.post(login_url, json=creds)
    return second.json()["access_token"]
def run_locust(host, users, spawn_rate, duration):
    """Run Locust programmatically and return the Environment with stats.

    Spawns a local (in-process) runner, ramps to *users* at *spawn_rate*
    users/sec, lets it run for *duration* seconds, then shuts the runner
    down and returns the Environment so callers can assert on env.stats.

    Fix: dropped the unused `stats_printer`, `stats_history`, and
    `StatsCSVFileWriter` imports — they were never referenced.

    Imports are function-local because gevent/locust must not be loaded
    at collection time in non-stress test runs.
    """
    import gevent
    from locust.env import Environment
    from tests.stress.locustfile import DecnetUser
    env = Environment(user_classes=[DecnetUser], host=host)
    env.create_local_runner()
    env.runner.start(users, spawn_rate=spawn_rate)
    # Let it run for the specified duration
    gevent.sleep(duration)
    env.runner.quit()
    # Bounded join so a wedged greenlet can't hang the test session.
    env.runner.greenlet.join(timeout=10)
    return env

130
tests/stress/locustfile.py Normal file
View File

@@ -0,0 +1,130 @@
"""
Locust user class for DECNET API stress testing.
Hammers every endpoint from the OpenAPI spec with realistic traffic weights.
Can be used standalone (`locust -f tests/stress/locustfile.py`) or
programmatically via the pytest fixtures in conftest.py.
"""
import os
import random
import time
from locust import HttpUser, task, between
# Credentials default to the dev admin account; override via env to match
# whatever the target server was seeded with.
ADMIN_USER = os.environ.get("DECNET_ADMIN_USER", "admin")
ADMIN_PASS = os.environ.get("DECNET_ADMIN_PASSWORD", "admin")
# Login retry policy used by DecnetUser._login_with_retry.
_MAX_LOGIN_RETRIES = 5
_LOGIN_BACKOFF_BASE = 0.5  # seconds, doubles each retry
class DecnetUser(HttpUser):
    """One simulated operator session hammering the DECNET API.

    Task weights mirror realistic traffic: reads dominate, auth and SSE
    trickle in at the bottom.
    """

    wait_time = between(0.01, 0.05)  # near-zero think time — max pressure

    def _login_with_retry(self):
        """Login with exponential backoff — handles connection storms."""
        attempt = 0
        resp = None
        while attempt < _MAX_LOGIN_RETRIES:
            resp = self.client.post(
                "/api/v1/auth/login",
                json={"username": ADMIN_USER, "password": ADMIN_PASS},
                name="/api/v1/auth/login [on_start]",
            )
            if resp.status_code == 200:
                return resp.json()["access_token"]
            # Status 0 = connection refused; 5xx = server overloaded.
            # Anything else is a hard failure we should not mask.
            if not (resp.status_code == 0 or resp.status_code >= 500):
                raise RuntimeError(f"Login failed (non-retryable): {resp.status_code} {resp.text}")
            time.sleep(_LOGIN_BACKOFF_BASE * (2 ** attempt))
            attempt += 1
        raise RuntimeError(f"Login failed after {_MAX_LOGIN_RETRIES} retries (last status: {resp.status_code})")

    def on_start(self):
        """Authenticate, clear must_change_password, keep a clean token."""
        bootstrap = self._login_with_retry()
        # No-op password change clears the must_change_password flag.
        self.client.post(
            "/api/v1/auth/change-password",
            json={"old_password": ADMIN_PASS, "new_password": ADMIN_PASS},
            headers={"Authorization": f"Bearer {bootstrap}"},
        )
        # Re-login so every task uses a token issued after the flag cleared.
        self.token = self._login_with_retry()
        self.client.headers.update({"Authorization": f"Bearer {self.token}"})

    # ----- read-hot paths (high weight) -------------------------------

    @task(10)
    def get_stats(self):
        self.client.get("/api/v1/stats")

    @task(8)
    def get_logs(self):
        self.client.get("/api/v1/logs", params={"limit": 50})

    @task(8)
    def get_attackers(self):
        self.client.get("/api/v1/attackers")

    @task(7)
    def get_deckies(self):
        self.client.get("/api/v1/deckies")

    @task(6)
    def get_bounties(self):
        self.client.get("/api/v1/bounty")

    @task(5)
    def get_logs_histogram(self):
        self.client.get("/api/v1/logs/histogram")

    @task(5)
    def search_logs(self):
        query = {"search": "ssh", "limit": 100}
        self.client.get("/api/v1/logs", params=query)

    @task(4)
    def search_attackers(self):
        query = {"search": "brute", "sort_by": "recent"}
        self.client.get("/api/v1/attackers", params=query)

    @task(4)
    def paginate_logs(self):
        page = {"limit": 100, "offset": random.randint(0, 1000)}
        self.client.get("/api/v1/logs", params=page)

    @task(3)
    def get_health(self):
        self.client.get("/api/v1/health")

    @task(3)
    def get_config(self):
        self.client.get("/api/v1/config")

    # ----- write / auth paths (low weight) ----------------------------

    @task(2)
    def login(self):
        self.client.post(
            "/api/v1/auth/login",
            json={"username": ADMIN_USER, "password": ADMIN_PASS},
        )

    @task(1)
    def stream_sse(self):
        """Short-lived SSE connection — read a few bytes then close."""
        with self.client.get(
            "/api/v1/stream",
            params={"maxOutput": 3},
            stream=True,
            catch_response=True,
            name="/api/v1/stream",
        ) as resp:
            if resp.status_code != 200:
                resp.failure(f"SSE returned {resp.status_code}")
                return
            # Read up to 4KB then bail — we're stress-testing connection
            # setup, not stream throughput.
            for _chunk in resp.iter_content(chunk_size=1024):
                break
            resp.success()

154
tests/stress/test_stress.py Normal file
View File

@@ -0,0 +1,154 @@
"""
Locust-based stress tests for the DECNET API.
Run: pytest -m stress tests/stress/ -v -x -n0
Tune: STRESS_USERS=2000 STRESS_SPAWN_RATE=200 STRESS_DURATION=120 pytest -m stress ...
"""
import os
import pytest
from tests.stress.conftest import run_locust, STRESS_USERS, STRESS_SPAWN_RATE, STRESS_DURATION
# Assertion thresholds (overridable via env)
MIN_RPS = int(os.environ.get("STRESS_MIN_RPS", "500"))        # minimum acceptable total RPS
MAX_P99_MS = int(os.environ.get("STRESS_MAX_P99_MS", "200"))  # p99 latency ceiling, milliseconds
MAX_FAIL_RATE = float(os.environ.get("STRESS_MAX_FAIL_RATE", "0.01"))  # 1%
def _print_stats(env, label=""):
    """Print a compact stats summary table.

    Emits an overall summary (request counts, failure %, RPS, latency
    percentiles) followed by a per-endpoint breakdown sorted by request
    volume, all to stdout so pytest -s / failure output shows it.

    Args:
        env: Locust Environment whose run has finished; read via env.stats.
        label: Optional heading printed at the top of the table.
    """
    total = env.stats.total
    num_reqs = total.num_requests
    num_fails = total.num_failures
    # Guard the division — a run that made zero requests reports 0%.
    fail_pct = (num_fails / num_reqs * 100) if num_reqs else 0
    rps = total.total_rps
    print(f"\n{'=' * 70}")
    if label:
        print(f" {label}")
    print(f"{'=' * 70}")
    print(f" {'Metric':<30} {'Value':>15}")
    print(f" {'-' * 45}")
    print(f" {'Total requests':<30} {num_reqs:>15,}")
    print(f" {'Failures':<30} {num_fails:>15,} ({fail_pct:.2f}%)")
    print(f" {'RPS (total)':<30} {rps:>15.1f}")
    print(f" {'Avg latency (ms)':<30} {total.avg_response_time:>15.1f}")
    # get_response_time_percentile can yield a falsy/None result with no
    # data; `or 0` keeps the numeric format spec from blowing up.
    print(f" {'p50 (ms)':<30} {total.get_response_time_percentile(0.50) or 0:>15.0f}")
    print(f" {'p95 (ms)':<30} {total.get_response_time_percentile(0.95) or 0:>15.0f}")
    print(f" {'p99 (ms)':<30} {total.get_response_time_percentile(0.99) or 0:>15.0f}")
    print(f" {'Min (ms)':<30} {total.min_response_time:>15.0f}")
    print(f" {'Max (ms)':<30} {total.max_response_time:>15.0f}")
    print(f"{'=' * 70}")
    # Per-endpoint breakdown — busiest endpoints first.
    print(f"\n {'Endpoint':<45} {'Reqs':>8} {'Fails':>8} {'Avg(ms)':>10} {'p99(ms)':>10}")
    print(f" {'-' * 81}")
    for entry in sorted(env.stats.entries.values(), key=lambda e: e.num_requests, reverse=True):
        p99 = entry.get_response_time_percentile(0.99) or 0
        print(
            f" {entry.method + ' ' + entry.name:<45} "
            f"{entry.num_requests:>8,} "
            f"{entry.num_failures:>8,} "
            f"{entry.avg_response_time:>10.1f} "
            f"{p99:>10.0f}"
        )
    print()
@pytest.mark.stress
def test_stress_rps_baseline(stress_server):
    """Baseline throughput: ramp to STRESS_USERS users, sustain for STRESS_DURATION seconds.

    Asserts:
        - RPS exceeds MIN_RPS
        - p99 latency < MAX_P99_MS
        - Failure rate < MAX_FAIL_RATE
    """
    env = run_locust(
        host=stress_server,
        users=STRESS_USERS,
        spawn_rate=STRESS_SPAWN_RATE,
        duration=STRESS_DURATION,
    )
    _print_stats(env, f"BASELINE: {STRESS_USERS} users, {STRESS_DURATION}s")

    totals = env.stats.total
    assert totals.num_requests > 0, "No requests were made"

    observed_rps = totals.total_rps
    observed_p99 = totals.get_response_time_percentile(0.99) or 0
    # Safe to divide: the zero-request case was rejected above.
    observed_fail_rate = totals.num_failures / totals.num_requests

    assert observed_rps >= MIN_RPS, f"RPS {observed_rps:.1f} below minimum {MIN_RPS}"
    assert observed_p99 <= MAX_P99_MS, f"p99 {observed_p99:.0f}ms exceeds max {MAX_P99_MS}ms"
    assert observed_fail_rate <= MAX_FAIL_RATE, f"Failure rate {observed_fail_rate:.2%} exceeds max {MAX_FAIL_RATE:.2%}"
@pytest.mark.stress
def test_stress_spike(stress_server):
    """Thundering herd: ramp from 0 to 1000 users in 5 seconds.

    Asserts: no 5xx errors (failure rate < 2%).
    """
    herd_size = int(os.environ.get("STRESS_SPIKE_USERS", "1000"))
    ramp_rate = herd_size // 5  # all users in ~5 seconds
    env = run_locust(
        host=stress_server,
        users=herd_size,
        spawn_rate=ramp_rate,
        duration=15,  # 5s ramp + 10s sustained
    )
    _print_stats(env, f"SPIKE: 0 -> {herd_size} users in 5s")

    totals = env.stats.total
    assert totals.num_requests > 0, "No requests were made"
    observed_fail_rate = totals.num_failures / totals.num_requests
    assert observed_fail_rate < 0.02, f"Spike failure rate {observed_fail_rate:.2%} — server buckled under thundering herd"
@pytest.mark.stress
def test_stress_sustained(stress_server):
    """Sustained load: 200 users for 30s. Checks latency doesn't degrade >3x.

    Runs two phases:
        1. Warm-up (10s) to get baseline latency
        2. Sustained (30s) to check for degradation
    """
    n_users = int(os.environ.get("STRESS_SUSTAINED_USERS", "200"))

    # Phase 1: warm-up — establishes the reference average latency.
    warmup_env = run_locust(
        host=stress_server,
        users=n_users,
        spawn_rate=n_users,  # instant ramp
        duration=10,
    )
    baseline_avg = warmup_env.stats.total.avg_response_time
    _print_stats(warmup_env, f"SUSTAINED warm-up: {n_users} users, 10s")

    # Phase 2: sustained — the run we actually judge.
    main_env = run_locust(
        host=stress_server,
        users=n_users,
        spawn_rate=n_users,
        duration=30,
    )
    sustained_avg = main_env.stats.total.avg_response_time
    _print_stats(main_env, f"SUSTAINED main: {n_users} users, 30s")

    assert main_env.stats.total.num_requests > 0, "No requests during sustained phase"

    # A zero baseline would make the ratio meaningless — skip the check.
    if baseline_avg > 0:
        degradation = sustained_avg / baseline_avg
        print(f"\n Latency degradation factor: {degradation:.2f}x (baseline {baseline_avg:.1f}ms -> sustained {sustained_avg:.1f}ms)")
        assert degradation < 3.0, (
            f"Latency degraded {degradation:.1f}x under sustained load "
            f"(baseline {baseline_avg:.1f}ms -> {sustained_avg:.1f}ms)"
        )