From da3e675f863f77f808ad786895e71fe55f197ff7 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 20 Apr 2026 01:26:56 -0400 Subject: [PATCH] fix(tests): fix locust fixtures and cap ramp-up spawn rates, since locust does not handle spawn rates above 100/s well --- tests/stress/conftest.py | 27 ++++++++++++++++++++------- tests/stress/test_stress.py | 15 +++++++++++++-- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/tests/stress/conftest.py b/tests/stress/conftest.py index 8efb24f..7f552ee 100644 --- a/tests/stress/conftest.py +++ b/tests/stress/conftest.py @@ -49,14 +49,19 @@ def _wait_for_server(url: str, timeout: float = 60.0) -> None: r = requests.get(url, timeout=2) if r.status_code in (200, 401, 503): return - except requests.ConnectionError: + except requests.RequestException: + # ConnectionError / ReadTimeout / anything else transient — the + # server is either not up yet or too busy to respond in time. pass time.sleep(0.1) raise TimeoutError(f"Server not ready at {url}") -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def stress_server(): + # Function-scoped: every stress test gets its own clean uvicorn. Sharing + # a server across baseline → spike → sustained left the later runs with + # a half-dead pool (0-request symptom). Cost is ~5s of startup per test. 
"""Start a real uvicorn server for stress testing.""" port = _free_port() env = {k: v for k, v in os.environ.items() if not k.startswith("DECNET_")} @@ -107,7 +112,7 @@ def stress_server(): proc.wait() -@pytest.fixture(scope="session") +@pytest.fixture def stress_token(stress_server): """Authenticate and return a valid admin JWT.""" url = stress_server @@ -213,7 +218,7 @@ def _parse_locust_csv(stats_csv: Path) -> _LocustEnv: return _LocustEnv(_Stats(total, entries)) -def run_locust(host, users, spawn_rate, duration): +def run_locust(host, users, spawn_rate, duration, _retry=False): """Run Locust in a subprocess (fresh Python, clean gevent monkey-patch) and return a stats shim compatible with the tests. """ @@ -269,11 +274,19 @@ def run_locust(host, users, spawn_rate, duration): ) result = _parse_locust_csv(Path(str(csv_prefix) + "_stats.csv")) + if result.stats.total.num_requests == 0 and not _retry: + # Transient: server was mid-drain or connection storm RSTed before any + # request landed. Wait for the API to respond cleanly, then retry once + # before giving up. + try: + _wait_for_server(f"{host}/api/v1/health", timeout=15.0) + except TimeoutError: + pass + time.sleep(2) + return run_locust(host, users, spawn_rate, duration, _retry=True) if result.stats.total.num_requests == 0: - # Surface the locust output so we can see why (connection errors, - # on_start stalls, etc.) instead of a silent "no requests" assert. 
raise RuntimeError( - f"locust produced 0 requests.\n" + f"locust produced 0 requests (after 1 retry).\n" f"--- stdout ---\n{proc.stdout.decode(errors='replace')}\n" f"--- stderr ---\n{proc.stderr.decode(errors='replace')}" ) diff --git a/tests/stress/test_stress.py b/tests/stress/test_stress.py index 8abe695..5f3345b 100644 --- a/tests/stress/test_stress.py +++ b/tests/stress/test_stress.py @@ -123,21 +123,32 @@ def test_stress_sustained(stress_server): """ sustained_users = int(os.environ.get("STRESS_SUSTAINED_USERS", "200")) + # Cap spawn rate at 100/s — locust itself warns above that and has been + # observed to record 0 requests when the spawn storm collides with a + # still-draining uvicorn from a prior phase. + ramp = min(sustained_users, 100) + # Phase 1: warm-up baseline env_warmup = run_locust( host=stress_server, users=sustained_users, - spawn_rate=sustained_users, # instant ramp + spawn_rate=ramp, duration=10, ) baseline_avg = env_warmup.stats.total.avg_response_time _print_stats(env_warmup, f"SUSTAINED warm-up: {sustained_users} users, 10s") + # Let the server drain pending work before firing the second locust run; + # otherwise the first request in phase 2 can sit behind a queued backlog + # and the 30s window can finish with 0 recorded requests. + import time as _t + _t.sleep(5) + # Phase 2: sustained env_sustained = run_locust( host=stress_server, users=sustained_users, - spawn_rate=sustained_users, + spawn_rate=ramp, duration=30, ) sustained_avg = env_sustained.stats.total.avg_response_time