merge testing->tomerge/main #7
@@ -49,14 +49,19 @@ def _wait_for_server(url: str, timeout: float = 60.0) -> None:
|
|||||||
r = requests.get(url, timeout=2)
|
r = requests.get(url, timeout=2)
|
||||||
if r.status_code in (200, 401, 503):
|
if r.status_code in (200, 401, 503):
|
||||||
return
|
return
|
||||||
except requests.ConnectionError:
|
except requests.RequestException:
|
||||||
|
# ConnectionError / ReadTimeout / anything else transient — the
|
||||||
|
# server is either not up yet or too busy to respond in time.
|
||||||
pass
|
pass
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
raise TimeoutError(f"Server not ready at {url}")
|
raise TimeoutError(f"Server not ready at {url}")
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="function")
|
||||||
def stress_server():
|
def stress_server():
|
||||||
|
# Function-scoped: every stress test gets its own clean uvicorn. Sharing
|
||||||
|
# a server across baseline → spike → sustained left the later runs with
|
||||||
|
# a half-dead pool (0-request symptom). Cost is ~5s of startup per test.
|
||||||
"""Start a real uvicorn server for stress testing."""
|
"""Start a real uvicorn server for stress testing."""
|
||||||
port = _free_port()
|
port = _free_port()
|
||||||
env = {k: v for k, v in os.environ.items() if not k.startswith("DECNET_")}
|
env = {k: v for k, v in os.environ.items() if not k.startswith("DECNET_")}
|
||||||
@@ -107,7 +112,7 @@ def stress_server():
|
|||||||
proc.wait()
|
proc.wait()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture
|
||||||
def stress_token(stress_server):
|
def stress_token(stress_server):
|
||||||
"""Authenticate and return a valid admin JWT."""
|
"""Authenticate and return a valid admin JWT."""
|
||||||
url = stress_server
|
url = stress_server
|
||||||
@@ -213,7 +218,7 @@ def _parse_locust_csv(stats_csv: Path) -> _LocustEnv:
|
|||||||
return _LocustEnv(_Stats(total, entries))
|
return _LocustEnv(_Stats(total, entries))
|
||||||
|
|
||||||
|
|
||||||
def run_locust(host, users, spawn_rate, duration):
|
def run_locust(host, users, spawn_rate, duration, _retry=False):
|
||||||
"""Run Locust in a subprocess (fresh Python, clean gevent monkey-patch)
|
"""Run Locust in a subprocess (fresh Python, clean gevent monkey-patch)
|
||||||
and return a stats shim compatible with the tests.
|
and return a stats shim compatible with the tests.
|
||||||
"""
|
"""
|
||||||
@@ -269,11 +274,19 @@ def run_locust(host, users, spawn_rate, duration):
|
|||||||
)
|
)
|
||||||
|
|
||||||
result = _parse_locust_csv(Path(str(csv_prefix) + "_stats.csv"))
|
result = _parse_locust_csv(Path(str(csv_prefix) + "_stats.csv"))
|
||||||
|
if result.stats.total.num_requests == 0 and not _retry:
|
||||||
|
# Transient: server was mid-drain or connection storm RSTed before any
|
||||||
|
# request landed. Wait for the API to respond cleanly, then retry once
|
||||||
|
# before giving up.
|
||||||
|
try:
|
||||||
|
_wait_for_server(f"{host}/api/v1/health", timeout=15.0)
|
||||||
|
except TimeoutError:
|
||||||
|
pass
|
||||||
|
time.sleep(2)
|
||||||
|
return run_locust(host, users, spawn_rate, duration, _retry=True)
|
||||||
if result.stats.total.num_requests == 0:
|
if result.stats.total.num_requests == 0:
|
||||||
# Surface the locust output so we can see why (connection errors,
|
|
||||||
# on_start stalls, etc.) instead of a silent "no requests" assert.
|
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"locust produced 0 requests.\n"
|
f"locust produced 0 requests (after 1 retry).\n"
|
||||||
f"--- stdout ---\n{proc.stdout.decode(errors='replace')}\n"
|
f"--- stdout ---\n{proc.stdout.decode(errors='replace')}\n"
|
||||||
f"--- stderr ---\n{proc.stderr.decode(errors='replace')}"
|
f"--- stderr ---\n{proc.stderr.decode(errors='replace')}"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -123,21 +123,32 @@ def test_stress_sustained(stress_server):
|
|||||||
"""
|
"""
|
||||||
sustained_users = int(os.environ.get("STRESS_SUSTAINED_USERS", "200"))
|
sustained_users = int(os.environ.get("STRESS_SUSTAINED_USERS", "200"))
|
||||||
|
|
||||||
|
# Cap spawn rate at 100/s — locust itself warns above that and has been
|
||||||
|
# observed to record 0 requests when the spawn storm collides with a
|
||||||
|
# still-draining uvicorn from a prior phase.
|
||||||
|
ramp = min(sustained_users, 100)
|
||||||
|
|
||||||
# Phase 1: warm-up baseline
|
# Phase 1: warm-up baseline
|
||||||
env_warmup = run_locust(
|
env_warmup = run_locust(
|
||||||
host=stress_server,
|
host=stress_server,
|
||||||
users=sustained_users,
|
users=sustained_users,
|
||||||
spawn_rate=sustained_users, # instant ramp
|
spawn_rate=ramp,
|
||||||
duration=10,
|
duration=10,
|
||||||
)
|
)
|
||||||
baseline_avg = env_warmup.stats.total.avg_response_time
|
baseline_avg = env_warmup.stats.total.avg_response_time
|
||||||
_print_stats(env_warmup, f"SUSTAINED warm-up: {sustained_users} users, 10s")
|
_print_stats(env_warmup, f"SUSTAINED warm-up: {sustained_users} users, 10s")
|
||||||
|
|
||||||
|
# Let the server drain pending work before firing the second locust run;
|
||||||
|
# otherwise the first request in phase 2 can sit behind a queued backlog
|
||||||
|
# and the 30s window can finish with 0 recorded requests.
|
||||||
|
import time as _t
|
||||||
|
_t.sleep(5)
|
||||||
|
|
||||||
# Phase 2: sustained
|
# Phase 2: sustained
|
||||||
env_sustained = run_locust(
|
env_sustained = run_locust(
|
||||||
host=stress_server,
|
host=stress_server,
|
||||||
users=sustained_users,
|
users=sustained_users,
|
||||||
spawn_rate=sustained_users,
|
spawn_rate=ramp,
|
||||||
duration=30,
|
duration=30,
|
||||||
)
|
)
|
||||||
sustained_avg = env_sustained.stats.total.avg_response_time
|
sustained_avg = env_sustained.stats.total.avg_response_time
|
||||||
|
|||||||
Reference in New Issue
Block a user