feat(api): opaque 500 handler + error_id correlation for unhandled exceptions

Registers a generic @app.exception_handler(Exception) that catches anything
uncaught in route handlers / dependencies. Prod response is opaque:
{detail: 'Internal Server Error', error_id: <uuid4 hex>}. Dev mode
(DECNET_DEVELOPER=True) adds exception_type and traceback fields so
failures are debuggable without tailing server logs.

The error_id is logged alongside the full traceback server-side, letting
operators correlate a user's 500 report with the exact exception via
`grep <error_id> /var/log/decnet.log`.

FastAPI's own HTTPException routing and the existing
RequestValidationError / ValidationError / RateLimitExceeded handlers
still take precedence — this handler only fires on genuinely-uncaught
exceptions.

Flips threat model F1/I 'traceback / stack trace leakage' from ? to M
and logs a follow-up checklist entry for 4 detail=str(e) sites in the
fleet deploy router (admin-gated, different threat class, separate
audit).
This commit is contained in:
2026-04-23 14:07:32 -04:00
parent 2f4f81e5de
commit ef4179ea1f
3 changed files with 161 additions and 2 deletions

View File

@@ -1,5 +1,7 @@
import asyncio
import os
import traceback
import uuid
from contextlib import asynccontextmanager
from typing import Any, AsyncGenerator, Optional
@@ -297,3 +299,29 @@ async def pydantic_validation_exception_handler(request: Request, exc: Validatio
"type": "internal_validation_error"
},
)
@app.exception_handler(Exception)
async def unhandled_exception_handler(request: Request, exc: Exception) -> ORJSONResponse:
"""Catch-all for uncaught exceptions in route handlers and dependencies.
Prod: opaque 500 with an ``error_id``; full traceback goes ONLY to server
logs. Dev (``DECNET_DEVELOPER=True``): same response plus ``exception_type``
and ``traceback`` fields so failures are debuggable without tailing logs.
The ``error_id`` lets operators correlate a user's 500 report with the full
traceback in server logs (``grep <error_id> /var/log/decnet.log``).
FastAPI's own ``HTTPException`` routing still takes precedence — this
handler only fires on genuinely-uncaught exceptions.
"""
error_id = uuid.uuid4().hex
log.exception(
"unhandled exception on %s %s [error_id=%s]",
request.method, request.url.path, error_id,
)
body: dict[str, Any] = {"detail": "Internal Server Error", "error_id": error_id}
if DECNET_DEVELOPER:
body["exception_type"] = type(exc).__name__
body["traceback"] = traceback.format_exc()
return ORJSONResponse(status_code=500, content=body)

View File

@@ -185,7 +185,7 @@ Each sub-flow below gets its own table. Status codes:
| T | Password hash tampering in DB | T | DB integrity is OS/filesystem scope. See boundary #2 for syslog-path tampering. |
| R | User denies having performed an action | M | Every mutation logged with actor UUID; audit trail lives in `logs` table. |
| I | Password reflected in login response on failure | M | Single uniform 401 for user-not-found and bad-password at `api_login.py`. No user-existence oracle. |
| I | JWT secret leaked via error message / stack trace | **?** | Verify: production error handler doesn't return tracebacks. |
| I | JWT secret leaked via error message / stack trace | M | Generic `@app.exception_handler(Exception)` at `decnet/web/api.py` returns opaque `{detail, error_id}` on uncaught exceptions; traceback is logged server-side only. Dev-mode (`DECNET_DEVELOPER=True`) includes traceback in body for debugging. |
| D | Bcrypt-cost DoS via long password submission | M | Pydantic `max_length=72` on all password fields in `decnet/web/db/models/auth.py` (matches bcrypt's internal truncation limit). |
| E | `role=None` bypass (historical bug) | M | See memory `project_rbac_null_role.md`; fixed via centralized RBAC that treats `None` as unauthenticated. |
@@ -289,7 +289,8 @@ code" or "accepted, add to table above."
- [x] ~~Per-IP / per-user rate limit on `/auth/login`.~~ Shipped — see F1/S row.
- [x] ~~Uniform "invalid credentials" on login failure (no user-existence oracle).~~ Verified — see F1/I row.
- [ ] Production error handler suppresses tracebacks and internal details.
- [x] ~~Production error handler suppresses tracebacks and internal details.~~ Shipped — generic `@app.exception_handler(Exception)` in `decnet/web/api.py`; opaque `{detail, error_id}` in prod, traceback only under `DECNET_DEVELOPER=True`.
- [ ] `detail=str(e)` / `detail=f"…{e}"` sites in `decnet/web/router/fleet/api_deploy_deckies.py:41,67,83,155` — admin-gated, arguably intentional INI-parser UX, but worth auditing: preserve deliberate messages, sanitize any bubble-up from lower layers.
- [x] ~~Password length clamp before bcrypt.~~ Verified — Pydantic `max_length=72`.
- [ ] Contract test asserting every protected route returns 401 unauthenticated and 403 for under-roled.
- [ ] Field allow-list on viewer responses for attacker / user / bounty serializers.
@@ -357,3 +358,4 @@ In priority order:
|------|--------|--------|
| 2026-04-23 | Initial scaffold. System context + Dashboard↔API as first worked component. | ANTI |
| 2026-04-23 | F1 Authn: 3 threats moved from **?** to **M** (rate limit shipped; uniform 401 verified; bcrypt length clamp verified). Added DA-08 accepted risk: reverse-proxy per-IP bucket collapse. | ANTI |
| 2026-04-23 | F1/I "traceback / stack trace leakage" moved from **?** to **M** via generic Exception handler with `error_id` correlation. Added follow-up checklist entry for `detail=str(e)` sites in fleet deploy router. | ANTI |

View File

@@ -0,0 +1,129 @@
"""Tests for the generic Exception handler at decnet/web/api.py.
Mitigation target: threat model F1/I — "Production error handler suppresses
tracebacks and internal details." Verifies that uncaught exceptions return
an opaque 500 with a correlation id in prod, and include debug detail only
when DECNET_DEVELOPER is on.
"""
from __future__ import annotations
import logging
import re
from typing import AsyncGenerator
import httpx
import pytest
from decnet.env import DECNET_ADMIN_USER, DECNET_ADMIN_PASSWORD
from decnet.web.api import app
from decnet.web.dependencies import repo
@pytest.fixture
async def client() -> AsyncGenerator[httpx.AsyncClient, None]:
"""Override the shared client fixture to NOT re-raise app exceptions.
The default `httpx.ASGITransport` re-raises any uncaught exception
from the app — which defeats the whole point of testing our
generic exception handler. With `raise_app_exceptions=False`, the
transport instead returns the HTTP response our handler built.
"""
transport = httpx.ASGITransport(app=app, raise_app_exceptions=False)
async with httpx.AsyncClient(transport=transport, base_url="http://test") as ac:
yield ac
async def _admin_headers(client: httpx.AsyncClient) -> dict[str, str]:
resp = await client.post(
"/api/v1/auth/login",
json={"username": DECNET_ADMIN_USER, "password": DECNET_ADMIN_PASSWORD},
)
token = resp.json()["access_token"]
# Clear must_change_password so the token passes mutation-gated endpoints.
await client.post(
"/api/v1/auth/change-password",
json={"old_password": DECNET_ADMIN_PASSWORD, "new_password": DECNET_ADMIN_PASSWORD},
headers={"Authorization": f"Bearer {token}"},
)
resp2 = await client.post(
"/api/v1/auth/login",
json={"username": DECNET_ADMIN_USER, "password": DECNET_ADMIN_PASSWORD},
)
return {"Authorization": f"Bearer {resp2.json()['access_token']}"}
def _raise_boom(*_a, **_kw):
raise RuntimeError("boom")
@pytest.mark.anyio
async def test_unhandled_exception_prod_shape_is_opaque(
client: httpx.AsyncClient,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Prod mode (DECNET_DEVELOPER=False): 500 with opaque body + error_id.
Must NOT include traceback or exception_type."""
import decnet.web.api as _api
monkeypatch.setattr(_api, "DECNET_DEVELOPER", False)
headers = await _admin_headers(client)
monkeypatch.setattr(repo, "get_attacker_by_uuid", _raise_boom)
r = await client.get("/api/v1/attackers/any-uuid", headers=headers)
assert r.status_code == 500
body = r.json()
assert body.get("detail") == "Internal Server Error"
assert "error_id" in body
assert re.fullmatch(r"[0-9a-f]{32}", body["error_id"]), body["error_id"]
assert "traceback" not in body
assert "exception_type" not in body
@pytest.mark.anyio
async def test_unhandled_exception_dev_shape_includes_traceback(
client: httpx.AsyncClient,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Dev mode (DECNET_DEVELOPER=True): body includes exception_type and
traceback so failures are debuggable without tailing server logs."""
import decnet.web.api as _api
monkeypatch.setattr(_api, "DECNET_DEVELOPER", True)
headers = await _admin_headers(client)
monkeypatch.setattr(repo, "get_attacker_by_uuid", _raise_boom)
r = await client.get("/api/v1/attackers/any-uuid", headers=headers)
assert r.status_code == 500
body = r.json()
assert body["detail"] == "Internal Server Error"
assert "error_id" in body
assert body["exception_type"] == "RuntimeError"
assert "RuntimeError" in body["traceback"]
assert "boom" in body["traceback"]
@pytest.mark.anyio
async def test_unhandled_exception_logs_error_id(
client: httpx.AsyncClient,
monkeypatch: pytest.MonkeyPatch,
caplog: pytest.LogCaptureFixture,
) -> None:
"""The same error_id returned to the client must appear in server logs,
so operators can correlate a user's 500 report with the full traceback."""
import decnet.web.api as _api
monkeypatch.setattr(_api, "DECNET_DEVELOPER", False)
headers = await _admin_headers(client)
monkeypatch.setattr(repo, "get_attacker_by_uuid", _raise_boom)
with caplog.at_level(logging.ERROR, logger="api"):
r = await client.get("/api/v1/attackers/any-uuid", headers=headers)
assert r.status_code == 500
error_id = r.json()["error_id"]
assert any(error_id in rec.getMessage() for rec in caplog.records), (
f"error_id {error_id} not found in captured logs: "
f"{[rec.getMessage() for rec in caplog.records]}"
)