feat(api): opaque 500 handler + error_id correlation for unhandled exceptions

Registers a generic @app.exception_handler(Exception) that catches anything
uncaught in route handlers / dependencies. Prod response is opaque:
{detail: 'Internal Server Error', error_id: <uuid4 hex>}. Dev mode
(DECNET_DEVELOPER=True) adds exception_type and traceback fields so
failures are debuggable without tailing server logs.

The error_id is logged alongside the full traceback server-side, letting
operators correlate a user's 500 report with the exact exception via
`grep <error_id> /var/log/decnet.log`.

FastAPI's own HTTPException routing and the existing
RequestValidationError / ValidationError / RateLimitExceeded handlers
still take precedence — this handler only fires on genuinely-uncaught
exceptions.

Flips threat model F1/I 'traceback / stack trace leakage' from ? to M
and logs a follow-up checklist entry for 4 detail=str(e) sites in the
fleet deploy router (admin-gated, different threat class, separate
audit).
This commit is contained in:
2026-04-23 14:07:32 -04:00
parent 2f4f81e5de
commit ef4179ea1f
3 changed files with 161 additions and 2 deletions

View File

@@ -1,5 +1,7 @@
import asyncio import asyncio
import os import os
import traceback
import uuid
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from typing import Any, AsyncGenerator, Optional from typing import Any, AsyncGenerator, Optional
@@ -297,3 +299,29 @@ async def pydantic_validation_exception_handler(request: Request, exc: Validatio
"type": "internal_validation_error" "type": "internal_validation_error"
}, },
) )
@app.exception_handler(Exception)
async def unhandled_exception_handler(request: Request, exc: Exception) -> ORJSONResponse:
"""Catch-all for uncaught exceptions in route handlers and dependencies.
Prod: opaque 500 with an ``error_id``; full traceback goes ONLY to server
logs. Dev (``DECNET_DEVELOPER=True``): same response plus ``exception_type``
and ``traceback`` fields so failures are debuggable without tailing logs.
The ``error_id`` lets operators correlate a user's 500 report with the full
traceback in server logs (``grep <error_id> /var/log/decnet.log``).
FastAPI's own ``HTTPException`` routing still takes precedence — this
handler only fires on genuinely-uncaught exceptions.
"""
error_id = uuid.uuid4().hex
log.exception(
"unhandled exception on %s %s [error_id=%s]",
request.method, request.url.path, error_id,
)
body: dict[str, Any] = {"detail": "Internal Server Error", "error_id": error_id}
if DECNET_DEVELOPER:
body["exception_type"] = type(exc).__name__
body["traceback"] = traceback.format_exc()
return ORJSONResponse(status_code=500, content=body)

View File

@@ -185,7 +185,7 @@ Each sub-flow below gets its own table. Status codes:
| T | Password hash tampering in DB | T | DB integrity is OS/filesystem scope. See boundary #2 for syslog-path tampering. | | T | Password hash tampering in DB | T | DB integrity is OS/filesystem scope. See boundary #2 for syslog-path tampering. |
| R | User denies having performed an action | M | Every mutation logged with actor UUID; audit trail lives in `logs` table. | | R | User denies having performed an action | M | Every mutation logged with actor UUID; audit trail lives in `logs` table. |
| I | Password reflected in login response on failure | M | Single uniform 401 for user-not-found and bad-password at `api_login.py`. No user-existence oracle. | | I | Password reflected in login response on failure | M | Single uniform 401 for user-not-found and bad-password at `api_login.py`. No user-existence oracle. |
| I | JWT secret leaked via error message / stack trace | **?** | Verify: production error handler doesn't return tracebacks. | | I | JWT secret leaked via error message / stack trace | M | Generic `@app.exception_handler(Exception)` at `decnet/web/api.py` returns opaque `{detail, error_id}` on uncaught exceptions; traceback is logged server-side only. Dev-mode (`DECNET_DEVELOPER=True`) includes traceback in body for debugging. |
| D | Bcrypt-cost DoS via long password submission | M | Pydantic `max_length=72` on all password fields in `decnet/web/db/models/auth.py` (matches bcrypt's internal truncation limit). | | D | Bcrypt-cost DoS via long password submission | M | Pydantic `max_length=72` on all password fields in `decnet/web/db/models/auth.py` (matches bcrypt's internal truncation limit). |
| E | `role=None` bypass (historical bug) | M | See memory `project_rbac_null_role.md`; fixed via centralized RBAC that treats `None` as unauthenticated. | | E | `role=None` bypass (historical bug) | M | See memory `project_rbac_null_role.md`; fixed via centralized RBAC that treats `None` as unauthenticated. |
@@ -289,7 +289,8 @@ code" or "accepted, add to table above."
- [x] ~~Per-IP / per-user rate limit on `/auth/login`.~~ Shipped — see F1/S row. - [x] ~~Per-IP / per-user rate limit on `/auth/login`.~~ Shipped — see F1/S row.
- [x] ~~Uniform "invalid credentials" on login failure (no user-existence oracle).~~ Verified — see F1/I row. - [x] ~~Uniform "invalid credentials" on login failure (no user-existence oracle).~~ Verified — see F1/I row.
- [ ] Production error handler suppresses tracebacks and internal details. - [x] ~~Production error handler suppresses tracebacks and internal details.~~ Shipped — generic `@app.exception_handler(Exception)` in `decnet/web/api.py`; opaque `{detail, error_id}` in prod, traceback only under `DECNET_DEVELOPER=True`.
- [ ] `detail=str(e)` / `detail=f"…{e}"` sites in `decnet/web/router/fleet/api_deploy_deckies.py:41,67,83,155` — admin-gated, arguably intentional INI-parser UX, but worth auditing: preserve deliberate messages, sanitize any bubble-up from lower layers.
- [x] ~~Password length clamp before bcrypt.~~ Verified — Pydantic `max_length=72`. - [x] ~~Password length clamp before bcrypt.~~ Verified — Pydantic `max_length=72`.
- [ ] Contract test asserting every protected route returns 401 unauthenticated and 403 for under-roled. - [ ] Contract test asserting every protected route returns 401 unauthenticated and 403 for under-roled.
- [ ] Field allow-list on viewer responses for attacker / user / bounty serializers. - [ ] Field allow-list on viewer responses for attacker / user / bounty serializers.
@@ -357,3 +358,4 @@ In priority order:
|------|--------|--------| |------|--------|--------|
| 2026-04-23 | Initial scaffold. System context + Dashboard↔API as first worked component. | ANTI | | 2026-04-23 | Initial scaffold. System context + Dashboard↔API as first worked component. | ANTI |
| 2026-04-23 | F1 Authn: 3 threats moved from **?** to **M** (rate limit shipped; uniform 401 verified; bcrypt length clamp verified). Added DA-08 accepted risk: reverse-proxy per-IP bucket collapse. | ANTI | | 2026-04-23 | F1 Authn: 3 threats moved from **?** to **M** (rate limit shipped; uniform 401 verified; bcrypt length clamp verified). Added DA-08 accepted risk: reverse-proxy per-IP bucket collapse. | ANTI |
| 2026-04-23 | F1/I "traceback / stack trace leakage" moved from **?** to **M** via generic Exception handler with `error_id` correlation. Added follow-up checklist entry for `detail=str(e)` sites in fleet deploy router. | ANTI |

View File

@@ -0,0 +1,129 @@
"""Tests for the generic Exception handler at decnet/web/api.py.
Mitigation target: threat model F1/I — "Production error handler suppresses
tracebacks and internal details." Verifies that uncaught exceptions return
an opaque 500 with a correlation id in prod, and include debug detail only
when DECNET_DEVELOPER is on.
"""
from __future__ import annotations
import logging
import re
from typing import AsyncGenerator
import httpx
import pytest
from decnet.env import DECNET_ADMIN_USER, DECNET_ADMIN_PASSWORD
from decnet.web.api import app
from decnet.web.dependencies import repo
@pytest.fixture
async def client() -> AsyncGenerator[httpx.AsyncClient, None]:
"""Override the shared client fixture to NOT re-raise app exceptions.
The default `httpx.ASGITransport` re-raises any uncaught exception
from the app — which defeats the whole point of testing our
generic exception handler. With `raise_app_exceptions=False`, the
transport instead returns the HTTP response our handler built.
"""
transport = httpx.ASGITransport(app=app, raise_app_exceptions=False)
async with httpx.AsyncClient(transport=transport, base_url="http://test") as ac:
yield ac
async def _admin_headers(client: httpx.AsyncClient) -> dict[str, str]:
resp = await client.post(
"/api/v1/auth/login",
json={"username": DECNET_ADMIN_USER, "password": DECNET_ADMIN_PASSWORD},
)
token = resp.json()["access_token"]
# Clear must_change_password so the token passes mutation-gated endpoints.
await client.post(
"/api/v1/auth/change-password",
json={"old_password": DECNET_ADMIN_PASSWORD, "new_password": DECNET_ADMIN_PASSWORD},
headers={"Authorization": f"Bearer {token}"},
)
resp2 = await client.post(
"/api/v1/auth/login",
json={"username": DECNET_ADMIN_USER, "password": DECNET_ADMIN_PASSWORD},
)
return {"Authorization": f"Bearer {resp2.json()['access_token']}"}
def _raise_boom(*_a, **_kw):
raise RuntimeError("boom")
@pytest.mark.anyio
async def test_unhandled_exception_prod_shape_is_opaque(
client: httpx.AsyncClient,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Prod mode (DECNET_DEVELOPER=False): 500 with opaque body + error_id.
Must NOT include traceback or exception_type."""
import decnet.web.api as _api
monkeypatch.setattr(_api, "DECNET_DEVELOPER", False)
headers = await _admin_headers(client)
monkeypatch.setattr(repo, "get_attacker_by_uuid", _raise_boom)
r = await client.get("/api/v1/attackers/any-uuid", headers=headers)
assert r.status_code == 500
body = r.json()
assert body.get("detail") == "Internal Server Error"
assert "error_id" in body
assert re.fullmatch(r"[0-9a-f]{32}", body["error_id"]), body["error_id"]
assert "traceback" not in body
assert "exception_type" not in body
@pytest.mark.anyio
async def test_unhandled_exception_dev_shape_includes_traceback(
client: httpx.AsyncClient,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Dev mode (DECNET_DEVELOPER=True): body includes exception_type and
traceback so failures are debuggable without tailing server logs."""
import decnet.web.api as _api
monkeypatch.setattr(_api, "DECNET_DEVELOPER", True)
headers = await _admin_headers(client)
monkeypatch.setattr(repo, "get_attacker_by_uuid", _raise_boom)
r = await client.get("/api/v1/attackers/any-uuid", headers=headers)
assert r.status_code == 500
body = r.json()
assert body["detail"] == "Internal Server Error"
assert "error_id" in body
assert body["exception_type"] == "RuntimeError"
assert "RuntimeError" in body["traceback"]
assert "boom" in body["traceback"]
@pytest.mark.anyio
async def test_unhandled_exception_logs_error_id(
client: httpx.AsyncClient,
monkeypatch: pytest.MonkeyPatch,
caplog: pytest.LogCaptureFixture,
) -> None:
"""The same error_id returned to the client must appear in server logs,
so operators can correlate a user's 500 report with the full traceback."""
import decnet.web.api as _api
monkeypatch.setattr(_api, "DECNET_DEVELOPER", False)
headers = await _admin_headers(client)
monkeypatch.setattr(repo, "get_attacker_by_uuid", _raise_boom)
with caplog.at_level(logging.ERROR, logger="api"):
r = await client.get("/api/v1/attackers/any-uuid", headers=headers)
assert r.status_code == 500
error_id = r.json()["error_id"]
assert any(error_id in rec.getMessage() for rec in caplog.records), (
f"error_id {error_id} not found in captured logs: "
f"{[rec.getMessage() for rec in caplog.records]}"
)