New GET /api/v1/orchestrator/events/stats?since=1h&success=false&kind=... backed by repo.count_orchestrator_failures(since_ts, kind), which counts failed rows across both orchestrator_events and orchestrator_emails since the cutoff. Window parser accepts ^\d+[smhd]$, capped at 7d. Today only success=false is accepted on this surface so the endpoint isn't accidentally repurposed before the next consumer is properly designed. Orchestrator.tsx polls the endpoint on mount + every 30 s and renders the authoritative DB-derived count instead of deriving from the in-memory SSE buffer + one paginated page (which silently excluded failures older than the local window).
100 lines
3.3 KiB
Python
100 lines
3.3 KiB
Python
"""GET /api/v1/orchestrator/events/stats — authoritative failure count.
|
|
|
|
The dashboard's failure-count badge previously derived its number from
|
|
the in-memory SSE buffer + a single paginated page (capped at 500 +
|
|
limit rows). On busy fleets, failures older than the local window
|
|
were silently excluded and the badge read low — see DEBT-042. This
|
|
endpoint returns the real count straight from the DB so the badge
|
|
matches reality.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from datetime import datetime, timedelta, timezone
|
|
from typing import Any, Optional
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException, Query
|
|
|
|
from decnet.telemetry import traced as _traced
|
|
from decnet.web.dependencies import repo, require_viewer
|
|
|
|
router = APIRouter()
|
|
|
|
_SINCE_RE = re.compile(r"^(\d+)([smhd])$")
|
|
# Bounded to avoid unintentionally-expensive scans. 7d covers the
|
|
# operator UX use case (failure-count badge) while still returning
|
|
# in O(index seek + count).
|
|
_MAX_SINCE = timedelta(days=7)
|
|
|
|
|
|
def _parse_since(s: str) -> timedelta:
|
|
m = _SINCE_RE.match(s)
|
|
if not m:
|
|
raise HTTPException(
|
|
status_code=422,
|
|
detail="since must match ^(\\d+)[smhd]$ (e.g. '15m', '1h', '24h', '7d')",
|
|
)
|
|
value, unit = int(m.group(1)), m.group(2)
|
|
if value <= 0:
|
|
raise HTTPException(status_code=422, detail="since must be > 0")
|
|
delta = {
|
|
"s": timedelta(seconds=value),
|
|
"m": timedelta(minutes=value),
|
|
"h": timedelta(hours=value),
|
|
"d": timedelta(days=value),
|
|
}[unit]
|
|
if delta > _MAX_SINCE:
|
|
raise HTTPException(
|
|
status_code=422,
|
|
detail=f"since exceeds maximum window of {_MAX_SINCE}",
|
|
)
|
|
return delta
|
|
|
|
|
|
@router.get(
|
|
"/orchestrator/events/stats",
|
|
tags=["Orchestrator"],
|
|
responses={
|
|
401: {"description": "Could not validate credentials"},
|
|
403: {"description": "Insufficient permissions"},
|
|
422: {"description": "Validation error"},
|
|
},
|
|
)
|
|
@_traced("api.orchestrator.events.stats")
|
|
async def orchestrator_event_stats(
|
|
since: str = Query("1h", description="Window relative to now, e.g. '15m', '1h', '24h'."),
|
|
success: Optional[bool] = Query(
|
|
None,
|
|
description="If set, restrict the count to rows with this success value.",
|
|
),
|
|
kind: Optional[str] = Query(
|
|
None, pattern="^(traffic|file|email)$",
|
|
),
|
|
user: dict = Depends(require_viewer),
|
|
) -> dict[str, Any]:
|
|
"""Aggregate counts for the orchestrator activity feed.
|
|
|
|
Today only the failure-count badge consumes this surface, so the
|
|
only supported aggregate is ``success=false`` (everything else is
|
|
rejected — ``success=true`` and the unfiltered total can be served
|
|
by the existing ``count`` on the list endpoint without a window
|
|
filter, and we'd rather not paint ourselves into a corner before
|
|
the next consumer shows up).
|
|
"""
|
|
if success is not False:
|
|
raise HTTPException(
|
|
status_code=422,
|
|
detail="only success=false is supported on this surface today",
|
|
)
|
|
delta = _parse_since(since)
|
|
since_ts = datetime.now(timezone.utc) - delta
|
|
count = await repo.count_orchestrator_failures(
|
|
since_ts=since_ts, kind=kind,
|
|
)
|
|
return {
|
|
"since": since,
|
|
"success": success,
|
|
"kind": kind,
|
|
"count": count,
|
|
}
|