feat(orchestrator): authoritative failure-count badge endpoint (DEBT-042)
New GET /api/v1/orchestrator/events/stats?since=1h&success=false&kind=... backed by repo.count_orchestrator_failures(since_ts, kind), which counts failed rows across both orchestrator_events and orchestrator_emails since the cutoff. Window parser accepts ^\d+[smhd]$, capped at 7d. Today only success=false is accepted on this surface so the endpoint isn't accidentally repurposed before the next consumer is properly designed. Orchestrator.tsx polls the endpoint on mount + every 30 s and renders the authoritative DB-derived count instead of deriving from the in-memory SSE buffer + one paginated page (which silently excluded failures older than the local window).
This commit is contained in:
@@ -1131,6 +1131,17 @@ class BaseRepository(ABC):
|
||||
"""Total orchestrator-event rows, optionally filtered by kind."""
|
||||
raise NotImplementedError
|
||||
|
||||
async def count_orchestrator_failures(
    self,
    *,
    since_ts: Any,
    kind: Optional[str] = None,
) -> int:
    """Count failed orchestrator activity since *since_ts*.

    The count spans both the event and email tables and backs the
    dashboard's failure-count badge (DEBT-042).

    *since_ts* is the cutoff of the counting window; *kind*, when given,
    restricts the count to a single activity kind.

    This base version always raises ``NotImplementedError``.
    """
    raise NotImplementedError
|
||||
|
||||
async def prune_orchestrator_events(self, per_dst_cap: int = 10000) -> int:
|
||||
"""Trim per-``dst_decky_uuid`` rows to a cap. Returns deleted count.
|
||||
|
||||
|
||||
@@ -60,6 +60,46 @@ class OrchestratorMixin(_MixinBase):
|
||||
result = await session.execute(stmt)
|
||||
return result.scalar() or 0
|
||||
|
||||
async def count_orchestrator_failures(
    self,
    *,
    since_ts: datetime,
    kind: Optional[str] = None,
) -> int:
    """Count failed orchestrator activity at or after *since_ts*.

    Rows are counted across both ``orchestrator_events`` (traffic / file)
    and ``orchestrator_emails`` (email):

    * ``kind=None`` counts failures in both tables;
    * ``kind="traffic"`` / ``"file"`` counts only matching event rows;
    * ``kind="email"`` counts only email rows;
    * any other value matches neither table and yields ``0``.

    Backs the dashboard's failure-count badge — see DEBT-042. The
    in-memory window the badge previously computed against was bounded
    by the SSE-buffer + paginated page, so failures older than the local
    window read low. This is the authoritative count.
    """
    want_events = kind in (None, "traffic", "file")
    want_emails = kind in (None, "email")
    if not (want_events or want_emails):
        # Neither table applies to this kind, so there is nothing to
        # query; avoid opening a session just to return zero.
        return 0

    total = 0
    async with self._session() as session:
        if want_events:
            ev_stmt = (
                select(func.count())
                .select_from(OrchestratorEvent)
                .where(
                    col(OrchestratorEvent.success).is_(False),
                    OrchestratorEvent.ts >= since_ts,
                )
            )
            if kind is not None:
                # Here kind is "traffic" or "file": narrow to that kind.
                ev_stmt = ev_stmt.where(OrchestratorEvent.kind == kind)
            total += (await session.execute(ev_stmt)).scalar() or 0
        if want_emails:
            em_stmt = (
                select(func.count())
                .select_from(OrchestratorEmail)
                .where(
                    col(OrchestratorEmail.success).is_(False),
                    OrchestratorEmail.ts >= since_ts,
                )
            )
            total += (await session.execute(em_stmt)).scalar() or 0
    return total
|
||||
|
||||
async def prune_orchestrator_events(self, per_dst_cap: int = 10000) -> int:
|
||||
"""Trim per-dst rows to *per_dst_cap*, oldest-first. Returns deleted count."""
|
||||
deleted = 0
|
||||
|
||||
@@ -32,6 +32,7 @@ from .campaigns.api_list_campaign_identities import router as campaign_identitie
|
||||
from .campaigns.api_events import router as campaign_events_router
|
||||
from .orchestrator.api_list_events import router as orchestrator_list_router
|
||||
from .orchestrator.api_events import router as orchestrator_events_router
|
||||
from .orchestrator.api_event_stats import router as orchestrator_stats_router
|
||||
from .realism.api_config import router as realism_config_router
|
||||
from .realism.api_personas import router as realism_personas_router
|
||||
from .realism.api_synthetic_files import router as realism_synthetic_files_router
|
||||
@@ -123,6 +124,7 @@ api_router.include_router(campaign_identities_router)
|
||||
api_router.include_router(campaign_events_router)
|
||||
api_router.include_router(orchestrator_list_router)
|
||||
api_router.include_router(orchestrator_events_router)
|
||||
api_router.include_router(orchestrator_stats_router)
|
||||
|
||||
# Realism — global persona pool CRUD for the dashboard's
|
||||
# "Persona Generation" page. The orchestrator reads from the same
|
||||
|
||||
99
decnet/web/router/orchestrator/api_event_stats.py
Normal file
99
decnet/web/router/orchestrator/api_event_stats.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""GET /api/v1/orchestrator/events/stats — authoritative failure count.
|
||||
|
||||
The dashboard's failure-count badge previously derived its number from
|
||||
the in-memory SSE buffer + a single paginated page (capped at 500 +
|
||||
limit rows). On busy fleets, failures older than the local window
|
||||
were silently excluded and the badge read low — see DEBT-042. This
|
||||
endpoint returns the real count straight from the DB so the badge
|
||||
matches reality.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_viewer
|
||||
|
||||
router = APIRouter()

# Relative window syntax: group 1 is the integer amount, group 2 the unit
# (s = seconds, m = minutes, h = hours, d = days).
_SINCE_RE = re.compile(r"^(\d+)([smhd])$")
# Bounded to avoid unintentionally-expensive scans. 7d covers the
# operator UX use case (failure-count badge) while still returning
# in O(index seek + count).
_MAX_SINCE = timedelta(days=7)
|
||||
|
||||
|
||||
def _parse_since(s: str) -> timedelta:
    """Convert a relative window such as ``'15m'`` or ``'7d'`` to a timedelta.

    The value must be a positive integer followed by one unit letter
    (``s``, ``m``, ``h`` or ``d``) and must not exceed ``_MAX_SINCE``.

    Raises:
        HTTPException: 422 when the value is malformed, is not positive,
            or exceeds the maximum window.
    """
    # fullmatch (not match) so the pattern's trailing ``$`` cannot accept
    # a value that is followed by a newline.
    m = _SINCE_RE.fullmatch(s)
    if not m:
        raise HTTPException(
            status_code=422,
            detail="since must match ^(\\d+)[smhd]$ (e.g. '15m', '1h', '24h', '7d')",
        )
    digits, unit = m.group(1), m.group(2)
    too_large_detail = f"since exceeds maximum window of {_MAX_SINCE}"
    try:
        value = int(digits)
    except ValueError:
        # The regex guarantees digits only, so this is the interpreter's
        # int-conversion length limit: the number is absurdly large.
        raise HTTPException(status_code=422, detail=too_large_detail) from None
    if value <= 0:
        raise HTTPException(status_code=422, detail="since must be > 0")
    # Compare in plain integer seconds *before* building a timedelta:
    # timedelta raises OverflowError for huge magnitudes, which would
    # surface as a 500 instead of the intended 422.
    seconds = value * {"s": 1, "m": 60, "h": 3600, "d": 86400}[unit]
    if seconds > _MAX_SINCE.total_seconds():
        raise HTTPException(status_code=422, detail=too_large_detail)
    return timedelta(seconds=seconds)
|
||||
|
||||
|
||||
@router.get(
    "/orchestrator/events/stats",
    tags=["Orchestrator"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.orchestrator.events.stats")
async def orchestrator_event_stats(
    since: str = Query("1h", description="Window relative to now, e.g. '15m', '1h', '24h'."),
    success: Optional[bool] = Query(
        None,
        # The handler rejects everything except success=false, so the
        # published description says so instead of implying it is optional.
        description="Must be false: only the failure count is supported on this surface today.",
    ),
    kind: Optional[str] = Query(
        None, pattern="^(traffic|file|email)$",
    ),
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Aggregate counts for the orchestrator activity feed.

    Today only the failure-count badge consumes this surface, so the
    only supported aggregate is ``success=false`` (everything else is
    rejected — ``success=true`` and the unfiltered total can be served
    by the existing ``count`` on the list endpoint without a window
    filter, and we'd rather not paint ourselves into a corner before
    the next consumer shows up).

    Returns a dict echoing ``since``, ``success`` and ``kind`` together
    with the resulting ``count``. Responds 422 when ``success`` is not
    ``false`` or when ``since`` fails window parsing.
    """
    # ``is not False`` also rejects an omitted parameter (None), not just true.
    if success is not False:
        raise HTTPException(
            status_code=422,
            detail="only success=false is supported on this surface today",
        )
    delta = _parse_since(since)
    # Timezone-aware UTC cutoff for the counting window.
    since_ts = datetime.now(timezone.utc) - delta
    count = await repo.count_orchestrator_failures(
        since_ts=since_ts, kind=kind,
    )
    return {
        "since": since,
        "success": success,
        "kind": kind,
        "count": count,
    }
|
||||
Reference in New Issue
Block a user