merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

View File

@@ -0,0 +1,76 @@
import time
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi.responses import ORJSONResponse
from decnet.bus import topics as _topics
from decnet.bus.app import get_app_bus
from decnet.bus.publish import publish_safely
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.web.db.models import WorkerControlResponse
from decnet.web.dependencies import require_admin
from decnet.web.worker_registry import KNOWN_WORKERS
# Logger obtained under the "api" name; used for the stop-request audit line.
log = get_logger("api")
# Router on which this module registers its stop endpoint.
router = APIRouter()
@router.post(
    "/workers/{name}/stop",
    tags=["Observability"],
    responses={
        202: {"model": WorkerControlResponse, "description": "Stop intent queued on bus"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Unknown worker"},
        503: {"description": "Bus unavailable"},
    },
)
@_traced("api.stop_worker")
async def stop_worker(
    name: str,
    admin: dict = Depends(require_admin),
) -> ORJSONResponse:
    """Request that worker ``name`` stop, via ``system.<name>.control``.

    The handler only publishes a stop intent and returns; it never waits
    for the worker to exit. Callers watch the Workers panel, where the
    row turns ``stale`` once heartbeats cease. This matches the bus
    contract used elsewhere: delivery is at-most-once, the DB remains the
    source of truth for persistent state, and the bus merely notifies.

    Args:
        name: Worker identifier; must be a member of ``KNOWN_WORKERS``.
        admin: Claims of the authenticated admin (from ``require_admin``).

    Returns:
        A 202 response whose body is a serialized ``WorkerControlResponse``.

    Raises:
        HTTPException: 404 when ``name`` is not a known worker, 503 when
            no application bus is available.
    """
    if name not in KNOWN_WORKERS:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Unknown worker: {name!r}",
        )

    bus = await get_app_bus()
    if bus is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="bus unavailable",
        )

    control_topic = _topics.system_control(name)
    # Prefer the human-readable username, then the token subject, and only
    # then a generic label so the audit trail always names someone.
    requester = admin.get("username") or admin.get("sub") or "admin"
    intent = {
        "action": _topics.WORKER_CONTROL_STOP,
        "requested_by": requester,
        "ts": time.time(),
    }
    await publish_safely(
        bus,
        control_topic,
        intent,
        event_type=_topics.SYSTEM_CONTROL,
    )
    log.info(
        "workers: stop requested worker=%s by=%s",
        name, intent["requested_by"],
    )

    ack = WorkerControlResponse(
        accepted=True,
        worker=name,
        action=_topics.WORKER_CONTROL_STOP,
    )
    return ORJSONResponse(
        status_code=status.HTTP_202_ACCEPTED,
        content=ack.model_dump(),
    )

View File

@@ -0,0 +1,35 @@
import time
from fastapi import APIRouter, Depends
from decnet.bus.app import get_app_bus
from decnet.telemetry import traced as _traced
from decnet.web.db.models import WorkersResponse
from decnet.web.dependencies import require_viewer
from decnet.web.services import systemd_control
from decnet.web.worker_registry import get_registry
# Router on which this module registers its worker-listing endpoint.
router = APIRouter()
@router.get(
    "/workers",
    response_model=WorkersResponse,
    tags=["Observability"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.list_workers")
async def list_workers(user: dict = Depends(require_viewer)) -> WorkersResponse:
    """Return the current worker snapshot, annotated with install state.

    Each entry from the registry snapshot gets its ``installed`` attribute
    set according to whether its name appears in
    ``systemd_control.list_installed()``. The response also carries the
    generation timestamp and whether an application bus is available.

    Args:
        user: Claims of the authenticated viewer (from ``require_viewer``);
            only used to enforce access.

    Returns:
        A ``WorkersResponse`` with the annotated workers, ``generated_at``
        and ``bus_connected``.
    """
    snapshot = get_registry().snapshot()
    app_bus = await get_app_bus()
    installed_units = await systemd_control.list_installed()

    for worker in snapshot:
        worker.installed = worker.name in installed_units

    bus_is_up = app_bus is not None
    return WorkersResponse(
        workers=snapshot,
        generated_at=time.time(),
        bus_connected=bus_is_up,
    )

View File

@@ -0,0 +1,102 @@
from fastapi import APIRouter, Depends
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.web.db.models import StartAllResponse, StartFailure
from decnet.web.dependencies import require_admin
from decnet.web.services import systemd_control
from decnet.web.worker_registry import KNOWN_WORKERS
# Logger obtained under the "api" name; used for start-all summaries and failures.
log = get_logger("api")
# Router on which this module registers its start-all endpoint.
router = APIRouter()
# Start order matters: the bus comes up first so subsequent workers have a
# place to publish their heartbeats; then the API, then the data-plane set.
# Any name present in KNOWN_WORKERS but missing from this tuple is appended
# at the end by _ordered(), so new workers still get started even if nobody
# remembers to place them here explicitly.
_PREFERRED_ORDER: tuple[str, ...] = (
"bus",
"api",
"collector",
"profiler",
"sniffer",
"prober",
"mutator",
"reconciler",
"reuse-correlator",
"enrich",
"clusterer",
"campaign-clusterer",
"webhook",
"orchestrator",
)
def _ordered() -> list[str]:
    """Return every known worker name exactly once, in start order.

    Names listed in ``_PREFERRED_ORDER`` come first (restricted to those
    that are in ``KNOWN_WORKERS``), followed by any remaining members of
    ``KNOWN_WORKERS`` in that collection's own iteration order.
    """
    preferred = [name for name in _PREFERRED_ORDER if name in KNOWN_WORKERS]
    # dict.fromkeys keeps the first occurrence of each key in insertion
    # order, which de-duplicates while preserving the preferred-first layout.
    return list(dict.fromkeys([*preferred, *KNOWN_WORKERS]))
@router.post(
    "/workers/start-all",
    response_model=StartAllResponse,
    tags=["Observability"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.start_all_workers")
async def start_all_workers(
    admin: dict = Depends(require_admin),
) -> StartAllResponse:
    """Best-effort: bring up every installed worker unit in order.

    Workers already ``active`` are counted in ``already_running`` and
    skipped. Workers without a unit file (common on dev boxes) are
    silently skipped — the UI already renders them as not-installed.

    Returns 200 even on partial failure; the caller reads the three
    lists. Started sequentially, not in parallel: systemd dependency
    ordering (bus → api → data-plane) matters.

    Args:
        admin: Claims of the authenticated admin (from ``require_admin``);
            used only to attribute the request in the log line.

    Returns:
        A ``StartAllResponse`` with the ``started``, ``already_running``
        and ``failed`` lists. Each failure carries the first line of
        systemctl's stderr, truncated to 200 characters.
    """
    installed = await systemd_control.list_installed()
    started: list[str] = []
    already_running: list[str] = []
    failed: list[StartFailure] = []

    for name in _ordered():
        if name not in installed:
            continue
        try:
            if await systemd_control.is_active(name):
                already_running.append(name)
                continue
            await systemd_control.start(name)
        except systemd_control.SystemctlError as exc:
            # Tolerate a missing or empty stderr: a single odd failure must
            # not abort the whole best-effort sweep with an AttributeError.
            stderr_lines = (exc.stderr or "").splitlines()
            first_line = stderr_lines[0] if stderr_lines else "systemctl failed"
            snippet = first_line[:200]
            failed.append(StartFailure(name=name, reason=snippet))
            log.warning("start-all: %s failed: %s", name, snippet)
        else:
            # Reached only when start() returned normally; the `continue`
            # above bypasses this clause for units that were already active.
            started.append(name)

    log.info(
        "workers: start-all by=%s started=%d already=%d failed=%d",
        admin.get("username") or admin.get("sub") or "admin",
        len(started), len(already_running), len(failed),
    )
    return StartAllResponse(
        started=started,
        already_running=already_running,
        failed=failed,
    )

View File

@@ -0,0 +1,72 @@
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi.responses import ORJSONResponse
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.web.db.models import WorkerControlResponse
from decnet.web.dependencies import require_admin
from decnet.web.services import systemd_control
from decnet.web.worker_registry import KNOWN_WORKERS
# Logger obtained under the "api" name; used for start requests and systemctl failures.
log = get_logger("api")
# Router on which this module registers its start endpoint.
router = APIRouter()
@router.post(
    "/workers/{name}/start",
    tags=["Observability"],
    responses={
        202: {"model": WorkerControlResponse, "description": "Start issued via systemd"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Unknown worker"},
        502: {"description": "systemctl returned non-zero"},
        503: {"description": "Unit file not installed on this host"},
    },
)
@_traced("api.start_worker")
async def start_worker(
    name: str,
    admin: dict = Depends(require_admin),
) -> ORJSONResponse:
    """Start ``decnet-<name>.service`` via systemd.

    STOP travels over the bus because a running worker can signal itself;
    START cannot, since a stopped worker has no subscriber, so it has to
    originate outside the worker. The API therefore shells out to
    ``systemctl`` through a scoped polkit rule. A 202 means the request
    was accepted; the UI picks up the heartbeat on its next REFRESH and
    flips the row to OK.

    Args:
        name: Worker identifier; must be a member of ``KNOWN_WORKERS``.
        admin: Claims of the authenticated admin (from ``require_admin``).

    Returns:
        A 202 response whose body is a serialized ``WorkerControlResponse``
        with ``action="start"``.

    Raises:
        HTTPException: 404 for an unknown worker, 503 when the unit file
            is not installed, 502 when ``systemctl`` fails (detail is the
            first stderr line, capped at 200 characters).
    """
    if name not in KNOWN_WORKERS:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Unknown worker: {name!r}",
        )

    if name not in await systemd_control.list_installed():
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail=f"unit file not installed: decnet-{name}.service",
        )

    try:
        await systemd_control.start(name)
    except systemd_control.SystemctlError as exc:
        log.exception("systemctl start %s failed: %s", name, exc.stderr)
        # Surface only the first stderr line to the client; the full text
        # has already gone to the log above.
        if exc.stderr:
            reason = exc.stderr.splitlines()[0]
        else:
            reason = "systemctl failed"
        raise HTTPException(
            status_code=status.HTTP_502_BAD_GATEWAY,
            detail=reason[:200],
        ) from exc

    requester = admin.get("username") or admin.get("sub") or "admin"
    log.info(
        "workers: start requested worker=%s by=%s",
        name, requester,
    )

    ack = WorkerControlResponse(accepted=True, worker=name, action="start")
    return ORJSONResponse(
        status_code=status.HTTP_202_ACCEPTED,
        content=ack.model_dump(),
    )