feat(workers): bus-backed Workers panel (registry, control, installed flag)

Ships the backend half of Config → Workers:

* Worker registry aggregates `system.*.health` + `system.bus.health`
  heartbeats into a last-seen dict; OK / STALE / UNKNOWN tiers drop
  out of a 90s window (3× the 30s heartbeat interval).
* `GET /api/v1/workers` returns the snapshot plus `bus_connected`
  (so the UI can explain "all UNKNOWN" when the bus socket is down)
  and a per-row `installed` flag populated from
  `systemctl list-unit-files decnet-*.service` (cached 30s).
* `POST /api/v1/workers/{name}/stop` publishes a stop intent on
  `system.<name>.control`; workers listen via the shared control
  listener in `bus/publish.py`.
* Heartbeat + control listener wired into collector / profiler /
  sniffer / prober / mutator worker loops. API self-heartbeats too
  so the panel always has one ground-truth row.
* Topic helper `system_control(name)` + tests covering builder
  validation, control listener shutdown path, and the API surface
  (auth gating, bus-connected field, unknown-name 404).

Adds `StartFailure` / `StartAllResponse` models in anticipation of
the upcoming start endpoints (DEBT-034).
This commit is contained in:
2026-04-22 14:10:39 -04:00
parent fcaac648a4
commit 0fbb07c2ec
18 changed files with 863 additions and 10 deletions

View File

@@ -8,6 +8,8 @@ from __future__ import annotations
import asyncio
import contextlib
import os
import signal
import time
from typing import Any, Callable
@@ -99,3 +101,103 @@ async def run_health_heartbeat(
log.debug("heartbeat extra() failed worker=%s: %s", worker, exc)
await publish_safely(bus, topic, payload, event_type=_topics.SYSTEM_HEALTH)
await asyncio.sleep(interval)
async def run_control_listener(
bus: BaseBus | None,
worker: str,
shutdown: asyncio.Event,
) -> None:
"""Subscribe to ``system.<worker>.control`` and honour stop intents.
On a well-formed ``{"action": "stop", ...}`` message the function sets
*shutdown* and returns — the worker's main loop is expected to check
the event and unwind cleanly, matching the SIGTERM path.
Malformed payloads (missing/unknown action, non-dict, exception from
the transport) are logged and ignored. A ``None`` bus yields a noop
coroutine that simply awaits *shutdown* — callers can ``create_task``
this unconditionally regardless of bus state.
Cancellation-safe.
"""
if bus is None:
with contextlib.suppress(asyncio.CancelledError):
await shutdown.wait()
return
topic = _topics.system_control(worker)
with contextlib.suppress(asyncio.CancelledError):
try:
async with bus.subscribe(topic) as sub:
async for event in sub:
payload = event.payload or {}
action = payload.get("action")
requested_by = payload.get("requested_by", "<unknown>")
if action == _topics.WORKER_CONTROL_STOP:
log.info(
"control: stop requested worker=%s by=%s",
worker, requested_by,
)
shutdown.set()
return
log.debug(
"control: ignoring unknown action worker=%s action=%r",
worker, action,
)
except Exception as exc: # noqa: BLE001
log.warning(
"control listener failed worker=%s: %s — shutdown via bus disabled",
worker, exc,
)
async def run_control_listener_signal(
bus: BaseBus | None,
worker: str,
) -> None:
"""Like :func:`run_control_listener` but signals the process on stop.
Preferred for workers whose main loop is a blocking thread
(container-log tail, PTY read, scapy sniff) — wiring an
``asyncio.Event`` through the thread boundary is error-prone, and
every DECNET worker already has systemd-equivalent SIGTERM cleanup.
A SIGTERM self-signal routes the stop through that same path
without inventing a second shutdown mechanism.
Cancellation-safe. Never raises: a failed self-signal is logged
and the loop simply exits (admin can fall back to ``systemctl``).
"""
if bus is None:
return
topic = _topics.system_control(worker)
with contextlib.suppress(asyncio.CancelledError):
try:
async with bus.subscribe(topic) as sub:
async for event in sub:
payload = event.payload or {}
action = payload.get("action")
requested_by = payload.get("requested_by", "<unknown>")
if action == _topics.WORKER_CONTROL_STOP:
log.info(
"control: stop requested worker=%s by=%s → SIGTERM self",
worker, requested_by,
)
try:
os.kill(os.getpid(), signal.SIGTERM)
except Exception as exc: # noqa: BLE001
log.warning(
"control: self-signal failed worker=%s: %s",
worker, exc,
)
return
log.debug(
"control: ignoring unknown action worker=%s action=%r",
worker, action,
)
except Exception as exc: # noqa: BLE001
log.warning(
"control signal listener failed worker=%s: %s",
worker, exc,
)