feat(workers): bus-backed Workers panel (registry, control, installed flag)
Ships the backend half of Config → Workers:
* Worker registry aggregates `system.*.health` + `system.bus.health`
heartbeats into a last-seen dict; OK / STALE / UNKNOWN tiers drop
out of a 90s window (3× the 30s heartbeat interval).
* `GET /api/v1/workers` returns the snapshot plus `bus_connected`
(so the UI can explain "all UNKNOWN" when the bus socket is down)
and a per-row `installed` flag populated from
`systemctl list-unit-files decnet-*.service` (cached 30s).
* `POST /api/v1/workers/{name}/stop` publishes a stop intent on
`system.<name>.control`; workers listen via the shared control
listener in `bus/publish.py`.
* Heartbeat + control listener wired into collector / profiler /
sniffer / prober / mutator worker loops. API self-heartbeats too
so the panel always has one ground-truth row.
* Topic helper `system_control(name)` + tests covering builder
validation, control listener shutdown path, and the API surface
(auth gating, bus-connected field, unknown-name 404).
Adds `StartFailure` / `StartAllResponse` models in anticipation of
the upcoming start endpoints (DEBT-034).
This commit is contained in:
@@ -8,6 +8,8 @@ from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import os
|
||||
import signal
|
||||
import time
|
||||
from typing import Any, Callable
|
||||
|
||||
@@ -99,3 +101,103 @@ async def run_health_heartbeat(
|
||||
log.debug("heartbeat extra() failed worker=%s: %s", worker, exc)
|
||||
await publish_safely(bus, topic, payload, event_type=_topics.SYSTEM_HEALTH)
|
||||
await asyncio.sleep(interval)
|
||||
|
||||
|
||||
async def run_control_listener(
    bus: BaseBus | None,
    worker: str,
    shutdown: asyncio.Event,
) -> None:
    """Subscribe to ``system.<worker>.control`` and honour stop intents.

    On a well-formed ``{"action": "stop", ...}`` message the function sets
    *shutdown* and returns — the worker's main loop is expected to check
    the event and unwind cleanly, matching the SIGTERM path.

    Malformed messages (missing/unknown action, payload that is not
    dict-like) are logged at debug level and skipped; the listener keeps
    consuming so a single bad message cannot disable remote stop. An
    exception raised by the transport itself is logged as a warning and
    ends the listener without setting *shutdown*.

    A ``None`` bus yields a noop coroutine that simply awaits *shutdown* —
    callers can ``create_task`` this unconditionally regardless of bus
    state.

    Args:
        bus: Bus to subscribe on, or ``None`` when no bus is available.
        worker: Worker name used to build the control topic.
        shutdown: Event set when a stop intent is received.

    Cancellation-safe: ``asyncio.CancelledError`` is suppressed, so a
    cancelled listener returns normally instead of propagating.
    """
    if bus is None:
        with contextlib.suppress(asyncio.CancelledError):
            await shutdown.wait()
        return

    topic = _topics.system_control(worker)
    with contextlib.suppress(asyncio.CancelledError):
        try:
            async with bus.subscribe(topic) as sub:
                async for event in sub:
                    payload = event.payload or {}
                    # Guard per message: a truthy non-dict payload (list,
                    # str, ...) has no usable ``.get``. Without this guard
                    # the error would reach the outer handler and end the
                    # listener for good.
                    try:
                        action = payload.get("action")
                        requested_by = payload.get("requested_by", "<unknown>")
                    except (AttributeError, TypeError):
                        log.debug(
                            "control: ignoring malformed payload worker=%s type=%s",
                            worker, type(payload).__name__,
                        )
                        continue
                    if action == _topics.WORKER_CONTROL_STOP:
                        log.info(
                            "control: stop requested worker=%s by=%s",
                            worker, requested_by,
                        )
                        shutdown.set()
                        return
                    log.debug(
                        "control: ignoring unknown action worker=%s action=%r",
                        worker, action,
                    )
        except Exception as exc:  # noqa: BLE001
            # Transport-level failure: best effort only, never crash the worker.
            log.warning(
                "control listener failed worker=%s: %s — shutdown via bus disabled",
                worker, exc,
            )
|
||||
|
||||
|
||||
async def run_control_listener_signal(
    bus: BaseBus | None,
    worker: str,
) -> None:
    """Like :func:`run_control_listener` but signals the process on stop.

    Preferred for workers whose main loop is a blocking thread
    (container-log tail, PTY read, scapy sniff) — wiring an
    ``asyncio.Event`` through the thread boundary is error-prone, and
    every DECNET worker already has systemd-equivalent SIGTERM cleanup.
    A SIGTERM self-signal routes the stop through that same path
    without inventing a second shutdown mechanism.

    Malformed messages (missing/unknown action, payload that is not
    dict-like) are logged at debug level and skipped; the listener keeps
    consuming so a single bad message cannot disable remote stop.

    Args:
        bus: Bus to subscribe on; ``None`` makes the coroutine return
            immediately.
        worker: Worker name used to build the control topic.

    Cancellation-safe (``asyncio.CancelledError`` is suppressed). Never
    raises: a failed self-signal is logged and the loop simply exits
    (admin can fall back to ``systemctl``); a transport failure is logged
    as a warning and ends the listener.
    """
    if bus is None:
        return

    topic = _topics.system_control(worker)
    with contextlib.suppress(asyncio.CancelledError):
        try:
            async with bus.subscribe(topic) as sub:
                async for event in sub:
                    payload = event.payload or {}
                    # Guard per message: a truthy non-dict payload (list,
                    # str, ...) has no usable ``.get``. Without this guard
                    # the error would reach the outer handler and end the
                    # listener for good.
                    try:
                        action = payload.get("action")
                        requested_by = payload.get("requested_by", "<unknown>")
                    except (AttributeError, TypeError):
                        log.debug(
                            "control: ignoring malformed payload worker=%s type=%s",
                            worker, type(payload).__name__,
                        )
                        continue
                    if action == _topics.WORKER_CONTROL_STOP:
                        log.info(
                            "control: stop requested worker=%s by=%s → SIGTERM self",
                            worker, requested_by,
                        )
                        try:
                            os.kill(os.getpid(), signal.SIGTERM)
                        except Exception as exc:  # noqa: BLE001
                            log.warning(
                                "control: self-signal failed worker=%s: %s",
                                worker, exc,
                            )
                        # Stop listening whether or not the signal was sent.
                        return
                    log.debug(
                        "control: ignoring unknown action worker=%s action=%r",
                        worker, action,
                    )
        except Exception as exc:  # noqa: BLE001
            # Transport-level failure: best effort only, never crash the worker.
            log.warning(
                "control signal listener failed worker=%s: %s",
                worker, exc,
            )
|
||||
|
||||
Reference in New Issue
Block a user