refactor(bus): extract publish_safely + extend topics for DEBT-031

Add a shared publish_safely helper at decnet/bus/publish.py so the nine
workers about to be wired into the bus don't each copy-paste the
"never raise back at the caller" contract. The mutator drops its private
copy and imports the canonical one.

topics.py gains the attacker.* hierarchy (observed, scored,
session.started, session.ended) and a system_health(worker) builder
for per-worker health heartbeats — both prerequisites for the worker
rollout under DEBT-031.
This commit is contained in:
2026-04-21 16:32:30 -04:00
parent e083bbe17c
commit f3eaab5d37
5 changed files with 163 additions and 20 deletions

36
decnet/bus/publish.py Normal file
View File

@@ -0,0 +1,36 @@
"""Fire-and-forget publish helper shared across every worker.
Lifted out of ``decnet/mutator/engine.py`` once a second caller showed up
(DEBT-031). Keeping one implementation means the "never break the worker
loop" guarantee is audited in exactly one place.
"""
from __future__ import annotations
from typing import Any
from decnet.bus.base import BaseBus
from decnet.logging import get_logger
log = get_logger("bus.publish")
async def publish_safely(
    bus: BaseBus | None,
    topic: str,
    payload: dict[str, Any],
    event_type: str = "",
) -> None:
    """Publish on *bus* without letting a transport error reach the caller.

    The DB row (or equivalent side-effect) has already been committed by
    the time a worker calls this; the bus is the notification layer, not
    the source of truth. A dropped publish is at most a few seconds of
    UI latency until the next poll tick. A raised exception here, by
    contrast, would crash the worker — which is strictly worse.

    Args:
        bus: Bus to publish on, or ``None``, in which case the call is a
            silent no-op.
        topic: Topic string handed to ``bus.publish`` unchanged.
        payload: Event payload handed to ``bus.publish`` unchanged.
        event_type: Forwarded to ``bus.publish`` as the ``event_type``
            keyword argument.

    Only ``Exception`` subclasses are swallowed. ``BaseException``s that
    are not ``Exception``s (``asyncio.CancelledError``,
    ``KeyboardInterrupt``) still propagate, so cancelling the calling
    task keeps working.
    """
    if bus is None:
        return
    try:
        await bus.publish(topic, payload, event_type=event_type)
    except Exception as exc:  # noqa: BLE001
        # %r, not %s: several exceptions stringify to "" (for example a
        # bare TimeoutError()), which would leave the log line without
        # any reason. repr() always includes the exception class.
        log.warning("bus publish failed topic=%s: %r", topic, exc)

View File

@@ -11,8 +11,12 @@ Token structure (NATS-style, dot-separated):
decky.{decky_id}.state
decky.{decky_id}.traffic
attacker.observed
attacker.scored
attacker.session.started
attacker.session.ended
system.log
system.bus.health
system.{worker}.health
Wildcards (per :func:`decnet.bus.base.matches`):
@@ -47,9 +51,22 @@ TOPOLOGY_STATUS = "status"
DECKY_STATE = "state"
DECKY_TRAFFIC = "traffic"
# Attacker event types: the leaf appended after the ``attacker`` root.
# ``observed`` and ``scored`` are single tokens; the two session leaves are
# dotted and therefore span two tokens (``attacker.session.started``).
# They cover first sighting, session boundary transitions, and
# score-threshold crossings published by correlator + profiler. Consumers
# typically subscribe to the wildcard ``attacker.>``.
ATTACKER_OBSERVED = "observed"
ATTACKER_SCORED = "scored"
ATTACKER_SESSION_STARTED = "session.started"
ATTACKER_SESSION_ENDED = "session.ended"
# System event types (leaf appended after the ``system`` root).
# ``bus.health`` is a dotted, two-token leaf.
SYSTEM_LOG = "log"
SYSTEM_BUS_HEALTH = "bus.health"
# Worker-health leaf — built per-worker as ``system.<worker>.health`` via
# :func:`system_health`. The leaf constant stays the same across workers;
# the worker name goes in the middle token.
SYSTEM_HEALTH = "health"
# ─── Builders ────────────────────────────────────────────────────────────────
@@ -89,6 +106,31 @@ def system(event_type: str) -> str:
return f"{SYSTEM}.{event_type}"
def attacker(event_type: str) -> str:
    """Build ``attacker.<event_type>``.

    *event_type* is typically one of ``ATTACKER_OBSERVED``,
    ``ATTACKER_SCORED``, ``ATTACKER_SESSION_STARTED``,
    ``ATTACKER_SESSION_ENDED``. Dotted leaves (``session.started``) are
    permitted, but every dot-separated segment is still checked so the
    result cannot contain a token that breaks NATS-style tokenization.

    Raises:
        ValueError: If *event_type* is empty, or if :func:`_reject_tokens`
            rejects any of its dot-separated segments.
    """
    if not event_type:
        raise ValueError("attacker topic requires a non-empty event_type")
    # Dots are legal separators here, so the leaf cannot be validated as a
    # single segment. Validate the pieces instead.
    # NOTE(review): catching leading/trailing/double dots relies on
    # _reject_tokens rejecting empty segments — confirm against the helper.
    _reject_tokens(*event_type.split("."))
    return f"{ATTACKER}.{event_type}"
def system_health(worker: str) -> str:
    """Return the heartbeat topic ``system.<worker>.health`` for *worker*.

    The worker name sits in the middle token, so a subscriber can use
    ``system.*.health`` to follow every worker or, e.g.,
    ``system.mutator.health`` to follow just one. *worker* is passed
    through :func:`_reject_tokens` first, which validates it as a regular
    segment — no dots, wildcards, or whitespace.
    """
    # Validate before formatting so a bad name never reaches the topic.
    _reject_tokens(worker)
    topic = f"{SYSTEM}.{worker}.{SYSTEM_HEALTH}"
    return topic
def _reject_tokens(*parts: str) -> None:
"""Reject topic segments that would break NATS-style tokenization.

View File

@@ -25,32 +25,13 @@ import contextlib
from decnet.bus import topics as _topics
from decnet.bus.base import BaseBus
from decnet.bus.factory import get_bus
from decnet.bus.publish import publish_safely as _publish_safely
from decnet.web.db.repository import BaseRepository
log = get_logger("mutator")
console = Console()
async def _publish_safely(
    bus: BaseBus | None,
    topic: str,
    payload: dict,
    event_type: str = "",
) -> None:
    """Fire-and-forget bus publish.

    A bus failure must never break the reconciler — the DB write already
    happened before we got here, so losing the notification is at most a
    few seconds of UI latency (the next poll tick picks it up).

    Args:
        bus: Bus to publish on, or ``None``, in which case nothing happens.
        topic: Topic string handed to ``bus.publish`` unchanged.
        payload: Event payload handed to ``bus.publish`` unchanged.
        event_type: Forwarded to ``bus.publish`` as a keyword argument.

    Only ``Exception`` subclasses are swallowed; ``BaseException``s such
    as ``asyncio.CancelledError`` still propagate.
    """
    if bus is None:
        return
    try:
        await bus.publish(topic, payload, event_type=event_type)
    except Exception as exc:  # noqa: BLE001
        # %r, not %s: exceptions with an empty str() (for example a bare
        # TimeoutError()) would otherwise log no reason at all.
        log.warning("bus publish failed topic=%s: %r", topic, exc)
@_traced("mutator.mutate_decky")
async def mutate_decky(decky_name: str, repo: BaseRepository) -> bool:
"""

64
tests/bus/test_publish.py Normal file
View File

@@ -0,0 +1,64 @@
"""Tests for :mod:`decnet.bus.publish`.
The whole point of ``publish_safely`` is that it never raises back at the
caller. These tests pin that contract: ``None`` bus is a no-op, a real
bus publishes, and a raising bus is swallowed + logged.
"""
from __future__ import annotations
import logging
import pytest
from decnet.bus.base import BaseBus, Event, Subscription
from decnet.bus.fake import FakeBus
from decnet.bus.publish import publish_safely
class _ExplodingBus(BaseBus):
    """Test double whose ``publish`` fails on every call.

    ``connect`` and ``close`` do nothing, ``subscribe`` is not
    implemented, and ``publish`` raises ``RuntimeError``.
    """

    async def connect(self) -> None:  # pragma: no cover - trivial
        return

    async def close(self) -> None:  # pragma: no cover - trivial
        return

    def subscribe(self, pattern: str) -> Subscription:  # pragma: no cover
        raise NotImplementedError

    async def publish(self, topic, payload, *, event_type=""):
        # The only behaviour the tests rely on: blow up unconditionally.
        raise RuntimeError("transport exploded")
@pytest.mark.asyncio
async def test_publish_safely_none_bus_is_noop() -> None:
    """``bus=None`` must be accepted and silently ignored.

    A worker that couldn't connect at startup passes ``bus=None`` and
    expects every call to no-op rather than raise.
    """
    missing_bus = None
    await publish_safely(missing_bus, "system.log", {"msg": "hi"})
@pytest.mark.asyncio
async def test_publish_safely_delivers_on_live_bus() -> None:
    """A connected bus delivers the published event to a matching subscriber."""
    live_bus = FakeBus()
    await live_bus.connect()
    try:
        subscription = live_bus.subscribe("system.log")
        async with subscription:
            await publish_safely(
                live_bus,
                "system.log",
                {"msg": "hi"},
                event_type="log",
            )
            # Pull exactly one event off the subscription.
            received = await subscription.__anext__()
        assert isinstance(received, Event)
        assert received.topic == "system.log"
        assert received.type == "log"
        assert received.payload == {"msg": "hi"}
    finally:
        # Always release the bus, even when an assertion above fails.
        await live_bus.close()
@pytest.mark.asyncio
async def test_publish_safely_swallows_transport_errors(caplog: pytest.LogCaptureFixture) -> None:
    """A bus whose ``publish`` raises yields a warning record, not an exception."""
    caplog.set_level(logging.WARNING, logger="bus.publish")
    failing_bus = _ExplodingBus()
    # Called directly, failing_bus.publish would raise; through
    # publish_safely the caller sees nothing but a log line.
    await publish_safely(failing_bus, "system.log", {"msg": "hi"})
    logged = [rec.message for rec in caplog.records]
    assert any("bus publish failed" in text for text in logged)

View File

@@ -40,3 +40,23 @@ def test_segment_validation(bad: str) -> None:
topics.topology_status(bad)
with pytest.raises(ValueError):
topics.decky(bad, topics.DECKY_STATE)
with pytest.raises(ValueError):
topics.system_health(bad)
def test_attacker_builder() -> None:
    """Each attacker leaf constant maps to its full ``attacker.*`` topic."""
    expected_by_leaf = {
        topics.ATTACKER_OBSERVED: "attacker.observed",
        topics.ATTACKER_SCORED: "attacker.scored",
        # Dotted leaf is intentional — same as system.bus.health.
        topics.ATTACKER_SESSION_STARTED: "attacker.session.started",
        topics.ATTACKER_SESSION_ENDED: "attacker.session.ended",
    }
    for leaf, expected_topic in expected_by_leaf.items():
        assert topics.attacker(leaf) == expected_topic
def test_attacker_builder_rejects_empty() -> None:
    """An empty event type must be refused with ``ValueError``."""
    empty_leaf = ""
    with pytest.raises(ValueError):
        topics.attacker(empty_leaf)
def test_system_health_builder() -> None:
    """The worker name lands in the middle token of ``system.<worker>.health``."""
    cases = (
        ("sniffer", "system.sniffer.health"),
        ("mutator", "system.mutator.health"),
    )
    for worker, expected_topic in cases:
        assert topics.system_health(worker) == expected_topic