merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
18
decnet/bus/__init__.py
Normal file
18
decnet/bus/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""DECNET ServiceBus — pub/sub notification substrate.
|
||||
|
||||
The bus is the notification layer for DECNET's worker constellation. The DB
|
||||
remains the source of truth for anything durable; the bus carries "something
|
||||
happened, go look" events. Delivery is at-most-once, fire-and-forget.
|
||||
|
||||
Consumers call :func:`get_bus` from :mod:`decnet.bus.factory`; never import
|
||||
transport implementations directly. The factory selects the backend via
|
||||
``DECNET_BUS_TYPE`` (``nats`` or ``fake``) and honors ``DECNET_BUS_ENABLED``.
|
||||
|
||||
Topic hierarchy is defined in :mod:`decnet.bus.topics` and locked early so
|
||||
consumers can subscribe with stable wildcard patterns.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.bus.base import BaseBus, Event, Subscription
|
||||
|
||||
__all__ = ["BaseBus", "Event", "Subscription"]
|
||||
92
decnet/bus/app.py
Normal file
92
decnet/bus/app.py
Normal file
@@ -0,0 +1,92 @@
|
||||
"""Process-wide bus singleton for request-serving workers (API, SSE routes).
|
||||
|
||||
A single connected :class:`~decnet.bus.base.BaseBus` shared across request
|
||||
handlers — opening a UNIX socket per request would be wasteful and add
|
||||
latency to the hot path. The API lifespan is responsible for calling
|
||||
:func:`close_app_bus` on shutdown; connect is lazy so tests and
|
||||
contract-test mode that never hit a publish/subscribe code path don't
|
||||
pay for a bus connection they'll never use.
|
||||
|
||||
Failures during :meth:`BaseBus.connect` are swallowed and logged — a
|
||||
dead bus must never break request serving. Publishers should treat a
|
||||
``None`` return from :func:`get_app_bus` as "skip this notification",
|
||||
same as ``DECNET_BUS_ENABLED=false``.
|
||||
|
||||
Connect is **retried with a short backoff** (not one-shot): a startup
|
||||
race where the API lifespan hits :func:`get_app_bus` before ``decnet
|
||||
bus`` is ready would otherwise poison the singleton for the entire
|
||||
process lifetime. Instead we remember the last failure timestamp and
|
||||
let callers retry once ``_RETRY_BACKOFF`` seconds have passed.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
from decnet.bus.base import BaseBus
|
||||
from decnet.bus.factory import get_bus
|
||||
from decnet.logging import get_logger
|
||||
|
||||
log = get_logger("bus.app")
|
||||
|
||||
# Publishers in the hot path shouldn't pay connect-retry latency on every
|
||||
# call; the dashboard's own 5 s poll interval recovers within one tick
|
||||
# once the bus comes up. A persistently-dead bus only gets a connect
|
||||
# attempt every 2 s, not once per request.
|
||||
_RETRY_BACKOFF: float = 2.0
|
||||
|
||||
_lock = asyncio.Lock()
|
||||
_shared: BaseBus | None = None
|
||||
_last_failure_ts: float = 0.0
|
||||
|
||||
|
||||
async def get_app_bus() -> BaseBus | None:
|
||||
"""Return the process-wide connected bus, or ``None`` if unavailable.
|
||||
|
||||
On first call, constructs a client via :func:`get_bus` and awaits
|
||||
``connect()``. Subsequent calls return the cached instance. If a
|
||||
connect attempt raises, the failure timestamp is recorded and
|
||||
subsequent calls within ``_RETRY_BACKOFF`` seconds return ``None``
|
||||
without re-attempting — after the backoff window, the next call
|
||||
retries. This is what lets the API recover from a
|
||||
``decnet bus``-started-after-API race without a full API restart.
|
||||
"""
|
||||
global _shared, _last_failure_ts
|
||||
if _shared is not None:
|
||||
return _shared
|
||||
if (time.monotonic() - _last_failure_ts) < _RETRY_BACKOFF:
|
||||
return None
|
||||
async with _lock:
|
||||
if _shared is not None:
|
||||
return _shared
|
||||
if (time.monotonic() - _last_failure_ts) < _RETRY_BACKOFF:
|
||||
return None
|
||||
try:
|
||||
candidate = get_bus(client_name="api")
|
||||
await candidate.connect()
|
||||
_shared = candidate
|
||||
_last_failure_ts = 0.0
|
||||
return _shared
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning("app bus unavailable: %s", exc)
|
||||
_last_failure_ts = time.monotonic()
|
||||
return None
|
||||
|
||||
|
||||
async def close_app_bus() -> None:
|
||||
"""Close the shared bus if one is open; clear the backoff window.
|
||||
|
||||
Call from the API lifespan shutdown. Safe to call multiple times.
|
||||
Resetting ``_last_failure_ts`` means the next ``get_app_bus()``
|
||||
after shutdown-and-restart-within-the-same-process (rare, but
|
||||
tests do this) retries immediately instead of honouring a stale
|
||||
backoff.
|
||||
"""
|
||||
global _shared, _last_failure_ts
|
||||
bus, _shared = _shared, None
|
||||
_last_failure_ts = 0.0
|
||||
if bus is not None:
|
||||
try:
|
||||
await bus.close()
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning("app bus close raised: %s", exc)
|
||||
205
decnet/bus/base.py
Normal file
205
decnet/bus/base.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""Bus abstractions: the :class:`Event` envelope and the :class:`BaseBus` ABC.
|
||||
|
||||
Every transport (NATS, in-process fake, null) speaks this contract. The
|
||||
envelope is versioned (``v``) so future evolution never breaks deployed
|
||||
consumers that happen to see a newer event shape.
|
||||
|
||||
Subscription model: :meth:`BaseBus.subscribe` returns a :class:`Subscription`
|
||||
that is an async context manager AND an async iterator. The expected usage is:
|
||||
|
||||
async with bus.subscribe("topology.*.mutation.*") as sub:
|
||||
async for event in sub:
|
||||
handle(event)
|
||||
|
||||
Leaving the ``async with`` releases the underlying subscription handle; the
|
||||
transport is free to drop any buffered events after that point.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import abc
|
||||
import asyncio
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, AsyncIterator
|
||||
|
||||
EVENT_SCHEMA_VERSION = 1
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Event:
|
||||
"""The bus envelope.
|
||||
|
||||
``v`` is the envelope schema version, bumped on incompatible shape
|
||||
changes. ``type`` is a short discriminator (``"mutation.applied"``,
|
||||
``"decky.state"``) useful for consumers that subscribe to a broad
|
||||
wildcard and dispatch in Python; it is redundant with the trailing
|
||||
segments of ``topic`` but cheaper to inspect. ``ts`` is epoch seconds
|
||||
(float). ``id`` is a random UUID so consumers can de-dupe if they
|
||||
ever see the same event twice (not expected at-most-once, but cheap
|
||||
insurance).
|
||||
"""
|
||||
|
||||
topic: str
|
||||
payload: dict[str, Any]
|
||||
type: str = ""
|
||||
v: int = EVENT_SCHEMA_VERSION
|
||||
ts: float = field(default_factory=time.time)
|
||||
id: str = field(default_factory=lambda: uuid.uuid4().hex)
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"v": self.v,
|
||||
"id": self.id,
|
||||
"topic": self.topic,
|
||||
"type": self.type,
|
||||
"ts": self.ts,
|
||||
"payload": self.payload,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, topic: str, data: dict[str, Any]) -> "Event":
|
||||
"""Reconstruct an Event from a wire-format dict.
|
||||
|
||||
``topic`` is passed explicitly because the transport knows which
|
||||
subject the message arrived on; trusting a ``topic`` field from the
|
||||
wire would let a misbehaving publisher spoof events on topics they
|
||||
don't actually publish to.
|
||||
"""
|
||||
return cls(
|
||||
topic=topic,
|
||||
payload=data.get("payload", {}) or {},
|
||||
type=data.get("type", "") or "",
|
||||
v=int(data.get("v", EVENT_SCHEMA_VERSION)),
|
||||
ts=float(data.get("ts", time.time())),
|
||||
id=data.get("id") or uuid.uuid4().hex,
|
||||
)
|
||||
|
||||
|
||||
class Subscription(abc.ABC):
|
||||
"""An open subscription — async context manager + async iterator.
|
||||
|
||||
Concrete transports subclass this and implement :meth:`_aclose` plus the
|
||||
async iterator protocol. Callers should not instantiate directly; use
|
||||
:meth:`BaseBus.subscribe`.
|
||||
"""
|
||||
|
||||
def __init__(self, pattern: str) -> None:
|
||||
self.pattern = pattern
|
||||
self._closed = False
|
||||
|
||||
async def __aenter__(self) -> "Subscription":
|
||||
return self
|
||||
|
||||
async def __aexit__(self, *exc: Any) -> None:
|
||||
await self.aclose()
|
||||
|
||||
def __aiter__(self) -> AsyncIterator[Event]:
|
||||
return self
|
||||
|
||||
async def aclose(self) -> None:
|
||||
if self._closed:
|
||||
return
|
||||
self._closed = True
|
||||
await self._aclose()
|
||||
|
||||
@abc.abstractmethod
|
||||
async def __anext__(self) -> Event: # pragma: no cover - abstract
|
||||
raise NotImplementedError
|
||||
|
||||
@abc.abstractmethod
|
||||
async def _aclose(self) -> None: # pragma: no cover - abstract
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class BaseBus(abc.ABC):
|
||||
"""Pub/sub transport contract.
|
||||
|
||||
Implementations MUST be safe to ``await connect()`` multiple times and
|
||||
``await close()`` multiple times. Publishing to a closed bus raises
|
||||
:class:`RuntimeError`; subscribing to a closed bus does too.
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
async def connect(self) -> None:
|
||||
"""Establish any network/transport resources. Idempotent."""
|
||||
|
||||
@abc.abstractmethod
|
||||
async def publish(
|
||||
self,
|
||||
topic: str,
|
||||
payload: dict[str, Any],
|
||||
*,
|
||||
event_type: str = "",
|
||||
) -> None:
|
||||
"""Publish *payload* on *topic*. Fire-and-forget.
|
||||
|
||||
Delivery is at-most-once. On transport error the implementation
|
||||
logs and returns; it does not raise, because bus losses must not
|
||||
cascade into worker failure (DB is source of truth).
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def subscribe(self, pattern: str) -> Subscription:
|
||||
"""Return a :class:`Subscription` that yields events matching *pattern*.
|
||||
|
||||
Patterns follow NATS wildcard semantics: ``*`` matches one topic
|
||||
token, ``>`` matches one-or-more trailing tokens. Examples:
|
||||
|
||||
* ``topology.*.mutation.applied`` — all ``applied`` events for any
|
||||
topology.
|
||||
* ``topology.abc123.mutation.*`` — all mutation states for one
|
||||
topology.
|
||||
* ``topology.>`` — every event under the ``topology`` root.
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
async def close(self) -> None:
|
||||
"""Tear down transport resources. Idempotent."""
|
||||
|
||||
async def __aenter__(self) -> "BaseBus":
|
||||
await self.connect()
|
||||
return self
|
||||
|
||||
async def __aexit__(self, *exc: Any) -> None:
|
||||
await self.close()
|
||||
|
||||
|
||||
# ─── Wildcard matching shared across in-process transports ───────────────────
|
||||
|
||||
def matches(pattern: str, topic: str) -> bool:
|
||||
"""Return True iff *topic* matches *pattern* under NATS wildcard rules.
|
||||
|
||||
``*`` matches exactly one non-empty token; ``>`` matches one-or-more
|
||||
trailing tokens (so ``topology.>`` matches ``topology.abc.x`` but not
|
||||
``topology`` alone).
|
||||
"""
|
||||
p_tokens = pattern.split(".")
|
||||
t_tokens = topic.split(".")
|
||||
for i, p in enumerate(p_tokens):
|
||||
if p == ">":
|
||||
# Must have at least one token remaining to match.
|
||||
return i < len(t_tokens)
|
||||
if i >= len(t_tokens):
|
||||
return False
|
||||
if p == "*":
|
||||
if not t_tokens[i]:
|
||||
return False
|
||||
continue
|
||||
if p != t_tokens[i]:
|
||||
return False
|
||||
return len(p_tokens) == len(t_tokens)
|
||||
|
||||
|
||||
# Sentinel used by the in-process transports to signal "no more events"
|
||||
# through the asyncio.Queue fan-out without inventing a separate control
|
||||
# channel. Not part of the wire protocol.
|
||||
_CLOSE_SENTINEL: Any = object()
|
||||
|
||||
|
||||
async def _next_or_stop(queue: "asyncio.Queue[Any]") -> Event:
|
||||
"""Pop the next item from *queue*, raising ``StopAsyncIteration`` on close."""
|
||||
item = await queue.get()
|
||||
if item is _CLOSE_SENTINEL:
|
||||
raise StopAsyncIteration
|
||||
return item
|
||||
85
decnet/bus/factory.py
Normal file
85
decnet/bus/factory.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""Bus factory — selects a :class:`~decnet.bus.base.BaseBus` implementation.
|
||||
|
||||
Dispatch key: the ``DECNET_BUS_TYPE`` environment variable.
|
||||
|
||||
* ``unix`` (default) → :class:`~decnet.bus.unix_client.UnixSocketBus`
|
||||
* ``fake`` → :class:`~decnet.bus.fake.FakeBus` (in-process)
|
||||
|
||||
If ``DECNET_BUS_ENABLED`` is ``"false"`` the factory short-circuits to
|
||||
:class:`~decnet.bus.fake.NullBus` regardless of ``DECNET_BUS_TYPE`` — a
|
||||
cheap way for dev environments to run workers without a bus daemon.
|
||||
|
||||
Mirrors :mod:`decnet.web.db.factory` (lazy imports inside each branch,
|
||||
env-driven dispatch, optional telemetry wrapping). Callers MUST use
|
||||
:func:`get_bus` rather than instantiating transports directly.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from decnet.bus.base import BaseBus
|
||||
|
||||
|
||||
def get_bus(**kwargs: Any) -> BaseBus:
|
||||
"""Instantiate the bus implementation selected by environment.
|
||||
|
||||
Keyword arguments are forwarded to the concrete transport:
|
||||
|
||||
* ``UnixSocketBus`` accepts ``socket_path`` (overrides
|
||||
``DECNET_BUS_SOCKET``) and ``client_name``.
|
||||
* ``FakeBus`` accepts ``queue_size``.
|
||||
"""
|
||||
if os.environ.get("DECNET_BUS_ENABLED", "true").lower() == "false":
|
||||
from decnet.bus.fake import NullBus
|
||||
return NullBus()
|
||||
|
||||
bus_type = os.environ.get("DECNET_BUS_TYPE", "unix").lower()
|
||||
|
||||
if bus_type == "unix":
|
||||
from decnet.bus.unix_client import UnixSocketBus
|
||||
socket_path = kwargs.pop("socket_path", None) or _default_socket_path()
|
||||
bus: BaseBus = UnixSocketBus(socket_path=socket_path, **kwargs)
|
||||
elif bus_type == "fake":
|
||||
from decnet.bus.fake import FakeBus
|
||||
bus = FakeBus(**kwargs)
|
||||
else:
|
||||
raise ValueError(f"Unsupported bus type: {bus_type}")
|
||||
|
||||
return _maybe_wrap_telemetry(bus)
|
||||
|
||||
|
||||
def _default_socket_path() -> str:
|
||||
"""Return the bus socket path honoring ``DECNET_BUS_SOCKET`` and falling
|
||||
back to ``/run/decnet/bus.sock`` → ``~/.decnet/bus.sock``.
|
||||
|
||||
The runtime path (``/run/decnet``) is preferred because systemd
|
||||
``RuntimeDirectory=decnet`` sets it up with the right perms; the home
|
||||
fallback keeps dev boxes usable without systemd.
|
||||
"""
|
||||
explicit = os.environ.get("DECNET_BUS_SOCKET")
|
||||
if explicit:
|
||||
return explicit
|
||||
|
||||
runtime_dir = "/run/decnet"
|
||||
if os.path.isdir(runtime_dir) and os.access(runtime_dir, os.W_OK):
|
||||
return f"{runtime_dir}/bus.sock"
|
||||
return os.path.expanduser("~/.decnet/bus.sock")
|
||||
|
||||
|
||||
def _maybe_wrap_telemetry(bus: BaseBus) -> BaseBus:
|
||||
"""Wrap *bus* in a tracing proxy if OTEL is enabled, else return as-is.
|
||||
|
||||
Uses :func:`decnet.telemetry.wrap_repository` as the underlying proxy —
|
||||
its implementation is generic (wraps any async method in a span), so we
|
||||
reuse it with a bus-appropriate tracer name. If telemetry isn't wired
|
||||
up at all we no-op.
|
||||
"""
|
||||
try:
|
||||
from decnet.telemetry import wrap_repository # type: ignore[attr-defined]
|
||||
except ImportError:
|
||||
return bus
|
||||
try:
|
||||
return wrap_repository(bus)
|
||||
except Exception: # pragma: no cover - defensive
|
||||
return bus
|
||||
183
decnet/bus/fake.py
Normal file
183
decnet/bus/fake.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""In-process bus transports.
|
||||
|
||||
* :class:`FakeBus` — real pub/sub semantics without touching a socket. Used
|
||||
by unit tests and anywhere ``DECNET_BUS_TYPE=fake`` is set. Lets code
|
||||
that depends on the bus be exercised entirely inside a single event loop,
|
||||
matching the DECNET testing convention of not opening real network
|
||||
sockets from unit tests.
|
||||
* :class:`NullBus` — no-op. Returned by :func:`~decnet.bus.factory.get_bus`
|
||||
when ``DECNET_BUS_ENABLED=false`` so workers can start cleanly in dev
|
||||
environments where no bus daemon is running. Publishes are dropped;
|
||||
subscriptions yield nothing and close cleanly.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any
|
||||
|
||||
from decnet.bus.base import (
|
||||
BaseBus,
|
||||
Event,
|
||||
Subscription,
|
||||
_CLOSE_SENTINEL,
|
||||
matches,
|
||||
)
|
||||
from decnet.logging import get_logger
|
||||
|
||||
log = get_logger("bus.fake")
|
||||
|
||||
# Per-subscriber bounded queue: backpressure policy is drop-oldest so a slow
|
||||
# consumer cannot stall publishers (the invariant — DB is the source of
|
||||
# truth — makes dropped events acceptable).
|
||||
_DEFAULT_QUEUE_SIZE = 1024
|
||||
|
||||
|
||||
# ─── FakeBus ─────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class _FakeSubscription(Subscription):
|
||||
"""Subscription backed by an :class:`asyncio.Queue` fed from
|
||||
:meth:`FakeBus.publish`. Unregisters itself on close."""
|
||||
|
||||
def __init__(self, bus: "FakeBus", pattern: str, queue: "asyncio.Queue[Any]") -> None:
|
||||
super().__init__(pattern)
|
||||
self._bus = bus
|
||||
self._queue = queue
|
||||
|
||||
async def __anext__(self) -> Event:
|
||||
if self._closed:
|
||||
raise StopAsyncIteration
|
||||
item = await self._queue.get()
|
||||
if item is _CLOSE_SENTINEL:
|
||||
raise StopAsyncIteration
|
||||
return item
|
||||
|
||||
async def _aclose(self) -> None:
|
||||
self._bus._unregister(self)
|
||||
# Unblock any pending __anext__ waiter.
|
||||
try:
|
||||
self._queue.put_nowait(_CLOSE_SENTINEL)
|
||||
except asyncio.QueueFull:
|
||||
pass
|
||||
|
||||
|
||||
class FakeBus(BaseBus):
|
||||
"""In-process pub/sub.
|
||||
|
||||
Publishes iterate every active subscription and enqueue the event on
|
||||
the ones whose pattern matches the topic. If a subscriber's queue is
|
||||
full, the oldest event is discarded to make room — same at-most-once
|
||||
semantics as the real UNIX-socket transport.
|
||||
"""
|
||||
|
||||
def __init__(self, queue_size: int = _DEFAULT_QUEUE_SIZE) -> None:
|
||||
self._queue_size = queue_size
|
||||
self._subs: list[_FakeSubscription] = []
|
||||
self._connected = False
|
||||
self._closed = False
|
||||
self._lock = asyncio.Lock()
|
||||
|
||||
async def connect(self) -> None:
|
||||
self._connected = True
|
||||
|
||||
async def publish(
|
||||
self,
|
||||
topic: str,
|
||||
payload: dict[str, Any],
|
||||
*,
|
||||
event_type: str = "",
|
||||
) -> None:
|
||||
if self._closed:
|
||||
raise RuntimeError("publish on closed bus")
|
||||
event = Event(topic=topic, payload=payload, type=event_type)
|
||||
async with self._lock:
|
||||
targets = [s for s in self._subs if matches(s.pattern, topic)]
|
||||
for sub in targets:
|
||||
_enqueue_drop_oldest(sub._queue, event)
|
||||
|
||||
def subscribe(self, pattern: str) -> Subscription:
|
||||
if self._closed:
|
||||
raise RuntimeError("subscribe on closed bus")
|
||||
queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=self._queue_size)
|
||||
sub = _FakeSubscription(self, pattern, queue)
|
||||
self._subs.append(sub)
|
||||
return sub
|
||||
|
||||
def _unregister(self, sub: _FakeSubscription) -> None:
|
||||
try:
|
||||
self._subs.remove(sub)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
async def close(self) -> None:
|
||||
if self._closed:
|
||||
return
|
||||
self._closed = True
|
||||
# Wake every still-open subscription so iterators unblock cleanly.
|
||||
for sub in list(self._subs):
|
||||
try:
|
||||
sub._queue.put_nowait(_CLOSE_SENTINEL)
|
||||
except asyncio.QueueFull:
|
||||
pass
|
||||
self._subs.clear()
|
||||
|
||||
|
||||
def _enqueue_drop_oldest(queue: "asyncio.Queue[Any]", event: Event) -> None:
|
||||
"""Put *event* on *queue*, dropping the oldest item if the queue is full.
|
||||
|
||||
Factored out so both FakeBus and the real UNIX server share the exact
|
||||
same backpressure policy.
|
||||
"""
|
||||
while True:
|
||||
try:
|
||||
queue.put_nowait(event)
|
||||
return
|
||||
except asyncio.QueueFull:
|
||||
try:
|
||||
dropped = queue.get_nowait()
|
||||
log.warning(
|
||||
"bus.fake: subscriber queue full, dropped %s", getattr(dropped, "topic", "?")
|
||||
)
|
||||
except asyncio.QueueEmpty:
|
||||
return
|
||||
|
||||
|
||||
# ─── NullBus ─────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class _NullSubscription(Subscription):
|
||||
"""A subscription that never yields and closes immediately on iteration."""
|
||||
|
||||
async def __anext__(self) -> Event:
|
||||
raise StopAsyncIteration
|
||||
|
||||
async def _aclose(self) -> None:
|
||||
return
|
||||
|
||||
|
||||
class NullBus(BaseBus):
|
||||
"""No-op bus used when ``DECNET_BUS_ENABLED=false``.
|
||||
|
||||
Publishes are silently dropped; subscriptions are empty. Intended for
|
||||
dev environments where no bus daemon is running — the process starts
|
||||
cleanly, code that publishes doesn't need feature flags, and nothing
|
||||
ever blocks on a subscriber.
|
||||
"""
|
||||
|
||||
async def connect(self) -> None:
|
||||
return
|
||||
|
||||
async def publish(
|
||||
self,
|
||||
topic: str,
|
||||
payload: dict[str, Any],
|
||||
*,
|
||||
event_type: str = "",
|
||||
) -> None:
|
||||
return
|
||||
|
||||
def subscribe(self, pattern: str) -> Subscription:
|
||||
return _NullSubscription(pattern)
|
||||
|
||||
async def close(self) -> None:
|
||||
return
|
||||
144
decnet/bus/protocol.py
Normal file
144
decnet/bus/protocol.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""Wire protocol for the DECNET bus UNIX-socket transport.
|
||||
|
||||
Frame layout:
|
||||
|
||||
<VERB> [<args ...>]\\n # ASCII header, single line, no trailing space
|
||||
<4-byte big-endian body length>
|
||||
<body> # orjson-serialized dict, or empty (length 0)
|
||||
|
||||
Verbs:
|
||||
|
||||
* ``HELLO <client-name>`` — optional greeting, logged by server. Body empty.
|
||||
* ``PUB <topic>`` — publisher → server. Body = payload dict.
|
||||
* ``SUB <pattern>`` — subscriber → server. Body empty.
|
||||
* ``UNSUB <pattern>`` — subscriber → server. Body empty.
|
||||
* ``EVT <topic>`` — server → subscriber. Body = payload dict (wrapped
|
||||
in an :class:`~decnet.bus.base.Event` envelope).
|
||||
* ``BYE`` — either direction. Body empty. Graceful shutdown.
|
||||
|
||||
Parsing rules:
|
||||
|
||||
* The header is a single line terminated by ``\\n`` (LF). ``\\r`` is tolerated
|
||||
but not required.
|
||||
* Header tokens are whitespace-separated. The first token is the verb;
|
||||
everything after is verb-specific. We split on the first space only so
|
||||
topics / patterns with quoted content are not supported (they are not
|
||||
needed — topic segments forbid whitespace per :mod:`decnet.bus.topics`).
|
||||
* Maximum header length is 4096 bytes; maximum body length is 1 MiB. Beyond
|
||||
those, the connection is dropped with a logged error. This is a honeypot
|
||||
framework, not a general-purpose message broker; a malformed frame is
|
||||
treated as hostile.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import struct
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
import orjson
|
||||
|
||||
MAX_HEADER_BYTES = 4096
|
||||
MAX_BODY_BYTES = 1 * 1024 * 1024 # 1 MiB
|
||||
|
||||
# Verb constants (callers should reference these, not bare strings).
|
||||
HELLO = "HELLO"
|
||||
PUB = "PUB"
|
||||
SUB = "SUB"
|
||||
UNSUB = "UNSUB"
|
||||
EVT = "EVT"
|
||||
BYE = "BYE"
|
||||
|
||||
_VALID_VERBS = frozenset({HELLO, PUB, SUB, UNSUB, EVT, BYE})
|
||||
|
||||
|
||||
class ProtocolError(Exception):
|
||||
"""Malformed or oversized frame. Callers should close the connection."""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Frame:
|
||||
"""A parsed frame. ``body`` is the raw (unparsed) body bytes — callers
|
||||
decide whether to orjson-decode it (the protocol does not know whether
|
||||
a given verb expects a dict body or an empty one).
|
||||
"""
|
||||
|
||||
verb: str
|
||||
args: str # everything after the verb on the header line, trimmed
|
||||
body: bytes
|
||||
|
||||
|
||||
def encode(verb: str, args: str = "", body: dict[str, Any] | None = None) -> bytes:
|
||||
"""Serialize a frame.
|
||||
|
||||
*body* is a dict that will be orjson-encoded, or ``None`` for an empty
|
||||
body. The header line is written verbatim — callers must supply args
|
||||
that are free of ``\\n``.
|
||||
"""
|
||||
if verb not in _VALID_VERBS:
|
||||
raise ProtocolError(f"unknown verb {verb!r}")
|
||||
if "\n" in args or "\r" in args:
|
||||
raise ProtocolError("args must not contain newline characters")
|
||||
|
||||
body_bytes = b"" if body is None else orjson.dumps(body)
|
||||
if len(body_bytes) > MAX_BODY_BYTES:
|
||||
raise ProtocolError(
|
||||
f"body {len(body_bytes)} bytes exceeds max {MAX_BODY_BYTES}"
|
||||
)
|
||||
|
||||
header = f"{verb} {args}".rstrip() + "\n"
|
||||
header_bytes = header.encode("ascii")
|
||||
if len(header_bytes) > MAX_HEADER_BYTES:
|
||||
raise ProtocolError(
|
||||
f"header {len(header_bytes)} bytes exceeds max {MAX_HEADER_BYTES}"
|
||||
)
|
||||
return header_bytes + struct.pack(">I", len(body_bytes)) + body_bytes
|
||||
|
||||
|
||||
async def read_frame(reader: asyncio.StreamReader) -> Frame | None:
|
||||
"""Read one frame from *reader*.
|
||||
|
||||
Returns ``None`` on clean EOF before a new frame starts. Raises
|
||||
:class:`ProtocolError` on malformed input (caller should close the
|
||||
connection).
|
||||
"""
|
||||
try:
|
||||
header = await reader.readuntil(b"\n")
|
||||
except asyncio.IncompleteReadError as exc:
|
||||
if not exc.partial:
|
||||
return None
|
||||
raise ProtocolError("connection closed mid-header") from exc
|
||||
except asyncio.LimitOverrunError as exc:
|
||||
raise ProtocolError("header exceeded buffer limit") from exc
|
||||
|
||||
if len(header) > MAX_HEADER_BYTES:
|
||||
raise ProtocolError(f"header {len(header)} bytes exceeds max")
|
||||
|
||||
line = header.rstrip(b"\r\n").decode("ascii", errors="strict")
|
||||
if not line:
|
||||
raise ProtocolError("empty header line")
|
||||
|
||||
verb, _, args = line.partition(" ")
|
||||
if verb not in _VALID_VERBS:
|
||||
raise ProtocolError(f"unknown verb {verb!r}")
|
||||
|
||||
length_bytes = await reader.readexactly(4)
|
||||
(body_len,) = struct.unpack(">I", length_bytes)
|
||||
if body_len > MAX_BODY_BYTES:
|
||||
raise ProtocolError(f"body length {body_len} exceeds max")
|
||||
|
||||
body = await reader.readexactly(body_len) if body_len else b""
|
||||
return Frame(verb=verb, args=args.strip(), body=body)
|
||||
|
||||
|
||||
def decode_body(body: bytes) -> dict[str, Any]:
|
||||
"""Decode a frame body as a JSON dict. Empty body → empty dict."""
|
||||
if not body:
|
||||
return {}
|
||||
try:
|
||||
obj = orjson.loads(body)
|
||||
except orjson.JSONDecodeError as exc:
|
||||
raise ProtocolError(f"body is not valid JSON: {exc}") from exc
|
||||
if not isinstance(obj, dict):
|
||||
raise ProtocolError(f"body must be a JSON object, got {type(obj).__name__}")
|
||||
return obj
|
||||
211
decnet/bus/publish.py
Normal file
211
decnet/bus/publish.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""Fire-and-forget publish helpers shared across every worker.
|
||||
|
||||
Lifted out of ``decnet/mutator/engine.py`` once a second caller showed up
|
||||
(DEBT-031). Keeping one implementation means the "never break the worker
|
||||
loop" guarantee is audited in exactly one place.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import os
|
||||
import signal
|
||||
import time
|
||||
from typing import Any, Callable
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.base import BaseBus
|
||||
from decnet.logging import get_logger
|
||||
|
||||
log = get_logger("bus.publish")
|
||||
|
||||
|
||||
async def publish_safely(
|
||||
bus: BaseBus | None,
|
||||
topic: str,
|
||||
payload: dict[str, Any],
|
||||
event_type: str = "",
|
||||
) -> None:
|
||||
"""Publish on *bus* without ever raising back at the caller.
|
||||
|
||||
The DB row (or equivalent side-effect) has already been committed by
|
||||
the time a worker calls this; the bus is the notification layer, not
|
||||
the source of truth. A dropped publish is at most a few seconds of
|
||||
UI latency until the next poll tick. A raised exception here, by
|
||||
contrast, would crash the worker — which is strictly worse.
|
||||
"""
|
||||
if bus is None:
|
||||
return
|
||||
try:
|
||||
await bus.publish(topic, payload, event_type=event_type)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning("bus publish failed topic=%s: %s", topic, exc)
|
||||
|
||||
|
||||
def make_thread_safe_publisher(
|
||||
bus: BaseBus | None,
|
||||
loop: asyncio.AbstractEventLoop,
|
||||
) -> Callable[[str, dict[str, Any], str], None]:
|
||||
"""Build a sync callable that marshals publishes back to *loop*.
|
||||
|
||||
Workers that run their hot paths in a worker thread (scapy sniff loop,
|
||||
``asyncio.to_thread`` probes, blocking socket reads) cannot ``await``
|
||||
the bus directly. This helper returns a plain function that schedules
|
||||
the publish on *loop* via ``run_coroutine_threadsafe`` and returns
|
||||
immediately — the calling thread is never blocked on the publish.
|
||||
|
||||
A ``None`` bus yields a no-op callable, matching the degraded-mode
|
||||
contract the rest of this module already upholds.
|
||||
"""
|
||||
if bus is None:
|
||||
return lambda _topic, _payload, _event_type="": None
|
||||
|
||||
def _publish(topic: str, payload: dict[str, Any], event_type: str = "") -> None:
|
||||
# Stream threads may keep draining after the bus owner closed it
|
||||
# (shutdown race). Short-circuit here so we don't marshal a
|
||||
# coroutine onto a dead loop just to have publish_safely swallow
|
||||
# it. bus.publish's own WARN-once guard handles the rare case
|
||||
# where _closed flips between this check and the coroutine
|
||||
# actually running.
|
||||
if getattr(bus, "_closed", False):
|
||||
return
|
||||
try:
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
publish_safely(bus, topic, payload, event_type=event_type),
|
||||
loop,
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.debug("cross-thread bus publish failed topic=%s: %s", topic, exc)
|
||||
|
||||
return _publish
|
||||
|
||||
|
||||
async def run_health_heartbeat(
|
||||
bus: BaseBus | None,
|
||||
worker: str,
|
||||
*,
|
||||
interval: float = 30.0,
|
||||
extra: Callable[[], dict[str, Any]] | None = None,
|
||||
) -> None:
|
||||
"""Publish ``system.<worker>.health`` every *interval* seconds.
|
||||
|
||||
Standard heartbeat loop shared across agent/forwarder/updater. Emits
|
||||
``{"worker": <name>, "ts": <unix-ts>, **extra()}`` on each tick. A
|
||||
``None`` bus turns the loop into a no-op sleep cycle — still cancellable
|
||||
so the caller can use the same ``asyncio.create_task``/``.cancel()``
|
||||
pattern regardless of bus state.
|
||||
|
||||
Cancellation-safe: unwraps the ``CancelledError`` so callers awaiting
|
||||
the task during shutdown see a clean exit.
|
||||
"""
|
||||
topic = _topics.system_health(worker)
|
||||
with contextlib.suppress(asyncio.CancelledError):
|
||||
while True:
|
||||
payload: dict[str, Any] = {"worker": worker, "ts": time.time()}
|
||||
if extra is not None:
|
||||
try:
|
||||
payload.update(extra())
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.debug("heartbeat extra() failed worker=%s: %s", worker, exc)
|
||||
await publish_safely(bus, topic, payload, event_type=_topics.SYSTEM_HEALTH)
|
||||
await asyncio.sleep(interval)
|
||||
|
||||
|
||||
async def run_control_listener(
|
||||
bus: BaseBus | None,
|
||||
worker: str,
|
||||
shutdown: asyncio.Event,
|
||||
) -> None:
|
||||
"""Subscribe to ``system.<worker>.control`` and honour stop intents.
|
||||
|
||||
On a well-formed ``{"action": "stop", ...}`` message the function sets
|
||||
*shutdown* and returns — the worker's main loop is expected to check
|
||||
the event and unwind cleanly, matching the SIGTERM path.
|
||||
|
||||
Malformed payloads (missing/unknown action, non-dict, exception from
|
||||
the transport) are logged and ignored. A ``None`` bus yields a noop
|
||||
coroutine that simply awaits *shutdown* — callers can ``create_task``
|
||||
this unconditionally regardless of bus state.
|
||||
|
||||
Cancellation-safe.
|
||||
"""
|
||||
if bus is None:
|
||||
with contextlib.suppress(asyncio.CancelledError):
|
||||
await shutdown.wait()
|
||||
return
|
||||
|
||||
topic = _topics.system_control(worker)
|
||||
with contextlib.suppress(asyncio.CancelledError):
|
||||
try:
|
||||
async with bus.subscribe(topic) as sub:
|
||||
async for event in sub:
|
||||
payload = event.payload or {}
|
||||
action = payload.get("action")
|
||||
requested_by = payload.get("requested_by", "<unknown>")
|
||||
if action == _topics.WORKER_CONTROL_STOP:
|
||||
log.info(
|
||||
"control: stop requested worker=%s by=%s",
|
||||
worker, requested_by,
|
||||
)
|
||||
shutdown.set()
|
||||
return
|
||||
log.debug(
|
||||
"control: ignoring unknown action worker=%s action=%r",
|
||||
worker, action,
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning(
|
||||
"control listener failed worker=%s: %s — shutdown via bus disabled",
|
||||
worker, exc,
|
||||
)
|
||||
|
||||
|
||||
async def run_control_listener_signal(
|
||||
bus: BaseBus | None,
|
||||
worker: str,
|
||||
) -> None:
|
||||
"""Like :func:`run_control_listener` but signals the process on stop.
|
||||
|
||||
Preferred for workers whose main loop is a blocking thread
|
||||
(container-log tail, PTY read, scapy sniff) — wiring an
|
||||
``asyncio.Event`` through the thread boundary is error-prone, and
|
||||
every DECNET worker already has systemd-equivalent SIGTERM cleanup.
|
||||
A SIGTERM self-signal routes the stop through that same path
|
||||
without inventing a second shutdown mechanism.
|
||||
|
||||
Cancellation-safe. Never raises: a failed self-signal is logged
|
||||
and the loop simply exits (admin can fall back to ``systemctl``).
|
||||
"""
|
||||
if bus is None:
|
||||
return
|
||||
|
||||
topic = _topics.system_control(worker)
|
||||
with contextlib.suppress(asyncio.CancelledError):
|
||||
try:
|
||||
async with bus.subscribe(topic) as sub:
|
||||
async for event in sub:
|
||||
payload = event.payload or {}
|
||||
action = payload.get("action")
|
||||
requested_by = payload.get("requested_by", "<unknown>")
|
||||
if action == _topics.WORKER_CONTROL_STOP:
|
||||
log.info(
|
||||
"control: stop requested worker=%s by=%s → SIGTERM self",
|
||||
worker, requested_by,
|
||||
)
|
||||
try:
|
||||
os.kill(os.getpid(), signal.SIGTERM)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning(
|
||||
"control: self-signal failed worker=%s: %s",
|
||||
worker, exc,
|
||||
)
|
||||
return
|
||||
log.debug(
|
||||
"control: ignoring unknown action worker=%s action=%r",
|
||||
worker, action,
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning(
|
||||
"control signal listener failed worker=%s: %s",
|
||||
worker, exc,
|
||||
)
|
||||
398
decnet/bus/topics.py
Normal file
398
decnet/bus/topics.py
Normal file
@@ -0,0 +1,398 @@
|
||||
"""Canonical topic hierarchy for the DECNET ServiceBus.
|
||||
|
||||
Locked early so consumers can subscribe with stable wildcard patterns.
|
||||
Adding new topic families is fine; **renaming** existing ones is a breaking
|
||||
change for every subscriber and requires a coordinated rollout.
|
||||
|
||||
Token structure (NATS-style, dot-separated):
|
||||
|
||||
topology.{topology_id}.mutation.{state}
|
||||
topology.{topology_id}.status
|
||||
decky.{decky_id}.state
|
||||
decky.{decky_id}.traffic
|
||||
orchestrator.traffic.{decky_id}
|
||||
orchestrator.file.{decky_id}
|
||||
orchestrator.email.{decky_id}
|
||||
attacker.observed
|
||||
attacker.scored
|
||||
attacker.session.started
|
||||
attacker.session.ended
|
||||
identity.formed
|
||||
identity.observation.linked
|
||||
identity.merged
|
||||
identity.unmerged
|
||||
identity.campaign.assigned
|
||||
campaign.formed
|
||||
campaign.identity.assigned
|
||||
campaign.merged
|
||||
campaign.unmerged
|
||||
credential.captured
|
||||
credential.reuse.detected
|
||||
canary.{token_id}.triggered
|
||||
canary.{token_id}.placed
|
||||
canary.{token_id}.revoked
|
||||
system.log
|
||||
system.bus.health
|
||||
system.{worker}.health
|
||||
|
||||
Wildcards (per :func:`decnet.bus.base.matches`):
|
||||
|
||||
* ``*`` matches exactly one token.
|
||||
* ``>`` matches one-or-more trailing tokens (so ``topology.>`` matches
|
||||
``topology.abc.status`` but not the bare root ``topology``).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
# ─── Root prefixes ───────────────────────────────────────────────────────────
|
||||
|
||||
TOPOLOGY = "topology"
|
||||
DECKY = "decky"
|
||||
ATTACKER = "attacker"
|
||||
IDENTITY = "identity"
|
||||
CAMPAIGN = "campaign"
|
||||
SYSTEM = "system"
|
||||
CREDENTIAL = "credential"
|
||||
ORCHESTRATOR = "orchestrator"
|
||||
CANARY = "canary"
|
||||
|
||||
|
||||
# ─── Leaf event-type constants (the last segment of each topic) ──────────────
|
||||
|
||||
# Topology mutation lifecycle states — keep in sync with TopologyMutation.state
|
||||
# in decnet/web/db/models.py; the bus topic mirrors the DB state machine.
|
||||
MUTATION_ENQUEUED = "enqueued"
|
||||
MUTATION_APPLYING = "applying"
|
||||
MUTATION_APPLIED = "applied"
|
||||
MUTATION_FAILED = "failed"
|
||||
|
||||
# Topology-level status transitions (topology.{id}.status): fires when the
|
||||
# topology row's status column changes (pending/deploying/active/degraded/failed).
|
||||
TOPOLOGY_STATUS = "status"
|
||||
|
||||
# Decky-level event types (second token).
|
||||
DECKY_STATE = "state"
|
||||
DECKY_TRAFFIC = "traffic"
|
||||
# On-demand mutation request — published by the API/CLI/UI, consumed by
|
||||
# the mutator's watch loop to force an immediate mutation of one decky
|
||||
# without waiting for its scheduled interval. Underscored (not dotted)
|
||||
# to stay a single NATS token so the builder's validator accepts it.
|
||||
DECKY_MUTATE_REQUEST = "mutate_request"
|
||||
# Mutation transition event — distinct from DECKY_STATE ("current
|
||||
# shape") because a mutation is a *transition* that carries old/new
|
||||
# services + trigger + timing. Correlator consumes these (via the
|
||||
# syslog sidechannel too) to interleave substrate-change markers into
|
||||
# attacker traversals.
|
||||
DECKY_MUTATION = "mutation"
|
||||
|
||||
# Attacker event types (second token under the ``attacker`` root). First
|
||||
# sighting, session boundary transitions, and score-threshold crossings
|
||||
# published by correlator + profiler. Consumers typically subscribe to
|
||||
# the wildcard ``attacker.>``.
|
||||
ATTACKER_OBSERVED = "observed"
|
||||
ATTACKER_SCORED = "scored"
|
||||
# Published once per successful active probe result (JARM/HASSH/TCPfp).
|
||||
# Distinct from ``observed`` which is the correlator's first-sight signal —
|
||||
# a fingerprint is additional evidence about an already-observed attacker.
|
||||
ATTACKER_FINGERPRINTED = "fingerprinted"
|
||||
ATTACKER_SESSION_STARTED = "session.started"
|
||||
ATTACKER_SESSION_ENDED = "session.ended"
|
||||
# Published by the ``decnet enrich`` worker after an enrichment pass
|
||||
# succeeds for an attacker IP (one or more 3rd-party intel providers
|
||||
# returned a verdict). Payload carries the aggregate verdict + per-
|
||||
# provider summary so SIEM-bound webhooks don't need to re-query the DB.
|
||||
ATTACKER_INTEL_ENRICHED = "intel.enriched"
|
||||
|
||||
# Identity-resolution event types (second/third tokens under ``identity``).
|
||||
# Published by the (future) clusterer worker — see
|
||||
# development/IDENTITY_RESOLUTION.md. Constants ship in this commit;
|
||||
# no publishers exist yet, but consumers (webhook worker, dashboard
|
||||
# SSE relay) can subscribe to ``identity.>`` from day one and receive
|
||||
# events the instant the clusterer comes online.
|
||||
#
|
||||
# identity.formed — clusterer creates a new identity from
|
||||
# one or more observations
|
||||
# identity.observation.linked — observation attached to an existing
|
||||
# identity (or reattached from another)
|
||||
# identity.merged — two identities collapsed; loser gets
|
||||
# ``merged_into_uuid`` set, subscribers
|
||||
# re-key cached references to the winner
|
||||
# identity.unmerged — revocable-merge undo: contradicting
|
||||
# evidence cleared ``merged_into_uuid``
|
||||
# and re-split observations. The
|
||||
# resurrected side's UUID is the same
|
||||
# as the prior loser, so subscribers
|
||||
# that cached references to the loser
|
||||
# during the merged interval can
|
||||
# re-attach without a new lookup.
|
||||
#
|
||||
# ``identity.campaign.assigned`` is deferred; it ships when the campaign
|
||||
# clusterer ships. YAGNI before then.
|
||||
IDENTITY_FORMED = "formed"
|
||||
IDENTITY_OBSERVATION_LINKED = "observation.linked"
|
||||
IDENTITY_MERGED = "merged"
|
||||
IDENTITY_UNMERGED = "unmerged"
|
||||
# Campaign-clusterer cross-family event — fires under ``identity.>`` so
|
||||
# identity-stream subscribers (e.g. the IdentityDetail SSE client) get
|
||||
# notified the moment an identity's ``campaign_id`` changes without
|
||||
# having to subscribe to the campaign topic family. The same event
|
||||
# fires under ``campaign.identity.assigned`` for campaign-side
|
||||
# subscribers.
|
||||
IDENTITY_CAMPAIGN_ASSIGNED = "campaign.assigned"
|
||||
|
||||
# Campaign-clusterer event types (second/third tokens under
|
||||
# ``campaign``). Mirror of the identity family at the layer above:
|
||||
# campaigns group identities into operations, and the clusterer
|
||||
# publishes the same form / link / merge / unmerge lifecycle.
|
||||
#
|
||||
# campaign.formed — clusterer creates a new campaign from
|
||||
# one or more identities
|
||||
# campaign.identity.assigned — identity attached to an existing
|
||||
# campaign (or reassigned from another)
|
||||
# campaign.merged — two campaigns collapsed; loser gets
|
||||
# ``merged_into_uuid`` set, subscribers
|
||||
# re-key cached references to the winner
|
||||
# campaign.unmerged — revocable-merge undo: contradicting
|
||||
# evidence cleared ``merged_into_uuid``
|
||||
# and re-split identities
|
||||
CAMPAIGN_FORMED = "formed"
|
||||
CAMPAIGN_IDENTITY_ASSIGNED = "identity.assigned"
|
||||
CAMPAIGN_MERGED = "merged"
|
||||
CAMPAIGN_UNMERGED = "unmerged"
|
||||
|
||||
# Credential event types (second/third tokens under ``credential``).
|
||||
# ``credential.captured`` fires once per upserted Credential row — the
|
||||
# correlator listens for it and runs the cred-reuse query in response,
|
||||
# so reuse detection latency is sub-second after a fresh capture.
|
||||
# ``credential.reuse.detected`` fires when the correlator inserts a new
|
||||
# CredentialReuse row or grows an existing one (added decky/service/IP).
|
||||
CREDENTIAL_CAPTURED = "captured"
|
||||
CREDENTIAL_REUSE_DETECTED = "reuse.detected"
|
||||
|
||||
# Canary-token event types (third token under ``canary``).
|
||||
#
|
||||
# canary.{token_id}.placed — orchestrator/API successfully planted a
|
||||
# canary artifact inside a decky's
|
||||
# filesystem (or persisted a passive token
|
||||
# that has no callback wiring). Lets
|
||||
# dashboards reflect baseline coverage in
|
||||
# real time without a DB poll.
|
||||
# canary.{token_id}.triggered — ``decnet canary`` worker observed a
|
||||
# callback hit (HTTP slug or DNS subdomain
|
||||
# lookup) for the token. Payload carries
|
||||
# ``src_ip``, ``user_agent``, ``request_path``
|
||||
# and any DNS qname so downstream
|
||||
# consumers (correlator, webhook fanout)
|
||||
# can attribute and forward without a
|
||||
# follow-up DB read.
|
||||
# canary.{token_id}.revoked — operator removed a token; planter unlinked
|
||||
# the file (best-effort) and the row was
|
||||
# marked ``revoked``. Subscribers may
|
||||
# evict cached lookups by token id.
|
||||
CANARY_PLACED = "placed"
|
||||
CANARY_TRIGGERED = "triggered"
|
||||
CANARY_REVOKED = "revoked"
|
||||
|
||||
# Orchestrator event types (second token under ``orchestrator``). The
|
||||
# orchestrator worker publishes one of these per synthetic action it
|
||||
# drives against a decky — cheap inter-decky traffic and filesystem
|
||||
# mutations whose role is to keep the honeypot from looking suspiciously
|
||||
# static. Always nested with the destination decky uuid as the third
|
||||
# token, so consumers can subscribe to a single decky's life-injection
|
||||
# stream via ``orchestrator.*.<decky_uuid>``.
|
||||
ORCHESTRATOR_TRAFFIC = "traffic"
|
||||
ORCHESTRATOR_FILE = "file"
|
||||
# Emailgen — published by the ``decnet emailgen`` worker once per generated
|
||||
# fake email delivered into a mail decky's maildir. Third token is the
|
||||
# destination mail-decky uuid (the IMAP/POP3 host serving the mailbox),
|
||||
# matching the ``orchestrator.*.<decky_uuid>`` subscription pattern.
|
||||
ORCHESTRATOR_EMAIL = "email"
|
||||
|
||||
# System event types.
|
||||
SYSTEM_LOG = "log"
|
||||
SYSTEM_BUS_HEALTH = "bus.health"
|
||||
# Worker-health leaf — built per-worker as ``system.<worker>.health`` via
|
||||
# :func:`system_health`. The leaf constant stays the same across workers;
|
||||
# the worker name goes in the middle token.
|
||||
SYSTEM_HEALTH = "health"
|
||||
# Worker-control leaf — built per-worker as ``system.<worker>.control`` via
|
||||
# :func:`system_control`. Admin-originated stop intents travel on this
|
||||
# topic; each worker subscribes to its own.
|
||||
SYSTEM_CONTROL = "control"
|
||||
|
||||
# Control payload ``action`` values — the wire vocabulary. Only ``stop`` is
|
||||
# handled in v1; ``start`` is reserved because a stopped worker has no
|
||||
# subscriber, so starting requires external supervision (systemd).
|
||||
WORKER_CONTROL_STOP = "stop"
|
||||
WORKER_CONTROL_START = "start"
|
||||
|
||||
# Webhook subscription-set changed — published by the CRUD router after any
|
||||
# create / update / delete on WebhookSubscription so the webhook worker can
|
||||
# reload its in-memory subscription list and re-subscribe to the new union
|
||||
# of patterns. Payload is currently empty; consumers only need the signal.
|
||||
WEBHOOK_SUBSCRIPTIONS_CHANGED = "system.webhook.subscriptions_changed"
|
||||
|
||||
|
||||
# ─── Builders ────────────────────────────────────────────────────────────────
|
||||
|
||||
def topology_mutation(topology_id: str, state: str) -> str:
|
||||
"""Build ``topology.<id>.mutation.<state>``.
|
||||
|
||||
*state* should be one of the ``MUTATION_*`` constants.
|
||||
"""
|
||||
_reject_tokens(topology_id, state)
|
||||
return f"{TOPOLOGY}.{topology_id}.mutation.{state}"
|
||||
|
||||
|
||||
def topology_status(topology_id: str) -> str:
|
||||
"""Build ``topology.<id>.status``."""
|
||||
_reject_tokens(topology_id)
|
||||
return f"{TOPOLOGY}.{topology_id}.{TOPOLOGY_STATUS}"
|
||||
|
||||
|
||||
def decky(decky_id: str, event_type: str) -> str:
|
||||
"""Build ``decky.<id>.<event_type>``.
|
||||
|
||||
*event_type* is typically one of ``DECKY_STATE`` or ``DECKY_TRAFFIC``.
|
||||
"""
|
||||
_reject_tokens(decky_id, event_type)
|
||||
return f"{DECKY}.{decky_id}.{event_type}"
|
||||
|
||||
|
||||
def decky_mutation(decky_id: str) -> str:
|
||||
"""Build ``decky.<id>.mutation``."""
|
||||
_reject_tokens(decky_id)
|
||||
return f"{DECKY}.{decky_id}.{DECKY_MUTATION}"
|
||||
|
||||
|
||||
def system(event_type: str) -> str:
|
||||
"""Build ``system.<event_type>``.
|
||||
|
||||
*event_type* may itself contain dots (e.g. ``bus.health``) — we don't
|
||||
re-validate the already-constant leaves; this just prefixes.
|
||||
"""
|
||||
if not event_type:
|
||||
raise ValueError("system topic requires a non-empty event_type")
|
||||
return f"{SYSTEM}.{event_type}"
|
||||
|
||||
|
||||
def credential(event_type: str) -> str:
|
||||
"""Build ``credential.<event_type>``.
|
||||
|
||||
*event_type* is typically one of :data:`CREDENTIAL_CAPTURED` or
|
||||
:data:`CREDENTIAL_REUSE_DETECTED`. Dotted leaves
|
||||
(``reuse.detected``) are permitted — same rationale as
|
||||
:func:`system`.
|
||||
"""
|
||||
if not event_type:
|
||||
raise ValueError("credential topic requires a non-empty event_type")
|
||||
return f"{CREDENTIAL}.{event_type}"
|
||||
|
||||
|
||||
def attacker(event_type: str) -> str:
|
||||
"""Build ``attacker.<event_type>``.
|
||||
|
||||
*event_type* is typically one of ``ATTACKER_OBSERVED``,
|
||||
``ATTACKER_SCORED``, ``ATTACKER_SESSION_STARTED``,
|
||||
``ATTACKER_SESSION_ENDED``. Dotted leaves (``session.started``) are
|
||||
permitted — same rationale as :func:`system`.
|
||||
"""
|
||||
if not event_type:
|
||||
raise ValueError("attacker topic requires a non-empty event_type")
|
||||
return f"{ATTACKER}.{event_type}"
|
||||
|
||||
|
||||
def campaign(event_type: str) -> str:
|
||||
"""Build ``campaign.<event_type>``.
|
||||
|
||||
*event_type* is typically one of :data:`CAMPAIGN_FORMED`,
|
||||
:data:`CAMPAIGN_IDENTITY_ASSIGNED`, :data:`CAMPAIGN_MERGED`, or
|
||||
:data:`CAMPAIGN_UNMERGED`. Dotted leaves (``identity.assigned``)
|
||||
are permitted — same rationale as :func:`system`.
|
||||
"""
|
||||
if not event_type:
|
||||
raise ValueError("campaign topic requires a non-empty event_type")
|
||||
return f"{CAMPAIGN}.{event_type}"
|
||||
|
||||
|
||||
def identity(event_type: str) -> str:
|
||||
"""Build ``identity.<event_type>``.
|
||||
|
||||
*event_type* is typically one of :data:`IDENTITY_FORMED`,
|
||||
:data:`IDENTITY_OBSERVATION_LINKED`, :data:`IDENTITY_MERGED`, or
|
||||
:data:`IDENTITY_UNMERGED`. Dotted leaves (``observation.linked``)
|
||||
are permitted — same rationale as :func:`system`.
|
||||
"""
|
||||
if not event_type:
|
||||
raise ValueError("identity topic requires a non-empty event_type")
|
||||
return f"{IDENTITY}.{event_type}"
|
||||
|
||||
|
||||
def orchestrator(event_type: str, decky_id: str) -> str:
|
||||
"""Build ``orchestrator.<event_type>.<decky_id>``.
|
||||
|
||||
*event_type* should be one of :data:`ORCHESTRATOR_TRAFFIC` or
|
||||
:data:`ORCHESTRATOR_FILE`. The destination decky is always the
|
||||
third token so per-decky subscribers can use
|
||||
``orchestrator.*.<decky_uuid>``.
|
||||
"""
|
||||
_reject_tokens(event_type, decky_id)
|
||||
return f"{ORCHESTRATOR}.{event_type}.{decky_id}"
|
||||
|
||||
|
||||
def canary(token_id: str, event_type: str) -> str:
|
||||
"""Build ``canary.<token_id>.<event_type>``.
|
||||
|
||||
*event_type* should be one of :data:`CANARY_PLACED`,
|
||||
:data:`CANARY_TRIGGERED`, or :data:`CANARY_REVOKED`. The token id
|
||||
is always the second token so per-token subscribers can use
|
||||
``canary.<token_id>.>`` and fleet-wide consumers (webhook fanout,
|
||||
correlator) use ``canary.>``.
|
||||
"""
|
||||
_reject_tokens(token_id, event_type)
|
||||
return f"{CANARY}.{token_id}.{event_type}"
|
||||
|
||||
|
||||
def system_health(worker: str) -> str:
|
||||
"""Build ``system.<worker>.health``.
|
||||
|
||||
Worker-health heartbeats live as a nested leaf under ``system`` so
|
||||
consumers can subscribe to ``system.*.health`` for every worker at
|
||||
once, or to ``system.mutator.health`` for a single one. *worker* is
|
||||
validated as a regular segment — no dots, wildcards, or whitespace.
|
||||
"""
|
||||
_reject_tokens(worker)
|
||||
return f"{SYSTEM}.{worker}.{SYSTEM_HEALTH}"
|
||||
|
||||
|
||||
def system_control(worker: str) -> str:
|
||||
"""Build ``system.<worker>.control``.
|
||||
|
||||
Admin-originated stop (and, eventually, start) intents are published
|
||||
here; the worker in question subscribes to its own address and reacts.
|
||||
Payload shape::
|
||||
|
||||
{"action": "stop", "requested_by": "<username>", "ts": <unix>}
|
||||
|
||||
*action* must be one of :data:`WORKER_CONTROL_STOP` /
|
||||
:data:`WORKER_CONTROL_START`; any other value is ignored by the
|
||||
listener. Same segment rules as :func:`system_health`.
|
||||
"""
|
||||
_reject_tokens(worker)
|
||||
return f"{SYSTEM}.{worker}.{SYSTEM_CONTROL}"
|
||||
|
||||
|
||||
def _reject_tokens(*parts: str) -> None:
|
||||
"""Reject topic segments that would break NATS-style tokenization.
|
||||
|
||||
Dots, wildcards, whitespace, and empty strings in a *segment* would
|
||||
silently corrupt the hierarchy (e.g. ``topology.a.b.status`` for a
|
||||
``topology_id`` of ``"a.b"``). Raise early at the builder instead of
|
||||
shipping a malformed topic to the wire.
|
||||
"""
|
||||
for p in parts:
|
||||
if not p:
|
||||
raise ValueError("topic segment must not be empty")
|
||||
if "." in p or "*" in p or ">" in p or any(c.isspace() for c in p):
|
||||
raise ValueError(
|
||||
f"topic segment {p!r} may not contain '.', '*', '>', or whitespace"
|
||||
)
|
||||
257
decnet/bus/unix_client.py
Normal file
257
decnet/bus/unix_client.py
Normal file
@@ -0,0 +1,257 @@
|
||||
"""UNIX-socket client — :class:`UnixSocketBus` implementation of :class:`BaseBus`.
|
||||
|
||||
Holds one open socket to the local :class:`~decnet.bus.unix_server.BusServer`.
|
||||
Operations:
|
||||
|
||||
* :meth:`publish` writes a single ``PUB`` frame and returns; no ack.
|
||||
* :meth:`subscribe` writes a ``SUB`` frame and returns a
|
||||
:class:`~decnet.bus.base.Subscription` backed by an :class:`asyncio.Queue`
|
||||
that the background reader task feeds.
|
||||
|
||||
One background reader task per bus instance dispatches incoming ``EVT``
|
||||
frames to every registered subscription whose pattern matches the topic.
|
||||
On connection drop or close, every subscription is woken via a sentinel so
|
||||
iterators unblock cleanly; callers see :class:`StopAsyncIteration` from the
|
||||
``async for`` loop.
|
||||
|
||||
No auto-reconnect in MVP. If the server restarts, callers must
|
||||
:meth:`close` the bus and construct a new one. This mirrors how other
|
||||
DECNET workers handle their dependencies — the systemd ``Restart=on-failure``
|
||||
supervision above us is the retry loop.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import os
|
||||
import pathlib
|
||||
from typing import Any
|
||||
|
||||
from decnet.bus import protocol
|
||||
from decnet.bus.base import (
|
||||
BaseBus,
|
||||
Event,
|
||||
Subscription,
|
||||
_CLOSE_SENTINEL,
|
||||
matches,
|
||||
)
|
||||
from decnet.bus.fake import _enqueue_drop_oldest as _enqueue_event_drop_oldest
|
||||
from decnet.logging import get_logger
|
||||
|
||||
log = get_logger("bus.client")
|
||||
|
||||
_INBOUND_QUEUE_SIZE = 1024
|
||||
|
||||
|
||||
class _UnixSubscription(Subscription):
|
||||
def __init__(
|
||||
self,
|
||||
bus: "UnixSocketBus",
|
||||
pattern: str,
|
||||
queue: "asyncio.Queue[Any]",
|
||||
) -> None:
|
||||
super().__init__(pattern)
|
||||
self._bus = bus
|
||||
self._queue = queue
|
||||
|
||||
async def __anext__(self) -> Event:
|
||||
if self._closed:
|
||||
raise StopAsyncIteration
|
||||
item = await self._queue.get()
|
||||
if item is _CLOSE_SENTINEL:
|
||||
raise StopAsyncIteration
|
||||
return item
|
||||
|
||||
async def _aclose(self) -> None:
|
||||
await self._bus._unregister(self)
|
||||
try:
|
||||
self._queue.put_nowait(_CLOSE_SENTINEL)
|
||||
except asyncio.QueueFull:
|
||||
pass
|
||||
|
||||
|
||||
class UnixSocketBus(BaseBus):
|
||||
"""Client handle for a local :class:`BusServer`.
|
||||
|
||||
One instance per process typically; multiple instances simply open
|
||||
multiple sockets to the same server. Connection is lazy — the first
|
||||
:meth:`connect` (or any publish/subscribe call via ``async with``)
|
||||
opens the socket.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
socket_path: pathlib.Path | str,
|
||||
*,
|
||||
client_name: str | None = None,
|
||||
) -> None:
|
||||
self._path = pathlib.Path(socket_path)
|
||||
self._client_name = client_name or f"decnet-bus-client[{os.getpid()}]"
|
||||
self._reader: asyncio.StreamReader | None = None
|
||||
self._writer: asyncio.StreamWriter | None = None
|
||||
self._reader_task: asyncio.Task[None] | None = None
|
||||
self._subs: list[_UnixSubscription] = []
|
||||
self._lock = asyncio.Lock()
|
||||
self._write_lock = asyncio.Lock()
|
||||
self._closed = False
|
||||
# Sticky flag: the first publish-on-closed-bus call logs at
|
||||
# WARNING so operators see that a publish was dropped; subsequent
|
||||
# calls on the same instance log at DEBUG only to prevent a
|
||||
# log flood when stream threads drain after close. The bus is
|
||||
# critical infra, so the first warning is non-negotiable.
|
||||
self._closed_publish_warned = False
|
||||
|
||||
# ─── Lifecycle ──────────────────────────────────────────────────────────
|
||||
|
||||
async def connect(self) -> None:
|
||||
if self._writer is not None:
|
||||
return
|
||||
if self._closed:
|
||||
raise RuntimeError("connect on closed bus")
|
||||
self._reader, self._writer = await asyncio.open_unix_connection(str(self._path))
|
||||
await self._send(protocol.encode(protocol.HELLO, args=self._client_name))
|
||||
self._reader_task = asyncio.create_task(self._reader_loop())
|
||||
log.debug("bus.client: connected to %s as %s", self._path, self._client_name)
|
||||
|
||||
async def close(self) -> None:
|
||||
if self._closed:
|
||||
return
|
||||
self._closed = True
|
||||
|
||||
# Best-effort BYE — we don't care if it fails.
|
||||
if self._writer is not None and not self._writer.is_closing():
|
||||
with contextlib.suppress(Exception):
|
||||
await self._send(protocol.encode(protocol.BYE))
|
||||
|
||||
if self._reader_task is not None:
|
||||
self._reader_task.cancel()
|
||||
with contextlib.suppress(asyncio.CancelledError):
|
||||
await self._reader_task
|
||||
self._reader_task = None
|
||||
|
||||
if self._writer is not None:
|
||||
with contextlib.suppress(Exception):
|
||||
self._writer.close()
|
||||
await self._writer.wait_closed()
|
||||
self._writer = None
|
||||
self._reader = None
|
||||
|
||||
# Wake every subscription so `async for` exits.
|
||||
for sub in list(self._subs):
|
||||
with contextlib.suppress(asyncio.QueueFull):
|
||||
sub._queue.put_nowait(_CLOSE_SENTINEL)
|
||||
self._subs.clear()
|
||||
|
||||
# ─── Pub/Sub ────────────────────────────────────────────────────────────
|
||||
|
||||
async def publish(
|
||||
self,
|
||||
topic: str,
|
||||
payload: dict[str, Any],
|
||||
*,
|
||||
event_type: str = "",
|
||||
) -> None:
|
||||
if self._closed:
|
||||
# Degrade gracefully: the DB is the source of truth, the bus
|
||||
# is only the notification layer. Raising here made every
|
||||
# caller via publish_safely flood the logs once per stream
|
||||
# line during shutdown races. First drop warns loudly;
|
||||
# subsequent drops on the same instance are DEBUG-only.
|
||||
if not self._closed_publish_warned:
|
||||
self._closed_publish_warned = True
|
||||
log.warning(
|
||||
"bus.client: publish on closed bus dropped topic=%s "
|
||||
"(further drops on this instance logged at DEBUG)",
|
||||
topic,
|
||||
)
|
||||
else:
|
||||
log.debug("bus.client: publish on closed bus dropped topic=%s", topic)
|
||||
return
|
||||
if self._writer is None:
|
||||
await self.connect()
|
||||
body = Event(topic=topic, payload=payload, type=event_type).to_dict()
|
||||
try:
|
||||
await self._send(protocol.encode(protocol.PUB, args=topic, body=body))
|
||||
except (ConnectionError, BrokenPipeError) as exc:
|
||||
# Bus loss is a logged warning, never a publisher crash. The
|
||||
# DB-as-source-of-truth invariant means the work is already
|
||||
# persisted; the missing event is just a missed notification.
|
||||
log.warning("bus.client: publish failed: %s", exc)
|
||||
|
||||
def subscribe(self, pattern: str) -> Subscription:
|
||||
if self._closed:
|
||||
raise RuntimeError("subscribe on closed bus")
|
||||
queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=_INBOUND_QUEUE_SIZE)
|
||||
sub = _UnixSubscription(self, pattern, queue)
|
||||
self._subs.append(sub)
|
||||
# Schedule the SUB frame asynchronously so subscribe() stays sync,
|
||||
# matching the BaseBus signature. The caller will shortly `async
|
||||
# with` / `async for` the subscription, which will run the event
|
||||
# loop and pick this task up.
|
||||
asyncio.ensure_future(self._send_sub(pattern))
|
||||
return sub
|
||||
|
||||
async def _send_sub(self, pattern: str) -> None:
|
||||
try:
|
||||
if self._writer is None:
|
||||
await self.connect()
|
||||
await self._send(protocol.encode(protocol.SUB, args=pattern))
|
||||
except Exception as exc: # pragma: no cover - network paths in live tests
|
||||
log.warning("bus.client: SUB %s failed: %s", pattern, exc)
|
||||
|
||||
async def _unregister(self, sub: _UnixSubscription) -> None:
|
||||
try:
|
||||
self._subs.remove(sub)
|
||||
except ValueError:
|
||||
return
|
||||
# Tell the server we no longer want events for this pattern if no
|
||||
# other local subscription still wants it.
|
||||
if not any(s.pattern == sub.pattern for s in self._subs):
|
||||
with contextlib.suppress(Exception):
|
||||
await self._send(protocol.encode(protocol.UNSUB, args=sub.pattern))
|
||||
|
||||
# ─── Internal I/O ───────────────────────────────────────────────────────
|
||||
|
||||
async def _send(self, frame_bytes: bytes) -> None:
|
||||
if self._writer is None:
|
||||
raise ConnectionError("bus.client: not connected")
|
||||
async with self._write_lock:
|
||||
self._writer.write(frame_bytes)
|
||||
await self._writer.drain()
|
||||
|
||||
async def _reader_loop(self) -> None:
|
||||
if self._reader is None:
|
||||
return
|
||||
try:
|
||||
while True:
|
||||
frame = await protocol.read_frame(self._reader)
|
||||
if frame is None:
|
||||
break
|
||||
if frame.verb != protocol.EVT:
|
||||
# Clients only ever legitimately receive EVT (or BYE).
|
||||
if frame.verb == protocol.BYE:
|
||||
break
|
||||
log.warning("bus.client: unexpected verb from server: %s", frame.verb)
|
||||
continue
|
||||
topic = frame.args
|
||||
data = protocol.decode_body(frame.body) if frame.body else {}
|
||||
event = Event.from_dict(topic, data)
|
||||
self._dispatch(event)
|
||||
except protocol.ProtocolError as exc:
|
||||
log.warning("bus.client: protocol error: %s", exc)
|
||||
except (asyncio.IncompleteReadError, ConnectionError):
|
||||
pass
|
||||
except asyncio.CancelledError:
|
||||
raise
|
||||
except Exception: # pragma: no cover
|
||||
log.exception("bus.client: reader loop crashed")
|
||||
finally:
|
||||
# Server-side close — wake every subscription.
|
||||
for sub in list(self._subs):
|
||||
with contextlib.suppress(asyncio.QueueFull):
|
||||
sub._queue.put_nowait(_CLOSE_SENTINEL)
|
||||
|
||||
def _dispatch(self, event: Event) -> None:
|
||||
for sub in self._subs:
|
||||
if matches(sub.pattern, event.topic):
|
||||
_enqueue_event_drop_oldest(sub._queue, event)
|
||||
309
decnet/bus/unix_server.py
Normal file
309
decnet/bus/unix_server.py
Normal file
@@ -0,0 +1,309 @@
|
||||
"""UNIX-socket server for the DECNET bus.
|
||||
|
||||
One :class:`BusServer` per host. Accepts local connections on a UNIX-domain
|
||||
socket; each connection may:
|
||||
|
||||
* publish events (``PUB`` frames) that the server fans out to all matching
|
||||
subscribers on other connections, and
|
||||
* subscribe to patterns (``SUB`` frames) and receive matching events as
|
||||
``EVT`` frames.
|
||||
|
||||
Authorization is socket file permissions (0660, group=``decnet`` if that
|
||||
POSIX group exists, else the server process's own group). Anything the
|
||||
kernel lets ``connect()`` is trusted — there is no verb-level auth. This
|
||||
matches the "local processes on the same host" threat model; cross-host
|
||||
federation is out of scope (see DEBT-029).
|
||||
|
||||
Backpressure is per-connection, drop-oldest: if a subscriber can't drain its
|
||||
outbound queue fast enough, the server discards the oldest pending event
|
||||
rather than blocking publishers. The bus is at-most-once by contract, so
|
||||
drops are acceptable; stalled publishers are not.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import grp
|
||||
import os
|
||||
import pathlib
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from decnet.bus import protocol
|
||||
from decnet.bus.base import Event, matches
|
||||
from decnet.logging import get_logger
|
||||
|
||||
log = get_logger("bus.server")
|
||||
|
||||
_SOCKET_MODE = 0o660
|
||||
_DEFAULT_GROUP = "decnet"
|
||||
_OUTBOUND_QUEUE_SIZE = 1024
|
||||
|
||||
|
||||
@dataclass(eq=False)
|
||||
class _Connection:
|
||||
"""Per-connection server state."""
|
||||
|
||||
writer: asyncio.StreamWriter
|
||||
peer_name: str = "<unknown>"
|
||||
patterns: set[str] = field(default_factory=set)
|
||||
outbound: asyncio.Queue[bytes] = field(
|
||||
default_factory=lambda: asyncio.Queue(maxsize=_OUTBOUND_QUEUE_SIZE)
|
||||
)
|
||||
closed: bool = False
|
||||
|
||||
|
||||
class BusServer:
|
||||
"""Serve a UNIX-socket bus on *socket_path*.
|
||||
|
||||
Lifecycle: construct → :meth:`start` → :meth:`serve_forever` (or rely
|
||||
on :meth:`start` returning once bound) → :meth:`close` for teardown.
|
||||
Safe to :meth:`close` multiple times.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
socket_path: pathlib.Path | str,
|
||||
*,
|
||||
group: str | None = _DEFAULT_GROUP,
|
||||
mode: int = _SOCKET_MODE,
|
||||
) -> None:
|
||||
self._path = pathlib.Path(socket_path)
|
||||
self._group = group
|
||||
self._mode = mode
|
||||
self._server: asyncio.base_events.Server | None = None
|
||||
self._connections: set[_Connection] = set()
|
||||
self._closed = False
|
||||
|
||||
# ─── Lifecycle ──────────────────────────────────────────────────────────
|
||||
|
||||
async def start(self) -> None:
|
||||
"""Bind the socket and begin accepting connections.
|
||||
|
||||
Removes any stale socket file at *socket_path* first (common case:
|
||||
the previous worker crashed without cleaning up). The parent
|
||||
directory must already exist; we do NOT create it blindly because
|
||||
the chosen directory (typically ``/run/decnet``) may require
|
||||
systemd ``RuntimeDirectory=`` to set up.
|
||||
"""
|
||||
if self._server is not None:
|
||||
return
|
||||
|
||||
parent = self._path.parent
|
||||
if not parent.exists():
|
||||
raise FileNotFoundError(
|
||||
f"bus socket parent directory {parent} does not exist; "
|
||||
f"create it with systemd RuntimeDirectory= or mkdir"
|
||||
)
|
||||
|
||||
# Clean up a stale socket from a previous crash. If a live server
|
||||
# is actually listening there, ``bind()`` below will fail — we do
|
||||
# not try to detect live vs. stale ourselves.
|
||||
with contextlib.suppress(FileNotFoundError):
|
||||
if self._path.is_socket():
|
||||
self._path.unlink()
|
||||
|
||||
self._server = await asyncio.start_unix_server(
|
||||
self._handle_connection, path=str(self._path),
|
||||
)
|
||||
_chmod_and_chown(self._path, self._mode, self._group)
|
||||
log.info("bus.server: listening on %s (mode=%o group=%s)",
|
||||
self._path, self._mode, self._group or "<inherit>")
|
||||
|
||||
async def serve_forever(self) -> None:
|
||||
if self._server is None:
|
||||
raise RuntimeError("BusServer not started")
|
||||
async with self._server:
|
||||
await self._server.serve_forever()
|
||||
|
||||
async def close(self) -> None:
|
||||
if self._closed:
|
||||
return
|
||||
self._closed = True
|
||||
|
||||
if self._server is not None:
|
||||
self._server.close()
|
||||
with contextlib.suppress(Exception):
|
||||
await self._server.wait_closed()
|
||||
self._server = None
|
||||
|
||||
# Drain every live connection.
|
||||
for conn in list(self._connections):
|
||||
await self._close_connection(conn)
|
||||
self._connections.clear()
|
||||
|
||||
with contextlib.suppress(FileNotFoundError):
|
||||
self._path.unlink()
|
||||
log.info("bus.server: closed")
|
||||
|
||||
# ─── Internal publish fan-out ───────────────────────────────────────────
|
||||
|
||||
async def publish(self, topic: str, payload: dict[str, Any], event_type: str = "") -> None:
|
||||
"""Server-side publish helper — used by the worker to emit
|
||||
``system.bus.health`` heartbeats without opening a client loop."""
|
||||
event = Event(topic=topic, payload=payload, type=event_type)
|
||||
self._fanout(event)
|
||||
|
||||
# ─── Connection handler ─────────────────────────────────────────────────
|
||||
|
||||
async def _handle_connection(
|
||||
self,
|
||||
reader: asyncio.StreamReader,
|
||||
writer: asyncio.StreamWriter,
|
||||
) -> None:
|
||||
conn = _Connection(writer=writer)
|
||||
self._connections.add(conn)
|
||||
writer_task = asyncio.create_task(self._writer_loop(conn))
|
||||
try:
|
||||
await self._reader_loop(conn, reader)
|
||||
except protocol.ProtocolError as exc:
|
||||
log.warning("bus.server: protocol error from %s: %s", conn.peer_name, exc)
|
||||
except (asyncio.IncompleteReadError, ConnectionError) as exc:
|
||||
log.debug("bus.server: %s disconnected: %s", conn.peer_name, exc)
|
||||
except Exception: # pragma: no cover - defensive
|
||||
log.exception("bus.server: unhandled error in connection")
|
||||
finally:
|
||||
await self._close_connection(conn)
|
||||
self._connections.discard(conn)
|
||||
writer_task.cancel()
|
||||
with contextlib.suppress(asyncio.CancelledError):
|
||||
await writer_task
|
||||
|
||||
async def _reader_loop(
|
||||
self, conn: _Connection, reader: asyncio.StreamReader,
|
||||
) -> None:
|
||||
while True:
|
||||
frame = await protocol.read_frame(reader)
|
||||
if frame is None:
|
||||
return
|
||||
await self._dispatch(conn, frame)
|
||||
if frame.verb == protocol.BYE:
|
||||
return
|
||||
|
||||
async def _dispatch(self, conn: _Connection, frame: protocol.Frame) -> None:
|
||||
if frame.verb == protocol.HELLO:
|
||||
conn.peer_name = frame.args or conn.peer_name
|
||||
log.debug("bus.server: HELLO from %s", conn.peer_name)
|
||||
return
|
||||
if frame.verb == protocol.SUB:
|
||||
pattern = frame.args
|
||||
if not pattern:
|
||||
raise protocol.ProtocolError("SUB requires a pattern")
|
||||
conn.patterns.add(pattern)
|
||||
log.debug("bus.server: %s SUB %s", conn.peer_name, pattern)
|
||||
return
|
||||
if frame.verb == protocol.UNSUB:
|
||||
conn.patterns.discard(frame.args)
|
||||
return
|
||||
if frame.verb == protocol.PUB:
|
||||
topic = frame.args
|
||||
if not topic:
|
||||
raise protocol.ProtocolError("PUB requires a topic")
|
||||
data = protocol.decode_body(frame.body) if frame.body else {}
|
||||
event = Event(
|
||||
topic=topic,
|
||||
payload=data.get("payload", {}) or {},
|
||||
type=data.get("type", "") or "",
|
||||
)
|
||||
self._fanout(event, origin=conn)
|
||||
return
|
||||
if frame.verb == protocol.BYE:
|
||||
return
|
||||
# EVT is server-to-client only; receiving one is a protocol violation.
|
||||
raise protocol.ProtocolError(f"unexpected verb {frame.verb!r} from client")
|
||||
|
||||
def _fanout(self, event: Event, *, origin: _Connection | None = None) -> None:
|
||||
"""Enqueue *event* as an EVT frame on every matching connection.
|
||||
|
||||
We do NOT deliver back to the originating connection (a publisher
|
||||
does not receive its own event). Encoding happens once per event,
|
||||
not once per subscriber.
|
||||
"""
|
||||
try:
|
||||
frame_bytes = protocol.encode(
|
||||
protocol.EVT, args=event.topic, body=event.to_dict(),
|
||||
)
|
||||
except protocol.ProtocolError:
|
||||
log.exception("bus.server: failed to encode EVT for topic=%s", event.topic)
|
||||
return
|
||||
|
||||
for conn in self._connections:
|
||||
if conn is origin or conn.closed:
|
||||
continue
|
||||
if not any(matches(p, event.topic) for p in conn.patterns):
|
||||
continue
|
||||
_enqueue_drop_oldest(conn.outbound, frame_bytes, event.topic)
|
||||
|
||||
async def _writer_loop(self, conn: _Connection) -> None:
|
||||
"""Serialize writes onto *conn*'s socket.
|
||||
|
||||
One writer task per connection so a slow peer only blocks its own
|
||||
queue, not the fan-out loop. The queue is bounded with drop-oldest
|
||||
policy applied at enqueue time (see :func:`_enqueue_drop_oldest`).
|
||||
"""
|
||||
try:
|
||||
while not conn.closed:
|
||||
data = await conn.outbound.get()
|
||||
conn.writer.write(data)
|
||||
await conn.writer.drain()
|
||||
except (ConnectionError, BrokenPipeError):
|
||||
log.debug("bus.server: %s writer: peer closed", conn.peer_name)
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
except Exception: # pragma: no cover - defensive
|
||||
log.exception("bus.server: writer loop crashed for %s", conn.peer_name)
|
||||
|
||||
async def _close_connection(self, conn: _Connection) -> None:
|
||||
if conn.closed:
|
||||
return
|
||||
conn.closed = True
|
||||
with contextlib.suppress(Exception):
|
||||
conn.writer.close()
|
||||
await conn.writer.wait_closed()
|
||||
|
||||
|
||||
# ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def _chmod_and_chown(path: pathlib.Path, mode: int, group: str | None) -> None:
|
||||
"""Apply socket file perms and best-effort group ownership.
|
||||
|
||||
If *group* is ``None`` or the named group does not exist, we leave the
|
||||
socket owned by the current process group. This keeps the server
|
||||
usable on dev boxes that don't have a ``decnet`` group set up.
|
||||
"""
|
||||
try:
|
||||
os.chmod(path, mode)
|
||||
except OSError as exc:
|
||||
log.warning("bus.server: chmod(%s, %o) failed: %s", path, mode, exc)
|
||||
|
||||
if not group:
|
||||
return
|
||||
try:
|
||||
gid = grp.getgrnam(group).gr_gid
|
||||
except KeyError:
|
||||
log.debug("bus.server: group %r not found, leaving socket group unchanged", group)
|
||||
return
|
||||
try:
|
||||
os.chown(path, -1, gid)
|
||||
except PermissionError:
|
||||
# Dev box running as an unprivileged user can't chown. Log once at
|
||||
# debug and move on — the socket is still usable by the owner.
|
||||
log.debug("bus.server: chown(%s, gid=%d) denied; leaving as-is", path, gid)
|
||||
except OSError as exc:
|
||||
log.warning("bus.server: chown(%s, gid=%d) failed: %s", path, gid, exc)
|
||||
|
||||
|
||||
def _enqueue_drop_oldest(
|
||||
queue: "asyncio.Queue[bytes]", data: bytes, topic: str,
|
||||
) -> None:
|
||||
"""Drop-oldest backpressure — mirrors :func:`decnet.bus.fake._enqueue_drop_oldest`."""
|
||||
while True:
|
||||
try:
|
||||
queue.put_nowait(data)
|
||||
return
|
||||
except asyncio.QueueFull:
|
||||
try:
|
||||
queue.get_nowait()
|
||||
log.warning("bus.server: subscriber queue full, dropped event topic=%s", topic)
|
||||
except asyncio.QueueEmpty:
|
||||
return
|
||||
121
decnet/bus/worker.py
Normal file
121
decnet/bus/worker.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""``decnet bus`` worker entrypoint.
|
||||
|
||||
Starts a :class:`~decnet.bus.unix_server.BusServer` on the configured UNIX
|
||||
socket and serves forever, emitting a ``system.bus.health`` heartbeat on
|
||||
its own bus every :data:`HEARTBEAT_INTERVAL_SEC` seconds so liveness-aware
|
||||
consumers (dashboards, watchdogs) can tell the bus is up without polling
|
||||
the filesystem.
|
||||
|
||||
Cross-host federation is **out of scope** for the MVP; each host runs its
|
||||
own bus independently. See DEBT-029 for the deferred ``--bridge-tcp``
|
||||
mode that would proxy the socket over the swarm mTLS channel.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import pathlib
|
||||
import signal
|
||||
import time
|
||||
|
||||
from decnet.bus import topics
|
||||
from decnet.bus.unix_server import BusServer
|
||||
from decnet.logging import get_logger
|
||||
|
||||
log = get_logger("bus.worker")
|
||||
|
||||
HEARTBEAT_INTERVAL_SEC = 10
|
||||
|
||||
|
||||
async def bus_worker(
|
||||
socket_path: str | pathlib.Path,
|
||||
*,
|
||||
group: str | None = "decnet",
|
||||
heartbeat_interval: int = HEARTBEAT_INTERVAL_SEC,
|
||||
) -> None:
|
||||
"""Run the bus server until cancelled or SIGTERM/SIGINT is received.
|
||||
|
||||
The parent directory of *socket_path* must already exist (systemd's
|
||||
``RuntimeDirectory=decnet`` handles this in prod; dev code is expected
|
||||
to ``mkdir`` first). This function does not create it implicitly
|
||||
because the right choice of perms/owner depends on the deployment
|
||||
context.
|
||||
"""
|
||||
path = pathlib.Path(socket_path)
|
||||
_ensure_parent(path)
|
||||
|
||||
server = BusServer(path, group=group)
|
||||
await server.start()
|
||||
log.info("bus.worker: pid=%d socket=%s", os.getpid(), path)
|
||||
|
||||
stop_event = asyncio.Event()
|
||||
_install_signal_handlers(stop_event)
|
||||
|
||||
heartbeat_task = asyncio.create_task(_heartbeat_loop(server, heartbeat_interval))
|
||||
serve_task = asyncio.create_task(server.serve_forever())
|
||||
|
||||
try:
|
||||
await stop_event.wait()
|
||||
log.info("bus.worker: shutdown signal received")
|
||||
finally:
|
||||
heartbeat_task.cancel()
|
||||
serve_task.cancel()
|
||||
for task in (heartbeat_task, serve_task):
|
||||
try:
|
||||
await task
|
||||
except (asyncio.CancelledError, Exception): # noqa: BLE001 - draining shutdown
|
||||
pass
|
||||
await server.close()
|
||||
log.info("bus.worker: stopped")
|
||||
|
||||
|
||||
async def _heartbeat_loop(server: BusServer, interval: int) -> None:
|
||||
"""Publish ``system.bus.health`` on the server's own fan-out."""
|
||||
started_at = time.time()
|
||||
while True:
|
||||
try:
|
||||
await server.publish(
|
||||
topics.system(topics.SYSTEM_BUS_HEALTH),
|
||||
{
|
||||
"pid": os.getpid(),
|
||||
"uptime_sec": round(time.time() - started_at, 3),
|
||||
"ts": time.time(),
|
||||
},
|
||||
event_type=topics.SYSTEM_BUS_HEALTH,
|
||||
)
|
||||
except Exception: # pragma: no cover - heartbeat must never kill the worker
|
||||
log.exception("bus.worker: heartbeat publish failed")
|
||||
await asyncio.sleep(interval)
|
||||
|
||||
|
||||
def _install_signal_handlers(stop_event: asyncio.Event) -> None:
|
||||
loop = asyncio.get_running_loop()
|
||||
for sig in (signal.SIGTERM, signal.SIGINT):
|
||||
try:
|
||||
loop.add_signal_handler(sig, stop_event.set)
|
||||
except (NotImplementedError, RuntimeError):
|
||||
# add_signal_handler is not supported on Windows / in some
|
||||
# test harnesses where the loop is running in a non-main thread.
|
||||
# The worker still exits via KeyboardInterrupt bubbling up.
|
||||
pass
|
||||
|
||||
|
||||
def _ensure_parent(path: pathlib.Path) -> None:
|
||||
parent = path.parent
|
||||
if parent.exists():
|
||||
return
|
||||
# Dev-box convenience: if the parent is the user's ``~/.decnet`` dir,
|
||||
# create it. We do not auto-mkdir ``/run/decnet`` — that's systemd's job
|
||||
# and silently creating it as the wrong user would cause permission
|
||||
# confusion later.
|
||||
home_prefix = pathlib.Path.home() / ".decnet"
|
||||
try:
|
||||
parent.relative_to(home_prefix.parent)
|
||||
except ValueError:
|
||||
raise FileNotFoundError(
|
||||
f"bus socket parent {parent} does not exist; create it first"
|
||||
)
|
||||
parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
__all__ = ["bus_worker", "HEARTBEAT_INTERVAL_SEC"]
|
||||
Reference in New Issue
Block a user