feat(bus): host-local UNIX-socket pub/sub worker (DEBT-029)
Land the `decnet bus` worker and `get_bus()` factory. Transport is a host-local UNIX-domain socket (0660, group=decnet); authz is the file mode. Wire framing is a tiny verb-line + 4-byte-BE length + orjson body. NATS-style wildcard topics (`*`, `>`). At-most-once, fire-and-forget — DB stays the source of truth. `FakeBus` / `NullBus` for tests and the disabled path. Cross-host federation is deferred to a future `--bridge-tcp` mode; DEBT-030 is master-only and unblocked.
This commit is contained in:
18
decnet/bus/__init__.py
Normal file
18
decnet/bus/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""DECNET ServiceBus — pub/sub notification substrate.
|
||||
|
||||
The bus is the notification layer for DECNET's worker constellation. The DB
|
||||
remains the source of truth for anything durable; the bus carries "something
|
||||
happened, go look" events. Delivery is at-most-once, fire-and-forget.
|
||||
|
||||
Consumers call :func:`get_bus` from :mod:`decnet.bus.factory`; never import
|
||||
transport implementations directly. The factory selects the backend via
|
||||
``DECNET_BUS_TYPE`` (``unix`` or ``fake``) and honors ``DECNET_BUS_ENABLED``.
|
||||
|
||||
Topic hierarchy is defined in :mod:`decnet.bus.topics` and locked early so
|
||||
consumers can subscribe with stable wildcard patterns.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.bus.base import BaseBus, Event, Subscription
|
||||
|
||||
__all__ = ["BaseBus", "Event", "Subscription"]
|
||||
205
decnet/bus/base.py
Normal file
205
decnet/bus/base.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""Bus abstractions: the :class:`Event` envelope and the :class:`BaseBus` ABC.
|
||||
|
||||
Every transport (UNIX socket, in-process fake, null) speaks this contract. The
|
||||
envelope is versioned (``v``) so future evolution never breaks deployed
|
||||
consumers that happen to see a newer event shape.
|
||||
|
||||
Subscription model: :meth:`BaseBus.subscribe` returns a :class:`Subscription`
|
||||
that is an async context manager AND an async iterator. The expected usage is:
|
||||
|
||||
async with bus.subscribe("topology.*.mutation.*") as sub:
|
||||
async for event in sub:
|
||||
handle(event)
|
||||
|
||||
Leaving the ``async with`` releases the underlying subscription handle; the
|
||||
transport is free to drop any buffered events after that point.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import abc
|
||||
import asyncio
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, AsyncIterator
|
||||
|
||||
# Version stamped into every envelope; bump on incompatible shape changes.
EVENT_SCHEMA_VERSION = 1


@dataclass(frozen=True)
class Event:
    """The bus envelope.

    ``v`` is the envelope schema version, bumped on incompatible shape
    changes. ``type`` is a short discriminator (``"mutation.applied"``,
    ``"decky.state"``) useful for consumers that subscribe to a broad
    wildcard and dispatch in Python; it is redundant with the trailing
    segments of ``topic`` but cheaper to inspect. ``ts`` is epoch seconds
    (float). ``id`` is a random UUID hex string so consumers can de-dupe if
    they ever see the same event twice (not expected at-most-once, but cheap
    insurance).
    """

    topic: str
    payload: dict[str, Any]
    type: str = ""
    v: int = EVENT_SCHEMA_VERSION
    ts: float = field(default_factory=time.time)
    id: str = field(default_factory=lambda: uuid.uuid4().hex)

    def to_dict(self) -> dict[str, Any]:
        """Return the wire-format dict (``v``, ``id``, ``topic``, ``type``,
        ``ts``, ``payload``) for this event."""
        return {
            "v": self.v,
            "id": self.id,
            "topic": self.topic,
            "type": self.type,
            "ts": self.ts,
            "payload": self.payload,
        }

    @classmethod
    def from_dict(cls, topic: str, data: dict[str, Any]) -> "Event":
        """Reconstruct an Event from a wire-format dict.

        ``topic`` is passed explicitly because the transport knows which
        subject the message arrived on; trusting a ``topic`` field from the
        wire would let a misbehaving publisher spoof events on topics they
        don't actually publish to.

        Missing *or null* fields fall back to defaults: empty payload, empty
        type, the current schema version, the current time, and a fresh id.
        A ``v`` or ``ts`` that is present but not numeric still raises
        ``ValueError`` / ``TypeError`` from the ``int`` / ``float``
        conversion.
        """
        raw_version = data.get("v")
        raw_ts = data.get("ts")
        return cls(
            topic=topic,
            payload=data.get("payload") or {},
            type=data.get("type") or "",
            # An explicit JSON null is treated like an absent field instead
            # of crashing in int()/float(); 0 is preserved as a real value.
            v=EVENT_SCHEMA_VERSION if raw_version is None else int(raw_version),
            ts=time.time() if raw_ts is None else float(raw_ts),
            id=data.get("id") or uuid.uuid4().hex,
        )
|
||||
|
||||
|
||||
class Subscription(abc.ABC):
    """Handle for one open subscription.

    Works both as an async context manager and as an async iterator.
    Transports subclass it and provide :meth:`__anext__` and
    :meth:`_aclose`; callers obtain instances from
    :meth:`BaseBus.subscribe` rather than constructing them.
    """

    def __init__(self, pattern: str) -> None:
        self.pattern = pattern
        self._closed = False

    # ─── Hooks each transport must implement ────────────────────────────────

    @abc.abstractmethod
    async def __anext__(self) -> Event:  # pragma: no cover - abstract
        raise NotImplementedError

    @abc.abstractmethod
    async def _aclose(self) -> None:  # pragma: no cover - abstract
        raise NotImplementedError

    # ─── Shared protocol plumbing ───────────────────────────────────────────

    def __aiter__(self) -> AsyncIterator[Event]:
        return self

    async def aclose(self) -> None:
        """Close the subscription; only the first call reaches ``_aclose``."""
        if not self._closed:
            # Flip the flag before awaiting so a re-entrant call is a no-op.
            self._closed = True
            await self._aclose()

    async def __aenter__(self) -> "Subscription":
        return self

    async def __aexit__(self, *exc_info: Any) -> None:
        await self.aclose()
|
||||
|
||||
|
||||
class BaseBus(abc.ABC):
    """Contract every pub/sub transport implements.

    ``connect()`` and ``close()`` MUST both tolerate being awaited more than
    once. Once a bus is closed, publishing or subscribing raises
    :class:`RuntimeError`.

    The bus is itself an async context manager: entering awaits
    :meth:`connect`, leaving awaits :meth:`close`.
    """

    async def __aenter__(self) -> "BaseBus":
        await self.connect()
        return self

    async def __aexit__(self, *exc_info: Any) -> None:
        await self.close()

    @abc.abstractmethod
    async def connect(self) -> None:
        """Acquire whatever network/transport resources are needed. Idempotent."""

    @abc.abstractmethod
    async def close(self) -> None:
        """Release transport resources. Idempotent."""

    @abc.abstractmethod
    async def publish(
        self,
        topic: str,
        payload: dict[str, Any],
        *,
        event_type: str = "",
    ) -> None:
        """Send *payload* on *topic* without waiting for any acknowledgement.

        Delivery is at-most-once. A transport error is logged and swallowed
        by the implementation rather than raised: losing a bus message must
        not take a worker down, because the DB is the source of truth.
        """

    @abc.abstractmethod
    def subscribe(self, pattern: str) -> Subscription:
        """Open a :class:`Subscription` for events whose topic matches *pattern*.

        Patterns use NATS wildcard semantics: ``*`` stands for exactly one
        topic token and ``>`` for one-or-more trailing tokens. Examples:

        * ``topology.*.mutation.applied`` — ``applied`` events of every
          topology.
        * ``topology.abc123.mutation.*`` — every mutation state of one
          topology.
        * ``topology.>`` — everything under the ``topology`` root.
        """
|
||||
|
||||
|
||||
# ─── Wildcard matching shared across in-process transports ───────────────────
|
||||
|
||||
def matches(pattern: str, topic: str) -> bool:
    """Return True iff *topic* matches *pattern* under NATS wildcard rules.

    ``*`` matches exactly one non-empty token; ``>`` matches one-or-more
    trailing tokens (so ``topology.>`` matches ``topology.abc.x`` but not
    ``topology`` alone). ``>`` is only meaningful as the final pattern
    token: a pattern with ``>`` anywhere else is malformed and matches
    nothing, rather than silently ignoring whatever follows the ``>``.
    """
    p_tokens = pattern.split(".")
    t_tokens = topic.split(".")
    last_index = len(p_tokens) - 1
    for i, p in enumerate(p_tokens):
        if p == ">":
            if i != last_index:
                # e.g. "a.>.b": tokens after ">" could never be honored.
                return False
            # Must have at least one token remaining to match.
            return i < len(t_tokens)
        if i >= len(t_tokens):
            # Pattern is longer than the topic.
            return False
        if p == "*":
            if not t_tokens[i]:
                return False
            continue
        if p != t_tokens[i]:
            return False
    # No tail wildcard: every token matched, so lengths must agree too.
    return len(p_tokens) == len(t_tokens)
|
||||
|
||||
|
||||
# Marker object pushed through the asyncio.Queue fan-out by the in-process
# transports to mean "no more events", so no separate control channel is
# needed. It never appears on the wire.
_CLOSE_SENTINEL: Any = object()


async def _next_or_stop(queue: "asyncio.Queue[Any]") -> Event:
    """Await the next queue item; raise ``StopAsyncIteration`` if it is the
    close sentinel, otherwise return it."""
    received = await queue.get()
    if received is not _CLOSE_SENTINEL:
        return received
    raise StopAsyncIteration
|
||||
85
decnet/bus/factory.py
Normal file
85
decnet/bus/factory.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""Bus factory — selects a :class:`~decnet.bus.base.BaseBus` implementation.
|
||||
|
||||
Dispatch key: the ``DECNET_BUS_TYPE`` environment variable.
|
||||
|
||||
* ``unix`` (default) → :class:`~decnet.bus.unix_client.UnixSocketBus`
|
||||
* ``fake`` → :class:`~decnet.bus.fake.FakeBus` (in-process)
|
||||
|
||||
If ``DECNET_BUS_ENABLED`` is ``"false"`` the factory short-circuits to
|
||||
:class:`~decnet.bus.fake.NullBus` regardless of ``DECNET_BUS_TYPE`` — a
|
||||
cheap way for dev environments to run workers without a bus daemon.
|
||||
|
||||
Mirrors :mod:`decnet.web.db.factory` (lazy imports inside each branch,
|
||||
env-driven dispatch, optional telemetry wrapping). Callers MUST use
|
||||
:func:`get_bus` rather than instantiating transports directly.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from decnet.bus.base import BaseBus
|
||||
|
||||
|
||||
def get_bus(**kwargs: Any) -> BaseBus:
    """Build the bus implementation the environment asks for.

    ``DECNET_BUS_ENABLED=false`` (case-insensitive) yields a ``NullBus`` and
    ignores everything else. Otherwise ``DECNET_BUS_TYPE`` (default
    ``unix``) picks the transport, and keyword arguments are forwarded to it:

    * ``UnixSocketBus`` accepts ``socket_path`` (overrides
      ``DECNET_BUS_SOCKET``) and ``client_name``.
    * ``FakeBus`` accepts ``queue_size``.

    Raises:
        ValueError: ``DECNET_BUS_TYPE`` names an unknown transport.
    """
    enabled_flag = os.environ.get("DECNET_BUS_ENABLED", "true").lower()
    if enabled_flag == "false":
        from decnet.bus.fake import NullBus

        return NullBus()

    bus_type = os.environ.get("DECNET_BUS_TYPE", "unix").lower()
    if bus_type not in ("unix", "fake"):
        raise ValueError(f"Unsupported bus type: {bus_type}")

    # Transport modules are imported lazily, only in the branch that needs them.
    bus: BaseBus
    if bus_type == "fake":
        from decnet.bus.fake import FakeBus

        bus = FakeBus(**kwargs)
    else:
        from decnet.bus.unix_client import UnixSocketBus

        socket_path = kwargs.pop("socket_path", None) or _default_socket_path()
        bus = UnixSocketBus(socket_path=socket_path, **kwargs)

    return _maybe_wrap_telemetry(bus)
|
||||
|
||||
|
||||
def _default_socket_path() -> str:
    """Resolve where the bus socket lives.

    Order of preference: a non-empty ``DECNET_BUS_SOCKET``; then
    ``/run/decnet/bus.sock`` when ``/run/decnet`` is an existing, writable
    directory; finally ``~/.decnet/bus.sock``.

    The runtime directory comes first because systemd
    ``RuntimeDirectory=decnet`` creates it with the right permissions; the
    home-directory fallback keeps dev boxes usable without systemd.
    """
    override = os.environ.get("DECNET_BUS_SOCKET")
    if override:
        return override

    runtime_dir = "/run/decnet"
    runtime_usable = os.path.isdir(runtime_dir) and os.access(runtime_dir, os.W_OK)
    if not runtime_usable:
        return os.path.expanduser("~/.decnet/bus.sock")
    return f"{runtime_dir}/bus.sock"
|
||||
|
||||
|
||||
def _maybe_wrap_telemetry(bus: BaseBus) -> BaseBus:
    """Return *bus* wrapped by ``decnet.telemetry.wrap_repository`` when that
    is possible, otherwise *bus* unchanged.

    The bus is handed back as-is both when ``decnet.telemetry`` cannot be
    imported and when the wrapping call itself raises — telemetry is
    best-effort and must never prevent a bus from being created.
    """
    try:
        from decnet.telemetry import wrap_repository  # type: ignore[attr-defined]
    except ImportError:
        return bus

    try:
        wrapped = wrap_repository(bus)
    except Exception:  # pragma: no cover - defensive
        wrapped = bus
    return wrapped
|
||||
183
decnet/bus/fake.py
Normal file
183
decnet/bus/fake.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""In-process bus transports.
|
||||
|
||||
* :class:`FakeBus` — real pub/sub semantics without touching a socket. Used
|
||||
by unit tests and anywhere ``DECNET_BUS_TYPE=fake`` is set. Lets code
|
||||
that depends on the bus be exercised entirely inside a single event loop,
|
||||
matching the DECNET testing convention of not opening real network
|
||||
sockets from unit tests.
|
||||
* :class:`NullBus` — no-op. Returned by :func:`~decnet.bus.factory.get_bus`
|
||||
when ``DECNET_BUS_ENABLED=false`` so workers can start cleanly in dev
|
||||
environments where no bus daemon is running. Publishes are dropped;
|
||||
subscriptions yield nothing and close cleanly.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any
|
||||
|
||||
from decnet.bus.base import (
|
||||
BaseBus,
|
||||
Event,
|
||||
Subscription,
|
||||
_CLOSE_SENTINEL,
|
||||
matches,
|
||||
)
|
||||
from decnet.logging import get_logger
|
||||
|
||||
log = get_logger("bus.fake")
|
||||
|
||||
# Per-subscriber bounded queue: backpressure policy is drop-oldest so a slow
|
||||
# consumer cannot stall publishers (the invariant — DB is the source of
|
||||
# truth — makes dropped events acceptable).
|
||||
_DEFAULT_QUEUE_SIZE = 1024
|
||||
|
||||
|
||||
# ─── FakeBus ─────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class _FakeSubscription(Subscription):
    """Queue-backed subscription for :class:`FakeBus`.

    :meth:`FakeBus.publish` feeds the queue; closing removes the
    subscription from the bus.
    """

    def __init__(self, bus: "FakeBus", pattern: str, queue: "asyncio.Queue[Any]") -> None:
        super().__init__(pattern)
        self._bus = bus
        self._queue = queue

    async def __anext__(self) -> Event:
        # A closed subscription ends iteration without touching the queue.
        if not self._closed:
            received = await self._queue.get()
            if received is not _CLOSE_SENTINEL:
                return received
        raise StopAsyncIteration

    async def _aclose(self) -> None:
        self._bus._unregister(self)
        # Wake a consumer that may be parked in __anext__. If the queue is
        # full the sentinel is skipped.
        try:
            self._queue.put_nowait(_CLOSE_SENTINEL)
        except asyncio.QueueFull:
            pass
|
||||
|
||||
|
||||
class FakeBus(BaseBus):
    """In-process pub/sub.

    Publishes iterate every active subscription and enqueue the event on
    the ones whose pattern matches the topic. If a subscriber's queue is
    full, the oldest event is discarded to make room — same at-most-once
    semantics as the real UNIX-socket transport.
    """

    def __init__(self, queue_size: int = _DEFAULT_QUEUE_SIZE) -> None:
        """Create a bus whose per-subscriber queues hold *queue_size* events."""
        self._queue_size = queue_size
        self._subs: list[_FakeSubscription] = []
        self._connected = False
        self._closed = False
        self._lock = asyncio.Lock()

    async def connect(self) -> None:
        """Mark the bus connected; there is no real transport to open."""
        self._connected = True

    async def publish(
        self,
        topic: str,
        payload: dict[str, Any],
        *,
        event_type: str = "",
    ) -> None:
        """Deliver one event to every subscription whose pattern matches *topic*.

        Raises:
            RuntimeError: the bus has been closed.
        """
        if self._closed:
            raise RuntimeError("publish on closed bus")
        event = Event(topic=topic, payload=payload, type=event_type)
        async with self._lock:
            targets = [s for s in self._subs if matches(s.pattern, topic)]
        for sub in targets:
            _enqueue_drop_oldest(sub._queue, event)

    def subscribe(self, pattern: str) -> Subscription:
        """Register and return a new subscription for *pattern*.

        Raises:
            RuntimeError: the bus has been closed.
        """
        if self._closed:
            raise RuntimeError("subscribe on closed bus")
        queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=self._queue_size)
        sub = _FakeSubscription(self, pattern, queue)
        self._subs.append(sub)
        return sub

    def _unregister(self, sub: _FakeSubscription) -> None:
        """Forget *sub*; a subscription that is not registered is ignored."""
        try:
            self._subs.remove(sub)
        except ValueError:
            pass

    async def close(self) -> None:
        """Close the bus and end every open subscription. Idempotent."""
        if self._closed:
            return
        self._closed = True
        # Wake every still-open subscription so iterators unblock cleanly.
        # The sentinel MUST land even when a queue is full: a plain
        # put_nowait would drop it, and that subscriber would drain its
        # backlog and then wait forever on an empty queue. Enqueue with the
        # drop-oldest policy, sacrificing one buffered event if necessary.
        for sub in list(self._subs):
            _enqueue_drop_oldest(sub._queue, _CLOSE_SENTINEL)
        self._subs.clear()
|
||||
|
||||
|
||||
def _enqueue_drop_oldest(queue: "asyncio.Queue[Any]", event: Event) -> None:
    """Enqueue *event* without blocking, evicting the oldest items while the
    queue is full.

    Each eviction is logged as a warning. Kept as a free function so FakeBus
    and the real UNIX server apply exactly the same backpressure policy.
    """
    while True:
        try:
            queue.put_nowait(event)
        except asyncio.QueueFull:
            pass
        else:
            return

        # Full: discard the head and retry the put.
        try:
            evicted = queue.get_nowait()
        except asyncio.QueueEmpty:
            # Reported full yet nothing to evict: give up on this event.
            return
        log.warning(
            "bus.fake: subscriber queue full, dropped %s", getattr(evicted, "topic", "?")
        )
|
||||
|
||||
|
||||
# ─── NullBus ─────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class _NullSubscription(Subscription):
    """Empty subscription: iteration ends at once and closing does nothing."""

    async def _aclose(self) -> None:
        return None

    async def __anext__(self) -> Event:
        raise StopAsyncIteration
|
||||
|
||||
|
||||
class NullBus(BaseBus):
    """Do-nothing bus handed out when ``DECNET_BUS_ENABLED=false``.

    Every publish is discarded and every subscription is empty. Meant for
    dev environments with no bus daemon: the process starts cleanly,
    publishing code needs no feature flags, and nothing ever blocks waiting
    on a subscriber.
    """

    async def connect(self) -> None:
        return None

    async def close(self) -> None:
        return None

    async def publish(
        self,
        topic: str,
        payload: dict[str, Any],
        *,
        event_type: str = "",
    ) -> None:
        # Intentionally dropped.
        return None

    def subscribe(self, pattern: str) -> Subscription:
        return _NullSubscription(pattern)
|
||||
144
decnet/bus/protocol.py
Normal file
144
decnet/bus/protocol.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""Wire protocol for the DECNET bus UNIX-socket transport.
|
||||
|
||||
Frame layout:
|
||||
|
||||
<VERB> [<args ...>]\\n # ASCII header, single line, no trailing space
|
||||
<4-byte big-endian body length>
|
||||
<body> # orjson-serialized dict, or empty (length 0)
|
||||
|
||||
Verbs:
|
||||
|
||||
* ``HELLO <client-name>`` — optional greeting, logged by server. Body empty.
|
||||
* ``PUB <topic>`` — publisher → server. Body = payload dict.
|
||||
* ``SUB <pattern>`` — subscriber → server. Body empty.
|
||||
* ``UNSUB <pattern>`` — subscriber → server. Body empty.
|
||||
* ``EVT <topic>`` — server → subscriber. Body = payload dict (wrapped
|
||||
in an :class:`~decnet.bus.base.Event` envelope).
|
||||
* ``BYE`` — either direction. Body empty. Graceful shutdown.
|
||||
|
||||
Parsing rules:
|
||||
|
||||
* The header is a single line terminated by ``\\n`` (LF). ``\\r`` is tolerated
|
||||
but not required.
|
||||
* Header tokens are whitespace-separated. The first token is the verb;
|
||||
everything after is verb-specific. We split on the first space only so
|
||||
topics / patterns with quoted content are not supported (they are not
|
||||
needed — topic segments forbid whitespace per :mod:`decnet.bus.topics`).
|
||||
* Maximum header length is 4096 bytes; maximum body length is 1 MiB. Beyond
|
||||
those, the connection is dropped with a logged error. This is a honeypot
|
||||
framework, not a general-purpose message broker; a malformed frame is
|
||||
treated as hostile.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import struct
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
import orjson
|
||||
|
||||
# Frame size limits; anything larger is rejected as a protocol violation.
MAX_HEADER_BYTES = 4096
MAX_BODY_BYTES = 1 * 1024 * 1024  # 1 MiB

# Verb constants (callers should reference these, not bare strings).
HELLO = "HELLO"
PUB = "PUB"
SUB = "SUB"
UNSUB = "UNSUB"
EVT = "EVT"
BYE = "BYE"

_VALID_VERBS = frozenset({HELLO, PUB, SUB, UNSUB, EVT, BYE})


class ProtocolError(Exception):
    """Malformed or oversized frame. Callers should close the connection."""


@dataclass(frozen=True)
class Frame:
    """A parsed frame. ``body`` is the raw (unparsed) body bytes — callers
    decide whether to orjson-decode it (the protocol does not know whether
    a given verb expects a dict body or an empty one).
    """

    verb: str
    args: str  # everything after the verb on the header line, trimmed
    body: bytes


def encode(verb: str, args: str = "", body: dict[str, Any] | None = None) -> bytes:
    """Serialize a frame.

    *body* is a dict that will be orjson-encoded, or ``None`` for an empty
    body. The header line is written verbatim — callers must supply args
    that are ASCII and free of ``\\n`` / ``\\r``.

    Raises:
        ProtocolError: unknown verb, newline or non-ASCII characters in
            *args*, or a header/body that exceeds the size limits.
    """
    if verb not in _VALID_VERBS:
        raise ProtocolError(f"unknown verb {verb!r}")
    if "\n" in args or "\r" in args:
        raise ProtocolError("args must not contain newline characters")

    body_bytes = b"" if body is None else orjson.dumps(body)
    if len(body_bytes) > MAX_BODY_BYTES:
        raise ProtocolError(
            f"body {len(body_bytes)} bytes exceeds max {MAX_BODY_BYTES}"
        )

    header = f"{verb} {args}".rstrip() + "\n"
    try:
        header_bytes = header.encode("ascii")
    except UnicodeEncodeError as exc:
        # Keep the documented contract: every framing problem surfaces as
        # ProtocolError rather than leaking a codec exception.
        raise ProtocolError("header must be ASCII") from exc
    if len(header_bytes) > MAX_HEADER_BYTES:
        raise ProtocolError(
            f"header {len(header_bytes)} bytes exceeds max {MAX_HEADER_BYTES}"
        )
    return header_bytes + struct.pack(">I", len(body_bytes)) + body_bytes


async def read_frame(reader: asyncio.StreamReader) -> Frame | None:
    """Read one frame from *reader*.

    Returns ``None`` on clean EOF before a new frame starts. Raises
    :class:`ProtocolError` on malformed input (caller should close the
    connection): an over-long, empty or non-ASCII header, an unknown verb,
    an oversized body, or EOF anywhere inside a frame.
    """
    try:
        header = await reader.readuntil(b"\n")
    except asyncio.IncompleteReadError as exc:
        if not exc.partial:
            # EOF exactly on a frame boundary: the peer simply went away.
            return None
        raise ProtocolError("connection closed mid-header") from exc
    except asyncio.LimitOverrunError as exc:
        raise ProtocolError("header exceeded buffer limit") from exc

    if len(header) > MAX_HEADER_BYTES:
        raise ProtocolError(f"header {len(header)} bytes exceeds max")

    try:
        line = header.rstrip(b"\r\n").decode("ascii", errors="strict")
    except UnicodeDecodeError as exc:
        # Peer-supplied bytes: a non-ASCII header is a malformed frame, not
        # a crash in whoever is driving the connection.
        raise ProtocolError("header is not valid ASCII") from exc
    if not line:
        raise ProtocolError("empty header line")

    verb, _, args = line.partition(" ")
    if verb not in _VALID_VERBS:
        raise ProtocolError(f"unknown verb {verb!r}")

    try:
        length_bytes = await reader.readexactly(4)
        (body_len,) = struct.unpack(">I", length_bytes)
        if body_len > MAX_BODY_BYTES:
            raise ProtocolError(f"body length {body_len} exceeds max")
        body = await reader.readexactly(body_len) if body_len else b""
    except asyncio.IncompleteReadError as exc:
        # EOF inside the length prefix or the body is reported the same way
        # as EOF inside the header.
        raise ProtocolError("connection closed mid-frame") from exc
    return Frame(verb=verb, args=args.strip(), body=body)
|
||||
|
||||
|
||||
def decode_body(body: bytes) -> dict[str, Any]:
    """Parse a frame body into a dict; an empty body yields ``{}``.

    Raises:
        ProtocolError: the body is not valid JSON, or its top-level value is
            not a JSON object.
    """
    if body:
        try:
            parsed = orjson.loads(body)
        except orjson.JSONDecodeError as exc:
            raise ProtocolError(f"body is not valid JSON: {exc}") from exc
        if isinstance(parsed, dict):
            return parsed
        raise ProtocolError(f"body must be a JSON object, got {type(parsed).__name__}")
    return {}
|
||||
106
decnet/bus/topics.py
Normal file
106
decnet/bus/topics.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""Canonical topic hierarchy for the DECNET ServiceBus.
|
||||
|
||||
Locked early so consumers can subscribe with stable wildcard patterns.
|
||||
Adding new topic families is fine; **renaming** existing ones is a breaking
|
||||
change for every subscriber and requires a coordinated rollout.
|
||||
|
||||
Token structure (NATS-style, dot-separated):
|
||||
|
||||
topology.{topology_id}.mutation.{state}
|
||||
topology.{topology_id}.status
|
||||
decky.{decky_id}.state
|
||||
decky.{decky_id}.traffic
|
||||
attacker.observed
|
||||
system.log
|
||||
system.bus.health
|
||||
|
||||
Wildcards (per :func:`decnet.bus.base.matches`):
|
||||
|
||||
* ``*`` matches exactly one token.
|
||||
* ``>`` matches one-or-more trailing tokens (so ``topology.>`` matches
|
||||
``topology.abc.status`` but not the bare root ``topology``).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
# ─── Root prefixes ───────────────────────────────────────────────────────────
|
||||
|
||||
TOPOLOGY = "topology"
|
||||
DECKY = "decky"
|
||||
ATTACKER = "attacker"
|
||||
SYSTEM = "system"
|
||||
|
||||
|
||||
# ─── Leaf event-type constants (the last segment of each topic) ──────────────
|
||||
|
||||
# Topology mutation lifecycle states — keep in sync with TopologyMutation.state
|
||||
# in decnet/web/db/models.py; the bus topic mirrors the DB state machine.
|
||||
MUTATION_ENQUEUED = "enqueued"
|
||||
MUTATION_APPLYING = "applying"
|
||||
MUTATION_APPLIED = "applied"
|
||||
MUTATION_FAILED = "failed"
|
||||
|
||||
# Topology-level status transitions (topology.{id}.status): fires when the
|
||||
# topology row's status column changes (pending/deploying/active/degraded/failed).
|
||||
TOPOLOGY_STATUS = "status"
|
||||
|
||||
# Decky-level event types (third token).
|
||||
DECKY_STATE = "state"
|
||||
DECKY_TRAFFIC = "traffic"
|
||||
|
||||
# System event types.
|
||||
SYSTEM_LOG = "log"
|
||||
SYSTEM_BUS_HEALTH = "bus.health"
|
||||
|
||||
|
||||
# ─── Builders ────────────────────────────────────────────────────────────────
|
||||
|
||||
def topology_mutation(topology_id: str, state: str) -> str:
    """Return the topic ``topology.<id>.mutation.<state>``.

    *state* is expected to be one of the ``MUTATION_*`` constants. Both
    parts are validated as single topic segments; an empty part, or one
    containing ``.``, ``*``, ``>`` or whitespace, raises ``ValueError``.
    """
    _reject_tokens(topology_id, state)
    return ".".join((TOPOLOGY, topology_id, "mutation", state))
|
||||
|
||||
|
||||
def topology_status(topology_id: str) -> str:
    """Return the topic ``topology.<id>.status``.

    *topology_id* is validated as a single topic segment; an invalid one
    raises ``ValueError``.
    """
    _reject_tokens(topology_id)
    return ".".join((TOPOLOGY, topology_id, TOPOLOGY_STATUS))
|
||||
|
||||
|
||||
def decky(decky_id: str, event_type: str) -> str:
    """Return the topic ``decky.<id>.<event_type>``.

    *event_type* is normally ``DECKY_STATE`` or ``DECKY_TRAFFIC``. Both
    parts are validated as single topic segments; an invalid one raises
    ``ValueError``.
    """
    _reject_tokens(decky_id, event_type)
    return ".".join((DECKY, decky_id, event_type))
|
||||
|
||||
|
||||
def system(event_type: str) -> str:
    """Return the topic ``system.<event_type>``.

    Unlike the other builders, *event_type* may span several tokens (for
    example ``bus.health``), so it is only checked for emptiness and then
    prefixed; the constant leaves are not re-validated.

    Raises:
        ValueError: *event_type* is empty.
    """
    if event_type:
        return f"{SYSTEM}.{event_type}"
    raise ValueError("system topic requires a non-empty event_type")
|
||||
|
||||
|
||||
def _reject_tokens(*parts: str) -> None:
    """Validate that every part is usable as one NATS-style topic segment.

    A segment that is empty, or that contains a dot, a wildcard (``*`` /
    ``>``) or whitespace, would silently reshape the hierarchy — a
    ``topology_id`` of ``"a.b"`` would yield ``topology.a.b.status``. Fail
    at the builder instead of putting a malformed topic on the wire.

    Raises:
        ValueError: on the first offending segment.
    """
    for segment in parts:
        if not segment:
            raise ValueError("topic segment must not be empty")
        has_forbidden_char = any(ch in ".*>" or ch.isspace() for ch in segment)
        if has_forbidden_char:
            raise ValueError(
                f"topic segment {segment!r} may not contain '.', '*', '>', or whitespace"
            )
|
||||
237
decnet/bus/unix_client.py
Normal file
237
decnet/bus/unix_client.py
Normal file
@@ -0,0 +1,237 @@
|
||||
"""UNIX-socket client — :class:`UnixSocketBus` implementation of :class:`BaseBus`.
|
||||
|
||||
Holds one open socket to the local :class:`~decnet.bus.unix_server.BusServer`.
|
||||
Operations:
|
||||
|
||||
* :meth:`publish` writes a single ``PUB`` frame and returns; no ack.
|
||||
* :meth:`subscribe` writes a ``SUB`` frame and returns a
|
||||
:class:`~decnet.bus.base.Subscription` backed by an :class:`asyncio.Queue`
|
||||
that the background reader task feeds.
|
||||
|
||||
One background reader task per bus instance dispatches incoming ``EVT``
|
||||
frames to every registered subscription whose pattern matches the topic.
|
||||
On connection drop or close, every subscription is woken via a sentinel so
|
||||
iterators unblock cleanly; callers see :class:`StopAsyncIteration` from the
|
||||
``async for`` loop.
|
||||
|
||||
No auto-reconnect in MVP. If the server restarts, callers must
|
||||
:meth:`close` the bus and construct a new one. This mirrors how other
|
||||
DECNET workers handle their dependencies — the systemd ``Restart=on-failure``
|
||||
supervision above us is the retry loop.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import os
|
||||
import pathlib
|
||||
from typing import Any
|
||||
|
||||
from decnet.bus import protocol
|
||||
from decnet.bus.base import (
|
||||
BaseBus,
|
||||
Event,
|
||||
Subscription,
|
||||
_CLOSE_SENTINEL,
|
||||
matches,
|
||||
)
|
||||
from decnet.bus.fake import _enqueue_drop_oldest as _enqueue_event_drop_oldest
|
||||
from decnet.logging import get_logger
|
||||
|
||||
log = get_logger("bus.client")
|
||||
|
||||
_INBOUND_QUEUE_SIZE = 1024
|
||||
|
||||
|
||||
class _UnixSubscription(Subscription):
    """Queue-backed subscription handed out by :class:`UnixSocketBus`.

    Iteration pops items from the queue until the close sentinel arrives;
    closing awaits the owning bus's ``_unregister``.
    """

    def __init__(
        self,
        bus: "UnixSocketBus",
        pattern: str,
        queue: "asyncio.Queue[Any]",
    ) -> None:
        super().__init__(pattern)
        self._bus = bus
        self._queue = queue

    async def __anext__(self) -> Event:
        # A closed subscription ends iteration without touching the queue.
        if not self._closed:
            received = await self._queue.get()
            if received is not _CLOSE_SENTINEL:
                return received
        raise StopAsyncIteration

    async def _aclose(self) -> None:
        await self._bus._unregister(self)
        # Wake a consumer that may be parked in __anext__; the sentinel is
        # skipped when the queue is full.
        try:
            self._queue.put_nowait(_CLOSE_SENTINEL)
        except asyncio.QueueFull:
            pass
|
||||
|
||||
|
||||
class UnixSocketBus(BaseBus):
    """Client handle for a local :class:`BusServer`.

    One instance per process typically; multiple instances simply open
    multiple sockets to the same server. Connection is lazy — the first
    :meth:`connect`, :meth:`publish` or :meth:`subscribe` opens the socket.

    Publishing is fire-and-forget: a publish that cannot reach the server is
    logged and dropped. Each subscription buffers events in a bounded queue
    of ``_INBOUND_QUEUE_SIZE`` items.
    """

    def __init__(
        self,
        socket_path: pathlib.Path | str,
        *,
        client_name: str | None = None,
    ) -> None:
        self._path = pathlib.Path(socket_path)
        self._client_name = client_name or f"decnet-bus-client[{os.getpid()}]"
        self._reader: asyncio.StreamReader | None = None
        self._writer: asyncio.StreamWriter | None = None
        self._reader_task: asyncio.Task[None] | None = None
        self._subs: list[_UnixSubscription] = []
        # Strong references to in-flight SUB tasks. The event loop only
        # keeps weak references to tasks, so an unreferenced task may be
        # garbage-collected before it runs.
        self._pending: set[asyncio.Future[None]] = set()
        self._lock = asyncio.Lock()        # serializes connect()
        self._write_lock = asyncio.Lock()  # serializes frame writes
        self._closed = False

    # ─── Lifecycle ──────────────────────────────────────────────────────────

    async def connect(self) -> None:
        """Open the socket, send HELLO and start the reader task.

        Idempotent and safe to call concurrently: only one caller opens the
        socket, the others return once it is up.

        Raises:
            RuntimeError: the bus has been closed.
            OSError: the socket could not be opened or HELLO could not be
                written.
        """
        if self._writer is not None:
            return
        if self._closed:
            raise RuntimeError("connect on closed bus")
        async with self._lock:
            # Re-check under the lock: another coroutine may have finished
            # connecting while we were waiting for it.
            if self._writer is not None:
                return
            if self._closed:
                raise RuntimeError("connect on closed bus")
            reader, writer = await asyncio.open_unix_connection(str(self._path))
            if self._closed:
                # close() ran while the socket was being opened.
                writer.close()
                raise RuntimeError("connect on closed bus")
            self._reader, self._writer = reader, writer
            greeted = False
            try:
                await self._send(protocol.encode(protocol.HELLO, args=self._client_name))
                greeted = True
            finally:
                if not greeted:
                    # Do not leave a half-initialised connection behind
                    # (writer set but no reader task).
                    self._reader = self._writer = None
                    with contextlib.suppress(Exception):
                        writer.close()
            if self._closed:
                # close() already tore the connection down during HELLO.
                return
            self._reader_task = asyncio.create_task(self._reader_loop())
        log.debug("bus.client: connected to %s as %s", self._path, self._client_name)

    async def close(self) -> None:
        """Close the connection and end every subscription. Idempotent."""
        if self._closed:
            return
        self._closed = True

        # SUB frames that have not gone out yet are pointless now.
        pending = list(self._pending)
        for task in pending:
            task.cancel()
        if pending:
            await asyncio.gather(*pending, return_exceptions=True)

        # Best-effort BYE — we don't care if it fails.
        if self._writer is not None and not self._writer.is_closing():
            with contextlib.suppress(Exception):
                await self._send(protocol.encode(protocol.BYE))

        if self._reader_task is not None:
            self._reader_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await self._reader_task
            self._reader_task = None

        if self._writer is not None:
            with contextlib.suppress(Exception):
                self._writer.close()
                await self._writer.wait_closed()
            self._writer = None
            self._reader = None

        # Wake every subscription so `async for` exits.
        for sub in list(self._subs):
            self._wake(sub._queue)
        self._subs.clear()

    # ─── Pub/Sub ────────────────────────────────────────────────────────────

    async def publish(
        self,
        topic: str,
        payload: dict[str, Any],
        *,
        event_type: str = "",
    ) -> None:
        """Send one event to the server, connecting first if needed.

        Raises:
            RuntimeError: the bus has been closed.
        """
        if self._closed:
            raise RuntimeError("publish on closed bus")
        try:
            if self._writer is None:
                await self.connect()
            body = Event(topic=topic, payload=payload, type=event_type).to_dict()
            await self._send(protocol.encode(protocol.PUB, args=topic, body=body))
        except OSError as exc:
            # Bus loss is a logged warning, never a publisher crash. The
            # DB-as-source-of-truth invariant means the work is already
            # persisted; the missing event is just a missed notification.
            # OSError covers a missing or refusing socket on connect as well
            # as ConnectionError / BrokenPipeError on write.
            log.warning("bus.client: publish failed: %s", exc)

    def subscribe(self, pattern: str) -> Subscription:
        """Register a local subscription and schedule the SUB frame.

        Must be called while an event loop is running, because the SUB
        frame is sent from a background task.

        Raises:
            RuntimeError: the bus has been closed.
        """
        if self._closed:
            raise RuntimeError("subscribe on closed bus")
        queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=_INBOUND_QUEUE_SIZE)
        sub = _UnixSubscription(self, pattern, queue)
        self._subs.append(sub)
        # Schedule the SUB frame asynchronously so subscribe() stays sync,
        # matching the BaseBus signature. The caller will shortly `async
        # with` / `async for` the subscription, which will run the event
        # loop and pick this task up.
        task = asyncio.ensure_future(self._send_sub(pattern))
        self._pending.add(task)
        task.add_done_callback(self._pending.discard)
        return sub

    async def _send_sub(self, pattern: str) -> None:
        """Connect if necessary and send SUB; failures are only logged."""
        try:
            if self._writer is None:
                await self.connect()
            await self._send(protocol.encode(protocol.SUB, args=pattern))
        except Exception as exc:  # pragma: no cover - network paths in live tests
            log.warning("bus.client: SUB %s failed: %s", pattern, exc)

    async def _unregister(self, sub: _UnixSubscription) -> None:
        """Forget *sub*; UNSUB on the server if it was the last for its pattern."""
        try:
            self._subs.remove(sub)
        except ValueError:
            return
        # Tell the server we no longer want events for this pattern if no
        # other local subscription still wants it.
        if not any(s.pattern == sub.pattern for s in self._subs):
            with contextlib.suppress(Exception):
                await self._send(protocol.encode(protocol.UNSUB, args=sub.pattern))

    # ─── Internal I/O ───────────────────────────────────────────────────────

    @staticmethod
    def _wake(queue: "asyncio.Queue[Any]") -> None:
        """Enqueue the close sentinel, evicting the oldest item if full.

        Skipping the sentinel on a full queue would leave the consumer
        blocked forever once it has drained the backlog, so room is made by
        dropping one buffered item instead.
        """
        try:
            queue.put_nowait(_CLOSE_SENTINEL)
        except asyncio.QueueFull:
            with contextlib.suppress(asyncio.QueueEmpty):
                queue.get_nowait()
            with contextlib.suppress(asyncio.QueueFull):
                queue.put_nowait(_CLOSE_SENTINEL)

    async def _send(self, frame_bytes: bytes) -> None:
        """Write one encoded frame, serialized against other writers.

        Raises:
            ConnectionError: there is no open connection.
        """
        if self._writer is None:
            raise ConnectionError("bus.client: not connected")
        async with self._write_lock:
            self._writer.write(frame_bytes)
            await self._writer.drain()

    async def _reader_loop(self) -> None:
        """Read frames until EOF/BYE/error and dispatch EVT frames locally."""
        if self._reader is None:
            return
        try:
            while True:
                frame = await protocol.read_frame(self._reader)
                if frame is None:
                    break
                if frame.verb != protocol.EVT:
                    # Clients only ever legitimately receive EVT (or BYE).
                    if frame.verb == protocol.BYE:
                        break
                    log.warning("bus.client: unexpected verb from server: %s", frame.verb)
                    continue
                topic = frame.args
                data = protocol.decode_body(frame.body) if frame.body else {}
                event = Event.from_dict(topic, data)
                self._dispatch(event)
        except protocol.ProtocolError as exc:
            log.warning("bus.client: protocol error: %s", exc)
        except (asyncio.IncompleteReadError, ConnectionError):
            pass
        except asyncio.CancelledError:
            raise
        except Exception:  # pragma: no cover
            log.exception("bus.client: reader loop crashed")
        finally:
            # Server-side close — wake every subscription.
            for sub in list(self._subs):
                self._wake(sub._queue)

    def _dispatch(self, event: Event) -> None:
        """Hand *event* to every local subscription whose pattern matches."""
        for sub in self._subs:
            if matches(sub.pattern, event.topic):
                _enqueue_event_drop_oldest(sub._queue, event)
|
||||
309
decnet/bus/unix_server.py
Normal file
309
decnet/bus/unix_server.py
Normal file
@@ -0,0 +1,309 @@
|
||||
"""UNIX-socket server for the DECNET bus.
|
||||
|
||||
One :class:`BusServer` per host. Accepts local connections on a UNIX-domain
|
||||
socket; each connection may:
|
||||
|
||||
* publish events (``PUB`` frames) that the server fans out to all matching
|
||||
subscribers on other connections, and
|
||||
* subscribe to patterns (``SUB`` frames) and receive matching events as
|
||||
``EVT`` frames.
|
||||
|
||||
Authorization is socket file permissions (0660, group=``decnet`` if that
|
||||
POSIX group exists, else the server process's own group). Anything the
|
||||
kernel lets ``connect()`` is trusted — there is no verb-level auth. This
|
||||
matches the "local processes on the same host" threat model; cross-host
|
||||
federation is out of scope (see DEBT-029).
|
||||
|
||||
Backpressure is per-connection, drop-oldest: if a subscriber can't drain its
|
||||
outbound queue fast enough, the server discards the oldest pending event
|
||||
rather than blocking publishers. The bus is at-most-once by contract, so
|
||||
drops are acceptable; stalled publishers are not.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import grp
|
||||
import os
|
||||
import pathlib
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from decnet.bus import protocol
|
||||
from decnet.bus.base import Event, matches
|
||||
from decnet.logging import get_logger
|
||||
|
||||
# Module logger; server-side messages are emitted under "bus.server".
log = get_logger("bus.server")

# Defaults for the listening socket: owner+group read/write, group "decnet".
_SOCKET_MODE = 0o660
_DEFAULT_GROUP = "decnet"

# Capacity (in frames) of each connection's outbound queue.
_OUTBOUND_QUEUE_SIZE = 1024
|
||||
|
||||
|
||||
@dataclass(eq=False)
class _Connection:
    """Book-keeping the server holds for one accepted client socket.

    ``eq=False`` keeps identity-based equality and hashing, which lets
    instances live in the server's connection ``set``.
    """

    # Stream used to send frames to this peer.
    writer: asyncio.StreamWriter
    # Name announced via HELLO; placeholder until then.
    peer_name: str = "<unknown>"
    # Topic patterns this peer has subscribed to.
    patterns: set[str] = field(default_factory=set)
    # Encoded frames waiting to be written, bounded by _OUTBOUND_QUEUE_SIZE.
    outbound: asyncio.Queue[bytes] = field(
        default_factory=lambda: asyncio.Queue(maxsize=_OUTBOUND_QUEUE_SIZE),
    )
    # Set once the connection has been torn down.
    closed: bool = False
|
||||
|
||||
|
||||
class BusServer:
    """Serve a UNIX-socket bus on *socket_path*.

    Lifecycle: construct → :meth:`start` → :meth:`serve_forever` (or rely
    on :meth:`start` returning once bound) → :meth:`close` for teardown.
    Safe to :meth:`close` multiple times.
    """

    def __init__(
        self,
        socket_path: pathlib.Path | str,
        *,
        group: str | None = _DEFAULT_GROUP,
        mode: int = _SOCKET_MODE,
    ) -> None:
        self._path = pathlib.Path(socket_path)
        self._group = group
        self._mode = mode
        self._server: asyncio.base_events.Server | None = None
        self._connections: set[_Connection] = set()
        self._closed = False

    # ─── Lifecycle ──────────────────────────────────────────────────────────

    async def start(self) -> None:
        """Bind the socket and begin accepting connections.

        Removes any stale socket file at *socket_path* first (common case:
        the previous worker crashed without cleaning up). The parent
        directory must already exist; we do NOT create it blindly because
        the chosen directory (typically ``/run/decnet``) may require
        systemd ``RuntimeDirectory=`` to set up.

        Raises:
            FileNotFoundError: the socket's parent directory is missing.
        """
        if self._server is not None:
            return

        parent = self._path.parent
        if not parent.exists():
            raise FileNotFoundError(
                f"bus socket parent directory {parent} does not exist; "
                f"create it with systemd RuntimeDirectory= or mkdir"
            )

        # Clean up a stale socket from a previous crash. If a live server
        # is actually listening there, ``bind()`` below will fail — we do
        # not try to detect live vs. stale ourselves.
        with contextlib.suppress(FileNotFoundError):
            if self._path.is_socket():
                self._path.unlink()

        self._server = await asyncio.start_unix_server(
            self._handle_connection, path=str(self._path),
        )
        _chmod_and_chown(self._path, self._mode, self._group)
        log.info("bus.server: listening on %s (mode=%o group=%s)",
                 self._path, self._mode, self._group or "<inherit>")

    async def serve_forever(self) -> None:
        """Block serving connections until cancelled or closed.

        Raises:
            RuntimeError: :meth:`start` has not been called.
        """
        if self._server is None:
            raise RuntimeError("BusServer not started")
        async with self._server:
            await self._server.serve_forever()

    async def close(self) -> None:
        """Stop listening, drop every connection and remove the socket file."""
        if self._closed:
            return
        self._closed = True

        server = self._server
        if server is not None:
            # Stop accepting first so no new connection sneaks in while the
            # existing ones are being torn down.
            server.close()

        # Drop live connections BEFORE waiting on the listener: from
        # Python 3.12 on, ``Server.wait_closed()`` also waits for every
        # active connection to finish, so waiting first would hang for as
        # long as any client stays attached.
        for conn in list(self._connections):
            await self._close_connection(conn)
        self._connections.clear()

        if server is not None:
            with contextlib.suppress(Exception):
                await server.wait_closed()
            self._server = None

        with contextlib.suppress(FileNotFoundError):
            self._path.unlink()
        log.info("bus.server: closed")

    # ─── Internal publish fan-out ───────────────────────────────────────────

    async def publish(self, topic: str, payload: dict[str, Any], event_type: str = "") -> None:
        """Server-side publish helper — used by the worker to emit
        ``system.bus.health`` heartbeats without opening a client loop."""
        event = Event(topic=topic, payload=payload, type=event_type)
        self._fanout(event)

    # ─── Connection handler ─────────────────────────────────────────────────

    async def _handle_connection(
        self,
        reader: asyncio.StreamReader,
        writer: asyncio.StreamWriter,
    ) -> None:
        """Per-connection entry point handed to ``start_unix_server``.

        Runs the read loop inline and a writer task alongside it; whatever
        ends the read loop, the connection is closed and deregistered.
        """
        conn = _Connection(writer=writer)
        self._connections.add(conn)
        writer_task = asyncio.create_task(self._writer_loop(conn))
        try:
            await self._reader_loop(conn, reader)
        except protocol.ProtocolError as exc:
            log.warning("bus.server: protocol error from %s: %s", conn.peer_name, exc)
        except (asyncio.IncompleteReadError, ConnectionError) as exc:
            log.debug("bus.server: %s disconnected: %s", conn.peer_name, exc)
        except Exception:  # pragma: no cover - defensive
            log.exception("bus.server: unhandled error in connection")
        finally:
            await self._close_connection(conn)
            self._connections.discard(conn)
            writer_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await writer_task

    async def _reader_loop(
        self, conn: _Connection, reader: asyncio.StreamReader,
    ) -> None:
        """Read and dispatch frames until EOF or a BYE frame."""
        while True:
            frame = await protocol.read_frame(reader)
            if frame is None:
                return
            await self._dispatch(conn, frame)
            if frame.verb == protocol.BYE:
                return

    async def _dispatch(self, conn: _Connection, frame: protocol.Frame) -> None:
        """Apply one client frame to *conn*'s state or fan it out.

        Raises:
            protocol.ProtocolError: SUB/PUB without an argument, a PUB body
                that is not an object, or a verb clients may not send.
        """
        if frame.verb == protocol.HELLO:
            conn.peer_name = frame.args or conn.peer_name
            log.debug("bus.server: HELLO from %s", conn.peer_name)
            return
        if frame.verb == protocol.SUB:
            pattern = frame.args
            if not pattern:
                raise protocol.ProtocolError("SUB requires a pattern")
            conn.patterns.add(pattern)
            log.debug("bus.server: %s SUB %s", conn.peer_name, pattern)
            return
        if frame.verb == protocol.UNSUB:
            conn.patterns.discard(frame.args)
            return
        if frame.verb == protocol.PUB:
            topic = frame.args
            if not topic:
                raise protocol.ProtocolError("PUB requires a topic")
            data = protocol.decode_body(frame.body) if frame.body else {}
            if not isinstance(data, dict):
                # A non-object body would otherwise surface as an
                # AttributeError and be logged as an unhandled crash.
                raise protocol.ProtocolError("PUB body must be an object")
            event = Event(
                topic=topic,
                payload=data.get("payload", {}) or {},
                type=data.get("type", "") or "",
            )
            self._fanout(event, origin=conn)
            return
        if frame.verb == protocol.BYE:
            return
        # EVT is server-to-client only; receiving one is a protocol violation.
        raise protocol.ProtocolError(f"unexpected verb {frame.verb!r} from client")

    def _fanout(self, event: Event, *, origin: _Connection | None = None) -> None:
        """Enqueue *event* as an EVT frame on every matching connection.

        We do NOT deliver back to the originating connection (a publisher
        does not receive its own event). Encoding happens once per event,
        not once per subscriber.
        """
        try:
            frame_bytes = protocol.encode(
                protocol.EVT, args=event.topic, body=event.to_dict(),
            )
        except protocol.ProtocolError:
            log.exception("bus.server: failed to encode EVT for topic=%s", event.topic)
            return

        for conn in self._connections:
            if conn is origin or conn.closed:
                continue
            if not any(matches(p, event.topic) for p in conn.patterns):
                continue
            _enqueue_drop_oldest(conn.outbound, frame_bytes, event.topic)

    async def _writer_loop(self, conn: _Connection) -> None:
        """Serialize writes onto *conn*'s socket.

        One writer task per connection so a slow peer only blocks its own
        queue, not the fan-out loop. The queue is bounded with drop-oldest
        policy applied at enqueue time (see :func:`_enqueue_drop_oldest`).
        """
        try:
            while not conn.closed:
                data = await conn.outbound.get()
                conn.writer.write(data)
                await conn.writer.drain()
        except (ConnectionError, BrokenPipeError):
            log.debug("bus.server: %s writer: peer closed", conn.peer_name)
        except asyncio.CancelledError:
            pass
        except Exception:  # pragma: no cover - defensive
            log.exception("bus.server: writer loop crashed for %s", conn.peer_name)

    async def _close_connection(self, conn: _Connection) -> None:
        """Mark *conn* closed and close its stream; errors are ignored."""
        if conn.closed:
            return
        conn.closed = True
        with contextlib.suppress(Exception):
            conn.writer.close()
            await conn.writer.wait_closed()
|
||||
|
||||
|
||||
# ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def _chmod_and_chown(path: pathlib.Path, mode: int, group: str | None) -> None:
    """Set the socket's permission bits and, if possible, its group.

    Nothing here raises for the handled failures: a failed ``chmod`` is
    logged as a warning, and the group step is skipped when *group* is
    empty/``None`` or names a group that does not exist, so the server stays
    usable on dev boxes without a ``decnet`` group.
    """
    try:
        os.chmod(path, mode)
    except OSError as exc:
        log.warning("bus.server: chmod(%s, %o) failed: %s", path, mode, exc)

    if group:
        try:
            gid = grp.getgrnam(group).gr_gid
        except KeyError:
            log.debug("bus.server: group %r not found, leaving socket group unchanged", group)
        else:
            try:
                # uid -1 keeps the current owner; only the group changes.
                os.chown(path, -1, gid)
            except PermissionError:
                # An unprivileged dev-box user may not be allowed to chown;
                # the socket still works for its owner, so stay quiet.
                log.debug("bus.server: chown(%s, gid=%d) denied; leaving as-is", path, gid)
            except OSError as exc:
                log.warning("bus.server: chown(%s, gid=%d) failed: %s", path, gid, exc)
|
||||
|
||||
|
||||
def _enqueue_drop_oldest(
    queue: "asyncio.Queue[bytes]", data: bytes, topic: str,
) -> None:
    """Put *data* on *queue*, evicting the oldest entries while it is full.

    Each eviction is logged with *topic*. If the queue reports full and
    empty at once, the item is silently not enqueued.
    """
    while True:
        try:
            queue.put_nowait(data)
        except asyncio.QueueFull:
            pass
        else:
            return
        # Full: make room by discarding the head, then retry the put.
        try:
            queue.get_nowait()
        except asyncio.QueueEmpty:
            return
        log.warning("bus.server: subscriber queue full, dropped event topic=%s", topic)
|
||||
121
decnet/bus/worker.py
Normal file
121
decnet/bus/worker.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""``decnet bus`` worker entrypoint.
|
||||
|
||||
Starts a :class:`~decnet.bus.unix_server.BusServer` on the configured UNIX
|
||||
socket and serves forever, emitting a ``system.bus.health`` heartbeat on
|
||||
its own bus every :data:`HEARTBEAT_INTERVAL_SEC` seconds so liveness-aware
|
||||
consumers (dashboards, watchdogs) can tell the bus is up without polling
|
||||
the filesystem.
|
||||
|
||||
Cross-host federation is **out of scope** for the MVP; each host runs its
|
||||
own bus independently. See DEBT-029 for the deferred ``--bridge-tcp``
|
||||
mode that would proxy the socket over the swarm mTLS channel.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import pathlib
|
||||
import signal
|
||||
import time
|
||||
|
||||
from decnet.bus import topics
|
||||
from decnet.bus.unix_server import BusServer
|
||||
from decnet.logging import get_logger
|
||||
|
||||
# Module logger; worker messages are emitted under "bus.worker".
log = get_logger("bus.worker")

# Default number of seconds between ``system.bus.health`` heartbeats.
HEARTBEAT_INTERVAL_SEC = 10
|
||||
|
||||
|
||||
async def bus_worker(
    socket_path: str | pathlib.Path,
    *,
    group: str | None = "decnet",
    heartbeat_interval: int = HEARTBEAT_INTERVAL_SEC,
) -> None:
    """Run the bus server until cancelled or SIGTERM/SIGINT is received.

    The parent directory of *socket_path* must already exist (systemd's
    ``RuntimeDirectory=decnet`` handles this in prod; dev code is expected
    to ``mkdir`` first). Outside the dev-box exception handled by
    :func:`_ensure_parent`, this function does not create it implicitly,
    because the right choice of perms/owner depends on the deployment
    context.

    Args:
        socket_path: Filesystem path of the UNIX socket to serve on.
        group: Group passed to :class:`BusServer` for socket ownership.
        heartbeat_interval: Seconds between heartbeat events.
    """
    path = pathlib.Path(socket_path)
    _ensure_parent(path)

    server = BusServer(path, group=group)
    await server.start()
    log.info("bus.worker: pid=%d socket=%s", os.getpid(), path)

    stop_event = asyncio.Event()
    _install_signal_handlers(stop_event)

    heartbeat_task = asyncio.create_task(_heartbeat_loop(server, heartbeat_interval))
    serve_task = asyncio.create_task(server.serve_forever())

    try:
        await stop_event.wait()
        log.info("bus.worker: shutdown signal received")
    finally:
        heartbeat_task.cancel()
        try:
            await heartbeat_task
        except (asyncio.CancelledError, Exception):  # noqa: BLE001 - draining shutdown
            pass
        # Tear down through BusServer.close() BEFORE waiting on the serve
        # task: on Python 3.12+ asyncio's own serve_forever() cancellation
        # path awaits Server.wait_closed(), which does not return while
        # client connections are still open.
        await server.close()
        serve_task.cancel()
        try:
            await serve_task
        except (asyncio.CancelledError, Exception):  # noqa: BLE001 - draining shutdown
            pass
        log.info("bus.worker: stopped")
|
||||
|
||||
|
||||
async def _heartbeat_loop(server: BusServer, interval: int) -> None:
    """Publish ``system.bus.health`` on the server's own fan-out.

    Loops until cancelled, sleeping *interval* seconds between events. Each
    payload carries the worker pid, its uptime in seconds and a wall-clock
    timestamp. Publish failures are logged and never end the loop.
    """
    # Uptime is measured on the monotonic clock so wall-clock adjustments
    # (NTP steps, manual changes) cannot make it jump or go negative.
    started_at = time.monotonic()
    while True:
        try:
            await server.publish(
                topics.system(topics.SYSTEM_BUS_HEALTH),
                {
                    "pid": os.getpid(),
                    "uptime_sec": round(time.monotonic() - started_at, 3),
                    "ts": time.time(),
                },
                event_type=topics.SYSTEM_BUS_HEALTH,
            )
        except Exception:  # pragma: no cover - heartbeat must never kill the worker
            log.exception("bus.worker: heartbeat publish failed")
        await asyncio.sleep(interval)
|
||||
|
||||
|
||||
def _install_signal_handlers(stop_event: asyncio.Event) -> None:
    """Make SIGTERM and SIGINT set *stop_event* on the running loop.

    Must be called from inside a running event loop. Where the loop cannot
    register signal handlers, that signal is skipped silently.
    """
    running_loop = asyncio.get_running_loop()
    for signum in (signal.SIGTERM, signal.SIGINT):
        # add_signal_handler is not supported on Windows / in some test
        # harnesses where the loop is running in a non-main thread. The
        # worker still exits via KeyboardInterrupt bubbling up.
        with contextlib.suppress(NotImplementedError, RuntimeError):
            running_loop.add_signal_handler(signum, stop_event.set)
|
||||
|
||||
|
||||
def _ensure_parent(path: pathlib.Path) -> None:
    """Make sure the directory that will hold the socket exists.

    An existing parent is left untouched. A missing parent is created only
    when it lies under the current user's home directory; anywhere else
    (e.g. ``/run/decnet``) is refused.

    Raises:
        FileNotFoundError: the parent is missing and not under the home
            directory.
    """
    parent = path.parent
    if parent.exists():
        return
    # Dev-box convenience: create a missing parent under the user's home
    # (the intended case is ``~/.decnet``). We do not auto-mkdir
    # ``/run/decnet`` — that's systemd's job and silently creating it as
    # the wrong user would cause permission confusion later.
    # NOTE(review): the check is against the home directory itself, not
    # ``~/.decnet`` — confirm whether any path under home should qualify.
    home_prefix = pathlib.Path.home() / ".decnet"
    try:
        parent.relative_to(home_prefix.parent)
    except ValueError:
        # The ValueError is an implementation detail of the containment
        # test; keep it out of the traceback the operator sees.
        raise FileNotFoundError(
            f"bus socket parent {parent} does not exist; create it first"
        ) from None
    parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
__all__ = ["bus_worker", "HEARTBEAT_INTERVAL_SEC"]
|
||||
@@ -21,6 +21,7 @@ import typer
|
||||
from . import (
|
||||
agent,
|
||||
api,
|
||||
bus,
|
||||
db,
|
||||
deploy,
|
||||
forwarder,
|
||||
@@ -51,7 +52,7 @@ for _mod in (
|
||||
swarm,
|
||||
deploy, lifecycle, workers, inventory,
|
||||
web, profiler, sniffer, db,
|
||||
topology,
|
||||
topology, bus,
|
||||
):
|
||||
_mod.register(app)
|
||||
|
||||
|
||||
45
decnet/cli/bus.py
Normal file
45
decnet/cli/bus.py
Normal file
@@ -0,0 +1,45 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import typer
|
||||
|
||||
from . import utils as _utils
|
||||
from .utils import console, log
|
||||
|
||||
|
||||
def register(app: typer.Typer) -> None:
    """Attach the ``bus`` command to *app*."""

    @app.command(name="bus")
    def bus_cmd(
        socket_path: str = typer.Option(
            None, "--socket", "-s",
            help="UNIX socket path (defaults to DECNET_BUS_SOCKET env var, "
                 "then /run/decnet/bus.sock, then ~/.decnet/bus.sock).",
        ),
        group: str = typer.Option(
            "decnet", "--group", "-g",
            # Honour the same variable the config module reads, so the CLI
            # and library defaults cannot drift apart.
            envvar="DECNET_BUS_GROUP",
            help="POSIX group to chown the socket to (falls back to process "
                 "group if the named group does not exist).",
        ),
        heartbeat: int = typer.Option(
            10, "--heartbeat", "-H",
            # Zero or negative would turn the heartbeat loop into a busy loop.
            min=1,
            help="Seconds between system.bus.health heartbeat events.",
        ),
        daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process."),
    ) -> None:
        """Run the DECNET ServiceBus worker (host-local UNIX-socket pub/sub)."""
        # Imported lazily so registering the command does not pull in
        # asyncio and the bus package.
        import asyncio
        from decnet.bus.factory import _default_socket_path
        from decnet.bus.worker import bus_worker

        resolved = socket_path or _default_socket_path()

        if daemon:
            log.info("bus daemonizing socket=%s", resolved)
            _utils._daemonize()

        log.info("bus starting socket=%s group=%s heartbeat=%ds", resolved, group, heartbeat)
        console.print(f"[bold cyan]Bus starting[/] (socket: {resolved}, heartbeat: {heartbeat}s)")

        try:
            asyncio.run(bus_worker(resolved, group=group, heartbeat_interval=heartbeat))
        except KeyboardInterrupt:
            console.print("\n[yellow]Bus stopped.[/]")
|
||||
@@ -131,6 +131,15 @@ DECNET_DISALLOW_MASTER: bool = (
|
||||
os.environ.get("DECNET_DISALLOW_MASTER", "true").lower() == "true"
|
||||
)
|
||||
|
||||
# ServiceBus — host-local UNIX-socket pub/sub. Workers obtain a bus through
# ``decnet.bus.factory.get_bus()``. When disabled, a NullBus is used
# (publishes drop, subscriptions yield nothing) so dev environments without
# a bus daemon can still boot. See DEBT-029 for the MVP design.
#
# Enabled unless the variable is literally "false" (case-insensitive).
DECNET_BUS_ENABLED: bool = os.getenv("DECNET_BUS_ENABLED", "true").lower() != "false"
# Backend selector, lower-cased; defaults to the UNIX-socket transport.
DECNET_BUS_TYPE: str = os.getenv("DECNET_BUS_TYPE", "unix").lower()
# Explicit socket path; ``None`` when the variable is unset.
DECNET_BUS_SOCKET: Optional[str] = os.getenv("DECNET_BUS_SOCKET")
# Group that should own the socket file.
DECNET_BUS_GROUP: str = os.getenv("DECNET_BUS_GROUP", "decnet")
|
||||
|
||||
# Tracing — set to "true" to enable OpenTelemetry distributed tracing.
|
||||
# Separate from DECNET_DEVELOPER so tracing can be toggled independently.
|
||||
DECNET_DEVELOPER_TRACING: bool = os.environ.get("DECNET_DEVELOPER_TRACING", "").lower() == "true"
|
||||
|
||||
Reference in New Issue
Block a user