feat(bus): host-local UNIX-socket pub/sub worker (DEBT-029)

Land the `decnet bus` worker and `get_bus()` factory. Transport is a
host-local UNIX-domain socket (0660, group=decnet); authz is the file
mode. Wire framing is a tiny verb-line + 4-byte-BE length + orjson body.
NATS-style wildcard topics (`*`, `>`). At-most-once, fire-and-forget —
DB stays the source of truth. `FakeBus` / `NullBus` for tests and the
disabled path. Cross-host federation is deferred to a future
`--bridge-tcp` mode; DEBT-030 is master-only and unblocked.
This commit is contained in:
2026-04-21 13:49:02 -04:00
parent 4481a947d4
commit fbf289ff63
23 changed files with 2167 additions and 4 deletions

18
decnet/bus/__init__.py Normal file
View File

@@ -0,0 +1,18 @@
"""DECNET ServiceBus — pub/sub notification substrate.
The bus is the notification layer for DECNET's worker constellation. The DB
remains the source of truth for anything durable; the bus carries "something
happened, go look" events. Delivery is at-most-once, fire-and-forget.
Consumers call :func:`get_bus` from :mod:`decnet.bus.factory`; never import
transport implementations directly. The factory selects the backend via
``DECNET_BUS_TYPE`` (``nats`` or ``fake``) and honors ``DECNET_BUS_ENABLED``.
Topic hierarchy is defined in :mod:`decnet.bus.topics` and locked early so
consumers can subscribe with stable wildcard patterns.
"""
from __future__ import annotations
from decnet.bus.base import BaseBus, Event, Subscription
__all__ = ["BaseBus", "Event", "Subscription"]

205
decnet/bus/base.py Normal file
View File

@@ -0,0 +1,205 @@
"""Bus abstractions: the :class:`Event` envelope and the :class:`BaseBus` ABC.
Every transport (NATS, in-process fake, null) speaks this contract. The
envelope is versioned (``v``) so future evolution never breaks deployed
consumers that happen to see a newer event shape.
Subscription model: :meth:`BaseBus.subscribe` returns a :class:`Subscription`
that is an async context manager AND an async iterator. The expected usage is:
async with bus.subscribe("topology.*.mutation.*") as sub:
async for event in sub:
handle(event)
Leaving the ``async with`` releases the underlying subscription handle; the
transport is free to drop any buffered events after that point.
"""
from __future__ import annotations
import abc
import asyncio
import time
import uuid
from dataclasses import dataclass, field
from typing import Any, AsyncIterator
EVENT_SCHEMA_VERSION = 1
@dataclass(frozen=True)
class Event:
"""The bus envelope.
``v`` is the envelope schema version, bumped on incompatible shape
changes. ``type`` is a short discriminator (``"mutation.applied"``,
``"decky.state"``) useful for consumers that subscribe to a broad
wildcard and dispatch in Python; it is redundant with the trailing
segments of ``topic`` but cheaper to inspect. ``ts`` is epoch seconds
(float). ``id`` is a random UUID so consumers can de-dupe if they
ever see the same event twice (not expected at-most-once, but cheap
insurance).
"""
topic: str
payload: dict[str, Any]
type: str = ""
v: int = EVENT_SCHEMA_VERSION
ts: float = field(default_factory=time.time)
id: str = field(default_factory=lambda: uuid.uuid4().hex)
def to_dict(self) -> dict[str, Any]:
return {
"v": self.v,
"id": self.id,
"topic": self.topic,
"type": self.type,
"ts": self.ts,
"payload": self.payload,
}
@classmethod
def from_dict(cls, topic: str, data: dict[str, Any]) -> "Event":
"""Reconstruct an Event from a wire-format dict.
``topic`` is passed explicitly because the transport knows which
subject the message arrived on; trusting a ``topic`` field from the
wire would let a misbehaving publisher spoof events on topics they
don't actually publish to.
"""
return cls(
topic=topic,
payload=data.get("payload", {}) or {},
type=data.get("type", "") or "",
v=int(data.get("v", EVENT_SCHEMA_VERSION)),
ts=float(data.get("ts", time.time())),
id=data.get("id") or uuid.uuid4().hex,
)
class Subscription(abc.ABC):
"""An open subscription — async context manager + async iterator.
Concrete transports subclass this and implement :meth:`_aclose` plus the
async iterator protocol. Callers should not instantiate directly; use
:meth:`BaseBus.subscribe`.
"""
def __init__(self, pattern: str) -> None:
self.pattern = pattern
self._closed = False
async def __aenter__(self) -> "Subscription":
return self
async def __aexit__(self, *exc: Any) -> None:
await self.aclose()
def __aiter__(self) -> AsyncIterator[Event]:
return self
async def aclose(self) -> None:
if self._closed:
return
self._closed = True
await self._aclose()
@abc.abstractmethod
async def __anext__(self) -> Event: # pragma: no cover - abstract
raise NotImplementedError
@abc.abstractmethod
async def _aclose(self) -> None: # pragma: no cover - abstract
raise NotImplementedError
class BaseBus(abc.ABC):
"""Pub/sub transport contract.
Implementations MUST be safe to ``await connect()`` multiple times and
``await close()`` multiple times. Publishing to a closed bus raises
:class:`RuntimeError`; subscribing to a closed bus does too.
"""
@abc.abstractmethod
async def connect(self) -> None:
"""Establish any network/transport resources. Idempotent."""
@abc.abstractmethod
async def publish(
self,
topic: str,
payload: dict[str, Any],
*,
event_type: str = "",
) -> None:
"""Publish *payload* on *topic*. Fire-and-forget.
Delivery is at-most-once. On transport error the implementation
logs and returns; it does not raise, because bus losses must not
cascade into worker failure (DB is source of truth).
"""
@abc.abstractmethod
def subscribe(self, pattern: str) -> Subscription:
"""Return a :class:`Subscription` that yields events matching *pattern*.
Patterns follow NATS wildcard semantics: ``*`` matches one topic
token, ``>`` matches one-or-more trailing tokens. Examples:
* ``topology.*.mutation.applied`` — all ``applied`` events for any
topology.
* ``topology.abc123.mutation.*`` — all mutation states for one
topology.
* ``topology.>`` — every event under the ``topology`` root.
"""
@abc.abstractmethod
async def close(self) -> None:
"""Tear down transport resources. Idempotent."""
async def __aenter__(self) -> "BaseBus":
await self.connect()
return self
async def __aexit__(self, *exc: Any) -> None:
await self.close()
# ─── Wildcard matching shared across in-process transports ───────────────────
def matches(pattern: str, topic: str) -> bool:
"""Return True iff *topic* matches *pattern* under NATS wildcard rules.
``*`` matches exactly one non-empty token; ``>`` matches one-or-more
trailing tokens (so ``topology.>`` matches ``topology.abc.x`` but not
``topology`` alone).
"""
p_tokens = pattern.split(".")
t_tokens = topic.split(".")
for i, p in enumerate(p_tokens):
if p == ">":
# Must have at least one token remaining to match.
return i < len(t_tokens)
if i >= len(t_tokens):
return False
if p == "*":
if not t_tokens[i]:
return False
continue
if p != t_tokens[i]:
return False
return len(p_tokens) == len(t_tokens)
# Sentinel used by the in-process transports to signal "no more events"
# through the asyncio.Queue fan-out without inventing a separate control
# channel. Not part of the wire protocol.
_CLOSE_SENTINEL: Any = object()
async def _next_or_stop(queue: "asyncio.Queue[Any]") -> Event:
"""Pop the next item from *queue*, raising ``StopAsyncIteration`` on close."""
item = await queue.get()
if item is _CLOSE_SENTINEL:
raise StopAsyncIteration
return item

85
decnet/bus/factory.py Normal file
View File

@@ -0,0 +1,85 @@
"""Bus factory — selects a :class:`~decnet.bus.base.BaseBus` implementation.
Dispatch key: the ``DECNET_BUS_TYPE`` environment variable.
* ``unix`` (default) → :class:`~decnet.bus.unix_client.UnixSocketBus`
* ``fake`` → :class:`~decnet.bus.fake.FakeBus` (in-process)
If ``DECNET_BUS_ENABLED`` is ``"false"`` the factory short-circuits to
:class:`~decnet.bus.fake.NullBus` regardless of ``DECNET_BUS_TYPE`` — a
cheap way for dev environments to run workers without a bus daemon.
Mirrors :mod:`decnet.web.db.factory` (lazy imports inside each branch,
env-driven dispatch, optional telemetry wrapping). Callers MUST use
:func:`get_bus` rather than instantiating transports directly.
"""
from __future__ import annotations
import os
from typing import Any
from decnet.bus.base import BaseBus
def get_bus(**kwargs: Any) -> BaseBus:
"""Instantiate the bus implementation selected by environment.
Keyword arguments are forwarded to the concrete transport:
* ``UnixSocketBus`` accepts ``socket_path`` (overrides
``DECNET_BUS_SOCKET``) and ``client_name``.
* ``FakeBus`` accepts ``queue_size``.
"""
if os.environ.get("DECNET_BUS_ENABLED", "true").lower() == "false":
from decnet.bus.fake import NullBus
return NullBus()
bus_type = os.environ.get("DECNET_BUS_TYPE", "unix").lower()
if bus_type == "unix":
from decnet.bus.unix_client import UnixSocketBus
socket_path = kwargs.pop("socket_path", None) or _default_socket_path()
bus: BaseBus = UnixSocketBus(socket_path=socket_path, **kwargs)
elif bus_type == "fake":
from decnet.bus.fake import FakeBus
bus = FakeBus(**kwargs)
else:
raise ValueError(f"Unsupported bus type: {bus_type}")
return _maybe_wrap_telemetry(bus)
def _default_socket_path() -> str:
"""Return the bus socket path honoring ``DECNET_BUS_SOCKET`` and falling
back to ``/run/decnet/bus.sock`` → ``~/.decnet/bus.sock``.
The runtime path (``/run/decnet``) is preferred because systemd
``RuntimeDirectory=decnet`` sets it up with the right perms; the home
fallback keeps dev boxes usable without systemd.
"""
explicit = os.environ.get("DECNET_BUS_SOCKET")
if explicit:
return explicit
runtime_dir = "/run/decnet"
if os.path.isdir(runtime_dir) and os.access(runtime_dir, os.W_OK):
return f"{runtime_dir}/bus.sock"
return os.path.expanduser("~/.decnet/bus.sock")
def _maybe_wrap_telemetry(bus: BaseBus) -> BaseBus:
"""Wrap *bus* in a tracing proxy if OTEL is enabled, else return as-is.
Uses :func:`decnet.telemetry.wrap_repository` as the underlying proxy —
its implementation is generic (wraps any async method in a span), so we
reuse it with a bus-appropriate tracer name. If telemetry isn't wired
up at all we no-op.
"""
try:
from decnet.telemetry import wrap_repository # type: ignore[attr-defined]
except ImportError:
return bus
try:
return wrap_repository(bus)
except Exception: # pragma: no cover - defensive
return bus

183
decnet/bus/fake.py Normal file
View File

@@ -0,0 +1,183 @@
"""In-process bus transports.
* :class:`FakeBus` — real pub/sub semantics without touching a socket. Used
by unit tests and anywhere ``DECNET_BUS_TYPE=fake`` is set. Lets code
that depends on the bus be exercised entirely inside a single event loop,
matching the DECNET testing convention of not opening real network
sockets from unit tests.
* :class:`NullBus` — no-op. Returned by :func:`~decnet.bus.factory.get_bus`
when ``DECNET_BUS_ENABLED=false`` so workers can start cleanly in dev
environments where no bus daemon is running. Publishes are dropped;
subscriptions yield nothing and close cleanly.
"""
from __future__ import annotations
import asyncio
from typing import Any
from decnet.bus.base import (
BaseBus,
Event,
Subscription,
_CLOSE_SENTINEL,
matches,
)
from decnet.logging import get_logger
log = get_logger("bus.fake")
# Per-subscriber bounded queue: backpressure policy is drop-oldest so a slow
# consumer cannot stall publishers (the invariant — DB is the source of
# truth — makes dropped events acceptable).
_DEFAULT_QUEUE_SIZE = 1024
# ─── FakeBus ─────────────────────────────────────────────────────────────────
class _FakeSubscription(Subscription):
"""Subscription backed by an :class:`asyncio.Queue` fed from
:meth:`FakeBus.publish`. Unregisters itself on close."""
def __init__(self, bus: "FakeBus", pattern: str, queue: "asyncio.Queue[Any]") -> None:
super().__init__(pattern)
self._bus = bus
self._queue = queue
async def __anext__(self) -> Event:
if self._closed:
raise StopAsyncIteration
item = await self._queue.get()
if item is _CLOSE_SENTINEL:
raise StopAsyncIteration
return item
async def _aclose(self) -> None:
self._bus._unregister(self)
# Unblock any pending __anext__ waiter.
try:
self._queue.put_nowait(_CLOSE_SENTINEL)
except asyncio.QueueFull:
pass
class FakeBus(BaseBus):
"""In-process pub/sub.
Publishes iterate every active subscription and enqueue the event on
the ones whose pattern matches the topic. If a subscriber's queue is
full, the oldest event is discarded to make room — same at-most-once
semantics as the real UNIX-socket transport.
"""
def __init__(self, queue_size: int = _DEFAULT_QUEUE_SIZE) -> None:
self._queue_size = queue_size
self._subs: list[_FakeSubscription] = []
self._connected = False
self._closed = False
self._lock = asyncio.Lock()
async def connect(self) -> None:
self._connected = True
async def publish(
self,
topic: str,
payload: dict[str, Any],
*,
event_type: str = "",
) -> None:
if self._closed:
raise RuntimeError("publish on closed bus")
event = Event(topic=topic, payload=payload, type=event_type)
async with self._lock:
targets = [s for s in self._subs if matches(s.pattern, topic)]
for sub in targets:
_enqueue_drop_oldest(sub._queue, event)
def subscribe(self, pattern: str) -> Subscription:
if self._closed:
raise RuntimeError("subscribe on closed bus")
queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=self._queue_size)
sub = _FakeSubscription(self, pattern, queue)
self._subs.append(sub)
return sub
def _unregister(self, sub: _FakeSubscription) -> None:
try:
self._subs.remove(sub)
except ValueError:
pass
async def close(self) -> None:
if self._closed:
return
self._closed = True
# Wake every still-open subscription so iterators unblock cleanly.
for sub in list(self._subs):
try:
sub._queue.put_nowait(_CLOSE_SENTINEL)
except asyncio.QueueFull:
pass
self._subs.clear()
def _enqueue_drop_oldest(queue: "asyncio.Queue[Any]", event: Event) -> None:
"""Put *event* on *queue*, dropping the oldest item if the queue is full.
Factored out so both FakeBus and the real UNIX server share the exact
same backpressure policy.
"""
while True:
try:
queue.put_nowait(event)
return
except asyncio.QueueFull:
try:
dropped = queue.get_nowait()
log.warning(
"bus.fake: subscriber queue full, dropped %s", getattr(dropped, "topic", "?")
)
except asyncio.QueueEmpty:
return
# ─── NullBus ─────────────────────────────────────────────────────────────────
class _NullSubscription(Subscription):
"""A subscription that never yields and closes immediately on iteration."""
async def __anext__(self) -> Event:
raise StopAsyncIteration
async def _aclose(self) -> None:
return
class NullBus(BaseBus):
"""No-op bus used when ``DECNET_BUS_ENABLED=false``.
Publishes are silently dropped; subscriptions are empty. Intended for
dev environments where no bus daemon is running — the process starts
cleanly, code that publishes doesn't need feature flags, and nothing
ever blocks on a subscriber.
"""
async def connect(self) -> None:
return
async def publish(
self,
topic: str,
payload: dict[str, Any],
*,
event_type: str = "",
) -> None:
return
def subscribe(self, pattern: str) -> Subscription:
return _NullSubscription(pattern)
async def close(self) -> None:
return

144
decnet/bus/protocol.py Normal file
View File

@@ -0,0 +1,144 @@
"""Wire protocol for the DECNET bus UNIX-socket transport.
Frame layout:
<VERB> [<args ...>]\\n # ASCII header, single line, no trailing space
<4-byte big-endian body length>
<body> # orjson-serialized dict, or empty (length 0)
Verbs:
* ``HELLO <client-name>`` — optional greeting, logged by server. Body empty.
* ``PUB <topic>`` — publisher → server. Body = payload dict.
* ``SUB <pattern>`` — subscriber → server. Body empty.
* ``UNSUB <pattern>`` — subscriber → server. Body empty.
* ``EVT <topic>`` — server → subscriber. Body = payload dict (wrapped
in an :class:`~decnet.bus.base.Event` envelope).
* ``BYE`` — either direction. Body empty. Graceful shutdown.
Parsing rules:
* The header is a single line terminated by ``\\n`` (LF). ``\\r`` is tolerated
but not required.
* Header tokens are whitespace-separated. The first token is the verb;
everything after is verb-specific. We split on the first space only so
topics / patterns with quoted content are not supported (they are not
needed — topic segments forbid whitespace per :mod:`decnet.bus.topics`).
* Maximum header length is 4096 bytes; maximum body length is 1 MiB. Beyond
those, the connection is dropped with a logged error. This is a honeypot
framework, not a general-purpose message broker; a malformed frame is
treated as hostile.
"""
from __future__ import annotations
import asyncio
import struct
from dataclasses import dataclass
from typing import Any
import orjson
MAX_HEADER_BYTES = 4096
MAX_BODY_BYTES = 1 * 1024 * 1024 # 1 MiB
# Verb constants (callers should reference these, not bare strings).
HELLO = "HELLO"
PUB = "PUB"
SUB = "SUB"
UNSUB = "UNSUB"
EVT = "EVT"
BYE = "BYE"
_VALID_VERBS = frozenset({HELLO, PUB, SUB, UNSUB, EVT, BYE})
class ProtocolError(Exception):
"""Malformed or oversized frame. Callers should close the connection."""
@dataclass(frozen=True)
class Frame:
"""A parsed frame. ``body`` is the raw (unparsed) body bytes — callers
decide whether to orjson-decode it (the protocol does not know whether
a given verb expects a dict body or an empty one).
"""
verb: str
args: str # everything after the verb on the header line, trimmed
body: bytes
def encode(verb: str, args: str = "", body: dict[str, Any] | None = None) -> bytes:
"""Serialize a frame.
*body* is a dict that will be orjson-encoded, or ``None`` for an empty
body. The header line is written verbatim — callers must supply args
that are free of ``\\n``.
"""
if verb not in _VALID_VERBS:
raise ProtocolError(f"unknown verb {verb!r}")
if "\n" in args or "\r" in args:
raise ProtocolError("args must not contain newline characters")
body_bytes = b"" if body is None else orjson.dumps(body)
if len(body_bytes) > MAX_BODY_BYTES:
raise ProtocolError(
f"body {len(body_bytes)} bytes exceeds max {MAX_BODY_BYTES}"
)
header = f"{verb} {args}".rstrip() + "\n"
header_bytes = header.encode("ascii")
if len(header_bytes) > MAX_HEADER_BYTES:
raise ProtocolError(
f"header {len(header_bytes)} bytes exceeds max {MAX_HEADER_BYTES}"
)
return header_bytes + struct.pack(">I", len(body_bytes)) + body_bytes
async def read_frame(reader: asyncio.StreamReader) -> Frame | None:
"""Read one frame from *reader*.
Returns ``None`` on clean EOF before a new frame starts. Raises
:class:`ProtocolError` on malformed input (caller should close the
connection).
"""
try:
header = await reader.readuntil(b"\n")
except asyncio.IncompleteReadError as exc:
if not exc.partial:
return None
raise ProtocolError("connection closed mid-header") from exc
except asyncio.LimitOverrunError as exc:
raise ProtocolError("header exceeded buffer limit") from exc
if len(header) > MAX_HEADER_BYTES:
raise ProtocolError(f"header {len(header)} bytes exceeds max")
line = header.rstrip(b"\r\n").decode("ascii", errors="strict")
if not line:
raise ProtocolError("empty header line")
verb, _, args = line.partition(" ")
if verb not in _VALID_VERBS:
raise ProtocolError(f"unknown verb {verb!r}")
length_bytes = await reader.readexactly(4)
(body_len,) = struct.unpack(">I", length_bytes)
if body_len > MAX_BODY_BYTES:
raise ProtocolError(f"body length {body_len} exceeds max")
body = await reader.readexactly(body_len) if body_len else b""
return Frame(verb=verb, args=args.strip(), body=body)
def decode_body(body: bytes) -> dict[str, Any]:
"""Decode a frame body as a JSON dict. Empty body → empty dict."""
if not body:
return {}
try:
obj = orjson.loads(body)
except orjson.JSONDecodeError as exc:
raise ProtocolError(f"body is not valid JSON: {exc}") from exc
if not isinstance(obj, dict):
raise ProtocolError(f"body must be a JSON object, got {type(obj).__name__}")
return obj

106
decnet/bus/topics.py Normal file
View File

@@ -0,0 +1,106 @@
"""Canonical topic hierarchy for the DECNET ServiceBus.
Locked early so consumers can subscribe with stable wildcard patterns.
Adding new topic families is fine; **renaming** existing ones is a breaking
change for every subscriber and requires a coordinated rollout.
Token structure (NATS-style, dot-separated):
topology.{topology_id}.mutation.{state}
topology.{topology_id}.status
decky.{decky_id}.state
decky.{decky_id}.traffic
attacker.observed
system.log
system.bus.health
Wildcards (per :func:`decnet.bus.base.matches`):
* ``*`` matches exactly one token.
* ``>`` matches one-or-more trailing tokens (so ``topology.>`` matches
``topology.abc.status`` but not the bare root ``topology``).
"""
from __future__ import annotations
# ─── Root prefixes ───────────────────────────────────────────────────────────
TOPOLOGY = "topology"
DECKY = "decky"
ATTACKER = "attacker"
SYSTEM = "system"
# ─── Leaf event-type constants (the last segment of each topic) ──────────────
# Topology mutation lifecycle states — keep in sync with TopologyMutation.state
# in decnet/web/db/models.py; the bus topic mirrors the DB state machine.
MUTATION_ENQUEUED = "enqueued"
MUTATION_APPLYING = "applying"
MUTATION_APPLIED = "applied"
MUTATION_FAILED = "failed"
# Topology-level status transitions (topology.{id}.status): fires when the
# topology row's status column changes (pending/deploying/active/degraded/failed).
TOPOLOGY_STATUS = "status"
# Decky-level event types (second token).
DECKY_STATE = "state"
DECKY_TRAFFIC = "traffic"
# System event types.
SYSTEM_LOG = "log"
SYSTEM_BUS_HEALTH = "bus.health"
# ─── Builders ────────────────────────────────────────────────────────────────
def topology_mutation(topology_id: str, state: str) -> str:
"""Build ``topology.<id>.mutation.<state>``.
*state* should be one of the ``MUTATION_*`` constants.
"""
_reject_tokens(topology_id, state)
return f"{TOPOLOGY}.{topology_id}.mutation.{state}"
def topology_status(topology_id: str) -> str:
"""Build ``topology.<id>.status``."""
_reject_tokens(topology_id)
return f"{TOPOLOGY}.{topology_id}.{TOPOLOGY_STATUS}"
def decky(decky_id: str, event_type: str) -> str:
"""Build ``decky.<id>.<event_type>``.
*event_type* is typically one of ``DECKY_STATE`` or ``DECKY_TRAFFIC``.
"""
_reject_tokens(decky_id, event_type)
return f"{DECKY}.{decky_id}.{event_type}"
def system(event_type: str) -> str:
"""Build ``system.<event_type>``.
*event_type* may itself contain dots (e.g. ``bus.health``) — we don't
re-validate the already-constant leaves; this just prefixes.
"""
if not event_type:
raise ValueError("system topic requires a non-empty event_type")
return f"{SYSTEM}.{event_type}"
def _reject_tokens(*parts: str) -> None:
"""Reject topic segments that would break NATS-style tokenization.
Dots, wildcards, whitespace, and empty strings in a *segment* would
silently corrupt the hierarchy (e.g. ``topology.a.b.status`` for a
``topology_id`` of ``"a.b"``). Raise early at the builder instead of
shipping a malformed topic to the wire.
"""
for p in parts:
if not p:
raise ValueError("topic segment must not be empty")
if "." in p or "*" in p or ">" in p or any(c.isspace() for c in p):
raise ValueError(
f"topic segment {p!r} may not contain '.', '*', '>', or whitespace"
)

237
decnet/bus/unix_client.py Normal file
View File

@@ -0,0 +1,237 @@
"""UNIX-socket client — :class:`UnixSocketBus` implementation of :class:`BaseBus`.
Holds one open socket to the local :class:`~decnet.bus.unix_server.BusServer`.
Operations:
* :meth:`publish` writes a single ``PUB`` frame and returns; no ack.
* :meth:`subscribe` writes a ``SUB`` frame and returns a
:class:`~decnet.bus.base.Subscription` backed by an :class:`asyncio.Queue`
that the background reader task feeds.
One background reader task per bus instance dispatches incoming ``EVT``
frames to every registered subscription whose pattern matches the topic.
On connection drop or close, every subscription is woken via a sentinel so
iterators unblock cleanly; callers see :class:`StopAsyncIteration` from the
``async for`` loop.
No auto-reconnect in MVP. If the server restarts, callers must
:meth:`close` the bus and construct a new one. This mirrors how other
DECNET workers handle their dependencies — the systemd ``Restart=on-failure``
supervision above us is the retry loop.
"""
from __future__ import annotations
import asyncio
import contextlib
import os
import pathlib
from typing import Any
from decnet.bus import protocol
from decnet.bus.base import (
BaseBus,
Event,
Subscription,
_CLOSE_SENTINEL,
matches,
)
from decnet.bus.fake import _enqueue_drop_oldest as _enqueue_event_drop_oldest
from decnet.logging import get_logger
log = get_logger("bus.client")
_INBOUND_QUEUE_SIZE = 1024
class _UnixSubscription(Subscription):
def __init__(
self,
bus: "UnixSocketBus",
pattern: str,
queue: "asyncio.Queue[Any]",
) -> None:
super().__init__(pattern)
self._bus = bus
self._queue = queue
async def __anext__(self) -> Event:
if self._closed:
raise StopAsyncIteration
item = await self._queue.get()
if item is _CLOSE_SENTINEL:
raise StopAsyncIteration
return item
async def _aclose(self) -> None:
await self._bus._unregister(self)
try:
self._queue.put_nowait(_CLOSE_SENTINEL)
except asyncio.QueueFull:
pass
class UnixSocketBus(BaseBus):
"""Client handle for a local :class:`BusServer`.
One instance per process typically; multiple instances simply open
multiple sockets to the same server. Connection is lazy — the first
:meth:`connect` (or any publish/subscribe call via ``async with``)
opens the socket.
"""
def __init__(
self,
socket_path: pathlib.Path | str,
*,
client_name: str | None = None,
) -> None:
self._path = pathlib.Path(socket_path)
self._client_name = client_name or f"decnet-bus-client[{os.getpid()}]"
self._reader: asyncio.StreamReader | None = None
self._writer: asyncio.StreamWriter | None = None
self._reader_task: asyncio.Task[None] | None = None
self._subs: list[_UnixSubscription] = []
self._lock = asyncio.Lock()
self._write_lock = asyncio.Lock()
self._closed = False
# ─── Lifecycle ──────────────────────────────────────────────────────────
async def connect(self) -> None:
if self._writer is not None:
return
if self._closed:
raise RuntimeError("connect on closed bus")
self._reader, self._writer = await asyncio.open_unix_connection(str(self._path))
await self._send(protocol.encode(protocol.HELLO, args=self._client_name))
self._reader_task = asyncio.create_task(self._reader_loop())
log.debug("bus.client: connected to %s as %s", self._path, self._client_name)
async def close(self) -> None:
if self._closed:
return
self._closed = True
# Best-effort BYE — we don't care if it fails.
if self._writer is not None and not self._writer.is_closing():
with contextlib.suppress(Exception):
await self._send(protocol.encode(protocol.BYE))
if self._reader_task is not None:
self._reader_task.cancel()
with contextlib.suppress(asyncio.CancelledError):
await self._reader_task
self._reader_task = None
if self._writer is not None:
with contextlib.suppress(Exception):
self._writer.close()
await self._writer.wait_closed()
self._writer = None
self._reader = None
# Wake every subscription so `async for` exits.
for sub in list(self._subs):
with contextlib.suppress(asyncio.QueueFull):
sub._queue.put_nowait(_CLOSE_SENTINEL)
self._subs.clear()
# ─── Pub/Sub ────────────────────────────────────────────────────────────
async def publish(
self,
topic: str,
payload: dict[str, Any],
*,
event_type: str = "",
) -> None:
if self._closed:
raise RuntimeError("publish on closed bus")
if self._writer is None:
await self.connect()
body = Event(topic=topic, payload=payload, type=event_type).to_dict()
try:
await self._send(protocol.encode(protocol.PUB, args=topic, body=body))
except (ConnectionError, BrokenPipeError) as exc:
# Bus loss is a logged warning, never a publisher crash. The
# DB-as-source-of-truth invariant means the work is already
# persisted; the missing event is just a missed notification.
log.warning("bus.client: publish failed: %s", exc)
def subscribe(self, pattern: str) -> Subscription:
if self._closed:
raise RuntimeError("subscribe on closed bus")
queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=_INBOUND_QUEUE_SIZE)
sub = _UnixSubscription(self, pattern, queue)
self._subs.append(sub)
# Schedule the SUB frame asynchronously so subscribe() stays sync,
# matching the BaseBus signature. The caller will shortly `async
# with` / `async for` the subscription, which will run the event
# loop and pick this task up.
asyncio.ensure_future(self._send_sub(pattern))
return sub
async def _send_sub(self, pattern: str) -> None:
try:
if self._writer is None:
await self.connect()
await self._send(protocol.encode(protocol.SUB, args=pattern))
except Exception as exc: # pragma: no cover - network paths in live tests
log.warning("bus.client: SUB %s failed: %s", pattern, exc)
async def _unregister(self, sub: _UnixSubscription) -> None:
try:
self._subs.remove(sub)
except ValueError:
return
# Tell the server we no longer want events for this pattern if no
# other local subscription still wants it.
if not any(s.pattern == sub.pattern for s in self._subs):
with contextlib.suppress(Exception):
await self._send(protocol.encode(protocol.UNSUB, args=sub.pattern))
# ─── Internal I/O ───────────────────────────────────────────────────────
async def _send(self, frame_bytes: bytes) -> None:
if self._writer is None:
raise ConnectionError("bus.client: not connected")
async with self._write_lock:
self._writer.write(frame_bytes)
await self._writer.drain()
async def _reader_loop(self) -> None:
if self._reader is None:
return
try:
while True:
frame = await protocol.read_frame(self._reader)
if frame is None:
break
if frame.verb != protocol.EVT:
# Clients only ever legitimately receive EVT (or BYE).
if frame.verb == protocol.BYE:
break
log.warning("bus.client: unexpected verb from server: %s", frame.verb)
continue
topic = frame.args
data = protocol.decode_body(frame.body) if frame.body else {}
event = Event.from_dict(topic, data)
self._dispatch(event)
except protocol.ProtocolError as exc:
log.warning("bus.client: protocol error: %s", exc)
except (asyncio.IncompleteReadError, ConnectionError):
pass
except asyncio.CancelledError:
raise
except Exception: # pragma: no cover
log.exception("bus.client: reader loop crashed")
finally:
# Server-side close — wake every subscription.
for sub in list(self._subs):
with contextlib.suppress(asyncio.QueueFull):
sub._queue.put_nowait(_CLOSE_SENTINEL)
def _dispatch(self, event: Event) -> None:
for sub in self._subs:
if matches(sub.pattern, event.topic):
_enqueue_event_drop_oldest(sub._queue, event)

309
decnet/bus/unix_server.py Normal file
View File

@@ -0,0 +1,309 @@
"""UNIX-socket server for the DECNET bus.
One :class:`BusServer` per host. Accepts local connections on a UNIX-domain
socket; each connection may:
* publish events (``PUB`` frames) that the server fans out to all matching
subscribers on other connections, and
* subscribe to patterns (``SUB`` frames) and receive matching events as
``EVT`` frames.
Authorization is socket file permissions (0660, group=``decnet`` if that
POSIX group exists, else the server process's own group). Anything the
kernel lets ``connect()`` is trusted — there is no verb-level auth. This
matches the "local processes on the same host" threat model; cross-host
federation is out of scope (see DEBT-029).
Backpressure is per-connection, drop-oldest: if a subscriber can't drain its
outbound queue fast enough, the server discards the oldest pending event
rather than blocking publishers. The bus is at-most-once by contract, so
drops are acceptable; stalled publishers are not.
"""
from __future__ import annotations
import asyncio
import contextlib
import grp
import os
import pathlib
from dataclasses import dataclass, field
from typing import Any
from decnet.bus import protocol
from decnet.bus.base import Event, matches
from decnet.logging import get_logger
log = get_logger("bus.server")
_SOCKET_MODE = 0o660
_DEFAULT_GROUP = "decnet"
_OUTBOUND_QUEUE_SIZE = 1024
@dataclass(eq=False)
class _Connection:
"""Per-connection server state."""
writer: asyncio.StreamWriter
peer_name: str = "<unknown>"
patterns: set[str] = field(default_factory=set)
outbound: asyncio.Queue[bytes] = field(
default_factory=lambda: asyncio.Queue(maxsize=_OUTBOUND_QUEUE_SIZE)
)
closed: bool = False
class BusServer:
"""Serve a UNIX-socket bus on *socket_path*.
Lifecycle: construct → :meth:`start` → :meth:`serve_forever` (or rely
on :meth:`start` returning once bound) → :meth:`close` for teardown.
Safe to :meth:`close` multiple times.
"""
def __init__(
self,
socket_path: pathlib.Path | str,
*,
group: str | None = _DEFAULT_GROUP,
mode: int = _SOCKET_MODE,
) -> None:
self._path = pathlib.Path(socket_path)
self._group = group
self._mode = mode
self._server: asyncio.base_events.Server | None = None
self._connections: set[_Connection] = set()
self._closed = False
# ─── Lifecycle ──────────────────────────────────────────────────────────
async def start(self) -> None:
"""Bind the socket and begin accepting connections.
Removes any stale socket file at *socket_path* first (common case:
the previous worker crashed without cleaning up). The parent
directory must already exist; we do NOT create it blindly because
the chosen directory (typically ``/run/decnet``) may require
systemd ``RuntimeDirectory=`` to set up.
"""
if self._server is not None:
return
parent = self._path.parent
if not parent.exists():
raise FileNotFoundError(
f"bus socket parent directory {parent} does not exist; "
f"create it with systemd RuntimeDirectory= or mkdir"
)
# Clean up a stale socket from a previous crash. If a live server
# is actually listening there, ``bind()`` below will fail — we do
# not try to detect live vs. stale ourselves.
with contextlib.suppress(FileNotFoundError):
if self._path.is_socket():
self._path.unlink()
self._server = await asyncio.start_unix_server(
self._handle_connection, path=str(self._path),
)
_chmod_and_chown(self._path, self._mode, self._group)
log.info("bus.server: listening on %s (mode=%o group=%s)",
self._path, self._mode, self._group or "<inherit>")
async def serve_forever(self) -> None:
if self._server is None:
raise RuntimeError("BusServer not started")
async with self._server:
await self._server.serve_forever()
async def close(self) -> None:
if self._closed:
return
self._closed = True
if self._server is not None:
self._server.close()
with contextlib.suppress(Exception):
await self._server.wait_closed()
self._server = None
# Drain every live connection.
for conn in list(self._connections):
await self._close_connection(conn)
self._connections.clear()
with contextlib.suppress(FileNotFoundError):
self._path.unlink()
log.info("bus.server: closed")
# ─── Internal publish fan-out ───────────────────────────────────────────
async def publish(self, topic: str, payload: dict[str, Any], event_type: str = "") -> None:
"""Server-side publish helper — used by the worker to emit
``system.bus.health`` heartbeats without opening a client loop."""
event = Event(topic=topic, payload=payload, type=event_type)
self._fanout(event)
# ─── Connection handler ─────────────────────────────────────────────────
async def _handle_connection(
self,
reader: asyncio.StreamReader,
writer: asyncio.StreamWriter,
) -> None:
conn = _Connection(writer=writer)
self._connections.add(conn)
writer_task = asyncio.create_task(self._writer_loop(conn))
try:
await self._reader_loop(conn, reader)
except protocol.ProtocolError as exc:
log.warning("bus.server: protocol error from %s: %s", conn.peer_name, exc)
except (asyncio.IncompleteReadError, ConnectionError) as exc:
log.debug("bus.server: %s disconnected: %s", conn.peer_name, exc)
except Exception: # pragma: no cover - defensive
log.exception("bus.server: unhandled error in connection")
finally:
await self._close_connection(conn)
self._connections.discard(conn)
writer_task.cancel()
with contextlib.suppress(asyncio.CancelledError):
await writer_task
async def _reader_loop(
self, conn: _Connection, reader: asyncio.StreamReader,
) -> None:
while True:
frame = await protocol.read_frame(reader)
if frame is None:
return
await self._dispatch(conn, frame)
if frame.verb == protocol.BYE:
return
async def _dispatch(self, conn: _Connection, frame: protocol.Frame) -> None:
if frame.verb == protocol.HELLO:
conn.peer_name = frame.args or conn.peer_name
log.debug("bus.server: HELLO from %s", conn.peer_name)
return
if frame.verb == protocol.SUB:
pattern = frame.args
if not pattern:
raise protocol.ProtocolError("SUB requires a pattern")
conn.patterns.add(pattern)
log.debug("bus.server: %s SUB %s", conn.peer_name, pattern)
return
if frame.verb == protocol.UNSUB:
conn.patterns.discard(frame.args)
return
if frame.verb == protocol.PUB:
topic = frame.args
if not topic:
raise protocol.ProtocolError("PUB requires a topic")
data = protocol.decode_body(frame.body) if frame.body else {}
event = Event(
topic=topic,
payload=data.get("payload", {}) or {},
type=data.get("type", "") or "",
)
self._fanout(event, origin=conn)
return
if frame.verb == protocol.BYE:
return
# EVT is server-to-client only; receiving one is a protocol violation.
raise protocol.ProtocolError(f"unexpected verb {frame.verb!r} from client")
def _fanout(self, event: Event, *, origin: _Connection | None = None) -> None:
"""Enqueue *event* as an EVT frame on every matching connection.
We do NOT deliver back to the originating connection (a publisher
does not receive its own event). Encoding happens once per event,
not once per subscriber.
"""
try:
frame_bytes = protocol.encode(
protocol.EVT, args=event.topic, body=event.to_dict(),
)
except protocol.ProtocolError:
log.exception("bus.server: failed to encode EVT for topic=%s", event.topic)
return
for conn in self._connections:
if conn is origin or conn.closed:
continue
if not any(matches(p, event.topic) for p in conn.patterns):
continue
_enqueue_drop_oldest(conn.outbound, frame_bytes, event.topic)
async def _writer_loop(self, conn: _Connection) -> None:
"""Serialize writes onto *conn*'s socket.
One writer task per connection so a slow peer only blocks its own
queue, not the fan-out loop. The queue is bounded with drop-oldest
policy applied at enqueue time (see :func:`_enqueue_drop_oldest`).
"""
try:
while not conn.closed:
data = await conn.outbound.get()
conn.writer.write(data)
await conn.writer.drain()
except (ConnectionError, BrokenPipeError):
log.debug("bus.server: %s writer: peer closed", conn.peer_name)
except asyncio.CancelledError:
pass
except Exception: # pragma: no cover - defensive
log.exception("bus.server: writer loop crashed for %s", conn.peer_name)
async def _close_connection(self, conn: _Connection) -> None:
if conn.closed:
return
conn.closed = True
with contextlib.suppress(Exception):
conn.writer.close()
await conn.writer.wait_closed()
# ─── Helpers ─────────────────────────────────────────────────────────────────
def _chmod_and_chown(path: pathlib.Path, mode: int, group: str | None) -> None:
"""Apply socket file perms and best-effort group ownership.
If *group* is ``None`` or the named group does not exist, we leave the
socket owned by the current process group. This keeps the server
usable on dev boxes that don't have a ``decnet`` group set up.
"""
try:
os.chmod(path, mode)
except OSError as exc:
log.warning("bus.server: chmod(%s, %o) failed: %s", path, mode, exc)
if not group:
return
try:
gid = grp.getgrnam(group).gr_gid
except KeyError:
log.debug("bus.server: group %r not found, leaving socket group unchanged", group)
return
try:
os.chown(path, -1, gid)
except PermissionError:
# Dev box running as an unprivileged user can't chown. Log once at
# debug and move on — the socket is still usable by the owner.
log.debug("bus.server: chown(%s, gid=%d) denied; leaving as-is", path, gid)
except OSError as exc:
log.warning("bus.server: chown(%s, gid=%d) failed: %s", path, gid, exc)
def _enqueue_drop_oldest(
queue: "asyncio.Queue[bytes]", data: bytes, topic: str,
) -> None:
"""Drop-oldest backpressure — mirrors :func:`decnet.bus.fake._enqueue_drop_oldest`."""
while True:
try:
queue.put_nowait(data)
return
except asyncio.QueueFull:
try:
queue.get_nowait()
log.warning("bus.server: subscriber queue full, dropped event topic=%s", topic)
except asyncio.QueueEmpty:
return

121
decnet/bus/worker.py Normal file
View File

@@ -0,0 +1,121 @@
"""``decnet bus`` worker entrypoint.
Starts a :class:`~decnet.bus.unix_server.BusServer` on the configured UNIX
socket and serves forever, emitting a ``system.bus.health`` heartbeat on
its own bus every :data:`HEARTBEAT_INTERVAL_SEC` seconds so liveness-aware
consumers (dashboards, watchdogs) can tell the bus is up without polling
the filesystem.
Cross-host federation is **out of scope** for the MVP; each host runs its
own bus independently. See DEBT-029 for the deferred ``--bridge-tcp``
mode that would proxy the socket over the swarm mTLS channel.
"""
from __future__ import annotations
import asyncio
import os
import pathlib
import signal
import time
from decnet.bus import topics
from decnet.bus.unix_server import BusServer
from decnet.logging import get_logger
log = get_logger("bus.worker")
HEARTBEAT_INTERVAL_SEC = 10
async def bus_worker(
socket_path: str | pathlib.Path,
*,
group: str | None = "decnet",
heartbeat_interval: int = HEARTBEAT_INTERVAL_SEC,
) -> None:
"""Run the bus server until cancelled or SIGTERM/SIGINT is received.
The parent directory of *socket_path* must already exist (systemd's
``RuntimeDirectory=decnet`` handles this in prod; dev code is expected
to ``mkdir`` first). This function does not create it implicitly
because the right choice of perms/owner depends on the deployment
context.
"""
path = pathlib.Path(socket_path)
_ensure_parent(path)
server = BusServer(path, group=group)
await server.start()
log.info("bus.worker: pid=%d socket=%s", os.getpid(), path)
stop_event = asyncio.Event()
_install_signal_handlers(stop_event)
heartbeat_task = asyncio.create_task(_heartbeat_loop(server, heartbeat_interval))
serve_task = asyncio.create_task(server.serve_forever())
try:
await stop_event.wait()
log.info("bus.worker: shutdown signal received")
finally:
heartbeat_task.cancel()
serve_task.cancel()
for task in (heartbeat_task, serve_task):
try:
await task
except (asyncio.CancelledError, Exception): # noqa: BLE001 - draining shutdown
pass
await server.close()
log.info("bus.worker: stopped")
async def _heartbeat_loop(server: BusServer, interval: int) -> None:
"""Publish ``system.bus.health`` on the server's own fan-out."""
started_at = time.time()
while True:
try:
await server.publish(
topics.system(topics.SYSTEM_BUS_HEALTH),
{
"pid": os.getpid(),
"uptime_sec": round(time.time() - started_at, 3),
"ts": time.time(),
},
event_type=topics.SYSTEM_BUS_HEALTH,
)
except Exception: # pragma: no cover - heartbeat must never kill the worker
log.exception("bus.worker: heartbeat publish failed")
await asyncio.sleep(interval)
def _install_signal_handlers(stop_event: asyncio.Event) -> None:
loop = asyncio.get_running_loop()
for sig in (signal.SIGTERM, signal.SIGINT):
try:
loop.add_signal_handler(sig, stop_event.set)
except (NotImplementedError, RuntimeError):
# add_signal_handler is not supported on Windows / in some
# test harnesses where the loop is running in a non-main thread.
# The worker still exits via KeyboardInterrupt bubbling up.
pass
def _ensure_parent(path: pathlib.Path) -> None:
parent = path.parent
if parent.exists():
return
# Dev-box convenience: if the parent is the user's ``~/.decnet`` dir,
# create it. We do not auto-mkdir ``/run/decnet`` — that's systemd's job
# and silently creating it as the wrong user would cause permission
# confusion later.
home_prefix = pathlib.Path.home() / ".decnet"
try:
parent.relative_to(home_prefix.parent)
except ValueError:
raise FileNotFoundError(
f"bus socket parent {parent} does not exist; create it first"
)
parent.mkdir(parents=True, exist_ok=True)
__all__ = ["bus_worker", "HEARTBEAT_INTERVAL_SEC"]

View File

@@ -21,6 +21,7 @@ import typer
from . import (
agent,
api,
bus,
db,
deploy,
forwarder,
@@ -51,7 +52,7 @@ for _mod in (
swarm,
deploy, lifecycle, workers, inventory,
web, profiler, sniffer, db,
topology,
topology, bus,
):
_mod.register(app)

45
decnet/cli/bus.py Normal file
View File

@@ -0,0 +1,45 @@
from __future__ import annotations
import typer
from . import utils as _utils
from .utils import console, log
def register(app: typer.Typer) -> None:
@app.command(name="bus")
def bus_cmd(
socket_path: str = typer.Option(
None, "--socket", "-s",
help="UNIX socket path (defaults to DECNET_BUS_SOCKET env var, "
"then /run/decnet/bus.sock, then ~/.decnet/bus.sock).",
),
group: str = typer.Option(
"decnet", "--group", "-g",
help="POSIX group to chown the socket to (falls back to process "
"group if the named group does not exist).",
),
heartbeat: int = typer.Option(
10, "--heartbeat", "-H",
help="Seconds between system.bus.health heartbeat events.",
),
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process."),
) -> None:
"""Run the DECNET ServiceBus worker (host-local UNIX-socket pub/sub)."""
import asyncio
from decnet.bus.factory import _default_socket_path
from decnet.bus.worker import bus_worker
resolved = socket_path or _default_socket_path()
if daemon:
log.info("bus daemonizing socket=%s", resolved)
_utils._daemonize()
log.info("bus starting socket=%s group=%s heartbeat=%ds", resolved, group, heartbeat)
console.print(f"[bold cyan]Bus starting[/] (socket: {resolved}, heartbeat: {heartbeat}s)")
try:
asyncio.run(bus_worker(resolved, group=group, heartbeat_interval=heartbeat))
except KeyboardInterrupt:
console.print("\n[yellow]Bus stopped.[/]")

View File

@@ -131,6 +131,15 @@ DECNET_DISALLOW_MASTER: bool = (
os.environ.get("DECNET_DISALLOW_MASTER", "true").lower() == "true"
)
# ServiceBus — host-local UNIX-socket pub/sub. Workers consume via
# ``decnet.bus.factory.get_bus()``. Disabled → NullBus (publishes drop,
# subscriptions yield nothing) so dev environments without a bus daemon
# can still boot. See DEBT-029 for the MVP design.
DECNET_BUS_ENABLED: bool = os.environ.get("DECNET_BUS_ENABLED", "true").lower() != "false"
DECNET_BUS_TYPE: str = os.environ.get("DECNET_BUS_TYPE", "unix").lower()
DECNET_BUS_SOCKET: Optional[str] = os.environ.get("DECNET_BUS_SOCKET")
DECNET_BUS_GROUP: str = os.environ.get("DECNET_BUS_GROUP", "decnet")
# Tracing — set to "true" to enable OpenTelemetry distributed tracing.
# Separate from DECNET_DEVELOPER so tracing can be toggled independently.
DECNET_DEVELOPER_TRACING: bool = os.environ.get("DECNET_DEVELOPER_TRACING", "").lower() == "true"