merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
92
decnet/bus/app.py
Normal file
92
decnet/bus/app.py
Normal file
@@ -0,0 +1,92 @@
|
||||
"""Process-wide bus singleton for request-serving workers (API, SSE routes).
|
||||
|
||||
A single connected :class:`~decnet.bus.base.BaseBus` shared across request
|
||||
handlers — opening a UNIX socket per request would be wasteful and add
|
||||
latency to the hot path. The API lifespan is responsible for calling
|
||||
:func:`close_app_bus` on shutdown; connect is lazy so tests and
|
||||
contract-test mode that never hit a publish/subscribe code path don't
|
||||
pay for a bus connection they'll never use.
|
||||
|
||||
Failures during :meth:`BaseBus.connect` are swallowed and logged — a
|
||||
dead bus must never break request serving. Publishers should treat a
|
||||
``None`` return from :func:`get_app_bus` as "skip this notification",
|
||||
same as ``DECNET_BUS_ENABLED=false``.
|
||||
|
||||
Connect is **retried with a short backoff** (not one-shot): a startup
|
||||
race where the API lifespan hits :func:`get_app_bus` before ``decnet
|
||||
bus`` is ready would otherwise poison the singleton for the entire
|
||||
process lifetime. Instead we remember the last failure timestamp and
|
||||
let callers retry once ``_RETRY_BACKOFF`` seconds have passed.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
from decnet.bus.base import BaseBus
|
||||
from decnet.bus.factory import get_bus
|
||||
from decnet.logging import get_logger
|
||||
|
||||
log = get_logger("bus.app")
|
||||
|
||||
# Publishers in the hot path shouldn't pay connect-retry latency on every
|
||||
# call; the dashboard's own 5 s poll interval recovers within one tick
|
||||
# once the bus comes up. A persistently-dead bus only gets a connect
|
||||
# attempt every 2 s, not once per request.
|
||||
_RETRY_BACKOFF: float = 2.0
|
||||
|
||||
_lock = asyncio.Lock()
|
||||
_shared: BaseBus | None = None
|
||||
_last_failure_ts: float = 0.0
|
||||
|
||||
|
||||
async def get_app_bus() -> BaseBus | None:
|
||||
"""Return the process-wide connected bus, or ``None`` if unavailable.
|
||||
|
||||
On first call, constructs a client via :func:`get_bus` and awaits
|
||||
``connect()``. Subsequent calls return the cached instance. If a
|
||||
connect attempt raises, the failure timestamp is recorded and
|
||||
subsequent calls within ``_RETRY_BACKOFF`` seconds return ``None``
|
||||
without re-attempting — after the backoff window, the next call
|
||||
retries. This is what lets the API recover from a
|
||||
``decnet bus``-started-after-API race without a full API restart.
|
||||
"""
|
||||
global _shared, _last_failure_ts
|
||||
if _shared is not None:
|
||||
return _shared
|
||||
if (time.monotonic() - _last_failure_ts) < _RETRY_BACKOFF:
|
||||
return None
|
||||
async with _lock:
|
||||
if _shared is not None:
|
||||
return _shared
|
||||
if (time.monotonic() - _last_failure_ts) < _RETRY_BACKOFF:
|
||||
return None
|
||||
try:
|
||||
candidate = get_bus(client_name="api")
|
||||
await candidate.connect()
|
||||
_shared = candidate
|
||||
_last_failure_ts = 0.0
|
||||
return _shared
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning("app bus unavailable: %s", exc)
|
||||
_last_failure_ts = time.monotonic()
|
||||
return None
|
||||
|
||||
|
||||
async def close_app_bus() -> None:
|
||||
"""Close the shared bus if one is open; clear the backoff window.
|
||||
|
||||
Call from the API lifespan shutdown. Safe to call multiple times.
|
||||
Resetting ``_last_failure_ts`` means the next ``get_app_bus()``
|
||||
after shutdown-and-restart-within-the-same-process (rare, but
|
||||
tests do this) retries immediately instead of honouring a stale
|
||||
backoff.
|
||||
"""
|
||||
global _shared, _last_failure_ts
|
||||
bus, _shared = _shared, None
|
||||
_last_failure_ts = 0.0
|
||||
if bus is not None:
|
||||
try:
|
||||
await bus.close()
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning("app bus close raised: %s", exc)
|
||||
Reference in New Issue
Block a user