merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

@@ -0,0 +1,4 @@
"""External webhook egress — ship bus events to SIEM/SOAR stacks."""
from decnet.webhook.worker import webhook_worker
__all__ = ["webhook_worker"]

188
decnet/webhook/client.py Normal file
View File

@@ -0,0 +1,188 @@
"""HMAC-signed HTTP POST delivery for webhook events.
The delivery function is shared between the worker's normal dispatch
loop and the `/webhooks/{uuid}/test` admin route — same payload shape,
same signing, same headers. Retry policy is configurable by the caller
so manual tests can skip retries entirely while the worker retries
with backoff.
"""
from __future__ import annotations
import asyncio
import hashlib
import hmac
import random
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Optional
from uuid import uuid4
import httpx
import orjson
from decnet.logging import get_logger
# Module-level logger for the webhook HTTP client.
log = get_logger("webhook.client")
# Default per-request timeout (seconds) for the httpx client that `deliver`
# creates when the caller does not inject one.
_DEFAULT_TIMEOUT_S = 10.0
# Default delays (seconds) between delivery attempts. Three delays means up
# to four attempts in total (the first attempt plus one per delay).
_DEFAULT_RETRY_SCHEDULE = (1.0, 2.0, 4.0)
# Lower/upper multipliers `_jittered` applies to each retry delay.
_JITTER_LOW = 0.8
_JITTER_HIGH = 1.2
# Schema version emitted as the "v" field of every payload body.
_PAYLOAD_VERSION = 1
@dataclass(frozen=True)
class SyntheticEvent:
    """Structural match for decnet.bus.base.Event — avoids importing the
    bus dependency into the HTTP egress layer.

    The fields are exactly the attributes `build_payload` reads (via
    `getattr`) when it serializes an event, so an instance can be handed
    to `deliver` in place of a real bus event.
    """

    # Bus topic; copied into the body and the X-DECNET-Event-Topic header.
    topic: str
    # Event type; serialized as the body's "type" field.
    type: str
    # Timestamp; a non-empty string is passed through unchanged by
    # `_canonical_ts`.
    ts: str
    # Event identifier; a non-empty string is used verbatim by `_event_id`.
    id: str
    # Event payload; serialized as the body's "payload" field.
    payload: dict[str, Any]
@dataclass
class DeliveryResult:
    """Outcome of one `deliver` call, across all of its attempts."""

    # True only when the receiver answered with a 2xx status.
    ok: bool
    # Status code of the last response; None when the last attempt failed
    # before a response arrived (request error or timeout).
    status_code: Optional[int] = None
    # Human-readable failure description; None on success.
    error: Optional[str] = None
    # Number of POST attempts that were made.
    attempts: int = 0
def _canonical_ts(value: Any) -> str:
    """Normalize a bus-event timestamp to a string for the wire payload.

    * A non-empty string is returned unchanged (it is assumed to already
      be in the sender's canonical form).
    * An int/float is interpreted as epoch seconds and rendered as an
      ISO-8601 UTC timestamp.
    * Anything else — ``None``, an empty string, an unsupported type, or
      a number that cannot be converted (NaN, infinity, out of the
      platform's range) — falls back to the current UTC time.

    The function never raises: a malformed timestamp on one event must
    not abort delivery of that event.
    """
    if isinstance(value, str) and value:
        return value
    if isinstance(value, (int, float)):
        try:
            return datetime.fromtimestamp(
                float(value), tz=timezone.utc
            ).isoformat()
        except (OverflowError, ValueError, OSError):
            # NaN / inf / out-of-range epoch: fall through to "now", the
            # same fallback used for a missing timestamp.
            pass
    return datetime.now(timezone.utc).isoformat()
def _event_id(event: Any) -> str:
    """Return the event's own ``id`` if it is a non-empty string.

    Otherwise a fresh random UUID4 string is generated, so every call on
    an id-less event yields a different value.
    """
    candidate = getattr(event, "id", None)
    has_usable_id = isinstance(candidate, str) and bool(candidate)
    return candidate if has_usable_id else str(uuid4())
def build_payload(event: Any) -> bytes:
    """Render *event* as the canonical JSON bytes that go on the wire.

    Attributes are read with `getattr`, so any object exposing
    ``id`` / ``ts`` / ``topic`` / ``type`` / ``payload`` works; missing
    ones get defaults. Keys are emitted sorted (`orjson.OPT_SORT_KEYS`)
    because the HMAC covers the exact byte sequence — a receiver that
    recomputes the signature must see the same bytes.
    """
    event_id = _event_id(event)
    timestamp = _canonical_ts(getattr(event, "ts", None))
    topic = getattr(event, "topic", "")
    event_type = getattr(event, "type", "") or ""
    payload = getattr(event, "payload", None) or {}

    document = {
        "v": _PAYLOAD_VERSION,
        "id": event_id,
        "ts": timestamp,
        "topic": topic,
        "type": event_type,
        "payload": payload,
    }
    return orjson.dumps(document, option=orjson.OPT_SORT_KEYS)
def sign(secret: str, body: bytes) -> str:
    """Compute the `X-DECNET-Signature` header value for *body*.

    The result is ``sha256=`` followed by the lowercase hex HMAC-SHA256
    of *body*, keyed with the UTF-8 encoding of *secret*.
    """
    key = secret.encode("utf-8")
    mac = hmac.digest(key, body, hashlib.sha256)
    return "sha256=" + mac.hex()
def _build_headers(secret: str, body: bytes, topic: str, event_id: str) -> dict[str, str]:
    """Assemble the HTTP headers sent with one webhook POST.

    Besides the content type and user agent this carries the HMAC
    signature of *body*, the event id and topic, and the send time as
    whole epoch seconds (UTC).
    """
    signature = sign(secret, body)
    sent_at = int(datetime.now(timezone.utc).timestamp())
    headers: dict[str, str] = {
        "Content-Type": "application/json",
        "User-Agent": "decnet-webhook/1.0",
    }
    headers["X-DECNET-Signature"] = signature
    headers["X-DECNET-Event-Id"] = event_id
    headers["X-DECNET-Event-Topic"] = topic
    headers["X-DECNET-Timestamp"] = str(sent_at)
    return headers
def _should_retry(status_code: int) -> bool:
    """Decide whether an HTTP response status is worth another attempt.

    429 (rate limited) and every 5xx are retryable. Any other status is
    terminal — in particular the remaining 4xx codes mean the receiver
    rejected the request itself, so repeating it cannot help.
    """
    return status_code == 429 or status_code >= 500
def _jittered(delay: float) -> float:
    """Scale *delay* by a random factor in [_JITTER_LOW, _JITTER_HIGH]."""
    # Jitter only smooths load; it protects nothing secret, so the
    # non-cryptographic generator is the right tool and a CSPRNG would
    # spend entropy for no benefit.
    factor = random.uniform(_JITTER_LOW, _JITTER_HIGH)  # nosec B311
    return delay * factor
async def deliver(
    sub: dict[str, Any],
    event: Any,
    *,
    retry_schedule: Optional[list[float] | tuple[float, ...]] = None,
    timeout_s: float = _DEFAULT_TIMEOUT_S,
    client: Optional[httpx.AsyncClient] = None,
) -> DeliveryResult:
    """POST *event* to *sub['url']* with HMAC signing and bounded retries.

    Args:
        sub: Subscription row dict (from `repo.get_webhook_subscription`);
            must contain ``"url"`` and ``"secret"``.
        event: Any object exposing the attributes `build_payload` reads.
        retry_schedule: Delays in seconds between attempts. ``None`` uses
            the default ``(1, 2, 4)``; an empty sequence disables retries
            (exactly one attempt).
        timeout_s: Timeout for the HTTP client created here. Ignored when
            *client* is supplied — the injected client keeps its own.
        client: Optional `httpx.AsyncClient` to reuse (tests inject a
            mock). When omitted, a client is created and closed here.

    Returns:
        A `DeliveryResult`. 2xx is success; 429/5xx and transport errors
        are retried until the schedule is exhausted; any other status is
        returned immediately as a non-retryable failure.

    Raises:
        KeyError: if *sub* lacks ``"url"`` or ``"secret"``. Transport
        errors (`httpx.RequestError`, `asyncio.TimeoutError`) are NOT
        raised — they are folded into the returned result.
    """
    schedule = list(
        _DEFAULT_RETRY_SCHEDULE if retry_schedule is None else retry_schedule
    )
    max_attempts = 1 + len(schedule)

    body = build_payload(event)
    # Take the event id from the bytes we are about to sign instead of
    # calling `_event_id` a second time: for an event without an explicit
    # id each call mints a new random UUID, which would make the
    # X-DECNET-Event-Id header disagree with the "id" inside the body.
    eid = orjson.loads(body)["id"]
    # Header values must be strings; a missing/None topic becomes "".
    topic = getattr(event, "topic", "") or ""
    headers = _build_headers(sub["secret"], body, topic, eid)
    url = sub["url"]

    owns_client = client is None
    if owns_client:
        client = httpx.AsyncClient(timeout=timeout_s)

    last_status: Optional[int] = None
    last_error: Optional[str] = None
    try:
        for attempt in range(1, max_attempts + 1):
            try:
                resp = await client.post(url, content=body, headers=headers)
                last_status = resp.status_code
                if 200 <= resp.status_code < 300:
                    return DeliveryResult(
                        ok=True, status_code=resp.status_code, attempts=attempt
                    )
                if not _should_retry(resp.status_code):
                    return DeliveryResult(
                        ok=False,
                        status_code=resp.status_code,
                        error=f"non-retryable {resp.status_code}",
                        attempts=attempt,
                    )
                last_error = f"http {resp.status_code}"
            except (httpx.RequestError, asyncio.TimeoutError) as e:
                # No response was received on this attempt, so there is
                # no status code to report for it.
                last_error = f"{type(e).__name__}: {e}"
                last_status = None
            if attempt < max_attempts:
                # schedule[i] is the pause after attempt i + 1.
                await asyncio.sleep(_jittered(schedule[attempt - 1]))
        return DeliveryResult(
            ok=False,
            status_code=last_status,
            error=last_error or "exhausted retries",
            attempts=max_attempts,
        )
    finally:
        # Only close a client we created; an injected one belongs to the
        # caller.
        if owns_client:
            await client.aclose()

54
decnet/webhook/enums.py Normal file
View File

@@ -0,0 +1,54 @@
"""Simple-mode event enum → bus-topic pattern expansion.
The UI's Simple mode hides the NATS-style wildcard syntax behind three
friendly choices. Storage is always the expanded pattern list — the
enum exists only at the API boundary.
"""
from __future__ import annotations
# Patterns map to the bus topic hierarchy shipped by DEBT-031's worker
# rollout (see `decnet/bus/topics.py`):
# - attacker.{observed,fingerprinted,scored,session.started,session.ended}
# - decky.{id}.{state,traffic}
# - system.{log,<worker>.health,<worker>.control,bus.health}
#
# Keys are the simple-event names accepted by `expand_simple_events`; each
# value is the list of bus patterns that name expands to.
# NOTE(review): `*` and `>` look like NATS-style single-token / tail
# wildcards (the module docstring mentions NATS-style syntax) — confirm
# against the bus implementation.
SIMPLE_EVENT_PATTERNS: dict[str, list[str]] = {
    "AttackerDetail": ["attacker.>"],
    "DeckyStatus": ["decky.*.state", "decky.*.traffic"],
    "SystemStatus": ["system.>"],
}
def expand_simple_events(names: list[str]) -> list[str]:
    """Translate simple-event names into the bus patterns they stand for.

    The result is the concatenation, in input order, of each name's
    pattern list; duplicates are kept. A name that is not a key of
    `SIMPLE_EVENT_PATTERNS` contributes nothing and raises no error — the
    router layer validates against the `SimpleEvent` Literal before
    calling this, so an unknown value indicates a programming error
    elsewhere rather than bad user input.
    """
    return [
        pattern
        for name in names
        for pattern in SIMPLE_EVENT_PATTERNS.get(name, [])
    ]
def merge_patterns(
    simple: list[str] | None, advanced: list[str] | None
) -> list[str]:
    """Merge simple-event expansions and raw advanced patterns, deduped.

    The output keeps first-occurrence order: every expansion of *simple*
    comes first, followed by the entries of *advanced* in the order the
    user supplied them, so API responses are deterministic. Entries of
    *advanced* that are not non-empty strings are ignored. ``None`` for
    either argument means "no entries".
    """
    from_simple = expand_simple_events(simple or [])
    from_advanced = [
        p for p in (advanced or []) if isinstance(p, str) and p
    ]
    # dict preserves insertion order, so this dedups while keeping the
    # first occurrence of each pattern in place.
    return list(dict.fromkeys([*from_simple, *from_advanced]))

312
decnet/webhook/worker.py Normal file
View File

@@ -0,0 +1,312 @@
"""Webhook dispatcher — bus consumer → HTTP egress.
Spawns one asyncio task per (subscription, pattern) pair. Each task
subscribes to the bus, iterates matching events, and POSTs them via
`decnet.webhook.client.deliver`. Reloads on `WEBHOOK_SUBSCRIPTIONS_CHANGED`
bus signals and also on a slow fallback timer, so a dropped signal costs
latency, not correctness.
One-task-per-pair is deliberately dumb: cancellation propagates cleanly,
and the bus's own trie does the actual pattern matching — no in-memory
filter logic to maintain. Scales fine up to thousands of subs; if that
ever breaks down we collapse to one task per distinct pattern and add
in-memory dispatch.
"""
from __future__ import annotations
import asyncio
import contextlib
import json
import os
from datetime import datetime, timezone
from typing import Any
import httpx
from decnet.bus.factory import get_bus
from decnet.bus.publish import run_control_listener, run_health_heartbeat
from decnet.bus import topics as _topics
from decnet.logging import get_logger
from decnet.web.db.repository import BaseRepository
from decnet.webhook.client import deliver
# Module-level logger shared by every coroutine in the webhook worker.
logger = get_logger("webhook_worker")
# Seconds between unconditional subscription reloads when no change signal
# arrives; the default for `webhook_worker(reload_interval=...)`.
_RELOAD_FALLBACK_SECS = 60.0
# Max parallel HTTP egress. Sizes the semaphore `_run_with_bus` creates for
# each bus epoch, which bounds concurrent deliveries regardless of event
# volume.
_EGRESS_CONCURRENCY = 10
# Circuit-breaker trip point. After this many consecutive delivery
# failures the worker auto-disables the subscription so one dead
# receiver can't poison the shared egress pool. Override via
# DECNET_WEBHOOK_CIRCUIT_THRESHOLD. Operator clears the trip by
# toggling `enabled` back on via PATCH.
# Clamped to at least 1. A non-integer value raises ValueError when this
# module is imported.
_CIRCUIT_THRESHOLD = max(1, int(os.environ.get("DECNET_WEBHOOK_CIRCUIT_THRESHOLD", "5")))
# How long to wait between bus (re)connect attempts when the bus is
# unreachable. Keeps the worker self-healing against a bus that starts
# after the webhook worker does (systemd race) or crashes+restarts
# mid-operation. Override via DECNET_WEBHOOK_BUS_RECONNECT_SECS.
# Clamped to at least 5 seconds. A non-numeric value raises ValueError when
# this module is imported.
_BUS_RECONNECT_SECS = max(5.0, float(os.environ.get("DECNET_WEBHOOK_BUS_RECONNECT_SECS", "60")))
def _patterns_for(sub: dict[str, Any]) -> list[str]:
    """Return the topic patterns stored on a subscription row.

    ``sub["topic_patterns"]`` is expected to hold a JSON-encoded list of
    strings; an already-decoded list/tuple is accepted as well. Non-string
    entries are dropped. A missing or empty value, undecodable JSON, or a
    JSON document that is not a list (for example a bare string or an
    object) yields ``[]`` — iterating those would otherwise produce
    single characters or object keys masquerading as patterns.
    """
    raw = sub.get("topic_patterns") or "[]"
    if isinstance(raw, (list, tuple)):
        decoded = raw
    else:
        try:
            decoded = json.loads(raw)
        except (ValueError, TypeError):
            return []
    if not isinstance(decoded, (list, tuple)):
        return []
    return [p for p in decoded if isinstance(p, str)]
def _union_patterns(subs: list[dict[str, Any]]) -> list[str]:
    """Collect every pattern used by *subs*, each one once.

    Patterns appear in the order they are first encountered while walking
    the subscriptions in sequence.
    """
    # dict keys are unique and keep insertion order, which gives
    # first-occurrence dedup for free.
    first_seen = dict.fromkeys(
        pattern for sub in subs for pattern in _patterns_for(sub)
    )
    return list(first_seen)
async def _pause_unless_shutdown(shutdown: asyncio.Event, delay: float) -> None:
    """Wait up to *delay* seconds, returning early once *shutdown* is set."""
    with contextlib.suppress(asyncio.TimeoutError):
        await asyncio.wait_for(shutdown.wait(), timeout=delay)


async def webhook_worker(
    repo: BaseRepository,
    *,
    reload_interval: float = _RELOAD_FALLBACK_SECS,
    http_client: httpx.AsyncClient | None = None,
    bus_reconnect_secs: float = _BUS_RECONNECT_SECS,
) -> None:
    """Main entry — connect bus, spawn per-subscription delivery tasks,
    reload on signal.

    Runs until the shutdown event is set or the task is cancelled. Each
    loop iteration is one "epoch": connect to the bus, dispatch until the
    epoch ends, close the bus. A failed connect *or* a crashed epoch is
    followed by a pause of *bus_reconnect_secs* before the next attempt,
    so the worker self-heals if the bus starts late or restarts mid-run
    without spinning when the failure is persistent.

    Args:
        repo: Repository used to list subscriptions and record outcomes.
        reload_interval: Seconds between fallback subscription reloads.
        http_client: Optional shared `httpx.AsyncClient`. When omitted, a
            client is created here and closed on exit.
        bus_reconnect_secs: Pause between bus (re)connect attempts.
    """
    logger.info("webhook worker started")
    shutdown = asyncio.Event()
    owns_http = http_client is None
    if owns_http:
        http_client = httpx.AsyncClient(timeout=10.0)
    try:
        while not shutdown.is_set():
            # Try to connect to the bus. If it's down, wait out the
            # reconnect interval and try again. Shutdown interrupts the
            # wait so systemd stop doesn't hang for a minute.
            bus = None
            try:
                bus = get_bus(client_name="webhook")
                await bus.connect()
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "webhook: bus unavailable, retrying in %.0fs: %s",
                    bus_reconnect_secs, exc,
                )
                await _pause_unless_shutdown(shutdown, bus_reconnect_secs)
                continue

            # Bus is live — run one dispatch epoch until it fails or we
            # shut down. No state carries across epochs, so a half-dead
            # bus can't leave us with stale subscriptions.
            logger.info("webhook: bus connected")
            crashed = False
            try:
                await _run_with_bus(
                    bus, repo, http_client,
                    shutdown, reload_interval,
                )
            except asyncio.CancelledError:
                shutdown.set()
                raise
            except Exception as exc:  # noqa: BLE001
                crashed = True
                logger.warning(
                    "webhook: dispatch crashed, will reconnect: %s", exc
                )
            finally:
                with contextlib.suppress(Exception):
                    await bus.close()

            if crashed:
                # Back off before reconnecting. Without this pause a
                # failure that recurs on every epoch (for example the
                # repository being unreachable) becomes a tight
                # connect → crash → reconnect loop.
                await _pause_unless_shutdown(shutdown, bus_reconnect_secs)
    finally:
        # Only close a client we created; an injected one belongs to the
        # caller.
        if owns_http and http_client is not None:
            await http_client.aclose()
async def _run_with_bus(
    bus,
    repo: BaseRepository,
    http_client: httpx.AsyncClient,
    shutdown: asyncio.Event,
    reload_interval: float,
) -> None:
    """Run one bus-up epoch: start heartbeat+control+reload listeners,
    dispatch events until shutdown or error, clean up.

    Each pass of the loop replaces the whole consumer set: previous
    consumers are cancelled, enabled subscriptions are re-read from
    *repo*, and one `_consume` task is started per (subscription,
    pattern) pair. The loop then sleeps until a reload is requested,
    *shutdown* is set, or *reload_interval* seconds elapse — whichever
    comes first.

    Any exception from the repository propagates to the caller after the
    cleanup in ``finally`` has run.
    """
    reload_flag = asyncio.Event()
    semaphore = asyncio.Semaphore(_EGRESS_CONCURRENCY)
    consumer_tasks: list[asyncio.Task] = []
    heartbeat_task = asyncio.create_task(run_health_heartbeat(bus, "webhook"))
    control_task = asyncio.create_task(
        run_control_listener(bus, "webhook", shutdown)
    )
    reload_task = asyncio.create_task(_reload_listener(bus, reload_flag, shutdown))
    try:
        while not shutdown.is_set():
            # Cancel prior epoch's consumers before starting new ones.
            await _cancel_all(consumer_tasks)
            consumer_tasks.clear()
            subs = await repo.list_webhook_subscriptions(enabled_only=True)
            for sub in subs:
                for pattern in _patterns_for(sub):
                    consumer_tasks.append(asyncio.create_task(
                        _consume(
                            bus, pattern, sub, repo, http_client, semaphore, reload_flag,
                        )
                    ))
            # Wait for reload, shutdown, or the timer fallback. Watching
            # `shutdown` here matters: the loop condition is only
            # re-checked after this wait, so waiting on `reload_flag`
            # alone would delay a shutdown requested through the
            # `shutdown` event by up to `reload_interval` seconds.
            waiters = [
                asyncio.create_task(reload_flag.wait()),
                asyncio.create_task(shutdown.wait()),
            ]
            try:
                await asyncio.wait(
                    waiters,
                    timeout=reload_interval,
                    return_when=asyncio.FIRST_COMPLETED,
                )
            finally:
                # Runs on normal wake-up, timeout, and cancellation alike
                # so the helper tasks never outlive this iteration.
                await _cancel_all(waiters)
            reload_flag.clear()
    finally:
        await _cancel_all(consumer_tasks)
        for t in (heartbeat_task, control_task, reload_task):
            t.cancel()
        for t in (heartbeat_task, control_task, reload_task):
            with contextlib.suppress(asyncio.CancelledError, Exception):
                await t
async def _cancel_all(tasks: list[asyncio.Task]) -> None:
    """Cancel every unfinished task in *tasks*, then wait for all of them.

    Cancellation is requested for the whole batch first and the tasks are
    awaited afterwards, one by one. Whatever a task ends with — a result,
    a cancellation, or an exception — is discarded; the only goal is that
    every task has finished when this returns.
    """
    for unfinished in (t for t in tasks if not t.done()):
        unfinished.cancel()
    for task in tasks:
        try:
            await task
        except (asyncio.CancelledError, Exception):  # noqa: BLE001
            # Deliberate best-effort teardown: outcomes are irrelevant.
            pass
# Strong references to in-flight delivery tasks. The event loop itself only
# keeps weak references to tasks, so a fire-and-forget task with no other
# referent can be garbage-collected before it finishes. Each task removes
# itself from this set when it completes.
_inflight_dispatches: set[asyncio.Task] = set()


async def _consume(
    bus,
    pattern: str,
    sub: dict[str, Any],
    repo: BaseRepository,
    http_client: httpx.AsyncClient,
    semaphore: asyncio.Semaphore,
    reload_flag: asyncio.Event,
) -> None:
    """Subscribe to one pattern and dispatch events to one webhook.

    Every event received on *pattern* is handed to its own `_dispatch_one`
    task so a slow receiver does not stall the bus iterator; actual HTTP
    concurrency is bounded inside `_dispatch_one` by *semaphore*.

    Cancelling this coroutine stops the subscription but does not cancel
    deliveries that were already started. `asyncio.CancelledError` is
    re-raised; any other exception is logged and ends the consumer.
    """
    try:
        subscription = bus.subscribe(pattern)
        async with subscription:
            async for event in subscription:
                task = asyncio.create_task(
                    _dispatch_one(repo, http_client, semaphore, sub, event, reload_flag)
                )
                # Hold the task until it finishes (see the note on
                # `_inflight_dispatches`).
                _inflight_dispatches.add(task)
                task.add_done_callback(_inflight_dispatches.discard)
    except asyncio.CancelledError:
        raise
    except Exception as exc:  # noqa: BLE001
        logger.warning(
            "webhook: consumer crashed sub=%s pattern=%s err=%s",
            sub.get("name"), pattern, exc,
        )
async def _dispatch_one(
    repo: BaseRepository,
    http_client: httpx.AsyncClient,
    semaphore: asyncio.Semaphore,
    sub: dict[str, Any],
    event: Any,
    reload_flag: asyncio.Event,
) -> None:
    """Deliver one event to one subscription and record the outcome.

    The whole operation — the HTTP delivery and the bookkeeping that
    follows — runs while holding *semaphore*. A successful delivery is
    recorded via `_safe_record_success`; a failed one, or an unexpected
    exception out of `deliver`, via `_safe_record_failure` (which may
    trip the circuit breaker and set *reload_flag*).
    """
    async with semaphore:
        try:
            outcome = await deliver(sub, event, client=http_client)
        except Exception as exc:  # noqa: BLE001
            # `deliver` folds transport errors into its result, so
            # reaching this branch means something unexpected broke.
            logger.exception(
                "webhook: deliver raised for sub=%s topic=%s: %s",
                sub.get("uuid"), getattr(event, "topic", ""), exc,
            )
            await _safe_record_failure(
                repo, sub["uuid"], f"internal: {exc}", sub.get("name", ""), reload_flag,
            )
            return

        if outcome.ok:
            await _safe_record_success(
                repo, sub["uuid"], datetime.now(timezone.utc)
            )
            return

        logger.warning(
            "webhook: delivery failed sub=%s topic=%s status=%s err=%s",
            sub.get("name"), getattr(event, "topic", ""),
            outcome.status_code, outcome.error,
        )
        await _safe_record_failure(
            repo, sub["uuid"], outcome.error or "unknown", sub.get("name", ""), reload_flag,
        )
async def _safe_record_success(
    repo: BaseRepository, uuid: str, ts: datetime
) -> None:
    """Record a successful delivery for subscription *uuid* at time *ts*.

    Best-effort: a repository error is logged as a warning and swallowed,
    so a bookkeeping failure never propagates to the dispatcher.
    """
    try:
        await repo.record_webhook_success(uuid, ts)
    except Exception as err:  # noqa: BLE001
        logger.warning("webhook: record_success failed: %s", err)
async def _safe_record_failure(
    repo: BaseRepository,
    uuid: str,
    error: str,
    sub_name: str = "",
    reload_flag: asyncio.Event | None = None,
) -> None:
    """Record a failed delivery and trip the circuit breaker if due.

    The failure is stored through `repo.record_webhook_failure`, whose
    return value is treated as the subscription's updated failure count.
    Once that count reaches `_CIRCUIT_THRESHOLD` the circuit is tripped
    and *reload_flag* (when given) is set, so the dispatch loop re-reads
    the subscriptions and stops consuming for the disabled one.

    Best-effort throughout: repository errors are logged and swallowed.
    """
    try:
        failed_at = datetime.now(timezone.utc)
        failure_count = await repo.record_webhook_failure(uuid, failed_at, error)
    except Exception as exc:  # noqa: BLE001
        logger.warning("webhook: record_failure failed: %s", exc)
        return

    if failure_count < _CIRCUIT_THRESHOLD:
        return

    # Threshold reached — trip the breaker. Tripping an already-tripped
    # subscription is harmless; it only re-stamps auto_disabled_at.
    try:
        await repo.trip_webhook_circuit(uuid, failed_at)
        logger.warning(
            "webhook: circuit tripped sub=%s uuid=%s failures=%d threshold=%d",
            sub_name or "<unknown>", uuid, failure_count, _CIRCUIT_THRESHOLD,
        )
        if reload_flag is not None:
            reload_flag.set()
    except Exception as exc:  # noqa: BLE001
        logger.warning("webhook: trip_circuit failed: %s", exc)
async def _reload_listener(
    bus, reload_flag: asyncio.Event, shutdown: asyncio.Event
) -> None:
    """Raise *reload_flag* for each WEBHOOK_SUBSCRIPTIONS_CHANGED signal.

    Stops listening at the first signal that arrives after *shutdown* has
    been set. `asyncio.CancelledError` is re-raised; any other failure is
    logged, after which only the caller's fallback timer triggers reloads.
    """
    try:
        change_signals = bus.subscribe(_topics.WEBHOOK_SUBSCRIPTIONS_CHANGED)
        async with change_signals:
            async for _signal in change_signals:
                if shutdown.is_set():
                    break
                reload_flag.set()
    except asyncio.CancelledError:
        raise
    except Exception as err:  # noqa: BLE001
        logger.warning(
            "webhook: reload listener crashed, fallback timer only: %s", err
        )