feat(webhooks): circuit breaker auto-disables misbehaving subscriptions

After DECNET_WEBHOOK_CIRCUIT_THRESHOLD (default 5) consecutive failed
deliveries, the worker calls trip_webhook_circuit(uuid, ts) which
flips enabled=False and stamps auto_disabled_at. The worker sets its
reload flag so the next dispatch epoch stops consuming events for the
tripped sub entirely — one dead receiver can't poison the shared
egress pool anymore.

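The operator clears the trip via PATCH — setting enabled=True when the
sub was previously disabled clears auto_disabled_at, zeros
consecutive_failures, and clears last_error. Re-enabling an admin-paused
sub hits the same path harmlessly: auto_disabled_at is already NULL, and
any sub-threshold failure count and last_error are simply reset.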

Three observable states are now distinguishable in the UI:
- Active              enabled=True,  auto_disabled_at=NULL
- Admin-paused        enabled=False, auto_disabled_at=NULL
- Tripped             enabled=False, auto_disabled_at=<ts>

UI surfaces a TRIPPED · <ts> chip on the row (red, alert-styled) and
a "N TRIPPED" count in the page header. Hover tooltip tells the
operator how to reset ("Re-enable via Edit").

record_webhook_failure now returns the new consecutive_failures count
so the worker can compare against the threshold without a second
roundtrip. trip_webhook_circuit is safe to call repeatedly — re-tripping
keeps enabled=False and just re-stamps auto_disabled_at and updated_at.

Closes THREAT_MODEL WH-02 and DEBT-037 §1.
This commit is contained in:
2026-04-24 16:24:33 -04:00
parent ee682eef65
commit 2bcef50ac5
10 changed files with 213 additions and 17 deletions

View File

@@ -41,6 +41,13 @@ class WebhookSubscription(SQLModel, table=True):
last_success_at: Optional[datetime] = None
last_failure_at: Optional[datetime] = None
last_error: Optional[str] = None
# Set when the circuit breaker auto-disables the subscription after
# too many consecutive failures. NULL means "not tripped" — the
# subscription is either active (enabled=True) or admin-paused
# (enabled=False, auto_disabled_at=NULL). A non-NULL stamp with
# enabled=False means the worker tripped it; the operator clears
# the flag by re-enabling via PATCH.
auto_disabled_at: Optional[datetime] = None
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
@@ -100,6 +107,7 @@ class WebhookResponse(BaseModel):
last_success_at: Optional[datetime] = None
last_failure_at: Optional[datetime] = None
last_error: Optional[str] = None
auto_disabled_at: Optional[datetime] = None
created_at: datetime
updated_at: datetime
warnings: List[str] = PydanticField(default_factory=list)

View File

@@ -466,5 +466,12 @@ class BaseRepository(ABC):
async def record_webhook_failure(
self, uuid: str, ts: Any, error: str
) -> None:
) -> int:
"""Record a failed delivery; return the new ``consecutive_failures``
count so the caller can decide whether to trip the circuit.

This base version provides no behaviour of its own: it only raises
``NotImplementedError``.

Args:
    uuid: Identifier of the webhook subscription whose delivery failed.
    ts: Timestamp associated with the failure.
    error: Description of the delivery error.

Returns:
    The subscription's ``consecutive_failures`` value after this
    failure has been counted.

Raises:
    NotImplementedError: Always, in this base version.
"""
raise NotImplementedError
async def trip_webhook_circuit(self, uuid: str, ts: Any) -> None:
    """Circuit-breaker hook: auto-disable a subscription after repeated
    failures by setting ``enabled=False`` and stamping ``auto_disabled_at``.

    This base version provides no behaviour and always raises
    ``NotImplementedError``.
    """
    raise NotImplementedError()

View File

@@ -1829,26 +1829,41 @@ class SQLModelRepository(BaseRepository):
async def record_webhook_failure(
self, uuid: str, ts: datetime, error: str
) -> None:
) -> int:
# Bumps consecutive_failures for the subscription, stamps the failure
# metadata, commits, and hands the bumped count back to the caller.
async with self._session() as session:
# Read current failure count, bump, write. Small race window on
# concurrent deliveries to the same subscription is acceptable —
# the counter informs the circuit-breaker heuristic (DEBT-037),
# not a correctness invariant.
# the counter informs the circuit-breaker heuristic, not a
# correctness invariant.
result = await session.execute(
select(WebhookSubscription.consecutive_failures).where(
WebhookSubscription.uuid == uuid
)
)
# A missing row and a NULL counter both read as 0 here, so an unknown
# uuid still yields new_count == 1 even though the UPDATE below
# matches no rows.
current = result.scalar_one_or_none() or 0
new_count = current + 1
await session.execute(
update(WebhookSubscription)
.where(WebhookSubscription.uuid == uuid)
.values(
consecutive_failures=current + 1,
consecutive_failures=new_count,
last_failure_at=ts,
# Cap the stored message at 512 characters; an empty or None error
# is stored as NULL.
last_error=error[:512] if error else None,
updated_at=ts,
)
)
await session.commit()
return new_count
async def trip_webhook_circuit(self, uuid: str, ts: datetime) -> None:
    """Disable the subscription identified by ``uuid`` and stamp the trip.

    Writes ``enabled=False``, ``auto_disabled_at=ts`` and ``updated_at=ts``
    on the matching row, then commits.
    """
    # Build the statement up front; it does not depend on the session.
    disable_stmt = (
        update(WebhookSubscription)
        .where(WebhookSubscription.uuid == uuid)
        .values(enabled=False, auto_disabled_at=ts, updated_at=ts)
    )
    async with self._session() as session:
        await session.execute(disable_stmt)
        await session.commit()

View File

@@ -173,6 +173,15 @@ async def api_update_webhook(
if req.enabled is not None:
patch["enabled"] = req.enabled
# Re-enabling after a circuit trip clears the trip stamp and
# zeros the failure count — the operator has acknowledged and
# is ready to resume delivery. Admin-paused → re-enabled also
# hits this path harmlessly: auto_disabled_at is already NULL,
# and any sub-threshold consecutive_failures / last_error that
# may be left over from before the pause are reset with it.
if req.enabled is True and not current.get("enabled"):
patch["auto_disabled_at"] = None
patch["consecutive_failures"] = 0
patch["last_error"] = None
if req.simple_events is not None or req.topic_patterns is not None:
# Re-merge using whatever the caller supplied; a caller that wants