merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

@@ -0,0 +1,17 @@
"""LLM backend for the realism library.
Pluggable per the provider-subpackages convention (mirrors
:mod:`decnet.web.db` and :mod:`decnet.bus`): consumers depend on
:class:`LLMBackend` from :mod:`base`; concrete transports live under
:mod:`impl` and are selected by :func:`get_llm`.
This is the seam to pull on when swapping local Ollama for the
Anthropic API, llama.cpp, vLLM, or any other inference server — change
``DECNET_REALISM_LLM`` (or pass ``llm=`` directly), no caller rewrite.
"""
from __future__ import annotations
from decnet.realism.llm.base import LLMBackend, LLMResult, LLMTimeout
from decnet.realism.llm.factory import get_llm
__all__ = ["LLMBackend", "LLMResult", "LLMTimeout", "get_llm"]

View File

@@ -0,0 +1,47 @@
"""Backend protocol shared by every LLM transport.
Deliberately narrow: realism consumers need one async ``generate``
call that takes a prompt string and returns the model's output text
plus enough metadata to populate per-event payloads (model name,
latency, success bit). Streaming, embeddings, multi-turn chat — all
out of scope here; realism only ever does one-shot single-prompt
generations.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Protocol
class LLMTimeout(Exception):
    """Signal that a generation ran past the backend's wall-clock limit.

    Backends are required to raise this instead of quietly handing back
    empty output. The driver has to tell a timeout apart from "the model
    produced nothing useful" so that payloads record the correct
    ``stage`` value.
    """
@dataclass
class LLMResult:
    """Result record for a single ``generate`` invocation.

    Only *soft* failures are represented here: the backend ran to
    completion but yielded nothing usable (an empty stdout, for
    instance), in which case ``success`` is ``False``. Hard failures
    such as a subprocess crash or a network error are raised instead,
    which lets the driver persist and log a soft failure as one event.
    """

    # False when the backend ran cleanly but produced no usable output.
    success: bool
    # Output text returned by the model (may be empty on soft failure).
    text: str
    # Name of the model that served the call.
    model: str
    # Duration of the call in milliseconds.
    latency_ms: int
    # Free-form backend-specific details; every instance gets its own dict.
    extra: dict[str, Any] = field(default_factory=dict)
class LLMBackend(Protocol):
    """Smallest interface a realism LLM provider has to satisfy."""

    # Name of the model the backend generates with.
    model: str
    # Per-call time limit the backend applies to a generation.
    timeout: float

    async def generate(self, prompt: str) -> LLMResult:
        """Run one single-prompt generation and return its outcome."""
        ...

View File

@@ -0,0 +1,99 @@
"""Process-local circuit breaker for LLM calls.
Per-call timeouts (``asyncio.wait_for(llm.generate, timeout=...)``)
protect a single tick from a single hung Ollama. They do NOT protect
the worker from a *sustained* problem: 100 consecutive 60-second
timeouts chew up an hour of orchestrator time on dead requests before
anything notices.
This breaker counts consecutive failures (any success resets the count)
and flips ``open`` once ``failure_threshold`` of them accumulate. Open
breakers short-circuit ``allow_call`` to ``False`` so callers fall
back to deterministic templates without the per-tick cost. After
``cooldown_seconds`` the breaker enters ``half_open`` and the next
call is allowed; success closes the breaker, failure re-opens it
with a fresh cooldown.
Process-local on purpose — cross-process state would require shared
memory and is overkill for a single orchestrator worker.
"""
from __future__ import annotations
import threading
import time
from enum import Enum
class _State(Enum):
    """Breaker phases; each value is the string exposed via ``state``."""

    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"


class LLMCircuitBreaker:
    """Lock-guarded circuit breaker driven by consecutive failures.

    The breaker opens once ``failure_threshold`` failures (default 3)
    have been recorded without an intervening success, and stays open
    for ``cooldown_seconds`` (default 60) before moving to half-open.
    The defaults are tuned to the realism worker's tick cadence: three
    back-to-back 60s timeouts are three minutes of dead air, by which
    point a deterministic fallback is overdue.

    While half-open, every ``allow_call`` returns ``True`` until an
    outcome is recorded: a success closes the breaker, a failure
    re-opens it with a fresh cooldown.

    *clock* is any zero-argument callable returning seconds as a float;
    it defaults to :func:`time.monotonic` and can be replaced in tests.
    """

    def __init__(
        self,
        *,
        failure_threshold: int = 3,
        cooldown_seconds: float = 60.0,
        clock=time.monotonic,
    ) -> None:
        self._lock = threading.Lock()
        self._clock = clock
        self._failure_threshold = failure_threshold
        self._cooldown = cooldown_seconds
        # Mutable state below is only touched while holding ``_lock``.
        self._state = _State.CLOSED
        self._consecutive_failures = 0
        self._opened_at: float = 0.0

    @property
    def state(self) -> str:
        """Current phase as ``"closed"``, ``"open"`` or ``"half_open"``."""
        with self._lock:
            current = self._state
        return current.value

    def allow_call(self) -> bool:
        """Tell the caller whether to run the LLM call or use the fallback.

        Closed and half-open breakers always answer ``True``. An open
        breaker answers ``False`` until the cooldown has elapsed; the
        first check after that promotes it to half-open and answers
        ``True`` so that caller serves as a probe.
        """
        with self._lock:
            if self._state is not _State.OPEN:
                return True
            cooled_down = self._clock() - self._opened_at >= self._cooldown
            if cooled_down:
                self._state = _State.HALF_OPEN
            return cooled_down

    def record_success(self) -> None:
        """Close the breaker and clear the failure streak."""
        with self._lock:
            self._consecutive_failures = 0
            self._opened_at = 0.0
            self._state = _State.CLOSED

    def record_failure(self) -> None:
        """Register one failed call, opening the breaker when warranted.

        A failure while half-open is a failed probe: the breaker
        re-opens with a fresh cooldown and the streak counter is left
        as-is, since the underlying problem is evidently unresolved.
        Otherwise the streak grows by one and the breaker opens once it
        reaches the threshold.
        """
        with self._lock:
            probe_failed = self._state is _State.HALF_OPEN
            if not probe_failed:
                self._consecutive_failures += 1
            if probe_failed or self._consecutive_failures >= self._failure_threshold:
                self._state = _State.OPEN
                self._opened_at = self._clock()

View File

@@ -0,0 +1,46 @@
"""Backend dispatch.
Reads ``DECNET_REALISM_LLM`` to pick a concrete :class:`LLMBackend`.
Defaults to ``ollama`` because that's what the prototype proved out and
what most dev boxes have on hand.
Supported keys:
* ``ollama`` — :class:`decnet.realism.llm.impl.ollama.OllamaBackend`
* ``fake`` — :class:`decnet.realism.llm.impl.fake.FakeBackend`
(canned output, used by tests so they don't shell out)
Anthropic / vLLM / llama.cpp slot in here as a third branch when the
need shows up. Per the provider-subpackages convention, do NOT collapse
factory dispatch into the impl modules — keeps the ``__init__`` import
graph cycle-free and the env contract auditable in one place.
"""
from __future__ import annotations
import os
from typing import Any
from decnet.realism.llm.base import LLMBackend
def get_llm(*, model: str | None = None, **kwargs: Any) -> LLMBackend:
    """Instantiate the LLM backend selected by the environment.

    The backend is chosen by ``DECNET_REALISM_LLM`` (case-insensitive,
    surrounding whitespace ignored). An unset or blank value selects
    ``ollama``.

    *model*, when provided, overrides the backend's own default — e.g.
    for :class:`OllamaBackend` that is ``llama3.1`` unless
    ``DECNET_REALISM_MODEL`` says otherwise. This lets the worker honour
    ``decnet orchestrate --model gpt-oss`` without each backend having
    to know about CLI flags. Remaining keyword arguments are forwarded
    unchanged to the backend constructor.

    Raises:
        ValueError: if ``DECNET_REALISM_LLM`` names an unknown backend.
    """
    # A blank value (e.g. ``DECNET_REALISM_LLM=`` from an unset shell or
    # compose expansion) is treated like an unset variable rather than
    # rejected as an unknown backend.
    raw_key = os.environ.get("DECNET_REALISM_LLM", "")
    backend_key = raw_key.strip().lower() or "ollama"

    # Implementations are imported lazily so that only the selected
    # backend's module is loaded.
    if backend_key == "ollama":
        from decnet.realism.llm.impl.ollama import OllamaBackend

        return OllamaBackend(model=model, **kwargs)
    if backend_key == "fake":
        from decnet.realism.llm.impl.fake import FakeBackend

        return FakeBackend(model=model or "fake-model", **kwargs)
    raise ValueError(
        f"Unsupported DECNET_REALISM_LLM={backend_key!r}; "
        "expected one of: ollama, fake"
    )

View File

@@ -0,0 +1,6 @@
"""Concrete LLM-backend implementations.
Importers go through :func:`decnet.realism.llm.get_llm`, not these
modules directly — same convention as :mod:`decnet.web.db.sqlite` and
:mod:`decnet.bus.unix_client`.
"""

View File

@@ -0,0 +1,50 @@
"""In-process fake backend for tests.
Returns a canned string so the driver path can be exercised without an
Ollama install. Configurable via ``DECNET_REALISM_FAKE_OUTPUT`` (env)
or the ``output`` constructor arg — the env-var path lets integration
tests run the worker end-to-end with deterministic output.
"""
from __future__ import annotations
import os
import time
from typing import Optional
from decnet.realism.llm.base import LLMBackend, LLMResult
# Canned email-style body returned when no explicit output is configured.
_DEFAULT_OUTPUT = (
    "Subject: Quick update\n"
    "\n"
    "Hi,\n"
    "\n"
    "Following up on the topic.\n"
    "\n"
    "Best regards,\n"
    "Fake Persona\n"
)


class FakeBackend(LLMBackend):
    """Backend that answers every prompt with a fixed, preconfigured text.

    The text comes from the ``output`` argument when given, otherwise
    from ``DECNET_REALISM_FAKE_OUTPUT`` (read at construction time),
    otherwise from the built-in default. Passing ``success=False``
    makes every call report a soft failure with empty text.
    """

    def __init__(
        self,
        *,
        model: str = "fake-model",
        timeout: float = 1.0,
        output: Optional[str] = None,
        success: bool = True,
    ) -> None:
        self.model = model
        self.timeout = timeout
        self._success = success
        # An explicit ``output`` (even "") wins over the environment.
        if output is None:
            output = os.environ.get("DECNET_REALISM_FAKE_OUTPUT", _DEFAULT_OUTPUT)
        self._output = output

    async def generate(self, prompt: str) -> LLMResult:  # noqa: ARG002
        """Return the canned result; *prompt* is ignored."""
        started = time.monotonic()
        elapsed_ms = int((time.monotonic() - started) * 1000)
        ok = self._success
        body = self._output if ok else ""
        return LLMResult(
            success=ok,
            text=body,
            model=self.model,
            latency_ms=elapsed_ms,
            extra={"rc": 0 if ok else 1},
        )

View File

@@ -0,0 +1,100 @@
"""Ollama subprocess backend.
Shells out to ``ollama run <model>`` with the prompt fed via stdin.
Why subprocess and not the Ollama HTTP API:
* No new dependency (``ollama`` Python lib is optional).
* Works on hosts where Ollama is bound to a unix socket, an unusual TCP
port, or behind a remote-mount layer — `ollama run` resolves all that.
* Same path the operator uses by hand (``ollama run llama3.1``); easier
to debug discrepancies between worker output and a console session.
Cost: per-call process spawn (~50ms on a warm box). Acceptable for
realism tick rates (one body per ~5 minutes per persona by default).
When that cost matters, swap to an HTTP-API backend; the seam is in
:mod:`decnet.realism.llm.factory`.
"""
from __future__ import annotations
import asyncio
import os
import time
from typing import Optional
from decnet.logging import get_logger
from decnet.realism.llm.base import LLMBackend, LLMResult, LLMTimeout
log = get_logger("realism.llm")

# Executable name handed to the subprocess call.
_OLLAMA = "ollama"
# Both defaults are read from the environment once, at import time.
_DEFAULT_MODEL = os.environ.get("DECNET_REALISM_MODEL", "llama3.1")
_DEFAULT_TIMEOUT = float(os.environ.get("DECNET_REALISM_TIMEOUT", "60"))


class OllamaBackend(LLMBackend):
    """Concrete :class:`LLMBackend` that shells out to ``ollama run``."""

    def __init__(
        self,
        *,
        model: Optional[str] = None,
        timeout: Optional[float] = None,
    ) -> None:
        """Configure the backend.

        Args:
            model: Model name passed to ``ollama run``; a falsy value
                selects the module default.
            timeout: Wall-clock cap in seconds for one generation;
                ``None`` selects the module default.
        """
        self.model = model or _DEFAULT_MODEL
        self.timeout = timeout if timeout is not None else _DEFAULT_TIMEOUT

    async def generate(self, prompt: str) -> LLMResult:
        """Run ``ollama run <model>`` with *prompt* on stdin.

        Returns:
            An :class:`LLMResult`. ``success`` is ``False`` when the
            executable is missing (``rc`` 127), the process exits
            non-zero, or stdout is blank; ``extra`` then carries a
            truncated ``stderr``.

        Raises:
            LLMTimeout: if the process does not finish within
                ``self.timeout`` seconds.
        """
        t0 = time.monotonic()
        try:
            proc = await asyncio.create_subprocess_exec(
                _OLLAMA, "run", self.model,
                stdin=asyncio.subprocess.PIPE,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        except FileNotFoundError as exc:
            latency_ms = int((time.monotonic() - t0) * 1000)
            return LLMResult(
                success=False,
                text="",
                model=self.model,
                latency_ms=latency_ms,
                extra={"rc": 127, "stderr": f"argv[0] not found: {exc}"},
            )
        try:
            stdout, stderr = await asyncio.wait_for(
                proc.communicate(prompt.encode("utf-8")),
                timeout=self.timeout,
            )
        except asyncio.TimeoutError as exc:
            raise LLMTimeout(
                f"ollama run {self.model} exceeded {self.timeout}s"
            ) from exc
        finally:
            # Reached with a live child on timeout, and also when the
            # caller cancels this coroutine (e.g. an outer wait_for).
            # Either way the child must be killed AND awaited, or it
            # lingers as a running/zombie process with open pipes.
            if proc.returncode is None:
                await self._kill_and_reap(proc)
        latency_ms = int((time.monotonic() - t0) * 1000)
        rc = proc.returncode if proc.returncode is not None else -1
        text = stdout.decode("utf-8", "replace")
        stderr_s = stderr.decode("utf-8", "replace")
        if rc != 0 or not text.strip():
            log.warning(
                "ollama backend non-zero / empty rc=%d model=%s stderr=%r",
                rc, self.model, stderr_s[:200],
            )
            return LLMResult(
                success=False,
                text=text,
                model=self.model,
                latency_ms=latency_ms,
                extra={"rc": rc, "stderr": stderr_s.strip()[:256]},
            )
        return LLMResult(
            success=True,
            text=text,
            model=self.model,
            latency_ms=latency_ms,
            extra={"rc": rc},
        )

    @staticmethod
    async def _kill_and_reap(proc: asyncio.subprocess.Process) -> None:
        """Kill *proc* and wait for it so its exit status is collected."""
        try:
            proc.kill()
        except ProcessLookupError:
            # The process exited between the returncode check and kill().
            pass
        await proc.wait()