merge: testing → main (reconcile 2-week divergence)

2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions
--- a/decnet/realism/llm/init.py
+++ b/decnet/realism/llm/init.py
@@ -0,0 +1,17 @@
+"""LLM backend for the realism library.
+
+Pluggable per the provider-subpackages convention (mirrors
+:mod:`decnet.web.db` and :mod:`decnet.bus`): consumers depend on
+:class:`LLMBackend` from :mod:`base`; concrete transports live under
+:mod:`impl` and are selected by :func:`get_llm`.
+
+This is the seam to pull on when swapping local Ollama for the
+Anthropic API, llama.cpp, vLLM, or any other inference server — change
+``DECNET_REALISM_LLM`` (or pass ``llm=`` directly), no caller rewrite.
+"""
+from __future__ import annotations
+
+from decnet.realism.llm.base import LLMBackend, LLMResult, LLMTimeout
+from decnet.realism.llm.factory import get_llm
+
+__all__ = ["LLMBackend", "LLMResult", "LLMTimeout", "get_llm"]  # re-exported from .base and .factory
--- a/decnet/realism/llm/base.py
+++ b/decnet/realism/llm/base.py
@@ -0,0 +1,47 @@
+"""Backend protocol shared by every LLM transport.
+
+Deliberately narrow: realism consumers need one async ``generate``
+call that takes a prompt string and returns the model's output text
+plus enough metadata to populate per-event payloads (model name,
+latency, success bit).  Streaming, embeddings, multi-turn chat — all
+out of scope here; realism only ever does one-shot single-prompt
+generations.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Protocol
+
+
+class LLMTimeout(Exception):
+    """Signal that a generation ran past the backend's wall-clock cap.
+
+    Backends MUST raise this instead of returning empty output, so a
+    timeout stays distinguishable from "model produced nothing useful"
+    and the driver can record the right ``stage`` value in payloads.
+    """
+
+
+@dataclass
+class LLMResult:
+    """Outcome of one ``generate`` call.
+
+    ``success`` is ``False`` when the backend ran cleanly but produced
+    no usable output (e.g. an empty stdout).  Hard failures (subprocess
+    crash, network error) raise; soft failures land here so the driver
+    can persist + log them as one event.
+    """
+    success: bool  # False for a soft failure (no usable output)
+    text: str  # raw model output; may be empty when ``success`` is False
+    model: str  # name of the model the backend ran
+    latency_ms: int  # elapsed time of the call, in milliseconds
+    extra: dict[str, Any] = field(default_factory=dict)  # backend diagnostics, e.g. ``rc`` / ``stderr``
+
+
+class LLMBackend(Protocol):
+    """Minimal contract for a realism LLM provider."""
+
+    model: str  # model identifier the backend generates with
+    timeout: float  # per-call wall-clock cap, in seconds
+
+    async def generate(self, prompt: str) -> LLMResult: ...  # one-shot, single-prompt generation
--- a/decnet/realism/llm/circuit.py
+++ b/decnet/realism/llm/circuit.py
@@ -0,0 +1,99 @@
+"""Process-local circuit breaker for LLM calls.
+
+Per-call timeouts (``asyncio.wait_for(llm.generate, timeout=...)``)
+protect a single tick from a single hung Ollama.  They do NOT protect
+the worker from a *sustained* problem: 100 consecutive 60-second
+timeouts chew up an hour of orchestrator time on dead requests before
+anything notices.
+
+This breaker counts consecutive failed calls (any success resets the
+count) and flips ``open`` after ``failure_threshold`` of them.  Open
+breakers short-circuit ``allow_call`` to ``False`` so callers fall
+back to deterministic templates without the per-tick cost.  After
+``cooldown_seconds`` the breaker enters ``half_open`` and the next
+call is allowed; success closes the breaker, failure re-opens it
+with a fresh cooldown.
+
+Process-local on purpose — cross-process state would require shared
+memory and is overkill for a single orchestrator worker.
+"""
+from __future__ import annotations
+
+import threading
+import time
+from enum import Enum
+
+
+class _State(Enum):  # lifecycle states of LLMCircuitBreaker
+    CLOSED = "closed"  # calls are allowed
+    OPEN = "open"  # calls are short-circuited until the cooldown elapses
+    HALF_OPEN = "half_open"  # cooldown elapsed; calls are allowed as probes
+
+
+class LLMCircuitBreaker:
+    """Threadsafe consecutive-failure circuit breaker.
+
+    Default ``failure_threshold=3`` consecutive failures → open;
+    ``cooldown_seconds=60`` of open before transitioning to
+    half-open.  These match the realism worker's tick cadence: 3
+    consecutive 60s timeouts = 3 minutes of dead air, which is the
+    point at which a deterministic fallback is overdue.
+    """
+
+    def __init__(
+        self,
+        *,
+        failure_threshold: int = 3,
+        cooldown_seconds: float = 60.0,
+        clock=time.monotonic,
+    ) -> None:
+        self._failure_threshold = failure_threshold
+        self._cooldown = cooldown_seconds
+        self._clock = clock  # injectable time source; defaults to time.monotonic
+        self._lock = threading.Lock()  # guards all mutable state below
+        self._state = _State.CLOSED
+        self._consecutive_failures = 0
+        self._opened_at: float = 0.0
+
+    @property
+    def state(self) -> str:
+        """Current state as its string value (e.g. ``"closed"``)."""
+        with self._lock:
+            return self._state.value
+
+    def allow_call(self) -> bool:
+        """Return True if the next call should run, False if it should
+        short-circuit to the fallback path.
+
+        Promotes ``open`` → ``half_open`` after the cooldown elapses
+        so the next caller acts as a probe.
+        """
+        with self._lock:
+            if self._state != _State.OPEN:
+                return True  # closed, or half-open awaiting a probe result
+            if self._clock() - self._opened_at >= self._cooldown:
+                self._state = _State.HALF_OPEN
+                return True
+            return False
+
+    def record_success(self) -> None:
+        """Close the breaker and clear the failure count."""
+        with self._lock:
+            self._state = _State.CLOSED
+            self._consecutive_failures = 0
+            self._opened_at = 0.0
+
+    def record_failure(self) -> None:
+        """Count a failed call; open on threshold or on a failed probe."""
+        with self._lock:
+            if self._state == _State.OPEN:
+                # Late result from a call started before the breaker
+                # opened: don't restart the cooldown already running.
+                return
+            if self._state == _State.CLOSED:
+                self._consecutive_failures += 1
+                if self._consecutive_failures < self._failure_threshold:
+                    return
+            # Threshold reached, or a half-open probe failed: (re)open.
+            self._state = _State.OPEN
+            self._opened_at = self._clock()
--- a/decnet/realism/llm/factory.py
+++ b/decnet/realism/llm/factory.py
@@ -0,0 +1,46 @@
+"""Backend dispatch.
+
+Reads ``DECNET_REALISM_LLM`` to pick a concrete :class:`LLMBackend`.
+Defaults to ``ollama`` because that's what the prototype proved out and
+what most dev boxes have on hand.
+
+Supported keys:
+
+* ``ollama`` — :class:`decnet.realism.llm.impl.ollama.OllamaBackend`
+* ``fake``   — :class:`decnet.realism.llm.impl.fake.FakeBackend`
+  (canned output, used by tests so they don't shell out)
+
+Anthropic / vLLM / llama.cpp slots in here as a third branch when the
+need shows up.  Per the provider-subpackages convention, do NOT collapse
+factory dispatch into the impl modules — keeps the ``__init__`` import
+graph cycle-free and the env contract auditable in one place.
+"""
+from __future__ import annotations
+
+import os
+from typing import Any
+
+from decnet.realism.llm.base import LLMBackend
+
+
+def get_llm(*, model: str | None = None, **kwargs: Any) -> LLMBackend:
+    """Instantiate the LLM backend selected by environment.
+
+    *model* (when provided) overrides whatever the backend's own default
+    is — e.g. for :class:`OllamaBackend` that's ``llama3.1`` unless
+    ``DECNET_REALISM_MODEL`` says otherwise.  Lets the worker honour
+    ``decnet orchestrate --model gpt-oss`` without each backend having
+    to know about CLI flags.  Raises ``ValueError`` on an unknown key.
+    """
+    # Unset, empty or blank DECNET_REALISM_LLM all mean "use the default".
+    backend_key = os.environ.get("DECNET_REALISM_LLM", "").strip().lower() or "ollama"
+    if backend_key == "ollama":
+        from decnet.realism.llm.impl.ollama import OllamaBackend
+        return OllamaBackend(model=model, **kwargs)
+    if backend_key == "fake":
+        from decnet.realism.llm.impl.fake import FakeBackend
+        return FakeBackend(model=model or "fake-model", **kwargs)
+    raise ValueError(
+        f"Unsupported DECNET_REALISM_LLM={backend_key!r}; "
+        "expected one of: ollama, fake"
+    )
--- a/decnet/realism/llm/impl/init.py
+++ b/decnet/realism/llm/impl/init.py
@@ -0,0 +1,6 @@
+"""Concrete LLM-backend implementations.
+
+Importers go through :func:`decnet.realism.llm.get_llm`, not these
+modules directly — same convention as :mod:`decnet.web.db.sqlite` and
+:mod:`decnet.bus.unix_client`.
+"""
--- a/decnet/realism/llm/impl/fake.py
+++ b/decnet/realism/llm/impl/fake.py
@@ -0,0 +1,50 @@
+"""In-process fake backend for tests.
+
+Returns a canned string so the driver path can be exercised without an
+Ollama install.  Configurable via ``DECNET_REALISM_FAKE_OUTPUT`` (env)
+or the ``output`` constructor arg — the env-var path lets integration
+tests run the worker end-to-end with deterministic output.
+"""
+from __future__ import annotations
+
+import os
+import time
+from typing import Optional
+
+from decnet.realism.llm.base import LLMBackend, LLMResult
+
+
+_DEFAULT_OUTPUT = (  # canned reply used when neither ``output`` nor the env override is given
+    "Subject: Quick update\n\n"
+    "Hi,\n\nFollowing up on the topic.\n\nBest regards,\nFake Persona\n"
+)
+
+
+class FakeBackend(LLMBackend):
+    """Canned-response backend; never spawns a process."""
+
+    def __init__(
+        self,
+        *,
+        model: str = "fake-model",
+        timeout: float = 1.0,
+        output: Optional[str] = None,
+        success: bool = True,
+    ) -> None:
+        # An explicit ``output`` wins; otherwise fall back to the env
+        # override, then to the built-in sample text.
+        if output is None:
+            output = os.environ.get("DECNET_REALISM_FAKE_OUTPUT", _DEFAULT_OUTPUT)
+        self.model = model
+        self.timeout = timeout
+        self._output, self._success = output, success
+
+    async def generate(self, prompt: str) -> LLMResult:    # noqa: ARG002
+        """Return the canned output (empty text when ``success`` is off)."""
+        started = time.monotonic()
+        ok = self._success
+        elapsed_ms = int((time.monotonic() - started) * 1000)
+        return LLMResult(
+            success=ok, text=self._output if ok else "", model=self.model,
+            latency_ms=elapsed_ms, extra={"rc": 0 if ok else 1},
+        )
--- a/decnet/realism/llm/impl/ollama.py
+++ b/decnet/realism/llm/impl/ollama.py
@@ -0,0 +1,100 @@
+"""Ollama subprocess backend.
+
+Shells out to ``ollama run <model>`` with the prompt fed via stdin.
+
+Why subprocess and not the Ollama HTTP API:
+* No new dependency (``ollama`` Python lib is optional).
+* Works on hosts where Ollama is bound to a unix socket, an unusual TCP
+  port, or behind a remote-mount layer — `ollama run` resolves all that.
+* Same path the operator uses by hand (``ollama run llama3.1``); easier
+  to debug discrepancies between worker output and a console session.
+
+Cost: per-call process spawn (~50ms on a warm box).  Acceptable for
+realism tick rates (one body per ~5 minutes per persona by default).
+When that cost matters, swap to an HTTP-API backend; the seam is in
+:mod:`decnet.realism.llm.factory`.
+"""
+from __future__ import annotations
+
+import asyncio
+import os
+import time
+from typing import Optional
+
+from decnet.logging import get_logger
+from decnet.realism.llm.base import LLMBackend, LLMResult, LLMTimeout
+
+log = get_logger("realism.llm")
+
+_OLLAMA = "ollama"  # argv[0] of the spawned subprocess
+_DEFAULT_MODEL = os.environ.get("DECNET_REALISM_MODEL", "llama3.1")  # read once, at import time
+_DEFAULT_TIMEOUT = float(os.environ.get("DECNET_REALISM_TIMEOUT", "60"))  # seconds; read once, at import time
+
+
+class OllamaBackend(LLMBackend):
+    """Concrete :class:`LLMBackend` that shells out to ``ollama run``."""
+
+    def __init__(
+        self,
+        *,
+        model: Optional[str] = None,
+        timeout: Optional[float] = None,
+    ) -> None:
+        self.model = model or _DEFAULT_MODEL
+        self.timeout = timeout if timeout is not None else _DEFAULT_TIMEOUT
+
+    async def generate(self, prompt: str) -> LLMResult:
+        """Run ``ollama run <model>`` with *prompt* on stdin.
+
+        Returns a failed :class:`LLMResult` when the binary is missing,
+        the exit code is non-zero or stdout is blank.  Raises
+        :class:`LLMTimeout` once ``self.timeout`` seconds have elapsed.
+        """
+        t0 = time.monotonic()
+        try:
+            proc = await asyncio.create_subprocess_exec(
+                _OLLAMA, "run", self.model,
+                stdin=asyncio.subprocess.PIPE,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+        except FileNotFoundError as exc:
+            latency_ms = int((time.monotonic() - t0) * 1000)
+            return LLMResult(
+                success=False,
+                text="",
+                model=self.model,
+                latency_ms=latency_ms,
+                extra={"rc": 127, "stderr": f"argv[0] not found: {exc}"},
+            )
+        try:
+            stdout, stderr = await asyncio.wait_for(
+                proc.communicate(prompt.encode("utf-8")), timeout=self.timeout,
+            )
+        except asyncio.TimeoutError as exc:
+            raise LLMTimeout(f"ollama run {self.model} exceeded {self.timeout}s") from exc
+        finally:
+            # Timed out or cancelled by the caller: kill the child and
+            # reap it so no orphaned ``ollama run`` or zombie is left.
+            if proc.returncode is None:
+                try:
+                    proc.kill()
+                except ProcessLookupError:
+                    pass
+                await proc.wait()
+
+        latency_ms = int((time.monotonic() - t0) * 1000)
+        rc = proc.returncode if proc.returncode is not None else -1
+        text = stdout.decode("utf-8", "replace")
+        stderr_s = stderr.decode("utf-8", "replace")
+        ok = rc == 0 and bool(text.strip())
+        extra: dict[str, object] = {"rc": rc}
+        if not ok:
+            log.warning(
+                "ollama backend non-zero / empty rc=%d model=%s stderr=%r",
+                rc, self.model, stderr_s[:200],
+            )
+            extra["stderr"] = stderr_s.strip()[:256]
+        return LLMResult(
+            success=ok, text=text, model=self.model, latency_ms=latency_ms, extra=extra,
+        )