"""Backend protocol shared by every LLM transport.

Deliberately narrow: realism consumers need one async ``generate``
call that takes a prompt string and returns the model's output text
plus enough metadata to populate per-event payloads (model name,
latency, success bit).  Streaming, embeddings, multi-turn chat — all
out of scope here; realism only ever does one-shot single-prompt
generations.
"""
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Protocol


class LLMTimeout(Exception):
    """Signal that a generation ran past the backend's wall-clock cap.

    A backend that hits its time limit is required to raise this
    exception instead of quietly handing back empty output.  The driver
    relies on that to tell a timeout apart from a run where the model
    simply produced nothing useful, so that the ``stage`` value recorded
    in the payload is the correct one.
    """


@dataclass
class LLMResult:
    """Record describing what a single ``generate`` call produced.

    Only *soft* failures are represented here: the backend executed
    without error yet yielded nothing usable (an empty stdout, for
    instance), in which case ``success`` is ``False``.  *Hard* failures
    such as a subprocess crash or a network error are raised as
    exceptions instead.  Keeping soft failures in the result lets the
    driver persist and log them as a single event.
    """

    # False when the backend ran cleanly but produced no usable output.
    success: bool
    # Output text returned by the model.
    text: str
    # Model name, carried along for per-event payloads.
    model: str
    # Latency of the call; milliseconds, going by the field name.
    latency_ms: int
    # Free-form additional data; its keys are not defined in this module.
    # A fresh dict is created per instance, so instances never share one.
    extra: dict[str, Any] = field(default_factory=dict)


class LLMBackend(Protocol):
    """Structural interface every realism LLM provider has to satisfy."""

    # Model name the backend generates with.
    model: str
    # Time limit for a generation.
    # NOTE(review): unit is not shown here, presumably seconds; confirm.
    timeout: float

    async def generate(self, prompt: str) -> LLMResult:
        """Run one generation for ``prompt`` and return its ``LLMResult``."""