decnet/realism/naming._home and decnet/canary/cultivator._persona_login both normalised "John Smith"→"johnsmith" with identical logic. Lift to decnet.realism.personas.login_for(persona) and have both consumers import it. Drift between the two would have left canary placement and realism path naming using different login derivations.
154 lines
5.9 KiB
Python
154 lines
5.9 KiB
Python
"""Persona schema for realism content generation.
|
||
|
||
Stored as a JSON list on :attr:`Topology.email_personas`. Each persona
|
||
describes one fictional employee — sender of email *and* author of
|
||
files (notes, TODOs, drafts, scripts) on the deckies they're sampled
|
||
onto. The schema deliberately stays narrow: the LLM gets *enough*
|
||
differentiation to write distinct voices, no more.
|
||
|
||
The class is still named :class:`EmailPersona` because every persona
|
||
in the pool today carries a mandatory email address (used for IMAP/
|
||
POP3 spool delivery). Future per-decky personas without mailboxes
|
||
would justify a rename / superclass; not in scope for the realism
|
||
migration.
|
||
|
||
Invalid entries are dropped with a logged warning rather than raising —
a single typo in one persona must not stall the entire realism tick.
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
from typing import Literal, Optional
|
||
|
||
from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
|
||
|
||
from decnet.logging import get_logger
|
||
|
||
# Module-level logger; ``parse_personas`` uses it to report input it drops.
logger = get_logger("realism.personas")

# Voices a persona may be written in. ``"custom"`` is the escape hatch and
# must be paired with free text in ``EmailPersona.tone_custom``.
Tone = Literal["formal", "direct", "casual", "technical", "custom"]

# Allowed values for ``EmailPersona.reply_latency``.
ReplyLatency = Literal["fast", "normal", "slow"]
|
||
|
||
|
||
class EmailPersona(BaseModel):
    """A single fictional mailbox owner in the persona pool.

    ``language`` holds an ISO 639-1 code (``en``, ``es``, ``pt``…). When a
    persona leaves it unset, the topology's ``language_default`` applies.

    ``uses_llms_heavily`` switches off the prompt-layer em-dash
    suppression for this persona: em-dashes normally give away LLM
    authorship, but a persona explicitly pegged as a heavy LLM user
    should *naturally* produce them.
    """

    # Identity.
    name: str = Field(min_length=1, max_length=128)
    email: str = Field(min_length=3, max_length=255)
    role: str = Field(min_length=1, max_length=128)

    # Voice.
    tone: Tone = "formal"
    tone_custom: Optional[str] = Field(default=None, max_length=128)
    mannerisms: list[str] = Field(default_factory=list, max_length=12)
    language: Optional[str] = Field(default=None, max_length=8)
    signature: Optional[str] = Field(default=None, max_length=512)

    # Behaviour.
    active_hours: str = Field(default="09:00-18:00", max_length=32)
    reply_latency: ReplyLatency = "normal"
    uses_llms_heavily: bool = False

    @model_validator(mode="after")
    def _custom_tone_requires_text(self) -> "EmailPersona":
        """Reject ``tone="custom"`` unless ``tone_custom`` has real text."""
        # The canned tones can't express every voice (sarcastic, deadpan,
        # terse, …), so "custom" lets operators describe one. That free
        # text goes into the prompt verbatim; a blank value would hand the
        # LLM nothing but the literal word "custom", so fail loudly rather
        # than silently build a useless prompt.
        if self.tone != "custom":
            return self
        custom_text = self.tone_custom or ""
        if not custom_text.strip():
            raise ValueError("tone_custom is required when tone is 'custom'")
        return self

    @field_validator("email")
    @classmethod
    def _email_shape(cls, v: str) -> str:
        """Check that the address is shaped like ``user@domain.tld``."""
        # Deliberately cheap: full RFC 5322 validation isn't worth a
        # dependency. The prompt builder and Message-ID generator only
        # need non-empty ``user`` and ``domain`` parts.
        local, at_sign, domain = v.rpartition("@")
        if not at_sign:
            raise ValueError("email must contain '@'")
        if not (local and domain and "." in domain):
            raise ValueError("email must look like user@domain.tld")
        return v
|
||
|
||
|
||
def parse_personas(
    raw: str | list | None,
    *,
    language_default: str = "en",
) -> list[EmailPersona]:
    """Turn the ``email_personas`` value (JSON text or list) into models.

    Any persona without a ``language`` gets *language_default* filled in
    here, so downstream consumers (prompt builder, scheduler) never need
    to know about fallback semantics.

    Nothing raises on bad input: malformed JSON or a non-list payload
    yields an empty list, and individual entries that fail validation
    are skipped. Each of those cases is logged as a warning.
    """
    if not raw:
        return []

    entries = raw
    if isinstance(entries, str):
        try:
            entries = json.loads(entries)
        except json.JSONDecodeError as exc:
            logger.warning("realism personas: invalid JSON, skipping: %s", exc)
            return []

    if not isinstance(entries, list):
        logger.warning(
            "realism personas: expected list, got %s", type(entries).__name__
        )
        return []

    personas: list[EmailPersona] = []
    for index, item in enumerate(entries):
        try:
            parsed = EmailPersona.model_validate(item)
        except ValidationError as exc:
            # One bad entry must not take the rest of the pool down with it.
            logger.warning(
                "realism personas: dropping invalid entry index=%d: %s",
                index, exc.errors(include_url=False),
            )
            continue
        personas.append(
            parsed
            if parsed.language is not None
            else parsed.model_copy(update={"language": language_default})
        )
    return personas
|
||
|
||
|
||
def login_for(persona: str) -> str:
    """Return the linux login derived from a persona's display name.

    The name is lowercased and *all* whitespace is removed — not just
    U+0020, so a tab, newline or non-breaking space pasted into the
    config does not silently collapse the persona onto the fallback.
    If the result isn't a plausible POSIX login (non-empty, ASCII
    alphanumeric), fall back to ``user`` so the path doesn't leak the
    persona's display name onto the decky filesystem.

    Shared by realism path naming (``decnet/realism/naming.py``) and
    canary cultivation (``decnet/canary/cultivator.py``).

    Args:
        persona: The persona's display name, e.g. ``"John Smith"``.

    Returns:
        The derived login (``"johnsmith"``), or ``"user"`` when the name
        contains anything other than ASCII letters, digits and whitespace.
    """
    # ``str.split()`` with no argument splits on any Unicode whitespace
    # run and discards empty pieces, so joining strips every whitespace.
    candidate = "".join(persona.lower().split())
    # The empty string fails ``isalnum()``, so no separate emptiness check.
    if candidate.isascii() and candidate.isalnum():
        return candidate
    return "user"
|
||
|
||
|
||
def in_active_hours(persona: EmailPersona, now_hour: int) -> bool:
    """Return True if *now_hour* (0–23) falls in the persona's window.

    Format: ``"HH:MM-HH:MM"``. Only the hour components are compared —
    minutes are ignored — so the window is ``[start_hour, end_hour)``.
    Wrap-around windows (``"22:00-06:00"``) are supported, and a window
    whose start and end hours are equal is treated as always-on.

    Invalid windows treat the persona as always-on so a config typo
    never silences the whole fleet. "Invalid" covers both unparseable
    text and hours outside ``0..24``; without the range check a typo
    such as ``"25:00-30:00"`` would parse cleanly yet never match any
    hour. ``24`` is accepted so ``"18:00-24:00"`` keeps working.

    Args:
        persona: Object whose ``active_hours`` string is read.
        now_hour: Current hour of day, expected in ``0..23``.

    Returns:
        Whether the persona should be considered active at *now_hour*.
    """
    try:
        start_s, end_s = persona.active_hours.split("-")
        start_h = int(start_s.split(":")[0])
        end_h = int(end_s.split(":")[0])
    except (ValueError, IndexError):
        return True
    # Out-of-range hours are a typo, not a schedule: fail open.
    if not (0 <= start_h <= 24 and 0 <= end_h <= 24):
        return True
    if start_h == end_h:
        return True
    if start_h < end_h:
        return start_h <= now_hour < end_h
    # Wrap-around (e.g. 22:00-06:00).
    return now_hour >= start_h or now_hour < end_h
|