merge: testing → main (reconcile 2-week divergence)

2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions
--- a/decnet/realism/prompts/__init__.py
+++ b/decnet/realism/prompts/__init__.py
@@ -0,0 +1,9 @@
+"""Prompt builders for LLM-enriched content.
+
+* :mod:`decnet.realism.prompts.email` — corporate-email body builder.
+
+Stage 6 of the realism migration adds ``filebody.py``, ``filename.py``,
+and a ``_style.py`` helper so em-dash suppression sits in one place
+across email + file-class prompts.
+"""
+from __future__ import annotations
--- a/decnet/realism/prompts/_style.py
+++ b/decnet/realism/prompts/_style.py
@@ -0,0 +1,39 @@
+"""Shared stylometric guards for LLM-bound prompts.
+
+Lifted from the original ``orchestrator.emailgen.prompt`` em-dash
+block so file-class prompts (note / todo / draft / script bodies)
+pick up the same suppression.  Per the
+``feedback_em_dash_llm_tell.md`` memory: em-dashes (—) are a strong
+LLM-authorship tell, suppress by default; allow only for personas
+explicitly opted in via ``EmailPersona.uses_llms_heavily``.
+"""
+from __future__ import annotations
+
+from decnet.realism.personas import EmailPersona
+
+
+_SUPPRESS_RULE = (  # default wording: forbids em-dashes in the output
+    "Do NOT use em-dashes (—). Use commas, periods, or "
+    "parentheses instead. Em-dashes are a tell."
+)
+_ALLOW_RULE = (  # wording for personas with ``uses_llms_heavily`` set
+    "Em-dashes are fine — this persona uses them naturally. "
+    "Write in your usual style."
+)
+
+
+def em_dash_rule(persona: EmailPersona) -> str:
+    """Choose the em-dash prompt line for *persona*: allow or suppress."""
+    # Suppression is the default; only personas flagged
+    # ``uses_llms_heavily`` get the permissive wording.
+    return _ALLOW_RULE if persona.uses_llms_heavily else _SUPPRESS_RULE
+
+
+def strip_em_dashes(text: str, persona: EmailPersona) -> str:
+    """Replace leaked em/en-dashes with ``", "`` (belt-and-braces guard).
+
+    Opt-in personas pass through unchanged.  Spaces around the dash are
+    absorbed so ``"a — b"`` becomes ``"a, b"`` rather than ``"a ,  b"``.
+    """
+    import re  # function-scope import keeps this fix self-contained
+    return text if persona.uses_llms_heavily else re.sub(r"[ \t]*[—–]+[ \t]*", ", ", text)
--- a/decnet/realism/prompts/email.py
+++ b/decnet/realism/prompts/email.py
@@ -0,0 +1,154 @@
+"""Prompt builder for the email content class.
+
+The LLM gets a tightly-scoped instruction and a small handful of
+deterministic constraints.  Persona mannerisms are *pre-selected* in
+Python (1–2 of the persona's full list) and injected as hard rules —
+small models otherwise treat the mannerism list as flavour text and
+ignore it, and the corpus collapses into one voice.
+
+**Em-dash suppression** is on by default; suppression is lifted only
+for personas that opt in via ``uses_llms_heavily``.  Em-dashes are a
+strong stylometric tell for LLM-authored prose, and a honeypot mailbox
+where every author uses them is a tell.  Stage 6 of the realism
+migration extracts the suppression block into a shared
+``decnet.realism.prompts._style`` helper so file-class prompts pick
+it up too.
+"""
+from __future__ import annotations
+
+import secrets
+from dataclasses import dataclass
+from typing import Optional
+
+from decnet.realism.personas import EmailPersona
+
+
+@dataclass(frozen=True)
+class PromptInputs:  # immutable (frozen) bundle of the values ``build`` reads
+    sender: EmailPersona                      # persona writing the email
+    recipient: EmailPersona                   # persona being addressed
+    context_hint: str                         # text placed after "Context hint:"
+    parent_subject: Optional[str] = None      # set when replying
+    parent_excerpt: Optional[str] = None      # short snippet of last msg
+
+
+_LANGUAGE_NAMES = {  # language code → English name written into the prompt
+    "en": "English",
+    "es": "Spanish",
+    "pt": "Portuguese",
+    "fr": "French",
+    "de": "German",
+    "it": "Italian",
+    "nl": "Dutch",
+    "ja": "Japanese",
+    "zh": "Chinese",
+}
+
+
+def _lang_label(code: str) -> str:  # None/"" → "English"; unknown codes pass through
+    return _LANGUAGE_NAMES.get((code or "en").lower(), code or "English")
+
+
+def select_mannerisms(
+    persona: EmailPersona,
+    *,
+    rng: Optional[secrets.SystemRandom] = None,
+    n: int = 2,
+) -> list[str]:
+    """Pick up to *n* of *persona*'s mannerisms, shuffled by *rng*.
+
+    The whole list is returned in declared order when the persona has
+    *n* or fewer; ``n <= 0`` yields ``[]``.  Output is reproducible only
+    if *rng* is seedable (``SystemRandom``, the default, is not).
+    """
+    pool = list(persona.mannerisms)  # copy: never reorder the persona's list
+    if n <= 0 or not pool:
+        return []  # also stops a negative n slicing as pool[:-k]
+    if len(pool) <= n:
+        return pool
+    rnd = rng if rng is not None else secrets.SystemRandom()
+    rnd.shuffle(pool)
+    return pool[:n]
+
+
+def build(
+    inputs: PromptInputs,
+    *,
+    rng: Optional[secrets.SystemRandom] = None,
+) -> tuple[str, list[str]]:
+    """Return ``(prompt, mannerisms_used)`` for one corporate email.
+
+    ``mannerisms_used`` flows back into the persisted ``payload`` JSON
+    so an analyst can see *why* a given email reads the way it does.
+    *rng* is forwarded to :func:`select_mannerisms`; a truthy
+    ``inputs.parent_subject`` switches the prompt to reply mode.
+    """
+    # Function-scope import: rule 2 reuses the shared em-dash wording
+    # (the helper file-class prompts call) rather than a second copy.
+    from decnet.realism.prompts._style import em_dash_rule
+
+    sender = inputs.sender
+    recipient = inputs.recipient
+    language = _lang_label(sender.language or "en")
+    mannerisms = select_mannerisms(sender, rng=rng)
+    mannerism_block = (
+        "\n".join(f"- {m}" for m in mannerisms)
+        if mannerisms
+        else "- (no specific mannerisms; write in the persona's tone)"
+    )
+
+    # A persona-supplied signature is quoted verbatim; otherwise the
+    # model is asked to invent one that fits the role.
+    sig_block = (
+        f"Use this exact signature block:\n{sender.signature}"
+        if sender.signature
+        else "End with a short, plausible signature for the persona's role."
+    )
+
+    # Reply vs. new thread decides both the thread context and the
+    # subject-line rule.
+    if inputs.parent_subject:
+        thread_block = (
+            f"This is a REPLY in an ongoing thread.\n"
+            f"- Parent subject: {inputs.parent_subject}\n"
+            f"- Parent excerpt: {inputs.parent_excerpt or '(no excerpt)'}\n"
+            f"- Begin the body assuming the recipient already read the parent.\n"
+        )
+        subject_rule = (
+            "Subject must be the parent subject prefixed with 'Re: ' "
+            "(no double 'Re: Re:')."
+        )
+    else:
+        thread_block = "This is a NEW thread (no prior context)."
+        subject_rule = (
+            "Generate a short, specific subject line (≤ 80 chars) "
+            "appropriate to the context."
+        )
+
+    # Template lines are flush-left because the text is sent verbatim.
+    prompt = f"""You are writing one corporate email, RFC 2822 plain-text body only.
+
+Persona — sender:
+- Name: {sender.name}
+- Role: {sender.role}
+- Tone: {sender.tone_custom if sender.tone == "custom" and sender.tone_custom else sender.tone}
+- Mannerisms (must show through):
+{mannerism_block}
+
+Persona — recipient:
+- Name: {recipient.name}
+- Role: {recipient.role}
+
+Context hint: {inputs.context_hint}
+
+Thread context:
+{thread_block}
+
+Hard rules:
+1. Write the email body in {language}. Do not translate or code-switch.
+2. {em_dash_rule(sender)}
+3. {subject_rule}
+4. {sig_block}
+5. Output ONLY the email — first line is "Subject: <subject>", then a blank line, then the body. No commentary, no markdown fences, no preamble.
+"""
+    return prompt.strip(), mannerisms
--- a/decnet/realism/prompts/filebody.py
+++ b/decnet/realism/prompts/filebody.py
@@ -0,0 +1,91 @@
+"""Class-conditioned prompt builder for user-class file bodies.
+
+Stage 6 of the realism migration.  Only user-classes (``note``,
+``todo``, ``draft``, ``script``) get LLM enrichment — system-class
+content (cron logs, daemon logs, /tmp caches) is *supposed* to look
+formulaic, and an LLM-authored cron log is more suspicious than a
+templated one.
+
+The prompt asks for *short* output (LLM-authored ten-page essays in
+``~/notes.txt`` are an instant tell) and pins the exit shape so the
+worker doesn't need to scrape boilerplate.  Em-dash suppression
+flows through :mod:`decnet.realism.prompts._style`.
+"""
+from __future__ import annotations
+
+from decnet.realism.personas import EmailPersona
+from decnet.realism.prompts._style import em_dash_rule
+from decnet.realism.taxonomy import ContentClass
+
+
+_LANGUAGE_NAMES = {  # language code → English name written into the prompt
+    "en": "English", "es": "Spanish", "pt": "Portuguese",
+    "fr": "French", "de": "German", "it": "Italian",
+    "nl": "Dutch", "ja": "Japanese", "zh": "Chinese",
+}
+
+
+def _lang_label(code: str) -> str:  # falsy code → "English"; unknown code returned as-is
+    return _LANGUAGE_NAMES.get((code or "en").lower(), code or "English")
+
+
+_CLASS_GUIDANCE: dict[ContentClass, str] = {  # text placed after "Guidance:" by build()
+    ContentClass.NOTE: (
+        "A personal note file the persona keeps on their dev box.  "
+        "2–6 short lines.  Mix of TODOs, half-formed thoughts, "
+        "shorthand reminders.  NOT a polished document.  No headers "
+        "or markdown sections."
+    ),
+    ContentClass.TODO: (
+        "A markdown TODO list the persona keeps on their dev box.  "
+        "3–8 items in `- [ ] item` / `- [x] item` form.  Some checked, "
+        "some not.  Items are short, work-flavoured, lowercase, no "
+        "prose paragraphs.  No headers.  No introductory sentence."
+    ),
+    ContentClass.DRAFT: (
+        "A short draft email or memo the persona is working on.  "
+        "2–4 short paragraphs, conversational tone.  No subject line, "
+        "no headers — this is the body in a notes file, not a sent "
+        "email.  Sign off the way the persona would in their voice."
+    ),
+    ContentClass.SCRIPT: (
+        "A short utility script the persona wrote.  Pick a plausible "
+        "interpreter (bash or python3) and start with the matching "
+        "shebang.  10–25 lines.  Real-feeling intent (a backup, a "
+        "log rotation, a cleanup).  Inline comments allowed but sparse."
+    ),
+}
+
+
+def build(
+    content_class: ContentClass,
+    persona: EmailPersona,
+) -> str:
+    """Return a prompt for one body of *content_class* by *persona*.
+
+    The LLM is asked for *just the file body* (no commentary, no
+    markdown fences); callers are expected to run
+    ``_style.strip_em_dashes`` on the reply as a belt-and-braces guard.
+    Raises ``KeyError`` when *content_class* has no registered guidance.
+    """
+    guidance = _CLASS_GUIDANCE.get(content_class)
+    if guidance is None:
+        raise KeyError(
+            f"no filebody prompt registered for content_class={content_class!r}"
+        )
+    language = _lang_label(persona.language or "en")
+    return (
+        f"You are writing one short file the persona below would "
+        f"plausibly keep on their dev box.\n\n"
+        f"Persona:\n"
+        f"- Name: {persona.name}\n"
+        f"- Role: {persona.role}\n"
+        f"- Tone: {persona.tone_custom if persona.tone == 'custom' and persona.tone_custom else persona.tone}\n\n"
+        f"File class: {content_class.value}\n"
+        f"Guidance: {guidance}\n\n"
+        f"Hard rules:\n"
+        f"1. Write the file body in {language}. Do not translate or code-switch.\n"
+        f"2. {em_dash_rule(persona)}\n"
+        f"3. Output ONLY the file body. No commentary, no markdown "
+        f"fences, no preamble like 'Here is the file:'.\n"  # no leading pad: it leaked into the prompt
+    ).strip()