merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

@@ -0,0 +1,20 @@
"""Emailgen — email-specific delivery, scheduling, and threading.
After stage 5 of the realism migration, ``emailgen`` is no longer a
separate worker / systemd unit / CLI subcommand. It exposes:
* :mod:`decnet.orchestrator.emailgen.scheduler` — the
``EmailAction`` shape and the ``pick(repo)`` policy that decides
which mail decky / sender / recipient / thread an email belongs to.
* :mod:`decnet.orchestrator.emailgen.threads` — RFC 2822 thread chain
helpers (Message-ID generation, Re: / In-Reply-To bookkeeping).
* :mod:`decnet.orchestrator.emailgen.events` — DB-row + bus-topic
builders for email events.
The orchestrator's main worker (:mod:`decnet.orchestrator.worker`)
calls into these modules per tick. LLM glue, persona schema, prompt
builder, and the global persona pool moved to :mod:`decnet.realism`
in stage 2 of the migration; this package keeps only the
email-specific delivery surface.
"""
from __future__ import annotations

View File

@@ -0,0 +1,49 @@
"""DB-row + bus-topic helpers for the emailgen worker.
Mirror of :mod:`decnet.orchestrator.events` for the email action class.
Kept in its own module so the SSH-flavoured orchestrator and the
emailgen worker don't accumulate cross-imports of each other's action
types.
"""
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
from decnet.bus import topics as _topics
from decnet.orchestrator.drivers.base import ActivityResult
from decnet.orchestrator.emailgen.scheduler import EmailAction
def to_row(action: EmailAction, result: ActivityResult) -> dict[str, Any]:
    """Assemble the keyword arguments for ``OrchestratorEmail(**...)``.

    ``message_id``, ``subject``, ``language`` and ``eml_path`` are read
    from the driver's ``result.payload`` instead of the action: the EML's
    Message-ID is generated inside the driver after the LLM call, so the
    payload is what matches the file that landed on disk.

    Args:
        action: The scheduled email decision (decky, thread, personas).
        result: The driver outcome; ``payload`` may be ``None`` or empty.

    Returns:
        The row fields as a dict. Payload keys that are absent fall back
        to ``""``; ``language`` falls back to the sender's language and
        then to ``"en"``.
    """
    driver_payload = result.payload or {}
    sender = action.sender
    recipient = action.recipient
    # Only used when the driver did not report a language at all.
    fallback_language = sender.language or "en"

    row: dict[str, Any] = {"ts": datetime.now(timezone.utc)}
    row.update(
        mail_decky_uuid=action.mail_decky_uuid,
        thread_id=action.thread_id,
        message_id=driver_payload.get("message_id", ""),
        in_reply_to=action.parent_message_id,
        sender_email=sender.email,
        recipient_email=recipient.email,
        subject=driver_payload.get("subject", ""),
        language=driver_payload.get("language", fallback_language),
        eml_path=driver_payload.get("eml_path", ""),
        success=result.success,
        payload=driver_payload,  # repo serialises dict→json
    )
    return row
def topic_for(action: EmailAction) -> str:
    """Return the bus topic for an email action.

    The topic is built by ``_topics.orchestrator`` from the
    ``ORCHESTRATOR_EMAIL`` event type and the action's mail-decky UUID.
    """
    decky_uuid = action.mail_decky_uuid
    return _topics.orchestrator(_topics.ORCHESTRATOR_EMAIL, decky_uuid)
def event_type_for(action: EmailAction) -> str:  # noqa: ARG001 — symmetry
    """Return the event type for an email action.

    *action* is not inspected: every email action maps to
    ``ORCHESTRATOR_EMAIL``. The parameter is kept for symmetry.
    """
    email_event_type = _topics.ORCHESTRATOR_EMAIL
    return email_event_type

View File

@@ -0,0 +1,255 @@
"""Action picker for the emailgen worker.
One tick = one (mail-decky, sender, recipient, [thread]) decision.
Scope:
- Any running decky returned by ``repo.list_running_deckies()`` is an
  eligible mail host: topology deckies, fleet deckies, and SWARM shards.
- Mail decky = a running decky whose ``services`` includes
  ``imap`` or ``pop3``.
- Personas for a topology decky come from ``Topology.email_personas``
  (JSON list of :class:`EmailPersona`). Topology-wide ``language_default``
  fills in any persona that didn't set its own. Fleet / shard deckies
  have no parent topology, so they share the global persona pool
  (:mod:`decnet.realism.personas_pool`).
Returns ``None`` (skip tick) when:
- no running mail decky,
- the persona source for the chosen mail decky yields fewer than two
  personas,
- fewer than two personas are in their ``active_hours`` window right now.
"""
from __future__ import annotations
import secrets
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Optional
from decnet.logging import get_logger
from decnet.orchestrator.emailgen.threads import (
ThreadChain,
new_thread_id,
references_for_reply,
reply_subject,
)
from decnet.realism import personas_pool as global_pool
from decnet.realism.personas import (
EmailPersona,
in_active_hours,
parse_personas,
)
# Module-level logger; ``pick`` uses it to record why a tick was skipped.
logger = get_logger("orchestrator.emailgen")
# Service names that mark a running decky as a mail host
# (checked by ``_is_mail_decky``).
_MAIL_SERVICES = ("imap", "pop3")
# Probability of replying on an existing thread when one exists. The
# inverse starts a fresh thread. 0.6 mirrors what mailbox studies find
# for active corporate inboxes — most messages are replies, but not
# overwhelmingly so.
_REPLY_PROBABILITY = 0.6
# Generic context hints fed to the LLM when starting a new thread.
# Deliberately broad — the persona's tone + role is what shapes the
# email; the hint just gives the model a topic to riff on.
_CONTEXT_HINTS: tuple[str, ...] = (
    "Q3 budget review and approval",
    "Client presentation feedback",
    "Project deadline extension request",
    "Team building event planning",
    "IT system maintenance notification",
    "Quarterly performance review",
    "Vendor onboarding process",
    "Holiday schedule announcement",
    "Training session invitation",
    "Department restructuring update",
    "Client contract negotiation",
    "Security audit findings",
    "Sales strategy meeting",
    "Product launch timeline",
    "Office relocation update",
    "Travel reimbursement policy change",
)
@dataclass(frozen=True)
class EmailAction:
    """One emailgen tick's decision.

    ``thread_id`` is always populated: a reply carries the id of the
    thread it continues, a new thread carries a freshly generated one.
    The worker writes it back to the DB so future ticks can chain
    further replies. ``parent_message_id`` / ``references`` mirror the
    RFC 2822 ``In-Reply-To`` / ``References`` headers we'll set on the
    EML; on a new thread they are ``None`` and ``""`` respectively.

    ``mail_decky_name`` / ``mail_decky_services`` are denormalised onto
    the action so the driver doesn't need a second repo round-trip just
    to resolve the container name.
    """

    mail_decky_uuid: str
    mail_decky_name: str
    mail_decky_services: tuple[str, ...]
    sender: EmailPersona
    recipient: EmailPersona
    thread_id: str
    parent_message_id: Optional[str]  # None when starting a new thread
    references: str  # ``References:`` header value; "" on a new thread
    subject_hint: Optional[str]  # used as parent subject when replying
    parent_excerpt: Optional[str]  # excerpt from the parent body
    context_hint: str  # only meaningful on new threads
    is_reply: bool
    description: str = "email:send"
def _is_mail_decky(decky: dict[str, Any]) -> bool:
    """Return True when *decky* advertises at least one mail service.

    A missing or empty ``services`` entry counts as "no services". A bare
    string is rejected outright: ``in`` on a string is a substring test,
    not a membership test.
    """
    advertised = decky.get("services") or []
    if isinstance(advertised, str):
        return False
    for service in _MAIL_SERVICES:
        if service in advertised:
            return True
    return False
async def _resolve_personas(
    repo: Any, mail_decky: dict[str, Any],
) -> tuple[list[EmailPersona], str]:
    """Load the persona list that applies to *mail_decky*.

    Returns ``(personas, source_label)``. The label is the decky row's
    ``source`` value (``"topology" | "fleet" | "shard"``, as set by
    ``list_running_deckies``) or ``"unknown"`` when it is missing, so
    skip-tick logs read consistently with the rest of the orchestrator.

    Resolution rules:

    * **topology** → the personas stored on the decky's own topology
      (``Topology.email_personas``), with the topology's
      ``language_default`` (or ``"en"``) filling in personas that set no
      language. A missing ``topology_id`` or an unknown topology yields
      an empty list.
    * **fleet** / **shard** / anything else → no parent topology row, so
      the host-wide pool loaded by :mod:`global_pool` is used.
    """
    source = mail_decky.get("source") or "unknown"

    # Everything that is not topology-backed shares the global pool.
    if source != "topology":
        return global_pool.load(), source

    topology_id = mail_decky.get("topology_id")
    topology = await repo.get_topology(topology_id) if topology_id else None
    if not topology:
        return [], source

    raw_personas = topology.get("email_personas")
    default_language = topology.get("language_default") or "en"
    personas = parse_personas(raw_personas, language_default=default_language)
    return personas, source
async def pick(
    repo: Any,
    *,
    rand: Optional[secrets.SystemRandom] = None,
    now: Optional[datetime] = None,
) -> Optional[EmailAction]:
    """Pick one email action against any running mail decky.

    Mail-decky discovery uses the **union view** (``list_running_deckies``):
    MazeNET topology deckies, unihost fleet deckies, and SWARM shards are
    all eligible. Persona source is per-decky-source; see
    :func:`_resolve_personas`.

    Args:
        repo: Repository used to list running deckies, resolve the
            topology and look up existing threads.
        rand: Random source; injected so tests can make the choices
            deterministic. Defaults to a fresh ``secrets.SystemRandom``.
        now: Wall-clock used for ``active_hours`` filtering — injected so
            tests can pin the hour. Defaults to ``datetime.now()``.

    Returns:
        The chosen :class:`EmailAction`, or ``None`` (skip this tick) when
        there is no running mail decky, fewer than two personas are
        available or in-hours, or no in-hours persona has an address
        different from the sender's.
    """
    rng = rand or secrets.SystemRandom()
    now_dt = now or datetime.now()

    deckies = await repo.list_running_deckies()
    mail_deckies = [d for d in deckies if _is_mail_decky(d)]
    if not mail_deckies:
        logger.debug("emailgen pick: no running mail decky")
        return None
    mail_decky = rng.choice(mail_deckies)

    personas, source = await _resolve_personas(repo, mail_decky)
    if len(personas) < 2:
        logger.debug(
            "emailgen pick: source=%s mail_decky=%s only %d personas; need >=2",
            source, mail_decky.get("uuid"), len(personas),
        )
        return None

    active = [p for p in personas if in_active_hours(p, now_dt.hour)]
    if len(active) < 2:
        logger.debug(
            "emailgen pick: source=%s mail_decky=%s only %d personas in-hours",
            source, mail_decky.get("uuid"), len(active),
        )
        return None

    sender = rng.choice(active)
    # Two in-hours personas do not guarantee two distinct addresses:
    # duplicate entries for the sender's email would leave nothing to
    # choose from, and ``rng.choice`` raises IndexError on an empty list.
    recipients = [p for p in active if p.email != sender.email]
    if not recipients:
        logger.debug(
            "emailgen pick: source=%s mail_decky=%s no in-hours recipient "
            "distinct from sender",
            source, mail_decky.get("uuid"),
        )
        return None
    recipient = rng.choice(recipients)

    # Look up open threads between this pair on this mail decky.
    chain = await _maybe_pick_chain(
        repo, mail_decky["uuid"], sender, recipient, rng=rng,
    )
    services = tuple(mail_decky.get("services") or ())
    decky_name = mail_decky.get("name") or ""

    if chain is not None:
        # Reply: continue the existing thread.
        return EmailAction(
            mail_decky_uuid=mail_decky["uuid"],
            mail_decky_name=decky_name,
            mail_decky_services=services,
            sender=sender,
            recipient=recipient,
            thread_id=chain.thread_id,
            parent_message_id=chain.parent_message_id,
            references=references_for_reply(chain),
            subject_hint=chain.parent_subject,
            parent_excerpt=None,  # repo can populate later if useful
            context_hint=chain.parent_subject,
            is_reply=True,
        )

    # New thread: fresh thread id and a random topic for the LLM.
    return EmailAction(
        mail_decky_uuid=mail_decky["uuid"],
        mail_decky_name=decky_name,
        mail_decky_services=services,
        sender=sender,
        recipient=recipient,
        thread_id=new_thread_id(),
        parent_message_id=None,
        references="",
        subject_hint=None,
        parent_excerpt=None,
        context_hint=rng.choice(_CONTEXT_HINTS),
        is_reply=False,
    )
async def _maybe_pick_chain(
    repo: Any,
    mail_decky_uuid: str,
    sender: EmailPersona,
    recipient: EmailPersona,
    *,
    rng: secrets.SystemRandom,
) -> Optional[ThreadChain]:
    """Probabilistically pick an open thread between the pair, or None.

    With probability ``1 - _REPLY_PROBABILITY`` no lookup is made and
    ``None`` is returned, so the caller starts a new thread. Otherwise the
    repository is queried and the first returned row that carries a
    Message-ID becomes the parent of the reply.

    Returns:
        A :class:`ThreadChain` describing the parent to reply to, or
        ``None`` when the dice say "new thread" or no usable thread exists.
    """
    if rng.random() >= _REPLY_PROBABILITY:
        return None
    threads = await repo.list_orchestrator_email_threads(
        mail_decky_uuid, sender.email, recipient.email, limit=20,
    )
    if not threads:
        return None
    # A row recorded without a Message-ID (the driver returned none) cannot
    # be replied to: the reply's In-Reply-To header would be empty. Skip
    # such rows instead of blindly taking the first one.
    head = next((row for row in threads if row["message_id"]), None)
    if head is None:
        return None
    return ThreadChain(
        thread_id=head["thread_id"],
        parent_message_id=head["message_id"],
        # We don't reconstruct the full ancestry from row history here —
        # the parent's References + parent's Message-ID would do that.
        # For v1, single-step references is fine; mail clients still
        # group correctly by (Subject + In-Reply-To).
        references=tuple(),
        parent_subject=reply_subject(head["subject"]),
    )

View File

@@ -0,0 +1,75 @@
"""RFC 2822 thread-chain bookkeeping.
A thread is a worker-side UUID that groups one or more emails between
the same two personas. ``In-Reply-To`` carries the immediate parent's
``Message-ID``; ``References`` carries the full ancestry chain.
The emailgen scheduler queries the repository for the most recent email
in any thread between (sender, recipient); if it finds one, it emits a
reply (continuing the chain). Otherwise it starts a new thread.
"""
from __future__ import annotations
import uuid
from dataclasses import dataclass
from typing import Optional
@dataclass(frozen=True)
class ThreadChain:
    """Immutable view of a thread's chain at a point in time.

    ``thread_id`` is opaque (UUID). ``parent_message_id`` is the most
    recent message in the chain — the new reply's ``In-Reply-To`` field.
    ``references`` is the tuple of ancestor Message-IDs that
    :func:`references_for_reply` joins with spaces into the
    ``References:`` header (oldest-first per RFC 2822 §3.6.4).
    ``parent_subject`` carries the subject we're replying to, so the
    reply can prepend ``Re:`` correctly.
    """

    thread_id: str
    parent_message_id: str
    references: tuple[str, ...]
    parent_subject: str
def new_thread_id() -> str:
    """Return a fresh random thread identifier (UUID4, canonical string form)."""
    thread_uuid = uuid.uuid4()
    return str(thread_uuid)
def reply_subject(parent_subject: str) -> str:
    """Return *parent_subject* with exactly one leading ``Re:`` marker.

    Any run of existing ``Re:`` prefixes (case-insensitive, with or
    without whitespace between them) is collapsed first — Outlook /
    Thunderbird both do this, and an attacker reading the maildir would
    notice a corpus full of ``Re: Re: Re:`` immediately.
    """
    marker = "re:"
    bare = parent_subject.strip()
    # Peel reply markers off the front until the real subject is left.
    while bare[: len(marker)].lower() == marker:
        bare = bare[len(marker):].lstrip()
    return "Re: " + bare
def references_for_reply(chain: Optional[ThreadChain]) -> str:
    """Build the ``References:`` header value for a reply.

    The chain's recorded ancestry comes first (oldest-first), followed by
    the immediate parent's Message-ID, all separated by single spaces.
    A root message (*chain* is ``None``) has no references and yields
    the empty string.
    """
    if chain is not None:
        return " ".join([*chain.references, chain.parent_message_id])
    return ""
def new_message_id(domain: str) -> str:
    """Build an RFC 2822 ``Message-ID`` value, angle brackets included.

    Worker side — the value is also stored in the DB so a future reply
    can be threaded against it. *domain* should mirror the sender's
    email domain so an attacker grepping for tells doesn't find every
    fake-corp email tagged with ``@example.com``. A blank domain falls
    back to ``localhost``.
    """
    host = domain.strip()
    if not host:
        host = "localhost"
    local_part = uuid.uuid4().hex
    return "<" + local_part + "@" + host + ">"