merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
27
decnet/realism/__init__.py
Normal file
27
decnet/realism/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
"""Realism library — synthetic content + scheduling primitives.
|
||||
|
||||
A shared, importable library that produces *plausible* artifacts (file
|
||||
names, file bodies, email content) and the diurnal/persona machinery
|
||||
that decides *when* and *for whom* to produce them.
|
||||
|
||||
Workers (orchestrator, canary cultivator, future-emailgen-equivalents)
|
||||
import from here. This package owns:
|
||||
|
||||
* :mod:`decnet.realism.taxonomy` — :class:`ContentClass` enum and the
|
||||
:class:`Plan` dataclass that planners emit.
|
||||
* :mod:`decnet.realism.diurnal` — work-hours gating and a backdated
|
||||
``mtime`` sampler so planted files don't all stamp at wall-clock-now.
|
||||
* :mod:`decnet.realism.planner` — picks ``(decky, persona, class,
|
||||
action, mtime)`` tuples for the orchestrator's tick loop.
|
||||
* :mod:`decnet.realism.personas` — persona schema (the
|
||||
:class:`EmailPersona` record describing each fictional employee).
|
||||
* :mod:`decnet.realism.prompts` — prompt builders, one per content
|
||||
class, sharing an em-dash-suppression style helper.
|
||||
* :mod:`decnet.realism.llm` — :class:`LLMBackend` ABC + factory + impl
|
||||
subpackage; pluggable text-generation backend.
|
||||
|
||||
The library has **no worker, no systemd unit, no CLI of its own** —
|
||||
it's plain Python that consumers import. The CLI surface that does
|
||||
exist (``decnet realism import-personas``) is registered by
|
||||
:mod:`decnet.cli.realism` after stage 5 of the migration.
|
||||
"""
|
||||
421
decnet/realism/bodies.py
Normal file
421
decnet/realism/bodies.py
Normal file
@@ -0,0 +1,421 @@
|
||||
"""Per-content-class body generators (deterministic templates).
|
||||
|
||||
Stage 3 of the realism migration ships deterministic per-class
|
||||
templates — varied enough that two notes on the same decky aren't
|
||||
identical, formulaic enough that system-class files (cron logs,
|
||||
journal entries) look like cron actually wrote them.
|
||||
|
||||
Stage 6 wires LLM enrichment for user-classes; the templates here
|
||||
remain the fallback path so the orchestrator tick never blocks on
|
||||
Ollama.
|
||||
|
||||
Determinism: every namer/body takes an injected RNG, annotated as
:class:`SystemRandom` (from :mod:`secrets`). ``SystemRandom`` ignores
seeds, so tests that need reproducible output must inject a seedable
RNG with the same interface; the orchestrator passes a fresh RNG per
tick so production picks are unpredictable.
|
||||
|
||||
The factory mirrors :mod:`decnet.realism.naming`: caller passes a
|
||||
:class:`~decnet.realism.taxonomy.ContentClass`; we return the body
|
||||
generator registered for it. Email + canary classes raise —
|
||||
those bodies come from the email driver and canary cultivator
|
||||
respectively, not from realism.bodies.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import secrets
|
||||
from datetime import datetime, timezone
|
||||
from typing import Callable, Optional
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.realism.taxonomy import ContentClass
|
||||
|
||||
log = get_logger("realism.bodies")
|
||||
|
||||
|
||||
# ── User-class body generators ─────────────────────────────────────────────
|
||||
|
||||
|
||||
_NOTE_TEMPLATES: tuple[str, ...] = (
|
||||
"follow up with the team on this",
|
||||
"remember to ping the on-call",
|
||||
"ask about the staging migration timeline",
|
||||
"double-check the runbook before next shift",
|
||||
"todo: rotate keys; check on backup task",
|
||||
"meeting notes from yesterday — copy onto wiki when free",
|
||||
"this is broken in prod; talk to ops monday",
|
||||
"draft response to the auditor — keep it short",
|
||||
)
|
||||
|
||||
|
||||
def _body_note(persona: str, rng: secrets.SystemRandom) -> str:
|
||||
n = rng.randint(2, 5)
|
||||
lines = rng.sample(_NOTE_TEMPLATES, k=min(n, len(_NOTE_TEMPLATES)))
|
||||
return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
_TODO_VERBS: tuple[str, ...] = (
|
||||
"rotate keys", "review pr",
|
||||
"clean up logs", "update docs",
|
||||
"follow up on ticket",
|
||||
"test backup restore",
|
||||
"deploy to staging",
|
||||
"ack auditor email",
|
||||
"patch CVE backlog",
|
||||
)
|
||||
|
||||
|
||||
def _body_todo(persona: str, rng: secrets.SystemRandom) -> str:
|
||||
n = rng.randint(3, 7)
|
||||
items = rng.sample(_TODO_VERBS, k=min(n, len(_TODO_VERBS)))
|
||||
# Roughly a third pre-checked — looks like a list that's been
|
||||
# touched at least once.
|
||||
out = []
|
||||
for item in items:
|
||||
marker = "[x]" if rng.random() < 0.33 else "[ ]"
|
||||
out.append(f"- {marker} {item}")
|
||||
return "\n".join(out) + "\n"
|
||||
|
||||
|
||||
_DRAFT_PARAGRAPHS: tuple[str, ...] = (
|
||||
"Hi team,\n\nQuick update on the project. We're tracking ahead of schedule "
|
||||
"on the migration but the staging soak revealed a regression in the "
|
||||
"auth path. I'll have a fix in by end of week.\n\nThanks,\n",
|
||||
"Hi,\n\nFollowing up on yesterday's meeting. Action items below:\n\n"
|
||||
"- Engineering owns the deployment plan\n"
|
||||
"- Ops will draft the runbook update\n"
|
||||
"- We sync again Friday\n\n",
|
||||
"All,\n\nProposal attached. Key points:\n\n"
|
||||
"1. We are not changing the data model in this release\n"
|
||||
"2. The new endpoint is opt-in via feature flag\n"
|
||||
"3. Rollback path is one config flip\n\n"
|
||||
"Feedback by EOD?\n\n",
|
||||
)
|
||||
|
||||
|
||||
def _body_draft(persona: str, rng: secrets.SystemRandom) -> str:
|
||||
return rng.choice(_DRAFT_PARAGRAPHS)
|
||||
|
||||
|
||||
_SCRIPT_TEMPLATES: tuple[str, ...] = (
|
||||
"#!/usr/bin/env bash\nset -euo pipefail\n\n"
|
||||
"BACKUP_DIR=/var/backups\n"
|
||||
"STAMP=$(date +%Y%m%d-%H%M)\n"
|
||||
"echo \"backup start $STAMP\"\n"
|
||||
"tar czf \"$BACKUP_DIR/db-$STAMP.tar.gz\" /var/lib/mysql\n"
|
||||
"echo \"backup done\"\n",
|
||||
"#!/usr/bin/env bash\nset -e\n\n"
|
||||
"# clean up old logs\n"
|
||||
"find /var/log -name '*.log.*.gz' -mtime +30 -delete\n",
|
||||
"#!/usr/bin/env python3\n\"\"\"Quick fix for the reporting job.\"\"\"\n"
|
||||
"import sys\n\n"
|
||||
"def main():\n print('todo: real fix here')\n\n"
|
||||
"if __name__ == '__main__':\n sys.exit(main())\n",
|
||||
)
|
||||
|
||||
|
||||
def _body_script(persona: str, rng: secrets.SystemRandom) -> str:
|
||||
return rng.choice(_SCRIPT_TEMPLATES)
|
||||
|
||||
|
||||
# ── System-class body generators ───────────────────────────────────────────
|
||||
|
||||
|
||||
_CRON_COMMANDS: tuple[str, ...] = (
|
||||
"(root) CMD (run-parts /etc/cron.daily)",
|
||||
"(root) CMD (run-parts /etc/cron.hourly)",
|
||||
"(www-data) CMD (cd /var/www && /usr/bin/php artisan schedule:run)",
|
||||
"(backup) CMD (/usr/local/bin/backup.sh)",
|
||||
"(root) CMD (test -x /usr/sbin/anacron || ( cd / && run-parts --report /etc/cron.daily ))",
|
||||
)
|
||||
|
||||
|
||||
def _body_log_cron(persona: str, rng: secrets.SystemRandom) -> str:
|
||||
n = rng.randint(8, 24)
|
||||
base = datetime.now(timezone.utc)
|
||||
lines = []
|
||||
for i in range(n):
|
||||
hour = (base.hour - i) % 24
|
||||
minute = rng.randint(0, 59)
|
||||
pid = rng.randint(1000, 99999)
|
||||
cmd = rng.choice(_CRON_COMMANDS)
|
||||
# ISO-ish "Apr 27 09:13:44 host CRON[1234]: ..." cron syslog shape.
|
||||
date_s = base.strftime("%b %d")
|
||||
lines.append(
|
||||
f"{date_s} {hour:02d}:{minute:02d}:{rng.randint(0,59):02d} "
|
||||
f"hostname CRON[{pid}]: {cmd}"
|
||||
)
|
||||
return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
_DAEMON_LINES: tuple[str, ...] = (
|
||||
"systemd[1]: Started Daily apt download activities.",
|
||||
"systemd[1]: apt-daily.service: Succeeded.",
|
||||
"systemd[1]: Reached target Multi-User System.",
|
||||
"kernel: [UFW BLOCK] IN=eth0 OUT= MAC=…",
|
||||
"sshd[2103]: pam_unix(sshd:session): session opened for user admin by (uid=0)",
|
||||
"sshd[2103]: Received disconnect from 10.0.0.4 port 47282:11: disconnected by user",
|
||||
"CRON[1894]: pam_unix(cron:session): session closed for user root",
|
||||
)
|
||||
|
||||
|
||||
def _body_log_daemon(persona: str, rng: secrets.SystemRandom) -> str:
|
||||
n = rng.randint(10, 30)
|
||||
lines = []
|
||||
base = datetime.now(timezone.utc)
|
||||
for _ in range(n):
|
||||
lines.append(
|
||||
f"{base.strftime('%b %d %H:%M:%S')} hostname "
|
||||
f"{rng.choice(_DAEMON_LINES)}"
|
||||
)
|
||||
return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def _body_cache_tmp(persona: str, rng: secrets.SystemRandom) -> str:
|
||||
# ~64-256 bytes of opaque session-ish payload — most /tmp/.cache-*
|
||||
# files in the wild are short binary or k=v dumps. We emit ASCII
|
||||
# so docker exec write paths don't need binary-safety acrobatics.
|
||||
nbytes = rng.randint(64, 256)
|
||||
chars = "abcdefghijklmnopqrstuvwxyz0123456789"
|
||||
return "session=" + "".join(rng.choice(chars) for _ in range(nbytes)) + "\n"
|
||||
|
||||
|
||||
def _body_email(persona: str, rng: secrets.SystemRandom) -> str:
|
||||
raise NotImplementedError(
|
||||
"email bodies come from the email driver, not realism.bodies"
|
||||
)
|
||||
|
||||
|
||||
def _body_canary(persona: str, rng: secrets.SystemRandom) -> str:
|
||||
raise NotImplementedError(
|
||||
"canary bodies come from the canary cultivator (stage 7), "
|
||||
"not realism.bodies"
|
||||
)
|
||||
|
||||
|
||||
# ── Dispatch ───────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
# Content class -> body generator.  Every canary class shares the same
# raising placeholder, so those entries are expanded from one tuple.
_BODIES: dict[ContentClass, Callable[[str, secrets.SystemRandom], str]] = {
    ContentClass.NOTE: _body_note,
    ContentClass.TODO: _body_todo,
    ContentClass.DRAFT: _body_draft,
    ContentClass.SCRIPT: _body_script,
    ContentClass.LOG_CRON: _body_log_cron,
    ContentClass.LOG_DAEMON: _body_log_daemon,
    ContentClass.CACHE_TMP: _body_cache_tmp,
    ContentClass.EMAIL: _body_email,
    **{
        canary_class: _body_canary
        for canary_class in (
            ContentClass.CANARY_AWS_CREDS,
            ContentClass.CANARY_ENV_FILE,
            ContentClass.CANARY_GIT_CONFIG,
            ContentClass.CANARY_SSH_KEY,
            ContentClass.CANARY_HONEYDOC,
            ContentClass.CANARY_HONEYDOC_DOCX,
            ContentClass.CANARY_HONEYDOC_PDF,
            ContentClass.CANARY_MYSQL_DUMP,
        )
    },
}


def make_body(
    content_class: ContentClass,
    persona: str,
    *,
    rand: Optional[secrets.SystemRandom] = None,
) -> str:
    """Return the template-generated body text for *content_class*.

    This is the template-only path; :func:`make_body_with_llm` is the
    LLM-aware variant and falls back to this function.

    Args:
        content_class: Selects the generator from ``_BODIES``.
        persona: Passed through to the generator.
        rand: RNG to draw from; a new ``secrets.SystemRandom`` is used
            when omitted.

    Raises:
        KeyError: no generator is registered for *content_class*.
        NotImplementedError: raised by the email/canary placeholders.
    """
    rng = rand or secrets.SystemRandom()
    if content_class not in _BODIES:
        raise KeyError(
            f"no body generator registered for content_class={content_class!r}"
        )
    generator = _BODIES[content_class]
    return generator(persona, rng)
|
||||
|
||||
|
||||
async def make_body_with_llm(
    content_class: ContentClass,
    persona,  # EmailPersona — typed loosely to avoid an import cycle
    *,
    llm=None,  # LLMBackend | None
    breaker=None,  # LLMCircuitBreaker | None
    timeout: float = 60.0,
    rand: Optional[secrets.SystemRandom] = None,
) -> str:
    """Return an LLM-written body for user classes, else the template body.

    The template path (:func:`make_body` with ``persona.name``) is used
    when any of the following holds:

    * ``content_class.is_user_class()`` is false,
    * ``llm`` is ``None``,
    * a breaker is supplied and ``breaker.allow_call()`` is false,
    * the LLM call times out or raises,
    * the result is unsuccessful or its text is blank.

    Every LLM failure is reported to the breaker (when one is supplied)
    before falling back; a usable result is reported as a success.  The
    returned LLM text is right-stripped, newline-terminated and passed
    through ``strip_em_dashes``.

    Args:
        content_class: Class of content to produce.
        persona: Object exposing ``.name``; also handed to the prompt
            builder and the style helper.
        llm: Backend exposing an awaitable ``generate(prompt)``.
        breaker: Optional circuit breaker guarding the LLM call.
        timeout: Seconds allowed for the ``generate`` call.
        rand: RNG for the template path; a new ``secrets.SystemRandom``
            is used when omitted.
    """
    rng = rand or secrets.SystemRandom()

    def _template_body() -> str:
        return make_body(content_class, persona.name, rand=rng)

    def _report_failure() -> None:
        if breaker is not None:
            breaker.record_failure()

    # Only user classes are ever sent to the LLM.
    if not content_class.is_user_class():
        return _template_body()

    if llm is None:
        return _template_body()
    # Consult the breaker only once we know an LLM call is wanted.
    if breaker is not None and not breaker.allow_call():
        return _template_body()

    # Imported lazily so the template-only path never loads the prompt
    # and style modules.
    from decnet.realism.llm.base import LLMTimeout
    from decnet.realism.prompts import filebody as _filebody
    from decnet.realism.prompts._style import strip_em_dashes

    prompt = _filebody.build(content_class, persona)
    try:
        result = await asyncio.wait_for(llm.generate(prompt), timeout=timeout)
    except (LLMTimeout, asyncio.TimeoutError):
        log.debug("realism.bodies LLM timeout class=%s persona=%s",
                  content_class.value, persona.name)
        _report_failure()
        return _template_body()
    except Exception as exc:  # noqa: BLE001
        log.warning("realism.bodies LLM error class=%s persona=%s: %s",
                    content_class.value, persona.name, exc)
        _report_failure()
        return _template_body()

    usable = result.success and result.text.strip()
    if not usable:
        _report_failure()
        return _template_body()

    if breaker is not None:
        breaker.record_success()
    cleaned = result.text.rstrip() + "\n"
    return strip_em_dashes(cleaned, persona)
|
||||
|
||||
|
||||
# ── Edit-in-place mutators ─────────────────────────────────────────────────
|
||||
# Stage 3b: deterministic per-class mutations. The contract: take the
|
||||
# previous body bytes, return a plausible *next* iteration (append a
|
||||
# line, flip a checkbox, fix a typo). Append-only for logs; small
|
||||
# in-place edits for user content. LLM enrichment in stage 6 wires
|
||||
# next_iteration to ask "what would <persona> write next" with the
|
||||
# previous body in the prompt; the deterministic path stays as the
|
||||
# fallback.
|
||||
|
||||
|
||||
def _edit_todo(
|
||||
prev: str, persona: str, rng: secrets.SystemRandom,
|
||||
) -> str:
|
||||
"""Flip an unchecked box, append a new item, or both.
|
||||
|
||||
Real TODO files evolve: items get checked off as work happens, new
|
||||
items get added, occasionally a sub-bullet appears under an
|
||||
existing one. We pick one of those mutations per call.
|
||||
"""
|
||||
lines = prev.splitlines()
|
||||
unchecked_indices = [
|
||||
i for i, ln in enumerate(lines) if ln.startswith("- [ ]")
|
||||
]
|
||||
op = rng.choice(("flip", "append", "both") if unchecked_indices else ("append",))
|
||||
if op in ("flip", "both") and unchecked_indices:
|
||||
idx = rng.choice(unchecked_indices)
|
||||
lines[idx] = lines[idx].replace("- [ ]", "- [x]", 1)
|
||||
if op in ("append", "both"):
|
||||
new_item = rng.choice(_TODO_VERBS)
|
||||
marker = "[x]" if rng.random() < 0.15 else "[ ]"
|
||||
lines.append(f"- {marker} {new_item}")
|
||||
return "\n".join(lines) + ("" if prev.endswith("\n") else "\n")
|
||||
|
||||
|
||||
def _edit_note(
    prev: str, persona: str, rng: secrets.SystemRandom,
) -> str:
    """Append one note line, drawn from ``_NOTE_TEMPLATES``, to *prev*.

    A newline is inserted first when *prev* does not already end with
    one, and the result is always newline-terminated.  *persona* is
    unused.
    """
    addition = rng.choice(_NOTE_TEMPLATES)
    separator = "" if prev.endswith("\n") else "\n"
    return f"{prev}{separator}{addition}\n"
|
||||
|
||||
|
||||
def _edit_draft(
|
||||
prev: str, persona: str, rng: secrets.SystemRandom,
|
||||
) -> str:
|
||||
"""Append a new short paragraph to the existing draft."""
|
||||
addition = (
|
||||
"\nFollow-up: I'll send the deck once finance signs off on the numbers.\n",
|
||||
"\nP.S.: Looping in ops on the rollout sequence — they have context I don't.\n",
|
||||
"\nLet me know if any of this needs another pass.\n",
|
||||
)
|
||||
return prev.rstrip() + "\n" + rng.choice(addition)
|
||||
|
||||
|
||||
def _edit_script(
|
||||
prev: str, persona: str, rng: secrets.SystemRandom,
|
||||
) -> str:
|
||||
"""Append a comment line — scripts evolve via comments and small fixes."""
|
||||
comments = (
|
||||
"# TODO: handle the empty-input case\n",
|
||||
"# 2026-04-27: hardened error path after the prod incident\n",
|
||||
"# noqa: shellcheck disagrees but this is what the runbook says\n",
|
||||
)
|
||||
return prev.rstrip() + "\n" + rng.choice(comments)
|
||||
|
||||
|
||||
def _edit_log_cron(
    prev: str, persona: str, rng: secrets.SystemRandom,
) -> str:
    """Append a single cron syslog line to *prev*.

    A full cron body is generated with ``_body_log_cron`` and only its
    last line is kept.  Trailing whitespace of *prev* is removed first
    and the result is newline-terminated.
    """
    generated = _body_log_cron(persona, rng).splitlines()
    newest = generated[-1]
    return f"{prev.rstrip()}\n{newest}\n"
|
||||
|
||||
|
||||
def _edit_log_daemon(
    prev: str, persona: str, rng: secrets.SystemRandom,
) -> str:
    """Append a single daemon syslog line to *prev*.

    A full daemon body is generated with ``_body_log_daemon`` and only
    its last line is kept.  Trailing whitespace of *prev* is removed
    first and the result is newline-terminated.
    """
    generated = _body_log_daemon(persona, rng).splitlines()
    newest = generated[-1]
    return f"{prev.rstrip()}\n{newest}\n"
|
||||
|
||||
|
||||
# Content class -> in-place editor.  Classes missing from this table
# are rejected by ``next_iteration``.
_EDITORS: dict[ContentClass, Callable[[str, str, secrets.SystemRandom], str]] = {
    ContentClass.NOTE: _edit_note,
    ContentClass.TODO: _edit_todo,
    ContentClass.DRAFT: _edit_draft,
    ContentClass.SCRIPT: _edit_script,
    ContentClass.LOG_CRON: _edit_log_cron,
    ContentClass.LOG_DAEMON: _edit_log_daemon,
}


def next_iteration(
    content_class: ContentClass,
    persona: str,
    previous_body: str,
    *,
    rand: Optional[secrets.SystemRandom] = None,
) -> str:
    """Return the edited successor of *previous_body*.

    Args:
        content_class: Selects the editor from ``_EDITORS``.
        persona: Passed through to the editor.
        previous_body: Current file contents to mutate.
        rand: RNG to draw from; a new ``secrets.SystemRandom`` is used
            when omitted.

    Raises:
        KeyError: *content_class* has no registered editor.  Callers are
            expected to filter such classes out beforehand, so this
            signals a bug rather than a recoverable condition.
    """
    rng = rand or secrets.SystemRandom()
    if content_class not in _EDITORS:
        raise KeyError(
            f"content_class={content_class!r} does not support edits"
        )
    mutate = _EDITORS[content_class]
    return mutate(previous_body, persona, rng)
|
||||
152
decnet/realism/diurnal.py
Normal file
152
decnet/realism/diurnal.py
Normal file
@@ -0,0 +1,152 @@
|
||||
"""Work-hours gating and backdated mtime sampling.
|
||||
|
||||
The current orchestrator stamps every planted file at wall-clock-now,
|
||||
which is one of the realism failures driving this migration: a `cron.log`
|
||||
that says it was last touched at 03:14:22 UTC on a workstation
|
||||
attributed to a 9-to-5 admin reads as fake on first glance.
|
||||
|
||||
Two helpers:
|
||||
|
||||
* :func:`in_work_hours` — gate planner ticks so a persona's files only
|
||||
appear inside the persona's ``active_hours`` window. Wrap-around
|
||||
windows (``"22:00-06:00"``) are supported.
|
||||
* :func:`sample_mtime` — return a backdated datetime whose hour-of-day
|
||||
falls inside the persona's window, biased toward "recent but not
|
||||
now". Drivers pass this to ``touch -d``.
|
||||
|
||||
Clock and RNG are injectable so tests don't need to ``freeze_time`` or
|
||||
patch :mod:`secrets`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import secrets
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Protocol
|
||||
|
||||
|
||||
class _ClockLike(Protocol):
|
||||
def __call__(self) -> datetime: ...
|
||||
|
||||
|
||||
class _RandLike(Protocol):
|
||||
def random(self) -> float: ...
|
||||
def randint(self, a: int, b: int) -> int: ...
|
||||
|
||||
|
||||
def _parse_window(window: str) -> tuple[int, int, int, int] | None:
|
||||
"""Parse ``"HH:MM-HH:MM"`` into ``(start_h, start_m, end_h, end_m)``.
|
||||
|
||||
Returns ``None`` for malformed input — callers treat that as
|
||||
"always-on" so a single config typo never silences the whole fleet
|
||||
(mirrors :func:`decnet.realism.personas.in_active_hours` semantics).
|
||||
"""
|
||||
try:
|
||||
start_s, end_s = window.split("-")
|
||||
start_h, start_m = (int(p) for p in start_s.split(":"))
|
||||
end_h, end_m = (int(p) for p in end_s.split(":"))
|
||||
except (ValueError, IndexError):
|
||||
return None
|
||||
if not (0 <= start_h < 24 and 0 <= end_h < 24):
|
||||
return None
|
||||
if not (0 <= start_m < 60 and 0 <= end_m < 60):
|
||||
return None
|
||||
return start_h, start_m, end_h, end_m
|
||||
|
||||
|
||||
def in_work_hours(window: str, now: datetime) -> bool:
    """Report whether the clock time of *now* lies inside *window*.

    *window* is ``"HH:MM-HH:MM"``; the start is inclusive and the end
    exclusive, at minute resolution.  A start later than the end spans
    midnight.  Identical start and end, or a window that
    ``_parse_window`` rejects, count as always-on (returns ``True``).
    """
    bounds = _parse_window(window)
    if bounds is None:
        # Fail open on malformed windows.
        return True

    start_h, start_m, end_h, end_m = bounds
    opens = start_h * 60 + start_m
    closes = end_h * 60 + end_m
    if opens == closes:
        return True

    minute_of_day = now.hour * 60 + now.minute
    if opens < closes:
        return opens <= minute_of_day < closes
    # Wrap-around (e.g. 22:00-06:00): inside unless in the daytime gap.
    return not (closes <= minute_of_day < opens)
|
||||
|
||||
|
||||
def sample_mtime(
    window: str,
    now: datetime,
    *,
    rand: _RandLike | None = None,
    backdate_min_hours: float = 0.5,
    backdate_max_days: float = 14.0,
) -> datetime:
    """Return a backdated ``datetime`` for ``touch -d`` after a write.

    The age is drawn as ``-ln(u) * 12`` hours (exponential, mean 12 h,
    median about 8.3 h) and clipped to the range
    ``[backdate_min_hours, backdate_max_days * 24]``, so most results
    look recent without clustering at the minimum.

    If the resulting clock time is outside *window*, it is replaced by
    a random time inside the window on the same calendar date; for a
    wrap-around window the longer of its two segments is used.  The
    snapped value is then moved in whole days so that it is at least
    *backdate_min_hours* old and, where that floor allows, no older
    than *backdate_max_days*.

    A malformed or always-on window (start == end) returns the
    un-snapped candidate.

    Args:
        window: ``"HH:MM-HH:MM"`` active-hours window.
        now: Reference time; the result keeps its tzinfo.
        rand: RNG providing ``random()`` and ``randint()``; a new
            ``secrets.SystemRandom`` is used when omitted.
        backdate_min_hours: Minimum age of the result, in hours.
        backdate_max_days: Maximum age of the result, in days.
    """
    # Function-local: the module's import block does not include math.
    import math

    rng = rand or secrets.SystemRandom()
    parsed = _parse_window(window)

    # Exponential-ish backdate via -ln(u): heavier mass near "recent".
    # Cap by clipping; cheap and good enough for realism.
    u = max(rng.random(), 1e-6)  # avoid log(0)
    span_hours = max(backdate_min_hours, min(backdate_max_days * 24, -math.log(u) * 12.0))
    candidate = now - timedelta(hours=span_hours)

    if parsed is None:
        return candidate

    start_h, start_m, end_h, end_m = parsed
    if (start_h, start_m) == (end_h, end_m):
        return candidate

    cur = candidate.hour * 60 + candidate.minute
    start = start_h * 60 + start_m
    end = end_h * 60 + end_m
    # The snap minute is drawn even when it ends up unused, which keeps
    # the RNG draw sequence independent of where the candidate fell.
    if start < end:
        in_window = start <= cur < end
        snap_minutes = rng.randint(start, max(start, end - 1))
    else:
        # Wrap-around: in-window if cur is in either segment.
        in_window = cur >= start or cur < end
        # Snap into the larger of the two segments by total length.
        before_midnight = (24 * 60) - start
        after_midnight = end
        if before_midnight >= after_midnight:
            snap_minutes = rng.randint(start, 24 * 60 - 1)
        else:
            snap_minutes = rng.randint(0, max(0, end - 1))

    if in_window:
        return candidate

    snapped = candidate.replace(
        hour=snap_minutes // 60,
        minute=snap_minutes % 60,
        second=rng.randint(0, 59),
        microsecond=0,
    )
    one_day = timedelta(days=1)
    # Snapping to a later clock time on the same date can land too close
    # to (or after) *now*; step back whole days until the floor holds.
    floor = now - timedelta(hours=backdate_min_hours)
    while snapped > floor:
        snapped -= one_day
    # Snapping to an earlier clock time can overshoot the cap by less
    # than a day; pull forward one day when that keeps the floor intact.
    oldest = now - timedelta(days=backdate_max_days)
    if snapped < oldest and snapped + one_day <= floor:
        snapped += one_day
    return snapped
|
||||
17
decnet/realism/llm/__init__.py
Normal file
17
decnet/realism/llm/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
||||
"""LLM backend for the realism library.
|
||||
|
||||
Pluggable per the provider-subpackages convention (mirrors
|
||||
:mod:`decnet.web.db` and :mod:`decnet.bus`): consumers depend on
|
||||
:class:`LLMBackend` from :mod:`base`; concrete transports live under
|
||||
:mod:`impl` and are selected by :func:`get_llm`.
|
||||
|
||||
This is the seam to pull on when swapping local Ollama for the
|
||||
Anthropic API, llama.cpp, vLLM, or any other inference server — change
|
||||
``DECNET_REALISM_LLM`` (or pass ``llm=`` directly), no caller rewrite.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.realism.llm.base import LLMBackend, LLMResult, LLMTimeout
|
||||
from decnet.realism.llm.factory import get_llm
|
||||
|
||||
# Names re-exported by this package: the three imported from ``base``
# plus the ``get_llm`` factory.
__all__ = ["LLMBackend", "LLMResult", "LLMTimeout", "get_llm"]
|
||||
47
decnet/realism/llm/base.py
Normal file
47
decnet/realism/llm/base.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""Backend protocol shared by every LLM transport.
|
||||
|
||||
Deliberately narrow: realism consumers need one async ``generate``
|
||||
call that takes a prompt string and returns the model's output text
|
||||
plus enough metadata to populate per-event payloads (model name,
|
||||
latency, success bit). Streaming, embeddings, multi-turn chat — all
|
||||
out of scope here; realism only ever does one-shot single-prompt
|
||||
generations.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Protocol
|
||||
|
||||
|
||||
class LLMTimeout(Exception):
    """Signals that a generation ran past the backend's wall-clock limit.

    Backends are required to raise this instead of returning empty
    output, so callers can tell a timeout apart from a model that
    simply produced nothing useful.
    """
|
||||
|
||||
|
||||
@dataclass
class LLMResult:
    """Record describing the outcome of a single ``generate`` call.

    A soft failure, where the backend ran cleanly but produced nothing
    usable, is reported with ``success=False``.  Hard failures such as a
    crash or network error are raised as exceptions instead.
    """

    # False when the backend ran but produced no usable output.
    success: bool
    # The model's output text.
    text: str
    # Name of the model that produced the text.
    model: str
    # Latency of the call, in milliseconds.
    latency_ms: int
    # Free-form extra metadata; each instance gets its own dict.
    extra: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
class LLMBackend(Protocol):
    """Structural interface every realism LLM provider must satisfy."""

    # Model identifier used by the backend.
    model: str
    # Backend timeout setting — presumably seconds; confirm against the
    # concrete backends.
    timeout: float

    async def generate(self, prompt: str) -> LLMResult:
        """Run one generation for *prompt* and return its result."""
        ...
|
||||
99
decnet/realism/llm/circuit.py
Normal file
99
decnet/realism/llm/circuit.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""Process-local circuit breaker for LLM calls.
|
||||
|
||||
Per-call timeouts (``asyncio.wait_for(llm.generate, timeout=...)``)
|
||||
protect a single tick from a single hung Ollama. They do NOT protect
|
||||
the worker from a *sustained* problem: 100 consecutive 60-second
|
||||
timeouts chew up an hour of orchestrator time on dead requests before
|
||||
anything notices.
|
||||
|
||||
This breaker keeps a running count of consecutive failures (any success
resets it) and flips ``open`` once the count reaches
``failure_threshold``. Open
|
||||
breakers short-circuit ``allow_call`` to ``False`` so callers fall
|
||||
back to deterministic templates without the per-tick cost. After
|
||||
``cooldown_seconds`` the breaker enters ``half_open`` and the next
|
||||
call is allowed; success closes the breaker, failure re-opens it
|
||||
with a fresh cooldown.
|
||||
|
||||
Process-local on purpose — cross-process state would require shared
|
||||
memory and is overkill for a single orchestrator worker.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import threading
|
||||
import time
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class _State(Enum):
    """Breaker lifecycle states; the values are what ``state`` reports."""

    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"


class LLMCircuitBreaker:
    """Lock-protected circuit breaker driven by consecutive failures.

    The breaker opens once ``failure_threshold`` failures have been
    recorded without an intervening success.  While open,
    :meth:`allow_call` refuses calls until ``cooldown_seconds`` have
    passed on the injected clock.  It then moves to half-open and lets
    calls through again.  A success closes the breaker; a failure while
    half-open re-opens it with a fresh cooldown.
    """

    def __init__(
        self,
        *,
        failure_threshold: int = 3,
        cooldown_seconds: float = 60.0,
        clock=time.monotonic,
    ) -> None:
        """Create a closed breaker.

        Args:
            failure_threshold: Consecutive failures needed to open.
            cooldown_seconds: Time the breaker stays open before the
                next call is allowed through as a probe.
            clock: Zero-argument callable returning a monotonically
                increasing number of seconds; injectable for tests.
        """
        self._lock = threading.Lock()
        self._clock = clock
        self._failure_threshold = failure_threshold
        self._cooldown = cooldown_seconds
        self._state = _State.CLOSED
        self._consecutive_failures = 0
        self._opened_at: float = 0.0

    @property
    def state(self) -> str:
        """Current state as ``"closed"``, ``"open"`` or ``"half_open"``."""
        with self._lock:
            current = self._state
        return current.value

    def allow_call(self) -> bool:
        """Return ``True`` if the caller may invoke the LLM now.

        Closed and half-open breakers always allow the call.  An open
        breaker allows it only once the cooldown has elapsed, switching
        to half-open as it does so.
        """
        with self._lock:
            if self._state is not _State.OPEN:
                return True
            cooled_down = self._clock() - self._opened_at >= self._cooldown
            if cooled_down:
                self._state = _State.HALF_OPEN
            return cooled_down

    def record_success(self) -> None:
        """Close the breaker and clear the failure count and open timestamp."""
        with self._lock:
            self._opened_at = 0.0
            self._consecutive_failures = 0
            self._state = _State.CLOSED

    def record_failure(self) -> None:
        """Register a failed call, opening the breaker when warranted.

        A failure while half-open re-opens immediately and leaves the
        failure count untouched.  Otherwise the count is incremented and
        the breaker opens once it reaches the threshold.
        """
        with self._lock:
            if self._state is _State.HALF_OPEN:
                # The probe failed, so the underlying issue persists.
                self._trip()
                return
            self._consecutive_failures += 1
            if self._consecutive_failures >= self._failure_threshold:
                self._trip()

    def _trip(self) -> None:
        """Open the breaker and restart the cooldown; caller holds the lock."""
        self._state = _State.OPEN
        self._opened_at = self._clock()
|
||||
46
decnet/realism/llm/factory.py
Normal file
46
decnet/realism/llm/factory.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""Backend dispatch.
|
||||
|
||||
Reads ``DECNET_REALISM_LLM`` to pick a concrete :class:`LLMBackend`.
|
||||
Defaults to ``ollama`` because that's what the prototype proved out and
|
||||
what most dev boxes have on hand.
|
||||
|
||||
Supported keys:
|
||||
|
||||
* ``ollama`` — :class:`decnet.realism.llm.impl.ollama.OllamaBackend`
|
||||
* ``fake`` — :class:`decnet.realism.llm.impl.fake.FakeBackend`
|
||||
(canned output, used by tests so they don't shell out)
|
||||
|
||||
Anthropic / vLLM / llama.cpp slots in here as a third branch when the
|
||||
need shows up. Per the provider-subpackages convention, do NOT collapse
|
||||
factory dispatch into the impl modules — keeps the ``__init__`` import
|
||||
graph cycle-free and the env contract auditable in one place.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from decnet.realism.llm.base import LLMBackend
|
||||
|
||||
|
||||
def get_llm(*, model: str | None = None, **kwargs: Any) -> LLMBackend:
    """Instantiate the LLM backend selected by environment.

    The backend is chosen by ``DECNET_REALISM_LLM`` (case-insensitive,
    surrounding whitespace ignored). An unset or blank value selects
    ``ollama``.

    *model* (when provided) overrides whatever the backend's own default
    is — e.g. for :class:`OllamaBackend` that's ``llama3.1`` unless
    ``DECNET_REALISM_MODEL`` says otherwise. Lets the worker honour
    ``decnet orchestrate --model gpt-oss`` without each backend having
    to know about CLI flags.

    Extra keyword arguments are forwarded to the backend constructor.

    Raises:
        ValueError: if the selected key is not a supported backend.
    """
    # Normalise before dispatch: a stray space or an exported-but-empty
    # variable should not turn into an "unsupported backend" error.
    backend_key = os.environ.get("DECNET_REALISM_LLM", "").strip().lower() or "ollama"

    # Impl modules are imported lazily so only the selected backend's
    # module is loaded.
    if backend_key == "ollama":
        from decnet.realism.llm.impl.ollama import OllamaBackend
        return OllamaBackend(model=model, **kwargs)
    if backend_key == "fake":
        from decnet.realism.llm.impl.fake import FakeBackend
        return FakeBackend(model=model or "fake-model", **kwargs)
    raise ValueError(
        f"Unsupported DECNET_REALISM_LLM={backend_key!r}; "
        "expected one of: ollama, fake"
    )
|
||||
6
decnet/realism/llm/impl/__init__.py
Normal file
6
decnet/realism/llm/impl/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""Concrete LLM-backend implementations.
|
||||
|
||||
Importers go through :func:`decnet.realism.llm.get_llm`, not these
|
||||
modules directly — same convention as :mod:`decnet.web.db.sqlite` and
|
||||
:mod:`decnet.bus.unix_client`.
|
||||
"""
|
||||
50
decnet/realism/llm/impl/fake.py
Normal file
50
decnet/realism/llm/impl/fake.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""In-process fake backend for tests.
|
||||
|
||||
Returns a canned string so the driver path can be exercised without an
|
||||
Ollama install. Configurable via ``DECNET_REALISM_FAKE_OUTPUT`` (env)
|
||||
or the ``output`` constructor arg — the env-var path lets integration
|
||||
tests run the worker end-to-end with deterministic output.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from decnet.realism.llm.base import LLMBackend, LLMResult
|
||||
|
||||
|
||||
# Canned email-shaped text returned when neither the ``output`` argument
# nor ``DECNET_REALISM_FAKE_OUTPUT`` supplies one.
_DEFAULT_OUTPUT = "\n".join((
    "Subject: Quick update",
    "",
    "Hi,",
    "",
    "Following up on the topic.",
    "",
    "Best regards,",
    "Fake Persona",
    "",
))
|
||||
|
||||
|
||||
class FakeBackend(LLMBackend):
    """Backend that returns a fixed string instead of calling a model.

    The text comes from the ``output`` argument when given, otherwise
    from ``DECNET_REALISM_FAKE_OUTPUT``, otherwise from the module
    default. ``success=False`` makes every call report failure with
    empty text.
    """

    def __init__(
        self,
        *,
        model: str = "fake-model",
        timeout: float = 1.0,
        output: Optional[str] = None,
        success: bool = True,
    ) -> None:
        self.model = model
        self.timeout = timeout
        if output is None:
            # The environment is consulted once, at construction time.
            output = os.environ.get("DECNET_REALISM_FAKE_OUTPUT", _DEFAULT_OUTPUT)
        self._output = output
        self._success = success

    async def generate(self, prompt: str) -> LLMResult:  # noqa: ARG002
        """Return the canned result; *prompt* is ignored."""
        started = time.monotonic()
        ok = self._success
        elapsed_ms = int((time.monotonic() - started) * 1000)
        return LLMResult(
            success=ok,
            text=self._output if ok else "",
            model=self.model,
            latency_ms=elapsed_ms,
            extra={"rc": 0 if ok else 1},
        )
|
||||
100
decnet/realism/llm/impl/ollama.py
Normal file
100
decnet/realism/llm/impl/ollama.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""Ollama subprocess backend.
|
||||
|
||||
Shells out to ``ollama run <model>`` with the prompt fed via stdin.
|
||||
|
||||
Why subprocess and not the Ollama HTTP API:
|
||||
* No new dependency (``ollama`` Python lib is optional).
|
||||
* Works on hosts where Ollama is bound to a unix socket, an unusual TCP
|
||||
port, or behind a remote-mount layer — `ollama run` resolves all that.
|
||||
* Same path the operator uses by hand (``ollama run llama3.1``); easier
|
||||
to debug discrepancies between worker output and a console session.
|
||||
|
||||
Cost: per-call process spawn (~50ms on a warm box). Acceptable for
|
||||
realism tick rates (one body per ~5 minutes per persona by default).
|
||||
When that cost matters, swap to an HTTP-API backend; the seam is in
|
||||
:mod:`decnet.realism.llm.factory`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.realism.llm.base import LLMBackend, LLMResult, LLMTimeout
|
||||
|
||||
log = get_logger("realism.llm")

# Executable invoked for every generation call.
_OLLAMA = "ollama"

# Both defaults are read from the environment once, at import time.
_DEFAULT_MODEL = os.getenv("DECNET_REALISM_MODEL", "llama3.1")
_DEFAULT_TIMEOUT = float(os.getenv("DECNET_REALISM_TIMEOUT", "60"))
|
||||
|
||||
|
||||
class OllamaBackend(LLMBackend):
    """Concrete :class:`LLMBackend` that shells out to ``ollama run``.

    ``model`` falls back to the module default when falsy; ``timeout``
    (seconds) falls back to the module default only when ``None``.
    """

    # Upper bound, in seconds, on waiting for a killed child to exit.
    _REAP_TIMEOUT = 5.0

    def __init__(
        self,
        *,
        model: Optional[str] = None,
        timeout: Optional[float] = None,
    ) -> None:
        self.model = model or _DEFAULT_MODEL
        self.timeout = timeout if timeout is not None else _DEFAULT_TIMEOUT

    async def _kill_and_reap(self, proc: asyncio.subprocess.Process) -> None:
        """Kill *proc* and wait (bounded) for it to exit."""
        try:
            proc.kill()
        except ProcessLookupError:
            # The child exited between the timeout firing and the kill.
            pass
        # Without awaiting the exit the child is never reaped and its
        # pipe transports stay open. The wait is bounded so a child that
        # refuses to die cannot stall the caller.
        try:
            await asyncio.wait_for(proc.wait(), timeout=self._REAP_TIMEOUT)
        except asyncio.TimeoutError:
            log.warning(
                "ollama backend: killed child not reaped within %.1fs model=%s",
                self._REAP_TIMEOUT, self.model,
            )

    async def generate(self, prompt: str) -> LLMResult:
        """Run ``ollama run <model>`` with *prompt* on stdin.

        Returns a failed :class:`LLMResult` when the executable is
        missing (``rc`` 127), when the process exits non-zero, or when
        stdout is blank. Returns a successful result otherwise.

        Raises:
            LLMTimeout: if the process does not finish within
                ``self.timeout`` seconds; the child is killed first.
        """
        t0 = time.monotonic()
        try:
            proc = await asyncio.create_subprocess_exec(
                _OLLAMA, "run", self.model,
                stdin=asyncio.subprocess.PIPE,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        except FileNotFoundError as exc:
            latency_ms = int((time.monotonic() - t0) * 1000)
            return LLMResult(
                success=False,
                text="",
                model=self.model,
                latency_ms=latency_ms,
                extra={"rc": 127, "stderr": f"argv[0] not found: {exc}"},
            )
        try:
            stdout, stderr = await asyncio.wait_for(
                proc.communicate(prompt.encode("utf-8")),
                timeout=self.timeout,
            )
        except asyncio.TimeoutError as exc:
            await self._kill_and_reap(proc)
            raise LLMTimeout(
                f"ollama run {self.model} exceeded {self.timeout}s"
            ) from exc

        latency_ms = int((time.monotonic() - t0) * 1000)
        rc = proc.returncode if proc.returncode is not None else -1
        text = stdout.decode("utf-8", "replace")
        stderr_s = stderr.decode("utf-8", "replace")
        # A zero exit code with blank output is still treated as failure.
        if rc != 0 or not text.strip():
            log.warning(
                "ollama backend non-zero / empty rc=%d model=%s stderr=%r",
                rc, self.model, stderr_s[:200],
            )
            return LLMResult(
                success=False,
                text=text,
                model=self.model,
                latency_ms=latency_ms,
                extra={"rc": rc, "stderr": stderr_s.strip()[:256]},
            )
        return LLMResult(
            success=True,
            text=text,
            model=self.model,
            latency_ms=latency_ms,
            extra={"rc": rc},
        )
|
||||
184
decnet/realism/naming.py
Normal file
184
decnet/realism/naming.py
Normal file
@@ -0,0 +1,184 @@
|
||||
"""Per-content-class filename generators.
|
||||
|
||||
The pre-realism orchestrator emitted ``notes-1777315854.txt``
|
||||
(unix-epoch suffix) — a tell on first glance. Real users name
|
||||
``notes.txt``, ``TODO.md``, ``backup-2025-04.sql.gz``. Real systems
|
||||
write ``cron.log``, ``cron.log.1``, ``cron.log.2.gz`` (logrotate
|
||||
shape, no epoch).
|
||||
|
||||
Stage 3 ships **deterministic templates only**, persona-conditioned.
|
||||
Stage 6 wires LLM enrichment for the user-classes (``note``, ``todo``,
|
||||
``draft``, ``script``); the deterministic templates remain the
|
||||
fallback when LLM is disabled or times out.
|
||||
|
||||
The factory mirrors :func:`decnet.canary.factory.get_generator`:
|
||||
caller passes a :class:`~decnet.realism.taxonomy.ContentClass`; we
|
||||
return the namer registered for it. Renaming a content_class is a
|
||||
schema change and would invalidate ``synthetic_files.path`` lookups,
|
||||
so the dispatch is exhaustive — no silent fallbacks for unknown
|
||||
classes.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import secrets
|
||||
import string
|
||||
from typing import Callable, Optional
|
||||
|
||||
from decnet.realism.personas import login_for
|
||||
from decnet.realism.taxonomy import ContentClass
|
||||
|
||||
|
||||
# Persona → home-dir convention. Most personas are linux-style; the
|
||||
# rare "windows" persona gets ``C:\\Users\\<persona>\\Documents`` style
|
||||
# paths (out of scope until per-OS personas land). For now everything
|
||||
# is POSIX.
|
||||
def _home(persona: str) -> str:
    """Build the POSIX home directory path for *persona*.

    The directory name is whatever :func:`login_for` derives from the
    persona string.
    """
    login = login_for(persona)
    return "/home/" + login
|
||||
|
||||
|
||||
def _random_token(rng: secrets.SystemRandom, length: int = 6) -> str:
|
||||
"""Lowercase-alphanum token of length *length* — like ``mkstemp``."""
|
||||
return "".join(rng.choice(string.ascii_lowercase + string.digits) for _ in range(length))
|
||||
|
||||
|
||||
# ── User-class namers ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
# Candidate basenames per user content class. One entry is chosen at
# random and placed directly under the persona's home directory.
_NOTE_NAMES: tuple[str, ...] = (
    "notes.txt",
    "scratch.md",
    "ideas.txt",
    "Untitled-3.txt",
    "draft.md",
    "keys.txt",
    "passwords.txt",
    "TODO.md",
)

_TODO_NAMES: tuple[str, ...] = (
    "TODO.md",
    "todo.txt",
    "things.md",
    "tasks.txt",
    "punchlist.md",
)

_DRAFT_NAMES: tuple[str, ...] = (
    "Q3-budget-DRAFT.md",
    "proposal.md",
    "letter.txt",
    "rfc-internal.md",
    "memo.txt",
    "1on1-notes.md",
)

_SCRIPT_NAMES: tuple[str, ...] = (
    "backup.sh",
    "deploy.sh",
    "cleanup.sh",
    "rotate.sh",
    "fix.py",
    "tmp.py",
    "scratch.py",
)
|
||||
|
||||
|
||||
def _name_user(
    persona: str, names: tuple[str, ...], rng: secrets.SystemRandom,
) -> str:
    """Pick one basename from *names* and place it in *persona*'s home."""
    home_dir = _home(persona)
    basename = rng.choice(names)
    return f"{home_dir}/{basename}"
|
||||
|
||||
|
||||
def _name_note(persona: str, rng: secrets.SystemRandom) -> str:
    """Return a note-style path under *persona*'s home directory."""
    return _name_user(persona, names=_NOTE_NAMES, rng=rng)
|
||||
|
||||
|
||||
def _name_todo(persona: str, rng: secrets.SystemRandom) -> str:
    """Return a TODO-list-style path under *persona*'s home directory."""
    return _name_user(persona, names=_TODO_NAMES, rng=rng)
|
||||
|
||||
|
||||
def _name_draft(persona: str, rng: secrets.SystemRandom) -> str:
    """Return a draft-document-style path under *persona*'s home directory."""
    return _name_user(persona, names=_DRAFT_NAMES, rng=rng)
|
||||
|
||||
|
||||
def _name_script(persona: str, rng: secrets.SystemRandom) -> str:
    """Return a script-style path under *persona*'s home directory."""
    return _name_user(persona, names=_SCRIPT_NAMES, rng=rng)
|
||||
|
||||
|
||||
# ── System-class namers ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
# logrotate skeleton: cron.log, cron.log.1, cron.log.2.gz. No epoch
|
||||
# suffix — the realism failure today is `cron-1777317867.log`.
|
||||
# Absolute log paths the system-class namers choose from.
_CRON_LOGROTATE: tuple[str, ...] = (
    "/var/log/cron.log",
    "/var/log/cron.log.1",
    "/var/log/cron.log.2.gz",
)
_DAEMON_LOGROTATE: tuple[str, ...] = (
    "/var/log/daemon.log",
    "/var/log/syslog",
    "/var/log/messages",
    "/var/log/auth.log",
    "/var/log/auth.log.1",
)
|
||||
|
||||
|
||||
def _name_log_cron(persona: str, rng: secrets.SystemRandom) -> str:
    """Pick a cron log path; *persona* is accepted but not used."""
    chosen = rng.choice(_CRON_LOGROTATE)
    return chosen
|
||||
|
||||
|
||||
def _name_log_daemon(persona: str, rng: secrets.SystemRandom) -> str:
    """Pick a daemon/system log path; *persona* is accepted but not used."""
    chosen = rng.choice(_DAEMON_LOGROTATE)
    return chosen
|
||||
|
||||
|
||||
def _name_cache_tmp(persona: str, rng: secrets.SystemRandom) -> str:
    """Return a hidden ``/tmp/.cache-XXXXXX`` path; *persona* is not used."""
    # mkstemp shape: six random lowercase-alphanumeric characters. The
    # leading dot keeps the file out of a default `ls`.
    suffix = _random_token(rng, 6)
    # Bandit B108 fires on the literal "/tmp/" path. It is suppressed
    # here because this path is *generated for a target decky*, not
    # opened on the host.
    return f"/tmp/.cache-{suffix}"  # nosec B108
|
||||
|
||||
|
||||
# ── Email + canary placeholders ────────────────────────────────────────────
|
||||
# Email "names" (paths) are produced by the email driver's spool logic,
|
||||
# not by realism naming. Canary paths are advisory — operators usually
|
||||
# specify ``placement_path`` directly. Stage 7 of the realism migration
|
||||
# refines canary placement based on persona + content_class.
|
||||
|
||||
|
||||
def _name_email(persona: str, rng: secrets.SystemRandom) -> str:
    """Always raise: email paths are not produced by this module.

    Raises:
        NotImplementedError: unconditionally.
    """
    reason = (
        "email paths come from the email driver's spool logic, not "
        "realism.naming"
    )
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
def _name_canary(persona: str, rng: secrets.SystemRandom) -> str:
    """Always raise: canary placement is not produced by this module.

    Raises:
        NotImplementedError: unconditionally.
    """
    reason = (
        "canary placement is set by the canary cultivator (stage 7), "
        "not realism.naming"
    )
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
# ── Dispatch ───────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
# Registry of namers keyed by content class; consulted by make_path().
_NAMERS: dict[ContentClass, Callable[[str, secrets.SystemRandom], str]] = {
    # User classes: paths under the persona's home directory.
    ContentClass.NOTE: _name_note,
    ContentClass.TODO: _name_todo,
    ContentClass.DRAFT: _name_draft,
    ContentClass.SCRIPT: _name_script,
    # System classes: fixed log paths and /tmp cache files.
    ContentClass.LOG_CRON: _name_log_cron,
    ContentClass.LOG_DAEMON: _name_log_daemon,
    ContentClass.CACHE_TMP: _name_cache_tmp,
    # Email and canary classes are registered, but their namers raise.
    ContentClass.EMAIL: _name_email,
    **dict.fromkeys(
        (
            ContentClass.CANARY_AWS_CREDS,
            ContentClass.CANARY_ENV_FILE,
            ContentClass.CANARY_GIT_CONFIG,
            ContentClass.CANARY_SSH_KEY,
            ContentClass.CANARY_HONEYDOC,
            ContentClass.CANARY_HONEYDOC_DOCX,
            ContentClass.CANARY_HONEYDOC_PDF,
            ContentClass.CANARY_MYSQL_DUMP,
        ),
        _name_canary,
    ),
}
|
||||
|
||||
|
||||
def make_path(
    content_class: ContentClass,
    persona: str,
    *,
    rand: Optional[secrets.SystemRandom] = None,
) -> str:
    """Return a plausible absolute container-side path for *content_class*.

    User classes yield a path under the persona's home directory.
    System classes ignore the persona and choose from fixed log / tmp
    shapes. Email and canary classes raise ``NotImplementedError``
    because those paths come from their own drivers.

    *rand* supplies the randomness; a fresh ``secrets.SystemRandom`` is
    used when it is omitted.

    Raises:
        KeyError: if no namer is registered for *content_class*.
    """
    namer = _NAMERS.get(content_class)
    if namer is not None:
        rng = rand if rand else secrets.SystemRandom()
        return namer(persona, rng)
    raise KeyError(
        f"no namer registered for content_class={content_class!r}"
    )
|
||||
153
decnet/realism/personas.py
Normal file
153
decnet/realism/personas.py
Normal file
@@ -0,0 +1,153 @@
|
||||
"""Persona schema for realism content generation.
|
||||
|
||||
Stored as a JSON list on :attr:`Topology.email_personas`. Each persona
|
||||
describes one fictional employee — sender of email *and* author of
|
||||
files (notes, TODOs, drafts, scripts) on the deckies they're sampled
|
||||
onto. The schema deliberately stays narrow: the LLM gets *enough*
|
||||
differentiation to write distinct voices, no more.
|
||||
|
||||
The class is still named :class:`EmailPersona` because every persona
|
||||
in the pool today carries a mandatory email address (used for IMAP/
|
||||
POP3 spool delivery). Future per-decky personas without mailboxes
|
||||
would justify a rename / superclass; not in scope for the realism
|
||||
migration.
|
||||
|
||||
Invalid entries are dropped with a logged warning (only the valid
|
||||
personas are returned) rather than raising — a single typo in one persona must
|
||||
not stall the entire realism tick.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Literal, Optional
|
||||
|
||||
from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
|
||||
|
||||
from decnet.logging import get_logger
|
||||
|
||||
logger = get_logger("realism.personas")

# Closed vocabularies used as field types on the persona model.
Tone = Literal[
    "formal",
    "direct",
    "casual",
    "technical",
    "custom",
]
ReplyLatency = Literal[
    "fast",
    "normal",
    "slow",
]
|
||||
|
||||
|
||||
class EmailPersona(BaseModel):
    """Schema for one fictional mailbox owner.

    ``language`` is ISO 639-1 (``en``, ``es``, ``pt``…); when unset on the
    persona it falls back to the topology's ``language_default``.
    ``uses_llms_heavily`` lifts the prompt-layer em-dash suppression for
    that persona — em-dashes are an LLM tell, but a persona explicitly
    pegged as a heavy LLM user should *naturally* produce them.
    """

    name: str = Field(min_length=1, max_length=128)
    email: str = Field(min_length=3, max_length=255)
    role: str = Field(min_length=1, max_length=128)
    tone: Tone = "formal"
    tone_custom: Optional[str] = Field(default=None, max_length=128)
    mannerisms: list[str] = Field(default_factory=list, max_length=12)
    language: Optional[str] = Field(default=None, max_length=8)
    signature: Optional[str] = Field(default=None, max_length=512)
    active_hours: str = Field(default="09:00-18:00", max_length=32)
    reply_latency: ReplyLatency = "normal"
    uses_llms_heavily: bool = False

    @model_validator(mode="after")
    def _custom_tone_requires_text(self) -> "EmailPersona":
        """Reject ``tone="custom"`` without a non-blank ``tone_custom``."""
        # ``tone="custom"`` lets operators describe a voice the four
        # canned tones don't capture. The free text is interpolated into
        # the prompt verbatim, so a blank value would leave the LLM with
        # only the literal word "custom" — fail loudly instead.
        if self.tone != "custom":
            return self
        description = (self.tone_custom or "").strip()
        if not description:
            raise ValueError("tone_custom is required when tone is 'custom'")
        return self

    @field_validator("email")
    @classmethod
    def _email_shape(cls, v: str) -> str:
        """Check that *v* has the rough shape ``user@domain.tld``."""
        # Cheap structural check — full RFC 5322 isn't worth the
        # dependency. The split is on the last "@".
        local, at_sign, domain = v.rpartition("@")
        if not at_sign:
            raise ValueError("email must contain '@'")
        if not (local and domain and "." in domain):
            raise ValueError("email must look like user@domain.tld")
        return v
|
||||
|
||||
|
||||
def parse_personas(
    raw: str | list | None,
    *,
    language_default: str = "en",
) -> list[EmailPersona]:
    """Turn the stored ``email_personas`` value into validated models.

    *raw* may be a JSON string, an already-decoded list, or empty /
    ``None``. Undecodable JSON, a non-list payload, and individually
    invalid entries are logged as warnings and skipped; nothing is
    raised for them.

    Personas without a ``language`` are returned with it set to
    *language_default*, so downstream consumers never deal with the
    fallback themselves.
    """
    if not raw:
        return []

    data = raw
    if isinstance(data, str):
        try:
            data = json.loads(data)
        except json.JSONDecodeError as exc:
            logger.warning("realism personas: invalid JSON, skipping: %s", exc)
            return []

    if not isinstance(data, list):
        logger.warning(
            "realism personas: expected list, got %s", type(data).__name__
        )
        return []

    personas: list[EmailPersona] = []
    for index, entry in enumerate(data):
        try:
            candidate = EmailPersona.model_validate(entry)
        except ValidationError as exc:
            logger.warning(
                "realism personas: dropping invalid entry index=%d: %s",
                index, exc.errors(include_url=False),
            )
            continue
        if candidate.language is None:
            candidate = candidate.model_copy(update={"language": language_default})
        personas.append(candidate)
    return personas
|
||||
|
||||
|
||||
def login_for(persona: str) -> str:
    """Derive a linux login from a persona's display name.

    The name is lowercased and its space characters removed. If what
    remains is non-empty, ASCII and purely alphanumeric it is returned;
    otherwise the generic ``user`` is returned, so an unusual display
    name never ends up in a decky filesystem path.

    Shared by realism path naming (``decnet/realism/naming.py``) and
    canary cultivation (``decnet/canary/cultivator.py``).
    """
    login = "".join(persona.lower().split(" "))
    plausible = bool(login) and login.isascii() and login.isalnum()
    return login if plausible else "user"
|
||||
|
||||
|
||||
def in_active_hours(persona: EmailPersona, now_hour: int) -> bool:
    """Return True if *now_hour* (0–23) falls in the persona's window.

    Format: ``"HH:MM-HH:MM"``. Only the hour parts are compared; the
    start hour is inclusive and the end hour exclusive. Wrap-around
    windows (``"22:00-06:00"``) are supported, and a window whose start
    and end hours are equal is treated as always-on.

    Invalid windows — unparseable text, or hours outside ``0–23`` for
    the start and ``0–24`` for the end — treat the persona as always-on
    so a config typo never silences the whole fleet.
    """
    try:
        start_s, end_s = persona.active_hours.split("-")
        start_h = int(start_s.split(":")[0])
        end_h = int(end_s.split(":")[0])
    except (ValueError, IndexError):
        return True
    # Hours that parse as integers but are not real clock hours (e.g.
    # "25:00-30:00") would otherwise produce a window no hour can match.
    # An end hour of 24 is allowed to express "until midnight".
    if not (0 <= start_h <= 23 and 0 <= end_h <= 24):
        return True
    if start_h == end_h:
        return True
    if start_h < end_h:
        return start_h <= now_hour < end_h
    # Wrap-around (e.g. 22:00-06:00).
    return now_hour >= start_h or now_hour < end_h
|
||||
145
decnet/realism/personas_pool.py
Normal file
145
decnet/realism/personas_pool.py
Normal file
@@ -0,0 +1,145 @@
|
||||
"""Global persona pool — non-topology deckies.
|
||||
|
||||
DECNET runs in three deployment shapes that emit running deckies:
|
||||
|
||||
* **MazeNET topologies** — each topology owns its own
|
||||
:attr:`Topology.email_personas` JSON list; consumers walk from the
|
||||
decky back to its parent topology row.
|
||||
* **Unihost fleet** — MACVLAN/IPVLAN deckies that have no
|
||||
parent topology row at all. They share one host-wide pool.
|
||||
* **SWARM shards** — DeckyShard rows on enrolled workers.
|
||||
Same shape as fleet for realism purposes (no parent topology row),
|
||||
so they read the same global pool.
|
||||
|
||||
This module owns the global pool: a JSON file on disk that operators
|
||||
populate via ``decnet realism import-personas <file>`` (or by editing
|
||||
the file directly). The file is loaded lazily on first read and
|
||||
re-loaded on mtime change so a CLI import takes effect for the running
|
||||
worker without a restart.
|
||||
|
||||
Path resolution order:
|
||||
|
||||
1. ``DECNET_REALISM_PERSONAS`` environment variable — explicit override.
|
||||
2. ``/etc/decnet/email_personas.json`` — canonical master path; this is
|
||||
what ``decnet init`` will eventually own. Filename retained
|
||||
(``email_personas.json``) because the on-disk schema hasn't changed
|
||||
and operators may already have committed copies.
|
||||
3. ``~/.decnet/email_personas.json`` — dev fallback so a developer can
|
||||
exercise consumers without root or ``decnet init``.
|
||||
|
||||
When the file is missing / empty / unparseable, the pool is empty and
|
||||
consumers skip fleet/shard deckies the same way they skip a topology
|
||||
with too few personas. No silent fallback to dummy personas; silence
|
||||
is correct when there's no opinion to convey.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.realism.personas import EmailPersona, parse_personas
|
||||
|
||||
logger = get_logger("realism.personas_pool")

# Environment variable that, when set to a non-blank value, overrides
# every other location.
_ENV_VAR = "DECNET_REALISM_PERSONAS"

# System-wide location of the pool file.
_SYSTEM_PATH = Path("/etc") / "decnet" / "email_personas.json"
|
||||
|
||||
|
||||
def _user_path() -> Path:
    """Return the per-user pool location with ``~`` expanded."""
    expanded = os.path.expanduser("~/.decnet/email_personas.json")
    return Path(expanded)
|
||||
|
||||
|
||||
def resolve_path() -> Path:
    """Return the path the global pool would load from right now.

    Resolution order:

    1. the override environment variable, when set to a non-blank value;
    2. the system path, when that file already exists;
    3. the system path, when its parent directory exists and is
       writable by the current process;
    4. the per-user path.

    The returned file may not exist; callers are expected to handle
    that. The function never reads, creates or modifies the pool file —
    it only checks existence and directory writability.
    """
    env_value = os.environ.get(_ENV_VAR, "").strip()
    if env_value:
        return Path(env_value)

    # An existing system file is authoritative. When it does not exist
    # yet, the system location is still preferred if a fresh write could
    # land there; on hosts where the directory is missing or read-only,
    # fall back to the user path instead of erroring out.
    system_dir = _SYSTEM_PATH.parent
    if _SYSTEM_PATH.exists() or (
        system_dir.exists() and os.access(system_dir, os.W_OK)
    ):
        return _SYSTEM_PATH
    return _user_path()
|
||||
|
||||
|
||||
# ── Cache ────────────────────────────────────────────────────────────────────
|
||||
# Lock-protected because two scheduler ticks could race on the first load,
|
||||
# and the read path is hot enough (every tick, every fleet/shard mail
|
||||
# decky) that re-parsing on every call is wasteful.
|
||||
|
||||
# Guards every read and write of the three cache variables below.
_lock = threading.Lock()

# Last parsed pool, plus the path and mtime it was parsed from.
_cache: list[EmailPersona] = []
_cache_path: Optional[Path] = None
_cache_mtime: float = 0.0
|
||||
|
||||
|
||||
def load(*, language_default: str = "en") -> list[EmailPersona]:
    """Return the parsed global persona pool.

    *language_default* fills in any persona missing a ``language`` field;
    fleet/shard sources have no topology-level default, so callers
    should pass the worker's best guess (typically ``"en"``). On a
    cache hit the previously parsed list is returned as-is, so
    *language_default* only takes effect when the file is (re)parsed.

    The file is re-read when the resolved path or its mtime differs
    from the cached values, or when the cached pool is empty. A missing
    file or a failed read yields an empty list.
    """
    global _cache, _cache_path, _cache_mtime

    pool_path = resolve_path()
    try:
        mtime = pool_path.stat().st_mtime
    except OSError:
        # The file cannot be stat'ed: clear the cache and report an
        # empty pool.
        with _lock:
            _cache, _cache_path, _cache_mtime = [], pool_path, 0.0
        return []

    with _lock:
        # Only a non-empty cache counts as a hit; an empty one re-parses
        # cheaply anyway.
        cache_hit = (
            bool(_cache)
            and _cache_path == pool_path
            and _cache_mtime == mtime
        )
        if cache_hit:
            return _cache

    # Reading and parsing happen outside the lock.
    try:
        text = pool_path.read_text(encoding="utf-8")
    except OSError as exc:
        logger.warning("realism global pool: read failed path=%s: %s", pool_path, exc)
        return []

    personas = parse_personas(text, language_default=language_default)
    with _lock:
        _cache, _cache_path, _cache_mtime = personas, pool_path, mtime
    if personas:
        logger.info(
            "realism global pool: loaded %d personas from %s", len(personas), pool_path,
        )
    return personas
|
||||
|
||||
|
||||
def reset_cache() -> None:
    """Drop the in-process cache so the next ``load()`` re-reads the file.

    Test-only helper — avoids stale state when several tests in the
    same process exercise different on-disk pools.
    """
    global _cache, _cache_path, _cache_mtime
    with _lock:
        _cache, _cache_path, _cache_mtime = [], None, 0.0
|
||||
368
decnet/realism/planner.py
Normal file
368
decnet/realism/planner.py
Normal file
@@ -0,0 +1,368 @@
|
||||
"""Realism planner — picks the next ``(decky, persona, class, action)`` tuple.
|
||||
|
||||
Stage 3: returns ``create``-only plans (the edit branch lands in
|
||||
stage 3b). Pure-function, deterministic given the same inputs:
|
||||
caller passes deckies (with personas pre-resolved on each row),
|
||||
``now``, and an RNG.
|
||||
|
||||
The persona resolution split — topology-pool vs. global-pool — is
|
||||
the orchestrator's job, not the planner's. Each decky dict reaching
|
||||
:func:`pick` carries a ``_realism_personas`` key with the resolved
|
||||
:class:`~decnet.realism.personas.EmailPersona` list. Keeps the
|
||||
planner test-isolated and avoids forcing it to know about the
|
||||
:class:`~decnet.web.db.repository.BaseRepository` / topology pool /
|
||||
global pool.
|
||||
|
||||
Diurnal gating uses :func:`decnet.realism.diurnal.in_work_hours` per
|
||||
persona; we filter the (decky, persona) pairs *before* picking, so a
|
||||
persona outside its window is never considered.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import secrets
|
||||
from datetime import datetime
|
||||
from typing import Any, Optional, Sequence
|
||||
|
||||
from decnet.realism import bodies, naming
|
||||
from decnet.realism.diurnal import in_work_hours, sample_mtime
|
||||
from decnet.realism.personas import EmailPersona
|
||||
from decnet.realism.taxonomy import ContentClass, Plan, PlanAction # noqa: F401
|
||||
|
||||
|
||||
# Stage-3 weighted sampling defaults:
|
||||
# * User content (notes/todo/draft/script) gets the bulk — those are
|
||||
# the realism win when a persona "looks busy."
|
||||
# * System content (cron/daemon/cache) is plausible filler.
|
||||
# * Email is owned by another path and is not picked here.
|
||||
# * Canary classes are picked rarely. Each plant materialises a real
|
||||
# CanaryToken row + DNS slug + HTTP URL — flooding the fleet makes
|
||||
# the dashboard noisy. ~3% of file picks land here.
|
||||
#
|
||||
# These are the *defaults*. Operator-tuned overrides arrive via
|
||||
# :func:`apply_payload` (admin PUT /api/v1/realism/config). The
|
||||
# orchestrator worker periodically refreshes the in-process state from
|
||||
# the ``realism_config`` table; pick() reads the live globals each call.
|
||||
_DEFAULT_USER_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = (
    (ContentClass.NOTE, 30),
    (ContentClass.TODO, 20),
    (ContentClass.DRAFT, 15),
    (ContentClass.SCRIPT, 10),
)
_DEFAULT_SYSTEM_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = (
    (ContentClass.LOG_CRON, 12),
    (ContentClass.LOG_DAEMON, 8),
    (ContentClass.CACHE_TMP, 5),
)
# Every canary class carries the same weight of 1.
_DEFAULT_CANARY_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = tuple(
    (canary_class, 1)
    for canary_class in (
        ContentClass.CANARY_AWS_CREDS,
        ContentClass.CANARY_ENV_FILE,
        ContentClass.CANARY_GIT_CONFIG,
        ContentClass.CANARY_SSH_KEY,
        ContentClass.CANARY_HONEYDOC,
        ContentClass.CANARY_HONEYDOC_DOCX,
        ContentClass.CANARY_HONEYDOC_PDF,
        ContentClass.CANARY_MYSQL_DUMP,
    )
)
_DEFAULT_CANARY_PROBABILITY = 0.03

# Live (mutable) globals — reassigned by :func:`apply_payload`. pick()
# reads these. Reset to defaults via :func:`reset_to_defaults` (used by
# tests + the API DELETE path). Each starts out bound to its default.
_USER_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = _DEFAULT_USER_CLASS_WEIGHTS
_SYSTEM_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = _DEFAULT_SYSTEM_CLASS_WEIGHTS
_CANARY_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = _DEFAULT_CANARY_CLASS_WEIGHTS
_CANARY_PROBABILITY: float = _DEFAULT_CANARY_PROBABILITY
|
||||
|
||||
|
||||
def _serialize_weights(
    weights: tuple[tuple[ContentClass, int], ...],
) -> list[dict[str, Any]]:
    """Convert ``(class, weight)`` pairs into a list of plain dicts.

    Each dict has the keys ``content_class`` (the class's ``.value``)
    and ``weight``; input order is preserved.
    """
    rows: list[dict[str, Any]] = []
    for content_class, weight in weights:
        rows.append({"content_class": content_class.value, "weight": weight})
    return rows
|
||||
|
||||
|
||||
def _parse_weights(
    raw: Any, allowed: set[ContentClass],
) -> tuple[tuple[ContentClass, int], ...]:
    """Parse ``[{"content_class": "...", "weight": N}, ...]`` into the
    planner's internal tuple shape.

    Entries whose ``content_class`` is a known class but not in
    *allowed* are dropped rather than rejected (defends against an
    operator pasting a canary class on the user list, which would skew
    sampling without the canary-probability gate).

    Raises:
        ValueError: on structural problems — *raw* is not a list, an
            entry is not an object, a weight is not a non-negative
            integer (booleans are rejected), a ``content_class`` is
            unknown, no entry survives filtering, or the surviving
            weights sum to zero. Lets the API return 400.
    """
    if not isinstance(raw, list):
        raise ValueError("weights must be a list")
    out: list[tuple[ContentClass, int]] = []
    for entry in raw:
        if not isinstance(entry, dict):
            raise ValueError("each weight entry must be an object")
        cls_name = entry.get("content_class")
        weight = entry.get("weight")
        # ``bool`` is a subclass of ``int``, so it must be excluded
        # explicitly or a JSON ``true`` would pass as weight 1.
        if isinstance(weight, bool) or not isinstance(weight, int) or weight < 0:
            raise ValueError(
                f"weight for {cls_name!r} must be a non-negative integer"
            )
        try:
            cls = ContentClass(cls_name)
        except (ValueError, TypeError):
            # The enum's own lookup error adds nothing for the caller.
            raise ValueError(f"unknown content_class: {cls_name!r}") from None
        if cls not in allowed:
            # Silently drop — a class that doesn't belong on this list
            # is operator error, but one stray entry should not fail the
            # whole save. The roundtrip in current_payload() will show
            # the operator their entry didn't land.
            continue
        out.append((cls, weight))
    if not out:
        raise ValueError("weights list resolved to zero valid entries")
    if sum(w for _, w in out) <= 0:
        raise ValueError("weights must sum to a positive number")
    return tuple(out)
|
||||
|
||||
|
||||
# Allow-lists handed to :func:`_parse_weights` by :func:`apply_payload`:
# a payload entry only lands on a weight table when its class appears in
# the matching set below.
_USER_CLASSES: set[ContentClass] = {
    ContentClass.NOTE, ContentClass.TODO, ContentClass.DRAFT, ContentClass.SCRIPT,
}
_SYSTEM_CLASSES: set[ContentClass] = {
    ContentClass.LOG_CRON, ContentClass.LOG_DAEMON, ContentClass.CACHE_TMP,
}
_CANARY_CLASSES: set[ContentClass] = {
    ContentClass.CANARY_AWS_CREDS, ContentClass.CANARY_ENV_FILE,
    ContentClass.CANARY_GIT_CONFIG, ContentClass.CANARY_SSH_KEY,
    ContentClass.CANARY_HONEYDOC, ContentClass.CANARY_HONEYDOC_DOCX,
    ContentClass.CANARY_HONEYDOC_PDF, ContentClass.CANARY_MYSQL_DUMP,
}
|
||||
|
||||
|
||||
def current_payload() -> dict[str, Any]:
    """Snapshot the live planner configuration as a JSON-safe dict.

    This is the wire shape returned by ``GET /api/v1/realism/config``:
    the three weight tables in serialized form plus the canary
    probability.
    """
    payload: dict[str, Any] = {}
    payload["user_class_weights"] = _serialize_weights(_USER_CLASS_WEIGHTS)
    payload["system_class_weights"] = _serialize_weights(_SYSTEM_CLASS_WEIGHTS)
    payload["canary_class_weights"] = _serialize_weights(_CANARY_CLASS_WEIGHTS)
    payload["canary_probability"] = _CANARY_PROBABILITY
    return payload
|
||||
|
||||
|
||||
def apply_payload(payload: dict[str, Any]) -> None:
    """Override the planner's live globals from a wire payload.

    Every supplied field is validated into a local first; the module-level
    names are only rebound after all validation has succeeded, so a
    rejected payload never leaves the planner in a torn state.

    Unknown fields are ignored (forward-compat); fields not present
    leave the corresponding global untouched.

    Args:
        payload: Mapping that may contain ``user_class_weights``,
            ``system_class_weights``, ``canary_class_weights`` and/or
            ``canary_probability``.

    Raises:
        ValueError: If a weight list fails :func:`_parse_weights`, or if
            ``canary_probability`` is not a real number in ``[0.0, 1.0]``
            (booleans are rejected).
    """
    global _USER_CLASS_WEIGHTS, _SYSTEM_CLASS_WEIGHTS
    global _CANARY_CLASS_WEIGHTS, _CANARY_PROBABILITY

    # Start from the current values so absent fields round-trip unchanged.
    new_user = _USER_CLASS_WEIGHTS
    new_system = _SYSTEM_CLASS_WEIGHTS
    new_canary = _CANARY_CLASS_WEIGHTS
    new_prob = _CANARY_PROBABILITY

    if "user_class_weights" in payload:
        new_user = _parse_weights(payload["user_class_weights"], _USER_CLASSES)
    if "system_class_weights" in payload:
        new_system = _parse_weights(
            payload["system_class_weights"], _SYSTEM_CLASSES,
        )
    if "canary_class_weights" in payload:
        new_canary = _parse_weights(
            payload["canary_class_weights"], _CANARY_CLASSES,
        )
    if "canary_probability" in payload:
        prob = payload["canary_probability"]
        # bool is a subclass of int: without the explicit check a JSON
        # ``true`` would be accepted as probability 1.0. NaN fails the
        # range comparison and is rejected as well.
        if (
            isinstance(prob, bool)
            or not isinstance(prob, (int, float))
            or not (0.0 <= prob <= 1.0)
        ):
            raise ValueError("canary_probability must be in [0.0, 1.0]")
        new_prob = float(prob)

    # Validation is done — commit everything.
    _USER_CLASS_WEIGHTS = new_user
    _SYSTEM_CLASS_WEIGHTS = new_system
    _CANARY_CLASS_WEIGHTS = new_canary
    _CANARY_PROBABILITY = new_prob
|
||||
|
||||
|
||||
def reset_to_defaults() -> None:
    """Rebind every live planner global to its hardcoded default.

    Used by tests and the API reset path.
    """
    global _USER_CLASS_WEIGHTS, _SYSTEM_CLASS_WEIGHTS, \
        _CANARY_CLASS_WEIGHTS, _CANARY_PROBABILITY

    _USER_CLASS_WEIGHTS, _SYSTEM_CLASS_WEIGHTS = (
        _DEFAULT_USER_CLASS_WEIGHTS,
        _DEFAULT_SYSTEM_CLASS_WEIGHTS,
    )
    _CANARY_CLASS_WEIGHTS, _CANARY_PROBABILITY = (
        _DEFAULT_CANARY_CLASS_WEIGHTS,
        _DEFAULT_CANARY_PROBABILITY,
    )
|
||||
|
||||
|
||||
def _weighted_pick(
    weights: tuple[tuple[ContentClass, int], ...],
    rng: secrets.SystemRandom,
) -> ContentClass:
    """Draw one class from *weights* in proportion to its integer weight.

    A single integer is drawn from ``1..sum(weights)`` and the table is
    walked until the cumulative weight covers it. Zero-weight entries can
    never be selected.
    """
    remaining = rng.randint(1, sum(weight for _, weight in weights))
    for content_class, weight in weights:
        # Subtracting as we go is the same test as comparing the draw
        # against a running cumulative total.
        remaining -= weight
        if remaining <= 0:
            return content_class
    return weights[-1][0]  # unreachable, satisfy mypy
|
||||
|
||||
|
||||
def _eligible_pairs(
    deckies: Sequence[dict[str, Any]],
    now: datetime,
) -> list[tuple[dict[str, Any], EmailPersona]]:
    """Return every ``(decky, persona)`` pair whose persona is on the clock.

    The cross-product of deckies and their resolved personas is filtered
    through :func:`in_work_hours` at *now*. A decky whose
    ``_realism_personas`` is missing or empty contributes nothing — the
    same fail-quiet semantics as the emailgen scheduler.
    """
    return [
        (decky, persona)
        for decky in deckies
        for persona in (decky.get("_realism_personas") or [])
        if in_work_hours(persona.active_hours, now)
    ]
|
||||
|
||||
|
||||
def pick(
    deckies: Sequence[dict[str, Any]],
    now: datetime,
    *,
    edit_candidate: Optional[dict[str, Any]] = None,
    rand: Optional[secrets.SystemRandom] = None,
) -> Optional[Plan]:
    """Return a single :class:`Plan` for the orchestrator's tick.

    Stage-3b policy: weighted action roll — 60% create, 30% edit, 10%
    "leave alone" (planner returns ``None`` to skip). When the roll
    is "edit" and *edit_candidate* is set (a row from
    :meth:`BaseRepository.pick_random_synthetic_file_for_edit`), we
    return an edit Plan; otherwise — including when the candidate turns
    out not to be editable — we fall through to create.

    The orchestrator scheduler is responsible for fetching the edit
    candidate before calling — keeps this function pure-of-DB and
    test-friendly.

    Args:
        deckies: Decky dicts; each needs ``uuid``, ``name`` and a
            ``_realism_personas`` list to be eligible.
        now: Reference time for the diurnal filter and mtime sampling.
        edit_candidate: Optional synthetic-file row to edit this tick.
        rand: Optional RNG; a fresh ``secrets.SystemRandom`` is used when
            omitted.

    Returns:
        A create or edit :class:`Plan`, or ``None`` when no eligible
        (decky, persona) pair exists or when the action roll lands on
        "leave alone."
    """
    rng = rand or secrets.SystemRandom()

    eligible = _eligible_pairs(deckies, now)
    if not eligible:
        return None

    # Action roll. Edit only fires when there's a usable candidate from
    # the repo — otherwise we fall through to create.
    roll = rng.random()
    if roll < 0.10:
        return None  # "leave alone" — quiet tick is realism too
    if roll < 0.40 and edit_candidate is not None:
        edit = _edit_plan(edit_candidate, now, rng)
        if edit is not None:
            return edit
        # _edit_plan declined the candidate (not editable). Don't let
        # that silently inflate the "leave alone" rate; create instead.

    decky, persona = rng.choice(eligible)

    # Canary first — they're rare (~3% of file picks by default),
    # weighted per _CANARY_CLASS_WEIGHTS. Landing here means the
    # orchestrator plants a callback-bearing artifact this tick instead
    # of an inert one.
    body_hint: Optional[str]
    if rng.random() < _CANARY_PROBABILITY:
        content_class = _weighted_pick(_CANARY_CLASS_WEIGHTS, rng)
        # Canary placement is the cultivator's job — plan.target_path
        # is advisory; a "" lets the cultivator override entirely.
        target_path = ""
        body_hint = None
        trailing_note = "kind=canary"
    else:
        # User vs system content — biased toward user (realism wins are
        # bigger there).
        if rng.random() < 0.7:
            content_class = _weighted_pick(_USER_CLASS_WEIGHTS, rng)
        else:
            content_class = _weighted_pick(_SYSTEM_CLASS_WEIGHTS, rng)
        target_path = naming.make_path(content_class, persona.name, rand=rng)
        body_hint = bodies.make_body(content_class, persona.name, rand=rng)
        trailing_note = f"window={persona.active_hours}"

    mtime = sample_mtime(persona.active_hours, now, rand=rng)

    return Plan(
        decky_uuid=decky["uuid"],
        decky_name=decky["name"],
        persona=persona.name,
        content_class=content_class,
        action="create",
        target_path=target_path,
        mtime=mtime,
        body_hint=body_hint,
        notes=(
            f"persona={persona.name}",
            f"class={content_class.value}",
            trailing_note,
        ),
    )
|
||||
|
||||
|
||||
def _edit_plan(
    candidate: dict[str, Any],
    now: datetime,
    rng: secrets.SystemRandom,
) -> Optional[Plan]:
    """Build an edit-action :class:`Plan` from a synthetic_files row.

    The candidate dict is the shape :meth:`BaseRepository.list_synthetic_files`
    returns — we only need ``decky_uuid``, ``path``, ``persona``,
    ``content_class``, ``last_body``, ``uuid``.

    Args:
        candidate: The row to edit. ``decky_uuid`` and ``path`` are
            required; the remaining keys may be absent or ``None``.
        now: Upper bound for the sampled edit mtime.
        rng: RNG forwarded to :func:`sample_mtime`.

    Returns:
        The edit Plan, or ``None`` if the candidate's content_class is
        missing, unknown, a canary, or ``cache_tmp`` (defensive — the
        repo query already filters those out).

    Raises:
        KeyError: If ``decky_uuid`` or ``path`` is missing from the row.
    """
    try:
        cls = ContentClass(candidate["content_class"])
    except (KeyError, ValueError):
        return None
    if cls.is_canary() or cls == ContentClass.CACHE_TMP:
        return None

    # ``.get(key, default)`` only covers a *missing* key; a key that is
    # present with a null value would come back as None. For last_body
    # that trips Plan.__post_init__ (edit requires previous_body), so
    # coerce nulls to "" for every optional text field.
    persona = candidate.get("persona") or ""
    decky_name = candidate.get("decky_name") or ""
    previous_body = candidate.get("last_body") or ""
    file_uuid = candidate.get("uuid") or ""

    # mtime: edits bump forward by ~hours-to-days, but never past now.
    # We model as "the file was edited some time after creation but
    # before now" — sample_mtime with a tighter cap keeps it recent.
    edit_mtime = sample_mtime(
        "00:00-00:00", now, rand=rng,
        backdate_min_hours=1.0, backdate_max_days=2.0,
    )
    return Plan(
        decky_uuid=candidate["decky_uuid"],
        decky_name=decky_name,
        persona=persona,
        content_class=cls,
        action="edit",
        target_path=candidate["path"],
        mtime=edit_mtime,
        body_hint=None,  # edit uses previous_body, not a fresh hint
        previous_body=previous_body,
        notes=(
            f"persona={persona}",
            f"class={cls.value}",
            "action=edit",
            f"synthetic_file_uuid={file_uuid}",
        ),
    )
|
||||
9
decnet/realism/prompts/__init__.py
Normal file
9
decnet/realism/prompts/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
||||
"""Prompt builders for LLM-enriched content.
|
||||
|
||||
* :mod:`decnet.realism.prompts.email` — corporate-email body builder.
|
||||
|
||||
Stage 6 of the realism migration adds ``filebody.py``, ``filename.py``,
|
||||
and a ``_style.py`` helper so em-dash suppression sits in one place
|
||||
across email + file-class prompts.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
39
decnet/realism/prompts/_style.py
Normal file
39
decnet/realism/prompts/_style.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""Shared stylometric guards for LLM-bound prompts.
|
||||
|
||||
Lifted from the original ``orchestrator.emailgen.prompt`` em-dash
|
||||
block so file-class prompts (note / todo / draft / script bodies)
|
||||
pick up the same suppression. Per the
|
||||
``feedback_em_dash_llm_tell.md`` memory: em-dashes (—) are a strong
|
||||
LLM-authorship tell, suppress by default; allow only for personas
|
||||
explicitly opted in via ``EmailPersona.uses_llms_heavily``.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.realism.personas import EmailPersona
|
||||
|
||||
|
||||
# Instruction used for every persona that has not opted in to em-dashes.
_SUPPRESS_RULE = (
    "Do NOT use em-dashes (—). Use commas, periods, or "
    "parentheses instead. Em-dashes are a tell."
)
# Instruction used for personas flagged ``uses_llms_heavily``.
_ALLOW_RULE = (
    "Em-dashes are fine — this persona uses them naturally. "
    "Write in your usual style."
)


def em_dash_rule(persona: EmailPersona) -> str:
    """Pick the em-dash instruction line to embed in *persona*'s prompt.

    Personas with ``uses_llms_heavily`` set get the permissive wording;
    everyone else gets the suppression rule.
    """
    return _ALLOW_RULE if persona.uses_llms_heavily else _SUPPRESS_RULE
|
||||
|
||||
|
||||
def strip_em_dashes(text: str, persona: EmailPersona) -> str:
    """Replace em/en dashes in *text* with a comma for non-opted-in personas.

    Belt-and-braces: even with the prompt rule, small models leak
    em-dashes occasionally. Each dash (``—`` or ``–``), together with any
    spaces or tabs directly around it, becomes a single ``", "`` so both
    ``"a—b"`` and ``"a — b"`` read ``"a, b"`` rather than ``"a ,  b"``.
    Newlines next to a dash are left alone.

    Args:
        text: The generated body to clean.
        persona: The author; personas with ``uses_llms_heavily`` set are
            passed through unchanged.

    Returns:
        The cleaned text.
    """
    import re

    if persona.uses_llms_heavily:
        return text
    # Absorb horizontal whitespace on both sides so a spaced dash does not
    # leave a stray space before the comma or a double space after it.
    return re.sub(r"[ \t]*[—–][ \t]*", ", ", text)
|
||||
154
decnet/realism/prompts/email.py
Normal file
154
decnet/realism/prompts/email.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""Prompt builder for the email content class.
|
||||
|
||||
The LLM gets a tightly-scoped instruction and a small handful of
|
||||
deterministic constraints. Persona mannerisms are *pre-selected* in
|
||||
Python (1–2 of the persona's full list) and injected as hard rules —
|
||||
small models otherwise treat the mannerism list as flavour text and
|
||||
ignore it, and the corpus collapses into one voice.
|
||||
|
||||
**Em-dash suppression** is on by default; suppression is lifted only
|
||||
for personas that opt in via ``uses_llms_heavily``. Em-dashes are a
|
||||
strong stylometric tell for LLM-authored prose, and a honeypot mailbox
|
||||
where every author uses them is a tell. Stage 6 of the realism
|
||||
migration extracts the suppression block into a shared
|
||||
``decnet.realism.prompts._style`` helper so file-class prompts pick
|
||||
it up too.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import secrets
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
from decnet.realism.personas import EmailPersona
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PromptInputs:
    """Immutable bundle of everything the email prompt builder needs.

    ``parent_subject`` is set only when the email is a reply;
    ``parent_excerpt`` carries a short snippet of the last message in
    that thread.
    """

    sender: EmailPersona
    recipient: EmailPersona
    context_hint: str
    parent_subject: Optional[str] = None
    parent_excerpt: Optional[str] = None
|
||||
|
||||
|
||||
# ISO 639-1 code -> English language name used inside prompts.
_LANGUAGE_NAMES = {
    "en": "English", "es": "Spanish", "pt": "Portuguese",
    "fr": "French", "de": "German", "it": "Italian",
    "nl": "Dutch", "ja": "Japanese", "zh": "Chinese",
}


def _lang_label(code: str) -> str:
    """Map a language code to its display name, case-insensitively.

    Codes not present in the table are returned exactly as given.
    """
    key = code.lower()
    if key in _LANGUAGE_NAMES:
        return _LANGUAGE_NAMES[key]
    return code
|
||||
|
||||
|
||||
def select_mannerisms(
    persona: EmailPersona,
    *,
    rng: Optional[secrets.SystemRandom] = None,
    n: int = 2,
) -> list[str]:
    """Choose up to *n* of the persona's mannerisms.

    When the persona declares *n* or fewer, a copy of the full list is
    returned in its declared order. Otherwise the copy is shuffled with
    *rng* and the first *n* entries are kept, so a seeded RNG gives a
    reproducible selection — which is what makes mannerism injection
    verifiable in tests.
    """
    rnd = rng or secrets.SystemRandom()
    candidates = list(persona.mannerisms)
    if not candidates or len(candidates) <= n:
        return candidates
    rnd.shuffle(candidates)
    return candidates[:n]
|
||||
|
||||
|
||||
def build(
    inputs: PromptInputs,
    *,
    rng: Optional[secrets.SystemRandom] = None,
) -> tuple[str, list[str]]:
    """Assemble the email prompt and report which mannerisms went into it.

    Returns ``(prompt, mannerisms_used)``. ``mannerisms_used`` flows back
    into the persisted ``payload`` JSON so an analyst can see *why* a
    given email reads the way it does.
    """
    sender, recipient = inputs.sender, inputs.recipient
    language = _lang_label(sender.language or "en")

    mannerisms = select_mannerisms(sender, rng=rng)
    if mannerisms:
        mannerism_block = "\n".join(f"- {m}" for m in mannerisms)
    else:
        mannerism_block = "- (no specific mannerisms; write in the persona's tone)"

    # Em-dashes are suppressed unless the sender has opted in.
    em_dash_rule = (
        "Em-dashes are fine — this persona uses them naturally. "
        "Write in your usual style."
        if sender.uses_llms_heavily
        else "Do NOT use em-dashes (—). Use commas, periods, or "
        "parentheses instead. Em-dashes are a tell."
    )

    if sender.signature:
        sig_block = f"Use this exact signature block:\n{sender.signature}"
    else:
        sig_block = "End with a short, plausible signature for the persona's role."

    is_reply = bool(inputs.parent_subject)
    if is_reply:
        thread_block = (
            f"This is a REPLY in an ongoing thread.\n"
            f"- Parent subject: {inputs.parent_subject}\n"
            f"- Parent excerpt: {inputs.parent_excerpt or '(no excerpt)'}\n"
            f"- Begin the body assuming the recipient already read the parent.\n"
        )
        subject_rule = (
            "Subject must be the parent subject prefixed with 'Re: ' "
            "(no double 'Re: Re:')."
        )
    else:
        thread_block = "This is a NEW thread (no prior context)."
        subject_rule = (
            "Generate a short, specific subject line (≤ 80 chars) "
            "appropriate to the context."
        )

    # A custom tone overrides the enum label only when text was provided.
    if sender.tone == "custom" and sender.tone_custom:
        tone = sender.tone_custom
    else:
        tone = sender.tone

    prompt_lines = [
        "You are writing one corporate email, RFC 2822 plain-text body only.",
        "",
        "Persona — sender:",
        f"- Name: {sender.name}",
        f"- Role: {sender.role}",
        f"- Tone: {tone}",
        "- Mannerisms (must show through):",
        mannerism_block,
        "",
        "Persona — recipient:",
        f"- Name: {recipient.name}",
        f"- Role: {recipient.role}",
        "",
        f"Context hint: {inputs.context_hint}",
        "",
        "Thread context:",
        thread_block,
        "",
        "Hard rules:",
        f"1. Write the email body in {language}. Do not translate or code-switch.",
        f"2. {em_dash_rule}",
        f"3. {subject_rule}",
        f"4. {sig_block}",
        '5. Output ONLY the email — first line is "Subject: <subject>", '
        "then a blank line, then the body. No commentary, no markdown "
        "fences, no preamble.",
    ]
    return "\n".join(prompt_lines).strip(), mannerisms
|
||||
91
decnet/realism/prompts/filebody.py
Normal file
91
decnet/realism/prompts/filebody.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""Class-conditioned prompt builder for user-class file bodies.
|
||||
|
||||
Stage 6 of the realism migration. Only user-classes (``note``,
|
||||
``todo``, ``draft``, ``script``) get LLM enrichment — system-class
|
||||
content (cron logs, daemon logs, /tmp caches) is *supposed* to look
|
||||
formulaic, and an LLM-authored cron log is more suspicious than a
|
||||
templated one.
|
||||
|
||||
The prompt asks for *short* output (LLM-authored ten-page essays in
|
||||
``~/notes.txt`` are an instant tell) and pins the exit shape so the
|
||||
worker doesn't need to scrape boilerplate. Em-dash suppression
|
||||
flows through :mod:`decnet.realism.prompts._style`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.realism.personas import EmailPersona
|
||||
from decnet.realism.prompts._style import em_dash_rule
|
||||
from decnet.realism.taxonomy import ContentClass
|
||||
|
||||
|
||||
# ISO 639-1 code -> English language name used inside prompts.
_LANGUAGE_NAMES = {
    "en": "English",
    "es": "Spanish",
    "pt": "Portuguese",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "nl": "Dutch",
    "ja": "Japanese",
    "zh": "Chinese",
}


def _lang_label(code: str) -> str:
    """Map a language code to its display name, case-insensitively.

    A falsy *code* (``None`` or ``""``) is treated as ``"en"``. A
    non-empty code that is not in the table is returned as given.
    """
    key = (code or "en").lower()
    try:
        return _LANGUAGE_NAMES[key]
    except KeyError:
        return code or "English"
|
||||
|
||||
|
||||
# Per-class guidance text that build() embeds on the prompt's "Guidance:"
# line. Only the four user classes have an entry; build() raises KeyError
# for any class missing from this table.
_CLASS_GUIDANCE: dict[ContentClass, str] = {
    ContentClass.NOTE: (
        "A personal note file the persona keeps on their dev box. "
        "2–6 short lines. Mix of TODOs, half-formed thoughts, "
        "shorthand reminders. NOT a polished document. No headers "
        "or markdown sections."
    ),
    ContentClass.TODO: (
        "A markdown TODO list the persona keeps on their dev box. "
        "3–8 items in `- [ ] item` / `- [x] item` form. Some checked, "
        "some not. Items are short, work-flavoured, lowercase, no "
        "prose paragraphs. No headers. No introductory sentence."
    ),
    ContentClass.DRAFT: (
        "A short draft email or memo the persona is working on. "
        "2–4 short paragraphs, conversational tone. No subject line, "
        "no headers — this is the body in a notes file, not a sent "
        "email. Sign off the way the persona would in their voice."
    ),
    ContentClass.SCRIPT: (
        "A short utility script the persona wrote. Pick a plausible "
        "interpreter (bash or python3) and start with the matching "
        "shebang. 10–25 lines. Real-feeling intent (a backup, a "
        "log rotation, a cleanup). Inline comments allowed but sparse."
    ),
}
|
||||
|
||||
|
||||
def build(
    content_class: ContentClass,
    persona: EmailPersona,
) -> str:
    """Compose the LLM prompt for one *content_class* file body by *persona*.

    The model is expected to answer with *just the file body* — no
    commentary, no markdown fences. The caller still runs the output
    through :func:`decnet.realism.prompts._style.strip_em_dashes` as a
    belt-and-braces guard.

    Raises ``KeyError`` when *content_class* has no registered guidance.
    """
    if content_class not in _CLASS_GUIDANCE:
        raise KeyError(
            f"no filebody prompt registered for content_class={content_class!r}"
        )
    guidance = _CLASS_GUIDANCE[content_class]
    language = _lang_label(persona.language or "en")

    # A custom tone overrides the enum label only when text was provided.
    if persona.tone == "custom" and persona.tone_custom:
        tone = persona.tone_custom
    else:
        tone = persona.tone

    prompt_lines = [
        "You are writing one short file the persona below would "
        "plausibly keep on their dev box.",
        "",
        "Persona:",
        f"- Name: {persona.name}",
        f"- Role: {persona.role}",
        f"- Tone: {tone}",
        "",
        f"File class: {content_class.value}",
        f"Guidance: {guidance}",
        "",
        "Hard rules:",
        f"1. Write the file body in {language}. Do not translate or code-switch.",
        f"2. {em_dash_rule(persona)}",
        "3. Output ONLY the file body. No commentary, no markdown "
        " fences, no preamble like 'Here is the file:'.",
    ]
    return "\n".join(prompt_lines).strip()
|
||||
150
decnet/realism/taxonomy.py
Normal file
150
decnet/realism/taxonomy.py
Normal file
@@ -0,0 +1,150 @@
|
||||
"""Content classes and the :class:`Plan` dataclass.
|
||||
|
||||
The planner emits :class:`Plan` instances; drivers consume them. Every
|
||||
planted artifact (inert noise file, email, callback-bearing canary)
|
||||
maps to exactly one :class:`ContentClass` member, which is what the
|
||||
realism engine uses to dispatch to the right namer / body generator /
|
||||
prompt template.
|
||||
|
||||
Categories:
|
||||
|
||||
* **User content** (LLM-eligible): ``note``, ``todo``, ``draft``,
|
||||
``script``. Created by humans on workstations; LLM enrichment makes
|
||||
them feel lived-in.
|
||||
* **System content** (deterministic only): ``log_cron``, ``log_daemon``,
|
||||
``cache_tmp``. These are *supposed* to look formulaic — that's how
|
||||
cron/journald actually write them. LLM here would harm realism.
|
||||
* **Email** (LLM-eligible): one persona writing to another. Owned by
|
||||
the email driver, not the file driver.
|
||||
* **Canary** (deterministic, callback-bearing): one ``canary_*`` member
|
||||
per :mod:`decnet.canary.factory.KNOWN_GENERATORS` entry. Picked
|
||||
rarely and rate-limited per-decky by the planner.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from enum import StrEnum
|
||||
from typing import Literal, Optional
|
||||
|
||||
|
||||
class ContentClass(StrEnum):
|
||||
"""The kind of artifact a planner has decided to produce.
|
||||
|
||||
Values are stable over the wire — they're persisted on
|
||||
``synthetic_files.content_class`` and used as bus-event discriminants
|
||||
so renaming a member is a schema change. Add new members at the
|
||||
bottom; never reorder.
|
||||
"""
|
||||
|
||||
# User-generated, LLM-enrichable
|
||||
NOTE = "note"
|
||||
TODO = "todo"
|
||||
DRAFT = "draft"
|
||||
SCRIPT = "script"
|
||||
|
||||
# System-generated, template-only (LLM would harm realism)
|
||||
LOG_CRON = "log_cron"
|
||||
LOG_DAEMON = "log_daemon"
|
||||
CACHE_TMP = "cache_tmp"
|
||||
|
||||
# Email — owned by the email driver, planner picks the action shape
|
||||
EMAIL = "email"
|
||||
|
||||
# Callback-bearing — provided by decnet.canary.cultivator at
|
||||
# dispatch time, not by realism.bodies. One member per generator
|
||||
# in decnet.canary.factory.KNOWN_GENERATORS.
|
||||
CANARY_AWS_CREDS = "canary_aws_creds"
|
||||
CANARY_ENV_FILE = "canary_env_file"
|
||||
CANARY_GIT_CONFIG = "canary_git_config"
|
||||
CANARY_SSH_KEY = "canary_ssh_key"
|
||||
CANARY_HONEYDOC = "canary_honeydoc"
|
||||
CANARY_HONEYDOC_DOCX = "canary_honeydoc_docx"
|
||||
CANARY_HONEYDOC_PDF = "canary_honeydoc_pdf"
|
||||
CANARY_MYSQL_DUMP = "canary_mysql_dump"
|
||||
|
||||
def is_canary(self) -> bool:
|
||||
return self.value.startswith("canary_")
|
||||
|
||||
def is_user_class(self) -> bool:
|
||||
return self in (
|
||||
ContentClass.NOTE,
|
||||
ContentClass.TODO,
|
||||
ContentClass.DRAFT,
|
||||
ContentClass.SCRIPT,
|
||||
)
|
||||
|
||||
def is_system_class(self) -> bool:
|
||||
return self in (
|
||||
ContentClass.LOG_CRON,
|
||||
ContentClass.LOG_DAEMON,
|
||||
ContentClass.CACHE_TMP,
|
||||
)
|
||||
|
||||
|
||||
# The three action shapes a Plan may carry.
PlanAction = Literal["create", "edit", "rotate"]


@dataclass(frozen=True)
class Plan:
    """A single realism decision: what to do, where, as whom, and when.

    Instances are frozen so one Plan can be handed to several consumers
    (e.g. orchestrator dispatcher + canary cultivator) without any of
    them altering what the others see.

    Attributes
    ----------
    decky_uuid, decky_name :
        The target decky. Both are carried so drivers can map UUID →
        container name without a repo round-trip.
    persona :
        Persona name (``EmailPersona.name``) — the user the action is
        "performed by," sampled from the topology's persona pool at plan
        time.
    content_class :
        :class:`ContentClass` member; drives namer/body dispatch.
    action :
        ``"create"`` mints a new artifact. ``"edit"`` mutates a
        previously-planted one (read-modify-write, so
        :attr:`previous_body` is required). ``"rotate"`` is the
        log-rotation shape (``cron.log`` → ``cron.log.1``).
    target_path :
        Absolute container-side path the driver should write, already
        persona-aware (e.g. ``/home/admin/TODO.md``, not
        ``/home/{user}/TODO.md``).
    mtime :
        Backdated wall-clock the driver should ``touch -d`` after
        writing, sampled by :func:`decnet.realism.diurnal.sample_mtime`
        so files don't all stamp at the moment they were created.
    body_hint :
        Deterministic body the engine has *already* committed to. LLM
        enrichment, when enabled, may replace it; on timeout/failure the
        driver falls back to this so the tick never blocks unboundedly.
    previous_body :
        Required for ``action="edit"``: the prior content of the file,
        passed to :func:`decnet.realism.bodies.next_iteration`.
    notes :
        Free-form ``key=value`` annotations describing how the plan was
        chosen.
    """

    decky_uuid: str
    decky_name: str
    persona: str
    content_class: ContentClass
    action: PlanAction
    target_path: str
    mtime: datetime
    body_hint: Optional[str] = None
    previous_body: Optional[str] = None
    notes: tuple[str, ...] = field(default_factory=tuple)

    def __post_init__(self) -> None:
        """Reject an edit Plan that was built without the prior body."""
        if self.action != "edit":
            return
        if self.previous_body is not None:
            return
        # Belt-and-braces: without the prior body the driver would have
        # to either make a second docker exec to re-read the file or
        # silently degrade to create. Both are bad, so fail loudly at
        # construction.
        raise ValueError(
            "Plan.action='edit' requires previous_body; got None"
        )
|
||||
Reference in New Issue
Block a user