feat(realism): scaffold decnet/realism/ library

Empty subpackage skeleton for the realism migration: ContentClass enum
(file/email/canary content categories), Plan dataclass (frozen, with
edit-action invariant), in_work_hours window check (wrap-around
supported, fail-open on parse error), and sample_mtime for backdated
file timestamps that snap into a persona's active hours.

Stage 1 of the orchestrator+canary realism unification — no
production caller wired yet; planner.pick is a stub returning None
until stage 3.
This commit is contained in:
2026-04-27 15:55:21 -04:00
parent 6376523923
commit f57c621117
10 changed files with 629 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
"""Realism library — synthetic content + scheduling primitives.
A shared, importable library that produces *plausible* artifacts (file
names, file bodies, email content) and the diurnal/persona machinery
that decides *when* and *for whom* to produce them.
Workers (orchestrator, canary cultivator, future-emailgen-equivalents)
import from here. This package owns:
* :mod:`decnet.realism.taxonomy` — :class:`ContentClass` enum and the
:class:`Plan` dataclass that planners emit.
* :mod:`decnet.realism.diurnal` — work-hours gating and a backdated
``mtime`` sampler so planted files don't all stamp at wall-clock-now.
* :mod:`decnet.realism.planner` — picks ``(decky, persona, class,
action, mtime)`` tuples for the orchestrator's tick loop.
* :mod:`decnet.realism.personas` — persona schema (lifted from
``orchestrator.emailgen.personas`` in stage 2 of the migration).
* :mod:`decnet.realism.prompts` — prompt builders, one per content
class, sharing an em-dash-suppression style helper.
* :mod:`decnet.realism.llm` — :class:`LLMBackend` ABC + factory + impl
subpackage; pluggable text-generation backend.
The library has **no worker, no systemd unit, no CLI of its own** —
it's plain Python that consumers import. The CLI surface that does
exist (``decnet realism import-personas``) is registered by
:mod:`decnet.cli.realism` after stage 5 of the migration.
"""

153
decnet/realism/diurnal.py Normal file
View File

@@ -0,0 +1,153 @@
"""Work-hours gating and backdated mtime sampling.
The current orchestrator stamps every planted file at wall-clock-now,
which is one of the realism failures driving this migration: a `cron.log`
that says it was last touched at 03:14:22 UTC on a workstation
attributed to a 9-to-5 admin reads as fake on first glance.
Two helpers:
* :func:`in_work_hours` — gate planner ticks so a persona's files only
appear inside the persona's ``active_hours`` window. Wrap-around
windows (``"22:00-06:00"``) are supported.
* :func:`sample_mtime` — return a backdated datetime whose hour-of-day
falls inside the persona's window, biased toward "recent but not
now". Drivers pass this to ``touch -d``.
Clock and RNG are injectable so tests don't need to ``freeze_time`` or
patch :mod:`secrets`.
"""
from __future__ import annotations
import secrets
from datetime import datetime, timedelta
from typing import Protocol
class _ClockLike(Protocol):
def __call__(self) -> datetime: ...
class _RandLike(Protocol):
def random(self) -> float: ...
def randint(self, a: int, b: int) -> int: ...
def _parse_window(window: str) -> tuple[int, int, int, int] | None:
"""Parse ``"HH:MM-HH:MM"`` into ``(start_h, start_m, end_h, end_m)``.
Returns ``None`` for malformed input — callers treat that as
"always-on" so a single config typo never silences the whole fleet
(mirrors :func:`decnet.orchestrator.emailgen.personas.in_active_hours`
semantics).
"""
try:
start_s, end_s = window.split("-")
start_h, start_m = (int(p) for p in start_s.split(":"))
end_h, end_m = (int(p) for p in end_s.split(":"))
except (ValueError, IndexError):
return None
if not (0 <= start_h < 24 and 0 <= end_h < 24):
return None
if not (0 <= start_m < 60 and 0 <= end_m < 60):
return None
return start_h, start_m, end_h, end_m
def in_work_hours(window: str, now: datetime) -> bool:
    """Tell whether *now*'s clock time lies inside the persona window.

    *window* has the form ``"HH:MM-HH:MM"``. The start is inclusive and
    the end exclusive. A start later than the end denotes a window that
    spans midnight. A window whose start equals its end, and any window
    that cannot be parsed, is treated as always-on (fail-open, so a typo
    doesn't silence the fleet).

    Only ``now.hour`` and ``now.minute`` are consulted.
    """
    bounds = _parse_window(window)
    if bounds is None:
        return True

    open_h, open_m, close_h, close_m = bounds
    opens_at = open_h * 60 + open_m
    closes_at = close_h * 60 + close_m
    if opens_at == closes_at:
        return True

    minute_of_day = now.hour * 60 + now.minute
    after_open = minute_of_day >= opens_at
    before_close = minute_of_day < closes_at

    if opens_at < closes_at:
        # Ordinary same-day window: both bounds must hold.
        return after_open and before_close
    # Midnight-spanning window (e.g. 22:00-06:00): either segment counts.
    return after_open or before_close
def sample_mtime(
    window: str,
    now: datetime,
    *,
    rand: _RandLike | None = None,
    backdate_min_hours: float = 0.5,
    backdate_max_days: float = 14.0,
) -> datetime:
    """Return a backdated ``datetime`` for ``touch -d`` after a write.

    The age of the result is drawn from an exponential-shaped
    distribution (``-ln(u) * 12`` hours, so the bulk of samples are under
    a day old), then clipped to lie between *backdate_min_hours* and
    *backdate_max_days*. If the two limits conflict, the minimum wins.

    When the resulting clock time is outside *window*, the time-of-day is
    re-drawn inside the window on the same calendar date, so an `admin`
    persona's `TODO.md` doesn't carry an mtime of 03:14:22. For a
    midnight-spanning window the longer of its two segments is used.
    Afterwards the date is adjusted in whole days so that:

    * the result is never more recent than ``now - backdate_min_hours``
      (always enforced), and
    * the result is not older than ``now - backdate_max_days`` whenever a
      one-day shift can achieve that without breaking the floor
      (best-effort; a very narrow window combined with a very small cap
      may leave no valid slot).

    If *window* is malformed, or its start equals its end, the clipped
    backdate is returned as-is with no snapping.

    Parameters
    ----------
    window :
        Persona active hours, ``"HH:MM-HH:MM"``.
    now :
        Reference time. The result is derived from it by ``timedelta``
        arithmetic and ``replace()``, so it keeps *now*'s ``tzinfo``.
    rand :
        RNG providing ``random()`` and ``randint()``. Defaults to a fresh
        :class:`secrets.SystemRandom`. The snap draws happen only when a
        snap is actually needed.
    backdate_min_hours :
        Minimum age of the result, in hours.
    backdate_max_days :
        Maximum age of the result, in days.
    """
    import math  # function-scope: nothing else in this block needs it

    # Identity check, not truthiness: a caller-supplied RNG must never be
    # swapped out just because the object happens to evaluate falsy.
    rng = rand if rand is not None else secrets.SystemRandom()

    # Exponential-ish backdate via -ln(u): heavier mass near "recent".
    # Cap by clipping; cheap and good enough for realism.
    u = max(rng.random(), 1e-6)  # avoid log(0)
    span_hours = max(
        backdate_min_hours,
        min(backdate_max_days * 24, -math.log(u) * 12.0),
    )
    candidate = now - timedelta(hours=span_hours)

    parsed = _parse_window(window)
    if parsed is None:
        return candidate
    start_h, start_m, end_h, end_m = parsed
    start = start_h * 60 + start_m
    end = end_h * 60 + end_m
    if start == end:
        # Degenerate window means "no schedule": nothing to snap into.
        return candidate

    cur = candidate.hour * 60 + candidate.minute
    if start < end:
        in_window = start <= cur < end
        snap_lo, snap_hi = start, end - 1
    else:
        # Wrap-around: in-window if cur is in either segment. Snap into
        # the larger of the two segments by total length.
        in_window = cur >= start or cur < end
        if (24 * 60) - start >= end:
            snap_lo, snap_hi = start, 24 * 60 - 1
        else:
            snap_lo, snap_hi = 0, end - 1
    if in_window:
        return candidate

    # Snap onto the candidate's own calendar date — preserves the "this
    # many days ago" feel while making the clock-face credible.
    snap_minutes = rng.randint(snap_lo, snap_hi)
    snapped = candidate.replace(
        hour=snap_minutes // 60,
        minute=snap_minutes % 60,
        second=rng.randint(0, 59),
        microsecond=0,
    )

    one_day = timedelta(days=1)

    # The snap may have landed later than the candidate — possibly at or
    # after *now*. Walk back whole days until the min-backdate floor holds.
    floor = now - timedelta(hours=backdate_min_hours)
    while snapped > floor:
        snapped -= one_day

    # Conversely the snap (or the walk-back above) may have landed older
    # than the cap. Pull forward one day when that does not violate the
    # floor; the time-of-day, and so window membership, is unaffected.
    oldest = now - timedelta(days=backdate_max_days)
    if snapped < oldest and snapped + one_day <= floor:
        snapped += one_day
    return snapped

View File

@@ -0,0 +1,8 @@
"""LLM backend ABC + factory + impls.
Populated in stage 2 of the realism migration: lifts the existing
``orchestrator.emailgen.llm`` subpackage as-is (``base``, ``factory``,
``impl/ollama``, ``impl/fake``). Stage 6 adds ``circuit.py`` for
cross-call breaker behaviour.
"""
from __future__ import annotations

View File

@@ -0,0 +1,9 @@
"""Persona schema — placeholder for stage 2.
In stage 2 of the realism migration, this module receives the real
persona schema currently living at
``decnet.orchestrator.emailgen.personas`` (``EmailPersona``,
``parse_personas``, ``in_active_hours``). Stage 1 keeps it empty so
the import path is reserved without behaviour.
"""
from __future__ import annotations

53
decnet/realism/planner.py Normal file
View File

@@ -0,0 +1,53 @@
"""Realism planner — picks the next ``(decky, persona, class, action)`` tuple.
Stage-1 stub: the public signature is in place so the orchestrator
worker (stage 3) can import it, but the body returns ``None`` ("nothing
to do this tick") until stage 3 wires the synthetic_files table and
naming/body generators.
The eventual policy lives entirely in :func:`pick`; downstream
consumers should not branch on ``ContentClass`` themselves — let the
planner decide weights and rate-limits in one place.
"""
from __future__ import annotations
import secrets
from datetime import datetime
from typing import Any, Optional, Sequence
from decnet.realism.taxonomy import Plan
def pick(
    deckies: Sequence[dict[str, Any]],
    now: datetime,
    *,
    repo: Any = None,
    rand: Optional[secrets.SystemRandom] = None,
) -> Optional[Plan]:
    """Choose the :class:`Plan` for this orchestrator tick, if any.

    Stage-1 placeholder: every call returns ``None`` ("nothing to do this
    tick") and none of the arguments are consulted. The signature exists
    so the orchestrator can import the function ahead of the real
    implementation. The intended policy (diurnal gate, 60/30/10
    create/edit/leave action split, content-class weights, canary
    rate-limit) is scheduled for stage 3 of the realism migration.

    Parameters
    ----------
    deckies :
        Output of :meth:`BaseRepository.list_running_deckies`. Each
        entry must carry ``uuid``, ``name``, ``services``,
        ``email_personas`` (topology-pool JSON or list).
    now :
        Tick timestamp, passed in so tests control the clock without
        monkey-patching.
    repo :
        :class:`BaseRepository` for synthetic_files lookup (edit
        action). Optional in stage 1; required from stage 3 onward.
    rand :
        RNG for sampling. Intended default is a fresh
        :class:`secrets.SystemRandom`.

    Returns
    -------
    ``None`` unconditionally in stage 1.
    """
    # Drop the references explicitly: keeps unused-argument linters quiet
    # until stage 3 gives these parameters real work to do.
    del deckies, now, repo, rand
    return None

View File

@@ -0,0 +1,7 @@
"""Prompt builders for LLM-enriched content.
Populated in stage 2 (``email.py`` lifted from
``orchestrator.emailgen.prompt``) and stage 6 (``filebody.py``,
``filename.py``, ``_style.py`` for em-dash suppression).
"""
from __future__ import annotations

150
decnet/realism/taxonomy.py Normal file
View File

@@ -0,0 +1,150 @@
"""Content classes and the :class:`Plan` dataclass.
The planner emits :class:`Plan` instances; drivers consume them. Every
planted artifact (inert noise file, email, callback-bearing canary)
maps to exactly one :class:`ContentClass` member, which is what the
realism engine uses to dispatch to the right namer / body generator /
prompt template.
Categories:
* **User content** (LLM-eligible): ``note``, ``todo``, ``draft``,
``script``. Created by humans on workstations; LLM enrichment makes
them feel lived-in.
* **System content** (deterministic only): ``log_cron``, ``log_daemon``,
``cache_tmp``. These are *supposed* to look formulaic — that's how
cron/journald actually write them. LLM here would harm realism.
* **Email** (LLM-eligible): one persona writing to another. Owned by
the email driver, not the file driver.
* **Canary** (deterministic, callback-bearing): one ``canary_*`` member
per :mod:`decnet.canary.factory.KNOWN_GENERATORS` entry. Picked
rarely and rate-limited per-decky by the planner.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime
from enum import StrEnum
from typing import Literal, Optional
class ContentClass(StrEnum):
"""The kind of artifact a planner has decided to produce.
Values are stable over the wire — they're persisted on
``synthetic_files.content_class`` and used as bus-event discriminants
so renaming a member is a schema change. Add new members at the
bottom; never reorder.
"""
# User-generated, LLM-enrichable
NOTE = "note"
TODO = "todo"
DRAFT = "draft"
SCRIPT = "script"
# System-generated, template-only (LLM would harm realism)
LOG_CRON = "log_cron"
LOG_DAEMON = "log_daemon"
CACHE_TMP = "cache_tmp"
# Email — owned by the email driver, planner picks the action shape
EMAIL = "email"
# Callback-bearing — provided by decnet.canary.cultivator at
# dispatch time, not by realism.bodies. One member per generator
# in decnet.canary.factory.KNOWN_GENERATORS.
CANARY_AWS_CREDS = "canary_aws_creds"
CANARY_ENV_FILE = "canary_env_file"
CANARY_GIT_CONFIG = "canary_git_config"
CANARY_SSH_KEY = "canary_ssh_key"
CANARY_HONEYDOC = "canary_honeydoc"
CANARY_HONEYDOC_DOCX = "canary_honeydoc_docx"
CANARY_HONEYDOC_PDF = "canary_honeydoc_pdf"
CANARY_MYSQL_DUMP = "canary_mysql_dump"
def is_canary(self) -> bool:
return self.value.startswith("canary_")
def is_user_class(self) -> bool:
return self in (
ContentClass.NOTE,
ContentClass.TODO,
ContentClass.DRAFT,
ContentClass.SCRIPT,
)
def is_system_class(self) -> bool:
return self in (
ContentClass.LOG_CRON,
ContentClass.LOG_DAEMON,
ContentClass.CACHE_TMP,
)
# The closed set of action shapes a Plan may carry.
PlanAction = Literal["create", "edit", "rotate"]


@dataclass(frozen=True)
class Plan:
    """A single realism decision: what to do, where, as whom, and when.

    Instances are immutable, so one Plan can be handed to several
    consumers (e.g. orchestrator dispatcher + canary cultivator) without
    any of them altering what the others see.

    Attributes
    ----------
    decky_uuid, decky_name :
        The target decky. Both are carried so drivers can address the
        container without a repo round-trip from UUID to name.
    persona :
        Persona name (``EmailPersona.name``) — the user the action is
        "performed by". Sampled from the topology's persona pool at
        plan time.
    content_class :
        :class:`ContentClass` member that drives namer/body dispatch.
    action :
        ``"create"`` mints a new artifact. ``"edit"`` mutates a
        previously planted one (read-modify-write) and therefore
        requires :attr:`previous_body`. ``"rotate"`` is the log-rotation
        shape (``cron.log`` → ``cron.log.1``).
    target_path :
        Absolute container-side path to write, already resolved for the
        persona (``/home/admin/TODO.md``, not ``/home/{user}/TODO.md``).
    mtime :
        Backdated wall-clock time the driver should ``touch -d`` after
        writing; sampled by :func:`decnet.realism.diurnal.sample_mtime`.
    body_hint :
        Deterministic body the engine has already committed to. LLM
        enrichment may replace it, but on timeout/failure the driver
        falls back to it so a tick never blocks unboundedly.
    previous_body :
        Mandatory when ``action == "edit"``: the content read back from
        the decky before mutation, passed on to
        :func:`decnet.realism.bodies.next_iteration`.
    notes :
        Tuple of strings, empty by default. Its meaning is not defined
        in this module — TODO confirm intended use with the planner.

    Raises
    ------
    ValueError
        On construction, if ``action`` is ``"edit"`` and
        ``previous_body`` is ``None``.
    """

    decky_uuid: str
    decky_name: str
    persona: str
    content_class: ContentClass
    action: PlanAction
    target_path: str
    mtime: datetime
    body_hint: Optional[str] = None
    previous_body: Optional[str] = None
    notes: tuple[str, ...] = field(default_factory=tuple)

    def __post_init__(self) -> None:
        """Enforce the edit invariant at construction time."""
        if self.action != "edit":
            return
        if self.previous_body is not None:
            return
        # An edit Plan without the prior body would force the driver to
        # either run a second docker exec to re-read the file or quietly
        # degrade to a create. Neither is acceptable, so fail loudly here.
        raise ValueError(
            "Plan.action='edit' requires previous_body; got None"
        )

View File

View File

@@ -0,0 +1,120 @@
"""Coverage for :mod:`decnet.realism.diurnal`.
Two functions to exercise:
* :func:`in_work_hours` — straightforward window membership including
the wrap-around (``22:00-06:00``) case and the fail-open behaviour
on malformed windows.
* :func:`sample_mtime` — must (a) return a ``datetime`` strictly in
the past, (b) clip to the configured backdate cap, and (c) snap the
hour-of-day into the persona's window when the unconstrained
candidate would land outside.
"""
from __future__ import annotations
import random
from datetime import datetime, timedelta, timezone
import pytest
from decnet.realism.diurnal import in_work_hours, sample_mtime
# Fixed 'now' for reproducible tests — Monday 2026-04-27 14:00 UTC.
_NOW = datetime(2026, 4, 27, 14, 0, tzinfo=timezone.utc)
# ---- in_work_hours -----------------------------------------------------
@pytest.mark.parametrize(
    ("now_hour", "now_min", "window", "expected"),
    [
        # Plain daytime window: start inclusive, end exclusive.
        (10, 0, "09:00-18:00", True),
        (8, 59, "09:00-18:00", False),
        (9, 0, "09:00-18:00", True),
        (18, 0, "09:00-18:00", False),
        (17, 59, "09:00-18:00", True),
        # Midnight-spanning window: late segment, early segment, midday gap.
        (23, 30, "22:00-06:00", True),
        (3, 0, "22:00-06:00", True),
        (12, 0, "22:00-06:00", False),
    ],
)
def test_in_work_hours_window_membership(
    now_hour: int, now_min: int, window: str, expected: bool,
) -> None:
    """Membership check across ordinary and wrap-around windows."""
    moment = _NOW.replace(hour=now_hour, minute=now_min)
    verdict = in_work_hours(window, moment)
    assert verdict is expected
def test_in_work_hours_equal_start_end_means_always_on() -> None:
    """A ``00:00-00:00`` window reads as "no schedule", never as "never"."""
    degenerate_window = "00:00-00:00"
    always_on = in_work_hours(degenerate_window, _NOW)
    assert always_on is True
@pytest.mark.parametrize(
    "garbage",
    [
        "not-a-window",
        "9-18",
        "09:00",
        "25:00-26:00",
        "09:00-18:99",
        "",
    ],
)
def test_malformed_window_fails_open(garbage: str) -> None:
    """An unparseable window must gate nothing.

    Same fail-open semantics as
    decnet.orchestrator.emailgen.personas.in_active_hours: a typo in
    config must not silence the fleet.
    """
    gate_open = in_work_hours(garbage, _NOW)
    assert gate_open is True
# ---- sample_mtime ------------------------------------------------------
def test_sample_mtime_is_in_the_past() -> None:
    """Every sample from a seeded RNG must precede the reference instant."""
    rng = random.Random(0)
    remaining = 20
    while remaining:
        remaining -= 1
        mt = sample_mtime("09:00-18:00", _NOW, rand=rng)
        assert mt < _NOW, f"sample_mtime returned future: {mt} >= {_NOW}"
def test_sample_mtime_respects_backdate_cap() -> None:
    """Sample ages stay between the configured floor and cap.

    The upper bound carries an hour of slack and the lower bound a
    second of slack.
    """
    cap_days = 7.0
    oldest_allowed = timedelta(days=cap_days) + timedelta(hours=1)
    youngest_allowed = timedelta(hours=0.5) - timedelta(seconds=1)

    rng = random.Random(0)
    for _ in range(50):
        age = _NOW - sample_mtime(
            "09:00-18:00",
            _NOW,
            rand=rng,
            backdate_max_days=cap_days,
            backdate_min_hours=0.5,
        )
        assert age <= oldest_allowed
        assert age >= youngest_allowed
def test_sample_mtime_snaps_hour_into_window() -> None:
    """With a daytime window, every sampled hour must land inside it."""
    window = "09:00-18:00"
    allowed_hours = range(9, 18)
    rng = random.Random(42)
    for _ in range(60):
        mt = sample_mtime(window, _NOW, rand=rng)
        assert mt.hour in allowed_hours, (
            f"hour {mt.hour} fell outside {window} on {mt.isoformat()}"
        )
def test_sample_mtime_handles_wrap_around_window() -> None:
    """For a 22:00-06:00 window no sample may land in the 06..21 gap."""
    rng = random.Random(123)
    for _ in range(40):
        mt = sample_mtime("22:00-06:00", _NOW, rand=rng)
        in_daytime_gap = 6 <= mt.hour < 22
        assert not in_daytime_gap, (
            f"hour {mt.hour} fell outside wrap window on {mt.isoformat()}"
        )
def test_sample_mtime_malformed_window_does_not_snap() -> None:
    """An unparseable window must neither crash nor future-stamp.

    Only "returns something in the past" is asserted here; the absence
    of snapping itself is not checked.
    """
    seeded = random.Random(0)
    mt = sample_mtime("garbage", _NOW, rand=seeded)
    assert mt < _NOW
def test_sample_mtime_is_deterministic_per_seed() -> None:
    """Two RNGs with the same seed must yield the same sample.

    Without this guarantee no test could assert anything tighter than
    "returns a datetime in the past".
    """
    first, second = (
        sample_mtime("09:00-18:00", _NOW, rand=random.Random(7))
        for _ in range(2)
    )
    assert first == second

View File

@@ -0,0 +1,102 @@
"""Coverage for :mod:`decnet.realism.taxonomy`.
The enum values are persisted on ``synthetic_files.content_class`` and
flow through bus topics — renaming a member is a schema change, so the
stable-list test pins the wire format. ``Plan`` invariants (frozen,
edit requires previous_body) are tested too because the planner relies
on construction-time validation rather than a separate validator pass.
"""
from __future__ import annotations
from datetime import datetime, timezone
import pytest
from decnet.realism.taxonomy import ContentClass, Plan
def test_content_class_values_are_stable() -> None:
    """Pin both the values and the declaration order of ``ContentClass``.

    The enum is wire-visible (``synthetic_files.content_class`` column,
    bus event payloads), so a rename, removal, addition, or reorder
    needs a schema bump elsewhere. An ordered list is compared rather
    than a set: iterating an enum yields members in definition order,
    and a set comparison would silently accept a reorder.
    """
    assert [c.value for c in ContentClass] == [
        "note", "todo", "draft", "script",
        "log_cron", "log_daemon", "cache_tmp",
        "email",
        "canary_aws_creds", "canary_env_file", "canary_git_config",
        "canary_ssh_key", "canary_honeydoc", "canary_honeydoc_docx",
        "canary_honeydoc_pdf", "canary_mysql_dump",
    ]
@pytest.mark.parametrize("name", ["NOTE", "TODO", "DRAFT", "SCRIPT"])
def test_user_classes_classified(name: str) -> None:
    """User-authored members are user-class and nothing else."""
    member = ContentClass[name]
    observed = (
        member.is_user_class(),
        member.is_system_class(),
        member.is_canary(),
    )
    assert observed == (True, False, False)
@pytest.mark.parametrize("name", ["LOG_CRON", "LOG_DAEMON", "CACHE_TMP"])
def test_system_classes_classified(name: str) -> None:
    """Machine-written members are system-class and nothing else."""
    member = ContentClass[name]
    observed = (
        member.is_system_class(),
        member.is_user_class(),
        member.is_canary(),
    )
    assert observed == (True, False, False)
def test_canary_members_all_classified() -> None:
    """Every ``CANARY_*`` member is a canary and nothing else.

    Members are selected by *name* rather than by value prefix.
    ``is_canary`` itself tests the value prefix, so selecting on the
    value would make the ``is_canary`` assertion unable to fail. Going
    through the name also catches a ``CANARY_*`` member whose value
    lacks the ``canary_`` prefix.
    """
    canaries = [c for c in ContentClass if c.name.startswith("CANARY_")]
    assert canaries, "expected at least one canary content_class"
    for c in canaries:
        assert c.is_canary()
        assert not c.is_user_class()
        assert not c.is_system_class()
def test_email_is_neither_user_nor_system_nor_canary() -> None:
    """Email sits on its own track, outside every file-class bucket.

    It shares the content engine but uses a different driver and a
    different table, so none of the classification helpers may claim it.
    """
    email = ContentClass.EMAIL
    assert email.value == "email"
    for claims_it in (
        email.is_user_class(),
        email.is_system_class(),
        email.is_canary(),
    ):
        assert not claims_it
def _plan(**kw):
    """Build a valid ``create`` Plan; keyword arguments override any field."""
    baseline = {
        "decky_uuid": "d-1",
        "decky_name": "alpha",
        "persona": "admin",
        "content_class": ContentClass.NOTE,
        "action": "create",
        "target_path": "/home/admin/notes.txt",
        "mtime": datetime(2026, 4, 25, 11, 30, tzinfo=timezone.utc),
        "body_hint": "todo: rotate keys",
    }
    # Later keys win, so caller overrides replace the baseline values.
    return Plan(**{**baseline, **kw})
def test_plan_is_frozen() -> None:
    """Assigning to a field of a frozen ``Plan`` must be rejected.

    Frozen dataclasses raise ``dataclasses.FrozenInstanceError``, which
    subclasses ``AttributeError``. Expecting ``AttributeError`` rather
    than bare ``Exception`` keeps the test from passing on an unrelated
    failure (for example an error raised while building the Plan would
    not be mistaken for immutability).
    """
    p = _plan()
    with pytest.raises(AttributeError):
        p.persona = "ubuntu"  # type: ignore[misc]
def test_edit_plan_requires_previous_body() -> None:
    """An edit Plan built without a prior body must raise ``ValueError``."""
    overrides = {"action": "edit", "previous_body": None}
    with pytest.raises(ValueError, match="previous_body"):
        _plan(**overrides)
def test_edit_plan_with_previous_body_succeeds() -> None:
    """Supplying the prior body makes an edit Plan constructible."""
    prior = "- [ ] rotate keys\n"
    edit_plan = _plan(action="edit", previous_body=prior)
    assert (edit_plan.action, edit_plan.previous_body) == (
        "edit",
        "- [ ] rotate keys\n",
    )
def test_create_plan_does_not_need_previous_body() -> None:
    """The previous-body invariant applies to edits only, not creates."""
    create_plan = _plan(action="create", previous_body=None)
    assert create_plan.previous_body is None
    assert create_plan.action == "create"