refactor(realism): enforce synthetic_files 64KB cap at the repo

The orchestrator worker clipped last_body at write time, but the repo
did not enforce the cap itself, so a future caller that forgot the clip
would write the full body. Move the clip into record_synthetic_file and
update_synthetic_file, using SYNTHETIC_FILE_BODY_LIMIT defined in
decnet/web/db/models/realism.py. The worker now passes the full body and
trusts the repo. Tests are retargeted to assert repo enforcement.
This commit is contained in:
2026-04-27 17:37:36 -04:00
parent b86129e35e
commit 7e9bc6d49a
4 changed files with 33 additions and 39 deletions

View File

@@ -385,7 +385,7 @@ async def _bump_synthetic_file_after_edit(repo, action, result) -> None:
patch["content_hash"] = hashlib.sha256( patch["content_hash"] = hashlib.sha256(
new_body.encode("utf-8"), new_body.encode("utf-8"),
).hexdigest() ).hexdigest()
patch["last_body"] = new_body[:65536] patch["last_body"] = new_body
await repo.update_synthetic_file(action.synthetic_file_uuid, patch) await repo.update_synthetic_file(action.synthetic_file_uuid, patch)
@@ -411,10 +411,7 @@ async def _record_synthetic_file(repo, action) -> None:
"last_modified": now, "last_modified": now,
"edit_count": 0, "edit_count": 0,
"content_hash": content_hash, "content_hash": content_hash,
# Cap the persisted body — large blobs (DOCX/PDF/canary "last_body": body,
# artifacts in stage 7) are wasted disk on this side; the
# decky filesystem holds the canonical bytes.
"last_body": body[:65536],
} }
try: try:
await repo.record_synthetic_file(row) await repo.record_synthetic_file(row)
@@ -432,7 +429,7 @@ async def _record_synthetic_file(repo, action) -> None:
{ {
"last_modified": now, "last_modified": now,
"content_hash": content_hash, "content_hash": content_hash,
"last_body": body[:65536], "last_body": body,
"edit_count": int(match.get("edit_count", 0)) + 1, "edit_count": int(match.get("edit_count", 0)) + 1,
}, },
) )

View File

@@ -22,6 +22,15 @@ from sqlalchemy import Column, Index, Text, UniqueConstraint
from sqlmodel import Field, SQLModel from sqlmodel import Field, SQLModel
SYNTHETIC_FILE_BODY_LIMIT = 65536
"""Cap on the persisted ``synthetic_files.last_body`` length.

The limit is applied as a plain slice (``last_body[:LIMIT]``), so for
``str`` bodies it counts characters, not encoded bytes — non-ASCII
content can occupy more than 64 KiB once UTF-8 encoded.

Enforced by the repo on both insert and update — callers may pass the
full body; the repo clips. Large blobs (DOCX/PDF, canary artifacts) are
wasted disk on the master side; the decky filesystem holds the canonical
bytes."""
class SyntheticFile(SQLModel, table=True): class SyntheticFile(SQLModel, table=True):
"""One realism-planted file on one decky. """One realism-planted file on one decky.

View File

@@ -3335,6 +3335,9 @@ class SQLModelRepository(BaseRepository):
# ------------------------------------------------------------ realism # ------------------------------------------------------------ realism
async def record_synthetic_file(self, data: dict[str, Any]) -> str: async def record_synthetic_file(self, data: dict[str, Any]) -> str:
from decnet.web.db.models.realism import SYNTHETIC_FILE_BODY_LIMIT
if "last_body" in data and data["last_body"] is not None:
data = {**data, "last_body": data["last_body"][:SYNTHETIC_FILE_BODY_LIMIT]}
async with self._session() as session: async with self._session() as session:
row = SyntheticFile(**data) row = SyntheticFile(**data)
session.add(row) session.add(row)
@@ -3345,6 +3348,9 @@ class SQLModelRepository(BaseRepository):
async def update_synthetic_file( async def update_synthetic_file(
self, row_uuid: str, data: dict[str, Any], self, row_uuid: str, data: dict[str, Any],
) -> None: ) -> None:
from decnet.web.db.models.realism import SYNTHETIC_FILE_BODY_LIMIT
if "last_body" in data and data["last_body"] is not None:
data = {**data, "last_body": data["last_body"][:SYNTHETIC_FILE_BODY_LIMIT]}
async with self._session() as session: async with self._session() as session:
stmt = ( stmt = (
update(SyntheticFile) update(SyntheticFile)

View File

@@ -1,16 +1,12 @@
"""``synthetic_files.last_body`` is capped at 64 KB. """``synthetic_files.last_body`` is capped at 64 KB by the repo.
The orchestrator caps the persisted body at 64 KB on every write The repo clips on both insert and update so callers may pass the full
(create + edit) so the table doesn't bloat with large blobs. This body. Large blobs (DOCX/PDF, canary artifacts) would bloat the table;
introduces a real edge: an EditAction whose ``previous_body`` is the decky filesystem holds the canonical bytes.
sourced from the cap (not the file on disk) sees truncated bytes.
Today the realism templates produce well under 64 KB, so the edge These tests pin the contract so a regression that drops the cap or
isn't reachable from the planted-content path. But a future change applies it inconsistently fails loudly. Note: callers pass the *full*
that lifts the cap, an LLM that returns a long body, or a body — the worker no longer clips; the repo does.
``honeydoc_pdf`` body cultivated through the realism path could all
hit it. These tests pin the contract so a regression that drops the
cap or applies it inconsistently fails loudly.
""" """
from __future__ import annotations from __future__ import annotations
@@ -19,10 +15,11 @@ from datetime import datetime, timezone
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from decnet.web.db.models.realism import SYNTHETIC_FILE_BODY_LIMIT
from decnet.web.db.sqlite.repository import SQLiteRepository from decnet.web.db.sqlite.repository import SQLiteRepository
_LIMIT = 65536 # decnet/orchestrator/worker.py uses [:65536] _LIMIT = SYNTHETIC_FILE_BODY_LIMIT
@pytest_asyncio.fixture @pytest_asyncio.fixture
@@ -44,16 +41,14 @@ def _row(body: str) -> dict:
"created_at": now, "created_at": now,
"last_modified": now, "last_modified": now,
"edit_count": 0, "edit_count": 0,
# The hash is over the *full* body in the orchestrator's write
# path; if the body comes from a row that was already truncated,
# the hash reflects the truncation. Tests check both paths.
"content_hash": hashlib.sha256(body.encode("utf-8")).hexdigest(), "content_hash": hashlib.sha256(body.encode("utf-8")).hexdigest(),
"last_body": body[:_LIMIT], # Caller passes the full body — the repo clips.
"last_body": body,
} }
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_oversized_body_is_truncated_at_write(repo): async def test_repo_clips_oversized_body_at_insert(repo):
body = "A" * (_LIMIT * 2) body = "A" * (_LIMIT * 2)
uuid = await repo.record_synthetic_file(_row(body)) uuid = await repo.record_synthetic_file(_row(body))
rows = await repo.list_synthetic_files(decky_uuid="d1") rows = await repo.list_synthetic_files(decky_uuid="d1")
@@ -65,8 +60,6 @@ async def test_oversized_body_is_truncated_at_write(repo):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_body_at_exact_limit_is_preserved(repo): async def test_body_at_exact_limit_is_preserved(repo):
"""Boundary: a body of exactly 64 KB must not be silently
truncated. Off-by-one regression target."""
body = "B" * _LIMIT body = "B" * _LIMIT
await repo.record_synthetic_file(_row(body)) await repo.record_synthetic_file(_row(body))
rows = await repo.list_synthetic_files(decky_uuid="d1") rows = await repo.list_synthetic_files(decky_uuid="d1")
@@ -74,27 +67,16 @@ async def test_body_at_exact_limit_is_preserved(repo):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_pick_for_edit_returns_truncated_body(repo): async def test_pick_for_edit_returns_clipped_body(repo):
"""Stage 3b contract: the edit candidate carries the *stored*
last_body — necessarily truncated when the original exceeded the
cap. Document the consequence so a future test author doesn't
expect the full body to round-trip."""
body = "C" * (_LIMIT * 3) body = "C" * (_LIMIT * 3)
await repo.record_synthetic_file(_row(body)) await repo.record_synthetic_file(_row(body))
candidate = await repo.pick_random_synthetic_file_for_edit("d1") candidate = await repo.pick_random_synthetic_file_for_edit("d1")
assert candidate is not None assert candidate is not None
assert len(candidate["last_body"]) == _LIMIT assert len(candidate["last_body"]) == _LIMIT
# The edit driver mutates this body via realism.bodies.next_iteration,
# so callers must accept they're editing a truncated snapshot of
# the file that's actually on the decky. This is documented
# behaviour pre-v1; if the cap rises, lift _LIMIT here too.
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_edit_path_keeps_cap(repo): async def test_repo_clips_oversized_body_at_update(repo):
"""An update_synthetic_file call that tries to write a >cap body
must clip to the cap on the way in. Mirrors the orchestrator
worker's ``last_body=body[:65536]`` line."""
uuid = await repo.record_synthetic_file(_row("seed")) uuid = await repo.record_synthetic_file(_row("seed"))
big = "D" * (_LIMIT * 4) big = "D" * (_LIMIT * 4)
await repo.update_synthetic_file( await repo.update_synthetic_file(
@@ -102,7 +84,7 @@ async def test_edit_path_keeps_cap(repo):
{ {
"last_modified": datetime.now(timezone.utc), "last_modified": datetime.now(timezone.utc),
"edit_count": 1, "edit_count": 1,
"last_body": big[:_LIMIT], # caller is responsible for clipping "last_body": big,
}, },
) )
rows = await repo.list_synthetic_files(decky_uuid="d1") rows = await repo.list_synthetic_files(decky_uuid="d1")