feat(db): add DeckyLifecycle table for async deploy/mutate tracking

One row per (decky, operation) attempt. State machine:
pending -> running -> succeeded | failed (+ error text). Rows are
immutable once they reach a terminal status; retries write a new row.

Sibling of DeckyShard rather than a rework -- DeckyShard tracks
runtime container state observed via heartbeat, this tracks
operation lifecycle. New table, UUID PK.

Adds BaseRepository abstract methods (create_lifecycle,
update_lifecycle, get_lifecycle_by_ids, find_open_lifecycle,
sweep_stale_lifecycle) with SQLModelRepository mixin impl.
Backbone for the upcoming 202-Accepted async API.
This commit is contained in:
2026-05-22 16:20:00 -04:00
parent ade8bbe30a
commit 05c0721a51
7 changed files with 451 additions and 0 deletions

View File

@@ -82,6 +82,15 @@ from .decky import (
ServiceConfigFieldDTO,
ServiceSchemaResponse,
)
from .decky_lifecycle import (
DeckyLifecycle,
DeckyLifecycleListResponse,
DeckyLifecycleView,
LifecycleAcceptedResponse,
LifecycleDelta,
LifecycleOperation,
LifecycleStatus,
)
from .fleet import (
LOCAL_HOST_SENTINEL,
FleetDecky,
@@ -278,6 +287,14 @@ __all__ = [
"FleetDecky",
"ServiceConfigFieldDTO",
"ServiceSchemaResponse",
# decky_lifecycle
"DeckyLifecycle",
"DeckyLifecycleListResponse",
"DeckyLifecycleView",
"LifecycleAcceptedResponse",
"LifecycleDelta",
"LifecycleOperation",
"LifecycleStatus",
# health
"ComponentHealth",
"HealthResponse",

View File

@@ -0,0 +1,87 @@
"""DeckyLifecycle table + DTOs.
Tracks one row per (decky, operation) attempt — `deploy` or `mutate` —
so the API can return 202 Accepted immediately and the wizard can poll
state instead of holding an HTTP request open for minutes.
State machine: ``pending`` (row created, runner not yet started) →
``running`` (runner picked it up) → terminal ``succeeded`` | ``failed``
(+ ``error`` text). Rows are immutable after terminal status; a retry
writes a new row.
Sibling of DeckyShard rather than a rework — DeckyShard tracks runtime
container state observed via heartbeat, this tracks operation lifecycle.
Per ``feedback_uuid_over_natural_keys``: new use case, new table, UUID PK.
"""
from __future__ import annotations
import uuid
from datetime import datetime, timezone
from typing import Literal, Optional
from pydantic import BaseModel, Field as PydanticField
from sqlalchemy import Column, Text
from sqlmodel import Field, SQLModel
# Closed set of operation names a lifecycle row can track.
LifecycleOperation = Literal["deploy", "mutate"]
# Closed set of status names: ``pending`` -> ``running`` -> terminal
# ``succeeded`` | ``failed``.
LifecycleStatus = Literal["pending", "running", "succeeded", "failed"]
def _now_utc() -> datetime:
    """Return the current instant as a timezone-aware UTC ``datetime``."""
    return datetime.now(tz=timezone.utc)
class DeckyLifecycle(SQLModel, table=True):
    """Persisted record of a single (decky, operation) attempt.

    Backs the ``decky_lifecycle`` table. Each row gets a generated UUID
    string as its primary key and starts in the ``pending`` status.
    """

    __tablename__ = "decky_lifecycle"

    # Surrogate key: a random UUID4 rendered as text.
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        primary_key=True,
    )

    decky_name: str = Field(index=True)
    # None for unihost / master-resident deckies.
    host_uuid: Optional[str] = Field(index=True, default=None)

    # Stored as plain strings; the intended values are those listed in
    # LifecycleOperation and LifecycleStatus respectively.
    operation: str = Field(index=True)
    status: str = Field(index=True, default="pending")

    # Explicit TEXT column for the failure message; NULL when unset.
    error: Optional[str] = Field(
        sa_column=Column("error", Text, nullable=True),
        default=None,
    )

    # Both timestamps default to "now" (UTC) at row construction time.
    started_at: datetime = Field(default_factory=_now_utc)
    updated_at: datetime = Field(default_factory=_now_utc)
    # Unset until a completion time is written.
    completed_at: Optional[datetime] = Field(default=None)
# --- HTTP DTOs ---
class DeckyLifecycleView(BaseModel):
    """Serialised form of one lifecycle row, for the wizard polling loop.

    Field names and types mirror the ``DeckyLifecycle`` table columns.
    """

    id: str
    decky_name: str
    # Optional; defaults to None when not supplied.
    host_uuid: Optional[str] = None
    operation: str
    status: str
    # Optional failure text; defaults to None.
    error: Optional[str] = None
    started_at: datetime
    updated_at: datetime
    # Optional completion timestamp; defaults to None.
    completed_at: Optional[datetime] = None
class DeckyLifecycleListResponse(BaseModel):
    """Envelope for a list of lifecycle views; ``rows`` defaults to empty."""

    rows: list[DeckyLifecycleView] = PydanticField(default_factory=list)
class LifecycleAcceptedResponse(BaseModel):
    """Body returned by the 202 deploy/mutate endpoints.

    Carries the ids of the matching DeckyLifecycle rows so the client can
    follow them through the polling endpoint.
    """

    lifecycle_ids: list[str]
class LifecycleDelta(BaseModel):
    """A single per-decky completion record sent in a worker → master heartbeat."""

    decky_name: str
    operation: str
    # One of LifecycleStatus, typically "succeeded" | "failed".
    status: str
    # Optional failure text; defaults to None.
    error: Optional[str] = None
    # Optional completion timestamp; defaults to None.
    completed_at: Optional[datetime] = None

View File

@@ -872,6 +872,71 @@ class BaseRepository(ABC):
async def delete_decky_shard(self, decky_name: str) -> bool:
    """Base-class placeholder; raises ``NotImplementedError`` here.

    NOTE(review): presumably returns whether a shard row for
    ``decky_name`` was removed — confirm against the concrete repository.
    """
    raise NotImplementedError()
# ----------------------------------------------------------- lifecycle
# Per-(decky, operation) attempt tracking for the async deploy/mutate
# state machine. Rows are append-only after terminal status; retries
# write new rows.
async def create_lifecycle(self, data: dict[str, Any]) -> str:
    """Persist a new lifecycle row whose status is ``pending``.

    ``data`` carries the column values: ``decky_name`` and ``operation``
    are required, while ``host_uuid`` and ``id`` may be left out. The
    ``id`` of the inserted row is returned.

    This base method always raises ``NotImplementedError``.
    """
    raise NotImplementedError()
async def update_lifecycle(
    self,
    lifecycle_id: str,
    fields: dict[str, Any],
) -> None:
    """Apply a partial update (status / error / timestamps) to an open row.

    ``updated_at`` is either bumped by the caller or stamped by the
    implementation. This layer does not guard rows that already reached
    a terminal status — enforcing the immutability rule is the job of
    the runner / heartbeat handler.

    This base method always raises ``NotImplementedError``.
    """
    raise NotImplementedError()
async def get_lifecycle_by_ids(
    self,
    lifecycle_ids: list[str],
) -> list[dict[str, Any]]:
    """Look up lifecycle rows by id; the result order is not guaranteed.

    Serves the wizard polling endpoint. Ids with no matching row are
    dropped from the result without raising.

    This base method always raises ``NotImplementedError``.
    """
    raise NotImplementedError()
async def find_open_lifecycle(
    self,
    decky_name: str,
    operation: str,
    host_uuid: Optional[str] = None,
) -> Optional[dict[str, Any]]:
    """Find the newest still-open row for (decky_name, operation, host_uuid).

    "Open" means the status is ``pending`` or ``running``; among such
    rows the most recently started one is returned, or ``None`` when
    there is no open row.

    The worker-heartbeat path relies on this to pair incoming deltas
    with the master's record of an in-flight operation.

    This base method always raises ``NotImplementedError``.
    """
    raise NotImplementedError()
async def sweep_stale_lifecycle(
    self,
    older_than: datetime,
    reason: str,
) -> int:
    """Fail every non-terminal row that started before ``older_than``.

    Affected rows are set to ``failed`` with ``error=reason``; the
    number of rows touched is returned.

    Invoked on master boot to flush orphans left behind by a previous
    crash. Pre-v1 substitute for a durable queue per
    ``feedback_prev1_no_user_problems``.

    This base method always raises ``NotImplementedError``.
    """
    raise NotImplementedError()
# ----------------------------------------------------------- mazenet
# MazeNET topology persistence. Default no-op / NotImplementedError so
# non-default backends stay functional; SQLModelRepository provides the

View File

@@ -42,6 +42,7 @@ from decnet.web.db.sqlmodel_repo.campaigns import CampaignsMixin
from decnet.web.db.sqlmodel_repo.canary import CanaryMixin
from decnet.web.db.sqlmodel_repo.credentials import CredentialsMixin
from decnet.web.db.sqlmodel_repo.deckies import DeckiesMixin
from decnet.web.db.sqlmodel_repo.decky_lifecycle import LifecycleMixin
from decnet.web.db.sqlmodel_repo.fleet import FleetMixin
from decnet.web.db.sqlmodel_repo.identities import IdentitiesMixin
from decnet.web.db.sqlmodel_repo.logs import LogsMixin
@@ -66,6 +67,7 @@ class SQLModelRepository(
CanaryMixin,
CredentialsMixin,
DeckiesMixin,
LifecycleMixin,
FleetMixin,
IdentitiesMixin,
LogsMixin,

View File

@@ -0,0 +1,106 @@
"""DeckyLifecycle CRUD + sweep.
One row per (decky, operation) attempt. States: pending → running →
succeeded | failed. Mixed into ``SQLModelRepository`` for both SQLite
and MySQL via MRO composition.
"""
from __future__ import annotations
import uuid as _uuid
from datetime import datetime, timezone
from typing import Any, Optional
from sqlalchemy import asc, select, update
from decnet.web.db.models import DeckyLifecycle
from decnet.web.db.sqlmodel_repo._helpers import _MixinBase
_TERMINAL = ("succeeded", "failed")
class LifecycleMixin(_MixinBase):
    """Mixin: composed onto ``SQLModelRepository``.

    Implements lifecycle persistence over the ``DeckyLifecycle`` table:
    insert, partial update, lookup by id, lookup of the newest open row,
    and a bulk sweep that fails stale open rows.
    """

    # Non-terminal statuses. Shared by the open-row lookup and the sweep
    # so the two queries cannot drift apart.
    _LIFECYCLE_OPEN_STATUSES = ("pending", "running")

    async def create_lifecycle(self, data: dict[str, Any]) -> str:
        """Insert a lifecycle row and return its ``id``.

        ``data`` is copied, never mutated. Missing values are filled in:
        ``id`` with a fresh UUID4 string, ``status`` with ``pending`` and
        ``started_at`` with the current UTC time. ``updated_at`` is always
        overwritten with the current UTC time.
        """
        now = datetime.now(timezone.utc)
        payload = dict(data)
        # An explicit ``None`` counts as "not supplied". ``setdefault``
        # would keep it, and the method would then return the string
        # "None" as the row id.
        if payload.get("id") is None:
            payload["id"] = str(_uuid.uuid4())
        if payload.get("status") is None:
            payload["status"] = "pending"
        if payload.get("started_at") is None:
            payload["started_at"] = now
        payload["updated_at"] = now
        async with self._session() as session:
            session.add(DeckyLifecycle(**payload))
            await session.commit()
        return str(payload["id"])

    async def update_lifecycle(
        self,
        lifecycle_id: str,
        fields: dict[str, Any],
    ) -> None:
        """Apply ``fields`` to the row with id ``lifecycle_id``.

        ``updated_at`` is always stamped with the current UTC time. When
        the update sets a terminal status and carries no usable
        ``completed_at``, ``completed_at`` gets the same timestamp.
        The statement is a plain ``UPDATE ... WHERE id = ...``: the
        current status of the row is not checked.
        """
        now = datetime.now(timezone.utc)
        payload = dict(fields)
        payload["updated_at"] = now
        # ``completed_at=None`` (e.g. forwarded from a DTO default) must
        # not leave a terminal row without a completion time, so test the
        # value rather than mere key presence.
        if payload.get("status") in _TERMINAL and payload.get("completed_at") is None:
            payload["completed_at"] = now
        async with self._session() as session:
            await session.execute(
                update(DeckyLifecycle)
                .where(DeckyLifecycle.id == lifecycle_id)
                .values(**payload)
            )
            await session.commit()

    async def get_lifecycle_by_ids(
        self, lifecycle_ids: list[str],
    ) -> list[dict[str, Any]]:
        """Return the rows whose id is in ``lifecycle_ids``.

        Rows are ordered by ``started_at`` ascending and dumped in JSON
        mode. Ids with no matching row are simply absent from the result;
        an empty ``lifecycle_ids`` returns ``[]`` without querying.
        """
        if not lifecycle_ids:
            return []
        async with self._session() as session:
            result = await session.execute(
                select(DeckyLifecycle)
                .where(DeckyLifecycle.id.in_(lifecycle_ids))  # type: ignore[attr-defined]
                .order_by(asc(DeckyLifecycle.started_at))
            )
            return [r.model_dump(mode="json") for r in result.scalars().all()]

    async def find_open_lifecycle(
        self,
        decky_name: str,
        operation: str,
        host_uuid: Optional[str] = None,
    ) -> Optional[dict[str, Any]]:
        """Return the newest ``pending``/``running`` row, or ``None``.

        Rows are matched on ``decky_name`` and ``operation``. The
        ``host_uuid`` filter is applied only when ``host_uuid`` is not
        ``None``; with ``None`` rows of any host match. The row with the
        latest ``started_at`` wins and is dumped in JSON mode.
        """
        stmt = (
            select(DeckyLifecycle)
            .where(DeckyLifecycle.decky_name == decky_name)
            .where(DeckyLifecycle.operation == operation)
            .where(DeckyLifecycle.status.in_(self._LIFECYCLE_OPEN_STATUSES))  # type: ignore[attr-defined]
        )
        if host_uuid is not None:
            stmt = stmt.where(DeckyLifecycle.host_uuid == host_uuid)
        # Only the newest row is used, so let the database stop after one
        # instead of selecting every open row.
        stmt = stmt.order_by(DeckyLifecycle.started_at.desc()).limit(1)  # type: ignore[attr-defined]
        async with self._session() as session:
            result = await session.execute(stmt)
            row = result.scalars().first()
            return row.model_dump(mode="json") if row else None

    async def sweep_stale_lifecycle(
        self,
        older_than: datetime,
        reason: str,
    ) -> int:
        """Fail every open row started before ``older_than``.

        Matching rows get ``status="failed"``, ``error=reason`` and both
        ``updated_at`` and ``completed_at`` set to the current UTC time.
        Returns the statement's reported row count (``0`` when the driver
        reports a falsy count).
        """
        now = datetime.now(timezone.utc)
        async with self._session() as session:
            result = await session.execute(
                update(DeckyLifecycle)
                .where(DeckyLifecycle.status.in_(self._LIFECYCLE_OPEN_STATUSES))  # type: ignore[attr-defined]
                .where(DeckyLifecycle.started_at < older_than)
                .values(
                    status="failed",
                    error=reason,
                    updated_at=now,
                    completed_at=now,
                )
            )
            await session.commit()
            return result.rowcount or 0