feat(lifecycle): runner + strategies + bus topic

Add decnet.lifecycle package: pure orchestration layer that the
master API will invoke via asyncio.create_task to drive DeckyLifecycle
rows through pending -> running -> succeeded | failed without
holding an HTTP request open.

Strategy classes per (operation, transport):
- LocalDeployStrategy: master-resident, runs engine.deployer.deploy
  in a thread.
- SwarmDeployStrategy: shards by host_uuid, dispatches via
  AgentClient.deploy; the worker's heartbeat drives the terminal transition.
- LocalMutateStrategy: write_compose + compose up.
- SwarmMutateStrategy: AgentClient.mutate; the worker's heartbeat drives the terminal transition.

decnet.bus.topics gains decky_lifecycle(name) -> decky.<name>.lifecycle
plus DECKY_LIFECYCLE constant. Payload documented in the wiki
(separate commit). publish_safely keeps bus best-effort.

Nothing calls this yet. Follow-up commits convert the worker /deploy and
/mutate endpoints to return 202, then wire heartbeat lifecycle deltas, then hook up the master API.
This commit is contained in:
2026-05-22 16:25:33 -04:00
parent 05c0721a51
commit c0ad380020
7 changed files with 884 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
"""Async deploy/mutate lifecycle runner.
The runner is invoked by the master API handlers (deploy + mutate) after
they write ``DeckyLifecycle`` rows and return 202 Accepted to the
caller. It executes the actual docker work off the request thread,
flips lifecycle row status through ``running -> succeeded|failed``, and
emits ``decky.<name>.lifecycle`` bus signals on every transition.
Strategy classes encapsulate transport (local docker on master vs
remote agent over mTLS). ``runner.run_deploy`` / ``run_mutate`` pick
the right strategy from the request context.
"""
from decnet.lifecycle.runner import run_deploy, run_mutate
__all__ = ["run_deploy", "run_mutate"]

View File

@@ -0,0 +1,43 @@
"""Bus emit helper for DeckyLifecycle transitions.
DB is the source of truth (wizard polls ``GET /deckies/lifecycle?ids=``).
The bus is best-effort live notification — publish failures are logged
and swallowed via ``publish_safely``, never propagated.
"""
from __future__ import annotations
from typing import Optional
from decnet.bus import topics as _topics
from decnet.bus.base import BaseBus
from decnet.bus.publish import publish_safely
async def emit_lifecycle(
    bus: BaseBus | None,
    *,
    lifecycle_id: str,
    decky_name: str,
    operation: str,
    status: str,
    error: Optional[str] = None,
) -> None:
    """Broadcast one lifecycle transition on ``decky.<name>.lifecycle``.

    The message always carries ``lifecycle_id``, ``operation`` and
    ``status``; an ``error`` key is added only when *error* is not
    ``None``. The payload is documented in
    ``wiki-checkout/Service-Bus.md``.
    """
    # Only attach the failure detail when the caller supplied one.
    failure_detail = {} if error is None else {"error": error}
    message: dict = {
        "lifecycle_id": lifecycle_id,
        "operation": operation,
        "status": status,
        **failure_detail,
    }
    topic = _topics.decky_lifecycle(decky_name)
    await publish_safely(
        bus, topic, message, event_type=_topics.DECKY_LIFECYCLE,
    )

View File

@@ -0,0 +1,96 @@
"""Async deploy/mutate orchestration entry points.
Called by the master API handlers right after they create the lifecycle
rows. Picks the right strategy (local vs swarm) and runs it off the
HTTP request thread via ``asyncio.create_task`` at the caller.
"""
from __future__ import annotations
from pathlib import Path
from decnet.bus.base import BaseBus
from decnet.config import DecnetConfig, DeckyConfig
from decnet.lifecycle.strategies import (
LocalDeployStrategy,
SwarmDeployStrategy,
select_deploy_strategy,
select_mutate_strategy,
)
from decnet.logging import get_logger
from decnet.web.db.repository import BaseRepository
log = get_logger("lifecycle.runner")
async def run_deploy(
    repo: BaseRepository,
    bus: BaseBus | None,
    *,
    lifecycle_ids: dict[str, str],
    config: DecnetConfig,
) -> None:
    """Execute the deploy referenced by *lifecycle_ids*.

    Intended to be wrapped in ``asyncio.create_task`` by the caller, so
    every ``Exception`` is logged and swallowed here instead of being
    raised.

    In swarm mode the config may contain BOTH worker-resident deckies
    (``host_uuid`` set) and master-resident ones (``host_uuid`` is
    ``None``). Each subset is routed through its own strategy, and each
    strategy run is isolated in its own ``try`` so that an unexpected
    crash in one subset does not stop the other subset from running.

    Args:
        repo: Repository handed through to the strategy.
        bus: Bus handed through to the strategy; may be ``None``.
        lifecycle_ids: Mapping of decky name -> lifecycle row id.
        config: Deployment config. ``config.mode`` selects the routing
            and ``config.deckies`` is split by ``host_uuid`` in swarm
            mode.
    """
    try:
        if config.mode != "swarm":
            strategy = select_deploy_strategy(config)
            await strategy.execute(
                repo, bus, lifecycle_ids=lifecycle_ids, config=config,
            )
            return

        remote_deckies = [d for d in config.deckies if d.host_uuid is not None]
        local_deckies = [d for d in config.deckies if d.host_uuid is None]
        # Remote dispatch runs before the local deploy.
        plan = (
            (SwarmDeployStrategy, remote_deckies),
            (LocalDeployStrategy, local_deckies),
        )
        for strategy_cls, subset in plan:
            if not subset:
                continue
            try:
                # Only pass the lifecycle ids that belong to this subset.
                subset_ids = {
                    d.name: lifecycle_ids[d.name]
                    for d in subset if d.name in lifecycle_ids
                }
                subset_cfg = config.model_copy(update={"deckies": subset})
                await strategy_cls().execute(
                    repo, bus,
                    lifecycle_ids=subset_ids, config=subset_cfg,
                )
            except Exception:  # noqa: BLE001 — keep subsets independent
                log.exception(
                    "lifecycle.run_deploy: %s crashed unexpectedly",
                    strategy_cls.__name__,
                )
    except Exception:  # noqa: BLE001 — defense in depth: never crash task
        log.exception("lifecycle.run_deploy crashed unexpectedly")
async def run_mutate(
    repo: BaseRepository,
    bus: BaseBus | None,
    *,
    lifecycle_id: str,
    decky: DeckyConfig,
    services: list[str],
    full_config: DecnetConfig,
    compose_path: Path,
) -> None:
    """Run one decky's mutate through the strategy chosen for it.

    Any ``Exception`` escaping strategy selection or execution is logged
    and swallowed rather than raised.
    """
    call_kwargs = {
        "lifecycle_id": lifecycle_id,
        "decky": decky,
        "services": services,
        "full_config": full_config,
        "compose_path": compose_path,
    }
    try:
        chosen = select_mutate_strategy(full_config, decky)
        await chosen.execute(repo, bus, **call_kwargs)
    except Exception:  # noqa: BLE001
        log.exception("lifecycle.run_mutate crashed unexpectedly")
__all__ = ["run_deploy", "run_mutate"]

View File

@@ -0,0 +1,376 @@
"""Lifecycle execution strategies.
Each strategy owns the work for one (operation, transport) combo:
* ``LocalDeployStrategy`` — master-resident deckies: writes a compose
file and runs ``docker compose up -d`` on the master via
``engine.deployer.deploy`` off the request thread.
* ``SwarmDeployStrategy`` — worker-resident deckies: fans the sharded
config to each worker via ``AgentClient.deploy``. The worker returns
202 immediately; the worker's next heartbeat drives the terminal
transition (see ``master heartbeat handler accepts lifecycle deltas``).
* ``LocalMutateStrategy`` / ``SwarmMutateStrategy`` — same split, for a
per-decky mutate of services list.
The runner picks the right concrete class. Strategies update the DB
row + emit bus signals; they never raise back at the runner — they
turn exceptions into ``failed`` rows and return.
"""
from __future__ import annotations
import abc
from datetime import datetime, timezone
import anyio
from decnet.bus.base import BaseBus
from decnet.config import DecnetConfig, DeckyConfig
from decnet.lifecycle.events import emit_lifecycle
from decnet.logging import get_logger
from decnet.web.db.repository import BaseRepository
log = get_logger("lifecycle.strategy")
# --- base ----------------------------------------------------------------
class _StrategyBase(abc.ABC):
    """Common plumbing shared by every strategy.

    Each ``_mark_*`` helper first persists the new status on the
    lifecycle row through *repo* and then announces the same transition
    via ``emit_lifecycle``.
    """

    async def _mark_running(
        self,
        repo: BaseRepository,
        bus: BaseBus | None,
        *,
        lifecycle_id: str,
        decky_name: str,
        operation: str,
    ) -> None:
        """Set the row's status to ``running`` and emit that transition."""
        changes = {"status": "running"}
        await repo.update_lifecycle(lifecycle_id, changes)
        await emit_lifecycle(
            bus,
            status="running",
            lifecycle_id=lifecycle_id,
            decky_name=decky_name,
            operation=operation,
        )

    async def _mark_succeeded(
        self,
        repo: BaseRepository,
        bus: BaseBus | None,
        *,
        lifecycle_id: str,
        decky_name: str,
        operation: str,
    ) -> None:
        """Set the row to ``succeeded`` with a UTC ``completed_at`` stamp,
        then emit that transition."""
        changes = {
            "status": "succeeded",
            "completed_at": datetime.now(timezone.utc),
        }
        await repo.update_lifecycle(lifecycle_id, changes)
        await emit_lifecycle(
            bus,
            status="succeeded",
            lifecycle_id=lifecycle_id,
            decky_name=decky_name,
            operation=operation,
        )

    async def _mark_failed(
        self,
        repo: BaseRepository,
        bus: BaseBus | None,
        *,
        lifecycle_id: str,
        decky_name: str,
        operation: str,
        error: str,
    ) -> None:
        """Set the row to ``failed`` with the error text and a UTC
        ``completed_at`` stamp, then emit that transition.

        The error text is cut to its first 2000 characters for both the
        stored row and the emitted signal.
        """
        clipped = error[:2000]
        changes = {
            "status": "failed",
            "error": clipped,
            "completed_at": datetime.now(timezone.utc),
        }
        await repo.update_lifecycle(lifecycle_id, changes)
        await emit_lifecycle(
            bus,
            status="failed",
            error=clipped,
            lifecycle_id=lifecycle_id,
            decky_name=decky_name,
            operation=operation,
        )
# --- deploy --------------------------------------------------------------
class DeployStrategy(_StrategyBase):
    """Abstract base for deploy strategies; subclasses implement
    :meth:`execute`."""

    @abc.abstractmethod
    async def execute(
        self,
        repo: BaseRepository,
        bus: BaseBus | None,
        *,
        lifecycle_ids: dict[str, str],  # decky_name -> lifecycle_id
        config: DecnetConfig,
    ) -> None:
        """Run the deploy for the deckies in *config*, tracking progress
        on the rows named by *lifecycle_ids*."""
class LocalDeployStrategy(DeployStrategy):
    """Deploy on the master through ``engine.deployer.deploy``.

    All lifecycle rows passed in share a single deploy call, so they move
    to ``running`` together and then all end up ``succeeded`` or all end
    up ``failed``.
    """

    async def execute(
        self,
        repo: BaseRepository,
        bus: BaseBus | None,
        *,
        lifecycle_ids: dict[str, str],
        config: DecnetConfig,
    ) -> None:
        """Mark every row running, run the deploy in a worker thread, then
        record the shared outcome on every row."""
        from decnet.engine import deployer as _deployer

        for name, row_id in lifecycle_ids.items():
            await self._mark_running(
                repo, bus,
                lifecycle_id=row_id, decky_name=name, operation="deploy",
            )

        try:
            # The deployer call is synchronous; run it off the event loop.
            await anyio.to_thread.run_sync(
                _deployer.deploy, config, False, False, False,
            )
        except Exception as exc:  # noqa: BLE001
            reason = f"{type(exc).__name__}: {exc}"
            log.exception("local deploy failed")
            for name, row_id in lifecycle_ids.items():
                await self._mark_failed(
                    repo, bus,
                    lifecycle_id=row_id, decky_name=name,
                    operation="deploy", error=reason,
                )
        else:
            for name, row_id in lifecycle_ids.items():
                await self._mark_succeeded(
                    repo, bus,
                    lifecycle_id=row_id, decky_name=name, operation="deploy",
                )
class SwarmDeployStrategy(DeployStrategy):
    """Deploy worker-resident deckies through ``AgentClient.deploy``.

    Rows are set to ``running`` when their host's shard is dispatched.
    The worker's /deploy is async (202); its next heartbeat carries the
    lifecycle deltas from which the master's heartbeat handler performs
    the terminal transition. Only a failure of the dispatch itself
    (network / mTLS / 5xx) is recorded as ``failed`` here.
    """

    async def execute(
        self,
        repo: BaseRepository,
        bus: BaseBus | None,
        *,
        lifecycle_ids: dict[str, str],
        config: DecnetConfig,
    ) -> None:
        """Group deckies by host and send each host its slice of the
        config, one ``AgentClient.deploy`` call per host."""
        from decnet.engine.deployer import _resolve_swarm_host
        from decnet.swarm.client import AgentClient

        # Bucket deckies per worker host. Deckies without a host_uuid are
        # master-resident and are not this strategy's job; the runner is
        # expected to route them to LocalDeployStrategy, so dropping them
        # here is purely defensive.
        by_host: dict[str, list[DeckyConfig]] = {}
        for candidate in config.deckies:
            if candidate.host_uuid is not None:
                by_host.setdefault(candidate.host_uuid, []).append(candidate)

        for host_uuid, members in by_host.items():
            tracked = {
                m.name: lifecycle_ids[m.name]
                for m in members if m.name in lifecycle_ids
            }
            for member in members:
                row_id = tracked.get(member.name)
                if row_id is not None:
                    await self._mark_running(
                        repo, bus,
                        lifecycle_id=row_id,
                        decky_name=member.name, operation="deploy",
                    )
            try:
                host = await _resolve_swarm_host(repo, host_uuid)
                shard_cfg = config.model_copy(update={"deckies": members})
                async with AgentClient(host=host) as agent:
                    await agent.deploy(shard_cfg)
            except Exception as exc:  # noqa: BLE001
                reason = f"{type(exc).__name__}: {exc}"
                log.exception(
                    "swarm deploy dispatch failed host_uuid=%s", host_uuid,
                )
                for name, row_id in tracked.items():
                    await self._mark_failed(
                        repo, bus,
                        lifecycle_id=row_id, decky_name=name,
                        operation="deploy", error=reason,
                    )
            # On a successful dispatch the rows are left as running; the
            # worker's heartbeat supplies the terminal state.
# --- mutate --------------------------------------------------------------
class MutateStrategy(_StrategyBase):
    """Abstract base for mutate strategies; subclasses implement
    :meth:`execute`."""

    @abc.abstractmethod
    async def execute(
        self,
        repo: BaseRepository,
        bus: BaseBus | None,
        *,
        lifecycle_id: str,
        decky: DeckyConfig,
        services: list[str],
        full_config: DecnetConfig,
        compose_path,
    ) -> None:
        """Apply the new *services* list to *decky*, tracking progress on
        the row identified by *lifecycle_id*."""
class LocalMutateStrategy(MutateStrategy):
    """Mutate a master-resident decky: regenerate the compose file, then
    ``compose up -d``."""

    async def execute(
        self,
        repo: BaseRepository,
        bus: BaseBus | None,
        *,
        lifecycle_id: str,
        decky: DeckyConfig,
        services: list[str],
        full_config: DecnetConfig,
        compose_path,
    ) -> None:
        """Mark the row running, apply the new services, and record the
        outcome as ``succeeded`` or ``failed``."""
        from decnet.composer import write_compose
        from decnet.engine import _compose_with_retry

        def _bring_up():
            # Executed in a worker thread by anyio below.
            return _compose_with_retry(
                "up", "-d", "--remove-orphans",
                compose_file=compose_path,
            )

        await self._mark_running(
            repo, bus,
            lifecycle_id=lifecycle_id,
            decky_name=decky.name, operation="mutate",
        )
        try:
            # The decky is updated in place before the compose file is
            # regenerated from full_config.
            # NOTE(review): presumably `decky` is one of
            # full_config.deckies, otherwise write_compose would not see
            # the new services — confirm against the caller.
            decky.services = list(services)
            write_compose(full_config, compose_path)
            await anyio.to_thread.run_sync(_bring_up)
        except Exception as exc:  # noqa: BLE001
            reason = f"{type(exc).__name__}: {exc}"
            log.exception("local mutate failed decky=%s", decky.name)
            await self._mark_failed(
                repo, bus,
                lifecycle_id=lifecycle_id,
                decky_name=decky.name, operation="mutate",
                error=reason,
            )
        else:
            await self._mark_succeeded(
                repo, bus,
                lifecycle_id=lifecycle_id,
                decky_name=decky.name, operation="mutate",
            )
class SwarmMutateStrategy(MutateStrategy):
    """Mutate a worker-resident decky through ``AgentClient.mutate``.

    Mirrors SwarmDeployStrategy: the row becomes ``running`` on dispatch
    and the worker's heartbeat supplies the terminal state.
    """

    async def execute(
        self,
        repo: BaseRepository,
        bus: BaseBus | None,
        *,
        lifecycle_id: str,
        decky: DeckyConfig,
        services: list[str],
        full_config: DecnetConfig,
        compose_path,
    ) -> None:
        """Mark the row running and dispatch the mutate to the decky's
        host; record ``failed`` if the decky has no host or the dispatch
        raises."""
        from decnet.engine.deployer import _resolve_swarm_host
        from decnet.swarm.client import AgentClient

        row = {
            "lifecycle_id": lifecycle_id,
            "decky_name": decky.name,
            "operation": "mutate",
        }
        await self._mark_running(repo, bus, **row)

        # A decky without a host cannot be reached through an agent.
        if decky.host_uuid is None:
            await self._mark_failed(
                repo, bus,
                error="swarm mutate strategy invoked for decky with no host_uuid",
                **row,
            )
            return

        try:
            host = await _resolve_swarm_host(repo, decky.host_uuid)
            async with AgentClient(host=host) as agent:
                await agent.mutate(decky.name, list(services))
        except Exception as exc:  # noqa: BLE001
            reason = f"{type(exc).__name__}: {exc}"
            log.exception("swarm mutate dispatch failed decky=%s", decky.name)
            await self._mark_failed(repo, bus, error=reason, **row)
        # On success the row stays running; the worker's heartbeat
        # supplies the terminal state.
def select_deploy_strategy(config: DecnetConfig) -> DeployStrategy:
    """Choose the deploy strategy from ``config.mode``.

    ``"swarm"`` yields the swarm strategy, anything else the local one.
    In swarm mode the caller is responsible for sending master-resident
    deckies (``host_uuid=None``) through the local strategy separately.
    """
    swarm_mode = config.mode == "swarm"
    return SwarmDeployStrategy() if swarm_mode else LocalDeployStrategy()
def select_mutate_strategy(
    config: DecnetConfig, decky: DeckyConfig,
) -> MutateStrategy:
    """Choose the mutate strategy from where the decky lives.

    The swarm strategy is used only when the config is in swarm mode AND
    the decky has a ``host_uuid``; every other case is handled locally.
    """
    is_remote = config.mode == "swarm" and decky.host_uuid is not None
    return SwarmMutateStrategy() if is_remote else LocalMutateStrategy()
__all__ = [
"DeployStrategy",
"LocalDeployStrategy",
"SwarmDeployStrategy",
"MutateStrategy",
"LocalMutateStrategy",
"SwarmMutateStrategy",
"select_deploy_strategy",
"select_mutate_strategy",
]