merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
@@ -14,22 +14,44 @@ from decnet.fleet import all_service_names
|
||||
from decnet.composer import write_compose
|
||||
from decnet.config import DeckyConfig, DecnetConfig
|
||||
from decnet.engine import _compose_with_retry
|
||||
from decnet.logging import get_logger
|
||||
from decnet.telemetry import traced as _traced
|
||||
|
||||
from pathlib import Path
|
||||
import anyio
|
||||
import asyncio
|
||||
import contextlib
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.base import BaseBus
|
||||
from decnet.bus.factory import get_bus
|
||||
from decnet.bus.publish import (
|
||||
publish_safely as _publish_safely,
|
||||
run_control_listener_signal as _run_control_listener_signal,
|
||||
run_health_heartbeat as _run_health_heartbeat,
|
||||
)
|
||||
from decnet.mutator.events import MutationTrigger, emit_decky_mutated
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
|
||||
log = get_logger("mutator")
|
||||
console = Console()
|
||||
|
||||
|
||||
async def mutate_decky(decky_name: str, repo: BaseRepository) -> bool:
|
||||
@_traced("mutator.mutate_decky")
|
||||
async def mutate_decky(
|
||||
decky_name: str,
|
||||
repo: BaseRepository,
|
||||
bus: BaseBus | None = None,
|
||||
trigger: MutationTrigger = "operator",
|
||||
) -> bool:
|
||||
"""
|
||||
Perform an Intra-Archetype Shuffle for a specific decky.
|
||||
Returns True if mutation succeeded, False otherwise.
|
||||
"""
|
||||
log.debug("mutate_decky: start decky=%s", decky_name)
|
||||
state_dict = await repo.get_state("deployment")
|
||||
if state_dict is None:
|
||||
log.error("mutate_decky: no active deployment found in database")
|
||||
console.print("[red]No active deployment found in database.[/]")
|
||||
return False
|
||||
|
||||
@@ -54,6 +76,7 @@ async def mutate_decky(decky_name: str, repo: BaseRepository) -> bool:
|
||||
console.print(f"[yellow]No services available for mutating '{decky_name}'.[/]")
|
||||
return False
|
||||
|
||||
old_services = list(decky.services)
|
||||
current_services = set(decky.services)
|
||||
|
||||
attempts = 0
|
||||
@@ -73,58 +96,376 @@ async def mutate_decky(decky_name: str, repo: BaseRepository) -> bool:
|
||||
# Still writes files for Docker to use
|
||||
write_compose(config, compose_path)
|
||||
|
||||
log.info("mutation applied decky=%s services=%s", decky_name, ",".join(decky.services))
|
||||
console.print(f"[cyan]Mutating '{decky_name}' to services: {', '.join(decky.services)}[/]")
|
||||
|
||||
try:
|
||||
# Wrap blocking call in thread
|
||||
await anyio.to_thread.run_sync(_compose_with_retry, "up", "-d", "--remove-orphans", compose_path)
|
||||
except Exception as e:
|
||||
log.error("mutation failed decky=%s error=%s", decky_name, e)
|
||||
console.print(f"[red]Failed to mutate '{decky_name}': {e}[/]")
|
||||
return False
|
||||
|
||||
await emit_decky_mutated(
|
||||
bus,
|
||||
decky=decky_name,
|
||||
old_services=old_services,
|
||||
new_services=list(decky.services),
|
||||
trigger=trigger,
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
@_traced("mutator.mutate_all")
async def mutate_all(
    repo: BaseRepository,
    force: bool = False,
    bus: BaseBus | None = None,
    only: set[str] | None = None,
) -> float | None:
    """Mutate all deckies that are due (or *only* the named ones).

    Args:
        repo: Repository holding the ``deployment`` state row.
        force: Mutate every considered decky regardless of schedule.
        bus: Optional bus handed through to :func:`mutate_decky` for
            event emission.
        only: When not ``None``, restrict the pass to deckies whose name
            is in this set and mutate them immediately.

    Returns:
        The number of seconds until the next scheduled mutation, or
        ``None`` if no deployment exists / no decky has an interval set.
        The value is only computed on scheduled passes; a forced or
        targeted pass never evaluates schedules and so returns ``None``.
        The watch loop uses this to adaptively sleep instead of
        hard-polling at a fixed cadence.

    A missing ``deployment`` state row is *not* an error any more — the
    host may simply not have run ``decnet deploy`` yet. The watch loop
    edge-triggers the user-facing log for that state.
    """
    log.debug("mutate_all: start force=%s only=%s", force, only)
    state_dict = await repo.get_state("deployment")
    if state_dict is None:
        log.debug("mutate_all: no active deployment found")
        return None

    config = DecnetConfig(**state_dict["config"])
    now = time.time()

    # Trigger derivation: explicit force / targeted only-list come from
    # an operator action (CLI --all, API mutate-now, UI bus request).
    # Scheduled-interval ticks carry trigger=scheduled.
    operator_driven = force or only is not None
    trigger: MutationTrigger = "operator" if operator_driven else "scheduled"

    mutated_count = 0
    next_due_in: float | None = None
    for decky in config.deckies:
        if only is not None and decky.name not in only:
            continue

        if operator_driven:
            # Forced and targeted requests are unconditional. In particular a
            # decky named in *only* must be mutated even when neither it nor
            # the deployment defines a mutate interval; previously such a
            # request was silently dropped by the "no interval" skip below.
            due = True
        else:
            interval_mins = decky.mutate_interval or config.mutate_interval
            if interval_mins is None:
                # No schedule configured for this decky: nothing to do on a
                # scheduled tick.
                continue
            interval_secs = interval_mins * 60
            elapsed_secs = now - decky.last_mutated
            due = elapsed_secs >= interval_secs
            remaining = interval_secs - elapsed_secs
            # Track the soonest upcoming deadline among not-yet-due deckies.
            if not due and (next_due_in is None or remaining < next_due_in):
                next_due_in = remaining

        if due:
            success = await mutate_decky(
                decky.name, repo=repo, bus=bus, trigger=trigger,
            )
            if success:
                mutated_count += 1

    if mutated_count:
        log.info("mutate_all: complete mutated_count=%d", mutated_count)
    else:
        log.debug("mutate_all: no deckies due for mutation")
    return next_due_in
|
||||
|
||||
|
||||
@_traced("mutator.reconcile_topologies")
async def reconcile_topologies(
    repo: BaseRepository, bus: BaseBus | None = None,
) -> int:
    """Apply queued ``topology_mutations`` rows to every live topology.

    For each id returned by ``repo.list_live_topology_ids()`` the oldest
    pending mutation is claimed via ``repo.claim_next_mutation``, handed
    to ``decnet.mutator.ops.dispatch``, and then recorded as applied or
    failed. ``applying`` / ``applied`` / ``failed`` notifications are
    published on the bus around each step.

    When an op raises, the mutation is marked failed with
    ``"<ExceptionType>: <message>"`` as the reason, the topology is moved
    to ``DEGRADED`` (a ``TopologyStatusError`` from that transition is
    ignored), and no further mutations are claimed for that topology in
    this call.

    Returns:
        The number of mutations successfully applied during this call.
    """
    # Local imports keep the flat-fleet hot path free of MazeNET cost.
    from decnet.mutator.ops import MutationError, dispatch as _op_dispatch
    from decnet.topology.persistence import transition_status
    from decnet.topology.status import TopologyStatus, TopologyStatusError

    applied_count = 0
    for tid in await repo.list_live_topology_ids():
        # Keep claiming until the queue for this topology is empty
        # (claim returns None) or an op fails (explicit break below).
        while (mut := await repo.claim_next_mutation(tid)) is not None:
            await _publish_safely(
                bus,
                _topics.topology_mutation(tid, _topics.MUTATION_APPLYING),
                {"mutation_id": mut["id"], "op": mut["op"], "payload": mut["payload"]},
                event_type=_topics.MUTATION_APPLYING,
            )
            try:
                await _op_dispatch(repo, tid, mut["op"], mut["payload"])
                await repo.mark_mutation_applied(mut["id"])
                applied_count += 1
                log.info(
                    "topology %s mutation %s applied op=%s",
                    tid, mut["id"], mut["op"],
                )
                await _publish_safely(
                    bus,
                    _topics.topology_mutation(tid, _topics.MUTATION_APPLIED),
                    {"mutation_id": mut["id"], "op": mut["op"]},
                    event_type=_topics.MUTATION_APPLIED,
                )
            except (MutationError, Exception) as err:  # noqa: BLE001
                failure = f"{type(err).__name__}: {err}"
                await repo.mark_mutation_failed(mut["id"], failure)
                log.warning(
                    "topology %s mutation %s failed: %s",
                    tid, mut["id"], failure,
                )
                await _publish_safely(
                    bus,
                    _topics.topology_mutation(tid, _topics.MUTATION_FAILED),
                    {"mutation_id": mut["id"], "op": mut["op"], "reason": failure},
                    event_type=_topics.MUTATION_FAILED,
                )
                # Already degraded / in a state that can't degrade
                # further — leave as is.
                with contextlib.suppress(TopologyStatusError):
                    await transition_status(
                        repo, tid, TopologyStatus.DEGRADED, reason=failure,
                    )
                    await _publish_safely(
                        bus,
                        _topics.topology_status(tid),
                        {"state": TopologyStatus.DEGRADED, "reason": failure},
                        event_type=_topics.TOPOLOGY_STATUS,
                    )
                # Stop draining this topology on first failure so the
                # operator can inspect before a cascade.
                break
    return applied_count
|
||||
|
||||
|
||||
@_traced("mutator.reconcile_agent_resyncs")
async def reconcile_agent_resyncs(repo: BaseRepository) -> int:
    """Re-push every topology the repository reports as needing a resync.

    For each entry from ``repo.list_topologies_needing_resync()`` this
    calls ``deployer.resync_agent_topology`` and, when that succeeds,
    clears the flag with ``repo.set_topology_resync(id, False)``. A
    failure for one topology is logged as a warning and leaves its flag
    untouched, so it is picked up again on a later call; the remaining
    topologies are still processed.

    Returns:
        The number of topologies resynced, or ``0`` when the repository
        backend raises ``NotImplementedError`` for the listing call.
    """
    from decnet.engine import deployer as _deployer

    try:
        flagged = await repo.list_topologies_needing_resync()
    except NotImplementedError:
        return 0

    resynced = 0
    for entry in flagged:
        topology_id = entry["id"]
        try:
            await _deployer.resync_agent_topology(repo, topology_id)
            await repo.set_topology_resync(topology_id, False)
            resynced += 1
            log.info(
                "topology %s resynced to agent %s",
                topology_id, entry.get("target_host_uuid"),
            )
        except Exception as err:  # noqa: BLE001
            log.warning(
                "topology %s resync failed (will retry): %s", topology_id, err,
            )
    return resynced
|
||||
|
||||
|
||||
@_traced("mutator.watch_loop")
async def run_watch_loop(repo: BaseRepository, poll_interval_secs: int = 10) -> None:
    """Run an infinite loop checking for deckies that need mutation.

    Each tick performs, in order:

    1. Flat-fleet service rotation via ``mutate_all`` (restricted to the
       deckies named in pending mutate requests, when there are any).
    2. MazeNET live-mutation reconciliation, entered only when
       ``repo.has_pending_topology_mutation()`` returns True.
    3. Agent resync reconciliation via ``reconcile_agent_resyncs``.

    The loop then sleeps until the earlier of the next scheduled
    mutation and ``poll_interval_secs`` (never less than one second), or
    until a bus subscriber sets the wake event.

    Args:
        repo: Repository used for deployment state and topology queues.
        poll_interval_secs: Upper bound, in seconds, on the sleep
            between ticks.
    """
    log.info("mutator watch loop started poll_interval_secs=%d", poll_interval_secs)
    console.print(f"[green]DECNET Mutator Watcher started (polling every {poll_interval_secs}s).[/]")

    # Connect to the bus for publish + wake-on-enqueue. Failure here is
    # non-fatal: a mutator without a bus still works, it just runs at
    # poll-interval latency and doesn't push notifications to UI clients.
    bus: BaseBus | None = None
    wake = asyncio.Event()
    mutate_requests: set[str] = set()
    wake_tasks: list[asyncio.Task] = []
    heartbeat_task: asyncio.Task | None = None
    try:
        candidate = get_bus(client_name="mutator")
        await candidate.connect()
        bus = candidate
        wake_tasks.append(asyncio.create_task(_wake_on_enqueue(bus, wake)))
        wake_tasks.append(asyncio.create_task(
            _wake_on_mutate_request(bus, wake, mutate_requests),
        ))
        heartbeat_task = asyncio.create_task(
            _run_health_heartbeat(bus, "mutator"),
        )
        # Control listener: SIGTERM-based so the existing shutdown path
        # (cancel wake_tasks + heartbeat_task) runs unchanged.
        wake_tasks.append(asyncio.create_task(
            _run_control_listener_signal(bus, "mutator"),
        ))
    except Exception as exc:  # noqa: BLE001
        log.warning("mutator: bus unavailable, running in poll-only mode: %s", exc)

    # Edge-triggered "no deployment" state so we don't spam the console
    # every tick on a host that hasn't deployed yet. Starts as None so
    # the first observation always logs exactly one line.
    deployment_present: bool | None = None

    try:
        while True:
            # Snapshot and reset the on-demand requests in one step (no
            # await in between, so the subscriber task cannot interleave).
            requested = set(mutate_requests)
            mutate_requests.clear()

            next_due = await mutate_all(
                repo=repo,
                force=False,
                bus=bus,
                only=requested or None,
            )

            # A non-None next_due already proves a deployment exists;
            # otherwise ask the repository directly.
            has_deployment = next_due is not None
            if not has_deployment:
                has_deployment = await repo.get_state("deployment") is not None

            # Log only when the observed state differs from the last one.
            if deployment_present is not has_deployment:
                if has_deployment:
                    log.info("mutator: active deployment observed — entering normal cadence")
                    console.print("[green]Active deployment observed.[/]")
                else:
                    log.info("mutator: no active deployment — idling until one lands")
                    console.print("[dim]No active deployment; mutator idling.[/]")
                deployment_present = has_deployment

            # Gate the reconciler on the guard query so the dispatch body
            # is not entered when there is nothing to do. A backend
            # without MazeNET support raises NotImplementedError.
            with contextlib.suppress(NotImplementedError):
                if await repo.has_pending_topology_mutation():
                    await reconcile_topologies(repo, bus=bus)

            try:
                await reconcile_agent_resyncs(repo)
            except NotImplementedError:
                pass
            except Exception:
                log.exception("reconcile_agent_resyncs tick raised")

            # Adaptive sleep: wake at the earlier of (next decky due) or
            # (poll_interval_secs), bounded below by 1s so a thrashing
            # schedule can't spin the loop. A bus wake (enqueue or
            # mutate_request) short-circuits the wait.
            if next_due is not None and next_due <= poll_interval_secs:
                timeout = max(1.0, next_due)
            else:
                timeout = float(poll_interval_secs)
            with contextlib.suppress(asyncio.TimeoutError):
                await asyncio.wait_for(wake.wait(), timeout=timeout)
            wake.clear()
    except KeyboardInterrupt:
        log.info("mutator watch loop stopped")
        console.print("\n[dim]Mutator watcher stopped.[/]")
    finally:
        # Cancel every background task first, then await them so their
        # cancellation is fully processed before the bus is closed.
        background = list(wake_tasks)
        if heartbeat_task is not None:
            background.append(heartbeat_task)
        for task in background:
            task.cancel()
        for task in background:
            with contextlib.suppress(asyncio.CancelledError, Exception):
                await task
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()
|
||||
|
||||
|
||||
async def _wake_on_enqueue(bus: BaseBus, wake: asyncio.Event) -> None:
    """Set *wake* whenever an enqueued-mutation event arrives on the bus.

    Subscribes once to the wildcard pattern
    ``<TOPOLOGY>.*.mutation.<MUTATION_ENQUEUED>`` and sets the event for
    every message received. Cancellation is propagated to the caller;
    any other exception ends the subscriber with a warning, leaving the
    watch loop to rely on its poll interval.
    """
    topic_pattern = f"{_topics.TOPOLOGY}.*.mutation.{_topics.MUTATION_ENQUEUED}"
    try:
        subscription = bus.subscribe(topic_pattern)
        async with subscription:
            async for _ in subscription:
                wake.set()
    except asyncio.CancelledError:
        # Let task cancellation reach the caller untouched.
        raise
    except Exception as err:  # noqa: BLE001
        log.warning("mutator: wake subscriber died (%s); falling back to poll", err)
|
||||
|
||||
|
||||
async def _wake_on_mutate_request(
    bus: BaseBus,
    wake: asyncio.Event,
    pending: set[str],
) -> None:
    """Record on-demand mutate requests and wake the watch loop.

    Subscribes to ``<DECKY>.*.<DECKY_MUTATE_REQUEST>``. For each event
    the decky name is taken from the second dot-separated token of the
    topic (when the topic has at least three tokens); failing that, from
    the ``name`` key of a dict payload. A non-empty name is added to
    *pending*, which the watch loop later feeds to
    ``mutate_all(only=...)``, and *wake* is set to cut the sleep short.

    Cancellation is propagated; any other exception ends the subscriber
    with a warning.
    """
    topic_pattern = f"{_topics.DECKY}.*.{_topics.DECKY_MUTATE_REQUEST}"
    try:
        subscription = bus.subscribe(topic_pattern)
        async with subscription:
            async for event in subscription:
                tokens = (getattr(event, "topic", "") or "").split(".")
                decky_name = tokens[1] if len(tokens) >= 3 else ""
                body = getattr(event, "payload", None) or {}
                # Fall back to the payload only when the topic gave no name.
                if not decky_name and isinstance(body, dict):
                    decky_name = body.get("name", "") or ""
                if decky_name:
                    pending.add(decky_name)
                    wake.set()
    except asyncio.CancelledError:
        # Let task cancellation reach the caller untouched.
        raise
    except Exception as err:  # noqa: BLE001
        log.warning(
            "mutator: mutate_request subscriber died (%s); falling back to poll",
            err,
        )
|
||||
|
||||
108
decnet/mutator/events.py
Normal file
108
decnet/mutator/events.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""Mutation-event emission.
|
||||
|
||||
One helper (:func:`emit_decky_mutated`) writes every substrate
|
||||
transition to two places at once:
|
||||
|
||||
1. **RFC 5424 syslog** — appended to the collector's ingest log, so
|
||||
the correlation engine picks the event up alongside attacker
|
||||
events and can interleave substrate-change markers into traversals.
|
||||
2. **Bus topic** ``decky.<name>.mutation`` — fire-and-forget
|
||||
notification for live UI consumers (SSE, dashboards).
|
||||
|
||||
The split mirrors the DB-vs-bus contract: syslog is durable, bus is
|
||||
at-most-once. Either path failing must never crash the mutator loop,
|
||||
so both sides are wrapped in broad ``try/except log.warning``.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import socket as _socket
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.base import BaseBus
|
||||
from decnet.bus.publish import publish_safely as _publish_safely
|
||||
from decnet.env import DECNET_INGEST_LOG_FILE
|
||||
from decnet.logging import get_logger
|
||||
from decnet.logging.syslog_formatter import format_rfc5424
|
||||
|
||||
log = get_logger("mutator.events")
|
||||
|
||||
|
||||
# Trigger enum — wide on purpose so the schema stays stable as v2/v3
|
||||
# features (behavioral + federation) land. Every call site supplies
|
||||
# exactly one of these.
|
||||
MutationTrigger = Literal[
|
||||
"creation", # initial deploy of a decky
|
||||
"retirement", # teardown / removal
|
||||
"scheduled", # mutator watch-loop interval tick
|
||||
"operator", # explicit force via API/CLI/UI
|
||||
"behavioral", # future: attacker-behavior-driven rotation
|
||||
"healer", # future: re-apply by the healer worker
|
||||
"federation", # future: cross-operator MazeNET mutation
|
||||
]
|
||||
|
||||
_EVENT_TYPE = "decky_mutated"
|
||||
_MUTATOR_APP = "mutator"
|
||||
_MUTATOR_HOSTNAME = _socket.gethostname()
|
||||
|
||||
|
||||
async def emit_decky_mutated(
    bus: BaseBus | None,
    *,
    decky: str,
    old_services: list[str],
    new_services: list[str],
    trigger: MutationTrigger,
    actor: str | None = None,
    log_path: Path | str | None = None,
) -> None:
    """Emit one ``decky_mutated`` event on both the syslog stream and the bus.

    Args:
        bus: Bus to publish on; passed straight to ``publish_safely``.
        decky: Name of the mutated decky. Also used as the syslog
            HOSTNAME field and as the bus topic token.
        old_services: Service list before the mutation.
        new_services: Service list after the mutation.
        trigger: What caused the mutation (see :data:`MutationTrigger`).
        actor: Optional identity of whoever requested the mutation;
            included in both outputs only when truthy.
        log_path: File the syslog line is appended to. ``None`` (the
            default) selects :data:`decnet.env.DECNET_INGEST_LOG_FILE`;
            it does NOT suppress the file write. Tests should pass an
            explicit path to redirect it.

    The target's parent directory is created when missing. Any failure
    while formatting or writing the line is a soft failure: it is logged
    as a warning (on every occurrence) and the bus publish still runs,
    because the correlator works without mutation events and degrading
    is preferable to crashing the mutator.

    In the syslog record the service lists are comma-joined strings; in
    the bus payload they are lists.
    """
    fields: dict[str, Any] = {
        "decky": decky,
        "old_services": ",".join(old_services),
        "new_services": ",".join(new_services),
        "trigger": trigger,
    }
    if actor:
        fields["actor"] = actor

    # ── Syslog side ───────────────────────────────────────────────
    target = Path(log_path) if log_path is not None else Path(DECNET_INGEST_LOG_FILE)
    try:
        line = format_rfc5424(
            service=_MUTATOR_APP,
            hostname=decky,  # per-decky HOSTNAME so correlator indexes it correctly
            event_type=_EVENT_TYPE,
            **fields,
        )
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, "a", encoding="utf-8") as fh:
            fh.write(line + "\n")
            fh.flush()
    except Exception as exc:  # noqa: BLE001
        log.warning("syslog emission failed decky=%s path=%s: %s",
                    decky, target, exc)

    # ── Bus side ──────────────────────────────────────────────────
    payload: dict[str, Any] = {
        "decky": decky,
        "old_services": list(old_services),
        "new_services": list(new_services),
        "trigger": trigger,
    }
    if actor:
        payload["actor"] = actor
    await _publish_safely(
        bus,
        _topics.decky_mutation(decky),
        payload,
        event_type=_topics.DECKY_MUTATION,
    )
|
||||
440
decnet/mutator/ops.py
Normal file
440
decnet/mutator/ops.py
Normal file
@@ -0,0 +1,440 @@
|
||||
"""Live-mutation ops for active MazeNET topologies.
|
||||
|
||||
Each ``apply_<op>`` function consumes a claimed ``TopologyMutation``
|
||||
payload, mutates the repo (and, best-effort, the underlying Docker
|
||||
state), then re-runs :func:`decnet.topology.validate.validate` against
|
||||
the post-apply hydrated view. If validation errors appear, the op is
|
||||
reported as failed and the caller flips the topology to ``degraded`` —
|
||||
we never leave the repo in an invalid state.
|
||||
|
||||
Design notes
|
||||
------------
|
||||
* All ops are *repo-first*. The reconciler's job is to converge Docker
|
||||
toward the repo's desired state, so persisting intent first keeps the
|
||||
system self-healing across master restarts.
|
||||
* Docker calls are optional at the ops layer: the tests drive these
|
||||
functions directly against an in-memory repo, and the reconciler
|
||||
sidecar calls them in production where Docker is present. Every
|
||||
Docker call is guarded so missing/unreachable Docker doesn't leave
|
||||
the DB half-mutated.
|
||||
* Ops intentionally do NOT perform optimistic-concurrency checks — the
|
||||
enqueue step already carried the caller's ``expected_version``. The
|
||||
reconciler is the sole writer from here on.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any, Awaitable, Callable, Optional
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.topology.allocator import IPAllocator, reserved_subnets, SubnetAllocator
|
||||
from decnet.topology.persistence import hydrate
|
||||
from decnet.topology.validate import (
|
||||
check_names_unique,
|
||||
check_no_ip_collisions,
|
||||
check_no_subnet_overlap,
|
||||
check_service_config_shape,
|
||||
check_services_known,
|
||||
errors as _validation_errors,
|
||||
)
|
||||
|
||||
# Post-apply validation intentionally excludes topology-shape rules
|
||||
# (``check_all_lans_connected_to_dmz``, ``check_exactly_one_dmz``,
|
||||
# ``check_no_orphan_deckies``) — those are legitimately transient
|
||||
# during live editing (e.g. ``add_lan`` leaves the new LAN orphaned
|
||||
# until the next ``attach_decky``). The deployer's full ``validate()``
|
||||
# pass still runs at redeploy time. Invariants that MUST hold after
|
||||
# every single op are kept here.
|
||||
_POST_APPLY_CHECKS = (
|
||||
check_names_unique,
|
||||
check_no_ip_collisions,
|
||||
check_no_subnet_overlap,
|
||||
check_services_known,
|
||||
check_service_config_shape,
|
||||
)
|
||||
|
||||
_log = get_logger("mutator.ops")
|
||||
|
||||
|
||||
class MutationError(RuntimeError):
    """Signal that an ``apply_<op>`` rejected the requested change.

    Raised when a precondition fails (for example a missing LAN or
    decky) or when post-apply validation reports errors.
    """
|
||||
|
||||
|
||||
OpFunc = Callable[[Any, str, dict[str, Any]], Awaitable[None]]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------- helpers
|
||||
|
||||
|
||||
async def _hydrated(repo: Any, topology_id: str) -> dict[str, Any]:
    """Return the hydrated view of *topology_id*.

    Raises:
        MutationError: if ``hydrate`` returns ``None`` for the topology.
    """
    snapshot = await hydrate(repo, topology_id)
    if snapshot is not None:
        return snapshot
    raise MutationError(f"topology {topology_id!r} vanished mid-apply")
|
||||
|
||||
|
||||
async def _assert_valid_after(repo: Any, topology_id: str) -> None:
    """Re-hydrate the topology and run every post-apply invariant check.

    Raises:
        MutationError: if any check in ``_POST_APPLY_CHECKS`` yields an
            issue that ``errors()`` classifies as an error. The message
            lists the distinct issue codes in sorted order.
    """
    snapshot = await _hydrated(repo, topology_id)
    findings: list = []
    for check in _POST_APPLY_CHECKS:
        findings += list(check(snapshot))
    failures = _validation_errors(findings)
    if not failures:
        return
    distinct_codes = sorted({issue.code for issue in failures})
    codes = ", ".join(distinct_codes)
    raise MutationError(
        f"post-apply validation failed for {topology_id}: {codes}"
    )
|
||||
|
||||
|
||||
def _lan_by_name(hydrated: dict[str, Any], name: str) -> Optional[dict]:
    """Return the first entry of ``hydrated["lans"]`` named *name*, else ``None``."""
    for candidate in hydrated["lans"]:
        if candidate["name"] == name:
            return candidate
    return None
|
||||
|
||||
|
||||
def _decky_by_name(hydrated: dict[str, Any], name: str) -> Optional[dict]:
    """Return the first decky whose ``decky_config["name"]`` is *name*, else ``None``."""
    for candidate in hydrated["deckies"]:
        if candidate["decky_config"]["name"] == name:
            return candidate
    return None
|
||||
|
||||
|
||||
# ------------------------------------------------------------------- ops
|
||||
|
||||
|
||||
async def apply_add_lan(
    repo: Any, topology_id: str, payload: dict[str, Any]
) -> None:
    """Add a new LAN to an active topology.

    ``payload`` keys:
        ``name``    — LAN name (required).
        ``subnet``  — CIDR for the LAN (optional). When absent, one is
                      taken from a ``SubnetAllocator`` over
                      ``172.16.0.0/12`` that excludes the subnets
                      reported by ``reserved_subnets``.
        ``is_dmz``  — bool, default False.
        ``x``,``y`` — layout coords, optional.

    Raises:
        KeyError: if ``name`` is missing from *payload*.
        MutationError: if post-apply validation fails.
    """
    lan_name = payload["name"]
    cidr = payload.get("subnet")
    dmz_flag = bool(payload.get("is_dmz", False))

    if cidr is None:
        already_used = await reserved_subnets(repo)
        allocator = SubnetAllocator(
            base_prefix="172.16.0.0/12", reserved=already_used
        )
        cidr = allocator.next_free()

    lan_row = {
        "topology_id": topology_id,
        "name": lan_name,
        "subnet": cidr,
        "is_dmz": dmz_flag,
        "x": payload.get("x"),
        "y": payload.get("y"),
    }
    await repo.add_lan(lan_row)
    await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
async def apply_remove_lan(
    repo: Any, topology_id: str, payload: dict[str, Any]
) -> None:
    """Delete the LAN named ``payload["name"]`` from the topology.

    The removal is refused while any decky has this LAN as its home LAN,
    i.e. as the first key of its ``ips_by_lan`` mapping.

    Raises:
        MutationError: if the LAN does not exist, if it is some decky's
            home LAN, or if post-apply validation fails.
    """
    snapshot = await _hydrated(repo, topology_id)
    lan = _lan_by_name(snapshot, payload["name"])
    if lan is None:
        raise MutationError(f"LAN {payload['name']!r} not found")

    for member in snapshot["deckies"]:
        memberships = member["decky_config"].get("ips_by_lan", {})
        if not memberships:
            continue
        # The first key of ips_by_lan is treated as the decky's home LAN.
        home_lan = next(iter(memberships))
        if home_lan == lan["name"]:
            raise MutationError(
                f"LAN {lan['name']!r} is the home LAN of decky "
                f"{member['decky_config']['name']!r}; remove the decky first"
            )

    await repo.delete_lan(lan["id"])
    await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
async def apply_add_decky(
    repo: Any, topology_id: str, payload: dict[str, Any]
) -> None:
    """Create a brand-new decky and attach it to its home LAN.

    ``apply_attach_decky`` only works on a decky that already exists, so
    this op is the way a live topology gains a new decky.

    ``payload`` keys:
        ``name``         — decky name (required, unique in topology).
        ``lan``          — home LAN name (required).
        ``services``     — list of service slugs (optional).
        ``archetype``    — slug string; stored in ``decky_config`` (optional).
        ``forwards_l3``  — bool; stored in ``decky_config`` only when true.
        ``ip``           — pinned IP inside the LAN; else auto-allocated.
        ``x``,``y``      — layout coords (optional).

    Raises:
        MutationError: if the name is already taken, the LAN does not
            exist, or post-apply validation fails.
    """
    decky_name = payload["name"]
    snapshot = await _hydrated(repo, topology_id)
    if _decky_by_name(snapshot, decky_name) is not None:
        raise MutationError(f"decky {decky_name!r} already exists")
    lan = _lan_by_name(snapshot, payload["lan"])
    if lan is None:
        raise MutationError(f"LAN {payload['lan']!r} not found")

    ip = payload.get("ip")
    if ip is None:
        # Collect the addresses other deckies already hold on this LAN,
        # reserve them, then take the allocator's next free address.
        occupied = set()
        for other in snapshot["deckies"]:
            memberships = other["decky_config"].get("ips_by_lan", {})
            if lan["name"] in memberships:
                occupied.add(memberships.get(lan["name"]))
        allocator = IPAllocator(subnet=lan["subnet"])
        for address in occupied:
            if address:
                allocator.reserve(address)
        ip = allocator.next_free()

    decky_config: dict[str, Any] = {
        "name": decky_name,
        "ips_by_lan": {lan["name"]: ip},
    }
    if "archetype" in payload:
        decky_config["archetype"] = payload["archetype"]
    forwards_l3 = bool(payload.get("forwards_l3", False))
    if forwards_l3:
        decky_config["forwards_l3"] = True

    decky_row = {
        "topology_id": topology_id,
        "name": decky_name,
        "services": list(payload.get("services", [])),
        "decky_config": decky_config,
        "x": payload.get("x"),
        "y": payload.get("y"),
    }
    decky_uuid = await repo.add_topology_decky(decky_row)

    # The home-LAN edge is never a bridge edge.
    edge_row = {
        "topology_id": topology_id,
        "decky_uuid": decky_uuid,
        "lan_id": lan["id"],
        "is_bridge": False,
        "forwards_l3": forwards_l3,
    }
    await repo.add_topology_edge(edge_row)
    await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
async def apply_attach_decky(
    repo: Any, topology_id: str, payload: dict[str, Any]
) -> None:
    """Attach an existing decky to an additional LAN (bridge edge).

    ``payload`` keys:
        ``decky``        — decky name.
        ``lan``          — LAN name.
        ``ip``           — optional pinned IP; else allocated inside the LAN.
        ``forwards_l3``  — bool, default False.

    Raises:
        MutationError: if the LAN or decky does not exist, if the decky
            already has an edge to that LAN, or if post-apply validation
            fails.
    """
    snapshot = await _hydrated(repo, topology_id)
    lan = _lan_by_name(snapshot, payload["lan"])
    decky = _decky_by_name(snapshot, payload["decky"])
    if lan is None:
        raise MutationError(f"LAN {payload['lan']!r} not found")
    if decky is None:
        raise MutationError(f"decky {payload['decky']!r} not found")

    # Guard against re-attaching.
    already_attached = any(
        edge["decky_uuid"] == decky["uuid"] and edge["lan_id"] == lan["id"]
        for edge in snapshot["edges"]
    )
    if already_attached:
        raise MutationError(
            f"decky {decky['decky_config']['name']!r} already on "
            f"LAN {lan['name']!r}"
        )

    ip = payload.get("ip")
    if ip is None:
        # Collect the addresses other deckies already hold on this LAN,
        # reserve them, then take the allocator's next free address.
        occupied = set()
        for other in snapshot["deckies"]:
            memberships = other["decky_config"].get("ips_by_lan", {})
            if lan["name"] in memberships:
                occupied.add(memberships.get(lan["name"]))
        allocator = IPAllocator(subnet=lan["subnet"])
        for address in occupied:
            if address:
                allocator.reserve(address)
        ip = allocator.next_free()

    # Work on copies so the hydrated snapshot itself is not modified.
    new_cfg = dict(decky["decky_config"])
    updated_ips = dict(new_cfg.get("ips_by_lan", {}))
    updated_ips[lan["name"]] = ip
    new_cfg["ips_by_lan"] = updated_ips
    forwards_l3 = bool(payload.get("forwards_l3", False))
    if forwards_l3:
        new_cfg["forwards_l3"] = True

    await repo.update_topology_decky(
        decky["uuid"], {"decky_config": new_cfg}
    )
    # Adding a second edge makes the decky multi-homed (a bridge decky).
    edge_row = {
        "topology_id": topology_id,
        "decky_uuid": decky["uuid"],
        "lan_id": lan["id"],
        "is_bridge": True,
        "forwards_l3": forwards_l3,
    }
    await repo.add_topology_edge(edge_row)
    await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
async def apply_detach_decky(
    repo: Any, topology_id: str, payload: dict[str, Any]
) -> None:
    """Remove a decky's membership in one LAN other than its home LAN.

    ``payload`` keys:
      ``decky`` — decky name.
      ``lan``   — name of the LAN to detach from.

    The home LAN is the first key of the decky's ``ips_by_lan`` mapping and
    can never be detached here. On success the LAN entry is dropped from
    ``ips_by_lan``, the matching edge is deleted, and the topology is
    re-validated.

    Raises:
        MutationError: if the decky or LAN is unknown, the decky has no LAN
            memberships, the LAN is the decky's home LAN, or no edge links
            the decky to the LAN.
    """
    hydrated = await _hydrated(repo, topology_id)
    lan = _lan_by_name(hydrated, payload["lan"])
    decky = _decky_by_name(hydrated, payload["decky"])
    if lan is None or decky is None:
        raise MutationError("decky or LAN not found")

    memberships = decky["decky_config"].get("ips_by_lan", {})
    if not memberships:
        raise MutationError("decky has no LAN memberships")

    # Dicts keep insertion order, so the first membership is the home LAN.
    home_lan = next(iter(memberships))
    if home_lan == lan["name"]:
        raise MutationError(
            f"cannot detach home LAN {home_lan!r}; use remove_decky"
        )

    # Locate the edge joining this decky to this LAN; bail out if absent.
    for candidate in hydrated["edges"]:
        if (
            candidate["decky_uuid"] == decky["uuid"]
            and candidate["lan_id"] == lan["id"]
        ):
            edge = candidate
            break
    else:
        raise MutationError(
            f"decky not attached to LAN {lan['name']!r}"
        )

    # Build a fresh config so the hydrated snapshot is left unmodified.
    remaining_ips = {
        lan_name: ip
        for lan_name, ip in memberships.items()
        if lan_name != lan["name"]
    }
    updated_cfg = {**decky["decky_config"], "ips_by_lan": remaining_ips}

    await repo.update_topology_decky(
        decky["uuid"], {"decky_config": updated_cfg}
    )
    await repo.delete_topology_edge(edge["id"])
    await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
async def apply_remove_decky(
    repo: Any, topology_id: str, payload: dict[str, Any]
) -> None:
    """Delete the decky named by ``payload["decky"]`` from the topology.

    The topology is re-validated after the delete.

    Raises:
        MutationError: if no decky with that name exists in the topology.
    """
    snapshot = await _hydrated(repo, topology_id)
    decky = _decky_by_name(snapshot, payload["decky"])
    if decky is None:
        raise MutationError(f"decky {payload['decky']!r} not found")

    decky_uuid = decky["uuid"]
    await repo.delete_topology_decky(decky_uuid)
    await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
async def apply_update_decky(
    repo: Any, topology_id: str, payload: dict[str, Any]
) -> None:
    """Apply config, services and/or layout changes to one decky.

    ``payload`` keys:
      ``decky``    — decky name (required).
      ``patch``    — dict shallow-merged over the existing ``decky_config``.
      ``services`` — replacement top-level services list.
      ``x``, ``y`` — layout coords.

    When none of the optional keys yields a change, nothing is written and
    no validation runs.

    Raises:
        MutationError: if the decky does not exist in the topology.
    """
    hydrated = await _hydrated(repo, topology_id)
    decky = _decky_by_name(hydrated, payload["decky"])
    if decky is None:
        raise MutationError(f"decky {payload['decky']!r} not found")

    changes: dict[str, Any] = {}

    config_patch = payload.get("patch")
    if config_patch:
        # Shallow merge: keys in the patch replace the stored values.
        merged_cfg = dict(decky["decky_config"])
        merged_cfg.update(config_patch)
        changes["decky_config"] = merged_cfg

    if "services" in payload:
        changes["services"] = list(payload["services"])

    changes.update(
        {axis: payload[axis] for axis in ("x", "y") if axis in payload}
    )

    if changes:
        await repo.update_topology_decky(decky["uuid"], changes)
        await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
async def apply_update_lan(
    repo: Any, topology_id: str, payload: dict[str, Any]
) -> None:
    """Update LAN fields — subnet, is_dmz, coords, rename.

    ``payload`` keys:
      ``name``     — current LAN name (required).
      ``patch``    — dict of fields handed to ``repo.update_lan``; a missing
                     or ``None`` value means "no field changes".
      ``x``, ``y`` — layout coords; these override same-named keys in
                     ``patch``.

    When there is nothing to change, nothing is written and no validation
    runs.

    Raises:
        MutationError: if the LAN does not exist in the topology.
    """
    hydrated = await _hydrated(repo, topology_id)
    lan = _lan_by_name(hydrated, payload["name"])
    if lan is None:
        raise MutationError(f"LAN {payload['name']!r} not found")

    # ``or {}`` also covers an explicit ``"patch": null`` in the JSON
    # payload, which would otherwise fail on ``None.items()``. Copy so the
    # caller's payload is never mutated.
    fields = dict(payload.get("patch") or {})
    for key in ("x", "y"):
        if key in payload:
            fields[key] = payload[key]
    if not fields:
        return

    await repo.update_lan(lan["id"], fields)
    await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
# Single op-name -> handler table. It lives here, in one place, so the
# engine and the CLI stay in sync without importing from each other.
DISPATCH: dict[str, OpFunc] = {
    "add_lan": apply_add_lan,
    "remove_lan": apply_remove_lan,
    "add_decky": apply_add_decky,
    "attach_decky": apply_attach_decky,
    "detach_decky": apply_detach_decky,
    "remove_decky": apply_remove_decky,
    "update_decky": apply_update_decky,
    "update_lan": apply_update_lan,
}
|
||||
|
||||
|
||||
async def dispatch(
    repo: Any,
    topology_id: str,
    op: str,
    payload_raw: str | dict[str, Any],
) -> None:
    """Run the handler registered for ``op`` in ``DISPATCH``.

    ``payload_raw`` may be an already-decoded payload, which is passed
    through unchanged, or a JSON string. An empty string decodes to an
    empty payload.

    Raises:
        MutationError: if ``op`` has no entry in ``DISPATCH``.
    """
    if not isinstance(payload_raw, str):
        payload = payload_raw
    elif payload_raw:
        payload = json.loads(payload_raw)
    else:
        payload = {}

    try:
        handler = DISPATCH[op]
    except KeyError as exc:
        raise MutationError(f"unknown op: {op!r}") from exc

    await handler(repo, topology_id, payload)
|
||||
|
||||
|
||||
# Public API: the dispatch table, its error type, the entry point, and
# every individual op handler.
__all__ = [
    "DISPATCH",
    "MutationError",
    "dispatch",
    "apply_add_lan",
    "apply_remove_lan",
    "apply_add_decky",
    "apply_attach_decky",
    "apply_detach_decky",
    "apply_remove_decky",
    "apply_update_decky",
    "apply_update_lan",
]
|
||||
Reference in New Issue
Block a user