feat(mutator): bus-wake on decky mutate_request; adaptive sleep; heartbeat
The flat-fleet mutator was DB-poll-only and noisy — it logged
"no active deployment found" every 10s on idle hosts and ran
mutate_all at a fixed tick regardless of when the next decky
was due.
- mutate_all returns seconds-until-next-due; watch loop sleeps
min(next_due, poll_interval_secs) with a 1s floor.
- "No deployment" is now idle, not an error: edge-triggered log
on present<->absent transition instead of every tick.
- mutate_decky publishes decky.<name>.state on successful compose
so UIs react in real time.
- New decky.*.mutate_request subscription lets API/CLI/UI force
an immediate mutation of a specific decky without waiting for
its interval; target name feeds mutate_all(only={...}).
- system.mutator.health heartbeat via run_health_heartbeat helper,
bringing the mutator in line with DEBT-031 workers.
Tests: next_due return, only= filter, decky.<name>.state publish
on success, no publish on compose failure. Full mutator+topology-
mutator+bus suite (109) green.
This commit is contained in:
@@ -50,6 +50,11 @@ TOPOLOGY_STATUS = "status"
|
|||||||
# Decky-level event types (second token).
|
# Decky-level event types (second token).
|
||||||
DECKY_STATE = "state"
|
DECKY_STATE = "state"
|
||||||
DECKY_TRAFFIC = "traffic"
|
DECKY_TRAFFIC = "traffic"
|
||||||
|
# On-demand mutation request — published by the API/CLI/UI, consumed by
|
||||||
|
# the mutator's watch loop to force an immediate mutation of one decky
|
||||||
|
# without waiting for its scheduled interval. Underscored (not dotted)
|
||||||
|
# to stay a single NATS token so the builder's validator accepts it.
|
||||||
|
DECKY_MUTATE_REQUEST = "mutate_request"
|
||||||
|
|
||||||
# Attacker event types (second token under the ``attacker`` root). First
|
# Attacker event types (second token under the ``attacker`` root). First
|
||||||
# sighting, session boundary transitions, and score-threshold crossings
|
# sighting, session boundary transitions, and score-threshold crossings
|
||||||
|
|||||||
@@ -25,7 +25,10 @@ import contextlib
|
|||||||
from decnet.bus import topics as _topics
|
from decnet.bus import topics as _topics
|
||||||
from decnet.bus.base import BaseBus
|
from decnet.bus.base import BaseBus
|
||||||
from decnet.bus.factory import get_bus
|
from decnet.bus.factory import get_bus
|
||||||
from decnet.bus.publish import publish_safely as _publish_safely
|
from decnet.bus.publish import (
|
||||||
|
publish_safely as _publish_safely,
|
||||||
|
run_health_heartbeat as _run_health_heartbeat,
|
||||||
|
)
|
||||||
from decnet.web.db.repository import BaseRepository
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
|
||||||
log = get_logger("mutator")
|
log = get_logger("mutator")
|
||||||
@@ -33,7 +36,11 @@ console = Console()
|
|||||||
|
|
||||||
|
|
||||||
@_traced("mutator.mutate_decky")
|
@_traced("mutator.mutate_decky")
|
||||||
async def mutate_decky(decky_name: str, repo: BaseRepository) -> bool:
|
async def mutate_decky(
|
||||||
|
decky_name: str,
|
||||||
|
repo: BaseRepository,
|
||||||
|
bus: BaseBus | None = None,
|
||||||
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Perform an Intra-Archetype Shuffle for a specific decky.
|
Perform an Intra-Archetype Shuffle for a specific decky.
|
||||||
Returns True if mutation succeeded, False otherwise.
|
Returns True if mutation succeeded, False otherwise.
|
||||||
@@ -96,47 +103,74 @@ async def mutate_decky(decky_name: str, repo: BaseRepository) -> bool:
|
|||||||
console.print(f"[red]Failed to mutate '{decky_name}': {e}[/]")
|
console.print(f"[red]Failed to mutate '{decky_name}': {e}[/]")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
await _publish_safely(
|
||||||
|
bus,
|
||||||
|
_topics.decky(decky_name, _topics.DECKY_STATE),
|
||||||
|
{
|
||||||
|
"name": decky_name,
|
||||||
|
"services": list(decky.services),
|
||||||
|
"last_mutated": decky.last_mutated,
|
||||||
|
},
|
||||||
|
event_type=_topics.DECKY_STATE,
|
||||||
|
)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
@_traced("mutator.mutate_all")
|
@_traced("mutator.mutate_all")
|
||||||
async def mutate_all(repo: BaseRepository, force: bool = False) -> None:
|
async def mutate_all(
|
||||||
|
repo: BaseRepository,
|
||||||
|
force: bool = False,
|
||||||
|
bus: BaseBus | None = None,
|
||||||
|
only: set[str] | None = None,
|
||||||
|
) -> float | None:
|
||||||
|
"""Mutate all deckies that are due (or *only* the named ones).
|
||||||
|
|
||||||
|
Returns the number of seconds until the next scheduled mutation, or
|
||||||
|
``None`` if no deployment exists / no decky has an interval set. The
|
||||||
|
watch loop uses this to adaptively sleep instead of hard-polling at a
|
||||||
|
fixed cadence.
|
||||||
|
|
||||||
|
A missing ``deployment`` state row is *not* an error any more — the
|
||||||
|
host may simply not have run ``decnet deploy`` yet. The watch loop
|
||||||
|
edge-triggers the user-facing log for that state.
|
||||||
"""
|
"""
|
||||||
Check all deckies and mutate those that are due.
|
log.debug("mutate_all: start force=%s only=%s", force, only)
|
||||||
If force=True, mutates all deckies regardless of schedule.
|
|
||||||
"""
|
|
||||||
log.debug("mutate_all: start force=%s", force)
|
|
||||||
state_dict = await repo.get_state("deployment")
|
state_dict = await repo.get_state("deployment")
|
||||||
if state_dict is None:
|
if state_dict is None:
|
||||||
log.error("mutate_all: no active deployment found")
|
log.debug("mutate_all: no active deployment found")
|
||||||
console.print("[red]No active deployment found.[/]")
|
return None
|
||||||
return
|
|
||||||
|
|
||||||
config = DecnetConfig(**state_dict["config"])
|
config = DecnetConfig(**state_dict["config"])
|
||||||
now = time.time()
|
now = time.time()
|
||||||
|
|
||||||
mutated_count = 0
|
mutated_count = 0
|
||||||
|
next_due_in: float | None = None
|
||||||
for decky in config.deckies:
|
for decky in config.deckies:
|
||||||
|
if only is not None and decky.name not in only:
|
||||||
|
continue
|
||||||
interval_mins = decky.mutate_interval or config.mutate_interval
|
interval_mins = decky.mutate_interval or config.mutate_interval
|
||||||
if interval_mins is None and not force:
|
if interval_mins is None and not force:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if force:
|
if force or only is not None:
|
||||||
due = True
|
due = True
|
||||||
else:
|
else:
|
||||||
elapsed_secs = now - decky.last_mutated
|
elapsed_secs = now - decky.last_mutated
|
||||||
due = elapsed_secs >= (interval_mins * 60)
|
due = elapsed_secs >= (interval_mins * 60)
|
||||||
|
remaining = (interval_mins * 60) - elapsed_secs
|
||||||
|
if not due and (next_due_in is None or remaining < next_due_in):
|
||||||
|
next_due_in = remaining
|
||||||
|
|
||||||
if due:
|
if due:
|
||||||
success = await mutate_decky(decky.name, repo=repo)
|
success = await mutate_decky(decky.name, repo=repo, bus=bus)
|
||||||
if success:
|
if success:
|
||||||
mutated_count += 1
|
mutated_count += 1
|
||||||
|
|
||||||
if mutated_count == 0 and not force:
|
if mutated_count:
|
||||||
log.debug("mutate_all: no deckies due for mutation")
|
|
||||||
console.print("[dim]No deckies are due for mutation.[/]")
|
|
||||||
else:
|
|
||||||
log.info("mutate_all: complete mutated_count=%d", mutated_count)
|
log.info("mutate_all: complete mutated_count=%d", mutated_count)
|
||||||
|
else:
|
||||||
|
log.debug("mutate_all: no deckies due for mutation")
|
||||||
|
return next_due_in
|
||||||
|
|
||||||
|
|
||||||
@_traced("mutator.reconcile_topologies")
|
@_traced("mutator.reconcile_topologies")
|
||||||
@@ -277,18 +311,51 @@ async def run_watch_loop(repo: BaseRepository, poll_interval_secs: int = 10) ->
|
|||||||
# poll-interval latency and doesn't push notifications to UI clients.
|
# poll-interval latency and doesn't push notifications to UI clients.
|
||||||
bus: BaseBus | None = None
|
bus: BaseBus | None = None
|
||||||
wake = asyncio.Event()
|
wake = asyncio.Event()
|
||||||
wake_task: asyncio.Task | None = None
|
mutate_requests: set[str] = set()
|
||||||
|
wake_tasks: list[asyncio.Task] = []
|
||||||
|
heartbeat_task: asyncio.Task | None = None
|
||||||
try:
|
try:
|
||||||
candidate = get_bus(client_name="mutator")
|
candidate = get_bus(client_name="mutator")
|
||||||
await candidate.connect()
|
await candidate.connect()
|
||||||
bus = candidate
|
bus = candidate
|
||||||
wake_task = asyncio.create_task(_wake_on_enqueue(bus, wake))
|
wake_tasks.append(asyncio.create_task(_wake_on_enqueue(bus, wake)))
|
||||||
|
wake_tasks.append(asyncio.create_task(
|
||||||
|
_wake_on_mutate_request(bus, wake, mutate_requests),
|
||||||
|
))
|
||||||
|
heartbeat_task = asyncio.create_task(
|
||||||
|
_run_health_heartbeat(bus, "mutator"),
|
||||||
|
)
|
||||||
except Exception as exc: # noqa: BLE001
|
except Exception as exc: # noqa: BLE001
|
||||||
log.warning("mutator: bus unavailable, running in poll-only mode: %s", exc)
|
log.warning("mutator: bus unavailable, running in poll-only mode: %s", exc)
|
||||||
|
|
||||||
|
# Edge-triggered "no deployment" state so we don't spam the console
|
||||||
|
# every 10 seconds on a host that hasn't deployed yet. Start as None
|
||||||
|
# so the first observation fires exactly one line.
|
||||||
|
deployment_present: bool | None = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
while True:
|
while True:
|
||||||
await mutate_all(force=False, repo=repo)
|
requested = mutate_requests.copy()
|
||||||
|
mutate_requests.clear()
|
||||||
|
|
||||||
|
next_due = await mutate_all(
|
||||||
|
repo=repo,
|
||||||
|
force=False,
|
||||||
|
bus=bus,
|
||||||
|
only=requested or None,
|
||||||
|
)
|
||||||
|
has_deployment = (
|
||||||
|
next_due is not None or await repo.get_state("deployment") is not None
|
||||||
|
)
|
||||||
|
if has_deployment and deployment_present is not True:
|
||||||
|
log.info("mutator: active deployment observed — entering normal cadence")
|
||||||
|
console.print("[green]Active deployment observed.[/]")
|
||||||
|
deployment_present = True
|
||||||
|
elif not has_deployment and deployment_present is not False:
|
||||||
|
log.info("mutator: no active deployment — idling until one lands")
|
||||||
|
console.print("[dim]No active deployment; mutator idling.[/]")
|
||||||
|
deployment_present = False
|
||||||
|
|
||||||
# Gate reconciler on the O(log n) guard query — avoids
|
# Gate reconciler on the O(log n) guard query — avoids
|
||||||
# entering the dispatch body when there's nothing to do.
|
# entering the dispatch body when there's nothing to do.
|
||||||
try:
|
try:
|
||||||
@@ -303,11 +370,17 @@ async def run_watch_loop(repo: BaseRepository, poll_interval_secs: int = 10) ->
|
|||||||
pass
|
pass
|
||||||
except Exception:
|
except Exception:
|
||||||
log.exception("reconcile_agent_resyncs tick raised")
|
log.exception("reconcile_agent_resyncs tick raised")
|
||||||
# Wait until either poll_interval_secs elapses OR an enqueued
|
|
||||||
# mutation wakes us early. Clearing before the next tick
|
# Adaptive sleep: wake at the earlier of (next decky due) or
|
||||||
# means a second wake during the tick will re-fire after.
|
# (poll_interval_secs), bounded below by 1s so a thrashing
|
||||||
|
# schedule can't spin the loop. A bus wake (enqueue or
|
||||||
|
# mutate_request) short-circuits the wait.
|
||||||
|
if next_due is None or next_due > poll_interval_secs:
|
||||||
|
timeout = float(poll_interval_secs)
|
||||||
|
else:
|
||||||
|
timeout = max(1.0, next_due)
|
||||||
try:
|
try:
|
||||||
await asyncio.wait_for(wake.wait(), timeout=poll_interval_secs)
|
await asyncio.wait_for(wake.wait(), timeout=timeout)
|
||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
pass
|
pass
|
||||||
wake.clear()
|
wake.clear()
|
||||||
@@ -315,10 +388,15 @@ async def run_watch_loop(repo: BaseRepository, poll_interval_secs: int = 10) ->
|
|||||||
log.info("mutator watch loop stopped")
|
log.info("mutator watch loop stopped")
|
||||||
console.print("\n[dim]Mutator watcher stopped.[/]")
|
console.print("\n[dim]Mutator watcher stopped.[/]")
|
||||||
finally:
|
finally:
|
||||||
if wake_task is not None:
|
for t in wake_tasks:
|
||||||
wake_task.cancel()
|
t.cancel()
|
||||||
|
if heartbeat_task is not None:
|
||||||
|
heartbeat_task.cancel()
|
||||||
|
for t in (*wake_tasks, heartbeat_task):
|
||||||
|
if t is None:
|
||||||
|
continue
|
||||||
with contextlib.suppress(asyncio.CancelledError, Exception):
|
with contextlib.suppress(asyncio.CancelledError, Exception):
|
||||||
await wake_task
|
await t
|
||||||
if bus is not None:
|
if bus is not None:
|
||||||
with contextlib.suppress(Exception):
|
with contextlib.suppress(Exception):
|
||||||
await bus.close()
|
await bus.close()
|
||||||
@@ -342,3 +420,39 @@ async def _wake_on_enqueue(bus: BaseBus, wake: asyncio.Event) -> None:
|
|||||||
raise
|
raise
|
||||||
except Exception as exc: # noqa: BLE001
|
except Exception as exc: # noqa: BLE001
|
||||||
log.warning("mutator: wake subscriber died (%s); falling back to poll", exc)
|
log.warning("mutator: wake subscriber died (%s); falling back to poll", exc)
|
||||||
|
|
||||||
|
|
||||||
|
async def _wake_on_mutate_request(
|
||||||
|
bus: BaseBus,
|
||||||
|
wake: asyncio.Event,
|
||||||
|
pending: set[str],
|
||||||
|
) -> None:
|
||||||
|
"""Collect on-demand ``decky.<name>.mutate_request`` events.
|
||||||
|
|
||||||
|
API/CLI/UI callers publish to ``decky.{name}.mutate_request`` to force
|
||||||
|
an immediate mutation without waiting for the scheduled interval. We
|
||||||
|
stash the target decky name in *pending* so the next tick can feed it
|
||||||
|
to ``mutate_all(only=...)``, then flip *wake* to short-circuit the
|
||||||
|
sleep. Payload is optional — the topic's second token is the name.
|
||||||
|
"""
|
||||||
|
pattern = f"{_topics.DECKY}.*.{_topics.DECKY_MUTATE_REQUEST}"
|
||||||
|
try:
|
||||||
|
sub = bus.subscribe(pattern)
|
||||||
|
async with sub:
|
||||||
|
async for event in sub:
|
||||||
|
topic = getattr(event, "topic", "") or ""
|
||||||
|
parts = topic.split(".")
|
||||||
|
name = parts[1] if len(parts) >= 3 else ""
|
||||||
|
payload = getattr(event, "payload", None) or {}
|
||||||
|
if not name and isinstance(payload, dict):
|
||||||
|
name = payload.get("name", "") or ""
|
||||||
|
if name:
|
||||||
|
pending.add(name)
|
||||||
|
wake.set()
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
log.warning(
|
||||||
|
"mutator: mutate_request subscriber died (%s); falling back to poll",
|
||||||
|
exc,
|
||||||
|
)
|
||||||
|
|||||||
@@ -162,6 +162,67 @@ class TestMutateAll:
|
|||||||
await mutate_all(repo=mock_repo, force=False)
|
await mutate_all(repo=mock_repo, force=False)
|
||||||
mock_mutate.assert_called_once()
|
mock_mutate.assert_called_once()
|
||||||
|
|
||||||
|
async def test_no_state_returns_none_not_error(self, mock_repo):
|
||||||
|
"""Missing deployment is idle, not an error — must return None."""
|
||||||
|
mock_repo.get_state.return_value = None
|
||||||
|
assert await mutate_all(repo=mock_repo) is None
|
||||||
|
|
||||||
|
async def test_returns_seconds_until_next_due(self, mock_repo):
|
||||||
|
# Two deckies: one 10 min to go, one 25 min to go → min is ~600s
|
||||||
|
now = time.time()
|
||||||
|
cfg = _make_config(deckies=[
|
||||||
|
_make_decky("d1", mutate_interval=30, last_mutated=now - 20 * 60),
|
||||||
|
_make_decky("d2", mutate_interval=30, last_mutated=now - 5 * 60),
|
||||||
|
])
|
||||||
|
mock_repo.get_state.return_value = {"config": cfg.model_dump(), "compose_path": "c.yml"}
|
||||||
|
with patch("decnet.mutator.engine.mutate_decky", new_callable=AsyncMock):
|
||||||
|
next_due = await mutate_all(repo=mock_repo, force=False)
|
||||||
|
assert next_due is not None
|
||||||
|
assert 590 < next_due < 610 # ~10 min
|
||||||
|
|
||||||
|
async def test_only_filter_forces_named_decky(self, mock_repo):
|
||||||
|
"""only={'d1'} mutates d1 regardless of schedule, skips others."""
|
||||||
|
now = time.time()
|
||||||
|
cfg = _make_config(deckies=[
|
||||||
|
_make_decky("d1", mutate_interval=30, last_mutated=now), # not due
|
||||||
|
_make_decky("d2", mutate_interval=30, last_mutated=now), # not due
|
||||||
|
])
|
||||||
|
mock_repo.get_state.return_value = {"config": cfg.model_dump(), "compose_path": "c.yml"}
|
||||||
|
with patch("decnet.mutator.engine.mutate_decky", new_callable=AsyncMock, return_value=True) as mock_mutate:
|
||||||
|
await mutate_all(repo=mock_repo, force=False, only={"d1"})
|
||||||
|
assert mock_mutate.call_count == 1
|
||||||
|
assert mock_mutate.call_args.args[0] == "d1"
|
||||||
|
|
||||||
|
|
||||||
|
class TestMutateDeckyBusPublish:
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_publishes_decky_state_on_success(self, mock_repo):
|
||||||
|
cfg = _make_config()
|
||||||
|
mock_repo.get_state.return_value = {"config": cfg.model_dump(), "compose_path": "c.yml"}
|
||||||
|
bus = AsyncMock()
|
||||||
|
with patch("decnet.mutator.engine.write_compose"), \
|
||||||
|
patch("anyio.to_thread.run_sync", new_callable=AsyncMock):
|
||||||
|
ok = await mutate_decky("decky-01", repo=mock_repo, bus=bus)
|
||||||
|
assert ok is True
|
||||||
|
bus.publish.assert_awaited_once()
|
||||||
|
topic = bus.publish.await_args.args[0]
|
||||||
|
payload = bus.publish.await_args.args[1]
|
||||||
|
assert topic == "decky.decky-01.state"
|
||||||
|
assert payload["name"] == "decky-01"
|
||||||
|
assert isinstance(payload["services"], list)
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_no_publish_on_compose_failure(self, mock_repo):
|
||||||
|
cfg = _make_config()
|
||||||
|
mock_repo.get_state.return_value = {"config": cfg.model_dump(), "compose_path": "c.yml"}
|
||||||
|
bus = AsyncMock()
|
||||||
|
with patch("decnet.mutator.engine.write_compose"), \
|
||||||
|
patch("anyio.to_thread.run_sync",
|
||||||
|
new_callable=AsyncMock, side_effect=RuntimeError("boom")):
|
||||||
|
ok = await mutate_decky("decky-01", repo=mock_repo, bus=bus)
|
||||||
|
assert ok is False
|
||||||
|
bus.publish.assert_not_awaited()
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# _compose_with_retry (Sync tests, keep as is or minimal update)
|
# _compose_with_retry (Sync tests, keep as is or minimal update)
|
||||||
|
|||||||
Reference in New Issue
Block a user