merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
177
decnet/fleet/__init__.py
Normal file
177
decnet/fleet/__init__.py
Normal file
@@ -0,0 +1,177 @@
|
||||
"""
|
||||
Fleet builder — shared logic for constructing DeckyConfig lists.
|
||||
|
||||
Used by both the CLI and the web API router to build deckies from
|
||||
flags or INI config. Lives here (not in cli.py) so that the web layer
|
||||
and the mutation engine can import it without depending on the CLI.
|
||||
"""
|
||||
|
||||
import random
|
||||
from typing import Optional
|
||||
|
||||
from decnet.archetypes import Archetype, get_archetype
|
||||
from decnet.config import DeckyConfig, random_hostname
|
||||
from decnet.distros import all_distros, get_distro, random_distro
|
||||
from decnet.models import IniConfig
|
||||
from decnet.services.registry import all_services
|
||||
|
||||
|
||||
def all_service_names() -> list[str]:
|
||||
"""Return all registered per-decky service names (excludes fleet singletons)."""
|
||||
return sorted(
|
||||
name for name, svc in all_services().items()
|
||||
if not svc.fleet_singleton
|
||||
)
|
||||
|
||||
|
||||
def resolve_distros(
|
||||
distros_explicit: list[str] | None,
|
||||
randomize_distros: bool,
|
||||
n: int,
|
||||
archetype: Archetype | None = None,
|
||||
) -> list[str]:
|
||||
"""Return a list of n distro slugs based on flags or archetype preference."""
|
||||
if distros_explicit:
|
||||
return [distros_explicit[i % len(distros_explicit)] for i in range(n)]
|
||||
if randomize_distros:
|
||||
return [random_distro().slug for _ in range(n)]
|
||||
if archetype:
|
||||
pool = archetype.preferred_distros
|
||||
return [pool[i % len(pool)] for i in range(n)]
|
||||
slugs = list(all_distros().keys())
|
||||
return [slugs[i % len(slugs)] for i in range(n)]
|
||||
|
||||
|
||||
def build_deckies(
|
||||
n: int,
|
||||
ips: list[str],
|
||||
services_explicit: list[str] | None,
|
||||
randomize_services: bool,
|
||||
distros_explicit: list[str] | None = None,
|
||||
randomize_distros: bool = False,
|
||||
archetype: Archetype | None = None,
|
||||
mutate_interval: Optional[int] = None,
|
||||
) -> list[DeckyConfig]:
|
||||
"""Build a list of DeckyConfigs from CLI-style flags."""
|
||||
deckies = []
|
||||
used_combos: set[frozenset] = set()
|
||||
distro_slugs = resolve_distros(distros_explicit, randomize_distros, n, archetype)
|
||||
|
||||
for i, ip in enumerate(ips):
|
||||
name = f"decky-{i + 1:02d}"
|
||||
distro = get_distro(distro_slugs[i])
|
||||
hostname = random_hostname(distro.slug)
|
||||
|
||||
if services_explicit:
|
||||
svc_list = services_explicit
|
||||
elif archetype:
|
||||
svc_list = list(archetype.services)
|
||||
elif randomize_services:
|
||||
svc_pool = all_service_names()
|
||||
attempts = 0
|
||||
while True:
|
||||
count = random.randint(1, min(3, len(svc_pool))) # nosec B311
|
||||
chosen = frozenset(random.sample(svc_pool, count)) # nosec B311
|
||||
attempts += 1
|
||||
if chosen not in used_combos or attempts > 20:
|
||||
break
|
||||
svc_list = list(chosen)
|
||||
used_combos.add(chosen)
|
||||
else:
|
||||
raise ValueError("Provide services_explicit, archetype, or randomize_services=True.")
|
||||
|
||||
deckies.append(
|
||||
DeckyConfig(
|
||||
name=name,
|
||||
ip=ip,
|
||||
services=svc_list,
|
||||
distro=distro.slug,
|
||||
base_image=distro.image,
|
||||
build_base=distro.build_base,
|
||||
hostname=hostname,
|
||||
archetype=archetype.slug if archetype else None,
|
||||
nmap_os=archetype.nmap_os if archetype else "linux",
|
||||
mutate_interval=mutate_interval,
|
||||
)
|
||||
)
|
||||
return deckies
|
||||
|
||||
|
||||
def build_deckies_from_ini(
|
||||
ini: IniConfig,
|
||||
subnet_cidr: str,
|
||||
gateway: str,
|
||||
host_ip: str,
|
||||
randomize: bool,
|
||||
cli_mutate_interval: int | None = None,
|
||||
) -> list[DeckyConfig]:
|
||||
"""Build DeckyConfig list from an IniConfig, auto-allocating missing IPs."""
|
||||
from ipaddress import IPv4Address, IPv4Network
|
||||
import time
|
||||
now = time.time()
|
||||
|
||||
explicit_ips: set[IPv4Address] = {
|
||||
IPv4Address(s.ip) for s in ini.deckies if s.ip
|
||||
}
|
||||
|
||||
net = IPv4Network(subnet_cidr, strict=False)
|
||||
reserved = {
|
||||
net.network_address,
|
||||
net.broadcast_address,
|
||||
IPv4Address(gateway),
|
||||
IPv4Address(host_ip),
|
||||
} | explicit_ips
|
||||
|
||||
auto_pool = (str(addr) for addr in net.hosts() if addr not in reserved)
|
||||
|
||||
deckies: list[DeckyConfig] = []
|
||||
for spec in ini.deckies:
|
||||
arch: Archetype | None = None
|
||||
if spec.archetype:
|
||||
arch = get_archetype(spec.archetype)
|
||||
|
||||
distro_pool = arch.preferred_distros if arch else list(all_distros().keys())
|
||||
distro = get_distro(distro_pool[len(deckies) % len(distro_pool)])
|
||||
hostname = random_hostname(distro.slug)
|
||||
|
||||
ip = spec.ip or next(auto_pool, None)
|
||||
if ip is None:
|
||||
raise ValueError(f"Not enough free IPs in {subnet_cidr} while assigning IP for '{spec.name}'.")
|
||||
|
||||
if spec.services:
|
||||
known = set(all_service_names())
|
||||
unknown = [s for s in spec.services if s not in known]
|
||||
if unknown:
|
||||
raise ValueError(
|
||||
f"Unknown service(s) in [{spec.name}]: {unknown}. "
|
||||
f"Available: {all_service_names()}"
|
||||
)
|
||||
svc_list = spec.services
|
||||
elif arch:
|
||||
svc_list = list(arch.services)
|
||||
elif randomize or (not spec.services and not arch):
|
||||
svc_pool = all_service_names()
|
||||
count = random.randint(1, min(3, len(svc_pool))) # nosec B311
|
||||
svc_list = random.sample(svc_pool, count) # nosec B311
|
||||
|
||||
resolved_nmap_os = spec.nmap_os or (arch.nmap_os if arch else "linux")
|
||||
|
||||
decky_mutate_interval = cli_mutate_interval
|
||||
if decky_mutate_interval is None:
|
||||
decky_mutate_interval = spec.mutate_interval if spec.mutate_interval is not None else ini.mutate_interval
|
||||
|
||||
deckies.append(DeckyConfig(
|
||||
name=spec.name,
|
||||
ip=ip,
|
||||
services=svc_list,
|
||||
distro=distro.slug,
|
||||
base_image=distro.image,
|
||||
build_base=distro.build_base,
|
||||
hostname=hostname,
|
||||
archetype=arch.slug if arch else None,
|
||||
service_config=spec.service_config,
|
||||
nmap_os=resolved_nmap_os,
|
||||
mutate_interval=decky_mutate_interval,
|
||||
last_mutated=now,
|
||||
))
|
||||
return deckies
|
||||
231
decnet/fleet/reconciler.py
Normal file
231
decnet/fleet/reconciler.py
Normal file
@@ -0,0 +1,231 @@
|
||||
"""Fleet reconciler — converges JSON ↔ DB ↔ docker.
|
||||
|
||||
Three sources of truth on a DECNET host can disagree:
|
||||
|
||||
1. ``decnet-state.json`` — written by ``engine.deployer.deploy/teardown``;
|
||||
the canonical record for offline / no-API consumers (``decnet status``,
|
||||
``decnet teardown``, sniffer, collector).
|
||||
2. ``fleet_deckies`` table — DB mirror written by the same deployer; what
|
||||
the orchestrator, web dashboard, and REST API see.
|
||||
3. ``docker inspect`` — actual per-container runtime state.
|
||||
|
||||
Drift sources we accept and correct:
|
||||
|
||||
* CLI deploy on a host whose DB was unreachable → JSON ahead of DB.
|
||||
* CLI teardown on a host whose DB was unreachable → DB ahead of JSON.
|
||||
* Operator hand-edited ``decnet-state.json`` → JSON ahead of DB.
|
||||
* Container crashed / was killed externally → DB state stale until docker
|
||||
is observed.
|
||||
|
||||
Resolution:
|
||||
|
||||
* JSON has X, DB doesn't → INSERT.
|
||||
* DB has X (this host), JSON doesn't → DELETE.
|
||||
* Both have X → state := docker-aggregated state.
|
||||
|
||||
Cross-host safety: deletions are scoped to ``host_uuid == this host``.
|
||||
A multi-host master that runs swarm workers (each with their own
|
||||
reconciler) must never delete a peer's rows.
|
||||
|
||||
The reconciler intentionally does NOT publish bus events for state
|
||||
changes today — the dashboard reads the DB on every render. A
|
||||
``fleet.{name}.state`` topic is a natural follow-up if SSE consumers
|
||||
appear, but is out of scope for this PR.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.base import BaseBus
|
||||
from decnet.bus.publish import publish_safely
|
||||
from decnet.config import DecnetConfig, load_state as _real_load_state
|
||||
from decnet.logging import get_logger
|
||||
from decnet.web.db.models import LOCAL_HOST_SENTINEL
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
|
||||
logger = get_logger("fleet.reconciler")
|
||||
|
||||
|
||||
# ── docker observation ────────────────────────────────────────────────────────
|
||||
|
||||
def _collect_container_states(
|
||||
docker_client_factory: Optional[Callable[[], Any]] = None,
|
||||
) -> Optional[dict[str, str]]:
|
||||
"""Return ``{container_name: status}`` or ``None`` if docker is unreachable.
|
||||
|
||||
``None`` is the explicit "unknown" signal — callers must NOT treat
|
||||
docker failure as "every container is gone" (that would torch every
|
||||
fleet row to ``torn_down`` whenever the docker socket is busy).
|
||||
"""
|
||||
if docker_client_factory is None:
|
||||
try:
|
||||
import docker # local import — keeps tests import-clean
|
||||
docker_client_factory = docker.from_env
|
||||
except ImportError:
|
||||
return None
|
||||
try:
|
||||
client = docker_client_factory()
|
||||
return {
|
||||
c.name: c.status
|
||||
for c in client.containers.list(all=True, ignore_removed=True)
|
||||
}
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.debug("reconciler: docker query failed: %s", exc)
|
||||
return None
|
||||
|
||||
|
||||
def _aggregate_decky_state(
|
||||
decky_name: str,
|
||||
services: list[str],
|
||||
container_states: dict[str, str],
|
||||
) -> str:
|
||||
"""Aggregate per-decky state from per-service container statuses.
|
||||
|
||||
``running`` — every expected service container is ``running``.
|
||||
``failed`` — every observed container is non-running.
|
||||
``degraded`` — partial: some running, some not (or some missing).
|
||||
``torn_down`` — no expected container observed at all.
|
||||
"""
|
||||
expected = {f"{decky_name}-{svc.replace('_', '-')}" for svc in services}
|
||||
seen = {n: s for n, s in container_states.items() if n in expected}
|
||||
if not seen:
|
||||
return "torn_down"
|
||||
statuses = set(seen.values())
|
||||
if statuses == {"running"} and len(seen) == len(expected):
|
||||
return "running"
|
||||
if "running" not in statuses:
|
||||
return "failed"
|
||||
return "degraded"
|
||||
|
||||
|
||||
# ── reconcile pass ────────────────────────────────────────────────────────────
|
||||
|
||||
async def reconcile_once(
|
||||
repo: BaseRepository,
|
||||
*,
|
||||
host_uuid: str = LOCAL_HOST_SENTINEL,
|
||||
load_state_fn: Callable[[], Optional[tuple[DecnetConfig, Any]]] = _real_load_state,
|
||||
docker_client_factory: Optional[Callable[[], Any]] = None,
|
||||
bus: Optional[BaseBus] = None,
|
||||
) -> dict[str, int]:
|
||||
"""Single reconciliation pass. Returns counts of work done.
|
||||
|
||||
When *bus* is provided, fires ``decky.<host_uuid:name>.state`` on every
|
||||
insert / delete / state transition. The DB write is the source of
|
||||
truth — bus publish is best-effort notification; a dropped event is at
|
||||
most one tick of UI latency.
|
||||
"""
|
||||
counts = {"inserted": 0, "deleted": 0, "state_updated": 0}
|
||||
|
||||
state = await asyncio.to_thread(load_state_fn)
|
||||
json_deckies: list[Any] = list(state[0].deckies) if state else []
|
||||
|
||||
db_rows = await repo.list_fleet_deckies(host_uuid=host_uuid)
|
||||
db_by_name = {r["name"]: r for r in db_rows}
|
||||
|
||||
container_states = await asyncio.to_thread(
|
||||
_collect_container_states, docker_client_factory,
|
||||
)
|
||||
docker_known = container_states is not None
|
||||
|
||||
json_names = {d.name for d in json_deckies}
|
||||
|
||||
# 1. INSERT: present in JSON, absent from DB.
|
||||
for d in json_deckies:
|
||||
if d.name in db_by_name:
|
||||
continue
|
||||
new_state = (
|
||||
_aggregate_decky_state(d.name, list(d.services), container_states)
|
||||
if docker_known else "running"
|
||||
)
|
||||
row_host = d.host_uuid or host_uuid
|
||||
await repo.upsert_fleet_decky({
|
||||
"host_uuid": row_host,
|
||||
"name": d.name,
|
||||
"services": list(d.services),
|
||||
"decky_config": d.model_dump(mode="json"),
|
||||
"decky_ip": d.ip,
|
||||
"state": new_state,
|
||||
})
|
||||
counts["inserted"] += 1
|
||||
await _emit_state(bus, row_host, d.name, new_state, transition="inserted")
|
||||
|
||||
# 2. DELETE: present in DB (this host), absent from JSON.
|
||||
# Scoped to host_uuid by list_fleet_deckies(host_uuid=...) call above —
|
||||
# peer-host rows are never visible here, so we can't accidentally
|
||||
# clobber another worker's slice.
|
||||
for r in db_rows:
|
||||
if r["name"] not in json_names:
|
||||
await repo.delete_fleet_decky(
|
||||
host_uuid=r["host_uuid"], name=r["name"],
|
||||
)
|
||||
counts["deleted"] += 1
|
||||
await _emit_state(
|
||||
bus, r["host_uuid"], r["name"], "torn_down",
|
||||
transition="deleted",
|
||||
)
|
||||
|
||||
# 3. STATE: present in both, docker says something fresh.
|
||||
if docker_known:
|
||||
for d in json_deckies:
|
||||
existing = db_by_name.get(d.name)
|
||||
if existing is None:
|
||||
continue # already handled in step 1
|
||||
new_state = _aggregate_decky_state(
|
||||
d.name, list(d.services), container_states,
|
||||
)
|
||||
previous_state = existing.get("state")
|
||||
if previous_state != new_state:
|
||||
await repo.update_fleet_decky_state(
|
||||
host_uuid=existing["host_uuid"],
|
||||
name=d.name,
|
||||
state=new_state,
|
||||
)
|
||||
counts["state_updated"] += 1
|
||||
await _emit_state(
|
||||
bus, existing["host_uuid"], d.name, new_state,
|
||||
transition="state_changed",
|
||||
previous=previous_state,
|
||||
)
|
||||
|
||||
return counts
|
||||
|
||||
|
||||
async def _emit_state(
|
||||
bus: Optional[BaseBus],
|
||||
host_uuid: str,
|
||||
name: str,
|
||||
state: str,
|
||||
*,
|
||||
transition: str,
|
||||
previous: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Publish ``decky.<host_uuid:name>.state`` on a fleet row transition.
|
||||
|
||||
Topic uses an existing topic family (``DECKY_STATE``) — no
|
||||
bus/topics.py change required. The composite ``host_uuid:name`` keeps
|
||||
fleet rows distinguishable from MazeNET TopologyDecky rows (whose ids
|
||||
are bare UUIDs). A ``:`` is a legal token character; ``.``, ``*``,
|
||||
``>``, and whitespace are the only banned ones (see
|
||||
``bus.topics._reject_tokens``).
|
||||
"""
|
||||
if bus is None:
|
||||
return
|
||||
decky_id = f"{host_uuid}:{name}"
|
||||
payload: dict[str, Any] = {
|
||||
"host_uuid": host_uuid,
|
||||
"name": name,
|
||||
"state": state,
|
||||
"transition": transition,
|
||||
"source": "fleet",
|
||||
}
|
||||
if previous is not None:
|
||||
payload["previous_state"] = previous
|
||||
await publish_safely(
|
||||
bus,
|
||||
_topics.decky(decky_id, _topics.DECKY_STATE),
|
||||
payload,
|
||||
event_type=_topics.DECKY_STATE,
|
||||
)
|
||||
86
decnet/fleet/reconciler_worker.py
Normal file
86
decnet/fleet/reconciler_worker.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""Long-lived periodic reconciler worker.
|
||||
|
||||
Modeled on :mod:`decnet.orchestrator.worker`: same control listener, same
|
||||
heartbeat helper, same shutdown semantics. One tick = one
|
||||
:func:`reconcile_once` pass.
|
||||
|
||||
Default interval is short (30s) because reconciliation is cheap when
|
||||
nothing has drifted (three reads, no writes), and a short cadence keeps
|
||||
the dashboard's view of crashed containers fresh.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
|
||||
from decnet.bus.factory import get_bus
|
||||
from decnet.bus.publish import (
|
||||
run_control_listener,
|
||||
run_health_heartbeat,
|
||||
)
|
||||
from decnet.fleet.reconciler import reconcile_once
|
||||
from decnet.logging import get_logger
|
||||
from decnet.web.db.models import LOCAL_HOST_SENTINEL
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
|
||||
logger = get_logger("fleet.reconciler")
|
||||
|
||||
|
||||
async def fleet_reconciler_worker(
|
||||
repo: BaseRepository,
|
||||
*,
|
||||
interval: int = 30,
|
||||
host_uuid: str = LOCAL_HOST_SENTINEL,
|
||||
) -> None:
|
||||
"""Periodically converge JSON ↔ DB ↔ docker for the local host.
|
||||
|
||||
Honours the bus control topic (``system.reconciler.control``) for
|
||||
graceful shutdown — same lifecycle contract as every other DECNET
|
||||
worker.
|
||||
"""
|
||||
logger.info("fleet reconciler started interval=%ds host=%s", interval, host_uuid)
|
||||
|
||||
bus = None
|
||||
try:
|
||||
bus = get_bus(client_name="reconciler")
|
||||
await bus.connect()
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.warning(
|
||||
"reconciler: bus unavailable, continuing without publish: %s", exc,
|
||||
)
|
||||
bus = None
|
||||
|
||||
shutdown = asyncio.Event()
|
||||
heartbeat_task = asyncio.create_task(run_health_heartbeat(bus, "reconciler"))
|
||||
control_task = asyncio.create_task(
|
||||
run_control_listener(bus, "reconciler", shutdown),
|
||||
)
|
||||
|
||||
try:
|
||||
while not shutdown.is_set():
|
||||
try:
|
||||
await asyncio.wait_for(shutdown.wait(), timeout=interval)
|
||||
except asyncio.TimeoutError:
|
||||
pass # normal tick
|
||||
if shutdown.is_set():
|
||||
break
|
||||
try:
|
||||
counts = await reconcile_once(
|
||||
repo, host_uuid=host_uuid, bus=bus,
|
||||
)
|
||||
if any(counts.values()):
|
||||
logger.info(
|
||||
"reconcile inserted=%d deleted=%d state_updated=%d",
|
||||
counts["inserted"], counts["deleted"],
|
||||
counts["state_updated"],
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.error("reconcile tick failed: %s", exc)
|
||||
finally:
|
||||
for t in (heartbeat_task, control_task):
|
||||
t.cancel()
|
||||
with contextlib.suppress(Exception, asyncio.CancelledError):
|
||||
await t
|
||||
if bus is not None:
|
||||
with contextlib.suppress(Exception):
|
||||
await bus.close()
|
||||
Reference in New Issue
Block a user