fix(stats): keep TopologyDecky.state in sync with docker so ACTIVE DECKIES counts right
The dashboard's ACTIVE DECKIES figure (active_deckies in get_stats_summary) counts TopologyDecky rows where state='running'. No code path was ever moving that state away from the default 'pending', so the count read 0/N even when every container was running fine — the dashboard was lying.

Two complementary fixes:

1. deploy_topology — after the post-deploy compose ps verification, reconcile each TopologyDecky.state from the corresponding base container's docker state: running → 'running'; anything else → 'failed'. This reuses the ps_rows already gathered for the ACTIVE-vs-DEGRADED status decision, so there is no extra docker hit.

2. apply_add_decky — _materialise_decky_spawn now returns True/False; on True the row is updated to state='running' before _assert_valid_after. This catches the case where a decky added via the live mutator queue stays at 'pending' indefinitely (the deployer's reconcile only runs on a fresh deploy_topology pass).

Deckies in already-active topologies will still read as 'pending' until the next deploy_topology runs, since this fix is forward-only. The operator-side workaround is to tear down and redeploy, or to run the (forthcoming) reconcile-on-startup pass.
This commit is contained in:
@@ -1005,8 +1005,18 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
|
|||||||
lambda: _compose_ps(compose_path),
|
lambda: _compose_ps(compose_path),
|
||||||
)
|
)
|
||||||
bad: list[str] = []
|
bad: list[str] = []
|
||||||
|
# Build the per-decky state map. The base container's compose
|
||||||
|
# service name == decky name, which is what we cache on the
|
||||||
|
# TopologyDecky row. Service containers (named ``<decky>-<svc>``)
|
||||||
|
# don't gate the decky's state — service-level failures are visible
|
||||||
|
# in compose ps separately and don't downgrade the decky as a whole.
|
||||||
|
decky_state_by_name: dict[str, str] = {}
|
||||||
for row in ps_rows:
|
for row in ps_rows:
|
||||||
state = str(row.get("State", "")).lower()
|
state = str(row.get("State", "")).lower()
|
||||||
|
service_name = str(row.get("Service") or "")
|
||||||
|
if service_name and "-" not in service_name:
|
||||||
|
# Plain decky base; cache its docker state.
|
||||||
|
decky_state_by_name[service_name] = state or "unknown"
|
||||||
if state and state != "running":
|
if state and state != "running":
|
||||||
name = str(row.get("Name") or row.get("Service") or "?")
|
name = str(row.get("Name") or row.get("Service") or "?")
|
||||||
exit_code = row.get("ExitCode")
|
exit_code = row.get("ExitCode")
|
||||||
@@ -1015,6 +1025,27 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
|
|||||||
+ (f" (exit={exit_code})" if exit_code not in (None, 0, "") else "")
|
+ (f" (exit={exit_code})" if exit_code not in (None, 0, "") else "")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Reconcile each TopologyDecky.state from compose's view. Without
|
||||||
|
# this, the row stays at the default 'pending' forever and the
|
||||||
|
# dashboard's ACTIVE DECKIES count reads 0/N even when everything's
|
||||||
|
# actually up.
|
||||||
|
for decky in hydrated["deckies"]:
|
||||||
|
cfg = decky.get("decky_config") or {}
|
||||||
|
decky_name = cfg.get("name") or decky.get("name")
|
||||||
|
if not decky_name:
|
||||||
|
continue
|
||||||
|
ds = decky_state_by_name.get(decky_name, "unknown")
|
||||||
|
new_state = "running" if ds == "running" else "failed"
|
||||||
|
try:
|
||||||
|
await repo.update_topology_decky(
|
||||||
|
decky["uuid"], {"state": new_state},
|
||||||
|
)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
log.warning(
|
||||||
|
"post-deploy state reconcile failed topology=%s decky=%s: %s",
|
||||||
|
topology_id, decky_name, exc,
|
||||||
|
)
|
||||||
|
|
||||||
if bad:
|
if bad:
|
||||||
reason = "post-deploy check: " + ", ".join(bad[:8]) + (
|
reason = "post-deploy check: " + ", ".join(bad[:8]) + (
|
||||||
f" and {len(bad) - 8} more" if len(bad) > 8 else ""
|
f" and {len(bad) - 8} more" if len(bad) > 8 else ""
|
||||||
|
|||||||
@@ -326,16 +326,18 @@ async def _rerender_compose(repo: Any, topology_id: str) -> None:
|
|||||||
|
|
||||||
async def _materialise_decky_spawn(
|
async def _materialise_decky_spawn(
|
||||||
repo: Any, topology_id: str, decky_name: str, services: list[str],
|
repo: Any, topology_id: str, decky_name: str, services: list[str],
|
||||||
) -> None:
|
) -> bool:
|
||||||
"""compose up -d --no-deps --build for one decky (base + services).
|
"""compose up -d --no-deps --build for one decky (base + services).
|
||||||
|
|
||||||
Re-renders compose first so the file lists the new decky. No-op
|
Re-renders compose first so the file lists the new decky. Returns
|
||||||
when the topology isn't eligible for live materialisation (see
|
True when compose-up reported success, False otherwise (or when
|
||||||
:func:`_live_topology_or_none`). Best-effort: docker failure is
|
the topology isn't eligible for live materialisation — pending
|
||||||
logged, not re-raised — DB row is the source of truth.
|
topologies skip and return False so the caller doesn't flip the
|
||||||
|
state to ``running`` based on a no-op). Best-effort: docker
|
||||||
|
failure is logged, not re-raised — DB row is the source of truth.
|
||||||
"""
|
"""
|
||||||
if await _live_topology_or_none(repo, topology_id) is None:
|
if await _live_topology_or_none(repo, topology_id) is None:
|
||||||
return
|
return False
|
||||||
from decnet.engine.deployer import _topology_compose_path
|
from decnet.engine.deployer import _topology_compose_path
|
||||||
await _rerender_compose(repo, topology_id)
|
await _rerender_compose(repo, topology_id)
|
||||||
targets = _decky_targets(decky_name, services)
|
targets = _decky_targets(decky_name, services)
|
||||||
@@ -346,11 +348,13 @@ async def _materialise_decky_spawn(
|
|||||||
compose_file=compose_path,
|
compose_file=compose_path,
|
||||||
label=f"live add_decky topology={topology_id} decky={decky_name}",
|
label=f"live add_decky topology={topology_id} decky={decky_name}",
|
||||||
)
|
)
|
||||||
|
return True
|
||||||
except Exception as exc: # noqa: BLE001
|
except Exception as exc: # noqa: BLE001
|
||||||
_log.error(
|
_log.error(
|
||||||
"live add_decky: compose up failed topology=%s decky=%s: %s",
|
"live add_decky: compose up failed topology=%s decky=%s: %s",
|
||||||
topology_id, decky_name, exc,
|
topology_id, decky_name, exc,
|
||||||
)
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
async def _materialise_decky_remove(
|
async def _materialise_decky_remove(
|
||||||
@@ -700,7 +704,22 @@ async def apply_add_decky(
|
|||||||
# Live materialisation: spawn the new decky's containers without
|
# Live materialisation: spawn the new decky's containers without
|
||||||
# touching siblings. Skips on pending / agent-pinned topologies —
|
# touching siblings. Skips on pending / agent-pinned topologies —
|
||||||
# see _live_topology_or_none.
|
# see _live_topology_or_none.
|
||||||
await _materialise_decky_spawn(repo, topology_id, name, services_list)
|
spawned = await _materialise_decky_spawn(
|
||||||
|
repo, topology_id, name, services_list,
|
||||||
|
)
|
||||||
|
# Flip the row's state to 'running' on success so the dashboard's
|
||||||
|
# ACTIVE DECKIES count reflects reality. Without this the row
|
||||||
|
# stays at the default 'pending' forever; the deployer's full
|
||||||
|
# post-deploy reconcile only runs on a fresh deploy_topology.
|
||||||
|
if spawned:
|
||||||
|
try:
|
||||||
|
await repo.update_topology_decky(decky_uuid, {"state": "running"})
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
_log.warning(
|
||||||
|
"live add_decky: state flip to running failed "
|
||||||
|
"topology=%s decky=%s: %s",
|
||||||
|
topology_id, name, exc,
|
||||||
|
)
|
||||||
await _assert_valid_after(repo, topology_id)
|
await _assert_valid_after(repo, topology_id)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -130,6 +130,23 @@ async def test_add_decky_spawns_base_and_service_containers(repo, stubs):
|
|||||||
assert "newbox-ssh" in args
|
assert "newbox-ssh" in args
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.anyio
|
||||||
|
async def test_add_decky_flips_state_to_running_after_spawn(repo, stubs):
|
||||||
|
"""Without this the dashboard's ACTIVE DECKIES count reads 0/N."""
|
||||||
|
tid = await _make_active(repo)
|
||||||
|
lans = await repo.list_lans_for_topology(tid)
|
||||||
|
home_lan = lans[0]["name"]
|
||||||
|
|
||||||
|
await apply_add_decky(repo, tid, {
|
||||||
|
"name": "newrunner",
|
||||||
|
"lan": home_lan,
|
||||||
|
"services": [],
|
||||||
|
})
|
||||||
|
rows = await repo.list_topology_deckies(tid)
|
||||||
|
new = next(r for r in rows if r["name"] == "newrunner")
|
||||||
|
assert new["state"] == "running"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.anyio
|
@pytest.mark.anyio
|
||||||
async def test_add_decky_skips_materialisation_when_pending(repo, stubs):
|
async def test_add_decky_skips_materialisation_when_pending(repo, stubs):
|
||||||
"""Pending topology gets DB write only — deploy_topology will spawn."""
|
"""Pending topology gets DB write only — deploy_topology will spawn."""
|
||||||
|
|||||||
Reference in New Issue
Block a user