diff --git a/decnet/engine/deployer.py b/decnet/engine/deployer.py index fa942937..46424e4d 100644 --- a/decnet/engine/deployer.py +++ b/decnet/engine/deployer.py @@ -1005,8 +1005,18 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N lambda: _compose_ps(compose_path), ) bad: list[str] = [] + # Build the per-decky state map. The base container's compose + # service name == decky name, which is what we cache on the + # TopologyDecky row. Service containers (named ``{decky}-{service}``) + # don't gate the decky's state — service-level failures are visible + # in compose ps separately and don't downgrade the decky as a whole. + decky_state_by_name: dict[str, str] = {} for row in ps_rows: state = str(row.get("State", "")).lower() + service_name = str(row.get("Service") or "") + if service_name and "-" not in service_name: + # Plain decky base; cache its docker state. + decky_state_by_name[service_name] = state or "unknown" if state and state != "running": name = str(row.get("Name") or row.get("Service") or "?") exit_code = row.get("ExitCode") @@ -1015,6 +1025,27 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N + (f" (exit={exit_code})" if exit_code not in (None, 0, "") else "") ) + # Reconcile each TopologyDecky.state from compose's view. Without + # this, the row stays at the default 'pending' forever and the + # dashboard's ACTIVE DECKIES count reads 0/N even when everything's + # actually up. 
+ for decky in hydrated["deckies"]: + cfg = decky.get("decky_config") or {} + decky_name = cfg.get("name") or decky.get("name") + if not decky_name: + continue + ds = decky_state_by_name.get(decky_name, "unknown") + new_state = "running" if ds == "running" else "failed" + try: + await repo.update_topology_decky( + decky["uuid"], {"state": new_state}, + ) + except Exception as exc: # noqa: BLE001 + log.warning( + "post-deploy state reconcile failed topology=%s decky=%s: %s", + topology_id, decky_name, exc, + ) + if bad: reason = "post-deploy check: " + ", ".join(bad[:8]) + ( f" and {len(bad) - 8} more" if len(bad) > 8 else "" diff --git a/decnet/mutator/ops.py b/decnet/mutator/ops.py index ab127849..ea183f36 100644 --- a/decnet/mutator/ops.py +++ b/decnet/mutator/ops.py @@ -326,16 +326,18 @@ async def _rerender_compose(repo: Any, topology_id: str) -> None: async def _materialise_decky_spawn( repo: Any, topology_id: str, decky_name: str, services: list[str], -) -> None: +) -> bool: """compose up -d --no-deps --build for one decky (base + services). - Re-renders compose first so the file lists the new decky. No-op - when the topology isn't eligible for live materialisation (see - :func:`_live_topology_or_none`). Best-effort: docker failure is - logged, not re-raised — DB row is the source of truth. + Re-renders compose first so the file lists the new decky. Returns + True when compose-up reported success, False otherwise (or when + the topology isn't eligible for live materialisation — pending + topologies skip and return False so the caller doesn't flip the + state to ``running`` based on a no-op). Best-effort: docker + failure is logged, not re-raised — DB row is the source of truth. 
""" if await _live_topology_or_none(repo, topology_id) is None: - return + return False from decnet.engine.deployer import _topology_compose_path await _rerender_compose(repo, topology_id) targets = _decky_targets(decky_name, services) @@ -346,11 +348,13 @@ async def _materialise_decky_spawn( compose_file=compose_path, label=f"live add_decky topology={topology_id} decky={decky_name}", ) + return True except Exception as exc: # noqa: BLE001 _log.error( "live add_decky: compose up failed topology=%s decky=%s: %s", topology_id, decky_name, exc, ) + return False async def _materialise_decky_remove( @@ -700,7 +704,22 @@ async def apply_add_decky( # Live materialisation: spawn the new decky's containers without # touching siblings. Skips on pending / agent-pinned topologies — # see _live_topology_or_none. - await _materialise_decky_spawn(repo, topology_id, name, services_list) + spawned = await _materialise_decky_spawn( + repo, topology_id, name, services_list, + ) + # Flip the row's state to 'running' on success so the dashboard's + # ACTIVE DECKIES count reflects reality. Without this the row + # stays at the default 'pending' forever; the deployer's full + # post-deploy reconcile only runs on a fresh deploy_topology. 
+ if spawned: + try: + await repo.update_topology_decky(decky_uuid, {"state": "running"}) + except Exception as exc: # noqa: BLE001 + _log.warning( + "live add_decky: state flip to running failed " + "topology=%s decky=%s: %s", + topology_id, name, exc, + ) await _assert_valid_after(repo, topology_id) diff --git a/tests/mutator/test_ops_materialisation.py b/tests/mutator/test_ops_materialisation.py index 948b5354..b1d16b3f 100644 --- a/tests/mutator/test_ops_materialisation.py +++ b/tests/mutator/test_ops_materialisation.py @@ -130,6 +130,23 @@ async def test_add_decky_spawns_base_and_service_containers(repo, stubs): assert "newbox-ssh" in args +@pytest.mark.anyio +async def test_add_decky_flips_state_to_running_after_spawn(repo, stubs): + """Without this the dashboard's ACTIVE DECKIES count reads 0/N.""" + tid = await _make_active(repo) + lans = await repo.list_lans_for_topology(tid) + home_lan = lans[0]["name"] + + await apply_add_decky(repo, tid, { + "name": "newrunner", + "lan": home_lan, + "services": [], + }) + rows = await repo.list_topology_deckies(tid) + new = next(r for r in rows if r["name"] == "newrunner") + assert new["state"] == "running" + + @pytest.mark.anyio async def test_add_decky_skips_materialisation_when_pending(repo, stubs): """Pending topology gets DB write only — deploy_topology will spawn."""