fix(orchestrator): see fleet + shard deckies, not just topology rows
Switches _one_tick from list_running_topology_deckies to
list_running_deckies (the union view added in 095500a). Resolves the
permanent "no actionable deckies (running+ssh count=0)" log on hosts
running only unihost MACVLAN / IPVLAN decoys — the orchestrator now
sees fleet_deckies rows alongside MazeNET topology rows and SWARM
DeckyShard rows.
Also fixes the misleading log message: the old "running+ssh count=N"
reported the *pre-filter* total (count of all running deckies, not
the SSH-eligible subset that scheduler.pick actually evaluates). New
line breaks down running, ssh_eligible, and per-source counts so
debugging "why isn't it picking?" no longer requires reading
scheduler internals.
Regression test: orchestrator integration suite now seeds fleet_deckies
rows (not just topology_deckies) and verifies a tick picks them and
records an event with dst="local:fleet-*" — proving the original bug
on the operator's mothership is fixed.
This commit is contained in:
@@ -98,12 +98,30 @@ async def orchestrator_worker(
|
|||||||
|
|
||||||
|
|
||||||
async def _one_tick(repo: BaseRepository, driver, bus) -> None:
|
async def _one_tick(repo: BaseRepository, driver, bus) -> None:
|
||||||
deckies = await repo.list_running_topology_deckies()
|
# Union view: MazeNET topology + unihost fleet + SWARM shards. Pre-fleet
|
||||||
|
# this only saw topology_deckies and was permanently blind to MACVLAN /
|
||||||
|
# IPVLAN unihost decoys.
|
||||||
|
deckies = await repo.list_running_deckies()
|
||||||
action = scheduler.pick(deckies)
|
action = scheduler.pick(deckies)
|
||||||
if action is None:
|
if action is None:
|
||||||
|
# Report the actual SSH-eligible count (what the scheduler filters
|
||||||
|
# to), not just len(deckies) — the old "running+ssh count=N" line
|
||||||
|
# reported the pre-filter count and misled debugging.
|
||||||
|
ssh_eligible = sum(
|
||||||
|
1 for d in deckies
|
||||||
|
if isinstance(d.get("services"), list)
|
||||||
|
and "ssh" in d["services"]
|
||||||
|
and d.get("ip")
|
||||||
|
)
|
||||||
|
by_source: dict[str, int] = {}
|
||||||
|
for d in deckies:
|
||||||
|
by_source[d.get("source", "unknown")] = (
|
||||||
|
by_source.get(d.get("source", "unknown"), 0) + 1
|
||||||
|
)
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"orchestrator: no actionable deckies (running+ssh count=%d)",
|
"orchestrator: no actionable deckies "
|
||||||
len(deckies),
|
"(running=%d ssh_eligible=%d sources=%s)",
|
||||||
|
len(deckies), ssh_eligible, by_source,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|||||||
@@ -106,6 +106,44 @@ async def test_one_tick_records_event_and_publishes(repo, fake_bus, monkeypatch)
|
|||||||
assert ev.payload["kind"] == row["kind"]
|
assert ev.payload["kind"] == row["kind"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_one_tick_picks_fleet_deckies(repo, fake_bus, monkeypatch):
|
||||||
|
"""Regression: orchestrator was permanently blind to unihost MACVLAN /
|
||||||
|
IPVLAN deckies because list_running_topology_deckies only scans
|
||||||
|
topology_deckies. The new union view (list_running_deckies) must
|
||||||
|
pull in fleet_deckies rows too."""
|
||||||
|
await repo.upsert_fleet_decky({
|
||||||
|
"host_uuid": "local",
|
||||||
|
"name": "fleet-d1",
|
||||||
|
"services": ["ssh"],
|
||||||
|
"decky_ip": "10.0.0.50",
|
||||||
|
"state": "running",
|
||||||
|
})
|
||||||
|
await repo.upsert_fleet_decky({
|
||||||
|
"host_uuid": "local",
|
||||||
|
"name": "fleet-d2",
|
||||||
|
"services": ["ssh"],
|
||||||
|
"decky_ip": "10.0.0.51",
|
||||||
|
"state": "running",
|
||||||
|
})
|
||||||
|
|
||||||
|
async def fake_run(argv):
|
||||||
|
if argv[3] == "python3":
|
||||||
|
return 0, "SSH-2.0-OpenSSH_9.6\r\n", ""
|
||||||
|
return 0, "", ""
|
||||||
|
|
||||||
|
monkeypatch.setattr(ssh_driver, "_run", fake_run)
|
||||||
|
|
||||||
|
driver = ssh_driver.SSHDriver()
|
||||||
|
await orch_worker._one_tick(repo, driver, fake_bus)
|
||||||
|
|
||||||
|
rows = await repo.list_orchestrator_events(limit=10)
|
||||||
|
assert len(rows) == 1
|
||||||
|
# The dst_decky_uuid is our composite "host_uuid:name" identifier
|
||||||
|
# for fleet-source rows (see SQLModelRepository.list_running_deckies).
|
||||||
|
assert rows[0]["dst_decky_uuid"].startswith("local:fleet-")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_tick_is_noop_when_no_running_deckies(repo, fake_bus, monkeypatch):
|
async def test_tick_is_noop_when_no_running_deckies(repo, fake_bus, monkeypatch):
|
||||||
called = False
|
called = False
|
||||||
|
|||||||
Reference in New Issue
Block a user