diff --git a/decnet/web/db/sqlmodel_repo/__init__.py b/decnet/web/db/sqlmodel_repo/__init__.py index 9bdc7bf5..c7708067 100644 --- a/decnet/web/db/sqlmodel_repo/__init__.py +++ b/decnet/web/db/sqlmodel_repo/__init__.py @@ -12,7 +12,6 @@ backends. Dialect-specific behavior lives in subclasses: """ from __future__ import annotations -import asyncio import json import os @@ -23,7 +22,6 @@ from typing import Any, Optional, List, cast from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession, async_sessionmaker -from decnet.config import load_state from decnet.env import DECNET_ADMIN_USER, DECNET_ADMIN_PASSWORD from decnet.web.auth import get_password_hash from decnet.web.db.repository import BaseRepository @@ -172,8 +170,14 @@ class SQLModelRepository( return None async def get_deckies(self) -> List[dict]: - _state = await asyncio.to_thread(load_state) - return [_d.model_dump() for _d in _state[0].deckies] if _state else [] + # The fleet inventory the UI/API sees is fleet_deckies — the + # engine-mirrored table written on EVERY deploy/teardown (CLI or web), + # per the source-of-truth model documented in fleet/reconciler.py. + # Each row's decky_config column is a full DeckyConfig.model_dump( + # mode="json"), so it rehydrates to the same shape load_state() used + # to return. See development/ADR-001-FLEET-SOURCE-OF-TRUTH.md. 
+ rows = await self.list_fleet_deckies() + return [r["decky_config"] for r in rows if r.get("decky_config")] # --------------------------------------------------------------- users diff --git a/decnet/web/router/fleet/api_deploy_deckies.py b/decnet/web/router/fleet/api_deploy_deckies.py index 0aaca351..9da9180f 100644 --- a/decnet/web/router/fleet/api_deploy_deckies.py +++ b/decnet/web/router/fleet/api_deploy_deckies.py @@ -8,7 +8,7 @@ from decnet.bus.factory import get_bus from decnet.lifecycle.runner import run_deploy from decnet.logging import get_logger from decnet.telemetry import traced as _traced -from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT +from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, DeckyConfig, _ROOT from decnet.ini_loader import load_ini_from_string from decnet.network import detect_interface, detect_subnet, get_host_ip from decnet.web.dependencies import require_admin, repo @@ -19,6 +19,39 @@ log = get_logger("api") router = APIRouter() +async def _commit_fleet_to_db(deckies: list[DeckyConfig], *, replace_fleet: bool) -> None: + """Synchronously reconcile ``fleet_deckies`` to *deckies*. + + fleet_deckies is the source of truth the deploy guard now reads + (``existing_deckies``). Committing the intended shape here — before the + async deploy task's engine mirror runs — means rapid sequential web + deploys each read a current fleet (no self-wipe) and the dashboard + observes the new shape immediately. Mirrors the payload shape of + ``engine.deployer._mirror_fleet_deploy_to_db``. + + In replace mode, rows absent from *deckies* are deleted so the committed + inventory matches the desired set; the async reconciler/teardown mirror + converges the actual containers separately. 
+ """ + from decnet.web.db.models import LOCAL_HOST_SENTINEL + + keep = {(d.host_uuid or LOCAL_HOST_SENTINEL, d.name) for d in deckies} + if replace_fleet: + for row in await repo.list_fleet_deckies(): + host = row.get("host_uuid") or LOCAL_HOST_SENTINEL + if (host, row.get("name")) not in keep: + await repo.delete_fleet_decky(host_uuid=host, name=row["name"]) + for d in deckies: + await repo.upsert_fleet_decky({ + "host_uuid": d.host_uuid or LOCAL_HOST_SENTINEL, + "name": d.name, + "services": list(d.services), + "decky_config": d.model_dump(mode="json"), + "decky_ip": d.ip, + "state": "running", + }) + + @router.post( "/deckies/deploy", tags=["Fleet Management"], @@ -81,7 +114,19 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir # config below) so additive collision checks compare new against prior # rather than against themselves. Existing IPs are passed into # build_deckies_from_ini as reserved so auto-allocation skips them. - existing_deckies = list(config.deckies) if config is not None else [] + # The existing fleet comes from fleet_deckies (engine-mirrored on CLI + # *and* web deploys), NOT from config.deckies carried by the + # State["deployment"] key. A CLI/seed-established fleet never lands in + # that key, so the additive collision guard ran blind and the reconciler + # wiped the fleet — root cause of BUG-2. fleet_deckies is the store the + # source-of-truth model (fleet/reconciler.py) names as the API's view. + # See development/ADR-001-FLEET-SOURCE-OF-TRUTH.md. 
+ existing_rows = await repo.list_fleet_deckies() + existing_deckies = [ + DeckyConfig(**r["decky_config"]) + for r in existing_rows + if r.get("decky_config") + ] reserved_ips: set[str] | None = ( {d.ip for d in existing_deckies if d.ip} if not req.replace_fleet and existing_deckies @@ -192,6 +237,11 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir ), } await repo.set_state("deployment", new_state_payload) + # Commit the intended fleet to fleet_deckies — the store the deploy guard + # and get_deckies() now read. set_state("deployment") above is retained + # for the mutate handlers / mutator engine that still coordinate through + # that key (their consolidation is tracked in the ADR, open question 7). + await _commit_fleet_to_db(config.deckies, replace_fleet=req.replace_fleet) # Lifecycle rows track THIS call's deployments only. In additive mode # the existing deckies are already running and don't get a new diff --git a/development/ADR-001-FLEET-SOURCE-OF-TRUTH.md b/development/ADR-001-FLEET-SOURCE-OF-TRUTH.md new file mode 100644 index 00000000..1be9cf05 --- /dev/null +++ b/development/ADR-001-FLEET-SOURCE-OF-TRUTH.md @@ -0,0 +1,177 @@ +# ADR-001 — Fleet Source of Truth + +- **Status:** PROPOSED (discussion — not yet accepted) +- **Date:** 2026-06-12 +- **Context owner:** ANTI +- **Drives fix for:** BUG-2 (destructive fleet-replace / silent wipe), see `QA_REPORT.md` + +--- + +## 1. 
Context + +DECNET currently keeps the deployed-fleet inventory in **two unsynchronized stores**: + +| Store | Read by | Written by | +|-------|---------|------------| +| `decnet-state.json` file (`load_state()`) | `repo.get_deckies()` → the UI fleet view, collision pre-checks | CLI/engine path (`engine.deployer.save_state`), `decnet status`, sniffer, collector | +| DB `State` table, key `"deployment"` (`repo.get_state`/`set_state`) | the web deploy handler's `existing_deckies` snapshot | **only** the web deploy handler | + +The web is a **non-dependency**: the same deploys can be driven entirely from the CLI, and CLI state lives in `decnet-state.json`. Because the two stores never reconcile, a fleet established via CLI/seed is invisible to the web deploy handler's collision guard. + +### BUG-2 failure chain (source-traced) + +1. CLI/seed establishes a fleet → written to `decnet-state.json`, **never** to DB `"deployment"`. +2. UI reads `get_deckies()` (JSON) → shows decky-02/03 correctly. +3. Wizard POSTs a new decky-04 with `replace_fleet=false`. +4. Handler reads `existing_deckies` from `repo.get_state("deployment")` → **None** → `existing_deckies = []`. +5. Collision guard compares against `[]` → no conflict → `config.deckies = [] + [decky-04]`. +6. `run_deploy` → `LocalDeployStrategy` → `engine.deployer.deploy(config)`: + - `write_compose(config, COMPOSE_FILE)` writes a compose file containing **only decky-04** (`deployer.py:681`). + - `_compose("down", "--remove-orphans", …)` (`deployer.py:708`) tears down the whole compose project, then `up` brings back only decky-04. + - `_mirror_fleet_teardown_to_db` drops the survivors' rows. +7. Result: fleet silently wiped to one decky. HTTP 202. No warning. + +**Key trap:** the destructive call is `deployer.py:708` (`down --remove-orphans` against a compose file rewritten from `config.deckies`). 
Any source-of-truth fix that does not also guarantee `config.deckies` is the **complete** desired fleet before `write_compose` leaves BUG-2 alive. + +--- + +## 2. What the UI actually consumes + +`DeckyConfig` (`decnet/models.py:87`) full field set: + +``` +name, ip, services[], distro, base_image, build_base, hostname, +archetype, service_config{}, nmap_os, mutate_interval, last_mutated, +last_login_attempt, host_uuid +``` + +Frontend `Decky` type (`DeckyFleet/types.ts`) + what is **rendered/edited**: + +| Field | Displayed? | Where | +|-------|-----------|-------| +| name, ip, services | yes | DeckyCard / InspectPanel | +| hostname, distro, archetype | yes | DeckyInspectPanel:77-79 | +| mutate_interval, last_mutated | yes | DeckyInspectPanel:80-81 | +| **service_config** | **yes — EDITED** | DeckyCard:322 (per-service config editor `currentConfig`) | +| base_image, build_base, nmap_os, last_login_attempt | no | — | + +**Conclusion:** `service_config` is not just stored — it is rendered and **edited** in the UI. A "minimal scalar labels" scheme (name/ip/services only) would amputate editable state. Fidelity requires carrying the full `DeckyConfig`. + +--- + +## 3. Options + +### Option A — API reads only the DB; ignore `decnet-state.json` (web side) + +Align `get_deckies()` and the deploy handler both on DB `"deployment"`. The web becomes a self-contained plane on the DB; CLI stays on the JSON file. The two planes are explicitly **non-interoperable**. + +- **Pros:** smallest change; closes the desync *within the web plane*. +- **Cons:** ANTI's own verdict — "honestly the incorrect way of doing things." Two planes that can't see each other is a design smell, not a fix. A CLI-seeded fleet is still invisible to the web (and vice-versa); the wizard would still drive a reconciler that tears down CLI containers it can't see. Does **not** fix the cross-plane wipe, only the intra-web one. 
+ +### Option B — Docker container labels as source of truth (ANTI's proposal) + +Stamp every DECNET container with provenance + identity labels; reconstruct the fleet by querying Docker. `decnet-state.json` degrades to a CLI-side convenience cache, no longer authoritative. + +Proposed labels: +``` +com.decnet.host = "true" # selector for "this is a DECNET decky" +com.decnet.deploy_type = "api" | "cli" # provenance, NOT a partition +com.decnet.service = "<service-name>" # or the broader identity +com.decnet.config = "<DeckyConfig as JSON>" # REQUIRED to preserve service_config fidelity (see §2) +``` + +Fleet read becomes `docker ps --filter label=com.decnet.host=true` (+ `-a` for stopped), then deserialize `com.decnet.config`. + +- **Pros:** + - **One source of truth = reality.** The collision guard and the reconciler read the SAME state, so BUG-2 cannot recur. + - Survives a DECNET process restart (Docker keeps running; labels persist on the real object). + - `deploy_type` makes the "two planes" distinction unnecessary — one fleet, labeled by origin. The guard queries ALL `com.decnet.host=true` regardless of origin, so it can never blind-wipe a CLI decky. + - This is the orchestrator-standard pattern (label the real object, reconcile against it). +- **Cons / constraints:** + - **Swarm.** The master cannot `docker ps` a remote worker. Remote deckies STILL need a registry → keep `decky_shards` (DB, heartbeat-driven). Honest model is **hybrid**: local truth = labels, remote truth = `decky_shards`. + - **Fleet-global config** (`interface, subnet, gateway, ipvlan, mutate_interval, log_file, compose_path`) is not per-container. Proposed home: **labels on the macvlan/ipvlan network object** (exactly one, DECNET-owned, correct scope). NOT replicated onto every container. + - **Label payload.** Preserving `service_config` fidelity forces a `com.decnet.config` JSON blob. Works (label values are generous) but it is config-in-label-land, with its own serialization discipline. 
+ - **Performance.** `/deckies` is UI-polled and load-tested. Querying Docker on every read is heavier than a file/DB read. Mitigation: the existing 5s TTL cache (`api_get_deckies.py:_DECKIES_TTL`) extends naturally over the Docker query. + - **Does NOT by itself fix `deployer.py:708`.** Labels give the DATA to build the COMPLETE config (live + new) before `write_compose`; the merge must actually be done. Labels make the correct merge possible; they don't perform it. + +### Option C — Single DB store as canonical (both web and CLI write DB) + +Make the CLI write the DB `"deployment"` key too; retire `decnet-state.json` as authority. One store, but it's bookkeeping, not reality — can still drift from actual containers on crash/manual `docker rm`. + +- **Pros:** single store; no Docker-query perf cost; swarm-friendly (DB is already the remote registry). +- **Cons:** reintroduces the "trust the ledger, not reality" fragility that Option B specifically escapes; CLI now hard-depends on the DB being reachable, eroding the web-is-a-non-dependency property. + +--- + +## 4. Recommendation (for discussion) + +**Option B (labels), accepted as a hybrid:** local fleet truth = Docker labels; remote fleet truth = `decky_shards` (DB); fleet-global config = network-object labels; `decnet-state.json` demoted to CLI convenience cache. + +Mandatory companion change regardless of option chosen: **build the complete desired `config.deckies` (surviving live fleet + new submissions) before `write_compose`/`deployer.py:708`**, so `down --remove-orphans` + `up` is a no-op on survivors. This is the actual teardown fix; the source-of-truth choice only determines *where the survivor list is read from*. + +--- + +## 5. Open questions (resolve before cutting code) + +1. **`com.decnet.config` blob vs. exploded scalar labels** — do we accept one JSON label for fidelity, or split into N labels and reconstruct? (Fidelity for `service_config` pushes toward the blob.) +2. 
**Global config home** — network-object labels confirmed as the home, or a single sentinel "fleet" container/label set? +3. **Swarm boundary** — is the local-labels / remote-`decky_shards` split acceptable, or do we want labels mirrored back to the master via heartbeat for a uniform read path? +4. **Stopped/failed containers** — does `-a` (include stopped) count toward the fleet for collision purposes, and how do we represent non-running status the JSON file never tracked? +5. **Migration** — first label-aware deploy after upgrade: how do we adopt already-running unlabeled containers (relabel in place vs. require one redeploy)? +6. **`decnet-state.json` final role** — pure CLI cache, or removed entirely with CLI also reading labels? + +--- + +## 6. Affected files (for whichever option lands) + +- `decnet/web/router/fleet/api_deploy_deckies.py` — `existing_deckies` snapshot (lines 48, 84), collision guard (124-145), `set_state("deployment")` (194) +- `decnet/web/router/fleet/api_get_deckies.py` — `get_deckies` read path + TTL cache +- `decnet/web/db/sqlmodel_repo/__init__.py:174` — `get_deckies()` (currently `load_state()`) +- `decnet/engine/deployer.py:681` (`write_compose`), `:708` (`down --remove-orphans`), `:571`/`:623` (`_mirror_fleet_*`) +- `decnet/config.py` — `save_state`/`load_state`, `STATE_FILE` +- `decnet/lifecycle/runner.py` / `strategies.py` — `LocalDeployStrategy` → `deployer.deploy` +- `decnet/models.py:87` — `DeckyConfig` (label serialization surface) + +--- + +## 7. CORRECTION (source-traced 2026-06-12) — the store topology is wider than §1 said + +§1's claim that DB `State["deployment"]` is *"written only by the web deploy handler"* is **WRONG**. 
A grep for its readers/writers shows it is the shared coordination store for the **entire web + mutator plane**: + +| Site | Op | +|------|----| +| `api_deploy_deckies.py:48,194` | read + write | +| `api_mutate_decky.py:55,76` | read + write | +| `api_mutate_interval.py:32,45` | read + write | +| `swarm_mgmt/api_list_deckies.py:28` | read | +| `mutator/engine.py:84,126,189,413` | read + write (autonomous mutator) | + +Consequences: +- A one-line "deploy handler reads `load_state()`" swap makes deploy **diverge from its own plane** (mutate handlers + the background mutator still read the DB key). Lateral move, not a fix. **Empirically confirmed:** that edit broke 4/5 tests in `tests/api/fleet/test_deploy_additive.py` (the survivor was `replace_fleet=True`, the only case that doesn't read the prior fleet), because under `DECNET_CONTRACT_TEST` the deploy task is skipped so `save_state` never writes the JSON, and the handler couldn't see its own prior `set_state` write. Read-one-store / write-another is self-inconsistent. +- Pointing `get_deckies()` at the DB key **also fails to fix BUG-2**: a CLI-seeded fleet isn't in `State["deployment"]` either (CLI writes JSON + `fleet_deckies`), so the reconcile-against-incomplete-inventory wipe survives. + +### The model the codebase ALREADY documents (`fleet/reconciler.py:1-29`) + +``` +1. decnet-state.json — canonical for offline / no-API consumers (CLI, status, sniffer, collector) +2. fleet_deckies table — "what the orchestrator, web dashboard, and REST API see" +3. docker inspect — actual per-container runtime state +Resolution: JSON-only → INSERT; DB-only(this host) → DELETE; both → state := docker-aggregated. +``` + +Two facts this hands us: +1. **The API was DESIGNED to read `fleet_deckies`** — the engine-mirrored table written on *every* deploy/teardown regardless of origin (`deployer.py:571 _mirror_fleet_deploy_to_db`, `:623` teardown). 
The live deploy/collision-guard code reading `State["deployment"]`, and `get_deckies()` reading the JSON file, are both **drift from the documented design**. `fleet_deckies` is the cross-plane store that *does* contain a CLI-seeded fleet. +2. **Docker is already the ultimate authority** — the reconciler converges JSON and DB *to docker-aggregated state*. ANTI's label proposal (Option B) is not a new paradigm; it promotes docker from reconciler-tiebreaker to primary read path. + +### Revised recommendation + +Two viable directions, both grounded in the existing design rather than a new store: + +- **B′ (labels / docker-primary)** — the ADR's Option B, now understood as *promoting* the reconciler's existing docker-authoritative tiebreaker to the primary fleet read. Strongest long-term; same swarm caveat (remote = `decky_shards`/`fleet_deckies`, master can't `docker ps` workers). +- **D (converge on `fleet_deckies` now)** — make the deploy collision-guard AND `get_deckies()` read `fleet_deckies` (`list_fleet_deckies` / `list_running_fleet_deckies`), the store the design already names as the API's view. Smaller than relabelling; immediately closes the CLI-invisible-to-web gap because `fleet_deckies` is engine-mirrored on CLI deploys too. The mutate handlers + mutator engine reading `State["deployment"]` become the next consolidation target. + +**Unchanged hard constraint:** whichever store wins, the handler must still build the COMPLETE desired `config.deckies` (survivors + new) before `write_compose`/`deployer.py:708`. The store choice only decides where "survivors" is read from. + +### Open question added to §5 + +7. **`State["deployment"]` vs `fleet_deckies`** — do we converge the whole web+mutator plane onto `fleet_deckies` (Option D), or go straight to docker-primary (Option B′) and let `fleet_deckies` be the swarm/remote registry? The mutator engine (`mutator/engine.py`) is the heaviest consumer of `State["deployment"]` and must move in lockstep. 
diff --git a/tests/api/config/test_deploy_limit.py b/tests/api/config/test_deploy_limit.py index 0fcd949e..9b5a82ab 100644 --- a/tests/api/config/test_deploy_limit.py +++ b/tests/api/config/test_deploy_limit.py @@ -2,6 +2,8 @@ import pytest from unittest.mock import patch +from decnet.config import DeckyConfig +from decnet.web.db.models import LOCAL_HOST_SENTINEL from decnet.web.dependencies import repo @@ -15,70 +17,93 @@ def contract_test_mode(monkeypatch): def mock_network(): """Mock network detection so deploy doesn't call `ip addr show`.""" with patch("decnet.web.router.fleet.api_deploy_deckies.get_host_ip", return_value="192.168.1.100"): - yield + with patch("decnet.web.router.fleet.api_deploy_deckies.detect_interface", return_value="eth0"): + with patch("decnet.web.router.fleet.api_deploy_deckies.detect_subnet", return_value=("192.168.1.0/24", "192.168.1.1")): + yield + + +async def _clear_fleet() -> None: + for row in await repo.list_fleet_deckies(): + await repo.delete_fleet_decky( + host_uuid=row.get("host_uuid") or LOCAL_HOST_SENTINEL, + name=row["name"], + ) + + +async def _seed_fleet(name: str, ip: str) -> None: + cfg = DeckyConfig( + name=name, ip=ip, services=["ssh"], distro="debian", + base_image="debian", hostname=name, + ) + await repo.upsert_fleet_decky({ + "host_uuid": LOCAL_HOST_SENTINEL, + "name": name, + "services": ["ssh"], + "decky_config": cfg.model_dump(mode="json"), + "decky_ip": ip, + "state": "running", + }) + + +@pytest.fixture(autouse=True) +async def _isolate_fleet(): + await _clear_fleet() + yield + await _clear_fleet() @pytest.mark.anyio -async def test_deploy_respects_limit(client, auth_token, mock_state_file): - """Deploy should reject if the *submitted* INI exceeds the limit. 
- The INI is the source of truth — prior state is fully replaced — so the - check runs on the new decky count alone.""" +async def test_deploy_respects_limit(client, auth_token): + """The limit counts the WHOLE resulting fleet — existing (from + fleet_deckies) plus the submitted INI — not the INI alone. One existing + decky + one submitted, against a limit of 1, must be rejected.""" await repo.set_state("config_limits", {"deployment_limit": 1}) - await repo.set_state("deployment", mock_state_file) + await _seed_fleet("decky-existing", "192.168.1.10") - ini = """[decky-a] -services = ssh - -[decky-b] -services = ssh -""" + ini = "[decky-new]\nservices = ssh\n" resp = await client.post( "/api/v1/deckies/deploy", json={"ini_content": ini}, headers={"Authorization": f"Bearer {auth_token}"}, ) - # 2 new deckies > limit of 1 + # existing(1) + new(1) = 2 > limit 1 assert resp.status_code == 409 assert "limit" in resp.json()["detail"].lower() @pytest.mark.anyio -async def test_deploy_replaces_prior_state(client, auth_token, mock_state_file): - """Submitting an INI with 1 decky must not silently re-include the 2 - deckies from prior state (that caused the 'Address already in use' - regression when stale decky2/decky3 redeployed on stale IPs).""" +async def test_deploy_replaces_prior_state(client, auth_token): + """replace_fleet=True drops the prior fleet rather than silently + re-including it (the 'Address already in use' regression came from stale + deckies redeploying on stale IPs). 
After replace, the committed fleet is + exactly the submitted INI.""" await repo.set_state("config_limits", {"deployment_limit": 10}) - await repo.set_state("deployment", mock_state_file) + await _seed_fleet("test-decky-1", "192.168.1.10") + await _seed_fleet("test-decky-2", "192.168.1.11") - ini = """[only-decky] -services = ssh -""" + ini = "[only-decky]\nservices = ssh\n" resp = await client.post( "/api/v1/deckies/deploy", - json={"ini_content": ini}, + json={"ini_content": ini, "replace_fleet": True}, headers={"Authorization": f"Bearer {auth_token}"}, ) - assert resp.status_code == 202 - persisted = await repo.get_state("deployment") - names = [d["name"] for d in persisted["config"]["deckies"]] - assert names == ["only-decky"] + assert resp.status_code == 202, resp.text + names = {d["name"] for d in await repo.get_deckies()} + assert names == {"only-decky"} @pytest.mark.anyio -async def test_deploy_within_limit(client, auth_token, mock_state_file): - """Deploy should succeed when within limit.""" +async def test_deploy_within_limit(client, auth_token): + """Deploy should succeed when the resulting fleet is within limit.""" await repo.set_state("config_limits", {"deployment_limit": 100}) - await repo.set_state("deployment", mock_state_file) + await _seed_fleet("decky-existing", "192.168.1.10") - ini = """[decky-new] -services = ssh -""" + ini = "[decky-new]\nservices = ssh\n" resp = await client.post( "/api/v1/deckies/deploy", json={"ini_content": ini}, headers={"Authorization": f"Bearer {auth_token}"}, ) - # Should not fail due to limit if resp.status_code == 409: assert "limit" not in resp.json()["detail"].lower() else: diff --git a/tests/api/conftest.py b/tests/api/conftest.py index 9c512c19..edb4378c 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -211,6 +211,47 @@ def mock_state_file(patch_state_file: Path): patch_state_file.write_text(json.dumps(_test_state)) yield _test_state + +@pytest.fixture +async def mock_fleet_deckies(): + """Seed 
fleet_deckies with two deckies — the store get_deckies() reads + under the Option-D source-of-truth model (development/ADR-001-...md). + Mirrors the data mock_state_file used to put in decnet-state.json.""" + from decnet.config import DeckyConfig + from decnet.web.db.models import LOCAL_HOST_SENTINEL + from decnet.web.dependencies import repo + + async def _clear() -> None: + for row in await repo.list_fleet_deckies(): + await repo.delete_fleet_decky( + host_uuid=row.get("host_uuid") or LOCAL_HOST_SENTINEL, + name=row["name"], + ) + + specs = [ + ("test-decky-1", "192.168.1.10", ["ssh"], "debian", "test-host-1", + {"ssh": {"banner": "SSH-2.0-OpenSSH_8.9"}}, "deaddeck"), + ("test-decky-2", "192.168.1.11", ["http"], "ubuntu", "test-host-2", + {}, None), + ] + await _clear() + for name, ip, services, distro, hostname, svc_cfg, arche in specs: + cfg = DeckyConfig( + name=name, ip=ip, services=services, distro=distro, + base_image=distro, hostname=hostname, + service_config=svc_cfg, archetype=arche, + ) + await repo.upsert_fleet_decky({ + "host_uuid": LOCAL_HOST_SENTINEL, + "name": name, + "services": services, + "decky_config": cfg.model_dump(mode="json"), + "decky_ip": ip, + "state": "running", + }) + yield + await _clear() + # Share fuzz settings across API tests # FUZZ_EXAMPLES: keep low for dev speed; bump via HYPOTHESIS_MAX_EXAMPLES env var in CI _FUZZ_EXAMPLES = int(_os.environ.get("HYPOTHESIS_MAX_EXAMPLES", "10")) diff --git a/tests/api/fleet/test_deploy_additive.py b/tests/api/fleet/test_deploy_additive.py index cd025fb1..97a582dc 100644 --- a/tests/api/fleet/test_deploy_additive.py +++ b/tests/api/fleet/test_deploy_additive.py @@ -5,6 +5,13 @@ Default behaviour (replace_fleet=False) appends the INI to the existing fleet so the wizard's "deploy one more decky" submit no longer wipes prior deckies. replace_fleet=True preserves the historical set-desired-state semantics for CLI / declarative callers. 
+ +The existing fleet is read from fleet_deckies — the engine-mirrored table +written on every deploy/teardown (CLI or web), per the source-of-truth +model in fleet/reconciler.py. These tests seed fleet_deckies directly, +which also models the BUG-2 scenario: a fleet established out of band +(CLI/seed) that the web deploy guard must see and append to rather than +wipe. See development/ADR-001-FLEET-SOURCE-OF-TRUTH.md. """ from __future__ import annotations @@ -12,6 +19,8 @@ from unittest.mock import patch import pytest +from decnet.config import DeckyConfig +from decnet.web.db.models import LOCAL_HOST_SENTINEL from decnet.web.dependencies import repo @@ -28,96 +37,111 @@ def mock_network(): yield +async def _clear_fleet() -> None: + for row in await repo.list_fleet_deckies(): + await repo.delete_fleet_decky( + host_uuid=row.get("host_uuid") or LOCAL_HOST_SENTINEL, + name=row["name"], + ) + + +async def _seed_fleet(name: str, *, ip: str = "192.168.1.10", services=("ssh",)) -> None: + """Insert a decky into fleet_deckies, as the engine mirror does on a + CLI/web deploy. 
Stamps a full DeckyConfig into decky_config so the deploy + guard can rehydrate it.""" + cfg = DeckyConfig( + name=name, + ip=ip, + services=list(services), + distro="debian", + base_image="debian:bookworm-slim", + hostname=name, + ) + await repo.upsert_fleet_decky({ + "host_uuid": LOCAL_HOST_SENTINEL, + "name": name, + "services": list(services), + "decky_config": cfg.model_dump(mode="json"), + "decky_ip": ip, + "state": "running", + }) + + @pytest.fixture(autouse=True) async def _isolate_state(): for row in await repo.list_swarm_hosts(): await repo.delete_swarm_host(row["uuid"]) await repo.set_state("deployment", None) + await _clear_fleet() yield await repo.set_state("deployment", None) + await _clear_fleet() @pytest.mark.anyio -async def test_additive_default_appends_to_existing_fleet(client, auth_token, monkeypatch): - """Two sequential deploys with replace_fleet unset → both deckies in state.""" +async def test_additive_onto_existing_fleet_appends_not_wipes(client, auth_token, monkeypatch): + """BUG-2 regression: an additive web deploy onto a fleet established out + of band (CLI/seed → fleet_deckies) appends rather than wiping it. 
+ + Previously the guard read State["deployment"] (empty for a CLI-seeded + fleet), so existing_deckies was [] and the reconciler tore the running + fleet down to the single submitted decky.""" monkeypatch.setenv("DECNET_MODE", "master") + await _seed_fleet("decky-01", ip="192.168.1.10") - r1 = await client.post( - "/api/v1/deckies/deploy", - json={"ini_content": "[decky-01]\nservices = ssh\n"}, - headers={"Authorization": f"Bearer {auth_token}"}, - ) - assert r1.status_code == 202, r1.text - - r2 = await client.post( + r = await client.post( "/api/v1/deckies/deploy", json={"ini_content": "[decky-02]\nservices = http\n"}, headers={"Authorization": f"Bearer {auth_token}"}, ) - assert r2.status_code == 202, r2.text + assert r.status_code == 202, r.text - committed = await repo.get_state("deployment") - assert committed is not None - names = {d["name"] for d in committed["config"]["deckies"]} + names = {d["name"] for d in await repo.get_deckies()} assert names == {"decky-01", "decky-02"} @pytest.mark.anyio async def test_additive_name_collision_returns_409(client, auth_token, monkeypatch): - """Re-submitting an existing decky name without replace_fleet → 409.""" + """Submitting a decky whose name already exists in the fleet without + replace_fleet → 409.""" monkeypatch.setenv("DECNET_MODE", "master") + await _seed_fleet("decky-01") - r1 = await client.post( - "/api/v1/deckies/deploy", - json={"ini_content": "[decky-01]\nservices = ssh\n"}, - headers={"Authorization": f"Bearer {auth_token}"}, - ) - assert r1.status_code == 202, r1.text - - r2 = await client.post( + r = await client.post( "/api/v1/deckies/deploy", json={"ini_content": "[decky-01]\nservices = http\n"}, headers={"Authorization": f"Bearer {auth_token}"}, ) - assert r2.status_code == 409, r2.text - assert "decky-01" in r2.json()["detail"] - assert "replace_fleet" in r2.json()["detail"] + assert r.status_code == 409, r.text + assert "decky-01" in r.json()["detail"] + assert "replace_fleet" in 
r.json()["detail"] @pytest.mark.anyio async def test_additive_ip_collision_returns_409(client, auth_token, monkeypatch): - """A new decky pinned to an IP already in use → 409 with the IP.""" + """A new decky pinned to an IP already in use by the existing fleet → 409 + with the IP.""" monkeypatch.setenv("DECNET_MODE", "master") + await _seed_fleet("decky-01", ip="192.168.1.50") - r1 = await client.post( - "/api/v1/deckies/deploy", - json={"ini_content": "[decky-01]\nservices = ssh\nip = 192.168.1.50\n"}, - headers={"Authorization": f"Bearer {auth_token}"}, - ) - assert r1.status_code == 202, r1.text - - r2 = await client.post( + r = await client.post( "/api/v1/deckies/deploy", json={"ini_content": "[decky-02]\nservices = http\nip = 192.168.1.50\n"}, headers={"Authorization": f"Bearer {auth_token}"}, ) - assert r2.status_code == 409, r2.text - assert "192.168.1.50" in r2.json()["detail"] + assert r.status_code == 409, r.text + assert "192.168.1.50" in r.json()["detail"] @pytest.mark.anyio async def test_replace_fleet_true_overwrites_existing(client, auth_token, monkeypatch): - """replace_fleet=True preserves the historical full-replace semantics.""" + """replace_fleet=True preserves the historical full-replace semantics: + the existing fleet is dropped and the committed inventory is exactly the + submitted INI.""" monkeypatch.setenv("DECNET_MODE", "master") + await _seed_fleet("decky-01") - r1 = await client.post( - "/api/v1/deckies/deploy", - json={"ini_content": "[decky-01]\nservices = ssh\n"}, - headers={"Authorization": f"Bearer {auth_token}"}, - ) - assert r1.status_code == 202, r1.text - - r2 = await client.post( + r = await client.post( "/api/v1/deckies/deploy", json={ "ini_content": "[decky-02]\nservices = http\n", @@ -125,11 +149,9 @@ async def test_replace_fleet_true_overwrites_existing(client, auth_token, monkey }, headers={"Authorization": f"Bearer {auth_token}"}, ) - assert r2.status_code == 202, r2.text + assert r.status_code == 202, r.text - committed 
= await repo.get_state("deployment") - assert committed is not None - names = {d["name"] for d in committed["config"]["deckies"]} + names = {d["name"] for d in await repo.get_deckies()} assert names == {"decky-02"} @@ -139,25 +161,16 @@ async def test_additive_lifecycle_ids_scoped_to_new_deckies(client, auth_token, the caller submitted, not carryover. Operators polling /deckies/lifecycle?ids=... see exactly what this call deployed.""" monkeypatch.setenv("DECNET_MODE", "master") + await _seed_fleet("decky-01", ip="192.168.1.10") + await _seed_fleet("decky-02", ip="192.168.1.11") - r1 = await client.post( - "/api/v1/deckies/deploy", - json={"ini_content": "[decky-01]\nservices = ssh\n[decky-02]\nservices = http\n"}, - headers={"Authorization": f"Bearer {auth_token}"}, - ) - assert r1.status_code == 202, r1.text - assert len(r1.json()["lifecycle_ids"]) == 2 - - r2 = await client.post( + r = await client.post( "/api/v1/deckies/deploy", json={"ini_content": "[decky-03]\nservices = ssh\n"}, headers={"Authorization": f"Bearer {auth_token}"}, ) - assert r2.status_code == 202, r2.text - body2 = r2.json() - assert len(body2["lifecycle_ids"]) == 1 + assert r.status_code == 202, r.text + assert len(r.json()["lifecycle_ids"]) == 1 - committed = await repo.get_state("deployment") - assert committed is not None - names = {d["name"] for d in committed["config"]["deckies"]} + names = {d["name"] for d in await repo.get_deckies()} assert names == {"decky-01", "decky-02", "decky-03"} diff --git a/tests/api/fleet/test_get_deckies.py b/tests/api/fleet/test_get_deckies.py index 038b9965..9186986c 100644 --- a/tests/api/fleet/test_get_deckies.py +++ b/tests/api/fleet/test_get_deckies.py @@ -5,7 +5,7 @@ from hypothesis import given, settings, strategies as st from ..conftest import _FUZZ_SETTINGS @pytest.mark.anyio -async def test_get_deckies_endpoint(mock_state_file, client: httpx.AsyncClient, auth_token: str): +async def test_get_deckies_endpoint(mock_fleet_deckies, client: 
httpx.AsyncClient, auth_token: str): _response = await client.get("/api/v1/deckies", headers={"Authorization": f"Bearer {auth_token}"}) assert _response.status_code == 200 _data = _response.json()