fix(fleet): read existing fleet from fleet_deckies, not State["deployment"] (BUG-2)

The web deploy collision-guard read the existing fleet from the DB
State["deployment"] key, while the UI/get_deckies() read decnet-state.json.
A fleet established via CLI/seed lands in neither path the guard consulted,
so existing_deckies was empty, the additive guard ran blind, and the
reconciler tore the running fleet down to the single submitted decky
(BUG-2: silent fleet wipe, HTTP 202, no warning).

Converge both reads on fleet_deckies — the engine-mirrored table written on
every deploy/teardown (CLI and web), which fleet/reconciler.py already
documents as the store the orchestrator, dashboard, and REST API see. Each
row's decky_config column is a full DeckyConfig dump, so it rehydrates
losslessly into the collision-guard input. The handler also commits the
intended fleet to fleet_deckies synchronously so rapid sequential deploys
read a current fleet and the dashboard observes the new shape immediately.

State["deployment"] is retained for now — the mutate handlers and the
mutator engine still coordinate through it; consolidating them is tracked
in development/ADR-001-FLEET-SOURCE-OF-TRUTH.md (open question 7).

Tests seed fleet_deckies directly (also modelling the CLI-seeded scenario)
rather than chaining real deploys through the skipped contract-test path.
This commit is contained in:
2026-06-12 23:52:20 -04:00
parent 408810b3e2
commit ab1151ee7f
7 changed files with 415 additions and 105 deletions

View File

@@ -12,7 +12,6 @@ backends. Dialect-specific behavior lives in subclasses:
""" """
from __future__ import annotations from __future__ import annotations
import asyncio
import json import json
import os import os
@@ -23,7 +22,6 @@ from typing import Any, Optional, List, cast
from sqlalchemy import select from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession, async_sessionmaker from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession, async_sessionmaker
from decnet.config import load_state
from decnet.env import DECNET_ADMIN_USER, DECNET_ADMIN_PASSWORD from decnet.env import DECNET_ADMIN_USER, DECNET_ADMIN_PASSWORD
from decnet.web.auth import get_password_hash from decnet.web.auth import get_password_hash
from decnet.web.db.repository import BaseRepository from decnet.web.db.repository import BaseRepository
@@ -172,8 +170,14 @@ class SQLModelRepository(
return None return None
async def get_deckies(self) -> List[dict]: async def get_deckies(self) -> List[dict]:
_state = await asyncio.to_thread(load_state) # The fleet inventory the UI/API sees is fleet_deckies — the
return [_d.model_dump() for _d in _state[0].deckies] if _state else [] # engine-mirrored table written on EVERY deploy/teardown (CLI or web),
# per the source-of-truth model documented in fleet/reconciler.py.
# Each row's decky_config column is a full DeckyConfig.model_dump(
# mode="json"), so it rehydrates to the same shape load_state() used
# to return. See development/ADR-001-FLEET-SOURCE-OF-TRUTH.md.
rows = await self.list_fleet_deckies()
return [r["decky_config"] for r in rows if r.get("decky_config")]
# --------------------------------------------------------------- users # --------------------------------------------------------------- users

View File

@@ -8,7 +8,7 @@ from decnet.bus.factory import get_bus
from decnet.lifecycle.runner import run_deploy from decnet.lifecycle.runner import run_deploy
from decnet.logging import get_logger from decnet.logging import get_logger
from decnet.telemetry import traced as _traced from decnet.telemetry import traced as _traced
from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, DeckyConfig, _ROOT
from decnet.ini_loader import load_ini_from_string from decnet.ini_loader import load_ini_from_string
from decnet.network import detect_interface, detect_subnet, get_host_ip from decnet.network import detect_interface, detect_subnet, get_host_ip
from decnet.web.dependencies import require_admin, repo from decnet.web.dependencies import require_admin, repo
@@ -19,6 +19,39 @@ log = get_logger("api")
router = APIRouter() router = APIRouter()
async def _commit_fleet_to_db(deckies: list[DeckyConfig], *, replace_fleet: bool) -> None:
    """Write the intended fleet shape into ``fleet_deckies`` before returning.

    The deploy guard builds ``existing_deckies`` from ``fleet_deckies``, so
    the handler records the desired fleet here, inline, instead of waiting
    for the async deploy task's engine mirror. Back-to-back web deploys then
    each see an up-to-date fleet, and the dashboard shows the new shape
    straight away. The row payload follows the same shape as
    ``engine.deployer._mirror_fleet_deploy_to_db``.

    Args:
        deckies: The complete desired fleet. Every entry is upserted with
            ``state`` set to ``"running"``.
        replace_fleet: When true, rows whose ``(host_uuid, name)`` pair is
            not present in *deckies* are deleted first, so the stored
            inventory equals the desired set. Converging the actual
            containers is left to the async reconciler/teardown mirror.
    """
    from decnet.web.db.models import LOCAL_HOST_SENTINEL

    # Identity of a row is (host, name); a missing host means "this host".
    desired_keys = {
        (decky.host_uuid or LOCAL_HOST_SENTINEL, decky.name) for decky in deckies
    }

    if replace_fleet:
        current_rows = await repo.list_fleet_deckies()
        for current in current_rows:
            owner = current.get("host_uuid") or LOCAL_HOST_SENTINEL
            if (owner, current.get("name")) in desired_keys:
                continue
            await repo.delete_fleet_decky(host_uuid=owner, name=current["name"])

    for decky in deckies:
        record = {
            "host_uuid": decky.host_uuid or LOCAL_HOST_SENTINEL,
            "name": decky.name,
            "services": list(decky.services),
            "decky_config": decky.model_dump(mode="json"),
            "decky_ip": decky.ip,
            "state": "running",
        }
        await repo.upsert_fleet_decky(record)
@router.post( @router.post(
"/deckies/deploy", "/deckies/deploy",
tags=["Fleet Management"], tags=["Fleet Management"],
@@ -81,7 +114,19 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
# config below) so additive collision checks compare new against prior # config below) so additive collision checks compare new against prior
# rather than against themselves. Existing IPs are passed into # rather than against themselves. Existing IPs are passed into
# build_deckies_from_ini as reserved so auto-allocation skips them. # build_deckies_from_ini as reserved so auto-allocation skips them.
existing_deckies = list(config.deckies) if config is not None else [] # The existing fleet comes from fleet_deckies (engine-mirrored on CLI
# *and* web deploys), NOT from config.deckies carried by the
# State["deployment"] key. A CLI/seed-established fleet never lands in
# that key, so the additive collision guard ran blind and the reconciler
# wiped the fleet — root cause of BUG-2. fleet_deckies is the store the
# source-of-truth model (fleet/reconciler.py) names as the API's view.
# See development/ADR-001-FLEET-SOURCE-OF-TRUTH.md.
existing_rows = await repo.list_fleet_deckies()
existing_deckies = [
DeckyConfig(**r["decky_config"])
for r in existing_rows
if r.get("decky_config")
]
reserved_ips: set[str] | None = ( reserved_ips: set[str] | None = (
{d.ip for d in existing_deckies if d.ip} {d.ip for d in existing_deckies if d.ip}
if not req.replace_fleet and existing_deckies if not req.replace_fleet and existing_deckies
@@ -192,6 +237,11 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
), ),
} }
await repo.set_state("deployment", new_state_payload) await repo.set_state("deployment", new_state_payload)
# Commit the intended fleet to fleet_deckies — the store the deploy guard
# and get_deckies() now read. set_state("deployment") above is retained
# for the mutate handlers / mutator engine that still coordinate through
# that key (their consolidation is tracked in the ADR, open question 7).
await _commit_fleet_to_db(config.deckies, replace_fleet=req.replace_fleet)
# Lifecycle rows track THIS call's deployments only. In additive mode # Lifecycle rows track THIS call's deployments only. In additive mode
# the existing deckies are already running and don't get a new # the existing deckies are already running and don't get a new

View File

@@ -0,0 +1,177 @@
# ADR-001 — Fleet Source of Truth
- **Status:** PROPOSED (discussion — not yet accepted)
- **Date:** 2026-06-12
- **Context owner:** ANTI
- **Drives fix for:** BUG-2 (destructive fleet-replace / silent wipe), see `QA_REPORT.md`
---
## 1. Context
DECNET currently keeps the deployed-fleet inventory in **two unsynchronized stores**:
| Store | Read by | Written by |
|-------|---------|------------|
| `decnet-state.json` file (`load_state()`) | `repo.get_deckies()` → the UI fleet view, collision pre-checks | CLI/engine path (`engine.deployer.save_state`), `decnet status`, sniffer, collector |
| DB `State` table, key `"deployment"` (`repo.get_state`/`set_state`) | the web deploy handler's `existing_deckies` snapshot | **only** the web deploy handler |
The web is a **non-dependency**: the same deploys can be driven entirely from the CLI, and CLI state lives in `decnet-state.json`. Because the two stores never reconcile, a fleet established via CLI/seed is invisible to the web deploy handler's collision guard.
### BUG-2 failure chain (source-traced)
1. CLI/seed establishes a fleet → written to `decnet-state.json`, **never** to DB `"deployment"`.
2. UI reads `get_deckies()` (JSON) → shows decky-02/03 correctly.
3. Wizard POSTs a new decky-04 with `replace_fleet=false`.
4. Handler reads `existing_deckies` from `repo.get_state("deployment")` → **None** → `existing_deckies = []`.
5. Collision guard compares against `[]` → no conflict → `config.deckies = [] + [decky-04]`.
6. `run_deploy` → `LocalDeployStrategy` → `engine.deployer.deploy(config)`:
- `write_compose(config, COMPOSE_FILE)` writes a compose file containing **only decky-04** (`deployer.py:681`).
- `_compose("down", "--remove-orphans", …)` (`deployer.py:708`) tears down the whole compose project, then `up` brings back only decky-04.
- `_mirror_fleet_teardown_to_db` drops the survivors' rows.
7. Result: fleet silently wiped to one decky. HTTP 202. No warning.
**Key trap:** the destructive call is `deployer.py:708` (`down --remove-orphans` against a compose file rewritten from `config.deckies`). Any source-of-truth fix that does not also guarantee `config.deckies` is the **complete** desired fleet before `write_compose` leaves BUG-2 alive.
---
## 2. What the UI actually consumes
`DeckyConfig` (`decnet/models.py:87`) full field set:
```
name, ip, services[], distro, base_image, build_base, hostname,
archetype, service_config{}, nmap_os, mutate_interval, last_mutated,
last_login_attempt, host_uuid
```
Frontend `Decky` type (`DeckyFleet/types.ts`) + what is **rendered/edited**:
| Field | Displayed? | Where |
|-------|-----------|-------|
| name, ip, services | yes | DeckyCard / InspectPanel |
| hostname, distro, archetype | yes | DeckyInspectPanel:77-79 |
| mutate_interval, last_mutated | yes | DeckyInspectPanel:80-81 |
| **service_config** | **yes — EDITED** | DeckyCard:322 (per-service config editor `currentConfig`) |
| base_image, build_base, nmap_os, last_login_attempt | no | — |
**Conclusion:** `service_config` is not just stored — it is rendered and **edited** in the UI. A "minimal scalar labels" scheme (name/ip/services only) would amputate editable state. Fidelity requires carrying the full `DeckyConfig`.
---
## 3. Options
### Option A — API reads only the DB; ignore `decnet-state.json` (web side)
Align `get_deckies()` and the deploy handler both on DB `"deployment"`. The web becomes a self-contained plane on the DB; CLI stays on the JSON file. The two planes are explicitly **non-interoperable**.
- **Pros:** smallest change; closes the desync *within the web plane*.
- **Cons:** ANTI's own verdict — "honestly the incorrect way of doing things." Two planes that can't see each other is a design smell, not a fix. A CLI-seeded fleet is still invisible to the web (and vice-versa); the wizard would still drive a reconciler that tears down CLI containers it can't see. Does **not** fix the cross-plane wipe, only the intra-web one.
### Option B — Docker container labels as source of truth (ANTI's proposal)
Stamp every DECNET container with provenance + identity labels; reconstruct the fleet by querying Docker. `decnet-state.json` degrades to a CLI-side convenience cache, no longer authoritative.
Proposed labels:
```
com.decnet.host = "true" # selector for "this is a DECNET decky"
com.decnet.deploy_type = "api" | "cli" # provenance, NOT a partition
com.decnet.service = "<service>" # or the broader identity
com.decnet.config = "<DeckyConfig JSON>" # REQUIRED to preserve service_config fidelity (see §2)
```
Fleet read becomes `docker ps --filter label=com.decnet.host=true` (+ `-a` for stopped), then deserialize `com.decnet.config`.
- **Pros:**
- **One source of truth = reality.** The collision guard and the reconciler read the SAME state, so BUG-2 cannot recur.
- Survives a DECNET process restart (Docker keeps running; labels persist on the real object).
- `deploy_type` makes the "two planes" distinction unnecessary — one fleet, labeled by origin. The guard queries ALL `com.decnet.host=true` regardless of origin, so it can never blind-wipe a CLI decky.
- This is the orchestrator-standard pattern (label the real object, reconcile against it).
- **Cons / constraints:**
- **Swarm.** The master cannot `docker ps` a remote worker. Remote deckies STILL need a registry → keep `decky_shards` (DB, heartbeat-driven). Honest model is **hybrid**: local truth = labels, remote truth = `decky_shards`.
- **Fleet-global config** (`interface, subnet, gateway, ipvlan, mutate_interval, log_file, compose_path`) is not per-container. Proposed home: **labels on the macvlan/ipvlan network object** (exactly one, DECNET-owned, correct scope). NOT replicated onto every container.
- **Label payload.** Preserving `service_config` fidelity forces a `com.decnet.config` JSON blob. Works (label values are generous) but it is config-in-label-land, with its own serialization discipline.
- **Performance.** `/deckies` is UI-polled and load-tested. Querying Docker on every read is heavier than a file/DB read. Mitigation: the existing 5s TTL cache (`api_get_deckies.py:_DECKIES_TTL`) extends naturally over the Docker query.
- **Does NOT by itself fix `deployer.py:708`.** Labels give the DATA to build the COMPLETE config (live + new) before `write_compose`; the merge must actually be done. Labels make the correct merge possible; they don't perform it.
### Option C — Single DB store as canonical (both web and CLI write DB)
Make the CLI write the DB `"deployment"` key too; retire `decnet-state.json` as authority. One store, but it's bookkeeping, not reality — can still drift from actual containers on crash/manual `docker rm`.
- **Pros:** single store; no Docker-query perf cost; swarm-friendly (DB is already the remote registry).
- **Cons:** reintroduces the "trust the ledger, not reality" fragility that Option B specifically escapes; CLI now hard-depends on the DB being reachable, eroding the web-is-a-non-dependency property.
---
## 4. Recommendation (for discussion)
**Option B (labels), accepted as a hybrid:** local fleet truth = Docker labels; remote fleet truth = `decky_shards` (DB); fleet-global config = network-object labels; `decnet-state.json` demoted to CLI convenience cache.
Mandatory companion change regardless of option chosen: **build the complete desired `config.deckies` (surviving live fleet + new submissions) before `write_compose`/`deployer.py:708`**, so `down --remove-orphans` + `up` is a no-op on survivors. This is the actual teardown fix; the source-of-truth choice only determines *where the survivor list is read from*.
---
## 5. Open questions (resolve before cutting code)
1. **`com.decnet.config` blob vs. exploded scalar labels** — do we accept one JSON label for fidelity, or split into N labels and reconstruct? (Fidelity for `service_config` pushes toward the blob.)
2. **Global config home** — network-object labels confirmed as the home, or a single sentinel "fleet" container/label set?
3. **Swarm boundary** — is the local-labels / remote-`decky_shards` split acceptable, or do we want labels mirrored back to the master via heartbeat for a uniform read path?
4. **Stopped/failed containers** — does `-a` (include stopped) count toward the fleet for collision purposes, and how do we represent non-running status the JSON file never tracked?
5. **Migration** — first label-aware deploy after upgrade: how do we adopt already-running unlabeled containers (relabel in place vs. require one redeploy)?
6. **`decnet-state.json` final role** — pure CLI cache, or removed entirely with CLI also reading labels?
---
## 6. Affected files (for whichever option lands)
- `decnet/web/router/fleet/api_deploy_deckies.py` — `existing_deckies` snapshot (lines 48, 84), collision guard (124-145), `set_state("deployment")` (194)
- `decnet/web/router/fleet/api_get_deckies.py` — `get_deckies` read path + TTL cache
- `decnet/web/db/sqlmodel_repo/__init__.py:174` — `get_deckies()` (currently `load_state()`)
- `decnet/engine/deployer.py:681` (`write_compose`), `:708` (`down --remove-orphans`), `:571`/`:623` (`_mirror_fleet_*`)
- `decnet/config.py` — `save_state`/`load_state`, `STATE_FILE`
- `decnet/lifecycle/runner.py` / `strategies.py` — `LocalDeployStrategy` → `deployer.deploy`
- `decnet/models.py:87` — `DeckyConfig` (label serialization surface)
---
## 7. CORRECTION (source-traced 2026-06-12) — the store topology is wider than §1 said
§1's claim that DB `State["deployment"]` is *"written only by the web deploy handler"* is **WRONG**. A grep for its readers/writers shows it is the shared coordination store for the **entire web + mutator plane**:
| Site | Op |
|------|----|
| `api_deploy_deckies.py:48,194` | read + write |
| `api_mutate_decky.py:55,76` | read + write |
| `api_mutate_interval.py:32,45` | read + write |
| `swarm_mgmt/api_list_deckies.py:28` | read |
| `mutator/engine.py:84,126,189,413` | read + write (autonomous mutator) |
Consequences:
- A one-line "deploy handler reads `load_state()`" swap makes deploy **diverge from its own plane** (mutate handlers + the background mutator still read the DB key). Lateral move, not a fix. **Empirically confirmed:** that edit broke 4/5 tests in `tests/api/fleet/test_deploy_additive.py` (the survivor was `replace_fleet=True`, the only case that doesn't read the prior fleet), because under `DECNET_CONTRACT_TEST` the deploy task is skipped so `save_state` never writes the JSON, and the handler couldn't see its own prior `set_state` write. Read-one-store / write-another is self-inconsistent.
- Pointing `get_deckies()` at the DB key **also fails to fix BUG-2**: a CLI-seeded fleet isn't in `State["deployment"]` either (CLI writes JSON + `fleet_deckies`), so the reconcile-against-incomplete-inventory wipe survives.
### The model the codebase ALREADY documents (`fleet/reconciler.py:1-29`)
```
1. decnet-state.json — canonical for offline / no-API consumers (CLI, status, sniffer, collector)
2. fleet_deckies table — "what the orchestrator, web dashboard, and REST API see"
3. docker inspect — actual per-container runtime state
Resolution: JSON-only → INSERT; DB-only(this host) → DELETE; both → state := docker-aggregated.
```
Two facts this hands us:
1. **The API was DESIGNED to read `fleet_deckies`** — the engine-mirrored table written on *every* deploy/teardown regardless of origin (`deployer.py:571 _mirror_fleet_deploy_to_db`, `:623` teardown). The live deploy/collision-guard code reading `State["deployment"]`, and `get_deckies()` reading the JSON file, are both **drift from the documented design**. `fleet_deckies` is the cross-plane store that *does* contain a CLI-seeded fleet.
2. **Docker is already the ultimate authority** — the reconciler converges JSON and DB *to docker-aggregated state*. ANTI's label proposal (Option B) is not a new paradigm; it promotes docker from reconciler-tiebreaker to primary read path.
### Revised recommendation
Two viable directions, both grounded in the existing design rather than a new store:
- **B (labels / docker-primary)** — the ADR's Option B, now understood as *promoting* the reconciler's existing docker-authoritative tiebreaker to the primary fleet read. Strongest long-term; same swarm caveat (remote = `decky_shards`/`fleet_deckies`, master can't `docker ps` workers).
- **D (converge on `fleet_deckies` now)** — make the deploy collision-guard AND `get_deckies()` read `fleet_deckies` (`list_fleet_deckies` / `list_running_fleet_deckies`), the store the design already names as the API's view. Smaller than relabelling; immediately closes the CLI-invisible-to-web gap because `fleet_deckies` is engine-mirrored on CLI deploys too. The mutate handlers + mutator engine reading `State["deployment"]` become the next consolidation target.
**Unchanged hard constraint:** whichever store wins, the handler must still build the COMPLETE desired `config.deckies` (survivors + new) before `write_compose`/`deployer.py:708`. The store choice only decides where "survivors" is read from.
### Open question added to §5
7. **`State["deployment"]` vs `fleet_deckies`** — do we converge the whole web+mutator plane onto `fleet_deckies` (Option D), or go straight to docker-primary (Option B) and let `fleet_deckies` be the swarm/remote registry? The mutator engine (`mutator/engine.py`) is the heaviest consumer of `State["deployment"]` and must move in lockstep.

View File

@@ -2,6 +2,8 @@
import pytest import pytest
from unittest.mock import patch from unittest.mock import patch
from decnet.config import DeckyConfig
from decnet.web.db.models import LOCAL_HOST_SENTINEL
from decnet.web.dependencies import repo from decnet.web.dependencies import repo
@@ -15,70 +17,93 @@ def contract_test_mode(monkeypatch):
def mock_network(): def mock_network():
"""Mock network detection so deploy doesn't call `ip addr show`.""" """Mock network detection so deploy doesn't call `ip addr show`."""
with patch("decnet.web.router.fleet.api_deploy_deckies.get_host_ip", return_value="192.168.1.100"): with patch("decnet.web.router.fleet.api_deploy_deckies.get_host_ip", return_value="192.168.1.100"):
yield with patch("decnet.web.router.fleet.api_deploy_deckies.detect_interface", return_value="eth0"):
with patch("decnet.web.router.fleet.api_deploy_deckies.detect_subnet", return_value=("192.168.1.0/24", "192.168.1.1")):
yield
async def _clear_fleet() -> None:
    """Delete every row currently listed in fleet_deckies."""
    listed = await repo.list_fleet_deckies()
    for entry in listed:
        # Rows without a host_uuid are addressed via the local-host sentinel.
        owner = entry.get("host_uuid") or LOCAL_HOST_SENTINEL
        await repo.delete_fleet_decky(host_uuid=owner, name=entry["name"])
async def _seed_fleet(name: str, ip: str) -> None:
    """Upsert one ssh-only debian decky into fleet_deckies as ``running``.

    The row's ``decky_config`` is a full ``DeckyConfig`` JSON dump, keyed to
    the local-host sentinel.
    """
    decky = DeckyConfig(
        name=name,
        ip=ip,
        services=["ssh"],
        distro="debian",
        base_image="debian",
        hostname=name,
    )
    record = {
        "host_uuid": LOCAL_HOST_SENTINEL,
        "name": name,
        "services": ["ssh"],
        "decky_config": decky.model_dump(mode="json"),
        "decky_ip": ip,
        "state": "running",
    }
    await repo.upsert_fleet_decky(record)
@pytest.fixture(autouse=True)
async def _isolate_fleet():
    """Empty fleet_deckies before and after each test in this module."""
    await _clear_fleet()
    yield
    await _clear_fleet()
@pytest.mark.anyio @pytest.mark.anyio
async def test_deploy_respects_limit(client, auth_token, mock_state_file): async def test_deploy_respects_limit(client, auth_token):
"""Deploy should reject if the *submitted* INI exceeds the limit. """The limit counts the WHOLE resulting fleet — existing (from
The INI is the source of truth — prior state is fully replaced — so the fleet_deckies) plus the submitted INI — not the INI alone. One existing
check runs on the new decky count alone.""" decky + one submitted, against a limit of 1, must be rejected."""
await repo.set_state("config_limits", {"deployment_limit": 1}) await repo.set_state("config_limits", {"deployment_limit": 1})
await repo.set_state("deployment", mock_state_file) await _seed_fleet("decky-existing", "192.168.1.10")
ini = """[decky-a] ini = "[decky-new]\nservices = ssh\n"
services = ssh
[decky-b]
services = ssh
"""
resp = await client.post( resp = await client.post(
"/api/v1/deckies/deploy", "/api/v1/deckies/deploy",
json={"ini_content": ini}, json={"ini_content": ini},
headers={"Authorization": f"Bearer {auth_token}"}, headers={"Authorization": f"Bearer {auth_token}"},
) )
# 2 new deckies > limit of 1 # existing(1) + new(1) = 2 > limit 1
assert resp.status_code == 409 assert resp.status_code == 409
assert "limit" in resp.json()["detail"].lower() assert "limit" in resp.json()["detail"].lower()
@pytest.mark.anyio @pytest.mark.anyio
async def test_deploy_replaces_prior_state(client, auth_token, mock_state_file): async def test_deploy_replaces_prior_state(client, auth_token):
"""Submitting an INI with 1 decky must not silently re-include the 2 """replace_fleet=True drops the prior fleet rather than silently
deckies from prior state (that caused the 'Address already in use' re-including it (the 'Address already in use' regression came from stale
regression when stale decky2/decky3 redeployed on stale IPs).""" deckies redeploying on stale IPs). After replace, the committed fleet is
exactly the submitted INI."""
await repo.set_state("config_limits", {"deployment_limit": 10}) await repo.set_state("config_limits", {"deployment_limit": 10})
await repo.set_state("deployment", mock_state_file) await _seed_fleet("test-decky-1", "192.168.1.10")
await _seed_fleet("test-decky-2", "192.168.1.11")
ini = """[only-decky] ini = "[only-decky]\nservices = ssh\n"
services = ssh
"""
resp = await client.post( resp = await client.post(
"/api/v1/deckies/deploy", "/api/v1/deckies/deploy",
json={"ini_content": ini}, json={"ini_content": ini, "replace_fleet": True},
headers={"Authorization": f"Bearer {auth_token}"}, headers={"Authorization": f"Bearer {auth_token}"},
) )
assert resp.status_code == 202 assert resp.status_code == 202, resp.text
persisted = await repo.get_state("deployment") names = {d["name"] for d in await repo.get_deckies()}
names = [d["name"] for d in persisted["config"]["deckies"]] assert names == {"only-decky"}
assert names == ["only-decky"]
@pytest.mark.anyio @pytest.mark.anyio
async def test_deploy_within_limit(client, auth_token, mock_state_file): async def test_deploy_within_limit(client, auth_token):
"""Deploy should succeed when within limit.""" """Deploy should succeed when the resulting fleet is within limit."""
await repo.set_state("config_limits", {"deployment_limit": 100}) await repo.set_state("config_limits", {"deployment_limit": 100})
await repo.set_state("deployment", mock_state_file) await _seed_fleet("decky-existing", "192.168.1.10")
ini = """[decky-new] ini = "[decky-new]\nservices = ssh\n"
services = ssh
"""
resp = await client.post( resp = await client.post(
"/api/v1/deckies/deploy", "/api/v1/deckies/deploy",
json={"ini_content": ini}, json={"ini_content": ini},
headers={"Authorization": f"Bearer {auth_token}"}, headers={"Authorization": f"Bearer {auth_token}"},
) )
# Should not fail due to limit
if resp.status_code == 409: if resp.status_code == 409:
assert "limit" not in resp.json()["detail"].lower() assert "limit" not in resp.json()["detail"].lower()
else: else:

View File

@@ -211,6 +211,47 @@ def mock_state_file(patch_state_file: Path):
patch_state_file.write_text(json.dumps(_test_state)) patch_state_file.write_text(json.dumps(_test_state))
yield _test_state yield _test_state
@pytest.fixture
async def mock_fleet_deckies():
    """Populate fleet_deckies with two deckies for the duration of a test.

    fleet_deckies is the store get_deckies() reads under the Option-D
    source-of-truth model (development/ADR-001-...md). The seeded data
    mirrors what mock_state_file used to put in decnet-state.json. The table
    is cleared before seeding and again on teardown.
    """
    from decnet.config import DeckyConfig
    from decnet.web.db.models import LOCAL_HOST_SENTINEL
    from decnet.web.dependencies import repo

    async def _wipe() -> None:
        listed = await repo.list_fleet_deckies()
        for entry in listed:
            owner = entry.get("host_uuid") or LOCAL_HOST_SENTINEL
            await repo.delete_fleet_decky(host_uuid=owner, name=entry["name"])

    seeds = [
        {
            "name": "test-decky-1",
            "ip": "192.168.1.10",
            "services": ["ssh"],
            "distro": "debian",
            "hostname": "test-host-1",
            "service_config": {"ssh": {"banner": "SSH-2.0-OpenSSH_8.9"}},
            "archetype": "deaddeck",
        },
        {
            "name": "test-decky-2",
            "ip": "192.168.1.11",
            "services": ["http"],
            "distro": "ubuntu",
            "hostname": "test-host-2",
            "service_config": {},
            "archetype": None,
        },
    ]

    await _wipe()
    for seed in seeds:
        # base_image deliberately reuses the distro string.
        decky = DeckyConfig(base_image=seed["distro"], **seed)
        await repo.upsert_fleet_decky({
            "host_uuid": LOCAL_HOST_SENTINEL,
            "name": seed["name"],
            "services": seed["services"],
            "decky_config": decky.model_dump(mode="json"),
            "decky_ip": seed["ip"],
            "state": "running",
        })
    yield
    await _wipe()
# Share fuzz settings across API tests # Share fuzz settings across API tests
# FUZZ_EXAMPLES: keep low for dev speed; bump via HYPOTHESIS_MAX_EXAMPLES env var in CI # FUZZ_EXAMPLES: keep low for dev speed; bump via HYPOTHESIS_MAX_EXAMPLES env var in CI
_FUZZ_EXAMPLES = int(_os.environ.get("HYPOTHESIS_MAX_EXAMPLES", "10")) _FUZZ_EXAMPLES = int(_os.environ.get("HYPOTHESIS_MAX_EXAMPLES", "10"))

View File

@@ -5,6 +5,13 @@ Default behaviour (replace_fleet=False) appends the INI to the existing
fleet so the wizard's "deploy one more decky" submit no longer wipes fleet so the wizard's "deploy one more decky" submit no longer wipes
prior deckies. replace_fleet=True preserves the historical prior deckies. replace_fleet=True preserves the historical
set-desired-state semantics for CLI / declarative callers. set-desired-state semantics for CLI / declarative callers.
The existing fleet is read from fleet_deckies — the engine-mirrored table
written on every deploy/teardown (CLI or web), per the source-of-truth
model in fleet/reconciler.py. These tests seed fleet_deckies directly,
which also models the BUG-2 scenario: a fleet established out of band
(CLI/seed) that the web deploy guard must see and append to rather than
wipe. See development/ADR-001-FLEET-SOURCE-OF-TRUTH.md.
""" """
from __future__ import annotations from __future__ import annotations
@@ -12,6 +19,8 @@ from unittest.mock import patch
import pytest import pytest
from decnet.config import DeckyConfig
from decnet.web.db.models import LOCAL_HOST_SENTINEL
from decnet.web.dependencies import repo from decnet.web.dependencies import repo
@@ -28,96 +37,111 @@ def mock_network():
yield yield
async def _clear_fleet() -> None:
    """Remove all rows from fleet_deckies so a test starts with no fleet."""
    listed = await repo.list_fleet_deckies()
    for entry in listed:
        # Rows without a host_uuid are addressed via the local-host sentinel.
        owner = entry.get("host_uuid") or LOCAL_HOST_SENTINEL
        await repo.delete_fleet_decky(host_uuid=owner, name=entry["name"])
async def _seed_fleet(name: str, *, ip: str = "192.168.1.10", services=("ssh",)) -> None:
    """Upsert one decky row into fleet_deckies in the ``running`` state.

    This stands in for the row the engine mirror writes on a CLI/web deploy.
    ``decky_config`` carries a complete ``DeckyConfig`` dump so the deploy
    guard can rebuild the config from the row.
    """
    decky = DeckyConfig(
        name=name,
        ip=ip,
        services=list(services),
        distro="debian",
        base_image="debian:bookworm-slim",
        hostname=name,
    )
    record = {
        "host_uuid": LOCAL_HOST_SENTINEL,
        "name": name,
        "services": list(services),
        "decky_config": decky.model_dump(mode="json"),
        "decky_ip": ip,
        "state": "running",
    }
    await repo.upsert_fleet_decky(record)
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
async def _isolate_state(): async def _isolate_state():
for row in await repo.list_swarm_hosts(): for row in await repo.list_swarm_hosts():
await repo.delete_swarm_host(row["uuid"]) await repo.delete_swarm_host(row["uuid"])
await repo.set_state("deployment", None) await repo.set_state("deployment", None)
await _clear_fleet()
yield yield
await repo.set_state("deployment", None) await repo.set_state("deployment", None)
await _clear_fleet()
@pytest.mark.anyio @pytest.mark.anyio
async def test_additive_default_appends_to_existing_fleet(client, auth_token, monkeypatch): async def test_additive_onto_existing_fleet_appends_not_wipes(client, auth_token, monkeypatch):
"""Two sequential deploys with replace_fleet unset → both deckies in state.""" """BUG-2 regression: an additive web deploy onto a fleet established out
of band (CLI/seed → fleet_deckies) appends rather than wiping it.
Previously the guard read State["deployment"] (empty for a CLI-seeded
fleet), so existing_deckies was [] and the reconciler tore the running
fleet down to the single submitted decky."""
monkeypatch.setenv("DECNET_MODE", "master") monkeypatch.setenv("DECNET_MODE", "master")
await _seed_fleet("decky-01", ip="192.168.1.10")
r1 = await client.post( r = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-01]\nservices = ssh\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r1.status_code == 202, r1.text
r2 = await client.post(
"/api/v1/deckies/deploy", "/api/v1/deckies/deploy",
json={"ini_content": "[decky-02]\nservices = http\n"}, json={"ini_content": "[decky-02]\nservices = http\n"},
headers={"Authorization": f"Bearer {auth_token}"}, headers={"Authorization": f"Bearer {auth_token}"},
) )
assert r2.status_code == 202, r2.text assert r.status_code == 202, r.text
committed = await repo.get_state("deployment") names = {d["name"] for d in await repo.get_deckies()}
assert committed is not None
names = {d["name"] for d in committed["config"]["deckies"]}
assert names == {"decky-01", "decky-02"} assert names == {"decky-01", "decky-02"}
@pytest.mark.anyio @pytest.mark.anyio
async def test_additive_name_collision_returns_409(client, auth_token, monkeypatch): async def test_additive_name_collision_returns_409(client, auth_token, monkeypatch):
"""Re-submitting an existing decky name without replace_fleet → 409.""" """Submitting a decky whose name already exists in the fleet without
replace_fleet → 409."""
monkeypatch.setenv("DECNET_MODE", "master") monkeypatch.setenv("DECNET_MODE", "master")
await _seed_fleet("decky-01")
r1 = await client.post( r = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-01]\nservices = ssh\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r1.status_code == 202, r1.text
r2 = await client.post(
"/api/v1/deckies/deploy", "/api/v1/deckies/deploy",
json={"ini_content": "[decky-01]\nservices = http\n"}, json={"ini_content": "[decky-01]\nservices = http\n"},
headers={"Authorization": f"Bearer {auth_token}"}, headers={"Authorization": f"Bearer {auth_token}"},
) )
assert r2.status_code == 409, r2.text assert r.status_code == 409, r.text
assert "decky-01" in r2.json()["detail"] assert "decky-01" in r.json()["detail"]
assert "replace_fleet" in r2.json()["detail"] assert "replace_fleet" in r.json()["detail"]
@pytest.mark.anyio @pytest.mark.anyio
async def test_additive_ip_collision_returns_409(client, auth_token, monkeypatch): async def test_additive_ip_collision_returns_409(client, auth_token, monkeypatch):
"""A new decky pinned to an IP already in use → 409 with the IP.""" """A new decky pinned to an IP already in use by the existing fleet → 409
with the IP."""
monkeypatch.setenv("DECNET_MODE", "master") monkeypatch.setenv("DECNET_MODE", "master")
await _seed_fleet("decky-01", ip="192.168.1.50")
r1 = await client.post( r = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-01]\nservices = ssh\nip = 192.168.1.50\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r1.status_code == 202, r1.text
r2 = await client.post(
"/api/v1/deckies/deploy", "/api/v1/deckies/deploy",
json={"ini_content": "[decky-02]\nservices = http\nip = 192.168.1.50\n"}, json={"ini_content": "[decky-02]\nservices = http\nip = 192.168.1.50\n"},
headers={"Authorization": f"Bearer {auth_token}"}, headers={"Authorization": f"Bearer {auth_token}"},
) )
assert r2.status_code == 409, r2.text assert r.status_code == 409, r.text
assert "192.168.1.50" in r2.json()["detail"] assert "192.168.1.50" in r.json()["detail"]
@pytest.mark.anyio @pytest.mark.anyio
async def test_replace_fleet_true_overwrites_existing(client, auth_token, monkeypatch): async def test_replace_fleet_true_overwrites_existing(client, auth_token, monkeypatch):
"""replace_fleet=True preserves the historical full-replace semantics.""" """replace_fleet=True preserves the historical full-replace semantics:
the existing fleet is dropped and the committed inventory is exactly the
submitted INI."""
monkeypatch.setenv("DECNET_MODE", "master") monkeypatch.setenv("DECNET_MODE", "master")
await _seed_fleet("decky-01")
r1 = await client.post( r = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-01]\nservices = ssh\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r1.status_code == 202, r1.text
r2 = await client.post(
"/api/v1/deckies/deploy", "/api/v1/deckies/deploy",
json={ json={
"ini_content": "[decky-02]\nservices = http\n", "ini_content": "[decky-02]\nservices = http\n",
@@ -125,11 +149,9 @@ async def test_replace_fleet_true_overwrites_existing(client, auth_token, monkey
}, },
headers={"Authorization": f"Bearer {auth_token}"}, headers={"Authorization": f"Bearer {auth_token}"},
) )
assert r2.status_code == 202, r2.text assert r.status_code == 202, r.text
committed = await repo.get_state("deployment") names = {d["name"] for d in await repo.get_deckies()}
assert committed is not None
names = {d["name"] for d in committed["config"]["deckies"]}
assert names == {"decky-02"} assert names == {"decky-02"}
@@ -139,25 +161,16 @@ async def test_additive_lifecycle_ids_scoped_to_new_deckies(client, auth_token,
the caller submitted, not carryover. Operators polling the caller submitted, not carryover. Operators polling
/deckies/lifecycle?ids=... see exactly what this call deployed.""" /deckies/lifecycle?ids=... see exactly what this call deployed."""
monkeypatch.setenv("DECNET_MODE", "master") monkeypatch.setenv("DECNET_MODE", "master")
await _seed_fleet("decky-01", ip="192.168.1.10")
await _seed_fleet("decky-02", ip="192.168.1.11")
r1 = await client.post( r = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-01]\nservices = ssh\n[decky-02]\nservices = http\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r1.status_code == 202, r1.text
assert len(r1.json()["lifecycle_ids"]) == 2
r2 = await client.post(
"/api/v1/deckies/deploy", "/api/v1/deckies/deploy",
json={"ini_content": "[decky-03]\nservices = ssh\n"}, json={"ini_content": "[decky-03]\nservices = ssh\n"},
headers={"Authorization": f"Bearer {auth_token}"}, headers={"Authorization": f"Bearer {auth_token}"},
) )
assert r2.status_code == 202, r2.text assert r.status_code == 202, r.text
body2 = r2.json() assert len(r.json()["lifecycle_ids"]) == 1
assert len(body2["lifecycle_ids"]) == 1
committed = await repo.get_state("deployment") names = {d["name"] for d in await repo.get_deckies()}
assert committed is not None
names = {d["name"] for d in committed["config"]["deckies"]}
assert names == {"decky-01", "decky-02", "decky-03"} assert names == {"decky-01", "decky-02", "decky-03"}

View File

@@ -5,7 +5,7 @@ from hypothesis import given, settings, strategies as st
from ..conftest import _FUZZ_SETTINGS from ..conftest import _FUZZ_SETTINGS
@pytest.mark.anyio @pytest.mark.anyio
async def test_get_deckies_endpoint(mock_state_file, client: httpx.AsyncClient, auth_token: str): async def test_get_deckies_endpoint(mock_fleet_deckies, client: httpx.AsyncClient, auth_token: str):
_response = await client.get("/api/v1/deckies", headers={"Authorization": f"Bearer {auth_token}"}) _response = await client.get("/api/v1/deckies", headers={"Authorization": f"Bearer {auth_token}"})
assert _response.status_code == 200 assert _response.status_code == 200
_data = _response.json() _data = _response.json()