fix(fleet): read existing fleet from fleet_deckies, not State["deployment"] (BUG-2)

The web deploy collision-guard read the existing fleet from the DB
State["deployment"] key, while the UI/get_deckies() read decnet-state.json.
A fleet established via CLI/seed is written to decnet-state.json and
fleet_deckies but never to State["deployment"], the only store the guard
consulted, so existing_deckies was empty, the additive guard ran blind, and
the reconciler tore the running fleet down to the single submitted decky
(BUG-2: silent fleet wipe, HTTP 202, no warning).

Converge both reads on fleet_deckies — the engine-mirrored table written on
every deploy/teardown (CLI and web), which fleet/reconciler.py already
documents as the store the orchestrator, dashboard, and REST API see. Each
row's decky_config column is a full DeckyConfig dump, so it rehydrates
losslessly into the collision-guard input. The handler also commits the
intended fleet to fleet_deckies synchronously so rapid sequential deploys
read a current fleet and the dashboard observes the new shape immediately.

State["deployment"] is retained for now — the mutate handlers and the
mutator engine still coordinate through it; consolidating them is tracked
in development/ADR-001-FLEET-SOURCE-OF-TRUTH.md (open question 7).

Tests seed fleet_deckies directly (also modelling the CLI-seeded scenario)
rather than chaining real deploys through the skipped contract-test path.
This commit is contained in:
2026-06-12 23:52:20 -04:00
parent 408810b3e2
commit ab1151ee7f
7 changed files with 415 additions and 105 deletions

View File

@@ -12,7 +12,6 @@ backends. Dialect-specific behavior lives in subclasses:
"""
from __future__ import annotations
import asyncio
import json
import os
@@ -23,7 +22,6 @@ from typing import Any, Optional, List, cast
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession, async_sessionmaker
from decnet.config import load_state
from decnet.env import DECNET_ADMIN_USER, DECNET_ADMIN_PASSWORD
from decnet.web.auth import get_password_hash
from decnet.web.db.repository import BaseRepository
@@ -172,8 +170,14 @@ class SQLModelRepository(
return None
async def get_deckies(self) -> List[dict]:
_state = await asyncio.to_thread(load_state)
return [_d.model_dump() for _d in _state[0].deckies] if _state else []
# The fleet inventory the UI/API sees is fleet_deckies — the
# engine-mirrored table written on EVERY deploy/teardown (CLI or web),
# per the source-of-truth model documented in fleet/reconciler.py.
# Each row's decky_config column is a full DeckyConfig.model_dump(
# mode="json"), so it rehydrates to the same shape load_state() used
# to return. See development/ADR-001-FLEET-SOURCE-OF-TRUTH.md.
rows = await self.list_fleet_deckies()
return [r["decky_config"] for r in rows if r.get("decky_config")]
# --------------------------------------------------------------- users

View File

@@ -8,7 +8,7 @@ from decnet.bus.factory import get_bus
from decnet.lifecycle.runner import run_deploy
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT
from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, DeckyConfig, _ROOT
from decnet.ini_loader import load_ini_from_string
from decnet.network import detect_interface, detect_subnet, get_host_ip
from decnet.web.dependencies import require_admin, repo
@@ -19,6 +19,39 @@ log = get_logger("api")
router = APIRouter()
async def _commit_fleet_to_db(deckies: list[DeckyConfig], *, replace_fleet: bool) -> None:
"""Synchronously reconcile ``fleet_deckies`` to *deckies*.
fleet_deckies is the source of truth the deploy guard now reads
(``existing_deckies``). Committing the intended shape here — before the
async deploy task's engine mirror runs — means rapid sequential web
deploys each read a current fleet (no self-wipe) and the dashboard
observes the new shape immediately. Mirrors the payload shape of
``engine.deployer._mirror_fleet_deploy_to_db``.
In replace mode, rows absent from *deckies* are deleted so the committed
inventory matches the desired set; the async reconciler/teardown mirror
converges the actual containers separately.
"""
from decnet.web.db.models import LOCAL_HOST_SENTINEL
keep = {(d.host_uuid or LOCAL_HOST_SENTINEL, d.name) for d in deckies}
if replace_fleet:
for row in await repo.list_fleet_deckies():
host = row.get("host_uuid") or LOCAL_HOST_SENTINEL
if (host, row.get("name")) not in keep:
await repo.delete_fleet_decky(host_uuid=host, name=row["name"])
for d in deckies:
await repo.upsert_fleet_decky({
"host_uuid": d.host_uuid or LOCAL_HOST_SENTINEL,
"name": d.name,
"services": list(d.services),
"decky_config": d.model_dump(mode="json"),
"decky_ip": d.ip,
"state": "running",
})
@router.post(
"/deckies/deploy",
tags=["Fleet Management"],
@@ -81,7 +114,19 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
# config below) so additive collision checks compare new against prior
# rather than against themselves. Existing IPs are passed into
# build_deckies_from_ini as reserved so auto-allocation skips them.
existing_deckies = list(config.deckies) if config is not None else []
# The existing fleet comes from fleet_deckies (engine-mirrored on CLI
# *and* web deploys), NOT from config.deckies carried by the
# State["deployment"] key. A CLI/seed-established fleet never lands in
# that key, so the additive collision guard ran blind and the reconciler
# wiped the fleet — root cause of BUG-2. fleet_deckies is the store the
# source-of-truth model (fleet/reconciler.py) names as the API's view.
# See development/ADR-001-FLEET-SOURCE-OF-TRUTH.md.
existing_rows = await repo.list_fleet_deckies()
existing_deckies = [
DeckyConfig(**r["decky_config"])
for r in existing_rows
if r.get("decky_config")
]
reserved_ips: set[str] | None = (
{d.ip for d in existing_deckies if d.ip}
if not req.replace_fleet and existing_deckies
@@ -192,6 +237,11 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
),
}
await repo.set_state("deployment", new_state_payload)
# Commit the intended fleet to fleet_deckies — the store the deploy guard
# and get_deckies() now read. set_state("deployment") above is retained
# for the mutate handlers / mutator engine that still coordinate through
# that key (their consolidation is tracked in the ADR, open question 7).
await _commit_fleet_to_db(config.deckies, replace_fleet=req.replace_fleet)
# Lifecycle rows track THIS call's deployments only. In additive mode
# the existing deckies are already running and don't get a new

View File

@@ -0,0 +1,177 @@
# ADR-001 — Fleet Source of Truth
- **Status:** PROPOSED (discussion — not yet accepted)
- **Date:** 2026-06-12
- **Context owner:** ANTI
- **Drives fix for:** BUG-2 (destructive fleet-replace / silent wipe), see `QA_REPORT.md`
---
## 1. Context
DECNET currently keeps the deployed-fleet inventory in **two unsynchronized stores**:
| Store | Read by | Written by |
|-------|---------|------------|
| `decnet-state.json` file (`load_state()`) | `repo.get_deckies()` → the UI fleet view, collision pre-checks | CLI/engine path (`engine.deployer.save_state`), `decnet status`, sniffer, collector |
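| DB `State` table, key `"deployment"` (`repo.get_state`/`set_state`) | the web deploy handler's `existing_deckies` snapshot | **only** the web deploy handler (incorrect — see the correction in §7: the mutate handlers and the mutator engine also read and write this key) |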
The web is a **non-dependency**: the same deploys can be driven entirely from the CLI, and CLI state lives in `decnet-state.json`. Because the two stores never reconcile, a fleet established via CLI/seed is invisible to the web deploy handler's collision guard.
### BUG-2 failure chain (source-traced)
1. CLI/seed establishes a fleet → written to `decnet-state.json`, **never** to DB `"deployment"`.
2. UI reads `get_deckies()` (JSON) → shows decky-02/03 correctly.
3. Wizard POSTs a new decky-04 with `replace_fleet=false`.
4. Handler reads `existing_deckies` from `repo.get_state("deployment")` → **None** → `existing_deckies = []`.
5. Collision guard compares against `[]` → no conflict → `config.deckies = [] + [decky-04]`.
6. `run_deploy` → `LocalDeployStrategy` → `engine.deployer.deploy(config)`:
- `write_compose(config, COMPOSE_FILE)` writes a compose file containing **only decky-04** (`deployer.py:681`).
- `_compose("down", "--remove-orphans", …)` (`deployer.py:708`) tears down the whole compose project, then `up` brings back only decky-04.
- `_mirror_fleet_teardown_to_db` drops the survivors' rows.
7. Result: fleet silently wiped to one decky. HTTP 202. No warning.
**Key trap:** the destructive call is `deployer.py:708` (`down --remove-orphans` against a compose file rewritten from `config.deckies`). Any source-of-truth fix that does not also guarantee `config.deckies` is the **complete** desired fleet before `write_compose` leaves BUG-2 alive.
---
## 2. What the UI actually consumes
`DeckyConfig` (`decnet/models.py:87`) full field set:
```
name, ip, services[], distro, base_image, build_base, hostname,
archetype, service_config{}, nmap_os, mutate_interval, last_mutated,
last_login_attempt, host_uuid
```
Frontend `Decky` type (`DeckyFleet/types.ts`) + what is **rendered/edited**:
| Field | Displayed? | Where |
|-------|-----------|-------|
| name, ip, services | yes | DeckyCard / InspectPanel |
| hostname, distro, archetype | yes | DeckyInspectPanel:77-79 |
| mutate_interval, last_mutated | yes | DeckyInspectPanel:80-81 |
| **service_config** | **yes — EDITED** | DeckyCard:322 (per-service config editor `currentConfig`) |
| base_image, build_base, nmap_os, last_login_attempt | no | — |
**Conclusion:** `service_config` is not just stored — it is rendered and **edited** in the UI. A "minimal scalar labels" scheme (name/ip/services only) would amputate editable state. Fidelity requires carrying the full `DeckyConfig`.
---
## 3. Options
### Option A — API reads only the DB; ignore `decnet-state.json` (web side)
Align `get_deckies()` and the deploy handler both on DB `"deployment"`. The web becomes a self-contained plane on the DB; CLI stays on the JSON file. The two planes are explicitly **non-interoperable**.
- **Pros:** smallest change; closes the desync *within the web plane*.
- **Cons:** ANTI's own verdict — "honestly the incorrect way of doing things." Two planes that can't see each other is a design smell, not a fix. A CLI-seeded fleet is still invisible to the web (and vice-versa); the wizard would still drive a reconciler that tears down CLI containers it can't see. Does **not** fix the cross-plane wipe, only the intra-web one.
### Option B — Docker container labels as source of truth (ANTI's proposal)
Stamp every DECNET container with provenance + identity labels; reconstruct the fleet by querying Docker. `decnet-state.json` degrades to a CLI-side convenience cache, no longer authoritative.
Proposed labels:
```
com.decnet.host = "true" # selector for "this is a DECNET decky"
com.decnet.deploy_type = "api" | "cli" # provenance, NOT a partition
com.decnet.service = "<service>" # or the broader identity
com.decnet.config = "<DeckyConfig JSON>" # REQUIRED to preserve service_config fidelity (see §2)
```
Fleet read becomes `docker ps --filter label=com.decnet.host=true` (+ `-a` for stopped), then deserialize `com.decnet.config`.
- **Pros:**
- **One source of truth = reality.** The collision guard and the reconciler read the SAME state, so BUG-2 cannot recur.
- Survives a DECNET process restart (Docker keeps running; labels persist on the real object).
- `deploy_type` makes the "two planes" distinction unnecessary — one fleet, labeled by origin. The guard queries ALL `com.decnet.host=true` regardless of origin, so it can never blind-wipe a CLI decky.
- This is the orchestrator-standard pattern (label the real object, reconcile against it).
- **Cons / constraints:**
- **Swarm.** The master cannot `docker ps` a remote worker. Remote deckies STILL need a registry → keep `decky_shards` (DB, heartbeat-driven). Honest model is **hybrid**: local truth = labels, remote truth = `decky_shards`.
- **Fleet-global config** (`interface, subnet, gateway, ipvlan, mutate_interval, log_file, compose_path`) is not per-container. Proposed home: **labels on the macvlan/ipvlan network object** (exactly one, DECNET-owned, correct scope). NOT replicated onto every container.
- **Label payload.** Preserving `service_config` fidelity forces a `com.decnet.config` JSON blob. Works (label values are generous) but it is config-in-label-land, with its own serialization discipline.
- **Performance.** `/deckies` is UI-polled and load-tested. Querying Docker on every read is heavier than a file/DB read. Mitigation: the existing 5s TTL cache (`api_get_deckies.py:_DECKIES_TTL`) extends naturally over the Docker query.
- **Does NOT by itself fix `deployer.py:708`.** Labels give the DATA to build the COMPLETE config (live + new) before `write_compose`; the merge must actually be done. Labels make the correct merge possible; they don't perform it.
### Option C — Single DB store as canonical (both web and CLI write DB)
Make the CLI write the DB `"deployment"` key too; retire `decnet-state.json` as authority. One store, but it's bookkeeping, not reality — can still drift from actual containers on crash/manual `docker rm`.
- **Pros:** single store; no Docker-query perf cost; swarm-friendly (DB is already the remote registry).
- **Cons:** reintroduces the "trust the ledger, not reality" fragility that Option B specifically escapes; CLI now hard-depends on the DB being reachable, eroding the web-is-a-non-dependency property.
---
## 4. Recommendation (for discussion)
**Option B (labels), accepted as a hybrid:** local fleet truth = Docker labels; remote fleet truth = `decky_shards` (DB); fleet-global config = network-object labels; `decnet-state.json` demoted to CLI convenience cache.
Mandatory companion change regardless of option chosen: **build the complete desired `config.deckies` (surviving live fleet + new submissions) before `write_compose`/`deployer.py:708`**, so `down --remove-orphans` + `up` is a no-op on survivors. This is the actual teardown fix; the source-of-truth choice only determines *where the survivor list is read from*.
---
## 5. Open questions (resolve before cutting code)
1. **`com.decnet.config` blob vs. exploded scalar labels** — do we accept one JSON label for fidelity, or split into N labels and reconstruct? (Fidelity for `service_config` pushes toward the blob.)
2. **Global config home** — network-object labels confirmed as the home, or a single sentinel "fleet" container/label set?
3. **Swarm boundary** — is the local-labels / remote-`decky_shards` split acceptable, or do we want labels mirrored back to the master via heartbeat for a uniform read path?
4. **Stopped/failed containers** — does `-a` (include stopped) count toward the fleet for collision purposes, and how do we represent non-running status the JSON file never tracked?
5. **Migration** — first label-aware deploy after upgrade: how do we adopt already-running unlabeled containers (relabel in place vs. require one redeploy)?
6. **`decnet-state.json` final role** — pure CLI cache, or removed entirely with CLI also reading labels?
---
## 6. Affected files (for whichever option lands)
- `decnet/web/router/fleet/api_deploy_deckies.py` — `existing_deckies` snapshot (lines 48, 84), collision guard (124-145), `set_state("deployment")` (194)
- `decnet/web/router/fleet/api_get_deckies.py` — `get_deckies` read path + TTL cache
- `decnet/web/db/sqlmodel_repo/__init__.py:174` — `get_deckies()` (currently `load_state()`)
- `decnet/engine/deployer.py:681` (`write_compose`), `:708` (`down --remove-orphans`), `:571`/`:623` (`_mirror_fleet_*`)
- `decnet/config.py` — `save_state`/`load_state`, `STATE_FILE`
- `decnet/lifecycle/runner.py` / `strategies.py` — `LocalDeployStrategy` → `deployer.deploy`
- `decnet/models.py:87` — `DeckyConfig` (label serialization surface)
---
## 7. CORRECTION (source-traced 2026-06-12) — the store topology is wider than §1 said
§1's claim that DB `State["deployment"]` is *"written only by the web deploy handler"* is **WRONG**. A grep for its readers/writers shows it is the shared coordination store for the **entire web + mutator plane**:
| Site | Op |
|------|----|
| `api_deploy_deckies.py:48,194` | read + write |
| `api_mutate_decky.py:55,76` | read + write |
| `api_mutate_interval.py:32,45` | read + write |
| `swarm_mgmt/api_list_deckies.py:28` | read |
| `mutator/engine.py:84,126,189,413` | read + write (autonomous mutator) |
Consequences:
- A one-line "deploy handler reads `load_state()`" swap makes deploy **diverge from its own plane** (mutate handlers + the background mutator still read the DB key). Lateral move, not a fix. **Empirically confirmed:** that edit broke 4/5 tests in `tests/api/fleet/test_deploy_additive.py` (the one test that still passed was the `replace_fleet=True` case, the only one that doesn't read the prior fleet), because under `DECNET_CONTRACT_TEST` the deploy task is skipped so `save_state` never writes the JSON, and the handler couldn't see its own prior `set_state` write. Read-one-store / write-another is self-inconsistent.
- Pointing `get_deckies()` at the DB key **also fails to fix BUG-2**: a CLI-seeded fleet isn't in `State["deployment"]` either (CLI writes JSON + `fleet_deckies`), so the reconcile-against-incomplete-inventory wipe survives.
### The model the codebase ALREADY documents (`fleet/reconciler.py:1-29`)
```
1. decnet-state.json — canonical for offline / no-API consumers (CLI, status, sniffer, collector)
2. fleet_deckies table — "what the orchestrator, web dashboard, and REST API see"
3. docker inspect — actual per-container runtime state
Resolution: JSON-only → INSERT; DB-only(this host) → DELETE; both → state := docker-aggregated.
```
Two facts this hands us:
1. **The API was DESIGNED to read `fleet_deckies`** — the engine-mirrored table written on *every* deploy/teardown regardless of origin (`deployer.py:571 _mirror_fleet_deploy_to_db`, `:623` teardown). The live deploy/collision-guard code reading `State["deployment"]`, and `get_deckies()` reading the JSON file, are both **drift from the documented design**. `fleet_deckies` is the cross-plane store that *does* contain a CLI-seeded fleet.
2. **Docker is already the ultimate authority** — the reconciler converges JSON and DB *to docker-aggregated state*. ANTI's label proposal (Option B) is not a new paradigm; it promotes docker from reconciler-tiebreaker to primary read path.
### Revised recommendation
Two viable directions, both grounded in the existing design rather than a new store:
- **B (labels / docker-primary)** — the ADR's Option B, now understood as *promoting* the reconciler's existing docker-authoritative tiebreaker to the primary fleet read. Strongest long-term; same swarm caveat (remote = `decky_shards`/`fleet_deckies`, master can't `docker ps` workers).
- **D (converge on `fleet_deckies` now)** — make the deploy collision-guard AND `get_deckies()` read `fleet_deckies` (`list_fleet_deckies` / `list_running_fleet_deckies`), the store the design already names as the API's view. Smaller than relabelling; immediately closes the CLI-invisible-to-web gap because `fleet_deckies` is engine-mirrored on CLI deploys too. The mutate handlers + mutator engine reading `State["deployment"]` become the next consolidation target.
**Unchanged hard constraint:** whichever store wins, the handler must still build the COMPLETE desired `config.deckies` (survivors + new) before `write_compose`/`deployer.py:708`. The store choice only decides where "survivors" is read from.
### Open question added to §5
7. **`State["deployment"]` vs `fleet_deckies`** — do we converge the whole web+mutator plane onto `fleet_deckies` (Option D), or go straight to docker-primary (Option B) and let `fleet_deckies` be the swarm/remote registry? The mutator engine (`mutator/engine.py`) is the heaviest consumer of `State["deployment"]` and must move in lockstep.

View File

@@ -2,6 +2,8 @@
import pytest
from unittest.mock import patch
from decnet.config import DeckyConfig
from decnet.web.db.models import LOCAL_HOST_SENTINEL
from decnet.web.dependencies import repo
@@ -15,70 +17,93 @@ def contract_test_mode(monkeypatch):
def mock_network():
"""Mock network detection so deploy doesn't call `ip addr show`."""
with patch("decnet.web.router.fleet.api_deploy_deckies.get_host_ip", return_value="192.168.1.100"):
yield
with patch("decnet.web.router.fleet.api_deploy_deckies.detect_interface", return_value="eth0"):
with patch("decnet.web.router.fleet.api_deploy_deckies.detect_subnet", return_value=("192.168.1.0/24", "192.168.1.1")):
yield
async def _clear_fleet() -> None:
for row in await repo.list_fleet_deckies():
await repo.delete_fleet_decky(
host_uuid=row.get("host_uuid") or LOCAL_HOST_SENTINEL,
name=row["name"],
)
async def _seed_fleet(name: str, ip: str) -> None:
cfg = DeckyConfig(
name=name, ip=ip, services=["ssh"], distro="debian",
base_image="debian", hostname=name,
)
await repo.upsert_fleet_decky({
"host_uuid": LOCAL_HOST_SENTINEL,
"name": name,
"services": ["ssh"],
"decky_config": cfg.model_dump(mode="json"),
"decky_ip": ip,
"state": "running",
})
@pytest.fixture(autouse=True)
async def _isolate_fleet():
await _clear_fleet()
yield
await _clear_fleet()
@pytest.mark.anyio
async def test_deploy_respects_limit(client, auth_token, mock_state_file):
"""Deploy should reject if the *submitted* INI exceeds the limit.
The INI is the source of truth — prior state is fully replaced — so the
check runs on the new decky count alone."""
async def test_deploy_respects_limit(client, auth_token):
"""The limit counts the WHOLE resulting fleet — existing (from
fleet_deckies) plus the submitted INI — not the INI alone. One existing
decky + one submitted, against a limit of 1, must be rejected."""
await repo.set_state("config_limits", {"deployment_limit": 1})
await repo.set_state("deployment", mock_state_file)
await _seed_fleet("decky-existing", "192.168.1.10")
ini = """[decky-a]
services = ssh
[decky-b]
services = ssh
"""
ini = "[decky-new]\nservices = ssh\n"
resp = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": ini},
headers={"Authorization": f"Bearer {auth_token}"},
)
# 2 new deckies > limit of 1
# existing(1) + new(1) = 2 > limit 1
assert resp.status_code == 409
assert "limit" in resp.json()["detail"].lower()
@pytest.mark.anyio
async def test_deploy_replaces_prior_state(client, auth_token, mock_state_file):
"""Submitting an INI with 1 decky must not silently re-include the 2
deckies from prior state (that caused the 'Address already in use'
regression when stale decky2/decky3 redeployed on stale IPs)."""
async def test_deploy_replaces_prior_state(client, auth_token):
"""replace_fleet=True drops the prior fleet rather than silently
re-including it (the 'Address already in use' regression came from stale
deckies redeploying on stale IPs). After replace, the committed fleet is
exactly the submitted INI."""
await repo.set_state("config_limits", {"deployment_limit": 10})
await repo.set_state("deployment", mock_state_file)
await _seed_fleet("test-decky-1", "192.168.1.10")
await _seed_fleet("test-decky-2", "192.168.1.11")
ini = """[only-decky]
services = ssh
"""
ini = "[only-decky]\nservices = ssh\n"
resp = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": ini},
json={"ini_content": ini, "replace_fleet": True},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert resp.status_code == 202
persisted = await repo.get_state("deployment")
names = [d["name"] for d in persisted["config"]["deckies"]]
assert names == ["only-decky"]
assert resp.status_code == 202, resp.text
names = {d["name"] for d in await repo.get_deckies()}
assert names == {"only-decky"}
@pytest.mark.anyio
async def test_deploy_within_limit(client, auth_token, mock_state_file):
"""Deploy should succeed when within limit."""
async def test_deploy_within_limit(client, auth_token):
"""Deploy should succeed when the resulting fleet is within limit."""
await repo.set_state("config_limits", {"deployment_limit": 100})
await repo.set_state("deployment", mock_state_file)
await _seed_fleet("decky-existing", "192.168.1.10")
ini = """[decky-new]
services = ssh
"""
ini = "[decky-new]\nservices = ssh\n"
resp = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": ini},
headers={"Authorization": f"Bearer {auth_token}"},
)
# Should not fail due to limit
if resp.status_code == 409:
assert "limit" not in resp.json()["detail"].lower()
else:

View File

@@ -211,6 +211,47 @@ def mock_state_file(patch_state_file: Path):
patch_state_file.write_text(json.dumps(_test_state))
yield _test_state
@pytest.fixture
async def mock_fleet_deckies():
"""Seed fleet_deckies with two deckies — the store get_deckies() reads
under the Option-D source-of-truth model (development/ADR-001-...md).
Mirrors the data mock_state_file used to put in decnet-state.json."""
from decnet.config import DeckyConfig
from decnet.web.db.models import LOCAL_HOST_SENTINEL
from decnet.web.dependencies import repo
async def _clear() -> None:
for row in await repo.list_fleet_deckies():
await repo.delete_fleet_decky(
host_uuid=row.get("host_uuid") or LOCAL_HOST_SENTINEL,
name=row["name"],
)
specs = [
("test-decky-1", "192.168.1.10", ["ssh"], "debian", "test-host-1",
{"ssh": {"banner": "SSH-2.0-OpenSSH_8.9"}}, "deaddeck"),
("test-decky-2", "192.168.1.11", ["http"], "ubuntu", "test-host-2",
{}, None),
]
await _clear()
for name, ip, services, distro, hostname, svc_cfg, arche in specs:
cfg = DeckyConfig(
name=name, ip=ip, services=services, distro=distro,
base_image=distro, hostname=hostname,
service_config=svc_cfg, archetype=arche,
)
await repo.upsert_fleet_decky({
"host_uuid": LOCAL_HOST_SENTINEL,
"name": name,
"services": services,
"decky_config": cfg.model_dump(mode="json"),
"decky_ip": ip,
"state": "running",
})
yield
await _clear()
# Share fuzz settings across API tests
# FUZZ_EXAMPLES: keep low for dev speed; bump via HYPOTHESIS_MAX_EXAMPLES env var in CI
_FUZZ_EXAMPLES = int(_os.environ.get("HYPOTHESIS_MAX_EXAMPLES", "10"))

View File

@@ -5,6 +5,13 @@ Default behaviour (replace_fleet=False) appends the INI to the existing
fleet so the wizard's "deploy one more decky" submit no longer wipes
prior deckies. replace_fleet=True preserves the historical
set-desired-state semantics for CLI / declarative callers.
The existing fleet is read from fleet_deckies — the engine-mirrored table
written on every deploy/teardown (CLI or web), per the source-of-truth
model in fleet/reconciler.py. These tests seed fleet_deckies directly,
which also models the BUG-2 scenario: a fleet established out of band
(CLI/seed) that the web deploy guard must see and append to rather than
wipe. See development/ADR-001-FLEET-SOURCE-OF-TRUTH.md.
"""
from __future__ import annotations
@@ -12,6 +19,8 @@ from unittest.mock import patch
import pytest
from decnet.config import DeckyConfig
from decnet.web.db.models import LOCAL_HOST_SENTINEL
from decnet.web.dependencies import repo
@@ -28,96 +37,111 @@ def mock_network():
yield
async def _clear_fleet() -> None:
for row in await repo.list_fleet_deckies():
await repo.delete_fleet_decky(
host_uuid=row.get("host_uuid") or LOCAL_HOST_SENTINEL,
name=row["name"],
)
async def _seed_fleet(name: str, *, ip: str = "192.168.1.10", services=("ssh",)) -> None:
"""Insert a decky into fleet_deckies, as the engine mirror does on a
CLI/web deploy. Stamps a full DeckyConfig into decky_config so the deploy
guard can rehydrate it."""
cfg = DeckyConfig(
name=name,
ip=ip,
services=list(services),
distro="debian",
base_image="debian:bookworm-slim",
hostname=name,
)
await repo.upsert_fleet_decky({
"host_uuid": LOCAL_HOST_SENTINEL,
"name": name,
"services": list(services),
"decky_config": cfg.model_dump(mode="json"),
"decky_ip": ip,
"state": "running",
})
@pytest.fixture(autouse=True)
async def _isolate_state():
for row in await repo.list_swarm_hosts():
await repo.delete_swarm_host(row["uuid"])
await repo.set_state("deployment", None)
await _clear_fleet()
yield
await repo.set_state("deployment", None)
await _clear_fleet()
@pytest.mark.anyio
async def test_additive_default_appends_to_existing_fleet(client, auth_token, monkeypatch):
"""Two sequential deploys with replace_fleet unset → both deckies in state."""
async def test_additive_onto_existing_fleet_appends_not_wipes(client, auth_token, monkeypatch):
"""BUG-2 regression: an additive web deploy onto a fleet established out
of band (CLI/seed → fleet_deckies) appends rather than wiping it.
Previously the guard read State["deployment"] (empty for a CLI-seeded
fleet), so existing_deckies was [] and the reconciler tore the running
fleet down to the single submitted decky."""
monkeypatch.setenv("DECNET_MODE", "master")
await _seed_fleet("decky-01", ip="192.168.1.10")
r1 = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-01]\nservices = ssh\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r1.status_code == 202, r1.text
r2 = await client.post(
r = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-02]\nservices = http\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r2.status_code == 202, r2.text
assert r.status_code == 202, r.text
committed = await repo.get_state("deployment")
assert committed is not None
names = {d["name"] for d in committed["config"]["deckies"]}
names = {d["name"] for d in await repo.get_deckies()}
assert names == {"decky-01", "decky-02"}
@pytest.mark.anyio
async def test_additive_name_collision_returns_409(client, auth_token, monkeypatch):
"""Re-submitting an existing decky name without replace_fleet → 409."""
"""Submitting a decky whose name already exists in the fleet without
replace_fleet → 409."""
monkeypatch.setenv("DECNET_MODE", "master")
await _seed_fleet("decky-01")
r1 = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-01]\nservices = ssh\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r1.status_code == 202, r1.text
r2 = await client.post(
r = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-01]\nservices = http\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r2.status_code == 409, r2.text
assert "decky-01" in r2.json()["detail"]
assert "replace_fleet" in r2.json()["detail"]
assert r.status_code == 409, r.text
assert "decky-01" in r.json()["detail"]
assert "replace_fleet" in r.json()["detail"]
@pytest.mark.anyio
async def test_additive_ip_collision_returns_409(client, auth_token, monkeypatch):
"""A new decky pinned to an IP already in use → 409 with the IP."""
"""A new decky pinned to an IP already in use by the existing fleet → 409
with the IP."""
monkeypatch.setenv("DECNET_MODE", "master")
await _seed_fleet("decky-01", ip="192.168.1.50")
r1 = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-01]\nservices = ssh\nip = 192.168.1.50\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r1.status_code == 202, r1.text
r2 = await client.post(
r = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-02]\nservices = http\nip = 192.168.1.50\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r2.status_code == 409, r2.text
assert "192.168.1.50" in r2.json()["detail"]
assert r.status_code == 409, r.text
assert "192.168.1.50" in r.json()["detail"]
@pytest.mark.anyio
async def test_replace_fleet_true_overwrites_existing(client, auth_token, monkeypatch):
"""replace_fleet=True preserves the historical full-replace semantics."""
"""replace_fleet=True preserves the historical full-replace semantics:
the existing fleet is dropped and the committed inventory is exactly the
submitted INI."""
monkeypatch.setenv("DECNET_MODE", "master")
await _seed_fleet("decky-01")
r1 = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-01]\nservices = ssh\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r1.status_code == 202, r1.text
r2 = await client.post(
r = await client.post(
"/api/v1/deckies/deploy",
json={
"ini_content": "[decky-02]\nservices = http\n",
@@ -125,11 +149,9 @@ async def test_replace_fleet_true_overwrites_existing(client, auth_token, monkey
},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r2.status_code == 202, r2.text
assert r.status_code == 202, r.text
committed = await repo.get_state("deployment")
assert committed is not None
names = {d["name"] for d in committed["config"]["deckies"]}
names = {d["name"] for d in await repo.get_deckies()}
assert names == {"decky-02"}
@@ -139,25 +161,16 @@ async def test_additive_lifecycle_ids_scoped_to_new_deckies(client, auth_token,
the caller submitted, not carryover. Operators polling
/deckies/lifecycle?ids=... see exactly what this call deployed."""
monkeypatch.setenv("DECNET_MODE", "master")
await _seed_fleet("decky-01", ip="192.168.1.10")
await _seed_fleet("decky-02", ip="192.168.1.11")
r1 = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-01]\nservices = ssh\n[decky-02]\nservices = http\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r1.status_code == 202, r1.text
assert len(r1.json()["lifecycle_ids"]) == 2
r2 = await client.post(
r = await client.post(
"/api/v1/deckies/deploy",
json={"ini_content": "[decky-03]\nservices = ssh\n"},
headers={"Authorization": f"Bearer {auth_token}"},
)
assert r2.status_code == 202, r2.text
body2 = r2.json()
assert len(body2["lifecycle_ids"]) == 1
assert r.status_code == 202, r.text
assert len(r.json()["lifecycle_ids"]) == 1
committed = await repo.get_state("deployment")
assert committed is not None
names = {d["name"] for d in committed["config"]["deckies"]}
names = {d["name"] for d in await repo.get_deckies()}
assert names == {"decky-01", "decky-02", "decky-03"}

View File

@@ -5,7 +5,7 @@ from hypothesis import given, settings, strategies as st
from ..conftest import _FUZZ_SETTINGS
@pytest.mark.anyio
async def test_get_deckies_endpoint(mock_state_file, client: httpx.AsyncClient, auth_token: str):
async def test_get_deckies_endpoint(mock_fleet_deckies, client: httpx.AsyncClient, auth_token: str):
_response = await client.get("/api/v1/deckies", headers={"Authorization": f"Bearer {auth_token}"})
assert _response.status_code == 200
_data = _response.json()