fix(fleet): read existing fleet from fleet_deckies, not State["deployment"] (BUG-2)
The web deploy collision-guard read the existing fleet from the DB State["deployment"] key, while the UI/get_deckies() read decnet-state.json. A fleet established via CLI/seed is written to decnet-state.json and fleet_deckies but never to State["deployment"] — the only store the guard consulted — so existing_deckies was empty, the additive guard ran blind, and the reconciler tore the running fleet down to the single submitted decky (BUG-2: silent fleet wipe, HTTP 202, no warning). Converge both reads on fleet_deckies — the engine-mirrored table written on every deploy/teardown (CLI and web), which fleet/reconciler.py already documents as the store the orchestrator, dashboard, and REST API see. Each row's decky_config column is a full DeckyConfig dump, so it rehydrates losslessly into the collision-guard input. The handler also commits the intended fleet to fleet_deckies synchronously so rapid sequential deploys read a current fleet and the dashboard observes the new shape immediately. State["deployment"] is retained for now — the mutate handlers and the mutator engine still coordinate through it; consolidating them is tracked in development/ADR-001-FLEET-SOURCE-OF-TRUTH.md (open question 7). Tests seed fleet_deckies directly (also modelling the CLI-seeded scenario) rather than chaining real deploys through the skipped contract-test path.
This commit is contained in:
@@ -12,7 +12,6 @@ backends. Dialect-specific behavior lives in subclasses:
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
|
||||
@@ -23,7 +22,6 @@ from typing import Any, Optional, List, cast
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession, async_sessionmaker
|
||||
|
||||
from decnet.config import load_state
|
||||
from decnet.env import DECNET_ADMIN_USER, DECNET_ADMIN_PASSWORD
|
||||
from decnet.web.auth import get_password_hash
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
@@ -172,8 +170,14 @@ class SQLModelRepository(
|
||||
return None
|
||||
|
||||
async def get_deckies(self) -> List[dict]:
|
||||
_state = await asyncio.to_thread(load_state)
|
||||
return [_d.model_dump() for _d in _state[0].deckies] if _state else []
|
||||
# The fleet inventory the UI/API sees is fleet_deckies — the
|
||||
# engine-mirrored table written on EVERY deploy/teardown (CLI or web),
|
||||
# per the source-of-truth model documented in fleet/reconciler.py.
|
||||
# Each row's decky_config column is a full DeckyConfig.model_dump(
|
||||
# mode="json"), so it rehydrates to the same shape load_state() used
|
||||
# to return. See development/ADR-001-FLEET-SOURCE-OF-TRUTH.md.
|
||||
rows = await self.list_fleet_deckies()
|
||||
return [r["decky_config"] for r in rows if r.get("decky_config")]
|
||||
|
||||
# --------------------------------------------------------------- users
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ from decnet.bus.factory import get_bus
|
||||
from decnet.lifecycle.runner import run_deploy
|
||||
from decnet.logging import get_logger
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT
|
||||
from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, DeckyConfig, _ROOT
|
||||
from decnet.ini_loader import load_ini_from_string
|
||||
from decnet.network import detect_interface, detect_subnet, get_host_ip
|
||||
from decnet.web.dependencies import require_admin, repo
|
||||
@@ -19,6 +19,39 @@ log = get_logger("api")
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
async def _commit_fleet_to_db(deckies: list[DeckyConfig], *, replace_fleet: bool) -> None:
    """Write the intended fleet shape into ``fleet_deckies`` before the deploy runs.

    ``fleet_deckies`` is the store the deploy guard reads to build
    ``existing_deckies``. Committing the desired rows here, ahead of the
    async deploy task's engine mirror, lets back-to-back web deploys each see
    a current fleet and lets the dashboard show the new shape immediately.
    The row payload follows the shape used by
    ``engine.deployer._mirror_fleet_deploy_to_db``.

    Args:
        deckies: The complete desired fleet; every entry is upserted.
        replace_fleet: When true, rows whose ``(host, name)`` key is not in
            *deckies* are deleted first so the table matches the desired set.
            Container convergence is left to the async reconciler/teardown
            mirror.
    """
    from decnet.web.db.models import LOCAL_HOST_SENTINEL

    def _host_key(host_uuid):
        # A decky without a host_uuid is keyed under the local-host sentinel.
        return host_uuid or LOCAL_HOST_SENTINEL

    desired_keys = {(_host_key(decky.host_uuid), decky.name) for decky in deckies}

    if replace_fleet:
        current_rows = await repo.list_fleet_deckies()
        for current in current_rows:
            row_host = _host_key(current.get("host_uuid"))
            if (row_host, current.get("name")) in desired_keys:
                continue
            await repo.delete_fleet_decky(host_uuid=row_host, name=current["name"])

    for decky in deckies:
        # NOTE(review): every row is stamped "running", including deckies
        # that already had a row — confirm this cannot mask a non-running
        # state recorded by the reconciler.
        payload = {
            "host_uuid": _host_key(decky.host_uuid),
            "name": decky.name,
            "services": list(decky.services),
            "decky_config": decky.model_dump(mode="json"),
            "decky_ip": decky.ip,
            "state": "running",
        }
        await repo.upsert_fleet_decky(payload)
|
||||
|
||||
|
||||
@router.post(
|
||||
"/deckies/deploy",
|
||||
tags=["Fleet Management"],
|
||||
@@ -81,7 +114,19 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
|
||||
# config below) so additive collision checks compare new against prior
|
||||
# rather than against themselves. Existing IPs are passed into
|
||||
# build_deckies_from_ini as reserved so auto-allocation skips them.
|
||||
existing_deckies = list(config.deckies) if config is not None else []
|
||||
# The existing fleet comes from fleet_deckies (engine-mirrored on CLI
|
||||
# *and* web deploys), NOT from config.deckies carried by the
|
||||
# State["deployment"] key. A CLI/seed-established fleet never lands in
|
||||
# that key, so the additive collision guard ran blind and the reconciler
|
||||
# wiped the fleet — root cause of BUG-2. fleet_deckies is the store the
|
||||
# source-of-truth model (fleet/reconciler.py) names as the API's view.
|
||||
# See development/ADR-001-FLEET-SOURCE-OF-TRUTH.md.
|
||||
existing_rows = await repo.list_fleet_deckies()
|
||||
existing_deckies = [
|
||||
DeckyConfig(**r["decky_config"])
|
||||
for r in existing_rows
|
||||
if r.get("decky_config")
|
||||
]
|
||||
reserved_ips: set[str] | None = (
|
||||
{d.ip for d in existing_deckies if d.ip}
|
||||
if not req.replace_fleet and existing_deckies
|
||||
@@ -192,6 +237,11 @@ async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(requir
|
||||
),
|
||||
}
|
||||
await repo.set_state("deployment", new_state_payload)
|
||||
# Commit the intended fleet to fleet_deckies — the store the deploy guard
|
||||
# and get_deckies() now read. set_state("deployment") above is retained
|
||||
# for the mutate handlers / mutator engine that still coordinate through
|
||||
# that key (their consolidation is tracked in the ADR, open question 7).
|
||||
await _commit_fleet_to_db(config.deckies, replace_fleet=req.replace_fleet)
|
||||
|
||||
# Lifecycle rows track THIS call's deployments only. In additive mode
|
||||
# the existing deckies are already running and don't get a new
|
||||
|
||||
177
development/ADR-001-FLEET-SOURCE-OF-TRUTH.md
Normal file
177
development/ADR-001-FLEET-SOURCE-OF-TRUTH.md
Normal file
@@ -0,0 +1,177 @@
|
||||
# ADR-001 — Fleet Source of Truth
|
||||
|
||||
- **Status:** PROPOSED (discussion — not yet accepted)
|
||||
- **Date:** 2026-06-12
|
||||
- **Context owner:** ANTI
|
||||
- **Drives fix for:** BUG-2 (destructive fleet-replace / silent wipe), see `QA_REPORT.md`
|
||||
|
||||
---
|
||||
|
||||
## 1. Context
|
||||
|
||||
DECNET currently keeps the deployed-fleet inventory in **two unsynchronized stores**:
|
||||
|
||||
| Store | Read by | Written by |
|
||||
|-------|---------|------------|
|
||||
| `decnet-state.json` file (`load_state()`) | `repo.get_deckies()` → the UI fleet view, collision pre-checks | CLI/engine path (`engine.deployer.save_state`), `decnet status`, sniffer, collector |
|
||||
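| DB `State` table, key `"deployment"` (`repo.get_state`/`set_state`) | the web deploy handler's `existing_deckies` snapshot | **only** the web deploy handler *(superseded — see the correction in §7: the mutate handlers and the mutator engine also read and write this key)* |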
|
||||
|
||||
The web is a **non-dependency**: the same deploys can be driven entirely from the CLI, and CLI state lives in `decnet-state.json`. Because the two stores never reconcile, a fleet established via CLI/seed is invisible to the web deploy handler's collision guard.
|
||||
|
||||
### BUG-2 failure chain (source-traced)
|
||||
|
||||
1. CLI/seed establishes a fleet → written to `decnet-state.json`, **never** to DB `"deployment"`.
|
||||
2. UI reads `get_deckies()` (JSON) → shows decky-02/03 correctly.
|
||||
3. Wizard POSTs a new decky-04 with `replace_fleet=false`.
|
||||
4. Handler reads `existing_deckies` from `repo.get_state("deployment")` → **None** → `existing_deckies = []`.
|
||||
5. Collision guard compares against `[]` → no conflict → `config.deckies = [] + [decky-04]`.
|
||||
6. `run_deploy` → `LocalDeployStrategy` → `engine.deployer.deploy(config)`:
|
||||
- `write_compose(config, COMPOSE_FILE)` writes a compose file containing **only decky-04** (`deployer.py:681`).
|
||||
- `_compose("down", "--remove-orphans", …)` (`deployer.py:708`) tears down the whole compose project, then `up` brings back only decky-04.
|
||||
- `_mirror_fleet_teardown_to_db` drops the survivors' rows.
|
||||
7. Result: fleet silently wiped to one decky. HTTP 202. No warning.
|
||||
|
||||
**Key trap:** the destructive call is `deployer.py:708` (`down --remove-orphans` against a compose file rewritten from `config.deckies`). Any source-of-truth fix that does not also guarantee `config.deckies` is the **complete** desired fleet before `write_compose` leaves BUG-2 alive.
|
||||
|
||||
---
|
||||
|
||||
## 2. What the UI actually consumes
|
||||
|
||||
`DeckyConfig` (`decnet/models.py:87`) full field set:
|
||||
|
||||
```
|
||||
name, ip, services[], distro, base_image, build_base, hostname,
|
||||
archetype, service_config{}, nmap_os, mutate_interval, last_mutated,
|
||||
last_login_attempt, host_uuid
|
||||
```
|
||||
|
||||
Frontend `Decky` type (`DeckyFleet/types.ts`) + what is **rendered/edited**:
|
||||
|
||||
| Field | Displayed? | Where |
|
||||
|-------|-----------|-------|
|
||||
| name, ip, services | yes | DeckyCard / InspectPanel |
|
||||
| hostname, distro, archetype | yes | DeckyInspectPanel:77-79 |
|
||||
| mutate_interval, last_mutated | yes | DeckyInspectPanel:80-81 |
|
||||
| **service_config** | **yes — EDITED** | DeckyCard:322 (per-service config editor `currentConfig`) |
|
||||
| base_image, build_base, nmap_os, last_login_attempt | no | — |
|
||||
|
||||
**Conclusion:** `service_config` is not just stored — it is rendered and **edited** in the UI. A "minimal scalar labels" scheme (name/ip/services only) would amputate editable state. Fidelity requires carrying the full `DeckyConfig`.
|
||||
|
||||
---
|
||||
|
||||
## 3. Options
|
||||
|
||||
### Option A — API reads only the DB; ignore `decnet-state.json` (web side)
|
||||
|
||||
Align `get_deckies()` and the deploy handler both on DB `"deployment"`. The web becomes a self-contained plane on the DB; CLI stays on the JSON file. The two planes are explicitly **non-interoperable**.
|
||||
|
||||
- **Pros:** smallest change; closes the desync *within the web plane*.
|
||||
- **Cons:** ANTI's own verdict — "honestly the incorrect way of doing things." Two planes that can't see each other is a design smell, not a fix. A CLI-seeded fleet is still invisible to the web (and vice-versa); the wizard would still drive a reconciler that tears down CLI containers it can't see. Does **not** fix the cross-plane wipe, only the intra-web one.
|
||||
|
||||
### Option B — Docker container labels as source of truth (ANTI's proposal)
|
||||
|
||||
Stamp every DECNET container with provenance + identity labels; reconstruct the fleet by querying Docker. `decnet-state.json` degrades to a CLI-side convenience cache, no longer authoritative.
|
||||
|
||||
Proposed labels:
|
||||
```
|
||||
com.decnet.host = "true" # selector for "this is a DECNET decky"
|
||||
com.decnet.deploy_type = "api" | "cli" # provenance, NOT a partition
|
||||
com.decnet.service = "<service>" # or the broader identity
|
||||
com.decnet.config = "<DeckyConfig JSON>" # REQUIRED to preserve service_config fidelity (see §2)
|
||||
```
|
||||
|
||||
Fleet read becomes `docker ps --filter label=com.decnet.host=true` (+ `-a` for stopped), then deserialize `com.decnet.config`.
|
||||
|
||||
- **Pros:**
|
||||
- **One source of truth = reality.** The collision guard and the reconciler read the SAME state, so BUG-2 cannot recur.
|
||||
- Survives a DECNET process restart (Docker keeps running; labels persist on the real object).
|
||||
- `deploy_type` makes the "two planes" distinction unnecessary — one fleet, labeled by origin. The guard queries ALL `com.decnet.host=true` regardless of origin, so it can never blind-wipe a CLI decky.
|
||||
- This is the orchestrator-standard pattern (label the real object, reconcile against it).
|
||||
- **Cons / constraints:**
|
||||
- **Swarm.** The master cannot `docker ps` a remote worker. Remote deckies STILL need a registry → keep `decky_shards` (DB, heartbeat-driven). Honest model is **hybrid**: local truth = labels, remote truth = `decky_shards`.
|
||||
- **Fleet-global config** (`interface, subnet, gateway, ipvlan, mutate_interval, log_file, compose_path`) is not per-container. Proposed home: **labels on the macvlan/ipvlan network object** (exactly one, DECNET-owned, correct scope). NOT replicated onto every container.
|
||||
- **Label payload.** Preserving `service_config` fidelity forces a `com.decnet.config` JSON blob. Works (label values are generous) but it is config-in-label-land, with its own serialization discipline.
|
||||
- **Performance.** `/deckies` is UI-polled and load-tested. Querying Docker on every read is heavier than a file/DB read. Mitigation: the existing 5s TTL cache (`api_get_deckies.py:_DECKIES_TTL`) extends naturally over the Docker query.
|
||||
- **Does NOT by itself fix `deployer.py:708`.** Labels give the DATA to build the COMPLETE config (live + new) before `write_compose`; the merge must actually be done. Labels make the correct merge possible; they don't perform it.
|
||||
|
||||
### Option C — Single DB store as canonical (both web and CLI write DB)
|
||||
|
||||
Make the CLI write the DB `"deployment"` key too; retire `decnet-state.json` as authority. One store, but it's bookkeeping, not reality — can still drift from actual containers on crash/manual `docker rm`.
|
||||
|
||||
- **Pros:** single store; no Docker-query perf cost; swarm-friendly (DB is already the remote registry).
|
||||
- **Cons:** reintroduces the "trust the ledger, not reality" fragility that Option B specifically escapes; CLI now hard-depends on the DB being reachable, eroding the web-is-a-non-dependency property.
|
||||
|
||||
---
|
||||
|
||||
## 4. Recommendation (for discussion)
|
||||
|
||||
**Option B (labels), accepted as a hybrid:** local fleet truth = Docker labels; remote fleet truth = `decky_shards` (DB); fleet-global config = network-object labels; `decnet-state.json` demoted to CLI convenience cache.
|
||||
|
||||
Mandatory companion change regardless of option chosen: **build the complete desired `config.deckies` (surviving live fleet + new submissions) before `write_compose`/`deployer.py:708`**, so `down --remove-orphans` + `up` is a no-op on survivors. This is the actual teardown fix; the source-of-truth choice only determines *where the survivor list is read from*.
|
||||
|
||||
---
|
||||
|
||||
## 5. Open questions (resolve before cutting code)
|
||||
|
||||
1. **`com.decnet.config` blob vs. exploded scalar labels** — do we accept one JSON label for fidelity, or split into N labels and reconstruct? (Fidelity for `service_config` pushes toward the blob.)
|
||||
2. **Global config home** — network-object labels confirmed as the home, or a single sentinel "fleet" container/label set?
|
||||
3. **Swarm boundary** — is the local-labels / remote-`decky_shards` split acceptable, or do we want labels mirrored back to the master via heartbeat for a uniform read path?
|
||||
4. **Stopped/failed containers** — does `-a` (include stopped) count toward the fleet for collision purposes, and how do we represent non-running status the JSON file never tracked?
|
||||
5. **Migration** — first label-aware deploy after upgrade: how do we adopt already-running unlabeled containers (relabel in place vs. require one redeploy)?
|
||||
6. **`decnet-state.json` final role** — pure CLI cache, or removed entirely with CLI also reading labels?
|
||||
|
||||
---
|
||||
|
||||
## 6. Affected files (for whichever option lands)
|
||||
|
||||
- `decnet/web/router/fleet/api_deploy_deckies.py` — `existing_deckies` snapshot (lines 48, 84), collision guard (124-145), `set_state("deployment")` (194)
|
||||
- `decnet/web/router/fleet/api_get_deckies.py` — `get_deckies` read path + TTL cache
|
||||
- `decnet/web/db/sqlmodel_repo/__init__.py:174` — `get_deckies()` (currently `load_state()`)
|
||||
- `decnet/engine/deployer.py:681` (`write_compose`), `:708` (`down --remove-orphans`), `:571`/`:623` (`_mirror_fleet_*`)
|
||||
- `decnet/config.py` — `save_state`/`load_state`, `STATE_FILE`
|
||||
- `decnet/lifecycle/runner.py` / `strategies.py` — `LocalDeployStrategy` → `deployer.deploy`
|
||||
- `decnet/models.py:87` — `DeckyConfig` (label serialization surface)
|
||||
|
||||
---
|
||||
|
||||
## 7. CORRECTION (source-traced 2026-06-12) — the store topology is wider than §1 said
|
||||
|
||||
§1's claim that DB `State["deployment"]` is *"written only by the web deploy handler"* is **WRONG**. A grep for its readers/writers shows it is the shared coordination store for the **entire web + mutator plane**:
|
||||
|
||||
| Site | Op |
|
||||
|------|----|
|
||||
| `api_deploy_deckies.py:48,194` | read + write |
|
||||
| `api_mutate_decky.py:55,76` | read + write |
|
||||
| `api_mutate_interval.py:32,45` | read + write |
|
||||
| `swarm_mgmt/api_list_deckies.py:28` | read |
|
||||
| `mutator/engine.py:84,126,189,413` | read + write (autonomous mutator) |
|
||||
|
||||
Consequences:
|
||||
- A one-line "deploy handler reads `load_state()`" swap makes deploy **diverge from its own plane** (mutate handlers + the background mutator still read the DB key). Lateral move, not a fix. **Empirically confirmed:** that edit broke 4/5 tests in `tests/api/fleet/test_deploy_additive.py` (the one test that still passed was the `replace_fleet=True` case, the only one that doesn't read the prior fleet), because under `DECNET_CONTRACT_TEST` the deploy task is skipped so `save_state` never writes the JSON, and the handler couldn't see its own prior `set_state` write. Read-one-store / write-another is self-inconsistent.
|
||||
- Pointing `get_deckies()` at the DB key **also fails to fix BUG-2**: a CLI-seeded fleet isn't in `State["deployment"]` either (CLI writes JSON + `fleet_deckies`), so the reconcile-against-incomplete-inventory wipe survives.
|
||||
|
||||
### The model the codebase ALREADY documents (`fleet/reconciler.py:1-29`)
|
||||
|
||||
```
|
||||
1. decnet-state.json — canonical for offline / no-API consumers (CLI, status, sniffer, collector)
|
||||
2. fleet_deckies table — "what the orchestrator, web dashboard, and REST API see"
|
||||
3. docker inspect — actual per-container runtime state
|
||||
Resolution: JSON-only → INSERT; DB-only(this host) → DELETE; both → state := docker-aggregated.
|
||||
```
|
||||
|
||||
Two facts this hands us:
|
||||
1. **The API was DESIGNED to read `fleet_deckies`** — the engine-mirrored table written on *every* deploy/teardown regardless of origin (`deployer.py:571 _mirror_fleet_deploy_to_db`, `:623` teardown). The live deploy/collision-guard code reading `State["deployment"]`, and `get_deckies()` reading the JSON file, are both **drift from the documented design**. `fleet_deckies` is the cross-plane store that *does* contain a CLI-seeded fleet.
|
||||
2. **Docker is already the ultimate authority** — the reconciler converges JSON and DB *to docker-aggregated state*. ANTI's label proposal (Option B) is not a new paradigm; it promotes docker from reconciler-tiebreaker to primary read path.
|
||||
|
||||
### Revised recommendation
|
||||
|
||||
Two viable directions, both grounded in the existing design rather than a new store:
|
||||
|
||||
- **B′ (labels / docker-primary)** — the ADR's Option B, now understood as *promoting* the reconciler's existing docker-authoritative tiebreaker to the primary fleet read. Strongest long-term; same swarm caveat (remote = `decky_shards`/`fleet_deckies`, master can't `docker ps` workers).
|
||||
- **D (converge on `fleet_deckies` now)** — make the deploy collision-guard AND `get_deckies()` read `fleet_deckies` (`list_fleet_deckies` / `list_running_fleet_deckies`), the store the design already names as the API's view. Smaller than relabelling; immediately closes the CLI-invisible-to-web gap because `fleet_deckies` is engine-mirrored on CLI deploys too. The mutate handlers + mutator engine reading `State["deployment"]` become the next consolidation target.
|
||||
|
||||
**Unchanged hard constraint:** whichever store wins, the handler must still build the COMPLETE desired `config.deckies` (survivors + new) before `write_compose`/`deployer.py:708`. The store choice only decides where "survivors" is read from.
|
||||
|
||||
### Open question added to §5
|
||||
|
||||
7. **`State["deployment"]` vs `fleet_deckies`** — do we converge the whole web+mutator plane onto `fleet_deckies` (Option D), or go straight to docker-primary (Option B′) and let `fleet_deckies` be the swarm/remote registry? The mutator engine (`mutator/engine.py`) is the heaviest consumer of `State["deployment"]` and must move in lockstep.
|
||||
@@ -2,6 +2,8 @@
|
||||
import pytest
|
||||
from unittest.mock import patch
|
||||
|
||||
from decnet.config import DeckyConfig
|
||||
from decnet.web.db.models import LOCAL_HOST_SENTINEL
|
||||
from decnet.web.dependencies import repo
|
||||
|
||||
|
||||
@@ -15,70 +17,93 @@ def contract_test_mode(monkeypatch):
|
||||
def mock_network():
|
||||
"""Mock network detection so deploy doesn't call `ip addr show`."""
|
||||
with patch("decnet.web.router.fleet.api_deploy_deckies.get_host_ip", return_value="192.168.1.100"):
|
||||
yield
|
||||
with patch("decnet.web.router.fleet.api_deploy_deckies.detect_interface", return_value="eth0"):
|
||||
with patch("decnet.web.router.fleet.api_deploy_deckies.detect_subnet", return_value=("192.168.1.0/24", "192.168.1.1")):
|
||||
yield
|
||||
|
||||
|
||||
async def _clear_fleet() -> None:
    """Delete every row currently present in ``fleet_deckies``."""
    existing = await repo.list_fleet_deckies()
    for entry in existing:
        # Rows without a host_uuid are keyed under the local-host sentinel.
        owner = entry.get("host_uuid") or LOCAL_HOST_SENTINEL
        await repo.delete_fleet_decky(host_uuid=owner, name=entry["name"])
|
||||
|
||||
|
||||
async def _seed_fleet(name: str, ip: str) -> None:
    """Insert one local ssh decky into ``fleet_deckies``.

    The row carries a full ``DeckyConfig`` dump in ``decky_config`` and is
    marked ``running``.
    """
    decky = DeckyConfig(
        name=name,
        ip=ip,
        services=["ssh"],
        distro="debian",
        base_image="debian",
        hostname=name,
    )
    row = {
        "host_uuid": LOCAL_HOST_SENTINEL,
        "name": name,
        "services": ["ssh"],
        "decky_config": decky.model_dump(mode="json"),
        "decky_ip": ip,
        "state": "running",
    }
    await repo.upsert_fleet_decky(row)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
async def _isolate_fleet():
    """Empty ``fleet_deckies`` before and after every test in this module."""
    # Setup: start each test from an empty fleet table.
    await _clear_fleet()
    yield
    # Teardown: drop whatever the test seeded or deployed.
    await _clear_fleet()
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_deploy_respects_limit(client, auth_token, mock_state_file):
|
||||
"""Deploy should reject if the *submitted* INI exceeds the limit.
|
||||
The INI is the source of truth — prior state is fully replaced — so the
|
||||
check runs on the new decky count alone."""
|
||||
async def test_deploy_respects_limit(client, auth_token):
|
||||
"""The limit counts the WHOLE resulting fleet — existing (from
|
||||
fleet_deckies) plus the submitted INI — not the INI alone. One existing
|
||||
decky + one submitted, against a limit of 1, must be rejected."""
|
||||
await repo.set_state("config_limits", {"deployment_limit": 1})
|
||||
await repo.set_state("deployment", mock_state_file)
|
||||
await _seed_fleet("decky-existing", "192.168.1.10")
|
||||
|
||||
ini = """[decky-a]
|
||||
services = ssh
|
||||
|
||||
[decky-b]
|
||||
services = ssh
|
||||
"""
|
||||
ini = "[decky-new]\nservices = ssh\n"
|
||||
resp = await client.post(
|
||||
"/api/v1/deckies/deploy",
|
||||
json={"ini_content": ini},
|
||||
headers={"Authorization": f"Bearer {auth_token}"},
|
||||
)
|
||||
# 2 new deckies > limit of 1
|
||||
# existing(1) + new(1) = 2 > limit 1
|
||||
assert resp.status_code == 409
|
||||
assert "limit" in resp.json()["detail"].lower()
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_deploy_replaces_prior_state(client, auth_token, mock_state_file):
|
||||
"""Submitting an INI with 1 decky must not silently re-include the 2
|
||||
deckies from prior state (that caused the 'Address already in use'
|
||||
regression when stale decky2/decky3 redeployed on stale IPs)."""
|
||||
async def test_deploy_replaces_prior_state(client, auth_token):
|
||||
"""replace_fleet=True drops the prior fleet rather than silently
|
||||
re-including it (the 'Address already in use' regression came from stale
|
||||
deckies redeploying on stale IPs). After replace, the committed fleet is
|
||||
exactly the submitted INI."""
|
||||
await repo.set_state("config_limits", {"deployment_limit": 10})
|
||||
await repo.set_state("deployment", mock_state_file)
|
||||
await _seed_fleet("test-decky-1", "192.168.1.10")
|
||||
await _seed_fleet("test-decky-2", "192.168.1.11")
|
||||
|
||||
ini = """[only-decky]
|
||||
services = ssh
|
||||
"""
|
||||
ini = "[only-decky]\nservices = ssh\n"
|
||||
resp = await client.post(
|
||||
"/api/v1/deckies/deploy",
|
||||
json={"ini_content": ini},
|
||||
json={"ini_content": ini, "replace_fleet": True},
|
||||
headers={"Authorization": f"Bearer {auth_token}"},
|
||||
)
|
||||
assert resp.status_code == 202
|
||||
persisted = await repo.get_state("deployment")
|
||||
names = [d["name"] for d in persisted["config"]["deckies"]]
|
||||
assert names == ["only-decky"]
|
||||
assert resp.status_code == 202, resp.text
|
||||
names = {d["name"] for d in await repo.get_deckies()}
|
||||
assert names == {"only-decky"}
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_deploy_within_limit(client, auth_token, mock_state_file):
|
||||
"""Deploy should succeed when within limit."""
|
||||
async def test_deploy_within_limit(client, auth_token):
|
||||
"""Deploy should succeed when the resulting fleet is within limit."""
|
||||
await repo.set_state("config_limits", {"deployment_limit": 100})
|
||||
await repo.set_state("deployment", mock_state_file)
|
||||
await _seed_fleet("decky-existing", "192.168.1.10")
|
||||
|
||||
ini = """[decky-new]
|
||||
services = ssh
|
||||
"""
|
||||
ini = "[decky-new]\nservices = ssh\n"
|
||||
resp = await client.post(
|
||||
"/api/v1/deckies/deploy",
|
||||
json={"ini_content": ini},
|
||||
headers={"Authorization": f"Bearer {auth_token}"},
|
||||
)
|
||||
# Should not fail due to limit
|
||||
if resp.status_code == 409:
|
||||
assert "limit" not in resp.json()["detail"].lower()
|
||||
else:
|
||||
|
||||
@@ -211,6 +211,47 @@ def mock_state_file(patch_state_file: Path):
|
||||
patch_state_file.write_text(json.dumps(_test_state))
|
||||
yield _test_state
|
||||
|
||||
|
||||
@pytest.fixture
async def mock_fleet_deckies():
    """Seed ``fleet_deckies`` with two local deckies for the test's duration.

    ``fleet_deckies`` is the store ``get_deckies()`` reads under the Option-D
    source-of-truth model (development/ADR-001-FLEET-SOURCE-OF-TRUTH.md).
    The seeded data mirrors what ``mock_state_file`` used to put in
    decnet-state.json. The table is emptied before seeding and again on
    teardown.
    """
    from decnet.config import DeckyConfig
    from decnet.web.db.models import LOCAL_HOST_SENTINEL
    from decnet.web.dependencies import repo

    async def _wipe() -> None:
        # Remove every fleet row, keying host-less rows under the sentinel.
        for entry in await repo.list_fleet_deckies():
            owner = entry.get("host_uuid") or LOCAL_HOST_SENTINEL
            await repo.delete_fleet_decky(host_uuid=owner, name=entry["name"])

    seed_configs = [
        {
            "name": "test-decky-1",
            "ip": "192.168.1.10",
            "services": ["ssh"],
            "distro": "debian",
            "hostname": "test-host-1",
            "service_config": {"ssh": {"banner": "SSH-2.0-OpenSSH_8.9"}},
            "archetype": "deaddeck",
        },
        {
            "name": "test-decky-2",
            "ip": "192.168.1.11",
            "services": ["http"],
            "distro": "ubuntu",
            "hostname": "test-host-2",
            "service_config": {},
            "archetype": None,
        },
    ]

    await _wipe()
    for fields in seed_configs:
        # base_image reuses the distro name, as in the original fixture data.
        decky = DeckyConfig(base_image=fields["distro"], **fields)
        await repo.upsert_fleet_decky({
            "host_uuid": LOCAL_HOST_SENTINEL,
            "name": fields["name"],
            "services": fields["services"],
            "decky_config": decky.model_dump(mode="json"),
            "decky_ip": fields["ip"],
            "state": "running",
        })
    yield
    await _wipe()
|
||||
|
||||
# Share fuzz settings across API tests
|
||||
# FUZZ_EXAMPLES: keep low for dev speed; bump via HYPOTHESIS_MAX_EXAMPLES env var in CI
|
||||
_FUZZ_EXAMPLES = int(_os.environ.get("HYPOTHESIS_MAX_EXAMPLES", "10"))
|
||||
|
||||
@@ -5,6 +5,13 @@ Default behaviour (replace_fleet=False) appends the INI to the existing
|
||||
fleet so the wizard's "deploy one more decky" submit no longer wipes
|
||||
prior deckies. replace_fleet=True preserves the historical
|
||||
set-desired-state semantics for CLI / declarative callers.
|
||||
|
||||
The existing fleet is read from fleet_deckies — the engine-mirrored table
|
||||
written on every deploy/teardown (CLI or web), per the source-of-truth
|
||||
model in fleet/reconciler.py. These tests seed fleet_deckies directly,
|
||||
which also models the BUG-2 scenario: a fleet established out of band
|
||||
(CLI/seed) that the web deploy guard must see and append to rather than
|
||||
wipe. See development/ADR-001-FLEET-SOURCE-OF-TRUTH.md.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -12,6 +19,8 @@ from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from decnet.config import DeckyConfig
|
||||
from decnet.web.db.models import LOCAL_HOST_SENTINEL
|
||||
from decnet.web.dependencies import repo
|
||||
|
||||
|
||||
@@ -28,96 +37,111 @@ def mock_network():
|
||||
yield
|
||||
|
||||
|
||||
async def _clear_fleet() -> None:
    """Remove all rows from ``fleet_deckies`` so each test starts empty."""
    fleet_rows = await repo.list_fleet_deckies()
    for fleet_row in fleet_rows:
        # A missing/empty host_uuid maps to the local-host sentinel.
        host = fleet_row.get("host_uuid") or LOCAL_HOST_SENTINEL
        await repo.delete_fleet_decky(host_uuid=host, name=fleet_row["name"])
|
||||
|
||||
|
||||
async def _seed_fleet(name: str, *, ip: str = "192.168.1.10", services=("ssh",)) -> None:
    """Insert a decky into ``fleet_deckies``, as the engine mirror does on a
    CLI/web deploy.

    A full ``DeckyConfig`` dump is stamped into ``decky_config`` so the
    deploy guard can rehydrate it.

    Args:
        name: Decky name; also used as its hostname.
        ip: IP recorded on both the config and the row.
        services: Any iterable of service names.
    """
    # Materialise once: the services are needed for both the config and the
    # row, and a one-shot iterable would be exhausted after the first pass,
    # leaving the row's "services" column empty.
    service_names = list(services)
    cfg = DeckyConfig(
        name=name,
        ip=ip,
        services=list(service_names),
        distro="debian",
        base_image="debian:bookworm-slim",
        hostname=name,
    )
    await repo.upsert_fleet_decky({
        "host_uuid": LOCAL_HOST_SENTINEL,
        "name": name,
        "services": service_names,
        "decky_config": cfg.model_dump(mode="json"),
        "decky_ip": ip,
        "state": "running",
    })
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
async def _isolate_state():
    """Reset swarm hosts, the ``deployment`` state key, and ``fleet_deckies``
    around every test in this module."""

    async def _reset_deployment_and_fleet() -> None:
        # Same two steps run on both sides of the test.
        await repo.set_state("deployment", None)
        await _clear_fleet()

    # Swarm hosts are only removed on setup, not on teardown.
    swarm_hosts = await repo.list_swarm_hosts()
    for host in swarm_hosts:
        await repo.delete_swarm_host(host["uuid"])
    await _reset_deployment_and_fleet()
    yield
    await _reset_deployment_and_fleet()
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_additive_onto_existing_fleet_appends_not_wipes(client, auth_token, monkeypatch):
    """BUG-2 regression: an additive web deploy onto a fleet established out
    of band (CLI/seed → fleet_deckies) appends rather than wiping it.

    Previously the guard read State["deployment"] (empty for a CLI-seeded
    fleet), so existing_deckies was [] and the reconciler tore the running
    fleet down to the single submitted decky."""
    monkeypatch.setenv("DECNET_MODE", "master")
    # The existing fleet is seeded directly into fleet_deckies instead of
    # being created through a first deploy call.
    await _seed_fleet("decky-01", ip="192.168.1.10")

    r = await client.post(
        "/api/v1/deckies/deploy",
        json={"ini_content": "[decky-02]\nservices = http\n"},
        headers={"Authorization": f"Bearer {auth_token}"},
    )
    assert r.status_code == 202, r.text

    # The resulting fleet is read back through repo.get_deckies() rather
    # than from State["deployment"].
    names = {d["name"] for d in await repo.get_deckies()}
    assert names == {"decky-01", "decky-02"}
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_additive_name_collision_returns_409(client, auth_token, monkeypatch):
    """Submitting a decky whose name already exists in the fleet without
    replace_fleet → 409."""
    monkeypatch.setenv("DECNET_MODE", "master")
    # The colliding decky is seeded into fleet_deckies directly.
    await _seed_fleet("decky-01")

    r = await client.post(
        "/api/v1/deckies/deploy",
        json={"ini_content": "[decky-01]\nservices = http\n"},
        headers={"Authorization": f"Bearer {auth_token}"},
    )
    assert r.status_code == 409, r.text
    # The error detail must name both the clashing decky and the opt-out flag.
    detail = r.json()["detail"]
    assert "decky-01" in detail
    assert "replace_fleet" in detail
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_additive_ip_collision_returns_409(client, auth_token, monkeypatch):
    """A new decky pinned to an IP already in use by the existing fleet → 409
    with the IP."""
    monkeypatch.setenv("DECNET_MODE", "master")
    # Seed an existing decky that already holds the contested address.
    await _seed_fleet("decky-01", ip="192.168.1.50")

    r = await client.post(
        "/api/v1/deckies/deploy",
        json={"ini_content": "[decky-02]\nservices = http\nip = 192.168.1.50\n"},
        headers={"Authorization": f"Bearer {auth_token}"},
    )
    assert r.status_code == 409, r.text
    assert "192.168.1.50" in r.json()["detail"]
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_replace_fleet_true_overwrites_existing(client, auth_token, monkeypatch):
|
||||
"""replace_fleet=True preserves the historical full-replace semantics."""
|
||||
"""replace_fleet=True preserves the historical full-replace semantics:
|
||||
the existing fleet is dropped and the committed inventory is exactly the
|
||||
submitted INI."""
|
||||
monkeypatch.setenv("DECNET_MODE", "master")
|
||||
await _seed_fleet("decky-01")
|
||||
|
||||
r1 = await client.post(
|
||||
"/api/v1/deckies/deploy",
|
||||
json={"ini_content": "[decky-01]\nservices = ssh\n"},
|
||||
headers={"Authorization": f"Bearer {auth_token}"},
|
||||
)
|
||||
assert r1.status_code == 202, r1.text
|
||||
|
||||
r2 = await client.post(
|
||||
r = await client.post(
|
||||
"/api/v1/deckies/deploy",
|
||||
json={
|
||||
"ini_content": "[decky-02]\nservices = http\n",
|
||||
@@ -125,11 +149,9 @@ async def test_replace_fleet_true_overwrites_existing(client, auth_token, monkey
|
||||
},
|
||||
headers={"Authorization": f"Bearer {auth_token}"},
|
||||
)
|
||||
assert r2.status_code == 202, r2.text
|
||||
assert r.status_code == 202, r.text
|
||||
|
||||
committed = await repo.get_state("deployment")
|
||||
assert committed is not None
|
||||
names = {d["name"] for d in committed["config"]["deckies"]}
|
||||
names = {d["name"] for d in await repo.get_deckies()}
|
||||
assert names == {"decky-02"}
|
||||
|
||||
|
||||
@@ -139,25 +161,16 @@ async def test_additive_lifecycle_ids_scoped_to_new_deckies(client, auth_token,
|
||||
the caller submitted, not carryover. Operators polling
|
||||
/deckies/lifecycle?ids=... see exactly what this call deployed."""
|
||||
monkeypatch.setenv("DECNET_MODE", "master")
|
||||
await _seed_fleet("decky-01", ip="192.168.1.10")
|
||||
await _seed_fleet("decky-02", ip="192.168.1.11")
|
||||
|
||||
r1 = await client.post(
|
||||
"/api/v1/deckies/deploy",
|
||||
json={"ini_content": "[decky-01]\nservices = ssh\n[decky-02]\nservices = http\n"},
|
||||
headers={"Authorization": f"Bearer {auth_token}"},
|
||||
)
|
||||
assert r1.status_code == 202, r1.text
|
||||
assert len(r1.json()["lifecycle_ids"]) == 2
|
||||
|
||||
r2 = await client.post(
|
||||
r = await client.post(
|
||||
"/api/v1/deckies/deploy",
|
||||
json={"ini_content": "[decky-03]\nservices = ssh\n"},
|
||||
headers={"Authorization": f"Bearer {auth_token}"},
|
||||
)
|
||||
assert r2.status_code == 202, r2.text
|
||||
body2 = r2.json()
|
||||
assert len(body2["lifecycle_ids"]) == 1
|
||||
assert r.status_code == 202, r.text
|
||||
assert len(r.json()["lifecycle_ids"]) == 1
|
||||
|
||||
committed = await repo.get_state("deployment")
|
||||
assert committed is not None
|
||||
names = {d["name"] for d in committed["config"]["deckies"]}
|
||||
names = {d["name"] for d in await repo.get_deckies()}
|
||||
assert names == {"decky-01", "decky-02", "decky-03"}
|
||||
|
||||
@@ -5,7 +5,7 @@ from hypothesis import given, settings, strategies as st
|
||||
from ..conftest import _FUZZ_SETTINGS
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_get_deckies_endpoint(mock_state_file, client: httpx.AsyncClient, auth_token: str):
|
||||
async def test_get_deckies_endpoint(mock_fleet_deckies, client: httpx.AsyncClient, auth_token: str):
|
||||
_response = await client.get("/api/v1/deckies", headers={"Authorization": f"Bearer {auth_token}"})
|
||||
assert _response.status_code == 200
|
||||
_data = _response.json()
|
||||
|
||||
Reference in New Issue
Block a user