merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

View File

@@ -0,0 +1,380 @@
"""Agent-enrollment bundle flow: POST → .sh → .tgz (one-shot, TTL, races)."""
from __future__ import annotations
import asyncio
import io
import pathlib
import tarfile
from datetime import datetime, timedelta, timezone
import pytest
from decnet.swarm import pki
from decnet.web.router.swarm_mgmt import api_enroll_bundle as mod
@pytest.fixture(autouse=True)
def isolate_bundle_state(tmp_path: pathlib.Path, monkeypatch):
"""Point BUNDLE_DIR + CA into tmp, clear the in-memory registry."""
monkeypatch.setattr(mod, "BUNDLE_DIR", tmp_path / "bundles")
monkeypatch.setattr(pki, "DEFAULT_CA_DIR", tmp_path / "ca")
mod._BUNDLES.clear()
if mod._SWEEPER_TASK is not None and not mod._SWEEPER_TASK.done():
mod._SWEEPER_TASK.cancel()
mod._SWEEPER_TASK = None
yield
# Cleanup sweeper task between tests so they don't accumulate.
if mod._SWEEPER_TASK is not None and not mod._SWEEPER_TASK.done():
mod._SWEEPER_TASK.cancel()
mod._SWEEPER_TASK = None
async def _post(client, auth_token, **overrides):
body = {
"master_host": "10.0.0.50",
"agent_name": "worker-a",
"with_updater": True,
}
body.update(overrides)
return await client.post(
"/api/v1/swarm/enroll-bundle",
headers={"Authorization": f"Bearer {auth_token}"},
json=body,
)
@pytest.mark.anyio
async def test_create_bundle_returns_one_liner(client, auth_token):
resp = await _post(client, auth_token)
assert resp.status_code == 201, resp.text
body = resp.json()
assert body["token"]
assert body["host_uuid"]
assert body["command"].startswith("curl -fsSL ")
assert body["command"].endswith(" | sudo bash")
assert "&&" not in body["command"] # single pipe, no chaining
assert body["token"] in body["command"]
expires = datetime.fromisoformat(body["expires_at"].replace("Z", "+00:00"))
now = datetime.now(timezone.utc)
assert timedelta(minutes=4) < expires - now <= timedelta(minutes=5)
@pytest.mark.anyio
async def test_bundle_urls_use_master_host_not_request_base(client, auth_token):
"""URLs baked into the bootstrap must target the operator-supplied
master_host, not the dashboard's request.base_url (which may be loopback
behind a proxy)."""
resp = await _post(client, auth_token, master_host="10.20.30.40", agent_name="urltest")
assert resp.status_code == 201
body = resp.json()
assert "10.20.30.40" in body["command"]
assert "127.0.0.1" not in body["command"]
assert "testserver" not in body["command"]
token = body["token"]
sh = (await client.get(f"/api/v1/swarm/enroll-bundle/{token}.sh")).text
assert "10.20.30.40" in sh
assert "127.0.0.1" not in sh
assert "testserver" not in sh
@pytest.mark.anyio
async def test_use_ipvlan_opt_in_persists_and_bakes_into_ini(client, auth_token):
"""use_ipvlan=True must persist on the SwarmHost row AND bake `ipvlan = true`
into the agent's decnet.ini so locally-initiated deploys also use IPvlan."""
from decnet.web.dependencies import repo
resp = await _post(client, auth_token, agent_name="ipv-node", use_ipvlan=True)
assert resp.status_code == 201
host_uuid = resp.json()["host_uuid"]
token = resp.json()["token"]
row = await repo.get_swarm_host_by_uuid(host_uuid)
assert row["use_ipvlan"] is True
tgz = await client.get(f"/api/v1/swarm/enroll-bundle/{token}.tgz")
assert tgz.status_code == 200
with tarfile.open(fileobj=io.BytesIO(tgz.content), mode="r:gz") as tar:
ini = tar.extractfile("etc/decnet/decnet.ini").read().decode()
assert "ipvlan = true" in ini
@pytest.mark.anyio
async def test_use_ipvlan_default_false(client, auth_token):
from decnet.web.dependencies import repo
resp = await _post(client, auth_token, agent_name="macv-node")
assert resp.status_code == 201
row = await repo.get_swarm_host_by_uuid(resp.json()["host_uuid"])
assert row["use_ipvlan"] is False
tgz = await client.get(f"/api/v1/swarm/enroll-bundle/{resp.json()['token']}.tgz")
with tarfile.open(fileobj=io.BytesIO(tgz.content), mode="r:gz") as tar:
ini = tar.extractfile("etc/decnet/decnet.ini").read().decode()
assert "ipvlan = false" in ini
@pytest.mark.anyio
async def test_duplicate_agent_name_409(client, auth_token):
r1 = await _post(client, auth_token, agent_name="dup-node")
assert r1.status_code == 201
r2 = await _post(client, auth_token, agent_name="dup-node")
assert r2.status_code == 409
@pytest.mark.anyio
async def test_non_admin_forbidden(client, viewer_token):
resp = await _post(client, viewer_token)
assert resp.status_code == 403
@pytest.mark.anyio
async def test_no_auth_401(client):
resp = await client.post(
"/api/v1/swarm/enroll-bundle",
json={"master_host": "10.0.0.50", "agent_name": "worker-a"},
)
assert resp.status_code == 401
@pytest.mark.anyio
async def test_host_row_address_backfilled_from_tgz_source_ip(client, auth_token):
"""SwarmHosts.address starts blank at enroll time and is populated from
the agent's source IP when it curls the .tgz."""
from decnet.web.dependencies import repo
resp = await _post(client, auth_token, agent_name="addr-test",
master_host="192.168.1.5")
host_uuid = resp.json()["host_uuid"]
token = resp.json()["token"]
row = await repo.get_swarm_host_by_uuid(host_uuid)
assert row["address"] == "" # placeholder until first tgz fetch
tgz = await client.get(f"/api/v1/swarm/enroll-bundle/{token}.tgz")
assert tgz.status_code == 200
row = await repo.get_swarm_host_by_uuid(host_uuid)
# The TestClient client.host depends on httpx's ASGITransport — any
# non-empty value proves the backfill path ran.
assert row["address"] != ""
@pytest.mark.anyio
async def test_updater_opt_out_excludes_updater_artifacts(client, auth_token):
import io, tarfile
token = (await _post(client, auth_token, agent_name="noup", with_updater=False)).json()["token"]
resp = await client.get(f"/api/v1/swarm/enroll-bundle/{token}.tgz")
tf = tarfile.open(fileobj=io.BytesIO(resp.content), mode="r:gz")
names = set(tf.getnames())
assert not any(n.startswith("home/.decnet/updater/") for n in names)
sh_token = (await _post(client, auth_token, agent_name="noup2", with_updater=False)).json()["token"]
sh = (await client.get(f"/api/v1/swarm/enroll-bundle/{sh_token}.sh")).text
assert 'WITH_UPDATER="false"' in sh
@pytest.mark.anyio
async def test_systemd_units_shipped_and_installed(client, auth_token):
import io, tarfile
post = await _post(client, auth_token, agent_name="svc-test", master_host="10.9.8.7")
token = post.json()["token"]
resp = await client.get(f"/api/v1/swarm/enroll-bundle/{token}.tgz")
assert resp.status_code == 200
tf = tarfile.open(fileobj=io.BytesIO(resp.content), mode="r:gz")
names = set(tf.getnames())
assert "etc/systemd/system/decnet-agent.service" in names
assert "etc/systemd/system/decnet-forwarder.service" in names
assert "etc/systemd/system/decnet-engine.service" in names
# Per-host microservices get their own systemd units now.
# Profiler is master-only (uses the master DB) and must NOT ship.
for unit in ("decnet-collector", "decnet-prober", "decnet-sniffer"):
assert f"etc/systemd/system/{unit}.service" in names, unit
assert "etc/systemd/system/decnet-profiler.service" not in names
fwd = tf.extractfile("etc/systemd/system/decnet-forwarder.service").read().decode()
assert "--master-host 10.9.8.7" in fwd
assert "DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.forwarder.log" in fwd
agent_unit = tf.extractfile("etc/systemd/system/decnet-agent.service").read().decode()
assert "--no-forwarder" in agent_unit
assert "DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.agent.log" in agent_unit
sh_token = (await _post(client, auth_token, agent_name="svc-test2",
master_host="10.9.8.7")).json()["token"]
sh = (await client.get(f"/api/v1/swarm/enroll-bundle/{sh_token}.sh")).text
assert "systemctl daemon-reload" in sh
# Agent + forwarder + per-host microservices always enabled; updater
# conditional on WITH_UPDATER.
for unit in (
"decnet-agent.service", "decnet-forwarder.service",
"decnet-collector.service", "decnet-prober.service",
"decnet-sniffer.service",
):
assert unit in sh, unit
assert "decnet-updater.service" in sh
ini = tf.extractfile("etc/decnet/decnet.ini").read().decode()
assert "log-directory = /var/log/decnet" in ini
assert "log-file-path" not in ini
@pytest.mark.anyio
async def test_updater_opt_in_ships_cert_and_enables_systemd_unit(client, auth_token):
import io, tarfile
token = (await _post(client, auth_token, agent_name="up", with_updater=True)).json()["token"]
resp = await client.get(f"/api/v1/swarm/enroll-bundle/{token}.tgz")
tf = tarfile.open(fileobj=io.BytesIO(resp.content), mode="r:gz")
names = set(tf.getnames())
assert "home/.decnet/updater/updater.crt" in names
assert "home/.decnet/updater/updater.key" in names
assert "etc/systemd/system/decnet-updater.service" in names
key_info = tf.getmember("home/.decnet/updater/updater.key")
assert (key_info.mode & 0o777) == 0o600
updater_unit = tf.extractfile("etc/systemd/system/decnet-updater.service").read().decode()
assert "DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.updater.log" in updater_unit
assert "Restart=on-failure" in updater_unit
sh_token = (await _post(client, auth_token, agent_name="up2", with_updater=True)).json()["token"]
sh = (await client.get(f"/api/v1/swarm/enroll-bundle/{sh_token}.sh")).text
assert 'WITH_UPDATER="true"' in sh
assert "decnet-updater.service" in sh
# Old --daemon path is gone — updater is now a systemd service.
assert "decnet updater --daemon" not in sh
@pytest.mark.anyio
async def test_invalid_agent_name_422(client, auth_token):
# Uppercase / underscore not allowed by the regex.
resp = await _post(client, auth_token, agent_name="Bad_Name")
assert resp.status_code in (400, 422)
@pytest.mark.anyio
async def test_get_bootstrap_contains_expected(client, auth_token):
post = await _post(client, auth_token, agent_name="alpha", master_host="master.example")
token = post.json()["token"]
resp = await client.get(f"/api/v1/swarm/enroll-bundle/{token}.sh")
assert resp.status_code == 200
text = resp.text
assert text.startswith("#!/usr/bin/env bash")
assert "alpha" in text
assert "master.example" in text
assert f"/api/v1/swarm/enroll-bundle/{token}.tgz" in text
# Script does NOT try to self-read with $0 (that would break under `curl | bash`).
assert 'tail -n +' not in text and 'awk' not in text
@pytest.mark.anyio
async def test_get_bootstrap_is_idempotent_until_tgz_served(client, auth_token):
token = (await _post(client, auth_token, agent_name="beta")).json()["token"]
for _ in range(3):
assert (await client.get(f"/api/v1/swarm/enroll-bundle/{token}.sh")).status_code == 200
@pytest.mark.anyio
async def test_get_tgz_contents(client, auth_token, tmp_path):
token = (await _post(
client, auth_token,
agent_name="gamma", master_host="10.1.2.3",
services_ini="[general]\nnet = 10.0.0.0/24\n",
)).json()["token"]
resp = await client.get(f"/api/v1/swarm/enroll-bundle/{token}.tgz")
assert resp.status_code == 200
assert resp.headers["content-type"].startswith("application/gzip")
tf = tarfile.open(fileobj=io.BytesIO(resp.content), mode="r:gz")
names = set(tf.getnames())
# Required files
assert "etc/decnet/decnet.ini" in names
assert "home/.decnet/agent/ca.crt" in names
assert "home/.decnet/agent/worker.crt" in names
assert "home/.decnet/agent/worker.key" in names
assert "services.ini" in names
assert "decnet/cli/__init__.py" in names # source shipped
assert "pyproject.toml" in names
# Excluded paths must NOT be shipped
for bad in names:
assert not bad.startswith("tests/"), f"leaked test file: {bad}"
assert not bad.startswith("development/"), f"leaked dev file: {bad}"
assert not bad.startswith("wiki-checkout/"), f"leaked wiki file: {bad}"
assert "__pycache__" not in bad
assert not bad.endswith(".pyc")
assert "node_modules" not in bad
# Dev-host env leaks would bake absolute master paths into the agent.
assert not bad.endswith(".env"), f"leaked env file: {bad}"
assert ".env.local" not in bad, f"leaked env file: {bad}"
assert ".env.example" not in bad, f"leaked env file: {bad}"
# Master-only trees: agents don't run the FastAPI master app, the
# React frontend, the mutator (swarm-wide respawn scheduler), or
# the profiler (rebuilds profiles against the master DB).
assert not bad.startswith("decnet_web/"), f"leaked frontend: {bad}"
assert not bad.startswith("decnet/web/"), f"leaked master-api: {bad}"
assert not bad.startswith("decnet/mutator/"), f"leaked mutator: {bad}"
assert not bad.startswith("decnet/profiler/"), f"leaked profiler: {bad}"
# INI content is correct
ini = tf.extractfile("etc/decnet/decnet.ini").read().decode()
assert "mode = agent" in ini
assert "master-host = 10.1.2.3" in ini
# Key is mode 0600
key_info = tf.getmember("home/.decnet/agent/worker.key")
assert (key_info.mode & 0o777) == 0o600
# Services INI is there
assert tf.extractfile("services.ini").read().decode().startswith("[general]")
@pytest.mark.anyio
async def test_tgz_is_one_shot(client, auth_token):
token = (await _post(client, auth_token, agent_name="delta")).json()["token"]
r1 = await client.get(f"/api/v1/swarm/enroll-bundle/{token}.tgz")
assert r1.status_code == 200
r2 = await client.get(f"/api/v1/swarm/enroll-bundle/{token}.tgz")
assert r2.status_code == 404
# .sh also invalidated after .tgz served (the host is up; replay is pointless)
r3 = await client.get(f"/api/v1/swarm/enroll-bundle/{token}.sh")
assert r3.status_code == 404
@pytest.mark.anyio
async def test_unknown_token_404(client):
assert (await client.get("/api/v1/swarm/enroll-bundle/not-a-real-token.sh")).status_code == 404
assert (await client.get("/api/v1/swarm/enroll-bundle/not-a-real-token.tgz")).status_code == 404
@pytest.mark.anyio
async def test_ttl_expiry_returns_404(client, auth_token, monkeypatch):
token = (await _post(client, auth_token, agent_name="epsilon")).json()["token"]
# Jump the clock 6 minutes into the future.
future = datetime.now(timezone.utc) + timedelta(minutes=6)
monkeypatch.setattr(mod, "_now", lambda: future)
assert (await client.get(f"/api/v1/swarm/enroll-bundle/{token}.sh")).status_code == 404
assert (await client.get(f"/api/v1/swarm/enroll-bundle/{token}.tgz")).status_code == 404
@pytest.mark.anyio
async def test_concurrent_tgz_exactly_one_wins(client, auth_token):
token = (await _post(client, auth_token, agent_name="zeta")).json()["token"]
url = f"/api/v1/swarm/enroll-bundle/{token}.tgz"
r1, r2 = await asyncio.gather(client.get(url), client.get(url))
statuses = sorted([r1.status_code, r2.status_code])
assert statuses == [200, 404]
@pytest.mark.anyio
async def test_host_row_persisted_after_enroll(client, auth_token):
from decnet.web.dependencies import repo
resp = await _post(client, auth_token, agent_name="eta")
assert resp.status_code == 201
body = resp.json()
row = await repo.get_swarm_host_by_uuid(body["host_uuid"])
assert row is not None
assert row["name"] == "eta"
assert row["status"] == "enrolled"

View File

@@ -0,0 +1,215 @@
"""POST /swarm/hosts/{uuid}/teardown — per-host and per-decky remote teardown."""
from __future__ import annotations
import json
from datetime import datetime, timezone
from typing import Optional
import pytest
from decnet.web.router.swarm_mgmt import api_teardown_host as mod
class _FakeAgent:
def __init__(self, *a, **kw):
_FakeAgent.calls.append(("init", kw.get("host", a[0] if a else None)))
self._host = kw.get("host", a[0] if a else None)
async def __aenter__(self):
return self
async def __aexit__(self, *exc):
return None
async def teardown(self, decky_id: Optional[str] = None) -> dict:
_FakeAgent.calls.append(("teardown", decky_id))
return {"status": "torn_down", "decky_id": decky_id}
class _FailingAgent(_FakeAgent):
async def teardown(self, decky_id: Optional[str] = None) -> dict:
raise RuntimeError("network unreachable")
@pytest.fixture
def fake_agent(monkeypatch):
_FakeAgent.calls = []
monkeypatch.setattr(mod, "AgentClient", _FakeAgent)
return _FakeAgent
@pytest.fixture
def failing_agent(monkeypatch):
_FailingAgent.calls = []
monkeypatch.setattr(mod, "AgentClient", _FailingAgent)
return _FailingAgent
async def _seed_host(repo, *, name="worker-a", uuid="h-1") -> str:
await repo.add_swarm_host({
"uuid": uuid,
"name": name,
"address": "10.0.0.9",
"agent_port": 8765,
"status": "active",
"client_cert_fingerprint": "f" * 64,
"cert_bundle_path": "",
"use_ipvlan": False,
"enrolled_at": datetime.now(timezone.utc),
"last_heartbeat": None,
})
return uuid
async def _seed_shard(repo, *, host_uuid: str, decky_name: str) -> None:
await repo.upsert_decky_shard({
"decky_name": decky_name,
"host_uuid": host_uuid,
"services": json.dumps(["ssh"]),
"state": "running",
"last_error": None,
"updated_at": datetime.now(timezone.utc),
})
@pytest.mark.anyio
async def test_teardown_all_deckies_on_host(client, auth_token, fake_agent):
from decnet.web.dependencies import repo
uuid = await _seed_host(repo, name="tear-all", uuid="tear-all-uuid")
await _seed_shard(repo, host_uuid=uuid, decky_name="decky1")
await _seed_shard(repo, host_uuid=uuid, decky_name="decky2")
resp = await client.post(
f"/api/v1/swarm/hosts/{uuid}/teardown",
headers={"Authorization": f"Bearer {auth_token}"},
json={},
)
assert resp.status_code == 202, resp.text
body = resp.json()
assert body["accepted"] is True
assert body["decky_id"] is None
await mod.drain_pending()
assert ("teardown", None) in fake_agent.calls
remaining = await repo.list_decky_shards(uuid)
assert remaining == []
@pytest.mark.anyio
async def test_teardown_single_decky(client, auth_token, fake_agent):
from decnet.web.dependencies import repo
uuid = await _seed_host(repo, name="tear-one", uuid="tear-one-uuid")
await _seed_shard(repo, host_uuid=uuid, decky_name="decky-keep")
await _seed_shard(repo, host_uuid=uuid, decky_name="decky-drop")
resp = await client.post(
f"/api/v1/swarm/hosts/{uuid}/teardown",
headers={"Authorization": f"Bearer {auth_token}"},
json={"decky_id": "decky-drop"},
)
assert resp.status_code == 202, resp.text
assert resp.json()["decky_id"] == "decky-drop"
await mod.drain_pending()
assert ("teardown", "decky-drop") in fake_agent.calls
remaining = {s["decky_name"] for s in await repo.list_decky_shards(uuid)}
assert remaining == {"decky-keep"}
@pytest.mark.anyio
async def test_teardown_returns_immediately_and_marks_tearing_down(
client, auth_token, monkeypatch
):
"""The 202 must fire before the background agent call completes —
otherwise multiple queued teardowns still serialize on the UI."""
import asyncio as _asyncio
from decnet.web.dependencies import repo
gate = _asyncio.Event()
class _SlowAgent(_FakeAgent):
async def teardown(self, decky_id=None):
await gate.wait()
return {"status": "torn_down"}
monkeypatch.setattr(mod, "AgentClient", _SlowAgent)
uuid = await _seed_host(repo, name="slow", uuid="slow-uuid")
await _seed_shard(repo, host_uuid=uuid, decky_name="decky-slow")
resp = await client.post(
f"/api/v1/swarm/hosts/{uuid}/teardown",
headers={"Authorization": f"Bearer {auth_token}"},
json={"decky_id": "decky-slow"},
)
assert resp.status_code == 202
# Agent is still blocked — shard should be in 'tearing_down', not gone.
shards = {s["decky_name"]: s for s in await repo.list_decky_shards(uuid)}
assert shards["decky-slow"]["state"] == "tearing_down"
gate.set()
await mod.drain_pending()
remaining = {s["decky_name"] for s in await repo.list_decky_shards(uuid)}
assert remaining == set()
@pytest.mark.anyio
async def test_teardown_unknown_host_404(client, auth_token, fake_agent):
resp = await client.post(
"/api/v1/swarm/hosts/does-not-exist/teardown",
headers={"Authorization": f"Bearer {auth_token}"},
json={},
)
assert resp.status_code == 404
@pytest.mark.anyio
async def test_teardown_agent_failure_marks_shard_failed(
client, auth_token, failing_agent
):
"""Background-task failure: the shard must NOT be deleted and its
state flips to teardown_failed with the error recorded so the UI
surfaces it."""
from decnet.web.dependencies import repo
uuid = await _seed_host(repo, name="tear-fail", uuid="tear-fail-uuid")
await _seed_shard(repo, host_uuid=uuid, decky_name="survivor")
resp = await client.post(
f"/api/v1/swarm/hosts/{uuid}/teardown",
headers={"Authorization": f"Bearer {auth_token}"},
json={},
)
# Acceptance is unconditional — the failure happens in the background.
assert resp.status_code == 202
await mod.drain_pending()
shards = {s["decky_name"]: s for s in await repo.list_decky_shards(uuid)}
assert "survivor" in shards
assert shards["survivor"]["state"] == "teardown_failed"
assert "network unreachable" in (shards["survivor"]["last_error"] or "")
@pytest.mark.anyio
async def test_teardown_non_admin_forbidden(client, viewer_token, fake_agent):
from decnet.web.dependencies import repo
uuid = await _seed_host(repo, name="tear-guard", uuid="tear-guard-uuid")
resp = await client.post(
f"/api/v1/swarm/hosts/{uuid}/teardown",
headers={"Authorization": f"Bearer {viewer_token}"},
json={},
)
assert resp.status_code == 403
@pytest.mark.anyio
async def test_teardown_no_auth_401(client, fake_agent):
resp = await client.post(
"/api/v1/swarm/hosts/whatever/teardown",
json={},
)
assert resp.status_code == 401