merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
0
tests/swarm/__init__.py
Normal file
0
tests/swarm/__init__.py
Normal file
95
tests/swarm/test_agent_app.py
Normal file
95
tests/swarm/test_agent_app.py
Normal file
@@ -0,0 +1,95 @@
|
||||
"""Agent FastAPI app — static/contract checks only.
|
||||
|
||||
We deliberately do NOT spin uvicorn up in-process here: the mTLS layer is
|
||||
enforced by uvicorn itself (via --ssl-cert-reqs 2) and is validated in the
|
||||
VM integration suite. What we CAN assert in unit scope is the route
|
||||
surface + request/response schema.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from decnet.agent.app import app
|
||||
|
||||
|
||||
def test_health_endpoint() -> None:
    """GET /health answers 200 with the fixed ``{"status": "ok"}`` body."""
    response = TestClient(app).get("/health")
    assert response.status_code == 200
    assert response.json() == {"status": "ok"}
|
||||
|
||||
|
||||
def test_status_when_not_deployed() -> None:
    """GET /status answers 200 and its body carries both expected keys."""
    response = TestClient(app).get("/status")
    assert response.status_code == 200
    payload = response.json()
    assert "deployed" in payload
    assert "deckies" in payload
|
||||
|
||||
|
||||
def test_mutate_is_501() -> None:
    """POST /mutate with a well-formed body answers 501."""
    request_body = {"decky_id": "decky-01", "services": ["ssh"]}
    response = TestClient(app).post("/mutate", json=request_body)
    assert response.status_code == 501
|
||||
|
||||
|
||||
def test_deploy_rejects_malformed_body() -> None:
    """POST /deploy with a body that is not a config answers 422."""
    bogus_body = {"not": "a config"}
    response = TestClient(app).post("/deploy", json=bogus_body)
    # 422 is what pydantic validation failures surface as.
    assert response.status_code == 422
|
||||
|
||||
|
||||
def test_route_set() -> None:
    """Every expected agent path is registered on the app."""
    expected = {"/health", "/status", "/deploy", "/teardown", "/mutate", "/self-destruct"}
    # Some route objects carry no ``path`` attribute; skip those.
    registered = {route.path for route in app.routes if hasattr(route, "path")}
    assert expected.issubset(registered)
|
||||
|
||||
|
||||
def test_self_destruct_spawns_reaper_and_returns_fast(monkeypatch, tmp_path) -> None:
    """/self-destruct must write the reaper script and spawn it detached
    (start_new_session=True). We intercept Popen so the test doesn't
    actually nuke anything.

    The deployer teardown and ``clear_state`` are stubbed out as well. The
    reaper script found in the spawn arguments is always removed afterwards,
    even when one of the content assertions fails, so failed runs do not
    leave files behind in /tmp.
    """
    import pathlib
    import subprocess as _sp

    from decnet.agent import executor as _exec

    spawned: list[dict] = []

    class _FakePopen:
        """Records constructor arguments instead of starting a process."""

        def __init__(self, args, **kw):
            spawned.append({"args": args, "kw": kw})

    monkeypatch.setattr(_exec, "_deployer", type("X", (), {
        "teardown": staticmethod(lambda _id: None),
    })())
    monkeypatch.setattr(_exec, "clear_state", lambda: None)
    monkeypatch.setattr(_sp, "Popen", _FakePopen)

    client = TestClient(app)
    resp = client.post("/self-destruct")
    assert resp.status_code == 200
    assert resp.json()["status"] == "self_destruct_scheduled"
    assert len(spawned) == 1
    assert spawned[0]["kw"].get("start_new_session") is True
    script_candidates = [
        a for a in spawned[0]["args"]
        if isinstance(a, str) and a.startswith("/tmp/decnet-reaper-")
    ]
    assert len(script_candidates) == 1, spawned[0]["args"]
    script_path = pathlib.Path(script_candidates[0])

    try:
        # Reaper content sanity check — covers the paths the operator asked for.
        body = script_path.read_text()
        assert "/opt/decnet*" in body
        assert "/etc/systemd/system/decnet-" in body
        assert "/var/lib/decnet/*" in body
        assert "/usr/local/bin/decnet*" in body
        assert "/etc/decnet" in body
        # Logs must be preserved — no `rm` line should touch /var/log.
        for line in body.splitlines():
            stripped = line.strip()
            if stripped.startswith("#") or not stripped:
                continue
            if stripped.startswith("rm "):
                assert "/var/log" not in stripped
    finally:
        # Clean up regardless of assertion outcome.
        script_path.unlink(missing_ok=True)
|
||||
122
tests/swarm/test_agent_heartbeat.py
Normal file
122
tests/swarm/test_agent_heartbeat.py
Normal file
@@ -0,0 +1,122 @@
|
||||
"""Tests for the worker-side heartbeat loop (decnet.agent.heartbeat)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from decnet.agent import heartbeat as hb
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_module_task(monkeypatch: pytest.MonkeyPatch):
    """Null out ``hb._task`` around every test.

    Gives each case a fresh task slot so start()/stop() state cannot leak
    from one test into the next.
    """

    def _clear_slot() -> None:
        monkeypatch.setattr(hb, "_task", None)

    _clear_slot()
    yield
    _clear_slot()
|
||||
|
||||
|
||||
class _StubTransport(httpx.AsyncBaseTransport):
    """In-memory transport: logs every request, answers via ``responder(req)``.

    ``calls`` holds one ``{"url": ..., "body": ...}`` dict per request, in
    arrival order.
    """

    def __init__(self, responder):
        self._responder = responder
        self.calls: list[dict[str, Any]] = []

    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        """Record the request's URL and raw body, then delegate the reply."""
        raw_body = request.read()
        entry = {"url": str(request.url), "body": raw_body}
        self.calls.append(entry)
        return self._responder(request)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tick_posts_status_snapshot_and_accepts_204(monkeypatch) -> None:
    """A single ``_tick`` issues one request carrying uuid, version and status."""
    import json

    async def _idle_status() -> dict:
        return {"deployed": False, "deckies": []}

    monkeypatch.setattr(hb._exec, "status", _idle_status)

    stub = _StubTransport(lambda req: httpx.Response(204))
    async with httpx.AsyncClient(transport=stub) as http:
        await hb._tick(http, "https://m/swarm/heartbeat", "uuid-a", "1.2.3")

    assert len(stub.calls) == 1
    sent = json.loads(stub.calls[0]["body"])
    assert sent["host_uuid"] == "uuid-a"
    assert sent["agent_version"] == "1.2.3"
    assert sent["status"]["deployed"] is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tick_logs_on_non_204_response(monkeypatch, caplog) -> None:
    """A 403 reply makes ``_tick`` emit a log record mentioning "rejected"."""

    async def _idle_status() -> dict:
        return {"deployed": False}

    monkeypatch.setattr(hb._exec, "status", _idle_status)
    stub = _StubTransport(lambda req: httpx.Response(403, text="mismatch"))

    async with httpx.AsyncClient(transport=stub) as http:
        with caplog.at_level("WARNING", logger="agent.heartbeat"):
            await hb._tick(http, "https://m/swarm/heartbeat", "uuid-a", "1.2.3")

    messages = [record.getMessage() for record in caplog.records]
    assert any("rejected" in message for message in messages)
|
||||
|
||||
|
||||
def test_start_is_noop_when_identity_missing(monkeypatch) -> None:
    """With no host UUID and no master host, ``start()`` returns None.

    It must never raise on this path — dev runs hit it every time — and it
    must leave ``hb._task`` unset.
    """
    import decnet.env as env

    for setting in ("DECNET_HOST_UUID", "DECNET_MASTER_HOST"):
        monkeypatch.setattr(env, setting, None)

    outcome = hb.start()
    assert outcome is None
    assert hb._task is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_start_is_noop_when_ssl_context_unavailable(
    monkeypatch, tmp_path
) -> None:
    """Identity is configured but the agent dir is empty: ``start()`` is None.

    The worker bundle is missing on disk, so start() is expected to bail
    out rather than crash the FastAPI app.
    """
    import decnet.env as env

    identity = {
        "DECNET_HOST_UUID": "uuid-a",
        "DECNET_MASTER_HOST": "master.lan",
        "DECNET_SWARMCTL_PORT": 8770,
    }
    for setting, value in identity.items():
        monkeypatch.setattr(env, setting, value)
    monkeypatch.setenv("DECNET_AGENT_DIR", str(tmp_path / "empty"))

    assert hb.start() is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_loop_keeps_ticking_after_5xx_failures(monkeypatch) -> None:
    """Flapping master: two 503 replies, then a 204 — ticking must continue.

    The retry loop here is driven by the test itself: it calls ``_tick``
    until the stub transport has seen three requests, ignoring any error a
    tick raises and sleeping briefly between attempts.
    """
    call_count = {"n": 0}

    def _responder(req):
        call_count["n"] += 1
        succeeded = call_count["n"] >= 3
        if succeeded:
            return httpx.Response(204)
        return httpx.Response(503, text="unavailable")

    async def _idle_status() -> dict:
        return {"deployed": False}

    monkeypatch.setattr(hb._exec, "status", _idle_status)
    monkeypatch.setattr(hb, "INTERVAL_S", 0.01)  # fast-forward the sleep

    stub = _StubTransport(_responder)

    async def _drive_ticks():
        async with httpx.AsyncClient(transport=stub) as http:
            while call_count["n"] < 3:
                try:
                    await hb._tick(http, "https://m/swarm/heartbeat", "uuid-a", "1.2.3")
                except Exception:
                    # A failing tick must not end the loop; just retry.
                    pass
                await asyncio.sleep(0.01)

    await asyncio.wait_for(_drive_ticks(), timeout=2.0)
    assert call_count["n"] >= 3
|
||||
147
tests/swarm/test_agent_no_auto_restore.py
Normal file
147
tests/swarm/test_agent_no_auto_restore.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""Step 8 regression: the agent must NEVER auto-restore a topology on boot.
|
||||
|
||||
Guiding rule: master is authoritative, agent is a dumb executor. If an
|
||||
agent restarts with a stale applied_topology row in its local cache, it
|
||||
must not try to replay `docker-compose up` on its own — that would
|
||||
create a split-brain where a decommissioned topology suddenly reappears
|
||||
without the master's consent. Instead the agent simply reports whatever
|
||||
it has via GET /topology/state + heartbeat; master decides whether to
|
||||
re-push.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from decnet.agent import app as agent_app
|
||||
from decnet.agent.topology_store import TopologyStore
|
||||
|
||||
|
||||
def _seed_applied_row(db_path: Path, topology_id: str, hash_: str) -> None:
    """Pre-populate the store at *db_path* with one applied-topology row.

    Stands in for a cache left behind by an earlier process lifecycle. The
    parent directory is created if needed and the store is always closed.
    """
    db_path.parent.mkdir(parents=True, exist_ok=True)
    blob = {"topology": {"id": topology_id}}
    seeded = TopologyStore(db_path)
    try:
        seeded.put(topology_id, hash_, blob)
    finally:
        seeded.close()
|
||||
|
||||
|
||||
@pytest.fixture
def agent_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
    """Create ``<tmp_path>/agent`` and point DECNET_AGENT_DIR at it."""
    workdir = tmp_path / "agent"
    workdir.mkdir()
    monkeypatch.setenv("DECNET_AGENT_DIR", str(workdir))
    # Drop the module-level cached store so this test's DECNET_AGENT_DIR
    # is the one that gets honoured.
    monkeypatch.setattr(agent_app, "_topology_store", None)
    return workdir
|
||||
|
||||
|
||||
def test_lifespan_startup_does_not_touch_docker(
    agent_dir: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Booting the app over a populated topology.db must not call docker.

    ``docker.from_env`` is replaced with a tripwire that records the call
    and raises. The agent has to wait for master instructions rather than
    self-heal from local state.
    """
    import docker as _docker

    _seed_applied_row(agent_dir / "topology.db", "stale-tid", "stale-hash")

    calls: list[str] = []

    def _tripwire(*_a, **_k):
        calls.append("docker.from_env")
        raise AssertionError("agent must not touch docker during startup")

    monkeypatch.setattr(_docker, "from_env", _tripwire)

    # Entering the client context runs the lifespan, which is where any
    # auto-restore hook would fire.
    with TestClient(agent_app.app) as client:
        # Sanity: health is live, no apply was triggered.
        health = client.get("/health")
        assert health.status_code == 200

    assert calls == [], "docker was contacted during agent boot"
|
||||
|
||||
|
||||
def test_get_topology_state_reflects_cache_without_replay(
    agent_dir: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """GET /topology/state echoes the cached id/hash exactly as stored.

    The endpoint may *observe* live docker state (read-only); a stub with
    empty listings is supplied so no real docker is needed. It must never
    re-materialise bridges or containers from the cache.
    """
    import docker as _docker

    _seed_applied_row(agent_dir / "topology.db", "t-boot", "h-boot")

    class _StubDocker:
        class networks:
            @staticmethod
            def list():
                return []

        class containers:
            @staticmethod
            def list(all=False):
                return []

    monkeypatch.setattr(_docker, "from_env", lambda: _StubDocker)

    with TestClient(agent_app.app) as client:
        response = client.get("/topology/state")

    assert response.status_code == 200, response.text
    state = response.json()
    assert state["topology_id"] == "t-boot"
    assert state["applied_version_hash"] == "h-boot"
    # observed is read-only — empty live state is fine, it's what the
    # master uses to decide whether to re-push.
    assert state["observed"] == {"bridges": [], "containers": []}
|
||||
|
||||
|
||||
def test_topology_store_has_no_restore_hook() -> None:
    """Static guard against restore-style public methods on TopologyStore.

    Adding `restore()` / `replay()` or a similar method makes this fail,
    which forces a re-read of the module docstring and the Step 8
    rationale before merging.
    """
    forbidden = {"restore", "replay", "reapply", "rehydrate", "auto_restore"}
    public_names = set()
    for attr in dir(TopologyStore):
        if attr.startswith("_"):
            continue
        public_names.add(attr)
    overlap = forbidden & public_names
    assert not overlap, (
        f"TopologyStore must stay a passive cache — found {overlap}. "
        "The agent never self-heals; master decides."
    )
|
||||
|
||||
|
||||
def test_seeded_db_survives_process_restart_verbatim(tmp_path: Path) -> None:
    """Opening a pre-populated store in a fresh process yields the same
    row — no on-open mutation, no stale-row scrubbing. This is the
    behavior the master relies on for the 'agent reports old hash →
    needs_resync' detection path.

    Every handle opened here (both stores and the raw sqlite connection)
    is closed even when an assertion or a store call fails.
    """
    from contextlib import closing

    db = tmp_path / "t.db"

    # Process 1.
    s1 = TopologyStore(db)
    try:
        s1.put("t-x", "h-x", {"topology": {"id": "t-x"}})
    finally:
        s1.close()

    # Raw sqlite read — confirms nothing in the file rewrites itself
    # between opens. sqlite3's own context manager only commits or rolls
    # back; closing() is what actually releases the connection.
    with closing(sqlite3.connect(str(db))) as raw:
        row = raw.execute(
            "SELECT topology_id, applied_version_hash, hydrated_blob_json"
            " FROM applied_topology"
        ).fetchone()
    assert row[0] == "t-x"
    assert row[1] == "h-x"
    assert json.loads(row[2]) == {"topology": {"id": "t-x"}}

    # Process 2 (new store, same file).
    s2 = TopologyStore(db)
    try:
        cur = s2.current()
        assert cur is not None
        assert cur.topology_id == "t-x"
        assert cur.applied_version_hash == "h-x"
    finally:
        s2.close()
|
||||
118
tests/swarm/test_agent_relocalize.py
Normal file
118
tests/swarm/test_agent_relocalize.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""Worker agent re-localizes master-built configs to its own NIC/subnet.
|
||||
|
||||
The master ships a DecnetConfig populated from *its own* network (master
|
||||
NIC name, master subnet, master-chosen decky IPs). The worker cannot run
|
||||
the deployer against that as-is: `ip addr show <master-nic>` blows up on
|
||||
any worker whose NIC differs from the master's, which is ~always the
|
||||
case in a heterogeneous fleet.
|
||||
|
||||
The agent's executor overrides interface/subnet/gateway/host_ip with
|
||||
locally-detected values before calling into the deployer, and if the
|
||||
subnet doesn't match, it re-allocates decky IPs from the local subnet.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from decnet.agent import executor
|
||||
from decnet.models import DecnetConfig, DeckyConfig
|
||||
|
||||
|
||||
def _cfg(subnet: str, interface: str = "wlp6s0") -> DecnetConfig:
    """Build a swarm-mode config with two ssh deckies on *subnet*.

    The gateway is ``<prefix>.1`` and the deckies get ``<prefix>.11`` and
    ``<prefix>.12``, where the prefix is *subnet* up to its last dot.
    """
    prefix = subnet.rsplit(".", 1)[0]
    deckies = []
    for index in (1, 2):
        label = f"decky-0{index}"
        deckies.append(
            DeckyConfig(
                name=label,
                ip=prefix + f".{10 + index}",
                services=["ssh"],
                distro="debian",
                base_image="debian:bookworm-slim",
                hostname=label,
            )
        )
    return DecnetConfig(
        mode="swarm",
        interface=interface,
        subnet=subnet,
        gateway=prefix + ".1",
        deckies=deckies,
    )
|
||||
|
||||
|
||||
def test_relocalize_swaps_interface_and_subnet(monkeypatch: pytest.MonkeyPatch) -> None:
    """A config from a foreign subnet is rewritten with locally detected values."""

    def _local_ips(**kw):
        return [f"10.0.0.{20 + i}" for i in range(kw["count"])]

    monkeypatch.setattr(executor, "detect_interface", lambda: "enp0s3")
    monkeypatch.setattr(executor, "detect_subnet", lambda _i: ("10.0.0.0/24", "10.0.0.1"))
    monkeypatch.setattr(executor, "get_host_ip", lambda _i: "10.0.0.99")
    monkeypatch.setattr(executor, "allocate_ips", _local_ips)

    relocalized = executor._relocalize(_cfg("192.168.1.0/24"))

    assert relocalized.interface == "enp0s3"
    assert relocalized.subnet == "10.0.0.0/24"
    assert relocalized.gateway == "10.0.0.1"
    deckies = relocalized.deckies
    # Subnet changed → IPs re-allocated from the worker's subnet.
    assert [d.ip for d in deckies] == ["10.0.0.20", "10.0.0.21"]
    # Non-network fields survive.
    assert [d.name for d in deckies] == ["decky-01", "decky-02"]
    assert [d.services for d in deckies] == [["ssh"], ["ssh"]]
|
||||
|
||||
|
||||
def test_relocalize_keeps_ips_when_subnet_matches(monkeypatch: pytest.MonkeyPatch) -> None:
    """When the detected subnet equals the incoming one, decky IPs are kept."""

    def _must_not_allocate(**_kw):  # pragma: no cover
        # The matching-subnet branch must never reach the allocator.
        raise AssertionError("allocate_ips must not be called when subnets match")

    monkeypatch.setattr(executor, "detect_interface", lambda: "enp0s3")
    monkeypatch.setattr(executor, "detect_subnet", lambda _i: ("192.168.1.0/24", "192.168.1.1"))
    monkeypatch.setattr(executor, "get_host_ip", lambda _i: "192.168.1.50")
    monkeypatch.setattr(executor, "allocate_ips", _must_not_allocate)

    relocalized = executor._relocalize(_cfg("192.168.1.0/24"))

    assert relocalized.interface == "enp0s3"
    assert relocalized.subnet == "192.168.1.0/24"
    # Decky IPs preserved verbatim.
    kept_ips = [d.ip for d in relocalized.deckies]
    assert kept_ips == ["192.168.1.11", "192.168.1.12"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_deploy_relocalizes_before_calling_deployer(monkeypatch: pytest.MonkeyPatch) -> None:
    """End-to-end: the deployer receives the local interface, not the master's."""
    monkeypatch.setattr(executor, "detect_interface", lambda: "enp0s3")
    monkeypatch.setattr(executor, "detect_subnet", lambda _i: ("192.168.1.0/24", "192.168.1.1"))
    monkeypatch.setattr(executor, "get_host_ip", lambda _i: "192.168.1.50")

    seen: dict = {}

    def _fake_deploy(cfg, dry_run, no_cache, parallel):
        # Capture only the two network fields the assertion cares about.
        seen.update(interface=cfg.interface, subnet=cfg.subnet)

    monkeypatch.setattr(executor._deployer, "deploy", _fake_deploy)

    master_cfg = _cfg("192.168.1.0/24", interface="wlp6s0-master")
    await executor.deploy(master_cfg, dry_run=True)
    assert seen == {"interface": "enp0s3", "subnet": "192.168.1.0/24"}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_deploy_unihost_mode_skips_relocalize(monkeypatch: pytest.MonkeyPatch) -> None:
    """A unihost config reaches the deployer with its interface untouched.

    Such configs were already built against the local box, so interface
    detection must not run at all.
    """

    def _must_not_detect(*_a, **_kw):  # pragma: no cover
        raise AssertionError("detect_interface must not be called for unihost")

    captured: dict = {}

    def _fake_deploy(cfg, dry_run, no_cache, parallel):
        captured["interface"] = cfg.interface

    monkeypatch.setattr(executor, "detect_interface", _must_not_detect)
    monkeypatch.setattr(executor._deployer, "deploy", _fake_deploy)

    swarm_cfg = _cfg("192.168.1.0/24", interface="eth0")
    cfg = swarm_cfg.model_copy(update={"mode": "unihost"})
    await executor.deploy(cfg, dry_run=True)
    assert captured["interface"] == "eth0"
|
||||
168
tests/swarm/test_agent_topology_endpoints.py
Normal file
168
tests/swarm/test_agent_topology_endpoints.py
Normal file
@@ -0,0 +1,168 @@
|
||||
"""Agent topology endpoints — contract-level tests with mocked ops."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from decnet.agent import app as _agent_app
|
||||
from decnet.agent import topology_ops as _ops
|
||||
from decnet.agent.topology_store import AlreadyApplied
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _isolate_store(monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path):
    """Aim DECNET_AGENT_DIR at a tmp dir and reset the store singleton.

    The cached ``_agent_app._topology_store`` is closed (if set) and
    cleared both before and after each test.
    """

    def _discard_cached_store() -> None:
        cached = _agent_app._topology_store
        if cached is not None:
            cached.close()
        _agent_app._topology_store = None

    monkeypatch.setenv("DECNET_AGENT_DIR", str(tmp_path))
    # Force a fresh store per test.
    _discard_cached_store()
    yield
    _discard_cached_store()
|
||||
|
||||
|
||||
def _hydrated(topology_id: str = "top-1") -> dict:
|
||||
return {
|
||||
"topology": {"id": topology_id, "name": "n", "mode": "agent"},
|
||||
"lans": [],
|
||||
"deckies": [],
|
||||
"edges": [],
|
||||
}
|
||||
|
||||
|
||||
def test_topology_state_idle() -> None:
    """With nothing applied, /topology/state reports null id and hash."""
    response = TestClient(_agent_app.app).get("/topology/state")
    assert response.status_code == 200
    state = response.json()
    assert state["topology_id"] is None
    assert state["applied_version_hash"] is None
    assert "observed" in state
|
||||
|
||||
|
||||
def test_topology_apply_routes_to_ops(monkeypatch: pytest.MonkeyPatch) -> None:
    """POST /topology/apply hands the blob and hash to ``topology_ops.apply``."""
    called: dict = {}

    async def _fake_apply(hydrated, version_hash, store):
        called.update(hydrated=hydrated, version_hash=version_hash)
        # Simulate ops bookkeeping.
        store.put(hydrated["topology"]["id"], version_hash, hydrated)

    monkeypatch.setattr(_ops, "apply", _fake_apply)

    request_body = {"hydrated": _hydrated(), "version_hash": "abc"}
    response = TestClient(_agent_app.app).post("/topology/apply", json=request_body)

    assert response.status_code == 200, response.text
    assert response.json() == {"status": "applied", "version_hash": "abc"}
    assert called["version_hash"] == "abc"
|
||||
|
||||
|
||||
def test_topology_apply_hash_mismatch_is_400(monkeypatch: pytest.MonkeyPatch) -> None:
    """A ``HashMismatch`` from ops surfaces as 400 with "hash" in the detail."""

    async def _raise_mismatch(*_a, **_kw):
        raise _ops.HashMismatch("master hash != agent hash")

    monkeypatch.setattr(_ops, "apply", _raise_mismatch)

    request_body = {"hydrated": _hydrated(), "version_hash": "wrong"}
    response = TestClient(_agent_app.app).post("/topology/apply", json=request_body)

    assert response.status_code == 400
    detail = response.json()["detail"]
    assert "hash" in detail.lower()
|
||||
|
||||
|
||||
def test_topology_apply_conflict_is_409(monkeypatch: pytest.MonkeyPatch) -> None:
    """An ``AlreadyApplied`` from ops surfaces as 409."""

    async def _raise_conflict(*_a, **_kw):
        raise AlreadyApplied("another topology already applied")

    monkeypatch.setattr(_ops, "apply", _raise_conflict)

    request_body = {"hydrated": _hydrated("top-2"), "version_hash": "h"}
    response = TestClient(_agent_app.app).post("/topology/apply", json=request_body)

    assert response.status_code == 409
|
||||
|
||||
|
||||
def test_topology_apply_docker_failure_is_500_and_records_error(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A generic apply failure answers 500 and is persisted for /topology/state."""
    import docker as _docker

    async def _raise_docker_down(*_a, **_kw):
        raise RuntimeError("docker down")

    # Stand-in for docker.from_env so the /topology/state observed() call
    # works without a real daemon.
    class _StubDocker:
        class networks:
            @staticmethod
            def list():
                return []

        class containers:
            @staticmethod
            def list(all=False):
                return []

    monkeypatch.setattr(_ops, "apply", _raise_docker_down)
    monkeypatch.setattr(_docker, "from_env", lambda: _StubDocker)

    client = TestClient(_agent_app.app)
    request_body = {"hydrated": _hydrated("top-err"), "version_hash": "h"}
    failure = client.post("/topology/apply", json=request_body)
    assert failure.status_code == 500
    assert "docker down" in failure.json()["detail"]

    # The error must be persisted so GET /topology/state surfaces it,
    # and the stored hash stays empty so master's heartbeat check flags
    # the topology for resync rather than assuming it's applied.
    state = client.get("/topology/state").json()
    assert state["topology_id"] == "top-err"
    assert state["applied_version_hash"] == ""
    assert state["last_error"] == "docker down"
|
||||
|
||||
|
||||
def test_topology_teardown_routes_to_ops(monkeypatch: pytest.MonkeyPatch) -> None:
    """POST /topology/teardown passes the topology id to ``topology_ops.teardown``."""
    called: dict = {}

    async def _fake_teardown(topology_id, store):
        called["topology_id"] = topology_id
        store.clear(topology_id)

    monkeypatch.setattr(_ops, "teardown", _fake_teardown)

    request_body = {"topology_id": "top-gone"}
    response = TestClient(_agent_app.app).post("/topology/teardown", json=request_body)

    assert response.status_code == 200
    assert called["topology_id"] == "top-gone"
|
||||
|
||||
|
||||
def test_topology_teardown_failure_is_500(monkeypatch: pytest.MonkeyPatch) -> None:
    """A RuntimeError raised by teardown surfaces as 500."""

    async def _raise_refused(*_a, **_kw):
        raise RuntimeError("compose refused")

    monkeypatch.setattr(_ops, "teardown", _raise_refused)

    request_body = {"topology_id": "top-1"}
    response = TestClient(_agent_app.app).post("/topology/teardown", json=request_body)

    assert response.status_code == 500
|
||||
|
||||
|
||||
def test_routes_registered() -> None:
    """All three topology paths are registered on the agent app."""
    expected = {"/topology/apply", "/topology/teardown", "/topology/state"}
    registered = {
        route.path for route in _agent_app.app.routes if hasattr(route, "path")
    }
    assert expected.issubset(registered)
|
||||
160
tests/swarm/test_agent_topology_store.py
Normal file
160
tests/swarm/test_agent_topology_store.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""Tests for :mod:`decnet.agent.topology_store`."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
|
||||
import pytest
|
||||
|
||||
from decnet.agent.topology_store import (
|
||||
AlreadyApplied,
|
||||
TopologyStore,
|
||||
observed,
|
||||
)
|
||||
|
||||
|
||||
def _store(tmp_path: pathlib.Path) -> TopologyStore:
    """Open a TopologyStore backed by ``topology.db`` inside *tmp_path*."""
    db_file = tmp_path / "topology.db"
    return TopologyStore(db_file)
|
||||
|
||||
|
||||
def test_idle_by_default(tmp_path: pathlib.Path) -> None:
    """A store opened on a fresh path reports no applied topology."""
    s = _store(tmp_path)
    try:
        assert s.current() is None
    finally:
        # Close even when the assertion fails so the handle is not leaked.
        s.close()
|
||||
|
||||
|
||||
def test_put_then_current(tmp_path: pathlib.Path) -> None:
    """After ``put``, ``current()`` returns the id, hash and blob, with no error."""
    s = _store(tmp_path)
    try:
        s.put("t1", "hash-a", {"topology": {"id": "t1"}, "lans": []})
        row = s.current()
        assert row is not None
        assert row.topology_id == "t1"
        assert row.applied_version_hash == "hash-a"
        assert row.hydrated["topology"]["id"] == "t1"
        assert row.last_error is None
    finally:
        # Close even when an assertion fails so the handle is not leaked.
        s.close()
|
||||
|
||||
|
||||
def test_put_same_id_is_idempotent_update(tmp_path: pathlib.Path) -> None:
    """A second ``put`` for the same id overwrites the hash and the blob."""
    s = _store(tmp_path)
    try:
        s.put("t1", "hash-a", {"k": 1})
        s.put("t1", "hash-b", {"k": 2})
        row = s.current()
        assert row is not None
        assert row.applied_version_hash == "hash-b"
        assert row.hydrated == {"k": 2}
    finally:
        # Close even when an assertion fails so the handle is not leaked.
        s.close()
|
||||
|
||||
|
||||
def test_put_different_id_rejected(tmp_path: pathlib.Path) -> None:
    """``put`` for a second topology id raises ``AlreadyApplied``."""
    s = _store(tmp_path)
    try:
        s.put("t1", "hash-a", {})
        with pytest.raises(AlreadyApplied):
            s.put("t2", "hash-b", {})
    finally:
        # Close even when the expectation fails so the handle is not leaked.
        s.close()
|
||||
|
||||
|
||||
def test_record_error_then_put_clears(tmp_path: pathlib.Path) -> None:
    """A recorded error is visible on the row and cleared by the next ``put``."""
    s = _store(tmp_path)
    try:
        s.put("t1", "h", {})
        s.record_error("t1", "kaboom")
        assert s.current().last_error == "kaboom"
        # Re-applying clears the error flag.
        s.put("t1", "h2", {})
        assert s.current().last_error is None
    finally:
        # Close even when an assertion fails so the handle is not leaked.
        s.close()
|
||||
|
||||
|
||||
def test_record_error_upserts_when_no_prior_row(tmp_path: pathlib.Path) -> None:
    """Apply failure mid-materialise: put() hasn't written a row yet but
    we still want the error surfaced on GET /topology/state and the
    next heartbeat. The marker uses empty hash so master sees drift."""
    s = _store(tmp_path)
    try:
        s.record_error("t-fail", "docker refused connection")
        row = s.current()
        assert row is not None
        assert row.topology_id == "t-fail"
        assert row.applied_version_hash == ""
        assert row.applied_at == 0
        assert row.last_error == "docker refused connection"
    finally:
        # Close even when an assertion fails so the handle is not leaked.
        s.close()
|
||||
|
||||
|
||||
def test_record_error_then_successful_put_replaces_marker(tmp_path: pathlib.Path) -> None:
    """Once a retry succeeds, the marker row must be replaced with a
    real applied row — no stale error or empty hash left behind."""
    s = _store(tmp_path)
    try:
        s.record_error("t-retry", "first try failed")
        s.put("t-retry", "real-hash", {"topology": {"id": "t-retry"}})
        row = s.current()
        # Fail with a clear assertion rather than an AttributeError on None.
        assert row is not None
        assert row.applied_version_hash == "real-hash"
        assert row.last_error is None
        assert row.applied_at > 0
    finally:
        # Close even when an assertion fails so the handle is not leaked.
        s.close()
|
||||
|
||||
|
||||
def test_clear(tmp_path: pathlib.Path) -> None:
    """``clear`` removes the applied row; clearing it again does not raise."""
    s = _store(tmp_path)
    try:
        s.put("t1", "h", {})
        s.clear("t1")
        assert s.current() is None
        # Clearing a missing id is a no-op (teardown idempotency).
        s.clear("t1")
    finally:
        # Close even when an assertion fails so the handle is not leaked.
        s.close()
|
||||
|
||||
|
||||
def test_persists_across_reopen(tmp_path: pathlib.Path) -> None:
    """A row written by one store instance is visible to a later instance."""
    s = _store(tmp_path)
    try:
        s.put("t1", "h", {"x": 1})
    finally:
        s.close()

    s2 = _store(tmp_path)
    try:
        row = s2.current()
        assert row is not None
        assert row.topology_id == "t1"
    finally:
        # Close even when an assertion fails so the handle is not leaked.
        s2.close()
|
||||
|
||||
|
||||
# -------------------------------------------------------- observed() helper
|
||||
|
||||
|
||||
class _FakeNet:
|
||||
def __init__(self, name: str, driver: str) -> None:
|
||||
self.name = name
|
||||
self.attrs = {"Driver": driver}
|
||||
|
||||
|
||||
class _FakeContainer:
|
||||
def __init__(self, name: str) -> None:
|
||||
self.name = name
|
||||
|
||||
|
||||
class _FakeDocker:
|
||||
def __init__(self, nets, containers) -> None:
|
||||
self.networks = type("N", (), {"list": lambda _self: nets})()
|
||||
self.containers = type(
|
||||
"C", (), {"list": lambda _self, all=False: containers}
|
||||
)()
|
||||
|
||||
|
||||
def test_observed_filters_by_prefix() -> None:
    """``observed`` reports only the matching bridge network and container."""
    fake_nets = [
        _FakeNet("decnet-topology-abc", "bridge"),
        _FakeNet("bridge", "bridge"),
        _FakeNet("decnet-topology-xyz", "overlay"),  # wrong driver — filtered
    ]
    fake_containers = [
        _FakeContainer(label) for label in ("decnet-deaddeck", "sshd")
    ]
    snapshot = observed(_FakeDocker(fake_nets, fake_containers))
    expected = {
        "bridges": ["decnet-topology-abc"],
        "containers": ["decnet-deaddeck"],
    }
    assert snapshot == expected
|
||||
|
||||
|
||||
def test_observed_reports_error_on_failure() -> None:
    """A client whose ``networks`` raises yields a snapshot with an ``error`` key."""

    class _Broken:
        @property
        def networks(self):
            raise RuntimeError("docker down")

    snapshot = observed(_Broken())
    assert "error" in snapshot
    message = snapshot["error"]
    assert "docker down" in message
|
||||
39
tests/swarm/test_cli_forwarder.py
Normal file
39
tests/swarm/test_cli_forwarder.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""CLI surface for `decnet forwarder`. Only guard clauses — the async
|
||||
loop itself is covered by tests/swarm/test_log_forwarder.py."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
|
||||
import pytest
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from decnet.cli import app
|
||||
|
||||
|
||||
runner = CliRunner()
|
||||
|
||||
|
||||
def test_forwarder_requires_master_host(monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path) -> None:
    """`decnet forwarder` without a master host exits 2 and mentions master-host."""
    monkeypatch.delenv("DECNET_SWARM_MASTER_HOST", raising=False)
    # Also patch the already-imported module-level constant.
    monkeypatch.setattr("decnet.env.DECNET_SWARM_MASTER_HOST", None, raising=False)
    result = runner.invoke(app, ["forwarder", "--log-file", str(tmp_path / "decnet.log")])
    assert result.exit_code == 2, result.output
    assert "master-host" in result.output
|
||||
|
||||
|
||||
def test_forwarder_requires_bundle(monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path) -> None:
    """`decnet forwarder` exits 2 and mentions the bundle when the agent dir has none."""
    # Never created on disk, so it cannot contain any bundle files.
    agent_dir = tmp_path / "agent"
    log_file = tmp_path / "decnet.log"
    log_file.write_text("")
    result = runner.invoke(
        app,
        [
            "forwarder",
            "--master-host", "127.0.0.1",
            "--log-file", str(log_file),
            "--agent-dir", str(agent_dir),
        ],
    )
    assert result.exit_code == 2, result.output
    assert "bundle" in result.output
|
||||
292
tests/swarm/test_cli_swarm.py
Normal file
292
tests/swarm/test_cli_swarm.py
Normal file
@@ -0,0 +1,292 @@
|
||||
"""CLI `decnet swarm {enroll,list,decommission}` + `deploy --mode swarm`.
|
||||
|
||||
Controller HTTP is stubbed via monkeypatching `_http_request`; we aren't
|
||||
testing the controller (that has its own test file) or httpx itself. We
|
||||
*are* testing: arg parsing, URL construction, round-robin sharding of
|
||||
deckies, bundle file output, error paths when the controller rejects.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import pathlib
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from decnet import cli as cli_mod
|
||||
from decnet.cli import app, deploy as cli_deploy, utils as cli_utils
|
||||
|
||||
|
||||
runner = CliRunner()
|
||||
|
||||
|
||||
class _FakeResp:
    """Tiny HTTP response double exposing ``status_code``, ``text`` and ``json()``.

    ``text`` is the payload itself when it is already a string, otherwise its
    JSON serialisation; ``json()`` always returns the payload object as given.
    """

    def __init__(self, payload: Any, status: int = 200):
        self._payload = payload
        self.status_code = status
        self.text = json.dumps(payload) if not isinstance(payload, str) else payload

    def json(self) -> Any:
        return self._payload
|
||||
|
||||
|
||||
class _HttpStub(list):
    """Both a call log and a scripted-reply registry.

    The list contents are the recorded calls; ``script`` maps a
    ``(method, url_suffix)`` pair to the response to hand back.
    """

    def __init__(self) -> None:
        super().__init__()
        self.script: dict[tuple[str, str], _FakeResp] = {}
|
||||
|
||||
|
||||
@pytest.fixture
def http_stub(monkeypatch: pytest.MonkeyPatch) -> _HttpStub:
    """Replace ``cli_utils._http_request`` with a scripted, recording fake.

    Every call is appended to the returned stub as ``(method, url, json_body)``.
    The reply is the first scripted entry whose method matches and whose
    suffix the URL ends with; an unscripted call fails the test.
    """
    calls = _HttpStub()

    def _fake(method, url, *, json_body=None, timeout=30.0):
        calls.append((method, url, json_body))
        for (m, suffix), resp in calls.script.items():
            if m == method and url.endswith(suffix):
                return resp
        raise AssertionError(f"Unscripted HTTP call: {method} {url}")

    monkeypatch.setattr(cli_utils, "_http_request", _fake)
    return calls
|
||||
|
||||
|
||||
# ------------------------------------------------------------- swarm list
|
||||
|
||||
|
||||
def test_swarm_list_empty(http_stub) -> None:
    """`swarm list` with no hosts exits 0 and prints the "No workers" notice."""
    http_stub.script[("GET", "/swarm/hosts")] = _FakeResp([])
    result = runner.invoke(app, ["swarm", "list"])
    assert result.exit_code == 0, result.output
    assert "No workers" in result.output
|
||||
|
||||
|
||||
def test_swarm_list_with_rows(http_stub) -> None:
    """`swarm list` renders the name and address of a returned host."""
    http_stub.script[("GET", "/swarm/hosts")] = _FakeResp([
        {"uuid": "u1", "name": "decky01", "address": "10.0.0.1",
         "agent_port": 8765, "status": "active", "last_heartbeat": None,
         "enrolled_at": "2026-04-18T00:00:00Z", "notes": None,
         "client_cert_fingerprint": "ab:cd"},
    ])
    result = runner.invoke(app, ["swarm", "list"])
    assert result.exit_code == 0, result.output
    assert "decky01" in result.output
    assert "10.0.0.1" in result.output
|
||||
|
||||
|
||||
def test_swarm_list_passes_status_filter(http_stub) -> None:
    """`swarm list --status active` forwards the filter as ``host_status``."""
    http_stub.script[("GET", "/swarm/hosts?host_status=active")] = _FakeResp([])
    result = runner.invoke(app, ["swarm", "list", "--status", "active"])
    assert result.exit_code == 0, result.output
    # last call URL ended with the filter suffix
    assert http_stub[-1][1].endswith("/swarm/hosts?host_status=active")
|
||||
|
||||
|
||||
# ------------------------------------------------------------- swarm enroll
|
||||
|
||||
|
||||
def test_swarm_enroll_writes_bundle(http_stub, tmp_path: pathlib.Path) -> None:
    """`swarm enroll` writes ca.crt / worker.crt / worker.key and forwards SANs."""
    http_stub.script[("POST", "/swarm/enroll")] = _FakeResp({
        "host_uuid": "u-123", "name": "decky01", "address": "10.0.0.1",
        "agent_port": 8765, "fingerprint": "de:ad:be:ef",
        "ca_cert_pem": "CA-PEM", "worker_cert_pem": "CRT-PEM",
        "worker_key_pem": "KEY-PEM",
    })
    out = tmp_path / "bundle"
    result = runner.invoke(app, [
        "swarm", "enroll",
        "--name", "decky01", "--address", "10.0.0.1",
        "--sans", "decky01.lan,10.0.0.1",
        "--out-dir", str(out),
    ])
    assert result.exit_code == 0, result.output
    assert (out / "ca.crt").read_text() == "CA-PEM"
    assert (out / "worker.crt").read_text() == "CRT-PEM"
    assert (out / "worker.key").read_text() == "KEY-PEM"
    # SANs were forwarded in the JSON body.
    _, _, body = http_stub[0]
    assert body["sans"] == ["decky01.lan", "10.0.0.1"]
|
||||
|
||||
|
||||
# ------------------------------------------------------------- swarm check
|
||||
|
||||
|
||||
def test_swarm_check_prints_table(http_stub) -> None:
    """`swarm check` lists both a reachable and an unreachable worker."""
    http_stub.script[("POST", "/swarm/check")] = _FakeResp({
        "results": [
            {"host_uuid": "u-a", "name": "decky01", "address": "10.0.0.1",
             "reachable": True, "detail": {"status": "ok"}},
            {"host_uuid": "u-b", "name": "decky02", "address": "10.0.0.2",
             "reachable": False, "detail": "connection refused"},
        ]
    })
    result = runner.invoke(app, ["swarm", "check"])
    assert result.exit_code == 0, result.output
    assert "decky01" in result.output
    assert "decky02" in result.output
    # Both reachable=true and reachable=false render.
    assert "yes" in result.output.lower()
    assert "no" in result.output.lower()
|
||||
|
||||
|
||||
def test_swarm_check_empty(http_stub) -> None:
    """`swarm check` with no results exits 0 and prints the "No workers" notice."""
    http_stub.script[("POST", "/swarm/check")] = _FakeResp({"results": []})
    result = runner.invoke(app, ["swarm", "check"])
    assert result.exit_code == 0, result.output
    assert "No workers" in result.output
|
||||
|
||||
|
||||
def test_swarm_check_json_output(http_stub) -> None:
    """`swarm check --json` emits the JSON keys and values of the results."""
    http_stub.script[("POST", "/swarm/check")] = _FakeResp({
        "results": [
            {"host_uuid": "u-a", "name": "decky01", "address": "10.0.0.1",
             "reachable": True, "detail": {"status": "ok"}},
        ]
    })
    result = runner.invoke(app, ["swarm", "check", "--json"])
    assert result.exit_code == 0, result.output
    # JSON mode emits structured output, not the rich table.
    assert '"reachable"' in result.output
    assert '"decky01"' in result.output
|
||||
|
||||
|
||||
# ------------------------------------------------------------- swarm deckies
|
||||
|
||||
|
||||
def test_swarm_deckies_empty(http_stub) -> None:
    """`swarm deckies` with no rows exits 0 and prints the "No deckies" notice."""
    http_stub.script[("GET", "/swarm/deckies")] = _FakeResp([])
    result = runner.invoke(app, ["swarm", "deckies"])
    assert result.exit_code == 0, result.output
    assert "No deckies" in result.output
|
||||
|
||||
|
||||
def test_swarm_deckies_renders_table(http_stub) -> None:
    """`swarm deckies` shows decky names, host names and comma-joined services."""
    http_stub.script[("GET", "/swarm/deckies")] = _FakeResp([
        {"decky_name": "decky-01", "host_uuid": "u-1", "host_name": "w1",
         "host_address": "10.0.0.1", "host_status": "active",
         "services": ["ssh"], "state": "running", "last_error": None,
         "compose_hash": None, "updated_at": "2026-04-18T00:00:00Z"},
        {"decky_name": "decky-02", "host_uuid": "u-2", "host_name": "w2",
         "host_address": "10.0.0.2", "host_status": "active",
         "services": ["smb", "ssh"], "state": "failed", "last_error": "boom",
         "compose_hash": None, "updated_at": "2026-04-18T00:00:00Z"},
    ])
    result = runner.invoke(app, ["swarm", "deckies"])
    assert result.exit_code == 0, result.output
    assert "decky-01" in result.output
    assert "decky-02" in result.output
    assert "w1" in result.output and "w2" in result.output
    assert "smb,ssh" in result.output
|
||||
|
||||
|
||||
def test_swarm_deckies_json_output(http_stub) -> None:
    """`swarm deckies --json` emits the JSON keys and values of the rows."""
    http_stub.script[("GET", "/swarm/deckies")] = _FakeResp([
        {"decky_name": "decky-01", "host_uuid": "u-1", "host_name": "w1",
         "host_address": "10.0.0.1", "host_status": "active",
         "services": ["ssh"], "state": "running", "last_error": None,
         "compose_hash": None, "updated_at": "2026-04-18T00:00:00Z"},
    ])
    result = runner.invoke(app, ["swarm", "deckies", "--json"])
    assert result.exit_code == 0, result.output
    assert '"decky_name"' in result.output
    assert '"decky-01"' in result.output
|
||||
|
||||
|
||||
def test_swarm_deckies_filter_by_host_name_looks_up_uuid(http_stub) -> None:
    """`swarm deckies --host NAME` queries deckies by the host's uuid."""
    http_stub.script[("GET", "/swarm/hosts")] = _FakeResp([
        {"uuid": "u-x", "name": "w1"},
    ])
    http_stub.script[("GET", "/swarm/deckies?host_uuid=u-x")] = _FakeResp([])
    result = runner.invoke(app, ["swarm", "deckies", "--host", "w1"])
    assert result.exit_code == 0, result.output
    assert http_stub[-1][1].endswith("/swarm/deckies?host_uuid=u-x")
|
||||
|
||||
|
||||
def test_swarm_deckies_filter_by_state(http_stub) -> None:
    """`swarm deckies --state failed` forwards the filter as ``state``."""
    http_stub.script[("GET", "/swarm/deckies?state=failed")] = _FakeResp([])
    result = runner.invoke(app, ["swarm", "deckies", "--state", "failed"])
    assert result.exit_code == 0, result.output
    assert http_stub[-1][1].endswith("/swarm/deckies?state=failed")
|
||||
|
||||
|
||||
# ------------------------------------------------------------- swarm decommission
|
||||
|
||||
|
||||
def test_swarm_decommission_by_name_looks_up_uuid(http_stub) -> None:
    """`swarm decommission --name` does a GET lookup, then a DELETE by uuid."""
    http_stub.script[("GET", "/swarm/hosts")] = _FakeResp([
        {"uuid": "u-x", "name": "decky02"},
    ])
    http_stub.script[("DELETE", "/swarm/hosts/u-x")] = _FakeResp({}, status=204)
    result = runner.invoke(app, ["swarm", "decommission", "--name", "decky02", "--yes"])
    assert result.exit_code == 0, result.output
    methods = [c[0] for c in http_stub]
    assert methods == ["GET", "DELETE"]
|
||||
|
||||
|
||||
def test_swarm_decommission_name_not_found(http_stub) -> None:
    """`swarm decommission --name` for an unknown worker exits 1 with a notice."""
    http_stub.script[("GET", "/swarm/hosts")] = _FakeResp([])
    result = runner.invoke(app, ["swarm", "decommission", "--name", "ghost", "--yes"])
    assert result.exit_code == 1, result.output
    assert "No enrolled worker" in result.output
|
||||
|
||||
|
||||
def test_swarm_decommission_requires_identifier() -> None:
    """`swarm decommission` with no worker identifier exits 2."""
    result = runner.invoke(app, ["swarm", "decommission", "--yes"])
    assert result.exit_code == 2, result.output
|
||||
|
||||
|
||||
# ------------------------------------------------------------- deploy --mode swarm
|
||||
|
||||
|
||||
def test_deploy_swarm_round_robins_and_posts(http_stub, monkeypatch: pytest.MonkeyPatch) -> None:
    """deploy --mode swarm fetches hosts, assigns host_uuid round-robin,
    POSTs to /swarm/deploy with the sharded config."""
    # Two enrolled workers, zero active.
    http_stub.script[("GET", "/swarm/hosts?host_status=enrolled")] = _FakeResp([
        {"uuid": "u-a", "name": "A", "address": "10.0.0.1", "agent_port": 8765,
         "status": "enrolled"},
        {"uuid": "u-b", "name": "B", "address": "10.0.0.2", "agent_port": 8765,
         "status": "enrolled"},
    ])
    http_stub.script[("GET", "/swarm/hosts?host_status=active")] = _FakeResp([])
    http_stub.script[("POST", "/swarm/deploy")] = _FakeResp({
        "results": [
            {"host_uuid": "u-a", "host_name": "A", "ok": True, "detail": {"status": "ok"}},
            {"host_uuid": "u-b", "host_name": "B", "ok": True, "detail": {"status": "ok"}},
        ],
    })

    # Stub network detection so we don't need root / real NICs.
    monkeypatch.setattr(cli_deploy, "detect_interface", lambda: "eth0")
    monkeypatch.setattr(cli_deploy, "detect_subnet", lambda _iface: ("10.0.0.0/24", "10.0.0.254"))
    monkeypatch.setattr(cli_deploy, "get_host_ip", lambda _iface: "10.0.0.100")

    result = runner.invoke(app, [
        "deploy", "--mode", "swarm", "--deckies", "3",
        "--services", "ssh", "--dry-run",
    ])
    assert result.exit_code == 0, result.output

    # Find the POST /swarm/deploy body and confirm round-robin sharding.
    post = next(c for c in http_stub if c[0] == "POST" and c[1].endswith("/swarm/deploy"))
    body = post[2]
    uuids = [d["host_uuid"] for d in body["config"]["deckies"]]
    # Three deckies over two workers wrap back to the first worker.
    assert uuids == ["u-a", "u-b", "u-a"]
    assert body["dry_run"] is True
|
||||
|
||||
|
||||
def test_deploy_swarm_fails_if_no_workers(http_stub, monkeypatch: pytest.MonkeyPatch) -> None:
    """deploy --mode swarm exits 1 when neither enrolled nor active workers exist."""
    http_stub.script[("GET", "/swarm/hosts?host_status=enrolled")] = _FakeResp([])
    http_stub.script[("GET", "/swarm/hosts?host_status=active")] = _FakeResp([])
    # Stub network detection so the test needs no real interfaces.
    monkeypatch.setattr(cli_deploy, "detect_interface", lambda: "eth0")
    monkeypatch.setattr(cli_deploy, "detect_subnet", lambda _iface: ("10.0.0.0/24", "10.0.0.254"))
    monkeypatch.setattr(cli_deploy, "get_host_ip", lambda _iface: "10.0.0.100")

    result = runner.invoke(app, [
        "deploy", "--mode", "swarm", "--deckies", "2",
        "--services", "ssh", "--dry-run",
    ])
    assert result.exit_code == 1, result.output
    assert "No enrolled workers" in result.output
|
||||
192
tests/swarm/test_cli_swarm_update.py
Normal file
192
tests/swarm/test_cli_swarm_update.py
Normal file
@@ -0,0 +1,192 @@
|
||||
"""CLI `decnet swarm update` — target resolution, tarring, push aggregation.
|
||||
|
||||
The UpdaterClient is stubbed: we are testing the CLI's orchestration, not
|
||||
the wire protocol (that has test_updater_app.py and UpdaterClient round-
|
||||
trips live under test_swarm_api.py integration).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import pathlib
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from decnet import cli as cli_mod
|
||||
from decnet.cli import app, utils as cli_utils
|
||||
|
||||
|
||||
runner = CliRunner()
|
||||
|
||||
|
||||
class _FakeResp:
    """HTTP response double exposing ``status_code``, ``text``, ``content``, ``json()``.

    ``text`` is the payload itself when it is already a string, otherwise its
    JSON serialisation; ``content`` is ``text`` encoded to bytes; ``json()``
    returns the payload object as given.
    """

    def __init__(self, payload: Any, status: int = 200):
        self._payload = payload
        self.status_code = status
        self.text = json.dumps(payload) if not isinstance(payload, str) else payload
        self.content = self.text.encode()

    def json(self) -> Any:
        return self._payload
|
||||
|
||||
|
||||
@pytest.fixture
def http_stub(monkeypatch: pytest.MonkeyPatch) -> dict:
    """Replace ``cli_utils._http_request`` with a fake serving ``GET /swarm/hosts``.

    Returns a mutable state dict; tests set ``state["hosts"]`` to control the
    host listing. Any other request fails the test.
    """
    state: dict = {"hosts": []}

    def _fake(method, url, *, json_body=None, timeout=30.0):
        if method == "GET" and url.endswith("/swarm/hosts"):
            return _FakeResp(state["hosts"])
        raise AssertionError(f"Unscripted HTTP call: {method} {url}")

    monkeypatch.setattr(cli_utils, "_http_request", _fake)
    return state
|
||||
|
||||
|
||||
class _StubUpdaterClient:
    """Mirrors UpdaterClient's async-context-manager surface.

    State lives on the class and is shared by every instance:

    * ``instances`` records each stub constructed, in creation order.
    * ``behavior`` maps a host name to the response ``update`` should return
      for that host; hosts without an entry get a 200 "updated" response.
    """

    instances: list["_StubUpdaterClient"] = []
    behavior: dict[str, Any] = {}

    def __init__(self, host, *, updater_port: int = 8766, **_: Any):
        self.host = host
        self.port = updater_port
        # Names of the methods invoked on this stub, in call order.
        self.calls: list[str] = []
        _StubUpdaterClient.instances.append(self)

    async def __aenter__(self) -> "_StubUpdaterClient":
        return self

    async def __aexit__(self, *exc: Any) -> None:
        return None

    async def update(self, tarball: bytes, sha: str = "") -> _FakeResp:
        """Record the call and return the scripted (or default success) response."""
        self.calls.append("update")
        return _StubUpdaterClient.behavior.get(
            self.host.get("name"),
            _FakeResp({"status": "updated", "release": {"sha": sha}}, 200),
        )

    async def update_self(self, tarball: bytes, sha: str = "") -> _FakeResp:
        """Record the call and return a fixed 200 "self_update_queued" response."""
        self.calls.append("update_self")
        return _FakeResp({"status": "self_update_queued"}, 200)
|
||||
|
||||
|
||||
@pytest.fixture
def stub_updater(monkeypatch: pytest.MonkeyPatch):
    """Swap ``decnet.swarm.updater_client.UpdaterClient`` for the stub class.

    Resets the stub's shared ``instances`` / ``behavior`` state first so
    tests do not see leftovers from one another, and returns the stub class.
    """
    _StubUpdaterClient.instances.clear()
    _StubUpdaterClient.behavior.clear()
    monkeypatch.setattr("decnet.swarm.updater_client.UpdaterClient", _StubUpdaterClient)
    # NOTE(review): this import only ensures decnet.cli is loaded; it patches
    # nothing by itself. The setattr above is the sole substitution, so the
    # CLI presumably resolves UpdaterClient from decnet.swarm.updater_client
    # at call time — confirm against swarm_update.
    import decnet.cli  # noqa: F401
    return _StubUpdaterClient
|
||||
|
||||
|
||||
def _mk_source_tree(tmp_path: pathlib.Path) -> pathlib.Path:
    """Create ``<tmp_path>/src/decnet/a.py`` and return the ``src`` directory."""
    root = tmp_path / "src"
    root.mkdir()
    (root / "decnet").mkdir()
    (root / "decnet" / "a.py").write_text("x = 1")
    return root
|
||||
|
||||
|
||||
# ------------------------------------------------------------- arg validation
|
||||
|
||||
def test_update_requires_host_or_all(http_stub) -> None:
    """`swarm update` with neither --host nor --all exits 2."""
    r = runner.invoke(app, ["swarm", "update"])
    assert r.exit_code == 2, r.output
|
||||
|
||||
|
||||
def test_update_host_and_all_are_mutex(http_stub) -> None:
    """`swarm update` with both --host and --all exits 2."""
    r = runner.invoke(app, ["swarm", "update", "--host", "w1", "--all"])
    assert r.exit_code == 2, r.output
|
||||
|
||||
|
||||
def test_update_unknown_host_exits_1(http_stub) -> None:
    """`swarm update --host` naming a worker that is not listed exits 1."""
    http_stub["hosts"] = [{"uuid": "u1", "name": "other", "address": "10.0.0.1", "status": "active"}]
    r = runner.invoke(app, ["swarm", "update", "--host", "nope"])
    assert r.exit_code == 1, r.output
    assert "No enrolled worker" in r.output
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- happy paths
|
||||
|
||||
def test_update_single_host(http_stub, stub_updater, tmp_path: pathlib.Path) -> None:
    """`swarm update --host w1` builds an updater client for w1 only."""
    http_stub["hosts"] = [
        {"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"},
        {"uuid": "u2", "name": "w2", "address": "10.0.0.2", "status": "active"},
    ]
    root = _mk_source_tree(tmp_path)
    r = runner.invoke(app, ["swarm", "update", "--host", "w1", "--root", str(root)])
    assert r.exit_code == 0, r.output
    assert "w1" in r.output
    # Only w1 got a client; w2 is untouched.
    names = [c.host["name"] for c in stub_updater.instances]
    assert names == ["w1"]
|
||||
|
||||
|
||||
def test_update_all_skips_decommissioned(http_stub, stub_updater, tmp_path: pathlib.Path) -> None:
    """`swarm update --all` contacts active and enrolled hosts, not decommissioned ones."""
    http_stub["hosts"] = [
        {"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"},
        {"uuid": "u2", "name": "w2", "address": "10.0.0.2", "status": "decommissioned"},
        {"uuid": "u3", "name": "w3", "address": "10.0.0.3", "status": "enrolled"},
    ]
    root = _mk_source_tree(tmp_path)
    r = runner.invoke(app, ["swarm", "update", "--all", "--root", str(root)])
    assert r.exit_code == 0, r.output
    # Sorted so the assertion does not depend on the order hosts were contacted.
    hit = sorted(c.host["name"] for c in stub_updater.instances)
    assert hit == ["w1", "w3"]
|
||||
|
||||
|
||||
def test_update_include_self_calls_both(
    http_stub, stub_updater, tmp_path: pathlib.Path,
) -> None:
    """With --include-self, a successful update is followed by update_self."""
    http_stub["hosts"] = [{"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"}]
    root = _mk_source_tree(tmp_path)
    r = runner.invoke(app, ["swarm", "update", "--all", "--root", str(root), "--include-self"])
    assert r.exit_code == 0, r.output
    assert stub_updater.instances[0].calls == ["update", "update_self"]
|
||||
|
||||
|
||||
# ------------------------------------------------------------- failure modes
|
||||
|
||||
def test_update_rollback_status_409_flags_failure(
    http_stub, stub_updater, tmp_path: pathlib.Path,
) -> None:
    """A 409 response reporting a rollback makes the CLI exit 1 and say "rolled-back"."""
    http_stub["hosts"] = [{"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"}]
    _StubUpdaterClient.behavior["w1"] = _FakeResp(
        {"detail": {"error": "probe failed", "rolled_back": True}},
        status=409,
    )
    root = _mk_source_tree(tmp_path)
    r = runner.invoke(app, ["swarm", "update", "--all", "--root", str(root)])
    assert r.exit_code == 1, r.output
    assert "rolled-back" in r.output
|
||||
|
||||
|
||||
def test_update_include_self_skipped_when_agent_update_failed(
    http_stub, stub_updater, tmp_path: pathlib.Path,
) -> None:
    """With --include-self, a failed (500) update must not be followed by update_self."""
    http_stub["hosts"] = [{"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"}]
    _StubUpdaterClient.behavior["w1"] = _FakeResp(
        {"detail": {"error": "pip failed"}}, status=500,
    )
    root = _mk_source_tree(tmp_path)
    r = runner.invoke(app, ["swarm", "update", "--all", "--root", str(root), "--include-self"])
    assert r.exit_code == 1, r.output
    # update_self must NOT have been called — agent update failed.
    assert stub_updater.instances[0].calls == ["update"]
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- dry run
|
||||
|
||||
def test_update_dry_run_does_not_call_updater(
    http_stub, stub_updater, tmp_path: pathlib.Path,
) -> None:
    """--dry-run exits 0, constructs no updater client, and mentions dry-run."""
    http_stub["hosts"] = [{"uuid": "u1", "name": "w1", "address": "10.0.0.1", "status": "active"}]
    root = _mk_source_tree(tmp_path)
    r = runner.invoke(app, ["swarm", "update", "--all", "--root", str(root), "--dry-run"])
    assert r.exit_code == 0, r.output
    assert stub_updater.instances == []
    assert "dry-run" in r.output.lower()
|
||||
170
tests/swarm/test_client_agent_roundtrip.py
Normal file
170
tests/swarm/test_client_agent_roundtrip.py
Normal file
@@ -0,0 +1,170 @@
|
||||
"""End-to-end test: AgentClient talks to a live worker agent over mTLS.
|
||||
|
||||
Spins up uvicorn in-process on an ephemeral port with real cert files on
|
||||
disk. Confirms:
|
||||
|
||||
1. The health endpoint works when the client presents a CA-signed cert.
|
||||
2. An impostor client (cert signed by a different CA) is rejected at TLS
|
||||
time.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import pathlib
|
||||
import socket
|
||||
import threading
|
||||
import time
|
||||
|
||||
import ssl
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import uvicorn
|
||||
|
||||
from decnet.agent.app import app as agent_app
|
||||
from decnet.swarm import client as swarm_client
|
||||
from decnet.swarm import pki
|
||||
|
||||
|
||||
def _free_port() -> int:
    """Return a TCP port number the OS reported as free on 127.0.0.1.

    Binds to port 0 so the OS picks the port, reads it back, and releases
    the socket. The port is not reserved after return, so another process
    could claim it before the caller binds.
    """
    # Context manager guarantees the socket is closed even if bind() raises.
    with socket.socket() as s:
        s.bind(("127.0.0.1", 0))
        return s.getsockname()[1]
|
||||
|
||||
|
||||
def _start_agent(
    tmp_path: pathlib.Path, port: int
) -> tuple[uvicorn.Server, threading.Thread, swarm_client.MasterIdentity]:
    """Provision a CA, sign a worker cert + a master cert, start uvicorn.

    The agent app is served on 127.0.0.1:``port`` from a daemon thread with
    client certificates required. Returns the server (set ``should_exit`` to
    stop it), the serving thread, and the master identity for clients.

    Raises:
        RuntimeError: if the server has not reported ``started`` within 5s.
    """
    ca_dir = tmp_path / "ca"
    pki.ensure_ca(ca_dir)

    # Worker bundle
    worker_dir = tmp_path / "agent"
    pki.write_worker_bundle(
        pki.issue_worker_cert(pki.load_ca(ca_dir), "worker-test", ["127.0.0.1"]),
        worker_dir,
    )

    # Master identity (used by AgentClient as a client cert)
    master_id = swarm_client.ensure_master_identity(ca_dir)

    config = uvicorn.Config(
        agent_app,
        host="127.0.0.1",
        port=port,
        log_level="warning",
        ssl_keyfile=str(worker_dir / "worker.key"),
        ssl_certfile=str(worker_dir / "worker.crt"),
        ssl_ca_certs=str(worker_dir / "ca.crt"),
        # Named constant instead of the bare integer 2.
        ssl_cert_reqs=ssl.CERT_REQUIRED,
    )
    server = uvicorn.Server(config)

    def _run() -> None:
        # The thread has no event loop of its own; create and install one.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(server.serve())
        finally:
            # Close the loop even when serve() raises.
            loop.close()

    thread = threading.Thread(target=_run, daemon=True)
    thread.start()

    # Wait for server to be listening. monotonic() is immune to wall-clock
    # adjustments during the wait.
    deadline = time.monotonic() + 5
    while time.monotonic() < deadline:
        if server.started:
            return server, thread, master_id
        time.sleep(0.05)

    # Ask the half-started server to stop so the thread does not linger
    # behind the failing test.
    server.should_exit = True
    raise RuntimeError("agent did not start within 5s")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_client_health_roundtrip(tmp_path: pathlib.Path) -> None:
    """AgentClient with the master identity can call health() and status()."""
    port = _free_port()
    server, thread, master_id = _start_agent(tmp_path, port)
    try:
        async with swarm_client.AgentClient(
            address="127.0.0.1", agent_port=port, identity=master_id
        ) as agent:
            body = await agent.health()
            assert body == {"status": "ok"}
            snap = await agent.status()
            assert "deployed" in snap
    finally:
        # Always stop the in-process server, pass or fail.
        server.should_exit = True
        thread.join(timeout=5)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_fingerprint_pin_accepts_matching_cert(tmp_path: pathlib.Path) -> None:
    """AgentClient with the correct expected fingerprint connects normally."""
    port = _free_port()
    server, thread, master_id = _start_agent(tmp_path, port)
    try:
        # Pin against the fingerprint of the cert the agent actually serves.
        worker_cert_pem = (tmp_path / "agent" / "worker.crt").read_bytes()
        expected = pki.fingerprint(worker_cert_pem)
        host = {
            "uuid": "h1",
            "name": "worker-test",
            "address": "127.0.0.1",
            "agent_port": port,
            "client_cert_fingerprint": expected,
        }
        async with swarm_client.AgentClient(host=host, identity=master_id) as agent:
            assert await agent.health() == {"status": "ok"}
    finally:
        server.should_exit = True
        thread.join(timeout=5)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_fingerprint_pin_rejects_mismatch(tmp_path: pathlib.Path) -> None:
    """A wrong expected fingerprint must raise FingerprintMismatchError."""
    port = _free_port()
    server, thread, master_id = _start_agent(tmp_path, port)
    try:
        host = {
            "uuid": "h1",
            "name": "worker-test",
            "address": "127.0.0.1",
            "agent_port": port,
            # All-zero value that cannot match the served certificate.
            "client_cert_fingerprint": "0" * 64,
        }
        with pytest.raises(swarm_client.FingerprintMismatchError):
            async with swarm_client.AgentClient(host=host, identity=master_id):
                pass
    finally:
        server.should_exit = True
        thread.join(timeout=5)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_impostor_client_cannot_connect(tmp_path: pathlib.Path) -> None:
    """A client whose cert was issued by a DIFFERENT CA must be rejected."""
    port = _free_port()
    server, thread, _master_id = _start_agent(tmp_path, port)
    try:
        # Build a complete, self-consistent bundle under an unrelated CA.
        evil_ca = pki.generate_ca("Evil CA")
        evil_dir = tmp_path / "evil"
        pki.write_worker_bundle(
            pki.issue_worker_cert(evil_ca, "evil-master", ["127.0.0.1"]), evil_dir
        )
        ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
        ctx.load_cert_chain(str(evil_dir / "worker.crt"), str(evil_dir / "worker.key"))
        ctx.load_verify_locations(cafile=str(evil_dir / "ca.crt"))
        ctx.verify_mode = ssl.CERT_REQUIRED
        ctx.check_hostname = False
        async with httpx.AsyncClient(
            base_url=f"https://127.0.0.1:{port}", verify=ctx, timeout=5.0
        ) as ac:
            # Several httpx error types are accepted for the failed request.
            with pytest.raises(
                (httpx.ConnectError, httpx.ReadError, httpx.RemoteProtocolError)
            ):
                await ac.get("/health")
    finally:
        server.should_exit = True
        thread.join(timeout=5)
|
||||
122
tests/swarm/test_client_topology.py
Normal file
122
tests/swarm/test_client_topology.py
Normal file
@@ -0,0 +1,122 @@
|
||||
"""AgentClient topology methods — unit tests with a mock httpx transport.
|
||||
|
||||
Avoids the full uvicorn+mTLS setup used by the roundtrip test; we just
|
||||
need to prove the client emits the right verb/path/body and surfaces
|
||||
HTTP errors the way the caller expects.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from decnet.swarm.client import AgentClient, MasterIdentity
|
||||
|
||||
|
||||
class _StubIdentity:
    """Placeholder identity type with no attributes.

    NOTE(review): nothing in this module references it;
    ``_client_with_transport`` builds a real ``MasterIdentity`` with dummy
    paths instead. Candidate for removal.
    """
|
||||
|
||||
|
||||
def _client_with_transport(handler) -> AgentClient:
    """Build an AgentClient whose internal httpx client is backed by
    :class:`httpx.MockTransport`. Bypasses _build_client so no real
    cert IO happens.

    ``handler`` receives each ``httpx.Request`` and returns the
    ``httpx.Response`` to serve. The caller must close ``client._client``.
    """
    # Paths are dummies; they are only stored on the identity here.
    identity = MasterIdentity(
        key_path="/nope/key",  # type: ignore[arg-type]
        cert_path="/nope/cert",  # type: ignore[arg-type]
        ca_cert_path="/nope/ca",  # type: ignore[arg-type]
    )
    client = AgentClient(
        address="127.0.0.1",
        agent_port=8765,
        identity=identity,
    )
    # Inject the mocked httpx client directly into the private slot.
    client._client = httpx.AsyncClient(
        base_url="https://127.0.0.1:8765",
        transport=httpx.MockTransport(handler),
    )
    return client
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_apply_topology_sends_body() -> None:
    """apply_topology hits /topology/apply with the hydrated payload and hash."""
    captured: dict = {}

    def handler(request: httpx.Request) -> httpx.Response:
        captured["url"] = str(request.url)
        captured["body"] = json.loads(request.content)
        return httpx.Response(
            200, json={"status": "applied", "version_hash": "h"}
        )

    agent = _client_with_transport(handler)
    try:
        out = await agent.apply_topology({"topology": {"id": "t1"}}, "h")
    finally:
        await agent._client.aclose()

    assert out == {"status": "applied", "version_hash": "h"}
    assert captured["url"].endswith("/topology/apply")
    assert captured["body"] == {
        "hydrated": {"topology": {"id": "t1"}},
        "version_hash": "h",
    }
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_apply_topology_raises_on_409() -> None:
    """A 409 from the agent surfaces as httpx.HTTPStatusError with that status."""

    def handler(_req: httpx.Request) -> httpx.Response:
        return httpx.Response(409, json={"detail": "already applied"})

    agent = _client_with_transport(handler)
    try:
        with pytest.raises(httpx.HTTPStatusError) as ei:
            await agent.apply_topology({"topology": {"id": "t2"}}, "h")
        assert ei.value.response.status_code == 409
    finally:
        await agent._client.aclose()
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_teardown_topology_sends_body() -> None:
    """teardown_topology hits /topology/teardown with the topology id as body."""
    captured: dict = {}

    def handler(request: httpx.Request) -> httpx.Response:
        captured["body"] = json.loads(request.content)
        captured["url"] = str(request.url)
        return httpx.Response(200, json={"status": "torn_down", "topology_id": "t1"})

    agent = _client_with_transport(handler)
    try:
        out = await agent.teardown_topology("t1")
    finally:
        await agent._client.aclose()

    assert out["status"] == "torn_down"
    assert captured["body"] == {"topology_id": "t1"}
    assert captured["url"].endswith("/topology/teardown")
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_get_topology_state_returns_snapshot() -> None:
    """get_topology_state issues a GET and returns the decoded snapshot."""

    def handler(request: httpx.Request) -> httpx.Response:
        assert request.method == "GET"
        return httpx.Response(
            200,
            json={
                "topology_id": "t1",
                "applied_version_hash": "h",
                "applied_at": 1,
                "last_error": None,
                "observed": {"bridges": [], "containers": []},
            },
        )

    agent = _client_with_transport(handler)
    try:
        snap = await agent.get_topology_state()
    finally:
        await agent._client.aclose()
    assert snap["topology_id"] == "t1"
    assert snap["applied_version_hash"] == "h"
|
||||
256
tests/swarm/test_forwarder_resilience.py
Normal file
256
tests/swarm/test_forwarder_resilience.py
Normal file
@@ -0,0 +1,256 @@
|
||||
"""Extra resilience tests for the syslog-over-TLS pipeline.
|
||||
|
||||
Covers failure modes the happy-path tests in test_log_forwarder.py don't
|
||||
exercise:
|
||||
|
||||
* log rotation (st_size shrinks under the forwarder) resets offset to 0
|
||||
and re-ships from the start;
|
||||
* listener restart — forwarder reconnects and continues from the last
|
||||
persisted offset, no duplicates;
|
||||
* listener tolerates a client that connects with a valid cert and drops
|
||||
mid-frame (IncompleteReadError path) without crashing the server task;
|
||||
* peer_cn + fingerprint_from_ssl degrade gracefully on missing/invalid
|
||||
peer certificates.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import pathlib
|
||||
import socket
|
||||
|
||||
import pytest
|
||||
import ssl
|
||||
|
||||
from decnet.swarm import log_forwarder as fwd
|
||||
from decnet.swarm import log_listener as lst
|
||||
from decnet.swarm import pki
|
||||
from decnet.swarm.client import ensure_master_identity
|
||||
|
||||
|
||||
SAMPLE = (
|
||||
'<13>1 2026-04-18T00:00:00Z decky01 svc 1 - '
|
||||
'[decnet@53595 decky="decky01" service="ssh-service" '
|
||||
'event_type="connect" attacker_ip="1.2.3.4" attacker_port="4242"] {msg}\n'
|
||||
)
|
||||
|
||||
|
||||
def _free_port() -> int:
    """Return a TCP port number that was unused on 127.0.0.1 when probed.

    The probe socket is bound to port 0 so the OS picks a free port, and is
    managed by a ``with`` block so it is closed even if ``bind`` or
    ``getsockname`` raises. The port is released before returning, so another
    process could in principle grab it before the caller binds it.
    """
    with socket.socket() as probe:
        probe.bind(("127.0.0.1", 0))
        return probe.getsockname()[1]
|
||||
|
||||
|
||||
@pytest.fixture
def _pki_env(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch):
    """Create a throwaway CA, master identity and ``worker-y`` bundle.

    ``pki.DEFAULT_CA_DIR`` and ``pki.DEFAULT_AGENT_DIR`` are redirected to the
    temporary directories; both paths are returned in a dict.
    """
    ca_dir = tmp_path / "ca"
    worker_dir = tmp_path / "agent"

    pki.ensure_ca(ca_dir)
    ensure_master_identity(ca_dir)

    authority = pki.load_ca(ca_dir)
    bundle = pki.issue_worker_cert(authority, "worker-y", ["127.0.0.1"])
    pki.write_worker_bundle(bundle, worker_dir)

    for attr, path in (("DEFAULT_CA_DIR", ca_dir), ("DEFAULT_AGENT_DIR", worker_dir)):
        monkeypatch.setattr(pki, attr, path)
    return {"ca_dir": ca_dir, "worker_dir": worker_dir}
|
||||
|
||||
|
||||
async def _wait_for(pred, timeout: float = 5.0, interval: float = 0.1) -> bool:
    """Poll ``pred`` until it returns a truthy value or the budget runs out.

    Args:
        pred: Zero-argument callable evaluated synchronously on each poll.
        timeout: Approximate total time to keep polling, in seconds.
        interval: Sleep between polls, in seconds.

    Returns:
        ``True`` as soon as ``pred()`` is truthy, ``False`` otherwise.
    """
    steps = max(1, int(timeout / interval))
    for _ in range(steps):
        if pred():
            return True
        await asyncio.sleep(interval)
    # One last look: the condition may have become true during the final
    # sleep, and reporting that as a timeout would be a spurious failure.
    return bool(pred())
|
||||
|
||||
|
||||
# ----------------------------------------------------------- pure helpers
|
||||
|
||||
|
||||
def test_peer_cn_returns_unknown_when_no_ssl_object() -> None:
    """``peer_cn(None)`` yields the literal string ``"unknown"``."""
    common_name = lst.peer_cn(None)
    assert common_name == "unknown"
|
||||
|
||||
|
||||
def test_fingerprint_from_ssl_handles_missing_peer_cert() -> None:
    """``fingerprint_from_ssl(None)`` returns ``None`` instead of raising."""
    fingerprint = lst.fingerprint_from_ssl(None)
    assert fingerprint is None
|
||||
|
||||
|
||||
# ---------------------------------------------------- rotation / crash loops
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_forwarder_reships_after_log_rotation(
    tmp_path: pathlib.Path, _pki_env: dict
) -> None:
    """If the log file shrinks (logrotate truncation), the forwarder must
    reset offset=0 and re-ship the new contents — never get stuck past EOF.

    The listener and forwarder tasks are stopped in a ``finally`` block so a
    failing assertion does not leave them running on the event loop.
    """
    port = _free_port()
    worker_log = tmp_path / "decnet.log"
    master_log = tmp_path / "master.log"
    master_json = tmp_path / "master.json"

    listener_cfg = lst.ListenerConfig(
        log_path=master_log, json_path=master_json,
        bind_host="127.0.0.1", bind_port=port, ca_dir=_pki_env["ca_dir"],
    )
    fwd_cfg = fwd.ForwarderConfig(
        log_path=worker_log, master_host="127.0.0.1", master_port=port,
        agent_dir=_pki_env["worker_dir"], state_db=tmp_path / "fwd.db",
    )
    stop = asyncio.Event()
    # Tasks are reaped in list order; the forwarder is inserted in front of
    # the listener so it is awaited first.
    tasks = [asyncio.create_task(lst.run_listener(listener_cfg, stop_event=stop))]
    try:
        await asyncio.sleep(0.2)
        tasks.insert(
            0,
            asyncio.create_task(
                fwd.run_forwarder(fwd_cfg, poll_interval=0.05, stop_event=stop)
            ),
        )

        # Phase 1: write TWO pre-rotation lines so the offset is deep into the file.
        worker_log.write_text(SAMPLE.format(msg="rotate-A") + SAMPLE.format(msg="rotate-B"))
        ok = await _wait_for(lambda: master_log.exists() and b"rotate-B" in master_log.read_bytes())
        assert ok, "pre-rotation lines never reached master"
        size_before_rotate = master_log.stat().st_size

        # Phase 2: rotate (truncate to a strictly SHORTER content) so the
        # forwarder's offset tracker lands past EOF and must reset to 0.
        worker_log.write_text(SAMPLE.format(msg="P"))

        ok = await _wait_for(
            lambda: master_log.stat().st_size > size_before_rotate
            and master_log.read_text().rstrip().endswith("P"),
            timeout=5.0,
        )
        assert ok, "forwarder got stuck past EOF after rotation (expected reset → ship post-rotate 'P' line)"
    finally:
        stop.set()
        for t in tasks:
            try:
                await asyncio.wait_for(t, timeout=5)
            except asyncio.TimeoutError:
                t.cancel()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_forwarder_resumes_after_listener_restart(
    tmp_path: pathlib.Path, _pki_env: dict
) -> None:
    """Listener goes down mid-session, forwarder retries with backoff; on
    restart, we must NOT re-ship lines that were already drained.

    Every task still alive is stopped in a ``finally`` block so a failing
    assertion does not leave the forwarder or a listener running.
    """
    port = _free_port()
    worker_log = tmp_path / "decnet.log"
    master_log = tmp_path / "master.log"
    master_json = tmp_path / "master.json"
    state_db = tmp_path / "fwd.db"

    listener_cfg = lst.ListenerConfig(
        log_path=master_log, json_path=master_json,
        bind_host="127.0.0.1", bind_port=port, ca_dir=_pki_env["ca_dir"],
    )
    fwd_cfg = fwd.ForwarderConfig(
        log_path=worker_log, master_host="127.0.0.1", master_port=port,
        agent_dir=_pki_env["worker_dir"], state_db=state_db,
    )

    stop1 = asyncio.Event()
    stop_fwd = asyncio.Event()
    stop2 = asyncio.Event()
    # Tasks that still need reaping at shutdown, in the order they are awaited.
    live: list = []
    try:
        # --- phase 1 ----------------------------------------------------------
        lt1 = asyncio.create_task(lst.run_listener(listener_cfg, stop_event=stop1))
        live.append(lt1)
        await asyncio.sleep(0.2)
        ft = asyncio.create_task(fwd.run_forwarder(fwd_cfg, poll_interval=0.05, stop_event=stop_fwd))
        live.insert(0, ft)

        worker_log.write_text(SAMPLE.format(msg="before-outage"))
        ok = await _wait_for(lambda: master_log.exists() and b"before-outage" in master_log.read_bytes())
        assert ok, "phase-1 line never reached master"

        # --- outage -----------------------------------------------------------
        stop1.set()
        # lt1 is reaped right here; drop it from the shutdown list so it is
        # not awaited a second time (awaiting a cancelled task would raise).
        live.remove(lt1)
        try:
            await asyncio.wait_for(lt1, timeout=5)
        except asyncio.TimeoutError:
            lt1.cancel()

        # While listener is down, append another line. Forwarder will retry.
        with open(worker_log, "a", encoding="utf-8") as f:
            f.write(SAMPLE.format(msg="during-outage"))

        await asyncio.sleep(0.3)

        # --- phase 2: listener back ------------------------------------------
        lt2 = asyncio.create_task(lst.run_listener(listener_cfg, stop_event=stop2))
        live.append(lt2)

        ok = await _wait_for(lambda: b"during-outage" in master_log.read_bytes(), timeout=15.0)
        assert ok, "forwarder never reshipped the buffered line after listener restart"

        # Crucially, "before-outage" appears exactly once — not re-shipped.
        body = master_log.read_text()
        assert body.count("before-outage") == 1, "forwarder duplicated a line across reconnect"
        assert body.count("during-outage") == 1
    finally:
        # --- shutdown ---------------------------------------------------------
        stop_fwd.set()
        stop1.set()
        stop2.set()
        for t in live:
            try:
                await asyncio.wait_for(t, timeout=5)
            except asyncio.TimeoutError:
                t.cancel()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_listener_tolerates_client_dropping_mid_stream(
    tmp_path: pathlib.Path, _pki_env: dict
) -> None:
    """An authenticated client that announces a frame and then disconnects
    must neither kill the listener nor block later connections."""

    async def _close_quietly(writer: asyncio.StreamWriter) -> None:
        # Best-effort close: errors while closing are deliberately ignored.
        writer.close()
        try:
            await writer.wait_closed()
        except Exception:  # nosec B110
            pass

    port = _free_port()
    master_log = tmp_path / "master.log"
    master_json = tmp_path / "master.json"
    listener_cfg = lst.ListenerConfig(
        log_path=master_log,
        json_path=master_json,
        bind_host="127.0.0.1",
        bind_port=port,
        ca_dir=_pki_env["ca_dir"],
    )
    stop = asyncio.Event()
    listener_task = asyncio.create_task(lst.run_listener(listener_cfg, stop_event=stop))
    await asyncio.sleep(0.2)

    try:
        # mTLS client context built from the worker bundle on disk.
        bundle = _pki_env["worker_dir"]
        tls = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
        tls.load_cert_chain(str(bundle / "worker.crt"), str(bundle / "worker.key"))
        tls.load_verify_locations(cafile=str(bundle / "ca.crt"))
        tls.verify_mode = ssl.CERT_REQUIRED
        tls.check_hostname = False

        # Client 1: announce a 99-byte frame ("99 ") and hang up without
        # sending any payload.
        _reader, bad_writer = await asyncio.open_connection("127.0.0.1", port, ssl=tls)
        bad_writer.write(b"99 ")  # promise 99 bytes, send 0
        await bad_writer.drain()
        await _close_quietly(bad_writer)

        # Client 2: a well-formed frame must still get through afterwards.
        _reader2, good_writer = await asyncio.open_connection("127.0.0.1", port, ssl=tls)
        payload = b'<13>1 2026-04-18T00:00:00Z decky01 svc - - - post-drop'
        good_writer.write(f"{len(payload)} ".encode() + payload)
        await good_writer.drain()
        await _close_quietly(good_writer)

        delivered = await _wait_for(
            lambda: master_log.exists() and b"post-drop" in master_log.read_bytes()
        )
        assert delivered, "listener got wedged by a mid-frame client drop"
    finally:
        stop.set()
        try:
            await asyncio.wait_for(listener_task, timeout=5)
        except asyncio.TimeoutError:
            listener_task.cancel()
|
||||
300
tests/swarm/test_heartbeat.py
Normal file
300
tests/swarm/test_heartbeat.py
Normal file
@@ -0,0 +1,300 @@
|
||||
"""Tests for POST /swarm/heartbeat — cert pinning + shard snapshot refresh."""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import pathlib
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from decnet.web.db.factory import get_repository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm import api_heartbeat as hb_mod
|
||||
|
||||
|
||||
# ------------------------- shared fixtures (mirror test_swarm_api.py) ---
|
||||
|
||||
|
||||
@pytest.fixture
def ca_dir(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch) -> pathlib.Path:
    """Point ``pki.DEFAULT_CA_DIR`` at a per-test directory and return it.

    The ``pki`` attribute of the swarm client and enroll-host modules is also
    re-set to the same ``pki`` module object.
    """
    from decnet.swarm import pki
    from decnet.swarm import client as swarm_client
    from decnet.web.router.swarm import api_enroll_host as enroll_mod

    target = tmp_path / "ca"
    monkeypatch.setattr(pki, "DEFAULT_CA_DIR", target)
    for module in (swarm_client, enroll_mod):
        monkeypatch.setattr(module, "pki", pki)
    return target
|
||||
|
||||
|
||||
@pytest.fixture
def repo(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch):
    """Build a repository backed by ``hb.db`` under ``tmp_path`` and install
    it as the ``repo`` attribute of the dependencies and swarm-API modules."""
    db_file = tmp_path / "hb.db"
    repository = get_repository(db_path=str(db_file))

    import decnet.web.dependencies as deps
    import decnet.web.swarm_api as swarm_api_mod

    for module in (deps, swarm_api_mod):
        monkeypatch.setattr(module, "repo", repository)
    return repository
|
||||
|
||||
|
||||
@pytest.fixture
def client(repo, ca_dir: pathlib.Path):
    """Yield a ``TestClient`` for the swarm API with ``get_repo`` overridden
    to return the per-test ``repo`` fixture.

    The override is removed in a ``finally`` block so it cannot leak into
    later tests if entering or leaving the ``TestClient`` context raises.
    """
    from decnet.web.swarm_api import app

    async def _override() -> Any:
        return repo

    app.dependency_overrides[get_repo] = _override
    try:
        with TestClient(app) as c:
            yield c
    finally:
        app.dependency_overrides.clear()
|
||||
|
||||
|
||||
def _enroll(client: TestClient, name: str, address: str = "10.0.0.5") -> dict:
    """POST to ``/swarm/enroll`` and return the decoded JSON response.

    Asserts that the endpoint answered 201; agent port is fixed at 8765.
    """
    enrollment = {"name": name, "address": address, "agent_port": 8765}
    resp = client.post("/swarm/enroll", json=enrollment)
    assert resp.status_code == 201, resp.text
    return resp.json()
|
||||
|
||||
|
||||
def _pin_fingerprint(monkeypatch: pytest.MonkeyPatch, fp: str | None) -> None:
    """Replace ``_extract_peer_fingerprint`` in the heartbeat endpoint module
    with a stub that always returns ``fp``, so no live TLS peer is needed."""

    def _fixed(scope):
        return fp

    monkeypatch.setattr(hb_mod, "_extract_peer_fingerprint", _fixed)
|
||||
|
||||
|
||||
def _status_body(deckies: list[dict], runtime: dict[str, dict]) -> dict:
    """Build a "deployed, swarm mode" status payload around the given
    ``deckies`` list and ``runtime`` mapping (both embedded as-is)."""
    status: dict = dict(
        deployed=True,
        mode="swarm",
        compose_path="/run/decnet/compose.yml",
    )
    status["deckies"] = deckies
    status["runtime"] = runtime
    return status
|
||||
|
||||
|
||||
def _decky_payload(name: str = "decky-01", ip: str = "10.0.0.50") -> dict:
    """Return a decky description dict for ``name`` / ``ip``.

    The hostname is derived as ``"<name>.lan"``; every other field is a fixed
    sample value.
    """
    hostname = f"{name}.lan"
    return dict(
        name=name,
        hostname=hostname,
        distro="debian-bookworm",
        ip=ip,
        services=["ssh"],
        base_image="debian:bookworm-slim",
        service_config={"ssh": {"port": 22}},
        mutate_interval=3600,
        last_mutated=0.0,
        archetype="generic",
        host_uuid=None,
    )
|
||||
|
||||
|
||||
# ------------------------- _extract_peer_fingerprint unit tests ---------
|
||||
|
||||
|
||||
def test_extract_primary_path_returns_fingerprint() -> None:
    """The first entry of ``extensions.tls.client_cert_chain`` is hashed with
    SHA-256 and returned as a hex digest."""
    der = b"\x30\x82test-cert-bytes"
    expected = hashlib.sha256(der).hexdigest()
    scope = {"extensions": {"tls": {"client_cert_chain": [der]}}}
    assert hb_mod._extract_peer_fingerprint(scope) == expected
|
||||
|
||||
|
||||
def test_extract_fallback_path_when_primary_absent() -> None:
    """Without a TLS extension in the scope, the fingerprint is taken from the
    transport's ``ssl_object`` via ``getpeercert(binary_form=True)``."""
    der = b"\x30\x82fallback-bytes"
    ssl_obj = MagicMock(**{"getpeercert.return_value": der})
    transport = MagicMock(**{"get_extra_info.return_value": ssl_obj})

    fingerprint = hb_mod._extract_peer_fingerprint({"transport": transport})

    assert fingerprint == hashlib.sha256(der).hexdigest()
    transport.get_extra_info.assert_called_with("ssl_object")
    ssl_obj.getpeercert.assert_called_with(binary_form=True)
|
||||
|
||||
|
||||
def test_extract_returns_none_when_both_paths_empty() -> None:
    """An empty scope (no extensions, no transport) yields ``None`` — the
    fail-closed signal for the endpoint."""
    empty_scope: dict = {}
    assert hb_mod._extract_peer_fingerprint(empty_scope) is None
|
||||
|
||||
|
||||
def test_extract_returns_none_when_transport_ssl_object_missing() -> None:
    """A transport whose ``get_extra_info`` returns ``None`` yields ``None``."""
    transport = MagicMock(**{"get_extra_info.return_value": None})
    result = hb_mod._extract_peer_fingerprint({"transport": transport})
    assert result is None
|
||||
|
||||
|
||||
# ------------------------- endpoint behaviour --------------------------
|
||||
|
||||
|
||||
def test_heartbeat_happy_path_primary_extraction(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A heartbeat with the enrolled fingerprint returns 204, marks the host
    active and stores one shard for the reported decky."""
    host = _enroll(client, "worker-a")
    host_uuid = host["host_uuid"]
    _pin_fingerprint(monkeypatch, host["fingerprint"])

    status = _status_body(
        [_decky_payload("decky-01")],
        {"decky-01": {"running": True}},
    )
    resp = client.post(
        "/swarm/heartbeat",
        json={"host_uuid": host_uuid, "agent_version": "1.2.3", "status": status},
    )
    assert resp.status_code == 204, resp.text

    async def _verify() -> None:
        row = await repo.get_swarm_host_by_uuid(host_uuid)
        assert row["last_heartbeat"] is not None
        assert row["status"] == "active"

        shards = await repo.list_decky_shards(host_uuid)
        assert len(shards) == 1
        shard = shards[0]
        assert shard["last_seen"] is not None
        # The last three keys are the snapshot fields flattened by
        # list_decky_shards.
        expected = {
            "decky_name": "decky-01",
            "decky_ip": "10.0.0.50",
            "state": "running",
            "hostname": "decky-01.lan",
            "archetype": "generic",
            "service_config": {"ssh": {"port": 22}},
        }
        for key, want in expected.items():
            assert shard[key] == want

    asyncio.run(_verify())
|
||||
|
||||
|
||||
def test_heartbeat_fallback_extraction_path_also_accepted(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """The endpoint answers 204 whenever the extractor yields the enrolled
    fingerprint, regardless of which scope path supplied it."""
    # Guards against uvicorn-version drift where only the fallback slot is
    # populated. The extractor itself is stubbed here.
    host = _enroll(client, "worker-b", "10.0.0.6")
    _pin_fingerprint(monkeypatch, host["fingerprint"])

    heartbeat = {
        "host_uuid": host["host_uuid"],
        "status": {"deployed": False, "deckies": []},
    }
    resp = client.post("/swarm/heartbeat", json=heartbeat)
    assert resp.status_code == 204
|
||||
|
||||
|
||||
def test_heartbeat_unknown_host_returns_404(
    client: TestClient, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A heartbeat for a host UUID that was never enrolled returns 404."""
    _pin_fingerprint(monkeypatch, "a" * 64)
    heartbeat = {"host_uuid": "does-not-exist", "status": {"deployed": False}}
    resp = client.post("/swarm/heartbeat", json=heartbeat)
    assert resp.status_code == 404
|
||||
|
||||
|
||||
def test_heartbeat_fingerprint_mismatch_returns_403(
    client: TestClient, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A peer fingerprint that differs from the enrolled one is rejected with
    403 and a detail message containing "mismatch"."""
    host = _enroll(client, "worker-c", "10.0.0.7")
    wrong_fp = "b" * 64  # not the host's fingerprint
    _pin_fingerprint(monkeypatch, wrong_fp)

    heartbeat = {"host_uuid": host["host_uuid"], "status": {"deployed": False}}
    resp = client.post("/swarm/heartbeat", json=heartbeat)

    assert resp.status_code == 403
    assert "mismatch" in resp.json()["detail"]
|
||||
|
||||
|
||||
def test_heartbeat_no_peer_cert_fails_closed(
    client: TestClient, monkeypatch: pytest.MonkeyPatch
) -> None:
    """When no fingerprint can be extracted the endpoint fails closed: 403
    with "unavailable" in the detail, never a success."""
    host = _enroll(client, "worker-d", "10.0.0.8")
    _pin_fingerprint(monkeypatch, None)

    heartbeat = {"host_uuid": host["host_uuid"], "status": {"deployed": False}}
    resp = client.post("/swarm/heartbeat", json=heartbeat)

    assert resp.status_code == 403
    assert "unavailable" in resp.json()["detail"]
|
||||
|
||||
|
||||
def test_heartbeat_decommissioned_host_returns_404(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A heartbeat replayed after the host row was deleted returns 404.

    The fingerprint still matches what was issued at enrollment, but the
    host lookup no longer finds a row — this prevents ghost shards from a
    decommissioned worker.
    """
    host = _enroll(client, "worker-e", "10.0.0.9")
    host_uuid = host["host_uuid"]
    fingerprint = host["fingerprint"]

    async def _delete() -> None:
        deleted = await repo.delete_swarm_host(host_uuid)
        assert deleted

    asyncio.run(_delete())

    _pin_fingerprint(monkeypatch, fingerprint)
    heartbeat = {"host_uuid": host_uuid, "status": {"deployed": False}}
    resp = client.post("/swarm/heartbeat", json=heartbeat)
    assert resp.status_code == 404
|
||||
|
||||
|
||||
def test_heartbeat_deployed_false_bumps_host_but_writes_no_shards(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A ``deployed=False`` heartbeat records ``last_heartbeat`` on the host
    but leaves its shard list empty."""
    host = _enroll(client, "worker-f", "10.0.0.10")
    host_uuid = host["host_uuid"]
    _pin_fingerprint(monkeypatch, host["fingerprint"])

    heartbeat = {
        "host_uuid": host_uuid,
        "status": {"deployed": False, "deckies": []},
    }
    resp = client.post("/swarm/heartbeat", json=heartbeat)
    assert resp.status_code == 204

    async def _verify() -> None:
        row = await repo.get_swarm_host_by_uuid(host_uuid)
        assert row["last_heartbeat"] is not None
        assert await repo.list_decky_shards(host_uuid) == []

    asyncio.run(_verify())
|
||||
|
||||
|
||||
def test_heartbeat_decky_missing_from_runtime_is_degraded(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A decky listed in ``deckies`` but absent from ``runtime`` is stored
    with state "degraded"; one present and running is stored as "running"."""
    host = _enroll(client, "worker-g", "10.0.0.11")
    host_uuid = host["host_uuid"]
    _pin_fingerprint(monkeypatch, host["fingerprint"])

    deckies = [_decky_payload("decky-01"), _decky_payload("decky-02", "10.0.0.51")]
    runtime = {"decky-01": {"running": True}}  # decky-02 absent
    resp = client.post(
        "/swarm/heartbeat",
        json={"host_uuid": host_uuid, "status": _status_body(deckies, runtime)},
    )
    assert resp.status_code == 204

    async def _verify() -> None:
        shards = await repo.list_decky_shards(host_uuid)
        state_of = {shard["decky_name"]: shard["state"] for shard in shards}
        assert state_of["decky-01"] == "running"
        assert state_of["decky-02"] == "degraded"

    asyncio.run(_verify())
|
||||
224
tests/swarm/test_heartbeat_topology_resync.py
Normal file
224
tests/swarm/test_heartbeat_topology_resync.py
Normal file
@@ -0,0 +1,224 @@
|
||||
"""Heartbeat-driven topology resync: master flags divergent agents.
|
||||
|
||||
When an agent reports an applied_version_hash that differs from what
|
||||
master computed for the topology pinned to that host (or reports no
|
||||
topology at all while master expects one), the heartbeat handler must
|
||||
set ``needs_resync=True`` on the topology row. The mutator reconcile
|
||||
loop picks it up later — tested separately.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from decnet.topology.config import TopologyConfig
|
||||
from decnet.topology.generator import generate
|
||||
from decnet.topology.hashing import canonical_hash
|
||||
from decnet.topology.persistence import hydrate, persist, transition_status
|
||||
from decnet.topology.status import TopologyStatus
|
||||
from decnet.web.db.factory import get_repository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm import api_heartbeat as hb_mod
|
||||
|
||||
|
||||
@pytest.fixture
def ca_dir(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch) -> pathlib.Path:
    """Redirect ``pki.DEFAULT_CA_DIR`` to ``tmp_path / "ca"`` and return it.

    The swarm client and enroll-host modules get their ``pki`` attribute
    re-set to the same ``pki`` module object.
    """
    from decnet.swarm import pki
    from decnet.swarm import client as swarm_client
    from decnet.web.router.swarm import api_enroll_host as enroll_mod

    ca_path = tmp_path / "ca"
    patches = (
        (pki, "DEFAULT_CA_DIR", ca_path),
        (swarm_client, "pki", pki),
        (enroll_mod, "pki", pki),
    )
    for target, attr, value in patches:
        monkeypatch.setattr(target, attr, value)
    return ca_path
|
||||
|
||||
|
||||
@pytest.fixture
def repo(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch):
    """Create a repository on ``hb-resync.db`` under ``tmp_path`` and install
    it as ``repo`` on the dependencies and swarm-API modules."""
    repository = get_repository(db_path=str(tmp_path / "hb-resync.db"))

    import decnet.web.dependencies as deps
    import decnet.web.swarm_api as swarm_api_mod

    for module in (deps, swarm_api_mod):
        monkeypatch.setattr(module, "repo", repository)
    return repository
|
||||
|
||||
|
||||
@pytest.fixture
def client(repo, ca_dir):
    """Yield a ``TestClient`` for the swarm API whose ``get_repo`` dependency
    resolves to the per-test ``repo`` fixture.

    Overrides are cleared in a ``finally`` block so they cannot leak into
    later tests if entering or leaving the ``TestClient`` context raises.
    """
    from decnet.web.swarm_api import app

    async def _override() -> Any:
        return repo

    app.dependency_overrides[get_repo] = _override
    try:
        with TestClient(app) as c:
            yield c
    finally:
        app.dependency_overrides.clear()
|
||||
|
||||
|
||||
def _enroll(c: TestClient, name: str, address: str = "10.0.0.5") -> dict:
    """Enroll a host through ``POST /swarm/enroll`` and return the JSON body.

    Args:
        c: Test client bound to the swarm API app.
        name: Host name to enroll.
        address: Host address; defaults to the previously hard-coded
            ``"10.0.0.5"`` so existing callers are unaffected, while tests
            that enroll several hosts can give each its own address.

    Asserts that the endpoint answered 201.
    """
    r = c.post(
        "/swarm/enroll",
        json={"name": name, "address": address, "agent_port": 8765},
    )
    assert r.status_code == 201, r.text
    return r.json()
|
||||
|
||||
|
||||
def _cfg(**kw) -> TopologyConfig:
    """Return a small ``TopologyConfig`` with fixed seed; keyword arguments
    override the defaults below."""
    defaults = {
        "name": "hb-resync",
        "mode": "agent",
        "depth": 1,
        "branching_factor": 1,
        "deckies_per_lan_min": 1,
        "deckies_per_lan_max": 1,
        "cross_edge_probability": 0.0,
        "randomize_services": False,
        "services_explicit": ["ssh"],
        "seed": 3,
    }
    return TopologyConfig(**{**defaults, **kw})
|
||||
|
||||
|
||||
async def _persist_active(repo, host_uuid: str) -> tuple[str, str]:
    """Persist a generated plan pinned to ``host_uuid`` and move it to ACTIVE.

    The topology is transitioned through DEPLOYING first. Returns the
    topology id together with the canonical hash of the hydrated topology.
    """
    tid = await persist(repo, generate(_cfg()), target_host_uuid=host_uuid)
    for status in (TopologyStatus.DEPLOYING, TopologyStatus.ACTIVE):
        await transition_status(repo, tid, status)
    return tid, canonical_hash(await hydrate(repo, tid))
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_heartbeat_matching_hash_does_not_flag(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """When the agent reports the hash master computed, ``needs_resync``
    stays ``False``."""
    host = _enroll(client, "worker-match")
    monkeypatch.setattr(hb_mod, "_extract_peer_fingerprint", lambda s: host["fingerprint"])
    tid, expected = await _persist_active(repo, host["host_uuid"])

    report = {
        "topology_id": tid,
        "applied_version_hash": expected,
        "observed": {"bridges": [], "containers": []},
    }
    resp = client.post(
        "/swarm/heartbeat",
        json={
            "host_uuid": host["host_uuid"],
            "status": {"deployed": False},
            "topology": report,
        },
    )
    assert resp.status_code == 204, resp.text

    topology_row = await repo.get_topology(tid)
    assert topology_row["needs_resync"] is False
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_heartbeat_hash_mismatch_flags_resync(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A reported hash that differs from master's sets ``needs_resync``."""
    host = _enroll(client, "worker-drift")
    monkeypatch.setattr(hb_mod, "_extract_peer_fingerprint", lambda s: host["fingerprint"])
    tid, _ = await _persist_active(repo, host["host_uuid"])

    stale_hash = "stale-hash-" + "0" * 40
    report = {
        "topology_id": tid,
        "applied_version_hash": stale_hash,
        "observed": {"bridges": [], "containers": []},
    }
    resp = client.post(
        "/swarm/heartbeat",
        json={
            "host_uuid": host["host_uuid"],
            "status": {"deployed": False},
            "topology": report,
        },
    )
    assert resp.status_code == 204, resp.text

    topology_row = await repo.get_topology(tid)
    assert topology_row["needs_resync"] is True
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_heartbeat_agent_reports_no_topology_flags_resync(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Fresh-boot / wiped-cache case: the agent reports ``null`` for both
    topology id and hash while master has an ACTIVE topology pinned to the
    host, so the topology is flagged for re-push."""
    host = _enroll(client, "worker-fresh")
    monkeypatch.setattr(hb_mod, "_extract_peer_fingerprint", lambda s: host["fingerprint"])
    tid, _ = await _persist_active(repo, host["host_uuid"])

    empty_report = {
        "topology_id": None,
        "applied_version_hash": None,
        "observed": {"bridges": [], "containers": []},
    }
    resp = client.post(
        "/swarm/heartbeat",
        json={
            "host_uuid": host["host_uuid"],
            "status": {"deployed": False},
            "topology": empty_report,
        },
    )
    assert resp.status_code == 204, resp.text

    topology_row = await repo.get_topology(tid)
    assert topology_row["needs_resync"] is True
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_heartbeat_without_topology_block_is_noop_for_resync(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A heartbeat with no ``topology`` block at all (legacy agents) is still
    accepted, and is treated like "no topology reported": the ACTIVE
    topology pinned to the host gets flagged.

    Note that the test name says "noop", but the assertion below expects
    ``needs_resync`` to be set.
    """
    host = _enroll(client, "worker-legacy")
    monkeypatch.setattr(hb_mod, "_extract_peer_fingerprint", lambda s: host["fingerprint"])
    tid, _ = await _persist_active(repo, host["host_uuid"])

    heartbeat = {"host_uuid": host["host_uuid"], "status": {"deployed": False}}
    resp = client.post("/swarm/heartbeat", json=heartbeat)
    assert resp.status_code == 204, resp.text

    topology_row = await repo.get_topology(tid)
    # No topology block means the agent reported nothing → flag.
    assert topology_row["needs_resync"] is True
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_heartbeat_other_host_topology_unaffected(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Reports from one host must not flip resync flags on another
    host's topologies.

    Host A owns the ACTIVE topology; host B sends a heartbeat reporting no
    topology. A's topology must stay unflagged.
    """
    host_a = _enroll(client, "worker-a")
    # Host B needs a different address, so it is enrolled with an explicit
    # request. Check the status here so a failed enrollment is reported as
    # such rather than as a KeyError on "fingerprint" further down.
    enroll_b = client.post(
        "/swarm/enroll",
        json={"name": "worker-b", "address": "10.0.0.6", "agent_port": 8765},
    )
    assert enroll_b.status_code == 201, enroll_b.text
    host_b = enroll_b.json()
    monkeypatch.setattr(hb_mod, "_extract_peer_fingerprint", lambda s: host_b["fingerprint"])
    tid_a, _hash_a = await _persist_active(repo, host_a["host_uuid"])

    resp = client.post(
        "/swarm/heartbeat",
        json={
            "host_uuid": host_b["host_uuid"],
            "status": {"deployed": False},
            "topology": {
                "topology_id": None,
                "applied_version_hash": None,
                "observed": {},
            },
        },
    )
    assert resp.status_code == 204, resp.text
    row = await repo.get_topology(tid_a)
    assert row["needs_resync"] is False
|
||||
282
tests/swarm/test_log_forwarder.py
Normal file
282
tests/swarm/test_log_forwarder.py
Normal file
@@ -0,0 +1,282 @@
|
||||
"""Tests for the syslog-over-TLS pipeline.
|
||||
|
||||
Covers:
|
||||
* octet-counted framing encode/decode (pure functions);
|
||||
* offset persistence across reopens;
|
||||
* end-to-end mTLS roundtrip forwarder → listener;
|
||||
* impostor-CA worker is rejected at TLS handshake.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import pathlib
|
||||
import socket
|
||||
|
||||
import pytest
|
||||
import ssl
|
||||
|
||||
from decnet.swarm import log_forwarder as fwd
|
||||
from decnet.swarm import log_listener as lst
|
||||
from decnet.swarm import pki
|
||||
from decnet.swarm.client import ensure_master_identity
|
||||
|
||||
|
||||
def _free_port() -> int:
    """Return a TCP port number that was unused on 127.0.0.1 when probed.

    Binding to port 0 lets the OS choose a free port. The probe socket lives
    in a ``with`` block so it is closed even if ``bind`` or ``getsockname``
    raises. The port is released before returning, so it is only "free" as of
    the probe.
    """
    with socket.socket() as probe:
        probe.bind(("127.0.0.1", 0))
        return probe.getsockname()[1]
|
||||
|
||||
|
||||
# ------------------------------------------------------------ pure framing
|
||||
|
||||
|
||||
def test_encode_frame_matches_rfc5425_shape() -> None:
    """``encode_frame`` emits ``"<len> <msg>"``: ASCII digits, one space,
    then the payload, with the prefix equal to the payload's byte length."""
    frame = fwd.encode_frame("<13>1 2026-04-18T00:00:00Z decky01 svc - - - hi")
    assert frame.startswith(b"47 ")
    assert frame.endswith(b"hi")
    declared, body = frame.split(b" ", 1)
    assert int(declared) == len(body)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_read_frame_roundtrip() -> None:
    """A frame produced by ``encode_frame`` is decoded back to the original
    payload bytes by ``read_frame``."""
    payload = b"<13>1 2026-04-18T00:00:00Z host app - - - msg"
    stream = asyncio.StreamReader()
    stream.feed_data(fwd.encode_frame(payload.decode()))
    stream.feed_eof()
    decoded = await fwd.read_frame(stream)
    assert decoded == payload
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_read_frame_rejects_bad_prefix() -> None:
    """A non-numeric length prefix makes ``read_frame`` raise ``ValueError``."""
    stream = asyncio.StreamReader()
    stream.feed_data(b"NOTANUMBER msg")
    stream.feed_eof()
    with pytest.raises(ValueError):
        await fwd.read_frame(stream)
|
||||
|
||||
|
||||
# ------------------------------------------------------------- offset store
|
||||
|
||||
|
||||
def test_offset_store_persists_across_reopen(tmp_path: pathlib.Path) -> None:
    """An offset written through one ``_OffsetStore`` is read back by a new
    store opened on the same database file; a fresh store starts at 0."""
    db_file = tmp_path / "fwd.db"

    first = fwd._OffsetStore(db_file)
    assert first.get() == 0
    first.set(4242)
    first.close()

    reopened = fwd._OffsetStore(db_file)
    assert reopened.get() == 4242
    reopened.close()
|
||||
|
||||
|
||||
# ------------------------------------------------------------ TLS roundtrip
|
||||
|
||||
|
||||
@pytest.fixture
def _pki_env(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch):
    """Build a throwaway CA, master identity and worker bundle under tmp_path.

    Also redirects ``pki.DEFAULT_CA_DIR`` and ``pki.DEFAULT_AGENT_DIR`` to the
    temporary directories.  Returns a dict with the keys ``ca_dir`` and
    ``worker_dir``.
    """
    ca_dir = tmp_path / "ca"
    worker_dir = tmp_path / "agent"

    pki.ensure_ca(ca_dir)
    # Master identity (also used as listener server cert).  The return value
    # is not needed here; per the original note, the default master cert
    # already carries a 127.0.0.1 SAN, and the tests leave check_hostname off.
    ensure_master_identity(ca_dir)

    # Worker bundle — enrolled with 127.0.0.1 SAN.
    worker_cert = pki.issue_worker_cert(pki.load_ca(ca_dir), "worker-x", ["127.0.0.1"])
    pki.write_worker_bundle(worker_cert, worker_dir)

    for attr, value in (("DEFAULT_CA_DIR", ca_dir), ("DEFAULT_AGENT_DIR", worker_dir)):
        monkeypatch.setattr(pki, attr, value)
    return {"ca_dir": ca_dir, "worker_dir": worker_dir}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_forwarder_to_listener_roundtrip(
    tmp_path: pathlib.Path, _pki_env: dict
) -> None:
    """Lines appended to the worker log arrive in the master log over mTLS.

    Starts a listener and a forwarder on a loopback port, appends three
    syslog lines to the worker log, and checks that all three reach the
    master log and that the JSON sink mentions the worker's name.
    """
    port = _free_port()
    worker_log = tmp_path / "decnet.log"
    worker_log.write_text("")  # create empty

    master_log = tmp_path / "master.log"
    master_json = tmp_path / "master.json"

    listener_cfg = lst.ListenerConfig(
        log_path=master_log,
        json_path=master_json,
        bind_host="127.0.0.1",
        bind_port=port,
        ca_dir=_pki_env["ca_dir"],
    )
    fwd_cfg = fwd.ForwarderConfig(
        log_path=worker_log,
        master_host="127.0.0.1",
        master_port=port,
        agent_dir=_pki_env["worker_dir"],
        state_db=tmp_path / "fwd.db",
    )
    stop = asyncio.Event()

    listener_task = asyncio.create_task(lst.run_listener(listener_cfg, stop_event=stop))
    await asyncio.sleep(0.2)  # wait for bind

    forwarder_task = asyncio.create_task(
        fwd.run_forwarder(fwd_cfg, poll_interval=0.05, stop_event=stop)
    )

    # Write a few RFC 5424-ish lines into the worker log.
    sample = (
        '<13>1 2026-04-18T00:00:00Z decky01 ssh-service 1 - '
        '[decnet@53595 decky="decky01" service="ssh-service" event_type="connect" '
        'attacker_ip="1.2.3.4" attacker_port="4242"] ssh connect\n'
    )
    with open(worker_log, "a", encoding="utf-8") as f:
        for _ in range(3):
            f.write(sample)

    # Poll for delivery on the master side.  Wait until ALL three lines have
    # landed: stopping as soon as the file is non-empty would race the
    # shutdown below against lines still in flight and make the count
    # assertion flaky.
    for _ in range(50):
        if master_log.exists() and master_log.read_text().count("ssh connect") >= 3:
            break
        await asyncio.sleep(0.1)

    stop.set()
    for t in (forwarder_task, listener_task):
        try:
            await asyncio.wait_for(t, timeout=5)
        except asyncio.TimeoutError:
            t.cancel()

    assert master_log.exists()
    body = master_log.read_text()
    assert body.count("ssh connect") == 3
    # Worker provenance tagged in the JSON sink.
    assert master_json.exists()
    assert "worker-x" in master_json.read_text()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_forwarder_resumes_from_persisted_offset(
    tmp_path: pathlib.Path, _pki_env: dict
) -> None:
    """The forwarder honours a persisted offset and ships only newer lines.

    The offset store is seeded as if an earlier run had already delivered the
    first two lines of the worker log; only the line appended after startup
    may show up in the master log.
    """
    port = _free_port()
    worker_log = tmp_path / "decnet.log"
    master_log = tmp_path / "master.log"
    master_json = tmp_path / "master.json"
    state_db = tmp_path / "fwd.db"

    # Two already-delivered lines plus a matching offset in the state DB.
    # The new run must NOT re-ship them.
    stale_line = '<13>1 2026-04-18T00:00:00Z decky01 svc 1 - [x] old\n'
    worker_log.write_text(stale_line * 2)
    offset_store = fwd._OffsetStore(state_db)
    offset_store.set(len(stale_line) * 2)
    offset_store.close()

    listener_cfg = lst.ListenerConfig(
        log_path=master_log,
        json_path=master_json,
        bind_host="127.0.0.1",
        bind_port=port,
        ca_dir=_pki_env["ca_dir"],
    )
    fwd_cfg = fwd.ForwarderConfig(
        log_path=worker_log,
        master_host="127.0.0.1",
        master_port=port,
        agent_dir=_pki_env["worker_dir"],
        state_db=state_db,
    )
    stop = asyncio.Event()
    listener_task = asyncio.create_task(
        lst.run_listener(listener_cfg, stop_event=stop)
    )
    await asyncio.sleep(0.2)
    forwarder_task = asyncio.create_task(
        fwd.run_forwarder(fwd_cfg, poll_interval=0.05, stop_event=stop)
    )

    # Append a NEW line after startup — only this should reach the master.
    fresh_line = '<13>1 2026-04-18T00:00:01Z decky01 svc 1 - [x] fresh\n'
    with open(worker_log, "a", encoding="utf-8") as handle:
        handle.write(fresh_line)

    attempts = 0
    while attempts < 50:
        if master_log.exists() and b"fresh" in master_log.read_bytes():
            break
        await asyncio.sleep(0.1)
        attempts += 1

    stop.set()
    for task in (forwarder_task, listener_task):
        try:
            await asyncio.wait_for(task, timeout=5)
        except asyncio.TimeoutError:
            task.cancel()

    delivered = master_log.read_text()
    assert "fresh" in delivered
    assert "old" not in delivered, "forwarder re-shipped lines already acked before restart"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_impostor_worker_rejected_at_tls(
    tmp_path: pathlib.Path, _pki_env: dict
) -> None:
    """A worker whose cert chains to a foreign CA must not get data logged.

    The test passes if the connection attempt fails outright, or if it appears
    to succeed from the client side but the master log ends up missing or
    empty.
    """
    port = _free_port()
    master_log = tmp_path / "master.log"
    master_json = tmp_path / "master.json"
    listener_cfg = lst.ListenerConfig(
        log_path=master_log,
        json_path=master_json,
        bind_host="127.0.0.1",
        bind_port=port,
        ca_dir=_pki_env["ca_dir"],
    )
    stop = asyncio.Event()
    listener_task = asyncio.create_task(lst.run_listener(listener_cfg, stop_event=stop))
    await asyncio.sleep(0.2)

    try:
        # Build a forwarder SSL context from a DIFFERENT CA — should be rejected.
        evil_ca = pki.generate_ca("Evil CA")
        evil_dir = tmp_path / "evil"
        pki.write_worker_bundle(
            pki.issue_worker_cert(evil_ca, "evil-worker", ["127.0.0.1"]), evil_dir
        )

        ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
        ctx.load_cert_chain(str(evil_dir / "worker.crt"), str(evil_dir / "worker.key"))
        ctx.load_verify_locations(cafile=str(evil_dir / "ca.crt"))
        ctx.verify_mode = ssl.CERT_REQUIRED
        ctx.check_hostname = False

        rejected = False
        try:
            _reader, writer = await asyncio.open_connection("127.0.0.1", port, ssl=ctx)
            # If TLS somehow succeeded, push a byte and expect the server to drop.
            writer.write(b"5 hello")
            await writer.drain()
            # If the server accepted this from an unknown CA, that's a failure.
            await asyncio.sleep(0.2)
            writer.close()
            try:
                await writer.wait_closed()
            except Exception:
                # Best-effort close: the peer may already have torn the
                # connection down, which is the outcome we want anyway.
                pass
        except (ssl.SSLError, OSError, ConnectionError):
            rejected = True

        # The listener may never have created the log file at all; calling
        # stat() unconditionally would then raise FileNotFoundError and fail
        # the test even though nothing was logged.
        nothing_logged = not master_log.exists() or master_log.stat().st_size == 0
        assert rejected or nothing_logged, (
            "impostor connection must be rejected or produce no log lines"
        )
    finally:
        stop.set()
        try:
            await asyncio.wait_for(listener_task, timeout=5)
        except asyncio.TimeoutError:
            listener_task.cancel()
|
||||
213
tests/swarm/test_pki.py
Normal file
213
tests/swarm/test_pki.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""PKI roundtrip tests for the DECNET swarm CA."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
import ssl
|
||||
import threading
|
||||
import socket
|
||||
import time
|
||||
|
||||
import pytest
|
||||
from cryptography import x509
|
||||
|
||||
from decnet.swarm import pki
|
||||
|
||||
|
||||
def test_ensure_ca_is_idempotent(tmp_path: pathlib.Path) -> None:
    """Calling ensure_ca twice on the same directory yields the same key and cert."""
    target = tmp_path / "ca"
    initial, repeat = pki.ensure_ca(target), pki.ensure_ca(target)
    assert initial.key_pem == repeat.key_pem
    assert initial.cert_pem == repeat.cert_pem
|
||||
|
||||
|
||||
def test_issue_worker_cert_signed_by_ca(tmp_path: pathlib.Path) -> None:
    """A worker cert names the CA as issuer and carries the requested SANs."""
    ca = pki.ensure_ca(tmp_path / "ca")
    issued = pki.issue_worker_cert(ca, "worker-01", ["127.0.0.1", "worker-01"])

    worker_cert = x509.load_pem_x509_certificate(issued.cert_pem)
    issuer_cert = x509.load_pem_x509_certificate(ca.cert_pem)
    assert worker_cert.issuer == issuer_cert.subject

    # Both the hostname and the IP we supplied should be present in the SAN.
    san = worker_cert.extensions.get_extension_for_class(
        x509.SubjectAlternativeName
    ).value
    san_dns = set(san.get_values_for_type(x509.DNSName))
    san_ips = {str(addr) for addr in san.get_values_for_type(x509.IPAddress)}
    assert "worker-01" in san_dns
    assert "127.0.0.1" in san_ips
|
||||
|
||||
|
||||
def test_worker_bundle_roundtrip(tmp_path: pathlib.Path) -> None:
    """A written worker bundle loads back with the same fingerprint and a 0600 key."""
    agent_dir = tmp_path / "agent"
    ca = pki.ensure_ca(tmp_path / "ca")
    issued = pki.issue_worker_cert(ca, "worker-02", ["127.0.0.1"])
    pki.write_worker_bundle(issued, agent_dir)

    # File perms: worker.key must not be world-readable.
    key_mode = (agent_dir / "worker.key").stat().st_mode & 0o777
    assert key_mode == 0o600

    reloaded = pki.load_worker_bundle(agent_dir)
    assert reloaded is not None
    assert reloaded.fingerprint_sha256 == issued.fingerprint_sha256
|
||||
|
||||
|
||||
def test_load_worker_bundle_returns_none_if_missing(tmp_path: pathlib.Path) -> None:
    """load_worker_bundle returns None for a directory that does not exist."""
    missing_dir = tmp_path / "empty"
    bundle = pki.load_worker_bundle(missing_dir)
    assert bundle is None
|
||||
|
||||
|
||||
def test_ensure_swarmctl_cert_issues_from_same_ca(tmp_path: pathlib.Path) -> None:
    """ensure_swarmctl_cert writes a CA-signed server cert with the expected SANs."""
    paths = pki.ensure_swarmctl_cert(
        "0.0.0.0",
        ca_dir=tmp_path / "ca",
        swarmctl_dir=tmp_path / "swarmctl",
    )
    cert_path, key_path, ca_path = paths
    assert cert_path.exists() and key_path.exists() and ca_path.exists()

    # Server cert is signed by the same CA that workers will ship — that's
    # the whole point of the auto-issue path.
    server_cert = x509.load_pem_x509_certificate(cert_path.read_bytes())
    issuer_cert = x509.load_pem_x509_certificate(ca_path.read_bytes())
    assert server_cert.issuer == issuer_cert.subject

    san = server_cert.extensions.get_extension_for_class(
        x509.SubjectAlternativeName
    ).value
    san_ips = {str(addr) for addr in san.get_values_for_type(x509.IPAddress)}
    san_dns = set(san.get_values_for_type(x509.DNSName))
    assert "0.0.0.0" in san_ips
    assert "localhost" in san_dns

    # Key perm is the same 0600 we enforce on worker.key.
    key_mode = key_path.stat().st_mode & 0o777
    assert key_mode == 0o600
|
||||
|
||||
|
||||
def test_ensure_swarmctl_cert_is_idempotent(tmp_path: pathlib.Path) -> None:
    """A second ensure_swarmctl_cert call leaves the existing cert bytes untouched.

    Re-issuing on every call would rotate the server cert whenever swarmctl
    restarts and break any worker mid-TLS-session.
    """
    dirs = {"ca_dir": tmp_path / "ca", "swarmctl_dir": tmp_path / "swarmctl"}

    cert_before = pki.ensure_swarmctl_cert("0.0.0.0", **dirs)[0].read_bytes()
    cert_after = pki.ensure_swarmctl_cert("0.0.0.0", **dirs)[0].read_bytes()
    assert cert_after == cert_before
|
||||
|
||||
|
||||
def test_fingerprint_stable_across_calls(tmp_path: pathlib.Path) -> None:
    """pki.fingerprint of an issued cert equals the fingerprint stored at issue time."""
    authority = pki.ensure_ca(tmp_path / "ca")
    worker = pki.issue_worker_cert(authority, "worker-03", ["127.0.0.1"])
    recomputed = pki.fingerprint(worker.cert_pem)
    assert recomputed == worker.fingerprint_sha256
|
||||
|
||||
|
||||
def test_mtls_handshake_round_trip(tmp_path: pathlib.Path) -> None:
    """End-to-end mutual-TLS handshake between two certs from the same CA.

    Issues two worker certs from one CA, runs one as a TLS server in a
    background thread and the other as a TLS client, and confirms that the
    handshake succeeds and the server sees the client's certificate.
    """
    ca = pki.ensure_ca(tmp_path / "ca")
    srv_dir = tmp_path / "srv"
    cli_dir = tmp_path / "cli"
    pki.write_worker_bundle(
        pki.issue_worker_cert(ca, "srv", ["127.0.0.1"]), srv_dir
    )
    pki.write_worker_bundle(
        pki.issue_worker_cert(ca, "cli", ["127.0.0.1"]), cli_dir
    )

    server_ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
    server_ctx.load_cert_chain(str(srv_dir / "worker.crt"), str(srv_dir / "worker.key"))
    server_ctx.load_verify_locations(cafile=str(srv_dir / "ca.crt"))
    server_ctx.verify_mode = ssl.CERT_REQUIRED  # demand a client certificate

    client_ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    client_ctx.load_cert_chain(str(cli_dir / "worker.crt"), str(cli_dir / "worker.key"))
    client_ctx.load_verify_locations(cafile=str(cli_dir / "ca.crt"))
    client_ctx.check_hostname = False  # SAN matches IP, not hostname
    client_ctx.verify_mode = ssl.CERT_REQUIRED

    result: dict[str, object] = {}

    # The context manager closes the listening socket even when the client
    # side below raises, so a failing handshake does not leak the descriptor.
    with socket.socket() as sock:
        sock.bind(("127.0.0.1", 0))
        sock.listen(1)
        port = sock.getsockname()[1]

        def _serve() -> None:
            # Accept one connection and record either the peer cert or the error.
            try:
                conn, _ = sock.accept()
                with server_ctx.wrap_socket(conn, server_side=True) as tls:
                    result["peer_cert"] = tls.getpeercert()
                    tls.sendall(b"ok")
            except Exception as exc:  # noqa: BLE001
                result["error"] = repr(exc)

        t = threading.Thread(target=_serve, daemon=True)
        t.start()
        time.sleep(0.05)

        with socket.create_connection(("127.0.0.1", port)) as raw:
            with client_ctx.wrap_socket(raw, server_hostname="127.0.0.1") as tls:
                assert tls.recv(2) == b"ok"

        t.join(timeout=2)

    assert "error" not in result, result.get("error")
    assert result.get("peer_cert"), "server did not receive client cert"
|
||||
|
||||
|
||||
def test_unauthenticated_client_rejected(tmp_path: pathlib.Path) -> None:
    """A client presenting a cert from a DIFFERENT CA must be rejected.

    The server trusts only the good CA; the client presents a cert issued by
    an unrelated CA.  The test passes if the failure is observed on either
    side of the connection.
    """
    good_ca = pki.ensure_ca(tmp_path / "good-ca")
    evil_ca = pki.generate_ca("Evil CA")

    srv_dir = tmp_path / "srv"
    pki.write_worker_bundle(
        pki.issue_worker_cert(good_ca, "srv", ["127.0.0.1"]), srv_dir
    )

    evil_dir = tmp_path / "evil"
    pki.write_worker_bundle(
        pki.issue_worker_cert(evil_ca, "evil", ["127.0.0.1"]), evil_dir
    )

    server_ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
    server_ctx.load_cert_chain(str(srv_dir / "worker.crt"), str(srv_dir / "worker.key"))
    server_ctx.load_verify_locations(cafile=str(srv_dir / "ca.crt"))
    server_ctx.verify_mode = ssl.CERT_REQUIRED

    client_ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    client_ctx.load_cert_chain(str(evil_dir / "worker.crt"), str(evil_dir / "worker.key"))
    # The evil client trusts the GOOD CA (the ca.crt from the server's
    # bundle), so the server cert verifies from its side; the server-side
    # rejection of the client cert is what we are asserting.
    client_ctx.load_verify_locations(cafile=str(srv_dir / "ca.crt"))
    client_ctx.check_hostname = False
    client_ctx.verify_mode = ssl.CERT_REQUIRED

    errors: list[str] = []

    # The context manager closes the listening socket on every exit path.
    with socket.socket() as sock:
        sock.bind(("127.0.0.1", 0))
        sock.listen(1)
        port = sock.getsockname()[1]

        def _serve() -> None:
            # Record any server-side failure; ssl.SSLError is an Exception
            # subclass, so a single handler covers it.
            try:
                conn, _ = sock.accept()
                with server_ctx.wrap_socket(conn, server_side=True):
                    pass
            except Exception as exc:  # noqa: BLE001
                errors.append(repr(exc))

        t = threading.Thread(target=_serve, daemon=True)
        t.start()
        time.sleep(0.05)

        # Rejection may surface on either side (SSL alert on the server closes
        # the socket — client may see SSLError, ConnectionResetError, or EOF).
        handshake_failed = False
        try:
            with socket.create_connection(("127.0.0.1", port)) as raw:
                with client_ctx.wrap_socket(raw, server_hostname="127.0.0.1") as tls:
                    tls.do_handshake()
        except (ssl.SSLError, OSError):
            handshake_failed = True

        t.join(timeout=2)

    assert handshake_failed or errors, (
        "server should have rejected the evil-CA-signed client cert"
    )
|
||||
60
tests/swarm/test_state_schema.py
Normal file
60
tests/swarm/test_state_schema.py
Normal file
@@ -0,0 +1,60 @@
|
||||
"""Backward-compatibility tests for the SWARM state-schema extension.
|
||||
|
||||
DeckyConfig gained an optional ``host_uuid`` field in swarm mode. Existing
|
||||
state files (unihost) must continue to deserialize without change.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.models import DeckyConfig, DecnetConfig
|
||||
|
||||
|
||||
def _minimal_decky(name: str = "decky-01") -> dict:
|
||||
return {
|
||||
"name": name,
|
||||
"ip": "192.168.1.10",
|
||||
"services": ["ssh"],
|
||||
"distro": "debian",
|
||||
"base_image": "debian:bookworm-slim",
|
||||
"hostname": "decky01",
|
||||
}
|
||||
|
||||
|
||||
def test_decky_config_host_uuid_defaults_to_none() -> None:
    """A DeckyConfig built without ``host_uuid`` ends up with ``host_uuid=None``."""
    fields = _minimal_decky()
    decky = DeckyConfig(**fields)
    assert decky.host_uuid is None
|
||||
|
||||
|
||||
def test_decky_config_accepts_host_uuid() -> None:
    """An explicit ``host_uuid`` is stored on the DeckyConfig as given."""
    decky = DeckyConfig(host_uuid="host-uuid-abc", **_minimal_decky())
    assert decky.host_uuid == "host-uuid-abc"
|
||||
|
||||
|
||||
def test_decnet_config_mode_swarm_with_host_assignments() -> None:
    """Swarm-mode config where each decky is pinned to its own host_uuid."""
    assignments = (("decky-01", "host-A"), ("decky-02", "host-B"))
    deckies = [
        DeckyConfig(**_minimal_decky(decky_name), host_uuid=host)
        for decky_name, host in assignments
    ]
    config = DecnetConfig(
        mode="swarm",
        interface="eth0",
        subnet="192.168.1.0/24",
        gateway="192.168.1.1",
        deckies=deckies,
    )

    assert config.mode == "swarm"
    assigned_hosts = {decky.host_uuid for decky in config.deckies}
    assert assigned_hosts == {"host-A", "host-B"}
|
||||
|
||||
|
||||
def test_legacy_unihost_state_still_parses() -> None:
    """A state dict without any ``host_uuid`` key validates and yields host_uuid=None."""
    pre_swarm_state = dict(
        mode="unihost",
        interface="eth0",
        subnet="192.168.1.0/24",
        gateway="192.168.1.1",
        deckies=[_minimal_decky()],
    )
    parsed = DecnetConfig.model_validate(pre_swarm_state)
    assert parsed.mode == "unihost"
    first_decky = parsed.deckies[0]
    assert first_decky.host_uuid is None
|
||||
493
tests/swarm/test_swarm_api.py
Normal file
493
tests/swarm/test_swarm_api.py
Normal file
@@ -0,0 +1,493 @@
|
||||
"""Unit tests for the SWARM controller FastAPI app.
|
||||
|
||||
Covers the enrollment, host-management, and deployment dispatch routes.
|
||||
The AgentClient is stubbed so we exercise the controller's logic without
|
||||
a live mTLS peer (that path has its own roundtrip test).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from decnet.web.db.factory import get_repository
|
||||
from decnet.web.dependencies import get_repo
|
||||
|
||||
|
||||
@pytest.fixture
def ca_dir(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch) -> pathlib.Path:
    """Point the PKI default CA directory at a per-test temporary path.

    This keeps the test CA away from ``~/.decnet/ca``.  Returns the temporary
    CA directory path.
    """
    from decnet.swarm import pki

    test_ca = tmp_path / "ca"
    monkeypatch.setattr(pki, "DEFAULT_CA_DIR", test_ca)

    # client.py and the enroll router each hold an already-imported reference
    # to the pki module; patch those references as well.
    from decnet.swarm import client as swarm_client
    from decnet.web.router.swarm import api_enroll_host as enroll_mod

    for module in (swarm_client, enroll_mod):
        monkeypatch.setattr(module, "pki", pki)
    return test_ca
|
||||
|
||||
|
||||
@pytest.fixture
def repo(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch):
    """Create a repository backed by a temp DB and install it as the module singleton."""
    test_repo = get_repository(db_path=str(tmp_path / "swarm.db"))

    # The controller's lifespan initialises the module-level `repo` in
    # decnet.web.dependencies.  Replace that singleton (and the reference in
    # swarm_api) so schema creation targets the temp DB.
    import decnet.web.dependencies as deps
    import decnet.web.swarm_api as swarm_api_mod

    for module in (deps, swarm_api_mod):
        monkeypatch.setattr(module, "repo", test_repo)
    return test_repo
|
||||
|
||||
|
||||
@pytest.fixture
def client(repo, ca_dir: pathlib.Path):
    """Yield a TestClient for the swarm API whose ``get_repo`` dependency is the test repo."""
    from decnet.web.swarm_api import app

    async def _provide_test_repo() -> Any:
        return repo

    app.dependency_overrides[get_repo] = _provide_test_repo
    with TestClient(app) as test_client:
        yield test_client
    # Drop every dependency override once the test is done with the client.
    app.dependency_overrides.clear()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- /enroll
|
||||
|
||||
|
||||
def test_enroll_creates_host_and_returns_bundle(client: TestClient) -> None:
    """POST /swarm/enroll returns 201 with the host fields and PEM bundle."""
    enrollment = {"name": "worker-a", "address": "10.0.0.5", "agent_port": 8765}
    resp = client.post("/swarm/enroll", json=enrollment)
    assert resp.status_code == 201, resp.text

    bundle = resp.json()
    assert bundle["name"] == "worker-a"
    assert bundle["address"] == "10.0.0.5"
    assert "-----BEGIN CERTIFICATE-----" in bundle["worker_cert_pem"]
    assert "-----BEGIN PRIVATE KEY-----" in bundle["worker_key_pem"]
    assert "-----BEGIN CERTIFICATE-----" in bundle["ca_cert_pem"]
    # sha256 hex
    assert len(bundle["fingerprint"]) == 64
|
||||
|
||||
|
||||
def test_enroll_with_updater_issues_second_cert(client: TestClient, ca_dir) -> None:
    """Enrolling with ``issue_updater_bundle`` returns and persists a distinct updater cert."""
    enrollment = {
        "name": "worker-upd",
        "address": "10.0.0.99",
        "agent_port": 8765,
        "issue_updater_bundle": True,
    }
    resp = client.post("/swarm/enroll", json=enrollment)
    assert resp.status_code == 201, resp.text

    body = resp.json()
    assert body["updater"] is not None
    assert body["updater"]["fingerprint"] != body["fingerprint"]
    assert "-----BEGIN CERTIFICATE-----" in body["updater"]["updater_cert_pem"]
    assert "-----BEGIN PRIVATE KEY-----" in body["updater"]["updater_key_pem"]

    # Cert bundle persisted on master.
    updater_dir = ca_dir / "workers" / "worker-upd" / "updater"
    assert (updater_dir / "updater.crt").is_file()
    assert (updater_dir / "updater.key").is_file()

    # DB row carries the updater fingerprint.
    host_row = client.get(f"/swarm/hosts/{body['host_uuid']}").json()
    assert host_row.get("updater_cert_fingerprint") == body["updater"]["fingerprint"]
|
||||
|
||||
|
||||
def test_enroll_without_updater_omits_bundle(client: TestClient) -> None:
    """Without ``issue_updater_bundle`` the response's ``updater`` field is null."""
    enrollment = {"name": "worker-no-upd", "address": "10.0.0.98", "agent_port": 8765}
    resp = client.post("/swarm/enroll", json=enrollment)
    assert resp.status_code == 201
    assert resp.json()["updater"] is None
|
||||
|
||||
|
||||
def test_enroll_rejects_duplicate_name(client: TestClient) -> None:
    """Enrolling the same payload twice gives 201 the first time and 409 the second."""
    enrollment = {"name": "worker-dup", "address": "10.0.0.6", "agent_port": 8765}
    first = client.post("/swarm/enroll", json=enrollment)
    assert first.status_code == 201
    second = client.post("/swarm/enroll", json=enrollment)
    assert second.status_code == 409
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- /hosts
|
||||
|
||||
|
||||
def test_list_hosts_empty(client: TestClient) -> None:
    """GET /swarm/hosts returns 200 and an empty list before any enrollment."""
    listing = client.get("/swarm/hosts")
    assert listing.status_code == 200
    assert listing.json() == []
|
||||
|
||||
|
||||
def test_list_and_get_host_after_enroll(client: TestClient) -> None:
    """An enrolled host appears in the listing and can be fetched by uuid."""
    enrollment = {"name": "worker-b", "address": "10.0.0.7", "agent_port": 8765}
    host_uuid = client.post("/swarm/enroll", json=enrollment).json()["host_uuid"]

    hosts = client.get("/swarm/hosts").json()
    assert len(hosts) == 1
    assert hosts[0]["name"] == "worker-b"

    detail = client.get(f"/swarm/hosts/{host_uuid}").json()
    assert detail["uuid"] == host_uuid
    assert detail["status"] == "enrolled"
|
||||
|
||||
|
||||
def test_decommission_removes_host_and_bundle(
    client: TestClient, ca_dir: pathlib.Path
) -> None:
    """DELETE on a host returns 204, then the host 404s and its bundle dir is gone."""
    enrollment = {"name": "worker-c", "address": "10.0.0.8", "agent_port": 8765}
    host_uuid = client.post("/swarm/enroll", json=enrollment).json()["host_uuid"]
    host_url = f"/swarm/hosts/{host_uuid}"

    # Enrollment should have written the worker's bundle under the CA dir.
    bundle_dir = ca_dir / "workers" / "worker-c"
    assert bundle_dir.is_dir()

    deletion = client.delete(host_url)
    assert deletion.status_code == 204
    assert client.get(host_url).status_code == 404
    assert not bundle_dir.exists()
|
||||
|
||||
|
||||
def test_decommission_dispatches_self_destruct_to_agent(
    client: TestClient, ca_dir: pathlib.Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Decommissioning a host calls ``self_destruct`` on that host's agent.

    Without this the agent would keep running after the dashboard has
    forgotten about it.
    """
    calls: list[str] = []

    class _SelfDestructAgent:
        """AgentClient stand-in that records which host was told to self-destruct."""

        def __init__(self, host=None, **_):
            self._host = host or {}

        async def __aenter__(self):
            return self

        async def __aexit__(self, *exc):
            return None

        async def self_destruct(self):
            calls.append(self._host.get("name") or "?")
            return {"status": "self_destruct_scheduled"}

    from decnet.web.router.swarm import api_decommission_host as decom_mod

    monkeypatch.setattr(decom_mod, "AgentClient", _SelfDestructAgent)

    enrollment = {"name": "worker-nuke", "address": "10.0.0.8", "agent_port": 8765}
    host = client.post("/swarm/enroll", json=enrollment).json()
    deletion = client.delete(f"/swarm/hosts/{host['host_uuid']}")
    assert deletion.status_code == 204
    assert calls == ["worker-nuke"]
|
||||
|
||||
|
||||
def test_decommission_proceeds_when_agent_unreachable(
    client: TestClient, ca_dir: pathlib.Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Decommission still succeeds when the agent's ``self_destruct`` call raises.

    A dead worker must not block the operator from removing the host.
    """

    class _DeadAgent:
        """AgentClient stand-in whose self_destruct always fails."""

        def __init__(self, host=None, **_):
            pass

        async def __aenter__(self):
            return self

        async def __aexit__(self, *exc):
            return None

        async def self_destruct(self):
            raise RuntimeError("connection refused")

    from decnet.web.router.swarm import api_decommission_host as decom_mod

    monkeypatch.setattr(decom_mod, "AgentClient", _DeadAgent)

    enrollment = {"name": "worker-dead", "address": "10.0.0.8", "agent_port": 8765}
    host = client.post("/swarm/enroll", json=enrollment).json()

    deletion = client.delete(f"/swarm/hosts/{host['host_uuid']}")
    assert deletion.status_code == 204
    lookup = client.get(f"/swarm/hosts/{host['host_uuid']}")
    assert lookup.status_code == 404
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- /deploy
|
||||
|
||||
|
||||
class _StubAgentClient:
|
||||
"""Minimal async-context-manager stub mirroring ``AgentClient``."""
|
||||
|
||||
deployed: list[dict[str, Any]] = []
|
||||
torn_down: list[dict[str, Any]] = []
|
||||
|
||||
def __init__(self, host: dict[str, Any] | None = None, **_: Any) -> None:
|
||||
self._host = host or {}
|
||||
|
||||
async def __aenter__(self) -> "_StubAgentClient":
|
||||
return self
|
||||
|
||||
async def __aexit__(self, *exc: Any) -> None:
|
||||
return None
|
||||
|
||||
async def health(self) -> dict[str, Any]:
|
||||
return {"status": "ok"}
|
||||
|
||||
async def deploy(self, config: Any, **kw: Any) -> dict[str, Any]:
|
||||
_StubAgentClient.deployed.append(
|
||||
{"host": self._host.get("name"), "deckies": [d.name for d in config.deckies]}
|
||||
)
|
||||
return {"status": "deployed", "deckies": len(config.deckies)}
|
||||
|
||||
async def teardown(self, decky_id: str | None = None) -> dict[str, Any]:
|
||||
_StubAgentClient.torn_down.append(
|
||||
{"host": self._host.get("name"), "decky_id": decky_id}
|
||||
)
|
||||
return {"status": "torn_down"}
|
||||
|
||||
|
||||
@pytest.fixture
def stub_agent(monkeypatch: pytest.MonkeyPatch):
    """Swap ``AgentClient`` in the deploy/teardown/check routers for the stub.

    The stub's shared call logs are emptied first.  Returns the stub class so
    tests can inspect what was recorded.
    """
    for call_log in (_StubAgentClient.deployed, _StubAgentClient.torn_down):
        call_log.clear()

    from decnet.web.router.swarm import api_deploy_swarm as deploy_mod
    from decnet.web.router.swarm import api_teardown_swarm as teardown_mod
    from decnet.web.router.swarm import api_check_hosts as check_mod

    for router_module in (deploy_mod, teardown_mod, check_mod):
        monkeypatch.setattr(router_module, "AgentClient", _StubAgentClient)
    return _StubAgentClient
|
||||
|
||||
|
||||
def _decky_dict(name: str, host_uuid: str, ip: str) -> dict[str, Any]:
|
||||
return {
|
||||
"name": name,
|
||||
"ip": ip,
|
||||
"services": ["ssh"],
|
||||
"distro": "debian",
|
||||
"base_image": "debian:bookworm-slim",
|
||||
"hostname": name,
|
||||
"host_uuid": host_uuid,
|
||||
}
|
||||
|
||||
|
||||
def test_deploy_shards_across_hosts(client: TestClient, stub_agent) -> None:
    """A swarm deploy groups deckies by host_uuid and dispatches one shard per host."""
    h1, h2 = (
        client.post(
            "/swarm/enroll",
            json={"name": host_name, "address": address, "agent_port": 8765},
        ).json()
        for host_name, address in (("w1", "10.0.0.1"), ("w2", "10.0.0.2"))
    )

    # Two deckies on w1 and one on w2.
    deckies = [
        _decky_dict("decky-01", h1["host_uuid"], "192.168.1.10"),
        _decky_dict("decky-02", h1["host_uuid"], "192.168.1.11"),
        _decky_dict("decky-03", h2["host_uuid"], "192.168.1.12"),
    ]
    cfg = {
        "mode": "swarm",
        "interface": "eth0",
        "subnet": "192.168.1.0/24",
        "gateway": "192.168.1.1",
        "deckies": deckies,
    }
    resp = client.post("/swarm/deploy", json={"config": cfg})
    assert resp.status_code == 200, resp.text

    results = resp.json()["results"]
    assert len(results) == 2
    assert all(entry["ok"] for entry in results)

    shards_by_host = {call["host"]: call["deckies"] for call in stub_agent.deployed}
    assert shards_by_host["w1"] == ["decky-01", "decky-02"]
    assert shards_by_host["w2"] == ["decky-03"]
|
||||
|
||||
|
||||
def test_deploy_rejects_missing_host_uuid(client: TestClient, stub_agent) -> None:
    """A swarm deploy with a decky lacking ``host_uuid`` gets a 400 naming the field."""
    # host_uuid deliberately omitted
    unassigned_decky = {
        "name": "decky-01",
        "ip": "192.168.1.10",
        "services": ["ssh"],
        "distro": "debian",
        "base_image": "debian:bookworm-slim",
        "hostname": "decky-01",
    }
    cfg = {
        "mode": "swarm",
        "interface": "eth0",
        "subnet": "192.168.1.0/24",
        "gateway": "192.168.1.1",
        "deckies": [unassigned_decky],
    }
    resp = client.post("/swarm/deploy", json={"config": cfg})
    assert resp.status_code == 400
    detail = resp.json()["detail"]
    assert "host_uuid" in detail
|
||||
|
||||
|
||||
def test_deploy_partial_failure_only_marks_actually_failed_decky(
    client: TestClient, repo, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Only the decky that is really down is marked failed after a deploy error.

    docker compose up tolerates partial success: one failed service does not
    roll back the ones already up.  The stub agent raises from ``deploy`` but
    its ``status`` reports two of three deckies running, so the master is
    expected to record those two as running and only the third as failed.
    """

    class _PartialFailAgent:
        """AgentClient stand-in: deploy raises, status shows decky3 down."""

        def __init__(self, host=None, **_):
            self._host = host or {}

        async def __aenter__(self):
            return self

        async def __aexit__(self, *exc):
            return None

        async def deploy(self, config, **kw):
            raise RuntimeError("Server error '500 Internal Server Error'")

        async def status(self):
            runtime = {
                "decky1": {"running": True, "services": {"ssh": "running"}},
                "decky2": {"running": True, "services": {"ssh": "running"}},
                "decky3": {"running": False, "services": {"ssh": "absent"}},
            }
            return {"deployed": True, "runtime": runtime}

    from decnet.web.router.swarm import api_deploy_swarm as deploy_mod

    monkeypatch.setattr(deploy_mod, "AgentClient", _PartialFailAgent)

    enrollment = {"name": "decktest", "address": "192.168.1.47", "agent_port": 8765}
    h1 = client.post("/swarm/enroll", json=enrollment).json()

    deckies = [
        _decky_dict("decky1", h1["host_uuid"], "192.168.1.2"),
        _decky_dict("decky2", h1["host_uuid"], "192.168.1.3"),
        _decky_dict("decky3", h1["host_uuid"], "192.168.1.4"),
    ]
    cfg = {
        "mode": "swarm",
        "interface": "eth0",
        "subnet": "192.168.1.0/24",
        "gateway": "192.168.1.1",
        "deckies": deckies,
    }
    resp = client.post("/swarm/deploy", json={"config": cfg})
    assert resp.status_code == 200
    assert resp.json()["results"][0]["ok"] is False

    # Index the per-decky shard rows by decky name.
    shards = {row["decky_name"]: row for row in client.get("/swarm/deckies").json()}
    assert shards["decky1"]["state"] == "running"
    assert shards["decky1"]["last_error"] is None
    assert shards["decky2"]["state"] == "running"
    assert shards["decky3"]["state"] == "failed"
    assert "500" in (shards["decky3"]["last_error"] or "")
|
||||
|
||||
|
||||
def test_deploy_rejects_non_swarm_mode(client: TestClient, stub_agent) -> None:
    """Posting a config whose ``mode`` is ``unihost`` to /swarm/deploy yields 400."""
    deckies = [_decky_dict("decky-01", "fake-uuid", "192.168.1.10")]
    unihost_cfg = dict(
        mode="unihost",
        interface="eth0",
        subnet="192.168.1.0/24",
        gateway="192.168.1.1",
        deckies=deckies,
    )

    response = client.post("/swarm/deploy", json={"config": unihost_cfg})

    assert response.status_code == 400
|
||||
|
||||
|
||||
def test_teardown_all_hosts(client: TestClient, stub_agent) -> None:
    """An empty teardown request produces one result per enrolled host.

    Two hosts are enrolled first; the stub agent must then have recorded a
    teardown for both of them.
    """
    enrolled = {"td1": "10.0.0.1", "td2": "10.0.0.2"}
    for host_name, address in enrolled.items():
        client.post(
            "/swarm/enroll",
            json={"name": host_name, "address": address, "agent_port": 8765},
        )

    response = client.post("/swarm/teardown", json={})

    assert response.status_code == 200
    assert len(response.json()["results"]) == 2
    torn_down_hosts = {entry["host"] for entry in stub_agent.torn_down}
    assert torn_down_hosts == {"td1", "td2"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- /check
|
||||
|
||||
|
||||
def test_check_marks_hosts_active(client: TestClient, stub_agent) -> None:
    """After /swarm/check, a reachable host is ``active`` with a heartbeat set."""
    enroll_body = {"name": "probe-w", "address": "10.0.0.9", "agent_port": 8765}
    host = client.post("/swarm/enroll", json=enroll_body).json()

    check = client.post("/swarm/check")
    assert check.status_code == 200
    probe_results = check.json()["results"]
    assert len(probe_results) == 1
    assert probe_results[0]["reachable"] is True

    # Read the host back to confirm the probe outcome was persisted.
    host_uuid = host["host_uuid"]
    stored = client.get(f"/swarm/hosts/{host_uuid}").json()
    assert stored["status"] == "active"
    assert stored["last_heartbeat"] is not None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- /deckies
|
||||
|
||||
|
||||
def test_list_deckies_empty(client: TestClient) -> None:
    """With nothing seeded, GET /swarm/deckies answers 200 with an empty list."""
    response = client.get("/swarm/deckies")

    assert response.status_code == 200
    listing = response.json()
    assert listing == []
|
||||
|
||||
|
||||
def test_list_deckies_joins_host_identity(client: TestClient, repo) -> None:
    """GET /swarm/deckies returns shard rows enriched with host name and address.

    Two hosts are enrolled and one shard is seeded on each through the
    repository. The listing is then checked unfiltered, filtered by
    ``host_uuid``, and filtered by ``state``.
    """
    import asyncio

    h1 = client.post(
        "/swarm/enroll",
        json={"name": "deck-host-1", "address": "10.0.0.11", "agent_port": 8765},
    ).json()
    h2 = client.post(
        "/swarm/enroll",
        json={"name": "deck-host-2", "address": "10.0.0.12", "agent_port": 8765},
    ).json()

    seeded_shards = [
        {
            "decky_name": "decky-01",
            "host_uuid": h1["host_uuid"],
            "services": ["ssh"],
            "state": "running",
        },
        {
            "decky_name": "decky-02",
            "host_uuid": h2["host_uuid"],
            "services": ["smb", "ssh"],
            "state": "failed",
            "last_error": "boom",
        },
    ]

    async def _seed() -> None:
        # Upsert one after the other, in list order.
        for shard in seeded_shards:
            await repo.upsert_decky_shard(shard)

    asyncio.run(_seed())

    rows = client.get("/swarm/deckies").json()
    assert len(rows) == 2
    by_name = {row["decky_name"]: row for row in rows}
    first, second = by_name["decky-01"], by_name["decky-02"]
    assert first["host_name"] == "deck-host-1"
    assert first["host_address"] == "10.0.0.11"
    assert first["state"] == "running"
    assert second["services"] == ["smb", "ssh"]
    assert second["last_error"] == "boom"

    # host_uuid filter
    first_host_uuid = h1["host_uuid"]
    only = client.get(f"/swarm/deckies?host_uuid={first_host_uuid}").json()
    assert [row["decky_name"] for row in only] == ["decky-01"]

    # state filter
    failed = client.get("/swarm/deckies?state=failed").json()
    assert [row["decky_name"] for row in failed] == ["decky-02"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- /health (root)
|
||||
|
||||
|
||||
def test_root_health(client: TestClient) -> None:
    """GET /health answers 200 and reports the ``swarm-controller`` role."""
    response = client.get("/health")

    assert response.status_code == 200
    reported_role = response.json()["role"]
    assert reported_role == "swarm-controller"
|
||||
75
tests/swarm/test_tar_tree.py
Normal file
75
tests/swarm/test_tar_tree.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""tar_working_tree: exclude filter, tarball validity, git SHA detection."""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import pathlib
|
||||
import tarfile
|
||||
|
||||
from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
|
||||
|
||||
|
||||
def _tree_names(data: bytes) -> set[str]:
    """Return the set of member names in the gzip-compressed tarball *data*."""
    archive = tarfile.open(fileobj=io.BytesIO(data), mode="r:gz")
    with archive:
        return set(archive.getnames())
|
||||
|
||||
|
||||
def test_tar_excludes_default_patterns(tmp_path: pathlib.Path) -> None:
    """With no extra arguments the tarball keeps source files and drops junk.

    The fixture tree mixes one real source file with a virtualenv, git
    metadata, bytecode cache, a wiki checkout, a database file and a log.
    Only the source file may show up in the archive.
    """
    fixture_files = {
        "decnet/keep.py": "x = 1",
        ".venv/pyvenv.cfg": "junk",
        ".git/HEAD": "ref: refs/heads/main\n",
        "decnet/__pycache__/keep.cpython-311.pyc": "bytecode",
        "wiki-checkout/Home.md": "# wiki",
        "run.db": "sqlite",
        "master.log": "log",
    }
    for rel_path, content in fixture_files.items():
        target = tmp_path / rel_path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(content)

    names = _tree_names(tar_working_tree(tmp_path))

    assert "decnet/keep.py" in names
    # Directory-style excludes: no member path may contain the fragment at all.
    for fragment in (".venv", ".git", "__pycache__", "wiki-checkout"):
        assert all(fragment not in member for member in names)
    # File-style excludes at the top of the tree.
    assert "run.db" not in names
    assert "master.log" not in names
|
||||
|
||||
|
||||
def test_tar_accepts_extra_excludes(tmp_path: pathlib.Path) -> None:
    """A name passed via ``extra_excludes`` is dropped; other files are kept."""
    kept = tmp_path / "a.py"
    dropped = tmp_path / "secret.env"
    kept.write_text("x")
    dropped.write_text("TOKEN=abc")

    archive_bytes = tar_working_tree(tmp_path, extra_excludes=["secret.env"])
    names = _tree_names(archive_bytes)

    assert "a.py" in names
    assert "secret.env" not in names
|
||||
|
||||
|
||||
def test_tar_skips_symlinks(tmp_path: pathlib.Path) -> None:
    """Symlinks in the tree are left out of the tarball; regular files stay in.

    When the platform cannot create symlinks the test is reported as
    *skipped* instead of silently passing without having verified anything.
    """
    # Local import: this test module does not import pytest at top level.
    import pytest

    (tmp_path / "real.txt").write_text("hi")
    try:
        (tmp_path / "link.txt").symlink_to(tmp_path / "real.txt")
    except (OSError, NotImplementedError) as exc:
        # A bare ``return`` here would show up as PASSED and hide the gap in
        # coverage; an explicit skip keeps the report honest.
        pytest.skip(f"platform doesn't support symlinks: {exc}")

    names = _tree_names(tar_working_tree(tmp_path))
    assert "real.txt" in names
    assert "link.txt" not in names
|
||||
|
||||
|
||||
def test_detect_git_sha_from_ref(tmp_path: pathlib.Path) -> None:
    """A symbolic ``HEAD`` is resolved through the ref file it points at."""
    git_dir = tmp_path / ".git"
    heads_dir = git_dir / "refs" / "heads"
    heads_dir.mkdir(parents=True)
    (heads_dir / "main").write_text("deadbeef" * 5 + "\n")
    (git_dir / "HEAD").write_text("ref: refs/heads/main\n")

    sha = detect_git_sha(tmp_path)

    assert sha.startswith("deadbeef")
|
||||
|
||||
|
||||
def test_detect_git_sha_detached(tmp_path: pathlib.Path) -> None:
    """A ``HEAD`` file holding a raw hash (detached checkout) is read directly."""
    git_dir = tmp_path / ".git"
    git_dir.mkdir()
    (git_dir / "HEAD").write_text("f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0\n")

    sha = detect_git_sha(tmp_path)

    assert sha.startswith("f0f0")
|
||||
|
||||
|
||||
def test_detect_git_sha_none_when_not_repo(tmp_path: pathlib.Path) -> None:
    """A directory without ``.git`` yields the empty string."""
    sha = detect_git_sha(tmp_path)
    assert sha == ""
|
||||
77
tests/swarm/test_uvicorn_tls_scope.py
Normal file
77
tests/swarm/test_uvicorn_tls_scope.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""Regression tests for the uvicorn TLS scope monkey-patch."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
class _FakeSSLObject:
    """Minimal SSL-object double that hands back a fixed DER blob."""

    def __init__(self, der: bytes) -> None:
        # Bytes returned verbatim by getpeercert().
        self._der = der

    def getpeercert(self, binary_form: bool = False) -> bytes:
        """Return the stored DER bytes; only ``binary_form=True`` is accepted."""
        # Any caller asking for the non-binary form trips this assertion.
        assert binary_form is True
        return self._der
|
||||
|
||||
|
||||
class _FakeTransport:
    """Transport double exposing a single extra-info key, ``ssl_object``."""

    def __init__(self, ssl_obj: Any = None) -> None:
        # Object handed out for the "ssl_object" key; None when not given.
        self._ssl = ssl_obj

    def get_extra_info(self, key: str) -> Any:
        """Return the stored SSL object for ``"ssl_object"``, else ``None``."""
        return self._ssl if key == "ssl_object" else None
|
||||
|
||||
|
||||
def _make_cycle_cls():
    """Build and return a fresh ``Cycle`` class for one test.

    Each call defines a new class object, so wrapping the ``__init__`` of one
    test's class cannot leak into another test's class.
    """

    class Cycle:
        """Bare stand-in that just records its ``scope`` and ``transport``."""

        def __init__(self, scope: dict, transport: Any = None) -> None:
            self.scope, self.transport = scope, transport

    return Cycle
|
||||
|
||||
|
||||
def test_wrap_cycle_injects_cert_into_scope() -> None:
    """A peer cert exposed by the transport's SSL object lands in the scope.

    After a wrapped cycle is constructed, the DER bytes must appear as the
    sole entry of ``scope["extensions"]["tls"]["client_cert_chain"]``.
    """
    from decnet.web._uvicorn_tls_scope import _wrap_cycle_init

    peer_der = b"\x30\x82der"
    cycle_cls = _make_cycle_cls()
    _wrap_cycle_init(cycle_cls)

    scope: dict = {"type": "http"}
    fake_transport = _FakeTransport(_FakeSSLObject(peer_der))
    cycle_cls(scope, transport=fake_transport)

    tls_extension = scope["extensions"]["tls"]
    assert tls_extension["client_cert_chain"] == [peer_der]
|
||||
|
||||
|
||||
def test_wrap_cycle_noop_when_no_ssl() -> None:
    """Without an SSL object on the transport, no ``tls`` extension is added."""
    from decnet.web._uvicorn_tls_scope import _wrap_cycle_init

    cycle_cls = _make_cycle_cls()
    _wrap_cycle_init(cycle_cls)

    scope: dict = {"type": "http"}
    plain_transport = _FakeTransport(ssl_obj=None)
    cycle_cls(scope, transport=plain_transport)

    # An absent "extensions" key falls back to {} and passes trivially.
    assert "tls" not in scope.get("extensions", {})
|
||||
|
||||
|
||||
def test_wrap_cycle_noop_when_empty_der() -> None:
    """An SSL object returning empty DER bytes adds no ``tls`` extension."""
    from decnet.web._uvicorn_tls_scope import _wrap_cycle_init

    cycle_cls = _make_cycle_cls()
    _wrap_cycle_init(cycle_cls)

    scope: dict = {"type": "http"}
    certless_transport = _FakeTransport(_FakeSSLObject(b""))
    cycle_cls(scope, transport=certless_transport)

    # An absent "extensions" key falls back to {} and passes trivially.
    assert "tls" not in scope.get("extensions", {})
|
||||
|
||||
|
||||
def test_install_is_idempotent() -> None:
    """Calling ``install()`` twice in a row must not raise.

    The test only checks that both calls complete; it does not inspect the
    patched state afterwards.
    """
    from decnet.web import _uvicorn_tls_scope as tls_scope

    # The second pass is the interesting one: it must not double-wrap.
    for _ in range(2):
        tls_scope.install()
|
||||
Reference in New Issue
Block a user