# Hosts that keep the controller on the master box itself. A routable bind
# (anything else, incl. 0.0.0.0) exposes the control plane to the network and
# MUST run mTLS — the app-layer operator gate trusts the transport to have
# verified a CA-signed client cert. See decnet/web/router/swarm/_mtls.py.
_LOOPBACK_HOSTS = frozenset({"127.0.0.1", "::1", "localhost"})


def _is_loopback_host(host: str) -> bool:
    """Return True when *host* names a loopback-only bind address.

    Accepts the well-known spellings in ``_LOOPBACK_HOSTS`` (hostnames are
    compared case-insensitively) and any literal IP address that the
    ``ipaddress`` module classifies as loopback, e.g. ``127.0.0.2`` or
    ``0:0:0:0:0:0:0:1``. Anything that is neither — including arbitrary
    hostnames, the empty string and non-string values — is treated as
    non-loopback so the caller fails closed.
    """
    # Function-scope import keeps this block self-contained; the module's
    # top-level import list is not touched.
    import ipaddress

    if not isinstance(host, str):
        return False
    if host.casefold() in _LOOPBACK_HOSTS:
        return True
    try:
        return ipaddress.ip_address(host).is_loopback
    except ValueError:
        # Not an IP literal (some other hostname, or malformed input): we do
        # not resolve names here, so assume the bind may be reachable.
        return False


def _guard_bind(host: str, tls: bool) -> None:
    """Fail closed: refuse to bind a non-loopback interface without --tls.

    On loopback the controller may run plaintext (single-operator master box,
    same boundary as docker.sock); off-box it would be an UNAUTHENTICATED
    control plane, so we hard-refuse to start.

    Args:
        host: Bind address or hostname the controller was asked to listen on.
        tls: Whether mutual TLS was requested for this run.

    Raises:
        typer.Exit: With exit code 2 when ``host`` is not loopback and ``tls``
            is false. Three explanatory lines are printed to the console first.
    """
    if tls or _is_loopback_host(host):
        return

    console.print(
        f"[red]Refusing to bind the swarm controller to {host} without --tls.[/]"
    )
    console.print(
        "[red]A routable bind without mTLS exposes an UNAUTHENTICATED control "
        "plane (enroll / deploy / teardown).[/]"
    )
    console.print(
        "[yellow]Re-run with --tls for mutual-TLS, or bind 127.0.0.1 for a "
        "local-only master.[/]"
    )
    raise typer.Exit(code=2)
# Certificate CNs accepted by require_operator_cert() as control-plane
# operators.
OPERATOR_CNS = frozenset({"decnet-master", "swarmctl"})

# Hosts treated as "the master box itself". A certless request is only accepted
# from these — the single-operator loopback boundary (same model as
# docker.sock). Any routable bind is forced onto mTLS by the swarmctl startup
# guard, so a certless request can never legitimately arrive from off-box.
LOOPBACK_HOSTS = frozenset({"127.0.0.1", "::1", "localhost"})


def _client_host(request: Request) -> Optional[str]:
    """Return the peer address recorded on *request*, or ``None`` if absent.

    Tolerates a request without a ``client`` attribute, or with
    ``client=None``.
    """
    return getattr(getattr(request, "client", None), "host", None)


def _client_is_loopback(request: Request) -> bool:
    """True iff the request originated from the master box's loopback."""
    return _client_host(request) in LOOPBACK_HOSTS


def require_operator_cert(request: Request) -> PeerCert:
    """FastAPI dependency authorizing a swarm control-plane operation.

    Two accepted paths, matching the deployment posture:

    * **mTLS on** (any routable bind — enforced by the swarmctl startup guard):
      a peer cert is present. Transport already proved it is CA-signed; we
      additionally require its CN to be in :data:`OPERATOR_CNS`. Worker and
      ``updater@*`` certs are rejected — a worker's still-valid cert must never
      drive enroll/deploy/teardown.
    * **Loopback plaintext** (single-host master, the shipping default): no peer
      cert, but the request came from an address in :data:`LOOPBACK_HOSTS`.
      Accepted as the local operator — the same trust boundary as
      ``docker.sock``.

    A certless request from any non-loopback (or unknown) client is refused
    (fail-closed) and logged, so an unexpected off-box probe is visible in the
    controller log just like a wrong-CN certificate is.

    Returns:
        The verified peer certificate, or a placeholder ``PeerCert`` with an
        empty fingerprint and ``cn=None`` for the certless loopback operator.

    Raises:
        HTTPException: 403 when the cert's CN is not an operator CN, or when
            no cert is present and the client is not on loopback.
    """
    peer = extract_peer_cert(request.scope)
    if peer is not None:
        # The CN gate runs first on purpose: a non-operator cert is refused
        # even when it arrives over loopback.
        if peer.cn not in OPERATOR_CNS:
            log.warning("rejected non-operator cert on control plane: cn=%r", peer.cn)
            raise HTTPException(status_code=403, detail="operator certificate required")
        return peer

    if _client_is_loopback(request):
        # Local operator on the master box; no client cert over plaintext loopback.
        return PeerCert(sha256="", cn=None)

    log.warning(
        "rejected certless request on control plane: client=%r",
        _client_host(request),
    )
    raise HTTPException(status_code=403, detail="operator certificate required")
100644 --- a/decnet/web/router/swarm/api_decommission_host.py +++ b/decnet/web/router/swarm/api_decommission_host.py @@ -18,6 +18,7 @@ from decnet.logging import get_logger from decnet.swarm.client import AgentClient from decnet.web.db.repository import BaseRepository from decnet.web.dependencies import get_repo +from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert log = get_logger("swarm.decommission") router = APIRouter() @@ -32,6 +33,7 @@ router = APIRouter() async def api_decommission_host( uuid: str, repo: BaseRepository = Depends(get_repo), + _operator: PeerCert = Depends(require_operator_cert), ) -> None: row = await repo.get_swarm_host_by_uuid(uuid) if row is None: diff --git a/decnet/web/router/swarm/api_deploy_swarm.py b/decnet/web/router/swarm/api_deploy_swarm.py index 4694f849..a5561bc6 100644 --- a/decnet/web/router/swarm/api_deploy_swarm.py +++ b/decnet/web/router/swarm/api_deploy_swarm.py @@ -21,6 +21,7 @@ from decnet.logging import get_logger from decnet.swarm.client import AgentClient from decnet.web.db.repository import BaseRepository from decnet.web.dependencies import get_repo +from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert from decnet.web.db.models import ( SwarmDeployRequest, SwarmDeployResponse, @@ -160,6 +161,7 @@ async def dispatch_decnet_config( async def api_deploy_swarm( req: SwarmDeployRequest, repo: BaseRepository = Depends(get_repo), + _operator: PeerCert = Depends(require_operator_cert), ) -> SwarmDeployResponse: if req.config.mode != "swarm": raise HTTPException(status_code=400, detail="mode must be 'swarm'") diff --git a/decnet/web/router/swarm/api_enroll_host.py b/decnet/web/router/swarm/api_enroll_host.py index 168b6151..37aa6f2e 100644 --- a/decnet/web/router/swarm/api_enroll_host.py +++ b/decnet/web/router/swarm/api_enroll_host.py @@ -6,8 +6,10 @@ generates a fresh worker keypair + CA-signed cert, and returns the full bundle to the operator. 
Bundle delivery to the worker (scp/sshpass/etc.) is outside this process's trust boundary. -Rationale: the worker agent speaks ONLY mTLS; there is no pre-auth -bootstrap endpoint, so nothing to attack before the worker is enrolled. +Authorization: this mints a CA-signed identity (and its private key), so it +is gated by :func:`require_operator_cert` — an operator-CN client cert when +the controller runs mTLS, or a local request when it is loopback-bound. +A worker's own cert cannot enroll further hosts. """ from __future__ import annotations @@ -20,6 +22,7 @@ from fastapi import APIRouter, Depends, HTTPException, status from decnet.swarm import pki from decnet.web.db.repository import BaseRepository from decnet.web.dependencies import get_repo +from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert from decnet.web.db.models import SwarmEnrolledBundle, SwarmEnrollRequest, SwarmUpdaterBundle router = APIRouter() @@ -39,6 +42,7 @@ router = APIRouter() async def api_enroll_host( req: SwarmEnrollRequest, repo: BaseRepository = Depends(get_repo), + _operator: PeerCert = Depends(require_operator_cert), ) -> SwarmEnrolledBundle: existing = await repo.get_swarm_host_by_name(req.name) if existing is not None: diff --git a/decnet/web/router/swarm/api_get_host.py b/decnet/web/router/swarm/api_get_host.py index b8d03fa2..a96686d9 100644 --- a/decnet/web/router/swarm/api_get_host.py +++ b/decnet/web/router/swarm/api_get_host.py @@ -6,6 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException from decnet.web.db.repository import BaseRepository from decnet.web.dependencies import get_repo +from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert from decnet.web.db.models import SwarmHostView router = APIRouter() @@ -20,6 +21,7 @@ router = APIRouter() async def api_get_host( uuid: str, repo: BaseRepository = Depends(get_repo), + _operator: PeerCert = Depends(require_operator_cert), ) -> SwarmHostView: row = await 
repo.get_swarm_host_by_uuid(uuid) if row is None: diff --git a/decnet/web/router/swarm/api_list_deckies.py b/decnet/web/router/swarm/api_list_deckies.py index 3d651350..e0973d33 100644 --- a/decnet/web/router/swarm/api_list_deckies.py +++ b/decnet/web/router/swarm/api_list_deckies.py @@ -14,6 +14,7 @@ from fastapi import APIRouter, Depends from decnet.web.db.repository import BaseRepository from decnet.web.dependencies import get_repo +from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert from decnet.web.db.models import DeckyShardView router = APIRouter() @@ -24,6 +25,7 @@ async def api_list_deckies( host_uuid: Optional[str] = None, state: Optional[str] = None, repo: BaseRepository = Depends(get_repo), + _operator: PeerCert = Depends(require_operator_cert), ) -> list[DeckyShardView]: shards = await repo.list_decky_shards(host_uuid) hosts = {h["uuid"]: h for h in await repo.list_swarm_hosts()} diff --git a/decnet/web/router/swarm/api_list_hosts.py b/decnet/web/router/swarm/api_list_hosts.py index 6a3cfd2e..bb5fbdd1 100644 --- a/decnet/web/router/swarm/api_list_hosts.py +++ b/decnet/web/router/swarm/api_list_hosts.py @@ -8,6 +8,7 @@ from fastapi import APIRouter, Depends from decnet.web.db.repository import BaseRepository from decnet.web.dependencies import get_repo +from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert from decnet.web.db.models import SwarmHostView router = APIRouter() @@ -17,6 +18,7 @@ router = APIRouter() async def api_list_hosts( host_status: Optional[str] = None, repo: BaseRepository = Depends(get_repo), + _operator: PeerCert = Depends(require_operator_cert), ) -> list[SwarmHostView]: rows = await repo.list_swarm_hosts(host_status) return [SwarmHostView(**r) for r in rows] diff --git a/decnet/web/router/swarm/api_teardown_swarm.py b/decnet/web/router/swarm/api_teardown_swarm.py index f71510bb..2530fc65 100644 --- a/decnet/web/router/swarm/api_teardown_swarm.py +++ 
b/decnet/web/router/swarm/api_teardown_swarm.py @@ -11,6 +11,7 @@ from decnet.logging import get_logger from decnet.swarm.client import AgentClient from decnet.web.db.repository import BaseRepository from decnet.web.dependencies import get_repo +from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert from decnet.web.db.models import ( SwarmDeployResponse, SwarmHostResult, @@ -35,6 +36,7 @@ router = APIRouter() async def api_teardown_swarm( req: SwarmTeardownRequest, repo: BaseRepository = Depends(get_repo), + _operator: PeerCert = Depends(require_operator_cert), ) -> SwarmDeployResponse: if req.host_uuid is not None: row = await repo.get_swarm_host_by_uuid(req.host_uuid) diff --git a/tests/swarm/test_heartbeat.py b/tests/swarm/test_heartbeat.py index 3e41b515..4ae38f6e 100644 --- a/tests/swarm/test_heartbeat.py +++ b/tests/swarm/test_heartbeat.py @@ -51,7 +51,9 @@ def client(repo, ca_dir: pathlib.Path): return repo app.dependency_overrides[get_repo] = _override - with TestClient(app) as c: + # loopback client so /swarm/enroll (operator-gated) accepts the certless + # local-operator path during test setup. + with TestClient(app, client=("127.0.0.1", 50000)) as c: yield c app.dependency_overrides.clear() diff --git a/tests/swarm/test_heartbeat_lifecycle.py b/tests/swarm/test_heartbeat_lifecycle.py index e44ddcae..6d9dce88 100644 --- a/tests/swarm/test_heartbeat_lifecycle.py +++ b/tests/swarm/test_heartbeat_lifecycle.py @@ -47,7 +47,8 @@ def client(repo, ca_dir: pathlib.Path): async def _override() -> Any: return repo app.dependency_overrides[get_repo] = _override - with TestClient(app) as c: + # loopback client so operator-gated /swarm/enroll accepts the local operator. 
def _request_with(scope: dict, client_host: str | None = None) -> MagicMock:
    """Build a stand-in request exposing only ``scope`` and ``client``.

    ``client`` is ``None`` when no ``client_host`` is given, otherwise a mock
    whose ``host`` attribute is ``client_host``.
    """
    peer_addr = MagicMock(host=client_host) if client_host is not None else None
    req = MagicMock()
    req.scope = scope
    req.client = peer_addr
    return req


def test_require_operator_rejects_worker_cn(ca) -> None:
    """A worker cert is refused even when presented from loopback."""
    # The CN gate fires before the loopback fallback, so a CA-signed worker
    # cert must not be able to drive the control plane from 127.0.0.1 either.
    from fastapi import HTTPException

    worker_scope = _scope_with(_der_for(ca, "worker-1"))
    local_request = _request_with(worker_scope, client_host="127.0.0.1")
    with pytest.raises(HTTPException) as caught:
        _mtls.require_operator_cert(local_request)
    assert caught.value.status_code == 403


def test_require_operator_allows_certless_loopback() -> None:
    """Plaintext loopback without a client cert is the local operator."""
    local_request = _request_with({}, client_host="127.0.0.1")
    peer = _mtls.require_operator_cert(local_request)
    assert peer.cn is None
    assert peer.sha256 == ""


def test_require_operator_rejects_certless_non_loopback() -> None:
    """No cert from an off-box address must fail closed."""
    # The startup guard makes this unreachable in practice; the HTTP layer
    # still refuses it as defense in depth.
    from fastapi import HTTPException

    offbox_request = _request_with({}, client_host="10.0.0.9")
    with pytest.raises(HTTPException) as caught:
        _mtls.require_operator_cert(offbox_request)
    assert caught.value.status_code == 403


def test_require_operator_rejects_certless_unknown_client() -> None:
    """No cert and no client address at all is refused as well."""
    from fastapi import HTTPException

    anonymous_request = _request_with({}, client_host=None)
    with pytest.raises(HTTPException) as caught:
        _mtls.require_operator_cert(anonymous_request)
    assert caught.value.status_code == 403
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Authorization tests for the swarm control plane.

Two layers are exercised, and both are expected to fail closed:
  1. ``_guard_bind`` refuses a routable bind without --tls (CLI startup).
  2. ``require_operator_cert`` gates the controller endpoints (HTTP layer).

No live TLS is involved: the off-box case is simulated by giving the
TestClient a non-loopback client address with no peer cert in scope.
"""
from __future__ import annotations

import contextlib
import pathlib
from typing import Any

import pytest
import typer
from fastapi.testclient import TestClient
from typer.testing import CliRunner

from decnet.cli.swarmctl import _guard_bind
from decnet.web.db.factory import get_repository
from decnet.web.dependencies import get_repo


# ------------------------- layer 1: bind guard ------------------------------


@pytest.mark.parametrize("host", ["127.0.0.1", "::1", "localhost"])
def test_guard_bind_allows_loopback_plaintext(host: str) -> None:
    """Loopback binds may run without TLS; the guard returns silently."""
    _guard_bind(host, tls=False)


@pytest.mark.parametrize("host", ["0.0.0.0", "10.0.0.5", "192.168.1.10"])
def test_guard_bind_allows_routable_with_tls(host: str) -> None:
    """With mTLS requested, a routable bind is legitimate."""
    _guard_bind(host, tls=True)


@pytest.mark.parametrize("host", ["0.0.0.0", "10.0.0.5"])
def test_guard_bind_refuses_routable_plaintext(host: str) -> None:
    """A routable bind without TLS exits with code 2."""
    with pytest.raises(typer.Exit) as caught:
        _guard_bind(host, tls=False)
    assert caught.value.exit_code == 2


def test_swarmctl_cli_refuses_routable_plaintext(monkeypatch: pytest.MonkeyPatch) -> None:
    """Wiring check: the guard fires before any subprocess is spawned."""
    import subprocess

    from decnet.cli import app

    spawn_attempts: list[tuple[Any, ...]] = []

    def _forbidden_popen(*args: Any, **kwargs: Any):  # pragma: no cover - must not run
        spawn_attempts.append(args)
        raise AssertionError("subprocess.Popen must not be reached")

    monkeypatch.setattr(subprocess, "Popen", _forbidden_popen)

    argv = ["swarmctl", "--host", "0.0.0.0", "--no-listener"]
    outcome = CliRunner().invoke(app, argv)

    assert outcome.exit_code == 2
    assert not spawn_attempts


# ------------------------- layer 2: endpoint operator gate ------------------


@pytest.fixture
def ca_dir(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch) -> pathlib.Path:
    """Point the PKI default CA directory at a per-test temporary path."""
    from decnet.swarm import pki
    from decnet.web.router.swarm import api_enroll_host as enroll_mod

    ca_path = tmp_path / "ca"
    monkeypatch.setattr(pki, "DEFAULT_CA_DIR", ca_path)
    monkeypatch.setattr(enroll_mod, "pki", pki)
    return ca_path


@pytest.fixture
def repo(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch):
    """Create a repository on a temp DB and install it as the module-level repo."""
    test_repo = get_repository(db_path=str(tmp_path / "authz.db"))

    import decnet.web.dependencies as deps
    import decnet.web.swarm_api as swarm_api_mod

    for module in (deps, swarm_api_mod):
        monkeypatch.setattr(module, "repo", test_repo)
    return test_repo


@contextlib.contextmanager
def _client(repo, client_addr: tuple[str, int]):
    """Yield a TestClient whose reported peer address is ``client_addr``.

    The ``with TestClient(...)`` form runs the controller lifespan, which
    creates the swarm schema against the test repo. Dependency overrides are
    cleared again on exit, whether or not the body raised.
    """
    from decnet.web.swarm_api import app

    async def _repo_override() -> Any:
        return repo

    app.dependency_overrides[get_repo] = _repo_override
    try:
        with TestClient(app, client=client_addr) as http:
            yield http
    finally:
        app.dependency_overrides.clear()


def test_offbox_certless_caller_is_refused_on_every_operator_route(
    repo, ca_dir: pathlib.Path
) -> None:
    """A certless caller from a non-loopback address gets 403 on each probe."""
    # No TLS peer cert + non-loopback client = an off-box attacker. The bind
    # guard makes this combination unreachable in production, but the HTTP
    # layer must fail closed regardless.
    probes: tuple[tuple[str, str, dict[str, Any]], ...] = (
        (
            "post",
            "/swarm/enroll",
            {"json": {"name": "evil", "address": "10.0.0.99", "agent_port": 8765}},
        ),
        ("get", "/swarm/hosts", {}),
        ("post", "/swarm/check", {}),
        ("get", "/swarm/deckies", {}),
        ("post", "/swarm/teardown", {"json": {}}),
    )
    with _client(repo, ("10.0.0.99", 40000)) as http:
        for verb, path, kwargs in probes:
            response = getattr(http, verb)(path, **kwargs)
            assert response.status_code == 403, (verb, path)


def test_loopback_operator_is_allowed(repo, ca_dir: pathlib.Path) -> None:
    """The local operator over plaintext loopback can enroll and list hosts."""
    payload = {"name": "worker-ok", "address": "10.0.0.5", "agent_port": 8765}
    with _client(repo, ("127.0.0.1", 40000)) as http:
        enrolled = http.post("/swarm/enroll", json=payload)
        assert enrolled.status_code == 201, enrolled.text

        listed = http.get("/swarm/hosts")
        assert listed.status_code == 200

        host_names = [entry["name"] for entry in listed.json()]
        assert "worker-ok" in host_names