fix(swarm): mTLS client-cert authz on the swarm control plane

The swarm controller (port 8770) exposed 9 routes with zero app-layer
auth, and swarmctl --tls defaulted off — anyone able to reach the port
could enroll workers (minting CA-signed certs + private keys), deploy,
or tear down the fleet. Two fail-closed layers:

- require_operator_cert gates every operator route (enroll/deploy/
  teardown/hosts/check/deckies). When mTLS is on, the peer cert's CN
  must be an operator identity (decnet-master/swarmctl); worker and
  updater@* certs are rejected. Plaintext loopback (single-host master)
  is accepted as the local operator — the docker.sock boundary.
- swarmctl refuses to bind a routable interface without --tls, so a
  network-exposed plaintext control plane can never start.

/heartbeat keeps its worker fingerprint pinning. Closes the two ASVS
criticals (control-plane no-auth, unauthenticated cert minting).
This commit is contained in:
2026-05-30 17:16:12 -04:00
parent e7a686206c
commit 30750d294d
16 changed files with 257 additions and 22 deletions

View File

@@ -13,6 +13,34 @@ from . import utils as _utils
from .gating import _require_master_mode
from .utils import console, log
# Hosts that keep the controller on the master box itself. A routable bind
# (anything else, incl. 0.0.0.0) exposes the control plane to the network and
# MUST run mTLS — the app-layer operator gate trusts the transport to have
# verified a CA-signed client cert. See decnet/web/router/swarm/_mtls.py.
_LOOPBACK_HOSTS = frozenset({"127.0.0.1", "::1", "localhost"})
def _guard_bind(host: str, tls: bool) -> None:
    """Abort startup when a plaintext controller would bind off-loopback.

    A bind to one of ``_LOOPBACK_HOSTS`` may run without TLS (single-operator
    master box, same boundary as docker.sock). Any other bind address without
    ``--tls`` would be an UNAUTHENTICATED control plane, so startup is refused.

    Args:
        host: The bind address requested on the command line.
        tls: Whether ``--tls`` (mutual TLS) was requested.

    Raises:
        typer.Exit: Exit code 2 when ``host`` is not a loopback host and
            ``tls`` is false.
    """
    # Fail closed: only "TLS on" or "loopback bind" let startup continue.
    if tls or host in _LOOPBACK_HOSTS:
        return

    refusal_lines = (
        f"[red]Refusing to bind the swarm controller to {host} without --tls.[/]",
        "[red]A routable bind without mTLS exposes an UNAUTHENTICATED control "
        "plane (enroll / deploy / teardown).[/]",
        "[yellow]Re-run with --tls for mutual-TLS, or bind 127.0.0.1 for a "
        "local-only master.[/]",
    )
    for line in refusal_lines:
        console.print(line)
    raise typer.Exit(code=2)
def register(app: typer.Typer) -> None:
@app.command()
@@ -50,6 +78,7 @@ def register(app: typer.Typer) -> None:
if you need a publicly-trusted or externally-managed cert.
"""
_require_master_mode("swarmctl")
_guard_bind(host, tls)
if daemon:
log.info("swarmctl daemonizing host=%s port=%d", host, port)
_utils._daemonize()

View File

@@ -42,6 +42,12 @@ log = get_logger("swarm.mtls")
# or tear the fleet down.
OPERATOR_CNS = frozenset({"decnet-master", "swarmctl"})
# Hosts treated as "the master box itself". A certless request is only accepted
# from these — the single-operator loopback boundary (same model as
# docker.sock). Any routable bind is forced onto mTLS by the swarmctl startup
# guard, so a certless request can never legitimately arrive from off-box.
LOOPBACK_HOSTS = frozenset({"127.0.0.1", "::1", "localhost"})
@dataclass(frozen=True)
class PeerCert:
@@ -131,17 +137,37 @@ def extract_peer_fingerprint(scope: MutableMapping[str, Any]) -> Optional[str]:
return hashlib.sha256(der).hexdigest().lower()
def _client_is_loopback(request: Request) -> bool:
    """Return True iff the request's client host is in ``LOOPBACK_HOSTS``.

    A request with no ``client`` attribute, or a client with no ``host``,
    is treated as non-loopback.
    """
    client = getattr(request, "client", None)
    host = getattr(client, "host", None) if client is not None else None
    return host in LOOPBACK_HOSTS


def require_operator_cert(request: Request) -> PeerCert:
    """FastAPI dependency authorizing a swarm control-plane operation.

    Two accepted paths, matching the deployment posture:

    * **mTLS on** (any routable bind — enforced by the swarmctl startup guard):
      a peer cert is present. Transport already proved it is CA-signed; we
      additionally require its CN to be in :data:`OPERATOR_CNS`. Worker and
      ``updater@*`` certs are rejected — a worker's still-valid cert must never
      drive enroll/deploy/teardown.
    * **Loopback plaintext** (single-host master, the shipping default): no peer
      cert, but the request came from ``127.0.0.1``/``::1``. Accepted as the
      local operator — the same trust boundary as ``docker.sock``.

    A certless request from any non-loopback client is refused (fail-closed);
    in practice the startup guard prevents that combination from arising.

    Returns:
        The peer's :class:`PeerCert` on the mTLS path, or a placeholder
        ``PeerCert(sha256="", cn=None)`` for the certless loopback operator.

    Raises:
        HTTPException: 403 when a presented cert's CN is not an operator CN,
            or when no cert is presented by a non-loopback client.
    """
    peer = extract_peer_cert(request.scope)
    if peer is not None:
        # A presented cert is always judged on its CN — the loopback fallback
        # below must never rescue a worker/updater cert.
        if peer.cn not in OPERATOR_CNS:
            log.warning("rejected non-operator cert on control plane: cn=%r", peer.cn)
            raise HTTPException(status_code=403, detail="operator certificate required")
        return peer
    if _client_is_loopback(request):
        # Local operator on the master box; no client cert over plaintext loopback.
        return PeerCert(sha256="", cn=None)
    raise HTTPException(status_code=403, detail="operator certificate required")

View File

@@ -16,6 +16,7 @@ from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import SwarmCheckResponse, SwarmHostHealth
log = get_logger("swarm.check")
@@ -26,6 +27,7 @@ router = APIRouter()
@router.post("/check", response_model=SwarmCheckResponse, tags=["Swarm Health"])
async def api_check_hosts(
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> SwarmCheckResponse:
hosts = await repo.list_swarm_hosts()

View File

@@ -18,6 +18,7 @@ from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
log = get_logger("swarm.decommission")
router = APIRouter()
@@ -32,6 +33,7 @@ router = APIRouter()
async def api_decommission_host(
uuid: str,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> None:
row = await repo.get_swarm_host_by_uuid(uuid)
if row is None:

View File

@@ -21,6 +21,7 @@ from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import (
SwarmDeployRequest,
SwarmDeployResponse,
@@ -160,6 +161,7 @@ async def dispatch_decnet_config(
async def api_deploy_swarm(
req: SwarmDeployRequest,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> SwarmDeployResponse:
if req.config.mode != "swarm":
raise HTTPException(status_code=400, detail="mode must be 'swarm'")

View File

@@ -6,8 +6,10 @@ generates a fresh worker keypair + CA-signed cert, and returns the full
bundle to the operator. Bundle delivery to the worker (scp/sshpass/etc.)
is outside this process's trust boundary.
Rationale: the worker agent speaks ONLY mTLS; there is no pre-auth
bootstrap endpoint, so nothing to attack before the worker is enrolled.
Authorization: this mints a CA-signed identity (and its private key), so it
is gated by :func:`require_operator_cert` — an operator-CN client cert when
the controller runs mTLS, or a local request when it is loopback-bound.
A worker's own cert cannot enroll further hosts.
"""
from __future__ import annotations
@@ -20,6 +22,7 @@ from fastapi import APIRouter, Depends, HTTPException, status
from decnet.swarm import pki
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import SwarmEnrolledBundle, SwarmEnrollRequest, SwarmUpdaterBundle
router = APIRouter()
@@ -39,6 +42,7 @@ router = APIRouter()
async def api_enroll_host(
req: SwarmEnrollRequest,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> SwarmEnrolledBundle:
existing = await repo.get_swarm_host_by_name(req.name)
if existing is not None:

View File

@@ -6,6 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import SwarmHostView
router = APIRouter()
@@ -20,6 +21,7 @@ router = APIRouter()
async def api_get_host(
uuid: str,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> SwarmHostView:
row = await repo.get_swarm_host_by_uuid(uuid)
if row is None:

View File

@@ -14,6 +14,7 @@ from fastapi import APIRouter, Depends
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import DeckyShardView
router = APIRouter()
@@ -24,6 +25,7 @@ async def api_list_deckies(
host_uuid: Optional[str] = None,
state: Optional[str] = None,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> list[DeckyShardView]:
shards = await repo.list_decky_shards(host_uuid)
hosts = {h["uuid"]: h for h in await repo.list_swarm_hosts()}

View File

@@ -8,6 +8,7 @@ from fastapi import APIRouter, Depends
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import SwarmHostView
router = APIRouter()
@@ -17,6 +18,7 @@ router = APIRouter()
async def api_list_hosts(
host_status: Optional[str] = None,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> list[SwarmHostView]:
rows = await repo.list_swarm_hosts(host_status)
return [SwarmHostView(**r) for r in rows]

View File

@@ -11,6 +11,7 @@ from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import (
SwarmDeployResponse,
SwarmHostResult,
@@ -35,6 +36,7 @@ router = APIRouter()
async def api_teardown_swarm(
req: SwarmTeardownRequest,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> SwarmDeployResponse:
if req.host_uuid is not None:
row = await repo.get_swarm_host_by_uuid(req.host_uuid)

View File

@@ -51,7 +51,9 @@ def client(repo, ca_dir: pathlib.Path):
return repo
app.dependency_overrides[get_repo] = _override
with TestClient(app) as c:
# loopback client so /swarm/enroll (operator-gated) accepts the certless
# local-operator path during test setup.
with TestClient(app, client=("127.0.0.1", 50000)) as c:
yield c
app.dependency_overrides.clear()

View File

@@ -47,7 +47,8 @@ def client(repo, ca_dir: pathlib.Path):
async def _override() -> Any:
return repo
app.dependency_overrides[get_repo] = _override
with TestClient(app) as c:
# loopback client so operator-gated /swarm/enroll accepts the local operator.
with TestClient(app, client=("127.0.0.1", 50000)) as c:
yield c
app.dependency_overrides.clear()

View File

@@ -57,7 +57,8 @@ def client(repo, ca_dir):
return repo
app.dependency_overrides[get_repo] = _override
with TestClient(app) as c:
# loopback client so operator-gated /swarm/enroll accepts the local operator.
with TestClient(app, client=("127.0.0.1", 50000)) as c:
yield c
app.dependency_overrides.clear()

View File

@@ -79,9 +79,10 @@ def test_extract_fingerprint_works_on_non_cert_der() -> None:
# ------------------------- require_operator_cert ---------------------------
def _request_with(scope: dict) -> MagicMock:
def _request_with(scope: dict, client_host: str | None = None) -> MagicMock:
req = MagicMock()
req.scope = scope
req.client = None if client_host is None else MagicMock(host=client_host)
return req
@@ -96,10 +97,14 @@ def test_require_operator_accepts_swarmctl(ca) -> None:
def test_require_operator_rejects_worker_cn(ca) -> None:
    # A worker cert is CA-signed but must not drive the control plane, even
    # from loopback — the CN gate fires before the loopback fallback.
    from fastapi import HTTPException

    # Exactly one call inside the raises-block: an earlier raising call would
    # hide whether the loopback variant is rejected too.
    with pytest.raises(HTTPException) as ei:
        _mtls.require_operator_cert(
            _request_with(_scope_with(_der_for(ca, "worker-1")), client_host="127.0.0.1")
        )
    assert ei.value.status_code == 403
@@ -111,10 +116,25 @@ def test_require_operator_rejects_updater_cn(ca) -> None:
assert ei.value.status_code == 403
def test_require_operator_allows_certless_loopback() -> None:
    # Shipping default: plaintext loopback, no client cert → local operator.
    peer = _mtls.require_operator_cert(_request_with({}, client_host="127.0.0.1"))
    assert peer.cn is None and peer.sha256 == ""


def test_require_operator_rejects_certless_non_loopback() -> None:
    # No cert from off-box → fail closed (the startup guard makes this
    # unreachable in practice, but defense in depth).
    from fastapi import HTTPException

    # Only the off-box request runs here, so the 403 is attributable to the
    # non-loopback client address and not to an unknown client.
    with pytest.raises(HTTPException) as ei:
        _mtls.require_operator_cert(_request_with({}, client_host="10.0.0.9"))
    assert ei.value.status_code == 403
def test_require_operator_rejects_certless_unknown_client() -> None:
    # No cert and no client address at all: the gate cannot prove loopback,
    # so it must fail closed exactly like an off-box caller.
    from fastapi import HTTPException

    with pytest.raises(HTTPException) as ei:
        _mtls.require_operator_cert(_request_with({}, client_host=None))
    assert ei.value.status_code == 403
    # The gate uses a single detail string for every certless refusal; the
    # older "peer cert unavailable" message no longer exists on this path.
    assert ei.value.detail == "operator certificate required"

View File

@@ -56,7 +56,10 @@ def client(repo, ca_dir: pathlib.Path):
return repo
app.dependency_overrides[get_repo] = _override
with TestClient(app) as c:
# client=loopback so the operator-cert gate takes its certless-loopback
# path (the shipping single-host default); TestClient otherwise reports
# host "testclient", which the gate treats as off-box.
with TestClient(app, client=("127.0.0.1", 50000)) as c:
yield c
app.dependency_overrides.clear()

View File

@@ -0,0 +1,135 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Authorization for the swarm control plane.
Two layers, both fail-closed:
1. ``_guard_bind`` refuses a routable bind without --tls (CLI startup).
2. ``require_operator_cert`` gates every controller endpoint (HTTP layer).
No live TLS: the off-box case is simulated by giving the TestClient a
non-loopback client address with no peer cert in scope.
"""
from __future__ import annotations
import pathlib
from typing import Any
import contextlib
import pytest
import typer
from fastapi.testclient import TestClient
from typer.testing import CliRunner
from decnet.cli.swarmctl import _guard_bind
from decnet.web.db.factory import get_repository
from decnet.web.dependencies import get_repo
# ------------------------- layer 1: bind guard ------------------------------
@pytest.mark.parametrize("host", ["127.0.0.1", "::1", "localhost"])
def test_guard_bind_allows_loopback_plaintext(host: str) -> None:
    """A loopback bind without TLS is permitted: the guard returns normally."""
    # Passing means no typer.Exit escaped the call.
    _guard_bind(host=host, tls=False)
@pytest.mark.parametrize("host", ["0.0.0.0", "10.0.0.5", "192.168.1.10"])
def test_guard_bind_allows_routable_with_tls(host: str) -> None:
    """With TLS requested, a non-loopback bind is legitimate and not refused."""
    # Passing means no typer.Exit escaped the call.
    _guard_bind(host=host, tls=True)
@pytest.mark.parametrize("host", ["0.0.0.0", "10.0.0.5"])
def test_guard_bind_refuses_routable_plaintext(host: str) -> None:
    """A non-loopback bind without TLS makes the guard exit with code 2."""
    with pytest.raises(typer.Exit) as excinfo:
        _guard_bind(host, tls=False)
    # Checked after the raises-block so the assertion actually executes.
    assert excinfo.value.exit_code == 2
def test_swarmctl_cli_refuses_routable_plaintext(monkeypatch: pytest.MonkeyPatch) -> None:
    """Wiring check: the bind guard fires before any subprocess is spawned."""
    import subprocess

    from decnet.cli import app

    spawn_seen = {"popen": False}

    def _forbidden_popen(*a: Any, **k: Any):  # pragma: no cover - must not run
        # Record the call first so the final assertion catches it even if the
        # AssertionError is swallowed somewhere inside the CLI runner.
        spawn_seen["popen"] = True
        raise AssertionError("subprocess.Popen must not be reached")

    monkeypatch.setattr(subprocess, "Popen", _forbidden_popen)

    cli_args = ["swarmctl", "--host", "0.0.0.0", "--no-listener"]
    outcome = CliRunner().invoke(app, cli_args)

    assert outcome.exit_code == 2
    assert spawn_seen["popen"] is False
# ------------------------- layer 2: endpoint operator gate ------------------
@pytest.fixture
def ca_dir(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch) -> pathlib.Path:
    """Point the PKI default CA directory at a per-test temporary path.

    Returns the path (``tmp_path / "ca"``); the fixture itself does not
    create the directory.
    """
    from decnet.swarm import pki
    from decnet.web.router.swarm import api_enroll_host as enroll_mod

    ca_path = tmp_path / "ca"
    monkeypatch.setattr(pki, "DEFAULT_CA_DIR", ca_path)
    # Rebind the enroll module's ``pki`` reference to the patched module.
    monkeypatch.setattr(enroll_mod, "pki", pki)
    return ca_path
@pytest.fixture
def repo(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch):
    """Create a repository backed by a per-test database file.

    The same instance is installed as the ``repo`` attribute of both
    ``decnet.web.dependencies`` and ``decnet.web.swarm_api`` for the duration
    of the test.
    """
    import decnet.web.dependencies as deps
    import decnet.web.swarm_api as swarm_api_mod

    test_repo = get_repository(db_path=str(tmp_path / "authz.db"))
    for module in (deps, swarm_api_mod):
        monkeypatch.setattr(module, "repo", test_repo)
    return test_repo
@contextlib.contextmanager
def _client(repo, client_addr: tuple[str, int]):
    """Yield a TestClient for the swarm API that reports ``client_addr``.

    ``get_repo`` is overridden to return ``repo`` while the client is open;
    all dependency overrides are cleared on exit, even if the body raises.
    """
    # The `with TestClient(...)` form runs the controller lifespan, which
    # creates the swarm schema against the test repo.
    from decnet.web.swarm_api import app

    async def _use_test_repo() -> Any:
        return repo

    app.dependency_overrides[get_repo] = _use_test_repo
    try:
        with TestClient(app, client=client_addr) as test_client:
            yield test_client
    finally:
        app.dependency_overrides.clear()
def test_offbox_certless_caller_is_refused_on_every_operator_route(
    repo, ca_dir: pathlib.Path
) -> None:
    """An off-box caller with no peer cert gets 403 on the operator routes."""
    # No TLS peer cert + non-loopback client = an off-box attacker. Every
    # operator route must 403 (the bind guard makes this combination
    # unreachable in production, but the HTTP layer fails closed regardless).
    # NOTE(review): deploy, decommission and the single-host GET are not
    # exercised here — confirm whether they should be added to this table.
    probes = (
        (
            "POST",
            "/swarm/enroll",
            {"name": "evil", "address": "10.0.0.99", "agent_port": 8765},
        ),
        ("GET", "/swarm/hosts", None),
        ("POST", "/swarm/check", None),
        ("GET", "/swarm/deckies", None),
        ("POST", "/swarm/teardown", {}),
    )
    with _client(repo, ("10.0.0.99", 40000)) as c:
        for method, path, body in probes:
            if method == "GET":
                response = c.get(path)
            elif body is None:
                response = c.post(path)
            else:
                response = c.post(path, json=body)
            assert response.status_code == 403
def test_loopback_operator_is_allowed(repo, ca_dir: pathlib.Path) -> None:
    """A certless loopback caller can enroll a host and then list it."""
    # The shipping single-host default: local operator over plaintext loopback.
    enroll_payload = {"name": "worker-ok", "address": "10.0.0.5", "agent_port": 8765}
    with _client(repo, ("127.0.0.1", 40000)) as c:
        enrolled = c.post("/swarm/enroll", json=enroll_payload)
        assert enrolled.status_code == 201, enrolled.text

        listed = c.get("/swarm/hosts")
        assert listed.status_code == 200
        # The host enrolled above must show up in the listing.
        assert any(entry["name"] == "worker-ok" for entry in listed.json())