fix(swarm): mTLS client-cert authz on the swarm control plane

The swarm controller (port 8770) exposed 9 routes with no
application-layer authentication, and swarmctl --tls defaulted to off,
so anyone able to reach the port could enroll workers (minting
CA-signed certs + private keys), deploy, or tear down the fleet. This
commit adds two fail-closed layers:

- require_operator_cert gates every operator route (enroll/deploy/
  teardown/decommission/hosts/check/deckies). When mTLS is on, the
  peer cert's CN must be an operator identity (decnet-master/swarmctl);
  worker and updater@* certs are rejected. Plaintext loopback
  (single-host master) is accepted as the local operator — the same
  trust boundary as docker.sock.
- swarmctl refuses to bind a routable interface without --tls, so a
  network-exposed plaintext control plane can never start.

/heartbeat keeps its worker fingerprint pinning. Closes the two critical
ASVS findings (control-plane no-auth, unauthenticated cert minting).
This commit is contained in:
2026-05-30 17:16:12 -04:00
parent e7a686206c
commit 30750d294d
16 changed files with 257 additions and 22 deletions

View File

@@ -42,6 +42,12 @@ log = get_logger("swarm.mtls")
# or tear the fleet down.
OPERATOR_CNS = frozenset({"decnet-master", "swarmctl"})
# Hosts treated as "the master box itself". A certless request is only accepted
# from these — the single-operator loopback boundary (same model as
# docker.sock). Any routable bind is forced onto mTLS by the swarmctl startup
# guard, so a certless request can never legitimately arrive from off-box.
LOOPBACK_HOSTS = frozenset({"127.0.0.1", "::1", "localhost"})
@dataclass(frozen=True)
class PeerCert:
@@ -131,17 +137,37 @@ def extract_peer_fingerprint(scope: MutableMapping[str, Any]) -> Optional[str]:
return hashlib.sha256(der).hexdigest().lower()
def require_operator_cert(request: Request) -> PeerCert:
"""FastAPI dependency: require a CA-signed cert whose CN is an operator.
def _client_is_loopback(request: Request) -> bool:
"""True iff the request originated from the master box's loopback."""
client = getattr(request, "client", None)
host = getattr(client, "host", None) if client is not None else None
return host in LOOPBACK_HOSTS
Transport-layer mTLS (``--ssl-cert-reqs 2``) has already proven the cert is
CA-signed; here we enforce that its CN is in :data:`OPERATOR_CNS`. Worker
and ``updater@*`` certs are rejected with 403.
def require_operator_cert(request: Request) -> PeerCert:
"""FastAPI dependency authorizing a swarm control-plane operation.
Two accepted paths, matching the deployment posture:
* **mTLS on** (any routable bind — enforced by the swarmctl startup guard):
a peer cert is present. Transport already proved it is CA-signed; we
additionally require its CN to be in :data:`OPERATOR_CNS`. Worker and
``updater@*`` certs are rejected — a worker's still-valid cert must never
drive enroll/deploy/teardown.
* **Loopback plaintext** (single-host master, the shipping default): no peer
cert, but the request came from ``127.0.0.1``/``::1``. Accepted as the
local operator — the same trust boundary as ``docker.sock``.
A certless request from any non-loopback client is refused (fail-closed);
in practice the startup guard prevents that combination from arising.
"""
peer = extract_peer_cert(request.scope)
if peer is None:
raise HTTPException(status_code=403, detail="peer cert unavailable")
if peer.cn not in OPERATOR_CNS:
log.warning("rejected non-operator cert on control plane: cn=%r", peer.cn)
raise HTTPException(status_code=403, detail="operator certificate required")
return peer
if peer is not None:
if peer.cn not in OPERATOR_CNS:
log.warning("rejected non-operator cert on control plane: cn=%r", peer.cn)
raise HTTPException(status_code=403, detail="operator certificate required")
return peer
if _client_is_loopback(request):
# Local operator on the master box; no client cert over plaintext loopback.
return PeerCert(sha256="", cn=None)
raise HTTPException(status_code=403, detail="operator certificate required")

View File

@@ -16,6 +16,7 @@ from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import SwarmCheckResponse, SwarmHostHealth
log = get_logger("swarm.check")
@@ -26,6 +27,7 @@ router = APIRouter()
@router.post("/check", response_model=SwarmCheckResponse, tags=["Swarm Health"])
async def api_check_hosts(
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> SwarmCheckResponse:
hosts = await repo.list_swarm_hosts()

View File

@@ -18,6 +18,7 @@ from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
log = get_logger("swarm.decommission")
router = APIRouter()
@@ -32,6 +33,7 @@ router = APIRouter()
async def api_decommission_host(
uuid: str,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> None:
row = await repo.get_swarm_host_by_uuid(uuid)
if row is None:

View File

@@ -21,6 +21,7 @@ from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import (
SwarmDeployRequest,
SwarmDeployResponse,
@@ -160,6 +161,7 @@ async def dispatch_decnet_config(
async def api_deploy_swarm(
req: SwarmDeployRequest,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> SwarmDeployResponse:
if req.config.mode != "swarm":
raise HTTPException(status_code=400, detail="mode must be 'swarm'")

View File

@@ -6,8 +6,10 @@ generates a fresh worker keypair + CA-signed cert, and returns the full
bundle to the operator. Bundle delivery to the worker (scp/sshpass/etc.)
is outside this process's trust boundary.
Rationale: the worker agent speaks ONLY mTLS; there is no pre-auth
bootstrap endpoint, so nothing to attack before the worker is enrolled.
Authorization: this mints a CA-signed identity (and its private key), so it
is gated by :func:`require_operator_cert` — an operator-CN client cert when
the controller runs mTLS, or a local request when it is loopback-bound.
A worker's own cert cannot enroll further hosts.
"""
from __future__ import annotations
@@ -20,6 +22,7 @@ from fastapi import APIRouter, Depends, HTTPException, status
from decnet.swarm import pki
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import SwarmEnrolledBundle, SwarmEnrollRequest, SwarmUpdaterBundle
router = APIRouter()
@@ -39,6 +42,7 @@ router = APIRouter()
async def api_enroll_host(
req: SwarmEnrollRequest,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> SwarmEnrolledBundle:
existing = await repo.get_swarm_host_by_name(req.name)
if existing is not None:

View File

@@ -6,6 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import SwarmHostView
router = APIRouter()
@@ -20,6 +21,7 @@ router = APIRouter()
async def api_get_host(
uuid: str,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> SwarmHostView:
row = await repo.get_swarm_host_by_uuid(uuid)
if row is None:

View File

@@ -14,6 +14,7 @@ from fastapi import APIRouter, Depends
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import DeckyShardView
router = APIRouter()
@@ -24,6 +25,7 @@ async def api_list_deckies(
host_uuid: Optional[str] = None,
state: Optional[str] = None,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> list[DeckyShardView]:
shards = await repo.list_decky_shards(host_uuid)
hosts = {h["uuid"]: h for h in await repo.list_swarm_hosts()}

View File

@@ -8,6 +8,7 @@ from fastapi import APIRouter, Depends
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import SwarmHostView
router = APIRouter()
@@ -17,6 +18,7 @@ router = APIRouter()
async def api_list_hosts(
host_status: Optional[str] = None,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> list[SwarmHostView]:
rows = await repo.list_swarm_hosts(host_status)
return [SwarmHostView(**r) for r in rows]

View File

@@ -11,6 +11,7 @@ from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import (
SwarmDeployResponse,
SwarmHostResult,
@@ -35,6 +36,7 @@ router = APIRouter()
async def api_teardown_swarm(
req: SwarmTeardownRequest,
repo: BaseRepository = Depends(get_repo),
_operator: PeerCert = Depends(require_operator_cert),
) -> SwarmDeployResponse:
if req.host_uuid is not None:
row = await repo.get_swarm_host_by_uuid(req.host_uuid)