fix(swarm): mTLS client-cert authz on the swarm control plane
The swarm controller (port 8770) exposed 9 routes with zero app-layer auth, and swarmctl --tls defaulted to off — anyone able to reach the port could enroll workers (minting CA-signed certs + private keys), deploy, or tear down the fleet.

Two fail-closed layers:

- require_operator_cert gates every operator route (enroll/deploy/teardown/hosts/check/deckies). When mTLS is on, the peer cert's CN must be an operator identity (decnet-master/swarmctl); worker and updater@* certs are rejected. Plaintext loopback (single-host master) is accepted as the local operator — the docker.sock boundary.
- swarmctl refuses to bind a routable interface without --tls, so a network-exposed plaintext control plane can never start.

/heartbeat keeps its worker fingerprint pinning.

Closes the two ASVS criticals (control-plane no-auth, unauthenticated cert minting).
This commit is contained in:
@@ -42,6 +42,12 @@ log = get_logger("swarm.mtls")
|
||||
# or tear the fleet down.
|
||||
OPERATOR_CNS = frozenset({"decnet-master", "swarmctl"})
|
||||
|
||||
# Hosts treated as "the master box itself". A certless request is only accepted
|
||||
# from these — the single-operator loopback boundary (same model as
|
||||
# docker.sock). Any routable bind is forced onto mTLS by the swarmctl startup
|
||||
# guard, so a certless request can never legitimately arrive from off-box.
|
||||
LOOPBACK_HOSTS = frozenset({"127.0.0.1", "::1", "localhost"})
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class PeerCert:
|
||||
@@ -131,17 +137,37 @@ def extract_peer_fingerprint(scope: MutableMapping[str, Any]) -> Optional[str]:
|
||||
return hashlib.sha256(der).hexdigest().lower()
|
||||
|
||||
|
||||
def require_operator_cert(request: Request) -> PeerCert:
|
||||
"""FastAPI dependency: require a CA-signed cert whose CN is an operator.
|
||||
def _client_is_loopback(request: Request) -> bool:
    """Report whether the request's client address is a loopback host.

    The client host is compared against :data:`LOOPBACK_HOSTS`. A request
    that carries no client information, or a client without a ``host``
    attribute, is treated as non-loopback.
    """
    peer_addr = getattr(request, "client", None)
    if peer_addr is None:
        # No client info at all: cannot be proven local.
        return False
    return getattr(peer_addr, "host", None) in LOOPBACK_HOSTS
|
||||
|
||||
Transport-layer mTLS (``--ssl-cert-reqs 2``) has already proven the cert is
|
||||
CA-signed; here we enforce that its CN is in :data:`OPERATOR_CNS`. Worker
|
||||
and ``updater@*`` certs are rejected with 403.
|
||||
|
||||
def require_operator_cert(request: Request) -> PeerCert:
    """FastAPI dependency authorizing a swarm control-plane operation.

    Two accepted paths, matching the deployment posture:

    * **mTLS on** (any routable bind — enforced by the swarmctl startup guard):
      a peer cert is present. Transport already proved it is CA-signed; we
      additionally require its CN to be in :data:`OPERATOR_CNS`. Worker and
      ``updater@*`` certs are rejected — a worker's still-valid cert must never
      drive enroll/deploy/teardown.
    * **Loopback plaintext** (single-host master, the shipping default): no peer
      cert, but the request came from ``127.0.0.1``/``::1``. Accepted as the
      local operator — the same trust boundary as ``docker.sock``.

    A certless request from any non-loopback client is refused (fail-closed);
    in practice the startup guard prevents that combination from arising.

    Returns:
        The presented :class:`PeerCert` on the mTLS path, or a placeholder
        ``PeerCert(sha256="", cn=None)`` for the certless loopback operator.

    Raises:
        HTTPException: 403 when the presented cert's CN is not an operator
            identity, or when no cert is presented by a non-loopback client.
    """
    peer = extract_peer_cert(request.scope)

    if peer is not None:
        # A cert was presented: only its CN decides. A non-operator cert is
        # rejected outright and never falls through to the loopback path.
        if peer.cn not in OPERATOR_CNS:
            log.warning("rejected non-operator cert on control plane: cn=%r", peer.cn)
            raise HTTPException(status_code=403, detail="operator certificate required")
        return peer

    if _client_is_loopback(request):
        # Local operator on the master box; no client cert over plaintext loopback.
        return PeerCert(sha256="", cn=None)

    # Certless and not local: fail closed.
    raise HTTPException(status_code=403, detail="operator certificate required")
|
||||
|
||||
@@ -16,6 +16,7 @@ from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
|
||||
from decnet.web.db.models import SwarmCheckResponse, SwarmHostHealth
|
||||
|
||||
log = get_logger("swarm.check")
|
||||
@@ -26,6 +27,7 @@ router = APIRouter()
|
||||
@router.post("/check", response_model=SwarmCheckResponse, tags=["Swarm Health"])
|
||||
async def api_check_hosts(
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
_operator: PeerCert = Depends(require_operator_cert),
|
||||
) -> SwarmCheckResponse:
|
||||
hosts = await repo.list_swarm_hosts()
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
|
||||
|
||||
log = get_logger("swarm.decommission")
|
||||
router = APIRouter()
|
||||
@@ -32,6 +33,7 @@ router = APIRouter()
|
||||
async def api_decommission_host(
|
||||
uuid: str,
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
_operator: PeerCert = Depends(require_operator_cert),
|
||||
) -> None:
|
||||
row = await repo.get_swarm_host_by_uuid(uuid)
|
||||
if row is None:
|
||||
|
||||
@@ -21,6 +21,7 @@ from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
|
||||
from decnet.web.db.models import (
|
||||
SwarmDeployRequest,
|
||||
SwarmDeployResponse,
|
||||
@@ -160,6 +161,7 @@ async def dispatch_decnet_config(
|
||||
async def api_deploy_swarm(
|
||||
req: SwarmDeployRequest,
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
_operator: PeerCert = Depends(require_operator_cert),
|
||||
) -> SwarmDeployResponse:
|
||||
if req.config.mode != "swarm":
|
||||
raise HTTPException(status_code=400, detail="mode must be 'swarm'")
|
||||
|
||||
@@ -6,8 +6,10 @@ generates a fresh worker keypair + CA-signed cert, and returns the full
|
||||
bundle to the operator. Bundle delivery to the worker (scp/sshpass/etc.)
|
||||
is outside this process's trust boundary.
|
||||
|
||||
Rationale: the worker agent speaks ONLY mTLS; there is no pre-auth
|
||||
bootstrap endpoint, so nothing to attack before the worker is enrolled.
|
||||
Authorization: this mints a CA-signed identity (and its private key), so it
|
||||
is gated by :func:`require_operator_cert` — an operator-CN client cert when
|
||||
the controller runs mTLS, or a local request when it is loopback-bound.
|
||||
A worker's own cert cannot enroll further hosts.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -20,6 +22,7 @@ from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from decnet.swarm import pki
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
|
||||
from decnet.web.db.models import SwarmEnrolledBundle, SwarmEnrollRequest, SwarmUpdaterBundle
|
||||
|
||||
router = APIRouter()
|
||||
@@ -39,6 +42,7 @@ router = APIRouter()
|
||||
async def api_enroll_host(
|
||||
req: SwarmEnrollRequest,
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
_operator: PeerCert = Depends(require_operator_cert),
|
||||
) -> SwarmEnrolledBundle:
|
||||
existing = await repo.get_swarm_host_by_name(req.name)
|
||||
if existing is not None:
|
||||
|
||||
@@ -6,6 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
|
||||
from decnet.web.db.models import SwarmHostView
|
||||
|
||||
router = APIRouter()
|
||||
@@ -20,6 +21,7 @@ router = APIRouter()
|
||||
async def api_get_host(
|
||||
uuid: str,
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
_operator: PeerCert = Depends(require_operator_cert),
|
||||
) -> SwarmHostView:
|
||||
row = await repo.get_swarm_host_by_uuid(uuid)
|
||||
if row is None:
|
||||
|
||||
@@ -14,6 +14,7 @@ from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
|
||||
from decnet.web.db.models import DeckyShardView
|
||||
|
||||
router = APIRouter()
|
||||
@@ -24,6 +25,7 @@ async def api_list_deckies(
|
||||
host_uuid: Optional[str] = None,
|
||||
state: Optional[str] = None,
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
_operator: PeerCert = Depends(require_operator_cert),
|
||||
) -> list[DeckyShardView]:
|
||||
shards = await repo.list_decky_shards(host_uuid)
|
||||
hosts = {h["uuid"]: h for h in await repo.list_swarm_hosts()}
|
||||
|
||||
@@ -8,6 +8,7 @@ from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
|
||||
from decnet.web.db.models import SwarmHostView
|
||||
|
||||
router = APIRouter()
|
||||
@@ -17,6 +18,7 @@ router = APIRouter()
|
||||
async def api_list_hosts(
|
||||
host_status: Optional[str] = None,
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
_operator: PeerCert = Depends(require_operator_cert),
|
||||
) -> list[SwarmHostView]:
|
||||
rows = await repo.list_swarm_hosts(host_status)
|
||||
return [SwarmHostView(**r) for r in rows]
|
||||
|
||||
@@ -11,6 +11,7 @@ from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
|
||||
from decnet.web.db.models import (
|
||||
SwarmDeployResponse,
|
||||
SwarmHostResult,
|
||||
@@ -35,6 +36,7 @@ router = APIRouter()
|
||||
async def api_teardown_swarm(
|
||||
req: SwarmTeardownRequest,
|
||||
repo: BaseRepository = Depends(get_repo),
|
||||
_operator: PeerCert = Depends(require_operator_cert),
|
||||
) -> SwarmDeployResponse:
|
||||
if req.host_uuid is not None:
|
||||
row = await repo.get_swarm_host_by_uuid(req.host_uuid)
|
||||
|
||||
Reference in New Issue
Block a user