Files
DECNET/decnet/web/router/swarm/api_check_hosts.py
anti 30750d294d fix(swarm): mTLS client-cert authz on the swarm control plane
The swarm controller (port 8770) exposed 9 routes with zero app-layer
auth, and swarmctl --tls defaulted off — anyone able to reach the port
could enroll workers (minting CA-signed certs + private keys), deploy,
or tear down the fleet. Two fail-closed layers:

- require_operator_cert gates every operator route (enroll/deploy/
  teardown/hosts/check/deckies). When mTLS is on, the peer cert's CN
  must be an operator identity (decnet-master/swarmctl); worker and
  updater@* certs are rejected. Plaintext loopback (single-host master)
  is accepted as the local operator — the docker.sock boundary.
- swarmctl refuses to bind a routable interface without --tls, so a
  network-exposed plaintext control plane can never start.

/heartbeat keeps its worker fingerprint pinning. Closes the two ASVS
criticals (control-plane no-auth, unauthenticated cert minting).
2026-05-30 17:16:12 -04:00

65 lines
2.2 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
"""POST /swarm/check — active mTLS probe of every enrolled worker.
Updates ``SwarmHost.status`` and ``last_heartbeat`` for each host based
on the outcome of the probe.
"""
from __future__ import annotations
import asyncio
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, Depends
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.router.swarm._mtls import PeerCert, require_operator_cert
from decnet.web.db.models import SwarmCheckResponse, SwarmHostHealth
# Module-level logger, registered under the "swarm.check" name.
log = get_logger("swarm.check")
# Router that the /check route below is attached to via @router.post.
router = APIRouter()
@router.post("/check", response_model=SwarmCheckResponse, tags=["Swarm Health"])
async def api_check_hosts(
    repo: BaseRepository = Depends(get_repo),
    _operator: PeerCert = Depends(require_operator_cert),
) -> SwarmCheckResponse:
    """Probe every enrolled swarm host and persist the outcome.

    All hosts returned by ``repo.list_swarm_hosts()`` are probed
    concurrently. A host whose health call succeeds is stored as
    ``"active"`` with a fresh UTC ``last_heartbeat``; a host whose probe
    raises is stored as ``"unreachable"``.

    Args:
        repo: Repository used to list the hosts and to store each result.
        _operator: Result of the ``require_operator_cert`` dependency. The
            value is not read here; it is declared so the dependency runs
            before the handler body.

    Returns:
        A ``SwarmCheckResponse`` with one ``SwarmHostHealth`` entry per
        host, in the same order as the repository listing.

    Raises:
        Exception: Anything raised by the repository calls propagates,
            because ``asyncio.gather`` is used without
            ``return_exceptions``.
    """
    hosts = await repo.list_swarm_hosts()

    async def _probe(host: dict[str, Any]) -> SwarmHostHealth:
        """Probe one host, store its status, and return its health entry."""
        # Only the network probe lives inside the ``try``. A failure while
        # persisting the "active" status must not be mistaken for a probe
        # failure, otherwise a reachable host would be logged and stored
        # as "unreachable".
        try:
            async with AgentClient(host=host) as agent:
                body = await agent.health()
        except Exception as exc:
            # Some exceptions (for example a bare TimeoutError) stringify
            # to "", so fall back to the class name to keep the log line
            # and the response detail informative.
            reason = str(exc) or type(exc).__name__
            log.warning(
                "swarm.check unreachable host=%s err=%s", host["name"], reason
            )
            await repo.update_swarm_host(host["uuid"], {"status": "unreachable"})
            return SwarmHostHealth(
                host_uuid=host["uuid"],
                name=host["name"],
                address=host["address"],
                reachable=False,
                detail=reason,
            )

        # Probe succeeded: record the host as active with a timezone-aware
        # timestamp, then report the health payload it returned.
        await repo.update_swarm_host(
            host["uuid"],
            {
                "status": "active",
                "last_heartbeat": datetime.now(timezone.utc),
            },
        )
        return SwarmHostHealth(
            host_uuid=host["uuid"],
            name=host["name"],
            address=host["address"],
            reachable=True,
            detail=body,
        )

    # gather() preserves argument order, so results line up with ``hosts``.
    results = await asyncio.gather(*(_probe(h) for h in hosts))
    return SwarmCheckResponse(results=list(results))