feat(swarmctl): --tls with auto-issued or BYOC server cert

swarmctl CLI gains --tls/--cert/--key/--client-ca flags. With --tls the
controller runs uvicorn under HTTPS + mTLS (CERT_REQUIRED) so worker
heartbeats can reach it cross-host. Default is still 127.0.0.1 plaintext
for backwards compat with the master-CLI enrollment flow.

Auto-issue path (no --cert/--key given): a server cert signed by the
existing DECNET CA is issued once and stored under ~/.decnet/swarmctl/.
Workers already ship that CA's ca.crt from the enroll bundle, so they
verify the endpoint with no extra trust config. Pass --cert/--key
(BYOC, bring-your-own-cert) when the operator wants a publicly-trusted
or externally-managed cert. The auto-cert path is idempotent across
restarts, so the server cert fingerprint stays stable for any
long-lived mTLS sessions.
This commit is contained in:
2026-04-19 21:46:32 -04:00
parent e411063075
commit 62f7c88b90
3 changed files with 109 additions and 2 deletions

View File

@@ -188,6 +188,10 @@ def swarmctl(
host: str = typer.Option("127.0.0.1", "--host", help="Bind address for the swarm controller"), host: str = typer.Option("127.0.0.1", "--host", help="Bind address for the swarm controller"),
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
no_listener: bool = typer.Option(False, "--no-listener", help="Do not auto-spawn the syslog-TLS listener alongside swarmctl"), no_listener: bool = typer.Option(False, "--no-listener", help="Do not auto-spawn the syslog-TLS listener alongside swarmctl"),
tls: bool = typer.Option(False, "--tls", help="Serve over HTTPS with mTLS (required for cross-host worker heartbeats)"),
cert: Optional[str] = typer.Option(None, "--cert", help="BYOC: path to TLS server cert (PEM). Auto-issues from the DECNET CA if omitted."),
key: Optional[str] = typer.Option(None, "--key", help="BYOC: path to TLS server private key (PEM)."),
client_ca: Optional[str] = typer.Option(None, "--client-ca", help="CA bundle used to verify worker client certs. Defaults to the DECNET CA."),
) -> None: ) -> None:
"""Run the DECNET SWARM controller (master-side, separate process from `decnet api`). """Run the DECNET SWARM controller (master-side, separate process from `decnet api`).
@@ -197,6 +201,12 @@ def swarmctl(
survives swarmctl restarts and crashes — if it dies on its own, survives swarmctl restarts and crashes — if it dies on its own,
restart it manually with `decnet listener --daemon …`. Pass restart it manually with `decnet listener --daemon …`. Pass
--no-listener to skip. --no-listener to skip.
Pass ``--tls`` to serve over HTTPS with mutual-TLS enforcement. By
default the server cert is auto-issued from the DECNET CA under
``~/.decnet/swarmctl/`` so enrolled workers (which already ship that
CA's ``ca.crt``) trust it out of the box. BYOC via ``--cert``/``--key``
if you need a publicly-trusted or externally-managed cert.
""" """
import subprocess # nosec B404 import subprocess # nosec B404
import sys import sys
@@ -226,10 +236,35 @@ def swarmctl(
log.warning("swarmctl could not auto-spawn listener: %s", e) log.warning("swarmctl could not auto-spawn listener: %s", e)
console.print(f"[yellow]listener auto-spawn skipped: {e}[/]") console.print(f"[yellow]listener auto-spawn skipped: {e}[/]")
log.info("swarmctl command invoked host=%s port=%d", host, port) log.info("swarmctl command invoked host=%s port=%d tls=%s", host, port, tls)
console.print(f"[green]Starting DECNET SWARM controller on {host}:{port}...[/]") scheme = "https" if tls else "http"
console.print(f"[green]Starting DECNET SWARM controller on {scheme}://{host}:{port}...[/]")
_cmd = [sys.executable, "-m", "uvicorn", "decnet.web.swarm_api:app", _cmd = [sys.executable, "-m", "uvicorn", "decnet.web.swarm_api:app",
"--host", host, "--port", str(port)] "--host", host, "--port", str(port)]
if tls:
from decnet.swarm import pki as _pki
# BYOC path: operator supplied cert+key explicitly. Else auto-issue
# from the existing DECNET CA so workers' already-deployed ca.crt
# verifies the endpoint with no extra steps.
if cert and key:
cert_path, key_path = cert, key
elif cert or key:
console.print("[red]--cert and --key must be provided together.[/]")
raise typer.Exit(code=2)
else:
auto_cert, auto_key, _auto_ca = _pki.ensure_swarmctl_cert(host)
cert_path, key_path = str(auto_cert), str(auto_key)
console.print(f"[dim]Auto-issued swarmctl server cert → {cert_path}[/]")
ca_path = client_ca or str(_pki.DEFAULT_CA_DIR / "ca.crt")
_cmd += [
"--ssl-keyfile", key_path,
"--ssl-certfile", cert_path,
"--ssl-ca-certs", ca_path,
# CERT_REQUIRED — a worker must present a CA-signed client cert
# before FastAPI sees the request. The heartbeat endpoint then
# pins the cert fingerprint per-host on top of this.
"--ssl-cert-reqs", "2",
]
try: try:
proc = subprocess.Popen(_cmd, start_new_session=True) # nosec B603 B404 proc = subprocess.Popen(_cmd, start_new_session=True) # nosec B603 B404
try: try:

View File

@@ -43,6 +43,7 @@ from cryptography.x509.oid import NameOID
DEFAULT_CA_DIR = pathlib.Path(os.path.expanduser("~/.decnet/ca")) DEFAULT_CA_DIR = pathlib.Path(os.path.expanduser("~/.decnet/ca"))
DEFAULT_AGENT_DIR = pathlib.Path(os.path.expanduser("~/.decnet/agent")) DEFAULT_AGENT_DIR = pathlib.Path(os.path.expanduser("~/.decnet/agent"))
DEFAULT_SWARMCTL_DIR = pathlib.Path(os.path.expanduser("~/.decnet/swarmctl"))
CA_KEY_BITS = 4096 CA_KEY_BITS = 4096
WORKER_KEY_BITS = 2048 WORKER_KEY_BITS = 2048
@@ -277,6 +278,45 @@ def load_worker_bundle(
) )
def ensure_swarmctl_cert(
    bind_host: str,
    ca_dir: pathlib.Path = DEFAULT_CA_DIR,
    swarmctl_dir: pathlib.Path = DEFAULT_SWARMCTL_DIR,
    extra_sans: Optional[list[str]] = None,
) -> tuple[pathlib.Path, pathlib.Path, pathlib.Path]:
    """Return (cert_path, key_path, ca_path), auto-issuing if missing.

    Uses the existing DECNET CA (ensuring it exists first) so workers
    whose bundle already includes ``ca.crt`` can verify the swarmctl
    endpoint without additional trust configuration. Self-signed is
    intentionally not the default — a cert signed by the same CA the
    workers already trust is the friction-free path.

    Callers that want BYOC should skip this and pass their own
    cert/key paths directly to uvicorn.

    Args:
        bind_host: Address swarmctl binds to; added to the cert's SANs.
        ca_dir: Directory holding the CA material (``ca.crt`` is read
            from here).
        swarmctl_dir: Directory where ``server.crt`` / ``server.key``
            are stored. Created if missing and forced to mode 0700.
        extra_sans: Additional SAN entries to include when a new cert
            is issued.

    Returns:
        ``(cert_path, key_path, ca_cert_path)``.

    Note:
        When ``server.crt``, ``server.key`` and ``ca.crt`` all already
        exist they are returned untouched: the existing cert is NOT
        re-checked against ``bind_host`` / ``extra_sans``. Delete the
        files under ``swarmctl_dir`` to force a re-issue.
    """
    swarmctl_dir.mkdir(parents=True, exist_ok=True)
    os.chmod(swarmctl_dir, 0o700)

    cert_path = swarmctl_dir / "server.crt"
    key_path = swarmctl_dir / "server.key"
    ca_cert_path = ca_dir / "ca.crt"

    # Idempotent fast path: never rotate an existing cert on restart.
    if cert_path.exists() and key_path.exists() and ca_cert_path.exists():
        return cert_path, key_path, ca_cert_path

    ca = ensure_ca(ca_dir)
    # sorted() rather than list(): set iteration order is arbitrary, and a
    # deterministic SAN order keeps re-issued certs reproducible.
    sans = sorted({bind_host, "127.0.0.1", "localhost", *(extra_sans or [])})
    issued = issue_worker_cert(ca, "swarmctl", sans)

    cert_path.write_bytes(issued.cert_pem)

    # Create the key file with 0600 from the start instead of writing it
    # under the default umask and tightening afterwards, so the private
    # key is never readable with wider permissions, even briefly.
    key_fd = os.open(key_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(key_fd, "wb") as key_file:
        # The mode passed to os.open() only applies when the file is
        # created; tighten a pre-existing file before any key bytes land.
        os.chmod(key_path, 0o600)
        key_file.write(issued.key_pem)

    # ensure_ca is expected to have written ca.crt under ca_dir already;
    # re-mirror it here so the returned path is guaranteed to exist.
    if not ca_cert_path.exists():
        ca_cert_path.write_bytes(ca.cert_pem)

    return cert_path, key_path, ca_cert_path
def fingerprint(cert_pem: bytes) -> str: def fingerprint(cert_pem: bytes) -> str:
"""SHA-256 hex fingerprint of a cert (DER-encoded).""" """SHA-256 hex fingerprint of a cert (DER-encoded)."""
cert = x509.load_pem_x509_certificate(cert_pem) cert = x509.load_pem_x509_certificate(cert_pem)

View File

@@ -52,6 +52,38 @@ def test_load_worker_bundle_returns_none_if_missing(tmp_path: pathlib.Path) -> N
assert pki.load_worker_bundle(tmp_path / "empty") is None assert pki.load_worker_bundle(tmp_path / "empty") is None
def test_ensure_swarmctl_cert_issues_from_same_ca(tmp_path: pathlib.Path) -> None:
    """Auto-issued server cert is CA-signed, carries the SANs, and has a 0600 key."""
    issued_paths = pki.ensure_swarmctl_cert(
        "0.0.0.0",
        ca_dir=tmp_path / "ca",
        swarmctl_dir=tmp_path / "swarmctl",
    )
    cert_path, key_path, ca_path = issued_paths
    assert all(produced.exists() for produced in issued_paths)

    # The auto-issue path only makes sense if the server cert chains to
    # the very CA the workers ship in their bundle.
    server_cert = x509.load_pem_x509_certificate(cert_path.read_bytes())
    ca_cert = x509.load_pem_x509_certificate(ca_path.read_bytes())
    assert server_cert.issuer == ca_cert.subject

    alt_names = server_cert.extensions.get_extension_for_class(
        x509.SubjectAlternativeName
    ).value
    ip_sans = {str(addr) for addr in alt_names.get_values_for_type(x509.IPAddress)}
    dns_sans = set(alt_names.get_values_for_type(x509.DNSName))
    assert "0.0.0.0" in ip_sans
    assert "localhost" in dns_sans

    # Same 0600 permission that is enforced on worker.key.
    key_mode = key_path.stat().st_mode & 0o777
    assert key_mode == 0o600
def test_ensure_swarmctl_cert_is_idempotent(tmp_path: pathlib.Path) -> None:
    """A second call must hand back the very same cert bytes.

    Re-issuing on every call would rotate the server cert on each
    swarmctl restart and break any worker mid-TLS-session.
    """
    dirs = {"ca_dir": tmp_path / "ca", "swarmctl_dir": tmp_path / "swarmctl"}

    cert_before, _, _ = pki.ensure_swarmctl_cert("0.0.0.0", **dirs)
    pem_before = cert_before.read_bytes()

    cert_after, _, _ = pki.ensure_swarmctl_cert("0.0.0.0", **dirs)
    assert cert_after.read_bytes() == pem_before
def test_fingerprint_stable_across_calls(tmp_path: pathlib.Path) -> None: def test_fingerprint_stable_across_calls(tmp_path: pathlib.Path) -> None:
ca = pki.ensure_ca(tmp_path / "ca") ca = pki.ensure_ca(tmp_path / "ca")
issued = pki.issue_worker_cert(ca, "worker-03", ["127.0.0.1"]) issued = pki.issue_worker_cert(ca, "worker-03", ["127.0.0.1"])