feat(swarmctl): --tls with auto-issued or BYOC server cert
swarmctl CLI gains --tls/--cert/--key/--client-ca flags. With --tls the controller runs uvicorn under HTTPS + mTLS (CERT_REQUIRED) so worker heartbeats can reach it cross-host. The default is still plaintext on 127.0.0.1 for backwards compatibility with the master-CLI enrollment flow. Auto-issue path (no --cert/--key given): a server cert signed by the existing DECNET CA is issued once and stored under ~/.decnet/swarmctl/. Workers already ship that CA's ca.crt from the enroll bundle, so they verify the endpoint with no extra trust config. Bring-your-own-cert (BYOC) path: pass --cert/--key when the operator wants a publicly-trusted or externally-managed cert. The auto-cert path is idempotent across restarts, which keeps the server cert fingerprint stable for any long-lived mTLS sessions.
This commit is contained in:
@@ -188,6 +188,10 @@ def swarmctl(
|
|||||||
host: str = typer.Option("127.0.0.1", "--host", help="Bind address for the swarm controller"),
|
host: str = typer.Option("127.0.0.1", "--host", help="Bind address for the swarm controller"),
|
||||||
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
|
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
|
||||||
no_listener: bool = typer.Option(False, "--no-listener", help="Do not auto-spawn the syslog-TLS listener alongside swarmctl"),
|
no_listener: bool = typer.Option(False, "--no-listener", help="Do not auto-spawn the syslog-TLS listener alongside swarmctl"),
|
||||||
|
tls: bool = typer.Option(False, "--tls", help="Serve over HTTPS with mTLS (required for cross-host worker heartbeats)"),
|
||||||
|
cert: Optional[str] = typer.Option(None, "--cert", help="BYOC: path to TLS server cert (PEM). Auto-issues from the DECNET CA if omitted."),
|
||||||
|
key: Optional[str] = typer.Option(None, "--key", help="BYOC: path to TLS server private key (PEM)."),
|
||||||
|
client_ca: Optional[str] = typer.Option(None, "--client-ca", help="CA bundle used to verify worker client certs. Defaults to the DECNET CA."),
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Run the DECNET SWARM controller (master-side, separate process from `decnet api`).
|
"""Run the DECNET SWARM controller (master-side, separate process from `decnet api`).
|
||||||
|
|
||||||
@@ -197,6 +201,12 @@ def swarmctl(
|
|||||||
survives swarmctl restarts and crashes — if it dies on its own,
|
survives swarmctl restarts and crashes — if it dies on its own,
|
||||||
restart it manually with `decnet listener --daemon …`. Pass
|
restart it manually with `decnet listener --daemon …`. Pass
|
||||||
--no-listener to skip.
|
--no-listener to skip.
|
||||||
|
|
||||||
|
Pass ``--tls`` to serve over HTTPS with mutual-TLS enforcement. By
|
||||||
|
default the server cert is auto-issued from the DECNET CA under
|
||||||
|
``~/.decnet/swarmctl/`` so enrolled workers (which already ship that
|
||||||
|
CA's ``ca.crt``) trust it out of the box. BYOC via ``--cert``/``--key``
|
||||||
|
if you need a publicly-trusted or externally-managed cert.
|
||||||
"""
|
"""
|
||||||
import subprocess # nosec B404
|
import subprocess # nosec B404
|
||||||
import sys
|
import sys
|
||||||
@@ -226,10 +236,35 @@ def swarmctl(
|
|||||||
log.warning("swarmctl could not auto-spawn listener: %s", e)
|
log.warning("swarmctl could not auto-spawn listener: %s", e)
|
||||||
console.print(f"[yellow]listener auto-spawn skipped: {e}[/]")
|
console.print(f"[yellow]listener auto-spawn skipped: {e}[/]")
|
||||||
|
|
||||||
log.info("swarmctl command invoked host=%s port=%d", host, port)
|
log.info("swarmctl command invoked host=%s port=%d tls=%s", host, port, tls)
|
||||||
console.print(f"[green]Starting DECNET SWARM controller on {host}:{port}...[/]")
|
scheme = "https" if tls else "http"
|
||||||
|
console.print(f"[green]Starting DECNET SWARM controller on {scheme}://{host}:{port}...[/]")
|
||||||
_cmd = [sys.executable, "-m", "uvicorn", "decnet.web.swarm_api:app",
|
_cmd = [sys.executable, "-m", "uvicorn", "decnet.web.swarm_api:app",
|
||||||
"--host", host, "--port", str(port)]
|
"--host", host, "--port", str(port)]
|
||||||
|
if tls:
|
||||||
|
from decnet.swarm import pki as _pki
|
||||||
|
# BYOC path: operator supplied cert+key explicitly. Else auto-issue
|
||||||
|
# from the existing DECNET CA so workers' already-deployed ca.crt
|
||||||
|
# verifies the endpoint with no extra steps.
|
||||||
|
if cert and key:
|
||||||
|
cert_path, key_path = cert, key
|
||||||
|
elif cert or key:
|
||||||
|
console.print("[red]--cert and --key must be provided together.[/]")
|
||||||
|
raise typer.Exit(code=2)
|
||||||
|
else:
|
||||||
|
auto_cert, auto_key, _auto_ca = _pki.ensure_swarmctl_cert(host)
|
||||||
|
cert_path, key_path = str(auto_cert), str(auto_key)
|
||||||
|
console.print(f"[dim]Auto-issued swarmctl server cert → {cert_path}[/]")
|
||||||
|
ca_path = client_ca or str(_pki.DEFAULT_CA_DIR / "ca.crt")
|
||||||
|
_cmd += [
|
||||||
|
"--ssl-keyfile", key_path,
|
||||||
|
"--ssl-certfile", cert_path,
|
||||||
|
"--ssl-ca-certs", ca_path,
|
||||||
|
# CERT_REQUIRED — a worker must present a CA-signed client cert
|
||||||
|
# before FastAPI sees the request. The heartbeat endpoint then
|
||||||
|
# pins the cert fingerprint per-host on top of this.
|
||||||
|
"--ssl-cert-reqs", "2",
|
||||||
|
]
|
||||||
try:
|
try:
|
||||||
proc = subprocess.Popen(_cmd, start_new_session=True) # nosec B603 B404
|
proc = subprocess.Popen(_cmd, start_new_session=True) # nosec B603 B404
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ from cryptography.x509.oid import NameOID
|
|||||||
|
|
||||||
DEFAULT_CA_DIR = pathlib.Path(os.path.expanduser("~/.decnet/ca"))
|
DEFAULT_CA_DIR = pathlib.Path(os.path.expanduser("~/.decnet/ca"))
|
||||||
DEFAULT_AGENT_DIR = pathlib.Path(os.path.expanduser("~/.decnet/agent"))
|
DEFAULT_AGENT_DIR = pathlib.Path(os.path.expanduser("~/.decnet/agent"))
|
||||||
|
DEFAULT_SWARMCTL_DIR = pathlib.Path(os.path.expanduser("~/.decnet/swarmctl"))
|
||||||
|
|
||||||
CA_KEY_BITS = 4096
|
CA_KEY_BITS = 4096
|
||||||
WORKER_KEY_BITS = 2048
|
WORKER_KEY_BITS = 2048
|
||||||
@@ -277,6 +278,45 @@ def load_worker_bundle(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_swarmctl_cert(
    bind_host: str,
    ca_dir: pathlib.Path = DEFAULT_CA_DIR,
    swarmctl_dir: pathlib.Path = DEFAULT_SWARMCTL_DIR,
    extra_sans: Optional[list[str]] = None,
) -> tuple[pathlib.Path, pathlib.Path, pathlib.Path]:
    """Return (cert_path, key_path, ca_path), auto-issuing if missing.

    Uses the existing DECNET CA (ensuring it exists first) so workers
    whose bundle already includes ``ca.crt`` can verify the swarmctl
    endpoint without additional trust configuration. Self-signed is
    intentionally not the default — a cert signed by the same CA the
    workers already trust is the friction-free path.

    Callers that want BYOC should skip this and pass their own
    cert/key paths directly to uvicorn.

    Args:
        bind_host: Address swarmctl binds to; included in the cert SANs.
        ca_dir: Directory holding the CA material (``ca.crt`` lives here).
        swarmctl_dir: Directory where ``server.crt`` / ``server.key`` are
            stored. Created if missing and forced to mode 0700.
        extra_sans: Additional SAN entries to include when issuing.

    Returns:
        Paths to the server cert, the server key and the CA cert.

    Note:
        The call is idempotent: when ``server.crt``, ``server.key`` and
        ``ca.crt`` all exist they are returned untouched, so restarts do
        not rotate the cert. As a consequence ``bind_host`` and
        ``extra_sans`` only take effect on first issuance — delete the
        files under ``swarmctl_dir`` to re-issue with different SANs.
    """
    swarmctl_dir.mkdir(parents=True, exist_ok=True)
    os.chmod(swarmctl_dir, 0o700)
    cert_path = swarmctl_dir / "server.crt"
    key_path = swarmctl_dir / "server.key"
    ca_cert_path = ca_dir / "ca.crt"

    # Reuse whatever is already on disk so the fingerprint stays stable.
    if cert_path.exists() and key_path.exists() and ca_cert_path.exists():
        return cert_path, key_path, ca_cert_path

    ca = ensure_ca(ca_dir)
    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (bind_host first); a plain set would order the SANs differently from
    # run to run because of string hash randomisation.
    sans = list(
        dict.fromkeys([bind_host, "127.0.0.1", "localhost", *(extra_sans or [])])
    )
    issued = issue_worker_cert(ca, "swarmctl", sans)
    cert_path.write_bytes(issued.cert_pem)

    # Create the private key with 0600 from the start instead of writing it
    # under the process umask and tightening afterwards, so the key never
    # exists on disk with looser permissions.
    key_fd = os.open(key_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(key_fd, "wb") as key_file:
        key_file.write(issued.key_pem)
    # The mode passed to os.open is ignored when the file already existed
    # (e.g. a stale key left next to a missing cert), so enforce it again.
    os.chmod(key_path, 0o600)

    # ensure_ca already wrote ca.crt under ca_dir, but save_ca is only
    # called on generate — re-mirror it here to guarantee the path exists.
    if not ca_cert_path.exists():
        ca_cert_path.write_bytes(ca.cert_pem)
    return cert_path, key_path, ca_cert_path
|
||||||
|
|
||||||
|
|
||||||
def fingerprint(cert_pem: bytes) -> str:
|
def fingerprint(cert_pem: bytes) -> str:
|
||||||
"""SHA-256 hex fingerprint of a cert (DER-encoded)."""
|
"""SHA-256 hex fingerprint of a cert (DER-encoded)."""
|
||||||
cert = x509.load_pem_x509_certificate(cert_pem)
|
cert = x509.load_pem_x509_certificate(cert_pem)
|
||||||
|
|||||||
@@ -52,6 +52,38 @@ def test_load_worker_bundle_returns_none_if_missing(tmp_path: pathlib.Path) -> N
|
|||||||
assert pki.load_worker_bundle(tmp_path / "empty") is None
|
assert pki.load_worker_bundle(tmp_path / "empty") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_ensure_swarmctl_cert_issues_from_same_ca(tmp_path: pathlib.Path) -> None:
    """Auto-issued server cert chains to the DECNET CA and carries the expected SANs."""
    produced = pki.ensure_swarmctl_cert(
        "0.0.0.0",
        ca_dir=tmp_path / "ca",
        swarmctl_dir=tmp_path / "swarmctl",
    )
    server_crt, server_key, ca_crt = produced
    for artefact in produced:
        assert artefact.exists()

    # The leaf must be issued by the very CA workers ship in their bundle;
    # that shared trust anchor is the reason the auto-issue path exists.
    leaf = x509.load_pem_x509_certificate(server_crt.read_bytes())
    authority = x509.load_pem_x509_certificate(ca_crt.read_bytes())
    assert leaf.issuer == authority.subject

    alt_names = leaf.extensions.get_extension_for_class(
        x509.SubjectAlternativeName
    ).value
    ip_sans = {str(addr) for addr in alt_names.get_values_for_type(x509.IPAddress)}
    dns_sans = set(alt_names.get_values_for_type(x509.DNSName))
    assert "0.0.0.0" in ip_sans
    assert "localhost" in dns_sans

    # Private key carries the same 0600 mode enforced on worker.key.
    assert (server_key.stat().st_mode & 0o777) == 0o600
|
||||||
|
|
||||||
|
|
||||||
|
def test_ensure_swarmctl_cert_is_idempotent(tmp_path: pathlib.Path) -> None:
    """A second call must reuse the on-disk cert AND key, not re-issue them."""
    # Second call must NOT re-issue — otherwise a restart of swarmctl
    # would rotate the server cert and break any worker mid-TLS-session.
    ca_dir = tmp_path / "ca"
    swarmctl_dir = tmp_path / "swarmctl"
    first = pki.ensure_swarmctl_cert("0.0.0.0", ca_dir=ca_dir, swarmctl_dir=swarmctl_dir)
    first_cert_pem = first[0].read_bytes()
    first_key_pem = first[1].read_bytes()

    second = pki.ensure_swarmctl_cert("0.0.0.0", ca_dir=ca_dir, swarmctl_dir=swarmctl_dir)

    # Same locations are handed back on every call.
    assert second == first
    assert second[0].read_bytes() == first_cert_pem
    # Checking the cert alone would miss a re-issue that only rewrote the
    # key, which would leave a mismatched cert/key pair on disk.
    assert second[1].read_bytes() == first_key_pem
|
||||||
|
|
||||||
|
|
||||||
def test_fingerprint_stable_across_calls(tmp_path: pathlib.Path) -> None:
|
def test_fingerprint_stable_across_calls(tmp_path: pathlib.Path) -> None:
|
||||||
ca = pki.ensure_ca(tmp_path / "ca")
|
ca = pki.ensure_ca(tmp_path / "ca")
|
||||||
issued = pki.issue_worker_cert(ca, "worker-03", ["127.0.0.1"])
|
issued = pki.issue_worker_cert(ca, "worker-03", ["127.0.0.1"])
|
||||||
|
|||||||
Reference in New Issue
Block a user