sec(env): refuse to start master API with footgun public-binding config

Add validate_public_binding() called from the master API lifespan: when
DECNET_API_HOST is non-loopback, refuse to start if DECNET_CORS_ORIGINS
still contains a loopback origin (catches the "operator flipped to
0.0.0.0 to make it work and forgot to update CORS" footgun) or if
DECNET_CANARY_HTTP_BASE is plaintext http:// to a non-loopback host.
Log CRITICAL when DECNET_LIMITER_ENABLED=false on a public binding.
The validator no-ops under pytest so unrelated suites don't trip on it.

Add DECNET_VERIFY_HOSTNAME env knob; AgentClient and UpdaterClient
consult it when verify_hostname is None, giving production deploys
TLS hostname verification on top of the existing CA + fingerprint pin.
Default off so dev enrollments with mismatched SANs keep working.
This commit is contained in:
2026-04-27 21:15:15 -04:00
parent 28e2a93355
commit 1a7da33375
4 changed files with 200 additions and 5 deletions

View File

@@ -174,6 +174,101 @@ _cors_raw: str = os.environ.get("DECNET_CORS_ORIGINS", _cors_default)
DECNET_CORS_ORIGINS: list[str] = [o.strip() for o in _cors_raw.split(",") if o.strip()]
# Hostname verification for master→worker mTLS connections. Enabled only when
# the env var is "true" (case-insensitive). It stays off by default because
# legacy enrollments were issued certs with operator-supplied SAN lists that
# may not match the URL the master connects to; turn it on for a fresh
# production deploy where you control enrollment to add TLS hostname checks
# on top of CA + fingerprint pinning.
DECNET_VERIFY_HOSTNAME: bool = os.getenv("DECNET_VERIFY_HOSTNAME", "false").lower() == "true"

# Host names/addresses treated as loopback by the public-binding validator.
_LOOPBACK_HOSTS = {"::1", "127.0.0.1", "localhost"}
# Wildcard bind addresses (all interfaces).
_WILDCARD_BIND_HOSTS = {"::", "0.0.0.0"}  # nosec B104 — comparison only
def _origin_host(origin: str) -> str:
    """Pull the bare, lowercased hostname out of a CORS origin or URL.

    Accepts both full origins (``http(s)://host:port``) and scheme-less
    ``host[:port]`` values. The latter matter because an operator who writes
    ``localhost:8080`` into the allow-list would otherwise slip past the
    loopback check: ``urlparse`` finds no hostname in such a string.

    Args:
        origin: The origin/URL string to inspect.

    Returns:
        The hostname in lowercase (IPv6 literals without brackets). If no
        hostname can be extracted, the stripped, lowercased input is returned
        unchanged so the caller can still compare it against known hosts.
    """
    from urllib.parse import urlparse

    cleaned = origin.strip()
    lowered = cleaned.lower()
    try:
        host = urlparse(cleaned).hostname
        if not host and "://" not in cleaned:
            # No scheme: re-parse as a network-path reference so that
            # "localhost:8080" yields "localhost" instead of no host at all.
            host = urlparse(f"//{cleaned}").hostname
    except (ValueError, AttributeError):
        # e.g. malformed IPv6 literal — fall back to the raw value.
        return lowered
    return host.lower() if host else lowered
def validate_public_binding() -> None:
    """Refuse to start the master API/web with a footgun config.

    Three checks, all gated on the API binding being non-loopback (i.e.
    actually exposed to the network):

    * If CORS allow-list still contains a loopback origin, fail. The most
      common shape of this bug is operator flips ``DECNET_API_HOST=0.0.0.0``
      to "make it work" and forgets to update ``DECNET_CORS_ORIGINS`` —
      the dashboard then either can't talk to the API at all, or worse,
      they wildcard CORS to paper over it.
    * If the canary HTTP base is plaintext (``http://``) and the canary
      host isn't loopback, fail. Canary tokens phone home on trigger;
      plaintext over the public internet leaks the token to anyone on
      the path.
    * If the rate limiter is globally disabled, log loudly. Don't fail —
      operators sometimes want this for benchmarking — but never let it
      slip past unmentioned on a public binding.

    Called from the FastAPI lifespan so it surfaces at startup, not on
    first request. Skipped automatically when running under pytest so
    the test suite doesn't have to set five env vars per fixture.

    Raises:
        ValueError: If the binding is non-loopback and either the CORS
            allow-list contains a loopback origin or
            ``DECNET_CANARY_HTTP_BASE`` is plaintext HTTP to a non-loopback
            host.
    """
    # Any environment variable whose name starts with "PYTEST" disables the
    # validator entirely.
    if any(k.startswith("PYTEST") for k in os.environ):
        return

    if DECNET_API_HOST in _LOOPBACK_HOSTS:
        return  # not exposed; nothing to validate

    # Env var named in the error/log messages below. The original picked this
    # via a conditional whose two branches were identical.
    bind_label = "DECNET_API_HOST"

    loopback_origins = [o for o in DECNET_CORS_ORIGINS if _origin_host(o) in _LOOPBACK_HOSTS]
    if loopback_origins:
        raise ValueError(
            f"{bind_label}={DECNET_API_HOST!r} exposes the API to the network, "
            f"but DECNET_CORS_ORIGINS still contains loopback origin(s) "
            f"{loopback_origins!r}. Set DECNET_CORS_ORIGINS to the public "
            f"dashboard URL(s) before starting (e.g. "
            f"DECNET_CORS_ORIGINS=https://dashboard.example.com)."
        )

    # Read at call time (not import time) so the current environment decides.
    canary_base = os.environ.get("DECNET_CANARY_HTTP_BASE", "").strip()
    if canary_base and canary_base.lower().startswith("http://"):
        host = _origin_host(canary_base)
        if host and host not in _LOOPBACK_HOSTS:
            raise ValueError(
                f"DECNET_CANARY_HTTP_BASE={canary_base!r} is plaintext HTTP and "
                f"points at a non-loopback host. Canary triggers carry secrets "
                f"that must not cross the public internet in cleartext — use "
                f"https:// or front the canary endpoint with a TLS proxy."
            )

    # Only the literal value "false" (case-insensitive) counts as disabled.
    limiter_enabled = os.environ.get("DECNET_LIMITER_ENABLED", "true").lower() != "false"
    if not limiter_enabled:
        # Late import to avoid a circular dependency through decnet.logging.
        from decnet.logging import get_logger

        get_logger("env").critical(
            "DECNET_LIMITER_ENABLED=false on a public binding (%s=%s). "
            "Login + write endpoints have no rate limiting — only run this "
            "way for benchmarking or behind an external rate-limiting proxy.",
            bind_label, DECNET_API_HOST,
        )
def __getattr__(name: str) -> str:
"""Lazy resolution for secrets only the master web/api process needs."""
if name == "DECNET_JWT_SECRET":

View File

@@ -106,15 +106,20 @@ class AgentClient:
address: Optional[str] = None,
agent_port: Optional[int] = None,
identity: Optional[MasterIdentity] = None,
verify_hostname: bool = False,
verify_hostname: Optional[bool] = None,
):
"""Either pass a SwarmHost dict, or explicit address/port.
``verify_hostname`` stays False by default because the worker's
cert SAN is populated from the operator-supplied address list, not
from modern TLS hostname-verification semantics. The mTLS client
cert + CA pinning are what authenticate the peer.
``verify_hostname`` defers to ``DECNET_VERIFY_HOSTNAME`` when the
caller doesn't pass an explicit value — production deploys flip
the env var on so the worker's cert SAN must match the address
the master connects to, on top of the existing CA + fingerprint
pin. Defaults to False so dev/test enrollments with mismatched
SANs keep working unchanged.
"""
if verify_hostname is None:
from decnet.env import DECNET_VERIFY_HOSTNAME
verify_hostname = DECNET_VERIFY_HOSTNAME
if host is not None:
self._address = host["address"]
self._port = int(host.get("agent_port") or 8765)

View File

@@ -24,6 +24,7 @@ from decnet.env import (
DECNET_INGEST_LOG_FILE,
DECNET_PROFILE_DIR,
DECNET_PROFILE_REQUESTS,
validate_public_binding,
)
from decnet.logging import get_logger
from decnet.web.dependencies import repo
@@ -69,6 +70,11 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
soft,
)
# Refuse to come up with a footgun config on a public binding (loopback
# CORS origin while bound to 0.0.0.0, plaintext canary base, etc.).
# Raises ValueError with an actionable message; uvicorn surfaces it.
validate_public_binding()
log.info("API startup initialising database")
for attempt in range(1, 6):
try:

View File

@@ -0,0 +1,89 @@
"""validate_public_binding refuses footgun configs at master startup.
The validator no-ops under pytest by design (so unit tests in unrelated
modules don't have to set five env vars per fixture); these tests strip
the PYTEST_* vars before calling it so the real code path runs.
"""
from __future__ import annotations
import importlib
import sys
import pytest
def _reimport_env(monkeypatch: pytest.MonkeyPatch):
    """Import a fresh copy of ``decnet.env`` so it re-reads the environment.

    ``decnet.env`` computes its constants at import time, so tests that set
    env vars must drop the cached module and import it again.

    The cached entries are removed through ``monkeypatch`` rather than by
    popping ``sys.modules`` directly, so the original module objects are put
    back on teardown and a copy built from test-only env values does not
    leak into later tests.

    Args:
        monkeypatch: The pytest fixture used to record and undo the
            ``sys.modules`` changes.

    Returns:
        The newly imported ``decnet.env`` module.
    """
    for mod in list(sys.modules):
        if mod == "decnet.env" or mod.startswith("decnet.env."):
            monkeypatch.delitem(sys.modules, mod, raising=False)
    return importlib.import_module("decnet.env")
def _strip_pytest_vars(monkeypatch: pytest.MonkeyPatch) -> None:
    """Remove every ``PYTEST*`` environment variable for the current test.

    The validator returns early when such a variable is present; deleting
    them (via ``monkeypatch``, so they are restored afterwards) lets the
    real code path run.
    """
    import os

    # Snapshot the matching names first: the environment is mutated below.
    pytest_vars = [name for name in os.environ if name.startswith("PYTEST")]
    for name in pytest_vars:
        monkeypatch.delenv(name, raising=False)
def test_validator_noop_on_loopback_binding(monkeypatch: pytest.MonkeyPatch) -> None:
    """A loopback bind is not validated, even with a loopback CORS origin."""
    settings = {
        "DECNET_API_HOST": "127.0.0.1",
        "DECNET_CORS_ORIGINS": "http://localhost:8080",
    }
    for var, value in settings.items():
        monkeypatch.setenv(var, value)
    env_module = _reimport_env(monkeypatch)
    _strip_pytest_vars(monkeypatch)
    # Must return without raising.
    env_module.validate_public_binding()
def test_validator_rejects_loopback_cors_on_public_bind(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Binding to all interfaces with a loopback CORS origin must be refused."""
    settings = {
        "DECNET_API_HOST": "0.0.0.0",
        "DECNET_CORS_ORIGINS": "http://localhost:8080",
    }
    for var, value in settings.items():
        monkeypatch.setenv(var, value)
    env_module = _reimport_env(monkeypatch)
    _strip_pytest_vars(monkeypatch)
    with pytest.raises(ValueError, match="loopback origin"):
        env_module.validate_public_binding()
def test_validator_accepts_public_cors_on_public_bind(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A public bind with a public CORS origin passes validation."""
    monkeypatch.setenv("DECNET_API_HOST", "0.0.0.0")
    monkeypatch.setenv("DECNET_CORS_ORIGINS", "https://dashboard.example.com")
    # The validator also inspects DECNET_CANARY_HTTP_BASE; clear any ambient
    # value so a developer/CI environment that exports a plaintext canary URL
    # cannot make this "accepts" case raise.
    monkeypatch.delenv("DECNET_CANARY_HTTP_BASE", raising=False)
    env = _reimport_env(monkeypatch)
    _strip_pytest_vars(monkeypatch)
    env.validate_public_binding()  # no raise
def test_validator_rejects_plaintext_canary_on_public_bind(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A plaintext canary base on a non-loopback host must be refused."""
    settings = {
        "DECNET_API_HOST": "0.0.0.0",
        "DECNET_CORS_ORIGINS": "https://dashboard.example.com",
        "DECNET_CANARY_HTTP_BASE": "http://canary.example.com:8088",
    }
    for var, value in settings.items():
        monkeypatch.setenv(var, value)
    env_module = _reimport_env(monkeypatch)
    _strip_pytest_vars(monkeypatch)
    with pytest.raises(ValueError, match="plaintext HTTP"):
        env_module.validate_public_binding()
def test_validator_allows_loopback_canary_even_on_public_bind(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A plaintext canary base on loopback is accepted on a public bind.

    A local canary endpoint behind the master is fine; only public-facing
    plaintext is the footgun.
    """
    settings = {
        "DECNET_API_HOST": "0.0.0.0",
        "DECNET_CORS_ORIGINS": "https://dashboard.example.com",
        "DECNET_CANARY_HTTP_BASE": "http://localhost:8088",
    }
    for var, value in settings.items():
        monkeypatch.setenv(var, value)
    env_module = _reimport_env(monkeypatch)
    _strip_pytest_vars(monkeypatch)
    # Must return without raising.
    env_module.validate_public_binding()
def test_validator_skips_under_pytest(monkeypatch: pytest.MonkeyPatch) -> None:
    """With ``PYTEST_*`` vars left in place, a misconfigured env still passes.

    This is the deliberate bypass that keeps unrelated tests from tripping
    on the validator, so the PYTEST variables are intentionally NOT stripped.
    """
    settings = {
        "DECNET_API_HOST": "0.0.0.0",
        "DECNET_CORS_ORIGINS": "http://localhost:8080",
    }
    for var, value in settings.items():
        monkeypatch.setenv(var, value)
    env_module = _reimport_env(monkeypatch)
    # No raise — the pytest guard short-circuits before any check runs.
    env_module.validate_public_binding()