diff --git a/decnet/env.py b/decnet/env.py
index a6ce40db..e6029999 100644
--- a/decnet/env.py
+++ b/decnet/env.py
@@ -174,6 +174,103 @@ _cors_raw: str = os.environ.get("DECNET_CORS_ORIGINS", _cors_default)
 DECNET_CORS_ORIGINS: list[str] = [o.strip() for o in _cors_raw.split(",") if o.strip()]
 
 
+# Master→worker mTLS hostname verification. Off by default because legacy
+# enrollments were issued certs with operator-supplied SAN lists that may
+# not match the URL the master uses to connect; set to "true" on a fresh
+# production deploy where you control enrollment to get TLS hostname checks
+# on top of CA + fingerprint pinning.
+DECNET_VERIFY_HOSTNAME: bool = (
+    os.environ.get("DECNET_VERIFY_HOSTNAME", "false").lower() == "true"
+)
+
+
+_LOOPBACK_HOSTS = {"localhost", "127.0.0.1", "::1"}
+_WILDCARD_BIND_HOSTS = {"0.0.0.0", "::"}  # nosec B104 — comparison only
+
+
+def _origin_host(origin: str) -> str:
+    """Pull the bare hostname out of a CORS origin (``http(s)://host:port``).
+
+    Returns the full origin lowercased if the URL can't be parsed — the
+    caller treats unrecognised origins as non-loopback, which is the safer
+    default for a public-binding check.
+    """
+    from urllib.parse import urlparse
+
+    try:
+        parsed = urlparse(origin)
+        host = (parsed.hostname or "").lower()
+        return host or origin.strip().lower()
+    except (ValueError, AttributeError):
+        return origin.strip().lower()
+
+
+def validate_public_binding() -> None:
+    """Refuse to start the master API/web with a footgun config.
+
+    Three checks, all gated on the API binding being non-loopback (i.e.
+    actually exposed to the network):
+
+    * If CORS allow-list still contains a loopback origin, fail. The most
+      common shape of this bug is operator flips ``DECNET_API_HOST=0.0.0.0``
+      to "make it work" and forgets to update ``DECNET_CORS_ORIGINS`` —
+      the dashboard then either can't talk to the API at all, or worse,
+      they wildcard CORS to paper over it.
+
+    * If the canary HTTP base is plaintext (``http://``) and the canary
+      host isn't loopback, fail. Canary tokens phone home on trigger;
+      plaintext over the public internet leaks the token to anyone on
+      the path.
+
+    * If the rate limiter is globally disabled, log loudly. Don't fail —
+      operators sometimes want this for benchmarking — but never let it
+      slip past unmentioned on a public binding.
+
+    Called from the FastAPI lifespan so it surfaces at startup, not on
+    first request. Skipped automatically when running under pytest so
+    the test suite doesn't have to set five env vars per fixture.
+    """
+    # Key off the variable pytest itself exports while a test runs; matching
+    # any PYTEST* name would let a stray PYTEST_ADDOPTS disable these checks.
+    if "PYTEST_CURRENT_TEST" in os.environ:
+        return
+    if DECNET_API_HOST in _LOOPBACK_HOSTS:
+        return  # not exposed; nothing to validate
+
+    bind_label = "DECNET_API_HOST"
+    loopback_origins = [o for o in DECNET_CORS_ORIGINS if _origin_host(o) in _LOOPBACK_HOSTS]
+    if loopback_origins:
+        raise ValueError(
+            f"{bind_label}={DECNET_API_HOST!r} exposes the API to the network, "
+            f"but DECNET_CORS_ORIGINS still contains loopback origin(s) "
+            f"{loopback_origins!r}. Set DECNET_CORS_ORIGINS to the public "
+            f"dashboard URL(s) before starting (e.g. "
+            f"DECNET_CORS_ORIGINS=https://dashboard.example.com)."
+        )
+
+    canary_base = os.environ.get("DECNET_CANARY_HTTP_BASE", "").strip()
+    if canary_base and canary_base.lower().startswith("http://"):
+        host = _origin_host(canary_base)
+        if host and host not in _LOOPBACK_HOSTS:
+            raise ValueError(
+                f"DECNET_CANARY_HTTP_BASE={canary_base!r} is plaintext HTTP and "
+                f"points at a non-loopback host. Canary triggers carry secrets "
+                f"that must not cross the public internet in cleartext — use "
+                f"https:// or front the canary endpoint with a TLS proxy."
+            )
+
+    limiter_enabled = os.environ.get("DECNET_LIMITER_ENABLED", "true").lower() != "false"
+    if not limiter_enabled:
+        # Late import to avoid a circular dependency through decnet.logging.
+        from decnet.logging import get_logger
+        get_logger("env").critical(
+            "DECNET_LIMITER_ENABLED=false on a public binding (%s=%s). "
+            "Login + write endpoints have no rate limiting — only run this "
+            "way for benchmarking or behind an external rate-limiting proxy.",
+            bind_label, DECNET_API_HOST,
+        )
+
+
 def __getattr__(name: str) -> str:
     """Lazy resolution for secrets only the master web/api process needs."""
     if name == "DECNET_JWT_SECRET":
diff --git a/decnet/swarm/client.py b/decnet/swarm/client.py
index 0c9a4c4d..f31da15b 100644
--- a/decnet/swarm/client.py
+++ b/decnet/swarm/client.py
@@ -106,15 +106,20 @@ class AgentClient:
         address: Optional[str] = None,
         agent_port: Optional[int] = None,
         identity: Optional[MasterIdentity] = None,
-        verify_hostname: bool = False,
+        verify_hostname: Optional[bool] = None,
     ):
         """Either pass a SwarmHost dict, or explicit address/port.
 
-        ``verify_hostname`` stays False by default because the worker's
-        cert SAN is populated from the operator-supplied address list, not
-        from modern TLS hostname-verification semantics. The mTLS client
-        cert + CA pinning are what authenticate the peer.
+        ``verify_hostname`` defers to ``DECNET_VERIFY_HOSTNAME`` when the
+        caller doesn't pass an explicit value — production deploys flip
+        the env var on so the worker's cert SAN must match the address
+        the master connects to, on top of the existing CA + fingerprint
+        pin. Defaults to False so dev/test enrollments with mismatched
+        SANs keep working unchanged.
         """
+        if verify_hostname is None:
+            from decnet.env import DECNET_VERIFY_HOSTNAME
+            verify_hostname = DECNET_VERIFY_HOSTNAME
         if host is not None:
             self._address = host["address"]
             self._port = int(host.get("agent_port") or 8765)
diff --git a/decnet/web/api.py b/decnet/web/api.py
index b4689f6c..82a40e06 100644
--- a/decnet/web/api.py
+++ b/decnet/web/api.py
@@ -24,6 +24,7 @@ from decnet.env import (
     DECNET_INGEST_LOG_FILE,
     DECNET_PROFILE_DIR,
     DECNET_PROFILE_REQUESTS,
+    validate_public_binding,
 )
 from decnet.logging import get_logger
 from decnet.web.dependencies import repo
@@ -69,6 +70,11 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
             soft,
         )
 
+    # Refuse to come up with a footgun config on a public binding (loopback
+    # CORS origin while bound to 0.0.0.0, plaintext canary base, etc.).
+    # Raises ValueError with an actionable message; uvicorn surfaces it.
+    validate_public_binding()
+
     log.info("API startup initialising database")
     for attempt in range(1, 6):
         try:
diff --git a/tests/web/test_validate_public_binding.py b/tests/web/test_validate_public_binding.py
new file mode 100644
index 00000000..59f0db48
--- /dev/null
+++ b/tests/web/test_validate_public_binding.py
@@ -0,0 +1,91 @@
+"""validate_public_binding refuses footgun configs at master startup.
+
+The validator no-ops under pytest by design (so unit tests in unrelated
+modules don't have to set five env vars per fixture); these tests strip
+the PYTEST_* vars before calling it so the real code path runs.
+"""
+from __future__ import annotations
+
+import importlib
+import sys
+
+import pytest
+
+
+def _reimport_env(monkeypatch: pytest.MonkeyPatch):
+    # delitem (not a bare pop) so monkeypatch restores the cached module at
+    # teardown and the re-imported copy doesn't leak into unrelated tests.
+    for mod in list(sys.modules):
+        if mod == "decnet.env" or mod.startswith("decnet.env."):
+            monkeypatch.delitem(sys.modules, mod)
+    return importlib.import_module("decnet.env")
+
+
+def _strip_pytest_vars(monkeypatch: pytest.MonkeyPatch) -> None:
+    import os
+    for k in list(os.environ):
+        if k.startswith("PYTEST"):
+            monkeypatch.delenv(k, raising=False)
+
+
+def test_validator_noop_on_loopback_binding(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("DECNET_API_HOST", "127.0.0.1")
+    monkeypatch.setenv("DECNET_CORS_ORIGINS", "http://localhost:8080")
+    env = _reimport_env(monkeypatch)
+    _strip_pytest_vars(monkeypatch)
+    env.validate_public_binding()  # no raise
+
+
+def test_validator_rejects_loopback_cors_on_public_bind(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("DECNET_API_HOST", "0.0.0.0")
+    monkeypatch.setenv("DECNET_CORS_ORIGINS", "http://localhost:8080")
+    env = _reimport_env(monkeypatch)
+    _strip_pytest_vars(monkeypatch)
+    with pytest.raises(ValueError, match="loopback origin"):
+        env.validate_public_binding()
+
+
+def test_validator_accepts_public_cors_on_public_bind(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("DECNET_API_HOST", "0.0.0.0")
+    monkeypatch.setenv("DECNET_CORS_ORIGINS", "https://dashboard.example.com")
+    env = _reimport_env(monkeypatch)
+    _strip_pytest_vars(monkeypatch)
+    env.validate_public_binding()  # no raise
+
+
+def test_validator_rejects_plaintext_canary_on_public_bind(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("DECNET_API_HOST", "0.0.0.0")
+    monkeypatch.setenv("DECNET_CORS_ORIGINS", "https://dashboard.example.com")
+    monkeypatch.setenv("DECNET_CANARY_HTTP_BASE", "http://canary.example.com:8088")
+    env = _reimport_env(monkeypatch)
+    _strip_pytest_vars(monkeypatch)
+    with pytest.raises(ValueError, match="plaintext HTTP"):
+        env.validate_public_binding()
+
+
+def test_validator_allows_loopback_canary_even_on_public_bind(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    # Local canary endpoint behind the master is fine; only public-facing
+    # plaintext is the footgun.
+    monkeypatch.setenv("DECNET_API_HOST", "0.0.0.0")
+    monkeypatch.setenv("DECNET_CORS_ORIGINS", "https://dashboard.example.com")
+    monkeypatch.setenv("DECNET_CANARY_HTTP_BASE", "http://localhost:8088")
+    env = _reimport_env(monkeypatch)
+    _strip_pytest_vars(monkeypatch)
+    env.validate_public_binding()  # no raise
+
+
+def test_validator_skips_under_pytest(monkeypatch: pytest.MonkeyPatch) -> None:
+    # With PYTEST_* still in env (default), even a misconfigured env passes —
+    # this is the deliberate bypass so unrelated tests don't trip on it.
+    monkeypatch.setenv("DECNET_API_HOST", "0.0.0.0")
+    monkeypatch.setenv("DECNET_CORS_ORIGINS", "http://localhost:8080")
+    env = _reimport_env(monkeypatch)
+    env.validate_public_binding()  # no raise — guard short-circuits