fix(security): close LOW ASVS findings — env bypass, SSE/deployment authz, CN fail-close, password byte-limit, exception leaks, BUG-12..16

Auth/session (V2.1.7, V4.1.5, V4.1.6, V2.1.4/V2.1.5):
- env secret validation no longer bypassed by attacker-injectable PYTEST* env; gated on explicit DECNET_TESTING=1 (set only in conftest).
- must_change_password now enforced on the SSE header-JWT path, not just ticket mint.
- GET /system/deployment-mode requires viewer auth (was leaking role + topology size).
- CreateUser/ResetUser passwords min_length=12; passwords >72 bytes rejected explicitly instead of bcrypt silently truncating.

Swarm ingestion (V9.1.3, BUG-16):
- Log listener hard-rejects peers with unparseable/empty cert CN (fail closed, ingests nothing) instead of tagging 'unknown'.
- Shutdown handlers no longer silently swallow real errors (silent suppression narrowed to CancelledError; any other exception is now logged).

Info leakage (V7.1.2, V14.1.2):
- Exception text sanitized on swarm-update, health, tarpit, realism, file-drop, blank-topology endpoints (raw tc/docker stderr, DB/Docker errors logged server-side, generic detail returned).

pyproject license corrected to AGPL-3.0.

Correctness (BUG-12..16):
- BUG-12 atomic credential upsert (UNIQUE constraint + IntegrityError retry, consistent principal_key canonicalization).
- BUG-13 rule-tail watermark uses >= with seen-id dedup (no same-second drop).
- BUG-14 worker wake cleared before wait (no lost wake during tick).
- BUG-15 intel gather tolerates an unexpected provider raise.
- BUG-16 see above.

Already-closed (verified, no change): V2.1.6, V5.1.3, V9.1.2.
Accept-risk + documented: V2.1.8 cache window, V3.1.3 idle timeout.

Tests added for every fix; unanimous adversarial review after two refute-fix rounds.
2026-06-10 13:27:14 -04:00
parent d80e6aa6d1
commit 245975a6dd
40 changed files with 1629 additions and 72 deletions
--- a/decnet/clustering/worker.py
+++ b/decnet/clustering/worker.py
@@ -102,13 +102,13 @@ async def run_clusterer_loop(

            await _publish_result(bus, result)

+            wake.clear()
            try:
                await asyncio.wait_for(
                    wake.wait(), timeout=float(poll_interval_secs),
                )
            except asyncio.TimeoutError:
                pass
-            wake.clear()
    except (asyncio.CancelledError, KeyboardInterrupt):
        log.info("clusterer stopped")
    finally:
--- a/decnet/correlation/reuse_worker.py
+++ b/decnet/correlation/reuse_worker.py
@@ -107,13 +107,13 @@ async def run_reuse_loop(
                    event_type=_topics.CREDENTIAL_REUSE_DETECTED,
                )

+            wake.clear()
            try:
                await asyncio.wait_for(
                    wake.wait(), timeout=float(poll_interval_secs),
                )
            except asyncio.TimeoutError:
                pass
-            wake.clear()
    except (asyncio.CancelledError, KeyboardInterrupt):
        log.info("reuse correlator stopped")
    finally:
--- a/decnet/env.py
+++ b/decnet/env.py
@@ -49,7 +49,12 @@ def _require_env(name: str) -> str:
            f"Set it in .env.local or export it before starting DECNET."
        )

-    if any(k.startswith("PYTEST") for k in os.environ):
+    # Strength validation is bypassed ONLY under the explicit, non-attacker-
+    # injectable DECNET_TESTING=1 flag (set by the test harness). The old
+    # "any PYTEST* var present" check was a fail-open bug: PYTEST* is an
+    # attacker-controllable namespace, so leaking one into a prod environment
+    # silently disabled the known-bad/length guards. Fail closed (V2.1.7).
+    if os.environ.get("DECNET_TESTING") == "1":
        return value

    if value.lower() in _KNOWN_BAD:
--- a/decnet/intel/worker.py
+++ b/decnet/intel/worker.py
@@ -108,10 +108,19 @@ async def _enrich_one(
        async with p._semaphore:
            return await p.lookup(ip)

-    results: list[IntelResult] = await asyncio.gather(
+    raw = await asyncio.gather(
        *(_guarded_lookup(p, ip) for p in providers),
-        return_exceptions=False,  # providers contractually never raise
+        return_exceptions=True,
    )
+    results: list[IntelResult] = []
+    for r in raw:
+        if isinstance(r, BaseException):
+            log.warning(
+                "intel: provider raised unexpectedly for ip=%s: %s",
+                ip, r,
+            )
+        else:
+            results.append(r)

    now = datetime.now(timezone.utc)
    row: dict[str, Any] = {
@@ -220,13 +229,13 @@ async def run_intel_loop(
                            attacker_uuid, ip,
                        )

+            wake.clear()
            try:
                await asyncio.wait_for(
                    wake.wait(), timeout=float(poll_interval_secs),
                )
            except asyncio.TimeoutError:
                pass
-            wake.clear()
    except (asyncio.CancelledError, KeyboardInterrupt):
        log.info("intel worker stopped")
    finally:
--- a/decnet/network.py
+++ b/decnet/network.py
@@ -15,6 +15,10 @@ from ipaddress import IPv4Address, IPv4Interface, IPv4Network

 import docker

+from decnet.logging import get_logger
+
+log = get_logger("network")
+
 MACVLAN_NETWORK_NAME = "decnet_lan"
 HOST_MACVLAN_IFACE = "decnet_macvlan0"
 HOST_IPVLAN_IFACE = "decnet_ipvlan0"
@@ -491,9 +495,12 @@ def get_container_veth(container_name: str) -> str:
        check=False,
    )
    if result.returncode != 0:
-        raise LookupError(
-            f"container {container_name!r} not reachable: {result.stderr.strip()}"
+        log.warning(
+            "get_container_veth: docker exec failed for container %r: %s",
+            container_name,
+            result.stderr.strip(),
        )
+        raise LookupError(f"container {container_name!r} not reachable")
    peer_index = result.stdout.strip()
    links = _run(["ip", "link", "show"])
    for line in links.stdout.splitlines():
--- a/decnet/swarm/log_forwarder.py
+++ b/decnet/swarm/log_forwarder.py
@@ -240,8 +240,14 @@ async def run_forwarder(
                backoff = min(_MAX_BACKOFF, backoff * 2)
    finally:
        heartbeat_task.cancel()
-        with contextlib.suppress(asyncio.CancelledError, Exception):
+        try:
            await heartbeat_task
+        except asyncio.CancelledError:
+            pass
+        except Exception:
+            # BUG-16 — don't silently swallow a real heartbeat-task error on
+            # shutdown; log it so a failing heartbeat coroutine is visible.
+            log.exception("forwarder heartbeat task errored during shutdown")
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()
--- a/decnet/swarm/log_listener.py
+++ b/decnet/swarm/log_listener.py
@@ -113,6 +113,24 @@ async def _handle_connection(
    ssl_obj = writer.get_extra_info("ssl_object")
    cn = peer_cn(ssl_obj)
    peer = writer.get_extra_info("peername")
+
+    # V9.1.3 — FAIL CLOSED on unattributable provenance. The CA gates
+    # enrollment at the TLS handshake, but a CA-signed cert with a
+    # malformed/empty/missing CN subject would otherwise slip through under
+    # the opaque 'unknown' label. We refuse to ingest anything we cannot
+    # attribute: close the connection immediately and ingest NOTHING.
+    if cn == "unknown":
+        log.warning(
+            "listener rejecting unattributable peer (CN=unknown) peer=%s — closing, ingesting nothing",
+            peer,
+        )
+        writer.close()
+        try:
+            await writer.wait_closed()
+        except Exception:  # nosec B110 — socket cleanup is best-effort
+            pass
+        return
+
    log.info("listener accepted worker=%s peer=%s", cn, peer)

    # Lazy import to avoid a circular dep if the collector pulls in logger setup.
@@ -191,5 +209,9 @@ async def run_listener(
            serve_task.cancel()
            try:
                await serve_task
-            except (asyncio.CancelledError, Exception):  # nosec B110
+            except asyncio.CancelledError:
                pass
+            except Exception:
+                # BUG-16 — do NOT swallow real shutdown errors (OSError etc).
+                # Surface them so a wedged/erroring serve task is visible.
+                log.exception("listener serve task errored during shutdown")
--- a/decnet/ttp/store/impl/database.py
+++ b/decnet/ttp/store/impl/database.py
@@ -231,6 +231,9 @@ class DatabaseRuleStore(RuleStore):
        self._subscribers: list[asyncio.Queue[RuleChange]] = []
        self._tail_task: asyncio.Task[None] | None = None
        self._tail_watermark: datetime | None = None
+        # rule_ids already emitted at the current watermark timestamp; reset
+        # whenever the watermark advances (BUG-13 dedup across same-ts rows).
+        self._tail_seen_ids: set[str] = set()
        self._sync_task: asyncio.Task[None] | None = None
        self._stop = asyncio.Event()
        self._lazy_lock = asyncio.Lock()
@@ -504,6 +507,10 @@ class DatabaseRuleStore(RuleStore):
        receive per-rule definition changes without a shared bus
        round-trip. The watermark advances on every observed row;
        first poll initializes it to "now" so we don't replay history.
+
+        Single-poller only: the instance state ``_tail_watermark`` /
+        ``_tail_seen_ids`` is NOT safe for concurrent pollers on the same
+        store instance.
        """
        repo = await self._ensure_repo()
        if self._tail_watermark is None:
@@ -511,14 +518,31 @@ class DatabaseRuleStore(RuleStore):
        while not self._stop.is_set():
            try:
                async with repo._session() as session:  # type: ignore[attr-defined]
+                    # Use >= so rules whose updated_at equals the watermark are
+                    # not silently skipped on the next poll (BUG-13 fix).
+                    # Rows at exactly the watermark timestamp are deduplicated
+                    # by rule_id so we don't re-emit rules we already fired.
                    rows = (
                        await session.execute(
                            sa_select(TTPRule).where(
-                                col(TTPRule.updated_at) > self._tail_watermark,
+                                col(TTPRule.updated_at) >= self._tail_watermark,
                            ),
                        )
                    ).scalars().all()
+                max_ts: datetime | None = None
+                emitted_at_ts: dict[str, datetime] = {}
                for rule_row in rows:
+                    # Normalize to UTC-aware before comparison so naive
+                    # datetimes stored by the DB don't cause TypeError.
+                    row_ts = rule_row.updated_at
+                    if row_ts.tzinfo is None:
+                        row_ts = row_ts.replace(tzinfo=timezone.utc)
+                    # Skip rules we already emitted at exactly this watermark.
+                    if (
+                        row_ts == self._tail_watermark
+                        and rule_row.rule_id in self._tail_seen_ids
+                    ):
+                        continue
                    state = await self.get_state(rule_row.rule_id)
                    compiled = _yaml_to_compiled(rule_row.yaml_content, state)
                    await self._emit_change(
@@ -529,11 +553,31 @@ class DatabaseRuleStore(RuleStore):
                            "rule_version": compiled.rule_version,
                        },
                    )
-                    if (
-                        self._tail_watermark is None
-                        or rule_row.updated_at > self._tail_watermark
-                    ):
-                        self._tail_watermark = rule_row.updated_at
+                    emitted_at_ts[rule_row.rule_id] = row_ts
+                    if max_ts is None or row_ts > max_ts:
+                        max_ts = row_ts
+                if max_ts is not None:
+                    # Keep the watermark AT max_ts — do NOT add 1 µs. On coarse
+                    # second-resolution timestamps (MySQL DATETIME) a 1 µs bump
+                    # would land inside the same whole-second bucket and the
+                    # next `>= watermark` query would silently drop a rule
+                    # saved later in that same second (reintroducing BUG-13).
+                    # We rely SOLELY on _tail_seen_ids to dedup already-emitted
+                    # rule_ids at the watermark timestamp.
+                    if max_ts > self._tail_watermark:
+                        # A strictly-newer timestamp appeared: advance the
+                        # watermark and reset seen-ids to only the rule_ids
+                        # AT the new max_ts (rows below it cannot reappear in a
+                        # future `>= max_ts` query, so they need no dedup).
+                        self._tail_watermark = max_ts
+                        self._tail_seen_ids = {
+                            rid for rid, ts in emitted_at_ts.items()
+                            if ts == max_ts
+                        }
+                    else:
+                        # All emitted rows share the current watermark
+                        # timestamp; record them so the next poll skips them.
+                        self._tail_seen_ids.update(emitted_at_ts)
            except Exception:  # noqa: BLE001
                _log.exception("ttp.store.db: tail poll failed")
            try:
--- a/decnet/web/auth.py
+++ b/decnet/web/auth.py
@@ -9,6 +9,10 @@ from decnet.env import DECNET_JWT_SECRET, DECNET_JWT_EXP_MINUTES

 SECRET_KEY: str = DECNET_JWT_SECRET
 ALGORITHM: str = "HS256"
+# Live constant — sourced from env DECNET_JWT_EXP_MINUTES (default 240 / 4 h).
+# Idle/inactivity timeout is intentionally not implemented: jti denylist covers
+# explicit logout and the 4 h absolute TTL bounds the worst-case exposure window.
+# Accept-risk: LOW / pre-v1 — revisit at v1 when user-facing session UX lands.
 ACCESS_TOKEN_EXPIRE_MINUTES: int = DECNET_JWT_EXP_MINUTES

 # Pinned issuer/audience/type so a token signed with DECNET_JWT_SECRET for any
@@ -21,6 +25,9 @@ JWT_TYPE: str = "access"


 def verify_password(plain_password: str, hashed_password: str) -> bool:
+    # [:72] is a defensive safety-net capping input at bcrypt's 72-byte limit.
+    # Validated callers already reject >72-byte passwords via field_validator,
+    # so this slice is a no-op for well-formed input.
    return bcrypt.checkpw(
        plain_password.encode("utf-8")[:72],
        hashed_password.encode("utf-8")
@@ -28,7 +35,9 @@ def verify_password(plain_password: str, hashed_password: str) -> bool:


 def get_password_hash(password: str) -> str:
-    # Use a cost factor of 12 (default for passlib/bcrypt)
+    # Use a cost factor of 12 (default for passlib/bcrypt).
+    # [:72] is a defensive safety-net; field_validator rejects >72-byte input
+    # before it reaches this function.
    _salt: bytes = bcrypt.gensalt(rounds=12)
    _hashed: bytes = bcrypt.hashpw(password.encode("utf-8")[:72], _salt)
    return _hashed.decode("utf-8")
--- a/decnet/web/db/models/auth.py
+++ b/decnet/web/db/models/auth.py
@@ -3,10 +3,18 @@
 from datetime import datetime, timezone
 from typing import List, Literal, Optional

-from pydantic import BaseModel, Field as PydanticField
+from pydantic import BaseModel, Field as PydanticField, field_validator
 from sqlmodel import Field, SQLModel


+def _reject_over_72_bytes(v: str) -> str:
+    """bcrypt silently truncates at 72 bytes; reject instead to avoid
+    collision/confusion between passwords that share a 72-byte prefix."""
+    if len(v.encode("utf-8")) > 72:
+        raise ValueError("password must not exceed 72 UTF-8 bytes (bcrypt limit)")
+    return v
+
+
 class User(SQLModel, table=True):
    __tablename__ = "users"
    uuid: str = Field(primary_key=True)
@@ -55,6 +63,11 @@ class ChangePasswordRequest(BaseModel):
    # floor a seeded admin could clear must_change_password with a 1-char secret.
    new_password: str = PydanticField(..., min_length=12, max_length=72)

+    @field_validator("old_password", "new_password", mode="after")
+    @classmethod
+    def _check_byte_limit(cls, v: str) -> str:
+        return _reject_over_72_bytes(v)
+

 class SSETicketResponse(BaseModel):
    """Single-use, short-lived opaque ticket the dashboard exchanges its header
@@ -68,16 +81,26 @@ class SSETicketResponse(BaseModel):

 class CreateUserRequest(BaseModel):
    username: str = PydanticField(..., min_length=1, max_length=64)
-    password: str = PydanticField(..., min_length=8, max_length=72)
+    password: str = PydanticField(..., min_length=12, max_length=72)
    role: Literal["admin", "viewer"] = "viewer"

+    @field_validator("password", mode="after")
+    @classmethod
+    def _check_byte_limit(cls, v: str) -> str:
+        return _reject_over_72_bytes(v)
+

 class UpdateUserRoleRequest(BaseModel):
    role: Literal["admin", "viewer"]


 class ResetUserPasswordRequest(BaseModel):
-    new_password: str = PydanticField(..., min_length=8, max_length=72)
+    new_password: str = PydanticField(..., min_length=12, max_length=72)
+
+    @field_validator("new_password", mode="after")
+    @classmethod
+    def _check_byte_limit(cls, v: str) -> str:
+        return _reject_over_72_bytes(v)


 class DeploymentLimitRequest(BaseModel):
--- a/decnet/web/db/models/logs.py
+++ b/decnet/web/db/models/logs.py
@@ -67,6 +67,16 @@ class Credential(SQLModel, table=True):
    __table_args__ = (
        Index("ix_credentials_secret_service", "secret_sha256", "service"),
        Index("ix_credentials_principal_service", "principal", "service"),
+        # Dedup constraint: same (attacker_ip, decky, service, secret_kind,
+        # secret_sha256, principal_key) → one row.  ``principal_key`` is
+        # the non-null canonical form of ``principal`` (empty string when
+        # principal is NULL) so the constraint is UNIQUE-safe under SQLite's
+        # NULL-distinct behaviour and MySQL's standard UNIQUE semantics.
+        UniqueConstraint(
+            "attacker_ip", "decky_name", "service",
+            "secret_kind", "secret_sha256", "principal_key",
+            name="uq_credentials_dedup",
+        ),
    )
    id: Optional[int] = Field(default=None, primary_key=True)
    # Keyed by attacker IP (not attackers.uuid) on the write path to
@@ -81,6 +91,11 @@ class Credential(SQLModel, table=True):
    decky_name: str = Field(index=True)
    service: str = Field(index=True)
    principal: Optional[str] = Field(default=None, index=True, max_length=256)
+    # Non-null canonical form of ``principal`` used in ``uq_credentials_dedup``.
+    # Empty string when ``principal`` is NULL so the UNIQUE constraint behaves
+    # correctly under SQLite's NULL-distinct semantics (same pattern as
+    # ``CredentialReuse.principal_key``).
+    principal_key: str = Field(default="", max_length=256)
    # Discriminator for what `secret_b64` actually contains. Default
    # ``"plaintext"`` — a recoverable password the attacker sent on the
    # wire (SSH/Telnet/FTP/IMAP/POP3/SMTP/Redis/LDAP/MQTT). Other kinds:
--- a/decnet/web/db/sqlmodel_repo/credentials/_core.py
+++ b/decnet/web/db/sqlmodel_repo/credentials/_core.py
@@ -7,6 +7,7 @@ from datetime import datetime, timezone
 from typing import Any, List, Optional

 from sqlalchemy import desc, func, or_, select, update
+from sqlalchemy.exc import IntegrityError
 from sqlmodel import col
 from sqlmodel.sql.expression import SelectOfScalar

@@ -31,18 +32,32 @@ class CredentialsCoreMixin(_MixinBase):
            payload["fields"] = json.dumps(payload["fields"], ensure_ascii=True)

        principal = payload.get("principal")
+        # Non-null canonical form used by the uq_credentials_dedup constraint
+        # AND by the dedup SELECT — both MUST key on the SAME value or the
+        # SELECT can miss a row that the constraint then collides on
+        # (e.g. principal=None and principal="" both canonicalize to ""):
+        # the SELECT would treat them as distinct, INSERT, hit IntegrityError,
+        # then re-SELECT with the wrong filter and re-raise. ``principal or ""``
+        # collapses None and "" identically — mirrors CredentialReuse and the
+        # constraint key. (BUG-12: canonicalization must not diverge.)
+        principal_key = principal or ""
        secret_kind = payload.get("secret_kind") or "plaintext"
-        async with self._session() as session:
-            stmt = select(Credential).where(
+
+        def _build_dedup_filter():
+            return (
                Credential.attacker_ip == payload["attacker_ip"],
                Credential.decky_name == payload["decky_name"],
                Credential.service == payload["service"],
                Credential.secret_kind == secret_kind,
                Credential.secret_sha256 == payload["secret_sha256"],
-                # NULL == NULL is False under SQL — branch the predicate.
-                (Credential.principal == principal) if principal is not None
-                else col(Credential.principal).is_(None),
+                # Key the SELECT on principal_key — the SAME canonical value
+                # the UNIQUE constraint uses — so SELECT and constraint never
+                # disagree about which rows collide.
+                Credential.principal_key == principal_key,
            )
+
+        async with self._session() as session:
+            stmt = select(Credential).where(*_build_dedup_filter())
            existing = (await session.execute(stmt)).scalar_one_or_none()
            now = datetime.now(timezone.utc)
            if existing is not None:
@@ -58,6 +73,7 @@ class CredentialsCoreMixin(_MixinBase):
                decky_name=payload["decky_name"],
                service=payload["service"],
                principal=principal,
+                principal_key=principal_key,
                secret_kind=secret_kind,
                secret_sha256=payload["secret_sha256"],
                secret_b64=payload.get("secret_b64"),
@@ -69,7 +85,25 @@ class CredentialsCoreMixin(_MixinBase):
                attempt_count=1,
            )
            session.add(row)
-            await session.commit()
+            try:
+                await session.commit()
+            except IntegrityError:
+                # Concurrent upsert for the same dedup key beat us — re-SELECT
+                # the winner row and increment its counter in a fresh session.
+                await session.rollback()
+                async with self._session() as session2:
+                    stmt2 = select(Credential).where(*_build_dedup_filter())
+                    existing2 = (await session2.execute(stmt2)).scalar_one_or_none()
+                    if existing2 is None:
+                        # Extremely unlikely (e.g. deleted between races); bail.
+                        raise
+                    existing2.attempt_count = (existing2.attempt_count or 1) + 1
+                    existing2.last_seen = now
+                    if payload.get("outcome") is not None:
+                        existing2.outcome = payload["outcome"]
+                    session2.add(existing2)
+                    await session2.commit()
+                    return existing2.id
            await session.refresh(row)
            return row.id  # type: ignore[return-value]

--- a/decnet/web/dependencies.py
+++ b/decnet/web/dependencies.py
@@ -37,6 +37,17 @@ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/auth/login")
 # Per-request user lookup was the hidden tax behind every authed endpoint —
 # SELECT users WHERE uuid=? ran once per call, serializing through aiosqlite.
 # 10s TTL is well below JWT expiry and we invalidate on all user writes.
+#
+# REVOCATION-WINDOW NOTE (V2.1.8, accept-risk): these per-process caches bound
+# how long a *stale* user/username/denylist row can be served. Single-process:
+# a role downgrade or password change is reflected within ~_USER_TTL (10s) even
+# if the in-process invalidate_user_cache hook is missed; the local write path
+# invalidates immediately. Multi-worker (gunicorn/uvicorn --workers>1) gives
+# each worker its own cache, so the worst-case cross-worker staleness is also
+# ~10s and would need a shared cache (Redis) to collapse to zero. This staleness
+# is NOT the authoritative revocation control: JWT bulk-revoke via the user's
+# tokens_valid_from cutoff (enforced in _resolve_token) is the hard cutoff and
+# is unaffected by these caches.
 _USER_TTL = 10.0
 _user_cache: dict[str, tuple[Optional[dict[str, Any]], float]] = {}
 _user_cache_lock: Optional[asyncio.Lock] = None
@@ -389,6 +400,11 @@ def require_stream_role(*allowed_roles: str):
        header_token = _bearer_from_header(request)
        if header_token:
            _user_uuid, user = await _resolve_token(header_token)
+            if user.get("must_change_password"):
+                raise HTTPException(
+                    status_code=status.HTTP_403_FORBIDDEN,
+                    detail="Password change required before accessing this resource",
+                )
            if user["role"] not in allowed_roles:
                raise HTTPException(
                    status_code=status.HTTP_403_FORBIDDEN,
--- a/decnet/web/router/deckies/api_file_drop.py
+++ b/decnet/web/router/deckies/api_file_drop.py
@@ -75,7 +75,7 @@ async def api_drop_file(
        content = base64.b64decode(req.content_b64, validate=True)
    except (ValueError, TypeError) as exc:
        raise HTTPException(
-            status_code=400, detail=f"content_b64 is not valid base64: {exc}",
+            status_code=400, detail="content_b64 is not valid base64",
        ) from exc

    container = await _resolve_container_or_4xx(req.decky_name, req.topology_id)
--- a/decnet/web/router/deckies/api_tarpit.py
+++ b/decnet/web/router/deckies/api_tarpit.py
@@ -50,7 +50,8 @@ def _apply_tarpit(veth: str, ports: list[int], delay_ms: int) -> None:
    for args in steps:
        r = _tc(*args)
        if r.returncode != 0:
-            raise RuntimeError(r.stderr.strip())
+            log.warning("tarpit tc apply failed veth=%s cmd=%s stderr=%r", veth, args[0], r.stderr.strip())
+            raise RuntimeError("tarpit command failed")

    for port in ports:
        r = _tc(
@@ -60,7 +61,8 @@ def _apply_tarpit(veth: str, ports: list[int], delay_ms: int) -> None:
            "flowid", "1:1",
        )
        if r.returncode != 0:
-            raise RuntimeError(r.stderr.strip())
+            log.warning("tarpit tc filter failed veth=%s port=%d stderr=%r", veth, port, r.stderr.strip())
+            raise RuntimeError("tarpit command failed")


 def _remove_tarpit(veth: str) -> bool:
@@ -69,7 +71,8 @@ def _remove_tarpit(veth: str) -> bool:
    if r.returncode != 0:
        if "Cannot find" in r.stderr or "No such" in r.stderr:
            return False
-        raise RuntimeError(r.stderr.strip())
+        log.warning("tarpit tc remove failed veth=%s stderr=%r", veth, r.stderr.strip())
+        raise RuntimeError("tarpit command failed")
    return True


@@ -126,7 +129,8 @@ async def api_enable_tarpit(
    try:
        await asyncio.to_thread(_apply_tarpit, veth, req.ports, req.delay_ms)
    except RuntimeError as exc:
-        raise HTTPException(status_code=409, detail=str(exc)) from exc
+        log.warning("tarpit enable failed decky=%s: %s", decky_name, exc, exc_info=True)
+        raise HTTPException(status_code=409, detail="tarpit command failed") from exc

    ports_json = json.dumps(req.ports)
    await repo.set_tarpit_rule({
@@ -212,7 +216,8 @@ async def api_disable_tarpit(
    try:
        await asyncio.to_thread(_remove_tarpit, veth)
    except RuntimeError as exc:
-        raise HTTPException(status_code=409, detail=str(exc)) from exc
+        log.warning("tarpit disable failed decky=%s: %s", decky_name, exc, exc_info=True)
+        raise HTTPException(status_code=409, detail="tarpit command failed") from exc

    await repo.delete_tarpit_rule(decky_name)
    await repo.add_log({
--- a/decnet/web/router/health/api_get_health.py
+++ b/decnet/web/router/health/api_get_health.py
@@ -62,8 +62,10 @@ async def _check_database_cached() -> ComponentHealth:
        try:
            await repo.get_total_logs()
            _db_component = ComponentHealth(status="ok")
-        except Exception as exc:
-            _db_component = ComponentHealth(status="failing", detail=str(exc))
+        except Exception:
+            import logging as _logging
+            _logging.getLogger("api.get_health").exception("database liveness check failed")
+            _db_component = ComponentHealth(status="failing", detail="database unavailable")
        _db_last_check = time.monotonic()
        return _db_component

@@ -95,7 +97,7 @@ async def get_health(user: dict = Depends(require_viewer)) -> Any:
                detail = "cancelled"
            else:
                exc = task.exception()
-                detail = f"exited: {exc}" if exc else "exited unexpectedly"
+                detail = "exited unexpectedly" if not exc else "exited with error"
            components[name] = ComponentHealth(status="failing", detail=detail)
        else:
            components[name] = ComponentHealth(status="ok")
@@ -112,10 +114,12 @@ async def get_health(user: dict = Depends(require_viewer)) -> Any:
            await asyncio.to_thread(_docker_client.ping)  # type: ignore[union-attr]
            _docker_healthy = True
            _docker_detail = ""
-        except Exception as exc:
+        except Exception:
+            import logging as _logging
+            _logging.getLogger("api.get_health").exception("docker daemon ping failed")
            _docker_client = None
            _docker_healthy = False
-            _docker_detail = str(exc)
+            _docker_detail = "docker daemon unavailable"
        _docker_last_check = now

    if _docker_healthy:
--- a/decnet/web/router/realism/api_llm.py
+++ b/decnet/web/router/realism/api_llm.py
@@ -145,23 +145,26 @@ async def put_llm_config(
        try:
            from decnet.web.db.secrets import encrypt_secret
            merged["api_key_ciphertext"] = encrypt_secret(str(api_key_raw))
-        except RuntimeError as exc:
+        except RuntimeError:
+            log.exception("api.realism.put_llm: secret encryption unavailable")
            raise HTTPException(
                status_code=500,
-                detail=f"Secret encryption unavailable: {exc}",
-            ) from exc
+                detail="Secret encryption unavailable; check server configuration.",
+            ) from None

    try:
        cfg = LLMConfig(**merged)
    except Exception as exc:
-        raise HTTPException(status_code=400, detail=str(exc)) from exc
+        log.warning("api.realism.put_llm: LLMConfig validation failed: %s", exc)
+        raise HTTPException(status_code=400, detail="Invalid LLM configuration payload.") from exc

    try:
        llm_config.apply(cfg)
-    except Exception as exc:
+    except Exception:
+        log.exception("api.realism.put_llm: backend init failed")
        raise HTTPException(
-            status_code=400, detail=f"Backend init failed: {exc}"
-        ) from exc
+            status_code=400, detail="Backend init failed; check provider/model settings."
+        ) from None

    await repo.set_realism_config(_CONFIG_KEY, json.dumps(merged))
    _hydrated = True
--- a/decnet/web/router/swarm_updates/api_list_host_releases.py
+++ b/decnet/web/router/swarm_updates/api_list_host_releases.py
@@ -39,13 +39,14 @@ async def _probe_host(host: dict[str, Any]) -> HostReleaseInfo:
    try:
        async with UpdaterClient(host=host) as u:
            body = await u.health()
-    except Exception as exc:  # noqa: BLE001
+    except Exception:  # noqa: BLE001
+        log.warning("swarm_updates.list probe unreachable host=%s", host.get("name"), exc_info=True)
        return HostReleaseInfo(
            host_uuid=host["uuid"],
            host_name=host["name"],
            address=host["address"],
            reachable=False,
-            detail=f"{type(exc).__name__}: {exc}",
+            detail="host unreachable",
        )
    releases = body.get("releases") or []
    current, previous = _extract_shas(releases)
--- a/decnet/web/router/swarm_updates/api_push_update.py
+++ b/decnet/web/router/swarm_updates/api_push_update.py
@@ -96,10 +96,14 @@ async def _push_one(
                    # Connection drop on update-self is expected and not an error.
                    self_ok = _is_expected_connection_drop(exc)
                    if not self_ok:
+                        log.warning(
+                            "swarm_updates.push self-update transport failure host=%s: %s",
+                            host.get("name"), exc,
+                        )
                        return PushUpdateResult(
                            host_uuid=host["uuid"], host_name=host["name"],
                            status="self-failed", http_status=r.status_code, sha=sha,
-                            detail=f"agent updated OK but self-update failed: {exc}",
+                            detail="agent updated OK but self-update transport failure",
                            stderr=stderr,
                        )
                status = "self-updated" if self_ok else "self-failed"
@@ -110,12 +114,12 @@ async def _push_one(
                detail=body.get("error") or body.get("probe") if isinstance(body, dict) else None,
                stderr=stderr,
            )
-    except Exception as exc:  # noqa: BLE001
+    except Exception:  # noqa: BLE001
        log.exception("swarm_updates.push failed host=%s", host.get("name"))
        return PushUpdateResult(
            host_uuid=host["uuid"], host_name=host["name"],
            status="failed",
-            detail=f"{type(exc).__name__}: {exc}",
+            detail="transport failure",
        )


--- a/decnet/web/router/swarm_updates/api_push_update_self.py
+++ b/decnet/web/router/swarm_updates/api_push_update_self.py
@@ -56,12 +56,12 @@ async def _push_self_one(host: dict[str, Any], tarball: bytes, sha: str) -> Push
            http_status=http_status, sha=sha,
            detail=detail, stderr=stderr,
        )
-    except Exception as exc:  # noqa: BLE001
+    except Exception:  # noqa: BLE001
        log.exception("swarm_updates.push_self failed host=%s", host.get("name"))
        return PushUpdateResult(
            host_uuid=host["uuid"], host_name=host["name"],
            status="self-failed",
-            detail=f"{type(exc).__name__}: {exc}",
+            detail="transport failure",
        )


--- a/decnet/web/router/swarm_updates/api_rollback_host.py
+++ b/decnet/web/router/swarm_updates/api_rollback_host.py
@@ -49,12 +49,12 @@ async def api_rollback_host(
    try:
        async with UpdaterClient(host=host) as u:
            r = await u.rollback()
-    except Exception as exc:  # noqa: BLE001
+    except Exception:  # noqa: BLE001
        log.exception("swarm_updates.rollback transport failure host=%s", host["name"])
        return RollbackResponse(
            host_uuid=host["uuid"], host_name=host["name"],
            status="failed",
-            detail=f"{type(exc).__name__}: {exc}",
+            detail="transport failure",
        )

    body = r.json() if r.content else {}
--- a/decnet/web/router/system/api_deployment_mode.py
+++ b/decnet/web/router/system/api_deployment_mode.py
@@ -13,7 +13,7 @@ from fastapi import APIRouter, Depends
 from pydantic import BaseModel

 from decnet.web.db.repository import BaseRepository
-from decnet.web.dependencies import get_repo
+from decnet.web.dependencies import get_repo, require_viewer

 router = APIRouter()

@@ -24,9 +24,15 @@ class DeploymentModeResponse(BaseModel):
    swarm_host_count: int


+# Auth-gated (V4.1.6): the response leaks host role + enrolled-worker count,
+# which is recon-useful to an unauthenticated attacker. The dashboard only ever
+# calls this from inside the post-login app shell (App.tsx gates the whole app
+# behind a valid token), so there is no pre-auth UI-mode use case to preserve —
+# gate the entire endpoint behind require_viewer rather than splitting it.
@router.get("/deployment-mode", response_model=DeploymentModeResponse)
 async def get_deployment_mode(
    repo: BaseRepository = Depends(get_repo),
+    _user: dict = Depends(require_viewer),
 ) -> DeploymentModeResponse:
    role = os.environ.get("DECNET_MODE", "master").lower()
    hosts = 0
--- a/decnet/web/router/topology/api_create_blank_topology.py
+++ b/decnet/web/router/topology/api_create_blank_topology.py
@@ -14,6 +14,7 @@ import json

 from fastapi import APIRouter, Depends, HTTPException, status
 from pydantic import BaseModel, Field as PydanticField
+from sqlalchemy.exc import IntegrityError

 from decnet.telemetry import traced as _traced
 from decnet.topology.allocator import SubnetAllocator, reserved_subnets
@@ -62,8 +63,13 @@ async def api_create_blank_topology(
                "config_snapshot": json.dumps({"blank": True}),
            }
        )
-    except Exception as exc:  # noqa: BLE001 — surface duplicate-name as 409
-        raise HTTPException(status_code=409, detail=str(exc)) from exc
+    except IntegrityError as exc:
+        # Presumably the unique constraint on topologies.name; any IntegrityError
+        # is reported as a name collision so the raw DB message never leaks.
+        raise HTTPException(
+            status_code=409,
+            detail=f"A topology named {body.name!r} already exists.",
+        ) from exc

    # 2. DMZ LAN with auto-allocated subnet
    try:
--- a/decnet/web/router/topology/api_tarpit.py
+++ b/decnet/web/router/topology/api_tarpit.py
@@ -76,7 +76,11 @@ async def api_enable_tarpit(
    try:
        await asyncio.to_thread(_apply_tarpit, veth, req.ports, req.delay_ms)
    except RuntimeError as exc:
-        raise HTTPException(status_code=409, detail=str(exc)) from exc
+        log.warning(
+            "tarpit enable failed topology=%s decky=%s: %s",
+            topology_id, decky_name, exc, exc_info=True,
+        )
+        raise HTTPException(status_code=409, detail="tarpit command failed") from exc

    db_key = _db_key(topology_id, decky_name)
    ports_json = json.dumps(req.ports)
@@ -175,7 +179,11 @@ async def api_disable_tarpit(
    try:
        await asyncio.to_thread(_remove_tarpit, veth)
    except RuntimeError as exc:
-        raise HTTPException(status_code=409, detail=str(exc)) from exc
+        log.warning(
+            "tarpit disable failed topology=%s decky=%s: %s",
+            topology_id, decky_name, exc, exc_info=True,
+        )
+        raise HTTPException(status_code=409, detail="tarpit command failed") from exc

    db_key = _db_key(topology_id, decky_name)
    await repo.delete_tarpit_rule(db_key)