fix(security): close LOW ASVS findings — env bypass, SSE/deployment authz, CN fail-close, password byte-limit, exception leaks, BUG-12..16

Auth/session (V2.1.7, V4.1.5, V4.1.6, V2.1.4/V2.1.5):
- env secret validation no longer bypassed by attacker-injectable PYTEST* env;
  gated on explicit DECNET_TESTING=1 (set only in conftest).
- must_change_password now enforced on the SSE header-JWT path, not just ticket mint.
- GET /system/deployment-mode requires viewer auth (was leaking role + topology size).
- CreateUser/ResetUser passwords min_length=12; passwords >72 bytes rejected
  explicitly instead of bcrypt silently truncating.

Swarm ingestion (V9.1.3, BUG-16):
- Log listener hard-rejects peers with unparseable/empty cert CN (fail closed,
  ingests nothing) instead of tagging 'unknown'.
- Shutdown handlers no longer swallow real errors (narrowed to CancelledError).

Info leakage (V7.1.2, V14.1.2):
- Exception text sanitized on swarm-update, health, tarpit, realism, file-drop,
  and blank-topology endpoints: raw tc/docker stderr and DB/Docker errors are
  logged server-side, and only a generic detail is returned to the client.
  pyproject license corrected to AGPL-3.0.

Correctness (BUG-12..16):
- BUG-12 atomic credential upsert (UNIQUE constraint + IntegrityError retry,
  consistent principal_key canonicalization).
- BUG-13 rule-tail watermark uses >= with seen-id dedup (no same-second drop).
- BUG-14 worker wake cleared before wait (no lost wake during tick).
- BUG-15 intel gather tolerates an unexpected provider raise.
- BUG-16: see "Swarm ingestion" above.

Already closed (verified, no change): V2.1.6, V5.1.3, V9.1.2. Accepted risk,
documented: V2.1.8 cache window, V3.1.3 idle timeout. Tests added for every fix;
unanimous adversarial review after two refute-fix rounds.
This commit is contained in:
2026-06-10 13:27:14 -04:00
parent d80e6aa6d1
commit 245975a6dd
40 changed files with 1629 additions and 72 deletions

View File

@@ -75,7 +75,7 @@ async def api_drop_file(
content = base64.b64decode(req.content_b64, validate=True)
except (ValueError, TypeError) as exc:
raise HTTPException(
status_code=400, detail=f"content_b64 is not valid base64: {exc}",
status_code=400, detail="content_b64 is not valid base64",
) from exc
container = await _resolve_container_or_4xx(req.decky_name, req.topology_id)

View File

@@ -50,7 +50,8 @@ def _apply_tarpit(veth: str, ports: list[int], delay_ms: int) -> None:
for args in steps:
r = _tc(*args)
if r.returncode != 0:
raise RuntimeError(r.stderr.strip())
log.warning("tarpit tc apply failed veth=%s cmd=%s stderr=%r", veth, args[0], r.stderr.strip())
raise RuntimeError("tarpit command failed")
for port in ports:
r = _tc(
@@ -60,7 +61,8 @@ def _apply_tarpit(veth: str, ports: list[int], delay_ms: int) -> None:
"flowid", "1:1",
)
if r.returncode != 0:
raise RuntimeError(r.stderr.strip())
log.warning("tarpit tc filter failed veth=%s port=%d stderr=%r", veth, port, r.stderr.strip())
raise RuntimeError("tarpit command failed")
def _remove_tarpit(veth: str) -> bool:
@@ -69,7 +71,8 @@ def _remove_tarpit(veth: str) -> bool:
if r.returncode != 0:
if "Cannot find" in r.stderr or "No such" in r.stderr:
return False
raise RuntimeError(r.stderr.strip())
log.warning("tarpit tc remove failed veth=%s stderr=%r", veth, r.stderr.strip())
raise RuntimeError("tarpit command failed")
return True
@@ -126,7 +129,8 @@ async def api_enable_tarpit(
try:
await asyncio.to_thread(_apply_tarpit, veth, req.ports, req.delay_ms)
except RuntimeError as exc:
raise HTTPException(status_code=409, detail=str(exc)) from exc
log.warning("tarpit enable failed decky=%s: %s", decky_name, exc, exc_info=True)
raise HTTPException(status_code=409, detail="tarpit command failed") from exc
ports_json = json.dumps(req.ports)
await repo.set_tarpit_rule({
@@ -212,7 +216,8 @@ async def api_disable_tarpit(
try:
await asyncio.to_thread(_remove_tarpit, veth)
except RuntimeError as exc:
raise HTTPException(status_code=409, detail=str(exc)) from exc
log.warning("tarpit disable failed decky=%s: %s", decky_name, exc, exc_info=True)
raise HTTPException(status_code=409, detail="tarpit command failed") from exc
await repo.delete_tarpit_rule(decky_name)
await repo.add_log({

View File

@@ -62,8 +62,10 @@ async def _check_database_cached() -> ComponentHealth:
try:
await repo.get_total_logs()
_db_component = ComponentHealth(status="ok")
except Exception as exc:
_db_component = ComponentHealth(status="failing", detail=str(exc))
except Exception:
import logging as _logging
_logging.getLogger("api.get_health").exception("database liveness check failed")
_db_component = ComponentHealth(status="failing", detail="database unavailable")
_db_last_check = time.monotonic()
return _db_component
@@ -95,7 +97,7 @@ async def get_health(user: dict = Depends(require_viewer)) -> Any:
detail = "cancelled"
else:
exc = task.exception()
detail = f"exited: {exc}" if exc else "exited unexpectedly"
detail = "exited unexpectedly" if not exc else "exited with error"
components[name] = ComponentHealth(status="failing", detail=detail)
else:
components[name] = ComponentHealth(status="ok")
@@ -112,10 +114,12 @@ async def get_health(user: dict = Depends(require_viewer)) -> Any:
await asyncio.to_thread(_docker_client.ping) # type: ignore[union-attr]
_docker_healthy = True
_docker_detail = ""
except Exception as exc:
except Exception:
import logging as _logging
_logging.getLogger("api.get_health").exception("docker daemon ping failed")
_docker_client = None
_docker_healthy = False
_docker_detail = str(exc)
_docker_detail = "docker daemon unavailable"
_docker_last_check = now
if _docker_healthy:

View File

@@ -145,23 +145,26 @@ async def put_llm_config(
try:
from decnet.web.db.secrets import encrypt_secret
merged["api_key_ciphertext"] = encrypt_secret(str(api_key_raw))
except RuntimeError as exc:
except RuntimeError:
log.exception("api.realism.put_llm: secret encryption unavailable")
raise HTTPException(
status_code=500,
detail=f"Secret encryption unavailable: {exc}",
) from exc
detail="Secret encryption unavailable; check server configuration.",
) from None
try:
cfg = LLMConfig(**merged)
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
log.warning("api.realism.put_llm: LLMConfig validation failed: %s", exc)
raise HTTPException(status_code=400, detail="Invalid LLM configuration payload.") from exc
try:
llm_config.apply(cfg)
except Exception as exc:
except Exception:
log.exception("api.realism.put_llm: backend init failed")
raise HTTPException(
status_code=400, detail=f"Backend init failed: {exc}"
) from exc
status_code=400, detail="Backend init failed; check provider/model settings."
) from None
await repo.set_realism_config(_CONFIG_KEY, json.dumps(merged))
_hydrated = True

View File

@@ -39,13 +39,14 @@ async def _probe_host(host: dict[str, Any]) -> HostReleaseInfo:
try:
async with UpdaterClient(host=host) as u:
body = await u.health()
except Exception as exc: # noqa: BLE001
except Exception: # noqa: BLE001
log.warning("swarm_updates.list probe unreachable host=%s", host.get("name"), exc_info=True)
return HostReleaseInfo(
host_uuid=host["uuid"],
host_name=host["name"],
address=host["address"],
reachable=False,
detail=f"{type(exc).__name__}: {exc}",
detail="host unreachable",
)
releases = body.get("releases") or []
current, previous = _extract_shas(releases)

View File

@@ -96,10 +96,14 @@ async def _push_one(
# Connection drop on update-self is expected and not an error.
self_ok = _is_expected_connection_drop(exc)
if not self_ok:
log.warning(
"swarm_updates.push self-update transport failure host=%s: %s",
host.get("name"), exc,
)
return PushUpdateResult(
host_uuid=host["uuid"], host_name=host["name"],
status="self-failed", http_status=r.status_code, sha=sha,
detail=f"agent updated OK but self-update failed: {exc}",
detail="agent updated OK but self-update transport failure",
stderr=stderr,
)
status = "self-updated" if self_ok else "self-failed"
@@ -110,12 +114,12 @@ async def _push_one(
detail=body.get("error") or body.get("probe") if isinstance(body, dict) else None,
stderr=stderr,
)
except Exception as exc: # noqa: BLE001
except Exception: # noqa: BLE001
log.exception("swarm_updates.push failed host=%s", host.get("name"))
return PushUpdateResult(
host_uuid=host["uuid"], host_name=host["name"],
status="failed",
detail=f"{type(exc).__name__}: {exc}",
detail="transport failure",
)

View File

@@ -56,12 +56,12 @@ async def _push_self_one(host: dict[str, Any], tarball: bytes, sha: str) -> Push
http_status=http_status, sha=sha,
detail=detail, stderr=stderr,
)
except Exception as exc: # noqa: BLE001
except Exception: # noqa: BLE001
log.exception("swarm_updates.push_self failed host=%s", host.get("name"))
return PushUpdateResult(
host_uuid=host["uuid"], host_name=host["name"],
status="self-failed",
detail=f"{type(exc).__name__}: {exc}",
detail="transport failure",
)

View File

@@ -49,12 +49,12 @@ async def api_rollback_host(
try:
async with UpdaterClient(host=host) as u:
r = await u.rollback()
except Exception as exc: # noqa: BLE001
except Exception: # noqa: BLE001
log.exception("swarm_updates.rollback transport failure host=%s", host["name"])
return RollbackResponse(
host_uuid=host["uuid"], host_name=host["name"],
status="failed",
detail=f"{type(exc).__name__}: {exc}",
detail="transport failure",
)
body = r.json() if r.content else {}

View File

@@ -13,7 +13,7 @@ from fastapi import APIRouter, Depends
from pydantic import BaseModel
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.dependencies import get_repo, require_viewer
router = APIRouter()
@@ -24,9 +24,15 @@ class DeploymentModeResponse(BaseModel):
swarm_host_count: int
# Auth-gated (V4.1.6): the response leaks host role + enrolled-worker count,
# which is recon-useful to an unauthenticated attacker. The dashboard only ever
# calls this from inside the post-login app shell (App.tsx gates the whole app
# behind a valid token), so there is no pre-auth UI-mode use case to preserve —
# gate the entire endpoint behind require_viewer rather than splitting it.
@router.get("/deployment-mode", response_model=DeploymentModeResponse)
async def get_deployment_mode(
repo: BaseRepository = Depends(get_repo),
_user: dict = Depends(require_viewer),
) -> DeploymentModeResponse:
role = os.environ.get("DECNET_MODE", "master").lower()
hosts = 0

View File

@@ -14,6 +14,7 @@ import json
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, Field as PydanticField
from sqlalchemy.exc import IntegrityError
from decnet.telemetry import traced as _traced
from decnet.topology.allocator import SubnetAllocator, reserved_subnets
@@ -62,8 +63,13 @@ async def api_create_blank_topology(
"config_snapshot": json.dumps({"blank": True}),
}
)
except Exception as exc: # noqa: BLE001 — surface duplicate-name as 409
raise HTTPException(status_code=409, detail=str(exc)) from exc
except IntegrityError as exc:
# Unique constraint on topologies.name — report the collision without
# leaking the raw DB message.
raise HTTPException(
status_code=409,
detail=f"A topology named {body.name!r} already exists.",
) from exc
# 2. DMZ LAN with auto-allocated subnet
try:

View File

@@ -76,7 +76,11 @@ async def api_enable_tarpit(
try:
await asyncio.to_thread(_apply_tarpit, veth, req.ports, req.delay_ms)
except RuntimeError as exc:
raise HTTPException(status_code=409, detail=str(exc)) from exc
log.warning(
"tarpit enable failed topology=%s decky=%s: %s",
topology_id, decky_name, exc, exc_info=True,
)
raise HTTPException(status_code=409, detail="tarpit command failed") from exc
db_key = _db_key(topology_id, decky_name)
ports_json = json.dumps(req.ports)
@@ -175,7 +179,11 @@ async def api_disable_tarpit(
try:
await asyncio.to_thread(_remove_tarpit, veth)
except RuntimeError as exc:
raise HTTPException(status_code=409, detail=str(exc)) from exc
log.warning(
"tarpit disable failed topology=%s decky=%s: %s",
topology_id, decky_name, exc, exc_info=True,
)
raise HTTPException(status_code=409, detail="tarpit command failed") from exc
db_key = _db_key(topology_id, decky_name)
await repo.delete_tarpit_rule(db_key)