fix: gate embedded sniffer behind DECNET_EMBED_SNIFFER (default off)

The API's lifespan unconditionally spawned a MACVLAN sniffer task, which
duplicated the standalone 'decnet sniffer --daemon' process that
'decnet deploy' always starts — causing two workers to sniff the same
interface, double events, and wasted CPU.

Mirror the existing DECNET_EMBED_PROFILER pattern: sniffer is OFF by
default, opt in explicitly. Static regression tests guard against
accidental removal of the gate.
This commit is contained in:
2026-04-17 13:35:43 -04:00
parent 064c8760b6
commit 140d2fbaad
3 changed files with 66 additions and 8 deletions

View File

@@ -59,6 +59,12 @@ DECNET_SYSTEM_LOGS: str = os.environ.get("DECNET_SYSTEM_LOGS", "decnet.system.lo
# which causes events to be skipped or processed twice. # which causes events to be skipped or processed twice.
DECNET_EMBED_PROFILER: bool = os.environ.get("DECNET_EMBED_PROFILER", "").lower() == "true" DECNET_EMBED_PROFILER: bool = os.environ.get("DECNET_EMBED_PROFILER", "").lower() == "true"
# Set to "true" to embed the MACVLAN sniffer inside the API process.
# Leave unset (default) when the standalone `decnet sniffer --daemon` is
# running (which `decnet deploy` always does). Embedding both produces two
# workers sniffing the same interface — duplicated events and wasted CPU.
DECNET_EMBED_SNIFFER: bool = os.environ.get("DECNET_EMBED_SNIFFER", "").lower() == "true"
# Set to "true" to mount the Pyinstrument ASGI middleware on the FastAPI app. # Set to "true" to mount the Pyinstrument ASGI middleware on the FastAPI app.
# Produces per-request HTML flamegraphs under ./profiles/. Off by default so # Produces per-request HTML flamegraphs under ./profiles/. Off by default so
# production and normal dev runs pay zero profiling overhead. # production and normal dev runs pay zero profiling overhead.

View File

@@ -13,6 +13,7 @@ from decnet.env import (
DECNET_CORS_ORIGINS, DECNET_CORS_ORIGINS,
DECNET_DEVELOPER, DECNET_DEVELOPER,
DECNET_EMBED_PROFILER, DECNET_EMBED_PROFILER,
DECNET_EMBED_SNIFFER,
DECNET_INGEST_LOG_FILE, DECNET_INGEST_LOG_FILE,
DECNET_PROFILE_DIR, DECNET_PROFILE_DIR,
DECNET_PROFILE_REQUESTS, DECNET_PROFILE_REQUESTS,
@@ -97,14 +98,20 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
else: else:
log.debug("API startup: profiler not embedded — expecting standalone daemon") log.debug("API startup: profiler not embedded — expecting standalone daemon")
# Start fleet-wide MACVLAN sniffer (fault-isolated — never crashes the API) # Start fleet-wide MACVLAN sniffer only when explicitly requested.
# Default is OFF because `decnet deploy` always starts a standalone
# `decnet sniffer --daemon` process. Running both against the same
# interface produces duplicated events and wastes CPU.
if DECNET_EMBED_SNIFFER:
try: try:
from decnet.sniffer import sniffer_worker from decnet.sniffer import sniffer_worker
if sniffer_task is None or sniffer_task.done(): if sniffer_task is None or sniffer_task.done():
sniffer_task = asyncio.create_task(sniffer_worker(_log_file)) sniffer_task = asyncio.create_task(sniffer_worker(_log_file))
log.debug("API startup sniffer worker started") log.info("API startup: embedded sniffer started (DECNET_EMBED_SNIFFER=true)")
except Exception as exc: except Exception as exc:
log.warning("Sniffer worker failed to start — API continues without sniffing: %s", exc) log.warning("Sniffer worker failed to start — API continues without sniffing: %s", exc)
else:
log.debug("API startup: sniffer not embedded — expecting standalone daemon")
else: else:
log.info("Contract Test Mode: skipping background worker startup") log.info("Contract Test Mode: skipping background worker startup")

View File

@@ -0,0 +1,45 @@
"""
Regression guards for workers that duplicate standalone daemons.
`decnet deploy` starts standalone `decnet sniffer --daemon` and
`decnet profiler --daemon` processes. The API's lifespan must not spawn
its own copies unless the operator explicitly opts in via env flags.
These tests are intentionally static: we don't spin up lifespan, because
scapy's sniff thread doesn't cooperate with asyncio cancellation and
hangs pytest teardown.
"""
import importlib
import inspect
def test_embed_sniffer_defaults_off(monkeypatch):
    """With DECNET_EMBED_SNIFFER unset, reloading decnet.env yields False.

    The flag is read when decnet.env is executed, so the module has to be
    reloaded after the environment is changed. The module is reloaded a
    second time once the environment is restored so the values computed
    under the patched environment do not leak into later tests.
    """
    import decnet.env

    try:
        # Scoped patch: the variable is restored as soon as the block exits,
        # without undoing patches applied by other fixtures.
        with monkeypatch.context() as scoped_env:
            scoped_env.delenv("DECNET_EMBED_SNIFFER", raising=False)
            importlib.reload(decnet.env)
            assert decnet.env.DECNET_EMBED_SNIFFER is False
    finally:
        # Re-evaluate the module against the original environment.
        importlib.reload(decnet.env)
def test_embed_sniffer_flag_is_truthy_on_opt_in(monkeypatch):
    """With DECNET_EMBED_SNIFFER="true", reloading decnet.env yields True.

    The flag is read when decnet.env is executed, so the module has to be
    reloaded after the environment is changed. Without a second reload the
    module would keep DECNET_EMBED_SNIFFER set to True for every test that
    runs afterwards, so the module is re-evaluated once the environment has
    been restored.
    """
    import decnet.env

    try:
        # Scoped patch: the variable is restored as soon as the block exits,
        # without undoing patches applied by other fixtures.
        with monkeypatch.context() as scoped_env:
            scoped_env.setenv("DECNET_EMBED_SNIFFER", "true")
            importlib.reload(decnet.env)
            assert decnet.env.DECNET_EMBED_SNIFFER is True
    finally:
        # Re-evaluate the module against the original environment.
        importlib.reload(decnet.env)
def test_api_lifespan_gates_sniffer_on_embed_flag():
    """Static guard: lifespan must mention the embed flag ahead of the worker.

    Inspects the source text of the API lifespan and checks that the
    DECNET_EMBED_SNIFFER name first occurs before the first occurrence of
    sniffer_worker, so an edit that drops the gate is caught without
    running the lifespan itself.
    """
    import decnet.web.api

    gate_name = "DECNET_EMBED_SNIFFER"
    worker_name = "sniffer_worker"
    lifespan_src = inspect.getsource(decnet.web.api.lifespan)

    # Both names must be present before their positions can be compared.
    assert gate_name in lifespan_src, "sniffer gate removed from lifespan"
    assert worker_name in lifespan_src

    # The first mention of the gate has to precede the first mention of the worker.
    gate_pos = lifespan_src.index(gate_name)
    worker_pos = lifespan_src.index(worker_name)
    assert gate_pos < worker_pos
def test_api_lifespan_gates_profiler_on_embed_flag():
    """Static guard: lifespan must mention the profiler flag ahead of the worker.

    Inspects the source text of the API lifespan and checks that the
    DECNET_EMBED_PROFILER name first occurs before the first occurrence of
    attacker_profile_worker. Presence of both names is asserted first so a
    rename or removal fails with a clear assertion message instead of a
    ValueError raised by str.index.
    """
    import decnet.web.api

    src = inspect.getsource(decnet.web.api.lifespan)
    assert "DECNET_EMBED_PROFILER" in src, "profiler gate removed from lifespan"
    assert "attacker_profile_worker" in src, "profiler worker not referenced in lifespan"
    # The first mention of the gate has to precede the first mention of the worker.
    assert src.index("DECNET_EMBED_PROFILER") < src.index("attacker_profile_worker")