fix: use dedicated thread pools for collector and sniffer workers

The collector spawned one permanent thread per Docker container via
asyncio.to_thread(), saturating the default asyncio executor. This
starved short-lived to_thread(load_state) calls in get_deckies() and
get_stats_summary(), causing the SSE stream and deckies endpoints to
hang indefinitely, while endpoints that only query the database kept
working.

Give the collector and the sniffer each a dedicated ThreadPoolExecutor
so their long-running threads never compete with the default pool.
This commit is contained in:
2026-04-15 22:57:03 -04:00
parent e9d151734d
commit a1ca5d699b
3 changed files with 125 additions and 3 deletions

View File

@@ -12,6 +12,7 @@ import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from pathlib import Path
from typing import Any, Optional
@@ -285,10 +286,20 @@ async def log_collector_worker(log_file: str) -> None:
active: dict[str, asyncio.Task[None]] = {}
loop = asyncio.get_running_loop()
# Dedicated thread pool so long-running container log streams don't
# saturate the default asyncio executor and starve short-lived
# to_thread() calls elsewhere (e.g. load_state in the web API).
collector_pool = ThreadPoolExecutor(
max_workers=64, thread_name_prefix="decnet-collector",
)
def _spawn(container_id: str, container_name: str) -> None:
if container_id not in active or active[container_id].done():
active[container_id] = asyncio.ensure_future(
asyncio.to_thread(_stream_container, container_id, log_path, json_path),
loop.run_in_executor(
collector_pool, _stream_container,
container_id, log_path, json_path,
),
loop=loop,
)
logger.info("collector: streaming container=%s", container_name)
@@ -312,12 +323,15 @@ async def log_collector_worker(log_file: str) -> None:
if cid and is_service_event(attrs):
loop.call_soon_threadsafe(_spawn, cid, name)
await asyncio.to_thread(_watch_events)
await loop.run_in_executor(collector_pool, _watch_events)
except asyncio.CancelledError:
logger.info("collector shutdown requested cancelling %d tasks", len(active))
for task in active.values():
task.cancel()
collector_pool.shutdown(wait=False)
raise
except Exception as exc:
logger.error("collector error: %s", exc)
finally:
collector_pool.shutdown(wait=False)

View File

@@ -14,6 +14,7 @@ import asyncio
import os
import subprocess
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from decnet.logging import get_logger
@@ -130,12 +131,25 @@ async def sniffer_worker(log_file: str) -> None:
stop_event = threading.Event()
# Dedicated thread pool so the long-running sniff loop doesn't
# occupy a slot in the default asyncio executor.
sniffer_pool = ThreadPoolExecutor(
max_workers=2, thread_name_prefix="decnet-sniffer",
)
try:
await asyncio.to_thread(_sniff_loop, interface, log_path, json_path, stop_event)
loop = asyncio.get_running_loop()
await loop.run_in_executor(
sniffer_pool, _sniff_loop,
interface, log_path, json_path, stop_event,
)
except asyncio.CancelledError:
logger.info("sniffer: shutdown requested")
stop_event.set()
sniffer_pool.shutdown(wait=False)
raise
finally:
sniffer_pool.shutdown(wait=False)
except asyncio.CancelledError:
raise