feat(1.1): supervise cpu group with ProcessPoolExecutor kernel offload

Hosts clusterer/campaign-clusterer/attribution/reuse-correlate in one
process. The two O(n^2) connected-components kernels (cluster_observations,
cluster_identities) offload to ONE shared forkserver pool via
decnet.offload.run_kernel, so they run in parallel instead of serialising
under the GIL.

- offload.run_kernel: pool when installed + offload_if holds, else inline.
  Standalone workers and all tests run inline => behaviour unchanged
  (424 clustering/correlation tests green).
- offload_if gates on input size (>=256) to skip pickle cost on small passes.
- forkserver (not fork): supervisor is multithreaded via bus clients.
- attribution/reuse co-located but not offloaded yet (lighter; same run_kernel
  path extends to them if profiling shows contention).
- systemd unit Conflicts= the 4 units it replaces; no docker/raw-socket priv.
This commit is contained in:
2026-06-17 17:35:42 -04:00
parent 6d7d2c0e24
commit bce2c1940c
7 changed files with 232 additions and 6 deletions

View File

@@ -17,7 +17,7 @@ from .utils import console, log
# Groups are intentionally a small static registry, not config — the membership
# is an architectural decision, not an operator knob.
_GROUPS = ("batch",)
_GROUPS = ("batch", "cpu")
async def _build_specs(group: str):
@@ -43,6 +43,22 @@ async def _build_specs(group: str):
("orchestrate", lambda: orchestrator_worker(repo, interval=60, llm_enabled=None)),
("mutate", lambda: run_watch_loop(repo)),
]
if group == "cpu":
from decnet.cli.gating import _require_master_mode
from decnet.clustering.campaign.worker import run_campaign_clusterer_loop
from decnet.clustering.worker import run_clusterer_loop
from decnet.correlation.attribution_worker import run_attribution_loop
from decnet.correlation.reuse_worker import run_reuse_loop
from decnet.web.dependencies import repo
_require_master_mode("supervise cpu")
await repo.initialize() # shared by every cpu worker → one DB pool
return [
("clusterer", lambda: run_clusterer_loop(repo, poll_interval_secs=60.0)),
("campaign-clusterer", lambda: run_campaign_clusterer_loop(repo, poll_interval_secs=60.0)),
("attribution", lambda: run_attribution_loop(repo, multi_actor_tick_secs=60.0)),
("reuse-correlate", lambda: run_reuse_loop(repo, poll_interval_secs=60.0, min_targets=2)),
]
raise ValueError(f"unknown supervise group: {group}")
@@ -75,8 +91,32 @@ def register(app: typer.Typer) -> None:
console.print(f"[bold cyan]Supervisor starting[/] group={group}")
async def _run() -> None:
specs = await _build_specs(group)
await run_group(specs)
pool = None
if group == "cpu":
# The CPU workers offload their O(n^2) connected-components
# kernels to ONE shared pool so they run in parallel instead of
# serialising under the GIL. forkserver (not the default fork):
# this process is multithreaded via bus clients, and forking a
# multithreaded process is unsafe.
import multiprocessing as _mp
from concurrent.futures import ProcessPoolExecutor
from decnet import offload
pool = ProcessPoolExecutor(
max_workers=2, mp_context=_mp.get_context("forkserver")
)
offload.set_executor(pool)
log.info("supervise cpu: kernel offload pool ready (max_workers=2)")
try:
specs = await _build_specs(group)
await run_group(specs)
finally:
if pool is not None:
from decnet import offload
offload.set_executor(None)
pool.shutdown(wait=False, cancel_futures=True)
try:
asyncio.run(_run())

View File

@@ -31,11 +31,16 @@ from decnet.clustering.campaign.impl.similarity import (
combined_campaign_weight,
)
from decnet.logging import get_logger
from decnet.offload import run_kernel
from decnet.util.simhash import from_bytes8
from decnet.web.db.repository import BaseRepository
log = get_logger("clustering.campaign.connected_components")
# Below this many identities the O(n^2) pass is cheaper than the pickle
# round-trip to a pool worker, so run inline even when a pool is installed.
_OFFLOAD_MIN_IDENTITIES = 256
def cluster_identities(
features: Iterable[IdentityFeatures],
@@ -220,7 +225,11 @@ class ConnectedComponentsCampaignClusterer(CampaignClusterer):
row_by_uuid: dict[str, dict[str, Any]] = {
r["uuid"]: r for r in active_rows
}
labels = cluster_identities(feature_list)
labels = await run_kernel(
cluster_identities,
feature_list,
offload_if=len(feature_list) >= _OFFLOAD_MIN_IDENTITIES,
)
# Group identities by predicted cluster.
components: dict[str, list[str]] = {}

View File

@@ -42,12 +42,17 @@ from decnet.clustering.impl.similarity import (
combined_edge_weight,
)
from decnet.logging import get_logger
from decnet.offload import run_kernel
from decnet.profiler.identity_rollup import extract_fp_summaries
from decnet.util.simhash import from_bytes8, to_bytes8
from decnet.web.db.repository import BaseRepository
log = get_logger("clustering.connected_components")
# Below this many observations the O(n^2) pass is cheaper than the pickle
# round-trip to a pool worker, so run inline even when a pool is installed.
_OFFLOAD_MIN_OBSERVATIONS = 256
# Per-session SimHash observations of the keystroke-rhythm biometric; the
# rollup folds them into one identity-level centroid.
_DIGRAPH_PRIMITIVE = "motor.digraph_simhash"
@@ -173,7 +178,11 @@ class ConnectedComponentsClusterer(Clusterer):
obs = from_attacker_row(r)
observations.append(obs)
row_by_id[obs.observation_id] = r
labels = cluster_observations(observations)
labels = await run_kernel(
cluster_observations,
observations,
offload_if=len(observations) >= _OFFLOAD_MIN_OBSERVATIONS,
)
# Group observations by predicted cluster.
components: dict[str, list[str]] = {}

52
decnet/offload.py Normal file
View File

@@ -0,0 +1,52 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Shared CPU-kernel offload — run a pure, picklable function in a process pool
so GIL-bound compute doesn't block the event loop (or its co-hosted workers).
Used by ``decnet supervise cpu`` (see ``decnet/cli/supervise.py``), which hosts
several CPU-bound workers in one process and installs ONE shared
``ProcessPoolExecutor`` here. When no executor is installed — standalone workers
and every test — :func:`run_kernel` runs the kernel inline, so behaviour is
identical to before this module existed.
Contract for an offloadable kernel: a module-level function (picklable by
reference) that is pure (no DB / clock / I/O), taking and returning picklable
values. The clustering connected-components kernels satisfy this.
"""
from __future__ import annotations
import asyncio
from concurrent.futures import ProcessPoolExecutor
from typing import Any, Callable, TypeVar
# Result type of the kernel handed to run_kernel; carried through to its return.
_T = TypeVar("_T")
# Module-wide shared pool. None means no pool is installed, in which case
# run_kernel executes kernels inline.
_executor: ProcessPoolExecutor | None = None
def set_executor(ex: ProcessPoolExecutor | None) -> None:
    """Install (``ex``) or clear (``None``) the shared pool. Idempotent.

    Only rebinds the module-level reference: a previously installed pool is
    not shut down here, so whoever created it remains responsible for that.
    """
    global _executor
    _executor = ex
def get_executor() -> ProcessPoolExecutor | None:
    """Return the currently installed shared pool, or ``None`` if none is set."""
    return _executor
async def run_kernel(
    fn: Callable[..., _T], *args: Any, offload_if: bool = True
) -> _T:
    """Evaluate ``fn(*args)`` in the shared pool if possible, else inline.

    The call is handed to the installed executor only when both conditions
    hold: a pool has been installed via :func:`set_executor`, and the caller
    passed a truthy ``offload_if``. In every other case ``fn`` is invoked
    directly in the calling coroutine.

    Args:
        fn: The kernel to run. When offloaded it must satisfy the module's
            contract (module-level, pure, picklable arguments and result).
        *args: Positional arguments forwarded to ``fn`` unchanged.
        offload_if: Caller-supplied gate. The caller knows the problem size
            and can pass ``False`` to avoid a cross-process pickle round-trip
            for inputs too small to benefit from it.

    Returns:
        Whatever ``fn(*args)`` returns.
    """
    # ponytail: boolean gate, not an auto-tuned threshold. If kernels start
    # varying wildly in cost, measure and move the decision here.
    pool = _executor  # snapshot once so the check and the submit agree
    if offload_if and pool is not None:
        running_loop = asyncio.get_running_loop()
        return await running_loop.run_in_executor(pool, fn, *args)
    return fn(*args)