feat(1.1): supervise cpu group with ProcessPoolExecutor kernel offload
Hosts clusterer/campaign-clusterer/attribution/reuse-correlate in one process. The two O(n^2) connected-components kernels (cluster_observations, cluster_identities) offload to ONE shared forkserver pool via decnet.offload.run_kernel, so they run in parallel instead of serialising under the GIL.
- offload.run_kernel: uses the pool when one is installed and offload_if holds, else runs inline. Standalone workers and all tests run inline => behaviour unchanged (424 clustering/correlation tests green).
- offload_if gates on input size (>=256) to skip pickle cost on small passes.
- forkserver (not fork): supervisor is multithreaded via bus clients.
- attribution/reuse are co-located but not offloaded yet (lighter; the same run_kernel path extends to them if profiling shows contention).
- systemd unit Conflicts= the 4 units it replaces; no docker/raw-socket priv.
This commit is contained in:
@@ -31,11 +31,16 @@ from decnet.clustering.campaign.impl.similarity import (
|
||||
combined_campaign_weight,
|
||||
)
|
||||
from decnet.logging import get_logger
|
||||
from decnet.offload import run_kernel
|
||||
from decnet.util.simhash import from_bytes8
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
|
||||
log = get_logger("clustering.campaign.connected_components")
|
||||
|
||||
# Below this many identities the O(n^2) pass is cheaper than the pickle
|
||||
# round-trip to a pool worker, so run inline even when a pool is installed.
|
||||
_OFFLOAD_MIN_IDENTITIES = 256
|
||||
|
||||
|
||||
def cluster_identities(
|
||||
features: Iterable[IdentityFeatures],
|
||||
@@ -220,7 +225,11 @@ class ConnectedComponentsCampaignClusterer(CampaignClusterer):
|
||||
row_by_uuid: dict[str, dict[str, Any]] = {
|
||||
r["uuid"]: r for r in active_rows
|
||||
}
|
||||
labels = cluster_identities(feature_list)
|
||||
labels = await run_kernel(
|
||||
cluster_identities,
|
||||
feature_list,
|
||||
offload_if=len(feature_list) >= _OFFLOAD_MIN_IDENTITIES,
|
||||
)
|
||||
|
||||
# Group identities by predicted cluster.
|
||||
components: dict[str, list[str]] = {}
|
||||
|
||||
@@ -42,12 +42,17 @@ from decnet.clustering.impl.similarity import (
|
||||
combined_edge_weight,
|
||||
)
|
||||
from decnet.logging import get_logger
|
||||
from decnet.offload import run_kernel
|
||||
from decnet.profiler.identity_rollup import extract_fp_summaries
|
||||
from decnet.util.simhash import from_bytes8, to_bytes8
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
|
||||
log = get_logger("clustering.connected_components")
|
||||
|
||||
# Below this many observations the O(n^2) pass is cheaper than the pickle
|
||||
# round-trip to a pool worker, so run inline even when a pool is installed.
|
||||
_OFFLOAD_MIN_OBSERVATIONS = 256
|
||||
|
||||
# Per-session SimHash observations of the keystroke-rhythm biometric; the
|
||||
# rollup folds them into one identity-level centroid.
|
||||
_DIGRAPH_PRIMITIVE = "motor.digraph_simhash"
|
||||
@@ -173,7 +178,11 @@ class ConnectedComponentsClusterer(Clusterer):
|
||||
obs = from_attacker_row(r)
|
||||
observations.append(obs)
|
||||
row_by_id[obs.observation_id] = r
|
||||
labels = cluster_observations(observations)
|
||||
labels = await run_kernel(
|
||||
cluster_observations,
|
||||
observations,
|
||||
offload_if=len(observations) >= _OFFLOAD_MIN_OBSERVATIONS,
|
||||
)
|
||||
|
||||
# Group observations by predicted cluster.
|
||||
components: dict[str, list[str]] = {}
|
||||
|
||||
Reference in New Issue
Block a user