feat(1.1): supervise cpu group with ProcessPoolExecutor kernel offload
Hosts clusterer/campaign-clusterer/attribution/reuse-correlate in one process. The two O(n^2) connected-components kernels (cluster_observations, cluster_identities) offload to ONE shared forkserver pool via decnet.offload .run_kernel, so they run in parallel instead of serialising under the GIL. - offload.run_kernel: pool when installed + offload_if holds, else inline. Standalone workers and all tests run inline => behaviour unchanged (424 clustering/correlation tests green). - offload_if gates on input size (>=256) to skip pickle cost on small passes. - forkserver (not fork): supervisor is multithreaded via bus clients. - attribution/reuse co-located but not offloaded yet (lighter; same run_kernel path extends to them if profiling shows contention). - systemd unit Conflicts= the 4 units it replaces; no docker/raw-socket priv.
This commit is contained in:
@@ -17,7 +17,7 @@ from .utils import console, log
|
|||||||
|
|
||||||
# Groups are intentionally a small static registry, not config — the membership
|
# Groups are intentionally a small static registry, not config — the membership
|
||||||
# is an architectural decision, not an operator knob.
|
# is an architectural decision, not an operator knob.
|
||||||
_GROUPS = ("batch",)
|
_GROUPS = ("batch", "cpu")
|
||||||
|
|
||||||
|
|
||||||
async def _build_specs(group: str):
|
async def _build_specs(group: str):
|
||||||
@@ -43,6 +43,22 @@ async def _build_specs(group: str):
|
|||||||
("orchestrate", lambda: orchestrator_worker(repo, interval=60, llm_enabled=None)),
|
("orchestrate", lambda: orchestrator_worker(repo, interval=60, llm_enabled=None)),
|
||||||
("mutate", lambda: run_watch_loop(repo)),
|
("mutate", lambda: run_watch_loop(repo)),
|
||||||
]
|
]
|
||||||
|
if group == "cpu":
|
||||||
|
from decnet.cli.gating import _require_master_mode
|
||||||
|
from decnet.clustering.campaign.worker import run_campaign_clusterer_loop
|
||||||
|
from decnet.clustering.worker import run_clusterer_loop
|
||||||
|
from decnet.correlation.attribution_worker import run_attribution_loop
|
||||||
|
from decnet.correlation.reuse_worker import run_reuse_loop
|
||||||
|
from decnet.web.dependencies import repo
|
||||||
|
|
||||||
|
_require_master_mode("supervise cpu")
|
||||||
|
await repo.initialize() # shared by every cpu worker → one DB pool
|
||||||
|
return [
|
||||||
|
("clusterer", lambda: run_clusterer_loop(repo, poll_interval_secs=60.0)),
|
||||||
|
("campaign-clusterer", lambda: run_campaign_clusterer_loop(repo, poll_interval_secs=60.0)),
|
||||||
|
("attribution", lambda: run_attribution_loop(repo, multi_actor_tick_secs=60.0)),
|
||||||
|
("reuse-correlate", lambda: run_reuse_loop(repo, poll_interval_secs=60.0, min_targets=2)),
|
||||||
|
]
|
||||||
raise ValueError(f"unknown supervise group: {group}")
|
raise ValueError(f"unknown supervise group: {group}")
|
||||||
|
|
||||||
|
|
||||||
@@ -75,8 +91,32 @@ def register(app: typer.Typer) -> None:
|
|||||||
console.print(f"[bold cyan]Supervisor starting[/] group={group}")
|
console.print(f"[bold cyan]Supervisor starting[/] group={group}")
|
||||||
|
|
||||||
async def _run() -> None:
|
async def _run() -> None:
|
||||||
specs = await _build_specs(group)
|
pool = None
|
||||||
await run_group(specs)
|
if group == "cpu":
|
||||||
|
# The CPU workers offload their O(n^2) connected-components
|
||||||
|
# kernels to ONE shared pool so they run in parallel instead of
|
||||||
|
# serialising under the GIL. forkserver (not the default fork):
|
||||||
|
# this process is multithreaded via bus clients, and forking a
|
||||||
|
# multithreaded process is unsafe.
|
||||||
|
import multiprocessing as _mp
|
||||||
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
|
|
||||||
|
from decnet import offload
|
||||||
|
|
||||||
|
pool = ProcessPoolExecutor(
|
||||||
|
max_workers=2, mp_context=_mp.get_context("forkserver")
|
||||||
|
)
|
||||||
|
offload.set_executor(pool)
|
||||||
|
log.info("supervise cpu: kernel offload pool ready (max_workers=2)")
|
||||||
|
try:
|
||||||
|
specs = await _build_specs(group)
|
||||||
|
await run_group(specs)
|
||||||
|
finally:
|
||||||
|
if pool is not None:
|
||||||
|
from decnet import offload
|
||||||
|
|
||||||
|
offload.set_executor(None)
|
||||||
|
pool.shutdown(wait=False, cancel_futures=True)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
asyncio.run(_run())
|
asyncio.run(_run())
|
||||||
|
|||||||
@@ -31,11 +31,16 @@ from decnet.clustering.campaign.impl.similarity import (
|
|||||||
combined_campaign_weight,
|
combined_campaign_weight,
|
||||||
)
|
)
|
||||||
from decnet.logging import get_logger
|
from decnet.logging import get_logger
|
||||||
|
from decnet.offload import run_kernel
|
||||||
from decnet.util.simhash import from_bytes8
|
from decnet.util.simhash import from_bytes8
|
||||||
from decnet.web.db.repository import BaseRepository
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
|
||||||
log = get_logger("clustering.campaign.connected_components")
|
log = get_logger("clustering.campaign.connected_components")
|
||||||
|
|
||||||
|
# Below this many identities the O(n^2) pass is cheaper than the pickle
|
||||||
|
# round-trip to a pool worker, so run inline even when a pool is installed.
|
||||||
|
_OFFLOAD_MIN_IDENTITIES = 256
|
||||||
|
|
||||||
|
|
||||||
def cluster_identities(
|
def cluster_identities(
|
||||||
features: Iterable[IdentityFeatures],
|
features: Iterable[IdentityFeatures],
|
||||||
@@ -220,7 +225,11 @@ class ConnectedComponentsCampaignClusterer(CampaignClusterer):
|
|||||||
row_by_uuid: dict[str, dict[str, Any]] = {
|
row_by_uuid: dict[str, dict[str, Any]] = {
|
||||||
r["uuid"]: r for r in active_rows
|
r["uuid"]: r for r in active_rows
|
||||||
}
|
}
|
||||||
labels = cluster_identities(feature_list)
|
labels = await run_kernel(
|
||||||
|
cluster_identities,
|
||||||
|
feature_list,
|
||||||
|
offload_if=len(feature_list) >= _OFFLOAD_MIN_IDENTITIES,
|
||||||
|
)
|
||||||
|
|
||||||
# Group identities by predicted cluster.
|
# Group identities by predicted cluster.
|
||||||
components: dict[str, list[str]] = {}
|
components: dict[str, list[str]] = {}
|
||||||
|
|||||||
@@ -42,12 +42,17 @@ from decnet.clustering.impl.similarity import (
|
|||||||
combined_edge_weight,
|
combined_edge_weight,
|
||||||
)
|
)
|
||||||
from decnet.logging import get_logger
|
from decnet.logging import get_logger
|
||||||
|
from decnet.offload import run_kernel
|
||||||
from decnet.profiler.identity_rollup import extract_fp_summaries
|
from decnet.profiler.identity_rollup import extract_fp_summaries
|
||||||
from decnet.util.simhash import from_bytes8, to_bytes8
|
from decnet.util.simhash import from_bytes8, to_bytes8
|
||||||
from decnet.web.db.repository import BaseRepository
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
|
||||||
log = get_logger("clustering.connected_components")
|
log = get_logger("clustering.connected_components")
|
||||||
|
|
||||||
|
# Below this many observations the O(n^2) pass is cheaper than the pickle
|
||||||
|
# round-trip to a pool worker, so run inline even when a pool is installed.
|
||||||
|
_OFFLOAD_MIN_OBSERVATIONS = 256
|
||||||
|
|
||||||
# Per-session SimHash observations of the keystroke-rhythm biometric; the
|
# Per-session SimHash observations of the keystroke-rhythm biometric; the
|
||||||
# rollup folds them into one identity-level centroid.
|
# rollup folds them into one identity-level centroid.
|
||||||
_DIGRAPH_PRIMITIVE = "motor.digraph_simhash"
|
_DIGRAPH_PRIMITIVE = "motor.digraph_simhash"
|
||||||
@@ -173,7 +178,11 @@ class ConnectedComponentsClusterer(Clusterer):
|
|||||||
obs = from_attacker_row(r)
|
obs = from_attacker_row(r)
|
||||||
observations.append(obs)
|
observations.append(obs)
|
||||||
row_by_id[obs.observation_id] = r
|
row_by_id[obs.observation_id] = r
|
||||||
labels = cluster_observations(observations)
|
labels = await run_kernel(
|
||||||
|
cluster_observations,
|
||||||
|
observations,
|
||||||
|
offload_if=len(observations) >= _OFFLOAD_MIN_OBSERVATIONS,
|
||||||
|
)
|
||||||
|
|
||||||
# Group observations by predicted cluster.
|
# Group observations by predicted cluster.
|
||||||
components: dict[str, list[str]] = {}
|
components: dict[str, list[str]] = {}
|
||||||
|
|||||||
52
decnet/offload.py
Normal file
52
decnet/offload.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
"""Shared CPU-kernel offload — run a pure, picklable function in a process pool
|
||||||
|
so GIL-bound compute doesn't block the event loop (or its co-hosted workers).
|
||||||
|
|
||||||
|
Used by ``decnet supervise cpu`` (see ``decnet/cli/supervise.py``), which hosts
|
||||||
|
several CPU-bound workers in one process and installs ONE shared
|
||||||
|
``ProcessPoolExecutor`` here. When no executor is installed — standalone workers
|
||||||
|
and every test — :func:`run_kernel` runs the kernel inline, so behaviour is
|
||||||
|
identical to before this module existed.
|
||||||
|
|
||||||
|
Contract for an offloadable kernel: a module-level function (picklable by
|
||||||
|
reference) that is pure (no DB / clock / I/O), taking and returning picklable
|
||||||
|
values. The clustering connected-components kernels satisfy this.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
|
from typing import Any, Callable, TypeVar
|
||||||
|
|
||||||
|
_T = TypeVar("_T")
|
||||||
|
|
||||||
|
_executor: ProcessPoolExecutor | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def set_executor(ex: ProcessPoolExecutor | None) -> None:
|
||||||
|
"""Install (``ex``) or clear (``None``) the shared pool. Idempotent."""
|
||||||
|
global _executor
|
||||||
|
_executor = ex
|
||||||
|
|
||||||
|
|
||||||
|
def get_executor() -> ProcessPoolExecutor | None:
|
||||||
|
return _executor
|
||||||
|
|
||||||
|
|
||||||
|
async def run_kernel(
|
||||||
|
fn: Callable[..., _T], *args: Any, offload_if: bool = True
|
||||||
|
) -> _T:
|
||||||
|
"""Run ``fn(*args)``, offloading to the shared pool when one is installed
|
||||||
|
and ``offload_if`` holds; otherwise run inline.
|
||||||
|
|
||||||
|
``offload_if`` lets the caller skip the pickle round-trip for inputs too
|
||||||
|
small to be worth a cross-process hop — the caller knows the problem size,
|
||||||
|
this module does not.
|
||||||
|
# ponytail: boolean gate, not an auto-tuned threshold. If kernels start
|
||||||
|
# varying wildly in cost, measure and move the decision here.
|
||||||
|
"""
|
||||||
|
ex = _executor
|
||||||
|
if ex is None or not offload_if:
|
||||||
|
return fn(*args)
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
return await loop.run_in_executor(ex, fn, *args)
|
||||||
47
deploy/decnet-supervise-cpu.service.j2
Normal file
47
deploy/decnet-supervise-cpu.service.j2
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=DECNET CPU Supervisor (clusterer + campaign-clusterer + attribution + reuse-correlate in one process, kernels offloaded to a shared pool)
|
||||||
|
Documentation=https://git.resacachile.cl/anti/DECNET/wiki/Workers#supervisor
|
||||||
|
After=network-online.target decnet-bus.service
|
||||||
|
Wants=network-online.target decnet-bus.service
|
||||||
|
# Replaces the individual clusterer / campaign-clusterer / attribution /
|
||||||
|
# reuse-correlator units. Do NOT enable those alongside this one.
|
||||||
|
Conflicts=decnet-clusterer.service decnet-campaign-clusterer.service decnet-attribution.service decnet-reuse-correlator.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User={{ user }}
|
||||||
|
Group={{ group }}
|
||||||
|
WorkingDirectory={{ install_dir }}
|
||||||
|
EnvironmentFile=-{{ install_dir }}/.env.local
|
||||||
|
Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.supervise-cpu.log
|
||||||
|
ExecStart={{ venv_dir }}/bin/decnet supervise cpu
|
||||||
|
StandardOutput=append:/var/log/decnet/decnet.supervise-cpu.log
|
||||||
|
StandardError=append:/var/log/decnet/decnet.supervise-cpu.log
|
||||||
|
|
||||||
|
# These are read-heavy correlators (DB in, DB out, bus). No docker socket, no
|
||||||
|
# raw sockets — so unlike the batch supervisor this carries NO extra privilege
|
||||||
|
# beyond DB + network. The forkserver pool spawns short-lived compute children
|
||||||
|
# that inherit only this unit's sandbox.
|
||||||
|
|
||||||
|
CapabilityBoundingSet=
|
||||||
|
AmbientCapabilities=
|
||||||
|
|
||||||
|
# Security Hardening
|
||||||
|
NoNewPrivileges=yes
|
||||||
|
ProtectSystem=full
|
||||||
|
ProtectHome=read-only
|
||||||
|
PrivateTmp=yes
|
||||||
|
ProtectKernelTunables=yes
|
||||||
|
ProtectKernelModules=yes
|
||||||
|
ProtectControlGroups=yes
|
||||||
|
RestrictSUIDSGID=yes
|
||||||
|
LockPersonality=yes
|
||||||
|
ReadOnlyPaths=/var/lib/decnet
|
||||||
|
ReadWritePaths={{ install_dir }} /var/log/decnet
|
||||||
|
|
||||||
|
Restart=on-failure
|
||||||
|
RestartSec=5
|
||||||
|
TimeoutStopSec=20
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
@@ -22,5 +22,6 @@ def test_unknown_group_exits_2():
|
|||||||
assert "unknown group" in result.stdout
|
assert "unknown group" in result.stdout
|
||||||
|
|
||||||
|
|
||||||
def test_batch_group_is_known():
|
def test_known_groups():
|
||||||
assert "batch" in _GROUPS
|
assert "batch" in _GROUPS
|
||||||
|
assert "cpu" in _GROUPS
|
||||||
|
|||||||
68
tests/test_offload.py
Normal file
68
tests/test_offload.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
"""Tests for the shared CPU-kernel offload (DECNET 1.1 cpu group).
|
||||||
|
|
||||||
|
Proves the offloaded result is identical to the inline result — i.e. the kernel
|
||||||
|
and its inputs survive the process boundary and the GIL-relief path is correct,
|
||||||
|
not just fast.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import multiprocessing as mp
|
||||||
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from decnet import offload
|
||||||
|
from decnet.clustering.impl.connected_components import cluster_observations
|
||||||
|
from decnet.clustering.impl.similarity import Observation
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.asyncio
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def _clear_executor():
|
||||||
|
offload.set_executor(None)
|
||||||
|
yield
|
||||||
|
offload.set_executor(None)
|
||||||
|
|
||||||
|
|
||||||
|
async def test_inline_when_no_executor():
|
||||||
|
assert offload.get_executor() is None
|
||||||
|
out = await offload.run_kernel(lambda a, b: a + b, 2, 3)
|
||||||
|
assert out == 5 # closures are fine on the inline path (no pickling)
|
||||||
|
|
||||||
|
|
||||||
|
async def test_offload_if_false_runs_inline_even_with_pool():
|
||||||
|
with ProcessPoolExecutor(
|
||||||
|
max_workers=1, mp_context=mp.get_context("forkserver")
|
||||||
|
) as pool:
|
||||||
|
offload.set_executor(pool)
|
||||||
|
# a closure would fail to pickle — proves this stayed inline
|
||||||
|
out = await offload.run_kernel(lambda x: x * 10, 4, offload_if=False)
|
||||||
|
assert out == 40
|
||||||
|
|
||||||
|
|
||||||
|
async def test_offloaded_result_equals_inline():
|
||||||
|
obs = [
|
||||||
|
Observation(observation_id="a", ja3="x", hassh=None, asn=1),
|
||||||
|
Observation(observation_id="b", ja3="x", hassh=None, asn=1),
|
||||||
|
Observation(observation_id="c", ja3="y", hassh=None, asn=2),
|
||||||
|
]
|
||||||
|
inline = cluster_observations(obs)
|
||||||
|
|
||||||
|
with ProcessPoolExecutor(
|
||||||
|
max_workers=2, mp_context=mp.get_context("forkserver")
|
||||||
|
) as pool:
|
||||||
|
offload.set_executor(pool)
|
||||||
|
offloaded = await offload.run_kernel(cluster_observations, obs)
|
||||||
|
|
||||||
|
assert offloaded == inline # identical across the process boundary
|
||||||
|
|
||||||
|
|
||||||
|
async def test_set_get_executor_roundtrip():
|
||||||
|
assert offload.get_executor() is None
|
||||||
|
with ProcessPoolExecutor(max_workers=1) as pool:
|
||||||
|
offload.set_executor(pool)
|
||||||
|
assert offload.get_executor() is pool
|
||||||
|
offload.set_executor(None)
|
||||||
|
assert offload.get_executor() is None
|
||||||
Reference in New Issue
Block a user