feat(correlation/attribution): substrate + idle handler (Phase 1)

v0 Phase 1 of ATTRIBUTION-ENGINE.md:

* AttributionStateRow SQLModel keyed on (identity_uuid, primitive)
  per ANTI direction — re-keying state rows when the v1 clusterer
  merges attackers is the migration debt v0 should not bake in.
  ATTRIBUTION-ENGINE.md updated with the deviation note.
* AttributionMixin: ensure_stub_identity_for_attacker, idempotent
  upsert_attribution_state, get_attribution_state[_for_identity],
  list_multi_actor_identities (the Phase 5 correlator's read).
* attribution.profile.{state_changed,multi_actor_suspected} bus
  topics + builder; wiki Service-Bus.md updated separately.
* attribution_worker.py: subscribes to attacker.observation.>,
  ensures stub identity per event, logs and continues. No merger,
  no state writes, no derived events — Phase 4 wires those.
* attribution/{aggregate.py,_thresholds.py} skeletons: Phase 2
  fills _aggregate_categorical, Phase 3 adds numeric+hash+dispatcher.
This commit is contained in:
2026-05-08 23:16:13 -04:00
parent e94ab608d9
commit c2891d6cca
15 changed files with 1203 additions and 0 deletions

View File

@@ -59,6 +59,9 @@ from .attachments import (
from .observations import (
ObservationRow,
)
from .attribution_state import (
AttributionStateRow,
)
from .campaigns import (
Campaign,
CampaignsResponse,
@@ -252,6 +255,7 @@ __all__ = [
"AttackerIdentity",
"AttackerIntel",
"AttackersResponse",
"AttributionStateRow",
"ObservationRow",
"ObservedAttachment",
"SmtpTarget",

View File

@@ -0,0 +1,78 @@
"""Per-(identity, primitive) attribution state — v0 of the
attribution engine.
Materialised view of the state machine in
``decnet.correlation.attribution.aggregate``. Re-derivable from
``observations`` + the DEBT-032 fingerprint-rotation log; this row is
a cache for cheap dashboard reads, not a source of truth.
Keyed on ``identity_uuid``, not ``attacker_uuid``: pre-clusterer,
every Attacker maps 1:1 to a stub row in ``attacker_identities``
(``merged_into_uuid = NULL``) so the key is stable across the v0 / v1
boundary. When v1's clusterer eventually merges identities, the loser
row's state is recomputed from the union of observations under the
winner — no schema change, no column-rename migration.
This deviates from ``development/ATTRIBUTION-ENGINE.md`` §"Subject of
attribution in v0" (which resolved on ``attacker_uuid``); the doc gets
a deviation note in the same commit that ships this file.
"""
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
from sqlalchemy import JSON, Column, Index
from sqlmodel import Field, SQLModel
class AttributionStateRow(SQLModel, table=True):
"""One state row per (identity, primitive). At most one row per
pair — composite PK enforces it.
"""
__tablename__ = "attribution_state"
__table_args__ = (
Index("ix_attribution_state_state", "state"),
Index("ix_attribution_state_last_change", "last_change_ts"),
Index(
"ix_attribution_state_identity_state",
"identity_uuid", "state",
),
)
# ── key ────────────────────────────────────────────────────────────
identity_uuid: str = Field(
foreign_key="attacker_identities.uuid", primary_key=True,
)
primitive: str = Field(primary_key=True)
# ── derived state ──────────────────────────────────────────────────
# Mirrors the BEHAVE Observation ``value`` column shape so the
# frontend can render the merger output the same way it renders raw
# latest-wins values today (BEHAVE-INTEGRATION.md Q3).
current_value: dict[str, Any] | str | int | float | bool | list = Field(
sa_column=Column(JSON, nullable=False),
)
# 'unknown' | 'stable' | 'drifting' | 'conflicted' | 'multi_actor'.
# Five states, frozen — see ATTRIBUTION-ENGINE.md §"State machine".
state: str
# Engine's confidence in the *state assertion*, not in any verdict
# about the attacker. ``multi_actor`` is capped at 0.6 by
# convention; other states use the merger's per-ValueKind formula.
confidence: float
# How many observations underlie this row. Used by the API to gate
# ``unknown`` (< 3 obs) without re-querying ``observations``.
observation_count: int = Field(default=0)
# When ``state`` last flipped. Equals ``updated_at`` on insert.
last_change_ts: float
# Most recent observation that fed this row. Used by the merger to
# detect drift windows without a full observation re-scan.
last_observation_ts: float
# ── audit ──────────────────────────────────────────────────────────
# Mirrors AttackerIdentity convention (federation gossip in v2).
schema_version: int = Field(default=1)
updated_at: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
)

View File

@@ -1492,3 +1492,86 @@ class BaseRepository(ABC):
SQLModel TTP mixin.
"""
return []
# ─── Attribution engine (v0 — aggregation only) ────────────────────
# See development/ATTRIBUTION-ENGINE.md. The engine consumes
# ``attacker.observation.*`` events and writes per-(identity,
# primitive) state rows. Pre-clusterer, every Attacker maps 1:1
# to a stub AttackerIdentity row so the keying is stable across
# the v0 / v1 boundary.
@abstractmethod
async def ensure_stub_identity_for_attacker(
self, attacker_uuid: str,
) -> Optional[str]:
"""Return the ``identity_uuid`` for *attacker_uuid*, creating a
degenerate 1:1 stub in ``attacker_identities`` if the attacker
does not yet have one.
Returns ``None`` if the attacker row itself is missing (the
worker treats that as "defer" — the profiler tick has not yet
materialised the Attacker; same posture as
``_handler.handle_session_ended`` in BEHAVE-SHELL).
Idempotent under concurrent calls: the second caller sees the
first caller's stamp and returns the same uuid. Implementations
are responsible for serialising the read-then-insert against
the bus's at-least-once delivery.
The third return value (boolean) signalling "newly created" is
deliberately omitted — the worker emits ``identity.formed`` on
a transition observed via the row's absence on its first call,
not via a flag from the repo. Keeps the repo idempotent and
flag-free.
"""
raise NotImplementedError
@abstractmethod
async def upsert_attribution_state(self, data: dict[str, Any]) -> None:
"""Insert or update an :class:`AttributionStateRow` keyed on
``(identity_uuid, primitive)``.
``data`` MUST carry: ``identity_uuid``, ``primitive``,
``current_value``, ``state``, ``confidence``,
``observation_count``, ``last_change_ts``,
``last_observation_ts``. ``schema_version`` defaults to 1.
"""
raise NotImplementedError
@abstractmethod
async def get_attribution_state_for_identity(
self, identity_uuid: str,
) -> list[dict[str, Any]]:
"""Return every attribution-state row for *identity_uuid*.
Empty list when the identity has no derived state yet (e.g.
observations have arrived but the engine has not run, or the
engine has not produced ≥ 3 observations per primitive). The
attribution API surface and AttackerDetail badge renderer both
consume this projection.
"""
raise NotImplementedError
@abstractmethod
async def get_attribution_state(
self, identity_uuid: str, primitive: str,
) -> Optional[dict[str, Any]]:
"""Return one ``(identity_uuid, primitive)`` row, or ``None``.
Used by the attribution worker on each inbound observation to
load the prior state before running the merger. ``None`` means
"no prior state — initialise from this observation alone".
"""
raise NotImplementedError
@abstractmethod
async def list_multi_actor_identities(
self,
) -> list[dict[str, Any]]:
"""List ``{identity_uuid, primitives}`` for identities that
currently have ≥ 2 primitives flagged ``multi_actor``.
Backs the cross-primitive correlator (Phase 5). Empty list when
no identity is co-flagged.
"""
raise NotImplementedError

View File

@@ -35,6 +35,7 @@ from decnet.web.db.sqlmodel_repo._helpers import ( # noqa: F401 (re-exported f
)
from decnet.web.db.sqlmodel_repo.attacker_intel import AttackerIntelMixin
from decnet.web.db.sqlmodel_repo.attackers import AttackersMixin
from decnet.web.db.sqlmodel_repo.attribution import AttributionMixin
from decnet.web.db.sqlmodel_repo.auth import AuthMixin
from decnet.web.db.sqlmodel_repo.bounties import BountiesMixin
from decnet.web.db.sqlmodel_repo.campaigns import CampaignsMixin
@@ -58,6 +59,7 @@ from decnet.web.db.sqlmodel_repo.webhooks import WebhooksMixin
class SQLModelRepository(
AttackerIntelMixin,
AttackersMixin,
AttributionMixin,
AuthMixin,
BountiesMixin,
CampaignsMixin,

View File

@@ -0,0 +1,215 @@
"""Repo mixin for the ``attribution_state`` table + identity stub
materialisation.
Composed onto :class:`SQLModelRepository`. Five public methods, all
serving the v0 attribution engine
(``decnet.correlation.attribution_worker``):
* :meth:`ensure_stub_identity_for_attacker` — pre-clusterer 1:1 stub
identity creation. Idempotent under concurrent observation bursts.
* :meth:`upsert_attribution_state` — keyed on
``(identity_uuid, primitive)``.
* :meth:`get_attribution_state` / :meth:`get_attribution_state_for_identity`
— single-row and per-identity reads.
* :meth:`list_multi_actor_identities` — feeds the Phase 5 cross-
primitive correlator.
See ``development/ATTRIBUTION-ENGINE.md`` for the full design.
"""
from __future__ import annotations
import uuid as _uuid
from datetime import datetime, timezone
from typing import Any, Optional
from sqlalchemy import func, select
from sqlmodel import col
from decnet.web.db.models import (
Attacker,
AttackerIdentity,
AttributionStateRow,
)
from decnet.web.db.sqlmodel_repo._helpers import _MixinBase
class AttributionMixin(_MixinBase):
"""Mixin: methods composed onto :class:`SQLModelRepository`."""
async def ensure_stub_identity_for_attacker(
self, attacker_uuid: str,
) -> Optional[str]:
"""Return the ``identity_uuid`` for *attacker_uuid*, creating a
degenerate 1:1 stub in ``attacker_identities`` if absent.
Returns ``None`` when the Attacker row itself is missing — the
attribution worker treats that as "defer" (mirrors the
``_handler.handle_session_ended`` posture in BEHAVE-SHELL).
Idempotent: the second caller for the same attacker reads the
``identity_id`` stamped by the first caller and returns it
without inserting again. Race: two concurrent first-callers
could both see ``identity_id = NULL`` and both create stubs;
the loser's commit would leave a dangling AttackerIdentity row
with no Attacker referencing it. Acceptable in v0 (rare; rows
are tiny; gc'd in v1 when the clusterer runs). The
single-writer attribution worker plus the bus's per-identity
ordering make even that race vanishingly rare in practice.
"""
async with self._session() as session:
attacker_row = (
await session.execute(
select(Attacker).where(Attacker.uuid == attacker_uuid)
)
).scalar_one_or_none()
if attacker_row is None:
return None
if attacker_row.identity_id:
return attacker_row.identity_id
new_uuid = _uuid.uuid4().hex
now = datetime.now(timezone.utc)
session.add(
AttackerIdentity(
uuid=new_uuid,
schema_version=1,
first_seen_at=attacker_row.first_seen,
last_seen_at=attacker_row.last_seen,
created_at=now,
updated_at=now,
observation_count=1,
)
)
attacker_row.identity_id = new_uuid
session.add(attacker_row)
await session.commit()
return new_uuid
async def upsert_attribution_state(
self, data: dict[str, Any],
) -> None:
"""Insert or update one ``(identity_uuid, primitive)`` row.
``data`` MUST carry: ``identity_uuid``, ``primitive``,
``current_value``, ``state``, ``confidence``,
``observation_count``, ``last_change_ts``,
``last_observation_ts``. ``schema_version`` and ``updated_at``
are managed here.
"""
identity_uuid = data["identity_uuid"]
primitive = data["primitive"]
async with self._session() as session:
existing = (
await session.execute(
select(AttributionStateRow).where(
AttributionStateRow.identity_uuid == identity_uuid,
AttributionStateRow.primitive == primitive,
)
)
).scalar_one_or_none()
now = datetime.now(timezone.utc)
if existing is not None:
for k, v in data.items():
if k in ("identity_uuid", "primitive"):
continue
setattr(existing, k, v)
existing.updated_at = now
session.add(existing)
else:
session.add(
AttributionStateRow(
**{**data, "schema_version": 1, "updated_at": now}
)
)
await session.commit()
async def get_attribution_state(
self, identity_uuid: str, primitive: str,
) -> Optional[dict[str, Any]]:
"""Single-row lookup. ``None`` when the merger has not yet run
for this ``(identity_uuid, primitive)`` pair."""
async with self._session() as session:
row = (
await session.execute(
select(AttributionStateRow).where(
AttributionStateRow.identity_uuid == identity_uuid,
AttributionStateRow.primitive == primitive,
)
)
).scalar_one_or_none()
return None if row is None else row.model_dump(mode="json")
async def get_attribution_state_for_identity(
self, identity_uuid: str,
) -> list[dict[str, Any]]:
"""All attribution-state rows for one identity, primitive-
ordered for deterministic API output."""
async with self._session() as session:
rows = (
await session.execute(
select(AttributionStateRow)
.where(AttributionStateRow.identity_uuid == identity_uuid)
.order_by(AttributionStateRow.primitive)
)
).scalars().all()
return [r.model_dump(mode="json") for r in rows]
async def list_multi_actor_identities(
self,
) -> list[dict[str, Any]]:
"""Identities with ≥ 2 primitives currently in ``multi_actor``.
Output shape::
[{"identity_uuid": "...", "primitives": ["motor.input_modality",
"cognitive.feedback_loop_engagement"]},
...]
Empty list when no identity is co-flagged. Used by the Phase 5
cross-primitive correlator — single-primitive ``multi_actor``
is too noisy to alarm on, two independent primitives is the
threshold for ``attribution.profile.multi_actor_suspected``.
"""
async with self._session() as session:
# First pass: identities with ≥ 2 multi_actor rows.
count_stmt = (
select(
col(AttributionStateRow.identity_uuid),
func.count().label("ct"),
)
.where(AttributionStateRow.state == "multi_actor")
.group_by(col(AttributionStateRow.identity_uuid))
.having(func.count() >= 2)
)
co_flagged = [
row.identity_uuid
for row in (await session.execute(count_stmt)).all()
]
if not co_flagged:
return []
# Second pass: collect the primitive list per co-flagged
# identity. Two queries beat one wide one because the
# first query's count-having filter prunes the second
# query's row set without a self-join.
detail_stmt = (
select(
col(AttributionStateRow.identity_uuid),
col(AttributionStateRow.primitive),
)
.where(
AttributionStateRow.state == "multi_actor",
col(AttributionStateRow.identity_uuid).in_(co_flagged),
)
.order_by(
col(AttributionStateRow.identity_uuid),
col(AttributionStateRow.primitive),
)
)
grouped: dict[str, list[str]] = {}
for row in (await session.execute(detail_stmt)).all():
grouped.setdefault(row.identity_uuid, []).append(
row.primitive,
)
return [
{"identity_uuid": iuuid, "primitives": prims}
for iuuid, prims in grouped.items()
]