feat(correlation/attribution): substrate + idle handler (Phase 1)

v0 Phase 1 of ATTRIBUTION-ENGINE.md: * AttributionStateRow SQLModel keyed on (identity_uuid, primitive) per ANTI direction — re-keying state rows when the v1 clusterer merges attackers is the migration debt v0 should not bake in. ATTRIBUTION-ENGINE.md updated with the deviation note. * AttributionMixin: ensure_stub_identity_for_attacker, idempotent upsert_attribution_state, get_attribution_state[_for_identity], list_multi_actor_identities (the Phase 5 correlator's read). * attribution.profile.{state_changed,multi_actor_suspected} bus topics + builder; wiki Service-Bus.md updated separately. * attribution_worker.py: subscribes to attacker.observation.>, ensures stub identity per event, logs and continues. No merger, no state writes, no derived events — Phase 4 wires those. * attribution/{aggregate.py,_thresholds.py} skeletons: Phase 2 fills _aggregate_categorical, Phase 3 adds numeric+hash+dispatcher.
2026-05-08 23:16:13 -04:00
parent e94ab608d9
commit c2891d6cca
15 changed files with 1203 additions and 0 deletions
--- a/decnet/web/db/models/init.py
+++ b/decnet/web/db/models/init.py
@@ -59,6 +59,9 @@ from .attachments import (
 from .observations import (
    ObservationRow,
 )
+from .attribution_state import (
+    AttributionStateRow,
+)
 from .campaigns import (
    Campaign,
    CampaignsResponse,
@@ -252,6 +255,7 @@ __all__ = [
    "AttackerIdentity",
    "AttackerIntel",
    "AttackersResponse",
+    "AttributionStateRow",
    "ObservationRow",
    "ObservedAttachment",
    "SmtpTarget",
--- a/decnet/web/db/models/attribution_state.py
+++ b/decnet/web/db/models/attribution_state.py
@@ -0,0 +1,78 @@
+"""Per-(identity, primitive) attribution state — v0 of the
+attribution engine.
+
+Materialised view of the state machine in
+``decnet.correlation.attribution.aggregate``. Re-derivable from
+``observations`` + the DEBT-032 fingerprint-rotation log; this row is
+a cache for cheap dashboard reads, not a source of truth.
+
+Keyed on ``identity_uuid``, not ``attacker_uuid``: pre-clusterer,
+every Attacker maps 1:1 to a stub row in ``attacker_identities``
+(``merged_into_uuid = NULL``) so the key is stable across the v0 / v1
+boundary. When v1's clusterer eventually merges identities, the loser
+row's state is recomputed from the union of observations under the
+winner — no schema change, no column-rename migration.
+
+This deviates from ``development/ATTRIBUTION-ENGINE.md`` §"Subject of
+attribution in v0" (which resolved on ``attacker_uuid``); the doc gets
+a deviation note in the same commit that ships this file.
+"""
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import Any
+
+from sqlalchemy import JSON, Column, Index
+from sqlmodel import Field, SQLModel
+
+
+class AttributionStateRow(SQLModel, table=True):
+    """One state row per (identity, primitive). At most one row per
+    pair — composite PK enforces it.
+    """
+
+    __tablename__ = "attribution_state"
+    __table_args__ = (
+        Index("ix_attribution_state_state", "state"),
+        Index("ix_attribution_state_last_change", "last_change_ts"),
+        Index(
+            "ix_attribution_state_identity_state",
+            "identity_uuid", "state",
+        ),
+    )
+
+    # ── key ────────────────────────────────────────────────────────────
+    identity_uuid: str = Field(
+        foreign_key="attacker_identities.uuid", primary_key=True,
+    )
+    primitive: str = Field(primary_key=True)
+
+    # ── derived state ──────────────────────────────────────────────────
+    # Mirrors the BEHAVE Observation ``value`` column shape so the
+    # frontend can render the merger output the same way it renders raw
+    # latest-wins values today (BEHAVE-INTEGRATION.md Q3).
+    current_value: dict[str, Any] | str | int | float | bool | list = Field(
+        sa_column=Column(JSON, nullable=False),
+    )
+    # 'unknown' | 'stable' | 'drifting' | 'conflicted' | 'multi_actor'.
+    # Five states, frozen — see ATTRIBUTION-ENGINE.md §"State machine".
+    state: str
+    # Engine's confidence in the *state assertion*, not in any verdict
+    # about the attacker. ``multi_actor`` is capped at 0.6 by
+    # convention; other states use the merger's per-ValueKind formula.
+    confidence: float
+    # How many observations underlie this row. Used by the API to gate
+    # ``unknown`` (< 3 obs) without re-querying ``observations``.
+    observation_count: int = Field(default=0)
+    # When ``state`` last flipped. Equals ``updated_at`` on insert.
+    last_change_ts: float
+    # Most recent observation that fed this row. Used by the merger to
+    # detect drift windows without a full observation re-scan.
+    last_observation_ts: float
+
+    # ── audit ──────────────────────────────────────────────────────────
+    # Mirrors AttackerIdentity convention (federation gossip in v2).
+    schema_version: int = Field(default=1)
+    updated_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc),
+    )
--- a/decnet/web/db/repository.py
+++ b/decnet/web/db/repository.py
@@ -1492,3 +1492,86 @@ class BaseRepository(ABC):
        SQLModel TTP mixin.
        """
        return []
+
+    # ─── Attribution engine (v0 — aggregation only) ────────────────────
+    # See development/ATTRIBUTION-ENGINE.md. The engine consumes
+    # ``attacker.observation.*`` events and writes per-(identity,
+    # primitive) state rows. Pre-clusterer, every Attacker maps 1:1
+    # to a stub AttackerIdentity row so the keying is stable across
+    # the v0 / v1 boundary.
+
+    @abstractmethod
+    async def ensure_stub_identity_for_attacker(
+        self, attacker_uuid: str,
+    ) -> Optional[str]:
+        """Return the ``identity_uuid`` for *attacker_uuid*, creating a
+        degenerate 1:1 stub in ``attacker_identities`` if the attacker
+        does not yet have one.
+
+        Returns ``None`` if the attacker row itself is missing (the
+        worker treats that as "defer" — the profiler tick has not yet
+        materialised the Attacker; same posture as
+        ``_handler.handle_session_ended`` in BEHAVE-SHELL).
+
+        Idempotent under concurrent calls: the second caller sees the
+        first caller's stamp and returns the same uuid. Implementations
+        are responsible for serialising the read-then-insert against
+        the bus's at-least-once delivery.
+
+        The third return value (boolean) signalling "newly created" is
+        deliberately omitted — the worker emits ``identity.formed`` on
+        a transition observed via the row's absence on its first call,
+        not via a flag from the repo. Keeps the repo idempotent and
+        flag-free.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def upsert_attribution_state(self, data: dict[str, Any]) -> None:
+        """Insert or update an :class:`AttributionStateRow` keyed on
+        ``(identity_uuid, primitive)``.
+
+        ``data`` MUST carry: ``identity_uuid``, ``primitive``,
+        ``current_value``, ``state``, ``confidence``,
+        ``observation_count``, ``last_change_ts``,
+        ``last_observation_ts``. ``schema_version`` defaults to 1.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def get_attribution_state_for_identity(
+        self, identity_uuid: str,
+    ) -> list[dict[str, Any]]:
+        """Return every attribution-state row for *identity_uuid*.
+
+        Empty list when the identity has no derived state yet (e.g.
+        observations have arrived but the engine has not run, or the
+        engine has not produced ≥ 3 observations per primitive). The
+        attribution API surface and AttackerDetail badge renderer both
+        consume this projection.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def get_attribution_state(
+        self, identity_uuid: str, primitive: str,
+    ) -> Optional[dict[str, Any]]:
+        """Return one ``(identity_uuid, primitive)`` row, or ``None``.
+
+        Used by the attribution worker on each inbound observation to
+        load the prior state before running the merger. ``None`` means
+        "no prior state — initialise from this observation alone".
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def list_multi_actor_identities(
+        self,
+    ) -> list[dict[str, Any]]:
+        """List ``{identity_uuid, primitives}`` for identities that
+        currently have ≥ 2 primitives flagged ``multi_actor``.
+
+        Backs the cross-primitive correlator (Phase 5). Empty list when
+        no identity is co-flagged.
+        """
+        raise NotImplementedError
--- a/decnet/web/db/sqlmodel_repo/init.py
+++ b/decnet/web/db/sqlmodel_repo/init.py
@@ -35,6 +35,7 @@ from decnet.web.db.sqlmodel_repo._helpers import (  # noqa: F401  (re-exported f
 )
 from decnet.web.db.sqlmodel_repo.attacker_intel import AttackerIntelMixin
 from decnet.web.db.sqlmodel_repo.attackers import AttackersMixin
+from decnet.web.db.sqlmodel_repo.attribution import AttributionMixin
 from decnet.web.db.sqlmodel_repo.auth import AuthMixin
 from decnet.web.db.sqlmodel_repo.bounties import BountiesMixin
 from decnet.web.db.sqlmodel_repo.campaigns import CampaignsMixin
@@ -58,6 +59,7 @@ from decnet.web.db.sqlmodel_repo.webhooks import WebhooksMixin
 class SQLModelRepository(
    AttackerIntelMixin,
    AttackersMixin,
+    AttributionMixin,
    AuthMixin,
    BountiesMixin,
    CampaignsMixin,
--- a/decnet/web/db/sqlmodel_repo/attribution.py
+++ b/decnet/web/db/sqlmodel_repo/attribution.py
@@ -0,0 +1,215 @@
+"""Repo mixin for the ``attribution_state`` table + identity stub
+materialisation.
+
+Composed onto :class:`SQLModelRepository`. Five public methods, all
+serving the v0 attribution engine
+(``decnet.correlation.attribution_worker``):
+
+* :meth:`ensure_stub_identity_for_attacker` — pre-clusterer 1:1 stub
+  identity creation. Idempotent under concurrent observation bursts.
+* :meth:`upsert_attribution_state` — keyed on
+  ``(identity_uuid, primitive)``.
+* :meth:`get_attribution_state` / :meth:`get_attribution_state_for_identity`
+  — single-row and per-identity reads.
+* :meth:`list_multi_actor_identities` — feeds the Phase 5 cross-
+  primitive correlator.
+
+See ``development/ATTRIBUTION-ENGINE.md`` for the full design.
+"""
+from __future__ import annotations
+
+import uuid as _uuid
+from datetime import datetime, timezone
+from typing import Any, Optional
+
+from sqlalchemy import func, select
+from sqlmodel import col
+
+from decnet.web.db.models import (
+    Attacker,
+    AttackerIdentity,
+    AttributionStateRow,
+)
+from decnet.web.db.sqlmodel_repo._helpers import _MixinBase
+
+
+class AttributionMixin(_MixinBase):
+    """Mixin: methods composed onto :class:`SQLModelRepository`."""
+
+    async def ensure_stub_identity_for_attacker(
+        self, attacker_uuid: str,
+    ) -> Optional[str]:
+        """Return the ``identity_uuid`` for *attacker_uuid*, creating a
+        degenerate 1:1 stub in ``attacker_identities`` if absent.
+
+        Returns ``None`` when the Attacker row itself is missing — the
+        attribution worker treats that as "defer" (mirrors the
+        ``_handler.handle_session_ended`` posture in BEHAVE-SHELL).
+
+        Idempotent: the second caller for the same attacker reads the
+        ``identity_id`` stamped by the first caller and returns it
+        without inserting again. Race: two concurrent first-callers
+        could both see ``identity_id = NULL`` and both create stubs;
+        the loser's commit would leave a dangling AttackerIdentity row
+        with no Attacker referencing it. Acceptable in v0 (rare; rows
+        are tiny; gc'd in v1 when the clusterer runs). The
+        single-writer attribution worker plus the bus's per-identity
+        ordering make even that race vanishingly rare in practice.
+        """
+        async with self._session() as session:
+            attacker_row = (
+                await session.execute(
+                    select(Attacker).where(Attacker.uuid == attacker_uuid)
+                )
+            ).scalar_one_or_none()
+            if attacker_row is None:
+                return None
+            if attacker_row.identity_id:
+                return attacker_row.identity_id
+            new_uuid = _uuid.uuid4().hex
+            now = datetime.now(timezone.utc)
+            session.add(
+                AttackerIdentity(
+                    uuid=new_uuid,
+                    schema_version=1,
+                    first_seen_at=attacker_row.first_seen,
+                    last_seen_at=attacker_row.last_seen,
+                    created_at=now,
+                    updated_at=now,
+                    observation_count=1,
+                )
+            )
+            attacker_row.identity_id = new_uuid
+            session.add(attacker_row)
+            await session.commit()
+            return new_uuid
+
+    async def upsert_attribution_state(
+        self, data: dict[str, Any],
+    ) -> None:
+        """Insert or update one ``(identity_uuid, primitive)`` row.
+
+        ``data`` MUST carry: ``identity_uuid``, ``primitive``,
+        ``current_value``, ``state``, ``confidence``,
+        ``observation_count``, ``last_change_ts``,
+        ``last_observation_ts``. ``schema_version`` and ``updated_at``
+        are managed here.
+        """
+        identity_uuid = data["identity_uuid"]
+        primitive = data["primitive"]
+        async with self._session() as session:
+            existing = (
+                await session.execute(
+                    select(AttributionStateRow).where(
+                        AttributionStateRow.identity_uuid == identity_uuid,
+                        AttributionStateRow.primitive == primitive,
+                    )
+                )
+            ).scalar_one_or_none()
+            now = datetime.now(timezone.utc)
+            if existing is not None:
+                for k, v in data.items():
+                    if k in ("identity_uuid", "primitive"):
+                        continue
+                    setattr(existing, k, v)
+                existing.updated_at = now
+                session.add(existing)
+            else:
+                session.add(
+                    AttributionStateRow(
+                        **{**data, "schema_version": 1, "updated_at": now}
+                    )
+                )
+            await session.commit()
+
+    async def get_attribution_state(
+        self, identity_uuid: str, primitive: str,
+    ) -> Optional[dict[str, Any]]:
+        """Single-row lookup. ``None`` when the merger has not yet run
+        for this ``(identity_uuid, primitive)`` pair."""
+        async with self._session() as session:
+            row = (
+                await session.execute(
+                    select(AttributionStateRow).where(
+                        AttributionStateRow.identity_uuid == identity_uuid,
+                        AttributionStateRow.primitive == primitive,
+                    )
+                )
+            ).scalar_one_or_none()
+            return None if row is None else row.model_dump(mode="json")
+
+    async def get_attribution_state_for_identity(
+        self, identity_uuid: str,
+    ) -> list[dict[str, Any]]:
+        """All attribution-state rows for one identity, primitive-
+        ordered for deterministic API output."""
+        async with self._session() as session:
+            rows = (
+                await session.execute(
+                    select(AttributionStateRow)
+                    .where(AttributionStateRow.identity_uuid == identity_uuid)
+                    .order_by(AttributionStateRow.primitive)
+                )
+            ).scalars().all()
+            return [r.model_dump(mode="json") for r in rows]
+
+    async def list_multi_actor_identities(
+        self,
+    ) -> list[dict[str, Any]]:
+        """Identities with ≥ 2 primitives currently in ``multi_actor``.
+
+        Output shape::
+
+            [{"identity_uuid": "...", "primitives": ["motor.input_modality",
+                                                     "cognitive.feedback_loop_engagement"]},
+             ...]
+
+        Empty list when no identity is co-flagged. Used by the Phase 5
+        cross-primitive correlator — single-primitive ``multi_actor``
+        is too noisy to alarm on, two independent primitives is the
+        threshold for ``attribution.profile.multi_actor_suspected``.
+        """
+        async with self._session() as session:
+            # First pass: identities with ≥ 2 multi_actor rows.
+            count_stmt = (
+                select(
+                    col(AttributionStateRow.identity_uuid),
+                    func.count().label("ct"),
+                )
+                .where(AttributionStateRow.state == "multi_actor")
+                .group_by(col(AttributionStateRow.identity_uuid))
+                .having(func.count() >= 2)
+            )
+            co_flagged = [
+                row.identity_uuid
+                for row in (await session.execute(count_stmt)).all()
+            ]
+            if not co_flagged:
+                return []
+            # Second pass: collect the primitive list per co-flagged
+            # identity. Two queries beat one wide one because the
+            # first query's count-having filter prunes the second
+            # query's row set without a self-join.
+            detail_stmt = (
+                select(
+                    col(AttributionStateRow.identity_uuid),
+                    col(AttributionStateRow.primitive),
+                )
+                .where(
+                    AttributionStateRow.state == "multi_actor",
+                    col(AttributionStateRow.identity_uuid).in_(co_flagged),
+                )
+                .order_by(
+                    col(AttributionStateRow.identity_uuid),
+                    col(AttributionStateRow.primitive),
+                )
+            )
+            grouped: dict[str, list[str]] = {}
+            for row in (await session.execute(detail_stmt)).all():
+                grouped.setdefault(row.identity_uuid, []).append(
+                    row.primitive,
+                )
+            return [
+                {"identity_uuid": iuuid, "primitives": prims}
+                for iuuid, prims in grouped.items()
+            ]