DECNET/decnet/web/db/sqlmodel_repo/attribution.py

"""Repo mixin for the ``attribution_state`` table + identity stub
materialisation.

Composed onto :class:`SQLModelRepository`. Five public methods, all
serving the v0 attribution engine
(``decnet.correlation.attribution_worker``):

* :meth:`ensure_stub_identity_for_attacker` — pre-clusterer 1:1 stub
  identity creation. Idempotent under concurrent observation bursts.
* :meth:`upsert_attribution_state` — keyed on
  ``(identity_uuid, primitive)``.
* :meth:`get_attribution_state` / :meth:`get_attribution_state_for_identity`
  — single-row and per-identity reads.
* :meth:`list_multi_actor_identities` — feeds the Phase 5 cross-
  primitive correlator.

See ``development/ATTRIBUTION-ENGINE.md`` for the full design.
"""
from __future__ import annotations

import uuid as _uuid
from datetime import datetime, timezone
from typing import Any, Optional

from sqlalchemy import func, select
from sqlmodel import col

from decnet.web.db.models import (
    Attacker,
    AttackerIdentity,
    AttributionStateRow,
)
from decnet.web.db.sqlmodel_repo._helpers import _MixinBase


class AttributionMixin(_MixinBase):
    """Mixin: methods composed onto :class:`SQLModelRepository`.

    Every method opens its own session through ``self._session()`` and
    commits inside it when it writes; read methods return plain dicts /
    lists rather than ORM rows.
    """

    async def ensure_stub_identity_for_attacker(
        self, attacker_uuid: str,
    ) -> Optional[str]:
        """Return the ``identity_uuid`` for *attacker_uuid*, creating a
        degenerate 1:1 stub in ``attacker_identities`` if absent.

        Returns ``None`` when the Attacker row itself is missing — the
        attribution worker treats that as "defer" (mirrors the
        ``_handler.handle_session_ended`` posture in BEHAVE-SHELL).

        Idempotent: the second caller for the same attacker reads the
        ``identity_id`` stamped by the first caller and returns it
        without inserting again. Race: two concurrent first-callers
        could both see ``identity_id = NULL`` and both create stubs;
        the loser's commit would leave a dangling AttackerIdentity row
        with no Attacker referencing it. Acceptable in v0 (rare; rows
        are tiny; gc'd in v1 when the clusterer runs). The
        single-writer attribution worker plus the bus's per-identity
        ordering make even that race vanishingly rare in practice.
        """
        async with self._session() as session:
            attacker_row = (
                await session.execute(
                    select(Attacker).where(Attacker.uuid == attacker_uuid)
                )
            ).scalar_one_or_none()
            if attacker_row is None:
                # No Attacker yet: nothing to hang a stub on.
                return None
            if attacker_row.identity_id:
                # Already linked (by an earlier call): reuse, no insert.
                return attacker_row.identity_id

            new_uuid = _uuid.uuid4().hex
            now = datetime.now(timezone.utc)
            # The stub inherits the attacker's seen-window; it starts
            # with a single observation.
            session.add(
                AttackerIdentity(
                    uuid=new_uuid,
                    schema_version=1,
                    first_seen_at=attacker_row.first_seen,
                    last_seen_at=attacker_row.last_seen,
                    created_at=now,
                    updated_at=now,
                    observation_count=1,
                )
            )
            # Stamp the link and persist both rows in one commit.
            attacker_row.identity_id = new_uuid
            session.add(attacker_row)
            await session.commit()
            return new_uuid

    async def upsert_attribution_state(
        self, data: dict[str, Any],
    ) -> None:
        """Insert or update one ``(identity_uuid, primitive)`` row.

        ``data`` MUST carry: ``identity_uuid``, ``primitive``,
        ``current_value``, ``state``, ``confidence``,
        ``observation_count``, ``last_change_ts``,
        ``last_observation_ts``. ``schema_version`` and ``updated_at``
        are managed here: any value supplied for them in ``data`` is
        ignored on both the insert and the update path.

        Raises ``KeyError`` when ``identity_uuid`` or ``primitive`` is
        missing from ``data``.
        """
        identity_uuid = data["identity_uuid"]
        primitive = data["primitive"]
        # Key columns never change on update; managed columns are owned
        # by this method, so caller-supplied values must not leak
        # through ``setattr`` (the insert path already overrides them).
        protected = ("identity_uuid", "primitive", "schema_version", "updated_at")
        async with self._session() as session:
            existing = (
                await session.execute(
                    select(AttributionStateRow).where(
                        AttributionStateRow.identity_uuid == identity_uuid,
                        AttributionStateRow.primitive == primitive,
                    )
                )
            ).scalar_one_or_none()
            now = datetime.now(timezone.utc)
            if existing is None:
                # Insert: managed columns override anything in ``data``.
                session.add(
                    AttributionStateRow(
                        **{**data, "schema_version": 1, "updated_at": now}
                    )
                )
            else:
                # Update: copy every caller-owned field onto the row.
                for k, v in data.items():
                    if k in protected:
                        continue
                    setattr(existing, k, v)
                existing.updated_at = now
                session.add(existing)
            await session.commit()

    async def get_attribution_state(
        self, identity_uuid: str, primitive: str,
    ) -> Optional[dict[str, Any]]:
        """Single-row lookup. ``None`` when the merger has not yet run
        for this ``(identity_uuid, primitive)`` pair.

        A found row is returned as ``row.model_dump(mode="json")``.
        """
        async with self._session() as session:
            row = (
                await session.execute(
                    select(AttributionStateRow).where(
                        AttributionStateRow.identity_uuid == identity_uuid,
                        AttributionStateRow.primitive == primitive,
                    )
                )
            ).scalar_one_or_none()
            return None if row is None else row.model_dump(mode="json")

    async def get_attribution_state_for_identity(
        self, identity_uuid: str,
    ) -> list[dict[str, Any]]:
        """All attribution-state rows for one identity, primitive-
        ordered for deterministic API output.

        Each row is dumped with ``model_dump(mode="json")``; the list is
        empty when the identity has no rows.
        """
        async with self._session() as session:
            rows = (
                await session.execute(
                    select(AttributionStateRow)
                    .where(AttributionStateRow.identity_uuid == identity_uuid)
                    .order_by(AttributionStateRow.primitive)
                )
            ).scalars().all()
            return [r.model_dump(mode="json") for r in rows]

    async def list_multi_actor_identities(
        self,
    ) -> list[dict[str, Any]]:
        """Identities with ≥ 2 primitives currently in ``multi_actor``.

        Output shape::

            [{"identity_uuid": "...", "primitives": ["motor.input_modality",
                                                     "cognitive.feedback_loop_engagement"]},
             ...]

        Entries are ordered by ``identity_uuid``; each ``primitives``
        list is ordered by primitive name.

        Empty list when no identity is co-flagged. Used by the Phase 5
        cross-primitive correlator — single-primitive ``multi_actor``
        is too noisy to alarm on, two independent primitives is the
        threshold for ``attribution.profile.multi_actor_suspected``.
        """
        # Minimum number of multi_actor primitives for an identity to
        # count as co-flagged; applied in SQL and re-checked in Python.
        min_primitives = 2
        async with self._session() as session:
            # First pass: identities with ≥ 2 multi_actor rows.
            count_stmt = (
                select(
                    col(AttributionStateRow.identity_uuid),
                    func.count().label("ct"),
                )
                .where(AttributionStateRow.state == "multi_actor")
                .group_by(col(AttributionStateRow.identity_uuid))
                .having(func.count() >= min_primitives)
            )
            co_flagged = [
                row.identity_uuid
                for row in (await session.execute(count_stmt)).all()
            ]
            if not co_flagged:
                return []
            # Second pass: collect the primitive list per co-flagged
            # identity. Two queries beat one wide one because the
            # first query's count-having filter prunes the second
            # query's row set without a self-join.
            # NOTE(review): the IN list grows with the number of
            # co-flagged identities — confirm it stays below the
            # backend's bind-parameter limit.
            detail_stmt = (
                select(
                    col(AttributionStateRow.identity_uuid),
                    col(AttributionStateRow.primitive),
                )
                .where(
                    AttributionStateRow.state == "multi_actor",
                    col(AttributionStateRow.identity_uuid).in_(co_flagged),
                )
                .order_by(
                    col(AttributionStateRow.identity_uuid),
                    col(AttributionStateRow.primitive),
                )
            )
            grouped: dict[str, list[str]] = {}
            for row in (await session.execute(detail_stmt)).all():
                grouped.setdefault(row.identity_uuid, []).append(
                    row.primitive,
                )
            # The two statements are not guaranteed to see the same
            # snapshot (depends on the backend's isolation level): a row
            # may leave ``multi_actor`` between the passes. Re-check the
            # threshold so the documented "≥ 2" contract always holds.
            return [
                {"identity_uuid": iuuid, "primitives": prims}
                for iuuid, prims in grouped.items()
                if len(prims) >= min_primitives
            ]