feat(creds): cred-reuse foundation + vectorstore scaffold

Lays the storage and bus substrate for the "credential reuse patterns" task in DEVELOPMENT.md and scaffolds decnet/vectorstore/ as the future substrate for statistical attacker re-identification over behavioral fingerprints. No correlator, profiler, API, or dashboard wiring in this commit — see TODO.md for the handoff.

Schema:
- Credential.attacker_uuid (nullable FK to attackers.uuid), backfilled by the profiler post-write to avoid coupling the capture path to the profiler's ordering.
- CredentialReuse table — UUID PK, JSON list columns for the accumulating attacker_uuids/ips/deckies/services, target_count (the discriminative scalar), confidence reserved for a future fuzzy-credential pass.

Repo:
- upsert_credential_reuse / list_credential_reuses / get_credential_reuse_by_id / update_credential_attacker_uuid.
- Renamed pre-existing get_credential_reuse(secret_sha256) to get_credential_attempts_for_secret(secret_sha256) — the new findings table needs the cleaner name.

Bus topics:
- credential.captured (one per Credential upsert)
- credential.reuse.detected (correlator-emitted on insert/grow)

Vectorstore subpackage (decnet/vectorstore/, flat layout mirroring decnet/bus/):
- BaseVectorStore ABC keyed by (kind, id) — kind discriminator means new feature families are additive, no schema migration.
- FakeVectorStore (in-memory L2 KNN), NullVectorStore (no-op for DECNET_VECTORSTORE_ENABLED=false), SqliteVecVectorStore (lazy sqlite_vec extension load, one vec0 virtual table per kind).
- get_vectorstore() env-driven dispatch with graceful fallback to FakeVectorStore when the sqlite-vec extension isn't on the host, so workers don't crash on a missing optional dep.

Tests: 26 new (11 cred-reuse repo, 15 vectorstore). Existing credentials and base-repo tests updated for the rename. Total: 34 passing on the touched files.
2026-04-26 03:18:34 -04:00
parent 817ce32e6d
commit ce4be68501
17 changed files with 1615 additions and 11 deletions
--- a/decnet/web/db/models/__init__.py
+++ b/decnet/web/db/models/__init__.py
@@ -49,6 +49,8 @@ from .logs import (
    Bounty,
    BountyResponse,
    Credential,
+    CredentialReuse,
+    CredentialReuseResponse,
    CredentialsResponse,
    Log,
    LogsResponse,
@@ -170,6 +172,8 @@ __all__ = [
    "Bounty",
    "BountyResponse",
    "Credential",
+    "CredentialReuse",
+    "CredentialReuseResponse",
    "CredentialsResponse",
    "Log",
    "LogsResponse",
--- a/decnet/web/db/models/logs.py
+++ b/decnet/web/db/models/logs.py
@@ -3,7 +3,7 @@ from datetime import datetime, timezone
 from typing import Any, List, Optional

 from pydantic import BaseModel
-from sqlalchemy import Column, Index, Text
+from sqlalchemy import Column, Index, Text, UniqueConstraint
 from sqlmodel import Field, SQLModel

 from ._base import _BIG_TEXT
@@ -54,9 +54,13 @@ class Credential(SQLModel, table=True):
    LDAP. Nullable for principal-less mechanisms (Redis AUTH, bearer
    tokens). Fully service-specific keys ride in ``fields`` JSON.

-    Dedup contract: same (attacker_uuid, decky, service, secret_sha256,
+    Dedup contract: same (attacker_ip, decky, service, secret_sha256,
    principal_or_empty) tuple → upsert, bumps ``attempt_count`` and
    ``last_seen``. Different secret or different principal → new row.
+
+    ``attacker_uuid`` is backfilled by the profiler once an Attacker row
+    has been minted for the source IP. It is nullable on first write so
+    the credential ingest path stays decoupled from the profiler.
    """
    __tablename__ = "credentials"
    __table_args__ = (
@@ -64,11 +68,15 @@ class Credential(SQLModel, table=True):
        Index("ix_credentials_principal_service", "principal", "service"),
    )
    id: Optional[int] = Field(default=None, primary_key=True)
-    # Keyed by attacker IP (not attackers.uuid) to match Bounty's pattern
-    # and avoid the chicken-and-egg of writing a credential row before
-    # the profiler has minted the Attacker. Index covers the join path
-    # cred_reuse → Attacker.ip.
+    # Keyed by attacker IP (not attackers.uuid) on the write path to
+    # avoid the chicken-and-egg of landing a credential before the
+    # profiler has minted the Attacker. The profiler backfills
+    # ``attacker_uuid`` once it knows the IP, so cross-IP reuse queries
+    # eventually have an indexed FK to traverse.
    attacker_ip: str = Field(index=True)
+    attacker_uuid: Optional[str] = Field(
+        default=None, foreign_key="attackers.uuid", index=True
+    )
    decky_name: str = Field(index=True)
    service: str = Field(index=True)
    principal: Optional[str] = Field(default=None, index=True, max_length=256)
@@ -107,6 +115,77 @@ class Credential(SQLModel, table=True):
    attempt_count: int = Field(default=1)


+class CredentialReuse(SQLModel, table=True):
+    """One observed credential reuse pattern across deckies and/or services.
+
+    A row here is a *finding* produced by the correlator: the same
+    ``(secret_sha256, secret_kind, principal)`` tuple was observed
+    against ``target_count`` distinct decky×service pairs. Rows are
+    upserted on the ``(secret_sha256, secret_kind, principal_key)``
+    unique constraint — ``principal_key`` being the non-null form of
+    ``principal`` — and accumulate new deckies/services/IPs over time
+    as the credential is reused.
+
+    The four list columns (``attacker_uuids``, ``attacker_ips``,
+    ``deckies``, ``services``) hold JSON-encoded ``list[str]`` text and
+    default to ``"[]"``.
+
+    The ``confidence`` column is reserved for a future fuzzy-match pass
+    (credential variants, e.g. ``hunter2`` vs ``hunter22``); rows
+    written by the exact-secret correlator are always 1.0.
+    """
+    __tablename__ = "credential_reuse"
+    __table_args__ = (
+        # One finding per (secret, kind, principal_key); principal_key is
+        # used instead of the nullable ``principal`` column.
+        UniqueConstraint(
+            "secret_sha256", "secret_kind", "principal_key",
+            name="uq_credential_reuse_secret_principal",
+        ),
+    )
+    # String primary key sized for a canonical UUID (36 chars); the
+    # SQLModel repository fills it with ``str(uuid.uuid4())`` on insert.
+    id: str = Field(primary_key=True, max_length=36)
+    secret_sha256: str = Field(index=True, max_length=64)
+    secret_kind: str = Field(index=True, max_length=32)
+    # Optional human-readable principal (e.g. "root"). Nullable — for
+    # cross-principal reuse rows we leave this null, but we still need
+    # a unique constraint, so ``principal_key`` is the non-null
+    # canonicalised form ("" when principal is null) used in the
+    # uniqueness tuple. SQLite's NULLs-distinct-in-UNIQUE behaviour
+    # would otherwise let duplicate null-principal rows through.
+    principal: Optional[str] = Field(default=None, max_length=256)
+    principal_key: str = Field(default="", max_length=256)
+    attacker_uuids: str = Field(
+        default="[]",
+        sa_column=Column("attacker_uuids", _BIG_TEXT, nullable=False, default="[]"),
+    )  # JSON list[str]
+    attacker_ips: str = Field(
+        default="[]",
+        sa_column=Column("attacker_ips", _BIG_TEXT, nullable=False, default="[]"),
+    )  # JSON list[str]
+    deckies: str = Field(
+        default="[]",
+        sa_column=Column("deckies", _BIG_TEXT, nullable=False, default="[]"),
+    )  # JSON list[str]
+    services: str = Field(
+        default="[]",
+        sa_column=Column("services", _BIG_TEXT, nullable=False, default="[]"),
+    )  # JSON list[str]
+    # COUNT(DISTINCT decky||':'||service). The discriminative scalar
+    # for ranking and filtering — a credential seen on 12 targets is
+    # far more interesting than one seen on 2.
+    target_count: int = Field(default=0, index=True)
+    # Running total: the SQLModel repository adds each upsert call's
+    # ``attempt_count`` argument to this value.
+    attempt_count: int = Field(default=0)
+    confidence: float = Field(default=1.0)
+    # All three timestamps default to the current UTC time. The
+    # SQLModel repository sets ``first_seen`` only when inserting the
+    # row and overwrites ``last_seen`` and ``updated_at`` on every
+    # upsert.
+    first_seen: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc), index=True
+    )
+    last_seen: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc), index=True
+    )
+    updated_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc), index=True
+    )
+
+
+class CredentialReuseResponse(BaseModel):
+    # Paged envelope for credential-reuse findings. Documented with
+    # comments rather than a docstring so the model's generated schema
+    # description is left untouched.
+    # NOTE(review): presumably wraps the ``(total, rows)`` pair returned
+    # by ``list_credential_reuses`` — no endpoint uses it in this change.
+    total: int  # number of matching rows across all pages
+    limit: int  # page size
+    offset: int  # rows skipped before this page
+    data: List[dict[str, Any]]  # one plain dict per finding
+
+
 class State(SQLModel, table=True):
    __tablename__ = "state"
    key: str = Field(primary_key=True)
--- a/decnet/web/db/repository.py
+++ b/decnet/web/db/repository.py
@@ -153,12 +153,59 @@ class BaseRepository(ABC):
        pass

    @abstractmethod
-    async def get_credential_reuse(
+    async def get_credential_attempts_for_secret(
        self, secret_sha256: str
    ) -> list[dict[str, Any]]:
        """Every (attacker, decky, service, principal) row sharing this secret hash."""
        pass

+    @abstractmethod
+    async def upsert_credential_reuse(
+        self,
+        *,
+        secret_sha256: str,
+        secret_kind: str,
+        principal: Optional[str],
+        attacker_uuid: Optional[str],
+        attacker_ip: str,
+        decky: str,
+        service: str,
+        attempt_count: int,
+        ts: Optional[Any] = None,
+    ) -> Optional[dict[str, Any]]:
+        """Upsert one credential-reuse finding.
+
+        All arguments are keyword-only. ``secret_sha256``,
+        ``secret_kind`` and ``principal`` identify the finding; the
+        remaining arguments describe the single observation being
+        folded into it. ``ts`` is the observation time; the SQLModel
+        implementation falls back to the current UTC time when it is
+        omitted.
+
+        Returns the row dict (with ``inserted: bool`` mixed in) on
+        insert/update, or None if the row is below the reuse threshold
+        and shouldn't be persisted yet.
+
+        NOTE(review): the SQLModel implementation added alongside this
+        method also mixes in ``changed: bool`` and, as written, always
+        persists and never returns None — confirm whether the
+        threshold clause above is still the intended contract.
+        """
+        pass
+
+    @abstractmethod
+    async def list_credential_reuses(
+        self,
+        limit: int = 50,
+        offset: int = 0,
+        min_target_count: int = 2,
+        secret_kind: Optional[str] = None,
+    ) -> tuple[int, list[dict[str, Any]]]:
+        """Paged list of credential-reuse findings ordered by target_count desc.
+
+        ``limit`` and ``offset`` select the page. In the SQLModel
+        implementation ``min_target_count`` keeps only findings whose
+        ``target_count`` is at least that value, and ``secret_kind``,
+        when given, restricts the list to that kind.
+
+        Returns ``(total, rows)``: the number of matching findings
+        across all pages and the row dicts of the requested page.
+        """
+        pass
+
+    @abstractmethod
+    async def get_credential_reuse_by_id(
+        self, reuse_id: str
+    ) -> Optional[dict[str, Any]]:
+        """One credential-reuse finding by UUID, or None.
+
+        ``reuse_id`` is the finding's string primary key. Returns the
+        row as a dict, or None when no finding has that id.
+        """
+        pass
+
+    @abstractmethod
+    async def update_credential_attacker_uuid(
+        self, attacker_ip: str, attacker_uuid: str
+    ) -> int:
+        """Backfill ``attacker_uuid`` on every Credential row matching the IP
+        whose ``attacker_uuid`` is currently null. Returns rows updated.
+
+        Rows that already carry an ``attacker_uuid`` are not part of the
+        match, so an existing attribution is left as it is.
+        """
+        pass
+
    @abstractmethod
    async def get_state(self, key: str) -> Optional[dict[str, Any]]:
        """Retrieve a specific state entry by key."""
--- a/decnet/web/db/sqlmodel_repo.py
+++ b/decnet/web/db/sqlmodel_repo.py
@@ -32,6 +32,7 @@ from decnet.web.db.models import (
    Log,
    Bounty,
    Credential,
+    CredentialReuse,
    State,
    Attacker,
    AttackerBehavior,
@@ -684,7 +685,7 @@ class SQLModelRepository(BaseRepository):
                out.append(d)
            return out

-    async def get_credential_reuse(
+    async def get_credential_attempts_for_secret(
        self, secret_sha256: str
    ) -> List[dict[str, Any]]:
        """Every (attacker_ip, decky, service, principal) row sharing this
@@ -706,6 +707,197 @@ class SQLModelRepository(BaseRepository):
                out.append(d)
            return out

+    # ─── credential reuse (findings) ──────────────────────────────────────
+
+    async def update_credential_attacker_uuid(
+        self, attacker_ip: str, attacker_uuid: str
+    ) -> int:
+        """Link the still-unattributed credentials of one IP to an attacker.
+
+        Sets ``attacker_uuid`` on each Credential row captured from
+        ``attacker_ip`` that has no ``attacker_uuid`` yet; rows already
+        attributed are left alone. Intended to be called by the profiler
+        once it has minted or updated the Attacker row.
+
+        Returns the number of rows the UPDATE touched (0 when the
+        driver reports no row count).
+        """
+        backfill = (
+            update(Credential)
+            .where(Credential.attacker_ip == attacker_ip)
+            .where(Credential.attacker_uuid.is_(None))
+            .values(attacker_uuid=attacker_uuid)
+        )
+        async with self._session() as session:
+            outcome = await session.execute(backfill)
+            await session.commit()
+            touched = outcome.rowcount or 0
+            return int(touched)
+
+    @staticmethod
+    def _merge_unique(existing_json: str, value: Optional[str]) -> tuple[str, bool]:
+        """Add ``value`` to a JSON-encoded ``list[str]`` unless already there.
+
+        Returns ``(json_text, changed)``. When ``value`` is None or is
+        already a member, the input text is handed back untouched with
+        ``changed`` False. Text that is empty, undecodable, or decodes
+        to something other than a list is treated as an empty list.
+        """
+        untouched = (existing_json, False)
+        if value is None:
+            return untouched
+
+        members: list = []
+        if existing_json:
+            try:
+                decoded = json.loads(existing_json)
+            except (json.JSONDecodeError, TypeError):
+                decoded = None
+            if isinstance(decoded, list):
+                members = decoded
+
+        if value in members:
+            return untouched
+        return json.dumps([*members, value], ensure_ascii=True), True
+
+    async def upsert_credential_reuse(
+        self,
+        *,
+        secret_sha256: str,
+        secret_kind: str,
+        principal: Optional[str],
+        attacker_uuid: Optional[str],
+        attacker_ip: str,
+        decky: str,
+        service: str,
+        attempt_count: int,
+        ts: Optional[datetime] = None,
+    ) -> Optional[dict[str, Any]]:
+        """Upsert a credential-reuse finding.
+
+        The row is keyed by ``(secret_sha256, secret_kind, principal_key)``
+        — ``principal_key`` is the canonicalised non-null form ("" when
+        principal is null) so the unique constraint behaves the same on
+        SQLite and MySQL.
+
+        On first sight of the key a new row is inserted with
+        ``target_count=1``. Otherwise the observation is merged into the
+        existing row: new attacker UUIDs/IPs/deckies/services are
+        appended to the JSON lists, ``attempt_count`` is added to the
+        running total, ``last_seen``/``updated_at`` are set to ``ts``
+        (current UTC time when ``ts`` is omitted), and ``target_count``
+        is refreshed from the credentials table.
+
+        Returns the row dict augmented with ``inserted: bool`` and
+        ``changed: bool`` so the correlator can decide whether to publish
+        a bus event. ``changed`` is True when the row was inserted, when
+        any list grew, or when ``target_count`` grew. This
+        implementation always persists and never returns None.
+        """
+        principal_key = principal or ""
+        now = ts or datetime.now(timezone.utc)
+        async with self._session() as session:
+            # NOTE(review): select-then-insert is not atomic; two
+            # concurrent first observations of the same key would make
+            # the second commit fail on the unique constraint.
+            existing = (await session.execute(
+                select(CredentialReuse).where(
+                    CredentialReuse.secret_sha256 == secret_sha256,
+                    CredentialReuse.secret_kind == secret_kind,
+                    CredentialReuse.principal_key == principal_key,
+                )
+            )).scalar_one_or_none()
+
+            if existing is None:
+                row = CredentialReuse(
+                    id=str(uuid.uuid4()),
+                    secret_sha256=secret_sha256,
+                    secret_kind=secret_kind,
+                    principal=principal,
+                    principal_key=principal_key,
+                    attacker_uuids=json.dumps(
+                        [attacker_uuid] if attacker_uuid else [], ensure_ascii=True
+                    ),
+                    attacker_ips=json.dumps([attacker_ip], ensure_ascii=True),
+                    deckies=json.dumps([decky], ensure_ascii=True),
+                    services=json.dumps([service], ensure_ascii=True),
+                    target_count=1,
+                    attempt_count=int(attempt_count),
+                    confidence=1.0,
+                    first_seen=now,
+                    last_seen=now,
+                    updated_at=now,
+                )
+                session.add(row)
+                await session.commit()
+                await session.refresh(row)
+                d = row.model_dump(mode="json")
+                d["inserted"] = True
+                d["changed"] = True
+                return d
+
+            new_uuids, c1 = self._merge_unique(existing.attacker_uuids, attacker_uuid)
+            new_ips, c2 = self._merge_unique(existing.attacker_ips, attacker_ip)
+            new_deckies, c3 = self._merge_unique(existing.deckies, decky)
+            new_services, c4 = self._merge_unique(existing.services, service)
+            existing.attacker_uuids = new_uuids
+            existing.attacker_ips = new_ips
+            if c3 or c4:
+                existing.deckies = new_deckies
+                existing.services = new_services
+
+            # The finding row merges null and "" principals into one
+            # ``principal_key``, so the recount has to match both too.
+            if principal_key:
+                principal_match = Credential.principal == principal
+            else:
+                principal_match = (
+                    Credential.principal.is_(None) | (Credential.principal == "")
+                )
+            # Recount target tuples from the underlying credentials
+            # table — a (decky, service) tuple only counts when both
+            # were observed together, which the JSON lists alone
+            # can't tell us. This runs on every update, not only when a
+            # list grew: a new pairing of an already-known decky with an
+            # already-known service grows neither list yet is a new
+            # target.
+            stmt = (
+                select(func.count(func.distinct(
+                    Credential.decky_name + ":" + Credential.service
+                )))
+                .where(
+                    Credential.secret_sha256 == secret_sha256,
+                    Credential.secret_kind == secret_kind,
+                    principal_match,
+                )
+            )
+            recount = int((await session.execute(stmt)).scalar() or 0)
+            previous_count = existing.target_count or 0
+            target_grew = recount > previous_count
+            # When a list grew the recount is authoritative (as before).
+            # Otherwise only ever raise the count, so a repeat
+            # observation can never shrink an existing finding.
+            if c3 or c4 or target_grew:
+                existing.target_count = recount
+
+            existing.attempt_count = (existing.attempt_count or 0) + int(attempt_count)
+            existing.last_seen = now
+            existing.updated_at = now
+            changed = bool(c1 or c2 or c3 or c4 or target_grew)
+            session.add(existing)
+            await session.commit()
+            await session.refresh(existing)
+            d = existing.model_dump(mode="json")
+            d["inserted"] = False
+            d["changed"] = changed
+            return d
+
+    async def list_credential_reuses(
+        self,
+        limit: int = 50,
+        offset: int = 0,
+        min_target_count: int = 2,
+        secret_kind: Optional[str] = None,
+    ) -> tuple[int, List[dict[str, Any]]]:
+        """Return one page of credential-reuse findings plus the total count.
+
+        Only findings with ``target_count >= min_target_count`` are
+        considered, optionally narrowed to one ``secret_kind``. The page
+        is ordered by ``target_count`` then ``last_seen``, both
+        descending. In each returned dict the four JSON list columns are
+        decoded; a value that fails to decode becomes ``[]``.
+        """
+        json_columns = ("attacker_uuids", "attacker_ips", "deckies", "services")
+
+        matching = select(CredentialReuse).where(
+            CredentialReuse.target_count >= min_target_count
+        )
+        if secret_kind:
+            matching = matching.where(CredentialReuse.secret_kind == secret_kind)
+
+        # The total is counted over the filtered set before paging.
+        count_query = select(func.count()).select_from(matching.subquery())
+        page_query = (
+            matching.order_by(
+                desc(CredentialReuse.target_count),
+                desc(CredentialReuse.last_seen),
+            )
+            .offset(offset)
+            .limit(limit)
+        )
+
+        def _decoded(record) -> dict[str, Any]:
+            payload = record.model_dump(mode="json")
+            for column in json_columns:
+                try:
+                    payload[column] = json.loads(payload[column])
+                except (json.JSONDecodeError, TypeError):
+                    payload[column] = []
+            return payload
+
+        async with self._session() as session:
+            total = (await session.execute(count_query)).scalar() or 0
+            records = (await session.execute(page_query)).scalars().all()
+            return int(total), [_decoded(record) for record in records]
+
+    async def get_credential_reuse_by_id(
+        self, reuse_id: str
+    ) -> Optional[dict[str, Any]]:
+        """Fetch a single credential-reuse finding by its primary key.
+
+        Returns None when no row has ``reuse_id``. Otherwise returns the
+        row as a dict with the four JSON list columns decoded; a value
+        that fails to decode becomes ``[]``.
+        """
+        def _as_list(raw: Any) -> Any:
+            try:
+                return json.loads(raw)
+            except (json.JSONDecodeError, TypeError):
+                return []
+
+        lookup = select(CredentialReuse).where(CredentialReuse.id == reuse_id)
+        async with self._session() as session:
+            found = (await session.execute(lookup)).scalar_one_or_none()
+            if found is None:
+                return None
+            payload = found.model_dump(mode="json")
+            for column in ("attacker_uuids", "attacker_ips", "deckies", "services"):
+                payload[column] = _as_list(payload[column])
+            return payload
+
    async def get_state(self, key: str) -> Optional[dict[str, Any]]:
        async with self._session() as session:
            statement = select(State).where(State.key == key)