feat(creds): cred-reuse foundation + vectorstore scaffold

Lays the storage and bus substrate for the "credential reuse patterns"
task in DEVELOPMENT.md and scaffolds decnet/vectorstore/ as the future
substrate for statistical attacker re-identification over behavioral
fingerprints. No correlator, profiler, API, or dashboard wiring in
this commit — see TODO.md for the handoff.

Schema:
  - Credential.attacker_uuid (nullable FK to attackers.uuid),
    backfilled by the profiler post-write to avoid coupling the
    capture path to the profiler's ordering.
  - CredentialReuse table — UUID PK, JSON list columns for the
    accumulating attacker_uuids/ips/deckies/services, target_count
    (the discriminative scalar), confidence reserved for a future
    fuzzy-credential pass.

Repo:
  - upsert_credential_reuse / list_credential_reuses /
    get_credential_reuse_by_id / update_credential_attacker_uuid.
  - Renamed pre-existing get_credential_reuse(secret_sha256) to
    get_credential_attempts_for_secret(secret_sha256) — the new
    findings table needs the cleaner name.

Bus topics:
  - credential.captured (one per Credential upsert)
  - credential.reuse.detected (correlator-emitted on insert/grow)

Vectorstore subpackage (decnet/vectorstore/, flat layout mirroring
decnet/bus/):
  - BaseVectorStore ABC keyed by (kind, id) — kind discriminator
    means new feature families are additive, no schema migration.
  - FakeVectorStore (in-memory L2 KNN), NullVectorStore (no-op for
    DECNET_VECTORSTORE_ENABLED=false), SqliteVecVectorStore (lazy
    sqlite_vec extension load, one vec0 virtual table per kind).
  - get_vectorstore() env-driven dispatch with graceful fallback
    to FakeVectorStore when the sqlite-vec extension isn't on the
    host, so workers don't crash on a missing optional dep.

Tests: 26 new (11 cred-reuse repo, 15 vectorstore). Existing
credentials and base-repo tests updated for the rename. Total: 34
passing on the touched files.
This commit is contained in:
2026-04-26 03:18:34 -04:00
parent 817ce32e6d
commit ce4be68501
17 changed files with 1615 additions and 11 deletions

View File

@@ -49,6 +49,8 @@ from .logs import (
Bounty,
BountyResponse,
Credential,
CredentialReuse,
CredentialReuseResponse,
CredentialsResponse,
Log,
LogsResponse,
@@ -170,6 +172,8 @@ __all__ = [
"Bounty",
"BountyResponse",
"Credential",
"CredentialReuse",
"CredentialReuseResponse",
"CredentialsResponse",
"Log",
"LogsResponse",

View File

@@ -3,7 +3,7 @@ from datetime import datetime, timezone
from typing import Any, List, Optional
from pydantic import BaseModel
from sqlalchemy import Column, Index, Text
from sqlalchemy import Column, Index, Text, UniqueConstraint
from sqlmodel import Field, SQLModel
from ._base import _BIG_TEXT
@@ -54,9 +54,13 @@ class Credential(SQLModel, table=True):
LDAP. Nullable for principal-less mechanisms (Redis AUTH, bearer
tokens). Fully service-specific keys ride in ``fields`` JSON.
Dedup contract: same (attacker_uuid, decky, service, secret_sha256,
Dedup contract: same (attacker_ip, decky, service, secret_sha256,
principal_or_empty) tuple → upsert, bumps ``attempt_count`` and
``last_seen``. Different secret or different principal → new row.
``attacker_uuid`` is backfilled by the profiler once an Attacker row
has been minted for the source IP. It is nullable on first write so
the credential ingest path stays decoupled from the profiler.
"""
__tablename__ = "credentials"
__table_args__ = (
@@ -64,11 +68,15 @@ class Credential(SQLModel, table=True):
Index("ix_credentials_principal_service", "principal", "service"),
)
id: Optional[int] = Field(default=None, primary_key=True)
# Keyed by attacker IP (not attackers.uuid) to match Bounty's pattern
# and avoid the chicken-and-egg of writing a credential row before
# the profiler has minted the Attacker. Index covers the join path
# cred_reuse → Attacker.ip.
# Keyed by attacker IP (not attackers.uuid) on the write path to
# avoid the chicken-and-egg of landing a credential before the
# profiler has minted the Attacker. The profiler backfills
# ``attacker_uuid`` once it knows the IP, so cross-IP reuse queries
# eventually have an indexed FK to traverse.
attacker_ip: str = Field(index=True)
attacker_uuid: Optional[str] = Field(
default=None, foreign_key="attackers.uuid", index=True
)
decky_name: str = Field(index=True)
service: str = Field(index=True)
principal: Optional[str] = Field(default=None, index=True, max_length=256)
@@ -107,6 +115,77 @@ class Credential(SQLModel, table=True):
attempt_count: int = Field(default=1)
class CredentialReuse(SQLModel, table=True):
"""One observed credential reuse pattern across deckies and/or services.
A row here is a *finding* produced by the correlator: the same
``(secret_sha256, secret_kind, principal)`` tuple was observed
against ``target_count`` distinct decky×service pairs. Upserted on
that natural key — the row accumulates new deckies/services/IPs
over time as the credential is reused.
The ``confidence`` column is reserved for a future fuzzy-match pass
(credential variants, e.g. ``hunter2`` vs ``hunter22``); rows
written by the exact-secret correlator are always 1.0.
"""
__tablename__ = "credential_reuse"
__table_args__ = (
UniqueConstraint(
"secret_sha256", "secret_kind", "principal_key",
name="uq_credential_reuse_secret_principal",
),
)
id: str = Field(primary_key=True, max_length=36)
secret_sha256: str = Field(index=True, max_length=64)
secret_kind: str = Field(index=True, max_length=32)
# Optional human-readable principal (e.g. "root"). Nullable — for
# cross-principal reuse rows we leave this null, but we still need
# a unique constraint, so ``principal_key`` is the non-null
# canonicalised form ("" when principal is null) used in the
# uniqueness tuple. SQLite's NULLs-distinct-in-UNIQUE behaviour
# would otherwise let duplicate null-principal rows through.
principal: Optional[str] = Field(default=None, max_length=256)
principal_key: str = Field(default="", max_length=256)
attacker_uuids: str = Field(
default="[]",
sa_column=Column("attacker_uuids", _BIG_TEXT, nullable=False, default="[]"),
) # JSON list[str]
attacker_ips: str = Field(
default="[]",
sa_column=Column("attacker_ips", _BIG_TEXT, nullable=False, default="[]"),
) # JSON list[str]
deckies: str = Field(
default="[]",
sa_column=Column("deckies", _BIG_TEXT, nullable=False, default="[]"),
) # JSON list[str]
services: str = Field(
default="[]",
sa_column=Column("services", _BIG_TEXT, nullable=False, default="[]"),
) # JSON list[str]
# COUNT(DISTINCT decky||':'||service). The discriminative scalar
# for ranking and filtering — a credential seen on 12 targets is
# far more interesting than one seen on 2.
target_count: int = Field(default=0, index=True)
attempt_count: int = Field(default=0)
confidence: float = Field(default=1.0)
first_seen: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc), index=True
)
last_seen: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc), index=True
)
updated_at: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc), index=True
)
class CredentialReuseResponse(BaseModel):
total: int
limit: int
offset: int
data: List[dict[str, Any]]
class State(SQLModel, table=True):
__tablename__ = "state"
key: str = Field(primary_key=True)

View File

@@ -153,12 +153,59 @@ class BaseRepository(ABC):
pass
@abstractmethod
async def get_credential_reuse(
async def get_credential_attempts_for_secret(
self, secret_sha256: str
) -> list[dict[str, Any]]:
"""Every (attacker, decky, service, principal) row sharing this secret hash."""
pass
@abstractmethod
async def upsert_credential_reuse(
self,
*,
secret_sha256: str,
secret_kind: str,
principal: Optional[str],
attacker_uuid: Optional[str],
attacker_ip: str,
decky: str,
service: str,
attempt_count: int,
ts: Optional[Any] = None,
) -> Optional[dict[str, Any]]:
"""Upsert one credential-reuse finding. Returns the row dict (with
``inserted: bool`` mixed in) on insert/update, or None if the row
is below the reuse threshold and shouldn't be persisted yet.
"""
pass
@abstractmethod
async def list_credential_reuses(
self,
limit: int = 50,
offset: int = 0,
min_target_count: int = 2,
secret_kind: Optional[str] = None,
) -> tuple[int, list[dict[str, Any]]]:
"""Paged list of credential-reuse findings ordered by target_count desc."""
pass
@abstractmethod
async def get_credential_reuse_by_id(
self, reuse_id: str
) -> Optional[dict[str, Any]]:
"""One credential-reuse finding by UUID, or None."""
pass
@abstractmethod
async def update_credential_attacker_uuid(
self, attacker_ip: str, attacker_uuid: str
) -> int:
"""Backfill ``attacker_uuid`` on every Credential row matching the IP
whose ``attacker_uuid`` is currently null. Returns rows updated.
"""
pass
@abstractmethod
async def get_state(self, key: str) -> Optional[dict[str, Any]]:
"""Retrieve a specific state entry by key."""

View File

@@ -32,6 +32,7 @@ from decnet.web.db.models import (
Log,
Bounty,
Credential,
CredentialReuse,
State,
Attacker,
AttackerBehavior,
@@ -684,7 +685,7 @@ class SQLModelRepository(BaseRepository):
out.append(d)
return out
async def get_credential_reuse(
async def get_credential_attempts_for_secret(
self, secret_sha256: str
) -> List[dict[str, Any]]:
"""Every (attacker_ip, decky, service, principal) row sharing this
@@ -706,6 +707,197 @@ class SQLModelRepository(BaseRepository):
out.append(d)
return out
# ─── credential reuse (findings) ──────────────────────────────────────
async def update_credential_attacker_uuid(
self, attacker_ip: str, attacker_uuid: str
) -> int:
"""Backfill ``attacker_uuid`` on every Credential row matching the
given IP whose ``attacker_uuid`` is currently null. Run by the
profiler after it mints/updates an Attacker row.
"""
async with self._session() as session:
result = await session.execute(
update(Credential)
.where(
Credential.attacker_ip == attacker_ip,
Credential.attacker_uuid.is_(None),
)
.values(attacker_uuid=attacker_uuid)
)
await session.commit()
return int(result.rowcount or 0)
@staticmethod
def _merge_unique(existing_json: str, value: Optional[str]) -> tuple[str, bool]:
"""Append ``value`` to a JSON list[str] column if not present.
Returns (new_json, changed). None values and duplicates are skipped.
"""
if value is None:
return existing_json, False
try:
current = json.loads(existing_json) if existing_json else []
if not isinstance(current, list):
current = []
except (json.JSONDecodeError, TypeError):
current = []
if value in current:
return existing_json, False
current.append(value)
return json.dumps(current, ensure_ascii=True), True
async def upsert_credential_reuse(
self,
*,
secret_sha256: str,
secret_kind: str,
principal: Optional[str],
attacker_uuid: Optional[str],
attacker_ip: str,
decky: str,
service: str,
attempt_count: int,
ts: Optional[datetime] = None,
) -> Optional[dict[str, Any]]:
"""Upsert a credential-reuse finding.
The row is keyed by ``(secret_sha256, secret_kind, principal_key)``
— ``principal_key`` is the canonicalised non-null form ("" when
principal is null) so the unique constraint behaves the same on
SQLite and MySQL.
Returns the row dict augmented with ``inserted: bool`` and
``changed: bool`` so the correlator can decide whether to publish
a bus event.
"""
principal_key = principal or ""
now = ts or datetime.now(timezone.utc)
async with self._session() as session:
existing = (await session.execute(
select(CredentialReuse).where(
CredentialReuse.secret_sha256 == secret_sha256,
CredentialReuse.secret_kind == secret_kind,
CredentialReuse.principal_key == principal_key,
)
)).scalar_one_or_none()
if existing is None:
row = CredentialReuse(
id=str(uuid.uuid4()),
secret_sha256=secret_sha256,
secret_kind=secret_kind,
principal=principal,
principal_key=principal_key,
attacker_uuids=json.dumps(
[attacker_uuid] if attacker_uuid else [], ensure_ascii=True
),
attacker_ips=json.dumps([attacker_ip], ensure_ascii=True),
deckies=json.dumps([decky], ensure_ascii=True),
services=json.dumps([service], ensure_ascii=True),
target_count=1,
attempt_count=int(attempt_count),
confidence=1.0,
first_seen=now,
last_seen=now,
updated_at=now,
)
session.add(row)
await session.commit()
await session.refresh(row)
d = row.model_dump(mode="json")
d["inserted"] = True
d["changed"] = True
return d
changed = False
new_uuids, c1 = self._merge_unique(existing.attacker_uuids, attacker_uuid)
new_ips, c2 = self._merge_unique(existing.attacker_ips, attacker_ip)
new_deckies, c3 = self._merge_unique(existing.deckies, decky)
new_services, c4 = self._merge_unique(existing.services, service)
existing.attacker_uuids = new_uuids
existing.attacker_ips = new_ips
if c3 or c4:
existing.deckies = new_deckies
existing.services = new_services
# Recount target tuples from the underlying credentials
# table — a (decky, service) tuple only counts when both
# were observed together, which the JSON lists alone
# can't tell us.
stmt = (
select(func.count(func.distinct(
Credential.decky_name + ":" + Credential.service
)))
.where(
Credential.secret_sha256 == secret_sha256,
Credential.secret_kind == secret_kind,
(Credential.principal == principal) if principal is not None
else Credential.principal.is_(None),
)
)
target_count = (await session.execute(stmt)).scalar() or 0
existing.target_count = int(target_count)
existing.attempt_count = (existing.attempt_count or 0) + int(attempt_count)
existing.last_seen = now
existing.updated_at = now
if c1 or c2 or c3 or c4:
changed = True
session.add(existing)
await session.commit()
await session.refresh(existing)
d = existing.model_dump(mode="json")
d["inserted"] = False
d["changed"] = changed
return d
async def list_credential_reuses(
self,
limit: int = 50,
offset: int = 0,
min_target_count: int = 2,
secret_kind: Optional[str] = None,
) -> tuple[int, List[dict[str, Any]]]:
async with self._session() as session:
base = select(CredentialReuse).where(
CredentialReuse.target_count >= min_target_count
)
if secret_kind:
base = base.where(CredentialReuse.secret_kind == secret_kind)
total_stmt = select(func.count()).select_from(base.subquery())
total = (await session.execute(total_stmt)).scalar() or 0
list_stmt = (
base.order_by(desc(CredentialReuse.target_count),
desc(CredentialReuse.last_seen))
.offset(offset).limit(limit)
)
rows = (await session.execute(list_stmt)).scalars().all()
out: List[dict[str, Any]] = []
for r in rows:
d = r.model_dump(mode="json")
for key in ("attacker_uuids", "attacker_ips", "deckies", "services"):
try:
d[key] = json.loads(d[key])
except (json.JSONDecodeError, TypeError):
d[key] = []
out.append(d)
return int(total), out
async def get_credential_reuse_by_id(
self, reuse_id: str
) -> Optional[dict[str, Any]]:
async with self._session() as session:
row = (await session.execute(
select(CredentialReuse).where(CredentialReuse.id == reuse_id)
)).scalar_one_or_none()
if row is None:
return None
d = row.model_dump(mode="json")
for key in ("attacker_uuids", "attacker_ips", "deckies", "services"):
try:
d[key] = json.loads(d[key])
except (json.JSONDecodeError, TypeError):
d[key] = []
return d
async def get_state(self, key: str) -> Optional[dict[str, Any]]:
async with self._session() as session:
statement = select(State).where(State.key == key)