Lays the storage and bus substrate for the "credential reuse patterns"
task in DEVELOPMENT.md and scaffolds decnet/vectorstore/ as the future
substrate for statistical attacker re-identification over behavioral
fingerprints. No correlator, profiler, API, or dashboard wiring in
this commit — see TODO.md for the handoff.
Schema:
- Credential.attacker_uuid (nullable FK to attackers.uuid),
backfilled by the profiler post-write to avoid coupling the
capture path to the profiler's ordering.
- CredentialReuse table — UUID PK, JSON list columns for the
accumulating attacker_uuids/ips/deckies/services, target_count
(the discriminative scalar), confidence reserved for a future
fuzzy-credential pass.
Repo:
- upsert_credential_reuse / list_credential_reuses /
get_credential_reuse_by_id / update_credential_attacker_uuid.
- Renamed pre-existing get_credential_reuse(secret_sha256) to
get_credential_attempts_for_secret(secret_sha256) — the new
findings table needs the cleaner name.
Bus topics:
- credential.captured (one per Credential upsert)
- credential.reuse.detected (correlator-emitted on insert/grow)
Vectorstore subpackage (decnet/vectorstore/, flat layout mirroring
decnet/bus/):
- BaseVectorStore ABC keyed by (kind, id) — kind discriminator
means new feature families are additive, no schema migration.
- FakeVectorStore (in-memory L2 KNN), NullVectorStore (no-op for
DECNET_VECTORSTORE_ENABLED=false), SqliteVecVectorStore (lazy
sqlite_vec extension load, one vec0 virtual table per kind).
- get_vectorstore() env-driven dispatch with graceful fallback
to FakeVectorStore when the sqlite-vec extension isn't on the
host, so workers don't crash on a missing optional dep.
Tests: 26 new (11 cred-reuse repo, 15 vectorstore). Existing
credentials and base-repo tests updated for the rename. Total: 34
passing on the touched files.
132 lines
4.1 KiB
Python
132 lines
4.1 KiB
Python
"""In-memory vector store backend.
|
|
|
|
Two flavors:
|
|
|
|
* :class:`FakeVectorStore` — a real, working in-memory store. Used by
|
|
tests and by dev environments that want similarity search without
|
|
any native extension on the box. KNN is brute-force L2 — fine up to
|
|
a few thousand vectors per kind.
|
|
* :class:`NullVectorStore` — a no-op store returned by the factory
|
|
when ``DECNET_VECTORSTORE_ENABLED=false``. Every method succeeds
|
|
trivially; ``get`` and ``knn`` return None / [] respectively. Lets
|
|
workers run unaffected when the operator hasn't opted into vector
|
|
features yet.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import math
|
|
from typing import Optional, Sequence
|
|
|
|
from decnet.vectorstore.base import BaseVectorStore, Neighbor, VectorRecord
|
|
|
|
|
|
class FakeVectorStore(BaseVectorStore):
    """Pure-python in-memory vector store, brute-force KNN.

    Suitable for tests and small-scale dev (≤ a few thousand vectors
    per kind). Not persistent — every process restart drops state.

    Each ``kind`` has its dimensionality fixed by the first successful
    insert; later inserts and queries for that kind must match it.
    """

    def __init__(self) -> None:
        # {kind: {id: VectorRecord}}
        self._store: dict[str, dict[str, VectorRecord]] = {}
        # {kind: dim} — locked the first time a kind is written.
        self._dims: dict[str, int] = {}

    async def initialize(self) -> None:
        """No-op: an in-memory store has nothing to set up."""
        return None

    async def close(self) -> None:
        """No-op: there are no resources to release."""
        return None

    async def health(self) -> dict:
        """Return a status dict with the number of kinds and stored vectors.

        ``kinds`` counts every kind that has ever been written, including
        kinds whose vectors have all been deleted since.
        """
        total = sum(len(by_id) for by_id in self._store.values())
        return {
            "ok": True,
            "backend": "fake",
            "kinds": len(self._store),
            "vectors": total,
        }

    async def insert(
        self,
        kind: str,
        id: str,
        vector: Sequence[float],
        *,
        extractor_version: int = 1,
    ) -> None:
        """Store the vector for ``(kind, id)``, overwriting any previous one.

        The first successful insert for a kind locks that kind's dimension.

        Raises:
            ValueError: if ``len(vector)`` differs from the dimension
                already locked for ``kind``.
        """
        dim = len(vector)
        existing_dim = self._dims.get(kind)
        if existing_dim is not None and existing_dim != dim:
            raise ValueError(
                f"vector dim mismatch for kind={kind!r}: "
                f"expected {existing_dim}, got {dim}"
            )
        # Build the record before touching any state: if a component is
        # not convertible to float (or the record constructor rejects the
        # input), the kind's dimension must not end up locked with nothing
        # stored behind it.
        rec = VectorRecord(
            kind=kind,
            id=id,
            vector=tuple(float(x) for x in vector),
            dim=dim,
            extractor_version=int(extractor_version),
        )
        if existing_dim is None:
            self._dims[kind] = dim
        self._store.setdefault(kind, {})[id] = rec

    async def get(self, kind: str, id: str) -> Optional[VectorRecord]:
        """Return the stored record for ``(kind, id)``, or ``None`` if absent."""
        return self._store.get(kind, {}).get(id)

    async def delete(self, kind: str, id: str) -> bool:
        """Remove ``(kind, id)``; return ``True`` if something was removed.

        The kind's locked dimension is kept even when its last vector
        is deleted.
        """
        bucket = self._store.get(kind)
        if bucket is None or id not in bucket:
            return False
        del bucket[id]
        return True

    async def knn(
        self, kind: str, vector: Sequence[float], k: int = 10
    ) -> list[Neighbor]:
        """Return up to ``k`` nearest stored vectors of ``kind`` by L2 distance.

        Results are ordered by ascending distance; ties keep insertion
        order because the sort is stable. A kind with no stored vectors
        yields ``[]`` without checking the query's dimension, and a
        non-positive ``k`` yields ``[]``.

        Raises:
            ValueError: if the query length differs from the kind's
                locked dimension.
        """
        bucket = self._store.get(kind)
        if not bucket:
            return []
        q = tuple(float(x) for x in vector)
        if len(q) != self._dims.get(kind, len(q)):
            raise ValueError(
                f"query dim {len(q)} != stored dim {self._dims[kind]} "
                f"for kind={kind!r}"
            )
        scored: list[Neighbor] = []
        for rid, rec in bucket.items():
            # Brute-force Euclidean distance against every stored vector.
            d = math.sqrt(sum((a - b) ** 2 for a, b in zip(q, rec.vector)))
            scored.append(Neighbor(kind=kind, id=rid, distance=d))
        scored.sort(key=lambda n: n.distance)
        return scored[: max(0, int(k))]
|
|
|
|
|
|
class NullVectorStore(BaseVectorStore):
    """Inert vector store handed out when the vectorstore is disabled.

    Writes are discarded and every read comes back empty.
    """

    async def initialize(self) -> None:
        """Nothing to set up."""

    async def close(self) -> None:
        """Nothing to release."""

    async def health(self) -> dict:
        """Report a healthy, permanently empty ``null`` backend."""
        report = dict(ok=True, backend="null", kinds=0, vectors=0)
        return report

    async def insert(
        self,
        kind: str,
        id: str,
        vector: Sequence[float],
        *,
        extractor_version: int = 1,
    ) -> None:
        """Accept and discard the vector; nothing is stored."""

    async def get(self, kind: str, id: str) -> Optional[VectorRecord]:
        """Always a miss: returns ``None``."""
        return None

    async def delete(self, kind: str, id: str) -> bool:
        """Nothing is ever stored, so nothing is removed: returns ``False``."""
        return False

    async def knn(
        self,
        kind: str,
        vector: Sequence[float],
        k: int = 10,
    ) -> list[Neighbor]:
        """Return an empty neighbour list."""
        return []
|