merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
27
decnet/vectorstore/__init__.py
Normal file
27
decnet/vectorstore/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
"""Vector store substrate for behavioral fingerprint similarity search.
|
||||
|
||||
Provider-pluggable storage for ``(kind, id, vector)`` triples used by the
|
||||
future statistical re-identification engine. ``kind`` discriminates
|
||||
feature families (``ja3``, ``hassh``, ``keystroke``, ``cmd_ngram``, ...)
|
||||
so new feature types are additive — no schema migration required when
|
||||
adding a new extractor.
|
||||
|
||||
Use :func:`get_vectorstore` from :mod:`decnet.vectorstore.factory`; never
|
||||
import concrete implementations directly. Mirrors the same factory
|
||||
discipline as :mod:`decnet.bus` and :mod:`decnet.web.db`.
|
||||
"""
|
||||
from decnet.vectorstore.base import (
|
||||
BaseVectorStore,
|
||||
Neighbor,
|
||||
VectorRecord,
|
||||
VECTORSTORE_SCHEMA_VERSION,
|
||||
)
|
||||
from decnet.vectorstore.factory import get_vectorstore
|
||||
|
||||
# Public surface of the package. Concrete backends are intentionally not
# re-exported: callers obtain a store through get_vectorstore().
__all__ = [
    "BaseVectorStore",
    "Neighbor",
    "VectorRecord",
    "VECTORSTORE_SCHEMA_VERSION",
    "get_vectorstore",
]
|
||||
114
decnet/vectorstore/base.py
Normal file
114
decnet/vectorstore/base.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""Vector-store abstractions: :class:`BaseVectorStore` ABC + record types.
|
||||
|
||||
Every backend (sqlite-vec, in-memory fake, future pgvector / Qdrant)
|
||||
speaks this contract. The store is keyed by ``(kind, id)`` where:
|
||||
|
||||
* ``kind`` is a short discriminator (``ja3``, ``hassh``,
|
||||
``keystroke_dwell``, ``cmd_ngram``, ...) — vectors are only ever
|
||||
compared **within the same kind**, so adding a new feature family is
|
||||
a non-event for the store.
|
||||
* ``id`` is a stable identifier owned by the caller — typically the
|
||||
``session_id`` or ``attacker_uuid``. The store does not interpret it.
|
||||
* ``extractor_version`` is recorded alongside the vector so v1 vs v2 of
|
||||
the same kind never get cross-compared by accident — a similarity
|
||||
scorer that respects versioning is the consumer's responsibility, but
|
||||
the data it needs is here.
|
||||
|
||||
The contract is intentionally minimal (insert/get/knn/delete/health) so
|
||||
backends with different physical layouts can implement it
|
||||
straightforwardly. No batch APIs in v1 — sub-millisecond per-vector
|
||||
overhead at honeypot scales (≤ 100k vectors per kind) makes batching
|
||||
unnecessary, and the loop-over-singles pattern keeps the contract small.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import abc
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Sequence
|
||||
|
||||
# Bumped when the wire/ABI shape of records changes incompatibly.
|
||||
# Backends MAY refuse to load older data when this changes, but the
|
||||
# pre-v1 expectation is to migrate forward in the same release.
|
||||
VECTORSTORE_SCHEMA_VERSION = 1
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class VectorRecord:
    """One stored vector, returned by :meth:`BaseVectorStore.get`.

    The dataclass is frozen, so attributes cannot be reassigned after
    construction.  The container type of ``vector`` is backend-dependent
    (the in-memory backend stores a tuple, the sqlite backend returns a
    list), so do not rely on instances being hashable.
    """

    # Feature-family discriminator (``ja3``, ``hassh``, ...).
    kind: str
    # Caller-owned identifier; the store does not interpret it.
    id: str
    # The embedding components.
    vector: Sequence[float]
    # Number of components recorded for ``vector`` by the backend.
    dim: int
    # Version of the extractor that produced the vector.
    extractor_version: int = 1
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Neighbor:
    """One similarity-search hit, returned by :meth:`BaseVectorStore.knn`.

    ``distance`` is whatever the backend's native metric reports.  The
    in-memory fake computes Euclidean (L2) distance.  The sqlite-vec
    backend declares its ``vec0`` tables without a ``distance_metric``
    option, so it reports vec0's default metric.
    NOTE(review): sqlite-vec documents L2 (not cosine) as the vec0
    default -- confirm against the pinned sqlite-vec version.

    Smaller is more similar in every case.  Consumers that need a
    specific metric should configure the backend explicitly.
    """

    # Feature family the hit belongs to (same as the queried kind).
    kind: str
    # Caller-owned identifier of the matching stored vector.
    id: str
    # Backend-native distance between the query and the stored vector.
    distance: float
|
||||
|
||||
|
||||
class BaseVectorStore(abc.ABC):
    """Async interface for a kind-discriminated vector store.

    Implementations MAY be transactional (sqlite) or not (pure
    in-memory).  All methods are async to match the rest of the DECNET
    storage layer; trivial backends can ``await`` no-op coroutines.

    Records are addressed by the ``(kind, id)`` pair.  Similarity search
    only ever compares vectors that share a ``kind``.
    """

    @abc.abstractmethod
    async def initialize(self) -> None:
        """One-shot setup (open files, load extensions, create tables)."""

    @abc.abstractmethod
    async def close(self) -> None:
        """Release resources.  Idempotent."""

    @abc.abstractmethod
    async def health(self) -> dict:
        """Liveness + capability probe.

        Returns a dict like ``{"ok": True, "backend": "sqlite_vec",
        "kinds": 4, "vectors": 12_345}``.  Used by ``/api/v1/health`` and
        diagnostics; never raises -- backends that can't determine a
        field set it to None.
        """

    @abc.abstractmethod
    async def insert(
        self,
        kind: str,
        id: str,
        vector: Sequence[float],
        *,
        extractor_version: int = 1,
    ) -> None:
        """Insert or replace ``(kind, id)``.

        Vector dim is fixed per kind the first time a kind is seen;
        mismatched dims raise (the bundled backends raise
        :class:`ValueError`).

        :param kind: feature-family discriminator.
        :param id: caller-owned identifier, unique within ``kind``.
        :param vector: the embedding to store.
        :param extractor_version: version of the extractor that produced
            ``vector``; stored alongside it and returned by :meth:`get`.
        """

    @abc.abstractmethod
    async def get(self, kind: str, id: str) -> Optional[VectorRecord]:
        """Fetch one record, or None if absent."""

    @abc.abstractmethod
    async def delete(self, kind: str, id: str) -> bool:
        """Delete one record.  Returns True if a row was removed."""

    @abc.abstractmethod
    async def knn(
        self, kind: str, vector: Sequence[float], k: int = 10
    ) -> list[Neighbor]:
        """Return up to *k* nearest neighbors of ``vector`` within
        ``kind``, closest first.  Empty list if the kind is unknown or
        empty.

        The distance metric is backend-defined; see :class:`Neighbor`.
        """
|
||||
73
decnet/vectorstore/factory.py
Normal file
73
decnet/vectorstore/factory.py
Normal file
@@ -0,0 +1,73 @@
|
||||
"""Vectorstore factory — selects a :class:`BaseVectorStore` implementation.
|
||||
|
||||
Dispatch keys:
|
||||
|
||||
* ``DECNET_VECTORSTORE_ENABLED`` — ``"false"`` short-circuits to
|
||||
:class:`~decnet.vectorstore.fake.NullVectorStore`. Default ``"true"``.
|
||||
* ``DECNET_VECTORSTORE_TYPE`` — ``"sqlite_vec"`` (default) or
|
||||
``"fake"``.
|
||||
* ``DECNET_VECTORSTORE_PATH`` — sqlite file path. Defaults to
|
||||
``/var/lib/decnet/vectors.sqlite`` if writable, else
|
||||
``~/.decnet/vectors.sqlite``.
|
||||
|
||||
Mirrors :mod:`decnet.bus.factory` and :mod:`decnet.web.db.factory`:
|
||||
lazy imports inside each branch, env-driven dispatch, callers MUST go
|
||||
through :func:`get_vectorstore` rather than instantiating backends.
|
||||
|
||||
If ``sqlite_vec`` is requested but the ``sqlite_vec`` package cannot be
imported on this host, the factory logs a warning and returns the
in-memory :class:`~decnet.vectorstore.fake.FakeVectorStore` instead —
the caller's code path stays valid (the full contract keeps working, but
nothing is persisted across restarts) without crashing the worker on a
missing optional dependency.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from decnet.vectorstore.base import BaseVectorStore
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_vectorstore(**kwargs: Any) -> BaseVectorStore:
    """Return the vector store backend selected by the environment.

    Dispatch order:

    1. ``DECNET_VECTORSTORE_ENABLED=false`` (case-insensitive) returns a
       ``NullVectorStore``.
    2. ``DECNET_VECTORSTORE_TYPE=fake`` returns a ``FakeVectorStore``.
    3. ``DECNET_VECTORSTORE_TYPE=sqlite_vec`` (the default) returns a
       ``SqliteVecVectorStore`` -- unless sqlite-vec cannot be used on
       this host, in which case a warning is logged and a
       ``FakeVectorStore`` is returned instead.

    :param kwargs: only ``db_path`` is consumed, and only by the
        ``sqlite_vec`` backend; when absent or falsy the path comes from
        :func:`_default_db_path`.  Other keyword arguments are ignored.
    :raises ValueError: if ``DECNET_VECTORSTORE_TYPE`` names an unknown
        backend.
    """
    if os.environ.get("DECNET_VECTORSTORE_ENABLED", "true").lower() == "false":
        from decnet.vectorstore.fake import NullVectorStore
        return NullVectorStore()

    backend = os.environ.get("DECNET_VECTORSTORE_TYPE", "sqlite_vec").lower()

    if backend == "fake":
        from decnet.vectorstore.fake import FakeVectorStore
        return FakeVectorStore()

    if backend == "sqlite_vec":
        # Probe availability up front so the factory can fall back
        # cleanly.  The store itself only loads the extension in
        # initialize(); by then it is too late to substitute a backend.
        try:
            import sqlite_vec  # noqa: F401
        except ImportError as e:
            LOG.warning(
                "sqlite_vec not installed (%s); falling back to FakeVectorStore. "
                "Install the sqlite-vec package or set "
                "DECNET_VECTORSTORE_TYPE=fake to silence this warning.", e,
            )
            from decnet.vectorstore.fake import FakeVectorStore
            return FakeVectorStore()

        import sqlite3

        from decnet.vectorstore.sqlite_vec import (
            SqliteVecUnavailable,
            SqliteVecVectorStore,
            _load_sqlite_vec,
        )

        # An importable package is not enough: the host's sqlite3 may be
        # built without loadable-extension support, or the load itself
        # may fail.  Exercise the real load path on a throwaway
        # in-memory connection so those hosts get the fallback too.
        probe = sqlite3.connect(":memory:")
        try:
            _load_sqlite_vec(probe)
        except SqliteVecUnavailable as e:
            LOG.warning(
                "sqlite_vec extension could not be loaded (%s); falling back "
                "to FakeVectorStore. Set DECNET_VECTORSTORE_TYPE=fake to "
                "silence this warning.", e,
            )
            from decnet.vectorstore.fake import FakeVectorStore
            return FakeVectorStore()
        finally:
            probe.close()

        db_path = kwargs.pop("db_path", None) or _default_db_path()
        return SqliteVecVectorStore(db_path=db_path)

    raise ValueError(f"Unsupported vectorstore type: {backend}")
|
||||
|
||||
|
||||
def _default_db_path() -> str:
|
||||
explicit = os.environ.get("DECNET_VECTORSTORE_PATH")
|
||||
if explicit:
|
||||
return explicit
|
||||
runtime_dir = "/var/lib/decnet"
|
||||
if os.path.isdir(runtime_dir) and os.access(runtime_dir, os.W_OK):
|
||||
return f"{runtime_dir}/vectors.sqlite"
|
||||
return os.path.expanduser("~/.decnet/vectors.sqlite")
|
||||
131
decnet/vectorstore/fake.py
Normal file
131
decnet/vectorstore/fake.py
Normal file
@@ -0,0 +1,131 @@
|
||||
"""In-memory vector store backend.
|
||||
|
||||
Two flavors:
|
||||
|
||||
* :class:`FakeVectorStore` — a real, working in-memory store. Used by
|
||||
tests and by dev environments that want similarity search without
|
||||
any native extension on the box. KNN is brute-force L2 — fine up to
|
||||
a few thousand vectors per kind.
|
||||
* :class:`NullVectorStore` — a no-op store returned by the factory
|
||||
when ``DECNET_VECTORSTORE_ENABLED=false``. Every method succeeds
|
||||
trivially; ``get`` and ``knn`` return None / [] respectively. Lets
|
||||
workers run unaffected when the operator hasn't opted into vector
|
||||
features yet.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from typing import Optional, Sequence
|
||||
|
||||
from decnet.vectorstore.base import BaseVectorStore, Neighbor, VectorRecord
|
||||
|
||||
|
||||
class FakeVectorStore(BaseVectorStore):
    """Dependency-free in-memory store with exhaustive L2 search.

    Intended for tests and small dev setups (a few thousand vectors per
    kind at most).  Everything lives in process memory, so a restart
    starts from an empty store.
    """

    def __init__(self) -> None:
        # kind -> id -> record
        self._store: dict[str, dict[str, VectorRecord]] = {}
        # kind -> dimensionality, pinned by the first write to that kind
        self._dims: dict[str, int] = {}

    async def initialize(self) -> None:
        """Nothing to set up."""

    async def close(self) -> None:
        """Nothing to release."""

    async def health(self) -> dict:
        """Report the number of kinds and the total number of vectors."""
        return {
            "ok": True,
            "backend": "fake",
            "kinds": len(self._store),
            "vectors": sum(map(len, self._store.values())),
        }

    async def insert(
        self,
        kind: str,
        id: str,
        vector: Sequence[float],
        *,
        extractor_version: int = 1,
    ) -> None:
        """Store ``vector`` under ``(kind, id)``, replacing any old value.

        Raises ValueError when ``vector`` has a different length than
        the first vector ever written for ``kind``.
        """
        width = len(vector)
        # First write pins the width; later writes must agree with it.
        pinned = self._dims.setdefault(kind, width)
        if pinned != width:
            raise ValueError(
                f"vector dim mismatch for kind={kind!r}: "
                f"expected {pinned}, got {width}"
            )
        by_id = self._store.setdefault(kind, {})
        by_id[id] = VectorRecord(
            kind=kind,
            id=id,
            vector=tuple(map(float, vector)),
            dim=width,
            extractor_version=int(extractor_version),
        )

    async def get(self, kind: str, id: str) -> Optional[VectorRecord]:
        """Return the record for ``(kind, id)`` or None."""
        by_id = self._store.get(kind)
        return None if by_id is None else by_id.get(id)

    async def delete(self, kind: str, id: str) -> bool:
        """Drop ``(kind, id)``; True when something was actually removed."""
        try:
            del self._store[kind][id]
        except KeyError:
            return False
        return True

    async def knn(
        self, kind: str, vector: Sequence[float], k: int = 10
    ) -> list[Neighbor]:
        """Return up to ``k`` stored vectors of ``kind`` closest to
        ``vector`` by Euclidean distance, nearest first.

        Raises ValueError when the query length differs from the width
        pinned for ``kind``.
        """
        candidates = self._store.get(kind)
        if not candidates:
            return []

        query = tuple(map(float, vector))
        stored_dim = self._dims.get(kind, len(query))
        if len(query) != stored_dim:
            raise ValueError(
                f"query dim {len(query)} != stored dim {stored_dim} "
                f"for kind={kind!r}"
            )

        hits = [
            Neighbor(
                kind=kind,
                id=rid,
                distance=math.sqrt(
                    sum((a - b) ** 2 for a, b in zip(query, rec.vector))
                ),
            )
            for rid, rec in candidates.items()
        ]
        ranked = sorted(hits, key=lambda hit: hit.distance)
        # Negative k behaves like 0: no results.
        return ranked[: max(0, int(k))]
|
||||
|
||||
|
||||
class NullVectorStore(BaseVectorStore):
    """Do-nothing store handed out when the vectorstore is disabled.

    Writes are discarded and every lookup comes back empty.
    """

    async def initialize(self) -> None:
        """No setup required."""

    async def close(self) -> None:
        """Nothing to release."""

    async def health(self) -> dict:
        """Always healthy, always empty."""
        return dict(ok=True, backend="null", kinds=0, vectors=0)

    async def insert(
        self,
        kind: str,
        id: str,
        vector: Sequence[float],
        *,
        extractor_version: int = 1,
    ) -> None:
        """Discard the vector."""

    async def get(self, kind: str, id: str) -> Optional[VectorRecord]:
        """Nothing is ever stored, so nothing is ever found."""
        return None

    async def delete(self, kind: str, id: str) -> bool:
        """Nothing to delete; always False."""
        return False

    async def knn(
        self, kind: str, vector: Sequence[float], k: int = 10
    ) -> list[Neighbor]:
        """No neighbors exist; always an empty list."""
        return []
|
||||
285
decnet/vectorstore/sqlite_vec.py
Normal file
285
decnet/vectorstore/sqlite_vec.py
Normal file
@@ -0,0 +1,285 @@
|
||||
"""SQLite + sqlite-vec backend.
|
||||
|
||||
Lazy-imports the ``sqlite_vec`` extension. If the extension isn't
available (the package isn't installed, or the host's libsqlite3 can't
load loadable extensions),
:meth:`SqliteVecVectorStore.initialize` raises
:class:`SqliteVecUnavailable`. The factory probes for the package up
front and falls back to
:class:`~decnet.vectorstore.fake.FakeVectorStore` with a warning.
|
||||
|
||||
Schema:
|
||||
|
||||
    CREATE TABLE vectors (
        kind TEXT NOT NULL,
        id TEXT NOT NULL,
        extractor_version INTEGER NOT NULL DEFAULT 1,
        dim INTEGER NOT NULL,
        rowid_in_vec INTEGER NOT NULL,
        PRIMARY KEY (kind, id)
    );
|
||||
CREATE VIRTUAL TABLE vec_<kind> USING vec0(
|
||||
embedding float[<dim>]
|
||||
);
|
||||
|
||||
A vec0 virtual table is created lazily per-kind on first insert
|
||||
(distinct ``kind`` values get distinct vec0 tables because vec0's dim
|
||||
is a schema-time constant). The ``vectors`` row is the source of truth
|
||||
for metadata (extractor_version, dim) and for the (kind, id) → rowid
|
||||
mapping; vec0 stores only the embedding, keyed by an INTEGER rowid.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import sqlite3
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Optional, Sequence
|
||||
|
||||
from decnet.vectorstore.base import BaseVectorStore, Neighbor, VectorRecord
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SqliteVecUnavailable(RuntimeError):
    """sqlite_vec couldn't be loaded (extension missing / too-old sqlite3).

    Raised by :func:`_load_sqlite_vec` when the ``sqlite_vec`` package
    cannot be imported, when the connection refuses to enable extension
    loading, or when loading the extension fails.
    """
|
||||
|
||||
|
||||
def _load_sqlite_vec(conn: sqlite3.Connection) -> None:
    """Load the sqlite-vec extension into *conn*.

    Every expected failure mode is normalised to
    :class:`SqliteVecUnavailable`, chained to the underlying exception.

    :param conn: open connection to load the extension into.
    :raises SqliteVecUnavailable: the ``sqlite_vec`` package is not
        importable, extension loading cannot be enabled on *conn*, or
        ``sqlite_vec.load`` raised ``sqlite3.OperationalError``.
    """
    # Imported lazily so this module can be imported without the package.
    try:
        import sqlite_vec  # type: ignore[import-untyped]
    except ImportError as e:
        raise SqliteVecUnavailable("sqlite_vec package not installed") from e
    try:
        conn.enable_load_extension(True)
    except (AttributeError, sqlite3.NotSupportedError) as e:
        raise SqliteVecUnavailable(
            "system sqlite3 was built without loadable-extension support"
        ) from e
    try:
        sqlite_vec.load(conn)
    except sqlite3.OperationalError as e:
        raise SqliteVecUnavailable(f"sqlite_vec load failed: {e}") from e
    finally:
        # Switch extension loading back off whether or not the load
        # succeeded; a failure to do so is deliberately ignored.
        try:
            conn.enable_load_extension(False)
        except sqlite3.NotSupportedError:
            pass
|
||||
|
||||
|
||||
class SqliteVecVectorStore(BaseVectorStore):
    """sqlite-vec backed vector store.  Single-file, async-friendly via
    :func:`asyncio.to_thread`.  Keep one instance per process.

    Layout: the ``vectors`` table holds metadata and the
    ``(kind, id) -> rowid_in_vec`` mapping; each kind gets its own
    ``vec_<kind>`` vec0 virtual table holding the embeddings.

    One connection is shared by all worker threads
    (``check_same_thread=False``); every use of it is serialised by
    ``self._lock``.
    """

    def __init__(self, db_path: str) -> None:
        self._db_path = db_path
        self._conn: Optional[sqlite3.Connection] = None
        self._lock = threading.Lock()
        # {kind: dim} cached after first insert/probe.
        self._kinds: dict[str, int] = {}

    # ------------------------------------------------------------ helpers

    @staticmethod
    def _pack(vector: Sequence[float]) -> bytes:
        """Serialise *vector* as native-order float32, one per component."""
        import struct
        return struct.pack(f"{len(vector)}f", *vector)

    @staticmethod
    def _unpack(blob: bytes, dim: int) -> list[float]:
        """Inverse of :meth:`_pack` for a blob holding *dim* floats."""
        import struct
        return list(struct.unpack(f"{dim}f", blob))

    @staticmethod
    def _vec_table(kind: str) -> str:
        """Return the vec0 table name for *kind*, validating the kind.

        The name is interpolated into SQL, so the kind must consist of
        ASCII letters, digits and underscores only.  ``str.isalnum``
        alone also accepts non-ASCII letters and digits, hence the
        explicit ``isascii`` check.

        NOTE(review): uppercase ASCII is still accepted for backward
        compatibility.  SQLite table names are case-insensitive, so two
        kinds differing only by case would share one vec0 table.

        :raises ValueError: if *kind* is empty or has other characters.
        """
        if not kind or not all(
            c == "_" or (c.isascii() and c.isalnum()) for c in kind
        ):
            raise ValueError(f"invalid kind {kind!r}: ascii [a-z0-9_] only")
        return f"vec_{kind}"

    def _ensure_kind_table(self, kind: str, dim: int) -> None:
        """Create the vec0 table for a new *kind*, or check *dim* against
        the width already recorded for it.

        Must be called with ``self._lock`` held.

        :raises ValueError: on an invalid kind or a dim mismatch.
        """
        assert self._conn is not None  # nosec B101
        existing = self._kinds.get(kind)
        if existing is None:
            # vec_<kind> identifier is validated by _vec_table() and dim
            # is int-cast -- no injection vector.  Placeholders aren't
            # allowed for DDL, so the name has to be interpolated.
            ddl = (  # nosec B608
                f"CREATE VIRTUAL TABLE IF NOT EXISTS {self._vec_table(kind)} "
                f"USING vec0(embedding float[{int(dim)}])"
            )
            self._conn.execute(ddl)
            self._conn.commit()
            self._kinds[kind] = dim
        elif existing != dim:
            raise ValueError(
                f"vector dim mismatch for kind={kind!r}: "
                f"expected {existing}, got {dim}"
            )

    # ---------------------------------------------------------- lifecycle

    async def initialize(self) -> None:
        """Open the database, load sqlite-vec and create the schema.

        Calling it again on an initialised store is a no-op.

        :raises SqliteVecUnavailable: if the extension cannot be loaded.
        """
        await asyncio.to_thread(self._init_sync)

    def _init_sync(self) -> None:
        with self._lock:
            if self._conn is not None:
                # Already open: reopening would leak the current handle.
                return
            Path(self._db_path).parent.mkdir(parents=True, exist_ok=True)
            conn = sqlite3.connect(self._db_path, check_same_thread=False)
            try:
                _load_sqlite_vec(conn)  # raises SqliteVecUnavailable on failure
                conn.execute("PRAGMA journal_mode=WAL")
                conn.execute(
                    """
                    CREATE TABLE IF NOT EXISTS vectors (
                        kind TEXT NOT NULL,
                        id TEXT NOT NULL,
                        extractor_version INTEGER NOT NULL DEFAULT 1,
                        dim INTEGER NOT NULL,
                        rowid_in_vec INTEGER NOT NULL,
                        PRIMARY KEY (kind, id)
                    )
                    """
                )
                conn.execute(
                    "CREATE INDEX IF NOT EXISTS ix_vectors_kind ON vectors(kind)"
                )
                conn.commit()
                # Re-hydrate kind->dim cache from any existing rows so a
                # process restart doesn't accept a mismatched dim on the
                # first insert.
                known = {
                    row[0]: int(row[1])
                    for row in conn.execute(
                        "SELECT kind, dim FROM vectors GROUP BY kind"
                    )
                }
            except Exception:
                # Don't leak the handle when setup fails half-way.
                conn.close()
                raise
            self._kinds.update(known)
            self._conn = conn

    async def close(self) -> None:
        """Close the connection.  Idempotent."""
        await asyncio.to_thread(self._close_sync)

    def _close_sync(self) -> None:
        with self._lock:
            if self._conn is not None:
                self._conn.close()
                self._conn = None

    async def health(self) -> dict:
        """Liveness probe; never raises.

        On failure ``ok`` is False, ``reason`` explains why, and the
        fields that could not be determined are None.
        """
        return await asyncio.to_thread(self._health_sync)

    def _health_sync(self) -> dict:
        # Hold the lock like every other user of the shared connection.
        with self._lock:
            if self._conn is None:
                return {
                    "ok": False,
                    "backend": "sqlite_vec",
                    "kinds": None,
                    "vectors": None,
                    "reason": "not initialized",
                }
            try:
                row = self._conn.execute(
                    "SELECT COUNT(*) FROM vectors"
                ).fetchone()
            except sqlite3.Error as e:
                return {
                    "ok": False,
                    "backend": "sqlite_vec",
                    "kinds": len(self._kinds),
                    "vectors": None,
                    "reason": str(e),
                }
            return {
                "ok": True,
                "backend": "sqlite_vec",
                "kinds": len(self._kinds),
                "vectors": int(row[0]) if row else 0,
            }

    # --------------------------------------------------------------- CRUD

    async def insert(
        self, kind: str, id: str, vector: Sequence[float],
        *, extractor_version: int = 1,
    ) -> None:
        """Insert or replace ``(kind, id)``.

        :raises ValueError: invalid kind, or ``len(vector)`` differs from
            the width recorded for ``kind``.
        """
        await asyncio.to_thread(
            self._insert_sync, kind, id, list(vector), int(extractor_version)
        )

    def _insert_sync(
        self, kind: str, id: str, vector: list[float], extractor_version: int,
    ) -> None:
        with self._lock:
            assert self._conn is not None  # nosec B101
            dim = len(vector)
            self._ensure_kind_table(kind, dim)
            vec_table = self._vec_table(kind)
            # Serialise before touching any row: a vector that cannot be
            # packed must not leave the old embedding half-deleted.
            blob = self._pack(vector)
            cur = self._conn.cursor()
            try:
                existing = cur.execute(
                    "SELECT rowid_in_vec FROM vectors WHERE kind=? AND id=?",
                    (kind, id),
                ).fetchone()
                if existing is not None:
                    # vec_table is validated; rowid is bound.  Safe.
                    cur.execute(
                        f"DELETE FROM {vec_table} WHERE rowid=?",  # nosec B608
                        (int(existing[0]),),
                    )
                cur.execute(
                    f"INSERT INTO {vec_table}(embedding) VALUES (?)",  # nosec B608
                    (blob,),
                )
                new_rowid = cur.lastrowid
                cur.execute(
                    "INSERT OR REPLACE INTO vectors"
                    "(kind, id, extractor_version, dim, rowid_in_vec) "
                    "VALUES (?, ?, ?, ?, ?)",
                    (kind, id, extractor_version, dim, new_rowid),
                )
                self._conn.commit()
            except Exception:
                # Undo the partial replace; otherwise the next commit
                # would persist a metadata row with no embedding.
                self._conn.rollback()
                raise
            finally:
                cur.close()

    async def get(self, kind: str, id: str) -> Optional[VectorRecord]:
        """Fetch one record, or None if absent."""
        return await asyncio.to_thread(self._get_sync, kind, id)

    def _get_sync(self, kind: str, id: str) -> Optional[VectorRecord]:
        with self._lock:
            assert self._conn is not None  # nosec B101
            row = self._conn.execute(
                "SELECT extractor_version, dim, rowid_in_vec "
                "FROM vectors WHERE kind=? AND id=?",
                (kind, id),
            ).fetchone()
            if row is None:
                return None
            ext_v, dim, rowid = int(row[0]), int(row[1]), int(row[2])
            vec_table = self._vec_table(kind)
            blob_row = self._conn.execute(
                f"SELECT embedding FROM {vec_table} WHERE rowid=?",  # nosec B608
                (rowid,),
            ).fetchone()
            if blob_row is None:
                # Metadata without an embedding: treat as absent.
                return None
            return VectorRecord(
                kind=kind, id=id, vector=self._unpack(blob_row[0], dim),
                dim=dim, extractor_version=ext_v,
            )

    async def delete(self, kind: str, id: str) -> bool:
        """Delete one record.  Returns True if a row was removed."""
        return await asyncio.to_thread(self._delete_sync, kind, id)

    def _delete_sync(self, kind: str, id: str) -> bool:
        with self._lock:
            assert self._conn is not None  # nosec B101
            row = self._conn.execute(
                "SELECT rowid_in_vec FROM vectors WHERE kind=? AND id=?",
                (kind, id),
            ).fetchone()
            if row is None:
                return False
            rowid = int(row[0])
            vec_table = self._vec_table(kind)
            try:
                self._conn.execute(
                    f"DELETE FROM {vec_table} WHERE rowid=?",  # nosec B608
                    (rowid,),
                )
                self._conn.execute(
                    "DELETE FROM vectors WHERE kind=? AND id=?", (kind, id)
                )
                self._conn.commit()
            except Exception:
                # Keep embedding and metadata in step on failure.
                self._conn.rollback()
                raise
            return True

    # ------------------------------------------------------------- search

    async def knn(
        self, kind: str, vector: Sequence[float], k: int = 10,
    ) -> list[Neighbor]:
        """Return up to *k* nearest neighbors of ``vector`` within
        ``kind``, ordered by ascending distance.

        Returns ``[]`` for a kind this store has not seen and for
        ``k <= 0``.

        :raises ValueError: if ``len(vector)`` differs from the width
            recorded for ``kind``.
        """
        return await asyncio.to_thread(self._knn_sync, kind, list(vector), int(k))

    def _knn_sync(self, kind: str, vector: list[float], k: int) -> list[Neighbor]:
        with self._lock:
            assert self._conn is not None  # nosec B101
            existing_dim = self._kinds.get(kind)
            if existing_dim is None:
                return []
            if len(vector) != existing_dim:
                raise ValueError(
                    f"query dim {len(vector)} != stored dim {existing_dim} "
                    f"for kind={kind!r}"
                )
            if k <= 0:
                # Same answer as the in-memory backend, without sending a
                # zero-row KNN query to vec0.
                return []
            vec_table = self._vec_table(kind)
            qblob = self._pack(vector)
            knn_sql = f"SELECT rowid, distance FROM {vec_table} WHERE embedding MATCH ? ORDER BY distance LIMIT ?"  # nosec B608
            rows = self._conn.execute(knn_sql, (qblob, k)).fetchall()
            if not rows:
                return []
            # vec0 only knows integer rowids; translate them back to
            # caller ids through the metadata table.
            id_map = {
                int(r[0]): r[1]
                for r in self._conn.execute(
                    "SELECT rowid_in_vec, id FROM vectors WHERE kind=?",
                    (kind,),
                )
            }
            out: list[Neighbor] = []
            for rowid, dist in rows:
                rid = id_map.get(int(rowid))
                if rid is None:
                    # Embedding without metadata: skip rather than invent an id.
                    continue
                out.append(Neighbor(kind=kind, id=rid, distance=float(dist)))
            return out
|
||||
Reference in New Issue
Block a user