Vector embeddings storage and similarity search

License: Apache-2.0 · 1.4k downloads
Updated 22 days ago · Repository
Tags: embeddings, vectors, similarity, search

Run

wippy run wippy/embeddings

Embeddings

Vector embeddings storage and similarity search with SQLite (vec0) and PostgreSQL (pgvector) support.

Installation

entries:
  - name: embeddings
    kind: ns.dependency
    component: wippy/embeddings
    version: "*"
    parameters:
      - name: target_db
        value: app:db

Adding Embeddings

local embeddings = require("embeddings")

-- Add single embedding (auto-generated via LLM)
local result, err = embeddings.add(
    "This is the content to embed",
    "document_chunk",           -- content_type
    "doc-123",                  -- origin_id
    "section-1",                -- context_id (optional)
    {author = "John"}           -- meta (optional)
)
-- result.entry_id, result.origin_id, result.content_type

Batch Processing

local result, err = embeddings.add_batch({
    {
        content = "First chunk of text",
        content_type = "document_chunk",
        origin_id = "doc-123",
        context_id = "section-1",
        meta = {page = 1}
    },
    {
        content = "Second chunk of text",
        content_type = "document_chunk",
        origin_id = "doc-123",
        context_id = "section-2"
    }
})
-- result.count, result.items[]

Batches are automatically split based on token limits (8000 tokens per request).

Similarity Search

-- Basic search
local results, err = embeddings.search("find similar content", {
    limit = 10
})

-- Filtered search
local results, err = embeddings.search("query text", {
    content_type = "document_chunk",
    origin_id = "doc-123",
    context_id = "section-1",
    limit = 5
})

-- Results include similarity scores
for _, result in ipairs(results) do
    print(result.content, result.similarity)
end

Convenience Methods

-- Find by content type
local results = embeddings.find_by_type("query", "document_chunk", {
    limit = 10
})

-- Find within specific origin
local results = embeddings.find_by_origin("query", "doc-123", {
    content_type = "document_chunk",
    limit = 5
})

Low-level Repository

For direct vector operations:

local embedding_repo = require("embedding_repo")

-- Add with pre-generated embedding vector
local result, err = embedding_repo.add(
    content, content_type, origin_id, context_id, meta,
    {0.1, 0.2, 0.3, ...}  -- 512-dim vector
)

-- Batch add with vectors
embedding_repo.add_batch({
    {
        content = "...",
        content_type = "...",
        origin_id = "...",
        embedding = {0.1, 0.2, ...}
    }
})

-- Search with pre-generated query vector
local results = embedding_repo.search_by_embedding(
    {0.1, 0.2, 0.3, ...},
    {content_type = "...", limit = 10}
)

-- Get embeddings by origin
local entries = embedding_repo.get_by_origin("doc-123")

-- Delete
embedding_repo.delete_by_origin("doc-123")
embedding_repo.delete_by_entry("entry-uuid")

Database Schema

Requires an embeddings_512 table with a vector column. A migration is provided:

entries:
  - name: 01_create_embeddings_table
    kind: function.lua
    meta:
      type: migration
      target_db: app:db

PostgreSQL (pgvector)

CREATE TABLE embeddings_512 (
    entry_id UUID PRIMARY KEY,
    origin_id UUID NOT NULL,
    content_type VARCHAR(255) NOT NULL,
    context_id VARCHAR(255),
    embedding vector(512) NOT NULL,
    content TEXT NOT NULL,
    meta JSONB,
    created_at TIMESTAMP DEFAULT NOW(),
    updated_at TIMESTAMP DEFAULT NOW()
);

SQLite (vec0)

CREATE VIRTUAL TABLE embeddings_512 USING vec0(
    entry_id TEXT PRIMARY KEY,
    origin_id TEXT NOT NULL,
    origin_id_aux TEXT,
    content_type TEXT NOT NULL,
    context_id TEXT,
    embedding float[512],
    content TEXT NOT NULL,
    meta TEXT,
    created_at TEXT,
    updated_at TEXT
);

Configuration

Default embedding model: text-embedding-3-small with 512 dimensions.