feat: add semantic fact search and embeddings
This commit is contained in:
@@ -16,7 +16,7 @@ afterEach(async () => {
|
||||
});
|
||||
|
||||
describe('initializeSchema', () => {
|
||||
it('creates the topics, facts, fact_topics, topic_relations, and topic_aliases tables', async () => {
|
||||
it('creates the topics, facts, fact_embeddings, fact_topics, topic_relations, and topic_aliases tables', async () => {
|
||||
const connection = await createDatabase({ client: 'sqlite', filename: ':memory:' });
|
||||
openConnections.push(connection.destroy);
|
||||
|
||||
@@ -33,6 +33,7 @@ describe('initializeSchema', () => {
|
||||
|
||||
expect(tableNames).toContain('topics');
|
||||
expect(tableNames).toContain('facts');
|
||||
expect(tableNames).toContain('fact_embeddings');
|
||||
expect(tableNames).toContain('fact_topics');
|
||||
expect(tableNames).toContain('topic_relations');
|
||||
expect(tableNames).toContain('topic_aliases');
|
||||
@@ -46,6 +47,7 @@ describe('initializeSchema', () => {
|
||||
|
||||
const topicsColumns = await sql<{ name: string }>`PRAGMA table_info(topics)`.execute(connection.db);
|
||||
const factsColumns = await sql<{ name: string }>`PRAGMA table_info(facts)`.execute(connection.db);
|
||||
const factEmbeddingsColumns = await sql<{ name: string }>`PRAGMA table_info(fact_embeddings)`.execute(connection.db);
|
||||
const factTopicsColumns = await sql<{ name: string }>`PRAGMA table_info(fact_topics)`.execute(connection.db);
|
||||
const topicRelationsColumns = await sql<{ name: string }>`PRAGMA table_info(topic_relations)`.execute(connection.db);
|
||||
const topicAliasesColumns = await sql<{ name: string }>`PRAGMA table_info(topic_aliases)`.execute(connection.db);
|
||||
@@ -73,6 +75,16 @@ describe('initializeSchema', () => {
|
||||
'updated_at',
|
||||
]);
|
||||
|
||||
expect(factEmbeddingsColumns.rows.map((row) => row.name)).toEqual([
|
||||
'fact_id',
|
||||
'model',
|
||||
'dimensions',
|
||||
'embedding',
|
||||
'content_hash',
|
||||
'created_at',
|
||||
'updated_at',
|
||||
]);
|
||||
|
||||
expect(factTopicsColumns.rows.map((row) => row.name)).toEqual([
|
||||
'fact_id',
|
||||
'topic_id',
|
||||
|
||||
tests/semantic-search.test.ts | 170 lines | new file
@@ -0,0 +1,170 @@
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
|
||||
import { IdentityDB } from '../src/core/identity-db';
|
||||
import type { FactExtractor } from '../src/ingestion/types';
|
||||
import type { EmbeddingProvider } from '../src/types/api';
|
||||
|
||||
class FakeEmbeddingProvider implements EmbeddingProvider {
|
||||
model = 'fake-semantic-v1';
|
||||
dimensions = 3;
|
||||
|
||||
async embed(input: string): Promise<number[]> {
|
||||
return embeddingFor(input);
|
||||
}
|
||||
|
||||
async embedMany(inputs: string[]): Promise<number[][]> {
|
||||
return Promise.all(inputs.map((input) => this.embed(input)));
|
||||
}
|
||||
}
|
||||
|
||||
function embeddingFor(input: string): number[] {
|
||||
const normalized = input.toLowerCase();
|
||||
|
||||
if (normalized.includes('bun') && normalized.includes('typescript')) {
|
||||
return [1, 0, 0];
|
||||
}
|
||||
|
||||
if (normalized.includes('tooling') || normalized.includes('runtime')) {
|
||||
return [0.98, 0.02, 0];
|
||||
}
|
||||
|
||||
if (normalized.includes('typescript')) {
|
||||
return [0.9, 0.1, 0];
|
||||
}
|
||||
|
||||
if (normalized.includes('python')) {
|
||||
return [0, 1, 0];
|
||||
}
|
||||
|
||||
if (normalized.includes('database')) {
|
||||
return [0, 0.2, 0.8];
|
||||
}
|
||||
|
||||
return [0.1, 0.1, 0.1];
|
||||
}
|
||||
|
||||
// Exercises IdentityDB's embedding-backed search APIs end to end against an
// in-memory SQLite database, with FakeEmbeddingProvider supplying
// deterministic vectors so ranking assertions cannot flake.
describe('IdentityDB semantic search', () => {
  let db: IdentityDB;
  let provider: FakeEmbeddingProvider;

  beforeEach(async () => {
    provider = new FakeEmbeddingProvider();
    db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
    await db.initialize();

    // Seed three facts whose fake embeddings are far apart (see
    // embeddingFor), so similarity orderings below are unambiguous.
    await db.addFact({
      statement: 'Bun runs TypeScript tooling quickly.',
      topics: [
        { name: 'Bun', category: 'entity', granularity: 'concrete' },
        { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
      ],
    });

    await db.addFact({
      statement: 'TypeScript compiles to JavaScript.',
      topics: [
        { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
        { name: 'JavaScript', category: 'entity', granularity: 'concrete' },
      ],
    });

    await db.addFact({
      statement: 'Python uses indentation syntax.',
      topics: [
        { name: 'Python', category: 'entity', granularity: 'concrete' },
      ],
    });
  });

  afterEach(async () => {
    await db.close();
  });

  it('indexes facts and returns semantic search matches ordered by score', async () => {
    await db.indexFactEmbeddings({ provider });

    const matches = await db.searchFacts({
      query: 'TypeScript runtime tooling',
      provider,
      limit: 2,
    });

    // The query contains 'runtime'/'tooling', which embeds closest to the
    // Bun fact's vector, so it must rank above the plain TypeScript fact.
    expect(matches).toHaveLength(2);
    expect(matches[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
    expect(matches[1]?.statement).toBe('TypeScript compiles to JavaScript.');
    expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
  });

  it('filters semantic search candidates by topic names', async () => {
    await db.indexFactEmbeddings({ provider });

    const matches = await db.searchFacts({
      query: 'TypeScript runtime tooling',
      provider,
      topicNames: ['Python'],
      limit: 5,
    });

    // topicNames restricts the candidate set before similarity ranking, so
    // only the Python fact can come back despite the TypeScript-leaning query.
    expect(matches.map((match) => match.statement)).toEqual(['Python uses indentation syntax.']);
  });

  it('finds similar facts from an input statement', async () => {
    await db.indexFactEmbeddings({ provider });

    const matches = await db.findSimilarFacts({
      statement: 'Bun makes TypeScript tooling fast.',
      provider,
      limit: 2,
    });

    // The paraphrase contains 'bun' + 'typescript', so embeddingFor gives it
    // the same vector as the seeded Bun fact, making it the closest match.
    expect(matches[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
    expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
  });
});
|
||||
|
||||
// Verifies that ingestStatement deduplicates semantically-equivalent
// statements: a near-duplicate above duplicateThreshold should resolve to
// the existing fact instead of inserting a new row.
describe('IdentityDB dedup-aware ingestion', () => {
  let db: IdentityDB;
  let provider: FakeEmbeddingProvider;
  let extractor: FactExtractor;

  beforeEach(async () => {
    provider = new FakeEmbeddingProvider();
    // Stub extractor: echoes the raw input as the fact statement with fixed
    // Bun/TypeScript topics, keeping extraction fully deterministic.
    extractor = {
      async extract(input) {
        return {
          statement: input,
          topics: [
            { name: 'Bun', category: 'entity', granularity: 'concrete' },
            { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
          ],
        };
      },
    };

    db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
    await db.initialize();
  });

  afterEach(async () => {
    await db.close();
  });

  it('returns the existing fact when ingestion detects a semantic duplicate', async () => {
    const first = await db.ingestStatement('Bun runs TypeScript tooling quickly.', {
      extractor,
      embeddingProvider: provider,
    });

    // Both statements contain 'bun' + 'typescript', so FakeEmbeddingProvider
    // gives them identical vectors; similarity 1.0 exceeds the 0.95 threshold.
    const second = await db.ingestStatement('Bun makes TypeScript tooling fast.', {
      extractor,
      embeddingProvider: provider,
      duplicateThreshold: 0.95,
    });

    const facts = await db.getTopicFacts('TypeScript');

    // Dedup resolves to the first fact: same id, one stored row, and the
    // original statement text is preserved (not overwritten by the paraphrase).
    expect(second.id).toBe(first.id);
    expect(facts).toHaveLength(1);
    expect(facts[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
  });
});
|
||||
Reference in New Issue
Block a user