feat: add semantic fact search and embeddings
This commit is contained in:
170
tests/semantic-search.test.ts
Normal file
170
tests/semantic-search.test.ts
Normal file
@@ -0,0 +1,170 @@
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
|
||||
import { IdentityDB } from '../src/core/identity-db';
|
||||
import type { FactExtractor } from '../src/ingestion/types';
|
||||
import type { EmbeddingProvider } from '../src/types/api';
|
||||
|
||||
/**
 * Test double for `EmbeddingProvider` that needs no model or network access:
 * every vector it produces comes from the keyword rules in `embeddingFor`,
 * so results are fully deterministic.
 */
class FakeEmbeddingProvider implements EmbeddingProvider {
  model = 'fake-semantic-v1';
  // Matches the length of every vector returned by `embeddingFor`.
  dimensions = 3;

  /** Embeds a single string by delegating to `embeddingFor`. */
  async embed(input: string): Promise<number[]> {
    return embeddingFor(input);
  }

  /** Embeds every input through `embed`; the result order follows `inputs`. */
  async embedMany(inputs: string[]): Promise<number[][]> {
    return Promise.all(inputs.map((input) => this.embed(input)));
  }
}
|
||||
|
||||
/**
 * Maps a piece of text to a fixed 3-element vector using keyword rules, so
 * tests can reason about "semantic" closeness without a real embedding model.
 *
 * Matching is case-insensitive and substring-based. Rules are evaluated top to
 * bottom and the first one that matches wins, so their order is significant
 * (e.g. text mentioning both "typescript" and "tooling" but not "bun" takes
 * the tooling/runtime vector, not the plain TypeScript one).
 *
 * @param input - Arbitrary text to embed.
 * @returns A newly created 3-element vector for the first matching rule, or
 *   `[0.1, 0.1, 0.1]` when no keyword is present.
 */
function embeddingFor(input: string): number[] {
  const text = input.toLowerCase();
  const mentions = (keyword: string): boolean => text.includes(keyword);

  // Most specific rule first: Bun + TypeScript together.
  if (mentions('bun') && mentions('typescript')) {
    return [1, 0, 0];
  }

  // Tooling/runtime wording sits very close to the Bun + TypeScript vector.
  if (mentions('tooling') || mentions('runtime')) {
    return [0.98, 0.02, 0];
  }

  // Plain TypeScript text is close to the above, but slightly further away.
  if (mentions('typescript')) {
    return [0.9, 0.1, 0];
  }

  // Python occupies its own axis.
  if (mentions('python')) {
    return [0, 1, 0];
  }

  if (mentions('database')) {
    return [0, 0.2, 0.8];
  }

  // Fallback for text that contains none of the known keywords.
  return [0.1, 0.1, 0.1];
}
|
||||
|
||||
// Exercises fact indexing plus the two semantic lookups (`searchFacts` and
// `findSimilarFacts`) against an in-memory SQLite database, using the
// deterministic FakeEmbeddingProvider so expected rankings are predictable.
describe('IdentityDB semantic search', () => {
  let db: IdentityDB;
  let provider: FakeEmbeddingProvider;

  // Each test starts from a fresh database seeded with the same three facts.
  // Per `embeddingFor`, they embed to [1, 0, 0], [0.9, 0.1, 0] and [0, 1, 0]
  // respectively.
  beforeEach(async () => {
    provider = new FakeEmbeddingProvider();
    db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
    await db.initialize();

    await db.addFact({
      statement: 'Bun runs TypeScript tooling quickly.',
      topics: [
        { name: 'Bun', category: 'entity', granularity: 'concrete' },
        { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
      ],
    });

    await db.addFact({
      statement: 'TypeScript compiles to JavaScript.',
      topics: [
        { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
        { name: 'JavaScript', category: 'entity', granularity: 'concrete' },
      ],
    });

    await db.addFact({
      statement: 'Python uses indentation syntax.',
      topics: [
        { name: 'Python', category: 'entity', granularity: 'concrete' },
      ],
    });
  });

  afterEach(async () => {
    await db.close();
  });

  it('indexes facts and returns semantic search matches ordered by score', async () => {
    await db.indexFactEmbeddings({ provider });

    // The query contains "tooling"/"runtime", so it embeds to [0.98, 0.02, 0]:
    // nearest to the Bun fact, then the TypeScript fact, and far from Python.
    // NOTE(review): the ranking assumes a cosine-style similarity — confirm
    // against the searchFacts implementation.
    const matches = await db.searchFacts({
      query: 'TypeScript runtime tooling',
      provider,
      limit: 2,
    });

    expect(matches).toHaveLength(2);
    expect(matches[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
    expect(matches[1]?.statement).toBe('TypeScript compiles to JavaScript.');
    // Safe to non-null assert: the length check above guarantees two entries.
    expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
  });

  it('filters semantic search candidates by topic names', async () => {
    await db.indexFactEmbeddings({ provider });

    // Same query as above, but restricted to the "Python" topic: only the
    // Python fact carries that topic, so it must be the sole result even
    // though its vector is the furthest from the query.
    const matches = await db.searchFacts({
      query: 'TypeScript runtime tooling',
      provider,
      topicNames: ['Python'],
      limit: 5,
    });

    expect(matches.map((match) => match.statement)).toEqual(['Python uses indentation syntax.']);
  });

  it('finds similar facts from an input statement', async () => {
    await db.indexFactEmbeddings({ provider });

    // The input mentions both "bun" and "typescript", so it embeds to the
    // exact vector of the seeded Bun fact ([1, 0, 0]).
    const matches = await db.findSimilarFacts({
      statement: 'Bun makes TypeScript tooling fast.',
      provider,
      limit: 2,
    });

    expect(matches[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
    // Relies on at least two matches being returned; a shorter result makes
    // this line throw rather than fail an assertion.
    expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
  });
});
|
||||
|
||||
// Verifies that `ingestStatement` returns an already stored fact instead of
// creating a new one when the incoming statement is a semantic duplicate.
describe('IdentityDB dedup-aware ingestion', () => {
  let db: IdentityDB;
  let provider: FakeEmbeddingProvider;
  let extractor: FactExtractor;

  beforeEach(async () => {
    provider = new FakeEmbeddingProvider();
    // Stub extractor: echoes the input as the fact statement and always tags
    // it with the same two topics, so only the embeddings differ between runs.
    extractor = {
      async extract(input) {
        return {
          statement: input,
          topics: [
            { name: 'Bun', category: 'entity', granularity: 'concrete' },
            { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
          ],
        };
      },
    };

    db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
    await db.initialize();
  });

  afterEach(async () => {
    await db.close();
  });

  it('returns the existing fact when ingestion detects a semantic duplicate', async () => {
    const first = await db.ingestStatement('Bun runs TypeScript tooling quickly.', {
      extractor,
      embeddingProvider: provider,
    });

    // Both statements mention "bun" and "typescript", so `embeddingFor` gives
    // them the identical vector [1, 0, 0]; the second ingestion is therefore
    // expected to clear the 0.95 duplicate threshold.
    // NOTE(review): how the threshold is compared is not visible here —
    // confirm against the ingestStatement implementation.
    const second = await db.ingestStatement('Bun makes TypeScript tooling fast.', {
      extractor,
      embeddingProvider: provider,
      duplicateThreshold: 0.95,
    });

    const facts = await db.getTopicFacts('TypeScript');

    // The second call must hand back the first fact and store nothing new.
    expect(second.id).toBe(first.id);
    expect(facts).toHaveLength(1);
    expect(facts[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
  });
});
|
||||
Reference in New Issue
Block a user