import { afterEach, beforeEach, describe, expect, it } from 'vitest';

import { IdentityDB } from '../src/core/identity-db';
import type { FactExtractor } from '../src/ingestion/types';
import type { EmbeddingProvider } from '../src/types/api';

/**
 * Deterministic embedding provider for tests.
 *
 * Every vector comes from {@link embeddingFor}, so results depend only on
 * keywords in the input text and never on an external model.
 */
class FakeEmbeddingProvider implements EmbeddingProvider {
  model = 'fake-semantic-v1';
  dimensions = 3;

  /** Returns the keyword-derived vector for a single input string. */
  async embed(input: string): Promise<number[]> {
    return embeddingFor(input);
  }

  /** Embeds each input via {@link embed}, preserving input order. */
  async embedMany(inputs: string[]): Promise<number[][]> {
    return Promise.all(inputs.map((input) => this.embed(input)));
  }
}

/**
 * Maps text to a fixed 3-component vector based on case-insensitive keywords.
 *
 * Rules are checked top to bottom and the first match wins, so the order of
 * the checks below is significant (e.g. text containing both "bun" and
 * "typescript" never reaches the "tooling"/"runtime" rule).
 *
 * @param input - Text to embed.
 * @returns A 3-element vector; `[0.1, 0.1, 0.1]` when no keyword matches.
 */
function embeddingFor(input: string): number[] {
  const normalized = input.toLowerCase();

  if (normalized.includes('bun') && normalized.includes('typescript')) {
    return [1, 0, 0];
  }

  if (normalized.includes('tooling') || normalized.includes('runtime')) {
    return [0.98, 0.02, 0];
  }

  if (normalized.includes('typescript')) {
    return [0.9, 0.1, 0];
  }

  if (normalized.includes('python')) {
    return [0, 1, 0];
  }

  if (normalized.includes('database')) {
    return [0, 0.2, 0.8];
  }

  return [0.1, 0.1, 0.1];
}

describe('IdentityDB semantic search', () => {
  let db: IdentityDB;
  let provider: FakeEmbeddingProvider;

  // Each test starts from a fresh in-memory SQLite database seeded with the
  // same three facts.
  beforeEach(async () => {
    provider = new FakeEmbeddingProvider();
    db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
    await db.initialize();

    await db.addFact({
      statement: 'Bun runs TypeScript tooling quickly.',
      topics: [
        { name: 'Bun', category: 'entity', granularity: 'concrete' },
        { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
      ],
    });

    await db.addFact({
      statement: 'TypeScript compiles to JavaScript.',
      topics: [
        { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
        { name: 'JavaScript', category: 'entity', granularity: 'concrete' },
      ],
    });

    await db.addFact({
      statement: 'Python uses indentation syntax.',
      topics: [{ name: 'Python', category: 'entity', granularity: 'concrete' }],
    });
  });

  afterEach(async () => {
    await db.close();
  });

  it('indexes facts and returns semantic search matches ordered by score', async () => {
    await db.indexFactEmbeddings({ provider });

    // The query contains "tooling"/"runtime" (but not "bun"), so the fake
    // provider embeds it as [0.98, 0.02, 0].
    const matches = await db.searchFacts({
      query: 'TypeScript runtime tooling',
      provider,
      limit: 2,
    });

    expect(matches).toHaveLength(2);
    expect(matches[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
    expect(matches[1]?.statement).toBe('TypeScript compiles to JavaScript.');
    expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
  });

  it('filters semantic search candidates by topic names', async () => {
    await db.indexFactEmbeddings({ provider });

    const matches = await db.searchFacts({
      query: 'TypeScript runtime tooling',
      provider,
      topicNames: ['Python'],
      limit: 5,
    });

    // Only the fact seeded with the "Python" topic is expected back, even
    // though the query text itself is about TypeScript.
    expect(matches.map((match) => match.statement)).toEqual(['Python uses indentation syntax.']);
  });

  it('finds similar facts from an input statement', async () => {
    await db.indexFactEmbeddings({ provider });

    const matches = await db.findSimilarFacts({
      statement: 'Bun makes TypeScript tooling fast.',
      provider,
      limit: 2,
    });

    // Guard before indexing into matches[1]: without it, a short result
    // surfaces as a TypeError instead of a readable assertion failure.
    expect(matches.length).toBeGreaterThanOrEqual(2);
    expect(matches[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
    expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
  });
});

describe('IdentityDB dedup-aware ingestion', () => {
  let db: IdentityDB;
  let provider: FakeEmbeddingProvider;
  let extractor: FactExtractor;

  beforeEach(async () => {
    provider = new FakeEmbeddingProvider();

    // Stub extractor: echoes the input as the statement and always tags it
    // with the same two topics, regardless of content.
    extractor = {
      async extract(input) {
        return {
          statement: input,
          topics: [
            { name: 'Bun', category: 'entity', granularity: 'concrete' },
            { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
          ],
        };
      },
    };

    db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
    await db.initialize();
  });

  afterEach(async () => {
    await db.close();
  });

  it('returns the existing fact when ingestion detects a semantic duplicate', async () => {
    // Both statements contain "bun" and "typescript", so the fake provider
    // produces the identical vector [1, 0, 0] for each of them.
    const first = await db.ingestStatement('Bun runs TypeScript tooling quickly.', {
      extractor,
      embeddingProvider: provider,
    });

    const second = await db.ingestStatement('Bun makes TypeScript tooling fast.', {
      extractor,
      embeddingProvider: provider,
      duplicateThreshold: 0.95,
    });

    const facts = await db.getTopicFacts('TypeScript');

    expect(second.id).toBe(first.id);
    expect(facts).toHaveLength(1);
    expect(facts[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
  });
});