// File listing metadata: 171 lines, 4.7 KiB, TypeScript.
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
|
|
|
import { IdentityDB } from '../src/core/identity-db';
|
|
import type { FactExtractor } from '../src/ingestion/types';
|
|
import type { EmbeddingProvider } from '../src/types/api';
|
|
|
|
/**
 * Deterministic in-memory embedding provider used by the tests in this file.
 *
 * Vectors come from the keyword-based `embeddingFor` helper, so the same
 * input text always yields the same 3-element vector and no network or model
 * access is needed.
 */
class FakeEmbeddingProvider implements EmbeddingProvider {
  /** Identifier reported for this fake model. */
  readonly model = 'fake-semantic-v1';

  /** Length of every vector returned by `embed`. */
  readonly dimensions = 3;

  /**
   * Embeds a single piece of text.
   *
   * @param input - Text to embed.
   * @returns The vector `embeddingFor` assigns to `input`.
   */
  async embed(input: string): Promise<number[]> {
    return embeddingFor(input);
  }

  /**
   * Embeds several texts, preserving input order.
   *
   * @param inputs - Texts to embed.
   * @returns One vector per entry of `inputs`, in the same order.
   */
  async embedMany(inputs: string[]): Promise<number[][]> {
    return Promise.all(inputs.map((input) => this.embed(input)));
  }
}
|
|
|
|
/**
 * Maps text to a fixed 3-element vector based on case-insensitive keyword
 * matching, giving the tests predictable "semantic" neighbours.
 *
 * Rules are checked in order and the first match wins, so the order matters:
 * text mentioning both "bun" and "typescript" is classified before the more
 * general "tooling"/"runtime" and "typescript" rules are considered.
 *
 * @param input - Text to classify.
 * @returns The vector for the first matching rule, or `[0.1, 0.1, 0.1]` when
 *   no keyword matches.
 */
function embeddingFor(input: string): number[] {
  const normalized = input.toLowerCase();

  // Most specific rule first: both keywords must be present.
  if (normalized.includes('bun') && normalized.includes('typescript')) {
    return [1, 0, 0];
  }

  // Close to, but not identical with, the Bun + TypeScript vector.
  if (normalized.includes('tooling') || normalized.includes('runtime')) {
    return [0.98, 0.02, 0];
  }

  // Generic TypeScript text sits a little further from the first axis.
  if (normalized.includes('typescript')) {
    return [0.9, 0.1, 0];
  }

  if (normalized.includes('python')) {
    return [0, 1, 0];
  }

  if (normalized.includes('database')) {
    return [0, 0.2, 0.8];
  }

  // Fallback for text that matches none of the keywords above.
  return [0.1, 0.1, 0.1];
}
|
|
|
|
/**
 * Semantic search over facts stored in an in-memory SQLite IdentityDB.
 *
 * Every test starts from a fresh database seeded with three facts and uses
 * FakeEmbeddingProvider, so the vectors involved are fully deterministic.
 */
describe('IdentityDB semantic search', () => {
  let db: IdentityDB;
  let provider: FakeEmbeddingProvider;

  beforeEach(async () => {
    provider = new FakeEmbeddingProvider();
    db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
    await db.initialize();

    // Seed facts: embeddingFor maps them to [1, 0, 0], [0.9, 0.1, 0] and
    // [0, 1, 0] respectively.
    await db.addFact({
      statement: 'Bun runs TypeScript tooling quickly.',
      topics: [
        { name: 'Bun', category: 'entity', granularity: 'concrete' },
        { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
      ],
    });

    await db.addFact({
      statement: 'TypeScript compiles to JavaScript.',
      topics: [
        { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
        { name: 'JavaScript', category: 'entity', granularity: 'concrete' },
      ],
    });

    await db.addFact({
      statement: 'Python uses indentation syntax.',
      topics: [
        { name: 'Python', category: 'entity', granularity: 'concrete' },
      ],
    });
  });

  afterEach(async () => {
    await db.close();
  });

  it('indexes facts and returns semantic search matches ordered by score', async () => {
    await db.indexFactEmbeddings({ provider });

    // The query has no "bun", so embeddingFor yields [0.98, 0.02, 0] for it:
    // nearest to the Bun fact, then to the TypeScript fact.
    const matches = await db.searchFacts({
      query: 'TypeScript runtime tooling',
      provider,
      limit: 2,
    });

    expect(matches).toHaveLength(2);
    expect(matches[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
    expect(matches[1]?.statement).toBe('TypeScript compiles to JavaScript.');
    expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
  });

  it('filters semantic search candidates by topic names', async () => {
    await db.indexFactEmbeddings({ provider });

    // Only one seeded fact carries the Python topic, so it is the sole
    // expected match even though the query text is about TypeScript.
    const matches = await db.searchFacts({
      query: 'TypeScript runtime tooling',
      provider,
      topicNames: ['Python'],
      limit: 5,
    });

    expect(matches.map((match) => match.statement)).toEqual(['Python uses indentation syntax.']);
  });

  it('finds similar facts from an input statement', async () => {
    await db.indexFactEmbeddings({ provider });

    const matches = await db.findSimilarFacts({
      statement: 'Bun makes TypeScript tooling fast.',
      provider,
      limit: 2,
    });

    // Check the length first so a short result fails with a clear assertion
    // message instead of a TypeError on matches[1] below.
    expect(matches).toHaveLength(2);
    expect(matches[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
    expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
  });
});
|
|
|
|
/**
 * Ingestion should reuse an existing fact when a new statement is a semantic
 * duplicate of it.
 *
 * Uses a stub extractor and FakeEmbeddingProvider against a fresh in-memory
 * SQLite IdentityDB for each test.
 */
describe('IdentityDB dedup-aware ingestion', () => {
  let db: IdentityDB;
  let provider: FakeEmbeddingProvider;
  let extractor: FactExtractor;

  beforeEach(async () => {
    provider = new FakeEmbeddingProvider();

    // Stub extractor: echoes the input as the statement and always tags it
    // with the same two topics.
    extractor = {
      async extract(input) {
        return {
          statement: input,
          topics: [
            { name: 'Bun', category: 'entity', granularity: 'concrete' },
            { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
          ],
        };
      },
    };

    db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
    await db.initialize();
  });

  afterEach(async () => {
    await db.close();
  });

  it('returns the existing fact when ingestion detects a semantic duplicate', async () => {
    const first = await db.ingestStatement('Bun runs TypeScript tooling quickly.', {
      extractor,
      embeddingProvider: provider,
    });

    // Both statements mention "bun" and "typescript", so embeddingFor gives
    // them the identical vector [1, 0, 0]; the second is expected to be
    // treated as a duplicate at the 0.95 threshold.
    const second = await db.ingestStatement('Bun makes TypeScript tooling fast.', {
      extractor,
      embeddingProvider: provider,
      duplicateThreshold: 0.95,
    });

    const facts = await db.getTopicFacts('TypeScript');

    expect(second.id).toBe(first.id);
    expect(facts).toHaveLength(1);
    expect(facts[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
  });
});
|