feat: add semantic fact search and embeddings

This commit is contained in:
2026-05-11 12:05:47 +09:00
parent 428f5021e8
commit 810f4a6bf2
10 changed files with 529 additions and 4 deletions

View File

@@ -16,7 +16,7 @@ afterEach(async () => {
});
describe('initializeSchema', () => {
it('creates the topics, facts, fact_topics, topic_relations, and topic_aliases tables', async () => {
it('creates the topics, facts, fact_embeddings, fact_topics, topic_relations, and topic_aliases tables', async () => {
const connection = await createDatabase({ client: 'sqlite', filename: ':memory:' });
openConnections.push(connection.destroy);
@@ -33,6 +33,7 @@ describe('initializeSchema', () => {
expect(tableNames).toContain('topics');
expect(tableNames).toContain('facts');
expect(tableNames).toContain('fact_embeddings');
expect(tableNames).toContain('fact_topics');
expect(tableNames).toContain('topic_relations');
expect(tableNames).toContain('topic_aliases');
@@ -46,6 +47,7 @@ describe('initializeSchema', () => {
const topicsColumns = await sql<{ name: string }>`PRAGMA table_info(topics)`.execute(connection.db);
const factsColumns = await sql<{ name: string }>`PRAGMA table_info(facts)`.execute(connection.db);
const factEmbeddingsColumns = await sql<{ name: string }>`PRAGMA table_info(fact_embeddings)`.execute(connection.db);
const factTopicsColumns = await sql<{ name: string }>`PRAGMA table_info(fact_topics)`.execute(connection.db);
const topicRelationsColumns = await sql<{ name: string }>`PRAGMA table_info(topic_relations)`.execute(connection.db);
const topicAliasesColumns = await sql<{ name: string }>`PRAGMA table_info(topic_aliases)`.execute(connection.db);
@@ -73,6 +75,16 @@ describe('initializeSchema', () => {
'updated_at',
]);
expect(factEmbeddingsColumns.rows.map((row) => row.name)).toEqual([
'fact_id',
'model',
'dimensions',
'embedding',
'content_hash',
'created_at',
'updated_at',
]);
expect(factTopicsColumns.rows.map((row) => row.name)).toEqual([
'fact_id',
'topic_id',

View File

@@ -0,0 +1,170 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { IdentityDB } from '../src/core/identity-db';
import type { FactExtractor } from '../src/ingestion/types';
import type { EmbeddingProvider } from '../src/types/api';
/**
 * In-test stand-in for a real embedding backend.
 *
 * Every vector comes from the keyword-based `embeddingFor` helper in this
 * file, so results are fixed by the input text alone.
 */
class FakeEmbeddingProvider implements EmbeddingProvider {
  model = 'fake-semantic-v1';
  dimensions = 3;

  /** Resolves to the canned vector that `embeddingFor` assigns to `input`. */
  async embed(input: string): Promise<number[]> {
    const vector = embeddingFor(input);
    return vector;
  }

  /** Embeds each input through `embed`, keeping results in input order. */
  async embedMany(inputs: string[]): Promise<number[][]> {
    const pending = inputs.map((text) => this.embed(text));
    return Promise.all(pending);
  }
}
/**
 * Maps text to a hand-picked 3-element vector using case-insensitive
 * keyword checks.
 *
 * @param input - Text to classify; matching is done on its lowercased form.
 * @returns A freshly created vector for the first rule that matches, or
 *   `[0.1, 0.1, 0.1]` when no keyword is present.
 */
function embeddingFor(input: string): number[] {
  const text = input.toLowerCase();
  const mentions = (keyword: string): boolean => text.includes(keyword);

  // Rules are ordered: the first one whose condition holds wins, so the more
  // specific "bun + typescript" case must stay ahead of plain "typescript".
  const rules: Array<[matched: boolean, vector: number[]]> = [
    [mentions('bun') && mentions('typescript'), [1, 0, 0]],
    [mentions('tooling') || mentions('runtime'), [0.98, 0.02, 0]],
    [mentions('typescript'), [0.9, 0.1, 0]],
    [mentions('python'), [0, 1, 0]],
    [mentions('database'), [0, 0.2, 0.8]],
  ];

  const hit = rules.find(([matched]) => matched);
  return hit ? hit[1] : [0.1, 0.1, 0.1];
}
// Exercises embedding indexing, semantic search, topic filtering, and
// similar-fact lookup against an in-memory SQLite database, using the fake
// embedding provider so every vector is fixed by the input text.
describe('IdentityDB semantic search', () => {
  let db: IdentityDB;
  let provider: FakeEmbeddingProvider;

  // Every test starts from a fresh database seeded with the same three facts.
  beforeEach(async () => {
    provider = new FakeEmbeddingProvider();
    db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
    await db.initialize();

    await db.addFact({
      statement: 'Bun runs TypeScript tooling quickly.',
      topics: [
        { name: 'Bun', category: 'entity', granularity: 'concrete' },
        { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
      ],
    });
    await db.addFact({
      statement: 'TypeScript compiles to JavaScript.',
      topics: [
        { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
        { name: 'JavaScript', category: 'entity', granularity: 'concrete' },
      ],
    });
    await db.addFact({
      statement: 'Python uses indentation syntax.',
      topics: [
        { name: 'Python', category: 'entity', granularity: 'concrete' },
      ],
    });
  });

  afterEach(async () => {
    await db.close();
  });

  it('indexes facts and returns semantic search matches ordered by score', async () => {
    await db.indexFactEmbeddings({ provider });

    const matches = await db.searchFacts({
      query: 'TypeScript runtime tooling',
      provider,
      limit: 2,
    });

    expect(matches).toHaveLength(2);
    expect(matches[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
    expect(matches[1]?.statement).toBe('TypeScript compiles to JavaScript.');
    expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
  });

  it('filters semantic search candidates by topic names', async () => {
    await db.indexFactEmbeddings({ provider });

    // The query text never mentions Python; only the topic filter should
    // decide which fact comes back.
    const matches = await db.searchFacts({
      query: 'TypeScript runtime tooling',
      provider,
      topicNames: ['Python'],
      limit: 5,
    });

    expect(matches.map((match) => match.statement)).toEqual(['Python uses indentation syntax.']);
  });

  it('finds similar facts from an input statement', async () => {
    await db.indexFactEmbeddings({ provider });

    const matches = await db.findSimilarFacts({
      statement: 'Bun makes TypeScript tooling fast.',
      provider,
      limit: 2,
    });

    // Assert the length before indexing: without this, a short result would
    // surface as a TypeError on `matches[1]!.score` instead of a readable
    // assertion failure. It also mirrors the length check in the first test.
    expect(matches).toHaveLength(2);
    expect(matches[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
    expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
  });
});
// Checks that ingesting a statement whose embedding closely matches an
// already stored fact hands back the stored fact instead of adding a new one.
describe('IdentityDB dedup-aware ingestion', () => {
  let db: IdentityDB;
  let provider: FakeEmbeddingProvider;
  let extractor: FactExtractor;

  beforeEach(async () => {
    provider = new FakeEmbeddingProvider();

    // Stub extractor: echoes the raw input as the statement and always tags
    // it with the Bun and TypeScript topics.
    extractor = {
      extract: async (input) => ({
        statement: input,
        topics: [
          { name: 'Bun', category: 'entity', granularity: 'concrete' },
          { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
        ],
      }),
    };

    db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
    await db.initialize();
  });

  afterEach(async () => {
    await db.close();
  });

  it('returns the existing fact when ingestion detects a semantic duplicate', async () => {
    const ingestOptions = { extractor, embeddingProvider: provider };

    // Both statements mention "bun" and "typescript", so the fake provider
    // gives them the same vector (assuming the statement text is what gets
    // embedded — confirm against ingestStatement).
    const original = await db.ingestStatement('Bun runs TypeScript tooling quickly.', ingestOptions);
    const duplicate = await db.ingestStatement('Bun makes TypeScript tooling fast.', {
      ...ingestOptions,
      duplicateThreshold: 0.95,
    });

    const typescriptFacts = await db.getTopicFacts('TypeScript');

    expect(duplicate.id).toBe(original.id);
    expect(typescriptFacts).toHaveLength(1);
    expect(typescriptFacts[0]?.statement).toBe('Bun runs TypeScript tooling quickly.');
  });
});