// IdentityDB/tests/ingestion.test.ts

import { afterEach, beforeEach, describe, expect, it } from 'vitest';

import { IdentityDB } from '../src/core/identity-db';
import { LlmFactExtractor } from '../src/ingestion/llm-extractor';
import { NaiveExtractor } from '../src/ingestion/naive-extractor';
import type { FactExtractor } from '../src/ingestion/types';

/**
 * Exercises `IdentityDB.ingestStatement` with the three extractor flavours
 * used in this suite: an inline stub, `NaiveExtractor`, and `LlmFactExtractor`
 * backed by a fake model object.
 */
describe('IdentityDB ingestion', () => {
  // Input shared by the stub-extractor and naive-extractor cases.
  const typescriptStatement = 'I have worked with TypeScript since 2025.';

  let db: IdentityDB;

  // Connect to a `:memory:` SQLite database and initialize it before every test.
  beforeEach(async () => {
    db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
    await db.initialize();
  });

  // Close the connection opened by the matching `beforeEach`.
  afterEach(async () => {
    await db.close();
  });

  it('ingests a statement using a provided extractor', async () => {
    // Stub extractor: echoes the input as the statement and always reports
    // the same three topics, independent of the input text.
    const stubExtractor: FactExtractor = {
      extract: async (input) => ({
        statement: input,
        topics: [
          { name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
          { name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
          { name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
        ],
      }),
    };

    const ingested = await db.ingestStatement(typescriptStatement, {
      extractor: stubExtractor,
    });

    const topicNames = ingested.topics.map(({ name }) => name);
    expect(topicNames).toEqual(['I', 'TypeScript', '2025']);

    // The stored fact should be returned when querying by two of its topics.
    const linkedFacts = await db.getTopicFactsLinkedTo('TypeScript', '2025');
    expect(linkedFacts).toHaveLength(1);

    const [onlyLinkedFact] = linkedFacts;
    expect(onlyLinkedFact?.statement).toBe(typescriptStatement);
  });

  it('ships a deterministic naive extractor for local usage', async () => {
    const naive = new NaiveExtractor();

    const ingested = await db.ingestStatement(typescriptStatement, { extractor: naive });

    const topicNames = ingested.topics.map(({ name }) => name);
    expect(topicNames).toEqual(['I', 'TypeScript', '2025']);

    // Looking the topic up with `includeFacts` should surface the ingested fact.
    const typescriptTopic = await db.getTopicByName('TypeScript', { includeFacts: true });
    expect(typescriptTopic?.facts).toHaveLength(1);
  });

  it('ships an LLM extractor adapter that returns structured facts from the model', async () => {
    const bunStatement = 'I have worked with Bun and TypeScript since 2025.';
    const extraInstructions = 'Prefer technology and time topics.';

    // Records whatever the adapter passes to the fake model so it can be inspected below.
    let capturedPrompt = '';

    const llmExtractor = new LlmFactExtractor({
      model: {
        // Fake model: remembers its input and replies with a fixed structured fact.
        generateText: async (input) => {
          capturedPrompt = input;

          return {
            statement: bunStatement,
            summary: 'The speaker has Bun and TypeScript experience.',
            source: 'chat',
            confidence: 0.91,
            metadata: { channel: 'telegram' },
            topics: [
              { name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
              { name: 'Bun', category: 'entity', granularity: 'concrete', role: 'object' },
              { name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
              { name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
            ],
          };
        },
      },
      instructions: extraInstructions,
    });

    const ingested = await db.ingestStatement(bunStatement, { extractor: llmExtractor });

    // The prompt handed to the model must carry both the custom instructions and the input.
    expect(capturedPrompt).toContain(extraInstructions);
    expect(capturedPrompt).toContain(bunStatement);

    // Fields supplied by the model should appear unchanged on the stored fact.
    expect(ingested.summary).toBe('The speaker has Bun and TypeScript experience.');
    expect(ingested.source).toBe('chat');
    expect(ingested.confidence).toBe(0.91);
    expect(ingested.metadata).toEqual({ channel: 'telegram' });

    const topicNames = ingested.topics.map(({ name }) => name);
    expect(topicNames).toEqual(['I', 'Bun', 'TypeScript', '2025']);
  });
});