feat: add provider-agnostic LLM extractor adapter

This commit is contained in:
2026-05-11 12:19:50 +09:00
parent 7a02621e40
commit 4f877a40fb
4 changed files with 367 additions and 0 deletions

View File

@@ -1,6 +1,7 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { IdentityDB } from '../src/core/identity-db';
import { LlmFactExtractor } from '../src/ingestion/llm-extractor';
import { NaiveExtractor } from '../src/ingestion/naive-extractor';
import type { FactExtractor } from '../src/ingestion/types';
@@ -51,4 +52,86 @@ describe('IdentityDB ingestion', () => {
const topic = await db.getTopicByName('TypeScript', { includeFacts: true });
expect(topic?.facts).toHaveLength(1);
});
it('ships an LLM extractor adapter that turns structured JSON responses into facts', async () => {
  // Capture the exact prompt handed to the model so we can assert on it later.
  let capturedPrompt = '';
  // Canned structured response the fake model will emit for the statement below.
  const modelResponse = {
    statement: 'I have worked with Bun and TypeScript since 2025.',
    summary: 'The speaker has Bun and TypeScript experience.',
    source: 'chat',
    confidence: 0.91,
    metadata: { channel: 'telegram' },
    topics: [
      { name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
      { name: 'Bun', category: 'entity', granularity: 'concrete', role: 'object' },
      { name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
      { name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
    ],
  };
  const extractor = new LlmFactExtractor({
    model: {
      async generateText(input) {
        capturedPrompt = input;
        return JSON.stringify(modelResponse);
      },
    },
    instructions: 'Prefer technology and time topics.',
  });
  const fact = await db.ingestStatement('I have worked with Bun and TypeScript since 2025.', {
    extractor,
  });
  // The adapter must thread both the custom instructions and the raw statement into the prompt.
  expect(capturedPrompt).toContain('Prefer technology and time topics.');
  expect(capturedPrompt).toContain('I have worked with Bun and TypeScript since 2025.');
  // Every structured field from the model response should land on the persisted fact.
  expect(fact.summary).toBe('The speaker has Bun and TypeScript experience.');
  expect(fact.source).toBe('chat');
  expect(fact.confidence).toBe(0.91);
  expect(fact.metadata).toEqual({ channel: 'telegram' });
  expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'Bun', 'TypeScript', '2025']);
});
it('parses JSON responses wrapped in markdown code fences', async () => {
  // The JSON body the fake model will bury inside a fenced markdown reply.
  const fencedPayload = JSON.stringify({
    statement: 'Bun powers TypeScript tooling.',
    topics: [
      { name: 'Bun', category: 'entity', granularity: 'concrete' },
      { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
    ],
  });
  const extractor = new LlmFactExtractor({
    model: {
      async generateText() {
        // Mimic a chatty model: preamble, then a ```json fence around the payload.
        return `Here is the extracted fact:\n\`\`\`json\n${fencedPayload}\n\`\`\``;
      },
    },
  });
  const fact = await db.ingestStatement('Bun powers TypeScript tooling.', {
    extractor,
  });
  // The adapter should strip the fence and still recover both topics.
  expect(fact.topics.map((topic) => topic.name)).toEqual(['Bun', 'TypeScript']);
});
it('rejects invalid LLM responses before writing facts', async () => {
  // A model that returns free text rather than JSON must cause ingestion to fail.
  const extractor = new LlmFactExtractor({
    model: {
      async generateText() {
        return 'not json at all';
      },
    },
  });
  const ingestion = db.ingestStatement('Bun powers TypeScript tooling.', {
    extractor,
  });
  await expect(ingestion).rejects.toThrow('LLM extractor returned invalid JSON.');
});
});