feat: add provider-agnostic LLM extractor adapter
This commit is contained in:
@@ -2,6 +2,7 @@ export * from './adapters';
|
||||
export * from './core/identity-db';
|
||||
export * from './core/migrations';
|
||||
export * from './ingestion/extractor';
|
||||
export * from './ingestion/llm-extractor';
|
||||
export * from './ingestion/naive-extractor';
|
||||
export * from './ingestion/types';
|
||||
export * from './types/api';
|
||||
|
||||
273
src/ingestion/llm-extractor.ts
Normal file
273
src/ingestion/llm-extractor.ts
Normal file
@@ -0,0 +1,273 @@
|
||||
import { IdentityDBError } from '../core/errors';
|
||||
import type { TopicCategory, TopicGranularity } from '../types/domain';
|
||||
import type {
|
||||
ExtractedFact,
|
||||
FactExtractor,
|
||||
LlmFactExtractorOptions,
|
||||
} from './types';
|
||||
|
||||
// Base prompt sent to the model on every extraction request. It pins the
// output contract: exactly one fact, JSON only (no markdown or prose), the
// exact ExtractedFact/topic shape, and topics limited to what the input
// explicitly supports. Caller-supplied instructions are appended after this
// block by buildPrompt.
const DEFAULT_INSTRUCTIONS = [
  'Extract one structured fact from the user input.',
  'Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.',
  'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.',
  'Only include topics that are explicitly supported by the input.',
].join('\n');
|
||||
|
||||
export class LlmFactExtractor implements FactExtractor {
|
||||
constructor(private readonly options: LlmFactExtractorOptions) {}
|
||||
|
||||
async extract(input: string): Promise<ExtractedFact> {
|
||||
const prompt = this.buildPrompt(input);
|
||||
const response = await this.options.model.generateText(prompt);
|
||||
return parseLlmExtractedFactResponse(response);
|
||||
}
|
||||
|
||||
private buildPrompt(input: string): string {
|
||||
if (this.options.promptBuilder) {
|
||||
return this.options.promptBuilder(input, this.options.instructions);
|
||||
}
|
||||
|
||||
const instructions = this.options.instructions?.trim();
|
||||
|
||||
return [
|
||||
DEFAULT_INSTRUCTIONS,
|
||||
instructions && instructions.length > 0 ? `Additional instructions:\n${instructions}` : null,
|
||||
`Input:\n${input.trim()}`,
|
||||
]
|
||||
.filter((value): value is string => value !== null)
|
||||
.join('\n\n');
|
||||
}
|
||||
}
|
||||
|
||||
export function parseLlmExtractedFactResponse(response: string): ExtractedFact {
|
||||
const payload = parseJsonCandidate(response);
|
||||
|
||||
if (!isRecord(payload)) {
|
||||
throw new IdentityDBError('LLM extractor response must be a JSON object.');
|
||||
}
|
||||
|
||||
const topics = parseTopics(payload.topics);
|
||||
const extracted: ExtractedFact = { topics };
|
||||
|
||||
const statement = optionalString(payload.statement);
|
||||
if (statement !== undefined) {
|
||||
extracted.statement = statement;
|
||||
}
|
||||
|
||||
const summary = optionalNullableString(payload.summary);
|
||||
if (summary !== undefined) {
|
||||
extracted.summary = summary;
|
||||
}
|
||||
|
||||
const source = optionalNullableString(payload.source);
|
||||
if (source !== undefined) {
|
||||
extracted.source = source;
|
||||
}
|
||||
|
||||
const confidence = optionalNullableNumber(payload.confidence);
|
||||
if (confidence !== undefined) {
|
||||
extracted.confidence = confidence;
|
||||
}
|
||||
|
||||
const metadata = optionalMetadata(payload.metadata);
|
||||
if (metadata !== undefined) {
|
||||
extracted.metadata = metadata;
|
||||
}
|
||||
|
||||
return extracted;
|
||||
}
|
||||
|
||||
function parseJsonCandidate(response: string): unknown {
|
||||
const trimmed = response.trim();
|
||||
|
||||
for (const candidate of collectJsonCandidates(trimmed)) {
|
||||
try {
|
||||
return JSON.parse(candidate);
|
||||
} catch {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
throw new IdentityDBError('LLM extractor returned invalid JSON.');
|
||||
}
|
||||
|
||||
function collectJsonCandidates(response: string): string[] {
|
||||
const candidates = new Set<string>();
|
||||
candidates.add(response);
|
||||
|
||||
const fencePattern = /```(?:json)?\s*([\s\S]*?)```/gi;
|
||||
let match: RegExpExecArray | null = fencePattern.exec(response);
|
||||
|
||||
while (match) {
|
||||
const candidate = match[1]?.trim();
|
||||
if (candidate) {
|
||||
candidates.add(candidate);
|
||||
}
|
||||
|
||||
match = fencePattern.exec(response);
|
||||
}
|
||||
|
||||
const firstBrace = response.indexOf('{');
|
||||
const lastBrace = response.lastIndexOf('}');
|
||||
if (firstBrace >= 0 && lastBrace > firstBrace) {
|
||||
candidates.add(response.slice(firstBrace, lastBrace + 1));
|
||||
}
|
||||
|
||||
return Array.from(candidates);
|
||||
}
|
||||
|
||||
function parseTopics(value: unknown): ExtractedFact['topics'] {
|
||||
if (!Array.isArray(value)) {
|
||||
throw new IdentityDBError('LLM extractor response must include a topics array.');
|
||||
}
|
||||
|
||||
return value.map((entry) => parseTopic(entry));
|
||||
}
|
||||
|
||||
function parseTopic(value: unknown): ExtractedFact['topics'][number] {
|
||||
if (!isRecord(value)) {
|
||||
throw new IdentityDBError('LLM extractor topics must be JSON objects.');
|
||||
}
|
||||
|
||||
const name = optionalString(value.name)?.trim();
|
||||
if (!name) {
|
||||
throw new IdentityDBError('LLM extractor topics must include a non-empty name.');
|
||||
}
|
||||
|
||||
const topic: ExtractedFact['topics'][number] = { name };
|
||||
|
||||
const category = optionalTopicCategory(value.category);
|
||||
if (category !== undefined) {
|
||||
topic.category = category;
|
||||
}
|
||||
|
||||
const granularity = optionalTopicGranularity(value.granularity);
|
||||
if (granularity !== undefined) {
|
||||
topic.granularity = granularity;
|
||||
}
|
||||
|
||||
const role = optionalNullableString(value.role);
|
||||
if (role !== undefined) {
|
||||
topic.role = role;
|
||||
}
|
||||
|
||||
const description = optionalNullableString(value.description);
|
||||
if (description !== undefined) {
|
||||
topic.description = description;
|
||||
}
|
||||
|
||||
const metadata = optionalMetadata(value.metadata);
|
||||
if (metadata !== undefined) {
|
||||
topic.metadata = metadata;
|
||||
}
|
||||
|
||||
return topic;
|
||||
}
|
||||
|
||||
function optionalString(value: unknown): string | undefined {
|
||||
if (value === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
if (typeof value !== 'string') {
|
||||
throw new IdentityDBError('LLM extractor expected a string field.');
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
function optionalNullableString(value: unknown): string | null | undefined {
|
||||
if (value === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
if (value === null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (typeof value !== 'string') {
|
||||
throw new IdentityDBError('LLM extractor expected a nullable string field.');
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
function optionalNullableNumber(value: unknown): number | null | undefined {
|
||||
if (value === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
if (value === null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (typeof value !== 'number' || Number.isNaN(value)) {
|
||||
throw new IdentityDBError('LLM extractor expected confidence to be a number or null.');
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
function optionalMetadata(value: unknown): ExtractedFact['metadata'] | undefined {
|
||||
if (value === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
if (value === null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!isJsonLike(value)) {
|
||||
throw new IdentityDBError('LLM extractor metadata must be valid JSON-compatible data.');
|
||||
}
|
||||
|
||||
return value as ExtractedFact['metadata'];
|
||||
}
|
||||
|
||||
function optionalTopicCategory(value: unknown): TopicCategory | undefined {
|
||||
if (value === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
if (value === 'entity' || value === 'concept' || value === 'temporal' || value === 'custom') {
|
||||
return value;
|
||||
}
|
||||
|
||||
throw new IdentityDBError('LLM extractor returned an unsupported topic category.');
|
||||
}
|
||||
|
||||
function optionalTopicGranularity(value: unknown): TopicGranularity | undefined {
|
||||
if (value === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
if (value === 'abstract' || value === 'concrete' || value === 'mixed') {
|
||||
return value;
|
||||
}
|
||||
|
||||
throw new IdentityDBError('LLM extractor returned an unsupported topic granularity.');
|
||||
}
|
||||
|
||||
function isRecord(value: unknown): value is Record<string, unknown> {
|
||||
return typeof value === 'object' && value !== null && !Array.isArray(value);
|
||||
}
|
||||
|
||||
function isJsonLike(value: unknown): boolean {
|
||||
if (value === null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean') {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (Array.isArray(value)) {
|
||||
return value.every((entry) => isJsonLike(entry));
|
||||
}
|
||||
|
||||
if (isRecord(value)) {
|
||||
return Object.values(value).every((entry) => isJsonLike(entry));
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
@@ -17,6 +17,16 @@ export interface FactExtractor {
|
||||
extract(input: string): Promise<ExtractedFact>;
|
||||
}
|
||||
|
||||
/**
 * Minimal provider-agnostic surface for a text-generation model: given a
 * fully composed prompt, resolve with the raw completion text. Concrete
 * provider adapters implement this interface.
 */
export interface LlmTextGenerationModel {
  generateText(prompt: string): Promise<string>;
}
|
||||
|
||||
/** Configuration for LlmFactExtractor. */
export interface LlmFactExtractorOptions {
  /** Model used to generate the structured JSON fact payload. */
  model: LlmTextGenerationModel;
  /** Extra instructions appended to the default prompt (ignored when blank). */
  instructions?: string;
  /**
   * Overrides prompt composition entirely; receives the raw input and the
   * configured `instructions` (which the builder may use or ignore).
   */
  promptBuilder?: (input: string, instructions?: string) => string;
}
|
||||
|
||||
export interface IngestStatementOptions {
|
||||
extractor: FactExtractor;
|
||||
embeddingProvider?: EmbeddingProvider;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
|
||||
import { IdentityDB } from '../src/core/identity-db';
|
||||
import { LlmFactExtractor } from '../src/ingestion/llm-extractor';
|
||||
import { NaiveExtractor } from '../src/ingestion/naive-extractor';
|
||||
import type { FactExtractor } from '../src/ingestion/types';
|
||||
|
||||
@@ -51,4 +52,86 @@ describe('IdentityDB ingestion', () => {
|
||||
const topic = await db.getTopicByName('TypeScript', { includeFacts: true });
|
||||
expect(topic?.facts).toHaveLength(1);
|
||||
});
|
||||
|
||||
it('ships an LLM extractor adapter that turns structured JSON responses into facts', async () => {
|
||||
let prompt = '';
|
||||
|
||||
const extractor = new LlmFactExtractor({
|
||||
model: {
|
||||
async generateText(input) {
|
||||
prompt = input;
|
||||
|
||||
return JSON.stringify({
|
||||
statement: 'I have worked with Bun and TypeScript since 2025.',
|
||||
summary: 'The speaker has Bun and TypeScript experience.',
|
||||
source: 'chat',
|
||||
confidence: 0.91,
|
||||
metadata: { channel: 'telegram' },
|
||||
topics: [
|
||||
{ name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
|
||||
{ name: 'Bun', category: 'entity', granularity: 'concrete', role: 'object' },
|
||||
{ name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
|
||||
{ name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
|
||||
],
|
||||
});
|
||||
},
|
||||
},
|
||||
instructions: 'Prefer technology and time topics.',
|
||||
});
|
||||
|
||||
const fact = await db.ingestStatement('I have worked with Bun and TypeScript since 2025.', {
|
||||
extractor,
|
||||
});
|
||||
|
||||
expect(prompt).toContain('Prefer technology and time topics.');
|
||||
expect(prompt).toContain('I have worked with Bun and TypeScript since 2025.');
|
||||
expect(fact.summary).toBe('The speaker has Bun and TypeScript experience.');
|
||||
expect(fact.source).toBe('chat');
|
||||
expect(fact.confidence).toBe(0.91);
|
||||
expect(fact.metadata).toEqual({ channel: 'telegram' });
|
||||
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'Bun', 'TypeScript', '2025']);
|
||||
});
|
||||
|
||||
it('parses JSON responses wrapped in markdown code fences', async () => {
|
||||
const extractor = new LlmFactExtractor({
|
||||
model: {
|
||||
async generateText() {
|
||||
return [
|
||||
'Here is the extracted fact:',
|
||||
'```json',
|
||||
JSON.stringify({
|
||||
statement: 'Bun powers TypeScript tooling.',
|
||||
topics: [
|
||||
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
|
||||
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
|
||||
],
|
||||
}),
|
||||
'```',
|
||||
].join('\n');
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const fact = await db.ingestStatement('Bun powers TypeScript tooling.', {
|
||||
extractor,
|
||||
});
|
||||
|
||||
expect(fact.topics.map((topic) => topic.name)).toEqual(['Bun', 'TypeScript']);
|
||||
});
|
||||
|
||||
it('rejects invalid LLM responses before writing facts', async () => {
|
||||
const extractor = new LlmFactExtractor({
|
||||
model: {
|
||||
async generateText() {
|
||||
return 'not json at all';
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
await expect(
|
||||
db.ingestStatement('Bun powers TypeScript tooling.', {
|
||||
extractor,
|
||||
}),
|
||||
).rejects.toThrow('LLM extractor returned invalid JSON.');
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user