diff --git a/src/index.ts b/src/index.ts index 26dbb5e..9466236 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,6 +2,7 @@ export * from './adapters'; export * from './core/identity-db'; export * from './core/migrations'; export * from './ingestion/extractor'; +export * from './ingestion/llm-extractor'; export * from './ingestion/naive-extractor'; export * from './ingestion/types'; export * from './types/api'; diff --git a/src/ingestion/llm-extractor.ts b/src/ingestion/llm-extractor.ts new file mode 100644 index 0000000..f018300 --- /dev/null +++ b/src/ingestion/llm-extractor.ts @@ -0,0 +1,273 @@ +import { IdentityDBError } from '../core/errors'; +import type { TopicCategory, TopicGranularity } from '../types/domain'; +import type { + ExtractedFact, + FactExtractor, + LlmFactExtractorOptions, +} from './types'; + +const DEFAULT_INSTRUCTIONS = [ + 'Extract one structured fact from the user input.', + 'Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.', + 'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.', + 'Only include topics that are explicitly supported by the input.', +].join('\n'); + +export class LlmFactExtractor implements FactExtractor { + constructor(private readonly options: LlmFactExtractorOptions) {} + + async extract(input: string): Promise<ExtractedFact> { + const prompt = this.buildPrompt(input); + const response = await this.options.model.generateText(prompt); + return parseLlmExtractedFactResponse(response); + } + + private buildPrompt(input: string): string { + if (this.options.promptBuilder) { + return this.options.promptBuilder(input, this.options.instructions); + } + + const instructions = this.options.instructions?.trim();
+ + return [ + DEFAULT_INSTRUCTIONS, + instructions && instructions.length > 0 ? `Additional instructions:\n${instructions}` : null, + `Input:\n${input.trim()}`, + ] + .filter((value): value is string => value !== null) + .join('\n\n'); + } +} + +export function parseLlmExtractedFactResponse(response: string): ExtractedFact { + const payload = parseJsonCandidate(response); + + if (!isRecord(payload)) { + throw new IdentityDBError('LLM extractor response must be a JSON object.'); + } + + const topics = parseTopics(payload.topics); + const extracted: ExtractedFact = { topics }; + + const statement = optionalString(payload.statement); + if (statement !== undefined) { + extracted.statement = statement; + } + + const summary = optionalNullableString(payload.summary); + if (summary !== undefined) { + extracted.summary = summary; + } + + const source = optionalNullableString(payload.source); + if (source !== undefined) { + extracted.source = source; + } + + const confidence = optionalNullableNumber(payload.confidence); + if (confidence !== undefined) { + extracted.confidence = confidence; + } + + const metadata = optionalMetadata(payload.metadata); + if (metadata !== undefined) { + extracted.metadata = metadata; + } + + return extracted; +} + +function parseJsonCandidate(response: string): unknown { + const trimmed = response.trim(); + + for (const candidate of collectJsonCandidates(trimmed)) { + try { + return JSON.parse(candidate); + } catch { + continue; + } + } + + throw new IdentityDBError('LLM extractor returned invalid JSON.'); +} + +function collectJsonCandidates(response: string): string[] { + const candidates = new Set<string>(); + candidates.add(response); + + const fencePattern = /```(?:json)?\s*([\s\S]*?)```/gi; + let match: RegExpExecArray | null = fencePattern.exec(response); + + while (match) { + const candidate = match[1]?.trim(); + if (candidate) { + candidates.add(candidate); + } + + match = fencePattern.exec(response); + } + + const firstBrace =
response.indexOf('{'); + const lastBrace = response.lastIndexOf('}'); + if (firstBrace >= 0 && lastBrace > firstBrace) { + candidates.add(response.slice(firstBrace, lastBrace + 1)); + } + + return Array.from(candidates); +} + +function parseTopics(value: unknown): ExtractedFact['topics'] { + if (!Array.isArray(value)) { + throw new IdentityDBError('LLM extractor response must include a topics array.'); + } + + return value.map((entry) => parseTopic(entry)); +} + +function parseTopic(value: unknown): ExtractedFact['topics'][number] { + if (!isRecord(value)) { + throw new IdentityDBError('LLM extractor topics must be JSON objects.'); + } + + const name = optionalString(value.name)?.trim(); + if (!name) { + throw new IdentityDBError('LLM extractor topics must include a non-empty name.'); + } + + const topic: ExtractedFact['topics'][number] = { name }; + + const category = optionalTopicCategory(value.category); + if (category !== undefined) { + topic.category = category; + } + + const granularity = optionalTopicGranularity(value.granularity); + if (granularity !== undefined) { + topic.granularity = granularity; + } + + const role = optionalNullableString(value.role); + if (role !== undefined) { + topic.role = role; + } + + const description = optionalNullableString(value.description); + if (description !== undefined) { + topic.description = description; + } + + const metadata = optionalMetadata(value.metadata); + if (metadata !== undefined) { + topic.metadata = metadata; + } + + return topic; +} + +function optionalString(value: unknown): string | undefined { + if (value === undefined) { + return undefined; + } + + if (typeof value !== 'string') { + throw new IdentityDBError('LLM extractor expected a string field.'); + } + + return value; +} + +function optionalNullableString(value: unknown): string | null | undefined { + if (value === undefined) { + return undefined; + } + + if (value === null) { + return null; + } + + if (typeof value !== 'string') { + throw new 
IdentityDBError('LLM extractor expected a nullable string field.'); + } + + return value; +} + +function optionalNullableNumber(value: unknown): number | null | undefined { + if (value === undefined) { + return undefined; + } + + if (value === null) { + return null; + } + + if (typeof value !== 'number' || Number.isNaN(value)) { + throw new IdentityDBError('LLM extractor expected confidence to be a number or null.'); + } + + return value; +} + +function optionalMetadata(value: unknown): ExtractedFact['metadata'] | undefined { + if (value === undefined) { + return undefined; + } + + if (value === null) { + return null; + } + + if (!isJsonLike(value)) { + throw new IdentityDBError('LLM extractor metadata must be valid JSON-compatible data.'); + } + + return value as ExtractedFact['metadata']; +} + +function optionalTopicCategory(value: unknown): TopicCategory | undefined { + if (value === undefined) { + return undefined; + } + + if (value === 'entity' || value === 'concept' || value === 'temporal' || value === 'custom') { + return value; + } + + throw new IdentityDBError('LLM extractor returned an unsupported topic category.'); +} + +function optionalTopicGranularity(value: unknown): TopicGranularity | undefined { + if (value === undefined) { + return undefined; + } + + if (value === 'abstract' || value === 'concrete' || value === 'mixed') { + return value; + } + + throw new IdentityDBError('LLM extractor returned an unsupported topic granularity.'); +} + +function isRecord(value: unknown): value is Record<string, unknown> { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function isJsonLike(value: unknown): boolean { + if (value === null) { + return true; + } + + if (typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean') { + return true; + } + + if (Array.isArray(value)) { + return value.every((entry) => isJsonLike(entry)); + } + + if (isRecord(value)) { + return Object.values(value).every((entry) =>
isJsonLike(entry)); + } + + return false; +} diff --git a/src/ingestion/types.ts b/src/ingestion/types.ts index 11aca5d..78327fc 100644 --- a/src/ingestion/types.ts +++ b/src/ingestion/types.ts @@ -17,6 +17,16 @@ export interface FactExtractor { extract(input: string): Promise<ExtractedFact>; } +export interface LlmTextGenerationModel { + generateText(prompt: string): Promise<string>; +} + +export interface LlmFactExtractorOptions { + model: LlmTextGenerationModel; + instructions?: string; + promptBuilder?: (input: string, instructions?: string) => string; +} + export interface IngestStatementOptions { extractor: FactExtractor; embeddingProvider?: EmbeddingProvider; diff --git a/tests/ingestion.test.ts b/tests/ingestion.test.ts index 0a5ceee..27563f3 100644 --- a/tests/ingestion.test.ts +++ b/tests/ingestion.test.ts @@ -1,6 +1,7 @@ import { afterEach, beforeEach, describe, expect, it } from 'vitest'; import { IdentityDB } from '../src/core/identity-db'; +import { LlmFactExtractor } from '../src/ingestion/llm-extractor'; import { NaiveExtractor } from '../src/ingestion/naive-extractor'; import type { FactExtractor } from '../src/ingestion/types'; @@ -51,4 +52,86 @@ describe('IdentityDB ingestion', () => { const topic = await db.getTopicByName('TypeScript', { includeFacts: true }); expect(topic?.facts).toHaveLength(1); }); + + it('ships an LLM extractor adapter that turns structured JSON responses into facts', async () => { + let prompt = ''; + + const extractor = new LlmFactExtractor({ + model: { + async generateText(input) { + prompt = input; + + return JSON.stringify({ + statement: 'I have worked with Bun and TypeScript since 2025.', + summary: 'The speaker has Bun and TypeScript experience.', + source: 'chat', + confidence: 0.91, + metadata: { channel: 'telegram' }, + topics: [ + { name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' }, + { name: 'Bun', category: 'entity', granularity: 'concrete', role: 'object' }, + { name: 'TypeScript', category: 'entity',
granularity: 'concrete', role: 'object' }, + { name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' }, + ], + }); + }, + }, + instructions: 'Prefer technology and time topics.', + }); + + const fact = await db.ingestStatement('I have worked with Bun and TypeScript since 2025.', { + extractor, + }); + + expect(prompt).toContain('Prefer technology and time topics.'); + expect(prompt).toContain('I have worked with Bun and TypeScript since 2025.'); + expect(fact.summary).toBe('The speaker has Bun and TypeScript experience.'); + expect(fact.source).toBe('chat'); + expect(fact.confidence).toBe(0.91); + expect(fact.metadata).toEqual({ channel: 'telegram' }); + expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'Bun', 'TypeScript', '2025']); + }); + + it('parses JSON responses wrapped in markdown code fences', async () => { + const extractor = new LlmFactExtractor({ + model: { + async generateText() { + return [ + 'Here is the extracted fact:', + '```json', + JSON.stringify({ + statement: 'Bun powers TypeScript tooling.', + topics: [ + { name: 'Bun', category: 'entity', granularity: 'concrete' }, + { name: 'TypeScript', category: 'entity', granularity: 'concrete' }, + ], + }), + '```', + ].join('\n'); + }, + }, + }); + + const fact = await db.ingestStatement('Bun powers TypeScript tooling.', { + extractor, + }); + + expect(fact.topics.map((topic) => topic.name)).toEqual(['Bun', 'TypeScript']); + }); + + it('rejects invalid LLM responses before writing facts', async () => { + const extractor = new LlmFactExtractor({ + model: { + async generateText() { + return 'not json at all'; + }, + }, + }); + + await expect( + db.ingestStatement('Bun powers TypeScript tooling.', { + extractor, + }), + ).rejects.toThrow('LLM extractor returned invalid JSON.'); + }); });