import { IdentityDBError } from '../core/errors';
import type { TopicCategory, TopicGranularity } from '../types/domain';
import type {
  ExtractedFact,
  FactExtractor,
  LlmFactExtractorOptions,
} from './types';

/**
 * Base prompt sent ahead of any caller-supplied instructions. It describes the
 * JSON shape that `parseLlmExtractedFactResponse` validates.
 */
const DEFAULT_INSTRUCTIONS = [
  'Extract one structured fact from the user input.',
  'Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.',
  'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.',
  'Only include topics that are explicitly supported by the input.',
].join('\n');

/**
 * Fact extractor that builds a prompt, passes it to `options.model.generateText`,
 * and validates the returned text with `parseLlmExtractedFactResponse`.
 */
export class LlmFactExtractor implements FactExtractor {
  constructor(private readonly options: LlmFactExtractorOptions) {}

  /**
   * Extracts a structured fact from free-form input.
   *
   * @param input - Raw text handed to the prompt builder.
   * @returns The validated fact parsed from the model response.
   * @throws IdentityDBError when the response is not valid JSON or fails validation.
   */
  async extract(input: string): Promise<ExtractedFact> {
    const prompt = this.buildPrompt(input);
    const response = await this.options.model.generateText(prompt);
    return parseLlmExtractedFactResponse(response);
  }

  /**
   * Builds the prompt text. A configured `promptBuilder` takes full control and
   * receives the untrimmed input and the raw `instructions` option; otherwise the
   * default instructions, optional extra instructions, and the trimmed input are
   * joined with blank lines.
   */
  private buildPrompt(input: string): string {
    if (this.options.promptBuilder) {
      return this.options.promptBuilder(input, this.options.instructions);
    }

    const instructions = this.options.instructions?.trim();

    return [
      DEFAULT_INSTRUCTIONS,
      // Blank or missing extra instructions are dropped by the filter below.
      instructions && instructions.length > 0
        ? `Additional instructions:\n${instructions}`
        : null,
      `Input:\n${input.trim()}`,
    ]
      .filter((value): value is string => value !== null)
      .join('\n\n');
  }
}

/**
 * Parses and validates the raw text returned by the model.
 *
 * Optional fields that are absent from the payload are left unset on the
 * result; fields explicitly set to `null` are preserved as `null` where the
 * field is nullable.
 *
 * @param response - Raw model output; may wrap the JSON in a code fence or prose.
 * @returns The validated fact.
 * @throws IdentityDBError when no JSON can be parsed, when the payload is not an
 *   object, when `topics` is not an array, or when any field has the wrong type.
 */
export function parseLlmExtractedFactResponse(response: string): ExtractedFact {
  const payload = parseJsonCandidate(response);
  if (!isRecord(payload)) {
    throw new IdentityDBError('LLM extractor response must be a JSON object.');
  }

  const topics = parseTopics(payload.topics);
  const extracted: ExtractedFact = { topics };

  const statement = optionalString(payload.statement);
  if (statement !== undefined) {
    extracted.statement = statement;
  }

  const summary = optionalNullableString(payload.summary);
  if (summary !== undefined) {
    extracted.summary = summary;
  }

  const source = optionalNullableString(payload.source);
  if (source !== undefined) {
    extracted.source = source;
  }

  const confidence = optionalNullableNumber(payload.confidence);
  if (confidence !== undefined) {
    extracted.confidence = confidence;
  }

  const metadata = optionalMetadata(payload.metadata);
  if (metadata !== undefined) {
    extracted.metadata = metadata;
  }

  return extracted;
}

/**
 * Returns the value of the first candidate substring that `JSON.parse` accepts.
 *
 * @throws IdentityDBError when no candidate parses.
 */
function parseJsonCandidate(response: string): unknown {
  const trimmed = response.trim();

  for (const candidate of collectJsonCandidates(trimmed)) {
    try {
      return JSON.parse(candidate);
    } catch {
      // Not valid JSON; fall through to the next candidate.
      continue;
    }
  }

  throw new IdentityDBError('LLM extractor returned invalid JSON.');
}

/**
 * Collects substrings of the response that might be the JSON payload, in the
 * order they should be tried: the whole response, the contents of each
 * triple-backtick fence, then the span from the first `{` to the last `}`.
 * The Set removes duplicates while keeping insertion order.
 */
function collectJsonCandidates(response: string): string[] {
  const candidates = new Set<string>();
  candidates.add(response);

  const fencePattern = /```(?:json)?\s*([\s\S]*?)```/gi;
  let match: RegExpExecArray | null = fencePattern.exec(response);
  while (match) {
    const candidate = match[1]?.trim();
    if (candidate) {
      candidates.add(candidate);
    }
    match = fencePattern.exec(response);
  }

  const firstBrace = response.indexOf('{');
  const lastBrace = response.lastIndexOf('}');
  if (firstBrace >= 0 && lastBrace > firstBrace) {
    candidates.add(response.slice(firstBrace, lastBrace + 1));
  }

  return Array.from(candidates);
}

/**
 * Validates the `topics` field and each of its entries.
 *
 * @throws IdentityDBError when the value is not an array or an entry is invalid.
 */
function parseTopics(value: unknown): ExtractedFact['topics'] {
  if (!Array.isArray(value)) {
    throw new IdentityDBError('LLM extractor response must include a topics array.');
  }

  return value.map((entry) => parseTopic(entry));
}

/**
 * Validates a single topic entry. The name is trimmed and must be non-empty;
 * every other field is copied only when present in the entry.
 *
 * @throws IdentityDBError when the entry is not an object or a field is invalid.
 */
function parseTopic(value: unknown): ExtractedFact['topics'][number] {
  if (!isRecord(value)) {
    throw new IdentityDBError('LLM extractor topics must be JSON objects.');
  }

  const name = optionalString(value.name)?.trim();
  if (!name) {
    throw new IdentityDBError('LLM extractor topics must include a non-empty name.');
  }

  const topic: ExtractedFact['topics'][number] = { name };

  const category = optionalTopicCategory(value.category);
  if (category !== undefined) {
    topic.category = category;
  }

  const granularity = optionalTopicGranularity(value.granularity);
  if (granularity !== undefined) {
    topic.granularity = granularity;
  }

  const role = optionalNullableString(value.role);
  if (role !== undefined) {
    topic.role = role;
  }

  const description = optionalNullableString(value.description);
  if (description !== undefined) {
    topic.description = description;
  }

  const metadata = optionalMetadata(value.metadata);
  if (metadata !== undefined) {
    topic.metadata = metadata;
  }

  return topic;
}

/**
 * Passes through `undefined` and strings.
 *
 * @throws IdentityDBError for any other value, including `null`.
 */
function optionalString(value: unknown): string | undefined {
  if (value === undefined) {
    return undefined;
  }

  if (typeof value !== 'string') {
    throw new IdentityDBError('LLM extractor expected a string field.');
  }

  return value;
}

/**
 * Passes through `undefined`, `null`, and strings.
 *
 * @throws IdentityDBError for any other value.
 */
function optionalNullableString(value: unknown): string | null | undefined {
  if (value === undefined) {
    return undefined;
  }

  if (value === null) {
    return null;
  }

  if (typeof value !== 'string') {
    throw new IdentityDBError('LLM extractor expected a nullable string field.');
  }

  return value;
}

/**
 * Passes through `undefined`, `null`, and finite numbers.
 *
 * @throws IdentityDBError for non-numbers and for NaN or infinite values.
 */
function optionalNullableNumber(value: unknown): number | null | undefined {
  if (value === undefined) {
    return undefined;
  }

  if (value === null) {
    return null;
  }

  // JSON.parse never yields NaN but does overflow literals such as 1e999 to
  // Infinity, so the finite check is the one that guards real parsed input.
  if (typeof value !== 'number' || !Number.isFinite(value)) {
    throw new IdentityDBError('LLM extractor expected confidence to be a number or null.');
  }

  return value;
}

/**
 * Passes through `undefined`, `null`, and any value accepted by `isJsonLike`.
 *
 * @throws IdentityDBError when the value is not JSON-compatible.
 */
function optionalMetadata(value: unknown): ExtractedFact['metadata'] | undefined {
  if (value === undefined) {
    return undefined;
  }

  if (value === null) {
    return null;
  }

  if (!isJsonLike(value)) {
    throw new IdentityDBError('LLM extractor metadata must be valid JSON-compatible data.');
  }

  // The runtime check above is what justifies this assertion.
  return value as ExtractedFact['metadata'];
}

/**
 * Passes through `undefined` and the four supported category literals.
 *
 * @throws IdentityDBError for any other value.
 */
function optionalTopicCategory(value: unknown): TopicCategory | undefined {
  if (value === undefined) {
    return undefined;
  }

  if (value === 'entity' || value === 'concept' || value === 'temporal' || value === 'custom') {
    return value;
  }

  throw new IdentityDBError('LLM extractor returned an unsupported topic category.');
}

/**
 * Passes through `undefined` and the three supported granularity literals.
 *
 * @throws IdentityDBError for any other value.
 */
function optionalTopicGranularity(value: unknown): TopicGranularity | undefined {
  if (value === undefined) {
    return undefined;
  }

  if (value === 'abstract' || value === 'concrete' || value === 'mixed') {
    return value;
  }

  throw new IdentityDBError('LLM extractor returned an unsupported topic granularity.');
}

/** Type guard for a non-null, non-array object. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Recursively checks that a value is built only from null, strings, numbers,
 * booleans, arrays, and non-array objects.
 */
function isJsonLike(value: unknown): boolean {
  if (value === null) {
    return true;
  }

  if (typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean') {
    return true;
  }

  if (Array.isArray(value)) {
    return value.every((entry) => isJsonLike(entry));
  }

  if (isRecord(value)) {
    return Object.values(value).every((entry) => isJsonLike(entry));
  }

  return false;
}