From 0480ea182f354fb26e4991c5397cec801b77fcb1 Mon Sep 17 00:00:00 2001 From: p-sw Date: Tue, 19 May 2026 22:06:54 +0900 Subject: [PATCH] refactor: make generateText model return ExtractedFact --- src/ingestion/llm-extractor.ts | 265 +-------------------------------- src/ingestion/types.ts | 2 +- tests/ingestion.test.ts | 49 +----- 3 files changed, 5 insertions(+), 311 deletions(-) diff --git a/src/ingestion/llm-extractor.ts b/src/ingestion/llm-extractor.ts index 4d63e6a..1a2c526 100644 --- a/src/ingestion/llm-extractor.ts +++ b/src/ingestion/llm-extractor.ts @@ -1,5 +1,3 @@ -import { IdentityDBError } from "../core/errors"; -import type { TopicCategory, TopicGranularity } from "../types/domain"; import type { ExtractedFact, FactExtractor, @@ -18,8 +16,7 @@ export class LlmFactExtractor implements FactExtractor { async extract(input: string): Promise { const prompt = this.buildPrompt(input); - const response = await this.options.model.generateText(prompt); - return parseLlmExtractedFactResponse(response); + return this.options.model.generateText(prompt); } private buildPrompt(input: string): string { @@ -40,263 +37,3 @@ export class LlmFactExtractor implements FactExtractor { .join("\n\n"); } } - -export function parseLlmExtractedFactResponse(response: string): ExtractedFact { - const payload = parseJsonCandidate(response); - - if (!isRecord(payload)) { - throw new IdentityDBError("LLM extractor response must be a JSON object."); - } - - const topics = parseTopics(payload.topics); - const extracted: ExtractedFact = { topics }; - - const statement = optionalString(payload.statement); - if (statement !== undefined) { - extracted.statement = statement; - } - - const summary = optionalNullableString(payload.summary); - if (summary !== undefined) { - extracted.summary = summary; - } - - const source = optionalNullableString(payload.source); - if (source !== undefined) { - extracted.source = source; - } - - const confidence = optionalNullableNumber(payload.confidence); - if (confidence !== undefined) { - extracted.confidence = confidence; - } - - const metadata = optionalMetadata(payload.metadata); - if (metadata !== undefined) { - extracted.metadata = metadata; - } - - return extracted; -} - -function parseJsonCandidate(response: string): unknown { - const trimmed = response.trim(); - - for (const candidate of collectJsonCandidates(trimmed)) { - try { - return JSON.parse(candidate); - } catch { - continue; - } - } - - throw new IdentityDBError("LLM extractor returned invalid JSON."); -} - -function collectJsonCandidates(response: string): string[] { - const candidates = new Set(); - candidates.add(response); - - const fencePattern = /```(?:json)?\s*([\s\S]*?)```/gi; - let match: RegExpExecArray | null = fencePattern.exec(response); - - while (match) { - const candidate = match[1]?.trim(); - if (candidate) { - candidates.add(candidate); - } - - match = fencePattern.exec(response); - } - - const firstBrace = response.indexOf("{"); - const lastBrace = response.lastIndexOf("}"); - if (firstBrace >= 0 && lastBrace > firstBrace) { - candidates.add(response.slice(firstBrace, lastBrace + 1)); - } - - return Array.from(candidates); -} - -function parseTopics(value: unknown): ExtractedFact["topics"] { - if (!Array.isArray(value)) { - throw new IdentityDBError( - "LLM extractor response must include a topics array.", - ); - } - - return value.map((entry) => parseTopic(entry)); -} - -function parseTopic(value: unknown): ExtractedFact["topics"][number] { - if (!isRecord(value)) { - throw new IdentityDBError("LLM extractor topics must be JSON objects."); - } - - const name = optionalString(value.name)?.trim(); - if (!name) { - throw new IdentityDBError( - "LLM extractor topics must include a non-empty name.", - ); - } - - const topic: ExtractedFact["topics"][number] = { name }; - - const category = optionalTopicCategory(value.category); - if (category !== undefined) { - topic.category = category; - } - - const granularity = optionalTopicGranularity(value.granularity); - if (granularity !== undefined) { - topic.granularity = granularity; - } - - const role = optionalNullableString(value.role); - if (role !== undefined) { - topic.role = role; - } - - const description = optionalNullableString(value.description); - if (description !== undefined) { - topic.description = description; - } - - const metadata = optionalMetadata(value.metadata); - if (metadata !== undefined) { - topic.metadata = metadata; - } - - return topic; -} - -function optionalString(value: unknown): string | undefined { - if (value === undefined) { - return undefined; - } - - if (typeof value !== "string") { - throw new IdentityDBError("LLM extractor expected a string field."); - } - - return value; -} - -function optionalNullableString(value: unknown): string | null | undefined { - if (value === undefined) { - return undefined; - } - - if (value === null) { - return null; - } - - if (typeof value !== "string") { - throw new IdentityDBError( - "LLM extractor expected a nullable string field.", - ); - } - - return value; -} - -function optionalNullableNumber(value: unknown): number | null | undefined { - if (value === undefined) { - return undefined; - } - - if (value === null) { - return null; - } - - if (typeof value !== "number" || Number.isNaN(value)) { - throw new IdentityDBError( - "LLM extractor expected confidence to be a number or null.", - ); - } - - return value; -} - -function optionalMetadata( - value: unknown, -): ExtractedFact["metadata"] | undefined { - if (value === undefined) { - return undefined; - } - - if (value === null) { - return null; - } - - if (!isJsonLike(value)) { - throw new IdentityDBError( - "LLM extractor metadata must be valid JSON-compatible data.", - ); - } - - return value as ExtractedFact["metadata"]; -} - -function optionalTopicCategory(value: unknown): TopicCategory | undefined { - if (value === undefined) { - return undefined; - } - - if ( - value === "entity" || - value === "concept" || - value === "temporal" || - value === "custom" - ) { - return value; - } - - throw new IdentityDBError( - "LLM extractor returned an unsupported topic category.", - ); -} - -function optionalTopicGranularity( - value: unknown, -): TopicGranularity | undefined { - if (value === undefined) { - return undefined; - } - - if (value === "abstract" || value === "concrete" || value === "mixed") { - return value; - } - - throw new IdentityDBError( - "LLM extractor returned an unsupported topic granularity.", - ); -} - -function isRecord(value: unknown): value is Record { - return typeof value === "object" && value !== null && !Array.isArray(value); -} - -function isJsonLike(value: unknown): boolean { - if (value === null) { - return true; - } - - if ( - typeof value === "string" || - typeof value === "number" || - typeof value === "boolean" - ) { - return true; - } - - if (Array.isArray(value)) { - return value.every((entry) => isJsonLike(entry)); - } - - if (isRecord(value)) { - return Object.values(value).every((entry) => isJsonLike(entry)); - } - - return false; -} diff --git a/src/ingestion/types.ts b/src/ingestion/types.ts index 50d82a4..79d18b8 100644 --- a/src/ingestion/types.ts +++ b/src/ingestion/types.ts @@ -18,7 +18,7 @@ export interface FactExtractor { } export interface LlmTextGenerationModel { - generateText(prompt: string): Promise; + generateText(prompt: string): Promise; } export interface LlmFactExtractorOptions { diff --git a/tests/ingestion.test.ts b/tests/ingestion.test.ts index 27563f3..db60918 100644 --- a/tests/ingestion.test.ts +++ b/tests/ingestion.test.ts @@ -53,7 +53,7 @@ describe('IdentityDB ingestion', () => { expect(topic?.facts).toHaveLength(1); }); - it('ships an LLM extractor adapter that turns structured JSON responses into facts', async () => { + it('ships an LLM extractor adapter that returns structured facts from the model', async () => { let prompt = ''; const extractor = new LlmFactExtractor({ @@ -61,7 +61,7 @@ describe('IdentityDB ingestion', () => { async generateText(input) { prompt = input; - return JSON.stringify({ + return { statement: 'I have worked with Bun and TypeScript since 2025.', summary: 'The speaker has Bun and TypeScript experience.', source: 'chat', @@ -73,7 +73,7 @@ describe('IdentityDB ingestion', () => { { name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' }, { name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' }, ], - }); + }; }, }, instructions: 'Prefer technology and time topics.', @@ -91,47 +91,4 @@ describe('IdentityDB ingestion', () => { expect(fact.metadata).toEqual({ channel: 'telegram' }); expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'Bun', 'TypeScript', '2025']); }); - - it('parses JSON responses wrapped in markdown code fences', async () => { - const extractor = new LlmFactExtractor({ - model: { - async generateText() { - return [ - 'Here is the extracted fact:', - '```json', - JSON.stringify({ - statement: 'Bun powers TypeScript tooling.', - topics: [ - { name: 'Bun', category: 'entity', granularity: 'concrete' }, - { name: 'TypeScript', category: 'entity', granularity: 'concrete' }, - ], - }), - '```', - ].join('\n'); - }, - }, - }); - - const fact = await db.ingestStatement('Bun powers TypeScript tooling.', { - extractor, - }); - - expect(fact.topics.map((topic) => topic.name)).toEqual(['Bun', 'TypeScript']); - }); - - it('rejects invalid LLM responses before writing facts', async () => { - const extractor = new LlmFactExtractor({ - model: { - async generateText() { - return 'not json at all'; - }, - }, - }); - - await expect( - db.ingestStatement('Bun powers TypeScript tooling.', { - extractor, - }), - ).rejects.toThrow('LLM extractor returned invalid JSON.'); - }); });