From a33fd61c975339bcb61a24126f747daec0f4a17d Mon Sep 17 00:00:00 2001 From: p-sw Date: Sun, 17 May 2026 23:10:38 +0900 Subject: [PATCH] feat: adjust instruction detailed --- src/ingestion/llm-extractor.ts | 99 ++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 35 deletions(-) diff --git a/src/ingestion/llm-extractor.ts b/src/ingestion/llm-extractor.ts index f018300..4d63e6a 100644 --- a/src/ingestion/llm-extractor.ts +++ b/src/ingestion/llm-extractor.ts @@ -1,17 +1,17 @@ -import { IdentityDBError } from '../core/errors'; -import type { TopicCategory, TopicGranularity } from '../types/domain'; +import { IdentityDBError } from "../core/errors"; +import type { TopicCategory, TopicGranularity } from "../types/domain"; import type { ExtractedFact, FactExtractor, LlmFactExtractorOptions, -} from './types'; +} from "./types"; const DEFAULT_INSTRUCTIONS = [ - 'Extract one structured fact from the user input.', - 'Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.', + "Extract one structured fact from the user input.", + "Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.", 'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.', - 'Only include topics that are explicitly supported by the input.', -].join('\n'); + 'Only include topics that are explicitly in the input as-is. For example, topic "I started TypeScript since 2015" can be "I", "TypeScript", "2015".', +].join("\n"); export class LlmFactExtractor implements FactExtractor { constructor(private readonly options: LlmFactExtractorOptions) {} @@ -31,11 +31,13 @@ export class LlmFactExtractor implements FactExtractor { return [ DEFAULT_INSTRUCTIONS, - instructions && instructions.length > 0 ? `Additional instructions:\n${instructions}` : null, + instructions && instructions.length > 0 + ? `Additional instructions:\n${instructions}` + : null, `Input:\n${input.trim()}`, ] .filter((value): value is string => value !== null) - .join('\n\n'); + .join("\n\n"); } } @@ -43,7 +45,7 @@ export function parseLlmExtractedFactResponse(response: string): ExtractedFact { const payload = parseJsonCandidate(response); if (!isRecord(payload)) { - throw new IdentityDBError('LLM extractor response must be a JSON object.'); + throw new IdentityDBError("LLM extractor response must be a JSON object."); } const topics = parseTopics(payload.topics); @@ -88,7 +90,7 @@ function parseJsonCandidate(response: string): unknown { } } - throw new IdentityDBError('LLM extractor returned invalid JSON.'); + throw new IdentityDBError("LLM extractor returned invalid JSON."); } function collectJsonCandidates(response: string): string[] { @@ -107,8 +109,8 @@ function collectJsonCandidates(response: string): string[] { match = fencePattern.exec(response); } - const firstBrace = response.indexOf('{'); - const lastBrace = response.lastIndexOf('}'); + const firstBrace = response.indexOf("{"); + const lastBrace = response.lastIndexOf("}"); if (firstBrace >= 0 && lastBrace > firstBrace) { candidates.add(response.slice(firstBrace, lastBrace + 1)); } @@ -116,25 +118,29 @@ function collectJsonCandidates(response: string): string[] { return Array.from(candidates); } -function parseTopics(value: unknown): ExtractedFact['topics'] { +function parseTopics(value: unknown): ExtractedFact["topics"] { if (!Array.isArray(value)) { - throw new IdentityDBError('LLM extractor response must include a topics array.'); + throw new IdentityDBError( + "LLM extractor response must include a topics array.", + ); } return value.map((entry) => parseTopic(entry)); } -function parseTopic(value: unknown): ExtractedFact['topics'][number] { +function parseTopic(value: unknown): ExtractedFact["topics"][number] { if (!isRecord(value)) { - throw new IdentityDBError('LLM extractor topics must be JSON objects.'); + throw new IdentityDBError("LLM extractor topics must be JSON objects."); } const name = optionalString(value.name)?.trim(); if (!name) { - throw new IdentityDBError('LLM extractor topics must include a non-empty name.'); + throw new IdentityDBError( + "LLM extractor topics must include a non-empty name.", + ); } - const topic: ExtractedFact['topics'][number] = { name }; + const topic: ExtractedFact["topics"][number] = { name }; const category = optionalTopicCategory(value.category); if (category !== undefined) { @@ -169,8 +175,8 @@ function optionalString(value: unknown): string | undefined { return undefined; } - if (typeof value !== 'string') { - throw new IdentityDBError('LLM extractor expected a string field.'); + if (typeof value !== "string") { + throw new IdentityDBError("LLM extractor expected a string field."); } return value; @@ -185,8 +191,10 @@ function optionalNullableString(value: unknown): string | null | undefined { return null; } - if (typeof value !== 'string') { - throw new IdentityDBError('LLM extractor expected a nullable string field.'); + if (typeof value !== "string") { + throw new IdentityDBError( + "LLM extractor expected a nullable string field.", + ); } return value; @@ -201,14 +209,18 @@ function optionalNullableNumber(value: unknown): number | null | undefined { return null; } - if (typeof value !== 'number' || Number.isNaN(value)) { - throw new IdentityDBError('LLM extractor expected confidence to be a number or null.'); + if (typeof value !== "number" || Number.isNaN(value)) { + throw new IdentityDBError( + "LLM extractor expected confidence to be a number or null.", + ); } return value; } -function optionalMetadata(value: unknown): ExtractedFact['metadata'] | undefined { +function optionalMetadata( + value: unknown, +): ExtractedFact["metadata"] | undefined { if (value === undefined) { return undefined; } @@ -218,10 +230,12 @@ function optionalMetadata(value: unknown): ExtractedFact['metadata'] | undefined } if (!isJsonLike(value)) { - throw new IdentityDBError('LLM extractor metadata must be valid JSON-compatible data.'); + throw new IdentityDBError( + "LLM extractor metadata must be valid JSON-compatible data.", + ); } - return value as ExtractedFact['metadata']; + return value as ExtractedFact["metadata"]; } function optionalTopicCategory(value: unknown): TopicCategory | undefined { @@ -229,27 +243,38 @@ function optionalTopicCategory(value: unknown): TopicCategory | undefined { return undefined; } - if (value === 'entity' || value === 'concept' || value === 'temporal' || value === 'custom') { + if ( + value === "entity" || + value === "concept" || + value === "temporal" || + value === "custom" + ) { return value; } - throw new IdentityDBError('LLM extractor returned an unsupported topic category.'); + throw new IdentityDBError( + "LLM extractor returned an unsupported topic category.", + ); } -function optionalTopicGranularity(value: unknown): TopicGranularity | undefined { +function optionalTopicGranularity( + value: unknown, +): TopicGranularity | undefined { if (value === undefined) { return undefined; } - if (value === 'abstract' || value === 'concrete' || value === 'mixed') { + if (value === "abstract" || value === "concrete" || value === "mixed") { return value; } - throw new IdentityDBError('LLM extractor returned an unsupported topic granularity.'); + throw new IdentityDBError( + "LLM extractor returned an unsupported topic granularity.", + ); } function isRecord(value: unknown): value is Record { - return typeof value === 'object' && value !== null && !Array.isArray(value); + return typeof value === "object" && value !== null && !Array.isArray(value); } function isJsonLike(value: unknown): boolean { @@ -257,7 +282,11 @@ function isJsonLike(value: unknown): boolean { return true; } - if (typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean') { + if ( + typeof value === "string" || + typeof value === "number" || + typeof value === "boolean" + ) { return true; }