refactor: make generateText model return ExtractedFact

2026-05-19 22:06:54 +09:00
parent 185edfdae8
commit 0480ea182f
3 changed files with 5 additions and 311 deletions
--- a/src/ingestion/llm-extractor.ts
+++ b/src/ingestion/llm-extractor.ts
@@ -1,5 +1,3 @@
-import { IdentityDBError } from "../core/errors";
-import type { TopicCategory, TopicGranularity } from "../types/domain";
 import type {
  ExtractedFact,
  FactExtractor,
@@ -18,8 +16,7 @@ export class LlmFactExtractor implements FactExtractor {

  async extract(input: string): Promise<ExtractedFact> {
    const prompt = this.buildPrompt(input);
-    const response = await this.options.model.generateText(prompt);
-    return parseLlmExtractedFactResponse(response);
+    return this.options.model.generateText(prompt);
  }

  private buildPrompt(input: string): string {
@@ -40,263 +37,3 @@ export class LlmFactExtractor implements FactExtractor {
      .join("\n\n");
  }
 }
-
-export function parseLlmExtractedFactResponse(response: string): ExtractedFact {
-  const payload = parseJsonCandidate(response);
-
-  if (!isRecord(payload)) {
-    throw new IdentityDBError("LLM extractor response must be a JSON object.");
-  }
-
-  const topics = parseTopics(payload.topics);
-  const extracted: ExtractedFact = { topics };
-
-  const statement = optionalString(payload.statement);
-  if (statement !== undefined) {
-    extracted.statement = statement;
-  }
-
-  const summary = optionalNullableString(payload.summary);
-  if (summary !== undefined) {
-    extracted.summary = summary;
-  }
-
-  const source = optionalNullableString(payload.source);
-  if (source !== undefined) {
-    extracted.source = source;
-  }
-
-  const confidence = optionalNullableNumber(payload.confidence);
-  if (confidence !== undefined) {
-    extracted.confidence = confidence;
-  }
-
-  const metadata = optionalMetadata(payload.metadata);
-  if (metadata !== undefined) {
-    extracted.metadata = metadata;
-  }
-
-  return extracted;
-}
-
-function parseJsonCandidate(response: string): unknown {
-  const trimmed = response.trim();
-
-  for (const candidate of collectJsonCandidates(trimmed)) {
-    try {
-      return JSON.parse(candidate);
-    } catch {
-      continue;
-    }
-  }
-
-  throw new IdentityDBError("LLM extractor returned invalid JSON.");
-}
-
-function collectJsonCandidates(response: string): string[] {
-  const candidates = new Set<string>();
-  candidates.add(response);
-
-  const fencePattern = /```(?:json)?\s*([\s\S]*?)```/gi;
-  let match: RegExpExecArray | null = fencePattern.exec(response);
-
-  while (match) {
-    const candidate = match[1]?.trim();
-    if (candidate) {
-      candidates.add(candidate);
-    }
-
-    match = fencePattern.exec(response);
-  }
-
-  const firstBrace = response.indexOf("{");
-  const lastBrace = response.lastIndexOf("}");
-  if (firstBrace >= 0 && lastBrace > firstBrace) {
-    candidates.add(response.slice(firstBrace, lastBrace + 1));
-  }
-
-  return Array.from(candidates);
-}
-
-function parseTopics(value: unknown): ExtractedFact["topics"] {
-  if (!Array.isArray(value)) {
-    throw new IdentityDBError(
-      "LLM extractor response must include a topics array.",
-    );
-  }
-
-  return value.map((entry) => parseTopic(entry));
-}
-
-function parseTopic(value: unknown): ExtractedFact["topics"][number] {
-  if (!isRecord(value)) {
-    throw new IdentityDBError("LLM extractor topics must be JSON objects.");
-  }
-
-  const name = optionalString(value.name)?.trim();
-  if (!name) {
-    throw new IdentityDBError(
-      "LLM extractor topics must include a non-empty name.",
-    );
-  }
-
-  const topic: ExtractedFact["topics"][number] = { name };
-
-  const category = optionalTopicCategory(value.category);
-  if (category !== undefined) {
-    topic.category = category;
-  }
-
-  const granularity = optionalTopicGranularity(value.granularity);
-  if (granularity !== undefined) {
-    topic.granularity = granularity;
-  }
-
-  const role = optionalNullableString(value.role);
-  if (role !== undefined) {
-    topic.role = role;
-  }
-
-  const description = optionalNullableString(value.description);
-  if (description !== undefined) {
-    topic.description = description;
-  }
-
-  const metadata = optionalMetadata(value.metadata);
-  if (metadata !== undefined) {
-    topic.metadata = metadata;
-  }
-
-  return topic;
-}
-
-function optionalString(value: unknown): string | undefined {
-  if (value === undefined) {
-    return undefined;
-  }
-
-  if (typeof value !== "string") {
-    throw new IdentityDBError("LLM extractor expected a string field.");
-  }
-
-  return value;
-}
-
-function optionalNullableString(value: unknown): string | null | undefined {
-  if (value === undefined) {
-    return undefined;
-  }
-
-  if (value === null) {
-    return null;
-  }
-
-  if (typeof value !== "string") {
-    throw new IdentityDBError(
-      "LLM extractor expected a nullable string field.",
-    );
-  }
-
-  return value;
-}
-
-function optionalNullableNumber(value: unknown): number | null | undefined {
-  if (value === undefined) {
-    return undefined;
-  }
-
-  if (value === null) {
-    return null;
-  }
-
-  if (typeof value !== "number" || Number.isNaN(value)) {
-    throw new IdentityDBError(
-      "LLM extractor expected confidence to be a number or null.",
-    );
-  }
-
-  return value;
-}
-
-function optionalMetadata(
-  value: unknown,
-): ExtractedFact["metadata"] | undefined {
-  if (value === undefined) {
-    return undefined;
-  }
-
-  if (value === null) {
-    return null;
-  }
-
-  if (!isJsonLike(value)) {
-    throw new IdentityDBError(
-      "LLM extractor metadata must be valid JSON-compatible data.",
-    );
-  }
-
-  return value as ExtractedFact["metadata"];
-}
-
-function optionalTopicCategory(value: unknown): TopicCategory | undefined {
-  if (value === undefined) {
-    return undefined;
-  }
-
-  if (
-    value === "entity" ||
-    value === "concept" ||
-    value === "temporal" ||
-    value === "custom"
-  ) {
-    return value;
-  }
-
-  throw new IdentityDBError(
-    "LLM extractor returned an unsupported topic category.",
-  );
-}
-
-function optionalTopicGranularity(
-  value: unknown,
-): TopicGranularity | undefined {
-  if (value === undefined) {
-    return undefined;
-  }
-
-  if (value === "abstract" || value === "concrete" || value === "mixed") {
-    return value;
-  }
-
-  throw new IdentityDBError(
-    "LLM extractor returned an unsupported topic granularity.",
-  );
-}
-
-function isRecord(value: unknown): value is Record<string, unknown> {
-  return typeof value === "object" && value !== null && !Array.isArray(value);
-}
-
-function isJsonLike(value: unknown): boolean {
-  if (value === null) {
-    return true;
-  }
-
-  if (
-    typeof value === "string" ||
-    typeof value === "number" ||
-    typeof value === "boolean"
-  ) {
-    return true;
-  }
-
-  if (Array.isArray(value)) {
-    return value.every((entry) => isJsonLike(entry));
-  }
-
-  if (isRecord(value)) {
-    return Object.values(value).every((entry) => isJsonLike(entry));
-  }
-
-  return false;
-}
--- a/src/ingestion/types.ts
+++ b/src/ingestion/types.ts
@@ -18,7 +18,7 @@ export interface FactExtractor {
 }

 export interface LlmTextGenerationModel {
-  generateText(prompt: string): Promise<string>;
+  generateText(prompt: string): Promise<ExtractedFact>;
 }

 export interface LlmFactExtractorOptions {
--- a/tests/ingestion.test.ts
+++ b/tests/ingestion.test.ts
@@ -53,7 +53,7 @@ describe('IdentityDB ingestion', () => {
    expect(topic?.facts).toHaveLength(1);
  });

-  it('ships an LLM extractor adapter that turns structured JSON responses into facts', async () => {
+  it('ships an LLM extractor adapter that returns structured facts from the model', async () => {
    let prompt = '';

    const extractor = new LlmFactExtractor({
@@ -61,7 +61,7 @@ describe('IdentityDB ingestion', () => {
        async generateText(input) {
          prompt = input;

-          return JSON.stringify({
+          return {
            statement: 'I have worked with Bun and TypeScript since 2025.',
            summary: 'The speaker has Bun and TypeScript experience.',
            source: 'chat',
@@ -73,7 +73,7 @@ describe('IdentityDB ingestion', () => {
              { name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
              { name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
            ],
-          });
+          };
        },
      },
      instructions: 'Prefer technology and time topics.',
@@ -91,47 +91,4 @@ describe('IdentityDB ingestion', () => {
    expect(fact.metadata).toEqual({ channel: 'telegram' });
    expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'Bun', 'TypeScript', '2025']);
  });
-
-  it('parses JSON responses wrapped in markdown code fences', async () => {
-    const extractor = new LlmFactExtractor({
-      model: {
-        async generateText() {
-          return [
-            'Here is the extracted fact:',
-            '```json',
-            JSON.stringify({
-              statement: 'Bun powers TypeScript tooling.',
-              topics: [
-                { name: 'Bun', category: 'entity', granularity: 'concrete' },
-                { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
-              ],
-            }),
-            '```',
-          ].join('\n');
-        },
-      },
-    });
-
-    const fact = await db.ingestStatement('Bun powers TypeScript tooling.', {
-      extractor,
-    });
-
-    expect(fact.topics.map((topic) => topic.name)).toEqual(['Bun', 'TypeScript']);
-  });
-
-  it('rejects invalid LLM responses before writing facts', async () => {
-    const extractor = new LlmFactExtractor({
-      model: {
-        async generateText() {
-          return 'not json at all';
-        },
-      },
-    });
-
-    await expect(
-      db.ingestStatement('Bun powers TypeScript tooling.', {
-        extractor,
-      }),
-    ).rejects.toThrow('LLM extractor returned invalid JSON.');
-  });
 });