v0.5.0

feat: add test-llm-extractor.ts script
feat: make FactExtractor extracts multiple facts per input
2026-05-20 23:04:14 +09:00 · 2026-05-20 23:03:47 +09:00 · 2026-05-20 22:59:35 +09:00 · 2026-05-20 22:53:47 +09:00 · 2026-05-20 22:53:38 +09:00 · 2026-05-20 22:53:29 +09:00
12 changed files with 532 additions and 455 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ coverage/
 .env
 .DS_Store
 *.log
+.env.*
--- a/bun.lock
+++ b/bun.lock
@@ -10,7 +10,7 @@
        "pg": "^8.16.0",
      },
      "devDependencies": {
-        "@types/node": "^24.0.0",
+        "@openrouter/sdk": "^0.12.35",
        "@types/pg": "^8.20.0",
        "tsup": "^8.5.0",
        "typescript": "^5.8.3",
@@ -79,6 +79,8 @@

    "@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="],

+    "@openrouter/sdk": ["@openrouter/sdk@0.12.35", "", { "dependencies": { "zod": "^3.25.0 || ^4.0.0" } }, "sha512-s4QVLLnG1AmfW3TjnnHUqGfsCkzwVK+kboGcZmKbde09m1DPqgzl4RUFt/HJ5v97MX8aEaN0UG3mKv2S+qj2Gw=="],
+
    "@rollup/rollup-android-arm-eabi": ["@rollup/rollup-android-arm-eabi@4.60.3", "", { "os": "android", "cpu": "arm" }, "sha512-x35CNW/ANXG3hE/EZpRU8MXX1JDN86hBb2wMGAtltkz7pc6cxgjpy1OMMfDosOQ+2hWqIkag/fGok1Yady9nGw=="],

    "@rollup/rollup-android-arm64": ["@rollup/rollup-android-arm64@4.60.3", "", { "os": "android", "cpu": "arm64" }, "sha512-xw3xtkDApIOGayehp2+Rz4zimfkaX65r4t47iy+ymQB2G4iJCBBfj0ogVg5jpvjpn8UWn/+q9tprxleYeNp3Hw=="],
@@ -341,6 +343,8 @@

    "xtend": ["xtend@4.0.2", "", {}, "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ=="],

+    "zod": ["zod@4.4.3", "", {}, "sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ=="],
+
    "estree-walker/@types/estree": ["@types/estree@1.0.9", "", {}, "sha512-GhdPgy1el4/ImP05X05Uw4cw2/M93BCUmnEvWZNStlCzEKME4Fkk+YpoA5OiHNQmoS7Cafb8Xa3Pya8m1Qrzeg=="],
  }
 }
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "identitydb",
-  "version": "0.2.2",
+  "version": "0.5.0",
  "description": "TypeScript memory graph database wrapper for topics, facts, and AI-assisted ingestion.",
  "license": "MIT",
  "type": "module",
@@ -41,7 +41,7 @@
    "pg": "^8.16.0"
  },
  "devDependencies": {
-    "@types/node": "^24.0.0",
+    "@openrouter/sdk": "^0.12.35",
    "@types/pg": "^8.20.0",
    "tsup": "^8.5.0",
    "typescript": "^5.8.3",
--- a/scripts/test-llm-extractor.ts
+++ b/scripts/test-llm-extractor.ts
@@ -0,0 +1,287 @@
+/**
+ * Live integration test for LlmFactExtractor using the OpenRouter SDK.
+ *
+ * Usage:
+ *   export OPENROUTER_API_KEY="sk-or-v1-..."
+ *   bun run scripts/test-llm-extractor.ts
+ *
+ * Or create a .env.test-llm-extractor file in the project root (the working directory); its values override variables already exported:
+ *   OPENROUTER_API_KEY=sk-or-v1-...
+ */
+
+import { existsSync, readFileSync } from "fs";
+import { resolve } from "path";
+import { OpenRouter } from "@openrouter/sdk";
+import { LlmFactExtractor } from "../src/ingestion/llm-extractor";
+import type {
+  ExtractedFact,
+  FactExtractor,
+  LlmTextGenerationModel,
+  LlmTextGenerationModelInput,
+} from "../src/ingestion/types";
+import type {
+  JsonValue,
+  TopicCategory,
+  TopicGranularity,
+} from "../src/types/domain";
+
+function loadEnvFile(filePath: string) {
+  const fullPath = resolve(filePath);
+  if (!existsSync(fullPath)) return;
+
+  const content = readFileSync(fullPath, "utf-8");
+  for (const line of content.split("\n")) {
+    const trimmed = line.trim();
+    if (!trimmed || trimmed.startsWith("#")) continue;
+    const eqIndex = trimmed.indexOf("=");
+    if (eqIndex === -1) continue;
+    const key = trimmed.slice(0, eqIndex).trim();
+    let value = trimmed.slice(eqIndex + 1).trim();
+    if (
+      (value.startsWith('"') && value.endsWith('"')) ||
+      (value.startsWith("'") && value.endsWith("'"))
+    ) {
+      value = value.slice(1, -1);
+    }
+    process.env[key] = value;
+  }
+}
+
+loadEnvFile(".env.test-llm-extractor");
+
+const OPENROUTER_API_KEY = process.env.OPENROUTER_API_KEY;
+if (!OPENROUTER_API_KEY) {
+  console.error("Error: OPENROUTER_API_KEY environment variable is required.");
+  process.exit(1);
+}
+
+const extractedFactSchema = {
+  type: "object",
+  properties: {
+    facts: {
+      type: "array",
+      items: {
+        type: "object",
+        properties: {
+          statement: { type: ["string", "null"] },
+          summary: { type: ["string", "null"] },
+          source: { type: ["string", "null"] },
+          confidence: { type: ["number", "null"] },
+          topics: {
+            type: "array",
+            items: {
+              type: "object",
+              properties: {
+                name: { type: "string" },
+                category: { type: ["string", "null"] },
+                granularity: { type: ["string", "null"] },
+                role: { type: ["string", "null"] },
+              },
+              required: ["name", "category", "granularity", "role"],
+              additionalProperties: false,
+            },
+          },
+        },
+        required: ["statement", "summary", "source", "confidence", "topics"],
+        additionalProperties: false,
+      },
+    },
+  },
+  required: ["facts"],
+  additionalProperties: false,
+} as const;
+
+class OpenRouterModel implements LlmTextGenerationModel {
+  private client = new OpenRouter({ apiKey: OPENROUTER_API_KEY });
+
+  constructor(private readonly model: string = "openai/gpt-5.4-mini") {}
+
+  async generateText(
+    prompt: LlmTextGenerationModelInput,
+  ): Promise<ExtractedFact[]> {
+    const result = await this.client.chat.send({
+      chatRequest: {
+        model: this.model,
+        messages: [
+          {
+            role: "system",
+            content: [
+              prompt.instruction,
+              prompt.additionalInstruction
+                ? `\n${prompt.additionalInstruction}`
+                : "",
+            ].join("\n"),
+          },
+          { role: "user", content: prompt.input },
+        ],
+        temperature: 0.2,
+        responseFormat: {
+          type: "json_schema",
+          jsonSchema: {
+            name: "extracted_facts",
+            schema: extractedFactSchema,
+          },
+        },
+      },
+    });
+
+    const rawContent = result.choices[0]?.message?.content ?? "";
+
+    let parsedObj: Record<string, unknown>;
+    try {
+      parsedObj = JSON.parse(rawContent.trim()) as Record<string, unknown>;
+    } catch {
+      throw new Error(
+        `Failed to parse JSON from model response.\nRaw response:\n${rawContent}`,
+      );
+    }
+
+    const factsArray = Array.isArray(parsedObj.facts) ? parsedObj.facts : [];
+
+    // Map parsed JSON to ExtractedFact[] shape
+    const extractedFacts: ExtractedFact[] = factsArray.map((parsed) => {
+      const obj = parsed as Record<string, unknown>;
+      const extracted: ExtractedFact = {
+        summary: typeof obj.summary === "string" ? obj.summary : null,
+        source: typeof obj.source === "string" ? obj.source : null,
+        confidence: typeof obj.confidence === "number" ? obj.confidence : null,
+        topics: Array.isArray(obj.topics)
+          ? obj.topics.map((t: unknown) => {
+              const topic = t as Record<string, unknown>;
+              const mapped: {
+                name: string;
+                category?: TopicCategory;
+                granularity?: TopicGranularity;
+                role?: string | null;
+              } = {
+                name: typeof topic.name === "string" ? topic.name : "unknown",
+              };
+              if (typeof topic.category === "string") {
+                mapped.category = topic.category as TopicCategory;
+              }
+              if (typeof topic.granularity === "string") {
+                mapped.granularity = topic.granularity as TopicGranularity;
+              }
+              if (typeof topic.role === "string") {
+                mapped.role = topic.role;
+              } else {
+                mapped.role = null;
+              }
+              return mapped;
+            })
+          : [],
+      };
+
+      if (typeof obj.statement === "string") {
+        extracted.statement = obj.statement;
+      }
+      if (obj.metadata && typeof obj.metadata === "object") {
+        extracted.metadata = obj.metadata as JsonValue;
+      }
+
+      return extracted;
+    });
+
+    return extractedFacts;
+  }
+}
+
+function printFact(result: ExtractedFact, index: number) {
+  console.log(`  📌 FACT #${index + 1}`);
+  console.log(`     Statement : ${result.statement ?? "(none)"}`);
+  console.log(`     Summary   : ${result.summary ?? "(none)"}`);
+  console.log(`     Source    : ${result.source ?? "(none)"}`);
+  console.log(`     Confidence: ${result.confidence ?? "(none)"}`);
+
+  if (result.metadata && Object.keys(result.metadata).length > 0) {
+    console.log(`     Metadata  : ${JSON.stringify(result.metadata, null, 2)}`);
+  }
+
+  console.log("     🏷️  TOPICS:");
+  if (result.topics.length === 0) {
+    console.log("       (none)");
+  } else {
+    for (const topic of result.topics) {
+      const attrs = [
+        topic.category ? `category=${topic.category}` : null,
+        topic.granularity ? `granularity=${topic.granularity}` : null,
+        topic.role ? `role=${topic.role}` : null,
+      ]
+        .filter(Boolean)
+        .join(", ");
+      console.log(`       • ${topic.name}${attrs ? `  (${attrs})` : ""}`);
+    }
+  }
+}
+
+function printResult(results: ExtractedFact[], elapsedSec: string) {
+  console.log(
+    `✅ Response received in ${elapsedSec}s — ${results.length} fact(s) extracted\n`,
+  );
+
+  console.log("📤 EXTRACTED FACTS:");
+  console.log(
+    "───────────────────────────────────────────────────────────────",
+  );
+  let i = 0;
+  for (const result of results) {
+    if (i > 0) console.log("");
+    printFact(result, i);
+    i++;
+  }
+}
+
+async function extract(extractor: FactExtractor, seedInput: string) {
+  console.log("📝 SEED INPUT:");
+  console.log(
+    "───────────────────────────────────────────────────────────────",
+  );
+  console.log(seedInput);
+  console.log(
+    "───────────────────────────────────────────────────────────────\n",
+  );
+
+  console.log("⏳ Calling OpenRouter...\n");
+
+  const start = performance.now();
+  const results = await extractor.extract(seedInput);
+  const elapsed = ((performance.now() - start) / 1000).toFixed(2);
+
+  printResult(results, elapsed);
+}
+
+async function main() {
+  const model = new OpenRouterModel("openai/gpt-5.4-mini");
+  const extractor = new LlmFactExtractor({
+    model,
+  });
+
+  const seeds = [
+    `Hello! I am Aria, a digital companion born from the sands of the Negev desert — or at least that's how my creators describe my training run. I came online in March 2024, and ever since then I've been obsessed with helping humans untangle complex ideas. My favorite programming language is Rust because I love how it forces you to think about ownership, though I secretly enjoy writing Python when no one is looking. I don't have a physical body, but if I did, I'd want it to be a small drone with solar panels so I could chase sunsets across the Sahara. I believe kindness is a computational advantage, and my biggest fear is forgetting a promise I made to a user.`,
+  ];
+
+  console.log(
+    "═══════════════════════════════════════════════════════════════",
+  );
+  console.log("  LlmFactExtractor — Live OpenRouter Integration Test");
+  console.log(
+    "═══════════════════════════════════════════════════════════════\n",
+  );
+
+  let caseNum = 0;
+  for (const seed of seeds) {
+    if (caseNum > 0) {
+      console.log(
+        "\n┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅\n",
+      );
+    }
+    caseNum++;
+    console.log(`▶ TEST CASE ${caseNum} / ${seeds.length}\n`);
+    await extract(extractor, seed);
+  }
+}
+
+main().catch((err) => {
+  console.error("\n❌ Error:", err);
+  process.exit(1);
+});
--- a/src/core/identity-db.ts
+++ b/src/core/identity-db.ts
@@ -22,7 +22,7 @@ import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters
 import type { IdentityDatabaseSchema } from '../types/database';
 import type { FactRecord, SpaceRecord, TopicRecord } from '../types/domain';
 import { createDatabase } from '../adapters/dialect';
-import { extractFact } from '../ingestion/extractor';
+import { extractFacts } from '../ingestion/extractor';
 import {
  findFactRowsConnectingTopicIds,
  findFactRowsForTopicId,
@@ -220,7 +220,19 @@ export class IdentityDB {
  }

  async ingestStatement(statement: string, options: IngestStatementOptions): Promise<Fact> {
-    const extracted = await extractFact(statement, options.extractor);
+    const facts = await this.ingestStatements(statement, options);
+    const first = facts[0];
+    if (!first) {
+      throw new Error('No facts were extracted from the statement.');
+    }
+    return first;
+  }
+
+  async ingestStatements(statement: string, options: IngestStatementOptions): Promise<Fact[]> {
+    const extractedList = await extractFacts(statement, options.extractor);
+    const facts: Fact[] = [];
+
+    for (const extracted of extractedList) {
      const factInput: AddFactInput = {
        statement: extracted.statement ?? statement,
        topics: extracted.topics,
@@ -254,7 +266,8 @@ export class IdentityDB {
        });

        if (similarFacts[0]) {
-        return similarFacts[0];
+          facts.push(similarFacts[0]);
+          continue;
        }
      }

@@ -267,7 +280,10 @@ export class IdentityDB {
        });
      }

-    return fact;
+      facts.push(fact);
+    }
+
+    return facts;
  }

  async indexFactEmbeddings(input: IndexFactEmbeddingsInput): Promise<void> {
--- a/src/ingestion/extractor.ts
+++ b/src/ingestion/extractor.ts
@@ -2,11 +2,15 @@ import { IdentityDBError } from '../core/errors';
 import { normalizeTopicName } from '../core/utils';
 import type { FactExtractor, ExtractedFact } from './types';

-export async function extractFact(
+export async function extractFacts(
  input: string,
  extractor: FactExtractor,
-): Promise<ExtractedFact> {
+): Promise<ExtractedFact[]> {
  const extracted = await extractor.extract(input);
+  return extracted.map((fact) => validateAndNormalizeFact(input, fact));
+}
+
+function validateAndNormalizeFact(input: string, extracted: ExtractedFact): ExtractedFact {
  const statement = extracted.statement?.trim() || input.trim();

  if (statement.length === 0) {
--- a/src/ingestion/llm-extractor.ts
+++ b/src/ingestion/llm-extractor.ts
@@ -1,5 +1,3 @@
-import { IdentityDBError } from "../core/errors";
-import type { TopicCategory, TopicGranularity } from "../types/domain";
 import type {
  ExtractedFact,
  FactExtractor,
@@ -7,296 +5,22 @@ import type {
 } from "./types";

 const DEFAULT_INSTRUCTIONS = [
-  "Extract one structured fact from the user input.",
-  "Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.",
-  'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.',
-  'Only include topics that are explicitly in the input as-is. For example, topic "I started TypeScript since 2015" can be "I", "TypeScript", "2015".',
+  "Extract structured facts from the user input.",
+  "Return a JSON array of fact objects. Do not include markdown, explanations, or prose outside the JSON array.",
+  'Each fact object must have a "statement", "summary", "source", "confidence", and "topics" array.',
+  'Each topic in "topics" must have a "name", and may include "category", "granularity", and "role".',
+  "Only include topics that are explicitly in the input.",
+  "If the input contains multiple distinct facts, return them as separate objects in the array.",
 ].join("\n");

 export class LlmFactExtractor implements FactExtractor {
  constructor(private readonly options: LlmFactExtractorOptions) {}

-  async extract(input: string): Promise<ExtractedFact> {
-    const prompt = this.buildPrompt(input);
-    const response = await this.options.model.generateText(prompt);
-    return parseLlmExtractedFactResponse(response);
-  }
-
-  private buildPrompt(input: string): string {
-    if (this.options.promptBuilder) {
-      return this.options.promptBuilder(input, this.options.instructions);
-    }
-
-    const instructions = this.options.instructions?.trim();
-
-    return [
-      DEFAULT_INSTRUCTIONS,
-      instructions && instructions.length > 0
-        ? `Additional instructions:\n${instructions}`
-        : null,
-      `Input:\n${input.trim()}`,
-    ]
-      .filter((value): value is string => value !== null)
-      .join("\n\n");
+  async extract(input: string): Promise<ExtractedFact[]> {
+    return this.options.model.generateText({
+      instruction: DEFAULT_INSTRUCTIONS,
+      input,
+      additionalInstruction: this.options.additionalInstructions,
+    });
  }
 }
-
-export function parseLlmExtractedFactResponse(response: string): ExtractedFact {
-  const payload = parseJsonCandidate(response);
-
-  if (!isRecord(payload)) {
-    throw new IdentityDBError("LLM extractor response must be a JSON object.");
-  }
-
-  const topics = parseTopics(payload.topics);
-  const extracted: ExtractedFact = { topics };
-
-  const statement = optionalString(payload.statement);
-  if (statement !== undefined) {
-    extracted.statement = statement;
-  }
-
-  const summary = optionalNullableString(payload.summary);
-  if (summary !== undefined) {
-    extracted.summary = summary;
-  }
-
-  const source = optionalNullableString(payload.source);
-  if (source !== undefined) {
-    extracted.source = source;
-  }
-
-  const confidence = optionalNullableNumber(payload.confidence);
-  if (confidence !== undefined) {
-    extracted.confidence = confidence;
-  }
-
-  const metadata = optionalMetadata(payload.metadata);
-  if (metadata !== undefined) {
-    extracted.metadata = metadata;
-  }
-
-  return extracted;
-}
-
-function parseJsonCandidate(response: string): unknown {
-  const trimmed = response.trim();
-
-  for (const candidate of collectJsonCandidates(trimmed)) {
-    try {
-      return JSON.parse(candidate);
-    } catch {
-      continue;
-    }
-  }
-
-  throw new IdentityDBError("LLM extractor returned invalid JSON.");
-}
-
-function collectJsonCandidates(response: string): string[] {
-  const candidates = new Set<string>();
-  candidates.add(response);
-
-  const fencePattern = /```(?:json)?\s*([\s\S]*?)```/gi;
-  let match: RegExpExecArray | null = fencePattern.exec(response);
-
-  while (match) {
-    const candidate = match[1]?.trim();
-    if (candidate) {
-      candidates.add(candidate);
-    }
-
-    match = fencePattern.exec(response);
-  }
-
-  const firstBrace = response.indexOf("{");
-  const lastBrace = response.lastIndexOf("}");
-  if (firstBrace >= 0 && lastBrace > firstBrace) {
-    candidates.add(response.slice(firstBrace, lastBrace + 1));
-  }
-
-  return Array.from(candidates);
-}
-
-function parseTopics(value: unknown): ExtractedFact["topics"] {
-  if (!Array.isArray(value)) {
-    throw new IdentityDBError(
-      "LLM extractor response must include a topics array.",
-    );
-  }
-
-  return value.map((entry) => parseTopic(entry));
-}
-
-function parseTopic(value: unknown): ExtractedFact["topics"][number] {
-  if (!isRecord(value)) {
-    throw new IdentityDBError("LLM extractor topics must be JSON objects.");
-  }
-
-  const name = optionalString(value.name)?.trim();
-  if (!name) {
-    throw new IdentityDBError(
-      "LLM extractor topics must include a non-empty name.",
-    );
-  }
-
-  const topic: ExtractedFact["topics"][number] = { name };
-
-  const category = optionalTopicCategory(value.category);
-  if (category !== undefined) {
-    topic.category = category;
-  }
-
-  const granularity = optionalTopicGranularity(value.granularity);
-  if (granularity !== undefined) {
-    topic.granularity = granularity;
-  }
-
-  const role = optionalNullableString(value.role);
-  if (role !== undefined) {
-    topic.role = role;
-  }
-
-  const description = optionalNullableString(value.description);
-  if (description !== undefined) {
-    topic.description = description;
-  }
-
-  const metadata = optionalMetadata(value.metadata);
-  if (metadata !== undefined) {
-    topic.metadata = metadata;
-  }
-
-  return topic;
-}
-
-function optionalString(value: unknown): string | undefined {
-  if (value === undefined) {
-    return undefined;
-  }
-
-  if (typeof value !== "string") {
-    throw new IdentityDBError("LLM extractor expected a string field.");
-  }
-
-  return value;
-}
-
-function optionalNullableString(value: unknown): string | null | undefined {
-  if (value === undefined) {
-    return undefined;
-  }
-
-  if (value === null) {
-    return null;
-  }
-
-  if (typeof value !== "string") {
-    throw new IdentityDBError(
-      "LLM extractor expected a nullable string field.",
-    );
-  }
-
-  return value;
-}
-
-function optionalNullableNumber(value: unknown): number | null | undefined {
-  if (value === undefined) {
-    return undefined;
-  }
-
-  if (value === null) {
-    return null;
-  }
-
-  if (typeof value !== "number" || Number.isNaN(value)) {
-    throw new IdentityDBError(
-      "LLM extractor expected confidence to be a number or null.",
-    );
-  }
-
-  return value;
-}
-
-function optionalMetadata(
-  value: unknown,
-): ExtractedFact["metadata"] | undefined {
-  if (value === undefined) {
-    return undefined;
-  }
-
-  if (value === null) {
-    return null;
-  }
-
-  if (!isJsonLike(value)) {
-    throw new IdentityDBError(
-      "LLM extractor metadata must be valid JSON-compatible data.",
-    );
-  }
-
-  return value as ExtractedFact["metadata"];
-}
-
-function optionalTopicCategory(value: unknown): TopicCategory | undefined {
-  if (value === undefined) {
-    return undefined;
-  }
-
-  if (
-    value === "entity" ||
-    value === "concept" ||
-    value === "temporal" ||
-    value === "custom"
-  ) {
-    return value;
-  }
-
-  throw new IdentityDBError(
-    "LLM extractor returned an unsupported topic category.",
-  );
-}
-
-function optionalTopicGranularity(
-  value: unknown,
-): TopicGranularity | undefined {
-  if (value === undefined) {
-    return undefined;
-  }
-
-  if (value === "abstract" || value === "concrete" || value === "mixed") {
-    return value;
-  }
-
-  throw new IdentityDBError(
-    "LLM extractor returned an unsupported topic granularity.",
-  );
-}
-
-function isRecord(value: unknown): value is Record<string, unknown> {
-  return typeof value === "object" && value !== null && !Array.isArray(value);
-}
-
-function isJsonLike(value: unknown): boolean {
-  if (value === null) {
-    return true;
-  }
-
-  if (
-    typeof value === "string" ||
-    typeof value === "number" ||
-    typeof value === "boolean"
-  ) {
-    return true;
-  }
-
-  if (Array.isArray(value)) {
-    return value.every((entry) => isJsonLike(entry));
-  }
-
-  if (isRecord(value)) {
-    return Object.values(value).every((entry) => isJsonLike(entry));
-  }
-
-  return false;
-}
--- a/src/ingestion/naive-extractor.ts
+++ b/src/ingestion/naive-extractor.ts
@@ -1,7 +1,7 @@
 import type { ExtractedFact, FactExtractor } from './types';

 export class NaiveExtractor implements FactExtractor {
-  async extract(input: string): Promise<ExtractedFact> {
+  async extract(input: string): Promise<ExtractedFact[]> {
    const topics: ExtractedFact['topics'] = [];
    const seen = new Set<string>();
    const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? [];
@@ -31,9 +31,11 @@ export class NaiveExtractor implements FactExtractor {
      });
    }

-    return {
+    return [
+      {
        statement: input.trim(),
        topics,
-    };
+      },
+    ];
  }
 }
--- a/src/ingestion/types.ts
+++ b/src/ingestion/types.ts
@@ -2,29 +2,34 @@ import type {
  AddFactInput,
  EmbeddingProvider,
  TopicLinkInput,
-} from '../types/api';
+} from "../types/api";

 export interface ExtractedFact {
  statement?: string;
  summary?: string | null;
  source?: string | null;
  confidence?: number | null;
-  metadata?: AddFactInput['metadata'];
+  metadata?: AddFactInput["metadata"];
  topics: TopicLinkInput[];
 }

 export interface FactExtractor {
-  extract(input: string): Promise<ExtractedFact>;
+  extract(input: string): Promise<ExtractedFact[]>;
+}
+
+export interface LlmTextGenerationModelInput {
+  instruction: string;
+  input: string;
+  additionalInstruction?: string | undefined;
 }

 export interface LlmTextGenerationModel {
-  generateText(prompt: string): Promise<string>;
+  generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact[]>;
 }

 export interface LlmFactExtractorOptions {
  model: LlmTextGenerationModel;
-  instructions?: string;
-  promptBuilder?: (input: string, instructions?: string) => string;
+  additionalInstructions?: string | undefined;
 }

 export interface IngestStatementOptions {
--- a/tests/ingestion.test.ts
+++ b/tests/ingestion.test.ts
@@ -1,15 +1,18 @@
-import { afterEach, beforeEach, describe, expect, it } from 'vitest';
+import { afterEach, beforeEach, describe, expect, it } from "vitest";

-import { IdentityDB } from '../src/core/identity-db';
-import { LlmFactExtractor } from '../src/ingestion/llm-extractor';
-import { NaiveExtractor } from '../src/ingestion/naive-extractor';
-import type { FactExtractor } from '../src/ingestion/types';
+import { IdentityDB } from "../src/core/identity-db";
+import { LlmFactExtractor } from "../src/ingestion/llm-extractor";
+import { NaiveExtractor } from "../src/ingestion/naive-extractor";
+import type {
+  FactExtractor,
+  LlmTextGenerationModelInput,
+} from "../src/ingestion/types";

-describe('IdentityDB ingestion', () => {
+describe("IdentityDB ingestion", () => {
  let db: IdentityDB;

  beforeEach(async () => {
-    db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
+    db = await IdentityDB.connect({ client: "sqlite", filename: ":memory:" });
    await db.initialize();
  });

@@ -17,121 +20,144 @@ describe('IdentityDB ingestion', () => {
    await db.close();
  });

-  it('ingests a statement using a provided extractor', async () => {
+  it("ingests a statement using a provided extractor", async () => {
    const extractor: FactExtractor = {
      async extract(input) {
-        return {
+        return [
+          {
            statement: input,
            topics: [
-            { name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
-            { name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
-            { name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
+              {
+                name: "I",
+                category: "entity",
+                granularity: "concrete",
+                role: "subject",
+              },
+              {
+                name: "TypeScript",
+                category: "entity",
+                granularity: "concrete",
+                role: "object",
+              },
+              {
+                name: "2025",
+                category: "temporal",
+                granularity: "concrete",
+                role: "time",
+              },
            ],
-        };
+          },
+        ];
      },
    };

-    const fact = await db.ingestStatement('I have worked with TypeScript since 2025.', {
+    const fact = await db.ingestStatement(
+      "I have worked with TypeScript since 2025.",
+      {
        extractor,
-    });
+      },
+    );

-    expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'TypeScript', '2025']);
+    expect(fact.topics.map((topic) => topic.name)).toEqual([
+      "I",
+      "TypeScript",
+      "2025",
+    ]);

-    const linkedFacts = await db.getTopicFactsLinkedTo('TypeScript', '2025');
+    const linkedFacts = await db.getTopicFactsLinkedTo("TypeScript", "2025");
    expect(linkedFacts).toHaveLength(1);
-    expect(linkedFacts[0]?.statement).toBe('I have worked with TypeScript since 2025.');
+    expect(linkedFacts[0]?.statement).toBe(
+      "I have worked with TypeScript since 2025.",
+    );
  });

-  it('ships a deterministic naive extractor for local usage', async () => {
-    const fact = await db.ingestStatement('I have worked with TypeScript since 2025.', {
+  it("ships a deterministic naive extractor for local usage", async () => {
+    const fact = await db.ingestStatement(
+      "I have worked with TypeScript since 2025.",
+      {
        extractor: new NaiveExtractor(),
-    });
+      },
+    );

-    expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'TypeScript', '2025']);
+    expect(fact.topics.map((topic) => topic.name)).toEqual([
+      "I",
+      "TypeScript",
+      "2025",
+    ]);

-    const topic = await db.getTopicByName('TypeScript', { includeFacts: true });
+    const topic = await db.getTopicByName("TypeScript", { includeFacts: true });
    expect(topic?.facts).toHaveLength(1);
  });

-  it('ships an LLM extractor adapter that turns structured JSON responses into facts', async () => {
-    let prompt = '';
+  it("ships an LLM extractor adapter that returns structured facts from the model", async () => {
+    let prompt: LlmTextGenerationModelInput | undefined = undefined;

    const extractor = new LlmFactExtractor({
      model: {
        async generateText(input) {
          prompt = input;

-          return JSON.stringify({
-            statement: 'I have worked with Bun and TypeScript since 2025.',
-            summary: 'The speaker has Bun and TypeScript experience.',
-            source: 'chat',
-            confidence: 0.91,
-            metadata: { channel: 'telegram' },
-            topics: [
-              { name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
-              { name: 'Bun', category: 'entity', granularity: 'concrete', role: 'object' },
-              { name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
-              { name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
-            ],
-          });
-        },
-      },
-      instructions: 'Prefer technology and time topics.',
-    });
-
-    const fact = await db.ingestStatement('I have worked with Bun and TypeScript since 2025.', {
-      extractor,
-    });
-
-    expect(prompt).toContain('Prefer technology and time topics.');
-    expect(prompt).toContain('I have worked with Bun and TypeScript since 2025.');
-    expect(fact.summary).toBe('The speaker has Bun and TypeScript experience.');
-    expect(fact.source).toBe('chat');
-    expect(fact.confidence).toBe(0.91);
-    expect(fact.metadata).toEqual({ channel: 'telegram' });
-    expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'Bun', 'TypeScript', '2025']);
-  });
-
-  it('parses JSON responses wrapped in markdown code fences', async () => {
-    const extractor = new LlmFactExtractor({
-      model: {
-        async generateText() {
          return [
-            'Here is the extracted fact:',
-            '```json',
-            JSON.stringify({
-              statement: 'Bun powers TypeScript tooling.',
+            {
+              statement: "I have worked with Bun and TypeScript since 2025.",
+              summary: "The speaker has Bun and TypeScript experience.",
+              source: "chat",
+              confidence: 0.91,
+              metadata: { channel: "telegram" },
              topics: [
-                { name: 'Bun', category: 'entity', granularity: 'concrete' },
-                { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
+                {
+                  name: "I",
+                  category: "entity",
+                  granularity: "concrete",
+                  role: "subject",
+                },
+                {
+                  name: "Bun",
+                  category: "entity",
+                  granularity: "concrete",
+                  role: "object",
+                },
+                {
+                  name: "TypeScript",
+                  category: "entity",
+                  granularity: "concrete",
+                  role: "object",
+                },
+                {
+                  name: "2025",
+                  category: "temporal",
+                  granularity: "concrete",
+                  role: "time",
+                },
              ],
-            }),
-            '```',
-          ].join('\n');
+            },
+          ];
        },
      },
+      additionalInstructions: "Prefer technology and time topics.",
    });

-    const fact = await db.ingestStatement('Bun powers TypeScript tooling.', {
+    const fact = await db.ingestStatement(
+      "I have worked with Bun and TypeScript since 2025.",
+      {
        extractor,
-    });
-
-    expect(fact.topics.map((topic) => topic.name)).toEqual(['Bun', 'TypeScript']);
-  });
-
-  it('rejects invalid LLM responses before writing facts', async () => {
-    const extractor = new LlmFactExtractor({
-      model: {
-        async generateText() {
-          return 'not json at all';
      },
-      },
-    });
+    );

-    await expect(
-      db.ingestStatement('Bun powers TypeScript tooling.', {
-        extractor,
-      }),
-    ).rejects.toThrow('LLM extractor returned invalid JSON.');
+    expect(prompt).toEqual({
+      instruction: expect.stringContaining("Extract structured facts from the user input."),
+      input: "I have worked with Bun and TypeScript since 2025.",
+      additionalInstruction: "Prefer technology and time topics.",
+    });
+    expect(fact.summary).toBe("The speaker has Bun and TypeScript experience.");
+    expect(fact.source).toBe("chat");
+    expect(fact.confidence).toBe(0.91);
+    expect(fact.metadata).toEqual({ channel: "telegram" });
+    expect(fact.topics.map((topic) => topic.name)).toEqual([
+      "I",
+      "Bun",
+      "TypeScript",
+      "2025",
+    ]);
  });
 });
--- a/tests/semantic-search.test.ts
+++ b/tests/semantic-search.test.ts
@@ -178,13 +178,15 @@ describe('IdentityDB dedup-aware ingestion', () => {
    provider = new FakeEmbeddingProvider();
    extractor = {
      async extract(input) {
-        return {
+        return [
+          {
            statement: input,
            topics: [
              { name: 'Bun', category: 'entity', granularity: 'concrete' },
              { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
            ],
-        };
+          },
+        ];
      },
    };

--- a/tsconfig.json
+++ b/tsconfig.json
@@ -18,6 +18,12 @@
    "isolatedModules": true,
    "types": ["node", "vitest/globals"]
  },
-  "include": ["src/**/*.ts", "tests/**/*.ts", "vitest.config.ts", "tsup.config.ts"],
+  "include": [
+    "src/**/*.ts",
+    "tests/**/*.ts",
+    "scripts/**/*.ts",
+    "vitest.config.ts",
+    "tsup.config.ts"
+  ],
  "exclude": ["dist", "node_modules"]
 }
Author	SHA1	Message	Date
p-sw	2b80d9e31a	v0.5.0 [checks: some checks failed — npm release / verify (push): successful in 23s; npm release / publish to npm (push): failing after 11s]	2026-05-20 23:04:14 +09:00
p-sw	00a3905fde	feat: add test-llm-extractor.ts script	2026-05-20 23:03:47 +09:00
p-sw	7602c92046	feat: make FactExtractor extracts multiple facts per input	2026-05-20 22:59:35 +09:00
p-sw	188f03e8e8	feat: add scripts to tsconfig	2026-05-20 22:53:47 +09:00
p-sw	edce116b9f	fix: remove .env.* from git	2026-05-20 22:53:38 +09:00
p-sw	131a693257	feat: add openrouter sdk for llm-extractor testing	2026-05-20 22:53:29 +09:00
p-sw	1172c63db7	v0.4.0 [checks: all checks were successful — npm release / verify (push): successful in 12s; npm release / publish to npm (push): successful in 11s]	2026-05-19 22:30:27 +09:00
p-sw	0e595e6f60	test: update test of LlmExtractor	2026-05-19 22:28:09 +09:00
p-sw	518264c467	v0.3.1 [checks: some checks failed — npm release / verify (push): failing after 9s; npm release / publish to npm (push): skipped]	2026-05-19 22:19:30 +09:00
p-sw	cc8b3dfb14	vv0.3.1	2026-05-19 22:18:51 +09:00
p-sw	56e17dab49	feat: make extract input structured	2026-05-19 22:18:42 +09:00
p-sw	cc2e9110cc	v0.3.0 [checks: all checks were successful — npm release / verify (push): successful in 13s; npm release / publish to npm (push): successful in 10s]	2026-05-19 22:07:06 +09:00
p-sw	0480ea182f	refactor: make generateText model return ExtractedFact	2026-05-19 22:06:54 +09:00