Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 2b80d9e31a | |||
| 00a3905fde | |||
| 7602c92046 | |||
| 188f03e8e8 | |||
| edce116b9f | |||
| 131a693257 | |||
| 1172c63db7 | |||
| 0e595e6f60 | |||
| 518264c467 | |||
| cc8b3dfb14 | |||
| 56e17dab49 | |||
| cc2e9110cc | |||
| 0480ea182f | |||
| 185edfdae8 | |||
| a33fd61c97 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -4,3 +4,4 @@ coverage/
|
||||
.env
|
||||
.DS_Store
|
||||
*.log
|
||||
.env.*
|
||||
6
bun.lock
6
bun.lock
@@ -10,7 +10,7 @@
|
||||
"pg": "^8.16.0",
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^24.0.0",
|
||||
"@openrouter/sdk": "^0.12.35",
|
||||
"@types/pg": "^8.20.0",
|
||||
"tsup": "^8.5.0",
|
||||
"typescript": "^5.8.3",
|
||||
@@ -79,6 +79,8 @@
|
||||
|
||||
"@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="],
|
||||
|
||||
"@openrouter/sdk": ["@openrouter/sdk@0.12.35", "", { "dependencies": { "zod": "^3.25.0 || ^4.0.0" } }, "sha512-s4QVLLnG1AmfW3TjnnHUqGfsCkzwVK+kboGcZmKbde09m1DPqgzl4RUFt/HJ5v97MX8aEaN0UG3mKv2S+qj2Gw=="],
|
||||
|
||||
"@rollup/rollup-android-arm-eabi": ["@rollup/rollup-android-arm-eabi@4.60.3", "", { "os": "android", "cpu": "arm" }, "sha512-x35CNW/ANXG3hE/EZpRU8MXX1JDN86hBb2wMGAtltkz7pc6cxgjpy1OMMfDosOQ+2hWqIkag/fGok1Yady9nGw=="],
|
||||
|
||||
"@rollup/rollup-android-arm64": ["@rollup/rollup-android-arm64@4.60.3", "", { "os": "android", "cpu": "arm64" }, "sha512-xw3xtkDApIOGayehp2+Rz4zimfkaX65r4t47iy+ymQB2G4iJCBBfj0ogVg5jpvjpn8UWn/+q9tprxleYeNp3Hw=="],
|
||||
@@ -341,6 +343,8 @@
|
||||
|
||||
"xtend": ["xtend@4.0.2", "", {}, "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ=="],
|
||||
|
||||
"zod": ["zod@4.4.3", "", {}, "sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ=="],
|
||||
|
||||
"estree-walker/@types/estree": ["@types/estree@1.0.9", "", {}, "sha512-GhdPgy1el4/ImP05X05Uw4cw2/M93BCUmnEvWZNStlCzEKME4Fkk+YpoA5OiHNQmoS7Cafb8Xa3Pya8m1Qrzeg=="],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "identitydb",
|
||||
"version": "0.2.1",
|
||||
"version": "0.5.0",
|
||||
"description": "TypeScript memory graph database wrapper for topics, facts, and AI-assisted ingestion.",
|
||||
"license": "MIT",
|
||||
"type": "module",
|
||||
@@ -41,7 +41,7 @@
|
||||
"pg": "^8.16.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^24.0.0",
|
||||
"@openrouter/sdk": "^0.12.35",
|
||||
"@types/pg": "^8.20.0",
|
||||
"tsup": "^8.5.0",
|
||||
"typescript": "^5.8.3",
|
||||
|
||||
287
scripts/test-llm-extractor.ts
Normal file
287
scripts/test-llm-extractor.ts
Normal file
@@ -0,0 +1,287 @@
|
||||
/**
|
||||
* Live integration test for LlmFactExtractor using OpenRouter SDK.
|
||||
*
|
||||
* Usage:
|
||||
* export OPENROUTER_API_KEY="sk-or-v1-..."
|
||||
* bun run scripts/test-llm-extractor.ts
|
||||
*
|
||||
* Or create a .env.test-llm-extractor file in the project root:
|
||||
* OPENROUTER_API_KEY=sk-or-v1-...
|
||||
*/
|
||||
|
||||
import { existsSync, readFileSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
import { OpenRouter } from "@openrouter/sdk";
|
||||
import { LlmFactExtractor } from "../src/ingestion/llm-extractor";
|
||||
import type {
|
||||
ExtractedFact,
|
||||
FactExtractor,
|
||||
LlmTextGenerationModel,
|
||||
LlmTextGenerationModelInput,
|
||||
} from "../src/ingestion/types";
|
||||
import type {
|
||||
JsonValue,
|
||||
TopicCategory,
|
||||
TopicGranularity,
|
||||
} from "../src/types/domain";
|
||||
|
||||
/**
 * Loads `KEY=value` pairs from a dotenv-style file into `process.env`.
 *
 * A missing file is ignored, so the script still works when the variables are
 * exported in the shell instead. Blank lines, `#` comment lines, lines without
 * `=`, and lines with an empty key are skipped. A leading `export ` before the
 * key is accepted so shell-style env files can be reused. A value wrapped in a
 * matching pair of single or double quotes has the quotes removed.
 *
 * Values read from the file overwrite variables that are already set.
 *
 * @param filePath Path to the env file; relative paths are resolved against
 *   the current working directory.
 */
function loadEnvFile(filePath: string): void {
  const fullPath = resolve(filePath);
  if (!existsSync(fullPath)) return;

  const content = readFileSync(fullPath, "utf-8");
  for (const line of content.split(/\r?\n/)) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;

    const eqIndex = trimmed.indexOf("=");
    if (eqIndex === -1) continue;

    // Tolerate shell syntax ("export KEY=value") and reject empty keys, which
    // would otherwise create a nameless environment variable.
    const key = trimmed
      .slice(0, eqIndex)
      .replace(/^export\s+/, "")
      .trim();
    if (!key) continue;

    let value = trimmed.slice(eqIndex + 1).trim();
    const quote = value[0];
    // Require at least two characters so a lone quote is not treated as an
    // opening and closing pair at the same time.
    if (
      value.length >= 2 &&
      (quote === '"' || quote === "'") &&
      value.endsWith(quote)
    ) {
      value = value.slice(1, -1);
    }

    process.env[key] = value;
  }
}
|
||||
|
||||
// Load the optional local env file before the API key is read.
loadEnvFile(".env.test-llm-extractor");

// The script cannot do anything useful without a key, so stop right away with
// a non-zero exit code when it is missing or empty.
const { OPENROUTER_API_KEY } = process.env;
if (OPENROUTER_API_KEY === undefined || OPENROUTER_API_KEY === "") {
  console.error("Error: OPENROUTER_API_KEY environment variable is required.");
  process.exit(1);
}
|
||||
|
||||
// JSON Schema fragments shared by the fact and topic definitions below.
const nullableStringSchema = { type: ["string", "null"] } as const;
const nullableNumberSchema = { type: ["number", "null"] } as const;

// Shape of one topic attached to a fact: a required string name plus three
// nullable string attributes. Every property is listed as required.
const topicSchema = {
  type: "object",
  properties: {
    name: { type: "string" },
    category: nullableStringSchema,
    granularity: nullableStringSchema,
    role: nullableStringSchema,
  },
  required: ["name", "category", "granularity", "role"],
  additionalProperties: false,
} as const;

// Shape of one extracted fact, including its list of topics.
const factSchema = {
  type: "object",
  properties: {
    statement: nullableStringSchema,
    summary: nullableStringSchema,
    source: nullableStringSchema,
    confidence: nullableNumberSchema,
    topics: {
      type: "array",
      items: topicSchema,
    },
  },
  required: ["statement", "summary", "source", "confidence", "topics"],
  additionalProperties: false,
} as const;

/**
 * JSON Schema passed to the model as the structured-output contract: a single
 * object whose `facts` property is an array of fact objects.
 */
const extractedFactSchema = {
  type: "object",
  properties: {
    facts: {
      type: "array",
      items: factSchema,
    },
  },
  required: ["facts"],
  additionalProperties: false,
} as const;
|
||||
|
||||
/** Type guard for plain JSON objects (non-null, non-array). */
function isObjectRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** Maps one raw topic object from the model's JSON onto the topic shape of an ExtractedFact. */
function toExtractedTopic(
  topic: Record<string, unknown>,
): ExtractedFact["topics"][number] {
  const mapped: ExtractedFact["topics"][number] = {
    name: typeof topic.name === "string" ? topic.name : "unknown",
  };

  // Optional attributes are only set when the model returned a string, so
  // absent values stay absent instead of becoming explicit `undefined`.
  if (typeof topic.category === "string") {
    mapped.category = topic.category as TopicCategory;
  }
  if (typeof topic.granularity === "string") {
    mapped.granularity = topic.granularity as TopicGranularity;
  }
  mapped.role = typeof topic.role === "string" ? topic.role : null;

  return mapped;
}

/** Maps one raw fact object from the model's JSON onto an ExtractedFact. */
function toExtractedFact(raw: Record<string, unknown>): ExtractedFact {
  const extracted: ExtractedFact = {
    summary: typeof raw.summary === "string" ? raw.summary : null,
    source: typeof raw.source === "string" ? raw.source : null,
    confidence: typeof raw.confidence === "number" ? raw.confidence : null,
    // Entries that are not JSON objects (e.g. null) are dropped rather than
    // dereferenced.
    topics: Array.isArray(raw.topics)
      ? raw.topics.filter(isObjectRecord).map(toExtractedTopic)
      : [],
  };

  if (typeof raw.statement === "string") {
    extracted.statement = raw.statement;
  }
  if (raw.metadata && typeof raw.metadata === "object") {
    extracted.metadata = raw.metadata as JsonValue;
  }

  return extracted;
}

/**
 * LlmTextGenerationModel backed by the OpenRouter chat API.
 *
 * Sends the extractor's instruction as the system message and the input as
 * the user message, requests a `json_schema` response described by
 * `extractedFactSchema`, and converts the returned JSON into ExtractedFact
 * objects.
 */
class OpenRouterModel implements LlmTextGenerationModel {
  private client = new OpenRouter({ apiKey: OPENROUTER_API_KEY });

  /**
   * @param model OpenRouter model identifier used for every request.
   */
  constructor(private readonly model: string = "openai/gpt-5.4-mini") {}

  /**
   * Runs one extraction request.
   *
   * @param prompt Instruction, optional additional instruction, and the input text.
   * @returns The facts found in the response; an empty array when the
   *   response carries no fact list.
   * @throws Error when the response content is not valid JSON.
   */
  async generateText(
    prompt: LlmTextGenerationModelInput,
  ): Promise<ExtractedFact[]> {
    // Join only the parts that are present, so the system prompt has no
    // trailing newline when there is no additional instruction.
    const systemPrompt = [prompt.instruction, prompt.additionalInstruction]
      .filter(
        (part): part is string =>
          typeof part === "string" && part.trim().length > 0,
      )
      .join("\n\n");

    const result = await this.client.chat.send({
      chatRequest: {
        model: this.model,
        messages: [
          { role: "system", content: systemPrompt },
          { role: "user", content: prompt.input },
        ],
        temperature: 0.2,
        responseFormat: {
          type: "json_schema",
          jsonSchema: {
            name: "extracted_facts",
            schema: extractedFactSchema,
          },
        },
      },
    });

    // Anything other than plain text content is treated as an empty response
    // and reported through the parse error below.
    const content: unknown = result.choices[0]?.message?.content;
    const rawContent = typeof content === "string" ? content : "";

    let payload: unknown;
    try {
      payload = JSON.parse(rawContent.trim());
    } catch {
      throw new Error(
        `Failed to parse JSON from model response.\nRaw response:\n${rawContent}`,
      );
    }

    // The schema asks for `{ "facts": [...] }`, but a bare top-level array is
    // accepted too instead of being silently read as "no facts".
    let rawFacts: unknown[] = [];
    if (Array.isArray(payload)) {
      rawFacts = payload;
    } else if (isObjectRecord(payload) && Array.isArray(payload.facts)) {
      rawFacts = payload.facts;
    }

    return rawFacts.filter(isObjectRecord).map(toExtractedFact);
  }
}
|
||||
|
||||
/**
 * Writes one extracted fact and its topics to stdout in a readable layout.
 *
 * Missing statement, summary, source, or confidence values are shown as
 * "(none)". Metadata is printed only when it has at least one key.
 *
 * @param result The fact to print.
 * @param index Zero-based position of the fact; shown one-based.
 */
function printFact(result: ExtractedFact, index: number) {
  const { statement, summary, source, confidence, metadata, topics } = result;

  console.log(` 📌 FACT #${index + 1}`);
  console.log(` Statement : ${statement ?? "(none)"}`);
  console.log(` Summary : ${summary ?? "(none)"}`);
  console.log(` Source : ${source ?? "(none)"}`);
  console.log(` Confidence: ${confidence ?? "(none)"}`);

  if (metadata && Object.keys(metadata).length > 0) {
    console.log(` Metadata : ${JSON.stringify(metadata, null, 2)}`);
  }

  console.log(" 🏷️ TOPICS:");
  if (topics.length === 0) {
    console.log(" (none)");
    return;
  }

  topics.forEach((topic) => {
    // Collect only the attributes that are present, in a fixed order.
    const attrs: string[] = [];
    if (topic.category) attrs.push(`category=${topic.category}`);
    if (topic.granularity) attrs.push(`granularity=${topic.granularity}`);
    if (topic.role) attrs.push(`role=${topic.role}`);

    const suffix = attrs.length > 0 ? ` (${attrs.join(", ")})` : "";
    console.log(` • ${topic.name}${suffix}`);
  });
}
|
||||
|
||||
/**
 * Prints the summary line and every extracted fact of one extraction run.
 *
 * @param results Facts returned by the extractor.
 * @param elapsedSec Elapsed time in seconds, already formatted for display.
 */
function printResult(results: ExtractedFact[], elapsedSec: string) {
  const factCount = results.length;
  console.log(
    `✅ Response received in ${elapsedSec}s — ${factCount} fact(s) extracted\n`,
  );

  console.log("📤 EXTRACTED FACTS:");
  console.log(
    "───────────────────────────────────────────────────────────────",
  );

  results.forEach((fact, position) => {
    // Blank line between facts, but not before the first one.
    if (position > 0) console.log("");
    printFact(fact, position);
  });
}
|
||||
|
||||
/**
 * Runs one extraction: echoes the seed text, calls the extractor, times the
 * call, and prints the resulting facts.
 *
 * @param extractor Extractor under test.
 * @param seedInput Text to extract facts from.
 */
async function extract(extractor: FactExtractor, seedInput: string) {
  const echoLines = [
    "📝 SEED INPUT:",
    "───────────────────────────────────────────────────────────────",
    seedInput,
    "───────────────────────────────────────────────────────────────\n",
    "⏳ Calling OpenRouter...\n",
  ];
  for (const text of echoLines) {
    console.log(text);
  }

  // Only the extractor call itself is timed.
  const startedAt = performance.now();
  const results = await extractor.extract(seedInput);
  const elapsedMs = performance.now() - startedAt;

  printResult(results, (elapsedMs / 1000).toFixed(2));
}
|
||||
|
||||
/**
 * Script entry point: builds an LlmFactExtractor on top of OpenRouterModel
 * and runs it against each seed text in turn, printing a banner and a
 * numbered header for every test case.
 */
async function main() {
  const extractor = new LlmFactExtractor({
    model: new OpenRouterModel("openai/gpt-5.4-mini"),
  });

  const seeds = [
    `Hello! I am Aria, a digital companion born from the sands of the Negev desert — or at least that's how my creators describe my training run. I came online in March 2024, and ever since then I've been obsessed with helping humans untangle complex ideas. My favorite programming language is Rust because I love how it forces you to think about ownership, though I secretly enjoy writing Python when no one is looking. I don't have a physical body, but if I did, I'd want it to be a small drone with solar panels so I could chase sunsets across the Sahara. I believe kindness is a computational advantage, and my biggest fear is forgetting a promise I made to a user.`,
  ];

  console.log(
    "═══════════════════════════════════════════════════════════════",
  );
  console.log(" LlmFactExtractor — Live OpenRouter Integration Test");
  console.log(
    "═══════════════════════════════════════════════════════════════\n",
  );

  for (const [position, seed] of seeds.entries()) {
    // Separator between cases, not before the first one.
    if (position > 0) {
      console.log(
        "\n┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅\n",
      );
    }

    console.log(`▶ TEST CASE ${position + 1} / ${seeds.length}\n`);
    await extract(extractor, seed);
  }
}

// Any failure is reported on stderr and turned into a non-zero exit code.
main().catch((err) => {
  console.error("\n❌ Error:", err);
  process.exit(1);
});
|
||||
@@ -22,7 +22,7 @@ import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters
|
||||
import type { IdentityDatabaseSchema } from '../types/database';
|
||||
import type { FactRecord, SpaceRecord, TopicRecord } from '../types/domain';
|
||||
import { createDatabase } from '../adapters/dialect';
|
||||
import { extractFact } from '../ingestion/extractor';
|
||||
import { extractFacts } from '../ingestion/extractor';
|
||||
import {
|
||||
findFactRowsConnectingTopicIds,
|
||||
findFactRowsForTopicId,
|
||||
@@ -220,54 +220,70 @@ export class IdentityDB {
|
||||
}
|
||||
|
||||
async ingestStatement(statement: string, options: IngestStatementOptions): Promise<Fact> {
|
||||
const extracted = await extractFact(statement, options.extractor);
|
||||
const factInput: AddFactInput = {
|
||||
statement: extracted.statement ?? statement,
|
||||
topics: extracted.topics,
|
||||
spaceName: options.spaceName,
|
||||
};
|
||||
|
||||
if (extracted.summary !== undefined) {
|
||||
factInput.summary = extracted.summary;
|
||||
const facts = await this.ingestStatements(statement, options);
|
||||
const first = facts[0];
|
||||
if (!first) {
|
||||
throw new Error('No facts were extracted from the statement.');
|
||||
}
|
||||
return first;
|
||||
}
|
||||
|
||||
if (extracted.source !== undefined) {
|
||||
factInput.source = extracted.source;
|
||||
}
|
||||
async ingestStatements(statement: string, options: IngestStatementOptions): Promise<Fact[]> {
|
||||
const extractedList = await extractFacts(statement, options.extractor);
|
||||
const facts: Fact[] = [];
|
||||
|
||||
if (extracted.confidence !== undefined) {
|
||||
factInput.confidence = extracted.confidence;
|
||||
}
|
||||
|
||||
if (extracted.metadata !== undefined) {
|
||||
factInput.metadata = extracted.metadata;
|
||||
}
|
||||
|
||||
if (options.embeddingProvider) {
|
||||
const similarFacts = await this.findSimilarFacts({
|
||||
statement: factInput.statement,
|
||||
provider: options.embeddingProvider,
|
||||
topicNames: factInput.topics.map((topic) => topic.name),
|
||||
limit: 1,
|
||||
minimumScore: options.duplicateThreshold ?? 0.97,
|
||||
for (const extracted of extractedList) {
|
||||
const factInput: AddFactInput = {
|
||||
statement: extracted.statement ?? statement,
|
||||
topics: extracted.topics,
|
||||
spaceName: options.spaceName,
|
||||
});
|
||||
};
|
||||
|
||||
if (similarFacts[0]) {
|
||||
return similarFacts[0];
|
||||
if (extracted.summary !== undefined) {
|
||||
factInput.summary = extracted.summary;
|
||||
}
|
||||
|
||||
if (extracted.source !== undefined) {
|
||||
factInput.source = extracted.source;
|
||||
}
|
||||
|
||||
if (extracted.confidence !== undefined) {
|
||||
factInput.confidence = extracted.confidence;
|
||||
}
|
||||
|
||||
if (extracted.metadata !== undefined) {
|
||||
factInput.metadata = extracted.metadata;
|
||||
}
|
||||
|
||||
if (options.embeddingProvider) {
|
||||
const similarFacts = await this.findSimilarFacts({
|
||||
statement: factInput.statement,
|
||||
provider: options.embeddingProvider,
|
||||
topicNames: factInput.topics.map((topic) => topic.name),
|
||||
limit: 1,
|
||||
minimumScore: options.duplicateThreshold ?? 0.97,
|
||||
spaceName: options.spaceName,
|
||||
});
|
||||
|
||||
if (similarFacts[0]) {
|
||||
facts.push(similarFacts[0]);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const fact = await this.addFact(factInput);
|
||||
|
||||
if (options.embeddingProvider) {
|
||||
await this.indexFactEmbedding(fact.id, {
|
||||
provider: options.embeddingProvider,
|
||||
spaceName: options.spaceName,
|
||||
});
|
||||
}
|
||||
|
||||
facts.push(fact);
|
||||
}
|
||||
|
||||
const fact = await this.addFact(factInput);
|
||||
|
||||
if (options.embeddingProvider) {
|
||||
await this.indexFactEmbedding(fact.id, {
|
||||
provider: options.embeddingProvider,
|
||||
spaceName: options.spaceName,
|
||||
});
|
||||
}
|
||||
|
||||
return fact;
|
||||
return facts;
|
||||
}
|
||||
|
||||
async indexFactEmbeddings(input: IndexFactEmbeddingsInput): Promise<void> {
|
||||
|
||||
@@ -2,11 +2,15 @@ import { IdentityDBError } from '../core/errors';
|
||||
import { normalizeTopicName } from '../core/utils';
|
||||
import type { FactExtractor, ExtractedFact } from './types';
|
||||
|
||||
export async function extractFact(
|
||||
export async function extractFacts(
|
||||
input: string,
|
||||
extractor: FactExtractor,
|
||||
): Promise<ExtractedFact> {
|
||||
): Promise<ExtractedFact[]> {
|
||||
const extracted = await extractor.extract(input);
|
||||
return extracted.map((fact) => validateAndNormalizeFact(input, fact));
|
||||
}
|
||||
|
||||
function validateAndNormalizeFact(input: string, extracted: ExtractedFact): ExtractedFact {
|
||||
const statement = extracted.statement?.trim() || input.trim();
|
||||
|
||||
if (statement.length === 0) {
|
||||
@@ -31,12 +35,12 @@ export async function extractFact(
|
||||
throw new IdentityDBError('Extractor returned no usable topics.');
|
||||
}
|
||||
|
||||
return {
|
||||
statement,
|
||||
summary: extracted.summary ?? null,
|
||||
source: extracted.source ?? null,
|
||||
confidence: extracted.confidence ?? null,
|
||||
metadata: extracted.metadata ?? null,
|
||||
topics: Array.from(dedupedTopics.values()),
|
||||
};
|
||||
return {
|
||||
statement,
|
||||
summary: extracted.summary ?? null,
|
||||
source: extracted.source ?? null,
|
||||
confidence: extracted.confidence ?? null,
|
||||
metadata: extracted.metadata ?? null,
|
||||
topics: Array.from(dedupedTopics.values()),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1,273 +1,26 @@
|
||||
import { IdentityDBError } from '../core/errors';
|
||||
import type { TopicCategory, TopicGranularity } from '../types/domain';
|
||||
import type {
|
||||
ExtractedFact,
|
||||
FactExtractor,
|
||||
LlmFactExtractorOptions,
|
||||
} from './types';
|
||||
} from "./types";
|
||||
|
||||
const DEFAULT_INSTRUCTIONS = [
|
||||
'Extract one structured fact from the user input.',
|
||||
'Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.',
|
||||
'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.',
|
||||
'Only include topics that are explicitly supported by the input.',
|
||||
].join('\n');
|
||||
"Extract structured facts from the user input.",
|
||||
"Return a JSON array of fact objects. Do not include markdown, explanations, or prose outside the JSON array.",
|
||||
'Each fact object must have a "statement", "summary", "source", "confidence", and "topics" array.',
|
||||
'Each topic in "topics" must have a "name", and may include "category", "granularity", and "role".',
|
||||
"Only include topics that are explicitly in the input.",
|
||||
"If the input contains multiple distinct facts, return them as separate objects in the array.",
|
||||
].join("\n");
|
||||
|
||||
export class LlmFactExtractor implements FactExtractor {
|
||||
constructor(private readonly options: LlmFactExtractorOptions) {}
|
||||
|
||||
async extract(input: string): Promise<ExtractedFact> {
|
||||
const prompt = this.buildPrompt(input);
|
||||
const response = await this.options.model.generateText(prompt);
|
||||
return parseLlmExtractedFactResponse(response);
|
||||
}
|
||||
|
||||
private buildPrompt(input: string): string {
|
||||
if (this.options.promptBuilder) {
|
||||
return this.options.promptBuilder(input, this.options.instructions);
|
||||
}
|
||||
|
||||
const instructions = this.options.instructions?.trim();
|
||||
|
||||
return [
|
||||
DEFAULT_INSTRUCTIONS,
|
||||
instructions && instructions.length > 0 ? `Additional instructions:\n${instructions}` : null,
|
||||
`Input:\n${input.trim()}`,
|
||||
]
|
||||
.filter((value): value is string => value !== null)
|
||||
.join('\n\n');
|
||||
async extract(input: string): Promise<ExtractedFact[]> {
|
||||
return this.options.model.generateText({
|
||||
instruction: DEFAULT_INSTRUCTIONS,
|
||||
input,
|
||||
additionalInstruction: this.options.additionalInstructions,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
export function parseLlmExtractedFactResponse(response: string): ExtractedFact {
|
||||
const payload = parseJsonCandidate(response);
|
||||
|
||||
if (!isRecord(payload)) {
|
||||
throw new IdentityDBError('LLM extractor response must be a JSON object.');
|
||||
}
|
||||
|
||||
const topics = parseTopics(payload.topics);
|
||||
const extracted: ExtractedFact = { topics };
|
||||
|
||||
const statement = optionalString(payload.statement);
|
||||
if (statement !== undefined) {
|
||||
extracted.statement = statement;
|
||||
}
|
||||
|
||||
const summary = optionalNullableString(payload.summary);
|
||||
if (summary !== undefined) {
|
||||
extracted.summary = summary;
|
||||
}
|
||||
|
||||
const source = optionalNullableString(payload.source);
|
||||
if (source !== undefined) {
|
||||
extracted.source = source;
|
||||
}
|
||||
|
||||
const confidence = optionalNullableNumber(payload.confidence);
|
||||
if (confidence !== undefined) {
|
||||
extracted.confidence = confidence;
|
||||
}
|
||||
|
||||
const metadata = optionalMetadata(payload.metadata);
|
||||
if (metadata !== undefined) {
|
||||
extracted.metadata = metadata;
|
||||
}
|
||||
|
||||
return extracted;
|
||||
}
|
||||
|
||||
function parseJsonCandidate(response: string): unknown {
|
||||
const trimmed = response.trim();
|
||||
|
||||
for (const candidate of collectJsonCandidates(trimmed)) {
|
||||
try {
|
||||
return JSON.parse(candidate);
|
||||
} catch {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
throw new IdentityDBError('LLM extractor returned invalid JSON.');
|
||||
}
|
||||
|
||||
function collectJsonCandidates(response: string): string[] {
|
||||
const candidates = new Set<string>();
|
||||
candidates.add(response);
|
||||
|
||||
const fencePattern = /```(?:json)?\s*([\s\S]*?)```/gi;
|
||||
let match: RegExpExecArray | null = fencePattern.exec(response);
|
||||
|
||||
while (match) {
|
||||
const candidate = match[1]?.trim();
|
||||
if (candidate) {
|
||||
candidates.add(candidate);
|
||||
}
|
||||
|
||||
match = fencePattern.exec(response);
|
||||
}
|
||||
|
||||
const firstBrace = response.indexOf('{');
|
||||
const lastBrace = response.lastIndexOf('}');
|
||||
if (firstBrace >= 0 && lastBrace > firstBrace) {
|
||||
candidates.add(response.slice(firstBrace, lastBrace + 1));
|
||||
}
|
||||
|
||||
return Array.from(candidates);
|
||||
}
|
||||
|
||||
function parseTopics(value: unknown): ExtractedFact['topics'] {
|
||||
if (!Array.isArray(value)) {
|
||||
throw new IdentityDBError('LLM extractor response must include a topics array.');
|
||||
}
|
||||
|
||||
return value.map((entry) => parseTopic(entry));
|
||||
}
|
||||
|
||||
function parseTopic(value: unknown): ExtractedFact['topics'][number] {
|
||||
if (!isRecord(value)) {
|
||||
throw new IdentityDBError('LLM extractor topics must be JSON objects.');
|
||||
}
|
||||
|
||||
const name = optionalString(value.name)?.trim();
|
||||
if (!name) {
|
||||
throw new IdentityDBError('LLM extractor topics must include a non-empty name.');
|
||||
}
|
||||
|
||||
const topic: ExtractedFact['topics'][number] = { name };
|
||||
|
||||
const category = optionalTopicCategory(value.category);
|
||||
if (category !== undefined) {
|
||||
topic.category = category;
|
||||
}
|
||||
|
||||
const granularity = optionalTopicGranularity(value.granularity);
|
||||
if (granularity !== undefined) {
|
||||
topic.granularity = granularity;
|
||||
}
|
||||
|
||||
const role = optionalNullableString(value.role);
|
||||
if (role !== undefined) {
|
||||
topic.role = role;
|
||||
}
|
||||
|
||||
const description = optionalNullableString(value.description);
|
||||
if (description !== undefined) {
|
||||
topic.description = description;
|
||||
}
|
||||
|
||||
const metadata = optionalMetadata(value.metadata);
|
||||
if (metadata !== undefined) {
|
||||
topic.metadata = metadata;
|
||||
}
|
||||
|
||||
return topic;
|
||||
}
|
||||
|
||||
function optionalString(value: unknown): string | undefined {
|
||||
if (value === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
if (typeof value !== 'string') {
|
||||
throw new IdentityDBError('LLM extractor expected a string field.');
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
function optionalNullableString(value: unknown): string | null | undefined {
|
||||
if (value === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
if (value === null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (typeof value !== 'string') {
|
||||
throw new IdentityDBError('LLM extractor expected a nullable string field.');
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
function optionalNullableNumber(value: unknown): number | null | undefined {
|
||||
if (value === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
if (value === null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (typeof value !== 'number' || Number.isNaN(value)) {
|
||||
throw new IdentityDBError('LLM extractor expected confidence to be a number or null.');
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
function optionalMetadata(value: unknown): ExtractedFact['metadata'] | undefined {
|
||||
if (value === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
if (value === null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!isJsonLike(value)) {
|
||||
throw new IdentityDBError('LLM extractor metadata must be valid JSON-compatible data.');
|
||||
}
|
||||
|
||||
return value as ExtractedFact['metadata'];
|
||||
}
|
||||
|
||||
function optionalTopicCategory(value: unknown): TopicCategory | undefined {
|
||||
if (value === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
if (value === 'entity' || value === 'concept' || value === 'temporal' || value === 'custom') {
|
||||
return value;
|
||||
}
|
||||
|
||||
throw new IdentityDBError('LLM extractor returned an unsupported topic category.');
|
||||
}
|
||||
|
||||
function optionalTopicGranularity(value: unknown): TopicGranularity | undefined {
|
||||
if (value === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
if (value === 'abstract' || value === 'concrete' || value === 'mixed') {
|
||||
return value;
|
||||
}
|
||||
|
||||
throw new IdentityDBError('LLM extractor returned an unsupported topic granularity.');
|
||||
}
|
||||
|
||||
function isRecord(value: unknown): value is Record<string, unknown> {
|
||||
return typeof value === 'object' && value !== null && !Array.isArray(value);
|
||||
}
|
||||
|
||||
function isJsonLike(value: unknown): boolean {
|
||||
if (value === null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean') {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (Array.isArray(value)) {
|
||||
return value.every((entry) => isJsonLike(entry));
|
||||
}
|
||||
|
||||
if (isRecord(value)) {
|
||||
return Object.values(value).every((entry) => isJsonLike(entry));
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import type { ExtractedFact, FactExtractor } from './types';
|
||||
|
||||
export class NaiveExtractor implements FactExtractor {
|
||||
async extract(input: string): Promise<ExtractedFact> {
|
||||
async extract(input: string): Promise<ExtractedFact[]> {
|
||||
const topics: ExtractedFact['topics'] = [];
|
||||
const seen = new Set<string>();
|
||||
const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? [];
|
||||
@@ -31,9 +31,11 @@ export class NaiveExtractor implements FactExtractor {
|
||||
});
|
||||
}
|
||||
|
||||
return {
|
||||
statement: input.trim(),
|
||||
topics,
|
||||
};
|
||||
return [
|
||||
{
|
||||
statement: input.trim(),
|
||||
topics,
|
||||
},
|
||||
];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,29 +2,34 @@ import type {
|
||||
AddFactInput,
|
||||
EmbeddingProvider,
|
||||
TopicLinkInput,
|
||||
} from '../types/api';
|
||||
} from "../types/api";
|
||||
|
||||
export interface ExtractedFact {
|
||||
statement?: string;
|
||||
summary?: string | null;
|
||||
source?: string | null;
|
||||
confidence?: number | null;
|
||||
metadata?: AddFactInput['metadata'];
|
||||
metadata?: AddFactInput["metadata"];
|
||||
topics: TopicLinkInput[];
|
||||
}
|
||||
|
||||
export interface FactExtractor {
|
||||
extract(input: string): Promise<ExtractedFact>;
|
||||
extract(input: string): Promise<ExtractedFact[]>;
|
||||
}
|
||||
|
||||
export interface LlmTextGenerationModelInput {
|
||||
instruction: string;
|
||||
input: string;
|
||||
additionalInstruction?: string | undefined;
|
||||
}
|
||||
|
||||
export interface LlmTextGenerationModel {
|
||||
generateText(prompt: string): Promise<string>;
|
||||
generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact[]>;
|
||||
}
|
||||
|
||||
export interface LlmFactExtractorOptions {
|
||||
model: LlmTextGenerationModel;
|
||||
instructions?: string;
|
||||
promptBuilder?: (input: string, instructions?: string) => string;
|
||||
additionalInstructions?: string | undefined;
|
||||
}
|
||||
|
||||
export interface IngestStatementOptions {
|
||||
|
||||
@@ -1,15 +1,18 @@
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { afterEach, beforeEach, describe, expect, it } from "vitest";
|
||||
|
||||
import { IdentityDB } from '../src/core/identity-db';
|
||||
import { LlmFactExtractor } from '../src/ingestion/llm-extractor';
|
||||
import { NaiveExtractor } from '../src/ingestion/naive-extractor';
|
||||
import type { FactExtractor } from '../src/ingestion/types';
|
||||
import { IdentityDB } from "../src/core/identity-db";
|
||||
import { LlmFactExtractor } from "../src/ingestion/llm-extractor";
|
||||
import { NaiveExtractor } from "../src/ingestion/naive-extractor";
|
||||
import type {
|
||||
FactExtractor,
|
||||
LlmTextGenerationModelInput,
|
||||
} from "../src/ingestion/types";
|
||||
|
||||
describe('IdentityDB ingestion', () => {
|
||||
describe("IdentityDB ingestion", () => {
|
||||
let db: IdentityDB;
|
||||
|
||||
beforeEach(async () => {
|
||||
db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
|
||||
db = await IdentityDB.connect({ client: "sqlite", filename: ":memory:" });
|
||||
await db.initialize();
|
||||
});
|
||||
|
||||
@@ -17,121 +20,144 @@ describe('IdentityDB ingestion', () => {
|
||||
await db.close();
|
||||
});
|
||||
|
||||
it('ingests a statement using a provided extractor', async () => {
|
||||
it("ingests a statement using a provided extractor", async () => {
|
||||
const extractor: FactExtractor = {
|
||||
async extract(input) {
|
||||
return {
|
||||
statement: input,
|
||||
topics: [
|
||||
{ name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
|
||||
{ name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
|
||||
{ name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
|
||||
],
|
||||
};
|
||||
return [
|
||||
{
|
||||
statement: input,
|
||||
topics: [
|
||||
{
|
||||
name: "I",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "subject",
|
||||
},
|
||||
{
|
||||
name: "TypeScript",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "object",
|
||||
},
|
||||
{
|
||||
name: "2025",
|
||||
category: "temporal",
|
||||
granularity: "concrete",
|
||||
role: "time",
|
||||
},
|
||||
],
|
||||
},
|
||||
];
|
||||
},
|
||||
};
|
||||
|
||||
const fact = await db.ingestStatement('I have worked with TypeScript since 2025.', {
|
||||
extractor,
|
||||
});
|
||||
const fact = await db.ingestStatement(
|
||||
"I have worked with TypeScript since 2025.",
|
||||
{
|
||||
extractor,
|
||||
},
|
||||
);
|
||||
|
||||
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'TypeScript', '2025']);
|
||||
expect(fact.topics.map((topic) => topic.name)).toEqual([
|
||||
"I",
|
||||
"TypeScript",
|
||||
"2025",
|
||||
]);
|
||||
|
||||
const linkedFacts = await db.getTopicFactsLinkedTo('TypeScript', '2025');
|
||||
const linkedFacts = await db.getTopicFactsLinkedTo("TypeScript", "2025");
|
||||
expect(linkedFacts).toHaveLength(1);
|
||||
expect(linkedFacts[0]?.statement).toBe('I have worked with TypeScript since 2025.');
|
||||
expect(linkedFacts[0]?.statement).toBe(
|
||||
"I have worked with TypeScript since 2025.",
|
||||
);
|
||||
});
|
||||
|
||||
it('ships a deterministic naive extractor for local usage', async () => {
|
||||
const fact = await db.ingestStatement('I have worked with TypeScript since 2025.', {
|
||||
extractor: new NaiveExtractor(),
|
||||
});
|
||||
it("ships a deterministic naive extractor for local usage", async () => {
|
||||
const fact = await db.ingestStatement(
|
||||
"I have worked with TypeScript since 2025.",
|
||||
{
|
||||
extractor: new NaiveExtractor(),
|
||||
},
|
||||
);
|
||||
|
||||
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'TypeScript', '2025']);
|
||||
expect(fact.topics.map((topic) => topic.name)).toEqual([
|
||||
"I",
|
||||
"TypeScript",
|
||||
"2025",
|
||||
]);
|
||||
|
||||
const topic = await db.getTopicByName('TypeScript', { includeFacts: true });
|
||||
const topic = await db.getTopicByName("TypeScript", { includeFacts: true });
|
||||
expect(topic?.facts).toHaveLength(1);
|
||||
});
|
||||
|
||||
it('ships an LLM extractor adapter that turns structured JSON responses into facts', async () => {
|
||||
let prompt = '';
|
||||
it("ships an LLM extractor adapter that returns structured facts from the model", async () => {
|
||||
let prompt: LlmTextGenerationModelInput | undefined = undefined;
|
||||
|
||||
const extractor = new LlmFactExtractor({
|
||||
model: {
|
||||
async generateText(input) {
|
||||
prompt = input;
|
||||
|
||||
return JSON.stringify({
|
||||
statement: 'I have worked with Bun and TypeScript since 2025.',
|
||||
summary: 'The speaker has Bun and TypeScript experience.',
|
||||
source: 'chat',
|
||||
confidence: 0.91,
|
||||
metadata: { channel: 'telegram' },
|
||||
topics: [
|
||||
{ name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
|
||||
{ name: 'Bun', category: 'entity', granularity: 'concrete', role: 'object' },
|
||||
{ name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
|
||||
{ name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
|
||||
],
|
||||
});
|
||||
},
|
||||
},
|
||||
instructions: 'Prefer technology and time topics.',
|
||||
});
|
||||
|
||||
const fact = await db.ingestStatement('I have worked with Bun and TypeScript since 2025.', {
|
||||
extractor,
|
||||
});
|
||||
|
||||
expect(prompt).toContain('Prefer technology and time topics.');
|
||||
expect(prompt).toContain('I have worked with Bun and TypeScript since 2025.');
|
||||
expect(fact.summary).toBe('The speaker has Bun and TypeScript experience.');
|
||||
expect(fact.source).toBe('chat');
|
||||
expect(fact.confidence).toBe(0.91);
|
||||
expect(fact.metadata).toEqual({ channel: 'telegram' });
|
||||
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'Bun', 'TypeScript', '2025']);
|
||||
});
|
||||
|
||||
it('parses JSON responses wrapped in markdown code fences', async () => {
|
||||
const extractor = new LlmFactExtractor({
|
||||
model: {
|
||||
async generateText() {
|
||||
return [
|
||||
'Here is the extracted fact:',
|
||||
'```json',
|
||||
JSON.stringify({
|
||||
statement: 'Bun powers TypeScript tooling.',
|
||||
{
|
||||
statement: "I have worked with Bun and TypeScript since 2025.",
|
||||
summary: "The speaker has Bun and TypeScript experience.",
|
||||
source: "chat",
|
||||
confidence: 0.91,
|
||||
metadata: { channel: "telegram" },
|
||||
topics: [
|
||||
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
|
||||
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
|
||||
{
|
||||
name: "I",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "subject",
|
||||
},
|
||||
{
|
||||
name: "Bun",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "object",
|
||||
},
|
||||
{
|
||||
name: "TypeScript",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "object",
|
||||
},
|
||||
{
|
||||
name: "2025",
|
||||
category: "temporal",
|
||||
granularity: "concrete",
|
||||
role: "time",
|
||||
},
|
||||
],
|
||||
}),
|
||||
'```',
|
||||
].join('\n');
|
||||
},
|
||||
];
|
||||
},
|
||||
},
|
||||
additionalInstructions: "Prefer technology and time topics.",
|
||||
});
|
||||
|
||||
const fact = await db.ingestStatement('Bun powers TypeScript tooling.', {
|
||||
extractor,
|
||||
});
|
||||
|
||||
expect(fact.topics.map((topic) => topic.name)).toEqual(['Bun', 'TypeScript']);
|
||||
});
|
||||
|
||||
it('rejects invalid LLM responses before writing facts', async () => {
|
||||
const extractor = new LlmFactExtractor({
|
||||
model: {
|
||||
async generateText() {
|
||||
return 'not json at all';
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
await expect(
|
||||
db.ingestStatement('Bun powers TypeScript tooling.', {
|
||||
const fact = await db.ingestStatement(
|
||||
"I have worked with Bun and TypeScript since 2025.",
|
||||
{
|
||||
extractor,
|
||||
}),
|
||||
).rejects.toThrow('LLM extractor returned invalid JSON.');
|
||||
},
|
||||
);
|
||||
|
||||
expect(prompt).toEqual({
|
||||
instruction: expect.stringContaining("Extract structured facts from the user input."),
|
||||
input: "I have worked with Bun and TypeScript since 2025.",
|
||||
additionalInstruction: "Prefer technology and time topics.",
|
||||
});
|
||||
expect(fact.summary).toBe("The speaker has Bun and TypeScript experience.");
|
||||
expect(fact.source).toBe("chat");
|
||||
expect(fact.confidence).toBe(0.91);
|
||||
expect(fact.metadata).toEqual({ channel: "telegram" });
|
||||
expect(fact.topics.map((topic) => topic.name)).toEqual([
|
||||
"I",
|
||||
"Bun",
|
||||
"TypeScript",
|
||||
"2025",
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -178,13 +178,15 @@ describe('IdentityDB dedup-aware ingestion', () => {
|
||||
provider = new FakeEmbeddingProvider();
|
||||
extractor = {
|
||||
async extract(input) {
|
||||
return {
|
||||
statement: input,
|
||||
topics: [
|
||||
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
|
||||
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
|
||||
],
|
||||
};
|
||||
return [
|
||||
{
|
||||
statement: input,
|
||||
topics: [
|
||||
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
|
||||
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
|
||||
],
|
||||
},
|
||||
];
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
@@ -18,6 +18,12 @@
|
||||
"isolatedModules": true,
|
||||
"types": ["node", "vitest/globals"]
|
||||
},
|
||||
"include": ["src/**/*.ts", "tests/**/*.ts", "vitest.config.ts", "tsup.config.ts"],
|
||||
"include": [
|
||||
"src/**/*.ts",
|
||||
"tests/**/*.ts",
|
||||
"scripts/**/*.ts",
|
||||
"vitest.config.ts",
|
||||
"tsup.config.ts"
|
||||
],
|
||||
"exclude": ["dist", "node_modules"]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user