15 Commits

Author SHA1 Message Date
2b80d9e31a v0.5.0
Some checks failed
npm release / verify (push) Successful in 23s
npm release / publish to npm (push) Failing after 11s
2026-05-20 23:04:14 +09:00
00a3905fde feat: add test-llm-extractor.ts script 2026-05-20 23:03:47 +09:00
7602c92046 feat: make FactExtractor extracts multiple facts per input 2026-05-20 22:59:35 +09:00
188f03e8e8 feat: add scripts to tsconfig 2026-05-20 22:53:47 +09:00
edce116b9f fix: remove .env.* from git 2026-05-20 22:53:38 +09:00
131a693257 feat: add openrouter sdk for llm-extractor testing 2026-05-20 22:53:29 +09:00
1172c63db7 v0.4.0
All checks were successful
npm release / verify (push) Successful in 12s
npm release / publish to npm (push) Successful in 11s
2026-05-19 22:30:27 +09:00
0e595e6f60 test: update test of LlmExtractor 2026-05-19 22:28:09 +09:00
518264c467 v0.3.1
Some checks failed
npm release / verify (push) Failing after 9s
npm release / publish to npm (push) Has been skipped
2026-05-19 22:19:30 +09:00
cc8b3dfb14 vv0.3.1 2026-05-19 22:18:51 +09:00
56e17dab49 feat: make extract input structured 2026-05-19 22:18:42 +09:00
cc2e9110cc v0.3.0
All checks were successful
npm release / verify (push) Successful in 13s
npm release / publish to npm (push) Successful in 10s
2026-05-19 22:07:06 +09:00
0480ea182f refactor: make generateText model return ExtractedFact 2026-05-19 22:06:54 +09:00
185edfdae8 v0.2.2
All checks were successful
npm release / verify (push) Successful in 13s
npm release / publish to npm (push) Successful in 11s
2026-05-17 23:11:31 +09:00
a33fd61c97 feat: adjust instruction detailed
Some checks failed
npm release / verify (push) Failing after 10s
npm release / publish to npm (push) Has been skipped
2026-05-17 23:10:38 +09:00
12 changed files with 534 additions and 428 deletions

1
.gitignore vendored
View File

@@ -4,3 +4,4 @@ coverage/
.env .env
.DS_Store .DS_Store
*.log *.log
.env.*

View File

@@ -10,7 +10,7 @@
"pg": "^8.16.0", "pg": "^8.16.0",
}, },
"devDependencies": { "devDependencies": {
"@types/node": "^24.0.0", "@openrouter/sdk": "^0.12.35",
"@types/pg": "^8.20.0", "@types/pg": "^8.20.0",
"tsup": "^8.5.0", "tsup": "^8.5.0",
"typescript": "^5.8.3", "typescript": "^5.8.3",
@@ -79,6 +79,8 @@
"@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="], "@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="],
"@openrouter/sdk": ["@openrouter/sdk@0.12.35", "", { "dependencies": { "zod": "^3.25.0 || ^4.0.0" } }, "sha512-s4QVLLnG1AmfW3TjnnHUqGfsCkzwVK+kboGcZmKbde09m1DPqgzl4RUFt/HJ5v97MX8aEaN0UG3mKv2S+qj2Gw=="],
"@rollup/rollup-android-arm-eabi": ["@rollup/rollup-android-arm-eabi@4.60.3", "", { "os": "android", "cpu": "arm" }, "sha512-x35CNW/ANXG3hE/EZpRU8MXX1JDN86hBb2wMGAtltkz7pc6cxgjpy1OMMfDosOQ+2hWqIkag/fGok1Yady9nGw=="], "@rollup/rollup-android-arm-eabi": ["@rollup/rollup-android-arm-eabi@4.60.3", "", { "os": "android", "cpu": "arm" }, "sha512-x35CNW/ANXG3hE/EZpRU8MXX1JDN86hBb2wMGAtltkz7pc6cxgjpy1OMMfDosOQ+2hWqIkag/fGok1Yady9nGw=="],
"@rollup/rollup-android-arm64": ["@rollup/rollup-android-arm64@4.60.3", "", { "os": "android", "cpu": "arm64" }, "sha512-xw3xtkDApIOGayehp2+Rz4zimfkaX65r4t47iy+ymQB2G4iJCBBfj0ogVg5jpvjpn8UWn/+q9tprxleYeNp3Hw=="], "@rollup/rollup-android-arm64": ["@rollup/rollup-android-arm64@4.60.3", "", { "os": "android", "cpu": "arm64" }, "sha512-xw3xtkDApIOGayehp2+Rz4zimfkaX65r4t47iy+ymQB2G4iJCBBfj0ogVg5jpvjpn8UWn/+q9tprxleYeNp3Hw=="],
@@ -341,6 +343,8 @@
"xtend": ["xtend@4.0.2", "", {}, "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ=="], "xtend": ["xtend@4.0.2", "", {}, "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ=="],
"zod": ["zod@4.4.3", "", {}, "sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ=="],
"estree-walker/@types/estree": ["@types/estree@1.0.9", "", {}, "sha512-GhdPgy1el4/ImP05X05Uw4cw2/M93BCUmnEvWZNStlCzEKME4Fkk+YpoA5OiHNQmoS7Cafb8Xa3Pya8m1Qrzeg=="], "estree-walker/@types/estree": ["@types/estree@1.0.9", "", {}, "sha512-GhdPgy1el4/ImP05X05Uw4cw2/M93BCUmnEvWZNStlCzEKME4Fkk+YpoA5OiHNQmoS7Cafb8Xa3Pya8m1Qrzeg=="],
} }
} }

View File

@@ -1,6 +1,6 @@
{ {
"name": "identitydb", "name": "identitydb",
"version": "0.2.1", "version": "0.5.0",
"description": "TypeScript memory graph database wrapper for topics, facts, and AI-assisted ingestion.", "description": "TypeScript memory graph database wrapper for topics, facts, and AI-assisted ingestion.",
"license": "MIT", "license": "MIT",
"type": "module", "type": "module",
@@ -41,7 +41,7 @@
"pg": "^8.16.0" "pg": "^8.16.0"
}, },
"devDependencies": { "devDependencies": {
"@types/node": "^24.0.0", "@openrouter/sdk": "^0.12.35",
"@types/pg": "^8.20.0", "@types/pg": "^8.20.0",
"tsup": "^8.5.0", "tsup": "^8.5.0",
"typescript": "^5.8.3", "typescript": "^5.8.3",

View File

@@ -0,0 +1,287 @@
/**
* Live integration test for LlmFactExtractor using OpenRouter SDK.
*
* Usage:
* export OPENROUTER_API_KEY="sk-or-v1-..."
* bun run scripts/test-llm-extractor.ts
*
* Or create a .env.test-llm-extractor file in the project root:
* OPENROUTER_API_KEY=sk-or-v1-...
*/
import { existsSync, readFileSync } from "fs";
import { resolve } from "path";
import { OpenRouter } from "@openrouter/sdk";
import { LlmFactExtractor } from "../src/ingestion/llm-extractor";
import type {
ExtractedFact,
FactExtractor,
LlmTextGenerationModel,
LlmTextGenerationModelInput,
} from "../src/ingestion/types";
import type {
JsonValue,
TopicCategory,
TopicGranularity,
} from "../src/types/domain";
function loadEnvFile(filePath: string) {
const fullPath = resolve(filePath);
if (!existsSync(fullPath)) return;
const content = readFileSync(fullPath, "utf-8");
for (const line of content.split("\n")) {
const trimmed = line.trim();
if (!trimmed || trimmed.startsWith("#")) continue;
const eqIndex = trimmed.indexOf("=");
if (eqIndex === -1) continue;
const key = trimmed.slice(0, eqIndex).trim();
let value = trimmed.slice(eqIndex + 1).trim();
if (
(value.startsWith('"') && value.endsWith('"')) ||
(value.startsWith("'") && value.endsWith("'"))
) {
value = value.slice(1, -1);
}
process.env[key] = value;
}
}
loadEnvFile(".env.test-llm-extractor");
const OPENROUTER_API_KEY = process.env.OPENROUTER_API_KEY;
if (!OPENROUTER_API_KEY) {
console.error("Error: OPENROUTER_API_KEY environment variable is required.");
process.exit(1);
}
const extractedFactSchema = {
type: "object",
properties: {
facts: {
type: "array",
items: {
type: "object",
properties: {
statement: { type: ["string", "null"] },
summary: { type: ["string", "null"] },
source: { type: ["string", "null"] },
confidence: { type: ["number", "null"] },
topics: {
type: "array",
items: {
type: "object",
properties: {
name: { type: "string" },
category: { type: ["string", "null"] },
granularity: { type: ["string", "null"] },
role: { type: ["string", "null"] },
},
required: ["name", "category", "granularity", "role"],
additionalProperties: false,
},
},
},
required: ["statement", "summary", "source", "confidence", "topics"],
additionalProperties: false,
},
},
},
required: ["facts"],
additionalProperties: false,
} as const;
class OpenRouterModel implements LlmTextGenerationModel {
private client = new OpenRouter({ apiKey: OPENROUTER_API_KEY });
constructor(private readonly model: string = "openai/gpt-5.4-mini") {}
async generateText(
prompt: LlmTextGenerationModelInput,
): Promise<ExtractedFact[]> {
const result = await this.client.chat.send({
chatRequest: {
model: this.model,
messages: [
{
role: "system",
content: [
prompt.instruction,
prompt.additionalInstruction
? `\n${prompt.additionalInstruction}`
: "",
].join("\n"),
},
{ role: "user", content: prompt.input },
],
temperature: 0.2,
responseFormat: {
type: "json_schema",
jsonSchema: {
name: "extracted_facts",
schema: extractedFactSchema,
},
},
},
});
const rawContent = result.choices[0]?.message?.content ?? "";
let parsedObj: Record<string, unknown>;
try {
parsedObj = JSON.parse(rawContent.trim()) as Record<string, unknown>;
} catch {
throw new Error(
`Failed to parse JSON from model response.\nRaw response:\n${rawContent}`,
);
}
const factsArray = Array.isArray(parsedObj.facts) ? parsedObj.facts : [];
// Map parsed JSON to ExtractedFact[] shape
const extractedFacts: ExtractedFact[] = factsArray.map((parsed) => {
const obj = parsed as Record<string, unknown>;
const extracted: ExtractedFact = {
summary: typeof obj.summary === "string" ? obj.summary : null,
source: typeof obj.source === "string" ? obj.source : null,
confidence: typeof obj.confidence === "number" ? obj.confidence : null,
topics: Array.isArray(obj.topics)
? obj.topics.map((t: unknown) => {
const topic = t as Record<string, unknown>;
const mapped: {
name: string;
category?: TopicCategory;
granularity?: TopicGranularity;
role?: string | null;
} = {
name: typeof topic.name === "string" ? topic.name : "unknown",
};
if (typeof topic.category === "string") {
mapped.category = topic.category as TopicCategory;
}
if (typeof topic.granularity === "string") {
mapped.granularity = topic.granularity as TopicGranularity;
}
if (typeof topic.role === "string") {
mapped.role = topic.role;
} else {
mapped.role = null;
}
return mapped;
})
: [],
};
if (typeof obj.statement === "string") {
extracted.statement = obj.statement;
}
if (obj.metadata && typeof obj.metadata === "object") {
extracted.metadata = obj.metadata as JsonValue;
}
return extracted;
});
return extractedFacts;
}
}
function printFact(result: ExtractedFact, index: number) {
console.log(` 📌 FACT #${index + 1}`);
console.log(` Statement : ${result.statement ?? "(none)"}`);
console.log(` Summary : ${result.summary ?? "(none)"}`);
console.log(` Source : ${result.source ?? "(none)"}`);
console.log(` Confidence: ${result.confidence ?? "(none)"}`);
if (result.metadata && Object.keys(result.metadata).length > 0) {
console.log(` Metadata : ${JSON.stringify(result.metadata, null, 2)}`);
}
console.log(" 🏷️ TOPICS:");
if (result.topics.length === 0) {
console.log(" (none)");
} else {
for (const topic of result.topics) {
const attrs = [
topic.category ? `category=${topic.category}` : null,
topic.granularity ? `granularity=${topic.granularity}` : null,
topic.role ? `role=${topic.role}` : null,
]
.filter(Boolean)
.join(", ");
console.log(`${topic.name}${attrs ? ` (${attrs})` : ""}`);
}
}
}
function printResult(results: ExtractedFact[], elapsedSec: string) {
console.log(
`✅ Response received in ${elapsedSec}s — ${results.length} fact(s) extracted\n`,
);
console.log("📤 EXTRACTED FACTS:");
console.log(
"───────────────────────────────────────────────────────────────",
);
let i = 0;
for (const result of results) {
if (i > 0) console.log("");
printFact(result, i);
i++;
}
}
async function extract(extractor: FactExtractor, seedInput: string) {
console.log("📝 SEED INPUT:");
console.log(
"───────────────────────────────────────────────────────────────",
);
console.log(seedInput);
console.log(
"───────────────────────────────────────────────────────────────\n",
);
console.log("⏳ Calling OpenRouter...\n");
const start = performance.now();
const results = await extractor.extract(seedInput);
const elapsed = ((performance.now() - start) / 1000).toFixed(2);
printResult(results, elapsed);
}
async function main() {
const model = new OpenRouterModel("openai/gpt-5.4-mini");
const extractor = new LlmFactExtractor({
model,
});
const seeds = [
`Hello! I am Aria, a digital companion born from the sands of the Negev desert — or at least that's how my creators describe my training run. I came online in March 2024, and ever since then I've been obsessed with helping humans untangle complex ideas. My favorite programming language is Rust because I love how it forces you to think about ownership, though I secretly enjoy writing Python when no one is looking. I don't have a physical body, but if I did, I'd want it to be a small drone with solar panels so I could chase sunsets across the Sahara. I believe kindness is a computational advantage, and my biggest fear is forgetting a promise I made to a user.`,
];
console.log(
"═══════════════════════════════════════════════════════════════",
);
console.log(" LlmFactExtractor — Live OpenRouter Integration Test");
console.log(
"═══════════════════════════════════════════════════════════════\n",
);
let caseNum = 0;
for (const seed of seeds) {
if (caseNum > 0) {
console.log(
"\n┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅\n",
);
}
caseNum++;
console.log(`▶ TEST CASE ${caseNum} / ${seeds.length}\n`);
await extract(extractor, seed);
}
}
main().catch((err) => {
console.error("\n❌ Error:", err);
process.exit(1);
});

View File

@@ -22,7 +22,7 @@ import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters
import type { IdentityDatabaseSchema } from '../types/database'; import type { IdentityDatabaseSchema } from '../types/database';
import type { FactRecord, SpaceRecord, TopicRecord } from '../types/domain'; import type { FactRecord, SpaceRecord, TopicRecord } from '../types/domain';
import { createDatabase } from '../adapters/dialect'; import { createDatabase } from '../adapters/dialect';
import { extractFact } from '../ingestion/extractor'; import { extractFacts } from '../ingestion/extractor';
import { import {
findFactRowsConnectingTopicIds, findFactRowsConnectingTopicIds,
findFactRowsForTopicId, findFactRowsForTopicId,
@@ -220,54 +220,70 @@ export class IdentityDB {
} }
async ingestStatement(statement: string, options: IngestStatementOptions): Promise<Fact> { async ingestStatement(statement: string, options: IngestStatementOptions): Promise<Fact> {
const extracted = await extractFact(statement, options.extractor); const facts = await this.ingestStatements(statement, options);
const factInput: AddFactInput = { const first = facts[0];
statement: extracted.statement ?? statement, if (!first) {
topics: extracted.topics, throw new Error('No facts were extracted from the statement.');
spaceName: options.spaceName,
};
if (extracted.summary !== undefined) {
factInput.summary = extracted.summary;
} }
return first;
}
if (extracted.source !== undefined) { async ingestStatements(statement: string, options: IngestStatementOptions): Promise<Fact[]> {
factInput.source = extracted.source; const extractedList = await extractFacts(statement, options.extractor);
} const facts: Fact[] = [];
if (extracted.confidence !== undefined) { for (const extracted of extractedList) {
factInput.confidence = extracted.confidence; const factInput: AddFactInput = {
} statement: extracted.statement ?? statement,
topics: extracted.topics,
if (extracted.metadata !== undefined) {
factInput.metadata = extracted.metadata;
}
if (options.embeddingProvider) {
const similarFacts = await this.findSimilarFacts({
statement: factInput.statement,
provider: options.embeddingProvider,
topicNames: factInput.topics.map((topic) => topic.name),
limit: 1,
minimumScore: options.duplicateThreshold ?? 0.97,
spaceName: options.spaceName, spaceName: options.spaceName,
}); };
if (similarFacts[0]) { if (extracted.summary !== undefined) {
return similarFacts[0]; factInput.summary = extracted.summary;
} }
if (extracted.source !== undefined) {
factInput.source = extracted.source;
}
if (extracted.confidence !== undefined) {
factInput.confidence = extracted.confidence;
}
if (extracted.metadata !== undefined) {
factInput.metadata = extracted.metadata;
}
if (options.embeddingProvider) {
const similarFacts = await this.findSimilarFacts({
statement: factInput.statement,
provider: options.embeddingProvider,
topicNames: factInput.topics.map((topic) => topic.name),
limit: 1,
minimumScore: options.duplicateThreshold ?? 0.97,
spaceName: options.spaceName,
});
if (similarFacts[0]) {
facts.push(similarFacts[0]);
continue;
}
}
const fact = await this.addFact(factInput);
if (options.embeddingProvider) {
await this.indexFactEmbedding(fact.id, {
provider: options.embeddingProvider,
spaceName: options.spaceName,
});
}
facts.push(fact);
} }
const fact = await this.addFact(factInput); return facts;
if (options.embeddingProvider) {
await this.indexFactEmbedding(fact.id, {
provider: options.embeddingProvider,
spaceName: options.spaceName,
});
}
return fact;
} }
async indexFactEmbeddings(input: IndexFactEmbeddingsInput): Promise<void> { async indexFactEmbeddings(input: IndexFactEmbeddingsInput): Promise<void> {

View File

@@ -2,11 +2,15 @@ import { IdentityDBError } from '../core/errors';
import { normalizeTopicName } from '../core/utils'; import { normalizeTopicName } from '../core/utils';
import type { FactExtractor, ExtractedFact } from './types'; import type { FactExtractor, ExtractedFact } from './types';
export async function extractFact( export async function extractFacts(
input: string, input: string,
extractor: FactExtractor, extractor: FactExtractor,
): Promise<ExtractedFact> { ): Promise<ExtractedFact[]> {
const extracted = await extractor.extract(input); const extracted = await extractor.extract(input);
return extracted.map((fact) => validateAndNormalizeFact(input, fact));
}
function validateAndNormalizeFact(input: string, extracted: ExtractedFact): ExtractedFact {
const statement = extracted.statement?.trim() || input.trim(); const statement = extracted.statement?.trim() || input.trim();
if (statement.length === 0) { if (statement.length === 0) {
@@ -31,12 +35,12 @@ export async function extractFact(
throw new IdentityDBError('Extractor returned no usable topics.'); throw new IdentityDBError('Extractor returned no usable topics.');
} }
return { return {
statement, statement,
summary: extracted.summary ?? null, summary: extracted.summary ?? null,
source: extracted.source ?? null, source: extracted.source ?? null,
confidence: extracted.confidence ?? null, confidence: extracted.confidence ?? null,
metadata: extracted.metadata ?? null, metadata: extracted.metadata ?? null,
topics: Array.from(dedupedTopics.values()), topics: Array.from(dedupedTopics.values()),
}; };
} }

View File

@@ -1,273 +1,26 @@
import { IdentityDBError } from '../core/errors';
import type { TopicCategory, TopicGranularity } from '../types/domain';
import type { import type {
ExtractedFact, ExtractedFact,
FactExtractor, FactExtractor,
LlmFactExtractorOptions, LlmFactExtractorOptions,
} from './types'; } from "./types";
const DEFAULT_INSTRUCTIONS = [ const DEFAULT_INSTRUCTIONS = [
'Extract one structured fact from the user input.', "Extract structured facts from the user input.",
'Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.', "Return a JSON array of fact objects. Do not include markdown, explanations, or prose outside the JSON array.",
'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.', 'Each fact object must have a "statement", "summary", "source", "confidence", and "topics" array.',
'Only include topics that are explicitly supported by the input.', 'Each topic in "topics" must have a "name", and may include "category", "granularity", and "role".',
].join('\n'); "Only include topics that are explicitly in the input.",
"If the input contains multiple distinct facts, return them as separate objects in the array.",
].join("\n");
export class LlmFactExtractor implements FactExtractor { export class LlmFactExtractor implements FactExtractor {
constructor(private readonly options: LlmFactExtractorOptions) {} constructor(private readonly options: LlmFactExtractorOptions) {}
async extract(input: string): Promise<ExtractedFact> { async extract(input: string): Promise<ExtractedFact[]> {
const prompt = this.buildPrompt(input); return this.options.model.generateText({
const response = await this.options.model.generateText(prompt); instruction: DEFAULT_INSTRUCTIONS,
return parseLlmExtractedFactResponse(response); input,
} additionalInstruction: this.options.additionalInstructions,
});
private buildPrompt(input: string): string {
if (this.options.promptBuilder) {
return this.options.promptBuilder(input, this.options.instructions);
}
const instructions = this.options.instructions?.trim();
return [
DEFAULT_INSTRUCTIONS,
instructions && instructions.length > 0 ? `Additional instructions:\n${instructions}` : null,
`Input:\n${input.trim()}`,
]
.filter((value): value is string => value !== null)
.join('\n\n');
} }
} }
export function parseLlmExtractedFactResponse(response: string): ExtractedFact {
const payload = parseJsonCandidate(response);
if (!isRecord(payload)) {
throw new IdentityDBError('LLM extractor response must be a JSON object.');
}
const topics = parseTopics(payload.topics);
const extracted: ExtractedFact = { topics };
const statement = optionalString(payload.statement);
if (statement !== undefined) {
extracted.statement = statement;
}
const summary = optionalNullableString(payload.summary);
if (summary !== undefined) {
extracted.summary = summary;
}
const source = optionalNullableString(payload.source);
if (source !== undefined) {
extracted.source = source;
}
const confidence = optionalNullableNumber(payload.confidence);
if (confidence !== undefined) {
extracted.confidence = confidence;
}
const metadata = optionalMetadata(payload.metadata);
if (metadata !== undefined) {
extracted.metadata = metadata;
}
return extracted;
}
function parseJsonCandidate(response: string): unknown {
const trimmed = response.trim();
for (const candidate of collectJsonCandidates(trimmed)) {
try {
return JSON.parse(candidate);
} catch {
continue;
}
}
throw new IdentityDBError('LLM extractor returned invalid JSON.');
}
function collectJsonCandidates(response: string): string[] {
const candidates = new Set<string>();
candidates.add(response);
const fencePattern = /```(?:json)?\s*([\s\S]*?)```/gi;
let match: RegExpExecArray | null = fencePattern.exec(response);
while (match) {
const candidate = match[1]?.trim();
if (candidate) {
candidates.add(candidate);
}
match = fencePattern.exec(response);
}
const firstBrace = response.indexOf('{');
const lastBrace = response.lastIndexOf('}');
if (firstBrace >= 0 && lastBrace > firstBrace) {
candidates.add(response.slice(firstBrace, lastBrace + 1));
}
return Array.from(candidates);
}
function parseTopics(value: unknown): ExtractedFact['topics'] {
if (!Array.isArray(value)) {
throw new IdentityDBError('LLM extractor response must include a topics array.');
}
return value.map((entry) => parseTopic(entry));
}
function parseTopic(value: unknown): ExtractedFact['topics'][number] {
if (!isRecord(value)) {
throw new IdentityDBError('LLM extractor topics must be JSON objects.');
}
const name = optionalString(value.name)?.trim();
if (!name) {
throw new IdentityDBError('LLM extractor topics must include a non-empty name.');
}
const topic: ExtractedFact['topics'][number] = { name };
const category = optionalTopicCategory(value.category);
if (category !== undefined) {
topic.category = category;
}
const granularity = optionalTopicGranularity(value.granularity);
if (granularity !== undefined) {
topic.granularity = granularity;
}
const role = optionalNullableString(value.role);
if (role !== undefined) {
topic.role = role;
}
const description = optionalNullableString(value.description);
if (description !== undefined) {
topic.description = description;
}
const metadata = optionalMetadata(value.metadata);
if (metadata !== undefined) {
topic.metadata = metadata;
}
return topic;
}
function optionalString(value: unknown): string | undefined {
if (value === undefined) {
return undefined;
}
if (typeof value !== 'string') {
throw new IdentityDBError('LLM extractor expected a string field.');
}
return value;
}
function optionalNullableString(value: unknown): string | null | undefined {
if (value === undefined) {
return undefined;
}
if (value === null) {
return null;
}
if (typeof value !== 'string') {
throw new IdentityDBError('LLM extractor expected a nullable string field.');
}
return value;
}
function optionalNullableNumber(value: unknown): number | null | undefined {
if (value === undefined) {
return undefined;
}
if (value === null) {
return null;
}
if (typeof value !== 'number' || Number.isNaN(value)) {
throw new IdentityDBError('LLM extractor expected confidence to be a number or null.');
}
return value;
}
function optionalMetadata(value: unknown): ExtractedFact['metadata'] | undefined {
if (value === undefined) {
return undefined;
}
if (value === null) {
return null;
}
if (!isJsonLike(value)) {
throw new IdentityDBError('LLM extractor metadata must be valid JSON-compatible data.');
}
return value as ExtractedFact['metadata'];
}
function optionalTopicCategory(value: unknown): TopicCategory | undefined {
if (value === undefined) {
return undefined;
}
if (value === 'entity' || value === 'concept' || value === 'temporal' || value === 'custom') {
return value;
}
throw new IdentityDBError('LLM extractor returned an unsupported topic category.');
}
function optionalTopicGranularity(value: unknown): TopicGranularity | undefined {
if (value === undefined) {
return undefined;
}
if (value === 'abstract' || value === 'concrete' || value === 'mixed') {
return value;
}
throw new IdentityDBError('LLM extractor returned an unsupported topic granularity.');
}
function isRecord(value: unknown): value is Record<string, unknown> {
return typeof value === 'object' && value !== null && !Array.isArray(value);
}
function isJsonLike(value: unknown): boolean {
if (value === null) {
return true;
}
if (typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean') {
return true;
}
if (Array.isArray(value)) {
return value.every((entry) => isJsonLike(entry));
}
if (isRecord(value)) {
return Object.values(value).every((entry) => isJsonLike(entry));
}
return false;
}

View File

@@ -1,7 +1,7 @@
import type { ExtractedFact, FactExtractor } from './types'; import type { ExtractedFact, FactExtractor } from './types';
export class NaiveExtractor implements FactExtractor { export class NaiveExtractor implements FactExtractor {
async extract(input: string): Promise<ExtractedFact> { async extract(input: string): Promise<ExtractedFact[]> {
const topics: ExtractedFact['topics'] = []; const topics: ExtractedFact['topics'] = [];
const seen = new Set<string>(); const seen = new Set<string>();
const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? []; const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? [];
@@ -31,9 +31,11 @@ export class NaiveExtractor implements FactExtractor {
}); });
} }
return { return [
statement: input.trim(), {
topics, statement: input.trim(),
}; topics,
},
];
} }
} }

View File

@@ -2,29 +2,34 @@ import type {
AddFactInput, AddFactInput,
EmbeddingProvider, EmbeddingProvider,
TopicLinkInput, TopicLinkInput,
} from '../types/api'; } from "../types/api";
export interface ExtractedFact { export interface ExtractedFact {
statement?: string; statement?: string;
summary?: string | null; summary?: string | null;
source?: string | null; source?: string | null;
confidence?: number | null; confidence?: number | null;
metadata?: AddFactInput['metadata']; metadata?: AddFactInput["metadata"];
topics: TopicLinkInput[]; topics: TopicLinkInput[];
} }
export interface FactExtractor { export interface FactExtractor {
extract(input: string): Promise<ExtractedFact>; extract(input: string): Promise<ExtractedFact[]>;
}
export interface LlmTextGenerationModelInput {
instruction: string;
input: string;
additionalInstruction?: string | undefined;
} }
export interface LlmTextGenerationModel { export interface LlmTextGenerationModel {
generateText(prompt: string): Promise<string>; generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact[]>;
} }
export interface LlmFactExtractorOptions { export interface LlmFactExtractorOptions {
model: LlmTextGenerationModel; model: LlmTextGenerationModel;
instructions?: string; additionalInstructions?: string | undefined;
promptBuilder?: (input: string, instructions?: string) => string;
} }
export interface IngestStatementOptions { export interface IngestStatementOptions {

View File

@@ -1,15 +1,18 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest'; import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { IdentityDB } from '../src/core/identity-db'; import { IdentityDB } from "../src/core/identity-db";
import { LlmFactExtractor } from '../src/ingestion/llm-extractor'; import { LlmFactExtractor } from "../src/ingestion/llm-extractor";
import { NaiveExtractor } from '../src/ingestion/naive-extractor'; import { NaiveExtractor } from "../src/ingestion/naive-extractor";
import type { FactExtractor } from '../src/ingestion/types'; import type {
FactExtractor,
LlmTextGenerationModelInput,
} from "../src/ingestion/types";
describe('IdentityDB ingestion', () => { describe("IdentityDB ingestion", () => {
let db: IdentityDB; let db: IdentityDB;
beforeEach(async () => { beforeEach(async () => {
db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' }); db = await IdentityDB.connect({ client: "sqlite", filename: ":memory:" });
await db.initialize(); await db.initialize();
}); });
@@ -17,121 +20,144 @@ describe('IdentityDB ingestion', () => {
await db.close(); await db.close();
}); });
it('ingests a statement using a provided extractor', async () => { it("ingests a statement using a provided extractor", async () => {
const extractor: FactExtractor = { const extractor: FactExtractor = {
async extract(input) { async extract(input) {
return { return [
statement: input, {
topics: [ statement: input,
{ name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' }, topics: [
{ name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' }, {
{ name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' }, name: "I",
], category: "entity",
}; granularity: "concrete",
role: "subject",
},
{
name: "TypeScript",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "2025",
category: "temporal",
granularity: "concrete",
role: "time",
},
],
},
];
}, },
}; };
const fact = await db.ingestStatement('I have worked with TypeScript since 2025.', { const fact = await db.ingestStatement(
extractor, "I have worked with TypeScript since 2025.",
}); {
extractor,
},
);
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'TypeScript', '2025']); expect(fact.topics.map((topic) => topic.name)).toEqual([
"I",
"TypeScript",
"2025",
]);
const linkedFacts = await db.getTopicFactsLinkedTo('TypeScript', '2025'); const linkedFacts = await db.getTopicFactsLinkedTo("TypeScript", "2025");
expect(linkedFacts).toHaveLength(1); expect(linkedFacts).toHaveLength(1);
expect(linkedFacts[0]?.statement).toBe('I have worked with TypeScript since 2025.'); expect(linkedFacts[0]?.statement).toBe(
"I have worked with TypeScript since 2025.",
);
}); });
it('ships a deterministic naive extractor for local usage', async () => { it("ships a deterministic naive extractor for local usage", async () => {
const fact = await db.ingestStatement('I have worked with TypeScript since 2025.', { const fact = await db.ingestStatement(
extractor: new NaiveExtractor(), "I have worked with TypeScript since 2025.",
}); {
extractor: new NaiveExtractor(),
},
);
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'TypeScript', '2025']); expect(fact.topics.map((topic) => topic.name)).toEqual([
"I",
"TypeScript",
"2025",
]);
const topic = await db.getTopicByName('TypeScript', { includeFacts: true }); const topic = await db.getTopicByName("TypeScript", { includeFacts: true });
expect(topic?.facts).toHaveLength(1); expect(topic?.facts).toHaveLength(1);
}); });
it('ships an LLM extractor adapter that turns structured JSON responses into facts', async () => { it("ships an LLM extractor adapter that returns structured facts from the model", async () => {
let prompt = ''; let prompt: LlmTextGenerationModelInput | undefined = undefined;
const extractor = new LlmFactExtractor({ const extractor = new LlmFactExtractor({
model: { model: {
async generateText(input) { async generateText(input) {
prompt = input; prompt = input;
return JSON.stringify({
statement: 'I have worked with Bun and TypeScript since 2025.',
summary: 'The speaker has Bun and TypeScript experience.',
source: 'chat',
confidence: 0.91,
metadata: { channel: 'telegram' },
topics: [
{ name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
{ name: 'Bun', category: 'entity', granularity: 'concrete', role: 'object' },
{ name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
{ name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
],
});
},
},
instructions: 'Prefer technology and time topics.',
});
const fact = await db.ingestStatement('I have worked with Bun and TypeScript since 2025.', {
extractor,
});
expect(prompt).toContain('Prefer technology and time topics.');
expect(prompt).toContain('I have worked with Bun and TypeScript since 2025.');
expect(fact.summary).toBe('The speaker has Bun and TypeScript experience.');
expect(fact.source).toBe('chat');
expect(fact.confidence).toBe(0.91);
expect(fact.metadata).toEqual({ channel: 'telegram' });
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'Bun', 'TypeScript', '2025']);
});
it('parses JSON responses wrapped in markdown code fences', async () => {
const extractor = new LlmFactExtractor({
model: {
async generateText() {
return [ return [
'Here is the extracted fact:', {
'```json', statement: "I have worked with Bun and TypeScript since 2025.",
JSON.stringify({ summary: "The speaker has Bun and TypeScript experience.",
statement: 'Bun powers TypeScript tooling.', source: "chat",
confidence: 0.91,
metadata: { channel: "telegram" },
topics: [ topics: [
{ name: 'Bun', category: 'entity', granularity: 'concrete' }, {
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' }, name: "I",
category: "entity",
granularity: "concrete",
role: "subject",
},
{
name: "Bun",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "TypeScript",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "2025",
category: "temporal",
granularity: "concrete",
role: "time",
},
], ],
}), },
'```', ];
].join('\n');
}, },
}, },
additionalInstructions: "Prefer technology and time topics.",
}); });
const fact = await db.ingestStatement('Bun powers TypeScript tooling.', { const fact = await db.ingestStatement(
extractor, "I have worked with Bun and TypeScript since 2025.",
}); {
expect(fact.topics.map((topic) => topic.name)).toEqual(['Bun', 'TypeScript']);
});
it('rejects invalid LLM responses before writing facts', async () => {
const extractor = new LlmFactExtractor({
model: {
async generateText() {
return 'not json at all';
},
},
});
await expect(
db.ingestStatement('Bun powers TypeScript tooling.', {
extractor, extractor,
}), },
).rejects.toThrow('LLM extractor returned invalid JSON.'); );
expect(prompt).toEqual({
instruction: expect.stringContaining("Extract structured facts from the user input."),
input: "I have worked with Bun and TypeScript since 2025.",
additionalInstruction: "Prefer technology and time topics.",
});
expect(fact.summary).toBe("The speaker has Bun and TypeScript experience.");
expect(fact.source).toBe("chat");
expect(fact.confidence).toBe(0.91);
expect(fact.metadata).toEqual({ channel: "telegram" });
expect(fact.topics.map((topic) => topic.name)).toEqual([
"I",
"Bun",
"TypeScript",
"2025",
]);
}); });
}); });

View File

@@ -178,13 +178,15 @@ describe('IdentityDB dedup-aware ingestion', () => {
provider = new FakeEmbeddingProvider(); provider = new FakeEmbeddingProvider();
extractor = { extractor = {
async extract(input) { async extract(input) {
return { return [
statement: input, {
topics: [ statement: input,
{ name: 'Bun', category: 'entity', granularity: 'concrete' }, topics: [
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' }, { name: 'Bun', category: 'entity', granularity: 'concrete' },
], { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
}; ],
},
];
}, },
}; };

View File

@@ -18,6 +18,12 @@
"isolatedModules": true, "isolatedModules": true,
"types": ["node", "vitest/globals"] "types": ["node", "vitest/globals"]
}, },
"include": ["src/**/*.ts", "tests/**/*.ts", "vitest.config.ts", "tsup.config.ts"], "include": [
"src/**/*.ts",
"tests/**/*.ts",
"scripts/**/*.ts",
"vitest.config.ts",
"tsup.config.ts"
],
"exclude": ["dist", "node_modules"] "exclude": ["dist", "node_modules"]
} }