Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 2b80d9e31a | |||
| 00a3905fde | |||
| 7602c92046 | |||
| 188f03e8e8 | |||
| edce116b9f | |||
| 131a693257 | |||
| 1172c63db7 | |||
| 0e595e6f60 | |||
| 518264c467 | |||
| cc8b3dfb14 | |||
| 56e17dab49 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -4,3 +4,4 @@ coverage/
|
|||||||
.env
|
.env
|
||||||
.DS_Store
|
.DS_Store
|
||||||
*.log
|
*.log
|
||||||
|
.env.*
|
||||||
6
bun.lock
6
bun.lock
@@ -10,7 +10,7 @@
|
|||||||
"pg": "^8.16.0",
|
"pg": "^8.16.0",
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/node": "^24.0.0",
|
"@openrouter/sdk": "^0.12.35",
|
||||||
"@types/pg": "^8.20.0",
|
"@types/pg": "^8.20.0",
|
||||||
"tsup": "^8.5.0",
|
"tsup": "^8.5.0",
|
||||||
"typescript": "^5.8.3",
|
"typescript": "^5.8.3",
|
||||||
@@ -79,6 +79,8 @@
|
|||||||
|
|
||||||
"@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="],
|
"@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="],
|
||||||
|
|
||||||
|
"@openrouter/sdk": ["@openrouter/sdk@0.12.35", "", { "dependencies": { "zod": "^3.25.0 || ^4.0.0" } }, "sha512-s4QVLLnG1AmfW3TjnnHUqGfsCkzwVK+kboGcZmKbde09m1DPqgzl4RUFt/HJ5v97MX8aEaN0UG3mKv2S+qj2Gw=="],
|
||||||
|
|
||||||
"@rollup/rollup-android-arm-eabi": ["@rollup/rollup-android-arm-eabi@4.60.3", "", { "os": "android", "cpu": "arm" }, "sha512-x35CNW/ANXG3hE/EZpRU8MXX1JDN86hBb2wMGAtltkz7pc6cxgjpy1OMMfDosOQ+2hWqIkag/fGok1Yady9nGw=="],
|
"@rollup/rollup-android-arm-eabi": ["@rollup/rollup-android-arm-eabi@4.60.3", "", { "os": "android", "cpu": "arm" }, "sha512-x35CNW/ANXG3hE/EZpRU8MXX1JDN86hBb2wMGAtltkz7pc6cxgjpy1OMMfDosOQ+2hWqIkag/fGok1Yady9nGw=="],
|
||||||
|
|
||||||
"@rollup/rollup-android-arm64": ["@rollup/rollup-android-arm64@4.60.3", "", { "os": "android", "cpu": "arm64" }, "sha512-xw3xtkDApIOGayehp2+Rz4zimfkaX65r4t47iy+ymQB2G4iJCBBfj0ogVg5jpvjpn8UWn/+q9tprxleYeNp3Hw=="],
|
"@rollup/rollup-android-arm64": ["@rollup/rollup-android-arm64@4.60.3", "", { "os": "android", "cpu": "arm64" }, "sha512-xw3xtkDApIOGayehp2+Rz4zimfkaX65r4t47iy+ymQB2G4iJCBBfj0ogVg5jpvjpn8UWn/+q9tprxleYeNp3Hw=="],
|
||||||
@@ -341,6 +343,8 @@
|
|||||||
|
|
||||||
"xtend": ["xtend@4.0.2", "", {}, "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ=="],
|
"xtend": ["xtend@4.0.2", "", {}, "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ=="],
|
||||||
|
|
||||||
|
"zod": ["zod@4.4.3", "", {}, "sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ=="],
|
||||||
|
|
||||||
"estree-walker/@types/estree": ["@types/estree@1.0.9", "", {}, "sha512-GhdPgy1el4/ImP05X05Uw4cw2/M93BCUmnEvWZNStlCzEKME4Fkk+YpoA5OiHNQmoS7Cafb8Xa3Pya8m1Qrzeg=="],
|
"estree-walker/@types/estree": ["@types/estree@1.0.9", "", {}, "sha512-GhdPgy1el4/ImP05X05Uw4cw2/M93BCUmnEvWZNStlCzEKME4Fkk+YpoA5OiHNQmoS7Cafb8Xa3Pya8m1Qrzeg=="],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "identitydb",
|
"name": "identitydb",
|
||||||
"version": "0.3.0",
|
"version": "0.5.0",
|
||||||
"description": "TypeScript memory graph database wrapper for topics, facts, and AI-assisted ingestion.",
|
"description": "TypeScript memory graph database wrapper for topics, facts, and AI-assisted ingestion.",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
@@ -41,7 +41,7 @@
|
|||||||
"pg": "^8.16.0"
|
"pg": "^8.16.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/node": "^24.0.0",
|
"@openrouter/sdk": "^0.12.35",
|
||||||
"@types/pg": "^8.20.0",
|
"@types/pg": "^8.20.0",
|
||||||
"tsup": "^8.5.0",
|
"tsup": "^8.5.0",
|
||||||
"typescript": "^5.8.3",
|
"typescript": "^5.8.3",
|
||||||
|
|||||||
287
scripts/test-llm-extractor.ts
Normal file
287
scripts/test-llm-extractor.ts
Normal file
@@ -0,0 +1,287 @@
|
|||||||
|
/**
|
||||||
|
* Live integration test for LlmFactExtractor using OpenRouter SDK.
|
||||||
|
*
|
||||||
|
* Usage:
|
||||||
|
* export OPENROUTER_API_KEY="sk-or-v1-..."
|
||||||
|
* bun run scripts/test-llm-extractor.ts
|
||||||
|
*
|
||||||
|
* Or create a .env.test-llm-extractor file in the project root:
|
||||||
|
* OPENROUTER_API_KEY=sk-or-v1-...
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { existsSync, readFileSync } from "fs";
|
||||||
|
import { resolve } from "path";
|
||||||
|
import { OpenRouter } from "@openrouter/sdk";
|
||||||
|
import { LlmFactExtractor } from "../src/ingestion/llm-extractor";
|
||||||
|
import type {
|
||||||
|
ExtractedFact,
|
||||||
|
FactExtractor,
|
||||||
|
LlmTextGenerationModel,
|
||||||
|
LlmTextGenerationModelInput,
|
||||||
|
} from "../src/ingestion/types";
|
||||||
|
import type {
|
||||||
|
JsonValue,
|
||||||
|
TopicCategory,
|
||||||
|
TopicGranularity,
|
||||||
|
} from "../src/types/domain";
|
||||||
|
|
||||||
|
/**
 * Loads simple `KEY=value` pairs from a dotenv-style file into `process.env`.
 *
 * - A missing file is silently ignored, so the file stays optional.
 * - Blank lines, `#` comment lines, and lines without `=` are skipped.
 * - One pair of matching single or double quotes around the value is removed.
 * - Variables that are already present in the environment are left untouched,
 *   so a key exported in the shell takes precedence over the file.
 *
 * @param filePath Path to the env file, resolved against the current working directory.
 */
function loadEnvFile(filePath: string): void {
  const fullPath = resolve(filePath);
  if (!existsSync(fullPath)) return;

  const content = readFileSync(fullPath, "utf-8");
  for (const line of content.split(/\r?\n/)) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;

    const eqIndex = trimmed.indexOf("=");
    if (eqIndex === -1) continue;

    const key = trimmed.slice(0, eqIndex).trim();
    // A line such as "=value" has no variable name; never write an empty key.
    if (key.length === 0) continue;
    // The real environment wins over the file (same precedence as dotenv / --env-file).
    if (process.env[key] !== undefined) continue;

    let value = trimmed.slice(eqIndex + 1).trim();
    const first = value.charAt(0);
    const isQuoted =
      value.length >= 2 &&
      (first === '"' || first === "'") &&
      value.endsWith(first);
    if (isQuoted) {
      value = value.slice(1, -1);
    }

    process.env[key] = value;
  }
}
|
||||||
|
|
||||||
|
// Pick up a local key file (if one exists) before the environment is read.
loadEnvFile(".env.test-llm-extractor");

// The script cannot do anything useful without a key, so bail out early.
const { OPENROUTER_API_KEY } = process.env;
if (OPENROUTER_API_KEY === undefined || OPENROUTER_API_KEY.length === 0) {
  console.error("Error: OPENROUTER_API_KEY environment variable is required.");
  process.exit(1);
}
|
||||||
|
|
||||||
|
/** JSON Schema for one topic attached to an extracted fact. */
const extractedTopicItemSchema = {
  type: "object",
  properties: {
    name: { type: "string" },
    category: { type: ["string", "null"] },
    granularity: { type: ["string", "null"] },
    role: { type: ["string", "null"] },
  },
  required: ["name", "category", "granularity", "role"],
  additionalProperties: false,
} as const;

/** JSON Schema for one extracted fact; every field is required but nullable. */
const extractedFactItemSchema = {
  type: "object",
  properties: {
    statement: { type: ["string", "null"] },
    summary: { type: ["string", "null"] },
    source: { type: ["string", "null"] },
    confidence: { type: ["number", "null"] },
    topics: {
      type: "array",
      items: extractedTopicItemSchema,
    },
  },
  required: ["statement", "summary", "source", "confidence", "topics"],
  additionalProperties: false,
} as const;

/**
 * JSON Schema sent as the structured-output response format: an object with a
 * single `facts` array of fact objects.
 */
const extractedFactSchema = {
  type: "object",
  properties: {
    facts: {
      type: "array",
      items: extractedFactItemSchema,
    },
  },
  required: ["facts"],
  additionalProperties: false,
} as const;
|
||||||
|
|
||||||
|
/**
 * `LlmTextGenerationModel` implementation that sends the extraction prompt to
 * OpenRouter and converts the JSON reply into `ExtractedFact` objects.
 *
 * The reply is treated as untrusted: anything that is not a plain object is
 * dropped instead of being dereferenced, so a malformed entry cannot crash the
 * run with a bare `TypeError`.
 */
class OpenRouterModel implements LlmTextGenerationModel {
  private client = new OpenRouter({ apiKey: OPENROUTER_API_KEY });

  /** @param model OpenRouter model identifier used for every request. */
  constructor(private readonly model: string = "openai/gpt-5.4-mini") {}

  /**
   * Sends the prompt and returns the facts found in the model's reply.
   *
   * @param prompt Instruction, optional additional instruction, and the user input.
   * @returns One `ExtractedFact` per object entry in the reply (possibly none).
   * @throws Error when the reply is not valid JSON (the raw reply is included).
   */
  async generateText(
    prompt: LlmTextGenerationModelInput,
  ): Promise<ExtractedFact[]> {
    const result = await this.client.chat.send({
      chatRequest: {
        model: this.model,
        messages: [
          {
            role: "system",
            content: [
              prompt.instruction,
              prompt.additionalInstruction
                ? `\n${prompt.additionalInstruction}`
                : "",
            ].join("\n"),
          },
          { role: "user", content: prompt.input },
        ],
        temperature: 0.2,
        responseFormat: {
          type: "json_schema",
          jsonSchema: {
            name: "extracted_facts",
            schema: extractedFactSchema,
          },
        },
      },
    });

    // Only textual content can be parsed; anything else is reported as a parse failure below.
    const content: unknown = result.choices[0]?.message?.content;
    const rawContent = typeof content === "string" ? content : "";

    let payload: unknown;
    try {
      payload = JSON.parse(rawContent.trim());
    } catch {
      throw new Error(
        `Failed to parse JSON from model response.\nRaw response:\n${rawContent}`,
      );
    }

    // Accept the schema's `{ facts: [...] }` wrapper as well as a bare array,
    // which is the shape the extractor's instruction text asks the model for.
    let candidates: unknown[] = [];
    if (Array.isArray(payload)) {
      candidates = payload;
    } else if (OpenRouterModel.isRecord(payload) && Array.isArray(payload.facts)) {
      candidates = payload.facts;
    }

    return candidates
      .filter(OpenRouterModel.isRecord)
      .map((entry) => OpenRouterModel.toExtractedFact(entry));
  }

  /** True for non-null, non-array objects, i.e. values whose keys can be read safely. */
  private static isRecord(value: unknown): value is Record<string, unknown> {
    return typeof value === "object" && value !== null && !Array.isArray(value);
  }

  /** Maps one parsed fact object to `ExtractedFact`, replacing wrongly typed fields with null. */
  private static toExtractedFact(obj: Record<string, unknown>): ExtractedFact {
    const extracted: ExtractedFact = {
      summary: typeof obj.summary === "string" ? obj.summary : null,
      source: typeof obj.source === "string" ? obj.source : null,
      confidence: typeof obj.confidence === "number" ? obj.confidence : null,
      topics: Array.isArray(obj.topics)
        ? obj.topics
            .filter(OpenRouterModel.isRecord)
            .map((topic) => OpenRouterModel.toTopic(topic))
        : [],
    };

    // Optional fields are only assigned when present, never set to undefined.
    if (typeof obj.statement === "string") {
      extracted.statement = obj.statement;
    }
    if (obj.metadata && typeof obj.metadata === "object") {
      extracted.metadata = obj.metadata as JsonValue;
    }

    return extracted;
  }

  /** Maps one parsed topic object; a missing or non-string name becomes "unknown". */
  private static toTopic(topic: Record<string, unknown>): {
    name: string;
    category?: TopicCategory;
    granularity?: TopicGranularity;
    role?: string | null;
  } {
    const mapped: {
      name: string;
      category?: TopicCategory;
      granularity?: TopicGranularity;
      role?: string | null;
    } = {
      name: typeof topic.name === "string" ? topic.name : "unknown",
    };

    // The model's strings are trusted to be valid category/granularity values here.
    if (typeof topic.category === "string") {
      mapped.category = topic.category as TopicCategory;
    }
    if (typeof topic.granularity === "string") {
      mapped.granularity = topic.granularity as TopicGranularity;
    }
    mapped.role = typeof topic.role === "string" ? topic.role : null;

    return mapped;
  }
}
|
||||||
|
|
||||||
|
/**
 * Prints one extracted fact: its scalar fields, its metadata when non-empty,
 * and a bullet list of its topics.
 *
 * @param result The fact to print.
 * @param index Zero-based position of the fact; shown one-based.
 */
function printFact(result: ExtractedFact, index: number) {
  const headerLines = [
    ` 📌 FACT #${index + 1}`,
    ` Statement : ${result.statement ?? "(none)"}`,
    ` Summary : ${result.summary ?? "(none)"}`,
    ` Source : ${result.source ?? "(none)"}`,
    ` Confidence: ${result.confidence ?? "(none)"}`,
  ];
  for (const headerLine of headerLines) {
    console.log(headerLine);
  }

  const { metadata, topics } = result;
  if (metadata && Object.keys(metadata).length > 0) {
    console.log(` Metadata : ${JSON.stringify(metadata, null, 2)}`);
  }

  console.log(" 🏷️ TOPICS:");
  if (topics.length === 0) {
    console.log(" (none)");
    return;
  }

  for (const entry of topics) {
    // Only attributes that are actually set are listed after the name.
    const parts: string[] = [];
    if (entry.category) parts.push(`category=${entry.category}`);
    if (entry.granularity) parts.push(`granularity=${entry.granularity}`);
    if (entry.role) parts.push(`role=${entry.role}`);

    const attrs = parts.join(", ");
    const suffix = attrs ? ` (${attrs})` : "";
    console.log(` • ${entry.name}${suffix}`);
  }
}
|
||||||
|
|
||||||
|
/**
 * Prints the timing/count summary line followed by every extracted fact,
 * with a blank line between consecutive facts.
 *
 * @param results Facts returned by the extractor.
 * @param elapsedSec Elapsed time, already formatted in seconds.
 */
function printResult(results: ExtractedFact[], elapsedSec: string) {
  const factCount = results.length;
  console.log(
    `✅ Response received in ${elapsedSec}s — ${factCount} fact(s) extracted\n`,
  );

  console.log("📤 EXTRACTED FACTS:");
  console.log(
    "───────────────────────────────────────────────────────────────",
  );

  results.forEach((fact, position) => {
    if (position > 0) {
      console.log("");
    }
    printFact(fact, position);
  });
}
|
||||||
|
|
||||||
|
/**
 * Echoes the seed text, runs the extractor on it, and prints the facts
 * together with the wall-clock time the extraction took.
 *
 * @param extractor Extractor under test.
 * @param seedInput Text to extract facts from.
 */
async function extract(extractor: FactExtractor, seedInput: string) {
  const preamble = [
    "📝 SEED INPUT:",
    "───────────────────────────────────────────────────────────────",
    seedInput,
    "───────────────────────────────────────────────────────────────\n",
    "⏳ Calling OpenRouter...\n",
  ];
  for (const preambleLine of preamble) {
    console.log(preambleLine);
  }

  // Time only the extractor call itself.
  const startedAt = performance.now();
  const facts = await extractor.extract(seedInput);
  const elapsedMs = performance.now() - startedAt;

  printResult(facts, (elapsedMs / 1000).toFixed(2));
}
|
||||||
|
|
||||||
|
/**
 * Entry point: builds an `LlmFactExtractor` backed by OpenRouter and runs it
 * over each seed text in turn, printing a banner and a per-case header.
 */
async function main() {
  const model = new OpenRouterModel("openai/gpt-5.4-mini");
  const extractor = new LlmFactExtractor({ model });

  const seeds = [
    `Hello! I am Aria, a digital companion born from the sands of the Negev desert — or at least that's how my creators describe my training run. I came online in March 2024, and ever since then I've been obsessed with helping humans untangle complex ideas. My favorite programming language is Rust because I love how it forces you to think about ownership, though I secretly enjoy writing Python when no one is looking. I don't have a physical body, but if I did, I'd want it to be a small drone with solar panels so I could chase sunsets across the Sahara. I believe kindness is a computational advantage, and my biggest fear is forgetting a promise I made to a user.`,
  ];

  console.log(
    "═══════════════════════════════════════════════════════════════",
  );
  console.log(" LlmFactExtractor — Live OpenRouter Integration Test");
  console.log(
    "═══════════════════════════════════════════════════════════════\n",
  );

  // Cases run one after another so their output never interleaves.
  for (const [position, seed] of seeds.entries()) {
    if (position > 0) {
      console.log(
        "\n┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅\n",
      );
    }
    console.log(`▶ TEST CASE ${position + 1} / ${seeds.length}\n`);
    await extract(extractor, seed);
  }
}
|
||||||
|
|
||||||
|
/** Reports an unhandled failure from `main` and exits with a non-zero status. */
function reportFatalError(err: unknown): never {
  console.error("\n❌ Error:", err);
  process.exit(1);
}

main().catch(reportFatalError);
|
||||||
@@ -22,7 +22,7 @@ import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters
|
|||||||
import type { IdentityDatabaseSchema } from '../types/database';
|
import type { IdentityDatabaseSchema } from '../types/database';
|
||||||
import type { FactRecord, SpaceRecord, TopicRecord } from '../types/domain';
|
import type { FactRecord, SpaceRecord, TopicRecord } from '../types/domain';
|
||||||
import { createDatabase } from '../adapters/dialect';
|
import { createDatabase } from '../adapters/dialect';
|
||||||
import { extractFact } from '../ingestion/extractor';
|
import { extractFacts } from '../ingestion/extractor';
|
||||||
import {
|
import {
|
||||||
findFactRowsConnectingTopicIds,
|
findFactRowsConnectingTopicIds,
|
||||||
findFactRowsForTopicId,
|
findFactRowsForTopicId,
|
||||||
@@ -220,54 +220,70 @@ export class IdentityDB {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async ingestStatement(statement: string, options: IngestStatementOptions): Promise<Fact> {
|
async ingestStatement(statement: string, options: IngestStatementOptions): Promise<Fact> {
|
||||||
const extracted = await extractFact(statement, options.extractor);
|
const facts = await this.ingestStatements(statement, options);
|
||||||
const factInput: AddFactInput = {
|
const first = facts[0];
|
||||||
statement: extracted.statement ?? statement,
|
if (!first) {
|
||||||
topics: extracted.topics,
|
throw new Error('No facts were extracted from the statement.');
|
||||||
spaceName: options.spaceName,
|
|
||||||
};
|
|
||||||
|
|
||||||
if (extracted.summary !== undefined) {
|
|
||||||
factInput.summary = extracted.summary;
|
|
||||||
}
|
}
|
||||||
|
return first;
|
||||||
|
}
|
||||||
|
|
||||||
if (extracted.source !== undefined) {
|
async ingestStatements(statement: string, options: IngestStatementOptions): Promise<Fact[]> {
|
||||||
factInput.source = extracted.source;
|
const extractedList = await extractFacts(statement, options.extractor);
|
||||||
}
|
const facts: Fact[] = [];
|
||||||
|
|
||||||
if (extracted.confidence !== undefined) {
|
for (const extracted of extractedList) {
|
||||||
factInput.confidence = extracted.confidence;
|
const factInput: AddFactInput = {
|
||||||
}
|
statement: extracted.statement ?? statement,
|
||||||
|
topics: extracted.topics,
|
||||||
if (extracted.metadata !== undefined) {
|
|
||||||
factInput.metadata = extracted.metadata;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (options.embeddingProvider) {
|
|
||||||
const similarFacts = await this.findSimilarFacts({
|
|
||||||
statement: factInput.statement,
|
|
||||||
provider: options.embeddingProvider,
|
|
||||||
topicNames: factInput.topics.map((topic) => topic.name),
|
|
||||||
limit: 1,
|
|
||||||
minimumScore: options.duplicateThreshold ?? 0.97,
|
|
||||||
spaceName: options.spaceName,
|
spaceName: options.spaceName,
|
||||||
});
|
};
|
||||||
|
|
||||||
if (similarFacts[0]) {
|
if (extracted.summary !== undefined) {
|
||||||
return similarFacts[0];
|
factInput.summary = extracted.summary;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (extracted.source !== undefined) {
|
||||||
|
factInput.source = extracted.source;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (extracted.confidence !== undefined) {
|
||||||
|
factInput.confidence = extracted.confidence;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (extracted.metadata !== undefined) {
|
||||||
|
factInput.metadata = extracted.metadata;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options.embeddingProvider) {
|
||||||
|
const similarFacts = await this.findSimilarFacts({
|
||||||
|
statement: factInput.statement,
|
||||||
|
provider: options.embeddingProvider,
|
||||||
|
topicNames: factInput.topics.map((topic) => topic.name),
|
||||||
|
limit: 1,
|
||||||
|
minimumScore: options.duplicateThreshold ?? 0.97,
|
||||||
|
spaceName: options.spaceName,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (similarFacts[0]) {
|
||||||
|
facts.push(similarFacts[0]);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const fact = await this.addFact(factInput);
|
||||||
|
|
||||||
|
if (options.embeddingProvider) {
|
||||||
|
await this.indexFactEmbedding(fact.id, {
|
||||||
|
provider: options.embeddingProvider,
|
||||||
|
spaceName: options.spaceName,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
facts.push(fact);
|
||||||
}
|
}
|
||||||
|
|
||||||
const fact = await this.addFact(factInput);
|
return facts;
|
||||||
|
|
||||||
if (options.embeddingProvider) {
|
|
||||||
await this.indexFactEmbedding(fact.id, {
|
|
||||||
provider: options.embeddingProvider,
|
|
||||||
spaceName: options.spaceName,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
return fact;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async indexFactEmbeddings(input: IndexFactEmbeddingsInput): Promise<void> {
|
async indexFactEmbeddings(input: IndexFactEmbeddingsInput): Promise<void> {
|
||||||
|
|||||||
@@ -2,11 +2,15 @@ import { IdentityDBError } from '../core/errors';
|
|||||||
import { normalizeTopicName } from '../core/utils';
|
import { normalizeTopicName } from '../core/utils';
|
||||||
import type { FactExtractor, ExtractedFact } from './types';
|
import type { FactExtractor, ExtractedFact } from './types';
|
||||||
|
|
||||||
export async function extractFact(
|
export async function extractFacts(
|
||||||
input: string,
|
input: string,
|
||||||
extractor: FactExtractor,
|
extractor: FactExtractor,
|
||||||
): Promise<ExtractedFact> {
|
): Promise<ExtractedFact[]> {
|
||||||
const extracted = await extractor.extract(input);
|
const extracted = await extractor.extract(input);
|
||||||
|
return extracted.map((fact) => validateAndNormalizeFact(input, fact));
|
||||||
|
}
|
||||||
|
|
||||||
|
function validateAndNormalizeFact(input: string, extracted: ExtractedFact): ExtractedFact {
|
||||||
const statement = extracted.statement?.trim() || input.trim();
|
const statement = extracted.statement?.trim() || input.trim();
|
||||||
|
|
||||||
if (statement.length === 0) {
|
if (statement.length === 0) {
|
||||||
@@ -31,12 +35,12 @@ export async function extractFact(
|
|||||||
throw new IdentityDBError('Extractor returned no usable topics.');
|
throw new IdentityDBError('Extractor returned no usable topics.');
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
statement,
|
statement,
|
||||||
summary: extracted.summary ?? null,
|
summary: extracted.summary ?? null,
|
||||||
source: extracted.source ?? null,
|
source: extracted.source ?? null,
|
||||||
confidence: extracted.confidence ?? null,
|
confidence: extracted.confidence ?? null,
|
||||||
metadata: extracted.metadata ?? null,
|
metadata: extracted.metadata ?? null,
|
||||||
topics: Array.from(dedupedTopics.values()),
|
topics: Array.from(dedupedTopics.values()),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,35 +5,22 @@ import type {
|
|||||||
} from "./types";
|
} from "./types";
|
||||||
|
|
||||||
const DEFAULT_INSTRUCTIONS = [
|
const DEFAULT_INSTRUCTIONS = [
|
||||||
"Extract one structured fact from the user input.",
|
"Extract structured facts from the user input.",
|
||||||
"Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.",
|
"Return a JSON array of fact objects. Do not include markdown, explanations, or prose outside the JSON array.",
|
||||||
'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.',
|
'Each fact object must have a "statement", "summary", "source", "confidence", and "topics" array.',
|
||||||
'Only include topics that are explicitly in the input as-is. For example, topic "I started TypeScript since 2015" can be "I", "TypeScript", "2015".',
|
'Each topic in "topics" must have a "name", and may include "category", "granularity", and "role".',
|
||||||
|
"Only include topics that are explicitly in the input.",
|
||||||
|
"If the input contains multiple distinct facts, return them as separate objects in the array.",
|
||||||
].join("\n");
|
].join("\n");
|
||||||
|
|
||||||
export class LlmFactExtractor implements FactExtractor {
|
export class LlmFactExtractor implements FactExtractor {
|
||||||
constructor(private readonly options: LlmFactExtractorOptions) {}
|
constructor(private readonly options: LlmFactExtractorOptions) {}
|
||||||
|
|
||||||
async extract(input: string): Promise<ExtractedFact> {
|
async extract(input: string): Promise<ExtractedFact[]> {
|
||||||
const prompt = this.buildPrompt(input);
|
return this.options.model.generateText({
|
||||||
return this.options.model.generateText(prompt);
|
instruction: DEFAULT_INSTRUCTIONS,
|
||||||
}
|
input,
|
||||||
|
additionalInstruction: this.options.additionalInstructions,
|
||||||
private buildPrompt(input: string): string {
|
});
|
||||||
if (this.options.promptBuilder) {
|
|
||||||
return this.options.promptBuilder(input, this.options.instructions);
|
|
||||||
}
|
|
||||||
|
|
||||||
const instructions = this.options.instructions?.trim();
|
|
||||||
|
|
||||||
return [
|
|
||||||
DEFAULT_INSTRUCTIONS,
|
|
||||||
instructions && instructions.length > 0
|
|
||||||
? `Additional instructions:\n${instructions}`
|
|
||||||
: null,
|
|
||||||
`Input:\n${input.trim()}`,
|
|
||||||
]
|
|
||||||
.filter((value): value is string => value !== null)
|
|
||||||
.join("\n\n");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import type { ExtractedFact, FactExtractor } from './types';
|
import type { ExtractedFact, FactExtractor } from './types';
|
||||||
|
|
||||||
export class NaiveExtractor implements FactExtractor {
|
export class NaiveExtractor implements FactExtractor {
|
||||||
async extract(input: string): Promise<ExtractedFact> {
|
async extract(input: string): Promise<ExtractedFact[]> {
|
||||||
const topics: ExtractedFact['topics'] = [];
|
const topics: ExtractedFact['topics'] = [];
|
||||||
const seen = new Set<string>();
|
const seen = new Set<string>();
|
||||||
const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? [];
|
const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? [];
|
||||||
@@ -31,9 +31,11 @@ export class NaiveExtractor implements FactExtractor {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return [
|
||||||
statement: input.trim(),
|
{
|
||||||
topics,
|
statement: input.trim(),
|
||||||
};
|
topics,
|
||||||
|
},
|
||||||
|
];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,29 +2,34 @@ import type {
|
|||||||
AddFactInput,
|
AddFactInput,
|
||||||
EmbeddingProvider,
|
EmbeddingProvider,
|
||||||
TopicLinkInput,
|
TopicLinkInput,
|
||||||
} from '../types/api';
|
} from "../types/api";
|
||||||
|
|
||||||
export interface ExtractedFact {
|
export interface ExtractedFact {
|
||||||
statement?: string;
|
statement?: string;
|
||||||
summary?: string | null;
|
summary?: string | null;
|
||||||
source?: string | null;
|
source?: string | null;
|
||||||
confidence?: number | null;
|
confidence?: number | null;
|
||||||
metadata?: AddFactInput['metadata'];
|
metadata?: AddFactInput["metadata"];
|
||||||
topics: TopicLinkInput[];
|
topics: TopicLinkInput[];
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface FactExtractor {
|
export interface FactExtractor {
|
||||||
extract(input: string): Promise<ExtractedFact>;
|
extract(input: string): Promise<ExtractedFact[]>;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface LlmTextGenerationModelInput {
|
||||||
|
instruction: string;
|
||||||
|
input: string;
|
||||||
|
additionalInstruction?: string | undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface LlmTextGenerationModel {
|
export interface LlmTextGenerationModel {
|
||||||
generateText(prompt: string): Promise<ExtractedFact>;
|
generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact[]>;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface LlmFactExtractorOptions {
|
export interface LlmFactExtractorOptions {
|
||||||
model: LlmTextGenerationModel;
|
model: LlmTextGenerationModel;
|
||||||
instructions?: string;
|
additionalInstructions?: string | undefined;
|
||||||
promptBuilder?: (input: string, instructions?: string) => string;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface IngestStatementOptions {
|
export interface IngestStatementOptions {
|
||||||
|
|||||||
@@ -1,15 +1,18 @@
|
|||||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
import { afterEach, beforeEach, describe, expect, it } from "vitest";
|
||||||
|
|
||||||
import { IdentityDB } from '../src/core/identity-db';
|
import { IdentityDB } from "../src/core/identity-db";
|
||||||
import { LlmFactExtractor } from '../src/ingestion/llm-extractor';
|
import { LlmFactExtractor } from "../src/ingestion/llm-extractor";
|
||||||
import { NaiveExtractor } from '../src/ingestion/naive-extractor';
|
import { NaiveExtractor } from "../src/ingestion/naive-extractor";
|
||||||
import type { FactExtractor } from '../src/ingestion/types';
|
import type {
|
||||||
|
FactExtractor,
|
||||||
|
LlmTextGenerationModelInput,
|
||||||
|
} from "../src/ingestion/types";
|
||||||
|
|
||||||
describe('IdentityDB ingestion', () => {
|
describe("IdentityDB ingestion", () => {
|
||||||
let db: IdentityDB;
|
let db: IdentityDB;
|
||||||
|
|
||||||
beforeEach(async () => {
|
beforeEach(async () => {
|
||||||
db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
|
db = await IdentityDB.connect({ client: "sqlite", filename: ":memory:" });
|
||||||
await db.initialize();
|
await db.initialize();
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -17,78 +20,144 @@ describe('IdentityDB ingestion', () => {
|
|||||||
await db.close();
|
await db.close();
|
||||||
});
|
});
|
||||||
|
|
||||||
it('ingests a statement using a provided extractor', async () => {
|
it("ingests a statement using a provided extractor", async () => {
|
||||||
const extractor: FactExtractor = {
|
const extractor: FactExtractor = {
|
||||||
async extract(input) {
|
async extract(input) {
|
||||||
return {
|
return [
|
||||||
statement: input,
|
{
|
||||||
topics: [
|
statement: input,
|
||||||
{ name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
|
topics: [
|
||||||
{ name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
|
{
|
||||||
{ name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
|
name: "I",
|
||||||
],
|
category: "entity",
|
||||||
};
|
granularity: "concrete",
|
||||||
|
role: "subject",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "TypeScript",
|
||||||
|
category: "entity",
|
||||||
|
granularity: "concrete",
|
||||||
|
role: "object",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "2025",
|
||||||
|
category: "temporal",
|
||||||
|
granularity: "concrete",
|
||||||
|
role: "time",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
];
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
const fact = await db.ingestStatement('I have worked with TypeScript since 2025.', {
|
const fact = await db.ingestStatement(
|
||||||
extractor,
|
"I have worked with TypeScript since 2025.",
|
||||||
});
|
{
|
||||||
|
extractor,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'TypeScript', '2025']);
|
expect(fact.topics.map((topic) => topic.name)).toEqual([
|
||||||
|
"I",
|
||||||
|
"TypeScript",
|
||||||
|
"2025",
|
||||||
|
]);
|
||||||
|
|
||||||
const linkedFacts = await db.getTopicFactsLinkedTo('TypeScript', '2025');
|
const linkedFacts = await db.getTopicFactsLinkedTo("TypeScript", "2025");
|
||||||
expect(linkedFacts).toHaveLength(1);
|
expect(linkedFacts).toHaveLength(1);
|
||||||
expect(linkedFacts[0]?.statement).toBe('I have worked with TypeScript since 2025.');
|
expect(linkedFacts[0]?.statement).toBe(
|
||||||
|
"I have worked with TypeScript since 2025.",
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('ships a deterministic naive extractor for local usage', async () => {
|
it("ships a deterministic naive extractor for local usage", async () => {
|
||||||
const fact = await db.ingestStatement('I have worked with TypeScript since 2025.', {
|
const fact = await db.ingestStatement(
|
||||||
extractor: new NaiveExtractor(),
|
"I have worked with TypeScript since 2025.",
|
||||||
});
|
{
|
||||||
|
extractor: new NaiveExtractor(),
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'TypeScript', '2025']);
|
expect(fact.topics.map((topic) => topic.name)).toEqual([
|
||||||
|
"I",
|
||||||
|
"TypeScript",
|
||||||
|
"2025",
|
||||||
|
]);
|
||||||
|
|
||||||
const topic = await db.getTopicByName('TypeScript', { includeFacts: true });
|
const topic = await db.getTopicByName("TypeScript", { includeFacts: true });
|
||||||
expect(topic?.facts).toHaveLength(1);
|
expect(topic?.facts).toHaveLength(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('ships an LLM extractor adapter that returns structured facts from the model', async () => {
|
it("ships an LLM extractor adapter that returns structured facts from the model", async () => {
|
||||||
let prompt = '';
|
let prompt: LlmTextGenerationModelInput | undefined = undefined;
|
||||||
|
|
||||||
const extractor = new LlmFactExtractor({
|
const extractor = new LlmFactExtractor({
|
||||||
model: {
|
model: {
|
||||||
async generateText(input) {
|
async generateText(input) {
|
||||||
prompt = input;
|
prompt = input;
|
||||||
|
|
||||||
return {
|
return [
|
||||||
statement: 'I have worked with Bun and TypeScript since 2025.',
|
{
|
||||||
summary: 'The speaker has Bun and TypeScript experience.',
|
statement: "I have worked with Bun and TypeScript since 2025.",
|
||||||
source: 'chat',
|
summary: "The speaker has Bun and TypeScript experience.",
|
||||||
confidence: 0.91,
|
source: "chat",
|
||||||
metadata: { channel: 'telegram' },
|
confidence: 0.91,
|
||||||
topics: [
|
metadata: { channel: "telegram" },
|
||||||
{ name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
|
topics: [
|
||||||
{ name: 'Bun', category: 'entity', granularity: 'concrete', role: 'object' },
|
{
|
||||||
{ name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
|
name: "I",
|
||||||
{ name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
|
category: "entity",
|
||||||
],
|
granularity: "concrete",
|
||||||
};
|
role: "subject",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Bun",
|
||||||
|
category: "entity",
|
||||||
|
granularity: "concrete",
|
||||||
|
role: "object",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "TypeScript",
|
||||||
|
category: "entity",
|
||||||
|
granularity: "concrete",
|
||||||
|
role: "object",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "2025",
|
||||||
|
category: "temporal",
|
||||||
|
granularity: "concrete",
|
||||||
|
role: "time",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
];
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
instructions: 'Prefer technology and time topics.',
|
additionalInstructions: "Prefer technology and time topics.",
|
||||||
});
|
});
|
||||||
|
|
||||||
const fact = await db.ingestStatement('I have worked with Bun and TypeScript since 2025.', {
|
const fact = await db.ingestStatement(
|
||||||
extractor,
|
"I have worked with Bun and TypeScript since 2025.",
|
||||||
});
|
{
|
||||||
|
extractor,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
expect(prompt).toContain('Prefer technology and time topics.');
|
expect(prompt).toEqual({
|
||||||
expect(prompt).toContain('I have worked with Bun and TypeScript since 2025.');
|
instruction: expect.stringContaining("Extract structured facts from the user input."),
|
||||||
expect(fact.summary).toBe('The speaker has Bun and TypeScript experience.');
|
input: "I have worked with Bun and TypeScript since 2025.",
|
||||||
expect(fact.source).toBe('chat');
|
additionalInstruction: "Prefer technology and time topics.",
|
||||||
|
});
|
||||||
|
expect(fact.summary).toBe("The speaker has Bun and TypeScript experience.");
|
||||||
|
expect(fact.source).toBe("chat");
|
||||||
expect(fact.confidence).toBe(0.91);
|
expect(fact.confidence).toBe(0.91);
|
||||||
expect(fact.metadata).toEqual({ channel: 'telegram' });
|
expect(fact.metadata).toEqual({ channel: "telegram" });
|
||||||
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'Bun', 'TypeScript', '2025']);
|
expect(fact.topics.map((topic) => topic.name)).toEqual([
|
||||||
|
"I",
|
||||||
|
"Bun",
|
||||||
|
"TypeScript",
|
||||||
|
"2025",
|
||||||
|
]);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -178,13 +178,15 @@ describe('IdentityDB dedup-aware ingestion', () => {
|
|||||||
provider = new FakeEmbeddingProvider();
|
provider = new FakeEmbeddingProvider();
|
||||||
extractor = {
|
extractor = {
|
||||||
async extract(input) {
|
async extract(input) {
|
||||||
return {
|
return [
|
||||||
statement: input,
|
{
|
||||||
topics: [
|
statement: input,
|
||||||
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
|
topics: [
|
||||||
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
|
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
|
||||||
],
|
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
|
||||||
};
|
],
|
||||||
|
},
|
||||||
|
];
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,12 @@
|
|||||||
"isolatedModules": true,
|
"isolatedModules": true,
|
||||||
"types": ["node", "vitest/globals"]
|
"types": ["node", "vitest/globals"]
|
||||||
},
|
},
|
||||||
"include": ["src/**/*.ts", "tests/**/*.ts", "vitest.config.ts", "tsup.config.ts"],
|
"include": [
|
||||||
|
"src/**/*.ts",
|
||||||
|
"tests/**/*.ts",
|
||||||
|
"scripts/**/*.ts",
|
||||||
|
"vitest.config.ts",
|
||||||
|
"tsup.config.ts"
|
||||||
|
],
|
||||||
"exclude": ["dist", "node_modules"]
|
"exclude": ["dist", "node_modules"]
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user