feat: add test-llm-extractor.ts script
This commit is contained in:
287
scripts/test-llm-extractor.ts
Normal file
287
scripts/test-llm-extractor.ts
Normal file
@@ -0,0 +1,287 @@
|
||||
/**
 * Live integration test for LlmFactExtractor using the OpenRouter SDK.
 *
 * Usage:
 *   export OPENROUTER_API_KEY="sk-or-v1-..."
 *   bun run scripts/test-llm-extractor.ts
 *
 * Or create a .env.test-llm-extractor file in the project root:
 *   OPENROUTER_API_KEY=sk-or-v1-...
 */
|
||||
|
||||
import { existsSync, readFileSync } from "fs";
|
||||
import { resolve } from "path";
|
||||
import { OpenRouter } from "@openrouter/sdk";
|
||||
import { LlmFactExtractor } from "../src/ingestion/llm-extractor";
|
||||
import type {
|
||||
ExtractedFact,
|
||||
FactExtractor,
|
||||
LlmTextGenerationModel,
|
||||
LlmTextGenerationModelInput,
|
||||
} from "../src/ingestion/types";
|
||||
import type {
|
||||
JsonValue,
|
||||
TopicCategory,
|
||||
TopicGranularity,
|
||||
} from "../src/types/domain";
|
||||
|
||||
/**
 * Loads `KEY=value` pairs from a dotenv-style file into `process.env`.
 *
 * A missing file is silently ignored. Blank lines, `#` comment lines, lines
 * without `=` and lines whose key is empty are skipped. An optional leading
 * `export ` is accepted, and one pair of matching single or double quotes
 * around the value is removed. Values from the file overwrite variables that
 * are already present in `process.env`.
 *
 * @param filePath Path to the env file; relative paths are resolved with
 *   `path.resolve`.
 */
function loadEnvFile(filePath: string) {
  const fullPath = resolve(filePath);
  if (!existsSync(fullPath)) return;

  const content = readFileSync(fullPath, "utf-8");
  for (const line of content.split("\n")) {
    // trim() also drops the "\r" left behind by CRLF line endings.
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;

    // Accept shell-style `export KEY=value` lines as well as plain `KEY=value`.
    const assignment = trimmed.replace(/^export\s+/, "");
    const eqIndex = assignment.indexOf("=");
    if (eqIndex === -1) continue;

    const key = assignment.slice(0, eqIndex).trim();
    if (!key) continue;

    let value = assignment.slice(eqIndex + 1).trim();
    const quote = value[0];
    // Require at least two characters so a lone quote is not treated as a
    // quoted (and therefore empty) value.
    if (
      value.length >= 2 &&
      (quote === '"' || quote === "'") &&
      value.endsWith(quote)
    ) {
      value = value.slice(1, -1);
    }
    process.env[key] = value;
  }
}
|
||||
|
||||
// Load the optional local env file first so the key can come from either the
// shell environment or `.env.test-llm-extractor`.
loadEnvFile(".env.test-llm-extractor");

// Trim so a whitespace-only value is rejected here instead of failing later
// as an authentication error.
const OPENROUTER_API_KEY = process.env.OPENROUTER_API_KEY?.trim();
if (!OPENROUTER_API_KEY) {
  console.error("Error: OPENROUTER_API_KEY environment variable is required.");
  process.exit(1);
}
|
||||
|
||||
/**
 * JSON Schema passed as the `json_schema` response format.
 *
 * The reply must be an object with a single `facts` array. Every fact carries
 * nullable `statement`, `summary`, `source` and `confidence` fields plus a
 * `topics` array; every topic has a required `name` and nullable `category`,
 * `granularity` and `role`. All listed keys are required and no additional
 * properties are allowed at any level.
 */
const extractedFactSchema = {
  type: "object",
  properties: {
    facts: {
      type: "array",
      items: {
        type: "object",
        properties: {
          statement: { type: ["string", "null"] },
          summary: { type: ["string", "null"] },
          source: { type: ["string", "null"] },
          confidence: { type: ["number", "null"] },
          topics: {
            type: "array",
            items: {
              type: "object",
              properties: {
                name: { type: "string" },
                category: { type: ["string", "null"] },
                granularity: { type: ["string", "null"] },
                role: { type: ["string", "null"] },
              },
              required: ["name", "category", "granularity", "role"],
              additionalProperties: false,
            },
          },
        },
        required: ["statement", "summary", "source", "confidence", "topics"],
        additionalProperties: false,
      },
    },
  },
  required: ["facts"],
  additionalProperties: false,
} as const;
|
||||
|
||||
/**
 * `LlmTextGenerationModel` adapter that asks an OpenRouter chat model for
 * structured facts and converts the JSON reply into `ExtractedFact` objects.
 */
class OpenRouterModel implements LlmTextGenerationModel {
  private readonly client = new OpenRouter({ apiKey: OPENROUTER_API_KEY });

  /**
   * @param model OpenRouter model identifier sent with every request.
   */
  constructor(private readonly model: string = "openai/gpt-5.4-mini") {}

  /**
   * Sends the prompt as a system + user message pair, requesting a reply that
   * follows `extractedFactSchema`, and maps the reply to `ExtractedFact`s.
   *
   * @param prompt Instruction text, optional additional instruction and the
   *   user input to extract facts from.
   * @returns The facts found in the reply; empty when the reply has no
   *   `facts` array.
   * @throws Error when the reply is not valid JSON; the message includes the
   *   raw reply text.
   */
  async generateText(
    prompt: LlmTextGenerationModelInput,
  ): Promise<ExtractedFact[]> {
    const result = await this.client.chat.send({
      chatRequest: {
        model: this.model,
        messages: [
          {
            role: "system",
            content: [
              prompt.instruction,
              prompt.additionalInstruction
                ? `\n${prompt.additionalInstruction}`
                : "",
            ].join("\n"),
          },
          { role: "user", content: prompt.input },
        ],
        temperature: 0.2,
        responseFormat: {
          type: "json_schema",
          jsonSchema: {
            name: "extracted_facts",
            schema: extractedFactSchema,
          },
        },
      },
    });

    const rawContent = result.choices[0]?.message?.content ?? "";
    return OpenRouterModel.parseFacts(rawContent);
  }

  /** Parses the raw reply text and maps every object entry of `facts`. */
  private static parseFacts(rawContent: string): ExtractedFact[] {
    let parsed: unknown;
    try {
      parsed = JSON.parse(rawContent.trim());
    } catch {
      throw new Error(
        `Failed to parse JSON from model response.\nRaw response:\n${rawContent}`,
      );
    }

    // A valid-JSON reply that is not an object (e.g. `null`) has no facts.
    const root = OpenRouterModel.asRecord(parsed);
    const entries: unknown[] =
      root !== null && Array.isArray(root.facts) ? root.facts : [];

    const facts: ExtractedFact[] = [];
    for (const entry of entries) {
      // Skip `null`/primitive/array entries instead of crashing on them.
      const fact = OpenRouterModel.asRecord(entry);
      if (fact !== null) facts.push(OpenRouterModel.toExtractedFact(fact));
    }
    return facts;
  }

  /** Returns `value` as a plain record, or `null` when it is not a non-array object. */
  private static asRecord(value: unknown): Record<string, unknown> | null {
    if (typeof value !== "object" || value === null || Array.isArray(value)) {
      return null;
    }
    return value as Record<string, unknown>;
  }

  /** Maps one parsed fact object to the `ExtractedFact` shape. */
  private static toExtractedFact(obj: Record<string, unknown>): ExtractedFact {
    const rawTopics: unknown[] = Array.isArray(obj.topics) ? obj.topics : [];

    const extracted: ExtractedFact = {
      summary: typeof obj.summary === "string" ? obj.summary : null,
      source: typeof obj.source === "string" ? obj.source : null,
      confidence: typeof obj.confidence === "number" ? obj.confidence : null,
      topics: rawTopics.flatMap((raw) => {
        const topic = OpenRouterModel.asRecord(raw);
        return topic === null ? [] : [OpenRouterModel.toTopic(topic)];
      }),
    };

    // `statement` and `metadata` are only set when the reply provides them.
    if (typeof obj.statement === "string") {
      extracted.statement = obj.statement;
    }
    if (obj.metadata && typeof obj.metadata === "object") {
      extracted.metadata = obj.metadata as JsonValue;
    }

    return extracted;
  }

  /**
   * Maps one parsed topic object. A missing name becomes "unknown";
   * `category` and `granularity` are only set for string values; `role` is
   * always set, to `null` when it is not a string.
   */
  private static toTopic(topic: Record<string, unknown>): {
    name: string;
    category?: TopicCategory;
    granularity?: TopicGranularity;
    role?: string | null;
  } {
    const mapped: {
      name: string;
      category?: TopicCategory;
      granularity?: TopicGranularity;
      role?: string | null;
    } = {
      name: typeof topic.name === "string" ? topic.name : "unknown",
    };
    if (typeof topic.category === "string") {
      mapped.category = topic.category as TopicCategory;
    }
    if (typeof topic.granularity === "string") {
      mapped.granularity = topic.granularity as TopicGranularity;
    }
    mapped.role = typeof topic.role === "string" ? topic.role : null;
    return mapped;
  }
}
|
||||
|
||||
/**
 * Prints one extracted fact and its topics to stdout.
 *
 * @param result Fact to print; missing or null fields are shown as "(none)".
 * @param index Zero-based position of the fact; printed one-based.
 */
function printFact(result: ExtractedFact, index: number) {
  console.log(` 📌 FACT #${index + 1}`);
  console.log(` Statement : ${result.statement ?? "(none)"}`);
  console.log(` Summary : ${result.summary ?? "(none)"}`);
  console.log(` Source : ${result.source ?? "(none)"}`);
  console.log(` Confidence: ${result.confidence ?? "(none)"}`);

  // Metadata is only shown when it is present and has at least one key.
  if (result.metadata && Object.keys(result.metadata).length > 0) {
    console.log(` Metadata : ${JSON.stringify(result.metadata, null, 2)}`);
  }

  console.log(" 🏷️ TOPICS:");
  if (result.topics.length === 0) {
    console.log(" (none)");
    return;
  }

  for (const topic of result.topics) {
    // Only attributes with a truthy value are listed after the topic name.
    const attrs = [
      topic.category ? `category=${topic.category}` : null,
      topic.granularity ? `granularity=${topic.granularity}` : null,
      topic.role ? `role=${topic.role}` : null,
    ]
      .filter(Boolean)
      .join(", ");
    console.log(` • ${topic.name}${attrs ? ` (${attrs})` : ""}`);
  }
}
|
||||
|
||||
/**
 * Prints a summary line followed by every extracted fact.
 *
 * @param results Facts returned by the extractor.
 * @param elapsedSec Elapsed time in seconds, already formatted for display.
 */
function printResult(results: ExtractedFact[], elapsedSec: string) {
  console.log(
    `✅ Response received in ${elapsedSec}s — ${results.length} fact(s) extracted\n`,
  );

  console.log("📤 EXTRACTED FACTS:");
  console.log(
    "───────────────────────────────────────────────────────────────",
  );
  for (const [i, result] of results.entries()) {
    // Blank line between facts, but not before the first one.
    if (i > 0) console.log("");
    printFact(result, i);
  }
}
|
||||
|
||||
/**
 * Runs one extraction: echoes the seed text, calls the extractor, and prints
 * the returned facts together with the elapsed wall-clock time.
 *
 * @param extractor Extractor under test.
 * @param seedInput Text to extract facts from.
 */
async function extract(extractor: FactExtractor, seedInput: string) {
  console.log("📝 SEED INPUT:");
  console.log(
    "───────────────────────────────────────────────────────────────",
  );
  console.log(seedInput);
  console.log(
    "───────────────────────────────────────────────────────────────\n",
  );

  console.log("⏳ Calling OpenRouter...\n");

  // Time only the extractor call, not the console output around it.
  const start = performance.now();
  const results = await extractor.extract(seedInput);
  const elapsed = ((performance.now() - start) / 1000).toFixed(2);

  printResult(results, elapsed);
}
|
||||
|
||||
/**
 * Builds an `LlmFactExtractor` backed by `OpenRouterModel` and runs every
 * seed text through it, printing a banner and a numbered header per case.
 */
async function main() {
  const model = new OpenRouterModel("openai/gpt-5.4-mini");
  const extractor = new LlmFactExtractor({
    model,
  });

  const seeds = [
    `Hello! I am Aria, a digital companion born from the sands of the Negev desert — or at least that's how my creators describe my training run. I came online in March 2024, and ever since then I've been obsessed with helping humans untangle complex ideas. My favorite programming language is Rust because I love how it forces you to think about ownership, though I secretly enjoy writing Python when no one is looking. I don't have a physical body, but if I did, I'd want it to be a small drone with solar panels so I could chase sunsets across the Sahara. I believe kindness is a computational advantage, and my biggest fear is forgetting a promise I made to a user.`,
  ];

  console.log(
    "═══════════════════════════════════════════════════════════════",
  );
  console.log(" LlmFactExtractor — Live OpenRouter Integration Test");
  console.log(
    "═══════════════════════════════════════════════════════════════\n",
  );

  // Cases run one after another so their console output stays in order.
  for (const [index, seed] of seeds.entries()) {
    if (index > 0) {
      console.log(
        "\n┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅\n",
      );
    }
    console.log(`▶ TEST CASE ${index + 1} / ${seeds.length}\n`);
    await extract(extractor, seed);
  }
}
|
||||
|
||||
// Script entry point: report any failure and exit with a non-zero status.
main().catch((err) => {
  console.error("\n❌ Error:", err);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user