/**
 * Live integration test for LlmFactExtractor using OpenRouter SDK.
 *
 * Usage:
 *   export OPENROUTER_API_KEY="sk-or-v1-..."
 *   bun run scripts/test-llm-extractor.ts
 *
 * Or create a .env.test-llm-extractor file in the project root:
 *   OPENROUTER_API_KEY=sk-or-v1-...
 *
 * The env file path is resolved against the current working directory, so
 * run the script from the project root for the file to be picked up.
 */

import { existsSync, readFileSync } from "fs";
import { resolve } from "path";
import { OpenRouter } from "@openrouter/sdk";
import { LlmFactExtractor } from "../src/ingestion/llm-extractor";
import type {
  ExtractedFact,
  FactExtractor,
  LlmTextGenerationModel,
  LlmTextGenerationModelInput,
} from "../src/ingestion/types";
import type {
  JsonValue,
  TopicCategory,
  TopicGranularity,
} from "../src/types/domain";

/** Model id used both as the constructor default and by `main`. */
const DEFAULT_MODEL = "openai/gpt-5.4-mini";

/** Horizontal rules used to frame console output. */
const THIN_RULE =
  "───────────────────────────────────────────────────────────────";
const THICK_RULE =
  "═══════════════════════════════════════════════════════════════";
const CASE_RULE = "┅".repeat(THIN_RULE.length);

/**
 * Removes one pair of matching surrounding quotes (`"…"` or `'…'`).
 *
 * A value shorter than two characters is returned untouched, so a lone quote
 * character is not mistaken for an empty quoted string.
 */
function stripMatchingQuotes(value: string): string {
  if (value.length < 2) return value;
  const first = value[0];
  const last = value[value.length - 1];
  const isQuoted = (first === '"' || first === "'") && first === last;
  return isQuoted ? value.slice(1, -1) : value;
}

/**
 * Loads `KEY=VALUE` pairs from a dotenv-style file into `process.env`.
 *
 * Blank lines, lines starting with `#`, lines without `=`, and lines with an
 * empty key are skipped. A missing file is silently ignored. Values read from
 * the file overwrite variables that are already set in `process.env`.
 *
 * @param filePath Path to the env file, resolved against the working directory.
 */
function loadEnvFile(filePath: string): void {
  const fullPath = resolve(filePath);
  if (!existsSync(fullPath)) return;

  const content = readFileSync(fullPath, "utf-8");
  for (const line of content.split(/\r?\n/)) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;

    const eqIndex = trimmed.indexOf("=");
    if (eqIndex === -1) continue;

    const key = trimmed.slice(0, eqIndex).trim();
    // A line such as "=value" has no variable name to assign to.
    if (!key) continue;

    process.env[key] = stripMatchingQuotes(trimmed.slice(eqIndex + 1).trim());
  }
}

loadEnvFile(".env.test-llm-extractor");

const OPENROUTER_API_KEY = process.env.OPENROUTER_API_KEY;
if (!OPENROUTER_API_KEY) {
  console.error("Error: OPENROUTER_API_KEY environment variable is required.");
  process.exit(1);
}

/**
 * JSON schema passed as the `json_schema` response format. Every property is
 * listed in `required`; optional values are expressed as nullable types.
 */
const extractedFactSchema = {
  type: "object",
  properties: {
    facts: {
      type: "array",
      items: {
        type: "object",
        properties: {
          statement: { type: ["string", "null"] },
          summary: { type: ["string", "null"] },
          source: { type: ["string", "null"] },
          confidence: { type: ["number", "null"] },
          topics: {
            type: "array",
            items: {
              type: "object",
              properties: {
                name: { type: "string" },
                category: { type: ["string", "null"] },
                granularity: { type: ["string", "null"] },
                role: { type: ["string", "null"] },
              },
              required: ["name", "category", "granularity", "role"],
              additionalProperties: false,
            },
          },
        },
        required: ["statement", "summary", "source", "confidence", "topics"],
        additionalProperties: false,
      },
    },
  },
  required: ["facts"],
  additionalProperties: false,
} as const;

/** Shape of a single topic attached to an extracted fact. */
type MappedTopic = {
  name: string;
  category?: TopicCategory;
  granularity?: TopicGranularity;
  role?: string | null;
};

/**
 * Views an arbitrary parsed JSON value as a property bag. Anything that is
 * not a non-null object (including `null` itself) becomes an empty record, so
 * property reads yield `undefined` instead of throwing.
 */
function asRecord(value: unknown): Record<string, unknown> {
  return typeof value === "object" && value !== null
    ? (value as Record<string, unknown>)
    : {};
}

/** Returns `value` when it is a string, otherwise `null`. */
function stringOrNull(value: unknown): string | null {
  return typeof value === "string" ? value : null;
}

/**
 * Maps one raw topic entry to the topic shape. A missing or non-string name
 * falls back to "unknown"; `category` and `granularity` are only set when
 * they are strings; `role` is always set (string or `null`).
 */
function toMappedTopic(raw: unknown): MappedTopic {
  const topic = asRecord(raw);
  const mapped: MappedTopic = {
    name: typeof topic.name === "string" ? topic.name : "unknown",
  };
  if (typeof topic.category === "string") {
    // The schema only constrains this to a string; the cast is unchecked.
    mapped.category = topic.category as TopicCategory;
  }
  if (typeof topic.granularity === "string") {
    mapped.granularity = topic.granularity as TopicGranularity;
  }
  mapped.role = stringOrNull(topic.role);
  return mapped;
}

/**
 * Maps one raw fact entry to an `ExtractedFact`. Fields of the wrong type
 * become `null` (or an empty topic list); `statement` and `metadata` are only
 * set when present with a usable type.
 */
function toExtractedFact(raw: unknown): ExtractedFact {
  const obj = asRecord(raw);
  const fact: ExtractedFact = {
    summary: stringOrNull(obj.summary),
    source: stringOrNull(obj.source),
    confidence: typeof obj.confidence === "number" ? obj.confidence : null,
    topics: Array.isArray(obj.topics)
      ? obj.topics.map((topic: unknown) => toMappedTopic(topic))
      : [],
  };

  if (typeof obj.statement === "string") {
    fact.statement = obj.statement;
  }
  if (obj.metadata && typeof obj.metadata === "object") {
    fact.metadata = obj.metadata as JsonValue;
  }

  return fact;
}

/**
 * Parses the model's raw text as JSON and returns its `facts` array.
 *
 * @returns The raw `facts` entries, or an empty array when the root value is
 *   not an object or has no array-valued `facts` property.
 * @throws Error when the text is not valid JSON; the message includes the raw
 *   response to ease debugging.
 */
function parseFactsPayload(rawContent: string): unknown[] {
  let parsed: unknown;
  try {
    parsed = JSON.parse(rawContent.trim());
  } catch {
    throw new Error(
      `Failed to parse JSON from model response.\nRaw response:\n${rawContent}`,
    );
  }

  const facts = asRecord(parsed).facts;
  return Array.isArray(facts) ? facts : [];
}

/**
 * `LlmTextGenerationModel` backed by the OpenRouter chat API, requesting
 * structured output that follows `extractedFactSchema`.
 */
class OpenRouterModel implements LlmTextGenerationModel {
  private readonly client = new OpenRouter({ apiKey: OPENROUTER_API_KEY });

  /** @param model OpenRouter model id to send requests to. */
  constructor(private readonly model: string = DEFAULT_MODEL) {}

  /**
   * Sends the prompt to the model and converts the JSON reply to facts.
   *
   * The instruction (plus the additional instruction, when given, separated
   * by a blank line) is sent as the system message; the input is sent as the
   * user message.
   *
   * @throws Error when the reply is not valid JSON.
   */
  async generateText(
    prompt: LlmTextGenerationModelInput,
  ): Promise<ExtractedFact[]> {
    const systemPrompt = prompt.additionalInstruction
      ? `${prompt.instruction}\n\n${prompt.additionalInstruction}`
      : prompt.instruction;

    const result = await this.client.chat.send({
      chatRequest: {
        model: this.model,
        messages: [
          { role: "system", content: systemPrompt },
          { role: "user", content: prompt.input },
        ],
        temperature: 0.2,
        responseFormat: {
          type: "json_schema",
          jsonSchema: {
            name: "extracted_facts",
            schema: extractedFactSchema,
          },
        },
      },
    });

    // Only plain-text content can be parsed; anything else (missing, null,
    // or structured content) is treated as empty and fails JSON parsing.
    const content: unknown = result.choices?.[0]?.message?.content;
    const rawContent = typeof content === "string" ? content : "";

    return parseFactsPayload(rawContent).map((entry) =>
      toExtractedFact(entry),
    );
  }
}

/** Prints one extracted fact and its topics to the console. */
function printFact(result: ExtractedFact, index: number): void {
  console.log(` 📌 FACT #${index + 1}`);
  console.log(` Statement : ${result.statement ?? "(none)"}`);
  console.log(` Summary : ${result.summary ?? "(none)"}`);
  console.log(` Source : ${result.source ?? "(none)"}`);
  console.log(` Confidence: ${result.confidence ?? "(none)"}`);

  if (result.metadata && Object.keys(result.metadata).length > 0) {
    console.log(` Metadata : ${JSON.stringify(result.metadata, null, 2)}`);
  }

  console.log(" 🏷️ TOPICS:");
  if (result.topics.length === 0) {
    console.log(" (none)");
    return;
  }

  for (const topic of result.topics) {
    const attrs = [
      topic.category ? `category=${topic.category}` : null,
      topic.granularity ? `granularity=${topic.granularity}` : null,
      topic.role ? `role=${topic.role}` : null,
    ]
      .filter(Boolean)
      .join(", ");
    console.log(` • ${topic.name}${attrs ? ` (${attrs})` : ""}`);
  }
}

/** Prints the timing summary followed by every extracted fact. */
function printResult(results: ExtractedFact[], elapsedSec: string): void {
  console.log(
    `✅ Response received in ${elapsedSec}s — ${results.length} fact(s) extracted\n`,
  );

  console.log("📤 EXTRACTED FACTS:");
  console.log(THIN_RULE);
  results.forEach((fact, index) => {
    // Blank line between facts, but not before the first one.
    if (index > 0) console.log("");
    printFact(fact, index);
  });
}

/**
 * Echoes the seed input, runs the extractor on it, and prints the extracted
 * facts together with the elapsed wall-clock time.
 */
async function extract(
  extractor: FactExtractor,
  seedInput: string,
): Promise<void> {
  console.log("📝 SEED INPUT:");
  console.log(THIN_RULE);
  console.log(seedInput);
  console.log(`${THIN_RULE}\n`);

  console.log("⏳ Calling OpenRouter...\n");

  const start = performance.now();
  const results = await extractor.extract(seedInput);
  const elapsed = ((performance.now() - start) / 1000).toFixed(2);

  printResult(results, elapsed);
}

/** Runs every seed input through the extractor, one after another. */
async function main(): Promise<void> {
  const model = new OpenRouterModel(DEFAULT_MODEL);
  const extractor = new LlmFactExtractor({
    model,
  });

  const seeds = [
    `Hello! I am Aria, a digital companion born from the sands of the Negev desert — or at least that's how my creators describe my training run. I came online in March 2024, and ever since then I've been obsessed with helping humans untangle complex ideas. My favorite programming language is Rust because I love how it forces you to think about ownership, though I secretly enjoy writing Python when no one is looking. I don't have a physical body, but if I did, I'd want it to be a small drone with solar panels so I could chase sunsets across the Sahara. I believe kindness is a computational advantage, and my biggest fear is forgetting a promise I made to a user.`,
  ];

  console.log(THICK_RULE);
  console.log(" LlmFactExtractor — Live OpenRouter Integration Test");
  console.log(`${THICK_RULE}\n`);

  // Cases run sequentially on purpose so their console output stays grouped.
  for (const [index, seed] of seeds.entries()) {
    if (index > 0) {
      console.log(`\n${CASE_RULE}\n`);
    }
    console.log(`▶ TEST CASE ${index + 1} / ${seeds.length}\n`);
    await extract(extractor, seed);
  }
}

main().catch((err: unknown) => {
  console.error("\n❌ Error:", err);
  process.exit(1);
});