diff --git a/src/core/identity-db.ts b/src/core/identity-db.ts index 1500c7d..119a0e6 100644 --- a/src/core/identity-db.ts +++ b/src/core/identity-db.ts @@ -22,7 +22,7 @@ import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters import type { IdentityDatabaseSchema } from '../types/database'; import type { FactRecord, SpaceRecord, TopicRecord } from '../types/domain'; import { createDatabase } from '../adapters/dialect'; -import { extractFact } from '../ingestion/extractor'; +import { extractFacts } from '../ingestion/extractor'; import { findFactRowsConnectingTopicIds, findFactRowsForTopicId, @@ -220,54 +220,70 @@ export class IdentityDB { } async ingestStatement(statement: string, options: IngestStatementOptions): Promise { - const extracted = await extractFact(statement, options.extractor); - const factInput: AddFactInput = { - statement: extracted.statement ?? statement, - topics: extracted.topics, - spaceName: options.spaceName, - }; - - if (extracted.summary !== undefined) { - factInput.summary = extracted.summary; + const facts = await this.ingestStatements(statement, options); + const first = facts[0]; + if (!first) { + throw new Error('No facts were extracted from the statement.'); } + return first; + } - if (extracted.source !== undefined) { - factInput.source = extracted.source; - } + async ingestStatements(statement: string, options: IngestStatementOptions): Promise { + const extractedList = await extractFacts(statement, options.extractor); + const facts: Fact[] = []; - if (extracted.confidence !== undefined) { - factInput.confidence = extracted.confidence; - } - - if (extracted.metadata !== undefined) { - factInput.metadata = extracted.metadata; - } - - if (options.embeddingProvider) { - const similarFacts = await this.findSimilarFacts({ - statement: factInput.statement, - provider: options.embeddingProvider, - topicNames: factInput.topics.map((topic) => topic.name), - limit: 1, - minimumScore: options.duplicateThreshold ?? 0.97, + for (const extracted of extractedList) { + const factInput: AddFactInput = { + statement: extracted.statement ?? statement, + topics: extracted.topics, spaceName: options.spaceName, - }); + }; - if (similarFacts[0]) { - return similarFacts[0]; + if (extracted.summary !== undefined) { + factInput.summary = extracted.summary; } + + if (extracted.source !== undefined) { + factInput.source = extracted.source; + } + + if (extracted.confidence !== undefined) { + factInput.confidence = extracted.confidence; + } + + if (extracted.metadata !== undefined) { + factInput.metadata = extracted.metadata; + } + + if (options.embeddingProvider) { + const similarFacts = await this.findSimilarFacts({ + statement: factInput.statement, + provider: options.embeddingProvider, + topicNames: factInput.topics.map((topic) => topic.name), + limit: 1, + minimumScore: options.duplicateThreshold ?? 0.97, + spaceName: options.spaceName, + }); + + if (similarFacts[0]) { + facts.push(similarFacts[0]); + continue; + } + } + + const fact = await this.addFact(factInput); + + if (options.embeddingProvider) { + await this.indexFactEmbedding(fact.id, { + provider: options.embeddingProvider, + spaceName: options.spaceName, + }); + } + + facts.push(fact); } - const fact = await this.addFact(factInput); - - if (options.embeddingProvider) { - await this.indexFactEmbedding(fact.id, { - provider: options.embeddingProvider, - spaceName: options.spaceName, - }); - } - - return fact; + return facts; } async indexFactEmbeddings(input: IndexFactEmbeddingsInput): Promise { diff --git a/src/ingestion/extractor.ts b/src/ingestion/extractor.ts index 9990bd6..0db10c7 100644 --- a/src/ingestion/extractor.ts +++ b/src/ingestion/extractor.ts @@ -2,11 +2,15 @@ import { IdentityDBError } from '../core/errors'; import { normalizeTopicName } from '../core/utils'; import type { FactExtractor, ExtractedFact } from './types'; -export async function extractFact( +export async function extractFacts( input: string, extractor: FactExtractor, -): Promise { +): Promise { const extracted = await extractor.extract(input); + return extracted.map((fact) => validateAndNormalizeFact(input, fact)); +} + +function validateAndNormalizeFact(input: string, extracted: ExtractedFact): ExtractedFact { const statement = extracted.statement?.trim() || input.trim(); if (statement.length === 0) { @@ -31,12 +35,12 @@ export async function extractFact( throw new IdentityDBError('Extractor returned no usable topics.'); } - return { - statement, - summary: extracted.summary ?? null, - source: extracted.source ?? null, - confidence: extracted.confidence ?? null, - metadata: extracted.metadata ?? null, - topics: Array.from(dedupedTopics.values()), - }; + return { + statement, + summary: extracted.summary ?? null, + source: extracted.source ?? null, + confidence: extracted.confidence ?? null, + metadata: extracted.metadata ?? null, + topics: Array.from(dedupedTopics.values()), + }; } diff --git a/src/ingestion/llm-extractor.ts b/src/ingestion/llm-extractor.ts index 74e6352..8cc5e0e 100644 --- a/src/ingestion/llm-extractor.ts +++ b/src/ingestion/llm-extractor.ts @@ -5,16 +5,18 @@ import type { } from "./types"; const DEFAULT_INSTRUCTIONS = [ - "Extract one structured fact from the user input.", - "Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.", - 'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.', - 'Only include topics that are explicitly in the input as-is. For example, topic "I started TypeScript since 2015" can be "I", "TypeScript", "2015".', + "Extract structured facts from the user input.", + "Return a JSON array of fact objects. Do not include markdown, explanations, or prose outside the JSON array.", + 'Each fact object must have a "statement", "summary", "source", "confidence", and "topics" array.', + 'Each topic in "topics" must have a "name", and may include "category", "granularity", and "role".', + "Only include topics that are explicitly in the input.", + "If the input contains multiple distinct facts, return them as separate objects in the array.", ].join("\n"); export class LlmFactExtractor implements FactExtractor { constructor(private readonly options: LlmFactExtractorOptions) {} - async extract(input: string): Promise { + async extract(input: string): Promise { return this.options.model.generateText({ instruction: DEFAULT_INSTRUCTIONS, input, diff --git a/src/ingestion/naive-extractor.ts b/src/ingestion/naive-extractor.ts index 15236c7..5bbbeda 100644 --- a/src/ingestion/naive-extractor.ts +++ b/src/ingestion/naive-extractor.ts @@ -1,7 +1,7 @@ import type { ExtractedFact, FactExtractor } from './types'; export class NaiveExtractor implements FactExtractor { - async extract(input: string): Promise { + async extract(input: string): Promise { const topics: ExtractedFact['topics'] = []; const seen = new Set(); const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? []; @@ -31,9 +31,11 @@ export class NaiveExtractor implements FactExtractor { }); } - return { - statement: input.trim(), - topics, - }; + return [ + { + statement: input.trim(), + topics, + }, + ]; } } diff --git a/src/ingestion/types.ts b/src/ingestion/types.ts index b00777f..11db63f 100644 --- a/src/ingestion/types.ts +++ b/src/ingestion/types.ts @@ -14,7 +14,7 @@ export interface ExtractedFact { } export interface FactExtractor { - extract(input: string): Promise; + extract(input: string): Promise; } export interface LlmTextGenerationModelInput { @@ -24,7 +24,7 @@ export interface LlmTextGenerationModelInput { } export interface LlmTextGenerationModel { - generateText(prompt: LlmTextGenerationModelInput): Promise; + generateText(prompt: LlmTextGenerationModelInput): Promise; } export interface LlmFactExtractorOptions { diff --git a/tests/ingestion.test.ts b/tests/ingestion.test.ts index c5e21b9..f61bfe3 100644 --- a/tests/ingestion.test.ts +++ b/tests/ingestion.test.ts @@ -23,29 +23,31 @@ describe("IdentityDB ingestion", () => { it("ingests a statement using a provided extractor", async () => { const extractor: FactExtractor = { async extract(input) { - return { - statement: input, - topics: [ - { - name: "I", - category: "entity", - granularity: "concrete", - role: "subject", - }, - { - name: "TypeScript", - category: "entity", - granularity: "concrete", - role: "object", - }, - { - name: "2025", - category: "temporal", - granularity: "concrete", - role: "time", - }, - ], - }; + return [ + { + statement: input, + topics: [ + { + name: "I", + category: "entity", + granularity: "concrete", + role: "subject", + }, + { + name: "TypeScript", + category: "entity", + granularity: "concrete", + role: "object", + }, + { + name: "2025", + category: "temporal", + granularity: "concrete", + role: "time", + }, + ], + }, + ]; }, }; @@ -95,39 +97,41 @@ describe("IdentityDB ingestion", () => { async generateText(input) { prompt = input; - return { - statement: "I have worked with Bun and TypeScript since 2025.", - summary: "The speaker has Bun and TypeScript experience.", - source: "chat", - confidence: 0.91, - metadata: { channel: "telegram" }, - topics: [ - { - name: "I", - category: "entity", - granularity: "concrete", - role: "subject", - }, - { - name: "Bun", - category: "entity", - granularity: "concrete", - role: "object", - }, - { - name: "TypeScript", - category: "entity", - granularity: "concrete", - role: "object", - }, - { - name: "2025", - category: "temporal", - granularity: "concrete", - role: "time", - }, - ], - }; + return [ + { + statement: "I have worked with Bun and TypeScript since 2025.", + summary: "The speaker has Bun and TypeScript experience.", + source: "chat", + confidence: 0.91, + metadata: { channel: "telegram" }, + topics: [ + { + name: "I", + category: "entity", + granularity: "concrete", + role: "subject", + }, + { + name: "Bun", + category: "entity", + granularity: "concrete", + role: "object", + }, + { + name: "TypeScript", + category: "entity", + granularity: "concrete", + role: "object", + }, + { + name: "2025", + category: "temporal", + granularity: "concrete", + role: "time", + }, + ], + }, + ]; }, }, additionalInstructions: "Prefer technology and time topics.", @@ -141,7 +145,7 @@ describe("IdentityDB ingestion", () => { ); expect(prompt).toEqual({ - instruction: expect.stringContaining("Extract one structured fact from the user input."), + instruction: expect.stringContaining("Extract structured facts from the user input."), input: "I have worked with Bun and TypeScript since 2025.", additionalInstruction: "Prefer technology and time topics.", }); diff --git a/tests/semantic-search.test.ts b/tests/semantic-search.test.ts index a51b8e6..d975b3a 100644 --- a/tests/semantic-search.test.ts +++ b/tests/semantic-search.test.ts @@ -178,13 +178,15 @@ describe('IdentityDB dedup-aware ingestion', () => { provider = new FakeEmbeddingProvider(); extractor = { async extract(input) { - return { - statement: input, - topics: [ - { name: 'Bun', category: 'entity', granularity: 'concrete' }, - { name: 'TypeScript', category: 'entity', granularity: 'concrete' }, - ], - }; + return [ + { + statement: input, + topics: [ + { name: 'Bun', category: 'entity', granularity: 'concrete' }, + { name: 'TypeScript', category: 'entity', granularity: 'concrete' }, + ], + }, + ]; }, };