feat: make FactExtractor extract multiple facts per input
This commit is contained in:
@@ -22,7 +22,7 @@ import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters
|
||||
import type { IdentityDatabaseSchema } from '../types/database';
|
||||
import type { FactRecord, SpaceRecord, TopicRecord } from '../types/domain';
|
||||
import { createDatabase } from '../adapters/dialect';
|
||||
import { extractFact } from '../ingestion/extractor';
|
||||
import { extractFacts } from '../ingestion/extractor';
|
||||
import {
|
||||
findFactRowsConnectingTopicIds,
|
||||
findFactRowsForTopicId,
|
||||
@@ -220,54 +220,70 @@ export class IdentityDB {
|
||||
}
|
||||
|
||||
async ingestStatement(statement: string, options: IngestStatementOptions): Promise<Fact> {
|
||||
const extracted = await extractFact(statement, options.extractor);
|
||||
const factInput: AddFactInput = {
|
||||
statement: extracted.statement ?? statement,
|
||||
topics: extracted.topics,
|
||||
spaceName: options.spaceName,
|
||||
};
|
||||
|
||||
if (extracted.summary !== undefined) {
|
||||
factInput.summary = extracted.summary;
|
||||
const facts = await this.ingestStatements(statement, options);
|
||||
const first = facts[0];
|
||||
if (!first) {
|
||||
throw new Error('No facts were extracted from the statement.');
|
||||
}
|
||||
return first;
|
||||
}
|
||||
|
||||
if (extracted.source !== undefined) {
|
||||
factInput.source = extracted.source;
|
||||
}
|
||||
async ingestStatements(statement: string, options: IngestStatementOptions): Promise<Fact[]> {
|
||||
const extractedList = await extractFacts(statement, options.extractor);
|
||||
const facts: Fact[] = [];
|
||||
|
||||
if (extracted.confidence !== undefined) {
|
||||
factInput.confidence = extracted.confidence;
|
||||
}
|
||||
|
||||
if (extracted.metadata !== undefined) {
|
||||
factInput.metadata = extracted.metadata;
|
||||
}
|
||||
|
||||
if (options.embeddingProvider) {
|
||||
const similarFacts = await this.findSimilarFacts({
|
||||
statement: factInput.statement,
|
||||
provider: options.embeddingProvider,
|
||||
topicNames: factInput.topics.map((topic) => topic.name),
|
||||
limit: 1,
|
||||
minimumScore: options.duplicateThreshold ?? 0.97,
|
||||
for (const extracted of extractedList) {
|
||||
const factInput: AddFactInput = {
|
||||
statement: extracted.statement ?? statement,
|
||||
topics: extracted.topics,
|
||||
spaceName: options.spaceName,
|
||||
});
|
||||
};
|
||||
|
||||
if (similarFacts[0]) {
|
||||
return similarFacts[0];
|
||||
if (extracted.summary !== undefined) {
|
||||
factInput.summary = extracted.summary;
|
||||
}
|
||||
|
||||
if (extracted.source !== undefined) {
|
||||
factInput.source = extracted.source;
|
||||
}
|
||||
|
||||
if (extracted.confidence !== undefined) {
|
||||
factInput.confidence = extracted.confidence;
|
||||
}
|
||||
|
||||
if (extracted.metadata !== undefined) {
|
||||
factInput.metadata = extracted.metadata;
|
||||
}
|
||||
|
||||
if (options.embeddingProvider) {
|
||||
const similarFacts = await this.findSimilarFacts({
|
||||
statement: factInput.statement,
|
||||
provider: options.embeddingProvider,
|
||||
topicNames: factInput.topics.map((topic) => topic.name),
|
||||
limit: 1,
|
||||
minimumScore: options.duplicateThreshold ?? 0.97,
|
||||
spaceName: options.spaceName,
|
||||
});
|
||||
|
||||
if (similarFacts[0]) {
|
||||
facts.push(similarFacts[0]);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const fact = await this.addFact(factInput);
|
||||
|
||||
if (options.embeddingProvider) {
|
||||
await this.indexFactEmbedding(fact.id, {
|
||||
provider: options.embeddingProvider,
|
||||
spaceName: options.spaceName,
|
||||
});
|
||||
}
|
||||
|
||||
facts.push(fact);
|
||||
}
|
||||
|
||||
const fact = await this.addFact(factInput);
|
||||
|
||||
if (options.embeddingProvider) {
|
||||
await this.indexFactEmbedding(fact.id, {
|
||||
provider: options.embeddingProvider,
|
||||
spaceName: options.spaceName,
|
||||
});
|
||||
}
|
||||
|
||||
return fact;
|
||||
return facts;
|
||||
}
|
||||
|
||||
async indexFactEmbeddings(input: IndexFactEmbeddingsInput): Promise<void> {
|
||||
|
||||
@@ -2,11 +2,15 @@ import { IdentityDBError } from '../core/errors';
|
||||
import { normalizeTopicName } from '../core/utils';
|
||||
import type { FactExtractor, ExtractedFact } from './types';
|
||||
|
||||
export async function extractFact(
|
||||
export async function extractFacts(
|
||||
input: string,
|
||||
extractor: FactExtractor,
|
||||
): Promise<ExtractedFact> {
|
||||
): Promise<ExtractedFact[]> {
|
||||
const extracted = await extractor.extract(input);
|
||||
return extracted.map((fact) => validateAndNormalizeFact(input, fact));
|
||||
}
|
||||
|
||||
function validateAndNormalizeFact(input: string, extracted: ExtractedFact): ExtractedFact {
|
||||
const statement = extracted.statement?.trim() || input.trim();
|
||||
|
||||
if (statement.length === 0) {
|
||||
@@ -31,12 +35,12 @@ export async function extractFact(
|
||||
throw new IdentityDBError('Extractor returned no usable topics.');
|
||||
}
|
||||
|
||||
return {
|
||||
statement,
|
||||
summary: extracted.summary ?? null,
|
||||
source: extracted.source ?? null,
|
||||
confidence: extracted.confidence ?? null,
|
||||
metadata: extracted.metadata ?? null,
|
||||
topics: Array.from(dedupedTopics.values()),
|
||||
};
|
||||
return {
|
||||
statement,
|
||||
summary: extracted.summary ?? null,
|
||||
source: extracted.source ?? null,
|
||||
confidence: extracted.confidence ?? null,
|
||||
metadata: extracted.metadata ?? null,
|
||||
topics: Array.from(dedupedTopics.values()),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -5,16 +5,18 @@ import type {
|
||||
} from "./types";
|
||||
|
||||
const DEFAULT_INSTRUCTIONS = [
|
||||
"Extract one structured fact from the user input.",
|
||||
"Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.",
|
||||
'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.',
|
||||
'Only include topics that are explicitly in the input as-is. For example, topic "I started TypeScript since 2015" can be "I", "TypeScript", "2015".',
|
||||
"Extract structured facts from the user input.",
|
||||
"Return a JSON array of fact objects. Do not include markdown, explanations, or prose outside the JSON array.",
|
||||
'Each fact object must have a "statement", "summary", "source", "confidence", and "topics" array.',
|
||||
'Each topic in "topics" must have a "name", and may include "category", "granularity", and "role".',
|
||||
"Only include topics that are explicitly in the input.",
|
||||
"If the input contains multiple distinct facts, return them as separate objects in the array.",
|
||||
].join("\n");
|
||||
|
||||
export class LlmFactExtractor implements FactExtractor {
|
||||
constructor(private readonly options: LlmFactExtractorOptions) {}
|
||||
|
||||
async extract(input: string): Promise<ExtractedFact> {
|
||||
async extract(input: string): Promise<ExtractedFact[]> {
|
||||
return this.options.model.generateText({
|
||||
instruction: DEFAULT_INSTRUCTIONS,
|
||||
input,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import type { ExtractedFact, FactExtractor } from './types';
|
||||
|
||||
export class NaiveExtractor implements FactExtractor {
|
||||
async extract(input: string): Promise<ExtractedFact> {
|
||||
async extract(input: string): Promise<ExtractedFact[]> {
|
||||
const topics: ExtractedFact['topics'] = [];
|
||||
const seen = new Set<string>();
|
||||
const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? [];
|
||||
@@ -31,9 +31,11 @@ export class NaiveExtractor implements FactExtractor {
|
||||
});
|
||||
}
|
||||
|
||||
return {
|
||||
statement: input.trim(),
|
||||
topics,
|
||||
};
|
||||
return [
|
||||
{
|
||||
statement: input.trim(),
|
||||
topics,
|
||||
},
|
||||
];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ export interface ExtractedFact {
|
||||
}
|
||||
|
||||
export interface FactExtractor {
|
||||
extract(input: string): Promise<ExtractedFact>;
|
||||
extract(input: string): Promise<ExtractedFact[]>;
|
||||
}
|
||||
|
||||
export interface LlmTextGenerationModelInput {
|
||||
@@ -24,7 +24,7 @@ export interface LlmTextGenerationModelInput {
|
||||
}
|
||||
|
||||
export interface LlmTextGenerationModel {
|
||||
generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact>;
|
||||
generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact[]>;
|
||||
}
|
||||
|
||||
export interface LlmFactExtractorOptions {
|
||||
|
||||
@@ -23,29 +23,31 @@ describe("IdentityDB ingestion", () => {
|
||||
it("ingests a statement using a provided extractor", async () => {
|
||||
const extractor: FactExtractor = {
|
||||
async extract(input) {
|
||||
return {
|
||||
statement: input,
|
||||
topics: [
|
||||
{
|
||||
name: "I",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "subject",
|
||||
},
|
||||
{
|
||||
name: "TypeScript",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "object",
|
||||
},
|
||||
{
|
||||
name: "2025",
|
||||
category: "temporal",
|
||||
granularity: "concrete",
|
||||
role: "time",
|
||||
},
|
||||
],
|
||||
};
|
||||
return [
|
||||
{
|
||||
statement: input,
|
||||
topics: [
|
||||
{
|
||||
name: "I",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "subject",
|
||||
},
|
||||
{
|
||||
name: "TypeScript",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "object",
|
||||
},
|
||||
{
|
||||
name: "2025",
|
||||
category: "temporal",
|
||||
granularity: "concrete",
|
||||
role: "time",
|
||||
},
|
||||
],
|
||||
},
|
||||
];
|
||||
},
|
||||
};
|
||||
|
||||
@@ -95,39 +97,41 @@ describe("IdentityDB ingestion", () => {
|
||||
async generateText(input) {
|
||||
prompt = input;
|
||||
|
||||
return {
|
||||
statement: "I have worked with Bun and TypeScript since 2025.",
|
||||
summary: "The speaker has Bun and TypeScript experience.",
|
||||
source: "chat",
|
||||
confidence: 0.91,
|
||||
metadata: { channel: "telegram" },
|
||||
topics: [
|
||||
{
|
||||
name: "I",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "subject",
|
||||
},
|
||||
{
|
||||
name: "Bun",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "object",
|
||||
},
|
||||
{
|
||||
name: "TypeScript",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "object",
|
||||
},
|
||||
{
|
||||
name: "2025",
|
||||
category: "temporal",
|
||||
granularity: "concrete",
|
||||
role: "time",
|
||||
},
|
||||
],
|
||||
};
|
||||
return [
|
||||
{
|
||||
statement: "I have worked with Bun and TypeScript since 2025.",
|
||||
summary: "The speaker has Bun and TypeScript experience.",
|
||||
source: "chat",
|
||||
confidence: 0.91,
|
||||
metadata: { channel: "telegram" },
|
||||
topics: [
|
||||
{
|
||||
name: "I",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "subject",
|
||||
},
|
||||
{
|
||||
name: "Bun",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "object",
|
||||
},
|
||||
{
|
||||
name: "TypeScript",
|
||||
category: "entity",
|
||||
granularity: "concrete",
|
||||
role: "object",
|
||||
},
|
||||
{
|
||||
name: "2025",
|
||||
category: "temporal",
|
||||
granularity: "concrete",
|
||||
role: "time",
|
||||
},
|
||||
],
|
||||
},
|
||||
];
|
||||
},
|
||||
},
|
||||
additionalInstructions: "Prefer technology and time topics.",
|
||||
@@ -141,7 +145,7 @@ describe("IdentityDB ingestion", () => {
|
||||
);
|
||||
|
||||
expect(prompt).toEqual({
|
||||
instruction: expect.stringContaining("Extract one structured fact from the user input."),
|
||||
instruction: expect.stringContaining("Extract structured facts from the user input."),
|
||||
input: "I have worked with Bun and TypeScript since 2025.",
|
||||
additionalInstruction: "Prefer technology and time topics.",
|
||||
});
|
||||
|
||||
@@ -178,13 +178,15 @@ describe('IdentityDB dedup-aware ingestion', () => {
|
||||
provider = new FakeEmbeddingProvider();
|
||||
extractor = {
|
||||
async extract(input) {
|
||||
return {
|
||||
statement: input,
|
||||
topics: [
|
||||
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
|
||||
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
|
||||
],
|
||||
};
|
||||
return [
|
||||
{
|
||||
statement: input,
|
||||
topics: [
|
||||
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
|
||||
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
|
||||
],
|
||||
},
|
||||
];
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user