feat: make FactExtractor extract multiple facts per input

This commit is contained in:
2026-05-20 22:59:35 +09:00
parent 188f03e8e8
commit 7602c92046
7 changed files with 157 additions and 127 deletions

View File

@@ -22,7 +22,7 @@ import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters
import type { IdentityDatabaseSchema } from '../types/database';
import type { FactRecord, SpaceRecord, TopicRecord } from '../types/domain';
import { createDatabase } from '../adapters/dialect';
import { extractFact } from '../ingestion/extractor';
import { extractFacts } from '../ingestion/extractor';
import {
findFactRowsConnectingTopicIds,
findFactRowsForTopicId,
@@ -220,54 +220,70 @@ export class IdentityDB {
}
async ingestStatement(statement: string, options: IngestStatementOptions): Promise<Fact> {
const extracted = await extractFact(statement, options.extractor);
const factInput: AddFactInput = {
statement: extracted.statement ?? statement,
topics: extracted.topics,
spaceName: options.spaceName,
};
if (extracted.summary !== undefined) {
factInput.summary = extracted.summary;
const facts = await this.ingestStatements(statement, options);
const first = facts[0];
if (!first) {
throw new Error('No facts were extracted from the statement.');
}
return first;
}
if (extracted.source !== undefined) {
factInput.source = extracted.source;
}
async ingestStatements(statement: string, options: IngestStatementOptions): Promise<Fact[]> {
const extractedList = await extractFacts(statement, options.extractor);
const facts: Fact[] = [];
if (extracted.confidence !== undefined) {
factInput.confidence = extracted.confidence;
}
if (extracted.metadata !== undefined) {
factInput.metadata = extracted.metadata;
}
if (options.embeddingProvider) {
const similarFacts = await this.findSimilarFacts({
statement: factInput.statement,
provider: options.embeddingProvider,
topicNames: factInput.topics.map((topic) => topic.name),
limit: 1,
minimumScore: options.duplicateThreshold ?? 0.97,
for (const extracted of extractedList) {
const factInput: AddFactInput = {
statement: extracted.statement ?? statement,
topics: extracted.topics,
spaceName: options.spaceName,
});
};
if (similarFacts[0]) {
return similarFacts[0];
if (extracted.summary !== undefined) {
factInput.summary = extracted.summary;
}
if (extracted.source !== undefined) {
factInput.source = extracted.source;
}
if (extracted.confidence !== undefined) {
factInput.confidence = extracted.confidence;
}
if (extracted.metadata !== undefined) {
factInput.metadata = extracted.metadata;
}
if (options.embeddingProvider) {
const similarFacts = await this.findSimilarFacts({
statement: factInput.statement,
provider: options.embeddingProvider,
topicNames: factInput.topics.map((topic) => topic.name),
limit: 1,
minimumScore: options.duplicateThreshold ?? 0.97,
spaceName: options.spaceName,
});
if (similarFacts[0]) {
facts.push(similarFacts[0]);
continue;
}
}
const fact = await this.addFact(factInput);
if (options.embeddingProvider) {
await this.indexFactEmbedding(fact.id, {
provider: options.embeddingProvider,
spaceName: options.spaceName,
});
}
facts.push(fact);
}
const fact = await this.addFact(factInput);
if (options.embeddingProvider) {
await this.indexFactEmbedding(fact.id, {
provider: options.embeddingProvider,
spaceName: options.spaceName,
});
}
return fact;
return facts;
}
async indexFactEmbeddings(input: IndexFactEmbeddingsInput): Promise<void> {

View File

@@ -2,11 +2,15 @@ import { IdentityDBError } from '../core/errors';
import { normalizeTopicName } from '../core/utils';
import type { FactExtractor, ExtractedFact } from './types';
export async function extractFact(
export async function extractFacts(
input: string,
extractor: FactExtractor,
): Promise<ExtractedFact> {
): Promise<ExtractedFact[]> {
const extracted = await extractor.extract(input);
return extracted.map((fact) => validateAndNormalizeFact(input, fact));
}
function validateAndNormalizeFact(input: string, extracted: ExtractedFact): ExtractedFact {
const statement = extracted.statement?.trim() || input.trim();
if (statement.length === 0) {
@@ -31,12 +35,12 @@ export async function extractFact(
throw new IdentityDBError('Extractor returned no usable topics.');
}
return {
statement,
summary: extracted.summary ?? null,
source: extracted.source ?? null,
confidence: extracted.confidence ?? null,
metadata: extracted.metadata ?? null,
topics: Array.from(dedupedTopics.values()),
};
return {
statement,
summary: extracted.summary ?? null,
source: extracted.source ?? null,
confidence: extracted.confidence ?? null,
metadata: extracted.metadata ?? null,
topics: Array.from(dedupedTopics.values()),
};
}

View File

@@ -5,16 +5,18 @@ import type {
} from "./types";
const DEFAULT_INSTRUCTIONS = [
"Extract one structured fact from the user input.",
"Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.",
'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.',
'Only include topics that are explicitly in the input as-is. For example, topic "I started TypeScript since 2015" can be "I", "TypeScript", "2015".',
"Extract structured facts from the user input.",
"Return a JSON array of fact objects. Do not include markdown, explanations, or prose outside the JSON array.",
'Each fact object must have a "statement", "summary", "source", "confidence", and "topics" array.',
'Each topic in "topics" must have a "name", and may include "category", "granularity", and "role".',
"Only include topics that are explicitly in the input.",
"If the input contains multiple distinct facts, return them as separate objects in the array.",
].join("\n");
export class LlmFactExtractor implements FactExtractor {
constructor(private readonly options: LlmFactExtractorOptions) {}
async extract(input: string): Promise<ExtractedFact> {
async extract(input: string): Promise<ExtractedFact[]> {
return this.options.model.generateText({
instruction: DEFAULT_INSTRUCTIONS,
input,

View File

@@ -1,7 +1,7 @@
import type { ExtractedFact, FactExtractor } from './types';
export class NaiveExtractor implements FactExtractor {
async extract(input: string): Promise<ExtractedFact> {
async extract(input: string): Promise<ExtractedFact[]> {
const topics: ExtractedFact['topics'] = [];
const seen = new Set<string>();
const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? [];
@@ -31,9 +31,11 @@ export class NaiveExtractor implements FactExtractor {
});
}
return {
statement: input.trim(),
topics,
};
return [
{
statement: input.trim(),
topics,
},
];
}
}

View File

@@ -14,7 +14,7 @@ export interface ExtractedFact {
}
export interface FactExtractor {
extract(input: string): Promise<ExtractedFact>;
extract(input: string): Promise<ExtractedFact[]>;
}
export interface LlmTextGenerationModelInput {
@@ -24,7 +24,7 @@ export interface LlmTextGenerationModelInput {
}
export interface LlmTextGenerationModel {
generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact>;
generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact[]>;
}
export interface LlmFactExtractorOptions {

View File

@@ -23,29 +23,31 @@ describe("IdentityDB ingestion", () => {
it("ingests a statement using a provided extractor", async () => {
const extractor: FactExtractor = {
async extract(input) {
return {
statement: input,
topics: [
{
name: "I",
category: "entity",
granularity: "concrete",
role: "subject",
},
{
name: "TypeScript",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "2025",
category: "temporal",
granularity: "concrete",
role: "time",
},
],
};
return [
{
statement: input,
topics: [
{
name: "I",
category: "entity",
granularity: "concrete",
role: "subject",
},
{
name: "TypeScript",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "2025",
category: "temporal",
granularity: "concrete",
role: "time",
},
],
},
];
},
};
@@ -95,39 +97,41 @@ describe("IdentityDB ingestion", () => {
async generateText(input) {
prompt = input;
return {
statement: "I have worked with Bun and TypeScript since 2025.",
summary: "The speaker has Bun and TypeScript experience.",
source: "chat",
confidence: 0.91,
metadata: { channel: "telegram" },
topics: [
{
name: "I",
category: "entity",
granularity: "concrete",
role: "subject",
},
{
name: "Bun",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "TypeScript",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "2025",
category: "temporal",
granularity: "concrete",
role: "time",
},
],
};
return [
{
statement: "I have worked with Bun and TypeScript since 2025.",
summary: "The speaker has Bun and TypeScript experience.",
source: "chat",
confidence: 0.91,
metadata: { channel: "telegram" },
topics: [
{
name: "I",
category: "entity",
granularity: "concrete",
role: "subject",
},
{
name: "Bun",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "TypeScript",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "2025",
category: "temporal",
granularity: "concrete",
role: "time",
},
],
},
];
},
},
additionalInstructions: "Prefer technology and time topics.",
@@ -141,7 +145,7 @@ describe("IdentityDB ingestion", () => {
);
expect(prompt).toEqual({
instruction: expect.stringContaining("Extract one structured fact from the user input."),
instruction: expect.stringContaining("Extract structured facts from the user input."),
input: "I have worked with Bun and TypeScript since 2025.",
additionalInstruction: "Prefer technology and time topics.",
});

View File

@@ -178,13 +178,15 @@ describe('IdentityDB dedup-aware ingestion', () => {
provider = new FakeEmbeddingProvider();
extractor = {
async extract(input) {
return {
statement: input,
topics: [
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
],
};
return [
{
statement: input,
topics: [
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
],
},
];
},
};