feat: make FactExtractor extract multiple facts per input

This commit is contained in:
2026-05-20 22:59:35 +09:00
parent 188f03e8e8
commit 7602c92046
7 changed files with 157 additions and 127 deletions

View File

@@ -22,7 +22,7 @@ import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters
import type { IdentityDatabaseSchema } from '../types/database'; import type { IdentityDatabaseSchema } from '../types/database';
import type { FactRecord, SpaceRecord, TopicRecord } from '../types/domain'; import type { FactRecord, SpaceRecord, TopicRecord } from '../types/domain';
import { createDatabase } from '../adapters/dialect'; import { createDatabase } from '../adapters/dialect';
import { extractFact } from '../ingestion/extractor'; import { extractFacts } from '../ingestion/extractor';
import { import {
findFactRowsConnectingTopicIds, findFactRowsConnectingTopicIds,
findFactRowsForTopicId, findFactRowsForTopicId,
@@ -220,7 +220,19 @@ export class IdentityDB {
} }
async ingestStatement(statement: string, options: IngestStatementOptions): Promise<Fact> { async ingestStatement(statement: string, options: IngestStatementOptions): Promise<Fact> {
const extracted = await extractFact(statement, options.extractor); const facts = await this.ingestStatements(statement, options);
const first = facts[0];
if (!first) {
throw new Error('No facts were extracted from the statement.');
}
return first;
}
async ingestStatements(statement: string, options: IngestStatementOptions): Promise<Fact[]> {
const extractedList = await extractFacts(statement, options.extractor);
const facts: Fact[] = [];
for (const extracted of extractedList) {
const factInput: AddFactInput = { const factInput: AddFactInput = {
statement: extracted.statement ?? statement, statement: extracted.statement ?? statement,
topics: extracted.topics, topics: extracted.topics,
@@ -254,7 +266,8 @@ export class IdentityDB {
}); });
if (similarFacts[0]) { if (similarFacts[0]) {
return similarFacts[0]; facts.push(similarFacts[0]);
continue;
} }
} }
@@ -267,7 +280,10 @@ export class IdentityDB {
}); });
} }
return fact; facts.push(fact);
}
return facts;
} }
async indexFactEmbeddings(input: IndexFactEmbeddingsInput): Promise<void> { async indexFactEmbeddings(input: IndexFactEmbeddingsInput): Promise<void> {

View File

@@ -2,11 +2,15 @@ import { IdentityDBError } from '../core/errors';
import { normalizeTopicName } from '../core/utils'; import { normalizeTopicName } from '../core/utils';
import type { FactExtractor, ExtractedFact } from './types'; import type { FactExtractor, ExtractedFact } from './types';
export async function extractFact( export async function extractFacts(
input: string, input: string,
extractor: FactExtractor, extractor: FactExtractor,
): Promise<ExtractedFact> { ): Promise<ExtractedFact[]> {
const extracted = await extractor.extract(input); const extracted = await extractor.extract(input);
return extracted.map((fact) => validateAndNormalizeFact(input, fact));
}
function validateAndNormalizeFact(input: string, extracted: ExtractedFact): ExtractedFact {
const statement = extracted.statement?.trim() || input.trim(); const statement = extracted.statement?.trim() || input.trim();
if (statement.length === 0) { if (statement.length === 0) {

View File

@@ -5,16 +5,18 @@ import type {
} from "./types"; } from "./types";
const DEFAULT_INSTRUCTIONS = [ const DEFAULT_INSTRUCTIONS = [
"Extract one structured fact from the user input.", "Extract structured facts from the user input.",
"Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.", "Return a JSON array of fact objects. Do not include markdown, explanations, or prose outside the JSON array.",
'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.', 'Each fact object must have a "statement", "summary", "source", "confidence", and "topics" array.',
'Only include topics that are explicitly in the input as-is. For example, topic "I started TypeScript since 2015" can be "I", "TypeScript", "2015".', 'Each topic in "topics" must have a "name", and may include "category", "granularity", and "role".',
"Only include topics that are explicitly in the input.",
"If the input contains multiple distinct facts, return them as separate objects in the array.",
].join("\n"); ].join("\n");
export class LlmFactExtractor implements FactExtractor { export class LlmFactExtractor implements FactExtractor {
constructor(private readonly options: LlmFactExtractorOptions) {} constructor(private readonly options: LlmFactExtractorOptions) {}
async extract(input: string): Promise<ExtractedFact> { async extract(input: string): Promise<ExtractedFact[]> {
return this.options.model.generateText({ return this.options.model.generateText({
instruction: DEFAULT_INSTRUCTIONS, instruction: DEFAULT_INSTRUCTIONS,
input, input,

View File

@@ -1,7 +1,7 @@
import type { ExtractedFact, FactExtractor } from './types'; import type { ExtractedFact, FactExtractor } from './types';
export class NaiveExtractor implements FactExtractor { export class NaiveExtractor implements FactExtractor {
async extract(input: string): Promise<ExtractedFact> { async extract(input: string): Promise<ExtractedFact[]> {
const topics: ExtractedFact['topics'] = []; const topics: ExtractedFact['topics'] = [];
const seen = new Set<string>(); const seen = new Set<string>();
const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? []; const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? [];
@@ -31,9 +31,11 @@ export class NaiveExtractor implements FactExtractor {
}); });
} }
return { return [
{
statement: input.trim(), statement: input.trim(),
topics, topics,
}; },
];
} }
} }

View File

@@ -14,7 +14,7 @@ export interface ExtractedFact {
} }
export interface FactExtractor { export interface FactExtractor {
extract(input: string): Promise<ExtractedFact>; extract(input: string): Promise<ExtractedFact[]>;
} }
export interface LlmTextGenerationModelInput { export interface LlmTextGenerationModelInput {
@@ -24,7 +24,7 @@ export interface LlmTextGenerationModelInput {
} }
export interface LlmTextGenerationModel { export interface LlmTextGenerationModel {
generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact>; generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact[]>;
} }
export interface LlmFactExtractorOptions { export interface LlmFactExtractorOptions {

View File

@@ -23,7 +23,8 @@ describe("IdentityDB ingestion", () => {
it("ingests a statement using a provided extractor", async () => { it("ingests a statement using a provided extractor", async () => {
const extractor: FactExtractor = { const extractor: FactExtractor = {
async extract(input) { async extract(input) {
return { return [
{
statement: input, statement: input,
topics: [ topics: [
{ {
@@ -45,7 +46,8 @@ describe("IdentityDB ingestion", () => {
role: "time", role: "time",
}, },
], ],
}; },
];
}, },
}; };
@@ -95,7 +97,8 @@ describe("IdentityDB ingestion", () => {
async generateText(input) { async generateText(input) {
prompt = input; prompt = input;
return { return [
{
statement: "I have worked with Bun and TypeScript since 2025.", statement: "I have worked with Bun and TypeScript since 2025.",
summary: "The speaker has Bun and TypeScript experience.", summary: "The speaker has Bun and TypeScript experience.",
source: "chat", source: "chat",
@@ -127,7 +130,8 @@ describe("IdentityDB ingestion", () => {
role: "time", role: "time",
}, },
], ],
}; },
];
}, },
}, },
additionalInstructions: "Prefer technology and time topics.", additionalInstructions: "Prefer technology and time topics.",
@@ -141,7 +145,7 @@ describe("IdentityDB ingestion", () => {
); );
expect(prompt).toEqual({ expect(prompt).toEqual({
instruction: expect.stringContaining("Extract one structured fact from the user input."), instruction: expect.stringContaining("Extract structured facts from the user input."),
input: "I have worked with Bun and TypeScript since 2025.", input: "I have worked with Bun and TypeScript since 2025.",
additionalInstruction: "Prefer technology and time topics.", additionalInstruction: "Prefer technology and time topics.",
}); });

View File

@@ -178,13 +178,15 @@ describe('IdentityDB dedup-aware ingestion', () => {
provider = new FakeEmbeddingProvider(); provider = new FakeEmbeddingProvider();
extractor = { extractor = {
async extract(input) { async extract(input) {
return { return [
{
statement: input, statement: input,
topics: [ topics: [
{ name: 'Bun', category: 'entity', granularity: 'concrete' }, { name: 'Bun', category: 'entity', granularity: 'concrete' },
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' }, { name: 'TypeScript', category: 'entity', granularity: 'concrete' },
], ],
}; },
];
}, },
}; };