feat: make FactExtractor extract multiple facts per input
This commit is contained in:
@@ -22,7 +22,7 @@ import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters
|
|||||||
import type { IdentityDatabaseSchema } from '../types/database';
|
import type { IdentityDatabaseSchema } from '../types/database';
|
||||||
import type { FactRecord, SpaceRecord, TopicRecord } from '../types/domain';
|
import type { FactRecord, SpaceRecord, TopicRecord } from '../types/domain';
|
||||||
import { createDatabase } from '../adapters/dialect';
|
import { createDatabase } from '../adapters/dialect';
|
||||||
import { extractFact } from '../ingestion/extractor';
|
import { extractFacts } from '../ingestion/extractor';
|
||||||
import {
|
import {
|
||||||
findFactRowsConnectingTopicIds,
|
findFactRowsConnectingTopicIds,
|
||||||
findFactRowsForTopicId,
|
findFactRowsForTopicId,
|
||||||
@@ -220,54 +220,70 @@ export class IdentityDB {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async ingestStatement(statement: string, options: IngestStatementOptions): Promise<Fact> {
|
async ingestStatement(statement: string, options: IngestStatementOptions): Promise<Fact> {
|
||||||
const extracted = await extractFact(statement, options.extractor);
|
const facts = await this.ingestStatements(statement, options);
|
||||||
const factInput: AddFactInput = {
|
const first = facts[0];
|
||||||
statement: extracted.statement ?? statement,
|
if (!first) {
|
||||||
topics: extracted.topics,
|
throw new Error('No facts were extracted from the statement.');
|
||||||
spaceName: options.spaceName,
|
|
||||||
};
|
|
||||||
|
|
||||||
if (extracted.summary !== undefined) {
|
|
||||||
factInput.summary = extracted.summary;
|
|
||||||
}
|
}
|
||||||
|
return first;
|
||||||
|
}
|
||||||
|
|
||||||
if (extracted.source !== undefined) {
|
async ingestStatements(statement: string, options: IngestStatementOptions): Promise<Fact[]> {
|
||||||
factInput.source = extracted.source;
|
const extractedList = await extractFacts(statement, options.extractor);
|
||||||
}
|
const facts: Fact[] = [];
|
||||||
|
|
||||||
if (extracted.confidence !== undefined) {
|
for (const extracted of extractedList) {
|
||||||
factInput.confidence = extracted.confidence;
|
const factInput: AddFactInput = {
|
||||||
}
|
statement: extracted.statement ?? statement,
|
||||||
|
topics: extracted.topics,
|
||||||
if (extracted.metadata !== undefined) {
|
|
||||||
factInput.metadata = extracted.metadata;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (options.embeddingProvider) {
|
|
||||||
const similarFacts = await this.findSimilarFacts({
|
|
||||||
statement: factInput.statement,
|
|
||||||
provider: options.embeddingProvider,
|
|
||||||
topicNames: factInput.topics.map((topic) => topic.name),
|
|
||||||
limit: 1,
|
|
||||||
minimumScore: options.duplicateThreshold ?? 0.97,
|
|
||||||
spaceName: options.spaceName,
|
spaceName: options.spaceName,
|
||||||
});
|
};
|
||||||
|
|
||||||
if (similarFacts[0]) {
|
if (extracted.summary !== undefined) {
|
||||||
return similarFacts[0];
|
factInput.summary = extracted.summary;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (extracted.source !== undefined) {
|
||||||
|
factInput.source = extracted.source;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (extracted.confidence !== undefined) {
|
||||||
|
factInput.confidence = extracted.confidence;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (extracted.metadata !== undefined) {
|
||||||
|
factInput.metadata = extracted.metadata;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options.embeddingProvider) {
|
||||||
|
const similarFacts = await this.findSimilarFacts({
|
||||||
|
statement: factInput.statement,
|
||||||
|
provider: options.embeddingProvider,
|
||||||
|
topicNames: factInput.topics.map((topic) => topic.name),
|
||||||
|
limit: 1,
|
||||||
|
minimumScore: options.duplicateThreshold ?? 0.97,
|
||||||
|
spaceName: options.spaceName,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (similarFacts[0]) {
|
||||||
|
facts.push(similarFacts[0]);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const fact = await this.addFact(factInput);
|
||||||
|
|
||||||
|
if (options.embeddingProvider) {
|
||||||
|
await this.indexFactEmbedding(fact.id, {
|
||||||
|
provider: options.embeddingProvider,
|
||||||
|
spaceName: options.spaceName,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
facts.push(fact);
|
||||||
}
|
}
|
||||||
|
|
||||||
const fact = await this.addFact(factInput);
|
return facts;
|
||||||
|
|
||||||
if (options.embeddingProvider) {
|
|
||||||
await this.indexFactEmbedding(fact.id, {
|
|
||||||
provider: options.embeddingProvider,
|
|
||||||
spaceName: options.spaceName,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
return fact;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async indexFactEmbeddings(input: IndexFactEmbeddingsInput): Promise<void> {
|
async indexFactEmbeddings(input: IndexFactEmbeddingsInput): Promise<void> {
|
||||||
|
|||||||
@@ -2,11 +2,15 @@ import { IdentityDBError } from '../core/errors';
|
|||||||
import { normalizeTopicName } from '../core/utils';
|
import { normalizeTopicName } from '../core/utils';
|
||||||
import type { FactExtractor, ExtractedFact } from './types';
|
import type { FactExtractor, ExtractedFact } from './types';
|
||||||
|
|
||||||
export async function extractFact(
|
export async function extractFacts(
|
||||||
input: string,
|
input: string,
|
||||||
extractor: FactExtractor,
|
extractor: FactExtractor,
|
||||||
): Promise<ExtractedFact> {
|
): Promise<ExtractedFact[]> {
|
||||||
const extracted = await extractor.extract(input);
|
const extracted = await extractor.extract(input);
|
||||||
|
return extracted.map((fact) => validateAndNormalizeFact(input, fact));
|
||||||
|
}
|
||||||
|
|
||||||
|
function validateAndNormalizeFact(input: string, extracted: ExtractedFact): ExtractedFact {
|
||||||
const statement = extracted.statement?.trim() || input.trim();
|
const statement = extracted.statement?.trim() || input.trim();
|
||||||
|
|
||||||
if (statement.length === 0) {
|
if (statement.length === 0) {
|
||||||
@@ -31,12 +35,12 @@ export async function extractFact(
|
|||||||
throw new IdentityDBError('Extractor returned no usable topics.');
|
throw new IdentityDBError('Extractor returned no usable topics.');
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
statement,
|
statement,
|
||||||
summary: extracted.summary ?? null,
|
summary: extracted.summary ?? null,
|
||||||
source: extracted.source ?? null,
|
source: extracted.source ?? null,
|
||||||
confidence: extracted.confidence ?? null,
|
confidence: extracted.confidence ?? null,
|
||||||
metadata: extracted.metadata ?? null,
|
metadata: extracted.metadata ?? null,
|
||||||
topics: Array.from(dedupedTopics.values()),
|
topics: Array.from(dedupedTopics.values()),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,16 +5,18 @@ import type {
|
|||||||
} from "./types";
|
} from "./types";
|
||||||
|
|
||||||
const DEFAULT_INSTRUCTIONS = [
|
const DEFAULT_INSTRUCTIONS = [
|
||||||
"Extract one structured fact from the user input.",
|
"Extract structured facts from the user input.",
|
||||||
"Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.",
|
"Return a JSON array of fact objects. Do not include markdown, explanations, or prose outside the JSON array.",
|
||||||
'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.',
|
'Each fact object must have a "statement", "summary", "source", "confidence", and "topics" array.',
|
||||||
'Only include topics that are explicitly in the input as-is. For example, topic "I started TypeScript since 2015" can be "I", "TypeScript", "2015".',
|
'Each topic in "topics" must have a "name", and may include "category", "granularity", and "role".',
|
||||||
|
"Only include topics that are explicitly in the input.",
|
||||||
|
"If the input contains multiple distinct facts, return them as separate objects in the array.",
|
||||||
].join("\n");
|
].join("\n");
|
||||||
|
|
||||||
export class LlmFactExtractor implements FactExtractor {
|
export class LlmFactExtractor implements FactExtractor {
|
||||||
constructor(private readonly options: LlmFactExtractorOptions) {}
|
constructor(private readonly options: LlmFactExtractorOptions) {}
|
||||||
|
|
||||||
async extract(input: string): Promise<ExtractedFact> {
|
async extract(input: string): Promise<ExtractedFact[]> {
|
||||||
return this.options.model.generateText({
|
return this.options.model.generateText({
|
||||||
instruction: DEFAULT_INSTRUCTIONS,
|
instruction: DEFAULT_INSTRUCTIONS,
|
||||||
input,
|
input,
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import type { ExtractedFact, FactExtractor } from './types';
|
import type { ExtractedFact, FactExtractor } from './types';
|
||||||
|
|
||||||
export class NaiveExtractor implements FactExtractor {
|
export class NaiveExtractor implements FactExtractor {
|
||||||
async extract(input: string): Promise<ExtractedFact> {
|
async extract(input: string): Promise<ExtractedFact[]> {
|
||||||
const topics: ExtractedFact['topics'] = [];
|
const topics: ExtractedFact['topics'] = [];
|
||||||
const seen = new Set<string>();
|
const seen = new Set<string>();
|
||||||
const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? [];
|
const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? [];
|
||||||
@@ -31,9 +31,11 @@ export class NaiveExtractor implements FactExtractor {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return [
|
||||||
statement: input.trim(),
|
{
|
||||||
topics,
|
statement: input.trim(),
|
||||||
};
|
topics,
|
||||||
|
},
|
||||||
|
];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ export interface ExtractedFact {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export interface FactExtractor {
|
export interface FactExtractor {
|
||||||
extract(input: string): Promise<ExtractedFact>;
|
extract(input: string): Promise<ExtractedFact[]>;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface LlmTextGenerationModelInput {
|
export interface LlmTextGenerationModelInput {
|
||||||
@@ -24,7 +24,7 @@ export interface LlmTextGenerationModelInput {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export interface LlmTextGenerationModel {
|
export interface LlmTextGenerationModel {
|
||||||
generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact>;
|
generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact[]>;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface LlmFactExtractorOptions {
|
export interface LlmFactExtractorOptions {
|
||||||
|
|||||||
@@ -23,29 +23,31 @@ describe("IdentityDB ingestion", () => {
|
|||||||
it("ingests a statement using a provided extractor", async () => {
|
it("ingests a statement using a provided extractor", async () => {
|
||||||
const extractor: FactExtractor = {
|
const extractor: FactExtractor = {
|
||||||
async extract(input) {
|
async extract(input) {
|
||||||
return {
|
return [
|
||||||
statement: input,
|
{
|
||||||
topics: [
|
statement: input,
|
||||||
{
|
topics: [
|
||||||
name: "I",
|
{
|
||||||
category: "entity",
|
name: "I",
|
||||||
granularity: "concrete",
|
category: "entity",
|
||||||
role: "subject",
|
granularity: "concrete",
|
||||||
},
|
role: "subject",
|
||||||
{
|
},
|
||||||
name: "TypeScript",
|
{
|
||||||
category: "entity",
|
name: "TypeScript",
|
||||||
granularity: "concrete",
|
category: "entity",
|
||||||
role: "object",
|
granularity: "concrete",
|
||||||
},
|
role: "object",
|
||||||
{
|
},
|
||||||
name: "2025",
|
{
|
||||||
category: "temporal",
|
name: "2025",
|
||||||
granularity: "concrete",
|
category: "temporal",
|
||||||
role: "time",
|
granularity: "concrete",
|
||||||
},
|
role: "time",
|
||||||
],
|
},
|
||||||
};
|
],
|
||||||
|
},
|
||||||
|
];
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -95,39 +97,41 @@ describe("IdentityDB ingestion", () => {
|
|||||||
async generateText(input) {
|
async generateText(input) {
|
||||||
prompt = input;
|
prompt = input;
|
||||||
|
|
||||||
return {
|
return [
|
||||||
statement: "I have worked with Bun and TypeScript since 2025.",
|
{
|
||||||
summary: "The speaker has Bun and TypeScript experience.",
|
statement: "I have worked with Bun and TypeScript since 2025.",
|
||||||
source: "chat",
|
summary: "The speaker has Bun and TypeScript experience.",
|
||||||
confidence: 0.91,
|
source: "chat",
|
||||||
metadata: { channel: "telegram" },
|
confidence: 0.91,
|
||||||
topics: [
|
metadata: { channel: "telegram" },
|
||||||
{
|
topics: [
|
||||||
name: "I",
|
{
|
||||||
category: "entity",
|
name: "I",
|
||||||
granularity: "concrete",
|
category: "entity",
|
||||||
role: "subject",
|
granularity: "concrete",
|
||||||
},
|
role: "subject",
|
||||||
{
|
},
|
||||||
name: "Bun",
|
{
|
||||||
category: "entity",
|
name: "Bun",
|
||||||
granularity: "concrete",
|
category: "entity",
|
||||||
role: "object",
|
granularity: "concrete",
|
||||||
},
|
role: "object",
|
||||||
{
|
},
|
||||||
name: "TypeScript",
|
{
|
||||||
category: "entity",
|
name: "TypeScript",
|
||||||
granularity: "concrete",
|
category: "entity",
|
||||||
role: "object",
|
granularity: "concrete",
|
||||||
},
|
role: "object",
|
||||||
{
|
},
|
||||||
name: "2025",
|
{
|
||||||
category: "temporal",
|
name: "2025",
|
||||||
granularity: "concrete",
|
category: "temporal",
|
||||||
role: "time",
|
granularity: "concrete",
|
||||||
},
|
role: "time",
|
||||||
],
|
},
|
||||||
};
|
],
|
||||||
|
},
|
||||||
|
];
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
additionalInstructions: "Prefer technology and time topics.",
|
additionalInstructions: "Prefer technology and time topics.",
|
||||||
@@ -141,7 +145,7 @@ describe("IdentityDB ingestion", () => {
|
|||||||
);
|
);
|
||||||
|
|
||||||
expect(prompt).toEqual({
|
expect(prompt).toEqual({
|
||||||
instruction: expect.stringContaining("Extract one structured fact from the user input."),
|
instruction: expect.stringContaining("Extract structured facts from the user input."),
|
||||||
input: "I have worked with Bun and TypeScript since 2025.",
|
input: "I have worked with Bun and TypeScript since 2025.",
|
||||||
additionalInstruction: "Prefer technology and time topics.",
|
additionalInstruction: "Prefer technology and time topics.",
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -178,13 +178,15 @@ describe('IdentityDB dedup-aware ingestion', () => {
|
|||||||
provider = new FakeEmbeddingProvider();
|
provider = new FakeEmbeddingProvider();
|
||||||
extractor = {
|
extractor = {
|
||||||
async extract(input) {
|
async extract(input) {
|
||||||
return {
|
return [
|
||||||
statement: input,
|
{
|
||||||
topics: [
|
statement: input,
|
||||||
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
|
topics: [
|
||||||
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
|
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
|
||||||
],
|
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
|
||||||
};
|
],
|
||||||
|
},
|
||||||
|
];
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user