feat: add pluggable fact extraction pipeline

This commit is contained in:
2026-05-11 10:54:40 +09:00
parent 9f3133a403
commit 2c6624beea
5 changed files with 135 additions and 1 deletions

View File

@@ -9,6 +9,7 @@ import {
type UpsertTopicInput,
type AddFactInput,
} from '../types/api';
import type { IngestStatementOptions } from '../ingestion/types';
import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters/dialect';
import type { IdentityDatabaseSchema } from '../types/database';
import type { FactRecord, TopicRecord } from '../types/domain';
@@ -24,6 +25,7 @@ import {
nowIsoString,
serializeMetadata,
} from './utils';
import { extractFact } from '../ingestion/extractor';
import {
findFactRowsConnectingTopicIds,
findFactRowsForTopicId,
@@ -85,7 +87,8 @@ export class IdentityDB {
const topics: FactTopic[] = [];
for (const [index, topicInput] of input.topics.entries()) {
for (let index = 0; index < input.topics.length; index += 1) {
const topicInput = input.topics[index]!;
const topic = await this.upsertTopicInExecutor(trx, topicInput);
await trx
@@ -120,6 +123,35 @@ export class IdentityDB {
});
}
async ingestStatement(
statement: string,
options: IngestStatementOptions,
): Promise<Fact> {
const extracted = await extractFact(statement, options.extractor);
const factInput: AddFactInput = {
statement: extracted.statement ?? statement,
topics: extracted.topics,
};
if (extracted.summary !== undefined) {
factInput.summary = extracted.summary;
}
if (extracted.source !== undefined) {
factInput.source = extracted.source;
}
if (extracted.confidence !== undefined) {
factInput.confidence = extracted.confidence;
}
if (extracted.metadata !== undefined) {
factInput.metadata = extracted.metadata;
}
return this.addFact(factInput);
}
async getTopicFacts(name: string): Promise<Fact[]> {
const topicRow = await this.getRequiredTopicRow(name);

View File

@@ -1,6 +1,9 @@
export * from './adapters';
export * from './core/identity-db';
export * from './core/migrations';
export * from './ingestion/extractor';
export * from './ingestion/naive-extractor';
export * from './ingestion/types';
export * from './types/api';
export * from './types/database';
export * from './types/domain';

View File

@@ -0,0 +1,42 @@
import { IdentityDBError } from '../core/errors';
import { normalizeTopicName } from '../core/utils';
import type { FactExtractor, ExtractedFact } from './types';
export async function extractFact(
input: string,
extractor: FactExtractor,
): Promise<ExtractedFact> {
const extracted = await extractor.extract(input);
const statement = extracted.statement?.trim() || input.trim();
if (statement.length === 0) {
throw new IdentityDBError('Extractor returned an empty statement.');
}
const dedupedTopics = new Map<string, ExtractedFact['topics'][number]>();
for (const topic of extracted.topics) {
const normalizedName = normalizeTopicName(topic.name);
if (normalizedName.length === 0) {
continue;
}
if (!dedupedTopics.has(normalizedName)) {
dedupedTopics.set(normalizedName, topic);
}
}
if (dedupedTopics.size === 0) {
throw new IdentityDBError('Extractor returned no usable topics.');
}
return {
statement,
summary: extracted.summary ?? null,
source: extracted.source ?? null,
confidence: extracted.confidence ?? null,
metadata: extracted.metadata ?? null,
topics: Array.from(dedupedTopics.values()),
};
}

View File

@@ -0,0 +1,39 @@
import type { ExtractedFact, FactExtractor } from './types';
export class NaiveExtractor implements FactExtractor {
async extract(input: string): Promise<ExtractedFact> {
const topics: ExtractedFact['topics'] = [];
const seen = new Set<string>();
const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? [];
for (const token of tokens) {
const normalized = token.trim().toLowerCase();
if (seen.has(normalized)) {
continue;
}
seen.add(normalized);
if (/^\d{4}$/.test(token)) {
topics.push({
name: token,
category: 'temporal',
granularity: 'concrete',
role: 'time',
});
continue;
}
topics.push({
name: token,
category: 'entity',
granularity: 'concrete',
role: token === 'I' ? 'subject' : 'object',
});
}
return {
statement: input.trim(),
topics,
};
}
}

18
src/ingestion/types.ts Normal file
View File

@@ -0,0 +1,18 @@
import type { AddFactInput, TopicLinkInput } from '../types/api';
export interface ExtractedFact {
statement?: string;
summary?: string | null;
source?: string | null;
confidence?: number | null;
metadata?: AddFactInput['metadata'];
topics: TopicLinkInput[];
}
export interface FactExtractor {
extract(input: string): Promise<ExtractedFact>;
}
export interface IngestStatementOptions {
extractor: FactExtractor;
}