feat: add pluggable fact extraction pipeline
This commit is contained in:
@@ -9,6 +9,7 @@ import {
|
||||
type UpsertTopicInput,
|
||||
type AddFactInput,
|
||||
} from '../types/api';
|
||||
import type { IngestStatementOptions } from '../ingestion/types';
|
||||
import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters/dialect';
|
||||
import type { IdentityDatabaseSchema } from '../types/database';
|
||||
import type { FactRecord, TopicRecord } from '../types/domain';
|
||||
@@ -24,6 +25,7 @@ import {
|
||||
nowIsoString,
|
||||
serializeMetadata,
|
||||
} from './utils';
|
||||
import { extractFact } from '../ingestion/extractor';
|
||||
import {
|
||||
findFactRowsConnectingTopicIds,
|
||||
findFactRowsForTopicId,
|
||||
@@ -85,7 +87,8 @@ export class IdentityDB {
|
||||
|
||||
const topics: FactTopic[] = [];
|
||||
|
||||
for (const [index, topicInput] of input.topics.entries()) {
|
||||
for (let index = 0; index < input.topics.length; index += 1) {
|
||||
const topicInput = input.topics[index]!;
|
||||
const topic = await this.upsertTopicInExecutor(trx, topicInput);
|
||||
|
||||
await trx
|
||||
@@ -120,6 +123,35 @@ export class IdentityDB {
|
||||
});
|
||||
}
|
||||
|
||||
async ingestStatement(
|
||||
statement: string,
|
||||
options: IngestStatementOptions,
|
||||
): Promise<Fact> {
|
||||
const extracted = await extractFact(statement, options.extractor);
|
||||
const factInput: AddFactInput = {
|
||||
statement: extracted.statement ?? statement,
|
||||
topics: extracted.topics,
|
||||
};
|
||||
|
||||
if (extracted.summary !== undefined) {
|
||||
factInput.summary = extracted.summary;
|
||||
}
|
||||
|
||||
if (extracted.source !== undefined) {
|
||||
factInput.source = extracted.source;
|
||||
}
|
||||
|
||||
if (extracted.confidence !== undefined) {
|
||||
factInput.confidence = extracted.confidence;
|
||||
}
|
||||
|
||||
if (extracted.metadata !== undefined) {
|
||||
factInput.metadata = extracted.metadata;
|
||||
}
|
||||
|
||||
return this.addFact(factInput);
|
||||
}
|
||||
|
||||
async getTopicFacts(name: string): Promise<Fact[]> {
|
||||
const topicRow = await this.getRequiredTopicRow(name);
|
||||
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
export * from './adapters';
|
||||
export * from './core/identity-db';
|
||||
export * from './core/migrations';
|
||||
export * from './ingestion/extractor';
|
||||
export * from './ingestion/naive-extractor';
|
||||
export * from './ingestion/types';
|
||||
export * from './types/api';
|
||||
export * from './types/database';
|
||||
export * from './types/domain';
|
||||
|
||||
42
src/ingestion/extractor.ts
Normal file
42
src/ingestion/extractor.ts
Normal file
@@ -0,0 +1,42 @@
|
||||
import { IdentityDBError } from '../core/errors';
|
||||
import { normalizeTopicName } from '../core/utils';
|
||||
import type { FactExtractor, ExtractedFact } from './types';
|
||||
|
||||
export async function extractFact(
|
||||
input: string,
|
||||
extractor: FactExtractor,
|
||||
): Promise<ExtractedFact> {
|
||||
const extracted = await extractor.extract(input);
|
||||
const statement = extracted.statement?.trim() || input.trim();
|
||||
|
||||
if (statement.length === 0) {
|
||||
throw new IdentityDBError('Extractor returned an empty statement.');
|
||||
}
|
||||
|
||||
const dedupedTopics = new Map<string, ExtractedFact['topics'][number]>();
|
||||
|
||||
for (const topic of extracted.topics) {
|
||||
const normalizedName = normalizeTopicName(topic.name);
|
||||
|
||||
if (normalizedName.length === 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!dedupedTopics.has(normalizedName)) {
|
||||
dedupedTopics.set(normalizedName, topic);
|
||||
}
|
||||
}
|
||||
|
||||
if (dedupedTopics.size === 0) {
|
||||
throw new IdentityDBError('Extractor returned no usable topics.');
|
||||
}
|
||||
|
||||
return {
|
||||
statement,
|
||||
summary: extracted.summary ?? null,
|
||||
source: extracted.source ?? null,
|
||||
confidence: extracted.confidence ?? null,
|
||||
metadata: extracted.metadata ?? null,
|
||||
topics: Array.from(dedupedTopics.values()),
|
||||
};
|
||||
}
|
||||
39
src/ingestion/naive-extractor.ts
Normal file
39
src/ingestion/naive-extractor.ts
Normal file
@@ -0,0 +1,39 @@
|
||||
import type { ExtractedFact, FactExtractor } from './types';
|
||||
|
||||
export class NaiveExtractor implements FactExtractor {
|
||||
async extract(input: string): Promise<ExtractedFact> {
|
||||
const topics: ExtractedFact['topics'] = [];
|
||||
const seen = new Set<string>();
|
||||
const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? [];
|
||||
|
||||
for (const token of tokens) {
|
||||
const normalized = token.trim().toLowerCase();
|
||||
if (seen.has(normalized)) {
|
||||
continue;
|
||||
}
|
||||
seen.add(normalized);
|
||||
|
||||
if (/^\d{4}$/.test(token)) {
|
||||
topics.push({
|
||||
name: token,
|
||||
category: 'temporal',
|
||||
granularity: 'concrete',
|
||||
role: 'time',
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
topics.push({
|
||||
name: token,
|
||||
category: 'entity',
|
||||
granularity: 'concrete',
|
||||
role: token === 'I' ? 'subject' : 'object',
|
||||
});
|
||||
}
|
||||
|
||||
return {
|
||||
statement: input.trim(),
|
||||
topics,
|
||||
};
|
||||
}
|
||||
}
|
||||
18
src/ingestion/types.ts
Normal file
18
src/ingestion/types.ts
Normal file
@@ -0,0 +1,18 @@
|
||||
import type { AddFactInput, TopicLinkInput } from '../types/api';
|
||||
|
||||
export interface ExtractedFact {
|
||||
statement?: string;
|
||||
summary?: string | null;
|
||||
source?: string | null;
|
||||
confidence?: number | null;
|
||||
metadata?: AddFactInput['metadata'];
|
||||
topics: TopicLinkInput[];
|
||||
}
|
||||
|
||||
export interface FactExtractor {
|
||||
extract(input: string): Promise<ExtractedFact>;
|
||||
}
|
||||
|
||||
export interface IngestStatementOptions {
|
||||
extractor: FactExtractor;
|
||||
}
|
||||
Reference in New Issue
Block a user