feat: add pluggable fact extraction pipeline
This commit is contained in:
@@ -9,6 +9,7 @@ import {
|
|||||||
type UpsertTopicInput,
|
type UpsertTopicInput,
|
||||||
type AddFactInput,
|
type AddFactInput,
|
||||||
} from '../types/api';
|
} from '../types/api';
|
||||||
|
import type { IngestStatementOptions } from '../ingestion/types';
|
||||||
import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters/dialect';
|
import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters/dialect';
|
||||||
import type { IdentityDatabaseSchema } from '../types/database';
|
import type { IdentityDatabaseSchema } from '../types/database';
|
||||||
import type { FactRecord, TopicRecord } from '../types/domain';
|
import type { FactRecord, TopicRecord } from '../types/domain';
|
||||||
@@ -24,6 +25,7 @@ import {
|
|||||||
nowIsoString,
|
nowIsoString,
|
||||||
serializeMetadata,
|
serializeMetadata,
|
||||||
} from './utils';
|
} from './utils';
|
||||||
|
import { extractFact } from '../ingestion/extractor';
|
||||||
import {
|
import {
|
||||||
findFactRowsConnectingTopicIds,
|
findFactRowsConnectingTopicIds,
|
||||||
findFactRowsForTopicId,
|
findFactRowsForTopicId,
|
||||||
@@ -85,7 +87,8 @@ export class IdentityDB {
|
|||||||
|
|
||||||
const topics: FactTopic[] = [];
|
const topics: FactTopic[] = [];
|
||||||
|
|
||||||
for (const [index, topicInput] of input.topics.entries()) {
|
for (let index = 0; index < input.topics.length; index += 1) {
|
||||||
|
const topicInput = input.topics[index]!;
|
||||||
const topic = await this.upsertTopicInExecutor(trx, topicInput);
|
const topic = await this.upsertTopicInExecutor(trx, topicInput);
|
||||||
|
|
||||||
await trx
|
await trx
|
||||||
@@ -120,6 +123,35 @@ export class IdentityDB {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async ingestStatement(
|
||||||
|
statement: string,
|
||||||
|
options: IngestStatementOptions,
|
||||||
|
): Promise<Fact> {
|
||||||
|
const extracted = await extractFact(statement, options.extractor);
|
||||||
|
const factInput: AddFactInput = {
|
||||||
|
statement: extracted.statement ?? statement,
|
||||||
|
topics: extracted.topics,
|
||||||
|
};
|
||||||
|
|
||||||
|
if (extracted.summary !== undefined) {
|
||||||
|
factInput.summary = extracted.summary;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (extracted.source !== undefined) {
|
||||||
|
factInput.source = extracted.source;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (extracted.confidence !== undefined) {
|
||||||
|
factInput.confidence = extracted.confidence;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (extracted.metadata !== undefined) {
|
||||||
|
factInput.metadata = extracted.metadata;
|
||||||
|
}
|
||||||
|
|
||||||
|
return this.addFact(factInput);
|
||||||
|
}
|
||||||
|
|
||||||
async getTopicFacts(name: string): Promise<Fact[]> {
|
async getTopicFacts(name: string): Promise<Fact[]> {
|
||||||
const topicRow = await this.getRequiredTopicRow(name);
|
const topicRow = await this.getRequiredTopicRow(name);
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
export * from './adapters';
|
export * from './adapters';
|
||||||
export * from './core/identity-db';
|
export * from './core/identity-db';
|
||||||
export * from './core/migrations';
|
export * from './core/migrations';
|
||||||
|
export * from './ingestion/extractor';
|
||||||
|
export * from './ingestion/naive-extractor';
|
||||||
|
export * from './ingestion/types';
|
||||||
export * from './types/api';
|
export * from './types/api';
|
||||||
export * from './types/database';
|
export * from './types/database';
|
||||||
export * from './types/domain';
|
export * from './types/domain';
|
||||||
|
|||||||
42
src/ingestion/extractor.ts
Normal file
42
src/ingestion/extractor.ts
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
import { IdentityDBError } from '../core/errors';
|
||||||
|
import { normalizeTopicName } from '../core/utils';
|
||||||
|
import type { FactExtractor, ExtractedFact } from './types';
|
||||||
|
|
||||||
|
export async function extractFact(
|
||||||
|
input: string,
|
||||||
|
extractor: FactExtractor,
|
||||||
|
): Promise<ExtractedFact> {
|
||||||
|
const extracted = await extractor.extract(input);
|
||||||
|
const statement = extracted.statement?.trim() || input.trim();
|
||||||
|
|
||||||
|
if (statement.length === 0) {
|
||||||
|
throw new IdentityDBError('Extractor returned an empty statement.');
|
||||||
|
}
|
||||||
|
|
||||||
|
const dedupedTopics = new Map<string, ExtractedFact['topics'][number]>();
|
||||||
|
|
||||||
|
for (const topic of extracted.topics) {
|
||||||
|
const normalizedName = normalizeTopicName(topic.name);
|
||||||
|
|
||||||
|
if (normalizedName.length === 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!dedupedTopics.has(normalizedName)) {
|
||||||
|
dedupedTopics.set(normalizedName, topic);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dedupedTopics.size === 0) {
|
||||||
|
throw new IdentityDBError('Extractor returned no usable topics.');
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
statement,
|
||||||
|
summary: extracted.summary ?? null,
|
||||||
|
source: extracted.source ?? null,
|
||||||
|
confidence: extracted.confidence ?? null,
|
||||||
|
metadata: extracted.metadata ?? null,
|
||||||
|
topics: Array.from(dedupedTopics.values()),
|
||||||
|
};
|
||||||
|
}
|
||||||
39
src/ingestion/naive-extractor.ts
Normal file
39
src/ingestion/naive-extractor.ts
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
import type { ExtractedFact, FactExtractor } from './types';
|
||||||
|
|
||||||
|
export class NaiveExtractor implements FactExtractor {
|
||||||
|
async extract(input: string): Promise<ExtractedFact> {
|
||||||
|
const topics: ExtractedFact['topics'] = [];
|
||||||
|
const seen = new Set<string>();
|
||||||
|
const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? [];
|
||||||
|
|
||||||
|
for (const token of tokens) {
|
||||||
|
const normalized = token.trim().toLowerCase();
|
||||||
|
if (seen.has(normalized)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
seen.add(normalized);
|
||||||
|
|
||||||
|
if (/^\d{4}$/.test(token)) {
|
||||||
|
topics.push({
|
||||||
|
name: token,
|
||||||
|
category: 'temporal',
|
||||||
|
granularity: 'concrete',
|
||||||
|
role: 'time',
|
||||||
|
});
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
topics.push({
|
||||||
|
name: token,
|
||||||
|
category: 'entity',
|
||||||
|
granularity: 'concrete',
|
||||||
|
role: token === 'I' ? 'subject' : 'object',
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
statement: input.trim(),
|
||||||
|
topics,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
18
src/ingestion/types.ts
Normal file
18
src/ingestion/types.ts
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
import type { AddFactInput, TopicLinkInput } from '../types/api';
|
||||||
|
|
||||||
|
export interface ExtractedFact {
|
||||||
|
statement?: string;
|
||||||
|
summary?: string | null;
|
||||||
|
source?: string | null;
|
||||||
|
confidence?: number | null;
|
||||||
|
metadata?: AddFactInput['metadata'];
|
||||||
|
topics: TopicLinkInput[];
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface FactExtractor {
|
||||||
|
extract(input: string): Promise<ExtractedFact>;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface IngestStatementOptions {
|
||||||
|
extractor: FactExtractor;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user