diff --git a/src/core/identity-db.ts b/src/core/identity-db.ts index 560ee54..cb2828a 100644 --- a/src/core/identity-db.ts +++ b/src/core/identity-db.ts @@ -9,6 +9,7 @@ import { type UpsertTopicInput, type AddFactInput, } from '../types/api'; +import type { IngestStatementOptions } from '../ingestion/types'; import type { DatabaseConnection, IdentityDBConnectionConfig } from '../adapters/dialect'; import type { IdentityDatabaseSchema } from '../types/database'; import type { FactRecord, TopicRecord } from '../types/domain'; @@ -24,6 +25,7 @@ import { nowIsoString, serializeMetadata, } from './utils'; +import { extractFact } from '../ingestion/extractor'; import { findFactRowsConnectingTopicIds, findFactRowsForTopicId, @@ -85,7 +87,8 @@ export class IdentityDB { const topics: FactTopic[] = []; - for (const [index, topicInput] of input.topics.entries()) { + for (let index = 0; index < input.topics.length; index += 1) { + const topicInput = input.topics[index]!; const topic = await this.upsertTopicInExecutor(trx, topicInput); await trx @@ -120,6 +123,35 @@ export class IdentityDB { }); } + async ingestStatement( + statement: string, + options: IngestStatementOptions, + ): Promise { + const extracted = await extractFact(statement, options.extractor); + const factInput: AddFactInput = { + statement: extracted.statement ?? statement, + topics: extracted.topics, + }; + + if (extracted.summary !== undefined) { + factInput.summary = extracted.summary; + } + + if (extracted.source !== undefined) { + factInput.source = extracted.source; + } + + if (extracted.confidence !== undefined) { + factInput.confidence = extracted.confidence; + } + + if (extracted.metadata !== undefined) { + factInput.metadata = extracted.metadata; + } + + return this.addFact(factInput); + } + async getTopicFacts(name: string): Promise { const topicRow = await this.getRequiredTopicRow(name); diff --git a/src/index.ts b/src/index.ts index 86f9b1b..26dbb5e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,6 +1,9 @@ export * from './adapters'; export * from './core/identity-db'; export * from './core/migrations'; +export * from './ingestion/extractor'; +export * from './ingestion/naive-extractor'; +export * from './ingestion/types'; export * from './types/api'; export * from './types/database'; export * from './types/domain'; diff --git a/src/ingestion/extractor.ts b/src/ingestion/extractor.ts new file mode 100644 index 0000000..9990bd6 --- /dev/null +++ b/src/ingestion/extractor.ts @@ -0,0 +1,42 @@ +import { IdentityDBError } from '../core/errors'; +import { normalizeTopicName } from '../core/utils'; +import type { FactExtractor, ExtractedFact } from './types'; + +export async function extractFact( + input: string, + extractor: FactExtractor, +): Promise { + const extracted = await extractor.extract(input); + const statement = extracted.statement?.trim() || input.trim(); + + if (statement.length === 0) { + throw new IdentityDBError('Extractor returned an empty statement.'); + } + + const dedupedTopics = new Map(); + + for (const topic of extracted.topics) { + const normalizedName = normalizeTopicName(topic.name); + + if (normalizedName.length === 0) { + continue; + } + + if (!dedupedTopics.has(normalizedName)) { + dedupedTopics.set(normalizedName, topic); + } + } + + if (dedupedTopics.size === 0) { + throw new IdentityDBError('Extractor returned no usable topics.'); + } + + return { + statement, + summary: extracted.summary ?? null, + source: extracted.source ?? null, + confidence: extracted.confidence ?? null, + metadata: extracted.metadata ?? null, + topics: Array.from(dedupedTopics.values()), + }; +} diff --git a/src/ingestion/naive-extractor.ts b/src/ingestion/naive-extractor.ts new file mode 100644 index 0000000..15236c7 --- /dev/null +++ b/src/ingestion/naive-extractor.ts @@ -0,0 +1,39 @@ +import type { ExtractedFact, FactExtractor } from './types'; + +export class NaiveExtractor implements FactExtractor { + async extract(input: string): Promise { + const topics: ExtractedFact['topics'] = []; + const seen = new Set(); + const tokens = input.match(/\bI\b|\b\d{4}\b|\b[A-Z][A-Za-z0-9+#.-]*\b/g) ?? []; + + for (const token of tokens) { + const normalized = token.trim().toLowerCase(); + if (seen.has(normalized)) { + continue; + } + seen.add(normalized); + + if (/^\d{4}$/.test(token)) { + topics.push({ + name: token, + category: 'temporal', + granularity: 'concrete', + role: 'time', + }); + continue; + } + + topics.push({ + name: token, + category: 'entity', + granularity: 'concrete', + role: token === 'I' ? 'subject' : 'object', + }); + } + + return { + statement: input.trim(), + topics, + }; + } +} diff --git a/src/ingestion/types.ts b/src/ingestion/types.ts new file mode 100644 index 0000000..d1add29 --- /dev/null +++ b/src/ingestion/types.ts @@ -0,0 +1,18 @@ +import type { AddFactInput, TopicLinkInput } from '../types/api'; + +export interface ExtractedFact { + statement?: string; + summary?: string | null; + source?: string | null; + confidence?: number | null; + metadata?: AddFactInput['metadata']; + topics: TopicLinkInput[]; +} + +export interface FactExtractor { + extract(input: string): Promise; +} + +export interface IngestStatementOptions { + extractor: FactExtractor; +}