9 Commits

Author SHA1 Message Date
1172c63db7 v0.4.0
All checks were successful
npm release / verify (push) Successful in 12s
npm release / publish to npm (push) Successful in 11s
2026-05-19 22:30:27 +09:00
0e595e6f60 test: update test of LlmExtractor 2026-05-19 22:28:09 +09:00
518264c467 v0.3.1
Some checks failed
npm release / verify (push) Failing after 9s
npm release / publish to npm (push) Has been skipped
2026-05-19 22:19:30 +09:00
cc8b3dfb14 vv0.3.1 2026-05-19 22:18:51 +09:00
56e17dab49 feat: make extract input structured 2026-05-19 22:18:42 +09:00
cc2e9110cc v0.3.0
All checks were successful
npm release / verify (push) Successful in 13s
npm release / publish to npm (push) Successful in 10s
2026-05-19 22:07:06 +09:00
0480ea182f refactor: make generateText model return ExtractedFact 2026-05-19 22:06:54 +09:00
185edfdae8 v0.2.2
All checks were successful
npm release / verify (push) Successful in 13s
npm release / publish to npm (push) Successful in 11s
2026-05-17 23:11:31 +09:00
a33fd61c97 feat: adjust instruction detailed
Some checks failed
npm release / verify (push) Failing after 10s
npm release / publish to npm (push) Has been skipped
2026-05-17 23:10:38 +09:00
4 changed files with 132 additions and 354 deletions

View File

@@ -1,6 +1,6 @@
{ {
"name": "identitydb", "name": "identitydb",
"version": "0.2.1", "version": "0.4.0",
"description": "TypeScript memory graph database wrapper for topics, facts, and AI-assisted ingestion.", "description": "TypeScript memory graph database wrapper for topics, facts, and AI-assisted ingestion.",
"license": "MIT", "license": "MIT",
"type": "module", "type": "module",

View File

@@ -1,273 +1,24 @@
import { IdentityDBError } from '../core/errors';
import type { TopicCategory, TopicGranularity } from '../types/domain';
import type { import type {
ExtractedFact, ExtractedFact,
FactExtractor, FactExtractor,
LlmFactExtractorOptions, LlmFactExtractorOptions,
} from './types'; } from "./types";
const DEFAULT_INSTRUCTIONS = [ const DEFAULT_INSTRUCTIONS = [
'Extract one structured fact from the user input.', "Extract one structured fact from the user input.",
'Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.', "Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.",
'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.', 'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.',
'Only include topics that are explicitly supported by the input.', 'Only include topics that are explicitly in the input as-is. For example, topic "I started TypeScript since 2015" can be "I", "TypeScript", "2015".',
].join('\n'); ].join("\n");
export class LlmFactExtractor implements FactExtractor { export class LlmFactExtractor implements FactExtractor {
constructor(private readonly options: LlmFactExtractorOptions) {} constructor(private readonly options: LlmFactExtractorOptions) {}
async extract(input: string): Promise<ExtractedFact> { async extract(input: string): Promise<ExtractedFact> {
const prompt = this.buildPrompt(input); return this.options.model.generateText({
const response = await this.options.model.generateText(prompt); instruction: DEFAULT_INSTRUCTIONS,
return parseLlmExtractedFactResponse(response); input,
} additionalInstruction: this.options.additionalInstructions,
});
private buildPrompt(input: string): string {
if (this.options.promptBuilder) {
return this.options.promptBuilder(input, this.options.instructions);
}
const instructions = this.options.instructions?.trim();
return [
DEFAULT_INSTRUCTIONS,
instructions && instructions.length > 0 ? `Additional instructions:\n${instructions}` : null,
`Input:\n${input.trim()}`,
]
.filter((value): value is string => value !== null)
.join('\n\n');
} }
} }
/**
 * Turns the raw text returned by an LLM into an `ExtractedFact`.
 *
 * The text is first reduced to a JSON value via `parseJsonCandidate`; the
 * value must be a plain object with a `topics` array. Every other field is
 * optional and is only set on the result when present in the payload.
 *
 * @param response Raw model output.
 * @returns The validated fact.
 * @throws IdentityDBError when the payload is not a JSON object, or when any
 *   field fails validation in the helper that checks it.
 */
export function parseLlmExtractedFactResponse(response: string): ExtractedFact {
  const parsed = parseJsonCandidate(response);
  if (!isRecord(parsed)) {
    throw new IdentityDBError('LLM extractor response must be a JSON object.');
  }

  // Validation order (topics first, metadata last) decides which error
  // surfaces when several fields are invalid at once.
  const fact: ExtractedFact = { topics: parseTopics(parsed.topics) };
  const statement = optionalString(parsed.statement);
  const summary = optionalNullableString(parsed.summary);
  const source = optionalNullableString(parsed.source);
  const confidence = optionalNullableNumber(parsed.confidence);
  const metadata = optionalMetadata(parsed.metadata);

  // Absent fields stay unset rather than being written as `undefined`.
  if (statement !== undefined) {
    fact.statement = statement;
  }
  if (summary !== undefined) {
    fact.summary = summary;
  }
  if (source !== undefined) {
    fact.source = source;
  }
  if (confidence !== undefined) {
    fact.confidence = confidence;
  }
  if (metadata !== undefined) {
    fact.metadata = metadata;
  }

  return fact;
}
/**
 * Returns the first candidate substring of `response` that parses as JSON.
 *
 * Candidates come from `collectJsonCandidates`, called on the trimmed
 * response, and are tried in the order that function returns them.
 *
 * @throws IdentityDBError when no candidate is valid JSON.
 */
function parseJsonCandidate(response: string): unknown {
  const candidates = collectJsonCandidates(response.trim());

  for (const text of candidates) {
    try {
      return JSON.parse(text);
    } catch {
      // Not valid JSON; fall through to the next candidate.
    }
  }

  throw new IdentityDBError('LLM extractor returned invalid JSON.');
}
/**
 * Lists the substrings of `response` worth attempting to parse as JSON.
 *
 * The result contains, without duplicates and in this order:
 *  1. the response itself,
 *  2. the trimmed, non-empty contents of each triple-backtick fence
 *     (optionally tagged `json`),
 *  3. the span from the first `{` to the last `}`, when both exist in that order.
 */
function collectJsonCandidates(response: string): string[] {
  // A Set keeps insertion order and drops repeated candidates.
  const unique = new Set<string>([response]);

  for (const fenced of response.matchAll(/```(?:json)?\s*([\s\S]*?)```/gi)) {
    const inner = fenced[1]?.trim();
    if (inner) {
      unique.add(inner);
    }
  }

  const openIndex = response.indexOf('{');
  const closeIndex = response.lastIndexOf('}');
  if (openIndex >= 0 && closeIndex > openIndex) {
    unique.add(response.slice(openIndex, closeIndex + 1));
  }

  return [...unique];
}
/**
 * Validates the `topics` field of an LLM payload.
 *
 * @param value The untrusted `topics` value.
 * @returns One parsed topic per array entry, in the same order.
 * @throws IdentityDBError when `value` is not an array, or when an entry is
 *   rejected by `parseTopic`.
 */
function parseTopics(value: unknown): ExtractedFact['topics'] {
  if (Array.isArray(value)) {
    return value.map((entry) => parseTopic(entry));
  }

  throw new IdentityDBError('LLM extractor response must include a topics array.');
}
/**
 * Validates a single topic entry from an LLM payload.
 *
 * The entry must be a plain object whose `name` is a string that is
 * non-empty after trimming. `category`, `granularity`, `role`,
 * `description` and `metadata` are optional and are only set on the result
 * when present in the entry.
 *
 * @throws IdentityDBError when the entry is not an object, the name is
 *   missing or blank, or any optional field fails its validator.
 */
function parseTopic(value: unknown): ExtractedFact['topics'][number] {
  if (!isRecord(value)) {
    throw new IdentityDBError('LLM extractor topics must be JSON objects.');
  }

  const trimmedName = optionalString(value.name)?.trim();
  if (!trimmedName) {
    throw new IdentityDBError('LLM extractor topics must include a non-empty name.');
  }

  // Run the validators in a fixed order so the first invalid field decides
  // which error is raised.
  const category = optionalTopicCategory(value.category);
  const granularity = optionalTopicGranularity(value.granularity);
  const role = optionalNullableString(value.role);
  const description = optionalNullableString(value.description);
  const metadata = optionalMetadata(value.metadata);

  const parsed: ExtractedFact['topics'][number] = { name: trimmedName };
  if (category !== undefined) {
    parsed.category = category;
  }
  if (granularity !== undefined) {
    parsed.granularity = granularity;
  }
  if (role !== undefined) {
    parsed.role = role;
  }
  if (description !== undefined) {
    parsed.description = description;
  }
  if (metadata !== undefined) {
    parsed.metadata = metadata;
  }

  return parsed;
}
/**
 * Accepts a string or an absent value.
 *
 * @returns The string unchanged, or `undefined` when the field is absent.
 * @throws IdentityDBError for any other value, including `null`.
 */
function optionalString(value: unknown): string | undefined {
  if (typeof value === 'string') {
    return value;
  }
  if (value === undefined) {
    return undefined;
  }

  throw new IdentityDBError('LLM extractor expected a string field.');
}
/**
 * Accepts a string, an explicit `null`, or an absent value.
 *
 * @returns The string unchanged, `null` for `null`, or `undefined` when the
 *   field is absent.
 * @throws IdentityDBError for any other value.
 */
function optionalNullableString(value: unknown): string | null | undefined {
  if (typeof value === 'string') {
    return value;
  }
  if (value === null) {
    return null;
  }
  if (value === undefined) {
    return undefined;
  }

  throw new IdentityDBError('LLM extractor expected a nullable string field.');
}
/**
 * Accepts a finite number, an explicit `null`, or an absent value.
 *
 * @returns The number unchanged, `null` for `null`, or `undefined` when the
 *   field is absent.
 * @throws IdentityDBError for non-numbers and for `NaN`, `Infinity` and
 *   `-Infinity`.
 */
function optionalNullableNumber(value: unknown): number | null | undefined {
  if (value === undefined) {
    return undefined;
  }
  if (value === null) {
    return null;
  }

  // Number.isFinite rejects NaN and both infinities. An overflowing JSON
  // literal such as 1e999 parses to Infinity, which a NaN-only check would
  // let through.
  if (typeof value !== 'number' || !Number.isFinite(value)) {
    throw new IdentityDBError('LLM extractor expected confidence to be a number or null.');
  }

  return value;
}
/**
 * Accepts JSON-like metadata, an explicit `null`, or an absent value.
 *
 * @returns `undefined` when the field is absent, `null` for `null`, otherwise
 *   the value itself once `isJsonLike` has accepted it.
 * @throws IdentityDBError when `isJsonLike` rejects the value.
 */
function optionalMetadata(value: unknown): ExtractedFact['metadata'] | undefined {
  if (value === null) {
    return null;
  }
  if (value === undefined) {
    return undefined;
  }
  if (isJsonLike(value)) {
    // The structural check above is the runtime justification for this cast.
    return value as ExtractedFact['metadata'];
  }

  throw new IdentityDBError('LLM extractor metadata must be valid JSON-compatible data.');
}
/**
 * Accepts one of the supported topic categories or an absent value.
 *
 * @returns The category literal unchanged, or `undefined` when absent.
 * @throws IdentityDBError for anything else, including `null`.
 */
function optionalTopicCategory(value: unknown): TopicCategory | undefined {
  switch (value) {
    case undefined:
      return undefined;
    case 'entity':
    case 'concept':
    case 'temporal':
    case 'custom':
      return value;
    default:
      throw new IdentityDBError('LLM extractor returned an unsupported topic category.');
  }
}
/**
 * Accepts one of the supported topic granularities or an absent value.
 *
 * @returns The granularity literal unchanged, or `undefined` when absent.
 * @throws IdentityDBError for anything else, including `null`.
 */
function optionalTopicGranularity(value: unknown): TopicGranularity | undefined {
  switch (value) {
    case undefined:
      return undefined;
    case 'abstract':
    case 'concrete':
    case 'mixed':
      return value;
    default:
      throw new IdentityDBError('LLM extractor returned an unsupported topic granularity.');
  }
}
/**
 * Type guard for a non-null, non-array object.
 *
 * Any object that is not an array passes, so class instances such as `Date`
 * are accepted as well as plain object literals.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== 'object') {
    return false;
  }

  return !Array.isArray(value);
}
/**
 * Recursively checks that a value is built only from `null`, strings,
 * numbers, booleans, arrays, and objects accepted by `isRecord`.
 *
 * Numbers are accepted by type alone, so non-finite values are not rejected
 * here. Recursion has no cycle guard.
 */
function isJsonLike(value: unknown): boolean {
  const kind = typeof value;
  if (value === null || kind === 'string' || kind === 'number' || kind === 'boolean') {
    return true;
  }

  // Containers are valid only when every child is; anything that is neither
  // an array nor a record has no children to inspect and is rejected.
  const children = Array.isArray(value)
    ? value
    : isRecord(value)
      ? Object.values(value)
      : undefined;

  return children !== undefined && children.every((child) => isJsonLike(child));
}

View File

@@ -2,14 +2,14 @@ import type {
AddFactInput, AddFactInput,
EmbeddingProvider, EmbeddingProvider,
TopicLinkInput, TopicLinkInput,
} from '../types/api'; } from "../types/api";
export interface ExtractedFact { export interface ExtractedFact {
statement?: string; statement?: string;
summary?: string | null; summary?: string | null;
source?: string | null; source?: string | null;
confidence?: number | null; confidence?: number | null;
metadata?: AddFactInput['metadata']; metadata?: AddFactInput["metadata"];
topics: TopicLinkInput[]; topics: TopicLinkInput[];
} }
@@ -17,14 +17,19 @@ export interface FactExtractor {
extract(input: string): Promise<ExtractedFact>; extract(input: string): Promise<ExtractedFact>;
} }
export interface LlmTextGenerationModelInput {
instruction: string;
input: string;
additionalInstruction?: string | undefined;
}
export interface LlmTextGenerationModel { export interface LlmTextGenerationModel {
generateText(prompt: string): Promise<string>; generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact>;
} }
export interface LlmFactExtractorOptions { export interface LlmFactExtractorOptions {
model: LlmTextGenerationModel; model: LlmTextGenerationModel;
instructions?: string; additionalInstructions?: string | undefined;
promptBuilder?: (input: string, instructions?: string) => string;
} }
export interface IngestStatementOptions { export interface IngestStatementOptions {

View File

@@ -1,15 +1,18 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest'; import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { IdentityDB } from '../src/core/identity-db'; import { IdentityDB } from "../src/core/identity-db";
import { LlmFactExtractor } from '../src/ingestion/llm-extractor'; import { LlmFactExtractor } from "../src/ingestion/llm-extractor";
import { NaiveExtractor } from '../src/ingestion/naive-extractor'; import { NaiveExtractor } from "../src/ingestion/naive-extractor";
import type { FactExtractor } from '../src/ingestion/types'; import type {
FactExtractor,
LlmTextGenerationModelInput,
} from "../src/ingestion/types";
describe('IdentityDB ingestion', () => { describe("IdentityDB ingestion", () => {
let db: IdentityDB; let db: IdentityDB;
beforeEach(async () => { beforeEach(async () => {
db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' }); db = await IdentityDB.connect({ client: "sqlite", filename: ":memory:" });
await db.initialize(); await db.initialize();
}); });
@@ -17,121 +20,140 @@ describe('IdentityDB ingestion', () => {
await db.close(); await db.close();
}); });
it('ingests a statement using a provided extractor', async () => { it("ingests a statement using a provided extractor", async () => {
const extractor: FactExtractor = { const extractor: FactExtractor = {
async extract(input) { async extract(input) {
return { return {
statement: input, statement: input,
topics: [ topics: [
{ name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' }, {
{ name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' }, name: "I",
{ name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' }, category: "entity",
granularity: "concrete",
role: "subject",
},
{
name: "TypeScript",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "2025",
category: "temporal",
granularity: "concrete",
role: "time",
},
], ],
}; };
}, },
}; };
const fact = await db.ingestStatement('I have worked with TypeScript since 2025.', { const fact = await db.ingestStatement(
"I have worked with TypeScript since 2025.",
{
extractor, extractor,
}); },
);
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'TypeScript', '2025']); expect(fact.topics.map((topic) => topic.name)).toEqual([
"I",
"TypeScript",
"2025",
]);
const linkedFacts = await db.getTopicFactsLinkedTo('TypeScript', '2025'); const linkedFacts = await db.getTopicFactsLinkedTo("TypeScript", "2025");
expect(linkedFacts).toHaveLength(1); expect(linkedFacts).toHaveLength(1);
expect(linkedFacts[0]?.statement).toBe('I have worked with TypeScript since 2025.'); expect(linkedFacts[0]?.statement).toBe(
"I have worked with TypeScript since 2025.",
);
}); });
it('ships a deterministic naive extractor for local usage', async () => { it("ships a deterministic naive extractor for local usage", async () => {
const fact = await db.ingestStatement('I have worked with TypeScript since 2025.', { const fact = await db.ingestStatement(
"I have worked with TypeScript since 2025.",
{
extractor: new NaiveExtractor(), extractor: new NaiveExtractor(),
}); },
);
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'TypeScript', '2025']); expect(fact.topics.map((topic) => topic.name)).toEqual([
"I",
"TypeScript",
"2025",
]);
const topic = await db.getTopicByName('TypeScript', { includeFacts: true }); const topic = await db.getTopicByName("TypeScript", { includeFacts: true });
expect(topic?.facts).toHaveLength(1); expect(topic?.facts).toHaveLength(1);
}); });
it('ships an LLM extractor adapter that turns structured JSON responses into facts', async () => { it("ships an LLM extractor adapter that returns structured facts from the model", async () => {
let prompt = ''; let prompt: LlmTextGenerationModelInput | undefined = undefined;
const extractor = new LlmFactExtractor({ const extractor = new LlmFactExtractor({
model: { model: {
async generateText(input) { async generateText(input) {
prompt = input; prompt = input;
return JSON.stringify({ return {
statement: 'I have worked with Bun and TypeScript since 2025.', statement: "I have worked with Bun and TypeScript since 2025.",
summary: 'The speaker has Bun and TypeScript experience.', summary: "The speaker has Bun and TypeScript experience.",
source: 'chat', source: "chat",
confidence: 0.91, confidence: 0.91,
metadata: { channel: 'telegram' }, metadata: { channel: "telegram" },
topics: [ topics: [
{ name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' }, {
{ name: 'Bun', category: 'entity', granularity: 'concrete', role: 'object' }, name: "I",
{ name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' }, category: "entity",
{ name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' }, granularity: "concrete",
role: "subject",
},
{
name: "Bun",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "TypeScript",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "2025",
category: "temporal",
granularity: "concrete",
role: "time",
},
], ],
}); };
}, },
}, },
instructions: 'Prefer technology and time topics.', additionalInstructions: "Prefer technology and time topics.",
}); });
const fact = await db.ingestStatement('I have worked with Bun and TypeScript since 2025.', { const fact = await db.ingestStatement(
"I have worked with Bun and TypeScript since 2025.",
{
extractor, extractor,
}); },
);
expect(prompt).toContain('Prefer technology and time topics.'); expect(prompt).toEqual({
expect(prompt).toContain('I have worked with Bun and TypeScript since 2025.'); instruction: expect.stringContaining("Extract one structured fact from the user input."),
expect(fact.summary).toBe('The speaker has Bun and TypeScript experience.'); input: "I have worked with Bun and TypeScript since 2025.",
expect(fact.source).toBe('chat'); additionalInstruction: "Prefer technology and time topics.",
});
expect(fact.summary).toBe("The speaker has Bun and TypeScript experience.");
expect(fact.source).toBe("chat");
expect(fact.confidence).toBe(0.91); expect(fact.confidence).toBe(0.91);
expect(fact.metadata).toEqual({ channel: 'telegram' }); expect(fact.metadata).toEqual({ channel: "telegram" });
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'Bun', 'TypeScript', '2025']); expect(fact.topics.map((topic) => topic.name)).toEqual([
}); "I",
"Bun",
it('parses JSON responses wrapped in markdown code fences', async () => { "TypeScript",
const extractor = new LlmFactExtractor({ "2025",
model: { ]);
async generateText() {
return [
'Here is the extracted fact:',
'```json',
JSON.stringify({
statement: 'Bun powers TypeScript tooling.',
topics: [
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
],
}),
'```',
].join('\n');
},
},
});
const fact = await db.ingestStatement('Bun powers TypeScript tooling.', {
extractor,
});
expect(fact.topics.map((topic) => topic.name)).toEqual(['Bun', 'TypeScript']);
});
it('rejects invalid LLM responses before writing facts', async () => {
const extractor = new LlmFactExtractor({
model: {
async generateText() {
return 'not json at all';
},
},
});
await expect(
db.ingestStatement('Bun powers TypeScript tooling.', {
extractor,
}),
).rejects.toThrow('LLM extractor returned invalid JSON.');
}); });
}); });