7 Commits

Author SHA1 Message Date
1172c63db7 v0.4.0
All checks were successful
npm release / verify (push) Successful in 12s
npm release / publish to npm (push) Successful in 11s
2026-05-19 22:30:27 +09:00
0e595e6f60 test: update test of LlmExtractor 2026-05-19 22:28:09 +09:00
518264c467 v0.3.1
Some checks failed
npm release / verify (push) Failing after 9s
npm release / publish to npm (push) Has been skipped
2026-05-19 22:19:30 +09:00
cc8b3dfb14 vv0.3.1 2026-05-19 22:18:51 +09:00
56e17dab49 feat: make extract input structured 2026-05-19 22:18:42 +09:00
cc2e9110cc v0.3.0
All checks were successful
npm release / verify (push) Successful in 13s
npm release / publish to npm (push) Successful in 10s
2026-05-19 22:07:06 +09:00
0480ea182f refactor: make generateText model return ExtractedFact 2026-05-19 22:06:54 +09:00
4 changed files with 127 additions and 378 deletions

View File

@@ -1,6 +1,6 @@
{
"name": "identitydb",
"version": "0.2.2",
"version": "0.4.0",
"description": "TypeScript memory graph database wrapper for topics, facts, and AI-assisted ingestion.",
"license": "MIT",
"type": "module",

View File

@@ -1,5 +1,3 @@
import { IdentityDBError } from "../core/errors";
import type { TopicCategory, TopicGranularity } from "../types/domain";
import type {
ExtractedFact,
FactExtractor,
@@ -17,286 +15,10 @@ export class LlmFactExtractor implements FactExtractor {
constructor(private readonly options: LlmFactExtractorOptions) {}
async extract(input: string): Promise<ExtractedFact> {
const prompt = this.buildPrompt(input);
const response = await this.options.model.generateText(prompt);
return parseLlmExtractedFactResponse(response);
}
private buildPrompt(input: string): string {
if (this.options.promptBuilder) {
return this.options.promptBuilder(input, this.options.instructions);
}
const instructions = this.options.instructions?.trim();
return [
DEFAULT_INSTRUCTIONS,
instructions && instructions.length > 0
? `Additional instructions:\n${instructions}`
: null,
`Input:\n${input.trim()}`,
]
.filter((value): value is string => value !== null)
.join("\n\n");
return this.options.model.generateText({
instruction: DEFAULT_INSTRUCTIONS,
input,
additionalInstruction: this.options.additionalInstructions,
});
}
}
/**
 * Turns the raw text produced by an LLM into a validated `ExtractedFact`.
 *
 * The text is decoded through `parseJsonCandidate` and must yield a plain
 * JSON object. `topics` is always present on the result; `statement`,
 * `summary`, `source`, `confidence` and `metadata` are copied over only when
 * the payload defines them.
 *
 * @param response - Raw model output.
 * @returns The validated fact.
 * @throws IdentityDBError when the decoded payload is not a JSON object.
 *   Errors raised by the individual field validators propagate unchanged.
 */
export function parseLlmExtractedFactResponse(response: string): ExtractedFact {
  const decoded = parseJsonCandidate(response);
  if (!isRecord(decoded)) {
    throw new IdentityDBError("LLM extractor response must be a JSON object.");
  }

  // Validate in a fixed order so the first offending field decides the error.
  const topics = parseTopics(decoded.topics);
  const statement = optionalString(decoded.statement);
  const summary = optionalNullableString(decoded.summary);
  const source = optionalNullableString(decoded.source);
  const confidence = optionalNullableNumber(decoded.confidence);
  const metadata = optionalMetadata(decoded.metadata);

  // Absent fields are left off the result entirely rather than set to undefined.
  return {
    topics,
    ...(statement === undefined ? {} : { statement }),
    ...(summary === undefined ? {} : { summary }),
    ...(source === undefined ? {} : { source }),
    ...(confidence === undefined ? {} : { confidence }),
    ...(metadata === undefined ? {} : { metadata }),
  };
}
/**
 * Decodes the first JSON-parsable candidate found in an LLM response.
 *
 * Candidates come from `collectJsonCandidates` (applied to the trimmed
 * response) and are tried in the order that helper returns them.
 *
 * @param response - Raw model output.
 * @returns The value produced by `JSON.parse` for the first candidate that parses.
 * @throws IdentityDBError when no candidate is valid JSON.
 */
function parseJsonCandidate(response: string): unknown {
  for (const text of collectJsonCandidates(response.trim())) {
    let decoded: unknown;
    try {
      decoded = JSON.parse(text);
    } catch {
      // Not valid JSON; fall through to the next candidate.
      continue;
    }
    return decoded;
  }

  throw new IdentityDBError("LLM extractor returned invalid JSON.");
}
/**
 * Lists the substrings of an LLM response that might be a JSON document.
 *
 * Candidates, in order and without duplicates:
 *  1. the response exactly as given;
 *  2. the trimmed, non-empty content of every triple-backtick fence
 *     (optionally tagged `json`, case-insensitive);
 *  3. the span from the first `{` to the last `}` when both exist in that order.
 *
 * @param response - Model output to scan.
 * @returns Candidate strings in insertion order.
 */
function collectJsonCandidates(response: string): string[] {
  // A Set keeps insertion order while dropping repeated candidates.
  const found = new Set<string>([response]);

  for (const fence of response.matchAll(/```(?:json)?\s*([\s\S]*?)```/gi)) {
    const inner = fence[1]?.trim();
    if (inner) {
      found.add(inner);
    }
  }

  const open = response.indexOf("{");
  const close = response.lastIndexOf("}");
  if (open >= 0 && close > open) {
    found.add(response.slice(open, close + 1));
  }

  return [...found];
}
/**
 * Validates the `topics` field of an LLM payload.
 *
 * @param value - The raw `topics` value taken from the decoded payload.
 * @returns One validated topic per array entry, in the same order.
 * @throws IdentityDBError when `value` is not an array. Errors from
 *   `parseTopic` for individual entries propagate unchanged.
 */
function parseTopics(value: unknown): ExtractedFact["topics"] {
  if (Array.isArray(value)) {
    return value.map((item: unknown) => parseTopic(item));
  }

  throw new IdentityDBError(
    "LLM extractor response must include a topics array.",
  );
}
/**
 * Validates a single entry of the `topics` array.
 *
 * `name` is required and is stored trimmed. `category`, `granularity`,
 * `role`, `description` and `metadata` are copied onto the result only when
 * the entry defines them.
 *
 * @param value - One raw element of the payload's `topics` array.
 * @returns The validated topic.
 * @throws IdentityDBError when the entry is not a JSON object or its name is
 *   missing or empty after trimming. Errors raised by the field validators
 *   propagate unchanged.
 */
function parseTopic(value: unknown): ExtractedFact["topics"][number] {
  if (!isRecord(value)) {
    throw new IdentityDBError("LLM extractor topics must be JSON objects.");
  }

  const name = optionalString(value.name)?.trim();
  if (!name) {
    throw new IdentityDBError(
      "LLM extractor topics must include a non-empty name.",
    );
  }

  // Validate in a fixed order so the first offending field decides the error.
  const category = optionalTopicCategory(value.category);
  const granularity = optionalTopicGranularity(value.granularity);
  const role = optionalNullableString(value.role);
  const description = optionalNullableString(value.description);
  const metadata = optionalMetadata(value.metadata);

  // Absent fields are left off the topic entirely rather than set to undefined.
  return {
    name,
    ...(category === undefined ? {} : { category }),
    ...(granularity === undefined ? {} : { granularity }),
    ...(role === undefined ? {} : { role }),
    ...(description === undefined ? {} : { description }),
    ...(metadata === undefined ? {} : { metadata }),
  };
}
/**
 * Accepts an optional string field.
 *
 * @param value - Raw field value.
 * @returns The string itself, or `undefined` when the field is absent.
 * @throws IdentityDBError for any other value, including `null`.
 */
function optionalString(value: unknown): string | undefined {
  if (typeof value === "string") {
    return value;
  }
  if (value === undefined) {
    return undefined;
  }

  throw new IdentityDBError("LLM extractor expected a string field.");
}
/**
 * Accepts an optional string field that may also be explicitly `null`.
 *
 * @param value - Raw field value.
 * @returns The string itself, `null` when the field is `null`, or
 *   `undefined` when the field is absent.
 * @throws IdentityDBError for any other value.
 */
function optionalNullableString(value: unknown): string | null | undefined {
  if (typeof value === "string") {
    return value;
  }
  if (value === null) {
    return null;
  }
  if (value === undefined) {
    return undefined;
  }

  throw new IdentityDBError(
    "LLM extractor expected a nullable string field.",
  );
}
/**
 * Accepts an optional numeric field that may also be explicitly `null`.
 *
 * @param value - Raw field value.
 * @returns The number itself, `null` when the field is `null`, or
 *   `undefined` when the field is absent.
 * @throws IdentityDBError when the value is not a number, or is `NaN`,
 *   `Infinity` or `-Infinity`.
 */
function optionalNullableNumber(value: unknown): number | null | undefined {
  if (value === undefined) {
    return undefined;
  }

  if (value === null) {
    return null;
  }

  // Reject every non-finite number, not only NaN: JSON.parse turns an
  // overflowing literal such as `1e999` into Infinity, which is not a usable
  // confidence value.
  if (typeof value !== "number" || !Number.isFinite(value)) {
    throw new IdentityDBError(
      "LLM extractor expected confidence to be a number or null.",
    );
  }

  return value;
}
/**
 * Accepts an optional metadata field.
 *
 * @param value - Raw field value.
 * @returns `undefined` when the field is absent, `null` when it is `null`,
 *   otherwise the value itself once `isJsonLike` has accepted it.
 * @throws IdentityDBError when `isJsonLike` rejects the value.
 */
function optionalMetadata(
  value: unknown,
): ExtractedFact["metadata"] | undefined {
  if (value === null) {
    return null;
  }
  if (value === undefined) {
    return undefined;
  }

  if (isJsonLike(value)) {
    // isJsonLike is a plain boolean check, so the cast is what narrows the type.
    return value as ExtractedFact["metadata"];
  }

  throw new IdentityDBError(
    "LLM extractor metadata must be valid JSON-compatible data.",
  );
}
/**
 * Accepts an optional topic category.
 *
 * @param value - Raw field value.
 * @returns `undefined` when the field is absent, otherwise the value when it
 *   is one of `"entity"`, `"concept"`, `"temporal"` or `"custom"`.
 * @throws IdentityDBError for any other value, including `null`.
 */
function optionalTopicCategory(value: unknown): TopicCategory | undefined {
  switch (value) {
    case undefined:
      return undefined;
    case "entity":
    case "concept":
    case "temporal":
    case "custom":
      return value;
    default:
      throw new IdentityDBError(
        "LLM extractor returned an unsupported topic category.",
      );
  }
}
/**
 * Accepts an optional topic granularity.
 *
 * @param value - Raw field value.
 * @returns `undefined` when the field is absent, otherwise the value when it
 *   is one of `"abstract"`, `"concrete"` or `"mixed"`.
 * @throws IdentityDBError for any other value, including `null`.
 */
function optionalTopicGranularity(
  value: unknown,
): TopicGranularity | undefined {
  switch (value) {
    case undefined:
      return undefined;
    case "abstract":
    case "concrete":
    case "mixed":
      return value;
    default:
      throw new IdentityDBError(
        "LLM extractor returned an unsupported topic granularity.",
      );
  }
}
/**
 * Type guard for non-null, non-array objects.
 *
 * @param value - Any value.
 * @returns `true` when `value` has `typeof` `"object"`, is not `null` and is
 *   not an array.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
}
/**
 * Recursively checks that a value consists only of JSON-representable data.
 *
 * Accepted: `null`, strings, booleans, finite numbers, arrays whose entries
 * are all JSON-like, and records (per `isRecord`) whose values are all
 * JSON-like.
 *
 * @param value - Any value.
 * @returns `true` when every part of `value` is JSON-like, `false` otherwise.
 */
function isJsonLike(value: unknown): boolean {
  if (value === null) {
    return true;
  }

  if (typeof value === "string" || typeof value === "boolean") {
    return true;
  }

  if (typeof value === "number") {
    // NaN and +/-Infinity have no JSON representation (JSON.stringify emits
    // `null` for them), so they are not JSON-compatible.
    return Number.isFinite(value);
  }

  if (Array.isArray(value)) {
    return value.every((entry) => isJsonLike(entry));
  }

  if (isRecord(value)) {
    return Object.values(value).every((entry) => isJsonLike(entry));
  }

  // undefined, functions, symbols and bigints end up here.
  return false;
}

View File

@@ -2,14 +2,14 @@ import type {
AddFactInput,
EmbeddingProvider,
TopicLinkInput,
} from '../types/api';
} from "../types/api";
export interface ExtractedFact {
statement?: string;
summary?: string | null;
source?: string | null;
confidence?: number | null;
metadata?: AddFactInput['metadata'];
metadata?: AddFactInput["metadata"];
topics: TopicLinkInput[];
}
@@ -17,14 +17,19 @@ export interface FactExtractor {
extract(input: string): Promise<ExtractedFact>;
}
/**
 * Structured prompt object accepted by `LlmTextGenerationModel.generateText`.
 */
export interface LlmTextGenerationModelInput {
/** Base instruction text for the model. */
instruction: string;
/** The input text the model is asked to process. */
input: string;
/** Optional extra instruction; may be omitted or explicitly `undefined`. */
additionalInstruction?: string | undefined;
}
export interface LlmTextGenerationModel {
generateText(prompt: string): Promise<string>;
generateText(prompt: LlmTextGenerationModelInput): Promise<ExtractedFact>;
}
export interface LlmFactExtractorOptions {
model: LlmTextGenerationModel;
instructions?: string;
promptBuilder?: (input: string, instructions?: string) => string;
additionalInstructions?: string | undefined;
}
export interface IngestStatementOptions {

View File

@@ -1,15 +1,18 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { IdentityDB } from '../src/core/identity-db';
import { LlmFactExtractor } from '../src/ingestion/llm-extractor';
import { NaiveExtractor } from '../src/ingestion/naive-extractor';
import type { FactExtractor } from '../src/ingestion/types';
import { IdentityDB } from "../src/core/identity-db";
import { LlmFactExtractor } from "../src/ingestion/llm-extractor";
import { NaiveExtractor } from "../src/ingestion/naive-extractor";
import type {
FactExtractor,
LlmTextGenerationModelInput,
} from "../src/ingestion/types";
describe('IdentityDB ingestion', () => {
describe("IdentityDB ingestion", () => {
let db: IdentityDB;
beforeEach(async () => {
db = await IdentityDB.connect({ client: 'sqlite', filename: ':memory:' });
db = await IdentityDB.connect({ client: "sqlite", filename: ":memory:" });
await db.initialize();
});
@@ -17,121 +20,140 @@ describe('IdentityDB ingestion', () => {
await db.close();
});
it('ingests a statement using a provided extractor', async () => {
it("ingests a statement using a provided extractor", async () => {
const extractor: FactExtractor = {
async extract(input) {
return {
statement: input,
topics: [
{ name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
{ name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
{ name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
{
name: "I",
category: "entity",
granularity: "concrete",
role: "subject",
},
{
name: "TypeScript",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "2025",
category: "temporal",
granularity: "concrete",
role: "time",
},
],
};
},
};
const fact = await db.ingestStatement('I have worked with TypeScript since 2025.', {
extractor,
});
const fact = await db.ingestStatement(
"I have worked with TypeScript since 2025.",
{
extractor,
},
);
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'TypeScript', '2025']);
expect(fact.topics.map((topic) => topic.name)).toEqual([
"I",
"TypeScript",
"2025",
]);
const linkedFacts = await db.getTopicFactsLinkedTo('TypeScript', '2025');
const linkedFacts = await db.getTopicFactsLinkedTo("TypeScript", "2025");
expect(linkedFacts).toHaveLength(1);
expect(linkedFacts[0]?.statement).toBe('I have worked with TypeScript since 2025.');
expect(linkedFacts[0]?.statement).toBe(
"I have worked with TypeScript since 2025.",
);
});
it('ships a deterministic naive extractor for local usage', async () => {
const fact = await db.ingestStatement('I have worked with TypeScript since 2025.', {
extractor: new NaiveExtractor(),
});
it("ships a deterministic naive extractor for local usage", async () => {
const fact = await db.ingestStatement(
"I have worked with TypeScript since 2025.",
{
extractor: new NaiveExtractor(),
},
);
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'TypeScript', '2025']);
expect(fact.topics.map((topic) => topic.name)).toEqual([
"I",
"TypeScript",
"2025",
]);
const topic = await db.getTopicByName('TypeScript', { includeFacts: true });
const topic = await db.getTopicByName("TypeScript", { includeFacts: true });
expect(topic?.facts).toHaveLength(1);
});
it('ships an LLM extractor adapter that turns structured JSON responses into facts', async () => {
let prompt = '';
it("ships an LLM extractor adapter that returns structured facts from the model", async () => {
let prompt: LlmTextGenerationModelInput | undefined = undefined;
const extractor = new LlmFactExtractor({
model: {
async generateText(input) {
prompt = input;
return JSON.stringify({
statement: 'I have worked with Bun and TypeScript since 2025.',
summary: 'The speaker has Bun and TypeScript experience.',
source: 'chat',
return {
statement: "I have worked with Bun and TypeScript since 2025.",
summary: "The speaker has Bun and TypeScript experience.",
source: "chat",
confidence: 0.91,
metadata: { channel: 'telegram' },
metadata: { channel: "telegram" },
topics: [
{ name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
{ name: 'Bun', category: 'entity', granularity: 'concrete', role: 'object' },
{ name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
{ name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
{
name: "I",
category: "entity",
granularity: "concrete",
role: "subject",
},
{
name: "Bun",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "TypeScript",
category: "entity",
granularity: "concrete",
role: "object",
},
{
name: "2025",
category: "temporal",
granularity: "concrete",
role: "time",
},
],
});
};
},
},
instructions: 'Prefer technology and time topics.',
additionalInstructions: "Prefer technology and time topics.",
});
const fact = await db.ingestStatement('I have worked with Bun and TypeScript since 2025.', {
extractor,
});
expect(prompt).toContain('Prefer technology and time topics.');
expect(prompt).toContain('I have worked with Bun and TypeScript since 2025.');
expect(fact.summary).toBe('The speaker has Bun and TypeScript experience.');
expect(fact.source).toBe('chat');
expect(fact.confidence).toBe(0.91);
expect(fact.metadata).toEqual({ channel: 'telegram' });
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'Bun', 'TypeScript', '2025']);
});
it('parses JSON responses wrapped in markdown code fences', async () => {
const extractor = new LlmFactExtractor({
model: {
async generateText() {
return [
'Here is the extracted fact:',
'```json',
JSON.stringify({
statement: 'Bun powers TypeScript tooling.',
topics: [
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
],
}),
'```',
].join('\n');
},
},
});
const fact = await db.ingestStatement('Bun powers TypeScript tooling.', {
extractor,
});
expect(fact.topics.map((topic) => topic.name)).toEqual(['Bun', 'TypeScript']);
});
it('rejects invalid LLM responses before writing facts', async () => {
const extractor = new LlmFactExtractor({
model: {
async generateText() {
return 'not json at all';
},
},
});
await expect(
db.ingestStatement('Bun powers TypeScript tooling.', {
const fact = await db.ingestStatement(
"I have worked with Bun and TypeScript since 2025.",
{
extractor,
}),
).rejects.toThrow('LLM extractor returned invalid JSON.');
},
);
expect(prompt).toEqual({
instruction: expect.stringContaining("Extract one structured fact from the user input."),
input: "I have worked with Bun and TypeScript since 2025.",
additionalInstruction: "Prefer technology and time topics.",
});
expect(fact.summary).toBe("The speaker has Bun and TypeScript experience.");
expect(fact.source).toBe("chat");
expect(fact.confidence).toBe(0.91);
expect(fact.metadata).toEqual({ channel: "telegram" });
expect(fact.topics.map((topic) => topic.name)).toEqual([
"I",
"Bun",
"TypeScript",
"2025",
]);
});
});