// IdentityDB/tests/ingestion.test.ts

import { afterEach, beforeEach, describe, expect, it } from "vitest";

import { IdentityDB } from "../src/core/identity-db";
import { LlmFactExtractor } from "../src/ingestion/llm-extractor";
import { NaiveExtractor } from "../src/ingestion/naive-extractor";
import type {
  FactExtractor,
  LlmTextGenerationModelInput,
} from "../src/ingestion/types";

describe("IdentityDB ingestion", () => {
  // Database handle shared by the tests below; reassigned before each test.
  let db: IdentityDB;

  // Connect with the "sqlite" client and the ":memory:" filename, then
  // initialize the instance so every test starts from a fresh handle.
  beforeEach(async () => {
    const connection = await IdentityDB.connect({
      client: "sqlite",
      filename: ":memory:",
    });

    db = connection;
    await connection.initialize();
  });

  // Close the handle opened for the test that just ran.
  afterEach(() => db.close());

  // Ingesting with a caller-supplied extractor: the returned fact should carry
  // the extractor's topics in order, and the fact should be retrievable through
  // the "TypeScript" / "2025" topic link.
  it("ingests a statement using a provided extractor", async () => {
    const statement = "I have worked with TypeScript since 2025.";

    // Stub extractor: echoes its input as the statement and always reports the
    // same three topics (subject, object, time).
    const extractor: FactExtractor = {
      extract: async (input) => ({
        statement: input,
        topics: [
          { name: "I", category: "entity", granularity: "concrete", role: "subject" },
          { name: "TypeScript", category: "entity", granularity: "concrete", role: "object" },
          { name: "2025", category: "temporal", granularity: "concrete", role: "time" },
        ],
      }),
    };

    const fact = await db.ingestStatement(statement, { extractor });

    const topicNames = fact.topics.map(({ name }) => name);
    expect(topicNames).toEqual(["I", "TypeScript", "2025"]);

    // Exactly one fact should be linked to both topics, and it should be the
    // statement that was just ingested.
    const linkedFacts = await db.getTopicFactsLinkedTo("TypeScript", "2025");
    expect(linkedFacts).toHaveLength(1);
    expect(linkedFacts[0]?.statement).toBe(statement);
  });

  // Ingesting with the bundled NaiveExtractor: expects the topics "I",
  // "TypeScript" and "2025" (in that order) and one fact attached to the
  // "TypeScript" topic.
  it("ships a deterministic naive extractor for local usage", async () => {
    const naiveExtractor = new NaiveExtractor();
    const statement = "I have worked with TypeScript since 2025.";

    const { topics } = await db.ingestStatement(statement, {
      extractor: naiveExtractor,
    });
    expect(topics.map(({ name }) => name)).toEqual(["I", "TypeScript", "2025"]);

    // Look the topic up again with its facts included to confirm the link.
    const typeScriptTopic = await db.getTopicByName("TypeScript", {
      includeFacts: true,
    });
    expect(typeScriptTopic?.facts).toHaveLength(1);
  });

  // Ingesting through LlmFactExtractor with a fake model: checks the prompt the
  // model received and that the model's canned fields (summary, source,
  // confidence, metadata, topics) appear on the ingested fact.
  it("ships an LLM extractor adapter that returns structured facts from the model", async () => {
    const statement = "I have worked with Bun and TypeScript since 2025.";
    const extraGuidance = "Prefer technology and time topics.";

    // Most recent input handed to the fake model; stays undefined if the model
    // is never invoked.
    let capturedInput: LlmTextGenerationModelInput | undefined;

    const extractor = new LlmFactExtractor({
      model: {
        // Fake model: remembers what it was asked and answers with a fixed fact.
        generateText: async (input) => {
          capturedInput = input;

          return {
            statement,
            summary: "The speaker has Bun and TypeScript experience.",
            source: "chat",
            confidence: 0.91,
            metadata: { channel: "telegram" },
            topics: [
              { name: "I", category: "entity", granularity: "concrete", role: "subject" },
              { name: "Bun", category: "entity", granularity: "concrete", role: "object" },
              { name: "TypeScript", category: "entity", granularity: "concrete", role: "object" },
              { name: "2025", category: "temporal", granularity: "concrete", role: "time" },
            ],
          };
        },
      },
      additionalInstructions: extraGuidance,
    });

    const fact = await db.ingestStatement(statement, { extractor });

    // NOTE(review): the constructor option is `additionalInstructions` (plural)
    // while the expected prompt key is `additionalInstruction` (singular) —
    // confirm against LlmTextGenerationModelInput.
    expect(capturedInput).toEqual({
      instruction: expect.stringContaining(
        "Extract one structured fact from the user input.",
      ),
      input: statement,
      additionalInstruction: extraGuidance,
    });

    expect(fact.summary).toBe("The speaker has Bun and TypeScript experience.");
    expect(fact.source).toBe("chat");
    expect(fact.confidence).toBe(0.91);
    expect(fact.metadata).toEqual({ channel: "telegram" });

    const topicNames = fact.topics.map(({ name }) => name);
    expect(topicNames).toEqual(["I", "Bun", "TypeScript", "2025"]);
  });
});