Compare commits

...

9 Commits

Author SHA1 Message Date
283f91ed91 ci: configure npm auth for release publish
All checks were successful
npm release / verify (push) Successful in 13s
npm release / publish to npm (push) Successful in 12s
2026-05-11 13:59:29 +09:00
5991e4f1f0 ci: run Gitea release steps with bash
Some checks failed
npm release / verify (push) Successful in 13s
npm release / publish to npm (push) Failing after 11s
2026-05-11 13:57:45 +09:00
0dc657c97b ci: make Gitea release workflow self-contained
Some checks failed
npm release / verify (push) Failing after 10s
npm release / publish to npm (push) Has been skipped
2026-05-11 13:56:39 +09:00
96d0568197 ci: set writable HOME for Gitea release workflow
Some checks failed
npm release / verify (push) Failing after 2s
npm release / publish to npm (push) Has been skipped
2026-05-11 13:53:04 +09:00
e8adccfbbf ci: add tag-gated npm release workflow
Some checks failed
npm release / verify (push) Failing after 13s
npm release / publish to npm (push) Has been skipped
2026-05-11 13:36:07 +09:00
1c82b63e7a docs: add IdentityDB wiki documentation plan 2026-05-11 12:27:12 +09:00
3e39d3bbd5 docs: document LLM extractor adapter usage 2026-05-11 12:19:58 +09:00
4f877a40fb feat: add provider-agnostic LLM extractor adapter 2026-05-11 12:19:50 +09:00
7a02621e40 docs: add LLM extractor adapter plan 2026-05-11 12:14:55 +09:00
8 changed files with 659 additions and 0 deletions

View File

@@ -0,0 +1,117 @@
# Tag-gated npm release pipeline: a "verify" job (tests/checks/build) gates a
# "publish to npm" job. Both jobs clone the tagged source manually instead of
# using a checkout action, so the workflow is self-contained on Gitea runners.
name: npm release
on:
push:
tags:
# Accept both v-prefixed (v1.2.3) and bare (1.2.3) version tags.
- 'v*'
- '[0-9]*'
permissions:
contents: read
defaults:
run:
shell: bash
jobs:
verify:
name: verify
runs-on: ubuntu-latest
container:
image: node:20-bookworm
timeout-minutes: 20
steps:
# Install git + bun inside the bare node container (pinned bun version).
- name: Install release tools
run: |
set -euo pipefail
apt-get update
apt-get install -y git curl ca-certificates
curl -fsSL https://bun.sh/install | bash -s -- bun-v1.3.13
install -m 0755 /root/.bun/bin/bun /usr/local/bin/bun
node --version
npm --version
bun --version
# Shallow-clone exactly the pushed tag, authenticating with a Basic header
# built from the actor + GITEA_TOKEN secret.
- name: Clone tagged source
run: |
set -euo pipefail
REPO_URL="${{ gitea.server_url }}/${{ gitea.repository }}.git"
AUTH_HEADER="$(printf '%s' '${{ gitea.actor }}:${{ secrets.GITEA_TOKEN }}' | base64 -w0)"
git -c http.extraHeader="Authorization: Basic $AUTH_HEADER" clone --depth 1 --branch "${{ gitea.ref_name }}" "$REPO_URL" repo
git -C repo rev-parse HEAD
# Guard against publishing a package whose version does not match the tag
# (accepts tag "X.Y.Z" or "vX.Y.Z" for package version X.Y.Z).
- name: Verify release tag matches package version
working-directory: repo
shell: bash
run: |
set -euo pipefail
TAG_NAME="${{ gitea.ref_name }}"
PACKAGE_VERSION="$(node -p "require('./package.json').version")"
if [ "$TAG_NAME" = "v$PACKAGE_VERSION" ] || [ "$TAG_NAME" = "$PACKAGE_VERSION" ]; then
echo "Release tag $TAG_NAME matches package version $PACKAGE_VERSION"
exit 0
fi
echo "Tag $TAG_NAME does not match package.json version $PACKAGE_VERSION" >&2
exit 1
- name: Run verify pipeline
working-directory: repo
run: |
set -euo pipefail
bun install --frozen-lockfile
bun run test
bun run check
bun run build
release:
name: publish to npm
runs-on: ubuntu-latest
container:
image: node:20-bookworm
timeout-minutes: 20
# Publish only after verify succeeds.
needs:
- verify
steps:
# Same tool bootstrap as the verify job (jobs run in separate containers).
- name: Install release tools
run: |
set -euo pipefail
apt-get update
apt-get install -y git curl ca-certificates
curl -fsSL https://bun.sh/install | bash -s -- bun-v1.3.13
install -m 0755 /root/.bun/bin/bun /usr/local/bin/bun
node --version
npm --version
bun --version
- name: Clone tagged source
run: |
set -euo pipefail
REPO_URL="${{ gitea.server_url }}/${{ gitea.repository }}.git"
AUTH_HEADER="$(printf '%s' '${{ gitea.actor }}:${{ secrets.GITEA_TOKEN }}' | base64 -w0)"
git -c http.extraHeader="Authorization: Basic $AUTH_HEADER" clone --depth 1 --branch "${{ gitea.ref_name }}" "$REPO_URL" repo
git -C repo rev-parse HEAD
- name: Install dependencies
working-directory: repo
run: |
set -euo pipefail
bun install --frozen-lockfile
- name: Build package
working-directory: repo
run: |
set -euo pipefail
bun run build
# Write the npm auth token to ~/.npmrc just-in-time, then publish.
- name: Publish package to npm
working-directory: repo
env:
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
run: |
set -euo pipefail
printf '//registry.npmjs.org/:_authToken=%s\n' "$NODE_AUTH_TOKEN" > ~/.npmrc
npm publish

View File

@@ -113,6 +113,29 @@ await db.ingestStatement('Bun makes TypeScript tooling fast.', {
});
```
## LLM-backed extraction
You can bridge any text-generating model into IdentityDB by wrapping it with `LlmFactExtractor`.
```ts
import { LlmFactExtractor } from 'identitydb';
const extractor = new LlmFactExtractor({
model: {
async generateText(prompt) {
return callYourFavoriteLlm(prompt);
},
},
instructions: 'Prefer technology, product, and time topics over generic nouns.',
});
await db.ingestStatement('I have worked with Bun and TypeScript since 2025.', {
extractor,
});
```
The adapter expects the model to return JSON and will validate the structured response before IdentityDB writes a fact.
## Development
```bash

View File

@@ -0,0 +1,87 @@
# IdentityDB LLM Extractor Adapter Implementation Plan
> **For Hermes:** Use the `subagent-driven-development` skill to execute this plan task-by-task. Enforce strict TDD for every production behavior.
**Goal:** Add a provider-agnostic LLM-backed fact extractor adapter so callers can plug a small language model into IdentityDB ingestion without coupling the package to a specific SDK.
**Architecture:** Keep `FactExtractor` as the stable ingestion contract, then add an `LlmFactExtractor` adapter that delegates prompting and text generation to a narrow model interface. The adapter should build a deterministic JSON-only extraction prompt, parse structured JSON from the model response, validate the shape, and return `ExtractedFact` objects that flow through the existing ingestion validation path.
**Tech Stack:** TypeScript, Bun, Node.js, Kysely, Vitest, tsup.
---
## Scope and interpretation
- The new adapter must remain provider-agnostic and must not depend on OpenAI, Anthropic, or any other SDK.
- The adapter should accept a minimal language-model interface that returns text so package consumers can bridge any LLM client they want.
- Structured output must be validated in the adapter before returning it to `extractFact()`.
- The adapter should tolerate common model formatting noise such as fenced ```json blocks around the payload.
- Initial release should focus on correctness and predictable integration, not prompt-optimization or retries.
---
## Public API additions
```ts
const extractor = new LlmFactExtractor({
model: {
async generateText(prompt) {
return jsonStringFromSomeLlm(prompt);
},
},
});
const fact = await db.ingestStatement('I have worked with Bun and TypeScript since 2025.', {
extractor,
});
```
Optional customization:
```ts
const extractor = new LlmFactExtractor({
model,
instructions: 'Prefer product and technology topics over generic nouns.',
});
```
---
## Execution plan
### Task 1: Lock the adapter behavior with failing tests
**Objective:** Define the LLM adapter contract before implementation.
**Files:**
- Modify: `tests/ingestion.test.ts`
- Modify: `src/ingestion/types.ts`
- Modify: `src/index.ts`
**Verification:**
- Run focused ingestion tests and confirm they fail for the missing adapter behavior.
### Task 2: Implement the LLM adapter and response parsing
**Objective:** Add a reusable `LlmFactExtractor` implementation plus robust JSON extraction helpers.
**Files:**
- Create: `src/ingestion/llm-extractor.ts`
- Modify: `src/ingestion/types.ts`
- Modify: `src/ingestion/extractor.ts`
- Modify: `src/index.ts`
**Verification:**
- Run the focused ingestion tests until green.
### Task 3: Document the adapter and run the full suite
**Objective:** Expose the new adapter in docs and ensure the whole package still passes verification.
**Files:**
- Modify: `README.md`
- Modify: `src/index.ts`
**Verification:**
- Run `bun run test && bun run check && bun run build`
- Confirm the README shows how to bridge an arbitrary LLM client into the adapter.

View File

@@ -0,0 +1,65 @@
# IdentityDB Wiki Documentation Implementation Plan
> **For Hermes:** Execute this plan step-by-step. Prefer concrete repository inspection over assumptions, and verify the wiki remote after each major write.
**Goal:** Verify the IdentityDB wiki repository state, create or clone it as needed, and publish concrete wiki documentation covering the project's purpose, usage, and extractor choices including `NaiveExtractor`.
**Architecture:** Treat the Gitea wiki as a separate Git repository. First verify whether the wiki feature is enabled and whether the `.wiki.git` remote already exists. If the remote does not exist yet, bootstrap it with a minimal `Home.md`, then clone the wiki repo into a local working directory and author Markdown pages there. Keep the documentation practical, using the package README and current source files as the canonical content source.
**Tech Stack:** Gitea, tea CLI, Git, Markdown, Bun/TypeScript project docs.
---
## Execution plan
### Task 1: Inspect wiki availability and remote state
**Objective:** Confirm that the repository has wiki support enabled and determine whether the Git-backed wiki repo already exists.
**Files:**
- Inspect: `https://git.psw.kr/p-sw/IdentityDB`
- Read: `/home/hermes-agent/IdentityDB/README.md`
- Read: `/home/hermes-agent/IdentityDB/src/ingestion/naive-extractor.ts`
- Read: `/home/hermes-agent/IdentityDB/src/ingestion/llm-extractor.ts`
**Verification:**
- Check Gitea repo metadata for `has_wiki=true`.
- Check whether `https://git.psw.kr/p-sw/IdentityDB.wiki.git` is readable.
### Task 2: Bootstrap the wiki repo if missing
**Objective:** Create the Git-backed wiki repository if it has not been materialized yet.
**Files:**
- Create temporarily: `/home/hermes-agent/IdentityDB-wiki-bootstrap/Home.md`
**Verification:**
- Push a first commit to `https://git.psw.kr/p-sw/IdentityDB.wiki.git`.
- Confirm the remote becomes cloneable afterward.
### Task 3: Clone the wiki repo and author concrete pages
**Objective:** Write practical docs explaining why IdentityDB exists, how to use it, and where `NaiveExtractor` fits.
**Files:**
- Clone to: `/home/hermes-agent/IdentityDB.wiki`
- Create/modify: `/home/hermes-agent/IdentityDB.wiki/Home.md`
- Create/modify: `/home/hermes-agent/IdentityDB.wiki/Getting-Started.md`
- Create/modify: `/home/hermes-agent/IdentityDB.wiki/Extractors.md`
- Create/modify: `/home/hermes-agent/IdentityDB.wiki/_Sidebar.md`
**Verification:**
- Review the generated Markdown files locally.
- Ensure internal wiki links resolve by page name.
### Task 4: Commit, push, and verify the published wiki state
**Objective:** Publish the wiki docs and verify the remote history reflects the changes.
**Files:**
- Commit within: `/home/hermes-agent/IdentityDB.wiki`
**Verification:**
- Run `git status --short` and `git log --oneline -n 3` in the wiki repo.
- Push to the remote wiki repo.
- Confirm the wiki is cloneable and the latest commit is visible remotely.

View File

@@ -2,6 +2,7 @@ export * from './adapters';
export * from './core/identity-db';
export * from './core/migrations';
export * from './ingestion/extractor';
export * from './ingestion/llm-extractor';
export * from './ingestion/naive-extractor';
export * from './ingestion/types';
export * from './types/api';

View File

@@ -0,0 +1,273 @@
import { IdentityDBError } from '../core/errors';
import type { TopicCategory, TopicGranularity } from '../types/domain';
import type {
ExtractedFact,
FactExtractor,
LlmFactExtractorOptions,
} from './types';
// Base system prompt sent with every extraction request: demands a JSON-only
// reply and spells out the exact ExtractedFact wire shape the parser below
// expects. Caller-supplied instructions are appended after this block.
const DEFAULT_INSTRUCTIONS = [
  'Extract one structured fact from the user input.',
  'Return JSON only. Do not include markdown, explanations, or prose outside the JSON object.',
  'Use this shape: {"statement": string?, "summary": string|null, "source": string|null, "confidence": number|null, "metadata": object|null, "topics": Array<{"name": string, "category": "entity"|"concept"|"temporal"|"custom"?, "granularity": "abstract"|"concrete"|"mixed"?, "role": string|null, "description": string|null, "metadata": object|null}>}.',
  'Only include topics that are explicitly supported by the input.',
].join('\n');
/**
 * Provider-agnostic fact extractor: delegates text generation to a
 * caller-supplied model and validates the structured JSON it returns.
 */
export class LlmFactExtractor implements FactExtractor {
  constructor(private readonly options: LlmFactExtractorOptions) {}

  /**
   * Builds the extraction prompt, asks the model for a completion, and
   * parses/validates the response into an `ExtractedFact`.
   */
  async extract(input: string): Promise<ExtractedFact> {
    const response = await this.options.model.generateText(this.buildPrompt(input));
    return parseLlmExtractedFactResponse(response);
  }

  /** Assembles the prompt; a caller-provided promptBuilder takes precedence. */
  private buildPrompt(input: string): string {
    const { promptBuilder, instructions } = this.options;
    if (promptBuilder) {
      return promptBuilder(input, instructions);
    }
    const sections: string[] = [DEFAULT_INSTRUCTIONS];
    const extra = instructions?.trim();
    // Skip the "Additional instructions" section when instructions are
    // absent or whitespace-only.
    if (extra) {
      sections.push(`Additional instructions:\n${extra}`);
    }
    sections.push(`Input:\n${input.trim()}`);
    return sections.join('\n\n');
  }
}
/**
 * Parses a raw model response into a validated `ExtractedFact`.
 * Throws `IdentityDBError` when the payload is not a JSON object or when
 * any field fails its shape check.
 */
export function parseLlmExtractedFactResponse(response: string): ExtractedFact {
  const parsed = parseJsonCandidate(response);
  if (!isRecord(parsed)) {
    throw new IdentityDBError('LLM extractor response must be a JSON object.');
  }
  const result: ExtractedFact = { topics: parseTopics(parsed.topics) };
  // Validate each optional field in a fixed order; copy it onto the result
  // only when present so absent keys stay absent rather than becoming
  // explicit `undefined` properties.
  const statement = optionalString(parsed.statement);
  const summary = optionalNullableString(parsed.summary);
  const source = optionalNullableString(parsed.source);
  const confidence = optionalNullableNumber(parsed.confidence);
  const metadata = optionalMetadata(parsed.metadata);
  if (statement !== undefined) {
    result.statement = statement;
  }
  if (summary !== undefined) {
    result.summary = summary;
  }
  if (source !== undefined) {
    result.source = source;
  }
  if (confidence !== undefined) {
    result.confidence = confidence;
  }
  if (metadata !== undefined) {
    result.metadata = metadata;
  }
  return result;
}
/**
 * Tries each candidate substring in order and returns the first one that
 * `JSON.parse` accepts. Throws `IdentityDBError` when none parses.
 */
function parseJsonCandidate(response: string): unknown {
  for (const candidate of collectJsonCandidates(response.trim())) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(candidate);
    } catch {
      continue; // not JSON — move on to the next candidate
    }
    return parsed;
  }
  throw new IdentityDBError('LLM extractor returned invalid JSON.');
}
/**
 * Builds an ordered, de-duplicated list of substrings that might hold the
 * JSON payload: the full response, the body of each ``` / ```json fenced
 * block, and the outermost `{...}` slice.
 */
function collectJsonCandidates(response: string): string[] {
  const candidates = new Set<string>([response]);
  for (const match of response.matchAll(/```(?:json)?\s*([\s\S]*?)```/gi)) {
    const fenced = match[1]?.trim();
    if (fenced) {
      candidates.add(fenced);
    }
  }
  // Fallback: grab everything between the first "{" and the last "}".
  const open = response.indexOf('{');
  const close = response.lastIndexOf('}');
  if (open >= 0 && close > open) {
    candidates.add(response.slice(open, close + 1));
  }
  return [...candidates];
}
/** Requires a topics array and validates each entry into a topic object. */
function parseTopics(value: unknown): ExtractedFact['topics'] {
  if (!Array.isArray(value)) {
    throw new IdentityDBError('LLM extractor response must include a topics array.');
  }
  const topics: ExtractedFact['topics'] = [];
  for (const entry of value) {
    topics.push(parseTopic(entry));
  }
  return topics;
}
/**
 * Validates a single topic entry. `name` is required and must be non-empty
 * after trimming; every other field is validated and copied only when the
 * payload actually provides it.
 */
function parseTopic(value: unknown): ExtractedFact['topics'][number] {
  if (!isRecord(value)) {
    throw new IdentityDBError('LLM extractor topics must be JSON objects.');
  }
  const name = optionalString(value.name)?.trim();
  if (!name) {
    throw new IdentityDBError('LLM extractor topics must include a non-empty name.');
  }
  // Validate optional fields in a fixed order so error reporting is stable.
  const category = optionalTopicCategory(value.category);
  const granularity = optionalTopicGranularity(value.granularity);
  const role = optionalNullableString(value.role);
  const description = optionalNullableString(value.description);
  const metadata = optionalMetadata(value.metadata);
  // Conditional spreads keep absent fields truly absent on the result.
  return {
    name,
    ...(category !== undefined ? { category } : {}),
    ...(granularity !== undefined ? { granularity } : {}),
    ...(role !== undefined ? { role } : {}),
    ...(description !== undefined ? { description } : {}),
    ...(metadata !== undefined ? { metadata } : {}),
  };
}
/**
 * Returns `value` when it is a string, `undefined` when the field is
 * absent; throws `IdentityDBError` for any other type (including null).
 */
function optionalString(value: unknown): string | undefined {
  if (typeof value === 'string') {
    return value;
  }
  if (value === undefined) {
    return undefined;
  }
  throw new IdentityDBError('LLM extractor expected a string field.');
}
function optionalNullableString(value: unknown): string | null | undefined {
if (value === undefined) {
return undefined;
}
if (value === null) {
return null;
}
if (typeof value !== 'string') {
throw new IdentityDBError('LLM extractor expected a nullable string field.');
}
return value;
}
/**
 * Passes through a finite number or explicit null; returns `undefined`
 * when the field is absent. Throws `IdentityDBError` for any other type
 * and for non-finite values.
 *
 * Fix: the original only rejected NaN, so a model returning
 * `"confidence": Infinity`-style output (e.g. via a lenient parser or a
 * huge numeric literal overflowing to Infinity) slipped through.
 * `Number.isFinite` rejects NaN, Infinity, and -Infinity in one check,
 * which is the right contract for a confidence score.
 */
function optionalNullableNumber(value: unknown): number | null | undefined {
  if (value === undefined) {
    return undefined;
  }
  if (value === null) {
    return null;
  }
  // Number.isFinite subsumes the old Number.isNaN check.
  if (typeof value !== 'number' || !Number.isFinite(value)) {
    throw new IdentityDBError('LLM extractor expected confidence to be a number or null.');
  }
  return value;
}
/**
 * Passes metadata through when it is null or JSON-compatible; returns
 * `undefined` when the field is absent. Throws `IdentityDBError` for data
 * that could not survive a JSON round-trip.
 */
function optionalMetadata(value: unknown): ExtractedFact['metadata'] | undefined {
  if (value === undefined) {
    return undefined;
  }
  if (value === null) {
    return null;
  }
  if (isJsonLike(value)) {
    return value as ExtractedFact['metadata'];
  }
  throw new IdentityDBError('LLM extractor metadata must be valid JSON-compatible data.');
}
/**
 * Validates an optional topic category against the supported literals.
 * Returns `undefined` when absent; throws for unsupported values.
 */
function optionalTopicCategory(value: unknown): TopicCategory | undefined {
  if (value === undefined) {
    return undefined;
  }
  switch (value) {
    case 'entity':
    case 'concept':
    case 'temporal':
    case 'custom':
      return value;
    default:
      throw new IdentityDBError('LLM extractor returned an unsupported topic category.');
  }
}
/**
 * Validates an optional topic granularity against the supported literals.
 * Returns `undefined` when absent; throws for unsupported values.
 */
function optionalTopicGranularity(value: unknown): TopicGranularity | undefined {
  if (value === undefined) {
    return undefined;
  }
  switch (value) {
    case 'abstract':
    case 'concrete':
    case 'mixed':
      return value;
    default:
      throw new IdentityDBError('LLM extractor returned an unsupported topic granularity.');
  }
}
/** Type guard: true only for plain (non-null, non-array) objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (typeof value !== 'object' || value === null) {
    return false;
  }
  return !Array.isArray(value);
}
/**
 * Recursively checks that a value is JSON-representable: null, a string,
 * number, or boolean, an array of JSON-like values, or a plain object
 * whose values are all JSON-like. Everything else (functions, undefined,
 * symbols, class instances behave like plain objects here) fails.
 */
function isJsonLike(value: unknown): boolean {
  if (value === null) {
    return true;
  }
  switch (typeof value) {
    case 'string':
    case 'number':
    case 'boolean':
      return true;
    case 'object':
      if (Array.isArray(value)) {
        return value.every((entry) => isJsonLike(entry));
      }
      return Object.values(value as Record<string, unknown>).every((entry) => isJsonLike(entry));
    default:
      return false;
  }
}

View File

@@ -17,6 +17,16 @@ export interface FactExtractor {
extract(input: string): Promise<ExtractedFact>;
}
/**
 * Minimal, provider-agnostic surface an LLM client must expose: given a
 * prompt string, resolve to the model's raw text completion.
 */
export interface LlmTextGenerationModel {
  generateText(prompt: string): Promise<string>;
}

/** Configuration accepted by `LlmFactExtractor`. */
export interface LlmFactExtractorOptions {
  // Model used to generate the structured extraction response.
  model: LlmTextGenerationModel;
  // Extra guidance appended after the default extraction instructions.
  instructions?: string;
  // Full override for prompt construction; receives the raw input and the
  // configured instructions.
  promptBuilder?: (input: string, instructions?: string) => string;
}
export interface IngestStatementOptions {
extractor: FactExtractor;
embeddingProvider?: EmbeddingProvider;

View File

@@ -1,6 +1,7 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { IdentityDB } from '../src/core/identity-db';
import { LlmFactExtractor } from '../src/ingestion/llm-extractor';
import { NaiveExtractor } from '../src/ingestion/naive-extractor';
import type { FactExtractor } from '../src/ingestion/types';
@@ -51,4 +52,86 @@ describe('IdentityDB ingestion', () => {
const topic = await db.getTopicByName('TypeScript', { includeFacts: true });
expect(topic?.facts).toHaveLength(1);
});
// Happy-path adapter test: a fake model captures the prompt it receives and
// returns a fully-populated JSON fact. Asserts both that the prompt carries
// the custom instructions + input statement, and that every fact field and
// topic name round-trips into the ingested fact.
it('ships an LLM extractor adapter that turns structured JSON responses into facts', async () => {
let prompt = '';
const extractor = new LlmFactExtractor({
model: {
async generateText(input) {
// Record the prompt so the test can inspect what the adapter built.
prompt = input;
return JSON.stringify({
statement: 'I have worked with Bun and TypeScript since 2025.',
summary: 'The speaker has Bun and TypeScript experience.',
source: 'chat',
confidence: 0.91,
metadata: { channel: 'telegram' },
topics: [
{ name: 'I', category: 'entity', granularity: 'concrete', role: 'subject' },
{ name: 'Bun', category: 'entity', granularity: 'concrete', role: 'object' },
{ name: 'TypeScript', category: 'entity', granularity: 'concrete', role: 'object' },
{ name: '2025', category: 'temporal', granularity: 'concrete', role: 'time' },
],
});
},
},
instructions: 'Prefer technology and time topics.',
});
const fact = await db.ingestStatement('I have worked with Bun and TypeScript since 2025.', {
extractor,
});
// The built prompt must include both the custom instructions and the input.
expect(prompt).toContain('Prefer technology and time topics.');
expect(prompt).toContain('I have worked with Bun and TypeScript since 2025.');
expect(fact.summary).toBe('The speaker has Bun and TypeScript experience.');
expect(fact.source).toBe('chat');
expect(fact.confidence).toBe(0.91);
expect(fact.metadata).toEqual({ channel: 'telegram' });
expect(fact.topics.map((topic) => topic.name)).toEqual(['I', 'Bun', 'TypeScript', '2025']);
});
// Fence-tolerance test: the fake model wraps its JSON payload in prose plus
// a ```json code fence, and the adapter must still extract the fact.
it('parses JSON responses wrapped in markdown code fences', async () => {
const extractor = new LlmFactExtractor({
model: {
async generateText() {
// Simulate typical model formatting noise around the JSON payload.
return [
'Here is the extracted fact:',
'```json',
JSON.stringify({
statement: 'Bun powers TypeScript tooling.',
topics: [
{ name: 'Bun', category: 'entity', granularity: 'concrete' },
{ name: 'TypeScript', category: 'entity', granularity: 'concrete' },
],
}),
'```',
].join('\n');
},
},
});
const fact = await db.ingestStatement('Bun powers TypeScript tooling.', {
extractor,
});
expect(fact.topics.map((topic) => topic.name)).toEqual(['Bun', 'TypeScript']);
});
// Failure-path test: a model that returns non-JSON text must cause ingestion
// to reject with the adapter's invalid-JSON error before any fact is written.
it('rejects invalid LLM responses before writing facts', async () => {
const extractor = new LlmFactExtractor({
model: {
async generateText() {
return 'not json at all';
},
},
});
await expect(
db.ingestStatement('Bun powers TypeScript tooling.', {
extractor,
}),
).rejects.toThrow('LLM extractor returned invalid JSON.');
});
});