Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 49 additions & 2 deletions docker-compose.jobs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,60 @@ services:
networks:
- constructive-net

# Generate chunks function (text splitting + embeddings for knowledge tables).
# Container port 8080 is published on host port 8083.
generate-chunks:
  container_name: generate-chunks
  image: constructive:dev
  entrypoint: ["node", "functions/generate-chunks/dist/index.js"]
  environment:
    NODE_ENV: development
    LOG_LEVEL: info
    # Postgres connection (same DB where knowledge tables live)
    PGHOST: postgres
    PGPORT: "5432"
    PGUSER: postgres
    PGPASSWORD: password
    PGDATABASE: constructive
    # Embedding provider (optional — chunks work without embeddings).
    # Each value may be overridden from the host environment; the defaults
    # point at an Ollama endpoint on the Docker host.
    EMBEDDER_PROVIDER: "${EMBEDDER_PROVIDER:-ollama}"
    EMBEDDER_MODEL: "${EMBEDDER_MODEL:-nomic-embed-text}"
    EMBEDDER_BASE_URL: "${EMBEDDER_BASE_URL:-http://host.docker.internal:11434}"
  # host.docker.internal is only defined automatically by Docker Desktop.
  # Mapping it to the host gateway lets the default EMBEDDER_BASE_URL resolve
  # on Linux Docker Engine (20.10+) as well.
  extra_hosts:
    - "host.docker.internal:host-gateway"
  ports:
    - "8083:8080"
  networks:
    - constructive-net

# invoke-function: handler that resolves the schema, dispatches the call, and
# updates the invocation status.
invoke-function:
  image: constructive:dev
  container_name: invoke-function
  entrypoint:
    - node
    - functions/invoke-function/dist/index.js
  networks:
    - constructive-net
  ports:
    - "8084:8080"
  environment:
    # Runtime mode and log verbosity
    NODE_ENV: development
    LOG_LEVEL: info
    # Database access (reads the metaschema + function_module tables)
    PGHOST: postgres
    PGPORT: "5432"
    PGUSER: postgres
    PGPASSWORD: password
    PGDATABASE: constructive
    # Dispatch routing: default gateway URL plus a task-identifier -> URL map
    # (same scheme as knative-job-service)
    INTERNAL_GATEWAY_URL: "http://send-verification-link:8080"
    INTERNAL_GATEWAY_DEVELOPMENT_MAP: '{"email:send_verification_link":"http://send-verification-link:8080","email:send_email":"http://send-email:8080","chunk:generate_chunks":"http://generate-chunks:8080","generate_knowledge_chunks":"http://generate-chunks:8080"}'

# Jobs runtime: callback server + worker + scheduler
knative-job-service:
container_name: knative-job-service
image: constructive:dev
entrypoint: ["node", "jobs/knative-job-service/dist/run.js"]
depends_on:
- send-verification-link
- generate-chunks
- invoke-function
environment:
NODE_ENV: development

Expand All @@ -124,7 +171,7 @@ services:

# Worker configuration
JOBS_SUPPORT_ANY: "false"
JOBS_SUPPORTED: "email:send_verification_link,email:send_email"
JOBS_SUPPORTED: "email:send_verification_link,email:send_email,chunk:generate_chunks,function:invoke"
HOSTNAME: "knative-job-service-1"

# Callback HTTP server (job completion callbacks)
Expand All @@ -137,7 +184,7 @@ services:
INTERNAL_GATEWAY_URL: "http://send-verification-link:8080"

# Development-only map from task identifier -> function URL
INTERNAL_GATEWAY_DEVELOPMENT_MAP: '{"email:send_verification_link":"http://send-verification-link:8080","email:send_email":"http://send-email:8080"}'
INTERNAL_GATEWAY_DEVELOPMENT_MAP: '{"email:send_verification_link":"http://send-verification-link:8080","email:send_email":"http://send-email:8080","chunk:generate_chunks":"http://generate-chunks:8080","generate_knowledge_chunks":"http://generate-chunks:8080","function:invoke":"http://invoke-function:8080"}'

ports:
- "8080:8080"
Expand Down
15 changes: 15 additions & 0 deletions functions/generate-chunks/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
FROM node:18-alpine

WORKDIR /usr/src/app

# Install pnpm, then production dependencies only.
# --frozen-lockfile makes the build fail when pnpm-lock.yaml is out of sync
# with package.json instead of silently re-resolving versions, so image
# builds stay reproducible.
# NOTE(review): package.json declares workspace:^ dependencies; confirm they
# resolve when only package.json and pnpm-lock.yaml are copied into the image.
COPY package.json pnpm-lock.yaml ./
RUN npm install -g pnpm@9 && pnpm install --prod --frozen-lockfile

# Copy compiled code (run the build before building the image)
COPY dist ./dist

# Runtime defaults: production mode, HTTP server port 8080
ENV NODE_ENV=production
ENV PORT=8080

CMD ["node", "dist/index.js"]
49 changes: 49 additions & 0 deletions functions/generate-chunks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# @constructive-io/generate-chunks-fn

Knative function that splits text content into chunks, generates per-chunk embeddings, and inserts them into the chunks table.

## Job Payload

```json
{
  "table": "team_agent_knowledge",
  "schema": "dataroom_public",
  "id": "row-uuid",
  "chunks_table": "team_agent_knowledge_chunks",
  "chunk_size": "1000",
  "chunk_overlap": "200",
  "chunk_strategy": "paragraph"
}
```

## Chunking Strategies

- **fixed** — split at character boundaries
- **sentence** — split on sentence-ending punctuation
- **paragraph** — split on double-newline boundaries
- **semantic** — currently falls back to the paragraph strategy (model-based splitting is planned)

## Environment Variables

| Variable | Default | Description |
|----------|---------|-------------|
| `PGHOST` | `localhost` | PostgreSQL host |
| `PGPORT` | `5432` | PostgreSQL port |
| `PGUSER` | `postgres` | PostgreSQL user |
| `PGPASSWORD` | `password` | PostgreSQL password |
| `PGDATABASE` | `postgres` | PostgreSQL database |
| `EMBEDDER_PROVIDER` | `ollama` | Embedding provider |
| `EMBEDDER_MODEL` | `nomic-embed-text` | Embedding model |
| `EMBEDDER_BASE_URL` | `http://localhost:11434` | Embedding provider URL |
| `PORT` | `8080` | HTTP server port |

## Pipeline

```
INSERT into knowledge table
  → AFTER INSERT trigger fires
    → app_jobs.add_job('generate_knowledge_chunks', payload)
      → knative-job-worker picks up job
        → POST payload to this function
          → read content → split → embed → insert chunks
```
91 changes: 91 additions & 0 deletions functions/generate-chunks/__tests__/chunker.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import { splitText } from '../src/chunker';

/**
 * Unit tests for splitText(text, strategy, chunkSize, chunkOverlap).
 *
 * Covers empty input, the paragraph / sentence / fixed strategies, overlap
 * metadata, and the per-chunk char_count metadata.
 */
describe('splitText', () => {
  it('returns empty array for empty text', () => {
    expect(splitText('', 'paragraph', 1000, 200)).toEqual([]);
    expect(splitText(' ', 'paragraph', 1000, 200)).toEqual([]);
  });

  describe('paragraph strategy', () => {
    it('splits on double newlines', () => {
      const text = 'First paragraph.\n\nSecond paragraph.\n\nThird paragraph.';
      const chunks = splitText(text, 'paragraph', 1000, 0);
      expect(chunks.length).toBe(1); // all fit in one chunk
      expect(chunks[0].content).toContain('First paragraph.');
      expect(chunks[0].content).toContain('Third paragraph.');
    });

    it('creates multiple chunks when paragraphs exceed size', () => {
      const para = 'A'.repeat(500);
      const text = `${para}\n\n${para}\n\n${para}`;
      const chunks = splitText(text, 'paragraph', 600, 0);
      expect(chunks.length).toBeGreaterThan(1);
      for (const chunk of chunks) {
        expect(chunk.chunk_index).toBeGreaterThanOrEqual(0);
        expect(chunk.content.length).toBeGreaterThan(0);
      }
    });

    it('assigns sequential chunk indexes', () => {
      const para = 'X'.repeat(400);
      const text = `${para}\n\n${para}\n\n${para}`;
      const chunks = splitText(text, 'paragraph', 500, 0);
      for (let i = 0; i < chunks.length; i++) {
        expect(chunks[i].chunk_index).toBe(i);
      }
    });
  });

  describe('sentence strategy', () => {
    it('splits on sentence boundaries', () => {
      const text = 'First sentence. Second sentence. Third sentence.';
      const chunks = splitText(text, 'sentence', 30, 0);
      expect(chunks.length).toBeGreaterThan(1);
    });

    it('keeps short text in one chunk', () => {
      const text = 'Hello world.';
      const chunks = splitText(text, 'sentence', 1000, 0);
      expect(chunks.length).toBe(1);
      expect(chunks[0].content).toBe('Hello world.');
    });
  });

  describe('fixed strategy', () => {
    it('splits at character boundaries', () => {
      const text = 'A'.repeat(300);
      const chunks = splitText(text, 'fixed', 100, 0);
      expect(chunks.length).toBe(3);
      expect(chunks[0].content.length).toBe(100);
    });

    it('handles overlap', () => {
      const text = 'A'.repeat(200);
      const chunks = splitText(text, 'fixed', 100, 20);
      expect(chunks.length).toBeGreaterThan(2);
      // NOTE(review): only the chunk count is checked. The input is a single
      // repeated character, so the overlap content itself is not verified.
    });
  });

  describe('overlap', () => {
    it('prepends tail of previous chunk', () => {
      const para1 = 'First para content here.';
      const para2 = 'Second para content here.';
      const text = `${para1}\n\n${para2}`;
      const chunks = splitText(text, 'paragraph', 30, 10);
      // The two paragraphs (24 and 25 chars) cannot share one 30-char chunk.
      // Asserting the count up front keeps the overlap check from being
      // skipped silently when no second chunk is produced.
      expect(chunks.length).toBeGreaterThan(1);
      // Second chunk should record the overlap taken from the first
      expect(chunks[1].metadata).toHaveProperty('overlap_chars');
    });
  });

  describe('metadata', () => {
    it('includes char_count in metadata', () => {
      const text = 'Test content.';
      const chunks = splitText(text, 'paragraph', 1000, 0);
      expect(chunks[0].metadata).toHaveProperty('char_count');
      expect(chunks[0].metadata.char_count).toBe(text.length);
    });
  });
});
18 changes: 18 additions & 0 deletions functions/generate-chunks/jest.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/** @type {import('ts-jest').JestConfigWithTsJest} */
module.exports = {
preset: 'ts-jest',
testEnvironment: 'node',
transform: {
'^.+\\.tsx?$': [
'ts-jest',
{
babelConfig: false,
tsconfig: 'tsconfig.json'
}
]
},
transformIgnorePatterns: [`/node_modules/*`],
testRegex: '(/__tests__/.*|(\\.|/)(test|spec))\\.(jsx?|tsx?)$',
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
modulePathIgnorePatterns: ['dist/*']
};
46 changes: 46 additions & 0 deletions functions/generate-chunks/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"name": "@constructive-io/generate-chunks-fn",
"version": "0.1.0",
"description": "Knative function to split text into chunks, generate per-chunk embeddings, and insert into the chunks table",
"author": "Constructive <developers@constructive.io>",
"homepage": "https://github.com/constructive-io/constructive",
"license": "MIT",
"main": "index.js",
"module": "esm/index.js",
"types": "index.d.ts",
"publishConfig": {
"access": "public",
"directory": "dist"
},
"repository": {
"type": "git",
"url": "https://github.com/constructive-io/constructive"
},
"bugs": {
"url": "https://github.com/constructive-io/constructive/issues"
},
"directories": {
"lib": "src",
"test": "__tests__"
},
"scripts": {
"clean": "makage clean",
"prepack": "npm run build",
"build": "makage build",
"build:dev": "makage build --dev",
"test": "jest"
},
"devDependencies": {
"makage": "^0.3.0",
"ts-jest": "^29.4.11",
"jest": "^30.4.2",
"@types/pg": "^8.11.0"
},
"dependencies": {
"@constructive-io/knative-job-fn": "workspace:^",
"@agentic-kit/ollama": "^1.0.3",
"@pgpmjs/env": "workspace:^",
"@pgpmjs/logger": "workspace:^",
"pg": "^8.21.0"
}
}
Loading
Loading