diff --git a/.gitignore b/.gitignore
index e41eca8..1f7f108 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,11 @@ venv/
# Generated
embeddings.json
+# Vector database storage
+chroma_db/
+
# Package build artifacts
*.egg-info/
dist/
build/
+*.dist-info/
diff --git a/README.md b/README.md
index 36fcda1..d86deff 100644
--- a/README.md
+++ b/README.md
@@ -10,10 +10,18 @@ A progressive RAG system built from first principles -- from raw embeddings and
-1. **Chunks** text documents into overlapping word windows so meaning is preserved at boundaries
-2. **Embeds** each chunk using the OpenAI `text-embedding-3-small` API, producing a 1536-dimensional vector per chunk
-3. **Stores** vectors alongside the original text in a local `embeddings.json` file
-4. **Searches** by embedding a natural language query using the same model, then ranking all chunks by cosine similarity and returning the top-K matches
+Ingestion
+
+1. **Loads** `.txt` files (PDF, DOCX, and Markdown support is planned for Phase 4)
+2. **Chunks** each document into overlapping word windows
+3. **Embeds** each chunk using OpenAI `text-embedding-3-small`, producing a 1536-dimensional vector
+4. **Stores** vectors with metadata (`source`, `chunk_index`) in a persistent Chroma collection
+
+Search
+
+1. **Embeds** the query using the same model
+2. **Queries** Chroma for the top-K nearest vectors using built-in ANN (Approximate Nearest Neighbor) search
+3. **Returns** results with chunk text, source filename, and distance score
@@ -30,8 +38,7 @@ A progressive RAG system built from first principles -- from raw embeddings and
- Python 3.12
- OpenAI SDK (`text-embedding-3-small`)
-- NumPy (cosine similarity)
-- Plain JSON (storage -- current phase)
+- Chroma (persistent vector database)
- python-dotenv
---
@@ -40,19 +47,23 @@ A progressive RAG system built from first principles -- from raw embeddings and
```text
rag-document-engine/
-├── documents/ # Sample .txt files to embed
+├── documents/ # Sample .txt files
│ ├── ancient-rome.txt
│ ├── climate-change.txt
│ ├── music-and-the-brain.txt
│ ├── nutrition-and-health.txt
│ └── space-exploration.txt
-├── embed.py # Load, chunk, embed documents -> embeddings.json
-├── search.py # Embed query + retrieve top-K chunks by cosine similarity
-├── utils.py # chunk_text and cosine_similarity helpers
+├── embed.py # embed_chunks and embed_query utilities
+├── ingest.py # Load, chunk, embed, store in Chroma
+├── search.py # Embed query + retrieve top-K from Chroma
+├── inspect_collection.py # Print collection stats and a sample entry
+├── utils.py # chunk_text, load_document, load_documents
+├── chroma_db/ # Chroma persistent storage (not committed)
+├── diagrams/ # Pipeline diagrams
├── docs/
│ └── implementation-plan.md # Phase-by-phase build plan
├── pyproject.toml
-└── .env # API keys (not committed)
+└── .env # API keys (not committed)
```
---
@@ -77,11 +88,14 @@ EMBEDDING_MODEL=text-embedding-3-small
## Usage
```bash
-# Step 1 -- Embed all documents (generates embeddings.json)
-python3 embed.py
+# Step 1 -- Ingest documents into Chroma
+python3 ingest.py
# Step 2 -- Search
python3 search.py
+
+# Inspect the collection
+python3 inspect_collection.py
```
The query is set in `search.py` main. Change it to anything you want to search for.
@@ -93,22 +107,23 @@ The query is set in `search.py` main. Change it to anything you want to search f
Query: `"what foods are good for the heart"`
```text
-Result 1 (score: 0.3571)
-Nutrition is the science of how food affects the body. The food we eat provides energy and
-the raw materials needed to build and repair tissues... Unsaturated fats found in olive oil,
+Result 1 (distance: 1.2862) -- nutrition-and-health.txt [chunk 0]
+Nutrition is the science of how food affects the body... Unsaturated fats found in olive oil,
nuts, avocados, and fatty fish are associated with reduced risk of heart disease...
-Result 2 (score: 0.3143)
+Result 2 (distance: 1.3720) -- nutrition-and-health.txt [chunk 1]
The Mediterranean diet -- rich in vegetables, fruit, whole grains, fish, and olive oil -- is
consistently associated with lower rates of heart disease, diabetes, and cognitive decline...
-Result 3 (score: 0.1786)
+Result 3 (distance: 1.6426) -- music-and-the-brain.txt [chunk 1]
Music also affects mood and stress. Slow, quiet music activates the parasympathetic nervous
system, lowering heart rate and cortisol levels...
```
The top two results come from the nutrition document. Result 3 surfaces from the music document because it mentions "heart rate" -- semantic search catches conceptual overlap, not just keyword matches.
+Note: distance is not a similarity score -- it measures how far apart the query and chunk vectors are (Chroma's default squared L2 distance), so lower means more relevant.
+
---
## Progress
@@ -116,7 +131,7 @@ The top two results come from the nutrition document. Result 3 surfaces from the
| Phase | Title | Status |
| ----: | ----- | ------ |
| 1 | Semantic Foundation | Complete |
-| 2 | Vector Store | In Progress |
+| 2 | Vector Store | Complete |
| 3 | RAG Pipeline | Planned |
| 4 | Document Ingestion | Planned |
| 5 | Retrieval Quality | Planned |
@@ -133,10 +148,11 @@ See [docs/implementation-plan.md](./docs/implementation-plan.md) for full phase
- **Cosine similarity** -- measures the angle between vectors; direction encodes meaning, magnitude does not
- **Chunking** -- splits documents into overlapping windows so meaning is not diluted or cut at boundaries
- **Model consistency** -- the same embedding model must be used for both documents and queries
+- **Vector database** -- stores embeddings with metadata and retrieves them by similarity using ANN search
- **RAG** -- Retrieval-Augmented Generation: retrieve relevant context, then generate a grounded answer
---
## Diagrams
-The pipeline diagram is maintained as a PlantUML source file (`pipeline.puml`) and auto-exported to SVG on every push to main using [diagram-sync](https://www.npmjs.com/package/diagram-sync).
+Pipeline diagrams are maintained as PlantUML source files in `diagrams/` and auto-exported to SVG on every push to main using [diagram-sync](https://www.npmjs.com/package/diagram-sync).
diff --git a/diagrams/pipeline.puml b/diagrams/pipeline.puml
new file mode 100644
index 0000000..98d320e
--- /dev/null
+++ b/diagrams/pipeline.puml
@@ -0,0 +1,60 @@
+@startuml pipeline-phase2
+
+skinparam backgroundColor #FFFFFF
+skinparam defaultFontName Arial
+skinparam defaultFontSize 13
+skinparam ArrowColor #555555
+skinparam componentStyle rectangle
+
+skinparam component {
+ BackgroundColor #EEF3FB
+ BorderColor #5577AA
+ FontColor #222222
+}
+
+skinparam database {
+ BackgroundColor #FFF8E7
+ BorderColor #CC9900
+}
+
+skinparam cloud {
+ BackgroundColor #F0FFF0
+ BorderColor #448844
+}
+
+title Phase 2 -- Chroma Vector Store Pipeline
+
+package "Ingestion (ingest.py)" {
+ [.txt files] as docs
+ [load_documents()] as load
+ [chunk_text()] as chunk
+ [embed_chunks()] as embed
+}
+
+cloud "OpenAI API\ntext-embedding-3-small" as openai
+
+database "Chroma DB\n(persistent)" as chroma
+
+package "Search (search.py)" {
+ [Query] as query
+ [embed_query()] as embedq
+ [collection.query()] as cquery
+ [Top-K Results\n(text + source + distance)] as results
+}
+
+docs --> load
+load --> chunk
+chunk --> embed
+embed --> openai : API call
+openai --> embed : 1536-dim vectors
+embed --> chroma : upsert with\nids + metadata
+
+query --> embedq
+embedq --> openai : API call
+openai --> embedq : query vector
+embedq --> cquery
+cquery --> chroma : ANN search
+chroma --> cquery : nearest vectors
+cquery --> results
+
+@enduml
diff --git a/diagrams/pipeline.svg b/diagrams/pipeline.svg
deleted file mode 100644
index aacf16b..0000000
--- a/diagrams/pipeline.svg
+++ /dev/null
@@ -1,80 +0,0 @@
-
\ No newline at end of file
diff --git a/docs/implementation-plan.md b/docs/implementation-plan.md
index 974b663..d96f3ca 100644
--- a/docs/implementation-plan.md
+++ b/docs/implementation-plan.md
@@ -58,7 +58,7 @@ The chunking and embedding logic from Phase 1 carries over unchanged. The only t
- `chromadb` -- local persistent vector database
-**Status:** In Progress
+**Status:** Complete
**Vector DB tradeoffs to understand:**
diff --git a/embed.py b/embed.py
index e6cd6c7..47ac3f6 100644
--- a/embed.py
+++ b/embed.py
@@ -1,7 +1,5 @@
from openai import OpenAI
from dotenv import load_dotenv
-from utils import chunk_text
-import json
import os
load_dotenv()
@@ -14,32 +12,8 @@ def embed_chunks(chunks: list[str]) -> list[dict]:
model=os.getenv("EMBEDDING_MODEL"),
input=chunks
)
-
return [{"text": chunk, "embedding": response.data[i].embedding} for i, chunk in enumerate(chunks)]
+
def embed_query(query: str) -> list[float]:
return embed_chunks([query])[0]["embedding"]
-
-def load_document(filepath: str) -> str:
- try:
- with open(filepath, 'r') as f:
- return f.read()
- except FileNotFoundError as e:
- raise FileNotFoundError(f"Document not found: {filepath}: {e}")
-
-def response_write(filepath: str, embeddings: list[dict]) -> bool:
- try:
- with open(filepath, 'w') as f:
- json.dump(embeddings, f)
- except IOError as e:
- raise IOError(f"Failed to wrte embeddings: {e}")
-
-def main():
- file_names = os.listdir("documents/")
- json_response = []
- for file_name in filter(lambda f: f.endswith('.txt'), file_names):
- json_response.extend(embed_chunks(chunk_text(load_document(f"documents/{file_name}"))))
- response_write("embeddings.json", json_response)
-
-if __name__ == '__main__':
- main()
\ No newline at end of file
diff --git a/ingest.py b/ingest.py
new file mode 100644
index 0000000..1952dc0
--- /dev/null
+++ b/ingest.py
@@ -0,0 +1,31 @@
+from utils import chunk_text, load_documents
+from embed import embed_chunks
+import chromadb
+
+client = chromadb.PersistentClient(path="./chroma_db")
+
+
+def main():
+    """Ingest every document under ``documents/`` into the "documents" Chroma collection.
+
+    Each document is chunked, embedded, and stored under ids of the form
+    ``<filename>_<chunk_index>`` with ``source`` and ``chunk_index`` metadata.
+    Chunks already stored for the same source are deleted before the new
+    ones are written, so re-running the script replaces a document's vectors
+    rather than leaving higher-numbered chunks behind when it shrinks.
+    """
+    collection = client.get_or_create_collection(name="documents")
+
+    for doc in load_documents("documents/"):
+        source = doc["filename"]
+        chunks = chunk_text(doc["text"])
+
+        # Embed before deleting so a failed embedding call leaves the vectors
+        # already stored for this source untouched. Skip the call when there
+        # is nothing to embed (an empty input list is presumably rejected by
+        # the embeddings API -- TODO confirm).
+        embedded = embed_chunks(chunks) if chunks else []
+
+        # Delete existing chunks for this source before re-ingesting
+        if collection.count() > 0:
+            collection.delete(where={"source": source})
+
+        # Nothing to store for an empty document; upserting empty lists is
+        # expected to fail, so report and move on.
+        if not chunks:
+            print(f"Skipped {source}: no chunks to ingest")
+            continue
+
+        collection.upsert(
+            ids=[f"{source}_{i}" for i in range(len(chunks))],
+            embeddings=[e["embedding"] for e in embedded],
+            documents=chunks,
+            metadatas=[{"source": source, "chunk_index": i} for i in range(len(chunks))],
+        )
+
+        print(f"Ingested {len(chunks)} chunks from {source}")
+
+    print(f"\nTotal vectors in collection: {collection.count()}")
+
+if __name__ == '__main__':
+ main()
diff --git a/inspect_collection.py b/inspect_collection.py
new file mode 100644
index 0000000..70c4f36
--- /dev/null
+++ b/inspect_collection.py
@@ -0,0 +1,25 @@
+import chromadb
+
+client = chromadb.PersistentClient(path="./chroma_db")
+
+
+def main():
+    """Print the size of the "documents" collection and one sample entry.
+
+    The sample shows the stored id, metadata (``source``, ``chunk_index``),
+    the first 120 characters of the chunk text, and the first two embedding
+    values together with the embedding's dimensionality.
+    """
+    collection = client.get_or_create_collection(name="documents")
+
+    print(f"Total vectors in collection: {collection.count()}\n")
+
+    sample = collection.get(
+        limit=1,
+        include=["documents", "metadatas", "embeddings"],
+    )
+
+    # An empty collection yields empty result lists; indexing [0] below would
+    # raise IndexError, so stop here with a hint instead.
+    if not sample["ids"]:
+        print("Collection is empty -- run ingest.py first.")
+        return
+
+    print("Sample entry:")
+    print(f" id: {sample['ids'][0]}")
+    print(f" source: {sample['metadatas'][0]['source']}")
+    print(f" chunk_index: {sample['metadatas'][0]['chunk_index']}")
+    print(f" text: {sample['documents'][0][:120]}...")
+    print(f" embedding: [{sample['embeddings'][0][0]:.6f}, {sample['embeddings'][0][1]:.6f}, ...] ({len(sample['embeddings'][0])} dims)")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/pyproject.toml b/pyproject.toml
index 2481e7a..0b9e40b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,6 +6,7 @@ dependencies = [
"openai>=1.0.0",
"python-dotenv>=1.0.0",
"numpy>=1.26.0",
+ "chromadb>=1.5.9"
]
[tool.setuptools]
diff --git a/search.py b/search.py
index 674a4ca..8e94815 100644
--- a/search.py
+++ b/search.py
@@ -1,31 +1,40 @@
-import json
-from utils import chunk_text, cosine_similarity
-from embed import embed_chunks, embed_query
+from embed import embed_query
+import chromadb
+
+client = chromadb.PersistentClient(path="./chroma_db")
-def load_embedded_response() -> list[dict]:
- try:
- with open('embeddings.json', 'r') as f:
- return json.load(f)
- except FileNotFoundError as e:
- raise FileNotFoundError(f"File not found")
def search(query: str, top_k: int = 3) -> list[dict]:
- embedded_response = load_embedded_response()
+ collection = client.get_or_create_collection(name="documents")
query_vector = embed_query(query)
- scored = [
- {"text": chunk["text"], "score": cosine_similarity(query_vector, chunk["embedding"])}
- for chunk in embedded_response
+ results = collection.query(
+ query_embeddings = [query_vector],
+ n_results = top_k,
+ include = ["documents", "metadatas", "distances"]
+ )
+
+ return [
+ {
+ "text": results["documents"][0][i],
+ "source": results["metadatas"][0][i]["source"],
+ "chunk_index": results["metadatas"][0][i]["chunk_index"],
+ "distance": results["distances"][0][i]
+ }
+ for i in range(len(results["documents"][0]))
]
- return sorted(scored, key = lambda x: x["score"], reverse = True)[:top_k]
def main():
- results = search("what foods are good for the heart")
+ query = "what foods are good for the heart"
+ results = search(query)
+
+ print(f"Query: \"{query}\"\n")
for i, result in enumerate(results):
- print(f"Result {i + 1} (score: {result['score']:.4f})")
+ print(f"Result {i + 1} (distance: {result['distance']:.4f}) — {result['source']} [chunk {result['chunk_index']}]")
print(result["text"])
print()
+
if __name__ == '__main__':
main()
diff --git a/utils.py b/utils.py
index bc082b2..a04b3d4 100644
--- a/utils.py
+++ b/utils.py
@@ -1,5 +1,22 @@
import numpy as np
+
+def load_document(filepath: str) -> str:
+    """Return the full text of the file at *filepath*.
+
+    The file is decoded as UTF-8 explicitly so the result does not depend on
+    the platform's default encoding.
+
+    Raises:
+        FileNotFoundError: if *filepath* does not exist; the message names the
+            path and the original error is attached as the cause.
+    """
+    try:
+        with open(filepath, 'r', encoding='utf-8') as f:
+            return f.read()
+    except FileNotFoundError as e:
+        # Chain the original error so the traceback keeps the OS-level detail.
+        raise FileNotFoundError(f"Document not found: {filepath}: {e}") from e
+
+
+def load_documents(directory: str) -> list[dict]:
+    """Load every ``.txt`` file directly inside *directory*.
+
+    Returns a list of ``{"filename": ..., "text": ...}`` dicts, one per file,
+    ordered by filename so repeated runs process documents in the same order
+    (``os.listdir`` itself returns entries in arbitrary order).
+    """
+    import os
+    filenames = sorted(f for f in os.listdir(directory) if f.endswith('.txt'))
+    return [
+        {"filename": filename, "text": load_document(os.path.join(directory, filename))}
+        for filename in filenames
+    ]
+
def chunk_text(text: str, chunk_size: int = 300, overlap: int = 50) -> list[str]:
words = text.split()
step = chunk_size - overlap