Buffden · Buffden · Jun 17, 2026 · Jun 17, 2026
diff --git a/.gitignore b/.gitignore
@@ -11,7 +11,11 @@ venv/
 # Generated
 embeddings.json
 
+# Vector database storage
+chroma_db/
+
 # Package build artifacts
 *.egg-info/
 dist/
 build/
+*.dist-info/
diff --git a/README.md b/README.md
@@ -10,10 +10,18 @@ A progressive RAG system built from first principles -- from raw embeddings and
 <tr>
 <td valign="top" width="55%">
 
-1. **Chunks** text documents into overlapping word windows so meaning is preserved at boundaries
-2. **Embeds** each chunk using the OpenAI `text-embedding-3-small` API, producing a 1536-dimensional vector per chunk
-3. **Stores** vectors alongside the original text in a local `embeddings.json` file
-4. **Searches** by embedding a natural language query using the same model, then ranking all chunks by cosine similarity and returning the top-K matches
+Ingestion
+
+1. **Loads** `.txt` files (PDF, DOCX, and Markdown support is planned for Phase 4)
+2. **Chunks** each document into overlapping word windows
+3. **Embeds** each chunk using OpenAI `text-embedding-3-small`, producing a 1536-dimensional vector
+4. **Stores** vectors with metadata (`source`, `chunk_index`) in a persistent Chroma collection
+
+Search
+
+1. **Embeds** the query using the same model
+2. **Queries** Chroma for the top-K nearest vectors using built-in ANN (Approximate Nearest Neighbor) search
+3. **Returns** results with chunk text, source filename, and distance score
 
 </td>
 <td valign="top" width="45%">
@@ -30,8 +38,7 @@ A progressive RAG system built from first principles -- from raw embeddings and
 
 - Python 3.12
 - OpenAI SDK (`text-embedding-3-small`)
-- NumPy (cosine similarity)
-- Plain JSON (storage -- current phase)
+- Chroma (persistent vector database)
 - python-dotenv
 
 ---
@@ -40,19 +47,23 @@ A progressive RAG system built from first principles -- from raw embeddings and
 
 ```text
 rag-document-engine/
-├── documents/              # Sample .txt files to embed
+├── documents/                  # Sample .txt files
 │   ├── ancient-rome.txt
 │   ├── climate-change.txt
 │   ├── music-and-the-brain.txt
 │   ├── nutrition-and-health.txt
 │   └── space-exploration.txt
-├── embed.py                # Load, chunk, embed documents -> embeddings.json
-├── search.py               # Embed query + retrieve top-K chunks by cosine similarity
-├── utils.py                # chunk_text and cosine_similarity helpers
+├── embed.py                    # embed_chunks and embed_query utilities
+├── ingest.py                   # Load, chunk, embed, store in Chroma
+├── search.py                   # Embed query + retrieve top-K from Chroma
+├── inspect_collection.py       # Print collection stats and a sample entry
+├── utils.py                    # chunk_text, load_document, load_documents
+├── chroma_db/                  # Chroma persistent storage (not committed)
+├── diagrams/                   # Pipeline diagrams
 ├── docs/
 │   └── implementation-plan.md  # Phase-by-phase build plan
 ├── pyproject.toml
-└── .env                    # API keys (not committed)
+└── .env                        # API keys (not committed)
 ```
 
 ---
@@ -77,11 +88,14 @@ EMBEDDING_MODEL=text-embedding-3-small
 ## Usage
 
 ```bash
-# Step 1 -- Embed all documents (generates embeddings.json)
-python3 embed.py
+# Step 1 -- Ingest documents into Chroma
+python3 ingest.py
 
 # Step 2 -- Search
 python3 search.py
+
+# Inspect the collection
+python3 inspect_collection.py
 ```
 
 The query is set in `search.py` main. Change it to anything you want to search for.
@@ -93,30 +107,31 @@ The query is set in `search.py` main. Change it to anything you want to search f
 Query: `"what foods are good for the heart"`
 
 ```text
-Result 1 (score: 0.3571)
-Nutrition is the science of how food affects the body. The food we eat provides energy and
-the raw materials needed to build and repair tissues... Unsaturated fats found in olive oil,
+Result 1 (distance: 1.2862) -- nutrition-and-health.txt [chunk 0]
+Nutrition is the science of how food affects the body... Unsaturated fats found in olive oil,
 nuts, avocados, and fatty fish are associated with reduced risk of heart disease...
 
-Result 2 (score: 0.3143)
+Result 2 (distance: 1.3720) -- nutrition-and-health.txt [chunk 1]
 The Mediterranean diet -- rich in vegetables, fruit, whole grains, fish, and olive oil -- is
 consistently associated with lower rates of heart disease, diabetes, and cognitive decline...
 
-Result 3 (score: 0.1786)
+Result 3 (distance: 1.6426) -- music-and-the-brain.txt [chunk 1]
 Music also affects mood and stress. Slow, quiet music activates the parasympathetic nervous
 system, lowering heart rate and cortisol levels...
 ```
 
 The top two results come from the nutrition document. Result 3 surfaces from the music document because it mentions "heart rate" -- semantic search catches conceptual overlap, not just keyword matches.
 
+Note: distance (squared L2, Chroma's default metric) is an inverse of similarity -- lower means more relevant, unlike the Phase 1 cosine score where higher was better.
+
 ---
 
 ## Progress
 
 | Phase | Title | Status |
 | ----: | ----- | ------ |
 | 1 | Semantic Foundation | Complete |
-| 2 | Vector Store | In Progress |
+| 2 | Vector Store | Complete |
 | 3 | RAG Pipeline | Planned |
 | 4 | Document Ingestion | Planned |
 | 5 | Retrieval Quality | Planned |
@@ -133,10 +148,11 @@ See [docs/implementation-plan.md](./docs/implementation-plan.md) for full phase
 - **Cosine similarity** -- measures the angle between vectors; direction encodes meaning, magnitude does not
 - **Chunking** -- splits documents into overlapping windows so meaning is not diluted or cut at boundaries
 - **Model consistency** -- the same embedding model must be used for both documents and queries
+- **Vector database** -- stores embeddings with metadata and retrieves them by similarity using ANN search
 - **RAG** -- Retrieval-Augmented Generation: retrieve relevant context, then generate a grounded answer
 
 ---
 
 ## Diagrams
 
-The pipeline diagram is maintained as a PlantUML source file (`pipeline.puml`) and auto-exported to SVG on every push to main using [diagram-sync](https://www.npmjs.com/package/diagram-sync).
+Pipeline diagrams are maintained as PlantUML source files in `diagrams/` and auto-exported to SVG on every push to main using [diagram-sync](https://www.npmjs.com/package/diagram-sync).
diff --git a/diagrams/pipeline.puml b/diagrams/pipeline.puml
@@ -0,0 +1,60 @@
+@startuml pipeline-phase2
+
+skinparam backgroundColor #FFFFFF
+skinparam defaultFontName Arial
+skinparam defaultFontSize 13
+skinparam ArrowColor #555555
+skinparam componentStyle rectangle
+
+skinparam component {
+  BackgroundColor #EEF3FB
+  BorderColor #5577AA
+  FontColor #222222
+}
+
+skinparam database {
+  BackgroundColor #FFF8E7
+  BorderColor #CC9900
+}
+
+skinparam cloud {
+  BackgroundColor #F0FFF0
+  BorderColor #448844
+}
+
+title Phase 2 -- Chroma Vector Store Pipeline
+
+package "Ingestion (ingest.py)" {
+  [.txt files] as docs
+  [load_documents()] as load
+  [chunk_text()] as chunk
+  [embed_chunks()] as embed
+}
+
+cloud "OpenAI API\ntext-embedding-3-small" as openai
+
+database "Chroma DB\n(persistent)" as chroma
+
+package "Search (search.py)" {
+  [Query] as query
+  [embed_query()] as embedq
+  [collection.query()] as cquery
+  [Top-K Results\n(text + source + distance)] as results
+}
+
+docs --> load
+load --> chunk
+chunk --> embed
+embed --> openai : API call
+openai --> embed : 1536-dim vectors
+embed --> chroma : upsert with\nids + metadata
+
+query --> embedq
+embedq --> openai : API call
+openai --> embedq : query vector
+embedq --> cquery
+cquery --> chroma : ANN search
+chroma --> cquery : nearest vectors
+cquery --> results
+
+@enduml
diff --git a/diagrams/pipeline.svg b/diagrams/pipeline.svg
diff --git a/docs/implementation-plan.md b/docs/implementation-plan.md
@@ -58,7 +58,7 @@ The chunking and embedding logic from Phase 1 carries over unchanged. The only t
 
 - `chromadb` -- local persistent vector database
 
-**Status:** In Progress
+**Status:** Complete
 
 **Vector DB tradeoffs to understand:**
 

diff --git a/embed.py b/embed.py
@@ -1,7 +1,5 @@
 from openai import OpenAI
 from dotenv import load_dotenv
-from utils import chunk_text
-import json
 import os
 
 load_dotenv()
@@ -14,32 +12,8 @@ def embed_chunks(chunks: list[str]) -> list[dict]:
         model=os.getenv("EMBEDDING_MODEL"),
         input=chunks
     )
-
     return [{"text": chunk, "embedding": response.data[i].embedding} for i, chunk in enumerate(chunks)]
 
+
 def embed_query(query: str) -> list[float]:
     return embed_chunks([query])[0]["embedding"]
-
-def load_document(filepath: str) -> str:
-    try:
-        with open(filepath, 'r') as f:
-            return f.read()
-    except FileNotFoundError as e:
-        raise FileNotFoundError(f"Document not found: {filepath}: {e}")
-
-def response_write(filepath: str, embeddings: list[dict]) -> bool:
-    try:
-        with open(filepath, 'w') as f:
-            json.dump(embeddings, f)
-    except IOError as e:
-        raise IOError(f"Failed to wrte embeddings: {e}")
-
-def main():
-    file_names = os.listdir("documents/")
-    json_response = []
-    for file_name in filter(lambda f: f.endswith('.txt'), file_names):
-        json_response.extend(embed_chunks(chunk_text(load_document(f"documents/{file_name}"))))
-    response_write("embeddings.json", json_response)
-
-if __name__ == '__main__':
-    main()
diff --git a/ingest.py b/ingest.py
@@ -0,0 +1,31 @@
+from utils import chunk_text, load_documents
+from embed import embed_chunks
+import chromadb
+
+client = chromadb.PersistentClient(path="./chroma_db")
+
+
+def main():
+    collection = client.get_or_create_collection(name="documents")
+
+    for doc in load_documents("documents/"):
+        chunks = chunk_text(doc["text"])
+        embedded = embed_chunks(chunks)
+
+        # Delete existing chunks for this source before re-ingesting
+        if collection.count() > 0:
+            collection.delete(where={"source": doc["filename"]})
+
+        collection.upsert(
+            ids = [f"{doc['filename']}_{i}" for i in range(len(chunks))],
+            embeddings = [e["embedding"] for e in embedded],
+            documents = chunks,
+            metadatas = [{"source": doc["filename"], "chunk_index": i} for i in range(len(chunks))]
+        )
+
+        print(f"Ingested {len(chunks)} chunks from {doc['filename']}")
+
+    print(f"\nTotal vectors in collection: {collection.count()}")
+
+if __name__ == '__main__':
+    main()