diff --git a/.gitignore b/.gitignore index e41eca8..1f7f108 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,11 @@ venv/ # Generated embeddings.json +# Vector database storage +chroma_db/ + # Package build artifacts *.egg-info/ dist/ build/ +*.dist-info/ diff --git a/README.md b/README.md index 36fcda1..d86deff 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,18 @@ A progressive RAG system built from first principles -- from raw embeddings and -1. **Chunks** text documents into overlapping word windows so meaning is preserved at boundaries -2. **Embeds** each chunk using the OpenAI `text-embedding-3-small` API, producing a 1536-dimensional vector per chunk -3. **Stores** vectors alongside the original text in a local `embeddings.json` file -4. **Searches** by embedding a natural language query using the same model, then ranking all chunks by cosine similarity and returning the top-K matches +Ingestion + +1. **Loads** `.txt` files (PDF, DOCX, Markdown from Phase 4) +2. **Chunks** each document into overlapping word windows +3. **Embeds** each chunk using OpenAI `text-embedding-3-small`, producing a 1536-dimensional vector +4. **Stores** vectors with metadata (`source`, `chunk_index`) in a persistent Chroma collection + +Search + +1. **Embeds** the query using the same model +2. **Queries** Chroma for the top-K nearest vectors using built-in ANN (Approximate Nearest Neighbor) search +3. 
**Returns** results with chunk text, source filename, and distance score @@ -30,8 +38,7 @@ A progressive RAG system built from first principles -- from raw embeddings and - Python 3.12 - OpenAI SDK (`text-embedding-3-small`) -- NumPy (cosine similarity) -- Plain JSON (storage -- current phase) +- Chroma (persistent vector database) - python-dotenv --- @@ -40,19 +47,23 @@ A progressive RAG system built from first principles -- from raw embeddings and ```text rag-document-engine/ -├── documents/ # Sample .txt files to embed +├── documents/ # Sample .txt files │ ├── ancient-rome.txt │ ├── climate-change.txt │ ├── music-and-the-brain.txt │ ├── nutrition-and-health.txt │ └── space-exploration.txt -├── embed.py # Load, chunk, embed documents -> embeddings.json -├── search.py # Embed query + retrieve top-K chunks by cosine similarity -├── utils.py # chunk_text and cosine_similarity helpers +├── embed.py # embed_chunks and embed_query utilities +├── ingest.py # Load, chunk, embed, store in Chroma +├── search.py # Embed query + retrieve top-K from Chroma +├── inspect_collection.py # Print collection stats and a sample entry +├── utils.py # chunk_text, load_document, load_documents +├── chroma_db/ # Chroma persistent storage (not committed) +├── diagrams/ # Pipeline diagrams ├── docs/ │ └── implementation-plan.md # Phase-by-phase build plan ├── pyproject.toml -└── .env # API keys (not committed) +└── .env # API keys (not committed) ``` --- @@ -77,11 +88,14 @@ EMBEDDING_MODEL=text-embedding-3-small ## Usage ```bash -# Step 1 -- Embed all documents (generates embeddings.json) -python3 embed.py +# Step 1 -- Ingest documents into Chroma +python3 ingest.py # Step 2 -- Search python3 search.py + +# Inspect the collection +python3 inspect_collection.py ``` The query is set in `search.py` main. Change it to anything you want to search for. @@ -93,22 +107,23 @@ The query is set in `search.py` main. 
Change it to anything you want to search f Query: `"what foods are good for the heart"` ```text -Result 1 (score: 0.3571) -Nutrition is the science of how food affects the body. The food we eat provides energy and -the raw materials needed to build and repair tissues... Unsaturated fats found in olive oil, +Result 1 (distance: 1.2862) -- nutrition-and-health.txt [chunk 0] +Nutrition is the science of how food affects the body... Unsaturated fats found in olive oil, nuts, avocados, and fatty fish are associated with reduced risk of heart disease... -Result 2 (score: 0.3143) +Result 2 (distance: 1.3720) -- nutrition-and-health.txt [chunk 1] The Mediterranean diet -- rich in vegetables, fruit, whole grains, fish, and olive oil -- is consistently associated with lower rates of heart disease, diabetes, and cognitive decline... -Result 3 (score: 0.1786) +Result 3 (distance: 1.6426) -- music-and-the-brain.txt [chunk 1] Music also affects mood and stress. Slow, quiet music activates the parasympathetic nervous system, lowering heart rate and cortisol levels... ``` The top two results come from the nutrition document. Result 3 surfaces from the music document because it mentions "heart rate" -- semantic search catches conceptual overlap, not just keyword matches. +Note: Chroma reports a distance rather than a similarity score -- a lower distance means a closer, more relevant match. + --- ## Progress @@ -116,7 +131,7 @@ The top two results come from the nutrition document. 
Result 3 surfaces from the | Phase | Title | Status | | ----: | ----- | ------ | | 1 | Semantic Foundation | Complete | -| 2 | Vector Store | In Progress | +| 2 | Vector Store | Complete | | 3 | RAG Pipeline | Planned | | 4 | Document Ingestion | Planned | | 5 | Retrieval Quality | Planned | @@ -133,10 +148,11 @@ See [docs/implementation-plan.md](./docs/implementation-plan.md) for full phase - **Cosine similarity** -- measures the angle between vectors; direction encodes meaning, magnitude does not - **Chunking** -- splits documents into overlapping windows so meaning is not diluted or cut at boundaries - **Model consistency** -- the same embedding model must be used for both documents and queries +- **Vector database** -- stores embeddings with metadata and retrieves them by similarity using ANN search - **RAG** -- Retrieval-Augmented Generation: retrieve relevant context, then generate a grounded answer --- ## Diagrams -The pipeline diagram is maintained as a PlantUML source file (`pipeline.puml`) and auto-exported to SVG on every push to main using [diagram-sync](https://www.npmjs.com/package/diagram-sync). +Pipeline diagrams are maintained as PlantUML source files in `diagrams/` and auto-exported to SVG on every push to main using [diagram-sync](https://www.npmjs.com/package/diagram-sync). 
diff --git a/diagrams/pipeline.puml b/diagrams/pipeline.puml new file mode 100644 index 0000000..98d320e --- /dev/null +++ b/diagrams/pipeline.puml @@ -0,0 +1,60 @@ +@startuml pipeline-phase2 + +skinparam backgroundColor #FFFFFF +skinparam defaultFontName Arial +skinparam defaultFontSize 13 +skinparam ArrowColor #555555 +skinparam componentStyle rectangle + +skinparam component { + BackgroundColor #EEF3FB + BorderColor #5577AA + FontColor #222222 +} + +skinparam database { + BackgroundColor #FFF8E7 + BorderColor #CC9900 +} + +skinparam cloud { + BackgroundColor #F0FFF0 + BorderColor #448844 +} + +title Phase 2 -- Chroma Vector Store Pipeline + +package "Ingestion (ingest.py)" { + [.txt files] as docs + [load_documents()] as load + [chunk_text()] as chunk + [embed_chunks()] as embed +} + +cloud "OpenAI API\ntext-embedding-3-small" as openai + +database "Chroma DB\n(persistent)" as chroma + +package "Search (search.py)" { + [Query] as query + [embed_query()] as embedq + [collection.query()] as cquery + [Top-K Results\n(text + source + distance)] as results +} + +docs --> load +load --> chunk +chunk --> embed +embed --> openai : API call +openai --> embed : 1536-dim vectors +embed --> chroma : upsert with\nids + metadata + +query --> embedq +embedq --> openai : API call +openai --> embedq : query vector +embedq --> cquery +cquery --> chroma : ANN search +chroma --> cquery : nearest vectors +cquery --> results + +@enduml diff --git a/diagrams/pipeline.svg b/diagrams/pipeline.svg deleted file mode 100644 index aacf16b..0000000 --- a/diagrams/pipeline.svg +++ /dev/null @@ -1,80 +0,0 @@ -Indexing — embed.pySearch — search.pydocuments/chunk_text300 words · 50 overlapembed_chunksOpenAI APIembeddings.jsonQueryembed_queryOpenAI APIcosine_similarityvs all chunksTop-K Results \ No newline at end of file diff --git a/docs/implementation-plan.md b/docs/implementation-plan.md index 974b663..d96f3ca 100644 --- a/docs/implementation-plan.md +++ b/docs/implementation-plan.md @@ -58,7 
+58,7 @@ The chunking and embedding logic from Phase 1 carries over unchanged. The only t - `chromadb` -- local persistent vector database -**Status:** In Progress +**Status:** Complete **Vector DB tradeoffs to understand:** diff --git a/embed.py b/embed.py index e6cd6c7..47ac3f6 100644 --- a/embed.py +++ b/embed.py @@ -1,7 +1,5 @@ from openai import OpenAI from dotenv import load_dotenv -from utils import chunk_text -import json import os load_dotenv() @@ -14,32 +12,8 @@ def embed_chunks(chunks: list[str]) -> list[dict]: model=os.getenv("EMBEDDING_MODEL"), input=chunks ) - return [{"text": chunk, "embedding": response.data[i].embedding} for i, chunk in enumerate(chunks)] + def embed_query(query: str) -> list[float]: return embed_chunks([query])[0]["embedding"] - -def load_document(filepath: str) -> str: - try: - with open(filepath, 'r') as f: - return f.read() - except FileNotFoundError as e: - raise FileNotFoundError(f"Document not found: {filepath}: {e}") - -def response_write(filepath: str, embeddings: list[dict]) -> bool: - try: - with open(filepath, 'w') as f: - json.dump(embeddings, f) - except IOError as e: - raise IOError(f"Failed to wrte embeddings: {e}") - -def main(): - file_names = os.listdir("documents/") - json_response = [] - for file_name in filter(lambda f: f.endswith('.txt'), file_names): - json_response.extend(embed_chunks(chunk_text(load_document(f"documents/{file_name}")))) - response_write("embeddings.json", json_response) - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/ingest.py b/ingest.py new file mode 100644 index 0000000..1952dc0 --- /dev/null +++ b/ingest.py @@ -0,0 +1,31 @@ +from utils import chunk_text, load_documents +from embed import embed_chunks +import chromadb + +client = chromadb.PersistentClient(path="./chroma_db") + + +def main(): + collection = client.get_or_create_collection(name="documents") + + for doc in load_documents("documents/"): + chunks = chunk_text(doc["text"]) + embedded = 
embed_chunks(chunks) + + # Delete existing chunks for this source before re-ingesting + if collection.count() > 0: + collection.delete(where={"source": doc["filename"]}) + + collection.upsert( + ids = [f"{doc['filename']}_{i}" for i in range(len(chunks))], + embeddings = [e["embedding"] for e in embedded], + documents = chunks, + metadatas = [{"source": doc["filename"], "chunk_index": i} for i in range(len(chunks))] + ) + + print(f"Ingested {len(chunks)} chunks from {doc['filename']}") + + print(f"\nTotal vectors in collection: {collection.count()}") + +if __name__ == '__main__': + main() diff --git a/inspect_collection.py b/inspect_collection.py new file mode 100644 index 0000000..70c4f36 --- /dev/null +++ b/inspect_collection.py @@ -0,0 +1,25 @@ +import chromadb + +client = chromadb.PersistentClient(path="./chroma_db") + + +def main(): + collection = client.get_or_create_collection(name="documents") + + print(f"Total vectors in collection: {collection.count()}\n") + + sample = collection.get( + limit = 1, + include = ["documents", "metadatas", "embeddings"] + ) + + print("Sample entry:") + print(f" id: {sample['ids'][0]}") + print(f" source: {sample['metadatas'][0]['source']}") + print(f" chunk_index: {sample['metadatas'][0]['chunk_index']}") + print(f" text: {sample['documents'][0][:120]}...") + print(f" embedding: [{sample['embeddings'][0][0]:.6f}, {sample['embeddings'][0][1]:.6f}, ...] 
({len(sample['embeddings'][0])} dims)") + + +if __name__ == '__main__': + main() diff --git a/pyproject.toml b/pyproject.toml index 2481e7a..0b9e40b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ dependencies = [ "openai>=1.0.0", "python-dotenv>=1.0.0", "numpy>=1.26.0", + "chromadb>=1.5.9" ] [tool.setuptools] diff --git a/search.py b/search.py index 674a4ca..8e94815 100644 --- a/search.py +++ b/search.py @@ -1,31 +1,40 @@ -import json -from utils import chunk_text, cosine_similarity -from embed import embed_chunks, embed_query +from embed import embed_query +import chromadb + +client = chromadb.PersistentClient(path="./chroma_db") -def load_embedded_response() -> list[dict]: - try: - with open('embeddings.json', 'r') as f: - return json.load(f) - except FileNotFoundError as e: - raise FileNotFoundError(f"File not found") def search(query: str, top_k: int = 3) -> list[dict]: - embedded_response = load_embedded_response() + collection = client.get_or_create_collection(name="documents") query_vector = embed_query(query) - scored = [ - {"text": chunk["text"], "score": cosine_similarity(query_vector, chunk["embedding"])} - for chunk in embedded_response + results = collection.query( + query_embeddings = [query_vector], + n_results = top_k, + include = ["documents", "metadatas", "distances"] + ) + + return [ + { + "text": results["documents"][0][i], + "source": results["metadatas"][0][i]["source"], + "chunk_index": results["metadatas"][0][i]["chunk_index"], + "distance": results["distances"][0][i] + } + for i in range(len(results["documents"][0])) ] - return sorted(scored, key = lambda x: x["score"], reverse = True)[:top_k] def main(): - results = search("what foods are good for the heart") + query = "what foods are good for the heart" + results = search(query) + + print(f"Query: \"{query}\"\n") for i, result in enumerate(results): - print(f"Result {i + 1} (score: {result['score']:.4f})") + print(f"Result {i + 1} (distance: {result['distance']:.4f}) — 
{result['source']} [chunk {result['chunk_index']}]") print(result["text"]) print() + if __name__ == '__main__': main() diff --git a/utils.py b/utils.py index bc082b2..a04b3d4 100644 --- a/utils.py +++ b/utils.py @@ -1,5 +1,22 @@ import numpy as np + +def load_document(filepath: str) -> str: + try: + with open(filepath, 'r') as f: + return f.read() + except FileNotFoundError as e: + raise FileNotFoundError(f"Document not found: {filepath}: {e}") + + +def load_documents(directory: str) -> list[dict]: + import os + documents = [] + for filename in filter(lambda f: f.endswith('.txt'), os.listdir(directory)): + filepath = os.path.join(directory, filename) + documents.append({"filename": filename, "text": load_document(filepath)}) + return documents + def chunk_text(text: str, chunk_size: int = 300, overlap: int = 50) -> list[str]: words = text.split() step = chunk_size - overlap