From 1a571f3063640d00844f4e78c4df3d7150864234 Mon Sep 17 00:00:00 2001 From: Buffden Date: Wed, 17 Jun 2026 19:37:57 -0500 Subject: [PATCH 1/2] enhance pipeline chart --- docs/pipeline-vector-store.puml | 39 +++++++++++++++++++-------------- inspect_collection.py | 10 ++++----- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/docs/pipeline-vector-store.puml b/docs/pipeline-vector-store.puml index 5c05064..9b5e57f 100644 --- a/docs/pipeline-vector-store.puml +++ b/docs/pipeline-vector-store.puml @@ -17,9 +17,10 @@ skinparam ActorBorderColor #448844 title Chroma Vector Store Pipeline box "Ingestion" #EEF6FF - participant "load_documents()" as load - participant "chunk_text()" as chunk - participant "embed_chunks()" as embed + collections "documents/" as docs + participant "ingest.py" as ingest + participant "utils.py" as utils + participant "embed.py" as embed end box participant "OpenAI API" as openai @@ -27,28 +28,32 @@ database "Chroma DB" as chroma box "Search" #FFF0F8 actor "Query" as query - participant "embed_query()" as embedq - participant "collection.query()" as cquery + participant "search.py" as search end box == Ingestion == -[-> load : .txt files -load -> chunk : text -chunk -> embed : chunks -embed -> openai : embed request +docs -> ingest : .txt files +ingest -> utils : load_documents() +utils --> ingest : text +ingest -> utils : chunk_text() +utils --> ingest : chunks +ingest -> embed : embed_chunks() +embed -> openai : embeddings.create() openai --> embed : 1536-dim vectors -embed -> chroma : upsert(ids, embeddings, metadata) +embed --> ingest : embedded chunks +ingest -> chroma : upsert(ids, embeddings, documents, metadatas) == Search == -query -> embedq : query string -embedq -> openai : embed request -openai --> embedq : query vector -embedq -> cquery : query vector -cquery -> chroma : ANN search -chroma --> cquery : nearest vectors -cquery --> query : Top-K results\n(text + source + distance) +query -> search : query string +search -> embed : embed_query() +embed -> openai : embeddings.create() +openai --> embed : query vector +embed --> search : query vector +search -> chroma : query(query_embeddings, n_results, include) +chroma --> search : nearest vectors +search --> query : Top-K results\n(text + source + distance) ' end @enduml diff --git a/inspect_collection.py b/inspect_collection.py index 70c4f36..a971b56 100644 --- a/inspect_collection.py +++ b/inspect_collection.py @@ -14,11 +14,11 @@ def main(): ) print("Sample entry:") - print(f" id: {sample['ids'][0]}") - print(f" source: {sample['metadatas'][0]['source']}") - print(f" chunk_index: {sample['metadatas'][0]['chunk_index']}") - print(f" text: {sample['documents'][0][:120]}...") - print(f" embedding: [{sample['embeddings'][0][0]:.6f}, {sample['embeddings'][0][1]:.6f}, ...] ({len(sample['embeddings'][0])} dims)") + print(f"id: {sample['ids'][0]}") + print(f"source: {sample['metadatas'][0]['source']}") + print(f"chunk_index: {sample['metadatas'][0]['chunk_index']}") + print(f"text: {sample['documents'][0][:120]}...") + print(f"embedding: [{sample['embeddings'][0][0]:.6f}, {sample['embeddings'][0][1]:.6f}, ...] ({len(sample['embeddings'][0])} dims)") if __name__ == '__main__': From 6a27a13fb9e9d5e7358e50a61403decb0310e0be Mon Sep 17 00:00:00 2001 From: Buffden Date: Wed, 17 Jun 2026 19:40:09 -0500 Subject: [PATCH 2/2] update readme --- README.md | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index fc49782..7af7f1f 100644 --- a/README.md +++ b/README.md @@ -6,32 +6,21 @@ A progressive RAG system built from first principles -- from raw embeddings and ## What It Does (Current State) - - - - - -
- -Ingestion +**Ingestion** 1. **Loads** `.txt` files (PDF, DOCX, Markdown from Phase 4) 2. **Chunks** each document into overlapping word windows 3. **Embeds** each chunk using OpenAI `text-embedding-3-small`, producing a 1536-dimensional vector 4. **Stores** vectors with metadata (`source`, `chunk_index`) in a persistent Chroma collection -Search +**Search** 1. **Embeds** the query using the same model 2. **Queries** Chroma for the top-K nearest vectors using built-in ANN (Approximate Nearest Neighbor) search 3. **Returns** results with chunk text, source filename, and distance score - - ![Pipeline](./diagrams/docs/pipeline-vector-store.svg) -
- --- ## Stack