diff --git a/diagrams/diagrams/pipeline-phase2.svg b/diagrams/diagrams/pipeline-phase2.svg deleted file mode 100644 index a1bd146..0000000 --- a/diagrams/diagrams/pipeline-phase2.svg +++ /dev/null @@ -1,97 +0,0 @@ -Phase 2 -- Chroma Vector Store PipelineIngestion (ingest.py)Search (search.py).txt filesload_documents()chunk_text()embed_chunks()Queryembed_query()collection.query()Top-K Results(text + source + distance)OpenAI APItext-embedding-3-smallChroma DB(persistent)API call1536-dim vectorsupsert withids + metadataAPI callquery vectorANN searchnearest vectors \ No newline at end of file diff --git a/diagrams/docs/pipeline-phase2.svg b/diagrams/docs/pipeline-phase2.svg deleted file mode 100644 index a1bd146..0000000 --- a/diagrams/docs/pipeline-phase2.svg +++ /dev/null @@ -1,97 +0,0 @@ -Phase 2 -- Chroma Vector Store PipelineIngestion (ingest.py)Search (search.py).txt filesload_documents()chunk_text()embed_chunks()Queryembed_query()collection.query()Top-K Results(text + source + distance)OpenAI APItext-embedding-3-smallChroma DB(persistent)API call1536-dim vectorsupsert withids + metadataAPI callquery vectorANN searchnearest vectors \ No newline at end of file diff --git a/diagrams/docs/pipeline-semantic-search.svg b/diagrams/docs/pipeline-semantic-search.svg deleted file mode 100644 index cc43a4d..0000000 --- a/diagrams/docs/pipeline-semantic-search.svg +++ /dev/null @@ -1,80 +0,0 @@ -Indexing — embed.pySearch — search.pydocuments/chunk_text300 words · 50 overlapembed_chunksOpenAI APIembeddings.jsonQueryembed_queryOpenAI APIcosine_similarityvs all chunksTop-K Results \ No newline at end of file diff --git a/diagrams/docs/pipeline-vector-store.svg b/diagrams/docs/pipeline-vector-store.svg deleted file mode 100644 index 5651427..0000000 --- a/diagrams/docs/pipeline-vector-store.svg +++ /dev/null @@ -1,99 +0,0 @@ -Phase 2 -- Chroma Vector Store PipelineIngestion (ingest.py)Search (search.py).txt filesload_documents()chunk_text()embed_chunks()Queryembed_query()collection.query()Top-K Results(text + source + distance)OpenAI APItext-embedding-3-smallChroma DB(persistent)API call1536-dim vectorsupsert withids + metadataAPI callquery vectorANN searchnearest vectors \ No newline at end of file diff --git a/diagrams/docs/pipeline.svg b/diagrams/docs/pipeline.svg deleted file mode 100644 index aacf16b..0000000 --- a/diagrams/docs/pipeline.svg +++ /dev/null @@ -1,80 +0,0 @@ -Indexing — embed.pySearch — search.pydocuments/chunk_text300 words · 50 overlapembed_chunksOpenAI APIembeddings.jsonQueryembed_queryOpenAI APIcosine_similarityvs all chunksTop-K Results \ No newline at end of file diff --git a/docs/pipeline-vector-store.puml b/docs/pipeline-vector-store.puml index 954dd62..0322a30 100644 --- a/docs/pipeline-vector-store.puml +++ b/docs/pipeline-vector-store.puml @@ -3,60 +3,51 @@ skinparam backgroundColor #FFFFFF skinparam defaultFontName Arial skinparam defaultFontSize 13 -skinparam ArrowColor #555555 -skinparam ArrowThickness 2 -skinparam componentStyle rectangle -skinparam linetype ortho - -skinparam component { - BackgroundColor #EEF3FB - BorderColor #5577AA - FontColor #222222 -} - -skinparam database { - BackgroundColor #FFF8E7 - BorderColor #CC9900 -} - -skinparam cloud { - BackgroundColor #F0FFF0 - BorderColor #448844 -} - -title Phase 2 -- Chroma Vector Store Pipeline - -package "Ingestion (ingest.py)" { - [.txt files] as docs - [load_documents()] as load - [chunk_text()] as chunk - [embed_chunks()] as embed -} - -cloud "OpenAI API\ntext-embedding-3-small" as openai - -database "Chroma DB\n(persistent)" as chroma - -package "Search (search.py)" { - [Query] as query - [embed_query()] as embedq - [collection.query()] as cquery - [Top-K Results\n(text + source + distance)] as results -} - -docs --> load -load --> chunk -chunk --> embed -embed --> openai : API call +skinparam sequenceArrowThickness 2 +skinparam SequenceBoxBackgroundColor #F8F9FF +skinparam SequenceBoxBorderColor #AABBDD +skinparam ParticipantBackgroundColor #EEF3FB +skinparam ParticipantBorderColor #5577AA +skinparam ParticipantFontColor #222222 +skinparam DatabaseBackgroundColor #FFF8E7 +skinparam DatabaseBorderColor #CC9900 +skinparam ActorBackgroundColor #F0FFF0 +skinparam ActorBorderColor #448844 + +title Chroma Vector Store Pipeline + +box "Ingestion" #EEF6FF + participant "load_documents()" as load + participant "chunk_text()" as chunk + participant "embed_chunks()" as embed +end box + +participant "OpenAI API" as openai +database "Chroma DB" as chroma + +box "Search" #FFF0F8 + actor "Query" as query + participant "embed_query()" as embedq + participant "collection.query()" as cquery +end box + +== Ingestion == + +[-> load : .txt files +load -> chunk : text +chunk -> embed : chunks +embed -> openai : embed request openai --> embed : 1536-dim vectors -embed --> chroma : upsert with\nids + metadata +embed -> chroma : upsert(ids, embeddings, metadata) -query --> embedq -embedq --> openai : API call +== Search == + +query -> embedq : query string +embedq -> openai : embed request openai --> embedq : query vector -embedq --> cquery -cquery --> chroma : ANN search +embedq -> cquery : query vector +cquery -> chroma : ANN search chroma --> cquery : nearest vectors -cquery --> results +cquery --> query : Top-K results\n(text + source + distance) @enduml