diff --git a/README.md b/README.md index db11877..9f574ae 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,26 @@ # RAG Document Engine -A progressive RAG system built from first principles -- from raw embeddings and cosine similarity all the way to a full retrieval-augmented generation pipeline with document ingestion, reranking, and cited answers. +A progressive RAG system built from first principles -- from raw embeddings and cosine similarity all the way to a full retrieval-augmented generation pipeline with multi-format document ingestion and cited answers. --- ## What It Does (Current State) -**Ingestion** +### Ingestion -1. **Loads** `.txt` files (PDF, DOCX, Markdown from Phase 4) +1. **Parses** `.txt`, `.pdf`, `.docx`, and `.md` files into plain text via format-specific parsers 2. **Chunks** each document into overlapping word windows 3. **Embeds** each chunk using OpenAI `text-embedding-3-small`, producing a 1536-dimensional vector -4. **Stores** vectors with metadata (`source`, `chunk_index`) in a persistent Chroma collection +4. **Deduplicates** - deletes any existing chunks for the file before storing, so re-ingestion replaces rather than duplicates +5. **Stores** vectors with metadata (`source`, `chunk_index`) in a persistent Chroma collection -**Search** +### Search 1. **Embeds** the query using the same model 2. **Queries** Chroma for the top-K nearest vectors using built-in ANN (Approximate Nearest Neighbor) search 3. **Returns** results with chunk text, source filename, and distance score -**Generation** +### Generation 1. **Selects** retrieved chunks within a 2000-token budget using `tiktoken` 2. 
**Builds** a numbered context block from the selected chunks @@ -34,6 +35,9 @@ A progressive RAG system built from first principles -- from raw embeddings and - OpenAI SDK (`text-embedding-3-small` for embeddings, `gpt-4o-mini` for generation) - Chroma (persistent vector database) - tiktoken (token counting for context budget management) +- pymupdf (PDF parsing) +- python-docx (DOCX parsing) +- numpy (cosine similarity computation) - python-dotenv --- @@ -42,24 +46,25 @@ A progressive RAG system built from first principles -- from raw embeddings and ```text rag-document-engine/ -├── documents/ # Sample .txt files -│ ├── ancient-rome.txt -│ ├── climate-change.txt -│ ├── music-and-the-brain.txt -│ ├── nutrition-and-health.txt -│ └── space-exploration.txt +├── documents/ # Sample documents (.txt, .pdf, .docx, .md) +├── ingest/ # Format-specific parsers (Phase 4) +│ ├── __init__.py +│ ├── router.py # Resolves parser by file extension +│ ├── pdf_parser.py # PDF extraction via pymupdf +│ ├── docx_parser.py # DOCX extraction via python-docx +│ └── markdown_parser.py # Markdown stripping to plain text ├── prompts/ │ └── system_prompt.txt # LLM system prompt (loaded at runtime) ├── embed.py # embed_chunks and embed_query utilities -├── ingest.py # Load, chunk, embed, store in Chroma +├── ingest.py # CLI entry point - parse, chunk, embed, store ├── search.py # Embed query + retrieve top-K from Chroma ├── generate.py # Token-budgeted answer generation via gpt-4o-mini ├── rag.py # End-to-end pipeline entry point ├── inspect_collection.py # Print collection stats and a sample entry ├── utils.py # chunk_text, load_document, load_documents ├── chroma_db/ # Chroma persistent storage (not committed) -├── diagrams/ # Pipeline diagrams (SVG, auto-exported from PlantUML) -├── docs/ # PlantUML source files and implementation plan +├── diagrams/ # Pipeline diagrams (SVG, generated via npx diagram-sync) +├── docs/ # Phase notes, PlantUML source files, and docs index ├── pyproject.toml 
└── .env # API keys (not committed) ``` @@ -88,8 +93,9 @@ TOKEN_BUDGET=2000 ## Usage ```bash -# Step 1 -- Ingest documents into Chroma -python3 ingest.py +# Step 1 -- Ingest a single file or an entire directory +python3 ingest.py documents/ancient-rome.pdf +python3 ingest.py documents/ # Step 2 -- Ask a question (full RAG pipeline) python3 rag.py "what foods are good for the heart" @@ -130,15 +136,15 @@ No answer found in the documents. **Search only** -- `python3 search.py` ```text -Result 1 (distance: 1.2862) -- nutrition-and-health.txt [chunk 0] +Result 1 (distance: 1.2862) - nutrition-and-health.txt [chunk 0] Nutrition is the science of how food affects the body... Unsaturated fats found in olive oil, nuts, avocados, and fatty fish are associated with reduced risk of heart disease... -Result 2 (distance: 1.3720) -- nutrition-and-health.txt [chunk 1] +Result 2 (distance: 1.3720) - nutrition-and-health.txt [chunk 1] The Mediterranean diet -- rich in vegetables, fruit, whole grains, fish, and olive oil -- is consistently associated with lower rates of heart disease, diabetes, and cognitive decline... -Result 3 (distance: 1.6426) -- music-and-the-brain.txt [chunk 1] +Result 3 (distance: 1.6426) - music-and-the-brain.txt [chunk 1] Music also affects mood and stress. Slow, quiet music activates the parasympathetic nervous system, lowering heart rate and cortisol levels... ``` @@ -156,7 +162,7 @@ Note: distance is an inverse similarity score -- lower means more relevant. 
| 1 | Semantic Foundation | Complete | | 2 | Vector Store | Complete | | 3 | RAG Pipeline | Complete | -| 4 | Document Ingestion | Planned | +| 4 | Document Ingestion | Complete | | 5 | Retrieval Quality | Planned | | 6 | Search and Chat Mode | Planned | | 7 | Role-Based Document Access | Planned | @@ -173,14 +179,15 @@ See [docs/implementation-plan.md](./docs/implementation-plan.md) for full phase - **Model consistency** -- the same embedding model must be used for both documents and queries - **Vector database** -- stores embeddings with metadata and retrieves them by similarity using ANN search - **RAG** -- Retrieval-Augmented Generation: retrieve relevant context, then generate a grounded answer +- **Document parsing** -- format-specific extraction that converts PDF, DOCX, and Markdown into plain text before chunking; all formats share the same embedding and storage flow after parsing --- ## Diagrams -Pipeline diagrams are maintained as PlantUML source files in `docs/` and auto-exported to SVG on every push to main using [diagram-sync](https://www.npmjs.com/package/diagram-sync). +Pipeline diagrams are maintained as PlantUML source files in `docs/` and exported to SVG via `npx diagram-sync` using [diagram-sync](https://www.npmjs.com/package/diagram-sync). -The three diagrams below show the system growing phase by phase -- each one builds on the previous. +The diagrams below show the system growing phase by phase -- each one builds on the previous. ### Phase 1 -- Semantic Search (cosine similarity over JSON embeddings) @@ -193,3 +200,23 @@ The three diagrams below show the system growing phase by phase -- each one buil ### Phase 3 -- RAG Pipeline (generation on top of retrieval) ![RAG Pipeline](./diagrams/docs/pipeline-rag.svg) + +### Phase 4 -- Document Ingestion (multi-format parsing, deduplication) + +The ingestion flow is split into 4 focused diagrams - read in this order: + +**1. 
Entry and Routing** - CLI validation, collection setup, file vs directory routing + +![Entry and Routing](./diagrams/docs/pipeline-document-ingestion-entry-routing.svg) + +**2. Parsing** - router extension resolution, all 4 parsers (PDF / DOCX / MD / TXT), flatten to plain text + +![Parsing](./diagrams/docs/pipeline-document-ingestion-parsing.svg) + +**3. Chunking and Embedding** - sliding window chunking, OpenAI embeddings API call + +![Chunking and Embedding](./diagrams/docs/pipeline-document-ingestion-chunk-embed.svg) + +**4. Upsert** - deduplication check, ChromaDB upsert with full payload + +![Upsert](./diagrams/docs/pipeline-document-ingestion-upsert.svg) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..93d0870 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,52 @@ +# Docs + +This folder contains the implementation plan, phase-by-phase notes, and sequence diagrams for the RAG Document Engine. + +--- + +## Implementation Plan + +| File | Description | +| ---- | ----------- | +| `implementation-plan.md` | Full build plan across all 7 phases — goals, what gets built, stack additions, and questions to answer per phase | + +--- + +## Phase Notes + +One file per phase. Each covers the goal, design decisions, and key concepts for that phase. 
+ +| File | Phase | +| ---- | ----- | +| `phase-1-semantic-foundation.md` | Semantic search from scratch — chunking, embeddings, cosine similarity over flat JSON | +| `phase-2-vector-store.md` | Replace JSON with ChromaDB — persistent collection, metadata, `collection.query()` | +| `phase-3-rag-pipeline.md` | Close the loop — retrieval + LLM generation, grounded answers, citations, token budget | +| `phase-4-document-ingestion.md` | Multi-format ingestion — PDF, DOCX, Markdown parsers, CLI trigger, deduplication | +| `phase-5-retrieval-quality.md` | Improve retrieval — evaluation set, hybrid search (BM25 + vector), re-ranker, metadata filters | +| `phase-6-search-and-chat-mode.md` | Two interaction modes — document search and multi-turn chat with conversation history | +| `phase-7-role-based-document-access.md` | Access control — owner metadata at ingestion, per-user query filters, no bypass path | + +--- + +## Sequence Diagrams + +PlantUML sequence diagrams for each pipeline. Open with any PlantUML-compatible renderer. + +### Document Ingestion Pipeline + +Covers Phase 4. 
Split into 4 focused diagrams — read in this order to follow the full flow: + +| Order | File | Covers | +| ----- | ---- | ------- | +| 1 | [pipeline-document-ingestion-entry-routing.svg](../diagrams/docs/pipeline-document-ingestion-entry-routing.svg) | CLI entry, arg validation, `get_or_create_collection`, file vs directory routing | +| 2 | [pipeline-document-ingestion-parsing.svg](../diagrams/docs/pipeline-document-ingestion-parsing.svg) | Router extension resolution, all 4 parsers (PDF / DOCX / MD / TXT), flatten to plain text | +| 3 | [pipeline-document-ingestion-chunk-embed.svg](../diagrams/docs/pipeline-document-ingestion-chunk-embed.svg) | Sliding window chunking, OpenAI embeddings API call | +| 4 | [pipeline-document-ingestion-upsert.svg](../diagrams/docs/pipeline-document-ingestion-upsert.svg) | Deduplication check, ChromaDB upsert with full payload | + +### Other Pipelines + +| File | Covers | +| ---- | ------- | +| [pipeline-semantic-search.svg](../diagrams/docs/pipeline-semantic-search.svg) | Phase 1 — query embedding, cosine similarity, top-K retrieval over flat JSON | +| [pipeline-vector-store.svg](../diagrams/docs/pipeline-vector-store.svg) | Phase 2 — ingest and query flow using ChromaDB | +| [pipeline-rag.svg](../diagrams/docs/pipeline-rag.svg) | Phase 3 — end-to-end RAG: retrieval, token budget, prompt construction, LLM generation, citations | diff --git a/docs/pipeline-document-ingestion-chunk-embed.puml b/docs/pipeline-document-ingestion-chunk-embed.puml new file mode 100644 index 0000000..4d2ef65 --- /dev/null +++ b/docs/pipeline-document-ingestion-chunk-embed.puml @@ -0,0 +1,35 @@ +@startuml pipeline-document-ingestion-chunk-embed +skinparam sequenceMessageAlign center +skinparam ParticipantPadding 10 + +participant "ingest_file()" as ingest +participant "utils\nchunk_text()" as chunker +participant "embed\nembed_chunks()" as embedder +participant "OpenAI\nEmbeddings API" as openai + +== Chunking == + +ingest -> chunker : chunk_text(text, 
chunk_size = 300, overlap = 50) +activate chunker +chunker -> chunker : words = text.split() +chunker -> chunker : step = chunk_size - overlap = 250 +loop for i in range(0, len(words), step = 250) +chunker -> chunker : chunk_words = words[i : i + 300] +chunker -> chunker : chunks.append(" ".join(chunk_words)) +end +chunker --> ingest : chunks: list[str] +deactivate chunker + +== Embedding == + +ingest -> embedder : embed_chunks(chunks) +activate embedder +embedder -> openai : embeddings.create(model = EMBEDDING_MODEL, input = chunks) +activate openai +openai --> embedder : EmbeddingResponse — response.data[i].embedding +deactivate openai +embedder -> embedder : build [{"text": chunk, "embedding": [...float]}] +embedder --> ingest : embedded: list[dict] +deactivate embedder + +@enduml diff --git a/docs/pipeline-document-ingestion-entry-routing.puml b/docs/pipeline-document-ingestion-entry-routing.puml new file mode 100644 index 0000000..7a8361e --- /dev/null +++ b/docs/pipeline-document-ingestion-entry-routing.puml @@ -0,0 +1,62 @@ +@startuml pipeline-document-ingestion-entry-routing +skinparam sequenceMessageAlign center +skinparam ParticipantPadding 10 + +actor User as user +participant "ingest.py" as main +participant "ChromaDB\nPersistentClient" as chromaClient +participant "ChromaDB\nCollection" as collection + +== Module Init (at import time) == + +main -> chromaClient : PersistentClient(path = "./chroma_db") +activate chromaClient +chromaClient --> main : client (persistent, on-disk) +deactivate chromaClient + +== main() Entry == + +user -> main : python ingest.py PATH +activate main + +opt len(sys.argv) < 2 — no argument provided +main --> user : print "Usage: python ingest.py " +main --> user : sys.exit(1) +end + +main -> chromaClient : get_or_create_collection(name = "documents") +activate chromaClient +chromaClient --> main : collection +deactivate chromaClient +activate collection + +== Path Routing == + +alt target.is_file() +main -> main : 
ingest_file(collection, filepath) + +else target.is_dir() +main -> main : filter files by suffix (.txt | .pdf | .docx | .md) +alt no supported files found in directory +main --> user : print "No supported files found in {target}" +main --> user : sys.exit(0) +else supported files exist +loop for each file in directory +main -> main : ingest_file(collection, filepath) +end +end + +else path does not exist +main --> user : print "Path not found: {target}" +main --> user : sys.exit(1) +end + +== Summary == + +main -> collection : count() +collection --> main : total_count +main --> user : print "Total vectors in collection: {total_count}" +deactivate collection +deactivate main + +@enduml diff --git a/docs/pipeline-document-ingestion-parsing.puml b/docs/pipeline-document-ingestion-parsing.puml new file mode 100644 index 0000000..b259ba9 --- /dev/null +++ b/docs/pipeline-document-ingestion-parsing.puml @@ -0,0 +1,108 @@ +@startuml pipeline-document-ingestion-parsing +skinparam sequenceMessageAlign center +skinparam ParticipantPadding 10 + +participant "ingest_file()" as ingest +participant "router" as router +participant "pdf_parser" as pdfParser +participant "docx_parser" as docxParser +participant "markdown_parser" as mdParser +participant "_parse_txt" as txtParser + +ingest -> router : parse(filepath) +activate router + +router -> router : _resolve_parser(filepath) +router -> router : ext = Path(filepath).suffix.lower() + +opt ext not in _PARSERS (.txt / .pdf / .docx / .md) +router --> ingest : raise ValueError("Unsupported file type: {ext}") +end + +== PDF Parser == + +alt ext == ".pdf" +router -> pdfParser : parse_pdf(filepath) +activate pdfParser +pdfParser -> pdfParser : doc = fitz.open(filepath) +loop for page_num, page in enumerate(doc, start = 1) +pdfParser -> pdfParser : text = page.get_text() +opt text.strip() is non-empty +pdfParser -> pdfParser : append {"text": text, "page": page_num} +end +end +pdfParser -> pdfParser : doc.close() +pdfParser --> router 
: pages: list[dict] +deactivate pdfParser + +== DOCX Parser == + +else ext == ".docx" +router -> docxParser : parse_docx(filepath) +activate docxParser +docxParser -> docxParser : doc = docx.Document(filepath)\ncurrent_heading = None, buffer = [] +loop for each para in doc.paragraphs +alt para.style.name starts with "Heading" +opt buffer is non-empty +docxParser -> docxParser : flush buffer to sections with current_heading +docxParser -> docxParser : buffer = [] +end +docxParser -> docxParser : current_heading = para.text.strip() or None +else regular paragraph +opt para.text.strip() is non-empty +docxParser -> docxParser : buffer.append(para.text.strip()) +end +end +end +opt remaining buffer is non-empty +docxParser -> docxParser : flush buffer to sections with current_heading +end +docxParser --> router : sections: list[dict] +deactivate docxParser + +== Markdown Parser == + +else ext == ".md" +router -> mdParser : parse_markdown(filepath) +activate mdParser +mdParser -> mdParser : open(filepath, "r", encoding = "utf-8"), raw = f.read() +mdParser -> mdParser : sections = [], current_heading = None, buffer = [] +loop for each line in raw.splitlines() +alt line matches r"^#{1,6}\s+(.*)" +opt buffer is non-empty +mdParser -> mdParser : flush buffer to sections with current_heading +mdParser -> mdParser : buffer = [] +end +mdParser -> mdParser : current_heading = match.group(1).strip() or None +else regular line +mdParser -> mdParser : cleaned = _strip_markdown(line) +note right : strips: code fences, HR lines,\nblockquotes, list markers,\nimages, links, bold/italic,\ninline code, HTML tags +opt cleaned is non-empty +mdParser -> mdParser : buffer.append(cleaned) +end +end +end +opt remaining buffer is non-empty +mdParser -> mdParser : flush buffer to sections with current_heading +end +mdParser --> router : sections: list[dict] +deactivate mdParser + +== TXT Parser == + +else ext == ".txt" +router -> txtParser : _parse_txt(filepath) +activate txtParser +txtParser -> 
txtParser : open(filepath, "r", encoding = "utf-8") +txtParser -> txtParser : text = f.read() +txtParser --> router : [{"text": full_text}] +deactivate txtParser +end + +== Flatten == + +router -> router : _flatten(sections)\n"\n\n".join(s["text"] for s in sections) +router --> ingest : plain text: str +deactivate router + +@enduml diff --git a/docs/pipeline-document-ingestion-upsert.puml b/docs/pipeline-document-ingestion-upsert.puml new file mode 100644 index 0000000..19f4be9 --- /dev/null +++ b/docs/pipeline-document-ingestion-upsert.puml @@ -0,0 +1,26 @@ +@startuml pipeline-document-ingestion-upsert +skinparam sequenceMessageAlign center +skinparam ParticipantPadding 10 + +participant "ingest_file()" as ingest +participant "ChromaDB\nCollection" as collection + +== Deduplication == + +ingest -> collection : count() +collection --> ingest : current_count + +opt current_count > 0 — collection already has data +ingest -> collection : delete(where = {"source": filename}) +note right : Removes all existing chunks\nfor this file before re-ingesting\n(handles re-ingest / refresh) +end + +== Upsert == + +ingest -> collection : upsert(ids, embeddings, documents, metadatas) +note right : ids = ["{filename}_{i}" per chunk]\nembeddings = [float vectors from OpenAI]\ndocuments = raw chunk strings\nmetadatas = [{"source": filename, "chunk_index": i}] +collection --> ingest : OK + +ingest -> ingest : print "Ingested {len(chunks)} chunks from {filename}" + +@enduml diff --git a/documents/ancient-rome.pdf b/documents/ancient-rome.pdf new file mode 100644 index 0000000..c2fe119 Binary files /dev/null and b/documents/ancient-rome.pdf differ diff --git a/documents/climate-change.docx b/documents/climate-change.docx new file mode 100644 index 0000000..88a1838 Binary files /dev/null and b/documents/climate-change.docx differ diff --git a/documents/space-exploration.md b/documents/space-exploration.md new file mode 100644 index 0000000..53978c7 --- /dev/null +++ 
b/documents/space-exploration.md @@ -0,0 +1,18 @@ +# Space Exploration + +Space exploration is the investigation of outer space using astronomy and space technology. It began in earnest in the mid-20th century, driven by competition between the United States and the Soviet Union during the Cold War. + +The Soviet Union launched the first artificial satellite, Sputnik 1, in October 1957. This was followed by Yuri Gagarin becoming the first human in space in April 1961, completing a single orbit of Earth aboard Vostok 1. The United States responded by accelerating its own space programme under NASA. + +## Early Missions + +The Apollo programme is one of the greatest achievements in human history. On July 20, 1969, Neil Armstrong and Buzz Aldrin became the first humans to walk on the Moon during the Apollo 11 mission. Armstrong's words — "That's one small step for man, one giant leap for mankind" — were broadcast live to hundreds of millions of people around the world. Five more Moon landings followed before the programme ended in 1972. + +Unmanned missions have expanded our knowledge of the solar system enormously. The Voyager probes, launched in 1977, have now left the solar system and continue to transmit data from interstellar space. The Mars rovers — Curiosity and Perseverance — have been exploring the Martian surface, searching for signs of ancient microbial life and collecting rock samples for eventual return to Earth. + +## Modern Era + +The International Space Station has been continuously inhabited since November 2000, serving as a laboratory for scientific research in microgravity. Astronauts from many countries have lived and worked aboard it, conducting experiments in biology, physics, and medicine that are only possible in the unique environment of space. + +The 21st century has seen the rise of private spaceflight companies. SpaceX developed the reusable Falcon 9 rocket, dramatically reducing the cost of launching payloads into orbit. 
Blue Origin and Virgin Galactic have pursued suborbital tourism. Artemis, NASA's current lunar programme, aims to return humans to the Moon by the mid-2020s, including the first woman and first person of colour to walk on its surface. + diff --git a/ingest.py b/ingest.py index 1952dc0..4c62346 100644 --- a/ingest.py +++ b/ingest.py @@ -1,31 +1,60 @@ -from utils import chunk_text, load_documents -from embed import embed_chunks +import sys +from pathlib import Path import chromadb +from utils import chunk_text +from embed import embed_chunks +from ingest.router import parse, SUPPORTED_EXTENSIONS -client = chromadb.PersistentClient(path="./chroma_db") +client = chromadb.PersistentClient(path = "./chroma_db") -def main(): - collection = client.get_or_create_collection(name="documents") +def ingest_file(collection, filepath: str): + """Parse, chunk, embed and store one file, replacing chunks already stored under its filename.""" + filename = Path(filepath).name + text = parse(filepath) + chunks = chunk_text(text) + # Skip the embedding call when parsing produced no chunks. + embedded = embed_chunks(chunks) if chunks else [] - for doc in load_documents("documents/"): - chunks = chunk_text(doc["text"]) - embedded = embed_chunks(chunks) + if collection.count() > 0: + collection.delete(where = {"source": filename}) + + if not chunks: + print(f"Skipped {filename}: no text extracted") + return - # Delete existing chunks for this source before re-ingesting - if collection.count() > 0: - collection.delete(where={"source": doc["filename"]}) + collection.upsert( + ids = [f"{filename}_{i}" for i in range(len(chunks))], + embeddings = [e["embedding"] for e in embedded], + documents = chunks, + metadatas = [{"source": filename, "chunk_index": i} for i in range(len(chunks))], + ) - collection.upsert( - ids = [f"{doc['filename']}_{i}" for i in range(len(chunks))], - embeddings = [e["embedding"] for e in embedded], - documents = chunks, - metadatas = [{"source": doc["filename"], "chunk_index": i} for i in range(len(chunks))] - ) + print(f"Ingested {len(chunks)} chunks from {filename}") - print(f"Ingested {len(chunks)} chunks from {doc['filename']}") - print(f"\nTotal vectors in collection: {collection.count()}") +def main(): + if 
len(sys.argv) < 2: + print("Usage: python ingest.py <file-or-directory>") + sys.exit(1) + + target = Path(sys.argv[1]) + collection = client.get_or_create_collection(name = "documents") + + if target.is_file(): + ingest_file(collection, str(target)) + elif target.is_dir(): + files = sorted(f for f in target.iterdir() if f.is_file() and f.suffix.lower() in SUPPORTED_EXTENSIONS) # regular files only, in a stable order + if not files: + print(f"No supported files found in {target}") + sys.exit(0) + for f in files: + ingest_file(collection, str(f)) + else: + print(f"Path not found: {target}") + sys.exit(1) + + print(f"\nTotal vectors in collection: {collection.count()}") if __name__ == '__main__': - main() + main() diff --git a/ingest/__init__.py b/ingest/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ingest/docx_parser.py b/ingest/docx_parser.py new file mode 100644 index 0000000..efe1b7c --- /dev/null +++ b/ingest/docx_parser.py @@ -0,0 +1,23 @@ +import docx + + +def parse_docx(filepath: str) -> list[dict]: + doc = docx.Document(filepath) + sections = [] + current_heading = None + buffer = [] + + for para in doc.paragraphs: + if para.style.name.startswith("Heading"): + if buffer: + sections.append({"text": "\n".join(buffer), "heading": current_heading}) + buffer = [] + current_heading = para.text.strip() or None + else: + if para.text.strip(): + buffer.append(para.text.strip()) + + if buffer: + sections.append({"text": "\n".join(buffer), "heading": current_heading}) + + return sections diff --git a/ingest/markdown_parser.py b/ingest/markdown_parser.py new file mode 100644 index 0000000..455799d --- /dev/null +++ b/ingest/markdown_parser.py @@ -0,0 +1,44 @@ +import re # Python's built in regular expressions module + + +def parse_markdown(filepath: str) -> list[dict]: + with open(filepath, "r", encoding = "utf-8") as f: + raw = f.read() + + sections = [] + current_heading = None + buffer = [] + + for line in raw.splitlines(): + heading_match = re.match(r"^#{1,6}\s+(.*)", line) + if heading_match: + if buffer: + 
sections.append({"text": "\n".join(buffer), "heading": current_heading}) + buffer = [] + current_heading = heading_match.group(1).strip() or None + else: + cleaned = _strip_markdown(line) + if cleaned: + buffer.append(cleaned) + + if buffer: + sections.append({"text": "\n".join(buffer), "heading": current_heading}) + + return sections + + +def _strip_markdown(line: str) -> str: + if re.match(r"^```", line): + return "" + if re.match(r"^(\*{3,}|-{3,}|_{3,})\s*$", line): + return "" + line = re.sub(r"^>\s?", "", line) + line = re.sub(r"^\s*[-*+]\s+", "", line) + line = re.sub(r"^\s*\d+\.\s+", "", line) + line = re.sub(r"!\[([^\]]*)\]\([^)]*\)", r"\1", line) + line = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", line) + line = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", line) + line = re.sub(r"(?<!\w)_{1,3}([^_]+)_{1,3}(?!\w)", r"\1", line) # word-boundary guards keep snake_case identifiers intact + line = re.sub(r"`([^`]*)`", r"\1", line) + line = re.sub(r"<[^>]+>", "", line) + return line.strip() diff --git a/ingest/pdf_parser.py b/ingest/pdf_parser.py new file mode 100644 index 0000000..2bdc759 --- /dev/null +++ b/ingest/pdf_parser.py @@ -0,0 +1,15 @@ +import fitz # pymupdf + + +def parse_pdf(filepath: str) -> list[dict]: + """Return one {"text", "page"} dict per page that has non-whitespace text; pages are numbered from 1.""" + pages = [] + + # Context manager closes the document even if text extraction raises. + with fitz.open(filepath) as doc: + for page_num, page in enumerate(doc, start = 1): + text = page.get_text() + if text.strip(): + pages.append({"text": text, "page": page_num}) + + return pages diff --git a/ingest/router.py b/ingest/router.py new file mode 100644 index 0000000..6c8b8bd --- /dev/null +++ b/ingest/router.py @@ -0,0 +1,36 @@ +from pathlib import Path +from .pdf_parser import parse_pdf +from .docx_parser import parse_docx +from .markdown_parser import parse_markdown + + +def _parse_txt(filepath: str) -> list[dict]: + with open(filepath, "r", encoding = "utf-8") as f: + return [{"text": f.read()}] + + +_PARSERS = { + ".txt": _parse_txt, + ".pdf": parse_pdf, + ".docx": parse_docx, + ".md": parse_markdown, +} + +SUPPORTED_EXTENSIONS = set(_PARSERS.keys()) + + +def _resolve_parser(filepath: 
str): + ext = Path(filepath).suffix.lower() # lower-cased so ".PDF" and ".pdf" resolve to the same parser + parser = _PARSERS.get(ext) + if parser is None: + raise ValueError(f"Unsupported file type: {ext}") + return parser + + +def _flatten(sections: list[dict]) -> str: + return "\n\n".join(s["text"] for s in sections) # only each section's "text" is kept; any other keys are dropped + + +def parse(filepath: str) -> str: + parser = _resolve_parser(filepath) # raises ValueError for an unsupported extension + return _flatten(parser(filepath)) # parser returns a list of section dicts; flatten joins them into one string diff --git a/pyproject.toml b/pyproject.toml index 4d8b28f..17d7cd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,9 @@ dependencies = [ "python-dotenv>=1.0.0", "numpy>=1.26.0", "chromadb>=1.5.9", - "tiktoken>=0.7.0" + "tiktoken>=0.7.0", + "pymupdf>=1.24.0", + "python-docx>=1.0.0" ] [tool.setuptools]