diff --git a/docs/pipeline-rag.puml b/docs/pipeline-rag.puml new file mode 100644 index 0000000..65193d3 --- /dev/null +++ b/docs/pipeline-rag.puml @@ -0,0 +1,70 @@ +@startuml pipeline-rag + +skinparam backgroundColor #FFFFFF +skinparam defaultFontName Arial +skinparam defaultFontSize 16 +skinparam sequenceArrowThickness 2 +skinparam SequenceBoxBackgroundColor #F8F9FF +skinparam SequenceBoxBorderColor #AABBDD +skinparam ParticipantBackgroundColor #EEF3FB +skinparam ParticipantBorderColor #5577AA +skinparam ParticipantFontColor #222222 +skinparam DatabaseBackgroundColor #FFF8E7 +skinparam DatabaseBorderColor #CC9900 +skinparam ActorBackgroundColor #F0FFF0 +skinparam ActorBorderColor #448844 + +title RAG Pipeline + +box "Retrieval" #EEF6FF + actor "User" as user + participant "rag.py" as rag + participant "search.py" as search + participant "embed.py" as embed +end box + +database "Chroma DB" as chroma +participant "OpenAI API" as openai + +box "Generation" #FFF0F8 + participant "generate.py" as generate +end box + +== Retrieval == + +user -> rag : question\n(CLI arg or input()) +rag -> search : search(question, top_k=5) +search -> embed : embed_query() +embed -> openai : embeddings.create()\ntext-embedding-3-small +openai --> embed : query vector (1536-dim) +embed --> search : query vector +search -> chroma : query(query_embeddings, n_results=5) +chroma --> search : top-5 nearest chunks\n(text + source + chunk_index + distance) +search --> rag : chunks list + +== Generation == + +rag -> generate : generate_answer(question, chunks) +generate -> generate : _select_chunks_within_budget()\ntiktoken counts tokens per chunk\nskips any chunk exceeding 2000-token budget +generate -> generate : _build_context_block()\nnumbered + labelled context entries +generate -> openai : chat.completions.create()\ngpt-4o-mini · temperature=0 +note right of openai + System prompt: + Answer ONLY from the provided context. 
def _load_prompt(filename: str) -> str:
    """Read a prompt template from the prompts directory.

    Args:
        filename: Name of a file located inside ``_PROMPTS_DIR``.

    Returns:
        The file contents with leading and trailing whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
    """
    path = os.path.join(_PROMPTS_DIR, filename)
    # Pin the encoding: the platform default (e.g. cp1252 on Windows) would
    # mis-decode or reject non-ASCII characters in a prompt file.
    with open(path, encoding="utf-8") as f:
        return f.read().strip()


def _count_tokens(text: str) -> int:
    """Return how many tokens *text* occupies for the generation model.

    The tokenizer is the one tiktoken maps ``GENERATION_MODEL`` to; model
    names tiktoken does not recognise fall back to ``cl100k_base``.
    """
    try:
        enc = tiktoken.encoding_for_model(GENERATION_MODEL)
    except KeyError:
        enc = tiktoken.get_encoding("cl100k_base")
    # Document text may legitimately contain strings such as "<|endoftext|>".
    # With the default settings encode() raises ValueError on them; an empty
    # disallowed set makes tiktoken encode them as ordinary text instead.
    return len(enc.encode(text, disallowed_special=()))


def _select_chunks_within_budget(chunks: list[dict], budget: int) -> list[dict]:
    """Greedily pick chunks, in the given order, that fit in *budget* tokens.

    A chunk that would push the running total past the budget is skipped
    rather than ending the scan, so a later, smaller chunk can still be
    selected. Only ``chunk["text"]`` is counted against the budget.

    Args:
        chunks: Dicts that each carry a ``"text"`` entry.
        budget: Maximum total number of tokens across the selected texts.

    Returns:
        The selected chunks, in their original relative order.
    """
    selected = []
    tokens_used = 0

    for chunk in chunks:
        chunk_tokens = _count_tokens(chunk["text"])
        if tokens_used + chunk_tokens > budget:
            continue
        selected.append(chunk)
        tokens_used += chunk_tokens

    return selected


def _build_context_block(chunks: list[dict]) -> str:
    """Render chunks as numbered, source-labelled context entries.

    Each chunk contributes a header line ``[i] Source: <source> (chunk <n>)``
    (numbered from 1), its text, and an empty separator line.
    """
    lines = []
    for i, chunk in enumerate(chunks, start=1):
        lines.append(f"[{i}] Source: {chunk['source']} (chunk {chunk['chunk_index']})")
        lines.append(chunk["text"])
        lines.append("")
    return "\n".join(lines)


def generate_answer(question: str, chunks: list[dict]) -> dict:
    """Answer *question* from the retrieved *chunks* with the chat model.

    Args:
        question: The user's question.
        chunks: Retrieved chunks; each dict provides ``"text"``,
            ``"source"`` and ``"chunk_index"``.

    Returns:
        A dict with ``"answer"`` (the answer text) and ``"sources"`` (one
        dict per chunk sent to the model, holding its 1-based ``"index"``,
        ``"source"``, ``"chunk_index"`` and ``"text"``). When no chunk fits
        the token budget, or the model returns no text, the answer is
        ``NO_ANSWER_PHRASE``.
    """
    selected_chunks = _select_chunks_within_budget(chunks, TOKEN_BUDGET)

    # Nothing fits the budget: skip the API call entirely.
    if not selected_chunks:
        return {"answer": NO_ANSWER_PHRASE, "sources": []}

    context_block = _build_context_block(selected_chunks)
    user_message = f"Context:\n{context_block}\nQuestion: {question}"

    response = client.chat.completions.create(
        model=GENERATION_MODEL,
        messages=[
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": user_message},
        ],
        temperature=0.0,
    )

    # The API may return a message whose content is None (for example on a
    # refusal); calling .strip() on it would raise AttributeError. Treat a
    # missing or blank reply as "no answer" so callers see the sentinel.
    content = response.choices[0].message.content
    answer = (content or "").strip() or NO_ANSWER_PHRASE

    sources = [
        {
            "index": i,
            "source": chunk["source"],
            "chunk_index": chunk["chunk_index"],
            "text": chunk["text"],
        }
        for i, chunk in enumerate(selected_chunks, start=1)
    ]

    return {"answer": answer, "sources": sources}
def _print_result(result: dict) -> None:
    """Print the answer followed by a short preview of each cited source.

    Args:
        result: Dict with ``"answer"`` and ``"sources"``; every source dict
            provides ``"index"``, ``"source"``, ``"chunk_index"`` and
            ``"text"``.
    """
    preview_chars = 120

    print("\nAnswer:")
    print(result["answer"])

    if not result["sources"]:
        return

    print("\nSources:")
    for source in result["sources"]:
        text = source["text"]
        preview = text[:preview_chars].replace("\n", " ")
        # Only mark the preview as truncated when text was actually cut off.
        suffix = "..." if len(text) > preview_chars else ""
        print(
            f" [{source['index']}] {source['source']} "
            f"(chunk {source['chunk_index']}): \"{preview}{suffix}\""
        )


def run(question: str) -> None:
    """Retrieve chunks for *question*, generate an answer and print it.

    Prints a hint to run the ingestion step when the search returns nothing,
    and a fixed "no answer" message when the generated answer equals
    ``NO_ANSWER_PHRASE``.
    """
    chunks = search(question, top_k=TOP_K)

    if not chunks:
        print("No documents found in the collection. Run ingest.py first.")
        return

    result = generate_answer(question, chunks)

    if result["answer"].strip() == NO_ANSWER_PHRASE:
        print("\nNo answer found in the documents.")
        return

    _print_result(result)


def main() -> None:
    """Command-line entry point.

    The question is taken from the command-line arguments (joined with
    spaces) or, when none are given, read interactively. An empty question
    prints an error and exits with status 1.
    """
    if len(sys.argv) > 1:
        question = " ".join(sys.argv[1:])
    else:
        try:
            question = input("Enter your question: ").strip()
        except EOFError:
            # stdin closed or empty (Ctrl-D, piped empty input): report it
            # as an empty question instead of crashing with a traceback.
            question = ""

    if not question:
        print("Error: question cannot be empty.")
        sys.exit(1)

    run(question)