diff --git a/docs/pipeline-rag.puml b/docs/pipeline-rag.puml
new file mode 100644
index 0000000..65193d3
--- /dev/null
+++ b/docs/pipeline-rag.puml
@@ -0,0 +1,70 @@
+@startuml pipeline-rag
+
+skinparam backgroundColor #FFFFFF
+skinparam defaultFontName Arial
+skinparam defaultFontSize 16
+skinparam sequenceArrowThickness 2
+skinparam SequenceBoxBackgroundColor #F8F9FF
+skinparam SequenceBoxBorderColor #AABBDD
+skinparam ParticipantBackgroundColor #EEF3FB
+skinparam ParticipantBorderColor #5577AA
+skinparam ParticipantFontColor #222222
+skinparam DatabaseBackgroundColor #FFF8E7
+skinparam DatabaseBorderColor #CC9900
+skinparam ActorBackgroundColor #F0FFF0
+skinparam ActorBorderColor #448844
+
+title RAG Pipeline
+
+box "Retrieval" #EEF6FF
+  actor "User" as user
+  participant "rag.py" as rag
+  participant "search.py" as search
+  participant "embed.py" as embed
+end box
+
+database "Chroma DB" as chroma
+participant "OpenAI API" as openai
+
+box "Generation" #FFF0F8
+  participant "generate.py" as generate
+end box
+
+== Retrieval ==
+
+user -> rag : question\n(CLI arg or input())
+rag -> search : search(question, top_k=5)
+search -> embed : embed_query()
+embed -> openai : embeddings.create()\ntext-embedding-3-small
+openai --> embed : query vector (1536-dim)
+embed --> search : query vector
+search -> chroma : query(query_embeddings, n_results=5)
+chroma --> search : top-5 nearest chunks\n(text + source + chunk_index + distance)
+search --> rag : chunks list
+
+== Generation ==
+
+rag -> generate : generate_answer(question, chunks)
+generate -> generate : _select_chunks_within_budget()\ntiktoken counts tokens per chunk\nskips any chunk that would push the running total past the 2000-token budget
+generate -> generate : _build_context_block()\nnumbered + labelled context entries
+generate -> openai : chat.completions.create()\ngpt-4o-mini · temperature=0
+note right of openai
+  System prompt:
+  Answer ONLY from the provided context.
+  If context is insufficient, respond with:
+  "I don't know based on the provided documents."
+end note
+openai --> generate : answer text
+generate --> rag : { answer, sources }
+
+== Output ==
+
+rag -> rag : answer == NO_ANSWER_PHRASE?
+alt answer not in documents
+  rag -> user : "No answer found in the documents."
+else answer found
+  rag -> user : Answer text\n+ source citations\n(source filename · chunk index · text preview)
+end
+
+' end
+@enduml
diff --git a/generate.py b/generate.py
new file mode 100644
index 0000000..d79f286
--- /dev/null
+++ b/generate.py
@@ -0,0 +1,90 @@
+from openai import OpenAI
+from dotenv import load_dotenv
+import tiktoken
+import os
+
+# Load variables from a local .env file into the process environment.
+# This runs before the os.getenv() calls below so they can see those values.
+load_dotenv()
+
+# Shared OpenAI client used by generate_answer(); the key is read from the environment.
+client = OpenAI(api_key = os.getenv("OPENAI_API_KEY"))
+
+# Chat model used for answer generation; overridable via the GENERATION_MODEL env var.
+GENERATION_MODEL = os.getenv("GENERATION_MODEL", "gpt-4o-mini")
+# Maximum total number of chunk-text tokens placed in the prompt context (env override: TOKEN_BUDGET).
+TOKEN_BUDGET = int(os.getenv("TOKEN_BUDGET", "2000"))
+# Sentinel answer meaning "the context does not answer the question".
+# Must stay identical to the sentence the system prompt asks the model to emit,
+# because callers compare the answer against it with ==.
+NO_ANSWER_PHRASE = "I don't know based on the provided documents."
+
+# Absolute path of the "prompts" directory that sits next to this module.
+_PROMPTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "prompts")
+
+
+def _load_prompt(filename: str) -> str:
+	path = os.path.join(_PROMPTS_DIR, filename)
+	with open(path) as f:
+		return f.read().strip()
+
+
+# System prompt text, read from prompts/system_prompt.txt once at import time.
+_SYSTEM_PROMPT = _load_prompt("system_prompt.txt")
+
+
+def _count_tokens(text: str) -> int:
+	enc = tiktoken.get_encoding("cl100k_base")
+	return len(enc.encode(text))
+
+
+def _select_chunks_within_budget(chunks: list[dict], budget: int) -> list[dict]:
+	selected = []
+	tokens_used = 0
+
+	for chunk in chunks:
+		chunk_tokens = _count_tokens(chunk["text"])
+		if tokens_used + chunk_tokens > budget:
+			continue
+		selected.append(chunk)
+		tokens_used += chunk_tokens
+
+	return selected
+
+
+def _build_context_block(chunks: list[dict]) -> str:
+	lines = []
+	for i, chunk in enumerate(chunks, start = 1):
+		lines.append(f"[{i}] Source: {chunk['source']} (chunk {chunk['chunk_index']})")
+		lines.append(chunk["text"])
+		lines.append("")
+	return "\n".join(lines)
+
+
+def generate_answer(question: str, chunks: list[dict]) -> dict:
+	selected_chunks = _select_chunks_within_budget(chunks, TOKEN_BUDGET)
+
+	if not selected_chunks:
+		return {
+			"answer": NO_ANSWER_PHRASE,
+			"sources": []
+		}
+
+	context_block = _build_context_block(selected_chunks)
+	user_message = f"Context:\n{context_block}\nQuestion: {question}"
+
+	response = client.chat.completions.create(
+		model = GENERATION_MODEL,
+		messages = [
+			{"role": "system", "content": _SYSTEM_PROMPT},
+			{"role": "user", "content": user_message}
+		],
+		temperature = 0.0
+	)
+
+	answer = response.choices[0].message.content.strip()
+
+	sources = [
+		{
+			"index": i + 1,
+			"source": chunk["source"],
+			"chunk_index": chunk["chunk_index"],
+			"text": chunk["text"]
+		}
+		for i, chunk in enumerate(selected_chunks)
+	]
+
+	return {
+		"answer": answer,
+		"sources": sources
+	}
diff --git a/prompts/system_prompt.txt b/prompts/system_prompt.txt
new file mode 100644
index 0000000..8d55a29
--- /dev/null
+++ b/prompts/system_prompt.txt
@@ -0,0 +1,8 @@
+You are a precise document assistant. Answer the user's question using ONLY the information provided in the context below.
+
+Rules:
+- Base your answer strictly on the provided context. Do not use prior knowledge.
+- If the context does not contain enough information to answer the question, respond with exactly:
+  I don't know based on the provided documents.
+- Do not add caveats, speculation, or information not present in the context.
+- Be concise and factual.
diff --git a/pyproject.toml b/pyproject.toml
index 0b9e40b..4d8b28f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,8 @@ dependencies = [
     "openai>=1.0.0",
     "python-dotenv>=1.0.0",
     "numpy>=1.26.0",
-    "chromadb>=1.5.9"
+    "chromadb>=1.5.9",
+    "tiktoken>=0.7.0"
 ]
 
 [tool.setuptools]
diff --git a/rag.py b/rag.py
new file mode 100644
index 0000000..bc5a0ff
--- /dev/null
+++ b/rag.py
@@ -0,0 +1,49 @@
+import sys
+from search import search
+from generate import generate_answer, NO_ANSWER_PHRASE
+
+# Number of chunks requested from search() for each question.
+TOP_K = 5
+
+
+def _print_result(result: dict) -> None:
+	print("\nAnswer:")
+	print(result["answer"])
+
+	if result["sources"]:
+		print("\nSources:")
+		for source in result["sources"]:
+			preview = source["text"][:120].replace("\n", " ")
+			print(f"  [{source['index']}] {source['source']} (chunk {source['chunk_index']}): \"{preview}...\"")
+
+
+def run(question: str) -> None:
+	chunks = search(question, top_k = TOP_K)
+
+	if not chunks:
+		print("No documents found in the collection. Run ingest.py first.")
+		return
+
+	result = generate_answer(question, chunks)
+
+	if result["answer"].strip() == NO_ANSWER_PHRASE:
+		print("\nNo answer found in the documents.")
+		return
+
+	_print_result(result)
+
+
+def main():
+	if len(sys.argv) > 1:
+		question = " ".join(sys.argv[1:])
+	else:
+		question = input("Enter your question: ").strip()
+
+	if not question:
+		print("Error: question cannot be empty.")
+		sys.exit(1)
+
+	run(question)
+
+
+if __name__ == "__main__":
+	main()