From 261eba7ac322b1ccd047a5fc0fd0555ca32337eb Mon Sep 17 00:00:00 2001 From: Buffden Date: Tue, 23 Jun 2026 16:58:38 -0500 Subject: [PATCH 1/5] document ingestion - include pymupdf for pdf parsing and python-docx for docx file parsing in toml file --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4d8b28f..17d7cd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,9 @@ dependencies = [ "python-dotenv>=1.0.0", "numpy>=1.26.0", "chromadb>=1.5.9", - "tiktoken>=0.7.0" + "tiktoken>=0.7.0", + "pymupdf>=1.24.0", + "python-docx>=1.0.0" ] [tool.setuptools] From a2a8e10b2c67b23fba3f0c7cbdb3f8ebc4b5f8a9 Mon Sep 17 00:00:00 2001 From: Buffden Date: Tue, 23 Jun 2026 16:59:13 -0500 Subject: [PATCH 2/5] document ingestion - documents for ingestion testing in pdf, docs and md format --- documents/ancient-rome.pdf | Bin 0 -> 2310 bytes documents/climate-change.docx | Bin 0 -> 37402 bytes documents/space-exploration.md | 18 ++++++++++++++++++ 3 files changed, 18 insertions(+) create mode 100644 documents/ancient-rome.pdf create mode 100644 documents/climate-change.docx create mode 100644 documents/space-exploration.md diff --git a/documents/ancient-rome.pdf b/documents/ancient-rome.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c2fe1193d53f0e2b5cb71f3fda0aca3b1a3f1ed1 GIT binary patch literal 2310 zcmZuzX;@Qd7OqyUT(z!1ZEN@JxBQOa(%C zcZpCLNH#(SP0Jzund2&#sn9|ph%^GE7~}o|Zp{1?9Pfz;C47cFDv*GBDRc@*BNK!0 z-f|URg@6RSH=>k>D+FjO#xmY$TM7i_@>^d_v>bX<2Es-kNHu<^Gc!_{UeHJn&kYY& z83$#Y7KCT;mBhN8_LPtXEHT4t{d&Y(%%|4)Y7x&nL(HJd?bxoN`IQ~Qe&Chs(Ch$<$dv(U2^xD zimYG#bLurCvR1LlL$86963@D4+q@qSs2^herLqyH!x`&p>VUIt(GB5sU+sxqzN7eN z&Vs_Qr)kOEO|x@6tVI5H+qD<8gNr#S!M9uHVorlru4_K`Q=0{HINjSrEpN=gFS>pA z?(Xr`+{c)98@acWMWgjar>pDO_AidP;HsWxk*fMATfTcXGK;+-as8#TI{D6j7g;)% zW)}Y9Y@3)Rp3!w8x4+Qo>zbZS{TR;ByG`R1TNz`h6(4NsTZl8C z4`;4)ds^F`S}6#eC26^NzJ7(be-k5B znv_t=6moss`*Xq`y_l#?Xj^1SO5zEFeHxqZJ+#$-^?6oAW*;tOdEUwKZFVcJ^JqOwd+*!K38hIj}QEq1bHs9@43YH}Ntb zj;+1FKFjI4U(>}ahqvs=@TyPCUr{kLa_jf29<(kS@GsEd20pX!Fw7b>>6=;ac6^a0 zowF)xj@8Qa6cQ<}%8{!ZEyZ$8=LIC5=-o8oH1(`j^V9Y=>@`}^EDj|h>d=z4`-=1l zn*rV4&~ovu??O-UG9K!Ss zP`QNu=yUl6t4m@W+lD6ES6f6BCz%AWe96{HMP}>VQ*)J0v7k+7Q}n;6qQ>x1irtQe zcic2D*0J?$W!xa!l*M!4XOLsm>tF3Fm*8+3habl_;UjdzY2&_m+KS~3X52HCgPLzr zX*BG(X?OFr{(5PMb#Z3-^IOY)`C{_)Qf5=pNDOz*55DSzG8=QpdR|-pw@+4fH`}cb zd6Qa{Q*cVN%d@;CRN1ogt@7=~zhkD(cMe>NjY>U}FWv~L#kh^H-vwmzwBud50$)dJ za#&%ZNLSHAm?#J(l)j{LTQkt#GVWdX2E)QdMqOU?3$OewV(>BiXFlCBS?3 zT-M^)6Fu{_CvX$3=gUi{Bz0wtJvuT(OiZ$Kzj=PFAxflGVP4s1PWR(oQ9T=Cr0(d@}P`p1RRCBbCBg{RF3^VZb&>?-TE z52HJ!q9=Y@dHYc|oHx0cc}_pf8X4+;WO=Cj?2XpD=6b94>aLxZiSU=Ng5DbL zY_dLk)i--GI?v#!>`(w%_NDEZGGOyZdg!l zzVuNXN4#{yiY$&v@oRy0e)ECYx~*1}pbKBKJUcySt86?42l+wF3s{?KEJ0ht{ub6* zA1n$JZ7#&99*Vht%l3MieZjgqq}O~{Ua{m*>u${bOOePKei17-bZf?syPFu7cI8Hw zR~6`*nFaevp;LE@)s>nu_al`DVuG+OEgXj@8PWeZr%oO$HyIj;wD7M_rFV?xH8w|2 zO;#9U^=AEU|4jKmzwsyqA_52?4Iq3xAWWfnCef*WO)Ktt7%(tQ(5SAMgT1Sny{mzm zm!p}B9;2t7ZF91M(uy!z_{9r$3X>p-j~FU;#kM20BmEaV@!CuPxAqd#nLGsWeu6hj z{ikvS4C9xq^m{*|l?H(x-xf+;G4V4!y)4bb50It4g=++lQ#l@^ z+M%=D`>yQ%G7;eM0VozIk@K>|NU4!+lZns~#GMvpc&RQKZrGGKe zqaCN_XLTGRjpuzYhLUyC7feIdNUHZ0CWEyrzS=kRe(?UDrZ}El5V#bne?K>?U^k&2 zIEEU|p)ORf(kT+3@{Ns$>&@v{?1LX5FI0D#M_kdbOWZ(!I!JiP=iv(FY2OI?*SiO# zpqhK>=kAg{x0T=m6M6a+8r_#`R+CFR}NRF8S5d{ZU*`3#Xsx;9|$bd?B+Ce~GTdHbm1{dpz+JPG|4(Un@$ z&5vQ$0mea6zJ=p+X?6Rqt;gpFp%zzZ!$9HV1V?n$bVXAZT<#(i87ItJF`0wuxQc}K zES!$VPGw8o$}7gh5HTfHeNXaRn8_}TA#RH%{?eaqJ5u`$meTl9H)s+ znd?(8beiLr{NTR*)aAVRXMrNa5ml`~r}^7B>{6kSybcN+`4dF!U4l-ntJuI4|pdg~CB(lUt^zzmc zBCsskG?mHjL;>N*%!f-`p@G!iN}oAAeU6^}Lz*3HjnF3wIH)9#RZl}N1-NGK`q^M9U| zRS5^@!>#JNuR9DC*N@K4ZLc!8Py`COnzpgkVZOACb5o@n`Dd-=ql|h8SVX{Qf4#rG zvSP{nDuY!jk_9ExAw5IiGg)xh81N?b`hJOk?t$wH>fNygiayP(huG`$z%~9q`*ORo zf_nn$i!Brw7~0?aV&v%fM_bg!?N^x5x}NBu^jtq%!HRr*ml(Vt@>6ULuP!mGb?*BI z*)LG-EZhEmz({PKa3(+$UnAdB)Ay%E5p&okMN4(;abUriveOzubM5ER{gugs4=zv@ z(B;?~!{~ZW2GifG=V2J&UDGv{Yrgw@Xf&RdD-sDr#sp#nY53meq9SEUv8a-<}# z0tr5gVfW>PC6qOyk1L?T^awrA!cG^Lf0j)6goxhpdKT$n2w@l3q-ayv{|7+Qu`>Id zdIb#kbJIy9iE2Z+X5@A3SwgRt$Y>r92jq(wb|lB%yEwZK1L?>_Bk*I|AYwjx;W zKqvQ``Y9+;KVDZUyUkcdDo`^*MV>V@4$h`bpiixv9cXd-eP&c^uGsHzqWN8DqWOQ5GagsWC>i06mbgoSfDTsp zEI2zSZ9Et)tKz;|>M!0~XE4L|)z=E2{HTN}MI2>%B8!JDaihzxPO^$x{A_L8l9w?K zif2#m?0ER6k_WL8W1P7BO!ADURCFQoaA9leMwj-qTrIC0BRokPw?V5li?xdNL6VFb zviAE?jmpi!+8He`zAN=&Y=s5EuSwc04A*-2T?Oy9&Sg82V+z5i9#iwR%UfC-3)=b` z-T7hyq;_?km#Td0ShrZ_xY5r$@1WAE<-N;e74b)SH~On^3K`pgfldw4A=4A+eZFRs z3rj*~E3mwuTBru(f=}bSh^eQy6d>M1o$A@Tb{?GB{H$BH70Ho*AX#?f6Y$adxdGcg zVut#Vcsc%>z~Q&6M5Y*xs}@S?PNk1Au;FJ<#)Gx-cv8kg-s(@-nj3uaOCZCa@KsQJ<;fft-HfMp}HhbO5`EzsF5=(MvtPC^Y&uSz5>FPV|0$EoVdX6VNldB&71RgtWBaZ-4vWQ{a^6v3U$*L7U+wr95L?2gx|QQe+F$VUH|r2HuNqfZx76^#a5+W4F=9wfDa+lGBYZojjnBaQ z=w@0FnQa{!BkH9%Y4gl1Al6WjlAbQ;o(G69Jp?c3-v=OD8%i7NeyfAhExtn~p(F#! zyAUP>?C_8XW2C}B@AR2A2)5Ma*SC!edWT?{(&4GesP(Vd;_-Q8HExT0=5x$^+@7hW zsCYfN2!Nx3ng8r_uo$OmG7kBsrd~9IsU}#lK1~iJ6!81hH{t5r!rj_Apx@aL4ZqF0 zjac1vq-API6r7;kXHqapi@;leH6EM5FylMcz=F?0s3akiWrC$iopAmkDQZTy9sLU& zp;57jU8-KNWe|F7O(whad7Wwt3jsImLCt+U7x<@!OSY(wc}qpUtKw-=O)xS>fezbO zJ9>BYQj4G0n*EX%BnuWrnYiUBowoAf%CVp#HXq*Oem_d|#KnS*=+Z{*I+BJ#Qv)(a z7BR3^lN%$iR`>}8uMFwcFV1RpL6TP7M~bssUZc!IczuOM{@%A4v0Q^6D=DiWJd%fd z7Fx?qrX~cz{eEYW%6#y^m0c|UYH>*GlN;$}1flFBqSl^-Wp@ovLpQ3Sd&vZbAvF7p znsR3`ls($)9gc1mg42nU28l@;k*H>HRU8Z7=GUeZX;%~nC^x0Ow_g~<-a-7j7pSOk zcw`=xSU47Vlt^Ebqb&t;DujF8CooiTjYc4M*wSW|sQ^^WYGV?5W+_ji#A1y-A#*;6 zArkAmn$QmBU#wL%cz^iUm?6G5?3d$*sLf8pZk*kr>LCAYc6m=v9p+)NE8oY12>n&N zZtXVdG-B!sZR}&zebE>r4gZuMR2Uby@6LrN(2RxlbG+ukMy@}BU8+3<6VAaf@Mzxs z3Mb? z`nKyG)I#GIho{=uZ?8%}_YD4dv9%Ax$6y7m!sDP-m;kg2|8cSXa}W9VOYWch$*&?x zAFJ2J_!jiB`?4#eG7EpH5O4b(phDrWPyc65o&|Nd77uycnx##rDrLvoyyi;9R z=X*!jBpb{f1A&S)8>VcA!Eb?@deYY@tuA6yO#;tb*U#C##4oDw^Q=DEvg$zgtHLuv zeLKZa0|I$-zP)fec1>*2p`>bJcmtOXgtqw4#5ALb+#FZ$&JS0UHEq}3RW_SKYZ)&6 zn~IGZ!2{?}^-J@HdSo0FhT9hhKL8J=1OYFw`UvJt^8{igdqK|%pJFi#pfv8YAP3nV z46VNJm@k-siF@>lrHZi@*JY`0L0*@#MT>A#G-RlT+JwEIwBfeC{rPpsTM^#X2NptT zM=++8V4H_f%jV;_rqqeRz-1an5DAqXj?1qyjeWyaIqqM#KWm|QfFUzX|Ex9aTy44w z6BrnVIy4ySU$u5|^|Cc{`Ms=xzAGhdOE&oR_7)TxJnz~igo2Gu*@Gpz6Bzk)=hR=Y zDX`6+{`i4w8GmLFKtU=@tgfQ2AzP8yz&DVCqfE1>nVqC&_VSV&&}Dn!Z$KQE`EtI- zHQ0;X)2@4=@l-l_e$(mT|MJe454eBssdJu_vhrg8a-J!)IO^~UjDG_@EndWx3bmf> z=sf_}t~yiaJD(0_j{uz!5j)3TFAe;!c5fZ)VNCwNi1kxSFQ*3&B2xFZ##c^SD__P> zuA978ZMjN?h^zYhx|PCtB4ZlfZmk-1e4gf4#=Sj03%xmNISiuTckbNm13cfKRgPI2 z+((oy4*E9>5)E4TFXxQc9%&QCnXcdQ$M z?_`l-J(ojGx^vIsGM_G!0oR@xgk0md(f-a1-Y-1fWH(8bJ2Y-yJzl?7g~UbU1HzbI zdscfoTdT&ChgkybI#>OAKIvZcya2==%~eu~2Aww6m{2$T(|y~h2u z1TV}h)yG!FpVdg? zWu-q#mtm-UWvdLO-@~FKm19_xg-cZI=&cIaAYgjX_}(>&wNTZ?l357;(e^60^`#sy zP3JTH>(iQ5H($-gP|@<|Iu^DKnI$c|?Ig4!PDoiel}rQ*sQv?SC(`4j!ch8VZ57dk zU#>Kya2D%WeFkthC4pF>UaGKF9aWsFV>VVL~K)C-%{gVo+oa(}w)bl?GE}vI29E2cVX`MAU!H*Ag@?Wy9^|V4F4gKfdr1q3lZ)Q)pJQ6 z&1hY}qy9Zq4L!{y6cL4+s&<@ZcUg%8jlSPmSN~QXmP}bJs z*|QPQj;Xt3=RzQ^HMj57xjZ^<`$``pz^I$@d@;QC1aNEDvmf>mc^+JxaL>0~^1RV~ z7@5tj-SVtGnio_$*xUbBu=NnTA*9y|5GRha-)3AEPW}{k!SMy}?a2{IjKOQ#5RHDd zoRIt;16A7zQ?vf3NewD={q(_jec7m4*?qu;mPgatY0sycTqzxXrlk0_CU>t<_GN~I zX6hX$Uk~nd*aI%Ar*6gZQIqSp@uvcI%ypB8gOk^X^K~aGnRDsx1GKHe6Wx*)KIyNn z?`!Mg>D_BhanoI1LOP>k3QyI`!uM#~yXRM~ngXJ-$s7P*XRax^dtE(LEvHbvwS=9> zHh9iLjMA#(Y?T`2F{xT{gju~JXnkTkVEqoCYBt)^ z{CC_39V8zg6l7|w)}u#0G2xkb|7^q8-^JUKm?C+c^!Z6MH#`|3XkWl6?t+Ay;izXB zd?cD=-}Xa`7*}QU6C}(59?8SZ*AqEad4ml1!Bon}2MlB00p})pndF;fq(;gN4-v(C zlDH?N;nj$MjnXm`#cu+mgl@9EavjkdD(ZvP?w7MWoHQju_PrYtuP+?s9=R8{NAY_v zd$G{~_otof=kpi3x9Sx@=`3)5%W%N+#VLUI%-|t69`W`WoT!28#?cS1r^myADgH$4 zT7u-hva`)e^5fb;INs-s6onL_GbX+0cU-Ps<*Ne4&Z#gPd3u%iKq~(LU;)^UP18eV z|0mq&37Q?coLrGOEJn44+*Zby6jPk=J3)NZ7RhAwALd~eL!E^_83R>Z|Q zGQJzAPsL^7cj{-@yxTNT*8AJM%`0T;wQ42;BEUp;TUCQEP7M|HQ;$k_@N&1|dq~e1 zA`0uupRC<|DvpQEtE8w6)HO^t4Gub1kXTIT2*SrWB2HaWEiuQX9^4`jh($Ee0uaom zvI2oB!@4uGC4TzLmtCuVad9afN(&1+iiJPfa9nI=IJ-wIS{b;~P-IEP$fort$gdhT zslGhs_;IJP-(KFdp1DunJ$3Fjo6kEI2X6`W^!9L!LZ8e)QXx$-aS|chs%uS_1!H1I z(Pp5aw9uY*s9&i*3sC7|ezV~{G@6Sc;{NI6U%!4>S5hMYfKuDyEJRRrC9f#s;0;x0 zsnNxbcI=uwuhw#nadYx(*4Df1b4S;jXU)~m$-tybO~sGLVQHRIDUqoR!{pLzW+&t^ zwvO6**P-wKsRqTz{N+*UY=ALk8 ze37)vL+h@u+;cRv9+Bq;W#zs4YB}aqjf_9PklXMg;zyzQ*1>)Gik;x=Gg(pbS}=?>BLfNjl>)_z>X$pzdLWUDHLCV_@>i4zM@Y zmho%TcG=u#GIF}O4#VNcR&$wOQw+`7(WZQ@h%GkWN!djdqs1u%WZKTFPIB80u1V}W zMRPHjPo}$dzPTAzGQvOZ9U#Q^`@3Frl-eaIVrlnk9$WMw8oeDX_z{mpO@ z_bai2HL^g6xac6Y71?)2F4O7oVQRjj;be8<(~dhV?lK>QC48T1_(DOx=AE97E5Xql zgZcbw*g^WGsyPKUs@WJZ4bKR*ORkEk3DNztj7;Hczc1=HV`%NxT<_%*&4|=p1S3E0 z`NdS0iV9imh0--c3$uB20DSi!X3vl-ycc-Dt5EDri)5d(a^ zY!b}sjgI3pQOlbYL9knINvHH<6VAmRZa-Br;I$7F>{<>>(dyNFdT^+V7s8rzaL!CL zCK;xJq$rJHPAMo-6bTMb(v8Y>8aQ&)o~3vI?C%+yKOGOcWx2YJ$nT}TECQYHZ;fsi z)rAIad`B+eA`W^X$p)whEUF)HG<|nEhw><4MkdvSh--)+n|KJ6@|`ZyYdlHUmi2YO zdQFqV6Y=>v`)J!6D&{d~RQv23t0x7VY2IjQQ^#Ev1IG0N#twJ33W_6|NO{cH7{67l zHChu?ZRxFPE(yj$3YO}0)rnl5hK2K4L}$Hj=&Lj(MO*_ssK zzUWsI4dFI6n!28S@@Xd`A-8YV61cdJ==H88qGo?`Px;v`Er(*0RNEcm;{X?Z_{P zQDKMEY{!hk)^d~1mcgf*GOLDn65h zQnfC#AS*U#K7C$S#x9Cn*Wv@bKaA}*GH|)&bT3WmHl!lylsmNJ&7GD~T@MMy^xUfp zU3bmef$jpgi^ENjIEah5 z!JAW$`J$eLmRn9%^zP-sD7NfI%pb`79$?)(H_OIb*F;!ZeRL5Nw={aRsQas%sz-YGwE zWUWS2+DsnA@Sb&w!@Y!Q;?Wnz$>(aI&*w7fli38urEVyv=dwM;fJ#IjyUqA{W z^hOOaORLm2a#2~xSCRN@xz>R#VFpHM&alX7GnI@+ztqpmhaRVL_#8t&T`L7mdjlj6=4L;y_#kPtkVvh{G)vK!IKZVd9yZGE?oMJRV>G zf0Fs*g|VQcUTJNaW`jZK`lPwAz7m6NN2K@fbi+fo<3H28|CO%)SGtta@AO-=WA(Gf ze2*leghkbpT*iX+EcM1}y2luAn$0kX9A&0SQtvjk?y^+d2va`nx$zUbuae!DBa)%s z>Q70BYpZa(fH~Ik=rS&gdE4=L!9slm&-_5DIXRa|B1UzlV~Q2hWkqPfwYO;TN2V@P ztbG0X9!77ju;A*Gd<|7X@ot_ne{S2{N$MTFtK;oIBZE3K#S9jA) z8Sc7tv~sPr5J7jQRZv%sj;k9wV81HEj%C(jP7a(2>oU~a3gzcu@v%g=DIfWYOyPO} zA*T{S6MiogbbBiH6qIwhNYq$bdrm-E;cGZAEw~epuooQY76}X9Zrd+gK}lLakGF>B zi`PqkoD3>FGeYB-Wv52@-b1Rzg2miIE>@v--W-+GgUnKe0%-{>8*_0~Hxw&<{OOiX z6%Mh=m>7uT&c##`Ll#N9PdL)5&FydJO8j_XS+t|YIf-^F@&!E|@5Fl-oo2HgXBRV$ z92JR-Alzmbrs=O6?H}J;SJxD5Sl{9R{43SE3jJ*oK8i^2Z#hwze0>~LfgjvG+{d|b zg?DJyk<6~;U18jB z7r$CXAl9#@XNDMPP!$jDO+A-1mNf3O335mX!=-DA|ZSot5KdaBB*JA zp5Uv9nOj*1H^!I6401)RC@s+!JbC2!;eXgrREz$}HeFu$|HURQGJ@C@CDb05B_PG; zr#=gEBw4AZWRhE13nwjD8Z|xt5o$MkK8q1zVA{ImgP{{5clNxN0Bsdj-eFrq=G~F4 z6|MI>g0<(Iz>LsVW6UPJ__N{&Y zczoS@n6yde<$u=ZOL*?CKX$Y_pCQ*H!4OtdAbcR`ix#;ZN7}q?HmCLylc|idz$poJ z^JLBXNL1j9oGDcW-M2|Tz zQ7qI$v_lAz2G4edNr2>eL?VH45AAFsF=o*-V!=v36F2t@o0w=1U&ds5nlK*wiR|cV z%<@Q-wyF#`^&85~F@-;}~Z2n!W{hovf< zI6~P9u`o)}3$ZZL={ra(ks;%b0b^>`m8gjhVd0uqie^wp&+Vjr-!Qsqj!g zSTZd!zkEBKHy~!NAOTP}%4U1-+|*)5SEAQZpjEz;aXTBz73DA2Ao`dL7g$p`o#%qK zcJ9#{x0yH-#g!d|>-_o2Tb;lLhRSl0vmMJW(5{@TkFrI^HfhOxQ6+K0nC>P64Rd3N zRq#q-M?RdWc8J^NBZHITcGeoq`yn2{Y&Yvc)vK{;w;s>$!6BZ`ltgk$8%SMA%mYx8 zlj5IAJpY|!%ji(fl}a@wla<760iqf*wKEKH*v`@#p_;PF`u1ZdncIS)f&w((xIz-b zUFzSVxu2CR1+H~01&($={(IbpM#i{-vyPC@Ns)dB^MHNI^zceHs}W}&12n&Emmrd* zDzrxDGW9(KpB@l`AL;VPc=K2C#;9*y`@kWd`WU6GS?;9#Q97q79W}Y%N?Ju=oD^SN zYbSW<;+N~@;+IMOtq|#q`JxPRJInGZ2%eiaJ^Z6eu^z_^Ls~K?PW$T4LWj|EO>U&-(K0AAsNPidRTi=Kyo91ke#gSMD7Af_@XSO@#i$o<7DBRFp-ZM(nWb0-` z#Pjb)@R9FO#6#&61-+Y>=D6!9j0%*(|!ndY6)8NxHte*>I+A1Xjby#i6EyhD-uO|A6@bqa`@U+a(E_Y|PTz7Fc<4{9=9 z5cNtu`XAIOjOkjg{}=T=g(7IX{d~PYE^rJjr|h7PoZ^^yXKDoBWrEs|vD-|stoN3# zMbIzS<*XG<5k81Pis*r#()6CsFPsAAm=HDPk6rd;+ghRdU*xr7Ao5IJv_HsW^FicW z|EF(=X^*bw{gi)^W5dXOPwg1cZ0m?ENWdvc5){t;9_xGba{YLYJaN`vn*CG#@bqC( zLQ?~ydbAg3^pt~wn_|tP-&tAS!8f{?{d^DxS>@+ceFo8s&GDJ7dVFsqM}#SlDj)~u z$7kzWGwL+V@j7A1@G~7xN6hg zQ=+Bf%&{igPj)ng%7+2@ZZ4#sO(;LugZ5FbT?}*y$%OMtH@aPr3alFTxrlh@WoF5D zax60>*%kWSORRgGGAISq-RAa?ORdiCt!A=H=4(j`tVl96*6GGoE|MkKt0CDH@|I(t zR~I#43DFLZJeXKQEHkQru@Q=HGwk2ms4_~>6)@|v%QGCZN@SGO3EjMAt#nTOPFzc^ zn&~1f@ghr-?@^lOd|)vYPQVl}Eng{w?wjX7B0+gSuIc_H$)L3H-^o|aW>;W6k1DYe zpM|ousMyCZwVF{?m(p8z%vkUzR+_ZZc|rJ)0aIYL*R93?(611mZnd2XFE zLT{NZTx(rmjdJ8z%n;7DfxbzR4BEozUNrTE}5_Smr31UDi;h2f12c}`fYMl z`?tvpyZ=9vgH}3Ee(M~iRsfL63}%JjPsxVPF5h*R{aA8ZMWSPi+Tjj4OjTHA#GSSm zLhaiVw0fV%R8dqcT+ufp>daH6&(b?jH=^7^m1&{=_==D;K6;}X689bb9m3S-@U(T~ zsb$f3!pcj7lp%+jKL$q_h`K^N3x%nb=Wl^X@N=1hn4Hv5&z^$#^lc2}^*_MH5uk{L zcfrsg#F600`k-YBpu9u``r)mssrX{B42i_Ff{x4@ulO&Ax~mkL0=+A(J%d*K*rRbb z*QDT?uzb>?cYtY)GV8|S0%az6!k}f63bah76A2^1F@eywp?{+xS_}N(jll(>c?LCd zS#svI59lvY>I-+Pb9D9Dg`^H6UUGVd;vyIcwcO>DV;Ox2YMlGLEjYwe&M=a-teyPs+tENrnOdtgh-F`jI`yu&3chn0 zST_q7u+$_JMzECP7Dh20r1+2GJ?;Kgyfd?#VcU_M(7zPE)2B8#fHPR{+;Ep4$K(o3b zNf#B4xIB)e`LR<-wuWuldgZ7mSzX|j`p!MB&2CMC9s-TPE4so_OH(>;BPn0ux{B%c zLa=;bVK8B#y{gZ-Dirh2JKgESLDf>1;;pOo2i25si&ony9HB^hZeL!Wpa1-z=p9f!9%1Ex zinr!eu(!hLY2Z1-fn{Y%P>3rHL5PAy2(piXrI!&eD2@2DURU}BY>TIkgG;W#$yq7- z9gyZX(WRe%v(1|8zUKPoii*Rf!bC0P1k*buy$H;F#;-tlBm`@2eF6HSUQa+-o%?pd z&UbgzTD(@OiP|vpEj!e{1QGZ<;tAnCoYq3qlc;89oS`jvg&dI~v zL=z!Zu%JRtLK)t*_YUKRe=d?56N%pWl8+GlFiS^iQPYhq<9rNIX@g$<=x@iUvn^F& zG>>*8ITi5uD6H-Wwdl=@qm8DZr>=0sc*yFv{n711bzBqav~_lWVN5kPz38o@p@h<5 z6Oryq6A|sz$KX5+gHbu(>a3_j#2GH3!}75R=HdgzoUswGWZ*5pU$vxbom&X|QJ z|3KKUJ5biIY^20Ew|q9f0^iC#<2_L0#J0z0Pf(3mlkOpzc*eYvct#RhNETh~^2D>h zHo5SJYyZ&0nZeZxpY}_!jx^6*4}5b?g2)C<(n4P{ip)4$0yKgmtq{8*w6CW{%UK| zjbtbP0CphPxa`b|QG^zveXvUVP_{Yh*?n@bb`pF)iRw-JVARAiomXyhr3u=CIp%GD3eY4BdNgsaE?Ljzp7dK4}f?dS`ffTr9S~Wdoc;9xYs4P?P2d6hh9f!jrLrF zcR27(vAAci%#mjb*P z%JVc#FsM;oEk404?%DP>5x6*fhxpgNtCFwFh}?0N^3dG+w#lDB*__}0q~|L||C3&j zkZ%W^31prNb<}S~4{3>mspP-X-*Xd-25&2qzX* zv460F52JGnm_AFE&!KJBZlLeJfvHwj=dp%H)nT0v8Kt)i|2*@MIg6xjbs7g{ZGB#sy!#)eaFrWkrqr!D|~k^FhiH&o$^k=qnm^RPyN0 z+Ra#qW7$r(P`b7$2Dr^Ia=&VkM2M|z_ofL8B3F}8x* z^uAww>~fPPM{fFV!W@#d()sD=I63#~Sv><(jm;?QSV8M$l{`*ZL9A?*yom;myzlgZ zTR(%jyRq5Ociz75BPKRR^^1MXPgEFmNz%1Ed;1V4&h7dd3xPpV$w#Kh>>B&=`$;@L zg`_s3wn@oe{kHrjOata?wDqiB8XBh#<0fX8oUKFO6STE!Jk-r0ob#&a7jrq;$|hyI zEe9dA()-GWzPp;66(*$CpV%#j2WOj@LDb7tRTWm*?#4@TKKTlp<0}}OVH|OkuC9J{ zZ2!bWXVcdQ%4vYXqr3UEkrVB@WS5)Fz(1WCwPDRer)t4GoYKb+19^cYj|7w654Y*z z1w#f_O@bteQ=cCzizI0QWAb|!pkS8D?;6)X#_Da%yk^8aHIUQURdd9hP|=G-`4?4) z6^P34H&uu>Jq~2NU(fsBHv1<+fImzy#TKe(?q#p5s+QhojDFulE{IAdZEX0o*g65V zD+==4=wki@oeofdjJO=4E&!@XtakbZw0&PrXhStO-{^NJ<5VqeDeZI)cMB7DiInBQ z6s>e|?@1aiE{ac#i(Rml{*ah-phkN7)P0)1S*D`d=G9vm{`tc(ZV#&tfEh2GS70E(QXBuBB99=RIFF!TFqP(SIA<89_DAQVM)UkYRebxKCIWs^)fy8LS=d7wLWgY6tXTKmY`7o zz|eiZ84P)nG-2i-!{8`>%6=ew5OaxFhX}A0z;Dw zd6}?sP#%N`G!Rlh1dYC~H_>pff-_&#(96Yv(ZAA7G%QiW!Kw`G;zm+ApIA8`xUT0H zf?zcCfEneN+bkBgifooAYGlJI0v(LdAq>5E-~Gk0#u}6Mq0#WL{{6cfjgwFW3W4Mq zW$<<7s60`jJkc8k=SLXqkyqC(Y4-L?LfTz>YDjDZzjyGoXlg&8Xg!0twobpA>{V#w z4~!vlKhdPM>QJdsTit#MN4R7phQbD|tU8CQKe)=!v`l>Z1dBu4MSI-K5<2s*6$qfP z;b;#bu&W}g5DzLe7HKDZh3bYKnib2G8=m-)u{U+6;c0InXq)P?+Z>ep-6_{&^tjAg zxXb@iq2~$rrwXW?3rC&U250q38*Ki7?^c`NyVPCFE^7z(qfpVeHS@>%C{2|YzLxZy z)1Z2NTnihP2yhdVO%n{PcLTX|V^S?{V)2IGsS>QE5MfC3YuR6@hIa%18$# zh-OQR*tIJANcR&@Qr*;F`guUm%)L$Ek%obujdSATu(a0T#bV)kYIT_uhAexI=o?O6 z4{(0@7f|&dKnVW^Y6Jo0znYu_#1H^Njgrzvr-}@^yGEUXe<18=3xKXIK1P|+-~bKm zFQVTR$JkW@FG9(Sgr)N}it?H?%LIy)-<)P1NyC2oC}Kv)I!f!t4_n?Duq^t~H@Hps zUhC=T(3@ztCK5}QQ?%p@gfAI6DYPhp+Xv|8YSA-LUOe|a>YomveYl-UPHTtLGC#&h zB7iEf&Kvei>2&_a(MY05xdD|m7Hji~1fB*>r3{m%>eQ&(9)T3 zgMV!3IAj@niAge4<@+SNrGHr^Ze_7L#T_wxz}Xc`{|_?N;aH50kNBphm~?tz(~K1`%Pg$jOdT=T&3SV2v%0k>1EU;9?R0U zg5K5_&JDyEi<3P2sdUh8L*xB}K)bDxQpWZvqd;?y*J-(iJbiU+HkFV)nKLO4=Yim(2 zE^_Zcukf#TuL$rn7~pjFc2^O#WUlSczB-_0rg~&5_-DGqSs!^+f$JZY?z`WZrkg{XR$ z{Jnf$IIV8n>J9up3Dn*Nl-L1N&HV!c-dgkmJgj$C#{G(VfsE}z7ybcWf(BjX9e19E zrEffM#^WwYqq(7(5v{3v-T`gh9xHn;&V!kNb1Q)N8Q1bcVv^E($qtD=DVwQNCJxC+^L6Uz(!EnLO=D9d3N7v@iXVPcm!|{54KidL6hN zY8MwYtgBzzUY)Jzv|+@5@y33<`{{1Yb+cu4l_8&(8gb$O+_i3tOPrg;C+q#pD6#tz z@V2<7*T64ePc*PsI=ph@pGw3-u=3gG=pyTRa&S~*|)?S-Pc zw+cf{Xkwvr-p|>*ed)}ibN+)5D=(qkv`{fFYXGor#R3Jka&-QsGYhb0@J+p!DQ9DY zZ)&m}SaZ93WBda2b#8yX+aPYZPv^O1VFAAS*T38ip8guRzhGT+d?Z`^7!{L%BbJy@r5g$hz99+tNs4l zhr6vdTnb!zA<$FIhX&v0Iwi%fULh?dl-H+GgOqFrOVt@uFNW_>dBQ-5iX~5s^7_`ew$)zvU0#3z_o&8?tE!okqG_KQAJ<`&!g`d< z+CqFTPHAo`tt=b6?^w!)>kN*PLN!MrP+@*IK;H(TEoNWV+826yLN{%yt zmv>In9nXh#l_9}O}CgVuVdWm7_X7g^4fKBLhs+ z(Gh#I`#`7m?k6|ksD8zlTrLmdID#_&K6vlF()okZ7rlW9VlVnf3dnx5-O_n`1Bd+G z{5E<=~{V9Rr{x80%197SjqpI~uC~Cw?w@hV8&x#;Il6`|$cD=?!BYrS>=d)c9B9r+ zRw1~n(Cxt|AG9GBB3lBz<%V>@B18rl0tfN7aQcj59T7J#zn8awp{S~XCTMae z6A7S+I>|O@$A3c*EWovby=36`Qf&VY`YZc)kmsZwI6+C-Zc&s|GSEJ z9IeofX$8L^xaEJ^gA>45wSu99go400{-O0>l72xPGoU+w!r(z+HvbCibFfF;WR__D zjk5;9jWqKc=kEc3^IRv{1U;wwVI35P0SfzV4aLT}Qsxom?&dn~DQu5)PnYt!Rwk(n z%-?q|fI*0%bsPXWdcuIEF+ELdn6iz0YuE^M9=(?d)D0$J@u`fTuAJN|E)#MnPkJ#JG{yuPz8G|n)uY?#}K0c+T7I`*vV6b8K_RQ9ZiDg;?q_j*9#G;0c5~ zbM`rUS!5UfovAn0RD8OvGF$utLseFl# zHm!PXXYAerT*vzzeLI+c90%?$-#%N_PkwSHjJyd`Z9nJEn({?Ye+HPf`@ZnB_j-hx z2vskP>)mYv2EWCRD`ncv_^s~-PR)9JP31Uu1nk@8`ilc+FQ!xZ+E+!-$-b{HR&S*G zt#mvn4YyVweVcL6b?NQ$U&$bv3RuaAGdYj>GdD(oXuMzrO3JXR2i|-Ws;sQ59 zVcVN~?cFTT`-2!$!Q5;tMN`3?Y*U|;IWYu7_lo{gOQ=RNsQu?h3$QVzTv(T2-zCxn zpDWXMp=KmZK}`lIR=GGc_8dieu?g-^P*$Z52_-?Ln#f8@Wc&i8p87IY|BtY@0FGnH zx`f5d%*@PeF++=)nOU+$7Be$5Gcz;GVzQW77Tb~q+kS6mcHZ~z?jIeh4n=gId(O$K z?3-DYc`ZpUz9X)yUn8^c3RH&POOt0yeGO}}6wG$5cN5?ij9etR^dMLSTdsqutV1pL z-3j1hBQD>xwXGr{(hTybMjbxr4O;tw65b>7QU|mDbsgUL0~5CRdHzqY!l< zsh8ZtPB3seBvfQiT+#|>PSjHe@u)(QM)5b&QN**-N+*u3#G4pc?T8o~5@tY#YHY<+ z*;5#T>h!ZxVoW7mx0v&dhS0pA!hqT>M;L~}V?$}zko)pI;Pd{Kkl;yv*6Z5=Uyi4t z;y~{~lE8bzIHJ4Dy3h0LYn+5&){*bM&fDU_WaDeIkO3Egd?nLs`}z-VW&%bBWXGo^ zj*M9K+g`;nkKGPm@8|Z-T$6+n=OHnN)Wik>Du)6h$&8xAlNsIFF22Riz3N83VbGYsVwY0W(2qre`?4>S@3K9`T?eVTy~(aP-%R`NzpZ}p^k(qg zMe@KMgvGd6YVgz*$n%J~bBk(HOGvap>|DDZNieb8J?UF?pQ! z_;In8k`Xnde`{sPZUp*;hc2M(AZ2jzwz%Fm~5KMv4&D zcAeGh72=;Ko!<3maC1C(R41JMx>+;oOKnPaaVZp+IUUF}r@Fp9>MR?1an%`ry!iqy zPkEJ+SaYlu(#?5kh^OJ$`CPwX-kDrzw`OgvTmKYSlx?x&xHq@Hw`pMUbnL& zw5HDHNq4h_$54;Oj;_k%w`aTqi(aKNuHtVN{4F-;hJ4G-6U<8$(33mDb?>IP^;^kN z+stQYXt4Wts)T{c3WnF%#9unxIe4Akoxca1vG+QBo@f>NW~Z8>i4?Z(TH9CTu8sUW`h41Pvu&-5P!x z9LV&0dp6}IY6>%D#{<#`2L?AM*cklA*|RJQJW@1<#N?#_evje=M-oiL=H=BjEc9@Z7AouNz(v;s`#c`dFgk;4wwdj-pG7VrNTsW08@zf{X?>hs zs$7$@%!m8^2wD8A7%ibU6zSA2Oe&uc6bYt32xt66%^)Fz%@W0$MvHV|0 zN8bXco_7_%a;IZ8TKfqIZvE_gp@1XJuw}!vZ-iKB=|dqA>)+ws8MKoUVMlJiBae|4 zFM0J@I3pqU*)iAn=-7Yx4I%k618H17agc*I%J}(m-N&_*#WwQ3R2bG#X0L0XgI_=w zDH}QD_z9WYXc=xbtL96OS1)fJ;v8gm^0u%sHDljlTcR9NKi%i6+g)bQ%bYn4$o><^ z8|c@Tu)ZO%bb+}~pbW!GJMHsvu;4iFt((_BWR&!vn^XOr^w@Hl7;g4FJJzr|eC|%l zmhVM&Wj^%d35k)8*R{PaX-K=AjmJ{@H;hqy*X%w9Nlt-tN;^jz>oNniKaH^bACiv@0@WvH#4s(BJO*wf&K1GAa6ARmPAvy^PlhP2F!XU~rK8gUcfW!$DH)#Qs2ZcDH?I6`ZTNIUo2@v<*Q2su^V zXIH7cZ7n_^w;1Z(Z8P?`&kH@cBin8Qraz!o^8p{5Srh#~Edbhm z-zVE119pNx|Nq$mIHK1quaDLl>;Z+^7$w=vGPpnIiSDjXudf2O#5S-c zRIXl|7!Ac8?Lnx_( zq$%xaz*mJe>dzh^FuEY5S6;Vf)quJpc}+w^25;?yiQ*CJT+)h4b7~?s;~{F>1#0?T z#=IVM18sDJ44+ghLf(A*Z)^%Zr-S@W3;aU_EN)26S0$i0jNoF+x>fT1IgcwejgD#F zw)aNB@h%4H&C02(X_TUn7@Ol1;1k7DXxmD0n?}E`ATDip3spDgbMY#liVk-+N0!%o zNSj4TTPUy5DS_37E;x%0RA^+Ty!d{jDzfL~UJd72 z4dzYiYz>@a4XmSrU|3r$!WQBQNST4#UXAuyjWH4qAN2R96_DuEfP(WH^-`%>FI$X` z`Mnx*W?KQ*j$aS#6wIBMOJd*MJ!}=cY$3akR7VDNZ3lIe6lH43_dJGZXDYDuJ#00+ zY}3I*zXi1RaDo(>8gJ^H;hY8)I}zU4#`B+NV^QmAnLK>dc8no&{nXuZ zX&!Tuf}Q6ZYF3k$<0T!5>{uh?SmWR3LuLr1=wW*Yd*EIiv-K>Av)akr;;SQGW|*9Hu3Se9 z_r)8Nz4y<0owrP+?ccxCM*O%oh%I1I7Dm%x^|P-uqAn*+-55T7BpZys9t}C<@e))M zX^xOuZyKMg_?{N5!xm~sVM@^!JSC~=7H~e;?-x!&9@j{wjKrRaSKhAOj<3C3>57~_ zl48W25s_R64@%#Qy!S8Ha7t^h0`FiWDYct7PSDV149>6yr9-cAFr+0)+o}NEFb2 zhrC#`1cr3K{3|5iQ|MBI<-bB^L!pofhxa2X^yjKfmXLzE+R)l!;%v}9#-NBqsirfF z8_i;jA~+aBk&7GMk_REJb)~w|iGvtfL51dfol7b60M%BOIb2(P5rq)W$I4u!32Oj5 zv=4;=arrG$%P+or8w%6-@<|k8G#KVrRL|73Q^Jb*ahvpSv?kqR&kB@)cF=*5phXZ- z2pH700$8V6oGp5dt4wt~G8Fa#*YyQDmb*H87*sbixcd;8%Z$PF0meTbF!-9yfqJhd z0zzOw5N7&OR27GKvc;34YO1}?k77Sli-SPe6!6Bl;LrK*jZ!ZJg;&E!|<_%@Y820BplZF2=Yv1hQs%&+g5YEZ zke282%`<$XHtWlr1fC85-89QO5Fj0oB&&pEH=be;f}#*qp)mWmOFw1jj<8=LZgH6w zI;UT;tPM>*M{zJ?lA&;uw0k(={ZBr!Vh|=&ZcMs5kdp5$h@^qB^2WU=0iRTH*xN&J zqfr7z;t=?Ze&{Szpn&=p;NqB^TN6kc@U;JFFyH@kg8>N1evREZKR(?7=Y7*RUx8Tx zkAdr#&eLnM0D1`M#uNk;g8XM+qxg6FeVe{la#i2A_Ebpc0ps{^s7d@JIK;`UqzLp$ z-K5`7!jnXUAjct~bOJwy0*3ta22hFL>5+bqjSQvDZp`QohTW%bcsD7!@NslEbM&@z z^mI#4hK0z%l;7{(7zXeW|1iuT$HzotQBNL=zI=^dq8)D|i&(@@QDf{#>v$%48`C>( z9E-*rkYA(IITI6&Dao0rIUde4csg)9o~?TwsN!CzB8F}n=q`3Cf$B4u$ez}yGgC6D z(rlzY+M3b{_Cx;YMg1`FmF4IiZ?2!_=$pe4+wS&YL#U05M@$%4bR1nA+q|6F3?AF* z93$>DCXk?nH+Z7)@xAv#HSqgB6q~-9_a=>O(8p@VIit@=TE9y{ zkhS|I`&8WRs~vUoviHcn|2U->F^Zjmm6~ykjc0ng;lqOGVOO`KY-w}z!rSdTJ+nQn z;PF=D{idAlQ4N>~O#1{^x6^G9yqfn^Pk;Pq`RTLA+o;D|0DqpM?73Wf3V!lZX`LCs zVoH;ZreTkMJ?ZYnoN@3XY;`Amm05@q&(KUq%OcazD!RaNkpxCk^MpojN6)Xp`Ql`v!^|-{*Cv`U=;JQ$@dk7)J*y`p>*i(S=SxkU zr~P27v7fSeRL^_jLXWwDkOZiq$J_dp72GdC&_KIc&>cNDO@)HtM7G*jB+MYTBR;ju z2~5z`=KqVHbEG49gMokQarDELBX32#xS-HEeY`inEt1|&d)(hUgy(%8YDA<})U^4c z^6m4k$Y2n~BT2`tx0s-|91Z9oJV>Q>jWT!*YWN9>&+mg3A6oz9pspj$Y;tKi>};$} zOmu9S6=_h-7$#pr5H&&*FQIRV+kR!g`Ll~Z=vFQ+GnU}>O8cx8T3OyZR;HE)@!tP}xZobLjhnye?Tz3V#mT~ovNU0r zqD+&duXntPTvnU=?aVv4vZp%#dV0 zx`tqz`hTh%hMk`_NEhV@va&e2TLqPN7ag4EUqG3eMolBhTn0xl!z;v9GD@}@)ihi$ zsIEhpXYj@qXa9ro(lHpt+E0Yp7-D6?EKiy#!qMdVd;wC-N4j@PXdP~G94?14a}El{ zhB%Kv!y?OZK(BbMlu8|D1r$;E7skItoC}L_{1E|U_5T(TGc4XD*JJRv0M>lic?9}y zB8!b8wsVx3%MjW>v_3QW5&El9QxNo!BhF6~I0JKxPyNEBq?w?Qk`WfY=~kamG ztGG0THgFS$Ra-jYZ3kg`uU9IoVHn2bE3JW8E$q?(1RC&JC?Y0pT(|TX7E_;2v zZ^z7h5pVev50-aC2kCd%lI*@K;slSKDd`07&ZGF%Ni|&6Yhd4wm;#%k4K+XDn2wF$ z+PBq?Lc|?3Sn|~2G}IY`72Kc|%^)VO7~ok2-*HUKEP<)1v^^M7s=O_VS+m)=kD_Go z?McK_#1kJBkEV`naqrr4N)jy@Ti7c2PSSZ0Wqp14PU@#s5=A?r^9@NGQT_fD(Eiojth)K$v~oXXlph#hJuFKe-7d3v51C%^|`>7{%HB9 z^m%eyXSi(cor;)>bNpNaGF1HB@SSJpKakBh1=>?{IwHZ{wo%P@FTvg3#N}YR+{Ze#Yfy76cSq9+IU*t)crn1oLl} z;hZS8(sv3`u7qNZ+Eag9RB@KFiE%1iWVwFIiRn}spAchXf4CrvqY#2sN+hMdx zT@m^&rj>9+++n*V6t5%W9IGk#a z+Z^c%OK)F8>62#JNG4V&s0ku;unw9n58W_tGUh(Xif}b)1%Uc?q7Ap@huH+D?1f16 zp176gH@pHe1C6=K+9*yFF(_d07%Zbl)1?_KTQyl0lgczCzG1vjStEtzNrx!T8D#Kb zJS3PFiWR3NE_WK+EK#6oVk(;I*%vq}E$XRyMKlbYDr%eQev|Py`di zVLZnE^ReGjP#oX{rTpAby}bikg#y;3VD=93~OZDT_a zodt6dEf4meOs(%yojFhiQcrDKCZ$-Fl}1%ndgPy|#jyWGB|!?MBwnhgN=H=tmj!zD zKnoJ!vHn;v(-oQYO!BZ6wOj6mn(U>iJd(ZC6-ftWzOAm-K>f3}uv23G3nt}Xd=DW1 zAa8X=-nk;E%}ZDE;D;22zlB2E2CfTZatbE$saJm!Pb-VW$T$_oY)YFybj*SZa-JYN z)dW%o&IM1)n0o*zClyNl56Wa<|D?>5K5JoTFp_D+r6ds(N1=$xD`;8pE7E*{k6k?0 z#FUpwia0wHi@N#MeyU#ceTA}|&hJ2_Tdmns1Z{Fz0LK#AGDOs><^vk>y483^}P!fD%AvqN~`ct;oj5C7CX*U!blBHzor5Ng= zd=BsAN;UF@PBjwIq7c=P{H^x+%E!j zFx;ZT7Y#oU_n>wY>M#h!ixRn~?Nb~sCxm3_ww;okJ)cAKepG=sg(zRf1P|RuBASnC zN%O=zP2Q&?NrKmzDLcbOi;ykX2w$jV*h79#lQE;8>(<-WF$OQRws}Sn9y}A*&_Son z_CO0!9^W#IcOu!-3*B}aBS1rn2ZJs9vFkiYg7E9v9}+G0ljnd8C4mDQ6Z789{Z}yi zhUdRxCQNJBpgVZq?VKuWwG3k-3j{Ne+L;8@Z;PfHghDNgMZ z?cKpcvp->bFKm4=yvg^R)U=#w=&OYX1A0YM-$otzGw{}Hmv9_LrnV_0S$>F`_7E^&qVHe)MR-- z4u#7eOfypoyj1?z3_TvOnW>{eR4$TC_^??Jifn+ICcE-r#Nb+7+2$K0(5!Ffr-)ys zN%uf#L5p_nH0)ur+ zROUfK#(Yn$j2AYahL$^-mJ>n~`wB!wjVpU9%89+AHXnU@L+G>?-2B1{?Z&WMB?#Z% zpi$smiMQbytu&;Wy8X?0I9C;VK_Z!5+_T@w1jHwQ>YxYWvL^d{BxE)GSeqUTgtGPT z7HOjXdk?Xv@#J$(zETe_jbvm<;XEY!{^4S6WJm@0qm&hEsIDOVV$%b{HtOXYIeAUt zd?e>{`e|^kSk7TsRBC?vNH0HAa|baJpygO1*T88)KEuq8iV)m&ZG{VhtcDAwv-T|= zAjb;7?x_sp-a-!(qxj`>1pW#Fy$z#5+KPW8C9EnJ*hk*A3>RvauRR8Cr)Bzk2T3Td zav_|>64kFiyT$Sw*}MO*AVeOCSI}GNMb^k)z$ovVYq!9M-qU`Vb!>$T0RsvB9Y`P| z_1i79nVu-{vtj(h#GQD4H$^nln)*D4_X#3MDlLLCyD9@QZh#@3P0pD%A_~!(gpm#7S)f%)VE@cSRmk+7#ENK zpKt#8^jyg=F`Td@t3kc;PottSzrcUt?jtiMRRM2<$VU2kz6w;zk`HDB>ASAv^8OW% zYZv2>7xH5?`^fni-$@CE2UrN4fS&!A6!L&yLok0kCj44Zbqh_MIr5zBaT3D!p$_r4 zV>lwU-9}t_O+rkBxA*0AF)*%Jx6sqt-p34oh^_gn{*lB&SOk=`&n#oB(?i~J*aHgj|vI)#eZ>U+@%>(ZXH&B+Mi%C(yR`x zLeRvmZDeq&-U=g}jrscIo#vE^d$4A0x&A11o16myaqTj@dBjx@64oIH4x6@~8=&Tv zNeC!g4_E|VQe0)8f;fl*t&5jtVdwz%Yzv;5uWJ!s$8HJQ%q095H<`;k&pUcAZi7Gp zu3vJFfiPu%V9Nf6(Fc@0t4tC8`JQg}u{8=r5v}WX$NF9{5E>WYO8nV``7R8pYp`MJ zd&C~cmo;mm->PFd_y1?GPk=Jsi}^w8&_%nD#c5R~?0?Z&kJwrxj9vTT<5``918@z? zIaa<@7-RQk>9@YFUEGAi+U59SO|r+a!5$w8aFr>z$1lssALsoe1<`MDS{V}QKR`JH zR{L>s8W21)Z!Vczd&5~!uSP(T`ehx63_!F6;X&(GOA^eS{V)(yCR%$y!9``DLvCgEZh`cexD9lw=>#0Fq}$A5WTR<_V#= ze9=b>TBwImo)VsR!&1=D?Mtuhmg%?B9jWxRiX7(OaMI+SOA}y2l2wdDbm2nJm9<)d`#$uLG zJDNLtv(UIj8SK{bSA=lgtj&b|GfOcenWEOO-=Fq`00u|D#gQqPykUH3GT#q;NZ?9* zd*o@Ef4&~MINvd`eAM{ZziMDeB%Dv0NH_;K_T#7}4u*VEm4SQgb*WK3r69YKqoE|5 z5u-oL&_EjnOarK)T7n%l#25o~O!9R0r+c2$bLO$VBCL@Zc967j76t|V1UCy!7NCyv zGsZ<~a}>YnfP_0NQ>0Iypw6`R)j>HihE$FBZ8rZ+A{Xk^5B^BZA$mUY# zrA4qd?%^eNQ0BD;cU1x$lB`ZXEkFso(NoXUq(fXnh%X96ZyM0dOfFE;U^tE9S}Q2~U*3!*N0OgFG+p zNK!bV;P~oDfD2i5wiC1LlaekCwZ1Yya^B+*Yp1Q2CLHqU(B-?@Q87z#$k%tDc8Q~6 z#Ay-UgdjbBNa#OS98o-wbMICNSjD-pAp`sP4{5SV*ceGA6_o~g^8E5@n3#O00%yE{ zaD0YZ)NmRUHoO4fO&H+eeh$W~6qQGrpef|;wJUiJCYTfKT4I6-4LT;D9xs6V3wU3( zM;#R(Qq1n_oDN6eU_92-^ZgzcXCM~~bc`^PON||~J&1UlBum~bv@vUz6z@=@2szBY z5l&+X!3z1_6&(9^Elow0c4@X!@@lY)O8XqoQL&N^xfI^(<{eYYJh|Z&+v-a>FkO^H z2BwQ_UN9WUN5zJdG8m)2)7z`}E&2J`F!a=$QHh_HuqI_RlhG|NXGbq#aGPKVl9Q&a zRa27w#fgjEtJ$BIhT0|Gmbyp^9%Mdueh7IeJcm7Iq~1;mU{U0R(E!P`LN3C~D#lAo zTdM=4w4@dCHzjd#0u)VvxEn6+k$CY}Sm42vhWbqfuutnWE*%0sJ#PzVBn$Sr8$pR}S3Fyp`_mIuj9}uqEkFkAsa3tSc5EJ%0{L_)pSifrHjuE;a0dbf!}pK>l6FeRUe1k zFWMfV1V_Cxg_=GfSG^WzO7|-Xts*SD&UBsd>} zp(syL7GgP`u$Z+VMp8ynCMzida1VYwa8gN>32Q0P5&=jV>m`c&KTFRfE_SMQX}icK zSYjVGJCTlC6W`|sW=D45ntow1jitDsviyrwTp}hj-+gl zA<`u6Z$s7*{_sfwBK&1YX(~97kIZUTF#BhLn2fnKaTgt6JMs3&&?C9Pxm20~U6Q(+ zV7Zf(&1vAmz#q51lU_^=beop4@y~w$+T~SES`sF=)KL!4l-U;_AJs^edA<3K8iDM} z<*hqydVU^+J+*-Um6(|j*w@-c%7-s8s0d`_J?dNOEErtp(nYIlE9TJn3HM_mOmW&e zDV_m3`EZLrcr0Nhd{wr@l{dgE=ET1h1I|j4WpeuQ;jsg`pF+50spsl*c%hP>1xzgG zp}ugmqt}?n{drPYfF1F*A*FX=hUBjqk|yT$eo{_BG|;H(sorGs1dR2xelj42#8Zfb ztYv3?j)Y{95n6NyF#*FuJfJyM0`O_|%X)NHMorZ$Lt_jOhiu4$E4QJK$b1G9U#oaj zip(g%-8!+6A(2c0P0MUgw0^v|UivHNgDQngMggC{#T_hS93QvYX{4})>*Y^Y`J{r!RXy7Giu+nfNgb_KcGFM zsNYtgvIA~D0Mhss#QuRh5ureFhkPS)BGn@O>Jszr`CDJF>+qN&b2j4Ve8dzJK{)2X zc@S}0dOEKA`PDMJS7Q#A)bL0aS|dR|f84K~?^g$coJNc`8K7P$R7XcX?JXK5NfESd z{Sh7Ua1epE6}%0sw)d$Q0vzadW<_gzQ6W{GiQ5w*lox>gNE4<+b6J!FWOe zn>FP%p@oGNJRK>-l&uP`__teOq{db$>FropKCEEV2FLQgM9g+D8->}vxB^AAjVAy} zgldg7q+;zv$0wHY`q6CPAbjPm)lrp)HH3a8P@xK!os%jOV;SPV7|H zN)0vJiXY-3NzYXSRAZ}A^o?xqHxjZJ>Os3)xih7N4FuoG2RHzOhZKnr6K>9Mu*^Z) zKHhKO9;(urypNf4bHQ>53Z%7;Ixm=iH^48urk4Un2 ze&wvaw9FRsbGvIXlfN!_sT%V`ai7{$_^DuDVmPq{#dT$|jRraOd54_Xoc@#ugng~# z#-w2eQ;-_G$ZBK)-oOTBCi@aW4F-kcoN`6nytmFxpT4IfAxp{^ag_TJXjfkK@%KUHU6}&=MmAJ_^2l5 z)DFQyIa`22XR>kwYs}i$S{kB-nAy1O$XB6JBUmhJn=hW#DnKOOtco9}80EL-{;4p=nm-;P-TtMsZF zo7nwTyL2mcz~MU!!RU{#ku+)9vUW)=7^rlkq0LiB_}(Ds1oc=O30J}$3%|ed+r_b5 z)ys4}*vETa(N9x=2EyRFiAjT#@_x)dn|wUH5$d#kVQ8chr=x5C(U0fEFbK|C>~qLZ z{L(WOYDydPj3O0?zLSA^wAsq6vuHunu94c2Q$+>u<6Z5(>86hv4LejxWil9p12<96 zmF9BD#vnv>YfhpS%}F>Hex4Bp(%$Hb?-?EiBZe2RNhXc6$JeuyLctarSj8H%JZbM0 zzM(d5%pf70%%*Fhd_2Z-WQ%`#g~N&`T_9d=Yr*~;sY36{4{;VQ(K-@%oP~;whD^4< zF*@#GsHO4kTP7E;Md`cmC={7&iCQ!(cBF8TqSN6<{-J5t#rgn_oWz3{7o8QU1xIEL zMYfePifLugt!$Q&Oj;GgX=H44W;y6u^}si}xwgH}kXO?cj+%i>WexfPA-rcnx(=hI z2&#E(h1G3lHa+MF_ix?t&jtZ}9-spEVit$$H(+JEkKZq0+`W1b>uM|XxCPpWE=nUc zyI2Sl(<~GS}8?D zYZHrW7T2|-Q6@Q~=taD8?)Nv|svm;8Y#pkv5r)59c*}GkY(+XX#Gze}q)~P`9#QyC z2|Y&kEa1~;sB*J>5k>_1JKS0R?MdfI#S@WarxfXF?5f2(=zU}bZx`$ZiDF6Ws)csn zcC}8w9j*=r+Y?5*eYUQ@^#~4Ck2a;9F^sfN9Q<&q1J#@e<1Qga4k-vaafQ6Zrel-A z6XiG7O+ncAm92T04HyKB689@G5Z8C21mfOZ8wP?F*o~H|)EBy__vTtlBG`9`3}4jcz1{AwUpNYW z+=o$1v?D@=$!vmnhX?VWf?O&o(it{W;x$qhfMf4dxD64zo{B*-1wIB>4P48U21xiL zEf6~8O_=D0x3hYd)3#hRMo5pPhPEGh;h99KEEH6)fwU##$ZROQ-hy?kQNXSRIQ1!2 zJJ1DtB?%wnF>MxKkSV0tlNG_mCd5G~xw;`2kiCLObAH$vXDI`uzy$j9!JQjOp*?7( zPqWsNVso)3nZ#t9PFKFp6%;!SDKyuKnRSY)w8N5RVmtIl_!~56%#q8C+tX}{N_;Mx zULhC@K`3PrL4F64&1FjwD;a ztoL_MMgAYwmgQxP)v{GT%)C}3ul?@3w&vYub=@}bjYU6?BV9f+cL&|l(h7BnnRrqI8_>LAuxB zuPehd)R^Ky`J9)-48>GwT7@P&Yw9L6w8L^11;}TC>{*v+O_yvU)Xq7PG3~T^-;&R* zMdd6NX>&iD3v0S*z7GaEmTyI_aAUdc2L-jZY?pH}7!Bpy#~cz$`96Ym5Np<;CN}J} z+6^qs+gw79{Y-H9?e1Dw23}*QlWO~vU5mZ|6&4zPbOL41ibJ%OMbguRuEm|hl&;H-3}wST2b_fhy%oH~{al%kFMSGJ+x@-Y zETCeRO`ZAX7%bKIhhw?xo~Se!_BS{Hec!gY+j?~Pk)roN{@uLOq-0g2{uSYOrn1KD zjq3+{?kd7@DNbrr7+nJZ4n!qSGvj2ioOu_#p54!k?!|PqWDStUbXsr_(YG5RvELP= zylK??vvzWa>hhPntVUW~#(QkRts~Rh9}cUs%O3eK@N0HS@*&gTMYU;I(k;WEP;tvm zw$&UO{rMdM>z!45u3%i8)vGIe?H_$TXe&EM6-ufFzKLPGUi0d7&M3MdCUeMHCLGx+kl84Wo)$Jer;?%C^8VbhtIi1~qYrNhC0#;z{@N|AYFg@xY5?dNo|5K_e>P9& z14Sg3h2@@y0Z8wkmX}d&QB3Tdbh>-i|h>4LE7aBh; zR$LyLok%yX&9U0r^RD2A?Fr+uYo?jTYP)3|RnS==?NBfqQA1^r$C}YlAQ0t9m@tR7 z8g?kAHP3_PrD`58Qkvcb`_Lh!-B@3a+`Q&Z7?d0o7}fKC>}4>(y2SCk)#?9&{%0(94Z2K>Q;0%ou1z;{!3S2ss{Eqgm=Yd3TIzkc>=q8s1_Suj9;`btbBPL%S6V=Stm zP@@*W)ylklhf3^DEH?Sr4)5Fb_~JBnY_ISRM$EwnXp@j}b%OK0k^~U{ygwGcM+vzI z5rNLGMX^$~l+t!@_kbi-1jnYEn7}yDW5rs6&GS zO!cN};lPt(KDMUpTldo-#8EVo_R|(jd*#=;KjsmsYqe$9pRcUTXQ=s*P2DUEhL|PR;+ZtYi-YF-& z-CL*%j?1=9`0>-#-VeE|U4$xbzPz+D#sAdl!I|saa?Q^ng{0NFnO>6R9JKi5y6Dq@ zO^2D)ZG^Z88EywlnPbfaBMO6hI!WqOFs~MqX|8+YwTb0iR_FV|b029JwJL!m`Mt4S zF%A*zc7tJxhS zQ`(Vlh{sgEdn@p9nvQ*`b-P~XClERcb}*`*qobauqJUAW;x7oz?(XB{ARp(eAIn!KO^ul#vj&1$=R4!eE!+u|1J5N!$g|W{W zm82Q)a^(}UU3(Z&vpo|2fz%y&LGfVpOJJMTo1?L868hJDD%r39F-nV{3yZJ?&dEdo zv%CL{IUL;WP0U?@voe3OyBV!5hkYKLo}Dt`1WCP?h$IdOj^;vP4gDBN@|JrLDxCWB zym3~R&1<griS64=(#rK{1cI~@#MzkG&40rh8 zL+ZZPO>*6#{dC2QCOrYU_w2~RXTn`G&4^uAS=ABt5Ibnqs{Eo?HZHh>y9M`br7cqp z=kB_2>hOx)NO-uCPE6Ynl)8;WNoOjheMl-TA`VTZ`ODE2je6w)XVv8bR&w=zD=C?b z=?j;8(WnCPaJk57Qg$LoS`#+!^Wndjw8R&Vx{(71z=EeN71=tXR+@X@<+Kj!5=+V%t zM1Mun+?zjJS@=96;Am8cgc2jAXgdn2hoYXK(p{D(ePPf--7a0?-|*U5(!>2lajN0S zjFw?U4s%0wqh8ttju$4ztGLRDLszvPsn# zS6l?skBFS|@a+g%n3Ph^iA!j0*^}sAygPb|WaQp%tlzDB6rM1TKv6YV#*F0t2JpD) zeTYR}{cbV2aQbP|Mb&T;GZIS_fRE54BV|H8dna6((zmOYR@*D%vHCCwYuhuocinR% zZy+Aib0fL~J)p0tKYAGAEx`=skb}why$8K_AOeNsGZ;d@*|~yFbN)yw3dJnl+x-u} z_Yb6I;s}#)o}>y5&X|T5tXrRacmUg}xb=$#7<)xt1EyLeFxw4r{z7%Id3-gUrC7M` z^F7A-hvJ+m)~B5rzPlEE?%y!IP*jFJ$2HDS$I2WRr^1;&pLTG&Zd=qXUtwyI(Dd0M{K|Jy8T+hw>T*Zua&Y?0%3h0?s#$Ogw0XYdT zAdg`JJ|7JjYHr!iO7nqyDiXp{&?xX2IHP1Xx10hPhTylGTg9+S!3l=nF1Oa-db97h zTxWT`PaF!z3Jr(m?$EqpSrEX@JwzUwd#4;)YMpqhWG^wf+(1s-ItKSZY{9G$CJURE zOmsT!I%Ia>8KMI@#zQ&Mbm6dh2{8SGZ1y^x&tO4%&37YSI_u1(mahc zvpU6n8>Go`8qC1J(=S<8%mgOm7sxO&XWPSHql#EjBI>xIez;W>c!8QA)of9x@|t!J zT{Wtc=Y{+DX|*c-HairOz_W>pSA;YQ^BhUkQJ>x3IIq$U6hQA7W8Sbed0H{jH&5Y9 zyRw5}bQQxN1v8iX@AR8BWPQK7ly9c zIY~I)lu(B*_Kl9yvQ`x)nQ!^m=63yM_)-5OGRMgpZLvSZ2CNK{LEdDl70@3PB0hk+>7F(ZdkUV07~ZHG7^<;4+#nGQr7U;?E~`M*Sj-w@x@CIpeiQ)ckd&WeVrCIub%wMe|Ji zdUN8OBUNu7aqa(oVDj^Ivt)Par7ONz4*7J5(S|^o+ zuYJM0?z~?`hI-4^snfL<)fY7T@B16e*I$udKdp-G^;QPP=A#C;3=Rh5NMz4QS>bWSI7PCE zH?>Xkg{19=dy}qvuRF?r`h(>AR}ZpR?uz>IYhmO{hafANmaEs_{J)W7)b>Q^x9RR} z4ey@ra7w={`nv1DlG&@YuRMKqO0)m@^4U%MoN8{ZyH)y5Iz&Nb<)qhh*1fUX75?&# z+EMjJ;C;kKwi#;|e#p8Q|9-M>W!mSQV8OaoER$_t%riXugp0+e%iE~4Nu|TLDV^=Y z`vp}l@g-XK&DB0T98%L~nem%@UL?Elwx@y(*Hs(V%;b#bo7brv$MRy<)4mShjmGm6 z_c2V^^Rx2R`V#JWzRMR~?5>D=Y0eaLPoVeTUCG|KdrcqxncncZ{C3#3^~YWYZt+XO z@<9y-HxFFCS#dtujsJs?8asc%smO+{bqa>nsX1>q-N+Zc$!E4MVcv@ekLH9fxhR~l z^uJtQ_63FQ(#@Uk*pj)rHH&%#Ba{zI&U_&!(Y^E2DusWEsS$l2tZGk9xMCvmMuJP{ zaG}n_ig^i{XP95GCp~HJJ*|6*H~z_`{`jdAx2GL`HidKA+q=qh+Rp*c_ZGh0z9cQJ zS7xeh=BFuhb}u`(r~2BPrbE?hj&7TD$*8)r)c5Tp)gM)Ne(%p~x2^A0ZqI(cuHWw7=actp zf7}0kbNX=nR~gHyHQy`lzsx^Z^ZV}izv=ZB^$#Dd-yi$yZ@>NjMWu!Zc&FH9#NJ?O zQ*BYMiV{C;7Th@T!{egT!(C!8V^_EJpO2nADe>lp%IfbEIDeiiR(^HFdF4#u^l!da z{@%e_ySE;?^*=b3xmER*72ivlJPYOYmY)T{1D8uS@Gom(lge1oh?V$n-0|&@eSkM3 zlL#|tc$|YFUGI9#+MuNqrvh83r+{N_pl$~YGyugI7^0nlgTbkJCGjDZ1*yfcpw=0> z0hMar4nKfO?*Ye=xna72bOYltpb?NZKDcR*uCb$yJ2(@lVNpw@tyIKfS#9E-Cg3|Fq*N@&FMCjkxj+DhvTZaMOtiT)u Q%2k34iNNf!xEI6&01?u*@&Et; literal 0 HcmV?d00001 diff --git a/documents/space-exploration.md b/documents/space-exploration.md new file mode 100644 index 0000000..53978c7 --- /dev/null +++ b/documents/space-exploration.md @@ -0,0 +1,18 @@ +# Space Exploration + +Space exploration is the investigation of outer space using astronomy and space technology. It began in earnest in the mid-20th century, driven by competition between the United States and the Soviet Union during the Cold War. + +The Soviet Union launched the first artificial satellite, Sputnik 1, in October 1957. This was followed by Yuri Gagarin becoming the first human in space in April 1961, completing a single orbit of Earth aboard Vostok 1. The United States responded by accelerating its own space programme under NASA. + +## Early Missions + +The Apollo programme is one of the greatest achievements in human history. On July 20, 1969, Neil Armstrong and Buzz Aldrin became the first humans to walk on the Moon during the Apollo 11 mission. Armstrong's words — "That's one small step for man, one giant leap for mankind" — were broadcast live to hundreds of millions of people around the world. Five more Moon landings followed before the programme ended in 1972. + +Unmanned missions have expanded our knowledge of the solar system enormously. The Voyager probes, launched in 1977, have now left the solar system and continue to transmit data from interstellar space. The Mars rovers — Curiosity and Perseverance — have been exploring the Martian surface, searching for signs of ancient microbial life and collecting rock samples for eventual return to Earth. + +## Modern Era + +The International Space Station has been continuously inhabited since November 2000, serving as a laboratory for scientific research in microgravity. Astronauts from many countries have lived and worked aboard it, conducting experiments in biology, physics, and medicine that are only possible in the unique environment of space. + +The 21st century has seen the rise of private spaceflight companies. SpaceX developed the reusable Falcon 9 rocket, dramatically reducing the cost of launching payloads into orbit. Blue Origin and Virgin Galactic have pursued suborbital tourism. Artemis, NASA's current lunar programme, aims to return humans to the Moon by the mid-2020s, including the first woman and first person of colour to walk on its surface. + From 75f090f30586ad9840e0e0fbbd37bc14855bb467 Mon Sep 17 00:00:00 2001 From: Buffden Date: Tue, 23 Jun 2026 17:02:40 -0500 Subject: [PATCH 3/5] document ingestion - selectively calling the right parsing method based on extension type and calling ingestion on it along with embedding the chunks and restoring the indentation as well to tabspace for my readability --- ingest.py | 63 ++++++++++++++++++++++++++------------- ingest/__init__.py | 0 ingest/docx_parser.py | 23 ++++++++++++++ ingest/markdown_parser.py | 44 +++++++++++++++++++++++++++ ingest/pdf_parser.py | 14 +++++++++ ingest/router.py | 36 ++++++++++++++++++++++ 6 files changed, 160 insertions(+), 20 deletions(-) create mode 100644 ingest/__init__.py create mode 100644 ingest/docx_parser.py create mode 100644 ingest/markdown_parser.py create mode 100644 ingest/pdf_parser.py create mode 100644 ingest/router.py diff --git a/ingest.py b/ingest.py index 1952dc0..4c62346 100644 --- a/ingest.py +++ b/ingest.py @@ -1,31 +1,54 @@ -from utils import chunk_text, load_documents -from embed import embed_chunks +import sys +from pathlib import Path import chromadb +from utils import chunk_text +from embed import embed_chunks +from ingest.router import parse, SUPPORTED_EXTENSIONS -client = chromadb.PersistentClient(path="./chroma_db") +client = chromadb.PersistentClient(path = "./chroma_db") -def main(): - collection = client.get_or_create_collection(name="documents") +def ingest_file(collection, filepath: str): + filename = Path(filepath).name + text = parse(filepath) + chunks = chunk_text(text) + embedded = embed_chunks(chunks) - for doc in load_documents("documents/"): - chunks = chunk_text(doc["text"]) - embedded = embed_chunks(chunks) + if collection.count() > 0: + collection.delete(where = {"source": filename}) - # Delete existing chunks for this source before re-ingesting - if collection.count() > 0: - collection.delete(where={"source": doc["filename"]}) + collection.upsert( + ids = [f"{filename}_{i}" for i in range(len(chunks))], + embeddings = [e["embedding"] for e in embedded], + documents = chunks, + metadatas = [{"source": filename, "chunk_index": i} for i in range(len(chunks))], + ) - collection.upsert( - ids = [f"{doc['filename']}_{i}" for i in range(len(chunks))], - embeddings = [e["embedding"] for e in embedded], - documents = chunks, - metadatas = [{"source": doc["filename"], "chunk_index": i} for i in range(len(chunks))] - ) + print(f"Ingested {len(chunks)} chunks from {filename}") - print(f"Ingested {len(chunks)} chunks from {doc['filename']}") - print(f"\nTotal vectors in collection: {collection.count()}") +def main(): + if len(sys.argv) < 2: + print("Usage: python ingest.py ") + sys.exit(1) + + target = Path(sys.argv[1]) + collection = client.get_or_create_collection(name = "documents") + + if target.is_file(): + ingest_file(collection, str(target)) + elif target.is_dir(): + files = [f for f in target.iterdir() if f.suffix.lower() in SUPPORTED_EXTENSIONS] + if not files: + print(f"No supported files found in {target}") + sys.exit(0) + for f in files: + ingest_file(collection, str(f)) + else: + print(f"Path not found: {target}") + sys.exit(1) + + print(f"\nTotal vectors in collection: {collection.count()}") if __name__ == '__main__': - main() + main() diff --git a/ingest/__init__.py b/ingest/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ingest/docx_parser.py b/ingest/docx_parser.py new file mode 100644 index 0000000..efe1b7c --- /dev/null +++ b/ingest/docx_parser.py @@ -0,0 +1,23 @@ +import docx + + +def parse_docx(filepath: str) -> list[dict]: + doc = docx.Document(filepath) + sections = [] + current_heading = None + buffer = [] + + for para in doc.paragraphs: + if para.style.name.startswith("Heading"): + if buffer: + sections.append({"text": "\n".join(buffer), "heading": current_heading}) + buffer = [] + current_heading = para.text.strip() or None + else: + if para.text.strip(): + buffer.append(para.text.strip()) + + if buffer: + sections.append({"text": "\n".join(buffer), "heading": current_heading}) + + return sections diff --git a/ingest/markdown_parser.py b/ingest/markdown_parser.py new file mode 100644 index 0000000..455799d --- /dev/null +++ b/ingest/markdown_parser.py @@ -0,0 +1,44 @@ +import re # Python's built in regular expressions module + + +def parse_markdown(filepath: str) -> list[dict]: + with open(filepath, "r", encoding = "utf-8") as f: + raw = f.read() + + sections = [] + current_heading = None + buffer = [] + + for line in raw.splitlines(): + heading_match = re.match(r"^#{1,6}\s+(.*)", line) + if heading_match: + if buffer: + sections.append({"text": "\n".join(buffer), "heading": current_heading}) + buffer = [] + current_heading = heading_match.group(1).strip() or None + else: + cleaned = _strip_markdown(line) + if cleaned: + buffer.append(cleaned) + + if buffer: + sections.append({"text": "\n".join(buffer), "heading": current_heading}) + + return sections + + +def _strip_markdown(line: str) -> str: + if re.match(r"^```", line): + return "" + if re.match(r"^(\*{3,}|-{3,}|_{3,})\s*$", line): + return "" + line = re.sub(r"^>\s?", "", line) + line = re.sub(r"^\s*[-*+]\s+", "", line) + line = re.sub(r"^\s*\d+\.\s+", "", line) + line = re.sub(r"!\[([^\]]*)\]\([^)]*\)", r"\1", line) + line = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", line) + line = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", line) + line = re.sub(r"_{1,3}([^_]+)_{1,3}", r"\1", line) + line = re.sub(r"`([^`]*)`", r"\1", line) + line = re.sub(r"<[^>]+>", "", line) + return line.strip() diff --git a/ingest/pdf_parser.py b/ingest/pdf_parser.py new file mode 100644 index 0000000..2bdc759 --- /dev/null +++ b/ingest/pdf_parser.py @@ -0,0 +1,14 @@ +import fitz # pymupdf + + +def parse_pdf(filepath: str) -> list[dict]: + doc = fitz.open(filepath) + pages = [] + + for page_num, page in enumerate(doc, start = 1): + text = page.get_text() + if text.strip(): + pages.append({"text": text, "page": page_num}) + + doc.close() + return pages diff --git a/ingest/router.py b/ingest/router.py new file mode 100644 index 0000000..6c8b8bd --- /dev/null +++ b/ingest/router.py @@ -0,0 +1,36 @@ +from pathlib import Path +from .pdf_parser import parse_pdf +from .docx_parser import parse_docx +from .markdown_parser import parse_markdown + + +def _parse_txt(filepath: str) -> list[dict]: + with open(filepath, "r", encoding = "utf-8") as f: + return [{"text": f.read()}] + + +_PARSERS = { + ".txt": _parse_txt, + ".pdf": parse_pdf, + ".docx": parse_docx, + ".md": parse_markdown, +} + +SUPPORTED_EXTENSIONS = set(_PARSERS.keys()) + + +def _resolve_parser(filepath: str): + ext = Path(filepath).suffix.lower() + parser = _PARSERS.get(ext) + if parser is None: + raise ValueError(f"Unsupported file type: {ext}") + return parser + + +def _flatten(sections: list[dict]) -> str: + return "\n\n".join(s["text"] for s in sections) + + +def parse(filepath: str) -> str: + parser = _resolve_parser(filepath) + return _flatten(parser(filepath)) From d17d36ef4a90bffeb6466705772dd46c2906906c Mon Sep 17 00:00:00 2001 From: Buffden Date: Tue, 23 Jun 2026 17:08:00 -0500 Subject: [PATCH 4/5] document ingestion - diagrams for pipeline flow --- docs/README.md | 52 +++++++++ ...peline-document-ingestion-chunk-embed.puml | 35 ++++++ ...line-document-ingestion-entry-routing.puml | 62 ++++++++++ docs/pipeline-document-ingestion-parsing.puml | 108 ++++++++++++++++++ docs/pipeline-document-ingestion-upsert.puml | 26 +++++ 5 files changed, 283 insertions(+) create mode 100644 docs/README.md create mode 100644 docs/pipeline-document-ingestion-chunk-embed.puml create mode 100644 docs/pipeline-document-ingestion-entry-routing.puml create mode 100644 docs/pipeline-document-ingestion-parsing.puml create mode 100644 docs/pipeline-document-ingestion-upsert.puml diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..93d0870 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,52 @@ +# Docs + +This folder contains the implementation plan, phase-by-phase notes, and sequence diagrams for the RAG Document Engine. + +--- + +## Implementation Plan + +| File | Description | +| ---- | ----------- | +| `implementation-plan.md` | Full build plan across all 7 phases — goals, what gets built, stack additions, and questions to answer per phase | + +--- + +## Phase Notes + +One file per phase. Each covers the goal, design decisions, and key concepts for that phase. + +| File | Phase | +| ---- | ----- | +| `phase-1-semantic-foundation.md` | Semantic search from scratch — chunking, embeddings, cosine similarity over flat JSON | +| `phase-2-vector-store.md` | Replace JSON with ChromaDB — persistent collection, metadata, `collection.query()` | +| `phase-3-rag-pipeline.md` | Close the loop — retrieval + LLM generation, grounded answers, citations, token budget | +| `phase-4-document-ingestion.md` | Multi-format ingestion — PDF, DOCX, Markdown parsers, CLI trigger, deduplication | +| `phase-5-retrieval-quality.md` | Improve retrieval — evaluation set, hybrid search (BM25 + vector), re-ranker, metadata filters | +| `phase-6-search-and-chat-mode.md` | Two interaction modes — document search and multi-turn chat with conversation history | +| `phase-7-role-based-document-access.md` | Access control — owner metadata at ingestion, per-user query filters, no bypass path | + +--- + +## Sequence Diagrams + +PlantUML sequence diagrams for each pipeline. Open with any PlantUML-compatible renderer. + +### Document Ingestion Pipeline + +Covers Phase 4. Split into 4 focused diagrams — read in this order to follow the full flow: + +| Order | File | Covers | +| ----- | ---- | ------- | +| 1 | [pipeline-document-ingestion-entry-routing.svg](../diagrams/docs/pipeline-document-ingestion-entry-routing.svg) | CLI entry, arg validation, `get_or_create_collection`, file vs directory routing | +| 2 | [pipeline-document-ingestion-parsing.svg](../diagrams/docs/pipeline-document-ingestion-parsing.svg) | Router extension resolution, all 4 parsers (PDF / DOCX / MD / TXT), flatten to plain text | +| 3 | [pipeline-document-ingestion-chunk-embed.svg](../diagrams/docs/pipeline-document-ingestion-chunk-embed.svg) | Sliding window chunking, OpenAI embeddings API call | +| 4 | [pipeline-document-ingestion-upsert.svg](../diagrams/docs/pipeline-document-ingestion-upsert.svg) | Deduplication check, ChromaDB upsert with full payload | + +### Other Pipelines + +| File | Covers | +| ---- | ------- | +| [pipeline-semantic-search.svg](../diagrams/docs/pipeline-semantic-search.svg) | Phase 1 — query embedding, cosine similarity, top-K retrieval over flat JSON | +| [pipeline-vector-store.svg](../diagrams/docs/pipeline-vector-store.svg) | Phase 2 — ingest and query flow using ChromaDB | +| [pipeline-rag.svg](../diagrams/docs/pipeline-rag.svg) | Phase 3 — end-to-end RAG: retrieval, token budget, prompt construction, LLM generation, citations | diff --git a/docs/pipeline-document-ingestion-chunk-embed.puml b/docs/pipeline-document-ingestion-chunk-embed.puml new file mode 100644 index 0000000..4d2ef65 --- /dev/null +++ b/docs/pipeline-document-ingestion-chunk-embed.puml @@ -0,0 +1,35 @@ +@startuml pipeline-document-ingestion-chunk-embed +skinparam sequenceMessageAlign center +skinparam ParticipantPadding 10 + +participant "ingest_file()" as ingest +participant "utils\nchunk_text()" as chunker +participant "embed\nembed_chunks()" as embedder +participant "OpenAI\nEmbeddings API" as openai + +== Chunking == + +ingest -> chunker : chunk_text(text, chunk_size = 300, overlap = 50) +activate chunker +chunker -> chunker : words = text.split() +chunker -> chunker : step = chunk_size - overlap = 250 +loop for i in range(0, len(words), step = 250) +chunker -> chunker : chunk_words = words[i : i + 300] +chunker -> chunker : chunks.append(" ".join(chunk_words)) +end +chunker --> ingest : chunks: list[str] +deactivate chunker + +== Embedding == + +ingest -> embedder : embed_chunks(chunks) +activate embedder +embedder -> openai : embeddings.create(model = EMBEDDING_MODEL, input = chunks) +activate openai +openai --> embedder : EmbeddingResponse — response.data[i].embedding +deactivate openai +embedder -> embedder : build [{"text": chunk, "embedding": [...float]}] +embedder --> ingest : embedded: list[dict] +deactivate embedder + +@enduml diff --git a/docs/pipeline-document-ingestion-entry-routing.puml b/docs/pipeline-document-ingestion-entry-routing.puml new file mode 100644 index 0000000..7a8361e --- /dev/null +++ b/docs/pipeline-document-ingestion-entry-routing.puml @@ -0,0 +1,62 @@ +@startuml pipeline-document-ingestion-entry-routing +skinparam sequenceMessageAlign center +skinparam ParticipantPadding 10 + +actor User as user +participant "ingest.py" as main +participant "ChromaDB\nPersistentClient" as chromaClient +participant "ChromaDB\nCollection" as collection + +== Module Init (at import time) == + +main -> chromaClient : PersistentClient(path = "./chroma_db") +activate chromaClient +chromaClient --> main : client (persistent, on-disk) +deactivate chromaClient + +== main() Entry == + +user -> main : python ingest.py +activate main + +opt sys.argv < 2 — no argument provided +main --> user : print "Usage: python ingest.py " +main --> user : sys.exit(1) +end + +main -> chromaClient : get_or_create_collection(name = "documents") +activate chromaClient +chromaClient --> main : collection +deactivate chromaClient +activate collection + +== Path Routing == + +alt target.is_file() +main -> main : ingest_file(collection, filepath) + +else target.is_dir() +main -> main : filter files by suffix (.txt | .pdf | .docx | .md) +alt no supported files found in directory +main --> user : print "No supported files found in {target}" +main --> user : sys.exit(0) +else supported files exist +loop for each file in directory +main -> main : ingest_file(collection, filepath) +end +end + +else path does not exist +main --> user : print "Path not found: {target}" +main --> user : sys.exit(1) +end + +== Summary == + +main -> collection : count() +collection --> main : total_count +main --> user : print "Total vectors in collection: {total_count}" +deactivate collection +deactivate main + +@enduml diff --git a/docs/pipeline-document-ingestion-parsing.puml b/docs/pipeline-document-ingestion-parsing.puml new file mode 100644 index 0000000..b259ba9 --- /dev/null +++ b/docs/pipeline-document-ingestion-parsing.puml @@ -0,0 +1,108 @@ +@startuml pipeline-document-ingestion-parsing +skinparam sequenceMessageAlign center +skinparam ParticipantPadding 10 + +participant "ingest_file()" as ingest +participant "router" as router +participant "pdf_parser" as pdfParser +participant "docx_parser" as docxParser +participant "markdown_parser" as mdParser +participant "_parse_txt" as txtParser + +ingest -> router : parse(filepath) +activate router + +router -> router : _resolve_parser(filepath) +router -> router : ext = Path(filepath).suffix.lower() + +opt ext not in _PARSERS (.txt / .pdf / .docx / .md) +router --> ingest : raise ValueError("Unsupported file type: {ext}") +end + +== PDF Parser == + +alt ext == ".pdf" +router -> pdfParser : parse_pdf(filepath) +activate pdfParser +pdfParser -> pdfParser : doc = fitz.open(filepath) +loop for page_num, page in enumerate(doc, start = 1) +pdfParser -> pdfParser : text = page.get_text() +opt text.strip() is non-empty +pdfParser -> pdfParser : append {"text": text, "page": page_num} +end +end +pdfParser -> pdfParser : doc.close() +pdfParser --> router : pages: list[dict] +deactivate pdfParser + +== DOCX Parser == + +else ext == ".docx" +router -> docxParser : parse_docx(filepath) +activate docxParser +docxParser -> docxParser : doc = docx.Document(filepath)\ncurrent_heading = None, buffer = [] +loop for each para in doc.paragraphs +alt para.style.name starts with "Heading" +opt buffer is non-empty +docxParser -> docxParser : flush buffer to sections with current_heading +docxParser -> docxParser : buffer = [] +end +docxParser -> docxParser : current_heading = para.text.strip() or None +else regular paragraph +opt para.text.strip() is non-empty +docxParser -> docxParser : buffer.append(para.text.strip()) +end +end +end +opt remaining buffer is non-empty +docxParser -> docxParser : flush buffer to sections with current_heading +end +docxParser --> router : sections: list[dict] +deactivate docxParser + +== Markdown Parser == + +else ext == ".md" +router -> mdParser : parse_markdown(filepath) +activate mdParser +mdParser -> mdParser : open(filepath, "r", encoding = "utf-8"), raw = f.read() +mdParser -> mdParser : sections = [], current_heading = None, buffer = [] +loop for each line in raw.splitlines() +alt line matches r"^#{1,6}\s+(.*)" +opt buffer is non-empty +mdParser -> mdParser : flush buffer to sections with current_heading +mdParser -> mdParser : buffer = [] +end +mdParser -> mdParser : current_heading = match.group(1).strip() or None +else regular line +mdParser -> mdParser : cleaned = _strip_markdown(line) +note right : strips: code fences, HR lines,\nblockquotes, list markers,\nimages, links, bold/italic,\ninline code, HTML tags +opt cleaned is non-empty +mdParser -> mdParser : buffer.append(cleaned) +end +end +end +opt remaining buffer is non-empty +mdParser -> mdParser : flush buffer to sections with current_heading +end +mdParser --> router : sections: list[dict] +deactivate mdParser + +== TXT Parser == + +else ext == ".txt" +router -> txtParser : _parse_txt(filepath) +activate txtParser +txtParser -> txtParser : open(filepath, "r", encoding = "utf-8") +txtParser -> txtParser : text = f.read() +txtParser --> router : [{"text": full_text}] +deactivate txtParser +end + +== Flatten == + +router -> router : _flatten(sections)\n"\n\n".join(s["text"] for s in sections) +router --> ingest : plain text: str +deactivate router + +@enduml diff --git a/docs/pipeline-document-ingestion-upsert.puml b/docs/pipeline-document-ingestion-upsert.puml new file mode 100644 index 0000000..19f4be9 --- /dev/null +++ b/docs/pipeline-document-ingestion-upsert.puml @@ -0,0 +1,26 @@ +@startuml pipeline-document-ingestion-upsert +skinparam sequenceMessageAlign center +skinparam ParticipantPadding 10 + +participant "ingest_file()" as ingest +participant "ChromaDB\nCollection" as collection + +== Deduplication == + +ingest -> collection : count() +collection --> ingest : current_count + +opt current_count > 0 — collection already has data +ingest -> collection : delete(where = {"source": filename}) +note right : Removes all existing chunks\nfor this file before re-ingesting\n(handles re-ingest / refresh) +end + +== Upsert == + +ingest -> collection : upsert(ids, embeddings, documents, metadatas) +note right : ids = ["{filename}_{i}" per chunk]\nembeddings = [float vectors from OpenAI]\ndocuments = raw chunk strings\nmetadatas = [{"source": filename, "chunk_index": i}] +collection --> ingest : OK + +ingest -> ingest : print "Ingested {len(chunks)} chunks from {filename}" + +@enduml From d8202cc1f52ece344855774f3a318f202121e6ef Mon Sep 17 00:00:00 2001 From: Buffden Date: Tue, 23 Jun 2026 17:18:06 -0500 Subject: [PATCH 5/5] document ingestion - readme update --- README.md | 73 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index db11877..9f574ae 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,26 @@ # RAG Document Engine -A progressive RAG system built from first principles -- from raw embeddings and cosine similarity all the way to a full retrieval-augmented generation pipeline with document ingestion, reranking, and cited answers. +A progressive RAG system built from first principles -- from raw embeddings and cosine similarity all the way to a full retrieval-augmented generation pipeline with multi-format document ingestion and cited answers. --- ## What It Does (Current State) -**Ingestion** +### Ingestion -1. **Loads** `.txt` files (PDF, DOCX, Markdown from Phase 4) +1. **Parses** `.txt`, `.pdf`, `.docx`, and `.md` files into plain text via format-specific parsers 2. **Chunks** each document into overlapping word windows 3. **Embeds** each chunk using OpenAI `text-embedding-3-small`, producing a 1536-dimensional vector -4. **Stores** vectors with metadata (`source`, `chunk_index`) in a persistent Chroma collection +4. **Deduplicates** - deletes any existing chunks for the file before storing, so re-ingestion replaces rather than duplicates +5. **Stores** vectors with metadata (`source`, `chunk_index`) in a persistent Chroma collection -**Search** +### Search 1. **Embeds** the query using the same model 2. **Queries** Chroma for the top-K nearest vectors using built-in ANN (Approximate Nearest Neighbor) search 3. **Returns** results with chunk text, source filename, and distance score -**Generation** +### Generation 1. **Selects** retrieved chunks within a 2000-token budget using `tiktoken` 2. **Builds** a numbered context block from the selected chunks @@ -34,6 +35,9 @@ A progressive RAG system built from first principles -- from raw embeddings and - OpenAI SDK (`text-embedding-3-small` for embeddings, `gpt-4o-mini` for generation) - Chroma (persistent vector database) - tiktoken (token counting for context budget management) +- pymupdf (PDF parsing) +- python-docx (DOCX parsing) +- numpy (cosine similarity computation) - python-dotenv --- @@ -42,24 +46,25 @@ A progressive RAG system built from first principles -- from raw embeddings and ```text rag-document-engine/ -├── documents/ # Sample .txt files -│ ├── ancient-rome.txt -│ ├── climate-change.txt -│ ├── music-and-the-brain.txt -│ ├── nutrition-and-health.txt -│ └── space-exploration.txt +├── documents/ # Sample documents (.txt, .pdf, .docx, .md) +├── ingest/ # Format-specific parsers (Phase 4) +│ ├── __init__.py +│ ├── router.py # Resolves parser by file extension +│ ├── pdf_parser.py # PDF extraction via pymupdf +│ ├── docx_parser.py # DOCX extraction via python-docx +│ └── markdown_parser.py # Markdown stripping to plain text ├── prompts/ │ └── system_prompt.txt # LLM system prompt (loaded at runtime) ├── embed.py # embed_chunks and embed_query utilities -├── ingest.py # Load, chunk, embed, store in Chroma +├── ingest.py # CLI entry point - parse, chunk, embed, store ├── search.py # Embed query + retrieve top-K from Chroma ├── generate.py # Token-budgeted answer generation via gpt-4o-mini ├── rag.py # End-to-end pipeline entry point ├── inspect_collection.py # Print collection stats and a sample entry ├── utils.py # chunk_text, load_document, load_documents ├── chroma_db/ # Chroma persistent storage (not committed) -├── diagrams/ # Pipeline diagrams (SVG, auto-exported from PlantUML) -├── docs/ # PlantUML source files and implementation plan +├── diagrams/ # Pipeline diagrams (SVG, generated via npx diagram-sync) +├── docs/ # Phase notes, PlantUML source files, and docs index ├── pyproject.toml └── .env # API keys (not committed) ``` @@ -88,8 +93,9 @@ TOKEN_BUDGET=2000 ## Usage ```bash -# Step 1 -- Ingest documents into Chroma -python3 ingest.py +# Step 1 -- Ingest a single file or an entire directory +python3 ingest.py documents/ancient-rome.pdf +python3 ingest.py documents/ # Step 2 -- Ask a question (full RAG pipeline) python3 rag.py "what foods are good for the heart" @@ -130,15 +136,15 @@ No answer found in the documents. **Search only** -- `python3 search.py` ```text -Result 1 (distance: 1.2862) -- nutrition-and-health.txt [chunk 0] +Result 1 (distance: 1.2862) - nutrition-and-health.txt [chunk 0] Nutrition is the science of how food affects the body... Unsaturated fats found in olive oil, nuts, avocados, and fatty fish are associated with reduced risk of heart disease... -Result 2 (distance: 1.3720) -- nutrition-and-health.txt [chunk 1] +Result 2 (distance: 1.3720) - nutrition-and-health.txt [chunk 1] The Mediterranean diet -- rich in vegetables, fruit, whole grains, fish, and olive oil -- is consistently associated with lower rates of heart disease, diabetes, and cognitive decline... -Result 3 (distance: 1.6426) -- music-and-the-brain.txt [chunk 1] +Result 3 (distance: 1.6426) - music-and-the-brain.txt [chunk 1] Music also affects mood and stress. Slow, quiet music activates the parasympathetic nervous system, lowering heart rate and cortisol levels... ``` @@ -156,7 +162,7 @@ Note: distance is an inverse similarity score -- lower means more relevant. | 1 | Semantic Foundation | Complete | | 2 | Vector Store | Complete | | 3 | RAG Pipeline | Complete | -| 4 | Document Ingestion | Planned | +| 4 | Document Ingestion | Complete | | 5 | Retrieval Quality | Planned | | 6 | Search and Chat Mode | Planned | | 7 | Role-Based Document Access | Planned | @@ -173,14 +179,15 @@ See [docs/implementation-plan.md](./docs/implementation-plan.md) for full phase - **Model consistency** -- the same embedding model must be used for both documents and queries - **Vector database** -- stores embeddings with metadata and retrieves them by similarity using ANN search - **RAG** -- Retrieval-Augmented Generation: retrieve relevant context, then generate a grounded answer +- **Document parsing** -- format-specific extraction that converts PDF, DOCX, and Markdown into plain text before chunking; all formats share the same embedding and storage flow after parsing --- ## Diagrams -Pipeline diagrams are maintained as PlantUML source files in `docs/` and auto-exported to SVG on every push to main using [diagram-sync](https://www.npmjs.com/package/diagram-sync). +Pipeline diagrams are maintained as PlantUML source files in `docs/` and exported to SVG via `npx diagram-sync` using [diagram-sync](https://www.npmjs.com/package/diagram-sync). -The three diagrams below show the system growing phase by phase -- each one builds on the previous. +The diagrams below show the system growing phase by phase -- each one builds on the previous. ### Phase 1 -- Semantic Search (cosine similarity over JSON embeddings) @@ -193,3 +200,23 @@ The three diagrams below show the system growing phase by phase -- each one buil ### Phase 3 -- RAG Pipeline (generation on top of retrieval) ![RAG Pipeline](./diagrams/docs/pipeline-rag.svg) + +### Phase 4 -- Document Ingestion (multi-format parsing, deduplication) + +The ingestion flow is split into 4 focused diagrams - read in this order: + +**1. Entry and Routing** - CLI validation, collection setup, file vs directory routing + +![Entry and Routing](./diagrams/docs/pipeline-document-ingestion-entry-routing.svg) + +**2. Parsing** - router extension resolution, all 4 parsers (PDF / DOCX / MD / TXT), flatten to plain text + +![Parsing](./diagrams/docs/pipeline-document-ingestion-parsing.svg) + +**3. Chunking and Embedding** - sliding window chunking, OpenAI embeddings API call + +![Chunking and Embedding](./diagrams/docs/pipeline-document-ingestion-chunk-embed.svg) + +**4. Upsert** - deduplication check, ChromaDB upsert with full payload + +![Upsert](./diagrams/docs/pipeline-document-ingestion-upsert.svg)