| :; # Indeksator Prosty RAG v0.2 - Jerzy Głowacki na licencji Apache 2.0 | |
| :; # *NIX section. Every line starts with ":;" - a no-op for bash, while cmd.exe reads such a line as a label and skips it. | |
| :; # Purpose: download the required tools if missing, convert PDFs to text, cut every TXT/MD file | |
| :; # in the input folder into overlapping word chunks and import the chunks into the embedding database. | |
| :; OS=$(uname -s | sed -e 's/^Linux$/linux/' -e 's/^Darwin$/mac/') | |
| :; embedfile="bge-m3.embedfile" | |
| :; inputDir="baza" | |
| :; chunksFile="chunks.txt" | |
| :; dbFile="prosty-rag.db" | |
| :; chunkWords=200 | |
| :; overlapWords=10 | |
| :; # Each chunk starts this many words after the previous one; a non-positive step would make the chunking loop spin forever. | |
| :; step=$((chunkWords - overlapWords)) | |
| :; if ((step <= 0)); then echo "Błąd: overlapWords ($overlapWords) musi być mniejsze niż chunkWords ($chunkWords)." >&2; exit 1; fi | |
| :; # Start from an empty chunks file. | |
| :; : > "$chunksFile" | |
| :; # Installation: fetch the sample input file, the embedding tool and pdftotext only when they are missing. | |
| :; [ ! -d "$inputDir" ] && echo "Pobieranie przykładowego pliku $inputDir/wikipedia.txt..." && curl --create-dirs -Lo "$inputDir/wikipedia.txt" "https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true" && echo Gotowe! | |
| :; [ ! -f "$embedfile" ] && echo "Pobieranie $embedfile..." && curl -Lo "$embedfile" "https://huggingface.co/asg017/embedfile/resolve/refs%2Fpr%2F2/bge-m3.embedfile?download=true" && chmod +x "$embedfile" && echo Gotowe! | |
| :; # The downloaded archive is removed with rm; del exists only in cmd.exe and would break this chain. | |
| :; [ ! -f pdftotext ] && echo "Pobieranie pdftotext..." && curl -LO "https://dl.xpdfreader.com/xpdf-tools-$OS-4.05.tar.gz" && tar --strip-components 2 -xzf "xpdf-tools-$OS-4.05.tar.gz" "xpdf-tools-$OS-4.05/bin64/pdftotext" && rm "xpdf-tools-$OS-4.05.tar.gz" && echo Gotowe! | |
| :; # The binary extracted above lands in the current directory, which is normally not on PATH, | |
| :; # so prefer ./pdftotext and fall back to a system-wide pdftotext when the local one is absent. | |
| :; pdftotextBin=./pdftotext; [ -x "$pdftotextBin" ] || pdftotextBin=pdftotext | |
| :; # Running | |
| :; echo "Indeksowanie plików PDF/TXT/MD w folderze $inputDir..." | |
| :; # nullglob: a pattern without matches expands to nothing, so the loops below are skipped for missing file types. | |
| :; shopt -s nullglob | |
| :; for pdf in "$inputDir"/*.pdf; do echo "Konwertowanie $(basename "$pdf")..." && "$pdftotextBin" -nopgbrk -enc UTF-8 "$pdf"; done | |
| :; for file in "$inputDir"/*.txt "$inputDir"/*.md; do | |
| :;   filename=$(basename "$file") | |
| :;   echo "Przetwarzanie $filename..." | |
| :;   # Split the file on any whitespace into one array element per word. | |
| :;   mapfile -t words < <(tr -s '[:space:]' '\n' < "$file" | grep -v '^$') | |
| :;   totalWords=${#words[@]} | |
| :;   start=0 | |
| :;   while ((start < totalWords)); do | |
| :;     chunk=("${words[@]:start:chunkWords}") | |
| :;     # One chunk per output line, prefixed with the source file name. | |
| :;     printf '%s: %s\n' "$filename" "${chunk[*]}" >> "$chunksFile" | |
| :;     # This chunk already reached the end of the file; another pass would only repeat its tail. | |
| :;     if ((start + chunkWords >= totalWords)); then break; fi | |
| :;     ((start += step)) | |
| :;   done | |
| :; done | |
| :; echo "Osadzanie plików..." | |
| :; rm -f "$dbFile" | |
| :; # Keep the import status so the script reports it instead of the status of the cleanup rm. | |
| :; ./"$embedfile" import "$chunksFile" "$dbFile"; status=$? | |
| :; [ "$status" -eq 0 ] && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator." | |
| :; # exit is mandatory here: it stops bash before it reaches the cmd.exe part of the file. | |
| :; rm -f "$chunksFile"; exit "$status" | |
| :; # Windows: | |
| :; # cmd.exe part of the script. It downloads missing tools, converts PDFs to text, writes word chunks | |
| :; # of every TXT/MD file in the input folder to the chunks file and imports that file into the database. | |
| @echo off | |
| :; # Delayed expansion is needed because variables are changed and read inside the loops below. | |
| setlocal enabledelayedexpansion | |
| :; # Settings: embedding tool, input folder, temporary chunks file, database file, chunk size and overlap in words. | |
| set embedfile=bge-m3.embedfile | |
| set inputDir=baza | |
| set chunksFile=chunks.txt | |
| set dbFile=prosty-rag.db | |
| set chunkWords=200 | |
| set overlapWords=10 | |
| :; # Clear the buffer entries buf[1] .. buf[overlapWords] that later hold the most recent words. | |
| for /l %%i in (1,1,%overlapWords%) do set buf[%%i]= | |
| :; # Create or truncate the chunks file: break prints nothing, so the redirect leaves an empty file. | |
| break>%chunksFile% | |
| :; # Installation | |
| :; # Each step runs only when its target is missing. Exclamation marks are caret-escaped because delayed | |
| :; # expansion is enabled, and the percent signs in the second URL are doubled to keep them literal. | |
| if not exist %inputDir% echo Pobieranie przykładowego pliku %inputDir%\wikipedia.txt... && curl --create-dirs -Lo %inputDir%\wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe^! | |
| if not exist %embedfile% echo Pobieranie %embedfile%... && curl -Lo %embedfile% https://huggingface.co/asg017/embedfile/resolve/refs%%2Fpr%%2F2/bge-m3.embedfile?download=true && echo Gotowe^! | |
| if not exist pdftotext.exe echo Pobieranie pdftotext.exe... && curl -LO https://dl.xpdfreader.com/xpdf-tools-win-4.05.zip && tar --strip-components 2 -xf xpdf-tools-win-4.05.zip xpdf-tools-win-4.05/bin64/pdftotext.exe && del xpdf-tools-win-4.05.zip && echo Gotowe^! | |
| :; # Running | |
| echo Indeksowanie plików PDF/TXT/MD w folderze %inputDir%... | |
| :; # Convert every PDF in the input folder, skipping those that already have a .txt file with the same base name. | |
| for %%F in ("%inputDir%\*.pdf") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && pdftotext -nopgbrk -enc UTF-8 "%%~F" | |
| :; # Chunking loop, one pass per TXT/MD file. Everything printed inside the inner parenthesised group | |
| :; # is appended to the chunks file; only the progress message goes to the console. | |
| :; #  - "set /p =text <nul" is used to print text without a trailing newline; "echo." ends the current line. | |
| :; #  - Each output line starts with the file name and a colon, followed by the words separated by spaces. | |
| :; #  - Every input line has its spaces replaced by line breaks (the caret plus newline substitution); | |
| :; #    presumably this lets the inner for /f see one word per iteration - the exact line layout of that | |
| :; #    substitution matters, so do not reformat it. | |
| :; #  - For every word: print it, shift the buffer entries down by one, store the word in the last entry | |
| :; #    and increment wordCount. | |
| :; #  - When wordCount reaches chunkWords: end the line, start a new one with the file name prefix, | |
| :; #    replay the buffered words as overlap and reset wordCount to 0. | |
| :; # NOTE(review): the replayed overlap words are not counted, so every chunk after the first holds | |
| :; # overlapWords plus chunkWords words, and the buffer is not cleared between files - confirm this is intended. | |
| for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do ( | |
| echo Przetwarzanie %%~nxF... | |
| ( | |
| set wordCount=0 | |
| set /p =%%~nxF: <nul | |
| for /f "usebackq delims=" %%L in ("%%F") do ( | |
| set line=%%L | |
| set line=!line: =^ | |
| ! | |
| for /f "delims=" %%W in ("!line!") do ( | |
| set /p =%%W <nul | |
| for /l %%i in (2,1,%overlapWords%) do (set /a j=%%i-1 && set buf[!j!]=!buf[%%i]!) | |
| set "buf[%overlapWords%]=%%W" | |
| set /a wordCount+=1 | |
| if !wordCount! geq !chunkWords! ( | |
| echo. | |
| set /p =%%~nxF: <nul | |
| for /l %%i in (1,1,%overlapWords%) do set /p =!buf[%%i]! <nul | |
| set /a wordCount=0 | |
| ) | |
| ) | |
| ) | |
| echo. | |
| )>>%chunksFile% | |
| ) | |
| :; # Embedding: remove any previous database, import the chunks file, then delete the temporary chunks file. | |
| echo Osadzanie plików... | |
| if exist %dbFile% del %dbFile% | |
| %embedfile% import %chunksFile% %dbFile% && echo Gotowe^! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator. | |
| del %chunksFile% | |
| endlocal | |
| :; # Wait for a key press before the script ends. | |
| pause | |