prosty-rag / indeksator.cmd
jglowa's picture
Update indeksator.cmd
f925648 verified
raw
history blame
4.07 kB
:; # Indeksator Prosty RAG v0.2 - Jerzy Głowacki na licencji Apache 2.0
:; # *NIX:
:; # Platform tag used in the xpdf-tools archive name: Linux becomes linux, Darwin becomes mac,
:; # and any other kernel name is kept as reported by uname.
:; OS=$(uname -s); case "$OS" in Linux) OS=linux ;; Darwin) OS=mac ;; esac
:; # File names and chunking parameters used by the steps below.
:; inputDir="baza"; embedfile="bge-m3.embedfile"
:; dbFile="prosty-rag.db"; chunksFile="chunks.txt"
:; chunkWords=200; overlapWords=10
:; # Create the chunk list, or empty it if a previous run left one behind.
:; : > "$chunksFile"
:; # Installation: on first run fetch the sample corpus, the embedding tool and pdftotext.
:; # curl runs with -f so an HTTP error fails the step instead of saving the error page as the
:; # requested file, which would make later runs skip the download.
:; if [ ! -d "$inputDir" ]; then
:;   echo "Pobieranie przykładowego pliku $inputDir/wikipedia.txt..."
:;   curl --create-dirs -fLo "$inputDir/wikipedia.txt" "https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true" && echo "Gotowe!"
:; fi
:; if [ ! -f "$embedfile" ]; then
:;   echo "Pobieranie $embedfile..."
:;   curl -fLo "$embedfile" "https://huggingface.co/asg017/embedfile/resolve/refs%2Fpr%2F2/bge-m3.embedfile?download=true" && chmod +x "$embedfile" && echo "Gotowe!"
:; fi
:; if [ ! -f pdftotext ]; then
:;   archive="xpdf-tools-$OS-4.05.tar.gz"
:;   echo "Pobieranie pdftotext..."
:;   # Only the pdftotext binary is extracted, into the current directory; the archive is then
:;   # removed with rm (the previous del is a Windows command and does not exist here).
:;   curl -fLO "https://dl.xpdfreader.com/$archive" && tar --strip-components 2 -xzf "$archive" "xpdf-tools-$OS-4.05/bin64/pdftotext" && rm -f -- "$archive" && echo "Gotowe!"
:; fi
:; # Indexing: convert PDFs to text, then split every text file into overlapping word chunks.
:; echo "Indeksowanie plików PDF/TXT/MD w folderze $inputDir..."
:; # With nullglob a pattern that matches nothing expands to nothing, so the loops run zero times.
:; shopt -s nullglob
:; # The install step extracts pdftotext into the current directory, which is normally not on
:; # PATH, so it is called by explicit relative path.
:; for pdf in "$inputDir"/*.pdf; do echo "Konwertowanie $(basename "$pdf")..." && ./pdftotext -nopgbrk -enc UTF-8 "$pdf"; done
:; # Distance between chunk starts. If the overlap is not smaller than the chunk size the window
:; # would never advance, so fall back to consecutive chunks without overlap.
:; step=$((chunkWords - overlapWords)); ((step > 0)) || step=$chunkWords
:; for file in "$inputDir"/*.txt "$inputDir"/*.md; do
:;   filename=$(basename "$file")
:;   echo "Przetwarzanie $filename..."
:;   # One array element per whitespace-separated word of the file.
:;   mapfile -t words < <(tr -s '[:space:]' '\n' < "$file" | grep -v '^$')
:;   totalWords=${#words[@]}
:;   start=0
:;   while ((start < totalWords)); do
:;     chunk=("${words[@]:start:chunkWords}")
:;     # Each chunk becomes one line, prefixed with the name of the file it came from.
:;     echo "$filename: ${chunk[*]}" >> "$chunksFile"
:;     ((start += step))
:;   done
:; done
:; # Embedding: rebuild the database from scratch out of the freshly written chunk list.
:; echo "Osadzanie plików..."
:; rm -f -- "$dbFile"
:; # The import status is captured on the same line as the import: every line of this section
:; # starts with the ':' no-op, which would reset $? before a separate line could read it.
:; ./"$embedfile" import "$chunksFile" "$dbFile" && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator."; status=$?
:; # Clean up, then report the import result rather than the status of rm.
:; rm -f -- "$chunksFile"; exit "$status"
:; # Windows:
:; # Command echo is switched off. Delayed expansion is enabled because the chunking loop further
:; # down reads variables that it changes inside parenthesised blocks.
@echo off
setlocal enabledelayedexpansion
:; # File names and chunking parameters.
set embedfile=bge-m3.embedfile
set inputDir=baza
set chunksFile=chunks.txt
set dbFile=prosty-rag.db
set chunkWords=200
set overlapWords=10
:; # Clear every slot buf[1] .. buf[overlapWords] of the buffer that holds the most recent words.
for /l %%i in (1,1,%overlapWords%) do set buf[%%i]=
:; # break prints nothing, so this creates the chunk list empty or truncates an existing one.
break>%chunksFile%
:; # Installation: each download runs only when its target is missing. The sample corpus is
:; # fetched when the input folder does not exist; the percent signs in the second URL are doubled
:; # so that a literal percent sign reaches curl; only pdftotext.exe is extracted from the xpdf zip.
if not exist %inputDir% echo Pobieranie przykładowego pliku %inputDir%\wikipedia.txt... && curl --create-dirs -Lo %inputDir%\wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe^!
if not exist %embedfile% echo Pobieranie %embedfile%... && curl -Lo %embedfile% https://huggingface.co/asg017/embedfile/resolve/refs%%2Fpr%%2F2/bge-m3.embedfile?download=true && echo Gotowe^!
if not exist pdftotext.exe echo Pobieranie pdftotext.exe... && curl -LO https://dl.xpdfreader.com/xpdf-tools-win-4.05.zip && tar --strip-components 2 -xf xpdf-tools-win-4.05.zip xpdf-tools-win-4.05/bin64/pdftotext.exe && del xpdf-tools-win-4.05.zip && echo Gotowe^!
:; # Running
echo Indeksowanie plików PDF/TXT/MD w folderze %inputDir%...
:; # Convert each PDF that does not yet have a .txt file of the same name next to it.
for %%F in ("%inputDir%\*.pdf") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && pdftotext -nopgbrk -enc UTF-8 "%%~F"
:; # Chunking. Everything printed inside the inner parenthesised block is appended to the chunk list.
:; # Each output line starts with the file name and a colon; words are appended with set /p, which
:; # prints without a trailing newline. After chunkWords words the line is ended and a new one is
:; # started with the file name followed by the last overlapWords words kept in buf.
:; # Word splitting: every space of the input line is replaced by a line feed and the inner for /f
:; # then iterates over the resulting lines. The replacement is written as a caret at the end of
:; # the line, one completely empty line, and the closing exclamation mark at column 0. The empty
:; # line is required: without it the caret merely joins the two lines and the spaces are deleted
:; # instead of being turned into line feeds. Do not indent or comment inside that construct.
for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do (
echo Przetwarzanie %%~nxF...
(
set wordCount=0
set /p =%%~nxF: <nul
for /f "usebackq delims=" %%L in ("%%F") do (
set line=%%L
set line=!line: =^

!
for /f "delims=" %%W in ("!line!") do (
set /p =%%W <nul
for /l %%i in (2,1,%overlapWords%) do (set /a j=%%i-1 && set buf[!j!]=!buf[%%i]!)
set "buf[%overlapWords%]=%%W"
set /a wordCount+=1
if !wordCount! geq !chunkWords! (
echo.
set /p =%%~nxF: <nul
for /l %%i in (1,1,%overlapWords%) do set /p =!buf[%%i]! <nul
set /a wordCount=0
)
)
)
echo.
)>>%chunksFile%
)
:; # Embedding: delete any previous database, import the chunk list into a new one, then remove
:; # the temporary chunk list. The final pause keeps the console window open until a key is pressed.
echo Osadzanie plików...
if exist %dbFile% del %dbFile%
%embedfile% import %chunksFile% %dbFile% && echo Gotowe^! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator.
del %chunksFile%
endlocal
pause