Update indeksator.cmd
Browse files- indeksator.cmd +15 -12
indeksator.cmd
CHANGED
@@ -1,21 +1,22 @@
|
|
1 |
-
:; # Indeksator Prosty RAG v0.
|
2 |
:; # *NIX:
|
3 |
:; OS=$(uname -s | sed -e 's/^Linux$/linux/' -e 's/^Darwin$/mac/')
|
4 |
:; embedfile="bge-m3.embedfile"
|
5 |
:; inputDir="baza"
|
6 |
-
:; chunksFile="chunks.
|
7 |
:; dbFile="prosty-rag.db"
|
8 |
:; chunkWords=200
|
9 |
-
:; overlapWords=
|
10 |
:; > $chunksFile
|
11 |
:; # Instalacja
|
12 |
:; [ ! -d $inputDir ] && echo Pobieranie przykładowego pliku $inputDir/wikipedia.txt... && curl --create-dirs -Lo $inputDir/wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe!
|
13 |
:; [ ! -f $embedfile ] && echo Pobieranie $embedfile... && curl -Lo $embedfile https://huggingface.co/asg017/embedfile/resolve/refs%2Fpr%2F2/bge-m3.embedfile?download=true && chmod +x $embedfile && echo Gotowe!
|
14 |
:; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -LO https://dl.xpdfreader.com/xpdf-tools-$OS-4.05.tar.gz && tar --strip-components 2 -xzf xpdf-tools-$OS-4.05.tar.gz xpdf-tools-$OS-4.05/bin64/pdftotext && rm xpdf-tools-$OS-4.05.tar.gz && echo Gotowe!
|
15 |
:; # Uruchamianie
|
16 |
-
:; echo "Indeksowanie plików PDF/TXT/MD w folderze $inputDir..."
|
17 |
:; shopt -s nullglob
|
18 |
-
:; for pdf in $inputDir/*.pdf; do echo "Konwertowanie $(basename "$pdf")..." && pdftotext -nopgbrk -enc UTF-8 "$pdf"; done
|
|
|
19 |
:; for file in $inputDir/*.txt $inputDir/*.md; do
|
20 |
:; filename=$(basename "$file")
|
21 |
:; echo "Przetwarzanie $filename..."
|
@@ -34,22 +35,24 @@
|
|
34 |
:; rm $chunksFile; exit $?
|
35 |
:; # Windows:
|
36 |
@echo off
|
|
|
37 |
setlocal enabledelayedexpansion
|
38 |
set embedfile=bge-m3.embedfile
|
39 |
set inputDir=baza
|
40 |
-
set chunksFile=chunks.
|
41 |
set dbFile=prosty-rag.db
|
42 |
set chunkWords=200
|
43 |
-
set overlapWords=
|
44 |
for /l %%i in (1,1,%overlapWords%) do set buf[%%i]=
|
45 |
break>%chunksFile%
|
46 |
:; # Instalacja
|
47 |
-
if not exist %inputDir% echo Pobieranie przykładowego pliku %inputDir%\wikipedia.txt... && curl --create-dirs -Lo %inputDir%\wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe
|
48 |
-
if not exist %embedfile% echo Pobieranie %embedfile%... && curl -Lo %embedfile% https://huggingface.co/asg017/embedfile/resolve/refs%%2Fpr%%2F2/bge-m3.embedfile?download=true && echo Gotowe
|
49 |
-
if not exist pdftotext.exe echo Pobieranie pdftotext.exe... && curl -LO https://dl.xpdfreader.com/xpdf-tools-win-4.05.zip && tar --strip-components 2 -xf xpdf-tools-win-4.05.zip xpdf-tools-win-4.05/bin64/pdftotext.exe && del xpdf-tools-win-4.05.zip && echo Gotowe
|
50 |
:; # Uruchamianie
|
51 |
-
echo Indeksowanie plików PDF/TXT/MD w folderze %inputDir%...
|
52 |
for %%F in ("%inputDir%\*.pdf") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && pdftotext -nopgbrk -enc UTF-8 "%%~F"
|
|
|
53 |
for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do (
|
54 |
echo Przetwarzanie %%~nxF...
|
55 |
(
|
@@ -78,7 +81,7 @@ for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do (
|
|
78 |
)
|
79 |
echo Osadzanie plików...
|
80 |
if exist %dbFile% del %dbFile%
|
81 |
-
%embedfile% import %chunksFile% %dbFile% && echo Gotowe
|
82 |
del %chunksFile%
|
83 |
endlocal
|
84 |
pause
|
|
|
1 |
+
:; # Indeksator Prosty RAG v0.3 - Jerzy Głowacki na licencji Apache 2.0
|
2 |
:; # *NIX:
|
3 |
:; OS=$(uname -s | sed -e 's/^Linux$/linux/' -e 's/^Darwin$/mac/')
|
4 |
:; embedfile="bge-m3.embedfile"
|
5 |
:; inputDir="baza"
|
6 |
+
:; chunksFile="chunks.tmp"
|
7 |
:; dbFile="prosty-rag.db"
|
8 |
:; chunkWords=200
|
9 |
+
:; overlapWords=20
|
10 |
:; > $chunksFile
|
11 |
:; # Instalacja
|
12 |
:; [ ! -d $inputDir ] && echo Pobieranie przykładowego pliku $inputDir/wikipedia.txt... && curl --create-dirs -Lo $inputDir/wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe!
|
13 |
:; [ ! -f $embedfile ] && echo Pobieranie $embedfile... && curl -Lo $embedfile https://huggingface.co/asg017/embedfile/resolve/refs%2Fpr%2F2/bge-m3.embedfile?download=true && chmod +x $embedfile && echo Gotowe!
|
14 |
:; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -LO https://dl.xpdfreader.com/xpdf-tools-$OS-4.05.tar.gz && tar --strip-components 2 -xzf xpdf-tools-$OS-4.05.tar.gz xpdf-tools-$OS-4.05/bin64/pdftotext && rm xpdf-tools-$OS-4.05.tar.gz && echo Gotowe!
|
15 |
:; # Uruchamianie
|
16 |
+
:; echo "Indeksowanie plików PDF/TXT/MD/CSV w folderze $inputDir..."
|
17 |
:; shopt -s nullglob
|
18 |
+
:; # Convert every PDF that has no sibling .txt yet. The current directory is prepended to PATH for the pdftotext call only, so the binary unpacked into the working directory by the install step is found even when "." is not on PATH; a system-wide pdftotext is still used as a fallback.
:; for pdf in $inputDir/*.pdf; do [ ! -f "${pdf%.pdf}.txt" ] && echo "Konwertowanie $(basename "$pdf")..." && PATH="$PWD:$PATH" pdftotext -nopgbrk -enc UTF-8 "$pdf"; done
|
19 |
+
:; # Convert every CSV that has no sibling .txt yet into "header: value, header: value." lines. Fields are split by a small quote-aware parser written in portable awk (no gawk-only FPAT), so commas inside double-quoted fields are kept on mawk and BSD awk too. Quotes are left in the output as-is, a trailing CR from CRLF files is stripped, and empty records are skipped. The awk program stays on one physical line because every *NIX line must begin with ":" to be ignored by cmd.
:; for csv in $inputDir/*.csv; do [ ! -f "${csv%.csv}.txt" ] && echo "Konwertowanie $(basename "$csv")..." && awk 'function csvsplit(s, f,   n, k, c, q, cur) { n = 0; q = 0; cur = ""; for (k = 1; k <= length(s); k++) { c = substr(s, k, 1); if (c == "\"") { q = !q; cur = cur c } else if (c == "," && !q) { f[++n] = cur; cur = "" } else cur = cur c } f[++n] = cur; return n } { sub(/\r$/, "") } $0 == "" { next } NR == 1 { csvsplit($0, h); next } { nf = csvsplit($0, v); for (i = 1; i <= nf; i++) printf "%s: %s%s", h[i], v[i], (i < nf ? ", " : ".\n") }' "$csv" > "${csv%.csv}.txt"; done
|
20 |
:; for file in $inputDir/*.txt $inputDir/*.md; do
|
21 |
:; filename=$(basename "$file")
|
22 |
:; echo "Przetwarzanie $filename..."
|
|
|
35 |
:; rm $chunksFile; exit $?
|
36 |
:; # Windows:
|
37 |
@echo off
|
38 |
+
chcp 65001 >nul
|
39 |
setlocal enabledelayedexpansion
|
40 |
set embedfile=bge-m3.embedfile
|
41 |
set inputDir=baza
|
42 |
+
set chunksFile=chunks.tmp
|
43 |
set dbFile=prosty-rag.db
|
44 |
set chunkWords=200
|
45 |
+
set overlapWords=20
|
46 |
for /l %%i in (1,1,%overlapWords%) do set buf[%%i]=
|
47 |
break>%chunksFile%
|
48 |
:; # Instalacja
|
49 |
+
if not exist %inputDir% echo Pobieranie przykładowego pliku %inputDir%\wikipedia.txt... && curl --create-dirs -Lo %inputDir%\wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe^^!
|
50 |
+
if not exist %embedfile% echo Pobieranie %embedfile%... && curl -Lo %embedfile% https://huggingface.co/asg017/embedfile/resolve/refs%%2Fpr%%2F2/bge-m3.embedfile?download=true && echo Gotowe^^!
|
51 |
+
if not exist pdftotext.exe echo Pobieranie pdftotext.exe... && curl -LO https://dl.xpdfreader.com/xpdf-tools-win-4.05.zip && tar --strip-components 2 -xf xpdf-tools-win-4.05.zip xpdf-tools-win-4.05/bin64/pdftotext.exe && del xpdf-tools-win-4.05.zip && echo Gotowe^^!
|
52 |
:; # Uruchamianie
|
53 |
+
echo Indeksowanie plików PDF/TXT/MD/CSV w folderze %inputDir%...
|
54 |
for %%F in ("%inputDir%\*.pdf") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && pdftotext -nopgbrk -enc UTF-8 "%%~F"
|
55 |
+
for %%F in ("%inputDir%\*.csv") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && powershell -Command "Import-Csv '%%~F' | %% { (($_.PSObject.Properties | %% { \"$($_.Name): $($_.Value)\" }) -join ', ') + '.'} | Out-File '%%~dpnF.txt' -Encoding utf8"
|
56 |
for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do (
|
57 |
echo Przetwarzanie %%~nxF...
|
58 |
(
|
|
|
81 |
)
|
82 |
echo Osadzanie plików...
|
83 |
if exist %dbFile% del %dbFile%
|
84 |
+
%embedfile% import %chunksFile% %dbFile% && echo Gotowe^^! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator.
|
85 |
del %chunksFile%
|
86 |
endlocal
|
87 |
pause
|