code-completion / archive-misc /dset_splitter.py
jblitzar's picture
Upload folder using huggingface_hub
a8639ac verified
import os
import tqdm
# get through all files in inputted path, put the first 80% in one file concatenated and the last 20% in another
def split_files(input_path, output_path1, output_path2):
files = [
f
for f in os.listdir(input_path)
if os.path.isfile(os.path.join(input_path, f)) and f.endswith(".py")
]
# Sort files to maintain order
files.sort()
split_index = int(len(files) * 0.8)
first_80_files = files[:split_index]
last_20_files = files[split_index:]
print(os.listdir(input_path))
with open(output_path1, "w") as outfile1:
for fname in tqdm.tqdm(first_80_files):
with open(os.path.join(input_path, fname), errors="ignore") as infile:
outfile1.write(infile.read())
outfile1.write("\nprint('---FILESEP---')\n")
with open(output_path2, "w") as outfile2:
for fname in tqdm.tqdm(last_20_files):
with open(os.path.join(input_path, fname), errors="ignore") as infile:
outfile2.write(infile.read())
outfile2.write("\nprint('---FILESEP---')\n")
# Example usage
input_path = os.path.expanduser("~/torch_datasets/github-python/all_trains")
output_path1 = os.path.expanduser("~/torch_datasets/github-python/80")
output_path2 = os.path.expanduser("~/torch_datasets/github-python/20")
split_files(input_path, output_path1, output_path2)