#!/bin/bash
# Launches llama_reader_clean.py with one dataset preset.
#
# A preset is a single line assigning dataset_name, device_num,
# downsample_rate, max_len and batch_size. Exactly one preset should be
# active; the others are kept commented out so they can be re-enabled by
# swapping which line is uncommented.
#
# The value passed as --save_path is derived from the preset:
#   xxx/llama_reader_larger/<dataset_name>/ratio_<downsample_rate>/

# --- Alternative presets (disabled) ------------------------------------------
# dataset_name=arxiv-math; device_num=2; downsample_rate=0.1; max_len=1024; batch_size=1
# dataset_name=gsm8k; device_num=0; downsample_rate=0.1; max_len=512; batch_size=2
# dataset_name=alpaca-gpt4; device_num=1; downsample_rate=0.1; max_len=1024; batch_size=1
# dataset_name=databricks-dolly-15k; device_num=1; downsample_rate=0.3; max_len=1024; batch_size=1
# dataset_name=fineweb; device_num=4; downsample_rate=0.02; max_len=1024; batch_size=1 # takes about 14.5 mins to finish

# Optional delay before the run starts (disabled).
# sleep 0.6h

# --- Active preset -----------------------------------------------------------
dataset_name=OpenOrca; device_num=4; downsample_rate=0.05; max_len=1024; batch_size=1

save_path="xxx/llama_reader_larger/${dataset_name}/ratio_${downsample_rate}/"
printf 'save_path: %s\n' "${save_path}"

# Every expansion is quoted so that a value containing whitespace or glob
# characters is handed to the Python script as exactly one argument.
python llama_reader_clean.py \
  --dataset_name "${dataset_name}" \
  --device_num "${device_num}" \
  --downsample_rate "${downsample_rate}" \
  --save_path "${save_path}" \
  --max_len "${max_len}" \
  --batch_size "${batch_size}"
