#!/bin/bash
#
# Serial pipeline: for each entry in `rates`, generate a perturbed pretraining
# corpus, convert it to binary shards, and train a GPT-2 model on the result.
#
# The Python entry points and RESULT_DIR are referenced by relative path, so
# run this script from the directory that contains them.

# Abort on the first failing step: without this a failed generation or
# conversion would still launch an 8-process training run on missing/stale
# data, and the final "completed" message would be printed regardless.
set -euo pipefail

readonly RESULT_DIR="hallucinate_small/"

# Task numbering starts here; task N corresponds to rates[N - TASK_OFFSET].
readonly TASK_OFFSET=62

# One pipeline run per people_rate value.
rates=(0.5)

echo "Starting serial data generation and conversion..."
for i in "${!rates[@]}"; do
    rate=${rates[$i]}
    task_id=$((i + TASK_OFFSET))

    # NOTE(review): despite its name, input_folder is a .txt path. It is
    # presumably the file generate_bios_by_num_data.py writes for --number
    # "$task_id" -- confirm against that script.
    input_folder="${RESULT_DIR}pretrain_perturbed${task_id}.txt"
    output_folder="${RESULT_DIR}pretrain_perturbed${task_id}"
    log_dir="/data/temp_log${task_id}"

    mkdir -p "$output_folder"
    echo "Generating data for task ${task_id}: people_rate=$rate, input_folder=$input_folder, output_folder=$output_folder"

    python generate_bios_by_num_data.py --people_rate "$rate" --number "$task_id"

    python convert_binary.py -i "$input_folder" -o "$output_folder" --val_shard_size 10000000

    # Runs in the foreground, so tasks execute strictly one after another.
    torchrun --standalone --nproc_per_node=8 train_gpt2.py \
        --input_folder "$output_folder" \
        --save_every 2000 \
        --val_loss_every 2000 \
        --run_name "xs_pretrain_small_${task_id}" \
        --warmup_ratio 0.05 \
        --warmdown_ratio 0.9 \
        --sequence_length 512 \
        --device_batch_size 16 \
        --num_epochs 4 \
        --weight_decay 0.1 \
        --learning_rate 0.0003 \
        --batch_size 128 \
        --bf16 \
        --model_size xxs \
        --output_dir "$log_dir"
done

# Only reached when every step of every task exited successfully.
echo "All training tasks completed!"