#!/bin/bash
#
# Serial pipeline: for each entry in `rates`, generate a perturbed pretraining
# corpus, convert it with convert_binary.py, and train a model on it with
# torchrun using 8 processes per node.
#
# Tasks are numbered TASK_OFFSET + <index into rates>; that number selects the
# data file/directory names, the run name and the training output directory.
#
# The script stops at the first failing step so that training is never
# launched on missing or stale data, and the final success message is only
# printed when every step succeeded.

set -euo pipefail

# Report where the pipeline stopped before the shell exits.
trap 'echo "Error: line $LINENO: command failed: $BASH_COMMAND" >&2' ERR

RESULT_DIR="hallucinate_small/"

# Number assigned to the first entry of `rates`.
TASK_OFFSET=16

rates=(2.0)

echo "Starting serial data generation and conversion..."
for i in "${!rates[@]}"; do
    rate=${rates[$i]}
    task_id=$((i + TASK_OFFSET))

    # Text file handed to convert_binary.py via -i, and the directory handed
    # to it via -o (later reused as the training --input_folder).
    # presumably generate_bios_by_rate.py writes $input_file — confirm.
    input_file="${RESULT_DIR}pretrain_perturbed${task_id}.txt"
    output_folder="${RESULT_DIR}pretrain_perturbed${task_id}"

    echo "Generating data for task ${task_id}: people_rate=$rate, input_folder=$input_file, output_folder=$output_folder"

    python generate_bios_by_rate.py --people_rate "$rate" --number "$task_id"

    python convert_binary.py -i "$input_file" -o "$output_folder" --val_shard_size 10000000

    # NOTE(review): this runs after the conversion step, exactly as before.
    # If convert_binary.py does not create its output directory itself, this
    # should move above the conversion call — confirm.
    mkdir -p "$output_folder"
    log_dir="/data/temp_log${task_id}"

    torchrun --standalone --nproc_per_node=8 train_gpt2.py \
        --input_folder "$output_folder" \
        --save_every 4000 \
        --val_loss_every 4000 \
        --run_name "xs_pretrain_small_${task_id}" \
        --warmup_ratio 0.05 \
        --warmdown_ratio 0.9 \
        --sequence_length 512 \
        --device_batch_size 16 \
        --num_epochs 4 \
        --weight_decay 0.1 \
        --learning_rate 0.0003 \
        --batch_size 128 \
        --bf16 \
        --model_size small \
        --output_dir "$log_dir"
done

# Every step above runs in the foreground, so there is nothing to wait for;
# reaching this line means all tasks finished successfully.
echo "All training tasks completed!"