#!/bin/bash
#
# Run batch training jobs in parallel, one job per GPU.
# We vary subset size and (optionally) attention usage.
#
# Usage:
#   ./this_script.sh
#
# Optional environment variables:
#   START_GPU      First GPU id to use; each job takes the next id (default: 4).
#   RUN_ATTENTION  Set to 1 to also launch a --use-attention job for every
#                  subset size (default: 0, i.e. only the no-attention runs).
#
# Exit status: 0 when every training job exits successfully, 1 otherwise.
#
# NOTE: `set -e` is deliberately not used. Background jobs are not covered by
# it anyway; every job's exit status is checked explicitly via `wait <pid>`.

# Base values; they apply unchanged to the subset size whose multiplier is 1.
readonly BASE_EPOCHS=200
readonly BASE_CHECKPOINT_FREQ=20
readonly BASE_SAMPLE_FREQ=20

# Upper bound on epochs, to avoid excessive training on very small subsets.
readonly MAX_EPOCHS=100000

# Subset sizes to run
# declare -a SUBSET_SIZES=(2000 4000 6000 8000)
declare -a SUBSET_SIZES=(100 1000 10000 50000)

# Multipliers for each subset size to keep the number of iterations constant
# (multiplier = 50000 / subset size):
#   100 samples:   50000/100   = 500x
#   1000 samples:  50000/1000  = 50x
#   10000 samples: 50000/10000 = 5x
#   50000 samples: 50000/50000 = 1x
# declare -a MULTIPLIERS=(25 12.5 8.33 6.25)
declare -a MULTIPLIERS=(500 50 5 1)

# Filled by launch_job, consumed by main when collecting exit statuses.
declare -a JOB_PIDS=()
declare -a JOB_LABELS=()

#######################################
# Multiply an integer by a non-negative decimal factor, truncating the result
# to an integer. Uses exact integer arithmetic, so no external tool is needed
# and products below 1 yield "0" instead of an empty string.
# Arguments: $1 - non-negative integer base value
#            $2 - non-negative decimal factor (e.g. 5, 12.5, .5)
# Outputs:   the truncated product on stdout
# Returns:   0 on success, 1 if an argument is not a valid number
#######################################
scale_value() {
    local base=$1 factor=$2
    local int_re='^[0-9]+$'
    local dec_re='^([0-9]*)(\.([0-9]*))?$'
    local whole frac digits denom

    if [[ ! $base =~ $int_re ]]; then
        printf 'scale_value: invalid base value: %s\n' "$base" >&2
        return 1
    fi
    if [[ ! $factor =~ $dec_re ]]; then
        printf 'scale_value: invalid multiplier: %s\n' "$factor" >&2
        return 1
    fi
    whole=${BASH_REMATCH[1]}
    frac=${BASH_REMATCH[3]}
    digits="${whole}${frac}"
    if [[ -z $digits ]]; then
        printf 'scale_value: invalid multiplier: %s\n' "$factor" >&2
        return 1
    fi

    # factor == digits / 10^len(frac); "10#" forces base 10 so that leading
    # zeros (e.g. "05" from "0.5") are not parsed as octal.
    denom=$((10 ** ${#frac}))
    printf '%d\n' "$((base * 10#$digits / denom))"
}

#######################################
# Start one training run in the background and record its PID.
# Globals:   JOB_PIDS, JOB_LABELS (appended to)
# Arguments: $1 - GPU id
#            $2 - subset size
#            $3 - epochs
#            $4 - checkpoint frequency
#            $5 - sample frequency
#            $6 - variant: "noattn" or "attn"
# Outputs:   a status line on stdout; job output goes to a file under logs/
#######################################
launch_job() {
    local gpu=$1 subset_size=$2 epochs=$3 checkpoint_freq=$4 sample_freq=$5
    local variant=$6
    local description="no attention"
    local -a extra_args=()

    if [[ $variant == "attn" ]]; then
        description="with attention"
        extra_args=(--use-attention)
    fi

    local log_file="logs/train_${subset_size}_${variant}_gpu${gpu}.log"

    echo "Starting job: subset-size ${subset_size}, ${description}, GPU ${gpu}"
    python nn_baselines/train.py \
        --gpu "$gpu" \
        --subset-size "$subset_size" \
        "${extra_args[@]}" \
        --epochs "$epochs" \
        --checkpoint-freq "$checkpoint_freq" \
        --sample-freq "$sample_freq" \
        > "$log_file" 2>&1 &

    JOB_PIDS+=("$!")
    JOB_LABELS+=("subset-size ${subset_size}, ${description}, GPU ${gpu} (${log_file})")
}

#######################################
# Launch all configured training jobs, wait for them and report failures.
# Globals:   BASE_*, MAX_EPOCHS, SUBSET_SIZES, MULTIPLIERS (read)
#            START_GPU, RUN_ATTENTION (read, optional)
# Returns:   0 if every job succeeded, 1 on bad configuration or job failure
#######################################
main() {
    local gpu=${START_GPU:-4}
    local run_attention=${RUN_ATTENTION:-0}
    local int_re='^[0-9]+$'
    local i subset_size multiplier epochs checkpoint_freq sample_freq

    if [[ ! $gpu =~ $int_re ]]; then
        printf 'START_GPU must be a non-negative integer, got: %s\n' "$gpu" >&2
        return 1
    fi
    if ((${#SUBSET_SIZES[@]} != ${#MULTIPLIERS[@]})); then
        printf 'SUBSET_SIZES and MULTIPLIERS must have the same length\n' >&2
        return 1
    fi
    # Validate every multiplier up front so that a bad entry cannot abort the
    # script after some jobs have already been started.
    for multiplier in "${MULTIPLIERS[@]}"; do
        scale_value 1 "$multiplier" > /dev/null || return 1
    done

    # Create folder for output logs
    mkdir -p logs || return 1

    for i in "${!SUBSET_SIZES[@]}"; do
        subset_size=${SUBSET_SIZES[$i]}
        multiplier=${MULTIPLIERS[$i]}

        # Scale epochs and frequencies so the iteration count stays constant.
        epochs=$(scale_value "$BASE_EPOCHS" "$multiplier") || return 1
        checkpoint_freq=$(scale_value "$BASE_CHECKPOINT_FREQ" "$multiplier") || return 1
        sample_freq=$(scale_value "$BASE_SAMPLE_FREQ" "$multiplier") || return 1

        # For very small subset sizes, cap the epochs to avoid excessive training
        if ((epochs > MAX_EPOCHS)); then
            epochs=$MAX_EPOCHS
            checkpoint_freq=$((epochs / 10))
            sample_freq=$((epochs / 10))
        fi

        # A multiplier below 1 can truncate a value to 0; keep every value >= 1
        # so the training script never receives a zero epoch count or period.
        ((epochs >= 1)) || epochs=1
        ((checkpoint_freq >= 1)) || checkpoint_freq=1
        ((sample_freq >= 1)) || sample_freq=1

        # Run version without attention
        launch_job "$gpu" "$subset_size" "$epochs" "$checkpoint_freq" "$sample_freq" noattn
        gpu=$((gpu + 1))

        # Run version with attention (opt-in), on the next GPU
        if [[ $run_attention == "1" ]]; then
            launch_job "$gpu" "$subset_size" "$epochs" "$checkpoint_freq" "$sample_freq" attn
            gpu=$((gpu + 1))
        fi
    done

    local job_count=${#JOB_PIDS[@]}
    echo "Started $job_count jobs in parallel. Check logs in the logs/ directory."
    echo "Use 'nvidia-smi' to monitor GPU usage."

    # Wait for each background job individually so failures are not lost
    # (a bare `wait` discards the jobs' exit statuses).
    local failed=0 status
    for i in "${!JOB_PIDS[@]}"; do
        status=0
        wait "${JOB_PIDS[$i]}" || status=$?
        if ((status != 0)); then
            printf 'Job failed with exit status %d: %s\n' "$status" "${JOB_LABELS[$i]}" >&2
            failed=$((failed + 1))
        fi
    done

    if ((failed > 0)); then
        printf '%d of %d training jobs failed.\n' "$failed" "$job_count" >&2
        return 1
    fi
    echo "All training jobs completed."
}

main "$@"
