#!/bin/bash
# Wait until one of the given GPUs is free, then launch a command on it.
#
# Usage: <script> GPU_IDS COMMAND [RETRY_INTERVAL] [REQUIRED_MEM]
#   GPU_IDS         comma-separated list of GPU ids to consider, e.g. "0,1,3"
#   COMMAND         command line to run (a single string; it is word-split
#                   by the shell when it is executed)
#   RETRY_INTERVAL  argument handed to `sleep` between polls (default: 10)
#   REQUIRED_MEM    minimum free GPU memory, compared as an integer against
#                   nvidia-smi's memory.free value (default: 29360)

# errexit is explicitly kept off: the polling below has to survive failing
# probe commands.
set +e

# Refuse to start without the two mandatory arguments; otherwise the script
# would either poll forever (no GPUs) or try to execute an empty command.
if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
    printf 'Usage: %s GPU_IDS COMMAND [RETRY_INTERVAL] [REQUIRED_MEM]\n' "${0##*/}" >&2
    exit 2
fi

gpu_ids=$1
command_to_run=$2
retry_interval=${3:-"10"}
required_mem=${4:-"29360"}

# required_mem is used in an integer comparison later, so reject anything
# else up front instead of failing on every poll.
if ! [[ "$required_mem" =~ ^[+-]?[0-9]+$ ]]; then
    printf 'Error: REQUIRED_MEM must be an integer, got "%s"\n' "$required_mem" >&2
    exit 2
fi

# Split the comma-separated list, dropping whitespace and empty entries
# (e.g. "0, 1,,2" -> 0 1 2).
IFS=',' read -ra raw_gpu_ids <<< "$gpu_ids"
gpu_array=()
for raw_gpu_id in "${raw_gpu_ids[@]}"; do
    raw_gpu_id=${raw_gpu_id//[[:space:]]/}
    if [ -n "$raw_gpu_id" ]; then
        gpu_array+=("$raw_gpu_id")
    fi
done

if [ "${#gpu_array[@]}" -eq 0 ]; then
    printf 'Error: no GPU ids found in "%s"\n' "$gpu_ids" >&2
    exit 2
fi

# Randomise the probing order so that concurrent instances of this script do
# not all try the same GPU first. Read line by line to avoid globbing and
# word splitting of the shuf output.
shuffled_array=()
while IFS= read -r shuffled_gpu_id; do
    shuffled_array+=("$shuffled_gpu_id")
done < <(shuf -e -- "${gpu_array[@]}")

# If shuf is unavailable or failed, fall back to the order given by the user
# rather than polling an empty list forever.
if [ "${#shuffled_array[@]}" -eq 0 ]; then
    shuffled_array=("${gpu_array[@]}")
fi

# Log the command exactly as given (no glob expansion, no whitespace folding).
printf '%s\n' "$command_to_run"

# Main polling loop.
#
# Reads:  shuffled_array (GPU ids to probe), command_to_run, retry_interval,
#         required_mem.
# Each round:
#   1. Skip the GPU check while overall CPU usage is >= 90%.
#   2. Probe the GPUs in order; the first one with at least required_mem free
#      memory and utilisation below 99% is selected.
#   3. Run "<command> --check true" on it; exit status 17 means the work is
#      already done and the script exits 0 without launching anything.
#   4. Otherwise start the command in the background on that GPU, wait
#      retry_interval, and leave the loop.
# If no GPU qualifies, wait 60 seconds and try again.
found_available_gpu=0
while [ "$found_available_gpu" -eq 0 ]; do

    # Derive CPU usage as 100 - idle from top's summary line. The idle value
    # is located by its "id" label instead of a fixed field number: procps
    # prints "ni,100.0 id" without a space when idle is 100.0, which shifts
    # the fields and would otherwise be read as 100% usage on an idle host.
    # LC_ALL=C keeps the decimal separator a dot.
    cpu_usage=$(LC_ALL=C top -bn1 | awk '
        /Cpu\(s\)/ {
            gsub(/,/, " ")
            for (i = 2; i <= NF; i++) {
                if ($i == "id") {
                    printf "%.0f\n", 100 - $(i - 1)
                    exit
                }
            }
        }')
    # The CPU gate is best effort: if the value could not be parsed, treat
    # the CPU as idle rather than blocking forever.
    if ! [[ "$cpu_usage" =~ ^-?[0-9]+$ ]]; then
        cpu_usage=0
    fi
    if [ "$cpu_usage" -ge 90 ]; then
        echo "The CPU usage is too high (${cpu_usage}%), skipping the GPU check..."
        sleep "$retry_interval"
        continue
    fi

    for gpu_id in "${shuffled_array[@]}"; do
        gpu_info=$(nvidia-smi --query-gpu=memory.free,utilization.gpu --format=csv,noheader,nounits -i "$gpu_id" 2>/dev/null)
        if [ -z "$gpu_info" ]; then
            echo "Warning: Failed to obtain information about GPU ${gpu_id}"
            continue
        fi

        # Expected shape: "<free memory>, <utilisation>".
        IFS=', ' read -r mem_free gpu_util <<< "$gpu_info"

        # nvidia-smi can report placeholders such as "[N/A]" or print an
        # error text; skip the GPU instead of feeding that to an integer test.
        if ! [[ "$mem_free" =~ ^[0-9]+$ && "$gpu_util" =~ ^[0-9]+$ ]]; then
            echo "Warning: Unexpected nvidia-smi output for GPU ${gpu_id}: ${gpu_info}"
            continue
        fi

        if [ "$mem_free" -ge "$required_mem" ] && [ "$gpu_util" -lt 99 ]; then
            echo "GPU ${gpu_id} is available (with ${mem_free}MB free), executing the command..."

            # command_to_run is deliberately left unquoted: it is a whole
            # command line in one string and must be word-split here.
            exit_code=0
            # shellcheck disable=SC2086
            CUDA_VISIBLE_DEVICES=$gpu_id $command_to_run --check true || exit_code=$?

            # Exit status 17 from the check run signals "already done".
            if [ "$exit_code" -eq 17 ]; then
                echo "The training file already exists. Exiting the GPU resource polling script..."
                exit 0
            fi

            # Any other check result: launch the real run in the background.
            # shellcheck disable=SC2086
            CUDA_VISIBLE_DEVICES=$gpu_id $command_to_run &
            found_available_gpu=1
            break
        fi
    done

    if [ "$found_available_gpu" -eq 1 ]; then
        # gpu_id still holds the GPU selected before the break above.
        echo "The command has been executed on GPU ${gpu_id}, and the GPU resource polling script will exit after waiting for ${retry_interval} seconds."
        sleep "$retry_interval"
    else
        echo "All the specified GPUs are currently unavailable. Retrying after waiting for 60 seconds..."
        sleep 60
    fi

done