#!/bin/bash

# Make sure the output locations exist before any evaluation starts.
mkdir -p ./results/temperature_sample ./logs

# Start-up time as YYYYMMDD_HHMMSS, captured once for use in log file names.
timestamp="$(date '+%Y%m%d_%H%M%S')"

# Evaluation grid: every temperature is combined with every repeat count.
# Repeat counts are listed from largest to smallest; the ascending
# alternative would be (1 3 5 10 25 50 75 100).
declare -a temperatures=(0.5 0.75 1.0 1.25 1.5)
declare -a repeat_counts=(100 75 50 25 10 5 3 1)

# Batch size budget; run_model_eval divides it by the number of sequences.
base_batch_size=70

#######################################
# Invoke lm_eval once on a single GPU with the shared vLLM settings.
# Arguments:
#   $1 - GPU id exported as CUDA_VISIBLE_DEVICES
#   $2 - model path / hub id passed as pretrained=
#   $3 - value for --sampling_method
#   $4 - comma-separated task list
#   $5 - value for --limit
#   $6 - value for --output_path
#   $7 - value for --gen_kwargs
# Returns:
#   The exit status of the lm_eval process.
#######################################
_run_lm_eval() {
    local gpu_id=$1
    local model_path=$2
    local sampling_method=$3
    local tasks=$4
    local limit=$5
    local output_path=$6
    local gen_kwargs=$7

    CUDA_VISIBLE_DEVICES="$gpu_id" HF_ALLOW_CODE_EVAL=1 HF_DATASETS_TRUST_REMOTE_CODE=true confirm_run_unsafe_code=true python -m lm_eval \
        --model vllm \
        --model_args "pretrained=${model_path},trust_remote_code=True,dtype=bfloat16" \
        --tasks "$tasks" \
        --confirm_run_unsafe_code \
        --limit "$limit" \
        --batch_size auto \
        --device cuda \
        --trust_remote_code \
        --apply_chat_template \
        --sampling_method "$sampling_method" \
        --output_path "$output_path" \
        --gen_kwargs "$gen_kwargs"
}

#######################################
# Run the evaluation suite for one model at one temperature / repeat count.
#
# Three lm_eval runs are executed back to back on the given GPU: the main
# task mix, the bbh_cot_zeroshot + mmlu_pro_plus group, and wmt16. Everything
# printed is shown on the terminal and appended to a per-configuration log
# file under ./logs, so the configuration header and the evaluation output
# end up in the same file.
#
# Globals:
#   base_batch_size (read) - batch size budget; falls back to 70 when unset.
# Arguments:
#   $1 - model path / hub id
#   $2 - GPU id
#   $3 - sampling temperature
#   $4 - number of sequences returned per prompt (positive integer)
# Returns:
#   0 when all three lm_eval runs succeed, otherwise the exit status of the
#   last run that failed; 2 when an argument is not a positive integer.
#######################################
run_model_eval() {
    local model_path=$1
    local gpu_id=$2
    local temperature=$3
    local num_sequences=$4
    local base=${base_batch_size:-70}

    # Both values feed shell integer arithmetic below; reject anything that
    # would divide by zero or be misread (e.g. leading zeros as octal).
    local positive_int='^[1-9][0-9]*$'
    if ! [[ "$num_sequences" =~ $positive_int && "$base" =~ $positive_int ]]; then
        printf 'run_model_eval: num_sequences (%s) and base_batch_size (%s) must be positive integers\n' \
            "$num_sequences" "$base" >&2
        return 2
    fi

    # Model name is the last component of the model path.
    local model_name
    model_name=$(basename "$model_path")

    # Scale the batch size so the effective batch size stays roughly constant:
    #   adjusted_batch_size = floor(base_batch_size / num_sequences), minimum 1.
    # NOTE: this value is only reported in the log; lm_eval itself is started
    # with --batch_size auto.
    local adjusted_batch_size=$(( base / num_sequences ))
    if (( adjusted_batch_size < 1 )); then
        adjusted_batch_size=1
    fi

    # Sampling method label (generation uses do_sample=True with a temperature).
    local sampling_method="temperature"

    local log_file="./logs/temp${temperature}_repeat${num_sequences}_${model_name}.log"
    local output_dir="./results/${sampling_method}_sample/temp${temperature}_repeat${num_sequences}"
    local gen_kwargs="do_sample=True,num_return_sequences=${num_sequences},temperature=${temperature},top_p=0.8,top_k=20,max_length=8096,repetition_penalty=1.2,max_gen_toks=4096"

    # Create the log and result directories up front so tee/lm_eval can write.
    mkdir -p "${log_file%/*}" "$output_dir"

    # Configuration header.
    {
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Starting evaluation for $model_name on GPU $gpu_id"
        echo "Running evaluation with:"
        echo "- Sampling method: $sampling_method"
        echo "- Temperature: $temperature"
        echo "- Model: $model_name"
        echo "- Number of sequences: $num_sequences"
        echo "- Adjusted batch size: $adjusted_batch_size (base: $base)"
        echo "- Effective batch size: $((adjusted_batch_size * num_sequences))"
        echo "- GPU: $gpu_id"
        echo "- Log file: $log_file"
    } | tee -a "$log_file"

    # Run the evaluations. The status is tracked explicitly because `$?` in
    # the closing message would only reflect the most recent command, not the
    # lm_eval runs. tee must append (-a) or it would wipe the header above.
    local rc=0
    (
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Evaluation command started"

        _run_lm_eval "$gpu_id" "$model_path" "$sampling_method" \
            "arc_challenge_chat,coqa,drop,ifeval,gsm8k,minerva_math_algebra" 250 \
            "$output_dir" "$gen_kwargs" || rc=$?

        _run_lm_eval "$gpu_id" "$model_path" "$sampling_method" \
            "bbh_cot_zeroshot,mmlu_pro_plus" 25 \
            "${output_dir}_group_sample" "$gen_kwargs" || rc=$?

        # wmt16
        _run_lm_eval "$gpu_id" "$model_path" "$sampling_method" \
            "wmt16" 70 \
            "${output_dir}_wmt16_sample" "$gen_kwargs" || rc=$?

        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Evaluation command completed with exit code: $rc"
        exit "$rc"
    ) 2>&1 | tee -a "$log_file"

    # Report the evaluation status (first pipeline stage), not tee's.
    return "${PIPESTATUS[0]}"
}

# Driver: one background worker per GPU, each working through the whole
# temperature x repeat-count grid sequentially for its assigned model.

# Print a message prefixed with the current date and time.
_log_ts() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Run every (temperature, repeat count) combination for one model on one GPU,
# one evaluation at a time.
#   $1 - GPU id
#   $2 - model path / hub id
_run_gpu_queue() {
    local gpu_id=$1
    local model_path=$2
    local temp repeat

    _log_ts "Starting sequence of evaluations for GPU $gpu_id with model $model_path"

    for temp in "${temperatures[@]}"; do
        for repeat in "${repeat_counts[@]}"; do
            run_model_eval "$model_path" "$gpu_id" "$temp" "$repeat"
            _log_ts "Completed evaluation with temp=$temp, repeat=$repeat on GPU $gpu_id"
        done
    done

    _log_ts "All evaluations for GPU $gpu_id completed"
}

_log_ts "Starting temperature-based evaluations"

# GPU id -> model evaluated on that GPU. Keys are unique, so each GPU hosts
# exactly one model; commented entries are alternative assignments.
declare -A gpu_model_map=(
#    ["1"]="meta-llama/Llama-3.2-1B-Instruct"
#    ["3"]="Qwen/Qwen2.5-1.5B-Instruct"
#    ["4"]="meta-llama/Llama-3.2-3B-Instruct"
    ["2"]="meta-llama/Llama-3.1-8B-Instruct"
    ["3"]="Qwen/Qwen2.5-7B-Instruct"
#    ["2"]="Qwen/Qwen2.5-3B-Instruct"
)

# Fan out: GPUs run in parallel, work on each GPU stays serial.
for gpu_id in "${!gpu_model_map[@]}"; do
    _run_gpu_queue "$gpu_id" "${gpu_model_map[$gpu_id]}" &
done

# Barrier: block until every per-GPU worker has exited.
wait

_log_ts "All evaluations completed!"
printf '%s\n' \
    "Log files are available in the ./logs directory" \
    "Results are available in the ./results/temperature_sample directory"

## Also apply the dynamic batch_size adjustment to the standalone evaluation command
#num_sequences=3
#base_batch_size=70
#adjusted_batch_size=$(python -c "import math; print(max(1, math.floor($base_batch_size / $num_sequences)))")
#
# arc_challenge_chat,coqa,drop,ifeval,gsm8k,minerva_math_algebra,bbh_cot_zeroshot,mmlu_pro_plus,wmt16
#CUDA_VISIBLE_DEVICES=2 HF_ALLOW_CODE_EVAL=1 HF_DATASETS_TRUST_REMOTE_CODE=true confirm_run_unsafe_code=true python -m lm_eval \
#            --model vllm \
#            --model_args pretrained=Qwen/Qwen2.5-3B-Instruct,trust_remote_code=True,dtype=bfloat16 \
#            --tasks wmt16 \
#            --confirm_run_unsafe_code \
#            --limit 20 \
#            --batch_size 50 \
#            --device cuda \
#            --trust_remote_code \
#            --log_samples \
#            --apply_chat_template \
#            --sampling_method temperature \
#            --output_path ./results/temperature_sample/test \
#            --gen_kwargs do_sample=True,num_return_sequences=5,temperature=1.0,top_k=20,top_p=0.8,max_length=8096,repetition_penalty=1.2,max_gen_toks=4096