#!/bin/bash

# Make sure the results and log directories exist before any evaluation starts
mkdir -p ./results/greedy_sample ./logs

# Record the script start time as YYYYMMDD_HHMMSS
timestamp="$(date '+%Y%m%d_%H%M%S')"

# Run a single `python -m lm_eval` invocation (vllm backend) with the shared
# greedy-decoding settings.
# Arguments:
#   $1 - model path or hub ID passed as pretrained=...
#   $2 - GPU id exported as CUDA_VISIBLE_DEVICES for this invocation
#   $3 - comma-separated task list for --tasks
#   $4 - value for --limit
#   $5 - value for --output_path
# Returns: exit status of the lm_eval invocation.
_run_lm_eval_suite() {
    local model_path=$1
    local gpu_id=$2
    local tasks=$3
    local limit=$4
    local output_path=$5

    CUDA_VISIBLE_DEVICES="$gpu_id" HF_ALLOW_CODE_EVAL=1 HF_DATASETS_TRUST_REMOTE_CODE=true confirm_run_unsafe_code=true python -m lm_eval \
        --model vllm \
        --model_args "pretrained=${model_path},trust_remote_code=True,dtype=bfloat16" \
        --tasks "$tasks" \
        --confirm_run_unsafe_code \
        --limit "$limit" \
        --batch_size 300 \
        --device cuda \
        --log_samples \
        --trust_remote_code \
        --apply_chat_template \
        --output_path "$output_path" \
        --gen_kwargs do_sample=False,num_return_sequences=1,temperature=0,top_p=0.8,top_k=20,max_length=8096,repetition_penalty=1.2,max_gen_toks=4096
}

# Run the greedy-sampling evaluation suites for one model on one GPU.
# Three lm_eval invocations are run in sequence (general tasks, grouped
# tasks, wmt16); a failing suite does not stop the following ones.
# All output (stdout and stderr) is shown on the terminal and appended to
# ./logs/greedy_<model_name>.log.
# Arguments:
#   $1 - model path or hub ID (the last path component names the log file)
#   $2 - GPU id to run on
# Returns: 0 if every suite succeeded, otherwise the exit status of the last
#          suite that failed; 2 if an argument is missing.
run_model_eval() {
    local model_path=$1
    local gpu_id=$2

    if [[ -z "$model_path" || -z "$gpu_id" ]]; then
        echo "Usage: run_model_eval <model_path> <gpu_id>" >&2
        return 2
    fi

    # Extract model name from path
    local model_name
    model_name=$(basename -- "$model_path")

    # Sampling configuration
    local sampling_method="greedy"
    local num_sequences=1

    local log_dir="./logs"
    local log_file="${log_dir}/greedy_${model_name}.log"
    local output_dir="./results/${sampling_method}_sample"

    # Create the directories here so the function does not depend on the caller
    mkdir -p -- "$log_dir" "$output_dir"

    # Each entry: <tasks>|<limit>|<output directory prefix>
    local -a suites=(
        "arc_challenge_chat,coqa,drop,ifeval,gsm8k,minerva_math_algebra|250|sample"
        "bbh_cot_zeroshot,mmlu_pro_plus|25|group_sample"
        "wmt16|70|wmt16_sample"
    )
    local suite tasks limit prefix
    local suite_status
    local overall_status=0

    # Explicit subshell: its exit status carries the evaluation result out of
    # the pipeline (read back through PIPESTATUS below).
    (
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Starting evaluation for $model_name on GPU $gpu_id"
        echo "Running evaluation with:"
        echo "- Sampling method: $sampling_method"
        echo "- Model: $model_name"
        echo "- Number of sequences: $num_sequences"
        echo "- GPU: $gpu_id"
        echo "- Log file: $log_file"

        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Evaluation command started"

        for suite in "${suites[@]}"; do
            IFS='|' read -r tasks limit prefix <<<"$suite"
            _run_lm_eval_suite "$model_path" "$gpu_id" "$tasks" "$limit" \
                "${output_dir}/${prefix}_${num_sequences}"
            suite_status=$?
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] Tasks ${tasks} finished with exit code: ${suite_status}"
            if (( suite_status != 0 )); then
                overall_status=$suite_status
            fi
        done

        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Evaluation command completed with exit code: $overall_status"
        exit "$overall_status"
    ) 2>&1 | tee -a "$log_file"  # -a: append, so the header written above is never truncated

    # Report the evaluation status, not tee's
    return "${PIPESTATUS[0]}"
}

# Launch the evaluations. A trailing '&' puts a run in the background so it
# overlaps with the runs that follow; uncomment lines to change the selection.
#run_model_eval "Qwen/Qwen2.5-1.5B-Instruct" 3 &
run_model_eval "Qwen/Qwen2.5-7B-Instruct" 6 &
#run_model_eval "meta-llama/Llama-3.2-1B-Instruct" 1 &
#run_model_eval "meta-llama/Llama-3.2-3B-Instruct" 2 &
#run_model_eval "meta-llama/Llama-3.1-8B-Instruct" 5 &

# Foreground runs: these execute one after another in the main shell
#run_model_eval "Qwen/Qwen2.5-1.5B-Instruct" 6
#run_model_eval "Qwen/Qwen2.5-7B-Instruct" 6
#run_model_eval "Qwen/Qwen2.5-3B-Instruct" 7
#run_model_eval "meta-llama/Llama-3.2-1B-Instruct" 7
#run_model_eval "meta-llama/Llama-3.2-3B-Instruct" 7
run_model_eval "meta-llama/Llama-3.1-8B-Instruct" 7

# Block until every backgrounded evaluation has exited
wait

#echo "[$(date '+%Y-%m-%d %H:%M:%S')] All evaluations completed!" | tee -a "./logs/main_${timestamp}.log"
printf '%s\n' "Log files are available in the ./logs directory"

