#!/usr/bin/env bash
# Parameter settings.
# Every value below is exported so that the child scripts launched by this
# file can read it from the environment. Values left empty are placeholders
# that must be filled in before running.

# Python interpreter inside the "mlagentbench" conda environment.
# ${HOME} is used instead of "~": a tilde inside double quotes is never
# expanded by Bash, so the quoted "~/..." form would export a literal "~".
export PYTHON="${HOME}/miniconda3/envs/mlagentbench/bin/python"
export BENCHMARK="MLAgentBench"
# WANDB_PROJECT and EXPERIMENT_NAME are interpolated into the ML-AGENT-PPO
# checkpoint path; SFT_MODEL into the ML-AGENT-SFT checkpoint path.
export WANDB_PROJECT=
export EXPERIMENT_NAME=
export SFT_MODEL=
export NUM_STEPS=15
# NOTE(review): GENERATOR_URL, HUB_URL and HUB_KEY are not referenced in this
# file; presumably they are consumed by the child scripts - confirm there.
export GENERATOR_URL=

export HUB_URL=
export HUB_KEY=

export MAX_RETRY=3 # Maximum number of retries for 1 trajectory
export TIMEOUT_DURATION_TRAJ="1800" # Timeout duration for 1 trajectory
export RUNS_PER_TASK=8  # Number of runs per task
export TIMEOUT_DURATION_EVA="120m" # Timeout duration for score


# Script start, in seconds since the epoch.
START_TIME="$(date +%s)"

# Entries accepted for the response format, one per line. Their exact meaning
# is defined by the scripts that consume VALID_FORMAT_ENTRIES_STR.
VALID_FORMAT_ENTRIES=(
    "Reflection"
    "Research Plan and Status"
    "Fact Check"
    "Thought"
    "Action"
    "Action Input"
    # "Action" "Action Input"
)

# Arrays cannot be exported, so publish the list as one comma-separated
# string: append a comma after every entry, then drop the final comma.
printf -v VALID_FORMAT_ENTRIES_STR '%s,' "${VALID_FORMAT_ENTRIES[@]}"
export VALID_FORMAT_ENTRIES_STR="${VALID_FORMAT_ENTRIES_STR%,}"
# Print the configured entries, then pause for 5 seconds (presumably so the
# operator can review them and abort before the run starts).
# "[*]" joins the array into a single word; embedding "[@]" inside a larger
# quoted string mixes a string with an array (ShellCheck SC2145). With the
# default IFS the printed text is unchanged.
echo "Your valid format entries are: ${VALID_FORMAT_ENTRIES[*]}"
sleep 5

# Names of the tasks to run, one per line.
TASK_LIST=(
    "cifar10"
    "feedback"
    "house-price"
    "denoising-dirty-documents"
    "leaf-classification"
    "statoil-iceberg-classifier-challenge"
    "whale-categorization-playground"
    "learning-agency-lab-automated-essay-scoring-2"
    "detecting-insults-in-social-commentary"
    "spooky-author-identification"
    "jigsaw-toxic-comment-classification-challenge"
    "us-patent-phrase-to-phrase-matching"
    "tabular-playground-series-dec-2021"
)

# Arrays cannot be exported, so publish the list as one comma-separated
# string: append a comma after every task, then drop the final comma.
printf -v TASK_LIST_STR '%s,' "${TASK_LIST[@]}"
export TASK_LIST_STR="${TASK_LIST_STR%,}"

# Lookup table from a short model name to the value exported as GENERATOR or
# CODER. Some entries map to their own name; others map to a local checkpoint
# directory.
# Home-relative paths use ${HOME} instead of "~": a tilde inside double
# quotes is never expanded by Bash, so "~/model/..." would remain a literal
# string that is not a valid path.
# The ML-AGENT-* entries are relative paths built from SFT_MODEL,
# WANDB_PROJECT and EXPERIMENT_NAME, which must be set before this point.
declare -A model_dict=(
    ["gpt-4o-mini"]="gpt-4o-mini"
    ["gpt-4o-2024-08-06"]="gpt-4o-2024-08-06"
    ["Qwen2.5_7B_Instruct"]="${HOME}/model/Qwen2.5-7B-Instruct"
    ["Llama-3.1-8B-Instruct"]="${HOME}/model/Llama-3.1-8B-Instruct"
    ["DeepSeek-R1-Distill-Qwen-32B"]="${HOME}/model/DeepSeek-R1-Distill-Qwen-32B"
    ["deepseek-r1"]="deepseek-r1"
    ["deepseek-v3"]="deepseek-v3"
    ["qwen25_32b_coder_instruct"]="${HOME}/model/Qwen2.5-Coder-32B-Instruct"
    ["qwen25_32b_instruct"]="${HOME}/model/Qwen2.5-32B-Instruct"
    ["ML-AGENT-SFT"]="LLaMA-Factory/checkpoint/sft/${SFT_MODEL}/checkpoint-312"
    ["ML-AGENT-PPO"]="verl/verl/checkpoints/${WANDB_PROJECT}/${EXPERIMENT_NAME}/global_step_39/actor/huggingface"
)

# NOTE(review): TEMPLATE_NAME is not used in this file; presumably it selects
# a prompt/chat template in the child scripts - confirm there.
export TEMPLATE_NAME="qwen-new"
# Coder model: short name plus the value resolved through model_dict.
export CODER_NAME="qwen25_32b_coder_instruct"
export CODER="${model_dict[$CODER_NAME]}"

# Models to evaluate, in order. Each name must be a key of model_dict.
# Uncomment a line to add that model to the run.
model_list=()
# model_list+=("deepseek-r1")
model_list+=("gpt-4o-mini")
# model_list+=("gpt-4o-2024-08-06")


# CUDA_VISIBLE_DEVICES=4,5,6,7 python -m vllm.entrypoints.openai.api_server --model ~/model/Qwen2.5-Coder-32B-Instruct  --port 30020 --tensor-parallel-size 4

# Run the whole pipeline once per selected model: pre-flight checks,
# inference, generator shutdown, scoring and result display.
for model_name in "${model_list[@]}"; do
    START_TIME=$(date +%s) # Restart the timer so the report below is per model

    # A name without a model_dict entry would export an empty GENERATOR;
    # stop with a clear message instead.
    if [[ -z "${model_dict[$model_name]:-}" ]]; then
        echo "Unknown model '${model_name}': no entry in model_dict" >&2
        exit 1
    fi

    export GENERATOR_NAME="$model_name"
    export GENERATOR="${model_dict[$GENERATOR_NAME]}"

    # Pre-flight checks. A failure aborts the run with a non-zero status so
    # that callers and CI can tell an aborted run from a successful one.
    # Check that the coder model is online
    if ! bash scripts/judge/detect_coder.sh; then
        echo "scripts/judge/detect_coder.sh failed; aborting" >&2
        exit 1
    fi
    # Deploy and check the local generator on GPU 0 or check the remote generator is available
    if ! bash scripts/judge/detect_generator.sh; then
        echo "scripts/judge/detect_generator.sh failed; aborting" >&2
        exit 1
    fi

    # The remaining steps always run in this order; their exit statuses are
    # intentionally not checked, so the generator is killed even if inference fails.
    bash scripts/judge/infer.sh # Run the inference
    bash scripts/judge/kill.sh # Kill the generator
    bash scripts/judge/score.sh # Score the results
    bash scripts/judge/show.sh # Show the results

    echo "Total execution time: $(($(date +%s) - START_TIME)) seconds"
done





