#!/bin/bash
#
# Runs the Likert evaluator (evaluator/likert.py) for every configured model,
# once in "single" mode and once in "pair" mode.
#
# Text-only models are evaluated without images; multimodal models (MLLMS)
# are evaluated with the --with_image flag.
#
# All paths below are relative, so they resolve against the directory the
# script is launched from.
#
# Exit status: 0 if every evaluator run succeeded, 1 if at least one failed.

# Catch typos in variable names. `set -e` is deliberately NOT enabled: one
# failing model must not abort the remaining evaluations. Failures are
# recorded and reported at the end instead.
set -u

# Text-only models to evaluate.
readonly -a TEXT_MODELS=(
    "Qwen3-235B-A22B-Instruct-2507"
    "DeepSeek-R1-0528"
    "GLM-4.5"
    "DeepSeek-V3-0324"
    "Kimi-K2-Instruct"
)

# Multimodal models to evaluate (run with --with_image).
readonly -a MLLMS=(
    "Qwen2.5-VL-72B-Instruct"
    "gpt-4o"
    "gpt-4.1"
    "gemini-2.5-pro"
)

readonly LIKERT_SCRIPT="evaluator/likert.py"
readonly LIKERT_OUTPUT_DIR="outputs_full/likert"
readonly LIKERT_DATA_PATH="data/full.jsonl"

# "<model> (<mode>)" entries for every evaluator run that exited non-zero.
FAILED_RUNS=()

#######################################
# Run the evaluator once for a single model/mode combination.
# Globals:   LIKERT_SCRIPT, LIKERT_OUTPUT_DIR, LIKERT_DATA_PATH (read)
# Arguments: $1 - model name
#            $2 - evaluation mode ("single" or "pair")
#            $3 - "true" to pass --with_image, anything else to omit it
# Returns:   the exit status of the python process
#######################################
run_likert() {
    local model=$1
    local mode=$2
    local with_image=$3

    # Build the command line as an array so the optional flag can be inserted
    # without string concatenation or word-splitting surprises.
    local -a args=(--model "$model" --mode "$mode" --output_dir "$LIKERT_OUTPUT_DIR")
    if [[ "$with_image" == "true" ]]; then
        args+=(--with_image)
    fi
    args+=(--data_path "$LIKERT_DATA_PATH" --eval)

    python "$LIKERT_SCRIPT" "${args[@]}"
}

#######################################
# Evaluate each given model in both "single" and "pair" mode.
# A failing run is logged to stderr and recorded; the loop keeps going.
# Globals:   FAILED_RUNS (appended to)
# Arguments: $1   - "true" to evaluate with images, "false" otherwise
#            $2.. - model names
# Returns:   0 always; inspect FAILED_RUNS for failures
#######################################
evaluate_models() {
    local with_image=$1
    shift

    local model mode status
    for model in "$@"; do
        echo "================================="
        echo "Evaluating model: $model"
        for mode in single pair; do
            echo "==============${mode}=================="
            status=0
            run_likert "$model" "$mode" "$with_image" || status=$?
            if (( status != 0 )); then
                printf 'error: evaluation failed for model %s (mode %s), exit status %d\n' \
                    "$model" "$mode" "$status" >&2
                FAILED_RUNS+=("${model} (${mode})")
            fi
        done
    done
    return 0
}

#######################################
# Evaluate all text models, then all multimodal models, and report the result.
# Globals:   TEXT_MODELS, MLLMS (read), FAILED_RUNS (reset and written)
# Returns:   0 if every run succeeded, 1 otherwise
#######################################
main() {
    FAILED_RUNS=()

    evaluate_models false "${TEXT_MODELS[@]}"
    evaluate_models true "${MLLMS[@]}"

    if (( ${#FAILED_RUNS[@]} > 0 )); then
        printf '%d evaluation run(s) failed:\n' "${#FAILED_RUNS[@]}" >&2
        printf '  %s\n' "${FAILED_RUNS[@]}" >&2
        return 1
    fi

    echo "All evaluations complete."
}

# Last command in the file: its return status becomes the script's exit status.
main "$@"