#! /bin/bash

########################################################
## YOUR SETTINGS
# Credential forwarded to the evaluation script as --api_key.
API_KEY='YOUR_API_KEY'

## vLLM
# Model identifier and server endpoint (forwarded as --model_name / --base_url).
MODEL_NAME='YOUR_MODEL_NAME'
BASE_URL='YOUR_VLLM_SERVER_URL/v1'

# Name forwarded as --save_model_name.
SAVE_MODEL_NAME='YOUR_SAVE_MODEL_NAME'

## Evaluation Config
# Candidate EVALUATION_MODE values for each PROMPT_TYPE:
#   ours -> judge_with_checklist_generation, judge_with_gt_checklist
PROMPT_TYPE='ours'
EVALUATION_MODE='judge_with_checklist_generation'
# Allowed: True, False
USE_IN_PROGRESS='True'
# Allowed: auto_regressive (default), bt_modeling
REWARD_MODEL_TYPE='auto_regressive'
# Allowed: text_only, image_only, text_image
INPUT_TYPE='text_only'
# 1 = default behavior; >1 = average & self-consistency
# (change TEMPERATURE when using a value >1).
NUM_GENERATE='5'
# Range: 0.0 ~ 1.0
TEMPERATURE='1.0'
# Forwarded as --use_log_probs.
USE_LOGPROBS='True'
# Allowed: False (default), True (only for bt_modeling)
USE_BATCH='False'

# Use our checklist. Default: False; only enable it when EVALUATION_MODE is
# judge_with_checklist_generation.
USE_OUR_CHECKLIST='False'
OUR_CHECKLIST_PATH='YOUR_OUR_CHECKLIST_PATH'

## Others
NUM_WORKERS='20'
DATASET_NAME='YOUR_DATASET_NAME'
SAVE_DIR='results_benchmark'
########################################################

# Launch src/evaluate_minibench.py with the settings configured in this file.
#
# The arguments are collected in a Bash array and every expansion is quoted,
# so values containing spaces or glob characters (paths, URLs, model names)
# reach the Python script as exactly one argument each instead of being
# word-split by the shell.
eval_args=(
    --dataset_name "$DATASET_NAME"
    --save_dir "$SAVE_DIR"
    --save_model_name "$SAVE_MODEL_NAME"
    --model_name "$MODEL_NAME"
    --base_url "$BASE_URL"
    --api_key "$API_KEY"
    --num_generate "$NUM_GENERATE"
    --reward_model_type "$REWARD_MODEL_TYPE"
    --evaluation_mode "$EVALUATION_MODE"
    --prompt_type "$PROMPT_TYPE"
    --input_type "$INPUT_TYPE"
    --temperature "$TEMPERATURE"
    --num_workers "$NUM_WORKERS"
    --use_batch "$USE_BATCH"
    --use_our_checklist "$USE_OUR_CHECKLIST"
    --our_checklist_path "$OUR_CHECKLIST_PATH"
    --use_in_progress "$USE_IN_PROGRESS"
    --use_log_probs "$USE_LOGPROBS"
)

# Optional flag, disabled by default. Uncomment to pass it; keeping it as a
# separate append avoids a dangling line-continuation in the command.
# eval_args+=(--num_data 1)

python src/evaluate_minibench.py "${eval_args[@]}"