
# Launcher for math_eval_self_v9_hard.py.
#
# Every setting below can be supplied through the environment, e.g.
#   LLM_path=/path/to/model output_dir=/path/to/out sh <this script>
# or by editing the default on the right-hand side. The five path/tag
# settings have no usable default and must be filled in one way or the other.
LLM_path="${LLM_path:-}"             # TODO: LLM path
state_dict_dir="${state_dict_dir:-}" # TODO: fine-tuned LLM and difficulty classifier dir
state_dict_tag="${state_dict_tag:-}" # TODO: fine-tuned LLM and difficulty classifier tag
eval_data_dir="${eval_data_dir:-}"   # TODO: eval data path
output_dir="${output_dir:-}"         # TODO: output dir
thres="${thres:-0.5}"                # forwarded unchanged to --thres

# Fail fast, before the evaluation program is started, when a required
# setting is still empty. `${var:?msg}` prints msg to stderr and makes a
# non-interactive shell exit with a non-zero status.
: "${LLM_path:?LLM_path is empty - set it in the environment or edit this script}"
: "${state_dict_dir:?state_dict_dir is empty - set it in the environment or edit this script}"
: "${state_dict_tag:?state_dict_tag is empty - set it in the environment or edit this script}"
: "${eval_data_dir:?eval_data_dir is empty - set it in the environment or edit this script}"
: "${output_dir:?output_dir is empty - set it in the environment or edit this script}"


############################################
#! 1. Evaluation across various datasets
############################################
# Only CUDA device 0 is made visible to the process. All expansions are
# quoted so that paths containing spaces reach the program as one argument.
CUDA_VISIBLE_DEVICES=0 python math_eval_self_v9_hard.py \
    --data_names aime25,aime24,amc23,math500,gsm8k \
    --data_dir "$eval_data_dir" \
    --state_dict_dir "$state_dict_dir" \
    --state_dict_tag "$state_dict_tag" \
    --Qwen_model_path "$LLM_path" \
    --log_step 1 \
    --time_step 4 \
    --cot_len 512 \
    --thres "$thres" \
    --output_dir "$output_dir" \
    --prompt_type qwen_deepseek_distill \
    --reThink_prompt_type qwen_deepseek_distill_reThink \
    --num_test_sample -1 \
    --max_tokens_per_call 32768 \
    --max_model_len 32768 \
    --n_sampling 1 \
    --save_outputs \
    --overwrite \
    --seed 42 \
    --do_sample False


