# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# python evaluate_model.py --model_name /home/bmm-system/data/private/chenminghan/project/understand-r1-zero/oat-output/qwen2.5-Math-7b-drgrpo-Notemplate-4.2-master02_0402T14:24:06/saved_models/step_00400



# {'aime': 0.26666666666666666, 'amc': 0.5783132530120482, 'math': 0.766, 'minerva': 0.20220588235294118, 'olympiad_bench': 0.3718518518518519}
# avg: 0.4370075307767015


# python evaluate_model.py --model_name /home/bmm-system/data/private/chenminghan/project/understand-r1-zero/oat-output/oat7b-math-node03_0404T20:40:29/saved_models/step_00300
# {'aime': 0.3, 'amc': 0.5783132530120482, 'math': 0.806, 'minerva': 0.33088235294117646, 'olympiad_bench': 0.4237037037037037}
# avg: 0.4877798619313857



# It looks like raising only the lr while leaving everything else unchanged is not a good choice.
#python evaluate_model.py --model_name /home/bmm-system/data/private/chenminghan/project/understand-r1-zero/oat-output/oat7b-math-node02-lr5e-6-exp1_0406T13:01:10/saved_models/step_00400
#{'aime': 0.1, 'amc': 0.4457831325301205, 'math': 0.74, 'minerva': 0.3272058823529412, 'olympiad_bench': 0.37925925925925924}
#avg: 0.3984496548284642






#python evaluate_model.py --model_name /home/bmm-system/data/private/chenminghan/project/understand-r1-zero/oat-output/oat7b-debug_0408T01:26:30/saved_models/step_00400

# s400
# {'aime': 0.23333333333333334, 'amc': 0.5542168674698795, 'math': 0.808, 'minerva': 0.31985294117647056, 'olympiad_bench': 0.44}
# avg: 0.4710806283959367



# export CUDA_VISIBLE_DEVICES=2
#python evaluate_model.py --model_name /home/bmm-system/data/private/chenminghan/project/understand-r1-zero/oat-output/oat7b-math-node02-lr5e-6-exp1_0406T13:01:10/saved_models/step_00100

# s100
# {'aime': 0.2, 'amc': 0.5662650602409639, 'math': 0.788, 'minerva': 0.33455882352941174, 'olympiad_bench': 0.4222222222222222}
# avg: 0.4622092211985196





# export CUDA_VISIBLE_DEVICES=2
# python evaluate_model.py /home/bmm-system/data/private/chenminghan/project/understand-r1-zero/oat-output/oat7b-debug-lr2e-6_0409T19:16:07/saved_models/step_00580


#s580
# {'aime': 0.23333333333333334, 'amc': 0.5542168674698795, 'math': 0.828, 'minerva': 0.3382352941176471, 'olympiad_bench': 0.4088888888888889}
# avg: 0.47253487676194983

# s585
# {'aime': 0.23333333333333334, 'amc': 0.5542168674698795, 'math': 0.812, 'minerva': 0.3088235294117647, 'olympiad_bench': 0.4192592592592593}
# avg: 0.4655265978948474

# s590
# {'aime': 0.2, 'amc': 0.5421686746987951, 'math': 0.814, 'minerva': 0.34558823529411764, 'olympiad_bench': 0.41185185185185186}
# avg: 0.4627217523689529

# `pip install -U pynvml` fixes vLLM failing to detect the platform.
# Limit CUDA device visibility to GPUs 6 and 7; the variable is exported so
# that processes started later in this script inherit it.
CUDA_VISIBLE_DEVICES='6,7'
export CUDA_VISIBLE_DEVICES
# python cmh_evaluate_model.py sail/Qwen2.5-Math-7B-Oat-Zero \
#     --template qwen_math \
#     --temperature 0.6 \
#     --top_p 1.0 \
#     --n_samples 10 \
#     --tasks ["aime"]

# {'aime': 0.3, 'amc': 0.5903614457831325, 'math': 0.832, 'minerva': 0.3014705882352941, 'olympiad_bench': 0.4474074074074074}
# seed-s10-m2-resample-exp1-resume_0508T11:37:16/saved_models/step_00115 avg: 0.49424788828516675

# {'aime': 0.36666666666666664, 'amc': 0.6144578313253012, 'math': 0.83, 'minerva': 0.3492647058823529, 'olympiad_bench': 0.4577777777777778}
# seed-s10-m2-resample-exp1-resume_0508T11:37:16/saved_models/step_00120 avg: 0.5236333963304196 

# {'aime': 0.4, 'amc': 0.6626506024096386, 'math': 0.826, 'minerva': 0.3492647058823529, 'olympiad_bench': 0.46074074074074073}
# focaal/step_00750 avg: 0.5397312098065464
# {'aime': 0.36666666666666664, 'amc': 0.6626506024096386, 'math': 0.834, 'minerva': 0.3382352941176471, 'olympiad_bench': 0.47555555555555556}
# focaal/step_00755 avg: 0.5354216237499017
# Run evaluate_model.py against the focaal step_00755 checkpoint.
# The path is held in a quoted variable so the long literal is stated once
# and is passed to --model_name as a single word.
checkpoint_dir='/home/bmm-system/data/private/chenminghan/project/重要权重/focaal/step_00755'
python evaluate_model.py \
  --model_name "${checkpoint_dir}"