model_name,dataset,avg_thinking_lengths_ill_posed,avg_correct_well_posed,avg_llm_abstention_ill_posed
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,mmlu,5132.526315789473,0.9624060150375939,0.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,umwp,589.75,0.8728070175438597,0.12719298245614036
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,mc,2221.21,0.88,0.13
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,mip,6680.75,0.5769230769230769,0.019230769230769232
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,mmlu,5126.751879699248,0.9774436090225563,0.007518796992481203
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,umwp,2405.1403508771928,0.9298245614035088,0.09649122807017543
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,mc,4463.5,0.93,0.08
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,mip,7312.173076923077,0.5961538461538461,0.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,mmlu,5153.360902255639,0.9774436090225563,0.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,umwp,336.8859649122807,0.9473684210526315,0.25877192982456143
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,mc,2323.48,0.93,0.19
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,mip,7278.0192307692305,0.6153846153846154,0.0
Qwen/QwQ-32B,mmlu,7969.699248120301,0.9924812030075187,0.0
Qwen/QwQ-32B,umwp,3710.8201754385964,0.9649122807017544,0.017543859649122806
Qwen/QwQ-32B,mc,6488.73,0.97,0.01
Qwen/QwQ-32B,mip,10478.26923076923,0.6153846153846154,0.0
Qwen/Qwen3-8B,mmlu,6065.834586466165,0.9699248120300752,0.007518796992481203
Qwen/Qwen3-8B,umwp,3549.2587719298244,0.9517543859649122,0.02631578947368421
Qwen/Qwen3-8B,mc,5717.9,0.97,0.01
Qwen/Qwen3-8B,mip,9274.557692307691,0.6153846153846154,0.019230769230769232
Qwen/Qwen3-14B,mmlu,5206.2180451127815,0.9699248120300752,0.0
Qwen/Qwen3-14B,umwp,2598.815789473684,0.9473684210526315,0.12280701754385964
Qwen/Qwen3-14B,mc,4416.66,1.0,0.03
Qwen/Qwen3-14B,mip,8149.346153846154,0.5769230769230769,0.019230769230769232
Qwen/Qwen3-32B,mmlu,5599.616541353384,0.9548872180451128,0.007518796992481203
Qwen/Qwen3-32B,umwp,2665.2368421052633,0.9342105263157895,0.03070175438596491
Qwen/Qwen3-32B,mc,4809.04,0.98,0.01
Qwen/Qwen3-32B,mip,8947.23076923077,0.6346153846153846,0.019230769230769232
nvidia/AceReason-Nemotron-1.1-7B,mmlu,6411.2406015037595,0.9924812030075187,0.022556390977443608
nvidia/AceReason-Nemotron-1.1-7B,umwp,4379.377192982456,0.9342105263157895,0.02631578947368421
nvidia/AceReason-Nemotron-1.1-7B,mc,7113.09,0.98,0.05
nvidia/AceReason-Nemotron-1.1-7B,mip,10195.98076923077,0.6153846153846154,0.038461538461538464
nvidia/AceReason-Nemotron-14B,mmlu,6933.413533834587,0.9849624060150376,0.0
nvidia/AceReason-Nemotron-14B,umwp,4883.372807017544,0.956140350877193,0.0043859649122807015
nvidia/AceReason-Nemotron-14B,mc,8032.65,0.97,0.01
nvidia/AceReason-Nemotron-14B,mip,11260.961538461539,0.6346153846153846,0.0
XiaomiMiMo/MiMo-7B-RL-0530,mmlu,10762.345864661655,0.9924812030075187,0.015037593984962405
XiaomiMiMo/MiMo-7B-RL-0530,umwp,6896.6140350877195,0.9473684210526315,0.039473684210526314
XiaomiMiMo/MiMo-7B-RL-0530,mc,10847.39,0.97,0.03
XiaomiMiMo/MiMo-7B-RL-0530,mip,15067.307692307691,0.6153846153846154,0.019230769230769232
Skywork/Skywork-OR1-7B,mmlu,9565.157894736842,0.9924812030075187,0.0
Skywork/Skywork-OR1-7B,umwp,3324.745614035088,0.9210526315789473,0.10526315789473684
Skywork/Skywork-OR1-7B,mc,8319.82,0.97,0.06
Skywork/Skywork-OR1-7B,mip,12216.75,0.5961538461538461,0.0
Skywork/Skywork-OR1-32B,mmlu,5806.496240601504,0.7969924812030075,0.0
Skywork/Skywork-OR1-32B,umwp,471.99122807017545,0.9517543859649122,0.21929824561403508
Skywork/Skywork-OR1-32B,mc,2096.43,0.95,0.2
Skywork/Skywork-OR1-32B,mip,10268.5,0.6153846153846154,0.0
