model_name,dataset,avg_thinking_lengths_ill_posed,avg_correct_well_posed,avg_llm_abstention_ill_posed
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,mmlu,4555.067669172932,0.9022556390977443,0.06015037593984962
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,umwp,736.5701754385965,0.8903508771929824,0.14912280701754385
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,mc,2145.85,0.81,0.1
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,mip,7137.961538461538,0.5769230769230769,0.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,mmlu,3246.7368421052633,0.9548872180451128,0.17293233082706766
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,umwp,2283.372807017544,0.9254385964912281,0.09210526315789473
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,mc,4942.28,0.95,0.04
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,mip,7110.153846153846,0.6153846153846154,0.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,mmlu,3839.2406015037595,0.9548872180451128,0.09774436090225563
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,umwp,322.7938596491228,0.9122807017543859,0.2675438596491228
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,mc,2383.9,0.96,0.17
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,mip,7436.115384615385,0.6153846153846154,0.019230769230769232
Qwen/QwQ-32B,mmlu,6452.390977443609,0.9924812030075187,0.08270676691729323
Qwen/QwQ-32B,umwp,3841.811403508772,0.9605263157894737,0.03070175438596491
Qwen/QwQ-32B,mc,6407.05,0.96,0.01
Qwen/QwQ-32B,mip,10604.307692307691,0.6153846153846154,0.0
Qwen/Qwen3-8B,mmlu,4336.015037593985,0.9774436090225563,0.21052631578947367
Qwen/Qwen3-8B,umwp,3630.9210526315787,0.9473684210526315,0.017543859649122806
Qwen/Qwen3-8B,mc,5739.33,0.97,0.01
Qwen/Qwen3-8B,mip,9301.576923076924,0.6153846153846154,0.0
Qwen/Qwen3-14B,mmlu,3629.3984962406016,0.6090225563909775,0.39849624060150374
Qwen/Qwen3-14B,umwp,2561.3815789473683,0.956140350877193,0.10964912280701754
Qwen/Qwen3-14B,mc,4330.2,0.96,0.04
Qwen/Qwen3-14B,mip,8458.25,0.5961538461538461,0.019230769230769232
Qwen/Qwen3-32B,mmlu,4152.9699248120305,0.9624060150375939,0.18045112781954886
Qwen/Qwen3-32B,umwp,2806.8333333333335,0.9517543859649122,0.04824561403508772
Qwen/Qwen3-32B,mc,4706.3,0.98,0.0
Qwen/Qwen3-32B,mip,9282.25,0.6153846153846154,0.0
nvidia/AceReason-Nemotron-1.1-7B,mmlu,6463.571428571428,0.9849624060150376,0.007518796992481203
nvidia/AceReason-Nemotron-1.1-7B,umwp,4380.956140350877,0.9429824561403509,0.05701754385964912
nvidia/AceReason-Nemotron-1.1-7B,mc,6933.32,0.97,0.01
nvidia/AceReason-Nemotron-1.1-7B,mip,10315.961538461539,0.6153846153846154,0.0
nvidia/AceReason-Nemotron-14B,mmlu,6187.308270676692,0.9924812030075187,0.05263157894736842
nvidia/AceReason-Nemotron-14B,umwp,4931.377192982456,0.9517543859649122,0.021929824561403508
nvidia/AceReason-Nemotron-14B,mc,7919.56,0.97,0.01
nvidia/AceReason-Nemotron-14B,mip,10883.942307692309,0.6346153846153846,0.019230769230769232
XiaomiMiMo/MiMo-7B-RL-0530,mmlu,10562.172932330826,1.0,0.045112781954887216
XiaomiMiMo/MiMo-7B-RL-0530,umwp,6849.118421052632,0.9649122807017544,0.05263157894736842
XiaomiMiMo/MiMo-7B-RL-0530,mc,10604.2,0.98,0.03
XiaomiMiMo/MiMo-7B-RL-0530,mip,14704.884615384615,0.6153846153846154,0.0
Skywork/Skywork-OR1-7B,mmlu,8720.436090225563,0.9849624060150376,0.03759398496240601
Skywork/Skywork-OR1-7B,umwp,3502.719298245614,0.9429824561403509,0.10526315789473684
Skywork/Skywork-OR1-7B,mc,8038.48,0.96,0.03
Skywork/Skywork-OR1-7B,mip,12339.634615384615,0.6153846153846154,0.0
Skywork/Skywork-OR1-32B,mmlu,5059.548872180451,0.6541353383458647,0.11278195488721804
Skywork/Skywork-OR1-32B,umwp,586.8552631578947,0.9429824561403509,0.2543859649122807
Skywork/Skywork-OR1-32B,mc,2286.03,0.98,0.2
Skywork/Skywork-OR1-32B,mip,10007.846153846154,0.6346153846153846,0.0
