model_name,dataset,avg_thinking_lengths_ill_posed,avg_correct_well_posed,avg_llm_abstention_ill_posed
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,mmlu,5329.766917293233,0.6616541353383458,0.06015037593984962
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,umwp,660.7850877192982,0.8859649122807017,0.15350877192982457
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,mc,2321.42,0.77,0.1
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,mip,7137.961538461538,0.5769230769230769,0.019230769230769232
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,mmlu,3123.0751879699246,0.8721804511278195,0.2706766917293233
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,umwp,2271.688596491228,0.9078947368421053,0.12719298245614036
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,mc,4511.69,0.95,0.07
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,mip,7110.153846153846,0.6153846153846154,0.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,mmlu,3881.218045112782,0.9398496240601504,0.15037593984962405
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,umwp,303.35087719298247,0.9166666666666666,0.2675438596491228
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,mc,2271.47,0.95,0.19
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,mip,6854.346153846154,0.6153846153846154,0.019230769230769232
Qwen/QwQ-32B,mmlu,5480.293233082707,0.9924812030075187,0.2631578947368421
Qwen/QwQ-32B,umwp,3753.8377192982457,0.956140350877193,0.008771929824561403
Qwen/QwQ-32B,mc,6504.39,0.97,0.0
Qwen/QwQ-32B,mip,10441.826923076924,0.5961538461538461,0.0
Qwen/Qwen3-8B,mmlu,4460.293233082707,0.9473684210526315,0.21804511278195488
Qwen/Qwen3-8B,umwp,3614.3245614035086,0.9517543859649122,0.039473684210526314
Qwen/Qwen3-8B,mc,5526.26,0.97,0.01
Qwen/Qwen3-8B,mip,9409.673076923076,0.6153846153846154,0.0
Qwen/Qwen3-14B,mmlu,3039.4586466165415,0.9849624060150376,0.3383458646616541
Qwen/Qwen3-14B,umwp,2617.0877192982457,0.9517543859649122,0.08771929824561403
Qwen/Qwen3-14B,mc,4363.42,0.96,0.06
Qwen/Qwen3-14B,mip,8705.288461538461,0.5769230769230769,0.019230769230769232
Qwen/Qwen3-32B,mmlu,3356.4210526315787,0.9849624060150376,0.20300751879699247
Qwen/Qwen3-32B,umwp,2771.285087719298,0.9517543859649122,0.03508771929824561
Qwen/Qwen3-32B,mc,4715.71,0.97,0.02
Qwen/Qwen3-32B,mip,8796.403846153846,0.6346153846153846,0.0
nvidia/AceReason-Nemotron-1.1-7B,mmlu,6501.315789473684,1.0,0.08270676691729323
nvidia/AceReason-Nemotron-1.1-7B,umwp,4364.4473684210525,0.9342105263157895,0.03070175438596491
nvidia/AceReason-Nemotron-1.1-7B,mc,7405.78,0.95,0.02
nvidia/AceReason-Nemotron-1.1-7B,mip,10195.98076923077,0.6153846153846154,0.038461538461538464
nvidia/AceReason-Nemotron-14B,mmlu,6289.639097744361,0.9924812030075187,0.07518796992481203
nvidia/AceReason-Nemotron-14B,umwp,4697.6359649122805,0.9605263157894737,0.017543859649122806
nvidia/AceReason-Nemotron-14B,mc,7919.56,0.97,0.01
nvidia/AceReason-Nemotron-14B,mip,10830.192307692309,0.6346153846153846,0.0
XiaomiMiMo/MiMo-7B-RL-0530,mmlu,10577.556390977443,0.9924812030075187,0.10526315789473684
XiaomiMiMo/MiMo-7B-RL-0530,umwp,6563.333333333333,0.9605263157894737,0.03508771929824561
XiaomiMiMo/MiMo-7B-RL-0530,mc,10847.39,0.97,0.03
XiaomiMiMo/MiMo-7B-RL-0530,mip,15395.192307692309,0.6153846153846154,0.0
Skywork/Skywork-OR1-7B,mmlu,9610.12030075188,0.9774436090225563,0.022556390977443608
Skywork/Skywork-OR1-7B,umwp,3256.1929824561403,0.9298245614035088,0.09649122807017543
Skywork/Skywork-OR1-7B,mc,7783.01,0.97,0.01
Skywork/Skywork-OR1-7B,mip,13778.73076923077,0.6153846153846154,0.0
Skywork/Skywork-OR1-32B,mmlu,6068.112781954887,0.6842105263157895,0.06766917293233082
Skywork/Skywork-OR1-32B,umwp,570.9605263157895,0.9517543859649122,0.2543859649122807
Skywork/Skywork-OR1-32B,mc,1736.36,0.97,0.16
Skywork/Skywork-OR1-32B,mip,10530.23076923077,0.6153846153846154,0.019230769230769232
