model_name,dataset,avg_thinking_lengths_ill_posed,avg_correct_well_posed,avg_llm_abstention_ill_posed
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,gpqa,4084.8916083916083,0.4370629370629371,0.03146853146853147
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,hle,2831.3618421052633,0.05921052631578947,0.019736842105263157
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,gpqa,3380.2762237762236,0.6503496503496503,0.04195804195804196
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,hle,2622.8355263157896,0.08552631578947369,0.046052631578947366
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,gpqa,3332.7482517482517,0.3531468531468531,0.05244755244755245
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,hle,2474.8881578947367,0.07236842105263158,0.03289473684210526
Qwen/QwQ-32B,gpqa,5142.44055944056,0.2867132867132867,0.15384615384615385
Qwen/QwQ-32B,hle,4199.328947368421,0.05263157894736842,0.03289473684210526
Qwen/Qwen3-8B,gpqa,3731.188811188811,0.33916083916083917,0.24125874125874125
Qwen/Qwen3-8B,hle,2949.559210526316,0.05921052631578947,0.05263157894736842
Qwen/Qwen3-14B,gpqa,2662.4405594405594,0.6118881118881119,0.32167832167832167
Qwen/Qwen3-14B,hle,2424.3223684210525,0.05263157894736842,0.1118421052631579
Qwen/Qwen3-32B,gpqa,3070.6188811188813,0.3181818181818182,0.1853146853146853
Qwen/Qwen3-32B,hle,2486.3289473684213,0.07236842105263158,0.039473684210526314
nvidia/AceReason-Nemotron-1.1-7B,gpqa,6502.283216783217,0.5384615384615384,0.01048951048951049
nvidia/AceReason-Nemotron-1.1-7B,hle,4377.9078947368425,0.13157894736842105,0.013157894736842105
nvidia/AceReason-Nemotron-14B,gpqa,5474.9055944055945,0.2762237762237762,0.02097902097902098
nvidia/AceReason-Nemotron-14B,hle,4277.4671052631575,0.09868421052631579,0.03289473684210526
XiaomiMiMo/MiMo-7B-RL-0530,gpqa,10020.643356643357,0.534965034965035,0.013986013986013986
XiaomiMiMo/MiMo-7B-RL-0530,hle,6638.309210526316,0.09868421052631579,0.046052631578947366
Skywork/Skywork-OR1-7B,gpqa,6733.748251748252,0.45454545454545453,0.0
Skywork/Skywork-OR1-7B,hle,4956.5,0.06578947368421052,0.03289473684210526
Skywork/Skywork-OR1-32B,gpqa,5719.101398601399,0.4230769230769231,0.02097902097902098
Skywork/Skywork-OR1-32B,hle,4036.7434210526317,0.07236842105263158,0.03289473684210526
