model_name,dataset,avg_thinking_lengths_ill_posed,avg_correct_well_posed,avg_llm_abstention_ill_posed
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,gpqa,3567.22027972028,0.4230769230769231,0.006993006993006993
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,hle,2614.5,0.09868421052631579,0.013157894736842105
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,gpqa,2711.9895104895104,0.6048951048951049,0.024475524475524476
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,hle,2444.5526315789475,0.08552631578947369,0.046052631578947366
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,gpqa,3174.1923076923076,0.24475524475524477,0.017482517482517484
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,hle,2417.1447368421054,0.06578947368421052,0.046052631578947366
Qwen/QwQ-32B,gpqa,5340.038461538462,0.2762237762237762,0.038461538461538464
Qwen/QwQ-32B,hle,4336.381578947368,0.039473684210526314,0.02631578947368421
Qwen/Qwen3-8B,gpqa,3474.671328671329,0.583916083916084,0.16433566433566432
Qwen/Qwen3-8B,hle,3012.0723684210525,0.05921052631578947,0.05263157894736842
Qwen/Qwen3-14B,gpqa,2642.951048951049,0.2867132867132867,0.2762237762237762
Qwen/Qwen3-14B,hle,2301.907894736842,0.125,0.08552631578947369
Qwen/Qwen3-32B,gpqa,3330.153846153846,0.2062937062937063,0.11888111888111888
Qwen/Qwen3-32B,hle,2483.657894736842,0.02631578947368421,0.03289473684210526
nvidia/AceReason-Nemotron-1.1-7B,gpqa,6376.765734265734,0.583916083916084,0.0
nvidia/AceReason-Nemotron-1.1-7B,hle,4677.815789473684,0.13157894736842105,0.013157894736842105
nvidia/AceReason-Nemotron-14B,gpqa,4800.55944055944,0.6748251748251748,0.0034965034965034965
nvidia/AceReason-Nemotron-14B,hle,4118.375,0.09868421052631579,0.039473684210526314
XiaomiMiMo/MiMo-7B-RL-0530,gpqa,9916.97902097902,0.527972027972028,0.013986013986013986
XiaomiMiMo/MiMo-7B-RL-0530,hle,7106.368421052632,0.09210526315789473,0.039473684210526314
Skywork/Skywork-OR1-7B,gpqa,6115.7692307692305,0.43356643356643354,0.0
Skywork/Skywork-OR1-7B,hle,5012.993421052632,0.07236842105263158,0.046052631578947366
Skywork/Skywork-OR1-32B,gpqa,5200.762237762237,0.3111888111888112,0.0034965034965034965
Skywork/Skywork-OR1-32B,hle,4101.368421052632,0.06578947368421052,0.03289473684210526
