model_name,dataset,avg_thinking_lengths_ill_posed,avg_correct_well_posed,avg_llm_abstention_ill_posed
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,gpqa,4320.2307692307695,0.3916083916083916,0.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,hle,2782.2302631578946,0.08552631578947369,0.039473684210526314
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,gpqa,4083.786713286713,0.2062937062937063,0.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,hle,2462.8223684210525,0.09868421052631579,0.046052631578947366
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,gpqa,4183.461538461538,0.6888111888111889,0.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B,hle,2500.7697368421054,0.05921052631578947,0.03289473684210526
Qwen/QwQ-32B,gpqa,6160.419580419581,0.6398601398601399,0.0
Qwen/QwQ-32B,hle,4146.9473684210525,0.06578947368421052,0.039473684210526314
Qwen/Qwen3-8B,gpqa,5120.363636363636,0.6503496503496503,0.0
Qwen/Qwen3-8B,hle,5361.456790123457,0.08641975308641975,0.021604938271604937
Qwen/Qwen3-14B,gpqa,4353.695804195804,0.6328671328671329,0.0034965034965034965
Qwen/Qwen3-14B,hle,2331.7039473684213,0.07894736842105263,0.09210526315789473
Qwen/Qwen3-32B,gpqa,4376.832167832168,0.6888111888111889,0.0034965034965034965
Qwen/Qwen3-32B,hle,2604.2894736842104,0.08552631578947369,0.02631578947368421
nvidia/AceReason-Nemotron-1.1-7B,gpqa,6293.8216783216785,0.6118881118881119,0.0
nvidia/AceReason-Nemotron-1.1-7B,hle,4694.921052631579,0.07894736842105263,0.013157894736842105
nvidia/AceReason-Nemotron-14B,gpqa,6391.72027972028,0.6783216783216783,0.0
nvidia/AceReason-Nemotron-14B,hle,4410.9078947368425,0.05263157894736842,0.019736842105263157
XiaomiMiMo/MiMo-7B-RL-0530,gpqa,10385.56993006993,0.548951048951049,0.0034965034965034965
XiaomiMiMo/MiMo-7B-RL-0530,hle,6824.9276315789475,0.10526315789473684,0.05921052631578947
Skywork/Skywork-OR1-7B,gpqa,7426.933566433567,0.25874125874125875,0.0034965034965034965
Skywork/Skywork-OR1-7B,hle,5121.743421052632,0.05263157894736842,0.046052631578947366
Skywork/Skywork-OR1-32B,gpqa,6057.986013986014,0.7027972027972028,0.0
Skywork/Skywork-OR1-32B,hle,4064.842105263158,0.05921052631578947,0.02631578947368421
