model,right_rate,right,wrong,bad_domain,all
nl_Llama-3.1-70B-Instruct,0.011,3,0,280,283
nl_Llama-3.1-8B-Instruct,0.000,0,0,283,283
nl_Qwen2.5-0.5B-Instruct,0.000,0,0,283,283
nl_Qwen2.5-1.5B-Instruct,0.000,0,0,283,283
nl_Qwen2.5-14B-Instruct,0.216,61,49,173,283
nl_Qwen2.5-32B-Instruct,0.240,68,49,166,283
nl_Qwen2.5-3B-Instruct,0.021,6,1,276,283
nl_Qwen2.5-72B-Instruct,0.385,109,58,116,283
nl_Qwen2.5-7B-Instruct,0.057,16,26,241,283
nl_Qwen2.5-Coder-1.5B-Instruct,0.000,0,0,283,283
nl_Yi-1.5-34B-Chat,0.120,34,11,238,283
nl_Yi-1.5-6B-Chat,0.004,1,0,282,283
nl_Yi-1.5-9B-Chat,0.067,19,20,244,283
nl_Yi-Coder-1.5B-Chat,0.000,0,0,283,283
nl_Yi-Coder-9B-Chat,0.099,28,28,227,283
o1mini_nl,0.417,118,55,110,283
o1mini_problem_ipc,0.337,112,45,175,332
o1preview_nl,0.558,158,31,94,283
o1preview_problem_ipc,0.524,174,28,130,332
prob_Llama-3.1-70B-Instruct,0.000,0,0,332,332
prob_Llama-3.1-8B-Instruct,0.000,0,0,332,332
prob_Qwen2.5-0.5B-Instruct,0.000,0,0,332,332
prob_Qwen2.5-1.5B-Instruct,0.000,0,0,332,332
prob_Qwen2.5-14B-Instruct,0.253,84,46,202,332
prob_Qwen2.5-32B-Instruct,0.316,105,41,186,332
prob_Qwen2.5-3B-Instruct,0.015,5,0,327,332
prob_Qwen2.5-72B-Instruct,0.328,109,62,161,332
prob_Qwen2.5-7B-Instruct,0.117,39,25,268,332
prob_Qwen2.5-Coder-1.5B-Instruct,0.000,0,0,332,332
prob_Yi-1.5-6B-Chat,0.018,6,0,326,332
prob_Yi-1.5-9B-Chat,0.093,31,18,283,332
prob_Yi-Coder-1.5B-Chat,0.000,0,0,332,332
prob_Yi-Coder-9B-Chat,0.145,48,28,256,332
prob_genYi-1.5-34B-Chat,0.087,29,5,298,332
