model_name,thinking_mode,model_family,instruction_tuned,source,ARC-C,Arena-Hard,BBH,GPQA,GSM8K,HellaSwag,HumanEval,HumanEval+,IFEval_strict_prompt,LiveBench_0831,LiveCodeBench,MATH,MBPP,MBPP+,MMLU,MMLU-Pro,MMLU-Redux,MMLU-STEM,MTbench,Multi-Exam,Multi-Mathematics,Multi-Translation,Multi-Understanding,MultiPL-E,TheoremQA,TruthfulQA,Winogrande
Qwen/Qwen2-0.5B,Non-thinking,Qwen,Yes,Qwen2.5 Report Table 10 (0.5B-1.5B instruct-tuned models),,,,23.7,40.1,,31.1,,14.6,7.4,1.6,13.9,39.7,,,14.4,12.9,,,,,,,20.8,,,
Qwen/Qwen2.5-0.5B,Non-thinking,Qwen,Yes,Qwen2.5 Report Table 10 (0.5B-1.5B instruct-tuned models),,,,29.8,49.6,,35.4,,27.9,12.6,5.1,34.4,49.6,,,15.0,24.1,,,,,,,28.5,,,
Qwen/Qwen2-1.5B,Non-thinking,Qwen,Yes,Qwen2.5 Report Table 10 (0.5B-1.5B instruct-tuned models),,,,21.2,61.6,,42.1,,29.0,12.4,4.5,25.3,44.2,,,22.9,41.2,,,,,,,38.5,,,
Qwen/Qwen2.5-1.5B,Non-thinking,Qwen,Yes,Qwen2.5 Report Table 10 (0.5B-1.5B instruct-tuned models),,,,29.8,73.2,,61.6,,42.5,18.8,14.8,55.2,63.2,,,32.4,50.7,,,,,,,50.4,,,
Llama/Llama-3-70B,Non-thinking,Llama,No,Qwen2.5 Report Table 2 (70B+ base models & Qwen2.5-Plus),68.8,,81.0,36.3,77.6,88.0,48.2,42.1,,,,42.5,70.4,58.4,79.5,52.8,75.0,73.7,,70.0,67.1,38.0,79.9,46.3,32.3,45.6,85.3
Mistral/Mixtral-8x22B,Non-thinking,Mistral,No,Qwen2.5 Report Table 2 (70B+ base models & Qwen2.5-Plus),70.7,,78.9,34.3,83.7,88.7,46.3,40.2,,,,41.7,71.7,58.1,77.8,51.6,72.9,71.7,,63.5,62.9,23.3,77.7,46.7,35.9,51.0,85.0
Llama/Llama-3-405B,Non-thinking,Llama,No,Qwen2.5 Report Table 2 (70B+ base models & Qwen2.5-Plus),,,85.9,,89.0,,61.0,,,,,53.8,73.0,63.9,85.2,61.6,,,,,,,,,,,86.7
Qwen/Qwen2-72B,Non-thinking,Qwen,Yes,Qwen2.5 Report Table 6 (70B+ Instruct models),68.9,48.1,82.4,42.4,93.2,87.3,86.0,56.1,77.6,41.5,32.2,69.0,80.2,63.9,84.2,64.4,81.6,79.6,9.12,76.6,76.0,37.8,80.7,69.2,42.8,54.8,85.1
Qwen/Qwen2.5-72B,Non-thinking,Qwen,Yes,Qwen2.5 Report Table 6 (70B+ Instruct models),72.4,81.2,86.3,49.0,95.8,87.6,86.6,51.2,84.1,52.3,55.5,83.1,88.2,69.2,86.1,71.1,86.8,82.7,9.35,78.7,76.7,39.0,89.6,75.1,42.4,60.4,83.9
Qwen/Qwen2.5-Plus,Non-thinking,Qwen,Yes,Qwen2.5 Report Table 6 (70B+ Instruct models),70.9,81.4,85.8,49.7,96.0,89.2,87.8,52.4,86.3,54.6,51.4,84.7,85.5,66.9,85.4,72.5,86.3,81.2,9.3,78.5,82.4,40.4,89.2,77.0,48.5,55.3,85.5
Mistral/Mistral-7B,Non-thinking,Mistral,No,Qwen2.5 Report Table 4 (7B+ base models),60.0,,56.1,24.7,36.2,83.3,29.3,24.4,,,,10.2,51.1,40.9,64.2,30.9,58.1,50.1,,47.1,26.3,23.3,63.3,29.4,19.2,42.2,78.4
Llama/Llama-3-8B,Non-thinking,Llama,No,Qwen2.5 Report Table 4 (7B+ base models),59.3,,57.7,25.8,55.3,82.1,33.5,29.3,,,,20.5,53.9,44.4,66.6,35.4,61.6,55.3,,52.3,36.3,31.9,68.6,33.5,22.1,44.0,77.4
Google/Gemma-2-9B,Non-thinking,Gemma,No,Qwen2.5 Report Table 4 (7B+ base models),68.2,,68.2,32.8,70.7,81.9,37.8,30.5,,,,37.7,62.2,50.6,71.3,44.7,67.9,65.1,,61.2,53.0,36.5,78.3,34.9,28.9,45.3,79.5
Qwen/Qwen2-7B,Non-thinking,Qwen,No,Qwen2.5 Report Table 4 (7B+ base models),60.6,,62.3,30.8,80.2,80.7,51.2,43.3,,,,43.5,64.2,51.9,70.3,40.1,68.1,64.2,,59.2,57.5,31.5,72.0,41.0,29.6,54.2,77.0
Qwen/Qwen2.5-7B,Non-thinking,Qwen,No,Qwen2.5 Report Table 4 (7B+ base models),63.7,,70.4,36.4,85.4,80.2,57.9,50.6,,,,49.8,74.9,62.9,74.2,45.0,71.1,72.3,,59.4,57.8,32.4,79.3,50.3,36.0,56.4,75.9
Llama/LLaMA-3.1-70B,Non-thinking,Llama,Yes,Qwen2.5 Report Table 6 (70B+ Instruct models),,55.7,,46.7,95.1,,80.5,,83.6,46.6,32.1,68.0,84.2,,,66.4,83.0,,8.79,,,,,68.2,,,
Llama/LLaMA-3.1-405B,Non-thinking,Llama,Yes,Qwen2.5 Report Table 6 (70B+ Instruct models),,69.3,,51.1,96.8,,89.0,,86.0,53.2,41.6,73.8,84.5,,,73.3,86.2,,9.08,,,,,73.5,,,
Qwen/Qwen2-57B-A14B,Non-thinking,Qwen,Yes,Qwen2.5 Report Table 7 (14B-30B+ Instruct models),,17.8,,34.3,85.3,,79.9,,59.9,31.1,22.5,49.1,70.9,,,52.8,72.6,,8.55,,,,,66.4,,,
Google/Gemma-2-27B,Non-thinking,Gemma,Yes,Qwen2.5 Report Table 7 (14B-30B+ Instruct models),,57.5,,38.4,90.4,,78.7,,77.1,39.6,,54.4,81.0,,,55.5,75.7,,9.1,,,,,67.4,,,
OpenAI/GPT-4o-mini,Non-thinking,OpenAI,Yes,Qwen2.5 Report Table 7 (14B-30B+ Instruct models),,74.9,,40.2,93.2,,88.4,,80.4,43.3,40.7,70.2,85.7,,,63.1,81.5,,,,,,,75.0,,,
Qwen/Qwen2.5-Turbo,Non-thinking,Qwen,Yes,Qwen2.5 Report Table 7 (14B-30B+ Instruct models),,67.1,,42.3,93.8,,86.6,,76.3,42.3,37.8,81.1,82.8,,,64.5,81.7,,8.88,,,,,73.7,,,
Qwen/Qwen2.5-14B,Non-thinking,Qwen,Yes,Qwen2.5 Report Table 7 (14B-30B+ Instruct models),,68.3,,45.5,94.8,,83.5,,81.0,44.4,42.6,80.0,82.0,,,63.7,80.0,,8.82,,,,,72.8,,,
Qwen/Qwen2.5-32B,Non-thinking,Qwen,Yes,Qwen2.5 Report Table 7 (14B-30B+ Instruct models),,74.5,,49.5,95.9,,88.4,,79.5,50.7,51.2,83.1,84.0,,,69.0,83.9,,9.2,,,,,75.4,,,
Google/Gemma-2-2B,Non-thinking,Gemma,Yes,Qwen2.5 Report Table 9 (2B-4B instruct-tuned models),,,,29.3,63.2,,68.9,,51.0,20.1,5.8,26.6,74.9,,,26.7,51.9,,,,,,,30.5,,,
Phi/Phi-3.5-Mini,Non-thinking,Phi,Yes,Qwen2.5 Report Table 9 (2B-4B instruct-tuned models),,,,27.2,86.2,,72.6,,52.1,27.4,15.8,48.5,63.2,,,47.5,67.7,,,,,,,47.2,,,
MiniCPM/MiniCPM-3-4B,Non-thinking,MiniCPM,Yes,Qwen2.5 Report Table 9 (2B-4B instruct-tuned models),,,,31.3,81.1,,74.4,,68.4,27.6,23.8,46.6,72.5,,,43.0,59.9,,,,,,,49.1,,,
Qwen/Qwen2.5-3B,Non-thinking,Qwen,Yes,Qwen2.5 Report Table 9 (2B-4B instruct-tuned models),,,,30.3,86.7,,74.4,,58.2,26.8,19.9,65.9,72.7,,,43.7,64.4,,,,,,,60.2,,,
