AIME'24,AIME'25,ARC-C,AlignBench_v1.1,Arena-Hard,AutoLogi,BBH,BFCL_v3,C-Eval,CRUX-O,CodeForces_Percentile,CodeForces_Rating,Creative_Writing_v3,EvalPlus,GPQA,GPQA-Diamond,GSM8K,HellaSwag,HumanEval,HumanEval+,IFEval_strict_prompt,INCLUDE,LiveBench_0831,LiveBench_2024-11-25,LiveCodeBench,LiveCodeBench_v5,MATH,MATH-500,MBPP,MBPP+,MGSM,MLogiQA,MMLU,MMLU-Pro,MMLU-Redux,MMLU-STEM,MMMLU,MMMLU_14_langs,MT-AIME2024,MTbench,Multi-Exam,Multi-IF,Multi-Mathematics,Multi-Translation,Multi-Understanding,MultiPL-E,PolyMath,RULER_Avg,SuperGPQA,TheoremQA,TruthfulQA,Winogrande,WritingBench,ZebraLogic,instruction_tuned,model_family,model_name,source,thinking_mode
,,,,,,,,71.8,,,,,,,65.2,,,,,,,,54.5,,,,,,,,,,,89.3,,,,,,,,,,,,,,,,,,,,Yes,DeepSeek,DeepSeek/DeepSeek-R1-Distill-Llama-70B,Qwen3 Report Table 13,Thinking
,,,,,,,,88.4,,,,,,,65.6,,,,,,,,72.0,,,,,,,,,,,90.0,,,,,,,,,,,,,,,,,,,,Yes,Qwen,Qwen/QwQ-32B,Qwen3 Report Table 13,Thinking
,,,,,,,,75.1,,,,,,,76.8,,,,,,,,70.0,,,,,,,,,,,90.0,,,,,,,,,,,,,,,,,,,,Yes,OpenAI,OpenAI/o3-mini,Qwen3 Report Table 13,Thinking
,,,,,,,,87.3,,,,,,,68.4,,,,,,,,74.9,,,,,,,,,,,90.9,,,,,,,,,,,,,,,,,,,,No,Qwen,Qwen/Qwen3-32B,Qwen3 Report Table 13,Thinking
8.1,8.8,,7.81,74.9,52.6,,64.0,66.3,,52.6,1113.0,70.3,,,40.2,,,,,80.4,66.0,,41.3,,27.9,,78.2,,,,42.6,,,81.5,,,72.1,6.0,,,62.4,,,,,12.0,,,,,,5.98,20.1,Yes,OpenAI,OpenAI/GPT-4o-mini-2024-07-18,Qwen3 Report Table 14,Non-thinking
28.6,10.0,,7.49,70.5,56.8,,45.4,78.2,,43.7,981.0,55.0,,,57.2,,,,,84.7,74.1,,47.6,,29.8,,82.6,,,,53.9,,,86.3,,,77.5,19.1,,,64.2,,,,,20.9,,,,,,5.49,24.2,Unknown,Llama,Llama/LLaMA-4-Scout,Qwen3 Report Table 14,Non-thinking
18.9,15.0,,7.89,81.2,66.1,,63.4,84.7,,35.0,859.0,61.8,,,49.0,,,,,84.1,69.6,,51.4,,30.7,,83.6,,,,59.3,,,86.8,,,76.9,12.7,,,65.3,,,,,16.9,,,,,,7.06,26.6,Yes,Qwen,Qwen/Qwen2.5-72B-Instruct,Qwen3 Report Table 14,Non-thinking
31.0,20.2,,8.58,92.8,78.5,,63.0,83.3,,71.0,1353.0,78.3,,,54.6,,,,,83.2,70.9,,59.8,,31.3,,88.6,,,,62.9,,,85.7,,,76.5,24.1,,,70.7,,,,,22.5,,,,,,7.54,29.2,No,Qwen,Qwen/Qwen3-32B,Qwen3 Report Table 14,Non-thinking
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,85.4,,,,,,,Varies,Qwen,Qwen/Qwen2.5-7B-Instruct,Qwen3 Report Table 23 (RULER),Non-thinking
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,91.4,,,,,,,Varies,Qwen,Qwen/Qwen2.5-14B-Instruct,Qwen3 Report Table 23 (RULER),Non-thinking
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,92.9,,,,,,,Varies,Qwen,Qwen/Qwen2.5-32B-Instruct,Qwen3 Report Table 23 (RULER),Non-thinking
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,95.1,,,,,,,Varies,Qwen,Qwen/Qwen2.5-72B-Instruct,Qwen3 Report Table 23 (RULER),Non-thinking
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,83.5,,,,,,,Varies,Qwen,Qwen/Qwen3-4B,Qwen3 Report Table 23 (RULER),Thinking
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,84.4,,,,,,,Varies,Qwen,Qwen/Qwen3-8B,Qwen3 Report Table 23 (RULER),Thinking
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,89.1,,,,,,,Varies,Qwen,Qwen/Qwen3-14B,Qwen3 Report Table 23 (RULER),Thinking
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,91.0,,,,,,,Varies,Qwen,Qwen/Qwen3-32B,Qwen3 Report Table 23 (RULER),Thinking
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,86.6,,,,,,,Varies,Qwen,Qwen/Qwen3-30B-A3B,Qwen3 Report Table 23 (RULER),Thinking
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,92.2,,,,,,,Varies,Qwen,Qwen/Qwen3-235B-A22B,Qwen3 Report Table 23 (RULER),Thinking
,,,,,,84.48,,,67.8,,,,66.25,47.97,,92.87,,,,,64.35,,,,,57.7,,73.6,,78.12,,83.32,55.1,81.97,,82.4,,,,,,,,,58.3,,,33.55,,,,,,No,Qwen,Qwen/Qwen2.5-32B-Base,Qwen3 Report Table 4 (Base models @~32B),Non-thinking
,,,,,,86.3,,,68.5,,,,65.93,45.88,,91.5,,,,,69.05,,,,,62.12,,75.4,,82.4,,86.06,58.07,83.91,,83.49,,,,,,,,,58.7,,,36.2,,,,,,No,Qwen,Qwen/Qwen2.5-72B-Base,Qwen3 Report Table 4 (Base models @~32B),Non-thinking
,,,,,,79.95,,,60.4,,,,55.78,26.26,,81.2,,,,,68.94,,,,,51.78,,68.4,,73.74,,78.69,52.88,76.53,,77.62,,,,,,,,,45.03,,,29.87,,,,,,No,Gemma,Google/Gemma-3-27B-Base,Qwen3 Report Table 4 (Base models @~32B),Non-thinking
,,,,,,82.4,,,61.9,,,,59.9,40.4,,85.37,,,,,68.09,,,,,51.66,,68.6,,79.93,,78.27,56.13,71.09,,82.05,,,,,,,,,47.38,,,26.51,,,,,,No,Llama,Llama/Llama-4-Scout-Base,Qwen3 Report Table 4 (Base models @~32B),Non-thinking
,,,,,,87.38,,,72.5,,,,72.05,49.49,,93.4,,,,,67.67,,,,,61.62,,78.2,,83.06,,83.61,65.54,83.41,,83.83,,,,,,,,,67.06,,,39.78,,,,,,No,Qwen,Qwen/Qwen3-32B-Base,Qwen3 Report Table 4 (Base models @~32B),Non-thinking
,,,,,,57.7,,,36.8,,,,44.13,25.8,,55.3,,,,,44.94,,,,,20.5,,48.4,,38.92,,66.6,35.56,61.59,,59.65,,,,,,,,,31.45,,,20.54,,,,,,No,Llama,Llama/Llama-3-8B-Base,Qwen3 Report Table 6 (Base models @~8B),Non-thinking
,,,,,,70.4,,,48.5,,,,62.18,36.36,,85.36,,,,,53.98,,,,,49.8,,63.4,,63.6,,74.16,45.0,71.06,,71.34,,,,,,,,,50.73,,,26.34,,,,,,No,Qwen,Qwen/Qwen2.5-7B-Base,Qwen3 Report Table 6 (Base models @~8B),Non-thinking
,,,,,,78.18,,,61.1,,,,60.7,32.83,,90.22,,,,,60.26,,,,,55.64,,69.0,,74.68,,79.66,51.16,76.64,,78.34,,,,,,,,,54.79,,,30.68,,,,,,No,Qwen,Qwen/Qwen2.5-14B-Base,Qwen3 Report Table 6 (Base models @~8B),Non-thinking
,,,,,,78.4,,,62.0,,,,67.65,44.44,,89.84,,,,,59.4,,,,,60.8,,69.8,,76.02,,76.89,56.73,72.17,,75.72,,,,,,,,,58.75,,,31.64,,,,,,No,Qwen,Qwen/Qwen3-8B-Base,Qwen3 Report Table 6 (Base models @~8B),Non-thinking
,,,,,,51.7,,,34.0,,,,43.23,24.24,,43.97,,,,,49.06,,,,,26.1,,46.4,,33.11,,59.51,29.23,56.91,,59.62,,,,,,,,,28.06,,,17.68,,,,,,No,Gemma,Google/Gemma-3-4B-Base,Qwen3 Report Table 7 (Base models @~4B),Non-thinking
,,,,,,56.3,,,36.5,,,,46.28,26.26,,79.08,,,,,45.9,,,,,42.64,,54.6,,47.53,,65.62,34.61,63.68,,65.55,,,,,,,,,39.65,,,20.31,,,,,,No,Qwen,Qwen/Qwen2.5-3B-Base,Qwen3 Report Table 7 (Base models @~4B),Non-thinking
,,,,,,70.4,,,48.5,,,,62.18,36.36,,85.36,,,,,53.98,,,,,49.8,,63.4,,63.6,,74.16,45.0,71.06,,71.34,,,,,,,,,50.73,,,26.34,,,,,,No,Qwen,Qwen/Qwen2.5-7B-Base,Qwen3 Report Table 7 (Base models @~4B),Non-thinking
,,,,,,72.59,,,55.0,,,,63.53,36.87,,87.79,,,,,56.29,,,,,54.1,,67.0,,67.74,,72.99,50.58,72.79,,71.42,,,,,,,,,53.13,,,28.43,,,,,,No,Qwen,Qwen/Qwen3-4B-Base,Qwen3 Report Table 7 (Base models @~4B),Non-thinking
74.3,79.2,,8.86,92.1,79.8,,67.8,85.5,,96.7,1891.0,81.7,,,78.0,,,,,92.6,84.6,,75.7,,63.9,,96.4,,,,75.5,,,92.8,,,88.4,67.4,,,48.8,,,,,38.9,,,,,,7.69,81.0,Unknown,OpenAI,OpenAI/o1,Qwen3 Report Table 11 (Thinking),Thinking
79.8,70.0,,8.76,92.3,86.1,,56.9,91.8,,98.1,2029.0,85.5,,,71.5,,,,,83.3,82.7,,71.6,,64.3,,97.3,,,,73.8,,,92.9,,,86.4,73.5,,,67.7,,,,,47.1,,,,,,7.71,78.7,Unknown,DeepSeek,DeepSeek/DeepSeek-R1,Qwen3 Report Table 11 (Thinking),Thinking
83.9,77.3,,,,,,,,,,,,,,80.2,,,,,,,,,,70.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Unknown,xAI,xAI/Grok-3-Beta,Qwen3 Report Table 11 (Thinking),Thinking
86.7,86.7,,9.03,96.4,85.4,,62.9,82.9,,97.9,2001.0,86.0,,,84.0,,,,,89.5,85.1,,82.4,,70.4,,92.0,,,,75.6,,,93.7,,,86.9,76.9,,,77.8,,,,,52.2,,,,,,8.09,86.7,Unknown,Google,Google/Gemini-2.5-Pro,Qwen3 Report Table 11 (Thinking),Thinking
81.5,81.5,,8.94,95.6,89.0,,70.8,89.6,,98.2,2056.0,84.6,,,71.1,,,,,83.4,78.7,,77.1,,70.7,,85.7,,,,77.1,,,92.7,,,84.3,80.8,,,71.9,,,,,54.7,,,,,,8.03,81.5,No,Qwen,Qwen/Qwen3-235B-A22B,Qwen3 Report Table 11 (Thinking),Thinking
11.1,7.6,,8.42,85.3,65.9,,72.5,75.5,,35.4,864.0,81.1,,,46.0,,,,,86.5,78.8,,52.2,,32.7,,77.2,,,,57.4,,,87.0,,,80.3,9.2,,,65.6,,,,,12.0,,,,,,7.11,27.4,Yes,OpenAI,OpenAI/GPT-4o-2024-11-20,Qwen3 Report Table 12 (Non-thinking),Non-thinking
39.2,28.8,,8.64,85.5,76.1,,57.6,86.5,,54.1,1134.0,74.0,,,59.1,,,,,86.1,76.7,,60.5,,33.1,,90.2,,,,59.3,,,89.1,,,81.2,10.2,,,65.3,,,,,29.2,,,,,,6.49,42.1,Unknown,DeepSeek,DeepSeek/DeepSeek-V3,Qwen3 Report Table 12 (Non-thinking),Non-thinking
18.9,15.0,,7.89,81.2,66.1,,63.4,84.7,,35.0,859.0,61.8,,,49.0,,,,,86.1,69.6,,51.4,,30.7,,83.6,,,,59.2,,,86.8,,,74.3,12.7,,,65.3,,,,,16.7,,,,,,7.06,26.6,Yes,Qwen,Qwen/Qwen2.5-72B-Instruct,Qwen3 Report Table 12 (Non-thinking),Non-thinking
38.5,15.9,,7.89,82.7,75.2,,52.9,83.5,,24.3,712.0,61.3,,,69.8,,,,,86.7,69.0,,59.5,,17.2,,90.6,,,,47.0,,,91.8,,,82.5,27.0,,,75.5,,,,,27.0,,,,,,5.46,40.0,Unknown,Llama,Llama/LLaMA-4-Maverick,Qwen3 Report Table 12 (Non-thinking),Non-thinking
40.1,24.7,,8.91,96.1,83.3,,68.0,86.1,,75.7,1387.0,80.4,,,62.9,,,,,83.2,75.6,,62.5,,35.3,,91.2,,,,67.6,,,89.2,,,79.8,32.4,,,70.2,,,,,32.4,,,,,,7.7,37.7,No,Qwen,Qwen/Qwen3-235B-A22B,Qwen3 Report Table 12 (Non-thinking),Non-thinking
,,,,,,,,82.2,,,,,,,62.1,,,,,,,,45.6,,,,,,,,,,,88.2,,,,,,,,,,,,,,,,,,,,Yes,DeepSeek,DeepSeek/DeepSeek-R1-Distill-Qwen-32B,"Qwen3 Report Table 15 (Thinking, Qwen3-30B-A3B / Qwen3-14B / QwQ-32B)",Thinking
,,,,,,,,88.4,,,,,,,65.6,,,,,,,,72.0,,,,,,,,,,,90.0,,,,,,,,,,,,,,,,,,,,Yes,Qwen,Qwen/QwQ-32B,"Qwen3 Report Table 15 (Thinking, Qwen3-30B-A3B / Qwen3-14B / QwQ-32B)",Thinking
,,,,,,,,86.2,,,,,,,64.0,,,,,,,,71.3,,,,,,,,,,,88.6,,,,,,,,,,,,,,,,,,,,No,Qwen,Qwen/Qwen3-14B,"Qwen3 Report Table 15 (Thinking, Qwen3-30B-A3B / Qwen3-14B / QwQ-32B)",Thinking
,,,,,,,,86.6,,,,,,,65.8,,,,,,,,74.3,,,,,,,,,,,89.5,,,,,,,,,,,,,,,,,,,,No,Qwen,Qwen/Qwen3-30B-A3B,"Qwen3 Report Table 15 (Thinking, Qwen3-30B-A3B / Qwen3-14B / QwQ-32B)",Thinking
,,,,,,,,66.9,,,,,,,42.4,,,,,,,,41.6,,,,,,,,,,,82.6,,,,,,,,,,,,,,,,,,,,Yes,Gemma,Google/Gemma-3-27B-IT,"Qwen3 Report Table 16 (Non-thinking, Qwen3-30B-A3B / Qwen3-14B / Gemma-3-27B)",Non-thinking
,,,,,,,,86.0,,,,,,,49.5,,,,,,,,50.0,,,,,,,,,,,83.9,,,,,,,,,,,,,,,,,,,,Yes,Qwen,Qwen/Qwen2.5-32B-Instruct,"Qwen3 Report Table 16 (Non-thinking, Qwen3-30B-A3B / Qwen3-14B / Gemma-3-27B)",Non-thinking
,,,,,,,,81.0,,,,,,,54.8,,,,,,,,59.6,,,,,,,,,,,82.0,,,,,,,,,,,,,,,,,,,,No,Qwen,Qwen/Qwen3-14B,"Qwen3 Report Table 16 (Non-thinking, Qwen3-30B-A3B / Qwen3-14B / Gemma-3-27B)",Non-thinking
,,,,,,,,82.9,,,,,,,54.8,,,,,,,,59.4,,,,,,,,,,,84.1,,,,,,,,,,,,,,,,,,,,No,Qwen,Qwen/Qwen3-30B-A3B,"Qwen3 Report Table 16 (Non-thinking, Qwen3-30B-A3B / Qwen3-14B / Gemma-3-27B)",Non-thinking
,,,,,,,,78.1,,,,,,,59.1,,,,,,,,52.3,,,,,,,,,,,84.1,,,,,,,,,,,,,,,,,,,,Yes,DeepSeek,DeepSeek/DeepSeek-R1-Distill-Qwen-14B,"Qwen3 Report Table 17 (Thinking, Qwen3-8B / Qwen3-4B)",Thinking
,,,,,,,,82.2,,,,,,,62.1,,,,,,,,45.6,,,,,,,,,,,88.2,,,,,,,,,,,,,,,,,,,,Yes,DeepSeek,DeepSeek/DeepSeek-R1-Distill-Qwen-32B,"Qwen3 Report Table 17 (Thinking, Qwen3-8B / Qwen3-4B)",Thinking
,,,,,,,,77.5,,,,,,,55.9,,,,,,,,63.6,,,,,,,,,,,83.7,,,,,,,,,,,,,,,,,,,,No,Qwen,Qwen/Qwen3-4B,"Qwen3 Report Table 17 (Thinking, Qwen3-8B / Qwen3-4B)",Thinking
,,,,,,,,83.4,,,,,,,62.0,,,,,,,,67.1,,,,,,,,,,,87.5,,,,,,,,,,,,,,,,,,,,No,Qwen,Qwen/Qwen3-8B,"Qwen3 Report Table 17 (Thinking, Qwen3-8B / Qwen3-4B)",Thinking
,,,,,,,,52.0,,,,,,,32.8,,,,,,,,26.0,,,,,,,,,,,61.7,,,,,,,,,,,,,,,,,,,,Yes,Llama,Llama/LLaMA-3.1-8B-Instruct,"Qwen3 Report Table 18 (Non-thinking, Qwen3-8B / Qwen3-4B / LLaMA-3.1-8B)",Non-thinking
,,,,,,,,61.1,,,,,,,40.9,,,,,,,,43.7,,,,,,,,,,,77.8,,,,,,,,,,,,,,,,,,,,Yes,Gemma,Google/Gemma-3-12B-IT,"Qwen3 Report Table 18 (Non-thinking, Qwen3-8B / Qwen3-4B / LLaMA-3.1-8B)",Non-thinking
,,,,,,,,78.0,,,,,,,36.4,,,,,,,,58.9,,,,,,,,,,,75.4,,,,,,,,,,,,,,,,,,,,Yes,Qwen,Qwen/Qwen2.5-7B-Instruct,"Qwen3 Report Table 18 (Non-thinking, Qwen3-8B / Qwen3-4B / LLaMA-3.1-8B)",Non-thinking
,,,,,,,,72.2,,,,,,,45.5,,,,,,,,60.7,,,,,,,,,,,80.0,,,,,,,,,,,,,,,,,,,,Yes,Qwen,Qwen/Qwen2.5-14B-Instruct,"Qwen3 Report Table 18 (Non-thinking, Qwen3-8B / Qwen3-4B / LLaMA-3.1-8B)",Non-thinking
,,,,,,,,72.2,,,,,,,41.7,,,,,,,,48.4,,,,,,,,,,,77.3,,,,,,,,,,,,,,,,,,,,No,Qwen,Qwen/Qwen3-4B,"Qwen3 Report Table 18 (Non-thinking, Qwen3-8B / Qwen3-4B / LLaMA-3.1-8B)",Non-thinking
,,,,,,,,77.9,,,,,,,39.3,,,,,,,,53.5,,,,,,,,,,,79.5,,,,,,,,,,,,,,,,,,,,No,Qwen,Qwen/Qwen3-8B,"Qwen3 Report Table 18 (Non-thinking, Qwen3-8B / Qwen3-4B / LLaMA-3.1-8B)",Non-thinking
,,,,,,,,27.1,,,,,,,33.8,,,,,,,,24.9,,,,,,,,,,,45.4,,,,,,,,,,,,,,,,,,,,Yes,DeepSeek,DeepSeek/DeepSeek-R1-Distill-Qwen-1.5B,"Qwen3 Report Table 19 (Thinking, Qwen3-1.7B / Qwen3-0.6B)",Thinking
,,,,,,,,50.4,,,,,,,49.0,,,,,,,,40.6,,,,,,,,,,,66.4,,,,,,,,,,,,,,,,,,,,Yes,DeepSeek,DeepSeek/DeepSeek-R1-Distill-LLaMA-8B,"Qwen3 Report Table 19 (Thinking, Qwen3-1.7B / Qwen3-0.6B)",Thinking
,,,,,,,,30.3,,,,,,,27.9,,,,,,,,30.3,,,,,,,,,,,55.6,,,,,,,,,,,,,,,,,,,,No,Qwen,Qwen/Qwen3-0.6B,"Qwen3 Report Table 19 (Thinking, Qwen3-1.7B / Qwen3-0.6B)",Thinking
,,,,,,,,68.1,,,,,,,40.1,,,,,,,,51.1,,,,,,,,,,,73.9,,,,,,,,,,,,,,,,,,,,No,Qwen,Qwen/Qwen3-1.7B,"Qwen3 Report Table 19 (Thinking, Qwen3-1.7B / Qwen3-0.6B)",Thinking
,,,,,,,,28.5,,,,,,,19.2,,,,,,,,14.4,,,,,,,,,,,33.3,,,,,,,,,,,,,,,,,,,,Yes,Gemma,Google/Gemma-3-1B-IT,"Qwen3 Report Table 20 (Non-thinking, Qwen3-1.7B / Qwen3-0.6B)",Non-thinking
,,,,,,,,40.0,,,,,,,25.2,,,,,,,,25.3,,,,,,,,,,,67.9,,,,,,,,,,,,,,,,,,,,Yes,Phi,Phi/Phi-4-mini,"Qwen3 Report Table 20 (Non-thinking, Qwen3-1.7B / Qwen3-0.6B)",Non-thinking
,,,,,,,,53.3,,,,,,,29.8,,,,,,,,18.0,,,,,,,,,,,50.7,,,,,,,,,,,,,,,,,,,,Yes,Qwen,Qwen/Qwen2.5-1.5B-Instruct,"Qwen3 Report Table 20 (Non-thinking, Qwen3-1.7B / Qwen3-0.6B)",Non-thinking
,,,,,,,,68.2,,,,,,,30.3,,,,,,,,23.8,,,,,,,,,,,64.4,,,,,,,,,,,,,,,,,,,,Yes,Qwen,Qwen/Qwen2.5-3B-Instruct,"Qwen3 Report Table 20 (Non-thinking, Qwen3-1.7B / Qwen3-0.6B)",Non-thinking
,,,,,,,,42.6,,,,,,,22.9,,,,,,,,21.8,,,,,,,,,,,44.6,,,,,,,,,,,,,,,,,,,,No,Qwen,Qwen/Qwen3-0.6B,"Qwen3 Report Table 20 (Non-thinking, Qwen3-1.7B / Qwen3-0.6B)",Non-thinking
,,,,,,,,61.0,,,,,,,28.6,,,,,,,,35.6,,,,,,,,,,,64.4,,,,,,,,,,,,,,,,,,,,No,Qwen,Qwen/Qwen3-1.7B,"Qwen3 Report Table 20 (Non-thinking, Qwen3-1.7B / Qwen3-0.6B)",Non-thinking
,,,,,,,,,,,,,,23.7,,40.1,,31.1,,14.6,,7.4,,1.6,,13.9,,39.7,,,,,14.4,12.9,,,,,,,,,,,20.8,,,,,,,,,Yes,Qwen,Qwen/Qwen2-0.5B,Qwen2.5 Report Table 10 (0.5B-1.5B instruct-tuned models),Non-thinking
,,,,,,,,,,,,,,29.8,,49.6,,35.4,,27.9,,12.6,,5.1,,34.4,,49.6,,,,,15.0,24.1,,,,,,,,,,,28.5,,,,,,,,,Yes,Qwen,Qwen/Qwen2.5-0.5B,Qwen2.5 Report Table 10 (0.5B-1.5B instruct-tuned models),Non-thinking
,,,,,,,,,,,,,,21.2,,61.6,,42.1,,29.0,,12.4,,4.5,,25.3,,44.2,,,,,22.9,41.2,,,,,,,,,,,38.5,,,,,,,,,Yes,Qwen,Qwen/Qwen2-1.5B,Qwen2.5 Report Table 10 (0.5B-1.5B instruct-tuned models),Non-thinking
,,,,,,,,,,,,,,29.8,,73.2,,61.6,,42.5,,18.8,,14.8,,55.2,,63.2,,,,,32.4,50.7,,,,,,,,,,,50.4,,,,,,,,,Yes,Qwen,Qwen/Qwen2.5-1.5B,Qwen2.5 Report Table 10 (0.5B-1.5B instruct-tuned models),Non-thinking
,,68.8,,,,81.0,,,,,,,,36.3,,77.6,88.0,48.2,42.1,,,,,,,42.5,,70.4,58.4,,,79.5,52.8,75.0,73.7,,,,,70.0,,67.1,38.0,79.9,46.3,,,,32.3,45.6,85.3,,,No,Llama,Llama/Llama-3-70B,Qwen2.5 Report Table 2 (70B+ base models & Qwen2.5-Plus),Non-thinking
,,70.7,,,,78.9,,,,,,,,34.3,,83.7,88.7,46.3,40.2,,,,,,,41.7,,71.7,58.1,,,77.8,51.6,72.9,71.7,,,,,63.5,,62.9,23.3,77.7,46.7,,,,35.9,51.0,85.0,,,No,Mistral,Mistral/Mixtral-8x22B,Qwen2.5 Report Table 2 (70B+ base models & Qwen2.5-Plus),Non-thinking
,,,,,,85.9,,,,,,,,,,89.0,,61.0,,,,,,,,53.8,,73.0,63.9,,,85.2,61.6,,,,,,,,,,,,,,,,,,86.7,,,No,Llama,Llama/Llama-3-405B,Qwen2.5 Report Table 2 (70B+ base models & Qwen2.5-Plus),Non-thinking
,,68.9,,,,82.4,,,,,,,,37.4,,89.0,87.3,64.6,56.1,,,,,,,50.9,,76.9,63.9,,,84.2,55.7,80.5,79.6,,,,,76.6,,76.0,37.8,80.7,59.6,,,,42.8,54.8,85.1,,,No,Qwen,Qwen/Qwen2-72B,Qwen2.5 Report Table 2 (70B+ base models & Qwen2.5-Plus),Non-thinking
,,72.4,,,,86.3,,,,,,,,45.9,,91.5,87.6,59.1,51.2,,,,,,,62.1,,84.7,69.2,,,86.1,58.1,83.9,82.7,,,,,78.7,,76.7,39.0,89.6,60.5,,,,42.4,60.4,83.9,,,No,Qwen,Qwen/Qwen2.5-72B,Qwen2.5 Report Table 2 (70B+ base models & Qwen2.5-Plus),Non-thinking
,,70.9,,,,85.8,,,,,,,,43.9,,93.0,89.2,59.1,52.4,,,,,,,64.4,,79.7,66.9,,,85.4,64.0,82.8,81.2,,,,,78.5,,82.4,40.4,89.2,61.0,,,,48.5,55.3,85.5,,,No,Qwen,Qwen/Qwen2.5-Plus,Qwen2.5 Report Table 2 (70B+ base models & Qwen2.5-Plus),Non-thinking
,,60.0,,,,56.1,,,,,,,,24.7,,36.2,83.3,29.3,24.4,,,,,,,10.2,,51.1,40.9,,,64.2,30.9,58.1,50.1,,,,,47.1,,26.3,23.3,63.3,29.4,,,,19.2,42.2,78.4,,,No,Mistral,Mistral/Mistral-7B,Qwen2.5 Report Table 4 (7B+ base models),Non-thinking
,,59.3,,,,57.7,,,,,,,,25.8,,55.3,82.1,33.5,29.3,,,,,,,20.5,,53.9,44.4,,,66.6,35.4,61.6,55.3,,,,,52.3,,36.3,31.9,68.6,33.5,,,,22.1,44.0,77.4,,,No,Llama,Llama/Llama-3-8B,Qwen2.5 Report Table 4 (7B+ base models),Non-thinking
,,68.2,,,,68.2,,,,,,,,32.8,,70.7,81.9,37.8,30.5,,,,,,,37.7,,62.2,50.6,,,71.3,44.7,67.9,65.1,,,,,61.2,,53.0,36.5,78.3,34.9,,,,28.9,45.3,79.5,,,No,Gemma,Google/Gemma-2-9B,Qwen2.5 Report Table 4 (7B+ base models),Non-thinking
,,60.6,,,,62.3,,,,,,,,30.8,,80.2,80.7,51.2,43.3,,,,,,,43.5,,64.2,51.9,,,70.3,40.1,68.1,64.2,,,,,59.2,,57.5,31.5,72.0,41.0,,,,29.6,54.2,77.0,,,No,Qwen,Qwen/Qwen2-7B,Qwen2.5 Report Table 4 (7B+ base models),Non-thinking
,,63.7,,,,70.4,,,,,,,,36.4,,85.4,80.2,57.9,50.6,,,,,,,49.8,,74.9,62.9,,,74.2,45.0,71.1,72.3,,,,,59.4,,57.8,32.4,79.3,50.3,,,,36.0,56.4,75.9,,,No,Qwen,Qwen/Qwen2.5-7B,Qwen2.5 Report Table 4 (7B+ base models),Non-thinking
,,,,55.7,,,,,,,,,,46.7,,95.1,,80.5,,83.6,,46.6,,32.1,,68.0,,84.2,,,,,66.4,83.0,,,,,8.79,,,,,,68.2,,,,,,,,,Yes,Llama,Llama/LLaMA-3.1-70B,Qwen2.5 Report Table 6 (70B+ Instruct models),Non-thinking
,,,,69.3,,,,,,,,,,51.1,,96.8,,89.0,,86.0,,53.2,,41.6,,73.8,,84.5,,,,,73.3,86.2,,,,,9.08,,,,,,73.5,,,,,,,,,Yes,Llama,Llama/LLaMA-3.1-405B,Qwen2.5 Report Table 6 (70B+ Instruct models),Non-thinking
,,,,48.1,,,,,,,,,,42.4,,93.2,,86.0,,77.6,,41.5,,32.2,,69.0,,80.2,,,,,64.4,81.6,,,,,9.12,,,,,,69.2,,,,,,,,,Yes,Qwen,Qwen/Qwen2-72B,Qwen2.5 Report Table 6 (70B+ Instruct models),Non-thinking
,,,,81.2,,,,,,,,,,49.0,,95.8,,86.6,,84.1,,52.3,,55.5,,83.1,,88.2,,,,,71.1,86.8,,,,,9.35,,,,,,75.1,,,,,,,,,Yes,Qwen,Qwen/Qwen2.5-72B,Qwen2.5 Report Table 6 (70B+ Instruct models),Non-thinking
,,,,81.4,,,,,,,,,,49.7,,96.0,,87.8,,86.3,,54.6,,51.4,,84.7,,85.5,,,,,72.5,86.3,,,,,9.3,,,,,,77.0,,,,,,,,,Yes,Qwen,Qwen/Qwen2.5-Plus,Qwen2.5 Report Table 6 (70B+ Instruct models),Non-thinking
,,,,17.8,,,,,,,,,,34.3,,85.3,,79.9,,59.9,,31.1,,22.5,,49.1,,70.9,,,,,52.8,72.6,,,,,8.55,,,,,,66.4,,,,,,,,,Yes,Qwen,Qwen/Qwen2-57B-A14B,Qwen2.5 Report Table 7 (14B-30B+ Instruct models),Non-thinking
,,,,57.5,,,,,,,,,,38.4,,90.4,,78.7,,77.1,,39.6,,,,54.4,,81.0,,,,,55.5,75.7,,,,,9.1,,,,,,67.4,,,,,,,,,Yes,Gemma,Google/Gemma-2-27B,Qwen2.5 Report Table 7 (14B-30B+ Instruct models),Non-thinking
,,,,74.9,,,,,,,,,,40.2,,93.2,,88.4,,80.4,,43.3,,40.7,,70.2,,85.7,,,,,63.1,81.5,,,,,,,,,,,75.0,,,,,,,,,Yes,OpenAI,OpenAI/GPT-4o-mini,Qwen2.5 Report Table 7 (14B-30B+ Instruct models),Non-thinking
,,,,67.1,,,,,,,,,,42.3,,93.8,,86.6,,76.3,,42.3,,37.8,,81.1,,82.8,,,,,64.5,81.7,,,,,8.88,,,,,,73.7,,,,,,,,,Yes,Qwen,Qwen/Qwen2.5-Turbo,Qwen2.5 Report Table 7 (14B-30B+ Instruct models),Non-thinking
,,,,68.3,,,,,,,,,,45.5,,94.8,,83.5,,81.0,,44.4,,42.6,,80.0,,82.0,,,,,63.7,80.0,,,,,8.82,,,,,,72.8,,,,,,,,,Yes,Qwen,Qwen/Qwen2.5-14B,Qwen2.5 Report Table 7 (14B-30B+ Instruct models),Non-thinking
,,,,74.5,,,,,,,,,,49.5,,95.9,,88.4,,79.5,,50.7,,51.2,,83.1,,84.0,,,,,69.0,83.9,,,,,9.2,,,,,,75.4,,,,,,,,,Yes,Qwen,Qwen/Qwen2.5-32B,Qwen2.5 Report Table 7 (14B-30B+ Instruct models),Non-thinking
,,,,,,,,,,,,,,29.3,,63.2,,68.9,,51.0,,20.1,,5.8,,26.6,,74.9,,,,,26.7,51.9,,,,,,,,,,,30.5,,,,,,,,,Yes,Gemma,Google/Gemma-2-2B,Qwen2.5 Report Table 9 (2B-4B instruct-tuned models),Non-thinking
,,,,,,,,,,,,,,27.2,,86.2,,72.6,,52.1,,27.4,,15.8,,48.5,,63.2,,,,,47.5,67.7,,,,,,,,,,,47.2,,,,,,,,,Yes,Phi,Phi/Phi-3.5-Mini,Qwen2.5 Report Table 9 (2B-4B instruct-tuned models),Non-thinking
,,,,,,,,,,,,,,31.3,,81.1,,74.4,,68.4,,27.6,,23.8,,46.6,,72.5,,,,,43.0,59.9,,,,,,,,,,,49.1,,,,,,,,,Yes,MiniCPM,MiniCPM/MiniCPM-3-4B,Qwen2.5 Report Table 9 (2B-4B instruct-tuned models),Non-thinking
,,,,,,,,,,,,,,30.3,,86.7,,74.4,,58.2,,26.8,,19.9,,65.9,,72.7,,,,,43.7,64.4,,,,,,,,,,,60.2,,,,,,,,,Yes,Qwen,Qwen/Qwen2.5-3B,Qwen2.5 Report Table 9 (2B-4B instruct-tuned models),Non-thinking
