model_name,thinking_mode,model_family,instruction_tuned,source,AIME'24,AIME'25,AlignBench_v1.1,Arena-Hard,AutoLogi,BBH,BFCL_v3,C-Eval,CRUX-O,CodeForces_Percentile,CodeForces_Rating,Creative_Writing_v3,EvalPlus,GPQA,GPQA-Diamond,GSM8K,IFEval_strict_prompt,INCLUDE,LiveBench_2024-11-25,LiveCodeBench_v5,MATH,MATH-500,MBPP,MGSM,MLogiQA,MMLU,MMLU-Pro,MMLU-Redux,MMMLU,MMMLU_14_langs,MT-AIME2024,Multi-IF,MultiPL-E,PolyMath,RULER_Avg,SuperGPQA,WritingBench,ZebraLogic
DeepSeek/DeepSeek-R1-Distill-Llama-70B,Thinking,DeepSeek,Yes,Qwen3 Report Table 13,,,,,,,,71.8,,,,,,,65.2,,,,54.5,,,,,,,,,89.3,,,,,,,,,,
Qwen/QwQ-32B,Thinking,Qwen,Yes,Qwen3 Report Table 13,,,,,,,,88.4,,,,,,,65.6,,,,72.0,,,,,,,,,90.0,,,,,,,,,,
OpenAI/o3-mini,Thinking,OpenAI,Yes,Qwen3 Report Table 13,,,,,,,,75.1,,,,,,,76.8,,,,70.0,,,,,,,,,90.0,,,,,,,,,,
Qwen/Qwen3-32B,Thinking,Qwen,Varies,Qwen3 Report Table 23 (RULER),31.0,20.2,8.58,92.8,78.5,,63.0,83.3,,71.0,1353.0,78.3,,,54.6,,83.2,70.9,59.8,31.3,,88.6,,,62.9,,,85.7,,76.5,24.1,70.7,,22.5,91.0,,7.54,29.2
OpenAI/GPT-4o-mini-2024-07-18,Non-thinking,OpenAI,Yes,Qwen3 Report Table 14,8.1,8.8,7.81,74.9,52.6,,64.0,66.3,,52.6,1113.0,70.3,,,40.2,,80.4,66.0,41.3,27.9,,78.2,,,42.6,,,81.5,,72.1,6.0,62.4,,12.0,,,5.98,20.1
Llama/LLaMA-4-Scout,Non-thinking,Llama,Unknown,Qwen3 Report Table 14,28.6,10.0,7.49,70.5,56.8,,45.4,78.2,,43.7,981.0,55.0,,,57.2,,84.7,74.1,47.6,29.8,,82.6,,,53.9,,,86.3,,77.5,19.1,64.2,,20.9,,,5.49,24.2
Qwen/Qwen2.5-72B-Instruct,Non-thinking,Qwen,Varies,Qwen3 Report Table 23 (RULER),18.9,15.0,7.89,81.2,66.1,,63.4,84.7,,35.0,859.0,61.8,,,49.0,,84.1,69.6,51.4,30.7,,83.6,,,59.3,,,86.8,,76.9,12.7,65.3,,16.9,95.1,,7.06,26.6
Qwen/Qwen2.5-7B-Instruct,Non-thinking,Qwen,Varies,Qwen3 Report Table 23 (RULER),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,85.4,,,
Qwen/Qwen2.5-14B-Instruct,Non-thinking,Qwen,Varies,Qwen3 Report Table 23 (RULER),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,91.4,,,
Qwen/Qwen2.5-32B-Instruct,Non-thinking,Qwen,Varies,Qwen3 Report Table 23 (RULER),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,92.9,,,
Qwen/Qwen3-4B,Thinking,Qwen,Varies,Qwen3 Report Table 23 (RULER),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,83.5,,,
Qwen/Qwen3-8B,Thinking,Qwen,Varies,Qwen3 Report Table 23 (RULER),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,84.4,,,
Qwen/Qwen3-14B,Thinking,Qwen,Varies,Qwen3 Report Table 23 (RULER),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,89.1,,,
Qwen/Qwen3-30B-A3B,Thinking,Qwen,Varies,Qwen3 Report Table 23 (RULER),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,86.6,,,
Qwen/Qwen3-235B-A22B,Thinking,Qwen,Varies,Qwen3 Report Table 23 (RULER),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,92.2,,,
Qwen/Qwen2.5-32B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 4 (Base models @~32B),,,,,,84.48,,,67.8,,,,66.25,47.97,,92.87,,64.35,,,57.7,,73.6,78.12,,83.32,55.1,81.97,82.4,,,,58.3,,,33.55,,
Qwen/Qwen2.5-72B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 4 (Base models @~32B),,,,,,86.3,,,68.5,,,,65.93,45.88,,91.5,,69.05,,,62.12,,75.4,82.4,,86.06,58.07,83.91,83.49,,,,58.7,,,36.2,,
Google/Gemma-3-27B-Base,Non-thinking,Gemma,No,Qwen3 Report Table 4 (Base models @~32B),,,,,,79.95,,,60.4,,,,55.78,26.26,,81.2,,68.94,,,51.78,,68.4,73.74,,78.69,52.88,76.53,77.62,,,,45.03,,,29.87,,
Llama/Llama-4-Scout-Base,Non-thinking,Llama,No,Qwen3 Report Table 4 (Base models @~32B),,,,,,82.4,,,61.9,,,,59.9,40.4,,85.37,,68.09,,,51.66,,68.6,79.93,,78.27,56.13,71.09,82.05,,,,47.38,,,26.51,,
Qwen/Qwen3-32B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 4 (Base models @~32B),,,,,,87.38,,,72.5,,,,72.05,49.49,,93.4,,67.67,,,61.62,,78.2,83.06,,83.61,65.54,83.41,83.83,,,,67.06,,,39.78,,
Llama/Llama-3-8B-Base,Non-thinking,Llama,No,Qwen3 Report Table 6 (Base models @~8B),,,,,,57.7,,,36.8,,,,44.13,25.8,,55.3,,44.94,,,20.5,,48.4,38.92,,66.6,35.56,61.59,59.65,,,,31.45,,,20.54,,
Qwen/Qwen2.5-7B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 7 (Base models @~4B),,,,,,70.4,,,48.5,,,,62.18,36.36,,85.36,,53.98,,,49.8,,63.4,63.6,,74.16,45.0,71.06,71.34,,,,50.73,,,26.34,,
Qwen/Qwen2.5-14B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 6 (Base models @~8B),,,,,,78.18,,,61.1,,,,60.7,32.83,,90.22,,60.26,,,55.64,,69.0,74.68,,79.66,51.16,76.64,78.34,,,,54.79,,,30.68,,
Qwen/Qwen3-8B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 6 (Base models @~8B),,,,,,78.4,,,62.0,,,,67.65,44.44,,89.84,,59.4,,,60.8,,69.8,76.02,,76.89,56.73,72.17,75.72,,,,58.75,,,31.64,,
Google/Gemma-3-4B-Base,Non-thinking,Gemma,No,Qwen3 Report Table 7 (Base models @~4B),,,,,,51.7,,,34.0,,,,43.23,24.24,,43.97,,49.06,,,26.1,,46.4,33.11,,59.51,29.23,56.91,59.62,,,,28.06,,,17.68,,
Qwen/Qwen2.5-3B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 7 (Base models @~4B),,,,,,56.3,,,36.5,,,,46.28,26.26,,79.08,,45.9,,,42.64,,54.6,47.53,,65.62,34.61,63.68,65.55,,,,39.65,,,20.31,,
Qwen/Qwen3-4B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 7 (Base models @~4B),,,,,,72.59,,,55.0,,,,63.53,36.87,,87.79,,56.29,,,54.1,,67.0,67.74,,72.99,50.58,72.79,71.42,,,,53.13,,,28.43,,
