model_name,thinking_mode,model_family,instruction_tuned,source,AIME'24,AIME'25,AlignBench_v1.1,Arena-Hard,AutoLogi,BBH,BFCL_v3,C-Eval,CRUX-O,CodeForces_Percentile,CodeForces_Rating,Creative_Writing_v3,EvalPlus,GPQA,GPQA-Diamond,GSM8K,IFEval_strict_prompt,INCLUDE,LiveBench_2024-11-25,LiveCodeBench_v5,MATH,MATH-500,MBPP,MGSM,MLogiQA,MMLU,MMLU-Pro,MMLU-Redux,MMMLU,MMMLU_14_langs,MT-AIME2024,Multi-IF,MultiPL-E,PolyMath,RULER_Avg,SuperGPQA,WritingBench,ZebraLogic
DeepSeek/DeepSeek-R1-Distill-Llama-70B,Thinking,DeepSeek,Yes,Qwen3 Report Table 13,,,,,,,,71.8,,,,,,,65.2,,,,54.5,,,,,,,,,89.3,,,,,,,,,,
Qwen/QwQ-32B,Thinking,Qwen,Yes,"Qwen3 Report Table 15 (Thinking, Qwen3-30B-A3B / Qwen3-14B / QwQ-32B)",,,,,,,,88.4,,,,,,,65.6,,,,72.0,,,,,,,,,90.0,,,,,,,,,,
OpenAI/o3-mini,Thinking,OpenAI,Yes,Qwen3 Report Table 13,,,,,,,,75.1,,,,,,,76.8,,,,70.0,,,,,,,,,90.0,,,,,,,,,,
Qwen/Qwen3-32B,Thinking,Qwen,Varies,Qwen3 Report Table 23 (RULER),31.0,20.2,8.58,92.8,78.5,,63.0,83.3,,71.0,1353.0,78.3,,,54.6,,83.2,70.9,59.8,31.3,,88.6,,,62.9,,,85.7,,76.5,24.1,70.7,,22.5,91.0,,7.54,29.2
OpenAI/GPT-4o-mini-2024-07-18,Non-thinking,OpenAI,Yes,Qwen3 Report Table 14,8.1,8.8,7.81,74.9,52.6,,64.0,66.3,,52.6,1113.0,70.3,,,40.2,,80.4,66.0,41.3,27.9,,78.2,,,42.6,,,81.5,,72.1,6.0,62.4,,12.0,,,5.98,20.1
Llama/LLaMA-4-Scout,Non-thinking,Llama,Unknown,Qwen3 Report Table 14,28.6,10.0,7.49,70.5,56.8,,45.4,78.2,,43.7,981.0,55.0,,,57.2,,84.7,74.1,47.6,29.8,,82.6,,,53.9,,,86.3,,77.5,19.1,64.2,,20.9,,,5.49,24.2
Qwen/Qwen2.5-72B-Instruct,Non-thinking,Qwen,Yes,Qwen3 Report Table 12 (Non-thinking),18.9,15.0,7.89,81.2,66.1,,63.4,84.7,,35.0,859,61.8,,,49.0,,86.1,69.6,51.4,30.7,,83.6,,,59.2,,,86.8,,74.3,12.7,65.3,,16.7,95.1,,7.06,26.6
Qwen/Qwen2.5-7B-Instruct,Non-thinking,Qwen,Yes,"Qwen3 Report Table 18 (Non-thinking, Qwen3-8B / Qwen3-4B / LLaMA-3.1-8B)",,,,,,,,78.0,,,,,,,36.4,,,,58.9,,,,,,,,,75.4,,,,,,,85.4,,,
Qwen/Qwen2.5-14B-Instruct,Non-thinking,Qwen,Yes,"Qwen3 Report Table 18 (Non-thinking, Qwen3-8B / Qwen3-4B / LLaMA-3.1-8B)",,,,,,,,72.2,,,,,,,45.5,,,,60.7,,,,,,,,,80.0,,,,,,,91.4,,,
Qwen/Qwen2.5-32B-Instruct,Non-thinking,Qwen,Yes,"Qwen3 Report Table 16 (Non-thinking, Qwen3-30B-A3B / Qwen3-14B / Gemma-3-27B)",,,,,,,,86.0,,,,,,,49.5,,,,50.0,,,,,,,,,83.9,,,,,,,92.9,,,
Qwen/Qwen3-4B,Non-thinking,Qwen,No,"Qwen3 Report Table 18 (Non-thinking, Qwen3-8B / Qwen3-4B / LLaMA-3.1-8B)",,,,,,,,72.2,,,,,,,41.7,,,,48.4,,,,,,,,,77.3,,,,,,,83.5,,,
Qwen/Qwen3-8B,Non-thinking,Qwen,No,"Qwen3 Report Table 18 (Non-thinking, Qwen3-8B / Qwen3-4B / LLaMA-3.1-8B)",,,,,,,,77.9,,,,,,,39.3,,,,53.5,,,,,,,,,79.5,,,,,,,84.4,,,
Qwen/Qwen3-14B,Non-thinking,Qwen,No,"Qwen3 Report Table 16 (Non-thinking, Qwen3-30B-A3B / Qwen3-14B / Gemma-3-27B)",,,,,,,,81.0,,,,,,,54.8,,,,59.6,,,,,,,,,82.0,,,,,,,89.1,,,
Qwen/Qwen3-30B-A3B,Non-thinking,Qwen,No,"Qwen3 Report Table 16 (Non-thinking, Qwen3-30B-A3B / Qwen3-14B / Gemma-3-27B)",,,,,,,,82.9,,,,,,,54.8,,,,59.4,,,,,,,,,84.1,,,,,,,86.6,,,
Qwen/Qwen3-235B-A22B,Non-thinking,Qwen,No,Qwen3 Report Table 12 (Non-thinking),40.1,24.7,8.91,96.1,83.3,,68.0,86.1,,75.7,1387,80.4,,,62.9,,83.2,75.6,62.5,35.3,,91.2,,,67.6,,,89.2,,79.8,32.4,70.2,,32.4,92.2,,7.7,37.7
Qwen/Qwen2.5-32B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 4 (Base models @~32B),,,,,,84.48,,,67.8,,,,66.25,47.97,,92.87,,64.35,,,57.7,,73.6,78.12,,83.32,55.1,81.97,82.4,,,,58.3,,,33.55,,
Qwen/Qwen2.5-72B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 4 (Base models @~32B),,,,,,86.3,,,68.5,,,,65.93,45.88,,91.5,,69.05,,,62.12,,75.4,82.4,,86.06,58.07,83.91,83.49,,,,58.7,,,36.2,,
Google/Gemma-3-27B-Base,Non-thinking,Gemma,No,Qwen3 Report Table 4 (Base models @~32B),,,,,,79.95,,,60.4,,,,55.78,26.26,,81.2,,68.94,,,51.78,,68.4,73.74,,78.69,52.88,76.53,77.62,,,,45.03,,,29.87,,
Llama/Llama-4-Scout-Base,Non-thinking,Llama,No,Qwen3 Report Table 4 (Base models @~32B),,,,,,82.4,,,61.9,,,,59.9,40.4,,85.37,,68.09,,,51.66,,68.6,79.93,,78.27,56.13,71.09,82.05,,,,47.38,,,26.51,,
Qwen/Qwen3-32B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 4 (Base models @~32B),,,,,,87.38,,,72.5,,,,72.05,49.49,,93.4,,67.67,,,61.62,,78.2,83.06,,83.61,65.54,83.41,83.83,,,,67.06,,,39.78,,
Llama/Llama-3-8B-Base,Non-thinking,Llama,No,Qwen3 Report Table 6 (Base models @~8B),,,,,,57.7,,,36.8,,,,44.13,25.8,,55.3,,44.94,,,20.5,,48.4,38.92,,66.6,35.56,61.59,59.65,,,,31.45,,,20.54,,
Qwen/Qwen2.5-7B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 7 (Base models @~4B),,,,,,70.4,,,48.5,,,,62.18,36.36,,85.36,,53.98,,,49.8,,63.4,63.6,,74.16,45.0,71.06,71.34,,,,50.73,,,26.34,,
Qwen/Qwen2.5-14B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 6 (Base models @~8B),,,,,,78.18,,,61.1,,,,60.7,32.83,,90.22,,60.26,,,55.64,,69.0,74.68,,79.66,51.16,76.64,78.34,,,,54.79,,,30.68,,
Qwen/Qwen3-8B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 6 (Base models @~8B),,,,,,78.4,,,62.0,,,,67.65,44.44,,89.84,,59.4,,,60.8,,69.8,76.02,,76.89,56.73,72.17,75.72,,,,58.75,,,31.64,,
Google/Gemma-3-4B-Base,Non-thinking,Gemma,No,Qwen3 Report Table 7 (Base models @~4B),,,,,,51.7,,,34.0,,,,43.23,24.24,,43.97,,49.06,,,26.1,,46.4,33.11,,59.51,29.23,56.91,59.62,,,,28.06,,,17.68,,
Qwen/Qwen2.5-3B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 7 (Base models @~4B),,,,,,56.3,,,36.5,,,,46.28,26.26,,79.08,,45.9,,,42.64,,54.6,47.53,,65.62,34.61,63.68,65.55,,,,39.65,,,20.31,,
Qwen/Qwen3-4B-Base,Non-thinking,Qwen,No,Qwen3 Report Table 7 (Base models @~4B),,,,,,72.59,,,55.0,,,,63.53,36.87,,87.79,,56.29,,,54.1,,67.0,67.74,,72.99,50.58,72.79,71.42,,,,53.13,,,28.43,,
OpenAI/OpenAI-o1,Thinking,OpenAI,Unknown,Qwen3 Report Table 11 (235B-A22B Thinking),74.3,79.2,8.86,92.1,79.8,,67.8,85.5,,96.7,1891,81.7,,,78.0,,92.6,84.6,75.7,63.9,,96.4,,,75.5,,,92.8,,88.4,67.4,48.8,,38.9,,,7.69,81.0
DeepSeek/DeepSeek-R1,Thinking,DeepSeek,Unknown,Qwen3 Report Table 11 (Thinking),79.8,70.0,8.76,92.3,86.1,,56.9,91.8,,98.1,2029,85.5,,,71.5,,83.3,82.7,71.6,64.3,,97.3,,,73.8,,,92.9,,86.4,73.5,67.7,,47.1,,,7.71,78.7
xAI/Grok-3-Beta,Thinking,xAI,Unknown,Qwen3 Report Table 11 (235B-A22B Thinking),,77.3,,,,,,,,,,,,,84.0,,,,,70.6,,83.9,,,,,,80.2,,,,,,,,,,
Google/Gemini2.5-Pro,Thinking,Google,Unknown,Qwen3 Report Table 11 (235B-A22B Thinking),86.7,86.7,9.03,96.4,85.4,,62.9,82.9,,97.9,2001,86.0,,,84.0,,89.5,85.1,82.4,70.4,,92.0,,,75.6,,,93.7,,86.9,76.9,77.8,,52.2,,,8.09,87.4
OpenAI/GPT-4o-2024-11-20,Non-thinking,OpenAI,Yes,Qwen3 Report Table 12 (Non-thinking),11.1,7.6,8.42,85.3,65.9,,72.5,75.5,,35.4,864,81.1,,,46.0,,86.5,78.8,52.2,32.7,,77.2,,,57.4,,,87.0,,80.3,9.2,65.6,,12.0,,,7.11,27.4
DeepSeek/DeepSeek-V3,Non-thinking,DeepSeek,Unknown,Qwen3 Report Table 12 (Non-thinking),39.2,28.8,8.64,85.5,76.1,,57.6,86.5,,54.1,1134,74.0,,,59.1,,86.1,76.7,60.5,33.1,,90.2,,,59.3,,,89.1,,81.2,10.2,65.3,,29.2,,,6.49,42.1
Llama/Llama-4-Maverick,Non-thinking,Llama,Unknown,Qwen3 Report Table 12 (235B-A22B Non-thinking),38.5,15.9,7.89,82.7,75.2,,52.9,83.5,,24.3,712,61.3,,,69.8,,86.7,80.9,59.5,37.2,,90.6,,,47.6,,,91.8,,82.5,27.0,75.5,,50.7,,,5.46,40.0
OpenAI/o1,Thinking,OpenAI,Unknown,Qwen3 Report Table 11 (Thinking),74.3,79.2,8.86,92.1,79.8,,67.8,85.5,,96.7,1891,81.7,,,78.0,,92.6,84.6,75.7,63.9,,96.4,,,75.5,,,92.8,,88.4,67.4,48.8,,38.9,,,7.69,81.0
xAI/Grok-3-Beta,Thinking,xAI,Unknown,Qwen3 Report Table 11 (Thinking),77.3,77.3,,,,,,,,,,,,,80.2,,,,,70.6,,83.9,,,,,,,,,,,,,,,,
Google/Gemini-2.5-Pro,Thinking,Google,Unknown,Qwen3 Report Table 11 (Thinking),86.7,86.7,9.03,96.4,85.4,,62.9,82.9,,97.9,2001,86.0,,,84.0,,89.5,85.1,82.4,70.4,,92.0,,,75.6,,,93.7,,86.9,76.9,77.8,,52.2,,,8.09,86.7
Llama/LLaMA-4-Maverick,Non-thinking,Llama,Unknown,Qwen3 Report Table 12 (Non-thinking),38.5,15.9,7.89,82.7,75.2,,52.9,83.5,,24.3,712,61.3,,,69.8,,86.7,69.0,59.5,17.2,,90.6,,,47.0,,,91.8,,82.5,27.0,75.5,,27.0,,,5.46,40.0
DeepSeek/DeepSeek-R1-Distill-Qwen-32B,Thinking,DeepSeek,Yes,"Qwen3 Report Table 17 (Thinking, Qwen3-8B / Qwen3-4B)",,,,,,,,82.2,,,,,,,62.1,,,,45.6,,,,,,,,,88.2,,,,,,,,,,
Google/Gemma-3-27B-IT,Non-thinking,Gemma,Yes,"Qwen3 Report Table 16 (Non-thinking, Qwen3-30B-A3B / Qwen3-14B / Gemma-3-27B)",,,,,,,,66.9,,,,,,,42.4,,,,41.6,,,,,,,,,82.6,,,,,,,,,,
DeepSeek/DeepSeek-R1-Distill-Qwen-14B,Thinking,DeepSeek,Yes,"Qwen3 Report Table 17 (Thinking, Qwen3-8B / Qwen3-4B)",,,,,,,,78.1,,,,,,,59.1,,,,52.3,,,,,,,,,84.1,,,,,,,,,,
Llama/LLaMA-3.1-8B-Instruct,Non-thinking,Llama,Yes,"Qwen3 Report Table 18 (Non-thinking, Qwen3-8B / Qwen3-4B / LLaMA-3.1-8B)",,,,,,,,52.0,,,,,,,32.8,,,,26.0,,,,,,,,,61.7,,,,,,,,,,
Google/Gemma-3-12B-IT,Non-thinking,Gemma,Yes,"Qwen3 Report Table 18 (Non-thinking, Qwen3-8B / Qwen3-4B / LLaMA-3.1-8B)",,,,,,,,61.1,,,,,,,40.9,,,,43.7,,,,,,,,,77.8,,,,,,,,,,
DeepSeek/DeepSeek-R1-Distill-Qwen-1.5B,Thinking,DeepSeek,Yes,"Qwen3 Report Table 19 (Thinking, Qwen3-1.7B / Qwen3-0.6B)",,,,,,,,27.1,,,,,,,33.8,,,,24.9,,,,,,,,,45.4,,,,,,,,,,
DeepSeek/DeepSeek-R1-Distill-LLaMA-8B,Thinking,DeepSeek,Yes,"Qwen3 Report Table 19 (Thinking, Qwen3-1.7B / Qwen3-0.6B)",,,,,,,,50.4,,,,,,,49.0,,,,40.6,,,,,,,,,66.4,,,,,,,,,,
Qwen/Qwen3-0.6B,Non-thinking,Qwen,No,"Qwen3 Report Table 20 (Non-thinking, Qwen3-1.7B / Qwen3-0.6B)",,,,,,,,42.6,,,,,,,22.9,,,,21.8,,,,,,,,,44.6,,,,,,,,,,
Qwen/Qwen3-1.7B,Non-thinking,Qwen,No,"Qwen3 Report Table 20 (Non-thinking, Qwen3-1.7B / Qwen3-0.6B)",,,,,,,,61.0,,,,,,,28.6,,,,35.6,,,,,,,,,64.4,,,,,,,,,,
Google/Gemma-3-1B-IT,Non-thinking,Gemma,Yes,"Qwen3 Report Table 20 (Non-thinking, Qwen3-1.7B / Qwen3-0.6B)",,,,,,,,28.5,,,,,,,19.2,,,,14.4,,,,,,,,,33.3,,,,,,,,,,
Microsoft/Phi-4-mini,Non-thinking,Phi,Yes,"Qwen3 Report Table 20 (Non-thinking, Qwen3-1.7B / Qwen3-0.6B)",,,,,,,,40.0,,,,,,,25.2,,,,25.3,,,,,,,,,67.9,,,,,,,,,,
Qwen/Qwen2.5-1.5B-Instruct,Non-thinking,Qwen,Yes,"Qwen3 Report Table 20 (Non-thinking, Qwen3-1.7B / Qwen3-0.6B)",,,,,,,,53.3,,,,,,,29.8,,,,18.0,,,,,,,,,50.7,,,,,,,,,,
Qwen/Qwen2.5-3B-Instruct,Non-thinking,Qwen,Yes,"Qwen3 Report Table 20 (Non-thinking, Qwen3-1.7B / Qwen3-0.6B)",,,,,,,,68.2,,,,,,,30.3,,,,23.8,,,,,,,,,64.4,,,,,,,,,,
