model_name,thinking_mode,model_family,instruction_tuned,source,AGI-Eval,AGIEval,AI2D,ANLI,ARC-C,ARC-Challenge,ARC-E,ARC-Easy,ASDiv,AlpacaEval-2,ArenaHard,AttaQ,Average,Avg,BBEH,BBH,BLINK,BigBenchHard,Bird-SQL,BoolQ,COCO-caption,ChartQA,CodeGeneration,CommonsenseQA,CountBenchVQA,DROP,DS-1000,DocVQA,DocVQA-test,ECLeKTic,FACTS-Grounding,FactualKnowledge,GMMLU-Lite,GPQA,GPQA-Diamond,GSM-Hard,GSM8K,GSM8K+Py,GSM8K-CoT,GSM8K-PAL,Global-MMLU-Lite,HellaSwag,HiddenMath,HumanEval,HumanEval+,HumanEval-Multilingual,HumanEval-Python,IFEval,IFEval_strict_prompt,InfoVQA,LanguageUnderstanding,LiveCodeBench,MATH,MATH+Py,MATH-Lv5,MAWPS,MBPP,MBPP+,MGSM,MMLU,MMLU-Pro,MMMU,MMMU-Pro,MMMU-val,MTOB-full,MTOB-half,MUSR,Math,MathVista,MedQA,Multilingual,N2C,NQ,OCW,OK-VQA,OpenBookQA,PIQA,PopQA,PopularAgg,ReMI,RealWorldQA,Reasoning,Robustness,SAT,SIQA,SVAMP,SciQ,SimpleQA,SocialIQa,SpatialSenseVQA,TQA,TabMWP,TallyQA,TextVQA,TriviaQA,TruthfulQA,TyDiQA,VQAv2,WMT24++,WinoGrande,Winogrande
Google/Gemma-3-1B,Non-thinking,Gemma,Yes,Gemma3 Report Table 6 (Instruction fine-tuned),,,,,,,,,,,,,,,,,,,6.4,,,,,,,,,,,,36.4,,,,19.2,,,,,,34.2,,15.8,,,,,,,,,1.9,48.0,,,,,,,,14.7,,,,,,,,,,,,,,,,,,,,,,,,,,,2.2,,,,,,,,,,,,,
Google/Gemma-3-4B,Non-thinking,Gemma,Yes,Gemma3 Report Table 6 (Instruction fine-tuned),,,,,,,,,,,,,,,,,,,36.3,,,,,,,,,,,,70.1,,,,30.8,,,,,,54.5,,43.0,,,,,,,,,12.6,75.6,,,,,,,,43.6,,,48.8,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,
Google/Gemma-3-12B,Non-thinking,Gemma,Yes,Gemma3 Report Table 6 (Instruction fine-tuned),,,,,,,,,,,,,,,,,,,47.9,,,,,,,,,,,,75.8,,,,40.9,,,,,,69.5,,54.5,,,,,,,,,24.6,83.8,,,,,,,,60.6,,,59.6,,,,,,,,,,,,,,,,,,,,,,,,6.3,,,,,,,,,,,,,
Google/Gemma-3-27B,Non-thinking,Gemma,Yes,Gemma3 Report Table 6 (Instruction fine-tuned),,,,,,,,,,,,,,,,,,,54.4,,,,,,,,,,,,74.9,,,,42.4,,,,,,75.1,,60.3,,,,,,,,,29.7,89.0,,,,,,,,67.5,,,64.9,,,,,,,,,,,,,,,,,,,,,,,,10.0,,,,,,,,,,,,,
Google/Gemma-3-1B,Non-thinking,Gemma,No,"Gemma3 Report Table 9 (Factuality, reasoning)",,,,,38.4,,73.0,,,,,,,,,28.4,,,,63.2,,,,,,42.4,,,,,,,,,,,,,,,,62.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.48,,,,73.8,,,,,,,,48.9,,,,,,39.8,,,,,,,,,58.2,
Google/Gemma-3-4B,Non-thinking,Gemma,No,"Gemma3 Report Table 9 (Factuality, reasoning)",,,,,56.2,,82.4,,,,,,,,,50.9,,,,72.3,,,,,,60.1,,,,,,,,,,,,,,,,77.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,20.0,,,,79.6,,,,,,,,51.9,,,,,,65.8,,,,,,,,,64.7,
Google/Gemma-3-12B,Non-thinking,Gemma,No,"Gemma3 Report Table 9 (Factuality, reasoning)",,,,,68.9,,88.3,,,,,,,,,72.6,,,,78.8,,,,,,72.2,,,,,,,,,,,,,,,,84.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,31.4,,,,81.8,,,,,,,,53.4,,,,,,78.2,,,,,,,,,74.3,
Google/Gemma-3-27B,Non-thinking,Gemma,No,"Gemma3 Report Table 9 (Factuality, reasoning)",,,,,70.6,,89.0,,,,,,,,,77.7,,,,82.4,,,,,,77.2,,,,,,,,,,,,,,,,85.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,36.1,,,,83.3,,,,,,,,54.9,,,,,,85.5,,,,,,,,,78.8,
Google/Gemma-3-4B,Non-thinking,Gemma,No,Gemma3 Report Table 10 (STEM + code),,42.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15.0,,38.4,,,,,,,36.0,,,,,,,,,24.2,,,,46.0,,,59.6,29.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Google/Gemma-3-12B,Non-thinking,Gemma,No,Gemma3 Report Table 10 (STEM + code),,57.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,25.4,,71.0,,,,,,,45.7,,,,,,,,,43.3,,,,60.4,,,74.5,45.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Google/Gemma-3-27B,Non-thinking,Gemma,No,Gemma3 Report Table 10 (STEM + code),,66.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,24.3,,82.6,,,,,,,48.8,,,,,,,,,50.0,,,,65.6,,,78.6,52.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Google/Gemma-3-4B,Non-thinking,Gemma,No,Gemma3 Report Table 11 (Multimodal),,,63.2,,,,,,,,,,,,,,38.0,,,,102.0,63.6,,,26.1,,,72.8,,,,,,,,,,,,,,,,,,,,,,44.1,,,,,,,,,,,,39.2,,,,,,,,,,,,,51.0,,,,,27.3,45.5,,,,,,,,,50.9,,,42.5,58.9,,,,63.9,,,
Google/Gemma-3-12B,Non-thinking,Gemma,No,Gemma3 Report Table 11 (Multimodal),,,75.2,,,,,,,,,,,,,,35.9,,,,111.0,74.7,,,17.8,,,82.3,,,,,,,,,,,,,,,,,,,,,,54.8,,,,,,,,,,,,50.3,,,,,,,,,,,,,58.7,,,,,38.5,52.2,,,,,,,,,60.0,,,51.8,66.5,,,,71.2,,,
Google/Gemma-3-27B,Non-thinking,Gemma,No,Gemma3 Report Table 11 (Multimodal),,,79.0,,,,,,,,,,,,,,39.6,,,,116.0,76.3,,,68.0,,,85.6,,,,,,,,,,,,,,,,,,,,,,59.4,,,,,,,,,,,,56.1,,,,,,,,,,,,,60.2,,,,,44.8,53.9,,,,,,,,,59.4,,,54.3,68.6,,,,72.9,,,
Google/Gemma-3-1B,Non-thinking,Gemma,Yes,Gemma3 Report Table 18 (Instruction fine-tuned extended),,,,,,,,,,,,,,,7.2,39.1,,,,,,,,,,,,,,1.4,,,34.2,,,,62.8,,,,,,15.0,41.5,,,,,80.2,,,5.0,48.0,,,,35.2,,,38.8,,,,,,,,,,,,56.0,,,,,,,,,,,,,,,,,,,,,,,,,,,35.9,,
Google/Gemma-3-4B,Non-thinking,Gemma,Yes,Gemma3 Report Table 18 (Instruction fine-tuned extended),,,,,,,,,,,,,,,11.0,72.2,,,,,,,,,,,,,,4.6,,,54.5,,,,89.2,,,,,,42.0,71.3,,,,,90.2,,,23.0,75.6,,,,63.2,,,58.1,,,,,,,,,,,,70.3,,,,,,,,,,,,,,,,,,,,,,,,,,,46.8,,
Google/Gemma-3-12B,Non-thinking,Gemma,Yes,Gemma3 Report Table 18 (Instruction fine-tuned extended),,,,,,,,,,,,,,,16.3,85.7,,,,,,,,,,,,,,10.3,,,69.5,,,,94.4,,,,,,51.0,85.4,,,,,88.9,,,32.0,83.8,,,,73.0,,,71.9,,,,,,,,,,,,80.7,,,,,,,,,,,,,,,,,,,,,,,,,,,51.6,,
Google/Gemma-3-27B,Non-thinking,Gemma,Yes,Gemma3 Report Table 18 (Instruction fine-tuned extended),,,,,,,,,,,,,,,19.3,87.6,,,,,,,,,,,,,,16.7,,,75.1,,,,95.9,,,,,,56.0,87.8,,,,,90.4,,,39.0,89.0,,,,74.4,,,76.9,,,,,,,,,,,,84.5,,,,,,,,,,,,,,,,,,,,,,,,,,,53.4,,
Meta/LLaMA-3.1-70B,Non-thinking,LLaMA,No,LLaMA-4 Report (Pre-trained models),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,41.6,,,,66.4,,,79.3,53.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29.9,,,,
Meta/LLaMA-3.1-405B,Non-thinking,LLaMA,No,LLaMA-4 Report (Pre-trained models),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,53.5,,,,74.4,,,85.2,61.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,34.3,,,,
Meta/LLaMA-4-Scout,Non-thinking,LLaMA,No,LLaMA-4 Report (Pre-trained models),,,,,,,,,,,,,,,,,,,,,,83.4,,,,,,89.4,,,,,,,,,,,,,,,,,,,,,,,,,50.3,,,,67.8,,,79.6,58.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,31.5,,,,
Meta/LLaMA-4-Maverick,Non-thinking,LLaMA,No,LLaMA-4 Report (Pre-trained models),,,,,,,,,,,,,,,,,,,,,,85.3,,,,,,91.6,,,,,,,,,,,,,,,,,,,,,,,,,61.2,,,,77.6,,,85.5,62.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,31.7,,,,
Meta/LLaMA-3.3-70B,Non-thinking,LLaMA,Yes,LLaMA-4 Report (Instruction tuned models),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,50.5,,,,,,,,,,,,,,,,,33.3,,,,,,,91.1,,68.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Meta/LLaMA-3.1-405B,Non-thinking,LLaMA,Yes,LLaMA-4 Report (Instruction tuned models),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,49.0,,,,,,,,,,,,,,,,,27.7,,,,,,,91.6,,73.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Meta/LLaMA-4-Scout,Non-thinking,LLaMA,Yes,LLaMA-4 Report (Instruction tuned models),,,,,,,,,,,,,,,,,,,,,,88.8,,,,,,,94.4,,,,,,57.2,,,,,,,,,,,,,,,,,32.8,,,,,,,90.6,,74.3,69.4,52.2,,,,,,70.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Meta/LLaMA-4-Maverick,Non-thinking,LLaMA,Yes,LLaMA-4 Report (Instruction tuned models),,,,,,,,,,,,,,,,,,,,,,90.0,,,,,,,94.4,,,,,,69.8,,,,,,,,,,,,,,,,,43.4,,,,,,,92.3,,80.5,73.4,59.6,,,,,,73.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
DeepSeek/DeepSeek-Coder-Base-1.3B,Non-thinking,DeepSeek,No,DeepSeek-Coder Report Table 8 (Math reasoning base models),,,,,,,,,48.2,,,,,31.9,,,,,,,,,,,,,,,,,,,,,,14.5,14.6,,,,,,,,,,,,,,,,16.8,,,62.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,36.7,,,,,,30.0,,,,,,,,,
DeepSeek/DeepSeek-Coder-Base-6.7B,Non-thinking,DeepSeek,No,DeepSeek-Coder Report Table 8 (Math reasoning base models),,,,,,,,,67.2,,,,,54.7,,,,,,,,,,,,,,,,,,,,,,40.3,43.2,,,,,,,,,,,,,,,,19.2,,,87.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,58.4,,,,,,67.9,,,,,,,,,
DeepSeek/DeepSeek-Coder-Base-33B,Non-thinking,DeepSeek,No,DeepSeek-Coder Report Table 8 (Math reasoning base models),,,,,,,,,76.7,,,,,65.8,,,,,,,,,,,,,,,,,,,,,,54.1,60.7,,,,,,,,,,,,,,,,29.1,,,93.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,71.6,,,,,,75.3,,,,,,,,,
DeepSeek/DeepSeek-Coder-Base-1B,Non-thinking,DeepSeek,No,DeepSeek-Coder Report (Coding performance),,,,,,,,,,,,,,,,,,,,,,,,,,,16.2,,,,,,,,,,,,,,,,,,,28.3,34.8,,,,,,,,,,46.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
DeepSeek/DeepSeek-Coder-Base-7B,Non-thinking,DeepSeek,No,DeepSeek-Coder Report (Coding performance),,,,,,,,,,,,,,,,,,,,,,,,,,,30.5,,,,,,,,,,,,,,,,,,,44.7,49.4,,,,,,,,,,60.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
DeepSeek/DeepSeek-Coder-Base-33B,Non-thinking,DeepSeek,No,DeepSeek-Coder Report (Coding performance),,,,,,,,,,,,,,,,,,,,,,,,,,,40.2,,,,,,,,,,,,,,,,,,,50.3,56.1,,,,,,,,,,66.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
DeepSeek/DeepSeek-Coder-Instruct-7B,Non-thinking,DeepSeek,Yes,DeepSeek-Coder Report (Coding performance),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,66.1,78.6,,,,,,,,,,65.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
DeepSeek/DeepSeek-Coder-Instruct-33B,Non-thinking,DeepSeek,Yes,DeepSeek-Coder Report (Coding performance),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,69.2,79.3,,,,,,,,,,70.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
IBM/Granite-3.3-8B-Instruct,Non-thinking,Granite,Yes,Granite Report (Instruction Following & General Benchmarks),,,,,,,,,,61.16,55.23,85.99,,,,,,65.6,,,,,,,,50.73,,,,,,,,,,,83.09,,,,,,,89.47,86.88,,,73.57,,,,,,,,,,,,66.93,,,,,,,,,,,,,,,,,,28.08,,,,,,,,,,,,,,,,,,66.37,,,,,
IBM/Granite-8B-Code-Base,Non-thinking,Granite,No,Granite Report Table 15 (Chain-of-thought math tasks),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,61.9,63.1,,,,,,,,,,,,,,,21.4,35.4,,,,,,,,,,,,,,,,,,,,8.8,,,,,,,,,,62.5,,,,,,,,,,,,,,,,,
IBM/Granite-3.1-8B-Instruct,Non-thinking,Granite,Yes,Granite Report HuggingFace Leaderboard V1,,,,,,62.62,,,,,,,,71.31,,,,,,,,,,,,,,,,,,,,,,,73.84,,,,,84.48,,,,,,,,,,,,,,,,,,65.34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,66.23,,,,,75.37
IBM/Granite-3.1-2B-Instruct,Non-thinking,Granite,Yes,Granite Report HuggingFace Leaderboard V1,,,,,,54.61,,,,,,,,60.79,,,,,,,,,,,,,,,,,,,,,,,52.76,,,,,75.14,,,,,,,,,,,,,,,,,,55.31,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,59.42,,,,,67.48
IBM/Granite-3.1-3B-A800M-Instruct,Non-thinking,Granite,Yes,Granite Report HuggingFace Leaderboard V1,,,,,,50.42,,,,,,,,56.53,,,,,,,,,,,,,,,,,,,,,,,48.97,,,,,73.01,,,,,,,,,,,,,,,,,,52.19,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,49.71,,,,,64.87
IBM/Granite-3.1-1B-A400M-Instruct,Non-thinking,Granite,Yes,Granite Report HuggingFace Leaderboard V1,,,,,,42.66,,,,,,,,46.29,,,,,,,,,,,,,,,,,,,,,,,33.88,,,,,65.97,,,,,,,,,,,,,,,,,,26.13,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,46.77,,,,,62.35
IBM/Granite-3.1-8B-Instruct,Non-thinking,Granite,Yes,Granite Report HuggingFace Leaderboard V2,,,,,,,,,,,,,,30.55,,34.09,,,,,,,,,,,,,,,,,,8.28,,,,,,,,,,,,,,72.08,,,,,,,21.68,,,,,,28.19,,,,,,19.01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
IBM/Granite-3.1-2B-Instruct,Non-thinking,Granite,Yes,Granite Report HuggingFace Leaderboard V2,,,,,,,,,,,,,,21.06,,21.82,,,,,,,,,,,,,,,,,,5.26,,,,,,,,,,,,,,62.86,,,,,,,11.33,,,,,,20.21,,,,,,4.87,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
IBM/Granite-3.1-3B-A800M-Instruct,Non-thinking,Granite,Yes,Granite Report HuggingFace Leaderboard V2,,,,,,,,,,,,,,17.1,,16.69,,,,,,,,,,,,,,,,,,5.15,,,,,,,,,,,,,,55.16,,,,,,,10.35,,,,,,12.75,,,,,,2.51,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
IBM/Granite-3.1-1B-A400M-Instruct,Non-thinking,Granite,Yes,Granite Report HuggingFace Leaderboard V2,,,,,,,,,,,,,,10.05,,6.18,,,,,,,,,,,,,,,,,,0.78,,,,,,,,,,,,,,46.86,,,,,,,4.08,,,,,,2.41,,,,,,0.78,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Microsoft/Phi-3-Mini-128K-Instruct,Non-thinking,Phi,Yes,Phi Report (Benchmarks Small/Medium Models),39.5,,,52.3,,85.5,,,,,,,66.4,,,,,72.1,,77.1,,,,,,,,,,,,,,29.7,,,,,85.3,,,70.5,,60.4,,,,,,,,,,,,,70.0,,,69.7,,,,,,,,,,56.4,,,,,,78.8,80.1,,,,,,,,,,,,74.7,,,,,,57.8,64.8,,,,71.0,
Microsoft/Phi-3-Medium-128K-Instruct-14B,Non-thinking,Phi,Yes,Phi Report (Benchmarks Small/Medium Models),49.7,,,57.3,,91.0,,97.6,,,,,77.3,,,,,77.9,,86.5,,,,82.2,,,,,,,,,,,,,,,87.5,,,81.6,,58.5,,,,,,,,,,,,,73.8,,,76.6,,,,,,,,,,67.6,,,,,,87.2,87.8,,,,,,,,,,,,79.0,,,,,,73.9,74.3,,,,78.9,
Microsoft/Phi-4-14B,Non-thinking,Phi,No,Phi Report (Benchmarks Small/Medium Models),,,,,,,,,,,,,,,,,,,,,,,,,,75.5,,,,,,,,56.1,,,,,,,,,,82.6,,,,,,,,,80.4,,,,,,80.6,84.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,,,
Microsoft/Phi-3-Mini-128K-Instruct,Non-thinking,Phi,Yes,Phi Report (Aggregated Categories Small Models),,,,,,,,,,,,,,,,,,,,,,,61.0,,,,,,,,,35.8,,,,,,,,,,,,,,,,,,,57.5,,,,,,,,,,,,,,,,,51.6,,,56.4,,,,,,,,60.6,,,69.4,61.1,,,,,,,,,,,,,,,,,,
Microsoft/Phi-3-Medium-128K-Instruct-14B,Non-thinking,Phi,Yes,Phi Report (Medium/Large Instruction Models),49.7,,,57.3,,91.0,,97.6,,,,,77.3,,,,,77.9,,86.5,,,,82.2,,,,,,,,,,,,,,,87.5,,,81.6,,58.5,,,,,,,,,,,,,73.8,,,76.6,,,,,,,,,,67.6,,,,,,87.2,87.8,,,,,,,,,,,,79.0,,,,,,73.9,74.3,,,,78.9,
Microsoft/Phi-4-14B,Non-thinking,Phi,No,Phi Report (Medium/Large Instruction Models),,,,,,,,,,,,,,,,,,,,,,,,,,75.5,,,,,,,,56.1,,,,,,,,,,82.6,,,,,,,,,80.4,,,,,,80.6,84.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,,,
BigCode/StarCoderBase-3B,Non-thinking,StarCoder,No,"StarCoder Report Table 9 (HumanEval, MBPP)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,21.3,17.1,,,,,,,,,,,,42.6,35.8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BigCode/StarCoder2-3B,Non-thinking,StarCoder,No,"StarCoder Report Table 9 (HumanEval, MBPP)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,31.7,27.4,,,,,,,,,,,,57.4,47.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BigCode/StarCoderBase-7B,Non-thinking,StarCoder,No,"StarCoder Report Table 9 (HumanEval, MBPP)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30.5,25.0,,,,,,,,,,,,47.4,39.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BigCode/StarCoder2-7B,Non-thinking,StarCoder,No,"StarCoder Report Table 9 (HumanEval, MBPP)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,35.4,29.5,,,,,,,,,,,,54.4,53.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BigCode/StarCoderBase-15B,Non-thinking,StarCoder,No,"StarCoder Report Table 9 (HumanEval, MBPP)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29.3,25.6,,,,,,,,,,,,50.6,43.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BigCode/StarCoder2-15B,Non-thinking,StarCoder,No,"StarCoder Report Table 9 (HumanEval, MBPP)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,46.3,37.8,,,,,,,,,,,,66.2,53.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BigCode/StarCoderBase-3B,Non-thinking,StarCoder,No,StarCoder Report Table 14 (GSM8K PAL),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BigCode/StarCoder2-3B,Non-thinking,StarCoder,No,StarCoder Report Table 14 (GSM8K PAL),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BigCode/StarCoderBase-7B,Non-thinking,StarCoder,No,StarCoder Report Table 14 (GSM8K PAL),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BigCode/StarCoder2-7B,Non-thinking,StarCoder,No,StarCoder Report Table 14 (GSM8K PAL),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,40.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BigCode/StarCoderBase-15B,Non-thinking,StarCoder,No,StarCoder Report Table 14 (GSM8K PAL),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,21.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BigCode/StarCoder2-15B,Non-thinking,StarCoder,No,StarCoder Report Table 14 (GSM8K PAL),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,65.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Falcon-3-10B,Non-thinking,Falcon,No,Falcon Report (Base models),,,,,,,,,,,,,,27.59,,41.38,,,,,,,,,,,,,,,,,,12.75,,,,,,,,,,,,,,36.48,,,,,24.77,,,,,,,,36.0,,,,,,14.17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Falcon-3-7B,Non-thinking,Falcon,No,Falcon Report (Base models),,,,,,,,,,,,,,24.72,,31.56,,,,,,,,,,,,,,,,,,12.86,,,,,,,,,,,,,,34.16,,,,,19.26,,,,,,,,32.34,,,,,,18.14,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Falcon-3-10B-Instruct,Non-thinking,Falcon,Yes,Falcon Report (Instruct models),,,,,,,,,,,,,,35.19,,44.82,,,,,,,,,,,,,,,,,,10.51,,,,,,,,,,,,,,78.17,,,,,25.91,,,,,,,,38.1,,,,,,13.61,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Falcon-3-7B-Instruct,Non-thinking,Falcon,Yes,Falcon Report (Instruct models),,,,,,,,,,,,,,34.91,,37.92,,,,,,,,,,,,,,,,,,8.05,,,,,,,,,,,,,,76.12,,,,,31.87,,,,,,,,34.3,,,,,,21.17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Falcon-3-1B-Instruct,Non-thinking,Falcon,Yes,Falcon Report (Falcon-3-1B/3B Instruct),,,,,,45.9,,,,,,,,,,39.0,,,,,,,,,,,,,,,,,,26.5,,,38.6,,,,,,,,,,,54.4,,,,,1.0,,,,,,,43.9,18.6,,,,,,35.1,,,,,,,,,40.0,72.0,,,,,,,,,,86.8,,,,,,,,,,,,,60.2,
Falcon-3-3B-Instruct,Non-thinking,Falcon,Yes,Falcon Report (Falcon-3-1B/3B Instruct),,,,,,58.5,,,,,,,,,,45.4,,,,,,,,,,,,,,,,,,29.6,,,71.9,,,,,,,,,,,68.3,,,,,19.9,,,,,,,55.7,29.7,,,,,,40.2,,,,,,,,,42.2,74.4,,,,,,,,,,95.6,,,,,,,,,,,,,65.0,
Falcon-3-7B-Instruct,Non-thinking,Falcon,Yes,Falcon Report (Falcon-3-7B/10B Instruct detailed),,,,,,65.9,,,,,,,,,,52.4,,,,,,,,,,,,,,,,,,32.0,,,79.1,,,,,,,,,,,76.5,,,,,29.4,,,,,,,68.0,40.7,,,,,,46.4,,,,,,,,,45.8,78.8,,,,,,,,,,94.7,,,,,,,,,,,,,70.4,
Falcon-3-10B-Instruct,Non-thinking,Falcon,Yes,Falcon Report (Falcon-3-7B/10B Instruct detailed),,,,,,64.5,,,,,,,,,,58.4,,,,,,,,,,,,,,,,,,33.5,,,83.1,,,,,,,,,,,78.0,,,,,22.1,,,,,,,71.6,44.0,,,,,,41.1,,,,,,,,,48.2,78.4,,,,,,,,,,90.4,,,,,,,,,,,,,71.0,
