row_id,rank,method,method_short,open_weights,method_details,evaluation_date,metrics,raw_metrics,uses_additional_data,paper,tags,mmlu_accuracy
,0,OpenAI o1,OpenAI o1,FALSE,,9/12/24,{'Average (%)': '92.3'},{'Average (%)': 92.3},FALSE,https://openai.com/index/learning-to-reason-with-llms/,[],92.3
113523,1,Gemini Ultra ~1760B,Gemini Ultra ~1760B,FALSE,,12/19/23,{'Average (%)': '90'},{'Average (%)': 90.0},FALSE,"{'id': 1346190, 'title': 'Gemini: A Family of Highly Capable Multimodal Models', 'url': '/paper/gemini-a-family-of-highly-capable-multimodal-1', 'published': '2023-12-19T00:00:00.000000', 'code': False, 'review_url': None}","[{'id': 567, 'name': 'self-consistency', 'color': '#c4a000'}, {'id': 318, 'name': 'chain-of-thought', 'color': '#eea320'}]",90
122102,2,GPT-4o,GPT-4o,FALSE,,5/17/23,{'Average (%)': '88.7'},{'Average (%)': 88.7},FALSE,"{'id': 1174373, 'title': 'GPT-4 Technical Report', 'url': '/paper/GPT-4-technical-report-1', 'published': '2023-03-15T00:00:00.000000', 'code': True, 'review_url': None}",[],88.7
127848,3,Llama 3.1 405B (CoT),Llama 3.1 405B,TRUE,CoT,7/31/24,{'Average (%)': '88.6'},{'Average (%)': 88.6},FALSE,"{'id': 1493137, 'title': 'The Llama 3 Herd of Models', 'url': '/paper/the-Llama-3-herd-of-models', 'published': '2024-07-31T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 318, 'name': 'chain-of-thought', 'color': '#eea320'}]",88.6
99170,4,"Claude 3 Opus (5-shot, CoT)",Claude 3 Opus,FALSE,"5-shot, CoT",3/4/24,{'Average (%)': '88.2'},{'Average (%)': 88.2},FALSE,"{'id': 1391602, 'title': 'The Claude 3 Model Family: Opus, Sonnet, Haiku', 'url': '/paper/the-claude-3-model-family-opus-sonnet-haiku', 'published': '2024-03-04T00:00:00.000000', 'code': False, 'review_url': None}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",88.2
118652,5,Claude 3 Opus (5-shot),Claude 3 Opus,FALSE,5-shot,3/4/24,{'Average (%)': '86.8'},{'Average (%)': 86.8},FALSE,"{'id': 1391602, 'title': 'The Claude 3 Model Family: Opus, Sonnet, Haiku', 'url': '/paper/the-claude-3-model-family-opus-sonnet-haiku', 'published': '2024-03-04T00:00:00.000000', 'code': False, 'review_url': None}",[],86.8
121578,6,Leeroo (5-shot),Leeroo,FALSE,5-shot,1/25/24,{'Average (%)': '86.64'},{'Average (%)': 86.64},FALSE,"{'id': 1366435, 'title': 'Routoo: Learning to Route to Large Language Models Effectively', 'url': '/paper/leeroo-orchestrator-elevating-llms', 'published': '2024-01-25T00:00:00.000000', 'code': True, 'review_url': '/paper/leeroo-orchestrator-elevating-llms/review/?hl=121578'}",[],86.64
114904,7,GPT-4 (few-shot),GPT-4,FALSE,few-shot,3/15/23,{'Average (%)': '86.4'},{'Average (%)': 86.4},FALSE,"{'id': 1174373, 'title': 'GPT-4 Technical Report', 'url': '/paper/GPT-4-technical-report-1', 'published': '2023-03-15T00:00:00.000000', 'code': True, 'review_url': '/paper/GPT-4-technical-report-1/review/?hl=114904'}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",86.4
127849,8,Llama 3.1 70B (CoT),Llama 3.1 70B,TRUE,CoT,7/31/24,{'Average (%)': '86.0'},{'Average (%)': 86.0},FALSE,"{'id': 1493137, 'title': 'The Llama 3 Herd of Models', 'url': '/paper/the-Llama-3-herd-of-models', 'published': '2024-07-31T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 318, 'name': 'chain-of-thought', 'color': '#eea320'}]",86
113522,9,Gemini Ultra (5-shot),Gemini Ultra,FALSE,5-shot,,{'Average (%)': '83.7'},{'Average (%)': 83.7},FALSE,"{'id': None, 'title': None, 'url': None, 'published': None, 'code': False, 'review_url': None}","[{'id': 238, 'name': '5-shot', 'color': '#d08216'}]",83.7
123748,10,GaC(Qwen2-72B-Instruct + Llama-3-70B-Instruct),GaC,TRUE,Qwen2-72B-Instruct + Llama-3-70B-Instruct,6/18/24,{'Average (%)': '83.54'},{'Average (%)': 83.54},FALSE,"{'id': 1463808, 'title': 'Breaking the Ceiling of the LLM Community by Treating Token Generation as a Classification for Ensembling', 'url': '/paper/breaking-the-ceiling-of-the-llm-community-by', 'published': '2024-06-18T00:00:00.000000', 'code': True, 'review_url': '/paper/breaking-the-ceiling-of-the-llm-community-by/review/?hl=123748'}","[{'id': 284, 'name': 'Ensemble', 'color': '#77bb41'}]",83.54
120070,11,"Claude 3 Sonnet (5-shot, CoT)",Claude 3 Sonnet,FALSE,"5-shot, CoT",3/4/24,{'Average (%)': '81.5'},{'Average (%)': 81.5},FALSE,"{'id': 1391602, 'title': 'The Claude 3 Model Family: Opus, Sonnet, Haiku', 'url': '/paper/the-claude-3-model-family-opus-sonnet-haiku', 'published': '2024-03-04T00:00:00.000000', 'code': False, 'review_url': None}",[],81.5
118934,12,Flan-PaLM 2-L,Flan-PaLM 2-L,FALSE,,5/17/23,{'Average (%)': '81.2'},{'Average (%)': 81.2},FALSE,"{'id': 1210556, 'title': 'PaLM 2 Technical Report', 'url': '/paper/palm-2-technical-report-1', 'published': '2023-05-17T00:00:00.000000', 'code': True, 'review_url': '/paper/palm-2-technical-report-1/review/?hl=118934'}",[],81.2
113521,13,Gemini Pro (CoT@8),Gemini Pro,FALSE,CoT@8,,{'Average (%)': '79.1'},{'Average (%)': 79.1},FALSE,"{'id': None, 'title': None, 'url': None, 'published': None, 'code': False, 'review_url': None}","[{'id': 318, 'name': 'chain-of-thought', 'color': '#eea320'}]",79.1
120069,14,Claude 3 Sonnet (5-shot),Claude 3 Sonnet,FALSE,5-shot,3/4/24,{'Average (%)': '79'},{'Average (%)': 79.0},FALSE,"{'id': 1391602, 'title': 'The Claude 3 Model Family: Opus, Sonnet, Haiku', 'url': '/paper/the-claude-3-model-family-opus-sonnet-haiku', 'published': '2024-03-04T00:00:00.000000', 'code': False, 'review_url': None}",[],79
117489,15,Claude 2 (5-shot),Claude 2,FALSE,5-shot,7/11/23,{'Average (%)': '78.5'},{'Average (%)': 78.5},FALSE,"{'id': 1300010, 'title': 'Model Card and Evaluations for Claude Models', 'url': '/paper/model-card-and-evaluations-for-claude-models', 'published': '2023-07-11T00:00:00.000000', 'code': False, 'review_url': None}","[{'id': 238, 'name': '5-shot', 'color': '#d08216'}, {'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",78.5
118932,16,PaLM 2-L (5-shot),PaLM 2-L,FALSE,5-shot,5/17/23,{'Average (%)': '78.3'},{'Average (%)': 78.3},FALSE,"{'id': 1210556, 'title': 'PaLM 2 Technical Report', 'url': '/paper/palm-2-technical-report-1', 'published': '2023-05-17T00:00:00.000000', 'code': True, 'review_url': '/paper/palm-2-technical-report-1/review/?hl=118932'}","[{'id': 238, 'name': '5-shot', 'color': '#d08216'}]",78.3
119156,17,Qwen1.5 72B (5-shot),Qwen1.5 72B,TRUE,5-shot,2/4/24,{'Average (%)': '77.5'},{'Average (%)': 77.5},FALSE,"{'id': None, 'title': None, 'url': None, 'published': None, 'code': False, 'review_url': None}",[],77.5
117488,18,Claude 1.3 (5-shot),Claude 1.3,FALSE,5-shot,4/18/23,{'Average (%)': '77'},{'Average (%)': 77.0},FALSE,"{'id': 1300010, 'title': 'Model Card and Evaluations for Claude Models', 'url': '/paper/model-card-and-evaluations-for-claude-models', 'published': '2023-07-11T00:00:00.000000', 'code': False, 'review_url': None}","[{'id': 238, 'name': '5-shot', 'color': '#d08216'}, {'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",77
120072,19,"Claude 3 Haiku (5-shot, CoT)",Claude 3 Haiku,FALSE,"5-shot, CoT",3/4/24,{'Average (%)': '76.7'},{'Average (%)': 76.7},FALSE,"{'id': 1391602, 'title': 'The Claude 3 Model Family: Opus, Sonnet, Haiku', 'url': '/paper/the-claude-3-model-family-opus-sonnet-haiku', 'published': '2024-03-04T00:00:00.000000', 'code': False, 'review_url': None}",[],76.7
115961,20,Leeroo (5-shot),Leeroo,FALSE,5-shot,1/25/24,{'Average (%)': '75.9'},{'Average (%)': 75.9},FALSE,"{'id': 1366435, 'title': 'Routoo: Learning to Route to Large Language Models Effectively', 'url': '/paper/leeroo-orchestrator-elevating-llms', 'published': '2024-01-25T00:00:00.000000', 'code': True, 'review_url': '/paper/leeroo-orchestrator-elevating-llms/review/?hl=115961'}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",75.9
119092,21,Camelidae-8×34B (5-shot),Camelidae-8×34B,TRUE,5-shot,1/5/24,{'Average (%)': '75.6'},{'Average (%)': 75.6},FALSE,"{'id': 1355944, 'title': 'Parameter-Efficient Sparsity Crafting from Dense to Mixture-of-Experts for Instruction Tuning on General Tasks', 'url': '/paper/parameter-efficient-sparsity-crafting-from', 'published': '2024-01-05T00:00:00.000000', 'code': True, 'review_url': '/paper/parameter-efficient-sparsity-crafting-from/review/?hl=119092'}","[{'id': 238, 'name': '5-shot', 'color': '#d08216'}]",75.6
120071,22,Claude 3 Haiku (5-shot),Claude 3 Haiku,FALSE,5-shot,3/4/24,{'Average (%)': '75.2'},{'Average (%)': 75.2},FALSE,"{'id': 1391602, 'title': 'The Claude 3 Model Family: Opus, Sonnet, Haiku', 'url': '/paper/the-claude-3-model-family-opus-sonnet-haiku', 'published': '2024-03-04T00:00:00.000000', 'code': False, 'review_url': None}",[],75.2
73150,23,Flan-U-PaLM 540B,Flan-U-PaLM 540B,FALSE,,10/20/22,{'Average (%)': '74.1'},{'Average (%)': 74.1},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73150'}","[{'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",74.1
119889,24,DBRX Instruct 132B (5-shot),DBRX Instruct 132B,TRUE,5-shot,7/31/24,{'Average (%)': '73.7'},{'Average (%)': 73.7},FALSE,"{'id': 1493137, 'title': 'The Llama 3 Herd of Models', 'url': '/paper/the-Llama-3-herd-of-models', 'published': '2024-07-31T00:00:00.000000', 'code': True, 'review_url': None}",[],73.7
73148,25,Flan-PaLM 540B,Flan-PaLM 540B,FALSE,,10/20/22,{'Average (%)': '73.5'},{'Average (%)': 73.5},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73148'}","[{'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",73.5
107031,26,Claude Instant 1.1 (5-shot),Claude Instant 1.1,FALSE,5-shot,7/11/23,{'Average (%)': '73.4'},{'Average (%)': 73.4},FALSE,"{'id': 1300010, 'title': 'Model Card and Evaluations for Claude Models', 'url': '/paper/model-card-and-evaluations-for-claude-models', 'published': '2023-07-11T00:00:00.000000', 'code': False, 'review_url': None}","[{'id': 238, 'name': '5-shot', 'color': '#d08216'}, {'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",73.4
127850,27,Llama 3.1 8B (CoT),Llama 3.1 8B,TRUE,CoT,7/31/24,{'Average (%)': '73.0'},{'Average (%)': 73.0},FALSE,"{'id': 1493137, 'title': 'The Llama 3 Herd of Models', 'url': '/paper/the-Llama-3-herd-of-models', 'published': '2024-07-31T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 318, 'name': 'chain-of-thought', 'color': '#eea320'}]",73
72866,28,"Flan-PaLM (5-shot, finetuned)",Flan-PaLM,FALSE,"5-shot, finetuned",7/31/24,{'Average (%)': '72.2'},{'Average (%)': 72.2},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=72866'}","[{'id': 238, 'name': '5-shot', 'color': '#d08216'}, {'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",72.2
96120,29,code-davinci-002 175B + REPLUG LSR (5-shot),code-davinci-002 175B + REPLUG LSR,FALSE,5-shot,1/30/23,{'Average (%)': '71.8'},{'Average (%)': 71.8},FALSE,"{'id': 1149330, 'title': 'REPLUG: Retrieval-Augmented Black-Box Language Models', 'url': '/paper/replug-retrieval-augmented-black-box-language', 'published': '2023-01-30T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 238, 'name': '5-shot', 'color': '#d08216'}, {'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",71.8
113520,30,Gemini Pro (5-shot),Gemini Pro,FALSE,5-shot,,{'Average (%)': '71.8'},{'Average (%)': 71.8},FALSE,"{'id': None, 'title': None, 'url': None, 'published': None, 'code': False, 'review_url': None}","[{'id': 238, 'name': '5-shot', 'color': '#d08216'}, {'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",71.8
73159,31,Flan-PaLM 540B (CoT),Flan-PaLM 540B,FALSE,CoT,10/20/22,{'Average (%)': '70.9'},{'Average (%)': 70.9},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73159'}","[{'id': 318, 'name': 'chain-of-thought', 'color': '#eea320'}, {'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",70.9
72741,32,U-PaLM 540B (5-shot),U-PaLM 540B,FALSE,5-shot,10/20/22,{'Average (%)': '70.7'},{'Average (%)': 70.7},FALSE,"{'id': 1097009, 'title': 'Transcending Scaling Laws with 0.1% Extra Compute', 'url': '/paper/transcending-scaling-laws-with-0-1-extra', 'published': '2022-10-20T00:00:00.000000', 'code': False, 'review_url': '/paper/transcending-scaling-laws-with-0-1-extra/review/?hl=72741'}","[{'id': 238, 'name': '5-shot', 'color': '#d08216'}, {'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",70.7
114901,33,Falcon 180B (5-shot),Falcon 180B,TRUE,5-shot,11/28/23,{'Average (%)': '70.6'},{'Average (%)': 70.6},FALSE,"{'id': 1329402, 'title': 'The Falcon Series of Open Language Models', 'url': '/paper/the-falcon-series-of-open-language-models', 'published': '2023-11-28T00:00:00.000000', 'code': False, 'review_url': '/paper/the-falcon-series-of-open-language-models/review/?hl=114901'}","[{'id': 238, 'name': '5-shot', 'color': '#d08216'}, {'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",70.6
118948,34,Mixtral 8x7B (5-shot),Mixtral 8x7B,TRUE,5-shot,1/8/24,{'Average (%)': '70.6'},{'Average (%)': 70.6},FALSE,"{'id': 1356260, 'title': 'Mixtral of Experts', 'url': '/paper/mixtral-of-experts', 'published': '2024-01-08T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 238, 'name': '5-shot', 'color': '#d08216'}]",70.6
73151,35,"Flan-PaLM (5-shot, finetuned, CoT)",Flan-PaLM,FALSE,"5-shot, finetuned, CoT",10/20/22,{'Average (%)': '70.2'},{'Average (%)': 70.2},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73151'}","[{'id': 318, 'name': 'chain-of-thought', 'color': '#eea320'}, {'id': 238, 'name': '5-shot', 'color': '#d08216'}, {'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",70.2
114905,36,GPT-3.5 Turbo,GPT-3.5 Turbo,FALSE,,3/15/23,{'Average (%)': '70.0'},{'Average (%)': 70.0},FALSE,"{'id': 1174373, 'title': 'GPT-4 Technical Report', 'url': '/paper/GPT-4-technical-report-1', 'published': '2023-03-15T00:00:00.000000', 'code': True, 'review_url': '/paper/GPT-4-technical-report-1/review/?hl=114905'}",[],70
73161,37,Flan-U-PaLM 540B (CoT),Flan-U-PaLM 540B,FALSE,CoT,10/20/22,{'Average (%)': '69.8'},{'Average (%)': 69.8},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73161'}","[{'id': 318, 'name': 'chain-of-thought', 'color': '#eea320'}, {'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",69.8
58550,38,PaLM,PaLM,FALSE,,4/5/22,{'Average (%)': '69.3'},{'Average (%)': 69.3},FALSE,"{'id': 989558, 'title': 'PaLM: Scaling Language Modeling with Pathways', 'url': '/paper/palm-scaling-language-modeling-with-pathways-1', 'published': '2022-04-05T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",69.3
105221,39,AiLMe (5-shot),AiLMe,FALSE,5-shot,6/23/23,{'Average (%)': '69.0'},{'Average (%)': 69.0},FALSE,"{'id': None, 'title': None, 'url': None, 'published': None, 'code': False, 'review_url': None}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",69
97685,40,Llama 65B (fine-tuned),Llama 65B,TRUE,fine-tuned,2/27/23,{'Average (%)': '68.9'},{'Average (%)': 68.9},TRUE,"{'id': 1164350, 'title': 'Llama: Open and Efficient Foundation Language Models', 'url': '/paper/Llama-open-and-efficient-foundation-language-1', 'published': '2023-02-27T00:00:00.000000', 'code': True, 'review_url': '/paper/Llama-open-and-efficient-foundation-language-1/review/?hl=97685'}","[{'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",68.9
96121,41,code-davinci-002 175B (5-shot),code-davinci-002 175B,FALSE,5-shot,1/30/23,{'Average (%)': '68.3'},{'Average (%)': 68.3},FALSE,"{'id': 1149330, 'title': 'REPLUG: Retrieval-Augmented Black-Box Language Models', 'url': '/paper/replug-retrieval-augmented-black-box-language', 'published': '2023-01-30T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",68.3
118813,42,code-davinci-002 175B (5-shot),code-davinci-002 175B,FALSE,5-shot,10/20/22,{'Average (%)': '68.2'},{'Average (%)': 68.2},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=118813'}",[],68.2
50746,43,Chinchilla 70B (5-shot),Chinchilla 70B,FALSE,5-shot,3/29/22,{'Average (%)': '67.5'},{'Average (%)': 67.5},FALSE,"{'id': 985465, 'title': 'Training Compute-Optimal Large Language Models', 'url': '/paper/training-compute-optimal-large-language', 'published': '2022-03-29T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",67.5
119093,44,Qwen2idae-16x14B (5-shot),Qwen2idae-16x14B,TRUE,5-shot,3/12/24,{'Average (%)': '66.7'},{'Average (%)': 66.7},FALSE,"{'id': 1355944, 'title': 'Parameter-Efficient Sparsity Crafting from Dense to Mixture-of-Experts for Instruction Tuning on General Tasks', 'url': '/paper/parameter-efficient-sparsity-crafting-from', 'published': '2024-01-05T00:00:00.000000', 'code': True, 'review_url': '/paper/parameter-efficient-sparsity-crafting-from/review/?hl=119093'}",[],66.7
73149,45,Flan-cont-PaLM 62B,Flan-cont-PaLM 62B,FALSE,,10/20/22,{'Average (%)': '66.1'},{'Average (%)': 66.1},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73149'}",[],66.1
118811,46,text-davinci-003 175B (5-shot),text-davinci-003 175B,FALSE,5-shot,10/20/22,{'Average (%)': '64.8'},{'Average (%)': 64.8},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=118811'}",[],64.8
118812,47,text-davinci-003 175B (CoT),text-davinci-003 175B,FALSE,CoT,10/20/22,{'Average (%)': '64.6'},{'Average (%)': 64.6},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=118812'}",[],64.6
118820,48,code-davinci-002 175B (CoT),code-davinci-002 175B,FALSE,CoT,10/20/22,{'Average (%)': '64.5'},{'Average (%)': 64.5},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=118820'}",[],64.5
97684,49,Llama 65B (5-shot),Llama 65B,TRUE,5-shot,2/27/23,{'Average (%)': '63.4'},{'Average (%)': 63.4},TRUE,"{'id': 1164350, 'title': 'Llama: Open and Efficient Foundation Language Models', 'url': '/paper/Llama-open-and-efficient-foundation-language-1', 'published': '2023-02-27T00:00:00.000000', 'code': True, 'review_url': '/paper/Llama-open-and-efficient-foundation-language-1/review/?hl=97684'}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",63.4
118809,50,text-davinci-002 175B (5-shot),text-davinci-002 175B,FALSE,5-shot,10/20/22,{'Average (%)': '63.1'},{'Average (%)': 63.1},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=118809'}",[],63.1
106299,51,Llama 2 34B (5-shot),Llama 2 34B,TRUE,5-shot,7/18/23,{'Average (%)': '62.6'},{'Average (%)': 62.6},TRUE,"{'id': 1248363, 'title': 'Llama 2: Open Foundation and Fine-Tuned Chat Models', 'url': '/paper/Llama-2-open-foundation-and-fine-tuned-chat', 'published': '2023-07-18T00:00:00.000000', 'code': True, 'review_url': '/paper/Llama-2-open-foundation-and-fine-tuned-chat/review/?hl=106299'}",[],62.6
118947,52,Mistral 7B (5-shot),Mistral 7B,TRUE,5-shot,1/8/24,{'Average (%)': '62.5'},{'Average (%)': 62.5},FALSE,"{'id': 1356260, 'title': 'Mixtral of Experts', 'url': '/paper/mixtral-of-experts', 'published': '2024-01-08T00:00:00.000000', 'code': True, 'review_url': None}",[],62.5
118805,53,Flan-cont-PaLM 62B (CoT),Flan-cont-PaLM 62B,FALSE,CoT,10/20/22,{'Average (%)': '62'},{'Average (%)': 62.0},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=118805'}",[],62
118935,54,Mistral 7B (5-shot),Mistral 7B,TRUE,5-shot,10/10/23,{'Average (%)': '60.1'},{'Average (%)': 60.1},FALSE,"{'id': 1297015, 'title': 'Mistral 7B', 'url': '/paper/mistral-7b', 'published': '2023-10-10T00:00:00.000000', 'code': True, 'review_url': '/paper/mistral-7b/review/?hl=118935'}",[],60.1
47494,55,Gopher 280B (5-shot),Gopher 280B,FALSE,5-shot,12/8/21,{'Average (%)': '60.0'},{'Average (%)': 60.0},FALSE,"{'id': 942590, 'title': 'Scaling Language Models: Methods, Analysis & Insights from Training Gopher', 'url': '/paper/scaling-language-models-methods-analysis-1', 'published': '2021-12-08T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",60
118810,56,text-davinci-002 175B (CoT),text-davinci-002 175B,FALSE,CoT,10/20/22,{'Average (%)': '60'},{'Average (%)': 60.0},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=118810'}",[],60
118808,57,GPT-3 Davinci 175B (CoT),GPT-3 Davinci 175B,FALSE,CoT,10/20/22,{'Average (%)': '59.5'},{'Average (%)': 59.5},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=118808'}",[],59.5
117237,58,Llama 33B (5-shot),Llama 33B,TRUE,5-shot,2/27/23,{'Average (%)': '57.8'},{'Average (%)': 57.8},TRUE,"{'id': 1164350, 'title': 'Llama: Open and Efficient Foundation Language Models', 'url': '/paper/Llama-open-and-efficient-foundation-language-1', 'published': '2023-02-27T00:00:00.000000', 'code': True, 'review_url': '/paper/Llama-open-and-efficient-foundation-language-1/review/?hl=117237'}",[],57.8
114903,59,Falcon 40B,Falcon 40B,TRUE,,11/28/23,{'Average (%)': '57.0'},{'Average (%)': 57.0},FALSE,"{'id': 1329402, 'title': 'The Falcon Series of Open Language Models', 'url': '/paper/the-falcon-series-of-open-language-models', 'published': '2023-11-28T00:00:00.000000', 'code': False, 'review_url': '/paper/the-falcon-series-of-open-language-models/review/?hl=114903'}",[],57
73147,60,Flan-PaLM,Flan-PaLM,FALSE,,10/20/22,{'Average (%)': '56.9'},{'Average (%)': 56.9},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73147'}",[],56.9
107030,61,Qwen 7B (5-shot),Qwen 7B,TRUE,5-shot,8/3/23,{'Average (%)': '56.7'},{'Average (%)': 56.7},FALSE,"{'id': None, 'title': None, 'url': None, 'published': None, 'code': False, 'review_url': None}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",56.7
118378,62,FLAN-UL2 20B (5-shot),FLAN-UL2 20B,TRUE,5-shot,5/10/22,{'Average (%)': '55.7'},{'Average (%)': 55.7},FALSE,"{'id': 1007751, 'title': 'UL2: Unifying Language Learning Paradigms', 'url': '/paper/unifying-language-learning-paradigms', 'published': '2022-05-10T00:00:00.000000', 'code': True, 'review_url': '/paper/unifying-language-learning-paradigms/review/?hl=118378'}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",55.7
73141,63,Flan-T5-XXL 11B,Flan-T5-XXL 11B,TRUE,,10/20/22,{'Average (%)': '55.1'},{'Average (%)': 55.1},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73141'}",[],55.1
106287,64,Llama 2 13B (5-shot),Llama 2 13B,TRUE,5-shot,7/18/23,{'Average (%)': '54.8'},{'Average (%)': 54.8},TRUE,"{'id': 1248363, 'title': 'Llama 2: Open Foundation and Fine-Tuned Chat Models', 'url': '/paper/Llama-2-open-foundation-and-fine-tuned-chat', 'published': '2023-07-18T00:00:00.000000', 'code': True, 'review_url': '/paper/Llama-2-open-foundation-and-fine-tuned-chat/review/?hl=106287'}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",54.8
119030,65,Branch-Train-MiX 4x7B (sampling top-1 experts),Branch-Train-MiX 4x7B,TRUE,sampling top-1 experts,3/12/24,{'Average (%)': '53.2'},{'Average (%)': 53.2},FALSE,"{'id': 1397263, 'title': 'Branch-Train-MiX: Mixing Expert LLMs into a Mixture-of-Experts LLM', 'url': '/paper/branch-train-mix-mixing-expert-llms-into-a', 'published': '2024-03-12T00:00:00.000000', 'code': True, 'review_url': None}",[],53.2
85951,66,GAL 120B (zero-shot),GAL 120B,TRUE,zero-shot,11/16/22,{'Average (%)': '52.6'},{'Average (%)': 52.6},FALSE,"{'id': 1112728, 'title': 'Galactica: A Large Language Model for Science', 'url': '/paper/galactica-a-large-language-model-for-science-1', 'published': '2022-11-16T00:00:00.000000', 'code': True, 'review_url': '/paper/galactica-a-large-language-model-for-science-1/review/?hl=85951'}","[{'id': 188, 'name': 'zero-shot', 'color': '#2771D3'}, {'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",52.6
73140,67,Flan-T5-XL 3B,Flan-T5-XL 3B,TRUE,,10/20/22,{'Average (%)': '52.4'},{'Average (%)': 52.4},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73140'}",[],52.4
118379,68,FLAN-UL2 20B (chain-of-thought),FLAN-UL2 20B,TRUE,chain-of-thought,5/10/22,{'Average (%)': '52.2'},{'Average (%)': 52.2},FALSE,"{'id': 1007751, 'title': 'UL2: Unifying Language Learning Paradigms', 'url': '/paper/unifying-language-learning-paradigms', 'published': '2022-05-10T00:00:00.000000', 'code': True, 'review_url': '/paper/unifying-language-learning-paradigms/review/?hl=118379'}","[{'id': 318, 'name': 'chain-of-thought', 'color': '#eea320'}]",52.2
73146,69,Flan-PaLM 8B,Flan-PaLM 8B,FALSE,,10/20/22,{'Average (%)': '49.3'},{'Average (%)': 49.3},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73146'}",[],49.3
47497,70,UnifiedQA 11B,UnifiedQA 11B,TRUE,,5/2/20,{'Average (%)': '48.9'},{'Average (%)': 48.9},FALSE,"{'id': 193599, 'title': 'UnifiedQA: Crossing Format Boundaries With a Single QA System', 'url': '/paper/unifiedqa-crossing-format-boundaries-with-a', 'published': '2020-05-02T00:00:00.000000', 'code': True, 'review_url': '/paper/unifiedqa-crossing-format-boundaries-with-a/review/?hl=47497'}","[{'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",48.9
73156,71,Flan-T5-XXL 11B (CoT),Flan-T5-XXL 11B,TRUE,CoT,10/20/22,{'Average (%)': '48.6'},{'Average (%)': 48.6},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73156'}","[{'id': 318, 'name': 'chain-of-thought', 'color': '#eea320'}]",48.6
66197,72,Atlas (5-shot),Atlas,FALSE,5-shot,8/5/22,{'Average (%)': '47.9'},{'Average (%)': 47.9},FALSE,"{'id': 1055376, 'title': 'Atlas: Few-shot Learning with Retrieval Augmented Language Models', 'url': '/paper/few-shot-learning-with-retrieval-augmented', 'published': '2022-08-05T00:00:00.000000', 'code': True, 'review_url': '/paper/few-shot-learning-with-retrieval-augmented/review/?hl=66197'}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",47.9
73155,73,Flan-T5-XL 3B (CoT),Flan-T5-XL 3B,TRUE,CoT,10/20/22,{'Average (%)': '45.5'},{'Average (%)': 45.5},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73155'}","[{'id': 318, 'name': 'chain-of-thought', 'color': '#eea320'}]",45.5
106297,74,Llama 2 7B (5-shot),Llama 2 7B,TRUE,5-shot,7/18/23,{'Average (%)': '45.3'},{'Average (%)': 45.3},TRUE,"{'id': 1248363, 'title': 'Llama 2: Open Foundation and Fine-Tuned Chat Models', 'url': '/paper/Llama-2-open-foundation-and-fine-tuned-chat', 'published': '2023-07-18T00:00:00.000000', 'code': True, 'review_url': '/paper/Llama-2-open-foundation-and-fine-tuned-chat/review/?hl=106297'}",[],45.3
73129,75,Flan-T5-Large 780M,Flan-T5-Large 780M,TRUE,,10/20/22,{'Average (%)': '45.1'},{'Average (%)': 45.1},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73129'}",[],45.1
88975,76,GLM-130B,GLM-130B,TRUE,,10/5/22,{'Average (%)': '44.8'},{'Average (%)': 44.8},FALSE,"{'id': 1086925, 'title': 'GLM-130B: An Open Bilingual Pre-trained Model', 'url': '/paper/glm-130b-an-open-bilingual-pre-trained-model', 'published': '2022-10-05T00:00:00.000000', 'code': True, 'review_url': '/paper/glm-130b-an-open-bilingual-pre-trained-model/review/?hl=88975'}",[],44.8
47496,77,GPT-3 175B (5-shot),GPT-3 175B,FALSE,5-shot,6/11/20,{'Average (%)': '43.9'},{'Average (%)': 43.9},FALSE,"{'id': 216419, 'title': 'Measuring Massive Multitask Language Understanding', 'url': '/paper/measuring-massive-multitask-language', 'published': '2020-09-07T00:00:00.000000', 'code': True, 'review_url': '/paper/measuring-massive-multitask-language/review/?hl=47496'}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",43.9
47498,78,GPT-3 175B (fine-tuned),GPT-3 175B,FALSE,fine-tuned,9/7/20,{'Average (%)': '43.9'},{'Average (%)': 43.9},FALSE,"{'id': 216419, 'title': 'Measuring Massive Multitask Language Understanding', 'url': '/paper/measuring-massive-multitask-language', 'published': '2020-09-07T00:00:00.000000', 'code': True, 'review_url': '/paper/measuring-massive-multitask-language/review/?hl=47498'}","[{'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",43.9
117923,79,GPT-3 175B (5-shot),GPT-3 175B,FALSE,5-shot,5/28/20,{'Average (%)': '43.9'},{'Average (%)': 43.9},FALSE,"{'id': 198147, 'title': 'Language Models are Few-Shot Learners', 'url': '/paper/language-models-are-few-shot-learners', 'published': '2020-05-28T00:00:00.000000', 'code': True, 'review_url': '/paper/language-models-are-few-shot-learners/review/?hl=117923'}",[],43.9
47499,80,GPT-3 6.7B (fine-tuned),GPT-3 6.7B,FALSE,fine-tuned,9/7/20,{'Average (%)': '43.2'},{'Average (%)': 43.2},FALSE,"{'id': 216419, 'title': 'Measuring Massive Multitask Language Understanding', 'url': '/paper/measuring-massive-multitask-language', 'published': '2020-09-07T00:00:00.000000', 'code': True, 'review_url': '/paper/measuring-massive-multitask-language/review/?hl=47499'}","[{'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",43.2
126691,81,Gemma 2B,Gemma 2B,TRUE,,7/31/24,{'Average (%)': '42.3'},{'Average (%)': 42.3},FALSE,"{'id': None, 'title': None, 'url': None, 'published': None, 'code': False, 'review_url': None}",[],42.3
73154,82,Flan-T5-Large 780M (CoT),Flan-T5-Large 780M,TRUE,CoT,10/20/22,{'Average (%)': '40.5'},{'Average (%)': 40.5},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73154'}",[],40.5
118807,83,GPT-3 Davinci 175B (5-shot),GPT-3 Davinci 175B,FALSE,5-shot,10/20/22,{'Average (%)': '39.7'},{'Average (%)': 39.7},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=118807'}",[],39.7
100780,84,Bloomberg GPT 50B (5-shot),Bloomberg GPT 50B,FALSE,5-shot,3/30/23,{'Average (%)': '39.2'},{'Average (%)': 39.2},FALSE,"{'id': 1183339, 'title': 'BloombergGPT: A Large Language Model for Finance', 'url': '/paper/bloombergGPT-a-large-language-model-for', 'published': '2023-03-30T00:00:00.000000', 'code': False, 'review_url': '/paper/bloombergGPT-a-large-language-model-for/review/?hl=100780'}",[],39.2
118377,85,UL2 20B (5-shot),UL2 20B,TRUE,5-shot,5/10/22,{'Average (%)': '39.2'},{'Average (%)': 39.2},FALSE,"{'id': 1007751, 'title': 'UL2: Unifying Language Learning Paradigms', 'url': '/paper/unifying-language-learning-paradigms', 'published': '2022-05-10T00:00:00.000000', 'code': True, 'review_url': '/paper/unifying-language-learning-paradigms/review/?hl=118377'}",[],39.2
100783,86,BLOOM 176B (5-shot),BLOOM 176B,TRUE,5-shot,3/30/23,{'Average (%)': '39.1'},{'Average (%)': 39.1},FALSE,"{'id': 1183339, 'title': 'BloombergGPT: A Large Language Model for Finance', 'url': '/paper/bloombergGPT-a-large-language-model-for', 'published': '2023-03-30T00:00:00.000000', 'code': False, 'review_url': '/paper/bloombergGPT-a-large-language-model-for/review/?hl=100783'}",[],39.1
117042,87,phi-1.5-web 1.3B,phi-1.5-web 1.3B,TRUE,,9/11/23,{'Average (%)': '37.9'},{'Average (%)': 37.9},FALSE,"{'id': 1275377, 'title': 'Textbooks Are All You Need II: phi-1.5 technical report', 'url': '/paper/textbooks-are-all-you-need-ii-phi-1-5', 'published': '2023-09-11T00:00:00.000000', 'code': True, 'review_url': '/paper/textbooks-are-all-you-need-ii-phi-1-5/review/?hl=117042'}",[],37.9
117490,88,GPT-3 175B (0-shot),GPT-3 175B,FALSE,0-shot,6/11/20,{'Average (%)': '37.7'},{'Average (%)': 37.7},FALSE,"{'id': 216419, 'title': 'Measuring Massive Multitask Language Understanding', 'url': '/paper/measuring-massive-multitask-language', 'published': '2020-09-07T00:00:00.000000', 'code': True, 'review_url': '/paper/measuring-massive-multitask-language/review/?hl=117490'}",[],37.7
100782,89,OPT 66B (5-shot),OPT 66B,TRUE,5-shot,3/30/23,{'Average (%)': '36'},{'Average (%)': 36.0},FALSE,"{'id': 1183339, 'title': 'BloombergGPT: A Large Language Model for Finance', 'url': '/paper/bloombergGPT-a-large-language-model-for', 'published': '2023-03-30T00:00:00.000000', 'code': False, 'review_url': '/paper/bloombergGPT-a-large-language-model-for/review/?hl=100782'}",[],36
73128,90,Flan-T5-Base 250M,Flan-T5-Base 250M,TRUE,,10/20/22,{'Average (%)': '35.9'},{'Average (%)': 35.9},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73128'}",[],35.9
73153,91,Flan-T5-Base 250M (CoT),Flan-T5-Base 250M,TRUE,CoT,10/20/22,{'Average (%)': '33.7'},{'Average (%)': 33.7},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73153'}","[{'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",33.7
47575,92,GPT-NeoX 20B (5-shot),GPT-NeoX 20B,TRUE,5-shot,4/14/22,{'Average (%)': '33.6'},{'Average (%)': 33.6},FALSE,"{'id': 994573, 'title': 'GPT-NeoX-20B: An Open-Source Autoregressive Language Model', 'url': '/paper/GPT-neox-20b-an-open-source-autoregressive-1', 'published': '2022-04-14T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",33.6
47495,93,GPT-2-XL 1.5B (fine-tuned),GPT-2-XL 1.5B,TRUE,fine-tuned,11/5/19,{'Average (%)': '32.4'},{'Average (%)': 32.4},FALSE,"{'id': 216419, 'title': 'Measuring Massive Multitask Language Understanding', 'url': '/paper/measuring-massive-multitask-language', 'published': '2020-09-07T00:00:00.000000', 'code': True, 'review_url': '/paper/measuring-massive-multitask-language/review/?hl=47495'}","[{'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",32.4
118727,94,RWKV v5 Eagle 7B,RWKV v5 Eagle 7B,TRUE,,1/28/24,{'Average (%)': '31'},{'Average (%)': 31.0},FALSE,"{'id': None, 'title': None, 'url': None, 'published': None, 'code': False, 'review_url': None}",[],31
120435,95,Llama7B-MiLe-Loss(5-shot),Llama7B-MiLe-Loss,TRUE,5-shot,10/30/23,{'Average (%)': '29.68'},{'Average (%)': 29.68},TRUE,"{'id': 1311580, 'title': 'MiLe Loss: a New Loss for Mitigating the Bias of Learning Difficulties in Generative Language Models', 'url': '/paper/infoentropy-loss-to-mitigate-bias-of-learning', 'published': '2023-10-30T00:00:00.000000', 'code': True, 'review_url': None}",[],29.68
47502,96,Gopher 7.1B (5-shot),Gopher 7.1B,FALSE,5-shot,12/8/21,{'Average (%)': '29.5'},{'Average (%)': 29.5},FALSE,"{'id': 942590, 'title': 'Scaling Language Models: Methods, Analysis & Insights from Training Gopher', 'url': '/paper/scaling-language-models-methods-analysis-1', 'published': '2021-12-08T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",29.5
73127,97,Flan-T5-Small 80M,Flan-T5-Small 80M,TRUE,,10/20/22,{'Average (%)': '28.7'},{'Average (%)': 28.7},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73127'}",[],28.7
47556,98,GPT-NeoX 20B (0-shot),GPT-NeoX 20B,TRUE,0-shot,4/14/22,{'Average (%)': '28.6'},{'Average (%)': 28.6},FALSE,"{'id': 994573, 'title': 'GPT-NeoX-20B: An Open-Source Autoregressive Language Model', 'url': '/paper/GPT-neox-20b-an-open-source-autoregressive-1', 'published': '2022-04-14T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 188, 'name': 'zero-shot', 'color': '#2771D3'}]",28.6
114902,99,Falcon 7B (5-shot),Falcon 7B,TRUE,5-shot,11/28/23,{'Average (%)': '28.0'},{'Average (%)': 28.0},FALSE,"{'id': 1329402, 'title': 'The Falcon Series of Open Language Models', 'url': '/paper/the-falcon-series-of-open-language-models', 'published': '2023-11-28T00:00:00.000000', 'code': False, 'review_url': '/paper/the-falcon-series-of-open-language-models/review/?hl=114902'}",[],28
47581,100,RoBERTa-base 125M (fine-tuned),RoBERTa-base 125M,TRUE,fine-tuned,7/26/19,{'Average (%)': '27.9'},{'Average (%)': 27.9},FALSE,"{'id': 148282, 'title': 'RoBERTa: A Robustly Optimized BERT Pretraining Approach', 'url': '/paper/roBERTa-a-robustly-optimized-BERT-pretraining', 'published': '2019-07-26T00:00:00.000000', 'code': True, 'review_url': '/paper/roBERTa-a-robustly-optimized-BERT-pretraining/review/?hl=47581'}","[{'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",27.9
47501,101,Gopher 1.4B (5-shot),Gopher 1.4B,FALSE,5-shot,12/8/21,{'Average (%)': '27.3'},{'Average (%)': 27.3},FALSE,"{'id': 942590, 'title': 'Scaling Language Models: Methods, Analysis & Insights from Training Gopher', 'url': '/paper/scaling-language-models-methods-analysis-1', 'published': '2021-12-08T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",27.3
47555,102,GPT-J 6B (zero-shot),GPT-J 6B,TRUE,zero-shot,5/1/21,{'Average (%)': '27.3'},{'Average (%)': 27.3},FALSE,"{'id': 994573, 'title': 'GPT-NeoX-20B: An Open-Source Autoregressive Language Model', 'url': '/paper/GPT-neox-20b-an-open-source-autoregressive-1', 'published': '2022-04-14T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 188, 'name': 'zero-shot', 'color': '#2771D3'}]",27.3
47580,103,ALBERT-xxlarge 223M (fine-tuned),ALBERT-xxlarge 223M,TRUE,fine-tuned,9/26/19,{'Average (%)': '27.1'},{'Average (%)': 27.1},FALSE,"{'id': 156146, 'title': 'ALBERT: A Lite BERT for Self-supervised Learning of Language Representations', 'url': '/paper/alBERT-a-lite-BERT-for-self-supervised', 'published': '2019-09-26T00:00:00.000000', 'code': True, 'review_url': '/paper/alBERT-a-lite-BERT-for-self-supervised/review/?hl=47580'}","[{'id': 184, 'name': 'fine-tuned', 'color': '#e56666'}]",27.1
47577,104,GPT-3 13B (5-shot),GPT-3 13B,FALSE,5-shot,6/11/20,{'Average (%)': '26'},{'Average (%)': 26.0},FALSE,"{'id': 216419, 'title': 'Measuring Massive Multitask Language Understanding', 'url': '/paper/measuring-massive-multitask-language', 'published': '2020-09-07T00:00:00.000000', 'code': True, 'review_url': '/paper/measuring-massive-multitask-language/review/?hl=47577'}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",26
118518,105,"GPT-3 13B (few-shot, k=32)",GPT-3 13B,FALSE,"few-shot, k=32",5/28/20,{'Average (%)': '26'},{'Average (%)': 26.0},FALSE,"{'id': 198147, 'title': 'Language Models are Few-Shot Learners', 'url': '/paper/language-models-are-few-shot-learners', 'published': '2020-05-28T00:00:00.000000', 'code': True, 'review_url': '/paper/language-models-are-few-shot-learners/review/?hl=118518'}",[],26
47579,106,GPT-3 2.7B (5-shot),GPT-3 2.7B,FALSE,5-shot,6/11/20,{'Average (%)': '25.9'},{'Average (%)': 25.9},FALSE,"{'id': 216419, 'title': 'Measuring Massive Multitask Language Understanding', 'url': '/paper/measuring-massive-multitask-language', 'published': '2020-09-07T00:00:00.000000', 'code': True, 'review_url': '/paper/measuring-massive-multitask-language/review/?hl=47579'}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",25.9
117949,107,GPT-3 2.7B (5-shot),GPT-3 2.7B,FALSE,5-shot,5/28/20,{'Average (%)': '25.9'},{'Average (%)': 25.9},FALSE,"{'id': 198147, 'title': 'Language Models are Few-Shot Learners', 'url': '/paper/language-models-are-few-shot-learners', 'published': '2020-05-28T00:00:00.000000', 'code': True, 'review_url': '/paper/language-models-are-few-shot-learners/review/?hl=117949'}",[],25.9
47500,108,Gopher 0.4B (5-shot),Gopher 0.4B,FALSE,5-shot,12/8/21,{'Average (%)': '25.7'},{'Average (%)': 25.7},FALSE,"{'id': 942590, 'title': 'Scaling Language Models: Methods, Analysis & Insights from Training Gopher', 'url': '/paper/scaling-language-models-methods-analysis-1', 'published': '2021-12-08T00:00:00.000000', 'code': True, 'review_url': None}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",25.7
47493,109,Random chance baseline,Random chance baseline,TRUE,,9/7/20,{'Average (%)': '25.0'},{'Average (%)': 25.0},FALSE,"{'id': 216419, 'title': 'Measuring Massive Multitask Language Understanding', 'url': '/paper/measuring-massive-multitask-language', 'published': '2020-09-07T00:00:00.000000', 'code': True, 'review_url': '/paper/measuring-massive-multitask-language/review/?hl=47493'}",[],25
47578,110,GPT-3 6.7B (5-shot),GPT-3 6.7B,FALSE,5-shot,6/11/20,{'Average (%)': '24.9'},{'Average (%)': 24.9},FALSE,"{'id': 216419, 'title': 'Measuring Massive Multitask Language Understanding', 'url': '/paper/measuring-massive-multitask-language', 'published': '2020-09-07T00:00:00.000000', 'code': True, 'review_url': '/paper/measuring-massive-multitask-language/review/?hl=47578'}","[{'id': 183, 'name': 'few-shot', 'color': '#a1df95'}]",24.9
117942,111,GPT-3 6.7B (5-shot),GPT-3 6.7B,FALSE,5-shot,5/28/20,{'Average (%)': '24.9'},{'Average (%)': 24.9},FALSE,"{'id': 198147, 'title': 'Language Models are Few-Shot Learners', 'url': '/paper/language-models-are-few-shot-learners', 'published': '2020-05-28T00:00:00.000000', 'code': True, 'review_url': '/paper/language-models-are-few-shot-learners/review/?hl=117942'}",[],24.9
73152,113,Flan-T5-Small 80M (CoT),Flan-T5-Small 80M,TRUE,CoT,10/20/22,{'Average (%)': '12.1'},{'Average (%)': 12.1},FALSE,"{'id': 1097139, 'title': 'Scaling Instruction-Finetuned Language Models', 'url': '/paper/scaling-instruction-finetuned-language-models', 'published': '2022-10-20T00:00:00.000000', 'code': True, 'review_url': '/paper/scaling-instruction-finetuned-language-models/review/?hl=73152'}",[],12.1