model,score,rating_q025,rating_q975,CI,avg_tokens,date
gpt-4-turbo-2024-04-09,82.63,80.75,84.6,"(-1.88, +1.97)",662.0,2024-07-31
claude-3-5-sonnet-20240620,79.35,77.25,80.62,"(-2.10, +1.27)",567.0,2024-07-31
gpt-4o-2024-05-13,79.21,77.42,80.71,"(-1.79, +1.50)",696.0,2024-07-31
gpt-4-0125-preview,77.96,75.94,79.9,"(-2.02, +1.94)",619.0,2024-07-31
athene-70b-0725,76.83,74.84,78.74,"(-1.99, +1.91)",683.0,2024-07-31
gpt-4o-mini-2024-07-18,74.94,72.66,77.07,"(-2.28, +2.13)",668.0,2024-07-31
gemini-1.5-pro-api-0514,71.96,69.62,74.62,"(-2.34, +2.66)",676.0,2024-07-31
yi-large-preview,71.48,69.02,73.37,"(-2.46, +1.89)",720.0,2024-07-31
mistral-large-2407,70.42,68.11,72.43,"(-2.31, +2.01)",623.0,2024-07-31
llama-3.1-405b-instruct,64.09,61.43,66.55,"(-2.66, +2.46)",633.0,2024-07-31
glm-4-0520,63.84,61.28,66.19,"(-2.56, +2.35)",636.0,2024-07-31
yi-large,63.7,61.76,65.86,"(-1.94, +2.16)",626.0,2024-07-31
deepseek-coder-v2,62.3,59.82,64.72,"(-2.48, +2.42)",578.0,2024-07-31
claude-3-opus-20240229,60.36,57.56,62.34,"(-2.80, +1.98)",541.0,2024-07-31
gemma-2-27b-it,57.51,55.11,60.12,"(-2.40, +2.61)",577.0,2024-07-31
llama-3.1-70b-instruct,55.73,52.85,58.2,"(-2.88, +2.47)",628.0,2024-07-31
glm-4-0116,55.72,53.83,58.16,"(-1.89, +2.44)",622.0,2024-07-31
gemini-1.5-pro-api-0409-preview,53.37,51.13,56.66,"(-2.24, +3.29)",478.0,2024-07-31
glm-4-air,50.88,48.62,53.21,"(-2.26, +2.33)",619.0,2024-07-31
gpt-4-0314,50.0,50.0,50.0,"(-0.00, +0.00)",423.0,2024-07-31
gemini-1.5-flash-api-0514,49.61,47.46,52.17,"(-2.15, +2.56)",642.0,2024-07-31
qwen2-72b-instruct,46.86,44.57,49.29,"(-2.29, +2.43)",515.0,2024-07-31
claude-3-sonnet-20240229,46.8,44.12,49.04,"(-2.68, +2.24)",552.0,2024-07-31
llama-3-70b-instruct,46.57,43.84,49.18,"(-2.73, +2.61)",591.0,2024-07-31
claude-3-haiku-20240307,41.47,39.57,44.02,"(-1.90, +2.55)",505.0,2024-07-31
gpt-4-0613,37.9,35.6,40.36,"(-2.30, +2.46)",354.0,2024-07-31
mistral-large-2402,37.71,34.81,39.77,"(-2.90, +2.06)",400.0,2024-07-31
mixtral-8x22b-instruct-v0.1,36.36,34.21,38.55,"(-2.15, +2.19)",430.0,2024-07-31
qwen1.5-72b-chat,36.12,33.88,38.15,"(-2.24, +2.03)",474.0,2024-07-31
phi-3-medium-4k-instruct,33.37,31.26,35.14,"(-2.11, +1.77)",517.0,2024-07-31
command-r-plus,33.07,30.85,35.12,"(-2.22, +2.05)",541.0,2024-07-31
mistral-medium,31.9,29.66,34.31,"(-2.24, +2.41)",485.0,2024-07-31
phi-3-small-8k-instruct,29.77,27.94,31.97,"(-1.83, +2.20)",568.0,2024-07-31
mistral-next,27.37,25.4,29.09,"(-1.97, +1.72)",297.0,2024-07-31
gpt-3.5-turbo-0613,24.82,22.54,26.29,"(-2.28, +1.47)",401.0,2024-07-31
dbrx-instruct-preview,24.63,22.33,26.83,"(-2.30, +2.20)",415.0,2024-07-31
claude-2.0,23.99,21.71,25.65,"(-2.28, +1.66)",295.0,2024-07-31
mixtral-8x7b-instruct-v0.1,23.4,21.38,25.41,"(-2.02, +2.01)",457.0,2024-07-31
gpt-3.5-turbo-0125,23.34,21.67,25.27,"(-1.67, +1.93)",329.0,2024-07-31
yi-34b-chat,23.15,20.75,24.7,"(-2.40, +1.55)",611.0,2024-07-31
starling-lm-7b-beta,23.01,20.81,24.66,"(-2.20, +1.65)",530.0,2024-07-31
claude-2.1,22.77,20.65,25.43,"(-2.12, +2.66)",290.0,2024-07-31
llama-3.1-8b-instruct,21.34,19.71,23.09,"(-1.63, +1.75)",861.0,2024-07-31
snorkel-mistral-pairrm-dpo,20.73,19.04,22.05,"(-1.69, +1.32)",564.0,2024-07-31
llama-3-8b-instruct,20.56,18.82,22.61,"(-1.74, +2.05)",585.0,2024-07-31
gpt-3.5-turbo-1106,18.87,17.06,20.58,"(-1.81, +1.71)",285.0,2024-07-31
gpt-3.5-turbo-0314,18.05,16.57,20.06,"(-1.48, +2.01)",334.0,2024-07-31
gemini-pro,17.8,15.96,19.32,"(-1.84, +1.52)",322.0,2024-07-31
snowflake-arctic-instruct,17.61,16.12,19.27,"(-1.49, +1.66)",365.0,2024-07-31
command-r,17.02,15.73,18.51,"(-1.29, +1.49)",432.0,2024-07-31
phi-3-mini-128k-instruct,15.43,13.94,17.02,"(-1.49, +1.59)",609.0,2024-07-31
tulu-2-dpo-70b,14.99,13.05,16.82,"(-1.94, +1.83)",550.0,2024-07-31
starling-lm-7b-alpha,12.8,11.23,14.5,"(-1.57, +1.70)",483.0,2024-07-31
mistral-7b-instruct,12.57,11.05,14.11,"(-1.52, +1.54)",541.0,2024-07-31
gemma-1.1-7b-it,12.09,10.61,13.43,"(-1.48, +1.34)",341.0,2024-07-31
llama-2-70b-chat,11.55,10.02,13.01,"(-1.53, +1.46)",595.0,2024-07-31
vicuna-33b,8.63,7.59,9.84,"(-1.04, +1.21)",451.0,2024-07-31
gemma-7b-it,7.47,6.5,8.6,"(-0.97, +1.13)",378.0,2024-07-31
gemma-1.1-2b-it,3.37,2.74,4.14,"(-0.63, +0.77)",316.0,2024-07-31
gemma-2b-it,3.0,2.33,3.67,"(-0.67, +0.67)",369.0,2024-07-31
