﻿model name,company,context window,AIME 2025,SWE-bench Verified,TAU-bench-Airline,Tau-bench-Retail
Gemini 2.5 Pro,Google,1M,88%,59.60%,NA,NA
Gemini 2.5 Flash-non thinking,Google,1M,61.60%,50%,NA,NA
Gemini 2.5 Flash-Lite-non thinking,Google,1M,49.80%,31.60%,NA,NA
Gemini 2.5 Flash-Lite-thinking,Google,1M,63.10%,27.60%,NA,NA
Gemini 2.5 Flash-thinking,Google,1M,72.00%,48.90%,NA,NA
o4-mini,OpenAI,200k,92.70%,68.10%,49.20%,65.60%
o3,OpenAI,200k,88.90%,69.10%,52%,70.40%
o3-mini,OpenAI,200k,86.50%,49.30%,32.40%,57.60%
Claude 3.7 Sonnet,Anthropic,200k,54.80%,62.30%,58.40%,81.20%
Claude Sonnet 4,Anthropic,200k,70.50%,72.70%,60%,80.50%
Claude Opus 4,Anthropic,200k,75.50%,72.50%,59.60%,81.40%
Seed-Thinking v1.5,ByteDance,32k,74%,47%,NA,NA
Seed-Thinking v1.6,ByteDance,256k,86%,NA,NA,NA
DeepSeek-V3,DeepSeek,128k,NA,42%,NA,NA
DeepSeek-R1,DeepSeek,128k,70%,49.20%,NA,NA
DeepSeek-V3-0324,DeepSeek,64k,47.74%,38.80%,NA,NA
DeepSeek-R1-0528,DeepSeek,64k,87.50%,57.60%,NA,NA