 initial model & dataset & gemma-7b-it & Llama-2-13b-chat-hf & Llama-2-70b-chat-hf & Mixtral-8x7B-Instruct-v0.1 & Qwen1.5-14B-Chat & Qwen1.5-72B-Chat & gpt-3.5-turbo-0125 & gemini-1.0-pro-001 & claude-3-opus-20240229 & gpt-4-0613 & gpt-4-0125-preview & random & human \\
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{50.3} & \cellcolor[RGB]{211,211,211}{53.2} & \textbf{75.3} & \cellcolor[RGB]{211,211,211}{37.7} & \cellcolor[RGB]{211,211,211}{49.0} & \cellcolor[RGB]{211,211,211}{23.4} & 65.3 & \cellcolor[RGB]{211,211,211}{39.9} & \cellcolor[RGB]{211,211,211}{38.3} & \cellcolor[RGB]{211,211,211}{51.6} & 61.0 & 55.0 & 81.8 \\
 & \scalebox{0.9}[1]{FgFactV} & 62.8 & 71.5 & \textbf{83.3} & \cellcolor[RGB]{211,211,211}{42.6} & 61.9 & \cellcolor[RGB]{211,211,211}{17.6} & \cellcolor[RGB]{211,211,211}{38.5} & \cellcolor[RGB]{211,211,211}{40.4} & \cellcolor[RGB]{211,211,211}{40.1} & \cellcolor[RGB]{211,211,211}{6.7} & \cellcolor[RGB]{211,211,211}{11.5} & 55.7 & 95.5 \\
 & \scalebox{0.9}[1]{AnsCls} & 58.6 & \textbf{80.2} & 78.7 & \cellcolor[RGB]{211,211,211}{30.9} & \cellcolor[RGB]{211,211,211}{54.6} & \cellcolor[RGB]{211,211,211}{9.9} & \cellcolor[RGB]{211,211,211}{18.8} & \cellcolor[RGB]{211,211,211}{31.8} & \cellcolor[RGB]{211,211,211}{26.9} & \cellcolor[RGB]{211,211,211}{12.3} & \cellcolor[RGB]{211,211,211}{13.6} & 57.9 & 78.3 \\
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{50.6} & \cellcolor[RGB]{211,211,211}{50.4} & \cellcolor[RGB]{211,211,211}{72.7} & \cellcolor[RGB]{211,211,211}{45.8} & \cellcolor[RGB]{211,211,211}{47.9} & \cellcolor[RGB]{211,211,211}{39.2} & \cellcolor[RGB]{211,211,211}{67.1} & \cellcolor[RGB]{211,211,211}{47.3} & \cellcolor[RGB]{211,211,211}{74.6} & 82.7 & \textbf{87.9} & 75.0 & 96.7 \\
 & \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{61.9} & \cellcolor[RGB]{211,211,211}{77.4} & \textbf{82.5} & \cellcolor[RGB]{211,211,211}{24.8} & \cellcolor[RGB]{211,211,211}{61.1} & \cellcolor[RGB]{211,211,211}{11.1} & \cellcolor[RGB]{211,211,211}{24.2} & \cellcolor[RGB]{211,211,211}{31.7} & \cellcolor[RGB]{211,211,211}{32.5} & \cellcolor[RGB]{211,211,211}{25.2} & \cellcolor[RGB]{211,211,211}{54.0} & 78.8 & 100.0 \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{24.2} & \textbf{78.2} & \cellcolor[RGB]{211,211,211}{46.4} & \cellcolor[RGB]{211,211,211}{20.6} & \cellcolor[RGB]{211,211,211}{45.0} & \cellcolor[RGB]{211,211,211}{2.4} & \cellcolor[RGB]{211,211,211}{2.0} & \cellcolor[RGB]{211,211,211}{9.5} & \cellcolor[RGB]{211,211,211}{15.1} & \cellcolor[RGB]{211,211,211}{48.4} & \cellcolor[RGB]{211,211,211}{63.9} & 77.5 & 100.0 \\
