 initial model & dataset & gemma-7b-it & Llama-2-13b-chat-hf & Llama-2-70b-chat-hf & Mixtral-8x7B-Instruct-v0.1 & Qwen1.5-14B-Chat & Qwen1.5-72B-Chat & gpt-3.5-turbo-0125 & gemini-1.0-pro-001 & claude-3-opus-20240229 & gpt-4-0613 & gpt-4-0125-preview & random & human \\
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{44.3} & \cellcolor[RGB]{211,211,211}{51.6} & 55.4 & \cellcolor[RGB]{211,211,211}{46.9} & \cellcolor[RGB]{211,211,211}{48.7} & \cellcolor[RGB]{211,211,211}{31.9} & 62.7 & \cellcolor[RGB]{211,211,211}{37.8} & \cellcolor[RGB]{211,211,211}{51.7} & 65.1 & \textbf{69.2} & 55.0 & 90.0 \\
 & \scalebox{0.9}[1]{FgFactV} & 58.5 & 59.5 & \textbf{65.4} & \cellcolor[RGB]{211,211,211}{42.9} & \cellcolor[RGB]{211,211,211}{55.7} & \cellcolor[RGB]{211,211,211}{25.1} & \cellcolor[RGB]{211,211,211}{41.2} & \cellcolor[RGB]{211,211,211}{45.1} & \cellcolor[RGB]{211,211,211}{48.7} & \cellcolor[RGB]{211,211,211}{12.4} & \cellcolor[RGB]{211,211,211}{19.8} & 55.7 & 95.5 \\
 & \scalebox{0.9}[1]{AnsCls} & 57.9 & \textbf{66.5} & 65.9 & \cellcolor[RGB]{211,211,211}{39.3} & \cellcolor[RGB]{211,211,211}{52.7} & \cellcolor[RGB]{211,211,211}{16.5} & \cellcolor[RGB]{211,211,211}{26.9} & \cellcolor[RGB]{211,211,211}{39.6} & \cellcolor[RGB]{211,211,211}{38.2} & \cellcolor[RGB]{211,211,211}{21.6} & \cellcolor[RGB]{211,211,211}{23.6} & 57.9 & 85.7 \\
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{51.9} & \cellcolor[RGB]{211,211,211}{55.2} & \cellcolor[RGB]{211,211,211}{66.8} & \cellcolor[RGB]{211,211,211}{56.7} & \cellcolor[RGB]{211,211,211}{54.4} & \cellcolor[RGB]{211,211,211}{51.6} & \cellcolor[RGB]{211,211,211}{71.7} & \cellcolor[RGB]{211,211,211}{52.2} & 81.6 & 87.6 & \textbf{89.0} & 75.0 & 98.3 \\
 & \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{68.5} & \cellcolor[RGB]{211,211,211}{77.7} & \textbf{80.6} & \cellcolor[RGB]{211,211,211}{35.5} & \cellcolor[RGB]{211,211,211}{63.8} & \cellcolor[RGB]{211,211,211}{18.4} & \cellcolor[RGB]{211,211,211}{34.3} & \cellcolor[RGB]{211,211,211}{41.2} & \cellcolor[RGB]{211,211,211}{45.2} & \cellcolor[RGB]{211,211,211}{38.1} & \cellcolor[RGB]{211,211,211}{67.8} & 78.8 & 100.0 \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{35.6} & \textbf{76.5} & \cellcolor[RGB]{211,211,211}{50.2} & \cellcolor[RGB]{211,211,211}{30.8} & \cellcolor[RGB]{211,211,211}{43.7} & \cellcolor[RGB]{211,211,211}{4.6} & \cellcolor[RGB]{211,211,211}{3.8} & \cellcolor[RGB]{211,211,211}{15.6} & \cellcolor[RGB]{211,211,211}{26.0} & \cellcolor[RGB]{211,211,211}{64.1} & \cellcolor[RGB]{211,211,211}{75.7} & 77.5 & 98.3 \\
