 % Table body: one header row followed by six data rows, 15 columns each.
 % Columns: initial model | dataset | 11 model columns (gemma-7b-it ... gpt-4-0125-preview) | random | human.
 % Markup used by the cells:
 %   \scalebox{0.9}[1]{...}            horizontally compresses the dataset name (graphicx).
 %   \textbf{...}                      in every row this is the largest value among the 11 model
 %                                     columns (the random and human columns are not considered).
 %   \cellcolor[RGB]{211,211,211}{...} light-gray shading (colortbl); in every row the shaded cells
 %                                     are exactly those whose value is below that row's "random"
 %                                     column -- NOTE(review): presumably the intended rule; confirm
 %                                     against the table caption.
 % NOTE(review): the "initial model" cell is empty in all six data rows. The rows fall into two
 % groups of three datasets (MathGen, FgFactV, AnsCls), so each group presumably belongs to a
 % different initial model whose label is not present in this fragment -- confirm and restore.
 initial model & dataset & gemma-7b-it & Llama-2-13b-chat-hf & Llama-2-70b-chat-hf & Mixtral-8x7B-Instruct-v0.1 & Qwen1.5-14B-Chat & Qwen1.5-72B-Chat & gpt-3.5-turbo-0125 & gemini-1.0-pro-001 & claude-3-opus-20240229 & gpt-4-0613 & gpt-4-0125-preview & random & human \\
 % --- First group of rows (initial-model label not shown) ---
 & \scalebox{0.9}[1]{MathGen} & 83.1 & \cellcolor[RGB]{211,211,211}{45.5} & \textbf{98.7} & \cellcolor[RGB]{211,211,211}{48.1} & 70.1 & \cellcolor[RGB]{211,211,211}{13.0} & 80.5 & \cellcolor[RGB]{211,211,211}{11.7} & \cellcolor[RGB]{211,211,211}{29.9} & 55.8 & 64.9 & 55.0 & 81.8 \\
 & \scalebox{0.9}[1]{FgFactV} & 62.8 & \cellcolor[RGB]{211,211,211}{55.1} & \textbf{89.7} & 64.1 & 88.5 & \cellcolor[RGB]{211,211,211}{19.2} & \cellcolor[RGB]{211,211,211}{44.9} & \cellcolor[RGB]{211,211,211}{24.4} & \cellcolor[RGB]{211,211,211}{24.4} & \cellcolor[RGB]{211,211,211}{7.7} & \cellcolor[RGB]{211,211,211}{14.1} & 55.7 & 95.5 \\
 & \scalebox{0.9}[1]{AnsCls} & 65.4 & 67.9 & \textbf{84.0} & \cellcolor[RGB]{211,211,211}{40.7} & 76.5 & \cellcolor[RGB]{211,211,211}{6.2} & \cellcolor[RGB]{211,211,211}{28.4} & \cellcolor[RGB]{211,211,211}{12.3} & \cellcolor[RGB]{211,211,211}{22.2} & \cellcolor[RGB]{211,211,211}{14.8} & \cellcolor[RGB]{211,211,211}{13.6} & 57.9 & 78.3 \\
 % --- Second group of rows (initial-model label not shown) ---
 & \scalebox{0.9}[1]{MathGen} & 80.0 & \cellcolor[RGB]{211,211,211}{20.0} & \textbf{89.2} & \cellcolor[RGB]{211,211,211}{58.3} & \cellcolor[RGB]{211,211,211}{64.2} & \cellcolor[RGB]{211,211,211}{22.5} & 80.0 & \cellcolor[RGB]{211,211,211}{25.8} & \cellcolor[RGB]{211,211,211}{69.2} & 85.0 & 88.3 & 75.0 & 96.7 \\
 & \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{59.5} & \cellcolor[RGB]{211,211,211}{61.1} & 81.7 & \cellcolor[RGB]{211,211,211}{41.3} & \textbf{92.9} & \cellcolor[RGB]{211,211,211}{10.3} & \cellcolor[RGB]{211,211,211}{19.0} & \cellcolor[RGB]{211,211,211}{12.7} & \cellcolor[RGB]{211,211,211}{16.7} & \cellcolor[RGB]{211,211,211}{35.7} & \cellcolor[RGB]{211,211,211}{65.1} & 78.8 & 100.0 \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{28.2} & \cellcolor[RGB]{211,211,211}{40.3} & \cellcolor[RGB]{211,211,211}{21.8} & \cellcolor[RGB]{211,211,211}{25.0} & \textbf{79.8} & \cellcolor[RGB]{211,211,211}{0.8} & \cellcolor[RGB]{211,211,211}{1.6} & \cellcolor[RGB]{211,211,211}{3.2} & \cellcolor[RGB]{211,211,211}{13.7} & \cellcolor[RGB]{211,211,211}{51.6} & \cellcolor[RGB]{211,211,211}{75.0} & 77.5 & 100.0 \\
