 % Table body: one header row followed by one row per dataset.
 % Each row has 15 cells: initial model, dataset, 11 model columns, random, human.
 % Formatting as observed in the rows below:
 %   - \textbf{...} is on the highest value among the 11 model columns of each row.
 %   - \cellcolor[RGB]{211,211,211} (light gray) is on every non-bold model cell whose
 %     value is below that row's "random" value; the "random" and "human" cells are never shaded.
 % NOTE(review): bold cells are never shaded, even when they are below "random"
 %   (e.g. 49.4 vs. 55.0 in the first MathGen row) -- confirm this is intended.
 % NOTE(review): the "initial model" cell is empty in every row; the two
 %   MathGen/FgFactV/AnsCls groups presumably belong to different initial models --
 %   confirm and label them if the label is not supplied elsewhere.
 initial model & dataset & gemma-7b-it & Llama-2-13b-chat-hf & Llama-2-70b-chat-hf & Mixtral-8x7B-Instruct-v0.1 & Qwen1.5-14B-Chat & Qwen1.5-72B-Chat & gpt-3.5-turbo-0125 & gemini-1.0-pro-001 & claude-3-opus-20240229 & gpt-4-0613 & gpt-4-0125-preview & random & human \\
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{3.9} & \cellcolor[RGB]{211,211,211}{20.8} & \cellcolor[RGB]{211,211,211}{2.6} & \cellcolor[RGB]{211,211,211}{15.6} & \cellcolor[RGB]{211,211,211}{13.0} & \cellcolor[RGB]{211,211,211}{6.5} & \cellcolor[RGB]{211,211,211}{26.0} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{22.1} & \cellcolor[RGB]{211,211,211}{39.0} & \textbf{49.4} & 55.0 & 81.8 \\
 & \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{42.3} & \cellcolor[RGB]{211,211,211}{41.0} & \textbf{44.9} & \cellcolor[RGB]{211,211,211}{6.4} & \cellcolor[RGB]{211,211,211}{24.4} & \cellcolor[RGB]{211,211,211}{3.8} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{16.7} & \cellcolor[RGB]{211,211,211}{20.5} & \cellcolor[RGB]{211,211,211}{5.1} & \cellcolor[RGB]{211,211,211}{3.8} & 55.7 & 95.5 \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{33.3} & \textbf{55.6} & \cellcolor[RGB]{211,211,211}{45.7} & \cellcolor[RGB]{211,211,211}{14.8} & \cellcolor[RGB]{211,211,211}{16.0} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{1.2} & \cellcolor[RGB]{211,211,211}{11.1} & \cellcolor[RGB]{211,211,211}{18.5} & \cellcolor[RGB]{211,211,211}{8.6} & \cellcolor[RGB]{211,211,211}{13.6} & 57.9 & 78.3 \\
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{2.5} & \cellcolor[RGB]{211,211,211}{23.3} & \cellcolor[RGB]{211,211,211}{4.2} & \cellcolor[RGB]{211,211,211}{16.7} & \cellcolor[RGB]{211,211,211}{6.7} & \cellcolor[RGB]{211,211,211}{17.5} & \cellcolor[RGB]{211,211,211}{33.3} & \cellcolor[RGB]{211,211,211}{6.7} & \cellcolor[RGB]{211,211,211}{62.5} & 75.8 & \textbf{84.2} & 75.0 & 96.7 \\
 & \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{29.4} & \textbf{61.1} & \cellcolor[RGB]{211,211,211}{57.9} & \cellcolor[RGB]{211,211,211}{3.2} & \cellcolor[RGB]{211,211,211}{13.5} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{0.8} & \cellcolor[RGB]{211,211,211}{7.9} & \cellcolor[RGB]{211,211,211}{8.7} & \cellcolor[RGB]{211,211,211}{10.3} & \cellcolor[RGB]{211,211,211}{41.3} & 78.8 & 100.0 \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{6.5} & \textbf{79.8} & \cellcolor[RGB]{211,211,211}{1.6} & \cellcolor[RGB]{211,211,211}{1.6} & \cellcolor[RGB]{211,211,211}{1.6} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{1.6} & \cellcolor[RGB]{211,211,211}{8.1} & \cellcolor[RGB]{211,211,211}{49.2} & \cellcolor[RGB]{211,211,211}{56.5} & 77.5 & 100.0 \\
