 % Table body: one header row followed by six data rows, 15 columns each:
 %   initial model | dataset | eleven model columns (gemma-7b-it ... gpt-4-0125-preview) | random | human
 % Markup conventions as observed in the rows below:
 %   - \cellcolor[RGB]{211,211,211} (light gray) is applied exactly to the model
 %     cells whose value is below the `random` value of the same row.
 %   - \textbf marks the highest value among the eleven model columns of a row;
 %     the `human` column is not considered (e.g. human = 100.0 is left unbolded).
 %   - \scalebox{0.9}[1]{...} narrows the dataset label horizontally to 90% width.
 % Uses \cellcolor (colortbl) and \scalebox (graphicx).
 % NOTE(review): the `initial model` cell is empty in every data row. The rows
 % appear to form two groups of three datasets each -- confirm that the group
 % labels (e.g. a \multirow) are supplied elsewhere or were lost.
 initial model & dataset & gemma-7b-it & Llama-2-13b-chat-hf & Llama-2-70b-chat-hf & Mixtral-8x7B-Instruct-v0.1 & Qwen1.5-14B-Chat & Qwen1.5-72B-Chat & gpt-3.5-turbo-0125 & gemini-1.0-pro-001 & claude-3-opus-20240229 & gpt-4-0613 & gpt-4-0125-preview & random & human \\
 % First group of rows (MathGen / FgFactV / AnsCls).
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{33.8} & 61.0 & \textbf{100.0} & \cellcolor[RGB]{211,211,211}{54.5} & 93.5 & \cellcolor[RGB]{211,211,211}{48.1} & 97.4 & 74.0 & \cellcolor[RGB]{211,211,211}{42.9} & 62.3 & 64.9 & 55.0 & 81.8 \\
 & \scalebox{0.9}[1]{FgFactV} & 78.2 & 97.4 & \textbf{100.0} & 79.5 & 83.3 & \cellcolor[RGB]{211,211,211}{35.9} & 70.5 & 62.8 & 56.4 & \cellcolor[RGB]{211,211,211}{9.0} & \cellcolor[RGB]{211,211,211}{14.1} & 55.7 & 95.5 \\
 & \scalebox{0.9}[1]{AnsCls} & 70.4 & \textbf{98.8} & 93.8 & \cellcolor[RGB]{211,211,211}{40.7} & 88.9 & \cellcolor[RGB]{211,211,211}{17.3} & \cellcolor[RGB]{211,211,211}{29.6} & 58.0 & \cellcolor[RGB]{211,211,211}{32.1} & \cellcolor[RGB]{211,211,211}{14.8} & \cellcolor[RGB]{211,211,211}{14.8} & 57.9 & 78.3 \\
 % Second group of rows (same three datasets, different `random`/`human` baselines).
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{30.8} & 75.8 & \textbf{98.3} & \cellcolor[RGB]{211,211,211}{65.0} & 95.0 & \cellcolor[RGB]{211,211,211}{68.3} & 90.0 & 82.5 & 79.2 & 86.7 & 90.8 & 75.0 & 96.7 \\
 & \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{78.6} & 97.6 & \textbf{98.4} & \cellcolor[RGB]{211,211,211}{44.4} & 93.7 & \cellcolor[RGB]{211,211,211}{27.8} & \cellcolor[RGB]{211,211,211}{59.5} & \cellcolor[RGB]{211,211,211}{60.3} & \cellcolor[RGB]{211,211,211}{44.4} & \cellcolor[RGB]{211,211,211}{30.2} & \cellcolor[RGB]{211,211,211}{57.9} & 78.8 & 100.0 \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{31.5} & \textbf{97.6} & 82.3 & \cellcolor[RGB]{211,211,211}{27.4} & 95.2 & \cellcolor[RGB]{211,211,211}{6.5} & \cellcolor[RGB]{211,211,211}{5.6} & \cellcolor[RGB]{211,211,211}{21.0} & \cellcolor[RGB]{211,211,211}{18.5} & \cellcolor[RGB]{211,211,211}{47.6} & \cellcolor[RGB]{211,211,211}{65.3} & 77.5 & 100.0 \\
