 initial model & dataset & gemma-7b-it & Llama-2-13b-chat-hf & Llama-2-70b-chat-hf & Mixtral-8x7B-Instruct-v0.1 & Qwen1.5-14B-Chat & Qwen1.5-72B-Chat & gpt-3.5-turbo-0125 & gemini-1.0-pro-001 & claude-3-opus-20240229 & gpt-4-0613 & gpt-4-0125-preview & random & human \\
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{54.2} & 60.3 & 59.8 & 72.5 & 75.0 & 71.4 & 63.3 & 56.2 & \textbf{95.8} & 89.6 & 80.6 & 55.0 & 100.0 \\
 & \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{53.3} & \cellcolor[RGB]{211,211,211}{49.4} & 56.9 & 64.1 & 56.6 & 65.2 & \cellcolor[RGB]{211,211,211}{55.6} & 57.6 & 65.5 & \textbf{85.7} & 73.3 & 55.7 & 95.5 \\
 & \scalebox{0.9}[1]{AnsCls} & 58.9 & 58.5 & \cellcolor[RGB]{211,211,211}{56.7} & 60.0 & 59.6 & 71.4 & 63.9 & 76.9 & 75.0 & \textbf{92.3} & 91.7 & 57.9 & 94.7 \\
 & \scalebox{0.9}[1]{MathGen} & 76.8 & \cellcolor[RGB]{211,211,211}{70.6} & 85.6 & 84.3 & 91.7 & 93.1 & 83.5 & 93.9 & \textbf{94.3} & 93.6 & 89.1 & 75.0 & 100.0 \\
 & \scalebox{0.9}[1]{FgFactV} & 86.2 & 80.2 & 81.1 & 96.3 & 82.4 & \textbf{100.0} & 96.0 & 84.2 & \textbf{100.0} & 90.0 & 93.2 & 78.8 & 100.0 \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{76.1} & 79.4 & 79.4 & \cellcolor[RGB]{211,211,211}{75.6} & 78.6 & \textbf{100.0} & \textbf{100.0} & 80.0 & \textbf{100.0} & 94.1 & 91.2 & 77.5 & 96.7 \\
