 initial model & dataset & gemma-7b-it & Llama-2-13b-chat-hf & Llama-2-70b-chat-hf & Mixtral-8x7B-Instruct-v0.1 & Qwen1.5-14B-Chat & Qwen1.5-72B-Chat & gpt-3.5-turbo-0125 & gemini-1.0-pro-001 & claude-3-opus-20240229 & gpt-4-0613 & gpt-4-0125-preview & random & human \\
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{46.4} & \cellcolor[RGB]{211,211,211}{51.6} & 55.0 & 63.6 & 56.7 & 67.3 & 63.0 & 57.6 & 86.8 & \textbf{87.3} & 74.6 & 55.0 & 100.0 \\
 & \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{55.0} & 55.9 & 55.7 & 57.9 & 56.0 & 57.1 & 60.4 & 58.3 & 71.0 & \textbf{87.5} & 73.3 & 55.7 & 95.5 \\
 & \scalebox{0.9}[1]{AnsCls} & 60.6 & 58.4 & \cellcolor[RGB]{211,211,211}{57.1} & \cellcolor[RGB]{211,211,211}{56.9} & \cellcolor[RGB]{211,211,211}{57.1} & 70.0 & 58.5 & 71.2 & 65.0 & \textbf{92.3} & \textbf{92.3} & 57.9 & 94.7 \\
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{56.9} & \cellcolor[RGB]{211,211,211}{72.8} & 77.1 & 77.2 & 83.2 & 87.2 & 80.0 & 80.5 & 90.5 & \textbf{92.0} & 87.2 & 75.0 & 100.0 \\
 & \scalebox{0.9}[1]{FgFactV} & 83.2 & 79.4 & \cellcolor[RGB]{211,211,211}{78.5} & 88.9 & 82.5 & 94.6 & \textbf{98.7} & 80.9 & 98.2 & 86.4 & 92.4 & 78.8 & 100.0 \\
 & \scalebox{0.9}[1]{AnsCls} & 79.6 & 78.6 & 81.0 & 79.1 & 78.7 & 80.0 & \cellcolor[RGB]{211,211,211}{70.0} & \cellcolor[RGB]{211,211,211}{72.2} & \textbf{100.0} & 95.2 & 92.0 & 77.5 & 96.7 \\
