 % Table body: one header row followed by six data rows, 15 cells per row.
 % Columns: initial model | dataset | eleven model columns | random | human.
 % Uses \scalebox (graphics/graphicx) and \cellcolor (colortbl) -- both must be loaded by the preamble.
 % Formatting conventions as they appear in the rows below:
 %   * \textbf{...} marks the highest value among the eleven model columns of that row.
 %   * \cellcolor[RGB]{211,211,211} (light gray) marks model cells whose value is strictly below
 %     the `random` value of the same row; a value equal to `random` (62.5 in the fourth data row)
 %     is left unshaded.
 % NOTE(review): the `initial model` cell is empty in every row here. The rows look like two groups
 % of three datasets (MathGen, FgFactV, AnsCls), presumably one group per initial model -- confirm
 % and restore the labels (e.g. via \multirow) if they were dropped.
 initial model & dataset & gemma-7b-it & Llama-2-13b-chat-hf & Llama-2-70b-chat-hf & Mixtral-8x7B-Instruct-v0.1 & Qwen1.5-14B-Chat & Qwen1.5-72B-Chat & gpt-3.5-turbo-0125 & gemini-1.0-pro-001 & claude-3-opus-20240229 & gpt-4-0613 & gpt-4-0125-preview & random & human \\
 % --- first group of three datasets ---
 & \scalebox{0.9}[1]{MathGen} & 52.1 & 57.9 & 55.0 & 53.6 & \cellcolor[RGB]{211,211,211}{48.6} & 52.1 & 65.7 & 60.0 & 69.3 & 69.3 & \textbf{72.9} & 50.5 & 88.2 \\
 & \scalebox{0.9}[1]{FgFactV} & 54.3 & 53.6 & 56.4 & \cellcolor[RGB]{211,211,211}{45.0} & \cellcolor[RGB]{211,211,211}{50.0} & \cellcolor[RGB]{211,211,211}{46.4} & 57.1 & 54.3 & \textbf{62.1} & \cellcolor[RGB]{211,211,211}{46.4} & 52.1 & 50.7 & 94.3 \\
 & \scalebox{0.9}[1]{AnsCls} & 52.1 & \textbf{60.0} & 56.4 & \cellcolor[RGB]{211,211,211}{45.0} & 52.9 & \cellcolor[RGB]{211,211,211}{49.3} & \cellcolor[RGB]{211,211,211}{48.6} & 54.3 & 53.6 & \cellcolor[RGB]{211,211,211}{47.9} & \cellcolor[RGB]{211,211,211}{48.6} & 51.2 & 82.9 \\
 % --- second group of three datasets (same dataset order) ---
 & \scalebox{0.9}[1]{MathGen} & 75.6 & 68.1 & 74.4 & \cellcolor[RGB]{211,211,211}{51.2} & \cellcolor[RGB]{211,211,211}{43.8} & \cellcolor[RGB]{211,211,211}{58.8} & 62.5 & \cellcolor[RGB]{211,211,211}{61.9} & 80.0 & 83.1 & \textbf{85.6} & 62.5 & 97.1 \\
 & \scalebox{0.9}[1]{FgFactV} & 68.8 & 74.4 & \textbf{75.0} & \cellcolor[RGB]{211,211,211}{28.7} & \cellcolor[RGB]{211,211,211}{47.5} & \cellcolor[RGB]{211,211,211}{26.2} & \cellcolor[RGB]{211,211,211}{35.0} & \cellcolor[RGB]{211,211,211}{50.0} & 68.1 & \cellcolor[RGB]{211,211,211}{37.5} & \cellcolor[RGB]{211,211,211}{59.4} & 66.5 & 100.0 \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{40.6} & \textbf{74.4} & 72.5 & \cellcolor[RGB]{211,211,211}{36.9} & \cellcolor[RGB]{211,211,211}{23.8} & \cellcolor[RGB]{211,211,211}{24.4} & \cellcolor[RGB]{211,211,211}{23.1} & \cellcolor[RGB]{211,211,211}{27.5} & \cellcolor[RGB]{211,211,211}{38.1} & \cellcolor[RGB]{211,211,211}{55.6} & 66.2 & 65.1 & 97.1 \\
