 initial model & dataset & gemma-7b-it & Llama-2-13b-chat-hf & Llama-2-70b-chat-hf & Mixtral-8x7B-Instruct-v0.1 & Qwen1.5-14B-Chat & Qwen1.5-72B-Chat & gpt-3.5-turbo-0125 & gemini-1.0-pro-001 & claude-3-opus-20240229 & gpt-4-0613 & gpt-4-0125-preview & random & human \\
 & \scalebox{0.9}[1]{MathGen} & 57.5 & \cellcolor[RGB]{211,211,211}{55.0} & 67.5 & 72.4 & 66.4 & 76.3 & 69.4 & \cellcolor[RGB]{211,211,211}{43.8} & \textbf{90.8} & 90.3 & 80.9 & 55.0 & 100.0 \\
 & \scalebox{0.9}[1]{FgFactV} & 56.5 & \cellcolor[RGB]{211,211,211}{53.1} & \cellcolor[RGB]{211,211,211}{55.5} & \cellcolor[RGB]{211,211,211}{52.1} & \cellcolor[RGB]{211,211,211}{55.2} & 64.3 & \cellcolor[RGB]{211,211,211}{46.9} & 61.8 & 72.3 & \textbf{88.3} & 80.4 & 55.7 & 95.5 \\
 & \scalebox{0.9}[1]{AnsCls} & 59.6 & 57.9 & 59.4 & 60.4 & 65.0 & \cellcolor[RGB]{211,211,211}{55.7} & 62.2 & 73.7 & 70.4 & 90.5 & \textbf{91.6} & 57.9 & 94.7 \\
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{72.3} & \cellcolor[RGB]{211,211,211}{74.5} & 84.4 & 87.0 & 92.9 & \textbf{93.5} & 82.9 & 87.3 & 91.4 & 93.2 & 90.3 & 75.0 & 100.0 \\
 & \scalebox{0.9}[1]{FgFactV} & 82.1 & 79.8 & 80.5 & 96.3 & 81.5 & \cellcolor[RGB]{211,211,211}{73.6} & 98.7 & 84.7 & \textbf{99.2} & 84.7 & 92.4 & 78.8 & 100.0 \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{77.3} & 78.9 & \cellcolor[RGB]{211,211,211}{73.6} & 83.6 & \cellcolor[RGB]{211,211,211}{72.6} & \cellcolor[RGB]{211,211,211}{70.0} & \cellcolor[RGB]{211,211,211}{67.5} & \cellcolor[RGB]{211,211,211}{67.6} & \textbf{100.0} & 95.3 & 93.8 & 77.5 & 96.7 \\
