 % Header row: 15 columns -- "initial model", "dataset", 11 model-name columns, then "random" and "human".
 % Markup used in the rows below:
 %   \scalebox{0.9}[1]{...}            (graphicx) sets the dataset label at 90% width, full height.
 %   \cellcolor[RGB]{211,211,211}{...} (colortbl / xcolor table support) shades the cell light gray.
 %     In every row of this fragment the shaded cells are exactly those whose value is
 %     strictly below that row's "random" column.
 %   \textbf{...} appears once per row, on the first highest value among the 11 model columns
 %     ("random" and "human" are never bolded, even when "human" is higher).
 % NOTE(review): the two conventions above are read off the data in this fragment; confirm against the table caption.
 initial model & dataset & gemma-7b-it & Llama-2-13b-chat-hf & Llama-2-70b-chat-hf & Mixtral-8x7B-Instruct-v0.1 & Qwen1.5-14B-Chat & Qwen1.5-72B-Chat & gpt-3.5-turbo-0125 & gemini-1.0-pro-001 & claude-3-opus-20240229 & gpt-4-0613 & gpt-4-0125-preview & random & human \\
 % First group of three dataset rows (MathGen, FgFactV, AnsCls). The "initial model" cell is empty in each row.
 % NOTE(review): presumably a group label (e.g. a \multirow) belongs in the first column; it is not present in this fragment -- TODO confirm.
 & \scalebox{0.9}[1]{MathGen} & 75.0 & \cellcolor[RGB]{211,211,211}{50.0} & \textbf{100.0} & 80.0 & 71.4 & 100.0 & 76.9 & \cellcolor[RGB]{211,211,211}{0.0} & 100.0 & 93.8 & 86.4 & 55.0 & 100.0 \\
 & \scalebox{0.9}[1]{FgFactV} & 60.0 & \cellcolor[RGB]{211,211,211}{51.6} & \cellcolor[RGB]{211,211,211}{53.0} & \cellcolor[RGB]{211,211,211}{31.2} & \cellcolor[RGB]{211,211,211}{52.8} & 75.0 & \cellcolor[RGB]{211,211,211}{0.0} & 72.2 & 84.2 & \textbf{100.0} & 75.0 & 55.7 & 95.5 \\
 & \scalebox{0.9}[1]{AnsCls} & 61.4 & \cellcolor[RGB]{211,211,211}{55.6} & 66.1 & 66.7 & 76.5 & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{50.0} & 81.8 & 71.4 & 87.5 & \textbf{91.7} & 57.9 & 94.7 \\
 % Second group: the same three datasets repeated, again with an empty "initial model" cell.
 % NOTE(review): presumably a different initial model than the group above; the label is not visible here -- TODO confirm.
 & \scalebox{0.9}[1]{MathGen} & 75.0 & 77.8 & \textbf{100.0} & 95.2 & 100.0 & 100.0 & 87.0 & 100.0 & 94.9 & 93.8 & 92.7 & 75.0 & 100.0 \\
 & \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{78.7} & 79.4 & 83.0 & \textbf{100.0} & 81.0 & \cellcolor[RGB]{211,211,211}{0.0} & 100.0 & 90.9 & 100.0 & \cellcolor[RGB]{211,211,211}{76.5} & 89.7 & 78.8 & 100.0 \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{72.7} & 80.5 & \cellcolor[RGB]{211,211,211}{50.0} & \textbf{100.0} & \cellcolor[RGB]{211,211,211}{66.7} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{50.0} & 100.0 & 96.8 & 95.9 & 77.5 & 96.7 \\
