 initial model & dataset & gemma-7b-it & Llama-2-13b-chat-hf & Llama-2-70b-chat-hf & Mixtral-8x7B-Instruct-v0.1 & Qwen1.5-14B-Chat & Qwen1.5-72B-Chat & gpt-3.5-turbo-0125 & gemini-1.0-pro-001 & claude-3-opus-20240229 & gpt-4-0613 & gpt-4-0125-preview & random & human \\
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{39.1} & 56.0 & 71.0 & 58.7 & 70.6 & 56.1 & \textbf{76.5} & 64.8 & 57.4 & 72.7 & 69.4 & 55.0 & 90.0 \\
 & \scalebox{0.9}[1]{FgFactV} & 64.6 & 71.0 & \textbf{71.6} & 67.0 & 67.0 & \cellcolor[RGB]{211,211,211}{44.1} & 65.1 & 60.5 & 62.9 & \cellcolor[RGB]{211,211,211}{16.3} & \cellcolor[RGB]{211,211,211}{23.7} & 55.7 & 95.5 \\
 & \scalebox{0.9}[1]{AnsCls} & 65.1 & \textbf{73.4} & 71.0 & \cellcolor[RGB]{211,211,211}{47.5} & 69.6 & \cellcolor[RGB]{211,211,211}{27.7} & \cellcolor[RGB]{211,211,211}{39.3} & 63.9 & \cellcolor[RGB]{211,211,211}{43.0} & \cellcolor[RGB]{211,211,211}{25.5} & \cellcolor[RGB]{211,211,211}{25.5} & 57.9 & 85.7 \\
 & \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{40.0} & \cellcolor[RGB]{211,211,211}{74.3} & 86.4 & \cellcolor[RGB]{211,211,211}{70.6} & 88.7 & 76.6 & 84.7 & 81.5 & 84.4 & \textbf{89.3} & 89.0 & 75.0 & 98.3 \\
 & \scalebox{0.9}[1]{FgFactV} & 80.8 & 87.5 & 87.3 & \cellcolor[RGB]{211,211,211}{59.3} & \textbf{87.7} & \cellcolor[RGB]{211,211,211}{42.9} & \cellcolor[RGB]{211,211,211}{74.3} & \cellcolor[RGB]{211,211,211}{69.1} & \cellcolor[RGB]{211,211,211}{61.2} & \cellcolor[RGB]{211,211,211}{44.7} & \cellcolor[RGB]{211,211,211}{71.2} & 78.8 & 100.0 \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{45.1} & \textbf{87.1} & 81.6 & \cellcolor[RGB]{211,211,211}{40.7} & 86.1 & \cellcolor[RGB]{211,211,211}{11.9} & \cellcolor[RGB]{211,211,211}{10.4} & \cellcolor[RGB]{211,211,211}{32.5} & \cellcolor[RGB]{211,211,211}{31.3} & \cellcolor[RGB]{211,211,211}{63.4} & \cellcolor[RGB]{211,211,211}{76.4} & 77.5 & 98.3 \\
