 % Table body fragment: one header row followed by six data rows.
 % Header row: 15 cells -- "initial model", "dataset", then 13 further column labels
 % (ending in "random" and "human").
 % NOTE(review): every data row below supplies only 13 cells (an empty first cell, the
 % dataset label, and 11 values), i.e. two fewer than the header. Confirm which header
 % columns the values are meant to align with, or whether the trailing columns are
 % intentionally left blank.
 initial model & dataset & gemma-7b-it & Llama-2-13b-chat-hf & Llama-2-70b-chat-hf & Mixtral-8x7B-Instruct-v0.1 & Qwen1.5-14B-Chat & Qwen1.5-72B-Chat & gpt-3.5-turbo-0125 & gemini-1.0-pro-001 & claude-3-opus-20240229 & gpt-4-0613 & gpt-4-0125-preview & random & human \\
 % Formatting conventions used in the data rows:
 %   * \scalebox{0.9}[1]{...} (graphics/graphicx) narrows the dataset label to 90% width
 %     while keeping its height unchanged.
 %   * \cellcolor[RGB]{211,211,211}{...} (colortbl) shades a cell light gray; in these rows
 %     it is applied to exactly the negative values (0.0 is left unshaded).
 % First group of three dataset rows; the "initial model" cell is empty in each.
 % NOTE(review): presumably the model label for this group is supplied elsewhere (e.g. a
 % \multirow outside this fragment) -- TODO confirm.
 & \scalebox{0.9}[1]{MathGen} & 4.4 & \cellcolor[RGB]{211,211,211}{-12.5} & 0.1 & 4.1 & 3.5 & \cellcolor[RGB]{211,211,211}{-5.0} & 3.3 & 11.0 & \cellcolor[RGB]{211,211,211}{-11.4} & 2.0 & 6.4 \\
 & \scalebox{0.9}[1]{FgFactV} & 0.9 & 1.8 & 0.1 & 2.1 & \cellcolor[RGB]{211,211,211}{-1.0} & 5.9 & 3.1 & 3.9 & \cellcolor[RGB]{211,211,211}{-10.3} & \cellcolor[RGB]{211,211,211}{-4.2} & 14.9 \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{-1.7} & 0.1 & 0.7 & 8.2 & 0.8 & \cellcolor[RGB]{211,211,211}{-1.6} & 12.9 & \cellcolor[RGB]{211,211,211}{-9.1} & 2.7 & \cellcolor[RGB]{211,211,211}{-10.5} & \cellcolor[RGB]{211,211,211}{-8.1} \\
 % Second group: the same three dataset labels repeated, again with an empty first cell.
 % NOTE(review): looks like a second "initial model" block -- confirm against the full table.
 & \scalebox{0.9}[1]{MathGen} & 11.7 & \cellcolor[RGB]{211,211,211}{-3.0} & 0.7 & 7.7 & \cellcolor[RGB]{211,211,211}{-0.4} & \cellcolor[RGB]{211,211,211}{-2.6} & 3.2 & 12.4 & 2.4 & \cellcolor[RGB]{211,211,211}{-0.2} & 1.2 \\
 & \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{-5.1} & 5.4 & 0.1 & 2.6 & \cellcolor[RGB]{211,211,211}{-14.2} & 1.2 & 1.3 & 10.8 & 1.8 & 4.5 & \cellcolor[RGB]{211,211,211}{-3.5} \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{-7.2} & 1.3 & \cellcolor[RGB]{211,211,211}{-3.2} & \cellcolor[RGB]{211,211,211}{-1.3} & \cellcolor[RGB]{211,211,211}{-1.1} & 15.0 & 5.0 & 11.1 & 0.0 & \cellcolor[RGB]{211,211,211}{-0.5} & \cellcolor[RGB]{211,211,211}{-0.2} \\
