 initial model & dataset & gemma-7b-it & Llama-2-13b-chat-hf & Llama-2-70b-chat-hf & Mixtral-8x7B-Instruct-v0.1 & Qwen1.5-14B-Chat & Qwen1.5-72B-Chat & gpt-3.5-turbo-0125 & gemini-1.0-pro-001 & claude-3-opus-20240229 & gpt-4-0613 & gpt-4-0125-preview & random & human \\
 & \scalebox{0.9}[1]{MathGen} & 25.7 & \cellcolor[RGB]{211,211,211}{-2.9} & 0.0 & 5.7 & \cellcolor[RGB]{211,211,211}{-0.7} & 2.1 & 2.1 & \cellcolor[RGB]{211,211,211}{-1.4} & 2.1 & 2.1 & 2.1 \\
 & \scalebox{0.9}[1]{FgFactV} & 5.7 & 1.4 & 0.0 & 7.9 & \cellcolor[RGB]{211,211,211}{-3.6} & 7.9 & \cellcolor[RGB]{211,211,211}{-1.4} & \cellcolor[RGB]{211,211,211}{-2.1} & \cellcolor[RGB]{211,211,211}{-14.3} & \cellcolor[RGB]{211,211,211}{-1.4} & 4.3 \\
 & \scalebox{0.9}[1]{AnsCls} & 0.7 & \cellcolor[RGB]{211,211,211}{-1.4} & 2.1 & 1.4 & \cellcolor[RGB]{211,211,211}{-3.6} & \cellcolor[RGB]{211,211,211}{-0.7} & 1.4 & \cellcolor[RGB]{211,211,211}{-15.0} & \cellcolor[RGB]{211,211,211}{-0.7} & \cellcolor[RGB]{211,211,211}{-2.9} & 1.4 \\
 & \scalebox{0.9}[1]{MathGen} & 19.4 & \cellcolor[RGB]{211,211,211}{-20.6} & 1.2 & 5.6 & \cellcolor[RGB]{211,211,211}{-8.8} & 1.2 & \cellcolor[RGB]{211,211,211}{-3.7} & \cellcolor[RGB]{211,211,211}{-16.9} & 6.9 & \cellcolor[RGB]{211,211,211}{-1.9} & 0.0 \\
 & \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{-1.3} & \cellcolor[RGB]{211,211,211}{-3.1} & 0.6 & 11.9 & \cellcolor[RGB]{211,211,211}{-49.4} & 6.9 & \cellcolor[RGB]{211,211,211}{-28.8} & \cellcolor[RGB]{211,211,211}{-11.2} & 5.6 & \cellcolor[RGB]{211,211,211}{-3.1} & \cellcolor[RGB]{211,211,211}{-6.9} \\
 & \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{-1.3} & \cellcolor[RGB]{211,211,211}{-10.6} & 5.6 & \cellcolor[RGB]{211,211,211}{-10.0} & \cellcolor[RGB]{211,211,211}{-37.5} & 7.5 & \cellcolor[RGB]{211,211,211}{-1.3} & \cellcolor[RGB]{211,211,211}{-5.0} & \cellcolor[RGB]{211,211,211}{-3.8} & \cellcolor[RGB]{211,211,211}{-3.7} & \cellcolor[RGB]{211,211,211}{-1.2} \\
