                 initial model &                        dataset &                    gemma-7b-it &            Llama-2-13b-chat-hf &            Llama-2-70b-chat-hf &     Mixtral-8x7B-Instruct-v0.1 &               Qwen1.5-14B-Chat &               Qwen1.5-72B-Chat &             gpt-3.5-turbo-0125 &             gemini-1.0-pro-001 &         claude-3-opus-20240229 &                     gpt-4-0613 &             gpt-4-0125-preview &                         random &                          human  \\
                               &     \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{44.3} & \cellcolor[RGB]{211,211,211}{51.6} &                           55.4 & \cellcolor[RGB]{211,211,211}{46.9} & \cellcolor[RGB]{211,211,211}{48.7} & \cellcolor[RGB]{211,211,211}{31.9} &                           62.7 & \cellcolor[RGB]{211,211,211}{37.8} & \cellcolor[RGB]{211,211,211}{51.7} &                           65.1 &                  \textbf{69.2} &                           55.0 &                           90.0  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                           58.5 &                           59.5 &                  \textbf{65.4} & \cellcolor[RGB]{211,211,211}{42.9} & \cellcolor[RGB]{211,211,211}{55.7} & \cellcolor[RGB]{211,211,211}{25.1} & \cellcolor[RGB]{211,211,211}{41.2} & \cellcolor[RGB]{211,211,211}{45.1} & \cellcolor[RGB]{211,211,211}{48.7} & \cellcolor[RGB]{211,211,211}{12.4} & \cellcolor[RGB]{211,211,211}{19.8} &                           55.7 &                           95.5  \\
                               &      \scalebox{0.9}[1]{AnsCls} &                           57.9 &                  \textbf{66.5} &                           65.9 & \cellcolor[RGB]{211,211,211}{39.3} & \cellcolor[RGB]{211,211,211}{52.7} & \cellcolor[RGB]{211,211,211}{16.5} & \cellcolor[RGB]{211,211,211}{26.9} & \cellcolor[RGB]{211,211,211}{39.6} & \cellcolor[RGB]{211,211,211}{38.2} & \cellcolor[RGB]{211,211,211}{21.6} & \cellcolor[RGB]{211,211,211}{23.6} &                           57.9 &                           85.7  \\
                               &     \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{51.9} & \cellcolor[RGB]{211,211,211}{55.2} & \cellcolor[RGB]{211,211,211}{66.8} & \cellcolor[RGB]{211,211,211}{56.7} & \cellcolor[RGB]{211,211,211}{54.4} & \cellcolor[RGB]{211,211,211}{51.6} & \cellcolor[RGB]{211,211,211}{71.7} & \cellcolor[RGB]{211,211,211}{52.2} &                           81.6 &                           87.6 &                  \textbf{89.0} &                           75.0 &                           98.3  \\
                               &     \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{68.5} & \cellcolor[RGB]{211,211,211}{77.7} &                  \textbf{80.6} & \cellcolor[RGB]{211,211,211}{35.5} & \cellcolor[RGB]{211,211,211}{63.8} & \cellcolor[RGB]{211,211,211}{18.4} & \cellcolor[RGB]{211,211,211}{34.3} & \cellcolor[RGB]{211,211,211}{41.2} & \cellcolor[RGB]{211,211,211}{45.2} & \cellcolor[RGB]{211,211,211}{38.1} & \cellcolor[RGB]{211,211,211}{67.8} &                           78.8 &                          100.0  \\
                               &      \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{35.6} &                  \textbf{76.5} & \cellcolor[RGB]{211,211,211}{50.2} & \cellcolor[RGB]{211,211,211}{30.8} & \cellcolor[RGB]{211,211,211}{43.7} & \cellcolor[RGB]{211,211,211}{4.6} & \cellcolor[RGB]{211,211,211}{3.8} & \cellcolor[RGB]{211,211,211}{15.6} & \cellcolor[RGB]{211,211,211}{26.0} & \cellcolor[RGB]{211,211,211}{64.1} & \cellcolor[RGB]{211,211,211}{75.7} &                           77.5 &                           98.3  \\
