% Main results table (label: main_2): per-model compilation statistics and
% similarity scores, grouped by training/prompting method.
% Requires tabularx (X columns), booktabs (\toprule, \midrule, \cmidrule,
% \bottomrule) and multirow (\multirow).
\begin{table*}[t]
\centering
% \setlength{\tabcolsep}{4pt} % Adjust column spacing
% \renewcommand{\arraystretch}{1.2} % Slightly increase row height
% Column layout: one left-aligned model column, five centred X columns for the
% Compilation group, then five centred X columns (TSS, GS, CS, FFS, Total),
% each preceded by a vertical rule.
\begin{tabularx}{\textwidth}{@{}l | *{5}{>{\centering\arraybackslash}X} *{5}{| >{\centering\arraybackslash}X} @{}}
  \toprule
  % Two-row header: the single-column headings span both rows via \multirow.
  \multirow{2}{*}{\textbf{Model}}
    & \multicolumn{5}{c|}{\textbf{Compilation}}
    & \multirow{2}{*}{\textbf{TSS}}
    & \multirow{2}{*}{\textbf{GS}}
    & \multirow{2}{*}{\textbf{CS}}
    & \multirow{2}{*}{\textbf{FFS}}
    & \multirow{2}{*}{\textbf{Total}} \\
  \cmidrule(lr){2-6} % rule under the five Compilation sub-columns (columns 2--6)
  & \textbf{CSE} & \textbf{GIF} & \textbf{PSI} & \textbf{AFS} & \textbf{CPR} & & & & & \\
  \midrule
  \multicolumn{11}{c}{\textit{In-context learning}} \\
  \midrule
  Qwen2.5-VL-32B & 78.32 & 42.09 & 38.11 & 34.52 & 10.18 & 35.04 & 28.51 & 30.93 & 26.26 & 30.19 \\
  Qwen2.5-VL-72B & 80.85 & 44.51 & 40.93 & 37.01 & 14.55 & 37.11 & 31.65 & 33.08 & 28.90 & 32.68 \\
  InternVL2.5-78B & 74.12 & 42.17 & 36.01 & 33.91 & 12.84 & 35.04 & 30.67 & 31.95 & 29.04 & 31.68 \\
  Claude3.5-Sonnet & 87.36 & 57.94 & 50.12 & 41.62 & 20.73 & 44.02 & 38.99 & 39.21 & 36.87 & 39.77 \\
  GPT-4o & 95.03 & 61.13 & 48.28 & 45.25 & 28.56 & 50.06 & 40.57 & 41.58 & 39.06 & 42.82 \\
  Gemini2.5-Flash & 83.60 & 50.24 & 46.89 & 40.77 & 18.93 & 42.61 & 40.86 & 37.13 & 36.91 & 39.38 \\
  % NOTE(review): in every other row Total equals the mean of TSS/GS/CS/FFS to
  % within 0.01, but here that mean is about 43.80 while Total reads 42.80.
  % Possible typo in one of these cells -- confirm against the raw results.
  Gemini2.5-Pro & 94.47 & 60.06 & 53.41 & 46.01 & 30.03 & 51.51 & 43.71 & 42.68 & 37.28 & 42.80 \\
  \midrule
  \multicolumn{11}{c}{\textit{Environmental learning}} \\
  \midrule
  % Bold marks the best value of each column within this group.
  Qwen2.5-VL-32B & 88.87 & 70.13 & 65.02 & 63.92 & 39.08 & 45.61 & 33.51 & 38.28 & 29.51 & 36.72 \\
  Qwen2.5-VL-72B & 90.58 & 77.90 & 68.91 & 66.92 & 43.81 & 48.72 & 38.03 & 40.86 & 34.74 & 40.58 \\
  InternVL2.5-78B & 85.23 & 68.92 & 65.81 & 60.01 & 38.34 & 48.91 & 35.37 & 36.72 & 35.02 & 39.00 \\
  Claude3.5-Sonnet & 98.05 & 85.89 & 81.74 & 78.52 & 52.90 & 55.82 & 50.21 & 52.15 & 43.66 & 50.46 \\
  GPT-4o & \textbf{100} & \textbf{92.55} & 88.25 & 82.56 & \textbf{66.92} & 58.29 & 51.52 & 54.81 & \textbf{46.07} & 52.67 \\
  Gemini2.5-Flash & 92.51 & 82.81 & 80.03 & 79.93 & 51.94 & 53.01 & 48.86 & 50.95 & 44.91 & 49.43 \\
  Gemini2.5-Pro & \textbf{100} & 90.74 & \textbf{92.57} & \textbf{84.27} & 65.89 & \textbf{60.18} & \textbf{52.23} & \textbf{56.99} & 45.24 & \textbf{53.66} \\
  \midrule
  \multicolumn{11}{c}{\textit{Reinforcement learning}} \\
  \midrule
  Qwen2.5-VL-32B & 91.03 & 72.84 & 70.42 & 68.92 & 45.17 & 49.55 & 39.91 & 42.78 & 38.07 & 42.57 \\
  \bottomrule
\end{tabularx}
\caption{Results of different MLLMs and methods on the code generation task. The Compilation columns report the probability that each of the four compilation errors~(\ref{cp}) does not occur, together with the overall compilation pass rate (CPR). When compilation succeeds, the similarity along four dimensions~(\ref{code_eval}) and the total score are computed. Scores are scaled to $[0, 100]$ for ease of presentation.}
\label{main_2}
\end{table*}