\documentclass[../midl25_191.tex]{subfiles}
\begin{document}
\label{sec:results}
\begin{table}[htbp]
\caption{Comparative analysis of VA prediction models.}
\label{tab:datasets_models_performance}
\centering
\renewcommand{\arraystretch}{1.2}
\resizebox{\textwidth}{!}{%
\begin{tabular}{l|ccc|ccc}
\hline
\multirow{2}{*}{\textbf{Model}} & \multicolumn{3}{c|}{\textbf{DIME Dataset}} & \multicolumn{3}{c}{\textbf{INDEX Dataset}} \\
\cline{2-7}
& \textbf{MAE}$\downarrow$ & \textbf{RMSE}$\downarrow$ & $\mathbf{R}^2$$\uparrow$ & \textbf{MAE}$\downarrow$ & \textbf{RMSE}$\downarrow$ & $\mathbf{R}^2$$\uparrow$ \\
\hline
\multicolumn{7}{l}{\textit{Clinical Data Models}} \\
Linear Regression & 4.73 ± 0.98 & 6.05 ± 1.37 & 0.55 ± 0.17 & 6.89 ± 3.99 & 7.67 ± 3.90 & 0.25 ± 0.49 \\
Random Forest & 5.17 ± 1.41 & 7.58 ± 2.29 & 0.25 ± 0.51 & 6.09 ± 3.51 & 6.84 ± 4.09 & 0.33 ± 0.52 \\
Neural Network & 7.78 ± 1.51 & 9.82 ± 2.69 & $-$0.21 ± 0.41 & 8.18 ± 1.77 & 10.35 ± 4.25 & $-$0.31 ± 0.48 \\
\hline
\multicolumn{7}{l}{\textit{OCT Image Models}} \\
EfficientNet-B0 & 5.29 ± 1.53 & 6.26 ± 1.74 & 0.53 ± 0.17 & 6.87 ± 1.98 & 8.34 ± 2.26 & 0.14 ± 0.22 \\
ResNet-50 & 6.22 ± 2.29 & 7.35 ± 2.52 & 0.33 ± 0.29 & 6.27 ± 1.72 & 7.80 ± 1.33 & 0.21 ± 0.25 \\
\hline
\multicolumn{7}{l}{\textit{Multimodal Models}} \\
\textbf{Proposed Model} & \textbf{3.07 ± 0.82} & \textbf{4.03 ± 1.12} & \textbf{0.77 ± 0.16} & \textbf{4.20 ± 2.79} & \textbf{4.87 ± 3.52} & \textbf{0.61 ± 0.36} \\
\hline
\multicolumn{7}{l}{\textit{Reference Models (External Datasets)}} \\
~~~~Ensemble ML~\cite{liu2021automatic}* & 6.5 & 10.0 & 0.68 & — & — & — \\
~~~~OCT-based DL~\cite{wen2023deep}† & 3.5 & 5.5 & 0.80 & — & — & — \\
\hline
\end{tabular}%
}
\vspace{1mm}
\begin{minipage}{\linewidth}
\footnotesize
Note: Values presented as mean ± standard deviation across 5-fold cross-validation. $\uparrow$ indicates higher is better, $\downarrow$ indicates lower is better.\\
All error metrics are in ETDRS letters. *Tested on GDPH/ZHSMU dataset. †Tested on iERM dataset.
\end{minipage}
\end{table}

Our proposed multimodal approach outperformed single-modality baselines using either clinical features alone (linear regression, random forest, and a neural network with the same architecture as the clinical network model but with a regression head) or OCT images alone (ResNet-50 and EfficientNet-B0). It achieved superior performance across all metrics (Table~\ref{tab:datasets_models_performance}), with an MAE of $3.07 \pm 0.82$ ETDRS letters ($R^2$: $0.77 \pm 0.16$) on DIME and $4.20 \pm 2.79$ ETDRS letters ($R^2$: $0.61 \pm 0.36$) on INDEX, significantly outperforming both clinical-only models (best MAE: 4.73 on DIME, 6.09 on INDEX) and imaging-only models (best MAE: 5.29 on DIME, 6.27 on INDEX) in both treatment-naïve and chronic DME cases. The proposed model also demonstrated better error metrics than the reference model from Liu et al.~\cite{liu2021automatic} and performance comparable to that of Wen et al.~\cite{wen2023deep}, though these were evaluated on different datasets, so direct comparison should be made cautiously. Detailed per-fold performance metrics are provided in Appendix Table~\ref{tab:per_fold_performance}.

\begin{figure}[!b]
    \centering
    \setlength{\tabcolsep}{0pt}  
    \begin{tabular}{@{}c@{}}  %
        % DIME grid - force total width with minipage
        \begin{minipage}{0.96\textwidth} 
        \begin{tabular}{@{}c@{}c@{}c@{}c@{}c@{}c@{}}
            \multicolumn{6}{@{}c}{\small\textbf{DIME}} \\[0.1mm]
            \includegraphics[width=0.16\textwidth]{images/gradcam/DIME/slice_022_gradcam_overlay.png}&
            \includegraphics[width=0.16\textwidth]{images/gradcam/DIME/slice_029_gradcam_overlay.png}&
            \includegraphics[width=0.16\textwidth]{images/gradcam/DIME/slice_032_gradcam_overlay.png}&
            \includegraphics[width=0.16\textwidth]{images/gradcam/DIME/slice_039_gradcam_overlay.png}&
            \includegraphics[width=0.16\textwidth]{images/gradcam/DIME/slice_072_gradcam_overlay.png}&
            \includegraphics[width=0.16\textwidth]{images/gradcam/DIME/slice_074_gradcam_overlay.png}\\[-0.5mm]
            \includegraphics[width=0.16\textwidth]{images/gradcam/DIME/slice_078_gradcam_overlay.png}&
            \includegraphics[width=0.16\textwidth]{images/gradcam/DIME/slice_080_gradcam_overlay.png}&
            \includegraphics[width=0.16\textwidth]{images/gradcam/DIME/slice_114_gradcam_overlay.png}&
            \includegraphics[width=0.16\textwidth]{images/gradcam/DIME/slice_122_gradcam_overlay.png}&
            \includegraphics[width=0.16\textwidth]{images/gradcam/DIME/slice_126_gradcam_overlay.png}&
            \includegraphics[width=0.16\textwidth]{images/gradcam/DIME/slice_127_gradcam_overlay.png}
        \end{tabular}
        \end{minipage}
        \\[2mm]
        % INDEX grid and colorbar
        \begin{minipage}{0.96\textwidth}  
        \begin{tabular}{@{}c@{\hspace{2mm}}c@{}}
            \begin{tabular}{@{}c@{}c@{}c@{}c@{}c@{}c@{}}
                \multicolumn{6}{@{}c}{\small\textbf{INDEX}} \\[0.1mm]
                \includegraphics[width=0.16\textwidth]{images/gradcam/INDEX/slice_38_gradcam_overlay.png}&
                \includegraphics[width=0.16\textwidth]{images/gradcam/INDEX/slice_22_gradcam_overlay.png}&
                \includegraphics[width=0.16\textwidth]{images/gradcam/INDEX/slice_30_gradcam_overlay.png}&
                \includegraphics[width=0.16\textwidth]{images/gradcam/INDEX/slice_2_gradcam_overlay.png}&
                \includegraphics[width=0.16\textwidth]{images/gradcam/INDEX/slice_29_gradcam_overlay.png}&
                \includegraphics[width=0.16\textwidth]{images/gradcam/INDEX/slice_37_gradcam_overlay.png}\\[-0.8mm]
                \includegraphics[width=0.16\textwidth]{images/gradcam/INDEX/slice_47_gradcam_overlay.png}&
                \includegraphics[width=0.16\textwidth]{images/gradcam/INDEX/slice_35_gradcam_overlay.png}&
                \includegraphics[width=0.16\textwidth]{images/gradcam/INDEX/slice_44_gradcam_overlay.png}&
                \includegraphics[width=0.16\textwidth]{images/gradcam/INDEX/slice_45_gradcam_overlay.png}&
                \includegraphics[width=0.16\textwidth]{images/gradcam/INDEX/slice_48_gradcam_overlay.png}&
                \includegraphics[width=0.16\textwidth]{images/gradcam/INDEX/slice_81_gradcam_overlay.png}
            \end{tabular}
            & 
            \raisebox{-\height}{\includegraphics[height=6em]{images/gradcam/colorbar_512x51.pdf}}
        \end{tabular}
        \end{minipage}
    \end{tabular}
    \caption{Guided Grad-CAM activation maps for OCT images from the DIME (top) and INDEX (bottom) datasets, highlighting regions of interest identified by the model. The color bar indicates activation magnitude from 0.0 (blue) to 1.0 (red). Images were selected randomly to illustrate typical model interpretations across the datasets.}
    \label{fig:gradcam_comparison}
\end{figure}

Interpretability analysis through Grad-CAM revealed distinct spatial attention distributions (Figure~\ref{fig:gradcam_comparison}). Treatment-naïve cases (DIME) demonstrated focal activation patterns localizing to regions of intraretinal fluid (IRF), while chronic cases (INDEX) exhibited broader attention distribution across areas of edema and structural alteration, consistent with established pathological progression patterns~\cite{sakini2024diabetic}.

\begin{figure}[t]
   \centering
   \subfigure[DIME]{\includegraphics[width=0.35\columnwidth]{images/error_analysis_plot_DIME.pdf}\label{fig:error_analysis_dime}}
   \hspace{0.9cm}
   \subfigure[INDEX]{\includegraphics[width=0.35\columnwidth]{images/error_analysis_plot_INDEX.pdf}\label{fig:error_analysis_index}}
   \caption{Error analysis by post-treatment VA range. Blue bars show systematic bias (direction of error); orange bars show MAE. Patient counts ($n$) are displayed below each group. Positive values indicate VA overestimation; negative values show underestimation.}
   \label{fig:error_analysis}
\end{figure}

Systematic error patterns varied across VA ranges (Figure~\ref{fig:error_analysis}a,b). In DIME (Figure~\ref{fig:error_analysis_dime}), low VA cases ($<71$ ETDRS letters, $n=10$) showed over-prediction ($2.24 \pm 3.61$ ETDRS letters), while high VA cases ($>82$ ETDRS letters, $n=11$) showed under-prediction ($-2.06 \pm 3.07$ ETDRS letters). In INDEX (Figure~\ref{fig:error_analysis_index}), low VA predictions ($n=12$) were balanced ($1.66 \pm 5.56$ ETDRS letters), while higher VA ranges showed under-prediction bias ($n=6$, $-1.39 \pm 1.34$ ETDRS letters; $n=2$, $-11.86 \pm 2.74$ ETDRS letters).

Feature importance analysis through Integrated Gradients quantified the relative contributions of imaging and clinical features on the DIME dataset (Figure~\ref{fig:attributions}). OCT features provided the strongest predictive signal ($\text{mean}=8.07 \pm 4.21$), complemented by baseline VA as the primary clinical indicator ($\text{mean}=5.78 \pm 1.95$).

\begin{figure}[t]
    \centering
    \includegraphics[width=.55\columnwidth]{images/mean_attributions_bar_plot.pdf}
    \caption{Mean feature attribution magnitudes across five-fold cross-validation, showing the relative importance of imaging and clinical predictors on the DIME dataset. OCT volume measurements show the highest variability, suggesting dataset-specific learning patterns.}
    \label{fig:attributions}
\end{figure}

\begin{table}[htbp]
\caption{Ablation study of model components.}
\label{tab:ablation_study}
\centering
\renewcommand{\arraystretch}{1.2}
\resizebox{\textwidth}{!}{%
\begin{tabular}{l|ccc|ccc}
\hline
\multirow{2}{*}{\textbf{Model Configuration}} & \multicolumn{3}{c|}{\textbf{DIME Dataset}} & \multicolumn{3}{c}{\textbf{INDEX Dataset}} \\
\cline{2-7}
& \textbf{MAE}$\downarrow$ & \textbf{RMSE}$\downarrow$ & $\mathbf{R}^2$$\uparrow$ & \textbf{MAE}$\downarrow$ & \textbf{RMSE}$\downarrow$ & $\mathbf{R}^2$$\uparrow$ \\
\hline
\textbf{Proposed Model} & 3.07 ± 0.82 & 4.03 ± 1.12 & 0.77 ± 0.16 & 4.20 ± 2.79 & 4.87 ± 3.52 & 0.61 ± 0.36 \\
\hline
\multicolumn{7}{l}{\textit{Clinical Features}} \\
All Clinical Features Variation (11/18) & 4.85 ± 1.67 & 6.71 ± 2.26 & 0.42 ± 0.31 & 6.56 ± 2.97 & 8.57 ± 4.85 & 0.09 ± 0.59 \\
Replace Baseline VA with: & & & & & & \\
~~~- IRF Cysts & 5.45 ± 1.18 & 7.14 ± 1.83 & 0.35 ± 0.20 & — & — & — \\
~~~- Baseline IOP (mmHg) & — & — & — & 7.54 ± 1.95 & 9.02 ± 4.32 & 0.05 ± 0.15 \\
\hline
\multicolumn{7}{l}{\textit{Model Architecture Variation}} \\
Without Attention Mechanism & 3.84 ± 1.13 & 5.06 ± 1.46 & 0.66 ± 0.19 & 4.57 ± 2.31 & 6.13 ± 4.43 & 0.59 ± 0.30 \\
\hline
\end{tabular}%
}
\vspace{1mm}
\begin{minipage}{\linewidth}
\footnotesize
Note: In the ``All Clinical Features'' configuration, DIME provides 11 clinical features and INDEX provides 18. IOP = Intraocular Pressure.
\end{minipage}
\end{table}

To further assess the contribution of individual components in our multimodal framework, we conducted an ablation study (Table~\ref{tab:ablation_study}). When using all available clinical features (11 for DIME, 18 for INDEX) rather than our selected subset, model performance degraded significantly (MAE increased by 1.78 and 2.36 ETDRS letters for DIME and INDEX, respectively), demonstrating the effectiveness of our feature selection approach in mitigating overfitting on small datasets.

Replacing baseline VA with alternative features (IRF cysts for DIME; baseline IOP for INDEX) resulted in substantial performance deterioration, confirming baseline VA as a critical predictor. Additionally, removing the cross-modal attention mechanism increased prediction error by 25\% for DIME and 9\% for INDEX, highlighting the importance of modality-specific feature weighting in our architecture. These findings validate our design choices for both feature selection and architectural components.

\end{document}
