\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{colortbl}
\usepackage[table]{xcolor} % For row colors
\usepackage{booktabs}      % For better table rules
\usepackage{threeparttable} % For footnotes

\jmlryear{2025}\jmlrworkshop{Full Paper -- MIDL 2025}\jmlrvolume{-- nnn}\editors{Accepted for publication at MIDL 2025}

\title[Predicting the Year of Total Knee Replacement]{Predicting the Year of Total Knee Replacement: A Transformer-Based Multimodal Approach}

\midlauthor{\Name{Ozkan Cigdem\nametag{$^{1}$}} \Email{ozkan.cigdem@nyulangone.org}\\
\addr $^{1}$ Department of Radiology, New York University Langone Health, New York, USA \AND
\Name{Refik Soyak\nametag{$^{2}$}} \Email{refik.soyak@fau.de}\\
\addr $^{2}$ Artificial Intelligence, Friedrich-Alexander-Universität Erlangen-Nürnberg, Erlangen, Germany \AND
\Name{Kyunghyun Cho\nametag{$^{3}$}} \Email{kyunghyun.cho@nyu.edu}\\
\addr $^{3}$ Center for Data Science, New York University, New York, USA \AND
\Name{Cem M. Deniz\nametag{$^{1}$}} \Email{cem.deniz@nyulangone.org}
}

\begin{document}
\maketitle
\begin{abstract}
Accurate prediction of the year of total knee replacement (TKR) is challenging due to the complex interplay of factors influencing the surgical decision. Current deep learning models often rely on single-modality data, limiting their predictive power. Multimodal approaches integrating imaging and patient data offer the potential to improve predictions and support clinical decisions. This study presents an end-to-end trained, transformer-based multimodal model that integrates MR imaging with tabular data, including clinical variables and image readings, to predict the year of TKR for each subject. Our model leverages cross-modal attention to fuse features from an image encoder with a self-supervised pretrained tabular encoder, achieving the highest accuracy of 63.4\% among tested models. We evaluated its performance against three unimodal models and four multimodal fusion strategies, including simple concatenation, DAFT, and multimodal interaction. The results demonstrate that our model's cross-modal interaction approach with pretrained TabNet not only outperformed all unimodal models but also showed improvements over other multimodal fusion techniques, highlighting the effectiveness of cross-modal attention fusion for integrating complex data modalities in TKR year prediction tasks.
Source code is available at \url{https://github.com/denizlab/2025\_MIDL\_time2TKR}.
\end{abstract}

\begin{keywords}
Multimodal Learning, Year of TKR Prediction, Deep Learning, Knee Osteoarthritis
\end{keywords}
%
\section{Introduction}
\label{sec:intro}
Osteoarthritis (OA), a prevalent joint disorder, often leads to physical disability and affects global health \cite{Kellgren1957}. Knee osteoarthritis (KOA), the most common form, impacts millions worldwide, causing pain and mobility issues \cite{Kellgren1957}. It affects about 10\% of men and 13\% of women over 60. While there is no cure for reversing the course of KOA, total knee replacement (TKR) surgery becomes necessary in the advanced stages. Estimating the year of TKR is crucial for identifying high-risk patients and informing timely treatment decisions. However, predicting the year of TKR is complex, influenced not only by disease progression but also by individual factors like patient preferences, financial constraints, comorbidities, and overall health \cite{Cigdem2023Review}. This variability makes accurate prediction challenging, underscoring the need for advanced predictive tools.

\section{Related Work}
In recent years, deep learning (DL) models have advanced KOA severity assessment across various imaging modalities \cite{Rajamohan2023, KNOAP2020_Challenge, Tolpadi2020, Panfilov2022_Progression, Mahmoud2023}. MR imaging, for example, effectively detects key structural features of knee degeneration like cartilage defects, osteophytes, joint effusion, and bone marrow edema \cite{Rajamohan2023, Cigdem2023Review}. Most studies in the literature focus on predicting OA progression \cite{Rajamohan2023, Panfilov2022_Progression, Leung2020, Panfilov2023-gl}, which is typically formulated as a binary classification task. Others aim to estimate OA severity \cite{Tolpadi2020, Felfeliyan2024}, KL grades \cite{Leung2020}, or symptomatic radiographic KOA \cite{KNOAP2020_Challenge}. These models primarily rely on imaging alone or incorporate a limited set of clinical variables, such as age, sex, BMI, and pain scores. However, only a few studies have investigated predicting the specific year of TKR, making this a relatively unexplored yet clinically important area \cite{Mahmoud2023, Jamshidi2021, Heisinger2020, Qiang_Liu2022}. Previous studies using the Osteoarthritis Initiative (OAI) dataset—a 10-year observational study—have employed survival analysis methods and relied on clinical variables and image readings to estimate the year of TKR \cite{Mahmoud2023, Jamshidi2021, Heisinger2020}. However, these studies have typically focused on timeframes of no more than five years. While many approaches have demonstrated success in OA progression prediction \cite{Rajamohan2023, Tolpadi2022, KNOAP2020_Challenge}, existing DL models are predominantly unimodal, focusing only on imaging data without incorporating tabular data such as patient demographics, clinical assessments, or image readings \cite{Rajamohan2023, Tolpadi2020, Panfilov2022_Progression}. 
As physicians rely on both imaging and clinical data for accurate diagnosis, there is a growing need for automated multimodal AI systems that integrate medical images with clinical patient data to enhance consistency and precision in OA management strategies. 

As tabular data gains prominence in multimodal learning, its integration becomes crucial for enhancing diverse applications \cite{Kita2023-qt, Felfeliyan2024, Du2024-df}. In \cite{Kita2023-qt}, TabNet \cite{Arik2019-vh} and a DL model were combined for spinal cord tumor diagnosis through concatenated outputs. \cite{Felfeliyan2024} employed a CLIP-style vision-language model to predict OA severity by merging knee radiographs with tabular OA scores. Meanwhile, \cite{Du2024-df} introduced a transformer-based multimodal framework using 2D short-axis cardiac MR images and tabular data, leveraging self-supervised learning to manage missing data in cardiac disease classification. 

Unlike previous studies that primarily focus on predicting KOA progression \cite{Rajamohan2023, Panfilov2022_Progression, Leung2020, Panfilov2023-gl}, KL grade \cite{Leung2020}, or KOA severity \cite{KNOAP2020_Challenge}, this study specifically aims to predict the year of TKR. To this end, we developed an end-to-end, transformer-based multimodal model that integrates MR scans with clinical and image reading data. As a result, this study should not be directly compared to research focused on OA progression or severity prediction, as it addresses a distinct clinical outcome. Compared to our previous study \cite{Cigdem2024RA}, where we performed survival analysis using a two-stage prediction model, this study employs end-to-end training by integrating MR scans with clinical and image reading data. Additionally, we implemented 5-fold cross-validation (CV) to obtain a more reliable estimate of the model’s generalization performance. We processed tabular data with TabNet, which is then combined with image features using cross-modal attention to predict the year of TKR for each subject. The model using the OAI dataset outputs a predicted year within a 0 to 9-year timeframe, representing the estimated year of TKR surgery. The 10-year timeframe was selected based on the OAI study design \cite{Lester2008_OAI}. Tabular data was encoded using TabNet \cite{Arik2019-vh}, a transformer-based model that employs sequential attention to identify key features, enhancing interpretability. TabNet also leverages unsupervised pre-training to predict masked features.

Our contributions are as follows: $(1)$ Introducing an end-to-end trained multimodal approach that combines MR data with tabular data (clinical variables and image readings) to predict the year of TKR within 9 years. $(2)$ Implementing a transformer-based multimodal architecture, utilizing unsupervised representation learning through masked self-supervised learning for tabular data. $(3)$ Demonstrating that integrating image data with pretrained TabNet-processed tabular data through a multimodal interaction module based on cross-modal attention improves the accuracy of predicting the year of TKR surgery.
\section{Method}
A multimodal model is proposed to predict the year of TKR using both image and tabular data. Let $(\mathbf{X}^i, \mathbf{X}^{t})$ be an image-tabular pair, where $\mathbf{X}^i \in \mathbb{R}^{H \times W \times S \times 1}$ is the image, $\mathbf{X}^{t} = [x_t^1, \dots, x_t^N] \in \mathbb{R}^N$ is the tabular vector, and $N$ is the number of selected tabular variables. Of these, the first $N_a$ variables, $[x_t^1, \dots, x_t^{N_a}]$, are categorical, and the remaining $(N - N_a)$ variables, $[x_t^{N_a+1}, \dots, x_t^N]$, are continuous. The continuous variables were standardized using z-score normalization \cite{Du2024-df}. As shown in Figure~\ref{Flowchart}, the model includes a CNN-based image encoder $\phi_i$, a tabular encoder $\phi_t$, and a multimodal interaction module $\psi$. 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[t]
\centering
\includegraphics[height=0.42\textheight]{Figures/MIDL2025.pdf}
\caption{The architecture of the proposed model, including its image encoder, unsupervised pre-trained tabular encoder, and multimodal interaction module. DESS: sagittal fat-suppressed three-dimensional dual-echo in steady state.}
\label{Flowchart}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%+
\subsection{Image Encoder, $\phi_i$} A 3D ResNet18 model \cite{Tran2017_3DResnet} was used with the sagittal fat-suppressed three-dimensional dual-echo in steady state (DESS) MR images in the OAI study cohort. To improve model generalizability, we applied random cropping during training and center cropping during validation. The resulting input image size was set to $300 \times 300 \times 160$. Features were extracted from the output of the last pooling layer. The image encoder produced the image representation $\mathbf{I} \in \mathbb{R}^{H' \times W' \times S' \times C}$, where $C$ is its corresponding channel dimension. An ablation study on the selection of ResNet18 as the image encoder is provided in the \textit{Ablation study} section of the Supplementary Document. 

\subsection{Tabular Encoder, $\phi_t$} Out of 1224 baseline clinical variables in the OAI \cite{Lester2008_OAI} database, 245 were available for over 90\% of the subjects. All available image assessment measurements were utilized. A least absolute shrinkage and selection operator (Lasso) method \cite{tibshirani1996regression} is applied to identify the most relevant features. The regularization strength ($\alpha$) is optimized using the Optuna framework \cite{Akiba2019-co} to maximize explained variance while promoting sparsity. The optimal $\alpha$ is used to finalize the selection of 31 features. 
Let $\mathbf{X}^{t} = [\mathbf{X}^c, \mathbf{X}^r] = [x_t^1, \dots, x_t^A] \in \mathbb{R}^A$ be the concatenation of the clinical variables $\mathbf{X}^c \in \mathbb{R}^{A_c}$ and the image readings $\mathbf{X}^r \in \mathbb{R}^{A_r}$, where $A = A_c + A_r$ is the number of all tabular variables available in the OAI dataset. After applying Lasso feature selection, $\mathbf{X}^{t'} = [x_t^1, \dots, x_t^N] \in \mathbb{R}^N$ is obtained. The masked self-supervised pretraining of the tabular encoder incorporates a task for predicting missing feature columns from the existing ones. The \( N \)-dimensional selected tabular features $\mathbf{f} \in \mathbb{R}^{B \times N}$ are passed to each decision step, where \( B \) is the batch size. Consider a binary mask $\mathbf{M} \in \{0, 1\}^{B \times N}$. The encoder inputs $(1-\mathbf{M}) \cdot \mathbf{\hat{f}}$, and the decoder outputs the reconstructed features, $\mathbf{M} \cdot \mathbf{\hat{f}}$. The prior scale term, denoting how much a particular feature has been used previously, is initialized as $\mathbf{P}[0] = (1-\mathbf{M})$ in the encoder so that the model emphasizes only the known features, while the decoder's final fully-connected layer is multiplied by $\mathbf{M}$ to output the unknown features. The reconstruction loss during the self-supervised phase is:
\begin{equation}
\sum_{b=1}^{B} \sum_{j=1}^{N} \left| \frac{(\hat{f}_{b,j} - f_{b,j}) \cdot M_{b,j}}{\sqrt{\sum_{b'=1}^{B} \left( f_{b',j} - \frac{1}{B} \sum_{b''=1}^{B} f_{b'',j} \right)^2}} \right|^2.
\end{equation}
After pretraining, TabNet leverages the learned weights and sequential attention to focus on the most relevant features at each decision step. The tabular encoder produced the tabular representation $\mathbf{T} \in \mathbb{R}^{N \times D}$, where $D$ is its corresponding channel dimension.

\subsection{Multimodal Interaction Module, $\psi$} A cross-attention mechanism is used to effectively capture relationships across modalities \cite{Vaswani2017-hj}. The image representation $\mathbf{I}$ is projected via a linear layer into $\hat{\mathbf{I}} \in \mathbb{R}^{(H'W'S') \times D}$ to have the same embedding size as $\mathbf{T}$. The interaction module is composed of $L_m$ layers, each integrating self-attention, cross-modal attention, an MLP feed-forward layer, and layer normalization. The resulting representation $\mathbf{F}$ captures a joint representation of an image-tabular pair. The cross-modal attention in the $l$-th layer can be formulated as described in \cite{Vaswani2017-hj, Du2024-df}:

\begin{equation}
\label{equation:crossatt}
\text{CrossAttention}(\boldsymbol{Q}, \boldsymbol{K}, \boldsymbol{V}) = \text{softmax} \left( \frac{\boldsymbol{Q} \boldsymbol{K}^T}{\sqrt{d_k}} \right) \boldsymbol{V},
\end{equation}

where $\boldsymbol{Q} = \mathbf{F}^{l-1} \mathbf{W}_Q^l$, $\boldsymbol{K} = \hat{\mathbf{I}} \mathbf{W}_K^l$, $\boldsymbol{V} = \hat{\mathbf{I}} \mathbf{W}_V^l$, and $\mathbf{F}^0 = \mathbf{T}$.

\section{Experiments}
\subsection{Study cohort}
\label{sec:Study cohort}
The study utilized knee data from the publicly accessible OAI database. The OAI database contains clinical variables, MRI exams, and MRI quantitative and semi-quantitative image assessment measurements for 4,796 subjects aged 45 to 79 with or at risk for KOA, evaluated at baseline and follow-ups at 12, 18, 24, 30, 36, 48, 60, 72, and 96 months. The OAI received ethical approval from the Institutional Review Boards at the University of California at San Francisco. All participants provided written informed consent. The study cohort in the OAI was evaluated with longitudinal DESS MRI exams from a 3.0T MRI scanner. Out of 4,796 subjects in the OAI, 547 subjects underwent TKR during the 9-year follow-up period. Each subject may have undergone TKR in either one or both knees (163 with only the left knee, 168 with only the right knee, and 108 with both knees). In this study, we utilized all available data from the OAI dataset that included MR scans, image readings, and clinical data, resulting in 850 knee MRIs, as detailed in Figure~\ref{DatasetSelection}. The baseline gender and age of the study cohort are provided in Table~\ref{demographicInfo}. For data augmentation, each knee was treated as an independent data point. Follow-up time point data for each patient were treated as independent, separate entries rather than part of a longitudinal study, with each follow-up time considered as year 0 for estimating the year of TKR. 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[hbt!]
\centering
\includegraphics[width=\columnwidth,height=0.35\textheight,keepaspectratio]{Figures/Dataset.eps}
\caption{Flowchart for study cohort generation. After reviewing MRI data along with clinical, quantitative, and semi-quantitative image assessment measurements, 850 knee data from 547 subjects who underwent TKR within a 9-year follow-up period in the OAI database were identified. Knees: Knee images.}
\label{DatasetSelection}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{table}[!htbp]
\centering
\resizebox{0.5\textheight}{!}{
\begin{tabular}{lc}
\hline
\rowcolor[HTML]{9B9B9B} 
\textbf{Dataset}                                                              & \textbf{OAI}                                                 \\ \hline
Number of Knees                                                                & 850                                                          \\ \hline
\rowcolor[HTML]{C0C0C0} 
\begin{tabular}[c]{@{}l@{}}Imaging Type\\ (Train/Validation/Test)\end{tabular} & \begin{tabular}[c]{@{}c@{}}DESS \\ (604/82/164)\end{tabular} \\ \hline
SEX                                                                            & Male: 490, Female: 360                                       \\ \hline
\rowcolor[HTML]{C0C0C0} 
AGE                                                                            & Mean±SD (Range)                                              \\
Male                                                                           & 64.2±8.3 (45-83)                                             \\
\rowcolor[HTML]{C0C0C0} 
Female                                                                         & 65.7±8.5 (45-82)                                             \\ \hline
BMI                                                                            & Male: 29.7±5.3, Female: 29.9±4.2                             \\ \hline
\rowcolor[HTML]{C0C0C0} 
RACE                                                                           & Non-White: 17, White: 731, Black: 78, Asian: 24              \\ \hline
KL Grade                                                                       & 0: 16, 1: 32, 2: 162, 3: 317, 4: 323                         \\ \hline
\rowcolor[HTML]{C0C0C0} 
PAIN (WOMAC Score)                                                             & 0: 95, 1-5: 369, 6-10: 294, 11-15: 88, 16-20: 4              \\ \hline
OARSI Grade                                                                    & None: 676, Small: 43, Medium: 64, Large: 67                  \\ \hline
\rowcolor[HTML]{C0C0C0} 
BML Subregions                                                                 & 0: 121, 1-3: 287, 4-6: 390, 7-9: 52, 10-12: 0                \\ \hline
\end{tabular}}
\begin{tablenotes}
\item \footnotesize BML: bone marrow lesions, DESS: sagittal fat-suppressed three-dimensional dual-echo in steady state, OAI: osteoarthritis initiative, SD: standard deviation. There are fifteen BML subregions, covering the femoral, tibial, and patellar areas. A subregion is classified as damaged if the grade is greater than 0. \normalfont
\end{tablenotes}
\caption{Demographic and key clinical and imaging assessment variables of subjects in the OAI study cohort.}
\label{demographicInfo} 
\end{table}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Multimodal Model Designs}
 We compared the performance of our proposed model against three unimodal models (an MR image-only model, a TabNet-based tabular-only model, and a pretrained TabNet-based tabular-only model), as well as several multimodal models that integrate image and tabular data in four different ways: \textit{1. Basic Concatenation:} Combines image features with all tabular data via concatenation in the penultimate layer; \textit{2. Dynamic Affine Feature Map Transform (DAFT):} Adjusts image feature maps based on tabular data, enabling images to be interpreted in the context of tabular data \cite{Wolf2022-pc}; \textit{3. TabNet-Processed Models:} Tabular data is processed with TabNet, then combined with image features using basic concatenation and DAFT; and \textit{4. Self-Attention Tabular Encoder with Multimodal Interaction Fusion:} Tabular data is processed with a self-attention transformer before being combined with encoded image data using cross-modal attention \cite{Du2024-df}. This comprehensive comparison allowed us to identify the most effective model for predicting the year of TKR within a 0 to 9-year timeframe by evaluating how different fusion strategies impact predictive performance. 

\subsection{Experiment Design}
We used 604, 82, and 164 image-tabular data pairs for the training, validation, and test splits, respectively. The splits were made at the subject level to ensure that all follow-up data for the same subject was included within a single split. Five-fold CV was used to validate the model performance. The Kullback–Leibler divergence loss and label discretization were used for end-to-end training \cite{Cigdem2024_ISMRM}. Predictions were calculated based on the area under the predicted distribution. The optimal value of the Lasso regularization parameter $\alpha$ obtained through hyperparameter tuning was 0.097. The Adam optimizer was used for all training. For self-supervised pretraining, we set the maximum number of epochs to 200, the batch size to \(B = 16\), the virtual batch size to \(B_V = 4\), and the masking ratio to 0.5, and used a learning rate (LR) scheduler with a starting LR of \(10^{-2}\). The end-to-end model was trained with an LR of \(10^{-5}\) and a weight decay of \(10^{-4}\), running for 150 epochs with a batch size of 4. For TabNet, we set a categorical embedding dimension of eight, \(N_d = N_a = 64\), \(N_{\text{steps}} = 4\), \(\gamma = 1.3\), and momentum \(m_B = 0.02\). The optimal LR of 0.025, weight decay of 0.0014, nine independent layers, and seven shared layers were selected through hyperparameter tuning. Both our transformer-based tabular encoder and multimodal interaction module consist of four transformer layers, each featuring eight attention heads and a hidden dimension of 64. We used an MLP with hidden sizes of 512 for image data and 64 for tabular data, both producing outputs of size 64. To mitigate the risk of overfitting, we monitored the validation loss, and the best model was selected based on the highest validation accuracy. The image encoder is computationally intensive due to 3D convolutions on MRI data, while the tabular encoder remains lightweight with embeddings. 
The fusion module integrates both via cross-attention, reducing spatial dimensions to enhance efficiency and scalability for clinical use. The accuracy, mean absolute error (MAE), and macro-AUC metrics were used for evaluating the models. Details of the metrics are provided in the \textit{Model prediction evaluation metrics} section of the Supplementary Document.
\section{Results and Discussion}
We evaluated the performance of two imputation methods, mean imputation for continuous variables with median imputation for categorical variables and a more advanced Random Forest-based imputation \cite{Stekhoven2012-rt}, alongside the impact of feature selection on model accuracy in predicting the year of TKR. Models included TabNet regressor with and without self-supervised pretraining, and the Lasso feature selection method was applied prior to encoding. As shown in Table~\ref{ImputationTable}, the highest accuracy of 61.0\% was achieved by combining Random Forest imputation, Lasso feature selection, and the pretrained TabNet model. This configuration consistently outperformed other setups, suggesting that leveraging feature selection and pretraining enhances model performance. Additionally, TabNet pretraining consistently improved accuracy across all settings, highlighting the benefit of self-supervised pretraining in tabular data encoding. Since we used TabNet regression to predict the time to TKR, macro-AUC could not be calculated for these models.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{table}[htb!]
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{|l|l|l|c|l|}
\hline
\textbf{Imputation Method} & \textbf{\begin{tabular}[c]{@{}l@{}}Feature \\ Selection\end{tabular}} & \textbf{Model}                                             & \textbf{\begin{tabular}[c]{@{}c@{}}ACC\\ (\%)\end{tabular}} & \textbf{MAE} \\ \hline
Mean(Cont.)+Median(Cat.)   & - & TabNet & 59.2 & 1.58      \\ \hline
Mean(Cont.)+Median(Cat.)   & - & TabNet\textsubscript{Pretrained} & 60.4 & 1.55 \\ \hline
Mean(Cont.)+Median(Cat.)   & Lasso & TabNet & 55.5 & 1.57 \\ \hline
Mean(Cont.)+Median(Cat.)   & Lasso & TabNet\textsubscript{Pretrained} & 57.3 & 1.62  \\ \hline
Random Forest              & - & TabNet & 54.3 & 1.65 \\ \hline
Random Forest              & - & TabNet\textsubscript{Pretrained} & 59.8 & 1.59  \\ \hline
Random Forest              & Lasso & TabNet & 57.3 & 1.74 \\ \hline
\textbf{Random Forest}     & \textbf{Lasso} & \textbf{TabNet\textsubscript{Pretrained}} & \textbf{61.0} & \textbf{1.55}  \\ \hline
\end{tabular}}
\caption{Performance comparison of two imputation methods, feature selection strategies, and models in predicting the year of TKR.}
\label{ImputationTable}
\end{table}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Table~\ref{tab:performance_comparison} compares the performance of our proposed model with various multimodal and unimodal models for predicting the year of TKR within a 9-year timeframe. Among the unimodal models, the pretrained TabNet-based tabular-only model and the MR image-only model achieved similar accuracies of 61.0\% and 60.7\%, respectively. These results indicate that each modality alone offers competitive performance, with the pretrained TabNet particularly effective at handling tabular data independently. The proposed model combining an image encoder with a pretrained TabNet as the tabular encoder using a multimodal cross-modal attention fusion approach achieved the highest accuracy of 63.4\% across all models. In comparison, basic concatenation of image data with raw tabular data and with pretrained TabNet-processed tabular data reached accuracies of 54.6\% and 57.9\%, respectively. DAFT showed improved performance over concatenation, achieving accuracies of 58.5\% when applied to MR images with raw tabular data and 60.4\% when applied to MR images with pretrained TabNet-processed data. Additionally, the multimodal model using a self-attention transformer-based tabular encoder and a cross-modal attention fusion provided an accuracy of 59.2\%. As we used both knees from the same patient as independent samples, this could introduce correlation biases, potentially affecting the statistical robustness of the results. To assess this, we investigated whether training the model using only a single knee per patient would impact accuracy. This approach reduced the dataset size, leading to lower accuracy (57.7\% vs. 63.4\%), higher MAE (1.56 vs. 1.33), and a decrease in AUC (0.615 ± 0.040 vs. 0.665 ± 0.029). These findings reinforce the importance of leveraging both knees as independent samples to enhance model robustness and predictive accuracy.

The results underscore the advantages of our proposed model's cross-modal attention fusion approach, which outperforms other fusion methods. The highest performance achieved by our model highlights the importance of both pretraining and cross-modal attention fusion when combining tabular and image data for the year of TKR prediction over a 9-year timeframe. The proposed model is effective in capturing nuanced relationships across modalities, leading to improvements over conventional fusion techniques in multimodal learning.

TKR surgery decisions are mostly influenced by clinical symptoms, particularly pain and functional limitations, rather than radiographic OA severity alone. For instance, a patient with advanced OA (KL grade 4) may avoid surgery if they experience minimal pain, while another with mild OA (KL grade 1) may undergo TKR due to severe pain. Therefore, we use tabular data as the query vector in cross-modal attention fusion instead of imaging features. To provide a quantitative comparison, we conducted an additional experiment using image features as the query while keeping the rest of the model unchanged. This resulted in a decline in performance compared to our proposed approach, which uses tabular features as the query. Specifically, accuracy decreased from 63.4\% to 57.3\%, MAE increased from 1.33 to 1.51, and macro-AUC dropped from 0.665 ± 0.029 to 0.620 ± 0.073.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{table}[htb!]
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{|l|l|l|c|c|c|}
\hline
\textbf{Data}   & \textbf{Model}                    & \textbf{Fusion}        & \textbf{ACC} & \textbf{MAE} & \textbf{macro-AUC ± std}\\ \hline
Tabular-only  & TabNet\textsubscript{Pretrained}    & -                      & 61.0 & 1.55 & - \\  \hline
Image-only    & DL                                  & -                      & 60.7 & 1.46 & 0.615 ± 0.047\\ \hline
Image+Tabular & DL+Raw Tabular                      & Concatenation          & 54.6 & 1.68 & 0.539 ± 0.058\\ \hline
Image+Tabular & DL+Raw Tabular                      & DAFT                   & 58.5 & 1.56 & 0.608 ± 0.013 \\ \hline
Image+Tabular & DL+TabNet\textsubscript{Pretrained} & Concatenation          & 57.9 & 1.54 & 0.597 ± 0.040\\ \hline
Image+Tabular & DL+TabNet\textsubscript{Pretrained} & DAFT                   & 60.4 & 1.47 & 0.644 ± 0.024\\ \hline
Image+Tabular & DL+Transformer\textsubscript{Self-Attention}                       & Multimodal Interaction & 59.2 & 1.48 & 0.559 ± 0.071\\ \hline
\textbf{Image+Tabular} &\textbf{DL+TabNet\textsubscript{Pretrained}} &\textbf{Multimodal Interaction} &\textbf{63.4} &\textbf{1.33} &\textbf{0.665 ± 0.029}\\ \hline
\end{tabular}}
\caption{Performance comparison of the proposed model against various multimodal and unimodal models for predicting the year of TKR within a 9-year timeframe.}
\label{tab:performance_comparison}
\end{table}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Our study has limitations. We included only subjects who underwent TKR within a 9-year follow-up period, requiring the pre-classification of subjects into TKR and non-TKR groups. In future work, censored data will be incorporated, and survival analysis will be conducted for the control group. Additionally, we used both knees from the same patient, which may introduce correlation; however, treating them as separate data points increases the sample size, enhances statistical power, and captures individual knee variability. While some clinical data, such as general health indicators, apply to both knees, each knee has distinct measurements and images. Since their condition and progression can vary, we treated them independently in our model. The generalizability of our model to external datasets is also limited due to differences in available clinical variables, image readings, and the absence of DESS MRI data in other cohorts, preventing direct validation. Furthermore, the OAI dataset primarily consists of older, overweight, and Caucasian subjects. As a result, the model’s generalizability to populations with greater diversity in age, body mass index, race, and ethnicity requires further investigation. 
\section{Conclusion}
Our study demonstrates that an end-to-end transformer-based multimodal model, integrating MR imaging and tabular data with a pretrained TabNet encoder, improves the accuracy of year-of-TKR prediction compared to unimodal and other multimodal approaches. The proposed approach can also be applied to other biomedical applications involving multimodal data integration and time-to-event analysis.
\section{Acknowledgments}
\label{sec:acknowledgments}
This work was supported in part by the National Institutes of Health (NIH) R01 AR074453. 

\bibliography{midl25_220}
\clearpage
\input{Supplementary Document}

\end{document}

