\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage[table]{xcolor} % This should uncomment for the final submission

\usepackage{booktabs}
\usepackage{comment}
\usepackage{rotating}
\usepackage{adjustbox}
\usepackage{multirow}
\usepackage{svg}
\usepackage{mwe} % to get dummy images
\jmlrvolume{-- 231}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026} 
\editors{Accepted for publication at MIDL 2026}

\title[Attention-to-Survival]{Attention-to-Survival: Multimodal Fracture Risk Prediction Based on Pelvic Radiographs and Clinical Data\\from the Study of Osteoporotic Fractures}

\midlauthor{\Name{Niklas C. Koser\midlotherjointauthor\nametag{$^{1}$}} \orcid{0009-0006-0599-2901}\Email{niklas.koser@rad.uni-kiel.de}\\
\Name{Marten J. Finck\midljointauthortext{Contributed equally}\nametag{$^{2}$}} \orcid{0009-0008-9808-2222} \Email{mafi@informatik.uni-kiel.de}\\
\Name{Silja Janßen\nametag{$^{3}$}} \orcid{0009-0002-0749-179X} \Email{sja@informatik.uni-kiel.de}\\        
\Name{Coenraad Mouton\nametag{$^{1}$}} \orcid{0000-0001-8610-2478}\Email{coenraad.mouton@rad.uni-kiel.de}\\
\Name{Li-Y. Lui\nametag{$^{4}$}} \Email{lily.lui@ucsf.edu}\\
\Name{Steven R. Cummings\nametag{$^{4}$}} \Email{steven.cummings@ucsf.edu}\\
\Name{Kevin Köser\nametag{$^{3}$}} \orcid{0000-0002-0974-3834} \Email{kk@informatik.uni-kiel.de}\\
\Name{Jan-B. Hövener\nametag{$^{1}$}} \orcid{0000-0001-7255-7252}\Email{jan.hoevener@rad.uni-kiel.de}\\
\Name{Sören Pirk\nametag{$^{2}$}} \orcid{0000-0003-1937-9797} \Email{sp@informatik.uni-kiel.de}\\
\Name{Claus-C. Glüer\nametag{$^{1}$}} \orcid{0000-0003-3539-8955}\Email{glueer@rad.uni-kiel.de}\\
\addr $^{1}$ i2Lab@SBMI, Kiel University, University Hospital Schleswig-Holstein, Kiel, Germany\\
\addr $^{2}$ Visual Computing and Artificial Intelligence, Kiel University, Kiel, Germany \\
\addr $^{3}$ Department of Computer Science, Kiel University, Kiel, Germany \\
\addr $^{4}$ California Pacific Medical Center Research Institute, San Francisco, CA, USA 
}

\begin{document}

\maketitle

\begin{abstract}
Osteoporotic changes in the hip structure render the proximal femur particularly vulnerable to fractures, which leads to severe consequences for patients' health and significant socioeconomic burdens, a strongly increasing problem in aging populations. Accurate risk estimation is therefore essential for initiating timely preventive measures. However, the current clinical standard measures, bone mineral density~(BMD) and the Fracture Risk Assessment Tool~(FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}}), provide only limited predictive value. Neither BMD nor FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}} captures structural characteristics that could be derived from widely available pelvic radiographs. To address this gap, we present the Attention-to-Survival Fusion~(ATSF) model, a multimodal survival analysis framework that combines clinical risk factors~(CRFs) with pelvic radiograph features. An attention-based architecture equipped with a deep conditional transformation model~(DCTM) prediction head enables accurate estimation of time-dependent fracture risk. The ATSF model is designed to accommodate missing clinical variables, handle all forms of non-informative censoring, and provide modality-specific interpretability through the attention mechanisms. It was developed, validated and tested with data of 7825 women from the Study of Osteoporotic Fractures~(SOF) followed for fracture incidence for 23 years. We benchmark ATSF against established baselines, including FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}}, the Cox proportional hazards model~(CoxPH), and a deep learning reference model. Our results demonstrate significantly superior performance across concordance index~(C-index) and area under the receiver operating characteristic curve~(AUC), indicating the importance of integrating radiographic and clinical data within a unified survival framework.
Furthermore, offering improved interpretability and a scalable multimodal design, the proposed method provides a promising alternative for advancing individualized hip-fracture risk prediction in osteoporosis research and precision medicine.
\end{abstract}


\begin{keywords}
Survival Analysis, Hip Fracture Risk, Multimodal, Deep Learning
\end{keywords}

\section{Introduction}
\label{sec:introduction}
Osteoporotic fractures resulting from gradual deterioration of the trabecular bone structure represent a major health problem for the elderly population due to their prevalence rising steadily with age~\cite{compston_osteoporosis_2019,maquer_bone_2015,shen2022global}. Usually progressing unnoticed, even minor trauma or everyday stresses can lead to fractures due to the fragility of bone~\cite{bartl_osteoporose_2010}. The resulting increase in morbidity and mortality not only significantly reduces the quality of life of those affected, but also places a considerable socioeconomic burden on healthcare systems worldwide~\cite{shen2022global,kanis_scope_2021}. In order to prevent such consequences, accurate prognosis and early diagnosis are crucial, as timely, guideline-based preventive measures have been proven successful~\cite{shepstone2018screening,porter_osteoporosis_2025}.

In the clinical assessment of osteoporosis, the World Health Organization~(WHO) recommends determining bone mineral density~(BMD) using dual-energy X-ray absorptiometry~(DXA) as the clinical standard. The measurement provides a quantitative indicator of bone mineralization and thus offers insights into the extent of bone loss and the risk of future fractures, including the most important fracture sites,  vertebral bodies and the proximal femur~\cite{kanis2007assessment,bartl_osteoporose_2010}. 
% Quantitative computed tomography~(QCT) is considered a possible alternative to DXA, as it enables significantly more sensitive measurement of BMD and is therefore particularly suitable for the early detection of skeletal changes. However, due to higher radiation exposure, considerable costs, and methodological challenges in standardization and quality control, it has not yet found widespread use in routine clinical practice~\cite{kanis2007assessment,bartl_osteoporose_2010,loffler2020x}.


Since osteoporosis cannot be characterized solely by decreasing bone density, additional diagnostic metrics, specifically those reflecting bone microarchitecture, have been proposed in the literature to improve the sensitivity of risk assessment. One example is the trabecular bone score~(TBS), which enables indirect quantification of trabecular structure~\cite{harvey2015trabecular,silva2014trabecular}. In addition, current research is investigating the use of high-resolution peripheral quantitative computed tomography~(HR-pQCT), but this method cannot be applied to the hip. To address this, deep learning~(DL)-based super-resolution methods of clinical CTs have been developed to derive trabecular structure parameters~\cite{nishiyama2013clinical,koser2025realsuperres,finck2025femoralyze}. Aside from image-derived metrics, clinical risk factors~(CRFs) contribute to the patient-specific risk of fracture. These include, among others, gender, age, body mass index, prior fractures, and alcohol and tobacco consumption. Importantly, some of these risk factors are at least partially dependent on BMD, whereas others influence fracture risk independently of BMD~\cite{kanis2007assessment,bartl_osteoporose_2010, porter_osteoporosis_2025}.

Based on these findings, risk models have been developed in recent years that take into account a variety of skeletal and clinical factors in order to predict the absolute risk of fracture within a defined period (usually ten years). A prominent example of this is the Fracture Risk Assessment Tool (FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}}), which estimates the 10-year risk of hip or major osteoporotic fractures based on CRFs with or without the inclusion of BMD. FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}} was developed for postmenopausal women and men aged 40 and older and is adapted to the geographical heterogeneity of fracture risk by using country-specific calibrated models. The underlying hazard function is estimated using Poisson regression, a method typically used in survival analysis~(SA)~\cite{kanis2007assessment}.

In contrast to the purely binary consideration of the occurrence of an event (e.g., an osteoporotic fracture), SA, also known as time-to-event analysis, allows for a model-based consideration of the temporal dimension of risk. This is particularly important in a clinical context, as both the probability of occurrence and the underlying risk factors can change over time~\cite{collett2023modelling,george2014survival,kanis2007assessment,wiegrebe2024deep}. 
In addition to conventional methods, DL models for SA have become increasingly important in recent years. These approaches build on the established methods but enable flexible modeling of complex, particularly nonlinear relationships, and allow seamless integration of heterogeneous, multimodal data sources. This provides significantly increased modeling flexibility and can potentially contribute to improved performance in clinical applications~\cite{wiegrebe2024deep, george2014survival}. Further theoretical principles of SA are presented in detail in Appendix~\ref{Appendix:Survival}.

The combination of CRFs with image-based features from radiographs represents a promising strategy to improve the prognostic accuracy of survival models for osteoporotic fractures. 
Prior multimodal approaches in fracture risk prediction remain limited in several respects. For example, \citet{schmarje2022opportunistic} classify fracture risk without modeling the temporal component. 
Similarly, \citet{shaikh2024cnn} perform convolutional neural network~(CNN) based classification of vertebral CT scans to estimate 10-year fracture risk, and subsequently incorporate the CNN-derived prediction into a Cox proportional hazards~(CoxPH) model for survival analysis. In contrast, \citet{kong2022development}, while conceptually closest to our approach, perform SA on spinal radiographs, incorporating the time component in one multimodal framework. However, they employ a simple ResNet-based architecture trained on a small, single-center dataset. Both \citet{shaikh2024cnn} and \citet{kong2022development} omit relevant FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}} CRFs and inherit the proportional hazards limitations of the CoxPH model.

Collectively, the constraints of these methods highlight the need for a unified multimodal survival framework that integrates comprehensive CRFs with pelvic radiographs and directly models the time-dependent fracture risk. Therefore, we aim to advance the state of the art by making the following key contributions:
(1) We introduce the Attention-to-Survival Fusion~(ATSF) model, an attention-based multimodal network with a deep conditional transformation model~(DCTM) prediction head that integrates CRFs with pelvic radiograph features for hip-fracture risk estimation;
(2) The model robustly handles missing data and accommodates all forms of non-informative censoring;
(3) The attention mechanism provides a pathway for model interpretability and enables insight into modality-specific feature relevance; and
(4) We systematically compare our approach with established baselines, including CoxPH~\cite{cox1972regression}, FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}}~\cite{kanis2007assessment}, and the DL approach mentioned~\cite{kong2022development}, using concordance index~(C-index) as the primary and area under the receiver operating characteristic curve~(AUC) as a secondary, complementary metric.


\section{Related Work}
\label{sec:related_work}
DCTMs are a recent and powerful development in the field of DL-based SA~\cite{campanella2025flexible}. This approach parameterizes the log cumulative baseline hazards using Bernstein polynomials and thus also includes the CoxPH model as a special case. The transformation function is modeled by a neural network, which allows complex, nonlinear, and non-proportional hazard structures to be mapped. The architecture consists of a feature extractor for structured or unstructured data and a flexible DCTM survival head. This modular design allows seamless integration with a variety of feature extraction models. The survival head is trained using the negative log-likelihood~(NLL), and its parameterization via Bernstein polynomials and sigmoid distribution functions provides more stable optimization compared to models like DeepSurv, while also avoiding the biases that can arise from cross-entropy-based loss functions, as in DeepHit~\cite{lee2018deephit}. In extensive evaluations, DCTMs consistently show performance gains over state-of-the-art~(SOTA) models such as DeepSurv~\cite{katzman2018deepsurv}, DeepHit, random survival forest, and CoxPH~\cite{cox1972regression}.

Beyond methodological advances, multimodal approaches that combine structured clinical information with unstructured medical image data have gained importance in osteoporosis research. However, existing work has so far focused primarily on binary classification tasks. For example, image features are extracted from fine-tuned CNNs on chest radiographs and fused with clinical variables that are transferred to the feature space via a multilayer perceptron~(MLP). The combined features are then passed through another MLP to opportunistically screen for osteoporosis~\cite{tang2025fusion}. A similar approach additionally uses dimensionality reduction via principal component analysis~(PCA) and clustering-based feature selection to improve classification performance~\cite{chagahi2024enhancing}. For fracture risk classification, \citet{schmarje2022opportunistic} concatenated image features from pelvic radiographs with CRFs and used an MLP for binary prediction, without modeling the temporal dimension of fracture risk. Similarly, a small vision transformer~(ViT-S) was pre-trained in a self-supervised manner and combined with clinical features processed via an MLP~\cite{senanayake2023classification}. Radiomics-based approaches extract features from CT images and fuse them with clinical variables, achieving improved predictive performance, particularly with gradient boosting models~\cite{saravi2024integrating,zhang2024development}. Only a few studies exist in the field of multimodal SA for osteoporotic fracture prediction. \citet{shaikh2024cnn} proposed a two-stage pipeline in which CNN-derived vertebral CT features are first used to classify 10-year fracture risk, and the resulting scores are then combined with age and BMI in a CoxPH model in the second stage. This approach achieved a C-index of 0.78 and demonstrated the potential of CNN-based fracture assessment, but is constrained by its two-stage design and the omission of certain CRFs. 
The study most relevant to our work is that of \citet{kong2022development}, who introduced a two-stage multimodal survival modeling framework. Their approach combines ResNet-based spinal radiograph features with CRFs within a DeepSurv architecture. In the first stage, a keypoint detection model is trained to localize the vertebral bodies L1--L5 in spinal radiographs, which are then used to extract fixed regions of interest (ROIs) or corresponding image patches. These ROIs are kept fixed and serve as input to the second stage of the pipeline. In the second stage, the cropped ROIs are processed by a ResNet to extract image features, which are concatenated with CRFs and passed through an MLP, with DeepSurv acting as the survival prediction head. This approach demonstrated that multimodal methods incorporating temporal survival modeling can outperform traditional FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}} and CoxPH baselines. The study of \citet{kong2022development} is limited by a small sample size, low event rate, single-center data, and potential selection bias and missing values. Moreover, due to reliance on simple feature concatenation, multimodal fusion is restricted to global-level integration, preventing explicit modeling of fine-grained cross-modal interactions. Motivated by these gaps in prior work, we propose the following approach to multimodal SA for osteoporotic fracture risk assessment, leveraging pelvic radiographs as a more widely available alternative to DXA alongside recommended CRFs.


\section{Methodology}
\label{sec:methodology}
In this section, we describe the proposed ATSF model, provide details about the used data, the model architecture along with our training procedure, and the evaluation setup. The code and model weights will be released upon acceptance.

\subsection{Data}
\label{sec:data}
This work uses data from the Study of Osteoporotic Fractures~(SOF), a population-based multicenter cohort study funded by the National Institutes of Health. Between 1986 and 2008, the study enrolled 9,704 predominantly Caucasian women aged 65 years or older across four U.S. clinical sites~(Baltimore, Minneapolis, Portland, and Pittsburgh) and recorded longitudinal health information. The dataset includes radiographs from anatomically relevant regions for osteoporosis assessment, including pelvic images, along with CRFs, measurements obtained from blood samples, functional and cognitive tests, and precomputed BMD. It should be noted that DXA images used to derive BMD measurements are not used in the present study. Fracture status was systematically updated at four-month intervals throughout the study period~\cite{cummings1995risk,cummings1998hormones}. The dataset is publicly accessible via the SOF online portal\footnote{\url{https://sofonline.ucsf.edu/}}.

\subsection{Baselines}
\label{sec:baselines}
To validate our model, we compare it to three established baselines: First, we employ the WHO-endorsed FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}} score, using the nine CRFs sex, age, BMI, smoking status, alcohol consumption, previous fractures, rheumatoid arthritis, glucocorticoid exposure, and parental history of hip fracture~\cite{kanis2007assessment}. FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}} is evaluated both with and without BMD, and we use the US~(Caucasian)-calibrated model to ensure consistency with the population of SOF. Second, we use a CoxPH model incorporating the same CRFs as FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}} to ensure optimal comparability~\cite{cox1972regression}. Because the SOF dataset contains exclusively female participants, sex is excluded from this baseline due to its lack of discriminative value. The third baseline serves as our DL-baseline and follows the multimodal approach described by \citet{kong2022development}. Since our work focuses on multimodal survival modeling rather than vertebral localization or ROI extraction, we base our DL-baseline exclusively on the second stage of the pipeline proposed by \citet{kong2022development}. As the original publication does not specify the exact model variant, used weights, or layer parameters, our implementation adopts reasonable and commonly used choices to replicate the method as faithfully as possible. Specifically, we concatenate the eight normalized CRFs with 2,048 pelvic radiograph features extracted from a pretrained and frozen ResNet-50 with IMAGENET1K\_V1 weights~\cite{he2016deep}. The concatenated feature vector is fed to a trainable two-layer MLP~(2048+8 $\rightarrow$ 1024 $\rightarrow$ ReLU $\rightarrow$ 512) and subsequently passed into a DCTM prediction head.
While the original study employed DeepSurv~\cite{katzman2018deepsurv}, we deliberately replace this component with a DCTM head (12 Bernstein polynomials) to ensure comparability across survival models and to isolate differences in feature extraction and multimodal fusion rather than survival-head design~\cite{campanella2025flexible}. The MLP used for feature fusion remains identical to the original architecture. The DL-baseline model follows the same optimization strategy, training protocol, and final hyperparameter settings as specified in Section~\ref{sec:training_details}; the only difference is that these hyperparameters are manually selected rather than determined via automatic hyperparameter optimization.


\subsection{Attention-to-Survival Fusion Model}
\label{sec:atsf}
Our ATSF model for multimodal SA of hip-fracture risk consists of four components~(\figureref{fig:model}): the image encoder~(green), the CRF encoder~(red), the fusion layers~(blue), and the DCTM head~(orange). In the image encoder, pelvic radiographs are used intentionally because, unlike DXA scans, they are widely available and thus enable broader applicability~\cite{hsieh2021automated,kanis_scope_2021,kanis2005requirements}. Depending on the configuration, the model operates on unilateral (left hip), bilateral (left plus mirrored right hip), or full pelvis radiographs acquired at the first visit. The latter is incorporated via a MedVAE-derived projection that compresses the image while preserving the most relevant structural information~\cite{varma2025medvae}. Features are then extracted using a Vision Transformer (ViT, pre-trained on ImageNet, vit\_base\_patch16\_224 without classification head) and reduced to the most informative dimensions via a Q-Former, ensuring that only the features most relevant for risk prediction are retained~\cite{wu2020visual,deng2009imagenet,zhang2024vision}. In the CRF encoder, the eight CRFs used in the CoxPH and DL baselines are embedded via learnable embedding layers. This ensures that structured clinical information is represented in a comparable latent space alongside high-dimensional image features. The fusion layers serve as the central mechanism for integrating the two modalities. A randomly initialized, task-specific programmable token (query) iteratively aggregates relevant information from the multimodal features (keys and values) through three successive attention mechanisms, forming an explicit latent representation of the target risk. Stacking three fusion layers allows the model to refine this representation and capture complex relationships between modalities. Finally, the representative token is passed to the DCTM head, which performs SA of hip-fracture risk. 
The general shift model with 6–14 Bernstein polynomials is used to approximate the logistic function~\cite{campanella2025flexible}. 
 
\begin{figure}[t]
  \centering
  \includegraphics[width=0.8\linewidth]{midl26_231_ATSF.png}
  \vspace{-4mm}
  \caption{Overview of the ATSF model for hip-fracture risk prediction. Radiographic features~(green) and CRFs~(red) are integrated through multiple fusion layers~(blue), enabling modality-aware weighting. The combined representation is passed to a DCTM head~(orange) for time-dependent risk estimation.}
  \label{fig:model}
\end{figure} 





\subsection{Training Details}
\label{sec:training_details}
Prior to training, a keypoint-based detection algorithm~\cite{damm2022artificial} was applied to standardize the pelvic radiograph orientation, isolate the left and right hip regions, and crop images to the model input size of $224\times224$ pixels. The algorithm achieves an accuracy of 94\,\% for the left hip and 96\,\% for the right hip. Images for which keypoint detection failed were excluded. Full-pelvis radiographs were additionally processed using MedVAE pretrained on radiographs to generate a single-channel projection with an eight-fold compression per dimension~\cite{varma2025medvae}. Because the hip region is nearly square, the resulting projection was rescaled to the required input size, accepting minor geometric distortion to avoid loss of relevant anatomical information. Normalization was applied to all preprocessed input images. Given that the SOF dataset originates from multiple clinical sites, we also assessed center-specific differences in imaging protocols or acquisition quality. While handling such heterogeneity was not the primary focus of this study, we observed comparable distributions across centers without pronounced biases. Based on this assessment, no additional harmonization or center-specific normalization methods were applied. Continuous CRFs were standardized, and categorical CRFs were one-hot encoded for the ATSF model, with variable names being encoded for clear identification. For the DL-baseline, categorical variables were included as binary indicators (0 or 1) and continuous variables were standardized. We evaluated several alternative imputation strategies for missing CRF values (mean, median, and embedding-based imputation~(see Appendix~\ref{Appendix:Additional_Results} \tableref{tab:results_test_complete_impute_5y,tab:results_test_complete_impute_10y,tab:results_test_reduced_impute_5y,tab:results_test_reduced_impute_10y})). 
As none of these strategies resulted in statistically significant performance differences compared to the original setup, missing CRF values in the DL-baseline were encoded as $-1$. For ATSF, we considered different strategies for handling missing values, including a learnable representation of missingness within the attention mechanism and a masking-based approach in which missing entries are excluded from learning. The masking-based strategy was adopted, as it is consistent with the assumption of non-informative missingness. After preprocessing, 7,818 of the 9,704 SOF participants had both a valid radiograph and complete time-to-event information. These participants were randomly assigned to training, validation, and test sets in a 50/10/40 ratio, ensuring comparable distributions of key variables and sufficient fracture events within each split~(e.g., 14.2\,\% in the test set, see Appendix~\ref{Appendix:SOF}). Smaller test sets would have led to unstable and high-variance metric estimates, potentially resulting in misleading conclusions regarding model performance. We therefore prioritized a robust assessment on a large held-out test set. The split was performed externally by an independent collaborator, and the test set was held out to prevent any optimization on the test data. Since the externally created test set contains missing values in several CRFs and classic methods such as FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}} and CoxPH cannot process these, we test on a reduced, fully observed subset, referred to as $\text{Test}_\text{reduced}$. This ensures a fair comparison of all methods using exactly the same samples. The complete test set (referred to as $\text{Test}_\text{complete}$), including missing values, was later used in the ablation experiments~(see Section~\ref{sec:evaluation}).
To improve robustness, the training set was augmented using random Gaussian noise, Gaussian smoothing, affine transformations, random contrast adjustment, and zoom operations. All models were optimized using AdamW with a weight decay of $10^{-2}$ and a batch size of 169. The majority of experiments were performed on the NVIDIA H100 NVL (96 GB VRAM) while training with lower computational requirements was also executed on GPUs with lower memory capacity such as the NVIDIA Tesla V100 SXM2 or NVIDIA RTX 6000 Ada Generation. The DL-baseline was trained for 300 epochs with a learning rate of $7.2\times10^{-5}$. For the ATSF model, we performed hyperparameter optimization~(HPO) with 150 epochs over learning rates between $1\times10^{-8}$ and $5\times10^{-6}$, augmentation probabilities, the number of Bernstein polynomials~(6–14), the number of fusion layers, attention heads~(8 or 16), and the number of Q-Former tokens~(16–64). All DL-based survival models were trained using the NLL loss, and model selection was conducted based on the validation C-index.



\subsection{Evaluation and Ablation}
\label{sec:evaluation}
To quantify the predictive accuracy of our models in comparison to the baselines, we primarily use the C-index. Introduced in 1982 by Harrell et al., this metric is based on rank correlations between predicted and observed outcomes and is formally defined as the proportion of all evaluable patient pairs for which the prediction and the actual observed outcome are concordant~\cite{harrell1982evaluating,harrell1996multivariable}.
In addition to the C-index, we use the AUC to quantify the discriminatory power of the models independently of the ranking. Since, unlike the C-index, the AUC does not take into account the temporal aspect of the event occurrence, the test data set is adjusted for this analysis. Participants who suffer from a fracture after a predefined period of time are coded as negatives, and right-censored observations are excluded from the AUC calculation~\cite{hanley1982meaning}. Due to these limitations and the required adjustments, the AUC should be interpreted with caution and is considered a secondary metric complementing the C-index. Following the recommendation of Harrell et al., we perform bootstrapping for model comparison by resampling the test data with replacement~(2000 bootstrap samples)~\cite{harrell1996multivariable}. Specifically, we use the nonparametric test by \citet{kang_comparing_2015}, which allows robust estimates of the uncertainty of performance measures, including the calculation of 95\,\% confidence intervals. All statistical analyses between the best ATSF and baseline model are performed on the independent hold-out test set unless explicitly stated otherwise. A two-sided significance level of $p<0.05$ is used as a reference threshold. Results are reported for 5- and 10-year time horizons. The 10-year horizon matches the FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}} assessment period, while the shorter 5-year horizon is close to clinical guideline recommendations (3 years according to \citet{DVO}) and ensures sufficient positive cases for stable evaluation.

To interpret the ATSF model, we analyze the cross-attention weights of the programmable token within the fusion blocks, following the general approach used in attention-based survival models such as SurvTrace~\cite{wang2022survtrace}. For the CRFs, we compute cross-attention between the programmable token~(query) and all CRF tokens~(keys). For each attention head, this yields a distribution indicating how strongly the programmable token attends to each CRF token. We then average the attention weights across heads and extract the row corresponding to the programmable token, resulting in a one-dimensional vector with one value per CRF token. Visual attention maps are obtained analogously. After processing the images with the ATSF image encoder, the resulting visual tokens form a 2D spatial grid. We compute cross-attention from the programmable token to all visual tokens, average the weights across heads, and extract the programmable-token query row. The resulting weights are reshaped to the original 2D token grid, upsampled to image resolution using bilinear interpolation, and normalized for visualization.

To assess the contribution of each modality and image configuration, we conducted a structured ablation study. First, we evaluated unimodal variants of the proposed architecture using only CRFs or only radiograph features, implemented by disabling the complementary modality while keeping all other model components identical. Second, we examined the influence of different radiographic inputs by training separate models using unilateral (left) radiographs, bilateral (left and mirrored right) radiographs, and full projection sets. These experiments allow us to isolate the effect of each data source and to quantify the added value of multimodal integration compared to unimodal or reduced-input variants.

Because FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}} and CoxPH cannot process missing values, whereas the ATSF model explicitly supports them, we additionally evaluated ATSF under two test set conditions: a reduced test set from which all missing values were removed~($\text{Test}_\text{reduced}$) and the full test set containing missing values~($\text{Test}_\text{complete}$). This design allows us to disentangle several effects: the potential advantage of a larger evaluable test set enabled by missing-value handling, the ATSF model’s performance when evaluated on exactly the same data as FRAX and CoxPH, and the sensitivity of the ATSF model to missing information.


\section{Results}
\label{sec:results}
The C-index and AUC results of the different models on $\text{Test}_\text{reduced}$ with a 10-year time horizon are summarized in \tableref{tab:results_test_reduced_10y}. The table also reports 95\,\% confidence intervals and statistical significance against CoxPH with $\text{CRF}_\text{BMD}$, which is marked by *.
\begin{figure}[t]
  \centering
  \includegraphics[width=\linewidth]{midl26_231_Attention.jpg}
  \vspace{-8mm}
  \caption{Visual (left) and CRF attention (right) of the ATSF model.}
  \label{fig:attn_masks}
\end{figure} 

\begin{table}[t]
\floatconts
  {tab:results_test_reduced_10y}%
  {\caption{Performance comparison of all models on $\text{Test}_\text{reduced}$ using C-index, AUC, and their respective 95\,\% confidence intervals~(10-year time horizon). $\text{CRF}_{\text{BMD}}$ denotes inclusion of BMD. For multimodal~(MM) and image-only models, subscripts indicate the radiograph configuration: L for unilateral~(left hip), LR for bilateral~(left \& mirrored right hip), and Proj for the MedVAE-derived full-pelvis projection.}}%
  {
  \begin{tabular}{llcc}
  \toprule
   \bfseries Model & \bfseries Modality  &  $\uparrow$ \bfseries C-Index & $\uparrow$ \bfseries AUC\\
   %Name & Details & Train & Val & Train& Val\\\hline
   \hline
   

   % ----- FRAX -----
   \multirow{2}{*}{$\text{FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}}}$} 
       & CRF                        &  0.703 \textcolor{gray}{(0.652\,-\,0.753)} & 0.718 \textcolor{gray}{(0.667\,-\,0.768)}\\
       & $\text{CRF}_{\text{BMD}}$  &  0.746 \textcolor{gray}{(0.702\,-\,0.787)} & 0.758 \textcolor{gray}{(0.716\,-\,0.799)} \\\hline

   % ----- CoxPH -----
   \multirow{2}{*}{CoxPH}
       & CRF                         &  0.724 \textcolor{gray}{(0.674\,-\,0.772)} &  0.746 \textcolor{gray}{(0.696\,-\,0.794)} \\
       & $\text{CRF}_{\text{BMD}}$   &  0.769 \textcolor{gray}{(0.727\,-\,0.808)} &  0.788 \textcolor{gray}{(0.746\,-\,0.828)} \\\hline

   % ----- DL-Baseline -----
   \multirow{3}{*}{DL-Baseline}
       & $\text{MM}_\text{L}$     &  0.716 \textcolor{gray}{(0.662\,-\,0.765)} &  0.737 \textcolor{gray}{(0.684\,-\,0.758)}\\
       & $\text{MM}_\text{LR}$    &  0.752 \textcolor{gray}{(0.706\,-\,0.796)} &  0.776 \textcolor{gray}{(0.731\,-\,0.818)}\\
       & $\text{MM}_\text{Proj}$  &  0.731 \textcolor{gray}{(0.685\,-\,0.775)} &  0.755 \textcolor{gray}{(0.710\,-\,0.799)}\\\hline

   % ----- ATS Fusion Model -----
   \multirow{7}{*}{ATSF (Ours)}
       & CRF                       &  0.722 \textcolor{gray}{(0.673\,-\,0.771)} &  0.742 \textcolor{gray}{(0.690\,-\,0.792)}\\
       & $\text{Image}_\text{L}$   &  0.758 \textcolor{gray}{(0.711\,-\,0.803)} &  0.775 \textcolor{gray}{(0.726\,-\,0.820)}\\
       & $\text{Image}_\text{LR}$  &  0.757 \textcolor{gray}{(0.715\,-\,0.795)} &  0.771 \textcolor{gray}{(0.728\,-\,0.812)}\\
       & $\text{Image}_\text{Proj}$&  0.709 \textcolor{gray}{(0.660\,-\,0.757)} &  0.726 \textcolor{gray}{(0.676\,-\,0.774)}\\
       & $\text{MM}_\text{L}$      &  0.770 \textcolor{gray}{(0.720\,-\,0.815)} &  0.788 \textcolor{gray}{(0.736\,-\,0.836)}\\
       & $\text{MM}_\text{LR}$     &  \textbf{0.805*} \textcolor{gray}{(0.764\,-\,0.844)} &  \textbf{0.826} \textcolor{gray}{(0.786\,-\,0.863)}\\
       & $\text{MM}_\text{Proj}$   &  0.742 \textcolor{gray}{(0.699\,-\,0.782)} &  0.762 \textcolor{gray}{(0.715\,-\,0.803)}\\  
    \bottomrule 
  \end{tabular}}
  \vspace{-6pt}
\end{table}
Based on both C-index and AUC, the inclusion of BMD is consistently associated with improved performance for FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}} and CoxPH. Among these clinical baselines, CoxPH with $\text{CRF}_\text{BMD}$ achieves the highest discrimination, with a C-index of 0.769 and an AUC of 0.788, outperforming all baseline models tested. Within the DL-Baseline, the best-performing configurations are observed for bilateral pelvic radiographs followed by projection-based inputs. Across all multimodal settings, the ATSF model yields higher performance than the corresponding configurations of the DL-baseline. The bilateral pelvic radiograph configuration achieves the highest overall discrimination, reaching a C-index of 0.805 and an AUC of 0.826. It demonstrates statistically significant improvement compared to CoxPH with $\text{CRF}_\text{BMD}$. For image-only variants, ATSF shows higher performance than the baseline models, with the exception of CoxPH with $\text{CRF}_\text{BMD}$. In contrast, the CRF-only variant of ATSF attains a C-index of 0.722 and exhibits lower discrimination than CoxPH when evaluated with the same input variables.
Analysis of the input modalities shows that bilateral pelvic radiographs always achieve better C-indices and AUC than projections. Unilateral is only preferable to projections in certain configurations.
Consistent patterns are observed for the 5-year prediction horizon on $\text{Test}_\text{reduced}$, where the multimodal ATSF model with bilateral inputs again achieves superior evaluation metrics compared to alternative approaches (see Appendix~\ref{Appendix:Additional_Results}, \tableref{tab:results_test_reduced_5y}). Unlike at the 10-year time horizon, the CRF-only ATSF model here outperforms the corresponding CoxPH model with CRFs only. Additional Kaplan-Meier analysis of the best baseline~(CoxPH $\text{CRF}_\text{BMD}$) and overall best ATSF configuration~($\text{MM}_\text{LR}$) highlights that both models achieve a comparably strong separation between low- and high-risk participants (see Appendix~\ref{Appendix:Additional_Results}, \figureref{fig:KM_10,fig:KM_5}). The results on $\text{Test}_\text{complete}$~(see Appendix~\ref{Appendix:Additional_Results}, \tableref{tab:results_test_complete_10y,tab:results_test_complete_5y}) show no substantial changes in overall performance compared with $\text{Test}_\text{reduced}$, even with standard techniques for missing values. Across both test sets, the ATSF model exhibits higher values for the evaluated metrics when using a 5-year prediction horizon. To contextualize the ATSF model outputs, we further examine its attention mechanisms, illustrated in \figureref{fig:attn_masks}. The overlaid example images of fracture cases indicate that the model consistently allocates higher attention to regions including the greater trochanter, the femoral neck (Ward’s triangle), and adjacent areas of the lower hip. The cross-attention between CRFs and the programmable token shows the largest contributions from age and prior fracture history, while alcohol consumption receives the lowest attention values. The hyperparameters of all models are reported in Appendix~\ref{Appendix:Hyperparameters}.


\section{Discussion}
\label{sec:discussion}
Based on the results of the two established baselines FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}} and CoxPH, it can initially be deduced that the information gained by adding the DXA-determined BMD provides predictive added value. CoxPH outperforms FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}} with regard to SA based on the SOF data set. Apart from the CoxPH model with $\text{CRF}_\text{BMD}$, the multimodal bilateral DL-baseline, which was trained on pelvic radiographs and CRFs excluding BMD, is better than all other baselines. However, since pelvic radiographs are more widely available than DXA, this baseline is still preferable in some scenarios. The ATSF model presented in this paper outperforms all other approaches in its bilateral multimodal configuration. It can therefore be concluded that the inclusion of right radiographs offers an advantage in this application. In comparison, the generally lower performance of the MedVAE full-pelvis projections suggests that not all relevant trabecular structures are preserved by MedVAE and that the slight geometric distortion negatively affects predictive accuracy. Ablation studies further indicate that only the combination of CRFs and images represents a significant improvement over established methods. Although BMD is also extracted from an image modality, the ATSF model in its best configuration is superior to established methods with $\text{CRF}_\text{BMD}$. This suggests that the model is capable of delivering higher predictive performance from the previously unrecommended combination of CRFs and pelvic radiographs.
%
Since the results on the full test set $\text{Test}_\text{complete}$ do not show notable degradation compared with $\text{Test}_\text{reduced}$, the findings indicate that the model handles missing values in the CRFs robustly and maintains stable predictive performance even under incomplete clinical information. Slight performance differences, e.g., at the 5-year horizon, may be related to missing values in influential CRFs~(e.g., age), which could reduce prediction accuracy for affected participants. While this effect cannot be precisely quantified, it is consistent with our feature importance analysis. The observed improvement in performance at shorter prediction horizons aligns with the notion that radiographic features capture structural risk characteristics whose predictive relevance may diminish over longer time spans due to ongoing bone remodeling. In addition, several CRFs are time-varying, and their influence is inherently more stable over shorter horizons, which may further contribute to the superior performance observed at 5 years compared with 10 years. The attention patterns provide additional context for interpreting the model’s behavior. The model consistently places emphasis on anatomically relevant regions of the proximal femur and prioritizes key CRFs, such as age and prior fractures. These patterns suggest that the model leverages information consistent with established risk factors for osteoporotic fractures, supporting plausibility and interpretability.


\section{Conclusion, Limitations, and Future Work}
\label{sec:conclusion}
In this work, we introduced the ATSF model, a multimodal survival analysis framework that integrates pelvic radiographs with CRFs to predict long-term hip-fracture risk. In its bilateral multimodal configuration, ATSF outperformed all established baselines and achieved the highest performance on both reduced and complete test sets. These results demonstrate that pelvic radiographs -- an imaging modality far more widely available than DXA -- provide clinically valuable structural information that, when combined with CRFs in a unified survival framework, can lead to substantially improved fracture risk estimation. In addition to its performance improvements, ATSF provides modality-specific interpretability, which allows clinicians to understand the relative contribution of radiographic and clinical features. These results position ATSF as a promising step toward more accessible, accurate, and individualized osteoporosis risk assessment, leveraging pelvic radiographs as a widely available alternative to the current standard of DXA imaging.

Despite promising results, this study has limitations. The SOF cohort includes only US women, with few fracture events, and potential selection bias cannot be excluded. The evaluation metrics used have known limitations, especially under high censoring. No k-fold cross-validation was performed, and treatment history, a relevant confounder, was not available. The DCTM survival head was used as a fixed reference and not systematically compared with alternative architectures. Furthermore, while an extensive ablation study was performed, it does not isolate the contributions of every individual model component. ATSF variants including DXA imaging or BMD were also not evaluated. Visual attention maps are only an approximate visualization, as the underlying queries are freely learnable rather than pixel-bound, making them suitable for intuition but not precise localization. Moreover, the DL-baseline hyperparameters were selected manually based on validation performance and training stability, rather than through systematic automated hyperparameter optimization.

Potential avenues for future work are to address these limitations by improving generalizability through integration of additional cohorts such as MrOS~\cite{orwoll2005design,blank2005overview}, incorporating alternative or complementary evaluation metrics for more differentiated analysis, considering additional factors such as DXA-derived BMD and treatment history, and performing more detailed ablation studies to disentangle the contributions of individual model components.

\clearpage 
\midlacknowledgments{This project was supported by the modular AI Imaging Pipelines (mAIPipes) Grant, Application No. 22024025 KI-Förderrichtlinie Schleswig-Holstein, Germany. Furthermore, this work was supported by the IDIR-Project (Digital Implant Research), a cooperation financed by Kiel University, University Hospital Schleswig-Holstein and Helmholtz Zentrum Hereon.

The Study of Osteoporotic Fractures (SOF) is supported by National
Institutes of Health funding. The National Institute on Aging (NIA) provides support under
the following grant numbers: R01 AG005407, R01 AR35582, R01 AR35583, R01
AR35584, R01 AG005394, R01 AG027574, and R01 AG027576.
%
We also would like to thank Ida Häggström for her support on questions related to DCTMs.
}


\bibliography{midl26_231}

\clearpage
\appendix
\section{Theoretical Foundations of Survival Analysis}
\label{Appendix:Survival}
The central theoretical concepts of survival analysis~(SA) are the survival function $S_T(t)$ and the hazard function $h_T(t)$. The former describes the probability that the event has not occurred by time $t$, where $T$ denotes the random event time (assumed to be continuous here), and is formally defined as
\begin{equation}
\label{eq:survival_function}
    S_T(t):=P(T>t)=1-F_T(t).
\end{equation}
In this case, $F_T(t)$ describes the cumulative distribution function defined as $F_T(t):=P(T\leq t)$.
The hazard rate (or hazard function)
\begin{equation}
\label{eq:hazard_rate}
    h_T(t):=\lim_{\Delta t\to 0}\frac{1}{\Delta t}P(t\leq T<t+\Delta t |T \geq t)=\frac{f_T(t)}{S_T(t)}
\end{equation}
describes the instantaneous risk of the event occurring at time $t$, provided that it has not yet occurred by that time. It thus links the survival time density function $f_T(t)$ with the survival function $S_T(t)$ and provides a continuous, time-dependent representation of the risk.
The cumulative hazard function $H_T(t)$ is often used as an intermediate measure, which describes the total risk up to time $t$ and is calculated using
\begin{equation}
\label{eq:cum_hazard}
    H_T(t):= \int_{0}^{t}h_T(u)du=-\log(S_T(t)).
\end{equation}
This relationship illustrates the close functional connection between hazard and survival functions and forms a central basis for many parametric, semiparametric, and deep learning~(DL)-based methods of modern SA~\cite{collett2023modelling,wiegrebe2024deep,george2014survival}.

Data characteristics are crucial for selecting and developing suitable models in SA. The time dependence of covariates is particularly relevant: variables whose values change over time (e.g., lifestyle factors) are referred to as time-varying features (TVFs), while time-varying effects (TVEs) describe the time-varying influences of characteristics on risk or hazard rates. Both require models that go beyond the assumption of proportional hazards. Similarly, the dimensionality and modality of the data influence the choice of model. High-dimensional and multimodal data sets, such as those from clinical, molecular, and imaging sources, require specialized methods that can combine structured and unstructured information. Finally, the types of outcomes must also be taken into account: censoring (incompletely observed events), truncation (subjects are not part of the data set), competing risks, multi-state scenarios, and recurrent events pose different statistical challenges and require flexible modeling approaches~\cite{wiegrebe2024deep}.

To model the distribution of event times based on the entry time, exit time, event indicator, and a feature vector, various methodological approaches are used in SA~\cite{collett2023modelling,wiegrebe2024deep,george2014survival}. These can be broadly categorized into three main groups:
\begin{itemize}
    \item \textbf{Non-parametric methods:} Do not assume any specific distribution of survival times~\cite{collett2023modelling}. Typical examples include the Kaplan–Meier estimator~\cite{kaplan1958nonparametric} and the log-rank test~\cite{savage1956contributions}.
    \item \textbf{Semi-parametric methods:} Model the effect of covariates without assuming a specific baseline hazard. The Cox proportional hazards model~(CoxPH) estimates covariate effects via hazard ratios under the assumption of proportional hazards and uses the partial likelihood for parameter estimation~\cite{cox1972regression,george2014survival,wiegrebe2024deep}.
    \item \textbf{Parametric methods:} Assume a specific hazard or survival distribution~(e.g., exponential or Weibull) and include proportional and additive hazard models, accelerated failure time~(AFT) models, and piecewise exponential models~(PEMs)~\cite{collett2023modelling,george2014survival,wiegrebe2024deep}.
\end{itemize}

With the rapid advances in machine learning (ML) and DL, SA has significantly benefited from the increasing modeling capacity of these approaches. The integration of multimodal data sources and the growing emphasis on model interpretability have further contributed to the rising popularity of DL-based SA methods.
Early neural network–based extensions of classical statistical approaches emerged in the mid-1990s and continue to build upon traditional SA frameworks while leveraging the representational flexibility of modern architectures. Feedforward neural networks (FFNNs), for instance, enable flexible estimation of (semi-)parametric hazard functions and can account for TVFs. Convolutional neural networks (CNNs), often applied through transfer learning, facilitate the inclusion of unstructured data such as medical images, whereas recurrent neural networks (RNNs) are particularly suited for modeling temporal dependencies and longitudinal data. Autoencoder-based architectures are commonly used for dimensionality reduction and feature representation. \citet{wiegrebe2024deep} provide a comprehensive review of recent DL-based survival methods, systematically categorizing them according to model class, loss function, and parameterization, as well as their supported survival tasks and interpretability characteristics. Selected approaches are discussed in more detail in Section~\ref{sec:related_work}.


\section{Study of Osteoporotic Fractures (SOF): Cohort Characteristics}
\label{Appendix:SOF}

\begin{figure}[h]
  \centering
  \includegraphics[width=0.9\linewidth]{midl26_231_Data_Selection.png}
  \vspace{-2mm}
  \caption{Overview of the SOF data selection process used in this study.}
  \label{fig:data_selection}
\end{figure} 


%\vspace{-1cm}
\begin{sidewaystable}[htbp]
\tiny
\floatconts
  {tab:SOF_cohort}%
  {\caption{Dataset characteristics of the training, validation and test sets derived from SOF. Due to missing values for some variables, percentages do not always total 100\,\%.}}%
  {\begin{adjustbox}{right=22.5cm}%{left=32cm}       % right or left
  \begin{tabular}{l|rr|rr|rr|rr|rr}
  % \toprule
  \bfseries Characteristics & \multicolumn{2}{c|}{\bfseries Overall} & \multicolumn{2}{c|}{\bfseries Train} & \multicolumn{2}{c|}{\bfseries Validation} & \multicolumn{2}{c|}{\bfseries $\text{Test}_\text{complete}$} & \multicolumn{2}{c}{\bfseries $\text{Test}_\text{reduced}$}\\
  \hline
  % \midrule
  $n$ participants          & \multicolumn{2}{c|}{7,818\,\,\textcolor{gray}{(100.0\,\%)}}         & \multicolumn{2}{c|}{3,740\,\,\textcolor{gray}{(47.8\,\%)}}          & \multicolumn{2}{c|}{931\,\,\textcolor{gray}{(11.9\,\%)}}          & \multicolumn{2}{c|}{3,147\,\,\textcolor{gray}{(40.3\,\%)}}  & \multicolumn{2}{c}{1,969\,\,\textcolor{gray}{(25.2\,\%)}} \\
  Age (y)                   & \multicolumn{2}{c|}{$71.7\pm5.2\,\,\textcolor{gray}{[65,90]}$} & \multicolumn{2}{c|}{$71.7\pm5.3\,\,\textcolor{gray}{[65.0,90.0]}$} & \multicolumn{2}{c|}{$71.7\pm5.1\,\,\textcolor{gray}{[65.0,90.0]}$} & \multicolumn{2}{c|}{$71.6\pm5.1\,\,\textcolor{gray}{[65,90]}$}  & \multicolumn{2}{c}{$71.4\pm5.0\,\,\textcolor{gray}{[65,90]}$} \\
  BMI                       & \multicolumn{2}{c|}{$26.5\pm4.7\,\,\textcolor{gray}{[14.3,58.4]}$} & \multicolumn{2}{c|}{$26.4\pm4.6\,\,\textcolor{gray}{[14.3,49.7]}$} & \multicolumn{2}{c|}{$26.6\pm4.5\,\,\textcolor{gray}{[16.0,46.0]}$} & \multicolumn{2}{c|}{$26.6\pm4.8\,\,\textcolor{gray}{[15.2,58.4]}$} & \multicolumn{2}{c}{$26.6\pm4.7\,\,\textcolor{gray}{[15.2,58.4]}$} \\
  \hline
  & \multicolumn{1}{c}{\bfseries Yes} & \multicolumn{1}{c|}{\bfseries No} & \multicolumn{1}{c}{\bfseries Yes} & \multicolumn{1}{c|}{\bfseries No}& \multicolumn{1}{c}{\bfseries Yes} & \multicolumn{1}{c|}{\bfseries No} & \multicolumn{1}{c}{\bfseries Yes} & \multicolumn{1}{c|}{\bfseries No} & \multicolumn{1}{c}{\bfseries Yes} & \multicolumn{1}{c}{\bfseries No}\\\hline
  Smoking                   & 782\,\,\textcolor{gray}{(10.0\,\%)} & 7,011\,\,\textcolor{gray}{(89.7\,\%)} & 365\,\,\textcolor{gray}{(9.8\,\%)} & 3,366\,\,\textcolor{gray}{(90.0\,\%)} & 93\,\,\textcolor{gray}{(10.0\,\%)} & 833\,\,\textcolor{gray}{(89.5\,\%)} & 324\,\,\textcolor{gray}{(10.3\,\%)} & 2,812\,\,\textcolor{gray}{(89.4\,\%)} & 195\,\,\textcolor{gray}{(9.9\,\%)} & 1,774\,\,\textcolor{gray}{(90.1\,\%)} \\
  Drinking                  & 248\,\,\textcolor{gray}{(3.2\,\%)} & 7,565\,\,\textcolor{gray}{(96.8\,\%)} & 118\,\,\textcolor{gray}{(3.2\,\%)} & 3,620\,\,\textcolor{gray}{(96.8\,\%)} & 36\,\,\textcolor{gray}{(3.9\,\%)} & 895\,\,\textcolor{gray}{(96.1\,\%)} & 94\,\,\textcolor{gray}{(3.0\,\%)} & 3,050\,\,\textcolor{gray}{(96.9\,\%)} & 50\,\,\textcolor{gray}{(2.5\,\%)} & 1,919\,\,\textcolor{gray}{(97.5\,\%)}\\
  Previous Fracture         & 2,844\,\,\textcolor{gray}{(36.4\,\%)} & 4,928\,\,\textcolor{gray}{(63.0\,\%)} & 1,376\,\,\textcolor{gray}{(36.8\,\%)} & 2,338\,\,\textcolor{gray}{(62.5\,\%)} & 339\,\,\textcolor{gray}{(36.4\,\%)} & 587\,\,\textcolor{gray}{(63.1\,\%)} & 1,129\,\,\textcolor{gray}{(35.9\,\%)} & 2,003\,\,\textcolor{gray}{(63.6\,\%)} & 663\,\,\textcolor{gray}{(33.7\,\%)} & 1,306\,\,\textcolor{gray}{(66.3\,\%)}\\
  Rheumatoid Arthritis      & 505\,\,\textcolor{gray}{(6.5\,\%)} & 7,176\,\,\textcolor{gray}{(91.8\,\%)} & 229\,\,\textcolor{gray}{(6.1\,\%)} & 3,447\,\,\textcolor{gray}{(92.2\,\%)} & 64\,\,\textcolor{gray}{(6.9\,\%)} & 849\,\,\textcolor{gray}{(91.2\,\%)} & 212\,\,\textcolor{gray}{(6.7\,\%)} & 2,880\,\,\textcolor{gray}{(91.5\,\%)} & 129\,\,\textcolor{gray}{(6.6\,\%)} & 1,840\,\,\textcolor{gray}{(93.4\,\%)} \\
  Glucocorticoid Exposure & 896\,\,\textcolor{gray}{(11.5\,\%)} & 6,768\,\,\textcolor{gray}{(86.6\,\%)} & 436\,\,\textcolor{gray}{(11.7\,\%)} & 3,233\,\,\textcolor{gray}{(86.4\,\%)} & 115\,\,\textcolor{gray}{(12.4\,\%)} & 797\,\,\textcolor{gray}{(85.6\,\%)} & 345\,\,\textcolor{gray}{(11.0\,\%)} & 2,738\,\,\textcolor{gray}{(87.0\,\%)} & 227\,\,\textcolor{gray}{(11.5\,\%)} & 1,742\,\,\textcolor{gray}{(88.5\,\%)}\\
  Parental Hip Fracture & 879\,\,\textcolor{gray}{(11.2\,\%)} & 5,485\,\,\textcolor{gray}{(70.2\,\%)} & 427\,\,\textcolor{gray}{(11.4\,\%)} & 2,641\,\,\textcolor{gray}{(70.6\,\%)} & 113\,\,\textcolor{gray}{(12.1\,\%)} & 625\,\,\textcolor{gray}{(67.1\,\%)} & 339\,\,\textcolor{gray}{(10.8\,\%)} & 2,219\,\,\textcolor{gray}{(70.5\,\%)} & 266\,\,\textcolor{gray}{(13.5\,\%)} & 1,703\,\,\textcolor{gray}{(86.5\,\%)} \\
  \hline
  Hip Fracture              & 1,131\,\,\textcolor{gray}{(14.5\,\%)} & 6,687\,\,\textcolor{gray}{(85.5\,\%)} & 550\,\,\textcolor{gray}{(14.7\,\%)} & 3,190\,\,\textcolor{gray}{(85.3\,\%)} & 135\,\,\textcolor{gray}{(14.5\,\%)} & 796\,\,\textcolor{gray}{(85.5\,\%)} & 446\,\,\textcolor{gray}{(14.2\,\%)} & 2,701\,\,\textcolor{gray}{(85.8\,\%)} & 285\,\,\textcolor{gray}{(14.5\,\%)} & 1,684\,\,\textcolor{gray}{(85.5\,\%)} \\
  Time to Fracture (d)      & \multicolumn{2}{c|}{$5{\small,}120.3\pm2{\small,}275.2\,\,\textcolor{gray}{[6,\,8{\small,}535]}$} & \multicolumn{2}{c|}{$5{\small,}132.2\pm2{\small,}277.1\,\,\textcolor{gray}{[6,\,8{\small,}521]}$} & \multicolumn{2}{c|}{$5{\small,}081.0\pm2{\small,}286.4\,\,\textcolor{gray}{[6,\,8{\small,}520]}$} & \multicolumn{2}{c|}{$5{\small,}117.9\pm2{\small,}269.6\,\,\textcolor{gray}{[20,\,8{\small,}535]}$} & \multicolumn{2}{c}{$5{\small,}418.2\pm2{\small,}108.6\,\,\textcolor{gray}{[20,\,8{\small,}535]}$} \\
  \hline
  Hip Fracture \textless 10y & 499\,\,\textcolor{gray}{(6.4\,\%)} & 7,319\,\,\textcolor{gray}{(93.6\,\%)} & 238\,\,\textcolor{gray}{(6.4\,\%)} & 3,502\,\,\textcolor{gray}{(93.6\,\%)} & 62\,\,\textcolor{gray}{(6.7\,\%)} & 869\,\,\textcolor{gray}{(93.3\,\%)} & 199\,\,\textcolor{gray}{(6.3\,\%)} & 2,948\,\,\textcolor{gray}{(93.7\,\%)} & 115\,\,\textcolor{gray}{(5.8\,\%)} & 1,854\,\,\textcolor{gray}{(94.2\,\%)}\\
  Time to Fracture \textless 10y (d) & \multicolumn{2}{c|}{$2{\small,}155.1\pm980.6\,\,\textcolor{gray}{[6,\,3{\small,}650]}$} & \multicolumn{2}{c|}{$2{\small,}165.0\pm972.4\,\,\textcolor{gray}{[6,\,3{\small,}650]}$} & \multicolumn{2}{c|}{$2{\small,}132.7\pm1{\small,}020.3\,\,\textcolor{gray}{[6,\,3{\small,}648]}$} & \multicolumn{2}{c|}{$2{\small,}150.2\pm977.9\,\,\textcolor{gray}{[20,\,3{\small,}642]}$} & \multicolumn{2}{c}{$2{\small,}378.6\pm818.9\,\,\textcolor{gray}{[20,\,3{\small,}642]}$} \\
  \end{tabular}
  \end{adjustbox}}
\end{sidewaystable}

\clearpage
\section{ATSF Hyperparameters}
\label{Appendix:Hyperparameters}

\begin{table}[h!]
\floatconts
  {tab:hyperparameters}%
  {\caption{Overview of the optimal hyperparameters obtained from the HPO across all ATSF model configurations.}}%
  {\begin{tabular}{lccccccc}
  \toprule
  &&\multicolumn{3}{c}{\bfseries Image}&\multicolumn{3}{c}{\bfseries MM}\\
  \cmidrule(lr){3-5} \cmidrule(lr){6-8}
   \bfseries Hyperparameter &
   \bfseries CRF &
   \bfseries L &
   \bfseries LR &
   \bfseries Proj &
   \bfseries L &
   \bfseries LR &
   \bfseries Proj \\
  \midrule
   Learning Rate $r$              &2.4e-7&2.0e-7&6.4e-7&1.1e-6&1.7e-7&7.2e-7&1.8e-6\\
   $p$ Affine                     &-&0.5&0.5&0.0&0.8&0.5&0.8\\
   $p$ Contrast                   &-&0.0&0.0&0.5&0.0&0.0&0.5 \\
   $p$ Gaussian Noise             &-&0.0&0.0&0.9&0.9&0.5&0.5\\
   $p$ Gaussian Smooth            &-&0.3&0.3&0.5&0.0&0.3&0.0\\
   $p$ Zoom                       &-&0.8&0.5&0.5&0.5&0.5&0.5\\ \hline
   $n$ Bernstein Polynomials      &10&12&10&12&10&14&12\\ \hline
   $n$ Fusion Layers              &-&-&-&-&1&3&6\\
   $n$ Attention Heads            &-&-&-&-&16&8&16\\
   $n$ Q-Former Tokens            &-&-&-&-&32&64&32\\
  \bottomrule
  \end{tabular}}
\end{table}

\section{Baseline Hyperparameters}
\label{Appendix:Baseline_Hyperparameters}

\begin{table}[h!]
\floatconts
  {tab:baseline_hyperparameters}%
  {\caption{Overview of the hyperparameters used for the DL-baseline models.}}%
  {\begin{tabular}{lc}
  \toprule
  \bfseries Hyperparameter & \bfseries Value \\
  \midrule
  Learning Rate $r$              & 7.2e-5 \\
  $p$ Affine                     & 0.8 \\
  $p$ Contrast                   & 0.0 \\
  $p$ Gaussian Noise             & 0.5 \\
  $p$ Gaussian Smooth            & 0.5 \\
  $p$ Zoom                       & 0.5 \\ \hline
  $n$ Bernstein Polynomials      & 12 \\ 
  \bottomrule
  \end{tabular}}
\end{table}

\clearpage





\section{Additional Results}
\label{Appendix:Additional_Results}


\begin{table}[h]
\floatconts
  {tab:results_test_reduced_5y}%
  {\caption{Performance comparison of all models on $\text{Test}_\text{reduced}$ using C-index, AUC, and their respective 95\,\% confidence intervals~(5-year time horizon). $\text{CRF}_{\text{BMD}}$ denotes inclusion of BMD. For multimodal~(MM) and image-only models, subscripts indicate the radiograph configuration: L for unilateral~(left hip), LR for bilateral~(left \& mirrored right hip), and Proj for the MedVAE-derived full-pelvis projection.}}%
  {\begin{tabular}{llcc}
  \toprule
   \bfseries Model & \bfseries Modality  &  $\uparrow$ \bfseries C-Index & $\uparrow$ \bfseries AUC\\
   %Name & Details & Train & Val & Train& Val\\\hline
   \hline
   

   % ----- FRAX -----
   \multirow{2}{*}{$\text{FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}}}$} 
       & CRF      
       &  0.675 \textcolor{gray}{(0.556\,-\,0.771)}
       & 0.680 \textcolor{gray}{(0.576\,-\,0.772)}\\
       
       & $\text{CRF}_{\text{BMD}}$ 
       &  0.751 \textcolor{gray}{(0.663\,-\,0.829)} 
       & 0.755 \textcolor{gray}{(0.670\,-\,0.831)} \\\hline

   % ----- CoxPH -----
   \multirow{2}{*}{CoxPH}
       & CRF       
       &  0.681 \textcolor{gray}{(0.579\,-\,0.779)} 
       &  0.687 \textcolor{gray}{(0.586\,-\,0.779)} \\
       
       & $\text{CRF}_{\text{BMD}}$   
       &  0.764 \textcolor{gray}{(0.667\,-\,0.840)} 
       &  0.769 \textcolor{gray}{(0.685\,-\,0.845)} \\\hline

   % ----- DL-Baseline -----
   \multirow{3}{*}{DL-Baseline}
       & $\text{MM}_\text{L}$      &  0.705 \textcolor{gray}{(0.604\,-\,0.796)} &  0.707 \textcolor{gray}{(0.604\,-\,0.796)}\\
       & $\text{MM}_\text{LR}$     &  0.721 \textcolor{gray}{(0.636\,-\,0.798)} &  0.727 \textcolor{gray}{(0.641\,-\,0.802)}\\
       & $\text{MM}_\text{Proj}$   &  0.706 \textcolor{gray}{(0.627\,-\,0.785)} &  0.714 \textcolor{gray}{(0.629\,-\,0.789)}\\\hline

   % ----- ATS Fusion Model -----
   \multirow{7}{*}{ATSF (Ours)}
       & CRF                 &  0.696 \textcolor{gray}{(0.600\,-\,0.784)} &  0.702 \textcolor{gray}{(0.608\,-\,0.790)}\\
       & $\text{Image}_\text{L}$   &  0.775 \textcolor{gray}{(0.689\,-\,0.855)} &  0.778 \textcolor{gray}{(0.684\,-\,0.863)}\\
       & $\text{Image}_\text{LR}$  &  0.758 \textcolor{gray}{(0.679\,-\,0.827)} &  0.761 \textcolor{gray}{(0.682\,-\,0.831)}\\
       & $\text{Image}_\text{Proj}$&  0.748 \textcolor{gray}{(0.679\,-\,0.813)} &  0.753 \textcolor{gray}{(0.682\,-\,0.817)}\\
       & $\text{MM}_\text{L}$   &  0.782 \textcolor{gray}{(0.701\,-\,0.853)} &  0.785 \textcolor{gray}{(0.699\,-\,0.860)}\\
       & $\text{MM}_\text{LR}$  &  \textbf{0.808*} \textcolor{gray}{(0.726\,-\,0.881)} &  \textbf{0.813} \textcolor{gray}{(0.732\,-\,0.882)}\\
       & $\text{MM}_\text{Proj}$&  0.758 \textcolor{gray}{(0.683\,-\,0.827)} &  0.762 \textcolor{gray}{(0.691\,-\,0.829)}\\
    \bottomrule
  \end{tabular}}
\end{table}


\begin{table}[ht]
\floatconts
  {tab:results_test_complete_10y}%
  {\caption{Performance comparison of all models on $\text{Test}_\text{complete}$ using C-index, AUC, and their respective 95\,\% confidence intervals~(10-year time horizon). $\text{CRF}_{\text{BMD}}$ denotes inclusion of BMD. For multimodal~(MM) and image-only models, subscripts indicate the radiograph configuration: L for unilateral~(left hip), LR for bilateral~(left and mirrored right hip), and Proj for the MedVAE-derived full-pelvis projection. For $\text{FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}}}$ and CoxPH, missing categorical variables were set to 0. Missing continuous variables in the CoxPH model were imputed by the mean.}}%
  {\begin{tabular}{llcc}
  \toprule
   \bfseries Model & \bfseries Modality  &  $\uparrow$ \bfseries C-Index & $\uparrow$ \bfseries AUC\\
   %Name & Details & Train & Val & Train& Val\\\hline
   \hline

            % ----- FRAX -----
  \multirow{2}{*}{   $\text{FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}}}$} 
       &    CRF                        
       &    0.708 \textcolor{gray}{(0.666\,-\,0.749)} 
       &   0.724 \textcolor{gray}{(0.680\,-\,0.767)} \\
       
       &   $\text{CRF}_{\text{BMD}}$  
       &    0.746 \textcolor{gray}{(0.708\,-\,0.784)} 
       &   0.760 \textcolor{gray}{(0.721\,-\,0.797)} \\\hline


         \multirow{2}{*}{  CoxPH}
       &   CRF                         
       &    0.719 \textcolor{gray}{(0.675\,-\,0.760)} 
       &    0.740 \textcolor{gray}{(0.695\,-\,0.782)} \\
       
       &   $\text{CRF}_{\text{BMD}}$   
       &    0.757 \textcolor{gray}{(0.718\,-\,0.796)} 
       &    0.777 \textcolor{gray}{(0.739\,-\,0.813)} \\  \hline

   % ----- DL-Baseline -----
   \multirow{3}{*}{DL-Baseline}
       & $\text{MM}_\text{L}$      &  0.717 \textcolor{gray}{(0.673\,-\,0.759)} &  0.740 \textcolor{gray}{(0.697\,-\,0.782)}\\
       & $\text{MM}_\text{LR}$     &  0.741 \textcolor{gray}{(0.702\,-\,0.780)} &  0.765 \textcolor{gray}{(0.722\,-\,0.804)}\\
       & $\text{MM}_\text{Proj}$   &  0.731 \textcolor{gray}{(0.691\,-\,0.770)} &  0.754 \textcolor{gray}{(0.713\,-\,0.793)}\\\hline

   % ----- ATS Fusion Model -----
   \multirow{7}{*}{ATSF (Ours)}
       & CRF                 &  0.721 \textcolor{gray}{(0.677\,-\,0.763)} &  0.742 \textcolor{gray}{(0.698\,-\,0.784)}\\
       & $\text{Image}_\text{L}$   &  0.761 \textcolor{gray}{(0.721\,-\,0.800)} &  0.779 \textcolor{gray}{(0.738\,-\,0.818)}\\
       & $\text{Image}_\text{LR}$  &  0.757 \textcolor{gray}{(0.715\,-\,0.795)} &  0.771 \textcolor{gray}{(0.728\,-\,0.812)}\\
       & $\text{Image}_\text{Proj}$&  0.708 \textcolor{gray}{(0.667\,-\,0.748)} &  0.724 \textcolor{gray}{(0.680\,-\,0.765)}\\
       & $\text{MM}_\text{L}$ &  0.771 \textcolor{gray}{(0.731\,-\,0.811)} &  0.789 \textcolor{gray}{(0.747\,-\,0.829)}  \\
       & $\text{MM}_\text{LR}$  &  \textbf{0.800*} \textcolor{gray}{(0.764\,-\,0.834)} &  \textbf{0.821} \textcolor{gray}{(0.786\,-\,0.856)}\\
       & $\text{MM}_\text{Proj}$&  0.736 \textcolor{gray}{(0.698\,-\,0.774)} &  0.756 \textcolor{gray}{(0.715\,-\,0.796)}\\
    \bottomrule
  \end{tabular}}
\end{table}


\begin{table}[ht]
\floatconts
  {tab:results_test_complete_5y}%
   {\caption{Performance comparison of all models on $\text{Test}_\text{complete}$ using C-index, AUC, and their respective 95\,\% confidence intervals~(5-year time horizon). $\text{CRF}_{\text{BMD}}$ denotes inclusion of BMD. For multimodal~(MM) and image-only models, subscripts indicate the radiograph configuration: L for unilateral~(left hip), LR for bilateral~(left and mirrored right hip), and Proj for the MedVAE-derived full-pelvis projection. For $\text{FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}}}$ and CoxPH, missing categorical variables were set to 0. Missing continuous variables in the CoxPH model were imputed by the mean.}}%
  {\begin{tabular}{llcc}
  \toprule
   \bfseries Model & \bfseries Modality  &  $\uparrow$ \bfseries C-Index & $\uparrow$ \bfseries AUC\\
   %Name & Details & Train & Val & Train& Val\\\hline
   \hline

         % ----- FRAX -----
     \multirow{2}{*}{$\text{FRAX\textsuperscript{\resizebox{0.6em}{!}{\textregistered}}}$} 
       &    CRF                        
       &     0.683 \textcolor{gray}{(0.597\,-\,0.767)} 
       &    0.687 \textcolor{gray}{(0.599\,-\,0.771)}\\
       
       &    $\text{CRF}_{\text{BMD}}$  
       &     0.741 \textcolor{gray}{(0.664\,-\,0.815)} 
       &    0.744 \textcolor{gray}{(0.668\,-\,0.815)} \\\hline 

   % ----- CoxPH -----
            \multirow{2}{*}{CoxPH}
       &    CRF                         
       &     0.692 \textcolor{gray}{(0.605\,-\,0.776)} 
       &     0.692 \textcolor{gray}{(0.603\,-\,0.769)} \\
       
       &    $\text{CRF}_{\text{BMD}}$   
       &     0.744 \textcolor{gray}{(0.655\,-\,0.821)} 
       &     0.748 \textcolor{gray}{(0.667\,-\,0.824)} \\  \hline

   % ----- DL-Baseline -----
   \multirow{3}{*}{DL-Baseline}
       & $\text{MM}_\text{L}$      &  0.702 \textcolor{gray}{(0.618\,-\,0.784)} &  0.708 \textcolor{gray}{(0.625\,-\,0.787)}\\
       & $\text{MM}_\text{LR}$     &  0.729 \textcolor{gray}{(0.656\,-\,0.794)} &  0.733 \textcolor{gray}{(0.659\,-\,0.802)}\\
       & $\text{MM}_\text{Proj}$   &  0.723 \textcolor{gray}{(0.654\,-\,0.789)} &  0.728 \textcolor{gray}{(0.654\,-\,0.794)}\\\hline

   % ----- ATS Fusion Model -----
   \multirow{7}{*}{ATSF (Ours)}
       & CRF                 &  0.695 \textcolor{gray}{(0.614\,-\,0.778)} &  0.699 \textcolor{gray}{(0.608\,-\,0.781)}\\
       & $\text{Image}_\text{L}$   &  0.772 \textcolor{gray}{(0.690\,-\,0.846)} &  0.776 \textcolor{gray}{(0.691\,-\,0.853)}\\
       & $\text{Image}_\text{LR}$  &  0.758 \textcolor{gray}{(0.679\,-\,0.827)} &  0.761 \textcolor{gray}{(0.682\,-\,0.831)}\\
       & $\text{Image}_\text{Proj}$&  0.752 \textcolor{gray}{(0.689\,-\,0.813)} &  0.756 \textcolor{gray}{(0.686\,-\,0.819)}\\
       & $\text{MM}_\text{L}$   &  0.783 \textcolor{gray}{(0.707\,-\,0.852)} &  0.787 \textcolor{gray}{(0.707\,-\,0.852)}\\
       & $\text{MM}_\text{LR}$  &  \textbf{0.817*} \textcolor{gray}{(0.745\,-\,0.880)} &  \textbf{0.820} \textcolor{gray}{(0.746\,-\,0.884)}\\
       & $\text{MM}_\text{Proj}$&  0.756 \textcolor{gray}{(0.688\,-\,0.819)} &  0.758 \textcolor{gray}{(0.689\,-\,0.820)}\\
    \bottomrule
  \end{tabular}}
\end{table}



\begin{table}[ht]
\floatconts
  {tab:results_test_complete_impute_5y}%
  {\caption{Performance comparison of different missing-value strategies for the DL-Baseline~($\text{MM}_\text{LR}$) on $\text{Test}_\text{complete}$ using C-index, AUC, and their respective 95\,\% confidence intervals~(5-year time horizon).}}%
  {\begin{tabular}{lllcc}
  \toprule
   \bfseries Model & \bfseries Modality  & \bfseries Imputation & $\uparrow$ \bfseries C-Index & $\uparrow$ \bfseries AUC\\
   %Name & Details & Train & Val & Train& Val\\\hline
   \hline

   % ----- DL-Baseline -----
   \multirow{4}{*}{DL-Baseline}
       & $\text{MM}_\text{LR}$   & $-1$  &  0.729 \textcolor{gray}{(0.656\,-\,0.794)} &  0.733 \textcolor{gray}{(0.659\,-\,0.802)}\\
       & $\text{MM}_\text{LR}$   & Mean &  0.739 \textcolor{gray}{(0.659\,-\,0.813)} &  0.745 \textcolor{gray}{(0.664\,-\,0.817)}\\
       & $\text{MM}_\text{LR}$   & Median &  0.732 \textcolor{gray}{(0.649\,-\,0.809)} &  0.738 \textcolor{gray}{(0.654\,-\,0.812)}\\
      % & $\text{MM}_\text{LR}$   & Embedding &  0.728 \textcolor{gray}{(0.648\,-\,0.801)} &  0.734 \textcolor{gray}{(0.651\,-\,0.807)}\\
       & $\text{MM}_\text{LR}$   & Embedding &  0.733 \textcolor{gray}{(0.655\,-\,0.805)} &  0.738 \textcolor{gray}{(0.657\,-\,0.805)}\\
    \bottomrule
  \end{tabular}}
\end{table} 

\begin{table}[ht]
\floatconts
  {tab:results_test_complete_impute_10y}%
  {\caption{Performance comparison of different missing-value strategies for the DL-Baseline~($\text{MM}_\text{LR}$) on $\text{Test}_\text{complete}$ using C-index, AUC, and their respective 95\,\% confidence intervals~(10-year time horizon).}}%
  {\begin{tabular}{lllcc}
  \toprule
   \bfseries Model & \bfseries Modality  & \bfseries Imputation & $\uparrow$ \bfseries C-Index & $\uparrow$ \bfseries AUC\\
   %Name & Details & Train & Val & Train& Val\\\hline
   \hline

   % ----- DL-Baseline -----
   \multirow{4}{*}{DL-Baseline}
          & $\text{MM}_\text{LR}$ & $-1$    &  0.741 \textcolor{gray}{(0.702\,-\,0.780)} &  0.765 \textcolor{gray}{(0.722\,-\,0.804)}\\
       & $\text{MM}_\text{LR}$   & Mean &  0.739 \textcolor{gray}{(0.694\,-\,0.781)} &  0.758 \textcolor{gray}{(0.714\,-\,0.799)}\\
       & $\text{MM}_\text{LR}$   & Median &  0.753 \textcolor{gray}{(0.707\,-\,0.796)} &  0.777 \textcolor{gray}{(0.731\,-\,0.820)}\\
       %& $\text{MM}_\text{LR}$   & Embedding &  0.744 \textcolor{gray}{(0.700\,-\,0.786)} &  0.765 \textcolor{gray}{(0.720\,-\,0.808)}\\
       & $\text{MM}_\text{LR}$   & Embedding &  0.749 \textcolor{gray}{(0.705\,-\,0.790)} &  0.772 \textcolor{gray}{(0.727\,-\,0.814)}\\
    \bottomrule
  \end{tabular}}
\end{table}



\begin{table}[ht]
\floatconts
  {tab:results_test_reduced_impute_5y}%
  {\caption{Performance comparison of different missing-value strategies for the DL-Baseline~($\text{MM}_\text{LR}$) on $\text{Test}_\text{reduced}$ using C-index, AUC, and their respective 95\,\% confidence intervals~(5-year time horizon).}}%
  {\begin{tabular}{lllcc}
  \toprule
   \bfseries Model & \bfseries Modality  & \bfseries Imputation & $\uparrow$ \bfseries C-Index & $\uparrow$ \bfseries AUC\\
   %Name & Details & Train & Val & Train& Val\\\hline
   \hline

   % ----- DL-Baseline -----
   \multirow{4}{*}{DL-Baseline}
            & $\text{MM}_\text{LR}$ & $-1$    &  0.721 \textcolor{gray}{(0.636\,-\,0.798)} &  0.727 \textcolor{gray}{(0.641\,-\,0.802)}\\
       & $\text{MM}_\text{LR}$   & Mean &  0.739 \textcolor{gray}{(0.672\,-\,0.804)} &  0.744 \textcolor{gray}{(0.670\,-\,0.808)}\\
       & $\text{MM}_\text{LR}$   & Median &  0.746 \textcolor{gray}{(0.678\,-\,0.809)} &  0.751 \textcolor{gray}{(0.680\,-\,0.813)}\\
      % & $\text{MM}_\text{LR}$   & Embedding &  0.713 \textcolor{gray}{(0.643\,-\,0.782)} &  0.717 \textcolor{gray}{(0.640\,-\,0.787)}\\
       & $\text{MM}_\text{LR}$   & Embedding  &  0.744 \textcolor{gray}{(0.678\,-\,0.806)} &  0.748 \textcolor{gray}{(0.678\,-\,0.810)}\\
    \bottomrule
  \end{tabular}}
\end{table}

\begin{table}[ht]
\floatconts
  {tab:results_test_reduced_impute_10y}%
  {\caption{Performance comparison of different missing-value strategies for the DL-Baseline~($\text{MM}_\text{LR}$) on $\text{Test}_\text{reduced}$ using C-index, AUC, and their respective 95\,\% confidence intervals~(10-year time horizon).}}%
  {\begin{tabular}{lllcc}
  \toprule
   \bfseries Model & \bfseries Modality  & \bfseries Imputation & $\uparrow$ \bfseries C-Index & $\uparrow$ \bfseries AUC\\
   %Name & Details & Train & Val & Train& Val\\\hline
   \hline

   % ----- DL-Baseline -----
   \multirow{4}{*}{DL-Baseline}
     & $\text{MM}_\text{LR}$ & $-1$    &  0.752 \textcolor{gray}{(0.706\,-\,0.796)} &  0.776 \textcolor{gray}{(0.731\,-\,0.818)}\\
       & $\text{MM}_\text{LR}$   & Mean &  0.722 \textcolor{gray}{(0.683\,-\,0.763)} &  0.741 \textcolor{gray}{(0.700\,-\,0.781)}\\
       & $\text{MM}_\text{LR}$   & Median &  0.740 \textcolor{gray}{(0.699\,-\,0.779)} &  0.763 \textcolor{gray}{(0.721\,-\,0.803)}\\
          % & $\text{MM}_\text{LR}$   & Embedding &  0.711 \textcolor{gray}{(0.671\,-\,0.753)} &  0.732 \textcolor{gray}{(0.689\,-\,0.774)}\\
       & $\text{MM}_\text{LR}$   & Embedding  &  0.734 \textcolor{gray}{(0.696\,-\,0.775)} &  0.756 \textcolor{gray}{(0.715\,-\,0.796)}\\
    \bottomrule
  \end{tabular}}
\end{table}


\FloatBarrier


\begin{figure}[ht]
\centering
\subfigure[CoxPH $\text{CRF}_{\text{BMD}}$]{
  \includegraphics[width=0.47\linewidth]{midl26_231_KM_CoxPH_BMD_10.png}
  \label{fig:KM_10_CoxPH}
}
\hfill
\subfigure[ATSF $\text{MM}_{\text{LR}}$]{
  \includegraphics[width=0.47\linewidth]{midl26_231_KM_ATSF_MM_LR_10.png}
  \label{fig:KM_10_ATSF}
}
\caption{Kaplan–Meier curves for CoxPH $\text{CRF}_{\text{BMD}}$ (a) and ATSF $\text{MM}_{\text{LR}}$ (b) over a 10-year time horizon on $\text{Test}_\text{reduced}$. Participants were stratified into quartiles based on each model’s predicted risk (Q1 = lowest risk, Q4 = highest risk). Semi-transparent areas denote 95\,\% confidence intervals.}
\label{fig:KM_10}
\end{figure}


\begin{figure}[ht]
\centering
\subfigure[CoxPH $\text{CRF}_{\text{BMD}}$]{
  \includegraphics[width=0.47\linewidth]{midl26_231_KM_CoxPH_BMD_5.png}
  \label{fig:KM_5_CoxPH}
}
\hfill
\subfigure[ATSF $\text{MM}_{\text{LR}}$]{
  \includegraphics[width=0.47\linewidth]{midl26_231_KM_ATSF_MM_LR_5.png}
  \label{fig:KM_5_ATSF}
}
\caption{Kaplan–Meier curves for CoxPH $\text{CRF}_{\text{BMD}}$ (a) and ATSF $\text{MM}_{\text{LR}}$ (b) over a 5-year time horizon on $\text{Test}_\text{reduced}$. Participants were stratified into quartiles based on each model’s predicted risk (Q1 = lowest risk, Q4 = highest risk). Semi-transparent areas denote 95\,\% confidence intervals.}
\label{fig:KM_5}
\end{figure}



\end{document}
