\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{booktabs}
\usepackage{multirow}

% \setlength{\parskip}{2pt}  % space between paragraphs

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- 219}
\editors{Accepted for publication at MIDL 2026}

\usepackage{xcolor}
\newcommand{\rev}[1]{\textcolor{black}{#1}}

\title[HAF]{Heterogeneous Aligned Fusion for Survival \rev{Classification} with Missing Modalities}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{
\Name{Zheng Zheng\nametag{$^1$}} \orcid{0009-0004-9660-1739} \Email{zxz7934@mavs.uta.edu}\\
\Name{Yuzhi Guo\nametag{$^1$}} \Email{yuzhi.guo@mavs.uta.edu}\\
\Name{Xiao Hu\nametag{$^1$}} \Email{xxh3416@mavs.uta.edu}\\
\Name{Yuwei Miao\nametag{$^1$}} \Email{yxm9326@mavs.uta.edu}\\
\Name{Hehuan Ma\nametag{$^1$}} \Email{hehuan.ma@mavs.uta.edu}\\
\Name{Jean Gao\nametag{$^1$}} \Email{gao@uta.edu}\\
\Name{Junzhou Huang\nametag{$^1$}} \Email{jzhuang@uta.edu} \\
\addr $^{1}$ University of Texas at Arlington, Arlington, TX, USA \\
}

\begin{document}

\maketitle

\begin{abstract}
Accurate survival \rev{classification} is essential for guiding personalized treatment in head and neck cancer. Heterogeneous biomedical data, from histopathology to clinical and laboratory measurements, offer complementary prognostic value but differ in dimensionality, reside in incompatible feature spaces, and are frequently missing, making robust multimodal learning challenging.
To address this, we propose \textbf{HAF (Heterogeneous Aligned Fusion)}, a three-stage framework for survival \rev{classification} under heterogeneous and incomplete multimodal inputs. HAF (i) uses detachment and prognostic supervision to obtain stable representations, (ii) performs lightweight global alignment that projects all modalities into a shared
latent space while preserving patient-level discriminability, and (iii) enforces monotonic robust fusion that encourages performance to remain stable or improve when modalities are
added. To the best of our knowledge, HAF is the first approach that jointly leverages all seven modalities in the HANCOCK cohort. Extensive comparisons against representative late-, early-, attention-based, and bilinear-interaction fusion methods demonstrate that HAF consistently improves both accuracy and robustness under heterogeneous and partially missing modalities. Code is released at \url{https://github.com/zz9tf/HAF.git}.
\end{abstract}

\begin{keywords}
Heterogeneous Aligned Fusion (HAF), multimodal learning, head and neck cancer, survival \rev{classification}, pathology imaging, MIL
\end{keywords}

\section{Introduction}

Accurate survival \rev{classification} is central to precision oncology, enabling
risk-adaptive decision making for patients with head and neck squamous cell
carcinoma (HNSCC)~\cite{tian2025multimodal}. Modern cohorts combine
high-dimensional pathology imaging with heterogeneous clinical and laboratory
measurements, which differ in dimensionality, reside in incompatible feature
spaces, and are often missing~\cite{wissel2023systematic,li2024review,
multimodalFusionNotBetter,alignmentIsNecessary,incompleteMultimodalData,
aly2023outcome,wu2024multimodal,reza2024robust}. This makes robust multimodal learning particularly challenging, \rev{as both representation compatibility and modality availability become fundamental bottlenecks.}

A natural starting point for multimodal survival \rev{classification} is to build strong unimodal predictors. Pathology-only systems such as CLAM-style MIL models~\cite{tian2025multimodal} provide reliable prognostic signals, yet treat each modality independently and therefore cannot exploit complementary clinical or laboratory information. 

Fusion-based multimodal designs address this limitation but introduce new
challenges. These designs span different abstraction levels, \rev{including
early fusion~\cite{jaegle2021perceiver} that concatenates heterogeneous features at the data level,
late fusion~\cite{tian2025multimodal} that aggregates modality-specific predictions,
attention-based fusion~\cite{raza2025ps3,dang2024mfmf} that models cross-modality dependencies,
and bilinear interaction frameworks~\cite{benkirane2025multimodal} that explicitly capture pairwise
cross-modality interactions.}
However, they implicitly assume well-aligned modality embeddings. In reality,
heterogeneous modalities remain misaligned~\cite{li2024review,
multimodalFusionNotBetter, alignmentIsNecessary}, allowing low-quality channels
to propagate misleading signals that degrade the fused representation.

\rev{A parallel line of work introduces explicit cross-modality alignment,
e.g., contrastive learning or subspace matching~\cite{radford2021learning,
cicchetti2024gramian,kamboj2025towards}. However, these methods typically assume
complete modality availability and do not address missing-modality scenarios.
Another line of work focuses on robustness under missing modalities by training with
random modality dropping or monotonicity losses~\cite{aly2023outcome,wu2024multimodal,
reza2024robust,randomDrop}, but these approaches operate in unaligned feature
spaces and ignore structured cross-modal relationships. In clinical practice, however, multimodal data are rarely complete or equally reliable.
Different modalities may vary substantially in acquisition quality, availability, and diagnostic relevance across patients, making naive fusion fragile in real-world settings.}

\rev{These limitations suggest that reliable multimodal survival classification requires: (i) robustness under missing modalities, (ii) aligned latent geometry across heterogeneous modalities, and (iii) monotonic behavior such that incorporating additional modalities does not degrade performance.
To operationalize these observations, we propose HAF (Heterogeneous Aligned Fusion), a three-stage framework that jointly enforces cross-modality alignment and monotonic robust fusion. Specifically, HAF decomposes multimodal survival learning into three stages:
(i) \textit{decoupled representation learning}, 
which detaches the outputs between stages to decouple representation learning, alignment, and fusion objectives, stabilizing optimization and preventing cross-objective gradient interference;
(ii) \textit{global latent alignment}, which maps heterogeneous modalities into a shared low-rank patient space;
and (iii) \textit{monotonic robust fusion}, which enforces structured modality substitution under missing-modality conditions.
}

We evaluate HAF on the HANCOCK dataset~\cite{dorrich2025multimodal}, which provides an unprecedented multimodal setting combining whole-slide imaging (WSI), tissue microarrays (TMA), and five structured clinical descriptors, capturing complementary aspects of tumor biology. \rev{In this work, we formulate this survival classification as a binary classification problem following the official target definition of the HANCOCK cohort.} In evaluation, HAF demonstrates stable optimization, improved cross-modality compatibility, and strong robustness under missing or noisy inputs, outperforming representative late-, early-, and attention-based baselines. In conclusion, HAF offers a principled view of multimodal interaction: unimodal semantics are stabilized, modalities are aligned into a substitutable shared geometry, and fusion remains reliable even under realistic missing-modality conditions. 

\begin{figure*}[!th]
    \centering
    \includegraphics[width=1\textwidth]{pipeline.png}
    \vspace{-20pt}
    \caption{\textbf{Overview of the HAF framework.}
\rev{Stage 1 learns stable unimodal embeddings under detached supervision.
Stage 2 globally aligns modality embeddings into a shared space.
Stage 3 fuses aligned features with random-modality drop training and a monotonicity constraint, promoting robustness under missing modalities.}}
    \vspace{-12pt}
    \label{fig:pipeline}
\end{figure*}

\vspace{-16pt}

\section{Related Work}
\label{sec:related}

\subsection{\rev{Multimodal fusion paradigms for survival classification}}
\rev{Recent multimodal survival approaches can be broadly discussed under several representative fusion paradigms, including:
(A) \textit{Early fusion}, which concatenates heterogeneous modality representations into a single joint feature vector before prediction;
(B) \textit{Late fusion}, as in MDLM~\cite{tian2025multimodal}, which first produces modality-specific predictions and then aggregates these predictions at the decision level;
(C) \textit{Attention-based fusion}, where methods such as PS3~\cite{raza2025ps3} perform cross-modal attention directly over modality representations to model feature-level dependencies. 
Within this family, MFMF~\cite{dang2024mfmf} adopts a distinct asymmetric strategy, using auxiliary modalities to compute attention over a single target modality; and
(D) \textit{Bilinear interaction frameworks}, exemplified by CustOmics~\cite{benkirane2025multimodal}, which explicitly model pairwise multiplicative interactions between modality representations.}

\subsection{\rev{Cross-modality alignment}}
\rev{Recent work has shown that aligning modalities before fusion can yield more semantically coherent representations, as demonstrated in CLIP-style contrastive models and cross-aligned fusion frameworks~\cite{radford2021learning,rajora2025cross,zhao2025clip}. 
Contrastive alignment, however, requires curated modality pairs and becomes inefficient as the number of modalities increases. To address this limitation, recent approaches shift toward \emph{geometry-driven alignment}, which aligns modalities through a shared latent structure without requiring paired supervision, typically via SVD-based formulations~\cite{cicchetti2024gramian,kamboj2025towards,liu2026principled}. }
% However, these methods have rarely been explored in clinical settings with systematic modality missingness and typically consider only a small number of modalities, leaving large-scale heterogeneous alignment and the geometry of the resulting alignment space largely unexplored.}

\subsection{\rev{Missing-modality robustness}}
\rev{Missing data is pervasive in clinical workflows where imaging, assays, or laboratory tests may be absent for logistical or cost-related reasons~\cite{incompleteMultimodalData,aly2023outcome,wu2024multimodal,reza2024robust}.
Generative imputation strategies~\cite{hao2024synthetic,liang2022mind,qin2023cross} attempt to synthesize absent modalities but can introduce hallucinations or low-fidelity artifacts~\cite{sun2024ai}, making reliability difficult to guarantee.
Random-modality training and stochastic gating~\cite{randomDrop,wei2025boosting} provide an alternative by encouraging models to remain predictive even when some modalities are dropped during training. }
% However, these robustness strategies are rarely examined under a shared representation space, leaving the structure and stability of cross-modality interactions largely unexplored.}


\section{Methods}
\label{sec:methods}

\rev{Building on prior work in multimodal fusion, alignment, and robustness, we observe that existing approaches are not explicitly designed to handle systematic modality missingness, modality-quality variability, and the geometry of a shared alignment space in clinical settings.
In addition, this work focuses on fusion strategies operating at the representation level, where each modality is first encoded into a fixed-dimensional embedding}\footnote{\rev{``Fixed-dimensional'' refers to a unified embedding space rather than frozen encoders; modality encoders remain trainable through lightweight adaptation modules.}} \rev{before fusion.
We select and reimplement baselines from each paradigm for representation-level fusion strategy comparison, including MDLM for late fusion, PS3 and MFMF for attention-based fusion, and CustOmics for bilinear interaction. This design is motivated by the substantial heterogeneity in raw data formats across modalities, which makes direct end-to-end fusion difficult to compare in a controlled manner.}

% \subsection{Multimodal Inputs and Task Definition}

% We study binary survival \rev{classification} on the HANCOCK cohort~\cite{dorrich2025multimodal}, where the task is to predict whether a patient survives within a fixed follow-up window ($y \in \{0,1\}$). Each patient provides up to seven heterogeneous modalities: WSI and TMA histopathology, TMA-derived cell-density maps, clinical metadata, pathological staging, blood biomarkers, and ICD diagnostic codes. In practice, not all patients are fully observed; imaging and laboratory assays may be unavailable for logistical or cost-related reasons, leading to patient-specific subsets of observed modalities. Formally, for each patient $i$ we denote by $\mathcal{M}_i \subseteq \mathcal{M}$ the set of available modalities and by $\{x_i^{(m)}\}_{m \in \mathcal{M}_i}$ the corresponding inputs, and the goal is to learn a predictor $f$ that maps these heterogeneous, 
% partially missing observations to a binary event \rev{classification}: $\hat{y}_i = f(\{x_i^{(m)}\}_{m\in\mathcal{M}_i}), \hat{y}_i \in \{0,1\}.$

\subsection{Multimodal Inputs and Task Definition}

\rev{We study binary survival status classification on the HANCOCK cohort~\cite{dorrich2025multimodal}, strictly following the official target definition provided in the dataset. Specifically, for survival classification, the class labels correspond to the patient survival status (\emph{deceased} vs.\ \emph{living}) at the time of the last available follow-up ($y \in \{0,1\}$), consistent with the original HANCOCK setting. Patients with non--tumor-specific death are excluded, consistent with the original protocol.}
Each patient provides up to seven heterogeneous modalities: WSI and TMA histopathology, TMA-derived cell-density maps, clinical metadata, pathological staging, blood biomarkers, and ICD diagnostic codes. In practice, not all patients are fully observed; imaging and laboratory assays may be unavailable for logistical or cost-related reasons, leading to patient-specific subsets of observed modalities. Formally, for each patient $i$ we denote by $\mathcal{M}_i \subseteq \mathcal{M}$ the set of available modalities and by $\{x_i^{(m)}\}_{m \in \mathcal{M}_i}$ the corresponding inputs, and the goal is to learn a predictor $f$ that maps these heterogeneous, partially missing observations to a binary event \rev{classification}: $\hat{y}_i = f(\{x_i^{(m)}\}_{m\in\mathcal{M}_i}), \hat{y}_i \in \{0,1\}.$ \rev{The preprocessing in this work addresses feature-level missingness within existing modalities, 
while modality-level missingness (entire modalities unavailable) is handled separately via random-modality masking in Stage~3. Additional dataset statistics and evaluation details are provided in the appendix.}


\vspace{-10pt}
\subsection{Modality Representations}

For WSI, slides are processed using the CLAM tile extraction pipeline, and tile-level embeddings are extracted with the UNI foundation model~\cite{UNI}, followed by patient-level aggregation. 
For TMA, embeddings are extracted from the provided TMA PNG images using the same UNI model, rather than from the original TMA SVS files, and are likewise aggregated at the patient level.
The remaining five modalities are compact tabular descriptors: $X^{(m)} \in \mathbb{R}^{d_m}, \quad m \in \mathcal{M}_{\text{tab}}, \quad d_m \leq 51$,
where $\mathcal{M}_{\text{tab}} = \{\text{Cell, Clin, Path, Blood, ICD}\}$. All tabular features are preprocessed using min--max normalization, one-hot encoding, and imputation with the most frequent value, following the preprocessing protocol in the HANCOCK dataset. This yields a unified representation where high-dimensional imaging bags and low-dimensional tabular vectors coexist, but still differ substantially in scale, noise level, and semantic content, motivating the alignment and fusion strategy introduced in the following sections.

\subsection{Stage 1: Prognostic Pathology Representation Learning}
\label{subsec:stage1}

Each pathology modality (WSI or TMA) is represented as a variable-length bag of patch embeddings $H\!\in\!\mathbb{R}^{N\times1024}$, where $N$ depends on sampled tissue content. We adopt the gated-attention MIL encoder from CLAM~\cite{clam}.

To obtain compact and expressive instance features, each 1024-dimensional patch embedding is first projected using a linear mapping $w_l\!\in\!\mathbb{R}^{1024\times d_1}$. 
Two learnable projection matrices $U,V\!\in\!\mathbb{R}^{d_1\times d_2}$ generate gated-attention features, and a learnable query vector $w_a\!\in\!\mathbb{R}^{d_2\times 1}$ computes instance importance:

\vspace{-4pt}
{\small
\begin{equation}
a = \operatorname{softmax}\!\Big(
\big[
\tanh\!\big((Hw_l)V\big)
\odot
\sigma\!\big((Hw_l)U\big)
\big]w_a
\Big).
\end{equation}
}
The slide-level representation for modality $m$ is obtained by weighted aggregation:
\begin{equation}
z^{(m)} = (Hw_l)^\top a,
\label{eq:z}
\end{equation}
where $m\in\{\text{WSI},\text{TMA}\}$ and $d_2$ denotes the attention dimension.
Each modality-specific embedding is supervised using a cross-entropy loss combined with instance-level regularization:
\begin{equation}
\mathcal{L}_{\text{img}}^{(m)} =
\mathcal{L}_{\text{CE}}(y,\hat{y}^{(m)})
+
\mathcal{L}_{\text{inst}}^{(m)}.
\label{eq:img}
\end{equation}
We use $d_1=64$ and $d_2=32$.  
The resulting pathology embeddings are detached before Stage~2 to preserve their prognostic semantics and avoid distortion during multimodal training.

\subsection{Stage 2: Global Aligned Shared Latent Projection} 
\label{subsec:stage2} 
This stage aligns modalities into a shared latent space, ensuring that downstream robustness operates on substitutable rather than incompatible modality embeddings.
To achieve this, we first project each modality into a common latent space using small MLPs, then apply global alignment losses to encourage their convergence onto a unified consensus signal.
This yields a coherent multimodal representation that amplifies high-quality modalities while suppressing noisy or weak ones.

\subsubsection{Latent Projection into a Shared Space}
To place all modalities on a comparable footing, each modality-specific embedding
$z^{(m)}$ is mapped into a shared latent space through a lightweight two-layer MLP:
\vspace{-5pt}
\begin{equation}
u^{(m)} = \phi^{(m)}(z^{(m)}) \in \mathbb{R}^{d_{\text{out}}}, 
\quad m \in \mathcal{M},
\end{equation}
where $\mathcal{M}=\{\text{WSI},\text{TMA},\text{Clin},\text{Path},\text{Blood},\text{ICD},\text{Cell}\}$ and $d_{\text{out}}=128$. The alignment treats all modalities symmetrically, avoiding vision-centric or tabular-centric bias.

\subsubsection{Global Alignment via Singular Value Decomposition}

Singular value decomposition (SVD) decomposes a set of vectors into orthogonal directions ordered by how much signal the data concentrate along each direction. The largest singular value $\sigma_1$ corresponds to the dominant component shared across vectors, while smaller singular values capture weaker signals. In our setting, if modalities project toward a common disease-related direction, most information accumulates in this dominant component (large $\sigma_1$), and the remaining components become comparatively weak (small $\sigma_2,\dots,\sigma_k$).

\rev{Following} the principled alignment framework of Liu et al.~\cite{liu2026principled}, we leverage this property by encouraging all projected modality embeddings to concentrate their variation along the dominant component of the patient-specific matrix
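$U = [u^{(1)}, \dots, u^{(M)}] \in \mathbb{R}^{d_{\text{out}} \times M}$, where $M = |\mathcal{M}|$ is the number of modalities (this $U$ is distinct from the gating projection $U$ of Stage~1).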
Formally, SVD decomposes $U$ into
$U = Q \Sigma R^\top$,
where $Q = [q_1, q_2, \dots, q_k] \in \mathbb{R}^{d_{\text{out}} \times k}$ contains the orthonormal left singular vectors, $R \in \mathbb{R}^{M \times k}$ contains the right singular vectors, and $\Sigma = \mathrm{diag}(\sigma_1, \sigma_2, \dots, \sigma_k)$ with $k = \min(d_{\text{out}}, M)$ stores the singular values in descending order.
Here, $\sigma_1$ quantifies the strength of the dominant shared component across modalities, 
and $q_1$ is the corresponding direction in the latent space.
When $\sigma_1 \gg \sigma_2,\dots,\sigma_k$, the decomposition becomes approximately rank-1, meaning that all modality embeddings align along $q_1$ and express a common latent signal.
We adopt two complementary losses \rev{from Liu et al.~\cite{liu2026principled}} to enforce this behavior. First, the \emph{singular emphasis loss} increases the dominance of $\sigma_1$, promoting alignment of all modalities toward the shared component:
\begin{equation}
\mathcal{L}_{\mathrm{SV}}
=
-\log 
\frac{\exp(\sigma_1/\tau)}
{\sum_{j=1}^{k}\exp(\sigma_j/\tau)}.
\end{equation}

Second, during this alignment process, different patients could collapse to the same direction. To retain patient-level distinction, we adopt their \emph{dominant-direction discriminability loss} that separates patients based on their dominant singular vectors:
\begin{equation}
\mathcal{L}_{\mathrm{PD}}
=
-\frac{1}{B}\sum_{i=1}^{B}
\log 
\frac{\exp((q_1^{(i)})^\top q_1^{(i)}/\tau)}
{\sum_{j=1}^{B}\exp((q_1^{(i)})^\top q_1^{(j)}/\tau)}.
\end{equation}

The total alignment objective is
\begin{equation}
\mathcal{L}_{\mathrm{GA}} 
= 
\mathcal{L}_{\mathrm{SV}} 
+ 
\lambda_{\mathrm{PD}}\mathcal{L}_{\mathrm{PD}}.
\label{eq:lga}
\end{equation}

In our experiments, we set the patient-discriminability weight to
$\lambda_{\mathrm{PD}} = 0.01$, which balances alignment strength and inter-patient separation.

\subsection{Stage 3: Fusion with Robust Modality Collaboration}
\label{subsec:stage3}

We perform fused multimodal \rev{classification} using the aligned features from Stage~2.
For each patient, let $\{u^{(m)}\}_{m \in \mathcal{M}}$ denote the aligned modality embeddings and 
$\mathcal{M}_{\text{obs}} \subseteq \mathcal{M}$ the set of modalities observed for that patient.
We construct a fused representation by concatenating the available modalities:
\begin{equation}
h = \operatorname{concat}\big(\{u^{(m)}\}_{m \in \mathcal{M}_{\text{obs}}}\big),
\end{equation}
and obtain a scalar \rev{logit} using a shared fusion MLP,
\begin{equation}
s = \phi(h), \qquad \hat{y} = \sigma(s),
\end{equation}
\vspace{-8pt}
where $\sigma(\cdot)$ is the sigmoid function.
\vspace{4pt}
\paragraph{Random-modality Drop training.}
To ensure robustness under missing or unreliable modalities, we adopt random-modality drop~\cite{randomDrop} masking on top of the aligned geometry, enabling the model to rely on substitutable modalities rather than raw heterogeneous features.
At each iteration, we randomly mask a subset of modalities to obtain a reduced-modality representation $h^{-}$ and its \rev{classification} $\hat{y}^{-}$, alongside the full-modality \rev{classification} $\hat{y}^{+}$.
This encourages the model to rely on informative modalities while remaining stable when others are absent.

\paragraph{Monotonic collaboration constraint.}
To discourage performance degradation when more modalities are incorporated,
we impose a monotonicity loss:
\begin{equation}
\mathcal{L}_{\text{MoFe}}
=
\max \big(
\ell(\hat{y}^{+}, y) - \ell(\hat{y}^{-}, y),
0
\big),
\label{eq:mofe}
\end{equation}
where $\ell(\cdot,\cdot)$ denotes the per-sample classification loss; this term penalizes cases where the full-modality \rev{classification} incurs a higher loss than that of a reduced subset.

\paragraph{Fusion objective.}
The final fusion loss combines full-modality supervision, reduced-modality supervision, and the monotonicity constraint:
\begin{equation}
\mathcal{L}_{\text{fusion}}
=
\ell(\hat{y}^{+}, y)
+
\ell(\hat{y}^{-}, y)
+
\lambda_{\text{MoFe}}\,\mathcal{L}_{\text{MoFe}},
\label{eq:lfusion}
\end{equation}
where we set $\lambda_{\text{MoFe}} = 0.1$ in our experiments. 
The full objective therefore enforces consistency across complete and reduced-modality inputs, 
while the monotonicity term prevents \rev{classification} reversals under modality removal. We found this combination to yield stable optimization and improved robustness in practice.

\subsection{Overall Training Objective}

The full HAF objective integrates the three stages through four
loss terms:


\begin{equation}
\mathcal{L}_{\text{total}}
=
\underbrace{
\mathcal{L}_{\text{WSI}}
+
\mathcal{L}_{\text{TMA}}
}_{\text{Stage 1}}
+
\underbrace{
\mathcal{L}_{\text{GA}}
}_{\text{Stage 2}}
+
\underbrace{
\mathcal{L}_{\text{fusion}}
}_{\text{Stage 3}}.
\label{eq:ltotal}
\end{equation}

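Here,
$\mathcal{L}_{\text{WSI}}$ and $\mathcal{L}_{\text{TMA}}$ denote the pathology loss
$\mathcal{L}_{\text{img}}^{(m)}$ of Eq.~\eqref{eq:img} for $m=\text{WSI}$ and $m=\text{TMA}$, respectively,
each comprising $\mathcal{L}_{\text{CE}}$ and an instance-level regularization term.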
The global alignment loss $\mathcal{L}_{\text{GA}}$ follows Eq.~\eqref{eq:lga}.
The fusion objective $\mathcal{L}_{\text{fusion}}$ follows Eq.~\eqref{eq:lfusion}.
Crucially, the outputs of each stage are detached before being
passed to the next stage. This decoupling prevents conflicting
gradients from dominating weaker modalities and ensures that each learning
objective, including (i) pathology semantics, (ii) cross-modality compatibility,
and (iii) robustness to missing data, is optimized without being overridden by others.
%
\section{Experiments}

\label{sec:experiments}

\subsection{Experimental Setup}
\label{subsec:setup}

\rev{We adopt the same modality representations and preprocessing pipeline described in Sec.~\ref{sec:methods}, including CLAM+UNI for histopathology and the official HANCOCK protocol for structured modalities.
We apply a strict patient-level 10-fold random cross-validation protocol across all methods, and report accuracy and AUC as the primary evaluation metrics. All metrics are computed exclusively at the patient level to avoid data leakage across splits. All models are trained under the same optimization protocol without method-specific hyperparameter tuning. We apply early stopping and select the best model based on validation AUC (maximum 200 epochs, patience 25).} A \texttt{ReduceLROnPlateau} scheduler is used with patience 15 and factor 0.5. \rev{Unless otherwise stated, all experiments are conducted on an NVIDIA RTX A6000 GPU; full 10-fold training for HAF requires roughly 4 hours in total.}

\vspace{-8pt}
\subsection{Baselines and Variants}
\label{subsec:baselines}

\rev{We evaluate HAF against a comprehensive set of baselines designed to reflect four levels of comparison:
(i) pathology-only references, 
(ii) fusion models and HAF variants,
(iii) parameter-free aggregation baselines, and 
(iv) representative multimodal methods from the literature.}

\rev{\textbf{Pathology-only baselines.}
The official HANCOCK benchmarks consist of \textbf{WSI-CLAM}, \textbf{TMA-CLAM}, and \textbf{WSI+TMA-CLAM}, 
which serve as unimodal and dual-modality pathology references. 
We additionally reproduce WSI+TMA-CLAM using the released UNI features to ensure evaluation consistency.
We report the original HANCOCK results only for contextual reference, 
as their evaluation protocol is defined at the slide level, 
whereas all experiments in our work adopt a strict patient-level split.}

\rev{\textbf{Naive fusion models.}
Beyond pathology-only baselines, we include two naive fusion models: 
a \textbf{WSI+TMA Fusion MLP} and an \textbf{All-Modality Fusion MLP}, 
both of which directly concatenate modality embeddings without any alignment or robustness mechanisms. 
These baselines quantify the limitations of simple multimodal aggregation in heterogeneous feature spaces.}

\rev{\textbf{Alignment and robustness variants.}
To examine the effects of HAF’s core components, we further evaluate 
\textbf{Global Alignment (GA)}, the SVD-based latent alignment from Stage~2; }
\rev{\textbf{CLIP Alignment}, a vision-centric contrastive baseline anchoring non-visual modalities to WSI; 
and \textbf{Random-Modality Drop}, the robustness mechanism from Stage~3. 
We also test their combinations to assess whether alignment and robustness act synergistically.}

\rev{\textbf{Parameter-free aggregation baselines.}
Following recent work on simple multimodal interaction~\cite{zhang2023learning}, 
we additionally evaluate parameter-free pooling strategies on the aligned modality embeddings, 
including mean, sum, and max pooling. 
These baselines test whether robustness gains can be achieved by straightforward aggregation alone, 
without any learnable fusion mechanism.}

\rev{\textbf{Comparable multimodal methods.}
Finally, we compare against representative multimodal fusion approaches, including 
\textbf{PS3}, \textbf{MDLM}, \textbf{MFMF}, and a \textbf{Bilinear Interaction} model, 
to situate HAF relative to existing fusion strategies. 
All literature baselines are re-implemented as representation-level fusion modules
operating on fixed-dimensional modality embeddings, following the original methodological
descriptions. All methods use identical encoder architectures and the same preprocessing
and training protocol, and are compared under the same setting.}

% We evaluate HAF against a broad set of pathology-only, multimodal, and alignment–robustness variants to isolate the contributions of each component. The official HANCOCK benchmarks consist of \textbf{WSI-CLAM}, \textbf{TMA-CLAM}, and \textbf{WSI+TMA-CLAM}, which serve as unimodal and dual-modality pathology references. We also reproduce WSI+TMA-CLAM using the released UNI features to ensure evaluation consistency. Moving beyond pathology, we include two naive fusion models: a \textbf{WSI+TMA Fusion MLP} and an \textbf{All-Modality Fusion MLP}, both of which directly concatenate modality embeddings without any alignment or robustness mechanisms. These baselines quantify the limitations of simple multimodal aggregation in heterogeneous feature spaces.

% To examine the effects of HAF’s core components, we evaluate \textbf{Global Alignment (GA)}, the SVD-based latent alignment from Stage~2; \textbf{CLIP Alignment}, a vision-centric contrastive baseline anchoring non-visual modalities to WSI as a comparable alignment method; and \textbf{Random-Modality Drop}, the robustness mechanism from Stage~3. We further test their combination to assess whether alignment and robustness act synergistically. Finally, we compare against representative multimodal approaches such as \textbf{PS3}, \textbf{MDLM}, \textbf{MFMF}, and a \textbf{Simple Feature Interaction} model to situate HAF relative to existing fusion strategies. All model variants share the same detached pathology encoders from Stage~1, ensuring that differences stem solely from their alignment and fusion strategies.

\vspace{-10pt}
\begin{table}[!ht]
  \floatconts
    {tab:multimodal-results}
    {\caption{\textbf{\rev{Overall quantitative results on the HANCOCK dataset.}}}}
    {%
    \vspace{-10pt}
      \small
      \setlength{\tabcolsep}{6pt}
      \begin{tabular}{lcc}
      \toprule
      \textbf{Method} & \rev{\textbf{Accuracy}} & \textbf{AUC} \\
       & {\scriptsize(mean $\pm$ std)} & \scriptsize(mean $\pm$ std) \\
      \midrule
      \multicolumn{3}{c}{\textit{Pathology-only baselines}} \\
      \midrule
      WSI-CLAM                         & -- & 0.65 \\
      TMA-CLAM                         & -- & 0.52 \\
      WSI+TMA-CLAM                     & -- & 0.69 \\
      WSI+TMA-CLAM (reproduced)        & 0.712$\pm$0.087 & 0.679$\pm$0.119 \\
      \midrule
      \multicolumn{3}{c}{\textit{Fusion models and HAF variants}} \\
      \midrule
      WSI+TMA Fusion MLP               & 0.739$\pm$0.041 & 0.668$\pm$0.133 \\
      All-Modality Fusion MLP          & 0.748$\pm$0.046 & 0.694$\pm$0.113 \\
      Global Alignment (GA)            & \textbf{0.752$\pm$0.047} & 0.698$\pm$0.127 \\
      CLIP Alignment                   & 0.741$\pm$0.074 & 0.697$\pm$0.103 \\
      Random-Modality Drop             & 0.748$\pm$0.074 & 0.715$\pm$0.099 \\
      CLIP + Random Drop               & 0.741$\pm$0.073 & 0.735$\pm$0.097 \\
      \textbf{HAF (GA + Random Drop)}  & 0.745$\pm$0.065 & \textbf{0.739$\pm$0.092} \\
      \midrule
      \multicolumn{3}{c}{\textit{Parameter-free aggregation baselines (on GA)}} \\
      \midrule
      \rev{GA + Mean Pool}              & \rev{0.606$\pm$0.122} & \rev{0.700$\pm$0.090} \\
      \rev{GA + Sum Pool}               & \rev{0.636$\pm$0.077} & \rev{0.717$\pm$0.077} \\
      \rev{GA + Max Pool}               & \rev{0.649$\pm$0.101} & \rev{0.702$\pm$0.106} \\
      \midrule
      \multicolumn{3}{c}{\textit{Comparable multimodal methods}} \\
      \midrule
      PS3                              & 0.626$\pm$0.123 & 0.718$\pm$0.117 \\
      MDLM                             & 0.557$\pm$0.145 & 0.626$\pm$0.122 \\
      MFMF                             & 0.675$\pm$0.089 & 0.732$\pm$0.127 \\
      \rev{Bilinear Interaction}    & \rev{0.682$\pm$0.101} & \rev{0.684$\pm$0.119} \\
      \bottomrule
      \end{tabular}
    }
\end{table}


\subsection{Overall Quantitative Results}
\label{subsec:main-results}

\rev{Table~\ref{tab:multimodal-results} organizes the results into pathology-only baselines, fusion models and HAF variants, parameter-free aggregation baselines, and representative multimodal methods.} The official HANCOCK baselines
(\textbf{WSI-CLAM}, \textbf{TMA-CLAM}, \textbf{WSI+TMA-CLAM}) provide pathology-only reference points,
and our reproduction of \textbf{WSI+TMA-CLAM} closely matches the reported results
(Accuracy 0.712, AUC 0.679). The \textbf{WSI+TMA Fusion MLP} improves accuracy (0.739) but yields a
lower AUC (0.668). Extending fusion to all seven modalities with the \textbf{All-Modality Fusion MLP}
gives moderate improvements (AUC 0.694), although performance varies considerably across folds.
Among the single-component variants, \textbf{Global Alignment (GA)} achieves the highest accuracy
(0.752), while \textbf{CLIP Alignment} shows a similar but slightly weaker trend. 
\textbf{Random-Modality Drop} primarily improves AUC (0.715) relative to the All-Modality Fusion MLP.
Combining Random-Modality Drop with alignment further increases discrimination: \textbf{CLIP + Random Drop} reaches an AUC
of 0.735, and the full \textbf{HAF (GA + Random Drop)} achieves the highest AUC overall (0.739) while remaining competitive in accuracy (0.745).
\rev{To compare Random-Modality Drop with simpler parameter-free aggregation on aligned embeddings,
we additionally evaluate mean/sum/max pooling on GA features.
While these baselines are competitive in AUC (\textbf{GA+Sum Pool}: 0.717; \textbf{GA+Mean Pool}: 0.700; \textbf{GA+Max Pool}: 0.702),
they are consistently below HAF (AUC 0.739) and yield lower accuracy (0.606--0.649).}
For comparison, representative multimodal frameworks (reported as accuracy / AUC) such as \textbf{PS3} (0.626 / 0.718),
\textbf{MDLM} (0.557 / 0.626), \textbf{MFMF} (0.675 / 0.732), and \rev{\textbf{Bilinear Interaction} (0.682 / 0.684)}
\rev{all score lower than HAF on both metrics; the improvements are statistically significant in accuracy over all comparable methods and in AUC over all except MFMF (Appendix~\ref{app:stat_test}).}

\subsection{Robustness to Modality Drop}
\label{subsec:robust}
Fig.~\ref{fig:robust} summarizes model robustness under varying drop probabilities during testing.
AUC and accuracy remain nearly constant up to $\rev{\rho}=0.4$, indicating that the model compensates for missing modalities by relying on reliable inputs, and degrade only when most modalities are absent ($\rev{\rho}>0.4$), demonstrating improved resilience to real-world incompleteness and noise.

\begin{figure}[!h]
    \centering
    \includegraphics[width=0.85\linewidth]{robust_results_boxplot.png}
    \vspace{-10pt}
    \caption{\textbf{Model robustness under modality dropout.}}
    \label{fig:robust}
\end{figure}

\subsection{Representation Analysis}
\label{subsec:representation}

To illustrate the alignment effect, Fig.~\ref{fig:alignment_tsne} visualizes multimodal embeddings before and after global alignment.
Each color represents one modality, and each point corresponds to a patient from the test set.

\begin{figure}[!h]
    \hfill
    \includegraphics[width=0.9\linewidth]{tsne-plot.png}
    \hspace{0.2cm}
    \caption{\textbf{t-SNE projection of multimodal embeddings before and after global alignment.} The horizontal and vertical axes represent the first and second t-SNE components, respectively.}
    \label{fig:alignment_tsne}
\end{figure}


Before alignment, modalities form well-separated clusters with large inter-modality gaps, while patient embeddings within the same modality are highly overlapping, indicating strong modality bias but limited patient discriminability. 
After alignment, modalities become more coherent along shared axes and different patients are pulled further apart, simultaneously reducing inter-modality discrepancies and enhancing inter-patient separability.

\section{\rev{Discussion}}

\rev{This work demonstrates that feature aggregation alone is insufficient for reliable multimodal survival classification in clinical settings. Our results suggest that performance gains in HAF arise from explicitly addressing three structural challenges: (i) geometric incompatibility across modalities, (ii) systematic modality missingness, and (iii) cross-objective interference during end-to-end training. By decoupling representation learning, alignment, and fusion into staged objectives, HAF provides a framework for building robust multimodal classification.}

\rev{
While global alignment improves overall performance, the aligned representations do not
collapse into perfect subject-level clusters. As shown in Fig.~\ref{fig:alignment_tsne}, a degree of modality-wise separation remains after alignment. This behavior is not contradictory to the objective of alignment. Rather than enforcing strict feature collapse across modalities,
the alignment loss encourages shared low-rank structure while preserving modality-specific
residual variations. In highly heterogeneous clinical data, such residual structure is
inevitable and may even be beneficial, as different modalities capture complementary aspects
of patient state that cannot be fully reconciled in a single latent space.
}

\rev{
The relatively weak performance of several literature baselines can be attributed to the heterogeneous quality of the available modalities (see Table~\ref{tab:multimodal-results}).
Methods that treat all modalities symmetrically, or that rely on weaker modalities as queries (e.g., attention-based fusion), may propagate noisy signals into the fusion process,
while decision-level late fusion may discard cross-modal interactions before meaningful integration occurs.
In contrast, global alignment yields more consistent results by placing heterogeneous modalities into a comparable representation space,
and multimodal integration provides systematic gains over all single-modality predictors (Table~\ref{tab:single_modality}).
HAF further improves over naive fusion, suggesting that combining alignment with robustness-aware fusion is beneficial in settings where modality quality varies substantially.
}


\rev{
Random-Modality Drop explicitly trains the fusion module under missing-modality conditions, which aligns with the stable performance under moderate drop rates in Fig.~\ref{fig:robust}.
The detachment ablation suggests an optimization trade-off in multimodal alignment.
Allowing downstream task gradients to directly update the alignment module can sometimes
improve raw discriminative metrics, but may introduce conflicts between modality-consistency
objectives and task-specific objectives. Stage-wise detachment therefore serves
as a regularization mechanism to decouple these objectives, leading to more stable shared
representations (see Appendix~\ref{subsec:detachment}).
}


% \rev{This work demonstrates that reliable multimodal survival classification in clinical settings is limited when achieved by feature aggregation alone.
% Our results suggest that performance gains in HAF arise from explicitly addressing three structural challenges: 
% (i) geometric incompatibility across modalities, 
% (ii) systematic modality missingness, and 
% (iii) cross-objective interference during end-to-end training.
% By decoupling representation learning, alignment, and fusion into staged objectives, HAF provides a framework for building robust and interpretable multimodal predictors.}

% \rev{
% In this framework, global alignment does not result in a complete collapse of all modalities into identical subject-level clusters.
% As observed in the t-SNE visualization in Fig.~\ref{fig:alignment_tsne}, a degree of modality-wise separation remains after alignment.
% This indicates that aligned representations retain residual modality-dependent structure.
% In our setting, alignment in this framework is not intended to eliminate modality-specific structure,
% but to place heterogeneous representations into a comparable geometric space
% where cross-modality relationships can be meaningfully modeled.
% }

% \rev{
% Compared to vision-centric contrastive alignment methods such as CLIP, 
% which align all modalities to a single reference modality,
% global alignment achieves consistently better performance in our experiments.
% A potential explanation is that anchoring all modalities to a single modality may introduce modality-specific bias into the shared space,
% whereas global alignment treats all modalities symmetrically and avoids privileging any particular source.
% }

% \rev{
% The consistently weak performance of all single-modality predictors (Table~\ref{tab:multimodal-results} and Table~\ref{tab:single_modality}) 
% suggests that individual data sources alone provide limited signal for reliable survival classification in this cohort.
% Despite substantial variance in modality quality, incorporating multiple modalities yields systematic performance gains,
% as even naive multimodal fusion outperforms all single-modality baselines (Table~\ref{tab:multimodal-results}).
% This indicates that survival classification in this setting is better supported by multimodal integration, 
% where complementary information across heterogeneous sources is essential. 
% Meanwhile, the performance gap between naive fusion strategies and HAF suggests that HAF offers a more effective fusion mechanism for exploiting multimodal information.
% }

% \rev{
% Furthermore, Random-Modality Drop should not be viewed merely as a regularization heuristic.
% It explicitly exposes the model to missing-modality scenarios during training,
% allowing the fusion module to learn from incomplete modality subsets.
% This training strategy is consistent with the flat performance profile up to moderate drop rates ($\rho \leq 0.4$) observed in Fig.~\ref{fig:robust}, 
% as well as the gradual decline when most modalities are absent.
% }
% \rev{The ablation on detachment suggests that its effectiveness depends on the degree of conflict between training objectives.
% Training without detachment corresponds to end-to-end optimization, which can entangle alignment and downstream fusion losses and bias representations toward task-specific shortcuts.
% As shown in Appendix~C, both the all-modality setting and the full HAF framework benefit from detachment with consistent AUC improvements and only marginal changes in accuracy.
% In contrast, in the alignment-only setting without Random-Modality Drop, allowing task gradients to update the alignment module can weaken cross-modality alignment while yielding higher AUC by favoring task separability. This reflects a trade-off between alignment consistency and task discriminability.
% Overall, these results indicate that detachment is more appropriate in the full HAF framework.}

% \rev{Our results indicate that effective multimodal fusion in highly heterogeneous clinical settings requires not only integrating multiple data sources, but also addressing representation incompatibility and modality variance. 
% HAF approaches this problem through a staged framework that combines individual modality representation learning, lightweight global alignment, and robustness-aware fusion.
% Eventually, this design yields consistent improvements over representative early-, late-, and attention-based fusion baselines, particularly under partial modality availability.}

% \rev{While global alignment improves overall performance, the aligned representations do not collapse into
% perfect subject-level clusters. As observed in the t-SNE visualization, a degree of modality-wise separation
% remains after alignment. This behavior is not contradictory to the objective of alignment.
% Rather than enforcing strict feature collapse across modalities, the alignment loss encourages shared
% low-rank structure while preserving modality-specific residual variations.
% In highly heterogeneous clinical data, such residual structure is inevitable and may even be beneficial,
% as different modalities capture complementary aspects of patient state that cannot be fully reconciled
% in a single latent space.}

% \rev{The detachment ablation suggests an optimization trade-off in multimodal alignment.
% Allowing downstream task gradients to directly update the alignment module can sometimes improve
% raw discriminative metrics, but may introduce conflicts between modality-consistency objectives
% and task-specific objectives.
% Stage-wise detachment therefore serves as a regularization mechanism to decouple these objectives,
% leading to more stable shared representations (see Appendix~C).}


% \rev{The relatively weak performance of several literature baselines can be attributed to the heterogeneous
% quality of the available modalities (see Table~\ref{tab:single_modality}). In the HANCOCK cohort, WSI consistently provides richer and more
% informative signals than other modalities.
% Methods that treat all modalities symmetrically, or that rely on weaker modalities as queries
% (e.g., attention-based fusion), may propagate noisy signals into the fusion process.
% Similarly, decision-level late fusion approaches may discard important cross-modal interactions
% before meaningful integration occurs.
% These observations suggest that naive or symmetric fusion strategies are insufficient in settings
% where modality quality varies substantially.}

\section{\rev{Limitations}}
\rev{
Despite the empirical improvements observed across multiple settings, a limitation is that the global alignment objective does not guarantee that the resulting shared representation space is optimal for the downstream task.
While alignment enforces cross-modality consistency, it does not explicitly optimize for task-specific separability,
and in some cases may reduce inter-patient discriminability when the alignment geometry deviates from the task-relevant structure.
}

%
\section{Conclusion}
We presented \textbf{HAF}, a staged and detached \rev{early} multimodal fusion framework that mitigates cross-objective interference, stabilizes pathology representations, establishes a shared cross-modality geometry, and improves robustness under missing or noisy modalities. Beyond its empirical gains on HANCOCK, HAF also provides a general recipe for designing principled fusion pipelines in highly heterogeneous clinical settings. Future work will focus on exploring stronger task-aware alignment, incorporating finer spatial reasoning, and validating HAF on broader clinical datasets. In addition, extending HAF to prospective cohorts and evaluating its utility in real clinical decision support will further illuminate its translational potential.

\section{Acknowledgements}
This work was partially supported by US National Science Foundation IIS-2412195, CCF-2400785, the Cancer Prevention and Research Institute of Texas (CPRIT) award (RP230363), the National Institutes of Health (NIH) R01 award (1R01AI190103-01) and Microsoft Accelerate Foundation Models Research (2024).

\bibliography{midl26_219}

% \section{Proof of Theorem 1}

% This is a boring technical proof of
% \begin{equation}\label{eq:example}
% \cos^2\theta + \sin^2\theta \equiv 1.
% \end{equation}

% \section{Proof of Theorem 2}

% This is a complete version of a proof sketched in the main text.

\newpage

\appendix
\section{\rev{Dataset Statistics and Evaluation Protocol}}

\rev{
We conduct all experiments on the publicly available HANCOCK cohort~\cite{dorrich2025multimodal}.
Patient-level survival labels are obtained from the official \texttt{target.csv} file released in the HANCOCK GitHub repository.
The full cohort contains 763 patients with survival annotations.
Among them, 701 patients have whole-slide images (WSI) available, corresponding to 1,078 WSI slides in total.
We further restrict the cohort to patients with both WSI and TMA available, excluding 2 patients who have no TMA available, resulting in 699 patients used in our experiments.
For the remaining five structured modalities, missing feature entries are imputed following the official HANCOCK preprocessing protocol released in the original repository.
Modality-level missingness is simulated during training and evaluation via random-modality masking.
Of the 699 patients included in our experiments, 509 are labeled as living and 190 as deceased.
We perform patient-level 10-fold cross-validation, where approximately 69--70 patients are held out for testing in each fold.
All reported metrics are computed strictly at the patient level.
}


\section{\rev{Single Modality}}
\begin{table}[h]
\centering
\caption{\rev{Single-modality baselines on the HANCOCK cohort. 
Each model uses CLAM for pathology-based modalities and a two-layer MLP for structured modalities. 
Results are reported as mean $\pm$ standard deviation over 10 patient-level folds.}}
\label{tab:single_modality}
\begin{tabular}{lcc}
\toprule
\rev{Model} & \rev{accuracy} & \rev{AUC} \\
& \rev{{\scriptsize(mean $\pm$ std)}} & \rev{\scriptsize(mean $\pm$ std)} \\
\midrule
\rev{blood} & \rev{$0.703 \pm 0.060$} & \rev{$0.644 \pm 0.111$} \\
\rev{clinical} & \rev{$0.632 \pm 0.076$} & \rev{$0.603 \pm 0.117$} \\
\rev{icd} & \rev{$0.626 \pm 0.106$} & \rev{$0.581 \pm 0.076$} \\
\rev{pathological} & \rev{$0.665 \pm 0.091$} & \rev{$0.650 \pm 0.140$} \\
\rev{tma cell density} & \rev{$0.645 \pm 0.106$} & \rev{$0.550 \pm 0.135$} \\
\bottomrule
\end{tabular}
\end{table}



\section{\rev{Ablations on Detachment}}
\label{subsec:detachment}

\rev{Training without detachment corresponds to end-to-end optimization across all stages, which often entangles heterogeneous objectives and distorts the geometry needed for modality substitutability.
When heterogeneous objectives are trained jointly without isolation, cross-stage gradients can conflict and bias earlier representations toward downstream fusion losses.
Detachment prevents such interference by allowing each stage to converge independently before passing non-trainable features forward.
As shown in Table~\ref{tab:detach_merge}, comparing HAF with its non-detached variant demonstrates that detachment is beneficial for our framework, yielding a consistent improvement on the ranking-oriented metric (+1.8 pp AUC), while inducing only a marginal change in classification accuracy.}

\begin{table}[!h]
  \centering
  \footnotesize
  \setlength{\tabcolsep}{6pt}
  \renewcommand{\arraystretch}{1.05}
  \caption{\rev{\textbf{Ablations on detachment.} ``w'' denotes training \textit{with} detachment, ``w/o'' indicates no detachment.}}
  \label{tab:detach_merge}
  \begin{tabular}{lccc}
  \toprule
  \textbf{\rev{Setting}} & \textbf{\rev{Metric}} & \rev{\textbf{w (with detach)}} & \rev{\textbf{w/o (no detach)}} \\
  & & {\rev{\scriptsize(mean $\pm$ std)}} & \rev{\scriptsize(mean $\pm$ std)} \\
  \midrule
  \multirow{2}{*}{\rev{All modalities}}
   & \rev{accuracy} & \rev{\textbf{0.748$\pm$0.046}} & \rev{0.738$\pm$0.063} \\
   & \rev{AUC}      & \rev{0.694$\pm$0.113} & \rev{\textbf{0.698$\pm$0.110}} \\
  \midrule
  \multirow{2}{*}{\rev{+ Global Alignment}} 
   & \rev{accuracy} & \rev{\textbf{0.752$\pm$0.047}} & \rev{0.752$\pm$0.050} \\
   & \rev{AUC}      & \rev{0.698$\pm$0.127} & \rev{\textbf{0.727$\pm$0.108}} \\
  \midrule
  \multirow{2}{*}{\rev{+ Random Drop}} 
   & \rev{accuracy} & \rev{\textbf{0.748$\pm$0.074}} & \rev{0.748$\pm$0.081} \\
   & \rev{AUC}      & \rev{\textbf{0.715$\pm$0.099}} & \rev{0.714$\pm$0.101} \\
  \midrule
  \multirow{2}{*}{\rev{\textbf{HAF}}} 
   & \rev{accuracy} & \rev{0.745$\pm$0.065} & \rev{\textbf{0.748$\pm$0.052}} \\
   & \rev{AUC}      & \rev{\textbf{0.739$\pm$0.092}} & \rev{0.721$\pm$0.098} \\
  \bottomrule
  \end{tabular}
  \end{table}

\rev{A small trade-off appears in the alignment-only setting: without detachment, global alignment is less complete and modality-specific biases leak into the shared space.
These residual biases can sometimes inflate AUC by exploiting cohort-specific correlations, yet they blur patient-level separability and marginally undermine generalization.
In contrast, detachment removes such interference and yields more consistent, semantically grounded representations, improving the robustness of the full HAF framework.}

\section{\rev{Additional Representation Visualization}}

\rev{The heatmap in Fig.~\ref{fig:alignment_heatmap} provides an additional qualitative illustration of the alignment effect.
Before alignment, feature intensity sequences across modalities are largely uncorrelated, whereas after alignment
they become more synchronized for the same patient, indicating improved cross-modality consistency.}

\begin{figure}[!ht]
    \centering
    \includegraphics[width=0.85\linewidth]{sample_13_heatmap.png}
    \vspace{-10pt}
    \caption{\rev{\textbf{Heatmap of aligned modality representations for a representative patient.}}}
    \label{fig:alignment_heatmap}
\end{figure}

\section{\rev{Statistical Significance}}
\label{app:stat_test}

\rev{We conduct statistical significance testing using paired tests to compare our proposed model (HAF) against each baseline on the same patient-level test sets.
For classification accuracy, we apply McNemar's test to paired predictions obtained from identical test folds, which is appropriate for discrete classification outcomes.
For AUC, we apply DeLong's test for correlated ROC curves, as AUC reflects a ranking-based continuous statistic.
We further apply the Holm--Bonferroni correction, which is a more powerful step-down procedure than the standard Bonferroni correction, to adjust the resulting p-values while controlling the family-wise error rate.
The Holm--Bonferroni procedure is applied in a step-down manner by ordering comparisons according to ascending raw p-values and sequentially adjusting rejection thresholds.
All tests use a significance level of $\alpha = 0.05$.
The set of comparable methods (baselines) includes: PS3, MDLM, MFMF, and Bilinear Interaction.}


\begin{table}[h]
\centering
\caption{\rev{Statistical significance testing (HAF vs. baselines) for classification accuracy}}
\label{tab:stat_acc}
\begin{tabular}{lccc}
\toprule
\rev{Baseline} & \rev{Raw $p$-value} & \rev{Holm-adjusted $p$} & \rev{Significant ($\alpha=0.05$)} \\
\midrule
\rev{MDLM} & \rev{0.00078} & \rev{0.00312} & \rev{Yes} \\
\rev{MFMF} & \rev{0.00860} & \rev{0.02580} & \rev{Yes} \\
\rev{PS3} & \rev{0.01310} & \rev{0.02620} & \rev{Yes} \\
\rev{Bilinear Interaction} & \rev{0.01870} & \rev{0.02620} & \rev{Yes} \\
\bottomrule
\end{tabular}
\end{table}

\begin{table}[h]
\centering
\caption{\rev{Statistical significance testing (HAF vs. baselines) for AUC}}
\label{tab:stat_auc}
\begin{tabular}{lccc}
\toprule
\rev{Baseline} & \rev{Raw $p$-value} & \rev{Holm-adjusted $p$} & \rev{Significant ($\alpha=0.05$)} \\
\midrule
\rev{MDLM} & \rev{0.00063} & \rev{0.00252} & \rev{Yes} \\
\rev{Bilinear Interaction} & \rev{0.00930} & \rev{0.02790} & \rev{Yes} \\
\rev{PS3} & \rev{0.02340} & \rev{0.04680} & \rev{Yes} \\
\rev{MFMF} & \rev{0.09549} & \rev{0.09549} & \rev{No} \\
\bottomrule
\end{tabular}
\end{table}

\end{document}
