\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{graphicx}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{wrapfig}
\usepackage{booktabs}
\usepackage{xcolor}


% \jmlrvolume{-- Under Review}
% \jmlryear{2025}
% \jmlrworkshop{Full Paper -- MIDL 2025 submission}
% \editors{Under Review for MIDL 2025}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- 17}
\editors{Accepted for publication at MIDL 2025}

\title[Decoupling Fusion Network]{DeFusion: An Effective Decoupling Fusion Network for Multi-Modal Pregnancy Prediction}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Xueqiang Ouyang\nametag{$^{1}$}}\Email{202221044618@mail.scut.edu.cn}\\
\Name{Jia Wei\midljointauthortext{Corresponding author: csjwei@scut.edu.cn}\nametag{$^{1}$}}\Email{csjwei@scut.edu.cn}\\
\Name{Wenjie Huo\nametag{$^{2}$}}\Email{1282039293@qq.com}\\
\Name{Xiaocong Wang\nametag{$^{2}$}}\Email{xwang@smu.edu.cn}\\
\Name{Rui Li\nametag{$^{3}$}}\Email{rxlics@rit.edu}\\
\Name{Jianlong Zhou\nametag{$^{4}$}}\Email{jianlong.zhou@uts.edu.au}\\
\addr $^{1}$ School of Computer Science and Engineering, South China University of Technology, Guangzhou, China\\
\addr $^{2}$ Department of Obstetrics and Gynecology, Nanfang Hospital, Southern Medical University, Guangzhou, China\\
\addr $^{3}$ Golisano College of Computing and Information Sciences, Rochester Institute of Technology, Rochester, NY 14623, USA\\
\addr $^{4}$ UTS Data Science Institute, University of Technology Sydney, Ultimo, NSW 2007, Australia
}

\begin{document}

\maketitle

\begin{abstract}
Temporal embryo images and parental fertility table indicators are both valuable for pregnancy prediction in \textbf{in vitro fertilization embryo transfer} (IVF-ET). However, current machine learning models cannot make full use of the complementary information between the two modalities to improve pregnancy prediction performance. In this paper, we propose a Decoupling Fusion Network called DeFusion to effectively integrate the multi-modal information for IVF-ET pregnancy prediction. Specifically, we propose a decoupling fusion module that decouples the information from the different modalities into related and unrelated information, thereby achieving a more delicate fusion. And we fuse temporal embryo images with a spatial-temporal position encoding, and extract fertility table indicator information with a table transformer. To evaluate the effectiveness of our model, we use a new dataset including 4046 cases collected from Southern Medical University. The experiments show that our model outperforms state-of-the-art methods. Meanwhile, the performance on the eye disease prediction dataset reflects the model's good generalization. Our code and dataset are available at \href{https://github.com/Ou-Young-1999/DFNet}{https://github.com/Ou-Young-1999/DFNet}.
\end{abstract}

\begin{keywords}
decoupling fusion, multi-modal fusion, IVF-ET pregnancy prediction.
\end{keywords}

\section{Introduction}
\indent Recent studies show that up to 12--15\% of couples are diagnosed with infertility \cite{infertility}, and \textbf{in vitro fertilization embryo transfer} (IVF-ET) is one of the most effective technologies to treat infertility. As shown in Fig.~\ref{step}, during the IVF-ET process, medical laboratory technicians obtain multiple oocytes by stimulating ovulation in the mother and produce multiple zygotes in a laboratory environment \cite{process}. After 3--5 days of culture, laboratory technicians select the optimal embryos based on visual evaluation of embryo morphology and transfer them back to the mother’s uterus for further development. Thus, selecting high-quality embryos is a crucial step toward a promising pregnancy outcome of IVF-ET.\\
\indent In clinical practice, the pregnancy success rate of IVF-ET is only 30--40\% \cite{rate}. One reason is that the selected optimal embryos may not survive after the transfer, since human evaluation of embryo morphology is highly subjective and has low consistency. Moreover, embryo morphology is not always relevant to an embryo’s true developmental vitality. Numerous studies have shown that some embryos with good morphology did not survive while some with poor morphology did \cite{survival}. In fact, the pregnancy success rate is affected not only by embryo morphology but also by the fertility indicators of the parents, such as the parents’ age, endometrial thickness, and sperm quality.\\
\begin{figure*}[t]
\centering
\includegraphics[width=1\textwidth]{image/step.pdf} % Reduce the figure size so that it is slightly narrower than the column.
\caption{The main steps of IVF-ET.}
\label{step}
\end{figure*}
\indent In the area of computer-assisted IVF-ET, existing research mainly focuses on the morphological grading of embryos. As shown in Fig.~\ref{step}, we can obtain microscope images of embryonic development between step 3 and step 4 of IVF-ET. To perform the embryo morphological grading task, \cite{cnn1} and \cite{cnn2} apply convolutional neural networks (CNNs) to static embryo images; \cite{videotrans} and \cite{twostream} apply a transformer and a two-stream neural network to time-lapse microscopy (TLM) images, respectively. In addition, \cite{embryograde} fuse multi-focal images to predict the grade of blastocysts. These methods outperform laboratory technicians, because embryo grading is completely based on morphological information and the salient morphological characteristics among different grades are easily distinguishable for machines. However, morphological grading is indirect and less relevant to the pregnancy outcome of IVF-ET, as discussed in the previous paragraph. Therefore, more recent studies tend to predict the pregnancy outcome directly.\\
\indent Recent AI-based assessment models achieve promising success in direct pregnancy prediction. \cite{table} apply traditional machine learning methods with fertility table indicators obtained before step 5 for pregnancy prediction. In addition, static images of the fifth day’s embryos and TLM images are also adopted to predict the pregnancy outcome, respectively \cite{static,tlm}. Moreover, \cite{fusion} (\textbf{MMBE}) fuse the fifth day’s static embryo image and fertility table indicators to achieve better pregnancy prediction performance. The major limitation of the existing image-based methods is that they are only applicable to the fifth day’s embryo transfer. However, in reality many reproductive centers carry out embryo transfer on the third day. Although the embryo of the third day is less developed than that of the fifth day, the embryo images of the third day can still provide clinically significant information for pregnancy prediction \cite{day3}. On the other hand, due to some technical constraints, the existing multi-modal fusion method \cite{fusion} can only be applied to the last day's image.\\
\indent To address the limitations discussed above, we propose a Decoupling Fusion Network called DeFusion to effectively integrate temporal images of the first three days and parental fertility table indicators for IVF-ET pregnancy prediction. The main contributions are summarized as follows:
\begin{enumerate}
\item[$\bullet$] DeFusion is the first to integrate the first three days of embryonic development temporal images and parental fertility table indicators for pregnancy prediction.
\item[$\bullet$] We propose a spatial-temporal position encoding for fusing temporal embryo images. Moreover, we apply a table transformer to extract tabular information from fertility indicators.
\item[$\bullet$] We propose a novel decoupling fusion network to fuse multi-modal information more finely grained by decoupling information from different modalities into modality related and unrelated feature.   
\end{enumerate}
\begin{figure*}[t]
\centering
\includegraphics[width=1\textwidth]{image/framework2.pdf} % Reduce the figure size so that it is slightly narrower than the column.
\caption{The framework of DeFusion. In the process of decoupling training, the features of different modalities change from entangled to disentangled.}
\label{framework}
\end{figure*}
\section{Method}
\indent In the field of medical multi-modal fusion, the final fusion features are obtained by complementing the unique features and enhancing the common features of different modalities. However, the relationship between features within the same modality and across different modalities is complex and non-linear, so it is difficult for the model to capture. Inspired by the decoupling operation in \cite{decouple1} \cite{decouple2}, we use a decoupling fusion strategy that explicitly decouples the features of different modalities into unique and common features. This is a shift from entangled features to disentangled ones, which simplifies the relationships between features and better models the complex interactions between modalities. We therefore propose the decoupling fusion module, a simple and effective multi-modal fusion module, as shown in Fig.~\ref{framework}. The inputs of the model are temporal grayscale embryo images and fertility table indicators. Embryo images are denoted as $\mathbf {im_{i}\in \mathbb R^{1\times H\times  W}}$, where $\mathbf {i\in \{1,2,3\}}$ denotes the day, and $\mathbf H$ and $\mathbf W$ denote the height and the width of an image, respectively. Table indicators are denoted as $\mathbf {ta\in \mathbb R^{N}}$, where $\mathbf N$ denotes the number of indicators.
\begin{figure*}[t]
\centering
\includegraphics[width=1\textwidth]{image/STPE2.pdf} % Reduce the figure size so that it is slightly narrower than the column.
\caption{The details of temporal fusion network (image extractor).}
\label{stpe}
\end{figure*}
\subsection{Image Extractor}
\indent To fuse the first three days' embryo images for pregnancy prediction, we design a temporal image fusion network (image extractor). This network consists of three parts as shown in Fig.~\ref{stpe}: a backbone aiming at extracting image features; a spatial-temporal position encoding (STPE) to obtain spatial information within a single image and temporal information among different images; and a Transformer \cite{transformer} that combines STPE to fuse temporal image features. Firstly, we use the backbone to extract embryo image features $\mathbf {im_{i}^{f}\in \mathbb R^{C\times \frac{H}{S}\times \frac{W}{S}}}$ from the first three days, where $\mathbf C$ is the number of channels and $\mathbf S$ is the scaling factor. Then, we reshape the image features into tokens and add positional information to them. We encode image features as spatial position encoding $\mathbf {PE_{s}^{i}\in \mathbb R^{C\times \frac{H}{S}\times \frac{W}{S}}}$ through a convolution layer: $\mathbf{PE_{s}^{i} =Conv(im_{i}^{f})}$. By using the convolution operation to capture local spatial features, we can obtain spatial position information within a single image \cite{spe}. Image features are encoded as temporal position encoding $\mathbf {PE_{t}^{i}\in \mathbb R^{C\times 1\times 1}}$ through a pooling layer and a series of recurrent layers: $\mathbf {PE_{t}^{1}=Pooling(im_{1}^{f}), PE_{t}^{2}=Pooling(im_{2}^{f})+PE_{t}^{1}, PE_{t}^{3}=Pooling(im_{3}^{f})+PE_{t}^{2}}$. By using recurrent operations to capture temporal dependencies, we can obtain temporal position information among different images \cite{lstm}. To align with the image features, $\mathbf {PE_{t}^{i}}$ is replicated to obtain $\mathbf {PE_{t}^{i}\in \mathbb R^{C\times \frac{H}{S}\times \frac{W}{S}}}$. The $\mathbf {PE_{s}^{i}}$ and $\mathbf {PE_{t}^{i}}$ are integrated through a position encoding attention:
\begin{equation}
\mathbf {
PE_{att}^{i}=Softmax(Pooling(PE_{s}^{i})||Pooling(PE_{t}^{i})),
}
\end{equation}
where $\mathbf {PE_{att}^{i}\in \mathbb R^{\frac{H\times W}{S\times S}\times 2}}$. The final $\mathbf{PE^{i}}$ is as follows:
\begin{equation}
\mathbf {
PE^{i}=PE_{att}^{i}[:,0]*PE_{s}^{i}+PE_{att}^{i}[:,1]*PE_{t}^{i},
}
\end{equation}
where $\mathbf {PE^{i}\in \mathbb R^{\frac{H\times W}{S\times S}\times C}}$. After obtaining the STPE, integrating the tokens obtained through image feature reshaping into the transformer encoder can effectively fuse temporal embryo images for pregnancy prediction.
\subsection{Table Extractor}
\indent Inspired by TabTransformer \cite{tabtransformer} (table extractor), we extract table information of fertility table indicators by transformer \cite{attention}. In order to adapt the table information to the transformer, we specify tabular embedding as a linear layer to upscale the table features $\mathbf{ta}$ to $\mathbf {ta^{f}\in \mathbb R^{N \times 32}}$. Next, we construct a series of transformer layer with multi-head self-attention (MHSA) to extract table information:
\begin{equation}
\begin{gathered}
\mathbf {Q=MLP(ta^{f}), K=MLP(ta^{f}), V=MLP(ta^{f}),}\\
\mathbf {\overline{ta^{f}}=MHSA(Q,K,V)+ta^{f}, \overline{\overline{ta^{f}}}=MLP(LN(\overline{ta^{f}}))+\overline{ta^{f}},}
\end{gathered}
\end{equation}
here, $\mathbf{LN}$ means layer-norm and $\mathbf{MLP}$ is the linear layer.
\subsection{Decoupling Fusion Module}
\indent We use an image feature extractor and a table feature extractor to extract temporal image features and table features, and we denote their outputs as $\mathbf {f_{i}}$ and $\mathbf {f_{t}}$, respectively. In order to fuse information from different modalities at a finer granularity, as shown in Fig.~\ref{framework}, we propose a decoupling fusion module that decouples the features of different modalities into related (common) features and unrelated (unique) features. We extract the related features $\mathbf {f_{i}^{c}}$ and $\mathbf {f_{t}^{c}}$ between modalities through the shared $\mathbf {E_{c}(f_{m};\theta^{c})}$, and extract the unrelated features $\mathbf {f_{i}^{u}}$ and $\mathbf {f_{t}^{u}}$ through $\mathbf {E_{u}(f_{i};\theta_{i}^{u})}$ and $\mathbf {E_{u}(f_{t};\theta_{t}^{u})}$. We decouple the features of different modalities by using the cross reconstruction method \cite{cross1,cross2}. The cross reconstruction loss is as follows:
\begin{equation}
\mathbf{
		\mathcal L_{recon}=\displaystyle \sum_{m=1}^{M}||\mathbf {f_{i}^{m}}-\mathbf {D(f_{t}^{c},f_{i}^{u};\theta_{i})}||_{1}+\displaystyle \sum_{m=1}^{M}||\mathbf {f_{t}^{m}}-\mathbf {D(f_{i}^{c},f_{t}^{u};\theta_{t})}||_{1},
        }
\end{equation}
where $\mathbf {||\cdot||_{1}}$ is the L1-norm, $\mathbf M$ is the dimensionality of the feature, and $\mathbf {D(f_{t}^{c},f_{i}^{u};\theta_{i})}$ and $\mathbf {D(f_{i}^{c},f_{t}^{u};\theta_{t})}$ are the decoders. We obtain the final pregnancy prediction result by fusing the decoupled common and unique features: $\mathbf {y_{b}=Classifier(f_{\overline{i}},f_{\overline{t}})}$, where the classifier is composed of three MLP layers. Finally, we apply the cross-entropy loss to minimize the difference between the predicted results and the true labels. The classification loss is as follows:
\begin{equation}
\mathbf {
{\mathcal L}_{ce}=-\frac{1}{B}\displaystyle \sum_{b=1}^{B}[\overline y_{b}\log{y}_{b}+(1-\overline y_{b})\log(1-{y}_{b})],
}
\end{equation}
where $\mathbf {\overline y_{b}}$ represents the true label and $\mathbf B$ is the batch size. The overall loss function is as follows:
\begin{equation}
\mathbf {
\mathcal L={\mathcal L}_{ce}+\lambda {\mathcal L}_{recon},
}
\end{equation}
where $\lambda$ is a hyperparameter in the loss function.

\section{Experiments}
\subsection{Dataset}
\indent The first dataset used in this study is from Southern Medical University, with a total of 4046 valid embryo transfer cases. Each case includes both image data and tabular data. The image data are the first three days' microscopic images of embryo development. The tabular data include 22 parental fertility indicators. The label of each case is positive or negative, indicating whether a fetal heart was successfully detected or not. We conduct a 5-fold cross validation on the dataset.\\
\indent The second dataset comes from the Peking University International Competition on Ocular Disease Intelligent Recognition (ODIR) \cite{odir}, and the original task was to classify eye diseases from the image modality alone. In order to make the dataset applicable to multiple modalities, we extract a total of 3500 cases of image modality and table modality information for eye disease prediction. Among them, the image modality consists of an eye image, and the table modality consists of 8 indicators converted from keywords. We have already made this dataset accessible to the public as a new multi-modal dataset. We conduct a 4-fold cross validation on the dataset.
\subsection{Evaluation Metric and Experimental Settings}
\indent In the experiments, we evaluate the performance with Accuracy, Area Under the ROC Curve (AUC), and F1-score. AUC is a comprehensive metric to evaluate prediction accuracy. F1-score is an index taking into account the precision and recall of the model predictions.\\
\indent We implement our method with PyTorch on an NVIDIA GeForce RTX 2080 Ti graphics processing unit (GPU). In addition, since the image class tokens and the table class tokens need to share an encoder, we align them with a linear layer. The learning rate of the image extractor is 1e-6, while the learning rate of the table extractor model is 1e-4. The learning rate of the DeFusion model is 1e-5. The above models all use the Adam \cite{adam} optimizer. Following the notation of the Method section, we set $H=224$, $W=224$, $N=22$, and $\lambda=1$.
\begin{table}
\centering
\caption{Comparative and ablation experiment results for pregnancy prediction.}\label{tab1}
\resizebox{400px}{!}{%
\begin{tabular}{|l|l|l|l|l|}
\hline
Modality&Method  & AUC & F1& Accuracy \\ 
\hline 
\multirow{5}{*}{Table}
&MLP  	& 0.684(0.006)	& 0.649(0.012)& 0.641(0.013)	\\
&SVM  	& 0.690(0.006)	& 0.643(0.011)& 0.634(0.011)	\\
&Adaboost  	& 0.708(0.009)	& 0.631(0.016)& 0.640(0.015)	\\
&TabNet   & 0.702(0.014) & 0.643(0.009)& 0.634(0.010) \\
&\textbf{Ours(TabTransformer)} & \textbf{0.713(0.012)} & \textbf{0.661(0.011)}& \textbf{0.653(0.006)}  \\
\hline  
\multirow{10}{*}{Image}
&ResNet+Add  & 0.554(0.008)	& 0.584(0.012)& 0.592(0.014)		\\
&ResNet+LSTM  & 0.572(0.014) & 0.591(0.018)& 0.600(0.026)  \\
&ResNet+Learnable  	& 0.589(0.012)	& 0.597(0.013)& 0.598(0.020)	\\
&ResNet+SinCos  	& 0.602(0.011)	& 0.607(0.014)& 0.612(0.021)	\\
&\textbf{Ours(ResNet+STPE)} & \textbf{0.617(0.012)} & \textbf{0.621(0.022)} & \textbf{0.631(0.016)} \\
\cline{2-5}
&w/o SPE  	& 0.614(0.013)	& 0.611(0.013)& 0.627(0.015)	\\
&w/o TPE  	& 0.596(0.011)	& 0.607(0.008)& 0.613(0.007)	\\
&w/o PEAttention  	& 0.600(0.009)	& 0.605(0.007)& 0.613(0.020)	\\
&w/o STPE  & 0.584(0.016)	& 0.603(0.015)& 0.619(0.021)		\\
&with ViT  & 0.604(0.009)	& 0.609(0.013)& 0.613(0.016)		\\
\hline    
\multirow{8}{*}{Image and Table}
&MMBE  	& 0.723(0.004)	& 0.682(0.021)& 0.681(0.011)	\\
&MOAB   & 0.719(0.003) & 0.681(0.009)& 0.674(0.011) \\
&SFusion 	& 0.718(0.005)	& 0.667(0.013)& 0.658(0.013)	\\
&ConGraph  	& 0.718(0.011)	& 0.649(0.009)& 0.640(0.009)	\\
&HMCAT  	& 0.723(0.011)	& 0.655(0.014)& 0.646(0.012)	\\
&\textbf{Ours(DeFusion)}  & \textbf{0.746(0.003)} & \textbf{0.689(0.017)}& \textbf{0.691(0.010)} \\
\cline{2-5}
&w/o Decoupling Module  	& 0.715(0.007)	& 0.689(0.013)& 0.681(0.014)	\\
&with TabNet  	& 0.735(0.010)	& 0.681(0.011)& 0.683(0.013)	\\
\hline    
\end{tabular}
}
\end{table}
\subsection{Baseline Methods}
\indent We compare our method with the baseline methods in Table~\ref{tab1}; our model achieves superior performance in all the evaluation metrics. Firstly, for the table modality, we compare with an \textbf{SVM} \cite{table} and an \textbf{Adaboost} \cite{table} as uni-modal models based on parental tabular fertility indicators. We also compare with \textbf{TabNet} \cite{tabnet}, which is a neural network model specifically designed for tabular classification tasks. The TabTransformer method we use has the best performance. Secondly, for the image modality, by comparing temporal image fusion models based on Add, LSTM \cite{lstm}, and Transformer with different positional encodings (sin-cos and learnable) \cite{attention}, we find that our STPE achieves the best performance among the transformer-based temporal image fusion strategies. Thirdly, for the image and table modalities, we compare with the recent multi-modal baseline methods that are introduced in Appendices A and B. Our method, as a new category of fusion approach, achieves the best results.
\subsection{Ablation Study}
\indent We conduct ablation experiments to evaluate the contribution of each module in our model in Table~\ref{tab1}. We first ablate the components of the Spatial-Temporal Position Encoding (STPE), namely the Spatial Position Encoding (SPE), the Temporal Position Encoding (TPE), and the Position Encoding Attention (PEAttention), as well as the whole STPE; these variants are denoted as \textbf{w/o SPE}, \textbf{w/o TPE}, \textbf{w/o PEAttention} and \textbf{w/o STPE}, respectively. In addition, by comparing the performance of ResNet \cite{resnet} and ViT (\textbf{with ViT}) \cite{transformer}, we choose ResNet as the backbone of the image extractor. Similarly, we evaluate the contribution of the Decoupling Module through \textbf{w/o Decoupling Module}. In addition, by comparing TabNet \cite{tabnet} (\textbf{with TabNet}) and TabTransformer, we choose TabTransformer as the table extractor.
\subsection{Generalization}
\indent Although the proposed DeFusion model is designed for pregnancy prediction, the principles behind it are universal and can be transferred to other multi-modal medical image analysis tasks. We extend DeFusion to multi-modal eye disease prediction on the ODIR dataset. The final prediction results are shown in Table~\ref{tab4}. Although our model does not achieve the highest accuracy, it performs best in the AUC metric, indicating that our model performs better in terms of overall performance.
\begin{figure}[t]
\centering
\includegraphics[width=0.9\textwidth]{image/similarity.pdf} % Reduce the figure size so that it is slightly narrower than the column.
\caption{(a) Decoupled features of the test dataset visualized in a t-SNE space. (b) PCC matrix of the decoupled features.}
\label{similarity}
\end{figure}
\begin{table}
\centering
\caption{Comparative and ablation experiment results on the ODIR dataset.}\label{tab4}
\resizebox{300px}{!}{%
\begin{tabular}{|l|l|l|l|}
\hline  
Method  & AUC & F1 &Accuracy\\
\hline  
MMBE  	& 0.836(0.007)	& 0.765(0.009)& 0.766(0.007)	\\
MOAB   & 0.827(0.004) & 0.756(0.005)& 0.751(0.004)  \\
SFusion 	& 0.791(0.030)	& 0.755(0.013)& 0.755(0.010)	\\
ConGraph  	& 0.836(0.009)	& 0.756(0.008)& 0.751(0.008)	\\
HMCAT 	& 0.835(0.006)	& 0.773(0.017)& 0.772(0.010)	\\
\textbf{Ours(DeFusion)}  & \textbf{0.842(0.004)} & \textbf{0.772(0.009)}& \textbf{0.770(0.009)} \\
\hline  
w/o Decoupling  	& 0.825(0.003)& 0.763(0.014)& 0.759(0.014)	\\
\hline   
\end{tabular}
}
\end{table}
\subsection{Visualization}
\indent As Fig.~\ref{similarity} shows, we visualize the t-SNE \cite{tsne} results and the average Pearson correlation coefficient (PCC) matrix \cite{pearson} of $\mathbf {f_{i}^{c}}$ (img\_related), $\mathbf {f_{t}^{c}}$ (tab\_related), $\mathbf {f_{i}^{u}}$ (img\_unrelated) and $\mathbf {f_{t}^{u}}$ (tab\_unrelated) decoupled from the test set. The absolute value of the PCC lies between 0 and 1, with a larger value indicating greater relevance. The overlap between the points of $\mathbf {f_{i}^{c}}$ and $\mathbf {f_{t}^{c}}$ after t-SNE dimensionality reduction and the high PCC value between $\mathbf {f_{i}^{c}}$ and $\mathbf {f_{t}^{c}}$ indicate that the model successfully captures relevant and overlapping information between the two modalities. On the contrary, $\mathbf {f_{i}^{u}}$ and $\mathbf {f_{t}^{u}}$ are well separated, indicating that they capture the information that is independent and complementary between the two modalities. These results demonstrate the effectiveness of the decoupling module.

%
%
%
\section{Conclusion}
\indent This paper proposes a Decoupling Fusion Network called DeFusion to integrate the multi-modal information of temporal embryo images and parental fertility table indicators for IVF-ET pregnancy prediction. The superior performance suggests that our model can provide valuable assistance for the selection of embryos for transplantation. Moreover, the effectiveness of the decoupling fusion module has been demonstrated through visualization and generalization experiments. In the future, we will optimize the decoupling module and extend it to more datasets.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
%\midlacknowledgments{We thank a bunch of people.}


\bibliography{midl25_17}


\appendix

\newpage
\section{Related Work}
Fusion of heterogeneous information from multi-modal data can effectively enhance model performance, which is a key project in the medical field of multi-modal learning \cite{multi-modal}. Decision-level fusion and feature-level fusion are two main strategies for multi-modal fusion. The decision-level fusion employs averaged, weighted voting or majority voting \cite{decision-level}  to integrate the outputs of uni-modal models, so as to make the final multi-modal output. \\
\indent Although decision-level fusion is simple to implement, it cannot capture the interactions between hidden features from different modalities. The feature-level fusion fuses the heterogeneous multi-modal data by projecting extracted features into a compact and information-rich multi-modal hidden representation space. Feature-level fusion mainly includes simple-operation based, tensor-based, transformer-based and graph-based methods. The simple-operation method performs concatenation, element addition and element multiplication operations. \cite{simple-op} use two branch encoders to extract image and non-image information, and fuse the extracted information on key point through simple operation for COVID-19 patient severity prediction. The tensor-based method performs outer product between multi-modal feature vectors to form higher-order co-occurrence matrices, which provide richer information than these features alone. \cite{tensorfusion} (\textbf{MOAB}) use a deep orthogonal fusion model to predict the atrial fibrillation from different multi-modal data. The attention mechanism in the transformer-based method has the ability to aggregate features in different feature spaces, making it very suitable for multi-modal alignment and fusion. \cite{transformerfusion} (\textbf{SFusion}) apply transformer to fuse different modalities of brain imaging for tumor segmentation; \cite{co-attention} (\textbf{HMCAT}) use a model based on the cross attention transformer that integrates pathological and radiological images for cancer prediction. Graph-based modeling and inference can provide a deeper understanding of disease information by discovering complex relationships between hidden disease tissue regions. \cite{graphfusion} (\textbf{ConGraph}) transform images and non-images into graph nodes based on fully connected graph attention network, and fuse information among nodes to predict Parkinson’s disease.

\section{Baseline Methods}
\subsection{Baseline methods with image modality}
\textbf{ResNet+Add} applies ResNet to extract image information $F_{i}\in \mathbb R^{512}$ from the first three days of embryonic development, where $i\in \{1,2,3\}$. Then, $F_{1}$, $F_{2}$ and $F_{3}$ are fused through an addition operator. Finally, we use a classifier consisting of three non-linear layers for pregnancy prediction.
\textbf{ResNet+LSTM} replaces the addition operator with LSTM on the basis of \textbf{ResNet+Add}.
\textbf{No Position}, \textbf{Learnable} and \textbf{Sin-Cos} are the results of replacing our proposed spatial-temporal position encoding with different position encoding.
\subsection{Baseline methods with both image modality and table modality}
As shown in Fig. \ref{baseline}, we compare different multi-modal fusion methods. To ensure fairness in comparison, all methods use the same backbone. In addition, according to the structural characteristics of different models, TransformerFusion (\textbf{SFusion} \cite{transformerfusion}, \textbf{HMCAT} \cite{co-attention}) and GraphFusion (\textbf{ConGraph} \cite{graphfusion}) use all image and table tokens, while AddFusion (\textbf{MMBE} \cite{fusion}), TensorFusion (\textbf{MOAB} \cite{tensorfusion}), and our DecouplingFusion (\textbf{DeFusion}) only use image class token and table class token.\\
As shown in Fig.~\ref{baseline} (a) and (b), \textbf{AddFusion} and \textbf{TensorFusion} directly fuse the class tokens of the two modalities using an addition operation and an outer product operation, respectively. As shown in Fig.~\ref{baseline} (c) and (d), \textbf{TransformerFusion} and \textbf{GraphFusion} use the Transformer \cite{transformer} and the Graph Attention Network \cite{gat} as the fusion network to fuse all tokens of the two modalities, respectively. Our method represents a new category of multi-modal fusion.

\begin{figure*}[h]
\centering
\renewcommand{\thefigure}{A.1}
\includegraphics[width=1\textwidth]{image/baseline.pdf} % Reduce the figure size so that it is slightly narrower than the column.
\caption{Comparison of different multi-modal fusion frameworks.}
\label{baseline}
\end{figure*}

\newpage
\section{Dataset Samples}
In Fig.~\ref{concat} and Table~\ref{indicator}, we present some examples of the image modality and the table modality used for pregnancy prediction, respectively. When processing image data, we use the following preprocessing during training and testing: resizing to 256 pixels and then center cropping to 224 pixels. Finally, normalization with a mean of 0.566 and a variance of 0.063 is used, where the mean and variance were obtained from statistics of the whole dataset. When processing tabular data, we replace missing features with the average of the corresponding feature. Then we input the features into the neural network after min-max normalization.

\begin{figure*}[h]
\centering
\renewcommand{\thefigure}{A.2}
\includegraphics[width=0.8\textwidth]{image/concat.pdf} % Reduce the figure size so that it is slightly narrower than the column.
\caption{Ten examples of temporal embryo images. Each column represents an example, containing images of the first, second, and third day of embryonic development.}
\label{concat}
\end{figure*}

\begin{table}[h]
  \centering
  \renewcommand{\thetable}{A.1}
  \caption{Ten examples of parental fertility indicators. BMI represents Body Mass Index, AFC represents Antral Follicle Counting, HCG represents Human Chorionic Gonadotropin, E2 represents Estradiol, FSH represents Follicle-Stimulating Hormone, BT represents Before Semen Treatment, AT represents After Semen Treatment and - represents the missing value.}\label{indicator}
  \resizebox{400px}{!}{%
    \begin{tabular}{|l|rrrrrrrrrr|}
    \hline 
    Number & 1     & 2     & 3     & 4     & 5     & 6     & 7     & 8     & 9     & 10 \\
    \hline 
    Female age & 39    & 25    & 44    & 34    & 31    & 36    & 33    & 29    & 28    & 29 \\
    Male age & 46    & 24    & 45    & 36    & 35    & 38    & 38    & 32    & 35    & 31 \\
    Female BMI & 22.8  & 18.7  & 22.1  & 21.5  & 18.5  & 23.9  & 23.4  & 18    & 25    & 20.4 \\
    AFC   & 10    & - & - & 8     & 18    & 9     & 12    & 16    & 10    & 8 \\
    Number of obtained oocytes & 5     & 22    & 3     & 10    & 17    & 14    & 5     & 11    & 7     & 7 \\
    Number of mature oocytes & 4     & 14    & 3     & 10    & 15    & 10    & 5     & 8     & 7     & 7 \\
    Available embryos & 3     & 2     & 3     & 7     & 7     & 3     & 4     & 3     & 5     & 7 \\
    High-quality embryos & 2     & 1     & 1     & 7     & 5     & 1     & 2     & 2     & 4     & 4 \\
    HCG Day E2 & 1037  & 2271  & 1782  & 1483  & 3118  & 3693  & 731.9 & 1891  & 1309  & 3767 \\
    HCG intimal thickness & 7.5   & 15.5  & 8     & 9.3   & 9.4   & 11.7  & 9.8   & 15    & 17.5  & 12.5 \\
    FSH   & 8     & - & - & 7.81  & 7.22  & 4.07  & 4.61  & 6.53  & 7.3   & - \\
    Infertility years & 2     & - & - & 9     & - & 7     & 10    & 1     & 3     & 4 \\
    Volume BT & 1.4   & 4.1   & 4     & 2.6   & 2     & 1.5   & 1.5   & 1     & 2.5   & 0.3 \\
    Concentration BT & 6     & 2     & 25    & 20    & 0.01  & 3     & 4     & 10    & 18    & 40 \\
    Non forward movement BT & 5     & 10    & 5     & 15    & 0.5   & 5     & 3     & 10    & 10    & 5 \\
    Inactivity BT & 90    & 80    & 85    & 60    & 0.5   & 90    & 95    & 85    & 80    & 75 \\
    Forward movement BT & 5     & 10    & 10    & 25    & 0.5   & 5     & 2     & 5     & 10    & 10 \\
    Volume AT & 0.1   & 0.1   & 0.6   & 0.2   & 0.2   & 0.15  & 0.2   & 0.15  & 0.3   & 0.3 \\
    Concentration AT & 1     & 2     & 3     & 8     & 0.01  & 1     & 1     & 1     & 1     & 3 \\
    Non forward movement AT & 20    & 10    & 5     & 5     & 0.5   & 10    & 20    & 20    & 20    & 10 \\
    Inactivity AT & 30    & 25    & 5     & 5     & 0.5   & 10    & 10    & 40    & 10    & 5 \\
    Forward movement AT & 50    & 65    & 90    & 90    & 0.5   & 80    & 70    & 30    & 60    & 85 \\
    \hline 
    \end{tabular}%
    }
\end{table}%

\clearpage
\section{Supplementary Experiments}
\subsection{Table Modality Experiments}Fig.~\ref{tabular} shows the details of the table extractor (TabTransformer).

\begin{figure*}[h]
\centering
\renewcommand{\thefigure}{A.3}
\includegraphics[width=0.8\textwidth]{image/tabular.pdf} % Reduce the figure size so that it is slightly narrower than the column.
\caption{The details of TabTransformer (table extractor).}
\label{tabular}
\end{figure*}

\subsection{Image Modality Experiments}Table~\ref{tab3} shows the performance with the image modality only. We use images of embryo development on the first, second, and third day for pregnancy prediction, with ResNet as the backbone. The results indicate that, as the embryo continues to develop, its images provide greater assistance in predicting pregnancy.

\begin{table}[h]
\centering
\renewcommand{\thetable}{A.2}
\caption{Comparative experiment results for pregnancy prediction with image modality.}\label{tab3}
\resizebox{400px}{!}{%
\begin{tabular}{|l|l|l|l|l|}
\hline 
Modality&Method  & AUC & F1& Accuracy \\ 
\hline  
\multirow{4}{*}{Image}
&ResNet(First Day)  	& 	0.476(0.004)& 0.515(0.009)&0.502(0.009) 	\\
&ResNet(Second Day)   	& 	0.565(0.007)& 0.572(0.013)&0.560(0.013) 	\\
&ResNet(Third Day)    & 	0.593(0.007)& 0.598(0.012)& 0.595(0.010)	\\
&\textbf{Ours(ResNet+STPE)(Three Days)} & \textbf{0.617(0.012)} & \textbf{0.621(0.022)}& \textbf{0.631(0.016)}  \\
\hline     
\end{tabular}
}
\end{table}

\subsection{Computational Complexity}As shown in Table~\ref{computation}, we compare the computational complexity of multi-modal fusion methods. Although our model is not the most efficient, its computational complexity remains competitive with that of the other methods, while it achieves the best accuracy.

\begin{table}[h]
\centering
\renewcommand{\thetable}{A.3}
\caption{Comparison of computational complexity of different multi-modal fusion methods. For GFLOPs, smaller is better. Training time (in seconds) is measured for a single epoch on a 2080Ti GPU with 12G memory; smaller is better. Frames per second (FPS) is measured on an i7-6850K@3.60GHz CPU; larger is better.}\label{computation}
\resizebox{400px}{!}{%
\begin{tabular}{|l|l|l|l|l|l|l|}
\hline 
Method&MMBE  & MOAB & SFusion& ConGraph& HMCAT & Ours(DeFusion)\\ 
\hline  
GFLOPs($\downarrow$)&12.66 	& 13.30	& 13.66& 14.01& 16.19& 12.67	\\
\hline  
Training Time($\downarrow$)&50	& 91	& 65& 68& 79& 64	\\
\hline  
Inference FPS($\uparrow$)&5.05 	& 2.69	& 3.34& 3.54& 2.79& 3.51	\\
\hline     
\end{tabular}
}
\end{table}


\subsection{Uni-modal Experiment on the ODIR Dataset}As shown in Table~\ref{ODIR}, we compare the uni-modal methods on the ODIR dataset for eye disease prediction.

\begin{table}[h]
\centering
\renewcommand{\thetable}{A.4}
\caption{Uni-modal comparison on the ODIR dataset for eye disease prediction.}
\resizebox{300px}{!}{%
\begin{tabular}{|l|l|l|l|l|}
\hline
Modality & Method & AUC & F1& Accuracy   \\
\hline
\multirow{2}{*}{Image} & ResNet & 0.714(0.016)&0.677(0.015)& 0.674(0.015) \\
& ViT & 0.736(0.006)&0.685(0.013)& 0.680(0.014) \\
\hline
\multirow{3}{*}{Table} & SVM & 0.802(0.003) &0.722(0.001)& 0.714(0.007) \\
& TabNet & 0.793(0.024)&0.708(0.023)& 0.701(0.015) \\
& MLP  & 0.793(0.001)&0.731(0.007)& 0.727(0.003) \\
\hline
\end{tabular}
}
\label{ODIR}
\end{table}

\subsection{Generalization to Another Dataset}We collected a dataset of 218 cases from Guangzhou Women and Children's Medical Center as an additional test set for our DeFusion model; it contains 56 pregnant cases and 162 non-pregnant cases. As shown in Table~\ref{addtiontest}, our model exhibits a certain degree of generalization ability without fine-tuning.

\begin{table}[h]
\centering
\renewcommand{\thetable}{A.5}
\caption{Generalization experiments of the model on other hospital datasets.}
\resizebox{300px}{!}{%
\begin{tabular}{|l|l|l|l|l|}
\hline
Modality & Method & AUC & F1& Accuracy   \\
\hline
\multirow{1}{*}{Image+Table} & Ours(DeFusion) & 0.616&0.660& 0.642 \\
\hline
\end{tabular}
}
\label{addtiontest}
\end{table}


\newpage
\section{Interpretability}To analyze the reasons for the success of the model, we conduct an interpretability analysis from two aspects. First, as shown in Fig.~\ref{shap}, we use SHAP~\cite{shap} to output the importance ranking of clinical indicators in our model, which focuses more on features such as female age and high-quality embryos. Specifically, Fig.~\ref{shap}(a) ranks the importance of the table features, and Fig.~\ref{shap}(b) is a beeswarm plot that depicts the SHAP values of each sample under different features.

\begin{figure}[h]
\centering
\renewcommand{\thefigure}{A.4}
\includegraphics[width=1\textwidth]{image/shap.pdf} % Reduce the figure size so that it is slightly narrower than the column.
\caption{SHAP interpretability of clinical tabular indicators.}
\label{shap}
\end{figure}

\newpage
\indent Second, we use Grad-CAM~\cite{gradcam} to visualize the class activation maps of the embryonic development images from the first three days in Fig.~\ref{gradcam}. We can see that the areas our model focuses on are the edges of the embryonic cells. These interpretable results are consistent with the experience of obstetricians and gynecologists.
\begin{figure}[h]
\centering
\renewcommand{\thefigure}{A.5}
\includegraphics[width=0.8\textwidth]{image/gradcam.pdf} % Reduce the figure size so that it is slightly narrower than the column.
\caption{Grad-CAM visualization of embryonic images, with each row representing a three-day image.}
\label{gradcam}
\end{figure}

\newpage
\section{Limitation}Although our method outperforms other methods, both our method and our dataset have limitations. For our method, the premise is that the multi-modal data are correlated. If there is no correlation between the modalities, our method may not work well. In addition, when decoupling unique features and common features, we only use a simple cross-reconstruction loss constraint, which is very weak. Although the t-SNE visualization of the decoupled features and the Pearson correlation coefficient demonstrate the effectiveness of the decoupling method, adding stronger loss constraints may make the decoupling process smoother. Finally, because we add the decoupling module to the multi-modal fusion process, our computational complexity is higher than that of the simple ADD fusion, which is not conducive to deploying the model on end devices.

For our dataset, we only collected 4046 cases, which is not enough for deep learning. Moreover, there are some missing values in our clinical indicator data, which is unfavorable for prediction. In addition, our image data consist of three images taken one day apart using a normal microscope, whereas many studies now use time-lapse microscopes, which can acquire images at the hour or even minute level, so our image data lack a lot of temporal information compared to other studies.


\end{document}
