\section{Experimental Results}
%This section details the comprehensive experiments conducted using PyTorch on NVIDIA GTX 3090 GPUs. 
The dataset comprises microscopic images of 643 human embryos, sourced from a collaborating hospital and ethically approved, divided into two categories based on post-surgery results: successful implantation (n=310) and implantation failure (n=333). 
%The ethical approval number is an internal approval number: Sichuan Jinxin Xinan Women \& Children Hospital (2019) Reproductive Ethics Approval No.~(002). 
For each embryo, we manually take three microscopic images at different focal planes: stage, ICM, and TE. Due to the inherent movement of embryos during imaging, the stage FP-image is used as the reference for aligning the other two images. We evaluate the performance of our \model using accuracy (ACC, \%), sensitivity (SEN, \%), positive predictive value (PPV, \%), negative predictive value (NPV, \%), F1 score, and area under the receiver operating characteristic curve (AUC), and compare it with previous methods. To enhance the robustness of our findings and avoid biases from a limited dataset, we use stratified sampling to construct a five-fold cross-validation split. The results presented are the averages over the five folds.
% In this section, we conduct the following whole experiments by PyTorch with NVIDIA GTX 3090 GPU. \model is trained over 100 epochs with a mini-batch size of 8, employing the SGD optimizer with parameters: momentum = 0.9, weight decay = $1\times10^{-4}$, $\lambda$ = 1, and learning rate = $3\times10^{-3}$. We evaluate the performance of our \model using accuracy (ACC, \%), sensitivity (SEN, \%), positive predictive value (PPV, \%), negative predictive value (NPV, \%), F1 score, and area under the receiver operating characteristic curve (AUC) compared to previous methods. 

% \subsection{Dataset} 
% We use microscopic images of 643 human embryos from our collaborating hospital to train and validate our \model model. 
% The ethical approval number is an internal approval number: 
% %Chengdu Jinjiang 
% Sichuan Jinxin Xinan Women \& Children Hospital
% % Chengdu Xinan Gynecology Hospital For Women and Children Health
% % \footnote{The name of the hospital is not given here due to anonymity.} 
% (2019) Reproductive Ethics Approval No.~(002). The 643 human embryos were classified into two groups based on post-surgery results: successfully implanted (n=310) and implantation failure (n=333). For each embryo, we manually took three microscopic images of different focal planes: stage, ICM, and TE. Because the three FP-images could not be perfectly aligned due to embryo movement\eat{the movement of the embryo} during focusing, we take the stage FP-image as the benchmark to align the other two FP-images. To avoid poor generalization of the experimental results caused by the small amount of data, we stratify the sampling of the data and create a five-fold cross-validation dataset. The final results of the experiments are the average values of the five-fold cross-validation.

% \subsection{Evaluation Metrics}
% We use accuracy (ACC, \%), sensitivity (SEN, \%), positive predictive value (PPV, \%), negative predictive value (NPV, \%), F1 score, and area under the receiver operating characteristic curve (AUC) to verify the performance of our \model method comparing to previous methods on the above dataset.


% \subsection{Setups}
% We use PyTorch to build and train our \model, and use the SGD optimizer with momentum = 0.9, weight decay = $1\times10^{-4}$, $\lambda$ = 1, and learning rate = $3\times10^{-3}$. We train the network for 100 epochs with a mini-batch of 8. We first align the three FP-images because blastocyst often moves slightly when photographing the multiple FP-images. 
%The input images are scaled to size $224 \times 224$. Random cropping, flipping, and rotation are used for data augmentation during training; only center cropping is used in the inference stage. 
% The Fusion Module is applied after the $3^{rd}$ layer, and the squeeze output channel number in the Fusion Module is 4 in our experiments.
%The three convolutional layers in CI-Gen use $13\times13$ convolutional kernel, and their input-output channels are $9-64, 64-128, 128-3$, respectively. Spatial-Channel SMHA combination is used in Fusion Module.

\subsection{Comparison to State-of-the-Art Methods}





% \begin{figure}[t!]
%     \centering
%     \includegraphics[scale=1]{IMG/Early.pdf}
%     \caption{Early Fusion Mode: three focal plane images' grayscales are concatenated as a 'RGB' image, and then the 'RGB' image is classified through the classification network to obtain the classification result}
%     \label{Early}
% \end{figure}

% \begin{figure}[t!]
%     \centering
%     \includegraphics[scale=0.7]{IMG/Late.pdf}
%     \caption{Late Fusion Mode: three images represent as three feature vectors after passing through independent backbone networks. Then these feature vectors are concatenated as a longer feature vector for final classification.}
%     \label{Late}
% \end{figure}
We modify known state-of-the-art (SOTA) methods to fit our dataset. (1) Erlich et al.~\cite{IErlich2022PseudoCL} used ResNet50~\cite{KaimingHe2015DeepRL} as the feature extractor. (2) STEM~\cite{QiuyueLiao2021DevelopmentOD} classified blastocyst and nonblastocyst images with DenseNet~\cite{GaoHuang2016DenselyCC}. (3) STORK~\cite{PegahKhosravi2019DeepLE} trained InceptionNet-V1~\cite{ChristianSzegedy2014GoingDW} for embryo quality grading. (4) Fordham et al.~\cite{DanielEFordham2022EmbryologistAW} used EfficientNetV2~\cite{MingxingTan2021EfficientNetV2SM} as the image encoder. These methods cover the widely used CNN models, and all of them achieved state-of-the-art performance on their respective tasks. Hence, we 
%decided to 
migrate these methods to our dataset\eat{ for testing} and apply \eat{two fusion methods for each model: }early fusion and late fusion to them for a fair comparison. Specifically,
% As shown in Fig.\ref{Early}, 
Early Fusion~\cite{AstridZeman2021DeepLF} concatenates the grayscales of the three FP-images into an `RGB' image, while Late Fusion 
% (Fig.\ref{Late}) 
uses three individual backbones to extract feature maps and concatenates them before the classifier. Each model is retrained on our dataset, and the best parameters for accuracy are selected for testing. 
%The results are reported in Table~\ref{basline}, \eat{by comparing different fusion methods, }from which we observe the following.
As shown in Table~\ref{basline}, compared with the known SOTA methods, our \model outperforms them on all the evaluation metrics. For instance, our accuracy is 4.8 percentage points higher than that of the best existing method, and we achieve gains of 3.0 percentage points in positive predictive value and 4.2 percentage points in negative predictive value. This is because CI-Gen initially eliminates redundancy and focuses on the significant regions of each FP-image. The subsequent Fusion Module captures key features of FP-images and fuses them with the core feature map, which further enhances multi-modal fusion. Therefore, our \model comprehensively outperforms the Early Fusion and Late Fusion methods.

% (a) In both the Early Fusion and Late Fusion groups, STORK outperforms known methods across most of the metrics. This can be attributed to the presence of the Inception module within STORK, which incorporates parallel convolutional layers and pooling layers, along with convolutional kernels of varying scales. This design enables the model to capture features in different scales, enhancing its ability to fuse information from various modalities more effectively. As a result, STORK demonstrates an improved capacity for understanding and representing multi-modal data.

% (b) The Early Fusion method in each backbone model has better classification performance than the Late Fusion one. We believe this is due to the high similarity among the three FP-images. Similar images bring redundant feature vectors before Late Fusion, which brings many noisy features and results in worse classification performance.

% (c) Compared with the known SOTA methods, our \model outperforms them in all the evaluation metrics. For instance, our accuracy is 4.8\% higher than the best existing method, and we achieve a 3\% increase in positive predictive value and a 4.2\% increase in negative predictive value. This is because CI-Gen initially eliminates redundancy and focuses on the significant regions of each FP-image. The subsequent Fusion Module captures key features of FP-images and fuses them with the core feature map, which further enhances multi-modal fusion. Consequently, our \model comprehensively outperforms the Early Fusion and Late Fusion methods.


% \begin{figure}[t!]
%     \centering
%     \includegraphics[scale=0.7]{IMG/visual.pdf}
%     \caption{Grad-CAM of the three FP-images in their Feature Layers}
%     \label{visual}
% \end{figure}

% ===================== RESULTS =====================

\begin{table*}[t]\scriptsize
    \centering
    \caption{Quantitative comparison of \model and SOTA methods on five-fold cross-validation. (E) denotes early fusion and (L) indicates late fusion. We use \textbf{bold} to indicate the best results and \uline{underline} to represent the second-best results.}
    \vspace{-2ex}
    {
    \begin{tabular}{c|c|c|c|c|c|c}
        \hline
        Method & ACC (\%) & F1 & AUC & SEN (\%)& PPV (\%)& NPV (\%)\\
        \hline
        (E) Erlich et al.  & 59.0 & 58.3 & 55.0 & 52.9 & 58.4 & 59.4  \\
%        \hline
        (E) STEM  & 59.3 & 57.2 & 56.0 & 50.0 & 59.4 & 59.1  \\
%        \hline
        (E) STORK  & \uline{60.8} & \uline{60.8} & \uline{60.1} & 61.6 & 59.2 & \uline{62.5}\\
 %       \hline
        (E) Fordham et al.  & 58.5 & 55.5 & 55.0 & 56.1 & 57.5 & 59.7 \\
        \hline
        (L) Erlich et al. & 56.9 & 51.8 & 55.5  & 56.5  & 55.2 & 58.2  \\
   %     \hline
        (L) STEM  & 58.3 & 55.5  & 54.8  & 41.6  & 59.8  & 57.3   \\
   %     \hline
        (L) STORK  & 59.1  & 58.2  & 56.2  & \uline{63.9}   & 57.0  & 61.7 \\
    %    \hline
        (L) Fordham et al.   & 57.4  & 54.4  & 54.2 & 31.6  & \uline{61.5}  & 55.9  \\
	\hline
        \model (ours) & \textbf{65.6} & \textbf{65.6} & \textbf{62.8} & \textbf{64.5} & \textbf{64.5} & \textbf{66.7} \\
        \hline
    \end{tabular}}
    \label{basline}
    \vspace{-3ex}
\end{table*}

\subsection{Ablation Study}
We design ablation experiments, shown in Tables~\ref{ablation:input},~\ref{ablation}, and~\ref{ablation2}, to verify the improvement brought by each component in our MFIF-Net. 
%(i) A single type of focal plane images (ICM, TE, stage): It directly inputs only one type of focal plane images and trains on ResNet-18 to predict implantation outcomes. (ii) Core images only: It uses only the core images as input for ResNet-18 to predict blastocyst implantation outcomes. (iii) Fusion without core images: In this model, every focal plane feature map fuses with the other two focal plane feature maps. (iv) Different combinations of SMHA in Fusion Module. Table~\ref{ablation:input}, Table~\ref{ablation} and Table \ref{ablation2} report the detailed results, from which we draw several conclusions.

%We design ablation experiments to verify the improvement brought by each component of \model. (i) One FP-image: It directly inputs only one FP-image and trains on ResNet-18 to predict implantation outcomes. (ii) Core image only: It uses only the core image as input for ResNet-18 to predict implantation outcomes. (iii) Fusion without the core image: In this model, every focal plane feature map fuses with the other two focal plane feature maps.
% (iv) Different combinations of SMHA in the Fusion Module. 
%Table \ref{ablation} reports the detailed results, from which we draw several conclusions.

% (a) The differences between the Concat version and the versions for a single type of FP-images suggest that it is better to analyze the three FP-images jointly in order to determine whether a blastocyst can be successfully implanted.
\textbf{Effects of Different Types of FP-images and Core Image.}
To demonstrate the importance of different types of FP-images, we conduct experiments on single-type FP-image classification, as shown in Table~\ref{ablation:input}. In this table, ICM, TE, and stage represent experiments using only one type of FP-images for classification. ``Concat'' indicates an experiment where the three types of FP-images are concatenated and used for classification~\cite{AstridZeman2021DeepLF}, and ``Core Image'' represents an experiment using only the core image generated by our proposed Core Image Generator. From the results in Table~\ref{ablation:input}, it can be observed that both ``Concat'' and ``Core Image'' outperform the models using only a single type of FP-images in all the metrics, indicating that utilizing information from all the three types of images effectively improves the model performance. Furthermore, our proposed Core Image Generator outperforms ``Concat'' in most of the metrics, with only a slight decrease of 0.1 in F1 score, demonstrating that our Core Image Generator achieves better fusion of different FP-image types by simply weighting the three FP-images.



%(b) Compared with the Concat version, the Core Image version achieves a good effect by simply weighting the three FP-images. We think this is because the importance of different FP-images in different regions is different.

%(c) Different from the early fusion of the Core Image version and the Concat version, the fusion method fuses different FP-images through mid-fusion. The improvement brought by the 
%equivalent 
%Fusion version with the three FP-images is not as 
%convenient and 
%big as the Core Image version.
\textbf{Effects of Different Modules.}
To validate the effectiveness of the two components in our method, CI-Gen and KFFNet, we conduct experiments and the results are shown in Table~\ref{ablation}. Here, ``Concat'' refers to the fusion of the three types of FP-images, which is consistent with the results in Table~\ref{ablation:input}. ``Core Image'' represents the experiments using only the core image generated by CI-Gen, and ``Fusion Layer'' denotes the model that combines the three types of FP-images using the proposed fusion layer in KFFNet. From the results in Table~\ref{ablation}, it can be observed that the benefits of the Fusion Layer are not as significant as those of the core image. However, considering the information loss in the Core Image version, we add the Fusion Module with the core image and the three FP-images to supplement information and enhance features. The final results demonstrate that the overall performance of our \model significantly outperforms the other versions in Table~\ref{ablation}.

\begin{table*}[t!]\scriptsize
    \centering
    % \vspace{-0.6cm}
    \caption{Effects of three different types of FP-images and core image.}
    % We use \textbf{bold} to indicate the best results and \uline{underline} to represent the second-best results.}
    \vspace{1ex}
    {
    \begin{tabular}{c|c|c|c|c|c|c}
        \hline
        Method & ACC (\%) & F1 & AUC & SEN (\%) & PPV (\%)& NPV (\%) \\
        \hline
        ICM  & 57.1 & 52.6 & 55.3 & 48.4 & 56.4 &57.2  \\
        \hline
        TE  & 58.3 & 57.7 & 55.2 & 58.7  & 56.8 & 60.0 \\
        \hline
        Stage  & 58.2 & 56.3 & 54.2 & 50.6  &57.5 & 58.3 \\
        \hline
Concat~\cite{AstridZeman2021DeepLF} & \uline{61.4} & \textbf{61.4} & \uline{60.4} & \uline{59.4} & \uline{60.3} & \uline{62.4} \\
        \hline
        Core Image & \textbf{62.2} & \uline{61.3} & \textbf{60.8} & \textbf{62.6}& \textbf{60.6} & \textbf{63.7} \\
        \hline
    \end{tabular}}
    \label{ablation:input}
    \vspace{-4ex}
\end{table*}


\begin{table*}[t!]\scriptsize
    \centering
    % \vspace{-0.6cm}
    \caption{Effects of different modules.}\label{ablation}
    % We use \textbf{bold} to indicate the best results and \uline{underline} to represent the second-best results.}    
    \vspace{1ex}
    \begin{tabular}{c|c|c|c|c|c|c}
        \hline
        Method & ACC (\%) & F1 & AUC & SEN (\%) & PPV (\%) & NPV (\%) \\
        \hline
        Concat~\cite{AstridZeman2021DeepLF} & 61.4 & \uline{61.4} & 60.4 & 59.4  & 60.3 & 62.4 \\
        \hline
        Core Image & \uline{62.2} & 61.3 & \uline{60.8} & \uline{62.6}  & 60.6 & \uline{63.7} \\
        \hline
        Fusion Layer & 61.8 & 60.1 & 58.7 & 53.2  & \uline{62.1} & 61.3 \\
        \hline
        \model & \textbf{65.6} & \textbf{65.6} & \textbf{62.8} & \textbf{64.5} & \textbf{64.5} & \textbf{66.7} \\
        \hline
    \end{tabular}
    \vspace{-4ex}
\end{table*}

\begin{table}[t!]\scriptsize
    \centering
    \caption{Effects of different combinations of self-SMHA and cross-SMHA.}\label{ablation2}
    % We use \textbf{bold} to indicate the best results.}
    \vspace{1ex}
    {
    \begin{tabular}{c|c|c|c|c|c|c|c}
        \hline
        Self-SMHA & Cross-SMHA &ACC \eat{(\%)} & F1 & AUC & SEN \eat{(\%)} & PPV \eat{(\%)}& NPV \eat{(\%)}\\
        \hline
        Channel & Channel &  64.1 & 63.8 & 61.3 & \textbf{69.0} & 61.5 &	\textbf{67.1}  \\
        \hline
        Spatial & Spatial & 63.8 & 63.4 & 62.3 & 60.0  & 63.3 & 64.2  \\
        \hline
        Channel & Spatial &  64.2 & 64.1 & 62.0 & 56.1  & \textbf{65.3} & 63.6  \\
        \hline
        Spatial & Channel & \textbf{65.6} & \textbf{65.6} & \textbf{62.8} & 64.5 & 64.5 & 66.7 \\
        \hline
    \end{tabular}}
    \vspace{-3ex}
\end{table}

%(d) Considering the information loss in the Core Image version, we add the Fusion Module with the core image and the three FP-images for information supplement and feature enhancement. The final results show that the performance of our overall \model model surpasses those of the other versions in Table \ref{ablation} and the baselines in Table \ref{basline} considerably.

%{SMHA Combination Experiments.}
\textbf{Effects of Different Combinations of Self-SMHA and Cross-SMHA.} To examine the effects brought by different SMHA combinations, we conduct an additional ablation experiment presented in Table~\ref{ablation2}. Here, the first column and the second column respectively indicate whether the SMHA used in self-SMHA and cross-SMHA is channel-SMHA or spatial-SMHA. As shown in Table~\ref{ablation2}, the combinations with different SMHAs perform better than the combinations with the same SMHA modules. This is because the Fusion Module made up of the same SMHAs cannot fully enhance features. 
% 
In addition, the channel-channel model has the best SEN and NPV.
% , it is because the dataset is imbanlanced and the model's learning ability is relatively weak, it is more inclined to make predictions through the data distribution. 
This is because this model is weak in spatial feature extraction and cannot identify the targets in the stage, ICM, and TE areas well. Therefore, this model is more likely to classify samples as positive, which leads to an increase in SEN and NPV.
% 
The spatial-channel combination is better than the channel-spatial one. We believe this is because the spatial information in the blastocyst FP-images is quite obvious, and self-spatial-SMHA can generate useful feature maps without the core image's information. Then, with the supervision of the core image, the most valuable channels are enhanced for further fusion.





% \subsection{Computational Cost Comparison}
% %Squeeze Multi-Head Attention (SMHA) replaces the \eat{comprehensive}original query by the squeezed one for computational cost reduction. Table \ref{thop} reports that SMHA reduces the computational costs of MHA to 50.32\%, 65.76\%, and 58.04\% with channel SMHA, spatial SMHA, and the overall Fusion Module (in Fusion Layer 1), respectively. 
% % SMHA mitigates the computationally expensive problem.
% We calculate the computational savings achieved by Squeeze Multi-Head Attention (SMHA), and the results are shown in Table~\ref{thop}. In Channel SMHA, the computational cost is reduced by 50.32\% compared to the original MHA. Similarly, in spatial SMHA, the computational cost is also reduced by 65.76\% compared to the original MHA. Overall, the Fusion Module (in Fusion Layer 1) achieves a remarkable total reduction of 58.04\% in computational cost.

% \begin{table}[t!]\scriptsize
%     \centering
%     \caption{Computational cost comparison between MHA and SMHA. }
%     {
%     \begin{tabular}{c|c}
%         \hline
%         Method & MFlops (in Fusion Layer 1)\\
%         \hline
%         MHA & 157.35\\
%         \hline
%         channel-SMHA & 79.18 (50.32\%) \\
%         \hline
%         spatial-SMHA & 103.48 (65.76\%)\\
%         \hline
%         Fusion Module (MHA) & 314.7 \\
%         \hline
%         Fusion Module (SMHA) & 182.67 (58.04\%)\\
%         \hline
%     \end{tabular}}
%     \label{thop}
% \end{table}

% \subsection{Visualization Results}
% In Fig.~\ref{visual}, we show Grad-CAM\cite{RamprasaathRSelvaraju2016GradCAMVE} results of the three focal plane images in their Feature Layers for two blastocysts. On blastocyst A, the three Feature Layers focus on the stage, ICM, and TE handover positions, respectively. In comparison, on blastocyst B, the Feature Layer for TE does not record many of TE's specific features.

% We believe that such differences are due to some common information in the TE and stage images, which is more pronounced in immature embryos because the blastocyst is not fully expanded and the overlap of TE and stage images is quite high. However, on the blastocyst at the later stage of development, after the blastocyst cell breaks through the zona pellucida (ZP), the key areas of TE and stage images become more distinguished.

% % \begin{figure}[t]
% %     \centering
% %     \includegraphics[width=1.0\linewidth]{IMG/Grad-CAM.pdf}
% %     \caption{Grad-CAM results of the three focal plane images for two blastocyst samples: Blastocyst A is a positive sample and Blastocyst B is a negative sample.}
% %     \label{visual}
% % \end{figure}

% \begin{figure*}[h!]
%     \centering
%     % \hspace{-3.75ex}
%     \subfigure[Blastocyst A]{
%          \centering
%          \includegraphics[width=0.4\textwidth]{IMG/Grad positive.pdf}
%          \label{fig:P_grad}
%     }
%     \subfigure[Blastocyst B]{
%          \centering
%          \includegraphics[width=0.4\textwidth]{IMG/Grad negative.pdf}
%          \label{fig:N_grad}
%     }
%     \caption{Grad-CAM results of two \eat{Examples of microscopic images at different focal planes }blastocysts. The prediction of implantation outcome of blastocyst A is consistent with the actual clinical result while blastocyst B is a misclassified sample.}
%     \label{visual}
%     % \vspace{-5ex}
% \end{figure*}