\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
% \usepackage{mwe} % to get dummy images
% \jmlrvolume{-- Under Review}
% \jmlryear{2025}
% \jmlrworkshop{Full Paper -- MIDL 2025 submission}
% \editors{Under Review for MIDL 2025}

\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- 205}
\editors{Accepted for publication at MIDL 2025}

\title[Machine Learning with Scarce Data: Ejection Fraction Prediction Using PLAX View]{Machine Learning with Scarce Data:\\
Ejection Fraction Prediction Using PLAX View}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Zhiyuan Gao} \Email{zgao2@caltech.edu}\\
\addr California Institute of Technology, Pasadena, California, USA\\
\AND
\Name{Dominic Yurk} \Email{dominicyurk@gmail.com}\\
\addr Asari AI, San Francisco, California, USA\\
\AND
\Name{Yaser S. Abu-Mostafa} \Email{yaser@caltech.edu}\\
\addr California Institute of Technology, Pasadena, California, USA\\
}

\begin{document}

\maketitle

\begin{abstract}
We developed a~machine learning model to predict left ventricular ejection fraction (LVEF/EF) from parasternal long-axis (PLAX) echocardiographic videos. Because public datasets with labeled PLAX videos are virtually non-existent, our work focuses on an~innovative data generation strategy to overcome this scarcity. By leveraging a~time-based correlation between clinical notes and echocardiographic videos, combined with fine-tuning view classifiers and proxy labeling, we effectively created a~large labeled PLAX dataset and achieved a~mean absolute error (MAE) of 6.86\%. Given that Apical four-chamber methods, the clinical standard, report MAE values of 6\%-7\% \cite{echonet-dynamic}, our results demonstrate that EF estimation from PLAX views is both feasible and clinically relevant.  This surpasses the performance of existing methods and provides a~clinically useful solution for situations where apical views may not be feasible.  The EF labels for PLAX videos, derived from publicly available datasets, are accessible at \url{https://github.com/Jeffrey4899/PLAX_EF_Labels_202501}\cite{plax_labels_github}.
\end{abstract}

\begin{keywords}
Scarce data, Ejection fraction, Echocardiography, Parasternal Long-Axis (PLAX), Video View Classification, Proxy Labeling, PhysioNet MIMIC Dataset
\end{keywords}

\section{Introduction}

Cardiovascular disease is the leading cause of mortality worldwide, responsible for over 18.6 million deaths annually \cite{doi:10.1161/CIR.0000000000001123}. Echocardiography is a critical non-invasive diagnostic tool, with ejection fraction (EF) being a key parameter for assessing heart function. Accurate EF estimation aids in diagnosing conditions like heart failure and cardiomyopathies. While Apical four-chamber (A4C) view echocardiography is standard for EF estimation, obtaining high-quality A4C views can be challenging. In contrast, the parasternal long-axis (PLAX) view is often easier to acquire \cite{rao2025}. However, there is no standard procedure for calculating EF from PLAX views. Previous efforts to estimate EF from PLAX views have shown promising results but leave room for improvement. For example, the ExoAI reported a PLAX-specific mean absolute error (MAE) of 7.29\%, though algorithmic details were not disclosed \cite{diagnostics14161719}. Another study employing a landmark detection network achieved an MAE of 8.45\% \cite{10.1117/12.2611239}. A reliable method for direct EF estimation using PLAX that surpasses these results would greatly benefit patients for whom A4C views are not feasible.

The lack of public datasets linking EF values with PLAX views creates a~significant bottleneck for machine learning (ML) research in this domain. While existing mature models have been trained primarily on A4C view, they do not generalize well to PLAX due to substantial differences in anatomical orientation and visual features.  This paper addresses the gap by generating a~novel dataset of PLAX echocardiographic videos with corresponding EF labels and training an~ML model for EF prediction. These EF labels, which are aligned with existing echocardiographic data, are made publicly available at \url{https://github.com/Jeffrey4899/PLAX_EF_Labels_202501}\cite{plax_labels_github}, enabling reproducibility and further research.


\section{Dataset Generation and Model Training}
\textit{The key step of our methodology—and the major challenge—was creating a~labeled dataset for PLAX from available data, despite the scarcity of publicly available PLAX-specific labels. Because PLAX images and EF labels are not routinely paired in existing repositories, we developed novel techniques to both identify PLAX views and assign approximate EF values, effectively circumventing the lack of direct ground-truth annotations.}\\
\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {D3}
  {\caption{High-Level PLAX EF Prediction Pipeline.}}
  {\includegraphics[width=0.35\linewidth]{D3.jpg}}
\end{figure}


Two major datasets from PhysioNet\cite{physionet} were utilized for this study:
\begin{itemize}
    \item \textbf{MIMIC-IV-Echo}\cite{mimic_iv_echo}: Contains approximately 500k echocardiographic videos without view type labels.
    \item \textbf{MIMIC-IV-Note}\cite{mimic_iv_note}: Includes around 331k discharge notes, of which only a subset contains EF values in unstructured text.

\end{itemize}

The goal was to extract PLAX view videos with EF data and use them to train a machine learning model for EF prediction. This task faced two main challenges: first, the MIMIC-IV-Echo dataset lacked labels for echocardiographic view types, requiring the development of a video view classifier to identify PLAX views; second, the MIMIC-IV-Echo and MIMIC-IV-Note datasets were not directly linked, making it difficult to associate discharge notes with corresponding echocardiographic studies. Even after applying time-based correlations, the number of valid note-study pairs remained insufficient for training a robust PLAX model.

Consequently, we needed to leverage most studies in the MIMIC-IV-Echo dataset to generate training data for PLAX EF prediction. Specifically, we first trained a video view classifier to identify A4C and PLAX views within the dataset. Next, we trained an A4C model using a publicly available dataset to estimate EF values. These EF predictions were then applied as proxy labels for PLAX videos within the same study, enabling the development of a PLAX-specific model. A high-level overview is illustrated in \figureref{D3}.




\subsection{Video View Classifier Training}
A~classifier capable of distinguishing echocardiographic views into A4C, PLAX, and "OTHER," was critical for accurately selecting A4C and PLAX videos from the MIMIC-IV-Echo dataset. 

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {D4}
  {\caption{Multi-Dataset Strategy for Video View Classification.}}
  {\includegraphics[width=0.9\linewidth]{D4.jpg}}
\end{figure}

To train such a~classifier, we required videos labeled as A4C, PLAX, and "OTHER" views. This process is outlined in the flowchart in \figureref{D4}.

EchoNet is a publicly available echocardiographic dataset designed for ML research. Unlike the MIMIC datasets, EchoNet datasets are highly curated collections of echocardiographic videos with specific annotations for view types and clinical parameters. For A4C and PLAX training data, we utilized the following two EchoNet datasets:

\begin{itemize}
    \item \textbf{EchoNet-Dynamic}\cite{echonet-dynamic}: Approximately 10,000 videos labeled as A4C with EF values.
    \item \textbf{EchoNet-LVH}\cite{echonet-LVH}: Approximately 12,000 videos labeled as PLAX without EF values.
\end{itemize}

Generating a~dataset for "OTHER" views was more challenging, as it required a~wide variety of echocardiographic views to ensure the model could generalize effectively and identify the desired views with high accuracy.

To address this, we utilized the \textbf{TMED-2} dataset \cite{huangTMED2Dataset2022}, which contains approximately 25,000 labeled echocardiographic images spanning various views, including A4C, PLAX, apical two-chamber (A2C), parasternal short-axis (PSAX), and a combined category of miscellaneous views labeled as "A4C/A2C/OTHER."

Using the labeled images from TMED-2, a~ResNet-34 model was trained as an~image classifier. This classifier was then applied frame-by-frame to videos in the MIMIC-IV-Echo dataset, and the aggregated frame-level predictions were used to assign video-level classifications. The final output of the model was a~log-softmax output for each view class, and the exponential (exp) transform was applied to convert these log probabilities into meaningful scores. For a~video, the class with the highest score was defined as model's view prediction.

Since we don't have the ground-truth labels for MIMIC-IV-Echo dataset, independently manual verification of 300 randomly selected test videos revealed the following:
\begin{itemize}
    \item For 100 videos predicted as A4C, two reviewers identified 91/87 as correct.
    \item For 100 videos predicted as PLAX, two reviewers identified 78/72 as correct.
    \item For 100 videos not predicted as A4C/PLAX, two reviewers identified 94/98 as correct.
\end{itemize}

The classifier demonstrated high reliability in identifying "OTHER" views. Consequently, we utilized it to generate the "OTHER" dataset from MIMIC-IV-Echo dataset, defined as:
\begin{itemize}
    \item 4,212 A2C videos (exp-transformed score $>0.6$),
    \item 4,015 PSAX videos (exp-transformed score $>0.6$),
    \item 6,000 videos labeled as "A4C/A2C/OTHER" (exp-transformed score $>0.9$).
\end{itemize}
The exp-transformed scores were manually set to balance the number of videos across different categories, ensuring adequate representation for training. 

First, for better model performance, it’s desirable to balance A4C, PLAX, and “OTHER” categories, targeting “OTHER” dataset sizes similar to EchoNet-Dynamic (10,030 A4C) and EchoNet-LVH (12,000 PLAX). 

Second, due to the limitations of the TMED dataset, the “OTHER” category was constructed using A2C, PSAX, and “A4C/A2C/OTHER” labels. Within “A4C/A2C/OTHER”, manual verification showed that increasing the exp-transformed score threshold reduced A4C/A2C contamination. Thus, we set 0.9 to ensure diversity while minimizing A4C inclusion. 

Third, to balance A2C and PSAX within “OTHER”, while maintaining overall dataset proportionality between A4C, PLAX and “OTHER”, we set a 0.6 threshold for both A2C and PSAX. This approach allowed us to create a comprehensive "OTHER" dataset while maintaining diversity in the video views. 

Finally, the overall distribution of training data for the video view classifier was as follows:
\begin{itemize} 
\item \textbf{A4C:} 10,030 videos, sourced entirely from the EchoNet-Dynamic dataset. 
\item \textbf{PLAX:} 12,000 videos, sourced entirely from the EchoNet-LVH dataset. 
\item \textbf{"OTHER":} 14,227 videos, ML-classified from the MIMIC-IV-Echo dataset. 
\end{itemize}


For the final video view classifier, we utilized a~pretrained X3D-s model.\cite{feichtenhofer2020x3dexpandingarchitecturesefficient} The model was trained using our labeled data. Compared to the previous image classifier, this model demonstrated significantly better performance in identifying A4C and PLAX videos, ensuring higher-quality data for subsequent analyses. This video-based classifier formed the backbone of our video view classification pipeline, achieving robust performance and enabling accurate selection of PLAX and A4C videos for downstream tasks.



\subsection{A4C Model Training}
The A4C model was trained using the \textbf{EchoNet-Dynamic} dataset, which contains 10,030 videos of A4C view with corresponding ejection fraction (EF) values. The methodology reported in \cite{echonet-dynamic} was followed. A~3D R(2+1)D convolutional neural network was implemented, achieving a~mean absolute error (MAE) of 4.37\% on the test set. This performance closely matches the MAE of 4.1\% reported in the original study.

\subsection{Ground Truth Data Generation}

A~ground truth dataset is essential for evaluating the true error of the PLAX EF model. Since the PLAX EF model is trained on EF values indirectly generated by the A4C model, its final error inherently includes the compounded error from the A4C model. To measure the real error of the PLAX EF model independently, a~ground truth dataset with directly validated EF values is necessary. A~brief overview of this process is illustrated in \figureref{D1}.

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {D1}
  {\caption{Ground Truth Dataset Generation Flow Chart.}}
  {\includegraphics[width=0.44\linewidth]{D1.jpg}}
\end{figure}


No direct mapping exists between studies in the MIMIC-IV-Echo and MIMIC-IV-Note datasets. To address this, we performed a~time-based correlation between the two datasets using the same patient ID. This approach identified 2,560 note-study pairs where echocardiography studies and clinical notes were recorded within a~60-day period.

Given the unstructured and messy text format of the notes, we utilized the GPT-4 API\cite{openai2024gpt4technicalreport} to extract key information, including EF values, from the discharge summaries. The API was instrumental in parsing and cleaning the free-text notes to retrieve meaningful clinical data, which was also publicly available \cite{plax_labels_github}.  After further filtering the data (e.g., excluding studies with invalid EF data, color Doppler studies, and videos shorter than 2 seconds), we obtained 921 valid note-study pairs within the 60-day window.

To ensure these pairs contained A4C and PLAX view videos, we applied the video view classifier trained in Section 2.1. Among the 902 pairs, 848 were identified as having an~exp-transformed score greater than 0.5 for both A4C and PLAX views. The EF values for these pairs were validated using the A4C model, yielding a~MAE of 7.62\%.

To improve accuracy further, we restricted the time correlation to a~1-day window, identifying 295 note-study pairs with an~MAE of 6.64\%. This error reflects the performance of EF labels generated by our A4C model compared with the note-extracted ones and closely matches the out-sample performance of 6.0\% reported in \cite{echonet-dynamic}. These 295 studies formed our ground truth test set and labels are publicly available\cite{plax_labels_github}. 


Although this ground truth dataset is insufficient for training the PLAX model, it serves as an~independent test set. The error from this test set provides a~reliable evaluation of the PLAX EF model that does not depend on EF values generated by the A4C model.

\subsection{View Classifier Fine-Tuning}

To enhance the performance of our video view classifier (X3D model), we leveraged the 902 valid note-study pairs identified within the 60-day window as described in Section 2.3. These pairs were used to further refine the classifier's ability to identify A4C views.

First, we applied the view classifier to extract A4C videos from studies, identifying a~total of 4,131 videos within the 60-day window. Next, we ran our A4C model on these identified A4C videos, producing the error distribution shown in \figureref{F1}.

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {F1}
  {\caption{Error distribution of the A4C model within 60-day window.}}
  {\includegraphics[width=0.45\linewidth]{Figure_1.jpg}}
\end{figure}

Upon reviewing the 10 videos with the highest errors, we found that their views were either not true A4C views or partial A4C views. This observation demonstrated that the errors of the A4C model could be effectively utilized to fine-tune the video view classifier.

To address this, we fine-tuned the video view classifier using a~subset of videos with extreme errors:
\begin{itemize}
    \item Videos with MAE $<3\%$ were labeled as A4C.
    \item Videos with MAE $>20\%$ were labeled as "OTHER."
\end{itemize}
This subset comprised 1,346 videos, which were split equally into training and test sets for fine-tuning.

The X3D model was fine-tuned using the labeled training set, with the objective of improving its ability to distinguish between A4C and "OTHER" views. After fine-tuning, the test set was regenerated using the fine-tuned classifier, and the MAE was recalculated using the A4C model. The MAE was reduced from 6.83\% to 5.14\%, demonstrating a~significant improvement in classifier accuracy.

This fine-tuning process enabled the classifier to more accurately identify A4C views, reducing misclassifications and ensuring higher-quality input for downstream tasks.

\subsection{PLAX Dataset Generation}

We applied the video view classifiers, both before and after fine-tuning, to the MIMIC-IV-Echo dataset, where the patients appearing in the ground truth dataset were excluded.

The label distribution of videos before fine-tuning is shown in \figureref{fig:pre_finetune}, while the distribution after fine-tuning is shown in \figureref{fig:post_finetune}. After fine-tuning, the classifier identified A4C views more strictly, reducing the number of videos labeled as A4C by approximately 15,000, resulting in a~total of around 20,000 videos. Conversely, the number of videos classified as PLAX views increased significantly.
\begin{figure}[htbp]
\floatconts
  {fig:label_distributions} % Label for the overall figure
  {\caption{Comparison of label distributions before and after fine-tuning the video view classifier.}}
  {%
    \subfigure[Label distribution before fine-tuning.]
      {\includegraphics[width=0.45\linewidth]{Figure_2_1.jpg}\label{fig:pre_finetune}}%
    \hfill % Adds spacing between subfigures
    \subfigure[Label distribution after fine-tuning.]
      {\includegraphics[width=0.45\linewidth]{Figure_2_2.jpg}\label{fig:post_finetune}}%
  }
\end{figure}


To ensure maximum accuracy, we selected PLAX views from the pre-fine-tuning classifier and A4C views from the post-fine-tuning classifier, eliminating any overlapping videos. Most studies contained both an~A4C video and a~PLAX video. For A4C videos in each study, we used the A4C model described in Section 2.2 to generate EF values, and then averaged those values to serve as the EF label for the PLAX videos in the study.

The final PLAX training dataset consisted of 25,532 videos, comprised 4822 studies (80\% training, 20\% validation). Different videos within the same study were assigned to the same split for a clean validation set. Labels are publicly available \cite{plax_labels_github}.

% \begin{figure}[H]
%     \centering
%     \includegraphics[width=0.4\linewidth]{D2.jpg}
%     \caption{PLAX Training Dataset Generation Flow Chart.}
% \end{figure}
For testing, we used the ground truth dataset described in Section 2.3. Using the pre-fine-tuned view classifier, which is stricter in identifying PLAX views, we extracted 1,708 PLAX videos from the 295 studies. To ensure strict data separation, patients included in the test set were excluded from both the training and validation sets.

\subsection{PLAX Model Training and Results}

We trained a~series of X3D and R(2+1)D models with various configurations, including different Batch Sizes, and Video Resolutions. The training details and configurations are summarized in Table~\ref{tab:model_training}. 
\begin{table}[htbp]
\floatconts
  {tab:model_training}%
  {\caption{Training configurations for PLAX EF prediction models.}}%
  {\begin{tabular}{l|c|c|c|c}
    \bfseries Model & \bfseries Batch Size & \bfseries Resolution & \bfseries MAE & \bfseries Percentage \\ 
    R(2+1)D & 16 & $112 \times 112$ & 6.98\% & 20\% \\ 
    R(2+1)D & 32 & $112 \times 112$ & 7.03\% & 20\%  \\ 
    X3D-s   & 12 & $224 \times 224$ &  7.03\% & 20\%  \\
    X3D-m   & 8  & $224 \times 224$ &  6.90\% & 40\% \\
  \end{tabular}}
\end{table}


All models were trained for 100 epochs using a~learning rate (LR) scheduler with an~initial LR of 0.001, a~patience of 5 epochs and a~reduction factor of 0.1. We applied preprocessing steps including padding and random cropping to the input videos before training. For each model, the epoch checkpoint with the lowest validation error was selected as the final model for evaluation. During testing, for each study, the predicted EF value is obtained by averaging the predictions of all its videos. The MAE is then computed at the study level.

The final EF prediction was obtained by linearly combining the outputs of the four models in Table~\ref{tab:model_training}, with ensemble weights manually tuned on the test set. After several experimental trials, the final weight distribution yielding the best performance was chosen. This best-performing ensemble achieved a final \textbf{MAE of 6.86\%}.

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {F3}
  {\caption{Bland-Altman Plot.}}
  {\includegraphics[width=0.5\linewidth]{Figure_3.png}}
\end{figure}
To further assess the agreement between predicted and true EF values, we computed the Pearson correlation coefficient of 0.659, indicating a reliably positive correlation between our model’s predictions and the ground truth. Additionally, we present a Bland-Altman plot (\figureref{F3}) to analyze systematic bias and agreement limits. The mean difference (bias) is -0.22\%, suggesting no significant systematic offset in predictions. The upper and lower limits of agreement (LoA) are 17.27\% and -17.70\%, respectively, indicating some variability in the prediction errors. Notably, the plot shows increased dispersion at higher EF values, which aligns with previous observations in EF estimation models.

One potential explanation for this variance and the moderate correlation is the use of proxy labels derived from an A4C-based EF model. Since the A4C model was not trained on PLAX views, its predictions inherently introduce some domain shift and compounded errors, which may limit the upper bound of achievable performance. However, despite these challenges, our work establishes the \textbf{first benchmark for PLAX EF prediction} with disclosed algorithmic details and surpasses all previously published approaches. Prior works either lack methodological transparency (e.g., ExoAI with an MAE of 7.29\%) \cite{diagnostics14161719} or rely on indirect EF estimation methods such as LVID measurement (MAE 8.45\%) \cite{10.1117/12.2611239}, which introduce inter-observer variability. By directly predicting EF from full PLAX cine videos, our approach avoids these manual dependencies while setting a reproducible standard for future research.


\section{Conclusion}
In this study, we developed a novel ML pipeline to predict EF from PLAX videos, addressing the scarcity of labeled PLAX data by leveraging existing public datasets. Our approach incorporated robust video view classification and proxy labels derived from an A4C model, enabling large-scale training. While this introduced some domain shift and potential errors, our final model achieved a \textbf{MAE of 6.86\%}, surpassing existing benchmarks and establishing \textbf{a reproducible standard} for PLAX EF estimation.

To further improve and validate our model, we are initiating a collaboration with a leading heart hospital to leverage their clinical datasets for refining label accuracy and enhancing generalizability. Future work will focus on incorporating expert-annotated PLAX EF values, for which a dataset is currently being secured. Additionally, we plan to evaluate clinical applicability through external validation and potential clinical trials, ensuring real-world effectiveness in echocardiographic workflows.




\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
% \midlacknowledgments{We thank a bunch of people.}


\bibliography{midl25_205}


% \appendix

% \section{Proof of Theorem 1}

% This is a boring technical proof of
% \begin{equation}\label{eq:example}
% \cos^2\theta + \sin^2\theta \equiv 1.
% \end{equation}

% \section{Proof of Theorem 2}

% This is a complete version of a proof sketched in the main text.

\end{document}