
\begin{table*}[t]
% \scriptsize

\centering
\resizebox{\linewidth}{!}{%
{\renewcommand{\arraystretch}{0.95}%
\begin{tabular}{ lcccccccc }
\toprule
\textbf{}
& \multicolumn{2}{c}{\textbf{}}
& \multicolumn{5}{c}{\textbf{AUROC} $\uparrow$}
& \textbf{}
\\
\cmidrule(l{3pt}r{0pt}){4-8}
\cmidrule(l{3pt}r{0pt}){9-9}
% "
\textbf{Method}
& \textbf{$f$}
& \textbf{$C$}
& \small Malignancy
& \small Calcification
& \small BI-RADS
& \small Bone Age
& \small Wrist Fracture
& Average
\\ 
 & & &  \small (FFDM) & \small  (FFDM) & \small  (FFDM) & \small (X-ray) & \small (X-ray)
\\
\midrule
\small High-Resolution & 1 & 1 & \textbf{66.1$_{\pm0.5}$} & \textbf{62.4$_{\pm0.6}$} & \textbf{63.4$_{\pm0.1}$} & \textbf{80.2$_{\pm0.1}$}  & \textbf{73.7$_{\pm0.0}$} & \textbf{69.2}\\

\midrule
\small Nearest & 16 & 1   &  65.5$_{\pm0.1}$ & 59.7$_{\pm0.3}$ & 62.4$_{\pm0.1}$ & \textcolor{blue}{81.6$_{\pm0.1}$}  & 70.5$_{\pm0.0}$ & 67.9\\
\small Bilinear & 16 & 1    &65.5$_{\pm0.1}$ & 58.1$_{\pm0.3}$ & 61.1$_{\pm0.2}$ & \textcolor{blue}{81.6$_{\pm0.0}$}  & \textbf{71.2$_{\pm0.1}$} & 67.5 \\
\small Bicubic & 16 & 1   &  65.5$_{\pm0.4}$ & 58.5$_{\pm0.5}$ & 61.1$_{\pm0.0}$ & \textcolor{blue}{81.8$_{\pm0.2}$}  & 71.1$_{\pm0.1}$ & 67.6\\
\small KL-VAE & 16 & 3   & 59.7$_{\pm0.2}$ & 59.1$_{\pm0.3}$ & 58.5$_{\pm0.1}$ & 74.3$_{\pm0.1}$ & 64.5$_{\pm0.1}$ & 63.2\\
\small VQ-GAN & 16 & 3   & 57.4$_{\pm0.3}$  & 58.2$_{\pm0.4}$ & 62.3$_{\pm0.1}$ & 79.1$_{\pm0.2}$ & 65.8$_{\pm0.1}$ & 64.6\\
\small 2D MedVAE & 16 & 1   &  63.6$_{\pm0.6}$ & \textcolor{blue}{\textbf{63.9$_{\pm0.4}$}} & \textcolor{blue}{\textbf{65.3$_{\pm0.2}$}} & \textcolor{blue}{\textbf{84.6$_{\pm0.1}$}} & 70.3$_{\pm0.1}$ & \textcolor{blue}{\textbf{69.5}}\\
\small 2D MedVAE & 16  & 3 &  \textcolor{blue}{\textbf{66.1$_{\pm0.2}$}} &   61.7$_{\pm0.2}$ & 62.3$_{\pm0.1}$ & \textcolor{blue}{82.1$_{\pm0.1}$}  &   70.6$_{\pm0.1}$ & 68.6\\
\midrule

\small Nearest & 64 & 1    & 63.0$_{\pm0.1}$ & 58.8$_{\pm0.2}$ & 60.0$_{\pm0.2}$ & 72.1$_{\pm0.0}$  & 65.1$_{\pm0.1}$ & 63.8	\\
\small Bilinear & 64 & 1   & 61.5$_{\pm0.3}$ & 56.9$_{\pm0.4}$ & \textbf{61.3$_{\pm0.1}$} & 72.8$_{\pm0.5}$  & \textbf{67.9$_{\pm0.1}$} & 64.1\\
\small Bicubic & 64 & 1   & 61.2$_{\pm0.5}$ & 57.6$_{\pm0.4}$ & 61.1$_{\pm0.1}$ & 72.8$_{\pm0.2}$  & 67.9$_{\pm0.2}$ & 64.1\\
\small KL-VAE & 64 & 4   & 62.2$_{\pm0.7}$ &  55.8$_{\pm0.4}$ & 56.8$_{\pm0.1}$ & 65.7$_{\pm0.0}$ & 58.8$_{\pm0.0}$ & 59.9\\
\small VQ-GAN & 64 & 4    & 64.5$_{\pm0.5}$ & 57.3$_{\pm0.3}$ &  56.6$_{\pm0.1}$ & 67.6$_{\pm0.1}$  & 61.6$_{\pm0.2}$ & 61.5 \\
\small 2D MedVAE & 64 & 1  & 59.0$_{\pm0.3}$ & \textbf{59.4$_{\pm0.7}$} & 60.7$_{\pm0.1}$ & \textbf{73.5$_{\pm0.2}$} & 64.3$_{\pm0.1}$ & 63.4\\
\small 2D MedVAE & 64 & 4   & \textbf{64.9$_{\pm0.2}$} &  58.5$_{\pm0.3}$ & 60.6$_{\pm0.0}$ & 73.0$_{\pm0.2}$ & 66.7$_{\pm0.1}$ & \textbf{64.7}\\

\bottomrule
\end{tabular}
}
}
\label{table:image_classification}
\vspace{-1mm}
\end{table*}


\begin{table*}[t]
% \scriptsize
\centering
\resizebox{0.8\linewidth}{!}{%
{\renewcommand{\arraystretch}{0.95}%
\begin{tabular}{ lcccccc }
\toprule
\textbf{}
& \multicolumn{2}{c}{\textbf{}}
& \multicolumn{3}{c}{\textbf{AUROC} $\uparrow$}
& \textbf{}
\\
\cmidrule(l{3pt}r{0pt}){4-6}
\cmidrule(l{3pt}r{0pt}){7-7}
\textbf{Method}
& \textbf{$f$}
& \textbf{$C$}
& \small Spine Fractures
& \small Skull Fractures
& \small Knee Injury
& Average
\\ 
 & & &  \small (CT) & \small (CT) & \small (MRI) &
\\
\midrule
\small High-Resolution & 1 & 1  & \textbf{82.9$_{\pm2.2}$} & \textbf{63.9$_{\pm6.3}$} & \textbf{69.9$_{\pm0.6}$} & \textbf{72.2}\\

\midrule
\small Bicubic & 64 & 1  &  77.3$_{\pm4.1}$ & \textcolor{blue}{64.8$_{\pm4.0}$} & 66.4$_{\pm2.3}$ & 69.5\\
\small KL-VAE & 64 & 3   & 68.8$_{\pm2.1}$ & 40.7$_{\pm9.1}$ & 63.9$_{\pm8.2}$  &  57.8\\
\small VQ-GAN & 64 & 3 & 73.2$_{\pm2.0}$  & \textcolor{blue}{75.5$_{\pm14.8}$} & 63.6$_{\pm10.5}$ &   70.8 \\
\small 3D MedVAE & 64 & 1  & \textcolor{blue}{\textbf{83.7$_{\pm2.8}$}} & \textcolor{blue}{\textbf{87.0$_{\pm7.3}$}} & \textbf{68.4$_{\pm2.4}$} & \textcolor{blue}{\textbf{79.7}}\\
\midrule
\small Bicubic & 512 & 1 & \textbf{72.3$_{\pm2.2}$} & 38.4$_{\pm24.5}$ & \textbf{59.4$_{\pm2.5}$} & 56.7\\
\small KL-VAE & 512 & 4  & 67.7$_{\pm3.9}$ & 42.6$_{\pm4.0}$ & 50.9$_{\pm5.1}$ & 53.7\\
\small VQ-GAN & 512 & 4  & 68.9$_{\pm7.0}$ & 30.6$_{\pm12.5}$ & 57.4$_{\pm5.0}$ & 52.3 \\
\small 3D MedVAE & 512 & 1  & 72.0$_{\pm3.8}$ & \textbf{49.1$_{\pm19.8}$} & 58.2$_{\pm1.7}$ & \textbf{59.8} \\
\bottomrule
\end{tabular}
}
}
\caption{\textbf{Evaluating latent representation quality with CAD tasks.} We evaluate 2D MedVAE on five 2D CAD tasks (\textbf{\textit{Top}}) and 3D MedVAE on three 3D CAD tasks (\textbf{\textit{Bottom}}). We report the mean AUROC and standard deviation across three random seeds. Methods that perfectly preserve clinically-relevant features (i.e. performance equals or exceeds performance when training with high-resolution images) are in \textcolor{blue}{\textbf{blue}}.}
\label{table:3D_latent_cls}
\vspace{-1mm}
\end{table*}




\section{Results}

To evaluate MedVAE (Fig.~\ref{fig:method}c), we assess (1) whether downsized latent representations can effectively replace high-resolution images in CAD pipelines while maintaining performance (Section~\ref{subsec:results_latent}); (2) whether latent representations can reduce storage requirements and improve downstream efficiency (Section~\ref{subsec:results_efficiency}); and (3) whether decoded reconstructions effectively preserve features necessary for radiologist interpretation (Section~\ref{subsec:results_reconstruction}). Extended results and analysis are provided in Appendix~\ref{appendix:results}.

\subsection{Latent representation quality}
\label{subsec:results_latent}





We first evaluate whether clinically-relevant features are preserved in MedVAE latent representations. To this end, we measure the extent to which latent representations can serve as drop-in replacements for high-resolution input images in CAD pipelines \textit{without} any customization or modifications to CAD model architectures. 

We evaluate latent representation quality using the following 8 CAD tasks: malignancy detection on 2D FFDMs~\cite{cai2023online}, calcification detection on 2D FFDMs~\cite{cai2023online}, BI-RADS prediction on 2D FFDMs~\cite{nguyen2022vindrmammo}, bone age prediction on 2D X-rays~\cite{rsnaboneage}, fracture detection on 2D wrist X-rays~\cite{Nagy2022wristfrac}, fracture detection on 3D spine CTs~\cite{loffler2020vertebral}, fracture classification on 3D head CTs~\cite{chilamkurthy2018development}, and anterior cruciate ligament (ACL) and meniscal tear detection on 3D sagittal knee MRIs~\cite{bien2018deep}. In order to perform each of these CAD tasks, a model must rely on fine-grained, clinically-relevant features.

For each CAD task, we train a classifier (HRNet~\cite{wang2020hrnet} in 2D settings and SEResNet~\cite{hu2018squeeze} in 3D settings) on a training set consisting of latent representations. We then measure the difference in classification performance between models trained directly on latent representations and models trained using original, high-resolution images; this serves as an indicator of latent representation quality (e.g., a small performance difference indicates that the downsizing approach preserves diagnostic features). We compute AUROC for binary tasks and macro AUROC for multi-class tasks. We train each classifier with three random seeds, and we report results as mean AUROC $\pm$ standard deviation.

We compare MedVAE with two categories of image downsizing methods: (1) interpolation methods (nearest, bilinear, and bicubic), which are the de facto gold standard for medical image downsizing as demonstrated by the quantity of prior work leveraging this approach~\cite{wantlin2023benchmd, Varma2019, convirt, Huang_2021_ICCV}, and (2) recently-introduced large-scale natural image autoencoders (KL-VAE and VQ-GAN)~\cite{rombach2022high}. Because prior work on developing large-scale 3D autoencoders has been limited, we compare our 3D MedVAE models with 2D methods by stitching 2D latent representations together across slices such that the size of the stitched 2D latent representation matches that of the representations generated by 3D models.

We provide results for 2D and 3D CAD tasks in Table~\ref{table:3D_latent_cls}. Our results demonstrate that the MedVAE training approach yields high-quality latent representations for both 2D and 3D images. At a downsizing factor of $f=16$, 2D MedVAE perfectly preserves clinically-relevant features on four out of five 2D classification tasks. Similarly, at a downsizing factor of $f=64$, 3D MedVAE perfectly preserves relevant clinical information on two out of three 3D classification tasks (spine and skull CT fracture detection). In these cases, performance equals or exceeds performance when training with original, high-resolution images. We also observe that MedVAE consistently outperforms the natural image autoencoders KL-VAE and VQ-GAN on all classification tasks, including the two musculoskeletal tasks (bone age prediction and wrist fracture detection) despite the fact that no musculoskeletal radiographs are used during MedVAE training; this suggests effective generalization capabilities. Our findings also show that 3D training of autoencoders leads to high-quality latent representations due to preservation of volumetric information (e.g., fractures spanning multiple slices), particularly at $f=64$. In summary, we demonstrate that our MedVAE training procedure yields downsized latent representations that can be used as drop-in replacements for high-resolution input images in CAD pipelines.

In Appendix Tables~\ref{table:ablations2d} and~\ref{table:ablations3d}, we provide ablations demonstrating the utility of our proposed two-stage training approach for latent representation quality.

\subsection{Storage and efficiency benefits of latent representations}
\label{subsec:results_efficiency}

Next, we evaluate the extent to which downsized MedVAE latent representations can reduce storage requirements and improve downstream efficiency of CAD pipelines. Using a 2D high-resolution network and 3D squeeze-excitation network as our base CAD architectures, we report latency, throughput, and maximum batch size. Latency is the time (in milliseconds) to perform a forward pass of the network on one batch. Throughput is the number of samples that can be evaluated by the network in one second. Finally, we report the maximum batch size (in powers of 2) for a forward pass that will fit on a single A100 GPU (2D) and an A6000 GPU (3D). We assume a high-resolution image size of $1024 \times 1024$ with 1 channel for 2D settings and a volume size of $256 \times 256 \times 256$ with 1 channel for 3D settings.

% ******* Figure ********
\begin{figure}[h]
\centering
\includegraphics[width=\textwidth, trim=0 0 0 0]{figures/efficiency.pdf}
\caption{\textbf{CAD model efficiency.} We compare the efficiency of CAD models trained with downsized latent representations to CAD models trained with high-resolution images.}
\label{fig:efficiency}
\end{figure}

Results are provided in Figure~\ref{fig:efficiency}. We demonstrate that training CAD models directly on downsized latent representations can lead to large improvements in model efficiency. In the 2D setting, we observe that as the downsizing factor increases to $f=64$, latency decreases by 69$\times$, throughput increases by 70$\times$, and the maximum batch size increases by 32$\times$. In the 3D setting, as the downsizing factor increases to $f=512$, latency decreases by 62$\times$, throughput increases by 55$\times$, and the maximum batch size increases by 512$\times$. Storage costs decrease proportionally with the downsizing factor (i.e., 64$\times$ for 2D and 512$\times$ for 3D).

\subsection{Reconstructed image quality}
\label{subsec:results_reconstruction}


We evaluate whether clinically-relevant features are preserved in reconstructed images using both automated and manual perceptual quality evaluations. These evaluations quantify the extent to which the encoding and subsequent decoding processes retain relevant features.

For automated evaluations, we use perceptual metrics to compare reconstructed images with the original inputs. We report peak signal-to-noise ratio (PSNR) and the multi-scale structural similarity index measure (MS-SSIM). For 2D evaluations, we measure perceptual quality on X-rays~\cite{feng2021candid,johnson2019mimic}; FFDMs~\cite{jeong2022emory,sorkhei2021csaw,rsnamammo,nguyen2022vindrmammo,moreira2012inbreast,cai2023online}; and musculoskeletal X-rays~\cite{Nagy2022wristfrac}. For 3D evaluations, we compute metrics on brain MRIs~\cite{jack2008alzheimer,dagley2017harvard,insel2020a4,lamontagne2019oasis}; head CTs~\cite{chilamkurthy2018development}; abdomen CTs~\cite{ji2022amos}; CTs from a wide range of anatomies~\cite{wasserthal2023totalsegmentator}; lung CTs~\cite{armato2011lung}; and knee MRIs~\cite{bien2018deep}. Results are in Table~\ref{table:perceptualid} and Appendix Tables~\ref{table:perceptualidextended} and~\ref{table:3dperceptualextended}.


\begin{table*}[t]
\centering
\resizebox{\linewidth}{!}{%
\begin{tabular}{lcccccc|cccc}
\toprule
\textbf{Method}
& \textbf{$f$}
& \textbf{$C$}
& \multicolumn{2}{c}{\textbf{FFDMs (2D)}}
& \multicolumn{2}{c}{\textbf{MSK X-rays (2D)}}
& \multicolumn{2}{c}{\textbf{Brain MRIs (3D)}}
& \multicolumn{2}{c}{\textbf{Abdomen CTs (3D)}}
\\
\cmidrule(l{3pt}r{0pt}){4-5}
\cmidrule(l{3pt}r{0pt}){6-7}
\cmidrule(l{3pt}r{0pt}){8-9}
\cmidrule(l{3pt}r{0pt}){10-11}
\textbf{}
& 
\textbf{}
&
\textbf{}
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
\\ 
\midrule
\small Bicubic & 16 & 1  & 31.69  & 0.961  & 30.18  & 0.974 & 29.27 & 0.975 & 33.81 & 0.989\\ 
\small KL-VAE & 16 & 3  & 36.11  & 0.989 & 38.29  & 0.992 & 33.23 & \textbf{0.994} & 43.51 & 0.998\\
\small VQ-GAN & 16 & 3  &   35.55 & 0.986 & 36.41  & 0.990 & 32.72 & 0.992 & 40.85 & 0.997\\
\small 2D MedVAE & 16 & 1  & 32.34 &  0.969 & 33.97 & 0.973 & 29.48 & 0.980 & 33.45 & 0.983\\
\small 2D MedVAE & 16 & 3  & \textbf{37.57} &  \textbf{0.993} & \textbf{39.41} & \textbf{0.994} & \textbf{33.99} & \textbf{0.994} & \textbf{44.95} & \textbf{0.999}\\
\small 3D MedVAE & 64 & 1  & -- & -- & -- & -- & 29.52 & 0.983 & 36.61 & 0.993\\
\bottomrule
\end{tabular}
}
\caption{\textbf{Evaluating reconstruction quality.} We evaluate reconstruction quality using perceptual metrics. Here, $f$ represents the downsizing factor applied to the 2D area or 3D volume of the input image and $C$ represents the number of latent channels.}
\label{table:perceptualid}
\end{table*}




We find that 2D MedVAE achieves the highest perceptual quality across all evaluated image types. In particular, our evaluations with musculoskeletal X-rays, brain MRIs, and abdomen CTs explore generalization of 2D MedVAE to unseen anatomical features; notably, 2D MedVAE achieves the highest scores on these tasks, despite the fact that 2D MedVAE was not trained on musculoskeletal X-rays, MRI, or CT slices. We also note a general trend that increasing the number of latent channels $C$ improves perceptual quality of the reconstructed image. We also observe that 3D MedVAE achieves competitive performance, despite utilizing a significantly higher downsizing factor than comparable 2D methods (i.e., downsizing across all three dimensions rather than just two).

We supplement our automated evaluations of reconstructed image quality with a manual reader study. Three radiologists are each presented with 50 pairs of chest X-rays containing fractures~\cite{feng2021candid}. Each pair consists of an original high-resolution image on the left and a reconstructed image on the right. The reconstructed images are scored on a 5-point Likert scale ranging from $-2$ to $2$ based on three main criteria: image fidelity, preservation of diagnostic features, and the presence of artifacts. Readers rated image fidelity for 2D MedVAE to be 2.8 points higher than for bicubic interpolation, averaged across the two downsizing factors (Figure~\ref{fig:readerstudy}). 2D MedVAE also better preserved clinically-relevant features (2.8 points). Artifacts (e.g., blurring, hallucinations) were more frequent in interpolated images (2.6 points), which severely suffered from blurring artifacts. In summary, our reader study suggests that 2D MedVAE better preserves diagnostic features than interpolation. In Figure~\ref{fig:qualitative}, we provide qualitative examples of a reconstructed chest X-ray and a reconstructed T1-weighted brain MRI slice.

% ******* Figure ********
\begin{figure}[h]
\centering
\includegraphics[width=0.9\textwidth, trim=0 0 0 0]{figures/readerstudy.pdf}
\caption{\textbf{Reader evaluations.} We report scores from three expert readers on fidelity, preservation of relevant features, and artifacts. Bars represent 95\% confidence intervals.}
\label{fig:readerstudy}
\vspace{-3mm}
\end{figure}