\section{Extended Results}
\label{appendix:results}

\subsection{Evaluating latent representations}


\begin{table*}[t]
% \scriptsize
\begin{center}
{\renewcommand{\arraystretch}{1.2}%
{
\resizebox{\linewidth}{!}{
\begin{tabular}{lcccccc}
\toprule
\textbf{Classification Task}
& \textbf{Dimensionality}
& \textbf{Classes}
& \textbf{Dataset}
& \textbf{Modality}
& \textbf{Anatomy}
& \textbf{Num. Images}
\\ 
\midrule
Malignancy Detection & 2D & 2 & CMMD & FFDM & Breast & 3744\\
Calcification Detection & 2D & 2 & CMMD & FFDM & Breast & 5202\\
BI-RADS Classification & 2D & 5 & VinDR-Mammo & FFDM & Breast & 20,000 \\
Bone Age Prediction & 2D & 20 & RSNA Bone Age & X-Ray & Hand & 14,036 \\
Wrist Fracture Detection & 2D & 2 &  GRAZPEDWRI-DX  & X-Ray & Wrist & 14,113 \\
Spine Fracture Detection & 3D & 2 &  VerSe & CT & Spine & 160 \\
Head Fracture Detection & 3D & 2 &  CQ500  & CT & Head & 378 \\
ACL/Meniscal Tear Detection & 3D & 2 &  MRNet & MRI & Knee & 1250  \\
\bottomrule
\end{tabular}
}
}
}
\end{center}
\caption{\textbf{Summary of CAD tasks used for evaluating latent representation quality}. We report the task name, image dimensionality, number of classes associated with the task, the dataset name, imaging modality, anatomy, and the number of images after preprocessing \cite{cai2023online,nguyen2022vindrmammo,rsnaboneage,Nagy2022wristfrac,loffler2020vertebral,chilamkurthy2018development,bien2018deep}.}
\label{table:clssummary}
\end{table*}

We evaluate the quality of latent representations $z$ with a set of eight clinically-relevant CAD tasks, which directly evaluate the preservation of clinically-relevant features in 2D and 3D images (Table~\ref{table:clssummary}). For each CAD task, we measure the difference in classification performance between models trained using latent representations and those trained using original, high-resolution images; this serves as an indicator of latent quality by directly measuring the retention of important diagnostic features. These evaluations also provide insights into potential performance gains afforded by training downstream models directly on Med-VAE latent representations rather than high-resolution images.

Below, we provide implementation details for each 2D CAD task.
\begin{enumerate}
    \item  \textbf{Malignancy Detection:} We evaluate the quality of FFDM latent representations on a binary malignancy detection task, which involves predicting the presence or absence of a malignancy. We use images from the Chinese Mammography Dataset (CMMD), which includes a total of 5202 deidentified FFDMs from 1775 patients \cite{cai2023online, cmmddata}. CMMD includes labels indicating the presence of masses and calcifications as well as biopsy-confirmed labels indicating benign and malignant findings. We assigned 80\% of patients to the training set (1420 patients with 2982 images) and the remaining 20\% to the test set (355 patients with 762 images). The average size of an FFDM after preprocessing was $1999.2 \times 793.9 \times 1$. In order to maintain consistent sizing, we downsized each FFDM to $1024 \times 512 \times 1$ using bicubic interpolation. 
    \item \textbf{Calcification Detection:} We evaluate the quality of FFDM latent representations on a binary calcification detection task, which involves identifying the presence or absence of breast calcifications. We use the CMMD dataset, described in detail above \cite{cmmddata, cai2023online}. We preprocessed the CMMD dataset by assigning 80\% of patients to the training set (1420 patients with 4156 images) and 20\% of patients to the test set (355 patients with 1046 images). 
    \item \textbf{BI-RADS Classification:} We evaluate the quality of FFDM latent representations on Breast Imaging Reporting and Data System (BI-RADS) classification. We use images from the VinDR-Mammo dataset, which includes a total of 20,000 deidentified FFDMs from 5000 studies collected from Hanoi Medical University Hospital and Hospital 108 in Vietnam \cite{nguyen2022vindrmammo}. BI-RADS scores evaluate the likelihood of cancer on an integer scale from 0 to 6~\cite{nguyen2022vindrmammo}. We use the provided data splits for VinDR-Mammo, which assign 16,000 images to the training set and 4000 images to the test set. There are no images with BI-RADS scores of 0 or 6. The average size of an FFDM after preprocessing was $2607.3 \times 948.6 \times 1$. In order to maintain consistent sizing across the dataset, we downsized each FFDM to $1024 \times 512 \times 1$. 
    \item \textbf{Bone Age Prediction:} We evaluate the quality of musculoskeletal X-ray latent representations on a bone age prediction task. We use images from the RSNA Bone Age dataset, which includes 14,036 hand radiographs collected from Children’s Hospital Colorado and Lucile Packard Children’s Hospital at Stanford University \cite{rsnaboneage}. We use the provided data splits for the RSNA Bone Age dataset, which assign 12,611 images to the training set and 1425 images to the test set.  The average size of a musculoskeletal X-ray after preprocessing was $1665.4 \times 1319.8 \times 1$. In order to maintain consistent sizing across the dataset, we downsized each X-ray to $1024 \times 1024 \times 1$.
    \item \textbf{Pediatric Wrist Fracture Detection:} We evaluate the quality of musculoskeletal X-ray latent representations on a binary wrist fracture detection task. We use images from the GRAZPEDWRI-DX dataset, which includes a total of 20,327 deidentified images from 6,091 patients collected at University Hospital Graz in Austria \cite{Nagy2022wristfrac}. We preprocessed the GRAZPEDWRI-DX dataset by first using provided labels to remove all samples with metal hardware and casts, which may exhibit spurious correlations with the target labels. We then assigned 75\% of patients to the training set (4281 patients with 10,511 images) and the remaining 25\% to the test set (1428 patients with 3602 images). The average size of a musculoskeletal X-ray after preprocessing was $987.8 \times 537.7 \times 1$. In order to maintain consistent sizing across the dataset, we resized each X-ray to $1024 \times 512 \times 1$.
\end{enumerate}

We perform each 2D CAD task listed above using a pretrained HRNet\_w64 neural network implemented in the \texttt{timm} Python package~\cite{wang2020hrnet,timm}. HRNets are a type of convolutional neural network adapted for classification of high-resolution images. We preprocess latent representations by applying the mean operation across the channel dimension if more than one channel is present. We train the HRNet on 2 A100 GPUs using supervised linear probing with one output class. We train for 100 epochs using a batch size of 256, an AdamW optimizer \cite{adamw} with an initial learning rate of 1e-4, and cross-entropy loss. Classification performance is measured on the test set using the final model checkpoint. We report AUROC for binary classification tasks and Macro AUROC for multi-class classification tasks. 

Below, we provide implementation details for each 3D CAD task.
\begin{enumerate}
    \item \textbf{Spine Fracture Detection:} We evaluate the quality of Spine CT latent representations on a binary spine fracture detection task. We use images from the VerSe 2019 dataset \cite{loffler2020vertebral}, which includes 160 high-resolution spine CT images (1-mm isotropic, or sagittal 2-mm to 3-mm series with 1-mm in-plane resolution). The training, validation, and testing split (50/25/25) was maintained from the original dataset. The final size of a volume after preprocessing was $224 \times 224 \times 160$. 
    \item \textbf{Head Fracture Detection:} We evaluate the quality of head CT latent representations on a binary head fracture detection task. We use images from the CQ500 dataset \cite{chilamkurthy2018development}, which includes 378 head CT images. This dataset was curated by the Centre for Advanced Research in Imaging, Neurosciences, and Genomics (CARING) in New Delhi, India. Images were divided into training and testing sets following an 80/20 split. The final size of a volume after preprocessing was $224 \times 224 \times 44$. 
    \item \textbf{ACL and Meniscal Tear Detection:} We evaluate the quality of knee MRI latent representations on a binary ACL or meniscal tear detection task. We use images from the MRNet dataset \cite{bien2018deep}, which includes 1250 sagittal knee MRI scans performed at Stanford University Medical Center between 2001 and 2012. A positive label in this context may indicate the presence of an ACL tear, a meniscal tear, or both simultaneously. The dataset was split into a training and test set (95/5). The final size of a volume after preprocessing was $56 \times 256 \times 256$.
\end{enumerate}

We perform each 3D CAD task listed above using the MONAI SEResNet-152 \cite{hu2018squeeze} architecture. We implemented a weighted sampling strategy for the head fracture detection and ACL and meniscal tear detection tasks due to class imbalance. We trained the SEResNet-152 on an A6000 GPU using supervised linear probing with 1 output class. We trained for 100 epochs with a batch size of 20 for latents, a batch size of 10 for the original images, an AdamW optimizer \cite{adamw} with an initial learning rate of 1e-4, and binary cross-entropy loss. Classification performance (AUROC) is measured on the test set using the final model checkpoint.

For latent representation evaluations, we report classification performance using AUROC, calculated using the \texttt{torchmetrics} library. We report mean and standard deviations across three runs with different random seeds. 




\begin{table*}[h]
% \scriptsize
\centering
{%
\begin{tabular}{ lcccccc }
\toprule
\textbf{}
& \multicolumn{2}{c}{\textbf{}}
& \multicolumn{3}{c}{\textbf{AUROC} $\uparrow$}
& \textbf{}
\\
\cmidrule(l{3pt}r{0pt}){4-6}
\cmidrule(l{3pt}r{0pt}){7-7}
\textbf{Method}
& \textbf{$f$}
& \textbf{$C$}
& \small Spine Fractures
& \small Skull Fractures
& \small Knee Injury
& Average
\\ 
\midrule
\small High-Resolution & 1 & 1  & \textbf{82.9$_{\pm2.2}$} & \textbf{63.9$_{\pm6.3}$} & \textbf{69.9$_{\pm0.6}$} & \textbf{72.2}\\

\midrule
\small 2D Med-VAE  & 64 & 1  & 80.5$_{\pm4.9}$ & 57.4$_{\pm4.0}$ & 67.3$_{\pm3.6}$ &  68.4\\
\small 2D Med-VAE & 64 & 3  & 78.6$_{\pm0.8}$ & 50.9$_{\pm19.5}$ & 60.9$_{\pm4.2}$ &  63.5\\
\small 3D Med-VAE & 64 & 1  & \textcolor{blue}{\textbf{83.7$_{\pm2.8}$}} & \textcolor{blue}{\textbf{87.0$_{\pm7.3}$}} & \textbf{68.4$_{\pm2.4}$} & \textbf{79.7}\\
\midrule
\small 2D Med-VAE & 512 & 1  & 65.9$_{\pm8.7}$ & \textbf{63.0$_{\pm1.1}$} & 55.9$_{\pm8.3}$ & \textbf{61.6} \\
\small 2D Med-VAE  & 512 & 4  & \textbf{81.9$_{\pm1.2}$} & 17.1$_{\pm8.6}$ & 52.6$_{\pm1.9}$ & 50.5 \\
\small 3D Med-VAE & 512 & 1  & 72.0$_{\pm3.8}$ & 49.1$_{\pm19.8}$ & \textbf{58.2$_{\pm1.7}$} & 59.8 \\
\bottomrule
\end{tabular}
}
\caption{\textbf{Comparing 2D Med-VAE and 3D Med-VAE on 3D CAD tasks.} We compare 3D Med-VAE with 2D Med-VAE models. For 2D Med-VAE, we stitch 2D latent representations together across slices such that the size of the 2D latent representation matches that of the representations generated by the 3D model. Here, $f$ represents the downsizing factor applied to the 3D volume of the input image and $C$ represents the number of latent channels. The best performing models on each task are bolded. We highlight methods that perfectly preserve clinically-relevant features in \textcolor{blue}{blue}.}
\label{table:2d3dcad}
\vspace{-1mm}
\end{table*}

In Table~\ref{table:2d3dcad}, we compare the performance of 2D Med-VAE and 3D Med-VAE on 3D CAD tasks. These findings demonstrate that 3D training of autoencoders leads to high-quality latent representations due to preservation of volumetric information (e.g., fractures spanning multiple slices), particularly at $f = 64$.

\subsection{Evaluating reconstructed images}
We evaluate the quality of reconstructions $\hat{x}$ using both automated and manual perceptual quality evaluations. Perceptual quality assessments measure information loss resulting from the autoencoding process by comparing the original image to the reconstructed (decoded) image. These evaluations quantify the extent to which the encoding and subsequent decoding process retains relevant features.

For 2D images, we evaluate full-image perceptual quality on chest X-rays, FFDMs, and musculoskeletal X-rays; we also evaluate fine-grained perceptual quality on musculoskeletal X-rays. Chest X-rays are obtained from CANDID-PTX \cite{feng2021candid} and MIMIC-CXR \cite{johnson2019mimic}; FFDMs are obtained from RSNA Mammography \cite{rsnamammo}, VinDR-Mammo \cite{nguyen2022vindrmammo}, CSAW-CC \cite{sorkhei2021csaw}, EMBED \cite{jeong2022emory}, CMMD \cite{cai2023online}, and INBreast \cite{moreira2012inbreast}; musculoskeletal X-rays are obtained from GRAZPEDWRI-DX \cite{Nagy2022wristfrac}. We compute two standard perceptual quality metrics: PSNR and MS-SSIM. For 2D fine-grained perceptual quality evaluations, we extract 7677 images containing fractures from GRAZPEDWRI-DX, and we use bounding boxes provided by the authors to isolate the region of the fracture \cite{Nagy2022wristfrac}. We then compute PSNR scores on these regions.

For 3D volumes, we evaluate full-volume perceptual quality on head MRIs, head CTs, abdomen CTs, whole-body CTs, lung CTs, and knee MRIs. Head MRIs are obtained from Alzheimer's Disease Neuroimaging Initiative (ADNI) \cite{jack2008alzheimer}, Harvard Aging Brain Study (HABS) \cite{dagley2017harvard}, A4 dataset \cite{insel2020a4}, and Open Access Series of Imaging Studies (OASIS) brain dataset \cite{lamontagne2019oasis}; head CTs are obtained from CQ500 \cite{chilamkurthy2018development}; whole-body CTs are obtained from TotalSegmentator dataset \cite{wasserthal2023totalsegmentator}; abdomen CTs are obtained from the Abdominal Multi-Organ Segmentation (AMOS) dataset \cite{ji2022amos}; lung CTs are obtained from LIDC-IDRI \cite{armato2011lung}; and knee MRIs are obtained from MRNet \cite{bien2018deep}. For each volume, a center crop of volume dimensions $160 \times 160 \times 160$ was extracted. For the AMOS and CQ500 datasets, the crop region was expanded to dimensions $320 \times 320 \times 160$ to include both soft-tissue and bony features. We compute two standard perceptual quality metrics: PSNR and MS-SSIM. 



\begin{table*}[t]
% \scriptsize
\centering
\resizebox{\linewidth}{!}{
\begin{tabular}{lccccccccc}
\toprule
\textbf{Method}
& \textbf{$f$}
& \textbf{$C$}
& \multicolumn{2}{c}{\textbf{Mammograms}}
& \multicolumn{2}{c}{\textbf{Chest X-rays}}
& \multicolumn{2}{c}{\textbf{Musculoskeletal X-rays}}
& \multicolumn{1}{c}{\textbf{Wrist X-rays (FG)}}
\\
\cmidrule(l{3pt}r{0pt}){4-5}
\cmidrule(l{3pt}r{0pt}){6-7}
\cmidrule(l{3pt}r{0pt}){8-9}
\cmidrule(l{3pt}r{0pt}){10-10}
% "
& 
&
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
\\ 
\midrule
\small Nearest & 16 & 1    &  25.95$_{\pm0.06}$  & 0.846$_{\pm0.00}$  & 29.87$_{\pm0.04}$ & 0.942$_{\pm0.00}$   &  24.06$_{\pm0.02}$  & 0.890$_{\pm0.00}$  & 26.11$_{\pm0.02}$\\
\small Bilinear & 16 & 1    & 30.18$_{\pm0.07}$  & 0.936$_{\pm0.00}$ & 34.23$_{\pm0.03}$ & 0.981$_{\pm0.00}$ & 28.75$_{\pm0.02}$  & 0.959$_{\pm0.00}$ & 30.92$_{\pm0.03}$ \\
\small Bicubic & 16 & 1    & 31.69$_{\pm0.07}$  & 0.961$_{\pm0.00}$ & 35.48$_{\pm0.03}$ & 0.989$_{\pm0.00}$ & 30.18$_{\pm0.02}$  & 0.974$_{\pm0.00}$ & 32.65$_{\pm0.04}$  \\
\small KL-VAE & 16 & 3  & 36.11$_{\pm0.07}$  & 0.989$_{\pm0.00}$ & 41.45$_{\pm0.04}$ & 0.996$_{\pm0.00}$ & 38.29$_{\pm0.03}$  &0.992$_{\pm0.00}$  & 36.55$_{\pm0.03}$\\
\small VQ-GAN & 16 & 3  &   35.55$_{\pm0.07}$  & 0.986$_{\pm0.00}$ & 37.80$_{\pm0.03}$ & 0.995$_{\pm0.00}$ & 36.41$_{\pm0.02}$  & 0.990$_{\pm0.00}$  & 34.19$_{\pm0.04}$\\
\small 2D Med-VAE  & 16 & 1  & 32.34$_{\pm0.07}$ &  0.969$_{\pm0.00}$ & 38.44$_{\pm0.02}$ & 0.990$_{\pm0.00}$ & 33.97$_{\pm0.03}$ & 0.973$_{\pm0.00}$  &  31.97$_{\pm0.03}$ \\
\small 2D Med-VAE  & 16 & 3  & \textbf{37.57}$_{\pm0.08}$ &  \textbf{0.993}$_{\pm0.00}$ & \textbf{43.55}$_{\pm0.02}$ & \textbf{0.997}$_{\pm0.00}$ & \textbf{39.41}$_{\pm0.04}$ & \textbf{0.994}$_{\pm0.00}$  &  \textbf{37.61}$_{\pm0.02}$  \\
\midrule

\small Nearest & 64 & 1   & 22.46$_{\pm0.05}$  & 0.669$_{\pm0.00}$ & 26.22$_{\pm0.03}$ & 0.858$_{\pm0.00}$ & 19.93$_{\pm0.02}$  & 0.756$_{\pm0.00}$  & 22.14$_{\pm0.04}$  \\
\small Bilinear & 64 & 1    & 26.81$_{\pm0.06}$  & 0.837$_{\pm0.00}$ & 31.18$_{\pm0.03}$ & 0.949$_{\pm0.00}$ & 24.89$_{\pm0.01}$  & 0.898$_{\pm0.00}$ & 27.12$_{\pm0.03}$  \\
\small Bicubic & 64 & 1    & 27.84$_{\pm0.06}$  & 0.874$_{\pm0.00}$ & 32.09$_{\pm0.03}$ & 0.962$_{\pm0.00}$  & 25.92$_{\pm0.01}$  & 0.922$_{\pm0.00}$  & 28.54$_{\pm0.03}$ \\
\small KL-VAE & 64 & 4    & 31.88$_{\pm0.07}$  & 0.959$_{\pm0.00}$ &36.37$_{\pm0.01}$ &0.987$_{\pm0.00}$ &33.49$_{\pm0.02}$ &0.966$_{\pm0.00}$ & 31.04$_{\pm0.03}$ \\
\small VQ-GAN & 64 & 4   & 30.13$_{\pm0.06}$  & 0.938$_{\pm0.00}$ & 34.87$_{\pm0.02}$ & 0.980$_{\pm0.00}$ & 32.00$_{\pm0.02}$  & 0.953$_{\pm0.0}$  & 29.92$_{\pm0.02}$ \\
\small 2D Med-VAE  & 64 & 1   & 28.00$_{\pm0.07}$ & 0.872$_{\pm0.00}$ & 31.92$_{\pm0.04}$ &  0.962$_{\pm0.00}$ & 28.27$_{\pm0.02}$&  0.917$_{\pm0.00}$  & 28.03$_{\pm0.01}$\\
\small 2D Med-VAE  & 64 & 4  & \textbf{33.13}$_{\pm0.07}$  & \textbf{0.969}$_{\pm0.00}$ & \textbf{38.88}$_{\pm0.03}$ &  \textbf{0.990}$_{\pm0.00}$ & \textbf{34.73}$_{\pm0.02}$&  \textbf{0.972}$_{\pm0.00}$& \textbf{32.30}$_{\pm0.02}$\\
\bottomrule
\end{tabular}
}
\caption{\textbf{Evaluating reconstruction quality on 2D datasets.} We evaluate 2D Med-VAE with perceptual quality metrics on mammograms and chest X-rays, which we classify as \textit{in-distribution}, since the Med-VAE training set includes mammograms and chest X-rays. We also evaluate Med-VAE on musculoskeletal X-rays and wrist X-rays (fine-grained), which we classify as \textit{out-of-distribution}. Here, $f$ represents the downsizing factor applied to the 2D area of the input image and $C$ represents the number of latent channels. The best performing models are bolded. We calculate PSNR and MS-SSIM using a random sample of 1000 images for each image type; we report mean and standard deviations across four runs with different random seeds.}
\label{table:perceptualidextended}
\end{table*}




\begin{table*}[t]
% \scriptsize
\centering
%\renewcommand{\arraystretch}{1.2}%
\resizebox{\linewidth}{!}{
\begin{tabular}{lcccccccccccccc}
\toprule
\textbf{Method}
& \textbf{$f$}
& \textbf{$C$}
& \multicolumn{2}{c}{\textbf{Brain MRIs}}
& \multicolumn{2}{c}{\textbf{Head CTs}}
& \multicolumn{2}{c}{\textbf{Abdomen CTs}}
& \multicolumn{2}{c}{\textbf{TS CTs}}
& \multicolumn{2}{c}{\textbf{Lung CTs}}
& \multicolumn{2}{c}{\textbf{Knee MRIs}}
\\
\cmidrule(l{3pt}r{0pt}){4-5}
\cmidrule(l{3pt}r{0pt}){6-7}
\cmidrule(l{3pt}r{0pt}){8-9}
\cmidrule(l{3pt}r{0pt}){10-11}
\cmidrule(l{3pt}r{0pt}){12-13}
\cmidrule(l{3pt}r{0pt}){14-15}
% "
& 
&
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
\\ 
\midrule
\small Bicubic & 16 & 1 & 29.27 & 0.975 & 36.21 & 0.996 & 33.81 & 0.989 & 27.33 & 0.972 & 28.00 & 0.973 & 26.37 & 0.986 \\
\small KL-VAE & 16 & 3 & 33.23 & {\textbf{0.994}} & 47.65 & {\textbf{1.000}} & 43.51 & 0.998 & 34.14 & 0.994 & 32.62 & {\textbf{0.989}} & 31.31 & {\textbf{0.998}} \\
\small VQ-GAN & 16 & 3 & 32.72 & 0.992 & 42.87 & 0.999 & 40.85 & 0.997 & 33.55 & 0.993 & 32.20 & {\textbf{0.989}} & 30.75 & 0.997 \\
\small 2D Med-VAE  & 16 & 1 & 29.48 & 0.980 & 39.71 & 0.997 & 33.45 & 0.983 & 29.70 & 0.983 & 28.40 & 0.973 & 27.38 & 0.990 \\
\small 2D Med-VAE  & 16 & 3 & {\textbf{33.99}} & {\textbf{0.994}} & {\textbf{48.56}} & {\textbf{1.000}} & {\textbf{44.95}} & {\textbf{0.999}} & {\textbf{34.83}} & {\textbf{0.995}} & {\textbf{33.34}} & {\textbf{0.989}} & {\textbf{31.52}} & 0.997 \\
\small 3D Med-VAE  & 64 & 1 & 29.52 & 0.983 & 39.03 & 0.999 & 36.61 & 0.993 & 31.35 & 0.987 & 28.79 & 0.975 & 28.25 & 0.994 \\
\midrule
\small Bicubic & 64 & 1 & 26.25 & 0.911 & 30.11 & 0.980 & 28.84 & 0.955  & 24.24 & 0.914 & 24.40 & 0.928 & 24.11 & 0.956  \\
\small KL-VAE & 64 & 3 & 29.32 & {\textbf{0.977}} & 40.95 & 0.997 & 38.07 & {\textbf{0.995}} & 29.85 & 0.982 & 28.83 & 0.974 & 27.68 & {\textbf{0.993}} \\
\small VQ-GAN & 64 & 3 & 27.43 & 0.967 & 39.02 & 0.997 & 36.25 & 0.991 & 27.47 & 0.972 & 26.66 & 0.964 & 25.95 & 0.990 \\
\small 2D Med-VAE  & 64 & 1 & 25.66 & 0.920 & 33.10 & 0.988 & 29.51 & 0.967  & 24.50 & 0.922 & 24.39 & 0.933 & 24.48 & 0.973 \\
\small 2D Med-VAE  & 64 & 3 & {\textbf{29.34}} & 0.976 & {\textbf{41.98}} & {\textbf{0.999}} & {\textbf{39.49}} & {\textbf{0.995}} & {\textbf{30.35}} & {\textbf{0.984}} & {\textbf{29.59}} & {\textbf{0.977}} & {\textbf{28.05}} & {\textbf{0.993}} \\
\small 3D Med-VAE  & 512 & 1 & 26.23 & 0.937 & 30.85 & 0.991 & 29.47 & 0.960 & 26.34 & 0.949 & 24.76 & 0.934 & 24.36 & 0.977 \\

\bottomrule
\end{tabular}
}
\caption{\textbf{Evaluating reconstruction quality on 3D datasets.} We evaluate 3D Med-VAE with perceptual quality metrics on head MRIs, head CTs, abdomen CTs, various high-resolution CTs (TS), lung CTs, and knee MRIs. $f$ represents the downsizing factor applied to the input volume and $C$ represents the number of latent channels. The best performing models are bolded. We compare 3D Med-VAE with several 2D methods, including 2D Med-VAE, KL-VAE, and VQ-GAN.}
\label{table:3dperceptualextended}
\end{table*}




\begin{table*}[t]
% \scriptsize
\centering
%\renewcommand{\arraystretch}{1.2}%
\resizebox{\linewidth}{!}{
\begin{tabular}{lcccccccccccccc}
\toprule
\textbf{Method}
& \textbf{$f$}
& \textbf{$C$}
& \multicolumn{2}{c}{\textbf{Brain MRIs}}
& \multicolumn{2}{c}{\textbf{Head CTs}}
& \multicolumn{2}{c}{\textbf{Abdomen CTs}}
& \multicolumn{2}{c}{\textbf{TS CTs}}
& \multicolumn{2}{c}{\textbf{Lung CTs}}
& \multicolumn{2}{c}{\textbf{Knee MRIs}}
\\
\cmidrule(l{3pt}r{0pt}){4-5}
\cmidrule(l{3pt}r{0pt}){6-7}
\cmidrule(l{3pt}r{0pt}){8-9}
\cmidrule(l{3pt}r{0pt}){10-11}
\cmidrule(l{3pt}r{0pt}){12-13}
\cmidrule(l{3pt}r{0pt}){14-15}
% "
& 
&
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
& \small PSNR $\uparrow$
& \small MS-SSIM $\uparrow$
\\ 
\midrule
\small 2D Med-VAE-Decoder & 64 & 1  & 28.88 & 0.978 & 35.01 & 0.997 & 31.47 & 0.983 & 29.96 & 0.981 & 27.54 & 0.965 & 27.04 & 0.992 \\
\small 3D Med-VAE  & 64 & 1 & \textbf{29.52} & \textbf{0.983} & \textbf{39.03} & \textbf{0.999} & \textbf{36.61} & \textbf{0.993} & \textbf{31.35} & \textbf{0.987} & \textbf{28.79} & \textbf{0.975} & \textbf{28.25} & \textbf{0.994} \\
\midrule
\small 2D Med-VAE-Decoder & 512 & 1 & 25.85 & 0.927 & 18.65 & 0.824 & 20.47 & 0.699 & 25.26 & 0.929 & 23.33 & 0.909 & 23.92 & 0.969 \\
\small 3D Med-VAE  & 512 & 1 & \textbf{26.23} & \textbf{0.937} & \textbf{30.85} & \textbf{0.991} & \textbf{29.47} & \textbf{0.960} & \textbf{26.34} & \textbf{0.949} & \textbf{24.76} & \textbf{0.934} & \textbf{24.36} & \textbf{0.977} \\

\bottomrule
\end{tabular}
}
\caption{\textbf{Comparisons of 3D Med-VAE and 2D Med-VAE Decoder.} The 2D Med-VAE-Decoder model performs downsizing on individual 2D slices, which are then stitched and interpolated together to form a latent representation of equivalent size to the 3D Med-VAE model; we then perform fine-tuning of the decoder using our curated dataset of 3D volumes. We compare perceptual quality of reconstructed volumes across six 3D image types. Here, $f$ represents the downsizing factor applied to the 3D volume of the input image and $C$ represents the number of latent channels. The best performing models on each task are bolded.}
\label{table:decoder}
\end{table*}

% ******* Figure ********
\begin{figure}[t]
\centering
\includegraphics[width=0.9\textwidth, trim=0 0 0 0]{figures/qualitative.png}
\caption{\textbf{Qualitative examples of reconstructed medical images.} The top section provides qualitative examples of a reconstructed chest X-ray. The bottom section provides qualitative examples of a reconstructed brain MRI slice. Residual figures show pixel-level differences between reconstructed images and original, high-resolution images; brighter colors represent larger differences.}
\label{fig:qualitative}
\end{figure}




In Table~\ref{table:perceptualidextended} and Table~\ref{table:3dperceptualextended}, we provide an extended version of Table~\ref{table:perceptualid} with additional perceptual quality evaluations. In Table~\ref{table:decoder}, we compare 3D Med-VAE with a model referred to as 2D Med-VAE-Decoder, which has a comparable downsizing factor $f$. The 2D Med-VAE-Decoder model performs downsizing on individual 2D slices, which are then stitched and interpolated together to form a latent representation of equivalent size to the 3D Med-VAE model; we then perform fine-tuning of the decoder using our curated dataset of 3D volumes. The superiority of 3D Med-VAE to the 2D Med-VAE-Decoder approach demonstrates the utility of 3D training of autoencoders, which enables the model to capture important volumetric patterns.



\begin{figure*}[h]
  \includegraphics[width=\columnwidth]{figures/readerstudyvis.pdf}
  \caption{\textbf{Reader study user interface}. Expert readers score each reconstructed chest X-ray with respect to image fidelity, preservation of clinically-relevant features, and the presence of artifacts. Each expert reader is presented with a pair of chest X-rays, consisting of an original high-resolution image $x$ on the left and a reconstructed image $\hat{x}$ on the right. Readers are blinded to both the method and the downsizing factor used to generate the reconstructed image.}
  \label{fig:readerstudyui}
\end{figure*}

For manual evaluations of reconstructed image quality, we perform a reader study with 3 radiologists. Each expert reader is presented with a pair of chest X-rays, consisting of an original high-resolution image $x$ on the left and a reconstructed image $\hat{x}$ on the right (Fig.~\ref{fig:readerstudyui}). A total of 50 unique chest X-rays with fractures, randomly sampled from CANDID-PTX, are selected and presented in a randomized order \cite{feng2021candid}. The reader study poses three distinct questions on image fidelity, preservation of clinically-relevant features, and the presence of artifacts. Each question is scored based on a 5-point Likert scale ranging between $-2$ and $2$. Below, we provide additional details on each of these questions: 
\begin{enumerate}
    \item \textbf{Image Fidelity:} This question aims to assess how closely the reconstructed chest X-ray resembles the original image in terms of image fidelity considering the overall similarity, level of detail preservation, and visual quality. A higher rating indicates a closer resemblance to the original image, while a lower rating implies a greater deviation or degradation.
    \item \textbf{Preservation of clinically-relevant features:} This question evaluates the extent to which the reconstructed chest X-ray preserves the diagnostic information present in the original image given the clarity and visibility of anatomical structures, abnormalities, and other important diagnostic features. A higher rating indicates a greater preservation of diagnostic information, while a lower rating suggests a significant loss that may affect the accuracy of diagnosis.
    \item \textbf{Presence of Artifacts:} This question focuses on the presence and impact of artifacts in the reconstructed chest X-ray. Artifacts can include image distortions, noise, blurring, or other visual anomalies (i.e., hallucinations) that are not present in the original image. A higher rating suggests little or no interference from artifacts, while a lower rating suggests a greater occurrence of artifacts.
\end{enumerate}

For automated perceptual quality evaluations on 2D images, we calculate PSNR and MS-SSIM on a random sample of 1000 images for each image type; we report mean and standard deviations across four runs with different random seeds. For automated perceptual quality evaluations on 3D images, we calculate PSNR and MS-SSIM on a single random sample of 100 images for each image type. For manual perceptual quality evaluations with expert readers, we report mean scores and 95\% confidence intervals across three readers.

In Figure~\ref{fig:qualitative}, we provide qualitative examples of reconstructed medical images.


\subsection{Ablations}
We analyze the effects of each stage of training on latent representation quality in Table~\ref{table:ablations2d} and Table~\ref{table:ablations3d}. 

We further ablate the inclusion of the embedding consistency loss term in the Stage 1 training procedure. We find that the embedding consistency loss term helps improve reconstructed image quality, particularly at lower compression factors. For instance, at a compression factor of $f=16$, Stage 1 training without the embedding consistency loss term achieves a PSNR of $37.27_{\pm 0.08}$ and an MS-SSIM of $0.992_{\pm 0.0}$ on mammograms. In comparison, Stage 1 training with the embedding consistency loss term achieves a PSNR of $37.57_{\pm 0.08}$ and an MS-SSIM of $0.993_{\pm 0.0}$, as shown in Table \ref{table:perceptualid}.


\begin{table*}[t]
%\scriptsize
\centering
\resizebox{\linewidth}{!}
{%\renewcommand{\arraystretch}{1.2}%
\begin{tabular}{ lcccccccc }
\toprule
\textbf{}
& \multicolumn{2}{c}{\textbf{}}
& \multicolumn{5}{c}{\textbf{AUROC} $\uparrow$}
& \textbf{}
\\
\cmidrule(l{3pt}r{0pt}){4-8}
\cmidrule(l{3pt}r{0pt}){9-9}
\textbf{Method}
& \textbf{$f$}
& \textbf{$C$}
& \small Malignancy
& \small Calcification
& \small BI-RADS
& \small Bone Age
& \small Wrist Fracture
& Avg.
\\ 
\midrule
\small High-Resolution & 1 & 1 & \textbf{66.1} & \textbf{62.4} & \textbf{63.4} & \textbf{80.2}  & \textbf{73.7} & \textbf{69.2}\\

\midrule
\small 2D Base Autoencoder (Stage 1) \hspace{0.5mm} & 16 & 3 &  58.7 & 60.5 & 58.0 & 72.0 & 64.3 & 62.7 \\
\small 2D Med-VAE (Stage 2) \hspace{0.5mm} & 16 & 3 &   \textbf{66.1} &   \textbf{61.7} & \textbf{62.3} & \textbf{82.1}  &   \textbf{70.6} & \textbf{68.6}\\
\midrule

\small 2D Base Autoencoder (Stage 1) \hspace{0.5mm} & 64 & 4 &   63.4 &  54.4 & 58.6 & 65.7 & 61.9 & 60.8 \\
\small 2D Med-VAE (Stage 2) \hspace{0.5mm} & 64 & 4  &  \textbf{64.9} &  \textbf{58.5} & \textbf{60.6} & \textbf{73.0} & \textbf{66.7} & \textbf{64.7}\\
\bottomrule
\end{tabular}
}
\caption{\textbf{Effect of each autoencoder training stage on 2D Med-VAE latent representation quality.} We evaluate the effects of each stage of 2D Med-VAE training on latent representation quality using five 2D CAD tasks.}
\label{table:ablations2d}
\vspace{-1mm}
\end{table*}


\begin{table*}[t]
%\scriptsize
\centering
\resizebox{0.8\linewidth}{!}
{%\renewcommand{\arraystretch}{1.2}%
\begin{tabular}{ lcccccc }
\toprule
\textbf{}
& \multicolumn{2}{c}{\textbf{}}
& \multicolumn{3}{c}{\textbf{AUROC} $\uparrow$}
& \textbf{}
\\
\cmidrule(l{3pt}r{0pt}){4-6}
\cmidrule(l{3pt}r{0pt}){7-7}
\textbf{Method}
& \textbf{$f$}
& \textbf{$C$}
& \small Spine Fractures
& \small Skull Fractures
& \small Knee Injury
& Avg.
\\ 
\midrule
\small High-Resolution & 1 & 1 & \textbf{82.9} & \textbf{63.9} & \textbf{69.9} & \textbf{72.2}\\

\midrule
\small 2D Base Autoencoder (Stage 1) \hspace{0.5mm} & 64 & 1 & 76.1 & 36.6 & 65.0 & 59.2 \\
\small 3D Med-VAE (Stage 2) \hspace{0.5mm} & 64 & 1 & \textbf{83.7} & \textbf{87.0} & \textbf{68.4} & {\textbf{79.7}} \\
\midrule

\small 2D Base Autoencoder (Stage 1) \hspace{0.5mm} & 512 & 1 & \textbf{72.5} & 45.4 & \textbf{68.8} & \textbf{62.2} \\
\small 3D Med-VAE (Stage 2) \hspace{0.5mm} & 512 & 1  & 72.0 & \textbf{49.1} & 58.2 & 59.8\\
\bottomrule
\end{tabular}
}
\caption{\textbf{Effect of each autoencoder training stage on 3D Med-VAE latent representation quality.} We evaluate the effects of each stage of 3D Med-VAE training on latent representation quality using three 3D CAD tasks. Since Stage 1 training exclusively involves 2D images, we evaluate this model on 3D tasks by stitching 2D latent representations together across slices such that the size of the 2D latent representation matches that of the representations generated by 3D models.}
\vspace{-1mm}
\label{table:ablations3d}
\end{table*}