\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{natbib}
% \sig{<text>}: typesets <text> in bold followed by a literal asterisk.
% NOTE(review): presumably intended to flag statistically significant results -- confirm at use sites.
\newcommand{\sig}[1]{%
  \textbf{#1*}%
}

\usepackage{multirow}
\usepackage{booktabs}   
\usepackage{tikz}
\usetikzlibrary{backgrounds}
\usepackage{xcolor}

\usetikzlibrary{arrows.meta,positioning,fit}
\usepackage{graphicx}
\graphicspath{{figures/}}
\jmlrvolume{-- nnn}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}
\usepackage[justification=centering]{caption}

\title[Deep Learning Based CAC Detection: A Synthetic Approach]{Machine-Learning Based Detection of
Coronary Artery Calcification Using
Synthetic Chest X-Rays}

\midlauthor{\Name{Dylan Saeed\nametag{$^{1}$}}\Email{d.saeed@student.unsw.edu.au}\\
\Name{Ramtin Gharleghi\nametag{$^{3}$}} \Email{r.gharleghi@unsw.edu.au}\\
\Name{Susann Beier\nametag{$^{2}$}} \Email{s.beier@unsw.edu.au}\\
\Name{Sonit Singh\nametag{$^{1}$}} \Email{sonit.singh@unsw.edu.au}\\
\AND 
\addr $^{1}$ School of Computer Science and Engineering, University of New South Wales, Australia \\
\addr $^{2}$ School of Mechanical and Manufacturing Engineering, University of New South Wales, Australia \\
\addr $^{3}$ Independent Researcher, Sydney, Australia
}
\begin{document}
\maketitle
\begin{abstract}
Coronary artery calcification (CAC) is a strong predictor of cardiovascular events, with computed tomography (CT)-based Agatston scoring widely regarded as the clinical gold standard. However, CT is costly and impractical for large-scale screening, while chest X-rays (CXRs) are inexpensive but lack reliable ground truth labels, constraining deep learning development. Digitally reconstructed radiographs (DRRs) offer a scalable alternative by projecting CT volumes into CXR-like images while inheriting precise labels. In this work, we provide the first systematic evaluation of DRRs as a surrogate training domain for CAC detection. Using 667 CT scans from the COCA dataset, we generate synthetic DRRs (posterior--anterior and lateral views per scan) and assess model capacity, super-resolution (SR) fidelity enhancement, preprocessing, and training strategies. Lightweight convolutional neural networks (CNNs) trained from scratch outperform larger networks (a CheXpert-pretrained DenseNet121 and a from-scratch ResNet18); pairing super-resolution with contrast enhancement yields significant gains; and curriculum learning stabilises training under weak supervision. Our best configuration achieves a mean area under the receiver operating characteristic curve (AUC) of 0.754, comparable to or exceeding prior CXR-based studies. These results establish DRRs as a scalable, label-rich foundation for CAC detection and lay the groundwork for future transfer learning and domain adaptation to real CXRs.
\end{abstract}

\begin{keywords}
Coronary artery calcification, digitally reconstructed radiographs, deep learning, super-resolution, domain adaptation, medical imaging, chest X-ray
\end{keywords}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}
Coronary artery calcification (CAC) is a well-established marker of atherosclerotic burden and a strong predictor of cardiovascular events and all-cause mortality~\cite{CAC-budoff}. Quantification using ECG-gated cardiac CT via the Agatston method remains the clinical gold standard for risk stratification~\cite{agatston1990quantification, cutoff}. However, CT screening is costly, resource-intensive, and impractical for large-scale population use~\cite{ct-screening}, motivating alternative modalities that are low-cost, scalable, and suitable for population-level risk assessment.

In contrast, chest X-rays (CXRs) are inexpensive and widely accessible, making them a low-cost, scalable modality for opportunistic CAC risk assessment~\cite{dancona2023deep}. Yet, CXRs have limited sensitivity for calcium depiction and lack standardised annotation protocols, leaving reliable ground truth labels scarce~\cite{kamel2021prediction}. Importantly, while a CT scan is not required at inference time for a CXR-based model, supervised training demands ground-truth CAC labels, which can only be derived from a corresponding CT. This necessitates retrospective assembly of paired CXR--CT datasets where both modalities were acquired within a clinically acceptable window (often $\leq$6 months). Such dual-modality acquisitions are uncommon in routine care and, when available, can introduce temporal misalignment and label noise due to disease progression between scans. This bottleneck constrains the development of deep learning methods, and prior studies~\cite{kamel2021prediction, dancona2023deep, jeong2024radiomics} remain limited in scale and generalisability. 

In such data-scarce settings, an emerging strategy is to train models within surrogate domains: synthetic or simulated approximations of the target modality that preserve key imaging physics while offering abundant labels~\cite{deepdrr, moore2011drr}. In clinical domains, digitally reconstructed radiographs (DRRs) are synthetic 2D projections of CT volumes that approximate real CXRs while being able to inherit precise CT-derived labels. Prior work in their generation has validated their clinical fidelity~\cite{moore2011drr}, suggesting that DRRs may serve as a scalable surrogate training domain where methodological feasibility can be established before transferring to real CXRs. Although direct transfer to real CXRs is not evaluated here, recent work has shown DRR-trained models to generalize effectively in fluoroscopic and interventional settings, supporting the plausibility of similar transferability for CAC detection~\cite{deepdrr}.
    

To our knowledge, this work provides the first systematic evaluation of DRRs as a surrogate training domain for CAC detection. We evaluate feasibility across model capacity, fidelity enhancement, preprocessing, and training strategies, and show that DRRs constitute a scalable and precisely labelled domain for developing models prior to transfer to real CXRs, laying the foundation for low-cost, population-scale cardiovascular risk screening using existing radiography infrastructure.

\section{Related Work}
\subsection{CT-based CAC detection}
ECG-gated cardiac CT is the clinical gold standard for CAC quantification via the Agatston method~\cite{agatston1990quantification}, and numerous studies have leveraged CT directly for automated CAC detection and scoring using deep learning~\cite{eng2021automated}. These approaches benefit from precise attenuation-based ground truth but remain limited by the cost, radiation dose, and infeasibility of CT screening at the population level~\cite{ct-screening}.

\subsection{CXR-based CAC detection}
Given the wide availability of chest radiographs, several groups have explored their utility for CAC risk assessment. \cite{kamel2021prediction} trained an attention-augmented VGG16 on 1,689 CXRs paired with CTs, achieving an AUC of 0.73 for CAC classification. \cite{dancona2023deep} demonstrated that deep learning on CXRs could refine pretest probability estimation in suspected angina patients, validated against invasive coronary angiography. \cite{jeong2024radiomics} proposed a radiomics-based approach requiring manual cardiac segmentation, reporting an AUC of 0.808 for detecting moderate-to-severe CAC ($>$100 Agatston units). More recently,~\cite{jeong2025ai} combined CXR features with multimodal patient data in a multi-objective learning framework for opportunistic CAC screening. Alternative accessible modalities have also been explored: \cite{song2021dual} and \cite{hsieh2022dual} demonstrated that dual-energy chest radiography can detect and quantify coronary calcium without CT, exploiting differential X-ray attenuation across energy levels. Collectively, these approaches demonstrate the potential of low-cost radiographic modalities for CAC screening, yet progress remains constrained by the difficulty of assembling large paired datasets, weak calcification sensitivity, and dependence on manual segmentation or specialised hardware—factors that hinder scalability and reproducibility.

\subsection{Synthetic imaging with DRRs}
DRRs have been widely used in radiotherapy and orthopaedics to simulate radiographs from CT volumes, and prior work has validated their anatomical and clinical fidelity~\cite{moore2011drr}. Recent studies have begun to explore DRRs as surrogate datasets for training deep learning models in scenarios where paired real-world imaging and labels are scarce~\cite{deepdrr}. However, these efforts have primarily focused on registration or dose optimisation rather than disease detection. To the best of our knowledge, no systematic evaluation has been conducted on their feasibility as a surrogate training domain for CAC detection.


\section{Methods}
\subsection{Dataset and Labels}
We use the publicly available Coronary Calcium and Chest CT (COCA) dataset~\cite{coca2022}, which contains 790 ECG-gated cardiac CT scans paired with coronary artery calcium (CAC) segmentations in \texttt{xml} format. The dataset is fully anonymised; detailed patient demographics are not provided. For each patient, a total Agatston score was computed by summing across per-artery calcium masks. Following clinical convention, we binarise labels at a threshold of 100: patients with scores $\leq$100 are classified as negative (no/mild CAC), and those $>$100 as positive (moderate/severe CAC). This threshold was not tuned, but chosen because it reflects the established clinical cutoff between non-actionable and clinically significant CAC~\cite{cutoff}.  

Scans were acquired predominantly at 120\,kV (99.5\% of cases) with a slice thickness of 3.0\,mm (98.9\%) or 2.5\,mm (1.1\%), and in-plane pixel spacing ranging from 0.25 to 0.72\,mm (mean 0.37\,mm). CT volumes were resampled to isotropic $\delta x$ mm spacing using trilinear interpolation, where $\delta x$ was matched to the in-plane pixel spacing of the source DICOM. Patients with insufficient slice coverage ($s < 30$) were excluded for quality control, yielding 667 usable scans with a median of 47 slices per volume (range 27--156). Although gated CTs do not capture full thoracic coverage, they represent the cardiac field of view most relevant for coronary artery calcium assessment~\cite{agatston1990quantification}. The cardiac silhouette is readily identifiable in CXRs, suggesting that analogous regions of interest could be isolated with standard localisation techniques, and that this dataset therefore remains a reasonable surrogate for methodological feasibility testing.

After binarisation at Agatston $>$100, the dataset comprised 490 negative ($\leq$100) and 177 positive ($>$100) patients, a ratio of approximately 2.8:1. The full score distribution was: 0--10 ($n{=}348$, 52.2\%), 11--100 ($n{=}142$, 21.3\%), 101--400 ($n{=}109$, 16.3\%), and $>$400 ($n{=}68$, 10.2\%). Class imbalance was handled via stratified cross-validation.

\subsection{Digitally Reconstructed Radiographs (DRRs)}
Synthetic radiographs were generated directly from CT volumes using the Siddon ray-tracing algorithm, implemented in the open-source \texttt{DiffDRR} framework~\cite{diffdrr}. Siddon projection computes exact line integrals through the CT volume, avoiding interpolation artefacts and preserving small, high-density structures such as calcifications. For each scan, we simulated posterior–anterior (PA) and lateral (LA) projections under a fan-beam geometry with fixed source–detector distance (1085.6\,mm) and detector width of 512 pixels at 1\,mm spacing; these parameters were selected to ensure full cardiac field-of-view coverage in the resulting projections. Both PA and LA projections used identical source–detector parameters, with the LA view generated by a $90^{\circ}$ rotation around the cranio-caudal axis to approximate orthogonal orientation. DRRs were normalised to $[0,1]$ and resized to $512 \times 512$. This pipeline (Figure~\ref{fig:min_pipeline}) yields synthetic radiographs that approximate clinical CXRs while retaining precise CAC labels inherited from the source CTs.

% Pipeline overview figure: CT volume -> DRR projection -> preprocessing -> classifier -> binary prediction.
% Elements are laid out left to right with relative positioning (positioning library);
% boxes are drawn on the background layer behind the images they enclose (fit + backgrounds libraries).
\begin{figure}[htbp]
\centering
% The trailing % after the opening brace (and after \end{tikzpicture}) keeps
% stray end-of-line spaces out of the box that \resizebox scales.
\resizebox{\textwidth}{!}{%
    \begin{tikzpicture}[
      font=\small,
      >=Latex,
      node distance=18mm and 18mm,
      arrow/.style={-Latex, line width=0.6pt},
      accent/.style={draw=black, fill=black!6, rounded corners, line width=0.6pt},
      img/.style={inner sep=0pt, outer sep=0pt},
      captext/.style={inner sep=1pt, text height=1.4ex, text depth=.3ex}
    ]

    % --- CT volume: three offset copies of the same slice, drawn back to front ---
    \begin{scope}[shift={(0,0)}]
      \node[img, xshift=4mm, yshift=-4mm] (ct3) {\includegraphics[width=24mm]{ct_slice1.png}};
      \node[img, xshift=2mm, yshift=-2mm] (ct2) {\includegraphics[width=24mm]{ct_slice1.png}};
      \node[img]                           (ct1) {\includegraphics[width=24mm]{ct_slice1.png}};
      % subtle bounding box to unify the stack
      \begin{scope}[on background layer]
          \node[fit=(ct1)(ct2)(ct3), accent] (ctbox) {};
      \end{scope}
    \node[captext, below=2mm of ctbox] {CT Volume};
    \end{scope}

    % --- Arrow: Projection (mid1 is an empty node used as the arrow target) ---
    \node[right=22mm of ctbox] (mid1) {};
    \draw[arrow] (ctbox.east) -- node[captext, above] {Projection} (mid1);
    \node[captext, left=3mm of mid1, yshift=-3mm] {\texttt{DiffDRR}};

    % --- DRR node (single image, boxed) ---
    \node[img, right=-0.5mm of mid1] (drrimg) {\includegraphics[width=22mm]{drr_pa.png}};
    \begin{scope}[on background layer]
        \node[fit=(drrimg), accent] (drrbox) {};
    \end{scope}
    \node[captext, below=2mm of drrbox]{Input DRR};

    % --- Arrow: Preprocess (mid2 is an empty node used as the arrow target) ---
    \node[right=22mm of drrbox] (mid2) {};
    \draw[arrow] (drrbox.east) -- node[captext, above]{Preprocess}(mid2);

    % --- Model node (image, boxed) ---
    \node[img, right=-0.5mm of mid2] (modelimg) {\includegraphics[width=30mm]{cnn4_gap.png}};
    \begin{scope}[on background layer]
        \node[fit=(modelimg), accent] (modelbox) {};
    \end{scope}
    \node[captext, below=2mm of modelbox]{ML Model};

    % --- Output box: empty accent box with the label overlaid at its centre ---
    \node[accent, right=15mm of modelbox, minimum width=25mm, minimum height=12mm] (outputbox) {};
    \node[captext, font=\footnotesize] at (outputbox) {No CAC / CAC};

    % --- Arrow from model to output ---
    \draw[arrow] (modelbox.east) -- (outputbox.west);
    \node[captext, below=8mm of outputbox]{Prediction};

    \end{tikzpicture}%
    }
\caption{A CT volume (left) is projected into a DRR using Siddon’s algorithm (\texttt{DiffDRR}) and fed to a classifier to predict a binary CAC label (Agatston $>100$).}
\label{fig:min_pipeline}
\end{figure}

\subsection{Image Enhancement Strategies}
To test whether DRRs provide sufficient fidelity for CAC detection—or whether additional enhancements are needed—we evaluate two complementary strategies: \textbf{(i)} pre-projection super-resolution and \textbf{(ii)} post-projection preprocessing.  

\subsubsection{Pre-projection super-resolution}
Native CTs often have anisotropic voxels, where coarse through-plane resolution, owing to a small number of axial slices, can obscure small calcifications once projected. To test whether resolution recovery enhances CAC depiction, we apply a $4\times$ SRResNet~\cite{srgan}—pretrained on natural images—to sagittal slices prior to projection. The super-resolved slices were then reassembled into a volume, resampled isotropically, and projected identically to the native-resolution pipeline. We acknowledge that a dedicated CT super-resolution model would be preferable; however, paired low-/high-resolution CT data were unavailable. Medically pretrained super-resolution models remain an avenue for future work. This comparison probes whether fine detail restoration materially improves DRR-based detection.

\subsubsection{Post-projection image preprocessing}
We further test whether DRR adjustments can aid detection by making CAC visually more salient. Three variants were compared:  
\textbf{(1) Original:} unaltered projection.  
\textbf{(2) CLAHE:} contrast-limited adaptive histogram equalisation to locally enhance soft-tissue contrast, implemented using an $8{\times}8$ tile grid and a clip limit of $2.0$. \textbf{(3) Calc-focused:} a composite filter designed to highlight calcifications and suppress irrelevant anatomy, comprising gamma correction ($\gamma{=}1.5$), CLAHE, and unsharp masking with a $5{\times}5$ Gaussian kernel ($\sigma{=}1.0$) and a gain of $1.5$.
This axis examines whether heuristic contrast enhancement improves learnability, or whether native DRRs are sufficient (Figure~\ref{fig:preproc}).
\begin{figure}[!h]
    \centering
    \includegraphics[width=0.65\linewidth]{preproc.png}
    
    \vspace{1mm}
    \begin{minipage}{0.9\linewidth}
        \centering
        \hspace{0.0\linewidth}(a) Original \hspace{0.08\linewidth}(b) CLAHE \hspace{0.08\linewidth}(c) Calc-focused
    \end{minipage}

    \caption{Preprocessing variants applied to native DRRs (top row) and super-resolved DRRs (bottom row). Columns show (a) Original (unaltered projection), (b) CLAHE (contrast-limited adaptive histogram equalisation), and (c) Calc-focused (gamma correction + CLAHE + unsharp masking).}
    \label{fig:preproc}
\end{figure}
\subsection{Experimental Overview}
Figure~\ref{fig:exp_overview} summarises the full experimental design. From each CT volume, DRRs are generated using the native pipeline or after $4\times$ SR upsampling. Each DRR then passes through one of three preprocessing modes. Single-view (PA) models are evaluated across three architectures; multi-view models fuse PA and LA DRRs at three interaction levels. Training strategies (curriculum, SimCLR, or both) are applied to the best-performing fusion configuration.

% Experimental design overview: five columns laid out left to right at explicit coordinates
% (input -> resolution -> preprocessing -> model/fusion -> training -> output).
% Each multi-box column is wrapped in a dashed group box (fit library) with an italic header.
\begin{figure}[htbp]
\centering
\resizebox{\textwidth}{!}{%
\begin{tikzpicture}[
  font=\small,
  >=Latex,
  node distance=5mm and 8mm,
  % align=center (rather than text centered) is required so that \\ can break
  % lines inside a node that has no explicit text width.
  box/.style={draw, rounded corners=3pt, fill=white, align=center,
              minimum width=26mm, minimum height=8mm, inner sep=3pt, line width=0.5pt},
  inbox/.style={box, fill=black!8, minimum width=22mm},
  grp/.style={draw=black!35, rounded corners=4pt, dashed,
              inner xsep=5pt, inner ysep=6pt, line width=0.5pt},
  arr/.style={-Latex, line width=0.55pt},
  hdr/.style={font=\footnotesize\itshape, text=black!55},
]

%% ---- Node layout (all positions explicit) ----
% Input (x=0)
\node[inbox] (ct) at (0,0) {CT Volume};

% Resolution column (x=4.2)
\node[box] (sr)     at (4.2, 0.6)  {SR DRR\\(SRResNet $4{\times}$)};
\node[box] (native) at (4.2,-0.6)  {Native DRR};
\node[grp, fit=(sr)(native)] (resbox) {};
\node[hdr] at (resbox.north) [above=1pt] {Resolution};

% Preprocessing column (x=9.0)
\node[box] (orig)  at (9.0, 1.0)  {Original};
\node[box] (clahe) at (9.0, 0.0)  {CLAHE};
\node[box] (calc)  at (9.0,-1.0)  {Calc-focused};
\node[grp, fit=(orig)(clahe)(calc)] (prepbox) {};
\node[hdr] at (prepbox.north) [above=1pt] {Preprocessing};

% Model/Fusion column (x=14.2)
\node[box] (cnn)   at (14.2, 1.8) {CNN5\_GAP};
\node[box] (res18) at (14.2, 0.6) {ResNet18};
\node[box] (dense) at (14.2,-0.6) {DenseNet121};
\node[box] (fuse)  at (14.2,-1.8) {PA+LA Fusion};
\node[grp, fit=(cnn)(res18)(dense)(fuse)] (modbox) {};
\node[hdr] at (modbox.north) [above=1pt] {Model / Fusion};

% Training column (x=19.2)
\node[box] (std)    at (19.2, 1.0) {Standard};
\node[box] (curric) at (19.2, 0.0) {Curriculum};
\node[box] (simclr) at (19.2,-1.0) {SimCLR (+Curric.)};
\node[grp, fit=(std)(curric)(simclr)] (trainbox) {};
\node[hdr] at (trainbox.north) [above=1pt] {Training};

% Output (x=24.0)
\node[inbox] (out) at (24.0, 0) {CAC / No CAC};

%% ---- Arrows ----
% Each arrow leaves the east side of the previous column, runs a short
% horizontal stub, then bends (|-) into the west side of its target box.
\draw[arr] (ct.east) -- ++(0.6,0) |- (sr.west);
\draw[arr] (ct.east) -- ++(0.6,0) |- (native.west);

\draw[arr] (resbox.east) -- ++(0.5,0) |- (orig.west);
\draw[arr] (resbox.east) -- ++(0.5,0) |- (clahe.west);
\draw[arr] (resbox.east) -- ++(0.5,0) |- (calc.west);

\draw[arr] (prepbox.east) -- ++(0.5,0) |- (cnn.west);
\draw[arr] (prepbox.east) -- ++(0.5,0) |- (res18.west);
\draw[arr] (prepbox.east) -- ++(0.5,0) |- (dense.west);
\draw[arr] (prepbox.east) -- ++(0.5,0) |- (fuse.west);

\draw[arr] (modbox.east) -- ++(0.5,0) |- (std.west);
\draw[arr] (modbox.east) -- ++(0.5,0) |- (curric.west);
\draw[arr] (modbox.east) -- ++(0.5,0) |- (simclr.west);

\draw[arr] (trainbox.east) -- (out.west);

\end{tikzpicture}}
\caption{Experimental design overview. Starting from a CT volume, DRRs are generated at native or super-resolved (SR) resolution and subjected to one of three preprocessing modes. Single-view (PA) models and multi-view PA+LA fusion variants are evaluated across training strategies. The best configuration (CNN5\_GAP, cross-attention fusion, SimCLR+Curriculum) achieves a mean AUC of 0.754.}
\label{fig:exp_overview}
\end{figure}

\subsection{Model Architectures}
To assess the robustness across model capacity and inductive bias, we evaluate three representative networks:
\textbf{(i) CNN5\_GAP} (Figure~\ref{fig:cnn5_gap}): a lightweight custom CNN with five convolutional blocks, global average pooling, and a two-layer classifier. Block~1 uses a $5{\times}5$ kernel; Blocks~2--5 use $3{\times}3$ kernels; all blocks apply LeakyReLU activations and batch normalisation, followed by dropout ($p{=}0.2$–$0.3$) in the classifier head for regularisation.
\textbf{(ii) ResNet18}~\cite{resnet}: a moderate-capacity residual network trained from scratch, serving as a standard baseline.
\textbf{(iii) DenseNet121}~\cite{densenet-backbone}: pretrained on CheXpert~\cite{chexpert} and augmented with a lightweight self-gating spatial attention mechanism derived from CBAM. More specifically, given feature maps $F \in \mathbb{R}^{B \times C \times H \times W}$,
we compute a spatial attention mask
\[ M = \sigma\!\left(\frac{1}{C}\sum_{c=1}^{C} F_c \right), \]
where $\sigma(\cdot)$ denotes the sigmoid function. The attended feature map is then $\tilde{F} = F \odot M$, which reweights activations by their spatial salience prior to global pooling.

For multi-view experiments, we use dual-encoder variants incorporating posterior--anterior (PA) and lateral (LA) projections. Fusion is tested at three levels: \textbf{(i) early fusion} — PA and LA DRRs are channel-concatenated into a single $512{\times}512{\times}2$ input before encoding; \textbf{(ii) intermediate fusion} — each view passes through a separate encoder and the resulting feature vectors are concatenated before the classifier; \textbf{(iii) cross-attention fusion} — a shared encoder processes both views; PA features act as queries attending to LA features as keys and values, allowing the dominant PA signal to selectively incorporate complementary lateral information; attended PA features and original LA features are then combined via a learned sigmoid gate. These designs examine whether DRRs encode complementary view information analogous to real CXR studies.
% CNN5_GAP architecture diagram: five convolutional blocks drawn as rectangles of
% decreasing height, followed by the classifier head (GAP -> FC + LReLU -> FC -> Sigmoid).
\begin{figure}[htbp]
\centering
\begin{tikzpicture}[
  font=\small, >=Latex,
  arr/.style={-Latex, line width=0.5pt},
  conv/.style={draw=blue!60!black, fill=blue!10, line width=0.5pt,
               minimum width=12mm, text centered, inner sep=1pt},
  % Geometry shared by every classifier-head box; colour and width are set per style below.
  headbox/.style={rounded corners=2pt, line width=0.5pt, minimum height=10mm,
                  text centered, inner sep=2pt},
  gapst/.style={headbox, draw=green!55!black, fill=green!10, minimum width=13mm},
  fcst/.style={headbox, draw=orange!65!black, fill=orange!10, minimum width=22mm},
  fc2st/.style={headbox, draw=orange!65!black, fill=orange!10, minimum width=13mm},
  outn/.style={headbox, draw=red!50!black, fill=red!8, minimum width=14mm},
  lbl/.style={font=\footnotesize, text=black!70},
  chlbl/.style={font=\scriptsize, text=blue!75!black},
]

%% Baselines shared by the block labels and the channel counts
\coordinate (lblbase) at (0,-19mm);
\coordinate (chbase)  at (0,-23mm);

%% ---- Conv blocks ----
% One entry per block: index / box height / x position / channel count.
% Heights encode spatial resolution (sqrt-scaled).
\foreach \idx/\blockht/\xpos/\nch in {%
    1/30mm/0mm/16,%
    2/22mm/16mm/32,%
    3/16mm/32mm/64,%
    4/11mm/48mm/128,%
    5/8mm/64mm/256} {
  \node[conv, minimum height=\blockht] (c\idx) at (\xpos, 0) {};
  \node[lbl]   at (c\idx |- lblbase) {Conv \idx};
  \node[chlbl] at (c\idx |- chbase)  {\nch{} ch};
}

%% ---- Classifier head ----
\node[gapst] (gap) at (79mm, 0)  {\small GAP};
\node[fcst]  (fc1) at (99mm, 0)  {\small FC $+$ LReLU};
\node[fc2st] (fc2) at (120mm, 0) {\small FC};
\node[outn]  (out) at (137mm, 0) {\small Sigmoid};

%% ---- Arrows: chain each stage's east anchor to the next stage's west anchor ----
\foreach \src/\dst in {c1/c2, c2/c3, c3/c4, c4/c5, c5/gap, gap/fc1, fc1/fc2, fc2/out} {
  \draw[arr] (\src.east) -- (\dst.west);
}

\end{tikzpicture}
\caption{CNN5\_GAP architecture. Block height reflects spatial resolution ($512^2{\to}16^2$, halved per block); channel depth increases $16{\to}32{\to}64{\to}128{\to}256$. Each block applies Conv ($5{\times}5$ for Block~1, $3{\times}3$ otherwise), batch normalisation, LeakyReLU, and max-pool ${\downarrow}2{\times}$. GAP produces a 256-d vector fed to two FC layers with dropout.}
\label{fig:cnn5_gap}
\end{figure}

\subsection{Training \& Evaluation}
Models were trained with binary cross-entropy loss and label smoothing ($\varepsilon=0.1$) using Adam~\cite{adam} (learning rate $10^{-4}$, weight decay $10^{-5}$, batch size 32). Augmentations consisted of random affine transformations: rotations ($\pm 5^{\circ}$), translations ($\pm 5\%$), scaling ($[0.9, 1.1]$) and shearing ($10^{\circ}$). Horizontal flips were excluded to preserve anatomical context. Early stopping was applied on validation AUC, which also served as the model-selection criterion, and dropout and batch normalisation were used for regularisation. Multi-view fusion models used a curriculum schedule~\cite{Curric} (extremes first, then borderline), intended as a heuristic to reduce instability from thresholding.

Performance was evaluated with stratified 5-fold cross-validation (CV) across 5 seeds (25 runs) with approximately 530 training and 130 validation DRRs. Metrics included AUC-ROC (primary), with accuracy, precision, recall, and F1 as secondary. All splits were patient-level and stratified by CAC label to avoid data leakage across folds. To reduce optimism bias from volatile early epochs, we discarded the first five epochs and averaged performance over the top five subsequent epochs, selected by highest validation AUC. This conservative reporting may slightly underestimate peak metrics, but ensures robustness. Statistical significance was assessed using paired Wilcoxon signed-rank tests~\cite{hollander2013} across validation folds. Results were considered significant at $p < 0.05$. Augmentations were applied to 2D DRRs after projection; applying augmentations in 3D before projection would be more physically meaningful and is left for future work. All experiments were conducted on the UNSW Katana high-performance computing cluster, utilising NVIDIA GPU nodes (V100, A100, L40S, H200, and GH200) via a queue-based scheduler.

All preprocessing, training, and evaluation code is implemented in PyTorch and will be made publicly available upon publication for reproducibility.
\section{Results}
We present results in a structured manner, beginning with single-view baselines and ablations on preprocessing and super-resolution, followed by multi-view fusion strategies, and finally training variants such as curriculum learning. Unless otherwise noted, all results are reported as mean validation AUC (± standard deviation) across 25 runs, with statistical significance assessed using the Wilcoxon signed-rank test.

\subsection{Baselines, Preprocessing, and Super-resolution}
To assess the impact of preprocessing and super-resolution (SR) on model performance, we evaluated three architectures (CNN5\_GAP, DenseNet121, and ResNet18) across three DRR preprocessing modes (Original, CLAHE, Calc-focused) under both native and super-resolved inputs (Table \ref{tab:preproc_sr}). For CNN5\_GAP, mean AUCs improved by $+0.015$  ($0.720\rightarrow0.735$) from native-original to SR-calc-focused, with SR yielding significant gains under calc-focused and CLAHE preprocessing ($p<0.05$). DenseNet121 performance degraded $(0.718 \rightarrow 0.688;\,\,p < 0.05)$ under contrast-enhancing preprocessing, with SR offering only minor recovery. ResNet18 remained stable across most modes, with no significant differences observed. 

Given its consistent and competitive performance across all conditions, CNN5\_GAP was selected as the primary backbone for subsequent experiments.
\begin{table}[!h]
\centering
\caption{Validation AUC for three architectures (CNN5\_GAP; DenseNet121 with CBAM-derived spatial attention; ResNet18) across three preprocessing modes (Original, CLAHE, Calc-focused) and two resolution conditions (Native, SR = $4\times$ super-resolved).\protect\footnotemark}
\label{tab:preproc_sr}
\resizebox{\linewidth}{!}{
\begin{tabular}{lcccccc}
\toprule
\multirow{2}{*}{Preprocessing} & \multicolumn{2}{c}{CNN5\_GAP} & \multicolumn{2}{c}{DenseNet121} & \multicolumn{2}{c}{ResNet18} \\
 & Native & SR & Native & SR & Native & SR \\
\midrule
Original & 0.720 ± 0.048 & 0.727 ± 0.050 & \textbf{0.718 ± 0.031$^\dagger$} & 0.708 ± 0.026 & 0.714 ± 0.031 & 0.717 ± 0.031 \\
CLAHE    & 0.728 ± 0.049 & \textbf{0.733 ± 0.043$^*$} & 0.688 ± 0.053 & 0.695 ± 0.028 & 0.716 ± 0.068 & 0.702 ± 0.022 \\
Calc-foc & 0.729 ± 0.053 & \textbf{0.735 ± 0.043$^*$} & 0.683 ± 0.058 & 0.688 ± 0.042 & 0.722 ± 0.070 & 0.723 ± 0.032 \\
\bottomrule
\end{tabular}}
\end{table}
\footnotetext{$\dagger$ $p < 0.05$ vs.\ CLAHE and Calc-foc; $*$ $p < 0.05$ vs.\ Original.}

\subsection{Fusion \& Training Strategies}
We next evaluated whether combining posterior--anterior (PA) and lateral (LA) projections could improve performance over the best-performing single-view (PA) baseline of $0.735 \pm 0.043$. Three fusion strategies were tested: early fusion (image concatenation), intermediate fusion (latent feature concatenation), and cross-attention fusion. Each was implemented with either shared or unshared encoders and a learnable scalar-gated mechanism to weigh each view's contribution.

\begin{table}[!h]
\centering
\caption{Validation AUC performance of PA+LA fusion strategies, varying by level of interaction (early, intermediate, or cross-attention) and whether encoders are shared or unshared.}
\label{tab:fusion_results}
\resizebox{0.7\linewidth}{!}{
\begin{tabular}{lcc}
\toprule
Fusion Strategy & Shared Encoder & Unshared Encoder \\
\midrule
PA-Only (Baseline)         & $0.735 \pm 0.043$ & -- \\
Early Fusion             & 0.702 ± 0.044 & -- \\
Intermediate Fusion      & 0.736 ± 0.059 & 0.739 ± 0.065 \\
Cross-Attention Fusion   & \textbf{0.740 ± 0.043} & 0.729 ± 0.045 \\
\bottomrule
\end{tabular}}
\end{table}
As shown in Table~\ref{tab:fusion_results}, early fusion was significantly worse ($p < 0.05$) than the PA-only baseline ($0.735 \pm 0.043$). Intermediate fusion achieved mean AUCs of 0.736 (shared encoders) and 0.739 (unshared), while cross-attention fusion reached 0.740 (shared) and 0.729 (unshared). Differences among fusion variants were within one standard deviation, and none reached statistical significance \emph{relative} to the PA-only baseline. Notably, LA-only classification (AUC $0.695 \pm 0.043$; see Appendix~\ref{appendix:C}) substantially trailed the PA-only baseline.

We also assessed the effect of alternative training strategies on the best-performing fusion model (cross-attention/shared). Specifically, we evaluated curriculum learning, SimCLR-based self-supervised pretraining~\cite{simclr}, and their combination.

\begin{table}[!h]
\caption{Validation AUC across training strategies applied to the best-performing fusion configuration (CNN5\_GAP, cross-attention fusion, shared encoders). Results reported as mean $\pm$ std across 25 runs.}
\centering
\resizebox{0.55\linewidth}{!}{
\begin{tabular}{lc}
\toprule
Training Strategy & Mean AUC \\
\midrule
Fusion (standard)        & 0.740 $\pm$ 0.043 \\
Curriculum Learning      & 0.750 $\pm$ 0.046 \\
SimCLR Pretraining       & 0.742 $\pm$ 0.045 \\
SimCLR + Curriculum      & \textbf{0.754 $\pm$ 0.055} \\
\bottomrule
\end{tabular}}
\label{tab:simclr_curric}
\end{table}

Table~\ref{tab:simclr_curric} shows that curriculum learning raised mean AUC from 0.740 to 0.750. SimCLR pretraining yielded 0.742, close to the baseline. SimCLR combined with curriculum learning achieved the highest observed mean AUC (0.754), although this difference lies within run-to-run variance and is not statistically significant. The consistent direction of improvement across folds suggests potential value, but larger datasets are required for confirmation (see Appendix~\ref{appendix:C}, Table~\ref{tab:wilcoxon_results} for pairwise Wilcoxon statistics).

At a clinically relevant operating point of 90\% sensitivity, the best-performing model (SimCLR + Curriculum) achieved: precision $0.58 \pm 0.05$, recall $0.91 \pm 0.00$, F1 $0.70 \pm 0.03$, and accuracy $0.63 \pm 0.06$. The low variance in F1 ($\pm 0.03$) demonstrates stable performance across all 25 runs. The modest precision reflects the inherent sensitivity--specificity trade-off and the class imbalance in the dataset.

Performance varied with Agatston score magnitude: accuracy was highest for extreme cases (0--10: $0.74 \pm 0.11$, $n{=}69$; $>$400: $0.78 \pm 0.13$, $n{=}13$) and lowest near the decision boundary (11--100: $0.55 \pm 0.15$, $n{=}28$; 101--400: $0.72 \pm 0.12$, $n{=}21$). The reduced accuracy in the 11--100 range is expected, as these patients have subclinical calcification with subtle imaging features, consistent with the weak image-level supervision inherent in binary labels. These results motivate examining how view complementarity and training stability interact under limited data, as discussed next.


\section{Discussion}
Our experiments evaluated the feasibility of using DRRs as a surrogate domain for CAC classification, focusing on: \textbf{(i)} fidelity enhancement via super-resolution (SR), \textbf{(ii)} signal optimisation through preprocessing, and \textbf{(iii)} fusion of complementary projections.

\subsection{Super-resolution and Preprocessing}

Coronary artery calcifications are high-frequency features whose visibility is diminished when CT volumes are reconstructed at low axial resolutions. On its own, SR modestly improved fidelity, yielding consistent but non-significant gains in CNN5\_GAP ($+0.007$; $p=0.381$). When paired with preprocessing, SR-enabled pipelines achieved improvements over their native baselines ($p < 0.05$), with Calc\_foc + SR yielding the strongest single-view performance (0.735). This suggests that SR may play an enabling role: by partially restoring high-frequency structure, it could provide the substrate that preprocessing methods can then amplify into a more discriminative signal.

Preprocessing alone showed model-dependent behaviour. CNN5\_GAP adapted well, whereas DenseNet121 degraded significantly ($p < 0.05$). This highlights a key trade-off: pretrained backbones encode strong natural-image priors that are brittle to the contrast shifts introduced by synthetic-domain preprocessing, while lightweight task-specific CNNs trained from scratch are more flexible and can exploit enhanced contrast once fidelity is recovered—consistent with the broader observation that compact models generalise better within surrogate domains where texture and intensity statistics differ from real-world pretraining distributions. However, real CXRs are already acquired at high resolutions; SR's benefit is unique to CT-derived projections. In contrast, preprocessing strategies transfer rather trivially. In clinical CXRs, where acquisition noise and patient variability are greater, local contrast enhancement or normalisation may provide more value by homogenising inputs while highlighting subtle calcifications. 

Taken together, these results suggest that fidelity restoration and contrast enhancement jointly determine the effective signal-to-noise ratio for CAC depiction in DRRs.

\subsection{Fusion \& Training Strategies}
Given that calcifications may be obscured in a single projection, we evaluated fusion of PA and LA DRRs. Early image-level fusion performed significantly worse ($p < 0.05$), suggesting destructive interference between views. Intermediary feature fusion and cross-attention achieved the best observed gains, though improvements did not reach statistical significance, likely reflecting fold-level variance under limited data. Notably, LA-only classification underperformed (AUC 0.695, see Table~\ref{tab:la_performance}), indicating that PA projections provide most of the discriminative information, while the marginal gain from incorporating LA views may be limited under current dataset size and coverage.
Curriculum learning stabilised training (+0.010 AUC), SimCLR offered marginal gains, and their combination reached the highest mean AUC (0.754), consistent across folds but not statistically significant. Their complementary effects likely stem from stabilising optimisation (curriculum) and improving representation initialisation (SimCLR). Ultimately, these observations highlight the need for larger, more diverse datasets to fully exploit multi-view complementarity and self-supervised pretraining. 

Overall, SR enhances DRR fidelity for lightweight CNNs, preprocessing benefits depend on architecture, and fusion trends are promising but data-limited. DRRs reproduce key behaviours seen in real CXR studies while providing label-rich CT-derived data, substantiating their value as a surrogate domain and motivating future work in domain adaptation.


\subsection{Failure Mode Analysis}

To better understand model behaviour, we performed GradCAM visualisations on representative cases from our best-performing model. Figure~\ref{fig:gradcam} shows attention maps for true positives, false positives, and false negatives.

For true positives with high Agatston scores, attention concentrates on the central cardiac silhouette, consistent with the anatomical location of coronary arteries. However, we note this may partially reflect image intensity distribution rather than learned calcification-specific features. False positives show attention shifted toward the spine, suggesting the model may confuse vertebral density with cardiac calcification. False negatives exhibit diffuse, unfocused attention patterns, indicating failure to localise discriminative features in these cases.

These findings suggest that while the model learns anatomically plausible attention for clear positive cases, errors arise from confounding dense structures and insufficient feature localisation for atypical presentations. Future work could incorporate anatomical priors or region-specific losses to improve specificity.

Table~\ref{tab:agatston_strat} further characterises model behaviour by Agatston score range, confirming that classification accuracy is highest for extreme cases and lowest near the decision boundary.

\begin{table}[htbp]
\centering
\caption{Classification accuracy of the best-performing model (SimCLR + Curriculum) stratified by Agatston score range. Results shown as mean $\pm$ std across 25 runs.}
\label{tab:agatston_strat}
\begin{tabular}{lcc}
\toprule
Agatston Range & $n$ & Accuracy \\
\midrule
0--10   & 69 & $0.74 \pm 0.11$ \\
11--100 & 28 & $0.55 \pm 0.15$ \\
101--400 & 21 & $0.72 \pm 0.12$ \\
$>$400  & 13 & $0.78 \pm 0.13$ \\
\bottomrule
\end{tabular}
\end{table}

\begin{figure}[!htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:gradcam}
  {\caption{GradCAM visualisations. Left: True positive (Agatston 1272) showing attention on the cardiac silhouette. Centre: False positive (Agatston 1) showing attention shifted toward the spine. Right: False negative (Agatston 449) showing diffuse, unfocused attention.}}
  {\includegraphics[width=\textwidth]{gradcam_panel.png}}
\end{figure}

\subsection{Comparison to Prior Work}
Our best-performing configuration (mean AUC = 0.754 $\pm$ 0.055) is numerically comparable to prior CXR-based CAC classification studies. \cite{kamel2021prediction} reported an AUC of 0.73 using an attention-augmented VGG16 trained on 1,689 paired CXRs; \cite{dancona2023deep} achieved $\approx$0.71 AUC predicting significant coronary artery disease from radiographs; and~\cite{jeong2024radiomics} reported 0.808 AUC using a radiomics approach requiring manual cardiac segmentation. We stress that this comparison is not a direct performance claim: our model is trained on synthetic DRRs with perfect CT-derived labels, whereas prior work contends with real CXR images and their inherent label noise from retrospective CXR--CT pairing. Our result therefore demonstrates \emph{methodological feasibility} rather than superiority.

Our pipeline is \textbf{fully automated} (no manual segmentation or handcrafted features) and \textbf{intrinsically scalable}: a labelled CT repository yields thousands of precisely aligned DRRs automatically, removing the CXR--CT pairing bottleneck and establishing a reproducible, label-rich platform for future real-CXR transfer.

\section{Limitations}
Several limitations should be noted. Agatston scores provide image-level supervision only; future work should explore per-artery DRR labels to capture spatial heterogeneity. The dataset is small by deep-learning standards (${\sim}$130 validation DRRs per fold), contributing to fold-level variance and modest effect sizes. The synthetic domain may not fully replicate clinical CXRs—differences in noise statistics, detector response, and cardiac field-of-view limit direct transfer, so this study demonstrates methodological feasibility rather than clinical readiness; domain adaptation and real-CXR validation remain essential next steps. No systematic hyperparameter search was conducted for CNN5\_GAP, and Vision Transformers were excluded due to their higher data requirements. Finally, the absence of a held-out test set means all metrics are cross-validation estimates; a properly held-out test set should be standard practice as datasets scale.

\section{Conclusion}
This study establishes CT-derived DRRs as a viable surrogate training domain for deep learning-based CAC detection. Lightweight CNNs trained from scratch outperform larger pretrained models; pairing super-resolution with contrast enhancement yields statistically significant gains; and curriculum learning stabilises training under weak supervision, with SimCLR offering only marginal additional benefit. Multi-view fusion produced the best observed performance, though gains did not reach significance under data-limited evaluation. Collectively, these findings demonstrate that DRRs provide a scalable, label-rich foundation for reproducible CAC research prior to real-CXR transfer. With advances in domain adaptation and dataset scale, DRR-pretrained models could underpin population-level cardiovascular risk screening using existing radiography infrastructure.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{The authors are grateful for support from the 2024 Cardiac, Vascular, and Metabolic Medicine (CVMM) Theme Collaborative Grant Scheme.}


\bibliography{midl26_019}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage

\appendix
\section{Statistical Testing Methods}
For each paired comparison (same folds/seeds), we apply the Wilcoxon signed-rank test on per-run AUC values. Using 5 random seeds across 5-fold cross-validation yields $n=25$ runs. We report the test statistic and $p$-value. Significant results ($p<0.05$) are shown in bold and marked with a $*$.

\section{Gated Dataset}
\label{appendix:A}
\begin{table}[!h]
\centering
\caption{Gated dataset – Wilcoxon signed-rank test within CNN5\_GAP across super-resolution and preprocessing modes. Statistical significance at $p<0.05$ is marked with *.}
\resizebox{\linewidth}{!}{
\begin{tabular}{lcccc}
\hline
Comparison & Stat & $p$-value & Mean AUC A & Mean AUC B \\
\hline
Native: orig vs calc\_foc & 97.0 & 0.078 & 0.720 & 0.729 \\
Native: orig vs clahe & 104.0 & 0.120 & 0.720 & 0.728 \\
Native: calc\_foc vs clahe & 151.0 & 0.771 & 0.729 & 0.728 \\
SR: orig vs calc\_foc & 66.0 & \sig{8.07e-03} & 0.727 & 0.735 \\
SR: orig vs clahe & 89.0 & \sig{0.048} & 0.727 & 0.733 \\
SR: calc\_foc vs clahe & 123.0 & 0.300 & 0.735 & 0.733 \\
Native vs SR (orig) & 129.0 & 0.381 & 0.720 & 0.727 \\
Native vs SR (calc\_foc) & 102.0 & 0.107 & 0.729 & 0.735 \\
Native vs SR (clahe) & 91.0 & 0.055 & 0.728 & 0.733 \\
\hline
\end{tabular}}
\end{table}


\begin{table}[!h]
\centering
\caption{Gated dataset – Wilcoxon signed-rank test within DenseNet121 across super-resolution and preprocessing modes. Statistical significance at $p<0.05$ is marked with *.}
\resizebox{\linewidth}{!}{
\begin{tabular}{lcccc}
\hline
Comparison & Stat & $p$-value & Mean AUC A & Mean AUC B \\
\hline
Native: orig vs calc\_foc & 83.0 & \sig{0.032} & 0.718 & 0.683 \\
Native: orig vs clahe & 81.0 & \sig{0.028} & 0.718 & 0.688 \\
Native: calc\_foc vs clahe & 122.0 & 0.276 & 0.683 & 0.688 \\
SR: orig vs calc\_foc & 66.5 & \sig{0.010} & 0.708 & 0.688 \\
SR: orig vs clahe & 80.0 & \sig{0.026} & 0.708 & 0.695 \\
SR: calc\_foc vs clahe & 128.0 & 0.367 & 0.688 & 0.695 \\
Native vs SR (orig) & 105.0 & 0.122 & 0.718 & 0.708 \\
Native vs SR (calc\_foc) & 150.0 & 0.751 & 0.683 & 0.688 \\
Native vs SR (clahe) & 129.0 & 0.381 & 0.688 & 0.695 \\
\hline
\end{tabular}}
\end{table}

\newpage
\section{Fusion Analysis}
\label{appendix:C}
\subsection{Lateral-only Performance}
Here we assess the strength of the calcium signal in Lateral (LA) projections. Significance was assessed with Wilcoxon signed-rank tests at $p < 0.05$.
\begin{table}[!h]
\centering
\caption{Wilcoxon signed-rank test results for LA-only AUC across preprocessing modes using CNN5\_GAP.
Statistical significance at $p<0.05$ is marked with *.}
\label{tab:la_performance}
\resizebox{\linewidth}{!}{
\begin{tabular}{lcccc}
\toprule
Comparison & Stat & $p$-value & Mean AUC A & Mean AUC B \\
\midrule
LA Native: orig vs calc\_foc & 37.0 & \sig{3.29e-04} & 0.672 & 0.695 \\
LA Native: orig vs clahe & 57.0 & \sig{0.003} & 0.672 & 0.691 \\
LA Native: calc\_foc vs clahe & 120.5 & 0.258 & 0.695 & 0.691 \\
\bottomrule
\end{tabular}}
\end{table}

\subsection{Fusion and Training Strategy Comparisons}
Pairwise Wilcoxon signed-rank test results comparing fusion baselines, curriculum learning, and SimCLR variants. Significant results at $p < 0.05$ are marked with $*$.
\begin{table}[!h]
\centering
\caption{Pairwise Wilcoxon signed-rank test results comparing fusion baselines, curriculum learning, and SimCLR variants. Statistical significance at $p<0.05$ is marked with *.}
\label{tab:wilcoxon_results}
\resizebox{\linewidth}{!}{
\begin{tabular}{lcccc}
\toprule
Comparison & Stat & $p$-value & Mean AUC A & Mean AUC B \\
\midrule
Early vs Int. Fusion (Shared) & 11.0 & \sig{4.6e-05} & 0.703 & 0.736 \\
Early vs Int. Fusion (Unshared) & 11.0 & \sig{4.6e-05} & 0.703 & 0.739 \\
Early vs Cross-Attention (Shared) & 11.0 & \sig{4.6e-05} & 0.703 & 0.740 \\
\midrule
Standard vs Curriculum & 132.0 & 0.426 & 0.740 & 0.750 \\
Standard vs SimCLR & 152.0 & 0.791 & 0.740 & 0.742 \\
Standard vs Curriculum + SimCLR & 128.0 & 0.367 & 0.740 & 0.754 \\
\bottomrule
\end{tabular}}
\end{table}


\end{document}
