\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{arydshln}
\usepackage{tikz}
\usetikzlibrary{shapes.geometric, arrows, positioning, fit, calc}
% \usepackage{cleveref}
\jmlrvolume{-- 234}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\newenvironment{DIFnomarkup}{}{}

\title[WSC Analysis: Quantifying Feature Utilization in DL Models]{Weight Space Correlation Analysis: Quantifying Feature Utilization in Deep Learning Models}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Chun Kit Wong\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{ckwo@dtu.dk}\\
\Name{Paraskevas Pegios\midlotherjointauthor\nametag{$^{1,2}$}} \Email{ppar@dtu.dk}\\
\Name{Nina Weng\nametag{$^{1}$}} \Email{ninwe@dtu.dk}\\
\Name{Emilie Pi Fogtmann Sejer\nametag{$^{3,4}$}} \Email{emilie.pi.fogtmann.sejer.01@regionh.dk}\\
\Name{Martin Grønnebæk Tolsgaard\nametag{$^{3,4}$}} \Email{martin.groennebaek.tolsgaard@regionh.dk}\\
\Name{Anders Nymark Christensen\nametag{$^{1}$}} \Email{anym@dtu.dk}\\
\Name{Aasa Feragen\nametag{$^{1,2}$}} \Email{afhar@dtu.dk}\\
\addr $^{1}$ Technical University of Denmark, Kongens Lyngby, Denmark \\
\addr $^{2}$ Pioneer Centre for AI, Copenhagen, Denmark \\
\addr $^{3}$ University of Copenhagen, Copenhagen, Denmark \\
\addr $^{4}$ CAMES Rigshospitalet, Copenhagen, Denmark
}

\begin{document}

\maketitle

\begin{abstract}
Deep learning models in medical imaging are susceptible to shortcut learning, relying on confounding metadata (e.g.\ scanner model) that is often encoded in image embeddings. The crucial question is whether the model actively utilizes this encoded information for its final prediction. We introduce Weight Space Correlation analysis, an interpretable methodology that quantifies feature utilization by measuring the alignment between the classification heads of a primary clinical task and auxiliary metadata tasks. We first validate our method by successfully detecting artificially induced shortcut learning. We then apply it to probe the feature utilization of an SA-SonoNet model trained for Spontaneous Preterm Birth (sPTB) prediction. Our analysis confirmed that while the embeddings contain substantial metadata, the sPTB classifier's weight vectors were highly correlated with clinically relevant factors (e.g.\ cervical length) but decoupled from clinically irrelevant acquisition factors (e.g.\ scanner). Our methodology provides a tool for verifying model trustworthiness, by inspecting whether it utilizes features unrelated to the genuine clinical signal. Code available at \url{https://github.com/wong-ck/wsc-analysis}. % demonstrating that, in the absence of induced bias, the clinical model selectively utilizes features related to the genuine clinical signal.
\end{abstract}

\begin{keywords}
Shortcut learning, feature utilization, obstetric ultrasound.
\end{keywords}

\newcommand{\ckwo}[1]{{\color{red} \bf ckwo: #1}}
\newcommand{\afhar}[1]{{\color{blue} \bf afhar: #1}}
\newcommand{\anym}[1]{{\color{orange} \bf anym: #1}}

\section{Introduction}
Deep learning models have achieved impressive performance across numerous medical imaging tasks, often matching or exceeding human expert capabilities~\citep{de2018clinically,esteva2017dermatologist}. However, their reliance on vast, complex datasets introduces significant concerns regarding model trustworthiness. One of the primary threats to model reliability is shortcut learning, where a model learns a simple, non-causal predictive rule that performs well on training data but fails when deployed in new environments~\citep{geirhos2020shortcut,neuhaus2023spurious,jabbour2020deep}. In medical imaging, this often manifests as a model relying on confounding factors or shortcuts, which are variables correlated with both the image features and the clinical outcome, but without a direct causal link to the underlying anatomy or pathology, such as scanner models or acquisition protocols. % including acquisition protocols, scanner models, institution-specific artifacts, or patient demographics. %When a model relies on such features, it is effectively taking a ``shortcut'' to prediction, bypassing the true clinical signal~\citep{geirhos2020shortcut}.

There is an ever-present possibility that non-disease-related features, such as patient demographics, types of scanners, or pre-processing protocols, are subtly encoded within a medical image's visual features and are accessible to deep learning models, because, unlike human experts, machines are inherently better at capturing textures or minute subtleties. \citet{gichoya2022ai} demonstrated that deep learning classifiers can predict a patient's self-reported race from chest X-ray images with high accuracy. This confirmed that models can extract non-clinical demographic information embedded in images, raising serious questions about fairness and generalization across different populations and settings~\citep{obermeyer2019dissecting}.

Although \citet{gichoya2022ai} demonstrated that confounding information, such as race, is encoded in medical images, this is not equivalent to the model making use of that information during inference. %, and only to which the fairness issue should be concerned. 
We argue that the mere presence of a feature in the embedding space is a prerequisite for shortcut learning, but not a proof of its use. \citet{glocker2023algorithmic} addressed this gap by proposing a method to discover patterns and clustering within embeddings of the training data, and quantifying the separation of embeddings relative to ground-truth metadata labels using the Kolmogorov--Smirnov (K-S) test. \citet{stanley2025and} further showed that such confounders can be encoded across different layers of the model, and yet this does not necessarily imply that the model relies on them for its prediction. While these approaches allow one to examine the distribution of the images in the embedding space, they still do not directly address the fundamental question: \emph{Does the model’s final classification layer actively leverage the embedded features of the confounder?} Another branch of work evaluates shortcut learning by generating counterfactual test sets that manipulate suspected shortcut features \citep{kumar2023debiasing,fathi2024decodex,weng2024fast,bender2025mitigating,bender2026visual}. While these can be valuable for highly localized artifacts such as support medical devices, it might be hard to adapt them to metadata factors such as scanner type or non-localized patient characteristics.
% It remains possible that the decision boundary is orthogonal to the confounder's axis, indicating the information is present in the embedding but functionally ignored by the classifier.

% Our method also distinguishes itself from attribution techniques like **Gradient-weighted Class Activation Mapping (Grad-CAM)** and other saliency map approaches (Selvaraju et al., 2017). These methods typically provide a spatial heatmap, highlighting which pixels or image regions contribute most to a specific prediction. While useful for localizing features, **saliency maps do not quantify the utilization of features based on their semantic meaning** (e.g.\ ``Scanner Model A feature") within the model's abstract, high-dimensional feature space. A high-saliency region might be important for *both* the primary task and the confounder, but Grad-CAM cannot tell us if the neural pathways dedicated to those two semantics are aligned or decoupled.

To definitively address the question of information utilization, we introduce a methodology that focuses on quantitatively comparing the attention of the neurons in the classification head. Our method, which we term Weight Space Correlation (WSC) analysis, distinguishes itself by comparing the weight vectors (i.e.\ the ``attention'') of the primary task against the weight vectors of auxiliary metadata tasks. By quantifying the alignment between these decision boundaries using correlation, we can directly determine whether the features used for a clinical prediction are the same features used for a confounder.

We motivate and evaluate our method in the context of fetal ultrasound, where models can inadvertently rely on shortcuts. Ultrasound images often contain on-screen annotations such as calipers and text \citep{mikolaj2023removing,lin2024shortcut}, which can act as shortcuts in applications including anatomical classification and standard plane quality assessment \citep{baumgartner2017sononet,lin2024learning,wisniewski2025determining,pegios2025diffusion}, out-of-distribution detection \citep{olsen2024unsupervised,wong2025influence}, as well as spontaneous preterm birth (sPTB) prediction \citep{wlodarczyk2020spontaneous,wlodarczyk2019estimation,pegios2023leveraging}. In this work, we focus on patient characteristics and other metadata that can also act as confounders in medical imaging because models may encode and exploit non-causal signals correlated with outcomes \citep{gichoya2022ai}. In fetal ultrasound, related studies \citep{fournel2025cervix,sejer2025combined} have mainly assessed bias through stratified performance analyses across patient metadata groups and imaging-related factors for sPTB deep learning models such as SA-SonoNet \citep{pegios2023leveraging}. In contrast, we go beyond group-wise performance differences and directly \emph{quantify feature utilization} by measuring the alignment between the primary clinical head and auxiliary metadata heads.

%We validate this methodology by successfully detecting artificially introduced shortcut learning. We then apply the validated method to probe the feature utilization of a real-world scenario, the Spontaneous Preterm Birth (sPTB) prediction using SA-SonoNet~\citep{pegios2023leveraging}. Our results show that, in this specific case, while the sPTB model's embeddings contain metadata features, the model only rely on clinically relevant features (e.g.\ birth weight), rather than confounder features (e.g.\ scanner models) during inference.


% \section{Materials and Methods}
% \label{sec:materials_and_method}
% \subsection{Clinical Datasets}
% \label{sec:clinical_dataset}
% In this study, we utilize two distinct, private clinical ultrasound datasets to evaluate the interplay between feature encodability and shortcut learning. Both datasets are accompanied by a rich set of demographic and acquisition metadata, including ultrasound scanner manufacturer, hospital site ID, and maternal ethnicity.

% \begin{itemize}
%     \item \textbf{The Fetal Dataset}: This dataset focuses on anatomical classification. It comprises 2D ultrasound images of the four standard fetal planes: the fetal head, abdomen, femur, and thorax. The primary task is a multi-class classification problem where the model must identify the anatomical plane present in the image.
%     \item \textbf{The Cervix Dataset}: This dataset focuses on a prognostic task related to preterm birth. It consists of transvaginal cervical ultrasound images, balanced equally between two classes: term birth and preterm birth (defined as delivery before 37 weeks of gestation). The primary task is the binary classification of the image into these prognostic outcomes.
% \end{itemize}

% \subsubsection{Preprocessing of Metadata Attributes}
% To unify the prediction tasks under a single supervised learning framework, we reformulated the prediction of continuous metadata variables as a multi-class classification problem. Continuous attributes were discretized via binning, transforming scalar values into distinct categorical labels. Specifically, the range of each continuous variable was partitioned into $k$ intervals. Any value falling within a specific interval was assigned the class label corresponding to that bin. This discretization step mitigates the impact of outliers and allows for the application of classification metrics across all target variables.

% \subsection{Network Architectures and Training Regimes}

% To systematically disentangle the encoding of spurious correlations from their functional utilization, we employ a ResNet50 backbone across two distinct architectural configurations. These variations allow us to contrast a model trained solely for the clinical task against one explicitly forced to encode metadata features.

% \begin{enumerate}
%     \item \textbf{Baseline:} We train a standard ResNet50 where the input is solely the ultrasound image $x_{img}$ and the output is the primary clinical classification $y_{class}$ (Anatomy for the Fetal Dataset; Birth outcome for the Cervix Dataset).
%     % \item \textbf{Multimodal Fusion:} We augment the baseline by introducing metadata attributes as input features. The image embedding is concatenated with the encoded metadata vector $x_{meta}$ before the final classification layer, formulated as $f(x_{img}, x_{meta}) \rightarrow y_{class}$.
%     \item \textbf{Multi-Task Learning:} We utilize the image-only input but extend the architecture with auxiliary output heads. The model is trained to simultaneously predict the primary class $y_{class}$ and the metadata attributes (e.g.\ scanner ID) $y_{meta}$. This encourages the shared backbone to learn features relevant to both the clinical target and the potential confounders.
%     % \item \textbf{Multimodal + Multi-Task (Auto-Encoding):} We combine the inputs and outputs of the previous approaches. The model accepts both image and metadata as input and predicts both the class and the metadata attributes at the output: $f(x_{img}, x_{meta}) \rightarrow {y_{class}, y_{meta}}$. This architecture functions similarly to an auto-encoder, where the ``decoder" is simplified to a single fully connected linear layer, effectively forcing the latent representation to retain information about the input metadata.
% \end{enumerate}

% \subsection{Probing for Encoded Information}
% \label{sec:probe_for_encoded_info}
% To establish a baseline for ``encodability", i.e.\ the degree to which metadata is present in the latent space regardless of its utility, we adopt the linear probing methodology described by \citet{gichoya2022ai,glocker2023algorithmic}.

% For the \textbf{Baseline} models, we first complete the training for the primary clinical task. We then freeze the parameters of the backbone encoder, treating it as a fixed feature extractor. We discard the primary classification head and attach new, randomly initialized fully connected heads corresponding to the metadata attributes. These probing heads are then trained to predict the metadata (e.g.\ Hospital ID) using only the frozen embeddings. High performance on this probing task indicates that the model has encoded information about the metadata, even if it was not explicitly trained to do so.

% Meanwhile, the \textbf{Multi-Task} models do not require fine-tuning, since the multiple distinct heads connected to the shared embedding base were already trained jointly.

% \subsection{Quantifying Shortcut Reliance via Weight Space Correlation}
% \label{sec:weight_space_correlation}
% While the probing method described in \sectionref{sec:probe_for_encoded_info} could be used for confirming the \textit{presence} of confounding information, it does not quantify the \textit{extent} to which the model relies on this information for its primary prediction. To bridge this gap, we introduce a method to quantify reliance by analyzing the correlation of the decision weights in a reduced dimensionality space.

% We conceptualize the weights of the final fully connected (FC) layer not merely as regression coefficients, but as attention vectors acting upon the latent embedding. If the weight vector for the primary task ($W_{task}$) is highly correlated with the weight vector for a metadata attribute ($W_{meta}$), it suggests the model attends to similar features for both predictions, implying a reliance on that specific shortcut.

% To compute this robustly, we address the high dimensionality and potential sparsity of the ResNet50 latent space (2048 dimensions) using the following pipeline:

% \begin{enumerate}
%     \item \textbf{Manifold Estimation:} We compute the embeddings for the entire training set $X_{train}$ using the frozen backbone.
%     \item \textbf{Dimensionality Reduction:} We apply Principal Component Analysis (PCA) to these embeddings to identify the active data manifold. We construct a projection matrix $P$ that retains the top principal components explaining $99\%$ of the variance in the dataset, while enforcing a minimum floor of 50 components to ensure sufficient representational capacity is preserved even in lower-rank scenarios.
%     \item \textbf{Weight Projection:} We project the weights of the prediction heads (both the primary task and the probing heads) into this PCA-reduced space. Let $W_{fc}$ represent the weights of a fully connected head; the transformed weights are calculated as:
%     \begin{equation}
%         W'_{fc} = W_{fc} \cdot P^T
%     \end{equation}
%      This step ensures that the correlation is calculated based on the directions of variance that actually exist in the data, rather than the less informative orthogonal dimensions.
%     \item \textbf{Correlation Analysis:} Finally, we compute the pairwise correlation (cosine similarity) between every pair of projected weight vectors.
% \end{enumerate}
% A high correlation in this projected space serves as a quantitative proxy for shortcut learning: it implies that the decision boundary for the clinical task aligns closely with the decision boundary for the confounder.

\section{Method}
\label{sec:method}

\begin{figure}[!t]
\centering
\includegraphics[width=\linewidth]{figures/wsc_analysis_illustrated.png}
\caption{The WSC analysis workflow. An image is passed through the backbone encoder to produce an embedding, which is used by both the primary task (e.g.~anatomy) and the auxiliary/metadata (e.g.~scanner) classification heads. The weights of these heads are projected into a lower-dimensional space via PCA and their cosine similarity is computed to quantify the correlation, indicating feature utilization.}
\label{fig:wsc_workflow}
\end{figure}

Our goal is to determine whether a clinical prediction task implicitly relies on  metadata-related information learned during training. Our method consists of three steps: (i) representing each task through the linear decision directions of its classification head, (ii) projecting these directions onto the intrinsic data manifold, and (iii) quantifying the reliance between tasks via cosine similarity of their projected weight vectors.

\subsection{Latent Representation and Linear Classification Heads}

We consider deep image classifiers in which the backbone encoder produces a final-layer latent representation that serves as input to a linear classification head. An input image $x$ is mapped to a feature vector $z = f_\theta(x) \in \mathbb{R}^d$, and  a task-specific linear layer maps this representation to class logits $\ell = W z + b$, where $W \in \mathbb{R}^{C \times d}$ is the matrix of classifier weights whose rows $w_i^\top$ correspond to class-specific weight vectors, and  $b \in \mathbb{R}^{C}$ is the vector of class-specific bias terms. The predicted probabilities are given by $\hat{y} = \mathrm{softmax}(\ell)$. Each weight vector  $w_i$ specifies how evidence for class $i$ changes as the embedding moves in latent space. Intuitively, $w_i$ indicates the feature direction that most increases the model’s confidence in class $i$. For any task $t$ with $C_t$ classes,  we denote its classifier parameters by $W_t \in \mathbb{R}^{C_t \times d}$ and  $b_t \in \mathbb{R}^{C_t}$. We conceptualize the classifier weights not merely as regression coefficients, but as attention vectors acting upon the latent embedding. If the weights for the primary task are highly correlated with the weights for metadata attributes, it suggests the model attends to similar features for both predictions, implying a reliance on that specific shortcut.

\subsection{Projection onto the Data Manifold}
\label{sec:projection_onto_the_data_manifold}
To compare feature utilization across tasks, we express classifier weights in a shared low-dimensional coordinate system derived from the data manifold via Principal Component Analysis (PCA). More specifically, let $Z = f_\theta(X) \in \mathbb{R}^{N \times d}$ denote the (zero-mean) latent embeddings of the training set of size $N$. We compute the empirical covariance of $Z$ and extract the top $k$ principal components explaining $99\%$ of the variance in the dataset, while enforcing a minimum floor of 50 components (see \sectionref{sec:empirical_determination_of_the_pca_projection_threshold}). These form the rows of the projection matrix $P \in \mathbb{R}^{k \times d}$. Each classifier head is then projected into this subspace:
\begin{equation}
    W_t' = W_t P^\top, \qquad W_t' \in \mathbb{R}^{C_t \times k}
    \label{eq:project}
\end{equation}

\noindent where the $i$-th row $w'_{t,i}$ denotes the class-specific decision direction after projection. If the latent representation satisfies $z \approx P^\top z'$, then $W_t z \approx W_t P^\top z' = W_t' z'$. This step ensures that the correlation is calculated based on the directions of variance that actually exist in the data, rather than the less informative orthogonal dimensions.

\subsection{Quantifying Shortcut Reliance via Weight Space Correlation}
\label{sec:weight_space_correlation}
Given a primary clinical task $A$ and a metadata-related task $m$, we assess whether the two tasks rely on similar latent directions using the cosine similarity between their projected class-specific weight vectors. Using \equationref{eq:project}, for tasks $A$ and $m$, we have $W_A' = W_A P^\top$ and $W_m' = W_m P^\top$. The cosine similarity between class $i$ of task $A$ and class $j$ of task $m$ is defined as:
\begin{equation}
    \cos_{ij} =
\frac{(w'_{A,i})^\top w'_{m,j}}
     {\|w'_{A,i}\| \, \|w'_{m,j}\|},
\qquad 
i = 1,\dots,C_A,\; j = 1,\dots,C_m.
\end{equation}

\noindent This yields the task-pair correlation matrix $\mathrm{Corr}(A,m) \in \mathbb{R}^{C_A \times C_m}$ with entries $[\mathrm{Corr}(A,m)]_{ij} = \cos_{ij}$, which captures alignment between the decision directions of the two tasks within the intrinsic data manifold. High correlation indicates reliance on similar latent directions and may signal shortcut usage when $m$ corresponds to metadata. In other words, a high correlation in this projected space serves as a quantitative proxy for shortcut learning: it implies that the decision boundary for the clinical task aligns closely with the decision boundary for the shortcut.

\section{Experiments and Results}
\label{sec:experiments_and_results}

\subsection{Clinical Datasets}
\label{sec:clinical_dataset}
We utilize two distinct, private clinical ultrasound datasets to evaluate the interplay between feature encodability and shortcut learning. Both datasets are accompanied by a rich set of demographic and acquisition metadata, including ultrasound scanner manufacturer, hospital site ID, and maternal ethnicity.

\begin{itemize}
    \item \textbf{The Fetal Dataset}: This dataset focuses on anatomical classification. It comprises 2D ultrasound images of four standard fetal planes: the fetal head, abdomen, femur, and thorax. The primary task is a multi-class classification problem where the model must identify the anatomical plane present in the image \citep{sendra2023generalisability}.
    \item \textbf{The Cervix Dataset}: This dataset focuses on a prognostic task related to preterm birth, defined as delivery before 37 weeks of gestation. It consists of transvaginal cervical ultrasound images, balanced equally between two classes: term birth and preterm birth. The primary task is the binary classification of the image into these prognostic outcomes \citep{pegios2023leveraging, sejer2025combined}.
\end{itemize}

\noindent \textbf{Preprocessing of Metadata Attributes:}
To unify the prediction tasks under a single framework, we reformulated the prediction of continuous metadata variables as a multi-class classification problem. Continuous attributes were discretized via binning, transforming scalar values into distinct categorical labels. The range of each continuous variable was partitioned into $k$ intervals. Any value falling within a specific interval was assigned the class label corresponding to that bin. This discretization step mitigates the impact of outliers and allows for the application of classification metrics across all target variables.

\subsection{Establishing the Embedding of Metadata in Images}
\label{sec:establishing_embedding_of_metadata}
The foundational question addressed in our experiments is whether the metadata attributes, both clinical and acquisition-related, are implicitly embedded within the visual features of the medical images themselves. To test this, we trained standard ResNet50 models as our \textbf{baseline} models to predict metadata factors directly from images in both the fetal and cervix datasets (see \sectionref{sec:clinical_dataset}). We trained separate classification models for each metadata factor in each dataset. The performance of these baseline models, which simply predict a single metadata factor from the raw image input, is reported in Appendix~\ref{sec:full_results_establishing_embedding_of_metadata}. % summarized in \tableref{tab:fetal_cervix_dataset_sanity_check_test_metrics}.

The results consistently demonstrate that the visual features extracted by the model contain substantial information regarding the metadata across both domains. In the fetal dataset, models achieved strong predictive accuracy for the primary task, as well as for acquisition-related factors like scanner, pixel spacing, and hospital ID. Similar performance was also observed among models predicting auxiliary factors from the cervix images.

% The results demonstrate that the visual features extracted by the model contain substantial information regarding the metadata. In particular, the models exhibited high performance on the primary classification task (i.e.\ identifying the Fetal Standard Plane), validating the general capability of the network on the image content. Meanwhile, the models also achieved strong predictive accuracy for several auxiliary, acquisition-related factors. This includes factors such as the scanner model, image pixel spacing (binned), and the hospital ID where the image was acquired. These findings confirm that a subset of the metadata factors are implicitly embedded within the image's texture, geometry, and presentation, suggesting that a model trained on these images may inadvertently learn these attributes alongside the primary clinical task.

These findings confirm a consistent observation: for both datasets, some of the metadata factors are implicitly embedded within the image's texture, geometry, and presentation. This suggests that any model trained on these images may inadvertently encode features related to these attributes alongside the primary clinical task, necessitating the subsequent investigation into their utilization (see \sectionref{sec:actual_utilization_of_clinically_irrelevant_factors}).

\noindent \textbf{Characterizing the Null Distribution of WSC Values:}
\label{sec:null_distribution_of_correlation_values}
Before analyzing specific metadata dependencies, we established a reference by determining the distribution of WSC values expected by chance or through architectural constraints. We aggregated all baseline models trained using the fetal dataset and extracted their classification heads. We then computed the pairwise cosine similarity between all combinations of weight vectors, categorizing them into two distinct groups to produce the histograms:

\begin{itemize}
    \item Intra-Task Correlation: Pairs of weight vectors taken from the same classification head. These represent the alignment between different classes within a single task (e.g.\ the weight vector for ``Fetal Head'' vs.\ ``Fetal Abdomen'').
    \item Inter-Task Correlation (Null Distribution): Pairs of weight vectors taken from different classification heads (e.g.\ the weight vector for ``Fetal Head'' vs.\ ``Scanner Model B'').
\end{itemize}

\begin{figure}[!t]
    \centering
    \includegraphics[width=0.85\linewidth]{figures/null_distribution.png}
    \caption{Reference distributions of WSC values. (Left) The intra-task correlation between different classes within the same head. (Right) The null distribution of correlations between weight vectors from unrelated classification tasks.}
    \label{fig:null_distribution}
\end{figure}

As shown in \figureref{fig:null_distribution}, the inter-task correlation values form a narrow, zero-centered distribution. This ``null distribution'' confirms that under typical conditions, the decision boundaries for unrelated tasks are nearly orthogonal in the projected weight space. In contrast, the intra-task correlation exhibits a slight negative bias, reflecting the competition between classes in a softmax-based multiclass objective, where the model must learn to distinguish between mutually exclusive categories. This null distribution enables us to more confidently identify shortcut behavior in downstream experiments: any inter-task WSC value that significantly deviates from this zero-centered baseline provides quantitative evidence of shared feature utilization.

\subsection{Utilization of Clinically Irrelevant Factors in Classification}
\label{sec:actual_utilization_of_clinically_irrelevant_factors}
This section addresses our second core research question: Does the primary classifier (fetal plane identification) actively utilize these clinically irrelevant, but embedded, factors in its decision-making process? To answer this, we first establish a baseline for ``encodability'', i.e.\ the degree to which metadata is present in the latent space regardless of its utility, by adopting the linear probing methodology described by \citet{gichoya2022ai,glocker2023algorithmic}.

We first train baseline models for the primary clinical task of fetal standard plane classification. We then freeze the parameters of the backbone encoder, treating it as a fixed feature extractor. We discard the primary classification head and attach new, randomly initialized fully connected heads corresponding to the metadata attributes. These probing heads are then trained to predict the metadata (e.g.\ scanner, hospital ID) using only the frozen embeddings. High performance on this probing task indicates that the model has encoded information about the metadata, even if it was not explicitly trained to do so.

The performance of the fine-tuned model on the metadata prediction tasks, shown in \tableref{tab:fetal_dataset_finetuned_test_metrics}, confirms the continued presence of this information in the embeddings of the primary classifier. The model achieved a relatively high AUROC when fine-tuned to predict certain metadata variables (e.g.\ scanner). This result reinforces the finding that the image embeddings, generated by a model focused solely on plane classification, still contain sufficient features to distinguish between different acquisition parameters.

% \begin{table}[!ht]
%     \centering
%     \begin{tabular}{llllll}
%     \toprule
%     Target & Accuracy & Precision & Recall & F1 & AUROC \\
%     \midrule
%     Plane & 95.8 ± 0.6 & 95.4 ± 0.6 & 95.4 ± 0.6 & 95.4 ± 0.6 & 99.6 ± 0.1 \\
%     \hdashline
%     Scanner & 76.8 ± 1.4 & 75.8 ± 1.2 & 75.2 ± 1.9 & 75.3 ± 1.7 & 91.4 ± 0.9 \\
%     Pixel spacing & 56.8 ± 1.1 & 53.2 ± 1.3 & 54.1 ± 1.3 & 52.9 ± 1.3 & 88.2 ± 0.3 \\
%     GA & 50.5 ± 1.7 & 45.0 ± 1.4 & 45.7 ± 1.1 & 44.4 ± 1.2 & 80.0 ± 1.0 \\
%     Hospital ID & 49.8 ± 3.4 & 36.1 ± 2.1 & 36.9 ± 2.2 & 35.0 ± 2.2 & 77.0 ± 1.1 \\
%     Year of study & 46.4 ± 3.2 & 40.7 ± 0.8 & 45.3 ± 3.1 & 39.4 ± 0.8 & 75.6 ± 1.9 \\
%     BMI & 36.5 ± 1.6 & 35.8 ± 1.5 & 36.5 ± 1.4 & 35.2 ± 1.5 & 64.1 ± 1.1 \\
%     Ethnicity & 85.9 ± 5.8 & 50.4 ± 2.0 & 52.5 ± 3.8 & 50.0 ± 1.6 & 58.8 ± 5.8 \\
%     % Term Birth & 74.3 ± 2.9 & 53.4 ± 2.6 & 53.6 ± 2.7 & 53.4 ± 2.7 & 57.1 ± 3.6 \\
%     Parity & 49.7 ± 3.2 & 35.8 ± 0.6 & 35.9 ± 0.9 & 33.9 ± 0.3 & 55.0 ± 1.0 \\
%     Smoking status & 77.2 ± 7.0 & 50.4 ± 1.7 & 50.7 ± 2.3 & 49.5 ± 1.9 & 50.8 ± 3.9 \\
%     Maternal age & 25.6 ± 1.8 & 25.1 ± 1.5 & 25.7 ± 2.0 & 24.6 ± 1.4 & 50.8 ± 1.7 \\
%     \bottomrule
%     \end{tabular}
%     \caption{Test performance of ResNet50 classifier model trained for plane classification in \tableref{tab:fetal_dataset_sanity_check_test_metrics} and subsequently fine-tuned to predict the other targets.}
%     \label{tab:fetal_dataset_finetuned_test_metrics}
% \end{table}

% \begin{table}[ht!]
%     \centering
%     \begin{tabular}{lccccc}
%     \toprule
%     \textbf{Target} & \textbf{Accuracy} & \textbf{Precision} & \textbf{Recall} & \textbf{F1} & \textbf{AUROC} \\
%     \midrule
%     \multicolumn{6}{c}{\textbf{Fine-tuned model}} \\
%     \midrule
%     Plane & 95.8 ± 0.6 & 95.4 ± 0.6 & 95.4 ± 0.6 & 95.4 ± 0.6 & 99.6 ± 0.1 \\
%     \hdashline
%     Scanner & 76.8 ± 1.4 & 75.8 ± 1.2 & 75.2 ± 1.9 & 75.3 ± 1.7 & 91.4 ± 0.9 \\
%     Pixel spacing & 56.8 ± 1.1 & 53.2 ± 1.3 & 54.1 ± 1.3 & 52.9 ± 1.3 & 88.2 ± 0.3 \\
%     GA & 50.5 ± 1.7 & 45.0 ± 1.4 & 45.7 ± 1.1 & 44.4 ± 1.2 & 80.0 ± 1.0 \\
%     Hospital ID & 49.8 ± 3.4 & 36.1 ± 2.1 & 36.9 ± 2.2 & 35.0 ± 2.2 & 77.0 ± 1.1 \\
%     Year of study & 46.4 ± 3.2 & 40.7 ± 0.8 & 45.3 ± 3.1 & 39.4 ± 0.8 & 75.6 ± 1.9 \\
%     BMI & 36.5 ± 1.6 & 35.8 ± 1.5 & 36.5 ± 1.4 & 35.2 ± 1.5 & 64.1 ± 1.1 \\
%     Ethnicity & 85.9 ± 5.8 & 50.4 ± 2.0 & 52.5 ± 3.8 & 50.0 ± 1.6 & 58.8 ± 5.8 \\
%     Parity & 49.7 ± 3.2 & 35.8 ± 0.6 & 35.9 ± 0.9 & 33.9 ± 0.3 & 55.0 ± 1.0 \\
%     Smoking status & 77.2 ± 7.0 & 50.4 ± 1.7 & 50.7 ± 2.3 & 49.5 ± 1.9 & 50.8 ± 3.9 \\
%     Maternal age & 25.6 ± 1.8 & 25.1 ± 1.5 & 25.7 ± 2.0 & 24.6 ± 1.4 & 50.8 ± 1.7 \\
%     \midrule
%     \multicolumn{6}{c}{\textbf{Multitask model}} \\
%     \midrule
%     Plane & 95.7 ± 0.1 & 95.4 ± 0.1 & 95.3 ± 0.1 & 95.3 ± 0.1 & 99.6 ± 0.1 \\
%     \hdashline
%     Scanner & 96.9 ± 0.5 & 96.9 ± 0.6 & 96.5 ± 0.6 & 96.7 ± 0.6 & 99.7 ± 0.1 \\
%     Pixel spacing & 69.4 ± 1.3 & 67.9 ± 0.8 & 67.2 ± 1.3 & 67.0 ± 1.3 & 94.0 ± 0.4 \\
%     GA & 59.5 ± 1.4 & 52.8 ± 1.2 & 51.5 ± 1.0 & 51.6 ± 1.1 & 86.8 ± 0.4 \\
%     Hospital ID & 73.7 ± 0.8 & 55.6 ± 1.8 & 54.9 ± 1.8 & 54.5 ± 1.8 & 91.9 ± 0.7 \\
%     Year of study & 68.7 ± 1.2 & 51.4 ± 3.0 & 51.7 ± 0.7 & 49.9 ± 1.0 & 90.5 ± 0.2 \\
%     BMI & 39.8 ± 1.9 & 43.4 ± 0.8 & 39.9 ± 2.0 & 40.1 ± 1.9 & 68.6 ± 0.9 \\
%     Ethnicity & 94.3 ± 0.4 & 47.6 ± 0.0 & 49.5 ± 0.2 & 48.5 ± 0.1 & 42.0 ± 2.4 \\
%     Parity & 70.2 ± 2.3 & 33.7 ± 3.0 & 34.3 ± 0.4 & 32.1 ± 0.7 & 64.0 ± 1.6 \\
%     Smoking status & 84.5 ± 0.5 & 52.1 ± 4.8 & 50.4 ± 0.8 & 48.2 ± 1.4 & 51.2 ± 2.9 \\
%     Maternal age & 28.3 ± 1.4 & 27.5 ± 3.2 & 27.2 ± 2.2 & 25.0 ± 2.9 & 53.3 ± 1.7 \\
%     \bottomrule
%     \end{tabular}
%     \caption{Test performance of ResNet50 classifier model when fine-tuned to predict the other targets (above), or trained to predict various targets in a multitask setting (below).}
%     \label{tab:fetal_dataset_finetuned_test_metrics}
%     \label{tab:fetal_dataset_multitask_test_metrics}
% \end{table}

\begin{table}[!t]
    \centering
    
    \begin{DIFnomarkup}
    \resizebox{\textwidth}{!}{
        \begin{tabular}{@{}l@{}cccccc@{}}
        \toprule
         & \multicolumn{3}{c}{\textbf{Fine-tuned model}}& \multicolumn{3}{c}{\textbf{Multitask model}} \\
        \cmidrule(rl){2-4}\cmidrule(rl){5-7} \textbf{Target} & \textbf{Accuracy} & \textbf{F1} & \textbf{AUROC} & \textbf{Accuracy} & \textbf{F1} & \textbf{AUROC} \\
        \midrule
        Plane & 95.8 ± 0.6 & 95.4 ± 0.6 & 99.6 ± 0.1 & 95.7 ± 0.1 & 95.3 ± 0.1 & 99.6 ± 0.1 \\
        \hdashline
        Scanner & 76.8 ± 1.4 & 75.3 ± 1.7 & 91.4 ± 0.9 & 96.9 ± 0.5 & 96.7 ± 0.6 & 99.7 ± 0.1 \\
        Pixel Spacing & 56.8 ± 1.1 & 52.9 ± 1.3 & 88.2 ± 0.3 & 69.4 ± 1.2 & 67.1 ± 1.2 & 94.0 ± 0.4 \\
        GA & 50.5 ± 1.7 & 44.4 ± 1.2 & 80.0 ± 1.0 & 59.5 ± 1.3 & 51.6 ± 1.0 & 86.8 ± 0.4 \\
        Hospital ID & 49.8 ± 3.4 & 35.0 ± 2.2 & 77.0 ± 1.1 & 73.7 ± 0.7 & 54.5 ± 1.7 & 91.9 ± 0.7 \\
        Year Of Study & 46.4 ± 3.2 & 39.4 ± 0.8 & 75.6 ± 1.9 & 68.7 ± 1.1 & 49.9 ± 1.0 & 90.5 ± 0.2 \\
        BMI & 36.5 ± 1.6 & 35.2 ± 1.5 & 64.1 ± 1.1 & 39.7 ± 1.8 & 40.0 ± 1.8 & 68.6 ± 0.9 \\
        Ethnicity & 85.9 ± 5.8 & 50.0 ± 1.6 & 58.8 ± 5.8 & 94.3 ± 0.3 & 48.5 ± 0.1 & 42.0 ± 2.3 \\
        Term Birth & 74.3 ± 2.9 & 53.4 ± 2.7 & 57.1 ± 3.6 & 81.8 ± 0.3 & 50.3 ± 2.1 & 56.9 ± 2.6 \\
        Parity & 49.7 ± 3.2 & 33.9 ± 0.3 & 55.0 ± 1.0 & 70.2 ± 2.1 & 32.1 ± 0.6 & 64.0 ± 1.5 \\
        Smoking & 77.2 ± 7.0 & 49.5 ± 1.9 & 50.8 ± 3.9 & 84.5 ± 0.4 & 48.2 ± 1.3 & 51.2 ± 2.7 \\
        Maternal Age & 25.6 ± 1.8 & 24.6 ± 1.4 & 50.8 ± 1.7 & 28.3 ± 1.3 & 25.0 ± 2.7 & 53.3 ± 1.6 \\
        \bottomrule
        \end{tabular}
    }
    \end{DIFnomarkup}
    \caption{Test performance of ResNet50 classifier model when fine-tuned to predict the other targets, or trained to predict various targets in a multitask setting.}
    \label{tab:fetal_dataset_finetuned_test_metrics}
    \label{tab:fetal_dataset_multitask_test_metrics}
\end{table}


While the embeddings hold the information, the critical step is determining whether that information is being used. As demonstrated in \figureref{fig:correlation_sonai_plane_classifier}, WSC analysis on the weight vectors of the final classification heads suggests that the correlation between the weight vectors for the fetal plane classes and the weight vectors for the scanner classes was consistently low. This indicates that although the necessary information about the scanner is present in the preceding embedding layer, the model's decision boundary for the primary plane classification task is largely orthogonal to the directionality required to classify the scanner. In other words, the embedded, clinically irrelevant information is not being actively utilized by the classifier for its primary prediction.


\begin{figure}[!t]
    \centering
    % \subfigure[colorbar]{\includegraphics[trim={22cm 0 0 0},clip,angle=-90,width=0.8\textwidth]{figures/seed_42_sonai_plane_resnet50_final_layer_weight_correlation_pca_cropped.png}\label{fig:colorbar}}
    % \begin{minipage}{0.89\textwidth}
        \subfigure[Fine-tuned model; full dataset]{\includegraphics[width=0.90\textwidth]{figures/seed_42_sonai_plane_resnet50_final_layer_weight_correlation_pca_cropped.png}\label{fig:correlation_sonai_plane_classifier}}
        \subfigure[Multi-task model; full dataset]{\includegraphics[width=0.90\textwidth]{figures/seed_42_sonai_multitask_resnet50_final_layer_weight_correlation_pca_cropped.png}\label{fig:correlation_sonai_multitask_classifier}}
        
        \subfigure[Fine-tuned model; balanced dataset]{\includegraphics[width=0.90\textwidth]{figures/seed_42_sonai_balanced_plane_resnet50_final_layer_weight_correlation_pca_cropped.png}\label{fig:correlation_sonai_balanced_plane_classifier}}
        \subfigure[Multi-task model; balanced dataset]{\includegraphics[width=0.90\textwidth]{figures/seed_42_sonai_balanced_multitask_resnet50_final_layer_weight_correlation_pca_cropped.png}\label{fig:correlation_sonai_balanced_multitask_classifier}}
        
        \subfigure[Fine-tuned model; bias-induced dataset]{\includegraphics[width=0.90\textwidth]{figures/seed_42_sonai_biased_plane_resnet50_final_layer_weight_correlation_pca_cropped.png}\label{fig:correlation_sonai_biased_plane_classifier}}
        \subfigure[Multi-task model; bias-induced dataset]{\includegraphics[width=0.90\textwidth]{figures/seed_42_sonai_biased_multitask_resnet50_final_layer_weight_correlation_pca_cropped.png}\label{fig:correlation_sonai_biased_multitask_classifier}}
    % \end{minipage}
    % \begin{minipage}{0.1\textwidth}
    %     \includegraphics[trim={30cm 0 0 0},clip,angle=0,width=0.65\textwidth]{figures/colorbar.png}
    % \end{minipage}
    
    \caption{Correlation matrix between the weight vectors from the classification head of the primary task and those of each metadata attribute, extracted from fine-tuned or multitask classifier models trained using the entire, balanced, or biased fetal dataset. The full matrix is available under Appendix~\ref{sec:full_covariance_matrix_plot}.}
    \label{fig:cropped_correlation_matrix}
\end{figure}

% \begin{figure}[h!]
%     \centering
%     \subfigure[Embedding space]{\includegraphics[width=0.49\textwidth]{figures/seed_42_sonai_plane_resnet50_final_layer_weight_correlation.png}}
%     \subfigure[PCA space]{\includegraphics[width=0.49\textwidth]{figures/seed_42_sonai_plane_resnet50_final_layer_weight_correlation_pca.png}}
%     \caption{Correlation between weight vectors from classification head of each targets.}
%     \label{fig:correlation_sonai_plane_classifier}
% \end{figure}

% \ckwo{from \figureref{fig:correlation_sonai_plane_classifier,fig:correlation_sonai_multitask_classifier,fig:correlation_sonai_biased_classifier,fig:correlation_cervix_finetuned_classifier} it seems that PCA is not necessary. I am not sure whether we should exclude it; our technical novelty is already thin.}

%\subsubsection{Stress Test with Multi-Task Learning}
\label{sec:stress_test_with_multitask_learning}
To stress-test this finding, we trained a \emph{multi-task learning} model designed to encourage the simultaneous encoding of all factors. This model extends ResNet50 with auxiliary output heads and was trained to predict the fetal plane and all metadata factors concurrently, thereby explicitly maximizing the metadata information content within the shared embedding space, as shown in \tableref{tab:fetal_dataset_multitask_test_metrics}. When we performed the same analysis on this model, the correlation between the plane and scanner weight vectors remained low. This further reinforces the initial conclusion: even when the model is explicitly forced to encode metadata information into the embeddings, the weight vectors for the primary plane classification task remain largely decoupled from the weight vectors of the irrelevant scanner factors.

% \begin{table}[!ht]
%     \centering
%     \begin{tabular}{llllll}
%     \toprule
%     Target & Accuracy & Precision & Recall & F1 & AUROC \\
%      % &  &  &  &  &  \\
%     \midrule
%     Plane & 95.7 ± 0.1 & 95.4 ± 0.1 & 95.3 ± 0.1 & 95.3 ± 0.1 & 99.6 ± 0.1 \\
%     \hdashline
%     Scanner & 96.9 ± 0.5 & 96.9 ± 0.6 & 96.5 ± 0.6 & 96.7 ± 0.6 & 99.7 ± 0.1 \\
%     Pixel Spacing & 69.4 ± 1.3 & 67.9 ± 0.8 & 67.2 ± 1.3 & 67.0 ± 1.3 & 94.0 ± 0.4 \\
%     Hospital Id & 73.7 ± 0.8 & 55.6 ± 1.8 & 54.9 ± 1.8 & 54.5 ± 1.8 & 91.9 ± 0.7 \\
%     Year Of study & 68.7 ± 1.2 & 51.4 ± 3.0 & 51.7 ± 0.7 & 49.9 ± 1.0 & 90.5 ± 0.2 \\
%     GA & 59.5 ± 1.4 & 52.8 ± 1.2 & 51.5 ± 1.0 & 51.6 ± 1.1 & 86.8 ± 0.4 \\
%     BMI & 39.8 ± 1.9 & 43.4 ± 0.8 & 39.9 ± 2.0 & 40.1 ± 1.9 & 68.6 ± 0.9 \\
%     Parity & 70.2 ± 2.3 & 33.7 ± 3.0 & 34.3 ± 0.4 & 32.1 ± 0.7 & 64.0 ± 1.6 \\
%     % Term Birth & 81.8 ± 0.3 & 52.9 ± 2.9 & 51.3 ± 1.4 & 50.3 ± 2.2 & 56.9 ± 2.7 \\
%     Maternal Age & 28.3 ± 1.4 & 27.5 ± 3.2 & 27.2 ± 2.2 & 25.0 ± 2.9 & 53.3 ± 1.7 \\
%     Smoking Status & 84.5 ± 0.5 & 52.1 ± 4.8 & 50.4 ± 0.8 & 48.2 ± 1.4 & 51.2 ± 2.9 \\
%     Ethnicity & 94.3 ± 0.4 & 47.6 ± 0.0 & 49.5 ± 0.2 & 48.5 ± 0.1 & 42.0 ± 2.4 \\
%     \bottomrule
%     \end{tabular}
%     \caption{Test performance of ResNet50 classifier model trained to predict various target values in a multitask setting.}
%     \label{tab:fetal_dataset_multitask_test_metrics}
% \end{table}

% \begin{figure}[h!]
%     \centering
%     \subfigure[Embedding space]{\includegraphics[width=0.49\textwidth]{figures/seed_42_sonai_multitask_resnet50_final_layer_weight_correlation.png}}
%     \subfigure[PCA space]{\includegraphics[width=0.49\textwidth]{figures/seed_42_sonai_multitask_resnet50_final_layer_weight_correlation_pca.png}}
%     \caption{Correlation between weight vectors from classification head of each targets of the multitask-trained model.}
%     \label{fig:correlation_sonai_multitask_classifier}
% \end{figure}

\subsection{Validation of Shortcut Learning Detection via Induced Bias}
\label{sec:validation_of_shortcut_learning_detection}
% \sectionref{sec:actual_utilization_of_clinically_irrelevant_factors} suggested that the model did not utilize available scanner information for plane classification. This section aims to validate our hypothesis: If a model does adopt a shortcut learning strategy, this behavior will be detectable via the weight space correlation analysis. To induce shortcut learning, we intentionally biased the training set by introducing a strong correlation between the primary classification target (fetal plane) and a clinically irrelevant factor (scanner). We performed data culling on the fetal dataset, discarding images such that the majority of images for each standard plane were acquired by a distinct, single scanner model. The final biased dataset composition is documented in \tableref{tab:num_train_sample_fetal_full_vs_balanced_biased}, illustrating the high degree of induced correlation between the two factors. This manipulation forces the classifier to potentially adopt a shortcut learning approach, where classifying the scanner becomes an efficient proxy for classifying the fetal plane.

\sectionref{sec:actual_utilization_of_clinically_irrelevant_factors} suggested that the baseline models did not utilize available scanner information for plane classification. This section aims to validate our hypothesis: if a model does adopt a shortcut learning strategy, this behavior will be detectable via WSC analysis. Inspired by \citet{weng2024fast}, we derived two specific sub-datasets from the full fetal dataset:
\begin{itemize}
    \item Balanced Dataset: We performed data culling to ensure that the number of images in each plane-scanner class pair was balanced. This dataset serves as a rigorous control, ensuring no correlation exists between the clinical target and the acquisition metadata.
    \item Induced Bias Dataset: We intentionally introduced a strong correlation between the primary classification target (fetal plane) and a clinically irrelevant factor (scanner). We discarded images such that the majority of images for each standard plane were acquired by a distinct, single scanner, forcing the model to potentially adopt a shortcut where recognizing the scanner acts as an efficient proxy for anatomical classification.
\end{itemize}

The composition of these datasets is documented in \tableref{tab:num_train_sample_fetal_full_vs_balanced_biased}, illustrating the contrast between the balanced control and the high degree of induced correlation in the biased set.

\begin{table}[!t]
    \centering
    \begin{DIFnomarkup}
    \resizebox{\textwidth}{!}{
        \begin{tabular}{l@{}ccccccccc}
            \toprule
             & \multicolumn{3}{c}{Full}& \multicolumn{3}{c}{Balanced}& \multicolumn{3}{c}{Biased} \\
             \cmidrule(rl){2-4}\cmidrule(rl){5-7}\cmidrule(rl){8-10}
            Plane &  Voluson S& V830&  E10&   Voluson S& V830&  E10&   Voluson S& V830&  E10 \\
            \midrule
            % \hdashline
            Abdomen& 717& 333&  658&   300& 300& 300&   150& 150& 658 \\
            Head&   1018& 805&  888&   300& 300& 300&   150& 700& 150 \\
            Femur&   760& 315&  533&   300& 300& 300&   700& 150& 150 \\
            Thorax&  855& 602&  523&   300& 300& 300&   700& 150& 150 \\
            \bottomrule
        \end{tabular}
    }
    \end{DIFnomarkup}
    \caption{Composition of the full, balanced, and biased fetal plane dataset.}
    \label{tab:num_train_sample_fetal_full_vs_balanced_biased}
\end{table}

% We repeated the analysis described in \sectionref{sec:actual_utilization_of_clinically_irrelevant_factors} using the newly biased dataset. Specifically, we trained both the single-task plane classifier and the multi-task classifier on this biased data. The resulting weight space correlations between the plane classes and the scanner classes are shown in \figureref{fig:correlation_sonai_biased_plane_classifier,fig:correlation_sonai_biased_multitask_classifier}. For the single-task classifiers, the correlation matrix showed a noticeable increase in value compared to the original, unbiased experiment. More crucially, the multi-task model, which is explicitly encouraged to encode all features, exhibited a much stronger correlation between the weight vectors for the fetal plane classes and the scanner classes. This significant increase in correlation confirms that the model did adopt the shortcut when the bias was present, demonstrating that the decision boundary for plane classification now aligns with the directionality required for scanner classification.

We repeated the analysis described in \sectionref{sec:actual_utilization_of_clinically_irrelevant_factors} using these sub-datasets, training both single-task and multi-task classifiers on each. As detailed in Appendix~\ref{sec:full_results_validation_via_induced_bias}, the predictive performance for both plane and scanner classification remained consistent across the full, balanced, and biased datasets. However, this stability does not extend to the internal weight alignments, as shown in \figureref{fig:correlation_sonai_biased_plane_classifier,fig:correlation_sonai_biased_multitask_classifier}. In the balanced scenario (\figureref{fig:correlation_sonai_balanced_plane_classifier,fig:correlation_sonai_balanced_multitask_classifier}), the WSC values between plane and scanner weight vectors remained within the null distribution, confirming that the model did not associate these tasks when the data was uncorrelated. Meanwhile, in the induced bias scenario, the single-task classifiers showed an increase in WSC values. More crucially, the multi-task model exhibited a significantly stronger alignment between the weight vectors for fetal plane classes and scanner classes. This increase confirms that the model adopted the shortcut when bias was present, with the decision boundary for plane classification aligning with the directionality required for scanner classification.

While the strong WSC values observed between the plane and scanner weight vectors in the biased scenario are indicative of a dependency between the two prediction tasks, it is important to note the nature of this association. A high correlation coefficient signifies that the two tasks assign similar attention vectors to the shared model embeddings; they are looking at similar features in the embedding space to make their respective decisions. It does not explicitly define the direction of the shortcut. That is, the result does not prove whether the model is using scanner information to predict the plane, or if the plane information is strongly predictive of the scanner. It simply confirms that, under conditions of high dataset bias, the feature utilization for the two tasks becomes strongly coupled.

% While the strong correlation observed in the biased scenario indicates a dependency, it is important to note the nature of this association. A high correlation coefficient signifies that the two tasks assign highly similar attention vectors to the shared embeddings; they are looking at similar features in the embedding space to make their respective decisions. While this doesn't definitively prove the direction of the shortcut (i.e., whether scanner predicts plane or vice versa), it confirms that under high bias, feature utilization for the two tasks becomes strongly coupled.

% \begin{figure}[h!]
%     \centering
%     \subfigure[Embedding space; fine-tuned classifier]{\includegraphics[width=0.49\textwidth]{figures/seed_43_sonai_biased_plane_resnet50_final_layer_weight_correlation.png}}
%     \subfigure[PCA spacel fine-tuned classifier]{\includegraphics[width=0.49\textwidth]{figures/seed_43_sonai_biased_plane_resnet50_final_layer_weight_correlation_pca.png}}
%     \subfigure[Embedding space; multitask classifier]{\includegraphics[width=0.49\textwidth]{figures/seed_43_sonai_biased_multitask_resnet50_final_layer_weight_correlation.png}}
%     \subfigure[PCA space; multitask classifier]{\includegraphics[width=0.49\textwidth]{figures/seed_43_sonai_biased_multitask_resnet50_final_layer_weight_correlation_pca.png}}
%     \caption{Correlation between weight vectors from classification head of each targets of the biased base and multitask-trained model.}
%     \label{fig:correlation_sonai_biased_classifier}
% \end{figure}

The findings from this experiment provide crucial validation. High WSC is a reliable indicator of shortcut learning, where the model utilizes a non-causal, highly correlated factor for its prediction. The successful detection of this induced shortcut proves that the WSC analysis is effective in determining the active utilization of embedded, irrelevant information.

\subsection{Empirical Determination of the PCA Projection Threshold}
\label{sec:empirical_determination_of_the_pca_projection_threshold}
After introducing the experiments in \sectionref{sec:actual_utilization_of_clinically_irrelevant_factors,sec:validation_of_shortcut_learning_detection}, this section takes a detour to explain the empirical rationale behind the parameter choices made in our dimensionality reduction strategy (see \sectionref{sec:projection_onto_the_data_manifold}). Specifically, we investigate the choice of $k$, the number of principal components used to define the data manifold projection.

To determine the optimal $k$, we conducted a sensitivity analysis across four random seeds. This was performed for both the fine-tuned baseline models and the multi-task models, using all versions of the fetal dataset (i.e.\ full, balanced, and biased). We performed WSC analysis between the fetal plane classification head and the scanner classification head, varying $k$ from 10 up to the full dimensionality of the ResNet50 embedding space ($d = 2048$).

\begin{figure}[!t]
    \centering
    \includegraphics[width=1\linewidth]{figures/effect_of_pca_threshold_k_on_correlation_value_plane0_vs_scanner2.png}
    \caption{Sensitivity of WSC values to the number of principal components $k$. The WSC values are calculated using weight vectors for plane\_0 and scanner\_2, a pair representative of the general trends observed across all plane-scanner combinations (detailed in Appendix \ref{sec:full_plot_empirical_determination_of_pca_projection_threshold}).}
    \label{fig:effect_of_pca_threshold_k_on_correlation_value_plane0_vs_scanner2}
\end{figure}

The results of this analysis are shown in \figureref{fig:effect_of_pca_threshold_k_on_correlation_value_plane0_vs_scanner2}, including a reference line for correlation calculated without projection (i.e.\ in the raw embedding space). Two trends are observed:
\begin{itemize}
    \item Unstable Mean and High Variance at Low $k$: For small values of $k$, we observed high variance in correlation values across random seeds. Furthermore, at these low values, the mean correlation had not yet settled into the plateau it eventually reaches as $k$ increases. This indicates that a very low-dimensional projection is insufficient to capture the manifold and is sensitive to the stochasticity of individual training runs.
    \item Underestimation at High $k$: As $k$ approached the full dimension, the correlation values decreased slightly. This suggests that including all dimensions may dilute the meaningful alignment between tasks, underestimating the shortcut learning effect.
\end{itemize}
Based on these empirical observations, we established a strategy to balance stability and sensitivity. We set a floor value of $k = 50$ to ensure sufficient representational capacity and eliminate seed-based variance, while simultaneously requiring that the chosen $k$ must capture at least $99\%$ of the variance in the latent embeddings.

\subsection{Probing a Trained Model: Analysis of SA-SonoNet Embeddings}
\label{sec:probing_sa_sononet}
Having validated our WSC analysis methodology in \sectionref{sec:validation_of_shortcut_learning_detection}, we now apply our technique to probe a model trained on a real-world, relevant clinical task. For this analysis, we utilize the SA-SonoNet model~\citep{pegios2023leveraging}, which achieved state-of-the-art performance on the challenging task of Spontaneous Preterm Birth (sPTB) prediction.

% We started by performing a sanity check to confirm that the cervix dataset also contains sufficient metadata information implicitly encoded within the images, similar to our earlier findings with the fetal dataset (see \sectionref{sec:establishing_embedding_of_metadata}). We trained a series of simple baseline classification models to predict each metadata factor directly from the cervix ultrasound images. The performance of these models, summarized in \tableref{tab:fetal_cervix_dataset_sanity_check_test_metrics}, confirmed that the images in this dataset also carry significant information regarding a subset of the metadata.

%\subsubsection{Architecture and Methodology Adaptation}
SA-SonoNet is a shape- and spatially-aware network based on the SonoNet~\citep{baumgartner2017sononet} architecture, modified to predict term or preterm birth from transvaginal cervix ultrasound images. The key innovation is its multimodal input: given a cervix image, the model first leverages a segmentation network \citep{lin2023dtu} to compute a segmentation map of important anatomical structures (e.g.\ cervical canal and boundaries). The final input to the SA-SonoNet classifier is the concatenation of the original image, the segmentation map, and the pixel spacing values, which are repeated and reshaped to image dimensions to inject spatial information.  The original SA-SonoNet model uses an average pooling layer on a $14 \times 18$ 2D embedding feature map for its final prediction. We first flattened this $14 \times 18$ feature map into a 252-long 1D embedding vector. This vector was then connected to a newly initialized fully connected classification head for predicting the metadata variables.  We modified its final classification layer for fine-tuning and analysis. For the WSC analysis, we represented the original average pooling operation, which maps the 252-long embedding to the final sPTB prediction, by including a 252-long vector of all ones in the set of weight vectors. This step allows us to compare the feature utilization direction of the original sPTB task against the fine-tuned metadata tasks.

%\subsubsection{Analysis of Feature Utilization in SA-SonoNet}

\begin{figure}[!t]
    \centering
    \includegraphics[width=0.90\textwidth]{figures/sf4_sasononet_final_layer_weight_correlation_pca_cropped.png}
    \caption{WSC values between weight vectors from classification head of the targets of the fine-tuned SA-SonoNet model. \texttt{ref\_avgpool} represents a flattened average pooling layer, which is the original model's classification head. Full covariance matrix is available under Appendix \ref{sec:full_covariance_matrix_plot}.}
    \label{fig:correlation_cervix_finetuned_classifier}
\end{figure}

% \begin{table}[!ht]
%     \centering
%     \begin{tabular}{lrlr}
%     \toprule
%      metadata attributes& correlation & metadata attributes&correlation \\
%     \midrule
%     birth weight group 0 & 0.165 ± 0.027  & scanner 0 &0.007 ± 0.033  
% \\
%     birth weight group 1 & 0.130 ± 0.041  & scanner 1 &-0.016 ± 0.044  
% \\
%     birth weight group 2 & 0.037 ± 0.048  & scanner 2 &0.070 ± 0.038  
% \\
%     birth weight group 3 & -0.075 ± 0.049  & scanner 3 &0.031 ± 0.027  
% \\
%     birth weight group 4 & -0.137 ± 0.052  & term birth 0 &0.138 ± 0.063  
% \\
%     birth weight group 5 & -0.185 ± 0.038  & term birth 1 &-0.151 ± 0.060  \\
%     bmi group 0 & 0.036 ± 0.052  & px spacing group 0 &0.044 ± 0.039  
% \\
%     bmi group 1 & 0.004 ± 0.073  & px spacing group 1 &0.017 ± 0.050  
% \\
%     bmi group 2 & -0.014 ± 0.060  & px spacing group 2 &0.010 ± 0.054  
% \\
%     bmi group 3 & 0.034 ± 0.061  & px spacing group 3 &0.018 ± 0.019  
% \\
%     cervical length group 0 & 0.208 ± 0.046  & px spacing group 4 &0.000 ± 0.021  
% \\
%     cervical length group 1 & 0.094 ± 0.033  & px spacing group 5 &-0.057 ± 0.046  
% \\
%     cervical length group 2 & -0.121 ± 0.046  & px spacing group 6 &-0.052 ± 0.047  
% \\
%     cervical length group 3 & -0.207 ± 0.041  & px spacing group 7 &-0.062 ± 0.062  
% \\
%     ga group 0 & -0.049 ± 0.055  & py spacing group 0 &0.050 ± 0.042  
% \\
%     ga group 1 & -0.027 ± 0.059  & py spacing group 1 &0.036 ± 0.052  
% \\
%     ga group 2 & 0.007 ± 0.059  & py spacing group 2 &0.019 ± 0.053  
% \\
%     ga group 3 & 0.010 ± 0.057  & py spacing group 3 &0.030 ± 0.034  
% \\
%     ga group 4 & 0.039 ± 0.069  & py spacing group 4 &-0.006 ± 0.029  
% \\
%     placenta weight group 0 & 0.060 ± 0.056  & py spacing group 5 &-0.037 ± 0.051  
% \\
%     placenta weight group 1 & 0.014 ± 0.055  & py spacing group 6 &-0.061 ± 0.067  
% \\
%     placenta weight group 2 & -0.059 ± 0.065  & py spacing group 7 &-0.045 ± 0.063  \\
%     placenta weight group 3 & -0.110 ± 0.050  & ref avgpool &1.000 ± 0.000  \\
%     \end{tabular}
%     \caption{Caption}
%     \label{tab:placeholder}
% \end{table}

The fine-tuned model's performance on various auxiliary metadata tasks and the resulting WSC values are presented in \tableref{tab:cervix_dataset_finetuned_test_metrics} under Appendix~\ref{sec:full_results_establishing_embedding_of_metadata}, and the associated correlation matrix is shown in \figureref{fig:correlation_cervix_finetuned_classifier}. The analysis reveals several expected correlations, confirming that the model utilizes clinically significant factors, as well as some desirable decoupling from irrelevant factors. Cervical length showed a moderate correlation, aligning with its role as the clinical gold standard for sPTB risk. Pixel spacing also demonstrated a moderate correlation, reflecting its known influence on model performance and confirming that this imaging parameter contributes to the prediction. In contrast, scanner type showed only a weak correlation, suggesting the model does not rely on acquisition hardware-related shortcuts.

Meanwhile, it is worth noting that although birth weight exhibited a strong WSC value with sPTB prediction, which is superficially consistent with the expected clinical link between prematurity and low birth weight, this finding must be interpreted with caution. As shown in \tableref{tab:cervix_dataset_finetuned_test_metrics}, the predictive performance for the birth weight probing head was low, indicating that this metadata is not effectively encoded in the model's embeddings. Consequently, the high WSC value in this specific instance is functionally meaningless.

% This is summarized and discussed under \tableref{tab:sa_sononet_correlation_matrix_rationale}.
% \begin{table}[h!]
%     \centering
%     \begin{tabular}{| p{1.8cm} | l | p{9.95cm} |}
%     \hline
%     \textbf{Factor} & \textbf{Correlation} & \textbf{Rationale / Implication} \\
%     \hline
    
%     \hline
%     \textbf{Birth Weight} & \textbf{High} & This correlation is expected, as infants born pre-term generally have low birth weights. This suggests the model is utilizing features strongly associated with clinical outcome. \\
%     % \hline
%     % \textbf{Placental Weight} & \textbf{High} & Similar to birth weight, placental weight is strongly related to gestational age and fetal health, making this correlation expected and clinically relevant. \\
%     \hline
%     \textbf{Cervical Length} & \textbf{Moderate} & Cervical length is the clinical gold standard for sPTB. The moderate correlation confirms the model is aligning its attention vector with features related to this key indicator. \\
%     \hline
%     \textbf{Pixel Spacing} & \textbf{Moderate} & SA-SonoNet explicitly uses pixel spacing as an input, and Pegios et al. demonstrated its importance to the model's performance. The moderate correlation indicates this factor is actively utilized in the sPTB prediction process. \\
%     \hline
%     \textbf{Scanner} & \textbf{Low} & The weight vectors showed a low correlation. This is a positive finding, suggesting the model has not adopted a shortcut based on the acquisition hardware. \\
%     % \hline
%     % \textbf{BMI} & \textbf{Low} & The low correlation with maternal BMI suggests this non-visual, patient-specific factor is not being exploited by the classifier's feature utilization pathway. \\
%     \hline
%     \end{tabular}
%     \caption{Findings and discussions from weight space correlation analysis on SA-SonoNet.}
%     \label{tab:sa_sononet_correlation_matrix_rationale}
% \end{table}

\section{Limitations and Discussion}
%\subsection{Assumption of Linear Decision Boundaries}
Our method assumes that the relationship between the embedding space and the class predictions is linear. While this might seem restrictive, it aligns with standard deep learning architectural conventions where the backbone acts as a non-linear feature extractor, and the classification head functions as a linear probe on the resulting manifold. Furthermore, research suggests that the goal of supervised training is to linearize the data manifold, making classes linearly separable in the embedding space \citep{bengio2013representation}.

%\subsection{Interpretation of Failure Modes}
It is also vital to distinguish between ``shortcut learning'' and other forms of model degradation. A model that exhibits low WSC values for a metadata attribute is not necessarily immune to failure when that attribute shifts. If a model performs poorly on a new scanner despite low weight alignment, our method suggests that the failure is not due to the scanner type being used as a shortcut. Instead, it may stem from other factors:
\begin{itemize}
    \item \textbf{Data Quality:} Images from the new scanner may lack the diagnostic signal-to-noise ratio present in the training data.
    \item \textbf{Feature Shift:} The anatomical features learned by the model may be rendered differently by the new hardware, causing the embeddings to fall outside the distribution of the original decision boundary.
\end{itemize}
In such cases, our method rules out shortcut learning as the cause of a performance drop.%, narrowing the focus to other potential issues.

%\subsection{Complexity of Multi-Attribute Shortcuts}
Finally, while we demonstrate successful detection of shortcuts based on individual metadata attributes, real-world shortcuts may be multifaceted, involving non-linear combinations of several confounding factors. Future work will investigate the extension of WSC analysis to higher-order interactions between multiple metadata dimensions.

\section{Conclusion}
\label{sec:conclusion}   
In this work, we introduced Weight Space Correlation analysis, a simple and interpretable methodology designed to move beyond simply identifying the presence of confounding information to definitively quantifying its utilization by a deep learning classifier. We validated our method by successfully detecting artificially induced shortcut learning in a controlled environment. Applying this method to the SA-SonoNet model, we confirmed that while clinically irrelevant factors are indeed encoded in the image embeddings, the model's decision boundary for Spontaneous Preterm Birth prediction is selectively aligned with clinically meaningful metadata and, crucially, decoupled from confounding acquisition factors like scanner model. These findings provide a necessary level of trustworthiness in complex medical imaging models by confirming that the classifier is learning features relevant to the clinical task, rather than relying on spurious correlations.






% \section{Proposed Method}

% Our goal is to determine whether a clinical prediction task implicitly relies on  metadata-related information learned during training. Our method consists of three steps: (i) representing each task through the linear decision directions of its classification head, (ii) projecting these directions onto the intrinsic data manifold, and (iii) quantifying the reliance between tasks via cosine similarity of their projected weight vectors.

% \subsection{Latent Representation and Linear Classification Heads}

% We consider deep image classifiers in which the backbone encoder produces a final-layer latent 
% representation that serves as input to a linear classification head. An input 
% image $x$ is mapped to a feature vector $z = f_\theta(x) \in \mathbb{R}^d$, and  a task-specific linear layer maps this representation to class logits 
% $\ell = W z + b$, where $W \in \mathbb{R}^{C \times d}$ is the matrix of classifier  weights whose rows $w_i^\top$ correspond to class-specific weight vectors, and  $b \in \mathbb{R}^{C}$ is the vector of class-specific bias terms. The predicted  probabilities are given by $\hat{y} = \mathrm{softmax}(\ell)$. Each weight vector  $w_i$ specifies how evidence for class $i$ changes as the embedding moves in 
% latent space. Intuitively, $w_i$ indicates the feature direction that most 
% increases the model’s confidence in class $i$. For any task $t$ with $C_t$ classes,  we denote its classifier parameters by $W_t \in \mathbb{R}^{C_t \times d}$ and  $b_t \in \mathbb{R}^{C_t}$. We conceptualize the classier weights not merely as regression coefficients, but as attention vectors acting upon the latent embedding. If the weights for the primary task are highly correlated with the weights for
% metadata attributes, it suggests the model attends to similar features for both predictions, implying a reliance on that specific shortcut.

% \subsection{Projection onto the Data Manifold}

% To compare feature utilization across tasks, we express classifier weights in a shared low-dimensional coordinate system derived from the data manifold. More specifically, let $Z = f_\theta(X) \in \mathbb{R}^{N \times d}$ denote the (zero-mean) latent embeddings of the training set of size $N$. We compute the 
% empirical covariance of $Z$ and extract the top $k$ principal components 
% (explaining 99\% of the variance). These form the projection matrix $P \in \mathbb{R}^{k \times d}.$ Then, each classifier head is projected into this subspace:
% \begin{equation}
%     W_t' = W_t P^\top, \qquad W_t' \in \mathbb{R}^{C_t \times k}
%     \label{eq:project}
% \end{equation}

% \noindent where the $i$-th row $w'_{t,i}$ denotes the class-specific decision direction after projection. If the latent representation satisfies $z \approx P^\top z'$, then $W_t z = W_t P^\top z' = W_t' z'.$ 

% \subsection{Quantifying Shortcut Reliance via Weight Space Correlation}
% Given a primary clinical task $A$ and task $m$ related to metadata information 
% we assess whether two tasks rely on similar latent directions, using the cosine  similarity between their projected class-specific weight vectors. Using \equationref{eq:project}, for tasks $A$ and $m$, we have $W_A' = W_A P^\top, \qquad W_m' = W_m P^\top$ and compute cosine similarity between class $i$ of task $A$ and class $j$ of task $m$ is defined as

% \begin{equation}
%     \cos(\theta_{ij}) =
% \frac{(w'_{A,i})^\top w'_{m,j}}
%      {\|w'_{A,i}\| \, \|w'_{m,j}\|},
% \qquad 
% i = 1,\dots,C_A,\; j = 1,\dots,C_m.
% \end{equation}

% \noindent This yields the task-pair correlation matrix $\mathrm{Corr}(A,m) \in \mathbb{R}^{C_A \times C_m}$ which captures alignment between the decision directions of the two tasks within the intrinsic data manifold. High correlation indicates reliance on similar latent  directions and may signal shortcut usage when $m$ corresponds to metadata. A high correlation in this projected space serves as a quantitative proxy for shortcut learning: it implies that the decision boundary for the clinical task aligns closely with the decision
% boundary for the shortcut.















% \section{Next research questions - currently being re-written into coherent text}
% \subsection{Are the images embedded with maternal/machine/physical factors irrelevant to the clinical task?}

% \textit{writing note:} Before we begin, we first ask the question whether or not such factors are even embedded within the images. To do this, we train ResNet50 based classifier models to predict each maternal/machine/physical factors given the ultrasound image. Results are shown in \tableref{tab:fetal_cervix_dataset_sanity_check_test_metrics}.

% Here we see that in addition to good performance on the primary plane classification task, the model is also able to predict model of the ultrasound scanner used in acquiring the image. This brings us to the next question:

% \subsection{Are the classifiers making use of the embedded clinically irrelevant factors in the prediction process?}

% \textit{writing note: }To answer this question, we make use of the methodology described in \sectionref{sec:probe_for_encoded_info}. With a classifier model trained for plane classification, we fine-tuned it to classify the maternal/machine/physical factors. Results are shown in \tableref{tab:fetal_dataset_finetuned_test_metrics}.

% Here we see that we can still see presence of the scanner-related information in the model embeddings, with pretty high auroc. However, when we look at the weight space correlation (see \sectionref{sec:weight_space_correlation}), the correlation between the weight vectors for plane classes and scanner classes are low. This indicates that although information about scanners is present in the model embedding, they are not being utilized in the plane classification process.

% To stress test this further, we also trained a multi-task model to predict all factors in addition to the plane, which encourages the encoding of information about the metadata in the embeddings. We then perform the weight space correlation analysis on this multitask model, and finds that the correlation between the plane and scanner weightages are low again.

% This is a strong indication that the presence of irrelevant information in an image does not immediately means such information is being used, whether as a shortcut or other manner, to influence the primary task.

% \subsection{If we artificially introduce bias to the dataset, can we determine if the model has taken a shortcut learning approach in its learning?}

% Having said that, how do we know for sure that when a potential shortcut learning has taken space, we will be able to see that via weight space correlation analysis? To answer that question, we repeat the experiment but this time with artificially introduced bias. We discarded some images such that majority of each plane comes from a distinct scanner. However, since we have 4 plane classes but 3 scanner classes, the majority scanner for both femur and thorax were chosen to be E10, for abdomen it's v830, and for head it's voluson s. 

% From \figureref{fig:correlation_sonai_biased_classifier}, we see much stronger correlation between the plane classes and the scanner classes, especially in the multitask-trained classifier. This indicate that weight space correlation analysis does work.

% \subsection{After validating the methodology in the previous subsection, if we apply the method to an existing trained model, what are the observations?}

% We now apply the same analysis to a model trained for predicting pre-term birth using cervical ultrasound images. Before we begin, we again check for the possibility that maternal/machine/physical informations are embedded in the images. Results are in \tableref{tab:fetal_cervix_dataset_sanity_check_test_metrics}

% [insert some background and importance about the problem and approach, summarized from ppar paper]. Since the trained SA-SonoNet is a fully convolutional network, we flatten the (14 x 18) 2D embeddings into 252-long 1D dimensional embedding layer, allowing us to then replace the final average pooling layer with a fully-connected layer as the classification layer. We freeze the remaining of the model and fine-tune the fully-connected layers to classify each information separately. We then perform the same weight space correlation analysis on this model, with the addition of a 252-long ones vector to represent the original avg-pool operation.

% Here we see that the models have decent performance in predicting birth weight group, and weightage for those classes are correlated to weightage used in predicting pre-term birth. This is expected, since birth weight of pre-term born infants are generally low. Additionally, we also see such correlation with the placental weight, which again is expected for the same reason. Cervical length has been the typical indicator in standard practice in the clinic, which is also confirmed here with a moderate correlation. We also see moderate correlation with pixel spacing, which is shown by pegios et al that it is an important factor that one should feed to the model. We do not see correlation with scanner and bmi group, which is a good thing.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work is funded by the Danish Pioneer Centre for AI (DNRF
grant number P1), SONAI---a Danish Regions’ AI Signature Project---and the Novo Nordisk Foundation through the Center for Basic Machine Learning Research in Life Science (NNF20OC0062606).}

\bibliography{midl26_234}

\clearpage

\appendix

\section{Full Results: Establishing the Embedding of Metadata in Images}
\label{sec:full_results_establishing_embedding_of_metadata}
% \begin{table}[ht!]
%     \centering
%     \begin{tabular}{lllllll}
%     \toprule
%     Target & Accuracy & Precision & Recall & F1 & AUROC \\
%      % &  &  &  &  &  &  \\
%     \midrule
%     Plane & 95.8 ± 0.6 & 95.4 ± 0.6 & 95.4 ± 0.6 & 95.4 ± 0.6 & 99.6 ± 0.1 \\
%     \hdashline
%     Scanner & 97.6 ± 0.1 & 97.7 ± 0.2 & 97.2 ± 0.1 & 97.4 ± 0.1 & 99.8 ± 0.0 \\
%     Pixel spacing & 70.6 ± 0.9 & 69.5 ± 1.0 & 68.8 ± 1.0 & 68.5 ± 1.0 & 94.1 ± 0.2 \\
%     Hospital ID & 73.5 ± 2.0 & 56.7 ± 1.6 & 54.9 ± 2.4 & 54.5 ± 2.4 & 90.9 ± 1.4 \\
%     Year of study & 63.9 ± 5.5 & 51.4 ± 4.4 & 51.6 ± 3.4 & 49.4 ± 2.5 & 88.7 ± 1.2 \\
%     GA & 56.0 ± 1.5 & 50.3 ± 1.8 & 49.4 ± 0.4 & 49.3 ± 0.9 & 82.1 ± 0.6 \\
%     BMI & 37.2 ± 1.5 & 39.2 ± 1.3 & 37.1 ± 1.4 & 37.5 ± 1.3 & 65.4 ± 0.7 \\
%     Parity & 58.2 ± 7.5 & 38.0 ± 1.6 & 37.0 ± 1.8 & 35.4 ± 1.4 & 59.2 ± 2.1 \\
%     % Term Birth & 75.4 ± 3.0 & 51.9 ± 0.8 & 51.7 ± 0.4 & 51.6 ± 0.6 & 52.5 ± 2.5 \\
%     Maternal age & 26.9 ± 2.1 & 27.6 ± 1.6 & 27.5 ± 1.5 & 24.9 ± 2.5 & 52.0 ± 1.4 \\
%     Smoking status & 73.5 ± 8.4 & 51.0 ± 0.5 & 51.2 ± 0.9 & 50.0 ± 1.5 & 52.0 ± 1.0 \\
%     Ethnicity & 89.3 ± 0.8 & 49.9 ± 1.4 & 50.0 ± 1.6 & 49.8 ± 1.4 & 46.0 ± 3.9 \\
%     \bottomrule
%     \end{tabular}
%     \caption{Test performance of ResNet50 classifier model trained to predict various target values in the fetal plane dataset. Continuous values are discretized and grouped into bins.}
%     \label{tab:fetal_dataset_sanity_check_test_metrics}
% \end{table}

% \begin{table}[]
%     \centering
%     \begin{tabular}{llllll}
%     \toprule
%     Target & Accuracy & Precision & Recall & F1 & AUROC \\
%      % &  &  &  &  &  \\
%     \midrule
%     Term Birth & 55.7 ± 1.8 & 57.5 ± 2.1 & 57.1 ± 1.6 & 55.4 ± 2.0 & 60.7 ± 2.3 \\
%     \hdashline
%     Scanner & 97.4 ± 0.8 & 93.1 ± 3.7 & 85.6 ± 2.5 & 88.6 ± 2.6 & 98.9 ± 0.2 \\
%     Pixel spacing & 72.7 ± 15.2 & 70.4 ± 16.1 & 70.0 ± 16.4 & 70.1 ± 16.3 & 93.7 ± 4.8 \\
%     Cervical length & 62.5 ± 7.8 & 65.2 ± 7.0 & 61.8 ± 9.0 & 62.8 ± 8.7 & 85.0 ± 4.9 \\
%     Hospital ID & 46.3 ± 14.3 & 39.9 ± 14.9 & 38.9 ± 15.3 & 37.9 ± 15.4 & 81.2 ± 9.2 \\
%     GA & 39.3 ± 4.5 & 32.6 ± 3.8 & 30.6 ± 3.3 & 30.7 ± 3.3 & 66.9 ± 3.2 \\
%     Year of study & 26.5 ± 3.7 & 25.6 ± 3.0 & 24.2 ± 3.2 & 24.2 ± 3.2 & 62.9 ± 3.7 \\
%     Birth weight & 21.5 ± 2.2 & 19.7 ± 1.6 & 18.1 ± 0.9 & 16.9 ± 1.2 & 54.2 ± 0.5 \\
%     BMI & 45.6 ± 4.8 & 24.3 ± 1.2 & 25.0 ± 0.8 & 23.8 ± 0.5 & 51.9 ± 2.0 \\
%     Maternal Age & 30.7 ± 3.8 & 24.8 ± 5.1 & 26.7 ± 0.9 & 25.1 ± 3.4 & 51.9 ± 0.8 \\
%     Placenta weight & 35.5 ± 3.2 & 25.2 ± 2.0 & 25.4 ± 2.2 & 24.8 ± 2.1 & 50.5 ± 5.4 \\
%     Smoking status & 85.4 ± 1.8 & 50.4 ± 1.6 & 50.6 ± 1.7 & 50.4 ± 1.6 & 50.3 ± 2.5 \\
%     Ethnicity & 89.8 ± 1.5 & 48.9 ± 1.6 & 49.0 ± 1.6 & 48.9 ± 1.6 & 50.3 ± 2.4 \\
%     Parity group & 61.5 ± 1.1 & 34.1 ± 3.4 & 32.9 ± 1.1 & 31.7 ± 1.7 & 46.4 ± 3.9 \\
%     \bottomrule
%     \end{tabular}
%     \caption{Test performance of ResNet50 classifier model trained to predict various target values in the cervix dataset.}
%     \label{tab:cervix_dataset_sanity_check_test_metrics}
% \end{table}

% \clearpage
  
\begin{table}[ht!]
    \centering
    \begin{DIFnomarkup}
    \resizebox{\textwidth}{!}{
        \begin{tabular}{llllll}
        \toprule
        \textbf{Target} & \textbf{Accuracy} & \textbf{Precision} & \textbf{Recall} & \textbf{F1} & \textbf{AUROC} \\
         % &  &  &  &  &  &  \\
        \midrule
        \multicolumn{6}{c}{\textbf{Fetal plane dataset}} \\
        \midrule
        Plane & 95.8 ± 0.6 & 95.4 ± 0.6 & 95.4 ± 0.6 & 95.4 ± 0.6 & 99.6 ± 0.1 \\
        \hdashline
        Scanner & 97.6 ± 0.1 & 97.7 ± 0.2 & 97.2 ± 0.1 & 97.4 ± 0.1 & 99.8 ± 0.0 \\
        Pixel spacing & 70.6 ± 0.9 & 69.5 ± 1.0 & 68.8 ± 1.0 & 68.5 ± 1.0 & 94.1 ± 0.2 \\
        Hospital ID & 73.5 ± 2.0 & 56.7 ± 1.6 & 54.9 ± 2.4 & 54.5 ± 2.4 & 90.9 ± 1.4 \\
        Year of study & 63.9 ± 5.5 & 51.4 ± 4.4 & 51.6 ± 3.4 & 49.4 ± 2.5 & 88.7 ± 1.2 \\
        GA & 56.0 ± 1.5 & 50.3 ± 1.8 & 49.4 ± 0.4 & 49.3 ± 0.9 & 82.1 ± 0.6 \\
        BMI & 37.2 ± 1.5 & 39.2 ± 1.3 & 37.1 ± 1.4 & 37.5 ± 1.3 & 65.4 ± 0.7 \\
        Parity & 58.2 ± 7.5 & 38.0 ± 1.6 & 37.0 ± 1.8 & 35.4 ± 1.4 & 59.2 ± 2.1 \\
        % Term Birth & 75.4 ± 3.0 & 51.9 ± 0.8 & 51.7 ± 0.4 & 51.6 ± 0.6 & 52.5 ± 2.5 \\
        Maternal age & 26.9 ± 2.1 & 27.6 ± 1.6 & 27.5 ± 1.5 & 24.9 ± 2.5 & 52.0 ± 1.4 \\
        Smoking status & 73.5 ± 8.4 & 51.0 ± 0.5 & 51.2 ± 0.9 & 50.0 ± 1.5 & 52.0 ± 1.0 \\
        Ethnicity & 89.3 ± 0.8 & 49.9 ± 1.4 & 50.0 ± 1.6 & 49.8 ± 1.4 & 46.0 ± 3.9 \\
        \midrule
        \multicolumn{6}{c}{\textbf{Cervix dataset}} \\
        \midrule
        Term Birth & 55.7 ± 1.8 & 57.5 ± 2.1 & 57.1 ± 1.6 & 55.4 ± 2.0 & 60.7 ± 2.3 \\
        \hdashline
        Scanner & 97.4 ± 0.8 & 93.1 ± 3.7 & 85.6 ± 2.5 & 88.6 ± 2.6 & 98.9 ± 0.2 \\
        Pixel spacing & 72.7 ± 15.2 & 70.4 ± 16.1 & 70.0 ± 16.4 & 70.1 ± 16.3 & 93.7 ± 4.8 \\
        Cervical length & 62.5 ± 7.8 & 65.2 ± 7.0 & 61.8 ± 9.0 & 62.8 ± 8.7 & 85.0 ± 4.9 \\
        Hospital ID & 46.3 ± 14.3 & 39.9 ± 14.9 & 38.9 ± 15.3 & 37.9 ± 15.4 & 81.2 ± 9.2 \\
        GA & 39.3 ± 4.5 & 32.6 ± 3.8 & 30.6 ± 3.3 & 30.7 ± 3.3 & 66.9 ± 3.2 \\
        Year of study & 26.5 ± 3.7 & 25.6 ± 3.0 & 24.2 ± 3.2 & 24.2 ± 3.2 & 62.9 ± 3.7 \\
        Birth weight & 21.5 ± 2.2 & 19.7 ± 1.6 & 18.1 ± 0.9 & 16.9 ± 1.2 & 54.2 ± 0.5 \\
        BMI & 45.6 ± 4.8 & 24.3 ± 1.2 & 25.0 ± 0.8 & 23.8 ± 0.5 & 51.9 ± 2.0 \\
        Maternal Age & 30.7 ± 3.8 & 24.8 ± 5.1 & 26.7 ± 0.9 & 25.1 ± 3.4 & 51.9 ± 0.8 \\
        Placenta weight & 35.5 ± 3.2 & 25.2 ± 2.0 & 25.4 ± 2.2 & 24.8 ± 2.1 & 50.5 ± 5.4 \\
        Smoking status & 85.4 ± 1.8 & 50.4 ± 1.6 & 50.6 ± 1.7 & 50.4 ± 1.6 & 50.3 ± 2.5 \\
        Ethnicity & 89.8 ± 1.5 & 48.9 ± 1.6 & 49.0 ± 1.6 & 48.9 ± 1.6 & 50.3 ± 2.4 \\
        Parity group & 61.5 ± 1.1 & 34.1 ± 3.4 & 32.9 ± 1.1 & 31.7 ± 1.7 & 46.4 ± 3.9 \\
        \bottomrule
        \end{tabular}
    }
    \end{DIFnomarkup}
    \caption{Test performance of ResNet50 classifier model trained to predict various target values in the fetal plane and cervix datasets. Continuous values are discretized and grouped into bins.}
    \label{tab:fetal_cervix_dataset_sanity_check_test_metrics}
\end{table}

\begin{table}[!ht]
    \centering
    \resizebox{\textwidth}{!}{
        \begin{tabular}{llllll}
        \toprule
        Target & Accuracy & Precision & Recall & F1 & AUROC \\
         % &  &  &  &  &  \\
        \midrule
        Term Birth & 67.5 ± 2.5 & 67.7 ± 2.4 & 67.5 ± 2.5 & 67.4 ± 2.6 & 73.9 ± 3.0 \\
        \hdashline
        Px Spacing & 70.9 ± 3.8 & 69.4 ± 3.1 & 70.3 ± 3.3 & 69.4 ± 3.1 & 95.3 ± 0.9 \\
        Py Spacing & 70.5 ± 3.0 & 66.6 ± 2.9 & 68.6 ± 3.1 & 66.6 ± 3.1 & 95.2 ± 1.0 \\
        Cervical Length & 52.6 ± 2.0 & 52.8 ± 2.3 & 55.6 ± 1.9 & 53.3 ± 2.0 & 80.0 ± 1.6 \\
        Scanner & 56.3 ± 6.4 & 32.0 ± 3.1 & 55.7 ± 9.9 & 30.4 ± 5.5 & 77.2 ± 5.7 \\
        Birth Weight & 21.4 ± 1.5 & 22.0 ± 1.8 & 23.7 ± 2.5 & 21.0 ± 1.9 & 61.2 ± 1.9 \\
        Placenta Weight & 28.8 ± 1.5 & 27.5 ± 1.2 & 30.0 ± 3.7 & 25.0 ± 1.6 & 55.4 ± 2.9 \\
        BMI & 27.4 ± 2.5 & 26.9 ± 2.2 & 28.4 ± 3.6 & 23.1 ± 2.4 & 53.3 ± 3.0 \\
        GA & 34.6 ± 3.0 & 27.7 ± 3.3 & 27.7 ± 3.5 & 27.5 ± 3.4 & 48.4 ± 2.0 \\
        \bottomrule
        \end{tabular}
    }
    \caption{Test performance of the SA-SonoNet model trained for preterm birth prediction and subsequently fine-tuned to predict the other targets.}
    \label{tab:cervix_dataset_finetuned_test_metrics}
\end{table}

\clearpage

\section{Full Correlation Matrix Plots}
\label{sec:full_covariance_matrix_plot}
% \begin{figure}[ht!]
%     \centering
%     \subfigure[Fine-tuned model; full dataset]{\includegraphics[width=0.49\textwidth]{figures/seed_42_sonai_plane_resnet50_final_layer_weight_correlation_pca.png}\label{fig:full_correlation_sonai_plane_classifier}}
%     \subfigure[Multi-task model; full dataset]{\includegraphics[width=0.49\textwidth]{figures/seed_42_sonai_multitask_resnet50_final_layer_weight_correlation_pca.png}\label{fig:full_correlation_sonai_multitask_classifier}}
%     \subfigure[Fine-tuned model; bias-induced dataset]{\includegraphics[width=0.49\textwidth]{figures/seed_43_sonai_biased_plane_resnet50_final_layer_weight_correlation_pca.png}\label{fig:full_correlation_sonai_biased_plane_classifier}}
%     \subfigure[Multi-task model; bias-induced dataset]{\includegraphics[width=0.49\textwidth]{figures/seed_43_sonai_biased_multitask_resnet50_final_layer_weight_correlation_pca.png}\label{fig:full_correlation_sonai_biased_multitask_classifier}}
%     \caption{Full correlation matrix between weight vectors from classification head of each targets, extracted from fine-tuned or multitask classifier models, trained using the entire fetal dataset or with induced bias.}
%     \label{fig:full_correlation_matrix}
% \end{figure}

\begin{figure}[!ht]
    \centering
    \includegraphics[width=\textwidth]{figures/seed_42_sonai_plane_resnet50_final_layer_weight_correlation_pca.png}
    \caption{Full correlation matrix between weight vectors from the classification head of each target, extracted from the fine-tuned classifier model trained using the entire fetal dataset.}
    \label{fig:full_correlation_sonai_plane_classifier}
\end{figure}

\begin{figure}[!ht]
    \centering
    \includegraphics[width=\textwidth]{figures/seed_42_sonai_multitask_resnet50_final_layer_weight_correlation_pca.png}
    \caption{Full correlation matrix between weight vectors from the classification head of each target, extracted from the multitask classifier model trained using the entire fetal dataset.}
    \label{fig:full_correlation_sonai_multitask_classifier}
\end{figure}

\begin{figure}[!ht]
    \centering
    \includegraphics[width=\textwidth]{figures/seed_42_sonai_balanced_plane_resnet50_final_layer_weight_correlation_pca.png}
    \caption{Full correlation matrix between weight vectors from the classification head of each target, extracted from the fine-tuned classifier model trained using the balanced fetal dataset.}
    \label{fig:full_correlation_sonai_balanced_plane_classifier}
\end{figure}

\begin{figure}[!ht]
    \centering
    \includegraphics[width=\textwidth]{figures/seed_42_sonai_balanced_multitask_resnet50_final_layer_weight_correlation_pca.png}
    \caption{Full correlation matrix between weight vectors from the classification head of each target, extracted from the multitask classifier model trained using the balanced fetal dataset.}
    \label{fig:full_correlation_sonai_balanced_multitask_classifier}
\end{figure}

\begin{figure}[!ht]
    \centering
    \includegraphics[width=\textwidth]{figures/seed_42_sonai_biased_plane_resnet50_final_layer_weight_correlation_pca.png}
    \caption{Full correlation matrix between weight vectors from the classification head of each target, extracted from the fine-tuned classifier model trained using the fetal dataset with induced bias.}
    \label{fig:full_correlation_sonai_biased_plane_classifier}
\end{figure}

\begin{figure}[!ht]
    \centering
    \includegraphics[width=\textwidth]{figures/seed_42_sonai_biased_multitask_resnet50_final_layer_weight_correlation_pca.png}
    \caption{Full correlation matrix between weight vectors from the classification head of each target, extracted from the multitask classifier model trained using the fetal dataset with induced bias.}
    \label{fig:full_correlation_sonai_biased_multitask_classifier}
\end{figure}

\begin{figure}[h!]
    \centering
    \includegraphics[width=\textwidth]{figures/sf4_sasononet_final_layer_weight_correlation_pca.png}
    \caption{Full correlation matrix between weight vectors from the classification heads of the targets of the fine-tuned SA-SonoNet model. A reference line is added to represent the average pooling layer in the original model.}
    \label{fig:full_cropped_correlation_cervix_finetuned_classifier}
\end{figure}

\clearpage

\section{Full Results: Validation of Shortcut Learning Detection via Induced Bias}
\label{sec:full_results_validation_via_induced_bias}

\begin{table}[ht!]
    \centering
    \begin{DIFnomarkup}
    \resizebox{\textwidth}{!}{
        \begin{tabular}{@{}l@{}cccccc@{}}
        \toprule
         & \multicolumn{3}{c}{\textbf{Fine-tuned model}}& \multicolumn{3}{c}{\textbf{Multitask model}} \\
        \cmidrule(rl){2-4}\cmidrule(rl){5-7} \textbf{Target} & \textbf{Accuracy} & \textbf{F1} & \textbf{AUROC} & \textbf{Accuracy} & \textbf{F1} & \textbf{AUROC} \\
        \midrule
        Plane & 99.0 ± 0.6 & 99.0 ± 0.6 & 75.0 ± 0.0 & 97.2 ± 1.1 & 97.2 ± 1.1 & 74.9 ± 0.0 \\
        \hdashline
        Scanner & 77.0 ± 2.9 & 76.7 ± 2.8 & 91.7 ± 0.7 & 95.4 ± 0.7 & 95.4 ± 0.7 & 99.3 ± 0.3 \\
        Pixel Spacing & 56.1 ± 3.5 & 47.0 ± 4.2 & 86.6 ± 1.0 & 61.1 ± 2.4 & 53.5 ± 3.5 & 91.9 ± 0.8 \\
        Year Of Study & 45.9 ± 4.5 & 39.6 ± 3.4 & 76.6 ± 3.1 & 61.0 ± 2.2 & 52.0 ± 1.6 & 90.5 ± 0.6 \\
        GA & 47.2 ± 3.4 & 37.0 ± 2.2 & 69.8 ± 1.8 & 58.0 ± 1.8 & 46.6 ± 1.1 & 82.8 ± 0.7 \\
        BMI & 34.9 ± 3.9 & 22.8 ± 3.1 & 67.3 ± 1.2 & 50.0 ± 2.2 & 37.9 ± 5.5 & 75.9 ± 2.4 \\
        Hospital ID & 45.3 ± 0.7 & 33.2 ± 2.3 & 64.6 ± 1.2 & 66.8 ± 1.2 & 48.3 ± 1.7 & 78.0 ± 0.2 \\
        Parity & 59.0 ± 10.5 & 37.2 ± 3.1 & 59.8 ± 2.4 & 59.0 ± 7.4 & 36.3 ± 2.2 & 68.2 ± 2.1 \\
        Smoking Status & 81.6 ± 6.0 & 48.2 ± 1.9 & 59.8 ± 2.0 & 77.7 ± 6.9 & 48.5 ± 3.0 & 54.3 ± 4.8 \\
        Maternal Age & 24.0 ± 2.2 & 21.7 ± 2.7 & 53.2 ± 1.9 & 29.5 ± 5.5 & 28.7 ± 4.8 & 53.9 ± 3.1 \\
        Ethnicity & 82.8 ± 10.4 & 47.0 ± 1.9 & 50.1 ± 8.7 & 70.9 ± 12.8 & 43.9 ± 3.1 & 44.6 ± 6.1 \\
        Term Birth & 67.1 ± 7.4 & 42.5 ± 2.4 & 35.7 ± 4.5 & 67.0 ± 9.1 & 47.1 ± 3.6 & 46.3 ± 4.6 \\
        \bottomrule
        \end{tabular}
    }
    \end{DIFnomarkup}
    \caption{Test performance of ResNet50 classifier model when fine-tuned to predict the other targets, or trained to predict various targets in a multitask setting, using the balanced fetal dataset.}
    \label{tab:balanced_fetal_dataset_finetuned_multitask_test_metrics}
\end{table}

\begin{table}[ht!]
    \centering
    \begin{DIFnomarkup}
    \resizebox{\textwidth}{!}{
        \begin{tabular}{@{}l@{}cccccc@{}}
        \toprule
         & \multicolumn{3}{c}{\textbf{Fine-tuned model}}& \multicolumn{3}{c}{\textbf{Multitask model}} \\
        \cmidrule(rl){2-4}\cmidrule(rl){5-7} \textbf{Target} & \textbf{Accuracy} & \textbf{F1} & \textbf{AUROC} & \textbf{Accuracy} & \textbf{F1} & \textbf{AUROC} \\
        \midrule
        Plane & 98.3 ± 0.9 & 98.3 ± 0.9 & 75.0 ± 0.0 & 95.9 ± 0.9 & 95.9 ± 0.9 & 74.8 ± 0.1 \\
        \hdashline
        Scanner & 80.0 ± 2.3 & 80.0 ± 2.3 & 94.2 ± 1.0 & 95.6 ± 0.5 & 95.6 ± 0.4 & 99.2 ± 0.1 \\
        Pixel Spacing & 51.8 ± 2.1 & 45.0 ± 1.2 & 85.0 ± 1.3 & 64.3 ± 2.3 & 61.7 ± 2.6 & 91.8 ± 0.5 \\
        Year Of Study & 51.9 ± 4.3 & 40.5 ± 2.6 & 82.3 ± 1.6 & 67.3 ± 4.3 & 50.2 ± 2.8 & 91.4 ± 0.8 \\
        GA & 46.8 ± 1.6 & 40.0 ± 1.9 & 75.3 ± 1.1 & 55.7 ± 2.5 & 46.1 ± 2.4 & 83.6 ± 0.5 \\
        Hospital ID & 45.1 ± 0.7 & 31.1 ± 0.6 & 75.2 ± 1.3 & 66.2 ± 2.6 & 45.4 ± 2.9 & 85.8 ± 1.3 \\
        Parity & 58.8 ± 4.0 & 40.9 ± 4.4 & 71.0 ± 1.3 & 71.2 ± 3.5 & 33.6 ± 1.5 & 68.9 ± 2.7 \\
        Smoking Status & 83.5 ± 3.5 & 49.4 ± 3.0 & 55.9 ± 7.9 & 84.7 ± 4.2 & 51.2 ± 2.8 & 47.8 ± 3.5 \\
        Maternal Age & 26.9 ± 2.3 & 24.9 ± 2.7 & 54.7 ± 2.8 & 28.8 ± 4.9 & 28.1 ± 4.0 & 54.1 ± 3.4 \\
        Ethnicity & 90.0 ± 3.1 & 49.8 ± 1.8 & 54.5 ± 9.8 & 93.1 ± 1.4 & 51.8 ± 3.2 & 46.6 ± 2.6 \\
        Term Birth & 65.5 ± 5.0 & 43.6 ± 1.1 & 40.0 ± 1.7 & 84.4 ± 0.6 & 48.2 ± 0.7 & 48.2 ± 3.2 \\
        BMI & 34.0 ± 2.5 & 23.1 ± 1.7 & 39.1 ± 1.5 & 45.6 ± 6.9 & 29.9 ± 3.2 & 48.9 ± 1.5 \\
        \bottomrule
        \end{tabular}
    }
    \end{DIFnomarkup}
    \caption{Test performance of ResNet50 classifier model when fine-tuned to predict the other targets, or trained to predict various targets in a multitask setting, using the biased fetal dataset.}
    \label{tab:biased_fetal_dataset_finetuned_multitask_test_metrics}
\end{table}

\clearpage

\section{Full Results: Empirical Determination of the PCA Projection Threshold}
\label{sec:full_plot_empirical_determination_of_pca_projection_threshold}

\begin{figure}[!ht]
    \centering
    \includegraphics[width=1\linewidth]{figures/effect_of_pca_threshold_k_on_correlation_value_full_finetuned_model.png}
    \caption{Sensitivity of correlation values to the number of principal components $k$ in fine-tuned models.}
    \label{fig:effect_of_pca_threshold_k_on_correlation_value_full_finetuned_model}
\end{figure}

\begin{figure}[!ht]
    \centering
    \includegraphics[width=1\linewidth]{figures/effect_of_pca_threshold_k_on_correlation_value_full_multitask_model.png}
    \caption{Sensitivity of correlation values to the number of principal components $k$ in multitask models.}
    \label{fig:effect_of_pca_threshold_k_on_correlation_value_full_multitask_model}
\end{figure}

% \section{Discretization of continuous values into groups}
% Here is the details about how we bin the continuous values into groups, allowing us to train models to predict them as a classification task.

% \clearpage


% \section{Introduction}

% This is where the content of your paper goes.  Some random
% notes\footnote{Random footnote are discouraged}:
% \begin{itemize}
% \item You should use \LaTeX \cite{Lamport:Book:1989}.
% \item JMLR/PMLR uses natbib for references. For simplicity, here, \verb|\cite|  defaults to
%   parenthetical citations, i.e.\ \verb|\citep|. You can of course also
%   use \verb|\citet| for textual citations.
% \item Eprints such as arXiv papers can of course be cited \cite{Hinton:arXiv:2015:Distilling}. We recomend using a \verb|@misc| bibtex entry for these as shown in the sample bibliography.
% \item You should follow the guidelines provided by the conference.
% \item Read through the JMLR template documentation for specific \LaTeX
%   usage questions.
% \item Note that the JMLR template provides many handy functionalities
% such as \verb|\figureref| to refer to a figure,
% e.g. \figureref{fig:example},  \verb|\tableref| to refer to a table,
% e.g. \tableref{tab:example} and \verb|\equationref| to refer to an equation,
% e.g. \equationref{eq:example}.
% \end{itemize}

% \begin{table}[htbp]
%  % The first argument is the label.
%  % The caption goes in the second argument, and the table contents
%  % go in the third argument.
% \floatconts
%   {tab:example}%
%   {\caption{An Example Table}}%
%   {\begin{tabular}{ll}
%   \bfseries Dataset & \bfseries Result\\
%   Data1 & 0.12345\\
%   Data2 & 0.67890\\
%   Data3 & 0.54321\\
%   Data4 & 0.09876
%   \end{tabular}}
% \end{table}

% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example}
%   {\caption{Example Image}}
%   {\includegraphics[width=0.5\linewidth]{example-image}}
% \end{figure}

% \begin{algorithm2e}
% \caption{Computing Net Activation}
% \label{alg:net}
%  % older versions of algorithm2e have \dontprintsemicolon instead
%  % of the following:
%  %\DontPrintSemicolon
%  % older versions of algorithm2e have \linesnumbered instead of the
%  % following:
%  %\LinesNumbered
% \KwIn{$x_1, \ldots, x_n, w_1, \ldots, w_n$}
% \KwOut{$y$, the net activation}
% $y\leftarrow 0$\;
% \For{$i\leftarrow 1$ \KwTo $n$}{
%   $y \leftarrow y + w_i*x_i$\;
% }
% \end{algorithm2e}

% \section{Proof of Theorem 1}

% This is a boring technical proof of
% \begin{equation}\label{eq:example}
% \cos^2\theta + \sin^2\theta \equiv 1.
% \end{equation}

% \section{Proof of Theorem 2}

% This is a complete version of a proof sketched in the main text.

\end{document}
