\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{amsmath}
\usepackage{bm}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{graphicx}
\usepackage{microtype}
\usepackage[table]{xcolor}

\usepackage{cbar}
\cbfalse


\jmlrvolume{-- Under Review}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025 submission}
\editors{Under Review for MIDL 2025}

\title[Artifact Severity Scoring of MR Images]{An Unsupervised Approach for Artifact Severity Scoring in Multi-Contrast MR Images}


\midlauthor{\Name{Savannah P. Hays\nametag{$^{1}$}}\orcid{0009-0005-8711-1356} \Email{shays6@jhu.edu}\\
\addr $^{1}$ Department of Electrical and Computer Engineering, Johns~Hopkins~University, USA \AND
\Name{Lianrui~Zuo\nametag{$^{2}$}}\orcid{0000-0002-5923-9097} \Email{lianrui.zuo@vanderbilt.edu}\\
\addr $^{2}$ Department of Electrical and Computer Engineering, Vanderbilt~University, USA \AND
\Name{Blake E. Dewey\nametag{$^{3}$}}\orcid{0000-0003-4554-5058} \Email{blake.dewey@jhu.edu}\\
\addr $^{3}$ Department of Neurology, Johns~Hopkins~School~of~Medicine, USA\AND
\Name{Samuel W. Remedios\nametag{$^{4}$}}\orcid{0000-0001-8634-8128} \Email{sremedi1@jhu.edu}\\
\addr $^{4}$ Department of Computer Science, Johns~Hopkins~University, USA\AND
\Name{Jinwei~Zhang\nametag{$^{1}$}} \Email{jwzhang@jhu.edu}\\
\Name{Ellen M. Mowry\nametag{$^{3}$}}\orcid{0000-0003-0623-5188} \Email{emowry1@jhmi.edu}\\
\Name{Scott D. Newsome\nametag{$^{3}$}}\orcid{0000-0002-5284-4681} \Email{snewsom2@jhmi.edu}\\
\Name{Aaron~Carass\nametag{$^{1}$}}\orcid{0000-0003-4939-5085} \Email{aaron\_carass@jhu.edu}\\
\Name{Jerry~L.~Prince\nametag{$^{1,4}$}}\orcid{0000-0002-6553-0876} \Email{prince@jhu.edu}\\
}


\begin{document}

\maketitle

\begin{abstract}
Quality assurance~(QA) in magnetic resonance~(MR) imaging is critical but remains a challenging and time-intensive process, particularly when working with large-scale, multi-site imaging datasets.
Manual QA methods are subjective, prone to inter-rater variability, and impractical for high-throughput workflows.
Existing automated QA methods often lack generalizability to diverse datasets or fail to provide interpretable insights into the causes of poor image quality.
To address these limitations, we introduce an unsupervised and interpretable QA framework for multi-contrast MR images that quantifies artifact severity.
By assigning a numerical score to each image, our method enables objective, consistent evaluation of image quality and highlights specific levels of artifact presence that can impair downstream analysis.
Our framework employs an unsupervised contrastive learning approach, leveraging simulated artifact transformations, including random bias, noise, anisotropy, and ghosting, to train the model without requiring manual labels or preprocessing. 
A margin-based contrastive loss further enables differentiation between varying levels of artifact severity.
We validate our framework using simulated artifacts on a public dataset and real artifacts on a private clinical dataset, demonstrating its robustness and generalizability for automatic MR image QA.
By efficiently evaluating image quality and identifying artifacts prior to data processing, our approach streamlines QA workflows and enhances the reliability of subsequent analyses in both research and clinical settings.
\end{abstract}

\begin{keywords}
MRI, Quality Assurance, Artifact Detection
\end{keywords}

\section{Introduction}
Magnetic resonance~(MR) imaging is a cornerstone of medical diagnostics and research, offering unparalleled insights into the structure and function of tissues~\cite{bernstein2004handbookofmri}.
However, the quality of MR images can be significantly compromised by various artifacts, including bias field inhomogeneities, noise, motion, anisotropic resolution, and ghosting~\cite{zaitsev2015motionartifacts}. 
These artifacts not only degrade image interpretability but also affect downstream analyses, potentially leading to erroneous conclusions in both clinical and research settings~\cite{zaitsev2015motionartifacts}.
Consequently, robust quality assurance~(QA) of MR images is essential to ensure the reliability of data and analyses.

Artifacts in MR images can have far-reaching implications.
For instance, in clinical settings, poor image quality can lead to misdiagnoses or necessitate costly and time-consuming rescans.
In research, artifacts can bias analyses, particularly in multi-site studies where variability in image quality is compounded by differences in scanner hardware, acquisition protocols, and patient populations.
Advanced techniques
% such as segmentation, radiomics, and machine learning pipelines
are particularly vulnerable to artifact-induced errors, underscoring the need for automated QA systems.

Existing MR image QA methods often involve manual inspection, which is labor-intensive, subjective, and prone to inter-rater variability~\cite{esteban2017mriqc, alfaro2018ukbiobank}.
Automated methods frequently rely on preprocessing steps such as image registration or supervised training paradigms, which require large, labeled datasets and are susceptible to biases inherent in the training data~\cite{esteban2017mriqc}.
Traditional quality control~(QC) metrics like signal-to-noise ratio~(SNR) and contrast-to-noise ratio~(CNR) are limited in their ability to capture complex artifacts.
% , such as those arising from motion or anisotropic resolution~\cite{esteban2017mriqc}.
Moreover, these approaches fail to provide interpretable outputs that can guide users in identifying corrupted images within their dataset.

\Added{Recent advances in deep learning have led to the development of supervised artifact detection models, which rely on labeled training data to classify images as high- or low-quality.
For example,~\citet{oksuz2021brainmriartefact} introduced a convolutional neural network (CNN)-based approach for detecting and correcting motion artifacts in brain MRI, demonstrating the effectiveness of deep learning in artifact detection.
However, supervised models require large annotated datasets and often struggle to generalize to unseen artifact types.}{R1}{ADDED}

\Added{More recently, self-supervised and transfer learning methods have been explored for MRI quality assessment.
\citet{vakli2023automrimotion} proposed an end-to-end deep learning model trained on image quality metrics to classify motion artifacts.
Similarly, \citet{loizillon2024automriqa} leveraged simulated artifacts and transfer learning to develop an automated MRI QA framework.
While supervised approaches have shown promise in MR image quality assessment, they rely heavily on large, manually labeled datasets, which can be challenging to acquire and prone to inter-rater variability.
In particular, some artifacts---such as subtle anisotropic blurring or low-level bias field distortions---are not always easily recognizable by human reviewers, making manual annotation subjective and inconsistent.
Additionally, supervised models are inherently limited by the quality and diversity of their training data, often struggling to generalize to unseen artifacts or new imaging protocols across different scanners.}{R1}{ADDED}

\Added{In contrast, an unsupervised approach circumvents these challenges by learning a continuous artifact severity scale without requiring explicit labels.
This allows for more scalable training across large, heterogeneous datasets while avoiding biases introduced by expert annotations.
Moreover, unsupervised learning is particularly beneficial for multi-site studies where variations in scanner hardware, acquisition parameters, and patient populations introduce unpredictable artifact distributions.
Our work builds upon these methods by employing a fully unsupervised approach that does not require explicit artifact labels, making it more flexible and scalable to diverse datasets.}{R1}{ADDED}

MRIQC~\cite{esteban2017mriqc} is a widely used tool that reports a range of image quality metrics (IQMs) for structural, functional, and diffusion-weighted MR images. 
For structural images, it supports both T$_1$-weighted~(T$_1$-w) and T$_2$-weighted~(T$_2$-w) modalities and provides quantitative assessments based on noise, entropy, and contrast-related measurements.
While MRIQC generates detailed reports, interpreting these IQMs can be challenging, especially for large datasets.
It remains difficult to establish consistent thresholds and optimal combinations of IQMs to determine whether an image should pass or fail QA.

Among MRIQC’s structural IQMs, we identified four key metrics that are relevant to our work: coefficient of joint variation~(CJV), contrast-to-noise ratio~(CNR), entropy focus criterion~(EFC), and foreground-background energy ratio~(FBER).
CJV~\cite{ganzetti2016inhomogeneitycorrection} measures intensity variability between gray matter~(GM) and white matter~(WM) and is sensitive to head motion and intensity non-uniformity artifacts.
% Lower CJV values indicate better image quality.
CNR~\cite{magnotta2006measurementsnr} assesses how well the intensity distributions of GM and WM are separated.
% Higher CNR values correspond to better contrast and, therefore, better image quality.
EFC~\cite{atkinson1997autocorrection} is based on Shannon entropy and quantifies the amount of ghosting and blurring induced by motion artifacts.
% Lower EFC values indicate fewer motion-induced distortions.
FBER~\cite{shehzad2015preprocessedconnectomes} measures the mean energy of voxel intensities within the brain relative to areas outside the brain.
% Higher FBER values are generally associated with better image quality.


Although MRIQC provides useful quantitative measures, determining a single threshold or combination of IQMs that universally defines poor-quality images remains a challenge.
The complexity of these metrics and their dataset dependence further motivate the need for a more interpretable and automated QA framework.

In this work, we propose an automatic method for QA of MR images that addresses these limitations.
% Our approach generates interpretable artifact severity scores, where higher scores correspond to worse image quality and lower scores indicate better image quality.
% These scores can be used to set thresholds for automatic QA, enabling users to exclude poor-quality images prior to downstream processing.
%
\Added{%
Our approach generates objective artifact severity scores without relying on subjective expert labels.
This enables a more scalable and generalizable solution for automated MR image QA, particularly in multi-site studies where image quality can vary significantly due to differences in scanner hardware and acquisition settings.
By learning a continuous severity scale rather than classifying images into discrete quality categories, our model is capable of capturing subtle variations in artifact intensity that may not be readily discernible through manual review.
These scores can be used to set adaptive thresholds for automatic QA, enabling users to systematically identify and exclude poor-quality images prior to downstream processing.}{R1}{ADDED}
Our method does not require image preprocessing steps, making it computationally efficient and broadly applicable across different datasets.
It can be applied directly to NIfTI files of many MR image modalities.

Central to our approach is an unsupervised training framework based on contrastive learning inspired by that of~\citet{zuo2022haca3,zuo2023latent}.
By leveraging a data loader that applies a diverse set of realistic transformations---including random bias, noise, anisotropy, and ghosting---we simulate a wide range of artifact severities.
This enables the model to learn discriminative features that correlate with image quality without the need for labeled training data.
% Contrastive learning, which pulls similar samples closer in feature space while pushing dissimilar ones apart, is particularly well-suited for this task as it enables the model to generalize to unseen artifact types.
\Added{Contrastive learning, which pulls similar samples closer in feature space while pushing dissimilar ones apart, is particularly well-suited for this task as it learns a meaningful ranking of artifact severity without explicit supervision.
Unlike classification-based approaches that rely on pre-defined labels, contrastive learning naturally organizes images into a continuous quality spectrum, making it more adaptable to real-world variations in artifact intensity.
Furthermore, since some artifacts can be difficult to detect through traditional quality metrics, a contrastive framework allows the model to learn subtle but clinically relevant degradation patterns that may not be captured by predefined quality scores.
}{R1}{ADDED}
In addition, our method is capable of recognizing and penalizing poor resolution, a critical but often overlooked aspect of image quality.

Our approach is designed to address the growing reliance on large, multi-site datasets where artifact heterogeneity poses significant challenges.
By providing interpretable artifact severity scores, our method promotes reproducibility and reliability in medical imaging studies.
Moreover, it can be seamlessly integrated into automated pipelines for preprocessing large-scale datasets, significantly reducing the burden of manual QC and enabling more efficient use of resources.

We validate our approach by comparing it with MRIQC~\cite{esteban2017mriqc}, demonstrating its effectiveness in accurately identifying low-quality images and providing actionable insights for improving data quality.
Our method offers a scalable and interpretable solution for automatic MR image QA, paving the way for more reliable and reproducible analyses in medical imaging.
Our model is open source and is publicly available from:
\url{https://github.com/shays15/artifact_scoring}.

\section{Methods}
\subsection{Overview}
Our model assigns an artifact score to an MR image, quantifying the level of artifacts present.
The model was trained in an unsupervised fashion using a diverse set of simulated artifacts.
It leverages triplet loss and the L2 loss to learn meaningful representations of image quality.
The overall training and inference workflow is illustrated in Fig.~\ref{fig:methods}.

\begin{figure}[!tb]
\centering
\includegraphics[width=0.85\textwidth]{MIDLLatexTemplate-master/figs/methods_wider.png} 
\caption{Training and inference workflow for our model. During training, three images are used to calculate the total loss. Img A and Img B are two different, clean image slices, while Img C is Img A with a randomly simulated artifact.}
\label{fig:methods}
\end{figure}

\subsection{Training Dataset}
We trained our model using 297 structural MR volumes from the TRaditional vs. Early Aggressive Therapy for Multiple Sclerosis (TREAT-MS) pragmatic, clinical trial (NCT03500328).
These scans were acquired from seven different imaging sites and included multiple contrasts: T$_1$-w, T$_2$-w, T$_2$-w FLAIR, and proton density~(PD) images.
To improve generalization across scanners and imaging conditions, only high-quality images were included in the training dataset.
Prior to training, the images were N4 bias field corrected~\cite{tustison2010n4itk} to address intensity inhomogeneities and 2D acquisitions were super-resolved~\cite{remedios2023sashimi} to ensure consistent resolution.
These preprocessing steps were necessary for artifact simulation but are not required during inference, ensuring the model remains applicable to diverse datasets without additional preprocessing.

\subsection{Model Architecture}
Our model operates on 2D MR image slices, making it computationally efficient while allowing for a larger number of training samples.
\Added{Input slices of any size are resized to $224\times224$.}{R2}{ADDED}
The architecture consists of two key components: a custom convolutional block and a primary encoder.
The convolutional block is composed of two convolutional layers with $3\times 3$ kernels, each followed by instance normalization and a LeakyReLU activation function, allowing the network to effectively capture spatial features.
The encoder stacks two of the custom convolutional blocks with increasing channel dimensions, followed by a single convolutional layer to reduce the feature dimensions and an adaptive average pooling operation to reduce the output to the desired dimension.
An absolute value function is applied to enforce non-negative scores for interpretability.
During inference, an MR volume is assigned a single score by averaging scores across the middle 60\% of slices.

\subsection{Training and Artifact Simulation}
\label{sec:methods-artsim}
To effectively train the model, we developed a data augmentation module that simulates common MR image artifacts in a controlled manner.
These artifact transformations were implemented using the TorchIO library~\cite{perez2021torchio} and include random noise, random ghosting, random bias field, and random anisotropy.
Random noise introduces varying levels of noise in the images.
Random ghosting simulates motion-induced ghosting with varying intensities and repetition.
Random bias field introduces intensity inhomogeneities mimicking scanner-specific artifacts.
Random anisotropy reduces spatial resolution to simulate anisotropic acquisitions.
Each transformation is parameterized and assigned a calculated severity score~(SS) in the range $[0, 1]$, as detailed in Table~\ref{tab:parameters}.
The parameters are randomly sampled to ensure continuity in the artifact space, exposing the model to diverse image degradations.
Figure~\ref{fig:training-data} illustrates example images with increasing severity scores.
During training, the severity scores are used as the margin in the triplet loss to emphasize the relative differences between clean and artifact-degraded images.
The model itself is only constrained to produce non-negative outputs.
It is not constrained to produce outputs within a specific range.
This design allows the model to flexibly assign scores based on the learned features.

\begin{figure}[!tb]
\centering
\includegraphics[width=0.9\textwidth]{MIDLLatexTemplate-master/figs/training_examples.png} 
\caption{Increasing severity scores~(SS) in the range $[0, 1]$ (left to right) of the simulated artifacts (from top to bottom: noise, ghosting, anisotropy, bias) seen during training.}
\label{fig:training-data}
\end{figure}

\begin{table}[!tb]
\centering
%
\caption{Each artifact and its parameters used in the severity score~(SS). The parameters were uniformly sampled in the corresponding range to ensure continuity of the artifact space.}
%
\begin{tabular}{@{}lllc@{}}
\toprule
\textbf{Artifact}& \textbf{Input} &  \textbf{Parameters} & \textbf{Severity Score~(SS)} \\
\cmidrule{1-4}
\rowcolor{cyan!5}
\textbf{Noise}& \texttt{std} & $\mathcal{U}[0.005, 0.2]$& $\dfrac{\text{std} - 0.005}{0.2 - 0.005}$\\
\cmidrule{1-4}
\multirow{2}{*}{\textbf{Ghosting}} & \texttt{num\_ghosts} & $\mathcal{U}\{2, \cdots, 10\}$ & \multirow{2}{*}{$\dfrac{(\text{intensity} - 0.2) + \frac{\text{num\_ghosts}}{10}}{(1.5 - 0.2) + 1}$}\\
%
& \texttt{intensity} & $\mathcal{U}[0.2, 1.5]$ & \\
%
\cmidrule{1-4}
\rowcolor{cyan!5}
\textbf{Bias Field} & \texttt{coefficients} & $\mathcal{U}[0.01, 0.3]$&$\dfrac{\text{coefficients} - 0.01}{0.3 - 0.01}$\\
\cmidrule{1-4}
\textbf{Anisotropy}& \texttt{scale}& $\mathcal{U}[1, 4]$ & $\dfrac{\text{scale} - 1}{4 - 1}$\\ \bottomrule
\end{tabular}
\label{tab:parameters}
\end{table}


\subsection{Loss Functions}
% During training, three images are passed through the network.
% The anchor is a clean image without any artifact simulation.
% The positive sample is another randomly selected clean image distinct from the anchor.
% The negative sample is the anchor with a randomly applied artifact simulation.
% Two loss functions are employed to guide the model's learning.
% First, we calculated an L2 loss.
% This measures the difference between the scores of the anchor and the positive sample.
% By minimizing this loss, the model ensures that clean images, which lack artifacts, are assigned consistently low scores, indicating high image quality.
% The second loss is the triplet loss (sometimes referred to as the contrastive loss).
% The triplet loss uses the scores from the anchor, the positive sample, and the negative sample.
% This loss encourages the score of the anchor to be similar to that of the positive sample, while the score from the negative sample is further apart.
% The triplet loss incorporates a margin that varies dynamically based on the artifact severity score of the negative sample.
% For high-severity artifacts, the margin is set higher, emphasizing a larger separation between clean and degraded images.
% For low-severity artifacts, the margin is smaller, reflecting subtler differences in quality.
% This adaptive margin ensures that the model's scores are interpretable and directly correlated with artifact severity.
\Added{During training, three images are passed through the network: a clean (anchor) image, a second clean image (positive sample), and the anchor image with a simulated artifact (negative sample). 
To learn robust artifact representations, the model is optimized using two loss functions: L2 loss and triplet loss.}{R1\\\hspace*{1.2em}R3}{ADDED}

\Added{\textbf{L2 Loss:} The L2 loss ensures that clean images are consistently assigned low severity scores, encouraging stability in the artifact-free case:
\begin{equation}
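    \mathcal{L}_{L2} = \frac{1}{N} \sum_{i=1}^{N} \left(S_i^{\text{anchor}} - S_i^{\text{positive}}\right)^2
\end{equation}
where \( S_i^{\text{anchor}} \) and \( S_i^{\text{positive}} \) are the predicted severity scores of the two clean images in the \( i \)-th triplet and \( N \) is the number of triplets.}{R1\\\hspace*{1.2em}R3}{ADDED}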

\Added{\textbf{Triplet Loss:} The triplet loss enforces a ranking such that the clean anchor image receives a lower severity score than the artifact-degraded negative image:
\begin{equation}
    \mathcal{L}_{\text{triplet}} = \sum_{i=1}^{N} \left[ \max(0, S_i^{\text{anchor}} - S_i^{\text{positive}} + m) + \max(0, S_i^{\text{anchor}} - S_i^{\text{negative}} + m) \right]
\end{equation}
where \( m \) is a dynamic margin based on artifact severity.}{R1\\\hspace*{1.2em}R3}{ADDED}

\Added{While triplet loss ensures relative ranking between clean and artifact images, it does not enforce absolute scale consistency.
L2 loss stabilizes the training by anchoring clean images to low scores, preventing score drift.
The margin \( m \) in the triplet loss adapts dynamically based on artifact severity.
For high-severity artifacts, \( m \) is larger, ensuring clearer separability between clean and degraded images.
This approach acts as a form of hard negative mining, where the model prioritizes distinguishing the most challenging cases.}{R1\\\hspace*{1.2em}R3}{ADDED}

\section{Experiments and Results}
\subsection{Public Dataset}
We first evaluated our model on a sample from the OASIS dataset~\cite{marcus2007oasis1} ($N=20$).
%obtaining a mean and standard deviation $0.1722 \pm 0.47$ across the dataset.
%We first evaluated our model on the OASIS dataset, with the mean and standard deviation of the scores on the OASIS images was $0.1722 \pm 0.47$.
Based on our model scores during inference and visual interpretation, a value of $1$ is a reasonable threshold for the model scoring.
Anything with a score under $1$ is considered to be of sufficient quality for subsequent processing.
Anything with a higher score than $1$ should be excluded from processing or undergo correction steps dependent on the artifact type.
To assess the model’s ability to rank artifact severity, we simulated artifacts on the OASIS images resulting in evaluation of 120 images.
%Figure~\ref{fig:oasis_simulated} illustrates a progression of artifact severity levels.
We compared our model’s scores with MRIQC~(v25.0.0) IQMs (Table~\ref{tab:artifact_table}).
Our model consistently ranked images by artifact severity, whereas MRIQC’s IQMs showed inconsistent trends across different artifacts.
Notably, MRIQC processing failed to report IQMs on images of T$_2$-w contrast and high levels of simulated artifacts.
This restricted our experiments to one modality and a single SS per artifact type, rather than testing a variety of severities per artifact.
\Added{We performed all computations on a system with a 16-core processor running at 3.22 GHz per core and 251.66 GB of RAM.
On successful cases, MRIQC took between 7 and 9 minutes per volume.
Our model took about $1$ second per volume.}{R1\\\hspace*{1.2em}R3}{ADDED}

%\begin{figure}[!tb]
%\centering
%\includegraphics[width=0.8\textwidth]{MIDLLatexTemplate-master/figs/sub30001-range.png} 
%\caption{Images from the OASIS dataset with a range of simulated artifacts ranked by the calculated severity.}
%\label{fig:oasis_simulated}
%\end{figure}

%\cbstart
\begin{table}[!ht]
\centering
\caption{The specified artifact type is added with the specified severity score~(SS) as outlined in Table~\ref{tab:parameters}. We compare the MRIQC statistics against the score from our model ($N=20$ for each artifact type). The arrows indicate the direction of improving image quality. We report the Pearson correlation coefficient between the SS and each of the MRIQC IQMs and our model score in the last row of the table.}
\resizebox{0.9\textwidth}{!}{
\begin{tabular}{lc c cccc c c}
\toprule
\multirow{2}{*}{\begin{minipage}{6em}\textbf{Artifact} \textbf{Type}\end{minipage}} & \multirow{2}{*}{\textbf{SS}} && \multicolumn{4}{c}{\textbf{MRIQC}} && \multirow{2}{*}{\textbf{Ours} $\bm{\downarrow}$} \\
\cmidrule{4-7}
&&& \textbf{CJV} $\bm{\downarrow}$ & \textbf{CNR} $\bm{\uparrow}$ & \textbf{EFC} $\bm{\downarrow}$ & \textbf{FBER} $\bm{\uparrow}$ & \\
\midrule
None & 0.0 && 0.77$\pm$0.13 & 1.14$\pm$0.25 & 0.49$\pm$0.05 & 6962$\pm$2097 &&0.17$\pm$0.47 \\
Bias & 0.1 && 0.80$\pm$0.19 & 1.08$\pm$0.27 & 0.51$\pm$0.60 & 2367$\pm$1572 && 0.01$\pm$0.03 \\
Motion & 0.3 && 0.87$\pm$0.19 & 1.05$\pm$0.27 & 0.51$\pm$0.05 & 5501$\pm$2010 && 1.74$\pm$0.49 \\
Anisotropy & 0.6 && 0.78$\pm$0.05 & 1.32$\pm$0.23 & 0.52$\pm$0.05 & 9876$\pm$3326 && 2.32$\pm$0.48\\
Ghosting & 0.8 && 1.03$\pm$0.18 & 0.82$\pm$0.12 & 0.53$\pm$0.06 & 4161$\pm$1852 && 2.34$\pm$0.60 \\
Noise & 0.9 && -- & -- & -- & -- && 3.54$\pm$0.06 \\
\midrule
\textbf{Pearson $r$} &&& {0.36} & {$-0.16$} & {0.13} & {0.35} && {0.92} \\
\bottomrule
\end{tabular}}
\label{tab:artifact_table}
\end{table}
%
%\cbend


\subsection{Private Clinical Dataset}
We further validated our model using 124 structural MR volumes from the TREAT-MS pragmatic, clinical trial~(NCT03500328).
These images were acquired following a standardized protocol but still exhibited substantial variation in image quality, particularly in resolution.
Many of the images are 2D acquisitions with varying resolutions.
Figure~\ref{fig:treatms-scores} shows clinically acquired images ranked by their scores.
Our model assigned scores close to 0 for high-quality images and higher scores for low-resolution 2D acquisitions, indicating strong detection of anisotropic resolution artifacts.
Based on our findings, we consider images with scores $\leq 1$ to be of sufficient quality for downstream processing.
Images with scores $>1$ should be flagged for review or undergo artifact correction (e.g., super-resolution, background removal).
\Added{$83\%$ of this private clinical dataset scored $>1$.
This was expected, as the majority of the images are clinical 2D acquisitions with thick slices.}{R3}{ADDED}
This evaluation demonstrates that our model effectively identifies artifact severity in both public and clinical datasets, providing interpretable scores that facilitate automated MR image quality assurance.

\begin{figure}[!tb]
\centering
\includegraphics[width=0.8\textwidth]{MIDLLatexTemplate-master/figs/treatms_results.png} 
\caption{Clinically acquired images from the TREAT-MS dataset. Although images are acquired at various sites following a standardized protocol, several low resolution 2D acquisitions are observed.}
\label{fig:treatms-scores}
\end{figure}

\section{Discussion and Conclusion}
In this study, we introduced an unsupervised, interpretable framework for QA in multi-contrast MR imaging.
By training on diverse data with simulated artifact transformations, our model produces scores that correlate with image quality, offering an efficient and scalable solution for QA workflows.
\Added{Our approach measures apparent image quality rather than anatomical realness.}{R2}{ADDED}
Unlike existing methods, our approach eliminates the need for manual labels or preprocessing. %, making it robust to inter-site variability and diverse imaging datasets.
The framework is compatible with T$_1$-w, T$_2$-w, T$_2$-w FLAIR, and PD images.
Although MRIQC can be run directly using its provided Docker image, we experienced many issues and limitations when running this program.
MRIQC requires data to be in BIDS format, which implies some level of preprocessing that we wish to avoid.
For structural images, it can only retrieve IQMs for T$_1$-w and T$_2$-w images, while our approach can handle T$_1$-w, T$_2$-w, FLAIR, and PD images.
However, MRIQC can also handle diffusion data, which our method currently cannot.
There is also a large variation in computational time between the two methods.
% MRIQC does several steps like bias field correction, registration, and segmentation resulting in a process that takes $7-9$ minutes per volume.
% Our model only requires about $1$ second per volume.
For many of the T$_2$-w images and images with a simulated artifact, MRIQC failed to complete---which we found was a common experience in the MRIQC discussion forums with no known solution.
This is the primary reason for our limited experiment and missing metrics for the noise artifact type in Table~\ref{tab:artifact_table}.
%
\Added{We did not observe much slice variation in the scoring of our model.
We attribute this to using only the middle $60\%$ of slices.}{R3}{ADDED}
A key feature of the framework is the threshold for artifact scores.
Based on empirical results, a threshold of $1$ was determined as a reasonable cutoff for distinguishing between high- and low-quality images. 
%Images scoring below $1$ are typically clean and can proceed directly to downstream processing, while those scoring above this threshold require either exclusion or targeted correction based on the artifact type.
%This threshold balances sensitivity and specificity, ensuring that most usable images are retained while minimizing the inclusion of artifacts that could compromise analysis.
During training, the margin in the triplet loss is derived from simulated artifact severity scores bounded between 0 and 1.
This margin emphasizes the distinction between clean and artifact-ridden images while allowing flexibility for the model’s outputs during inference. 
Consequently, the model’s unrestricted scoring capability results in a broader range of scores (0 to 6) during real-world application.
The simulated artifacts used in training currently focus on one artifact type at a time.
In practice, MR images can exhibit multiple overlapping artifacts.
We hypothesize that the model detecting multiple overlapping artifacts might be a reason for the broader range of scores during application.
Future work would incorporate combinations of simulated artifacts during training, making the model more robust and reflective of real-world data. 
Additionally, incorporating artifact-specific weighting into the scoring process could improve interpretability by assigning higher penalties to artifacts that are particularly detrimental to downstream analysis.
%An alternative approach could involve the model outputting an $N$-dimensional vector, where each channel represents the severity of a specific artifact type.
%This granularity would allow users to pinpoint the exact nature of artifacts and determine the most appropriate corrective actions.

The proposed framework has significant implications for large-scale studies, where it can streamline data preprocessing by automating QA and identifying low-quality images prior to analysis.
Its unsupervised nature ensures adaptability across different datasets without reliance on labeled training data.
This adaptability makes it particularly valuable for multi-site studies with heterogeneous imaging protocols and scanner hardware.
%Future directions include extending the framework to encompass a broader range of artifact types, integrating it into real-time imaging pipelines, and coupling artifact severity scores with downstream tasks like harmonization or artifact removal.
%Introducing optional normalization or scaling factors could enhance score interpretability across datasets while maintaining the flexibility of unrestricted scoring.
Overall, this work lays a robust foundation for advancing automated MR image QA and emphasizes the importance of interpretable, unsupervised learning solutions in medical imaging.
By providing artifact severity scores that are both flexible and actionable, this framework enables more efficient and reliable analyses, paving the way for enhanced QA workflows in both clinical and research settings.



% This is where the content of your paper goes.  Some random
% notes\footnote{Random footnotes are discouraged}:
% \begin{itemize}
% \item You should use \LaTeX \cite{Lamport:Book:1989}.
% \item JMLR/PMLR uses natbib for references. For simplicity, here, \verb|\cite|  defaults to
%   parenthetical citations, i.e. \verb|\citep|. You can of course also
%   use \verb|\citet| for textual citations.
% \item Eprints such as arXiv papers can of course be cited \cite{Hinton:arXiv:2015:Distilling}. We recommend using a \verb|@misc| bibtex entry for these as shown in the sample bibliography.
% \item You should follow the guidelines provided by the conference.
% \item Read through the JMLR template documentation for specific \LaTeX
%   usage questions.
% \item Note that the JMLR template provides many handy functionalities
% such as \verb|\figureref| to refer to a figure,
% e.g. \figureref{fig:example},  \verb|\tableref| to refer to a table,
% e.g. \tableref{tab:example} and \verb|\equationref| to refer to an equation,
% e.g. \equationref{eq:example}.
% \end{itemize}

% \begin{table}[htbp]
%  % The first argument is the label.
%  % The caption goes in the second argument, and the table contents
%  % go in the third argument.
% \floatconts
%   {tab:example}%
%   {\caption{An Example Table}}%
%   {\begin{tabular}{ll}
%   \bfseries Dataset & \bfseries Result\\
%   Data1 & 0.12345\\
%   Data2 & 0.67890\\
%   Data3 & 0.54321\\
%   Data4 & 0.09876
%   \end{tabular}}
% \end{table}

% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example}
%   {\caption{Example Image}}
%   {\includegraphics[width=0.5\linewidth]{example-image}}
% \end{figure}

% \begin{algorithm2e}
% \caption{Computing Net Activation}
% \label{alg:net}
%  % older versions of algorithm2e have \dontprintsemicolon instead
%  % of the following:
%  %\DontPrintSemicolon
%  % older versions of algorithm2e have \linesnumbered instead of the
%  % following:
%  %\LinesNumbered
% \KwIn{$x_1, \ldots, x_n, w_1, \ldots, w_n$}
% \KwOut{$y$, the net activation}
% $y\leftarrow 0$\;
% \For{$i\leftarrow 1$ \KwTo $n$}{
%   $y \leftarrow y + w_i*x_i$\;
% }
% \end{algorithm2e}

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This material is partially supported by the Johns Hopkins University Percy Pierre Fellowship~(Hays) and the National Science Foundation Graduate Research Fellowship under Grant No.~DGE-2139757~(Hays) and Grant No.~DGE-1746891~(Remedios).
Development is partially supported by FG-2008-36966~(Dewey), CDMRP W81XWH2010912~(Prince), NIH R01 CA253923~(Landman), NIH R01 CA275015~(Landman), the National MS Society grant RG-1507-05243~(Pham) and Patient-Centered Outcomes Research Institute~(PCORI) grant MS-1610-37115~(Newsome and Mowry).
The statements in this publication are solely the responsibility of the authors and do not necessarily represent the views of the Patient-Centered Outcomes Research Institute~(PCORI), its Board of Governors or Methodology Committee.}


\bibliography{MIDLLatexTemplate-master/cas-refs}


% \appendix

% \section{Proof of Theorem 1}

% This is a boring technical proof of
% \begin{equation}\label{eq:example}
% \cos^2\theta + \sin^2\theta \equiv 1.
% \end{equation}

% \section{Proof of Theorem 2}

% This is a complete version of a proof sketched in the main text.

\end{document}
