\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

%\usepackage{mwe} % to get dummy images
\usepackage{graphicx}
\usepackage{adjustbox}
\usepackage{hyperref} 
\usepackage{amsmath,amsfonts,bm} %math
\usepackage{multirow}
\usepackage{multicol}
\usepackage{soul}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- 168}
\editors{Accepted for publication at MIDL 2024}

\title[Erase to Enhance]{Erase to Enhance: Data-Efficient Machine Unlearning in MRI Reconstruction}


\midlauthor{\Name{Yuyang Xue\nametag{$^{1}$}} \Email{yuyang.xue@ed.ac.uk}\\
  \Name{Jingshuai Liu\nametag{$^{1}$}} \Email{jliu11@ed.ac.uk}\\
  \Name{Steven McDonagh\nametag{$^{1}$}} \Email{s.mcdonagh@ed.ac.uk}\\
  \Name{Sotirios A. Tsaftaris\nametag{$^{1}$}} \Email{s.tsaftaris@ed.ac.uk}\\
\addr $^{1}$ School of Engineering, The University of Edinburgh, Edinburgh, EH9 3FG, UK
}

\begin{document}

\maketitle

\begin{abstract}
Machine unlearning is a promising paradigm for removing unwanted data samples from a trained model, towards ensuring compliance with privacy regulations and limiting harmful biases.
Although unlearning has been shown in, e.g., classification and recommendation systems, its potential in medical image-to-image translation, specifically in image reconstruction, has not been thoroughly investigated.
This paper shows that machine unlearning is possible in MRI tasks and has the potential to be of benefit for bias removal.
We set up a protocol to study how much shared knowledge exists between datasets of different organs, allowing us to effectively quantify the effect of unlearning.
Our study reveals that combining training data can lead to hallucinations and reduced image quality in the reconstructed data.
We use unlearning to remove hallucinations as a proxy exemplar for the removal of undesirable data. 
We show that machine unlearning is possible without full retraining.
Furthermore, our observations indicate that maintaining high performance is feasible even when using only a subset of retain data. 
We have made our \href{https://github.com/YuyangXueEd/ReconUnlearning}{code} publicly available.
\end{abstract}

\begin{keywords}
Machine Unlearning, MRI Reconstruction, De-biasing
\end{keywords}

\section{Introduction}

As the dependence on data-driven learning grows, model generalisation remains a key challenge. 
Common strategies include the aggregation of multiple data sets to expand the available data distributions, as a means of improving generalisation \cite{teney2021unshuffling}. 
%- What's the problem?
However, such data agglomeration raises concerns about privacy, perpetuation of biases, and, in the case of image reconstruction, may generate hallucinations. 
In particular, successful magnetic resonance imaging (MRI) reconstruction of distinct organ types, under a single trained model, requires generalisation abilities. 
We corroborate recent work and find that model generalisation can be strengthened using data aggregation (Sect.~\ref{sect:experiment:data}). 
However, such aggregation leads to hallucinations that manifest themselves as false structures and artifacts in the reconstructed images \cite{darestani2021measuring}. 
Crucially, such resulting effects harm the clinical diagnostic value \cite{zbontar2018fastmri}. \figureref{fig:fig1} shows an example of how learning from combined brain and knee data training can result in unwanted hallucinations in structures such as the corpus callosum, which could potentially lead to misdiagnoses.

While machine unlearning has been shown to remove unwanted data and classes in other tasks (e.g., classification), research on the integration of machine unlearning into image reconstruction remains, at best, nascent \cite{nguyen2022survey}. 

\begin{figure}[t] 
\floatconts
  {fig:fig1}
  {
  \caption{
  The original model $G$ trained with combined brain and knee data shows hallucinations (red circles).
  The unlearned model $G^U$ can remove artifacts originating from such an anatomy shift, reducing the overall reconstruction error.
  }  
}
  {\includegraphics[width=0.8\linewidth]{imgs/img2.pdf}}  
\end{figure}
%- What's the gap?
This paper sets up a protocol to analyse what models can learn about datasets from different distributions and uses this protocol to study machine unlearning. 
The protocol consists of training MRI reconstruction models with data combined from different anatomies, as obtained with multiple coils, which we have found can produce unwanted artefacts (see~\figureref{fig:fig1}). 
We find the ``knowledge gap'' between the two datasets to effectively quantify this anatomy shift. 
We then propose suitable unlearning models that can remove such a shift. 
Thus, we use this setup as a proxy exemplar for unwanted data removal\footnote{Ideally, we would consider patient data with unique characteristics, e.g., patients with titanium implants, abnormal anatomy, etc. 
Such patient data would have been ideal for studying unlearning but unfortunately open datasets with real $k$-space data are lacking.
}.
We highlight the task difficulty: existing well-defined settings of unlearning focus on classification and whole class removal. 
Instead, our task not only deals with a generative image-to-image task but also requires the effective removal of misgenerated structure whilst accurately preserving the anatomical features of a patient's body.
%- What have you done?
To the best of our knowledge, we are the first to establish unlearning algorithms for a reconstruction task c.f.~previously considered classification tasks. 
%- What do you contribute?
Our contributions can be summarised as follows: (1) We propose a formal problem formulation and definition of machine unlearning within the MRI reconstruction domain. (2) We adapt machine unlearning approaches to MRI reconstruction and verify that successful machine unlearning allows for effective artifact removal and improves reconstruction quality. 
(3) In contrast to existing approaches, which necessitate access to an entire dataset, we demonstrate that unlearning algorithms can operate effectively with only a restricted fraction of data and achieve data-efficient unlearning for MRI reconstruction. 
%\end{itemize}

\section{Related Works}

\textbf{MRI Reconstruction} reconstructs medical images from acquired Fourier domain samples, known as the $k$-space. 
Deep learning reconstruction enables high-quality image reconstruction from undersampled $k$-space data.
This is achieved by training networks to capture complex patterns and features within MRI data \cite{schlemper2017deep}, learning mappings between the under- and fully sampled $k$-space \cite{akccakaya2019scan}, or directly in the image domain \cite{wang2016accelerating}. 
However, recent research shows that such models produce artifacts, overlook small structural changes, and have a decrease in performance under distribution shifts \cite{antun2020instabilities,darestani2021measuring}. 
%\citet{ozturkler2023smrd} showed the robustness of a diffusion model against distribution shifts in anatomy shift and measurement noise.
Inspired by these challenges, we set up a proxy exemplar of data removal.

\noindent\textbf{Machine Unlearning} aims to address the need for data privacy and regulatory compliance \cite{liu2020have,xu2023machine}. 
It is the process of removing the influence of a set of data from a trained model, effectively ``forgetting'' this set without retraining the model from scratch \cite{ginart2019making,su2022patient}. 
Numerous studies have emerged using data-based \cite{bhadra2021hallucinations,tarun2023fast,graves2021amnesiac} or model-based algorithms \cite{neel2021descent,chourasia2023forget,jia2023model}.
However, these approaches are mainly suited to classification tasks. 
%Very recently and simultaneously with this work, 
Parallel to this work, an image-to-image machine learning framework for generative models was developed \citet{li2024machine}. 
The idea is simple and borrows from a class of unlearning algorithms for classification tasks (see noisy labelling approaches discussed in Section~\ref{sect:experiment}): they introduce random noise during image generation (of the diffusion model) to degrade the model performance on the forget data. 
Similar to many unlearning methods, it relies on having access to the full original database, which can be unrealistic in medical practice.
%, and generally falls into the category of . 

\noindent\textbf{Machine Unlearning in Medical Imaging} \citet{hartley2023neural} showed that networks could pose a risk of retaining and potentially revealing the distinctive characteristics of patients. 
\citet{liu2020have} used the Kolmogorov-Smirnov distance to detect whether a model has used/forgotten a query dataset. 
%Harmonisation \cite{dinsdale2021deep} propose a similar solution to combine data sets to reduce heterogeneity; however, it does not explicitly consider the downstream application.
\citet{su2022patient} explored patient-wise forgetting and revealed that forgetting medical image data from a patient is harder than other vision tasks.
These works highlight several challenges in medical imaging settings: models can easily learn unwanted relationships, and there is considerable similarity between medical data. 
While they don't present algorithms for image reconstruction, their findings help enforce why a suitable proxy exemplar is required,\footnote{We found that unlearning a ``single patient's $k$-space data'' is impossible to effectively quantify, it would not guarantee creating a significant performance gap to be observed.} why unlearning in image-to-image tasks is hard and why a delicate balance of unlearning and performance is required. 
Furthermore, as fully-sampled $k$-space data are extremely large and rare, unlearning methods that can relax the need for complete access to all training data should be explored. 


\begin{figure}[!t]  

\floatconts 
  {fig:fig2}
  {
  \caption{Machine unlearning in MRI reconstruction overview. 
  The oracle model and the original model are trained on retain set and composite set (retain + forget), respectively. 
  Taking advantage of the original model by employing an unlearning algorithm can quickly adapt to data removal requests instead of retraining.
  }}
  {\includegraphics[width=0.7\linewidth]{imgs/img1.pdf}} 
\end{figure}

\section{Methodology}

\textbf{Our main goal} is to effectively investigate machine unlearning for MRI reconstruction. 
We set up a proxy exemplar where the task is to reduce artefacts and hallucinations without reducing diagnostic precision. 
Utilising unlearning algorithms, we systematically erase the influence of a forget dataset from the model by evaluating image performance. 
We evaluate the integrity of reconstructed images and found that the performance of the models remains uncompromised after unlearning. 
Lastly, we demonstrate the practical implementation of our methodology on MRI data that limits the size of the training set, highlighting its potential for real-world application. 
The overview of the pipeline is shown in \figureref{fig:fig2}.

\subsection{Definitions of Unlearning in the Context of Reconstruction Tasks}

We address the scenario where a reconstruction model $G$, parameterised by $\theta$, is initially trained on a combined dataset that comprises both a retain set $\mathcal{D}_r$ and a forget set $\mathcal{D}_f$, optimised via the loss function $\mathcal{J}(\theta)$. 
We posit the existence of an ideal oracle model $\hat{G}$, exclusively trained on $\mathcal{D}_r$ and thus uncontaminated by $\mathcal{D}_f$. 
The objective of the unlearning algorithm $\mathcal{A}$, applied to $G$, is twofold: (1) \textit{ to preserve or enhance the performance of the model in the retain set distribution $\mathcal{D}_r$}, ideally achieving the upper performance bound of $\hat{G}$ and (2) based on the premise of not destroying structural integrity, \textit{to eliminate the impact of $\mathcal{D}_f$ as much as possible}, that is, to reduce the performance in the forget distribution.
We call the unlearned model $G^U$.
In practical applications, accessibility to the training dataset is usually restricted due to privacy or large storage requirements, the oracle model $\hat{G}$ cannot be retrained, forcing the unlearning algorithm $\mathcal{A}$ to operate on $G$ with the $\mathcal{D}_f$ alone and with limited access to a subset of retain data $\mathcal{D}_r'$. 
We selected two test sets to measure the effect of unlearning, namely $\mathcal{D}_r^t$ of the retain distribution and $\mathcal{D}_f^t$ of the forget distribution.

\subsection{Protocol Setup}

We present a proof-of-concept protocol for machine unlearning for MRI reconstruction. 
We treat the brain as retain data $\mathcal{D}_r$ and the knee as forget data $\mathcal{D}_f$.
An oracle model $\hat{G}$ trained exclusively on multi-coil brain MRI data $\mathcal{D}_r$ establishes the gold standard for evaluating performance for unlearning algorithms. 
To simulate a real-world scenario, we then train the original model $G$ on a combined dataset comprising data from $\mathcal{D}_r$ and $\mathcal{D}_f$, each of which has a different anatomy.  
Subsequent application of unlearning algorithms removes the influence of knee data incorporated during the original training phase.

There is no precedent in evaluating the quality of forgetting in MRI reconstruction;
therefore, we employ image quality assessment metrics as indicators of successful unlearning. 
An increase in performance on $\mathcal{D}_r$, accompanied by a decrease on $\mathcal{D}_f$, suggests that the model is redirecting its attention towards $\mathcal{D}_r$, the desired in-distribution data. 
Compared to pre-unlearning performance, this shift in the model manifests itself through improved image quality both quantitatively and qualitatively.
To evaluate the efficacy of the unlearning process, we performed comparative analyses using several baselines and unlearning methods. 
The goal is to discern which algorithm achieves a balance of efficient unlearning while maintaining, or ideally enhancing, the model performance on $\mathcal{D}_r^t$. 
Since the unlearning process should be acute and accurate, rather than retraining the entire model, we restrict the unlearning steps to $10\%$ of the training epochs. 
This proof-of-concept underscores the potential of machine unlearning to address bias removal in AI models, and serves as a basis for future research in deploying machine unlearning within medical imaging domains.

\subsection{Machine Unlearning Methods for Reconstruction} 

We introduce machine unlearning into MRI reconstruction, adapting three major techniques, originally developed for classification tasks, to suit an image-to-image reconstruction context. All algorithmic details are found in the Appendix.

\indent \textbf{Fine-tuning (FT)} is the most widely deployed unlearning strategy, often used to address distribution shifts and adapt models to new distributions. 
In machine unlearning, the fine-tuning process is only implemented on $\mathcal{D}_r$, using a loss function $\mathcal{J}(\theta, \mathbf{x}, \mathbf{y})$, as

\begin{equation}
    \mathcal{L}_{FT} = \mathcal{J}(\theta, \mathbf{x}_r,\mathbf{y}),  \mathrm{\ } \mathbf{x}_r\in \mathcal{D}_{r},
    \label{eq:FT}
\end{equation}

\noindent where $\mathcal{J}(\theta, \mathbf{x},\mathbf{y})$ denotes the reconstruction loss, normally $\mathcal{L}_1$ between the undersampled input $\mathbf{x}$ and the ground truth $\mathbf{y}$, and $\theta$ refers to the model parameters. 
We use the same notation for the methods in the following. 
However, as fine-tuning does not impose any penalty on the forget data, the unlearning tends to be less efficient than other methods. 

\indent \textbf{Gradient Ascent (GA)} maximises the training loss w.r.t.~forget samples, effectively decreasing model accuracy for knowledge removal \cite{halimi2022federated,warnecke2021machine}. 
However, without any restrictions, the model tends to forget all the knowledge obtained before unlearning.
We hence add $\ell_1$-regularisation (\textit{GA-$\ell_1$}) to constrain the weights to prevent divergence, which encourages sparsity in the model weights during training \cite{jia2023model}. 
Sparsity can lead to simpler and more interpretable models that may perform better when it comes to generalisation on unseen data by reducing overfitting. The regularised GA loss is defined by 

\begin{equation}
\mathcal{L}_{GA_{\ell_1}} = -\mathcal{J}(\theta, \mathbf{x}_f,\mathbf{y}) + \gamma\|\theta\|_1, \ \mathbf{x}_f\in \mathcal{D}_{f}.\label{eq:GA}
\end{equation}

\indent \textbf{Noisy Labelling (NL)} minimises the training loss by reassigning the label of forget samples by adding Gaussian noise with a hyperparameter $\lambda$ indicating the amount of noise, thus removing the mapping of the input to the OOD images \cite{graves2021amnesiac,gandikota2023erasing}. 
In MRI reconstruction, a false mapping to the target would lead to a considerable drop in global performance.
We change the forget set labels as follows: 

\begin{equation}\label{eq:NL}
\mathcal{L}_{NL} = \mathcal{J}(\theta, \mathbf{x}_f, [\mathbf{y}+\lambda\mathcal{N}(0, I)]), \  \mathbf{x}_f\in \mathcal{D}_{f}.
\end{equation}

\section{Experiments}
\label{sect:experiment}

\subsection{Dataset}
\label{sect:experiment:data}
We use FastMRI dataset \cite{knoll2020fastmri} to reconstruct multiple coil brain and knee images. 
We randomly selected T2-weighted brain volumes, yielding 5,898 slices in 1,000 volumes as the full retain dataset $\mathcal{D}_r$. 
The forget set $\mathcal{D}_f$ has 536 randomly selected knee image slices. 
We adhered to a retain-to-forget set ratio of 10:1. 
For validation, a separate subset was created by randomly choosing 1,198 slices.
Two test sets are 100 volumes of brain and knee data, respectively.
For data processing, an equispaced Cartesian sampling pattern was applied to simulate unacquired $k$-space lines. 
The acceleration rate and the centre fraction rate were set to 8-fold and $0.04$, respectively.

\subsection{Training Details}
\label{sect:experiment:training}
We selected the E2E-VarNet model \cite{sriram2020end}, which processes masked multi-coil $k$-space data as input, as the main network for reconstruction.
The Sensitivity Map Estimation (SME) module inside computes sensitivity maps from the input $k$-space, together with a sequence of UNet cascades that iteratively refine outputs.
We follow the training protocol of the original implementation, with 12 cascades, resulting in 29.9M total parameters.
Models were trained using the Adam optimiser with a learning rate of 0.001 over 50 epochs without any data augmentation.
We used the PyTorch Lightning framework on a NVIDIA A100 Tensor Core GPU.
Image quality was evaluated using peak signal-to-noise ratio (PSNR) and structural similarity index (SSIM) \cite{wang2004image} as metrics.

\subsection{Unlearning Instantiating and Results}
\label{sect:experiment:results}
We analysed the experiments in three dimensions \cite{jia2023model}: accuracy on brain test data $\mathcal{D}_r^t$ as brain test accuracy (BTA), aiming to demonstrate the capacity to retain the learned target knowledge, accuracy on knee test data $\mathcal{D}_f^t$, denoted by knee test accuracy (KTA) and serving as a metric to gauge the efficacy of unlearning algorithms, and additionally accuracy on forget data $\mathcal{D}_f$ as unlearning accuracy (UA) which is used to confirm the unlearning quality when combining it with KTA.
We also added run-time efficiency (RTE) as another metric for the evaluation of the unlearning algorithm.
Performances are highlighted in \tableref{tab:retrain} and \figureref{fig:fig4} shows a radar chart of comparison of all algorithms.

\textbf{Oracle vs.\ Original} To show the domain gap existing between brain and knee data and hence quantify the influence of the forget set $\mathcal{D}_f$ on the reconstruction model, we compared the original model $G$, trained on both $\mathcal{D}_r$ and $\mathcal{D}_f$, and the oracle model $\hat{G}$, trained solely on brain data $\mathcal{D}_r$ to represent an upper bound for unlearning methods. 
We can see from \tableref{tab:retrain} that these two models show differences in terms of evaluation metrics for each organ, with the oracle $\hat{G}$ better on brain data and worse on knee data. This is expected given the domain shift, but the gap of about $5 \mathrm{dB}$ on knee data implies the distribution dissimilarities between the two organs.
The findings highlight the need for machine unlearning to counteract domain-related issues and maintain reliable image reconstruction quality.

\begin{table}[t]
\floatconts
    {tab:retrain}%
    {\caption{Quantitative evaluation of unlearning methods. All models were tested on brain test accuracy (BTA), knee test accuracy (KTA), and unlearning accuracy (UA). An ideal unlearning outcome should be \textbf{as close as possible} to the performance of oracle model $\hat{G}$, highlighted in bold text. $\lambda$ is set to $10^{-5}$ for \textit{NL} and \textit{NL-FT}. Except for \textit{Full FT}, unlearning methods were implemented with 10\% retain data. }}%
  {
  \begin{adjustbox}{width=\textwidth} 
    \begin{tabular}{l|cc|cc|cc}\cline{1-7}
    \multirow{2}*{{\bfseries Method}}&\multicolumn{2}{c|}{Brain\ $\mathcal{D}_r^t$ (BTA)} & \multicolumn{2}{c|}{Knee\  $\mathcal{D}_f^t$ (KTA)} & \multicolumn{2}{c}{Unlearning\ $\mathcal{D}_f$ (UA)}\\ \cline{2-7}
    ~ & \bfseries PSNR     & \bfseries SSIM     & \bfseries PSNR  & \bfseries SSIM & \bfseries PSNR  & \bfseries SSIM\\ \cline{1-7}
    \textit{$G$}   & 39.07$\pm$2.03             & 0.964$\pm$0.018            & 36.65$\pm$2.44         & 0.894$\pm$0.059 & 37.13$\pm$2.19 & 0.905$\pm$0.045\\
    \textit{$\hat{G}$}     & \textbf{39.36}$\pm$2.04    & \textbf{0.965}$\pm$0.017    & \textbf{31.82}$\pm$2.76     & \textbf{0.834}$\pm$0.065  & \textbf{31.74}$\pm$2.12 &\textbf{0.892}$\pm$0.052 \\ 
     \hline\hline
     \multicolumn{7}{c}{Fine-tuning}\\\hline
    \textit{FT}     & \textbf{39.18}$\pm$2.16             & \textbf{0.965}$\pm$0.019            & 36.67$\pm$2.39         & 0.893$\pm$0.059 & 37.18$\pm$2.18 &0.905$\pm$0.045\\
    \textit{Full FT}    & 39.12$\pm$2.02             & \textbf{0.965}$\pm$0.017            & 36.26$\pm$2.34         & 0.889$\pm$0.059 & 36.60$\pm$2.07 &0.901$\pm$0.046\\ 
     \hline\hline
     \multicolumn{7}{c}{Unlearning Algorithms}\\\hline
    \textit{GA-$\ell_1$}       & 16.19$\pm$2.38             & 0.172$\pm$0.035            & 14.15$\pm$3.10         & 0.156$\pm$0.067 & 13.40$\pm$2.63 &0.162$\pm$0.056\\
    \textit{NL}         & 16.44$\pm$1.58             & 0.502$\pm$0.035            & 17.39$\pm$2.89         & 0.421$\pm$0.089 & 17.04$\pm$2.28 &0.428$\pm$0.072\\    
    \textit{GA-$\ell_1$-FT}    & 31.81$\pm$2.09             & 0.910$\pm$0.025            & 27.09$\pm$2.49         & 0.735$\pm$0.074 & 26.90$\pm$1.57 &0.739$\pm$0.025\\
    \textit{NL-FT}      & 38.24$\pm$1.94             & 0.956$\pm$0.019            & \textbf{31.18}$\pm$4.28         & \textbf{0.863}$\pm$0.068 & $\textbf{31.72}\pm4.14$ &\textbf{0.875}$\pm$0.053\\ \cline{1-7}
  \end{tabular}
  \end{adjustbox}
  }
\end{table} 

\begin{figure}[b]  
\floatconts 
  {fig:fig4}
  {
  \caption{Unlearning approaches vs.\ oracle model $\hat{G}$. Unlearning accuracy (UA), brain test accuracy (BTA), and knee test accuracy (KTA) are shown in PSNR. The reciprocal of run-time efficiency (RTE) is normalised to $[0, 1]$ for ease of visualisation. \textit{FT} and \textit{NL-FT} achieve the best, closest to oracle with the highest RTE.}}
  {\includegraphics[width=0.8\linewidth]{imgs/img4.pdf}} 
\end{figure}

\textbf{Fine-tuning does not unlearn} Fine-tuning aims to unlearn the forget set by shifting the learned data distribution towards the target domain. 
Two levels of fine-tuning are selected. 
One is to fine-tune $G$ on the whole $\mathcal{D}_r$, termed \textit{Full FT}, which requires us to have access to large-scale training data which is impractical. 
The other is to fine-tune the model on a small subset of target data, $\mathcal{D}_r'$, termed \textit{FT}. 
Both \textit{Full FT} and \textit{FT} marginally enhance the performance of the model on the target brain dataset.
However, the ability to reconstruct knee images remains unaffected or, in the case of \textit{FT}, even improved.
This indicates that fine-tuning is insufficient in discarding knowledge of the forget set. 
These findings have inspired the integration of unlearning algorithms into the fine-tuning process.

\textbf{Combining unlearning and fine-tuning} 
We used \textit{GA-$\ell_1$} and \textit{NL} as unlearning algorithms.
\textit{GA-$\ell_1$} fine-tuned with $\mathcal{D}_r'$ is referred to as \textit{GA-$\ell_1$-FT}, and the other is designated as \textit{NL-FT}. 
We noted from \tableref{tab:retrain} that \textit{NL} and \textit{GA-$\ell_1$} lower the metrics for the knee test data, suggesting their effectiveness in the unlearning of domain knowledge. 
However, these methods also substantially deteriorated the reconstruction fidelity of brain images, indicating that unlearning algorithms, if applied solely with $\mathcal{D}_f$, can inadvertently compromise vital information in the target domain.
Methods \textit{GA-$\ell_1$-FT} and \textit{NL-FT} combine fine-tuning to maintain the learned knowledge in the target domain during their unlearning steps, using the knowledge from $\mathcal{D}_r'$ to steer the unlearning direction toward the retain data distribution, as illustrated in Appendix \figureref{fig:fig5}. 
According to the results in \tableref{tab:retrain}, both methods can effectively achieve model unlearning in a reasonable range, with \textit{NL-FT}, which could be a promising unlearning method, showing a superior ability to maintain the reconstruction integrity compared to \textit{GA-$\ell_1$-FT}.
More details are provided in Appendix \tableref{tab:detailed_testing}.

\textbf{Do we need all retain data?} We evaluate whether having full access to the retain dataset results in a more effective unlearning compared to having limited access practically to a portion of the retain dataset. 
For these experiments we used a smaller retain dataset $\mathcal{D}_r'$, at percentages 1\%, 5\%, 10\%, 20\%, 50\% of the full retain dataset $\mathcal{D}_r$. 
We evaluated performance considering two primary factors: run-time efficiency and PSNR of the brain test set $\mathcal{D}_r^t$.
We expect that the more retain (in-distribution) data we used for fine-tuning, the better the results should be. 
Unexpectedly, a decrease in PSNR was observed when the model was fine-tuned using the complete $\mathcal{D}_r$. 
\figureref{fig:fig3} in the Appendix depicts the results on \textit{NL-FT} and \textit{GA-$\ell_1$-FT} using a second-degree polynomial fitting. 
It shows that, to some extent, as the amount of retained set data increases, the quality of image reconstruction does not improve, but rather deteriorates. More information is in Appendix \tableref{tab:portion}.

\section{Conclusion}
\label{sect:conclusion}
Our study is the first to investigate the application of machine unlearning to MRI reconstruction, marking significant progress in medical imaging research with respect to artifacts reduction without comprehensive retraining. 
Our findings underscore the potential of unlearning approaches to improve image quality and preserve data integrity by fine-tuning in a data-efficient way. 
Further investigation is necessary to understand the decrease in using larger retain datasets and to determine the optimal subset size for successful unlearning. 
This work lays the groundwork for future research to extend these methods to more complex datasets and enhance their efficiency in clinical settings, facilitating accuracy for clinical decision-making and the ethical standards demanded by society. 
\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{
Y. Xue thanks additional financial support from the School of Engineering, the University of Edinburgh. S.A. Tsaftaris also acknowledges the support of Canon Medical and the Royal Academy of Engineering and the Research Chairs and Senior Research Fellowships scheme (grant RCSRF1819\textbackslash8\textbackslash25), of the UK’s Engineering and Physical Sciences Research Council (EPSRC) (grant EP/X017680/1) and the National Institutes of Health (NIH) grant 7R01HL148788-03. We also thank Jinghan Sun for the help.
}


\bibliography{midl24_168}

\newpage

\appendix

\section{Unlearning Algorithms}

We list the algorithm for the unlearning algorithm adaptation for \textit{GA-$\ell_1$-FT} and \textit{NL-FT}. \figureref{fig:fig5} gives a illustration of regarding the issue of using fine-tuning to effectively suppress gradient ascent and cause the model to diverge and collapse.

\begin{figure}[!h] 
\floatconts
  {fig:fig5}
  {
  \caption{The gradient ascent (\textit{GA-$\ell_1$-FT}) and gradient ascent with fine-tuning (\textit{NL-FT}).}  
}
  {\includegraphics[width=\linewidth]{imgs/img5.pdf}}  

  
\end{figure}
\begin{algorithm2e}[!h]
    \caption{Gradient Ascent with $l_1$-regularisation Unlearning with Fine-tuning}
    \label{alg:GAL1}
    \LinesNumbered
    Samples: {$\mathbf{x}_i, \mathbf{y}_i \in \mathcal{D}_{f}, \hat{\mathbf{x}}_j, \hat{\mathbf{y}}_j \in \mathcal{D}_{r}$} \\
    Network parameters: $\theta$ \\
    \For{$\mathbf{x}_i \in \mathcal{D}_{f}, \bar{\mathbf{x}}_j \in \mathcal{D}_{r}$}{
     $L = - \mathcal{L}_{GA_{\ell_1}}(\mathbf{x}_i, \mathbf{y}_i) + {L}_{FT}(\hat{\mathbf{x}}_j, \hat{\mathbf{y}}_j)$ \\
     $\theta \leftarrow \theta - \eta \nabla_\theta L$
    } 
    \end{algorithm2e}

\begin{algorithm2e}[!h]
    \caption{ Noisy Labelling Unlearning with Fine-tuning}
    \label{alg:NL}
    Samples: {$\mathbf{x}_i \in \mathcal{D}_{f}, \hat{\mathbf{x}}_j, \hat{\mathbf{y}}_j \in \mathcal{D}_{r}$} \\
    Network parameters: $\theta$ \\
    Hyperparameter: $\lambda$\\
    \For{$\mathbf{x}_i \in \mathcal{D}_{f}, \bar{\mathbf{x}}_j \in \mathcal{D}_{r}$}{
     $L = \mathcal{L}_{NL}(\mathbf{x}_i, \mathbf{y}_i + \lambda \mathcal{N}(0, I)) + {L}_{FT}(\hat{\mathbf{x}}_j, \hat{\mathbf{y}}_j)$ \\
     $\theta \leftarrow \theta - \eta \nabla_\theta L$
    }
\end{algorithm2e}

\section{Unlearning Experiment Results in Details}

We give detailed results for all the related experiments we conducted. 
\tableref{tab:detailed_testing} recorded the extra 5 epochs of unlearning and we take the last checkpoint's performance to show on \tableref{tab:retrain}. The ablation study of the proportions of retain data set we shall use during our unlearning algorithm for fine-tuning is shown in \tableref{tab:portion}. 

\begin{figure}[!t]  
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:fig3}
  {
  \caption{The Pareto optimum to achieve high unlearning efficiency may be found in the fitting curve of the BTA to unlearning time, which is directly related to retain sample usage.}  
}
  {\includegraphics[width=0.8\linewidth]{imgs/img3.pdf}}  
\end{figure}

\begin{table}[!h]
\floatconts
  {tab:detailed_testing}%
  {\caption{Detailed test results for different unlearning algorithms, shows the unlearning dynamics as well. }}%
  {\begin{tabular}{c|c|cc|cc}\cline{1-6}
    \multirow{2}*{{\bfseries Method}}&\multirow{2}*{{\bfseries Epochs}}&\multicolumn{2}{c|}{Brain\ $\mathcal{D}_r^t$ (KTA)} & \multicolumn{2}{c}{Knee\  $\mathcal{D}_f^t$ (KTA)}  \\ 
    ~&~     & \bfseries PSNR  & \bfseries SSIM & \bfseries PSNR  & \bfseries SSIM\\ \cline{1-6}
    \textit{$G$}   & 50& 39.07$\pm$2.03             & 0.964$\pm$0.018            & 36.65$\pm$2.44         & 0.894$\pm$0.059 \\
   \textit{$\hat{G}$}     &  50&  \textbf{39.36}$\pm$2.04    & \textbf{0.965}$\pm$0.017    & \textbf{31.82}$\pm$2.76     & \textbf{0.834}$\pm$0.065  \\ 
     \hline\hline
    \multirow{5}*{\rotatebox{90}{\textit{Full FT}}}&51     & 39.080$\pm$2.01 & 0.9640$\pm$0.018  & 36.624$\pm$2.40 & 0.8931$\pm$0.059 \\
    ~&52     & 39.008$\pm$2.00 & 0.9646$\pm$0.017 & 36.406$\pm$2.39 & 0.8915$\pm$0.059 \\
    ~&53     & 39.024$\pm$2.01 & 0.9649$\pm$0.018 & 36.173$\pm$2.34 & 0.8898$\pm$0.059 \\
    ~&54     & 39.207$\pm$2.01 & 0.9651$\pm$0.017 & 36.267$\pm$2.38 & 0.8905$\pm$0.059 \\
    ~&55     & 39.119$\pm$2.02 & 0.9651$\pm$0.017 & 36.260$\pm$2.34 & 0.8894$\pm$0.059 \\\hline
    \multirow{5}*{\rotatebox{90}{\textit{FT}}}&51 & 39.278$\pm$2.03  & 0.9651$\pm$0.017  & 36.756$\pm$2.40 & 0.8942$\pm$0.059 \\
    ~&52 & 39.140$\pm$2.02 & 0.9647$\pm$0.017  & 36.715$\pm$2.41 & 0.8938$\pm$0.059 \\
    ~&53 & 39.181+2.02    & 0.9649$\pm$0.017 & 36.706$\pm$2.41 & 0.8938$\pm$0.059 \\
    ~&54 & 39.224$\pm$2.03  & 0.9650$\pm$0.017  & 36.718$\pm$2.40 & 0.8935$\pm$0.059 \\
    ~&55 & 39.179$\pm$2.16 & 0.9647$\pm$0.019   & 36.672$\pm$2.39 & 0.8936$\pm$0.059 \\\hline
    \multirow{5}*{\rotatebox{90}{\textit{GA-$\ell_1$}}}&51 & 0.000$\pm$2.03  & 0.0019$\pm$0.001      & 0.000$\pm$2.91  & 0.0017$\pm$0.002      \\
    ~&52 & 0.000$\pm$2.42 & 0.0000$\pm$0.001 & 0.000$\pm$2.71 & 0.0000$\pm$0.001 \\
    ~&53 & 1.272$\pm$1.27   & 0.0073$\pm$0.007      & 0.007$\pm$2.77  & 0.0037$\pm$0.004      \\
    ~&54 & 2.357$\pm$2.36   & 0.0080$\pm$0.008      & 2.314$\pm$2.88   & 0.0069$\pm$0.006      \\
    ~&55 & 16.186$\pm$2.38  & 0.1724$\pm$0.035      & 14.148$\pm$3.10  & 0.1561$\pm$0.067  \\\hline
    \multirow{5}*{\rotatebox{90}{\textit{NL}}}&51 & 16.478$\pm$1.56 & 0.5026$\pm$0.034 & 17.436$\pm$2.90 & 0.4135$\pm$0.083 \\
    ~&52 & 16.442$\pm$1.56 & 0.4970$\pm$0.032 & 17.469$\pm$2.92 & 0.4189$\pm$0.090 \\
    ~&53 & 16.458$\pm$1.57 & 0.5173$\pm$0.035 & 17.430$\pm$2.92 & 0.4174$\pm$0.092 \\
    ~&54 & 16.472$\pm$1.58 & 0.5003$\pm$0.035 & 17.472$\pm$2.92 & 0.4217$\pm$0.094 \\
    ~&55 & 16.444$\pm$1.58 & 0.5015$\pm$0.035 & 17.385$\pm$2.89 & 0.8928$\pm$0.089 \\\hline
    \multirow{5}*{\rotatebox{90}{\textit{GA-$\ell_1$-FT}}}&51 & 30.626$\pm$1.95 & 0.9044$\pm$0.025 & 27.807$\pm$2.50 & 0.7546$\pm$0.070 \\
    ~&52 & 31.546$\pm$2.01 & 0.907$\pm$0.024  & 27.006$\pm$2.52 & 0.7403$\pm$0.070 \\
    ~&53 & 29.570$\pm$1.94 & 0.8632$\pm$0.033 & 27.952$\pm$2.55 & 0.7534$\pm$0.069 \\
    ~&54 & 31.744$\pm$2.00 & 0.9106$\pm$0.024 & 26.811$\pm$2.54 & 0.7247$\pm$0.072 \\
    ~&55 & 31.813$\pm$2.09 & 0.9106$\pm$0.025 & 27.094$\pm$2.49 & 0.7353$\pm$0.074  \\\hline
    \multirow{5}*{\rotatebox{90}{\textit{NL-FT}}}&
      51 & 37.592$\pm$1.98 & 0.9603$\pm$0.020 & 30.168$\pm$4.68 & 0.8514$\pm$0.071 \\
    ~&52 & 37.732$\pm$1.92 & 0.9612$\pm$0.019 & 30.365$\pm$4.47 & 0.8568$\pm$0.070 \\
    ~&53 & 37.953$\pm$1.93 & 0.9616$\pm$0.019 & 30.503$\pm$4.58 & 0.8573$\pm$0.069 \\
    ~&54 & 37.519$\pm$1.98 & 0.9596$\pm$0.018 & 30.595$\pm$4.42 & 0.8605$\pm$0.070 \\
    ~&55 & 38.243$\pm$1.94 & 0.9565$\pm$0.019 & 31.185$\pm$4.28 & 0.8630$\pm$0.068 \\\hline
  \end{tabular}}
  
\end{table}

\begin{table}[!h]
\floatconts
  {tab:portion}%
  {\caption{Detailed results for unlearning with various retain data $\mathcal{D}_r'$ proportions.}}%
  {\begin{tabular}{c|c|cc|cc}\cline{1-6}
    \multirow{2}*{{\bfseries Size}}&\multirow{2}*{{\bfseries Methods}}&\multicolumn{2}{c|}{Brain\ $\mathcal{D}_r^t$ (BTA)} & \multicolumn{2}{c}{Knee\  $\mathcal{D}_f^t$ (KTA)}  \\ 
    &~     & \bfseries PSNR       & \bfseries SSIM       & \bfseries PSNR        & \bfseries SSIM     \\ \cline{1-6}
  \multirow{2}*{{\textbackslash}}& \textit{$G$}   & 39.07$\pm$2.03             & 0.964$\pm$0.018            & 36.65$\pm$2.44         & 0.894$\pm$0.059 \\
    ~& \textit{$\hat{G}$}     & \textbf{39.36}$\pm$2.04    & \textbf{0.965}$\pm$0.017    & \textbf{31.82}$\pm$2.76     & \textbf{0.834}$\pm$0.065  \\ 
     \hline
    \multirow{3}*{\rotatebox{90}{\textit{1\%}}}&\textit{FT}         & 39.182$\pm$2.02 & 0.9649$\pm$0.018 & 36.743$\pm$2.40 & 0.8942$\pm$0.059 \\
    ~& \textit{NL-FT}    & 25.607$\pm$2.53 & 0.8800$\pm$0.027 & 31.167$\pm$4.96 & 0.4202$\pm$0.096 \\
    ~& \textit{GA-$\ell_1$-FT} & 25.385$\pm$1.38 & 0.7226$\pm$0.040 & 26.824$\pm$2.69 & 0.6922$\pm$0.068  \\\hline
    \multirow{3}*{\rotatebox{90}{\textit{5\%}}}&\textit{FT}         & 39.051$\pm$2.00 & 0.9614$\pm$0.018 & 36.728$\pm$2.50 & 0.8942$\pm$0.061 \\
    ~& \textit{NL-FT} & 28.877$\pm$2.78 & 0.9142$\pm$0.028 & 31.297$\pm$4.79 & 0.4491$\pm$0.078 \\
    ~& \textit{GA-$\ell_1$-FT} & 29.499$\pm$1.58 & 0.8440$\pm$0.025 & 25.924$\pm$2.46 & 0.6987$\pm$0.068  \\ \hline
    \multirow{3}*{\rotatebox{90}{\textit{10\%}}}&\textit{FT}          & 39.089$\pm$2.03 & 0.9621$\pm$0.019 & 35.452$\pm$2.64 & 0.8860$\pm$0.062  \\
    ~& \textit{NL-FT}   & 38.243$\pm$1.94 & 0.9565$\pm$0.019 & 31.185$\pm$4.28 & 0.8630$\pm$0.068 \\
    ~& \textit{GA-$\ell_1$-FT}  & 32.128$\pm$1.87 & 0.9179$\pm$0.022 & 30.173$\pm$2.44 & 0.8045$\pm$0.061  \\ \hline 
    \multirow{3}*{\rotatebox{90}{\textit{20\%}}}&\textit{FT}          & 39.070$\pm$2.01 & 0.9628$\pm$0.019 & 35.147$\pm$2.48 & 0.8845$\pm$0.061  \\
    ~& \textit{NL-FT}   & 38.828$\pm$2.05 & 0.9614$\pm$0.020 & 31.468$\pm$4.83 & 0.4577$\pm$0.085 \\
    ~& \textit{GA-$\ell_1$-FT}   & 33.746$\pm$1.96 & 0.9348$\pm$0.023 & 30.150$\pm$2.40 & 0.8085$\pm$0.062  \\ \hline
    \multirow{3}*{\rotatebox{90}{\textit{50\%}}}&\textit{FT}         & 39.048$\pm$1.99 & 0.9625$\pm$0.017 & 35.115$\pm$2.49 & 0.8844$\pm$0.061 \\
    ~& \textit{NL-FT}    & 38.484$\pm$1.98 & 0.9597$\pm$0.019  & 31.152$\pm$4.78 & 0.4591$\pm$0.078 \\
    ~& \textit{GA-$\ell_1$-FT} & 32.381$\pm$1.80 & 0.9193$\pm$0.022  & 29.841$\pm$2.48  & 0.7922$\pm$0.069  \\ \hline
    \multirow{3}*{\rotatebox{90}{\textit{100\%}}}&\textit{FT}     & 39.119$\pm$2.02 & 0.9651$\pm$0.017 & 36.260$\pm$2.34 & 0.8894$\pm$0.059 \\
    ~& \textit{NL-FT}    & 37.464$\pm$1.88 & 0.9587$\pm$0.019  & 31.122$\pm$4.88 & 0.4586$\pm$0.077 \\
    ~& \textit{GA-$\ell_1$-FT} & 31.928$\pm$1.65 & 0.9179$\pm$0.021  & 29.461$\pm$2.55  & 0.8021$\pm$0.065  \\ \hline
  \end{tabular}
  }
\end{table}

\end{document}
