\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{multirow}
\usepackage{wrapfig}
\usepackage{amsmath}
\DeclareMathOperator{\Var}{\widehat{Var}}
\usepackage{xcolor}
\newcommand{\review}[1]{{\leavevmode\color{red}#1}}
\newcommand{\ma}[1]{{\leavevmode\color{green}[MA: #1]}}
\usepackage{appendix}
%\usepackage{subcaption}

\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2024}
\title[Diffusion X-ray image denoising]{Diffusion X-ray image denoising}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 \midlauthor{\Name{Daniel Sanderson\nametag{$^{1,3,5}$}} \Email{dsanders@ing.uc3m.es}\\
  \Name{Pablo M. Olmos\nametag{$^{2,3}$}} \Email{pamartin@ing.uc3m.es}\\
  \Name{Carlos Fernández del Cerro\nametag{$^{1,3}$}} \Email{carlosfe@pa.uc3m.es}\\
  \Name{Manuel Desco\nametag{$^{1,3,4,5}$}}  \Email{manuel.desco@uc3m.es}\\
  \Name{Mónica Abella\nametag{$^{1,3,5}$}} \Email{mabella@ing.uc3m.es}\\
  \addr $^{1}$ Departamento de Bioingeniería, Universidad Carlos III de Madrid. Madrid, Spain \\
  \addr $^{2}$ Departamento de Teoría de la Señal, Universidad Carlos III de Madrid. Madrid, Spain \\
  \addr $^{3}$ Instituto de Investigación Sanitaria Gregorio Marañón. Madrid, Spain \\
  \addr $^{4}$ Centro Nacional de Investigaciones Cardiovasculares Carlos III (CNIC), Madrid, Spain.\\
  \addr $^{5}$ Centro de investigación en red en salud mental (CIBERSAM), Madrid, Spain.
  }


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

\begin{document}

\maketitle

\begin{abstract}
X-ray imaging is a cornerstone in medical diagnosis, constituting a significant portion of the radiation dose encountered by patients. Despite the imperative to reduce radiation doses, conventional image processing methods for X-ray denoising often struggle with heuristic parameter calibration and prolonged execution times. Deep Learning solutions have emerged as promising alternatives, but their effectiveness varies, and challenges persist in preserving image quality.
This paper presents an exploration of diffusion models for planar X-ray image denoising, a novel approach that, to our knowledge, has not yet been investigated in this domain. Evaluation on clinical data shows that our approach enables real-time denoising of Poisson noise while preserving image resolution and structural similarity. This suggests that diffusion models are promising for planar X-ray image denoising, offering a potential improvement in the optimization of diagnostic utility amid dose reduction efforts.

\end{abstract}

\begin{keywords}
X-ray, radiography, dose, denoising, diffusion model.
\end{keywords}

\section{Introduction and related work}
X-ray imaging accounts for 93.7\% of the mean radiation dose applied to patients in medical diagnosis \cite{Dosis-radiacion}. A single X-ray chest image implies an approximate effective radiation dose of 0.1 mSv, which is equivalent to 10 days of exposure to ambient radiation. For fluoroscopic interventions or clinical studies requiring several planar X-ray images, doses can significantly build up, posing a risk for the patient, especially for paediatric patients \cite{luo2020ultra}.
Additionally, large radiation doses can lead to premature hardware failure of the X-ray equipment, due to vaporization of the tube's anode and breakdown of the tube's filament. Therefore, it is important to reduce the dose of X-ray acquisitions. However, a reduction in dose implies an increase in image noise, which arises due to the quantum nature of X-rays and the presence of thermal fluctuations in the detector \cite{ding2018statistical, yi2018sharpness}. This hampers the contrast resolution of the image, limiting the diagnostic utility of the radiography and degrading the performance of downstream processing or feature extraction algorithms \cite{juneja2023denoising}.

% Any software method designed to increase the signal to noise ratio of noisy X-ray images should accomplish the following requirements to ensure its utility in clinical practice \cite{sagheer2020review}: preservation of image resolution and structural similarity, i.e., no introduction of artifacts or content distortion, non essentialnees of large prior databases, given the difficulty to obtain clinical images, and low complexity and computational time, to enable real time use.

% Several conventional image processing methods have been proposed to increase the signal to noise ratio (SNR) of noisy X-ray images. Depending on their prior assumptions \cite{xu2015patch}, these methods may be classified as Spatial domain filtering methods, out of which bilateral filters have shown good results on Poisson contaminated X-ray images \cite{juneja2023denoising}, transform domain filtering methods \cite{sagheer2020review}, such as wavelets or Principal Component Analysis (PCA), diffusion and total variation (TV) methods, out of which TVTSVD has shown good results on CT sinogram denoising \cite{sagheer2020review}, and non local self-similarity based methods (NSS) based on patch analysis such as Weighted nuclear norm minimization (WNNM) \cite{gu2014weighted} or 3D Block Matching (BM3D) \cite{dabov2007image}, which are popular methods that have been traditionally considered state of the art methods \cite{gondara2016medical}

Several conventional image processing methods have been proposed to perform denoising of X-ray images. Some of the methods that have shown good results are bilateral filters \cite{juneja2023denoising}, total variation (TV) methods \cite{sagheer2020review},
or 3D Block Matching \cite{dabov2007image}.
However, these methods require heuristic calibration of parameters, hindering their generalization, and can have long execution times, limiting their incorporation into clinical practice \cite{lin2023neural}. To solve these limitations, several Deep Learning (DL) solutions have been proposed for planar X-ray and fluoroscopy image denoising. To date, most DL works simulate Gaussian noise, despite Poisson noise being the most relevant type of noise in X-ray images \cite{ding2018statistical, yi2018sharpness}.

% These methods either use real data provided by the clinics (supervised learning), or simulate data by applying specific probabilistic models (self-supervised learning), to predict the target image directly or the noise of the input image (residual learning). Deep Learning denoising is performed either on the exponential domain \cite{luo2020ultra,luo2022edge,long2023full} or the logarithmic domain \cite{kumar2018jaya,rawat2021novel,sahu2023application,nayak2023dmf}, but to our knowledge no study has evaluated which domain is better suited for denoising. 

Depending on the type of data used during training, most DL solutions applied to X-ray denoising can be classified into two main categories. Methods in the first category aim to predict the clean image from the noisy image using simple networks such as DnCNN or Denoising Autoencoders \cite{juneja2023denoising,gondara2016medical} or heuristically designed architectures composed of feature extracting and refinement blocks \cite{nayak2023dmf} or of dual denoising networks \cite{sahu2023application}.
The second category of methods either train the network with pairs of noisy images of different noise content (Noise2Noise methods) or aim to predict specific pixels selected either randomly \cite{krull2019noise2void} or by intensity thresholding \cite{batson2019noise2self} (Noise2Self methods). 
The majority of these solutions minimize MSE estimates of the target, such as the Charbonnier or Frobenius norm, while only a few explore alternative loss functions \cite{matviychuk2016learning}. As MSE estimates only compare pixel wise differences, it is common to obtain results of reduced perceptual quality, generally leading to a loss of spatial resolution \cite{chung2022mr} or to incomplete noise removal. 

To better preserve image resolution and texture, generative models such as Generative Adversarial Networks (GANs) have recently been applied to medical image denoising. However, GANs suffer from convergence issues, mode collapse and vanishing gradients, greatly hindering their training \cite{skandarani2023gans}. Recently, diffusion models have outperformed GANs \cite{dhariwal2021diffusion}, and have further improved image quality with notably simple models. These models apply a Noise2Noise training strategy, simulating noise in a self-supervised fashion at different noise levels and predicting the residual.
%While a single network can handle different noise levels, diffusion models require the noise level of the image as input, i.e, cannot handle blind noise, requiring the aid of prior noise estimators \cite{chung2022mr}. 
Diffusion models have recently been applied to CT denoising
\cite{xia2022low,liu2023diffusion}, but to our knowledge they have not yet been applied to denoising of planar X-ray and/or fluoroscopy images.

In this work, we propose a denoising method based on diffusion models for planar X-ray imaging. The method is trained with a small database to mimic the conditions of clinical scenarios where images are difficult to obtain and is evaluated on images contaminated with Poisson noise.

\section{Materials and Methods}
The proposed method is based on the original implementation of Denoising Diffusion Probabilistic Models (DDPMs) \cite{ho2020denoising}, which are designed for generative modelling by using a DL network to sequentially remove noise in a residual fashion. 

Figure \ref{fig:diffusion_model} shows the workflow of the proposed method, DDPM-X, that consists of two stages: I) a diffusion model is trained with real clinical data for image generation by progressively eliminating Gaussian noise starting from pure Gaussian noise, and II) the method identifies the step of the generative pipeline from which to start denoising real images. This is achieved by identifying the specific denoising step within the generative pipeline that corresponds to the equivalent noise level of the noisy image. For this stage we used real images with simulated noise.

\begin{figure}[htbp]
\centering
\includegraphics[width=0.9\linewidth]{Figuras/Diffusion_model.png}
\caption{Workflow of the proposed method.}
\label{fig:diffusion_model}
\end{figure}

Evaluation was done on images contaminated with Gaussian noise, for which the DDPM-X was trained, and Poisson noise, which better models the noise found in X-ray images.


\subsection{Generative diffusion model}

%As shown in Figure \ref{fig:diffusion_model}, our \review{method consists of two steps: First it trains a DDPM over clean X-ray images to achieve image generation (Stage 1). Then, it leverages} on the trained DDPM to perform denoising (Stage 2).

As shown in Figure \ref{fig:diffusion_model}, the diffusion model used in DDPM-X consists of three elements: a Forward Diffusion Kernel (FDK), a DL network, and a Reverse Diffusion Kernel (RDK).
The FDK iteratively applies a diffusion process to corrupt a clean image $\boldsymbol{x_0} \in \mathbb{R}^{H\times W}$ with Additive White Gaussian Noise (AWGN) for a set of timesteps $t \in [0,T]$, as shown in Equation \ref{eq:FDK_t}.
\begin{equation}
    q(x_t|x_{t-1})=\mathcal{N} (x_t; \sqrt{1-\beta_t}x_{t-1}, \beta_t \mathbf I)
    \label{eq:FDK_t}
\end{equation}
where $x_{t}$ is the noise corrupted image for timestep $t$ and $\beta_t$ is a hyperparameter.
The FDK is applied in a single step for any random $t$ as follows:
\begin{equation}
    \begin{aligned}
    q(x_{1:t}|x_0) &= q(x_1,x_2,\dots,x_t |x_0)\stackrel{\text{Markov}}{=}\prod_{s=1}^{t} q(x_s|x_{s-1}), \\
    q(x_t|x_0) &= \mathcal{N} (x_{t }; \sqrt{\overline{\alpha_{t }}} x_0, (1-\overline{\alpha_{t }})\mathbf I)
    \end{aligned}
    \label{eq:FDK}
\end{equation}
where $\alpha_{t }=1-\beta_{t }$ and $\overline{\alpha_{t }}=\prod_{s=0}^{t } \alpha_s$. $\beta_t$ may be updated following any differentiable function that ensures that $\sqrt{\overline{\alpha_T}} \thickapprox 0$. We use the linear function presented in \cite{ho2020denoising}: $ \beta_t=\frac{\beta_e-\beta_s}{T}t + \beta_s,$ with $\beta_s=10^{-6}$ and $\beta_e=0.02$. The value of $\beta_s$ was selected to ensure that evaluation noise levels corresponded to realistic X-ray doses.

The RDK generates an image by iteratively reversing the forward diffusion. Given that the reversal of a Gaussian diffusion process is also Gaussian, the p.d.f of the data can be recovered by marginalization of the individual Markov steps:
\begin{equation}
    p_\theta(x_0)\stackrel{\text{marginal}}{=}\int p_\theta(x_{0:T})\,\mathrm{d} x_{1:T}\stackrel{\text{Markov}}{=}\int p(x_T)\prod_{t=1}^{T}p_\theta(x_{t-1}|x_t)\,\mathrm{d} x_{1:T}
\end{equation}
where $\theta \in \mathbb{R}^\Theta$ are the parameters of the DL network, $p(x_T) = \mathcal{N} (x_T; 0,\mathbf{I})$, and $p_\theta(x_{t-1}|x_t)=\mathcal{N} (x_{t-1}; \mu_\theta(x_t,t),\Sigma_\theta(x_t))$ is a step from the RDK, with $\mu_\theta \thickapprox x_t - \eta_\theta$ and $\eta_\theta$ the Gaussian noise predicted by the network. For simplicity, we used a fixed small variance $\Sigma_\theta = \overline{\beta_t}$. At each step of the RDK, we clipped $\mu_0$ to the $[-1,1]$ intensity range \cite{saharia2022photorealistic}.

\subsection{Denoising strategy}
To perform denoising we follow a similar approach to the Come Closer Diffuse Faster algorithm (CCDF) \cite{chung2022come}. The RDK is applied from $t=t'$ to $t=0$ (Fig. \ref{fig:diffusion_model}), where $t'$ is the denoising timestep obtained from an estimate of the noise level of the image. The approach followed to compute the denoising timestep $t'$ varies depending on the probabilistic model used to simulate noise. In this work we consider two noise models: Gaussian noise, for which the DL network has been specifically trained, and Poisson noise, which is the type of noise inherent to X-rays due to their quantum nature. Gaussian noise is simulated with Equation \ref{eq:FDK} for a specific timestep $t$, and therefore $t'=t$. Poisson noise is simulated using Equation \ref{eq:Zlog}, which includes a small Gaussian noise $\eta \sim \mathcal{N}(0,\mathbf{I})$ scaled by $\sigma^2=10$ to emulate electronic noise, as in \cite{gao2023corediff}.
\begin{equation}
    Z_{log} = -\log \left (\frac{\mathrm{Poisson} (Y) + \sigma^2\eta}{I} \right)
    \label{eq:Zlog}
\end{equation}
where $Y=Ie^{-Y_{log}}$, $Y_{log}$ is the noiseless image, and $\int I_0(\epsilon) \,\mathrm{d} \epsilon = I $ represents the flood image.
Due to the signal dependency of Poisson noise, the denoising timestep $t'$ is estimated from the maximum noise variance found in the image, as follows:
\begin{subequations}
\begin{align}
    \hat{Y}_{log}=\mathbf{I} \cdot \frac{1}{N} \sum^N_{i=1} P_{99} (\max(Y^i_{log})) , \quad \mathbf{I} \in \mathbb{R}^{H\times W}
\end{align}
\end{subequations}
where $N$ corresponds to the size of the training dataset. The percentile is applied to avoid the contribution of high intensity artificial details present in the images such as medical annotations. Then, $\hat{Z}_{log}$ is computed from $\hat{Y}=Ie^{-\hat{Y}_{log}}$ by using Equation \ref{eq:Zlog}, and the denoising timestep $t'$ is estimated as follows:
\begin{equation}
    t' := t \ \text{such that}\ 1- \overline{\alpha_{t}} \thickapprox \Var (\hat{Z}_{log})
    \label{eq:var}
\end{equation}
%% Note: when applying the exp(-\hat{Y_log}), the brighter values become the darkest, and therefore one would think that when applying Poisson, the obtained variance of Z_log would be the lowest of all the original Y_log image. However, when applying the -log, the low variance of the noise in the dark regions is maximized.

Calculating the timestep $t'$ using Equation \ref{eq:var} ensures that the model removes the noise of maximum variance.
%by selecting the value of $t$ such that the variance $1-\overline{\alpha_t}$ of Equation \ref{eq:FDK} corresponds to the maximum variance of $Y_{log}$. Given that Poisson noise is signal dependent, maximum variance is computed over the negative logarithm of a Poisson distribution with a mean equal to the 99 percentile of $Y_{log}$. 
It must be noticed that obtaining $Z_{log}$ requires knowing the dose $I$. For real noisy images, $I$ can be estimated from the X-ray acquisition parameters, or by using noise estimation methods \cite{turajlic2017adaptive}. 
To ensure an equivalent noise level between Gaussian and Poisson noise, Gaussian noise was simulated by taking the timestep $t$ from Equation \ref{eq:FDK} as the denoising timestep $t'$ estimated for Poisson noise.

\subsection{Network}
We used a U-Net composed of five pairs of downsampling and upsampling blocks with SiLU activation functions, each built of 2 ResNet layers, and an attention block of 8 heads. The number of output channels per downsampling and attention block was doubled every two blocks, from 128 to 512. %The number of input and output channels was 1 to accommodate gray-scale images.
The network was conditioned on the timestep which was given to each block as a sinusoidal embedding preprocessed by an MLP block of two layers.

The model was trained for 100 epochs with mixed precision, using a learning rate of $10^{-4}$, an AdamW optimizer, and a cosine schedule to achieve super convergence \cite{smith2019super}. The MSE loss function $L (\theta) = \mathbb{E} [\Vert \eta - \eta_\theta (x_t,t) \Vert ^2]$ was used to predict the Gaussian noise $\eta$ of the image at a timestep $t$ randomly drawn from a uniform distribution. A random horizontal flip was applied to the images to perform data augmentation. Training was performed on an RTX 3090 GPU of 24 GB, and took 500 s per epoch, while inference took 0.25 s per timestep. All code was implemented on PyTorch based on Fastai \cite{howard2020fastai} and Diffusers from Hugging Face \cite{von-platen-etal-2022-diffusers}.

\subsection{Evaluation}
The proposed method was evaluated for Poisson noise and Gaussian equivalent noise. 
Noisy images were obtained for high dose with $I=5 \times 10^4$ and $\sqrt{1-\overline{\alpha_3}} = 9.6 \times 10^{-3}$ for Poisson and Gaussian noise, respectively, and for low dose with $I=9 \times 10^3$ and $\sqrt{1-\overline{\alpha_9}} =9.6 \times 10^{-2}$, for Poisson and Gaussian noise, respectively. The high dose corresponded to an estimated denoising step of $t'=3$ and the low dose to $t'=9$.
We randomly selected 1225 images from the NIH Chest X-ray database \cite{wang2017chestx}, which were split into a training set of 1125 images and a validation set of 100 images. Images were resized from $1024\times1024$ to $512\times512$ and normalized to the $[-1,1]$ intensity range, as in \cite{matviychuk2016learning}. These images were taken as the noiseless image $Y_{log}$ from Equation \ref{eq:Zlog}.

The evaluation of Poisson contaminated images was compared with four well-known algorithms: Block Matching and 3D filtering (BM3D) \cite{dabov2007image}; Neighbor2Neighbor (Nei2Nei) \cite{huang2021neighbor2neighbor}; Dual GAN (DU-GAN) \cite{huang2021gan}, and the same UNet architecture used by DDPM-X trained in a supervised fashion with the MSE loss. As BM3D is designed for Gaussian noise, to be fair we preprocessed the images with the Anscombe transform to convert Poisson noise into Gaussian of variance 1 and normalized them to the [0,1] intensity range as in \cite{bodduna2019poisson}.

To evaluate the performance of the models, we applied three metrics commonly used in denoising: Peak Signal to Noise Ratio (PSNR), to  evaluate pixelwise differences, Learned Perceptual Image Patch Similarity (LPIPS), to evaluate visual quality, and  Structural Similarity Index Measure (SSIM), to evaluate both distortion and visual quality \cite{blau2018perception}. %For LPIPS we took AlexNet as the feature extractor following the implementation of \cite{zhang2018unreasonable}.
The absolute difference (AD) was obtained as the difference between the metrics computed for the target and the denoised image, and the relative difference (RD) as the ratio of AD and the metric computed for the target and noisy image.
Visual evaluation was done after a simple post-processing pipeline, consisting of a Contrast Limited Adaptive Histogram Equalization (CLAHE) with a tile size of 70 pixels and a clip limit of 0.0001, and a Laplacian Pyramid of 3 levels.
We additionally performed a preliminary evaluation of our method with real noisy data acquired at a low dose. This evaluation is found in Appendix A.

% X-ray projection data is modelled by the Beer-lambert law:
% \begin{equation}
%     Y=\int I_0 (\epsilon) e^{\int \mu(x,\epsilon) \partial x} \partial \epsilon
% \end{equation}
% To obtain an image representing the attenuation map of the traversed tissues, a logarithmic preprocessing step is applied.
% \begin{equation}
%     Y_{log}=-log(\frac{Y}{\int I_0(\epsilon) \partial \epsilon})
% \end{equation}
% where $\int I_0(\epsilon) \partial \epsilon = I $ represents the flood image. Therefore:
% \begin{equation}
%     Y=Ie^{(-Y_{log})}
%     \label{eq:Y}
% \end{equation}
% X-ray images are corrupted by Poisson noise that arises from quantic nature of X-rays, and by Gaussian noise $\eta \sim \mathcal{N}(0,\mathbf{I})$ produced by electronic noise.
% \begin{eqnarray}
%     Z &=& Poisson (Y) + \sigma^2\eta \\
%     Z_{log} &=& -log (\frac{Poisson (Y) + \sigma^2\eta}{I})
%     \label{eq:Zlog}
% \end{eqnarray}

%Figure of noisy images
% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \centering
% \label{fig:noise}
% \includegraphics[width=0.7\linewidth]{Figuras/Figuras_ruido.png}
% \caption{Results of applying Poisson and Gaussian noise models. }
% \end{figure}

\section{Results}
% Fig. \ref{fig:generative_results} shows that images generated by the diffusion model have a realistic appearance. However, in some cases they present minor artifacts such as excessive darkening of the lungs (left image of Fig. \ref{fig:generative_results}) or a mild appearance of artificial textures (right image of Fig. \ref{fig:generative_results}).

% \begin{figure}[htbp] %this figure will be at the right
%     \centering
%     \includegraphics[width=0.55\textwidth]{Figuras/Figura_generativos.png}
%     \caption{Example of two images unsuccessfully generated by the DDPM. Red arrows point to artifacts.}
%     \label{fig:generative_results}
% \end{figure}
Table \ref{tab:metrics_timesteps} shows that our method is powerful enough to achieve significant denoising for a wide range of noise levels.

\begin{table}[htbp]
\caption{Metrics evaluated on the validation set at different dose levels.}
\vspace{1em}
\centering
\label{tab:metrics_timesteps}
\begin{tabular}{| c |c c c| c c c|}
\hline
Dose & \multicolumn{3}{ c |}{ AD } & \multicolumn{3}{ c |}{RD} \\
\hline I ($\times 10^3$) &  LPIPS   & SSIM   &  PSNR  &  LPIPS &  SSIM  &  PSNR \\
\hline
9   & 0.02   & 98.05  & 36.89 & \textbf{92.41}  & \textbf{12.68}   & \textbf{22.63} \\
14  & 0.02   & 98.25  & 38.36 & 91.93  & 7.74  & 19.57 \\
%20  & 0.01   & 98.52  & 39.40 & 91.42  & 5.37   & 17.35 \\
33  & 0.01   & 98.90  & 40.97 & 90.59  & 3.06   & 14.37 \\
50  & 0.01   & 99.06  & 41.93 & 87.07  & 1.86   & 11.50 \\
100 & \textbf{0.00}   & \textbf{99.47}  & \textbf{44.37} & 78.75  & 0.87   & 9.23 \\
\hline
\end{tabular}
\end{table}


\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
    \centering
    \includegraphics[width=0.9\linewidth]{Figuras/Figuras_Denoise_Poisson_all.png}
    \caption{Zoom of shoulder and lung regions indicated by the yellow rectangles of Figure 4. Red arrows point to hallucinations.}
    \label{fig:denoise_allDose}
\end{figure}


Table~\ref{tab:metrics_gauss_poisson} shows that DDPM-X achieved the best quantitative performance of all methods for most metrics. For most metrics, and especially for low doses, the best results were achieved by DDPM-X for Gaussian noise, for which the network was trained.
However, the performance for Poisson noise was similar, with small percentage differences in RD (below 3\% in the worst case), and from a qualitative point of view these differences do not significantly hinder the visualization of the denoised image (Figure~\ref{fig:denoise_allDose}). The denoising timesteps for which the best quantitative results were obtained show an error of $\pm 1$ in the estimation of $t'$ for our method ($t'$ in Table~\ref{tab:metrics_gauss_poisson}).

Visual results in Figure~\ref{fig:denoise_allDose} show that DDPM-X can effectively handle the different Poisson noise levels found in the images due to the signal dependency of Poisson noise, while preserving spatial resolution. However, for low doses it was unable to restore details of low contrast resolution that had been masked by noise, and in some cases it introduced a slight spatial distortion and/or small details as shown by the red arrows. BM3D introduces smoothing and artificial textures, being more noticeable for low doses, while Nei2Nei loses low contrast details for high doses and fails to achieve complete denoising for low doses. The vanilla UNet blurs the images for both high and low doses, while the DU-GAN preserves spatial resolution for high doses but introduces smoothing for low doses.

Figure 3 shows that for denoising timesteps above the optimum, the value of the metrics for DDPM-X on Poisson noise is not significantly affected. However, visual evaluation shows smoothing and hallucinations (Figure 4). For smaller timesteps, images preserve spatial content despite suffering from incomplete noise removal.

\begin{table}[htbp]
\caption{Quantitative results of the models for $t'$ denoising steps. DDPM-Xg and DDPM-Xp correspond to DDPM-X evaluated on Gaussian and Poisson noise respectively. \textbf{Best results}, \underline{second best results}.}
\vspace{1em}
\centering
\label{tab:metrics_gauss_poisson}
\begin{tabular} {|c| c |c |c c |c c |c c|}
\hline
Dose &  \multirow[t]{2}{*}{ Model } & \multirow[t]{2}{*}{t'} & \multicolumn{2}{c|}{ LPIPS $\downarrow$ } & \multicolumn{2}{c|}{ SSIM $\uparrow$} & \multicolumn{2}{c|}{ PSNR $\uparrow$} \\
\hline
 & & & AD & RD & AD & RD & AD & RD\\
\hline \multirow{6}{*}{Low}  & BM3D & - & 0.05 & 78.69 & 97.81 & 11.81 & 34.58 & 22.38 \\
& Vanilla UNet & - & 0.07 & 72.42& \underline{97.81} & 12.59& \textbf{37.56} & 20.25 \\
& Nei2Nei & - & 0.03 & 85.01 & 96.85 & 11.14 & 36.45 & 16.94 \\
& DU-GAN & - & 0.04 & 81.79 & 97.61 & 12.27 & 37.04 & 19.48 \\
& DDPM-Xg & 8 & \textbf{0.02} & \textbf{93.81} & 97.33 & \textbf{14.67} & \underline{37.27} & \textbf{26.49}\\
& DDPM-Xp & 8 & \underline{0.02} & \underline{92.40} & \textbf{98.05} & \underline{12.68} & 36.89 & \underline{22.63} \\

\hline
\multirow{6}{*}{High}  & BM3D & - & 0.02 & 65.88 & 98.88 & \textbf{1.99} & 39.78 & \textbf{12.11} \\
& Vanilla UNet & -& 0.02 & 64.66 & 98.68 & 1.81  &41.39 & 7.21\\
& Nei2Nei & - & 0.02 & 53.21 & 98.80 & 1.71 & 40.45 & 5.90 \\
& DU-GAN & - & 0.01 & 84.06 & \underline{99.11}& \underline{1.88} & \underline{42.23} & 9.24 \\
& DDPM-Xg & 3 & \textbf{0.01} & \textbf{89.87} & \textbf{ 99.15} & 1.63  & \textbf{42.54} & \underline{11.61} \\
& DDPM-Xp & 4 & \underline{0.01} & \underline{87.06} & 99.07 &  1.86 & 41.93 & 11.50 \\
\hline
\end{tabular}

\end{table}

\begin{figure}[htbp]
 % \label must come after \caption so that \ref resolves to the figure number
\centering
\includegraphics[width=1\linewidth]{Figuras/Figura_Denoise_steps.png}
\caption{Mean values of the metrics for different denoising steps for the low dose case on the validation set.}
\label{fig:denoise_all}
\end{figure}

\begin{figure}[htbp]
 % \label must come after \caption so that \ref resolves to the figure number
    \centering
    \includegraphics[width=1\linewidth]{Figuras/Figuras_oversmoothing.png}
    \caption{Denoising results of DDPM-X for Poisson noise for different timesteps, shown for the spine region indicated by the yellow rectangle. Arrow points to hallucination.}
    \label{fig:denoise_image_t}
\end{figure}

%Including stds
% \begin{table}[htbp]
% \floatconts
%   {tab:metrics_gauss_poisson}
% {\begin{tabular} {c c c c c c c c c c c c}
% Noise level &  \multirow[t]{2}{*}{ Noise model } & \multirow[t]{2}{*}{t} & \multicolumn{3}{c}{ LPIPS } & \multicolumn{3}{c}{ MSSIM } & \multicolumn{3}{c}{ PSNR } \\
%  & & & a.d & Std & RD & a.d & Std & RD & a.d & Std & RD\\
% \hline \multirow{2}{*}{Low}  & Gauss & 8 & \textbf{0.01} & \textbf{0.00} & \textbf{93.81} & \textbf{98.87} & 0.17 &\textbf{} \textbf{3.39} & \textbf{37.27} & 1.33 & \textbf{26.49}\\
% & Poisson & 8 & 0.02 & 0.00 & 92.28 & 98.84 & \textbf{0.15} & 2.96 & 36.85 & \textbf{1.13} & 22.52 \\
% \hline
% \multirow{2}{*}{High}  & Gauss & 3 & \textbf{0.01} & \textbf{0.00} & \textbf{89.87} & \textbf{ 99.69} & \textbf{0.06} & \textbf{0.40}  & \textbf{42.54} & \textbf{1.19} & \textbf{11.61} \\
% & Poisson & 4 & 0.01 & 86.22 & 0.00 & 99.64 & 0.06 &  0.49 & 42.23 & 1.24 & 12.32 \\
% \hline
% \end{tabular}}
% {\caption{Comparison between Denoising of Gaussian and Poisson noise, simulated for $I=5\times 10 ^ 4$ (equivalent computed timestep being $t=3$, and $I=9\times 10 ^ 3$ (equivalent computed timestep being $t=9$)}}
% \end{table}

% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:metrics}
%   {\caption{First row: Comparison of the results of different Noise levels conditioning for a real standard deviation of 0.018. Metrics have been computed between the noisy and target images, and between the denoised and target images, and the plots represent the percentage difference between both results. Second row: comparison of different Noise levels added to the image. Metrics are computed as the difference between the target and the denoised image}}
%   {\includegraphics[width=1\linewidth]{Figuras/Metrics.png}}
% \end{figure}

% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:denoise_t1and3}
%   {\caption{Denoising results for $t=1$ (\figureref{fig:denoise_t1}) and $t=3$ (\figureref{fig:denoise_t3}), which is equivalent to Gaussian noise of $\sigma^2=0.01$ (\figureref{fig:denoise_t1}) or $\sigma^2=0.018$ (\figureref{fig:denoise_t3}). First column: clean image. Second column: noisy image. Third column: denoised image by conditioning the network with for $t=1$ (\figureref{fig:denoise_t1}) or $t=3$ (\figureref{fig:denoise_t3}). Forth column: denoised image by conditioning the network with for $t=2$ (\figureref{fig:denoise_t1}) or $t=4$ (\figureref{fig:denoise_t3}). Second and third rows are insights of the regions labelled by the yellow boxes.}}
%   {
% \subfigure {
% \label{fig:denoise_t1}
% \includegraphics[width=0.8\linewidth]{Figuras/Figura_Denoise_t1.png}
% }
% \subfigure{
% \label{fig:denoise_t3}
% \includegraphics[width=0.8\linewidth]{Figuras/Figura_Denoise_t3.png}
% }
%   }
% \end{figure}

% \begin{tabular}{ c c c c c c c c c c c c c }
% \hline & \multicolumn{6}{ c }{ Abs } & \multicolumn{6}{ c }{$\%$} \\
% \hline I0 & \multicolumn{2}{ c }{ LPIPS   } & \multicolumn{2}{ c }{ MSSIM  } & \multicolumn{2}{ c }{ PSNR   } & \multicolumn{2}{ c }{ LPIPS   } & \multicolumn{2}{ c }{ MSSIM   } & \multicolumn{2}{ c }{ PSNR   } \\
% \hline 9090 & 0.10 & 0.20 & 97.95 & 9.58 & 36.60 & 4.27 & 91.69 & 15.77 & 3.08 & 0.30 & 22.71 & 2.91 \\
%  11111.00 & 0.08 & 0.59 & 98.09 & 9.01 & 37.26 & 3.23 & 91.37 & 8.87 & 2.49 & 1.12 & 21.12 & 4.08 \\
%  14285.00 & 0.07 & 0.59 & 98.24 & 9.12 & 38.08 & 3.33 & 91.07 & 8.90 & 1.95 & 0.93 & 19.51 & 4.20 \\
%  20000.00 & 0.06 & 0.49 & 98.40 & 9.24 & 39.13 & 3.53 & 90.56 & 8.96 & 1.38 & 0.73 & 17.40 & 4.04 \\
%  33333.00 & 0.05 & 0.40 & 98.58 & 9.36 & 40.61 & 3.83 & 89.74 & 9.22 & 0.81 & 0.54 & 14.27 & 3.21 \\
%  50000.00 & 0.04 & 0.30 & 98.69 & 9.47 & 41.85 & 3.98 & 86.23 & 8.75 & 0.52 & 0.41 & 12.23 & 3.43 \\
%  100000.00 & 0.03 & 0.79 & 98.83 & 9.00 & 43.96 & 3.07 & 78.00 & 8.78 & 0.24 & 1.21 & 9.17 & 4.04 \\
% \hline
% \end{tabular}

% \begin{table}[htbp]
% \centering
% \label{tab:metrics_gauss_poisson}
% \begin{tabular} {c c c c c c c c c}
% Dose &  \multirow[t]{2}{*}{ Noise model } & \multirow[t]{2}{*}{t} & \multicolumn{2}{c}{ LPIPS } & \multicolumn{2}{c}{ MSSIM } & \multicolumn{2}{c}{ PSNR } \\
%  & & & AD & RD & AD & RD & AD & RD\\
% \hline \multirow{2}{*}{Low}  & Gauss & 8 & \textbf{0.01} & \textbf{93.81} & \textbf{98.87} & \textbf{3.39} & \textbf{37.27} & \textbf{26.49}\\
% & Poisson & 8 & 0.02 & 92.28 & 98.84 & 2.96 & 36.85 & 22.52 \\
% \hline
% \multirow{2}{*}{High}  & Gauss & 3 & \textbf{0.01} & \textbf{89.87} & \textbf{ 99.69} & 0.40  & \textbf{42.54} & 11.61 \\
% & Poisson & 4 & 0.01 & 86.22 & 99.64 &  \textbf{0.49} & 42.23 & \textbf{12.32} \\
% \hline
% \end{tabular}
% \caption{Quantitative comparison between denoising of Gaussian and Poisson noise, simulated for high and low doses, and denoised for $t$ number of steps.}
% \end{table}

\section{Discussion}
In this work we have proposed DDPM-X, a method for planar X-ray image denoising based on a diffusion model. Although the network was trained on Gaussian noise, results suggest that the diffusion model can also be applied to Poisson denoising without any modification or fine-tuning of the network.

Given the noise level conditioning of the network, the user can regulate the amount of denoising ad hoc. An initial estimate of the noise level, which is used to define the denoising timestep, may be obtained from the singular value decomposition (SVD) of the image \cite{turajlic2017adaptive}, or from the X-ray dose associated with the acquisition parameters. However, the selection of the appropriate denoising timestep can be critical, as overly large values can introduce hallucinations. Given that the preservation of spatial content is of utmost importance in the medical field, it is therefore preferable to cautiously use smaller values. In the future, we will explore the inclusion of data consistency models to constrain the generative power of these models and reduce the risk of content distortion.

Evaluation showed that the metrics failed to detect the appearance of hallucinations. In the future, we will explore alternative metrics such as the Edge Preservation Index (EPI) \cite{sagheer2020review}, and we will evaluate them patch-wise to account for small local spatial distortions. In addition, the small differences in metric values between our method and the baselines did not appear to correlate with the noticeable visual differences observed. This discrepancy is likely caused by the difference in contrast between the denoised images used to evaluate the metrics and the contrast-enhanced images used for visualization.

The proposed method can be efficiently trained with a small database of as few as approximately 1100 images, enabling its application to real clinical scenarios, which often lack large databases. Inference can be done in less than 3 seconds, allowing its real-time application. The method could be further sped up by estimating the variance $\Sigma_{\theta}$ of the reverse diffusion path rather than taking a fixed value \cite{song2020denoising}.

% \begin{itemize}
% \item JMLR/PMLR uses natbib for references. For simplicity, here, \verb|\cite|  defaults to
%   parenthetical citations, i.e. \verb|\citep|. You can of course also
%   use \verb|\citet| for textual citations.
% \item Eprints such as arXiv papers can of course be cited . We recomend using a \verb|@misc| bibtex entry for these as shown in the sample bibliography.
% \item You should follow the guidelines provided by the conference.
% \item Read through the JMLR template documentation for specific \LaTeX
%   usage questions.
% \item Note that the JMLR template provides many handy functionalities
% such as \verb|\figureref| to refer to a figure,
% e.g. \figureref{fig:example},  \verb|\tableref| to refer to a table,
% e.g. \tableref{tab:example} and \verb|\equationref| to refer to an equation,
% e.g. \equationref{eq:example}.
% \end{itemize}

% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was supported by PDC2021-121656-I00 (MULTIRAD) and PID2021-123182OB-I00, funded by MCIN/AEI/10.13039/501100011033 and by the `NextGenerationEU'/PRTR and European Union `FEDER'. Also funded by Instituto de Salud Carlos III through the projects PT20/00044, co-funded by the European Regional Development Fund ``A way to make Europe'', and PMPTA22/00121 and PMPTA22/00118, co-funded by the European Union `NextGenerationEU'/PRTR, and by Comunidad de Madrid under grants IND2022/TIC-23550 and ELLIS Unit Madrid. The CNIC is supported by Instituto de Salud Carlos III, Ministerio de Ciencia e Innovación, and the Pro CNIC Foundation.}


\bibliography{midl24_33}

\appendix
\section{Evaluation on real noise}
DDPM-X was evaluated on a real noisy acquisition of an anthropomorphic phantom. Target images were acquired at 100 kV and 4 mAs, while noisy images were acquired at the same voltage and 0.8 mAs. As can be seen in Figure~\ref{fig:real_data}, results are slightly blurry, especially in the lung region. This may be because the phantom image has perfect, high-resolution borders and deviates from the data distribution on which the model was trained. Table~\ref{tab:metrics_real_data} shows that quantitative results are almost identical to those for simulated data, except for PSNR, which is surprisingly low, likely due to an imperfect alignment of the phantom between the low- and high-dose acquisitions. Despite this, results show the promise of our model on real data.



\begin{table}[htbp]
\vspace{1em}
\centering
\caption{Quantitative results of DDPM-X for $t'$ denoising steps.}
\label{tab:metrics_real_data}
\begin{tabular}{|l|l|l|l|l|}
\hline
Denoise timestep $t'$ & LPIPS    & MSSIM     & SSIM      & PSNR      \\
\hline
4.00             & 0.02     & 99.77     & \textbf{98.15} & 27.87     \\ 
3.00             & \textbf{0.02} & \textbf{99.78} & 98.15     & \textbf{27.88} \\ 
2.00             & 0.02     & 99.77     & 98.00     & 27.87     \\ \hline
\end{tabular}
\end{table}

\begin{figure}
    \centering
    \includegraphics[width=0.7\linewidth]{Figuras/Figure_realData.png}
    \caption{Denoising results for an anthropomorphic phantom acquired at 100 kV/4 mAs and 100 kV/0.8 mAs. The red arrow points to missing details.}
    \label{fig:real_data}
\end{figure}

\end{document}


% #Extras
% #What I originally wrote about DDPMS
% \subsection{Diffusion model}
% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \centering
% \label{fig:noise}
% \includegraphics[width=0.9\linewidth]{Figuras/Diffusion_model.png}
% \caption{Results of applying Poisson and Gaussian noise models to validation image. Second and third rows correspond to insights of the yellow rectangles.}
% \end{figure}

% We use the original implementation of Denoising Diffusion Probabilistic Models (DDPMs) \cite{ho2020denoising}. For a set of timesteps $t \in [0,T]$, a clean image $x_0 \in \Re ^{H\times W}$ is iteratively corrupted by Additive White Gaussian Noise (AWGN) using the Forward Diffusion Kernel (FDK).
% \begin{equation}
%     q(x_t|x_{t-1})=\mathcal{N} (x_t; \sqrt{1-\beta_t}x_{t-1}, \beta_t \mathbf I)
% \end{equation}
% where $x_{t}$ is the noise corrupted image for timestep $t$ and $\beta_t$ is a hyperparameter.
% For any random timestep $t $, the noisy image can be directly obtained assuming a Markov framework.
% \begin{equation}
%     q(x_{0:t }|x_0)=q(x_1,x_2,...,x_t |x_0)\stackrel{Markov}{=}\prod_{t=1}^{t } q(x_t|x_{t-1}) = \mathcal{N} (x_{t }; \sqrt{\overline{\alpha_{t }}} x_0, (1-\overline{\alpha_{t }})\mathbf I)
%     \label{eq:FDK}
% \end{equation}
% where $\alpha_{t }=1-\beta_{t }$ and $\overline{\alpha_{t }}=\prod_{s=0}^{t } \alpha_s$

% $\beta_t$ may be updated following any differentiable function that ensures that $\sqrt{\overline{\alpha_T}} \thickapprox 0$ and $\sqrt{1-\overline{\alpha_T}}\thickapprox 1$. We use the linear function presented in \cite{ho2020denoising}:
% \begin{equation}
%     \beta_t=\frac{\beta_e-\beta_s}{T}t + \beta_s.
% \end{equation}

% By reparametrization, the FDK can be applied using normally distributed noise $\eta \sim \mathcal{N}(0,\mathbf{I})$.
% \begin{eqnarray}
%     x_t=\sqrt{\overline{\alpha_{t }}} x_0 + \sqrt{1-\overline{\alpha_{t }}} \eta
% \end{eqnarray}

% Given that the reversal of a Gaussian diffusion process is also Gaussian, the pdf of the data can be recovered by marginalization of the individual Markov steps:
% \begin{equation}
%     p_\theta(x_0)\stackrel{marginal}{=}\int p_\theta(x_{0:T})\partial x_{1:T}\stackrel{markov}{=}\int p(x_T)\prod_{t=0}^{T-1}p_\theta(x_{t-1}|x_t)\partial x_{1:T}
% \end{equation}
% where $\theta \in \Re ^\Theta$ are the parameters of a learning function such as a Neural Network, and $p_\theta(x_{t-1}|x_t)=\mathcal{N} (x_{t-1}; \mu_\theta(x_t,t),\Sigma_\theta(x_t))$ is a step from the reverse path. Given the intractability of the above integral, an estimate of $p_\theta(x_0)$ can be obtained by maximizing an ELBO, which can be simplified to conclude that the reverse path can be learned by approximating Bayes' posterior of the FDK $q(x_{t-1}|x_t)$.
% \begin{eqnarray}
%     q(x_{t-1}|x_t) &=& \mathcal{N}(x_{t-1}; \overline{\mu_t}(x_t,x_0),\overline{\beta_t} \mathbf I) \\
%     \overline{\mu_t}(x_t,x_0) &=& \frac{\sqrt{\overline{\alpha_{t-1}}}\beta_t}{1-\overline{\alpha_t}} x_0 + \frac{\sqrt{\alpha_t}(1-\overline{\alpha_{t-1}})}{1-\overline{\alpha_t}}x_t \\ 
%     x_0 &=& \frac{1}{\sqrt{\overline{\alpha_t}}}(x_t-\sqrt{1-\overline{\alpha_t}}\eta)\\
%     \overline{\beta_t} &=& \frac{1-\overline{\alpha_{t-1}}}{1-\overline{\alpha_t}}\beta_t
% \end{eqnarray}
% Given that $\alpha_t, \beta_t, x_t$ are known at inference, $\overline{\mu_t}(x_t,x_0)$ can be learnt by approximating $\eta$ with the MSE loss function.
% \begin{equation}
%     L (\theta) = \mathbb{E} [\Vert \eta - \eta_\theta (x_t,t) \Vert ^2]
% \end{equation}
% For generative inference, $p(x_T) \sim \mathcal{N} (0,\mathbf{I})$ 
% and $p(x_0)$ is obtained by recursively computing $p_\theta(x_{t-1}|x_t)$ given the network's estimation of noise $\eta_\theta$. For simplicity, we use a fixed small variance $\Sigma_\theta = \overline{\beta_t}$.

% For denoising inference, a similar approach to the Come Closer Diffuse Faster algorithm (CCDF) is followed \cite{chung2022come}. A noisy image is simulated and the reverse path is followed from $t=t'$ to $t=0$, being $t'$ the timestep corresponding to an estimate of the noise level.