\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage[utf8]{inputenc} % UTF-8 input encoding (e.g., umlauts in author names)
\usepackage[T1]{fontenc}
\usepackage{bm}
\usepackage{amsmath,amsfonts,amssymb}
\usepackage{textcomp}
\usepackage{booktabs}
\usepackage{microtype}

%\usepackage{algorithmicx}

\DeclareMathOperator*{\argmin}{arg\,min~}
\DeclareMathOperator*{\argmax}{arg\,max~}
\DeclareMathOperator{\Forall}{\forall}
\newcommand{\transp}{\mathsf{T}}
\newcommand{\given}{\,\vert\,}
\newcommand{\Given}{\,\Vert\,}
\DeclareMathOperator{\kl}{KL}

\jmlrvolume{}
\jmlryear{2021}
\jmlrworkshop{Full Paper -- MIDL 2021}
\editors{Under Review for MIDL 2021}

\title[A Mean-Field Variational Inference Approach to Deep Image Prior]{A Mean-Field Variational Inference Approach to Deep Image Prior for Inverse Problems in Medical Imaging}

\midlauthor{\Name{Malte Tölle\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{malte.toelle@med.uni-heidelberg.de}
\AND
\Name{Max-Heinrich Laves\midlotherjointauthor\nametag{$^{3}$}} \Email{max-heinrich.laves@tuhh.de}
\AND
\Name{Alexander Schlaefer\nametag{$^{3}$}} \Email{schlaefer@tuhh.de}
\AND
\addr $^{1}$ Department of Internal Medicine III, Heidelberg University Hospital \\
\addr $^{2}$ Informatics for Life, Heidelberg \\
\addr $^{3}$ Institute of Medical Technology and Intelligent Systems, Hamburg University of Technology
}

\begin{document}

\maketitle

\begin{abstract}
% Purpose
Exploiting the deep image prior property of convolutional auto-encoder networks is especially interesting for medical image processing as it avoids hallucinations by omitting supervised learning.
Its spectral bias towards lower frequencies makes it suitable for inverse image problems such as denoising and super-resolution, but manual early stopping has to be applied to act as a low-pass filter.
% Methods
In this paper, we present a novel Bayesian approach to deep image prior using mean-field variational inference.
This allows for uncertainty quantification on a per-pixel level and, given the right prior distribution on the network weights, omits the need for early stopping.
We optimize the parameters of the weight prior towards reconstruction accuracy using Bayesian optimization with Gaussian Process regression.
% Results
We evaluate our approach on different inverse tasks on a variety of modalities and demonstrate that an optimized weight prior outperforms former state-of-the-art Bayesian deep image prior approaches.
We show that a badly selected prior leads to worse accuracy and calibration and that it is sufficient to optimize the weight prior parameter per task domain.
% Conclusion
\end{abstract}

\begin{keywords}
Variational inference, Hallucination, Deep learning
\end{keywords}

\section{Introduction}

Automated methods for improving image quality have several applications in medical imaging, as acquiring high-quality images is time-consuming, costly, or entails a considerable radiation dose to the patient.
Such use cases include denoising and artifact removal in low-dose CT or PET \cite{Yang2018,Ma2020,Wang2018}, despeckling in ultrasound or optical coherence tomography \cite{Michailovich2006,Bernardes2010}, super-resolution of MRI \cite{Tanno2017}, or inpainting for hair removal in dermoscopy images \cite{Abbas2011}.
Enhancing medical images with poor quality is a fundamental step for better diagnosis or subsequent image analysis.
In this paper, we focus on post-processing methods that are generally applicable to all aforementioned modalities.

\begin{figure}[t]
    \centering
    \parbox{3.7cm}{\centering \small\textsf{input} \vphantom{(D)} \\
    \includegraphics[trim=0 0 0 20mm, clip,width=3.6cm]{img/original_hair.jpeg}} \hfill
    \parbox{3.7cm}{\centering \small\textsf{DIP} \vphantom{(D)} \\
    \includegraphics[trim=0 0 0 20mm, clip,width=3.6cm]{img/recon_avg_dip_inpaintign.jpeg}} \hfill
    \parbox{3.7cm}{\centering \small\textsf{MFVI (ours)} \vphantom{(D)} \\
    \includegraphics[trim=0 0 0 20mm, clip,width=3.6cm]{img/recon_avg_mfvi_inpainting.jpeg}} \hfill
    \parbox{3.7cm}{\centering \small\textsf{uncertainty} \vphantom{(D)} \\
    %\includegraphics[trim=0 0 0 6mm, clip,width=3.5cm]{img/uncert_inpainting_contrast.png}}
    \includegraphics[trim=0 0 0 6mm, clip,width=3.6cm]{img/uncert_inpainting_MFVI.pdf}}
    \caption{Inpainting for hair removal on dermoscopy images. Our mean-field variational inference approach to deep image prior is not prone to overfitting, outperforms the non-Bayesian baseline and provides consistent pixel-wise uncertainty maps.}
    \label{fig:opener_results}
\end{figure}

Those methods involve solving an inverse imaging problem, in which one tries to reconstruct a high-quality image $ \hat{\bm{x}} $ from a low-quality observation $ \tilde{\bm{x}} = \bm{c} \circ \bm{x} $ of the true, but unknown image $ \bm{x} $ affected by some corruption process $ \bm{c} $.
%A possible corruption could be an additive white Gaussian noise model with zero mean and standard deviation $ \sigma $ \cite{Salinas2007,Zhang2017}.
The reconstruction comprises minimization of an objective function
$
    \hat{\bm{x}} = \argmin
    %\Big\{
    \mathcal{L}(\tilde{\bm{x}}, \hat{\bm{x}}) + \lambda \mathcal{R}(\hat{\bm{x}})
    %\Big\}
$,
governed by a similarity measure $ \mathcal{L} $ and some regularizing image prior $ \mathcal{R} $, weighted by a factor $ \lambda $ \cite{Sotiras2013}.
%In case of denoising, the prior should be selected such that $ \hat{\bm{x}} $ has less noise than $ \tilde{\bm{x}} $.
Common priors for image quality enhancement are total variation or penalization of first and higher order spatial derivatives \cite{rudin1992}.
The prior is of particular importance as it is responsible for the properties of the enhanced image; its manual selection is a delicate task.

More recently, deep-learning-based convolutional autoencoders have been trained to enhance images using sets of corrupted and uncorrupted data pairs \cite{Jain2009}.
Autoencoders extract important visual features from the corrupted input image and reconstruct the input from the extracted features using learned image statistics.
Through this, the neural networks implicitly learn regularization priors from data.

However, deep-learning-based methods show insufficient robustness to input data that lie outside their training domain.
\citet{Antun2020} have demonstrated that state-of-the-art deep learning methods for CT and MR image reconstruction, such as AUTOMAP \cite{Zhu2018}, show severe instabilities to tiny perturbations in the input data, which causes the reconstructions to contain considerable artifacts.
Even worse, novel pathologies that were not present in the training data can be made to disappear in the reconstruction \cite{Bhadra2020}.
This phenomenon is referred to as \emph{hallucination} and is not limited to tomographic reconstruction but also happens in other deep-learning-based inverse image tasks \cite{Laves2020MCDIP}.
Hallucinations can result in misdiagnosis and must be avoided at all costs in medical imaging.

\subsection{Related Work}

\citet{Lempitsky2018} have shown that the excellent performance of deep convolutional networks for inverse image tasks on in-domain data is not only due to their ability to learn image priors from data, but also due to the structure of the networks themselves.
The concept of deep image prior (DIP) for inverse tasks does not require supervised training and thus, it is not affected by the aforementioned instabilities and hallucinations.
%In DIP, a convolutional autoencoder with skip-connections is interpreted as a parameterization of the image to be reconstructed.
Besides empirical evidence, the effectiveness of DIP can be explained by the spectral bias of deep networks \cite{Rahaman2019}.
An autoencoder network decouples the frequency components of an image, comparable to a Fourier transform \cite{Chakrabarty2019}.
During optimization, the frequency components are learned at different rates.
Lower frequencies are reconstructed first, which behaves like a low-pass filter; image corruptions such as noise are usually encoded in the high-frequency components.
This makes early stopping in optimization a crucial step in order to not overfit the corrupting features (see Fig.\,\ref{fig:opener_results}).
%An alternative to early stopping is carefully selecting the number of trainable parameters, which introduces an architectural form of regularization.
%The deep decoder framework has demonstrated that manually fine-tuned under-parameterization of a decoder network can also address overfitting \cite{Heckel2020}.

%However, both early stopping and under-parameterization require expert human interaction.
However, early stopping requires expert human interaction.
We seek to find a more automated way to prevent DIP from overfitting in order to take advantage of its robustness towards hallucinations.
\citet{Cheng2019} presented a first Bayesian approach to DIP in the context of natural images, where a prior distribution is placed over the weights of the network and the posterior distribution is used to output the final image.
They derived a Monte Carlo (MC) sampler from DIP using stochastic gradient Langevin dynamics (SGLD) as Bayesian approximation, which uses injection of Gaussian noise into the gradients during each SGD step \cite{Welling2011}.
The authors claim to have solved the problem of overfitting and provide pixel-wise reconstruction uncertainty estimates.
SGLD DIP has already been applied to PET image reconstruction \cite{Carrillo2021}.
Recently, \citet{Laves2020MCDIP} have shown that DIP with SGLD shows almost unchanged overfitting behavior in the case of medical images.
As a solution, they proposed a variational inference (VI) approach to DIP using Monte Carlo dropout \cite{Gal2016}.

In this paper, we show that former Bayesian approaches to DIP show overfitting on medical images at some point.
We attribute this to the manual selection of the weight prior distribution.
It is important to distinguish between DIP, which imposes a spectral bias towards lower frequencies, and the prior distribution over the weights of the network in Bayesian inference.
In SGLD and MC dropout, the prior is implicitly defined by weight decay or the dropout rate.
We hypothesize that the potential of DIP can be utilized in medical image enhancement using a well-defined prior distribution in a Bayesian setting.
%

\paragraph{Contributions}

Our contribution is a novel approximate Bayesian approach to DIP by employing mean-field VI (MFVI), where the weight prior can be defined more explicitly than in SGLD or MC dropout.
We further use Bayesian optimization (BO) to tune the parameters of the weight prior on a per-task level and show its superiority to former approaches on different medical image enhancement problems.
Our code is available at \href{https://github.com/maltetoelle/mfvi-dip}{github.com/maltetoelle/mfvi-dip}.

\subsection{Background}

\paragraph{Bayesian Deep Learning} See Appendix \ref{app:bdl} for background information about Bayesian deep learning.

\paragraph{Deep Image Prior}

Convolutional networks have been extensively used to learn image priors from data.
\citet{Lempitsky2018} have shown that the structure of a CNN is sufficient to capture a great amount of image statistics and impose a strong prior to restore a high-quality image from a low-quality observation without having access to any data.
An image-generating network $ \hat{\bm{x}} = \bm{f}_{\bm{w}}(\bm{z}) $ with randomly-initialized weights $ \bm{w} $ is used as a parameterization of the image.
The input $ \bm{z} $ is sampled from a uniform distribution $ \bm{z} \in \mathbb{R}^{C \times H \times W} \sim \mathcal{U}(0, 0.1) $ with channels $ C $, width $ W $ and height $ H $.
Given a low-quality target image $ \tilde{\bm{x}} $, the reconstructed image is obtained by minimizing the pixel-wise mean squared error $ \Vert \tilde{\bm{x}} - \bm{f}_{\bm{w}}(\bm{z}) \Vert^{2} $ w.r.t.\ the weights $ \bm{w} $.
Due to the spectral bias of DIP towards lower frequencies, early stopping behaves like a low-pass filter \cite{Chakrabarty2019}, making it suitable for many inverse image tasks.

\section{Methods}

\subsection{Mean-Field Variational Inference for Deep Image Prior}

Given a low-quality medical image $ \tilde{\bm{x}} $ and an image-generator network $ \bm{f}_{\bm{w}}(\bm{z}) = \hat{\bm{x}} $ with randomly-initialized weights $ \bm{w} $, DIP aims at finding the optimal weight point estimate $ \hat{\bm{w}} $ by maximum likelihood estimation (MLE) with gradient descent.
The input $ \bm{z} $ has the same spatial dimensions as $ \hat{\bm{x}} $.
%and is randomly sampled in each SGD step.
Before we turn to a Bayesian approach, we model heteroscedastic reconstruction uncertainty by assuming that $ \tilde{\bm{x}} $ is sampled from a spatial random process and that each pixel $ i $ follows a Gaussian distribution $ \mathcal{N}(\tilde{x}_{i}; \hat{x}_{i}, \hat{\sigma}^{2}_{i}) $ with mean $ \hat{x}_{i} $ and variance $ \hat{\sigma}^{2}_{i} $.
We extend the last layer such that the network outputs these values for each pixel
$
    \bm{f}_{\bm{w}}(\bm{z}) = \left[ \hat{\bm{x}}, \hat{\bm{\sigma}}^{2} \right]
$.
Maximum likelihood estimation is performed by minimizing the negative log-likelihood, which leads to the following optimization criterion \cite{Kendall2017}
\begin{equation}
    \mathcal{L}(\bm{w}) = \frac{1}{N}\sum_{i=1}^{N} \hat{\sigma}_{i}^{-2} \big\Vert \tilde{x}_{i} - \hat{x}_{i} \big\Vert^{2} + \log \hat{\sigma}_{i}^{2} ~ ,
    \label{eq:nll}
\end{equation}
where $ N $ is the number of pixels per image.
%In this case, $ \hat{\bm{\sigma}}^{2} $ captures the pixel-wise aleatoric uncertainty and is jointly estimated with $ \hat{\bm{x}} $ by finding $ \bm{\theta} $ that minimizes Eq.\,(\ref{eq:nll}) with SGD.
For numerical stability, Eq.\,(\ref{eq:nll}) is implemented such that the network directly outputs $ -\log \hat{\bm{\sigma}}^{2} $.

Next, we employ a MFVI approach to DIP by assuming that the variational posterior can be factorized as $ q_{\bm{\phi}}(\bm{w}) = \prod_{i=1}^{L} \mathcal{N}(w_{i} \given \mu_{i}, \sigma_{i}^{2}) $, with number of layers $ L $.
In each forward pass, the weights are sampled using reparameterization $ \bm{w} = \bm{\mu} + \bm{\sigma} \odot \bm{\epsilon} $ with $ \bm{\epsilon} \sim \mathcal{N}(\bm{0}, \bm{I}) $, where $ \odot $ denotes element-wise multiplication.
The variational parameters $ \bm{\phi} = \{ \bm{\mu}, \bm{\sigma} \} $ are optimized by minimizing the negative evidence lower bound (ELBO)
\begin{equation}
    \bm{\phi}^{\ast} = \argmin_{\bm{\phi}} \kl [q_{\bm{\phi}}(\bm{w}) \Given p(\bm{w})] - \mathbb{E}_{\bm{w} \sim q_{\bm{\phi}}} [ \log p(\mathcal{D} \given \bm{w}) ]
    \label{eq:elbo}
\end{equation}
using backpropagation without weight decay.
This effectively doubles the number of trainable parameters and is known as Bayes by backprop \cite{Blundell2015}.
%In case of a Gaussian prior, the first term in Eq.\,(\ref{eq:elbo}) is implemented in closed form as (cf.\ Appendix\,\ref{app:two_gaussians})
%\begin{equation}
%    \kl [q \Vert p] = \log \frac{\sigma_p}{\sigma_q} + \frac{\sigma_{q}^{2} + ( \mu_q - \mu_p )^{2}}{2 \sigma_{p}^{2}} - \frac{1}{2} ~ .
%    \label{eq:kl_two_gaussians}
%\end{equation}
The first term in Eq.\,(\ref{eq:elbo}) is usually approximated with MC integration by
%For any other prior distribution, we approximate the $ \kl $ divergence by
\begin{equation}
    \kl [q \Vert p] \approx \frac{1}{T} \sum_{i=1}^{T} \log q_{\bm{\phi}}(\bm{w}_{i}) - \log p(\bm{w}_{i}) ~ ,
    \label{eq:kl_mc}
\end{equation}
with $ T $ Monte Carlo samples $ \bm{w}_{i} $ drawn from the variational posterior $ q_{\bm{\phi}}(\bm{w}) $. In case of a Gaussian prior, it can be computed in closed form, which accelerates training by omitting the need for drawing MC samples (cf.\ Appendix\,\ref{app:two_gaussians}).
The second term in Eq.\,(\ref{eq:elbo}), the log likelihood, is implemented using Eq.\,(\ref{eq:nll}) in the same MC fashion:
%over draws from the posterior:
\begin{equation}
    -\mathbb{E}_{\bm{w} \sim q_{\bm{\phi}}} [ \log p(\mathcal{D} \given \bm{w}) ] \approx \frac{1}{T} \sum_{i=1}^{T} \bm{\sigma}^{-2}_{\bm{w}_{i}} \Vert \tilde{\bm{x}} - \hat{\bm{x}}_{\bm{w}_{i}} \Vert^{2} + \log \bm{\sigma}^{2}_{\bm{w}_{i}} ~ .
    \label{eq:nll_mc}
\end{equation}
%\citet{Graves2011} suggested to reweight the complexity term in the ELBO using a factor $ \beta $ to balance both terms in case of discrepancy between number of weights and training samples:
%\begin{equation}
%    \mathrm{ELBO}(q_{\bm{\phi}}(\bm{w})) = \mathbb{E}_{\bm{w} \sim q} [ \log p(\mathcal{D} \given \bm{w}) ] - \beta \kl [q_{\bm{\phi}}(\bm{w}) \Given p(\bm{w})] ~ .
%    \label{eq:elbo2}
%\end{equation}
%This controls the influence of the prior and selecting it is our main tool to prevent Bayesian DIP from overfitting the low-quality image.
After convergence, we obtain the high-quality image $ \mathbb{E} [ \hat{\bm{x}} ] $ and the accompanying pixel-wise uncertainty $ \mathrm{Var} [ \hat{\bm{x}} ] $ by MC sampling from the predictive posterior \cite{Kendall2017}:
\begin{equation}
    \mathbb{E}_{\bm{w} \sim q_{\bm{\phi}}} [ \hat{\bm{x}} ] \approx \frac{1}{T} \sum_{i=1}^{T} \hat{\bm{x}}_{\bm{w}_{i}} ~ , \quad \mathrm{Var}_{\bm{w} \sim q_{\bm{\phi}}} [ \hat{\bm{x}} ] \approx \frac{1}{T} \sum_{i=1}^{T} \left( \hat{\bm{x}}_{\bm{w}_{i}} - \frac{1}{T} \sum_{j=1}^{T} \hat{\bm{x}}_{\bm{w}_{j}} \right)^{2} + \frac{1}{T} \sum_{i=1}^{T} \hat{\bm{\sigma}}^{2}_{\bm{w}_{i}} ~ .
    \label{eq:mc_inference}
\end{equation}

\subsection{Prior Selection with Bayesian Optimization}

Instead of manually selecting the prior distribution over the weights of the DIP network using heuristics or inefficient grid search, we employ derivative-free BO.
BO allows us to optimize black-box functions that are expensive to evaluate, such as the training of a deep network \cite{Snoek2015}.
It uses a computationally inexpensive surrogate model to retrieve a distribution over functions.
In this work, we maximize the peak signal-to-noise ratio (PSNR) between the reconstruction $ \hat{\bm{x}} $ and the high-quality image $ \bm{x} $ as a function of the prior standard deviation $ \sigma_{p} $
\begin{equation}
    \max_{\sigma_{p} \in A} ~ f(\sigma_{p}) = \max_{\sigma_{p} \in A} ~  \mathrm{PSNR}(\hat{\bm{x}}_{\bm{\phi}}(\sigma_{p}), \bm{x})
    \label{eq:bo_objective}
\end{equation}
using a Gaussian process (GP) as surrogate $ f \sim \mathcal{GP} $.
It is also possible to directly optimize the shape parameters of the prior.
In each step of the BO, we evaluate our objective function $ f $ at the current candidate $ \sigma_{p}^{\ast} $ to increase the set of observations $ \mathcal{D}_{\mathrm{BO}} $ and update the posterior of the surrogate model.
Next, we maximize an acquisition function $ a(\sigma_{p} ; \mu_{\mathcal{GP}}, \sigma^{2}_{\mathcal{GP}}) $ using the current GP posterior mean $ \mu_{\mathcal{GP}} $ and variance $ \sigma^{2}_{\mathcal{GP}} $.
Its maximizing argument $ \sigma_{p}^{\ast} \leftarrow \argmax a(\sigma_{p} ; \mu_{\mathcal{GP}}, \sigma^{2}_{\mathcal{GP}}) $ is used as candidate for the next iteration \cite{Frazier2018}.
We choose the commonly accepted expected improvement (EI) as acquisition function
\begin{equation}
    a_{\mathrm{EI}}(\sigma_{p} ; \mu_{\mathcal{GP}}, \sigma^{2}_{\mathcal{GP}}) = \mathbb{E} \left[ \max (y - f^{\ast}, 0) \given y \sim \mathcal{N} ( \mu_{\mathcal{GP}}(\sigma_{p}), \sigma^{2}_{\mathcal{GP}}(\sigma_{p}) ) \right] ~ ,
    \label{eq:bo_ei}
\end{equation}
where $ f^{\ast} = f(\sigma_{p,\mathrm{best}}) $ is the maximal value of the objective function observed so far.
Eq.\,(\ref{eq:bo_ei}) can be solved analytically as shown in \cite{Jones1998}.
We utilize automatic differentiation from modern deep learning frameworks to optimize the acquisition function in order to get the next candidate $ \sigma_{p}^{\ast} $ \cite{Gardner2018}.

\section{Experiments}

We evaluate the performance of our MFVI approach on the following three inverse post-processing tasks and compare it to non-Bayesian DIP \cite{Lempitsky2018}, DIP with SGLD \cite{Cheng2019} and DIP with MC dropout \cite{Laves2020MCDIP}.
We apply BO to optimize the standard deviation $ \sigma_{p} $ of a Gaussian prior per task.
In the following experiments, we use the same network architectures as proposed by \cite{Lempitsky2018}.

\paragraph{Denoising}

Optical coherence tomography and ultrasound are prone to speckle noise due to interference phenomena, which can obscure small anatomical details and reduce image contrast.
%Although speckle patterns contain information about the tissue microstructure, denoising of such images is desirable because this information is imperceptible to a human observer.
Speckle noise can be modeled as additive white Gaussian noise on log-transformed image intensities \cite{Michailovich2006}. 
Noise in low-dose X-ray originates from irregular photon density and can be modeled with Poisson noise \cite{Lee2018,Zabic2013}.
We approximate the Poisson noise with Gaussian noise since $ \mathsf{Poisson(\lambda)} $ approaches a Normal distribution as $ \lambda \rightarrow \infty $.
We first create a low-noise image $ \bm{x} $ by smoothing and downsampling the original image to $ 256 \times 256 $ pixels.
This averages over highly correlated neighboring pixels affected by uncorrelated noise and decreases the observation noise.
%by sacrificing image resolution.
The downsampled image acts as ground truth and is corrupted by $ \tilde{\bm{x}} = \bm{x} + \mathcal{N}(\bm{0}, 0.1^{2}\bm{I}) $ using normal (X-ray) or log-transformed intensities (US and OCT).
We use retinal OCT scans and chest X-rays with native resolutions of $ 496 \times 496 $ and $ 1029 \times 1260 $ pixels from a public data set \cite{Kermany2018}.
%We compute the peak signal-to-noise ratio (PSNR) and the structural similarity (SSIM) of the denoised image $ \hat{\bm{x}} $.

\paragraph{Super-Resolution}

In CT and MRI, the sampling frequency is limited due to inherent physical limitations of the imaging utility, i.e.\ the pitch or spacing of the detector \cite{Greenspan2009}.
The resolution can be enhanced by reducing the size of detectors, but this comes at the expense of increased noise.
Since imaging devices are usually tuned towards low noise and short acquisition time, part of the resolution is sacrificed.
This motivates resolution-enhancing post-processing methods using a single image.
We use slices of
%chest CT scans from COVID-19 patients \cite{Zhao2020} and
T1-weighted in vivo whole brain MRI with isotropic resolution of 250\,\textmu m \cite{T1MRI2018}
from public data sets.
The $ 512 \times 448 $ pixel full-resolution images act as ground truth $ \bm{x} $ and are downsampled by a factor of 4 to obtain low-resolution images $ \tilde{\bm{x}} $.
The DIP network is optimized by applying a downsampling operator $ d : \mathbb{R}^{4H \times 4W} \rightarrow \mathbb{R}^{H \times W} $ to its output $ \hat{\bm{x}} $ and plugging $ d(\hat{\bm{x}}) $ into Eq.\,(\ref{eq:nll_mc}).
%Since there are many high-resolution images that reduce to the same low-resolution image, super-resolution is an ill-posed problem and choosing the downsampling operator $ d $ is far from surjective.
To use gradient-based optimization, the downsampling operator must be differentiable and we opt for a Lanczos kernel \cite{Duchon1979}.

\paragraph{Inpainting}

Applications of inpainting in medical imaging are hair removal in dermoscopy \cite{Abbas2011}, specular highlight removal in endoscopy \cite{Arnold2010}, or metal artifact removal in CT sinograms \cite{Peng2020} and MRI \cite{Armanious2020}.
In this paper, we focus on the first task and sample images from the HAM10000 data set \cite{Tschandl2018} showing different skin lesions with hair occlusions.
We manually mask the hair
%in a set of images
and optimize the ELBO while zero-weighting the masked pixels in the likelihood term.
The networks thus interpolate the masked areas.

\subsection{Results}

\begin{figure}
    \centering
    \includegraphics[width=5cm]{img/results_bo_den/fig_10.pdf} \hfill
    \includegraphics[width=5cm]{img/results_bo_sr/fig_4.pdf} \hfill
    \includegraphics[width=5cm]{img/results_bo_inp/fig_4.pdf}
    \caption{Results of Bayesian optimization. The acquisition function selects the next candidate for $ \sigma_p $ based on the maximum of the expected improvement.}
    \label{fig:bo}
\end{figure}

The results are presented as follows:
First, we use BO to optimize the weight prior standard deviation $ \sigma_p $ per task domain and use the optimal value in the subsequent experiments. 
Next, we show that all competing methods overfit the low-quality image given enough iterations.
Our method outperforms the other methods by means of reconstruction accuracy on all tasks and modalities after convergence when using an optimized weight prior and provides well-calibrated predictive uncertainty maps.
%MFVI needs less parameters to perform all mentioned tasks, which we show by pruning the weights based on their magnitude or signal-to-noise ratio ($ \mathrm{SNR} = \bm{\mu}_i /\bm{\sigma}_i $) in MFVI respectively.

Fig.\,\ref{fig:bo} shows that the optimal $ \sigma_p $ for denoising and super-resolution imposes a narrow prior with $ \sigma^{\ast}_{p,\mathrm{den}} = 0.05 $ and $ \sigma^{\ast}_{p,\mathrm{sr}} = 0.1 $, respectively.
In inpainting, the optimal value $ \sigma^{\ast}_{p,\mathrm{inp}} = 0.36 $ is slightly higher.
A narrow prior prevents weights from growing large, effectively avoiding overfitting of the corrupted image (note that we fixed $ \mu_{p} = 0 $).

This is empirically shown in Fig.\,\ref{fig:overfitting}, where DIP and DIP with SGLD strongly overfit the corrupted patterns, making manually applied early stopping essential to obtain the highest reconstruction accuracy (indicated by the narrow peaks).
Additionally, MC dropout overfits at some point, although the peak is wider and overfitting starts later in optimization.
While the PSNR between reconstruction $ \hat{\bm{x}} $ and ground truth $ \bm{x} $ approaches the PSNR between noisy image $ \tilde{\bm{x}} $ and ground truth for DIP and SGLD, MFVI safely converges to the optimal value in all modalities.
%Because of our fine-tuned prior with Bayesian optimization, we do even outperform all other methods with early stopping applied (see Tab. \ref{tab:results}).
%While the BO peaks in performance by means of PSNR in all tasks at a specific $ \sigma_p $ (see Fig.\ \ref{fig:bo}), it is important to note that MFVI does not show overfitting for other values of $ \sigma_p $ as well.
In super-resolution, overfitting is less severe.
MCDIP and MFVI do not overfit the low resolution image.
%While MCDIP did not show overfitting in our experiments, it can be expected that with sufficient enough iterations it will also exhibit overfitting.
DIP and SGLD do not show a sharp peak but rather decline slowly as shown in Fig.\,\ref{fig:overfitting} (right) and Fig.\,\ref{fig:qualitative}.
MFVI consistently provides well-calibrated pixel-wise uncertainty in denoising and super-resolution (see Tab.~\ref{tab:calib_app} in appendix).

%with the code provided by \citet{Cheng2019} employed.
For inpainting we restrict ourselves to a qualitative view onto the reconstruction results as all approaches converge to similar PSNR in the non-masked regions.
While the reconstructions of DIP and SGLD contain artifacts, MC dropout and MFVI produce very smooth reconstruction results.
In inpainting tasks, the uncertainty maps are especially interesting, which we expect to show high uncertainty in masked regions as the model does not receive information from these areas.
It can be seen in Fig.\,\ref{fig:inpainting_uncert_maps} that the reconstruction of MFVI exhibits high uncertainty in regions with hair, while showing lower uncertainty in the higher frequency regions of the chloasma.
In the region of the lesion, the uncertainty should be as low as possible, as it is important for the downstream task of classifying the skin lesion.

%Because of its over-parametrization training the DIP is comparably computational expensive. 
%Reducing the parameters posseses two advantages: the first being a speed-up in training and the second a possible compression if the number of needed parameters subceeds the number of pixels.
%We take a step towards assessing the compression ability of the different approaches by pruning the number of weights based on their magnitude or SNR respectively.
%The results are presented in Fig.\ ...
%It can be seen that MFVI generalizes best with the least parameters needed for the same PSNR levels.
%Future work can be conducted here by possibly performing uncertainty aware neural architecture search for DIP with MFVI.

\begin{figure}[p]
    \centering
    \includegraphics[width=5.0cm]{img/psnr_denoising_xray.pdf} \hfill
    \includegraphics[width=5.0cm]{img/psnr_denoising_oct.pdf} \hfill
    \includegraphics[width=5.0cm]{img/psnr_sr_mri1.pdf}
    \caption{Our MFVI approach with an optimized prior does not overfit. Plots show mean $ \pm 2 \times $ standard deviation from 3 runs with different random initialization.}
    \label{fig:overfitting}
\end{figure}

\begin{figure}[p]
    \centering
    \parbox{5.0cm}{\centering \small\textsf{SGLD} \\
    \includegraphics[width=4.9cm]{img/uncert_inpainting_SGLD_cm.pdf}} \hfill
    \parbox{5.0cm}{\centering \small\textsf{MC Dropout} \\
    \includegraphics[width=4.9cm]{img/uncert_inpainting_MCDIP_cm.pdf}} \hfill
    \parbox{5.0cm}{\centering \small\textsf{MFVI (ours)} \\
    \includegraphics[width=4.9cm]{img/uncert_inpainting_MFVI_cm.pdf}}
    \caption{MFVI shows high uncertainty in masked regions for inpainting, whereas the other methods show high uncertainty in regions important for the downstream task.
    %The reconstructions of the other approaches exhibit high uncertainty in regions important for the upcoming task of classifying skin lesions.
    }
    \label{fig:inpainting_uncert_maps}
\end{figure}

\begin{figure}[p]
    \setlength\tabcolsep{2pt}
	%\renewcommand{\arraystretch}{0}
    \centering
    \small
    \begin{tabular}{cccccc}
          & \textsf{input} & \textsf{DIP} & \textsf{SGLD} & \textsf{MC Dropout} & \textsf{MFVI (ours)} \\
        \raisebox{1.0cm}{\rotatebox[origin=c]{90}{\textsf{inpainting}}}
        & \includegraphics[trim=7mm 0 5mm 0,clip,width=2.8cm]{img/hair_1_small.png}
        & \includegraphics[width=2.8cm]{img/inp/inp_dip_skin1.png}
        & \includegraphics[width=2.8cm]{img/inp/inp_sgld_skin1.png}
        & \includegraphics[width=2.8cm]{img/inp/inp_mcd_skin1.png}
        & \includegraphics[width=2.8cm]{img/inp/inp_mfvi_skin1.png} \\
        \raisebox{1.4cm}{\rotatebox[origin=c]{90}{\textsf{super-resolution}}}
        & \includegraphics[trim=32 32 156 156,clip,width=2.8cm]{img/sr/img_139_small.png}
        & \includegraphics[trim=32 32 156 156,clip,width=2.8cm]{img/sr/sr_dip_mri1.png}
        & \includegraphics[trim=32 32 156 156,clip,width=2.8cm]{img/sr/sr_sgld_mri1.png}
        & \includegraphics[trim=32 32 156 156,clip,width=2.8cm]{img/sr/sr_mcd_mri1.png}
        & \includegraphics[trim=32 32 156 156,clip,width=2.8cm]{img/sr/sr_mfvi_mri1.png} \\
        \raisebox{1.4cm}{\rotatebox[origin=c]{90}{\textsf{denoising}}}
        & \includegraphics[width=2.8cm]{img/den/den_xray2_input.png}
        & \includegraphics[width=2.8cm]{img/den/den_dip_xray2.png}
        & \includegraphics[width=2.8cm]{img/den/den_sgld_xray2.png}
        & \includegraphics[width=2.8cm]{img/den/den_mcd_xray2.png}
        & \includegraphics[width=2.8cm]{img/den/den_mfvi_xray2.png}
    \end{tabular}
    \caption{Qualitative results for the different tasks after convergence.
    %The overfitting behavior of the methods is most visible in denoising.
    }
    \label{fig:qualitative}
\end{figure}

% \section{Related Work}
% 
% Deep learning based methods have seen increased use for medical image post-processing or tomographic reconstruction.
%Most approaches rely on supervised learning from data sets of input and output image pairs.
% Generative adversarial networks (GANs) are popular among enhancement methods in the image domain, such as noise reduction in low-dose CT \cite{Wolterink2017}.
% More recent works have adapted to more advanced networks, such as conditional Wasserstein GANs \cite{Yi2018}.
% Usually, the networks are trained using data pairs of low-dose and clinical-dose CTs from the same patient.
% Bayesian approach have been used by applying variational dropout to subpixel convolutional networks for MRI super-resolution \cite{Tanno2017} and Bayesian inversion with conditional Wasserstein GANs for upsampling extreme low-dose CT \cite{Adler2019}.
% %
% The first method using CNNs for CT image reconstruction from sinograms was AUTOMAP, which directly maps information from the sensor-domain to image-domain using supervised learning \cite{Zhu2018}.
% Deep image prior in a non-Bayesian setting was used for unsupervised CT reconstruction \cite{Baguer2020}.
% The sinograms are fed into a CNN and the reconstruction is transformed back using Radon transform to compute the loss between input and output.
% GANs have also been used for de-aliasing in MRI reconstruction \cite{Yang2018} and quality enhancement in low-dose PET imaging \cite{Wang2018}.

\section{Conclusion}

We presented a mean-field variational inference approach to deep image prior and optimized the weight prior using Bayesian optimization.
Bayesian methods are in general more robust to overfitting due to their inbuilt regularization from the weight prior.
However, a badly selected prior can still cause overfitting (as shown empirically for SGLD and MC dropout).
MFVI allows for a more detailed prior selection, which we exploit to optimize the prior using Bayesian optimization.
Selecting a suitable prior fixes the overfitting behavior of DIP-based approaches, which are generally interesting for medical imaging, as no supervised training is required.
Different inverse post-processing tasks in medical imaging were performed to show the benefits of the proposed method.
BO was used to optimize the prior towards reconstruction accuracy.
Even if early stopping is applied to the other methods, our approach performs on par with respect to reconstruction accuracy and yields well-calibrated uncertainties.
Furthermore, the prior can additionally be optimized with respect to a calibration metric to ensure well-calibrated uncertainty maps.
The presented approach is not limited to post-processing tasks and can also be used for CT or MRI reconstruction from sinograms.

\midlacknowledgments{MT is supported by Informatics for Life funded by the Klaus Tschira Foundation, ML and AS received funding from Interdisciplinary Competence Center for Interface Research (ICCIR).}

\clearpage

\bibliography{toelle21}

\clearpage

\appendix

\section{KL Divergence Between Two Gaussians}
\label{app:two_gaussians}

If a Gaussian prior is chosen for convenience, the KL divergence is analytically tractable (cf.\ Eq.\,(\ref{eq:kl_mc})).
Let $ p(x) = \mathcal{N}(\mu_{p}, \sigma_{p}^{2}) $ and $ q(x) = \mathcal{N}(\mu_{q}, \sigma_{q}^{2}) $.
It is known that
\begin{align*}
    \kl [ q(x) \Given p(x) ] &= \int q(x) \log \frac{q(x)}{p(x)} \, \mathrm{d}x = \int q(x) \log q(x) \, \mathrm{d}x - \int q(x) \log p(x) \, \mathrm{d}x \\
      &= - \frac{1}{2} \left( 1 + \log 2 \pi \sigma_{q}^{2} \right) + \frac{1}{2} \log 2 \pi \sigma_{p}^{2} + \frac{\sigma_{q}^{2} + ( \mu_{q} - \mu_{p})^{2}}{2\sigma_{p}^{2}} \\
      &= \log \frac{\sigma_p}{\sigma_q} + \frac{\sigma_{q}^{2} + ( \mu_q - \mu_p )^{2}}{2 \sigma_{p}^{2}} - \frac{1}{2} ~ .
\end{align*}

\section{Computational Complexity of MFVI}

Prior selection using Bayesian optimization is performed offline and does not have to be repeated for each image at hand.
Therefore, increased complexity can be attributed to the parameter sampling of MFVI.
In each forward pass, the additional steps are (1) drawing $ n $ samples from a univariate Gaussian, where $ n $ is the number of parameters of the convolutional autoencoder and (2) reparameterization of the actual parameters by $ w_{i} = \mu_{i} + \sigma_{i} \epsilon_{i} $, which results in $ n $ additional multiplications and additions.
In our experiments, this results in $ \approx 2 \times $ slower forward pass times and $ 2 \times $ increased memory footprint.
Relative wall times for the denoising task were $ 1.0 $ for non-Bayesian DIP, $ 1.13 $ for MC dropout, $ 2.20 $ for MFVI and $ 2.76 $ for SGLD.

\section{Additional Figures}

\begin{figure}[h]
    \setlength\tabcolsep{2pt}
	%\renewcommand{\arraystretch}{0}
    \centering
    \small
    \begin{tabular}{cccccc}
          & \textsf{input} & \textsf{DIP} & \textsf{SGLD} & \textsf{MC Dropout} & \textsf{MFVI (ours)} \\
         \raisebox{1.4cm}{\rotatebox[origin=c]{90}{\textsf{denoising}}} & \includegraphics[width=2.8cm]{img/den/den_input_us2.png} & \includegraphics[width=2.8cm]{img/den/den_dip_us2.png} & \includegraphics[width=2.8cm]{img/den/den_sgld_us2.png} & \includegraphics[width=2.8cm]{img/den/den_mcd_us2.png} & \includegraphics[width=2.8cm]{img/den/den_mfvi_us2.png} \\
         \raisebox{1.0cm}{\rotatebox[origin=c]{90}{\textsf{inpainting}}} & \includegraphics[trim=14mm 0 13mm 0,clip,width=2.8cm]{img/inp/hair_0.png} & \includegraphics[width=2.8cm]{img/inp/inp_dip_skin0.png} & \includegraphics[width=2.8cm]{img/inp/inp_sgld_skin0.png} & \includegraphics[width=2.8cm]{img/inp/inp_mcd_skin0.png} & \includegraphics[width=2.8cm]{img/inp/inp_mfvi_skin0.png}
    \end{tabular}
    \caption{Additional qualitative results for US denoising and hair inpainting after convergence. The reconstructions from MC dropout and MFVI look most valid, while MC dropout overly smooths important details (cf.\ texture of skin lesion).}
    \label{fig:qualitative_app}
\end{figure}

\begin{figure}[ht]
    \setlength\tabcolsep{2pt}
	%\renewcommand{\arraystretch}{0}
    \centering
    \small
    \begin{tabular}{ccccc}
          & \textsf{input} & \textsf{ground truth} & \textsf{non-DIP} & \textsf{MFVI (ours)} \\
         \raisebox{1.2cm}{\rotatebox[origin=c]{90}{\textsf{inpainting}}} & \includegraphics[trim=14mm 0 13mm 0,clip,width=3.4cm]{img/inp/hair_0.png} & \includegraphics[width=3.4cm]{img/inp/hair_0_mask.png} & \includegraphics[width=3.4cm]{img/inp/inp_biharmonic_skin0.png} &  \includegraphics[width=3.4cm]{img/inp/inp_mfvi_skin0.png} \\
         \raisebox{1.2cm}{\rotatebox[origin=c]{90}{\textsf{inpainting}}} & \includegraphics[trim=14mm 0 13mm 0,clip,width=3.4cm]{img/hair_1.png} & \includegraphics[trim=14mm 0 13mm 0,clip,width=3.4cm]{img/inp/hair_1_mask.png} & \includegraphics[width=3.4cm]{img/inp/inp_biharmonic_skin1.png} &  \includegraphics[width=3.4cm]{img/inp/inp_mfvi_skin1.png} \\
         \raisebox{1.5cm}{\rotatebox[origin=c]{90}{\textsf{denoising}}} & \includegraphics[width=3.4cm]{img/den/den_input_us2.png} & \includegraphics[width=3.4cm]{img/den/196_HC.png} & \includegraphics[width=3.4cm]{img/den/den_aniso_us2.png} &  \includegraphics[width=3.4cm]{img/den/den_mfvi_us2.png} \\
         \raisebox{1.5cm}{\rotatebox[origin=c]{90}{\textsf{denoising}}} & \includegraphics[width=3.4cm]{img/den/den_xray2_input.png} & \includegraphics[width=3.4cm]{img/den/VIRUS-9815549-0001.png} & \includegraphics[width=3.4cm]{img/den/den_aniso_xray2.png} &  \includegraphics[width=3.4cm]{img/den/den_mfvi_xray2.png} \\
         \raisebox{1.5cm}{\rotatebox[origin=c]{90}{\textsf{super-resolution}}} & \includegraphics[trim=32 32 156 156,clip,width=3.4cm]{img/sr/img_139_small.png} & \includegraphics[trim=32 32 156 156,clip,width=3.4cm]{img/sr/img_139_res384.png} & \includegraphics[trim=32 32 156 156,clip,width=3.4cm]{img/sr/sr_bilinear_mri1.png} & \includegraphics[trim=32 32 156 156,clip,width=3.4cm]{img/sr/sr_mfvi_mri1.png}
    \end{tabular}
    \caption{Non-DIP method comparison. The non-DIP algorithms are biharmonic functions for inpainting \cite{Damelin2018}, anisotropic diffusion for denoising \cite{Perona1990} and bilinear interpolation for super-resolution.}
    \label{fig:non_dip_app}
\end{figure}

\clearpage

\begin{figure}[h]
    \centering
    \includegraphics[width=5.0cm]{img/psnr_denoising_us.pdf} \quad
    \includegraphics[width=5.0cm]{img/ssim_sr_mri.pdf}
    \caption{(Left) Additional PSNR curve for denoising on US. (Right) Results for super-resolution measured using structural similarity index measure (SSIM).}
    \label{fig:ssim_app}
\end{figure}

\section{Uncertainty Calibration}

\begin{table}[h]
    \centering
    \caption{Uncertainty calibration error (UCE) \cite{Laves2020} for denoising and super-resolution experiments. The UCE describes the expected discrepancy between pixel-wise error and uncertainty of the reconstructions.}
    \begin{tabular}{cccc}
        \toprule
                          & SGLD  & MCDIP & MFVI (ours) \\
        \midrule
        denoising (X-ray) & 0.915 & 0.258 & \textbf{0.093} \\
        denoising (OCT)   & 0.815 & 0.144 & \textbf{0.073} \\
        denoising (US)    & 0.799 & 0.309 & \textbf{0.134} \\
        super-res. (MRI)  & \textbf{0.012} & 0.349 & 0.069 \\
        \bottomrule
    \end{tabular}
    \label{tab:calib_app}
\end{table}

\section{Illustration of Mathematical Concept}

\begin{figure}[ht]
    \centering
    \includegraphics{img/concept.pdf}
    \caption{Illustration of the mathematical concept behind MFVI DIP.}
    \label{fig:concept_app}
\end{figure}

\paragraph{Pseudocode of MFVI DIP}

\begin{enumerate}
    \itemsep0em 
    
    \item Sample input $ \bm{z}' \sim \mathcal{U}(0,0.1) $
    \item While $ i < i_{\mathrm{max}} $ do
    \begin{enumerate}
        \item Perturb input $ \bm{z} = \bm{z}' + \mathcal{N}(0,0.01) $
        \item Sample $ \bm{\epsilon} \sim \mathcal{N}(\bm{0},\bm{I}) $
        \item Let $ \bm{w} = \bm{\mu} + \bm{\sigma} \odot \bm{\epsilon} $ with variational parameters $ \bm{\phi} = \{\bm{\mu}, \bm{\sigma}\} $
        %\item Let $ \phi = \{\bm{\mu}, \bm{\sigma}\} $ be our variational parameters
        \item Compute loss $ \textrm{ELBO}\left( \bm{f}_{\bm{w}}(\bm{z}) \right) = \log q_{\bm{\phi}}(\bm{w}) - \log p(\bm{w}) - \log p(\mathcal{D}\given\bm{w}) $
        \item Compute the gradient w.r.t.\ the mean and standard deviation
        \begin{align*}
            \Delta_{\bm{\mu}} &= \frac{ \partial \textrm{ELBO}\left( \bm{f}_{\bm{w}}(\bm{z}) \right) }{ \partial \bm{w} } + \frac{ \partial \textrm{ELBO}\left( \bm{f}_{\bm{w}}(\bm{z}) \right) }{ \partial \bm{\mu} } \\
            \Delta_{\bm{\sigma}} &= \frac{ \partial \textrm{ELBO}\left( \bm{f}_{\bm{w}}(\bm{z}) \right) }{ \partial \bm{w} } \odot \bm{\epsilon} + \frac{ \partial \textrm{ELBO}\left( \bm{f}_{\bm{w}}(\bm{z}) \right) }{ \partial \bm{\sigma} }
        \end{align*}
        \item Update the variational parameters $ \bm{\phi} $
        \begin{align*}
            \bm{\mu} &\leftarrow \bm{\mu} - \eta \Delta_{\bm{\mu}} \\
            \bm{\sigma} &\leftarrow \bm{\sigma} - \eta \Delta_{\bm{\sigma}}
        \end{align*}
        \item $ i \leftarrow i + 1 $
    \end{enumerate}
\end{enumerate}

\section{Background on Bayesian Deep Learning}
\label{app:bdl}

In Bayesian deep learning, a prior distribution $ p(\bm{w} \given \alpha) $ is placed over the weights $ \bm{w} $ of a neural network, governed by a hyperparameter $ \alpha $.
After observing the data $ \mathcal{D} $, we are interested in the posterior $ p(\bm{w} \given \mathcal{D}, \alpha) = p(\mathcal{D} \given \bm{w}, \alpha) p(\bm{w} \given \alpha) / p(\mathcal{D}) $.
However, this distribution is not tractable in general.
% as the normalizing factor involves marginalization of the model likelihood over the prior $ p(\mathcal{D}) = \int p(\mathcal{D} \given \bm{w}, \alpha) p(\bm{w} \given \alpha) \, \mathrm{d}\bm{w} $.
%Consequently, the posterior predictive distribution is intractable as well.
This gives rise to different approximate Bayesian inference techniques that rely on either sampling or VI.
SGLD is a framework that derives a Markov chain Monte Carlo (MCMC) sampler from SGD by injecting Gaussian noise into the gradients after each learning step \cite{Welling2011}.
Under suitable conditions
%(i.e.\ variance of injected noise and learning rate decay),
SGLD eventually converges to the posterior distribution.
%
%In VI, we try to find a simpler, variational approximation to the Bayesian posterior distribution.
VI uses optimization instead of sampling to find the member $ q_{\bm{\phi}}(\bm{w}) $, defined by the variational parameters $ \bm{\phi} $, of a family of distributions (e.g.\ multivariate Gaussians) that is closest to the exact posterior.
We optimize $ q_{\bm{\phi}} $ w.r.t.\ $ \bm{\phi} $, such that the Kullback--Leibler divergence between $ q_{\bm{\phi}} $ and the true posterior is minimized \cite{Blei2017}.
Two practical implementations are MC dropout \cite{Gal2015Bernoulli} and Bayes by backprop \cite{Blundell2015}.
The former uses dropout before every weight layer during training and at inference time, which allows sampling from the approximate posterior.
The latter assumes a fully factorized Gaussian distribution $ w_{ij} \sim \mathcal{N}(\mu_{ij}, \sigma^{2}_{ij}) $, also known as mean-field distribution, which treats the mean and variance of each weight as learnable parameters.
%In SGLD and MC dropout, the variance of a Gaussian prior is implicitly controlled by weight decay.
In contrast to SGLD and MC dropout, MFVI allows us to directly compute the KL divergence between the variational posterior and the prior, which enables us to select other (non-Gaussian) prior distributions, for which no closed-form expression of the KL divergence exists.

\end{document}
