%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%%% my
% Upright operator names used in the proofs below. Each macro typesets its
% own name as a math operator (\DeclareMathOperator comes from amsmath,
% which mathtools loads).
\DeclareMathOperator{\Dropout}{Dropout}
\DeclareMathOperator{\E}{E}                  % expectation
\DeclareMathOperator{\Var}{Var}              % variance
\DeclareMathOperator{\Bernoulli}{Bernoulli}  % distribution of the dropout mask entries
\DeclareMathOperator{\DropoutAr}{DropoutAr}          % generalized dropout (see its definition in the appendix)
\DeclareMathOperator{\DropoutArPoly}{DropoutArPoly}  % polynomial generalization of DropoutAr
\DeclareMathOperator{\ReLU}{ReLU}
\DeclareMathOperator{\GAP}{GAP}
% Phase labels; \train is used as a subscript, e.g. \Dropout_{\train}.
\DeclareMathOperator{\train}{train}
\DeclareMathOperator{\test}{test}
\usepackage{amsthm}  % proof
%\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{proposition}{Proposition}
\usepackage{amsfonts}  % mathbb
\usepackage{bbm}  % mathbbm

% custompro: a Proposition whose number is supplied explicitly, e.g.
%   \begin{custompro}{6} ... \end{custompro}
% is headed "Proposition 6". The helper theorem innercustompro does the
% typesetting; its printed counter (\theinnercustompro) is overridden with
% the environment argument just before the theorem is opened.
% NOTE(review): presumably used so that numbering continues from the main
% paper -- confirm against the main text.
\newtheorem{innercustompro}{Proposition}
\newenvironment{custompro}[1]
  {\renewcommand\theinnercustompro{#1}\innercustompro}
  {\endinnercustompro}

% customdef: same mechanism as custompro, but for Definitions, e.g.
%   \begin{customdef}{3} ... \end{customdef}
% is headed "Definition 3".
\newtheorem{innercustomdef}{Definition}
\newenvironment{customdef}[1]
  {\renewcommand\theinnercustomdef{#1}\innercustomdef}
  {\endinnercustomdef}

%%% python code
\usepackage{listings}
\usepackage{xcolor}

\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}

% Listing style "mystyle": tinted background (backcolour), colored comments,
% keywords and strings, \ttfamily\footnotesize base font, tiny gray line
% numbers on the left, long lines wrapped (breaklines), whitespace kept as
% typed (keepspaces) but not drawn visibly, and the caption placed below
% the listing (captionpos=b).
\lstdefinestyle{mystyle}{
    backgroundcolor=\color{backcolour},   
    commentstyle=\color{codegreen},
    keywordstyle=\color{magenta},
    numberstyle=\tiny\color{codegray},
    stringstyle=\color{codepurple},
    basicstyle=\ttfamily\footnotesize,
    breakatwhitespace=false,         
    breaklines=true,                 
    captionpos=b,                    
    keepspaces=true,                 
    numbers=left,                    
    numbersep=5pt,                  
    showspaces=false,                
    showstringspaces=false,
    showtabs=false,                  
    tabsize=2
}

\lstset{style=mystyle}
%%%

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
%\usepackage{xr}
%\externaldocument{kim_84}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; <sep> is optional and
% defaults to "-". Not used anywhere in the visible source.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

%\title{Title in Title Case\\(Supplementary Material)}
\title{How to Use Dropout Correctly on Residual Networks\\with Batch Normalization\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors

\author[1]{Bum Jun Kim}
\author[1]{Hyeyeon Choi}
\author[1]{Hyeonah Jang}
\author[1]{Donggeon Lee}
\author[1]{\href{mailto:<swkim@postech.edu>?Subject=Your UAI 2023 paper}{Sang Woo Kim}{}}
%\author[1]{Sang Woo Kim}
% Add affiliations after the authors
\affil[1]{%
    Department of Electrical Engineering\\
    Pohang University of Science and Technology\\
    Pohang, South Korea
}


%\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Jane~J.~von~O'L\'opez}{}}
%\author[1]{Harry~Q.~Bovik}
%\author[1,2]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[1]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
%% Add affiliations after the authors
%\affil[1]{%
%    Computer Science Dept.\\
%    Cranberry University\\
%    Pittsburgh, Pennsylvania, USA
%}
%\affil[2]{%
%    Second Affiliation\\
%    Address\\
%    …
%}
%\affil[3]{%
%    Another Affiliation\\
%    Address\\
%    …
%  }
  
  \begin{document}
  
\onecolumn %% Turn this off if a two-column layout is desired for the supplement
\maketitle


\appendix
\section{Proof of Proposition 2}

\begin{proof}
	The variances of PostDropout and PreDropout are represented as follows:
	\begin{align}
		\Var[\Dropout_{\textrm{train}}(\mathbf{W}\mathbf{x})]  & = \E[(\Dropout_{\textrm{train}}(\mathbf{W}\mathbf{x}))^2] - (\E[\Dropout_{\textrm{train}}(\mathbf{W}\mathbf{x})])^2, \label{eq:predropout1}    \\
		\Var[\mathbf{W} \Dropout_{\textrm{train}}(\mathbf{x})] & = \E[(\mathbf{W} \Dropout_{\textrm{train}}(\mathbf{x}))^2] - (\E[\mathbf{W} \Dropout_{\textrm{train}}(\mathbf{x})])^2. \label{eq:postdropout1}
	\end{align}
	We note that $\E[\Dropout_{\textrm{train}}(\mathbf{W}\mathbf{x})] = \E[\mathbf{W} \Dropout_{\textrm{train}}(\mathbf{x})]$. Thus, we investigate the difference between the first terms in Eqs. \ref{eq:predropout1} and \ref{eq:postdropout1}.

	First, as PostDropout drops columns, we have
	\begin{align}
		\E[(\Dropout_{\textrm{train}}(\mathbf{W}\mathbf{x}))^2] & = \frac{1}{p^2} \E[m_{i,i}^2 (w_{i,1} x_1 + \cdots + w_{i,n} x_n)^2]                                                                             \\
		                                                        & = \frac{1}{p^2} \E[m_{i,i}^2 (\sum_{j=1}^n w_{i,j} x_j)^2]                                                                                       \\
		                                                        & = \frac{1}{p} \sum_{j=1}^n w_{i,j}^2 \E[x_j^2] + \frac{1}{p} \sum_{j=1}^n \sum_{k \neq j}^n w_{i,j}w_{i,k} \E[x_j x_k ]. \label{eq:postdropout2}
	\end{align}
	Secondly, as PreDropout drops rows, we obtain
	\begin{align}
		\E[(\mathbf{W} \Dropout_{\textrm{train}}(\mathbf{x}))^2] & = \frac{1}{p^2} \E[(m_{1,1} w_{i,1} x_1 + \cdots + m_{n,n} w_{i,n} x_n)^2]                                                          \\
		                                                         & = \frac{1}{p^2} \E[(\sum_{j=1}^n m_{j,j} w_{i,j} x_j)^2]                                                                            \\
		                                                         & = \frac{1}{p^2} \E[\sum_{j=1}^n m_{j,j}^2 w_{i,j}^2 x_j^2 + \sum_{j=1}^n \sum_{k \neq j}^n m_{j,j}m_{k,k}w_{i,j}w_{i,k}x_j x_k ]    \\
		                                                         & = \frac{1}{p} \sum_{j=1}^n w_{i,j}^2 \E[x_j^2] + \sum_{j=1}^n \sum_{k \neq j}^n w_{i,j}w_{i,k} \E[x_j x_k ]. \label{eq:predropout2}
	\end{align}

	Note that this difference arises from the fact that $\E[m_{i,i}^2]=p$ and $\E[m_{i,i}m_{j,j}]=p^2$ for $i \neq j$. Intuitively, PostDropout uses a single mask; therefore, the multiplication of the two masks is equivalent to the original. However, PreDropout uses multiple masks, and the multiplication of the two masks results in a new mask with $\Bernoulli(p^2)$.

	In summary, the difference between Eqs. \ref{eq:postdropout2} and \ref{eq:predropout2} comes from the second term and is $\frac{1-p}{p} \sum_{j=1}^n \sum_{k \neq j}^n w_{i,j}w_{i,k} \E[x_j x_k ]$. Thus, we have
	\begin{align}
		\E[(\Dropout_{\textrm{train}}(\mathbf{W}\mathbf{x}))^2] > \E[(\mathbf{W} \Dropout_{\textrm{train}}(\mathbf{x}))^2],
	\end{align}
	if and only if $\sum_{j=1}^n \sum_{k \neq j}^n w_{i,j}w_{i,k} \E[x_j x_k ] > 0$. This proves the first inequality $\Delta(\Dropout(\mathbf{W}\mathbf{x})) < \Delta(\mathbf{W}\Dropout(\mathbf{x}))$.

	Next, we investigate the second inequality. We know that
	\begin{align}
		\Var[\mathbf{W} \Dropout_{\textrm{test}}(\mathbf{x})] = \E[(\mathbf{W}\mathbf{x})^2] - (\E[\mathbf{W}\mathbf{x}])^2.
	\end{align}

	We compare these two terms with those in Eq. \ref{eq:postdropout1}. The second term is equal to the second term in Eq. \ref{eq:postdropout1}. Note that
	\begin{align}
		\E[(\mathbf{W}\mathbf{x})^2] & = \sum_{j=1}^n w_{i,j}^2 \E[x_j^2] + \sum_{j=1}^n \sum_{k \neq j}^n w_{i,j}w_{i,k} \E[x_j x_k ]            \\
		                             & < \frac{1}{p}\sum_{j=1}^n w_{i,j}^2 \E[x_j^2] + \sum_{j=1}^n \sum_{k \neq j}^n w_{i,j}w_{i,k} \E[x_j x_k ] \\
		                             & = \E[(\mathbf{W} \Dropout_{\textrm{train}}(\mathbf{x}))^2].
	\end{align}

	Thus, $\Var[\mathbf{W} \Dropout_{\textrm{test}}(\mathbf{x})] < \Var[\mathbf{W} \Dropout_{\textrm{train}}(\mathbf{x})]$, proving that $\Delta(\mathbf{W}\Dropout(\mathbf{x}))<1$.
\end{proof}



\section{Proof of Proposition 5}

\begin{proof}
	GAP averages the feature map $\mathbf{x}$ in the spatial direction as $[\GAP(\mathbf{x})]_k = \frac{1}{HW}\sum_{i,j}^{H,W} x_{k,i,j}$. For simplicity, we regard the $i,\ j$ axes as a single $a$ axis and denote the $k$th element of GAP from $\mathbf{x} \in \mathbb{R}^{K \times A}$ as
	\begin{align}
		[\GAP(\mathbf{x})]_k = \frac{1}{A}\sum_{a}^{A} x_{k,a},
	\end{align}
	where $A=HW$. In vector form, $\GAP(\mathbf{x}) = \frac{1}{A} \mathbf{x} \cdot \mathbbm{1}^\top$ where $\mathbbm{1}=[1,\ 1,\ \cdots,\ 1] \in \mathbb{R}^{A}$. Now, dropout at H4 and H5 can be, respectively, written as:
	\begin{align}
		[\GAP(\Dropout_{\train}(\mathbf{x}))]_k & = \frac{1}{A} \sum_{a} \Bigl( \frac{1}{p} m_{k,a} x_{k,a} \Bigr), \\
		[\Dropout_{\train}(\GAP(\mathbf{x}))]_k & = \frac{1}{p} m_{k} \Bigl( \frac{1}{A} \sum_{a} x_{k,a} \Bigr).
	\end{align}
	The two equations indicate that dropout before the GAP masks each element of the feature map $\mathbf{x}$, whereas dropout after the GAP masks each channel of the feature map $\mathbf{x}$. This property enables a similar derivation in the proof of Proposition 2 as follows:
	\begin{align}
		\E[(\sum_{a} m_{k,a} x_{k,a})^2] & = \E[\sum_{a} m_{k,a}^2 x_{k,a}^2 + \sum_{a} \sum_{b \neq a} m_{k,a}m_{k,b}x_{k,a} x_{k,b} ] \\
		                                & = p \sum_{a} \E[x_{k,a}^2] + p^2 \sum_{a} \sum_{b \neq a} \E[x_{k,a} x_{k,b} ],              \\
		\E[(\sum_{a} m_{k} x_{k,a})^2]  & = \E[ m_{k}^2 (\sum_{a} x_{k,a}^2 + \sum_{a} \sum_{b \neq a} x_{k,a} x_{k,b} )]              \\
		                                & = p \sum_{a} \E[x_{k,a}^2] + p \sum_{a} \sum_{b \neq a} \E[x_{k,a} x_{k,b} ].
	\end{align}
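	Thus, $\E[(\sum_{a} m_{k,a} x_{k,a})^2] < \E[(\sum_{a} m_{k} x_{k,a})^2]$ if and only if $\sum_{a} \sum_{b \neq a} \E[x_{k,a} x_{k,b} ] > 0$. Note that because $\mathbf{x}$ is the output of an activation function such as ReLU or ReLU6, every element of $\mathbf{x}$ is non-negative, so each $\E[x_{k,a} x_{k,b}] \geq 0$ and the condition holds unless all cross terms vanish. Since $\E[m_{k,a}] = \E[m_{k}] = p$, the two expressions have the same mean; hence, this proves that the variance at H4 is smaller than at H5.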
\end{proof}



\section{Design of Other Dropout Operation}
In our analysis, we used the properties of the Bernoulli distribution and corresponding mask matrix $\mathbf{M}$. One could think that if we design another mask matrix using a different distribution and employ it to modify dropout, then the dropout would not demonstrate variance inconsistency. For example, rather than a simple turn-off operation, using attenuation and amplification would have more degrees of freedom, which could enable us to find dropout without any inconsistency. Considering this, we seek a new general design for the Dropout operation.
%\begin{definition}
\begin{customdef}{3}
	For an $n$-dimensional vector $\mathbf{x}$, we define the \textit{DropoutAr} operation as:
	\begin{align}
		\DropoutAr_{\textrm{train}}(\mathbf{x}) & = \mathbf{A} \mathbf{x}, \\
		\DropoutAr_{\textrm{test}}(\mathbf{x})  & = \mathbf{x},
	\end{align}
	where $\mathbf{A}$ is an $n \times n$ diagonal matrix with $a_{i,j} = 0$ for $i \neq j$, and $a_{i,j}$ is sampled from an \textbf{arbitrary} distribution for $i=j$, independent of $\mathbf{x}$.
\end{customdef}
%\end{definition}
DropoutAr is a generalization of Dropout; Dropout is a special case of DropoutAr obtained by choosing $\mathbf{A}=\mathbf{M}/p$. For example, we could choose a Gaussian distribution to generate real numbers that serve as attenuation and amplification for each element of the vector $\mathbf{x}$. From this generalization, we investigate the form of a mask matrix that exhibits consistency in the training and test phases.

However, we find that this is unrealistic.
%\begin{proposition} \label{pro:dropoutar}
\begin{custompro}{6} \label{pro:dropoutar}
	DropoutAr cannot exhibit both mean and variance consistencies simultaneously for any $\mathbf{A}$, except for the identity matrix.
\end{custompro}
%\end{proposition}

\begin{proof}
	We prove this proposition by observing that mean consistency results in variance inconsistency. If DropoutAr has a mean consistency, we obtain $\E[\mathbf{A} \mathbf{x}] = \E[\mathbf{x}]$. Because each element of $\mathbf{A}$ is sampled independently of $\mathbf{x}$, we obtain $\E[\mathbf{A}]=1$. Now, we investigate the variance of DropoutAr:
	\begin{align}
		\Var[\mathbf{A}\mathbf{x}] & = \E[(\mathbf{A}\mathbf{x})^2] - (\E[\mathbf{A}\mathbf{x}])^2                                      \\
		                           & = \E[\mathbf{A}^2] \E[\mathbf{x}^2] - (\E[\mathbf{A}])^2 (\E[\mathbf{x}])^2                        \\
		                           & = (\Var[\mathbf{A}] + (\E[\mathbf{A}])^2) \E[\mathbf{x}^2] - (\E[\mathbf{A}])^2 (\E[\mathbf{x}])^2 \\
		                           & = (\Var[\mathbf{A}] + 1) \E[\mathbf{x}^2] - (\E[\mathbf{x}])^2                                     \\
		                           & > \E[\mathbf{x}^2] - (\E[\mathbf{x}])^2                                                            \\
		                           & = \Var[\mathbf{x}].
	\end{align}
	Note that if $\E[\mathbf{A}]=1$ and $\mathbf{A}$ is not the identity matrix, then $\Var[\mathbf{A}]>0$. Thus, we obtain $\Var[\mathbf{A}\mathbf{x}] > \Var[\mathbf{x}]$. Therefore, we conclude that $\Delta(\DropoutAr(\mathbf{x})) < 1$; hence, DropoutAr cannot exhibit variance consistency as long as it is not the identity operator.
\end{proof}

One could think that this simple linear equation lacks sufficient degrees of freedom. We now consider using a polynomial function.

%\begin{definition}
\begin{customdef}{4}
	For an $n$-dimensional vector $\mathbf{x}$, we define the \textit{DropoutArPoly} operation as:
	\begin{align}
		[\DropoutArPoly_{\textrm{train}}(\mathbf{x})]_i & = \sum_{k=0}^d a_{i,k} x_i^k, \\
		\DropoutArPoly_{\textrm{test}}(\mathbf{x})      & = \mathbf{x},
	\end{align}
	where $a_{i,k}$ is sampled from an arbitrary distribution and is independent of $\mathbf{x}$.
\end{customdef}
%\end{definition}
DropoutArPoly is a further generalization of DropoutAr. Similarly, we propose the following:
%\begin{proposition} \label{pro:dropoutarpoly}
\begin{custompro}{7} \label{pro:dropoutarpoly}
	DropoutArPoly cannot exhibit both mean and variance consistencies simultaneously as long as DropoutArPoly is not the identity operator.
\end{custompro}
%\end{proposition}

\begin{proof}
	We prove this proposition by observing that having both mean and variance consistencies simultaneously leads to the identity operator. First, during the training phase, the mean of DropoutArPoly is
	\begin{align}
		\E[a_{i,0} + a_{i,1}x_i + a_{i,2}x_i^2 + \cdots + a_{i,d}x_i^d] = \E[a_{i,0}] + \E[a_{i,1}] \E[x_i] + \E[a_{i,2}] \E[x_i^2] + \cdots + \E[a_{i,d}] \E[x_i^d].
	\end{align}
	For DropoutArPoly, obtaining the mean consistency on an arbitrary $\mathbf{x}$ requires
	\begin{align}
		\E[a_{i,1}] & = 1, \label{eq:arpoly1}                                       \\
		\E[a_{i,0}] & = \E[a_{i,2}] = \cdots = \E[a_{i,d}] = 0.  \label{eq:arpoly2}
	\end{align}

	Secondly, to investigate variance, we compute the mean of the square.
	\begin{align}
		 & \E[(a_{i,0} + a_{i,1}x_i + a_{i,2}x_i^2 + \cdots + a_{i,d}x_i^d)^2]                                                                                          \\
		 & = \E[a_{i,0}^2] + 2 \E[a_{i,0}a_{i,1}] \E[x_i] + ( 2\E[a_{i,0}a_{i,2}] + \E[a_{i,1}^2]) \E[x_i^2]                                                            \\
		 & \ \ \ \ \ \ \ \ + (2 \E[a_{i,0}a_{i,3}] + 2 \E[a_{i,1}a_{i,2}]) \E[x_i^3] + (2 \E[a_{i,0}a_{i,4}] + 2 \E[a_{i,1}a_{i,3}] + \E[a_{i,2}^2]) \E[x_i^4] + \cdots \\
		 & = \E[a_{i,0}^2] + \E[a_{i,1}^2] \E[x_i^2] + \E[a_{i,2}^2] \E[x_i^4] + \cdots.
	\end{align}
	To obtain variance consistency, this result should be equal to $\E[x_i^2]$ for an arbitrary $\mathbf{x}$. This requires
	\begin{align}
		\E[a_{i,1}^2] & = 1, \label{eq:arpoly3}                                           \\
		\E[a_{i,0}^2] & = \E[a_{i,2}^2] = \cdots = \E[a_{i,d}^2] = 0.  \label{eq:arpoly4}
	\end{align}
	Eqs. \ref{eq:arpoly1}, \ref{eq:arpoly2}, \ref{eq:arpoly3}, and \ref{eq:arpoly4} indicate that DropoutArPoly becomes the identity operator; hence, there is no other possible operation that can satisfy both mean and variance consistencies at the same time.
\end{proof}

In summary, for a dropout-like operation, variance inconsistency is a universal phenomenon and is not unique to the Bernoulli distribution.



\section{Mean and Variance after ReLU}
For $x \sim \mathcal{N}(0, \sigma^2)$, we compute the mean and variance of $\ReLU(x)$, which are used in the main text. We use $p(x)=\frac{1}{\sigma \sqrt{2 \pi}} \exp\left(-\frac{x^2}{2\sigma^2}\right)$ to denote the probability density function of $x$. First, we know that
\begin{align}
	\E[\ReLU(x)] & = \int_{-\infty}^{\infty} \ReLU(x)p(x)dx \\
	             & = \int_{0}^{\infty} x p(x)dx             \\
	             & = \frac{1}{\sqrt{2 \pi}} \sigma.
\end{align}
The last equation can be derived using the properties of a half-normal or truncated normal distribution. Secondly, using the symmetry of $p(x)$, we derive
\begin{align}
	\E[(\ReLU(x))^2] & = \int_{-\infty}^{\infty} (\ReLU(x))^2 p(x)dx    \\
	                 & = \int_{0}^{\infty} x^2 p(x)dx                   \\
	                 & = \frac{1}{2} \int_{-\infty}^{\infty} x^2 p(x)dx \\
	                 & = \frac{1}{2} \E[x^2]                            \\
	                 & = \frac{1}{2} \sigma^2.
\end{align}

Thus, we obtain
\begin{align}
	\Var[\ReLU(x)] & = \left(\frac{1}{2} - \frac{1}{2 \pi} \right) \sigma^2 \\
	               & = \frac{\pi -1 }{2 \pi}\sigma^2.
\end{align}

\begin{lstlisting}[language=Python, caption=PyTorch example to measure the mean and variance after ReLU.]
import torch

# Sample count and standard deviation of the zero-mean Gaussian input.
len_x = int(1e+8)
std_x = 1.0
# Draw len_x samples from N(0, std_x^2).
x = torch.normal(mean=torch.zeros((len_x)), std=std_x)

ReLU = torch.nn.ReLU()
ReLU_x = ReLU(x)

# Empirical mean, then biased variance (unbiased=False).
print(ReLU_x.mean().item())
print(ReLU_x.var(unbiased=False).item())
\end{lstlisting}

The mean and variance after ReLU can also be empirically measured using the above Python code. We used a sufficiently large number of samples, $10^8$ and $\sigma=1$. From this code, we obtained the following:\\
\texttt{0.3989197611808777} \\
\texttt{0.34079989790916443}

Indeed, $\frac{1}{\sqrt{2 \pi}} \approx 0.3989$ and $\frac{\pi -1 }{2 \pi} \approx 0.3408$.


\section{Limitation}
One potential disadvantage of applying dropout before the GAP is that it could require an additional feature map that expands on the spatial axis, which leads to increased GPU memory consumption. However, modern libraries such as PyTorch provide an option called \texttt{inplace} that allows features to be dropped directly without generating an additional feature map. This option eliminates the potential disadvantage of applying dropout before the GAP.

\section{Hyperparameters for Experiments}

\paragraph{Notes on Module Tests}
When measuring the variance, certain libraries apply Bessel’s correction by default. To ensure correct results, this feature must be turned off. For example, in PyTorch, \texttt{torch.var(input, unbiased=False)} should be used to apply a biased estimator. In fact, \texttt{unbiased=False} is specified when the BN of PyTorch computes the standard deviation.

\paragraph{CIFAR Dataset}
The CIFAR-$\{10,\ 100\}$ dataset consists of 60K images of $\{10,\ 100\}$ classes. For data augmentation, we used $32 \times 32$ random cropping with 4-pixel padding, a random horizontal flip with a probability of 0.5, and mean-std normalization using dataset statistics. For training, 164 epochs, stochastic gradient descent with a momentum of 0.9, learning rate of 0.1, learning rate decay of 0.1 at $\{81,\ 122\}$ epochs, weight decay of 0.0001, mini-batch size of 128, and dropout with a keep probability of 0.8 were used.

\paragraph{Oxford-IIIT Pet and Caltech-101 Datasets}
The Oxford-IIIT Pet dataset consists of 7K pet images from 37 classes; the Caltech-101 dataset includes 9K object images from 101 classes with a background category. Each dataset was split into training, validation, and test sets at a ratio of 70:15:15. All experiments were conducted at a resolution of $224\times224$ using standard data augmentation, including random resized cropping to 256 pixels, random rotations within 15 degrees, color jitter with a factor of 0.4, random horizontal flip with a probability of 0.5, center cropping with 224-pixel windows, and mean-std normalization based on ImageNet statistics. To better observe the performance difference, we trained the model from scratch and did not use pretrained weights. For training, stochastic gradient descent with a momentum of 0.9, learning rate of 0.1, cosine annealing schedule with 200 iterations, weight decay of 0.002, mini-batch size of 128, and dropout with a keep probability of 0.8 were used. The model with the highest validation accuracy was obtained for 200 training epochs.

\paragraph{ImageNet Dataset}
The ImageNet dataset consists of 1.2M images for 1,000 classes. For ImageNet experiments, we used the pytorch-image-models library, which is also known as \texttt{timm}. We used the hyperparameter recipe described in the official documentation. For training, stochastic gradient descent with momentum 0.9, learning rate 0.6, epochs 240, warm-up epochs 5, warm-up learning rate $10^{-5}$, cosine annealing schedule, weight decay $10^{-4}$, label smoothing 0.1, random erasing with probability 0.4 and count 3, RandAugment of magnitude 7 and noise-std 0.5 with increased severity (rand-m7-mstd0.5-inc1), and dropout with a keep probability of 0.8 were used.



\section{Additional Experimental Results}

\paragraph{Dropout at Two Positions}
One could attempt to apply two dropouts in the residual block to combine the advantages of (P5, P6, or P7). However, according to Proposition 1, applying dropout at both P5 and P6 results in $\Dropout(\ReLU(\Dropout(x))) = \Dropout(\ReLU(x))$, which is meaningless. We experimented by applying dropout at both P6 and P7 and observed that for PreResNet-$\{50,\ 110\}$, the accuracy was $\{93.4867,\ 94.1833\}$\% for CIFAR-10 and $\{72.04,\ 73.6667\}$\% for CIFAR-100, which is worse than applying one dropout.

\paragraph{On Width}
We discussed that the advantage of PreDropout comes from the weight condition, which is intensified by the width. Here, we experimented with different widths to test whether a small width could improve the accuracy. We used WideResNet with a depth of 28. We varied the widen factor $k$ that determines the number of channels $\{16,\ 16k,\ 32k,\ 64k\}$ for each stage. For $k=1,\ 2,\ 5,\ 10$, we observed improved accuracy from dropout (Table {\ref{tab:wrn}}), which implies that the widen factor 1 is sufficient to realize an advantage from dropout.

\begin{table*}[h!]
	\centering
	\caption{Experimental results on WideResNet-$\{$Depth$\}$-$\{$Width$\}$ with varying width.}
	\label{tab:wrn}
	\begin{tabular}{l|rr|rr|rr|rr}
		\toprule
		                       & \multicolumn{2}{c|}{WideResNet-28-10} & \multicolumn{2}{c|}{WideResNet-28-5} & \multicolumn{2}{c|}{WideResNet-28-2} & \multicolumn{2}{c}{WideResNet-28-1}                                                                                   \\
		                       & Accuracy                              & Difference                           & Accuracy                             & Difference                          & Accuracy & Difference                  & Accuracy & Difference                  \\
		\midrule
		No Dropout             & 96.1667                               & -                                    & 95.7733                              & -                                   & 94.9100  & -                           & 93.0433  & -                           \\
		\midrule
		Guideline 1 & 96.2433                               & (\textcolor{blue}{+0.0767})          & 96.1233                              & (\textcolor{blue}{+0.3500})         & 94.9367  & (\textcolor{blue}{+0.0267}) & 93.2433  & (\textcolor{blue}{+0.2000}) \\
		\bottomrule
	\end{tabular}
\end{table*}

\paragraph{Regression Task}
Although our main experiments focused on image classification tasks, our findings on the position of Dropout are not restricted to image classification and are expected to be seamlessly applicable to other tasks, including regression tasks. Here, we target a regression task using the AgeDB dataset. The AgeDB dataset contains 16K images with corresponding ages from 1 to 101. We use images as input and ages as labels to formulate regression tasks. For training, PreResNet-101, stochastic gradient descent with momentum 0.9, learning rate 0.001, epochs 200, mini-batch size 128, and weight decay 0.1 were used. For Dropout, we chose the position before the last weight layer in the residual branch. We measured the mean absolute error (MAE) on validation and test sets. We observed that following Guideline 1 improved the regression performance.

\begin{table}[h!]
\centering
\caption{Experimental results on AgeDB dataset.}
\begin{tabular}{l|rrr}
    \toprule
MAE  & No Dropout & With Dropout* & Difference  \\
\midrule
Val  & 6.1261     & 6.0272 & -0.0989  \\
Test & 6.3082     & 6.2291 & -0.0791  \\
\bottomrule
\end{tabular}
\end{table}




\end{document}
