% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
% \externaldocument{uai2023-template}


\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{graphicx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{siunitx}
\usepackage{algorithm,algpseudocode}
% \usepackage{algorithm}
% \usepackage{algorithmic}
\usepackage{wrapfig}
\usepackage{amsmath,amsthm}
\usepackage{amssymb}
\usepackage{epstopdf}
\usepackage{mathtools}
\usepackage{multirow}
\usepackage{multicol}

% Write Python code
\usepackage{listings}
\usepackage{color}
\definecolor{mygreen}{rgb}{0,0.6,0}
\definecolor{mygray}{rgb}{0.5,0.5,0.5}
\definecolor{mymauve}{rgb}{0.58,0,0.82}
% Python style for highlighting.
% \pythonstyle issues one \lstset with the settings shared by the `python`
% environment, \pythonexternal and \pythoninline below. It takes no
% arguments; those callers invoke it inside a group or an environment.
\newcommand\pythonstyle{\lstset{%
  backgroundcolor=\color{white},   % background color of the listing (needs the color or xcolor package)
  basicstyle=\footnotesize,        % font size used for the code
  breakatwhitespace=false,         % automatic breaks are not restricted to whitespace
  breaklines=true,                 % enable automatic line breaking
  captionpos=b,                    % put the caption at the bottom
  commentstyle=\color{mygreen},    % comment style
  deletekeywords={...},            % placeholder: keywords to delete from the chosen language
  escapeinside={\%*}{*)},          % delimiters for escaping to LaTeX inside the code
  extendedchars=true,              % allow non-ASCII characters (8-bit encodings only, not UTF-8)
  firstnumber=1000,                % first line number; not shown while numbers=none
  frame=single,                    % draw a frame around the code
  keepspaces=true,                 % keep spaces so that code indentation is preserved
  keywordstyle=\color{blue},       % keyword style
  language=Python,                 % language of the code; was Octave, which does not match the Python code typeset with this style
  morekeywords={*,...},            % placeholder: extra keywords to add to the set
  numbers=none,                    % no line numbers (possible values: none, left, right)
  numbersep=5pt,                   % distance between line numbers and code
  numberstyle=\tiny\color{mygray}, % style used for the line numbers
  rulecolor=\color{black},         % fix the frame color so it does not change on line breaks inside colored text
  showspaces=false,                % do not mark every space; this key overrides showstringspaces
  showstringspaces=false,          % do not mark spaces inside strings
  showtabs=false,                  % do not mark tabs inside strings
  stepnumber=2,                    % step between two printed line numbers
  stringstyle=\color{mymauve},     % string literal style
  tabsize=2,                       % a tab is two spaces wide
  title=\lstname                   % show the file name of files included with \lstinputlisting
}}

% Python environment.
% Usage: \begin{python}[<listings options>] ... \end{python}
% The start code first applies \pythonstyle and then passes the optional
% argument to \lstset, so per-listing options are set after the shared ones.
% The % signs at the line ends stop those line ends from becoming space
% tokens, which would otherwise be typeset when the environment is opened
% in the middle of a paragraph.
\lstnewenvironment{python}[1][]
{%
\pythonstyle
\lstset{#1}%
}
{}

% Python for external files.
% Usage: \pythonexternal[<listings options>]{<file>}
% Applies \pythonstyle and then typesets <file> with \lstinputlisting; the
% inner brace pair keeps the \lstset changes local to this one call.
% The % after the opening braces stops the line end from becoming a space
% token in the output.
\newcommand\pythonexternal[2][]{{%
\pythonstyle
\lstinputlisting[#1]{#2}}}

% Python for inline
% Usage: \pythoninline{<code>} typesets <code> with \lstinline after applying
% \pythonstyle; the inner brace pair keeps the settings local to this call.
% NOTE(review): `!` is used as the \lstinline delimiter and <code> is taken as
% a macro argument, so snippets containing `!` (e.g. `!=`) presumably cannot
% be passed here -- verify before using it on such code.
\newcommand\pythoninline[1]{{\pythonstyle\lstinline!#1!}}

%\hypersetup{colorlinks=true,
%    linkcolor=mydarkblue,
%    citepcolor=mydarkblue,
%    filecolor=mydarkblue,
%    urlcolor=mydarkblue}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \v{<symbol>}: typesets <symbol> with \mathbf.
% NOTE(review): \v is LaTeX's text accent command (caron), so this
% redefinition presumably affects any name written with \v{...} in the text
% or in the bibliography -- verify that none is used.
\renewcommand{\v}[1]{\mathbf{#1}}
% \MixupS: the method name "MixupE" set in italics.
\newcommand{\MixupS}{\textit{MixupE}}
% \inputmixup: expands to the plain word "Mixup".
\newcommand{\inputmixup}{Mixup}

%% KK
\usepackage{Definitions}
\usepackage{Definitions}
% \newcommand{\bxp}{{\bar x}}
% Argument-less shorthand symbols, grouped by kind.
% \Lcal, \Gcal, \EE and \pazocal are not defined in this file; they are
% presumably provided by the Definitions package -- verify.

% Named operators
\newcommand*{\Beta}{{\operatorname{Beta}}}
\newcommand*{\oP}{\operatorname{P}}

% Hatted symbols
\newcommand*{\hLcal}{\hat{\Lcal}}
\newcommand*{\hL}{\hat{L}}
\newcommand*{\hf}{\hat{f}}
\newcommand*{\htheta}{\hat{\theta}}
\newcommand*{\hg}{\hat{g}}
\newcommand*{\hy}{\hat{y}}
\newcommand*{\hell}{\hat{\ell}}
\newcommand*{\hEE}{\hat{\EE}}

% Barred symbols
\newcommand*{\bsigma}{\bar{\sigma}}
\newcommand*{\bg}{\bar{g}}
\newcommand*{\baf}{\bar{f}}
\newcommand*{\bW}{\bar{W}}
\newcommand*{\bc}{\bar{c}}
\newcommand*{\bepsilon}{\bar{\epsilon}}
\newcommand*{\bell}{\bar{\ell}}
% \newcommand{\by}{\bar y}

% Tilde and underbar symbols
\newcommand*{\tsigma}{\tilde{\sigma}}
\newcommand*{\tGcal}{\tilde{\Gcal}}
\newcommand*{\uc}{\textit{\underbar{c}}}

% Bold italic symbols
\newcommand*{\bw}{\boldsymbol{w}}
\newcommand*{\bp}{\boldsymbol{p}}
\newcommand*{\bth}{\boldsymbol{\theta}}
\newcommand*{\bA}{\boldsymbol{A}}

% Calligraphic letters in the \pazocal alphabet
\newcommand*{\cH}{\pazocal{H}}
\newcommand*{\cN}{\pazocal{N}}
\newcommand*{\cP}{\pazocal{P}}
\newcommand*{\cD}{\pazocal{D}}
\newcommand*{\cO}{\pazocal{O}}
\newcommand*{\cL}{\pazocal{L}}

% Text abbreviation
\newcommand*{\nc}{the optimal stationary condition}

% zyt
% Bold vectors: input \bx, label \by, data pair \bz and a second input \bxp.
\newcommand{\bx}{{\mathbf{x}}}
\newcommand{\by}{{\mathbf{y}}}
\newcommand{\bz}{{\mathbf{z}}}
\newcommand{\bxp}{{\mathbf{x}^\prime}}


% Input space, label space and the reals. \mathcal replaces the obsolete \cal
% font switch; the outer braces are kept so that the macros still work as a
% bare subscript or superscript.
\newcommand{\cX}{{\mathcal{X}}}
\newcommand{\bR}{{\mathbb{R}}}
\newcommand{\cY}{{\mathcal{Y}}}
% Tilde versions of the bold vectors.
% NOTE(review): the \def's in this block overwrite any earlier definition
% without an error; they are left as \def because it is not visible here
% whether the Definitions package already defines the same names.
\def\tx{\tilde{\mathbf{x}}}
\def\ty{\tilde{\mathbf{y}}}
\def\tz{\tilde{\mathbf{z}}}
\newcommand{\bE}{\mathbb{E}}
% Empirical risks; the labels "std" and "mix" are both set upright with \text
% so that the two symbols are typeset consistently.
\def\Ln{L_n^{\text{std}}}
\def\Lnmix{L^{\text{mix}}_n}
\newcommand{\td}{\tilde}
\def\tD{\tilde{\cD}}

\title{Supplementary Material for MixupE}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Yingtian Zou}
\author[2,3]{
Vikas~Verma}
% \href{mailto:<vikasverma.iitm@gmail.com>?Subject=UAI 2023 paper}{$^*$}{}}
\author[2]{Sarthak~Mittal}
\author[1]{Wai~Hoh~Tang}
\author[4]{Hieu~Pham}
\author[3]{Juho~Kannala}
\author[2]{Yoshua~Bengio}
\author[3]{Arno~Solin}
\author[1]{Kenji~Kawaguchi}
% Add affiliations after the authors
\affil[1]{%
    National University of Singapore\\
    Singapore
}

\affil[2]{%
    Universit\'e de Montr\'eal, Mila\\
    Canada
}
\affil[3]{%
    Aalto University\\
    Finland
}
\affil[4]{%
    Google Brain\\
    USA
}
% \author[1,2]{\href{mailto:<vikasverma.iitm@gmail.com>?Subject=UAI 2023 paper}{Vikas~Verma}{}}
% \author[1]{Sarthak~Mittal}
% \author[4]{Wai~Hoh~Tang}
% \author[3]{Hieu~Pham}
% \author[2]{Juho~Kannala}
% \author[1]{Yoshua~Bengio}
% \author[2]{Arno~Solin}
% \author[4]{Kenji~Kawaguchi}
% % Add affiliations after the authors
% \affil[1]{%
%     Universite de Montreal, Mila\\
%     Canada
% }
% \affil[2]{%
%     Aalto University\\
%     Finland
% }
% \affil[3]{%
%     Google Brain\\
%     USA
% }
% \affil[4]{%
%     National University of Singapore\\
%     Singapore
% }
  
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\appendix
\section{Notations}
 We denote by $\bz=(\bx, \by)$ the input and output pair where $\bx \in\cX\subseteq\bR^d$ and $\by \in\cY\subseteq\bR^C$. Let $f_\theta(\bx) \in \RR^C $ be the logits (i.e., the output of the last layer before the softmax or sigmoid) of the model parameterized by $\theta$. We use $\ell(\theta, \bz) = h(f_\theta(\bx)) - \by\T f_\theta(\bx) $ to denote the loss function. 
Let $g(\cdot)$ be the activation function. We use $\bx_{(i)}$ to denote the $i$-th element of the vector $\bx$ and $\bx_j$ to denote the $j$-th vector in a set. The notation list is:
\begin{itemize}
    \item $S = \{(\bx_i, \by_i)\}_{i \in [n]}$ is the fixed training set, while $\bxp$ is the random test sample.
    \item $\ell$ is the loss function for any data point.
    \item $\Lnmix(\theta, S)$: empirical risk of Mixup of size $n$ with parameters $\theta$.
    \item $\mathcal{L}$: empirical risk of \MixupS{}.
    \item $\Theta$: the constraint set of parameters $\theta$.
    \item $\mathcal{R}(\Theta, S)$: empirical Rademacher complexity of the set $\Theta$ over the training set $S$.
    \item $\mathbf{J}_{a}(b)$: Jacobian matrix of $a$ w.r.t.\ $b$.
\end{itemize}


\section{Proof of Theorem 1}%\ref{thm:1}}
\label{app:theorm1}
\begin{proof}
For the cross-entropy loss, we have 
\begin{equation}
    \ell(\theta, (\bx, \by)) = -\log\frac{\exp(\by\T f_\theta(\bx))}{\sum_{j}\exp(f_\theta(\bx)_{(j)})}=\log\left(\sum_{j}\exp(f_\theta(\bx)_{(j)})\right)-\by\T f_\theta(\bx)
\end{equation}
where $\by \in \RR^C$ is a one-hot vector.
For the logistic loss, 
\begin{equation}
\ell(\theta,(\bx, \by)) = -\log\frac{\exp(\by f_\theta(\bx))}{1+\exp(f_\theta(\bx))}=\log\left(1+\exp(f_\theta(\bx)) \right)-\by f_\theta(\bx).
\end{equation}
Thus, for both cases, we can write 
\begin{equation}
    \ell(\theta,(\bx, \by))=h(f_\theta(\bx))- \by\T f_\theta(\bx)
\end{equation}
where 
$h(\bz)=\log\left(\sum_{j}\exp(\bz_{(j)})\right)$ for the cross-entropy loss and $h(\bz)=\log(1+\exp(\bz))$ for the logistic loss. Using this and equation~(9) of \citet{zhang2021does}, we have that 
\[
\Lnmix(\theta,S)=\frac{1}{n}\sum_{i=1}^n \EE_{\lambda\sim\Dcal_\lambda}\bE_{\bxp\sim \Dcal_X} \ell(\theta,(r_i(\bxp), \by_i)),
\]
where $\Dcal_X$ is the empirical distribution induced by training samples, and
\begin{equation}
    r_i(\bx)=\lambda \bx_i +(1-\lambda) \bx.
\end{equation}
Define $a_\lambda = 1-\lambda$. Then, 
\begin{equation}
    r_i(\bxp) = (1-a_\lambda) \bx_i +a_\lambda \bxp= \bx_i +a_\lambda(\bxp - \bx_{i}).
\end{equation}
Define 
\begin{equation}
    \varphi_{i}(a_\lambda) := f_\theta(  \bx_i + a_\lambda (\bxp - \bx_{i}))
\end{equation}
Assume that $f_\theta$ is of class $C^K$ (i.e., $K$-times continuously differentiable). Then, by Taylor expansion at $a_\lambda=0$, there exists a function $\psi_{i}$ with $\lim_{a_\lambda \rightarrow 0}\psi_{i}(a_\lambda)=0$ such that
\begin{equation}
    \begin{aligned}
    \varphi_{i}(a_\lambda) 
    &= \varphi_{i}(0)+  \sum_{k=1}^K \frac{a_\lambda^{k}}{k!} \varphi^{(k)}_{i}(0)+a_\lambda^{K} \psi_{i}(a_\lambda)  \\
    &= f_\theta(  \bx_i )+  \sum_{k=1}^K \frac{a_\lambda^{k}}{k!} \varphi^{(k)}_{i}(0)+a_\lambda^{K} \psi_{i}(a_\lambda)
    \label{eq:varphi}
    \end{aligned}
\end{equation}
where  $\varphi^{(k)}_i(0)$ is the $k$-th order derivative at $a_\lambda = 0$ and $\psi_{i}(a_\lambda)$ is the remainder term:
\begin{equation}
    \psi_{i}(a_\lambda) =  \int_{\RR} \varphi^{(K)}_i(a_\lambda) d a_\lambda - \frac{1}{K!} \varphi^{(K)}_i(0)
\end{equation}
Here, for any $k \in \NN^+$, we have
\begin{equation}
\begin{aligned}
        \varphi^{(k)}_{i}(0) 
        &= \varphi^{(k)}_{i}(a_\lambda)|_{a_\lambda=0} =
     \frac{\partial^{k} f_\theta(\bx_i + a_\lambda (\bxp - \bx_{i}))}{\partial (\bx_i + a_\lambda (\bxp - \bx_{i}))^k} (\bxp - \bx_{i})^{\otimes k}\bigg |_{a_\lambda=0} \\
     &=\frac{\partial^{k} f_\theta(\bx_i)}{\partial (\bx_i)^k} (\bxp - \bx_{i})^{\otimes k}
\end{aligned}
\end{equation} 
where $\otimes$ denotes the Kronecker product and thus $(\bxp - \bx_{i})^{\otimes k} \in \RR^{d^k}$. We can then rewrite $\varphi^{(k)}_{i}(0)$ as
\begin{equation}
    \varphi^{(k)}_{i}(0) = \mathbf{J}^k_{f_\theta}(\bx_i) (\bxp - \bx_{i})^{\otimes k}
\end{equation}
Plugging this back into \eqref{eq:varphi}, we have
\begin{equation}
\begin{aligned}
        f_\theta(  \bx_i + a_\lambda(\bxp - \bx_{i}))
        &=f_\theta(  \bx_i )+  \sum_{k=1}^K \frac{a_\lambda^{k}}{k!} \mathbf{J}^k_{f_\theta}(\bx_i) (\bxp - \bx_{i})^{\otimes k} + a_\lambda^{K} \psi_{i}(a_\lambda) \\
        &= f_\theta(  \bx_i )+  a_\lambda \underbrace{\left( \sum_{k=1}^K \frac{a_\lambda^{k-1}}{k!} \mathbf{J}^k_{f_\theta}(\bx_i) (\bxp - \bx_{i})^{\otimes k} + a_\lambda^{K-1} \psi_{i}(a_\lambda) \right)}_{\Delta_{i}}
\end{aligned}
\end{equation}
% Let $\Delta_{i}=\sum_{k=1}^K \frac{a_\lambda^{k-1}}{k!} \partial ^{k}f_\theta(\bx_i ) (\bxp - \bx_{i})^{\otimes k} + a_\lambda^{K-1} \psi_{i}(a_\lambda)$, 
Substituting this expansion into the loss, we obtain
\begin{equation}
\begin{aligned}
\ell(\theta, (r_i(\bxp), \by_i)) 
& =\ell[\theta,(  \bx_i +a_\lambda(\bxp - \bx_{i}), \by_i)]\\ 
& =h(f_\theta(  \bx_i +a_\lambda(\bxp - \bx_{i})))-\by_i\T f_\theta(  \bx_i +a_\lambda(\bxp - \bx_{i}))  \\ 
& =h(f_\theta(  \bx_i )+a_\lambda \Delta_{i})-\by_i\T (f_\theta( \bx_i )+a_\lambda\Delta_{i}).
\end{aligned}
\end{equation} 
Analogously, we can define $\hat \varphi_{i}(a_\lambda) := h(f_\theta(  \bx_i )+a_\lambda \Delta_{i})$ with the corresponding remainder $\hat \psi_{i}(a_\lambda)$; then
\begin{equation}
    h(f_\theta(  \bx_i )+a_\lambda \Delta_{i})=h(f_\theta(  \bx_i ))+  \sum_{k=1}^{K} \frac{a_\lambda^{k}}{k!} \mathbf{J}^k_{h \circ f_\theta}(\bx_i) \Delta_{i}^{\otimes k}+a_\lambda^{K} \hat \psi_{i}(a_\lambda)
\end{equation}
Combining these,
\begin{equation}
   \begin{aligned}
\ell(\theta,( r_i(\bxp), \by_i)) 
& = h(f_\theta(  \bx_i ))-\by_i\T f_\theta(  \bx_i )-a_\lambda \by_i\T \Delta_{i} + \sum_{k=1}^{K} \frac{a_\lambda^{k}}{k!} \mathbf{J}^k_{h \circ f_\theta}(\bx_i) \Delta_{i}^{\otimes k}+a_\lambda^{K} \hat \psi_{i}(a_\lambda) \\ 
& =\ell(\theta,(\bx_i, \by_i)) - a_\lambda \by_i\T \Delta_{i}+  \sum_{k=1}^{K} \frac{a_\lambda^{k}}{k!} \mathbf{J}^k_{h \circ f_\theta}(\bx_i) \Delta_{i}^{\otimes k}+a_\lambda^{K} \hat \psi_{i}(a_\lambda)
\end{aligned} 
\end{equation}
Thus, the implicit regularization of Mixup can be unfolded as
\begin{equation}
   \begin{aligned}
\Lnmix(\theta, S) 
&=\frac{1}{n}\sum_{i=1}^n \EE_{\lambda\sim\Dcal_\lambda}\bE_{\bxp\sim \Dcal_X} \ell(\theta,( r_i(\bxp), \by_i))  \\
& =\Ln(\theta, S)+ \frac{1}{n}\sum_{i=1}^n \EE_{\lambda\sim\Dcal_\lambda}\bE_{\bxp\sim \Dcal_X}\left(  \sum_{k=1}^{K} \frac{a_\lambda^{k}}{k!} \mathbf{J}^k_{h \circ f_\theta}(\bx_i) \Delta_{i}^{\otimes k}-a_\lambda \by_i\T \Delta_{i}+a_\lambda^{K} \hat \psi_{i}(a_\lambda) \right),
\end{aligned} 
\end{equation}
 where
\begin{equation}
    \Delta_{i}=\sum_{k=1}^K \frac{a_\lambda^{k-1}}{k!} \mathbf{J}^k_{f_\theta}(\bx_i) (\bxp - \bx_{i})^{\otimes k}+a_\lambda^{K-1} \psi_{  i }(a_\lambda).
\end{equation}
Note that with probability $1$, we have 
\[
\lim_{a_\lambda \rightarrow 0} \hat \psi_i(a_\lambda)=0, \qquad \lim_{a_\lambda \rightarrow 0} \psi_{i}(a_\lambda)=0.
\]
% Then, by using the vectorization of the tensor $\partial ^{j}f_\theta(b)\in \RR^{D\times d^j}$, we can rewrite this equation as 
% \begin{align} \label{eq:7}
% \varphi^{(j)}_{i}(0)=\partial ^{j}f_\theta(x_i ) (\bx - x_{i})^{\otimes j}, 
% \end{align}
% where 
% $(\bx - x_{i})^{\otimes j}=(\bx - x_{i}) \otimes(\bx - x_{i}) \otimes \cdots \otimes (\bx - x_{i}) \in \RR^{d^j}$.
% Summarising above, there exists a function $\psi_{  i ,\bx }$ such that $\lim_{a \rightarrow 0}\psi_{  i ,\bx }(a)=0$
% and
% $$
% f_\theta(  x_i +a(\bx - x_{i}))=f_\theta(  x_i )+  \sum_{j=1}^J \frac{a^{j}}{j!} \partial ^{j}f_\theta(x_i ) (\bx - x_{i})^{\otimes j}+a^{J} \psi_{  i ,\bx }(a).
% $$
% By defining $\Delta_{i}=\sum_{j=1}^J \frac{a^{j-1}}{j!} \partial ^{j}f_\theta(x_i ) (\bx - x_{i})^{\otimes j}+a^{J-1} \psi_{  i ,\bx }(a)$, we have that 
% \begin{align*}
% l(\theta,(\check x_i, y_i)) & =l(\theta,(  x_i +a(\bx - x_{i}), y_i))
% \\ & =h(f_\theta(  x_i +a(\bx - x_{i})))-y_i\T f_\theta(  x_i +a(\bx - x_{i}))  
% \\ & =h(f_\theta(  x_i )+a\Delta_{i})-y_i\T (f_\theta(  x_i )+a\Delta_{i}).
% \end{align*}
% Similarly, by defining $\varphi_{i}'(a)=h(f_\theta(  x_i )+a\Delta_{i})$, there exists a function $\psi_{  i ,\bx }'$ such that $\lim_{a \rightarrow 0}\psi_{  i ,\bx }'(a)=0$ and
% $$
% h(f_\theta(  x_i )+a\Delta_{i})=h(f_\theta(  x_i ))+  \sum_{j=1}^{{J}} \frac{a^{j}}{j!} \partial ^{j}h(f_\theta(  x_i ) ) \Delta_{i}^{\otimes j}+a^{J} \psi_{  i ,\bx }'(a).
% $$
% Combining these, 
% \begin{align*}
% l(\theta,(\check x_i, y_i)) & =h(f_\theta(  x_i ))-y_i\T f_\theta(  x_i )-ay_i\Delta_{i}+  \sum_{j=1}^{J} \frac{a^{j}}{j!} \partial ^{j}h(f_\theta(  x_i ) )\Delta_{i}^{\otimes j}+a^{J} \psi_{  i ,\bx }'(a)
% \\ & =l(\theta,(x_i, y_i))-ay_i\T \Delta_{i}+  \sum_{j=1}^{J} \frac{a^{j}}{j!} \partial ^{j}h(f_\theta(  x_i ) ) \Delta_{i}^{\otimes j}+a^{J} \psi_{  i ,\bx }'(a). \end{align*}
% Thus,
% by writing  $a_\lambda=1-\lambda$ to make the dependence clear,
% \begin{align*}
% &\Lnmix(\theta,S)
% \\ &=\frac{1}{n}\sum_{i=1}^n \EE_{\lambda\sim\Dcal_\lambda}\bE_{\bx\sim \Dcal_X} l(\theta,(\check x_i, y_i)) 
% \\ & =\Ln(\theta,S)+ \frac{1}{n}\sum_{i=1}^n \EE_{\lambda\sim\Dcal_\lambda}\bE_{\bx\sim \Dcal_X}\left(  \sum_{j=1}^{J} \frac{a_\lambda^{j}}{j!} \partial ^{j}h(f_\theta(  x_i ) ) \Delta_{i}^{\otimes j}-a_\lambda y_i\T \Delta_{i}+a_\lambda^{J} \psi_{  i ,\bx }'(a_\lambda) \right),
% \end{align*}
%  where
% $$
% \Delta_{i}=\sum_{j=1}^J \frac{a_\lambda^{j-1}}{j!} \partial ^{j}f_\theta(x_i ) (\bx - x_{i})^{\otimes j}+a_\lambda^{J-1} \psi_{  i ,\bx }(a_\lambda).
% $$

% Since $\lim_{\alpha \rightarrow 0}\frac{1}{n}\sum_{i=1}^n \bE_{\bx\sim \Dcal_X} \psi_{  i ,\bx }'(a)=0$ and $\lim_{\alpha \rightarrow 0}\bE_{\bx\sim \Dcal_X}\psi_{  i ,\bx }(a)=0$, there exists functions $\psi$ and $\psi_{i}$ such that $\lim_{a \rightarrow 0}\psi(a)=0$, $\lim_{a \rightarrow 0}\psi_{i}(a)=0$, and
% \begin{align*}
% &\Lnmix(\theta,S) 
% \\ & =\Ln(\theta,S)+ \frac{1}{n}\sum_{i=1}^n \EE_{\substack{\lambda\sim\Dcal_\lambda \\ \bx\sim \Dcal_X }} \left(  \sum_{j=1}^{J} \frac{a_\lambda^{j}}{j!} \partial ^{j}h(f_\theta(  x_i ) ) \Delta_{i}^{\otimes j}-a_\lambda y_i\T \Delta_{i}+a_\lambda^{J} \psi(a_\lambda) \right),
% \end{align*}
% where
% $$
% \Delta_{i}=\sum_{j=1}^J \frac{a_\lambda^{j-1}}{j!} \partial ^{j}f_\theta(x_i ) (\bx - x_{i})^{\otimes j}+a_\lambda^{J-1} \psi_{  i }(a_\lambda).
% $$
% Finally, we compute $\partial h(f_\theta(  x_i ) )$. Recall that  
% $h(z)=\log\left(\sum_{k}\exp(z_k)\right)$ for the cross-entropy loss and $h(z)=\log(1+\exp(z))$ for the logistic. For the cross-entropy loss, we have
% $
% \frac{\partial h(z)}{\partial z_t} =\left(\sum_{k}\exp(z_k)\right)^{-1} \exp(z_t)=g(z)_{t},
% $ 
% where $g$ is the softmax function. Similarly,  for  the logistic  loss, since $h(z)=\log\left(1+\exp(z)\right)$, we have that $\frac{\partial h(z)}{\partial z} = (1+\exp(z))^{-1} \exp(z) =g(z)$, where $g$ is the sigmoid function.  Therefore, for both cases of  the cross-entropy loss and the logistic  loss, we have that 
% $
% \partial h(f_\theta(  x_i ) )=g(f_\theta(  x_i ))\T \in \RR^{1\times D}.
% $ 

\end{proof}

\section{Proof of Theorem 2}
Generalization bounds based on Rademacher complexity are widely used. The empirical Rademacher complexity of a function class $\Theta$ is given by
\begin{equation}
    \mathcal{R}_n(\Theta, \{\bx_i\}_{i\in [n]})=\mathbb{E}\left[\sup _{\theta \in \Theta} \frac{1}{n} \sum_{i=1}^n f_{\theta} \left(\bx_i\right) \epsilon_i\right]
\end{equation}
where the Rademacher random variables $\epsilon_i$ independently take values in $\{-1, +1\}$ with equal probability. 
\begin{lemma} \citep{bartlett2002rademacher}. For any $B$-uniformly bounded and $L$-Lipschitz function $\zeta$, for all $\phi \in \Phi$, with probability at least $1-\delta$,
$$
\mathbb{E} \zeta\left(\phi \left(\bx_i\right)\right) \leq \frac{1}{n} \sum_{i=1}^n \zeta\left(\phi\left(\bx_i\right)\right)+2 L \mathcal{R}_n(\Phi, S)+B \sqrt{\frac{\log (1 / \delta)}{2 n}}
$$
\label{lemma:1}
\end{lemma}
\begin{proof}
% Consider GLM that $h(f_\theta(\bx)) = A(\theta\T \bx)$ and training set $S$, the constraint of $ \Theta = \{\bx \to f_\theta(\bx) | \EE_{\bx} \hat{q}(\bx) \leq \gamma \}$ implies that
% \begin{equation}
%      \mathbb{E}_{\bx} |\hat{q}_i(\bx)| = \mathbb{E}_{\bx} (\by - A'(\theta^\top \bx))^\top (\theta^\top \bx) \leq \gamma 
% \end{equation}
% Rearranging the terms, and by Cauchy–Schwarz inequality we have
% \begin{equation}
% \begin{aligned}
%     \gamma  
%     & \geq \mathbb{E}_{\bx} (\by - A'(\theta^\top \bx))^\top (\theta^\top \bx) \\
%     & = \mathbb{E}_{\bx} \langle\by, \theta^\top \bx \rangle - \mathbb{E}_{\bx} \langle A'(\theta^\top \bx), \theta^\top \bx \rangle \\
%     & \geq \mathbb{E}_{\bx} \langle\by, \theta^\top \bx \rangle - \mathbb{E}_{\bx} \| A'(\theta^\top \bx) \|_2 \| \theta^\top \bx \|_2
% \end{aligned}
% \end{equation}
% Due to the fact that $A(\cdot)$ is a $L_A$ Lipchitz function, then it's trivial to prove
% \begin{equation}
%      \| A'(\theta^\top \bx) \|_2 \leq L_A
% \end{equation}
% Let $\by = (\theta^*)^\top \bx = (\Sigma \theta)^\top \bx$ where $\Sigma$ is the diagonal matrix. Thus the above relation will be
% \begin{equation}
% \begin{aligned}
%     \gamma 
%     & \geq \mathbb{E}_{\bx} \langle\by, \theta^\top \bx \rangle - \mathbb{E}_{\bx} \| A'(\theta^\top \bx) \|_2 \| \theta^\top \bx \|_2 \\
%     & \geq \mathbb{E}_{\bx} \langle (\Sigma \theta)^\top \bx , \theta^\top \bx \rangle - L_A \mathbb{E}_{\bx} \| \theta^\top \bx \|_2 \\
%     % & = \mathbb{E}_{\bx}  \bx^\top ( \Sigma \theta \theta^\top - L_A \theta \theta^\top )\bx \\
% \end{aligned}
% \end{equation}
% Let $\mathbf{v} = \theta^\top \bx$ and $\overline{\sigma}$ be the expected value that $\overline{\sigma} = \EE_{\bx} \Sigma_{(i)} =  \EE_{\bx} \frac{\operatorname{tr}(\Sigma)}{d}$, then we have 
% \begin{equation}
%     \gamma \geq  \mathbb{E}_{\bx} \overline{\sigma} \|\mathbf{v}\|_2^2 - L_A \|\mathbf{v}\|_2
% \end{equation}
Consider a GLM with $h(f_\theta(\bx)) = A(\theta\T \bx)$ and a training set $S$. The constraint defining $ \Theta = \{\bx \mapsto f_\theta(\bx) \mid \sup_{\bx} \hat{q}(\bx) \leq \gamma \}$ implies that
\begin{equation}
     \sup_{\bx} |\hat{q}_i(\bx)| = \sup_{\bx} (\by - A'(\theta^\top \bx))^\top (\theta^\top \bx) \leq \gamma 
\end{equation}
Rearranging the terms and applying the Cauchy--Schwarz inequality, we have
\begin{equation}
\begin{aligned}
    \gamma  
    & \geq \sup_{\bx} (\by - A'(\theta^\top \bx))^\top (\theta^\top \bx) \\
    & \geq \sup_{\bx} \langle\by, \theta^\top \bx \rangle - \sup_{\bx} \langle A'(\theta^\top \bx), \theta^\top \bx \rangle \\
    & \geq \sup_{\bx} \langle\by, \theta^\top \bx \rangle - \sup_{\bx} \| A'(\theta^\top \bx) \|_2 \| \theta^\top \bx \|_2
\end{aligned}
\end{equation}
Since $A(\cdot)$ is an $L_A$-Lipschitz function, it follows that
\begin{equation}
     \| A'(\theta^\top \bx) \|_2 \leq L_A
\end{equation}
Let $\by = (\theta^*)^\top \bx = (\Sigma \theta)^\top \bx$ where $\Sigma$ is a diagonal matrix. The above relation then becomes
\begin{equation}
\begin{aligned}
    \gamma 
    & \geq \sup_{\bx} \langle\by, \theta^\top \bx \rangle - \sup_{\bx} \| A'(\theta^\top \bx) \|_2 \| \theta^\top \bx \|_2 \\
    & \geq \sup_{\bx} \langle (\Sigma \theta)^\top \bx , \theta^\top \bx \rangle - L_A \sup_{\bx} \| \theta^\top \bx \|_2
\end{aligned}
\end{equation}
Let $\mathbf{v} = \sup_\bx \theta^\top \bx$ and let $\overline{\sigma}$ denote the average diagonal entry, $\overline{\sigma} = \EE_{j \in [d]} \sup_{\bx_i} \Sigma_i{(j)} =  \sup_\bx \frac{\operatorname{tr}(\Sigma)}{d}$. Then we have 
\begin{equation}
    \gamma \geq  \overline{\sigma} \|\mathbf{v}\|_2^2 - L_A \|\mathbf{v}\|_2
\end{equation}
which implies
\begin{equation}
   \frac{L_A - \sqrt{L_A^2 + 4\gamma \overline{\sigma}}}{2 \overline{\sigma}} \leq \|\mathbf{v}\|_2 \leq \frac{L_A + \sqrt{L_A^2 + 4\gamma \overline{\sigma}}}{2 \overline{\sigma}}
\end{equation}
Obviously,  
\begin{equation}
    \left|\frac{L_A + \sqrt{L_A^2 + 4\gamma \overline{\sigma}}}{2 \overline{\sigma}} \right| > \left|\frac{L_A - \sqrt{L_A^2 + 4\gamma \overline{\sigma}}}{2 \overline{\sigma}} \right|
\end{equation}
% \begin{equation}
%     \gamma \geq \frac{\operatorname{tr}(\Sigma) - L_A^2}{d} \mathbb{E}_{\bx} (\bx^\top \theta \theta^\top \bx)
% \end{equation}
Denoting $\mathbf{v}_i = \theta^\top  \bx_i$, the Rademacher complexity $\mathcal{R}(\Theta, S)$ satisfies
\begin{equation}
    \begin{aligned}
\mathcal{R}\left(\Theta, S\right) 
& =\mathbb{E}_{\epsilon} \sup_{\sup_\bx \hat{q}(\bx) \leq \gamma} \frac{1}{n} \sum_{i=1}^n \epsilon_i \theta^{\top} \bx_i \\
% & =\mathbb{E}_{\epsilon} \sup_{\EE_\bx \hat{q}(\bx) \leq \gamma} \frac{1}{n} \sum_{i=1}^n \epsilon_i v^{\top} \tilde{x}_i \\
& \leq \mathbb{E}_{\epsilon} \sup_{ \|\mathbf{v}_i\|^2_2 \leq \left(\frac{L_A + \sqrt{L_A^2 + 4\gamma \overline{\sigma}}}{2 \overline{\sigma}} \right)^2 } \frac{1}{n} \sum_{i=1}^n \epsilon_i \mathbf{v}_i \\
% & \leq \mathbb{E}_{\epsilon} \sup_{\max\left(\gamma, \|\mathbf{v}_i\|^2_2 \leq \left(\frac{L_A + \sqrt{L_A^2 - 4\gamma \overline{\sigma}}}{2 \overline{\sigma}} \right)^2 \right)} \frac{1}{n} \sum_{i=1}^n \epsilon_i \mathbf{v}_i \\
% & \leq \frac{1}{n} \cdot\left(\frac{\gamma}{\rho}\right)^{1 / 4} \vee\left(\frac{\gamma}{\rho}\right)^{1 / 2} \cdot \mathbb{E}_{\epsilon}\left\|\sum_{i=1}^n \epsilon_i \tilde{\bx}_i\right\| \\
& \leq \frac{1}{n} \cdot \frac{L_A + \sqrt{L_A^2 + 4\gamma \overline{\sigma}}}{2 \overline{\sigma}} \cdot \sqrt{\mathbb{E}_{\epsilon}\left(\sum_{i=1}^n \epsilon_i \right)^2} \\
% & \leq \left(\frac{L_A + \sqrt{L_A^2 - 4\gamma \overline{\sigma}}}{2 \overline{\sigma}} \right)^2 \cdot \sqrt{n} \\
& = \frac{1}{\sqrt{n}} \frac{L_A + \sqrt{L_A^2 + 4\gamma \overline{\sigma}}}{2 \overline{\sigma}}  \\
%----------------------------------------------------
& \leq \frac{1}{\sqrt{n}} \frac{2 L_A + 2 \sqrt{\gamma \overline{\sigma}} }{2 \overline{\sigma}} \\
& = \frac{ L_A + \sqrt{\gamma \overline{\sigma}} }{ \overline{\sigma}\sqrt{n}} 
\end{aligned}
\end{equation}
% Due to the constraint , it implies that
% \begin{equation}
%     \EE_{\bx \sim \mathcal{D}_X} \left\langle(\by - A^\prime (\theta^\top \bx)), \theta^\top \bx \right \rangle \leq \gamma
% \end{equation}
Consequently, we have 
\begin{equation}
\mathcal{R}\left(\Theta, S\right) \leq  \frac{ L_A + \sqrt{\gamma \overline{\sigma}} }{ \overline{\sigma}\sqrt{n}}  %\frac{\gamma}{\sqrt{n}(\operatorname{tr}(\Sigma) - L_A^2)}
\end{equation}
Recall the objective of \MixupS{}, 
\begin{align}
\Lcal(\theta, S) &:= \hat{\eta} \left(\Lnmix(\theta, S)+ \eta R(\theta, S) \right) \\
\hat{\eta} &= \frac{|\Lnmix(\theta, S)|}{|\Lnmix(\theta, S)+ \eta R(\theta, S)|} 
\end{align}  
By Lemma~\ref{lemma:1}, we obtain
\begin{equation}
\begin{aligned}
    \mathcal{L}(\theta, S) 
    & \leq \hat{\eta} \Lnmix(\theta, S) +  2 \hat{\eta} \eta L \mathcal{R}(\Theta, S) +  B \sqrt{\frac{\log(1/\delta)}{2n}} \\
    & \leq \hat{\eta} \Lnmix(\theta, S) +  \frac{ 2 \hat{\eta} \eta LL_A(L_A + \sqrt{\gamma \overline{\sigma}}) }{ \overline{\sigma}\sqrt{n}}  +  B \sqrt{\frac{\log(1/\delta)}{2n}}
\end{aligned}
\end{equation}
% \mathcal{R}\left(\Theta, S\right)  
% & \leq \frac{1}{n} \frac{\gamma}{\rho n} \cdot \sqrt{\sum_{i=1}^n \tilde{\bx}_i^{\top} \tilde{\bx}_i}\\ %\cdot\left(\frac{\gamma}{\rho}\right)^{1 / 4} \vee\left(\frac{\gamma}{\rho}\right)^{1 / 2} 
% & \leq \frac{1}{\sqrt{n}} \frac{\gamma}{\rho n} \cdot \operatorname{rank}\left(\Sigma_X\right) .
% \end{aligned}  
% \end{equation}
% Based on this bound on Rademacher complexity, Corollary $3.1$ can be proved by directly applying the following theorem.
\end{proof}

\subsection{Comparison to vanilla Mixup}
As a comparison, for vanilla Mixup with parameter space $\hat \Theta = \{ \theta \mid \|\theta\|_2^2 \leq \gamma \}$, and assuming $\left\| \bx_i \right\|^2 \leq \mathcal{X}$ for all $i \in [n]$, the Rademacher complexity is
\begin{equation}
\begin{aligned}
    \mathcal{R}(\hat \Theta, S) 
    &= \mathbb{E}_{\epsilon} \sup_{\|\theta\|_2^2 \leq \gamma} \frac{1}{n} \sum_{i=1}^n \epsilon_i \theta^{\top} \bx_i \\
    &= \frac{1}{n} \mathbb{E}_{\epsilon} \sup_{\|\theta\|_2^2 \leq \gamma} \sqrt{  \sum_{i=1}^n \epsilon^2_i \left \| \theta \right\|^2 \left\| \bx_i \right\|^2 } \\
    &= \frac{\sqrt{\gamma}}{n} \mathbb{E}_{\epsilon}  \sqrt{  \sum_{i=1}^n \epsilon^2_i \left\| \bx_i \right\|^2 } \\
    & \leq \frac{\sqrt{\gamma \mathcal{X}}}{\sqrt{n}}
\end{aligned}
\label{eq:mixup_rademacher}
\end{equation}
Compared to the Rademacher complexity of Mixup, the bound for \MixupS{} does not require bounding the norm of the input data by $\mathcal{X}$, which may be large. However, for a normalized input space where $\mathcal{X} \leq 1$, the condition for obtaining a smaller parameter space is
\begin{equation}
    \frac{ L_A + \sqrt{\gamma \overline{\sigma}} }{ \overline{\sigma}} \leq \sqrt{\gamma} \Rightarrow \frac{L_A}{\overline{\sigma} - \sqrt{\overline{\sigma}}} \leq \sqrt{\gamma} ~~~\text{and}~~~ \overline{\sigma} > 1
\end{equation}
Thus, when the above condition is satisfied, our regularization reduces the norm of the parameter space in the case where the input space is normalized, i.e., $\mathcal{X} \leq 1$. In general, $\overline{\sigma}$ is the average entry of the maximum correction matrix to the ground truth, which can be quite large. Because of this scaling by $\overline{\sigma}$, the condition is likely to be satisfied in most cases.
% Instead, the Rademacher complexity of reduced parameter space $\mathcal{R}(\Theta, S)$ depends on a large term $(\operatorname{tr}(\Sigma) - L_A^2) > 1$ (generally) in the denominator which is absent in the (\ref{eq:mixup_rademacher}). 

% \newpage
\section{Implementation}
The PyTorch implementation is shown in Listing~\ref{fig: code_block}.
\begin{python}[float=*h, caption={One epoch \MixupS{} training in PyTorch}, label={fig: code_block}]
def beta_mean(alpha, beta):
    """Return alpha / (alpha + beta), the mean of a Beta(alpha, beta) distribution."""
    total = alpha + beta
    return alpha / total
    
# (1 - lam_mod_mean) is the weight applied to the additional loss term below.
lam_mod_mean = beta_mean(alpha+1, alpha) # mean of Beta(alpha + 1, alpha)

# Iterate two loaders in parallel; y1, y2 should be one-hot vectors.
for (x1, y1), (x2, y2) in zip(loader1, loader2):
    lam = numpy.random.beta(alpha, alpha)  # mixing coefficient ~ Beta(alpha, alpha)
    x = Variable(lam * x1 + (1. - lam) * x2)  # convex combination of the inputs
    y = Variable(lam * y1 + (1. - lam) * y2)  # same combination of the labels
    loss = loss_function(net(x), y)  # mixup loss
    # Magnitude of the mixup loss; detached, so it is a constant in backward().
    loss_scale = torch.abs(loss.detach().data.clone())
    f = net(x1)  # network outputs on the unmixed inputs x1
    b = y1 - torch.softmax(f, dim=1)  # labels minus softmax of the outputs
    loss_new = torch.sum(f * b, dim=1)  # inner product of f and b along dim 1
    loss_new = (1.0 - lam_mod_mean) * torch.sum(torch.abs(loss_new)) / batch_size  # additional loss term
    loss = loss + (mixup_eta * loss_new)  # total loss
    # Magnitude of the total loss, detached in the same way.
    loss_new_scale = torch.abs(loss.detach().data.clone())
    # Rescale so the total loss has the same magnitude as the mixup loss alone.
    loss = (loss_scale / loss_new_scale) * loss  # loss after scaling
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
\end{python}



% \iffalse
% \section{More About Experiments}

% \subsection{Example of train error v.s. test error} \label{app:1}
% In this subsection, we present experiments of PreActResNet18 and PreActResNet34 on CIFAR-10. We used these models because of the speed to train and we can get insights on convolution-based model. The experimental conditions are 200 epochs, the initial learning rate is set at 0.1 and annealed by a factor of 10 at epoch 100 and 150, batch size is 100, $L_2$ weight decay of $10^{-4}$ with a step-wise learning rate decay, $\alpha$ is 1.0 for Mixup method, and $\eta = 0.01$ for \MixupS{} method. We use SGD+Momentum optimizer with a momentum coefficient of 0.9. 

% We observe in Figure~\ref{fig:cifar10_resnet18_train_error_sub} and Figure~\ref{fig:cifar10_resnet34_train_error_sub} that while training error of \MixupS{}    overlaps with Mixup, \MixupS{}  method achieves improvements in test error in Figure~\ref{fig:cifar10_resnet18_test_error_sub} and Figure~\ref{fig:cifar10_resnet34_test_error_sub}. 

% \begin{figure}[h]%
%      \centering
%      \begin{subfigure}{0.48\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{resnet18_cifar10_train_error.png}
%          \caption{PreActResNet18 train error.}
%          \label{fig:cifar10_resnet18_train_error_sub}
%      \end{subfigure}
%     % 
%      \begin{subfigure}{0.48\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{resnet18_cifar10_test_error.png}
%          \caption{PreActResNet18 test error.}
%          \label{fig:cifar10_resnet18_test_error_sub}
%      \end{subfigure}
     
%      \begin{subfigure}{0.48\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{resnet34_cifar10_train_error.png}
%          \caption{PreActResNet34 train error.}
%          \label{fig:cifar10_resnet34_train_error_sub}
%      \end{subfigure}
%      %
%      \begin{subfigure}{0.48\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{resnet34_cifar10_test_error.png}
%          \caption{PreActResNet34 test error.}
%          \label{fig:cifar10_resnet34_test_error_sub}
%      \end{subfigure}       

%         \caption{Comparison of train and test errors of ERM, Mixup and \MixupS{} in the training of PreActResNet18 and PreActResNet34 on CIFAR-10. The train errors of \MixupS{} and Mixup are similar, but we see a smaller test error for the \MixupS{} method.}
%         \label{fig:cifar10_resnet18_train_test_error}
% \end{figure}



% \subsection{Tabular Datasets}
% \begin{table}[t!]
% %\kk{for the binary case, the code needs to be fixed or remove the binary case.}
%     \centering
%     \setlength{\tabcolsep}{20pt}
%     \renewcommand{\arraystretch}{1.5}
%     \begin{tabular}{c c c c c}
%     \toprule
%     \textbf{Dataset} & \textbf{Id} & \multicolumn{3}{c}{\textbf{Method}} \\
%     \midrule
%     & & ERM & Mixup & MixupScaled \\
% \iffalse
%     \midrule
%     \multicolumn{5}{c}{\textit{Binary Classification}} \\
%     \midrule
% kr-vs-kp & 3 & \g{0.77}{0.32} & \g{0.71}{0.34} & \g{\textbf{0.70}}{0.38} \\
% breast-w & 15 & \g{\textbf{2.17}}{1.08} & \g{2.32}{0.93} & \g{2.32}{0.93} \\
% credit-approval & 29 & \g{15.05}{1.96} & \g{13.80}{1.33} & \g{\textbf{13.59}}{2.10} \\
% credit-g & 31 & \g{25.23}{2.03} & \g{\textbf{24.52}}{2.53} & \g{25.02}{2.13} \\
% diabetes & 37 & \g{25.25}{1.36} & \g{\textbf{24.04}}{1.54} & \g{25.26}{2.00} \\
% haberman & 43 & \g{\textbf{25.81}}{3.55} & \g{29.40}{3.38} & \g{26.82}{4.02} \\
% heart-c & 49 & \g{21.73}{3.70} & \g{\textbf{20.37}}{3.97} & \g{21.12}{4.53} \\
% heart-statlog & 53 & \g{\textbf{14.90}}{5.83} & \g{16.23}{6.42} & \g{16.23}{6.73} \\
% arcene & 1458 & \g{\textbf{24.92}}{7.43} & \g{26.69}{9.56} & \g{26.27}{4.90} \\
% bank-marketing & 1461 & \g{9.35}{0.15} & \g{\textbf{9.30}}{0.31} & \g{9.43}{0.22} \\
% PhishingWebsites & 4534 & \g{3.45}{0.41} & \g{\textbf{3.27}}{0.29} & \g{3.49}{0.19} \\
% CreditCardFraudDetection & 42175 & \g{0.06}{0.01} & \g{0.06}{0.01} & \g{0.06}{0.01} \\
% Click\_prediction\_small & 42733 & \g{\textbf{16.26}}{0.29} & \g{16.29}{0.25} & \g{16.33}{0.28} \\
% online-shoppers-intention & 42993 & \g{\textbf{10.33}}{0.35} & \g{10.50}{0.48} & \g{10.52}{0.41} \\
% Pulsar-Dataset-HTRU2 & 43377 & \g{2.07}{0.26} & \g{\textbf{2.04}}{0.34} & \g{2.08}{0.22} \\
% \fi
%     \midrule
%     %\multicolumn{5}{c}{\textit{Multi-Class Classification}} \\
%     %\midrule
% arrhythmia & 5 & \g{\textbf{34.60}}{3.10} & \g{35.49}{3.88} & \g{34.85}{3.99} \\
% letter & 6 & \g{4.56}{0.27} & \g{\textbf{3.71}}{0.18} & \g{4.04}{0.20} \\
% balance-scale & 11 & \g{3.87}{1.03} & \g{3.70}{1.00} & \g{\textbf{3.68}}{0.97} \\
% mfeat-factors & 12 & \g{2.74}{0.81} & \g{\textbf{2.44}}{0.42} & \g{2.56}{0.64} \\
% mfeat-fourier & 14 & \g{17.69}{1.76} & \g{17.80}{1.56} & \g{\textbf{17.57}}{1.60} \\
% mfeat-karhunen & 16 & \g{3.74}{0.58} & \g{3.06}{0.29} & \g{\textbf{2.47}}{0.32} \\
% mfeat-morphological & 18 & \g{25.00}{2.10} & \g{\textbf{24.62}}{1.83} & \g{24.66}{1.30} \\
% mfeat-zernike & 22 & \g{17.58}{1.72} & \g{\textbf{15.19}}{1.73} & \g{15.55}{0.62} \\
% cmc & 23 & \g{45.77}{1.49} & \g{46.67}{1.83} & \g{\textbf{45.42}}{2.05} \\
% optdigits & 28 & \g{1.48}{0.19} & \g{\textbf{1.15}}{0.21} & \g{1.33}{0.14} \\
% pendigits & 32 & \g{1.03}{0.25} & \g{0.76}{0.19} & \g{\textbf{0.72}}{0.16} \\
% iris & 61 & \g{9.06}{7.01} & \g{8.14}{6.48} & \g{\textbf{7.29}}{6.95} \\
% mnist\_784 & 554 & \g{2.83}{0.11} & \g{2.57}{0.05} & \g{\textbf{2.56}}{0.14} \\
% abalone & 1557 & \g{35.05}{0.61} & \g{35.07}{0.69} & \g{\textbf{34.91}}{0.70} \\
% volkert & 41166 & \g{33.26}{0.62} & \g{32.74}{0.76} & \g{\textbf{32.54}}{0.61} \\
%      \midrule
%      \multicolumn{2}{c}{Total} & 15.88 & 15.54 & \textbf{15.34} \\
%      \bottomrule
%     \end{tabular}
%     \caption{Test error for tabular data classification task.}
%     \label{tab:tabular_full}
% \end{table}
% \fi

% % \begin{table}[h]
% %     \caption{ Classification Test Error (\%) on graph datasets from the TUDatasets benchmark when using the setup in \cite{sun2019infograph}. Results are averaged over five seeds.}
% %     \centering
% %     \setlength{\tabcolsep}{20pt}
% %     \renewcommand{\arraystretch}{1.25}
% %     \begin{tabular}{l  c c c}
% %     \toprule
% %     \textbf{Dataset}  & \multicolumn{3}{c}{\textbf{Method}} \\
% %     &  ERM & Mixup & \MixupS{} \\
% %     \midrule
% % MUTAG & \g{14.47}{1.59} & \g{13.19}{3.13} & \g{\textbf{12.34}}{4.54}\\
% % NCI1 & \g{21.46}{1.68} & \g{20.82}{1.40} & \g{\textbf{20.31}}{1.48}\\
% % PTC\_MR & \g{40.93}{5.37} & \g{37.67}{9.38} & \g{\textbf{36.05}}{4.65}\\
% % ENZYMES & \g{54.67}{2.27} & \g{52.53}{3.51} & \g{\textbf{52.00}}{1.93}\\
% % PROTEINS & \g{28.89}{1.48} & \g{27.53}{1.16} & \g{\textbf{26.88}}{1.42}\\
% % COLLAB & \g{31.22}{1.94} & \g{30.66}{1.54} & \g{\textbf{30.05}}{1.37}\\
% % REDDIT-MULTI-5K & \g{48.18}{2.06} & \g{\textbf{46.69}}{1.36} & \g{46.80}{1.59}\\
% % IMDB-BINARY & \g{30.00}{3.86} & \g{29.04}{3.36} & \g{\textbf{28.40}}{2.59}\\
% % IMDB-MULTI & \g{53.07}{2.54} & \g{52.69}{2.52} & \g{\textbf{51.95}}{2.02}\\
% % REDDIT-BINARY & \g{9.36}{1.01} & \g{8.40}{1.39} & \g{\textbf{8.28}}{0.71}\\
% %     \bottomrule
% %     \end{tabular}
    
% %     \label{tab:graph_info}
% % \end{table}

% \iffalse
% \begin{table}[h]
%     \caption{ Classification Test Error (\%) on graph datasets from the TUDatasets benchmark when using the setup in \cite{sun2019infograph}. Results are averaged over ten seeds.}
%     \centering
%     \setlength{\tabcolsep}{20pt}
%     \renewcommand{\arraystretch}{1.25}
%     \begin{tabular}{l  c c c}
%     \toprule
%     \textbf{Dataset}  & \multicolumn{3}{c}{\textbf{Method}} \\
%     &  ERM & Mixup & \MixupS{} \\
%     \midrule
% MUTAG & \g{15.96}{3.95} & \g{15.53}{4.04} & \g{\textbf{15.32}}{2.82}\\
% NCI1 & \g{21.10}{1.16} & \g{20.84}{1.77} & \g{\textbf{20.78}}{1.36}\\
% ENZYMES & \g{54.00}{2.27} & \g{53.80}{1.89} & \g{\textbf{53.73}}{2.60}\\
% PROTEINS & \g{28.89}{2.77} & \g{28.35}{2.04} & \g{\textbf{27.74}}{2.61}\\
% COLLAB & \g{31.20}{2.16} & \g{30.94}{1.90} & \g{\textbf{30.68}}{2.23}\\
% REDDIT-MULTI-5K & \g{48.73}{2.46} & \g{\textbf{47.96}}{1.87} & \g{48.22}{2.56}\\
% IMDB-BINARY & \g{31.00}{3.24} & \g{29.60}{3.55} & \g{\textbf{28.20}}{2.59}\\
% IMDB-MULTI & \g{53.92}{4.09} & \g{53.36}{2.80} & \g{\textbf{53.01}}{2.84}\\
% REDDIT-BINARY & \g{8.90}{1.09} & \g{\textbf{8.54}}{1.29} & \g{8.74}{1.13}\\
%     \bottomrule
%     \end{tabular}
    
%     \label{tab:graph_info}
% \end{table}

% \begin{table}[h]
%     \caption{ Classification Test Error (\%) on graph datasets from the TUDatasets benchmark when following the setup of \cite{xu2018powerful}. Results are obtained from 10-fold validation.}
%     \centering
%     \setlength{\tabcolsep}{20pt}
%     \renewcommand{\arraystretch}{1.25}
%     \begin{tabular}{l  c c c}
%     \toprule
%     \textbf{Dataset}  & \multicolumn{3}{c}{\textbf{Method}} \\
%     &  ERM & Mixup & \MixupS{} \\
%     \midrule
% MUTAG & \g{10.15}{0.06} & \g{10.67}{0.05} & \g{\textbf{10.06}}{0.06}\\
% NCI1 & \g{17.79}{0.02} & \g{18.59}{0.02} & \g{\textbf{17.74}}{0.01}\\
% PTC & \g{38.37}{0.09} & \g{\textbf{34.87}}{0.08} & \g{35.50}{0.08}\\
% PROTEINS & \g{25.43}{0.04} & \g{24.44}{0.04} & \g{\textbf{23.72}}{0.04}\\
% IMDBBINARY & \g{25.60}{0.03} & \g{25.30}{0.03} & \g{\textbf{25.20}}{0.03}\\
% IMDBMULTI & \g{50.33}{0.03} & \g{49.27}{0.04} & \g{\textbf{48.53}}{0.03}\\
%     \bottomrule
%     \end{tabular}
%     \label{tab:graph_pgnn}
% \end{table}

% \begin{table}[h]
%     \caption{ Classification Test Error (\%) on graph datasets when following the setup of \cite{wang2021mixup}. Results are averaged over 5 seeds.}
%     \centering
%     \setlength{\tabcolsep}{20pt}
%     \renewcommand{\arraystretch}{1.25}
%     \begin{tabular}{l  c c c}
%     \toprule
%     \textbf{Dataset}  & \multicolumn{3}{c}{\textbf{Method}} \\
%     &  ERM & Mixup & \MixupS{} \\
%     \midrule
% Cora & \g{13.76}{0.02} & \g{12.77}{0.01} & \g{\textbf{12.51}}{0.01}\\
% Pubmed & \g{11.65}{0.00} & \g{10.59}{0.00} & \g{\textbf{10.55}}{0.00}\\
% Citeseer & \g{25.38}{0.02} & \g{24.65}{0.01} & \g{\textbf{24.56}}{0.01}\\
% Flickr & \g{46.89}{0.00} & \g{46.70}{0.00} & \g{\textbf{46.65}}{0.00}\\
% \bottomrule
% \end{tabular}
% \label{tab:graph_mix}
% \end{table}


% \section{Additional Experiments: Graph Datasets}
% For graph classification, we consider the \textit{MUTAG, NCI1, PTC\_MR, ENZYMES, PROTEINS, COLLAB, REDDIT-MULTI-5K, IMDB-BINARY, IMDB-MULTI} and \textit{REDDIT-BINARY} datasets. We consider the following two setups for our experimentations.

% We first use the GIN model \citep{xu2018powerful} using the implementation settings of \citep{sun2019infograph} as the baseline system. In these experiments we apply Mixup and \MixupS{} after encoding the graph to a fixed dimensional vector, that is, at the graph-level readout stage. Each system here relies on 3 graph neural network layers that give rise to the readout, which is then operated on by a non-linear MLP. The models are trained for 300 epochs using Adam optimizer with a learning rate of 0.001. For the hyperparameters, we consider $\alpha \in \{0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0\}$ and $\eta \in \{0.0001, 0.001, 0.01, 0.1, 1.0\}$. Corresponding to each model setting, we train five seeds, and choose the epoch at which the mean of the five seeds give the best test accuracy, and report the final mean and standard deviation of the algorithm on that single epoch over the five seeds. We refer the readers to Table \ref{tab:graph_info}, which shows the benefits of using scaled mixup on the datasets.

% Next, we use the experimental settings defined in \citep{xu2018powerful} as the baseline system, where Mixup and \MixupS{}is again performed after encoding the graph to a fixed dimensional vector, that is, at the graph-level readout stage. Each system here relies on 5 graph neural network layers that give rise to the readout, which is then operated on by a non-linear MLP. The models are trained for 350 epochs using Adam optimizer with a learning rate of 0.01, which is halved every 50 epochs. For the hyperparameters, we consider $\alpha \in \{0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0\}$ and $\eta \in \{0.0001, 0.001, 0.01, 0.1, 1.0\}$. Corresponding to each model setting, we perform 10-fold validation and identify which epoch and hyperparameters give the best test accuracy, and report the final mean and standard deviation of the algorithm on it over the ten folds. We refer the readers to Table \ref{tab:graph_pgnn}, which shows the benefits of using scaled mixup on the datasets.

% From our experiments above, we see that \MixupS{} helps in two different baseline settings for graph based data, thereby validating the benefits of this approach.
% \fi

\bibliography{MixupE_129}

\end{document}
