% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)
% Added by Fan
% \usepackage{hyperref}
% \hypersetup{
%     colorlinks=true,
%     linkcolor=blue,
%     filecolor=blue,      
%     urlcolor=blue,
%     citecolor=cyan,
% }
\usepackage{xcolor}
\usepackage{amsmath,amssymb,amsthm,mathrsfs,url,array}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{wrapfig}
\usepackage{appendix}
\usepackage{multirow}
\usepackage{makecell}
\usepackage{diagbox}
% \usepackage{algorithm}
\usepackage[ruled,linesnumbered]{algorithm2e}
% \usepackage{algorithmic}
% \theoremstyle{break}
\newtheorem{Def}{Definition} 
\newtheorem{Th}{Theorem}
\newtheorem{Co}{Corollary}
\newtheorem{Lm}{Lemma}
\newtheorem{Prop}{Proposition} 
\allowdisplaybreaks[4]

\newcommand{\xyx}[1]{\textcolor{red}{[XYX]: #1}}
\newcommand{\fan}[1]{\textcolor{blue}{[Fan]: #1}}
\newcommand{\org}[1]{\textcolor{green}{[ORIGINAL]: #1}}
\newcommand{\jzl}[1]{\textcolor{brown}{[JZL]: #1}}
%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{X-MEN: Guaranteed XOR-Maximum Entropy \\
Constrained Inverse Reinforcement Learning \\
(Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% Add authors
\author[1]{\href{mailto:<ding274@purdue.edu>?Subject=Your UAI 2022 paper}{Fan Ding}{}}
\author[1]{Yexiang Xue}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    Purdue University\\
    West Lafayette, Indiana, USA
}
  
\begin{document}
\maketitle
%\appendix
% NOTE: necessary when ptmx or no mathfont class option is given

\section{Proof of Theorem 4}
Before proving Theorem~4, we need to bound two important terms, as stated in Lemma~\ref{lm:XOR_bound}.
%
\begin{Lm}\label{lm:XOR_bound}
Let $f:\mathbb{R}^d\rightarrow \mathbb{R}$ be a convex function and $\theta^*=\text{argmin}_{\theta} f(\theta)$. In iteration $t$, $g_t$ is the estimated gradient. Suppose $||\mathbb{E}[g_t^+]||_2 \leq G$, $||\mathbb{E}[g_t^-]||_2 \leq G$, and $||\theta_t - \theta^*||_2 \leq R$. If there exists a constant $c\geq 1$ s.t. $\frac{1}{c}[\nabla f(\theta_t)]^+ \leq \mathbb{E}[g_t^+]\leq c[\nabla f(\theta_t)]^+$ and $c[\nabla f(\theta_t)]^- \leq \mathbb{E}[g_t^-]\leq \frac{1}{c}[\nabla f(\theta_t)]^-$, then we have 
\begin{align}
     \frac{1}{c} ||\mathbb{E}[g_t]||_2^2 &\leq \langle\nabla f(\theta_t), \mathbb{E}[g_t]\rangle + 2(c-\frac{1}{c})G^2.\label{eq:lem2_ieq1}\\
     \langle\nabla f(\theta_t),\theta_t-\theta^*\rangle  &\leq c\langle\mathbb{E}[g_t],\theta_t-\theta^*\rangle + 2(c-\frac{1}{c})GR. \label{eq:lem2_ieq2}
\end{align}
\end{Lm}
\subsection{Proof of Lemma \ref{lm:XOR_bound}}
\begin{proof} (Lemma \ref{lm:XOR_bound})
Since we have the constant bound that
\begin{align}
    \frac{1}{c}[\nabla f(\theta_t)]^+ & \leq \mathbb{E}[g_t^+]\leq c[\nabla f(\theta_t)]^+.\label{eq:pos1}\\
    c[\nabla f(\theta_t)]^- & \leq \mathbb{E}[g_t^-]\leq \frac{1}{c}[\nabla f(\theta_t)]^-.\label{eq:neg1}
\end{align}
and because of $g_t^+\geq \mathbf{0}$ and $g_t^-\leq \mathbf{0}$ we can obtain
\begin{align*}
     \frac{1}{c}||\mathbb{E}[g_t^+]||_2^2&=\frac{1}{c}\langle \mathbb{E}[g_t^+], \mathbb{E}[g_t^+]\rangle \leq \langle[\nabla f(\theta_t)]^+, \mathbb{E}[g_t^+]\rangle \\
     &\leq c\langle \mathbb{E}[g_t^+], \mathbb{E}[g_t^+]\rangle = c||\mathbb{E}[g_t^+]||_2^2.\\
     \frac{1}{c}||\mathbb{E}[g_t^-]||_2^2&=\frac{1}{c}\langle \mathbb{E}[g_t^-], \mathbb{E}[g_t^-]\rangle \leq \langle[\nabla f(\theta_t)]^-, \mathbb{E}[g_t^-]\rangle\\
     &\leq c\langle \mathbb{E}[g_t^-], \mathbb{E}[g_t^-]\rangle = c||\mathbb{E}[g_t^-]||_2^2.
\end{align*}
For cross terms, we have:
\begin{align*}
    \langle [\nabla f(\theta_t)]^+, \mathbb{E}[g_t^-]\rangle &\geq c \langle \mathbb{E}[g_t^+], \mathbb{E}[g_t^-]\rangle,\\
    \langle [\nabla f(\theta_t)]^-, \mathbb{E}[g_t^+]\rangle &\geq c \langle \mathbb{E}[g_t^-], \mathbb{E}[g_t^+]\rangle.
\end{align*}
Notice that:
\begin{align*}
    &\frac{1}{c}||\mathbb{E}[g_t]||_2^2 \\ =&\frac{1}{c}||\mathbb{E}[g_t^+]+\mathbb{E}[g_t^-]||_2^2 \\
    =& \frac{1}{c}(||\mathbb{E}[g_t^+]||_2^2+||\mathbb{E}[g_t^-]||_2^2 + 2 \langle \mathbb{E}[g_t^+], \mathbb{E}[g_t^-] \rangle)
\end{align*}
Then we can further derive:
\begin{align*}
    &\frac{1}{c}||\mathbb{E}[g_t]||_2^2\\
    \leq& \langle [\nabla f(\theta_t)]^+,\mathbb{E}[g_t^+] \rangle+ 
    \langle [\nabla f(\theta_t)]^-,\mathbb{E}[g_t^-]\rangle +\\ &\frac{1}{c^2} \langle [\nabla f(\theta_t)]^+,\mathbb{E}[g_t^-] \rangle + \frac{1}{c^2} \langle [\nabla f(\theta_t)]^-,\mathbb{E}[g_t^+] \rangle\\
    =& \langle [\nabla f(\theta_t)]^+,\mathbb{E}[g_t^+] \rangle+ 
    \langle [\nabla f(\theta_t)]^-,\mathbb{E}[g_t^-]\rangle +\\
    &\langle [\nabla f(\theta_t)]^+,\mathbb{E}[g_t^-] \rangle +  \langle [\nabla f(\theta_t)]^-,\mathbb{E}[g_t^+] \rangle +\\
    &(\frac{1}{c^2}-1) \left(\langle [\nabla f(\theta_t)]^+,\mathbb{E}[g_t^-] \rangle +  \langle [\nabla f(\theta_t)]^-,\mathbb{E}[g_t^+] \rangle\right)\\
    =& \langle \nabla f(\theta_t),\mathbb{E}[g_t]\rangle +(\frac{1}{c^2}-1) \langle [\nabla f(\theta_t)]^+,\mathbb{E}[g_t^-] \rangle \\
    &+ (\frac{1}{c^2}-1) \langle [\nabla f(\theta_t)]^-,\mathbb{E}[g_t^+] \rangle\\
    \leq& \langle \nabla f(\theta_t),\mathbb{E}[g_t]\rangle +  (\frac{1}{c}-c) \langle \mathbb{E}[g_t^+],\mathbb{E}[g_t^-] \rangle \\
    &+ (\frac{1}{c}-c) \langle \mathbb{E}[g_t^-],\mathbb{E}[g_t^+] \rangle.
\end{align*}
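By the Cauchy--Schwarz inequality, we have $ |\langle \mathbb{E}[g_t^+], \mathbb{E}[g_t^-] \rangle| \leq ||\mathbb{E}[g_t^+]||_2 ||\mathbb{E}[g_t^-]||_2 \leq G^2$.
Since $\frac{1}{c}-c\leq 0$, the last two inner-product terms above sum to at most $2(c-\frac{1}{c})G^2$.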
Combining the proof above, we can get Equation~\ref{eq:lem2_ieq1}.\\
To prove Equation~\ref{eq:lem2_ieq2}, first notice:
\begin{align*}
    \frac{1}{c} \langle \mathbb{E}[g_t^+], [\theta_t-\theta^*]^+ \rangle &\leq \langle [\nabla f(\theta_t)]^+, [\theta_t-\theta^*]^+ \rangle \\
    &\leq c \langle \mathbb{E}[g_t^+], [\theta_t-\theta^*]^+ \rangle, \\
    \frac{1}{c} \langle \mathbb{E}[g_t^-], [\theta_t-\theta^*]^- \rangle &\leq \langle [\nabla f(\theta_t)]^-, [\theta_t-\theta^*]^- \rangle \\
    &\leq c \langle \mathbb{E}[g_t^-], [\theta_t-\theta^*]^- \rangle, \\    
    {c} \langle \mathbb{E}[g_t^+], [\theta_t-\theta^*]^- \rangle &\leq \langle [\nabla f(\theta_t)]^+, [\theta_t-\theta^*]^- \rangle \\
    &\leq \frac{1}{c} \langle \mathbb{E}[g_t^+], [\theta_t-\theta^*]^- \rangle, \\
    {c} \langle \mathbb{E}[g_t^-], [\theta_t-\theta^*]^+ \rangle 
    &\leq \langle [\nabla f(\theta_t)]^-, [\theta_t-\theta^*]^+ \rangle \\
    &\leq \frac{1}{c} \langle \mathbb{E}[g_t^-], [\theta_t-\theta^*]^+ \rangle,
\end{align*} where $[\theta_t-\theta^*]^+=\max\{\theta_t-\theta^*,\textbf{0}\}$ and $[\theta_t-\theta^*]^-=\min\{\theta_t-\theta^*,\textbf{0}\}$. \\
Then we have:
\begin{align*}
    & \langle \nabla f(\theta_t), \theta_t-\theta^* \rangle \\
    =& \langle [\nabla f(\theta_t)]^+ + [\nabla f(\theta_t)]^-, [\theta_t-\theta^*]^+ + [\theta_t-\theta^*]^- \rangle\\
    =& \langle [\nabla f(\theta_t)]^+ , [\theta_t-\theta^*]^+ \rangle + 
    \langle [\nabla f(\theta_t)]^+ , [\theta_t-\theta^*]^- \rangle +\\
    &\langle [\nabla f(\theta_t)]^- , [\theta_t-\theta^*]^+ \rangle + 
    \langle [\nabla f(\theta_t)]^- , [\theta_t-\theta^*]^- \rangle\\
    \leq& c \langle \mathbb{E}[g_t^+], [\theta_t-\theta^*]^+ \rangle + c \langle \mathbb{E}[g_t^-], [\theta_t-\theta^*]^- \rangle + \\
    & \frac{1}{c} \langle \mathbb{E}[g_t^-], [\theta_t-\theta^*]^+ \rangle + \frac{1}{c} \langle \mathbb{E}[g_t^+], [\theta_t-\theta^*]^- \rangle\\
    =&c \langle \mathbb{E}[g_t^+], [\theta_t-\theta^*]^+ \rangle + c \langle \mathbb{E}[g_t^-], [\theta_t-\theta^*]^- \rangle + \\
    & c \langle \mathbb{E}[g_t^-], [\theta_t-\theta^*]^+ \rangle + {c} \langle \mathbb{E}[g_t^+], [\theta_t-\theta^*]^- \rangle +\\
    & (\frac{1}{c} - c) (\langle \mathbb{E}[g_t^-], [\theta_t-\theta^*]^+ \rangle + \langle \mathbb{E}[g_t^+], [\theta_t-\theta^*]^- \rangle)\\
    =& c \langle \mathbb{E}[g_t], [\theta_t-\theta^*] \rangle + \\
    & (\frac{1}{c} - c) (\langle \mathbb{E}[g_t^-], [\theta_t-\theta^*]^+ \rangle + \langle \mathbb{E}[g_t^+], [\theta_t-\theta^*]^- \rangle)
\end{align*}
In addition, $\langle \mathbb{E}[g_t^-], [\theta_t-\theta^*]^+ \rangle$ and $\langle \mathbb{E}[g_t^+], [\theta_t-\theta^*]^- \rangle$ can be bounded by the Cauchy--Schwarz inequality:
\begin{align*}
    |\langle \mathbb{E}[g_t^+], [\theta_t-\theta^*]^- \rangle| &\leq ||\mathbb{E}[g_t^+]||_2 ||[\theta_t-\theta^*]^-||_2 \\ 
    &= ||\mathbb{E}[g_t^+]||_2||\min \{ \theta_t-\theta^*, \mathbf{0} \}||_2\\
    &\leq ||\mathbb{E}[g_t^+]||_2|| \theta_t-\theta^*||_2 \\
    & \leq GR \\
    |\langle \mathbb{E}[g_t^-], [\theta_t-\theta^*]^+ \rangle| &\leq ||\mathbb{E}[g_t^-]||_2 ||[\theta_t-\theta^*]^+||_2 \\ 
    &= ||\mathbb{E}[g_t^-]||_2||\max \{ \theta_t-\theta^*, \mathbf{0} \}||_2\\
    &\leq ||\mathbb{E}[g_t^-]||_2|| \theta_t-\theta^*||_2 \\
    & \leq GR.
\end{align*}
Therefore Equation~\ref{eq:lem2_ieq2} can be proved, and this completes the proof.
\end{proof}
Lemma~\ref{lm:XOR_bound} gives new bounds on two terms under the constant-factor bound on the gradient; these bounds are essential to the proof of the convergence rate. Based on Lemma~\ref{lm:XOR_bound}, we can prove Theorem~4, which bounds the error of Stochastic Gradient Descent (SGD) on
a convex optimization problem when the estimated gradient $g_t$ in the
$t$-th step lies within a constant factor of $\nabla f(\theta_t)$.
\begin{proof} (Theorem 4)
By the $L$-smoothness of $f$, for the $t$-th iteration,
\begin{align*}
    f(\theta_{t+1})&\leq f(\theta_t)+\langle\nabla f(\theta_t),\theta_{t+1}-\theta_t\rangle +\frac{L}{2}||\theta_{t+1}-\theta_t||_2^2,\\
       & =f(\theta_t)-\eta\langle\nabla f(\theta_t), g_t\rangle+\frac{L\eta^2}{2}||g_t||_2^2.
\end{align*}
Because of the constant bound on the gradient and $||\mathbb{E}[g_t]||_2^2= \mathbb{E}[||g_t||_2^2]-Var(g_t)$, by taking the expectation of both sides w.r.t.\ $g_t$ we get from Lemma~\ref{lm:XOR_bound} that
\begin{align*}
    &\mathbb{E}[f(\theta_{t+1})] \leq f(\theta_t)- \eta\langle\nabla f(\theta_t),\mathbb{E}[g_t]\rangle +\frac{L\eta^2}{2}\mathbb{E}[||g_t||_2^2]\\
    & \leq f(\theta_t)- \eta \left( \frac{1}{c} ||\mathbb{E}[g_t]||_2^2  - 2(c-\frac{1}{c})G^2 \right) +\frac{L\eta^2}{2}\mathbb{E}[||g_t||_2^2]\\
    & = f(\theta_t)- \eta \left( \frac{1}{c} \left( \mathbb{E}[||g_t||_2^2] -Var(g_t) \right ) - 2(c-\frac{1}{c})G^2 \right) + \\
    &\quad \quad \frac{L\eta^2}{2}\mathbb{E}[||g_t||_2^2]\\
    & \leq f(\theta_t) - \frac{\eta(2-L \eta c)}{2c}\mathbb{E}[||g_t||_2^2] + \frac{\eta}{c}\sigma^2 + 2\eta (c-\frac{1}{c})G^2\\
    & \leq f(\theta_t) - \frac{\eta c}{2}\mathbb{E}[||g_t||_2^2] + \frac{\eta}{c}\sigma^2 + 2\eta (c-\frac{1}{c})G^2
\end{align*}
where the last inequality follows as $L\eta c\leq 2-c^2$. Because $f$ is convex, still from Lemma \ref{lm:XOR_bound} we get
\begin{align*}
    &\mathbb{E}[f(\theta_{t+1})]\\
    &\leq f(\theta^*)+\langle\nabla f(\theta_t),\theta_t-\theta^*\rangle-\frac{\eta c}{2}\mathbb{E}[||g_t||_2^2]+ \\
    & \quad \quad \frac{\eta}{c}\sigma^2 + 2\eta (c-\frac{1}{c})G^2,\\
    &\leq f(\theta^*)+c\langle\mathbb{E}[g_t],\theta_t-\theta^*\rangle + 2(c-\frac{1}{c})GR-\frac{\eta c}{2}\mathbb{E}[||g_t||_2^2]+ \\
    & \quad \quad \frac{\eta}{c}\sigma^2 + 2\eta (c-\frac{1}{c})G^2,\\
    &= f(\theta^*)+c\mathbb{E}[\langle g_t,\theta_{t}-\theta^* \rangle-\frac{\eta}{2}||g_t||_2^2]+\frac{\eta}{c}\sigma^2 + \\
    & \quad \quad 2(c-\frac{1}{c})GR + 2\eta (c-\frac{1}{c})G^2.
\end{align*}
Denote $\Lambda = \frac{\eta}{c}\sigma^2 + 2(c-\frac{1}{c})GR + 2\eta (c-\frac{1}{c})G^2$. 
We now repeat the calculations by completing the square for the middle two terms to get
\begin{align*}
    &\mathbb{E}[f(\theta_{t+1})]\\
    &\leq f(\theta^*)+\frac{c}{2\eta}\mathbb{E}[2\eta\langle g_t,\theta_{t}-\theta^* \rangle-\eta^2||g_t||_2^2]+\Lambda,\\
    &\leq f(\theta^*)+\frac{c}{2\eta}\mathbb{E}[||\theta_t-\theta^*||_2^2-||\theta_t-\theta^*-\eta g_t||_2^2]+\Lambda,\\
    &=f(\theta^*)+\frac{c}{2\eta}\mathbb{E}[(||\theta_t-\theta^*||_2^2-||\theta_{t+1}-\theta^*||_2^2)]+\Lambda.
\end{align*}
Summing the above equations for $t=0,\ldots,T-1$, we get
\begin{align*}
    &\sum_{t=0}^{T-1}\mathbb{E}[f(\theta_{t+1})-f(\theta^*)]\\
    &\leq\frac{c}{2\eta}(||\theta_0-\theta^*||_2^2-\mathbb{E}[||\theta_T-\theta^*||_2^2])+T \Lambda\\
    &\leq \frac{c||\theta_0-\theta^*||_2^2}{2\eta}+ T\Lambda.
\end{align*}
Finally, by Jensen's inequality, $Tf(\overline{\theta_T})\leq \sum_{t=1}^Tf(\theta_t)$,
\begin{align*}
    \sum_{t=0}^{T-1}\mathbb{E}[f(\theta_{t+1})-f(\theta^*)]&=\mathbb{E}[\sum_{t=1}^Tf(\theta_t)]-Tf(\theta^*)\\
    &\geq T\mathbb{E}[f(\overline{\theta_T})]-Tf(\theta^*).
\end{align*}
Combining the above equations we get
\begin{align*}
    \mathbb{E}[f(\overline{\theta_T})] \leq & f(\theta^*)+\frac{c||\theta_0-\theta^*||_2^2}{2\eta T}+\frac{\eta}{c}\sigma^2 + \\
    & \quad 2(c-\frac{1}{c})GR + 2\eta (c-\frac{1}{c})G^2.
\end{align*}
This completes the proof.
\end{proof}


\section{Proof of Theorem 3}\label{app:Th:main}
To prove Theorem 3, we first introduce a Lemma as follows:
\begin{Lm}\label{Lm:L-smooth}
 If the total variation $\max_{\theta}Var_{P}(f(\tau))\leq \sigma_2^2$, then $L(\theta)$ is $\sigma_2^2$-smooth w.r.t. $\theta$.
\end{Lm}

\begin{proof}
Since $L(\theta)=\frac{1}{N}\sum_{\tau\in\mathcal{D}}\log P(\tau|\theta,T)$, $\sigma_2^2$-smoothness requires that
\begin{align*}
    ||\nabla L(\theta_1)-\nabla L(\theta_2)||_2\leq \sigma_2^2||\theta_1-\theta_2||_2
\end{align*}
where $\sigma_2^2$ plays the role of the smoothness constant. By the mean value theorem, there exists a point $\tilde{\theta}\in(\theta_1, \theta_2)$ such that
\begin{align*}
    \nabla L(\theta_1)-\nabla L(\theta_2)=\nabla(\nabla L(\tilde{\theta}))(\theta_1-\theta_2).
\end{align*}
Taking the $L_2$ norm for both sides, we have
\begin{align}
    ||\nabla L(\theta_1)-\nabla L(\theta_2)||_2=&
    ||\nabla(\nabla L(\tilde{\theta}))(\theta_1-\theta_2)||_2\nonumber\\
    \leq&||\nabla(\nabla L(\tilde{\theta}))||_2~||\theta_1-\theta_2||_2\label{eq:lem11}
\end{align}
Then, the problem is to bound the matrix 2-norm $||\nabla(\nabla L(\tilde{\theta}))||_2$. Since we know the explicit form of  $L(\theta)$, we know
\begin{align}
    \nabla L(\theta)&=\frac{1}{|\mathcal{D}|}\sum_{\tau\in\mathcal{D}}f(\tau) - \nabla\log Z_{\theta},    \nonumber
\end{align}
\begin{align}
    &\nabla(\nabla L(\theta))\nonumber\\
    =& -\sum_{\tau\in\mathcal{T}}[f(\tau)-\nabla\log Z_{\theta}][f(\tau)-\nabla\log Z_{\theta}]^T P(\tau|\theta,T)\label{eq:lem12},
\end{align}
where $\nabla(\nabla L(\theta))$ is the negative of the covariance matrix of $f(\tau)$. Denote Cov$_{\theta}[f(\tau)]=-\nabla(\nabla L(\theta))$, which is both symmetric and positive semi-definite. We have
\begin{align*}
    ||\nabla(\nabla L(\tilde{\theta}))||_2=||\text{Cov}_{\theta}[f(\tau)]||_2=\lambda_{max},
\end{align*}
where $\lambda_{max}$ is the maximum eigenvalue of the matrix Cov$_{\theta}[f(\tau)]$. Then, because of the positive semi-definiteness of the co-variance matrix, all the eigenvalues are non-negative, and we can bound $\lambda_{max}$ as
\begin{align*}
    \lambda_{max}\leq\sum_{i}\lambda_i=Tr(\text{Cov}_{\theta}[f(\tau)]),
\end{align*}
where $Tr($Cov$_{\theta}[f(\tau)])$ is the trace of the matrix Cov$_{\theta}[f(\tau)]$. Using the definition in Equation~\ref{eq:lem12}, $Tr($Cov$_{\theta}[f(\tau)])$ can be further derived as:
\begin{align*}
    Tr(\text{Cov}_{\theta}[f(\tau)])=\mathbb{E}_{P}[||f(\tau)||_2^2]-||\mathbb{E}_{P}[f(\tau)]||_2^2,
\end{align*}
which is equal to the total variation $Var_{P}(f(\tau))$. Therefore, we have
\begin{align*}
    ||\nabla(\nabla L(\tilde{\theta}))||_2\leq Var_{P}(f(\tau))\leq \sigma_2^2.
\end{align*}
Combining this with Equation \ref{eq:lem11}, we know 
\begin{align*}
    ||\nabla L(\theta_1)-\nabla L(\theta_2)||_2\leq \sigma_2^2~||\theta_1-\theta_2||_2.
\end{align*}
This completes the proof.

\end{proof}

We give the full proof of Theorem 3 as follows:

\begin{proof}(Theorem 3)
Since we use $M_1$ samples from the training set $\{\tau_i\}_{i=1}^{M_1}$ and $M_2$ samples $\tau'_1, \dots, \tau'_{M_2}$ from $P(\tau|T, \theta)$ using XOR-Sampling at each iteration, we have 
\[
g_k=\frac{1}{M_1}\sum_{\tau\in \mathcal{D}_{M_1}} f(\tau)-\frac{1}{M_2}\sum_{j=1}^{M_2}f(\tau'_j).
\]
Denoting $g_k^i=\frac{1}{M_1}\sum_{j=1}^{M_1}f(\tau_j)-f(\tau'_i)$, we have the expectation of $g_k$ as
\begin{align*}
\mathbb{E}_{\mathcal{D},P}[g_k]=\mathbb{E}_{\mathcal{D},P}[g_k^i].
\end{align*}

In each iteration $k$, we can adjust the parameters in XOR-Sampling to give a constant-factor approximation of both the denominator and the numerator; then, for each $g_k^i$, we can obtain from Theorem~2 that
\begin{align}
    \frac{1}{\delta} [\nabla L(\theta_k)]^+ &\leq \mathbb{E}_{\mathcal{D},P}[g_k^{i+}]\leq \delta [\nabla L(\theta_k)]^+,\label{eq:pos2}\\
    \delta [\nabla L(\theta_k)]^- &\leq \mathbb{E}_{\mathcal{D},P}[g_k^{i-}]\leq  \frac{1}{\delta}[\nabla L(\theta_k)]^-.\label{eq:neg2}
\end{align}
% \org{where $\nabla L(\theta_k)$ is the true gradient at $k$-th iteration. Denote $g_k^+=\max\{g_k,\textbf{0}\}$ and $g_k^-=\min\{g_k,\textbf{0}\}$. 
% %
% Clearly, $g_k^{i+} \geq 0$ and $g_k^{i-} \leq 0$. Moreover,
% for a given dimension, either $g_k^{i+}=0$ for that dimension or $g_k^{i-}=0$. 
% %
% Evaluating $g_k$ dimension by dimension, we can see that
% $g_k^+=\frac{1}{M_2}\sum_{i=1}^{M_2} g_k^{i+}$ and $g_k^-=\frac{1}{M_2}\sum_{i=1}^{M_2} g_k^{i-}$.
% Combined with Equation~\ref{eq:pos2} and \ref{eq:neg2}, we know 
% \begin{align*}
%     \frac{1}{\delta^2} [\nabla L(\theta_k)]^+ \leq \mathbb{E}[g_k^+]
%     \leq \delta^2 [\nabla L(\theta_k)]^+,\\
%     \delta^2 [\nabla L(\theta_k)]^- \leq \mathbb{E}[g_k^-]
%     \leq \frac{1}{\delta^2} [\nabla L(\theta_k)]^-.
% \end{align*}}
%
where we denote
\begin{align*}
    g_k^{i+} &= \max \{g_k^i,\textbf{0}\} , \quad g_k^{i-} =\min\{g_k^i,\textbf{0}\},\\
    [\nabla L(\theta_k)]^+ &= \mathbb{E}_{P}[[\mathbb{E}_{\mathcal{D}}[f(\tau)]-f(\tau')]^+],\\
    [\nabla L(\theta_k)]^- &= \mathbb{E}_{P}[[\mathbb{E}_{\mathcal{D}}[f(\tau)]-f(\tau')]^-].
\end{align*}
%
Notice that
$g_k^+=\frac{1}{M_2}\sum_{i=1}^{M_2} g_k^{i+}$ and $g_k^-=\frac{1}{M_2}\sum_{i=1}^{M_2} g_k^{i-}$.
Combined with Equations~\ref{eq:pos2} and~\ref{eq:neg2}, we know
\begin{align*}
    \frac{1}{\delta} [\nabla L(\theta_k)]^+ \leq \mathbb{E}[g_k^+]
    \leq \delta [\nabla L(\theta_k)]^+,\\
    \delta [\nabla L(\theta_k)]^- \leq \mathbb{E}[g_k^-]
    \leq \frac{1}{\delta} [\nabla L(\theta_k)]^-.
\end{align*}
As required in Theorem~3, $||\mathbb{E}[g_k^+]||_2$ and $||\mathbb{E}[g_k^-]||_2$ can be bounded by
\begin{align*}
    \mathbb{E}[g_k^+] &= \mathbb{E}_{P}[[\mathbb{E}_{\mathcal{D}}[f(\tau)]-f(\tau')]^+]\\
    & \leq \delta \mathbb{E}_{P}[[\mathbb{E}_{\mathcal{D}}[f(\tau)]-f(\tau')]^+]\\
    & \leq \delta \mathbb{E}_{P}[[\mathbb{E}_{\mathcal{D}}[f(\tau)]]^+ + [-f(\tau')]^+] \\
    & = \delta \{[\mathbb{E}_{\mathcal{D}}[f(\tau)]]^+ - \mathbb{E}_{P}[[f(\tau')]^-]\}.
\end{align*}
\begin{align*}
    \mathbb{E}[g_k^-] &= \mathbb{E}_{P}[[\mathbb{E}_{\mathcal{D}}[f(\tau)]-f(\tau')]^-]\\
    & \geq \delta \mathbb{E}_{P}[[\mathbb{E}_{\mathcal{D}}[f(\tau)]-f(\tau')]^-]\\
    & \geq \delta \mathbb{E}_{P}[[\mathbb{E}_{\mathcal{D}}[f(\tau)]]^- + [-f(\tau')]^-] \\
    & = \delta \{[\mathbb{E}_{\mathcal{D}}[f(\tau)]]^- - \mathbb{E}_{P}[[f(\tau')]^+]\}.
\end{align*}
Therefore, we have $||\mathbb{E}[g_k^+]||_2^2 \leq \delta^2(G+E)^2$ and $||\mathbb{E}[g_k^-]||_2^2 \leq \delta^2(G+E)^2$.
%

In terms of variance, because $\mathbb{E}_{\mathcal{D},P}[g_k]=\mathbb{E}_{\mathcal{D},P}[g_k^i]$, the variance of $g_k$, denoted as $Var_{\mathcal{D},P}(g_k)$, can then be bounded as
\begin{align*}
    &Var_{\mathcal{D},P}(g_k)\\
    =& Var_{\mathcal{D}}\left(\frac{1}{M_1}\sum_{j=1}^{M_1}f(\tau_j)\right) +
    Var_{P}\left(\frac{1}{M_2}\sum_{i=1}^{M_2}f(\tau'_i)\right)\\
    =&\frac{1}{M_1}Var_{\mathcal{D}}(f(\tau_j)) + \frac{1}{M_2}Var_{P}(f(\tau'_i))\\
 \leq& \frac{\sigma_1^2}{M_1} + \frac{\sigma_2^2}{M_2}.
\end{align*}
The last inequality is because $Var_{\mathcal{D}}(f(\tau))\leq\sigma_1^2$ and $\max_\theta Var_{P}(f(\tau'_j))\leq \sigma_2^2$. 

Since $L(\theta)$ is convex and, by Lemma~\ref{Lm:L-smooth}, $\sigma_2^2$-smooth, according to Theorem~4, when the learning rate $\eta$ is bounded by:
\begin{align}
    \eta \leq \frac{2-\delta^2}{\sigma_2^2 \delta},
\end{align}
we can then apply Theorem 4 to get the result in Theorem 3:
% \org{\begin{align*}
%      \mathbb{E}[L(\overline{\theta_K})]-OPT &\leq \frac{\delta^2||\theta_0-\theta^*||_2^2}{2\eta K}+\frac{\eta\max_{\theta_k}\{Var_{\mathcal{D},P}(g_k)\}}{\delta^2}\\
%      &\leq\frac{\delta^2||\theta_0-\theta^*||_2^2}{2\eta K}+\frac{\eta\sigma_1^2}{\delta^2 M_1}+\frac{\eta\sigma_2^2}{\delta^2M_2}.
% \end{align*}}
\begin{align*}
     &~ \mathbb{E}[L(\overline{\theta_K})]-OPT \\
     &\leq\frac{\delta||\theta_0-\theta^*||_2^2}{2\eta K}+\frac{\eta\sigma_1^2}{\delta M_1}+\frac{\eta\sigma_2^2}{\delta M_2}+\\
      & \quad \quad  2(\delta^2-1)(G+E)R + 2\eta (\delta^3-\delta)(G+E)^2.
\end{align*}

This completes the proof.
\end{proof}


\section{Proof of Theorem 5}
\begin{proof} (Theorem 5)
Since we use flow constraints to ensure valid trajectories, the number of binary variables in XOR-Sampling is $O(|\mathcal{S}||\mathcal{A}|)$.
From Theorem 2 we know that in each iteration of X-MEN, we need to access
% $O(|\mathcal{S}||\mathcal{A}|\ln\frac{|\mathcal{S}||\mathcal{A}|}{\gamma})$
$O(-|\mathcal{S}||\mathcal{A}|\log(1- 1/\sqrt{\delta}) \log({-|\mathcal{S}||\mathcal{A}|/\gamma\log(1-1/\sqrt{\delta}))})$ queries of NP oracles in order to generate one sample. However, as specified also in %\cite{ermon2013embed},
Ermon et al.\ [2013b],
only the first sample needs that many queries. Once we have the first sample, the number of XOR constraints to add is known when generating future samples for this SGD iteration.
%
Therefore, we fix the number of XOR constraints added, starting from the generation of the second sample.
%
As a result, we only need one  NP oracle query in generating each of the following $(M_2-1)$ samples. 
%
Therefore, total queries in each iteration will be
%$O(|\mathcal{S}||\mathcal{A}|\ln\frac{|\mathcal{S}||\mathcal{A}|}{\gamma}+M_2)$
$O (-|\mathcal{S}||\mathcal{A}|\log(1- 1/\sqrt{\delta}) \log({-|\mathcal{S}||\mathcal{A}|/\gamma\log(1-1/\sqrt{\delta}))} + M_2 )$. To complete all $K$ SGD iterations,  X-MEN needs
%$O(K|\mathcal{S}||\mathcal{A}|\ln\frac{|\mathcal{S}||\mathcal{A}|}{\gamma}+KM_2)$
$O (-K|\mathcal{S}||\mathcal{A}|\log(1- 1/\sqrt{\delta}) \log({-|\mathcal{S}||\mathcal{A}|/\gamma\log(1-1/\sqrt{\delta}))} + KM_2 )$ NP oracle queries in total.
\end{proof}



\end{document}