\documentclass[accepted]{uai2022}  % Comment this

\usepackage{graphicx}
% \usepackage{tabto}
\usepackage{amsmath}
\usepackage{booktabs}
\usepackage{amssymb}
\usepackage{hyperref}
\usepackage{xcolor}
\usepackage{enumitem}
\usepackage{amsmath}   % <-- for \eqref
\usepackage{commath}
\usepackage[linesnumbered,ruled,vlined, noend]{algorithm2e}
\usepackage{subcaption}
\usepackage{bm}
\usepackage{wrapfig}

%%% Coloring the comment as blue
\newcommand\mycommfont[1]{\footnotesize\ttfamily\textcolor{blue}{#1}}
\SetCommentSty{mycommfont}

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\newcommand\tab[1][1cm]{\hspace*{#1}}
\let\oldemptyset\emptyset
\let\emptyset\varnothing
\newtheorem{mydef}{Definition}
\newcommand{\Real}{\mathbb R}
\newcommand{\eps}{\varepsilon}
\newcommand{\To}{\rightarrow}
\newcommand{\BX}{\bm{B}(X)}
\newcommand{\X}{\mathcal{X}}
\newcommand{\Y}{\mathcal{Y}}
\newcommand{\Z}{\mathcal{Z}}
\newcommand{\T}{\mathcal{T}}
\newcommand{\R}{\mathcal{R}}
\newcommand{\Tau}{\mathrm{T}}

\title{Marginal MAP Estimation for Inverse RL under Occlusion with Observer Noise (Supplementary material)}

\author[1]{Prasanth Sengadu Suresh}
\author[1]{\href{mailto:<pdoshi@uga.edu>?Subject=Your UAI 2022 paper}{Prashant Doshi}}
% Add affiliations after the authors
\affil[1]{%
    THINC Lab\\
    Department of Computer Science\\
    University of Georgia\\
    Athens, GA 30606, USA.
}

% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
\maketitle
\SetAlgoNoLine
\section{Extended Derivation of MMAP-BIRL Reward Gradients}

Following the notation provided in the main paper, the likelihood of the visible portions of the trajectories is written as the marginal of the complete trajectory $X$ by summing out the corresponding hidden portion $Z$:
\begin{small}
\begin{align*}
    &Pr(\Y| R_{\bm{\theta}}) = \prod\limits_{Y \in \Y} Pr(Y|R_{\bm{\theta}})\nonumber\\  
    &= \prod\limits_{Y \in \Y} \sum\limits_{Z \in \Z} Pr(Y,Z| R_{\bm{\theta}})
    = \prod\limits_{Y \in \Y} \sum\limits_{Z \in \Z} Pr(X | R_{\bm{\theta}}).
\end{align*}
\end{small}
Here, the parameters $\bm{\theta}$ are the maximization variables and the occluded portion $Z$ of a trajectory comprises the summation variables of the marginal MAP inference. Using the above likelihood function, the MMAP-BIRL problem is more specifically formulated as:
\begin{small}
\begin{align}
    R_{\bm{\theta}}^* = \argmax_{\bm{\theta} \in \bm{\Theta}} ~\prod\limits_{Y \in \Y} \sum\limits_{Z \in \Z} Pr(Y,Z | R_{\bm{\theta}})~Pr(R_{\bm{\theta}}).\nonumber
\end{align}
\end{small}
Let $Z$ be the collection of the observations in the occluded time steps of $X$, and $Y = X \setminus Z$. Then,
\begin{small}
\begin{align*}
&R_{\bm{\theta}}^* = \argmax_{\bm{\theta} \in \bm{\Theta}} ~\prod\limits_{Y \in \Y} \sum\limits_{Z \in \Z} Pr(o^1_l, o^2_l, o^3_l, \ldots, o^\T_l| R_{\bm{\theta}})\nonumber\\
&\tab[6.5cm] \times Pr(R_{\bm{\theta}}).
\end{align*}
\end{small}
The learner's observation $o^t_l$ is a noisy perception of the expert's state and action at time step $t$, and the observations are conditionally independent of each other given the expert's state and action. Therefore, we introduce the state-action pairs in the likelihood 
function above.  
\begin{small}
\begin{align*}
 &Pr(o^1_l, o^2_l, o^3_l, \ldots, o^{\T}_l| R_{\bm{\theta}}) 
 =\sum\limits_{s^1, a^1, s^2, a^2, \ldots, s^{\T}, a^{\T}} Pr(o^1_l,o^2_l,o^3_l, \nonumber\\ 
 &\tab[3cm]\ldots, o^{\T}_l, s^1, a^1, s^2, a^2, \ldots, s^{\T}, a^{\T}| R_{\bm{\theta}}).
\end{align*}
\end{small}
For convenience, let $\tau$ denote the underlying trajectory of state-action pairs, $\tau = (s^1, a^1, s^2, a^2, \ldots, s^{\T}, a^{\T})$. Then, we may reformulate the MMAP-BIRL problem as:
\begin{small}
\begin{align*}
&R_{\bm{\theta}}^* = \argmax_{\bm{\theta} \in \bm{\Theta}} ~\prod\limits_{Y \in \Y} \sum\limits_{Z \in \Z} \sum\limits_{\tau \in (|S||A|)^{\T}} \nonumber\\
&\tab[3.5cm]Pr(o^1_l, o^2_l, o^3_l, \ldots, o^\T_l, \tau| R_{\bm{\theta}})~Pr(R_{\bm{\theta}}).\nonumber
\end{align*}
\end{small}
Now the log-posterior can be represented as: 
\begin{small}
\begin{align}\label{log-post}
    L_{\bm{\theta}} =  L_{\bm{\theta}}^{lh} + L_{\bm{\theta}}^{pr}.
\end{align}
\end{small}
The log forms of the prior and the likelihood function are represented as
\begin{small}
\begin{align*}
    &L_{\bm{\theta}}^{pr} = \log Pr(R_{\bm{\theta}}) \mbox{ and } L_{\bm{\theta}}^{lh} = \sum\limits_{Y \in \Y} \log~\sum\limits_{Z \in \Z} \sum\limits_{\tau \in (|S||A|)^{\T}} \nonumber\\
    &Pr(o^1_l, o^2_l, o^3_l, \ldots, o^\T_l, \tau| R_{\bm{\theta}}).
\end{align*}
\end{small}
Consequently, the partial derivative of~\eqref{log-post} with respect to $\bm{\theta}$ becomes:
\begin{small}
\begin{align}
  \frac{\partial L_{\bm{\theta}}}{\partial \bm{\theta}} = \frac{\partial L_{\bm{\theta}}^{lh}}{\partial \bm{\theta}} + \frac{\partial L_{\bm{\theta}}^{pr}}{\partial \bm{\theta}}.\nonumber
\end{align}
\end{small}
\subsection{Derivative of Log-Prior}
If we choose the prior $Pr(\bm{\theta}; \mu_{\bm{\theta}}, \sigma_{\bm{\theta}})$ to be Gaussian, then the distribution is given as:
\begin{small}
\begin{align}
Pr(\bm{\theta}; \mu_{\bm{\theta}}, \sigma_{\bm{\theta}}) = \frac{1}{\sqrt{2\pi}\sigma_{\bm{\theta}}}e^{-\frac{(\bm{\theta} - \mu_{\bm{\theta}})^2}{2\sigma_{\bm{\theta}}^2}},\nonumber
\end{align}
\end{small}
where the mean $\mu_{\bm{\theta}}$ and standard deviation $\sigma_{\bm{\theta}}$ may differ between the feature weights. Then, the log-prior becomes:
\begin{small}
\begin{align}
 &L_{\bm{\theta}}^{pr} = \log \left( \frac{1}{\sqrt{2\pi}\sigma_{\bm{\theta}}} e^{-\frac{(\bm{\theta} - \mu_{\bm{\theta}})^2}{2\sigma_{\bm{\theta}}^2}} \right) \nonumber\\
 &= \log \left(\frac{1}{\sqrt{2\pi}\sigma_{\bm{\theta}}} \right) + \log \left(e^{-\frac{(\bm{\theta} - \mu_{\bm{\theta}})^2}{2\sigma_{\bm{\theta}}^2}} \right) \nonumber\\
 &= - \log\left(\sqrt{2\pi}\sigma_{\bm{\theta}} \right) - \frac{(\bm{\theta} - \mu_{\bm{\theta}})^2}{2\sigma_{\bm{\theta}}^2}.\nonumber
\end{align}
\end{small}
Therefore, the partial derivative of $L_{\bm{\theta}}^{pr}$ with respect to $\bm{\theta}$ becomes:
\begin{small}
\begin{align}
   &\frac{\partial L_{\bm{\theta}}^{pr}}{\partial \bm{\theta}} = \left(\frac{- (\bm{\theta} - \mu_{\bm{\theta}})}{ \sigma_{\bm{\theta}}^2}\right).
\label{eqn:gradient_prior}
\end{align}
\end{small}
\subsection{Derivative of Log-Likelihood}
As explained in the paper, the log-likelihood can be fully written as:
\begin{small}
\begin{align}\label{log-likelihood-full}
&L_{\bm{\theta}}^{lh} = \sum\limits_{Y \in \Y} \log~\sum\limits_{Z \in \Z} \sum\limits_{\tau \in (|S||A|)^{\T}} Pr(s^1)~\pi(a^1|s^1;\bm{\theta}) \nonumber\\
&\tab[1.5cm]\left (\prod_{t=1}^{\T-1} O_l(s^t, a^t, o^t_l)~T(s^t,a^t,s^{t+1})~\pi(a^{t+1}|s^{t+1};\bm{\theta}) \right ) \nonumber\\
&\tab[6cm]\times ~O_l(s^\T, a^\T, o^\T_l).
\end{align}
\end{small}
Now, for convenience, let us denote everything inside the logarithm in~\eqref{log-likelihood-full} by $h_\theta$:
\begin{small}
\begin{align}
&h_\theta = \sum\limits_{Z \in \Z} \sum\limits_{\tau \in (|S||A|)^{\T}} Pr(s^1)~\pi(a^1|s^1;\bm{\theta}) \times\nonumber\\
&\tab[1.5cm]\left (\prod_{t=1}^{\T-1} O_l(s^t, a^t, o^t_l)~T(s^t,a^t,s^{t+1})~\pi(a^{t+1}|s^{t+1};\bm{\theta}) \right )\nonumber\\
&\tab[6cm]\times O_l(s^\T, a^\T, o^\T_l).\nonumber
\end{align}
\end{small}
The log-likelihood and its gradient now become:
\begin{small}
\begin{align}
&L_{\bm{\theta}}^{lh} = \sum\limits_{Y \in \Y} \log h_\theta \implies \frac{\partial L_{\bm{\theta}}^{lh}}{\partial \bm{\theta}} = \sum\limits_{Y \in \Y} \frac{1}{h_\theta} \frac{\partial h_\theta}{\partial \theta}.\nonumber\\
&\frac{\partial h_\theta}{\partial \theta} = \sum\limits_{Z \in \Z} \sum\limits_{\tau \in (|S||A|)^{\T}} Pr(s^1)\pi(a^1|s^1;\bm{\theta})\nonumber\\ 
&\left( \prod_{t=1}^{\mathcal{T}-1}  O_l(s^t,a^t,o^t_l) ~T(s^{t},a^{t},s^{t+1})\frac{\partial}{\partial\theta} \left (\prod_{t=1}^{\mathcal{T}-1} \pi(a^{t+1}|s^{t+1};\bm{\theta}) \right )\right)\nonumber\\
&\tab[5cm]\times O_l(s^\T, a^\T, o^\T_l).\nonumber
\end{align}
\end{small}
For convenience, let $P^\pi _\theta$ denote the product $\prod_{t=1}^{\mathcal{T}-1}\pi(a^{t+1}|s^{t+1};\bm{\theta})$ appearing in the above equation:
\begin{small}
\begin{align}
&P^\pi _\theta = \prod_{t=1}^{\mathcal{T}-1} \pi(a^{t+1}|s^{t+1};\bm{\theta}) \nonumber\\
&\tab[0.45cm]= \pi(a^{2}|s^{2};\bm{\theta})\times\pi(a^{3}|s^{3};\bm{\theta})\times\pi(a^{4}|s^{4};\bm{\theta})\cdots\pi(a^{\mathcal{T}}|s^{\mathcal{T}};\bm{\theta})\nonumber\\
&\frac{\partial P^\pi _\theta}{\partial \theta} = \left(\pi(a^{3}|s^{3};\bm{\theta})\times\pi(a^{4}|s^{4};\bm{\theta})\cdots\pi(a^{\mathcal{T}}|s^{\mathcal{T}};\bm{\theta}) \right) \frac{\partial \pi(a^{2}|s^{2};\bm{\theta})}{\partial \theta} +\nonumber\\
&\tab[1.1cm]\left(\pi(a^{2}|s^{2};\bm{\theta})\times\pi(a^{4}|s^{4};\bm{\theta})\cdots\pi(a^{\mathcal{T}}|s^{\mathcal{T}};\bm{\theta}) \right) \frac{\partial \pi(a^{3}|s^{3};\bm{\theta})}{\partial \theta} +\nonumber\\
&\tab[1.1cm]\left(\pi(a^{2}|s^{2};\bm{\theta})\times\pi(a^{3}|s^{3};\bm{\theta})\cdots\pi(a^{\mathcal{T}}|s^{\mathcal{T}};\bm{\theta}) \right) \frac{\partial \pi(a^{4}|s^{4};\bm{\theta})}{\partial \theta}+ \cdots +\nonumber\\
&\tab[1.1cm]\left(\pi(a^{2}|s^{2};\bm{\theta})\times\pi(a^{3}|s^{3};\bm{\theta})\cdots\pi(a^{\mathcal{T}-1}|s^{\mathcal{T}-1};\bm{\theta}) \right) \frac{\partial \pi(a^{\mathcal{T}}|s^{\mathcal{T}};\bm{\theta})}{\partial \theta}\nonumber\\
&\tab[0.7cm]=\left (\sum_{t=1}^{\mathcal{T}-1} \frac{\partial \pi(a^{t+1}|s^{t+1};\bm{\theta})}{\partial\theta} \prod_{k=1,\, k \neq t}^{\mathcal{T}-1} \pi(a^{k+1}|s^{k+1};\bm{\theta})\right ).
\end{align}
\end{small}
The partial derivative of the policy $\pi(a^{t+1}|s^{t+1};\bm{\theta})$ is given by
\begin{small}
\begin{align*}
    &\frac{\partial \pi(a^{t+1}|s^{t+1};\bm{\theta})}{\partial\bm{\theta}} = \pi(a^{t+1}|s^{t+1};\bm{\theta})\Bigl(\dfrac{\beta~\partial Q^*(s^{t+1}, a^{t+1}; \bm{\theta})}{\partial \bm{\theta}}\nonumber\\
    &\tab[4cm]- \sum_{a'\in A} \pi(a'|s^{t+1};\bm{\theta})\dfrac{\beta~\partial Q^*(s^{t+1},a'; \bm{\theta})}{\partial \bm{\theta}}\Bigr),\nonumber
\end{align*}
\end{small}
where the partial derivative of the $Q$-function can be obtained as:
\begin{small}
\begin{align*}
    &\dfrac{\partial Q^*(s^{t+1},a^{t+1}; \bm{\theta})}{\partial \bm{\theta}} = \dfrac{\partial R_\theta(s^{t+1}, a^{t+1})}{\partial \bm{\theta}} +\nonumber\\
    &\tab[1cm]\gamma \sum_{s' \in S}T(s^{t+1}, a^{t+1},s')\sum_{a'\in A} \pi(a'|s';\bm{\theta})\dfrac{\partial Q^*(s', a'; \bm{\theta})}{\partial \bm{\theta}}.
\end{align*}
\end{small}


\end{document}