\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr}
\makeatletter

% \addFileDependency{<file name with extension>}
% Announces <file> in the log so that latexmk can pick it up as a dependency,
% and appends it to the \listfiles list.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
	% latexmk will find this if $recorder=0; however, in that case it will
	% ignore #1 if it is a .aux or .pdf file etc. and it exists (if it
	% doesn't exist, it will appear in the list of dependents regardless).
	\typeout{(#1)}%
	% Write the following if you want it to appear in \listfiles
	% --- although not really necessary and latexmk doesn't use this.
	\@addtofilelist{#1}%
	% latexmk will find this message if #1 doesn't exist (yet).
	\IfFileExists{#1}{}{\typeout{No file #1.}}%
	% (every line above ends in % so that the macro body contains no
	% space tokens)
}
\makeatother

% \myexternaldocument{<basename>}
% Makes the labels of <basename> available through xr's \externaldocument and
% reports <basename>.tex and <basename>.aux via \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
	\externaldocument{#1}%
	\addFileDependency{#1.tex}\addFileDependency{#1.aux}%
}
%------------End of helper code--------------

% put all the external documents here!
\myexternaldocument{geng_527}

% put all the external documents here!

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} expands to <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Sinong configurations
\usepackage{bbm}
\usepackage{bm}
\usepackage{mathtools}
\usepackage{amsfonts}
% self-defined command
% Notation shorthands. Naming scheme used below:
%   \bX  bold symbol (\mathbf for Latin letters, \bm for Greek letters)
%   \tX  symbol with a tilde        \hX  symbol with a hat
%   \mX  calligraphic symbol
% Auto-sized delimiters: \curly{..} gives braces, \norm{..} double bars.
\newcommand{\curly}[1]{\left\{#1\right\}}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}


% NOTE(review): \Norm has the same definition as \norm (up to spaces in the body).
\newcommand{\Norm}[1]{\left\lVert #1 \right\rVert}
\usepackage{amsthm}
\usepackage{dsfont}

% \Value is the fixed expression (1/2) x^2; \paran{..} gives auto-sized parentheses.
\newcommand*{\Value}{\frac{1}{2}x^2}
\newcommand{\paran}[1]{\left(#1\right)}
\newcommand{\st}{\text{s.t.}}
\newcommand{\bsigma}{\bm{\sigma}} 
\newcommand{\bSigma}{\bm{\Sigma}} 
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bk}{\mathbf{k}} 
\newcommand{\hPi}{\hat{\Pi}} 

\newcommand{\be}{\mathbf{e}}
\newcommand{\bz}{\mathbf{z}}
\newcommand{\bd}{\mathbf{d}}
\newcommand{\bs}{\mathbf{s}}
\newcommand{\ts}{\tilde{s}}

\newcommand{\tE}{\tilde{E}}
\newcommand{\tEP}{\tilde{E}_\Pi}
\newcommand{\hE}{\hat{E}}
\newcommand{\hEP}{\hat{E}_{\Pi}}
\newcommand{\tQ}{\tilde{Q}}
\newcommand{\ttheta}{\tilde{\theta}}


\newcommand{\by}{\mathbf{y}}
\newcommand{\tV}{\tilde{V}}
\newcommand{\tP}{\tilde{P}}
\newcommand{\tM}{\tilde{M}}
\newcommand{\tL}{\tilde{L}}
\newcommand{\hL}{\hat{L}}
\newcommand{\htheta}{\hat{\theta}}

\newcommand{\ba}{\mathbf{a}}
\newcommand{\bp}{\mathbf{p}} 
\newcommand{\bq}{\mathbf{q}} 
\newcommand{\bA}{\mathbf{A}}
\newcommand{\bK}{\mathbf{K}}
\newcommand{\bu}{\mathbf{u}}
\newcommand{\bF}{\mathbf{F}}
\newcommand{\bX}{\mathbf{X}}
\newcommand{\bOmega}{\bm{\Omega}}
\newcommand{\bzero}{\mathbf{0}}
% \tS is a tilde over calligraphic S (unlike the plain-letter \tX macros above).
\newcommand{\tS}{\tilde{\mathcal{S}}}
\newcommand{\mS}{\mathcal{S}}
\newcommand{\mA}{\mathcal{A}}
\newcommand{\bZ}{\mathbf{Z}}
\newcommand{\bU}{\mathbf{U}}
\newcommand{\bM}{\mathbf{M}}
\newcommand{\bH}{\mathbf{H}}
\newcommand{\bG}{\mathbf{G}}
\newcommand{\bg}{\mathbf{g}}
\newcommand{\bD}{\mathbf{D}}
\newcommand{\bR}{\mathbf{R}}
\newcommand{\bB}{\mathbf{B}}
\newcommand{\br}{\mathbf{r}}
\newcommand{\bv}{\mathbf{v}}
\newcommand{\bC}{\mathbf{C}}
\newcommand{\bV}{\mathbf{V}}
\newcommand{\bb}{\mathbf{b}}
\newcommand{\bW}{\mathbf{W}}
\newcommand{\bE}{\mathbf{E}}

\newcommand{\bS}{\mathbf{S}}
\newcommand{\bO}{\mathbf{O}}
\newcommand{\bQ}{\mathbf{Q}}
% NOTE(review): despite the "b" in its name, \tbtheta is not bold; it expands
% exactly like \ttheta.
\newcommand{\tbtheta}{\tilde{\theta}} 
\newcommand{\bI}{\mathbf{I}}
\newcommand{\bbeta}{\bm{\beta}}
\newcommand{\balpha}{\bm{\alpha}}
\newcommand{\bY}{\mathbf{Y}}
\newcommand{\bP}{\mathbf{P}}
\newcommand{\bdelta}{\bm{\delta}}
\newcommand{\bDelta}{\bm{\Delta}}
\newcommand{\bmu}{\bm{\mu}} 
\newcommand{\bepsilon}{\bm{\epsilon}}
\newcommand{\mE}{\mathcal{E}}
\newcommand{\btau}{\bm{\tau}}

% Blackboard-bold R, the "defined as" relation, and upright Tr / Var operators.
\newcommand{\RR}{\mathbb{R}} 
\newcommand{\defas}{\triangleq}
\DeclareMathOperator{\tr}{Tr}
\DeclareMathOperator{\var}{Var}

\newcommand{\bphi}{\bm{\phi}}
\newcommand{\brho}{\bm{\rho}} 

\newcommand{\bbI}{\mathbb{I}}
% NOTE(review): \bfx has the same definition as \bx.
\newcommand{\bfx}{\mathbf{x}}
\newcommand{\bomega}{\bm{\omega}}
% \given[<prefix>]: a \vert with \: spacing on both sides; the optional
% argument is inserted directly before \vert (presumably a size command
% such as \big --- confirm against intended usage).
\newcommand\given[1][]{\:#1\vert\:}

% Blackboard-bold E, and a bar over \bZ.
\newcommand{\EE}{{\mathbb{E}}}
\newcommand{\brz}{\bar{\bZ}} 

% absolute value
\usepackage{mathtools}
% \DeclarePairedDelimiter (mathtools) creates \abs{..} with fixed-size bars
% and \abs*{..} with \left/\right auto-sized bars.
\DeclarePairedDelimiter\abs{\lvert}{\rvert}%
%\DeclarePairedDelimiter\norm{\lVert}{\rVert}%
% Swap the two forms so that plain \abs{..} is auto-sized and \abs*{..} is
% the fixed-size variant. \@ifstar needs @ to be a letter, so the
% redefinition is wrapped in \makeatletter ... \makeatother; without the
% closing \makeatother the changed catcode of @ would leak into the rest of
% the preamble and the document body.
\makeatletter
\let\oldabs\abs
\def\abs{\@ifstar{\oldabs}{\oldabs*}}
\makeatother
% Alias for \norm as it is defined at this point in the preamble.
\let\oldnorm\norm


\usepackage{makecell}
% comment
\usepackage{comment}
%\usepackage[usenames]{color}
% Citations
\usepackage{graphicx}

%new table
\usepackage{multirow}
\usepackage{float}
\renewcommand\arraystretch{1.2}
% break line in tables
\usepackage{pbox}
% \specialcell[<pos>]{<content>}: wraps <content> in a one-column, centered
% tabular without outer padding, so that \\ can be used inside a table cell;
% <pos> is passed as the tabular's optional argument and defaults to c.
\newcommand{\specialcell}[2][c]{\begin{tabular}[#1]{@{}c@{}}#2\end{tabular}}

% Figures
\usepackage{graphicx}

\usepackage{capt-of}
% use subfigure
\usepackage{caption}
\usepackage{subcaption}
\usepackage{microtype}
\usepackage{wrapfig}


% algorithm
\usepackage{algorithm}
\usepackage{algorithmic}



% change algorithm indent size
%\algrenewcommand\algorithmicindent{0.8em}

% comment
\usepackage{comment}
%\usepackage[usenames]{color}
% Coloured inline reviewer notes: each macro prints "[<Name>: <text>]" in the
% given colour.
% NOTE(review): \textcolor needs color/xcolor, but the explicit \usepackage of
% color above is commented out; presumably it is provided indirectly by
% another loaded package (e.g. tikz) --- confirm.
\newcommand{\sinong}[1]{\textcolor{blue}{[Sinong: #1]}}
\newcommand{\h}[1]{\textcolor{red}{[Charlie: #1]}}
\newcommand{\houssam}[1]{\textcolor{green}{[Houssam: #1]}}
% Shortcuts for displayed formulas:
%   \eq{<name>}{<math>}   numbered equation, labelled eq:<name>
%   \eqs{<math>}          unnumbered equation
%   \ali{<name>}{<math>}  align with an inner split (a single equation
%                         number), labelled eq:<name>
%   \alis{<math>}         unnumbered align with an inner split
% Reference the labelled forms as \eqref{eq:<name>}.
\newcommand{\eq}[2]{\begin{equation} \label{eq:#1} #2 \end{equation}}
\newcommand{\eqs}[1]{\begin{equation*} #1 \end{equation*}}
\newcommand{\ali}[2]{\begin{align} \label{eq:#1} \begin{split}#2\end{split}   \end{align}}
\newcommand{\alis}[1]{\begin{align*}\begin{split} #1 \end{split}\end{align*}  }

% Math operators; the starred declaration places sub/superscripts as limits
% (below/above the operator) in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
%\DeclareMathOperator*{\arccos}{arccos}
\DeclareMathOperator*{\argmin}{arg\,min}


% Theorem-like environments. None of them shares a counter with another, so
% lemmas, theorems, remarks, definitions, corollaries and assumptions are
% each numbered independently.
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
\newtheorem{remark}{Remark}
\newtheorem{definition}{Definition}
\newtheorem{corollary}{Corollary}
\newtheorem{assumption}{Assumption}

\usepackage[inline]{enumitem}
% inlineenum: a one-level inline (run-in) enumeration based on enumitem's
% enumerate*, labelled with parenthesised lowercase roman numerals.
\newlist{inlineenum}{enumerate*}{1}
\setlist*[inlineenum,1]{label=(\roman*)}


% Three scratch save boxes.
\newsavebox{\tempboxa}
\newsavebox{\tempboxb}
\newsavebox{\tempboxc}

% todolist: an itemize-based list (up to two levels) whose items are marked
% with an empty square.
\newlist{todolist}{itemize}{2}
\setlist[todolist]{label=$\square$}
\usepackage{pifont}
% Check mark and cross taken from the pifont dingbats.
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
% \done / \wontfix: a square with a check mark, respectively a cross,
% overprinted on it via \rlap.
\newcommand{\done}{\rlap{$\square$}{\raisebox{2pt}{\large\hspace{1pt}\cmark}}% 
	\hspace{-2.5pt}}
\newcommand{\wontfix}{\rlap{$\square$}{\large\hspace{1pt}\xmark}}

% Red inline reviewer note, printed as "[charles: <text>]".
\newcommand{\charles}[1]{\textcolor{red}{[charles: #1]}}
\usepackage{scalerel}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%e%%%%%%%%%%%%%%%


\title{A Data-Driven State Aggregation Approach for Dynamic Discrete-Choice Models\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Sinong Geng}
\author[2]{Houssam Nassif}
\author[3]{Carlos A. Manzanares}
% Add affiliations after the authors
\affil[1]{%
	Computer Science Department\\
	Princeton University\\
	Princeton, NJ, USA
}
\affil[2]{%
	Meta, Seattle, WA, USA
}
\affil[3]{%
	Amazon, Seattle, WA, USA
}

\begin{document}
	
	\onecolumn %% Turn this off if single column is desired for the supplement
	\maketitle
	
	\section{Dynamic Discrete Choice Models in their Original Formulation}
	\label{sec:ap-ddm}
	In this section, we formulate dynamic discrete choice models (DDMs) using the original formulation~\citep{rust1987optimal}, and discuss its connection with the IRL formulation in Section~\ref{sec:ddm}. 
	Note that the setup in this section is an alternative to the IRL formulation on which our main results are based, and is provided only for completeness and comparison. 
	SamQ does not require the assumptions listed in this section.
	
	
	\subsection{Model} 
	Agents choose actions according to a Markov decision process described by the tuple $\curly{ \curly{\mathcal{S}, \mathcal{E}}, \mathcal{A}, r, \gamma,P}$, where
	\begin{itemize}
		\item $\curly{\mathcal{S}, \mathcal{E}}$ denotes the space of state variables; 
		\item $\mathcal{A}$ represents a set of $n_a$ actions;
		\item $r$ represents an agent utility function;
		\item $\gamma \in [0,1)$ is a discount factor;
		\item $P$ represents the transition distribution.
	\end{itemize} 
	
	At time $t$, agents observe state  $S_t$ taking values in $\mathcal{S}$, and $\epsilon_t$ taking values in $\mathcal{E}$ to make decisions. 
	While $S_t$ is observable to researchers, $\epsilon_t$ is observable to agents but not to researchers. 
	The action is defined as an $n_a \times 1$ indicator vector, $A_t$, satisfying
	\begin{itemize}
		\item $\sum_{j=1}^{n_a}A_{tj} = 1 $,
		\item $A_{tj}$ takes value  in $ \curly{0,1}$.
	\end{itemize}
	In other words, at each time point, agents make a distinct choice over $n_a$ possible actions. 
	Meanwhile, $\epsilon_t$ is also an $n_a \times 1$ vector representing the potential shock of taking each choice. 
	
	
	The agent's control problem has the following value function: 
	\eq{value}{
		V(s,\epsilon) = \max_{\curly{a_t}_{t=0}^{\infty}} \EE \left[\sum_{t=0}^\infty \gamma^{t} r(S_t, \epsilon_t, A_t) \given s, \epsilon \right],  
	}
	where the expectation is taken over realizations of $\epsilon_t$, as well as transitions of $S_{t}$ and $\epsilon_{t}$ as dictated by $P$. 
	The utility function $r(s_t, \epsilon_t, a_t)$ can be further decomposed into
	\eqs{
		r(s_t, \epsilon_t, a_t) =u(s_t, a_t) + a_t^\top \epsilon_t,  
	}
	where $u$ represents the deterministic part of the utility function. Agents, but not researchers, observe $\epsilon_t$ before making a choice in each time period.
	
	\subsection{Assumptions and Definitions}
	We study DDMs under the following common assumptions. 
	
	\begin{assumption}
		\label{asm:trainsition}
		The transition from $S_t$ to $S_{t+1}$ is independent of $\epsilon_t$, i.e.,
		\eqs{
			\textnormal{P}(S_{t+1} \given S_t , \epsilon_t, A_t ) = \textnormal{P}(S_{t+1}  \given S_t, A_t).
		}
	\end{assumption}
	
	\begin{assumption}
		\label{asm:epsilon}
		The random shocks $\epsilon_t$ at each time point are independent and identically distributed (IID) according to a type-I extreme value distribution. 
	\end{assumption}
	
	Assumption~\ref{asm:trainsition} ensures that unobservable state variables do not influence state transitions. This assumption is common, since it drastically simplifies the task of identifying the impact of changes in observable versus unobservable state variables. In our setting, Assumption~\ref{asm:epsilon} is convenient but not necessary, and $\epsilon_t$ could follow other parametric distributions. As pointed out by \citet{arcidiacono2011practical}, Assumptions~\ref{asm:trainsition} and~\ref{asm:epsilon} are nearly standard for applications of dynamic discrete choice models.
	Such a formulation is proved to be equivalent to the IRL formulation in Section~\ref{sec:ddm} by \citet{geng2020deep,fu2017learning,ermon2015learning}.
	
	
	
	
	\section{Proof of Theorem~\ref{thm:asym-main}}
	\begin{proof}
		By definition of $L$ and $\tL$, we can derive
		\ali{likelihood-diff}{
			L(\mathbbm{D};\theta^*) - \tL(\mathbbm{D};\theta^*)  =&\frac{1}{T} \sum_{(s,a) \in \mathbbm{D}} \bigg[Q^{\theta^*}(s,a) -\tQ^{\theta^*}(\Pi(s),a) 
			\\&+ \log\bigg(\sum_{a'\in\mathcal{A}} \exp(\tQ^{\theta^*}(\Pi(s),a')) \bigg) - \log\bigg(\sum_{a'\in\mathcal{A}} \exp(Q^{\theta^*}(s,a')) \bigg) \bigg]
			\\ \leq& \frac{1}{T}\sum_{(s,a) \in \mathbbm{D}} \bigg[\abs{Q^{\theta^*}(s,a) -\tQ^{\theta^*}(\Pi(s),a)} + \max_{a'\in \mathcal{A}}\abs{ Q^{\theta^*}(s,a') -\tQ^{\theta^*}(\Pi(s),a') } \bigg]
			\\ \leq& 2\max_{(s,a')\in\mathcal{S}\times\mathcal{A}}\abs{ Q^{\theta^*}(s,a') -\tQ^{\theta^*}(\Pi(s),a') },
		}
		where the first inequality is due to the fact that the log sum exp function is Lipschitz continuous with constant $1$.
		Then, we take $f$ in Lemma~\ref{lem:projection-error} as $Q^{\theta^*}(s,a)$, and derive
		\begin{equation}
			\label{eq:from-projection-error}
			\max_{(s,a)\in\mathcal{S}\times\mathcal{A}}\abs{Q^{\theta^*}(s, a) - \tQ^{\theta^*}(\Pi(s), a)}
			\leq \frac{2}{1-\gamma}\max_{(s,a)\in\mathcal{S}\times\mathcal{A}}\abs{Q^{\theta^*}(s, a) - Q^{\theta^*}(\Pi(s),a)}.
		\end{equation}
		
		Plugging \eqref{eq:from-projection-error} into \eqref{eq:likelihood-diff}, we obtain
		\alis{
			L(\mathbbm{D};\theta^*) - \tL(\mathbbm{D};\theta^*) \leq \frac{4}{1-\gamma} \max_{(s,a)\in\mathcal{S}\times\mathcal{A}}\abs{Q^{\theta^*}(s, a) - Q^{\theta^*}(\Pi(s),a)}.
		}
		Finally, by Lemma~\ref{lem:likelihood-bound},
		\eqs{
			\epsilon_{asy} \leq \frac{4}{C_{H}(1-\gamma)} \max_{(s,a)\in\mathcal{S}\times\mathcal{A}}\abs{Q^{\theta^*}(s, a) - Q^{\theta^*}(\Pi(s),a)} = \epsilon_{Q},
		}
		which finishes the proof. 
	\end{proof}
	
	
	\begin{lemma}
		\label{lem:likelihood-bound}
		Under Assumption~\ref{asm:second-order} and Assumption~\ref{asm:regularity},
		\eqs{
			\norm{\tilde{\theta} - \theta^*}^2 \leq \frac{\EE[L(\mathbbm{D}; \theta^*) - \tL(\mathbbm{D}; \theta^*) ]}{C_{H}}.
		}
	\end{lemma}
	\begin{proof}
		
		By the definition of $\ttheta$, 
		\eq{key-1}{
			0\leq \EE[\tL(\mathbbm{D}; \ttheta) - \tL(\mathbbm{D}; \theta^*)]  \leq\EE[ L(\mathbbm{D}; \theta^*) - \tL(\mathbbm{D}; \theta^*)].
		}
		Further, by Taylor expansion, we have
		\eqs{
			\EE[\tL(\mathbbm{D}; \ttheta) - \tL(\mathbbm{D}; \theta^*)]  = (\ttheta - \theta^*)^\top  \EE\left[-\frac{\partial ^2 \tL(\mathbbm{D}; \bar{\theta})}{\partial \theta^2}\right](\ttheta - \theta^*),
		}
		where $\bar{\theta} = k\theta^* + (1-k)\ttheta$ with some $k \in [0,1]$. 
		Note that the first-order term is zero, since $\ttheta$ maximizes $\EE[\tL(\mathbbm{D}; \theta)]$. 
		By Assumption~\ref{asm:second-order}, 
		\eqs{
			\EE[\tL(\mathbbm{D}; \ttheta) - \tL(\mathbbm{D}; \theta^*)]  = (\ttheta - \theta^*)^\top  \EE\left[-\frac{\partial ^2 \tL(\mathbbm{D}; \bar{\theta})}{\partial \theta^2}\right](\ttheta - \theta^*) \geq C_H\norm{\ttheta - \theta^*}^2,
		}
		which, combined with \eqref{eq:key-1}, finishes the proof.
		
	\end{proof}
	
	\begin{lemma}
		\label{lem:projection-error}
		For any projection function $\Pi$ defined in Section~\ref{sec:bias} and its aggregated Q function $\tQ$, the following inequality is true:
		\eqs{
			\max_{(s,a)\in\mathcal{S}\times\mathcal{A}}\abs{Q^{\theta^*}(s, a) - \tQ^{\theta^*}(\Pi(s), a)} \leq \frac{2}{1-\gamma}
			\min_{f}\max_{(s,a)\in\mathcal{S}\times\mathcal{A}}\abs{Q^{\theta^*}(s, a) - f(\Pi(s),a)},
		}
		where $f(s,a):\mathcal{S}\times \mathcal{A} \to \mathbbm{R}$ is any function.  
	\end{lemma}
	\begin{proof}
		The proof follows Theorem 3 of \cite{tsitsiklis1996feature}.  
	\end{proof}
	
	
	
	
	
	
	
	\section{Proof of Theorem~\ref{thm:finite-sample}}
	\subsection{Technical Lemmas for Theorem~\ref{thm:finite-sample}}
	\begin{lemma}
		\label{lem:l-concentrate}
		Given $\theta \in \Theta$, for any $\delta \in (0,1)$, we provide the following probabilistic bound for the estimated aggregated likelihood $\hat{L}$   
		\alis{
			\textnormal{P}\bigg(\abs{ \hL(\mathbbm{D};\theta) - \EE[\tL(\mathbbm{D};\theta)]} \leq& \frac{2(R_{max}+1)}{1-\gamma} \sqrt{\frac{\log(\frac{4}{\delta})}{2N}} \\&+
			\frac{R_{max}+1}{(1-\gamma)^2} \sqrt{ \frac{\log(\frac{8|\tS||\mathcal{A}|}{\delta})}{2N} } \frac{2}{C_{uni} -\sqrt{\frac{\log(\frac{4|\tS||\mathcal{A}|}{\delta})}{2N}}  }   \bigg)  \geq 1-\delta,
		}
		where the expectation is over the sample $\mathbbm{D}$. 
	\end{lemma}
	\begin{proof}
		
		By inserting $\tL(\mathbbm{D};\theta)$, we have
		\begin{equation}
			\label{eq:main-general}
			\abs{ \hL(\mathbbm{D};\theta) - \EE[\tL(\mathbbm{D};\theta)]} \leq \abs{ \hL(\mathbbm{D};\theta) - \tL(\mathbbm{D};\theta)} + \abs{\tL(\mathbbm{D};\theta) - \EE[\tL(\mathbbm{D};\theta)]}.
		\end{equation}
		
		\paragraph{First term on the RHS of \eqref{eq:main-general}}
		To start with, we consider $\abs{ \hL(\mathbbm{D};\theta) - \tL(\mathbbm{D};\theta)}$.
		To this end, we aim to bound $\max_{(\ts,a) \in \tS \times \mathcal{A}}\abs{\tQ^{\theta}(\ts,a) - \hat{Q}^{\theta}(\ts,a)}$. 
		We insert $\hat{\mathcal{T}} (\tQ^{\theta}(\ts,a))$:
		\alis{
			\tQ^{\theta}(\ts,a) - \hat{Q}^{\theta}(\ts,a) =& 
			\tilde{\mathcal{T}}(\tQ^{\theta}(\ts,a)) - \hat{\mathcal{T}} (\tQ^{\theta}(\ts,a))+\hat{\mathcal{T}} (\tQ^{\theta}(\ts,a)) - \hat{\mathcal{T}}(\hat{Q}^{\theta}(\ts,a)). 
		}
		Since $\hat{\mathcal{T}}$ is a contraction with $\gamma$, we further derive
		\begin{equation}
			\label{eq:q-bound-inter}
			\abs{\tQ^{\theta}(\ts,a) - \hat{Q}^{\theta}(\ts,a)} \leq \frac{\abs{\tilde{\mathcal{T}}(\tQ^{\theta}(\ts,a)) - \hat{\mathcal{T}} (\tQ^{\theta}(\ts,a))}}{1-\gamma}.
		\end{equation}
		
		By the definition of $\tilde{\mathcal{T}}$ and $\hat{\mathcal{T}}$, it can be seen that $ \hat{\mathcal{T}} (\tQ^{\theta}(\ts,a))$ is a sample average estimation to $\tilde{\mathcal{T}}(\tQ^{\theta}(\ts,a))$.
		Therefore, we aim to bound the difference between the two by concentration inequalities.
		Specifically, by Assumption~\ref{asm:uni} and Hoeffding's inequality, we have
		\eq{p-na}{
			\textnormal{P}\bigg(\sum_{i=1,2,\cdots,N} \mathbbm{1}_{\curly{ \Pi(s_i) = \ts, a_i = a}} \geq NC_{uni} - \sqrt{-\frac{1}{2}N\log(\frac{\delta}{2})}\bigg) \geq 1-\frac{\delta}{2}.
		}
		Further, conditional on the event $\curly{\sum_{i=1,2,\cdots,N} \mathbbm{1}_{\curly{ \Pi(s_i) = \ts, a_i = a}} \geq NC_{uni} - \sqrt{-\frac{1}{2}N\log(\frac{\delta}{2})}}$, by Hoeffding's inequality and Assumption~\ref{asm:reward-bound}, for any $(\ts,a) \in \tS \times \mathcal{A}$
		\ali{p-bell}{
			\textnormal{P} \bigg(\bigg| \tilde{\mathcal{T}}(\tQ^{\theta}(\ts,a)) - \hat{\mathcal{T}} (\tQ^{\theta}(\ts,a))  \bigg| \leq \frac{R_{max}+1}{1-\gamma} \sqrt{ \frac{\log(\frac{4}{\delta})}{2N} } \frac{1}{C_{uni} -\sqrt{\frac{\log(\frac{2}{\delta})}{2N}}  } \bigg) \geq 1-\frac{\delta }{2}.
		}
		
		Combining \eqref{eq:p-na} and \eqref{eq:p-bell}, for a given $(\ts,a) \in \tS \times \mathcal{A}$, for any $\delta \in(0,1)$
		\eqs{
			\textnormal{P} \bigg(\bigg| \tilde{\mathcal{T}}(\tQ^{\theta}(\ts,a)) - \hat{\mathcal{T}} (\tQ^{\theta}(\ts,a))  \bigg| \leq \frac{R_{max}+1}{1-\gamma} \sqrt{ \frac{\log(\frac{4}{\delta})}{2N} } \frac{1}{C_{uni} -\sqrt{\frac{\log(\frac{2}{\delta})}{2N}}  } \bigg) \geq 1-\delta.
		}
		Next, by union bound again, we can extend the results to any $(\ts,a) \in \tS \times \mathcal{A}$
		\begin{equation}
			\label{eq:p-joint}
			\textnormal{P} \bigg(\max_{\ts \in \tS, a\in\mathcal{A}}\bigg| \tilde{\mathcal{T}}(\tQ^{\theta}(\ts,a)) - \hat{\mathcal{T}} (\tQ^{\theta}(\ts,a))  \bigg| \leq \frac{R_{max}+1}{1-\gamma} \sqrt{ \frac{\log(\frac{4|\tS||\mathcal{A}|}{\delta})}{2N} } \frac{1}{C_{uni} -\sqrt{\frac{\log(\frac{2|\tS||\mathcal{A}|}{\delta})}{2N}}  } \bigg) \geq 1-\delta.
		\end{equation}
		Combined with \eqref{eq:q-bound-inter}, we derive:
		\eqs{
			\textnormal{P} \bigg(\max_{(\ts,a) \in \tS \times \mathcal{A}}\abs{\tQ^{\theta}(\ts,a) - \hat{Q}^{\theta}(\ts,a)} \leq \frac{R_{max}+1}{(1-\gamma)^2} \sqrt{ \frac{\log(\frac{4|\tS||\mathcal{A}|}{\delta})}{2N} } \frac{1}{C_{uni} -\sqrt{\frac{\log(\frac{2|\tS||\mathcal{A}|}{\delta})}{2N}}  } \bigg) \geq 1-\delta.
		}
		By the definition of $\tilde{L}$ in \eqref{eq:aggregated-l} and \eqref{eq:likelihood-diff}, we have
		\eqs{
			\textnormal{P} \bigg(\abs{\tilde{L}(\mathbbm{D}; \theta) - \hat{L}(\mathbbm{D}; \theta)} \leq \frac{R_{max}+1}{(1-\gamma)^2} \sqrt{ \frac{\log(\frac{4|\tS||\mathcal{A}|}{\delta})}{2N} } \frac{2}{C_{uni} -\sqrt{\frac{\log(\frac{2|\tS||\mathcal{A}|}{\delta})}{2N}}  } \bigg) \geq 1-\delta.
		}
		\paragraph{Second term on the RHS of \eqref{eq:main-general}}
		Now, we consider $\abs{\tL(\mathbbm{D};\theta) - \EE[\tL(\mathbbm{D};\theta)]}$. 
		By \eqref{eq:likelihood-diff} and Assumption~\ref{asm:reward-bound}, $\tL(\mathbbm{D};\theta)$ is bounded by $\frac{2(R_{max}+1)}{1-\gamma}$.
		Thus, by Hoeffding's inequality, for any $\delta \in (0,1)$
		\alis{
			\textnormal{P}\bigg(\abs{\EE[\tL(\mathbbm{D};\theta)] - \tL(\mathbbm{D};\theta)} \leq& \frac{2(R_{max}+1)}{1-\gamma} \sqrt{\frac{\log(\frac{2}{\delta})}{2N}} 
			\bigg)\geq 1-\delta.
		}
		
		Therefore, by union bound, \eqref{eq:main-general} can be bounded by
		\alis{
			\textnormal{P}\bigg(\abs{ \hL(\mathbbm{D};\theta) - \EE[\tL(\mathbbm{D};\theta)]} \leq& \frac{2(R_{max}+1)}{1-\gamma} \sqrt{\frac{\log(\frac{4}{\delta})}{2N}} \\&+
			\frac{R_{max}+1}{(1-\gamma)^2} \sqrt{ \frac{\log(\frac{8|\tS||\mathcal{A}|}{\delta})}{2N} } \frac{2}{C_{uni} -\sqrt{\frac{\log(\frac{4|\tS||\mathcal{A}|}{\delta})}{2N}}  }   \bigg)  \geq 1-\delta.
		}
	\end{proof}
	
	\begin{lemma}
		\label{lem:expectation}
		Let $\ttheta^{\hat{\Pi}}:= \argmax_{\theta \in \Theta}\EE[\tL(\mathbbm{D}; \theta, \hat{\Pi})]$. 
		Then, 
		\eqs{
			\norm{\theta^* - \ttheta^{\hat{\Pi}}} \leq \frac{4}{C_H(1-\gamma)} \bigg(\frac{R_{\max}+1}{1-\gamma}\frac{4}{n_s^{\frac{1}{n_a}}-1}+2\epsilon_Q + \epsilon_{c}\bigg).}
	\end{lemma}
	
	\begin{proof}
		A Euclidean ball of radius $R$ in $\mathbbm{R}^{n_a}$ can be covered by $\bigg(\frac{4R+\delta}{\delta}\bigg)^{n_a}$ balls of radius $\delta$ (see Lemma 2.5 of \citet{van2000empirical}). Therefore, with $n_s$ states after aggregation, by Assumption~\ref{asm:clustering},
		\eqs{
			\hat{\epsilon}(\Pi^*) \leq \frac{R_{\max}+1}{1-\gamma}\frac{4}{n_s^{\frac{1}{n_a}}-1}.
		}
		Further by Assumption~\ref{asm:clustering} and Assumption~\ref{asm:irl},
		\eqs{
			\epsilon(\hat{\Pi}) \leq \hat{\epsilon}(\Pi^*) +2\epsilon_Q + \epsilon_{c} \leq \frac{R_{\max}+1}{1-\gamma}\frac{4}{n_s^{\frac{1}{n_a}}-1} +2\epsilon_Q + \epsilon_{c}.
		}
		Therefore, by Theorem~\ref{thm:asym-main}
		\eqs{
			\norm{\theta^* - \ttheta^{\hat{\Pi}}} \leq \frac{4}{C_H(1-\gamma)} \bigg(\frac{R_{\max}+1}{1-\gamma}\frac{4}{n_s^{\frac{1}{n_a}}-1}+2\epsilon_Q + \epsilon_{c}\bigg).
		}
		
		
		
	\end{proof}
	\subsection{Proof}
	We first aim to bound $\EE[\tL(\mathbbm{D}; \ttheta^{\hat{\Pi}}) - \tL(\mathbbm{D}; \hat{\theta})]$, where the expectation is over $\mathbbm{D}$ only instead of $\hat{\theta}$. 
	To this end, we insert $\hL(\mathbbm{D}; \ttheta^{\hat{\Pi}})$ and $\hL(\mathbbm{D}; \htheta)$:
	\alis{
		\EE[\tL(\mathbbm{D}; \ttheta^{\hat{\Pi}}) - \tL(\mathbbm{D}; \hat{\theta})] \leq& \EE[\tL(\mathbbm{D}; \ttheta^{\hat{\Pi}}) - \hL(\mathbbm{D};\ttheta^{\hat{\Pi}}) ] + \hL(\mathbbm{D};\ttheta^{\hat{\Pi}}) - \hL(\mathbbm{D};\htheta)+\hL(\mathbbm{D};\htheta) - \EE[\tL(\mathbbm{D};\htheta)]
		\\ \leq&\abs{\EE[\tL(\mathbbm{D}; \ttheta^{\hat{\Pi}}) - \hL(\mathbbm{D};\ttheta^{\hat{\Pi}}) ]}  +\abs{ \hL(\mathbbm{D};\htheta) - \EE[\tL(\mathbbm{D};\htheta)]}. 
	}
	By Lemma~\ref{lem:l-concentrate} and the union bound, 
	\alis{
		\textnormal{P}\bigg(\max_{\theta \in \Theta}\abs{ \hL(\mathbbm{D};\theta) -\EE[\tL(\mathbbm{D};\theta)]}& \leq \frac{2(R_{max}+1)}{1-\gamma} \sqrt{\frac{\log(\frac{4|\Theta|}{\delta})}{2N}} \\&+
		\frac{R_{max}+1}{(1-\gamma)^2} \sqrt{ \frac{\log(\frac{8|\tS||\mathcal{A}||\Theta|}{\delta})}{2N} } \frac{2}{C_{uni} -\sqrt{\frac{\log(\frac{4|\tS||\mathcal{A}||\Theta|}{\delta})}{2N}}  }   \bigg)  \geq 1-\delta.
	}
	Therefore, 
	\alis{
		\textnormal{P}\bigg(\EE[\tL(\mathbbm{D}; \ttheta^{\hat{\Pi}}) - \tL(\mathbbm{D}; \hat{\theta})]& \leq \frac{4(R_{max}+1)}{1-\gamma} \sqrt{\frac{\log(\frac{4|\Theta|}{\delta})}{2N}} \\&+
		\frac{R_{max}+1}{(1-\gamma)^2} \sqrt{ \frac{\log(\frac{8|\tS||\mathcal{A}||\Theta|}{\delta})}{2N} } \frac{4}{C_{uni} -\sqrt{\frac{\log(\frac{4|\tS||\mathcal{A}||\Theta|}{\delta})}{2N}}  }   \bigg)  \geq 1-\delta.
	}
	
	By Assumption~\ref{asm:second-order} and a similar analysis as Lemma~\ref{lem:likelihood-bound}, 
	\alis{
		\textnormal{P}\bigg(\norm{\htheta - \ttheta^{\hat{\Pi}}}& \leq \frac{4(R_{max}+1)}{(1-\gamma)C_H} \sqrt{\frac{\log(\frac{4|\Theta|}{\delta})}{2N}} \\&+
		\frac{R_{max}+1}{(1-\gamma)^2C_H} \sqrt{ \frac{\log(\frac{8|\tS||\mathcal{A}||\Theta|}{\delta})}{2N} } \frac{4}{C_{uni} -\sqrt{\frac{\log(\frac{4|\tS||\mathcal{A}||\Theta|}{\delta})}{2N}}  }   \bigg)  \geq 1-\delta.
	}
	Combined with Lemma~\ref{lem:expectation}, 
	\alis{
		\textnormal{P}\bigg(\norm{\htheta - \theta^*}& \leq
		\frac{4}{C_H(1-\gamma)} \bigg(\frac{R_{\max}+1}{1-\gamma}\frac{4}{n_s^{\frac{1}{n_a}}-1}+2\epsilon_Q + \epsilon_{c}\bigg)+\frac{4(R_{max}+1)}{(1-\gamma)C_H} \sqrt{\frac{\log(\frac{4|\Theta|}{\delta})}{2N}} \\&+
		\frac{R_{max}+1}{(1-\gamma)^2C_H} \sqrt{ \frac{\log(\frac{8n_sn_a|\Theta|}{\delta})}{2N} } \frac{4}{C_{uni} -\sqrt{\frac{\log(\frac{4n_sn_a|\Theta|}{\delta})}{2N}}  }   \bigg)  \geq 1-\delta.
	}
	
	
	
	
	
	\bibliography{geng_527}
	
\end{document}
