\section{Preliminary}
% \begin{tikzpicture}[scale=2]
%   \coordinate (O) at (0,0);
%   \coordinate (A) at (1,0);
%   \draw (O) circle (1cm);
%   \draw[->,thick] (O) -- (.701,.701) node[above,above] {$\mathbf{A^*}$};
%   \draw[->,thick] (O) -- (0.258819045103,0.965925826289) node[above left] {$\boldsymbol{\alpha_1}$};
%   \draw[->,thick] (O) -- (0.965925826289,0.258819045103)node[above right] {$\boldsymbol{\alpha_2}$};
%   \draw[->,dashed] (O) -- (-0.176776695297, 0.176776695297)node[above left] {$\boldsymbol{S_1}$};
%   \draw[->,dashed] (O) -- (0.176776695297, -0.176776695297)node[below right] {$\boldsymbol{S_2}$};
% \end{tikzpicture}
% In this section, we will discuss the prerequisite knowledge about the setting of Linear Stochastic Bandits and the problem of Inverse Reinforcement Learning. \Cref{prelim:lsb} provides background on  the setting of Linear Stochastic Bandits. In \Cref{prelim:phased_elim}, we then discuss the main algorithm used to solve this problem: Phased Elimination. We then present the main problem of Inverse Reinforcement Learning in \Cref{prelim:irl}, along with the assumptions that our Inverse Reinforcement Learning algorithm will use.


\subsection{Linear Stochastic Bandits}
\label{prelim:lsb}
In the Stochastic Bandit setting, a demonstrator sequentially chooses an action $A$ from an action set $\mathbb{A}$ and receives reward $r(A)$, repeating this for $T$ timesteps. The goal of the demonstrator is to maximize the sum of rewards over time. In the Linear Stochastic Bandit setting, each action $A$ is a vector and the action set $\mathbb{A}$ is a convex subset of $\mathbb{R}^d$, where $d$ is the dimension. Furthermore, the reward function is linear and parameterized by a true parameter $\theta \in \mathbb{R}^d$, so that $r(A) = \langle \theta, A \rangle + \eta\text{.}$ Here, $\eta$ is noise sampled from a zero-mean subgaussian distribution. The reward parameter $\theta$ is unknown to the demonstrator beforehand, so the demonstrator must balance exploration and exploitation to achieve high expected reward. Phased Elimination is one algorithm for this setting that achieves low expected regret.
% \subsubsection{Shape of Action Set}

% We state that the constraints of $\mathbb{A}$, \ie the hull formed by $\mathbb{A}$, defined as the function $g: \mathbb{R}^d \to {0, 1}$, where an arm $A$ is within the hull of $\mathbb{A}$ if $g(A) = 1$, else $g(A)= 0$. Furthermore, we will impose the condition that our action set is dense and finite. We formally define these assumptions in $\Cref{rem:shape}$.

% \begin{assumption}
%  \label{rem:shape}
% We state that $\mathbb{A}$ is a finite set. Furthermore, for any vector $\tilde{A} \in \mathbb{R}^d$ such that $g(\tilde{A}) = 1$, there exists an arm $A \in \mathbb{A}_0$ such that $\norm{A - \tilde{A}}_2 \leq \gamma$ where $\gamma = \mathbb{O}\left(\frac{1}{T^2d^2}\right)$
% \end{assumption}
% This ensures that the hull generated by the function $g$ is $\gamma$-covered by the action set $\mathbb{A}$. 
%  To aid for future analysis, we define the function $f(\Beta)$. It takes in a set of $d$ angles in the form of $\Beta = \{\Beta_1, \Beta_2, \dots, \Beta_d\}$ and returns a vector $A$ that aligns with the angles $\Beta$ and has maximal length such that $g(A) = 1$. 



\subsection{Phased Elimination}
\RestyleAlgo{ruled}
\begin{wrapfigure}{r}{0.5\textwidth}
  \begin{minipage}{0.5\textwidth}
  \vspace{-25pt}
\begin{algorithm}[H]
 \caption{Phased Elimination}
 \label{alg:phased_elim}
  \KwData{$\delta, T$}
  \KwResult{$A_1, \dots, A_T$}
  $l \leftarrow 1$\;
  $\mathbb{A}_1 \leftarrow \mathbb{A}$\;
  $N_0 \leftarrow 0$\;
  \While{$\sum_{i=1}^{l} N_{i - 1} \leq T$}{
        $\varepsilon_l \leftarrow 2^{-l}$ \\
        $\pi_l \leftarrow \text{G-Optimal design of } \mathbb{A}_l$\\
        $N_{l} \leftarrow 0$\\
        % \For{$A \in \mathbb{A}_l$}{
        %     \State{$n_l(A) \leftarrow \ceil{\pi_l(A) \cdot \frac{g(\pi_l) }{\varepsilon_l^2}\log{\frac{1}{\delta}}}$}\\
        %     \State{$N_{l} \leftarrow N_{l} + n_l(A)$} \\
        % }
        $\text{Play each action } A \in \mathbb{A}_l \text{ each } n_l(A) \text{ times}$  \\
        $V_l \leftarrow \sum_{A \in \mathbb{A}_l} n_l(A) AA^\top$ \\
        $\hat{\theta}_l \leftarrow V_l^{-1} \sum_{t=t_l}^{t_l + T_l} A_t X_t$ \\
        $\mathbb{A}_{l+1} \leftarrow \{ A \in \mathbb{A}_l \text{ s.t. } \max_{B \in \mathbb{A}_l} \langle \hat{\theta}_l, B - A \rangle \leq 2\varepsilon_l\}$\\
        $l \leftarrow l + 1$\\
}
\end{algorithm}
\end{minipage}
\end{wrapfigure}
\label{prelim:phased_elim}
Phased Elimination sequentially selects arms and then eliminates arms in phases \citep{valko14}. Notably, during a phase $l$, it picks an experimental design over the set of non-eliminated arms, plays the arms according to that design, updates its estimate of $\theta$, and eliminates arms that it deems suboptimal. This algorithm is known to achieve a regret bound on the order of $\mathbb{O}(\sqrt{dT})$, the best regret bound known for linear bandit settings \citep{valko14}. In each phase, Phased Elimination chooses an exploratory distribution, or G-optimal design, over the remaining arms, using probability parameter $\delta$ and error parameter $\epsilon$, such that the distribution of arms covers all exploratory directions as much as possible. We provide more details on how to choose G-optimal designs in the appendix, since only the effects of G-optimal designs are needed here. After the G-optimal design has been executed, the algorithm encourages exploitation by eliminating suboptimal arms: if any arm is worse than another arm in the active set by more than $2\epsilon_l$, it is eliminated. By combining these two steps, which encourage exploration and exploitation respectively, Phased Elimination achieves a low regret bound. For clarity, we provide further details of Phased Elimination in \Cref{alg:phased_elim}. Now that we have defined the forward algorithm, we discuss forming the Inverse Reinforcement Learning algorithm.

\subsection{Inverse Reinforcement Learning for Linear Stochastic Bandits}
The process of Inverse Reinforcement Learning is that of learning the reward function given only the actions taken by a demonstrator. Specifically, our inverse learner only has knowledge of the actions taken within each phase $\pi_1, \dots, \pi_L$ and the actions available at each phase $\mathbb{A}_1, \dots, \mathbb{A}_L$, where $L$ is the last phase run. Our inverse learner aims to produce an estimate $\hat{\theta}$ of $\theta$ that minimizes the relative error $\frac{\norm{\hat{\theta} - \theta}_2}{\norm{\theta}_2}$. The true parameter $\theta$ is unknown to both the forward and inverse algorithms. We also assume that our inverse learner knows that the reward function is a linear stochastic reward function and that the forward algorithm employs Phased Elimination. Moreover, a common assumption in the Inverse Reinforcement Learning literature is that the inverse algorithm knows the best reward achievable by any arm. Such an assumption helps avoid trivial identifiability issues, as seen in \citet{guo2021learning}.

\begin{assumption}
    \label{ass:knowledge_best}
    We assume our inverse learner knows the optimal arm $A^*$ and its expected reward $\mu^* = \langle \theta, A^* \rangle$.
\end{assumption}


\section{Assumptions on the Environment}
% 1. Introduce how the action set can change the problem
% 2. Provide two examples of degenerate action sets: not smooth and not dense
% 3. Motivate why smoothenss and denseness are practical and realstic assumptions 
% 4. Formal density assumption and smoothness assumptions

This setting and task can vary greatly in difficulty depending on how the reward function and action set are constructed. In fact, for certain shapes of action sets, the task of Inverse Reinforcement Learning can degenerate into other, already studied cases. There are two concrete examples of this. The first is where the action set is sparse, with large gaps between different actions. One extreme example is where the action set consists only of an orthogonal set of arms, which reduces to the Multi-Armed Bandit setting. The second example is where the reward function is not smooth around the optimal arm. If the action set spikes around the optimal arm, any demonstrator could exploit such degeneracy and forgo exploring the action set, making the task of Inverse Reinforcement Learning much more difficult. Therefore, we make two assumptions. First, we assume that the action set is dense. To define density, we first define the term $\gamma$-close, which gives a measure of geometric closeness.
\begin{definition}
\label{def:gammacloseness}
    Two vectors $X, Y \in \mathbb{R}^d$ are $\gamma$-close if $\norm{X - Y}_2 \leq \gamma\text{.}$
\end{definition}

This metric of closeness is important and intuitive since two arms that are close according to \Cref{def:gammacloseness} have close rewards. We formalize this intuition in \Cref{lem:gam_close_rewards}. 
\begin{restatable}{lemma}{gammacloserewards}
\label{lem:gam_close_rewards}
Given two arms $X, Y$ that are $\gamma$-close, the difference in their expected rewards $\mu_X$ and $\mu_Y$ is bounded by
$|\mu_Y - \mu_X| \leq \gamma \norm{\theta}_2 \text{.}$
\end{restatable}

Therefore, $\gamma$ serves as a parameter describing the density of the action set. For our analysis, we only need density around the boundary of the action set, since the boundary contains all relevant information for reward estimation. We formalize this assumption in \Cref{rem:shape}.
        
\begin{assumption}
\label{rem:shape}
We assume that there exists a function $g: \mathbb{R}^d \to \{0, 1\}$, defined everywhere, such that the following two conditions hold:
\begin{enumerate}
    \item{The action set $\mathbb{A}$ is a subset of the set of all points that satisfy $g(\cdot) = 1$. Formally, \[\mathbb{A} \subset S_g \text{ where } S_g \coloneqq \{v \text{ s.t. } g(v) = 1\}\text{.}\]}
    \item{Let $l$ be the largest phase such that $\gamma \leq \frac{2\epsilon_l}{\norm{\theta}_2}$. Then every point in the boundary $\partial S_g$ of the set $S_g$ is $\gamma$-close to at least one point in $\mathbb{A}$. That is, $\forall a \in \partial S_g, \exists b \in \mathbb{A} \text{ s.t. } \norm{a - b}_2 \leq \gamma\text{.}$}
    
\end{enumerate}

\end{assumption}

Here, \Cref{rem:shape} simply states that there is some function $g$ that describes the action set and that the action set densely covers the boundary of $S_g$ with parameter $\gamma \leq \frac{2\epsilon_l}{\norm{\theta}_2}$.

Now that we have provided our density assumption, we place an assumption on the smoothness of the action set. Namely, an arm that is geometrically close to the optimal arm cannot have an extremely different reward. This prevents the degenerate case where the optimal arm ``spikes'' away from the action set, allowing the forward algorithm to exploit this spike. To formalize this, we will slightly abuse notation.

\begin{wrapfigure}{R}{0.5\textwidth}
\centering
\begin{tikzpicture}
\node (A) at (0.5,-1) {};
\node (O) at (0,0) {};
\node (S) at (-3,-1.4) {};
\node (Sperp) at (-1.8, 1.5) {};
\node (Sperp-base) at (-2, 0.4) {};
\node (fb) at (2,0.25) {};

\draw (0,0) ellipse (3cm and 2cm);
\draw [fill=blue] (A) circle (2pt) node [left] [label=below: {$A^*$}] {};
\draw [fill=blue] (S) circle (2pt) node [left] [label=below: {$S_i$}] {};
\draw [fill=blue] (O) circle (2pt) node [left] [label=below: {$O$}] {};
\draw [fill=blue] (fb) circle (2pt) node [left] [label=above: {$f(\beta, i)$}] {};
\draw [fill=red] (Sperp) circle (0pt) node [left] [label=right: {$S_\perp$}] {};

  \greatcircle[red] {0,0} {2.95cm}{0.9cm}{-10}
    \draw[fill] (0,0) circle (1pt) node[xshift=5pt] {}; 
    \draw[-latex, blue, thick] (O) -- (A);
    \draw[-latex, blue, thick] (O) -- (S);
    \draw[-latex, blue, thick] (O) -- (fb);
    
    \draw[-latex, red, thick] (Sperp-base) -- (Sperp);
    
    \draw (0.5,-1) coordinate (A) -- (0,0) coordinate (O)
         --  (2,0.25) coordinate (C)
           pic ["$\beta$",draw,->,black,thick,angle radius=0.5cm]{angle = A--O--fb};
\end{tikzpicture}
\caption{Rotating $A^*$ to other boundary points of $S_g$ (within the plane $H_i$ defined by $S_i, O, A^*$)} \label{fig: rotation in plane}
\end{wrapfigure}

\kri{Check function definition here}
We define the function
\[f(\beta, i) = v \text{ s.t. } v \in \partial S_g \cap H_i, \quad H_i \coloneqq \{x \in \mathbb{R}^d \mid (x-A^*) \tran S_\perp = 0\} \text{ and } \arccos\left(\frac{\langle v, A^*\rangle}{\|v\|_2\|A^*\|_2}\right) = \beta \text{.}\]

%earlier definition
% $$f(\beta, i) = v \text{ s.t. } v \in \partial S_g \text{ and } \arccos\left(\frac{\langle v, A^*\rangle}{\|v\|_2\|A^*\|_2}\right) = \beta \text{ and } \frac{v - A^*}{\|v - A^*\|_2} = S_i  \text{.}$$
Here, $S_i$ is the $i$th vector of the regular $(d-1)$-simplex centered at the origin.
% (we assume an arbitrary ordering of the $d$ simplex vectors $S_i$). 
The function $f(\beta, i)$ finds the rotation of the optimal arm $A^*$ by an angle of $\beta$ in the hyperplane defined by the simplex vector $S_i$ and the optimal arm $A^*$, such that the rotated vector lies on the boundary of the action set $\partial S_g$. Note that $f(0, i)$, for any $i \in [d]$, simply returns the optimal arm $A^*$. We will also consider a variant of the reward function, $r(\beta, i) \coloneqq \langle f(\beta, i), \theta \rangle$. With this notation, we can state our assumptions. We desire that the reward function be smooth on the space $\partial S_g$ with respect to the $\ell_2$ norm. This is formalized in \Cref{ass:lip_smooth}.
\begin{assumption}
\label{ass:lip_smooth}
\textbf{OLD: }\\\\
We assume the following properties of $r(\beta, i)$. We assume that for every angle $\beta \in [0, \pi]$ and index $i \in [d]$, the reward function is $(\mathbb{L}, \omega)$-smooth, in the sense that
 \[|r(0, i) - r(\beta, i)| \leq \mathbb{L} \beta^\omega\text{.}\] Here, we require that $\omega \geq 1$.\\\\

 \textbf{NEW: }\\\\
 We assume the following property of $r(\beta, i)$. We assume that for every index $i \in [d]$, $r(0, i) - r(\beta, i) = 6 \cdot 2^{-L}$ implies that $\beta > \left(\frac{6 \cdot 2^{-L}}{\mathbb{L}}\right)^{\frac{1}{\omega}}$ for smoothness constants $\mathbb{L}$ and $\omega$.
\end{assumption}
% We also assume that the reward parameter is not very small in that 
% \begin{restatable}{assumption}{thetasmall}
%     \label{ass:thetasmall}
%     We assume the true reward vector is not small as in $\|\theta\|_2 \geq  2*2^{-L}\left[ \frac{6*2^{-L}}{\mathbb{L}}\right]^{-\frac{1}{\omega}}\cdot d^{\frac{1}{2}}$.  
% \end{restatable}
Moreover, we will assume that the boundary is connected and that $f$ is defined for all angles $\beta$ and indices $i$, to avoid trivial continuity issues. We formalize this in \Cref{ass:continuity}.

\begin{restatable}{assumption}{continuity}
\label{ass:continuity}
% Furthermore, we assume that with any ray $\mathbf{v}_\Beta$, defined as the line that forms an angle $\Beta$ with the optimal arm $A^*$ and goes through the origin. There is only one point in the intersection between $\partial S_g$ and $\mathbf{v}_\Beta$, 
% $$|\mathbf{v}_\Beta \cap \partial S_g| = 1$$ and furthermore
% $$\mathbf{v}_\Beta \cap \partial S_g = \{f(\Beta)\}\text{.}$$
We assume that the function $f$ is defined for all $\beta \in [-\pi, \pi]$ and $i\in[d]$. This means that $\partial S_g$ is continuous over all directions.
\end{restatable}

Therefore, we have assumed that our action set is smooth, continuous, and dense. Such assumptions are practical and are often satisfied by datasets from experiments across the literature.

 
% \label{sec:shape_as}
% The set of actions from which a demonstrator can choose is some finite set of vectors $\mathbb{A}$. There are many possible shapes such a set can take. For example, if $\mathbb{A}$ consists only of orthogonal arms, this can be seen as the traditional Multi-Armed Bandit setting. However, a more interesting case is when many arms in $\mathbb{A}$ are not orthogonal, where their rewards are linked by how close they are to each other. We will define several characteristics of $\mathbb{A}$ that are useful for our analysis. 
% Furthermore, we wish to study \emph{nondegenerate} action sets for our purposes. An example of a degenerate action set could be the $\ell_2$ unit ball but with a large spike representing $A^*$. In this case, any demonstrator could exploit such degeneracy, relinquishing its requirement to explore the action set. This degeneracy would make it difficult, intuitively, for any inverse estimator to learn about the rewards associated with directions orthogonal from the optimal arm. In this manner, any inverse estimator would suffer a significant error in its estimation of $\hat{\theta}$. 
% In order to alleviate these issues, we will make several assumptions on our action set $\mathbb{A}$. 
% Moreover, any two arms that are $\gamma$-close are also close in rewards.


                                                                                                    

% Intuitively, this set $\partial S_g$ is the boundary of our action set or the points in the action set. From the other direction, we can say that $\mathbb{A}$ forms a $\gamma$-covering of the set $\partial S_g$ with respect to the $\ell_2$ norm. To achieve such a denseness requirement, there is an intuitive relationship between $\gamma$, the parameter of the denseness, and $K$, the size of the action set $\mathbb{A}$. We quickly remark on such a relationship. 

% \begin{restatable}{lemma}{gamtok}
%     \label{lem:gam_to_k} Assuming our action set $\mathbb{A}$ is nicely chosen to be the smallest possible $\gamma$-covering of $\partial S_g$, the number of arms $K$ in $\mathbb{A}$ is upper bounded by $$K \leq \frac{\Psi_d}{\gamma}$$
%     where $$\Psi_d \coloneqq \frac{\Gamma\left(\frac{d}{2} + 1\right)}{\Gamma\left(\frac{d}{2} + \frac{1}{2}\right)} \cdot \frac{1}{\sqrt{\pi}} (2\pi)^d\cdot \operatorname{SA}(\partial S_g)\text{.}$$
%     We use here that the function $\operatorname{SA}$ returns the surface area of the space $S_g$. 
% \end{restatable}

% We need the following density assumption on $\gamma$.
% \begin{assumption}
% \label{ass:gamsmall}
% We require $\gamma$ to be sufficiently small as in $\gamma \leq \sqrt{\frac{2\epsilon_L}{\norm{\theta}_2^2}}$.
% \end{assumption}
% Now, a degenerate action set would mean two points in $\partial S_g$ that are geometrically close to each other with very different rewards. We place a continuousness assumption to analyze nondegenerate action sets. To ease the analysis, we will consider the function $r(\Beta) \coloneqq \langle f(\Beta), \theta \rangle$ where $f(\Beta): [0, 2\pi]^d \to \partial S_g$ takes a set of $d$ angles in $\Beta \coloneqq [\Beta_1, \dots, \Beta_d]$ and finds the vector in $\partial S_g$ that most aligns with the set of angles with the largest norm. For notational ease, we define $\Beta^* \coloneqq [0]^d$ where $f(\Beta^*)$ is $\gamma$-close to  $A^*$. A vector $f(\Beta)$ forms an angle of $\Beta_i$ with the optimal arm $A^*$ in the hyperplane defined by the $i$th and $i+1$th axes.


% This assumption is equivalent to the statements that the boundary $\partial S_g$ is a connected set and $S_g$ contains no empty space.
% By definition, we state that $\Beta^* $ is associated with $\mu^* - \gamma^2\norm{\theta}_2^2 \leq r(\Beta^*) \leq \mu^* + \gamma^2\norm{\theta}_2^2$ and $f(\Beta^*)$ is $\gamma$-close to  $A^*$.

% \begin{assumption}
% \label{ass:lip_smooth}
% We assume the following properties of $r(\Beta)$. We assume for some neighborhood around $\Beta^*$, 
% i.e. $\mathcal{N}_{\beta^*} \coloneqq \{\Beta \text{ s.t. } \norm{\Beta - \Beta^*}_2 \leq \frac{1}{d}\}$, the function $r$ is concave. Moreover,
%  we assume that the function $r(\Beta)$ is 2-Holder Continuous with coefficient $\mathbb{H}$ with respect to the $\ell_2$ norm. We denote the minimum eigenvalue of $\mathbb{H}$ as $\mathbb{L}$. Formally, for any two points $\Beta,\Beta' \in \mathcal{N}_{\beta^*}$,
%  $$r(\Beta) - r(\Beta') \geq \mathbb{H} \norm{\Beta - \Beta'}_2^2 \text{.}$$
%  Furthermore, if $\Beta$ and $\Beta'$ only differ in one dimension $i$, then we can further write that 
%  $$r(\Beta) - r(\Beta') \geq \mathbb{L} (\Beta_i - \Beta'_i)^2\text{.}$$
% \end{assumption}

% To analyze this problem, we will slightly abuse notation. We state that we will say that $\beta_i$ is the vector of $d$ angles containing all values for $0$ except at the $i$th index, where it contains the value $\beta$. This vector looks like

 
%  In this way, $f(\beta_i)$ is the arm formed by rotating the optimal arm in the hyperplane defined by $i$th and $i+1$th axes by angle $\beta$. 
% \label{prelim:irl}

