\subsubsection*{\bibname}}




\begin{document}


\runningauthor{Volodymyr Tkachuk, Seyed Alireza Bakhtiari, Johannes Kirschner, Matej Jusup, Ilija Bogunovic, Csaba Szepesvári}
\newcommand{\papertitle}{Efficient Planning in Combinatorial Action Spaces with Applications to Cooperative Multi-Agent Reinforcement Learning}

\twocolumn[
\aistatstitle{Efficient Planning in Combinatorial Action Spaces with Applications to Cooperative Multi-Agent Reinforcement Learning}

\aistatsauthor{ 
	Volodymyr Tkachuk$^*$  \And
	Seyed Alireza Bakhtiari$^*$ \And
	Johannes Kirschner}

\aistatsaddress{ University of Alberta \And University of Alberta  \And University of Alberta}

\aistatsauthor{ 
	Matej Jusup \And
	Ilija Bogunovic\And
	Csaba Szepesvári }

\aistatsaddress{ETH Zurich \And University College London \And University of Alberta/DeepMind }  ]

\begin{abstract}
A practical challenge in reinforcement learning is posed by combinatorial action spaces, which make planning computationally demanding. For example, in cooperative multi-agent reinforcement learning, a potentially large number of agents jointly optimize a global reward function, which leads to a combinatorial blow-up in the action space by the number of agents. As a minimal requirement, we assume access to an argmax oracle that allows us to efficiently compute the greedy policy for any Q-function in the model class. Building on recent work in planning with local access to a simulator and linear function approximation, we propose efficient algorithms for this setting that lead to polynomial compute and query complexity in all relevant problem parameters. For the special case where the feature decomposition is additive, we further improve the bounds and extend the results to the kernelized setting with an efficient algorithm.
\end{abstract}

\input{parts/introduction}
\input{parts/related_work}
\input{parts/notation}
\input{parts/preliminaries}
\input{parts/algorithms}
\input{parts/additive}

\input{parts/conclusion}

 \subsubsection*{Acknowledgements}
 Johannes Kirschner gratefully acknowledges funding from the SNSF Early Postdoc.Mobility fellowship P2EZP2\_199781.
 Matej Jusup gratefully acknowledges support by the Swiss National Science Foundation under the research project DADA/181210.
Csaba Szepesv\'ari gratefully acknowledges funding  from the Canada CIFAR AI Chairs Program, Amii and NSERC.


\bibliographystyle{unsrtnat}

\section{Introduction}
We consider a cooperative multi-agent reinforcement learning setting with $m \in \mathbb{N}$ agents. 
Let $\mathcal{S}$ be a state space that is shared for all agents, and $\mathcal{A}^{(i)}$ the set of actions available to agent $i \in [m]$. 
We use the following notation to denote the joint agent action space
$\mathcal{A}^{(1:m)} = \mathcal{A}^{(1)} \times ... \times \mathcal{A}^{(m)},$
and the following to denote an action vector
$a^{(1:m)} = (a^{(1)}, ..., a^{(m)}),$
where $\times$ is used to denote a Cartesian product and $a^{(i)} \in \mathcal{A}^{(i)}$ is the action agent $i$ took.
Denote $A^{(1:m)} = |\mathcal{A}^{(1:m)}|$ and $A^{(j)} = |\mathcal{A}^{(j)}|$ where $j \in [m]$ and we use notation $[a] = \{1, ..., a\}, \ a \in \mathbb{N}$ throughout.
There is a global transition kernel $P : \mathcal{S} \times \mathcal{A}^{(1:m)} \rightarrow \Delta_{\mathcal{S}}$, where $\Delta_{\mathcal{S}}$ denotes the set of probability measures over $\mathcal{S}$. 
That is, given a state $s \in \mathcal{S}$ and action vector $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ for each agent, the system transits to a new state $s' \sim P(s, a^{(1:m)})$. 
Each agent $i=1,\dots,m$ has a reward function $r_i : \mathcal{S} \times \mathcal{A}^{(i)} \rightarrow [0, 1]$. A centralized, deterministic policy $\pi :\mathcal{S} \rightarrow \mathcal{A}^{(1:m)}$ maps states to $m$ actions.

The global value of a policy $\pi$ is
\begin{align*}
	V_\pi(s_0) = \EE_{\pi}[\sum_{h=1}^H \sum_{i=1}^m r_i(s_h, a_h^{(i)})|s_0] \,.
\end{align*}
The expectation is over the sequence of states $s_1, \dots, s_H$ and action tuples $(a_h^{(1:m)})_{h=1}^H$ sampled from the transition kernel $P$ and the policy $\pi$ for a given starting state $s_0$. A policy $\pi^*$ is called optimal if $V_{\pi^*} = \max_{\pi} V_{\pi}$.

The value of agent $i$ under the joint policy $\pi$ is
\begin{align*}
	V_{\pi,i}(s_0) = \EE_{\pi}[\sum_{h=1}^H r_i(s_h, a_h^{(i)})|s_0] \,.
\end{align*}
Note that the value depends on the actions of all agents through the joint transition kernel. The Q-function for agent $i$ is defined analogously,
\begin{align*}
	Q_{\pi}^{(i)}(s,a^{(1:m)}) = r_i(s,a^{(i)})  + \EE_{s' \sim P(s,a^{(1:m)})}[V_{\pi,i}(s')]
\end{align*} 
and $Q_\pi(s,a^{(1:m)}) = \sum_{i=1}^m Q_{\pi}^{(i)}(s, a^{(1:m)})$.

Our goal is to find a computation and sample efficient local planner with access to a simulator. 
The simulator can be used to sample transitions from the kernel $P$. 
We also assume access to state-action features $\phi_i : \mathcal{S} \times \mathcal{A}^{(i)} \rightarrow \mathbb{R}^d$. We make the following realizability assumption:

\begin{assumption}
\label{ass: ass1}
	For each (deterministic) policy $\pi$, there exist weight vectors $w_1, \dots, w_m \in \mathbb{R}^d$ such that $Q_{\pi}^{(i)}(s,a^{(1:m)}) = \sum_{j=1}^m \phi_j(s,a^{(j)})^\top w_i$, and hence $Q_\pi(s,a^{(1:m)}) = \sum_{j=1}^m \phi_j(s,a^{(j)})^\top \sum_{i=1}^m w_i$.
\end{assumption}
This is a quite restrictive assumption that requires the $Q_{\pi}^{(i)}$ function to decompose additively. 
This means the effect of an action chosen by agent $i$ on the $Q_{\pi}^{(i)}$ function is independent of actions chosen by other agents.

A slightly less restrictive assumption is the following. 
Assume that we are given access to features $\phi_{i,j}(s,a) \in \mathbb{R}^d$ for each pair of agents $i,j \in [m]$.
\begin{assumption}
\label{ass: ass2}
	For each (deterministic) policy $\pi$, there exist weight vectors $w_1, \dots, w_m \in \mathbb{R}^d$ such that $Q_{\pi}^{(i)}(s,a^{(1:m)}) = \sum_{j=1}^m \phi_{i,j}(s,a^{(j)})^\top w_i$. Therefore, 
	\[Q_\pi(s,a^{(1:m)}) = \sum_{i=1}^m \sum_{j=1}^m \phi_{i,j}^\top(s,a^{(j)}) w_i= \sum_{j=1}^m \tilde \phi_j(s, a^{(j)})^\top \tilde w \,,\]
	where $\tilde \phi_j(s, a) = (\phi_{1,j}(s, a), \dots, \phi_{m,j}(s,a)) \in \mathbb{R}^{md}$ and $\tilde w = (w_1, \dots, w_m) \in \mathbb{R}^{md}$ concatenates the agent specific features and weights respectively.
	
\end{assumption}


\paragraph{Goals} Assume that $|\mathcal{A}^{(i)}| = A$ for all $i \in [m]$ and some constant $A$. 
The objective is to find a planner with corresponding policy $\pi$ that provably satisfies $V_{\pi^*}(s) - V_{\pi}(s) \leq \epsilon$ and has sample and computation complexity $\text{poly}(H, A, m, d, 1/\epsilon)$. 
The work by \citet{yin2021efficient} provides a planner that is $\text{poly}(H, A^m, d, 1/\epsilon)$, which could serve as a starting point.
The idea is to be more careful about which actions to evaluate, and to use the feedback for each individual agent. 

Further, we are planning to empirically evaluate different ideas on toy examples. At a later stage, we could also design a scaling solution based on neural networks, for example taking inspiration from the AlphaZero algorithm \citep{silver2018general}.


\paragraph{Motivating Application} The \emph{Flatland} environment \citep{mohanty2020flatlandrl} is a railway simulator. Provided with a rail network, the goal is to route trains from a starting point to a destination while minimizing the total delay. This is also known as the \emph{vehicle routing problem}. This setting can be modeled as a multi-agent planning problem. The state $s$ captures the railway network and the current position of all trains. Each train corresponds to one agent. The agents choose movement actions $\{\text{STOP}, \text{FORWARD}, \text{LEFT}, \text{RIGHT}\}$ as permitted by the rail network. Trains can experience stochastic malfunctions that may require replanning. The objective is to minimize overall delay; therefore, the agent-specific reward function is set to the negative delay.

\paragraph{Related Work}

QMix: \cite{rashid2018qmix}

Local Utility functions: \cite{zohar2021locality}

Potential Games: \cite{leonardos2021global}


\section{MDP Setting Analysis}

For ease of reference we include the naive ($O(\Pi_{i=1}^m |A^{(i)}|, ...)$ computation) multi-agent implementation of Algorithm 1, Confident Rollout, from \cite{yin2021efficient}.
Unlike in the original algorithm, we will use $n$ to denote the number of samples and $H$ to denote the length of the trajectory. This change is made so that we can continue to use $m$ to represent the number of agents.
We will work under Assumption \ref{ass: ass1} for all the following subsections. 

We define $w = \sum_{i=1}^m w_i$ and $\phi(s, a^{(1:m)}) = \sum_{i=1}^m \phi_i(s, a^{(i)})$.

\begin{algorithm}
\caption{Multi-Agent Confident Rollout} \label{alg:confident rollout}  
\begin{algorithmic}[1]
\State \textbf{Input:} number of rollouts $n$, length of rollouts $H$, weight vector $w \in \mathbb{R}^d$, discount $\gamma$, initial state $s_0$, initial action vector $a_0^{(1:m)}$, feature matrix $\Phi_\mathcal{C}$, regularization coefficient $\lambda$, threshold $\tau$.
\For {$i = 1, ..., n$}
    \State $s_{i, 0} \gets s_0, a_{i, 0} \gets a_0^{(1:m)}$, query the simulator, obtain reward $r_{i, 0} \gets r(s_{i, 0}, a_{i, 0})$, and the next state $s_{i, 1}$.
    \For {$t = 1, ..., H$} 
        \For {$a^{(1:m)} \in \mathcal{A}^{(1:m)}$} 
            \State Compute feature $\phi(s_{i, t}, a^{(1:m)})$.
            \If {$\phi(s_{i, t}, a^{(1:m)})^\top (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \phi(s_{i, t}, a^{(1:m)}) > \tau$}
                \State status $\gets$ uncertain, result $\gets (s_{i, t}, a^{(1:m)}, \phi(s_{i, t}, a^{(1:m)}), \text{none})$
                \State \Return {status, result}
            \EndIf
        \EndFor 
        \State $a_{i, t}^{(1:m)} \gets \argmax_{\tilde{a}^{(1:m)} \in \mathcal{A}^{(1:m)}} w^\top \phi(s_{i, t}, \tilde{a}^{(1:m)})$.
        \State Query the simulator with $s_{i, t}, a_{i, t}^{(1:m)}$, obtain reward $r_{i, t} \gets r(s_{i, t}, a_{i, t}^{(1:m)})$, and next state $s_{i, {t+1}}$.
    \EndFor 
\EndFor
\State status $\gets$ done, result $\gets \frac{1}{n} \sum_{i=1}^n \sum_{t=0}^H \gamma^t r_{i, t}$ 
\State \Return status, result
\end{algorithmic}
\end{algorithm}

The issue with the above naive approach is that the loop in line 5 and the argmax in line 12 will require $O(\Pi_{i=1}^m |A^{(i)}|)$ computation.

We highlight that the argmax in line 12 can be computed in $O(\sum_{i=1}^m |A^{(i)}|)$ time under Assumption \ref{ass: ass1} as
\[a_{i, t}^{(1:m)} = \left(\argmax_{\tilde{a}^{(1)} \in \mathcal{A}^{(1)}} w^\top \phi_1(s, \tilde{a}^{(1)}), ..., \argmax_{\tilde{a}^{(m)} \in \mathcal{A}^{(m)}} w^\top \phi_m(s, \tilde{a}^{(m)})\right).\]

The difficulty then lies in reducing the computation time in the loop in line 5 from $O(\Pi_{i=1}^m |A^{(i)}|) \to O(\sum_{i=1}^m |A^{(i)}|)$.
Next we present two approaches to do this.

Define $V_\mathcal{C} = \Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I$ and a weighted matrix norm as $||x||_{Y}^2 = x^\top Y x, \ x \in \mathbb{R}^d, Y \in \mathbb{R}^{d \times d}$. 
Define the \textit{good set} as $\mathcal{H} = \{ \phi(s, a^{(1:m)}): ||\phi(s, a^{(1:m)})||_{V_\mathcal{C}^{-1}}^2 \le \tau, s \in \mathcal{S}, a^{(1:m)} \in \mathcal{A}^{(1:m)} \}$.
Line 7 can have two cases. Case A: $||\phi(s, a^{(1:m)})||_{V_\mathcal{C}^{-1}}^2 > \tau$. Or Case B: $||\phi(s, a^{(1:m)})||_{V_\mathcal{C}^{-1}}^2 \le \tau$.
Then lines 5-11 can have two cases. Case 1: Case A holds for at least one $a^{(1:m)} \in \mathcal{A}^{(1:m)}$. Or Case 2: Case B holds for all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
The goal of lines 5-11 was to ensure that we are able to identify whether we are in Case 1 or Case 2. This was important for two reasons.

Reason 1: was to make sure that the virtual algorithm (VA) and main algorithm (MA) had identical trajectories, until Case A occurred for the first time.

Reason 2: was to ensure that the VA was greedy w.r.t. 
\begin{equation*}
\tilde{Q}(s, a^{(1:m)})=\begin{cases}
                      \tilde{w}^\top \phi(s, a^{(1:m)}) \quad &\text{if} \, \phi(s, a^{(1:m)}) \in \mathcal{H} \\
                      Q_{\tilde{\pi}}(s, a^{(1:m)})     \quad &\text{if} \, \phi(s, a^{(1:m)}) \notin \mathcal{H} \\
                \end{cases}
\end{equation*}
since $\phi(s, a^{(1:m)}) \in \mathcal{H}$ is needed for Lemma B.3 in \cite{yin2021efficient}.

\subsection{Size of the Core Set}
The upper bound on the size of the core set depends on the 2-norm of the features added to it.
We introduce what we believe is a more natural assumption on the bound of the features for the multi-agent setting (compared to a bound of $1$).
\begin{assumption}[Bounded features]
\label{ass: bounded features}
We assume that $||\phi(s, a^{(1:m)})||_2 = ||\sum_{i=1}^m \phi_i(s, a^{(i)})||_2 \le m$ for all $(s, a^{(1:m)}) \in \mathcal{S} \times \mathcal{A}^{(1:m)}$.
\end{assumption}
We have the following theorem bounding the size of the core set:

\begin{theorem}
[Size of the core set]
\label{thm:size of core set}

Under Assumption \ref{ass: bounded features} the size of the core set $\mathcal{C}$ will not exceed
\begin{equation}
    C_{\max} := \frac{m^2 d(1 + \tau)}{\lambda \tau^2}.
\end{equation}
\end{theorem}
\begin{proof}
    The proof is almost the same as the proof in the older version of \cite{yin2021efficient} (This is a \href{https://arxiv.org/pdf/2108.05533v2.pdf}{link} to the version) that didn't have the Politex result yet (which worked by first noting that the trace of $V_\mathcal{C}^{-1}$ must always be greater than zero since $V_\mathcal{C}^{-1}$ is positive definite for all $\mathcal{C}$ and equal to exactly $d/\lambda$ when the core set is empty, then showing that each time you add a new feature to the core set the trace of $V_\mathcal{C}^{-1}$ decreases by $\tau^2/(1+\tau)$).
    The only difference is that the term $\sum_{i=1}^d \alpha_i^2 = ||\alpha||_2^2$ in equation A.7 is bounded by $m^2$ instead of $1^2$, scaling the final bound by a factor of $m^2$.
\end{proof}

\subsubsection{Logarithmic Dependence on $m$}
Using the analysis of the newer version of \cite{yin2021efficient} for the size of the core set,
we get the following upper bound for the size of the core set:
\begin{align}
    \tilde{C}_{\max} := \frac{e}{e-1} \frac{1 + \tau}{\tau} d \left( 
        \log(1 + \frac{1}{\tau}) +
        \log(1 + \frac{m}{\lambda})
    \right) \label{eq:cmax-new}
\end{align}

\subsection{Policy Evaluation}
We have two different algorithms for multi-agent-confident rollout,
and each one results in a different bound for policy evaluation.
Therefore, in this section, we state these two algorithms for multi-agent rollout
alongside their theoretical guarantees and computational complexities.
In the next section, we use these results for policy improvement and obtain the final sub-optimality for these algorithms.

\subsubsection{Default Action Vector Approach (DAV)}
This approach aims to leverage our understanding of why lines 5-11 were necessary (namely Reason 1 and Reason 2) to modify Algorithm \ref{alg:confident rollout} in a clever way such that we only need to check if $O(\sum_{i=1}^m |A^{(i)}|)$ action vectors are in the good set.
We introduce the notation $(x^{(i)}, y^{(-i)}) = (y^{(1)}, ..., x^{(i)}, ..., y^{(m)})$.
We define a \textit{default action vector} $\bar a^{(1:m)} \in \mathcal{A}^{(1:m)}$. 
It is just a fixed action vector that must be selected at the beginning of Algorithm 1 from \cite{yin2021efficient} and held constant throughout.
Thus, whenever we refer to the default action vector we are always talking about the exact same vector and thus the policy does not change for different instances of Algorithm \ref{alg:confident rollout da}. 
The modified algorithm is presented as Algorithm \ref{alg:confident rollout da}, where the differences between Algorithm \ref{alg:confident rollout} and Algorithm \ref{alg:confident rollout da} are in blue. 
For completeness we also include Algorithm \ref{alg:confident ma mc-lspi dav} (modified Algorithm 1 from \cite{yin2021efficient}) which makes calls to Algorithm \ref{alg:confident rollout da}.
The differences between Algorithm \ref{alg:confident ma mc-lspi dav} and Algorithm 1 from \cite{yin2021efficient} are indicated in blue.

\begin{algorithm}
\caption{Multi-Agent(MA)-Confident Rollout with Default Action Vector(DAV)} \label{alg:confident rollout da}  
\begin{algorithmic}[1]
\State \textbf{Input:} number of rollouts $n$, length of rollouts $H$, rollout policy $\pi$, discount $\gamma$, initial state $s_0$, initial action vector $a_0^{(1:m)}$, \textcolor{blue}{default action vector $\bar a^{(1:m)}$}, feature matrix $\Phi_C$, regularization coefficient $\lambda$, threshold $\tau$.
\For {$i = 1, ..., n$}
    \State $s_{i, 0} \gets s_0, a_{i, 0} \gets a_0$, query the simulator, obtain reward $r_{i, 0} \gets r(s_{i, 0}, a_{i, 0})$, and the next state $s_{i, 1}$.
    \For {$t = 1, ..., H$} 
        \If {\textcolor{blue}{$\phi(s_{i, t}, \bar a^{(1:m)})^\top (\Phi_C^\top \Phi_C + \lambda I)^{-1} \phi(s_{i, t}, \bar a^{(1:m)}) > \tau$}} \Comment{Ensure $\phi(s_{i, t}, \bar a^{(1:m)})$ is in the good set}
            \State \textcolor{blue}{$\mathcal{C} \gets \mathcal{C} \cup \{(s_{i, t}, \bar a^{(1:m)}, \phi(s_{i, t}, \bar a^{(1:m)}), \text{none})\}$}
        \EndIf
        \For {\textcolor{blue}{$j \in [m]$}}
            \For {\textcolor{blue}{$a^{(j)} \in \mathcal{A}^{(j)}$}}
                \State \textcolor{blue}{Compute feature $\phi(s_{i,t}, (a^{(j)}, \bar a^{(-j)}))$.}
                \If {\textcolor{blue}{$\phi(s_{i, t}, (a^{(j)}, \bar a^{(-j)}))^\top (\Phi_C^\top \Phi_C + \lambda I)^{-1} \phi(s_{i, t}, (a^{(j)}, \bar a^{(-j)})) > \tau$}}
                    \State \textcolor{blue}{status $\gets$ uncertain, result $\gets (s_{i, t}, (a^{(j)}, \bar a^{(-j)}), \phi(s_{i, t}, (a^{(j)}, \bar a^{(-j)})), \text{none})$}
                    \State \Return {status, result}
                \EndIf
            \EndFor 
        \EndFor 
        \State \textcolor{blue}{$a_{i, t}^{(1:m)} \sim \pi(\cdot | s_{i,t})$} 
       
       
        \State Query the simulator with $s_{i, t}, a_{i, t}^{(1:m)}$, obtain reward $r_{i, t} \gets r(s_{i, t}, a_{i, t}^{(1:m)})$, and next state $s_{i, {t+1}}$.
    \EndFor 
\EndFor
\State status $\gets$ done, result $\gets \frac{1}{n} \sum_{i=1}^n \sum_{t=0}^H \gamma^t r_{i, t}$ 
\State \Return status, result
\end{algorithmic}
\end{algorithm}

\begin{algorithm}
\caption{Confident Multi-Agent MC-LSPI DAV} \label{alg:confident ma mc-lspi dav}  
\begin{algorithmic}[1]
\State \textbf{Input:} initial state $\rho$, \textcolor{blue}{default action vector $\bar a^{(1:m)}$}, initial policy $\pi_0$, number of iterations $K$, regularization coefficient $\lambda$, threshold $\tau$, discount $\gamma$, number of rollouts $n$, length of rollout $H$ 
\State {$\mathcal{C} \gets \emptyset$} 
\If {\textcolor{blue}{$\phi(\rho, \bar a^{(1:m)})^\top (\Phi_C^\top \Phi_C + \lambda I)^{-1} \phi(\rho, \bar a^{(1:m)}) > \tau$}} \Comment{Ensure $\phi(\rho, \bar a^{(1:m)})$ is in the good set}
    \State \textcolor{blue}{$\mathcal{C} \gets \mathcal{C} \cup \{(\rho, \bar a^{(1:m)}, \phi(\rho, \bar a^{(1:m)}), \text{none})\}$}
\EndIf
\For {\textcolor{blue}{$j \in [m]$}}
    \For {\textcolor{blue}{$a^{(j)} \in \mathcal{A}^{(j)}$}}
        \If {\textcolor{blue}{$\mathcal{C} = \emptyset \text{ or } \phi(\rho, (a^{(j)}, \bar a^{(-j)}))^\top (\Phi_C^\top \Phi_C + \lambda I)^{-1} \phi(\rho, (a^{(j)}, \bar a^{(-j)})) > \tau$}}
            \State \textcolor{blue}{$\mathcal{C} \gets \mathcal{C} \cup \{(\rho, (a^{(j)}, \bar a^{(-j)}), \phi(\rho, (a^{(j)}, \bar a^{(-j)})), \text{none})\}$}
        \EndIf
    \EndFor 
\EndFor 
\State $z_q \gets \text{none}, \, \forall z \in \mathcal{C}$ \quad \Comment{Policy iteration starts $(*)$}
\For {$k \in 1, \dots, K$}
    \For {$z \in \mathcal{C}$}
        \State status, result $\gets \text{MA-Confident Rollout with DAV}(n, H, \pi_{k-1}, \gamma, z_s, z_{a^{(1:m)}},$\textcolor{blue}{$\bar a^{(1:m)}$}, $\Phi_\mathcal{C}, \lambda, \tau)$
        \State \textbf{if} status $=$ done, \textbf{then} $z_q = \text{result}$; \textbf{else} $\mathcal{C} \gets \mathcal{C} \cup \{\text{result}\}$ and \textbf{goto} line $(*)$ 
    \EndFor 
    \State $w_k \gets (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \Phi_\mathcal{C}^\top q_\mathcal{C}$
    \State \textcolor{blue}{$\pi_k(a^{(1:m)}|s) \gets \mathbf{1}(a^{(1:m)} = (\argmax_{\tilde{a}^{(1)} \in \mathcal{A}^{(1)}} w_k^\top \phi_1(s, \tilde{a}^{(1)}), ..., \argmax_{\tilde{a}^{(m)} \in \mathcal{A}^{(m)}} w_k^\top \phi_m(s, \tilde{a}^{(m)})))$}
\EndFor
\State \Return $w_{K-1}$. 
\end{algorithmic}
\end{algorithm}


Define the polynomial sized set of modified default action vectors as $\bar \mathcal{A}^{(1:m)} = \{ (a^{(i)}, \bar a^{(-i)}): a^{(i)} \in \mathcal{A}^{(i)}, \ i \in [m] \}$.
Define the set of states for which all the modified default action vectors are in the good set as $\bar \mathcal{S} = \{ s \in \mathcal{S}: ||\phi(s, a^{(1:m)})||_{V_\mathcal{C}^{-1}}^2 \le \tau, \forall a^{(1:m)} \in \bar \mathcal{A}^{(1:m)} \}$.

Notice that we cannot satisfy Reason 2 unless we are sure that $\phi(s,  a^{(1:m)}) \in \mathcal{H}$ for all $(s,  a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$.
In the next subsection we ensure that for a given state all $\Pi_{i=1}^m |A^{(i)}|$ feature vectors are in the good set in $\sum_{i=1}^m |A^{(i)}|$ computation time, while in this subsection we choose to redefine $\tilde Q$. 

Consider iteration $k$ of policy iteration.
We redefine the VA value function as 
\begin{equation*}
\tilde Q_{k-1}(s, a^{(1:m)})=\begin{cases}
      \tilde w_k^\top \phi(s, a^{(1:m)}) \quad & \text{if } s \in \bar \mathcal{S}, \\
      Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)}) \quad & \text{if } s \in \mathcal{S} \backslash \bar \mathcal{S}.
    \end{cases}
\end{equation*}
We define the VA policy as (this definition is the same as in \cite{yin2021efficient}, but our $\tilde Q_{k-1}$ is different) 
\begin{equation*}
   \tilde{\pi}_k (a^{(1:m)}|s) = \mathbf{1} \left(a^{(1:m)} = \argmax_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \tilde Q_{k-1}(s, a^{(1:m)})\right)
\end{equation*}
where $\tilde w_k = V_\mathcal{C}^{-1} \Phi_\mathcal{C}^\top \tilde q_\mathcal{C}$, and $\tilde q_\mathcal{C}$, including all other details about the VA, are the same as explained in B.1 of the Appendix in \cite{yin2021efficient}.

\subsubsection*{A Bound for Policy Evaluation for all Actions}

Our goal is to ensure that $||\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})||_\infty$ is bounded.
The purpose of this is so that we can use Lemma B.3 from \cite{yin2021efficient}. 

For completeness, we first state a slight modification of Lemma B.2 from \cite{yin2021efficient} that uses the fact that the approximation error $\epsilon = 0$ in our case due to Assumption \ref{ass: ass1}. 
As done in \cite{yin2021efficient}, we will use $w_\pi$ to denote the parameter vector that ensures $||w_\pi^\top \phi(s, a^{(1:m)}) - Q_\pi(s, a^{(1:m)})||_\infty = 0$ (Assumption \ref{ass: ass1}). 

\begin{lemma}[DAV modified Lemma B.2 from \cite{yin2021efficient}]
\label{lemma: mod b.2}
Suppose that Assumption \ref{ass: ass1} holds. 
Let all terms be as defined earlier and let $\theta > 0$. 
Then, with probability at least 
$$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
for any $(s,  a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ pair such that $\phi(s, a^{(1:m)}) \in \mathcal{H}$, we have 
$$|\tilde{w}_k^\top \phi(s, a^{(1:m)}) - w_{\tilde\pi_{k-1}}^\top \phi(s, a^{(1:m)})| \le b\sqrt{\lambda \tau} + \left( \frac{\gamma^{H-1}}{1 - \gamma} + \theta \right) \sqrt{\tau C_{\text{max}}} := \bar\eta$$
\end{lemma}

\begin{proof}
The proof is identical to that of Lemma B.2 from \cite{yin2021efficient} with the triangle inequality for the approximation error step removed and then the approximation error set to zero due to our Assumption \ref{ass: ass1}.
Also $\tilde{Q}_{k-1}(s, a^{(1:m)})$ in Lemma B.2 from \cite{yin2021efficient} was replaced with $\tilde{w}_k^\top \phi(s, a^{(1:m)})$, since Lemma B.2 only holds for features in the good set, in which case $\tilde{Q}_{k-1}(s, a^{(1:m)}) = \tilde{w}_k^\top \phi(s, a^{(1:m)})$ (where $\tilde{Q}_{k-1}$ in the last line is as defined in \cite{yin2021efficient}).
\end{proof}

We now show that $||\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})||_\infty$ can be bounded.

\begin{prop}[approximate value function bound for DAV]
\label{prop: approx value function bound for DAV}
Suppose that Assumption \ref{ass: ass1} holds. 
Let all terms be as defined earlier and let $\theta > 0$. 
Then, with probability at least 
$$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
we have
$$||\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})||_\infty \le \bar\eta (2m-1) := \eta_1.$$ 
\end{prop}

\begin{proof}

For any $(s, a^{(1:m)}) \in (\bar \mathcal{S} \times \mathcal{A}^{(1:m)})$, we have
\begin{align}
    & |\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| \nonumber \\
    &= |\tilde{w}_k^\top \phi(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| \nonumber \\
    &= |\tilde{w}_k^\top \phi(s, a^{(1:m)}) \pm w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| \nonumber \\
    &\le |\tilde{w}_k^\top \phi(s, a^{(1:m)}) - w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)})| + |w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| \nonumber \\
    &\le |\tilde{w}_k^\top \phi(s, a^{(1:m)}) - w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)})| + 0 \nonumber \\
    &= |\tilde{w}_k^\top \phi(s, a^{(1:m)}) - w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)}) \pm (m-1)\tilde{w}_k^\top \phi(s, \bar a^{(1:m)}) \pm (m-1) w_{\tilde{\pi}_{k-1}}^\top \phi(s, \bar a^{(1:m)})| \nonumber \\
    &= \left|\sum_{i=1}^m \left( \tilde{w}_k^\top \phi(s, (a^{(i)}, \bar a^{(-i)})) - w_{\tilde\pi_{k-1}}^\top \phi(s, (a^{(i)}, \bar a^{(-i)})) \right) + (m-1)\left[w_{\tilde{\pi}_{k-1}}^\top \phi(s, \bar a^{(1:m)}) - \tilde{w}_k^\top \phi(s, \bar a^{(1:m)})\right]\right| \nonumber \\
    &\le m \bar\eta + (m-1) \bar \eta \nonumber \\ 
    &= \bar\eta (2m-1)  \label{value function bound 1}
\end{align}
where the last inequality holds by the triangle inequality and Lemma \ref{lemma: mod b.2} (because the features of all the state action pairs considered are in $\mathcal{H}$, since $s \in \bar \mathcal{S}$).

While for any $(s, a^{(1:m)}) \in ((\mathcal{S} \backslash \bar \mathcal{S}) \times \mathcal{A}^{(1:m)})$, we have
\begin{align}
    |\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| 
    = |Q_{\tilde \pi_{k-1}}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})|
    &= 0 \label{value function bound 2}
\end{align}
\end{proof}




\subsubsection{Efficient Good Set Search Approach (EGSS)} \label{subsec:good set search}

In this subsection we aim to satisfy Reason 2 in polynomial time.
Namely, we ensure that for a given state all $\Pi_{i=1}^m |A^{(i)}|$ feature vectors are in the good set in $O(\sum_{i=1}^m |A^{(i)}|)$ computation time.

\begin{lemma}[Efficient good set search]
\label{lemma:good set search}
With all terms as defined earlier. 
One can ensure, with computation time $2 d^2 \sum_{i=1}^m |A^{(i)}|$ that either
$$\phi(s, a^{(1:m)})^\top V_\mathcal{C}^{-1} \phi(s, a^{(1:m)}) \le \sqrt{d}\tau$$
for all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ or there exists an $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ such that
$$\phi(s, a^{(1:m)})^\top V_\mathcal{C}^{-1} \phi(s, a^{(1:m)}) > \tau$$
\end{lemma}

\begin{proof}
Recall that we are able to compute $\max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \langle u, \phi(s, a^{(1:m)}) \rangle$ for any $u \in \mathbb{R}^d$ in $d \sum_{i=1}^m |A^{(i)}|$ time (due to the linear decomposition of the features for each agent).
Now, we make use of a bidirectional 2-norm to $\infty$-norm inequality that will take advantage of the above mentioned efficient computation.

Fix $\mathcal{C}$ and define the lower triangular matrix $L$ via the Cholesky decomposition $V_\mathcal{C}^{-1} = V^{-1} = L L^\top$.
Define $\{e_i\}_{i=1}^d$ as the standard basis vectors and 
\begin{equation*}
(v^*, a_\text{max}^{(1:m)}) := \text{arg} \left(\max_{v \in \{\pm e_i\}_{i=1}^d} \max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \langle L v, \phi(s, a^{(1:m)}) \rangle \right) 
\end{equation*}
Then we have that
\begin{align}
   \frac{1}{\sqrt{d}} \| \phi(s, a_\text{max}^{(1:m)}) \|_{V^{-1}} 
   &= \frac{1}{\sqrt{d}} \sqrt{\phi(s, a_\text{max}^{(1:m)})^\top V^{-1} \phi(s, a_\text{max}^{(1:m)})} \nonumber \\
   &= \frac{1}{\sqrt{d}} \sqrt{\phi(s, a_\text{max}^{(1:m)})^\top L L^\top \phi(s, a_\text{max}^{(1:m)})} \nonumber \\
   &= \frac{1}{\sqrt{d}} \| L^\top \phi(s, a_\text{max}^{(1:m)}) \|_2 \nonumber \\
   &\le \max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \| L^\top \phi(s, a^{(1:m)})\|_\infty \nonumber \\
   &= \max_{v \in \{\pm e_i\}_{i=1}^d} \max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \langle v, L^\top \phi(s, a^{(1:m)}) \rangle \nonumber \\
   &= \max_{v \in \{\pm e_i\}_{i=1}^d} \max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \langle L v, \phi(s, a^{(1:m)}) \rangle \nonumber \\
   &= \langle L v^*, \phi(s, a^{(1:m)}_\text{max}) \rangle  \label{inf-norm term} \\
   &\le \| L^\top \phi(s, a_\text{max}^{(1:m)}) \|_2 \nonumber 
\end{align}
Notice that the purpose of writing all the equalities up to equation (\ref{inf-norm term}) was to show that equation (\ref{inf-norm term}) can be computed in $2 d^2 \sum_{i=1}^m |A^{(i)}|$ time, 
since $\max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \langle L v, \phi(s, a^{(1:m)}) \rangle$ can be computed in $d \sum_{i=1}^m |A^{(i)}|$ time and $\{\pm e_i\}_{i=1}^d$ contains $2d$ elements. 
Moreover, for $v = \pm e_i$ the vector $L v$ is simply (plus or minus) the $i$-th column of $L$, so forming it requires no matrix-vector product.
Also, note that $L$ can be computed with $d^2$ computation in each loop by doing a \href{https://en.wikipedia.org/wiki/Cholesky_decomposition#Rank-one_update}{rank one update to the Cholesky decomposition} of $V^{-1} = L L^\top$.
For further details see \ref{vec and eff implementation}.

If equation (\ref{inf-norm term}) is larger than $\tau$, then $\|\phi(s, a_\text{max}^{(1:m)}) \|_{V^{-1}}^2 > \tau$.
While, if equation (\ref{inf-norm term}) is less than or equal $\tau$, then $\|\phi(s, a_\text{max}^{(1:m)}) \|_{V^{-1}}^2 \le \sqrt{d}\tau$, completing the proof.

\end{proof}

We now introduce Algorithm \ref{alg:confident rollout egss}, which makes use of the result from Lemma \ref{lemma:good set search} in lines 5-11.
For completeness we also include Algorithm \ref{alg:confident ma mc-lspi egss} (modified Algorithm 1 from \cite{yin2021efficient}) which makes calls to Algorithm \ref{alg:confident rollout egss}.
Notice that Lemma \ref{lemma:good set search} provides a weaker guarantee if lines 5-11 in Algorithm \ref{alg:confident rollout egss} run without termination than if lines 5-11 in Algorithm \ref{alg:confident rollout} run without termination.
Specifically, if lines 5-11 in Algorithm \ref{alg:confident rollout egss} run without termination, then Lemma \ref{lemma:good set search} guarantees that $||\phi(s, a^{(1:m)})||_{V_\mathcal{C}^{-1}}^2 \le \sqrt{d}\tau$ for all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
While if lines 5-11 in Algorithm \ref{alg:confident rollout} run without termination, then $||\phi(s, a^{(1:m)})||_{V_\mathcal{C}^{-1}}^2 \le \tau$ for all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$, since all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ were checked. 
Although the guarantee $||\phi(s, a^{(1:m)})||_{V_\mathcal{C}^{-1}}^2 \le \sqrt{d}\tau$ in Algorithm \ref{alg:confident rollout egss} introduces an extra $\sqrt{d}$ factor in the suboptimality of policy evaluation (when compared to Algorithm \ref{alg:confident rollout}, as we shall show next), it does not need to perform computation exponential in the number of agents (i.e. $\Pi_{i=1}^m |A^{(i)}|$ computation) like Algorithm \ref{alg:confident rollout}. 

\begin{algorithm}
\caption{Multi-Agent(MA)-Confident Rollout with Efficient Good Set Search(EGSS)} \label{alg:confident rollout egss}  
\begin{algorithmic}[1]
\State \textbf{Input:} number of rollouts $n$, length of rollouts $H$, rollout policy $\pi$, discount $\gamma$, initial state $s_0$, initial action vector $a_0^{(1:m)}$, feature matrix $\Phi_C$, regularization coefficient $\lambda$, threshold $\tau$.
\For {$i = 1, ..., n$}
    \State $s_{i, 0} \gets s_0, a_{i, 0} \gets a_0$, query the simulator, obtain reward $r_{i, 0} \gets r(s_{i, 0}, a_{i, 0})$, and the next state $s_{i, 1}$.
    \For {$t = 1, ..., H$} 
       
       
       
       
       
       
       
       
       
       
       
        \State \textcolor{blue}{max\_uncertainty, $a_\text{max}^{(1:m)} \gets \text{EGSS}(s_{i, t}, \lambda, \Phi_\mathcal{C})$}
        \If {\textcolor{blue}{$\text{max\_uncertainty} > \tau$}}
                \State \textcolor{blue}{status $\gets$ uncertain, result $\gets (s_{i, t}, a_\text{max}^{(1:m)}, \phi(s_{i, t}, a_\text{max}^{(1:m)}), \text{none})$}
                \State \Return {status, result}
        \EndIf
       
       
       
       
       
        \State \textcolor{blue}{$a_{i, t}^{(1:m)} \sim \pi(\cdot | s_{i,t})$} 
       
       
        \State Query the simulator with $s_{i, t}, a_{i, t}^{(1:m)}$, obtain reward $r_{i, t} \gets r(s_{i, t}, a_{i, t}^{(1:m)})$, and next state $s_{i, {t+1}}$.
    \EndFor 
\EndFor
\State status $\gets$ done, result $\gets \frac{1}{n} \sum_{i=1}^n \sum_{t=0}^H \gamma^t r_{i, t}$ 
\State \Return status, result
\end{algorithmic}
\end{algorithm}

\begin{algorithm}
\caption{Confident Multi-Agent MC-LSPI EGSS} \label{alg:confident ma mc-lspi egss}  
\begin{algorithmic}[1]
\State \textbf{Input:} initial state $\rho$, initial policy $\pi_0$, number of iterations $K$, regularization coefficient $\lambda$, threshold $\tau$, discount $\gamma$, number of rollouts $n$, length of rollout $H$ 
\State {$\mathcal{C} \gets \emptyset$} 
   
   
   
   
   
\State \textcolor{blue}{max\_uncertainty, $a_\text{max}^{(1:m)} \gets \text{EGSS}(\rho, \lambda, \Phi_\mathcal{C})$}
\While {\textcolor{blue}{$\mathcal{C} = \emptyset \text{ or } \text{max\_uncertainty} > \tau$}}
    \State \textcolor{blue}{$\mathcal{C} \gets \mathcal{C} \cup \{(\rho, a_\text{max}^{(1:m)}, \phi(\rho, a_\text{max}^{(1:m)}), \text{none})\}$}
    \State \textcolor{blue}{max\_uncertainty, $a_\text{max}^{(1:m)} \gets \text{EGSS}(\rho, \lambda, \Phi_\mathcal{C})$}
\EndWhile
\State $z_q \gets \text{none}, \, \forall z \in \mathcal{C}$ \quad \Comment{Policy iteration starts $(*)$}
\For {$k \in 1, \dots, K$}
    \For {$z \in \mathcal{C}$}
        \State status, result $\gets \text{MA-Confident Rollout with EGSS}(n, H, \pi_{k-1}, \gamma, z_s, z_{a^{(1:m)}},\Phi_\mathcal{C}, \lambda, \tau)$
        \State \textbf{if} status $=$ done, \textbf{then} $z_q = \text{result}$; \textbf{else} $\mathcal{C} \gets \mathcal{C} \cup \{\text{result}\}$ and \textbf{goto} line $(*)$ 
    \EndFor 
    \State $w_k \gets (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \Phi_\mathcal{C}^\top q_\mathcal{C}$
    \State \textcolor{blue}{$\pi_k(a^{(1:m)}|s) \gets \mathbf{1}(a^{(1:m)} = (\argmax_{\tilde{a}^{(1)} \in \mathcal{A}^{(1)}} w_k^\top \phi_1(s, \tilde{a}^{(1)}), ..., \argmax_{\tilde{a}^{(m)} \in \mathcal{A}^{(m)}} w_k^\top \phi_m(s, \tilde{a}^{(m)})))$}
\EndFor
\State \Return $w_{K-1}$. 
\end{algorithmic}
\end{algorithm}

\begin{algorithm}
\caption{EGSS} \label{alg:egss}  
\begin{algorithmic}[1]
\State \textbf{Input:} {\textcolor{blue}{state $s$, regularization coefficient $\lambda$, uncertainty matrix $\Phi$}}  
\State {\textcolor{blue}{$L \gets \text{Cholesky}((\Phi^\top \Phi + \lambda I)^{-1})$ \Comment{where $L L^\top = (\Phi^\top \Phi + \lambda I)^{-1}$}}}
\State \textcolor{blue}{max\_uncertainty $\gets 0$}
\For {\textcolor{blue}{$v \in \{\pm e_l\}_{l = 1}^d$}}
    \State \textcolor{blue}{$a_\text{temp}^{(1:m)} \gets \emptyset$}
    \For {\textcolor{blue}{$j \in [m]$}}
        \State \textcolor{blue}{$a_\text{temp}^{(1:m)}.\text{append}\left(\argmax_{a^{(j)} \in \mathcal{A}^{(j)}} \phi_j(s, a^{(j)})^\top L v \right)$}
    \EndFor 
    \If {\textcolor{blue}{$\left[\phi(s, a_\text{temp}^{(1:m)})^\top L v\right]^2 \ge \text{max\_uncertainty}$}}
        \State {\textcolor{blue}{$a_\text{max}^{(1:m)} \gets  a_\text{temp}^{(1:m)}$}}
        \State {\textcolor{blue}{$\text{max\_uncertainty} \gets  \left[\phi(s, a_\text{max}^{(1:m)})^\top L v\right]^2$}}
    \EndIf
\EndFor 
\State \Return \textcolor{blue}{max\_uncertainty, $a_\text{max}^{(1:m)}$}. 
\end{algorithmic}
\end{algorithm}


We now explicitly show how the suboptimality of policy evaluation is affected by this change.

\begin{lemma}[EGSS modified Lemma B.2 from \cite{yin2021efficient}]
\label{lemma: mod b.2 egss}
Suppose that Assumption \ref{ass: ass1} holds. 
With all terms as defined earlier and $\theta > 0$. 
Then, with probability at least 
$$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
for any $(s,  a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ pair such that $\phi(s, a^{(1:m)}) \in \mathcal{H}$, we have 
$$|\tilde{w}_k^\top \phi(s, a^{(1:m)}) - w_{\tilde\pi_{k-1}}^\top \phi(s, a^{(1:m)})| \le b\sqrt{\lambda \sqrt{d} \tau} + \left( \frac{\gamma^{H+1}}{1 - \gamma} + \theta \right) \sqrt{\sqrt{d} \tau C_{\text{max}}} := \eta_2$$
\end{lemma}

\begin{proof}
The proof is identical to that of Lemma B.2 from \cite{yin2021efficient}, except that $\tau$ is replaced with $\sqrt{d} \tau$ everywhere, due to the weaker guarantee provided by lines 5-11 of Algorithm \ref{alg:confident rollout egss}, as discussed above. 
\end{proof}
Since the virtual policy has access to the true value function for the elements outside of $\mathcal{H}$,
we have the following bound on the error of policy evaluation for EGSS.

\begin{prop}[approximate value function bound for EGSS]
\label{prop: approx value function bound for EGSS}
Suppose that Assumption \ref{ass: ass1} holds. 
With all terms as defined earlier and $\theta > 0$. 
Then, with probability at least 
$$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
we have
$$||\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})||_\infty \le \eta_2.$$ 
\end{prop}

\subsubsection{Vectorized and Efficient Implementation} \label{vec and eff implementation}
We describe how calculating equation (\ref{inf-norm term}) can be implemented in a vectorized way, such that it is more efficient in code.
Stacking the elements of $\{+ e_i\}_{i=1}^d$ as the columns of a matrix we can write it as the identity matrix $I \in \mathbb{R}^{d \times d}$.
Similarly stacking the elements of $\{- e_i\}_{i=1}^d$ as the columns of a matrix we can write it as the negative identity matrix $-I \in \mathbb{R}^{d \times d}$.
Hence, stacking the elements of $\{\pm e_i\}_{i=1}^d$ as the columns of a matrix we can write it as $[I, -I] \in \mathbb{R}^{d \times 2d}$.
Define $X = [LI, -LI]^\top = [L, -L]^\top$.
Fix a state $s \in \mathcal{S}$.
Then, 
\begin{align*}
& \max_{v \in \{\pm e_i\}_{i=1}^d} \langle Lv, \phi(s, a^{(1:m)}) \rangle = \max_{i \in [2d]} (X \phi(s, a^{(1:m)}))_i & a^{(1:m)} \in \mathcal{A}^{(1:m)}
\end{align*}
Define $\Phi_{\mathcal{A}^{(1:m)}} = [\phi(s, a^{(1:m)}_1), ..., \phi(s, a^{(1:m)}_g)]$ where $g = \prod_{j=1}^m A_j$ and $\{a^{(1:m)}_k\}_{k=1}^g = \mathcal{A}^{(1:m)}$. 
Then, 
\begin{align*}
& \max_{v \in \{\pm e_i\}_{i=1}^d} \max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \langle Lv, \phi(s, a^{(1:m)}) \rangle = \max_{(i, k) \in [2d] \times [g]} (X \Phi_{\mathcal{A}^{(1:m)}})_{i, k}
\end{align*}
Since the features can be written as a sum of the features of each agent (i.e.\ $\phi(s, a^{(1:m)}) = \sum_{j=1}^m \phi_j(s, a^{(j)})$), we define $\Phi_{\mathcal{A}^{(j)}} = [\phi_j(s, a^{(j)}_1), ..., \phi_j(s, a^{(j)}_{A_j})]$ where $j \in [m]$.
Then equation (\ref{inf-norm term}) can be computed as,
\begin{align*}
& \max_{v \in \{\pm e_i\}_{i=1}^d} \max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \langle Lv, \phi(s, a^{(1:m)}) \rangle = \max_{i \in [2d]} \left( \max_{k \in [A_1]}(X \Phi_{\mathcal{A}^{(1)}})_{:, k} + ... + \max_{k \in [A_m]}(X \Phi_{\mathcal{A}^{(m)}})_{:, k} \right)_i
\end{align*}
where $\max_{k \in [A_j]}(X \Phi_{\mathcal{A}^{(j)}})_{:, k} = [\max_{k \in [A_j]}(X \Phi_{\mathcal{A}^{(j)}})_{1, k}, ..., \max_{k \in [A_j]}(X \Phi_{\mathcal{A}^{(j)}})_{2d, k}]^\top$.

We now mention another useful technique to speed up computation.
Notice that
\begin{align*}
\max_{k \in [A_1]}(X \Phi_{\mathcal{A}^{(1)}})_{:, k} = \left[\max_{k \in [A_1]}(L^\top \Phi_{\mathcal{A}^{(1)}})_{:, k}, \max_{k \in [A_1]}(-L^\top \Phi_{\mathcal{A}^{(1)}})_{:, k}\right] 
\end{align*}
and that $(L^\top \Phi_{\mathcal{A}^{(j)}})_k = L^\top \phi_j(s, a^{(j)}_k)$ where $k \in [A_j]$ and $\{a^{(j)}_k\}_{k \in [A_j]} = \mathcal{A}^{(j)}$. 

Define the Cholesky decomposition of $V = M M^\top$. 
Then, $V^{-1} = L L^\top = (M M^\top)^{-1} = M^{-\top} M^{-1}$.
Notice that everything above would hold with $L$ replaced with $M^{-\top}$, since we only require that
$$\phi(s, a_\text{max}^{(1:m)})^\top V_\mathcal{C}^{-1} \phi(s, a_\text{max}^{(1:m)}) = \phi(s, a_\text{max}^{(1:m)})^\top L L^\top \phi(s, a_\text{max}^{(1:m)}) = \phi(s, a_\text{max}^{(1:m)})^\top M^{-\top} M^{-1} \phi(s, a_\text{max}^{(1:m)})$$

Now, with $L = M^{-\top}$ we have $L^\top \phi_j(s, a^{(j)}_k) = M^{-1} \phi_j(s, a^{(j)}_k)$.
Define $y = M^{-1} \phi_j(s, a^{(j)}_k)$. 
Then, $y$ can be calculated by solving for $y$ in $M y = \phi_j(s, a^{(j)}_k)$, which can be done by using forward substitution since $M$ is lower triangular.
Define the forward substitution function as $\text{FS}(M, \phi_j(s, a^{(j)}_k)) = M^{-1} \phi_j(s, a^{(j)}_k)$ and $\text{FS}(M, \Phi_{\mathcal{A}^{(j)}}) = M^{-1} \Phi_{\mathcal{A}^{(j)}}$.

Then,
\begin{align*}
\max_{k \in [A_j]}(X \Phi_{\mathcal{A}^{(j)}})_{:, k} = \left[\max_{k \in [A_j]}(\text{FS}(M, \Phi_{\mathcal{A}^{(j)}}))_{:, k}, \max_{k \in [A_j]}(\text{FS}(-M, \Phi_{\mathcal{A}^{(j)}}))_{:, k}\right] 
\end{align*}

This only requires computing the Cholesky decomposition of $V = M M^\top$ instead of $V^{-1} = L L^\top$.
Thus, the inverse $V^{-1}$ need not be computed.
Further, recall that we fixed $\mathcal{C}$, but in general $V = V_\mathcal{C}$. 
Denote $\mathcal{C}^\prime$ as the core set after a new feature $\phi^\prime$ is added to it. 
Then $V_{\mathcal{C}^\prime} = V_{\mathcal{C}} + \phi^\prime {\phi^\prime}^\top$ and thus, the Cholesky decomposition of $V_{\mathcal{C}^\prime}$ can be computed using a \href{https://en.wikipedia.org/wiki/Cholesky_decomposition#Rank-one_update}{rank one Cholesky update}.



\subsection{Policy Improvement}
In this section we assume that we have a high probability bound on the policy evaluation
such as those for DAV (Proposition~\ref{prop: approx value function bound for DAV}) and EGSS (Proposition~\ref{prop: approx value function bound for EGSS}),
and show the sub-optimality of the policy obtained at the last iteration of policy improvement.
\begin{lemma}[approximate policy iteration]
\label{lemma:approximate policy iteration}
Suppose that we run $K$ approximate policy iterations and generate a sequence of policies
$\pi_0, \pi_1, \pi_2, \cdots, \pi_K$.
Suppose that for every $k = 1, 2, \cdots, K$, in the $k$-th iteration, we obtain a function
$\tilde{Q}_{k-1}$ such that, $\| \tilde{Q}_{k - 1} - Q_{\pi_{k - 1}} \|_\infty \leq \eta$,
and choose $\pi_k$ to be greedy with respect to $\tilde{Q}_{k-1}$.
Then
\begin{align*}
    \| Q^* - Q_{\pi_K} \|_\infty \leq \frac{2 \eta}{1 - \gamma} + \frac{\gamma^K}{1 - \gamma}.
\end{align*}
\end{lemma}
\begin{proof}
    This lemma is a direct consequence of the results in \cite{munos2003error} and \cite{farahmand2010error}.
\end{proof}

Under the same assumptions as Lemma~\ref{lemma:approximate policy iteration}, since $\| \tilde{Q}_{K-1} - Q_{\pi_{K-1}} \|_\infty \leq \eta$,
we know that:
\begin{align}
    \|Q^* - \tilde{Q}_{K-1} \|_\infty \leq \frac{3 \eta}{1 - \gamma} + \frac{\gamma^{K - 1}}{1 - \gamma} \label{eq: q function bound}
\end{align}

\begin{lemma}
[\cite{singh1994upper}]
\label{lemma: value function bound}
Let $\pi$ be greedy with respect to a function $Q$.
Then for any state $s$,
\begin{align*}
    V^*(s) - V_\pi(s) \leq \frac{2}{1 - \gamma} \| Q^* - Q \|_\infty.
\end{align*}
\end{lemma}

Using Lemma~\ref{lemma: value function bound} and equation \eqref{eq: q function bound}, and the fact that $\pi_K$ is greedy with respect to $\tilde{Q}_{K-1}$, we know that:
\begin{align}
    V^*(s) - V_{\pi_K}(s) \leq \frac{6 \eta}{(1 - \gamma)^2} + \frac{2 \gamma^{K - 1}}{(1 - \gamma)^2}. \label{eq: v function bound}
\end{align}

We obtained the results in Lemmas \ref{lemma: value function bound} and \ref{lemma:approximate policy iteration}
under the assumption that the policy evaluation error is bounded in all $K$ iterations of policy improvement.
However, we can only bound the policy evaluation error with high probability using Propositions \ref{prop: approx value function bound for DAV} and \ref{prop: approx value function bound for EGSS}.
Therefore, we need to use another union bound over all iterations of policy improvement to be able to use inequality \eqref{eq: v function bound}.
Since we have $K$ iterations of policy improvement the final bound holds with probability
\begin{align*}
    1 - 2KC_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n).
\end{align*}

\subsection{Main Algorithm}
The results stated so far are true for the virtual algorithm (with access to the true value function for $\phi(s, a^{(1:m)}) \notin \mathcal{H}$).
However, using the same arguments as section B.3 of \cite{yin2021efficient}, one can show that for the corresponding main algorithm
starting at state $\rho$, with probability at least
\begin{align*}
    1 - 4KC_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)
\end{align*}
we have
\begin{align}
    V^*(\rho) - V_{\pi_K}(\rho) \leq \frac{8 \eta}{(1 - \gamma)^2} + \frac{2 \gamma^{K - 1}}{(1 - \gamma)^2} \label{eq: v function bound main}
\end{align}

\subsection{Optimality for DAV and EGSS}
By plugging the policy evaluation bounds for DAV (Proposition~\ref{prop: approx value function bound for DAV}) and EGSS (Proposition~\ref{prop: approx value function bound for EGSS})
into the final sub-optimality bound \eqref{eq: v function bound main} we get the following two results.

\subsubsection{DAV Sub-Optimality}
By choosing appropriate parameters according to $\delta$ and $\kappa$, we can ensure that with probability of at least $1 - \delta$, we have:
\begin{align*}
    V^*(\rho) - V_{\pi_K}(\rho) \leq \kappa,
\end{align*}
by the following parameter initialization (see \ref{sec:parameter DAV})
\begin{align*}
    \tau &= 1\\
    \lambda &= \frac{\kappa^2(1 - \gamma)^4}{1024 b^2 (2m -1)^2}\\
    \theta &= \frac{\kappa(1- \gamma)^2}{32 (2m-1) \sqrt{C_{\text{max}}}}\\
    H &= \frac{
        \log \left ( 32 \sqrt{C_{\text{max}}} (2m - 1) \right)
        - \log \left( \kappa(1 - \gamma)^3 \right)
    }{
        \log(\frac{1}{\gamma})
    } - 1\\
    K &= \frac{\log\left(\frac{1}{\kappa(1 - \gamma)^2}\right) + \log(8)}{\log(\frac{1}{\gamma})} + 1 \\
    n &= \frac{\log(2KC_{\text{max}}) - \log(\delta)}{2 \theta^2(1-\gamma)^2} \\
    C_{\max} &= \frac{e}{e-1} \frac{1 + \tau}{\tau} d \left( 
        \log(1 + \frac{1}{\tau}) +
        \log(1 + \frac{m}{\lambda})
    \right) 
\end{align*}

\subsubsection{EGSS Sub-Optimality}
By choosing appropriate parameters according to $\delta$ and $\kappa$, we can ensure that with probability of at least $1 - \delta$, we have:
\begin{align*}
    V^*(\rho) - V_{\pi_K}(\rho) \leq \kappa,
\end{align*}
by the following parameter initialization (see \ref{sec:parameter EGSS})
\begin{align*}
    \tau &= 1\\
    \lambda &= \frac{\kappa^2(1 - \gamma)^4}{1024b^2 \sqrt{d}}\\
    \theta &= \frac{\kappa(1- \gamma)^2}{32 d^{\frac{1}{4}} C_{\text{max}}^{\frac{1}{2}}}\\
    H &= \frac{
        \log\left (32 d^{\frac{1}{4}} C_{\text{max}}^{\frac{1}{2}} \right)
        - \log(\kappa(1 - \gamma)^3)
    }{
        \log(\frac{1}{\gamma})
    } - 1\\
    K &= \frac{\log\left(\frac{1}{\kappa(1 - \gamma)^2}\right) + \log(8)}{\log(\frac{1}{\gamma})} + 1 \\
    n &= \frac{\log(2KC_{\text{max}}) - \log(\delta)}{2 \theta^2(1-\gamma)^2} \\
    C_{\max} &= \frac{e}{e-1} \frac{1 + \tau}{\tau} d \left( 
        \log(1 + \frac{1}{\tau}) +
        \log(1 + \frac{m}{\lambda})
    \right) 
\end{align*}
\subsubsection{Computational and Query costs}
Both algorithms can learn a $\kappa$-optimal policy with computational cost of $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}), \log(b), m, |\mathcal{A}|)$.
The query cost for DAV is $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}), \log(b), m)$, and for EGSS is $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}), \log(b), \log(m))$.


\begin{center}
\begin{tabular}{c c c c} 
 \hline
 & Query ($\epsilon = 0$) & Query ($\epsilon \ne 0$) & Sub-optimality ($\epsilon \ne 0$) \\ 
 \hline
 LSPI-DAV & $\tilde{\mathcal{O}} \left( \frac{\mathbf{m^2} d^3}{\kappa^2 (1-\gamma)^8} \right)$ &  $\tilde{\mathcal{O}} \left( \frac{d^2}{\epsilon^2 (1-\gamma)^4} \right)$ & $\tilde{\mathcal{O}} \left( \frac{\epsilon \sqrt{d} \mathbf{m}}{(1-\gamma)^2} \right)$\\ 
 \hline
 LSPI-EGSS & $\tilde{\mathcal{O}} \left( \frac{\mathbf{\sqrt{d}}d^3}{\kappa^2 (1-\gamma)^8} \right)$ & $\tilde{\mathcal{O}} \left( \frac{d^2}{\epsilon^2 (1-\gamma)^4} \right)$ & $\tilde{\mathcal{O}} \left( \frac{\epsilon \sqrt{d} \mathbf{d^{1/4}}}{(1-\gamma)^2} \right)$\\
 \hline
 LSPI-Kernel-DAV & $\tilde{\mathcal{O}} \left( \frac{\mathbf{m^2} \tilde \Gamma^3}{\kappa^2 (1-\gamma)^8} \right)$ & $\tilde{\mathcal{O}} \left( \frac{d^2}{\epsilon^2 (1-\gamma)^4} \right)$ & $\tilde{\mathcal{O}} \left( \frac{\epsilon \sqrt{\tilde \Gamma} \mathbf{m}}{(1-\gamma)^2} \right)$\\
 \hline
\end{tabular}
\end{center}

\section{Kernel Setting}

The kernelized setting is a standard extension of the finite-dimensional linear setup \citep{srinivas2009gaussian,abbasi2012online}. It lifts the restriction that features and parameter vector are elements of $\mathbb{R}^d$. Instead we require that the $Q_\pi$-function is contained in a reproducing kernel Hilbert space (RKHS). This includes cases where the linear dimension of function class is infinite.

The more general setup requires us to address two main challenges: First, the scaling of the sample complexity with the dimension $d$ needs to be improved to a notion of effective dimension that can be bounded for the RKHS of interest. Second, computationally we cannot directly work with infinite dimensional features $\phi(s,a)$. Instead, we need to rely on the `kernel trick' and compute all quantities of interest in the finite-dimensional data space. 

Formally\todoa{Added the assumptions for kernel case from here:}, for each agent $j \in [m]$, the function $k^{(j)} : (\mathcal{S} \times \mathcal{A}^{(1:m)})^2 \rightarrow \mathbb{R}$ is defined as

\begin{align}
    k^{(j)}(s_1, a_1^{(1:m)}, s_2, a_2^{(1:m)}) = k_j(s_1, a_1^{(j)}, s_2, a_2^{(j)}), \label{eq:rkhs_j}
\end{align}
where $k_j: (\mathcal{S} \times \mathcal{A}^{(j)})^2 \rightarrow \mathbb{R}$ is the underlying kernel function for agent $j$,
and $\mathcal{H}_j$ is the RKHS associated with it.

Based on definition \eqref{eq:rkhs_j}, it's easy to see that $\{k^{(j)}\}_{j \in [m]}$ is a set of kernel functions too, and they share the same vector space which is $V := \mathbb{R}^{\mathcal{S} \times \mathcal{A}^{(1:m)}}$.
However, they have different inner products on this space which produce a different RKHS for every $j \in [m]$.
We denote RKHS of $k^{(j)}$ as $\mathcal{H}^{(j)}$, and its inner product follows from equation \eqref{eq:rkhs_j} as
\begin{align}
    \langle k^{(j)}(s_1, a_1^{(1:m)}, \cdot, \cdot), k^{(j)}(s_2, a_2^{(1:m)}, \cdot, \cdot) \rangle_{\mathcal{H}^{(j)}} =
    \langle k_j(s_1, a_1^{(j)}, \cdot, \cdot), k_j(s_2, a_2^{(j)}, \cdot, \cdot) \rangle_{\mathcal{H}_j}. \label{eq:rkhs-inner-1}
\end{align}
By defining $\phi_j(s, a) := k_j(s,a, \cdot, \cdot) \in \mathcal{H}_j$ and $\phi^{(j)}(s, a^{(1:m)}) := k^{(j)}(s, a^{(1:m)}, \cdot, \cdot) \in \mathcal{H}^{(j)}$, we can rewrite \eqref{eq:rkhs-inner-1} for fixed $s_1, s_2, a_1^{(1:m)}, a_2^{(1:m)}$ as
\begin{align}
    \langle \phi^{(j)}(s_1, a_1^{(1:m)}), \phi^{(j)}(s_2, a_2^{(1:m)}) \rangle_{\mathcal{H}^{(j)}} &=
    \langle \phi_j(s_1, a_1^{(j)}), \phi_j(s_2, a_2^{(j)})\rangle_{\mathcal{H}_j}.\label{eq:rkhs-inner-1-rev}
\end{align}
Intuitively, equation \eqref{eq:rkhs-inner-1-rev} suggests that the inner product $\langle \cdot, \cdot \rangle_{\mathcal{H}^{(j)}}$ only depends on the state $s$, and the action taken by agent $j$.

Next, we define the joint additive kernel $k : (\mathcal{S} \times \mathcal{A}^{(1:m)})^2 \rightarrow \mathbb{R}$ as follows
\begin{align}
	k(s_1,a_1^{(1:m)}, s_2,a_2^{(1:m)}) &= \sum_{j=1}^m k^{(j)}(s_1,a_1^{(1:m)}, s_2,a_2^{(1:m)}) \label{eq:jak-def}\\
	&= \sum_{j=1}^m k_j(s_1,a_1^{(j)}, s_2,a_2^{(j)})\\
	&= \sum_{j=1}^m \langle k_j(s_1,a_1^{(j)}, \cdot, \cdot), k_j(s_2,a_2^{(j)}, \cdot, \cdot) \rangle_{\mathcal{H}_j},
\end{align}
and we denote its associated RKHS as $\mathcal{H}$. Again, note that $\mathcal{H}$ uses the same vector space, namely $V$, as all the $\mathcal{H}^{(j)}$s.

Now, we can restate Assumption \todoa{I think we still want to solve the problem under assumption one for the kernel case, right?}\ref{ass: ass1} for the kernel case.

\begin{assumption}[Assumption 1 for RKHS]
    \label{ass:kernel-2}
	For each (deterministic) policy $\pi$, there exists
	$f_\pi \in \mathcal{H}$, such that
	$Q_\pi(s, a^{(1:m)}) = \langle \phi(s,a^{(1:m)}), f_\pi \rangle_{\mathcal{H}}$.
\end{assumption}
Next, we show that there exists a function $f_\pi^{(i)} \in \mathcal{H}^{(i)}$ for each $i \in [m]$, such that:
\begin{align*}
    Q_\pi (s, a^{(1:m)}) &= \sum_{j=1}^m Q_\pi^{(j)}(s, a^{(1:m)})\\
    Q_\pi^{(i)} (s, a^{(1:m)}) &= \langle \phi^{(i)}(s,a^{(1:m)}), f_\pi^{(i)} \rangle_{\mathcal{H}^{(i)}}
\end{align*}
Or, there exist $f_{\pi, j} \in \mathcal{H}_j$ for $j \in [m]$, such that:
\begin{align*}
    Q_\pi (s, a^{(1:m)}) &= \sum_{j=1}^m Q_{\pi,j}(s, a^{(j)})\\
    Q_{\pi,j} (s, a^{(j)}) &= \langle \phi_j(s,a^{(j)}), f_{\pi,j} \rangle_{\mathcal{H}_j}
\end{align*}

\begin{proof}
    As $f_\pi$ is an element of $\mathcal{H}$ we know that it can be expressed in terms of the basis vectors of $\mathcal{H}$:
    \begin{align*}
        f_\pi = \sum_{i=1}^{\infty} \alpha_i k(s_i, a_i, \cdot, \cdot).
    \end{align*}
    From the definition of the joint additive kernel and the assumption \ref{ass:kernel-2} we have:
    \begin{align}
        Q_\pi (s, a^{(1:m)}) &= \langle \phi(s,a^{(1:m)}), f_\pi \rangle_{\mathcal{H}} \nonumber \\
        &= \langle \phi(s,a^{(1:m)}), \sum_{i=1}^{\infty} \alpha_i k(s_i, a_i, \cdot, \cdot) \rangle_{\mathcal{H}} \nonumber \\
        &= \sum_{i=1}^{\infty} \alpha_i \langle k(s,a^{(1:m)}, \cdot, \cdot), k(s_i, a_i, \cdot, \cdot) \rangle_{\mathcal{H}} \nonumber \\
        &= \sum_{i=1}^{\infty} \alpha_i k(s,a^{(1:m)}, s_i, a_i) \nonumber \\
        &= \sum_{i=1}^{\infty} \alpha_i \sum_{j=1}^m k^{(j)}(s,a^{(1:m)}, s_i, a_i) & \text{Based on }\ref{eq:jak-def}\label{eq:checkpoint}\\
        &= \sum_{j=1}^m \sum_{i=1}^{\infty} \alpha_i k^{(j)}(s,a^{(1:m)}, s_i, a_i) \nonumber \\
        &= \sum_{j=1}^m \sum_{i=1}^{\infty} \alpha_i \langle \phi^{(j)}(s,a^{(1:m)}) , \phi^{(j)}(s_i, a_i) \rangle_{\mathcal{H}^{(j)}} \nonumber\\
        &= \sum_{j=1}^m \langle \phi^{(j)}(s,a^{(1:m)}) , \underbrace{\sum_{i=1}^{\infty} \alpha_i \phi^{(j)}(s_i, a_i)}_{:=E^{(j)}(f_\pi) := f_\pi^{(j)}} \rangle_{\mathcal{H}^{(j)}} \nonumber \\
        &= \sum_{j=1}^{m} \langle \phi^{(j)}(s,a^{(1:m)}), f_\pi^{(j)} \rangle_{\mathcal{H}^{(j)}}. \nonumber
    \end{align}
    or from \eqref{eq:checkpoint} we have:
    \begin{align*}
        Q_\pi (s, a^{(1:m)})
        &= \sum_{i=1}^{\infty} \alpha_i \sum_{j=1}^m k^{(j)}(s,a^{(1:m)}, s_i, a_i) \\
        &= \sum_{i=1}^{\infty} \alpha_i \sum_{j=1}^m k_j(s,a^{(j)}, s_i, a_i^{(j)}) & \text{Based on }\ref{eq:rkhs_j}\\
        &= \sum_{i=1}^{\infty} \alpha_i \sum_{j=1}^m \langle \phi_j(s,a^{(j)}), \phi_j(s_i, a_i^{(j)}) \rangle_{\mathcal{H}_j}\\
        &= \sum_{j=1}^m \langle
        \phi_j(s,a^{(j)}),
        \underbrace{
        \sum_{i=1}^{\infty} \alpha_i \phi_j(s_i, a_i^{(j)})
        }_{:=E_j(f_\pi) :=f_{\pi, j}}
        \rangle_{\mathcal{H}_j}\\
        &= \sum_{j=1}^m \langle
        \phi_j(s,a^{(j)}),
        f_{\pi, j}
        \rangle_{\mathcal{H}_j}.
    \end{align*}
    We may need to show that $f_\pi^{(j)}$ and $f_{\pi, j}$ have finite norms in their corresponding Hilbert spaces.
\end{proof}




\paragraph{Kernelized Algorithm}

As before we can compute the ridge estimate
\begin{align}
	\hat Q_t = \argmin_{Q \in \mathcal{H}} \sum_{(s,a^{(1:m)})\in\mathcal{C}_t} (Q(s,a^{(1:m)}) - q_{(s,a^{(1:m)})})^2 + \lambda \|Q\|_{\mathcal{H}}^2 = (\Phi_{\mathcal{C}_t}\Phi_{\mathcal{C}_t}^\top + \lambda \mathbf{I}_\mathcal{H})^{-1}\Phi_{\mathcal{C}_t}q_{\mathcal{C}_t}
\end{align}
Here, $\mathbf{I}_{\mathcal{H}} : \mathcal{H} \rightarrow \mathcal{H}$ is the identity mapping, and $\Phi_{\mathcal{C}}^\top$ can be formally defined as map $\Phi_\mathcal{C}^\top : \mathcal{H} \rightarrow \mathbb{R}^{|\mathcal{C}|}, f \mapsto [f(s,a^{(1:m)})]_{(s,a^{(1:m)}) \in \mathcal{C}}, \, f \in \mathcal{H}$; and $\Phi_{\mathcal{C}} : \mathbb{R}^{|\mathcal{C}|} \rightarrow \mathcal{H}$ is the adjoint of $\Phi_{\mathcal{C}}^\top$.\todoj{somewhat more natural is to transpose the definition of $\Phi_{\mathcal{C}}$ but I wanted to be consistent with the earlier notation}

Using the `kernel trick' we express the estimator as follows
\begin{align}
	\hat Q_t = \Phi_{\mathcal{C}_t}(K_{\mathcal{C}_t} + \lambda \mathbf{I}_{t})^{-1}q_{\mathcal{C}_t}
\end{align}
where $K_{\mathcal{C}_t} = \Phi_{\mathcal{C}_t}^\top \Phi_{\mathcal{C}_t} \in \mathbb{R}^{t \times t}$ is the kernel matrix. Lastly, we can evaluate for any $s,a^{(1:m)}$:
\begin{align}
	\hat Q_t(s,a^{(1:m)}) = k_{\mathcal{C}_t}(s,a^{(1:m)})^\top(K_{\mathcal{C}_t} + \lambda \mathbf{I}_{t})^{-1}q_{\mathcal{C}_t}
\end{align}
where we defined $k_\mathcal{C}(s,a^{(1:m)}) = [k(s,a^{(1:m)}, s',a'^{(1:m)})]_{(s',a'^{(1:m)}) \in \mathcal{C}} \in \mathbb{R}^{|\mathcal{C}|}$ (for some fixed ordering of $\mathcal{C}$). Importantly, the last display only involves finite-dimensional quantities that can be computed from kernel evaluations. Moreover, since $k(s,a^{(1:m)},s',a'^{(1:m)}) = \sum_{j=1}^m k_j(s,a^{(j)}, s', a'^{(j)})$ we can write
\begin{align}
	\hat Q_t(s,a^{(1:m)}) = \sum_{j=1}^m k_{j, \mathcal{C}_t}(s,a^{(j)})^\top(K_{\mathcal{C}_t} + \lambda \mathbf{I}_{t})^{-1}q_{\mathcal{C}_t}
\end{align}
where $k_{j, \mathcal{C}}(s,a^{(j)}) = [k_j(s,a^{(j)}, s^\prime , a^{\prime(j)})]_{(s^\prime, a^{\prime(1:m)}) \in \mathcal{C}} \in \mathbb{R}^{|\mathcal{C}|}$. Hence we can still compute the maximizer independently for each agent.

The second quantity required by the algorithm is the squared norm $\|\phi(s,a^{(1:m)})\|_{(\Phi_\mathcal{C} \Phi_{\mathcal{C}}^\top + \lambda \mathbf{I}_\mathcal{H})^{-1}}^2$, where now $\phi(s,a^{(1:m)}) = k(s,a^{(1:m)}, \cdot, \cdot) \in \mathcal{H}$. Using the Woodbury identity, we can write 
\begin{align}
	\lambda (\Phi_\mathcal{C} \Phi_{\mathcal{C}}^\top + \lambda \mathbf{I}_\mathcal{H})^{-1} = \mathbf{I}_{\mathcal{H}} - \Phi_{\mathcal{C}} (K_\mathcal{C} + \lambda \mathbf{I}_{|\mathcal{C}|})^{-1}\Phi_{\mathcal{C}}^\top
\end{align}
Therefore the feature norm can be written using finite-dimensional quantities: 
\begin{align}
	\|\phi(s,a^{(1:m)})\|_{(\Phi_\mathcal{C} \Phi_{\mathcal{C}}^\top + \lambda \mathbf{I}_\mathcal{H})^{-1}}^2 = \frac{1}{\lambda} \left( k(s,a^{(1:m)},s,a^{(1:m)}) - k_\mathcal{C}(s,a^{(1:m)})^\top(K_\mathcal{C}+ \lambda \mathbf{I}_{|\mathcal{C}|})^{-1}k_{\mathcal{C}}(s,a^{(1:m)})\right)
\end{align}
With this, we can implement the DAV version of the algorithm directly. The EGSS is more tricky to implement, but this is potentially possible using eigenfunctions from Mercer's theorem.\todoj{I haven't seen this done even for UCB on kernel bandits, but there are related works for Thompson Sampling, e.g. \cite{mutny2018efficient}}


\paragraph{Analysis}
Our goal next is to extend the analysis to the kernel case, carefully arguing that the linear dimension $d$ can be replaced by a more benign quantity. A common complexity measure is the total information gain, which we define as follows:
\begin{align}
	\Gamma_{\mathcal{C}} = \log \det (\Phi_{\mathcal{C}}\Phi_{\mathcal{C}}^\top + \lambda \mathbf{I}_d) - \log \det (\lambda \mathbf{I}_d)
\end{align}\todoj{it is tempting to add a factor $\frac{1}{2}$ to the definition, to much the entropy of the Bayesian model with Gaussian prior and likelihood}
Note that we can compute $\Gamma_{\mathcal{C}}$ for any given core set $\mathcal{C}$. In the kernel case, we can compute $\Gamma_{\mathcal{C}} = \log \det (\mathbf{I}_{|\mathcal{C}|} + \lambda^{-1} K_{\mathcal{C}})$ using similar arguments as before.

The maximum information gain is $\Gamma_t = \max_{\mathcal{C} : |\mathcal{C}|=t} \Gamma_{\mathcal{C}}$. It serves as a complexity measure in the bandit literature and can be bounded for many kernels of interest \citep{srinivas2009gaussian,vakili2021information}. Following \citet{du2021bilinear}, we further define the \emph{critical information gain},
\begin{align}
	\tilde \Gamma = \max \{t \geq 1 : t \leq \Gamma_t \}
\end{align}
Note that the proof of  \cite[Lemma 5.1]{yin2021efficient} implies that $|\mathcal{C}| \leq  \log(1+\tau)^{-1}\Gamma_{|\mathcal{C}|}$.

Since the dimension $d$ enters our bounds only through $C_{\max}$ we can immediately get a sample complexity bound for the kernelized algorithm in terms of $\tilde\Gamma$. For the finite-dimensional case, \cite[Lemma 5.1]{yin2021efficient} shows that $\tilde \Gamma \leq \mathcal{O}(d)$, recovering the previous bound.


\paragraph{Unknown Critical Information Gain} Somewhat impractical for the algorithm is that we need to know a bound on $C_{\max}$ or $\tilde \Gamma$ respectively to set the number of episodes required for some target level of accuracy $\kappa > 0$ (roughly, $m = C_{\max}/\kappa^2$).

As a remedy, we can replace the check $\|\phi(s,a)\|_{(\Phi_\mathcal{C} \Phi_{\mathcal{C}}^\top + \lambda \mathbf{I}_d)^{-1}}^2 > \tau$ by\todoj[inline]{this needs some more thinking, as we don't want to set $\tau$ to be too small - maybe? An easier approach could be to set $m = |\mathcal{C}|/\kappa^2$?}
\begin{align*}
	\|\phi(s,a)\|_{(\Phi_\mathcal{C} \Phi_{\mathcal{C}}^\top + \lambda \mathbf{I}_d)^{-1}}^2 > \frac{\tilde \tau}{\max(\Gamma_{\mathcal{C}},1)}
\end{align*}

Let $\mathcal{C}_1, \dots, \mathcal{C}_t$ be the sequence of core sets obtained by adding elements that satisfy the above condition. Note that $\Gamma_{\mathcal{C}_t}$ is a non-decreasing sequence. Combined with \citep[Lemma 5.1]{yin2021efficient}, this implies that
\begin{align}
	t \log\left(1 + \tfrac{\tilde \tau}{\max(\Gamma_t,1)}\right) \leq t \log\left(1 + \tfrac{\tilde \tau}{\max( \Gamma_{\mathcal{C}_{t}}, 1)}\right) \leq \sum_{s=1}^t \log\left(1 + \tfrac{\tilde \tau}{\max(\Gamma_{\mathcal{C}_{s}},1)}\right) \leq \Gamma_{\mathcal{C}_{t}} \leq \Gamma_t\label{eq:cmax adaptiv}
\end{align}
Hence the condition is triggered at most
\begin{align}
	\tilde C_{\max}(\tilde \tau) = \max \left\{t \geq 1 : t \leq \Gamma_t \log\left(1 + \tfrac{\tilde \tau}{\max(\Gamma_t, 1)} \right)^{-1}\right\}
\end{align}
times. \todoj{Would be great to show some bounds for $\tilde C_{\max}$, e.g. in the finite-dimensional case}Moreover, we can set $\tilde \tau=1$ and $m = \frac{1}{\kappa^2}$ (i.e.~without knowing a bound on $C_{\max}$) to obtain the required target accuracy $\tilde{\mathcal{O}}(\kappa)$.\todoj{this requires some introspection of \citep[Lemma B.2]{yin2021efficient} and \eqref{eq:cmax adaptiv}}




\section{Bandit Setting Analysis}

Recall that $\phi_i: \mathcal{S} \times \mathcal{A}^{(i)}  \to \mathbb{R}^d$.
Fix state $s \in \mathcal{S}$, and redefine $\phi_i: \mathcal{A}^{(i)} \to \mathbb{R}^d$, which reduces the problem to the bandit setting.
Under assumption \ref{ass: ass1} we know that there exist weight vectors $w_1^*, \cdots, w_m^* \in \mathbb{R}^d$, such that:
\[
    \begin{aligned}
        Q_{w^*}(a^{(1 : m)}) &= \sum_{j =1}^{m} \phi_j(a^{(j)})^\top
        \sum_{i =1}^{m} w_i^*\\
        &= \phi(a^{(1:m)})^\top w^*,
    \end{aligned}
\]
where we define $\phi(a^{(1:m)})^\top = \sum_{j =1}^{m} \phi_j(a^{(j)})^\top$ and $w^* = \sum_{i =1}^{m} w_i^*$.
Next we define 
$$a^{*, (1:m)} = \argmax_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} Q_{w^*}(a^{(1:m)}) $$

\begin{algorithm}
	\algnewcommand{\LineComment}[1]{\State \(\triangleright\) #1}
    \caption{Confident Multi-Agent Linear Bandit}\label{alg:bandit} 
    \begin{algorithmic}
        \Require $\hat w_0 \in \mathbb{R}^d$, Default action $\bar a^{(1:m)} \in \mathcal{A}^{(1:m)}$, Number of rollouts $n$
        \State $\mathcal{C} \gets \{ \bar a^{(1:m)} \}$
        \LineComment{Build Core Set}
        \For{$j=1,\dots,m$}
        	\For{$a^{(j)} \in \mathcal{A}^{(j)}$}
        		\If{$\phi((a^{(j)}, \bar a^{(-j)}))^\top (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda \mathbf{I}_d)^{-1} \phi((a^{(j)}, \bar a^{(-j)})) > \tau$}
        			\State $\mathcal{C} \gets \mathcal{C} \cup \{(a^{(j)}, \bar a^{(-j)})\}$         \Comment{Add action $(a^{(j)}, \bar a^{(-j)})$ to the core set}
        		\EndIf
        	\EndFor
        \EndFor
        \LineComment{Perform ``Roll outs'' for each action in the core set}
        \State Define $q_\mathcal{C} = \mathbf{0}_{|\mathcal{C}|}$
        \For{$a^{(1:m)}_j \in \mathcal{C}$}
        	\For{$i=1,\dots,n$}
        		\State $y_{a^{(1:m)}_j,i} = Q_{w^*}(a^{(1:m)}_j) + \epsilon_{a^{(1:m)}_j,i}$ \Comment{Single noisy evaluation}
        	\EndFor
        	\State ${q_\mathcal{C}}_j \gets \frac{1}{n} \sum_{i=1}^n y_{a^{(1:m)}_j, i}$ \Comment{Average evaluations for each action}
       	\EndFor
       
       \State Compute regularized least-squares estimator $\hat w \gets (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda \mathbf{I}_d)^{-1}\Phi_\mathcal{C}^\top q_\mathcal{C}$
        
        \State Output best guess $\hat{a}^{(1:m)} = \argmax_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \phi(a^{(1:m)})^\top \hat w$.
    \end{algorithmic}
\end{algorithm}

We propose the following Algorithm \ref{alg:bandit}. 
First we iterate over the actions of each agent and build the core set with the help of a default action $\bar a^{(1:m)}$. 
Then we evaluate each action in the core set $n$ times, and compute an estimator. The output action is the empirically best action.
\begin{theorem}
	The output action $\hat a^{(1:m)}$ of Algorithm \ref{alg:bandit} satisfies with probability at least $1-\delta$,
	\begin{align*}
		Q_{w^*}(a^{*, (1:m)}) - Q_{w^*}(\hat a^{(1:m)}) \leq (2m-1) \left( \|w^*\| \sqrt{\lambda \tau} + \sqrt{\frac{2\tau}{n} \log\left(\frac{2}{\delta}\right)} \right)
	\end{align*}
	for $\tau \ge 1, \lambda > 0, \delta \in (0, 1)$, and $C_\text{max} := \frac{e}{e-1} \frac{1+\tau}{\tau} d \left( \log(1+1/\tau) + \log(1+1/\lambda) \right)$.
	
	Moreover, the compute complexity is $\mathcal{O}(d^3 \sum_{j=1}^m |\mathcal{A}^{(j)}|)$.
\end{theorem}
\begin{proof}
	First, we decompose the simple regret:
	\begin{align*}
		&Q_{w^*}(a^{*, (1:m)}) - Q_{w^*}(\hat a^{(1:m)}) \\ 
		&= Q_{w^*}(a^{*, (1:m)}) - Q_{w^*}(\hat a^{(1:m)}) \pm Q_{\hat w}(\hat a^{(1:m)}) \pm Q_{\hat w}(a^{*, (1:m)})\\
		&= Q_{w^*}(a^{*, (1:m)}) - Q_{\hat w}(a^{*, (1:m)}) +
		\underbrace{Q_{\hat w}(a^{*, (1:m)})
			- Q_{\hat w}(\hat a^{(1:m)})}_{\leq 0} + Q_{\hat w}(\hat a^{(1:m)}) - Q_{w^*}(\hat a^{(1:m)})
	\end{align*}
	Taking the absolute value we get
	\begin{align*}
		|Q_{w^*}(a^{*, (1:m)}) - Q_{w^*}(\hat a^{(1:m)})|
		&\leq \underbrace{| Q_{w^*}(a^{*, (1:m)}) - Q_{\hat w}(a^{*, (1:m)}) |}_{E_1} + \underbrace{|Q_{\hat w}(\hat a^{(1:m)}) - Q_{w^*}(\hat a^{(1:m)})|}_{E_2}\\
	\end{align*}
	
	
   
    Define $\mathcal{H} = \{\tilde a^{(1:m)} \in \mathcal{A}^{(1:m)} : \|\phi(\tilde a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 \le \tau\}$, where $V_\mathcal{C} = \Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I$.
    Comparing $w^*$ and $\hat w$ in any direction $x = \phi(\tilde a^{(1:m)})$ with $\tilde a^{(1:m)} \in \mathcal{H}$, we have:
    \begin{align*}
        \left \langle x, \hat{w} - w^* \right \rangle &= 
        \left \langle x, (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \Phi_\mathcal{C}^\top q_\mathcal{C} - w^* \right \rangle\\
        &=\left \langle x, (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \sum_{t=1}^{|\mathcal{C}|} \Phi_{\mathcal{C}_t} q_{\mathcal{C}_t} - w^* \right \rangle\\
        &=\left \langle x, (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \sum_{t=1}^{|\mathcal{C}|} \Phi_{\mathcal{C}_t} (\Phi_{\mathcal{C}_t}^\top w^* + \epsilon_{a_t}) - w^* \right \rangle\\
        &= \left \langle x, (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \sum_{t=1}^{|\mathcal{C}|} \Phi_{\mathcal{C}_t} \Phi_{\mathcal{C}_t}^\top w^* - w^* \right \rangle + 
            \left \langle x, (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \sum_{t=1}^{|\mathcal{C}|} \Phi_{\mathcal{C}_t} \epsilon_{a_t} \right \rangle \\
        &= \underbrace{\left \langle x, (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \Phi_{\mathcal{C}}^\top \Phi_{\mathcal{C}} w^* - w^* \right \rangle}_{e_1} + 
            \underbrace{\sum_{t=1}^{|\mathcal{C}|} \left \langle x, (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \Phi_{\mathcal{C}_t} \right \rangle \epsilon_{a_t}}_{e_2}
    \end{align*}
    Since $e_1$ is exactly the same expression as $E_1$ in the proof of Lemma B.2 in \cite{yin2021efficient}, we can use their result to bound this term,
    giving us
    \[e_1 \le \|w^*\|_2 \sqrt{\lambda \tau}.\]
    Now we need to bound $e_2$, which
    is a $\sigma_\varepsilon/\sqrt{n}$-subgaussian random variable since it is a summation of $|\mathcal{C}|$ random variables, each $1/\sqrt{n}$-subgaussian (since we sample each action in the core set $n$ times) and multiplied by a constant, with
    \begin{align*}
        \sigma_\varepsilon &= \sqrt{
            \sum_{t=1}^{|\mathcal{C}|} \left \langle x, (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \Phi_{\mathcal{C}_t} \right \rangle^2
        }
    \end{align*}
    Let $\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I := V \Lambda V^\top$ be the eigendecomposition of $\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I$
    with $\Lambda = \text{diag}(\lambda_1, \lambda_2, \cdots, \lambda_d)$ and $V$ being an orthonormal matrix.
    Notice that for all $i$, $\lambda_i \ge \lambda$. Let $\alpha = V^\top x$, then we have
    \begin{align*}
        \sigma_\varepsilon^2 = \sum_{t=1}^{|\mathcal{C}|} \left \langle x, (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \Phi_{\mathcal{C}_t} \right \rangle^2 &=
        \sum_{t=1}^{|\mathcal{C}|} x^\top V \Lambda^{-1} V^\top \Phi_{\mathcal{C}_t} \Phi_{\mathcal{C}_t}^\top V \Lambda^{-1} V^\top x\\
        &= x^\top V \Lambda^{-1} V^\top \Phi_{\mathcal{C}}^\top \Phi_{\mathcal{C}} V \Lambda^{-1} V^\top x\\
        &= x^\top V \Lambda^{-1} V^\top (V \Lambda V^\top - \lambda I) V \Lambda^{-1} V^\top x\\
        &= x^\top V \Lambda^{-1} V^\top x
        - \lambda \alpha^\top \Lambda^{-2} \alpha\\
        &\leq \tau - \lambda \sum_{i=1}^{d} \frac{\alpha_i^2}{\lambda_i^2}\\
        &\leq \tau
    \end{align*}
    Using this bound, and Theorem 5.3 of \cite{lattimore2020bandit}, we have with probability at least $1 - \delta$:
    \begin{align*}
        |e_2| \leq \sqrt{\frac{2 \tau}{n} \log\left(\frac{2}{\delta}\right)}
    \end{align*}
    Putting the bounds on $e_1$ and $e_2$ together, with probability $1 - \delta$, for every $x \in \mathcal{H}$ we have
    \begin{align*}
        \langle x, \hat{w} - w^* \rangle \leq \|w^*\| \sqrt{\lambda \tau} + \sqrt{\frac{2\tau}{n} \log\left(\frac{2}{\delta}\right)}
    \end{align*}
    \todoa{We didn't even need to bound this error inside the Good set, we only needed to bound it for the modified action. If that's correct, can we get a better bound by adding all of the modified actions to the core set? (we won't have to extrapolate or ... anymore)}
    
	Now notice that for any $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ we have
	\begin{align*}
	&|Q_{\hat w}(a^{(1:m)}) - Q_{w^*}(a^{(1:m)})| \\
	&= \left|Q_{\hat w}(a^{(1:m)}) - Q_{w^*}(a^{(1:m)}) \pm (m-1) Q_{w^*}(\bar a^{(1:m)}) \pm (m-1) Q_{\hat w}(\bar a^{(1:m)})\right| \\ 
    &= \left |
            \sum_{j=1}^m Q_{\hat w}((a^{(j)}, \bar a^{(-j)})) -
            \sum_{j=1}^m Q_{w^*}((a^{(j)}, \bar a^{(-j)})) +
            (m-1)\left(Q_{w^*}(\bar a^{(1:m)}) - Q_{\hat w}(\bar a^{(1:m)})\right)
        \right |\\
    &= \left |
        \sum_{j=1}^m Q_{\hat w - w^*}((a^{(j)}, \bar a^{(-j)})) +
        (m-1)\left(Q_{w^* - \hat w}(\bar a^{(1:m)}) \right)
    \right | \\
	&\leq \sum_{j=1}^m \|\phi((a^{(j)}, \bar a^{(-j)}))\|_{V_\mathcal{C}^{-1}} \|\hat w - w^*\|_{V_\mathcal{C}} + (m-1) \|\phi(\bar a^{(1:m)})\|_{V_\mathcal{C}^{-1}} \|\hat w - w^*\|_{V_\mathcal{C}} \\
	&\leq (2m-1) \left( \|w^*\| \sqrt{\lambda \tau} + \sqrt{\frac{2\tau}{n} \log\left(\frac{2}{\delta}\right)} \right)
	\end{align*}

	The final inequality follows from noting that $\bar a^{(1:m)}, (a^{*, (j)}, \bar a^{(-j)}) \in \{\tilde a^{(1:m)}: \|\phi(\tilde a^{(1:m)})\|_{V_\mathcal{C}^{-1}} \le \tau \}, \ \forall j \in [m]$ by algorithm design.
	The claim follows by using the last display to bound $E_1$ and $E_2$.
	
	For the compute complexity, note that the core set is of size at most $d$ (by Lemma 5.1 in \cite{yin2021efficient}). 
	Adding a vector to the core set and updating the inverse iteratively is $d^2$. 
	Computing the estimate is $d^2 \sum_{j=1}^m |\mathcal{A}^{(j)}|$ and computing the maximizer is $d \sum_{j=1}^m |\mathcal{A}^{(j)}|$.
	
\end{proof}

We need to prove that if we add a state-action pair to the core set, it remains in the good set in future.

\begin{theorem}
Assume that $\Phi_\mathcal{C} \in \mathbb{R}^{t \times d}$,  $V_\mathcal{C} = \Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I$, and $\phi_{t+1} \in \mathbb{R}^d$. Define $\hat \Phi_\mathcal{C} = [\Phi_\mathcal{C}^\top \, \phi_{t+1}]^\top$,
and $\hat V_\mathcal{C} = \hat{\Phi}_\mathcal{C}^\top \hat{\Phi}_\mathcal{C} + \lambda I$.
Then we have:
\begin{align*}
    \| \phi_{t+1} \|_{\hat V_\mathcal{C}^{-1}} < 1
\end{align*}
\end{theorem}
\begin{proof}
By the definition of the norm we have:
\[
\begin{aligned}
    \| \phi_{t+1} \|_{\hat{V}_\mathcal{C}^{-1}}^2 &= \phi_{t+1}^\top \hat{V}_\mathcal{C}^{-1} \phi_{t+1}\\ 
    &= \phi_{t+1}^\top \left( 
        \hat{\Phi}_\mathcal{C}^\top \hat{\Phi}_\mathcal{C} + \lambda I
    \right)^{-1} \phi_{t+1}\\
    &= \phi_{t+1}^\top \left ( 
        \sum_{i = 1}^{t+1} \phi_i \phi_i^\top
        + \lambda I
    \right)^{-1} \phi_{t+1}\\
    &= \phi_{t+1}^\top \left ( 
        \sum_{i = 1}^{t} \phi_i \phi_i^\top
        + \lambda I
        + \phi_{t+1} \phi_{t+1}^\top
    \right)^{-1} \phi_{t+1}\\
    &= \phi_{t+1}^\top \left ( 
        V_\mathcal{C}
        + \phi_{t+1} \phi_{t+1}^\top
    \right)^{-1} \phi_{t+1}\\
    &= \phi_{t+1}^\top \left ( 
        V_\mathcal{C}^{-1}
        - \frac{V_\mathcal{C}^{-1} \phi_{t+1} \phi_{t+1}^\top V_\mathcal{C}^{-1}}{1 + \phi_{t+1}^\top V_\mathcal{C}^{-1} \phi_{t+1}}
    \right) \phi_{t+1} & \text{Sherman-Morrison}\\
    &= \phi_{t+1}^\top V_\mathcal{C}^{-1} \phi_{t+1}
        - \frac{
        \phi_{t+1}^\top V_\mathcal{C}^{-1} \phi_{t+1} \phi_{t+1}^\top V_\mathcal{C}^{-1}\phi_{t+1}
        }{
        1 + \phi_{t+1}^\top V_\mathcal{C}^{-1} \phi_{t+1}
        }
    \\
    &= \frac{
        \phi_{t+1}^\top V_\mathcal{C}^{-1} \phi_{t+1} 
        }{
        1 + \phi_{t+1}^\top V_\mathcal{C}^{-1} \phi_{t+1}
        }\\
    &< 1
    \\
\end{aligned}
\]
\end{proof}
Therefore, if we set $\tau \geq 1$, then none of the features that have been added to the core set can produce $V_\mathcal{C}^{-1}$-norm greater than $\tau$, so they remain in the good set.

\newpage
\section*{Appendix}

\section*{Parameter Assignment}

\subsection{DAV}
\label{sec:parameter DAV}
The total error is the following:
\begin{align}
    \frac{8 \eta_1}{(1 - \gamma)^2} + \frac{2 \gamma^{K - 1}}{(1 - \gamma)^2} &\leq \kappa \label{eq:egss-main} \\
    \frac{8}{(1 - \gamma)^2}
    \left(
        b\sqrt{\lambda  \tau} + \left( \frac{\gamma^{H+1}}{1 - \gamma} + \theta \right) \sqrt{ \tau C_{\text{max}}}
        \right)(2m-1) +
        \frac{2\gamma^{K - 1}}{(1 - \gamma)^2}
    &\leq \kappa \nonumber \\
    &\Rightarrow \nonumber \\
    \frac{8(2m-1)}{(1 - \gamma)^2} b\sqrt{\lambda  \tau} & \leq \frac{\kappa}{4} \label{eq:egss-lambda}\\
    \frac{8\sqrt{ \tau C_{\text{max}}}(2m-1)}{(1 - \gamma)^2} \frac{\gamma^{H+1}}{1 - \gamma} & \leq \frac{\kappa}{4} \label{eq:egss-H}\\
    \frac{8\sqrt{ \tau C_{\text{max}}}(2m-1)}{(1 - \gamma)^2} \theta & \leq \frac{\kappa}{4} \label{eq:egss-theta}\\
    \frac{2\gamma^{K - 1}}{(1 - \gamma)^2} & \leq \frac{\kappa}{4} \label{eq:egss-K}
\end{align}
First we assume that $\tau = 1$.

From \eqref{eq:egss-lambda} we get:
\begin{align*}
    \frac{8(2m - 1)}{(1 - \gamma)^2} b\sqrt{\lambda } & \leq \frac{\kappa}{4} \\
    \sqrt{\lambda } &\leq \frac{(1 - \gamma)^2\kappa}{32 b (2m -1)}\\
    \lambda &\leq \frac{(1 - \gamma)^4\kappa^2}{1024 b^2 (2m -1)^2}\\
\end{align*}
From \eqref{eq:egss-H} we get:
\begin{align*}
    \frac{8\sqrt{ C_{\text{max}}}(2m -1)}{(1 - \gamma)^3} \gamma^{H+1} & \leq \frac{\kappa}{4} \\
    (2m -1) \sqrt{C_{\text{max}}} \gamma^{H+1} & \leq \frac{\kappa(1 - \gamma)^3}{32} \\
    \gamma^{H+1} & \leq \frac{\kappa(1 - \gamma)^3}{32 \sqrt{C_{\text{max}}} (2m - 1)} \\
    H & \geq \frac{\log\left (\frac{\kappa(1 - \gamma)^3}{32 \sqrt{C_{\text{max}}} (2m - 1)} \right)}{
        \log(\gamma)
    } - 1\\
\end{align*}

From \eqref{eq:egss-theta} we get:
\begin{align*}
    \frac{8\sqrt{ C_{\text{max}}}(2m-1)}{(1 - \gamma)^2} \theta & \leq \frac{\kappa}{4}\\
    (2m-1) \sqrt{C_{\text{max}}} \theta & \leq \frac{\kappa(1- \gamma)^2}{32}\\
    \theta & \leq \frac{\kappa(1- \gamma)^2}{32 (2m-1) \sqrt{C_{\text{max}}}}\\
\end{align*}

From \eqref{eq:egss-K} we get:
\begin{align*}
    \frac{2\gamma^{K - 1}}{(1 - \gamma)^2} & \leq \frac{\kappa}{4}\\
    \gamma^{K - 1} & \leq \frac{\kappa(1 - \gamma)^2}{8}\\
    K & \geq \frac{\log \left( \frac{\kappa(1 - \gamma)^2}{8} \right)}{\log(\gamma)} + 1\\
    K & \geq \frac{\log\left(\kappa(1 - \gamma)^2\right) - \log(8)}{\log(\gamma)} + 1\\
\end{align*}

We know that \eqref{eq:egss-main} holds with probability at least $1 - 2KC_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$.
Therefore, requiring the failure probability to be at most $\delta$ and using the values chosen above, we get:
\begin{align*}
        2KC_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n) &\leq \delta\\
        \exp(-2 \theta^2(1-\gamma)^2 n) &\leq \frac{\delta}{2KC_{\text{max}}}\\
        -2 \theta^2(1-\gamma)^2 n &\leq \log\left( \frac{\delta}{2KC_{\text{max}}}\right)\\
        n &\geq \frac{\log(2KC_{\text{max}}) - \log(\delta)}{2 \theta^2(1-\gamma)^2}\\
\end{align*}

\subsection{EGSS}
\label{sec:parameter EGSS}
The total error is the following:
\begin{align}
    \frac{8 \eta_2}{(1 - \gamma)^2} + \frac{2 \gamma^{K - 1}}{(1 - \gamma)^2} &\leq \kappa \label{eq:dav-main} \\
    \frac{8}{(1 - \gamma)^2}
    \left(
        b\sqrt{\lambda  \tau} + \left( \frac{\gamma^{H+1}}{1 - \gamma} + \theta \right) \sqrt{ \tau C_{\text{max}}}
        \right) +
        \frac{2\gamma^{K - 1}}{(1 - \gamma)^2}
    &\leq \kappa \nonumber \\
    &\Rightarrow \nonumber \\
    \frac{8}{(1 - \gamma)^2} b\sqrt{\lambda  \tau} & \leq \frac{\kappa}{4} \label{eq:dav-lambda}\\
    \frac{8\sqrt{ \tau C_{\text{max}}}}{(1 - \gamma)^2} \frac{\gamma^{H+1}}{1 - \gamma} & \leq \frac{\kappa}{4} \label{eq:dav-H}\\
    \frac{8\sqrt{ \tau C_{\text{max}}}}{(1 - \gamma)^2} \theta & \leq \frac{\kappa}{4} \label{eq:dav-theta}\\
    \frac{2\gamma^{K - 1}}{(1 - \gamma)^2} & \leq \frac{\kappa}{4} \label{eq:dav-K}
\end{align}
First we assume that $\tau = 1$.

From \eqref{eq:dav-lambda} we get:
\begin{align*}
    \frac{8}{(1 - \gamma)^2} b\sqrt{\lambda } & \leq \frac{\kappa}{4} \\
    \sqrt{\lambda } &\leq \frac{(1 - \gamma)^2\kappa}{32 b}\\
    \lambda &\leq \frac{(1 - \gamma)^4\kappa^2}{1024 b^2 }\\
\end{align*}
From \eqref{eq:dav-H} we get:
\begin{align*}
    \frac{8\sqrt{ C_{\text{max}}}}{(1 - \gamma)^3} \gamma^{H+1} & \leq \frac{\kappa}{4} \\
    d^{\frac{1}{4}} C_{\text{max}}^{\frac{1}{2}} \gamma^{H+1} & \leq \frac{\kappa(1 - \gamma)^3}{32} \\
    \gamma^{H+1} & \leq \frac{\kappa(1 - \gamma)^3}{32 d^{\frac{1}{4}} C_{\text{max}}^{\frac{1}{2}}} \\
    H & \geq \frac{\log\left (\frac{\kappa(1 - \gamma)^3}{32 d^{\frac{1}{4}} C_{\text{max}}^{\frac{1}{2}}} \right)}{
        \log(\gamma)
    } - 1\\
\end{align*}

From \eqref{eq:dav-theta} we get:
\begin{align*}
    \frac{8\sqrt{C_{\text{max}}}}{(1 - \gamma)^2} \theta & \leq \frac{\kappa}{4}\\
    d^{\frac{1}{4}} C_{\text{max}}^{\frac{1}{2}} \theta & \leq \frac{\kappa(1- \gamma)^2}{32}\\
    \theta & \leq \frac{\kappa(1- \gamma)^2}{32 d^{\frac{1}{4}} C_{\text{max}}^{\frac{1}{2}}}\\
\end{align*}

From \eqref{eq:dav-K} we get:
\begin{align*}
    \frac{2\gamma^{K - 1}}{(1 - \gamma)^2} & \leq \frac{\kappa}{4}\\
    \gamma^{K - 1} & \leq \frac{\kappa(1 - \gamma)^2}{8}\\
    K & \geq \frac{\log \left( \frac{\kappa(1 - \gamma)^2}{8} \right)}{\log(\gamma)} + 1\\
    K & \geq \frac{\log\left(\kappa(1 - \gamma)^2\right) - \log(8)}{\log(\gamma)} + 1\\
\end{align*}

We know that \eqref{eq:dav-main} holds with probability at least $1 - 2KC_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$.
Therefore, requiring the failure probability to be at most $\delta$ and using the values chosen above, we get:
\begin{align*}
        2KC_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n) &\leq \delta\\
        \exp(-2 \theta^2(1-\gamma)^2 n) &\leq \frac{\delta}{2KC_{\text{max}}}\\
        -2 \theta^2(1-\gamma)^2 n &\leq \log\left( \frac{\delta}{2KC_{\text{max}}}\right)\\
        n &\geq \frac{\log(2KC_{\text{max}}) - \log(\delta)}{2 \theta^2(1-\gamma)^2}\\
\end{align*}

\section*{Other Attempts}

\subsection*{Approach 1}
For a given $i, t$ our goal is to determine whether we are in Case 1 or Case 2 using $O(mA)$ computation.

\subsubsection*{Attempt 1}
We can check if Case 2 holds as follows. Notice that we can bound $f(s, a^{(1:m)})$ as follows
\begin{align*}
    f(s, a^{(1:m)}) 
    &= \phi(s, a^{(1:m)})^\top G^{-1} \phi(s, a^{(1:m)}) \\
    &= ||\phi(s, a^{(1:m)})||_{G^{-1}}^2 \\
    &= ||\sum_{i=1}^m \phi_i(s, a_i)||_{G^{-1}}^2\\
    &\le (\sum_{i=1}^m||\phi_i(s, a_i)||_{G^{-1}})^2 
\end{align*}
Notice that if we can bound $||\phi_i(s, a_i)||_{G^{-1}}$ by $\sqrt{\tau}/m$ for all $i \in [m]$, then the above expression becomes
\begin{align*}
    & (\sum_{i=1}^m||\phi_i(s, a_i)||_{G^{-1}})^2 \\
    &\le (\sum_{i=1}^m \sqrt{\tau}/m )^2 \\
    &= \sqrt{\tau}^2 \\
    &= \tau 
\end{align*}
Thus, we just need to bound $||\phi_i(s, a_i)||_{G^{-1}}$ by $\sqrt{\tau}/m$ for all $i \in [m]$ to ensure that $f(s, a^{(1:m)}) \le \tau$. This can be done action-wise (i.e., for each action $a^{(i)}$ independently), which would only take $O(mA)$ computation.

\textbf{Problem:} However, it is not clear how to check Case 1 holds. 
For example if $||\phi_i(s, a_i)||_{G^{-1}} > \sqrt{\tau}/m$ for some $i \in [m]$, then we can no longer claim we are in Case 2.
However, it also does not necessarily mean we are in Case 1.
It is not clear how to find out if we are in Case 1 or Case 2 then. 

\subsubsection*{Attempt 2}
This attempt is very crude, we're working on it. Many things could be incorrect!

We can solve the maximization problem below:
\begin{align}
    \max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \quad & f(a^{(1:m)}) = \phi(s, a^{(1:m)})^\top \underbrace{(\Phi_C^\top \Phi_C + \lambda I)^{-1}}_{G} \phi(s, a^{(1:m)})
\end{align}
We know that the matrix $G$ is positive semi-definite, which allows us to write the following decomposition:
\begin{align}
    G &= Q \Lambda Q^{-1}\\
    \Rightarrow \phi(s, a^{(1:m)})^\top G \phi(s, a^{(1:m)}) &= \phi(s, a^{(1:m)})^\top\ Q \Lambda Q^{-1} \ \phi(s, a^{(1:m)})\\
    &= \sum_{i = 1}^m \phi(s, a^i)^\top\ Q \Lambda Q^{-1} \ \sum_{i = 1}^m \phi(s, a^i)
\end{align}
Guess: Decomposition could be done in polynomial time, then by maximizing the following quantity we can solve the maximization problem:
\begin{align}
    \label{eq:max1}
    \sum_{i = 1}^m \phi(s, a^i)^\top\ Q \Lambda \mathbf{1}
\end{align}
This problem could be solved independently for each $i$ in polynomial time.

\paragraph{Problem} We don't know if maximizing \eqref{eq:max1} leads to maximum value of $f$ or not.
Also, we're looking for other numerical analysis methods.

\subsubsection*{Approach 1 Attempt 1}
We do this by modifying Algorithm 1 as shown in blue in Algorithm 2, where we define $x^{(i)}(y^{(1:m)}) = (y^{(1)}, ..., x^{(i)}, ..., y^{(m)})$.
\begin{algorithm}
\caption{Multi-Agent Confident Rollout Default Actions} 
\label{alg:confident rollout da old}  
\begin{algorithmic}[1]
\State \textbf{Input:} number of rollouts $n$, length of rollout $H$, rollout policy $\pi$ (\textcolor{green}{should change to $w$?}), discount $\gamma$, initial state $s_0$, initial action vector $a_0^{(1:m)}$, default action vector $\bar a^{(1:m)}$, feature matrix $\Phi_C$, regularization coefficient $\lambda$, threshold $\tau$.
\For {$i = 1, ..., n$}
    \State $s_{i, 0} \gets s_0, a_{i, 0} \gets \bar a$, query the simulator, obtain reward $r_{i, 0} \gets r(s_{i, 0}, a_{i, 0})$, and the next state $s_{i, 1}$.
    \For {$t = 1, ..., H$} 
        \For {\textcolor{blue}{$j \in [m]$}}
            \For {\textcolor{blue}{$a^{(j)} \in \mathcal{A}^{(j)}$}}
                \State \textcolor{blue}{Compute feature $\phi(s, (a^{(j)}, \bar a^{(-j)}))$.}
                \If {\textcolor{blue}{$\phi(s, (a^{(j)}, \bar a^{(-j)}))^\top (\Phi_C^\top \Phi_C + \lambda I)^{-1} \phi(s, (a^{(j)}, \bar a^{(-j)})) > \tau$}}
                    \State \textcolor{blue}{status $\gets$ uncertain, result $\gets (s, (a^{(j)}, \bar a^{(-j)}), \phi(s, (a^{(j)}, \bar a^{(-j)})), \text{none})$}
                    \State \Return {status, result}
                \EndIf
            \EndFor 
        \EndFor 
        \State \textcolor{blue}{
        \begin{align*}
        a_{i, t}^{(1:m)} = \Bigg(
        &\argmax_{\tilde{a}^{(1)} \in \mathcal{A}^{(1)}} \left[w^\top \phi_1(s, \tilde{a}^{(1)}) +
        \underbrace{\sum_{p \ne 1} w^\top \phi_p(s, \bar a^{(p)})}_{\text{\textcolor{green}{can be removed?}}} 
        \right], \\
        &..., \\
        & \argmax_{\tilde{a}^{(m)} \in \mathcal{A}^{(m)}} \left[w^\top \phi_m(s, \tilde{a}^{(m)}) + \sum_{p \ne m} w^\top \phi_p(s, \bar a^{(p)}) \right] \Bigg).
        \end{align*}}
        \If {\textcolor{blue}{$\phi(s, a_{i, t}^{(1:m)})^\top (\Phi_C^\top \Phi_C + \lambda I)^{-1} \phi(s, a_{i, t}^{(1:m)}) > \tau$}}
            \State \textcolor{blue}{status $\gets$ uncertain, result $\gets (s, a_{i, t}^{(1:m)}, \phi(s, a_{i, t}^{(1:m)}), \text{none})$}
            \State \textcolor{blue}{\Return {status, result}}
        \EndIf
        \State Query the simulator with $s, a_{i, t}^{(1:m)}$, obtain reward $r_{i, t} \gets r(s, a_{i, t}^{(1:m)})$, and next state $s_{i, {t+1}}$.
    \EndFor 
\EndFor
\State status $\gets$ done, result $\gets \frac{1}{n} \sum_{i=1}^n \sum_{t=0}^H \gamma^t r_{i, t}$ 
\State \Return status, result
\end{algorithmic}
\end{algorithm}

\textbf{Problem:} Algorithm 2 satisfies Reason 1; however, it does not satisfy Reason 2.
Reason 1 is satisfied because the action selected in line 14 by both the VA and MA are always the same, until the MA algorithm terminates (Case A holds for the first time). 
The action is the same because the argmax taken in line 14 is only over action vectors that are in the good set since this was ensured in lines 5-13.

The reason it does not satisfy Reason 2 is because we can not ensure that the VA is greedy w.r.t $\tilde{Q}(s, a^{(1:m)})$. 
This can easily be seen by considering some action vector $\tilde{a}^{(1:m)}$ that was not checked in lines 5-13 but is not in the good set. 
If $\argmax_{\bar{a}^{(1:m)}} \tilde{Q}(s, \bar{a}^{(1:m)}) = Q_{\tilde{\pi}}(s, \tilde{a}^{(1:m)})$ then clearly the VA was not greedy w.r.t.\ $\tilde{Q}(s, a^{(1:m)})$ since in line 14 it never considered $\tilde{a}^{(1:m)}$.
Stated differently, the VA is greedy w.r.t.\ $\tilde{w}^\top \phi(s, a^{(1:m)})$, which is not the same as being greedy w.r.t.\ $\tilde{Q}(s, a^{(1:m)})$.

\subsubsection*{Attempt 3}
This attempt has been proposed recently, we have not investigated it thoroughly yet. \todoj{RKHS is mostly equivalent to the linear setting. We can look into this later, but it will not address any challenges that are already there in the linear setting.}

We can use a \textbf{reproducing kernel Hilbert space} (RKHS) to solve the maximization problem approximately, starting from an initial assignment to $a^{(1:m)}$.
We think in that case we should change the proof for policy improvement lemma.

\paragraph{Relating/Reducing to the Bandit Setting}

At a high level, the policy evaluation step can be reduced to a bandit problem.
We begin by relating terms from bandits (as introduced in Chap 19 in \cite{lattimore2020bandit}) to terms in our current RL setting.
Assume there is only one state $s$, and we are at step $k$ of policy iteration. 
We can define the true parameter vector $\theta_* = w_{\pi_{k-1}}$, the feature of the optimal action $A^*_t = \phi(s, \argmax_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} w_{\pi_{k-1}}^\top \phi(s, a^{(1:m)}))$, and the action taken by the policy $A_t = \phi(s, \bar \pi(s)) = \phi(s, \argmax_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \tilde Q_{k-1}(s, a^{(1:m)}))$.
Assume we are in the final loop of Algorithm 2 in \cite{yin2021efficient}, which means all the good set checks (lines 4-13 in our Algorithm 2) pass, and no further elements will be added to the core set.
Setting $H=1$ and $\gamma = 0$, notice that Algorithm \ref{alg:confident rollout da} becomes a well defined bandit algorithm (where we just need to add a final step defining the update rule for $w_k$ as line 14 of Algorithm 2 in \cite{yin2021efficient}).
It is important to clarify that the bandit algorithm is just the policy evaluation step for a specific $k$.
Notice that we only update the parameter vector once (at the very end, once all the reward data has been gathered)!
Also, the algorithm will take $N = nmA$ ($n$ samples and $mA$ good set checks each time. Assume the reward is stochastic so that the $n$ samples are actually needed.) actions.

Now we relate our analysis to the regret proof for linear bandits (Chap 19 in \cite{lattimore2020bandit}). 
A critical step to the regret proof was to bound the instantaneous regret $r_t = \langle \theta_*, A^*_t - A_t \rangle$.
Notice that we are interested in PAC results which translates to the instantaneous regret at the end (after $N$ actions are taken). 
\begin{equation}
\label{Pac Bandit}
r_N = \langle \theta_*, A^*_N - A_N \rangle
\end{equation}

I will now remind us how $r_t$ was bounded for linear bandits, after which I will explain how similar steps can be taken to bound equation (\ref{Pac Bandit}).
Recall that we can bound $r_t$ as follows
\begin{align*}
    r_t &= \langle \theta_*, A_t^* - A_t \rangle \\
    &\le \langle \tilde \theta - \theta_*, A_t \rangle \\
    &\le ||A_t||_{V_{t-1}^{-1}} ||\tilde \theta - \theta_*||_{V_{t-1}}
\end{align*}
where $V_t = G = \Phi_C^\top \Phi_C + \lambda I$.
The first inequality was achieved in the bandit setting due to the definition of $A_t = \argmax_{a \in \mathcal{A}} \text{UCB}_t(a)$. 
And the second inequality was achieved by using the elliptical potential lemma (Lemma 19.4 in \cite{lattimore2020bandit}), and designing the core set in a special way.
\todoj{The elliptical potential lemma is to bound $\sum_t \|A_t\|_{V_t^{-1}}^2$ (not used here). Here we are explicitly enforcing $\|\phi_a\|_{V_t^{-1}}^2 \leq \tau$ by sampling the core set and by construction of the algorithm}
Neither of these inequalities will work in our setting, since our algorithm and core set design is different.
However, we can still make things work in a similar way.
\begin{align*}
    r_N &= \langle \theta_*, A_N^* - A_N \rangle\\
    &= \langle \theta_*, A_N^*\rangle \pm \langle w_k, A_N^*\rangle \pm \langle w_k, A_N\rangle - \langle \theta_*, A_N \rangle \\
    &= \langle \theta_* - w_k, A_N^*\rangle + \langle w_k, A_N^*\rangle - \langle w_k, A_N\rangle + \langle w_k - \theta_*, A_N \rangle \\
    &\le \langle \theta_* - w_k, A_N^*\rangle + \langle w_k - \theta_*, A_N \rangle \\
    &\le 2 \bar \eta (2m-1)
\end{align*}
Where the second last inequality holds since $\langle w_k, A_N^*\rangle - \langle w_k, A_N\rangle \le 0$ by the definition of $A_N$. 
And the last inequality holds by equation (\ref{A2A2: approx bound}).

\subsection*{Bandit case}

Recall that $\phi_i: \mathcal{S} \times \mathcal{A}  \to \mathbb{R}^d$.
Fix state $s \in \mathcal{S}$, and redefine $\phi_i: \mathcal{A} \to \mathbb{R}^d$, and relax the problem to a bandit setting for now.
Under assumption 1 we know that there exist weight vectors $w_1^*, \cdots, w_m^* \in \mathbb{R}^d$, such that:
\[
    \begin{aligned}
        v(a_{1 : m}) &= \sum_{j =1}^{m} \phi_j(a_j)^\top
        \underbrace{\sum_{i =1}^{m} w_i^*}_{:= w^*}\\
        &= \sum_{j =1}^{m} \phi_j(a_j)^\top w^*,
    \end{aligned}
\]
which suggests that we are interested in $w^*$, the sum of $(w_i^*)_i$, and not in their individual values (true?).
Now we can define $f_w$ and $a^*$ as:
\[
    \begin{aligned}
        a_{1:m}^* &= \argmax_{a_{1:m} \in \mathcal{A}^m} \left\{f_{w^*}(a_{1:m}) := \sum_{j =1}^{m} \phi_j(a_j)^\top w^* \right \}
    \end{aligned}
\]

\begin{algorithm}
	\algnewcommand{\LineComment}[1]{\State \(\triangleright\) #1}
    \caption{Confident Multi-Agent Linear Bandit}\label{alg:bandit} 
    \begin{algorithmic}
        \Require $n \geq 0, \hat w_0 \in \mathbb{R}^d$, Default action $a_0 \in \mathcal{A}$, Number of rollouts $M$
        \State $\mathcal{C} \gets \emptyset$
        \LineComment{Build Core Set}
        \For{$j=1,\dots,m$}
        	\For{$a_j \in \mathcal{A}^{(j)}$}
        		\If{$\phi(a_0^{(j)}(a_j))^\top (\Phi_\mathcal{C} \Phi_\mathcal{C}^\top + \lambda \mathbf{I}_d)^{-1} \phi(a_0^{(j)}(a_j)) > \tau$}
        			\State $\mathcal{C} \gets \mathcal{C} \cup \{a_0^{(j)}(a_j)\}$         \Comment{Add action $a_0^{(j)}(a_j)$ to the core set}
        		\EndIf
        	\EndFor
        \EndFor
        \LineComment{Perform "Roll outs" for each action in the core set}
        \For{$a \in \mathcal{C}$}
        	\For{$t=1,\dots,M$}
        		\State $y_{a,t} = f(a) + \epsilon_{a,t}$ \Comment{Single noisy evaluation}
        	\EndFor
        	\State $y_a \gets \frac{1}{M} \sum_{t=1}^M y_{a,t}$ \Comment{Average evaluations for each action}
       	\EndFor
       
       \State Compute regularized least-squares estimator $\hat w \gets (\Phi_\mathcal{C} \Phi_\mathcal{C}^\top + \lambda \mathbf{I}_d)^{-1}\Phi_\mathcal{C} y_\mathcal{C}$
        
        \State Output best guess $\hat{a}^* = \argmax_{a \in \mathcal{A}^m} \sum_{j=1}^m\phi_j(a_j)^\top \hat w$.
    \end{algorithmic}
\end{algorithm}

We propose the following Algorithm \ref{alg:bandit}. First we iterate over the actions of each agent and build the core set with the help of a default action $a_0$. Then we evaluate each action in the core set $M$ times, and compute an estimator. The output action is the empirically best action.
\begin{theorem}
	The output action $\hat a^*$ of Algorithm \ref{alg:bandit} satisfies with probability at least $1-\delta$,
	\begin{align*}
		f(a^*) - f(\hat a^*) \leq \mathcal{O}\left(\frac{m d \sqrt{\tau \log(1/\delta)}}{\sqrt{M}}\right)
	\end{align*}
	Moreover, the compute complexity is $\mathcal{O}(d^3 \sum_{j=1}^m |\mathcal{A}^{(j)}|)$.
\end{theorem}
\begin{proof}
	First, we decompose the simple regret:\todoj{just a sketch, I haven't checked yet super carefully}
	\begin{align*}
		f_{w^*}(a^*) - f_{w^*}(\hat a^*) &= f_{w^*}(a^*) - f_{w^*}(\hat a^*) \pm f_{\hat w_n}(\hat a^*) \pm f_{\hat w_n}(a^*)\\
		&= f_{w^*}(a^*) - f_{\hat w_n}(a^*) +
		\underbrace{f_{\hat w_n}(a^*)
			- f_{\hat w_n}(\hat a^*)}_{\leq 0} + f_{\hat w_n}(\hat a^*) - f_{w^*}(\hat a^*)
	\end{align*}
	From this we get
	\begin{align*}
		|f_{w^*}(a^*) - f_{w^*}(\hat a^*)|
		&\leq \underbrace{| f_{w^*}(a^*) - f_{\hat w_n}(a^*) |}_{E_1} + \underbrace{|f_{\hat w_n}(\hat a^*) - f_{w^*}(\hat a^*)|}_{E_2}\\
	\end{align*}
	Note that for each action $a \in \mathcal{A}$:
	\begin{align}
	|f_{\hat w}(a) - f_{w^*}(a)| \leq \sum_{j=1}^m \|\phi(a_0^{(j)}(a_j))\|_{V_\mathcal{C}^{-1}} \|\hat w_n - w^*\|_{V_\mathcal{C}} \leq m \sqrt{\tau} \cdot \mathcal{O}(d/\sqrt{M})
	\end{align}
	The first inequality follows by adding and subtracting $\phi(a_0)$ a total of $(m-1)$ times. The second inequality follows from noting that $\|\phi(a_0^{(j)}(a_j))\|_{V_\mathcal{C}^{-1}} \leq \sqrt{\tau}$ by design (the core set construction enforces $\|\phi(a_0^{(j)}(a_j))\|_{V_\mathcal{C}^{-1}}^2 \leq \tau$), and from standard linear least-squares concentration $\|\hat w_n - w^*\|_{V_\mathcal{C}} \leq \mathcal{O}(d \sqrt{\log(1/\delta)/M})$ with probability at least $1-\delta$. Note that we averaged $M$ observations for each action in the core set $\mathcal{C}$, so each observation is essentially $1/\sqrt{M}$-sub-Gaussian. The claim follows by using the last display to bound $E_1$ and $E_2$.
	
	For the compute complexity, note that the core set is of size at most $d$. Adding a vector to the core set and updating the inverse iteratively is $d^2$. Computing the estimate is $d^2 \sum_{j=1}^m |\mathcal{A}^{(j)}|$ and computing the maximizer is $d \sum_{j=1}^m |\mathcal{A}^{(j)}|$.
	
\end{proof}

For now, we can define $Choose\_Action$ function as (where the argmax can be done in time poly$(A, M)$ due to the linear form of $f_{\hat{w}}$).\todoj{as Vlad wrote in the ToDos, this should be changed to match the RL algorithm}
\[
    Choose\_Action(\hat w) = \argmax_{a_{1:m} \in \mathcal{A}^m} f_{\hat{w}}(a_{1:m}).
\]
We are interested in simple regret (true?) which is the following quantity:
\[
    \begin{aligned}
        R_n^{\text{SIMPLE}}(\pi) &= \mathbb{E}_{\pi} [
            \Delta_{a^{(n)}}
        ]\\
        \Delta_{a} &= v(a^*) - v(a)\\
        &= f_{w^*}(a^*) - f_{w^*}(a).
    \end{aligned}
\]
Therefore, we want to bound the following \todoj{As Vlad mentioned, we only need a one-sided bound}
\[
    \begin{aligned}
        f_{w^*}(a^*) - f_{w^*}(a^{(n)}) &= f_{w^*}(a^*) - f_{w^*}(a^{(n)}) \pm f_{\hat w_n}(a^{(n)}) \pm f_{\hat w_n}(a^*)\\
        &= f_{w^*}(a^*) - f_{\hat w_n}(a^*) +
        \underbrace{f_{\hat w_n}(a^*)
        - f_{\hat w_n}(a^{(n)})}_{\leq 0} + f_{\hat w_n}(a^{(n)}) - f_{w^*}(a^{(n)}) \\
        |f_{w^*}(a^*) - f_{w^*}(a^{(n)})|
        &\leq \underbrace{| f_{w^*}(a^*) - f_{\hat w_n}(a^*) |}_{E_1} + \underbrace{|f_{\hat w_n}(a^{(n)}) - f_{w^*}(a^{(n)})|}_{E_2}\\
    \end{aligned}
\]

\subsubsection*{Bounding $E_2$}
$E_2$ is our approximation error for value of action $a^{(n)}$.
For bounding $E_2$, we can use the idea of having a default action vector.
Fix an arbitrary action $b_{1:m}$. Then, we have the following:
\begin{align*}
        |f_{\hat w_n}(a^{(n)}) - f_{w^*}(a^{(n)})| &=
        |
            f_{\hat w_n}(a^{(n)}) - f_{w^*}(a^{(n)})
            \pm (m-1) f_{w^*}(b)
            \pm (m-1) f_{\hat w_n}(b)
        |\\
        &= | \sum_{j = 1}^m \phi_j(a^{(n)}_j)^\top \hat{w}_n  - \sum_{j = 1}^m \phi_j(a^{(n)}_j)^\top w^* \\
        &+ (m-1) \sum_{j = 1}^m \phi_j(b_j)^\top w^*  - (m-1) \sum_{j = 1}^m \phi_j(b_j)^\top w^* \\
        &+ (m-1) \sum_{j = 1}^m \phi_j(b_j)^\top \hat{w}_n  - (m-1) \sum_{j = 1}^m \phi_j(b_j)^\top \hat{w}_n|\\
        &= | \underbrace{\sum_{j = 1}^m \phi_j(a^{(n)}_j)^\top \hat{w}_n + (m-1) \sum_{j = 1}^m \phi_j(b_j)^\top \hat{w}_n}_{A_1}\\
        &- \left( \underbrace{\sum_{j = 1}^m \phi_j(a^{(n)}_j)^\top w^* + (m-1) \sum_{j = 1}^m \phi_j(b_j)^\top w^*}_{A_2}  \right)\\
        &+ (m-1) \sum_{j = 1}^m \phi_j(b_j)^\top w^* - (m-1) \sum_{j = 1}^m \phi_j(b_j)^\top \hat{w}_n|\\
        &= \left |
        A_1 - A_2 +
        \underbrace{(m-1) \left(f_{w^*}(b) - f_{\hat w_n}(b)\right)}_{A_3}
        \right | \tag{\theequation} \label{ineq:app-err-1}
\end{align*}
We overload the notation of $b_{j}: \mathcal{A} \to \mathcal{A}^m$ to be a function which replaces the $j$th agent's action in $b_{1:m}$ with a new action (the input of the function).
We have the following for $A_1$:
\[
    \begin{aligned}
    \sum_{j = 1}^m \phi_j(a^{(n)}_j)^\top \hat{w}_n + (m-1) \sum_{j = 1}^m \phi_j(b_j)^\top \hat{w}_n &=\\
    &+\left(\phi_1(a^{(n)}_1)^\top \hat{w}_n + \sum_{j \in [m] - \{1\}}\phi_j(b_j)^\top \hat{w}_n\right) + \phi_1(b_1)^\top \hat{w}_n\\
    &+\left(\phi_2(a^{(n)}_2)^\top \hat{w}_n + \sum_{j \in [m] - \{2\}}\phi_j(b_j)^\top \hat{w}_n\right) + \phi_2(b_2)^\top \hat{w}_n\\
    &+\cdots\\
    &+\left(\phi_{m-1}(a^{(n)}_{m-1})^\top \hat{w}_n + \sum_{j \in [m] - \{m-1\}}\phi_j(b_j)^\top \hat{w}_n\right) + \phi_{m-1}(b_{m-1})^\top \hat{w}_n\\
    &+\phi_{m}(a^{(n)}_{m})^\top \hat{w}_n\\
    &=\\
    &+\left(\phi_1(a^{(n)}_1)^\top \hat{w}_n + \sum_{j \in [m] - \{1\}}\phi_j(b_j)^\top \hat{w}_n\right)\\
    &+\left(\phi_2(a^{(n)}_2)^\top \hat{w}_n + \sum_{j \in [m] - \{2\}}\phi_j(b_j)^\top \hat{w}_n\right)\\
    &+\cdots\\
    &+\left(\phi_{m-1}(a^{(n)}_{m-1})^\top \hat{w}_n + \sum_{j \in [m] - \{m-1\}}\phi_j(b_j)^\top \hat{w}_n\right)\\
    &+\left(\phi_{m}(a^{(n)}_{m})^\top \hat{w}_n + \sum_{j \in [m] - \{m\}}\phi_j(b_j)^\top \hat{w}_n\right)\\
    &= \sum_{j=1}^m f_{\hat w_n}(b_j(a^{(n)}_j)).
    \end{aligned}
\]
We can have a similar derivation for $A_2$, which together with \eqref{ineq:app-err-1} results in:
\begin{align*}
        |f_{\hat w_n}(a^{(n)}) - f_{w^*}(a^{(n)})| &= \left |
        \underbrace{
        \left(
            \sum_{j=1}^m f_{\hat w_n}(b_j(a^{(n)}_j)) -
            \sum_{j=1}^m f_{w^*}(b_j(a^{(n)}_j))
        \right)}_{A_1 - A_2}+
        A_3
           
        \right | \label{eq:} \\
        &\leq | A_1 - A_2 | + |A_3|
\end{align*}
Then, we have the following:
\[
    \begin{aligned}
        A_1 - A_2 &= \sum_{j=1}^m \left(
            f_{\hat w_n}(b_j(a^{(n)}_j)) -
                f_{w^*}(b_j(a^{(n)}_j))
            \right)\\
        &= \sum_{j=1}^m
            f_{\hat{w}_n - w^*}(b_j(a^{(n)}_j))
            \\
        &= \sum_{j=1}^m
            \phi(b_j(a^{(n)}_j))^\top
            \left(
                \hat{w}_n - w^*
            \right)
            \\
    \end{aligned}
\]

where we define $\phi(c_{1:m}) = \sum_{i=1}^m \phi_i(c_i)$.
We can write the same for $A_3$:
\begin{align*}
    A_3 &= (m-1) \left(f_{w^*}(b) - f_{\hat w_n}(b)\right) \\
    &= (m-1) f_{w^* - \hat{w}_n}(b) \\
    &= (m-1) \phi(b_{1:m})^\top(w^* - \hat{w}_n) \\
\end{align*}
Similar to the linear bandits setting, we can bound the above quantities with the following:
\[
    \begin{aligned}
        A_1 - A_2 = \sum_{j=1}^m
            \phi(b_j(a^{(n)}_j))^\top
            \left(
                \hat{w}_n - w^*
            \right)
        &\leq
        \sum_{j=1}^m
        \underbrace{\| \phi(b_j(a^{(n)}_j)) \|_{V_n^{-1}}}_{B_1}
        \underbrace{\| \hat{w}_n - w^* \|_{V_n}}_{\text{small with high probability}}\\
        A_3 &\leq (m - 1) \underbrace{\| \phi(b) \|_{V_n^{-1}}}_{B_2}
        \underbrace{\| \hat{w}_n - w^* \|_{V_n}}_{\text{small with high probability}}\\
        V_0 = \lambda I \qquad V_n &= V_0 + \sum_{s=1}^n \phi(b_j(a^{(s)}_j)) \phi(b_j(a^{(s)}_j))^\top\\
        W_0 = \lambda I \qquad W_n &= W_0 + \sum_{s=1}^n \phi(a^{(s)}) \phi(a^{(s)})^\top
    \end{aligned}
\]
For bounding $B_1$ and $B_2$, we need to introduce the idea of Core set and change the $Choose\_Action$ function.

Now we bound $E_1$ in a similar way to $E_2$.
\[
    \begin{aligned}
        |f_{\hat w_n}(a^*) - f_{w^*}(a^*)| &=
        |
            f_{\hat w_n}(a^*) - f_{w^*}(a^*)
            \pm (m-1) f_{w^*}(b)
            \pm (m-1) f_{\hat w_n}(b)
        | \\ 
        &= \left |
        \underbrace{
        \left(
            \sum_{j=1}^m f_{\hat w_n}(b_j(a_j^*)) -
            \sum_{j=1}^m f_{w^*}(b_j(a^*_j))
        \right)}_{C}+
            (m-1)\left(f_{w^*}(b) - f_{\hat w_n}(b)\right)
        \right |.\\
    \end{aligned}
\]

We can of course bound $f_{w^*}(b) - f_{\hat w_n}(b)$ in exactly the same way as we did earlier for $E_2$.

It is left to bound 
\[
    C = \sum_{j=1}^m f_{\hat w_n}(b_j(a_j^*)) - \sum_{j=1}^m f_{w^*}(b_j(a^*_j)) = \sum_{j=1}^m \phi(b_j(a^*_j))^\top (\hat{w}_n - w^*).
\]

I'm not sure how we can do this since we don't know if $\phi(b_j(a_j^*))$ is in the Good set or not (I can't think of how we could know this at least).


\input{parts/flatland.tex}




















   


























\newpage
\bibliographystyle{unsrtnat}

\section{\uppercase{Additive Q-Functions}}\label{sec:additive}

The result in \cref{ss:mc-lspi-egss-theory} makes no restriction on the choice of features as long as the greedy policy can be computed efficiently (\cref{asm:linear-q-pi,ass: argmax oracle}).
Next, we introduce an additive feature model for which the oracle can be implemented efficiently.

With the greedy oracle (\cref{ass: argmax oracle}), one can use \textsc{Confident MC-LSPI}~combined with \textsc{UncertaintyCheck-EGSS}~and directly invoke \cref{thm:mc-lspi-egss sub-optimality}.
However, in \cref{ss:dav} we introduce a new uncertainty check algorithm, \textsc{UncertaintyCheck-DAV}, that explicitly uses the additive structure.
The additive feature structure leads to improved results in the regimes where the dimension is large, but more importantly facilitates an efficient kernelized version of the \textsc{Confident MC-LSPI}~algorithm (\cref{ss:kernel}). 
The additive model also allows an efficient implementation of \textsc{Confident MC-Politex}~\citep{yin2021efficient}, which leads to an improved dependence on the suboptimality in the misspecified setting (\cref{ss:politex}).


In the following, we assume that the action space can be decomposed into a product $\mathcal{A} = \mathcal{A}^{(1:m)} := \mathcal{A}^{(1)} \times \cdots \times \mathcal{A}^{(m)}$ for $m \geq 1$ (borrowing the standard notation from the multi-agent setting).
We further assume access to feature maps $\phi_i: \mathcal{S} \times \mathcal{A}^{(i)} \to \mathbb{R}^d$ for each $i \in [m]$ and define $\phi(s, a^{(1:m)}) = \sum_{i=1}^m \phi_i(s, a^\ag{i})$.
The next assumption states that for any policy $\pi$, the $Q_\pi$-function is (approximately) linear in the feature map $\phi$ and decomposes additively across the components $\mathcal{A}^{(i)}$. 
\begin{assumption}
	\label{ass: feature decomposition}


	For each policy $\pi$ there exists a weight vector $w_\pi \in \mathbb{R}^d, \|w_\pi\|_2 \leq b$ satisfying \looseness=-1
 $\max\limits_{(s, a^{(1:m)}) \in \mathcal{S} \times \mathcal{A}} |Q_\pi(s,a^{(1:m)}) -w_\pi^\top  \sum_{j=1}^m \phi_j(s,a^{(j)})| \le \epsilon$.
 
\end{assumption}
In the context of the multi-agent setting (\cref{ex:multi-agent}), the interpretation is that each $\phi_i(s,a^\ag{i})$ models the contribution to the $Q$-function of each agent individually.
Moreover, when \cref{ass: feature decomposition} is satisfied, then for any weight vector $w \in \mathbb{R}^d$ the greedy policy can be implemented with $\mathcal{O}(d\sum_{i=1}^m|\mathcal{A}^\ag{i}|)$ computation: \looseness=-1
\begin{align*}
	&\argmax\nolimits_{a^{(1:m)} \in \mathcal{A}} w^\top \phi(s, a^{(1:m)})
	\\
	&=\big(\argmax_{a^{(1)} \in \mathcal{A}^{(1)}} w^\top \phi_1(s, a^{(1)}), ..., \argmax_{a^{(m)} \in \mathcal{A}^{(m)}} w^\top \phi_m(s, a^{(m)})\big)
\end{align*}

		
			

A simple example when \cref{ass: feature decomposition} holds is when $m$ agents ``live'' in $m$ separate MDPs such that in each MDP the action-value functions are linearly realizable with their respective feature-maps and the goal is to maximize the sum of the rewards across the MDPs.
In cases like this, we say that the ``large'' MDP is a \emph{product MDP}.
Note that in this setting agents only observe a joint reward after taking their actions, so an optimal policy for the joint MDP may not always be learned by simply applying single agent algorithms in each individual MDP. 
In \cref{app:additive mdp example} we show that \cref{ass: feature decomposition} also captures MDPs that require cooperation between agents, and provide some empirical results.\looseness=-1

	
	




\subsection{Uncertainty Check using a Default Action Vector} \label{ss:dav}

In this section we introduce the uncertainty check with a default action vector (\textsc{UncertaintyCheck-DAV}, \cref{alg:uncertainty check dav}).
The goal of the uncertainty check is to ultimately bound the estimation error of $w_k$, i.e. 
\begin{align}
	|w_k^\top \phi(s, a^{(1:m)}) - Q_{\pi_{k-1}}(s, a^{(1:m)})| \le \eta \,.\label{eq:suboptimality}
\end{align}
Lemma \ref{lemma: yin lemma b.2} shows that a sufficient condition is to ensure that $\|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 \le \tau$ for all $(s, a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A})$ that are queried during policy evaluation.\looseness=-1

We show that under \cref{ass: feature decomposition}, it is possible to achieve \cref{eq:suboptimality} while running the uncertainty check for a much smaller set of actions of size $\sum_{i=1}^m |\mathcal{A}^\ag{i}|$. 
Recall that \textsc{Confident MC-LSPI}~sets a \emph{default action vector} $\bar a^{(1:m)} \in \mathcal{A}$ as a global. 
Define a subset of $\mathcal{A}$ as $\bar \mathcal{A}^{(1:m)} = \{(a^\ag{i}, \bar a^\ag{-i}): a^\ag{i} \in \mathcal{A}^\ag{i}, i \in [m]\}$, where we define $(a^\ag{i}, \bar a^\ag{-i}) =(\bar a^\ag{1},...,\bar a^\ag{i - 1}, a^\ag{i}, \bar a^\ag{i + 1},...,\bar a^\ag{m})$ as the action vector resulting from changing agent $i$'s default action in $\bar a^{(1:m)}$ with $a^\ag{i}$.
Then, by \cref{ass: feature decomposition} for any $a^{(1:m)} \in \mathcal{A}$ we have
\begin{align}
	&w_k^\top \phi(s, a^{(1:m)}) \nonumber
	= w_k^\top \sum_{i=1}^m \phi_i(s, a^{\ag{i}}) \nonumber \\
	&= w_k^\top \Big(\sum_{i=1}^m \phi_i(s, a^\ag{i}) \pm (m-1) \phi(s, \bar a^{(1:m)}) \Big) \nonumber \\
	&= w_k^\top \Big(\sum_{i=1}^m \phi(s, (a^\ag{i}, \bar a^\ag{-i})) - (m-1) \phi(s, \bar a^{(1:m)}) \Big) \nonumber
\end{align}
Notice that $\bar a^{(1:m)}, (a^\ag{i}, \bar a^\ag{-i}) \in \bar \mathcal{A}^{(1:m)}, \forall i \in [m]$.
Thus, when $\|\phi(s, \tilde a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 \le \tau$ for all $\tilde a^{(1:m)} \in \bar \mathcal{A}^{(1:m)}$ we can ensure that for all action-vectors $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ that $|w_k^\top \phi(s, a^{(1:m)}) - Q_{\pi_{k-1}}(s, a^{(1:m)})| \le (2m-1)\eta$.
In words, by checking the uncertainty of action-vectors that differ from the default action vector by at most one position $\tilde a^{(1:m)} \in \bar \mathcal{A}^{(1:m)}$ we can bound the sub-optimality of our estimate $w_k$, since the feature of any action vector can be related to the feature of the default action vector under Assumption \ref{ass: feature decomposition}. 
Since $\bar \mathcal{A}^{(1:m)}$ only contains $\sum_{i=1}^m |\mathcal{A}^\ag{i}|$ elements, this procedure is $\text{poly}(d,\sum_{i=1}^m |\mathcal{A}^\ag{i}|)$.\looseness=-1

\begin{algorithm}
	\caption{\textsc{UncertaintyCheck-DAV}} \label{alg:uncertainty check dav}  
	\begin{algorithmic}[1]
		\State \textbf{Input:} state $s$, core set $\mathcal{C}$, threshold $\tau$.
		\State \textbf{Globals:} number of action components $m$, default action vector $\bar a^{(1:m)}$
		\For {$j \in [m]$}
		\For {$a^{(j)} \in \mathcal{A}^{(j)}$}
		\State $\tilde{a} \gets (a^{(j)}, \bar a^{(-j)})$
		\If {$\phi(s, \tilde{a})^\top V_\mathcal{C}^{-1} \phi(s, \tilde a) > \tau$}
	
		\State result $\gets (s, \tilde a, \phi(s, \tilde a), \text{\textsc{none}})$
		\State \Return {\textsc{uncertain}, result}
		\EndIf
		\EndFor 
		\EndFor 
		\State \Return \textsc{certain}, \textsc{none} 
	\end{algorithmic}
\end{algorithm}

The result
that characterizes the performance of
\textsc{Confident MC-LSPI}~combined with \textsc{UncertaintyCheck-DAV}~is summarized in the next theorem.
\begin{theorem}[\textsc{Confident MC-LSPI DAV} Sub-Optimality] \label{thm:mc-lspi-dav sub-optimality}	
        Suppose Assumptions \ref{ass: feature decomposition} and \ref{ass: bounded features} hold.
	If $\epsilon = 0$, for any $\kappa > 0$, with probability at least $1 - \delta$, the policy $\pi_{K-1}$, output by \textsc{Confident MC-LSPI}~combined with \textsc{UncertaintyCheck-DAV}~satisfies
	\begin{equation*}
		V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \kappa.
	\end{equation*}
	  The query and computation complexity are $\mathcal{O}\left(\tfrac{m^2d^3}{\kappa^2 (1-\gamma)^8} \right)$
	and $\text{poly}(\sum_{i=1}^m |\mathcal{A}^\ag{i}|, d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}))$ respectively. 
	  If $\epsilon > 0$, then with probability at least $1 - \delta$, the output policy $\pi_{K-1}$ satisfies\looseness=-1
	\begin{equation*}
		V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \tfrac{128 \epsilon \sqrt{d} m}{(1-\gamma)^2} (1 +\log(1+ b^2 \epsilon^{-2} d^{-1}))^{1/2}.
	\end{equation*}
	  The query and computation complexity are $\mathcal{O}\left(\tfrac{d^2}{\epsilon^2 (1-\gamma)^4} \right)$
	  and $\text{poly}(\sum_{i=1}^m |\mathcal{A}^\ag{i}|, d, \frac{1}{1 - \gamma}, \frac{1}{\epsilon}, \log(\frac{1}{\delta}), \log(1+b))$ respectively.
  
      
       All parameter settings are in \cref{app: theorem proofs}.
\end{theorem}
When compared to the result in \citet[Theorem 5.1]{yin2021efficient} we have an extra factor of $m^2$ in the query complexity for $\epsilon = 0$, while for $\epsilon \neq 0$ we only have an extra factor of $m$ in the sub-optimality of the output policy. On the other hand, the computational complexity is improved from $\mathcal{O}(\prod_{i=1}^m |\mathcal{A}^\ag{i}|)$ for the prior work to $\mathcal{O}(\sum_{i=1}^m |\mathcal{A}^\ag{i}|)$.
When compared to \cref{thm:mc-lspi-egss sub-optimality} where \textsc{UncertaintyCheck-EGSS}~was used instead of \textsc{UncertaintyCheck-DAV}~the extra dependence on $\sqrt{d}$ changed to $m$.


\subsection{Kernelized Setting}\label{ss:kernel}
The kernelized setting is a standard extension of the finite-dimensional linear setup \citep{srinivas2009gaussian,abbasi2012online}. 
It lifts the restriction that the features and parameter vector are elements of $\mathbb{R}^d$. 
Formally the kernel is $\mathbf{k} : (\mathcal{S} \times \mathcal{A}^{(1:m)})^2 \rightarrow \mathbb{R}$, which gives rise to a reproducing kernel Hilbert space (RKHS) $\mathcal{H}$, defined as a vector space $V_\mathcal{H} := \mathbb{R}^{\mathcal{S} \times \mathcal{A}^{(1:m)}}$ with inner product $\langle \cdot, \cdot \rangle_\mathcal{H} : V_\mathcal{H} \times V_\mathcal{H} \to \mathbb{R}$.
We require that the $Q_\pi$-function is approximately contained in an RKHS. 
This includes cases where the linear dimension of the function class is infinite.
\begin{assumption}[Kernel $Q_\pi$-realizability]
\label{ass:kernel q-pi}
For each policy $\pi$ there exists a vector $\tilde Q_\pi \in \mathcal{H}, \|\tilde Q_\pi\|_\mathcal{H} \le b$ that satisfies 
    $\sup_{s \in \mathcal{S}, a^{(1:m)} \in \mathcal{A}^{(1:m)}} |Q_\pi(s,a^{(1:m)}) - \tilde Q_\pi (s,a^{(1:m)})| \le \epsilon$, where $\tilde Q_\pi(s,a^{(1:m)}) = \langle 
    \tilde Q_\pi, \mathbf{k}(s, a^{(1:m)}, \cdot, \cdot )\rangle_\mathcal{H}.$
\end{assumption}
Similar to the finite setting we assume an additive structure (on the kernel now) to allow efficient implementation.
For component $j \in [m]$, define the kernel as $\mathbf{k}_j : (\mathcal{S} \times \mathcal{A}^\ag{j})^2 \rightarrow \mathbb{R}$, which gives rise to an RKHS $\mathcal{H}_j$, defined as a vector space $V_{\mathcal{H}_j} := \mathbb{R}^{\mathcal{S} \times \mathcal{A}^\ag{j}}$ with inner product $\langle \cdot, \cdot \rangle_{\mathcal{H}_j} : V_{\mathcal{H}_j} \times V_{\mathcal{H}_j} \to \mathbb{R}$. 
\begin{assumption}
\label{ass:kernel additive}
    The kernel $\mathbf{k}$ can be written as $\mathbf{k}(s_1,a_1^{(1:m)}, s_2,a_2^{(1:m)}) = \sum_{j=1}^m \mathbf{k}_j(s_1,a_1^{(j)}, s_2,a_2^{(j)})$ where $s_1, s_2 \in \mathcal{S}, \ a_1^{(1:m)}, a_2^{(1:m)} \in \mathcal{A}^{(1:m)}$.
\end{assumption}

The kernel setting  requires us to address two main challenges. 
First, the scaling of the query complexity with the dimension $d$ needs to be improved to a notion of effective dimension. 
Following \citet{du2021bilinear, huang2021short} we make use of the critical information gain $\tilde \Gamma$ (defined in \cref{eq:critical infogain}, \cref{app:kernel setting}) which can be bounded for different RKHS of interest \citep{srinivas2009gaussian,huang2021short}. 
Second, computationally we cannot directly work with infinite dimensional features $\phi(s,a) = \mathbf{k}(s, a, \cdot, \cdot)$. 
Instead, we rely on the `kernel trick' and compute all quantities of interest in the finite-dimensional data space \citep{scholkopf2001generalized}. 
After formally arguing as stated above, one can show that a kernelized version of \textsc{Confident MC-LSPI}~and \textsc{UncertaintyCheck-DAV}~provide the following sub-optimality guarantees on the output policy (proof in \cref{app:kernel setting}).

\begin{theorem}[\textsc{Confident Kernel MC-LSPI DAV} Sub-Optimality] \label{thm:kernel mc-lspi-dav sub-optimality}	
        Suppose Assumptions \ref{ass:kernel q-pi}, \ref{ass:kernel additive}, and \ref{ass: bounded features} hold.
        Define $\tilde \Gamma := \tilde \Gamma(\lambda, \log(2))$.
	If $\epsilon = 0$, for any $\kappa > 0$, with probability at least $1 - \delta$, the policy $\pi_{K-1}$, returned by \textsc{Confident Kernel MC-LSPI}~(\cref{alg:confident kernel mc-lspi/politex}) combined with \textsc{UncertaintyCheck-K-DAV}~(\cref{alg:uncertainty check k-dav}) satisfies
	\begin{equation*}
		V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \kappa.
	\end{equation*}
	The query and computation complexity are $\mathcal{O}\left(\tfrac{m^2\tilde \Gamma^3}{\kappa^2 (1-\gamma)^8} \right)$ and $\text{poly}(\sum_{i=1}^m |\mathcal{A}^\ag{i}|, \tilde \Gamma, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}))$ respectively. 
	  If $\epsilon > 0$, then with probability at least $1 - \delta$, the final policy $\pi_{K-1}$ satisfies
	\begin{equation*}
		V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \tfrac{32 \epsilon m \sqrt{\tilde \Gamma}}{(1-\gamma)^2}.
	\end{equation*}
	  The query and computation complexity are $\mathcal{O}\left(\tfrac{\tilde \Gamma^2}{\epsilon^2 (1-\gamma)^4} \right)$
	  and $\text{poly}(\sum_{i=1}^m |\mathcal{A}^\ag{i}|, \tilde \Gamma, \frac{1}{1 - \gamma}, \frac{1}{\epsilon}, \log(\frac{1}{\delta}), \log(1+b))$ respectively.
  
       All parameter settings are in \cref{app: theorem proofs}.
\end{theorem}
The result is identical to \cref{thm:mc-lspi-dav sub-optimality} except with $d$ replaced with the critical information gain $\tilde \Gamma(\lambda, \log(2))$.

\subsection{Politex}\label{ss:politex}
The Politex algorithm has been shown to obtain better sub-optimality guarantees than LSPI by \citet{abbasi2019politex}.
In this section we show that \textsc{Confident MC-Politex}~presented by \citet{yin2021efficient} can be extended to combinatorially large action spaces.
Although Politex is also based on policy iteration, like LSPI, an important difference is that it uses stochastic policies based on an exponential weighting of each action's $Q$-value.
Efficiently sampling from such a policy is not always possible when the action space is combinatorially large.
We show \cref{ass: feature decomposition} is sufficient to do so (Proposition \ref{prop: efficient politex policy sampling}).
Moreover, using similar arguments as in \cref{ss:dav}, indeed, \textsc{Confident MC-Politex}~combined with \textsc{UncertaintyCheck-DAV}~achieves better sub-optimality guarantees than \textsc{Confident MC-LSPI}.

\begin{theorem}[\textsc{Confident MC-Politex} Sub-Optimality] \label{thm:mc-politex sub-optimality}	
        Suppose Assumption \ref{ass: feature decomposition}, and \ref{ass: bounded features} hold.

	If $\epsilon > 0$, for any $\kappa > 0$, with probability at least $1 - \delta$, the policy $\bar \pi_{K-1}$, output by \textsc{Confident MC-Politex}~(\cref{alg:confident ma mc-politex}) combined with \textsc{UncertaintyCheck-DAV}~(\cref{alg:uncertainty check dav}) satisfies
	\begin{equation*}
		V^*(\rho) - V_{\bar \pi_{K-1}}(\rho) \leq \tfrac{64 \epsilon m \sqrt{d}}{1-\gamma} (1 +\log(1+b^2 \epsilon^{-2} d^{-1}))^{1/2}.
	\end{equation*}
	The query and computation complexity are $\mathcal{O}\left(\tfrac{md}{\epsilon^4 (1-\gamma)^5} \right)$
	  and $\text{poly}(\sum_{i=1}^m |\mathcal{A}^\ag{i}|, d, \frac{1}{1 - \gamma}, \frac{1}{\epsilon}, \log(\frac{1}{\delta}), \log(1+b))$ respectively.
       All parameter settings are in \cref{app: theorem proofs}.
  
\end{theorem}

As expected the sub-optimality is better (scales with $1/(1-\gamma)$) than that of  \textsc{Confident MC-LSPI} (\cref{thm:mc-lspi-dav sub-optimality}), which scales with $1/(1-\gamma)^2$.
However, the query complexity is worse (as is typical for Politex), and an extra factor of $m$ is introduced, since mirror descent needs to be run on the entire action space of size $\prod_{i=1}^m |\mathcal{A}^\ag{i}|$ for each state. 
We also extend the result to the kernelized setting in \cref{app: theorem proofs},
and show that \textsc{UncertaintyCheck-EGSS}~can be used when \cref{ass: feature decomposition} is satisfied.



\section{\uppercase{Efficient MC-LSPI}} \label{sec: algorithms}
In this section, we extend the \textsc{Confident MC-LSPI} algorithm proposed by \cite{yin2021efficient} to the combinatorial action setting. 
More precisely, \cref{alg:confident ma mc-lspi} with \cref{alg:uncertainty check} used for the \textsc{UncertaintyCheck} \ is equivalent to the \textsc{Confident MC-LSPI} algorithm presented in \cite{yin2021efficient}, which relies on either enumerating the action set or solving a quadratic maximization problem, both of which become infeasible for large $\mathcal{A}$ in general \citep[e.g.,][]{bhattiprolu2021framework}. 
The main challenge is to come up with a procedure that uses only polynomially many calls to the greedy oracle while also scaling polynomially in all other quantities of interest.\looseness=-1




At a high level, \cref{alg:confident ma mc-lspi} alternates between policy evaluation and policy improvement. 
For evaluation, a core set is constructed that holds a small but sufficiently diverse set of features corresponding to state-action pairs. 
For each element of the core set, the \textsc{Rollout}~routine (\cref{alg:confident rollout ma}) returns a Monte-Carlo estimate of the Q-value. 
During each rollout, the \textsc{UncertaintyCheck}~subroutine (\cref{alg:uncertainty check egss}) determines if a feature should be added to the core set. 
This procedure is repeated until no more elements are added to the core set. 
The Monte-Carlo returns from the rollouts are then used to construct a least-squares estimate of $Q_\pi(s,a)$, which in turn is used to improve the policy.


Formally, the outer loop aims to complete $K$ iterations of policy iteration.
The goal of each iteration $k$ is to estimate $Q_{\pi_{k-1}}$ using a weight vector $w_k \in \mathbb{R}^d$ and derive a new greedy policy $\pi_k$, w.r.t.~$w_k$. 
For estimation, the algorithm maintains a \emph{core set} $\mathcal{C}$ with elements corresponding to state-action pairs. The elements of the core set $z = (z_s, z_{a}, z_\phi, z_q) \in \mathcal{C}$ are tuples containing a state $z_s \in \mathcal{S}$, an action $z_{a} \in \mathcal{A}$, the corresponding feature $z_\phi \in \mathbb{R}^d$ , and a value estimate $z_q \in \mathbb{R} \cup \{\textsc{none}\}$.
We denote the vector of all value estimates in the core set as $q_\mathcal{C} = (z_q)_{z \in \mathcal{C}} \in \mathbb{R}^{|\mathcal{C}|}$.
The weight vector $w_k$ to estimate $Q_{\pi_{k-1}}$ is computed using regularized least squares, with $q_\mathcal{C}$ as the targets (line 16). 
An improved policy based on $w_k$ is then calculated by following the greedy policy with respect to $w_k^\top \phi(s,a)$ (line 17).
The core set is initialized in lines 3-8  by adding the initial state with a \emph{default action} $\bar a$, so that there is at least one element in the core set to rollout from (line 3). 
Then we continuously run the \textsc{UncertaintyCheck}~algorithm until it stops returning a status of \textsc{uncertain}, and add the uncertain tuple to the core set each time. 
This is to ensure that the final policy\footnote{The algorithm returns $\pi_{K-1}$ instead of $\pi_K$ because the proof requires that the uncertainty checks for the final policy pass. This is only ensured for $\pi_{K-1}$.} $\pi_{K-1}$  returned by the main algorithm is approximately optimal from the initial state $\rho$, and this can be ensured if all the uncertain actions (from $\rho$) are added to the core set (details in Appendix \ref{app:efficient uncertainty check}).\looseness=-1

\input{parts/algo-mclspi}
\input{parts/algo-rollout}

In each iteration $k$, a Monte-Carlo estimation procedure (\textsc{Rollout}, \cref{alg:confident rollout ma}) is launched for every element $z \in \mathcal{C}$ in the core set. 
An estimate (result in line 14) is obtained via taking the average return of $n$ Monte-Carlo rollouts of length $H$ while following policy $\pi_{k-1}$.
\textsc{Rollout}~is \emph{successful} if it returns a status of~\textsc{done}~and an estimate of $Q_{\pi_{k-1}}(z_s, z_{a})$, which is assigned to $z_q$.
If at iteration $k$ \textsc{Rollout}~is successful for every core set element then $z_q$ has a value estimate for all $z \in \mathcal{C}$, and the iteration is completed with the policy improvement step. 
The way the core set is constructed guarantees that the features of all the elements in the core set are sufficiently different to provide good target values $q_\mathcal{C}$ for least squares (\cref{prop: approx value function bound for DAV,prop: approx value function bound for EGSS}).\looseness=-1







Each time when \textsc{Rollout}~is \emph{unsuccessful}, it returns a status of \textsc{uncertain}~and a corresponding tuple. The uncertain tuple is added to the core set and policy iteration is restarted (line 14) and the value estimates for all the core set elements are reset to \textsc{none}~(line 9).
Roughly speaking, a tuple is flagged as uncertain when during the rollout a feature is observed that is sufficiently different from all the features in the core set $\{ z_\phi: z \in \mathcal{C} \}$.
Importantly, adding tuples to the core set in this way ensures that the size of the core set is bounded by $\mathcal{O}(d)$ (\cref{lemma:bound on core set size}). 
Restarting policy iteration is mainly to simplify the analysis; in practice it is reasonable to continue with the same policy.\looseness=-1






It remains to specify the \textsc{UncertaintyCheck}~subroutine that is used in \cref{alg:confident ma mc-lspi,alg:confident rollout ma}.
For a fixed state $s \in \mathcal{S}$ the purpose of the uncertainty check is to search for an \emph{uncertain action} that satisfies 
\begin{align}
\phi(s, a)^\top (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \phi(s, a) > \tau\label{eq:uncertainty-check}
\end{align}
Here $\Phi_\mathcal{C} \in \mathbb{R}^{|\mathcal{C}| \times d}$ is a matrix of all the features from the tuples in the core set stacked vertically. Solving \cref{eq:uncertainty-check} \emph{exactly} recovers the approach by \citet{yin2021efficient}. However, as this amounts to solving a positive-definite maximization problem, this is infeasible in general.








\subsection{Efficient Good Set Search (EGSS, \cref{alg:uncertainty check egss})}
Next, we show how to efficiently approximate the uncertainty check in \cref{eq:uncertainty-check}.
Define $V_\mathcal{C} = \Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I$ and a weighted matrix norm as $\|x\|_{B}^2 = x^\top B x, \ x \in \mathbb{R}^d, B \in \mathbb{R}^{d \times d}$.
Using this notation, \cref{eq:uncertainty-check} becomes
\begin{align*}
\phi(s, a)^\top (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \phi(s, a) = \|\phi(s, a)\|_{V_\mathcal{C}^{-1}}^2\,> \tau.
\end{align*}
We define the \textit{good set} to be the set of all features with $\| . \|_{V_\mathcal{C}^{-1}}^2$ weighted norm less than or equal to $\tau$ as follows
\begin{align*}
    \mathcal{D} = \{\phi(s, a): \|\phi(s, a)\|_{V_\mathcal{C}^{-1}}^2 \le \tau\}.
\end{align*}
Fix a state $s \in \mathcal{S}$.
We want to check if there exists an action outside of the good set (i.e. $a \in \mathcal{A}$ that satisfies $\|\phi(s, a)\|_{V_\mathcal{C}^{-1}}^2 > \tau$) with computation that does not depend on $|\mathcal{A}|$.
To this end, let $L L^\top  = V_\mathcal{C}^{-1}$ be a Cholesky decomposition of $V_{\mathcal{C}}^{-1}$ and define 
 $\hat a = \argmax_{a \in \mathcal{A}} \|L^\top \phi(s, a)\|_\infty$. Note that $\hat a$ satisfies the following norm inequalities:
$$\frac{1}{d}\|\phi(s, \hat a)\|_{V_\mathcal{C}^{-1}}^2 \le \|L^\top \phi(s, \hat a)\|_\infty^2 \le \|\phi(s, \hat a)\|_{V_\mathcal{C}^{-1}}^2$$
 In other words, if $\|L^\top \phi(s, \hat a)\|_\infty^2 > \tau$ holds, then  we have $\|\phi(s, \hat a)\|_{V_\mathcal{C}^{-1}}^2 > \tau$ and we have found an uncertain state-action pair.
At the same time if $\|L^\top \phi(s, \hat a)\|_\infty^2 \le \tau$ then we are sure that $\|\phi(s, a)\|_{V_\mathcal{C}^{-1}}^2 \le d \tau$ for all $a \in \mathcal{A}$.
The fact that the last inequality is still sufficient to provide bounds on the sub-optimality of policy evaluation manifests in Proposition \ref{prop: approx value function bound for EGSS}, where only an extra factor of $\sqrt{d}$ is introduced.
Finally, notice that 
\begin{align}
&\max_{a \in \mathcal{A}} \|L^\top \phi(s, a)\|_\infty = \max_{v \in \{\pm e_i\}_{i=1}^d} \max_{a \in \mathcal{A}} \langle Lv, \phi(s, a)\rangle\label{eq:infty-norm-oracle}
\end{align}
can be computed efficiently using $2d$ calls to the greedy oracle (\cref{ass: argmax oracle}).


\input{parts/algo-egss}



\subsection{Theoretical Guarantees}\label{ss:mc-lspi-egss-theory}
The result that 
characterizes the performance of
\textsc{Confident MC-LSPI}~combined with \textsc{UncertaintyCheck-EGSS}~is summarized in the next theorem.
\begin{theorem}[\textsc{Confident MC-LSPI EGSS} Sub-Optimality] \label{thm:mc-lspi-egss sub-optimality}	
        Suppose \cref{asm:linear-q-pi,ass: bounded features,ass: argmax oracle} hold.
	If $\epsilon = 0$, for any $\kappa > 0$, with probability at least $1 - \delta$, the final policy $\pi_{K-1}$, returned by \textsc{Confident MC-LSPI}~combined with \textsc{UncertaintyCheck-EGSS}~satisfies
	\begin{equation*}
		V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \kappa.
	\end{equation*}
	  The query and computation complexity are $\mathcal{O}\big(\tfrac{d^4}{\kappa^2 (1-\gamma)^8} \big)$ and $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}))$ respectively.

	  If $\epsilon > 0$, then with probability at least $1 - \delta$, the policy $\pi_{K-1}$, output satisfies
	\begin{equation*}
		V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \tfrac{64 \epsilon d}{(1-\gamma)^2} (1 +\log(1+b^2 \epsilon^{-2} d^{-1}))^{1/2}.
	\end{equation*}
	  The query and computation complexity are $\mathcal{O}\big(\tfrac{d^2}{\epsilon^2 (1-\gamma)^4} \big)$ and $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\epsilon}, \log(\frac{1}{\delta}), \log(1+b))$, respectively.
  
       All parameter settings are in \cref{app: theorem proofs}.
\end{theorem}
When compared to the result in \citet[Theorem 5.1]{yin2021efficient} we have an extra factor of $d$ in the query complexity for $\epsilon = 0$, while for $\epsilon \neq 0$ we only have an extra factor of $\sqrt{d}$ in the sub-optimality of the output policy.
This is similar to linear bandits, where an extra $\sqrt{d}$ is suffered in the regret for oracle-efficient methods \citep{dani2008stochastic,agrawal2013thompson,abeille2017linear}. \looseness=-1


The full proof is given in \cref{app: theorem proofs}. 
The proof essentially follows the ideas in \citet{yin2021efficient} while carefully arguing how \textsc{UncertaintyCheck-EGSS}~affects the query complexity. 
For the computational complexity, note that \textsc{UncertaintyCheck-EGSS}~can be implemented in $\text{poly}(d)$ by \cref{eq:infty-norm-oracle}, and linear algebra operations. 
Since the core set size is bounded (\cref{lemma:bound on core set size}), policy iteration only restarts $\mathcal{O}(d)$ times.
Lastly, the policy improvement step is trivially implemented using the greedy oracle (\cref{ass: argmax oracle}). \looseness=-1


\section{\uppercase{Efficient Policy Sampling}} \label{app:efficienct policy sampling}
    The policy in \citep{yin2021efficient} for \textsc{Confident MC-LSPI}~and \textsc{Confident MC-Politex}~is as follows
    \begin{equation}
    \pi_k(a|s) \gets 
        \begin{cases}
            \mathds{1}\left(a = \argmax\limits_{\tilde{a} \in \mathcal{A}} w^\top \phi(s, \tilde{a})\right) & \text{LSPI} \\
            \exp\left(\alpha \sum\limits_{j=0}^{k-1} Q_j(s, a)\right) / \sum\limits_{\tilde{a} \in \mathcal{A}} \exp\left(\alpha \sum\limits_{j=0}^{k-1} Q_j(s, \tilde{a})\right) & \text{Politex}
        \end{cases} \label{eq:yin policy}
    \end{equation}
    with $w_k = (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \Phi_\mathcal{C}^\top q_\mathcal{C}$ 
    and $Q_{k-1}(s, a) = \min\{\max\{w_k^\top \phi(s, a), 0\}, 1/(1-\gamma)\}$ for the Politex case only.
    In this section we show that the above policy can be sampled from efficiently if \cref{ass: feature decomposition}  or \cref{ass: argmax oracle} is satisfied for the LSPI case and policy $\pi_k$ can be sampled from efficiently if \cref{ass: feature decomposition} is satisfied for the Politex case.
    To be precise, by efficiently we mean with computation that does not depend on $|\mathcal{A}|$.
    We assume only $w \in \mathbb{R}^d$ or $w_0, ..., w_{k-1} \in \mathbb{R}^d$ (for LSPI and Politex respectively) and a feature map $\phi: \mathcal{S} \times \mathcal{A} \to \mathbb{R}^d$ are given, thus the process of sampling may require calculating the policy if necessary to accurately sample.
    First we handle the LSPI case.
    
    \begin{proposition}[Efficient LSPI Policy Sampling]
        \label{prop: efficient lspi policy sampling}
        Given state $s \in \mathcal{S}$, parameter vector $w \in \mathbb{R}^d$, feature map $\phi: \mathcal{S} \times \mathcal{A} \to \mathbb{R}^d$ and assumption \cref{ass: feature decomposition} or \cref{ass: argmax oracle} satisfied. 
        Then policy
        $$\pi_k(a|s) = \mathds{1}\left(a = \argmax_{\tilde{a} \in \mathcal{A}} w^\top \phi(s, \tilde{a})\right)$$
        can be sampled from with computation that does not depend on $|\mathcal{A}|$.
    \end{proposition}
    \begin{proof}
        One can sample from policy $\pi_k$ by simply outputting the result of $\argmax_{\tilde{a} \in \mathcal{A}} w^\top \phi(s, \tilde{a})$. 
        Under assumption \cref{ass: argmax oracle} $\argmax_{\tilde{a} \in \mathcal{A}} w^\top \phi(s, \tilde{a})$ can be computed in constant time by applying the oracle to $w$ and $\phi$ (i.e. $\mathcal{G}(w, \phi)$).
        While, \cref{ass: feature decomposition} implies we can compute $\argmax_{\tilde{a} \in \mathcal{A}} w^\top \phi(s, \tilde{a})$ in $\text{poly}(\sum_{i=1}^m |A^\ag{i}|, d)$ time, since 
        \begin{align*}
	&\argmax\nolimits_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} w^\top \phi(s, a^{(1:m)})
	\\
	&=\big(\argmax_{a^{(1)} \in \mathcal{A}^{(1)}} w^\top \phi_1(s, a^{(1)}), ..., \argmax_{a^{(m)} \in \mathcal{A}^{(m)}} w^\top \phi_m(s, a^{(m)})\big)
\end{align*}
    \end{proof}

    Next, we handle the Politex case. 
    To achieve the result below we assume \cref{ass: feature decomposition} is satisfied.
   
    We have to modify the Politex policy in \cref{eq:yin policy} slightly, by removing the clipping of the $Q$-function at each iteration $k$ (i.e. we define the $Q$-function at iteration $k$ to be $Q_{k-1}(s, a) = w_k^\top \phi(s, a)$ instead of $Q_{k-1}(s, a) = \min\{\max\{w_k^\top \phi(s, a), 0\}, 1/(1-\gamma)\}$).
    This was done since we were not aware of an efficient way to compute the clipped $Q$-function for all action-vectors in $\mathcal{A}^{(1:m)}$.
    Importantly, removing the clipping does not increase the dominating terms of the final policy's sub-optimality (shown in \cref{app: theorem proofs}).
    \begin{proposition}[Efficient Politex Policy Sampling]
        \label{prop: efficient politex policy sampling}
        Given state $s \in \mathcal{S}$, parameter vectors $w_0, ..., w_{k-1} \in \mathbb{R}^d$, feature map $\phi: \mathcal{S} \times \mathcal{A}^{(1:m)} \to \mathbb{R}^d$ and \cref{ass: feature decomposition} satisfied. 
        Then policy
        $$\pi_k(a^{(1:m)}|s) = \exp\left(\alpha \sum\nolimits_{j=0}^{k-1} w_{j}^\top \phi(s, a^{(1:m)})\right)/ \sum\nolimits_{\tilde{a}^{(1:m)} \in \mathcal{A}^{(1:m)}} \exp \left(\alpha \sum\nolimits_{j=0}^{k-1} w_{j}^\top \phi(s, \tilde{a}^{(1:m)})\right)$$ 
        with $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ can be sampled from in time $\text{poly}(\sum_{i=1}^m |\mathcal{A}^\ag{i}|, d)$.
    \end{proposition}

    \begin{proof}
    Fix arbitrary $a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    To sample from $\pi_k$ it is sufficient to sample actions $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ proportional to $\exp(\alpha \sum_{j=0}^{k-1} Q_{j}(s, a^{(1:m)}))$.
    Rearranging $\exp(\alpha \sum_{j=0}^{k-1} Q_{j}(s, a^{(1:m)}))$ and plugging in that $\phi(s, a^{(1:m)}) = \sum_{i=1}^m \phi_i(s, a^\ag{i})$ under assumption \cref{ass: feature decomposition} we have
    \begin{align*}
        \exp\left(\alpha \sum\nolimits_{j=0}^{k-1} w_{j}^\top \phi(s, a^{(1:m)})\right) 
        &= \prod_{j=0}^{k-1} \exp\left(\alpha w_{j}^\top \phi(s, a^{(1:m)})\right) \\
        &= \prod_{j=0}^{k-1} \exp\left(\alpha w_{j}^\top \sum_{i=1}^m \phi_i(s, a^\ag{i})\right) \\
        &= \prod_{i=1}^m \prod_{j=0}^{k-1} \exp\left(\alpha w_{j}^\top \phi_i(s, a^\ag{i})\right)
    \end{align*}

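    This means that the unnormalized probability of $a^{(1:m)}$ factorizes across agents, so the probability of sampling action $a^{(1:m)}$ is equal to the product of the probabilities of independently sampling each $a^\ag{i}$, $i \in [m]$, from the distribution over $\mathcal{A}^\ag{i}$ that is proportional to $\prod_{j=0}^{k-1} \exp(\alpha w_{j}^\top \phi_i(s, a^\ag{i}))$.
    Computing and sampling from the $i$-th of these distributions only requires enumerating the $|\mathcal{A}^\ag{i}|$ actions of agent $i$, rather than all of $\mathcal{A}^{(1:m)}$.
    Since $a^{(1:m)}$ was arbitrary this completes the proof.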
   
   
    \end{proof}


    \section{\uppercase{Bound on Core Set Size}}
    \citet{yin2021efficient} showed that when only tuples containing state-action vectors that satisfy $\phi(s, a)^\top (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \phi(s, a) > \tau$ are added to the core set, its size can be bounded as follows.
   
    
    \begin{lemma}[Bound on Core Set Size (Lemma 5.1 in \citep{yin2021efficient})] \label{lemma:bound on core set size}
        When \cref{ass: bounded features} is satisfied, and $(s,  a) \in (\mathcal{S} \times \mathcal{A})$ that satisfy $\phi(s, a)^\top (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \phi(s, a) > \tau$ are added to the core set, the size of the core set can be bounded by
        \begin{align}
             \tilde{C}_{\max} := \frac{e}{e-1} \frac{1 + \tau}{\tau} d \left( 
                \log(1 + \frac{1}{\tau}) +
                \log(1 + \frac{1}{\lambda})
            \right). \label{eq:cmax-new}       
        \end{align}
    \end{lemma}

   
   

    \section{\uppercase{Efficient Uncertainty Check}} \label{app:efficient uncertainty check}
    \label{app: efficient uncertainty check}
   
    The \textsc{Confident MC-LSPI}~algorithm proposed by \citet{yin2021efficient} is the same as our \textsc{Confident MC-LSPI}~(\cref{alg:confident ma mc-lspi}) algorithm combined with \textsc{UncertaintyCheck}~(\cref{alg:uncertainty check}) and the policy on line 17 of \textsc{Confident MC-LSPI}~replaced with \cref{eq:yin policy}.
    \begin{algorithm}
    \caption{\textsc{UncertaintyCheck}} \label{alg:uncertainty check}  
    \begin{algorithmic}[1]
    \State \textbf{Input:} state $s$, core set $\mathcal{C}$, threshold $\tau$
    \For {$a \in \mathcal{A}$}
        \If {$\phi(s, a)^\top (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \phi(s, a) > \tau$}
            \State status $\gets$ \textsc{uncertain}, result $\gets (s, a, \phi(s, a), \textsc{none})$
            \State \Return {status, result}
        \EndIf
    \EndFor 
    \State \Return \textsc{certain}, \textsc{none} 
    \end{algorithmic}
    \end{algorithm}
    Notice that \textsc{UncertaintyCheck}~requires iterating over $\mathcal{A}$ (line 2), which is computationally expensive when the action space is combinatorially large.
    In this appendix we show how the loop over all actions $a \in \mathcal{A}$ in the \textsc{UncertaintyCheck}~algorithm can be avoided when either \cref{ass: feature decomposition} or \cref{ass: argmax oracle} is satisfied.
    In particular, we show that \textsc{UncertaintyCheck-DAV}~and \textsc{UncertaintyCheck-EGSS}~algorithms are able to reduce the computation time of \textsc{UncertaintyCheck}~to no longer depend on $|\mathcal{A}|$, while still maintaining suitable output policy guarantees.

    Since we are extending the \textsc{Confident MC-LSPI}~algorithm proposed by \citet{yin2021efficient}, we borrow many of the steps from their proof.
    \citet{yin2021efficient} used a \textit{virtual algorithm} (VA) and \textit{main algorithm} (MA) to prove the sub-optimality of their \textsc{Confident MC-LSPI}~algorithm.
    We give a brief summary of the VA and MA; however, we omit the full details since we use exactly the same definitions as \citet{yin2021efficient}. 
    Until the next subsection, assume \textsc{UncertaintyCheck}~is used in \textsc{Confident MC-LSPI}~and \textsc{Rollout}.
    The MA is exactly \textsc{Confident MC-LSPI}. 
    The VA is based on the \textsc{Confident MC-LSPI}~algorithm, but has some differences, which we outline next. 
    The VA runs for exactly $C_\text{max}$ loops, $K$ iterations, and completes all $n$ of its rollouts of length $H$. 
    For each loop and iteration $k$ the VA always obtains estimates $q_\mathcal{C}$ of its policy.
    The VA uses a different policy than the MA for rollouts.
    We will first focus on the LSPI case and return to Politex much later.
    The VA's $Q$-function at iteration $k$ is  
    \begin{equation*}
        \tilde{Q}_{k-1}(s, a)=\begin{cases}
                          \tilde{w}_k^\top \phi(s, a) \quad &\text{if} \, \phi(s, a) \in \mathcal{D} \\
                          Q_{\tilde{\pi}_{k-1}}(s, a)     \quad &\text{if} \, \phi(s, a) \notin \mathcal{D} \\
                    \end{cases}
    \end{equation*}
    where $\tilde w_k = V_\mathcal{C}^{-1} \Phi_\mathcal{C}^\top \tilde q_\mathcal{C}$, and $\tilde q_\mathcal{C}$ are the estimates obtained from running \textsc{Rollout}~on each element of the core set, and $\mathcal{D} = \{\phi(s, a): \|\phi(s, a)\|_{V_\mathcal{C}^{-1}}^2 \le \tau\}$ is the \textit{good set}.
    The VA's policy is
    \begin{equation*}
        \tilde \pi_k(a | s) = \mathds{1} \left( a = \argmax_{\tilde a \in \mathcal{A}} \tilde Q_{k-1}(s, \tilde a) \right).
    \end{equation*}

    The nice thing about defining the VA's policy in this way is that we can make use of the following Lemma from \citep{yin2021efficient}.
    
    \begin{lemma}[Lemma B.2 in \citep{yin2021efficient}]
    \label{lemma: yin lemma b.2}
    Suppose that \cref{asm:linear-q-pi} holds. 
    With all terms as defined earlier and $\theta > 0$. 
    Then, with probability at least 
    $$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
    for any $(s,  a) \in (\mathcal{S} \times \mathcal{A})$ pair such that $\phi(s, a) \in \mathcal{D}$, we have 
    $$|\tilde{Q}_{k-1} (s, a) - Q_{\tilde\pi_{k-1}} (s, a)| \le b\sqrt{\lambda \tau} + \left(\epsilon +  \frac{\gamma^{H-1}}{1 - \gamma} + \theta \right) \sqrt{\tau C_{\text{max}}} + \epsilon := \eta$$
    \end{lemma}

    Notice that for any $(s,  a) \in (\mathcal{S} \times \mathcal{A})$ pair such that $\phi(s, a) \notin \mathcal{D}$, the VA's $Q$-function $\tilde Q_{k-1}$ has access to the true $Q$-function $Q_{\tilde{\pi}_{k-1}}$ of policy $\tilde \pi_{k-1}$.
    Thus, we have that 
    \begin{equation}
    \label{eq:va inf-norm bound}
        \| \tilde{Q}_{k-1} (s, a) - Q_{\tilde\pi_{k-1}} (s, a) \|_\infty \le \eta 
    \end{equation}
    Combined with the fact that $\tilde \pi_k$ is greedy w.r.t. $\tilde Q_{k-1}$ the above result turns out to be especially useful.
    
    To understand why, we state a classic policy improvement result, which can be found as Lemma B.3 in \citet{yin2021efficient} and in other papers.
    \begin{lemma}[approximate policy iteration]
    \label{lemma:approximate policy iteration}
        Suppose that we run K approximate policy iterations and generate a sequence of policies
        $\pi_0, \pi_1, \pi_2, \cdots, \pi_K$.
        Suppose that for every $k = 1, 2, \cdots, K$, in the k-th iteration, we obtain a function
        $\tilde{Q}_{k-1}$ such that, $\| \tilde{Q}_{k - 1} - Q_{\pi_{k - 1}} \|_\infty \leq \eta$,
        and choose $\pi_k$ to be greedy with respect to $\tilde{Q}_{k-1}$.
        Then
        \begin{align*}
            \| Q^* - Q_{\pi_K} \|_\infty \leq \frac{2 \eta}{1 - \gamma} + \frac{\gamma^K}{1 - \gamma}.
        \end{align*}
    \end{lemma}

    In our case the VA's policy $\tilde \pi_k$ is greedy w.r.t. $\tilde Q_{k-1}$ and thus we have that
    \begin{align*}
        \| Q^* - Q_{\tilde \pi_K} \|_\infty \leq \frac{2 \eta}{1 - \gamma} + \frac{\gamma^K}{1 - \gamma}.
    \end{align*}

    Now we explain how the MA can be related to the VA, and make use of the above result.
    The \textsc{UncertaintyCheck}~algorithm can have two cases: 
    
    \textbf{Case 1:} $\|\phi(s, a)\|_{V_\mathcal{C}^{-1}}^2 > \tau$ holds for at least one $a \in \mathcal{A}$,
    
    \textbf{Case 2:} $\|\phi(s, a)\|_{V_\mathcal{C}^{-1}}^2 \le \tau$ holds for all $a \in \mathcal{A}$. This is equivalent to saying $\phi(s, a) \in \mathcal{D}, \ \forall a \in \mathcal{A}$.

    The VA is exactly the same as the MA, until Case 1 occurs for the first time.
    This is because the MA's and VA's simulators are coupled, in the sense that at iteration $k$, rollout $i$, and step $t$, when both simulators are queried with the same state-action vector pairs, they sample the exact same next state and reward. 
    The VA also uses the same initial policy as the MA at the start of policy iteration for every loop.
    Once Case 1 occurs the MA would restart policy iteration (else condition in line 14 of \textsc{Confident MC-LSPI}), while the VA does not. 
    The VA records the state-action vector pair when Case 1 occurs for the first time and adds it to the core set once it completes running policy iteration for the current loop.
    In this way the core set maintained by the MA and VA are always the same.
   
   
    Since the size of the core set is bounded by $C_\text{max}$ when $(s, a) \in (\mathcal{S} \times \mathcal{A})$ that satisfy $\phi(s, a)^\top (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \phi(s, a) > \tau$ are added to the core set (\cref{lemma:bound on core set size}), there will be a loop of policy iteration at which the MA and VA never encounter Case 1 for any of the $K$ iterations of policy iteration.
    We call this loop the \emph{final loop}.
    This is equivalent to saying that all $(s,  a) \in (\mathcal{S} \times \mathcal{A})$ observed during all $K$ iterations of policy iteration in the final loop are in the good set (i.e. $\phi(s, a) \in \mathcal{D}$).
    Notice that this means the MA and VA behave identically in the final loop, since the VA's policy would have always been greedy w.r.t. $\tilde{w}_k^\top \phi$ and the MA and VA use the same initial policy at the start of each loop.
    It turns out this relationship between the MA and VA allows us to bound the sub-optimality of the MA in the final loop, by using the result in \cref{eq:va inf-norm bound} we have for the VA. 
    More precisely, the following result can be extracted from \citep{yin2021efficient}
    \begin{proposition}[equation (B.15) in \citet{yin2021efficient}] \label{prop:optimality of output policy}
    With all terms as defined earlier. 
    Define $\eta \ge \|\tilde Q_{k-1}(s, a) - Q_{\tilde \pi_{k-1}}(s, a)\|_\infty$. 
    Suppose $\eta \ge |\tilde w_{k}^\top \phi(\rho, a) - Q_{\tilde \pi_{k-1}}(\rho, a)|, \ \forall a \in \mathcal{A}$.
    Then, if the VA and MA behave identically in the final loop, with probability at least $1 - 4KC_{\text{max}}^2 \exp(-2 \theta^2(1-\gamma)^2 n)$ we have
    \begin{align}
        V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \frac{8 \eta}{(1 - \gamma)^2} + \frac{2 \gamma^{K - 1}}{(1 - \gamma)^2} \label{eq: v function bound main}
    \end{align}
    \end{proposition}
    
    Notice, that we require three things to use the above result. 
    We need a bound on $\|\tilde Q_{k-1}(s, a) - Q_{\tilde \pi_{k-1}}(s, a)\|_\infty$. 
    We need a bound on $|\tilde w_{k}^\top \phi(\rho, a) - Q_{\tilde \pi_{k-1}}(\rho, a)|, \ \forall a \in \mathcal{A}$.
    We need to ensure that the VA and MA behave identically in the final loop. 
    Then, we can get a bound on the sub-optimality of the MA's output policy $\pi_{K-1}$.
    An important observation is that \textsc{UncertaintyCheck}~ensured that MA and VA behave identically in the final loop.
    It did this by making sure that the VA's policy $\tilde \pi_k$ would only be able to use $\tilde{w}_k^\top \phi$ to derive its actions, since \textsc{UncertaintyCheck}~always returns a status of \textsc{certain} in the final loop, which means that $\phi(s, a) \in \mathcal{D}$ for all $s, a \in \mathcal{S} \times \mathcal{A}$ encountered in the final loop.
    With this information in mind, we now show that \textsc{UncertaintyCheck-DAV}~and \textsc{UncertaintyCheck-EGSS}~only requires computation independent of $|\mathcal{A}|$, while providing only slightly worse sub-optimality guarantees when compared to the result in \citep{yin2021efficient}.

   
   

   
    \subsection{Efficient Good Set Search Approach (EGSS)} \label{subsec:good set search}
    
    In this section we prove some useful results for \textsc{UncertaintyCheck-EGSS}.
    Fix a state $s \in \mathcal{S}$.
    First, we show that with computation independent of $|\mathcal{A}|$, one can find an action vector $a \in \mathcal{A}$ that approximately maximizes $\phi(s, a)^\top V_\mathcal{C}^{-1} \phi(s, a)$.
    
    \begin{lemma}[Efficient good set search]
    \label{lemma:good set search}
    Assume \cref{ass: argmax oracle} is satisfied.
    With all terms as defined earlier. 
    One can ensure, with $2 d$ calls to the greedy oracle that
    $$\phi(s, a)^\top V_\mathcal{C}^{-1} \phi(s, a) \le d\tau$$
    for all $a \in \mathcal{A}$, or there exists an $a \in \mathcal{A}$ such that
    $$\phi(s, a)^\top V_\mathcal{C}^{-1} \phi(s, a) > \tau.$$
    Further, if \cref{ass: feature decomposition} is satisfied, then the same guarantees hold with $2 d^2 \sum_{i=1}^m |A^\ag{i}|$ computation time.
    \end{lemma}
    
    \begin{proof}
    Recall that we are able to compute $\max_{a \in \mathcal{A}} \langle u, \phi(s, a) \rangle$ for any $u \in \mathbb{R}^d$ in constant time (a single oracle call) if \cref{ass: argmax oracle} is satisfied, and in $d \sum_{i=1}^m |\mathcal{A}^\ag{i}|$ time if \cref{ass: feature decomposition} is satisfied.
    We make use of a bi-directional 2-norm to $\infty$-norm inequality that will take advantage of the above mentioned efficient computation.
    Fix $\mathcal{C}$ and define the lower triangular matrix $L$ via the Cholesky decomposition $V_\mathcal{C}^{-1} = L L^\top$.
    Define $\{e_i\}_{i=1}^d$ as the standard basis vectors and 
    \begin{equation*}
    (v^*, a_\text{max}) := \text{arg} \left(\max_{v \in \{\pm e_i\}_{i=1}^d} \max_{a \in \mathcal{A}} \langle L v, \phi(s, a) \rangle \right) 
    \end{equation*}
    Then we have that
    \begin{align}
       \frac{1}{d} \| \phi(s, a_\text{max}) \|_{V_\mathcal{C}^{-1}}^2
       &= \frac{1}{d} \phi(s, a_\text{max})^\top V_\mathcal{C}^{-1} \phi(s, a_\text{max}) \nonumber \\
       &= \frac{1}{d} \phi(s, a_\text{max})^\top L L^\top \phi(s, a_\text{max}) \nonumber \\
       &= \frac{1}{d} \| L^\top \phi(s, a_\text{max}) \|_2^2 \nonumber \\
       &\le \max_{a \in \mathcal{A}} \| L^\top \phi(s, a)\|_\infty^2 \nonumber \\
       &= \max_{v \in \{\pm e_i\}_{i=1}^d} \max_{a \in \mathcal{A}} \langle v, L^\top \phi(s, a) \rangle^2 \nonumber \\
       &= \max_{v \in \{\pm e_i\}_{i=1}^d} \max_{a \in \mathcal{A}} \langle L v, \phi(s, a) \rangle^2 \label{inf-norm term} \\
       &= \langle L v^*, \phi(s, a_\text{max}) \rangle^2  \nonumber  \\
       &\le \| L^\top \phi(s, a_\text{max}) \|_2^2 \nonumber 
    \end{align}
    The purpose of writing all the equalities up to \cref{inf-norm term} was to show that \cref{inf-norm term} can be computed efficiently. 
    This is since we are able to compute $\max_{a \in \mathcal{A}} \langle L v, \phi(s, a) \rangle^2$ in constant time (a single oracle call) if \cref{ass: argmax oracle} is satisfied, and in $d \sum_{i=1}^m |\mathcal{A}^\ag{i}|$ time if \cref{ass: feature decomposition} is satisfied, and $\{\pm e_i\}_{i=1}^d$ contains $2d$ elements.
   
    Also, note that $L$ can be computed with at most $d^2$ computation in each loop by doing a rank one update to the Cholesky decomposition of $V_\mathcal{C}^{-1} = L L^\top$.
    
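    If \cref{inf-norm term} is larger than $\tau$, then the last inequality above gives $\|\phi(s, a_\text{max}) \|_{V_\mathcal{C}^{-1}}^2 > \tau$.
    Conversely, if \cref{inf-norm term} is less than or equal to $\tau$, then, since $\frac{1}{d}\|\phi(s, a)\|_{V_\mathcal{C}^{-1}}^2 = \frac{1}{d}\|L^\top \phi(s, a)\|_2^2 \le \|L^\top \phi(s, a)\|_\infty^2 \le \max_{a' \in \mathcal{A}} \|L^\top \phi(s, a')\|_\infty^2$ holds for every $a \in \mathcal{A}$, we have $\|\phi(s, a) \|_{V_\mathcal{C}^{-1}}^2 \le d\tau$ for all $a \in \mathcal{A}$, completing the proof.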
    \end{proof}

    \textsc{UncertaintyCheck-EGSS}~is essentially an implementation of equation (\cref{inf-norm term}), thus its computation is independent of $|\mathcal{A}|$, as stated in \cref{lemma:good set search}.
    Also, since only $a \in \mathcal{A}$ that satisfy $\|\phi(s, a)\|_{V_\mathcal{C}^{-1}}^2 \ge \|L^\top \phi(s, a)\|_\infty^2 > \tau$ are added to the core set, we can still use \cref{lemma:bound on core set size} to bound the size of the core set by $C_\text{max}$.
    Basically, \cref{inf-norm term} is an underestimate of $\| \phi(s, a_\text{max}) \|_{V_\mathcal{C}^{-1}}^2$ and we only add elements to the core set when it is larger than $\tau$, thus the core set is no larger than it was when using \textsc{UncertaintyCheck}.
    
    Now, we aim to ensure that the VA and MA behave identically in the final loop.
    Notice that \textsc{UncertaintyCheck-EGSS}~provides a weaker guarantee than \textsc{UncertaintyCheck}, when the returned result is \textsc{certain}.
    Specifically, when \textsc{UncertaintyCheck-EGSS}~returns a result of \textsc{certain}, then \cref{lemma:good set search} guarantees that $\|\phi(s, a)\|_{V_\mathcal{C}^{-1}}^2 \le d\tau$ for all $a \in \mathcal{A}$.
    While when the \textsc{UncertaintyCheck}~returns a result of \textsc{certain}, then $\|\phi(s, a)\|_{V_\mathcal{C}^{-1}}^2 \le \tau$ for all $a \in \mathcal{A}$.
    Thus, we define a larger (relaxed) good set $\mathcal{D}_d = \{ \phi(s, a): \|\phi(s, a)\|_{V_\mathcal{C}^{-1}}^2 \le d\tau\} \supseteq \mathcal{D}$.
    
    Redefine the VA's $Q$-function at iteration $k$ as
    \begin{equation*}
        \tilde{Q}_{k-1}(s, a)=\begin{cases}
                          \tilde{w}_k^\top \phi(s, a) \quad &\text{if} \, \phi(s, a) \in \mathcal{D}_d \\
                          Q_{\tilde{\pi}_{k-1}}(s, a)     \quad &\text{if} \, \phi(s, a) \notin \mathcal{D}_d \\
                    \end{cases}
    \end{equation*}
    and VA's policy as
    \begin{equation*}
        \tilde \pi_k(a | s) = \mathds{1} \left( a = \argmax_{\tilde a \in \mathcal{A}} \tilde Q_{k-1}(s, \tilde a) \right).
    \end{equation*}
    Notice that in the final loop \textsc{UncertaintyCheck-EGSS}~always returns a status of \textsc{certain}, and thus we are sure that $\phi(s, a) \in \mathcal{D}_d$ for all $a \in \mathcal{A}$ and all the states $s$ encountered in the final loop.
    Thus, the VA's policy $\tilde \pi_{k}$ would always be greedy w.r.t. $\tilde w_k^\top \phi$ in the final loop.
    This ensures that the VA and MA behave identically in the final loop.

    Next we need to show that we can bound $\|\tilde Q_{k-1}(s, a) - Q_{\tilde \pi_{k-1}}(s, a)\|_\infty$ with this new definition of $\tilde Q_{k-1}$. 
    First we state a slight modification of \cref{lemma: yin lemma b.2} that holds for the good set $\mathcal{D}_d$.
    
    \begin{lemma}[EGSS modified Lemma B.2 from \cite{yin2021efficient}]
    \label{lemma: mod b.2 egss}
    Suppose that \cref{asm:linear-q-pi} holds. 
    With all terms as defined earlier and $\theta > 0$. 
    Then, with probability at least 
    $$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
    for any $(s,  a) \in (\mathcal{S} \times \mathcal{A})$ pair such that $\phi(s, a) \in \mathcal{D}_d$, we have 
    $$|\tilde{w}_k^\top \phi(s, a) - w_{\tilde\pi_{k-1}}^\top \phi(s, a)| \le b\sqrt{\lambda d \tau} + \left(\epsilon + \frac{\gamma^{H-1}}{1 - \gamma} + \theta \right) \sqrt{d \tau C_{\text{max}}} + \epsilon = \sqrt{d} \bar \eta:= \eta_2$$
    \end{lemma}
    
    \begin{proof}
    The proof is identical to that of Lemma B.2 from \cite{yin2021efficient} except $\tau$ is replaced with $d \tau$ everywhere, due to the weaker guarantee of \textsc{UncertaintyCheck-EGSS} as discussed above. 
    \end{proof}
    
    Essentially we get an extra $\sqrt{d}$ factor because $\mathcal{D}_d$ uses the relaxed threshold $d\tau$ in place of $\tau$. 
    Since the VA's $Q$-function $\tilde Q_{k-1}$ has access to the true $Q$-function $Q_{\tilde \pi_{k-1}}$ for all $\phi(s, a) \notin \mathcal{D}_d$,
    we can show that $\|\tilde Q_{k-1}(s, a) - Q_{\tilde \pi_{k-1}}(s, a)\|_\infty$ can be bounded.
    
    \begin{proposition}[approximate value function bound for EGSS]
    \label{prop: approx value function bound for EGSS}
    Suppose that \cref{asm:linear-q-pi} holds. 
    With all terms as defined earlier and $\theta > 0$. 
    Then, with probability at least 
    $$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
    we have
    $$\|\tilde Q_{k-1}(s, a) - Q_{\tilde{\pi}_{k-1}}(s, a)\|_\infty \le \eta_2.$$ 
    \end{proposition}

    \begin{proof}
    For any $(s, a) \in (\mathcal{S} \times \mathcal{A})$ such that $\phi(s, a) \in \mathcal{D}_d$, we have
    \begin{align}
        |\tilde Q_{k-1}(s, a) - Q_{\tilde{\pi}_{k-1}}(s, a)| \le \eta_2
    \end{align}
    by \cref{lemma: mod b.2 egss}.
    While for any $(s, a) \in (\mathcal{S} \times \mathcal{A})$ such that $\phi(s, a) \notin \mathcal{D}_d$, we have
    \begin{align}
        |\tilde Q_{k-1}(s, a) - Q_{\tilde{\pi}_{k-1}}(s, a)| 
        = |Q_{\tilde \pi_{k-1}}(s, a) - Q_{\tilde{\pi}_{k-1}}(s, a)|
        &= 0 
    \end{align}
    \end{proof}

    Finally, it is left to show that $|\tilde w_{k}^\top \phi(\rho, a) - Q_{\tilde \pi_{k-1}}(\rho, a)|$  can be bounded for all $a \in \mathcal{A}$.
    Notice that lines 4-8 in \textsc{Confident MC-LSPI}~run \textsc{UncertaintyCheck-EGSS}~with state $\rho$ as input until the returned status is \textsc{certain}.
    Recall that once \textsc{UncertaintyCheck-EGSS}~returns a status of \textsc{certain}~we know that $\phi(\rho, a) \in \mathcal{D}_d$ for all $a \in \mathcal{A}$.
    Thus, we can immediately apply \cref{lemma: mod b.2 egss} to bound $\eta_2 \ge |\tilde w_{k}^\top \phi(\rho, a) - Q_{\tilde \pi_{k-1}}(\rho, a)|, \ \forall a \in \mathcal{A}$.


       
    \subsection{Default Action Vector (DAV) Method}
    In this section we prove some useful results for \textsc{UncertaintyCheck-DAV}.
    Fix a state $s \in \mathcal{S}$.
    As mentioned in the body, assume the action space can be decomposed as a product $\mathcal{A}^{(1:m)} = \mathcal{A}^\ag{1} \times ... \times \mathcal{A}^\ag{m}$ throughout this section.
    We call elements of $\mathcal{A}^{(1:m)}$ \textit{action vectors}.
    First, \textsc{UncertaintyCheck-DAV}~only iterates over $\sum_{i=1}^m |A^\ag{i}|$ action vectors instead of all the action vectors like \textsc{UncertaintyCheck}~does.
    Define the set of modified default action vectors, which has size at most $\sum_{i=1}^m |\mathcal{A}^\ag{i}|$, as $\bar \mathcal{A}^{(1:m)} = \{ (a^\ag{i}, \bar a^{(-i)}): a^\ag{i} \in \mathcal{A}^\ag{i}, \ i \in [m] \}$.
    Notice \textsc{UncertaintyCheck-DAV}~iterates over all the action vectors $a^{(1:m)} \in \bar \mathcal{A}^{(1:m)}$ and checks if any of them satisfy $\|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 > \tau$.
    This of course achieves the goal of compute independent of $|\mathcal{A}^{(1:m)}|$, since there are at most $\sum_{i=1}^m |\mathcal{A}^\ag{i}|$ action vectors in $\bar \mathcal{A}^{(1:m)}$ to iterate over now.
    Also, since only $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ that satisfy $\|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 > \tau$ are added to the core set, we can still use \cref{lemma:bound on core set size} to bound the size of the core set by $C_\text{max}$.

    Now, we aim to ensure that the VA and MA behave identically in the final loop.
    Define the set of states for which all the modified default action vectors are in the good set as $\bar \mathcal{S} = \{ s \in \mathcal{S}: \|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 \le \tau, \forall a^{(1:m)} \in \bar \mathcal{A}^{(1:m)} \}$.
    Redefine the VA's $Q$-function as 
    \begin{equation*}
    \tilde Q_{k-1}(s, a^{(1:m)})=\begin{cases}
          \tilde w_k^\top \phi(s, a^{(1:m)}) \quad & s \in \bar \mathcal{S} \\
          Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)}) \quad & s \in \mathcal{S} \backslash \bar \mathcal{S}.
        \end{cases}
    \end{equation*}
    The VA's policy is
    \begin{equation*}
        \tilde \pi_k(a^{(1:m)} | s) = \mathds{1} \left( a^{(1:m)} = \argmax_{\tilde a^{(1:m)} \in \mathcal{A}^{(1:m)}} \tilde Q_{k-1}(s, \tilde a^{(1:m)}) \right).
    \end{equation*}
    Notice that in the final loop the check $\phi(s, (a^{(j)}, \bar a^{(-j)}))^\top (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \phi(s, (a^{(j)}, \bar a^{(-j)})) > \tau$ in \textsc{UncertaintyCheck-DAV}~never returns \textsc{True}, and thus we are sure that all $a^{(1:m)} \in \bar \mathcal{A}^{(1:m)}$ for all the states encountered in the final loop are in the good set.
    Notice that the states that satisfy this condition are exactly the states in $\bar \mathcal{S}$.
    Thus, the VA's policy $\tilde \pi_{k}$ would always be greedy w.r.t. $\tilde w_k^\top \phi$ in the final loop.
    This ensures that the VA and MA behave identically in the final loop.

    Now we show that we can bound $\|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$ with this new definition of $\tilde Q_{k-1}$. 
    First we state a slight modification of \cref{lemma: yin lemma b.2} for $w_{\tilde{\pi}_{k-1}}^\top \phi$ instead of $Q_{\tilde{\pi}_{k-1}}$ which excludes the $\|w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})\|_\infty \le \epsilon$ term in the proof of Lemma B.2 in \citep{yin2021efficient}.
    
    \begin{lemma}[Lemma B.2 in \citep{yin2021efficient}]
    \label{lemma: yin lemma b.2 no epsilon}
    Suppose that \cref{ass: feature decomposition} holds. 
    With all terms as defined earlier and $\theta > 0$. 
    Then, with probability at least 
    $$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
    for any $(s,  a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ pair such that $\phi(s, a^{(1:m)}) \in \mathcal{D}$, we have 
    $$|\tilde{w}_{k}^\top \phi(s, a^{(1:m)}) - w_{\tilde\pi_{k-1}}^\top \phi(s, a^{(1:m)})| \le b\sqrt{\lambda \tau} + \left(\epsilon +  \frac{\gamma^{H-1}}{1 - \gamma} + \theta \right) \sqrt{\tau C_{\text{max}}}:= \bar\eta.$$
    \end{lemma}

    The following Proposition gives us a bound on $\|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$.
    
    \begin{proposition}[approximate value function bound for DAV]
    \label{prop: approx value function bound for DAV}
    Suppose that \cref{ass: feature decomposition} holds. 
    With all terms as defined earlier and $\theta > 0$. 
    Then, with probability at least 
    $$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
    we have
    $$\|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})\|_\infty \le \bar\eta (2m-1) + \epsilon := \eta_1.$$ 
    \end{proposition}
    
    \begin{proof}
    
    For any $(s, a^{(1:m)}) \in (\bar \mathcal{S} \times \mathcal{A}^{(1:m)})$, we have
    \begin{align}
        & |\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| \nonumber \\
        &= |\tilde{w}_k^\top \phi(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| \nonumber \\
        &= |\tilde{w}_k^\top \phi(s, a^{(1:m)}) \pm w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| \nonumber \\
        &\le |\tilde{w}_k^\top \phi(s, a^{(1:m)}) - w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)})| + |w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| \nonumber \\
        &\le |\tilde{w}_k^\top \phi(s, a^{(1:m)}) - w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)})| + \epsilon \nonumber \\
        &= |\tilde{w}_k^\top \phi(s, a^{(1:m)}) - w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)}) \pm (m-1)\tilde{w}_k^\top \phi(s, \bar a^{(1:m)}) \pm (m-1) w_{\tilde{\pi}_{k-1}}^\top \phi(s, \bar a^{(1:m)})| + \epsilon \nonumber \\
        &= \left| \sum_{i=1}^m \left( \tilde{w}_k^\top \phi(s, (a^\ag{i}, \bar a^{(-i)})) - w_{\tilde\pi_{k-1}}^\top \phi(s, (a^\ag{i}, \bar a^{(-i)})) \right) + (m-1)\left[w_{\tilde{\pi}_{k-1}}^\top \phi(s, \bar a^{(1:m)}) - \tilde{w}_k^\top \phi(s, \bar a^{(1:m)})\right]\right| + \epsilon \nonumber \\
        &\le m \bar\eta + (m-1) \bar \eta + \epsilon \nonumber \\ 
        &= \bar\eta (2m-1) + \epsilon \label{value function bound 1}
    \end{align}
    where the second last inequality holds by \cref{lemma: yin lemma b.2 no epsilon} (because the features of all the state action pairs considered are in $\mathcal{D}$, since $s \in \bar \mathcal{S}$).
    
    While for any $(s, a^{(1:m)}) \in ((\mathcal{S} \backslash \bar \mathcal{S}) \times \mathcal{A}^{(1:m)})$, we have
    \begin{align}
        |\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| 
        = |Q_{\tilde \pi_{k-1}}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})|
        &= 0 \label{value function bound 2}
    \end{align}
    \end{proof}

    Finally, it is left to show that $|\tilde w_{k}^\top \phi(\rho, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(\rho, a^{(1:m)})|$  can be bounded for all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    Notice that lines 4-8 in \textsc{Confident MC-LSPI}~run \textsc{UncertaintyCheck-DAV}~with state $\rho$ as input until the returned status is \textsc{certain}.
    Recall that once \textsc{UncertaintyCheck-DAV}~returns a status of \textsc{certain}~we know that $\rho \in \bar \mathcal{S}$.
    Thus, we can immediately apply the result in \cref{value function bound 1} to bound $\eta_1 \ge |\tilde w_{k}^\top \phi(\rho, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(\rho, a^{(1:m)})|, \ \forall a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
 
 

    
    \subsection{Extending to Politex} \label{subsec:extending to politex}
    Recall the above results were for the \textsc{Confident MC-LSPI}~algorithm.
    The \textsc{Confident MC-Politex}~algorithm can be found as \cref{alg:confident ma mc-politex}.
    \begin{algorithm}[t]
	\caption{\textsc{Confident MC-Politex}} \label{alg:confident ma mc-politex}  
	\begin{algorithmic}[1]
		\State \textbf{Input:} initial state $\rho$, initial policy $\pi_0$, number of iterations $K$, threshold $\tau$, number of rollouts $n$, length of rollout $H$
		\State \textbf{Globals:} default action $\bar a$, regularization coefficient $\lambda$, discount $\gamma$, subroutine \textsc{UncertaintyCheck}
		\State {$\mathcal{C} \gets \{(\rho, \bar a, \phi(\rho, \bar a), \textsc{none})\}$} 
\State status, result $\gets \textsc{UncertaintyCheck}(\rho,  \mathcal{C}, \tau)$
		\While {status $=$ \textsc{uncertain}}
		\State $\mathcal{C} \gets \mathcal{C} \cup \{\text{result}\}$
		 \State status, result $\gets \text{\textsc{UncertaintyCheck}}(\rho, \mathcal{C}, \tau)$
		\EndWhile
		\State $z_q \gets \textsc{none}, \, \forall z \in \mathcal{C}$ \quad \Comment{Policy iteration starts $(*)$}
		\For {$k \in 1, \dots, K$}
		\For {$z \in \mathcal{C}$}
		\State status, result $\gets \text{\textsc{Rollout}}(n, H, \pi_{k-1}, z, \mathcal{C}, \tau)$
		\State \textbf{if} status $=$ \textsc{done}, \textbf{then} $z_q = \text{result}$
		\State \textbf{else} $\mathcal{C} \gets \mathcal{C} \cup \{\text{result}\}$ and \textbf{goto} line $(*)$ 
		\EndFor 
		\State $w_k \gets (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \Phi_\mathcal{C}^\top q_\mathcal{C}$  
		\State $\pi_k(a^{(1:m)}|s) \gets \propto \prod_{i=1}^m \prod_{j=0}^{k-1} \exp\left(\alpha w_{j}^\top \phi_i(s, a^\ag{i})\right).$ 
		\EndFor
		\State \Return $\bar \pi_{K-1} \sim \text{Unif}\{\pi_k\}_{k=0}^{K-1}$
	\end{algorithmic}
    \end{algorithm}
    It turns out the story for \textsc{Confident MC-Politex}~is extremely similar and can be argued in nearly the same way. 
    The main difference is that the policy used in \textsc{Confident MC-Politex}~is different than in \textsc{Confident MC-LSPI}~(line 17 in \textsc{Confident MC-Politex}~is different from line 17 in \textsc{Confident MC-LSPI}).
    As such, we can no longer use \cref{lemma:approximate policy iteration} (since it relied on a greedy policy) and, thus cannot use \cref{prop:optimality of output policy} to bound the sub-optimality of the policy output by \textsc{Confident MC-Politex}. 
    Next, we show there is a similar Lemma and Proposition that can be derived for \textsc{Confident MC-Politex}.

    Recall that we do not use clipping on the $Q$-functions in \textsc{Confident MC-Politex}, so that we can sample from the policy efficiently (\cref{prop: efficient politex policy sampling}).  
    Importantly \cref{prop: efficient politex policy sampling} only holds when \cref{ass: feature decomposition} is satisfied.
    Thus, for the remainder of this section we will be working with the product action space $\mathcal{A}^{(1:m)}$.
    This means we must define the VA's $Q$-function differently from \citep{yin2021efficient}, by removing clipping from the case when $\phi(s, a^{(1:m)}) \in \mathcal{D}$.
    \begin{equation*}
        \tilde{Q}_{k-1}(s, a^{(1:m)})=\begin{cases}
                          \tilde{w}_k^\top \phi(s, a^{(1:m)}) \quad &\text{if} \, \phi(s, a^{(1:m)}) \in \mathcal{D} \\
                          Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})     \quad &\text{if} \, \phi(s, a^{(1:m)}) \notin \mathcal{D} \\
                    \end{cases}
    \end{equation*}
   
    Then the VA's policy is
    \begin{equation} \label{eq:politex virtual policy}
        \tilde \pi_k(a^{(1:m)} | s) \propto \exp \left( \alpha \sum_{j=0}^{k-1} \tilde Q_{j}(s, a^{(1:m)}) \right).
    \end{equation}
    
    Also, due to no clipping, the sequence of $Q$-functions during policy iteration is now in the $[-\eta, (1-\gamma)^{-1} + \eta]$ interval, where $\eta \ge \|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$.
    We now restate Lemma D.1 from \citet{yin2021efficient} which bounds the mixture policy output by Politex for an arbitrary sequence of $Q$-functions.
    Since we do not use clipping the theorem is slightly modified (we replace the interval $[0, (1-\gamma)^{-1}]$ with a general interval $[a, b], \ a, b \in \mathbb{R}$, which can be extracted from the calculations in \citet{szepesvari2022}).

    \begin{lemma}[modified Lemma D.1 in \citet{yin2021efficient} also in \citet{szepesvari2022}] \label{lemma:politex mixture policy bound}
    Given an initial policy $\pi_0$, a sequence of functions $Q_k: \mathcal{S} \times \mathcal{A}^{(1:m)} \to [a, b], \ k \in [K-1], a, b \in \mathbb{R}$, and $Q_{\pi^*} \in [0, 1/(1-\gamma)]$, construct a sequence of policies $\pi_1, ..., \pi_{K-1}$ according to (\cref{eq:politex virtual policy}) with $\alpha = 1/(b-a) \sqrt{\frac{2 \log(|\mathcal{A}^{(1:m)}|)}{K}}$, then, for any $s \in \mathcal{S}$, the mixture policy $\bar \pi_{K-1} \sim \text{Unif}\{\pi_k\}_{k=0}^{K-1}$ satisfies

    \begin{equation}
        V^*(s) - V_{\bar \pi_{K-1}}(s) \le \frac{b-a}{(1-\gamma)}\sqrt{\frac{2 \log(|\mathcal{A}^{(1:m)}|)}{K}} + \frac{2 \max_{0 \le k \le K-1} \|Q_k - Q_{\pi_k}\|_\infty}{1 - \gamma}
    \end{equation}
    \end{lemma}

    Notice that the above result suggests we just need to control the term $\|Q_k - Q_{\pi_k}\|_\infty$.
    For the VA this is $\|\tilde Q_k - Q_{\tilde \pi_k}\|_\infty$ and as we have already seen, this can be bounded using the high probability bound on policy evaluation for \textsc{Uncertainty Check with DAV} (\cref{prop: approx value function bound for DAV}) and \textsc{Uncertainty Check with EGSS} (\cref{prop: approx value function bound for EGSS}).
    Using \cref{lemma:politex mixture policy bound} instead of Lemma D.1 in \citet{yin2021efficient}, one can extract another slightly modified result from \citet{yin2021efficient}.
    
    \begin{proposition}[equation (D.8) in \citet{yin2021efficient}] \label{prop:politex optimality of output policy}
    With all terms as defined earlier. 
    Define $\eta \ge \|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$. 
    Suppose $\eta \ge |\tilde w_{k}^\top \phi(\rho, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(\rho, a^{(1:m)})|_\infty, \ \forall a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    Then, if the VA and MA behave identically in the final loop, with probability at least $1 - 4KC_{\text{max}}^2 \exp(-2 \theta^2(1-\gamma)^2 n)$ we have
    \begin{align}
            V^*(\rho) - V_{\bar \pi_{K-1}}(\rho) \le \frac{b-a}{(1-\gamma)} \sqrt{\frac{2 \log(|\mathcal{A}^{(1:m)}|)}{K}} + \frac{4 \eta}{1 - \gamma}
    \end{align}
    \end{proposition}
    
    Notice, that we require the same three things as in the \textsc{Confident MC-LSPI}~case (\cref{prop:optimality of output policy}).
    We need a bound on $\|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$. 
    We need a bound on $|\tilde w_{k}^\top \phi(\rho, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(\rho, a^{(1:m)})|_\infty, \ \forall a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    We need to ensure that the VA and MA behave identically in the final loop. 
    Then, we can get a bound on the sub-optimality of the MA's output policy $\bar \pi_{K-1}$.
    Using the same steps as in the previous sections, one can verify that indeed, \textsc{Confident MC-Politex}~combined with \textsc{UncertaintyCheck-DAV}~or \textsc{UncertaintyCheck-EGSS}~does satisfy the above three conditions, with $\eta = \eta_1$ ($\eta_1$ as defined in \cref{prop: approx value function bound for DAV}) and $\eta = \eta_2$ ($\eta_2$ as defined in \cref{prop: approx value function bound for EGSS}) respectively.
    
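    We bound $|\mathcal{A}^{(1:m)}| = \prod_{i=1}^m |\mathcal{A}^\ag{i}| \le \left(\max_{i \in [m]} |\mathcal{A}^\ag{i}|\right)^m$, so that $\log(|\mathcal{A}^{(1:m)}|) \le m \log(\max_{i \in [m]} |\mathcal{A}^\ag{i}|)$. 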
    We can replace $b-a$ with $1/(1-\gamma) + 2\eta$, since $w^\top \phi(s, a^{(1:m)}) \in [-\eta, (1-\gamma)^{-1} + \eta], \ \forall (s \times a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ in the final loop for the same event which holds with probability at least $1 - 4KC_{\text{max}}^2 \exp(-2 \theta^2(1-\gamma)^2 n)$ in \cref{prop:politex optimality of output policy}. 
    Applying \cref{prop:politex optimality of output policy} we get with probability at least $1 - 4KC_{\text{max}}^2 \exp(-2 \theta^2(1-\gamma)^2 n)$ that
    \begin{align} 
            V^*(\rho) - V_{\bar \pi_{K-1}}(\rho) \le \left(\frac{1}{(1-\gamma)^2} + \frac{2\eta}{(1-\gamma)}\right) \sqrt{\frac{2 m \log(\max_{i \in [m]} |\mathcal{A}^\ag{i}|)}{K}} + \frac{4 \eta}{1 - \gamma}. \label{eq:politex actual optimality of output policy}
    \end{align}

        \section{\uppercase{Kernel Setting}} \label{app:kernel setting}
    
   
    
   
   
   

   
   
   
   
   
    
   
    
   
   
   
   
   
   
   
    
   
   
   
   
   
   
   
   
   
   
   
   
   
    
   
   
   
   
   
   
   
   
    













    
   
   
   
   
   
   
   
   
   
   
    
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
    
   
    Define $q_k(s,a^{(1:m)})$ as the estimated rollout value for $(s,a^{(1:m)}) \in \mathcal{C}$ in round $k \in [K]$ of policy iteration, and $q_k = [q_k(s, a^{(1:m)})]_{(s, a^{(1:m)}) \in \mathcal{C}} \in \mathbb{R}^{|\mathcal{C}|}$ as the vector containing all rollout results at round $k$, using some fixed ordering of $\mathcal{C}$.
    In round $k$ of policy iteration we need to compute the ridge estimate $\hat Q_k$ using $q_k$ as least squares targets,
    \begin{align}
    	\hat Q_k = \argmin_{Q \in \mathcal{H}} \sum_{(s,a^{(1:m)})\in\mathcal{C}} (Q(s,a^{(1:m)}) - q_k(s,a^{(1:m)}))^2 + \lambda \|Q\|_{\mathcal{H}}^2 = (\Phi_{\mathcal{C}}^\top\Phi_{\mathcal{C}} + \lambda \mathbf{I}_\mathcal{H})^{-1}\Phi_{\mathcal{C}}^\top q_k
    \end{align}
    Here, $\mathbf{I}_{\mathcal{H}} : \mathcal{H} \rightarrow \mathcal{H}$ is the identity mapping, $\Phi_{\mathcal{C}}$ can be formally defined as a map $\Phi_\mathcal{C} : \mathcal{H} \rightarrow \mathbb{R}^{|\mathcal{C}|}, f \mapsto [f(s,a^{(1:m)})]_{(s,a^{(1:m)}) \in \mathcal{C}}, \, f \in \mathcal{H}$; and $\Phi_{\mathcal{C}}^\top : \mathbb{R}^{|\mathcal{C}|} \rightarrow \mathcal{H}$ is the adjoint of $\Phi_{\mathcal{C}}$.
    
    Using the `kernel trick' we express the estimator as follows
    \begin{align}
    	\hat Q_k = \Phi_{\mathcal{C}}^\top(K_{\mathcal{C}} + \lambda \mathbf{I}_{|\mathcal{C}|})^{-1}q_k
    \end{align}
    where $K_{\mathcal{C}} = \Phi_{\mathcal{C}} \Phi_{\mathcal{C}}^\top \in \mathbb{R}^{|\mathcal{C}| \times |\mathcal{C}|}$ is the kernel matrix. Lastly, we can evaluate for any $(s,a^{(1:m)}) \in \mathcal{S} \times \mathcal{A}^{(1:m)}$:
    \begin{align}
    	\hat Q_k(s,a^{(1:m)}) = \mathbf{k}_{\mathcal{C}}(s,a^{(1:m)})^\top(K_{\mathcal{C}} + \lambda \mathbf{I}_{|\mathcal{C}|})^{-1}q_k
    \end{align}
    where we defined $\mathbf{k}_\mathcal{C}(s,a^{(1:m)}) = [\mathbf{k}(s,a^{(1:m)}, s',a'^{(1:m)})]_{(s',a'^{(1:m)}) \in \mathcal{C}} \in \mathbb{R}^{|\mathcal{C}|}$ (using the same fixed ordering of $\mathcal{C}$).
    Importantly, the last display only involves finite-dimensional quantities that can be computed from kernel evaluations.
    Moreover, since $\mathbf{k}(s,a^{(1:m)},s',a'^{(1:m)}) = \sum_{j=1}^m \mathbf{k}_j(s,a^{(j)}, s', a'^{(j)})$ we can write
    \begin{align}
    	\hat Q_k(s,a^{(1:m)}) &= \sum_{j=1}^m \hat{Q}_{k, j}(s, a^\ag{j})\\
    	\hat{Q}_{k, j}(s, a^\ag{j}) &:= \mathbf{k}_{j, \mathcal{C}}(s,a^{(j)})^\top(K_{\mathcal{C}} + \lambda \mathbf{I}_{|\mathcal{C}|})^{-1}q_k
     \label{eq:kernel agent-wise q}
    \end{align}
    where $\mathbf{k}_{j, \mathcal{C}}(s,a^{(j)}) = [\mathbf{k}_j(s,a^{(j)}, s^\prime , a^{\prime(j)})]_{(s^\prime, a^{\prime(1:m)}) \in \mathcal{C}} \in \mathbb{R}^{|\mathcal{C}|}$.
    Since the term $(K_{\mathcal{C}} + \lambda \mathbf{I}_{|\mathcal{C}|})^{-1}q_k$ is fixed for each $j$, we can still compute the maximizer independently for each $j \in [m]$ by iterating over all actions.
    This allows us to define \textsc{Confident Kernel MC-LSPI/Politex}~(\cref{alg:confident kernel mc-lspi/politex}), which makes use of \cref{eq:kernel agent-wise q} in line 17 for calculating the policy.

    \begin{algorithm}[t]
	\caption{\textsc{Confident Kernel MC-LSPI/Politex}} \label{alg:confident kernel mc-lspi/politex}  
	\begin{algorithmic}[1]
		\State \textbf{Input:} initial state $\rho$, initial policy $\pi_0$, number of iterations $K$, threshold $\tau$, number of rollouts $n$, length of rollout $H$
		\State \textbf{Globals:} default action $\bar a$, regularization coefficient $\lambda$, discount $\gamma$, subroutine \textsc{UncertaintyCheck}, kernel $\mathbf{k}$
		\State {$\mathcal{C} \gets \{(\rho, \bar a, \phi(\rho, \bar a), \textsc{none})\}$} 
\State status, result $\gets \textsc{UncertaintyCheck}(\rho,  \mathcal{C}, \tau)$
		\While {status $=$ \textsc{uncertain}}
		\State $\mathcal{C} \gets \mathcal{C} \cup \{\text{result}\}$
		 \State status, result $\gets \text{\textsc{UncertaintyCheck}}(\rho, \mathcal{C}, \tau)$
		\EndWhile
		\State $z_q \gets \textsc{none}, \, \forall z \in \mathcal{C}$ \quad \Comment{Policy iteration starts $(*)$}
		\For {$k \in 1, \dots, K$}
		\For {$z \in \mathcal{C}$}
		\State status, result $\gets \text{\textsc{Rollout}}(n, H, \pi_{k-1}, z, \mathcal{C}, \tau)$
		\State \textbf{if} status $=$ \textsc{done}, \textbf{then} $z_q = \text{result}$
		\State \textbf{else} $\mathcal{C} \gets \mathcal{C} \cup \{\text{result}\}$ and \textbf{goto} line $(*)$ 
		\EndFor 
		\State $\hat Q_k = \Phi_{\mathcal{C}}^\top(K_{\mathcal{C}} + \lambda \mathbf{I}_{|\mathcal{C}|})^{-1}q_k$  
		\State {$\pi_k(a^{(1:m)}|s) \gets 
			\begin{cases}
				\mathds{1}\left(a^{(1:m)} = \argmax\limits_{\tilde{a}^{(1:m)} \in \mathcal{A}^{(1:m)}} \hat Q_k(s,\tilde{a}^{(1:m)}) \right) & \text{LSPI} \\
				\propto \prod_{i=1}^m \prod_{j=0}^{k-1} \exp\left(\alpha \hat{Q}_{j, i}(s, a^\ag{i})\right). & \text{Politex}
			\end{cases}$}
		\EndFor
		\State \Return $\pi_{K-1}$ for LSPI, or $\bar \pi_{K-1} \sim \text{Unif}\{\pi_k\}_{k=0}^{K-1}$ for Politex 
	\end{algorithmic}
    \end{algorithm}
    
    The second quantity required by the algorithm is the squared norm $\|\phi(s,a^{(1:m)})\|_{(\Phi_\mathcal{C}^\top \Phi_{\mathcal{C}} + \lambda \mathbf{I}_\mathcal{H})^{-1}}^2$, where now $\phi(s,a^{(1:m)}) = \mathbf{k}(s,a^{(1:m)}, \cdot, \cdot) \in \mathcal{H}$. A direct extension of the Woodbury formula to infinite-dimensional vector spaces shows that
    \begin{align}
    	\lambda(\Phi_\mathcal{C}^\top \Phi_{\mathcal{C}} + \lambda \mathbf{I}_\mathcal{H})^{-1} = \mathbf{I}_{\mathcal{H}} - \Phi_{\mathcal{C}}^\top (K_\mathcal{C} + \lambda \mathbf{I}_{|\mathcal{C}|})^{-1}\Phi_{\mathcal{C}}
    \end{align}
   
   
   
   
   
   
    Therefore the feature norm can be written using finite-dimensional quantities: 
    \begin{align}
    	\|\phi(s,a^{(1:m)})\|_{(\Phi_\mathcal{C}^\top \Phi_{\mathcal{C}} + \lambda \mathbf{I}_\mathcal{H})^{-1}}^2 = \frac{1}{\lambda} \left( \mathbf{k}(s,a^{(1:m)},s,a^{(1:m)}) - \mathbf{k}_\mathcal{C}(s,a^{(1:m)})^\top(K_\mathcal{C}+ \lambda \mathbf{I}_{|\mathcal{C}|})^{-1}\mathbf{k}_{\mathcal{C}}(s,a^{(1:m)})\right)
     \label{eq:kernel uncertainty}
    \end{align}
   
   
   
   
   
    With this, we can define \textsc{UncertaintyCheck-K-DAV}~(\cref{alg:uncertainty check k-dav}) which makes use of \cref{eq:kernel uncertainty}.
   

    \begin{algorithm}
    	\caption{\textsc{Uncertainty Check with Kernel-Default Action Vector (K-DAV)}} \label{alg:uncertainty check k-dav}  
    	\begin{algorithmic}[1]
    		\State \textbf{Input:} state $s$, core set $\Phi_\mathcal{C}$, threshold $\tau$.
    		\State \textbf{Globals:} number of action components $m$. 
    		\For {$j \in [m]$}
    		\For {$a^{(j)} \in \mathcal{A}^{(j)}$}
                \State $\tilde a \gets (a^\ag{j}, \bar a^\ag{-j})$
    		\If {$\frac{1}{\lambda} \left( \mathbf{k}(s, \tilde a,s, \tilde a) - \mathbf{k}_\mathcal{C}(s, \tilde a)^\top(\Phi_\mathcal{C} \Phi_\mathcal{C}^\top + \lambda \mathbf{I}_{|\mathcal{C}|})^{-1}\mathbf{k}_{\mathcal{C}}(s, \tilde a)\right) > \tau$}
    		\State result $\gets (s, \tilde a, \phi(s, \tilde a), \textsc{none})$
    		\State \Return {\textsc{uncertain}, result}
    		\EndIf
    		\EndFor 
    		\EndFor 
    		\State \Return \textsc{certain}, \textsc{none} 
    	\end{algorithmic}
    \end{algorithm}


    \paragraph{Analysis}
    Our goal next is to extend the analysis used in the finite case to the kernel case, carefully arguing that the linear dimension $d$ can be replaced by a more benign quantity. A common complexity measure is the total information gain, which we define as follows:
    \begin{align}
    	\Gamma(\lambda; \mathcal{C}) = \log \det (\Phi_{\mathcal{C}}^\top\Phi_{\mathcal{C}} + \lambda \mathbf{I}_\mathcal{H}) - \log \det (\lambda \mathbf{I}_\mathcal{H})
    \end{align}
    Note that we can compute $\Gamma(\lambda; \mathcal{C})$ for any given core set $\mathcal{C}$. In the kernel case, we can compute $\Gamma(\lambda; \mathcal{C}) = \log \det (\mathbf{I}_{|\mathcal{C}|} + \lambda^{-1} K_{\mathcal{C}})$ using similar arguments as before.
    
    The maximum information gain is 
    $$\Gamma_t(\lambda) = \max_{\mathcal{C} : |\mathcal{C}|=t} \Gamma(\lambda; \mathcal{C}).$$ 
    It serves as a complexity measure in the bandit literature and can be bounded for many kernels of interest \citep{srinivas2009gaussian,vakili2021information}. 
    Following \citet{du2021bilinear, huang2021short}, we further define the \emph{critical information gain} for any fixed constant $c > 0$,
    \begin{align}
    	\tilde \Gamma(\lambda, c) = \max \{t \geq 1 : c t \le \Gamma_t(\lambda) \}.\label{eq:critical infogain}
    \end{align}
    Note that the proof of \cite[Lemma 5.1]{yin2021efficient} implies that $\log(1+\tau)|\mathcal{C}| \leq \Gamma_{|\mathcal{C}|}(\lambda)$.
    Thus, $|\mathcal{C}| \le C_\text{max} = \tilde \Gamma(\lambda, \log(1+\tau))$.
    
   
    Since the dimension $d$ enters our bounds only through $C_{\max}$ we can immediately get a query complexity bound for the kernelized algorithm in terms of $\tilde\Gamma$. 
    For the finite-dimensional case, \cite[Lemma 5.1]{yin2021efficient} shows that $\tilde \Gamma \leq \mathcal{O}(d)$, recovering the previous bound.






   
   
   
   
   
   
   
   
   
   
   
    
   
   
 
    
   
    
   
    
   
    
   
   
   
    
   
   
   
   
   
   
   
   
   
    
   
    
   
    
   
    
    
    

    
    \section{\uppercase{Proofs of Theorems}} \label{app: theorem proofs}
    We make a remark on the query complexity of \textsc{Confident MC-LSPI}~and \textsc{Confident MC-Politex}.
    From \cref{lemma:bound on core set size} we know the core set size is bounded by $C_\text{max} = \tilde \mathcal{O}(d)$.
    The total number of times Policy iteration is restarted (restart means line 14 in \textsc{Confident MC-LSPI}~or \textsc{Confident MC-Politex}~is reached) is thus at most $C_\text{max}$.
    Each run of policy iteration can take as much as $K$ iterations.
    In each iteration \textsc{Rollout}~is run at most $C_\text{max}$ times.
    \textsc{Rollout}~does $n$ rollouts of length $H$ which queries the simulator once for each step.
    In total the number of queries performed by \textsc{Confident MC-LSPI}~or \textsc{Confident MC-Politex} is bounded by $C_\text{max}^2 K n H$.
    This equation is used to calculate the query cost for the different variants of \textsc{Confident MC-LSPI}, once all the parameter values have been calculated.
    Since the only difference between \textsc{Confident Kernel MC-LSPI/Politex}~and \textsc{Confident MC-LSPI}~or \textsc{Confident MC-Politex}~is how the policy is calculated (lines 16-17 in each of the algorithms), we can use the same expression as above ($C_\text{max}^2 K n H$) to bound the query complexity of \textsc{Confident Kernel MC-LSPI/Politex}, with $C_\text{max} = \tilde \Gamma(\lambda, c)$ for $c = \log(1+\tau)$. 

   
   
   
   
   
   
   
    
    
    \subsection{Proof of \cref{thm:mc-lspi-egss sub-optimality}}
    Plugging in $\eta=\eta_2$ ($\eta_2$ as defined in \cref{prop: approx value function bound for EGSS}) into \cref{prop:optimality of output policy}.
   
    Suppose \cref{asm:linear-q-pi,ass: argmax oracle,ass: bounded features} are satisfied with $\epsilon=0$.
    By choosing appropriate parameters according to $\delta$ and $\kappa$, we can ensure that, with probability at least $1 - \delta$, the policy $\pi_{K-1}$ output by \textsc{Confident MC-LSPI}~combined with \textsc{UncertaintyCheck-EGSS}~satisfies:
    \begin{align*}
        V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \kappa,
    \end{align*}
    with the following parameter settings 
    \begin{align*}
        \tau &= 1\\
        \lambda &= \frac{\kappa^2(1 - \gamma)^4}{1024 b^2 d}\\
        \theta &= \frac{\kappa(1- \gamma)^2}{32 \sqrt{d} \sqrt{C_{\text{max}}}}\\
        H &= \frac{
            \log \left ( 32 \sqrt{C_{\text{max}}} \sqrt{d} \right)
            - \log \left( \kappa(1 - \gamma)^3 \right)
        }{
            1-\gamma
        } - 1\\
        K &= \frac{\log\left(\frac{1}{\kappa(1 - \gamma)^2}\right) + \log(8)}{1-\gamma} + 1 \\
        n &= \frac{\log(4KC_{\text{max}}^2) - \log(\delta)}{2 \theta^2(1-\gamma)^2} \\
        C_{\max} &= \frac{e}{e-1} \frac{1 + \tau}{\tau} d \left( 
            \log(1 + \frac{1}{\tau}) +
            \log(1 + \frac{1}{\lambda})
        \right) 
    \end{align*}
    with computational cost of $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}))$
    and query cost $\mathcal{O}\left(\tfrac{d^4}{\kappa^2 (1-\gamma)^8} \right)$ 

    If $\epsilon > 0$, then by choosing parameters as above, with $\kappa = \frac{32 \epsilon d}{(1-\gamma)^2} (1 + \log(1 + b^2 \epsilon^{-2} d^{-1}))^{1/2}$, we can ensure that, with probability at least $1 - \delta$, $\pi_{K-1}$ satisfies:
    
    $$V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \frac{64 \epsilon d}{(1-\gamma)^2} (1 +\log(1+b^2 \epsilon^{-2} d^{-1}))^{1/2}$$
    
    with computational cost of $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\epsilon}, \log(\frac{1}{\delta}), \log(1+b))$
    and query cost $\mathcal{O}\left(\tfrac{d^2}{\epsilon^2 (1-\gamma)^4} \right)$ 
    
   

    \subsection{Proof of \cref{thm:mc-lspi-dav sub-optimality}}
    Plugging in $\eta=\eta_1$ ($\eta_1$ as defined in \cref{prop: approx value function bound for DAV}) into \cref{prop:optimality of output policy}.
   
    Suppose \cref{ass: feature decomposition,ass: bounded features} are satisfied with $\epsilon=0$.
    By choosing appropriate parameters according to $\delta$ and $\kappa$, we can ensure that, with probability at least $1 - \delta$, the policy $\pi_{K-1}$ output by \textsc{Confident MC-LSPI}~combined with \textsc{UncertaintyCheck-DAV}~satisfies:
    \begin{align*}
        V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \kappa,
    \end{align*}
    with the following parameter settings 
    \begin{align*}
        \tau &= 1\\
        \lambda &= \frac{\kappa^2(1 - \gamma)^4}{1024 b^2 (2m-1)^2}\\
        \theta &= \frac{\kappa(1- \gamma)^2}{32 (2m-1) \sqrt{C_{\text{max}}}}\\
        H &= \frac{
            \log \left ( 32 \sqrt{C_{\text{max}}} (2m-1) \right)
            - \log \left( \kappa(1 - \gamma)^3 \right)
        }{
            1-\gamma
        } - 1\\
        K &= \frac{\log\left(\frac{1}{\kappa(1 - \gamma)^2}\right) + \log(8)}{1-\gamma} + 1 \\
        n &= \frac{\log(4KC_{\text{max}}^2) - \log(\delta)}{2 \theta^2(1-\gamma)^2} \\
        C_{\max} &= \frac{e}{e-1} \frac{1 + \tau}{\tau} d \left( 
            \log(1 + \frac{1}{\tau}) +
            \log(1 + \frac{1}{\lambda})
        \right) 
    \end{align*}
    with computational cost of $\text{poly}(\sum_{i=1}^m |A^\ag{i}|, d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}))$
    and query cost $\mathcal{O}\left(\tfrac{m^2 d^3}{\kappa^2 (1-\gamma)^8} \right)$ 

    If $\epsilon > 0$, then by choosing parameters as above, with $\kappa = \frac{32 \epsilon \sqrt{d} m}{(1-\gamma)^2} (1 + \log(1 + b^2 \epsilon^{-2} d^{-1}))^{1/2}$, we can ensure that, with probability at least $1 - \delta$, $\pi_{K-1}$ satisfies:
    
    $$V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \frac{128 \epsilon \sqrt{d} m}{(1-\gamma)^2} (1 +\log(1+b^2 \epsilon^{-2} d^{-1}))^{1/2}$$
    
    with computational cost of $\text{poly}(\sum_{i=1}^m |A^\ag{i}|, d, \frac{1}{1 - \gamma}, \frac{1}{\epsilon}, \log(\frac{1}{\delta}), \log(1+b))$
    and query cost $\mathcal{O}\left(\tfrac{d^2}{\epsilon^2 (1-\gamma)^4} \right)$ 


   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   

   
   
    
   
    
   
   
    
   
    

    \subsection{Proof of \cref{thm:mc-politex sub-optimality} + \textsc{UncertaintyCheck-EGSS}~case}
    Plugging in $\eta=\eta_1$ when \textsc{UncertaintyCheck-DAV}~is used ($\eta_1$ as defined in \cref{prop: approx value function bound for DAV}) and $\eta=\eta_2$ when \textsc{UncertaintyCheck-EGSS}~is used ($\eta_2$ as defined in \cref{prop: approx value function bound for EGSS}) into \cref{eq:politex actual optimality of output policy}.
    Setting $\zeta=2m-1$ when \textsc{UncertaintyCheck-DAV}~is used, and $\zeta = \sqrt{d}$ when \textsc{UncertaintyCheck-EGSS}~is used.
    Suppose \cref{ass: feature decomposition,ass: bounded features} are satisfied with $\epsilon=0$.
    By choosing appropriate parameters according to $\delta$ and $\kappa$, we can ensure that, with probability at least $1 - \delta$, the policy $\bar \pi_{K-1}$ output by \textsc{Confident MC-Politex} satisfies:
    \begin{align*}
        V^*(\rho) - V_{\bar \pi_{K-1}}(\rho) \leq \kappa,
    \end{align*}
    with the following parameter settings
    \begin{align*}
        \tau &= 1\\
        \lambda &= \frac{\kappa^2(1 - \gamma)^2}{576 b^2 \zeta^2}\\
        \theta &= \frac{\kappa(1- \gamma)}{24 \zeta \sqrt{C_{\text{max}}}}\\
        H &= \frac{
            \log \left ( 24 \sqrt{C_{\text{max}}} \zeta \right)
            - \log \left( \kappa(1 - \gamma)^2 \right)
        }{
            1-\gamma
        } - 1\\
        K &= 2m \log(A) \left( \frac{4}{\kappa^2 (1-\gamma)^4} + \frac{3}{\kappa (1-\gamma)^2} + \frac{9}{16} \right)\\
        n &= \frac{\log(4KC_{\text{max}}^2) - \log(\delta)}{2 \theta^2(1-\gamma)^2} \\
        C_{\max} &= \frac{e}{e-1} \frac{1 + \tau}{\tau} d \left( 
            \log(1 + \frac{1}{\tau}) +
            \log(1 + \frac{1}{\lambda})
        \right) 
    \end{align*}
    with computational cost of $\text{poly}(\sum_{i=1}^m |A^\ag{i}|, d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}))$
    and query cost $\mathcal{O}\left(\tfrac{m \zeta^2 d^3}{\kappa^4 (1-\gamma)^9} \right)$ 

   
    If $\epsilon > 0$, then by choosing parameters as above, with $\kappa = \frac{16 \epsilon \sqrt{d} \zeta}{(1-\gamma)} (1 + \log(1 + b^2 \epsilon^{-2} d^{-1}))^{1/2}$, we can ensure that, with probability at least $1 - \delta$, the policy $\bar \pi_{K-1}$ satisfies:
    
    $$V^*(\rho) - V_{\bar \pi_{K-1}}(\rho) \leq \frac{32 \epsilon \sqrt{d} \zeta}{1-\gamma} (1 +\log(1+ b^2 \epsilon^{-2} d^{-1}))^{1/2}$$

    with computational cost of $\text{poly}(\sum_{i=1}^m |A^\ag{i}|, d, \frac{1}{1 - \gamma}, \frac{1}{\epsilon}, \log(\frac{1}{\delta}), \log(1+b))$
    and query cost $\mathcal{O}\left(\tfrac{m d}{\epsilon^4 (1-\gamma)^5} \right)$ 


    \subsection{Proof of \cref{thm:kernel mc-lspi-dav sub-optimality}}
    We plug $\eta=\eta_1$ ($\eta_1$ as defined in \cref{prop: approx value function bound for DAV}) into \cref{prop:optimality of output policy}.
    Suppose \cref{ass: bounded features,ass:kernel q-pi,ass:kernel additive} are satisfied with $\epsilon=0$.
    By choosing appropriate parameters according to $\delta$ and $\kappa$, we can ensure that, with probability at least $1 - \delta$, the policy $\pi_{K-1}$ output by \textsc{Confident Kernel MC-LSPI} satisfies:
    \begin{align*}
        V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \kappa,
    \end{align*}
    with the following parameter settings 
    \begin{align*}
        \tau &= 1\\
        \lambda &= \frac{\kappa^2(1 - \gamma)^4}{1024 b^2 (2m-1)^2}\\
        \theta &= \frac{\kappa(1- \gamma)^2}{32 (2m-1) \sqrt{C_{\text{max}}}}\\
        H &= \frac{
            \log \left ( 32 \sqrt{C_{\text{max}}} (2m-1) \right)
            - \log \left( \kappa(1 - \gamma)^3 \right)
        }{
            1-\gamma
        } - 1\\
        K &= \frac{\log\left(\frac{1}{\kappa(1 - \gamma)^2}\right) + \log(8)}{1-\gamma} + 1 \\
        n &= \frac{\log(4KC_{\text{max}}^2) - \log(\delta)}{2 \theta^2(1-\gamma)^2} \\
        C_{\max} &= \tilde \Gamma(\lambda, \log(2))
    \end{align*}
    with computational cost of $\text{poly}(\sum_{i=1}^m |A^\ag{i}|, \tilde \Gamma(\lambda, \log(2)), \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}))$
    and query cost $\mathcal{O}\left(\tfrac{m^2 \tilde \Gamma(\lambda, \log(2))^3}{\kappa^2 (1-\gamma)^8} \right)$ 

   
    If $\epsilon > 0$, then by choosing parameters as above, with $\kappa = \frac{16 \epsilon m \sqrt{\tilde \Gamma(\lambda, \log(2))}}{(1-\gamma)^2}$, we can ensure that, with probability at least $1 - \delta$, the policy $\pi_{K-1}$ satisfies:
    
    $$V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \frac{32 \epsilon m \sqrt{\tilde \Gamma(\lambda, \log(2))}}{(1-\gamma)^2}$$
    
    with computational cost of $\text{poly}(\sum_{i=1}^m |A^\ag{i}|, \tilde \Gamma(\lambda, \log(2)), \frac{1}{1 - \gamma}, \frac{1}{\epsilon}, \log(\frac{1}{\delta}), \log(1+b))$
    and query cost $\mathcal{O}\left(\tfrac{\tilde \Gamma(\lambda, \log(2))^2}{\epsilon^2 (1-\gamma)^4} \right)$ 

    \paragraph{Theorem for \textsc{Confident Kernel MC-Politex}~combined with \textsc{UncertaintyCheck-DAV}}

    As mentioned in the body we state the theorem bounding the sub-optimality of the policy output by \textsc{Confident Kernel MC-Politex}~combined with \textsc{UncertaintyCheck-K-DAV}. 
    \begin{theorem}[\textsc{Confident Kernel MC-Politex DAV} Sub-Optimality] \label{thm:kernel mc-politex-dav sub-optimality}	
            Suppose \cref{ass:kernel q-pi,ass:kernel additive,ass: bounded features} hold.
            Define $\tilde \Gamma := \tilde \Gamma(\lambda, \log(2))$.
    	If $\epsilon = 0$, for any $\kappa > 0$, with probability at least $1 - \delta$, the policy $\bar \pi_{K-1}$, output by \textsc{Confident Kernel MC-Politex}~combined with \textsc{UncertaintyCheck-K-DAV}~satisfies
    	\begin{equation*}
    		V^*(\rho) - V_{\bar \pi_{K-1}}(\rho) \leq \kappa. 
    	\end{equation*}
    	Further, the query cost is $\mathcal{O}\left(\tfrac{m^3\tilde \Gamma^3}{\kappa^4 (1-\gamma)^9} \right)$
    	and the computation cost is $\text{poly}(\sum_{i=1}^m |A^\ag{i}|, \tilde \Gamma, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}))$.
    	  If $\epsilon > 0$, then with probability at least $1 - \delta$, the output policy $\bar \pi_{K-1}$ satisfies
    	\begin{equation*}
    		V^*(\rho) - V_{\bar \pi_{K-1}}(\rho) \leq \frac{16 \epsilon m \sqrt{\tilde \Gamma}}{1-\gamma}
    	\end{equation*}
    	Further, the query cost is $\mathcal{O}\left(\tfrac{m \tilde \Gamma}{\epsilon^4 (1-\gamma)^5} \right)$
    	and the computation cost is $\text{poly}(\sum_{i=1}^m |A^\ag{i}|, \tilde \Gamma, \frac{1}{1 - \gamma}, \frac{1}{\epsilon}, \log(\frac{1}{\delta}), \log(1+b))$.
    	The parameter settings for both cases are defined below. 
    \end{theorem}

    \subsection{Proof of \cref{thm:kernel mc-politex-dav sub-optimality}}
    We plug $\eta=\eta_1$ ($\eta_1$ as defined in \cref{prop: approx value function bound for DAV}) into \cref{eq:politex actual optimality of output policy}.
    Suppose \cref{ass:kernel additive,ass:kernel q-pi,ass: bounded features} are satisfied with $\epsilon=0$.
    By choosing appropriate parameters according to $\delta$ and $\kappa$, we can ensure that, with probability at least $1 - \delta$, the policy $\bar \pi_{K-1}$ output by \textsc{Confident Kernel MC-Politex}~combined with \textsc{UncertaintyCheck-K-DAV} satisfies:
    \begin{align*}
        V^*(\rho) - V_{\bar \pi_{K-1}}(\rho) \leq \kappa,
    \end{align*}
    with the following parameter settings
    \begin{align*}
        \tau &= 1\\
        \lambda &= \frac{\kappa^2(1 - \gamma)^2}{576 b^2 (2m-1)^2}\\
        \theta &= \frac{\kappa(1- \gamma)}{24 (2m-1) \sqrt{C_{\text{max}}}}\\
        H &= \frac{
            \log \left ( 24 \sqrt{C_{\text{max}}} (2m-1) \right)
            - \log \left( \kappa(1 - \gamma)^2 \right)
        }{
            1-\gamma
        } - 1\\
        K &= 2m \log(A) \left( \frac{4}{\kappa^2 (1-\gamma)^4} + \frac{3}{\kappa (1-\gamma)^2} + \frac{9}{16} \right)\\
        n &= \frac{\log(4KC_{\text{max}}^2) - \log(\delta)}{2 \theta^2(1-\gamma)^2} \\
        C_{\max} &= \tilde \Gamma(\lambda, \log(2))
    \end{align*}
    with computational cost of $\text{poly}(\sum_{i=1}^m |A^\ag{i}|, \tilde \Gamma(\lambda, \log(2)), \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}))$
    and query cost $\mathcal{O}\left(\tfrac{m^3 \tilde \Gamma(\lambda, \log(2))^3}{\kappa^4 (1-\gamma)^9} \right)$ 

   
    If $\epsilon > 0$, then by choosing parameters as above, with $\kappa = \frac{8 \epsilon m \sqrt{\tilde \Gamma(\lambda, \log(2))}}{(1-\gamma)} $, we can ensure that, with probability at least $1 - \delta$, the policy $\bar \pi_{K-1}$ satisfies:
    
    $$V^*(\rho) - V_{\bar \pi_{K-1}}(\rho) \leq \frac{16 \epsilon m \sqrt{\tilde \Gamma(\lambda, \log(2))}}{1-\gamma}$$

    with computational cost of $\text{poly}(\sum_{i=1}^m |A^\ag{i}|, \tilde \Gamma(\lambda, \log(2)), \frac{1}{1 - \gamma}, \frac{1}{\epsilon}, \log(\frac{1}{\delta}), \log(1+ b))$
    and query cost $\mathcal{O}\left(\tfrac{m \tilde \Gamma(\lambda, \log(2))}{\epsilon^4 (1-\gamma)^5} \right)$ 
    



\section{\uppercase{Examples and Experiments}}\label{app:additive mdp example}
\begin{figure}[h]
	\begin{center}
		
		\begin{tikzpicture}[auto,node distance=8mm,>=latex,font=\small]
			\tikzstyle{round}=[thick,draw=black,circle]
			\node[round] (s1) {$s_1$};
			\node[round, right=0mm and 20mm of s1] (s2) {$s_2$};
			\node[round, left=0mm and 20mm of s1] (s3) {$s_3$};
			
			\draw[->] (s1) [out=0,in=180] to node {
				\tiny $\phi(s_1,(1,1))=\begin{bmatrix} 1 \\ 0 \\
				\end{bmatrix}$
			} node [swap] {
				\tiny $\phi(s_1,(1,0))=\begin{bmatrix} 1 \\ 0 \\
				\end{bmatrix}$
			} node [anchor=mid, fill=white!20] {\tiny r = 0} (s2);
			\draw[->] (s1) [out=180,in=0] to node {
				\tiny $\phi(s_1,(0,0))=\begin{bmatrix} 0 \\ 1 \\
				\end{bmatrix}$
			} node [swap] {
				\tiny $\phi(s_1,(0,1))=\begin{bmatrix} 0 \\ 1 \\
				\end{bmatrix}$
			} node [anchor=mid, fill=white!20] {\tiny r = 0} (s3);
			\draw[->] (s2) [out=60,in=120,loop] to node {\tiny r = 0} node [swap] { \tiny
				$\phi(s_2,(1,0))=\phi(s_2,(0,0))=\begin{bmatrix} 1 \\ 0 \\
				\end{bmatrix}$
			}(s2);
			\draw[->] (s2) [out=-60,in=-120,loop] to node { \tiny
				$\phi(s_2,(0,1))=\phi(s_2,(1,1))=\begin{bmatrix} 2 \\ 0 \\
				\end{bmatrix}$
			} node [swap] {\tiny r = 1} (s2);
			\draw[->] (s3) [out=60,in=120,loop] to node {\tiny r = 1} node [swap] { \tiny
				$\phi(s_3,(1,0))=\phi(s_3,(0,0))=\begin{bmatrix} 0 \\ 2 \\
				\end{bmatrix}$
			}(s3);
			\draw[->] (s3) [out=-60,in=-120,loop] to node { \tiny
				$\phi(s_3,(0,1))=\phi(s_3,(1,1))= \begin{bmatrix} 0 \\ 1 \\
				\end{bmatrix}$
			} node [swap] {\tiny r = 0} (s3);
		\end{tikzpicture}
	\end{center}
\caption{Illustration of \cref{ex:ski}.}\label{fig:coordination-example}
\end{figure}
\subsection{Additive MDP, Cooperation Example}
\begin{example}[Coordination]\label{ex:ski}





	
	Consider the MDP in \cref{fig:coordination-example}, which can be verified to satisfy \cref{ass: feature decomposition} with $\gamma = 1/2$ (proof in the next subsection).
	At every time step, two agents in the MDP take actions from $\mathcal{A}^{(1)}= \mathcal{A}^{(2)} = \{0, 1\}$, and move to a next state together.
	The starting state is $s_1$ and by taking a joint action they move to $s_2$ or $s_3$, which 
	are absorbing states and the agents will remain in them once they get there.
	It is easy to see that if we fix the policy for one of the agents in all states, the other agent will face a reduced MDP where the transitions only depend on its own actions.
	We will show that for two different policies followed by the second agent,
	the problem (the MDP) the first agent faces changes.
	More specifically, the best action for the first agent in $s_1$ is different in the resulting MDPs, which suggests that 
	the first agent should coordinate
	with the second agent to achieve a higher value.
	It also shows that this example cannot be reduced to a product MDP, since in product MDPs the best action for each agent is irrespective of the behavior of the other agents.
	
	Assume two different policies $\pi_0, \pi_1: \mathcal{S} \rightarrow \Delta_{\mathcal{A}^{(2)}}$ for the second agent,
	such that $\pi_0(s_i) = \delta_0$ and $\pi_1(s_i) = \delta_1$ for $i \in [3]$, where $\delta_j$ for $j \in \{0, 1\}$ is the Dirac delta distribution concentrated on action $j$.
	Policy $\pi_0$ causes the joint policy $\pi$ to get reward 1 in $s_3$ and get reward 0 in $s_2$, regardless of the policy followed by the first agent.
	The effect of following $\pi_1$ is exactly the opposite, meaning getting reward 1 in $s_2$ and 0 in $s_3$.
	Consequently, the optimal action for agent 1 depends on choosing $\pi_0$ or $\pi_1$ by the second agent.
	Therefore, agent 1 needs to coordinate its action with the second agent's policy to get the higher reward.
	This property, coordination with the other agent's policy, cannot be modeled with separate MDPs
	since in those cases the optimal action for each agent only depends on the agent's MDP, and does not depend
	on the behavior of other agents.
	This example shows that \cref{ass: feature decomposition} is not limited
	to solving multiple MDPs with joint reward observation, and can model some cases where cooperation is needed.\looseness=-1
\end{example}

\subsubsection*{Realizability}
In this section we prove that the MDP in \cref{fig:coordination-example} satisfies \cref{ass: feature decomposition}.
We start by showing that all the deterministic policies are realizable using the shown feature vectors.
We use the weight vector
$w_{(a_1^1,a_1^2),(a_2^1,a_2^2),(a_3^1,a_3^2)}$ 
for a deterministic policy
that takes action vector $(a^1_i, a^2_i)$ in state $s_i$ for $i \in \{1, 2, 3\}$ and $a^1_i, a^2_i \in \{0,1\}$. We also use $\cdot$ to show that the choice of an action in the respective state does not change the weight vector. 
One can verify that the following vectors satisfy the realizability assumption:
\begin{align*}
    w_{(\cdot, \cdot),(\cdot, 0),(\cdot, 0)} = \begin{bmatrix} 0 \\ 1 \end{bmatrix},
    \quad \quad
    w_{(\cdot, \cdot),(\cdot, 0),(\cdot, 1)} = \begin{bmatrix} 0 \\ 0 \end{bmatrix},
    \\
    w_{(\cdot, \cdot),(\cdot, 1),(\cdot, 0)} = \begin{bmatrix} 1 \\ 1 \end{bmatrix},
    \quad \quad
    w_{(\cdot, \cdot),(\cdot, 1),(\cdot, 1)} = \begin{bmatrix} 1 \\ 0 \end{bmatrix}.
\end{align*}
It remains to show that the non-deterministic policies are also realizable.
For a policy $\pi$ that takes action $(\cdot, 1)$ at $s_2$ with probability $p_2$, and action $(\cdot, 0)$ at $s_3$ with probability $p_3$, the realizable weight vector is:
\[
    w_\pi = \begin{bmatrix}
    p_2\\
    p_3\\
    \end{bmatrix}.
\]
This holds since the choice of the action in $s_1$ does not change the weight vector in this example.






   
    
    
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   















\input{parts/experiment.tex}

\section{\uppercase{Conclusion}} \label{sec:discussion}

In this work, we considered the problem of planning with a local access simulator when the action space is combinatorially large.
We introduced several algorithms that achieve polynomial computational and query complexity guarantees, while still maintaining a reasonable sub-optimality of the output policy under various assumptions. The main novelty is an efficient implementation of the uncertainty check under the mild assumption of having access to a greedy oracle. If the $Q$-functions for all policies satisfy an additive structure we provide nuanced results that show how the sample complexity can be improved in the regime where the dimension is large. Under the same additive structure our results also extend to the kernelized setting. An interesting direction for future work is to extend the results to the Factored MDP model \citep{guestrin2001multiagent} or the Confident LSVI algorithm \citep{hao2022confident}.



\subsection{Experimental Results}


\begin{wrapfigure}{r}{0.3\textwidth}
	\begin{center}
		\includegraphics[width=0.3\textwidth]{plots/grid.pdf}
	\end{center}
	\caption{Four agent grid world.}\label{fig:grid}
\end{wrapfigure}

We evaluate the performance of the proposed algorithms in a small grid world example as shown in \cref{fig:grid}. 
Each of the four agents is placed in a $3 \times 3$ grid world. 
The agents obtain a +1 reward for reaching the goal state and a -1 reward in a `trap' state. 
Reaching either the trap state or the reward state terminates the episode. 
Each agent has four actions to move to a neighboring cell. 
The selected action is applied with probability 0.95 while with 0.05 probability an action is chosen uniformly at random. 
The global reward is the sum of the agents' rewards. 
Note that the individual rewards are not observed, therefore the example is different from four separate grid worlds.


We run each variant of the algorithm for 50 iterations ($K=50$) without resets (the resets are mainly for simplicity of analysis). 
The discount factor is set to $\gamma = 0.8$, the regularization parameter is set to $\lambda = 10^{-5}$, for Politex we set $\alpha=1$ and the rollout length is $H=15$. 
The agents' individual features are one-hot encodings of agent, agent positions and actions which results in a feature of dimension $d=4 \cdot 9 \cdot 4 = 144$. 
Note, however, that the joint MDP is \emph{not} tabular, as the joint features,~i.e. the sum over the agent features, are not one-hot vectors. 
In fact, the features are crucial for generalization as there are a total of $9^4 = 6561$ joint states for all four agents combined.
	
	
\Cref{fig:exp} shows two experiments with $n=10$ and $n=50$ rollouts. 
The plots show the performance of the policy estimate after each iteration averaged over 25 random seeds. 
We run both \textsc{Confident MC-LSPI}~and \textsc{Confident MC-Politex}~with \textsc{EGSS}~(\cref{alg:uncertainty check egss}) and \textsc{DAV}~(\cref{alg:uncertainty check dav}) uncertainty checks. 
In addition we compare to the \textsc{NAIVE}~uncertainty check (\cref{alg:uncertainty check}) that iterates over all $|A| = 4^4$ actions \citep{yin2021efficient}.  
Note that with 50 rollouts, both the \textsc{EGSS}~and \textsc{DAV}~variants perform essentially the same as \textsc{NAIVE}, despite the relaxed uncertainty bound. 
LSPI finds a good policy within at most five iterations. 
With only 10 rollouts, the final policy of \textsc{Confident MC-LSPI}~converges to a suboptimal value on average. This can be explained by the fact that the data is not shared between iterations, and the noise from the Monte-Carlo estimates sometimes leads to a deterioration in the policy improvement step. With 50 rollouts per iteration, LSPI reliably finds the optimal policy in all MDPs.
On the other hand, \textsc{Confident MC-Politex}~is much more stable even with just 10 rollouts, but also requires more iterations to converge. 
This is expected because in \textsc{Confident MC-Politex},~the policy estimates from all iterations are averaged.

\begin{figure}[t]
	\includegraphics{plots/3x3-aistats.pdf}
	\caption{Numerical results on a grid world with four agents.}\label{fig:exp}
\end{figure}
\section{Experiments} \label{sec:experiments}
In this section we present experiments on a cooperative multi-agent cliff-walking task.
The results cover the following algorithms: MCLSPI-EGSS, MCLSPI-DAV, Politex-EGSS, and Politex-DAV.
\section{Extensions}\todoj{put this here for now but this can potentially go elsewhere}
\subsection{Argmax Oracle}


\subsection{Kernelized Algorithm}
Add all the new notation for kernel setting and state the theorem.
    Kernel proofs are in Appendix \ref{app:kernel setting} \todoj{if we keep it we just refer to the appendix}
\section{\uppercase{Introduction}} \label{sec: introduction}

Reinforcement learning (RL) is concerned with training data-driven agents to make optimal decisions in interactive environments. 
An agent interacts with an environment by choosing actions and observing its state and a reward signal. 
The goal is to learn an optimal policy that maximizes the total reward. 
Efficiently computing optimal policies, also known as \emph{planning}, is therefore at the heart of any reinforcement learning algorithm. 


Recent works have successfully applied reinforcement learning algorithms to complex domains including video games \citep{mnih2013playing}, tokamak plasma control \citep{degrave2022magnetic}, and robotic manipulation tasks \citep{akkaya2019solving}, to name a few. A common theme of these works is that the agent is trained on a simulated environment. This provides additional flexibility on how the agent can interact with the environment. A reasonable assumption is that the internal state of the simulator can be saved (`checkpointing') and later revisited. 
\looseness=-1


In this work, we formally study \emph{efficient planning with local access to a simulator}. The local access model was recently proposed by \citet{yin2021efficient} with the goal of making the simulation access model more practical in applications. Local access means that the only states at which the planner can query the simulator are the initial state or states returned in response to previously issued queries. Efficient planning means that given an initial state, the learner outputs a near-optimal policy using polynomial compute and queries in all relevant parameters. 

\input{parts/table}

Motivated by the increasing complexity of applications, we specifically study the case where the state space is large or continuous.
To avoid the query complexity scaling with the size of the state space, it is standard to introduce linear function approximation \citep[e.g.,][]{bertsekas1996temporal,lagoudakis2003least,munos2005error,lattimore2020learning}. 
In particular, we assume linear $\epsilon$-realizability of joint state-action value functions for \emph{all} policies. 
This is motivated by the recent realization that realizability of the optimal state-action value function alone is not sufficient to develop a query-efficient planner~\citep{weisz2021exponential}. 
However, even under stronger realizability assumptions, previous approaches are not computationally efficient in the case where the action space is \emph{combinatorially large}, and direct enumeration of the action space becomes infeasible.
Therefore, we work with a minimal oracle assumption that allows us to compute the greedy policy for any Q-function in the model class (which amounts to solving a linear optimization over the feature space).

One prominent special case of this setting is multi-agent reinforcement learning. 
Multi-agent reinforcement learning has been a recent research focus with multiple promising attempts at tackling complex multi-agent problems, e.g., team games \citep{baker2019emergent}, large scale traffic signal control \citep{chu2019multi}, cooperative controls in powergrids \citep{chen2021powernet} among others.
Naively applying single-agent planning algorithms fails to achieve efficiency in the multi-agent setting because the single-agent algorithms typically face an exponential blow-up of the action space in the number of agents. 
In many practical tasks, however, there is an inherent structure in the underlying dynamics that can be exploited to address both efficiency and scalability issues. \looseness=-1


\paragraph{Contributions} Our first contribution is a novel oracle-efficient variant of the Confident Monte-Carlo least-squares policy iteration (\textsc{Confident MC-LSPI}) algorithm by \citet{yin2021efficient}, for combinatorially large action spaces. 
The key insight is an efficient implementation of the \emph{uncertainty check}, that determines the diversity of the state and action set used for estimation. 
We also study a special case where the Q-function has an additive structure in the features (formally introduced in \Cref{ass: feature decomposition}), which leads to improved bounds in the regime where the dimension is large. 
In the multi-agent setting, the decomposition corresponds to agent-specific features, and the proposed algorithms achieve polynomial compute and query complexity in the number of agents and other quantities of interest. 
We further introduce a kernelized variant, which under the corresponding additivity assumption admits an efficient implementation. 
Lastly, the additive structure leads to an efficient implementation of the \textsc{Confident MC-Politex} algorithm that admits improved bounds in the misspecified setting. 
The formal results are summarized in \cref{table:2}.\looseness=-1






\section{\uppercase{Preliminaries}} \label{sec:preliminaries}
We consider reinforcement learning in an infinite-horizon Markov decision process (MDP) specified by a tuple $\mathcal{M} = \left(\mathcal{S}, \mathcal{A}, \mathbb{P}, r, \gamma\right)$. 
As usual, $\mathcal{S}$ denotes the state space, $\mathcal{A}$ denotes the action space, and
$\mathbb{P} : \mathcal{S} \times \mathcal{A} \rightarrow \Delta_{\mathcal{S}}$ is the transition kernel, where $\Delta_{\mathcal{S}}$ denotes the set of probability measures over $\mathcal{S}$. 
Given a state $s \in \mathcal{S}$ and action vector $a \in \mathcal{A}$, the system transits to a new state $s' \sim \mathbb{P}(s, a)$. The reward function is 
$r : \mathcal{S} \times \mathcal{A} \rightarrow [0, 1]$ and 
$\gamma \in [0, 1)$ is the discount factor.\looseness=-1

A stationary policy $\pi :\mathcal{S} \rightarrow \Delta_\mathcal{A}$ maps states to a distribution over $\mathcal{A}$.
The state value function $V_\pi: \mathcal{S} \to \mathbb{R}$ of a policy $\pi$ from a state {$s \in \mathcal{S}$} is 
\begin{align*}
	V_\pi(s) = \mathbb{E}_{\pi}\Bigg[\sum_{t=0}^\infty \gamma^t r(s_t, a_t)\Big|s_0 = s\Bigg] \,.
\end{align*}
The expectation is over the sequence of states $(s_t)_{t \in \mathbb{N}}$ and actions  $(a_t)_{t \in \mathbb{N}}$ queried from the transition kernel $\mathbb{P}$ and the policy $\pi$. 
A policy $\pi^*$ is \emph{optimal} if $V_{\pi^*} = \max_{\pi} V_{\pi}$.

The Q-function $Q_\pi: \mathcal{S} \times \mathcal{A} \to \mathbb{R}$ of a policy $\pi$
is defined for $s \in \mathcal{S}$ and $a \in \mathcal{A}$ as
\begin{align*}
	Q_{\pi}(s,a) = r(s,a)  + \gamma \mathbb{E}_{s' \sim \mathbb{P}(s,a)}\left[V_{\pi}(s')\right].
\end{align*} 

In the following we assume that we are given a state-action feature map $\phi: \mathcal{S} \times \mathcal{A} \to \mathbb{R}^d$, which allows us to approximate the $Q$-function of any policy as a linear function.\looseness=-1
\begin{assumption}[Linear $Q_\pi$-realizability]
	\label{asm:linear-q-pi}


	For each policy $\pi$ there exists a weight vector $w_\pi \in \mathbb{R}^d, \|w_\pi\|_2 \leq b $ satisfying  $\max_{s,a} |Q_\pi(s,a) -w_\pi^\top  \phi(s,a)| \le \epsilon$.
\end{assumption}
The assumption is commonly used in combination with policy iteration algorithms \citep{lattimore2020learning,zanette2020learning}.  In particular, the assumption allows us to obtain query complexity results that are independent of the number of states and actions. We remark that the linear MDP assumption \citep{jin2020provably} implies $Q_\pi$-realizability, but not vice versa. We also make the following standard boundedness assumption:

\begin{assumption}[Bounded features]
	\label{ass: bounded features}
	We assume that $\|\phi(s, a)\|_2 \le 1$ for all $(s, a) \in \mathcal{S} \times \mathcal{A}$.
\end{assumption}


Our main objective is to obtain query and computationally efficient algorithms for the case where the action set $\mathcal{A}$ is \emph{combinatorially} large, and direct enumeration becomes infeasible. To obtain meaningful results in this setting, we assume that the \emph{offline problem} of computing the greedy policy given a \emph{fixed} approximator $w \in \mathbb{R}^d$ can be solved efficiently. This is formally captured in the next assumption.\looseness=-1
\begin{assumption}[Greedy oracle]
	\label{ass: argmax oracle}
	We have access to an oracle $\mathcal{G}$ which takes as input a vector $w \in \mathbb{R}^d$, a state $s \in \mathcal{S}$ and a feature function $\phi: \mathcal{S} \times \mathcal{A} \to \mathbb{R}^d$ and returns an action that maximizes $w^\top \phi(s, a)$.
	Formally
	\begin{align*}
		\mathcal{G}(w, s, \phi) = \argmax_{a \in \mathcal{A}} w^\top \phi(s, a)\,,
	\end{align*}
	with ties broken arbitrarily.
\end{assumption}
Combined with the linear $Q_\pi$-realizability (\cref{asm:linear-q-pi}), the greedy oracle amounts to solving a \emph{linear} optimization over the action set $\mathcal{A}$. This is a reasonable assumption, as optimized solvers are available for many settings. It is also a \emph{minimal} assumption in the sense that it is required to implement a policy iteration procedure. Note that the assumption can be relaxed to require only an $\epsilon$-approximate solution, which is essentially equivalent to misspecification (\cref{asm:linear-q-pi}). In \cref{sec:additive} we provide an additive model where the oracle can be directly implemented.


Our goal is to find a computational and query efficient algorithm that given a starting state $\rho \in \mathcal{S}$ returns a $\kappa$-optimal policy $\hat \pi$, i.e.~$V_{\pi^*}(\rho) - V_{\hat \pi}(\rho) \leq \kappa$ for $\kappa > 0$ while minimizing the number of queries needed. 
To obtain queries, the learner is given \emph{local access} to a simulator of the MDP \citep{yin2021efficient}.
A simulator of the MDP takes as input a state-action pair $(s,a) \in \mathcal{S} \times \mathcal{A}$ and returns a next state $s^\prime \sim \mathbb{P}(s, a)$ and reward $r(s,a)$.
A local access simulator restricts the input state $s \in \mathcal{S}$ only to those states which have been visited previously.





An important example where the action set is typically large is cooperative multi-agent reinforcement learning.
\begin{example}[Cooperative multi-agent RL] \label{ex:multi-agent} In the multi-agent setting, $m \in \mathbb{N}$ agents act jointly on the MDP $\mathcal{M}$.
Each agent $i \in [m]$ has a set of actions $\mathcal{A}^\ag{i}$ available where $[m] := \{1,\dots,m\}$.
We denote the joint action set by $\mathcal{A} = \mathcal{A}^{(1:m)} := \mathcal{A}^{(1)} \times \dots \times \mathcal{A}^{(m)}$. 

	 The state space $\mathcal{S}$ is joint for all agents.
	 A centralized, stationary policy $\pi :\mathcal{S} \rightarrow \Delta_{\mathcal{A}^{(1:m)}}$ maps states to a distribution over $\mathcal{A}^{(1:m)}$. In the \emph{cooperative} setting, the agents jointly maximize a global reward function $r : \mathcal{S} \times \mathcal{A}^{(1:m)} \rightarrow [0, 1]$.
	
	
	
\end{example}
Note that the size of the joint action set is exponential in the number of agents, which makes approaches designed for the single agent setting computationally intractable. We will revisit this example in \cref{sec:additive} where we discuss how an additive feature decomposition leads to algorithms that scale polynomially in the number of agents $m$. We remark that prior work on multi-agent RL has focused on architectures where the greedy policy can be computed efficiently \citep[e.g.,][]{guestrin2001multiagent,rashid2018qmix,delarue2020reinforcement,zohar2021locality}. \looseness=-1





\section{\uppercase{Related Work}} \label{sec:related work}


Computing optimal policies, also known as \emph{planning}, is a central challenge in reinforcement learning \citep{sutton2018reinforcement,szepesvari2010algorithms}. The two most classical planning algorithms are value iteration \citep{bellman1957markovian} and policy iteration \citep{howard1960dynamic}. Approximate versions of value and policy iteration were analyzed by \citet{munos2003error,munos2005error,farahmand2010error}. A common setting is planning with a \emph{generative model} (also \emph{global} simulator access), where the learner can query the transition kernel at any state and action \citep{kakade2003sample}. In the corresponding tabular setting the query complexity of value and policy iteration are completely understood \citep[e.g.,][]{azar2012sample,azar2013minimax}. When combined with function approximation, the picture becomes more nuanced. A lower bound under misspecification was provided by \citet{du2019good}. Sample complexity bounds for least-squares policy iteration \citep{bertsekas1996temporal,lagoudakis2003least} are by \citet{lattimore2020learning}. The latter work combines a G-experimental design over state-action pairs with Monte-Carlo rollouts to obtain value estimates for the policies. In similar fashion, least-squares value iteration (LSVI) was analyzed in the generative model setting \citep{agarwal2020reinforcement}. Yet another approach is Politex \citep{abbasi2019politex,rltheory2022}, which uses mirror descent to improve the policy. \looseness=-1

A much larger body of work focuses on the online setting, where the learner interacts with the environment in one or multiple episodes. Early work that uses function approximation includes \citep{bradtke1996linear,melo2007q}. Recent works provide query complexity guarantees under various models \citep{osband2016generalization,yang2020function,ayoub2020model,zanette2020learning,du2021bilinear,zhou2021nearly}. This includes approaches that are computationally efficient for small action sets \citep{jin2020provably,agarwal2020pc}. We are not aware of provably query efficient algorithms with \emph{only} linear $Q_\pi$-realizability (\cref{asm:linear-q-pi})  for the online setting. \Citet{abbasi2019politex,lazic2021improved,wei2021learning} prove bounds with a \emph{feature excitation} condition, although these works do not consider large action sets. Negative results under weaker assumptions are known, e.g.~for $Q^*$-realizability \citep{weisz2021exponential} and approximate $Q_\pi$-realizability \citep{du2019good}.\looseness=-1

Recently, \citet{yin2021efficient} introduced the \emph{local access} model, in which the learner can query the simulator at the initial state or any state observed during planning. They further introduce a Monte-Carlo policy iteration algorithm that provides the basis of our work. Different to this previous work, we consider the combinatorial action set setting, and provide new algorithms that avoid scaling of the computational complexity with the size of the action set. 
Least-squares value iteration with local access was analyzed by \citet{hao2022confident}. For a detailed discussion of different simulator models we refer the reader to \citet{yin2021efficient}. 



Relatively few related works on computationally efficient planning in MDPs are concerned with combinatorial action spaces. 
This topic has received attention in the context of factored MDPs in planning \citep{dean1998solving,geisser2020trial,raghavan2012planning}, online RL \citep{osband2014near, xu2020near, tian2020towards, chen2020efficient} and in the empirical literature \citep{delarue2020reinforcement,hubert2021learning} with applications to vehicle routing and control problems. We are not aware of prior work with query complexity guarantees for MDPs with large action sets; however, there is a long line of work on combinatorial bandits \citep[e.g.,][]{cesa2012combinatorial,chen2013combinatorial,shleyfman2014combinatorial,combes2015combinatorial,jourdan2021efficient}. Relevant in this context are also kernelized bandit algorithms (Bayesian optimization) that exploit additive structure of the reward function \citep{kandasamy2015high,wang2019improving,kirschner2021bias,mutny2018efficient,rolland2018high}. We consider a similar assumption in \cref{sec:additive} as a special case.




Multi-agent reinforcement learning  \citep{busoniu2008comprehensive,zhang2021multi} can be understood as a combinatorial setting, which has a large body of work of its own. Query complexity bounds focus mostly on the competitive setting, e.g., in tabular Markov games \citep{shapley1953stochastic, song2021can,tian2021online,bai2020provable,liu2021sharp,leonardos2021global}. 
One of the key challenges is the exponential blowup in the action space with the number of agents, which is sometimes referred to as the `curse of multi-agents'. \Citet{jin2021v} introduce a computationally efficient algorithm for tabular Markov games. Multi-agent reinforcement learning with function approximation is studied by \citet{huang2021towards,chen2021almost,jin2020provably}. These works consider the competitive setting and focus on obtaining query efficient algorithms, while the approaches are not computationally tractable.  In the limit where the number of agents becomes large, previous work uses mean-field approximations \citep{yang2018mean,pasztor2021efficient}.








Most closely related is \emph{cooperative} multi-agent learning. Early work by \citet{guestrin2001multiagent} proposes the use of factored MDPs to make planning tractable via message passing algorithms. \Citet{rashid2018qmix} propose a neural network architecture that allows to decouple the agent rewards in a way such that the greedy policy can be computed efficiently. 
The goal of these works is to ensure the greedy policy can be computed efficiently. \Citet{zohar2021locality} consider a setting where a graph structure captures the reward dependencies across the agents; however the guarantees they provide apply only to the bandit setting.\looseness=-1 




\section{Theoretical Results} \label{sec:theoretical results}\todoj{results from this section get distributed into section 4/5, then remove this section}
In this section we provide a theorem for each of the four variants of the main algorithm, as discussed in the previous section. 
The theorems provide bounds on the computation and query complexity of the main algorithm, and the sub-optimality of the output policy's value function.
The complete proofs of all the results can be found in \cref{app: theorem proofs}.

\begin{theorem}[MA-MC-LSPI Sub-Optimality] \label{thm:ma-mc-lspi sub-optimality}
    Let $z=2m-1$ when \textsc{Uncertainty Check with DAV} is used, and $z = \sqrt{d}$ when \textsc{Uncertainty Check with EGSS} is used.
    If \cref{ass: feature decomposition} holds and $\epsilon \neq 0$, then with probability at least $1 - \delta$, the policy $\pi_{K-1}$, output by \textsc{Confident Multi-Agent MC-LSPI} satisfies
    \begin{equation*}
        V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \frac{64 \epsilon \sqrt{d} z}{(1-\gamma)^2} (1 +\log(1+m b^2 \epsilon^{-2} d^{-1}))^{1/2}.
    \end{equation*}
    Further on, the query cost is $\mathcal{O}\left(\tfrac{d^2}{\epsilon^2 (1-\gamma)^4} \right)$
    and computation cost is $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}), \log(b), m, A)$.
    
    While if $\epsilon = 0$, then for any $\kappa > 0$, with probability at least $1 - \delta$, the policy $\pi_{K-1}$, output by \textsc{Confident Multi-Agent MC-LSPI} satisfies
    \begin{equation*}
        V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \kappa. 
    \end{equation*}
    Further, the query cost is $\mathcal{O}\left(\tfrac{z^2 d^3}{\kappa^2 (1-\gamma)^8} \right)$
    and computation cost is $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}), \log(b), m, A)$.
    The parameter settings for both cases above are defined in \cref{app: theorem proofs}.
    Moreover, the above results also hold under \cref{ass: argmax oracle} when \textsc{Uncertainty Check with EGSS} is used.
\end{theorem}

\begin{theorem}[MA-MC-Politex Sub-Optimality] \label{thm:ma-mc-politex sub-optimality}
    Let $z=2m-1$ when \textsc{Uncertainty Check with DAV} is used, and $z = \sqrt{d}$ when \textsc{Uncertainty Check with EGSS} is used.
    If \cref{ass: feature decomposition} holds and $\epsilon \neq 0$, then with probability at least $1 - \delta$, the policy $\bar \pi_{K-1}$, output by \textsc{Confident Multi-Agent MC-Politex} satisfies
    \begin{equation*}
        V^*(\rho) - V_{\bar \pi_{K-1}}(\rho) \leq \frac{32 \epsilon \sqrt{d} z}{1-\gamma} (1 +\log(m b^2 \epsilon^{-2} d^{-1}))^{1/2}.
    \end{equation*}
    Further, the query cost is $\mathcal{O}\left(\tfrac{md}{\epsilon^4 (1-\gamma)^5} \right)$
    and computation cost is $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}), \log(b), m, A)$.
    
    While if $\epsilon = 0$, then for any $\kappa > 0$, with probability at least $1 - \delta$, the policy $\bar \pi_{K-1}$, output by \textsc{Confident Multi-Agent MC-Politex} satisfies
    \begin{equation*}
        V^*(\rho) - V_{\bar \pi_{K-1}}(\rho) \leq \kappa. 
    \end{equation*}
    Further on, the query cost is $\mathcal{O}\left(\tfrac{mz^2 d^3}{\kappa^4 (1-\gamma)^9} \right)$
    and computation cost is $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}), \log(b), m, A)$.
    The parameter settings for both cases above are defined in \cref{app: theorem proofs}.
\end{theorem}
\subsubsection*{\bibname}}


\begin{document}



\onecolumn
\aistatstitle{Provably Efficient Cooperative Multi-Agent Reinforcement Learning with an Online Simulator \\
Supplementary Materials}

\section{Efficient Policy Sampling} \label{app:efficienct policy sampling}
    The policy in \citep{yin2021efficient} can be extended to the multi-agent setting with action space $\mathcal{A}^{(1:m)}$ as follows
    \begin{equation}
    \pi_k(a^{(1:m)}|s) \gets 
        \begin{cases}
            \mathbbm{1}\left(a^{(1:m)} = \argmax\limits_{\tilde{a}^{(1:m)} \in \mathcal{A}^{(1:m)}} w^\top \phi(s, \tilde{a}^{(1:m)})\right) & \text{LSPI} \\
            \exp\left(\alpha \sum\limits_{j=0}^{k-1} w_{j}^\top \phi(s, a^{(1:m)})\right) / \sum\limits_{\tilde{a}^{(1:m)} \in \mathcal{A}^{(1:m)}} \exp\left(\alpha \sum\limits_{j=0}^{k-1} w_{j}^\top \phi(s, \tilde{a}^{(1:m)})\right) & \text{Politex}
        \end{cases} \label{eq:yin policy}
    \end{equation}
    with $w_k = (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \Phi_\mathcal{C}^\top q_\mathcal{C}$ 
    and $Q_{k-1}(s, a) = \Pi_{[0, (1-\gamma)^{-1}]} (w_k^\top \phi(s, a))$ for the Politex case only.
    In this appendix we show that the above policy can be sampled from efficiently if \cref{ass: feature decomposition} or \cref{ass: argmax oracle} is satisfied for the LSPI case, and if \cref{ass: feature decomposition} is satisfied for the Politex case.
    To be precise, by efficiently we mean with computation that depends on $\text{poly}(m, A, d)$ and not $\text{poly}(A^m, d)$.
    We assume only $w \in \mathbb{R}^d$ or $w_0, ..., w_{k-1} \in \mathbb{R}^d$ (for LSPI and Politex respectively) and a feature map $\phi: \mathcal{S} \times \mathcal{A}^{(1:m)} \to \mathbb{R}^d$ are given, thus the process of sampling may require calculating the policy if necessary to accurately sample.
    First we handle the LSPI case.
    
    \begin{proposition}[Efficient LSPI Policy Sampling]
        \label{prop: efficient lspi policy sampling}
        Given state $s \in \mathcal{S}$, parameter vector $w \in \mathbb{R}^d$, feature map $\phi: \mathcal{S} \times \mathcal{A}^{(1:m)} \to \mathbb{R}^d$ and assumption \ref{ass: feature decomposition} or \ref{ass: argmax oracle} satisfied. 
        Then policy
        $$\pi_k(a^{(1:m)}|s) = \mathbbm{1}\left(a^{(1:m)} = \argmax_{\tilde{a}^{(1:m)} \in \mathcal{A}^{(1:m)}} w^\top \phi(s, \tilde{a}^{(1:m)})\right)$$
        can be sampled from in time $\text{poly}(d, m, A)$.
    \end{proposition}
    \begin{proof}
        One can sample from policy $\pi_k$ by simply outputting the result of $\argmax_{\tilde{a}^{(1:m)} \in \mathcal{A}^{(1:m)}} w^\top \phi(s, \tilde{a}^{(1:m)})$. 
        Under assumption \ref{ass: argmax oracle} $\argmax_{\tilde{a}^{(1:m)} \in \mathcal{A}^{(1:m)}} w^\top \phi(s, \tilde{a}^{(1:m)})$ can be computed in time $\text{poly}(d, m, A)$ by applying the oracle to $w$ and $\phi$ (i.e. $\mathcal{O}(w, \phi)$).
        Since assumption \ref{ass: feature decomposition} implies assumption \ref{ass: argmax oracle} the result also holds under assumption \ref{ass: feature decomposition}. 
    \end{proof}

    Next, we handle the Politex case. 
    To achieve the result below we had to modify the Politex algorithm slightly, by removing the clipping of the $Q$-function at each iteration $k$ (i.e. we define the $Q$-function at iteration $k$ to be $Q_{k-1}(s, a) = w_k^\top \phi(s, a^{(1:m)})$ instead of $Q_{k-1}(s, a) = \min\{\max\{w_k^\top \phi(s, a^{(1:m)}), 0\}, 1/(1-\gamma)\}$).
    This was done since we were not aware of an efficient way to compute the clipped $Q$-function for all action-vectors in $\mathcal{A}^{(1:m)}$.
    Removing the clipping incurs only a minor increase in the final policy's sub-optimality (shown in Appendix \ref{app: theorem proofs} and in the Notes section of \citet{szepesvari2022}).
    \begin{proposition}[Efficient Politex Policy Sampling]
        \label{prop: efficient politex policy sampling}
        Given state $s \in \mathcal{S}$, parameter vectors $w_0, ..., w_{k-1} \in \mathbb{R}^d$, feature map $\phi: \mathcal{S} \times \mathcal{A}^{(1:m)} \to \mathbb{R}^d$ and assumption \ref{ass: feature decomposition} satisfied. 
        Then policy
        $$\pi_k(a^{(1:m)}|s) = \exp\left(\alpha \sum\nolimits_{j=0}^{k-1} w_{j}^\top \phi(s, a^{(1:m)})\right)/ \sum\nolimits_{\tilde{a}^{(1:m)} \in \mathcal{A}^{(1:m)}} \exp \left(\alpha \sum\nolimits_{j=0}^{k-1} w_{j}^\top \phi(s, \tilde{a}^{(1:m)})\right)$$ 
        can be sampled from in time $\text{poly}(d, m, A)$.
    \end{proposition}

    \begin{proof}
    Fix arbitrary $a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    To sample from $\pi_k$ it is sufficient to sample actions $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ proportional to $\exp(\alpha \sum_{j=0}^{k-1} Q_{j}(s, a^{(1:m)}))$.
    Rearranging $\exp(\alpha \sum_{j=0}^{k-1} Q_{j}(s, a^{(1:m)}))$ and plugging in that $\phi(s, a^{(1:m)}) = \sum_{i=1}^m \phi_i(s, a^\ag{i})$ under assumption \ref{ass: feature decomposition} we have
    \begin{align*}
        \exp\left(\alpha \sum\nolimits_{j=0}^{k-1} w_{j}^\top \phi(s, a^{(1:m)})\right) 
        &= \prod_{j=0}^{k-1} \exp\left(\alpha w_{j}^\top \phi(s, a^{(1:m)})\right) \\
        &= \prod_{j=0}^{k-1} \exp\left(\alpha w_{j}^\top \sum_{i=1}^m \phi_i(s, a^\ag{i})\right) \\
        &= \prod_{i=1}^m \prod_{j=0}^{k-1} \exp\left(\alpha w_{j}^\top \phi_i(s, a^\ag{i})\right)
    \end{align*}

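    This means that the unnormalized probability of action $a^{(1:m)}$ factorizes across agents, so the probability of sampling $a^{(1:m)}$ is equal to the product of the probabilities of sampling $a^\ag{i}$ for agents $i \in [m]$ independently, where agent $i$ samples $a^\ag{i}$ with probability proportional to $\prod_{j=0}^{k-1} \exp(\alpha w_{j}^\top \phi_i(s, a^\ag{i}))$.
    Each of these $m$ per-agent distributions requires normalizing only over the actions of a single agent, and since $a^{(1:m)}$ was arbitrary this completes the proof.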
    \end{proof}


    \section{{Bound on Core Set Size}}
    Recall that only tuples containing state-action vectors that satisfy $\phi(s, a^{(1:m)})^\top (\Phi^\top \Phi + \lambda I)^{-1} \phi(s, a^{(1:m)}) > \tau$ are added to the core set. 
    This ensures that the size of the core set can be bounded by a $\text{poly}(d, \tau, \log(m), \log(1/\lambda))$ function.
    
    \begin{lemma}[Bound on Core Set Size (modified Lemma 5.1 in \citep{yin2021efficient})] \label{lemma:bound on core set size}
        When \cref{ass: bounded features} is satisfied, and $(s,  a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ that satisfy $\phi(s, a^{(1:m)})^\top (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \phi(s, a^{(1:m)}) > \tau$ are added to the core set, the size of the core set can be bounded by
        \begin{align}
             \tilde{C}_{\max} := \frac{e}{e-1} \frac{1 + \tau}{\tau} d \left( 
                \log(1 + \frac{1}{\tau}) +
                \log(1 + \frac{m}{\lambda})
            \right). \label{eq:cmax-new}       
        \end{align}
    \end{lemma}

    The Lemma displayed above is borrowed directly from \citet{yin2021efficient}.
    The only difference is that the features in our setting are bounded by $m$ instead of $1$ (Assumption \ref{ass: bounded features}), and thus we obtain an extra $\log(m)$ factor, which can easily be verified.

    \section{Efficient Uncertainty Check} \label{app:efficient uncertainty check}
    \label{app: efficient uncertainty check}
    The \textsc{Uncertainty Check} algorithm is the implementation used by \citet{yin2021efficient} extended to the multi-agent setting, where the action space is $\mathcal{A}^{(1:m)}$.
    The \textsc{Confident MC-LSPI/Politex} algorithm proposed by \citet{yin2021efficient} is identical to our \textsc{Confident Multi-Agent MC-LSPI/Politex} algorithm with \textsc{Uncertainty Check with (DAV/EGSS)} replaced with \textsc{Uncertainty Check} and the policy on line 15 replaced with \cref{eq:yin policy}.
    \begin{algorithm}
    \caption{\textsc{Uncertainty Check}} \label{alg:uncertainty check}  
    \begin{algorithmic}[1]
    \State \textbf{Input:} state $s$, feature matrix $\Phi_\mathcal{C}$, regularization coefficient $\lambda$, threshold $\tau$.
    \For {$a^{(1:m)} \in \mathcal{A}^{(1:m)}$}
        \If {$\phi(s, a^{(1:m)})^\top (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \phi(s, a^{(1:m)}) > \tau$}
            \State status $\gets$ uncertain, result $\gets (s, a^{(1:m)}, \phi(s, a^{(1:m)}), \text{none})$
            \State \Return {status, result}
        \EndIf
    \EndFor 
    \State \Return certain, none 
    \end{algorithmic}
    \end{algorithm}
    In this appendix we show how the loop over all action vectors $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ in the \textsc{Uncertainty Check} algorithm (line 2) can be avoided when either \cref{ass: feature decomposition} or \cref{ass: argmax oracle} is satisfied.
    In particular, we show that \textsc{Uncertainty Check with DAV} and \textsc{Uncertainty Check with EGSS} algorithms are able to reduce the computation time of the \textsc{Uncertainty Check} from $\text{poly}(A^m, d)$ to $\text{poly}(m, A, d)$, while still maintaining suitable output policy guarantees.

    Since we are extending the \textsc{Confident MC-LSPI/Politex} algorithm proposed by \citet{yin2021efficient}, we borrow many of the steps from their proof.
    \citet{yin2021efficient} used a \textit{virtual algorithm} (VA) and \textit{main algorithm} (MA) to prove the sub-optimality of their \textsc{Confident MC-LSPI/Politex} algorithm.
    We give a brief summary of the VA and MA; however, we omit the full details since we use the exact same definitions as in \citet{yin2021efficient}. 
    Until the next subsection, assume \textsc{Uncertainty Check} is used in place of \textsc{Uncertainty Check with (DAV/EGSS)} in \textsc{Confident Multi-Agent MC-LSPI/Politex}.
    The MA is exactly \textsc{Confident Multi-Agent MC-LSPI/Politex}. 
    The VA is based on the \textsc{Confident Multi-Agent MC-LSPI/Politex} algorithm, but has some differences, which we outline next. 
    The VA runs for exactly $C_\text{max}$ loops, $K$ iterations, and completes all $n$ of its rollouts of length $H$. 
    For each loop and iteration $k$ the VA always obtains estimates $q_\mathcal{C}$ of its policy.
    The VA uses a different policy than the MA for rollouts.
    We will first focus on the LSPI case and return to Politex much later.
    The VA's $Q$-function at iteration $k$ is  
    \begin{equation*}
        \tilde{Q}_{k-1}(s, a^{(1:m)})=\begin{cases}
                          \tilde{w}_k^\top \phi(s, a^{(1:m)}) \quad &\text{if} \, \phi(s, a^{(1:m)}) \in \mathcal{H} \\
                          Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})     \quad &\text{if} \, \phi(s, a^{(1:m)}) \notin \mathcal{H} \\
                    \end{cases}
    \end{equation*}
    where $\tilde w_k = V_\mathcal{C}^{-1} \Phi_\mathcal{C}^\top \tilde q_\mathcal{C}$, and $\tilde q_\mathcal{C}$ are the estimates obtained from running \textsc{Multi-Agent-Confident Rollout} on each element of the core set, and $\mathcal{H} = \{\phi(s, a^{(1:m)}): \|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 \le \tau\}$ is the \textit{good set}.
    The VA's policy is
    \begin{equation*}
        \tilde \pi_k(a^{(1:m)} | s) = \mathbbm{1} \left( a^{(1:m)} = \argmax_{\tilde a^{(1:m)} \in \mathcal{A}^{(1:m)}} \tilde Q_{k-1}(s, \tilde a^{(1:m)}) \right).
    \end{equation*}

    The nice thing about defining the VA's policy in this way is that we can make use of the following Lemma from \citep{yin2021efficient}.
    
    \begin{lemma}[Lemma B.2 in \citep{yin2021efficient}]
    \label{lemma: yin lemma b.2}
    Suppose that Assumption \ref{ass: feature decomposition} holds. 
    With all terms as defined earlier and $\theta > 0$. 
    Then, with probability at least 
    $$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
    for any $(s,  a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ pair such that $\phi(s, a^{(1:m)}) \in \mathcal{H}$, we have 
    $$|\tilde{Q}_{k-1} (s, a^{(1:m)}) - Q_{\tilde\pi_{k-1}} (s, a^{(1:m)})| \le b\sqrt{\lambda \tau} + \left(\epsilon +  \frac{\gamma^{H-1}}{1 - \gamma} + \theta \right) \sqrt{\tau C_{\text{max}}} + \epsilon := \eta$$
    \end{lemma}

    Notice that for any $(s,  a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ pair such that $\phi(s, a^{(1:m)}) \notin \mathcal{H}$, the VA's $Q$-function $\tilde Q_{k-1}$ has access to the true $Q$-function $Q_{\tilde{\pi}_{k-1}}$ of policy $\tilde \pi_{k-1}$.
    Thus, we have that 
    \begin{equation}
    \label{eq:va inf-norm bound}
        \| \tilde{Q}_{k-1} (s, a^{(1:m)}) - Q_{\tilde\pi_{k-1}} (s, a^{(1:m)}) \|_\infty \le \eta 
    \end{equation}
    Combined with the fact that $\tilde \pi_k$ is greedy w.r.t. $\tilde Q_{k-1}$ the above result turns out to be especially useful.
    
    To understand why, we state a classic policy improvement result, which can be found as Lemma B.3 in \citet{yin2021efficient} and in other papers.
    \begin{lemma}[approximate policy iteration]
    \label{lemma:approximate policy iteration}
        Suppose that we run $K$ approximate policy iterations and generate a sequence of policies
        $\pi_0, \pi_1, \pi_2, \cdots, \pi_K$.
        Suppose that for every $k = 1, 2, \cdots, K$, in the $k$-th iteration, we obtain a function
        $\tilde{Q}_{k-1}$ such that, $\| \tilde{Q}_{k - 1} - Q_{\pi_{k - 1}} \|_\infty \leq \eta$,
        and choose $\pi_k$ to be greedy with respect to $\tilde{Q}_{k-1}$.
        Then
        \begin{align*}
            \| Q^* - Q_{\pi_K} \|_\infty \leq \frac{2 \eta}{1 - \gamma} + \frac{\gamma^K}{1 - \gamma}.
        \end{align*}
    \end{lemma}

    In our case the VA's policy $\tilde \pi_k$ is greedy w.r.t. $\tilde Q_{k-1}$ and thus we have that
    \begin{align*}
        \| Q^* - Q_{\tilde \pi_K} \|_\infty \leq \frac{2 \eta}{1 - \gamma} + \frac{\gamma^K}{1 - \gamma}.
    \end{align*}

    Now we explain how the MA can be related to the VA, and make use of the above result.
    The \textsc{Uncertainty Check} algorithm can have two cases: 
    
    \textbf{Case 1:} $\|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 > \tau$ holds for at least one $a^{(1:m)} \in \mathcal{A}^{(1:m)}$,
    
    \textbf{Case 2:} $\|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 \le \tau$ holds for all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$. This is equivalent to saying $\phi(s, a^{(1:m)}) \in \mathcal{H}, \ \forall a^{(1:m)} \in \mathcal{A}^{(1:m)}$.

    The VA is exactly the same as the MA, until Case 1 occurs for the first time.
    This is because the MA's and VA's simulators are coupled, in the sense that at iteration $k$, rollout $i$, and step $t$, when both simulators are queried with the same state-action vector pairs, they sample the exact same next state and reward. 
    The VA also uses the same initial policy as the MA at the start of policy iteration for every loop.
    Once Case 1 occurs the MA would restart policy iteration (else condition in line 12 of \textsc{Confident Multi-Agent MC-LSPI/Politex}), while the VA does not. 
    The VA records the state-action vector pair when Case 1 occurs for the first time and adds it to the core set once it completes running policy iteration for the current loop.
    In this way the core set maintained by the MA and VA are always the same.
   
   
    Since the size of the core set is bounded by $C_\text{max}$ when $(s, a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ that satisfy $\phi(s, a^{(1:m)})^\top (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \phi(s, a^{(1:m)}) > \tau$ are added to the core set (\cref{lemma:bound on core set size}), there will be a loop of policy iteration at which the MA and VA never encounter Case 1 for any of the $K$ iterations of policy iteration.
    We call this loop the \emph{final loop}.
    This is equivalent to saying that all $(s,  a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ observed during all $K$ iterations of policy iteration in the final loop are in the good set (i.e. $\phi(s, a^{(1:m)}) \in \mathcal{H}$).
    Notice that this means the MA and VA behave identically in the final loop, since the VA's policy would have always been greedy w.r.t. $\tilde{w}_k^\top \phi$ and the MA and VA use the same initial policy at the start of each loop.
    It turns out this relationship between the MA and VA allows us to bound the sub-optimality of the MA in the final loop, by using the result in \cref{eq:va inf-norm bound} we have for the VA. 
    More precisely, the following result can be extracted from \citep{yin2021efficient}
    \begin{proposition}[equation (B.15) in \citet{yin2021efficient}] \label{prop:optimality of output policy}
    With all terms as defined earlier. 
    Define $\eta \ge \|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$. 
    Suppose $\eta \ge |\tilde w_{k}^\top \phi(\rho, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(\rho, a^{(1:m)})|_\infty, \ \forall a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    Then, if the VA and MA behave identically in the final loop, with probability at least $1 - 4KC_{\text{max}}^2 \exp(-2 \theta^2(1-\gamma)^2 n)$ we have
    \begin{align}
        V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \frac{8 \eta}{(1 - \gamma)^2} + \frac{2 \gamma^{K - 1}}{(1 - \gamma)^2} \label{eq: v function bound main}
    \end{align}
    \end{proposition}
    
    Notice, that we require three things to use the above result. 
    We need a bound on $\|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$. 
    We need a bound on $|\tilde w_{k}^\top \phi(\rho, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(\rho, a^{(1:m)})|_\infty, \ \forall a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    We need to ensure that the VA and MA behave identically in the final loop. 
    Then, we can get a bound on the sub-optimality of the MA's output policy $\pi_{K-1}$.
    An important observation is that \textsc{Uncertainty Check} ensured that MA and VA behave identically in the final loop.
    It did this by making sure that the VA's policy $\tilde \pi_k$ would only be able to use $\tilde{w}_k^\top \phi$ to derive its actions, since \textsc{Uncertainty Check} always returns a \textsc{status} of \textsc{certain} in the final loop, which means that $\phi(s, a^{(1:m)}) \in \mathcal{H}$ for all $s, a^{(1:m)} \in \mathcal{S} \times \mathcal{A}^{(1:m)}$ encountered in the final loop.
    With this information in mind, we now show that \textsc{Uncertainty Check with DAV} and \textsc{Uncertainty Check with EGSS} only requires computation $\text{poly}(d, m, A)$, while providing only slightly worse sub-optimality guarantees when compared to the result in \citep{yin2021efficient}.

   
   

   
    \subsection{Default Action Vector (DAV) Method}
    In this section we prove some useful results for \textsc{Uncertainty Check with DAV}.
    Fix a state $s \in \mathcal{S}$.
    First, \textsc{Uncertainty Check with DAV} only iterates over $\sum_{i=1}^m |A^\ag{i}|$ action vectors instead of all the action vectors like \textsc{Uncertainty Check} does.
    Define the $\sum_{i=1}^m A^\ag{i}$ sized set of modified default action vectors as $\bar \mathcal{A}^{(1:m)} = \{ (a^\ag{i}, \bar a^{(-i)}): a^\ag{i} \in \mathcal{A}^\ag{i}, \ i \in [m] \}$.
    Notice \textsc{Uncertainty Check with DAV} iterates over all the actions in the set $a^{(1:m)} \in \bar \mathcal{A}^{(1:m)}$ and checks if any of them satisfy $\|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 > \tau$.
    This of course achieves the goal of only $\text{poly}(d, m, A)$ computation.
    Also, since only $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ that satisfy $\|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 > \tau$ are added to the core set, we can still use \cref{lemma:bound on core set size} to bound the size of the core set by $C_\text{max}$.

    Now, we aim to ensure that the VA and MA behave identically in the final loop.
    Define the set of states for which all the modified default action vectors are in the good set as $\bar \mathcal{S} = \{ s \in \mathcal{S}: \|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 \le \tau, \forall a^{(1:m)} \in \bar \mathcal{A}^{(1:m)} \}$.
    Redefine the VA's $Q$-function as 
    \begin{equation*}
    \tilde Q_{k-1}(s, a^{(1:m)})=\begin{cases}
          \tilde w_k^\top \phi(s, a^{(1:m)}) \quad & s \in \bar \mathcal{S} \\
          Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)}). \quad & s \in \mathcal{S} \backslash \bar \mathcal{S}
        \end{cases}
    \end{equation*}
    The VA's policy is
    \begin{equation*}
        \tilde \pi_k(a^{(1:m)} | s) = \mathbbm{1} \left( a^{(1:m)} = \argmax_{\tilde a^{(1:m)} \in \mathcal{A}^{(1:m)}} \tilde Q_{k-1}(s, \tilde a^{(1:m)}) \right).
    \end{equation*}
    Notice that in the final loop the check $\phi(s, (a^{(j)}, \bar a^{(-j)}))^\top (\Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I)^{-1} \phi(s, (a^{(j)}, \bar a^{(-j)})) > \tau$ in \textsc{Uncertainty Check with DAV} never returns \textsc{True}, and thus we are sure that all $a^{(1:m)} \in \bar \mathcal{A}^{(1:m)}$ for all the states encountered in the final loop are in the good set.
    Notice that the states that satisfy this condition are exactly the states in $\bar \mathcal{S}$.
    Thus, the VA's policy $\tilde \pi_{k}$ would always be greedy w.r.t. $\tilde w_k^\top \phi$ in the final loop.
    This ensures that the VA and MA behave identically in the final loop.

    Now we show that we can bound $\|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$ with this new definition of $\tilde Q_{k-1}$. 
    First we state a slight modification of \cref{lemma: yin lemma b.2} for $w_{\tilde{\pi}_{k-1}}^\top \phi$ instead of $Q_{\tilde{\pi}_{k-1}}$ which excludes the $\|w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})\|_\infty \le \epsilon$ term in the proof of Lemma B.2 in \citep{yin2021efficient}.
    
    \begin{lemma}[Lemma B.2 in \citep{yin2021efficient}]
    \label{lemma: yin lemma b.2 no epsilon}
    Suppose that Assumption \ref{ass: feature decomposition} holds. 
    With all terms as defined earlier and $\theta > 0$. 
    Then, with probability at least 
    $$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
    for any $(s,  a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ pair such that $\phi(s, a^{(1:m)}) \in \mathcal{H}$, we have 
    $$|\tilde{w}_{k}^\top \phi(s, a^{(1:m)}) - w_{\tilde\pi_{k-1}}^\top \phi(s, a^{(1:m)})| \le b\sqrt{\lambda \tau} + \left(\epsilon +  \frac{\gamma^{H-1}}{1 - \gamma} + \theta \right) \sqrt{\tau C_{\text{max}}}:= \bar\eta$$
    \end{lemma}

    The following Proposition gives us a bound on $\|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$.
    
    \begin{proposition}[approximate value function bound for DAV]
    \label{prop: approx value function bound for DAV}
    Suppose that Assumption \ref{ass: feature decomposition} holds. 
    With all terms as defined earlier and $\theta > 0$. 
    Then, with probability at least 
    $$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
    we have
    $$\|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})\|_\infty \le \bar\eta (2m-1) + \epsilon := \eta_1.$$ 
    \end{proposition}
    
    \begin{proof}
    
    For any $(s, a^{(1:m)}) \in (\bar \mathcal{S} \times \mathcal{A}^{(1:m)})$, we have
    \begin{align}
        & |\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| \nonumber \\
        &= |\tilde{w}_k^\top \phi(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| \nonumber \\
        &= |\tilde{w}_k^\top \phi(s, a^{(1:m)}) \pm w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| \nonumber \\
        &\le |\tilde{w}_k^\top \phi(s, a^{(1:m)}) - w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)})| + |w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| \nonumber \\
        &\le |\tilde{w}_k^\top \phi(s, a^{(1:m)}) - w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)})| + \epsilon \nonumber \\
        &= |\tilde{w}_k^\top \phi(s, a^{(1:m)}) - w_{\tilde{\pi}_{k-1}}^\top \phi(s, a^{(1:m)}) \pm (m-1)\tilde{w}_k^\top \phi(s, \bar a^{(1:m)}) \pm (m-1) w_{\tilde{\pi}_{k-1}}^\top \phi(s, \bar a^{(1:m)})| + \epsilon \nonumber \\
        &= \left|\sum_{i=1}^m \left( \tilde{w}_k^\top \phi(s, (a^\ag{i}, \bar a^{(-i)})) - w_{\tilde\pi_{k-1}}^\top \phi(s, (a^\ag{i}, \bar a^{(-i)})) \right) + (m-1)\left[w_{\tilde{\pi}_{k-1}}^\top \phi(s, \bar a^{(1:m)}) - \tilde{w}_k^\top \phi(s, \bar a^{(1:m)})\right]\right| + \epsilon \nonumber \\
        &\le m \bar\eta + (m-1) \bar \eta + \epsilon \nonumber \\ 
        &= \bar\eta (2m-1) + \epsilon \label{value function bound 1}
    \end{align}
    where the last inequality follows from the triangle inequality and Lemma \ref{lemma: yin lemma b.2 no epsilon} (because the features of all the state-action pairs considered are in $\mathcal{H}$, since $s \in \bar \mathcal{S}$).
    
    While for any $(s, a^{(1:m)}) \in ((\mathcal{S} \backslash \bar \mathcal{S}) \times \mathcal{A}^{(1:m)})$, we have
    \begin{align}
        |\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| 
        = |Q_{\tilde \pi_{k-1}}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})|
        &= 0 \label{value function bound 2}
    \end{align}
    \end{proof}

    Finally, it is left to show that $|\tilde w_{k}^\top \phi(\rho, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(\rho, a^{(1:m)})|$  can be bounded for all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    Notice that lines 3-6 in \textsc{Confident Multi-Agent MC-LSPI} run \textsc{Uncertainty Check with DAV} with state $\rho$ as input until the returned \textsc{status} is \textsc{certain}.
    Recall that once \textsc{Uncertainty Check with DAV} returns a \textsc{status} of \textsc{certain} we know that $\rho \in \bar \mathcal{S}$.
    Thus, we can immediately apply the result in \cref{value function bound 1} to bound $\eta_1 \ge |\tilde w_{k}^\top \phi(\rho, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(\rho, a^{(1:m)})|, \ \forall a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    
    \subsection{Efficient Good Set Search Approach (EGSS)} \label{subsec:good set search}
    
    In this section we prove some useful results for \textsc{Uncertainty Check with EGSS}.
    Fix a state $s \in \mathcal{S}$.
    First, we show that with $\text{poly}(d, m, A)$ computation, one can find an action vector $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ that approximately maximizes $\phi(s, a^{(1:m)})^\top V_\mathcal{C}^{-1} \phi(s, a^{(1:m)})$.
    
    \begin{lemma}[Efficient good set search]
    \label{lemma:good set search}
    Assume either assumption \ref{ass: feature decomposition} or \ref{ass: argmax oracle} is satisfied.
    With all terms as defined earlier. 
    One can ensure, with computation time $2 d^2 \sum_{i=1}^m A^\ag{i}$ that either
    $$\phi(s, a^{(1:m)})^\top V_\mathcal{C}^{-1} \phi(s, a^{(1:m)}) \le d\tau$$
    for all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$, or there exists an $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ such that
    $$\phi(s, a^{(1:m)})^\top V_\mathcal{C}^{-1} \phi(s, a^{(1:m)}) > \tau$$
    \end{lemma}
    
    \begin{proof}
    Recall that we are able to compute $\max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \langle u, \phi(s, a^{(1:m)}) \rangle$ for any $u \in \mathbb{R}^d$ in $\text{poly}(d ,m, A)$ time (due to assumption \ref{ass: feature decomposition} or \ref{ass: argmax oracle}).
    Now, we make use of a bi-directional 2-norm to $\infty$-norm inequality that will take advantage of the above mentioned efficient computation.
    
    Fix $\mathcal{C}$ and define the lower triangular matrix $L$ via the Cholesky decomposition $V_\mathcal{C}^{-1} = L L^\top$.
    Define $\{e_i\}_{i=1}^d$ as the standard basis vectors and 
    \begin{equation*}
    (v^*, a_\text{max}^{(1:m)}) := \text{arg} \left(\max_{v \in \{\pm e_i\}_{i=1}^d} \max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \langle L v, \phi(s, a^{(1:m)}) \rangle \right) 
    \end{equation*}
    Then we have that
    \begin{align}
       \frac{1}{d} \| \phi(s, a_\text{max}^{(1:m)}) \|_{V_\mathcal{C}^{-1}}^2
       &= \frac{1}{d} \phi(s, a_\text{max}^{(1:m)})^\top V_\mathcal{C}^{-1} \phi(s, a_\text{max}^{(1:m)}) \nonumber \\
       &= \frac{1}{d} \phi(s, a_\text{max}^{(1:m)})^\top L L^\top \phi(s, a_\text{max}^{(1:m)}) \nonumber \\
       &= \frac{1}{d} \| L^\top \phi(s, a_\text{max}^{(1:m)}) \|_2^2 \nonumber \\
       &\le \max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \| L^\top \phi(s, a^{(1:m)})\|_\infty^2 \nonumber \\
       &= \max_{v \in \{\pm e_i\}_{i=1}^d} \max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \langle v, L^\top \phi(s, a^{(1:m)}) \rangle^2 \nonumber \\
       &= \max_{v \in \{\pm e_i\}_{i=1}^d} \max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \langle L v, \phi(s, a^{(1:m)}) \rangle^2 \nonumber \\
       &= \langle L v^*, \phi(s, a^{(1:m)}_\text{max}) \rangle^2  \label{inf-norm term} \\
       &\le \| L^\top \phi(s, a_\text{max}^{(1:m)}) \|_2^2 \nonumber 
    \end{align}
    Notice that the purpose of writing all the equalities up to equation (\ref{inf-norm term}) was to show that equation (\ref{inf-norm term}) can be computed in $\text{poly}(d, m, A)$ time.
    This holds since, for each fixed $v$, $\max_{a^{(1:m)} \in \mathcal{A}^{(1:m)}} \langle L v, \phi(s, a^{(1:m)}) \rangle^2$ can be computed in $\text{poly}(d, m, A)$ time (due to assumption \ref{ass: feature decomposition} or \ref{ass: argmax oracle}), and $\{\pm e_i\}_{i=1}^d$ contains only $2d$ elements.
    Also, note that $L$ can be computed with at most $d^2$ computation in each loop by doing a rank one update to the Cholesky decomposition of $V_\mathcal{C}^{-1} = L L^\top$.
    
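    If equation (\ref{inf-norm term}) is larger than $\tau$, then $\|\phi(s, a_\text{max}^{(1:m)}) \|_{V_\mathcal{C}^{-1}}^2 > \tau$.
    While, if equation (\ref{inf-norm term}) is less than or equal to $\tau$, then $\|\phi(s, a^{(1:m)}) \|_{V_\mathcal{C}^{-1}}^2 \le d\tau$ for all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ (the first inequality in the display above holds with any action vector in place of $a_\text{max}^{(1:m)}$), completing the proof.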
    \end{proof}

    \textsc{Uncertainty Check with EGSS} is essentially an implementation of equation (\ref{inf-norm term}), thus it only takes computation $\text{poly}(d, m, A)$ to run, as stated in Lemma \ref{lemma:good set search}.
    Also, since only $a^{(1:m)} \in \mathcal{A}^{(1:m)}$ that satisfy $\|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 \ge \|L^\top \phi(s, a^{(1:m)})\|_\infty^2 > \tau$ are added to the core set, we can still use \cref{lemma:bound on core set size} to bound the size of the core set by $C_\text{max}$.
    Basically, equation (\ref{inf-norm term}) is an underestimate of $\| \phi(s, a_\text{max}^{(1:m)}) \|_{V_\mathcal{C}^{-1}}^2$ and we only add elements to the core set when it is larger than $\tau$, thus the core set is no larger than it was when using \textsc{Uncertainty Check}.
    
    Now, we aim to ensure that the VA and MA behave identically in the final loop.
    Notice that \textsc{Uncertainty Check with EGSS} provides a weaker guarantee than \textsc{Uncertainty Check}, when the returned \textsc{result} is \textsc{certain}.
    Specifically, when \textsc{Uncertainty Check with EGSS} returns a \textsc{result} of \textsc{certain}, then Lemma \ref{lemma:good set search} guarantees that $\|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 \le d\tau$ for all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    While when the \textsc{Uncertainty Check} returns a \textsc{result} of \textsc{certain}, then $\|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 \le \tau$ for all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    Thus, we define a larger (relaxed) good set $\mathcal{H}_d = \{ \phi(s, a^{(1:m)}): \|\phi(s, a^{(1:m)})\|_{V_\mathcal{C}^{-1}}^2 \le d\tau\}$, which contains the original good set $\mathcal{H}$ since $d\tau \ge \tau$.
    
    Redefine the VA's $Q$-function at iteration $k$ as
    \begin{equation*}
        \tilde{Q}_{k-1}(s, a^{(1:m)})=\begin{cases}
                          \tilde{w}_k^\top \phi(s, a^{(1:m)}) \quad &\text{if} \, \phi(s, a^{(1:m)}) \in \mathcal{H}_d \\
                          Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})     \quad &\text{if} \, \phi(s, a^{(1:m)}) \notin \mathcal{H}_d \\
                    \end{cases}
    \end{equation*}
    and VA's policy as
    \begin{equation*}
        \tilde \pi_k(a^{(1:m)} | s) = \mathbbm{1} \left( a^{(1:m)} = \argmax_{\tilde a^{(1:m)} \in \mathcal{A}^{(1:m)}} \tilde Q_{k-1}(s, \tilde a^{(1:m)}) \right).
    \end{equation*}
    Notice that in the final loop \textsc{Uncertainty Check with EGSS} always returns a \textsc{result} of \textsc{certain}, and thus we are sure that the features $\phi(s, a^{(1:m)})$ of all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$, for all the states $s$ encountered in the final loop, are in the larger good set $\mathcal{H}_d$.
    Thus, the VA's policy $\tilde \pi_{k}$ would always be greedy w.r.t. $\tilde w_k^\top \phi$ in the final loop.
    This ensures that the VA and MA behave identically in the final loop.

    We need to show that we can bound $\|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$ with this new definition of $\tilde Q_{k-1}$.
    First we state a slight modification of \cref{lemma: yin lemma b.2} that holds for the larger good set $\mathcal{H}_d$.
    
    \begin{lemma}[EGSS modified Lemma B.2 from \cite{yin2021efficient}]
    \label{lemma: mod b.2 egss}
    Suppose that Assumption \ref{ass: feature decomposition} holds. 
    With all terms as defined earlier and $\theta > 0$. 
    Then, with probability at least 
    $$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
    for any $(s,  a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ pair such that $\phi(s, a^{(1:m)}) \in \mathcal{H}_d$, we have 
    $$|\tilde{w}_k^\top \phi(s, a^{(1:m)}) - w_{\tilde\pi_{k-1}}^\top \phi(s, a^{(1:m)})| \le b\sqrt{\lambda d \tau} + \left(\epsilon + \frac{\gamma^{H+1}}{1 - \gamma} + \theta \right) \sqrt{d \tau C_{\text{max}}} + \epsilon = \sqrt{d} \bar \eta + \epsilon := \eta_2.$$
    \end{lemma}
    
    \begin{proof}
    The proof is identical to that of Lemma B.2 from \cite{yin2021efficient}, except that $\tau$ is replaced with $d \tau$ everywhere, due to the weaker guarantee of algorithm \ref{alg:uncertainty check egss} as discussed above.
    \end{proof}
    
    Essentially we get an extra $\sqrt{d}$ factor due to the larger good set $\mathcal{H}_d$.
    Since the VA has access to the true $Q$-function $Q_{\tilde \pi_{k-1}}$ for all $\phi(s, a^{(1:m)}) \notin \mathcal{H}_d$, its approximation error is zero there and only needs to be controlled on $\mathcal{H}_d$.
    Now we show that $\|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$ can be bounded.
    
    \begin{proposition}[approximate value function bound for EGSS]
    \label{prop: approx value function bound for EGSS}
    Suppose that Assumption \ref{ass: feature decomposition} holds. 
    With all terms as defined earlier and $\theta > 0$. 
    Then, with probability at least 
    $$1 - 2C_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$$
    we have
    $$\|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})\|_\infty \le \eta_2.$$ 
    \end{proposition}

    \begin{proof}
    For any $(s, a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ such that $\phi(s, a^{(1:m)}) \in \mathcal{H}_d$, we have
    \begin{align}
        |\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| \le \eta_2
    \end{align}
    by \cref{lemma: mod b.2 egss}.
    While for any $(s, a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ such that $\phi(s, a^{(1:m)}) \notin \mathcal{H}_d$, we have
    \begin{align}
        |\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})| 
        = |Q_{\tilde \pi_{k-1}}(s, a^{(1:m)}) - Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})|
        &= 0 
    \end{align}
    \end{proof}

    Finally, it is left to show that $|\tilde w_{k}^\top \phi(\rho, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(\rho, a^{(1:m)})|$  can be bounded for all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    Notice that lines 3-6 in \textsc{Confident Multi-Agent MC-LSPI} run \textsc{Uncertainty Check with EGSS} with state $\rho$ as input until the returned \textsc{status} is \textsc{certain}.
    Recall that once \textsc{Uncertainty Check with EGSS} returns a \textsc{status} of \textsc{certain} we know that $\phi(\rho, a^{(1:m)}) \in \mathcal{H}_d$ for all $a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    Thus, we can immediately apply \cref{lemma: mod b.2 egss} to bound $\eta_2 \ge |\tilde w_{k}^\top \phi(\rho, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(\rho, a^{(1:m)})|, \ \forall a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
 

    
    \subsection{Extending to Politex} \label{subsec:extending to politex}
    Recall that the above results were for the \textsc{Confident Multi-Agent MC-LSPI} case.
    It turns out the story for the \textsc{Confident Multi-Agent MC-Politex} is extremely similar and can be argued in nearly the same way. 
    The main difference is that the policy used in \textsc{Confident Multi-Agent MC-Politex} is different than in \textsc{Confident Multi-Agent MC-LSPI} (line 15).
    As such, we can no longer use \cref{lemma:approximate policy iteration} (since it relied on a greedy policy) and, thus cannot use \cref{prop:optimality of output policy} to bound the sub-optimality of the policy output by \textsc{Confident Multi-Agent MC-Politex}. 
    Next, we show that a similar Lemma and Proposition can be derived for \textsc{Confident Multi-Agent MC-Politex}.

    Recall that we do not use clipping on the $Q$-functions in \textsc{Confident Multi-Agent MC-Politex}, so that we can sample from the policy efficiently (Proposition \ref{prop: efficient politex policy sampling}).  
    This means we must define the VA's $Q$-function differently from \citep{yin2021efficient}, by removing clipping from the case when $\phi(s, a^{(1:m)}) \in \mathcal{H}$.
    \begin{equation*}
        \tilde{Q}_{k-1}(s, a^{(1:m)})=\begin{cases}
                          \tilde{w}_k^\top \phi(s, a^{(1:m)}) \quad &\text{if} \, \phi(s, a^{(1:m)}) \in \mathcal{H} \\
                          Q_{\tilde{\pi}_{k-1}}(s, a^{(1:m)})     \quad &\text{if} \, \phi(s, a^{(1:m)}) \notin \mathcal{H} \\
                    \end{cases}
    \end{equation*}
   
    Then the VA's policy is
    \begin{equation} \label{eq:politex virtual policy}
        \tilde \pi_k(a^{(1:m)} | s) \propto \exp \left( \alpha \sum_{j=0}^{k-1} \tilde Q_{j}(s, a^{(1:m)}) \right).
    \end{equation}
    
    Also, due to no clipping, the sequence of $Q$-functions during policy iteration is now in the $[-\eta, (1-\gamma)^{-1} + \eta]$ interval, where $\eta \ge \|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$.
    We now restate Lemma D.1 from \citet{yin2021efficient}, which bounds the sub-optimality of the mixture policy output by Politex for an arbitrary sequence of $Q$-functions.
    Since we do not use clipping, the lemma is slightly modified (we replace the interval $[0, (1-\gamma)^{-1}]$ with a general interval $[a, b], \ a, b \in \mathbb{R}$, which can be extracted from the calculations in \citet{szepesvari2022}).

    \begin{lemma}[modified Lemma D.1 in \citet{yin2021efficient} also in \citet{szepesvari2022}] \label{lemma:politex mixture policy bound}
    Given an initial policy $\pi_0$, a sequence of functions $Q_k: \mathcal{S} \times \mathcal{A}^{(1:m)} \to [a, b], \ k \in [K-1], a, b \in \mathbb{R}$, and $Q_{\pi^*} \in [0, 1/(1-\gamma)]$, construct a sequence of policies $\pi_1, ..., \pi_{K-1}$ according to (\ref{eq:politex virtual policy}) with $\alpha = 1/(b-a) \sqrt{\frac{2 \log(|\mathcal{A}^{(1:m)}|)}{K}}$, then, for any $s \in \mathcal{S}$, the mixture policy $\bar \pi_{K-1} \sim \text{Unif}\{\pi_k\}_{k=0}^{K-1}$ satisfies

    \begin{equation}
        V^*(s) - V_{\bar \pi_{K-1}}(s) \le \frac{b-a}{(1-\gamma)}\sqrt{\frac{2 \log(|\mathcal{A}^{(1:m)}|)}{K}} + \frac{2 \max_{0 \le k \le K-1} \|Q_k - Q_{\pi_k}\|_\infty}{1 - \gamma}.
    \end{equation}
    \end{lemma}

    Notice that the above result suggests we just need to control the term $\|Q_k - Q_{\pi_k}\|_\infty$.
    For the VA this is $\|\tilde Q_k - Q_{\tilde \pi_k}\|_\infty$ and as we have already seen, this can be bounded using the high probability bound on policy evaluation for \textsc{Uncertainty Check with DAV} (Proposition \ref{prop: approx value function bound for DAV}) and \textsc{Uncertainty Check with EGSS} (Proposition \ref{prop: approx value function bound for EGSS}).
    Using Lemma \ref{lemma:politex mixture policy bound} instead of Lemma D.1 in \citet{yin2021efficient}, one can extract another slightly modified result from \citet{yin2021efficient}.
    
    \begin{proposition}[equation (D.8) in \citet{yin2021efficient}] \label{prop:politex optimality of output policy}
    With all terms as defined earlier. 
    Define $\eta \ge \|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$. 
    Suppose $\eta \ge |\tilde w_{k}^\top \phi(\rho, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(\rho, a^{(1:m)})|_\infty, \ \forall a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    Then, if the VA and MA behave identically in the final loop, with probability at least $1 - 4KC_{\text{max}}^2 \exp(-2 \theta^2(1-\gamma)^2 n)$ we have
    \begin{align}
            V^*(\rho) - V_{\bar \pi_{K-1}}(\rho) \le \frac{b-a}{(1-\gamma)} \sqrt{\frac{2 \log(|\mathcal{A}^{(1:m)}|)}{K}} + \frac{4 \eta}{1 - \gamma}.
    \end{align}
    \end{proposition}
    
    Notice, that we require the same three things as in the \textsc{Confident Multi-Agent MC-LSPI} case (Proposition \ref{prop:optimality of output policy}).
    We need a bound on $\|\tilde Q_{k-1}(s, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(s, a^{(1:m)})\|_\infty$. 
    We need a bound on $|\tilde w_{k}^\top \phi(\rho, a^{(1:m)}) - Q_{\tilde \pi_{k-1}}(\rho, a^{(1:m)})|_\infty, \ \forall a^{(1:m)} \in \mathcal{A}^{(1:m)}$.
    We need to ensure that the VA and MA behave identically in the final loop. 
    Then, we can get a bound on the sub-optimality of the MA's output policy $\bar \pi_{K-1}$.
    Using the same steps as in the previous sections, one can verify that indeed, \textsc{Confident Multi-Agent MC-Politex} with \textsc{Uncertainty Check with DAV} or \textsc{Uncertainty Check with EGSS} does satisfy the above three conditions, with $\eta = \eta_1$ ($\eta_1$ as defined in \ref{prop: approx value function bound for DAV}) and $\eta = \eta_2$ ($\eta_2$ as defined in \ref{prop: approx value function bound for EGSS}) respectively.
    
    We bound $|\mathcal{A}^{(1:m)}| \le A^m$. 
    We can replace $b-a$ with $1/(1-\gamma) + 2\eta$, since $w^\top \phi(s, a^{(1:m)}) \in [-\eta, (1-\gamma)^{-1} + \eta], \ \forall (s \times a^{(1:m)}) \in (\mathcal{S} \times \mathcal{A}^{(1:m)})$ in the final loop for the same event which holds with probability at least $1 - 4KC_{\text{max}}^2 \exp(-2 \theta^2(1-\gamma)^2 n)$ in \cref{prop:politex optimality of output policy}. 
    We get with probability at least $1 - 4KC_{\text{max}}^2 \exp(-2 \theta^2(1-\gamma)^2 n)$ that
    \begin{align} 
            V^*(\rho) - V_{\bar \pi_{K-1}}(\rho) \le \left(\frac{1}{(1-\gamma)^2} + \frac{2\eta}{(1-\gamma)}\right) \sqrt{\frac{2 m \log(A)}{K}} + \frac{4 \eta}{1 - \gamma}. \label{eq:politex actual optimality of output policy}
    \end{align}

    
    \section{Proofs of Theorems} \label{app: theorem proofs}
    We make a remark on the query complexity of \textsc{Confident Multi-Agent MC-LSPI/Politex}.
    From \cref{lemma:bound on core set size} we know the core set size is bounded by $C_\text{max} = \tilde \mathcal{O}(d)$.
    The total number of times policy iteration is run is thus at most $C_\text{max}$.
    Each run of policy iteration can take as much as $K$ iterations.
    In each iteration \textsc{MA-Confident Rollout} is run at most $C_\text{max}$ times.
    \textsc{MA-Confident Rollout} does $n$ rollouts of length $H$ which queries the simulator once for each step.
    In total the number of queries performed by \textsc{Confident Multi-Agent MC-LSPI/Politex} is bounded by $C_\text{max}^2 K n H$.
    This equation is used to calculate the query cost for the different variants of \textsc{Confident Multi-Agent MC-LSPI/Politex}, once all the parameter values have been calculated.

   
   
   
   
   
   
   
    
    
    \subsection{Proof of \cref{thm:ma-mc-lspi sub-optimality}}
    Plugging in $\eta=\eta_1$ when \textsc{Uncertainty Check with DAV} is used ($\eta_1$ as defined in \ref{prop: approx value function bound for DAV}) and $\eta=\eta_2$ when \textsc{Uncertainty Check with EGSS} is used ($\eta_2$ as defined in \ref{prop: approx value function bound for EGSS}) into Proposition \ref{prop:optimality of output policy}.
    Setting $z=2m-1$ when \textsc{Uncertainty Check with DAV} is used, and $z = \sqrt{d}$ when \textsc{Uncertainty Check with EGSS} is used.
    Suppose assumption \ref{ass: feature decomposition} is satisfied with $\epsilon=0$.
    By choosing appropriate parameters according to $\delta$ and $\kappa$, we can ensure that, with probability at least $1 - \delta$, the policy $\pi_{K-1}$ output by \textsc{Confident Multi-Agent MC-LSPI} satisfies:
    \begin{align*}
        V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \kappa,
    \end{align*}
    with the following parameter initialization (see \ref{sec:parameter mclsipi-dav})
    \begin{align*}
        \tau &= 1\\
        \lambda &= \frac{\kappa^2(1 - \gamma)^4}{1024 b^2 z^2}\\
        \theta &= \frac{\kappa(1- \gamma)^2}{32 z \sqrt{C_{\text{max}}}}\\
        H &= \frac{
            \log \left ( 32 \sqrt{C_{\text{max}}} z \right)
            - \log \left( \kappa(1 - \gamma)^3 \right)
        }{
            1-\gamma
        } - 1\\
        K &= \frac{\log\left(\frac{1}{\kappa(1 - \gamma)^2}\right) + \log(8)}{1-\gamma} + 1 \\
        n &= \frac{\log(4KC_{\text{max}}^2) - \log(\delta)}{2 \theta^2(1-\gamma)^2} \\
        C_{\max} &= \frac{e}{e-1} \frac{1 + \tau}{\tau} d \left( 
            \log(1 + \frac{1}{\tau}) +
            \log(1 + \frac{m}{\lambda})
        \right) 
    \end{align*}
    with computational cost of $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}), \log(b), m, |\mathcal{A}|)$
    and query cost $\mathcal{O}\left(\tfrac{z^2 d^3}{\kappa^2 (1-\gamma)^8} \right)$.

    Suppose assumption \ref{ass: feature decomposition} is satisfied with $\epsilon \neq 0$.
    By choosing parameters as above, with $\kappa = \frac{32 \epsilon \sqrt{d} z}{(1-\gamma)^2} (1 + \log(m b^2 \epsilon^{-2} d^{-1}))^{1/2}$, we can ensure that, with probability at least $1 - \delta$, the policy $\pi_{K-1}$ output by \textsc{Confident Multi-Agent MC-LSPI} satisfies:
    
    $$V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \frac{64 \epsilon \sqrt{d} z}{(1-\gamma)^2} (1 +\log(1+m b^2 \epsilon^{-2} d^{-1}))^{1/2}$$
    
    with computational cost of $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}), \log(b), m, |\mathcal{A}|)$
    and query cost $\mathcal{O}\left(\tfrac{d^2}{\epsilon^2 (1-\gamma)^4} \right)$.
    
    Moreover, the above results also holds under \cref{ass: argmax oracle} when \textsc{Uncertainty Check with EGSS} is used.
    

    \subsection{Proof of \cref{thm:ma-mc-politex sub-optimality}}
    Plugging in $\eta=\eta_1$ when \textsc{Uncertainty Check with DAV} is used ($\eta_1$ as defined in \ref{prop: approx value function bound for DAV}) and $\eta=\eta_2$ when \textsc{Uncertainty Check with EGSS} is used ($\eta_2$ as defined in \ref{prop: approx value function bound for EGSS}) into \cref{eq:politex actual optimality of output policy}.
    Setting $z=2m-1$ when \textsc{Uncertainty Check with DAV} is used, and $z = \sqrt{d}$ when \textsc{Uncertainty Check with EGSS} is used.
    Suppose assumption \ref{ass: feature decomposition} is satisfied with $\epsilon=0$.
    By choosing appropriate parameters according to $\delta$ and $\kappa$, we can ensure that, with probability at least $1 - \delta$, the policy $\pi_{K-1}$ output by \textsc{Confident Multi-Agent MC-Politex} satisfies:
    \begin{align*}
        V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \kappa,
    \end{align*}
    with the following parameter initialization (see \ref{sec:parameter mclsipi-dav})
    \begin{align*}
        \tau &= 1\\
        \lambda &= \frac{\kappa^2(1 - \gamma)^2}{576 b^2 z^2}\\
        \theta &= \frac{\kappa(1- \gamma)}{24 z \sqrt{C_{\text{max}}}}\\
        H &= \frac{
            \log \left ( 24 \sqrt{C_{\text{max}}} z \right)
            - \log \left( \kappa(1 - \gamma)^2 \right)
        }{
            1-\gamma
        } - 1\\
        K &= 2m \log(A) \left( \frac{4}{\kappa^2 (1-\gamma)^4} + \frac{3}{\kappa (1-\gamma)^2} + \frac{9}{16} \right)\\
        n &= \frac{\log(4KC_{\text{max}}^2) - \log(\delta)}{2 \theta^2(1-\gamma)^2} \\
        C_{\max} &= \frac{e}{e-1} \frac{1 + \tau}{\tau} d \left( 
            \log(1 + \frac{1}{\tau}) +
            \log(1 + \frac{m}{\lambda})
        \right) 
    \end{align*}
    with computational cost of $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}), \log(b), m, |\mathcal{A}|)$
    and query cost $\mathcal{O}\left(\tfrac{m z^2 d^3}{\kappa^4 (1-\gamma)^9} \right)$.

    Suppose assumption \ref{ass: feature decomposition} is satisfied with $\epsilon \neq 0$.
    By choosing parameters as above, with $\kappa = \frac{16 \epsilon \sqrt{d} z}{(1-\gamma)} (1 + \log(m b^2 \epsilon^{-2} d^{-1}))^{1/2}$, we can ensure that, with probability at least $1 - \delta$, the policy $\pi_{K-1}$ output by \textsc{Confident Multi-Agent MC-Politex} satisfies:
    
    $$V^*(\rho) - V_{\pi_{K-1}}(\rho) \leq \frac{32 \epsilon \sqrt{d} z}{(1-\gamma)^2} (1 +\log(1+m b^2 \epsilon^{-2} d^{-1}))^{1/2}$$

    with computational cost of $\text{poly}(d, \frac{1}{1 - \gamma}, \frac{1}{\kappa}, \log(\frac{1}{\delta}), \log(b), m, |\mathcal{A}|)$
    and query cost $\mathcal{O}\left(\tfrac{m d}{\epsilon^4 (1-\gamma)^5} \right)$.
    
    
    \section{Kernel Setting} \label{app:kernel setting}
    
    The kernelized setting is a standard extension of the finite-dimensional linear setup \citep{srinivas2009gaussian,abbasi2012online}. It lifts the restriction that features and parameter vector are elements of $\mathbb{R}^d$. Instead we require that the $Q_\pi$-function is contained in a reproducing kernel Hilbert space (RKHS). This includes cases where the linear dimension of function class is infinite.
    
    The more general setup requires us to address two main challenges: First, the scaling of the sample complexity with the dimension $d$ needs to be improved to a notion of effective dimension that can be bounded for the RKHS of interest. Second, computationally we cannot directly work with infinite dimensional features $\phi(s,a)$. Instead, we need to rely on the `kernel trick' and compute all quantities of interest in the finite-dimensional data space. 
    
    Formally for each agent $j \in [m]$, the function $k^{(j)} : (\mathcal{S} \times \mathcal{A}^{(1:m)})^2 \rightarrow \mathbb{R}$ is defined as
    
    \begin{align}
        k^{(j)}(s_1, a_1^{(1:m)}, s_2, a_2^{(1:m)}) = k_j(s_1, a_1^{(j)}, s_2, a_2^{(j)}), \label{eq:rkhs_j}
    \end{align}
    where $k_j: (\mathcal{S} \times \mathcal{A})^2 \rightarrow \mathbb{R}$ is the underlying kernel function for agent $j$,
    and $\mathcal{H}_j$ is the RKHS associated with it.
   
   
    
    Based on definition \eqref{eq:rkhs_j}, it's easy to see that $\{k^{(j)}\}_{j \in [m]}$ is a set of kernel functions too, and they share the same vector space which is $V := \mathbb{R}^{\mathcal{S} \times \mathcal{A}^{(1:m)}}$.
    However, they have different inner products on this space which produce a different RKHS for every $j \in [m]$.
    We denote RKHS of $k^{(j)}$ as $\mathcal{H}^{(j)}$, and its inner product follows from equation \eqref{eq:rkhs_j} as
    \begin{align}
        \langle k^{(j)}(s_1, a_1^{(1:m)}, \cdot, \cdot), k^{(j)}(s_2, a_2^{(1:m)}, \cdot, \cdot) \rangle_{\mathcal{H}^{(j)}} =
        \langle k_j(s_1, a_1^{(j)}, \cdot, \cdot), k_j(s_2, a_2^{(j)}, \cdot, \cdot) \rangle_{\mathcal{H}_j}. \label{eq:rkhs-inner-1}
    \end{align}
    By defining $\phi_j(s, a) := k_j(s,a, \cdot, \cdot) \in \mathcal{H}_j$ and $\phi^{(j)}(s, a^{(1:m)}) := k^{(j)}(s, a^{(1:m)}, \cdot, \cdot) \in \mathcal{H}^{(j)}$, we can rewrite \eqref{eq:rkhs-inner-1} for fixed $s_1, s_2, a_1^{(1:m)}, a_2^{(1:m)}$ as
    \begin{align}
        \langle \phi^{(j)}(s_1, a_1^{(1:m)}), \phi^{(j)}(s_2, a_2^{(1:m)}) \rangle_{\mathcal{H}^{(j)}} &=
        \langle \phi_j(s_1, a_1^{(j)}), \phi_j(s_2, a_2^{(j)})\rangle_{\mathcal{H}_j}.\label{eq:rkhs-inner-1-rev}
    \end{align}
    Intuitively, equation \eqref{eq:rkhs-inner-1-rev} suggests that the inner product $\langle \cdot, \cdot \rangle_{\mathcal{H}^{(j)}}$ only depends on the state $s$, and the action taken by agent $j$.
    
    Next, we define the joint additive kernel $k : (\mathcal{S} \times \mathcal{A}^{(1:m)})^2 \rightarrow \mathbb{R}$ as follows
    \begin{align}
    	k(s_1,a_1^{(1:m)}, s_2,a_2^{(1:m)}) &= \sum_{j=1}^m k^{(j)}(s_1,a_1^{(1:m)}, s_2,a_2^{(1:m)}) \label{eq:jak-def}\\
    	&= \sum_{j=1}^m k_j(s_1,a_1^{(j)}, s_2,a_2^{(j)})\\
    	&= \sum_{j=1}^m \langle k_j(s_1,a_1^{(j)}, \cdot, \cdot), k_j(s_2,a_2^{(j)}, \cdot, \cdot) \rangle_{\mathcal{H}_j},
    \end{align}
    and we denote its associated RKHS as $\mathcal{H}$. Again, note that $\mathcal{H}$ uses the same vector space, namely $V$, as all the $\mathcal{H}^{(j)}$s.
    
    Now, we can restate Assumption~\ref{ass: feature decomposition} for the kernel case.
    
    % TODO: I think we still want to solve the problem under assumption one for the kernel case, right?
    
    \begin{assumption}[Assumption 1 for RKHS]
        \label{ass:kernel-2}
    	For each (deterministic) policy $\pi$, there exists
    	$f_\pi \in \mathcal{H}$, such that
    	$Q_\pi(s, a^{(1:m)}) = \langle \phi(s,a^{(1:m)}), f_\pi \rangle_{\mathcal{H}}$.
    \end{assumption}
    Next, we show that there exist functions $f_\pi^\ag{i} \in \mathcal{H}^\ag{i}$ for $i \in [m]$, such that:
    \begin{align*}
        Q_\pi (s, a^{(1:m)}) &= \sum_{i=1}^m Q_\pi^\ag{i}(s, a^{(1:m)})\\
        Q_\pi^\ag{i} (s, a^{(1:m)}) &= \langle \phi^\ag{i}(s,a^{(1:m)}), f_\pi^\ag{i} \rangle_{\mathcal{H}^\ag{i}}
    \end{align*}
    Or, equivalently, there exist $f_{\pi, j} \in \mathcal{H}_j$ for $j \in [m]$, such that:
    \begin{align*}
        Q_\pi (s, a^{(1:m)}) &= \sum_{j=1}^m Q_{\pi,j}(s, a^{(j)})\\
        Q_{\pi,j} (s, a^{(j)}) &= \langle \phi_j(s,a^{(j)}), f_{\pi,j} \rangle_{\mathcal{H}_j}
    \end{align*}
    \end{align*}
    
    \begin{proof}
        As $f_\pi$ is an element of $\mathcal{H}$, we know that it can be expressed in terms of the basis vectors (kernel sections) of $\mathcal{H}$:
        \begin{align*}
            f_\pi = \sum_{i=1}^{\infty} \alpha_i k(s_i, a_i, \cdot, \cdot).
        \end{align*}
        From the definition of the joint additive kernel and the assumption \ref{ass:kernel-2} we have:
        \begin{align}
            Q_\pi (s, a^{(1:m)}) &= \langle \phi(s,a^{(1:m)}), f_\pi \rangle_{\mathcal{H}} \nonumber \\
            &= \langle \phi(s,a^{(1:m)}), \sum_{i=1}^{\infty} \alpha_i k(s_i, a_i, \cdot, \cdot) \rangle_{\mathcal{H}} \nonumber \\
            &= \sum_{i=1}^{\infty} \alpha_i \langle k(s,a^{(1:m)}, \cdot, \cdot), k(s_i, a_i, \cdot, \cdot) \rangle_{\mathcal{H}} \nonumber \\
            &= \sum_{i=1}^{\infty} \alpha_i k(s,a^{(1:m)}, s_i, a_i) \nonumber \\
            &= \sum_{i=1}^{\infty} \alpha_i \sum_{j=1}^m k^{(j)}(s,a^{(1:m)}, s_i, a_i) & \text{Based on }\ref{eq:jak-def}\label{eq:checkpoint}\\
            &= \sum_{j=1}^m \sum_{i=1}^{\infty} \alpha_i k^{(j)}(s,a^{(1:m)}, s_i, a_i) \nonumber \\
            &= \sum_{j=1}^m \sum_{i=1}^{\infty} \alpha_i \langle \phi^{(j)}(s,a^{(1:m)}) , \phi^{(j)}(s_i, a_i) \rangle_{\mathcal{H}^{(j)}} \nonumber\\
            &= \sum_{j=1}^m \langle \phi^{(j)}(s,a^{(1:m)}) , \underbrace{\sum_{i=1}^{\infty} \alpha_i \phi^{(j)}(s_i, a_i)}_{:=E^{(j)}(f_\pi) := f_\pi^{(j)}} \rangle_{\mathcal{H}^{(j)}} \nonumber \\
            &= \sum_{j=1}^{m} \langle \phi^{(j)}(s,a^{(1:m)}), f_\pi^{(j)} \rangle_{\mathcal{H}^{(j)}}. \nonumber
        \end{align}
        or from \eqref{eq:checkpoint} we have:
        \begin{align*}
            Q_\pi (s, a^{(1:m)})
            &= \sum_{i=1}^{\infty} \alpha_i \sum_{j=1}^m k^{(j)}(s,a^{(1:m)}, s_i, a_i) \\
            &= \sum_{i=1}^{\infty} \alpha_i \sum_{j=1}^m k_j(s,a^{(j)}, s_i, a_i^{(j)}) & \text{Based on }\ref{eq:rkhs_j}\\
            &= \sum_{i=1}^{\infty} \alpha_i \sum_{j=1}^m \langle \phi_j(s,a^{(j)}), \phi_j(s_i, a_i^{(j)}) \rangle_{\mathcal{H}_j}\\
            &= \sum_{j=1}^m \langle
            \phi_j(s,a^{(j)}),
            \underbrace{
            \sum_{i=1}^{\infty} \alpha_i \phi_j(s_i, a_i^{(j)})
            }_{:=E_j(f_\pi) :=f_{\pi, j}}
            \rangle_{\mathcal{H}_j}\\
            &= \sum_{j=1}^m \langle
            \phi_j(s,a^{(j)}),
            f_{\pi, j}
            \rangle_{\mathcal{H}_j}.
        \end{align*}
        We may need to show that $f_\pi^{(j)}$ and $f_{\pi, j}$ have finite norms in their corresponding Hilbert spaces.
    \end{proof}
    
    \paragraph{Kernelized Algorithm}
    
    As before we can compute the ridge estimate
    \begin{align}
    	\hat Q_t = \argmin_{Q \in \mathcal{H}} \sum_{(s,a^{(1:m)})\in\mathcal{C}_t} (Q(s,a^{(1:m)}) - q_{(s,a^{(1:m)})})^2 + \lambda \|Q\|_{\mathcal{H}}^2 = (\Phi_{\mathcal{C}_t}\Phi_{\mathcal{C}_t}^\top + \lambda \mathbf{I}_\mathcal{H})^{-1}\Phi_{\mathcal{C}_t}q_{\mathcal{C}_t}
    \end{align}
    Here, $\mathbf{I}_{\mathcal{H}} : \mathcal{H} \rightarrow \mathcal{H}$ is the identity mapping, and $\Phi_{\mathcal{C}}^\top$ can be formally defined as the map $\Phi_\mathcal{C}^\top : \mathcal{H} \rightarrow \mathbb{R}^{|\mathcal{C}|}, f \mapsto [f(s,a^{(1:m)})]_{(s,a^{(1:m)}) \in \mathcal{C}}, \, f \in \mathcal{H}$; and $\Phi_{\mathcal{C}} : \mathbb{R}^{|\mathcal{C}|} \rightarrow \mathcal{H}$ is the adjoint of $\Phi_{\mathcal{C}}^\top$.
    
    Using the `kernel trick' we express the estimator as follows
    \begin{align}
    	\hat Q_t = \Phi_{\mathcal{C}_t}(K_{\mathcal{C}_t} + \lambda \mathbf{I}_{t})^{-1}q_{\mathcal{C}_t}
    \end{align}
    where $K_{\mathcal{C}_t} = \Phi_{\mathcal{C}_t}^\top \Phi_{\mathcal{C}_t} \in \mathbb{R}^{t \times t}$ is the kernel matrix. Lastly, we can evaluate for any $s,a^{(1:m)}$:
    \begin{align}
    	\hat Q_t(s,a^{(1:m)}) = k_{\mathcal{C}_t}(s,a^{(1:m)})^\top(K_{\mathcal{C}_t} + \lambda \mathbf{I}_{t})^{-1}q_{\mathcal{C}_t}
    \end{align}
    where we defined $k_\mathcal{C}(s,a^{(1:m)}) = [k(s,a^{(1:m)}, s',a'^{(1:m)})]_{(s',a'^{(1:m)}) \in \mathcal{C}} \in \mathbb{R}^{|\mathcal{C}|}$ (for some fixed ordering of $\mathcal{C}$). Importantly, the last display only involves finite-dimensional quantities that can be computed from kernel evaluations. Moreover, since $k(s,a^{(1:m)},s',a'^{(1:m)}) = \sum_{j=1}^m k_j(s,a^{(j)}, s', a'^{(j)})$ we can write
    \begin{align}
    	\hat Q_t(s,a^{(1:m)}) = \sum_{j=1}^m k_{j, \mathcal{C}_t}(s,a^{(j)})^\top(K_{\mathcal{C}_t} + \lambda \mathbf{I}_{t})^{-1}q_{\mathcal{C}_t}
    \end{align}
    where $k_{j, \mathcal{C}}(s,a^{(j)}) = [k_j(s,a^{(j)}, s^\prime , a^{\prime(j)})]_{(s^\prime, a^{\prime(1:m)}) \in \mathcal{C}} \in \mathbb{R}^{|\mathcal{C}|}$. Hence we can still compute the maximizer independently for each agent.
    
    The second quantity required by the algorithm is the squared norm $\|\phi(s,a^{(1:m)})\|_{(\Phi_\mathcal{C} \Phi_{\mathcal{C}}^\top + \lambda \mathbf{I}_\mathcal{H})^{-1}}^2$, where now $\phi(s,a^{(1:m)}) = k(s,a^{(1:m)}, \cdot, \cdot) \in \mathcal{H}$. Using the Woodbury identity, we can write 
    \begin{align}
    	\lambda (\Phi_\mathcal{C} \Phi_{\mathcal{C}}^\top + \lambda \mathbf{I}_\mathcal{H})^{-1} = \mathbf{I}_{\mathcal{H}} - \Phi_{\mathcal{C}} (K_\mathcal{C} + \lambda \mathbf{I}_{|\mathcal{C}|})^{-1}\Phi_{\mathcal{C}}^\top
    \end{align}
    Therefore the feature norm can be written using finite-dimensional quantities: 
    \begin{align}
    	\|\phi(s,a^{(1:m)})\|_{(\Phi_\mathcal{C} \Phi_{\mathcal{C}}^\top + \lambda \mathbf{I}_\mathcal{H})^{-1}}^2 = \frac{1}{\lambda} \left( k(s,a^{(1:m)},s,a^{(1:m)}) - k_\mathcal{C}(s,a^{(1:m)})^\top(K_\mathcal{C}+ \lambda \mathbf{I}_{|\mathcal{C}|})^{-1}k_{\mathcal{C}}(s,a^{(1:m)})\right)
    \end{align}
    With this, we can implement the DAV version of the algorithm directly. The EGSS is more tricky to implement, but this is potentially possible using eigenfunctions from Mercer's theorem.
    
    
    \paragraph{Analysis}
    Our goal next is to extend the analysis to the kernel case, carefully arguing that the linear dimension $d$ can be replaced by a more benign quantity. A common complexity measure is the total information gain, which we define as follows:
    \begin{align}
    	\Gamma_{\mathcal{C}} = \log \det (\Phi_{\mathcal{C}}\Phi_{\mathcal{C}}^\top + \lambda \mathbf{I}_d) - \log \det (\lambda \mathbf{I}_d)
    \end{align}
    Note that we can compute $\Gamma_{\mathcal{C}}$ for any given core set $\mathcal{C}$. In the kernel case, we can compute $\Gamma_{\mathcal{C}} = \log \det (\mathbf{I}_{|\mathcal{C}|} + \lambda^{-1} K_{\mathcal{C}})$ using similar arguments as before.
    
    The maximum information gain is $\Gamma_t = \max_{\mathcal{C} : |\mathcal{C}|=t} \Gamma_{\mathcal{C}}$. It serves as a complexity measure in the bandit literature and can be bounded for many kernels of interest \citep{srinivas2009gaussian,vakili2021information}. Following \citet{du2021bilinear}, we further define the \emph{critical information gain},
    \begin{align}
    	\tilde \Gamma = \max \{t \geq 1 : t \leq \Gamma_t \}
    \end{align}
    Note that the proof of \cite[Lemma 5.1]{yin2021efficient} implies that $|\mathcal{C}| \leq  \log(1+\tau)^{-1}\Gamma_{|\mathcal{C}|}$.
   
    
    Since the dimension $d$ enters our bounds only through $C_{\max}$ we can immediately get a sample complexity bound for the kernelized algorithm in terms of $\tilde\Gamma$. For the finite-dimensional case, \cite[Lemma 5.1]{yin2021efficient} shows that $\tilde \Gamma \leq \mathcal{O}(d)$, recovering the previous bound.
    
    
    \paragraph{Unknown Critical Information Gain} Somewhat impractical for the algorithm is that we need to know a bound on $C_{\max}$ or $\tilde \Gamma$ respectively to set the number of episodes required for some target level of accuracy $\kappa > 0$ (roughly, $m = C_{\max}/\kappa^2$).
    
    As a remedy, we can replace the check $\|\phi(s,a)\|_{(\Phi_\mathcal{C} \Phi_{\mathcal{C}}^\top + \lambda \mathbf{I}_d)^{-1}}^2 > \tau$ by
    
    TODO: this needs some more thinking, as we don't want to set $\tau$ to be too small - maybe? An easier approach could be to set $m = |\mathcal{C}|/\kappa^2$?
    
    \begin{align*}
    	\|\phi(s,a)\|_{(\Phi_\mathcal{C} \Phi_{\mathcal{C}}^\top + \lambda \mathbf{I}_d)^{-1}}^2 > \frac{\tilde \tau}{\max(\Gamma_{\mathcal{C}},1)}
    \end{align*}
    
    Let $\mathcal{C}_1, \dots, \mathcal{C}_t$ be the sequence of core sets obtained by adding elements that satisfy the above condition. Note that $\Gamma_{\mathcal{C}_t}$ is non-decreasing in $t$. Combined with \citep[Lemma 5.1]{yin2021efficient}, this implies that
    \begin{align}
    	t \log\left(1 + \tfrac{\tilde \tau}{\max(\Gamma_t,1)}\right) \leq t \log\left(1 + \tfrac{\tilde \tau}{\max( \Gamma_{\mathcal{C}_{t}}, 1)}\right) \leq \sum_{s=1}^t \log\left(1 + \tfrac{\tilde \tau}{\max(\Gamma_{\mathcal{C}_{s}},1)}\right) \leq \Gamma_{\mathcal{C}_{t}} \leq \Gamma_t\label{eq:cmax adaptiv}
    \end{align}
    Hence the condition is triggered at most
    \begin{align}
    	\tilde C_{\max}(\tilde \tau) = \max \left\{t \geq 1 : t \leq \Gamma_t \log\left(1 + \tfrac{\tilde \tau}{\max(\Gamma_t, 1)} \right)^{-1}\right\}
    \end{align}
    times. 
    
    TODO: Would be great to show some bounds for $\tilde C_{\max}$, e.g. in the finite-dimensional case
    
    Moreover, we can set $\tilde \tau=1$ and $m = \frac{1}{\kappa^2}$ (i.e.~without knowing a bound on $C_{\max}$) to obtain the required target accuracy $\tilde{\mathcal{O}}(\kappa)$.
    
    TODO: this requires some introspection of \citep[Lemma B.2]{yin2021efficient} and \eqref{eq:cmax adaptiv}
    
    
    
    \subsection{We need to prove that if we add a state-action pair to the core set, it remains in the good set in future.}
    
    
    \begin{theorem}
    Assume that $\Phi_\mathcal{C} \in \mathbb{R}^{t \times d}$,  $V_\mathcal{C} = \Phi_\mathcal{C}^\top \Phi_\mathcal{C} + \lambda I$, and $\phi_{t+1} \in \mathbb{R}^d$. Define $\hat \Phi_\mathcal{C} = [\Phi_\mathcal{C}^\top \, \phi_{t+1}]^\top$,
    and $\hat V_\mathcal{C} = \hat{\Phi}_\mathcal{C}^\top \hat{\Phi}_\mathcal{C} + \lambda I$.
    Then we have:
    \begin{align*}
        \| \phi_{t+1} \|_{\hat V_\mathcal{C}^{-1}}^2 < 1
    \end{align*}
    \end{theorem}
    \begin{proof}
    By the definition of the norm we have:
    \[
    \begin{aligned}
        \| \phi_{t+1} \|_{\hat{V}_\mathcal{C}^{-1}}^2 &= \phi_{t+1}^\top \hat{V}_\mathcal{C}^{-1} \phi_{t+1}\\ 
        &= \phi_{t+1}^\top \left( 
            \hat{\Phi}_\mathcal{C}^\top \hat{\Phi}_\mathcal{C} + \lambda I
        \right)^{-1} \phi_{t+1}\\
        &= \phi_{t+1}^\top \left ( 
            \sum_{i = 1}^{t+1} \phi_i \phi_i^\top
            + \lambda I
        \right)^{-1} \phi_{t+1}\\
        &= \phi_{t+1}^\top \left ( 
            \sum_{i = 1}^{t} \phi_i \phi_i^\top
            + \lambda I
            + \phi_{t+1} \phi_{t+1}^\top
        \right)^{-1} \phi_{t+1}\\
        &= \phi_{t+1}^\top \left ( 
            V_\mathcal{C}
            + \phi_{t+1} \phi_{t+1}^\top
        \right)^{-1} \phi_{t+1}\\
        &= \phi_{t+1}^\top \left ( 
            V_\mathcal{C}^{-1}
            - \frac{V_\mathcal{C}^{-1} \phi_{t+1} \phi_{t+1}^\top V_\mathcal{C}^{-1}}{1 + \phi_{t+1}^\top V_\mathcal{C}^{-1} \phi_{t+1}}
        \right) \phi_{t+1} & \text{Sherman-Morrison}\\
        &= \phi_{t+1}^\top V_\mathcal{C}^{-1} \phi_{t+1}
            - \frac{
            \phi_{t+1}^\top V_\mathcal{C}^{-1} \phi_{t+1} \phi_{t+1}^\top V_\mathcal{C}^{-1}\phi_{t+1}
            }{
            1 + \phi_{t+1}^\top V_\mathcal{C}^{-1} \phi_{t+1}
            }
        \\
        &= \frac{
            \phi_{t+1}^\top V_\mathcal{C}^{-1} \phi_{t+1} 
            }{
            1 + \phi_{t+1}^\top V_\mathcal{C}^{-1} \phi_{t+1}
            }\\
        &< 1
        \\
    \end{aligned}
    \]
    \end{proof}
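    Therefore, if we set $\tau \geq 1$, then no feature that has been added to the core set can have a squared $\hat V_\mathcal{C}^{-1}$-norm greater than $\tau$ with respect to the updated core set. Moreover, since adding further elements only increases $\hat V_\mathcal{C}$ in the positive semi-definite order, this squared norm can only decrease for later core sets, so these features remain in the good set.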



\section*{Parameter Assignments} \label{app:parameter assignments}

\subsection{MCLSPI-DAV}
\label{sec:parameter mclsipi-dav}
The total error is the following:
\begin{align}
    \frac{8 \eta_1}{(1 - \gamma)^2} + \frac{2 \gamma^{K - 1}}{(1 - \gamma)^2} &\leq \kappa \label{eq:egss-main} \\
    \frac{8}{(1 - \gamma)^2}
    \left(
        b\sqrt{\lambda  \tau} + \left( \frac{\gamma^{H+1}}{1 - \gamma} + \theta \right) \sqrt{ \tau C_{\text{max}}}
        \right)(2m-1) +
        \frac{2\gamma^{K - 1}}{(1 - \gamma)^2}
    &\leq \kappa \nonumber \\
    &\Rightarrow \nonumber \\
    \frac{8(2m-1)}{(1 - \gamma)^2} b\sqrt{\lambda  \tau} & \leq \frac{\kappa}{4} \label{eq:egss-lambda}\\
    \frac{8\sqrt{ \tau C_{\text{max}}}(2m-1)}{(1 - \gamma)^2} \frac{\gamma^{H+1}}{1 - \gamma} & \leq \frac{\kappa}{4} \label{eq:egss-H}\\
    \frac{8\sqrt{ \tau C_{\text{max}}}(2m-1)}{(1 - \gamma)^2} \theta & \leq \frac{\kappa}{4} \label{eq:egss-theta}\\
    \frac{2\gamma^{K - 1}}{(1 - \gamma)^2} & \leq \frac{\kappa}{4} \label{eq:egss-K}
\end{align}
First we assume that $\tau = 1$.

From \eqref{eq:egss-lambda} we get:
\begin{align*}
    \frac{8(2m - 1)}{(1 - \gamma)^2} b\sqrt{\lambda } & \leq \frac{\kappa}{4} \\
    \sqrt{\lambda } &\leq \frac{(1 - \gamma)^2\kappa}{32 b (2m -1)}\\
    \lambda &\leq \frac{(1 - \gamma)^4\kappa^2}{1024 b^2 (2m -1)^2}\\
\end{align*}
From \eqref{eq:egss-H} we get:
\begin{align*}
    \frac{8\sqrt{ C_{\text{max}}}(2m -1)}{(1 - \gamma)^3} \gamma^{H+1} & \leq \frac{\kappa}{4} \\
    (2m -1) \sqrt{C_{\text{max}}} \gamma^{H+1} & \leq \frac{\kappa(1 - \gamma)^3}{32} \\
    \gamma^{H+1} & \leq \frac{\kappa(1 - \gamma)^3}{32 \sqrt{C_{\text{max}}} (2m - 1)} \\
    H & \geq \frac{\log\left (\frac{\kappa(1 - \gamma)^3}{32 \sqrt{C_{\text{max}}} (2m - 1)} \right)}{
        \log(\gamma)
    } - 1\\
\end{align*}

From \eqref{eq:egss-theta} we get:
\begin{align*}
    \frac{8\sqrt{ C_{\text{max}}}(2m-1)}{(1 - \gamma)^2} \theta & \leq \frac{\kappa}{4}\\
    (2m-1) \sqrt{C_{\text{max}}} \theta & \leq \frac{\kappa(1- \gamma)^2}{32}\\
    \theta & \leq \frac{\kappa(1- \gamma)^2}{32 (2m-1) \sqrt{C_{\text{max}}}}\\
\end{align*}

From \eqref{eq:egss-K} we get:
\begin{align*}
    \frac{2\gamma^{K - 1}}{(1 - \gamma)^2} & \leq \frac{\kappa}{4}\\
    \gamma^{K - 1} & \leq \frac{\kappa(1 - \gamma)^2}{8}\\
    K & \geq \frac{\log \left( \frac{\kappa(1 - \gamma)^2}{8} \right)}{\log(\gamma)} + 1\\
    K & \geq \frac{\log\left(\kappa(1 - \gamma)^2\right) - \log(8)}{\log(\gamma)} + 1\\
\end{align*}

We know that \eqref{eq:egss-main} holds with probability at least $1 - 2KC_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$.
Therefore, requiring this failure probability to be at most $\delta$, we get:
\begin{align*}
        2KC_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n) &\leq \delta\\
        \exp(-2 \theta^2(1-\gamma)^2 n) &\leq \frac{\delta}{2KC_{\text{max}}}\\
        -2 \theta^2(1-\gamma)^2 n &\leq \log\left( \frac{\delta}{2KC_{\text{max}}}\right)\\
        n &\geq \frac{\log(2KC_{\text{max}}) - \log(\delta)}{2 \theta^2(1-\gamma)^2}\\
\end{align*}

\subsection{MCLSPI-EGSS}
\label{sec:parameter mclspi-egss}
The total error is the following:
\begin{align}
    \frac{8 \eta_2}{(1 - \gamma)^2} + \frac{2 \gamma^{K - 1}}{(1 - \gamma)^2} &\leq \kappa \label{eq:dav-main} \\
    \frac{8}{(1 - \gamma)^2}
    \left(
        b\sqrt{\lambda  \tau} + \left( \frac{\gamma^{H+1}}{1 - \gamma} + \theta \right) \sqrt{ \tau C_{\text{max}}}
        \right) +
        \frac{2\gamma^{K - 1}}{(1 - \gamma)^2}
    &\leq \kappa \nonumber \\
    &\Rightarrow \nonumber \\
    \frac{8}{(1 - \gamma)^2} b\sqrt{\lambda  \tau} & \leq \frac{\kappa}{4} \label{eq:dav-lambda}\\
    \frac{8\sqrt{ \tau C_{\text{max}}}}{(1 - \gamma)^2} \frac{\gamma^{H+1}}{1 - \gamma} & \leq \frac{\kappa}{4} \label{eq:dav-H}\\
    \frac{8\sqrt{ \tau C_{\text{max}}}}{(1 - \gamma)^2} \theta & \leq \frac{\kappa}{4} \label{eq:dav-theta}\\
    \frac{2\gamma^{K - 1}}{(1 - \gamma)^2} & \leq \frac{\kappa}{4} \label{eq:dav-K}
\end{align}
First we assume that $\tau = 1$.

From \eqref{eq:dav-lambda} we get:
\begin{align*}
    \frac{8}{(1 - \gamma)^2} b\sqrt{\lambda } & \leq \frac{\kappa}{4} \\
    \sqrt{\lambda } &\leq \frac{(1 - \gamma)^2\kappa}{32 b}\\
    \lambda &\leq \frac{(1 - \gamma)^4\kappa^2}{1024 b^2 }\\
\end{align*}
From \eqref{eq:dav-H} we get:
\begin{align*}
    \frac{8\sqrt{ C_{\text{max}}}}{(1 - \gamma)^3} \gamma^{H+1} & \leq \frac{\kappa}{4} \\
    d^{\frac{1}{4}} C_{\text{max}}^{\frac{1}{2}} \gamma^{H+1} & \leq \frac{\kappa(1 - \gamma)^3}{32} \\
    \gamma^{H+1} & \leq \frac{\kappa(1 - \gamma)^3}{32 d^{\frac{1}{4}} C_{\text{max}}^{\frac{1}{2}}} \\
    H & \geq \frac{\log\left (\frac{\kappa(1 - \gamma)^3}{32 d^{\frac{1}{4}} C_{\text{max}}^{\frac{1}{2}}} \right)}{
        \log(\gamma)
    } - 1\\
\end{align*}

From \eqref{eq:dav-theta} we get:
\begin{align*}
    \frac{8\sqrt{C_{\text{max}}}}{(1 - \gamma)^2} \theta & \leq \frac{\kappa}{4}\\
    d^{\frac{1}{4}} C_{\text{max}}^{\frac{1}{2}} \theta & \leq \frac{\kappa(1- \gamma)^2}{32}\\
    \theta & \leq \frac{\kappa(1- \gamma)^2}{32 d^{\frac{1}{4}} C_{\text{max}}^{\frac{1}{2}}}\\
\end{align*}

From \eqref{eq:dav-K} we get:
\begin{align*}
    \frac{2\gamma^{K - 1}}{(1 - \gamma)^2} & \leq \frac{\kappa}{4}\\
    \gamma^{K - 1} & \leq \frac{\kappa(1 - \gamma)^2}{8}\\
    K & \geq \frac{\log \left( \frac{\kappa(1 - \gamma)^2}{8} \right)}{\log(\gamma)} + 1\\
    K & \geq \frac{\log\left(\kappa(1 - \gamma)^2\right) - \log(8)}{\log(\gamma)} + 1\\
\end{align*}

We know that \eqref{eq:dav-main} holds with probability at least $1 - 2KC_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n)$.
Therefore, requiring this failure probability to be at most $\delta$, we get:
\begin{align*}
        2KC_{\text{max}} \exp(-2 \theta^2(1-\gamma)^2 n) &\leq \delta\\
        \exp(-2 \theta^2(1-\gamma)^2 n) &\leq \frac{\delta}{2KC_{\text{max}}}\\
        -2 \theta^2(1-\gamma)^2 n &\leq \log( \frac{\delta}{2KC_{\text{max}}})\\
        n &\geq \frac{\log(2KC_{\text{max}}) - \log(\delta)}{2 \theta^2(1-\gamma)^2}\\
\end{align*}


\end{document}
