\section{Proof of Theorem 1}

\begin{proof}
    Consider an optimal (stochastic) policy $\pi^*$. Consider a history $h_t$ reachable from $b_0$ by $\pi^*$, where $\pi^*$ has randomization (at least two actions have probabilities in $(0,1)$). Then, $\pi^*$ can be equivalently represented as a mixture over $n \leq |A|$ policies, each of which deterministically selects a unique action at $h_t$ but is the same (stochastic) policy as $\pi^*$ everywhere else. That is, $\pi^*$ is a mixed policy over the set $\vec{\pi}^* = \{\pi_1, \cdots, \pi_n\}$ of $n$ policies that have a deterministic action at $h_t$ and $\pi_i = \pi_j = \pi^*$ at every other history. Let $w_i$ represent the non-zero probability of choosing $\pi_i$ at $h_t$. Then, $V^{\vec{\pi}^*}_R(h_t) = \sum_{i=1}^n w_i V^{\pi_i}_R(b_t)$ and 
    $V^{\vec{\pi}^*}_C(h_t) = \sum_{i=1}^n w_i V^{\pi_i}_C(b_t)$.
    
    We show that all the policies $\pi_i \in \vec{\pi}^*$ must be admissible in order for $\vec{\pi}^*$ to be admissible. Suppose there exists an inadmissible policy $\pi_j \in \vec{\pi}^*$. That is, there exists a history $h_f, f \geq t, $ s.t. Eq. \eqref{eq:pre-recursive constraints} or \eqref{eq:recursive constraints} is violated by $\pi_j$. 
    
    For $f > t$, $h_f$ is only reachable by taking action $\pi_j(h_t)$ at $h_t$, since each policy in $\vec{\pi}^*$ takes a different action at $h_t$, and so their reachable history spaces are different. Only the inadmissible $\pi_j$ is executed from $h_f$ when it is reached probabilistically, and $W(h_f) + \gamma^f Q^{\pi_j}_C(b_f, \pi_j(h_f)) \not\leq \hat{c}$. This means that Eq.~\eqref{eq:pre-recursive constraints} is violated at depth $f$, so $\vec{\pi}^*$ is inadmissible, which is a contradiction. 
    
    Similarly, for $f = t$, if $V_{C}^{\pi_j}(b_t) \not\leq d(h_{t})$ (Eq.~\eqref{eq:recursive constraints} is violated), then
    $$\exists h_{t+1} \text{ s.t. } V_{C}^{\pi_j}(b_{t+1}) \not\leq d(h_{t+1}),$$
    since
    $$[\forall h_{t+1} , V_C^{\pi_j}(b_{t+1}) \leq d(h_{t+1})] \implies V_C^{\pi_j}(b_t) \leq d(h_t).$$
    This can be seen by rearranging Eq.~\eqref{eq:history dependent cost recursive} and
    $$V_C^{\pi_j}(b_t) = C(b_t,\pi_j(h_t)) + \gamma \mathbb{E}[V_C^{\pi_j}(b_{t+1})].$$
    That is, Eq.~\eqref{eq:recursive constraints} is violated at time step $t + 1$, so $\vec{\pi}^*$ is inadmissible, which is again a contradiction. Hence, each $\pi_i \in \vec{\pi}^*$ is admissible. 
    
    Since $\max_i V^{\pi_i}_R(b_t) \geq \sum_{i=1}^n w_i V^{\pi_i}_R(b_t)$, determinism at $h_t$ is sufficient, i.e., obtain a new $\pi^* = \arg\max_{\pi_i}(V^{\pi_i}_R(b_t))$ with one less randomization. Repeating the same process for all histories reachable from $b_0$ by $\pi^*$ with randomization obtains a deterministic optimal policy.
\end{proof}

\section{Proof of Theorem 2}

\begin{proof}
    
    Let $\mathbf{a}'$ denote the action set computed during the Bellman backup operation on $V^{\pi^*}(\bar{b}_t)$ to obtain $V^{\pi'}(\bar{b}_t)$.
    
    \begin{align*}
        \mathbf{a}' = \argmax_{a \in A}\Big[Q_R^{\pi^*}(\bar{b}_t, a) \mid
        Q^{\pi^*}_C(\bar{b}_t, a) \leq d(h_t)\Big]
    \end{align*}

    We first show that $V_R^{\pi^*}(\bar{b}_t) = V_R^{\pi'}(\bar{b}_t) \;\;\forall \bar{b}_t$, i.e., the optimality after a Bellman backup operation is preserved.

    
    
    Consider any $\bar{b}_t$, and suppose $\mathbf{a}' \neq \emptyset$. Then
    \begin{align*}
        V_R^{\pi'}(\bar{b}_t) = \max_{a \in A}\Big[Q_R^{\pi^*}(\bar{b}_t, a) \mid
        Q^{\pi^*}_C(\bar{b}_t, a) \leq d(h_t)\Big],\\
        V_R^{\pi'}(\bar{b}_t) > V_R^{\pi^*}(\bar{b}_t) \implies \pi^* \text{ is not optimal},
    \end{align*}
    which contradicts the optimality of $\pi^*$, so $V_R^{\pi'}(\bar{b}_t) \leq V_R^{\pi^*}(\bar{b}_t)$. However, we have that
    \begin{align*}
        V^{\pi'}_R(\bar{b}_t) = \max_{a}[Q_R^{\pi^*}(\bar{b}_t, a)]\\
        \implies V_R^{\pi'}(\bar{b}_t) \geq V_R^{\pi^*}(\bar{b}_t)\\
        \implies V_R^{\pi'}(\bar{b}_t) = V_R^{\pi^*}(\bar{b}_t).
    \end{align*}
    
    Next, if $\mathbf{a}' = \emptyset$, then from Eq.~\eqref{eq:rcbackup},
    \begin{align*}
        V_R^{\pi'}(\bar{b}_t) = V_R^{\pi^*}(\bar{b}_t).
    \end{align*}
    Therefore, $V_R^{\pi^*}(\bar{b}_t) = V_R^{\pi'}(\bar{b}_t) \;\;\forall \bar{b}_t$.

    Now, we show that $V_C^{\pi'}(b_t) \leq d(h_t), \;\forall \bar{b}_t \in \textsc{Reach}^{\pi'}\!(\bar{b}_0)$, i.e., the admissibility of the optimal policy after a Bellman backup operation is preserved.

    Let $\bar{b}_t \in \textsc{Reach}^{\pi'}(\bar{b}_0), t \in \mathbb{N}$ be the first augmented belief state from $b_0$ in a belief trajectory such that 
    $V_C^{\pi'}(\bar{b}_t)$ does not satisfy Eq.~\eqref{eq:recursive constraints}, i.e., $\pi'$ satisfies Eq.~\eqref{eq:recursive constraints} $\forall \tau < t$, and $V_C^{\pi'}(\bar{b}_t) \not\leq d(h_t)$.
    \begin{align*}
        V_C^{\pi'}(\bar{b}_t) \not\leq d(h_t) \implies \mathbf{a}' = \emptyset
    \end{align*}
    Therefore,
    \begin{align*}
        V_R^{\pi^*}(\bar{b}_t) = V_R^{\pi'}(\bar{b}_t),\\
        V_C^{\pi'}(\bar{b}_t) = V_C^{\pi^*}(\bar{b}_t) = (\infty, \ldots, \infty).
    \end{align*}

    Consider the augmented belief state $\bar{b}_{t-1}$ that transitions to $\bar{b}_t$ under some action $a'_{t-1} = \pi'(\bar{b}_{t-1})$ and observation $o_{t-1}$.
    \begin{align*}
        Q_C^{\pi^*}(\bar{b}_{t-1}, a'_{t-1}) &= C(b_{t-1},a'_{t-1}) + \gamma\,\mathbb{E}[V_C^{\pi^*}(\bar{b}')]\\
        &= C(b_{t-1},a'_{t-1}) + (\infty, \ldots, \infty)\\
        &= (\infty, \ldots, \infty) > d(h_{t-1}).
    \end{align*}
    
    Note that $d(h_{t-1}) \leq \frac{\hat{c}}{\gamma^{t-1}}$ is finite for finite $t-1$. So $a'_{t-1} \notin \argmax_{a \in A}\Big[Q_R^{\pi^*}(\bar{b}_{t-1}, a) \mid
        Q^{\pi^*}_C(\bar{b}_{t-1}, a) \leq d(h_{t-1})\Big]$ and therefore $\bar{b}_t$ cannot be reachable under $\pi'$. By induction, we have that $\forall \bar{b}_t \in \textsc{Reach}^{\pi'}(\bar{b}_0)$,
        \begin{align*}
            V_C^{\pi'}(\bar{b}_t) \leq d(h_t).
        \end{align*}
\end{proof}


\section{Proof of Theorem 3}

\begin{proof}

    Suppose that $\pi^0$ is well behaved.

    Denote $B_{inadmiss} = \{\bar{b} \;|\; \Phi(\bar{b}) = \emptyset\}$. Then, for all $\bar{b} = (b, d) \in B_{inadmiss}$, $V_C^{\pi^0}(\bar{b}) \not\leq d(h_t)$. A lack of admissible policy implies that there are no actions from $\bar{b}$ that lead to admissibility. This implies that $\forall a \in A$, there exists at least one successor augmented belief state $\bar{b}'$ such that $\Phi(\bar{b}') = \emptyset$. Therefore,
    \begin{align*}
         \mathbf{a} &= \argmax_{a \in A}\Big[Q_R^{\pi}(\bar{b}, a) \mid
        Q^{\pi}_C(\bar{b}, a) \leq d\Big] = \emptyset\\
        \mathbb{B}[V^{\pi^0}](\bar{b}) &= (V_R^{\pi^0}(\bar{b}_t), (\infty, \ldots, \infty)).
    \end{align*}
    Therefore, we have that for all $\bar{b} \in B_{inadmiss}$, 
    $$\mathbb{B}^n[V^{\pi^0}](\bar{b}) = (V_R^{\pi^0}(\bar{b}_t), (\infty, \ldots, \infty)) \;\;\forall n.$$
    % Therefore, $V_R^{\pi^n}(\bar{b}) = \underline{R} \;\;\forall \bar{b} \in B_{inadmiss}, \;\;\forall n$.
    
    Next, denote $B_{admiss} = \{\bar{b}_t \;|\; \Phi(\bar{b}_t) \neq \emptyset\}$. Consider any $\bar{b} \in B_{admiss}$, $V_R^{\pi^0}(\bar{b}) \in \mathbb{R}$ and $V_C^{\pi^0}(\bar{b}) \leq d(h_t)$. There must exist at least $1$ action that is part of an admissible policy, i.e., $\mathbf{a} \neq \emptyset$. Therefore, at $\bar{b}$
   \begin{align*}
        \mathbb{B}[V^\pi](\bar{b}) = 
            \big(Q_R^{\pi}(\bar{b}, a), Q_C^{\pi}(\bar{b}, a)\big), a \in \mathbf{a}
    \end{align*}

    Since any $a \notin \mathbf{a}$ are inadmissible and will not be selected during the Bellman backup operation, we can exclude them from the set of actions without loss of generality. Denote $A'(\bar{b}) = \mathbf{a}$ as the set of actions that may be selected at $\bar{b}$. Then, for all $\bar{b} = (b, d) \in B_{admiss}$,
    \begin{align*}
        \mathbf{a'} &= \argmax_{a \in A'(\bar{b})}\Big[Q_R^{\pi}(\bar{b}, a)\Big]\\
        \mathbb{B}[V^{\pi^0}](\bar{b}) &= (Q_R^{\pi^0}(\bar{b}, a), Q_C^{\pi^0}(\bar{b}, a)), a \in \mathbf{a'}.
    \end{align*}

    Consider $V^{\pi^1} = \mathbb{B}[V^{\pi^0}](\bar{b})$. We have that
    \begin{align*}
        V_R^{\pi^1}(\bar{b}) \geq V_R^{\pi^0}(\bar{b}) \text{ and } V_C^{\pi^1}(\bar{b}) \leq d.
    \end{align*}

    By induction, $V_C^{\pi^n}(\bar{b}) \leq d \;\forall \bar{b} \in B_{admiss}$, so the policy $\pi^n$ remains admissible after applying $\mathbb{B}$. Therefore, we can write
    \begin{align*}
        \mathbb{B}_R[V_R^{\pi}](\bar{b}) = \max_{a \in A'(\bar{b})}\Big[Q_R^{\pi}(\bar{b}, a)\Big], \forall \bar{b} \in B_{admiss}.
    \end{align*}

    Note that this is the standard Bellman operator for an unconstrained discounted-sum POMDP over the set of admissible augmented belief states. From the results of the Bellman operator for a POMDP \citep{hauskrecht1997planning}, $\mathbb{B}_R$ is a contraction mapping and has a single, unique fixed point, i.e., for an optimal $\pi^*$, $\mathbb{B}_R[V_R^{\pi^*}](\bar{b}) = V_R^{\pi^*}(\bar{b}) \;\;\forall \bar{b} \in B_{admiss}$. Since $V_R^{\pi^n}(\bar{b}) = V_R^{\pi^0}(\bar{b}) \;\;\forall \bar{b} \in B_{inadmiss}, \forall n$, we have that
    \begin{align*}
        \mathbb{B}^{n}[(V_R^{\pi^0}, V_C^{\pi^0})] \rightarrow (V_R^{\pi^*}, V_C^{\pi^*}) \text{ as } n \rightarrow \infty.
    \end{align*}
    % such that $\pi^*$ is admissible.
    % \qh{need to show that this is admissible?}
    % such that $\pi^*$ is admissible.
    % Thus, $\pi^*$ satisfies the BPO criterion:
    %     $$\mathbb{B}[V_R^{\pi^*}](\bar{b}_t) = V_R^{\pi^*}(\bar{b}_t) \qquad \forall \bar{b}_t \in \textsc{Reach}^{\pi^*}\!(\bar{b}_0).\qedhere$$
\end{proof}

\section{ARCS Pseudocode}

\begin{algorithm}[ht!]
    \caption{Sampling of nodes for backup.}
    \label{alg:sampling}
    \textbf{Global variables}: $\mathcal{M}, T, \Gamma_{c_{min}}$\\
    Let $\gamma = \mathcal{M}.P.\gamma$\\
    \texttt{SAMPLE($\epsilon$)}.
    \begin{algorithmic}[1] %[1] enables line numbers
        \STATE $L \gets T.v_0.\lowervalue$.
        \STATE $U \gets L + \epsilon$.
        \IF {$rand() < 0.5$}
        \STATE \texttt{SampleHeu}($T.v_0, L, U, \epsilon, \gamma, 1$).
        \ELSE
        \STATE \texttt{SampleRandom}($T.v_0, \gamma)$.
        \ENDIF
        \\
        \STATE \textbf{return} sampled nodes.
    \end{algorithmic}
    \texttt{SampleHeu}($v, L, U, \epsilon, \gamma, t$).
    \begin{algorithmic}[1]
        \STATE Let $\hat{V}$ be the predicted value of $v.V^*_R$.
        \IF {$\hat{V} \leq L$ and $v.\uppervalue \leq \max \{U, \underline{V}_R(v.b) + \epsilon \gamma^{-t}\}$}
            \STATE \textbf{return}.
        \ELSE
        \STATE $\underline{Q} \gets \max_a \underline{Q}_R(v.b,a)$.
        \STATE $L' \gets \max\{L, \underline{Q}\}$.
        \STATE $U' \gets \max\{U, \underline{Q} + \gamma^{-t}\epsilon\}$.
        \STATE $a' \gets \argmax_a \{v.\upperQvalue(a) \mid v.\lowerQcost(a) \leq v.d\}$.
        \IF {$a' = \emptyset$}
            \STATE \textbf{return}.
        \ENDIF
        \STATE $o' \gets \argmax_o [p(o | b, a')(v'.\uppervalue - v'.\lowervalue - \epsilon \gamma^{-t})]$.
        \STATE Compute $L_t$ such that $L' = R(v.b,a') +$\\
        $\gamma(p(o'|v.b, a')L_t + \sum_{o \neq o'}p(o|v.b,a')v'.\lowervalue)$.
        \STATE Compute $U_t$ such that $U' = R(v.b,a') +$\\
        $\gamma(p(o'|v.b, a')U_t + \sum_{o \neq o'}p(o|v.b,a')v'.\uppervalue)$.
        \STATE $v' \gets \texttt{SuccessorNode}(v, a', o')$.
        \STATE $T \gets v'$.
        \STATE \texttt{SampleHeu}($v', L_t, U_t, \epsilon, \gamma, t + 1$).
    \ENDIF
    \end{algorithmic}
    \texttt{SampleRandom}($v, \gamma$)
    \begin{algorithmic}[1]
        \STATE $a \gets rand_a\{a \in A\}$.
        \STATE $o \gets rand_o\{o \in O\}$.
        \STATE $v' \gets \texttt{SuccessorNode}(v, a, o)$.
        \STATE $T \gets v'$.
        \IF {new node is added to $T$}
            \STATE \textbf{return}.
        \ELSE
            \STATE \texttt{SampleRandom}($v', \gamma$).
        \ENDIF
    \end{algorithmic}
\end{algorithm}

\begin{algorithm}[ht!]
    \caption{Compute Successor Node.}
    \label{alg:updatenode}
    \textbf{Global variables}: $\mathcal{M}, T, \Gamma_{c_{min}}$\\
    Let $\gamma = \mathcal{M}.P.\gamma$.\\
    \texttt{SuccessorNode}($v, a, o)$
    \begin{algorithmic}[1]
       \IF{$T.child(v, a, o) \notin T$}
           \STATE $b' \gets BeliefUpdate(v.b, a, o)$ using Eq.~\eqref{eq:bayes}.
          \STATE $d' \gets \frac{1}{\gamma}(v.d - C(v.b,a))$.
           \STATE Initialize lower bound on $k'$ using Eq.~\eqref{eq:maximinLP}.
           \STATE $(\alpha_r, \alpha_c) \gets  \argmin_{(\alpha_r, \alpha_c) \in \Gamma_{c_{min}}} \alpha_c^T b'$.
            \STATE $\lowervalue \gets \alpha_r^T b'$.
            \STATE $\uppercost \gets \alpha_c^T b'$.
          \STATE $\uppervalue \gets \uppervalue(b')$ with Fast Informed Bound (maximizing rewards).
           % \STATE $\globaluppervalue \gets \globaluppervalue \cup \uppervalue$.
           \STATE $\lowercost \gets \lowercost(b')$ with Fast Informed Bound (minimizing costs).
           % \STATE $\globallowercost \gets \globallowercost \cup \lowercost$.
           \STATE $\upperQvalue \gets \emptyset$.
           \STATE $\lowerQvalue \gets \emptyset$.
           \STATE $\upperQcost \gets \emptyset$.
           \STATE $\lowerQcost \gets \emptyset$.
           \STATE $v' \gets (b', d', k', \uppervalue, \lowervalue, \uppercost, \lowercost, \upperQvalue, \lowerQvalue, \upperQcost, \lowerQcost)$.
        \ELSE
            \STATE $v' \gets T.child(v, a, o)$.
       \ENDIF
       \STATE \textbf{return} $v'$.
    \end{algorithmic}
\end{algorithm}

\begin{algorithm}[ht!]
    \caption{Perform backup at a node}
    \label{alg:backup}
    \textbf{Global variables}: $\mathcal{M}, T, \Gamma_{c_{min}}$\\
    Let $\gamma = \mathcal{M}.P.\gamma$\\
    \texttt{BACKUP($v$)}
    \begin{algorithmic}[1] %[1] enables line numbers
        \STATE Initialize $\vec{k}$ of size $|A|$
        \FORALL{$a \in A$}
            \STATE $v.\upperQvalue(a) \gets R(v.b,a) + \gamma\mathbb{E}[v'.\uppervalue]$
            \STATE $v.\upperQcost(a) \gets C(v.b,a) + \gamma\mathbb{E}[v'.\uppercost]$
            \STATE $v.\lowerQvalue(a) \gets R(v.b,a) + \gamma\mathbb{E}[v'.\lowervalue]$
            \STATE $v.\lowerQcost(a) \gets C(v.b,a) + \gamma \mathbb{E}[v'.\lowercost]$
            \STATE $\vec{k}[a] \gets \min_{v'}v'.k$
        \ENDFOR
        \STATE $a \gets \argmax_a\{v.\lowerQvalue(a) \mid \upperQcost(a) \leq v.d\}$ 
        \IF{$a \neq \emptyset$}
            \STATE $v.\lowervalue \gets v.\lowerQvalue(a)$
            \STATE $v.\uppercost \gets v.\upperQcost(a)$
            \STATE $v.k \gets \vec{k}[a] + 1$
        \ELSE
            \STATE $a \gets \argmin_a\{v.\upperQcost(a)\}$ 
            \STATE $v.\lowervalue \gets v.\lowerQvalue(a)$
            \STATE $v.\uppercost \gets v.\upperQcost(a)$
            \STATE $v.k \gets 0$
        \ENDIF
        \STATE $a \gets \argmax_a\{v.\upperQvalue(a) \mid \lowerQcost(a) \leq v.d\}$
        \IF{$a \neq \emptyset$}
            \STATE $v.\uppervalue \gets v.\upperQvalue(a)$
            \STATE $v.\lowercost \gets v.\lowerQcost(a)$
        \ELSE
            \STATE $v.\uppervalue \gets -\infty$
            \STATE $v.\lowercost \gets \infty$
            \STATE $v.\lowervalue \gets -\infty$
            \STATE $v.\uppercost \gets \infty$
        \ENDIF
    \end{algorithmic}
\end{algorithm}

\begin{algorithm}[ht!]
    \caption{Prune nodes and node-actions from $v$}
    \label{alg:prune}
    \textbf{Global variables}: $\mathcal{M}, T, \Gamma_{c_{min}}$\\
    \texttt{PRUNE($v$)}
    \begin{algorithmic}[1] %[1] enables line numbers
    \FORALL{$v \in B_{sam}$}
        \IF{$v.\lowercost > v.d$}
            \STATE Prune $v$.
        \ENDIF
        \IF{\textbf{all} node-actions $(v,a)$ are pruned}
            \STATE Prune $v$.
        \ENDIF
        \STATE Initialize $\vec{k}$ of size $|A|$.
        \FORALL{$a \in A$}
            \STATE $\vec{k}[a] \gets \min v'.k$.
        \ENDFOR
        \FORALL{$a, a' \in A$}
                \IF{any child of $(v, a)$ is pruned}
                    \STATE Prune node-action $(v,a)$.
                \ENDIF
                \IF{$\vec{k}[a'] = \infty$ and $v.\upperQvalue(a) < v.\lowerQvalue(a')$}
                    \STATE Prune node-action $(v,a)$.
                \ENDIF
        \ENDFOR
    \ENDFOR
    \end{algorithmic}
\end{algorithm}

\newpage
\newpage

\section{Proof of Lemma 1}

\begin{proof}
Given a maximum one-step cost $C_{max}$ for $\pi_{c}^{min}$ and a non-negative admissible cost $v.d$, a lower bound on the admissible horizon can be obtained as follows. The $k$-step admissible horizon from each leaf node $v$ is the largest $k$ such that
\begin{align*}
    \sum_{\tau=0}^{k-1} \gamma^\tau C_{max} \leq v.d.
\end{align*}
The LHS is a finite geometric series:
\begin{equation}
    \label{eq:geometric series}
    \sum_{\tau=0}^{k-1} \gamma^\tau C_{max} = C_{max}\left(\frac{1-\gamma^k}{1-\gamma}\right) \leq v.d.
\end{equation}
Hence, the largest integer $k$ that satisfies Eq.~\eqref{eq:geometric series} is
\begin{align*}
    k = \Big\lfloor \log \Big(1 - \Big(\frac{v.d}{C_{max}}\Big) \cdot (1-\gamma)\Big) / \log(\gamma) \Big\rfloor.
\end{align*}

Also, we obtain the $\infty$-admissibility condition on $\pi_{c}^{min}$ by setting $k = \infty$ in Eq.~\eqref{eq:geometric series}:
\begin{align*}
    \frac{C_{max}}{1-\gamma} \leq v.d.
\end{align*}

Next, when $v.\uppercost^{\pi_{c^{min}}} = 0 \leq v.d$, since costs are non-negative, the minimum cost policy obtains $0$ cost at every future belief, and hence $k = \infty$.

Finally, when admissible cost is negative, the constraints are trivially violated, and hence $k = 0$.
\end{proof}

\section{Proof of Proposition 3}

\begin{proof}
     During search, the pruning criteria prune policies according to four cases. We show that these four cases only prune sub-optimal policies and inadmissible policies. 
        \begin{enumerate}
        \item $v.\lowercost > v.d$.\\
        It is easy to see that if $v.\lowercost > v.d$, it is guaranteed that no admissible policies exist from $v$, so we can prune $v$ and its subtree.
        \item At node $v$, actions $a$ and $a'$ are compared. Specifically, $v.\upperQvalue(a)$ is compared with $v.\lowerQvalue(a')$, if $k(v,a') = \infty$ (the policy from taking $a'$ is admissible). The node-action $(v,a)$ is pruned if $v.\upperQvalue(a) < v.\lowerQvalue(a')$.\\
        There are two cases to consider: (i) $k(v,a) = \infty$ and (ii) $k(v,a) < \infty$.  In case (i), $k(v,a) = \infty$. $v.\upperQvalue(a)$ is a valid upper bound of the Q reward-value of taking action $a$ as a consequence of Lemma~\ref{lemma:validity}.  In case (ii), $k(v,a) < \infty$, $v.\upperQvalue(a)$ is also a valid upper bound of the Q reward-value of taking action $a$. For a node $v$ with $\bar{b} = (v.b, v.d)$, We show that $v.\uppervalue$ is an upper bound on $V_R^*(\bar{b})$. That is, if $v.k \leq \infty$, we have that for an optimal policy starting from $\bar{b} = (v.b, v.d)$ with optimal reward-value $V_R^*(\bar{b})$ and cost-value $V_C^*(\bar{b})$,
    \begin{align}
        V_R^*(\bar{b}) \leq v.\uppervalue \text{ and }
        v.\lowercost \leq V_C^*(\bar{b}).
    \end{align}

    This can be seen by noting that the \texttt{BACKUP} step performs an RC-POMDP Bellman backup. If the policy is in fact admissible (since $v.k$ is an underestimate), then the results from Lemma~\ref{lemma:validity} hold. If the policy is not admissible, the optimal reward-value of an admissible policy from that node cannot be higher than $v.\uppervalue$ (and the optimal cost-value cannot be lower than $v.\lowercost$), since an admissible policy satisfies more constraints than a finite $k$-admissible one. In both cases, $a'$ is strictly a better action than $a$, so taking action $a$ at node $v$ cannot be part of an optimal policy.
    \item If all node-actions $(v,a)$ are pruned, $v$ is also pruned.\\
    No actions are admissible from $v$ so it is inadmissible.
    \item $(v,a)$ is pruned if any successor node from taking action $a$ is pruned.\\
    A successor node is pruned (case 1 or 3), so this policy is not admissible.
    \end{enumerate}
\end{proof}

\section{Proof of Lemma 2}

\begin{proof}
    The proof relies on the result of Theorem~\ref{thm:deterministic rcpomdp} that, for admissibility constraint $k = \infty$, deterministic policies are sufficient for optimality. That is, it is sufficient to provide upper and lower bounds over deterministic policies.

    We first show that the initial bounds $\uppervalue, \lowervalue, \uppercost, \lowercost, k$ for a new (leaf) node $v$ (Algorithm~\ref{alg:updatenode}) are true bounds on the optimal policy. 
    
    $\uppervalue$ and $\lowercost$ are initialized with the Fast Informed Bound with the unconstrained POMDP problem, separately for reward maximization and cost minimization. The Fast Informed Bound provides valid upper bounds on reward-value (and lower bounds on cost-value of a cost-minimization policy, which in turn is a lower bound on cost-value for an optimal RC-POMDP policy) \cite{hauskrecht2000value}. The upper bound on the optimal reward-value for the reward-maximization unconstrained POMDP problem is also an upper bound on the optimal reward-value for an RC-POMDP which has additional constraints. Similarly, the lower bound on the optimal cost-value for the cost-minimization unconstrained POMDP problem is also a lower bound on the cost-value of an optimal RC-POMDP policy which has additional constraints.
    
    $\lowervalue$ and $\uppercost$ are computed using the minimum cost policy $\Gamma_{c_{min}}$. This minimum cost policy is an alpha-vector policy, which is an upper bound on the cost-value function \cite{hauskrecht2000value}, so $\uppercost$ is an upper bound on the cost-value when following the minimum cost policy. It can also be seen that the value $\lowervalue$ from following the same policy is a valid bound on the optimal reward-value function from that node. Finally, the admissible horizon guarantee $k$ is initialized using the results from Lemma~\ref{lemma:k-admissible}, which is shown to be a lower bound on the true admissible horizon following the minimum cost policy.

    Next, we show that performing the \texttt{BACKUP} step (Algorithm~\ref{alg:backup}) maintains the validity of the bounds. Recall the condition of this lemma that admissible horizon guarantee $v.k = \infty$ for the node $v$. Thus, after the backup step, the admissible horizon guarantee remains at $v.k = \infty$. From the proof of Theorem~\ref{thm: bpo rcpomdp}, the RC-POMDP Bellman backup $\mathbb{B}$ satisfies Bellman's Principle of Optimality and is a contraction mapping within the space of admissible value functions (and hence policies).
    
    Let $\uppervalue', \lowervalue', \uppercost', \lowercost'$ be the value after the \texttt{BACKUP} step, which performs a Bellman backup $\mathbb{B}$ on $\uppervalue, \lowervalue, \uppercost, \lowercost$:
    \begin{align*}
        v.\uppervalue' = v.\mathbb{B}(\uppervalue),\\
        v.\lowervalue' =  v.\mathbb{B}(\lowervalue),\\
        v.\uppercost' =   v.\mathbb{B}(\uppercost),\\
        v.\lowercost' = v.\mathbb{B}(\lowercost).
    \end{align*}
    Since $\mathbb{B}$ is a contraction mapping within the space of admissible policies, we see that:
    \begin{align*}
        v.\mathbb{B}(\uppervalue) \leq v.\uppervalue,\\
        v.\mathbb{B}(\uppercost) \leq v.\uppercost,\\
        v.\mathbb{B}(\lowervalue) \geq v.\lowervalue,\\
        v.\mathbb{B}(\lowercost) \geq v.\lowercost.
    \end{align*}
    
    Therefore, for $v.k = \infty$,  we have that for an optimal policy starting from $\bar{b} = (v.b, v.d)$ with optimal reward-value $V_R^*(\bar{b})$ and cost-value $V_C^*(\bar{b})$,
    \begin{align*}
        v.\lowervalue \leq V_R^*(\bar{b}) \leq v.\uppervalue,\\
        V_C^*(\bar{b}) \leq v.\uppercost.
    \end{align*}
\end{proof}

\section{Proof of Theorem 4}

\begin{proof}
    There are two termination criteria for ARCS, of which both must be true before termination. ARCS terminates when (1) it finds an admissible policy, and (2) the policy is $\epsilon$-optimal, that is when $v_0.\uppervalue - v_0.\lowervalue \leq \epsilon$. We first discuss admissibility, then $\epsilon$-optimality.

    (1) ARCS can terminate when it finds an admissible policy, i.e., $v_0.k = \infty$. ARCS finds an admissible policy when every leaf node $v_{leaf}$ under the policy satisfies (i) Eq.~\eqref{eq: infinite admissibility}, or (ii) $V_{C}^{\pi}(b'_t) = 0$.

    We prove that this is a sound condition, i.e., if (i) or (ii) holds for every leaf node $v_{leaf}$, the computed policy is indeed admissible. As proven in Lemma~\ref{lemma:k-admissible}, the admissible horizon guarantee for a leaf node $v_{leaf}.k$ is a conservative under-approximation. Therefore, a leaf node with $v_{leaf}.k = \infty$ indeed means that we have found an admissible policy from $v_{leaf}$ (with $\Gamma_{c_{min}}$). Suppose all leaf nodes have $v_{leaf}.k = \infty$. The worst-case back-propagation of admissible horizon guarantee up the tree is sound, since a non-leaf node $v$ only has $v.k = \infty$ if all its leaf nodes have $k = \infty$ \emph{and} Eq.~\eqref{eq:precursor constraints} is satisfied at that node (Lines 9--18 in Algorithm~\ref{alg:backup}). Therefore, if $v_0.k = \infty$, the policy is admissible, and ARCS can terminate if $v_0.k = \infty$.
    
    (2) ARCS can terminate when the gap criterion at the root is satisfied, that is when $v_0.\uppervalue - v_0.\lowervalue \leq \epsilon$. 

    If $v_0.k = \infty$, the policy at $v_0$ is admissible, which implies every history-belief reachable under the policy tree is admissible. From Lemma~\ref{lemma:validity}, this implies that for all nodes, $\uppervalue, \lowervalue, \uppercost, \lowercost$ are valid bounds on the optimal value function. Thus, $v_0.\uppervalue$ and $v_0.\lowervalue$ are valid bounds on the optimal value function from $b_0$, and so, an $\epsilon$-optimal policy is indeed found.

    Therefore, if ARCS terminates, the computed solution is an admissible $\epsilon$-optimal policy.
\end{proof}

\section{Experimental Evaluation}
\label{appendix: evaluation}

\subsection{Implementation details}

The code for each algorithm implementation can be found in the attached supplementary material. Here, we detail parameters and implementation of the algorithms. For hyper-parameter tuning, we used the default parameters for ARCS. For the rest of the algorithms, the values of the hyper-parameters were chosen based on empirical evaluations, and fixed for the experiments. For each environment, we used a maximum time step of $20$ during evaluation. Except for the Tiger problem, all algorithms reached terminal states before $20$ time steps in these problems or produced a policy which stayed still at $20$ time steps.

\subsubsection{ARCS}
We implemented ARCS as described. We used the Fast Informed Bound for the initialization of upper bound on reward value and lower bound on cost value. We used SARSOP for the computation of the minimum cost policy. We set the SARSOP hyperparameter $\kappa = 0.5$ for our experiments, the same value as \cite{Kurniawati-RSS08-SARSOP}. We used a uniform randomization ($0.5$ probability) between heuristic sampling and random sampling during planning, and we leave an analysis on how the randomization weight may affect planning efficiency to future work. 

\subsubsection{CGCP} We implemented CGCP and adapted it for discounted infinite horizon problems, using  Alg. 5 in \cite{walraven2018cgcp} as a basis. However, we use the discounted infinite horizon POMDP solver SARSOP \cite{Kurniawati-RSS08-SARSOP} in place of a finite horizon PBVI. Our method of constructing policy graphs also differs, as the approach described is for finite horizon problems. We check for a return to beliefs previously visited under the policy in order to reduce the size of the graph. A maximum time of $300$ seconds was used for CGCP. For each SARSOP iteration within CGCP, $\tau = 20$ seconds was given initially, while the solve time was incremented by $\tau^+ = 100$ seconds every time that the dual price $\lambda$ remained the same. Additionally, CGCP was limited to $100$ iterations. In an effort to reduce computation time, policy graph evaluation and SARSOP search were limited to depth $20$ (the same as the Monte Carlo evaluation depth) in all domains except the RockSample domains, which were allowed unlimited depth.

For the Tunnels benchmark, $1000$ Monte Carlo simulations to depth $20$ were used in place of a policy graph to estimate the value of policies. This was due to the inability of the policy graphs to estimate the value of some infinite horizon POMDP solutions which do not lead to terminal states or beliefs which have already appeared in the tree.

\subsubsection{CGCP-CL}
CGCP-CL uses the same parameters as CGCP, but re-plans at every time step.

\subsubsection{No-regret Learning Algorithm}

We implemented the no-regret learning algorithm from \citep{kalagarla22aNoRegret}. We used SARSOP as the unconstrained POMDP solver, and Monte Carlo simulations to estimate the value of policies. 

\subsubsection{CPBVI}
We implemented CPBVI based on \cite{kim2011cpbvi}. The algorithm generates a set of reachable beliefs $\mathcal{B}$ before performing iterations of approximate dynamic programming on the belief set. However, the paper did not include full details on belief set $\mathcal{B}$ generation and alpha-vector set $\Gamma$ initialization. 

The paper cited \citet{pineau2006anytime} for their belief set description, and so we followed \citet{pineau2006anytime} by expanding $\mathcal{B}$ greedily towards achieving uniform density in the set of reachable beliefs. This is done by randomly simulating a step forward from a node in the tree, thereby generating candidate beliefs, and keeping the belief that is farthest from any belief already in the tree. We repeat this expansion until the desired number of beliefs have been added to the tree.

To address $\Gamma$ initialization, we adopted the \textit{blind lower bound} approach. This approach represents the lower bound $\Gamma$ with a set of alpha-vectors corresponding to each action in $A$. Each alpha-vector is generated under the assumption that the same action is taken forever. To compute an alpha-vector corresponding to a given action, we first compute the \textit{best-action worst-state (BAWS)} lower bound. This is done by evaluating the discounted reward obtained by taking the best action in the worst state forever. We can then update the \textit{BAWS} alpha-vectors by performing value backups until convergence. 

The CPBVI algorithm involves the computation of a linear program (LP) to obtain the best action at a given belief. One of the constraints asserts that the convex combination of cost alpha-vectors evaluated at a given belief $b$ must be equal to or less than the admissible cost $d$ associated with $b$, which is used in CPBVI's heuristic approach. However, if $d < 0$, the LP becomes infeasible. The case of $d < 0$ is possible since no pruning of beliefs is conducted. The paper did not provide details to account for this situation. To address this, if the LP is infeasible, we output the action with the lowest cost, akin to ARCS' minimum cost policy method when no policy is admissible.

\subsubsection{CPBVI-D} CPBVI computes stochastic policies. We modify CPBVI to only compute deterministic policies with the following details. Instead of solving the LP to generate a stochastic action, we solve for the single highest value action subject to the cost-value constraint. 

Although both CPBVI and CPBVI-D theoretically have performance insensitive to random seed initialization, both algorithms are sensitive to the number-of-beliefs parameter during planning. With too few beliefs selected for a problem, both algorithms cannot search the problem space sufficiently. With too many beliefs selected, the time taken for belief selection is too high for the moderately sized problems of CRS and Tunnels. Therefore, we tuned and chose a belief parameter of $30$ that allows finding solutions in the planning time of $300s$. Note that even with a small number of beliefs of $30$, CPBVI routinely overruns the planning time limit during its update step.

\subsection{Environment details}

\begin{itemize}[nosep] %,label={},leftmargin=0pt
    \item \textbf{CE}: Simplified counterexample in Figure~\ref{fig:counterexample}.
    \item \textbf{C-Tiger}: A constrained version of the Tiger POMDP Problem~\cite{Kaelbling1998pomdp}, with cost of $1$ for the ``listen'' action.
    \item \textbf{CRS}: A constrained version of the RockSample problem \cite{Trey2004HSVI} as defined in \cite{Lee2018ccpomcp} with varying sizes and number of rocks.
    \item \textbf{Tunnels}: A scaled version of Example \ref{ex:caveexample}, shown in Fig.~\ref{fig:tunnels problem}.
\end{itemize}

Except for the RockSample environments, our environments do not depend on randomness. Since the rock locations in each RockSample environment are randomly generated, we used the MersenneTwister random number generator with a fixed seed to generate these environments.

\subsubsection{Counterexample Problem}

The counterexample POMDP in Figure~\ref{fig:counterexample} uses a discount of $\gamma = 1$. In the experiments, we used a discount factor of $\gamma = 1 - e^{-14}$ to approximate a discount of $\gamma = 1$. It is modeled as an RC-POMDP as follows.

States are enumerated as $\{s_1,s_2,s_3,s_4,s_5\}$ with actions following as $\{a_A,a_B\}$ and observations being noisy indicators for whether or not a state is rocky.

States $s_1$ and $s_2$ indicate whether cave 1 or cave 2 contains rocky terrain, respectively. Taking action $a_B$ circumvents the caves, unconditionally incurring a cost of $5.0$ and transitioning to terminal state $s_5$. Taking action $a_A$ moves closer to the caves, where $s_i$ deterministically transitions to $s_{i+2}$. In this transition, the agent is given an 85\% accurate observation of the true state.

At this new observation position, the agent is given a choice to commit to one of two caves, where $s_3$ indicates that cave 1 contains rocks and $s_4$ indicates that cave 2 contains rocks. Action $a_A$ moves through cave 1 and $a_B$ moves through cave 2. Moving through rocks incurs a cost of $10$ while avoiding them incurs no cost. Taking action $a_A$ at this point, regardless of true state, gives a reward of $12$. States $s_3$ and $s_4$ unconditionally transition to terminal state $s_5$.

\subsubsection{Tunnels Problem}

The tunnels problem is modeled as an RC-POMDP as follows.
As depicted in Figure~\ref{fig:tunnels problem}, the tunnels problem consists of a centralized starting hall that funnels into 3 separate tunnels. At the end of tunnels 1, 2, and 3 lie rewards 2.0, 1.5, and 0.5, respectively. However, with high reward also comes high cost, as tunnel 1 has an 80\% probability of containing rocks and tunnel 2 has a 40\% probability of containing rocks, while tunnel 3 is always free of rocks. If present, the rocks fill 2 steps before the reward location at the end of a tunnel, and a cost of 1 is incurred if the agent traverses over these rocks. Furthermore, a cost of 1 is incurred if the agent chooses to move backwards.

The only partial observability in the state is over whether or not rocks are present in tunnels 1 or 2. As the agent gets closer to the rocks, accuracy of observations indicating the presence of rocks increases. 

\subsection{Experiment Evaluation Setup}

We implemented each algorithm in Julia using tools from the POMDPs.jl framework~\cite{egorov2017pomdps}, and all experiments were conducted single-threaded on a computer with two nominally 2.2 GHz Intel Xeon CPUs with 48 cores and 128 GB RAM. All experiments were conducted in Ubuntu 18.04.6 LTS. For all algorithms except CGCP-CL, solve time is limited to $300$ seconds and online action selection to $0.05$ seconds. For CGCP-CL, $300$ seconds was given for each action (recomputed from scratch). We simulate each policy $1000$ times, except for CGCP-CL, which is simulated $100$ times due to the time taken for re-computation of the policy at each time step. The full results with the mean and standard error of the mean for each metric are shown in Table~\ref{tab:extended results}.

\begin{table*}[t]
    % \small 
    \centering 
    \begin{center} 
    \begin{tabular}{l | c | l | c | c| c}  
    Environment & State/Action/Obs &  Algorithm & Violation Rate & Cumulative Reward & Cumulative Cost\\ [0.5ex]  
    \hline 
    \multirow{2}{*}{CE} & & CGCP & $0.514 \pm 0.016$ & $12.0 \pm 0.0$ & $5.19 \pm 0.158$\\  
                                                & & CGCP-CL & $0.0 \pm 0.0$ & $6.12 \pm 0.603$ & $3.25 \pm 0.313$\\ 
                                                ($\hat{c}=5$)& $5\,/\,2\,/\,2$ & CPBVI & $0.0 \pm 0.0$ & $8.354 \pm 0.135$ & $4.505 \pm 0.067$\\ 
                                                && CPBVI-D & $0.0 \pm 0.0$ & $6.192 \pm 0.19$ & $3.61 \pm 0.105$\\ 
                                                && EXP-Gradient & $0.485 \pm 0.016$ & $11.868 \pm 0.04$ & $4.975 \pm 0.157$ \\
                                                && Ours & $0.0 \pm 0.0$ & $10 \pm 0$ & $5 \pm 0$ \\\hline 
    \multirow{2}{*}{C-Tiger} &&  CGCP & $0.674 \pm 0.015$ & $-62.096 \pm 3.148$ & $1.536 \pm 0.034$ \\
                                                && CGCP-CL  & $0.76 \pm 0.043$ & $-72.424 \pm 5.283$ & $1.535 \pm 0.005$ \\
                                                ($\hat{c}=1.5$)&2\,/\,3\,/\,2& CPBVI & $0.482 \pm 0.016$ & $-74.456 \pm 1.79$ & $1.489 \pm 0.011$ \\
                                                & & CPBVI-D & $0.0 \pm 0.0$ & $-75.414 \pm 1.617$ & $1.497 \pm 0.0$ \\
                                                & &EXP-Gradient & $1.0 \pm 0.0$ & $-3.713 \pm 0.92$ & $2.294 \pm 0.004$ \\
                                                && Ours & $0.0 \pm 0.0$ & $-75.075 \pm 1.511$ & $1.422 \pm 0.0$ \\
    \hline
    \multirow{2}{*}{C-Tiger} &  &CGCP & $0.753 \pm 0.014$ & $-1.690 \pm 0.647$ & $2.996 \pm 0.014$ \\  
                                                & &CGCP-CL  & $0.140 \pm 0.035$ & $-2.983 \pm 2.045$ & $2.930 \pm 0.035$ \\ 
                                                ($\hat{c}=3$)& 2\,/\,3\,/\,2& CPBVI & $0.153 \pm 0.011$ & $-11.11 \pm 1.05$ & $2.58 \pm 0.010$\\ 
                                                &&  CPBVI-D & $0.0 \pm 0.0$ & $-178 \pm 2.62$ & $0.0 \pm 0.0$ \\ 
                                                && EXP-Gradient & $1.0 \pm 0.0$ & $1.813 \pm 0.323$ & $3.222 \pm 0.007$ \\
                                                && Ours & $0.0 \pm 0.0$ & $-5.75 \pm 0.522$ & $2.982 \pm 0.001$\\
    \hline
    \multirow{2}{*}{CRS(4,4)} & & CGCP & $0.512 \pm 0.024 $ & $10.434 \pm 0.125$ & $0.512 \pm 0.016$ \\  
                                            && CGCP-CL & $0.78 \pm 0.004 $ & $1.657 \pm 0.315$ & $0.724 \pm 0.040 $ \\ 
                                            ($\hat{c}=1$) & 201\,/\,8\,/\,3 &CPBVI & $0.0 \pm 0.0$ & $-0.4 \pm 0.316$ & $0.522 \pm 0.016$ \\ 
                                             && CPBVI-D & $0.0 \pm 0.0$ & $-0.321 \pm 0.434$ & $3.082 \pm 0.005$ \\
                                            && EXP-Gradient & $0.295 \pm 0.014$ & $10.383 \pm 0.156$ & $0.918 \pm 0.058$ \\
                                            && Ours & $0.0 \pm 0.0$ & $6.52 \pm 0.316$ & $0.523 \pm 0.016$\\ 
    \hline 
    \multirow{2}{*}{CRS(5,7)} & & CGCP & $0.412 \pm 0.022$ & $11.984 \pm 0.193$ & $1.00 \pm 0.038$\\  
                                            & & CGCP-CL & $0.18 \pm 0.009$ & $9.641 \pm  0.477$ & $0.991 \pm 0.034$ \\ 
                                            ($\hat{c}=1$)& 3201\,/\,12\,/\,3 & CPBVI & $0.0 \pm 0.0$ & $0.0 \pm 0.0$ & $0.0 \pm 0.0$ \\ 
                                            & & CPBVI-D & $0.0 \pm 0.0$& $0.0 \pm 0.0$ & $0.0 \pm 0.0$\\ 
                                            && EXP-Gradient & $0.30 \pm 0.014$ & $11.90 \pm 0.22$ & $1.31 \pm 0.06$\\
                                            & & Ours & $0.0 \pm 0.0$ & $11.766 \pm 0.137$ & $0.950 \pm 0.0$ \\           
    \hline 
    \multirow{2}{*}{CRS(7,8)} & & CGCP & $0.357 \pm 0.015$ & $10.78 \pm 0.19$ & $0.945 \pm 0.04$\\  
                                            & & CGCP-CL & $0.20 \pm 0.13$ & $11.17 \pm  1.53$ & $0.931 \pm 0.078$ \\ 
                                            ($\hat{c}=1$) & 12545\,/\,13\,/\,3 & CPBVI & $0.0 \pm 0.0$ & $0.0 \pm 0.0$ & $0.0 \pm 0.0$ \\ 
                                            & & CPBVI-D & $0.0 \pm 0.0$& $0.0 \pm 0.0$ & $0.0 \pm 0.0$\\ 
                                            & & EXP-Gradient & $0.322 \pm 0.015$ & $10.03 \pm 0.16$ & $1.154 \pm 0.054$ \\
                                            & & Ours & $0.0 \pm 0.0$ & $6.61 \pm 0.22$ & $0.960 \pm 0.003$ \\           
    \hline 
    \multirow{2}{*}{CRS(11,11)} & & CGCP & $0.0 \pm 0.0$ & $5.987 \pm 0.0$ & $0.0 \pm 0.0$ \\
                                            & & CGCP-CL & - & - & - \\ 
                                            ($\hat{c}=1$) & 247809\,/\,16\,/\,3 & CPBVI & $0.0 \pm 0.0$ & $0.0 \pm 0.0$ & $0.0 \pm 0.0$ \\ 
                                             & & CPBVI-D & $0.0 \pm 0.0$& $0.0 \pm 0.0$ & $0.0 \pm 0.0$\\ 
                                            & & EXP-Gradient & $0.0 \pm 0.0$ & $5.987 \pm 0.0$ & $0.0 \pm 0.0$ \\
                                            & & Ours & $0.0 \pm 0.0$ & $5.987 \pm 0.0$ & $0.0 \pm 0.0$ \\          
    \hline 
    \multirow{2}{*}{Tunnels} & & CGCP & $0.50 \pm 0.015$ & $1.612 \pm 0.011$ & $1.011 \pm 0.016$\\  
                                                && CGCP-CL & $0.31 \pm 0.046$ & $1.22 \pm 0.058$ & $0.683 \pm 0.082$\\ 
                                                ($\hat{c}=1$)& 53\,/\,3\,/\,5 & CPBVI & $0.90 \pm 0.009$ & $1.921 \pm 0.0$ & $1.621 \pm 0.02$\\ 
                                                P(correct obs) = $0.8$ & & CPBVI-D & $0.89 \pm 0.010$ & $1.921 \pm 0.0$ & $1.568 \pm 0.024$\\ 
                                                & & EXP-Gradient & $0.48 \pm 0.016$ & $1.35 \pm 0.02$ & $0.815 \pm 0.03$\\
                                                & & Ours & $0.0 \pm 0.0$ & $1.028 \pm 0.018$ & $0.440 \pm 0.020$ \\ 
                                                \hline 
    \multirow{2}{*}{Tunnels} & & CGCP & $0.492 \pm 0.016$ & $1.679 \pm 0.008$ & $1.056 \pm 0.033$ \\  
                                            & & CGCP-CL & $0.27 \pm 0.045$ & $1.17 \pm 0.061$ & $0.812 \pm 0.069$ \\ 
                                            ($\hat{c}=1$)& 53\,/\,3\,/\,5 & CPBVI & $0.783 \pm 0.013$ & $1.921 \pm 0.0$ & $1.517 \pm 0.026$ \\ 
                                            & & EXP-Gradient & $0.44 \pm 0.016$ & $1.42 \pm 0.02$ & $0.86 \pm 0.03$\\
                                            P(correct obs) = $0.95$ && CPBVI-D & $0.812 \pm 0.012$ & $1.921 \pm 0.0$ & $1.57 \pm 0.024$ \\ 
                                            & & Ours & $0.0 \pm 0.0$ & $1.010 \pm 0.017$ & $0.273 \pm 0.013$\\ 
    \end{tabular} 
    \end{center}
    \caption{Comparison of our RC-POMDP algorithm to state-of-the-art offline C-POMDP algorithms. We report the mean and $1$ standard error of the mean for each metric. A memory-out is indicated by a $-$. Note that for CRS(11,11), due to the $300$\,s time limit, CGCP, EXP-Gradient, and ours all compute a policy that goes directly to the exit area without interacting with any rocks, and achieve the same reward and $0$ cost.}
    \label{tab:extended results}
\end{table*}