\section{Theoretical Analysis of RC-POMDPs}

We first transform Eq.~\eqref{eq:pre-recursive constraints} into an equivalent recursive form that is better suited for policy computation, e.g., tree search and dynamic programming. Rearranging Eq.~\eqref{eq:pre-recursive constraints} yields $V^\pi_C(b_t) \leq \gamma^{-t}\cdot(\hat{c} - W(h_{t}))$.
Based on this, we define the \textit{history-dependent admissible cost bound} as:
\begin{align}
    d(h_{t}) = {\gamma^{-t}} \cdot (\hat{c} - W(h_{t})),
\end{align}
which can be computed recursively:
\begin{equation}
    \label{eq:history dependent cost recursive}
    d(h_0) = \hat{c}, \quad  d(h_{t+1}) = {\gamma^{-1}} \cdot \big( d(h_t) - C(b_t,a_t) \big).
\end{equation}
\noindent
Then, Problem~\ref{prob: rcpomdp} can be reformulated with recursive bounds.
\begin{proposition}
    Problem~\ref{prob: rcpomdp} can be rewritten as:
    \begin{equation}
    \label{eq:recursive constraints}
    \begin{split}
        \pi^* &=  \arg\max _{\pi} V_{R}^{\pi}(b_{0}) \\ 
        & \text { s.t. } \quad V_{C}^{\pi} (b_t) \leq d(h_{t}) \quad  \forall t \in \{0, 1, \dots, k-1\}, 
    \end{split}
    \end{equation}    
    where $d(h_t)$ is defined recursively in Eq.~\eqref{eq:history dependent cost recursive}.
\end{proposition}

\paragraph*{Optimality of Deterministic Policies}
Here, we show that deterministic policies suffice for optimality in RC-POMDPs.

\begin{theorem}
    \label{thm:deterministic rcpomdp}
    An RC-POMDP with admissibility constraint $k = \infty$ has at least one deterministic optimal policy if an admissible policy exists.
\end{theorem}

A proof is provided in the Appendix. The main intuition is that we can always construct an optimal deterministic policy from an optimal stochastic policy. That is, at every history in which the policy has stochasticity, we can construct a new policy that achieves the same reward-value while remaining admissible by deterministically choosing one of the stochastic actions at that history. We obtain a deterministic optimal policy by inductively performing this determinization at all reachable histories.

\paragraph*{Satisfaction of Bellman's Principle of Optimality}
Here, we show that RC-POMDPs satisfy BPO with a policy-independent optimal substructure.

\begin{proposition}[Belief-Admissible Cost Formulation]
    \label{prop: markovian}
    An RC-POMDP belief $b_t$ with history-dependent admissible cost bound $d(h_t)$ can be rewritten as an augmented belief-admissible cost state $\bar{b}_t = (b_t, d(h_t))$.
    Further, the augmented $Q$-values for a policy can be written as:
    \begin{align*}
        Q_R^\pi((b_t, d(h_t)), a) &= R(b_t,a) + \gamma \, \mathbb{E}[V^\pi_R((b_{t+1}, d(h_{t+1})))],\\
        Q_C^\pi((b_t, d(h_t)), a) &= C(b_t,a) + \gamma \, \mathbb{E}[V^\pi_C((b_{t+1}, d(h_{t+1})))].
    \end{align*}
\end{proposition}

We first see that the evolution of $\bar{b}_t$ is Markovian, i.e., 
    \begin{align*}
        &P(\bar{b}_{t+1}  \mid \bar{b}_{t}, a_t, o_t, h_t)
              \\
             &=\begin{cases}
                P(b_{t+1} \mid b_{t}, a_t, o_t, h_t)  &    \text{if } d(h_{t+1}) =  \frac{(d(h_t) - C(b_t, a_t))}{\gamma} \\
                0   & \text{otherwise,}
             \end{cases}
    \end{align*}
    thus, $P(\bar{b}_{t+1} \mid \bar{b}_{t}, a_t, o_t, h_t) = P(\bar{b}_{t+1} \mid \bar{b}_{t}, a_t, o_t)$.

Here, we use the policy iteration version of the Bellman equation, but a similar argument can be made for value iteration.
\begin{theorem}
    \label{thm: bpo rcpomdp}
    Fix $\pi$. Let $V^\pi = (V_R^\pi, V_C^\pi)$ be the reward- and cost-value functions for $\pi$. The Bellman operator $\mathbb{B}$ for policy $\pi$ for an RC-POMDP is given by, $\forall \bar{b}_t$,
    \begin{align}
        \label{eq:rcbackup}
        &\mathbf{a} = \argmax_{a \in A}\Big[Q_R^{\pi}(\bar{b}_t, a) \mid
        Q^{\pi}_C(\bar{b}_t, a) \leq d(h_t)\Big] \\
        &\mathbb{B}[V^\pi](\bar{b}_t) \triangleq \begin{cases}
            \big(Q_R^{\pi}(\bar{b}_t, a), Q_C^{\pi}(\bar{b}_t, a)\big), \, a \in \mathbf{a}  & \text{ if } \mathbf{a} \neq \emptyset, \\
            \big(V_R^{\pi}(\bar{b}_t), (\infty, \ldots, \infty)\big) & \text{ if } \mathbf{a} = \emptyset,
        \end{cases} \nonumber
    \end{align}
    Assume an admissible policy exists for the RC-POMDP with admissibility constraint $k = \infty$. Let $V^{\pi^*} = (V_R^{\pi^*}, V_C^{\pi^*})$ be the values for an optimal admissible policy $\pi^*$, and let $\pi'$ be the new policy obtained with $(V_R^{\pi'}, V_C^{\pi'}) = \mathbb{B}[V^{\pi^*}]$. Then $\pi^*$ satisfies the BPO criterion of an admissible optimal policy:
    \begin{align}
    \label{eq:bpo optimality}
    V_R^{\pi'}(\bar{b}_t) &= V_R^{\pi^*}(\bar{b}_t) \qquad \forall \bar{b}_t,\\
    V_C^{\pi'}(b_t) &\leq d(h_t) \qquad \quad \, \forall \bar{b}_t \in \textsc{Reach}^{\pi'}\!(\bar{b}_0),
    \end{align}
    where $\textsc{Reach}^\pi(\bar{b}_0)$ is the set of augmented belief states reachable from $\bar{b}_0$ under policy $\pi$.
\end{theorem}

This theorem shows that an optimal policy remains admissible and optimal w.r.t.\ rewards after applying $\mathbb{B}$ on a policy-independent value function $V$. Note that $V_C^*$ is not unique as there may be multiple optimal cost-value functions for an optimal $V_R^*$. Next, we show that $\mathbb{B}$ is a contraction over reward-values for a suitably initialized value function, which is one that defines the space of admissible policies.

\begin{theorem}
    \label{thm: fixed point rcpomdp}
    For each $\bar{b}_t$, define $\Phi(\bar{b}_t)$ as the set of admissible policies from $\bar{b}_t$:
    \begin{align}
        \Phi(\bar{b}_t) = \{\pi \;|\; V^{\pi}_C(b_\tau) \leq d(h_\tau) \;\;\forall \tau \geq t\}.
    \end{align}

    We say that $V^{\pi^0}$ is a \emph{well-behaved} initial value function if the following holds for all $\bar{b}_t$: if $\Phi(\bar{b}_t) = \emptyset$, then $V^{\pi^0}_C(\bar{b}_t) = (\infty, \ldots, \infty)$; if $\Phi(\bar{b}_t) \neq \emptyset$, then $V_C^{\pi^0}(\bar{b}_t) \leq d(h_t)$.

    Suppose that $V^{\pi^0}$ is well behaved, and write $(V_R^{\pi^n}, V_C^{\pi^n}) = \mathbb{B}^{n}[(V_R^{\pi^0}, V_C^{\pi^0})]$. Then $V_R^{\pi^n} \rightarrow V_R^{\pi^*}$ as $n \rightarrow \infty$. That is, starting from $\pi^0$, $\mathbb{B}$ is a contraction on $V_R$ and $V_R^{\pi^*}$ is its unique fixed point.
\end{theorem}

Proofs of all results are provided in the Appendix. Theorems~\ref{thm:deterministic rcpomdp}--\ref{thm: fixed point rcpomdp} show that it is sufficient to search in the space of deterministic policies for an optimal one, and the policy-independent optimal substructure of RC-POMDPs can be exploited to employ dynamic programming for an effective and computationally efficient algorithm for RC-POMDPs. Further, Theorem~\ref{thm: fixed point rcpomdp} shows that determining policy admissibility is essential for effective dynamic programming. These results also indicate that optimal policies for RC-POMDPs do not exhibit the same pathological behaviors as C-POMDPs.

