\section{Proofs}\label{app:proofs}

\subsection{Lemma~\ref{lem:sharpness}}\label{app:proof-sharpness}
\begin{proof}
  We reproduce the target bound in Equation~\ref{eq:lower-bound} below.
  \begin{equation*}
    \E[Y(\pi)\mid X] \geq \E[Y\,\tilde g_\pi^{(-)}(Y,X) \mid \Pi=\pi, X].
  \end{equation*}
  An intermediate result is that the proposed balancing criterion (Proposition~\ref{prop:balancing}) is sufficient to induce sharpness:
  \begin{equation*}
      \E\left[\dv{P_{Y(\pi)|X}(Y\mid X)}{P_{Y|\Pi=\pi,X}(Y\mid X)}\Bigm| \Pi=\pi, X\right] = 1.
  \end{equation*}
  Following these conditions, we can use the proposed form for $\tilde g_\pi^{(-)}$,
  \begin{equation*} %
    \tilde g_\pi^{(-)}(Y,X)  \triangleq \begin{cases}
        \E\big[\Gamma^{+\norm{\Pi-\pi}}\bigm| X\big] & \text{if } Y\leq Q_\tau(\pi, X),\\
        \E\big[\Gamma^{-\norm{\Pi-\pi}}\bigm| X\big] & \text{if } Y> Q_\tau(\pi, X),
    \end{cases}
    \qquad
    \tau \triangleq \frac{\E\big[\Gamma^{+\norm{\Pi-\pi}}\mid X\big]-1}{\E\big[\Gamma^{+\norm{\Pi-\pi}}\mid X\big]-\E\big[\Gamma^{-\norm{\Pi-\pi}}\mid X\big]}.
  \end{equation*}
  In the rest of the proof, we omit $X$ in order to relieve the notational burden. Assume that the expectations are all implicitly conditioned on $X$. The results are invariant to its conditioning. Additionally, the outcome space $\mathcal{Y}$, which represents cumulative non-negative rewards, must itself be assumed to be a connected subset of $\mathbb{R}_{\geq 0}$. %
  
  We note that since the construction $\tilde g_\pi^{(-)}(Y,X)$ is already constrained by the proposed sensitivity model as well as the putative balancing criterion, the burden of proof has shifted to the validity of the bound rather than its sharpness.
  \paragraph{First, we prove validity.}
  Suppose that there exists a system in which $\E[Y(\pi)] < \E[Y\,\tilde g_\pi^{(-)} \mid \Pi=\pi]$ for some fixed $\pi$. 
  
  By implication of Equation~\ref{eq:identification}, there is an oracle $\tilde g_\pi$ such that
  \begin{equation*}
      \E[Y\,\tilde g_\pi \mid \Pi=\pi] < \E[Y\,\tilde g_\pi^{(-)} \mid \Pi=\pi], \quad\text{and}\quad \E[(\tilde g_\pi-\tilde g_\pi^{(-)})Y \mid \Pi=\pi] < 0.
  \end{equation*}
  Both kernels must be balanced in the sense that $\E_Y[\tilde g_\pi-\tilde g_\pi^{(-)} \mid \Pi=\pi]=0.$ %
  They are also both positive for all $y\in\mathcal{Y}$, as required for the existence of the proposed sensitivity model (Definition~\ref{def:sensitivity-model}). It stands to reason that the oracle kernel $\tilde g_\pi$ has moved probability mass in $\tilde g_\pi^{(-)}$ from some points to other points (both sets with nonzero measure in $\mathcal{Y}$). In fact, for the $\tilde g_\pi$-weighted conditional expectation of $Y$ to be lowered, there must be probability mass that is moved down, i.e.\ from a point in $\mathcal{Y}$ to a lower point in the same domain. Take any such pair of points for which that is the case:
  \begin{equation*}
      y_1 > y_2 \quad \land \quad \tilde g_\pi(y_1) < \tilde g_\pi^{(-)}(y_1) \quad \land \quad \tilde g_\pi(y_2) > \tilde g_\pi^{(-)}(y_2).
  \end{equation*}
  The kernels are subject to the sensitivity bounds of Equation~\ref{eq:g_bounds}:
  \begin{equation*}
    \E\big[\Gamma^{-\norm{\Pi-\pi}}\big] \leq \tilde g_\pi(Y) \leq \E\big[\Gamma^{+\norm{\Pi-\pi}}\big].
  \end{equation*}
  We must analyze where $(y_1,y_2)$ fall around the threshold $Q_\tau(\pi)$. If they are both on one side, so either $\{Q_\tau(\pi)\geq y_1 > y_2\}$ or $\{y_1 > y_2 > Q_\tau(\pi)\}$, then the sensitivity bounds are trivially violated since $\tilde g_\pi^{(-)}$ lies at the boundary.

  The remaining case is $\{y_1 > Q_\tau(\pi)\geq y_2\}$. There, it follows that
  \begin{align*}
      \tilde g_\pi(y_1) <&\ \tilde g_\pi^{(-)}(y_1) = \E[\Gamma^{-\norm{\Pi-\pi}}],\\
      \tilde g_\pi(y_2) >&\ \tilde g_\pi^{(-)}(y_2) = \E[\Gamma^{+\norm{\Pi-\pi}}],
  \end{align*}
  which both violate the sensitivity bounds, concluding the proof by contradiction.
  


  \paragraph{Second, we prove sharpness.} For any $\pi$, we seek a feasible $Y(\pi)$ such that $\E[Y(\pi)] = \E[Y\,\tilde g_\pi^{(-)} \mid \Pi=\pi]$. %
  For the present purposes, it is enough to treat $Y(\pi)$ as some latent variable, as long as it satisfies the various conditions imposed by the problem. If we show that $\tilde g_\pi^{(-)}$ is a valid Radon--Nikodym derivative of the form
  \begin{equation*}
      \tilde g_\pi^{(-)}(y) = \dv{P_{Z}(y)}{P_{Y|\Pi=\pi}(y)}
  \end{equation*}
  for some hypothetical $Z$, then we can have $Y(\pi)\triangleq Z$. The latent $Z$ is completely determined by $Y$ and $\pi$, with measure
  \begin{equation*}
      P_Z(B)=\int_B \tilde g_\pi^{(-)}(y)\, \dd P_{Y|\Pi=\pi}(y) \quad\text{for every measurable } B\subseteq\mathcal{Y}.
  \end{equation*}

\end{proof}






\subsection{Lemma~\ref{lem:control}}\label{app:proof-control}
\begin{proof}
  The main idea is that for every $Y(\pi)$, there exists a sequence $(u_0,u_1,\dots)$ giving that expected discounted reward.
  To do this, we will first illustrate the equivalence of the value function and the MPC problem we are solving.
  Next, we will define a lower bound of the value function conditioned on an uncertainty variable. 
  Finally, we will show that the infimum of this function over the uncertainty set is equal to the value function under the proposed objective. 

  A point-identified MPC would solve a global optimization over future action trajectories.
\begin{align*}
     V(x) &= \ \max_{\pi=[a_0\ a_1\ \cdots]\in\mathcal{T}} \E\big[Y(\pi)\mid \Pi=\pi, X=x\big]\\
     &= \ \max_{\pi=[a_0\ a_1\ \cdots]\in\mathcal{T}} \textcolor{nice-red}{\E\big[}R_0 + \gamma R_1 + \gamma^2 R_2 + \dots \mid \Pi=\pi, X=x\textcolor{nice-red}{\big]}\\
     &= \ \max_{\pi=[a_0\ a_1\ \cdots]\in\mathcal{T}} \E\big[R_0 \mid \Pi=\pi, X=x\big]\\
     &\qquad\qquad\qquad\qquad
     + \gamma\textcolor{nice-yellow}{\E\big[}R_1 + \gamma R_2 + \gamma^2 R_3 + \dots \mid \Pi=\pi, X=x \textcolor{nice-yellow}{\big]}\\
     &= \ \max_{\pi=[a_0\ a_1\ \cdots]\in\mathcal{T}} \bigg(\E\big[R_0 \mid A_0=a_0, X=x\big] \\
     &\qquad\qquad\qquad\qquad
     + \gamma\E\Big[\underbrace{\E\big[R_0 + \gamma R_1 + \gamma^2 R_2 + \dots \mid \Pi=[a_1\ a_2\ \cdots], X=X' \big]}_\text{(shifting one step ahead in time)} \mid A_0=a_0, X=x\Big]\bigg) \\ %
     &= \ \max_{a_0\in\mathcal{A}} \Bigg(\E\big[R_0 \mid A_0=a_0, X=x\big] \\
     &\qquad\qquad\qquad\qquad
     + \gamma\textcolor{nice-blue}{\E\bigg[}\underbrace{\max_{\pi'=[a_1\ a_2\ \cdots]\in\mathcal{T}}\textcolor{nice-red}{\E\big[}R_0 + \gamma R_1 + \gamma^2 R_2 + \dots \mid \Pi=\pi', X=X' \textcolor{nice-red}{\big]}}_{V(X')} \mid A_0=a_0, X=x\textcolor{nice-blue}{\bigg]}\Bigg)\\ 
     &= \ \max_{a_0\in\mathcal{A}} \E\big[R_0 + \gamma V(X') \mid A_0=a_0, X=x\big]
\end{align*}
The time shift from $\E\big[R_1 + \gamma R_2 + \gamma^2 R_3 + \dots \mid \Pi=[a_0\ a_1\ \cdots], X=x \big]$ to $\E\big[R_0 + \gamma R_1 + \gamma^2 R_2 + \dots \mid \Pi=[a_1\ a_2\ \cdots], X=X' \big]$ is justified by the observable-state transition distribution $X'|A_0,X$. Recall that the rewards are structured as $\mathcal{A}\times\mathcal{X} \to \Delta(\mathbb{R}_{\geq 0})$.

The \textcolor{nice-blue}{blue outer expectation} in the line following the time shift is over the next observable-state transitions $X'|A_0,X$. The \textcolor{nice-red}{red inner expectation} is a shifted version of the MPC objective, which we also mark as red on an earlier line. By iterated expectation, the \textcolor{nice-red}{red} and \textcolor{nice-blue}{blue} expectations together are identical to the \textcolor{nice-yellow}{yellow expectation} shown earlier. However, since the \textcolor{nice-blue}{blue outer expectation} is over a variable ($X'$) that is invariant to future actions $a_1,a_2,\dots$, we can safely break apart the joint maximization and move the maximization over future actions inside the \textcolor{nice-blue}{blue outer expectation}.
This gives us equivalence between the value function viewpoint and the MPC problem we are solving.

  Now we define a compact uncertainty set $\mathcal{U}(A_0, X)$ conditioned on the initial action $A_0$ and the state $X$.
  By the compactness of $\mathcal{U}$, there exists a sequence $(u_0, u_1, \ldots) \in \mathcal{U}$ that minimizes the expected reward. 
From the definition of the sensitivity model in Definition~\ref{def:sensitivity-model}, the reward is bounded below by~\eqref{eq:lower-bound}. 
By assumption, all hidden confounding is incorporated within the uncertainty set and the infimum of the value function over all elements within the uncertainty set provides the lowest expected reward. 


Under partial identification, our MPC instead maximizes a lower bound:
\begin{align*}
     V_{f_\mathrm{MPC}}(x) &= \ \max_{\pi=[a_0\ a_1\ \cdots]\in\mathcal{T}} \E\big[Y\tilde{g}^{(-)}_\pi \mid \Pi=\pi, X=x\big]\\
     &= \ \max_{\pi=[a_0\ a_1\ \cdots]\in\mathcal{T}}\ \  \inf_{(u_0,u_1,u_2,\dots)}\E\big[R_0 + \gamma R_1 + \gamma^2 R_2 + \dots \mid \Pi=\pi, X=x, U_0=u_0,U_1=u_1,U_2=u_2,\dots\big]\\
     &= \ \max_{a_0\in\mathcal{A}} \ \inf_{u\in\mathcal{U}(a_0,x)}\E\big[R_0 + \gamma V(X') \mid A_0=a_0, X=x\big] \\ 
     &= V^*(x) 
\end{align*}
giving us the equivalence between the value functions. 
\end{proof}
