\documentclass[accepted]{uai2022} % for initial submission
%\documentclass[accepted]{uai2021} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2021} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2021} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)
%\usepackage{times}  % DO NOT CHANGE THIS
%\usepackage{helvet} % DO NOT CHANGE THIS
%\usepackage{courier}  % DO NOT CHANGE THIS
%\usepackage[hyphens]{url}  % DO NOT CHANGE THIS
%\usepackage{graphicx} % DO NOT CHANGE THIS
%\urlstyle{rm} % DO NOT CHANGE THIS
%\def\UrlFont{\rm}  % DO NOT CHANGE THIS
%\usepackage{natbib}  % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
%\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}


% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2021} with \usepackage[nohyperref]{icml2021} above.
\usepackage{hyperref}
\usepackage{algorithmic}
\usepackage{algorithm}
% Attempt to make hyperref and algorithmic work together better:
%\newcommand{\theHalgorithm}{\arabic{algorithm}}


\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}  % use 8-bit T1 fonts
%\usepackage{hyperref}    % hyperlinks
\usepackage{url}      % simple URL typesetting

\usepackage{amsfonts}    % blackboard math symbols
\usepackage{nicefrac}    % compact symbols for 1/2, etc.

\usepackage{shortcuts}
\usepackage{collcell}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amssymb}
\theoremstyle{plain}
\newtheorem{definition}{Definition}
\newtheorem{proposition}{Proposition}
\newtheorem{assumption}{Assumption}
\newtheorem{remark}{Remark}
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
%\usepackage{wrapfig}
\usepackage{url}
\usepackage{color, colortbl}
\definecolor{Gray}{gray}{0.9}
%\usepackage{isomath}
%Following defines command for two vertical bars used for KL-divergence
\usepackage{mathtools}
\usepackage[T1]{fontenc}
%\usepackage{lmodern}
\usepackage{multirow}
\usepackage{mathabx}
\usepackage{dsfont}
\usepackage[
colorinlistoftodos,
%  disable,
textsize=footnotesize,
]{todonotes} 
\newcommand{\KA}[1]{{\color{blue}#1}} 
\usepackage[%
%capitalize,
sort&compress
]{cleveref}
\Crefname{assumption}{Assumption}{Assumption}

\usepackage{wrapfig}
\usepackage{xspace}
\newcommand{\MMCCP}{\mbox{MMCCP}\xspace}

\hypersetup{
    colorlinks = true,
    citecolor  = blue,
    linkcolor  = blue
}



%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Generalizing Off-Policy Learning under Sample Selection Bias (Supplementary material)}

% The standard author block has changed for UAI 2021 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \href{mailto:<thatt@ethz.ch>?Subject=GenPl}{
\author[1]{Tobias Hatt}
\author[1]{Daniel Tschernutter}
\author[1,2]{Stefan Feuerriegel}
%\author[3]{Further~Coauthor}
%\author[1]{Further~Coauthor}
%author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    ETH Zurich\\
    Switzerland
}
\affil[2]{%
    LMU Munich\\
    Germany
}
  
\begin{document}
\maketitle

\onecolumn
\appendix
\section{Mathematical Appendix}\label{apx:proofs}
\subsection{Proof of Proposition 1}\label{apx:proof_selection_ratio}
We know that, with the Radon-Nikod\'{y}m derivative $R = \text{d}\mathbb{P}/\text{d}\mathbb{P}_{\text{Train}}$,
\begin{align}\setcounter{equation}{30}
	V(\pi) = \mathbb{E}[Y(\pi)]
	= \mathbb{E}_{\text{Train}}[R\,Y^\pi],
\end{align}
where
\begin{align}
	&R = \frac{\text{d}\mathbb{P}(X,T,Y)}{\text{d}\mathbb{P}_{\text{Train}}(X,T,Y)} = \frac{\text{d}\mathbb{P}(X,T,Y)}{\text{d}\mathbb{P}(X,T,Y\mid S=1)}\\
	&= \frac{\text{d}\mathbb{P}(X,T,Y)}{\text{d}\mathbb{P}(X,T,Y,S=1)}\Prb{S=1} = \frac{\Prb{S=1}}{\mathbb{P}(S=1\mid X,T,Y)}.
\end{align}
\qed

\textbf{Remark 1.} With the above result for the Radon-Nikod\'{y}m derivative, we can see the effect of the selection variable $S$: If $S$ does not depend on $X$, $T$, and $Y$, then $R=1$. Therefore, $\mathbb{P}$ would be identical to $\mathbb{P}_{\textup{Train}}$ and, as a consequence, the policy value on the target population, \ie, $V_{\text{Target}}(\pi)$, would coincide with the policy value on the training data, \ie, $\mathbb{E}_{\textup{Train}}[Y^\pi]$. If, however, $S$ depends on $X$, $T$, and $Y$, then the policy value on the target population does not coincide with the policy value on the training data and, therefore, $V_{\text{Target}}(\pi)\neq \mathbb{E}_{\textup{Train}}[Y^\pi]$.


%\subsection{Calibration of $\mathbf{\Gamma}$ and $\mathbf{\Prb{S=1}}$}
%In this section, we discuss two approaches to calibrate the parameters $\Gamma$ and $\Prb{S=1}$ in (11), which are context-dependent: (i)~Practitioner calibration with domain knowledge and (ii)~data-driven calibration.
%
%\textbf{(i)~Practitioner calibration:} This approach is based on domain knowledge of a practitioner. Practitioners generally have domain knowledge on the variables that impact selection into training data. First, $\Prb{S=1}$, the population probability of inclusion, needs to be quantified. If the study is randomized, a value $\approx 1/2$ is reasonable. Second, $\Gamma$, the largest deviation from $\Prb{S=1}$, needs to be quantified. Practitioners may choose larger values of $\Gamma$ in case of high uncertainty. Our framework allows a practitioner-friendly choice of calibration parameters. In fact, both questions may be simply answered using domain knowledge.
%
%\textbf{(ii)~Data-driven calibration:} Although our framework enables practitioners to choose appropriate calibration parameters, we provide a fully data-driven approach for calibrating $\Gamma$ and $\Prb{S=1}$. To this end, we consider a setting in which samples from \emph{one} of the covariates of the target population are provided. This is reasonable, since we often have limited understanding of the target population and, for instance, know covariates such as the distribution of gender or age in the target population. Once we are given one covariate, \eg, $x_{\text{age}}$, we proceed in two steps: (1)~For calibrating $\Prb{S=1}$, we approximate $\Prb{S=1\mid X, Y, T}$ via an estimate of $\Prb{S=1\mid x_{\text{age}}}$ and, based on this, we approximate $\Prb{S=1}$ by averaging over $x_{\text{age}}$, \ie, $\frac1n\sm i n \Prb{S_i=1\mid x_{\text{age}, i}}$. (2)~For calibrating $\Gamma$, we take the maximum of the odds-ratio in (11) with the above estimates for $\Prb{S=1}$ and $\Prb{S=1\mid x_{\text{age}}}$ plugged in, which yields a value for $\Gamma$. We use this data-driven calibration procedure in our experiments (Section 5).
%
%In case the uncertainty regarding the calibration parameters remains high, large values for $\Gamma$ can be chosen, yielding a wide uncertainty set and conservative policies.


\subsection{Proof of Theorem 1}\label{apx:proof_generalization_bound}
%We first prove the following Lemma that is used to prove \Cref{thm:gen_bound}.

%\textbf{Lemma A1.} (Uniform convergence of policy function $\pi$ over envelope class$\cl F$.) Let $f(\pi) \leq \left\|F\right\|_2\leq B$ be a bound on the envelope function $f\in\cl F$. Then for $n$ large enough, there exists a universal constant $K^\Pi$ that depends only on the VC-major dimension of $\pi$, such that with probability at least $1-\delta$,
%\begin{equation}
%	\sup_{f\in \cl F}\bigg\lvert\frac1n \sm i n (f_i(\pi) - \Eb{f(\pi)})\bigg\rvert \leq \frac92 B K^\Pi \sqrt{\frac{\text{log}(5/\delta)}{n}}.
%\end{equation}

%\begin{proof}
%	We follow the proof of Lemma EC.2 of \citet{kallus2020minimax}. We first bound the deviations uniformly over the policy class and introduce the following empirical processes,
%	\begin{equation}
%		M=\sup_{f\in \cl F}\bigg\lvert \sm i n (f_i(\pi) - \Eb{f(\pi)})\bigg\rvert, \quad L=\sup_{f\in \cl F}\bigg\lvert \sm i n \epsilon_i f_i(\pi)\bigg\rvert.
%	\end{equation}
%	By a standard symmetrization argument, applying Jensen's inequality for the convex function $\Psi$ of the symmetrized process (\eg, Theorem 2.2 of \citet{pollard1990empirical}), we may bound the Orlicz norm of the maxima of the empirical process by the symmetrized process, conditional on the training data:
%	\begin{equation}
%		\Eb{\Psi(M)} \leq \Eb{\Psi(2L)}.
%	\end{equation}
%	Taking Orlicz norms with $\Psi(t)=\frac15 \text{exp}(t^2)$,we apply a tail inequality on the Orlicz norm of the symmetrized process $\Psi(2L)$, under the assumption of bounded outcomes. Applying Dudley’s inequality to the symmetrized empirical process $L$, (\eg, Theorem 3.5 of \citet{pollard1990empirical}, we have that
%	\begin{equation}
%		\mathbb{E}_{\epsilon}[\text{exp}(L^2/J^2)\mid \cl D] \leq 5, \text{ for } J=9\lVert F\rVert_2\int_{0}^{1}\sqrt{\text{log}(D(\lVert F\rVert_2\zeta, \cl F(X_{1:n})))}d\zeta
%	\end{equation}
%	Then, applying theorem 2.6.9 of \citet{van1996weak}, we have that there exists a universal constant $K^\Pi$ (depending only on the VC-major dimension of the policy), such that
%	\begin{equation}
%		\text{log} D(\lVert F\rVert_2\zeta, \cl F(X_{1:n}, T_{1:n}))\leq K(1/\zeta)^{\frac{2v}{v+2}}.
%	\end{equation}
%	The corresponding Dudley entropy integral is bounded by $\int_0^1\sqrt{K(1/\zeta)^{2v/(v+2)}}d\zeta \leq \sqrt{K}\frac{v+2}{2} = K^\Pi$. By Markov's inequality, we have that
%	\begin{equation}
%		\mathbb{P}(\frac1n L>t)\leq 5\text{exp}(-t^2n/J^2C^2,)
%	\end{equation}
%	so that therefore, 
%	\begin{equation}
%		\frac1n M \leq \frac{9/2CK^\Pi\sqrt{\text{log}(5/\delta)}}{\sqrt{n}}.
%	\end{equation}
%\end{proof}
%\textit{Proof of Theorem 1.}
Let $Z=\frac1n\sm i n R_i^*$. Then, 
\begin{equation}
	V_{\text{Target}}(\pi) \leq \hat{V}_{\text{Target}}^\ast(\pi) + \sup_{\pi\in\Pi}\big\lvert \hat{V}_{\text{Target}}^\ast(\pi) - V_{\text{Target}}(\pi) \big\rvert,
\end{equation}
and
\begin{align} 
	\sup_{\pi\in\Pi}&\big\lvert \hat{V}_{\text{Target}}^\ast(\pi) - V_{\text{Target}}(\pi) \big\rvert\\
	&=\sup_{\pi\in\Pi}\bigg\lvert \frac{\frac1n \sm i n R_i^\ast\psi_i(\pi)}{Z} - \frac{V_{\text{Target}}(\pi)}{Z} + \frac{V_{\text{Target}}(\pi)(1-Z)}{Z} \bigg\rvert\\
	&\leq \frac1Z\sup_{\pi\in\Pi}\bigg\lvert \frac1n \sm i n R_i^\ast\psi_i(\pi) - V_{\text{Target}}(\pi)\bigg\rvert + \sup_{\pi\in\Pi}\frac{C\big\lvert 1-Z\big\rvert}{Z}\label{second_term}.
\end{align}
We let
\begin{equation}
	T = \sup_{\pi\in\Pi}\bigg\lvert \frac1n \sm i n R_i^\ast\psi_i(\pi) - V_{\text{Target}}(\pi)\bigg\rvert.
\end{equation}
Since $\lvert Y\rvert\leq C$ and, therefore, $\lvert\mu_t(x)\rvert\leq C$, $R_i^{\ast} \leq u$, and $1-\eta \geq \pi^b(x)\geq \eta$ (for some $\eta>0$ due to positivity), we have that

\begin{itemize}
	\item[1.)] for $\psi_i^{\mathrm{DM}}(\pi)$ from (8): $T=\sup_{\pi\in\Pi}\bigg\lvert \frac1n \sm i n R_i^\ast(\pi(X_i)\mu_1(X_i) + (1-\pi(X_i))\mu_0(X_i)) - V(\pi)\bigg\rvert$ satisfies bounded differences with $\frac{4Cu}{n}$,
	\item[2.)] for $\psi_i^{\mathrm{NIPW}}(\pi)$ from (9): $T=\sup_{\pi\in\Pi}\bigg\lvert \frac1n \sm i n R_i^\ast(\frac{2W_i^{\mathrm{IPW}}}{\frac1n\sm j n W_j^{\mathrm{IPW}}}(1-2T_i)(1-T_i-\pi(X_i))Y_i) - V(\pi)\bigg\rvert$, satisfies bounded differences with $\frac{4Cu}{n}\frac{1-\eta}{\eta}$,
	\item[3.)] for $\psi_i^{\mathrm{DR}}(\pi)$ from (10): $T=\sup_{\pi\in\Pi}\bigg\lvert \frac1n \sm i n R_i^\ast(\psi_i^{\mathrm{DM}}(\pi) + W_i^{\mathrm{IPW}}(1-2T_i)(1-T_i-\pi(X_i))(Y_i - \mu_{T_i}(X_i))) - V(\pi)\bigg\rvert$, satisfies bounded differences with $\frac{4Cu}{n}\frac{1+\eta}{\eta}$.
	
\end{itemize}

Hence, $T$ satisfies bounded differences with $\frac{4Cu}{n}K_\psi$, where $K_\psi=1$ for $\psi_i^{\mathrm{DM}}(\pi)$, $K_\psi=\frac{1-\eta}{\eta}$ for $\psi_i^{\mathrm{NIPW}}(\pi)$, and $K_\psi=\frac{1+\eta}{\eta}$ for $\psi_i^{\mathrm{DR}}(\pi)$.

Thus, using McDiarmid’s inequality yields


\begin{equation}
	\Prb{T - \mathbb{E}[T]\geq \epsilon} \leq \text{exp}(-\frac{n\epsilon^2}{8C^2u^2K_{\psi}^2}).
\end{equation}


Therefore, we have that
\begin{equation}
	\Prb{T - \mathbb{E}[T]\leq \epsilon} \geq 1-\text{exp}(-\frac{n\epsilon^2}{8C^2u^2K_{\psi}^2}).
\end{equation}


Using $p_1 = \text{exp}(-\frac{n\epsilon^2}{8C^2u^2K_{\psi}^2})$ and, therefore, $\epsilon = 2CuK_\psi\sqrt{\frac{2\text{log}(1/p_1)}{n}}$, we have that with probability at least $1-p_1$,
\begin{equation}
	T \leq \mathbb{E}[T] + 2CuK_\psi\sqrt{\frac{2\text{log}(1/p_1)}{n}}.
\end{equation}


Since $\mathbb{E}[R^\ast_i\psi_i(\pi)]=V(\pi)$, a standard symmetrization argument yields

\begin{equation}
	\mathbb{E}[T] \leq \mathbb{E}\Big[\frac{1}{2^n}\sum_{\sigma\in\{-1, +1\}^n}\sup_{\pi\in\Pi}\lvert\frac1n \sm i n \sigma_i R^\ast_i\psi_i(\pi)\rvert\Big].
\end{equation}
Then, using the Rademacher comparison theorem (Thm 4.12 in \citet{ledoux2013probability}), this yields

\begin{equation}
	\mathbb{E}[T] \leq 2CuK_\psi\mathbb{E}[\cl{R}_n(\Pi)],
\end{equation}
where $K_\psi$ is from above and depends on whether one uses $\psi_i^{\mathrm{DM}}(\pi)$, $\psi_i^{\mathrm{NIPW}}(\pi)$, or $\psi_i^{\mathrm{DR}}(\pi)$. Moreover, $\cl{R}_n(\Pi)$ satisfies bounded differences with constants $\frac2n$ and, hence, we can again use McDiarmid’s inequality, which yields

\begin{equation}
	\Prb{\mathbb{E}[\cl{R}_n(\Pi)] - \cl{R}_n(\Pi)\geq \epsilon} \leq \text{exp}(\frac{-\epsilon^2 n}{2}).
\end{equation}
Therefore, we have that
\begin{equation}
	\Prb{\mathbb{E}[\cl{R}_n(\Pi)] - \cl{R}_n(\Pi)\leq \epsilon} \geq 1-\text{exp}(\frac{-\epsilon^2 n}{2}).
\end{equation}
Using $p_2 = \text{exp}(\frac{-\epsilon^2 n}{2})$ and, therefore, $\epsilon = \sqrt{\frac{2\text{log}(1/p_2)}{n}}$, we have that with probability at least $1-p_2$,
\begin{equation}
	\mathbb{E}[\cl{R}_n(\Pi)] \leq \cl{R}_n(\Pi) + \sqrt{\frac{2\text{log}(1/p_2)}{n}}.
\end{equation}


The second term in \labelcref{second_term} can be bounded using $0 \leq R_i^\ast \leq u$, $\Eb{R^\ast} = 1$, and Hoeffding's inequality:
\begin{equation}
	\Prb{\lvert1-Z\rvert\geq \epsilon} \leq 2\text{exp}(-2\epsilon^2u^{-2}n).
\end{equation}
Therefore, we have that
\begin{equation}
	\Prb{\lvert1-Z\rvert\leq \epsilon} \geq 1-2\text{exp}(-2\epsilon^2u^{-2}n).
\end{equation}
Using $p_3 = 2\text{exp}(-2\epsilon^2u^{-2}n)$ and, therefore, $\epsilon = u\sqrt{\frac{\text{log}(2/p_3)}{2n}}$, we have that with probability at least $1-p_3$,
\begin{equation}
	C\lvert1-Z\rvert\leq Cu\sqrt{\frac{\text{log}(2/p_3)}{2n}}.
\end{equation}
Finally, using that $1/Z\leq 1/l$, we get that with probability at least $1-p_1-p_2-p_3$,
\begin{align}
	\sup_{\pi\in\Pi}\bigg\lvert \hat{V}_{\text{Target}}^\ast(\pi) - V_{\text{Target}}(\pi) \bigg\rvert
	\leq 2C\frac{u}{l}K_\psi \cl{R}_n(\Pi) + 2C\frac{u}{l}K_\psi \sqrt{\frac{2\text{log}(1/p_2)}{n}} + 2C\frac{u}{l}K_\psi\sqrt{\frac{2\text{log}(1/p_1)}{n}}
	+ C\frac{u}{l}\sqrt{\frac{\text{log}(2/p_3)}{2n}}.
	%\frac{9CuK^\psi K^\Pi}{l}\sqrt{\frac{\text{log}(5/p_1)}{n}} + \frac{Cu}{l}\sqrt{\frac{\text{log}(1/p_2)}{2n}}.
\end{align}
Let $p_1 = p_2 = \delta/4$ and $p_3 = \delta/2$. Then, using that $K_\psi \geq 1$, the above is bounded by $2C\frac{u}{l}K_\psi\cl{R}_n(\Pi) + 2C\frac{u}{l}K_\psi\sqrt{\frac{18\text{log}(4/\delta)}{n}}$. The proof is completed by recognizing that, since the true $R^\ast\in\cl R$, we have that $\hat{V}_{\text{Target}}^\ast(\pi) \leq \overline{V}_{\text{Target}}(\pi)$. \qed
\begin{remark}
We briefly explain how Theorem~1 is proven in the case in which we do not have access to the true nuisance functions (using the results from \citet{athey2017efficient}).

Let $\tilde{V}_{\text{Target}}^\ast(\pi)$ be the estimator which uses the true nuisance functions and $\hat{V}_{\text{Target}}^\ast(\pi)$ the estimator which uses estimated nuisance functions. Let $Z=\frac1n\sm i n R_i^*$. Then, 
\begin{align}
	V_{\text{Target}}(\pi) &\leq 
	\hat{V}_{\text{Target}}^\ast(\pi) + 
	\sup_{\pi\in\Pi}\big\lvert \hat{V}_{\text{Target}}^\ast(\pi) -
	\tilde{V}_{\text{Target}}^\ast(\pi) +
	\tilde{V}_{\text{Target}}^\ast(\pi) -
	V_{\text{Target}}(\pi) \big\rvert\\
	&\leq \hat{V}_{\text{Target}}^\ast(\pi) + 
	\sup_{\pi\in\Pi}\big\lvert \tilde{V}_{\text{Target}}^\ast(\pi) -
	\hat{V}_{\text{Target}}^\ast(\pi)\big\rvert +
	\sup_{\pi\in\Pi}\big\lvert\tilde{V}_{\text{Target}}^\ast(\pi) -
	V_{\text{Target}}(\pi) \big\rvert.
\end{align}
The term $\sup_{\pi\in\Pi}\big\lvert\tilde{V}_{\text{Target}}^\ast(\pi) -
	V_{\text{Target}}(\pi) \big\rvert$ can be bounded analogously to the proof above. The term 
	$\sup_{\pi\in\Pi}\big\lvert \tilde{V}_{\text{Target}}^\ast(\pi) -
	\hat{V}_{\text{Target}}^\ast(\pi)\big\rvert$ can be bounded using Lemma~4 in \citet{athey2017efficient}. The result then follows analogously to the proof above.
\end{remark}


\subsection{Proof of Theorem 2}\label{apx:proof_closed_form}
Let $(i)$ denote the $i$th index of the increasing order statistics, an ordering where $\psi_{(1)}(\theta) \leq \ldots \leq \psi_{(n)}(\theta)$. Hence, we address the following optimization problem
\begin{align}
	\max_R \frac{\sm i n R_{(i)}\psi_{(i)}(\theta)}{\sm i n R_{(i)}} \quad \text{s. t.,}
	\quad l\leq R_{(i)}\leq u, \, R_{(i)} \geq 0, \forall i={1, \ldots, n}.
\end{align}
We derive a closed-form solution for any of the $\psi_i(\theta)$ in (8), (9), and (10), which generalizes the solution of \citet{kallus2018confounding} to all standard policy learning methods. Since the constraint on $R$ is linear, the above optimization problem is a linear fractional program. Hence, we can use the Charnes-Cooper transformation \citep{charnes1962programming} with $\tilde{R}_{(i)} = R_{(i)}/\sm i n R_{(i)}$ and $t = 1/\sm i n R_{(i)}$, which yields
\begin{align}
	\max_{\tilde{R}} \sm i n \tilde{R}_{(i)}\psi_{(i)}(\theta)\quad
	\text{s.t.}\quad t\,l\leq \tilde{R}_{(i)}\leq t\,u, \, \tilde{R}_{(i)}\geq 0, \, \sm i n \tilde{R}_{(i)} = 1, \, t\geq 0, \, \forall i={1, \ldots, n}.
\end{align}
The corresponding dual problem has the dual variables $\lambda\in\mathbb{R}$ for the normalization constraint and $w,v\in\mathbb{R}_+^n$ for the box constraints on the normalized Radon-Nikod\'{y}m derivative. It is given by
\begin{align}
	\min_{\lambda, v,w}\lambda,\, \text{s.t. }\, \sm i n \big(w_{(i)}\,l - v_{(i)}\,u\big) \geq 0,\,
	\lambda + v_{(i)} - w_{(i)} \geq \psi_{(i)}(\theta), \forall i={1, \ldots, n},\,
	\lambda\in \mathbb{R}, v, w\in \mathbb{R}_+^n.
\end{align}
At the optimal solution, at most one of the primal weight bound constraints (for nontrivial bounds $l < u$), $t\,l\leq \tilde{R}_{(i)}$ or $\tilde{R}_{(i)}\leq t\,u$, will be tight. Hence, by complementary slackness, at most one of the corresponding dual variables is nonzero, \ie, either $v_{(i)}$, $w_{(i)}$, or none is nonzero. Moreover, $t = 0$ is infeasible, since $t=0$ would imply $\tilde R_{(i)}=0$ for all $i$, which contradicts $\sm i n \tilde{R}_{(i)} = 1$. Hence, $t \neq 0$. At the optimal solution, the constraint $\sm i n \big(w_{(i)}\,l - v_{(i)}\,u\big) \geq 0$ must be active. Otherwise, we can find a $\lambda$ which is smaller than the optimal one but still feasible, and hence contradicts the optimality. As a result, at an optimal solution, we have that:
\begin{align}
	&\sm i n \big(w_{(i)}\,l - v_{(i)}\,u\big) = 0,\label{sub}\\
	&v_{(i)} - w_{(i)} = \psi_{(i)}(\theta)-\lambda, \forall i={1, \ldots, n}\label{sub_sol}.
\end{align}
Since $v, w \geq 0$, we see the following by distinction of cases. If $\psi_{(i)}(\theta)\geq\lambda$, then $w_{(i)}=0$ and $v_{(i)} = \psi_{(i)}(\theta)-\lambda$. If $\psi_{(i)}(\theta)<\lambda$, then $v_{(i)}=0$ and $w_{(i)} = \lambda-\psi_{(i)}(\theta)$.

At optimality, since $(i)$ is the increasing order statistics, there exists some index $k\in\{1, \ldots, n\}$ such that $\psi_{(k)}(\theta) <\lambda \leq \psi_{(k+1)}(\theta)$. Hence, we can substitute the solution from \labelcref{sub_sol} in \labelcref{sub} and obtain the following
\begin{equation}
	\sum_{i=1}^{k} l(\lambda-\psi_{(i)}(\theta)) - \sum_{i=k+1}^n u (\psi_{(i)}(\theta)-\lambda) = 0,
\end{equation}
and, therefore,
\begin{equation}
	\lambda(k) = \frac{l\sum_{i=1}^{k} \psi_{(i)}(\theta) + u\sum_{i=k+1}^n \psi_{(i)}(\theta)}{k\, l +  (n-k)\, u}.
\end{equation}
The optimal $k$ is given by $k^\ast = \inf\{k: \lambda(k) \leq\psi_{(k+1)}(\theta)\}$, which can be seen by the following argument. When $\lambda(k)$ is maximal, we have that $\lambda(k)\geq\lambda(k+1)$. This is equivalent to $\lambda(k)\leq\psi_{(k+1)}(\theta)$, since the following steps are equivalent
\begin{align}
	&0\geq \lambda(k+1) - \lambda(k)\\
	&0\geq \frac{(lk + u(n-k))\lambda(k) + (l-u)\psi_{(k+1)}(\theta)}{l(k+1) + u(n-k-1)} - \lambda(k)\\
	&0\geq (lk + u(n-k))\lambda(k) + (l-u)\psi_{(k+1)}(\theta) - \lambda(k)(l(k+1) + u(n-k-1))\\	
	&0\geq (lk + u(n-k))\lambda(k) + (l-u)\psi_{(k+1)}(\theta) - \lambda(k)(lk + u(n-k)) - \lambda(k)(l - u)\\	
	&0\geq  (l-u)\psi_{(k+1)}(\theta) - \lambda(k)(l - u)\\
	&\lambda(k) \leq  \psi_{(k+1)}(\theta),
\end{align}
where the last inequality switches because we divide by $l-u$ which is negative. Next, we show that if $\lambda(k)\geq\lambda(k+1)$, then  $\lambda(k+1)\geq\lambda(k+2)$. 
\begin{align}
	&\lambda(k+1)\\
	&=\frac{(lk + u(n-k))\lambda(k) + (l-u)\psi_{(k+1)}(\theta)}{l(k+1) + u(n-k-1)}\\
	&\leq\frac{(lk + u(n-k))\psi_{(k+1)}(\theta) + (l-u)\psi_{(k+1)}(\theta)}{l(k+1) + u(n-k-1)}\\
	&= \psi_{(k+1)}(\theta) \leq \psi_{(k+2)}(\theta),
\end{align}
and, since we showed above that $\lambda(k)\geq\lambda(k+1)$ is equivalent to $\lambda(k)\leq\psi_{(k+1)}(\theta)$, we have that $\lambda(k+1) \leq \psi_{(k+2)}(\theta)$ is equivalent to $\lambda(k+1)\geq\lambda(k+2)$. Thus, $k^\ast = \inf\{k: \lambda(k) \leq\psi_{(k+1)}(\theta)\}$. Hence, the solution of the transformed problem is 
$\tilde{R}_{(i)} =\frac{l\mathds{1}({(i)} \leq k^\ast) + u\mathds{1}({(i)} > k^\ast)}{k^\ast\, l +  (n-k^\ast)\, u}$. Then, the solution of the primal problem can be recovered by $R=\frac{1}{t}\tilde{R}$, where $t=1/(k^\ast\, l +  (n-k^\ast)\, u)$.
\qed

\subsection{Proof of Lemma 1}\label{apx:proof_dc_representation}
For the direct method, we have
\begin{align}
	\psi_i^{\mathrm{DM}}(\theta)&=\pi(X_i,\theta){\mu}_1(X_i) + (1-\pi(X_i,\theta)){\mu}_0(X_i)\\
	&=(\tilde g(X_i,\theta)-\tilde h(X_i,\theta)){\mu}_1(X_i) + (1-\tilde g(X_i,\theta)+\tilde h(X_i,\theta)){\mu}_0(X_i).
\end{align}
To derive $g_i$ and $h_i$, we proceed with a case distinction.

\underline{Case 1: ${\mu}_0(X_i)\ge 0$ and ${\mu}_1(X_i)\ge 0$}\\
In this case, we have
\begin{align}
	\psi_i^{\mathrm{DM}}(\theta)&=(\tilde g(X_i,\theta){\mu}_1(X_i)+\tilde h(X_i,\theta){\mu}_0(X_i)+{\mu}_0(X_i))\\
	&-(\tilde h(X_i,\theta){\mu}_1(X_i)+\tilde g(X_i,\theta){\mu}_0(X_i)),
\end{align}
and, hence, the claim follows with
\begin{align}
	g_i(\theta)&=\tilde g(X_i,\theta){\mu}_1(X_i)+\tilde h(X_i,\theta){\mu}_0(X_i)+{\mu}_0(X_i)\\
	h_i(\theta)&=\tilde h(X_i,\theta){\mu}_1(X_i)+\tilde g(X_i,\theta){\mu}_0(X_i).
\end{align}

\underline{Case 2: ${\mu}_0(X_i)<0$ and ${\mu}_1(X_i)\ge 0$}\\
In this case, we have
\begin{align}
	\psi_i^{\mathrm{DM}}(\theta)&=(\tilde g(X_i,\theta){\mu}_1(X_i)+\tilde g(X_i,\theta)\lvert{\mu}_0(X_i)\rvert-\lvert{\mu}_0(X_i)\rvert)\\
	&-(\tilde h(X_i,\theta){\mu}_1(X_i)+\tilde h(X_i,\theta)\lvert{\mu}_0(X_i)\rvert),
\end{align}
and, hence, the claim follows with
\begin{align}
	g_i(\theta)&=\tilde g(X_i,\theta){\mu}_1(X_i)+\tilde g(X_i,\theta)\lvert{\mu}_0(X_i)\rvert-\lvert{\mu}_0(X_i)\rvert\\
	h_i(\theta)&=\tilde h(X_i,\theta)({\mu}_1(X_i)+\lvert{\mu}_0(X_i)\rvert).
\end{align}

\underline{Case 3: ${\mu}_0(X_i)\ge0$ and ${\mu}_1(X_i)<0$}\\
In this case, we have
\begin{align}
	\psi_i^{\mathrm{DM}}(\theta)&=(\tilde h(X_i,\theta)\lvert{\mu}_1(X_i)\rvert+\tilde h(X_i,\theta){\mu}_0(X_i)+{\mu}_0(X_i))\\
	&-(\tilde g(X_i,\theta)\lvert{\mu}_1(X_i)\rvert+\tilde g(X_i,\theta){\mu}_0(X_i)),
\end{align}
and, hence, the claim follows with
\begin{align}
	g_i(\theta)&=\tilde h(X_i,\theta)\lvert{\mu}_1(X_i)\rvert+\tilde h(X_i,\theta){\mu}_0(X_i)+{\mu}_0(X_i)\\
	h_i(\theta)&=\tilde g(X_i,\theta)(\lvert{\mu}_1(X_i)\rvert+{\mu}_0(X_i)).
\end{align}

\underline{Case 4: ${\mu}_0(X_i)<0$ and ${\mu}_1(X_i)<0$}\\
In this case, we have
\begin{align}
	\psi_i^{\mathrm{DM}}(\theta)&=(\tilde h(X_i,\theta)\lvert{\mu}_1(X_i)\rvert+\tilde g(X_i,\theta)\lvert{\mu}_0(X_i)\rvert-\lvert{\mu}_0(X_i)\rvert)\\
	&-(\tilde g(X_i,\theta)\lvert{\mu}_1(X_i)\rvert+\tilde h(X_i,\theta)\lvert{\mu}_0(X_i)\rvert),
\end{align}
and, hence, the claim follows with
\begin{align}
	g_i(\theta)&=\tilde h(X_i,\theta)\lvert{\mu}_1(X_i)\rvert+\tilde g(X_i,\theta)\lvert{\mu}_0(X_i)\rvert-\lvert{\mu}_0(X_i)\rvert\\
	h_i(\theta)&=\tilde g(X_i,\theta)\lvert{\mu}_1(X_i)\rvert+\tilde h(X_i,\theta)\lvert{\mu}_0(X_i)\rvert.
\end{align}

For the normalized inverse propensity weights method, we have
\begin{align}
	\psi_i^{\mathrm{NIPW}}(\theta) =\frac{2{W}_i^{\mathrm{IPW}}}{\frac{1}{n}\sm j n {W}_j^{\mathrm{IPW}}}(1-2T_i)(1-T_i-\pi(X_i,\theta))Y_i.
\end{align}
Again, by a case distinction, we obtain for $T_i=1$:
\begin{align}
	\psi_i^{\mathrm{NIPW}}(\theta)&=\frac{2{W}_i^{\mathrm{IPW}}}{\frac{1}{n}\sm j n {W}_j^{\mathrm{IPW}}}\pi(X_i,\theta)Y_i\\
	&=\frac{2{W}_i^{\mathrm{IPW}}}{\frac{1}{n}\sm j n {W}_j^{\mathrm{IPW}}}\tilde g(X_i,\theta)Y_i-\frac{2{W}_i^{\mathrm{IPW}}}{\frac{1}{n}\sm j n {W}_j^{\mathrm{IPW}}}\tilde h(X_i,\theta)Y_i,
\end{align}
and, hence, the claim follows with
\begin{align}
	g_i(\theta)&=\frac{2{W}_i^{\mathrm{IPW}}}{\frac{1}{n}\sm j n {W}_j^{\mathrm{IPW}}}\tilde g(X_i,\theta)Y_i\\
	h_i(\theta)&=\frac{2{W}_i^{\mathrm{IPW}}}{\frac{1}{n}\sm j n {W}_j^{\mathrm{IPW}}}\tilde h(X_i,\theta)Y_i.
\end{align}
For $T_i=0$, we derive,
\begin{align}
	\psi_i^{\mathrm{NIPW}}(\theta)&=\frac{2{W}_i^{\mathrm{IPW}}}{\frac{1}{n}\sm j n {W}_j^{\mathrm{IPW}}}(1-\pi(X_i,\theta))Y_i\\
	&=\frac{2{W}_i^{\mathrm{IPW}}}{\frac{1}{n}\sm j n {W}_j^{\mathrm{IPW}}}(1-\tilde g(X_i,\theta)+\tilde h(X_i,\theta))Y_i,
\end{align}
and, hence, the claim follows with
\begin{align}
	g_i(\theta)&=\frac{2{W}_i^{\mathrm{IPW}}}{\frac{1}{n}\sm j n {W}_j^{\mathrm{IPW}}}(\tilde h(X_i,\theta)+1)Y_i\\
	h_i(\theta)&=\frac{2{W}_i^{\mathrm{IPW}}}{\frac{1}{n}\sm j n {W}_j^{\mathrm{IPW}}}\tilde g(X_i,\theta)Y_i.
\end{align}

For the doubly robust method, we can use the decomposition of the direct method.
By defining
\begin{align}
	\nu_i=(1-2T_i)(Y_i - {\mu}_{T_i}(X_i)),
\end{align}
and rewriting
\begin{align}
	{W}_i^{\mathrm{IPW}}(1-2T_i)(1-T_i-\pi(X_i,\theta))(Y_i - {\mu}_{T_i}(X_i))
	={W}_i^{\mathrm{IPW}}(1-T_i-\pi(X_i,\theta))\nu_i,
\end{align}
we proceed again by a case distinction for the rest. For $\nu_i\ge 0$ we have
\begin{align}
	{W}_i^{\mathrm{IPW}}(1-T_i-\pi(X_i,\theta))\nu_i&=
	{W}_i^{\mathrm{IPW}}(1-T_i)\nu_i+{W}_i^{\mathrm{IPW}}\nu_i\tilde h(X_i,\theta)\\
	&-{W}_i^{\mathrm{IPW}}\nu_i\tilde g(X_i,\theta).
\end{align}
For $\nu_i<0$, we have that
\begin{align}
	{W}_i^{\mathrm{IPW}}(1-T_i-\pi(X_i,\theta))\nu_i&=
	{W}_i^{\mathrm{IPW}}(1-T_i)\nu_i+{W}_i^{\mathrm{IPW}}\lvert\nu_i\rvert\tilde g(X_i,\theta)\\
	&-{W}_i^{\mathrm{IPW}}\lvert\nu_i\rvert\tilde h(X_i,\theta),
\end{align}
and, hence, the claim follows.\qed

\subsection{Proof of Theorem 3}\label{apx:proof:dc_representation_wcp}
By Theorem 2, we know that
\begin{align}
	\max\limits_{R\in \cl R} \frac{\sm i n R_i\psi_i(\theta)}{\sm i n R_i}=\max\limits_{R\in \mathcal{S}\subseteq\mathcal{R}} \frac{\sm i n R_i\psi_i(\theta)}{\sm i n R_i},
\end{align}
with $\lvert \mathcal{S} \rvert=n+1<\infty$. Hence, we can write the inner maximum as
\begin{align}
	\max\limits_{j\in \cl J} \frac{\sm i n R_i^j\psi_i(\theta)}{\sm i n R_i^j},
\end{align}
where $R^j$ for $j\in\mathcal{J}=\{0,\dots,n\}$ denotes one of the $n+1$ possible assignments of $l$ and $u$, \ie, for $j=0$, it is the vector with all entries equal to $l$; for $j=1$, it is the vector with all entries equal to $l$ except for the first one being $u$ and so on. By defining the convex functions
\begin{align}
	G^j(\theta)&=\frac{\sm i n R_i^j g_i(\theta)}{\sm i n R_i^j},\\
	H^j(\theta)&=\frac{\sm i n R_i^j h_i(\theta)}{\sm i n R_i^j},
\end{align}
we have
\begin{align}
	\frac{\sm i n R^j_i\psi_i(\theta)}{\sm i n R^j_i}=G^j(\theta)-H^j(\theta),
\end{align}
and, hence,
\begin{align}
	\max\limits_{j\in \cl J} \frac{\sm i n R^j_i\psi_i(\theta)}{\sm i n R^j_i}&=\max\limits_{j\in \cl J} \{G^j-H^j\}\\
	&=\max\limits_{j\in \cl J} \{G^j+\sum\limits_{\substack{k=1\\k\neq j}}^{n}H^k-\sum\limits_{k=1}^{n}H^k\}\\
	&=\underbrace{\max\limits_{j\in \cl J} \{G^j+\sum\limits_{\substack{k=1\\k\neq j}}^{n}H^k\}}_{=\vcentcolon g}-\underbrace{\sum\limits_{k=1}^{n}H^k}_{=\vcentcolon h}.
\end{align}
Note that $g$ and $h$ are convex as the sum of convex functions is convex and the maximum of convex functions is convex. Now, $g$ can be rewritten as follows
\begin{align}
	g(\theta)&=\max\limits_{j\in \cl J} \{G^j+\sum\limits_{\substack{k=1\\k\neq j}}^{n}H^k\}=\max\limits_{j\in \cl J} \{G^j-H^j+\sum\limits_{k=1}^{n}H^k\}\\
	&=\max\limits_{j\in \cl J} \{G^j-H^j\}+\sum\limits_{k=1}^{n}H^k=\max\limits_{R\in \cl R} \left\{\frac{\sm i n R_i\psi_i(\theta)}{\sm i n R_i}\right\}+\sum\limits_{k=1}^{n}H^k\\
	&=\max\limits_{R\in \cl R} \left\{\frac{\sm i n R_i\psi_i(\theta)}{\sm i n R_i}\right\}+h.
\end{align}
Furthermore, we can use the special structure of the worst case policy solutions to rewrite $h$ as
\begin{align}
	h&=\sum\limits_{k=1}^{n}H^k=\sum\limits_{k=1}^{n}\frac{\sm i n R_i^k h_i(\theta)}{\sm i n R_i^k}=\sum\limits_{k=1}^{n}\sm i n \frac{R_i^k}{\sm i n R_i^k} h_i(\theta)\\
	&=\sum\limits_{i=1}^{n} h_i(\theta)\underbrace{\sum\limits_{k=1}^{n}\frac{R_i^k}{\sm i n R_i^k}}_{=\vcentcolon c_i}=\sum\limits_{i=1}^{n} h_i(\theta)c_i,
\end{align}
where $c_i$ can be calculated as 
\begin{align}
	c_i=l\left(\sum\limits_{k=1}^{i}\frac{1}{(n-k+1)l+(k-1)u}\right)+u\left(\sum\limits_{k=i+1}^{n}\frac{1}{(n-k+1)l+(k-1)u}\right),
\end{align}
for all $i$ by combinatorial arguments.

\subsection{Proof of Theorem 4}\label{apx:proof_convergence}
The convergence analysis of MMCCP follows from the convergence analysis of the DC-algorithm~(DCA) \citep{tao1997convex}. More precisely, DCA for minimizing a function $f=g-h$ reduces to the convex-concave procedure in case that the function $h$ is differentiable \citep{nhat2018accelerated, sriperumbudur2009convergence}. This is exactly what we have in our case, as by our assumption on $\tilde g$ and $\tilde h$ we have that each $h_i$ (as a linear combination of differentiable functions) is differentiable and, hence, $h$ is differentiable.

Now, 1. in Theorem 4 directly follows from (i) of Theorem 3 in \citet{tao1997convex}. For 2. in Theorem 4, we have to prove the following:
\begin{enumerate}
	\item $\inf\limits_{\theta \in \Theta} \max\limits_{R\in \cl R} \frac{\sm i n R_i\psi_i(\theta)}{\sm i n R_i}$ is finite.\label{item:one}
	\item It holds $\rho(g)+\rho(h)>0$.\label{item:two}
	\item $(\theta^k)_{k\in\mathbb{N}}$ is bounded.\label{item:three}
\end{enumerate}

Ad \Cref{item:one}: Since $\lvert Y\rvert\le C$, we have that  $\lvert{\mu}_t(X_i)\rvert\leq C$. Also, the rest of the terms involved in each of the three cases for $\psi_i$ are bounded constants, and $l\le R_i\le u$ for all $i\in\{1,\dots,n\}$. Hence, since $\pi(\cdot,\theta)\in[0,1]$, we have that \Cref{item:one} holds true.

Ad \Cref{item:two}: For all $i\in\{1,\dots,n\}$, we have in each of the three cases for $\psi_i$, that $h_i$ is, up to a constant, a linear combination of $\tilde g$ and $\tilde h$ with positive weights. By our assumptions, we have that $\rho(\tilde g)>0$ and $\rho(\tilde h)>0$ and, hence, $\rho(h_i)>0$. By Theorem 3, we have that $h=\sm i n h_ic_i$ with non-negative weights $c_i$ that are not all zero (indeed, $\sm i n c_i=n$ by the definition of $c_i$), which yields $\rho(h)>0$. \Cref{item:two} follows by observing that $\rho(g)\ge0$.

Ad \Cref{item:three}: Follows directly from Assumption 1.

Then, 2. in Theorem 4 follows by (iii) and (iv) of Theorem 3 in \citet{tao1997convex}.\qed

\section{Details on Covariates in the ACTG 175 Study}\label{apx:covarites_actg}

The ACTG 175 study assigned four treatments randomly to 2,139 subjects with human immunodeficiency virus (HIV) type 1, whose CD4 counts were 200--500 cells/$\text{mm}^3$. The four treatments that were compared are the zidovudine (ZDV) monotherapy, the didanosine (ddI) monotherapy, the ZDV combined with ddI, and the ZDV combined with zalcitabine (ZAL).

There are 5 continuous covariates: age (year), weight (kg, coded as wtkg), CD4 count (cells/$\text{mm}^3$) at baseline, Karnofsky score (scale of 0--100, coded as karnof), CD8 count (cells/$\text{mm}^3$) at baseline. They are centered and scaled before further analysis. In addition, there are 7 binary variables: gender ($1 =$ male, $0 =$ female), homosexual activity (homo, $1 =$ yes, $0 =$ no), race ($1 =$ nonwhite, $0 =$ white), history of intravenous drug use (drug, $1 =$ yes, $0 =$ no), symptomatic status (symptom, $1 =$ symptomatic, $0 =$ asymptomatic), antiretroviral history (str2, $1 =$ experienced, $0 =$ naive) and hemophilia (hemo, $1 =$ yes, $0 =$ no).

\section{Assumption 1 for Linear Policies}\label{apx:dc_representation_linear}
Linear policies are defined by $\pi(X,\theta)=\sigma(\theta^\intercal X)$, where $\sigma(z)=\min(1,\max(z,0))$. A DC-representation for $\sigma(z)$, with $z=\theta^\intercal X$, is given by
\begin{align}
	\tilde g_{\mathrm{Lin}}(z)&=\max(z,0),\\
	\tilde h_{\mathrm{Lin}}(z)&=\max(\max(z,0)-1,0).
\end{align}
It is straightforward to check that both functions are convex. Again, they can be made strongly convex by adding $\frac{\lambda}{2}z^2$ to both functions. Note, however, that $\tilde g_{\mathrm{Lin}}$ is not differentiable at $0$ and $\tilde h_{\mathrm{Lin}}$ is not differentiable at $1$ (note that $\tilde h_{\mathrm{Lin}}(z)=\max(z-1,0)$). As a remedy, one can set $\Theta_i^\epsilon=\{\theta\in\mathbb{R}^d: \epsilon\le\theta^\intercal X_i\le1-\epsilon\}$ for an $\epsilon>0$ and define $\Theta^\epsilon=\bigcap_{i=1}^{n} \Theta_i^\epsilon$. The intersection has to be nonempty to make this approach work.

\section{Implementation Details}\label{apx:implementation_details}
Our code is available at \url{github.com/tobhatt/GeneralOPL}. For our experiments, we used the policy class of logistic policies as introduced in the main paper. To fulfill Assumption~1, we choose $\Theta$ to be a hypercube with large bounds to ensure a large enough search space, \ie, $\Theta=[-10{,}000;\,10{,}000]^d$. In order to solve the subproblems in MMCCP, we draw upon the L-BFGS-B algorithm implemented in the open-source Python library SciPy. At this point, we note that the subproblems are convex but not necessarily differentiable, as the point-wise maximum of differentiable functions is not necessarily differentiable. However, logistic policies are continuously differentiable and the above choice for $\Theta$ is compact. Hence, the functions $\psi_i(\theta)$ are Lipschitz and, thus,
\begin{align}
	\label{eq:max_term}
	\max_{R\in \cl{R}} \frac{\sm i n R_i\psi_i(\theta)}{\sm i n R_i}
\end{align}
is Lipschitz as the point-wise maximum of Lipschitz functions. By Rademacher's theorem, \labelcref{eq:max_term} is therefore almost everywhere differentiable. The points $\theta$ where \labelcref{eq:max_term} is not differentiable are given by the points in which the maximizing argument $R$ changes. Due to this fact, we find empirically that L-BFGS-B can efficiently solve these subproblems. The rest of the parameters are set as follows. The parameter for the stopping criterion, $\delta_\mathrm{tol}$, is set to $10^{-4}$. In order to make $\tilde g$ and $\tilde h$ strongly convex, $\lambda$ is set to $10^{-3}$. In every run, the starting points are initialized via a normal distribution, \ie, $\theta^0\sim\mathcal{N}_d(\vc{0}_d, 0.1\cdot\vc{I}_d)$. For each method, we ran our algorithm 5 times on the datasets.

We ran all of our experiments on a server with two 16-core Intel Xeon Gold 6242 processors, each running at 2.8\,GHz, and 192\,GB of RAM.

\section{Results for GenDM and GenNIPW on ACTG 175 Study}\label{apx:res_actg}
We present the results on the ACTG 175 study for our method GenDM, which uses $\psi_i^{\mathrm{DM}}(\pi)$ from (8) and our method GenNIPW, which uses $\psi_i^{\mathrm{NIPW}}(\pi)$ from (9). Analogously to Section 5.2, we study the percentage of patients that are treated (\ie, $\pi(X)>0.5$) for varying $\Gamma$. The results are presented in \Cref{apx:fig_clinical_res}. Similar to the results for GenDR in Section 5.2, we find that compared to the baseline policy, our policy treats fewer patients for increasing $\Gamma$. GenNIPW shows little variance across several runs on the dataset. For each run, GenNIPW obtains different, but similar $\theta$. However, the percentage of patients treated remains consistent across different runs.
\begin{figure}[t]%{r}{0.5\textwidth}
	%\vspace{-10pt}
	\centering
	\scalebox{0.45}{\includegraphics{clinical_results.pdf}}
	%\vspace{-1\baselineskip}
	\caption{\footnotesize Percentage of patients with $\pi(X)>0.5$ for our GenDM and GenNIPW methods. Fewer patients are treated for increasing $\Gamma$.}\label{apx:fig_clinical_res}%
	%\vspace{-1.5em}
\end{figure}

\bibliography{library}

\end{document}
