%%%%%%%% ICML 2024 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%

\documentclass{article}

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{bbm}

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2024} with \usepackage[nohyperref]{icml2024} above.
\usepackage{hyperref}


% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}

% Use the following line for the initial blind version submitted for review:
\usepackage[]{icml2024}

% If accepted, instead use the following line for the camera-ready submission:
% \usepackage[accepted]{icml2024}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[textsize=tiny]{todonotes}

\newcommand{\vl}[1]{\textcolor{orange}{[VL: #1]}}

\newcommand{\eli}[1]{\textcolor{orange}{[Eli: #1]}}


% The \icmltitle you define below is probably too long as a header.
% Therefore, a short form for the running title is supplied here:
\icmltitlerunning{Optimizing Language Models for Human Preferences is a Causal Inference Problem}

\begin{document}

\twocolumn[
\icmltitle{Optimizing Language Models for Human Preferences is a Causal Inference Problem}

% It is OKAY to include author information, even for blind
% submissions: the style file will automatically remove it for you
% unless you've provided the [accepted] option to the icml2024
% package.

% List of affiliations: The first argument should be a (short)
% identifier you will use later to specify author affiliations
% Academic affiliations should list Department, University, City, Region, Country
% Industry affiliations should list Company, City, Region, Country

% You can specify symbols, otherwise they are numbered in order.
% Ideally, you should not use this facility. Affiliations will be numbered
% in order of appearance and this is the preferred way.
\icmlsetsymbol{equal}{*}

\begin{icmlauthorlist}
% \icmlauthor{Firstname1 Lastname1}{equal,yyy}
% \icmlauthor{Firstname2 Lastname2}{equal,yyy,comp}
% \icmlauthor{Firstname3 Lastname3}{comp}
% \icmlauthor{Firstname4 Lastname4}{sch}
% \icmlauthor{Firstname5 Lastname5}{yyy}
% \icmlauthor{Firstname6 Lastname6}{sch,yyy,comp}
% \icmlauthor{Firstname7 Lastname7}{comp}
% \icmlauthor{Firstname8 Lastname8}{sch}
% \icmlauthor{Firstname8 Lastname8}{yyy,comp}
\icmlauthor{Victoria Lin}{cmu}
\icmlauthor{Eli Ben-Michael}{cmu}
\icmlauthor{Louis-Philippe Morency}{cmu,meta}
\end{icmlauthorlist}

% \icmlaffiliation{yyy}{Department of XXX, University of YYY, Location, Country}
% \icmlaffiliation{comp}{Company Name, Location, Country}
% \icmlaffiliation{sch}{School of ZZZ, Institute of WWW, Location, Country}

\icmlaffiliation{cmu}{Carnegie Mellon University, Pittsburgh, PA, USA}
\icmlaffiliation{meta}{Meta Research, Pittsburgh, PA, USA}

% \icmlcorrespondingauthor{Firstname1 Lastname1}{first1.last1@xxx.edu}
% \icmlcorrespondingauthor{Firstname2 Lastname2}{first2.last2@www.uk}\
\icmlcorrespondingauthor{Victoria Lin}{victoria@stat.cmu.edu}

% You may provide any keywords that you
% find helpful for describing your paper; these are used to populate
% the "keywords" metadata in the PDF but will not be shown in the document
% \icmlkeywords{Machine Learning, ICML}
\icmlkeywords{causal inference, optimization, human feedback, language modeling}

\vskip 0.3in
]

% this must go after the closing bracket ] following \twocolumn[ ...

% This command actually creates the footnote in the first column
% listing the affiliations and the copyright notice.
% The command takes one argument, which is text to display at the start of the footnote.
% The \icmlEqualContribution command is standard text for equal contribution.
% Remove it (just {}) if you do not need this facility.

\printAffiliationsAndNotice{}  % leave blank if no need to mention equal contribution
% \printAffiliationsAndNotice{\icmlEqualContribution} % otherwise use the standard text.


\begin{abstract}
This document provides a basic paper template and submission guidelines.
Abstracts must be a single paragraph, ideally between 4--6 sentences long.
Gross violations will trigger corrections at the camera-ready phase.
\end{abstract}

\section{Introduction}

With the rising popularity of large language models (LLMs), there is great interest in methods that help language models learn (and generate texts according to) human preferences. Reinforcement learning from human feedback (RLHF) saw initial widespread use, but in recent months, works that bypass the requirement of reward modeling have emerged, most notably Direct Preference Optimization (DPO).

In this paper, we show that optimizing language models according to human feedback is actually a causal inference problem. That is, how do we intervene on the text distribution of the generating model to best \textit{cause} an optimal outcome (i.e., to maximize the generation of human-preferred texts)? 

Drawing on this formulation, we demonstrate that a doubly robust solution (Causal Preference Optimization, or CPO) to this optimization problem can be derived, analogous to the provably bias-reducing doubly robust estimators of statistical causal inference. We further illustrate that the prevailing LLM optimization methods, RLHF and DPO, are causally valid solutions to the same optimization problem; mirror the traditional causal inference notions of IPW and outcome model-based estimators; and in fact can be viewed as special cases of CPO.

We evaluate the doubly robust CPO against its IPW and outcome model-based ablations and find that it outperforms them in both automatic and human evaluations of preferred texts...

\section{Related Work [0.5-1 page]}

\subsection{Language model optimization}

\subsection{Causal inference and doubly robust estimators}

\section{A Causal Formulation for Language Model Optimization}

When a language model $f$ is trained or fine-tuned to generate texts that are consistent with human preferences, the implicit goal can be seen as optimizing texts $X \sim P^f$ with respect to some outcome $Y$. In current practice, this outcome $Y$ is typically an indicator of whether a user preferred or did not prefer a text previously generated by the model, under a forced-choice data setting in which the user is shown two possible generations. 

We propose a more general formulation of the optimization problem that bypasses the requirement of forced-choice data. Instead, we consider a data format in which $Y$ is any general reaction of the user to the texts (e.g., ratings, either binary or scalar).

Now, let $\mathcal{X}$ be the space of texts, and let $\mathcal{G}$ be the space of potential outcomes (under the potential outcomes framework of causal inference) for those texts. Then the potential outcomes are given by $\{Y(x) \; | \; x \in \mathcal{X}\} \sim \mathcal{G}$, and the optimization problem for language model $f$ becomes:
\begin{equation}
\label{eq:true_optimization}
    \arg\max_{f} \mathbb{E}_{X \sim P^f, Y(\cdot)\sim\mathcal{G}(\cdot)}[Y(X)]
\end{equation}

\subsection{Why is this causal?}

Datasets in which users provide a numerical response to a text are common in NLP (e.g., Reddit upvotes, Amazon ratings, etc.). Though some relationship can typically be inferred between the content of a text passage and a reader's response to it, this relationship generally cannot be assumed to be causal due to the potential presence of \textit{confounders}: factors that influence both a reader's choice of texts to read and how they might tend to respond to a text (e.g., demographic attributes, personal beliefs, prior experiences).

However, many text datasets are labeled through crowdsourced annotation on large online platforms, in which annotators are \textit{randomly assigned} to read and react to texts (i.e., a randomized experiment). The random assignment mechanism removes all possible confounding, since there are no longer any factors influencing which texts the annotator reads---thereby providing a \textit{causal guarantee}: within this dataset, the content of the text must be the sole factor that \textit{causes} the reader's response.

Consequently, a language model that optimizes the outcome or response over a crowdsourced dataset will learn to produce text that \textit{causes} that outcome. Therefore, our model optimization problem becomes the following causal inference problem: how do we \textit{intervene} on the distribution of the text-generating model to best cause an optimal outcome---in this case, the production of human-preferred text?

% [these types of text/rating datasets are common in nlp and generally obtained from crowdsourcing. due to the nature of crowdsourcing, this is basically a randomized experiment: people are randomly assigned to read/react to texts. this removes all possible confounding (factors that influence both ppl's choice to read a text and how they might tend to react to that text) because the random assignment ensures that external factors cannot influence people to read particular texts. therefore, this gives us a \textit{causal guarantee}: within this randomized dataset, we know that the content of the text is *causing* the observed outcome. consequently, a language model that optimizes the outcome over this dataset should produce text that *causes* the optimal outcome.]

\section{Causal Preference Optimization}

Reframing our optimization problem as a causal inference problem allows us to draw on solutions from statistical causal inference: in particular, the notion of the doubly robust estimator [CITE]. We begin with a pre-trained language model $f^0$ from which we can generate texts $X_j \sim P^{f^0}$, $j \in [m]$; a crowdsourced dataset $\mathcal{D}_R$ of observed texts $X_i \sim P^R$ and their corresponding observed human feedback outcomes $Y_i(X_i)$, where $i \in [n]$; and an outcome model $g(X)=\widehat{Y}(X)$. Then we propose the following \textit{causal preference optimization} (CPO) objective:
\begin{equation}
    \mathcal{L}_{CPO} = \mathcal{L}_R + \mathcal{L}_O
\end{equation}
where
\begin{equation}
    \mathcal{L}_R=\frac{1}{n}\sum_{i=1}^n \mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\mathbb{E}_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i(X_i)-g(X_i))\Bigg]\Bigg]
\end{equation}
and
\begin{equation}
    \mathcal{L}_O=\frac{1}{m}\sum_{j=1}^m \mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\mathbb{E}_{X \sim P^{f^0}}\Bigg[\frac{P^f(X_j)}{P^{f^0}(X_j)}g(X_j)\Bigg]\Bigg]
\end{equation}

\vl{Would it be better to define the objective separately from the data? e.g., not averaging over $n$ and $m$?} [YES, REWRITE]

It can be shown (Appendix~\ref{sec:unbiased_dr}) that $\mathcal{L}_{CPO}$ is an unbiased estimator for $\mathbb{E}_{X \sim P^f, Y(\cdot)\sim\mathcal{G}(\cdot)}[Y(X)]$ under either of two conditions, making it an effective proxy for the true optimization problem in Equation~(\ref{eq:true_optimization}). Specifically, $\mathcal{L}_{CPO}$ is unbiased as long as \textit{either} (1) $\widehat{P}^R(X)=P^R(X)$ or (2) $g(X)=\mathbb{E}_{Y(\cdot)\sim\mathcal{G}(\cdot)}[Y(X)]$.

Importantly, in our proposed data setting, (1) will always hold: $\mathcal{D}_R$ is assumed to be a crowdsourced dataset, meaning that texts are drawn at random---so $P^R(X)$ is known to be $\frac{1}{n}$. Therefore, given a crowdsourced dataset, a model trained with CPO is guaranteed to be unbiased for the true optimization problem, meaning that it will correctly learn the text distribution that causes the optimal outcome.

\subsection{Double robustness of CPO}

[Under what conditions is CPO advantageous? In particular, under what conditions does each loss term of $\mathcal{L}_{CPO}$ confer robustness against misspecification or mis-estimation (e.g., incorrect reward/outcome model; confounding)?]

\subsection{CPO in practice}

[MOVE EMPIRICAL VERSIONS OF LOSSES HERE]

[In practice, what is $P^R$, and what is $P^{f^0}$? Under what conditions would we choose a specific $P^{f^0}$ (e.g., generate from CLM or generate from pretrained)? Do we always use a sample average for $P^R$, or do we sometimes estimate it from the data?]

\subsection{Ties to RLHF}

RLHF fine-tunes a language model for human feedback via \textit{reward modeling}, wherein the reward is a proxy for human feedback (i.e., the outcome) and the goal of the model is to obtain larger rewards. Interestingly, RLHF can be seen as a special case of CPO in which the data used to learn the reward model comes specifically from a \textit{forced choice experiment}. Given a prompt and two possible completions, users are asked to choose the better completion; the binary indicator of which completion is better can be seen as the outcome $Y$ in CPO. 

Under these conditions, the RLHF objective is analogous to the outcome modeling loss $\mathcal{L}_O$ in CPO. The loss function under RLHF is typically computed through proximal policy optimization (PPO):
\begin{align*}
    \mathcal{L}(\theta, \phi) = \mathcal{L}^{PPO}_{\text{policy}}(\theta) + c_1\mathcal{L}^{PPO}_{\text{value}}(\phi) - c_2\mathcal{L}^{PPO}_{\text{entropy}}(\theta)
\end{align*}

where $\mathcal{L}^{PPO}_{\text{value}}(\phi)$ and $\mathcal{L}^{PPO}_{\text{entropy}}(\theta)$ are regularization terms and $\mathcal{L}^{PPO}_{\text{policy}}(\theta)$ is the \textit{policy loss}. Letting $p$ denote the prompt, $c$ denote the completion, and $r$ denote the reward model, we consider only the policy loss without any stability tricks like clipping:
\begin{equation}
    \mathcal{L}_{PPO}(\theta)=
    \mathbb{E}\left[\frac{\pi_\theta(c|p)}{\pi_{\theta_0}(c|p)}\cdot r(p,c)\right]
\end{equation}

We can see the equivalence between $\pi_\theta(c|p)$ and $P^f(X)$, $\pi_{\theta_0}(c|p)$ and $P^{f^0}(X)$, and $r(p,c)$ and the outcome model $g(X)$; substituting these terms renders $\mathcal{L}_{PPO}$ equal to $\mathcal{L}_O$.

\subsection{Ties to DPO}

DPO fine-tunes a language model for human feedback by directly using a preference dataset rather than relying on reward modeling. As with RLHF, DPO can be seen as a special case of CPO in which the preference data comes from a forced choice experiment in which users are asked to choose the better of two completions for a prompt. Under these conditions, the DPO objective is analogous to the density ratio loss $\mathcal{L}_R$ in CPO. Letting $c_w$ and $c_l$ denote the preferred and non-preferred completions to the prompt $p$, respectively,
\begin{equation}
    \mathcal{L}_{DPO}(\theta)=\mathbb{E}\Bigg[\log \sigma \Bigg(\beta \log \frac{\pi_\theta(c_w|p)}{\pi_{\theta_0}(c_w|p)} - \beta\log \frac{\pi_\theta(c_l|p)}{\pi_{\theta_0}(c_l|p)}\Bigg)\Bigg]
\end{equation}

\vl{Not exactly sure what's the best way of explaining the parallel, since they use the ratio of the completion probabilities, whereas we multiply by -1 or 1 depending on which outcome is preferred. This has the same effect of increasing $\pi(c_w|p)$ and reducing $\pi(c_l|p)$, but it's not mathematically equivalent---so can we still call DPO a ``special case'' of CPO? Also, I guess they apply the logistic function over the whole thing, which we don't do?}

\section{Experiments [1-1.5 pages]}

\subsection{Datasets}

[Datasets contain text and outcome rated by crowdsourced human annotator. Does not require traditional forced choice/paired completion data.]

\begin{itemize}
    \item HK [How was this dataset created? What's different about it compared to natural text?]
    \item EmoBank (continuous)
    \item Hatespeech
    \item Confounded dataset [How did we generate confounding, and what do results on the confounded dataset tell us?]
\end{itemize}

\subsection{Methods}
\begin{itemize}
    \item IPW
    \item Outcome modeling
    \item CPO
    \item CLM/FT
\end{itemize}

\subsection{Evaluation}

\subsection{Text preferences}
\begin{itemize}
    \item FT model on text from target dataset
    \item Apply optimization method on FTed model
    \item Generate completions for the same prompt across all methods/models
    \item Show GPT-4 pairs of (CPO, [competing method]) completions for the same prompt and ask it to choose the one that is better w.r.t.\ the outcome
    \item Compute overall win rate of CPO against each method
\end{itemize}

\subsection{Comparing GPT-4 to human annotators}

[To assess the validity of using GPT-4 as a substitute for human annotators]

\begin{itemize}
    \item Show humans the same pairs of (CPO, [competing method]) completions and ask them to choose the one that is better w.r.t.\ the outcome
    \item 30 annotators who each annotate 20 samples. Total of 200 samples, so an average of 3 annotators per sample
    \item Compute agreement between each human annotator
    \item Compute agreement between each human annotator and GPT-4
    \item Compare agreements to see if they are similar
\end{itemize}

\section{Results and Discussion [1-1.5 pages]}

\subsection{Text preferences}

[TABLE OF WIN RATES]

\begin{itemize}
    \item On Hatespeech and EmoBank, CPO is preferred over CLM and each of its ablations. This tells us that (a) the optimization works and (b) the double robustness is working.
    \item On HK, IPW is preferred over CLM, CPO, and outcome modeling. This tells us that under conditions where $P^R$ is well controlled or can be estimated very well, IPW is strong.
    \item On the confounded dataset...? [TBD]
\end{itemize}

\subsection{Validity of GPT-4 as an annotator}

[TABLE OF AGREEMENTS]

[Human-human agreement is similar to Human-GPT4 agreement, so we conclude that GPT-4 is a reasonable substitute for human annotators]

\section{Conclusion}

\bibliography{ref}
\bibliographystyle{icml2024}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\appendix
\onecolumn
\section{Unbiasedness of $\mathcal{L}_{CPO}$}
\label{sec:unbiased_dr}

We can show that $\mathcal{L}_{CPO}=\mathbb{E}_{X \sim P^f, Y(\cdot)\sim\mathcal{G}(\cdot)}[Y(X)]$ under either of two conditions: (1) $\widehat{P}^R(X)=P^R(X)$ or (2) $g(X)=\mathbb{E}_{Y(\cdot)\sim\mathcal{G}(\cdot)}[Y(X)]$.
\begin{proof}{If (1) $\widehat{P}^R(X)=P^R(X)$,}
\begin{align*}
    \mathcal{L}_R&=\frac{1}{n}\sum_{i=1}^n \mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\mathbb{E}_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i(X_i)-g(X_i))\Bigg]\Bigg] \\
    &=\frac{1}{n}\sum_{i=1}^n \mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\mathbb{E}_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{P^R(X_i)}(Y_i(X_i)-g(X_i))\Bigg]\Bigg] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\mathbb{E}_{X \sim P^R}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}(Y(x)-g(x))\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}(Y(x)-g(x))\underbrace{\mathbb{E}_{X \sim P^R}[\mathbbm{1}\{X=x\}]}_{P^R(x)}\Bigg] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)(Y(x)-g(x))\Bigg]\\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}[\mathbb{E}_{X \sim P^f}[Y(X)-g(X)]] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot),X \sim P^f}[Y(X)]-\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot),X \sim P^f}[g(X)]
\end{align*}

\begin{align*}
    \mathcal{L}_O&=\frac{1}{m}\sum_{j=1}^m \mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\mathbb{E}_{X \sim P^{f^0}}\Bigg[\frac{P^f(X_j)}{P^{f^0}(X_j)}g(X_j)\Bigg]\Bigg] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\mathbb{E}_{X \sim P^{f^0}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}g(x)\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}g(x)\underbrace{\mathbb{E}_{X \sim P^{f^0}}[\mathbbm{1}\{X=x\}]}_{P^{f^0}(x)}\Bigg] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)g(x)\Bigg] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}[\mathbb{E}_{X \sim P^f}[g(X)]] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot),X \sim P^f}[g(X)]
\end{align*}

\begin{align*}
    \mathcal{L}_{CPO}&=\mathcal{L}_R+\mathcal{L}_O \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot),X \sim P^f}[Y(X)]-\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot),X \sim P^f}[g(X)]+\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot),X \sim P^f}[g(X)] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot),X \sim P^f}[Y(X)]
\end{align*}
\end{proof}

\begin{proof}{If (2) $g(X)=\mathbb{E}_{Y(\cdot)\sim\mathcal{G}(\cdot)}[Y(X)]$,}
\begin{align*}
    \mathcal{L}_R&=\frac{1}{n}\sum_{i=1}^n \mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\mathbb{E}_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i(X_i)-g(X_i))\Bigg]\Bigg] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\mathbb{E}_{X \sim P^R}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\underbrace{\mathbb{E}_{X \sim P^R}[\mathbbm{1}\{X=x\}]}_{P^R(x)}\Bigg] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\Bigg] \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}[Y(x)-g(x)] \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}(\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}[Y(x)]-\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}[\underbrace{g(x)}_{\mathbb{E}_{Y(\cdot)\sim\mathcal{G}(\cdot)}[Y(x)]}]) \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}\cdot 0 \\
    &=0
\end{align*}

\begin{align*}
    \mathcal{L}_O&=\frac{1}{m}\sum_{j=1}^m \mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\mathbb{E}_{X \sim P^{f^0}}\Bigg[\frac{P^f(X_j)}{P^{f^0}(X_j)}g(X_j)\Bigg]\Bigg] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\mathbb{E}_{X \sim P^{f^0}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}g(x)\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}g(x)\underbrace{\mathbb{E}_{X \sim P^{f^0}}[\mathbbm{1}\{X=x\}]}_{P^{f^0}(x)}\Bigg] \\
    &=\mathbb{E}_{Y(\cdot) \sim \mathcal{G}(\cdot)}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)g(x)\Bigg] \\
    &=\sum_{x \in \mathcal{X}}P^f(x)g(x) \\
    &=\mathbb{E}_{X \sim P^f}[\underbrace{g(X)}_{\mathbb{E}_{Y(\cdot)\sim\mathcal{G}(\cdot)}[Y(X)]}] \\
    &=\mathbb{E}_{X \sim P^f}[\mathbb{E}_{Y(\cdot)\sim\mathcal{G}(\cdot)}[Y(X)]] \\
    &=\mathbb{E}_{X \sim P^f,Y(\cdot)\sim\mathcal{G}(\cdot)}[Y(X)]
\end{align*}

\begin{align*}
    \mathcal{L}_{CPO}&=\mathcal{L}_R+\mathcal{L}_O \\
    &=0+\mathbb{E}_{X \sim P^f,Y(\cdot)\sim\mathcal{G}(\cdot)}[Y(X)] \\
    &=\mathbb{E}_{X \sim P^f,Y(\cdot)\sim\mathcal{G}(\cdot)}[Y(X)]
\end{align*}

\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\end{document}


% This document was modified from the file originally made available by
% Pat Langley and Andrea Danyluk for ICML-2K. This version was created
% by Iain Murray in 2018, and modified by Alexandre Bouchard in
% 2019 and 2021 and by Csaba Szepesvari, Gang Niu and Sivan Sabato in 2022.
% Modified again in 2023 and 2024 by Sivan Sabato and Jonathan Scarlett.
% Previous contributors include Dan Roy, Lise Getoor and Tobias
% Scheffer, which was slightly modified from the 2010 version by
% Thorsten Joachims & Johannes Fuernkranz, slightly modified from the
% 2009 version by Kiri Wagstaff and Sam Roweis's 2008 version, which is
% slightly modified from Prasad Tadepalli's 2007 version which is a
% lightly changed version of the previous year's version by Andrew
% Moore, which was in turn edited from those of Kristian Kersting and
% Codrina Lauth. Alex Smola contributed to the algorithmic style files.
