\documentclass{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{bbm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\allowdisplaybreaks

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example


\newcommand{\vl}[1]{\textcolor{orange}{[VL: #1]}}

\newcommand{\eli}[1]{\textcolor{orange}{[Eli: #1]}}

\newcommand{\E}{\mathbb{E}}
\newcommand{\R}{\mathbbm{R}}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


\title{Optimizing Language Models for Human Preferences \\ is a Causal Inference Problem}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Harry~Q.~Bovik}
\author[1,2]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[1]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cranberry University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Second Affiliation\\
    Address\\
    …
}
\affil[3]{%
    Another Affiliation\\
    Address\\
    …
  }
  
\begin{document}
\maketitle

\begin{abstract}
  As large language models (LLMs) see greater use in academic and commercial settings, there is increasing interest in methods that allow language models to generate texts aligned with human preferences. In this paper, we present an initial exploration of language model adaptation for human preferences from \textit{direct outcome datasets}, in which each sample consists of a text and an associated numerical outcome measuring the reader's response. We first propose that language model optimization must be viewed as a \textit{causal problem} to ensure that the model correctly learns the relationship between the text and the outcome. Following this observation, we introduce \textit{causal preference optimization} (CPO), an unbiased causal solution to the language optimization problem, and a \textit{doubly robust} extension (DR-CPO) that reduces the variance of CPO while retaining provably strong guarantees on bias. Finally, we empirically demonstrate the effectiveness of (DR-)CPO in optimizing state-of-the-art LLMs for human preferences on direct outcome data, and we validate the robustness of DR-CPO under difficult confounding conditions.
\end{abstract}

\section{Introduction}
\label{sec:intro}

Recent advances in computation have yielded large-scale self-supervised language models that achieve impressive performance on a variety of natural language processing (NLP) tasks \citep{zhang2022opt, chowdhery2023palm, lescao2023bloom, bubeck2023sparks}. These large language models (LLMs)---trained on vast amounts of text data of varying quality---can acquire less desirable attributes from these texts, and so they often require further fine-tuning on human preferences to improve their factual correctness and alignment with social values (e.g., less toxic, more helpful) \citep{NEURIPS2022_b1efde53, bommasani2022opportunities}. 

In this paper, we examine a paradigm for adapting language models to human preferences that has previously been underexplored: learning from \textit{direct outcome datasets}. Direct outcome datasets are text datasets in which each sample consists of a text and an associated numerical \textit{outcome} measuring the reader's response to the text (e.g., rating, helpfulness, like/dislike). A large number of direct outcome datasets are \textit{crowdsourced datasets}, in which annotators on a crowdsourcing platform are randomly assigned to read and respond to texts. Such datasets are a staple of NLP and are far more common than the prompt-paired completion datasets that have more recently been used to help language models learn human-preferred text.

% [A large number of direct outcome datasets are \textit{crowdsourced datasets}, in which annotators on a crowdsourcing platform are randomly assigned to read and respond to texts. Due to the random assignment of texts to readers, crowdsourced datasets have a causal guarantee: the text ]

We present an initial exploration of language model adaptation in the direct outcome setting, where the language model is fine-tuned to optimize texts with respect to a desired outcome. We posit that language model optimization must be viewed as a \textit{causal} problem in order to guarantee a causal relationship between the text and the desired outcome. The solution to this optimization problem asks the following question: how do we \textit{intervene} on the text distribution of the generating language model to best \textit{cause} an optimal outcome (i.e., the generation of human-preferred texts)? 

Learning a language model that correctly represents this causal relationship can be difficult in practice due to the presence of unmeasured \textit{confounding} in the training data: external factors that affect both readers' choice of texts to read and how they tend to respond to those texts (e.g., users of social media will tend to read posts on pages that they enjoy, so they will be more likely to click the ``like'' button on those posts). Language models optimized on confounded data may lack robustness or correctness.

We make the first observation that in the direct outcome setting, guaranteeing the causal relationship between the text and the outcome \textit{is} possible by leveraging crowdsourced datasets. Due to the random assignment of texts to readers in the crowdsourcing process, crowdsourced datasets are not subject to confounding and can in fact be viewed as randomized experiments \vl{CITE OURSELVES?}. Building on this observation, our first modeling contribution is to formalize the language model optimization problem in causal terms. We introduce \textit{causal preference optimization} (CPO), an unbiased causal solution to the optimization problem based on the notion of \textit{importance weighting}. Our second modeling contribution is \textit{doubly robust} CPO (DR-CPO), which improves on CPO by reducing its variance via outcome modeling while retaining provably strong guarantees on bias.

We empirically assess the effectiveness of (DR-)CPO in optimizing state-of-the-art LLMs for human preferences on direct outcome data, both with and without confounding. We find that CPO methods successfully adapt LLMs for human preferences and outperform baselines, and we further observe empirical evidence for the robustness of DR-CPO under difficult confounding conditions.

\section{Related Work}
\label{sec:related_work}

% [CPO is contextualized within a wider body of ]

\subsection{Language Model Optimization}


The performance of large self-supervised language models can be further improved by fine-tuning on datasets that align them with human-preferred text \citep{NEURIPS2022_b1efde53, bommasani2022opportunities}. These \textit{paired completion datasets} typically consist of prompts followed by two candidate completions, one of which is indicated to be human-preferred \citep{pmlr-v162-ethayarajh22a, bai2022training, ji2023beavertails}. A reinforcement learning algorithm may then derive its reward model from these datasets (reinforcement learning from human feedback, or RLHF) \citep{christiano2017rlhf}, after which language models are fine-tuned to maximize the human preference reward under the RLHF algorithm. 
% This optimizes the language model for the human preferences encoded in the dataset.

While RLHF has seen widespread use \citep{ stiennon2020summarize, touvron2023llama}, it is computationally demanding, as its training loop requires that new texts be generated and new rewards be computed at each step. 
% Despite this, the majority of the prevailing large language models---both academic and commercial---have been optimized using RLHF.
Consequently, in recent months, methods that allow language models to learn more directly from human preference data have emerged 
% an2023direct, 
\citep{hejna2023contrastive, dumoulin2024density}---the most popular of which is direct preference optimization (DPO) \citep{rafailov2023direct}. Like RLHF, DPO is designed for use with paired completion datasets, maximizing the probability ratio of preferred completions to non-preferred completions over the preference dataset.

% [Within the space of language models, we mostly have RLHF and more recently DPO...]

% RLHF fine-tunes a language model for human feedback via reward modeling, where the reward is a proxy for human feedback (i.e., the outcome) and the goal of the model is to obtain larger rewards.

\subsection{Causal Inference and Doubly Robust Policy Learning}

Although RLHF and DPO constitute the two most popular optimization approaches for language models, there exists a wide body of work on estimation and policy learning outside of the NLP space. Some notable work directly relevant to this paper includes a long history of doubly robust estimation of causal effects \citep{Robins1994} and---more directly applicable---doubly robust policy learning \citep{dudik2011doublyrobust, pmlr-v48-jiang16, Tang*2020Doubly,Athey2021, pmlr-v162-kallus22a}.

In causal inference, double robustness denotes an estimator formulation that provides robustness against misspecification of \textit{nuisance} parameters or functions. In particular, doubly robust estimators combine two existing estimators---an importance weighting estimator and an \textit{outcome modeling} estimator---such that only one of the two components must be correctly specified or estimated to guarantee the unbiasedness of the estimator \citep{Robins1994, chernozhukov_locally_2022}. This can also be viewed as the importance weighting term providing a \textit{bias correction} for the outcome modeling term. The doubly robust principle can be extended to not only the estimation of causal effects but the estimation of any quantity, including loss functions or policy objectives, as we do here.

% [But outside the space of language models and NLP, there's been lots of work on estimation and optimization. For the purposes of this paper, some previous work of note includes doubly robust estimation of causal effects and---more directly relevant---doubly robust optimization in traditional RL settings...]

\section{A Causal View of Language Model Optimization}
\label{sec:causal_formulation}

% \eli{I think the above should go more in related work, esp the forced choice part, and then start below}

% Formally, let $\mathcal{X}$ be the space of texts, and let $\mathcal{G}$ be the space of potential outcomes (under the potential outcomes framework of causal inference) for those texts. Then the potential outcomes are given by $\{Y(x): \mathcal{X} \rightarrow \mathbb{R} \; | \; x \in \mathcal{X}\} \sim \mathcal{G}$. We define the value function $V(f)$:
% \begin{equation}
%     V(f)=\E_{X \sim P^f}[\E_{Y(\cdot)\sim \mathcal{G}}[Y(X)]]
% \end{equation}

When a language model $f$ is trained or fine-tuned to generate texts that are consistent with human preferences, the implicit goal can be seen as optimizing texts $X \sim P^f$ with respect to some outcome $Y$. In this paper, we consider a direct outcome data format where $Y$ is any numerical response of the user to the texts (e.g., ratings, either binary or scalar).
% In current practice, this outcome $Y$ is typically an indicator of whether a user preferred or did not prefer a text previously generated by the model, under a forced-choice data setting in which the user is shown two possible generations. 

% In this paper, we propose a more general formulation of the optimization problem that bypasses the requirement of forced-choice data. Instead, we consider a \textit{direct outcome} data format in which $Y$ is any numerical response of the user to the texts (e.g., ratings, either binary or scalar).

Let $\mathcal{X}$ represent the space of texts.
We follow the potential outcomes framework \citep{neyman1923, rubin1974}: for each individual $i$ we posit the existence of a \emph{potential outcome function} 
$Y_i:\mathcal{X} \to \R$, where $Y_i(x)$ encodes their potential real-valued response if given text $x$.
This notation implicitly rules out the possibility that an individual's responses can be affected by the texts given to others---a common assumption in causal inference \citep{rubin1974}.
We assume that we sample individuals from a population $\mathcal{G}$ so that the set of potential outcomes $\{Y(x)\; | \; x \in \mathcal{X}\} \sim \mathcal{G}$. We define $g(x) \equiv \E_{Y(\cdot)\sim\mathcal{G}}[Y(x)]$ as the average outcome if all individuals in the population were given text $x$. \vl{TODO: Explain potential outcomes more. Distinction between non-causal formulation---maximizing sample/observed $Y$---vs. causal formulation---maximizing the \textit{potential outcomes} (could call them counterfactuals instead). Maybe even put the non-causal objective to distinguish. Phrase explicitly that we want to distinguish between correlation and causation.}

\vl{We only get to observe outcomes for some texts. Non-causal version just maximizes relative to those outcomes. Causal version is saying what if we could observe outcomes for \textit{all} texts, and maximize relative to that? A randomized experiment is an approximation of observing all outcomes b/c they are random. As we mentioned in the intro, many NLP datasets are crowdsourced and therefore an approximation of randomized datasets.}

With this setup, our goal is to find a language model $f$ that leads to high outcomes $Y$ on average across the population of individuals $\mathcal{G}$ and across the texts generated according to the model, denoted as $P^f$. We encode the quality of a generative text model $f$ via its value function 
\begin{equation}
    V(f) \equiv \E_{X \sim P^f}[\E_{Y(\cdot)\sim \mathcal{G}}[Y(X)]] = \E_{X \sim P^f}[g(X)]
\end{equation}

Then the \textit{causal} optimization problem is to find the language model $f$ that maximizes the value function:
\begin{equation}
\label{eq:true_optimization}
    \underset{f}{\arg\max}\; V(f) = \underset{f}{\arg\max} \; \E_{X \sim P^f}[\E_{Y(\cdot)\sim \mathcal{G}}[Y(X)]]
\end{equation}

By optimizing on direct outcome data rather than paired completion data, we (1) allow for the use of a much wider body of preexisting data, (2) account for the \textit{intensity} or \textit{degree} of a text's outcome rather than simply whether it is preferred or not, and (3) broaden the scope of problems that can be addressed by learning from human feedback. 

\textbf{Example.} \vl{TODO: Convert this into a hate speech example. Put it in intro, maybe even in the first paragraph. ``Should be seen as complementary to approaches that pair preferred text''} This final advantage is especially significant for tasks with more complex outcomes that do not have natural preference pairs. For instance, consider the task of inducing a language model to unlearn copyrighted material or private information. In the paired completion data setting, the language model is typically given data in the format \textit{([copyrighted material], [alternative text])}, with the preferred text being the latter. Rather than fully unlearning the copyrighted material, the language model is instead trained to preferentially generate the alternative text, which may be a statement like ``I can't provide that information'' \vl{CITE} or a plausible generic completion not containing the copyrighted material (which itself is a challenging research task to construct) \citep{eldan2023whos, maini2024tofu}.

In a direct outcome setting, however, let $X$ denote the copyrighted texts, and let $Y$ be a binary outcome indicating that each of these texts is \textit{not} a desired outcome (e.g., $Y=-1$). Then in maximizing $V(f)$ for language model $f$, solving the optimization problem in Equation~\eqref{eq:true_optimization} explicitly decreases the probability of $X$ to 0 such that $f$ ``forgets'' the copyrighted material---all without increasing the probability of an alternative text or even requiring alternative texts or positive examples.

% Moreover, the CPO problem is further distinguished from existing language model optimization approaches by its general formulation that bypasses the typical requirement of forced-choice data, instead requiring only a numerical (binary or continuous) \textit{outcome} for each text. This allows us to broaden our view of what constitutes a ``preference,'' with potential use cases including not only learning desired outcomes but also 

% \eli{Add a few discussion points here about why we would want to do this, maybe with some examples of texts $x$ and outcomes $y$ from the evaluations or in general?} [TODO, BUT MAYBE THIS WOULD BE IN THE INTRO]

\subsection{Why is this Causal?}

\vl{Merge this with the previous section}

When we adapt a language model for human preferences, we assume an implicit causal relationship between the text and the outcome: that is, changing the text has a direct causal effect on how the reader will react to it. If this causal relationship holds, a language model that optimizes the outcome will learn to produce text that \textit{causes} the outcome.

In a real dataset, however, although some relationship can typically be inferred between the content of a text passage and a reader's response to it, this relationship generally cannot be assumed to be causal due to the potential presence of unmeasured confounders: factors that influence both a reader's choice of texts to read and how they might tend to respond to a text (e.g., demographic attributes, personal beliefs, prior experiences).

% Datasets where users provide a numerical response to a text are common in NLP (e.g., Reddit upvotes \citep{Lakkaraju_McAuley_Leskovec_2021}, Amazon ratings \citep{mcauley2013amazon}). Though some relationship can typically be inferred between the content of a text passage and a reader's response to it, this relationship generally cannot be assumed to be causal due to the potential presence of unmeasured \textit{confounders}: factors that influence both a reader's choice of texts to read and how they might tend to respond to a text (e.g., demographic attributes, personal beliefs, prior experiences).
% \eli{Add something about how optimizing from a confounded dataset can go wrong? Need to complete the thought: ``There can be confound`` $\rightarrow$ ``that can mess up the language model''} 

Under confounding, the response is no longer solely a function of the text, so the ability of language models to correctly or robustly optimize for the response may be compromised. For example, confounding may induce certain styles of text to be more common for desired outcomes, leading the language model to exclusively generate text in that style (e.g., certain communities of Reddit are very large, and posts in those communities receive many upvotes, so a language model optimized to write ``good'' Reddit posts may only generate posts on topics relevant to those specific communities) \vl{Not sure if this is a good example... a less text-based example would be something like writing a good email title with outcome being whether somebody opens the email or not. A confounder would be who is sending the email (e.g., is it someone you know?).}. Moreover, reward models trained on confounded data may be misspecified, since they can only capture the relationship between the text and the response---and not the confounders that have additionally influenced the response. While these issues are remedied if the confounding is also fully modeled, confounders are extremely difficult to measure fully in text data.

We make the observation that in the direct outcome setting, we \textit{can} guarantee the causal relationship between the text and the outcome by leveraging crowdsourced datasets. These datasets are labeled through large online platforms where annotators are randomly assigned to read and react to texts---in effect, a randomized experiment. The random assignment mechanism removes all confounding outside of the text, since no external factors influence which texts the annotator reads. 

Without confounding, crowdsourced datasets provide a crucial causal guarantee: the content of the text must be the sole factor that causes the reader's response. We are then left with the following causal inference problem: how do we \textit{intervene} on the distribution of the generating language model to best cause an optimal outcome on average---in this case, the production of human-preferred text? \vl{Too many colons??} \vl{TODO: Explain what ``intervene'' means}

% Consequently, a language model that optimizes the outcome over a crowdsourced dataset will learn to produce text that \textit{causes} that outcome, without susceptibility to confounding. Therefore, we reiterate that the problem of language model optimization problem is the following causal inference problem: how do we \textit{intervene} on the distribution of the generating language model to best cause an optimal outcome on average---in this case, the production of human-preferred text?

% [these types of text/rating datasets are common in nlp and generally obtained from crowdsourcing. due to the nature of crowdsourcing, this is basically a randomized experiment: people are randomly assigned to read/react to texts. this removes all possible confounding (factors that influence both ppl's choice to read a text and how they might tend to react to that text) because the random assignment ensures that external factors cannot influence people to read particular texts. therefore, this gives us a \textit{causal guarantee}: within this randomized dataset, we know that the content of the text is *causing* the observed outcome. consequently, a language model that optimizes the outcome over this dataset should produce text that *causes* the optimal outcome.]

\section{(Doubly Robust) Causal Preference Optimization}
\label{sec:cpo}
% \eli{Include randomized experiments and/or crowdsourced data in the title somewhere?}

% \eli{One big thing that gets lost here: what is "CPO"? Is the double robust version? Or are we describing "O-CPO", "IPW-CPO", "DR-CPO", } \vl{I guess more the latter}

Reframing our optimization problem as a causal inference problem allows us to draw on solutions from statistical causal inference---in particular, the use of randomized experiments to identify causal effects.
We formalize such a randomized experiment and/or crowdsourced annotated dataset as $\mathcal{D}_R = \{(X_1, Y_1),\ldots, (X_n, Y_n)\}$ where texts $X_i$ are drawn i.i.d.\ from a randomization distribution $P^R$ and individuals with potential outcome functions $Y_i(\cdot)$ are drawn i.i.d.\ from the population $\mathcal{G}$. 
This induces a distribution on the observed responses $Y_i = Y_i(X_i)$ that we denote as $P^R_y$.

In this section, we describe an unbiased solution to our causal optimization problem based on the notion of importance weighting, which we call \textit{causal preference optimization} (CPO). Following this definition, we extend CPO using the principle of double robustness from statistical causal inference, in which we use outcome modeling to reduce the variance of CPO while retaining strong guarantees on bias.

% in particular, the notion of the \textit{doubly-robust estimator} \citep{Robins1994, chernozhukov_locally_2022}. Following the convention of causal inference, we identify $V(f)$ in terms of observable data. 

\subsection{Causal Preference Optimization}
% \subsubsection{Identifying the value of a language model}
\label{sec:identification}

% We consider an observed crowdsourced dataset $\mathcal{D}_R$ with texts $X_i \sim P^R$ and their corresponding outcomes $Y_i \sim P^R_y$, where observed $Y_i$ are equal to their potential outcomes $Y_i(X_i)$. Then letting $g(x)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(x)]$, 
\textbf{Identification.} The value of the language model 
$V(f)$ is a causal quantity that involves the potential outcomes for all of the units, some of which are unobserved.
However, we can link the value function to the randomization dataset $\mathcal{D}_R$ (i.e., \textit{identify} it from the observed data) by writing it in the following way.
% \begin{align*}
%     V(f)=&\;\E_{X\sim P^f}[g(X)]  \tag*{(\text{$V_{out}$})} \\
%     =&\;\E_{X \sim P^R}\Bigg[\E_{Y \sim P^R_y}\Bigg[\frac{P^f(X)}{P^R(X)}Y\Bigg]\Bigg] \tag*{(\text{$V_{IPW}$})} \\
%     =&\;\E_{X\sim P^f}[g(X)]+ \\
%     &\;\E_{X \sim P^R}\Bigg[\E_{Y \sim P^R_y}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] \tag*{(\text{$V_{DR}$})}
% \end{align*}
\begin{proposition}\label{eq:v_ipw_proposition} (derived in Appendix \ref{sec:v_ipw_derivation}) The value function $V(f)$ can be identified as
\begin{align*}
    V(f)&=\E_{X \sim P^R}\Bigg[\E_{Y \sim P^R_y}\Bigg[\frac{P^f(X)}{P^R(X)}Y\Bigg]\Bigg] \tag*{(\text{$V_{IPW}$})}
\end{align*}
\end{proposition}

This value function draws on importance weighting principles from statistical causal inference (also referred to as inverse probability weighting, or IPW). Observed outcomes $Y \sim P^R_y$ are weighted by the density ratios between texts drawn from the model $X \sim P^f$ and texts drawn from the randomization distribution $X \sim P^R$; this approximates the average outcome under $P^f$, which is not observed.

% [In the next section, we describe these doubly robust properties and the ways in which they can guarantee the unbiasedness of an estimator for $V(f)$.]


% \subsubsection{Estimating the value}
% \eli{Note that I removed the hats from $P^R$} \vl{Can I bring them back? I think it's fine to state up front that $\widehat{P}^R=P^R$ in our experiments, but it makes things confusing when we're talking about one of the conditions of double robustness is $\widehat{P}^R=P^R$ but $\widehat{P}^R$ doesn't appear in the $\widehat{V}_{DR}(f)$ term.... or should I just rephrase that condition as ``$P^R$ must be known''?}

\textbf{Estimation.} After writing the causal quantity $V(f)$ in terms of observable data, we focus on estimating $V(f)$ in practice. The importance weighting value function $V_{IPW}(f)$ can be estimated directly from the crowdsourced data $\mathcal{D}_R$ as follows (recall that $X_i\sim P^R, Y_i \sim P^R_y$):
\begin{equation*}
    \widehat{V}_{IPW}(f)=\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}Y_i    
\end{equation*}

Note that both $P^f$ and $P^R$ are \textit{known} quantities and do not need to be estimated---$P^f$ because it is obtained directly from the model $f$ we are optimizing, and $P^R$ because we know the randomization mechanism of the texts in $\mathcal{D}_R$.\footnote{In practice, it can still be empirically helpful to use a model-derived estimate of the randomization probabilities $\widehat{P}^R(X)$, similar to how the H\'ajek estimator can have lower variance than the Horvitz-Thompson estimator \citep{Hajek1971, Sarndal2003_model}.} Importantly, this means that $\widehat{V}_{IPW}(f)$ is an unbiased estimator for $V(f)$ (shown formally in Appendix~\ref{sec:ipw_unbiased}).

\begin{theorem} Let $\mathcal{D}_R$ be a randomized experiment parameterized by $P^R$, such that $P^R$ is known. Then 
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{IPW}(f)]]&=\E_{X \sim P^f}[\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]] \\
    &=V(f)
\end{align*}
\end{theorem}

\subsection{Doubly Robust Causal Preference Optimization}

An importance weighting estimator like CPO is a natural solution for estimating causal quantities when randomized experimental data is available. Because CPO only optimizes over the experimental data, however, it can be further improved by the addition of an \textit{outcome modeling} term that predicts outcomes on unlabeled texts. The combination of IPW and outcome modeling yields a doubly robust estimator (DR-CPO) that reduces the variance of CPO and improves its generality while still remaining unbiased for the true causal optimization problem.

\textbf{Identification.} The doubly robust formulation gives us another way of linking the value function to the randomization dataset $\mathcal{D}_R$. 

\begin{proposition}\label{eq:v_dr_proposition} (derived in Appendix \ref{sec:v_dr_derivation}) The value function $V(f)$ can also be identified as
\begin{align*}
    V(f)=&\;\E_{X \sim P^R}\Bigg[\E_{Y \sim P^R_y}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] + \\
    &\;\E_{X\sim P^f}[g(X)] \tag*{(\text{$V_{DR}$})}
\end{align*}
\end{proposition}

This \textit{doubly robust} construction combines IPW and an outcome model $g$ to provide robustness against misspecification or mis-estimation within either term---akin to doubly robust estimators that serve the same purpose when estimating causal effects.

\textbf{Estimation.} The doubly robust value function $V_{DR}$ can be estimated from the crowdsourced data $\mathcal{D}_R$ and a learned outcome model $\widehat{g}(X)$. 

First, however, we consider the outcome modeling term $\E_{X\sim P^f}[g(X)]$. Even if we were to have access to the true outcome model $g$, it is difficult to optimize $g$ with respect to texts $X \sim P^f$,
% the original outcome modeling value function $V_{out}(f)=E_{X \sim P^f}[g(X)]$, 
as this requires that texts be drawn from the language model $f$ \textit{as $f$ is being updated}. To remedy this, we re-write $\E_{X\sim P^f}[g(X)]$ in terms of a fixed language model $f^0$:\footnote{We show this to be mathematically equivalent in Appendix~\ref{sec:v_out_rewritten}.}
\begin{equation*}
    \E_{X\sim P^f}[g(X)]=\E_{X \sim P^{f^0}}\Bigg[\frac{P^f(X)}{P^{f^0}(X)}g(X)\Bigg],
\end{equation*}
where $P^{f^0}$ denotes the distribution over texts from language model $f^0$.

We can create a Monte Carlo estimate of this by drawing texts $\widetilde{X}_1, \ldots, \widetilde{X}_m \sim P^{f^0}$ and computing
\begin{equation*}
    \frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)
\end{equation*}
where $\widehat{g}(x)$ is a model trained to predict $Y$ from $X$ and $f^0$ is any generative language model. 
% \eli{This could be more explicit. Part of the key is that $\hat{g}$ could be bad because it was fit on confounded data. Is that the setup we want to consider? We have a larger dataset that we can train $\hat{g}$ on? Or are we always fitting $\hat{g}$ on the randomization dataset, where it will be unconfounded?}
% Note that because the texts are generated from $f^0$, $P^{f^0}$ can be computed from the same language model and is therefore known rather than estimated.

Finally, the doubly robust value function $V_{DR}$ can be estimated as a combination of these two terms.
\begin{align*}
    \widehat{V}_{DR}(f)=&\;\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}(Y_i-\widehat{g}(X_i)) + \\
    &\;\frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)
\end{align*}


Formally, it can be shown (Appendix \ref{sec:unbiased_dr}) that $\widehat{V}_{DR}(f)$ is an unbiased estimator for $V(f)$ 
under two possible conditions, making it an effective proxy for the true causal optimization problem.

\begin{theorem} Let $\mathcal{D}_R$ be a randomized experiment parameterized by $P^R$, where $\widehat{P}^R$ may be estimated. Let $g(X)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]$, where $\widehat{g}$ may need to be estimated. Then 
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]&=\E_{X \sim P^f}[\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]] \\
    &=V(f)
\end{align*}
if \textit{either}
\begin{enumerate}
    \item $\widehat{P}^R(X)=P^R(X)$, or
    \item $\widehat{g}(X)=g(X)$
\end{enumerate}
\end{theorem}

Importantly, because $P^R$ \textit{is} known in the crowdsourced data setting, it does not need to be estimated, which means that condition (1) is always fulfilled for DR-CPO. Therefore, $\widehat{V}_{DR}(f)$ is guaranteed to be unbiased for $V_{DR}$ \textit{even if the outcome model is incorrect}. In other words, DR-CPO is robust to misspecification of $\widehat{g}$, as the IPW term in its value function corrects for any bias from the predicted outcomes.

As a result, rather than learning only from the experimental data $X \sim P^R, Y \sim P^R_y$, a model optimized with DR-CPO will additionally be able to leverage the generative language model $f^0$ to learn from unlimited unlabeled text $\widetilde{X} \sim P^{f^0}$ with outcomes $\widehat{g}(\widetilde{X})$, regardless of the correctness of $\widehat{g}$. This reduces the variance of the value function estimator $\widehat{V}_{DR}$ relative to $\widehat{V}_{IPW}$, and models learned under DR-CPO should have greater generalization than models learned under CPO. 

We note that in a different data setting where the true $P^R$ is unknown, a well-estimated $\widehat{g}$ that is close to the true $g$ can also help bias-correct any mis-estimation of $\widehat{P}^R(X)$. This may occur, for instance, when no text experiment or crowdsourced dataset is available, but a large amount of clean data exists to train the outcome model.

% Each of these value functions may be used to solve the CPO problem in practice. In the remainder of this paper, we refer to outcome modeling-based CPO (i.e., using $\widehat{V}_{out}$) as O-CPO; IPW-based CPO (i.e., using $\widehat{V}_{IPW}$) as IPW-CPO; and doubly robust CPO (i.e., using $\widehat{V}_{DR}$) as DR-CPO.

% In other words, the value function $V(f)$ can be identified from the observed data in multiple ways. With \textit{outcome modeling} ($V_{out}$), texts $X$ are generated from $P^f$ and outcomes are computed by a model $g(X)$, such that the optimization problem is to maximize the predicted outcome. With \textit{importance weighting} ($V_{IPW}$), or IPW, observed outcomes $Y \sim P^R_y$ are weighted by the density ratios between texts drawn from the model $X \sim P^f$ and texts drawn from the randomization distribution  $X \sim P^R$; this approximates the average outcome under $P^f$.
% % which are then maximized under optimization. 
% Finally, we use a \textit{doubly robust} construction ($V_{DR}$) that combines outcome modeling and IPW to provide robustness against misspecification or mis-estimation within either term---akin to doubly robust estimators that serve the same purpose when estimating causal effects. 

\subsection{Relationship to Existing Approaches}
\label{sec:rlhf_dpo_equivalence}

Elements of (DR-)CPO are reflected in two of the most prominent existing language model optimization approaches: RLHF and DPO. First, we draw attention to the fact that the outcome modeling term in $V_{DR}(f)$ is also itself a way of identifying $V(f)$ from the observed data. In particular,
\begin{align*}
    V(f)&=\E_{X\sim P^f}[g(X)]  \\
    &=\E_{X \sim P^{f^0}}\Bigg[\frac{P^f(X)}{P^{f^0}(X)}g(X)\Bigg] \tag*{(\text{$V_{out}$})}
\end{align*}

which can be estimated (as we mention previously) from $\widetilde{X}_1, \ldots, \widetilde{X}_m \sim P^{f^0}$ as
\begin{equation*}
    \frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)
\end{equation*} 

Outcome modeling relies entirely on the predictive model $\widehat{g}$ and unlabeled texts $\widetilde{X} \sim P^{f^0}$ and therefore does not require an experimental dataset $\mathcal{D}_R$. However, if $\widehat{g}(X)$ is not a good outcome model---for instance, due to confounding---then $\widehat{V}_{out}(f)$ will be biased with respect to $V(f)$.

Interestingly, optimization of $V_{out}(f)$ is effectively RLHF in the direct outcome data setting. That is, RLHF can be seen as a version of outcome modeling in which the reward model is trained on paired completion data. Under these conditions, the RLHF reward model is analogous to the outcome model $g$, and the RLHF policy loss is mathematically equivalent to $V_{out}(f)$ (details in Appendix \ref{sec:rlhf_ties}).

% \eli{I think the framing and organization can be sharpened a bit as follows.  Have Section 4.3 be about the limitations of outcome modelling and IPW (and their connections to RLHF and DPO) 
%  I might characterize it as this:
% \begin{itemize}
%     \item Outcome modeling: doesn't require an experiment, but can be entirely wrong due to confounding
%     \item IPW: unbiased, but can have high variance and poor generalization properties because experiments are often small 
% \end{itemize}
% Then Section 4 is about how the DR approach addresses this. I think it can be framed explicitly as starting with the outcome modelling approach and then only using the experiment to correct for any bias in the outcome modelling approach. So we can even use externally fit outcome models that have some confounding, and it will still be ok.
% }

Likewise, DPO is similar (though not identical) to CPO in the paired completion data setting. Both DPO and CPO fine-tune a language model for human feedback by directly using a preference dataset rather than relying on reward or outcome modeling. The DPO objective shares certain similarities with $V_{IPW}$---most notably in that it directly increases the likelihood of texts corresponding to desired outcomes through importance weighting. However, because of the paired nature of the data, DPO increases the density ratio between preferred and non-preferred examples, while CPO directly increases the probability of texts with desired outcomes and decreases the probability of texts with non-desired outcomes. Given a paired data setting, the DPO objective could possibly be recovered from $V_{IPW}$; we leave this derivation for future work.

These parallels mean that RLHF and DPO are subject to the same limitations that we describe for outcome modeling and importance weighting more generally. That is, RLHF can degrade under confounding or misspecification of its reward model, while DPO can experience high variance and limited generalization resulting from the size or coverage of its preference dataset. \vl{Is this paragraph necessary?}
% [While outcome modeling with $\widehat{V}_{out}(f)$ and importance weighting with $\widehat{V}_{IPW}(f)$ can be useful, both can easily result in poor optimization of the language model. Outcome modeling does not require a text experiment, but if $\widehat{g}(X)$ is not a good outcome model, then $\widehat{V}_{out}(f)$ will be biased with respect to $V(f)$. In fact, it is not unlikely that the outcome model will be incorrect when learned from real-world data---often due to confounding. 

% Under confounding, the outcome is not fully caused by the text, and so a model trained to predict the outcome from the text will inevitably be misspecified, since it cannot capture the confounders that have influenced the outcome in addition to the content of the text. [We explore such a setting in our experiments and show that models optimized with $\widehat{V}_{out}(f)$ degrade under confounding, while models optimized with $\widehat{V}_{DR}(f)$ remain unaffected.]

% [Importance weighting is unbiased given a text experiment, but optimization is limited to the text data from the experiment.] \vl{Are these the right pros/cons to be mentioning?}


% \subsection{CPO corrects estimation bias} 
% \subsection{Double Robustness of CPO}
% \vl{Feel like I'm repeating myself a bit in this section... does it read that way?}

% The limitations of outcome modeling and importance weighting motivate our doubly robust formulation DR-CPO, which provides (a) robustness against outcome model misspecification \textit{and} (b) generalizability beyond a specific preference dataset or text experiment. 
% At a high level, DR-CPO is unbiased for the true optimization problem as long as \textit{either} (1) the distribution of the preference dataset or text experiment is known, or (2) the ``true'' outcome model is known.


% \subsection{Double robustness of CPO}

% [Under what conditions is CPO advantageous? In particular, under what conditions does each loss term of $\mathcal{L}_{CPO}$ confer robustness against misspecification or mis-estimation (e.g., incorrect reward/outcome model; confounding)?]

% \subsection{Ties to DPO}

% Like the importance weighting component of CPO, DPO fine-tunes a language model for human feedback by directly using a preference dataset rather than relying on reward modeling. As is the case with RLHF, DPO requires preference data to come from a forced choice experiment in which users are asked to choose the better of two completions for a prompt.

% Letting $c_w$ and $c_l$ denote the preferred and non-preferred completions to the prompt $p$, respectively, the DPO objective is given by
% \begin{align*}
%     \mathcal{L}_{DPO}(\theta)=&\;\E\Bigg[\log \sigma \Bigg(\beta \log \frac{\pi_\theta(c_w|p)}{\pi_{\theta_0}(c_w|p)} \\
%     &- \beta\log \frac{\pi_\theta(c_l|p)}{\pi_{\theta_0}(c_l|p)}\Bigg)\Bigg]
% \end{align*}
% where $\pi_\theta$ is again the probability under the model being optimized, while $\pi_{\theta_0}$ is the probability under a reference model.

% As with RLHF, DPO can be seen as a special case of CPO in which the preference data comes from a forced choice experiment in which users are asked to choose the better of two completions for a prompt. Under these conditions, the DPO objective is analogous to the density ratio loss $\mathcal{L}_R$ in CPO. Letting $c_w$ and $c_l$ denote the preferred and non-preferred completions to the prompt $p$, respectively,
% \begin{equation}
%     \mathcal{L}_{DPO}(\theta)=\E\Bigg[\log \sigma \Bigg(\beta \log \frac{\pi_\theta(c_w|p)}{\pi_{\theta_0}(c_w|p)} - \beta\log \frac{\pi_\theta(c_l|p)}{\pi_{\theta_0}(c_l|p)}\Bigg)\Bigg]
% \end{equation}

% The DPO objective shares certain similarities with the $V_{IPW}$ value function---most notably in that it directly increases the likelihood of texts corresponding to desired outcomes through importance weighting. Because of the paired nature of the data, DPO increases the density ratio between preferred and non-preferred examples, while importance-weighted CPO directly increases the probability of texts with desired outcomes and decreases the probability of texts with non-desired outcomes in separate steps, rather than adjusting the probabilities through a ratio. 
% \vl{I think the future work about recovering DPO from a paired version of CPO should go in the discussion rather than here, but I'm not sure what a neat way to tie up this section would be}
% \eli{If this gets folded in to a section about IPW and it's limitations, I think this can be more of a brief aside than a main thing, and so wouldn't need to be neatly tied up. Agreed on the discussion portion}

% \vl{Not exactly sure what's the best way of explaining the parallel, since they use the ratio of the completion probabilities, whereas we multiply by -1 or 1 depending on which outcome is preferred. This has the same effect of increasing $\pi(c_w|p)$ and reducing $\pi(c_l|p)$, but it's not mathematically equivalent---so can we still call DPO a ``special case'' of CPO? Also, I guess they apply the logistic function over the whole thing, which we don't do?}

\section{Experiments}
\label{sec:experiments}

We conduct evaluations to empirically assess the effectiveness of CPO and DR-CPO in optimizing language models for human preferences on direct outcome data, and we examine the doubly robust properties of DR-CPO under confounding.

\subsection{Datasets}

To evaluate optimization on direct outcome data, we consider three crowdsourced datasets in which human annotators provided numerical responses to texts.

\textbf{Hate Speech} \textit{(binary outcome).} The Hate Speech dataset \citep{qian-etal-2019-benchmark} consists of comments from the social media sites Reddit and Gab. Outcomes are collected via crowdsourcing and indicate whether the annotator perceives the comment to be hate speech. The Reddit comments are chosen from subreddits where hate speech is more common, and Gab is a platform where users sometimes migrate after being blocked from other social media sites. The optimization goal for this dataset is to generate texts that are \textit{less} hateful on average.

\textbf{Hong Kong} \textit{(scalar outcome).} The Hong Kong dataset \citep{fong2021causal} consists of texts concerning the Hong Kong democracy protests of 2019--2020. These texts are loosely based on speeches made about Hong Kong during U.S. Congressional sessions at the time of the protests. Outcomes are collected via a randomized experiment and indicate to what extent the respondent thinks that the U.S. should support Hong Kong during this time, after reading the text. The texts are programmatically constructed: for each text, 2 or 3 text attributes are randomly chosen out of 7 (e.g., \textit{commitment}, \textit{bravery}, \textit{mistreatment}). Short passages corresponding to each attribute are then randomly chosen from a pool of about 20 to construct the text. The optimization goal for this dataset is to generate texts with \textit{high} outcomes on average.

\textbf{Confounded} \textit{(scalar outcome).} The Confounded dataset is a version of the Hong Kong dataset where we have induced confounding. We consider the strongest possible form of confounding: the confounder is fully correlated with the outcome, resulting in all outcomes being negations of the original outcomes. This dataset is used to train outcome models. We include this dataset with the realistic expectation that text data is often confounded, which poses a threat to outcome model-based approaches. Therefore, it is necessary to evaluate how different optimization approaches fare under confounding. Like the Hong Kong dataset, the optimization goal for this dataset is to generate texts with \textit{high} outcomes on average.

% \subsection{Evaluation}

\subsection{Implementation}
\label{sec:implementation}

\textbf{Evaluation.} To evaluate how well optimization for human preferences has occurred, we use a text preference framework in which a reader is asked to choose the better (with respect to the outcome) of a pair of texts generated by two different methods. Using GPT-4 as a proxy for human annotators, we compare pairs of \textit{(method, baseline)} completions for the same prompt; across all pairs, we compute method \textit{win rates} and compute 95\% confidence intervals. Since the datasets used for these experiments contain one text per sample rather than a prompt and a completion, we create prompts on the evaluation set by truncating each text to a random length. \vl{I know we said the use of the word ``pair'' here was confusing before given the previous discussion of paired completion datasets. Any suggestions on alternate phrasings?}

The full input provided to GPT-4 for each dataset can be found in Appendix \ref{sec:gpt_win_rate_questions}. We validate the use of GPT-4 as an annotator with a human study, which we describe in further detail in Section \ref{sec:gpt4_annotation}.


% [TODO: HOW WERE PROMPTS CONSTRUCTED? HOW WERE MODELS TRAINED? could also put this in datasets section at the beginning. datasets were split into train/eval splits... datasets were optimized on training split. then for eval split, since the datasets aren't in prompt/completion format by default, prompts were constructed by truncating texts to a random length.]

% [How did we generate confounding, and what do results on the confounded dataset tell us?]

% \textbf{Confounded Hate Speech?}

% \begin{itemize}
%     \item HK [How was this dataset created? What's different about it compared to natural text?]
%     % \item EmoBank (continuous)
%     \item Hatespeech
%     \item Confounded dataset(s) [How did we generate confounding, and what do results on the confounded dataset tell us?]
% \end{itemize}

\textbf{Methods.} We evaluate language models optimized using \textbf{CPO} and \textbf{DR-CPO}. As our baselines, we consider language models that have been fine-tuned on texts from each of the task datasets (\textbf{FT}), as well as models optimized using the outcome modeling value function $V_{out}(f)$. Since---as we discuss in Section \ref{sec:rlhf_dpo_equivalence}---the $V_{out}(f)$ objective is mathematically equivalent to the RLHF objective, we refer to this baseline as \textbf{OO-RLHF} (offline outcome RLHF).

We use Llama 2 7B \citep{touvron2023llama} as our base language model and fine-tune with low-rank adaptation \citep{hu2022lora}. All optimizations are applied after fine-tuning on text from the task dataset.

\textbf{Choice of $f^0$.} When optimizing with DR-CPO or OO-RLHF, any generative language model may be used as $f^0$, the fixed language model from which texts are drawn as input to the outcome model. One key consideration is whether $f^0$ should be a pre-trained model or whether it should be a model that has been fine-tuned on text relevant to the task---for instance, the randomized experiment dataset $\mathcal{D}_R$.

% In practice, we use the following general heuristics. 
In practice, we choose a pre-trained model as $f^0$ to leverage the diversity of texts such models tend to generate. If $\widehat{g}(X)$ is a good outcome model, then predicted outcomes on these texts will still be close to the true outcomes, and DR-CPO and OO-RLHF will benefit from outcome modeling. 

% If $\widehat{g}(X)$ is not a good outcome model, then O-CPO will be negatively affected. On the other hand, with DR-CPO, the bias-correction of the IPW component should ensure that the predicted outcomes---while not necessarily useful---do not hurt the overall optimization.

% If we feel that $\widehat{g}(X)$ may not be a very robust outcome model, we choose a fine-tuned model as $f^0$. [By keeping generated texts similar to the texts in the original randomized dataset $\mathcal{D}_R$, we ensure that $\widehat{g}(X)$ will only have to predict outcomes on texts similar to those that it has already seen before, increasing the likelihood that the predictions will be close to the true outcomes...?]
% Practical considerations may inform specific choices of $f^0$. For instance, one key 

\textbf{Choice of $\widehat{P}^R$.} In Section \ref{sec:identification}, we mention briefly that although the distribution of texts $P^R(X)$ under the randomized experiment is known, it can be helpful empirically to instead compute an estimated $\widehat{P}^R(X)$. This is generally due to the fact that the \textit{sample} probability of each text $X$ may not actually be equal to its theoretical probability merely by chance \citep{Hajek1971, Sarndal2003_model}.

We find this to be the case in our experiments, and so we use $\widehat{P}^R(X)$ estimated from a Llama 2 7B model fine-tuned on $\mathcal{D}_R$ in our CPO implementations.

% \subsection{Evaluation}

% \subsubsection{Text preferences}
% \begin{itemize}
%     \item FT model on text from target dataset
%     \item Apply optimization method on FTed model
%     \item Generate completions for the same prompt across all methods/models
%     \item Show GPT-4 pairs of (CPO, [competing method]) completions for the same prompt and ask it to choose the one that is better w.r.t. to the outcome
%     \item Compute overall win rate of CPO against each method
% \end{itemize}

% \subsubsection{Comparing GPT-4 to human annotators}

% [To assess the validity of using GPT-4 as a substitute for human annotators]

% \begin{itemize}
%     \item Show humans the same pairs of (CPO, [competing method]) completions and ask them to choose the one that is better w.r.t. to the outcome
%     \item 30 annotators who each annotate 20 samples. Total of 200 samples, so an average of 3 annotators per samples
%     \item Compute agreement between each human annotator
%     \item Compute agreement between each human annotator and GPT-4
%     \item Compare agreements to see if they are similar
% \end{itemize}

% \subsection{Implementation}

% \eli{Should all of this be under a separate (sub or subsub) section about implementation details?}
% [DISCUSS: When should $f^0$ be FTed vs. pretrained? Good $\widehat{g}(X)$, bad/limited $P^R$: $f^0$ should be pretrained, since this encourages $f$ to be optimized over a wider range of texts (as presumably texts generated by the pretrained model will be more diverse than texts generated by the FTed model). Bad/limited $\widehat{g}(X)$: $f^0$ should be FTed on $P^R$. Since we don't trust $\widehat{g}(X)$'s predictions on text that's very different from text it's already seen (assuming it was trained on $P^R$ as well), we want it to only make predictions on text similar to $P^R$.] \vl{For the bad/limited $\widehat{g}(X)$, is this reasoning correct? What if $\widehat{g}(X)$ wasn't trained on $P^R$? Then wouldn't its predictions be equally bad on $P^R$ and on text generated by the pretrained mode?}
% \eli{I do not understand this  point} \vl{I think this is the same as what you said in an earlier comment about whether we're fitting $\widehat{g}(X)$ on our unconfounded randomization dataset or whether we're fitting it on a larger potentially confounded dataset}

% [Also mention choice to use $\widehat{P}^R$ instead of $\frac{1}{n}$]

\section{Results and Discussion}
\label{sec:results}

\subsection{GPT-4 Annotation Validity}
\label{sec:gpt4_annotation}

\begin{table}[!ht]
    \centering
    \begin{tabular}{ccc}
    \toprule
    \toprule
        Annotator 1 & Annotator 2 & Fleiss' $\kappa$  \\
    \midrule
        Human & Human & 0.170 \\
        Human & GPT-4 & 0.219 \\
        Human majority & GPT-4 & 0.192 \\
    \bottomrule
    \bottomrule
    \end{tabular}
    \caption{Agreement rates of human annotators and GPT-4 when asked to choose preferred texts with respect to target outcomes. We examine inter-human agreement, human-GPT-4 agreement, and agreement between a majority vote of human annotators and GPT-4.}
    \label{tab:human_gpt4_agreement}
\end{table}


We use the online research platform Prolific\footnote{\url{https://www.prolific.co/}} to conduct our human study. To avoid annotator fatigue, examples are annotated in batches of 20. We recruit a total of 30 annotators for an average of 3 annotators per example and a total of 600 annotations. Additional details about the study can be found in Appendix \ref{sec:gpt4_annotation_details}.

Across three comparisons---human-human, human-GPT-4, and human majority-GPT-4---we find that GPT-4 agrees with human annotators at a level similar to or better than that at which human annotators agree with each other (Table \ref{tab:human_gpt4_agreement}). We conclude that GPT-4 is a reasonable surrogate for human annotators.

% We judge GPT-4 to be a valid surrogate for human annotators if human-GPT-4 agreement is similar to or better than inter-human agreement.

% [Human-human agreement is similar to Human-GPT4 agreement, so we conclude that GPT-4 is a reasonable substitute for human annotators]

\begin{figure*}[!ht]
    \centering
    \includegraphics[width=0.7\textwidth]{images/win_rates.png}
    \caption{CPO and DR-CPO win rates against FT and OO-RLHF. A win rate exceeding 0.5 indicates that the named method outperforms the competing method with respect to the target outcome. Win rates are computed across 2000 pairs for each method combination.}
    \label{fig:win_rates}
\end{figure*}

\begin{figure}[!ht]
    \centering
    \includegraphics[width=\columnwidth]{images/confounding.png}
    \caption{Impact of confounding (measured in win rate difference) on CPO, DR-CPO, and OO-RLHF. A negative impact indicates that confounding hurt the performance of the method. Win rates are computed across 600 pairs for each method combination.}
    \label{fig:confounding_win_rates}
\end{figure}

\subsection{Text Preferences}

We report CPO win rates against OO-RLHF, FT, and DR-CPO in Figure \ref{fig:win_rates}. In Figure \ref{fig:confounding_win_rates}, we examine the impact of confounding on OO-RLHF, CPO, and DR-CPO. Additional results may be found in Appendix \ref{sec:additional_results}.

\textbf{Outcome optimization} (Figure \ref{fig:win_rates}). On the Hate Speech dataset, we observe that CPO outperforms both the OO-RLHF and FT baselines. Against OO-RLHF, CPO's win rate is statistically significant at the 95\% confidence level, with the lower bound of its 95\% confidence intervals falling above 0.5. These results indicate that using CPO, language models successfully learn human preferences for less hateful text from direct outcomes.

DR-CPO further appears to outperform CPO on the Hate Speech dataset, though its win rate is just shy of statistical significance. It can be seen (Appendix \ref{sec:additional_results}, Table \ref{tab:win_rates}) that DR-CPO also outperforms both the OO-RLHF and FT baselines---this time at a statistically significant level for both comparisons. These results demonstrate the empirical benefits of double robustness, wherein a good $\widehat{P}^R$ provides bias-correction against a poorer $\widehat{g}$.

% Since CPO is applied on the fine-tuned model, DR-CPO being preferred over FT suggests that DR-CPO is successfully optimizing language models for the Hate Speech dataset target outcome: making texts less hateful. Moreover, DR-CPO being preferred over O-CPO and statistically indistinguishable from IPW-CPO provides empirical evidence for the doubly robust properties of DR-CPO, wherein a good $\widehat{P}^R$ provides bias-correction against a poorer $\widehat{g}(X)$.
% This is significant for a number of reasons. First, as we mention in Section \ref{sec:implementation}, the base model for our optimization is Llama-2-7b---an LLM that has already been trained with extensive RLHF for 

On the Hong Kong dataset, we likewise observe that CPO outperforms both the OO-RLHF and FT baselines. In this setting, CPO achieves statistically significant win rates against both baselines, with the lower bound of its 95\% confidence intervals falling above 0.5. 

Here, CPO also outperforms DR-CPO, which suggests that CPO can be very strong under conditions where $P^R$ is well controlled or can be estimated very well---as is the case for the Hong Kong dataset, where texts are not only randomly assigned to annotators but programmatically generated from random attributes. Furthermore, we note that outcome models trained on the Hong Kong dataset do not achieve good performance or generalization outside of the training data, possibly due to the artificial nature and relative homogeneity of the texts. Therefore, our results point to a conclusion that under conditions where the outcome model is particularly poor, DR-CPO---in spite of its \textit{theoretical} robustness---may \textit{empirically} fall short of CPO.

\textbf{Double robustness under confounding} (Figure \ref{fig:confounding_win_rates}). Finally, on the Confounded dataset, we find that DR-CPO remains robust under confounding, while OO-RLHF degrades significantly. (Vanilla CPO is not affected by confounding because it does not use an outcome model, but we train two separate models to account for randomness in the optimization process.) OO-RLHF experiences a negative impact to win rate that is significantly lower than 0, while CPO experiences no impact and DR-CPO (interestingly) experiences a positive impact. \vl{What's a good (but hand-wavey) way to say: this is kind of an unusual result, so we may explore it in future work?} 
% When provided an unconfounded outcome model, O-CPO achieves a win rate statistically significantly above 0.5 compared to O-CPO with a confounded outcome model. In contrast, DR-CPO is not negatively impacted by a confounded outcome model.

These results further illustrate the doubly robust properties of DR-CPO and the shortcomings of outcome modeling approaches. Even under aggressive confounding, with a worst-case outcome model that has been trained on completely negated data, DR-CPO is not compromised, while OO-RLHF is. Given the functional equivalence between the OO-RLHF objective $V_{out}(f)$ and the paired completion data RLHF objective, this result may also be taken to imply that traditional RLHF-based optimization on paired completion datasets would degrade under confounding.

We reiterate that because they are optimized on randomized experimental data, $\mathcal{D}_R$, CPO and DR-CPO are \textit{causal} approaches. Taken together, our results constitute empirical evidence for a core theoretical strength---robustness to confounding---of an optimization framework that maintains the causal relationship between text and outcome.

% \textbf{Analysis of $f^0$ and $P^R$}

% If $\widehat{g}(X)$ is \textit{not} a good outcome model, then it may instead be beneficial to use a model fine-tuned on $\mathcal{D}_R$ as $f^0$. In this case, if the fine-tuning is successful, then $P^{f^0} \approx P^R$, and consequently $\widehat{V}_{DR}(f)$ will reduce to $\widehat{V}_{IPW}(f)$ \vl{SHOW THIS IN APPENDIX?}, removing the negative influence of the outcome model in DR-CPO. \vl{But why does this matter if the bias-correction of the IPW term in DR-CPO should guarantee that it isn't negatively affected by the bad outcome model anyway?}


% \begin{itemize}
%     \item On Hatespeech and EmoBank, CPO is preferred over CLM and each of its ablations. This tells us that (a) the optimization works and (b) the double robustness is working.
%     \item On HK, IPW is preferred over CLM, CPO, and outcome modeling. This tells us that under conditions where $P^R$ is well controlled or can be estimated very well, IPW is strong. Furthermore, under conditions where the outcome model may not be very good, outcome modeling and CPO (despite double robustness) may fall short of IPW. [Due to its careful randomization---not only with respect to how texts are assigned to annotators but also in how the texts themselves are constructed---the Hong Kong dataset is a setting in which we expect IPW to work well]
%     \item On the confounded dataset, outcome modeling under confounding is significantly worse than outcome modeling without confounding. However, IPW and CPO are not significantly worse under confounding than without confounding. This illustrates one of the core strengths of a causal approach: outcome/reward modeling without causal considerations may be susceptible to confounding in the data. IPW does not use outcome modeling and is therefore not susceptible, and the double robustness of CPO also prevents it from being affected.
% \end{itemize}


\section{Conclusion}

In this paper, we explore language model adaptation for human preferences from direct outcome datasets, in which each sample consists of a text and the reader's numerical response. We first posit that language model optimization must be viewed as a causal problem to ensure that the model correctly learns the relationship between the text and the outcome, and we define conditions under which this causal relationship can be guaranteed. Following this, we introduce CPO, an unbiased causal solution to the language model optimization problem, and improve upon it with the doubly robust DR-CPO, which reduces the variance of CPO while retaining provably strong guarantees on bias. Finally, we empirically demonstrate the effectiveness of (DR-)CPO in optimizing state-of-the-art LLMs for human preferences on direct outcome data, and we validate the robustness of DR-CPO under difficult confounding conditions. These theoretical contributions and results open the door to a wide range of data, human preferences, and optimization goals that language models can learn using CPO.

\vl{Worth mentioning any future work? We had some stuff about the paired version of DR-CPO before, but I'm not sure if it really fits now that we're leaning into direct outcome data as a novelty/contribution.}

% ---from which one could likely recover the DPO objective.

% [TODO: 1 paragraph -- or 2 if we want to discuss future work with paired version to recover DPO?]


\begin{contributions} % will be removed in pdf for initial submission 
					  % (without ‘accepted’ option in \documentclass)
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
    Briefly list author contributions. 
    This is a nice way of making clear who did what and to give proper credit.
    This section is optional.

    H.~Q.~Bovik conceived the idea and wrote the paper.
    Coauthor One created the code.
    Coauthor Two created the figures.
\end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    Briefly acknowledge people and organizations here.

    \emph{All} acknowledgements go in this section.
\end{acknowledgements}

% References
\bibliography{ref}

\newpage

\onecolumn

\title{Optimizing Language Models for Human Preferences \\ is a Causal Inference Problem\\(Supplementary Material)}
\maketitle

\appendix

\section{Identifying $V(f)$}

\subsection{Derivation of Proposition~\ref{eq:v_ipw_proposition} [TODO]}
\label{sec:v_ipw_derivation}

\subsection{Derivation of Proposition~\ref{eq:v_dr_proposition} [TODO]}
\label{sec:v_dr_derivation}

\section{Unbiasedness of $\widehat{V}_{IPW}(f)$}
\label{sec:ipw_unbiased}

We can show that $\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^R}[\widehat{V}_{IPW}(f)]]=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]]=V(f)$ when $P^R$ is known.

\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^R}[\widehat{V}_{IPW}(f)]]
    &=\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}Y_i\Bigg]\Bigg] \\
    &=\frac{1}{n}\sum_{i=1}^n\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{P^R(X_i)}Y_i\Bigg]\Bigg] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}Y(x)\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}Y(x)\underbrace{\E_{X \sim P^R}[\mathbbm{1}\{X=x\}]}_{P^R(x)}\Bigg] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)Y(x)\Bigg] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=V(f)
\end{align*}

\section{Equivalence of $V_{out}(f)$}
\label{sec:v_out_rewritten}

We can show that our rewriting of $V_{out}(f)$ is equivalent to our original definition:
\begin{align*}
    V_{out}(f)&=\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X})}{P^{f^0}(\widetilde{X})}g(\widetilde{X})\Bigg] \\
    &=\sum_{x \in \mathcal{X}}P^{f^0}(x) \frac{P^f(x)}{P^{f^0}(x)}g(x) \\
    &=\sum_{x \in \mathcal{X}}P^f(x)g(x) \\
    &=\E_{X \sim P^f}[g(X)]
\end{align*}

\section{Unbiasedness of $\widehat{V}_{DR}(f)$}
\label{sec:unbiased_dr}

We can show that $\E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]]=V(f)$ under one of two conditions: (1) $\widehat{P}^R(X)=P^R(X)$ (i.e., $P^R$ is known) or (2) $\widehat{g}(X)=g(X)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]$ (i.e., $g(X)$ is known).

First, we rewrite $\E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]$:
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]] =& \; \E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_X\Bigg[\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i)) + \frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg] \\
    =& \;\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X\sim P^R}\Bigg[\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg] \\
    &+ \E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{\widetilde{X}\sim P^{f^0}}\Bigg[\frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg] \\
    =& \; \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg] \\ 
    &+ \frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg]
\end{align*}

\begin{proof}{If (1) $\widehat{P}^R(X)=P^R(X)$,}


Rewriting the first term,
\begin{align*}
    % \mathcal{L}_R&=
    \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg]
    &=\frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{P^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}(Y(x)-\widehat{g}(x))\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}(Y(x)-\widehat{g}(x))\underbrace{\E_{X \sim P^R}[\mathbbm{1}\{X=x\}]}_{P^R(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)(Y(x)-\widehat{g}(x))\Bigg]\\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)-\widehat{g}(X)]] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]]-\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]]
\end{align*}

Rewriting the second term,
\begin{align*}
    % \mathcal{L}_O&=
    \frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg]
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}\widehat{g}(x)\mathbbm{1}\{\widetilde{X}=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}\widehat{g}(x)\underbrace{\E_{\widetilde{X} \sim P^{f^0}}[\mathbbm{1}\{\widetilde{X}=x\}]}_{P^{f^0}(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)\widehat{g}(x)\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]]
    % &=\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[g(X)]
\end{align*}

% \begin{align*}
%     \mathcal{L}_{CPO}&=
%     \mathcal{L}_R+\mathcal{L}_O \\
%     &=\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[Y(X)]-\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[g(X)]+\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[g(X)] \\
%     &=\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[Y(X)]
% \end{align*}
Then we have
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]]-\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]]+\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=V(f)
\end{align*}
\end{proof}

\begin{proof}{If (2) $\widehat{g}(X)=g(X)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]$,}

Rewriting the first term,
\begin{align*}
    % \mathcal{L}_R&=
    \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg]
    &= \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-g(X_i))\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\underbrace{\E_{X \sim P^R}[\mathbbm{1}\{X=x\}]}_{P^R(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\Bigg] \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}\E_{Y(\cdot) \sim \mathcal{G}}[Y(x)-g(x)] \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}(\E_{Y(\cdot) \sim \mathcal{G}}[Y(x)]-\E_{Y(\cdot) \sim \mathcal{G}}[\underbrace{g(x)}_{\E_{Y(\cdot)\sim\mathcal{G}}[Y(x)]}]) \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}\cdot 0 \\
    &=0
\end{align*}

Rewriting the second term,
\begin{align*}
    % \mathcal{L}_O&=
    \frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg]
    &=\frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}g(\widetilde{X}_j)\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}g(x)\mathbbm{1}\{\widetilde{X}=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}g(x)\underbrace{\E_{\widetilde{X} \sim P^{f^0}}[\mathbbm{1}\{\widetilde{X}=x\}]}_{P^{f^0}(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)g(x)\Bigg] \\
    &=\sum_{x \in \mathcal{X}}P^f(x)g(x) \\
    &=\E_{X \sim P^f}[\underbrace{g(X)}_{\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]}] \\
    &=\E_{X \sim P^f}[\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]]
\end{align*}

Then we have
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]
    &=0+\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=V(f)
\end{align*}

\end{proof}

\section{Parallels between RLHF and O-CPO}
\label{sec:rlhf_ties}

The loss function under RLHF is typically computed through proximal policy optimization (PPO):
\begin{align*}
    \mathcal{L}(\theta, \phi) = \mathcal{L}^{PPO}_{\text{policy}}(\theta) + c_1\mathcal{L}^{PPO}_{\text{value}}(\phi) - c_2\mathcal{L}^{PPO}_{\text{entropy}}(\theta)
\end{align*}

where $\mathcal{L}^{PPO}_{\text{value}}(\phi)$ and $\mathcal{L}^{PPO}_{\text{entropy}}(\theta)$ are regularization terms and $\mathcal{L}^{PPO}_{\text{policy}}(\theta)$ is the \textit{policy loss}. Letting $p$ denote the prompt, $c$ denote the completion, and $r$ denote the reward model, we consider only the policy loss without any stability tricks like clipping. \vl{Include clipped version too?} $\pi_\theta$ is the probability under the policy being optimized, while $\pi_{\theta_0}$ is the probability under a reference policy (often the starting policy or the policy at the previous step).
\begin{equation*}
    \mathcal{L}^{PPO}_{\text{policy}}(\theta)=
    \E\left[\frac{\pi_\theta(c|p)}{\pi_{\theta_0}(c|p)}\cdot r(p,c)\right]
\end{equation*}

We can see the equivalence between $\pi_\theta(c|p)$ and $P^f(X)$, $\pi_{\theta_0}(c|p)$ and $P^{f^0}(X)$, and $r(p,c)$ and $g(X)$; substituting these terms renders $\mathcal{L}^{PPO}_{\text{policy}}$ equal to $V_{out}(f)$.

\section{Experiments}

\subsection{GPT-4 Win Rate Prompts [TODO]}
\label{sec:gpt_win_rate_questions}

\subsection{Human Agreement Study [TODO]}
\label{sec:gpt4_annotation_details}

\section{Additional Results}
\label{sec:additional_results}

\begin{table*}[!ht]
    \centering
    \begin{tabular}{c|cc|cc}
    \toprule
    \toprule
    & \multicolumn{2}{c|}{Hong Kong} & \multicolumn{2}{c}{Hate Speech} \\
        & IPW-CPO win rate & DR-CPO win rate & IPW-CPO win rate & DR-CPO win rate \\
    \midrule
        FT & \textbf{\textcolor{green!50!black}{0.528* [0.506, 0.550]}} & 0.477 [0.455, 0.499] & 0.517 [0.495, 0.539] & \textbf{\textcolor{green!50!black}{0.524* [0.502, 0.546]}} \\
        IPW-CPO & - & 0.441 [0.419, 0.463] & - & 0.518 [0.496, 0.540] \\
        O-CPO & 0.542* [0.520, 0.564] & 0.482 [0.460, 0.504] & 0.538* [0.516, 0.560] & 0.560* [0.538, 0.582] \\
        DR-CPO & 0.559* [0.537, 0.581] & - & 0.482 [0.460, 0.504] & -\\
    \bottomrule
    \bottomrule
    \end{tabular}
    \caption{IPW-CPO and DR-CPO win rates against fine-tuning and CPO variants. A win rate exceeding 0.5 indicates that the named method outperforms the competing method with respect to the target outcome. Win rates are computed across 2000 pairs for each method combination.}
    \label{tab:win_rates}
\end{table*}

\begin{table}[!ht]
    \centering
    \begin{tabular}{c|c}
    \toprule
    \toprule
    & Unconfounded win rate over confounded \\
    \midrule
    O-CPO & \textbf{\textcolor{green!50!black}{0.546* [0.506, 0.586]}} \\
    DR-CPO & 0.456* [0.416, 0.496] \\
    IPW-CPO & 0.505 [0.465, 0.545] \\
    \bottomrule
    \bottomrule
    \end{tabular}
    \caption{CPO win rates with outcome models trained on unconfounded data (Hong Kong) vs.\ confounded data (Confounded). A win rate exceeding 0.5 indicates that CPO+unconfounded outcome model outperforms CPO+confounded outcome model with respect to the target outcome. Win rates are computed across 600 pairs for each method combination.}
    \label{tab:confounding_win_rates}
\end{table}


\end{document}
