\documentclass{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{bbm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\allowdisplaybreaks

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example


\newcommand{\vl}[1]{\textcolor{orange}{[VL: #1]}}

\newcommand{\eli}[1]{\textcolor{orange}{[Eli: #1]}}

\newcommand{\E}{\mathbb{E}}
\newcommand{\R}{\mathbbm{R}}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


\title{Optimizing Language Models for Human Preferences \\ is a Causal Inference Problem}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Harry~Q.~Bovik}
\author[1,2]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[1]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cranberry University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Second Affiliation\\
    Address\\
    …
}
\affil[3]{%
    Another Affiliation\\
    Address\\
    …
  }
  
\begin{document}
\maketitle

\begin{abstract}
  As large language models see greater use in academic and commercial settings, there is increasing interest in methods that allow language models to generate texts aligned with human preferences. In this paper, we propose \textit{causal preference optimization} (CPO), a novel causal formulation for optimizing language models for human preferences. A language model optimized with CPO contains (in theory) a causal guarantee: text generated by the model \textit{causes} the reader's response to it. Motivated by principles from statistical causal inference, we derive three solutions to this optimization problem based on outcome modeling, importance weighting, and a \textit{doubly robust} combination of the two; and we show that this doubly robust solution is provably unbiased for the true optimization problem in our proposed data setting. Finally, we empirically demonstrate the effectiveness of CPO in optimizing for human preferences on unpaired data, and we validate the double robustness of DR-CPO under difficult confounding conditions.
\end{abstract}

\section{Introduction}
\label{sec:intro}

\vl{DIAGRAM IDEA: RLHF/DPO vs. CPO? Paired optimization vs. unpaired optimization?}

Recent advances in computation have yielded large-scale self-supervised language models that achieve impressive performance on a variety of natural language processing (NLP) tasks \citep{zhang2022opt, chowdhery2023palm, lescao2023bloom, bubeck2023sparks}. These large language models (LLMs)---trained on vast amounts of text data of varying quality---can acquire less desirable attributes from these texts, and so they often require further fine-tuning on human preferences to improve their factual correctness and alignment with social values (e.g., less toxic, more helpful) \citep{NEURIPS2022_b1efde53, bommasani2022opportunities}. 

Given the rising popularity of LLMs in both academic and commercial applications, there is great interest in methods that help language models learn and generate texts according to human preferences. To this end, reinforcement learning from human feedback (RLHF) has seen widespread use \citep{christiano2017rlhf, stiennon2020summarize, touvron2023llama}. In recent months, works that learn more directly from human preference data have emerged \citep{an2023direct, hejna2023contrastive, dumoulin2024density}---the most notable of which is direct preference optimization (DPO) \citep{rafailov2023direct}.

% With the rising popularity of large language models (LLMs), there is great interest in methods that help language models learn (and generate texts according to) human preferences. [Users may want to train LMs to be less toxic, more helpful, etc.] To this end, reinforcement learning from human feedback (RLHF) saw initial widespread use, but in recent months, works that bypass the requirement of reward modeling have emerged, most notably Direct Preference Optimization (DPO).

When a language model $f$ is trained or fine-tuned to generate texts that are consistent with human preferences, the implicit goal can be seen as optimizing texts $X \sim P^f$ with respect to some outcome $Y$. In current practice, this outcome $Y$ is typically an indicator of whether a user preferred or did not prefer a text previously generated by the model, under a forced-choice data setting in which the user is shown two possible generations. 

In this paper, we propose a more general formulation of the optimization problem that bypasses the requirement of forced-choice data. Instead, we consider a data format in which $Y$ is any general reaction of the user to the texts (e.g., ratings, either binary or scalar). Following this construction, we show that optimizing language models according to human feedback is actually a causal inference problem, which we call \textit{causal preference optimization} (CPO). The CPO problem asks the following question: how do we intervene on the text distribution of the generating model to best \textit{cause} an optimal outcome (i.e., to maximize the generation of human-preferred texts)?

Building on this causal view, we derive three solutions to the CPO problem that reflect outcome modeling (O-CPO), importance weighting (IPW-CPO), and doubly robust (DR-CPO) estimators from the statistical causal inference literature. We illustrate that the prevailing LLM optimization methods, RLHF and DPO, closely relate to O-CPO and IPW-CPO, respectively, with the RLHF objective in particular being mathematically equivalent to O-CPO. Finally, we show that the DR-CPO objective corrects bias that can arise from outcome modeling solutions like O-CPO and RLHF---and is in fact provably unbiased for the true optimization problem under our proposed data setting.
% robust estimators of statistical causal inference. We further illustrate that the prevailing LLM optimization methods, RLHF and DPO, are solutions to the same optimization problem and mirror the traditional causal inference notions of IPW and outcome model-based estimators.
\eli{At this point I do not know what the proposed data setting is...}

We emphasize that the causal formulation of CPO---in addition to being of theoretical interest---confers a strong practical benefit: it ensures that the relationship between the text and the outcome is not \textit{confounded} by external factors that may influence both a person's tendency to read certain texts and their preference for those texts. 

As an evaluation of CPO under these circumstances---and unconfounded ones---we empirically assess the ability of CPO to optimize state-of-the-art LLMs for specific outcomes across two datasets without confounding and one with confounding. In keeping with our problem description, these datasets are not paired completion datasets but instead contain a single numerical outcome for every text. Comparing texts generated across optimized models, we find that CPO is successful in optimizing LLMs for human preferences directly from outcome data, without requiring a forced-choice experiment or paired completions. Additionally, we empirically demonstrate the robustness and bias-correction capabilities of DR-CPO, as it remains performant under confounding while O-CPO is affected. 

\vl{Are these the key results we should be highlighting?}
\eli{Mention the need for an experiment/crowd sourced data at some point in the intro?}
% [This opens up a wider range of existing datasets that can be used for optimization.]

% [The CPO problem is further distinguished from existing LM optimization approaches by its general formulation that bypasses the typical requirement of forced-choice data with paired preferred/non-preferred completions for a prompt, instead requiring only a numerical (binary or continuous) \textit{outcome} for each text. This allows us to broaden our view of what constitutes a ``preference''---for instance, not only whether a text is good or bad but whether it]

% [We evaluate the ability of CPO to optimize SoTA LLMs for specific outcomes across three datasets and find that it is successful in doing so, even under difficult confounding conditions.]

% We evaluate the doubly robust CPO against its IPW and outcome model-based counterparts and find that it outperforms them in both automatic and human evaluations of preferred texts...

\section{Related Work}
\label{sec:related_work}

% [CPO is contextualized within a wider body of ]

\subsection{Language Model Optimization}

The performance of large self-supervised language models can be further improved by fine-tuning on datasets that align them with \textit{human-preferred} text \citep{NEURIPS2022_b1efde53, bommasani2022opportunities}. These datasets are typically made up of prompts followed by a pair of completions, one of which is indicated to be human-preferred \citep{pmlr-v162-ethayarajh22a, bai2022training, ji2023beavertails}. A reinforcement learning algorithm may then derive its reward model from these human preferences (reinforcement learning from human feedback, or RLHF) \citep{christiano2017rlhf}, after which language models are fine-tuned to maximize the human preference reward under the RLHF algorithm. This optimizes the language model for the human preferences encoded in the dataset.

RLHF is computationally demanding, as its training loop requires that new texts be generated and new rewards be computed at each step. 
% Despite this, the majority of the prevailing large language models---both academic and commercial---have been optimized using RLHF.
Consequently, an alternative approach (direct preference optimization, or DPO) has gained popularity in recent months for its ability to optimize language models directly on the human preference data, without having to train a reward model or generate new texts \citep{rafailov2023direct}. Similarly to RLHF, DPO is designed for use with paired prompt-completion datasets, maximizing the probability ratio of preferred completions to non-preferred completions over the preference dataset.

% [Within the space of language models, we mostly have RLHF and more recently DPO...]

% RLHF fine-tunes a language model for human feedback via reward modeling, where the reward is a proxy for human feedback (i.e., the outcome) and the goal of the model is to obtain larger rewards.

\subsection{Causal Inference and Double Robustness}

Although RLHF and DPO constitute the two most popular optimization approaches for language models, there exists a wide body of work on estimation and policy learning outside of the NLP space. Previous work directly relevant to this paper includes a long history of doubly robust estimation of causal effects [CITE] and---more directly applicable---doubly robust policy learning \citep{dudik2011doublyrobust, pmlr-v48-jiang16, Tang*2020Doubly, pmlr-v162-kallus22a}.

In causal inference, double robustness denotes an estimator formulation that provides robustness against misspecification of \textit{nuisance} parameters or functions. In particular, doubly robust estimators combine two existing estimators---an \textit{importance weighting} estimator and an \textit{outcome modeling} estimator---such that only one of the two components must be correctly specified or estimated to guarantee the unbiasedness of the estimator \citep{Robins1994, chernozhukov_locally_2022}. This can also be viewed as the importance weighting term providing a \textit{bias correction} for the outcome modeling term.

\vl{How much more detail should we give? Would it be helpful to have this written out mathematically with the ATE as an example, or is that too much detail/prelim?}

This principle can then be extended to not only the estimation of causal effects but the estimation of any quantity, including loss functions and policy objectives. \vl{... examples?}

% [But outside the space of language models and NLP, there's been lots of work on estimation and optimization. For the purposes of this paper, some previous work of note includes doubly robust estimation of causal effects and---more directly relevant---doubly robust optimization in traditional RL settings...]

\eli{Could just keep this tight and short?}

\section{A Causal View of Language Model Optimization}
\label{sec:causal_formulation}

% \eli{I think the above should go more in related work, esp the forced choice part, and then start below}

% Formally, let $\mathcal{X}$ be the space of texts, and let $\mathcal{G}$ be the space of potential outcomes (under the potential outcomes framework of causal inference) for those texts. Then the potential outcomes are given by $\{Y(x): \mathcal{X} \rightarrow \mathbb{R} \; | \; x \in \mathcal{X}\} \sim \mathcal{G}$. We define the value function $V(f)$:
% \begin{equation}
%     V(f)=\E_{X \sim P^f}[\E_{Y(\cdot)\sim \mathcal{G}}[Y(X)]]
% \end{equation}

Let $\mathcal{X}$ represent the space of texts.
We follow the potential outcomes framework \citep{neyman1923, rubin1974}: for each individual $i$ we posit the existence of a \emph{potential outcome function} 
$Y_i:\mathcal{X} \to \R$, where $Y_i(x)$ encodes their potential real-valued response if given text $x$.
This notation implicitly rules out the possibility that an individual's responses can be affected by the texts given to others.
We assume that we sample individuals from a population $\mathcal{G}$ so that the set of potential outcomes $\{Y(x)\; | \; x \in \mathcal{X}\} \sim \mathcal{G}$. We define $g(x) \equiv \E_{Y(\cdot)\sim\mathcal{G}}[Y(x)]$ as the average outcome if all individuals in the population were given text $x$.

With this setup, our goal is to find a language model $f$ that leads to high outcomes $Y$ on average across the population of individuals $\mathcal{G}$ and across the texts generated by the model, whose distribution we denote as $P^f$. We encode the quality of a generative text model $f$ via its value function
\begin{equation}
    V(f) \equiv \E_{X \sim P^f}[\E_{Y(\cdot)\sim \mathcal{G}}[Y(X)]] = \E_{X \sim P^f}[g(X)].
\end{equation}

Then the \textit{causal preference optimization} (CPO) problem is to find the language model $f$ that maximizes the value function:
\begin{equation}
\label{eq:true_optimization}
    \underset{f}{\arg\max}\; V(f) = \underset{f}{\arg\max} \; \E_{X \sim P^f}[\E_{Y(\cdot)\sim \mathcal{G}}[Y(X)]].
\end{equation}

By reframing the problem to directly optimize the scalar outcome rather than choosing between preferred and non-preferred texts, we (1) allow for the use of a much wider range of numerical outcome data rather than restricting to paired completion datasets, (2) account for the \textit{intensity} or \textit{degree} of a text's outcome rather than simply whether it is preferred or not, and (3) broaden the scope of problems that can be addressed by learning from human feedback. This is especially significant in the case of tasks or problems with more complex outcomes that do not lend themselves intuitively to preference pairs---for instance, optimizing a language model to unlearn copyrighted material or private information. 

\vl{What I mean by this is that in the case of copyrighted material or private information, the paired data typically looks like ([copyrighted material], ``I can't provide that information''), with the preferred text being the latter. So rather than truly unlearning the copyrighted material, the LLM is just trained to preferentially generate an ``I don't know''-type answer. CPO, on the other hand, explicitly decreases the probability of the copyrighted material (in theory all the way to 0) without telling it an alternative it should be generating instead. Moreover, in theory CPO doesn't require any positive examples at all---you could just give it a dataset of copyrighted material, all with outcome -1 (i.e., do not want), and optimize it to ``forget'' all that copyrighted material, again without telling the LLM what it should be doing instead. So it's not just removing the technical/logistical requirement of having to construct paired completion data but also like... it's conceptually more aligned with what unlearning should actually be? This applies to learning a specific outcome with only positive examples as well, but in that case I guess you could just fine-tune directly on that text data. I feel like this is an important discussion point, but I don't know where to put it.}

% Moreover, the CPO problem is further distinguished from existing language model optimization approaches by its general formulation that bypasses the typical requirement of forced-choice data, instead requiring only a numerical (binary or continuous) \textit{outcome} for each text. This allows us to broaden our view of what constitutes a ``preference,'' with potential use cases including not only learning desired outcomes but also 

% \eli{Add a few discussion points here about why we would want to do this, maybe with some examples of texts $x$ and outcomes $y$ from the evaluations or in general?} [TODO, BUT MAYBE THIS WOULD BE IN THE INTRO]

\subsection{Why is this Causal?}

Datasets where users provide a numerical response to a text are common in NLP (e.g., Reddit upvotes, Amazon ratings, etc.). Though some relationship can typically be inferred between the content of a text passage and a reader's response to it, this relationship generally cannot be assumed to be causal due to the potential presence of \textit{confounders}: factors that influence both a reader's choice of texts to read and how they might tend to respond to a text (e.g., demographic attributes, personal beliefs, prior experiences).
% \eli{Add something about how optimizing from a confounded dataset can go wrong? Need to complete the thought: ``There can be confound`` $\rightarrow$ ``that can mess up the language model''} 

Because the response is no longer solely a function of the text under confounding, the ability of language models to correctly optimize for the response may be compromised. Confounding may induce certain styles of text to be more common for desired outcomes, leading the language model to exclusively generate text in that style \vl{(e.g., ?????)}. Moreover, reward models trained on confounded data may be misspecified, since they can only capture the relationship between the text and the response---and not the confounders that have additionally influenced the response.

CPO instead uses datasets that are labeled through crowdsourced annotation on large online platforms, in which annotators are \textit{randomly assigned} to read and react to texts (i.e., a randomized experiment). The random assignment mechanism removes all possible confounding, since there are no longer any factors influencing which texts the annotator reads---thereby providing a causal guarantee: within this dataset, the content of the text must be the sole factor that causes the reader's response. 

Consequently, a language model that optimizes the outcome or response over a randomized dataset will learn to produce text that \textit{causes} that outcome, without susceptibility to confounding. Therefore, our model optimization problem becomes the following causal inference problem: how do we \textit{intervene} on the distribution of the text-generating model to best cause an optimal outcome on average---in this case, the production of human-preferred text?

% [these types of text/rating datasets are common in nlp and generally obtained from crowdsourcing. due to the nature of crowdsourcing, this is basically a randomized experiment: people are randomly assigned to read/react to texts. this removes all possible confounding (factors that influence both ppl's choice to read a text and how they might tend to react to that text) because the random assignment ensures that external factors cannot influence people to read particular texts. therefore, this gives us a \textit{causal guarantee}: within this randomized dataset, we know that the content of the text is *causing* the observed outcome. consequently, a language model that optimizes the outcome over this dataset should produce text that *causes* the optimal outcome.]

\section{Causal Preference Optimization}
\label{sec:cpo}
% \eli{Include randomized experiments and/or crowdsourced data in the title somewhere?}

% \eli{One big thing that gets lost here: what is "CPO"? Is the double robust version? Or are we describing "O-CPO", "IPW-CPO", "DR-CPO", } \vl{I guess more the latter}

Reframing our optimization problem as a causal inference problem allows us to draw on solutions from statistical causal inference: in particular, the use of randomized experiments to identify causal effects.
We formalize such a randomized experiment and/or crowdsourced annotated dataset as $\mathcal{D}_R = \{(X_1, Y_1),\ldots, (X_n, Y_n)\}$, where texts $X_i$ are drawn i.i.d.\ from a randomization distribution $P^R$ and individuals with potential outcome functions $Y_i(\cdot)$ are drawn i.i.d.\ from the population $\mathcal{G}$.
This induces a distribution on the observed responses $Y_i = Y_i(X_i)$ that we denote as $P^R_y$.


% in particular, the notion of the \textit{doubly-robust estimator} \citep{Robins1994, chernozhukov_locally_2022}. Following the convention of causal inference, we identify $V(f)$ in terms of observable data. 

\subsection{Identifying the Value of a Language Model}
\label{sec:identification}

% We consider an observed crowdsourced dataset $\mathcal{D}_R$ with texts $X_i \sim P^R$ and their corresponding outcomes $Y_i \sim P^R_y$, where observed $Y_i$ are equal to their potential outcomes $Y_i(X_i)$. Then letting $g(x)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(x)]$, 
The value of the language model 
$V(f)$ is a causal quantity that involves the \emph{potential outcomes} for all of the units, some of which are unobserved.
However, we can link the value function to the randomization dataset $\mathcal{D}_R$ by writing it in the following ways:
\begin{align*}
    V(f)=&\;\E_{X\sim P^f}[g(X)]  \tag*{(\text{$V_{out}$})} \\
    =&\;\E_{X \sim P^R}\Bigg[\E_{Y \sim P^R_y}\Bigg[\frac{P^f(X)}{P^R(X)}Y\Bigg]\Bigg] \tag*{(\text{$V_{IPW}$})} \\
    =&\;\E_{X\sim P^f}[g(X)]+ \\
    &\;\E_{X \sim P^R}\Bigg[\E_{Y \sim P^R_y}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] \tag*{(\text{$V_{DR}$})}
\end{align*}

In other words, the value function $V(f)$ can be identified from the observed data in multiple ways. With \textit{outcome modeling} ($V_{out}$), texts $X$ are generated from $P^f$ and outcomes are computed by a model $g(X)$, such that the optimization problem is to maximize the predicted outcome. With \textit{importance weighting} ($V_{IPW}$), or IPW, observed outcomes $Y \sim P^R_y$ are weighted by the density ratios between texts drawn from the model $X \sim P^f$ and texts drawn from the randomization distribution  $X \sim P^R$; this approximates the average outcome under $P^f$.
% which are then maximized under optimization. 
Finally, we use a \textit{doubly robust} construction ($V_{DR}$) that combines outcome modeling and IPW to provide robustness against misspecification or mis-estimation within either term---akin to doubly robust estimators that serve the same purpose when estimating causal effects. 

Note that we do not know the true model $g(x)$, and so will need to estimate it. However, we do know the randomization distribution in our study $P^R$: because texts are drawn at random, $P^R(X)$ is known to be $\frac{1}{n}$.\footnote{In practice, it can still be empirically helpful to use a model-derived $\widehat{P}^R(X)$.} \eli{It doesn't have to be $\frac{1}{n}$ right? E.g. in the Hong Kong example it isn't} Therefore, the IPW construction does not require additional estimation.

% [In the next section, we describe these doubly robust properties and the ways in which they can guarantee the unbiasedness of an estimator for $V(f)$.]


\subsection{Estimation: CPO in Practice}
% \eli{Note that I removed the hats from $P^R$} \vl{Can I bring them back? I think it's fine to state up front that $\widehat{P}^R=P^R$ in our experiments, but it makes things confusing when we're talking about one of the conditions of double robustness is $\widehat{P}^R=P^R$ but $\widehat{P}^R$ doesn't appear in the $\widehat{V}_{DR}(f)$ term.... or should I just rephrase that condition as ``$P^R$ must be known''?}

In the previous section we showed how to write the causal quantity $V(f)$ in terms of observable data. Now we focus on estimating it in practice. 

First, the importance weighting value function $V_{IPW}(f)$ can be estimated directly from the crowdsourced data $\mathcal{D}_R$ as follows (recall that $X_i\sim P^R, Y_i \sim P^R_y$):
\begin{equation*}
    \widehat{V}_{IPW}(f)=\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}Y_i    
\end{equation*}

Next, consider the outcome modeling value function $V_{out}(f)=\E_{X \sim P^f}[g(X)]$.
Even if we were to know the true causal outcome model $g$, it is difficult to optimize $V_{out}(f)$
% the original outcome modeling value function $V_{out}(f)=E_{X \sim P^f}[g(X)]$, 
as it requires that texts be drawn from the language model $f$ \textit{as $f$ is being updated}. To remedy this, we re-write $V_{out}(f)$ in terms of a fixed language model $f^0$:\footnote{We show this to be mathematically equivalent in Appendix~\ref{sec:v_out_rewritten}.}
\begin{equation*}
    V_{out}(f)=\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde X)}{P^{f^0}(\widetilde X)}g(\widetilde X)\Bigg],
\end{equation*}
where $P^{f^0}$ denotes the distribution over texts from language model $f^0$.

We can create a Monte Carlo estimate of this by drawing texts $\widetilde{X}_1, \ldots, \widetilde{X}_m \sim P^{f^0}$ and computing
\begin{equation*}
    \widehat{V}_{out}(f)=\frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j),
\end{equation*}
where $\widehat{g}(x)$ is a model trained to predict $Y$ from $X$ and $f^0$ is any generative language model. 
% \eli{This could be more explicit. Part of the key is that $\hat{g}$ could be bad because it was fit on confounded data. Is that the setup we want to consider? We have a larger dataset that we can train $\hat{g}$ on? Or are we always fitting $\hat{g}$ on the randomization dataset, where it will be unconfounded?}
% Note that because the texts are generated from $f^0$, $P^{f^0}$ can be computed from the same language model and is therefore known rather than estimated.

Finally, the doubly robust value function can be estimated as a combination of these two terms:
\begin{align*}
    \widehat{V}_{DR}(f)=&\;\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}(Y_i-\widehat{g}(X_i)) + \\
    &\;\frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)
\end{align*}

Each of these value functions may be used to solve the CPO problem in practice. In the remainder of this paper, we refer to outcome modeling-based CPO (i.e., using $\widehat{V}_{out}$) as O-CPO; IPW-based CPO (i.e., using $\widehat{V}_{IPW}$) as IPW-CPO; and doubly robust CPO (i.e., using $\widehat{V}_{DR}$) as DR-CPO.

\subsection{Outcome Modeling, IPW, and Relationship to Current Approaches}

% \eli{I think the framing and organization can be sharpened a bit as follows.  Have Section 4.3 be about the limitations of outcome modelling and IPW (and their connections to RLHF and DPO) 
%  I might characterize it as this:
% \begin{itemize}
%     \item Outcome modeling: doesn't require an experiment, but can be entirely wrong due to confounding
%     \item IPW: unbiased, but can have high variance and poor generalization properties because experiments are often small 
% \end{itemize}
% Then Section 4 is about how the DR approach addresses this. I think it can be framed explicitly as starting with the outcome modelling approach and then only using the experiment to correct for any bias in the outcome modelling approach. So we can even use externally fit outcome models that have some confounding, and it will still be ok.
% }

Each value function estimator can be useful under different circumstances. Outcome modeling does not require a text experiment, but if $\widehat{g}(X)$ is not a good outcome model---for instance, due to confounding---then $\widehat{V}_{out}(f)$ will be biased with respect to $V(f)$. 
% We explore such a setting in our experiments and show that models optimized with $\widehat{V}_{out}(f)$ degrade under confounding. 
On the other hand, importance weighting is unbiased given a text experiment and does not require an outcome model, but $\widehat{V}_{IPW}(f)$ can have high variance and yield language models with poor generalizability due to the limited amount of experimental data available.

Such limitations are reflected in two of the most prominent language model optimization approaches: RLHF and DPO, which relate closely to O-CPO and IPW-CPO, respectively. RLHF can be seen as a special case of O-CPO in which the data used to learn the reward model comes specifically from a \textit{forced choice experiment} where users are asked to choose the better of two completions for a prompt. Under these conditions, the RLHF reward model is equivalent to the O-CPO outcome model, and the RLHF policy loss is mathematically equivalent to the O-CPO value function $V_{out}(f)$ (details in Appendix~\ref{sec:rlhf_ties}).

Likewise, DPO is similar to a restricted form of IPW-CPO in which preference data (like with RLHF) comes from a forced choice experiment in which users are asked to choose the better of two completions for a prompt. Both DPO and IPW-CPO fine-tune a language model for human feedback by directly using a preference dataset rather than relying on reward modeling. The DPO objective shares certain similarities with $V_{IPW}$---most notably in that it directly increases the likelihood of texts corresponding to desired outcomes through importance weighting. However, because of the paired nature of the data, DPO increases the density ratio between preferred and non-preferred examples, while IPW-CPO directly increases the probability of texts with desired outcomes and decreases the probability of texts with non-desired outcomes. Given a paired data setting, the DPO objective could likely be recovered from $V_{IPW}$; we leave this derivation for future work.

These parallels mean that RLHF and DPO are subject to the same limitations that we describe for outcome modeling and importance weighting more generally. That is, RLHF can degrade under confounding or misspecification of its reward model, while DPO can suffer from high variance and poor generalization resulting from the size or coverage of its preference dataset.
% [While outcome modeling with $\widehat{V}_{out}(f)$ and importance weighting with $\widehat{V}_{IPW}(f)$ can be useful, both can easily result in poor optimization of the language model. Outcome modeling does not require a text experiment, but if $\widehat{g}(X)$ is not a good outcome model, then $\widehat{V}_{out}(f)$ will be biased with respect to $V(f)$. In fact, it is not unlikely that the outcome model will be incorrect when learned from real-world data---often due to confounding. 

% Under confounding, the outcome is not fully caused by the text, and so a model trained to predict the outcome from the text will inevitably be misspecified, since it cannot capture the confounders that have influenced the outcome in addition to the content of the text. [We explore such a setting in our experiments and show that models optimized with $\widehat{V}_{out}(f)$ degrade under confounding, while models optimized with $\widehat{V}_{DR}(f)$ remain unaffected.]

% [Importance weighting is unbiased given a text experiment, but optimization is limited to the text data from the experiment.] \vl{Are these the right pros/cons to be mentioning?}


% \subsection{CPO corrects estimation bias} 
\subsection{Double Robustness of CPO}
% \vl{Feel like I'm repeating myself a bit in this section... does it read that way?}

The limitations of outcome modeling and importance weighting motivate our doubly robust formulation DR-CPO, which provides (a) robustness against outcome model misspecification \textit{and} (b) generalizability beyond a specific preference dataset or text experiment. 
% At a high level, DR-CPO is unbiased for the true optimization problem as long as \textit{either} (1) the distribution of the preference dataset or text experiment is known, or (2) the ``true'' outcome model is known.

Formally, it can be shown (Appendix \ref{sec:unbiased_dr}) that $\widehat{V}_{DR}(f)$ is an unbiased estimator for $V(f)$ 
under two possible conditions, making it an effective proxy for the true optimization problem in equation \ref{eq:true_optimization}. 

\begin{theorem} Let $\mathcal{D}_R$ be a randomized experiment parameterized by $P^R$, where $\widehat{P}^R$ may be estimated. Let $g(X)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]$, where $\widehat{g}$ may need to be estimated. Then 
\begin{equation*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]=\E_{X \sim P^f}[\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]]=V(f)
\end{equation*}
if \textit{either}
\begin{enumerate}
    \item $\widehat{P}^R(X)=P^R(X)$, or
    \item $\widehat{g}(X)=g(X)$.
\end{enumerate}
\end{theorem}


% Specifically, $\widehat{V}_{DR}(f)$ is unbiased 
% as long as \textit{either}
% % (1) $\widehat{P}^R(X)=P^R(X)$ 
% (1) $P^R(X)$ is known
% or (2) $g(X)$ is known (i.e., $\widehat{g}(X)=g(X)$).
% In other words, it is robust to misspecification of $\widehat{P}^R(X)$ as long as $\widehat{g}(X)$ is correctly specified, and it is robust against misspecification of $\widehat{g}(X)$ as long as $\widehat{P}^R(X)$ is correctly specified.

% \vl{Should we formalize this as a theorem or something?}

Importantly, because $P^R(X)$ \textit{is} known in our proposed experiment/crowdsourced data setting, it does not need to be estimated, and
% , in our proposed data setting, $\widehat{P}^R(X)=P^R(X)$ will always hold: 
% because $\mathcal{D}_R$ is a crowdsourced dataset, texts are drawn at random---so $P^R(X)$ is known to be $\frac{1}{n}$.\footnote{In practice, it can still be empirically helpful to use a model-derived $\widehat{P}^R(X)$ instead of the sample probability $\frac{1}{n}$.} Therefore, 
% CPO will confer the benefits of both IPW and outcome modeling during language model optimization. The outcome model allows for optimization over any body of text, regardless of its source, and the crowdsourced dataset with known $P^R(X)$ ensures that the optimization is unbiased (i.e., that the true optimization problem is still being solved) even if the outcome model is completely incorrect.
the doubly robust construction will guarantee that (a) $\widehat{V}_{DR}(f)$ is robust to misspecification of $\widehat{g}(X)$. In other words, DR-CPO is unbiased (i.e., the true optimization problem is still being solved) \textit{even if the outcome model is incorrect}. Therefore, a poor outcome model will still help DR-CPO (b) generalize beyond the preference data, as the IPW term in the value function corrects for any bias from the predicted outcomes. 

In a data setting where the true $P^R(X)$ is unknown, a well-estimated $\widehat{g}(X)$ that is close to the true $g(X)$ can also help bias-correct any mis-estimation of $\widehat{P}^R(X)$. This may occur, for instance, when no text experiment is available but a large amount of clean data exists to train the outcome model.


% \eli{suggest changing the framing to focus on the fact that DR is unbiased regardless of the outcome model $g$ because we know $P^R$, and then mention that we also if we happened to not know $P^R$ but had a good $g$ it would also work. The section title might then be about ``bias correction'' rather than ``double robustness''}


% Given a crowdsourced dataset, a model trained with CPO is unbiased for the true optimization problem \textit{even if the outcome model} $\widehat{g}(X)$ \textit{is completely incorrect}.


% These two conditions form the basis for the double robustness of $\widehat{V}_{DR}(f)$. Because $\widehat{V}_{DR}(f)$ requires only one of the two conditions to be true in order to be unbiased, CPO with $\widehat{V}_{DR}(f)$

% [This means CPO is helpful in data settings where we might not know if our outcome model is correct Consequently, CPO is beneficial in two situations: (1) when the outcome model $\widehat{g}(X)$ is known to be well-performing


% [TODO: FIX THE REST OF THIS SECTION]



% \subsection{CPO in practice}

% [In practice, what is $P^R$, and what is $P^{f^0}$? Under what conditions would we choose a specific $P^{f^0}$ (e.g., generate from CLM or generate from pretrained)? Do we always use a sample average for $P^R$, or do we sometimes estimate it from the data?]


% \subsection{Double robustness of CPO}

% [Under what conditions is CPO advantageous? In particular, under what conditions does each loss term of $\mathcal{L}_{CPO}$ confer robustness against misspecification or mis-estimation (e.g., incorrect reward/outcome model; confounding)?]

% \subsection{Ties to DPO}

% Like the importance weighting component of CPO, DPO fine-tunes a language model for human feedback by directly using a preference dataset rather than relying on reward modeling. As is the case with RLHF, DPO requires preference data to come from a forced choice experiment in which users are asked to choose the better of two completions for a prompt.

% Letting $c_w$ and $c_l$ denote the preferred and non-preferred completions to the prompt $p$, respectively, the DPO objective is given by
% \begin{align*}
%     \mathcal{L}_{DPO}(\theta)=&\;\E\Bigg[\log \sigma \Bigg(\beta \log \frac{\pi_\theta(c_w|p)}{\pi_{\theta_0}(c_w|p)} \\
%     &- \beta\log \frac{\pi_\theta(c_l|p)}{\pi_{\theta_0}(c_l|p)}\Bigg)\Bigg]
% \end{align*}
% where $\pi_\theta$ is again the probability under the model being optimized, while $\pi_{\theta_0}$ is the probability under a reference model.

% As with RLHF, DPO can be seen as a special case of CPO in which the preference data comes from a forced choice experiment in which users are asked to choose the better of two completions for a prompt. Under these conditions, the DPO objective is analogous to the density ratio loss $\mathcal{L}_R$ in CPO. Letting $c_w$ and $c_l$ denote the preferred and non-preferred completions to the prompt $p$, respectively,
% \begin{equation}
%     \mathcal{L}_{DPO}(\theta)=\E\Bigg[\log \sigma \Bigg(\beta \log \frac{\pi_\theta(c_w|p)}{\pi_{\theta_0}(c_w|p)} - \beta\log \frac{\pi_\theta(c_l|p)}{\pi_{\theta_0}(c_l|p)}\Bigg)\Bigg]
% \end{equation}

% The DPO objective shares certain similarities with the $V_{IPW}$ value function---most notably in that it directly increases the likelihood of texts corresponding to desired outcomes through importance weighting. Because of the paired nature of the data, DPO increases the density ratio between preferred and non-preferred examples, while importance-weighted CPO directly increases the probability of texts with desired outcomes and decreases the probability of texts with non-desired outcomes in separate steps, rather than adjusting the probabilities through a ratio. 
% \vl{I think the future work about recovering DPO from a paired version of CPO should go in the discussion rather than here, but I'm not sure what a neat way to tie up this section would be}
% \eli{If this gets folded in to a section about IPW and it's limitations, I think this can be more of a brief aside than a main thing, and so wouldn't need to be neatly tied up. Agreed on the discussion portion}

% \vl{Not exactly sure what's the best way of explaining the parallel, since they use the ratio of the completion probabilities, whereas we multiply by -1 or 1 depending on which outcome is preferred. This has the same effect of increasing $\pi(c_w|p)$ and reducing $\pi(c_l|p)$, but it's not mathematically equivalent---so can we still call DPO a ``special case'' of CPO? Also, I guess they apply the logistic function over the whole thing, which we don't do?}

\section{Experiments}
\label{sec:experiments}

We conduct evaluations to empirically assess the ability of CPO to optimize language models for a human preference outcome, and we examine the doubly robust properties of DR-CPO under confounding.

\subsection{Datasets}

Unlike existing optimization approaches, CPO does not require paired completion data and can instead optimize for human preferences or outcomes over any data where each text has a corresponding numerical response. To evaluate optimization under these conditions, we consider three crowdsourced datasets in which human annotators provided responses to texts.

\textbf{Hate Speech} \textit{(binary outcome).} The Hate Speech dataset \citep{qian-etal-2019-benchmark} consists of comments from the social media sites Reddit and Gab. Outcomes are collected via crowdsourcing and indicate whether the annotator perceives the comment to be hate speech. The Reddit comments are chosen from subreddits where hate speech is more common, and Gab is a platform where users sometimes migrate after being blocked from other social media sites. The optimization goal for this dataset is to generate texts that are \textit{less} hateful \eli{on average?}.

\textbf{Hong Kong} \textit{(scalar outcome).} The Hong Kong dataset \citep{fong2021causal} consists of texts concerning the Hong Kong democracy protests of 2019--2020. These texts are loosely based on speeches made about Hong Kong during U.S. Congressional sessions at the time of the protests. Outcomes are collected via a randomized experiment and indicate to what extent the respondent thinks after reading the text that the U.S. should support Hong Kong during this time. The texts are programmatically constructed: for each text, 2--3 text attributes are chosen out of 7 (e.g., \textit{commitment}, \textit{bravery}, \textit{mistreatment}). Short passages corresponding to each attribute are then randomly chosen from a pool of about 20 to construct the text. The optimization goal for this dataset is to generate texts with \textit{high} outcome.

\textbf{Confounded} \textit{(scalar outcome).} The Confounded dataset is a version of the Hong Kong dataset where we have induced confounding. We consider the strongest possible form of confounding: the confounder is fully correlated with the outcome, resulting in all outcomes being negations of the original outcomes. This dataset is used to train the outcome model in O-CPO and DR-CPO. We include this dataset with the realistic expectation that text-outcome data is often confounded, which poses a threat to outcome model-based approaches. Therefore, it is necessary to evaluate how each variant of CPO fares under confounding. Like the Hong Kong dataset, the optimization goal for this dataset is to generate texts with \textit{high} outcomes.

% \subsection{Evaluation}

\subsection{Implementation}
\label{sec:implementation}

\textbf{Evaluation.} To evaluate the ability of CPO to optimize language models for a specific outcome on unpaired data, we use a text preference framework in which a reader is asked to choose the better (with respect to the outcome) of a pair of texts generated by two different methods. Using GPT-4 as a proxy for human annotators, we compare pairs of (method, baseline) completions to the same prompt; across all pairs, we compute method \textit{win rates} and compute 95\% confidence intervals. Since the datasets used for these experiments were not released in prompt/completion format, we create prompts on the evaluation set by truncating texts to random lengths.

The full input provided to GPT-4 for each dataset can be found in Appendix \ref{sec:gpt_win_rate_questions}. We validate the use of GPT-4 as an annotator with a human study, which we describe in further detail in Section \ref{sec:gpt4_annotation}.


% [TODO: HOW WERE PROMPTS CONSTRUCTED? HOW WERE MODELS TRAINED? could also put this in datasets section at the beginning. datasets were split into train/eval splits... datasets were optimized on training split. then for eval split, since the datasets aren't in prompt/completion format by default, prompts were constructed by truncating texts to a random length.]

% [How did we generate confounding, and what do results on the confounded dataset tell us?]

% \textbf{Confounded Hate Speech?}

% \begin{itemize}
%     \item HK [How was this dataset created? What's different about it compared to natural text?]
%     % \item EmoBank (continuous)
%     \item Hatespeech
%     \item Confounded dataset(s) [How did we generate confounding, and what do results on the confounded dataset tell us?]
% \end{itemize}

\textbf{Methods.} We evaluate language models optimized using the three variants of CPO (\textbf{IPW-CPO}, \textbf{O-CPO}, and \textbf{DR-CPO}), as well as language models that have been fine-tuned on texts from each of the task datasets (\textbf{FT}). We use Llama 2 7B \citep{touvron2023llama} as our base language model and fine-tune with LoRA \citep{hu2022lora}. All CPO optimizations are applied after fine-tuning on text from the task dataset.

\textbf{Choice of $f^0$.} When using O-CPO or DR-CPO, any generative language model may be used as $f^0$, the fixed language model from which texts are drawn as input to the outcome model. One key consideration is whether $f^0$ should be a pre-trained  model or whether it should be a model that has been fine-tuned on text relevant to the task---for instance, the randomized experiment dataset $\mathcal{D}_R$.

% In practice, we use the following general heuristics. 
In practice, because the
% If we are confident that $\widehat{g}(X)\approx g(X)$, or if we feel that the
available fine-tuning data is very limited in size and coverage (e.g., the Hong Kong experiment data), 
% then 
we choose a pre-trained model as $f^0$. With this choice of $f^0$, the generated texts 
% will be 
are
comparatively diverse relative to texts generated by a fine-tuned model. If $\widehat{g}(X)$ is a good outcome model, then predicted outcomes on these texts will still be close to the true outcomes, and O-CPO and DR-CPO will benefit from outcome modeling. 

If $\widehat{g}(X)$ is \textit{not} a good outcome model, then it may instead be beneficial to use a model fine-tuned on $\mathcal{D}_R$ as $f^0$. In this case, if the fine-tuning is successful, then $P^{f^0} \approx P^R$, and consequently $\widehat{V}_{DR}(f)$ will reduce to $\widehat{V}_{IPW}(f)$ \vl{SHOW THIS IN APPENDIX?}, removing the negative influence of the outcome model in DR-CPO. \vl{But why does this matter if the bias-correction of the IPW term in DR-CPO should guarantee that it isn't negatively affected by the bad outcome model anyway?}

% If $\widehat{g}(X)$ is not a good outcome model, then O-CPO will be negatively affected. On the other hand, with DR-CPO, the bias-correction of the IPW component should ensure that the predicted outcomes---while not necessarily useful---do not hurt the overall optimization.

% If we feel that $\widehat{g}(X)$ may not be a very robust outcome model, we choose a fine-tuned model as $f^0$. [By keeping generated texts similar to the texts in the original randomized dataset $\mathcal{D}_R$, we ensure that $\widehat{g}(X)$ will only have to predict outcomes on texts similar to those that it has already seen before, increasing the likelihood that the predictions will be close to the true outcomes...?]
% Practical considerations may inform specific choices of $f^0$. For instance, one key 

\textbf{Choice of $\widehat{P}^R$.} In Section \ref{sec:identification}, we mention briefly that although the distribution of texts under the randomized experiment is known to be $P^R(X)=\frac{1}{n}$ \eli{flagging again -- not always}, it can be helpful empirically to instead compute an estimated $\widehat{P}^R(X)$. This is generally due to the fact that the \textit{sample} probability of each text $X$ may not actually be $\frac{1}{n}$.

We find this to be the case in our experiments, and so we use $\widehat{P}^R(X)$ estimated from a Llama 2 7B model fine-tuned on $\mathcal{D}_R$ in our CPO implementations.

% \subsection{Evaluation}

% \subsubsection{Text preferences}
% \begin{itemize}
%     \item FT model on text from target dataset
%     \item Apply optimization method on FTed model
%     \item Generate completions for the same prompt across all methods/models
%     \item Show GPT-4 pairs of (CPO, [competing method]) completions for the same prompt and ask it to choose the one that is better w.r.t. to the outcome
%     \item Compute overall win rate of CPO against each method
% \end{itemize}

% \subsubsection{Comparing GPT-4 to human annotators}

% [To assess the validity of using GPT-4 as a substitute for human annotators]

% \begin{itemize}
%     \item Show humans the same pairs of (CPO, [competing method]) completions and ask them to choose the one that is better w.r.t. to the outcome
%     \item 30 annotators who each annotate 20 samples. Total of 200 samples, so an average of 3 annotators per samples
%     \item Compute agreement between each human annotator
%     \item Compute agreement between each human annotator and GPT-4
%     \item Compare agreements to see if they are similar
% \end{itemize}

% \subsection{Implementation}

% \eli{Should all of this be under a separate (sub or subsub) section about implementation details?}
% [DISCUSS: When should $f^0$ be FTed vs. pretrained? Good $\widehat{g}(X)$, bad/limited $P^R$: $f^0$ should be pretrained, since this encourages $f$ to be optimized over a wider range of texts (as presumably texts generated by the pretrained model will be more diverse than texts generated by the FTed model). Bad/limited $\widehat{g}(X)$: $f^0$ should be FTed on $P^R$. Since we don't trust $\widehat{g}(X)$'s predictions on text that's very different from text it's already seen (assuming it was trained on $P^R$ as well), we want it to only make predictions on text similar to $P^R$.] \vl{For the bad/limited $\widehat{g}(X)$, is this reasoning correct? What if $\widehat{g}(X)$ wasn't trained on $P^R$? Then wouldn't its predictions be equally bad on $P^R$ and on text generated by the pretrained mode?}
% \eli{I do not understand this  point} \vl{I think this is the same as what you said in an earlier comment about whether we're fitting $\widehat{g}(X)$ on our unconfounded randomization dataset or whether we're fitting it on a larger potentially confounded dataset}

% [Also mention choice to use $\widehat{P}^R$ instead of $\frac{1}{n}$]

\section{Results and Discussion}
\label{sec:results}

\subsection{GPT-4 Annotation Validity}
\label{sec:gpt4_annotation}

\begin{table}[!ht]
    \centering
    \begin{tabular}{ccc}
    \toprule
    \toprule
        Annotator 1 & Annotator 2 & Fleiss' $\kappa$  \\
    \midrule
        Human & Human & 0.170 \\
        Human & GPT-4 & 0.219 \\
        Human majority & GPT-4 & 0.192 \\
    \bottomrule
    \bottomrule
    \end{tabular}
    \caption{Agreement rates of human annotators and GPT-4 when asked to choose preferred texts with respect to target outcomes. We examine inter-human agreement, human-GPT-4 agreement, and agreement between a majority vote of human annotators and GPT-4.}
    \label{tab:human_gpt4_agreement}
\end{table}

Following the precedent set by \cite{rafailov2023direct}, we conduct a human study to assess the validity of GPT-4 as an annotator when choosing between pairs of texts for a preferred outcome. Across 200 randomly sampled examples from the Hong Kong dataset, we show human annotators (method, baseline) completion pairs and ask them to choose the better of the two with respect to the target outcome. We compute agreement between each pair of human annotators, as well as agreement between each human annotator and GPT-4. Agreement is measured through Fleiss' $\kappa$ \citep{fleiss1971measuring}, a common metric for agreement among multiple raters.


\begin{table*}[!ht]
    \centering
    \begin{tabular}{c|cc|cc}
    \toprule
    \toprule
    & \multicolumn{2}{c|}{Hong Kong} & \multicolumn{2}{c}{Hate Speech} \\
        & IPW-CPO win rate & DR-CPO win rate & IPW-CPO win rate & DR-CPO win rate \\
    \midrule
        FT & \textbf{\textcolor{green!50!black}{0.528* [0.506, 0.550]}} & 0.477 [0.455, 0.499] & 0.517 [0.495, 0.539] & \textbf{\textcolor{green!50!black}{0.524* [0.502, 0.546]}} \\
        IPW-CPO & - & 0.441 [0.419, 0.463] & - & 0.518 [0.496, 0.540] \\
        O-CPO & 0.542* [0.520, 0.564] & 0.482 [0.460, 0.504] & 0.538* [0.516, 0.560] & 0.560* [0.538, 0.582] \\
        DR-CPO & 0.559* [0.537, 0.581] & - & 0.482 [0.460, 0.504] & -\\
    \bottomrule
    \bottomrule
    \end{tabular}
    \caption{IPW-CPO and DR-CPO win rates against fine-tuning and CPO variants. A win rate exceeding 0.5 indicates that the named method outperforms the competing method with respect to the target outcome. Win rates are computed across 2000 pairs for each method combination.}
    \label{tab:win_rates}
\end{table*}

\begin{table}[!ht]
    \centering
    \begin{tabular}{c|c}
    \toprule
    \toprule
    & Unconfounded win rate over confounded \\
    \midrule
    O-CPO & \textbf{\textcolor{green!50!black}{0.546* [0.506, 0.586]}} \\
    DR-CPO & 0.456* [0.416, 0.496] \\
    IPW-CPO & 0.505 [0.465, 0.545] \\
    \bottomrule
    \bottomrule
    \end{tabular}
    \caption{CPO win rates with outcome models trained on unconfounded data (Hong Kong) vs. confounded data (Confounded). A win rate exceeding 0.5 indicates that CPO+unconfounded outcome model outperforms CPO+confounded outcome model with respect to the target outcome. Win rates are computed across 600 pairs for each method combination.}
    \label{tab:confounding_win_rates}
\end{table}

We use the online research platform Prolific\footnote{\url{https://www.prolific.co/}} to conduct our human study. To avoid annotator fatigue, examples are annotated in batches of 20. We recruit a total of 30 annotators for an average of 3 annotators per example and a total of 600 annotations. Additional details about the study can be found in Appendix \ref{sec:gpt4_annotation_details}.

Across three comparisons---human--human, human--GPT-4, and human majority vote--GPT-4---we find that GPT-4 agrees with human annotators at least as well as human annotators agree with each other (Table \ref{tab:human_gpt4_agreement}). We conclude that GPT-4 is a reasonable surrogate for human annotators.

% We judge GPT-4 to be a valid surrogate for human annotators if human-GPT-4 agreement is similar to or better than inter-human agreement.

% [Human-human agreement is similar to Human-GPT4 agreement, so we conclude that GPT-4 is a reasonable substitute for human annotators]



\subsection{Text Preferences}


We report win rates across a variety of method pairs in Tables \ref{tab:win_rates} and \ref{tab:confounding_win_rates}. In particular, we examine the win rates of CPO against language models fine-tuned on the text (FT), the win rates of DR-CPO against IPW-CPO and O-CPO, and the win rates of methods under confounding.

\textbf{Outcome optimization} (Table \ref{tab:win_rates}). On the Hate Speech dataset, we observe that DR-CPO outperforms the fine-tuned model (FT) and O-CPO. DR-CPO achieves statistically significant win rates against both FT and O-CPO, with the lower bound of its 95\% confidence intervals falling above 0.5. Against IPW-CPO, DR-CPO's win rate is statistically indistinguishable from 0.5, though the point estimate suggests that DR-CPO may be the better performer. 

Since CPO is applied on the fine-tuned model, DR-CPO being preferred over FT suggests that DR-CPO is successfully optimizing language models for the Hate Speech dataset target outcome: making texts less hateful. Moreover, DR-CPO being preferred over O-CPO and statistically indistinguishable from IPW-CPO provides empirical evidence for the doubly robust properties of DR-CPO, wherein a good $\widehat{P}^R$ provides bias-correction against a poorer $\widehat{g}(X)$.
% This is significant for a number of reasons. First, as we mention in Section \ref{sec:implementation}, the base model for our optimization is Llama-2-7b---an LLM that has already been trained with extensive RLHF for 

On the Hong Kong dataset, we see instead that IPW-CPO is preferred over FT and the other CPO variants, O-CPO and DR-CPO. In this case, IPW-CPO achieves statistically significant win rates against all three competing methods, with the lower bound of its 95\% confidence intervals falling above 0.5.

The success of IPW-CPO in this setting suggests that it can be very strong under conditions where $P^R$ is well controlled or can be estimated very well---as is the case for the Hong Kong dataset, where texts are not only randomly assigned to annotators but programmatically generated from random attributes. Furthermore, we note that outcome models trained on the Hong Kong dataset do not achieve good performance or generalization outside of the training data, possibly due to the artificial nature and relative homogeneity of the texts. Therefore, our results are consistent with the hypothesis that under conditions where the outcome model is known to be poor, both O-CPO and DR-CPO---in spite of the latter's theoretical robustness---may empirically fall short of IPW-CPO.

\textbf{Double robustness under confounding} (Table \ref{tab:confounding_win_rates}). Finally, on the Confounded dataset, we find that O-CPO degrades significantly under confounding, while DR-CPO does not. (IPW-CPO is not affected by confounding because it does not use an outcome model, but we train two separate models for the sake of comparison.) When provided an unconfounded outcome model, O-CPO achieves a win rate statistically significantly above 0.5 compared to O-CPO with a confounded outcome model. In contrast, DR-CPO is not negatively impacted by a confounded outcome model.

These results further illustrate the doubly robust properties of DR-CPO and the weaknesses of outcome modeling approaches. Even under aggressive confounding, with a worst-case outcome model that has been trained on completely negated data, DR-CPO is not compromised, while O-CPO is. Given the equivalence between the O-CPO objective $\widehat{V}_{out}(f)$ and the RLHF objective, this result may also be taken as evidence that an RLHF-based optimization approach would degrade under confounding.

We further reiterate that because they are optimized on text experiment data $\mathcal{D}_R$, DR-CPO and IPW-CPO are \textit{causal} approaches. We find empirical evidence for a core theoretical strength---robustness to confounding---of an optimization method that maintains the causal relationship between text and outcome.

% \begin{itemize}
%     \item On Hatespeech and EmoBank, CPO is preferred over CLM and each of its ablations. This tells us that (a) the optimization works and (b) the double robustness is working.
%     \item On HK, IPW is preferred over CLM, CPO, and outcome modeling. This tells us that under conditions where $P^R$ is well controlled or can be estimated very well, IPW is strong. Furthermore, under conditions where the outcome model may not be very good, outcome modeling and CPO (despite double robustness) may fall short of IPW. [Due to its careful randomization---not only with respect to how texts are assigned to annotators but also in how the texts themselves are constructed---the Hong Kong dataset is a setting in which we expect IPW to work well]
%     \item On the confounded dataset, outcome modeling under confounding is significantly worse than outcome modeling without confounding. However, IPW and CPO are not significantly worse under confounding than without confounding. This illustrates one of the core strengths of a causal approach: outcome/reward modeling without causal considerations may be susceptible to confounding in the data. IPW does not use outcome modeling and is therefore not susceptible, and the double robustness of CPO also prevents it from being affected.
% \end{itemize}


\section{Conclusion}

In this paper, we propose \textit{causal preference optimization} (CPO), a causal formulation for optimizing language models for human preferences. We derive three solutions to this optimization problem based on concepts from statistical causal inference, and we show that a \textit{doubly robust} solution---DR-CPO---is provably unbiased for the true optimization problem in our proposed data setting. We conduct empirical evaluations that demonstrate the effectiveness of CPO in optimizing for human preferences on unpaired data, and we validate the double robustness of DR-CPO under difficult confounding conditions. These results open the door to a wide range of data, human preferences, and optimization goals that language models can learn using CPO.

Lastly, while we consider an unpaired data setting in this work for the sake of generality, we note that there are some tasks that are explicitly aided by paired data. Future explorations of CPO could study the paired data setting.

% ---from which one could likely recover the DPO objective.

% [TODO: 1 paragraph -- or 2 if we want to discuss future work with paired version to recover DPO?]


\begin{contributions} % will be removed in pdf for initial submission 
					  % (without ‘accepted’ option in \documentclass)
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
    Briefly list author contributions. 
    This is a nice way of making clear who did what and to give proper credit.
    This section is optional.

    H.~Q.~Bovik conceived the idea and wrote the paper.
    Coauthor One created the code.
    Coauthor Two created the figures.
\end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    Briefly acknowledge people and organizations here.

    \emph{All} acknowledgements go in this section.
\end{acknowledgements}

% References
\bibliography{ref}

\newpage

\onecolumn

\title{Optimizing Language Models for Human Preferences \\ is a Causal Inference Problem\\(Supplementary Material)}
\maketitle

\appendix

\section{Equivalence of $V_{out}(f)$}
\label{sec:v_out_rewritten}

We can show that our rewriting of $V_{out}(f)$ is equivalent to our original definition:
\begin{align*}
    V_{out}(f)&=\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X})}{P^{f^0}(\widetilde{X})}g(\widetilde{X})\Bigg] \\
    &=\sum_{x \in \mathcal{X}}P^{f^0}(x) \frac{P^f(x)}{P^{f^0}(x)}g(x) \\
    &=\sum_{x \in \mathcal{X}}P^f(x)g(x) \\
    &=\E_{X \sim P^f}[g(X)]
\end{align*}

\section{Parallels between RLHF and O-CPO}
\label{sec:rlhf_ties}

The loss function under RLHF is typically computed through proximal policy optimization (PPO):
\begin{align*}
    \mathcal{L}(\theta, \phi) = \mathcal{L}^{PPO}_{\text{policy}}(\theta) + c_1\mathcal{L}^{PPO}_{\text{value}}(\phi) - c_2\mathcal{L}^{PPO}_{\text{entropy}}(\theta)
\end{align*}

where $\mathcal{L}^{PPO}_{\text{value}}(\phi)$ and $\mathcal{L}^{PPO}_{\text{entropy}}(\theta)$ are regularization terms and $\mathcal{L}^{PPO}_{\text{policy}}(\theta)$ is the \textit{policy loss}. Let $p$ denote the prompt, $c$ the completion, and $r$ the reward model. We consider only the policy loss, without stability tricks such as clipping. \vl{Include clipped version too?} Here, $\pi_\theta$ is the probability under the policy being optimized, while $\pi_{\theta_0}$ is the probability under a reference policy (often the starting policy or the policy at the previous step).
\begin{equation}
    \mathcal{L}^{PPO}_{\text{policy}}(\theta)=
    \E\left[\frac{\pi_\theta(c|p)}{\pi_{\theta_0}(c|p)}\cdot r(p,c)\right]
\end{equation}

We can see the correspondence between $\pi_\theta(c|p)$ and $P^f(X)$, $\pi_{\theta_0}(c|p)$ and $P^{f^0}(X)$, and $r(p,c)$ and $g(X)$; substituting these terms renders $\mathcal{L}^{PPO}_{\text{policy}}(\theta)$ equal to $V_{out}(f)$.

\section{Unbiasedness of $V_{DR}(f)$}
\label{sec:unbiased_dr}

We can show that $\E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]=\E_{X \sim P^f}[\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]]=V(f)$ under either of two conditions: (1) $\widehat{P}^R(X)=P^R(X)$ (i.e., $P^R$ is known) or (2) $\widehat{g}(X)=g(X)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]$ (i.e., $g(X)$ is known).

First, we rewrite $\E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]$:
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]] =& \; \E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_X\Bigg[\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i)) + \frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg] \\
    =& \;\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X\sim P^R}\Bigg[\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg] + \E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{\widetilde{X}\sim P^{f^0}}\Bigg[\frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg] \\
    =& \; \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i(X_i)-\widehat{g}(X_i))\Bigg]\Bigg] \\ 
    &+ \frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg]
\end{align*}

\begin{proof}[Proof under condition (1): $\widehat{P}^R(X)=P^R(X)$]


Rewriting the first term,
\begin{align*}
    % \mathcal{L}_R&=
    \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i(X_i)-\widehat{g}(X_i))\Bigg]\Bigg]
    &=\frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{P^R(X_i)}(Y_i(X_i)-\widehat{g}(X_i))\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}(Y(x)-\widehat{g}(x))\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}(Y(x)-\widehat{g}(x))\underbrace{\E_{X \sim P^R}[\mathbbm{1}\{X=x\}]}_{P^R(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)(Y(x)-\widehat{g}(x))\Bigg]\\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)-\widehat{g}(X)]] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]]-\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]]
\end{align*}

Rewriting the second term,
\begin{align*}
    % \mathcal{L}_O&=
    \frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg]
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}\widehat{g}(x)\mathbbm{1}\{\widetilde{X}=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}\widehat{g}(x)\underbrace{\E_{\widetilde{X} \sim P^{f^0}}[\mathbbm{1}\{\widetilde{X}=x\}]}_{P^{f^0}(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)\widehat{g}(x)\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]]
    % &=\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[g(X)]
\end{align*}

% \begin{align*}
%     \mathcal{L}_{CPO}&=
%     \mathcal{L}_R+\mathcal{L}_O \\
%     &=\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[Y(X)]-\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[g(X)]+\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[g(X)] \\
%     &=\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[Y(X)]
% \end{align*}
Then we have
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]]-\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]]+\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \qedhere
\end{align*}
\end{proof}

\begin{proof}[{Proof under condition (2): $\widehat{g}(X)=g(X)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]$}]

Rewriting the first term,
\begin{align*}
    % \mathcal{L}_R&=
    \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i(X_i)-\widehat{g}(X_i))\Bigg]\Bigg]
    &= \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i(X_i)-g(X_i))\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\underbrace{\E_{X \sim P^R}[\mathbbm{1}\{X=x\}]}_{P^R(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\Bigg] \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}\E_{Y(\cdot) \sim \mathcal{G}}[Y(x)-g(x)] \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}(\E_{Y(\cdot) \sim \mathcal{G}}[Y(x)]-\E_{Y(\cdot) \sim \mathcal{G}}[\underbrace{g(x)}_{\E_{Y(\cdot)\sim\mathcal{G}}[Y(x)]}]) \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}\cdot 0 \\
    &=0
\end{align*}

Rewriting the second term,
\begin{align*}
    % \mathcal{L}_O&=
    \frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg]
    &=\frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}g(\widetilde{X}_j)\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}g(x)\mathbbm{1}\{\widetilde{X}=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}g(x)\underbrace{\E_{\widetilde{X} \sim P^{f^0}}[\mathbbm{1}\{\widetilde{X}=x\}]}_{P^{f^0}(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)g(x)\Bigg] \\
    &=\sum_{x \in \mathcal{X}}P^f(x)g(x) \\
    &=\E_{X \sim P^f}[\underbrace{g(X)}_{\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]}] \\
    &=\E_{X \sim P^f}[\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]]
\end{align*}

Then we have
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]
    &=0+\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \qedhere
\end{align*}

\end{proof}

\section{GPT-4 Win Rate Prompts [TODO]}
\label{sec:gpt_win_rate_questions}

\section{Human Agreement Study [TODO]}
\label{sec:gpt4_annotation_details}

\end{document}
