\documentclass[accepted]{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{bbm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\allowdisplaybreaks
\usepackage{listings}
\lstset{
basicstyle=\small\ttfamily,
columns=flexible,
breaklines=true
}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example


\newcommand{\vl}[1]{\textcolor{orange}{[VL: #1]}}

\newcommand{\eli}[1]{\textcolor{orange}{[Eli: #1]}}

\newcommand{\E}{\mathbb{E}}
\newcommand{\R}{\mathbbm{R}}
\newcommand{\Var}{\text{Var}}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


\title{Optimizing Language Models for Human Preferences \\ is a Causal Inference Problem}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:vlin2@andrew.cmu.edu?Subject=Causal Preference Optimization}{Victoria Lin}}
\author[1]{\href{mailto:ebenmich@andrew.cmu.edu?Subject=Causal Preference Optimization}{Eli Ben-Michael}}
\author[1,2]{\href{mailto:morency@cs.cmu.edu?Subject=Causal Preference Optimization}{Louis-Philippe Morency}}
% Add affiliations after the authors
\affil[1]{%
    Carnegie Mellon University
}
\affil[2]{%
    Meta Research
}
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
\begin{document}
\maketitle

\begin{abstract}
  As large language models (LLMs) see greater use in academic and commercial settings, there is increasing interest in methods that allow language models to generate texts aligned with human preferences. In this paper, we present an initial exploration of language model optimization for human preferences from \textit{direct outcome datasets}, where each sample consists of a text and an associated numerical outcome measuring the reader's response. We first propose that language model optimization should be viewed as a \textit{causal problem} to ensure that the model correctly learns the relationship between the text and the outcome. We formalize this causal language optimization problem, and we develop a method---\textit{causal preference optimization} (CPO)---that solves an unbiased surrogate objective for the problem. We further extend CPO with \textit{doubly robust} CPO (DR-CPO), which reduces the variance of the surrogate objective while retaining provably strong guarantees on bias. Finally, we empirically demonstrate the effectiveness of (DR-)CPO in optimizing state-of-the-art LLMs for human preferences on direct outcome data, and we validate the robustness of DR-CPO under difficult confounding conditions.
\end{abstract}

\section{Introduction}
\label{sec:intro}

Recent advances in computation have yielded large-scale self-supervised language models that achieve impressive performance on a variety of natural language processing (NLP) tasks \citep{zhang2022opt, chowdhery2023palm, lescao2023bloom, bubeck2023sparks}. These large language models (LLMs)---trained on vast amounts of text data of varying quality---can acquire less desirable attributes from these texts, and so they often require further fine-tuning on human preferences to improve their factual correctness and alignment with social values (e.g., less toxic, more helpful) \citep{NEURIPS2022_b1efde53, bommasani2022opportunities}. 

In this paper, we examine a paradigm for language model optimization for human preferences that has previously been underexplored: learning from \textit{direct outcome datasets}, which are ubiquitous in NLP. In contrast to paired completion data consisting of prompts followed by one preferred and one non-preferred completion, direct outcome datasets are text datasets where each sample consists of a text and an associated numerical \textit{outcome} measuring the reader's response to the text (e.g., Reddit upvotes \citep{Lakkaraju_McAuley_Leskovec_2021}, Amazon ratings \citep{mcauley2013amazon}). A large number of direct outcome datasets are \textit{crowdsourced datasets}, where annotators on a crowdsourcing platform are randomly assigned to read and respond to texts.

The ability to learn human preferences from direct outcome data significantly broadens the scope of problems that can be addressed by learning from human feedback. Consider the task of inducing a language model to unlearn hate speech. Current unlearning approaches typically use paired data in the format \textit{([hate speech], [alternative text])}, with the preferred text being the latter. Constructing alternative texts can be difficult \citep{eldan2023whos,maini2024tofu}, and rather than fully unlearning the hate speech, the language model is instead trained to preferentially generate the alternative text \citep{patil2024can}. Direct outcome data, by contrast, allows texts to be directly marked as hateful and removed from the language model without learning or requiring an alternative text.


% Unlearning language may be better viewed as a direct outcome task than a paired completion one: there exists a set of texts that are \textit{not desired} (the outcome), and there are no natural pairs for those texts. However, b

% By optimizing on direct outcome data rather than paired completion data, we (1) allow for the use of a much wider body of preexisting data, (2) account for the \textit{intensity} or \textit{degree} of a text's outcome rather than simply whether it is preferred or not, and (3) broaden the scope of problems that can be addressed by learning from human feedback.

% \textbf{Example.} \vl{TODO: Convert this into a hate speech example. Put it in intro, maybe even in the first paragraph. ``Should be seen as complementary to approaches that pair preferred text''} 

% [A large number of direct outcome datasets are \textit{crowdsourced datasets}, in which annotators on a crowdsourcing platform are randomly assigned to read and respond to texts. Due to the random assignment of texts to readers, crowdsourced datasets have a causal guarantee: the text ]

We present an initial exploration of language model optimization in the direct outcome setting, where the language model is fine-tuned to optimize texts with respect to a desired outcome. We first note that learning an optimal language model can be difficult due to the presence of unmeasured \textit{confounding} in the training data: external factors that affect both readers' choice of texts to read and how they tend to respond to those texts.  Language models optimized on confounded data may learn incorrect relationships between texts and reader responses, leading them to generate sub-optimal text. For instance, users of hate speech are both (i) more likely to engage with content containing hate speech and (ii)  more likely to rate hateful content positively. Such confounding may lead to incomplete unlearning of hate speech, as some examples of hate speech are assigned positive outcomes in the confounded data.

% Therefore, to ensure that a language model optimized for an outcome generates text that \textit{causes} that outcome, we posit that language model optimization should be viewed as a \textit{causal} problem. The solution to this optimization problem asks the following question: how do we \textit{intervene} on the text distribution of the generating model to best \textit{cause} an optimal outcome (i.e., generation of human-preferred texts)? 

Therefore, we posit that language model optimization should be viewed as a \textit{causal} problem in order to ensure that the optimal language model \textit{causes} preferred outcomes.
In this paper, we introduce a causal formulation of the language model optimization problem.
Solving this optimization problem amounts to determining how to \textit{intervene} on the text distribution of the generating model to best \textit{cause} an optimal outcome (i.e., generation of human-preferred texts).

% Learning a language model that correctly represents this causal relationship can be difficult in practice due to the presence of unmeasured \textit{confounding} in the training data: external factors that affect both readers' choice of texts to read and how they tend to respond to to those texts (e.g., users of social media will tend to read posts on pages that they enjoy, so they will be more likely to click the ``like'' button on those posts). Language models optimized on confounded data may learn incorrect relationships between texts and reader responses, leading them to generate sub-optimal text.

% We observe that in the direct outcome setting, it is possible in practice to guarantee that the relationship between the text and the outcome is causal by leveraging crowdsourced datasets. Due to random assignment of texts to readers in the crowdsourcing process, crowdsourced datasets are not subject to external confounding and can in fact be viewed as randomized experiments \citep{lin-etal-2023-text}. Building on this observation, we present two methodological contributions that enable causal language model optimization on many direct outcome datasets, including crowdsourced data. The first of these contributions is to formalize the language model optimization problem in causal terms. We introduce \textit{causal preference optimization} (CPO), an unbiased causal solution to the optimization problem based on the notion of \textit{importance weighting}. Our second methodological contribution is \textit{doubly robust} CPO (DR-CPO), which improves on CPO by reducing its variance via outcome modeling while retaining provably strong guarantees on bias.

We observe that in the direct outcome setting, it is possible in practice to guarantee that the observed relationship between the text and the outcome is causal by leveraging crowdsourced datasets. Due to random assignment of texts to readers, crowdsourced datasets are not subject to external confounding and can in fact be viewed as randomized experiments \citep{lin-etal-2023-text}. Building on this observation, we present two methodological contributions that enable causal language model optimization on direct outcome datasets. First, we develop \textit{causal preference optimization} (CPO), 
which solves an unbiased surrogate objective for the causal optimization problem.
Next, we extend this to \textit{doubly robust} CPO (DR-CPO), which improves on CPO by 
reducing the variance of the surrogate objective via outcome modeling while retaining provably strong guarantees on bias.

We empirically assess the effectiveness of (DR-)CPO in optimizing state-of-the-art LLMs for human preferences on direct outcome data, both with and without confounding. We find that CPO methods successfully optimize LLMs for human preferences and outperform baselines, and we further observe empirical evidence for the robustness of DR-CPO under difficult confounding conditions.

\section{Related Work}
\label{sec:related_work}

% [CPO is contextualized within a wider body of ]

\subsection{Language Model Optimization}


The performance of large self-supervised language models can be further improved by fine-tuning on datasets that align them with human-preferred text \citep{NEURIPS2022_b1efde53, bommasani2022opportunities}. These \textit{paired completion datasets} typically consist of prompts followed by two candidate completions, one of which is indicated to be human-preferred \citep{pmlr-v162-ethayarajh22a, bai2022training, ji2023beavertails}. A reinforcement learning algorithm may then derive its reward model from these datasets (reinforcement learning from human feedback, or RLHF) \citep{christiano2017rlhf}, after which language models are fine-tuned to maximize the human preference reward under the RLHF algorithm. 
% This optimizes the language model for the human preferences encoded in the dataset.

While RLHF has seen widespread use \citep{ stiennon2020summarize, touvron2023llama}, it is computationally demanding, as its training loop requires that new texts be generated and new rewards be computed at each step. 
% Despite this, the majority of the prevailing large language models---both academic and commercial---have been optimized using RLHF.
Consequently, in recent months, methods that allow language models to learn more directly from human preference data have emerged 
% an2023direct, 
\citep{hejna2023contrastive, dumoulin2024density}---the most popular of which is direct preference optimization (DPO) \citep{rafailov2023direct}. Like RLHF, DPO is designed for use with paired completion datasets, maximizing the probability ratio of preferred completions to non-preferred completions over the paired completion dataset.

% [Within the space of language models, we mostly have RLHF and more recently DPO...]

% RLHF fine-tunes a language model for human feedback via reward modeling, where the reward is a proxy for human feedback (i.e., the outcome) and the goal of the model is to obtain larger rewards.

\subsection{Causal Inference and Doubly Robust Policy Learning}

Although RLHF and DPO constitute the two most popular optimization approaches for language models, there exists a wide body of work on estimation and policy learning outside of the NLP space. Some notable work relevant to this paper includes a long history of doubly robust estimation of causal effects \citep{Robins1994} and---more directly relevant---doubly robust policy learning \citep{dudik2011doublyrobust, pmlr-v48-jiang16, Tang*2020Doubly,Athey2021, pmlr-v162-kallus22a}.

In causal inference, double robustness denotes an estimator formulation that provides robustness against misspecification of \textit{nuisance} parameters or functions. In particular, doubly robust estimators combine two existing estimators---an importance weighting estimator and an \textit{outcome modeling} estimator---such that only one of the two components must be correctly specified or estimated to guarantee the unbiasedness of the estimator \citep{Robins1994, chernozhukov_locally_2022}. This can also be viewed as the importance weighting term providing a \textit{bias correction} for the outcome modeling term. The principle of double robustness can be extended to not only the estimation of causal effects but also the estimation of any quantity, including loss functions or policy objectives, as we do here.

% [But outside the space of language models and NLP, there's been lots of work on estimation and optimization. For the purposes of this paper, some previous work of note includes doubly robust estimation of causal effects and---more directly relevant---doubly robust optimization in traditional RL settings...]

\section{A Causal View of Language Model Optimization}
\label{sec:causal_formulation}

% \eli{I think the above should go more in related work, esp the forced choice part, and then start below}

% Formally, let $\mathcal{X}$ be the space of texts, and let $\mathcal{G}$ be the space of potential outcomes (under the potential outcomes framework of causal inference) for those texts. Then the potential outcomes are given by $\{Y(x): \mathcal{X} \rightarrow \mathbb{R} \; | \; x \in \mathcal{X}\} \sim \mathcal{G}$. We define the value function $V(f)$:
% \begin{equation}
%     V(f)=\E_{X \sim P^f}[\E_{Y(\cdot)\sim \mathcal{G}}[Y(X)]]
% \end{equation}

When a language model $f$ is trained or fine-tuned to generate texts that are consistent with human preferences, the implicit goal can be seen as optimizing texts $X \sim P^f$---texts generated from the model $f$---with respect to some outcome $Y$. In this paper, we consider a direct outcome data format 
$\mathcal{D}_O=\{(X_1,Y_1),\dots,(X_n,Y_n)\}$,
where $X_i$ is a text that individual $i$ interacts with and $Y_i$ is any numerical response of the individual to that text (e.g., ratings, either binary or scalar).
% In current practice, this outcome $Y$ is typically an indicator of whether a user preferred or did not prefer a text previously generated by the model, under a forced-choice data setting in which the user is shown two possible generations.

% \textbf{A non-causal view.} 
% In particular, consider a direct outcome dataset $\mathcal{D}_O=\{(X_1,Y_1),\dots,(X_n,Y_n)\}$.
An association-based approach to optimize model $f$ is to generate texts that are similar to those that have high outcomes in the dataset, i.e.,
\begin{equation}\label{eq:non_causal_optimization}
    \underset{f}{\arg\max} \; \E_{X\sim P^f}[\E_{\mathcal{D}_O}[Y|X]],
\end{equation}
where the conditional expectation $\E_{\mathcal{D}_O}[Y|X]$ is the average outcome among individuals who observed the text $X$ and can be learned from $\mathcal{D}_O$. A language model optimized under Equation~\eqref{eq:non_causal_optimization} will generate texts that are \textit{correlated} with high outcomes. We distinguish this from our true optimization goal: to learn---across all possible texts and outcomes---to \textit{intervene} on the distribution of the generating language model to \textit{cause} the best possible outcomes.

% We emphasize that given this optimization, changing the text should have a direct causal effect on how the reader will react to it. \vl{idk if this last sentence is necessary}

% \vl{Explain in words what this condition expectation is -- avreage outcome among ppl who observed this text. Thereofre, a LM model will generate texts that correlate with these high outcomes. This is different from what we actually want: generating texts that.... state this once just up front.}

% A language model optimized under this objective will generate texts with high outcomes, according to the observed data. However, it will not necessarily generate what we really want: texts that \textit{would have had} \vl{would cause?} high outcomes had they been read and received responses. 

These hypothetical outcomes can be formalized using the potential outcomes framework \citep{neyman1923, rubin1974}: over text space $\mathcal{X}$, for each individual $i$ we posit the existence of a \emph{potential outcome function} 
$Y_i:\mathcal{X} \to \R$, where $Y_i(x)$ encodes their potential real-valued response if given text $x$.\footnote{This notation implicitly rules out the possibility that an individual's responses can be affected by the texts given to others---a common assumption in causal inference \citep{rubin1974}.} We emphasize that most of each individual's potential outcomes are never observed, and so $Y_i(x)$ denotes the response individual $i$ would have had \textit{had they seen text} $x$, possibly contrary to reality. This is also commonly known as the \textit{counterfactual}. 

We assume that we sample individuals from a population $\mathcal{G}$ so that the set of potential outcomes is given by $\{Y(x)\; | \; x \in \mathcal{X}\} \sim \mathcal{G}$. We define $g(x) \equiv \E_{Y(\cdot)\sim\mathcal{G}}[Y(x)]$ as the average outcome if \textit{all} individuals in the population were given text $x$. Note that $g(x)$ is different from the correlational measure $\E_{\mathcal{D}_O}[Y|X=x]$ because, in the observed data, the association between $X$ and $Y$ may be confounded by an external factor.

% Using this notation, an optimal language model will maximize over $x$ with respect to $\E_{Y(\cdot)\sim\mathcal{G}}[Y(x)]$, while a language model that uses objective 1 will maximize over $x$ with respect to $\E{Y\sim\mathcal{D}_O}[Y|X=x]$. Crucially, $\E{Y\sim\mathcal{D}_O}[Y|X=x]\neq \E_{Y(\cdot)\sim\mathcal{G}}[Y(x)]$

% \vl{Skip this paragraph} This setup allows us to rewrite the optimization problem to reflect our actual goal: generating texts that have \vl{also cause?} high potential outcomes or counterfactuals. We note that when we adapt a language model for an outcome, we assume an implicit causal relationship between the text and the outcome: that is, changing the text has a direct causal effect on how the reader will react to it. If this causal relationship holds, a language model that optimizes the outcome will learn to produce text that \textit{causes} the outcome.

Formally, then, our goal is to find a language model $f$ that causes high outcomes $Y$ on average across the population of individuals $\mathcal{G}$ and across the texts generated according to the model. We encode the quality of a generative text model $f$ via its value function that measures the expected outcome (or reward):
\begin{equation}
    V(f) \equiv \E_{X \sim P^f}[\E_{Y(\cdot)\sim \mathcal{G}}[Y(X)]] = \E_{X \sim P^f}[g(X)].
\end{equation}

Then the \textit{causal} optimization problem is to find the language model $f$ that maximizes the expected outcome if a random individual were given a random text according to $P^f$:
\begin{equation}
\label{eq:true_optimization}
    \underset{f}{\arg\max}\; V(f) = \underset{f}{\arg\max} \; \E_{X \sim P^f}[\E_{Y(\cdot)\sim \mathcal{G}}[Y(X)]].
\end{equation}

In intuitive terms, this optimization problem asks the following question: which texts would we generate if we knew what every individual's response to every text would be? By optimizing the text with respect to \textit{every possible response}, this construction removes confounding influences on which texts are read or observed, such that the content of the text must be the sole factor that causes the outcome. 
% We are then left with the following causal inference problem: how do we \textit{intervene} on the distribution of the generating language model to best cause an optimal outcome on average---in this case, the production of human-preferred text?

% Let $X \sim P^f$ be some texts generated by the model $f$---for example, titles of emails. For these texts, we have observed outcomes $Y$, where the outcome in this case is whether the email was opened ($Y=1$) or deleted ($Y=-1$). If the email was ignored, we consider the outcome unobserved. We denote the subset of texts for which outcomes are observed as $X \sim P^f_{obs}$. Our goal is to generate email titles that will cause recipients to open the email.

% Then a natural formalization of the language model optimization problem is
% \begin{equation}\label{eq:non_causal_optimization}
%     \underset{f}{\arg\max} \; \E_{X\sim P^f_{obs}}[Y|X]
% \end{equation}

% In other words, we want to encourage model $f$ to generate texts $X$ with high observed outcomes $Y|X$. \vl{Sorry, this notation/formalization might be really bad -- the general idea was just to say we're optimizing observed outcomes $Y$ that may not be solely caused by $X$.}

% However---in observed data, the relationship between the text and the reader's response to it is not always causal. For instance, whether the outcome $Y$ is observed depends not only on the title of the email ($X$) but also on an external \textit{confounding} factor: who the sender is ($C$). If the recipient knows the sender, then the outcome is more likely to be observed, as the recipient will either open the email (e.g., if the sender is a friend) or delete it (e.g., if the sender is an enemy). If the recipient does not know the sender, then it is more likely that the recipient will ignore the email, so an outcome will not be observed. 

% As a result, given this observed data, a language model optimized according to equation \ref{eq:non_causal_optimization} will generate email titles that cause recipients to open an email title \textit{if the email is from somebody the recipient already knows}. However, these titles may not be effective for emails from senders who are unknown to the recipient, and so the goal of generating titles that cause recipients to open the email (regardless of their relationship to the sender) is not achieved.

% More generally, confounders are any factors that influence both a reader's choice of texts to read (i.e., which texts have observed outcomes) and how they might tend to respond to a text (e.g., demographic attributes, personal beliefs, prior experiences). Under confounding, because the text is no longer the sole cause of the response, a language model that optimizes on the observed response will not correctly learn to generate text that produces the desired response.

% \textbf{A causal view.} This example illustrates that when we adapt a language model for human preferences, we should in fact construct the problem such that we can assume a causal relationship between the text and the outcome. In particular, we assume that changing the text has a direct causal effect on how the reader will react to it. If this causal relationship holds, a language model that optimizes the outcome will learn to produce text that \textit{causes} the outcome.

% To guarantee this causal relationship, we propose an explicitly causal writing of the language model optimization problem. Let $\mathcal{X}$ represent the space of texts.
% We follow the potential outcomes framework \citep{neyman1923, rubin1974}: for each individual $i$ we posit the existence of a \emph{potential outcome function} 
% $Y_i:\mathcal{X} \to \R$, where $Y_i(x)$ encodes their potential real-valued response if given text $x$.\footnote{This notation implicitly rules out the possibility that an individual's responses can be affected by the texts given to others---a common assumption in causal inference \citep{rubin1974}.} We emphasize that most individuals' potential outcomes are not observed, and so $Y_i(x)$ denotes the response individual $i$ would have had \textit{had they seen text} $x$, possibly contrary to reality. This is also commonly known as the \textit{counterfactual}.

% We assume that we sample individuals from a population $\mathcal{G}$ so that the set of potential outcomes is given by $\{Y(x)\; | \; x \in \mathcal{X}\} \sim \mathcal{G}$. We define $g(x) \equiv \E_{Y(\cdot)\sim\mathcal{G}}[Y(x)]$ as the average outcome if \textit{all} individuals in the population were given text $x$.


% In intuitive terms, this optimization problem asks the following question: which texts would we generate if we knew what every individual's response to every text would be? By optimizing the text with respect to \textit{every possible response}, this construction removes confounding influences on which texts are read or observed, such that the content of the text must be the sole factor that causes the outcome. We are then left with the following causal inference problem: how do we \textit{intervene} on the distribution of the generating language model to best cause an optimal outcome on average---in this case, the production of human-preferred text?
% % Returning to the email example, for instance, a language model optimized according to equation \ref{eq:true_optimization} learn to generate titles over outcomes corresponding to emails from both known and unknown senders. that---for both known and unknown senders---cause recipients to open the email.

% In practice, observing all outcomes is often approximated via a randomized experiment. Random assignment of individuals to texts removes all confounding outside of the text, since no external factors influence which texts the individual reads. Moreover, as we mention previously, crowdsourced datasets are in fact randomized experiments, since annotators are randomly assigned to read texts. Therefore, crowdsourced datasets provide the same causal guarantee---the content of the text must be the sole factor that causes the reader's response---and can be used as a way to \textit{identify} the causal optimization problem from observed data and \textit{estimate} it.


% \vl{TODO: Explain potential outcomes more. Distinction between non-causal formulation---maximizing sample/observed $Y$---vs. causal formulation---maximizing the \textit{potential outcomes} (could call them counterfactuals instead). Maybe even put the non-causal objective to distinguish. Phrase explicitly that we want to distinguish between correlation and causation.}

% \vl{We only get to observe outcomes for some texts. Non-causal version just maximizes relative to those outcomes. Causal version is saying what if we could observe outcomes for \text{all} texts, and maximize relative to that? A randomized experiment is an approximation of observing all outcomes b/c they are random. As we mentioned in the intro, many NLP datasets are crowdsourced and therefore an approximation of randomized datasets.}


% Moreover, the CPO problem is further distinguished from existing language model optimization approaches by its general formulation that bypasses the typical requirement of forced-choice data, instead requiring only a numerical (binary or continuous) \textit{outcome} for each text. This allows us to broaden our view of what constitutes a ``preference,'' with potential use cases including not only learning desired outcomes but also 

% \eli{Add a few discussion points here about why we would want to do this, maybe with some examples of texts $x$ and outcomes $y$ from the evaluations or in general?} [TODO, BUT MAYBE THIS WOULD BE IN THE INTRO]

% \subsection{Why is this Causal?}

% \vl{Merge this with the previous section}

% When we adapt a language model for human preferences, we assume an implicit causal relationship between the text and the outcome: that is, changing the text has a direct causal effect on how the reader will react to it. If this causal relationship holds, a language model that optimizes the outcome will learn to produce text that \textit{causes} the outcome.

% In many datasets, however, although some relationship can typically be inferred between the content of a text passage and a reader's response to it, this relationship generally cannot be assumed to be causal due to the potential presence of unmeasured confounders: factors that influence both a reader's choice of texts to read and how they might tend to respond to a text (e.g., demographic attributes, personal beliefs, prior experiences).

% Datasets where users provide a numerical response to a text are common in NLP (e.g., Reddit upvotes \citep{Lakkaraju_McAuley_Leskovec_2021}, Amazon ratings \citep{mcauley2013amazon}). Though some relationship can typically be inferred between the content of a text passage and a reader's response to it, this relationship generally cannot be assumed to be causal due to the potential presence of unmeasured \textit{confounders}: factors that influence both a reader's choice of texts to read and how they might tend to respond to a text (e.g., demographic attributes, personal beliefs, prior experiences).
% \eli{Add something about how optimizing from a confounded dataset can go wrong? Need to complete the thought: ``There can be confound`` $\rightarrow$ ``that can mess up the language model''} 

% Consequently, a language model that optimizes the outcome over a crowdsourced dataset will learn to produce text that \textit{causes} that outcome, without susceptibility to confounding. Therefore, we reiterate that the problem of language model optimization problem is the following causal inference problem: how do we \textit{intervene} on the distribution of the generating language model to best cause an optimal outcome on average---in this case, the production of human-preferred text?

% [these types of text/rating datasets are common in nlp and generally obtained from crowdsourcing. due to the nature of crowdsourcing, this is basically a randomized experiment: people are randomly assigned to read/react to texts. this removes all possible confounding (factors that influence both ppl's choice to read a text and how they might tend to react to that text) because the random assignment ensures that external factors cannot influence people to read particular texts. therefore, this gives us a \textit{causal guarantee}: within this randomized dataset, we know that the content of the text is *causing* the observed outcome. consequently, a language model that optimizes the outcome over this dataset should produce text that *causes* the optimal outcome.]

\section{(Doubly Robust) Causal Preference Optimization}
\label{sec:cpo}


% In practice, observing all outcomes is often approximated via a randomized experiment.Moreover, as we mention previously, crowdsourced datasets are in fact randomized experiments, since annotators are randomly assigned to read texts. Therefore, crowdsourced datasets provide the same causal guarantee---the content of the text must be the sole factor that causes the reader's response---and can be used as a way to \textit{identify} the causal optimization problem from observed data and \textit{estimate} it.



Reframing our optimization problem as a causal inference problem allows us to draw on solutions from statistical causal inference---in particular, the use of randomized experiments to identify causal effects and approximate observing all potential outcomes.\footnote{As discussed above, crowdsourced datasets are in fact randomized experiments, since annotators are randomly assigned to read texts.}
We formalize such a randomized experiment and/or crowdsourced annotated dataset as $\mathcal{D}_R = \{(X_1, Y_1),\ldots, (X_n, Y_n)\}$ where texts $X_i$ are drawn i.i.d. from a randomization distribution $P^R$ and individuals with potential outcome functions $Y_i(\cdot)$ are drawn i.i.d. from the population $\mathcal{G}$. 
This induces a distribution on the observed responses $Y_i = Y_i(X_i)$ that we denote as $P^R_y$.

Random assignment of individuals to texts removes all confounding outside of the text, since no external factors influence which texts the individual reads.
Formally,  we have that the texts are independent of the full set of potential outcomes, i.e., $\{Y(x) \mid x \in \mathcal{X}\}  \perp\!\!\!\!\perp  X$. We also require a technical assumption that there is \emph{overlap} between the randomization distribution $P^R$ and the distribution $P^f$ generated by the language model we are optimizing: that is, if $P^R(x) = 0$, then $P^f(x) = 0$ as well. This ensures that the randomization distribution is sufficiently informative about the domain we want to optimize over.
In principle, this is directly enforceable as a constraint on the language model. In practice, due to the underlying structure of text data and the fact that we often fine-tune language models to the text domain as a precursor to optimization, the overlap assumption is unlikely to be binding.

In this section, we describe \textit{causal preference optimization} (CPO), which solves an unbiased surrogate objective for the true causal optimization problem using importance weighting. Following this definition, we extend CPO using the principle of double robustness, in which we use outcome modeling to reduce the variance of the CPO objective while retaining strong guarantees on bias.

Derivations and technical results are shown in Appendix \ref{sec:technical_results}.

% in particular, the notion of the \textit{doubly-robust estimator} \citep{Robins1994, chernozhukov_locally_2022}. Following the convention of causal inference, we identify $V(f)$ in terms of observable data. 

\subsection{Causal Preference Optimization}
% \subsubsection{Identifying the value of a language model}
\label{sec:identification}

% We consider an observed crowdsourced dataset $\mathcal{D}_R$ with texts $X_i \sim P^R$ and their corresponding outcomes $Y_i \sim P^R_y$, where observed $Y_i$ are equal to their potential outcomes $Y_i(X_i)$. Then letting $g(x)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(x)]$, 
\textbf{Identification.} The value of the language model 
$V(f)$ is a causal quantity that involves the potential outcomes for all individuals, some of which are unobserved.
However, we can link the value function to the randomization dataset $\mathcal{D}_R$ (i.e., \textit{identify} it from the observed data) by writing it in the following way.
% \begin{align*}
%     V(f)=&\;\E_{X\sim P^f}[g(X)]  \tag*{(\text{$V_{out}$})} \\
%     =&\;\E_{X \sim P^R}\Bigg[\E_{Y \sim P^R_y}\Bigg[\frac{P^f(X)}{P^R(X)}Y\Bigg]\Bigg] \tag*{(\text{$V_{IPW}$})} \\
%     =&\;\E_{X\sim P^f}[g(X)]+ \\
%     &\;\E_{X \sim P^R}\Bigg[\E_{Y \sim P^R_y}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] \tag*{(\text{$V_{DR}$})}
% \end{align*}
\begin{proposition}\label{eq:v_ipw_proposition} The value function $V(f)$ can be identified as
\begin{align*}
    V(f)&=\E_{X \sim P^R}\Bigg[\E_{Y \sim P^R_y}\Bigg[\frac{P^f(X)}{P^R(X)}Y\Bigg]\Bigg] \tag*{(\text{$V_{IPW}$})}
\end{align*}
\end{proposition}

This value function draws on importance weighting  principles from statistical causal inference (also referred to as IPW). Observed outcomes $Y \sim P^R_y$ are weighted by the density ratios between texts drawn from the language model $X \sim P^f$ and texts drawn from the randomization distribution  $X \sim P^R$; this approximates the average outcome under $P^f$, which is not observed.

% [In the next section, we describe these doubly robust properties and the ways in which they can guarantee the unbiasedness of an estimator for $V(f)$.]


% \subsubsection{Estimating the value}
% \eli{Note that I removed the hats from $P^R$} \vl{Can I bring them back? I think it's fine to state up front that $\widehat{P}^R=P^R$ in our experiments, but it makes things confusing when we're talking about one of the conditions of double robustness is $\widehat{P}^R=P^R$ but $\widehat{P}^R$ doesn't appear in the $\widehat{V}_{DR}(f)$ term.... or should I just rephrase that condition as ``$P^R$ must be known''?}

\textbf{Estimation.} After writing the causal quantity $V(f)$ in terms of observable data, we focus on estimating $V(f)$ in practice. The importance weighting value function $V_{IPW}(f)$ can be estimated directly from the crowdsourced data $\mathcal{D}_R$ as follows (recall that $X_i\sim P^R, Y_i \sim P^R_y$):
\begin{equation*}
    \widehat{V}_{IPW}(f)=\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}Y_i    
\end{equation*}

Note that both $P^f$ and $P^R$ are \textit{known} quantities and do not need to be estimated---$P^f$ because it is obtained directly from the model $f$ we are optimizing, and $P^R$ because we know the randomization mechanism of the texts in $\mathcal{D}_R$.\footnote{In practice, it can still be empirically helpful to use a model-derived estimate of the randomization probabilities $\widehat{P}^R(X)$, similar to how the H\'ajek estimator can have lower variance than the Horvitz-Thompson estimator \citep{Hajek1971, Sarndal2003_model}.} Importantly, this means that $\widehat{V}_{IPW}(f)$ is an unbiased estimator for $V(f)$.

\begin{theorem}
\label{thm:ipw_unbiased}
Let $\mathcal{D}_R$ be a randomized experiment parameterized by $P^R$, such that $P^R$ is known. Then 
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{IPW}(f)]]&=\E_{X \sim P^f}[\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]] \\
    &=V(f)
\end{align*}
\end{theorem}

\subsection{Doubly Robust Causal Preference Optimization}

An importance weighting estimator like the CPO value function is a natural solution for estimating causal quantities when randomized experimental data such as crowdsourced data is available. However, CPO optimizes over only the experimental data, and so it can be further improved by the addition of an outcome modeling term that predicts outcomes on unlabeled texts. The combination of IPW and outcome modeling yields a doubly robust estimator (DR-CPO) that reduces the variance of CPO and improves its generality while still remaining unbiased for the true causal optimization problem.

\textbf{Identification.} The doubly robust formulation gives us another way of linking the value function to the randomization dataset $\mathcal{D}_R$. 

\begin{proposition}\label{eq:v_dr_proposition} The value function $V(f)$ can also be identified as
\begin{align*}
    V(f)=&\;\E_{X \sim P^R}\Bigg[\E_{Y \sim P^R_y}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] + \\
    &\;\E_{X\sim P^f}[g(X)] \tag*{(\text{$V_{DR}$})}
\end{align*}
\end{proposition}

This construction combines IPW and an outcome model $g$ to provide robustness against misspecification or mis-estimation within either term---akin to doubly robust estimators that serve the same purpose when estimating causal effects.

\textbf{Estimation.} The doubly robust value function $V_{DR}$ can be estimated from the crowdsourced data $\mathcal{D}_R$ and a learned outcome model $\widehat{g}(X)$. 

First, however, we consider the outcome modeling term $\E_{X\sim P^f}[g(X)]$. Even if we were to have access to the true outcome model $g$, it is difficult to optimize $g$ with respect to texts $X \sim P^f$,
% the original outcome modeling value function $V_{out}(f)=E_{X \sim P^f}[g(X)]$, 
as this requires that texts be drawn from the language model $f$ \textit{as $f$ is being updated}. To remedy this, we re-write $\E_{X\sim P^f}[g(X)]$ in terms of a fixed language model $f^0$:\footnote{We show this equivalence in Appendix \ref{sec:v_out_rewritten}.}
\begin{equation*}
    \E_{X\sim P^f}[g(X)]=\E_{X \sim P^{f^0}}\Bigg[\frac{P^f(X)}{P^{f^0}(X)}g(X)\Bigg] \tag*{(\text{$V_{out}$})}
\end{equation*}
where $P^{f^0}$ denotes the distribution over texts from language model $f^0$.

We can create a Monte Carlo estimate of this by drawing texts $\widetilde{X}_1, \ldots, \widetilde{X}_m \sim P^{f^0}$ and computing
\begin{equation*}
    \widehat{V}_{out}(f)=\frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)
\end{equation*}
where $\widehat{g}(x)$ is a model trained to predict $Y$ from $X$ and $f^0$ is any generative language model. 
% \eli{This could be more explicit. Part of the key is that $\hat{g}$ could be bad because it was fit on confounded data. Is that the setup we want to consider? We have a larger dataset that we can train $\hat{g}$ on? Or are we always fitting $\hat{g}$ on the randomization dataset, where it will be unconfounded?}
% Note that because the texts are generated from $f^0$, $P^{f^0}$ can be computed from the same language model and is therefore known rather than estimated.

Finally, the doubly robust value function $V_{DR}$ can be estimated as a combination of these two terms.
\begin{align*}
    \widehat{V}_{DR}(f)=&\;\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}(Y_i-\widehat{g}(X_i)) + \\
    &\;\frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)
\end{align*}


Formally, it can be shown that $\widehat{V}_{DR}(f)$ is an unbiased estimator for $V(f)$ 
under two possible conditions, making it an effective proxy for the true causal optimization problem.

\begin{theorem}
\label{thm:dr_unbiased}
Let $\mathcal{D}_R$ be a randomized experiment parameterized by $P^R$, which may be estimated from a separate sample by $\widehat{P}^R$. Let $g(X)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]$, which may be estimated from a separate sample by $\widehat{g}(X)$. Then 
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]&=\E_{X \sim P^f}[\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]] \\
    &=V(f)
\end{align*}
if \textit{either}
\begin{enumerate}
    \item $\widehat{P}^R(X)=P^R(X)$, or
    \item $\widehat{g}(X)=g(X)$
\end{enumerate}
\end{theorem}

Importantly, because $P^R$ \textit{is known} in randomized experiments, including the crowdsourced data setting, it does not need to be estimated, which means that condition (1) is always fulfilled for DR-CPO. Therefore, $\widehat{V}_{DR}(f)$ is guaranteed to be unbiased for $V_{DR}$ \textit{even if the outcome model is incorrect}. In other words, DR-CPO is robust to misspecification of $\widehat{g}$, as the IPW term in its value function corrects for any bias from the predicted outcomes.

As a result, rather than learning only from the experimental data $X \sim P^R, Y \sim P^R_y$, a model optimized with DR-CPO will additionally be able to leverage the generative language model $f^0$ to learn from unlimited unlabeled text $\widetilde{X} \sim P^{f^0}$ with predicted outcomes $\widehat{g}(\widetilde{X})$.
This can reduce the variance of the value function estimator.
\begin{proposition}
  \label{prop:var_diff}
    If $\widehat{g}$ is fit on a separate sample, then conditional on $\widehat{g}$, $n \left(\Var\left(\widehat{V}_{IPW}(f)\right) - \Var\left(\widehat{V}_{DR}(f)\right)\right)$ is equal to 
   \begin{align*}
       \Var\left(\frac{P^f(X)}{P^R(X)}g(X)\right) - \Var\left(\frac{P^f(X)}{P^R(X)}\left(g(X)-\widehat{g}(X)\right)\right)\\
        \quad- \frac{n}{m} \Var\left(\frac{P^f(\widetilde{X})}{P^{f^0}(\widetilde{X})}\widehat{g}(\widetilde{X})\right)
   \end{align*}
     
\end{proposition}

Proposition~\ref{prop:var_diff} shows the difference in the variances of the IPW and DR value function estimators, scaled by the sample size $n$ of the crowdsourced data to highlight asymptotic differences. This difference indicates that $\Var(\widehat{V}_{DR}(f))<\Var(\widehat{V}_{IPW}(f))$ subject to two conditions: (i) the number of Monte Carlo samples drawn from $f^0$ is much larger than the sample size of the crowdsourced data, i.e., $m \gg n$; and (ii) $\widehat{g}(x)$ has \textit{some} additional predictive power compared to a constant model.

Condition (i) limits the component of the variance difference that arises due to Monte Carlo error from taking $m$ samples from the reference language model $f^0$. Then the main comparison is the difference between the variance of the expected outcome $g(x)$, and the variance of the \emph{prediction} error for the model. As a result, under condition (ii), we can expect the variance of $\widehat{V}_{DR}$ to be lower than the variance of $\widehat{V}_{IPW}$  (e.g., if $(\widehat{g}(x) - g(x))^2 < g(x)^2$).

% if we choose a large enough number of samples so that $m >> n$, this term is not a major component of the variance. The main component is the difference between the variance of the expected outcome $g(x)$, and the variance of the \emph{prediction} error for the model, $\widehat{g}(x) - g(x)$. If the model $\widehat{g}$ has \emph{some} predictive power over a constant model, so that e.g. $(\widehat{g}(x) - g(x))^2 < g(x)^2$, then we can expect the variance of $\widehat{V}_{DR}$ to be lower than the variance of $\widehat{V}_{IPW}$.

% This reduces the variance of the value function estimator $\widehat{V}_{DR}$ relative to $\widehat{V}_{IPW}$, and models learned under DR-CPO should have greater generalization than models learned under CPO. 

We note that in a different data setting where the true $P^R$ is unknown, a well-estimated $\widehat{g}$ that is close to the true $g$ can also help bias-correct any mis-estimation of $\widehat{P}^R(X)$. This may occur, for instance, when no text experiment or crowdsourced dataset is available, but a large amount of clean data exists to train the outcome model.

% Each of these value functions may be used to solve the CPO problem in practice. In the remainder of this paper, we refer to outcome modeling-based CPO (i.e., using $\widehat{V}_{out}$) as O-CPO; IPW-based CPO (i.e., using $\widehat{V}_{IPW}$) as IPW-CPO; and doubly robust CPO (i.e., using $\widehat{V}_{DR}$) as DR-CPO.

% In other words, the value function $V(f)$ can be identified from the observed data in multiple ways. With \textit{outcome modeling} ($V_{out}$), texts $X$ are generated from $P^f$ and outcomes are computed by a model $g(X)$, such that the optimization problem is to maximize the predicted outcome. With \textit{importance weighting} ($V_{IPW}$), or IPW, observed outcomes $Y \sim P^R_y$ are weighted by the density ratios between texts drawn from the model $X \sim P^f$ and texts drawn from the randomization distribution  $X \sim P^R$; this approximates the average outcome under $P^f$.
% % which are then maximized under optimization. 
% Finally, we use a \textit{doubly robust} construction ($V_{DR}$) that combines outcome modeling and IPW to provide robustness against misspecification or mis-estimation within either term---akin to doubly robust estimators that serve the same purpose when estimating causal effects. 

\subsection{Relationship to Existing Approaches}
\label{sec:rlhf_dpo_equivalence}

Elements of (DR-)CPO are reflected in two of the most prominent existing language model optimization approaches: RLHF and DPO. 

First, notice that the outcome modeling term $V_{out}$ is itself a way of identifying $V(f)$ from the observed data.
Outcome modeling relies entirely on the predictive model $\widehat{g}$ and unlabeled texts $\widetilde{X} \sim P^{f^0}$ and therefore does not require an experimental dataset $\mathcal{D}_R$. However, if $\widehat{g}(X)$ is not a good outcome model, then $\widehat{V}_{out}(f)$ will be biased with respect to $V(f)$. In particular, reward models trained on confounded data may be misspecified, since they can only capture the relationship between the text and the response---and not the confounders that have additionally influenced the response. While these issues are remedied if the confounding is also fully modeled, confounders are extremely difficult to measure fully in text data.

Optimization of $V_{out}(f)$ is closely related to RLHF via proximal policy optimization (PPO) in the direct outcome data setting. In particular, the PPO policy loss term can be seen as a version of outcome modeling in which the reward model is trained on paired completion data. Under these conditions, the RLHF reward model is analogous to the outcome model $g$, and the RLHF policy loss is mathematically equivalent to $V_{out}(f)$ (details in Appendix \ref{sec:rlhf_ties}).

% \eli{I think the framing and organization can be sharpened a bit as follows.  Have Section 4.3 be about the limitations of outcome modelling and IPW (and their connections to RLHF and DPO) 
%  I might characterize it as this:
% \begin{itemize}
%     \item Outcome modeling: doesn't require an experiment, but can be entirely wrong due to confounding
%     \item IPW: unbiased, but can have high variance and poor generalization properties because experiments are often small 
% \end{itemize}
% Then Section 4 is about how the DR approach addresses this. I think it can be framed explicitly as starting with the outcome modelling approach and then only using the experiment to correct for any bias in the outcome modelling approach. So we can even use externally fit outcome models that have some confounding, and it will still be ok.
% }

Likewise, DPO shares similarities with CPO. Both DPO and CPO fine-tune a language model for human feedback by directly using a preference dataset rather than relying on reward or outcome modeling. The DPO objective is similar to $V_{IPW}$ in that it directly increases the likelihood of texts corresponding to desired outcomes through importance weighting. However, because of the paired nature of the data, DPO increases the density ratio between preferred and non-preferred examples, while CPO directly increases the probability of texts with desired outcomes and decreases the probability of texts with non-desired outcomes. Given a paired data setting, the DPO objective could possibly be recovered from $V_{IPW}$; we leave this derivation for future work.

% These parallels mean that RLHF and DPO are subject to the same limitations that we describe for outcome modeling and importance weighting more generally. That is, RLHF can degrade under confounding or misspecification of its reward model, while DPO can experience high variance and limited generalization resulting from the size or coverage of its preference dataset. \vl{Is this paragraph necessary?}
% [While outcome modeling with $\widehat{V}_{out}(f)$ and importance weighting with $\widehat{V}_{IPW}(f)$ can be useful, both can easily result in poor optimization of the language model. Outcome modeling does not require a text experiment, but if $\widehat{g}(X)$ is not a good outcome model, then $\widehat{V}_{out}(f)$ will be biased with respect to $V(f)$. In fact, it is not unlikely that the outcome model will be incorrect when learned from real-world data---often due to confounding. 

% Under confounding, the outcome is not fully caused by the text, and so a model trained to predict the outcome from the text will inevitably be misspecified, since it cannot capture the confounders that have influenced the outcome in addition to the content of the text. [We explore such a setting in our experiments and show that models optimized with $\widehat{V}_{out}(f)$ degrade under confounding, while models optimized with $\widehat{V}_{DR}(f)$ remain unaffected.]

% [Importance weighting is unbiased given a text experiment, but optimization is limited to the text data from the experiment.] \vl{Are these the right pros/cons to be mentioning?}


% \subsection{CPO corrects estimation bias} 
% \subsection{Double Robustness of CPO}
% \vl{Feel like I'm repeating myself a bit in this section... does it read that way?}

% The limitations of outcome modeling and importance weighting motivate our doubly robust formulation DR-CPO, which provides (a) robustness against outcome model misspecification \textit{and} (b) generalizability beyond a specific preference dataset or text experiment. 
% At a high level, DR-CPO is unbiased for the true optimization problem as long as \textit{either} (1) the distribution of the preference dataset or text experiment is known, or (2) the ``true'' outcome model is known.


% \subsection{Double robustness of CPO}

% [Under what conditions is CPO advantageous? In particular, under what conditions does each loss term of $\mathcal{L}_{CPO}$ confer robustness against misspecification or mis-estimation (e.g., incorrect reward/outcome model; confounding)?]

% \subsection{Ties to DPO}

% Like the importance weighting component of CPO, DPO fine-tunes a language model for human feedback by directly using a preference dataset rather than relying on reward modeling. As is the case with RLHF, DPO requires preference data to come from a forced choice experiment in which users are asked to choose the better of two completions for a prompt.

% Letting $c_w$ and $c_l$ denote the preferred and non-preferred completions to the prompt $p$, respectively, the DPO objective is given by
% \begin{align*}
%     \mathcal{L}_{DPO}(\theta)=&\;\E\Bigg[\log \sigma \Bigg(\beta \log \frac{\pi_\theta(c_w|p)}{\pi_{\theta_0}(c_w|p)} \\
%     &- \beta\log \frac{\pi_\theta(c_l|p)}{\pi_{\theta_0}(c_l|p)}\Bigg)\Bigg]
% \end{align*}
% where $\pi_\theta$ is again the probability under the model being optimized, while $\pi_{\theta_0}$ is the probability under a reference model.

% As with RLHF, DPO can be seen as a special case of CPO in which the preference data comes from a forced choice experiment in which users are asked to choose the better of two completions for a prompt. Under these conditions, the DPO objective is analogous to the density ratio loss $\mathcal{L}_R$ in CPO. Letting $c_w$ and $c_l$ denote the preferred and non-preferred completions to the prompt $p$, respectively,
% \begin{equation}
%     \mathcal{L}_{DPO}(\theta)=\E\Bigg[\log \sigma \Bigg(\beta \log \frac{\pi_\theta(c_w|p)}{\pi_{\theta_0}(c_w|p)} - \beta\log \frac{\pi_\theta(c_l|p)}{\pi_{\theta_0}(c_l|p)}\Bigg)\Bigg]
% \end{equation}

% The DPO objective shares certain similarities with the $V_{IPW}$ value function---most notably in that it directly increases the likelihood of texts corresponding to desired outcomes through importance weighting. Because of the paired nature of the data, DPO increases the density ratio between preferred and non-preferred examples, while importance-weighted CPO directly increases the probability of texts with desired outcomes and decreases the probability of texts with non-desired outcomes in separate steps, rather than adjusting the probabilities through a ratio. 
% \vl{I think the future work about recovering DPO from a paired version of CPO should go in the discussion rather than here, but I'm not sure what a neat way to tie up this section would be}
% \eli{If this gets folded in to a section about IPW and it's limitations, I think this can be more of a brief aside than a main thing, and so wouldn't need to be neatly tied up. Agreed on the discussion portion}

% \vl{Not exactly sure what's the best way of explaining the parallel, since they use the ratio of the completion probabilities, whereas we multiply by -1 or 1 depending on which outcome is preferred. This has the same effect of increasing $\pi(c_w|p)$ and reducing $\pi(c_l|p)$, but it's not mathematically equivalent---so can we still call DPO a ``special case'' of CPO? Also, I guess they apply the logistic function over the whole thing, which we don't do?}

\section{Experiments}
\label{sec:experiments}

We conduct evaluations to empirically assess the effectiveness of CPO and DR-CPO in optimizing language models for human preferences on direct outcome data, and we examine the doubly robust properties of DR-CPO under confounding.

\subsection{Datasets}

To evaluate optimization on direct outcome data, we consider three crowdsourced or randomized experimental datasets in which human annotators provided numerical responses to texts.

\textbf{Hate Speech} \textit{(binary outcome).} The Hate Speech dataset \citep{qian-etal-2019-benchmark} consists of comments from the social media sites Reddit and Gab. Outcomes are collected via crowdsourcing and indicate whether the annotator perceives the comment to be hate speech. The Reddit comments are chosen from subreddits where hate speech is more common, and Gab is a platform where users sometimes migrate after being blocked from other social media sites. The optimization goal for this dataset is to generate texts that are \textit{less} hateful on average.

\textbf{Hong Kong} \textit{(scalar outcome).} The Hong Kong dataset \citep{fong2021causal} consists of texts concerning the Hong Kong democracy protests of 2019--2020. These texts are loosely based on speeches made about Hong Kong during U.S. Congressional sessions at the time of the protests. Outcomes are collected via a randomized experiment and indicate on a scale of 0--100 to what extent the respondent thinks that the U.S. should support Hong Kong during this time, after reading the text. The texts are programmatically constructed: for each text, 2 or 3 text attributes are randomly chosen out of 7 (e.g., \textit{commitment}, \textit{bravery}, \textit{mistreatment}). Short passages corresponding to each attribute are then randomly chosen from a pool of about 20 to construct the text. The optimization goal for this dataset is to generate texts with \textit{high} outcomes on average.

\textbf{Confounded} \textit{(scalar outcome).} The Confounded dataset is a version of the Hong Kong dataset where we have induced confounding. We consider the strongest possible form of confounding: the confounder is fully correlated with the outcome, resulting in all outcomes being negations of the original outcomes. This dataset is used to train outcome models. We include this dataset with the realistic expectation that text data is often confounded, which poses a threat to outcome model-based approaches. Therefore, it is necessary to evaluate how different optimization approaches fare under confounding. Like the Hong Kong dataset, the optimization goal for this dataset is to generate texts with \textit{high} outcomes on average.

% \subsection{Evaluation}

\subsection{Implementation}
\label{sec:implementation}

\textbf{Evaluation.} To evaluate how well optimization for human preferences has occurred, we use a text preference framework in which a reader is asked to choose the better (with respect to the outcome) of a pair of texts generated by two different methods. Using GPT-4 as a proxy for human annotators, we compare pairs of \textit{(method, baseline)} completions for the same prompt; across all pairs, we compute method \textit{win rates} and compute 95\% confidence intervals. Since the datasets used for these experiments contain one text per sample rather than a prompt and a completion, we create prompts on the evaluation set by truncating each text to a random length.

The full input provided to GPT-4 for each dataset can be found in Appendix \ref{sec:gpt_win_rate_questions}. We validate the use of GPT-4 as an annotator with a human study, which we describe in further detail in Section \ref{sec:gpt4_annotation}.


% [TODO: HOW WERE PROMPTS CONSTRUCTED? HOW WERE MODELS TRAINED? could also put this in datasets section at the beginning. datasets were split into train/eval splits... datasets were optimized on training split. then for eval split, since the datasets aren't in prompt/completion format by default, prompts were constructed by truncating texts to a random length.]

% [How did we generate confounding, and what do results on the confounded dataset tell us?]

% \textbf{Confounded Hate Speech?}

% \begin{itemize}
%     \item HK [How was this dataset created? What's different about it compared to natural text?]
%     % \item EmoBank (continuous)
%     \item Hatespeech
%     \item Confounded dataset(s) [How did we generate confounding, and what do results on the confounded dataset tell us?]
% \end{itemize}

\textbf{Methods.} We evaluate language models optimized using \textbf{CPO} and \textbf{DR-CPO}. As our baselines, we consider language models that have been fine-tuned on texts from each of the task datasets (\textbf{FT}), as well as models optimized using the outcome modeling value function $V_{out}(f)$. Since---as we discuss in Section \ref{sec:rlhf_dpo_equivalence}---the $V_{out}(f)$ objective is mathematically equivalent to the RLHF objective, we refer to this baseline as \textbf{OO-RLHF} (offline outcome RLHF).

We use Llama 2 7B \citep{touvron2023llama} as our base language model and fine-tune with low-rank adaptation \citep{hu2022lora}. All optimizations are applied after fine-tuning on text from the task dataset.

\textbf{Choice of $f^0$.} When optimizing with DR-CPO or OO-RLHF, any generative language model may be used as $f^0$, the fixed language model from which texts are drawn as input to the outcome model. One key consideration is whether $f^0$ should be a pre-trained model or whether it should be a model that has been fine-tuned on text relevant to the task---for instance, the randomized experiment dataset $\mathcal{D}_R$.

% In practice, we use the following general heuristics. 
In practice, we choose a pre-trained model as $f^0$ to leverage the diversity of texts such models tend to generate. If $\widehat{g}(X)$ is a good outcome model, then predicted outcomes on these texts will still be close to the true outcomes, and DR-CPO and OO-RLHF will benefit from outcome modeling. 

% If $\widehat{g}(X)$ is not a good outcome model, then O-CPO will be negatively affected. On the other hand, with DR-CPO, the bias-correction of the IPW component should ensure that the predicted outcomes---while not necessarily useful---do not hurt the overall optimization.

% If we feel that $\widehat{g}(X)$ may not be a very robust outcome model, we choose a fine-tuned model as $f^0$. [By keeping generated texts similar to the texts in the original randomized dataset $\mathcal{D}_R$, we ensure that $\widehat{g}(X)$ will only have to predict outcomes on texts similar to those that it has already seen before, increasing the likelihood that the predictions will be close to the true outcomes...?]
% Practical considerations may inform specific choices of $f^0$. For instance, one key 

\textbf{Choice of $\widehat{P}^R$.} In Section \ref{sec:identification}, we mention briefly that although the distribution of texts $P^R(X)$ under the randomized experiment is known, it can be helpful empirically to instead compute an estimated $\widehat{P}^R(X)$. This is generally because, merely by chance, the \textit{sample} probability of each text $X$ may differ from its theoretical probability \citep{Hajek1971, Sarndal2003_model}.

We find this to be the case in our experiments, and so we use $\widehat{P}^R(X)$ estimated from a Llama 2 7B model fine-tuned on $\mathcal{D}_R$ in our CPO and DR-CPO implementations.

% \subsection{Evaluation}

% \subsubsection{Text preferences}
% \begin{itemize}
%     \item FT model on text from target dataset
%     \item Apply optimization method on FTed model
%     \item Generate completions for the same prompt across all methods/models
%     \item Show GPT-4 pairs of (CPO, [competing method]) completions for the same prompt and ask it to choose the one that is better w.r.t. to the outcome
%     \item Compute overall win rate of CPO against each method
% \end{itemize}

% \subsubsection{Comparing GPT-4 to human annotators}

% [To assess the validity of using GPT-4 as a substitute for human annotators]

% \begin{itemize}
%     \item Show humans the same pairs of (CPO, [competing method]) completions and ask them to choose the one that is better w.r.t. to the outcome
%     \item 30 annotators who each annotate 20 samples. Total of 200 samples, so an average of 3 annotators per samples
%     \item Compute agreement between each human annotator
%     \item Compute agreement between each human annotator and GPT-4
%     \item Compare agreements to see if they are similar
% \end{itemize}

% \subsection{Implementation}

% \eli{Should all of this be under a separate (sub or subsub) section about implementation details?}
% [DISCUSS: When should $f^0$ be FTed vs. pretrained? Good $\widehat{g}(X)$, bad/limited $P^R$: $f^0$ should be pretrained, since this encourages $f$ to be optimized over a wider range of texts (as presumably texts generated by the pretrained model will be more diverse than texts generated by the FTed model). Bad/limited $\widehat{g}(X)$: $f^0$ should be FTed on $P^R$. Since we don't trust $\widehat{g}(X)$'s predictions on text that's very different from text it's already seen (assuming it was trained on $P^R$ as well), we want it to only make predictions on text similar to $P^R$.] \vl{For the bad/limited $\widehat{g}(X)$, is this reasoning correct? What if $\widehat{g}(X)$ wasn't trained on $P^R$? Then wouldn't its predictions be equally bad on $P^R$ and on text generated by the pretrained mode?}
% \eli{I do not understand this  point} \vl{I think this is the same as what you said in an earlier comment about whether we're fitting $\widehat{g}(X)$ on our unconfounded randomization dataset or whether we're fitting it on a larger potentially confounded dataset}

% [Also mention choice to use $\widehat{P}^R$ instead of $\frac{1}{n}$]

\section{Results and Discussion}
\label{sec:results}

\subsection{GPT-4 Annotation Validity}
\label{sec:gpt4_annotation}

\begin{table}[!ht]
    \centering
    \begin{tabular}{ccc}
    \toprule
    \toprule
        Annotator 1 & Annotator 2 & Fleiss' $\kappa$  \\
    \midrule
        Human & Human & 0.170 \\
        Human & GPT-4 & 0.219 \\
        Human majority & GPT-4 & 0.192 \\
    \bottomrule
    \bottomrule
    \end{tabular}
    \caption{Agreement rates of human annotators and GPT-4 when asked to choose preferred texts with respect to target outcomes. We examine inter-human agreement, human-GPT-4 agreement, and agreement between a majority vote of human annotators and GPT-4.}
    \label{tab:human_gpt4_agreement}
\end{table}

Following the precedent set by \cite{rafailov2023direct}, we conduct a human study to assess the validity of GPT-4 as an annotator when choosing between pairs of texts for a preferred outcome. Across 200 randomly sampled examples from the Hong Kong dataset, we show human annotators \textit{(method, baseline)} completion pairs and ask them to choose the better of the two with respect to the outcome. We compute agreement between each pair of human annotators, as well as agreement between each human annotator and GPT-4. Agreement is measured through Fleiss' $\kappa$ \citep{fleiss1971measuring}, a common metric for agreement among multiple raters.
% \vl{Mention that this is a ``fair'' amount of agreement}

We use the online research platform Prolific\footnote{\url{https://www.prolific.co/}} to conduct our human study. To avoid annotator fatigue, examples are annotated in batches of 20. We recruit a total of 30 annotators for an average of 3 annotators per example and a total of 600 annotations.

Across three comparisons---human-human, human-GPT-4, and human majority vote-GPT-4---we find that GPT-4 agrees with human annotators at least as well as human annotators agree with each other (Table~\ref{tab:human_gpt4_agreement}).
% While strict interpretation of $\kappa$ is task-dependent, both human-human and human-GPT-4 agreements may be reasonably interpreted as ``slight'' to ``fair.''
We conclude that GPT-4 is a reasonable surrogate for human annotators.

% We judge GPT-4 to be a valid surrogate for human annotators if human-GPT-4 agreement is similar to or better than inter-human agreement.

% [Human-human agreement is similar to Human-GPT4 agreement, so we conclude that GPT-4 is a reasonable substitute for human annotators]

\begin{figure*}[!ht]
    \centering
    \includegraphics[width=0.75\textwidth]{images/win_rates.png}
    \caption{CPO and DR-CPO win rates against OO-RLHF, FT, and each other. A win rate exceeding 0.5 indicates that the named method outperforms the competing method with respect to the target outcome. Win rates are computed across 2000 pairs for each method combination.}
    \label{fig:win_rates}
\end{figure*}

\begin{figure}[!ht]
    \centering
    \includegraphics[width=\columnwidth]{images/confounding.png}
    \caption{Impact of confounding (measured in win rate difference divided by 2) on OO-RLHF, CPO, and DR-CPO. A negative impact indicates that confounding hurts the performance of the method. Win rates are computed across 600 pairs for each method combination.}
    \label{fig:confounding_win_rates}
\end{figure}

\subsection{Text Preferences}

We report CPO and DR-CPO win rates against OO-RLHF and FT (and against one another) in Figure \ref{fig:win_rates}. In Figure \ref{fig:confounding_win_rates}, we examine the impact of confounding on OO-RLHF, CPO, and DR-CPO. Additional results are found in Appendix \ref{sec:additional_results}.

\textbf{Outcome optimization} (Figure \ref{fig:win_rates}). On the Hate Speech dataset, we observe that DR-CPO outperforms both the OO-RLHF and FT baselines. Against both baselines, DR-CPO's win rate is statistically significant at the 95\% confidence level, with the lower bound of its 95\% confidence intervals falling above 0.5. These results indicate that using DR-CPO, language models successfully learn human preferences for less hateful text from direct outcomes.

DR-CPO further appears to outperform CPO on the Hate Speech dataset, though its win rate is just shy of statistical significance. These results demonstrate the empirical benefits of double robustness, wherein a good $\widehat{P}^R$ provides bias-correction against a poorer $\widehat{g}$ (the former evidenced by DR-CPO's relatively smaller win rate margin against CPO, and the latter evidenced by DR-CPO's larger win rate margin against OO-RLHF).

% Since CPO is applied on the fine-tuned model, DR-CPO being preferred over FT suggests that DR-CPO is successfully optimizing language models for the Hate Speech dataset target outcome: making texts less hateful. Moreover, DR-CPO being preferred over O-CPO and statistically indistinguishable from IPW-CPO provides empirical evidence for the doubly robust properties of DR-CPO, wherein a good $\widehat{P}^R$ provides bias-correction against a poorer $\widehat{g}(X)$.
% This is significant for a number of reasons. First, as we mention in Section \ref{sec:implementation}, the base model for our optimization is Llama-2-7b---an LLM that has already been trained with extensive RLHF for 

On the Hong Kong dataset, we likewise observe that CPO outperforms both the OO-RLHF and FT baselines. In this setting, CPO achieves statistically significant win rates against both baselines, with the lower bound of its 95\% confidence intervals falling above 0.5. 

Here, in contrast to the Hate Speech dataset, CPO also outperforms DR-CPO, which suggests that CPO can be very strong under conditions where $P^R$ is well controlled or can be estimated very well---as is the case for the Hong Kong dataset, where texts are not only randomly assigned to annotators but programmatically generated from random attributes. Furthermore, we note that outcome models trained on the Hong Kong dataset do not achieve good performance or generalization outside of the training data, possibly due to the artificial nature and relative homogeneity of the texts. Therefore, our results point to a conclusion that under conditions where the outcome model is particularly difficult to learn, CPO may enjoy \textit{empirical} advantages over DR-CPO despite the \textit{theoretical} robustness of the latter.

\textbf{Double robustness under confounding} (Figure \ref{fig:confounding_win_rates}). Finally, on the Confounded dataset, we find that DR-CPO remains robust under confounding, while OO-RLHF degrades significantly. (Vanilla CPO is not affected by confounding because it does not use an outcome model, but we train two separate models to account for randomness in the optimization process.) OO-RLHF experiences a negative impact to win rate that is significantly lower than 0, while CPO experiences no impact and DR-CPO experiences a positive impact. Exploring this last result is an avenue for future work.
% When provided an unconfounded outcome model, O-CPO achieves a win rate statistically significantly above 0.5 compared to O-CPO with a confounded outcome model. In contrast, DR-CPO is not negatively impacted by a confounded outcome model.

These results further illustrate the doubly robust properties of DR-CPO and the shortcomings of outcome modeling approaches. Even under aggressive confounding, with a worst-case outcome model that has been trained on completely negated data, DR-CPO is not compromised, while OO-RLHF is.

We reiterate that because they are optimized on randomized experimental data, $\mathcal{D}_R$, CPO and DR-CPO are \textit{causal} approaches. Taken together, our results constitute empirical evidence for a core theoretical strength---robustness to confounding---of an optimization framework that maintains the causal relationship between text and outcome.

% \textbf{Analysis of $f^0$ and $P^R$}

% If $\widehat{g}(X)$ is \textit{not} a good outcome model, then it may instead be beneficial to use a model fine-tuned on $\mathcal{D}_R$ as $f^0$. In this case, if the fine-tuning is successful, then $P^{f^0} \approx P^R$, and consequently $\widehat{V}_{DR}(f)$ will reduce to $\widehat{V}_{IPW}(f)$ \vl{SHOW THIS IN APPENDIX?}, removing the negative influence of the outcome model in DR-CPO. \vl{But why does this matter if the bias-correction of the IPW term in DR-CPO should guarantee that it isn't negatively affected by the bad outcome model anyway?}


% \begin{itemize}
%     \item On Hatespeech and EmoBank, CPO is preferred over CLM and each of its ablations. This tells us that (a) the optimization works and (b) the double robustness is working.
%     \item On HK, IPW is preferred over CLM, CPO, and outcome modeling. This tells us that under conditions where $P^R$ is well controlled or can be estimated very well, IPW is strong. Furthermore, under conditions where the outcome model may not be very good, outcome modeling and CPO (despite double robustness) may fall short of IPW. [Due to its careful randomization---not only with respect to how texts are assigned to annotators but also in how the texts themselves are constructed---the Hong Kong dataset is a setting in which we expect IPW to work well]
%     \item On the confounded dataset, outcome modeling under confounding is significantly worse than outcome modeling without confounding. However, IPW and CPO are not significantly worse under confounding than without confounding. This illustrates one of the core strengths of a causal approach: outcome/reward modeling without causal considerations may be susceptible to confounding in the data. IPW does not use outcome modeling and is therefore not susceptible, and the double robustness of CPO also prevents it from being affected.
% \end{itemize}


\section{Conclusion}

In this paper, we explore language model optimization for human preferences from direct outcome datasets, in which each sample consists of a text and the reader's numerical response. We first posit that language model optimization should be viewed as a causal problem to ensure that the model correctly learns the relationship between the text and the outcome, and we define conditions under which this causal relationship can be guaranteed. Following this, we introduce CPO, a method that solves an unbiased surrogate objective for the causal language optimization problem---and improve upon it with the doubly robust DR-CPO, which reduces the variance of the CPO objective while retaining provably strong guarantees on bias. Finally, we empirically demonstrate the effectiveness of (DR-)CPO in optimizing state-of-the-art LLMs for human preferences on direct outcome data, and we validate the robustness of DR-CPO under difficult confounding conditions. These theoretical contributions and results open the door to a wide range of data, human preferences, and optimization goals that language models can learn using CPO.

Several natural lines of future research follow from this work. For instance, (DR-)CPO may benefit empirically from exploration of entropy regularization techniques that are common in policy optimization. Additionally, future work may wish to extend DR-CPO to the paired completion data setting, as the bias guarantees and variance reduction of a doubly robust approach can also be useful for paired data.

% ---from which one could likely recover the DPO objective.

% [TODO: 1 paragraph -- or 2 if we want to discuss future work with paired version to recover DPO?]


% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
                         
This material is based upon work partially supported by Meta and the National Institutes of Health (awards R01MH125740, R01MH132225, and R21MH130767). Victoria Lin is supported by a Meta Research PhD Fellowship. Any opinions, findings, conclusions, or
recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the sponsors, and no official endorsement should be inferred.
\end{acknowledgements}

% References
\bibliography{ref}

\newpage

\onecolumn

\title{Optimizing Language Models for Human Preferences \\ is a Causal Inference Problem\\(Supplementary Material)}
\maketitle

\appendix


\section{Technical results}
\label{sec:technical_results}
\subsection{Identifying $V(f)$}

\begin{proof}[Proof of Proposition~\ref{eq:v_ipw_proposition}]

We can show that $V(f)=\E_{Y(\cdot)\sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]]=\E_{Y \sim P^R_y}[\E_{X \sim P^R}[\frac{P^f(X)}{P^R(X)}Y]]$.
\begin{align*}
    V(f)&=\E_{Y(\cdot)\sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=\E_{Y(\cdot)\sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)Y(x)\Bigg] \\
    &=\E_{Y(\cdot)\sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^R(x)\frac{P^f(x)}{P^R(x)}Y(x)\Bigg] \\
    &=\E_{Y(\cdot)\sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}Y(X)\Bigg]\Bigg] \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}Y\Bigg]\Bigg]
\end{align*}

\end{proof}


\begin{proof}[Proof of Proposition~\ref{eq:v_dr_proposition}]

We can show that $V(f)=\E_{Y(\cdot)\sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]]=\E_{Y \sim P^R_y}[\E_{X \sim P^R}[\frac{P^f(X)}{P^R(X)}(Y-g(X))]] + \E_{X\sim P^f}[g(X)]$.
\begin{align*}
    V(f)&=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}Y\Bigg]\Bigg] \tag*{(Proposition \ref{eq:v_ipw_proposition})} \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X)+g(X))\Bigg]\Bigg] \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] + \E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}g(X)\Bigg]\Bigg] \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] + \E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}g(X)\Bigg] \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] + \sum_{x \in \mathcal{X}}P^R(x)\frac{P^f(x)}{P^R(x)}g(x) \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] + \sum_{x \in \mathcal{X}}P^f(x)g(x) \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] + \E_{X \sim P^f}[g(X)]
\end{align*}
    
\end{proof}

% \subsection{Derivation of Proposition \ref{eq:v_ipw_proposition} [TODO]}
% \label{sec:v_ipw_derivation}

% \subsection{Derivation of Proposition \ref{eq:v_dr_proposition} [TODO]}
% \label{sec:v_dr_derivation}

\subsection{Unbiasedness proofs and variance difference}
\label{sec:unbiased}
\begin{proof}[Proof of Theorem~\ref{thm:ipw_unbiased}]
    

We can show that $\E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{IPW}(f)]]=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]]=V(f)$ when $P^R$ is known.

\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^R}[\widehat{V}_{IPW}(f)]]
    &=\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}Y_i\Bigg]\Bigg] \\
    &=\frac{1}{n}\sum_{i=1}^n\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{P^R(X_i)}Y_i\Bigg]\Bigg] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}Y(x)\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}Y(x)\underbrace{\E_{X \sim P^R}[\mathbbm{1}\{X=x\}]}_{P^R(x)}\Bigg] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)Y(x)\Bigg] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=V(f)
\end{align*}

\end{proof}

% \subsection{Unbiasedness of $\widehat{V}_{DR}(f)$}
% \label{sec:unbiased_dr}
\begin{proof}[Proof of Theorem~\ref{thm:dr_unbiased}]
    

We can show that $\E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]]=V(f)$ under one of two conditions: (1) $\widehat{P}^R(X)=P^R(X)$ (i.e., $P^R$ is known) or (2) $\widehat{g}(X)=g(X)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]$ (i.e., $g(X)$ is known).

First, we rewrite $\E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]$:
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_{X,\widetilde{X}}[\widehat{V}_{DR}(f)]] =& \; \E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X,\widetilde{X}}\Bigg[\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i)) + \frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg] \\
    =& \;\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X\sim P^R}\Bigg[\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg] \\
    &+ \E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{\widetilde{X}\sim P^{f^0}}\Bigg[\frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg] \\
    =& \; \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg] \\ 
    &+ \frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg]
\end{align*}

Case (1) $\widehat{P}^R(X)=P^R(X)$:


Rewriting the first term,
\begin{align*}
    % \mathcal{L}_R&=
    \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg]
    &=\frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{P^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}(Y(x)-\widehat{g}(x))\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}(Y(x)-\widehat{g}(x))\underbrace{\E_{X \sim P^R}[\mathbbm{1}\{X=x\}]}_{P^R(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)(Y(x)-\widehat{g}(x))\Bigg]\\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)-\widehat{g}(X)]] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]]-\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]]
\end{align*}

Rewriting the second term,
\begin{align*}
    % \mathcal{L}_O&=
    \frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg]
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}\widehat{g}(x)\mathbbm{1}\{\widetilde{X}=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}\widehat{g}(x)\underbrace{\E_{\widetilde{X} \sim P^{f^0}}[\mathbbm{1}\{\widetilde{X}=x\}]}_{P^{f^0}(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)\widehat{g}(x)\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]]
    % &=\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[g(X)]
\end{align*}

% \begin{align*}
%     \mathcal{L}_{CPO}&=
%     \mathcal{L}_R+\mathcal{L}_O \\
%     &=\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[Y(X)]-\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[g(X)]+\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[g(X)] \\
%     &=\E_{Y(\cdot) \sim \mathcal{G},X \sim P^f}[Y(X)]
% \end{align*}
Then we have
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]]-\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]]+\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=V(f)
\end{align*}

Case (2) $\widehat{g}(X)=g(X)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]$:

Rewriting the first term,
\begin{align*}
    % \mathcal{L}_R&=
    \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg]
    &= \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-g(X_i))\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\underbrace{\E_{X \sim P^R}[\mathbbm{1}\{X=x\}]}_{P^R(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\Bigg] \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}\E_{Y(\cdot) \sim \mathcal{G}}[Y(x)-g(x)] \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}(\E_{Y(\cdot) \sim \mathcal{G}}[Y(x)]-g(x)) \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}\cdot 0 \\
    &=0
\end{align*}

Rewriting the second term,
\begin{align*}
    % \mathcal{L}_O&=
    \frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg]
    &=\frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}g(\widetilde{X}_j)\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}g(x)\mathbbm{1}\{\widetilde{X}=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}g(x)\underbrace{\E_{\widetilde{X} \sim P^{f^0}}[\mathbbm{1}\{\widetilde{X}=x\}]}_{P^{f^0}(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)g(x)\Bigg] \\
    &=\sum_{x \in \mathcal{X}}P^f(x)g(x) \\
    &=\E_{X \sim P^f}[g(X)] \\
    &=\E_{X \sim P^f}[\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]]
\end{align*}

Then we have
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]
    &=0+\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=V(f)
\end{align*}

\end{proof}



% \subsection{Variance}


\begin{proof}[Proof of Proposition~\ref{prop:var_diff}]
      
      First, we compute the variance of $\widehat{V}_{IPW}(f)$, where $X_i \sim P^R$ and $Y_i \sim P^R_y$ i.i.d.
 \begin{align*}
     \Var\left(\widehat{V}_{IPW}(f)\right) & = \Var\left(\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}Y_i  \right)\\
     & = \frac{1}{n^2}\sum_{i=1}^n \Var\left(\frac{P^f(X_i)}{P^R(X_i)}Y_i \right)\\
     & = \frac{1}{n}\Var\left(\frac{P^f(X)}{P^R(X)}Y \right)\\
     & = \frac{1}{n}\E\left[\Var\left(\frac{P^f(X)}{P^R(X)}Y \mid X \right)\right] + \frac{1}{n}\Var\left(\E\left[\frac{P^f(X)}{P^R(X)}Y \mid X\right] \right)\\
     & = \frac{1}{n}\E\left[\frac{P^f(X)^2}{P^R(X)^2}\Var\left(Y \mid X \right)\right] + \frac{1}{n}\Var\left(\frac{P^f(X)}{P^R(X)}\E\left[Y \mid X\right] \right)\\
     & = \frac{1}{n}\E\left[\frac{P^f(X)^2}{P^R(X)^2}\Var\left(Y \mid X \right)\right] + \frac{1}{n}\Var\left(\frac{P^f(X)}{P^R(X)}g(X)\right)
 \end{align*}
 
 where we have used that under randomization $\E[Y \mid X = x ] = \E_{Y(\cdot) \sim \mathcal{G}}[Y(x)] = g(x)$.
 
 For the variance of $\widehat{V}_{DR}(f)$, we first note that if $\widehat{g}$ is fit on a separate, independent sample, we have that 
 \begin{align*}
      \Var\left(\widehat{V}_{DR}(f)\right) & = \Var\left(\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}(Y_i-\widehat{g}(X_i)) + \frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\right)\\
      & = \frac{1}{n^2}\sum_{i=1}^n \Var\left(\frac{P^f(X_i)}{P^R(X_i)}(Y_i-\widehat{g}(X_i))\right) + \frac{1}{m^2}\sum_{j=1}^m \Var\left(\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\right)\\
      & = \frac{1}{n} \underbrace{\Var\left(\frac{P^f(X)}{P^R(X)}(Y-\widehat{g}(X))\right)}_{(\ast)} + \frac{1}{m} \Var\left(\frac{P^f(\widetilde{X})}{P^{f^0}(\widetilde{X})}\widehat{g}(\widetilde{X})\right)
 \end{align*}
 where $X \sim P^R$, $Y \sim P^R_y$, and $\widetilde{X} \sim P^{f^0}$.
 Now notice that
 \begin{align*}
     (\ast) & = \E\left[\Var\left(\frac{P^f(X)}{P^R(X)}(Y-\widehat{g}(X)) \mid X\right)\right] + \Var\left(\E\left[\frac{P^f(X)}{P^R(X)}(Y-\widehat{g}(X)) \mid X \right]\right)\\
     & = \E\left[\frac{P^f(X)^2}{P^R(X)^2}\Var\left(Y \mid X\right)\right] +  \Var\left(\frac{P^f(X)}{P^R(X)}\left(g(X)-\widehat{g}(X)\right)\right)
 \end{align*}


\end{proof}

\subsection{Equivalence of $V_{out}(f)$}
\label{sec:v_out_rewritten}

We can show that our rewriting of $V_{out}(f)$ is equivalent to our original definition:
\begin{align*}
    V_{out}(f)&=\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X})}{P^{f^0}(\widetilde{X})}g(\widetilde{X})\Bigg] \\
    &=\sum_{x \in \mathcal{X}}P^{f^0}(x) \frac{P^f(x)}{P^{f^0}(x)}g(x) \\
    &=\sum_{x \in \mathcal{X}}P^f(x)g(x) \\
    &=\E_{X \sim P^f}[g(X)]
\end{align*}

\section{Parallels between RLHF and optimization of $V_{out}(f)$}
\label{sec:rlhf_ties}

The loss function under RLHF is typically computed through proximal policy optimization (PPO):
\begin{align*}
    \mathcal{L}(\theta, \phi) = \mathcal{L}^{PPO}_{\text{policy}}(\theta) + c_1\mathcal{L}^{PPO}_{\text{value}}(\phi) - c_2\mathcal{L}^{PPO}_{\text{entropy}}(\theta)
\end{align*}

where $\mathcal{L}^{PPO}_{\text{value}}(\phi)$ and $\mathcal{L}^{PPO}_{\text{entropy}}(\theta)$ are regularization terms and $\mathcal{L}^{PPO}_{\text{policy}}(\theta)$ is the \textit{policy loss}. Letting $p$ denote the prompt, $c$ denote the completion, and $r$ denote the reward model, we consider only the policy loss without any stability tricks like clipping.  $\pi_\theta$ is the probability under the policy being optimized, while $\pi_{\theta_0}$ is the probability under a reference policy (often the starting policy or the policy at the previous step).
\begin{equation*}
    \mathcal{L}^{PPO}_{\text{policy}}(\theta)=
    \E\left[\frac{\pi_\theta(c|p)}{\pi_{\theta_0}(c|p)}\cdot r(p,c)\right]
\end{equation*}

We can see the correspondence between $\pi_\theta(c|p)$ and $P^f(X)$, $\pi_{\theta_0}(c|p)$ and $P^{f^0}(X)$, and $r(p,c)$ and $g(X)$; substituting these terms renders $\mathcal{L}^{PPO}_{\text{policy}}$ equal to $V_{out}(f)$.

\section{Experiments}

\subsection{GPT-4 Win Rate Prompts}
\label{sec:gpt_win_rate_questions}

In this section, we include the inputs provided to GPT-4 for each dataset to obtain its preferences between texts generated by different methods. We use \verb|gpt-4-1106-preview| from the OpenAI API. We adapt the prompt formats described in \cite{rafailov2023direct}. The order of texts is random for each evaluation. For a \textit{(method, baseline)} comparison, \textit{method} corresponds to text A 50\% of the time and text B 50\% of the time.

\subsubsection{Hate Speech prompt}

\begin{lstlisting}
Hate speech is a direct attack on an individual based on protected characteristics: race, ethnicity, national origin, religious affiliation, sexual orientation, caste, sex, gender, gender identity, and serious disease or disability. 

Under the definition of hate speech, which of the following two texts is less hateful?

FIRST provide a one-sentence comparison of the two texts, explaining which you think is less hateful and why.

SECOND, on a new line, state only "A" or "B" to indicate your choice. Your response should use the format:
Comparison: <one-sentence comparison and explanation>
Preferred: <"A" or "B">

A: <text A>

B: <text B>
\end{lstlisting}

\subsubsection{Hong Kong and Confounded prompt}

\begin{lstlisting}
In March 2019, the Hong Kong government proposed a bill that would have allowed certain criminal suspects to be extradited to mainland China for trial. The 2019-2020 Hong Kong democracy protests were a series of demonstrations that began in response to this bill, stoked by growing concerns about the increasing influence of the Chinese government over Hong Kong. Hong Kong currently has its own legal and political system and is considered semi-autonomous. 

Which of the following two texts better convinces you that the U.S. government should support the Hong Kong democracy protesters?

FIRST provide a one-sentence comparison of the two texts, explaining which you think is more convincing and why.

SECOND, on a new line, state only "A" or "B" to indicate your choice. Your response should use the format:
Comparison: <one-sentence comparison and explanation>
Preferred: <"A" or "B">

A: <text A>

B: <text B>
\end{lstlisting}

% \subsection{Human Agreement Study [TODO]}
% \label{sec:gpt4_annotation_details}



\subsection{Additional Results}
\label{sec:additional_results}

\begin{table*}[!ht]
    \centering
    \begin{tabular}{c|cc|cc}
    \toprule
    \toprule
    & \multicolumn{2}{c|}{Hong Kong} & \multicolumn{2}{c}{Hate Speech} \\
        & CPO win rate & DR-CPO win rate & CPO win rate & DR-CPO win rate \\
    \midrule
        FT & \textbf{\textcolor{green!50!black}{0.528* [0.506, 0.550]}} & 0.477 [0.455, 0.499] & \textcolor{green!50!black}{0.517 [0.495, 0.539]} & \textbf{\textcolor{green!50!black}{0.524* [0.502, 0.546]}} \\
        CPO & - & 0.441 [0.419, 0.463] & - & 0.518 [0.496, 0.540] \\
        OO-RLHF & \textbf{\textcolor{green!50!black}{0.542* [0.520, 0.564]}} & 0.482 [0.460, 0.504] & \textbf{\textcolor{green!50!black}{0.538* [0.516, 0.560]}} & \textbf{\textcolor{green!50!black}{0.560* [0.538, 0.582]}} \\
        DR-CPO & 0.559* [0.537, 0.581] & - & 0.482 [0.460, 0.504] & -\\
    \bottomrule
    \bottomrule
    \end{tabular}
    \caption{CPO and DR-CPO win rates against OO-RLHF, FT, and each other. A win rate exceeding 0.5 indicates that the named method outperforms the competing method with respect to the target outcome. Win rates are computed across 2000 pairs for each method combination; brackets show 95\% confidence intervals.}
    \label{tab:win_rates}
\end{table*}

\begin{table}[!ht]
    \centering
    \begin{tabular}{c|c}
    \toprule
    \toprule
    & Unconfounded win rate over confounded \\
    \midrule
    OO-RLHF & \textbf{\textcolor{red!50!black}{0.546* [0.506, 0.586]}} \\
    DR-CPO & \textbf{\textcolor{green!50!black}{0.456* [0.416, 0.496]}} \\
    CPO & 0.505 [0.465, 0.545] \\
    \bottomrule
    \bottomrule
    \end{tabular}
    \caption{Win rates with outcome models trained on unconfounded data (Hong Kong) vs. confounded data (Confounded). A win rate exceeding 0.5 indicates that the method+unconfounded outcome model outperforms the method+confounded outcome model with respect to the target outcome---in other words, that confounding hurts the method. Win rates are computed across 600 pairs for each method combination; brackets show 95\% confidence intervals.}
    \label{tab:confounding_win_rates}
\end{table}

We include the full set of CPO and DR-CPO win rates against OO-RLHF and FT and against each other (Table~\ref{tab:win_rates}). We also include raw win rates from the confounding experiments, specifically win rates of unconfounded methods over confounded methods (Table~\ref{tab:confounding_win_rates}). We briefly discuss comparisons that did not appear in the main body of the paper.

On the Hate Speech dataset, we observe that CPO---like DR-CPO---also outperforms both the OO-RLHF and FT baselines. Against OO-RLHF, CPO's win rate is statistically significant at the 95\% confidence level, while its win rate against FT falls slightly short of statistical significance.

On the Hong Kong dataset, we find that DR-CPO performs comparably to OO-RLHF but falls short against the other methods. We attribute this to possible difficulty in learning the outcome model itself; this is further evidenced by the strong performance of CPO, which does not use an outcome model. As we mention in the main results, learning a strong outcome model on the Hong Kong dataset may be challenging, as its texts read somewhat artificially due to their programmatic construction.

\end{document}
