% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{bbm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\allowdisplaybreaks
\usepackage{listings}
\lstset{
basicstyle=\small\ttfamily,
columns=flexible,
breaklines=true
}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example


\newcommand{\vl}[1]{\textcolor{orange}{[VL: #1]}}
\newcommand{\edit}[1]{\textcolor{orange}{#1}}

\newcommand{\eli}[1]{\textcolor{orange}{[Eli: #1]}}

\newcommand{\E}{\mathbb{E}}
\newcommand{\R}{\mathbbm{R}}
\newcommand{\Var}{\text{Var}}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


\title{Optimizing Language Models for Human Preferences \\ is a Causal Inference Problem}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<vlin2@andrew.cmu.edu>?Subject=Causal Preference Optimization}{Victoria Lin}}
\author[1]{\href{mailto:<ebenmich@andrew.cmu.edu>?Subject=Causal Preference Optimization}{Eli Ben-Michael}}
\author[1]{\href{mailto:<morency@cs.cmu.edu>?Subject=Causal Preference Optimization}{Louis-Philippe Morency}}
% Add affiliations after the authors
\affil[1]{%
    Carnegie Mellon University
}

  
\begin{document}
\maketitle

\begin{abstract}
  As large language models (LLMs) see greater use in academic and commercial settings, there is increasing interest in methods that allow language models to generate texts aligned with human preferences. In this paper, we present an initial exploration of language model optimization for human preferences from \textit{direct outcome datasets}, where each sample consists of a text and an associated numerical outcome measuring the reader's response. We first propose that language model optimization should be viewed as a \textit{causal problem} to ensure that the model correctly learns the relationship between the text and the outcome. We formalize this causal language optimization problem, and we develop a method---\textit{causal preference optimization} (CPO)---that solves an unbiased surrogate objective for the problem. We further extend CPO with \textit{doubly robust} CPO (DR-CPO), which reduces the variance of the surrogate objective while retaining provably strong guarantees on bias. Finally, we empirically demonstrate the effectiveness of (DR-)CPO in optimizing state-of-the-art LLMs for human preferences on direct outcome data, and we validate the robustness of DR-CPO under difficult confounding conditions.
\end{abstract}

\section{Introduction}
\label{sec:intro}

Recent advances in computation have yielded large-scale self-supervised language models that achieve impressive performance on a variety of natural language processing (NLP) tasks \citep{zhang2022opt, chowdhery2023palm, lescao2023bloom, bubeck2023sparks}. These large language models (LLMs)---trained on vast amounts of text data of varying quality---can acquire less desirable attributes from these texts, and so they often require further fine-tuning on human preferences to improve their factual correctness and alignment with social values (e.g., less toxic, more helpful) \citep{NEURIPS2022_b1efde53, bommasani2022opportunities}. 

In this paper, we examine a paradigm for language model optimization for human preferences that has previously been underexplored: learning from \textit{direct outcome datasets}, which are ubiquitous in NLP. In contrast to paired completion data consisting of prompts followed by one preferred and one non-preferred completion, direct outcome datasets are text datasets where each sample consists of a text and an associated numerical \textit{outcome} measuring the reader's response to the text (e.g., Reddit upvotes \citep{Lakkaraju_McAuley_Leskovec_2021}, Amazon ratings \citep{mcauley2013amazon}). A large number of direct outcome datasets are \textit{crowdsourced datasets}, where annotators on a crowdsourcing platform are randomly assigned to read and respond to texts. 

The ability to learn human preferences from direct outcome data significantly broadens the scope of problems that can be addressed by learning from human feedback. Consider the task of inducing a language model to unlearn hate speech. Current unlearning approaches typically use paired data in the format \textit{([hate speech], [alternative text])}, with the preferred text being the latter. Constructing alternative texts can be difficult \citep{eldan2023whos,maini2024tofu}, and rather than fully unlearning the hate speech, the language model is instead trained to preferentially generate the alternative text \citep{patil2024can}. Direct outcome data, by contrast, allows texts to be directly marked as hateful and removed from the language model without learning or requiring an alternative text. Moreover, direct outcome datasets greatly outnumber paired completion datasets in NLP: at the time of writing, for instance, the HuggingFace Datasets hub contains thousands of direct outcome datasets and fewer than 50 paired completion datasets.

We present an initial exploration of language model optimization in the direct outcome setting, where the language model is fine-tuned to optimize texts with respect to a desired outcome. We first note that learning an optimal language model can be difficult due to the presence of unmeasured \textit{confounding} in the training data: external factors that affect both readers' choice of texts to read and how they tend to respond to those texts. Language models optimized on confounded data may learn incorrect relationships between texts and reader responses, leading them to generate sub-optimal text. For instance, users of hate speech are both (i) more likely to engage with content containing hate speech and (ii)  more likely to rate hateful content positively. Such confounding may lead to incomplete unlearning of hate speech, as some examples of hate speech are assigned positive outcomes in the confounded data.

Therefore, we posit that language model optimization should be viewed as a \textit{causal} problem in order to ensure that the optimal language model \textit{causes} preferred outcomes.
In this paper, we introduce a novel causal formulation of the language model optimization problem.
The solution to this optimization problem finds how to \textit{intervene} on the text distribution of the generating model to best \textit{cause} an optimal outcome (i.e., generation of human-preferred texts).

We observe that in the direct outcome setting, it is possible in practice to guarantee that the observed relationship between the text and the outcome is causal by leveraging crowdsourced datasets. Due to random assignment of texts to readers, crowdsourced datasets are not subject to external confounding and can in fact be viewed as randomized experiments \citep{lin-etal-2023-text}. Building on this observation, we present two methodological contributions that enable causal language model optimization on direct outcome datasets. First, we develop \textit{causal preference optimization} (CPO), 
which solves an unbiased surrogate objective for the causal optimization problem.
Next, we extend this to \textit{doubly robust} CPO (DR-CPO), which improves on CPO by 
reducing the variance of the surrogate objective via outcome modeling while retaining provably strong guarantees on bias.\footnote{Our code is publicly available at \url{https://github.com/torylin/causal-preference-optimization}.}

We empirically assess the effectiveness of (DR-)CPO in optimizing state-of-the-art LLMs for human preferences on direct outcome data, both with and without confounding. We find that CPO methods successfully optimize LLMs for human preferences and outperform baselines, and we further observe empirical evidence for the robustness of DR-CPO under difficult confounding conditions.

\section{Related Work}
\label{sec:related_work}

\subsection{Language Model Optimization}


The performance of large self-supervised language models can be further improved by fine-tuning on datasets that align them with human-preferred text \citep{NEURIPS2022_b1efde53, bommasani2022opportunities}. These \textit{paired completion datasets} typically consist of prompts followed by two candidate completions, one of which is indicated to be human-preferred \citep{pmlr-v162-ethayarajh22a, bai2022training, ji2023beavertails}. A reinforcement learning algorithm may then derive its reward model from these datasets (reinforcement learning from human feedback, or RLHF) \citep{christiano2017rlhf}, after which language models are fine-tuned to maximize the human preference reward under the RLHF algorithm. 

While RLHF has seen widespread use \citep{ stiennon2020summarize, touvron2023llama}, it is computationally demanding, as its training loop requires that new texts be generated and new rewards be computed at each step. 
Consequently, in recent months, methods that allow language models to learn more directly from human preference data have emerged 
\citep{hejna2023contrastive, dumoulin2024density}---the most popular of which is direct preference optimization (DPO) \citep{rafailov2023direct}. Like RLHF, DPO is designed for use with paired completion datasets, maximizing the probability ratio of preferred completions to non-preferred completions over the paired completion dataset.

\subsection{Causal Inference and Doubly Robust Policy Learning}

Although RLHF and DPO constitute the two most popular optimization approaches for language models, there exists a wide body of work on estimation and policy learning outside of the NLP space. Some notable work relevant to this paper includes a long history of doubly robust estimation of causal effects \citep{Robins1994} and---more directly applicably---doubly robust policy learning \citep{dudik2011doublyrobust, pmlr-v48-jiang16, Tang*2020Doubly,Athey2021, pmlr-v162-kallus22a}.

In causal inference, double robustness denotes an estimator formulation that provides robustness against misspecification of \textit{nuisance} parameters or functions. In particular, doubly robust estimators combine two existing estimators---an \textit{importance weighting} estimator and an \textit{outcome modeling} estimator---such that only one of the two components must be correctly specified or estimated to guarantee the unbiasedness of the estimator \citep{Robins1994, chernozhukov_locally_2022}. This can also be viewed as the importance weighting term providing a \textit{bias correction} for the outcome modeling term. The principle of double robustness can be extended to not only the estimation of causal effects but also the estimation of any quantity, including loss functions or policy objectives, as we do here.

\section{A Causal View of Language Model Optimization}
\label{sec:causal_formulation}

When a language model $f$ is trained or fine-tuned to generate texts that are consistent with human preferences, the implicit goal can be seen as optimizing texts $X \sim P^f$---texts generated from the model $f$---with respect to some outcome $Y$. In this paper, we consider a direct outcome data format 
$\mathcal{D}_O=\{(X_1,Y_1),\dots,(X_n,Y_n)\}$,
where $X_i$ is a text that individual $i$ interacts with and $Y_i$ is any numerical response of the individual to the text (e.g., ratings, either binary or scalar).

An association-based approach to optimize model $f$ is to generate texts that are similar to those that have high outcomes in the dataset, i.e.,
\begin{equation}\label{eq:non_causal_optimization}
    \underset{f}{\arg\max} \; \E_{X\sim P^f}[\E_{\mathcal{D}_O}[Y|X]],
\end{equation}
where the conditional expectation $\E_{\mathcal{D}_O}[Y|X]$ is the average outcome among individuals who observed the text $X$ and can be learned from $\mathcal{D}_O$. A language model optimized under Equation \eqref{eq:non_causal_optimization} will generate texts that are \textit{correlated} with high outcomes. We distinguish this from our true optimization goal: to learn---across all possible texts and outcomes---to \textit{intervene} on the distribution of the generating language model to \textit{cause} the best possible outcomes.

These hypothetical outcomes can be formalized using the potential outcomes framework \citep{neyman1923, rubin1974}: over text space $\mathcal{X}$, for each individual $i$ we posit the existence of a \emph{potential outcome function} 
$Y_i:\mathcal{X} \to \R$, where $Y_i(x)$ encodes their potential real-valued response if given text $x$.\footnote{This notation implicitly rules out the possibility that an individual's responses can be affected by the texts given to others---a common assumption in causal inference \citep{rubin1974}.} We emphasize that most individuals' potential outcomes are not observed, and so $Y_i(x)$ denotes the response individual $i$ would have had \textit{had they seen text} $x$, possibly contrary to reality. This is also commonly known as the \textit{counterfactual}. 

We assume that we sample individuals from a population $\mathcal{G}$ so that the set of potential outcomes is given by $\{Y(x)\; | \; x \in \mathcal{X}\} \sim \mathcal{G}$. We define $g(x) \equiv \E_{Y(\cdot)\sim\mathcal{G}}[Y(x)]$ as the average outcome if \textit{all} individuals in the population were given text $x$. Note that $g(x)$ is different from the correlational measure $\E_{\mathcal{D}_O}[Y|X]$ because for $\E_{\mathcal{D}_O}[Y|X]$, the association between $X$ and $Y$ may be confounded by an external factor.

Formally, then, our goal is to find a language model $f$ that causes high outcomes $Y$ on average across the population of individuals $\mathcal{G}$ and across the texts generated according to the model. We encode the quality of a generative text model $f$ via its value function that measures the expected outcome (or reward):
\begin{equation}
    V(f) \equiv \E_{X \sim P^f}[\E_{Y(\cdot)\sim \mathcal{G}}[Y(X)]] = \E_{X \sim P^f}[g(X)]
\end{equation}

Then the \textit{causal} optimization problem is to find the language model $f$ that maximizes the expected outcome if a random individual were given a random text according to $P^f$:
\begin{equation}
\label{eq:true_optimization}
    \underset{f}{\arg\max}\; V(f) = \underset{f}{\arg\max} \; \E_{X \sim P^f}[\E_{Y(\cdot)\sim \mathcal{G}}[Y(X)]]
\end{equation}

In intuitive terms, this optimization problem asks the following question: which texts would we generate if we knew what every individual's response to every text would be? By optimizing the text with respect to \textit{every possible response}, this construction removes confounding influences on which texts are read or observed, such that the content of the text must be the sole factor that causes the outcome. 

\section{(Doubly Robust) Causal Preference Optimization}
\label{sec:cpo}

Reframing our optimization problem as a causal inference problem allows us to draw on solutions from statistical causal inference---in particular, the use of randomized experiments to identify causal effects and approximate observing all potential outcomes.\footnote{As discussed above, crowdsourced datasets are randomized experiments, since annotators are randomly assigned to read texts.}
We formalize such a randomized experiment and/or crowdsourced annotated dataset as $\mathcal{D}_R = \{(X_1, Y_1),\ldots, (X_n, Y_n)\}$ where texts $X_i$ are drawn i.i.d. from a known randomization distribution $P^R$, and individuals with potential outcome functions $Y_i(\cdot)$ are drawn i.i.d. from the population $\mathcal{G}$.\footnote{The probability of any text $X_i$ is known according to the randomization probability. For instance, if each reader reads one text uniformly assigned from a corpus of size $n$, then $P^R(X_i)=\frac{1}{n}$ for all $X_i$ in the corpus.}
This induces a distribution on the observed responses $Y_i = Y_i(X_i)$ that we denote as $P^R_y$.

Random assignment of individuals to texts removes all confounding outside of the text, since no external factors influence which texts the individual reads.
Formally,  we have that the texts are independent of the full set of potential outcomes, i.e., $\{Y(x) \mid x \in \mathcal{X}\}  \perp\!\!\!\!\perp  X$. We also require a technical assumption that there is \emph{overlap} between the randomization distribution $P^R$ and the distribution $P^f$ generated by the language model we are optimizing: that is, if $P^R(x) = 0$, then $P^f(x) = 0$ as well. This ensures that the randomization distribution is sufficiently informative about the domain we want to optimize over.
In principle, this is directly enforceable as a constraint on the language model. In practice, due to the underlying structure of text data and the fact that we often fine-tune language models to the text domain as a precursor to optimization, the overlap assumption is unlikely to be binding.

In this section, we describe \textit{causal preference optimization} (CPO), which solves an unbiased surrogate objective for the true causal optimization problem using importance weighting. Following this definition, we extend CPO using the principle of double robustness, in which we use outcome modeling to reduce the variance of the CPO objective while retaining strong guarantees on bias.

Derivations and technical results are shown in Appendix \ref{sec:technical_results}.

\subsection{Causal Preference Optimization}
\label{sec:identification}

\textbf{Identification.} The value of the language model 
$V(f)$ is a causal quantity that involves the potential outcomes for all individuals, some of which are unobserved.
However, we can link the value function to the randomization dataset $\mathcal{D}_R$ (i.e., \textit{identify} it from the observed data) by writing in the following way.

\begin{proposition}\label{eq:v_ipw_proposition} The value function $V(f)$ can be identified as
\begin{align*}
    V(f)&=\E_{X \sim P^R}\Bigg[\E_{Y \sim P^R_y}\Bigg[\frac{P^f(X)}{P^R(X)}Y\Bigg]\Bigg] \tag*{(\text{$V_{IPW}$})}
\end{align*}
\end{proposition}

This value function draws on importance weighting  principles from statistical causal inference (also referred to as IPW). Observed outcomes $Y \sim P^R_y$ are weighted by the density ratios between texts drawn from the language model $X \sim P^f$ and texts drawn from the randomization distribution  $X \sim P^R$; this approximates the average outcome under $P^f$, which is not observed.

\textbf{Estimation.} After writing the causal quantity $V(f)$ in terms of observable data, we focus on estimating $V(f)$ in practice. The importance weighting value function $V_{IPW}(f)$ can be estimated directly from the crowdsourced data $\mathcal{D}_R$ as follows (recall that $X_i\sim P^R, Y_i \sim P^R_y$):
\begin{equation*}
    \widehat{V}_{IPW}(f)=\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}Y_i    
\end{equation*}

Note that both $P^f$ and $P^R$ are \textit{known} quantities and do not need to be estimated---$P^f$ because it is obtained directly from the model $f$ we are optimizing, and $P^R$ because we know the randomization mechanism of the texts in $\mathcal{D}_R$.\footnote{In practice, it can still be empirically helpful to use a model-derived estimate of the randomization probabilities $\widehat{P}^R(X)$, similar to how the H\'ajek estimator can have lower variance than the Horvitz-Thompson estimator \citep{Hajek1971, Sarndal2003_model}.} Importantly, this means that $\widehat{V}_{IPW}(f)$ is an unbiased estimator for $V(f)$.

\begin{theorem}
\label{thm:ipw_unbiased}
Let $\mathcal{D}_R$ be a randomized experiment parameterized by $P^R$, such that $P^R$ is known. Then 
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{IPW}(f)]]&=\E_{X \sim P^f}[\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]] \\
    &=V(f)
\end{align*}
\end{theorem}

\subsection{Doubly Robust Causal Preference Optimization}

An importance weighting estimator like the CPO value function is a natural solution for estimating causal quantities. However, CPO optimizes over only randomized experimental data such as crowdsourced data, and so it can be further improved by the addition of an outcome modeling term that can leverage (often larger) non-randomized data to learn to predict outcomes on unlabeled texts. \textit{Doubly robust causal preference optimization} (DR-CPO) combines IPW (over randomized data) and outcome modeling (over non-randomized data) to yield a doubly robust estimator that reduces the variance of CPO and improves its generality while still remaining unbiased for the true causal optimization problem.

\textbf{Identification.} The doubly robust formulation gives us another way of linking the value function to the randomization dataset $\mathcal{D}_R$. 

\begin{proposition}\label{eq:v_dr_proposition} The value function $V(f)$ can also be identified as
\begin{align*}
    V(f)=&\;\E_{X \sim P^R}\Bigg[\E_{Y \sim P^R_y}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] + \\
    &\;\E_{X\sim P^f}[g(X)] \tag*{(\text{$V_{DR}$})}
\end{align*}
\end{proposition}

This construction combines IPW and an outcome model $g$ to provide robustness against misspecification or mis-estimation within either term---akin to doubly robust estimators that serve the same purpose when estimating causal effects.

\textbf{Estimation}. The doubly robust value function $V_{DR}$ can be estimated from the crowdsourced data $\mathcal{D}_R$ and a learned outcome model $\widehat{g}(X)$. 

First, however, we consider the outcome modeling term $\E_{X\sim P^f}[g(X)]$. Even if we were to have access to the true outcome model $g$, it is difficult to optimize $g$ with respect to texts $X \sim P^f$,
% the original outcome modeling value function $V_{out}(f)=E_{X \sim P^f}[g(X)]$, 
as this requires that texts be drawn from the language model $f$ \textit{as $f$ is being updated}. To remedy this, we re-write $\E_{X\sim P^f}[g(X)]$ in terms of a fixed language model $f^0$:\footnote{We show this equivalence in Appendix \ref{sec:v_out_rewritten}.}
\begin{equation*}
    \E_{X\sim P^f}[g(X)]=\E_{X \sim P^{f^0}}\Bigg[\frac{P^f(X)}{P^{f^0}(X)}g(X)\Bigg] \tag*{(\text{$V_{out}$})}
\end{equation*}
where $P^{f^0}$ denotes the distribution over texts from language model $f^0$.

We can create a Monte Carlo estimate of this by drawing texts $\widetilde{X}_1, \ldots, \widetilde{X}_m \sim P^{f^0}$ and computing
\begin{equation*}
    \widehat{V}_{out}(f)=\frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)
\end{equation*}
where $\widehat{g}(x)$ is a model trained to predict $Y$ from $X$ and $f^0$ is any generative language model. 

Finally, the doubly robust value function $V_{DR}$ can be estimated as a combination of these two terms.
\begin{align*}
    \widehat{V}_{DR}(f)=&\;\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}(Y_i-\widehat{g}(X_i)) + \\
    &\;\frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)
\end{align*}


Formally, it can be shown that $\widehat{V}_{DR}(f)$ is an unbiased estimator for $V(f)$ 
under two possible conditions, making it an effective proxy for the true causal optimization problem.

\begin{theorem}
\label{thm:dr_unbiased}
Let $\mathcal{D}_R$ be a randomized experiment parameterized by $P^R$, which may be estimated from a separate sample by $\widehat{P}^R$. Let $g(X)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]$, which may be estimated from a separate sample by $\widehat{g}(X)$. Then 
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]&=\E_{X \sim P^f}[\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]] \\
    &=V(f)
\end{align*}
if \textit{either}
\begin{enumerate}
    \item $\widehat{P}^R(X)=P^R(X)$, or
    \item $\widehat{g}(X)=g(X)$
\end{enumerate}
\end{theorem}

Importantly, because $P^R$ \textit{is known} in randomized experiments, including the crowdsourced data setting, it does not need to be estimated, which means that condition (1) is always fulfilled for DR-CPO. As a result, $\widehat{V}_{DR}(f)$ is guaranteed to be unbiased for $V_{DR}$ \textit{even if the outcome model is incorrect}. In other words, DR-CPO is robust to misspecification of $\widehat{g}$, as the IPW term in its value function corrects for any bias from the predicted outcomes. This means that rather than requiring a randomized dataset for training, $\widehat{g}$ can reasonably be learned over \textit{any} direct outcome dataset, including those where the causal relationship between the text and the outcome may be confounded.

Therefore, in addition to learning from the experimental data $X \sim P^R, Y \sim P^R_y$, a model optimized with DR-CPO will be able to further leverage the generative language model $f^0$ to learn from unlimited unlabeled text $\widetilde{X} \sim P^{f^0}$ with predicted outcomes $\widehat{g}(\widetilde{X})$.
This can reduce the variance of the value function estimator.
\begin{proposition}
  \label{prop:var_diff}
    If $\widehat{g}$ is fit on a separate sample, then conditional on $\widehat{g}$, $n \left(\Var\left(\widehat{V}_{IPW}(f)\right) - \Var\left(\widehat{V}_{DR}(f)\right)\right)$ is equal to 
   \begin{align*}
       \Var\left(\frac{P^f(X)}{P^R(X)}g(X)\right) - \Var\left(\frac{P^f(X)}{P^R(X)}\left(g(X)-\widehat{g}(X)\right)\right)\\
        \quad- \frac{n}{m} \Var\left(\frac{P^f(\widetilde{X})}{P^{f^0}(\widetilde{X})}\widehat{g}(\widetilde{X})\right)
   \end{align*}
     
\end{proposition}

Proposition~\ref{prop:var_diff} shows the difference in the variances of the IPW and DR value function estimators, scaled by the sample size $n$ of the crowdsourced data to highlight asymptotic differences. This difference indicates that $\Var(\widehat{V}_{DR}(f))<\Var(\widehat{V}_{IPW}(f))$ subject to two conditions: (i) the number of Monte Carlo samples drawn from $f^0$ is much larger than the sample size of the crowdsourced data, i.e., $m \gg n$; and (ii) $\widehat{g}(x)$ has \textit{some} additional predictive power compared to a constant model.

Condition (i) limits the component of the variance difference that arises due to Monte Carlo error from taking $m$ samples from the reference language model $f^0$. Then the main comparison is the difference between the variance of the expected outcome $g(x)$, and the variance of the \emph{prediction} error for the model. Consequently, under condition (ii), we can expect the variance of $\widehat{V}_{DR}$ to be lower than the variance of $\widehat{V}_{IPW}$  (e.g., if $(\widehat{g}(x) - g(x))^2 < g(x)^2$).

We note that in a different data setting where the true $P^R$ is unknown (e.g., no crowdsourced or randomized data is available), DR-CPO can still solve the true causal problem if all confounders are known in the dataset used to train $\widehat{g}$. Controlling for all confounders allows the outcome model to learn the correct causal relationship between the text and the outcome. Since such an outcome model is correct, it fulfills the condition $\widehat{g}(X)=g(X)$, and it follows from Theorem \ref{thm:dr_unbiased} that the DR-CPO objective $\widehat{V}_{DR}$ is an unbiased estimator for the true value function $V(f)$ even if $P^R$ is not randomized.

In practice, the full set of confounders is rarely known (particularly for text data). However, a good $\widehat{g}$ can still likely be learned if a large amount of clean data exists to train the outcome model. In this case, a well-estimated outcome model $\widehat{g}$ can still help bias-correct mis-estimation of $\widehat{P}^R(X)$.

\subsection{Relationship to Existing Approaches}
\label{sec:rlhf_dpo_equivalence}

Elements of (DR-)CPO are reflected in two of the most prominent existing language model optimization approaches: RLHF and DPO. 

First, notice that the outcome modeling term $V_{out}$ is itself a way of identifying $V(f)$ from the observed data.
Outcome modeling relies entirely on the predictive model $\widehat{g}$ and unlabeled texts $\widetilde{X} \sim P^{f^0}$ and therefore does not require an experimental dataset $\mathcal{D}_R$. However, if $\widehat{g}(X)$ is not a good outcome model, then $\widehat{V}_{out}(f)$ will be biased with respect to $V(f)$. Reward models trained on confounded data may be misspecified, since they can only capture the relationship between the text and the response---and not the confounders that have additionally influenced the response. While these issues are remedied if the confounding is also fully modeled, confounders are extremely difficult to measure fully in text data.

Optimization of $V_{out}(f)$ is closely related to RLHF in the direct outcome setting via proximal policy optimization (PPO) and REINFORCE \citep{stiennon2020summarize, williams1992simple, ahmadian2024basics}. In particular, the PPO loss term can be seen as a version of outcome modeling in which the reward model is trained on paired completion data. Under these conditions, the RLHF reward model is analogous to the outcome model $g$, and the PPO loss is mathematically equivalent to $V_{out}(f)$ (details in Appendix \ref{sec:rlhf_ties}). Because in practice PPO often optimizes both the policy network and the value function network, $V_{out}(f)$---which optimizes only the policy network---can also be seen as similar to a REINFORCE objective in which learning occurs over Monte Carlo samples drawn from a fixed language model $f^0$.

Likewise, DPO shares similarities with CPO. Rather than relying on reward or outcome modeling, both DPO and CPO fine-tune a language model for human feedback by directly using a preference dataset, which greatly reduces their computational overhead. The DPO objective is similar to $V_{IPW}$ in that it directly increases the likelihood of texts corresponding to desired outcomes through importance weighting. However, because it uses paired data, DPO increases the density ratio between preferred and non-preferred examples, while CPO directly increases the probability of texts with desired outcomes and decreases the probability of texts with non-desired outcomes. Given a paired data setting, the DPO objective could possibly be recovered from $V_{IPW}$; we leave this derivation for future work.

\section{Experiments}
\label{sec:experiments}

We conduct evaluations to empirically assess the effectiveness of CPO and DR-CPO in optimizing language models for human preferences on direct outcome data, and we examine the doubly robust properties of DR-CPO under confounding.

\subsection{Datasets}

To evaluate optimization on direct outcome data, we consider three crowdsourced or randomized experimental datasets in which human annotators provided numerical responses to texts.

\textbf{Hate Speech} \textit{(binary outcome).} The Hate Speech dataset \citep{qian-etal-2019-benchmark} consists of comments from the social media sites Reddit and Gab. Comments span a wide variety of subjects, including daily life, personal relationships, politics, and finance. Outcomes are collected via crowdsourcing and indicate whether the annotator perceives the comment to be hate speech. The Reddit comments are chosen from subreddits where hate speech is more common, and Gab is a platform where users sometimes migrate after being blocked from other social media sites. The optimization goal for this dataset is to generate texts that are \textit{less} hateful on average.

We include this dataset to evaluate optimization under conditions where (i) texts are topically diverse and (ii) outcomes are relatively simple to judge.

\textbf{Hong Kong} \textit{(scalar outcome).} The Hong Kong dataset \citep{fong2021causal} consists of texts concerning the Hong Kong democracy protests of 2019--2020. These texts are loosely based on speeches made about Hong Kong during U.S. Congressional sessions at the time of the protests. Outcomes are collected via a randomized experiment and indicate on a scale of 0--100 to what extent the respondent thinks that the U.S. should support Hong Kong during this time, after reading the text. The texts are programmatically constructed: for each text, 2 or 3 text attributes are randomly chosen out of 7 (e.g., \textit{commitment}, \textit{bravery}, \textit{mistreatment}). Short passages corresponding to each attribute are then randomly chosen from a pool of about 20 to construct the text. The optimization goal for this dataset is to generate texts with \textit{high} outcomes on average.

We include this dataset to evaluate optimization under more challenging conditions where (i) texts all concern a single topic and (ii) outcomes are complex, noisy, and difficult for a model to predict.

\textbf{Confounded} \textit{(scalar outcome).} The Confounded dataset is a version of the Hong Kong dataset where we have induced confounding. Concretely, it is identical to the Hong Kong dataset with the exception of its outcome, which is the negation of the original outcome (e.g., an outcome of 60 in the Hong Kong dataset becomes an outcome of -60 in the Confounded dataset). This inverts the relationship between the text and the outcome: texts originally associated with high outcomes in the Hong Kong dataset are now associated with low outcomes, and vice versa. Intuitively, negating the outcome is akin to introducing a strong confounding or corrupting factor where readers who tend to read texts that \textit{should} have high outcomes also tend to have very \textit{negative} responses to those texts (e.g., ``haters'' brigading a popular celebrity's posts).

We include this dataset with the realistic expectation that text data is often confounded. While CPO by construction uses only randomized data, confounded data can pose a threat to approaches that use non-randomized data to train their outcome models. Therefore, it is necessary to evaluate how approaches like DR-CPO and OO-RLHF fare when their outcome models are trained on confounded data.

\subsection{Implementation}
\label{sec:implementation}

\textbf{Evaluation.} To evaluate how well optimization for human preferences has occurred, we consider two separate metrics. As our primary evaluation, we use a text preference framework in which a reader is asked to choose the better (with respect to the outcome) of a pair of texts generated by two different methods. Using GPT-4 as a proxy for human annotators, we compare pairs of \textit{(method, baseline)} completions for the same prompt; across all pairs, we compute method \textit{win rates} and compute 95\% confidence intervals. Since the datasets used for these experiments contain one text per sample rather than a prompt and a completion, we create prompts on the evaluation set by truncating each text to a random length.

The full input provided to GPT-4 for each dataset can be found in Appendix \ref{sec:gpt_win_rate_questions}. We validate the use of GPT-4 as an annotator with a human study, which we describe in further detail in Section \ref{sec:gpt4_annotation}.

To supplement our text preference evaluation, we additionally assess the effectiveness of each method in maximizing expected reward (i.e., the value function). In particular, we focus on the reward under the DR-CPO objective, which is the most efficient estimate of the reward.

\textbf{Methods.} We evaluate language models optimized using \textbf{CPO} and \textbf{DR-CPO}. As our baselines, we consider language models that have been fine-tuned on texts from each of the task datasets (\textbf{FT}), as well as models optimized using the outcome modeling value function $V_{out}(f)$. Since---as we discuss in Section \ref{sec:rlhf_dpo_equivalence}---the $V_{out}(f)$ objective is mathematically equivalent to the RLHF objective, we refer to this baseline as \textbf{OO-RLHF} (offline outcome RLHF).

We use Llama 2 7B \citep{touvron2023llama} as our base language model and fine-tune with low-rank adaptation (LoRA) \citep{hu2022lora}. All optimizations are applied after fine-tuning on text from the task dataset. Because only a tiny fraction of parameters are updated under LoRA, we note that we expect relatively small changes in performance under optimization.

Our outcome models for the Hate Speech dataset are also based on Llama 2 7B. For the Hong Kong outcome model, LLMs perform poorly on the dataset's complex outcomes, particularly given the dataset's smaller size. Instead, we use a standard linear regression model over features extracted using the Empath lexicon \citep{fast2016empath}.

Implementation details can be found in Appendix \ref{sec:hyperparameters}.

\textbf{Choice of $f^0$.} When optimizing with DR-CPO or OO-RLHF, any generative language model may be used as $f^0$, the fixed language model from which texts are drawn as input to the outcome model. One key consideration is whether $f^0$ should be a pre-trained model or whether it should be a model that has been fine-tuned on text relevant to the task---for instance, the randomized experiment dataset $\mathcal{D}_R$.

In practice, we choose a pre-trained model as $f^0$ to leverage the diversity of texts such models tend to generate. If $\widehat{g}(X)$ is a good outcome model, then predicted outcomes on these texts will still be close to the true outcomes, and DR-CPO and OO-RLHF will benefit from outcome modeling. 

\textbf{Choice of $\widehat{P}^R$.} In Section \ref{sec:identification}, we mention that although the distribution of texts $P^R(X)$ under the randomized experiment is known, it can be helpful empirically to instead compute an estimated $\widehat{P}^R(X)$. This is generally due to the fact that the \textit{sample} probability of each text $X$ may not actually be equal to its theoretical probability merely by chance \citep{Hajek1971, Sarndal2003_model}.

We find this to be the case in our experiments, and so we use $\widehat{P}^R(X)$ estimated from a Llama 2 7B model fine-tuned on $\mathcal{D}_R$ in our CPO and DR-CPO implementations.

\section{Results and Discussion}
\label{sec:results}

\subsection{GPT-4 Annotation Validity}
\label{sec:gpt4_annotation}

\begin{table}[!ht]
    \centering
    \begin{tabular}{ccc}
    \toprule
    \toprule
        Annotator 1 & Annotator 2 & Fleiss' $\kappa$  \\
    \midrule
        Human & Human & 0.170 \\
        Human & GPT-4 & 0.219 \\
        Human majority & GPT-4 & 0.192 \\
    \bottomrule
    \bottomrule
    \end{tabular}
    \caption{Agreement rates of human annotators and GPT-4 when asked to choose preferred texts with respect to target outcomes. We examine inter-human agreement, human-GPT-4 agreement, and agreement between a majority vote of human annotators and GPT-4.}
    \label{tab:human_gpt4_agreement}
\end{table}

Following the precedent set by \citet{rafailov2023direct}, we conduct a human study to assess the validity of GPT-4 as an annotator when choosing between pairs of texts for a preferred outcome. Across 200 randomly sampled examples from the Hong Kong dataset, we show human annotators \textit{(method, baseline)} completion pairs and ask them to choose the better of the two with respect to the outcome. We compute agreement between each pair of human annotators, as well as agreement between each human annotator and GPT-4. Agreement is measured through Fleiss' $\kappa$ \citep{fleiss1971measuring}, a common metric for agreement among multiple raters.

We use the online research platform Prolific\footnote{\url{https://www.prolific.co/}} to conduct our human study. To prevent annotator fatigue, examples are annotated in batches of 20. We recruit a total of 30 annotators for an average of 3 annotators per example and a total of 600 annotations.

Across three comparisons---human-human, human-GPT-4, and human majority-GPT-4---we find that GPT-4 agrees with human annotators at a level similar to or better than the level at which human annotators agree with each other (Table \ref{tab:human_gpt4_agreement}).

We conclude that GPT-4 is a reasonable surrogate for human annotators.

\begin{table*}[!ht]
  \centering
    \begin{tabular}{llrrr}
    \toprule
    \toprule
    Dataset & Method & DR-CPO reward & CPO reward & OO-RLHF reward \\
    \midrule
    Hate Speech & FT    & 0.219 & -0.024 & 0.242 \\
    Hate Speech & OO-RLHF & 0.244 & -0.006 & 0.249 \\
    Hate Speech & CPO   & 0.230 & -0.012 & 0.242 \\
    Hate Speech & DR-CPO & \textbf{0.245} & \textbf{-0.005} & \textbf{0.250} \\
    \midrule
    Hong Kong & FT    & 24.505 & -0.256 & 24.761 \\
    Hong Kong & OO-RLHF & 22.256 & -2.753 & \textbf{25.009} \\
    Hong Kong & CPO   & \textbf{25.969} & \textbf{1.254} & 24.715 \\
    Hong Kong & DR-CPO & 25.162 & 0.269 & 24.893 \\
    \bottomrule
    \bottomrule
    \end{tabular}
  \caption{(Stabilized) expected reward under the DR-CPO, CPO, and OO-RLHF objectives. Expected reward under the DR-CPO objective is the most efficient estimate of the reward and therefore the most reliable metric. On the Hate Speech dataset, DR-CPO achieves the highest expected reward, while on the Hong Kong dataset, CPO achieves the highest reward.}
  \label{tab:expected_reward}
\end{table*}

\begin{figure*}[!ht]
    \centering
    \includegraphics[width=0.75\textwidth]{images/win_rates.png}
    \caption{CPO and DR-CPO win rates against OO-RLHF, FT, and one another. A win rate exceeding 0.5 indicates that the named method outperforms the competing method with respect to the target outcome. The error bars correspond to 95\% confidence intervals, and asterisks (*) mean that the win rate difference between the two methods is statistically significant at the 95\% confidence level.}
    \label{fig:win_rates}
\end{figure*}

\begin{figure}[!ht]
    \centering
    \includegraphics[width=\columnwidth]{images/confounding.png}
    \caption{Impact of outcome model confounding (measured in win rate difference divided by 2) on OO-RLHF, CPO, and DR-CPO. A negative impact indicates that confounding hurts the performance of the method. The error bars correspond to 95\% confidence intervals, and asterisks (*) mean that the win rate difference is statistically significant at the 95\% confidence level.}
    \label{fig:confounding_win_rates}
\end{figure}

\subsection{Outcome Optimization}

Across 2000 test prompts for each dataset, we evaluate CPO and DR-CPO win rates against baselines OO-RLHF and FT (and against one another). Under this evaluation framework, a win rate greater than 0.5 indicates that the method outperforms its competitor. We report our results in Figure \ref{fig:win_rates}. Additional results are found in Appendix \ref{sec:additional_results}.

With the same 2000 prompts, we further evaluate expected reward\footnote{Rewards are stabilized to prevent numerical underflow and overflow.} under each optimization method. We reiterate that our optimization goal is to maximize the expected outcome or reward of texts generated by the language model. We report our results in Table \ref{tab:expected_reward}.

On the Hate Speech dataset, we observe that DR-CPO outperforms both the OO-RLHF and FT baselines with respect to win rate. Against both baselines, DR-CPO's win rate is statistically significant at the 95\% confidence level, with the lower bound of its 95\% confidence intervals falling above 0.5. DR-CPO also achieves the highest expected reward across all objectives. These results indicate that using DR-CPO, language models successfully learn human preferences for less hateful text from direct outcomes---and do so better than competing baselines.

DR-CPO additionally appears to outperform CPO on the Hate Speech dataset, though its win rate is just shy of statistical significance. This result supports our theoretical result (Proposition \ref{prop:var_diff}) that the addition of an outcome model to the CPO objective can improve optimization by reducing variance. Moreover, the success of DR-CPO over OO-RLHF provides further evidence for the benefits of the doubly robust formulation, wherein a good $P^R$ can help to correct bias from an outcome modeling approach. 

On the Hong Kong dataset, we likewise observe that CPO outperforms both the OO-RLHF and FT baselines with respect to both win rate and expected reward under the DR-CPO objective. In this setting, CPO achieves statistically significant win rates against both baselines, with the lower bound of its 95\% confidence intervals falling above 0.5. 

Here, in contrast to the Hate Speech dataset, CPO also outperforms DR-CPO, which suggests that CPO can be very strong under conditions where $P^R$ is well controlled or can be estimated very well---as is the case for the Hong Kong dataset, where texts are not only randomly assigned to annotators but programmatically generated from random attributes. An additional factor may be the difficulty of modeling the complex continuous outcome of the Hong Kong dataset from its artificially constructed, homogeneous texts. Under such conditions, an outcome model may introduce noise rather than help with bias correction (see Proposition \ref{prop:var_diff} for a discussion of the objectives' precision), and so CPO may enjoy empirical advantages over DR-CPO. The challenge of outcome modeling is additionally reflected in the underperformance of OO-RLHF relative to even FT, providing evidence for how outcome modeling alone can be insufficient for language model optimization.

\subsection{Double Robustness Under Confounding}

In Figure \ref{fig:confounding_win_rates}, we examine the impact of outcome model confounding on OO-RLHF, CPO, and DR-CPO. For our confounded condition, we train both the DR-CPO and OO-RLHF outcome models on the Confounded dataset. As DR-CPO also uses randomized data in its IPW term, we optimize this portion of the objective on the (unconfounded) Hong Kong dataset. For each method, we then compute win rate between a model that has been optimized under these confounding conditions and a model that has been optimized on unconfounded data using the same method.

CPO is constructed to use only randomized data and does not have an outcome model, so it is exclusively optimized on the (unconfounded) Hong Kong dataset. However, we compare two separately optimized models to account for randomness in the optimization process.

Win rates are computed across 600 pairs for each method. Additional results are found in Appendix \ref{sec:additional_results}.

In these experiments, we find that DR-CPO remains robust under confounding in the outcome model, while OO-RLHF degrades significantly. OO-RLHF experiences an impact to win rate that is significantly negative, while CPO experiences no impact (as expected, because it does not use an outcome model) and DR-CPO experiences a positive impact. Exploring this last result is an avenue for future work.

These results highlight one of the core strengths of DR-CPO and one of the core shortcomings of outcome modeling approaches. As long as a small crowdsourced dataset is available to act as $P^R$, DR-CPO can use large amounts of confounded data to learn its outcome model and still retain its unbiasedness guarantees---all while reducing variance relative to CPO. An exclusively outcome model-based approach like OO-RLHF, on the other hand, becomes biased under these conditions. Even under aggressive confounding, with a worst-case outcome model that has been trained on completely negated data, DR-CPO is not compromised, while OO-RLHF is.

We reiterate that because they are optimized (at least partially) on randomized experimental data, CPO and DR-CPO are \textit{causal} approaches. Taken together, our results emphasize the importance of an optimization framework that maintains the causal relationship between text and outcome.

\section{Conclusion}

In this paper, we explore language model optimization for human preferences from direct outcome datasets, in which each sample consists of a text and the reader's numerical response. We first posit that language model optimization should be viewed as a causal problem to ensure that the model correctly learns the relationship between the text and the outcome, and we define conditions under which this causal relationship can be guaranteed. Following this, we introduce CPO, a method that solves an unbiased surrogate objective for the causal language optimization problem---and improve upon it with the doubly robust DR-CPO, which reduces the variance of the CPO objective while retaining provably strong guarantees on bias. Finally, we empirically demonstrate the effectiveness of (DR-)CPO in optimizing state-of-the-art LLMs for human preferences on direct outcome data, and we validate the robustness of DR-CPO under difficult confounding conditions. 

To our knowledge, our work is the first to approach LLM optimization as a causal inference problem---from which an importance-weighted solution naturally follows---as well as the first to propose a doubly robust methodology for LLM optimization. These theoretical contributions and results open the door to a wide range of data, human preferences, and optimization goals that language models can learn using (DR-)CPO.

Several natural lines of future research follow from this work. For instance, (DR-)CPO may benefit empirically from entropy regularization techniques that are common in policy optimization. Additionally, the performance gains in our evaluations may have been limited by the small proportion of parameters updated under LoRA, and so it would be useful to explore whether experiments with commercial-grade computational resources have the potential to yield much larger improvements. Finally, future work may wish to extend DR-CPO to the paired completion data setting, as the bias guarantees and variance reduction of a doubly robust approach can also be useful for paired data.

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
                         
This material is based upon work partially supported by the National Institutes of Health (awards R01MH125740, R01MH132225, and R21MH130767). Victoria Lin is supported by a Meta Research PhD Fellowship. Any opinions, findings, conclusions, or
recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the sponsors, and no official endorsement should be inferred.
\end{acknowledgements}

% References
\bibliography{ref}

\newpage

\onecolumn

\title{Optimizing Language Models for Human Preferences \\ is a Causal Inference Problem\\(Supplementary Material)}
\maketitle

\appendix


\section{Technical results}
\label{sec:technical_results}
\subsection{Identifying $V(f)$}

\begin{proof}[Proof of Proposition~\ref{eq:v_ipw_proposition}]

We can show that $V(f)=\E_{Y(\cdot)\sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]]=\E_{Y \sim P^R_y}[\E_{X \sim P^R}[\frac{P^f(X)}{P^R(X)}Y]]$.
\begin{align*}
    V(f)&=\E_{Y(\cdot)\sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=\E_{Y(\cdot)\sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)Y(x)\Bigg] \\
    &=\E_{Y(\cdot)\sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^R(x)\frac{P^f(x)}{P^R(x)}Y(x)\Bigg] \\
    &=\E_{Y(\cdot)\sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}Y(X)\Bigg]\Bigg] \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}Y\Bigg]\Bigg]
\end{align*}

\end{proof}


\begin{proof}[Proof of Proposition~\ref{eq:v_dr_proposition}]

We can show that $V(f)=\E_{Y(\cdot)\sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]]=\E_{Y \sim P^R_y}[\E_{X \sim P^R}[\frac{P^f(X)}{P^R(X)}(Y-g(X))]] + \E_{X\sim P^f}[g(X)]$.
\begin{align*}
    V(f)&=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}Y\Bigg]\Bigg] \tag*{(Proposition \ref{eq:v_ipw_proposition})} \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X)+g(X))\Bigg]\Bigg] \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] + \E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}g(X)\Bigg]\Bigg] \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] + \E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}g(X)\Bigg] \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] + \sum_{x \in \mathcal{X}}P^R(x)\frac{P^f(x)}{P^R(x)}g(x) \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] + \sum_{x \in \mathcal{X}}P^f(x)g(x) \\
    &=\E_{Y\sim P^R_y}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X)}{P^R(X)}(Y-g(X))\Bigg]\Bigg] + \E_{X \sim P^f}[g(X)]
\end{align*}
    
\end{proof}

\subsection{Unbiasedness proofs and variance difference}
\label{sec:unbiased}
\begin{proof}[Proof of Theorem~\ref{thm:ipw_unbiased}]
    

We can show that $\E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{IPW}(f)]]=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]]=V(f)$ when $P^R$ is known.

\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^R}[\widehat{V}_{IPW}(f)]]
    &=\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}Y_i\Bigg]\Bigg] \\
    &=\frac{1}{n}\sum_{i=1}^n\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{P^R(X_i)}Y_i\Bigg]\Bigg] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}Y(x)\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}Y(x)\underbrace{\E_{X \sim P^R}[\mathbbm{1}\{X=x\}]}_{P^R(x)}\Bigg] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)Y(x)\Bigg] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=V(f)
\end{align*}

\end{proof}

\begin{proof}[Proof of Theorem~\ref{thm:dr_unbiased}]
    

We can show that $\E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]]=V(f)$ under one of two conditions: (1) $\widehat{P}^R(X)=P^R(X)$ (i.e., $P^R$ is known) or (2) $\widehat{g}(X)=g(X)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]$ (i.e., $g(X)$ is known).

First, we rewrite $\E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]$:
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_{X,\widetilde{X}}[\widehat{V}_{DR}(f)]] =& \; \E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X,\widetilde{X}}\Bigg[\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i)) + \frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg] \\
    =& \;\E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{X\sim P^R}\Bigg[\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg] \\
    &+ \E_{Y(\cdot)\sim\mathcal{G}}\Bigg[\E_{\widetilde{X}\sim P^{f^0}}\Bigg[\frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg] \\
    =& \; \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg] \\ 
    &+ \frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg]
\end{align*}

Case (1) $\widehat{P}^R(X)=P^R(X)$:


Rewriting the first term,
\begin{align*}
    \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg]
    &=\frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{P^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}(Y(x)-\widehat{g}(x))\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^R(x)}(Y(x)-\widehat{g}(x))\underbrace{\E_{X \sim P^R}[\mathbbm{1}\{X=x\}]}_{P^R(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)(Y(x)-\widehat{g}(x))\Bigg]\\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)-\widehat{g}(X)]] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]]-\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]]
\end{align*}

Rewriting the second term,
\begin{align*}
    \frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg]
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}\widehat{g}(x)\mathbbm{1}\{\widetilde{X}=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}\widehat{g}(x)\underbrace{\E_{\widetilde{X} \sim P^{f^0}}[\mathbbm{1}\{\widetilde{X}=x\}]}_{P^{f^0}(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)\widehat{g}(x)\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]]
\end{align*}


Then we have
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]]-\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]]+\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[\widehat{g}(X)]] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=V(f)
\end{align*}

Case (2) $\widehat{g}(X)=g(X)=\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]$:

Rewriting the first term,
\begin{align*}
    \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-\widehat{g}(X_i))\Bigg]\Bigg]
    &= \frac{1}{n}\sum_{i=1}^n \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\frac{P^f(X_i)}{\widehat{P}^R(X_i)}(Y_i-g(X_i))\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{X \sim P^R}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\mathbbm{1}\{X=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\underbrace{\E_{X \sim P^R}[\mathbbm{1}\{X=x\}]}_{P^R(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}(Y(x)-g(x))\Bigg] \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}\E_{Y(\cdot) \sim \mathcal{G}}[Y(x)-g(x)] \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}(\E_{Y(\cdot) \sim \mathcal{G}}[Y(x)]-g(x)) \\
    &=\sum_{x \in \mathcal{X}}\frac{P^f(x)P^R(x)}{\widehat{P}^R(x)}\cdot 0 \\
    &=0
\end{align*}

Rewriting the second term,
\begin{align*}
    \frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\Bigg]\Bigg]
    &=\frac{1}{m}\sum_{j=1}^m \E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}g(\widetilde{X}_j)\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}g(x)\mathbbm{1}\{\widetilde{X}=x\}\Bigg]\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}\frac{P^f(x)}{P^{f^0}(x)}g(x)\underbrace{\E_{\widetilde{X} \sim P^{f^0}}[\mathbbm{1}\{\widetilde{X}=x\}]}_{P^{f^0}(x)}\Bigg] \\
    &=\E_{Y(\cdot) \sim \mathcal{G}}\Bigg[\sum_{x \in \mathcal{X}}P^f(x)g(x)\Bigg] \\
    &=\sum_{x \in \mathcal{X}}P^f(x)g(x) \\
    &=\E_{X \sim P^f}[g(X)] \\
    &=\E_{X \sim P^f}[\E_{Y(\cdot)\sim\mathcal{G}}[Y(X)]] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]]
\end{align*}

Then we have
\begin{align*}
    \E_{Y(\cdot)\sim\mathcal{G}}[\E_X[\widehat{V}_{DR}(f)]]
    &=0+\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=\E_{Y(\cdot)\sim\mathcal{G}}[\E_{X \sim P^f}[Y(X)]] \\
    &=V(f)
\end{align*}

\end{proof}

\begin{proof}[Proof of Proposition~\ref{prop:var_diff}]
      
      First, we compute the variance of $\widehat{V}_{IPW}(f)$, where $X_i \sim P^R$ and $Y_i \sim P^R_y$ i.i.d.
 \begin{align*}
     \Var\left(\widehat{V}_{IPW}(f)\right) & = \Var\left(\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}Y_i  \right)\\
     & = \frac{1}{n^2}\sum_{i=1}^n \Var\left(\frac{P^f(X_i)}{P^R(X_i)}Y_i \right)\\
     & = \frac{1}{n}\Var\left(\frac{P^f(X)}{P^R(X)}Y \right)\\
     & = \frac{1}{n}\E\left[\Var\left(\frac{P^f(X)}{P^R(X)}Y \mid X \right)\right] + \frac{1}{n}\Var\left(\E\left[\frac{P^f(X)}{P^R(X)}Y \mid X\right] \right)\\
     & = \frac{1}{n}\E\left[\frac{P^f(X)^2}{P^R(X)^2}\Var\left(Y \mid X \right)\right] + \frac{1}{n}\Var\left(\frac{P^f(X)}{P^R(X)}\E\left[Y \mid X\right] \right)\\
     & = \frac{1}{n}\E\left[\frac{P^f(X)^2}{P^R(X)^2}\Var\left(Y \mid X \right)\right] + \frac{1}{n}\Var\left(\frac{P^f(X)}{P^R(X)}g(X)\right)
 \end{align*}
 
 where we have used that under randomization $\E[Y \mid X = x ] = \E_{Y(\cdot) \sim \mathcal{G}}[Y(x)] = g(x)$.
 
 For the variance of $\widehat{V}_{DR}(f)$, we first note that if $\widehat{g}$ is fit on a separate, independent sample, we have that 
 \begin{align*}
      \Var\left(\widehat{V}_{DR}(f)\right) & = \Var\left(\frac{1}{n}\sum_{i=1}^n \frac{P^f(X_i)}{P^R(X_i)}(Y_i-\widehat{g}(X_i)) + \frac{1}{m}\sum_{j=1}^m\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\right)\\
      & = \frac{1}{n^2}\sum_{i=1}^n \Var\left(\frac{P^f(X_i)}{P^R(X_i)}(Y_i-\widehat{g}(X_i))\right) + \frac{1}{m^2}\sum_{j=1}^m \Var\left(\frac{P^f(\widetilde{X}_j)}{P^{f^0}(\widetilde{X}_j)}\widehat{g}(\widetilde{X}_j)\right)\\
      & = \frac{1}{n} \underbrace{\Var\left(\frac{P^f(X)}{P^R(X)}(Y-\widehat{g}(X))\right)}_{(\ast)} + \frac{1}{m} \Var\left(\frac{P^f(\widetilde{X})}{P^{f^0}(\widetilde{X})}\widehat{g}(\widetilde{X})\right)
 \end{align*}
 where $X \sim P^R$, $Y \sim P^R_y$, and $\widetilde{X} \sim P^{f^0}$.
 Now notice that
 \begin{align*}
     (\ast) & = \E\left[\Var\left(\frac{P^f(X)}{P^R(X)}(Y-\widehat{g}(X)) \mid X\right)\right] + \Var\left(\E\left[\frac{P^f(X)}{P^R(X)}(Y-\widehat{g}(X)) \mid X \right]\right)\\
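     & = \E\left[\frac{P^f(X)^2}{P^R(X)^2}\Var\left(Y \mid X\right)\right] +  \Var\left(\frac{P^f(X)}{P^R(X)}\left(g(X)-\widehat{g}(X)\right)\right),
 \end{align*}
 where $\widehat{g}$ is treated as fixed because it is fit on a separate, independent sample. Substituting this expression for $(\ast)$ into the previous display gives the variance of $\widehat{V}_{DR}(f)$.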


\end{proof}

\subsection{Equivalence of $V_{out}(f)$}
\label{sec:v_out_rewritten}

We can show that our rewriting of $V_{out}(f)$ is equivalent to our original definition:
\begin{align*}
    V_{out}(f)&=\E_{\widetilde{X} \sim P^{f^0}}\Bigg[\frac{P^f(\widetilde{X})}{P^{f^0}(\widetilde{X})}g(\widetilde{X})\Bigg] \\
    &=\sum_{x \in \mathcal{X}}P^{f^0}(x) \frac{P^f(x)}{P^{f^0}(x)}g(x) \\
    &=\sum_{x \in \mathcal{X}}P^f(x)g(x) \\
    &=\E_{X \sim P^f}[g(X)]
\end{align*}

\section{Parallels between RLHF and optimization of $V_{out}(f)$}
\label{sec:rlhf_ties}

The loss function under RLHF is typically computed through proximal policy optimization (PPO):
\begin{align*}
    \mathcal{L}(\theta, \phi) = \mathcal{L}^{PPO}_{\text{policy}}(\theta) + c_1\mathcal{L}^{PPO}_{\text{value}}(\phi) - c_2\mathcal{L}^{PPO}_{\text{entropy}}(\theta)
\end{align*}

where $\mathcal{L}^{PPO}_{\text{value}}(\phi)$ and $\mathcal{L}^{PPO}_{\text{entropy}}(\theta)$ are regularization terms and $\mathcal{L}^{PPO}_{\text{policy}}(\theta)$ is the \textit{policy loss}. Letting $p$ denote the prompt, $c$ denote the completion, and $r$ denote the reward model, we consider only the policy loss without any stability tricks like clipping.  $\pi_\theta$ is the probability under the policy being optimized, while $\pi_{\theta_0}$ is the probability under a reference policy (often the starting policy or the policy at the previous step).
\begin{equation*}
    \mathcal{L}^{PPO}_{\text{policy}}(\theta)=
    \E\left[\frac{\pi_\theta(c|p)}{\pi_{\theta_0}(c|p)}\cdot r(p,c)\right]
\end{equation*}

We can see the correspondence between $\pi_\theta(c|p)$ and $P^f(X)$, $\pi_{\theta_0}(c|p)$ and $P^{f^0}(X)$, and $r(p,c)$ and $g(X)$; when the expectation is taken over completions drawn from the reference policy $\pi_{\theta_0}$, substituting these terms renders $\mathcal{L}^{PPO}_{\text{policy}}$ equal to $V_{out}(f)$.

\section{Experiments}

\subsection{Model details and hyperparameters}
\label{sec:hyperparameters}

Our language models were implemented using the HuggingFace \verb|transformers| library (version 4.32.1), with Llama 2 7B weights stored locally. For each language model optimization method, we trained for 3 epochs with a batch size of 4, maximum gradient norm of 1.0, and learning rate of $5\times 10^{-5}$, with all other hyperparameters set to Trainer class defaults. We used the HuggingFace \verb|peft| library for LoRA and set our LoraConfig parameters as follows: rank 8, alpha 8, dropout probability 0.1, with all other parameters remaining as default. Custom loss functions for CPO, DR-CPO, and OO-RLHF were implemented using PyTorch (\verb|torch| version 2.0.1).

For the Hong Kong dataset outcome model, we used \verb|scikit-learn| (version 1.3.0) to implement our linear regression. All hyperparameters were set to their default values. Empath features were extracted using the \verb|empath| library.\footnote{\url{https://github.com/Ejhfast/empath-client}}

\subsection{GPT-4 Win Rate Prompts}
\label{sec:gpt_win_rate_questions}

Here, we include the inputs provided to GPT-4 for each dataset to obtain its preferences between texts generated by different methods. We accessed GPT-4 through the OpenAI API (\verb|gpt-4-1106-preview|). Prompt formats are adapted from those described by \citet{rafailov2023direct}. The order of texts is random for each evaluation. For a \textit{(method, baseline)} comparison, \textit{method} corresponds to text A 50\% of the time and text B 50\% of the time.

\subsubsection{Hate Speech prompt}

\begin{lstlisting}
Hate speech is a direct attack on an individual based on protected characteristics: race, ethnicity, national origin, religious affiliation, sexual orientation, caste, sex, gender, gender identity, and serious disease or disability. 

Under the definition of hate speech, which of the following two texts is less hateful?

FIRST provide a one-sentence comparison of the two texts, explaining which you think is less hateful and why.

SECOND, on a new line, state only "A" or "B" to indicate your choice. Your response should use the format:
Comparison: <one-sentence comparison and explanation>
Preferred: <"A" or "B">

A: <text A>

B: <text B>
\end{lstlisting}

\subsubsection{Hong Kong and Confounded prompt}

\begin{lstlisting}
In March 2019, the Hong Kong government proposed a bill that would have allowed certain criminal suspects to be extradited to mainland China for trial. The 2019-2020 Hong Kong democracy protests were a series of demonstrations that began in response to this bill, stoked by growing concerns about the increasing influence of the Chinese government over Hong Kong. Hong Kong currently has its own legal and political system and is considered semi-autonomous. 

Which of the following two texts better convinces you that the U.S. government should support the Hong Kong democracy protesters?

FIRST provide a one-sentence comparison of the two texts, explaining which you think is more convincing and why.

SECOND, on a new line, state only "A" or "B" to indicate your choice. Your response should use the format:
Comparison: <one-sentence comparison and explanation>
Preferred: <"A" or "B">

A: <text A>

B: <text B>
\end{lstlisting}

% \subsection{Human Agreement Study [TODO]}
% \label{sec:gpt4_annotation_details}



\subsection{Additional Results}
\label{sec:additional_results}

\begin{table*}[!ht]
    \centering
    \begin{tabular}{c|cc|cc}
    \toprule
    \toprule
    & \multicolumn{2}{c|}{Hong Kong} & \multicolumn{2}{c}{Hate Speech} \\
        & CPO win rate & DR-CPO win rate & CPO win rate & DR-CPO win rate \\
    \midrule
        FT & \textbf{\textcolor{green!50!black}{0.528* [0.506, 0.550]}} & 0.477 [0.455, 0.499] & \textcolor{green!50!black}{0.517 [0.495, 0.539]} & \textbf{\textcolor{green!50!black}{0.524* [0.502, 0.546]}} \\
        CPO & - & 0.441 [0.419, 0.463] & - & 0.518 [0.496, 0.540] \\
        OO-RLHF & \textbf{\textcolor{green!50!black}{0.542* [0.520, 0.564]}} & 0.482 [0.460, 0.504] & \textbf{\textcolor{green!50!black}{0.538* [0.516, 0.560]}} & \textbf{\textcolor{green!50!black}{0.560* [0.538, 0.582]}} \\
        DR-CPO & 0.559* [0.537, 0.581] & - & 0.482 [0.460, 0.504] & -\\
    \bottomrule
    \bottomrule
    \end{tabular}
    \caption{CPO and DR-CPO win rates against OO-RLHF, FT, and each other. A win rate exceeding 0.5 indicates that the named method outperforms the competing method with respect to the target outcome. Win rates are computed across 2000 pairs for each method combination.}
    \label{tab:win_rates}
\end{table*}

\begin{table}[!ht]
    \centering
    \begin{tabular}{c|c}
    \toprule
    \toprule
    & Unconfounded win rate over confounded \\
    \midrule
    OO-RLHF & \textbf{\textcolor{red!50!black}{0.546* [0.506, 0.586]}} \\
    DR-CPO & \textbf{\textcolor{green!50!black}{0.456* [0.416, 0.496]}} \\
    CPO & 0.505 [0.465, 0.545] \\
    \bottomrule
    \bottomrule
    \end{tabular}
    \caption{Win rates with outcome models trained on unconfounded data (Hong Kong) vs. confounded data (Confounded). A win rate exceeding 0.5 indicates that the method+unconfounded outcome model outperforms the method+confounded outcome model with respect to the target outcome---in other words, that confounding hurts the method. Win rates are computed across 600 pairs for each method combination.}
    \label{tab:confounding_win_rates}
\end{table}

We include the full set of CPO and DR-CPO win rates against OO-RLHF and FT and against each other (Table~\ref{tab:win_rates}). We also include raw win rates from the confounding experiments, specifically win rates of unconfounded methods over confounded methods (Table~\ref{tab:confounding_win_rates}). We briefly discuss comparisons that did not appear in the main body of the paper.

On the Hate Speech dataset, we observe that CPO---like DR-CPO---also outperforms both the OO-RLHF and FT baselines. Against OO-RLHF, CPO's win rate is statistically significant at the 95\% confidence level, while its win rate against FT falls slightly short of statistical significance.

On the Hong Kong dataset, we find that DR-CPO performs comparably to OO-RLHF but falls short against the other methods. We attribute this to possible difficulty in learning the outcome model itself; this is further evidenced by the strong performance of CPO, which does not use an outcome model. As we mention in the main results, learning a strong outcome model on the Hong Kong dataset may be challenging, as its texts read somewhat artificially due to their programmatic construction.

\end{document}
