\documentclass[accepted]{uai2025} % for initial submission
\usepackage{amsthm, amssymb}
%\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\usepackage{rotating} 
\usepackage{xcolor}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools,bbm,amsmath} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{cleveref}
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
% \xhdr{Title}: run-in paragraph header -- a small vertical gap, no paragraph
% indent, then the title in bold followed by a period.
\newcommand{\xhdr}[1]{\vspace{1.7mm}\noindent\textbf{#1.}}

\usepackage{subcaption}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b}: typesets b, then sep (default "-"), then a (template example).
\newcommand{\swap}[3][-]{#3#1#2}
% Calligraphic shorthand: \mc{S} -> \mathcal{S}.
\newcommand{\mc}[1]{\mathcal{#1}}
% Arrow shorthand.
\newcommand*{\ra}{\rightarrow}
% Blackboard-bold letters.
\newcommand*{\bP}{\mathbb{P}}
\newcommand*{\bI}{\mathbb{I}}
\newcommand*{\bE}{\mathbb{E}}
\newcommand*{\bH}{\mathbb{H}}
\newcommand*{\bR}{\mathbb{R}}
\newcommand*{\bN}{\mathbb{N}}
\newcommand*{\bV}{\mathbb{V}}
% Two-argument divergences: \kl{P}{Q} and \js{P}{Q}.
\newcommand{\kl}[2]{D_{\mathrm{KL}}(#1 \mid\mid #2)}
\newcommand{\js}[2]{D_{\mathrm{JS}}(#1 \mid\mid #2)}
% Shorthand for \underbrace.
\newcommand{\ubr}[1]{\underbrace{#1}}
% Blackboard-bold 1 (requires bbm).
\newcommand*{\indic}{\mathbbm{1}}

% Theorem-like environments. No shared-counter argument is given, so each
% environment keeps its own independent numbering.
\newtheorem{assumption}{Assumption}
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}

% Toggle for inline author notes: with \submittrue every note macro expands
% to nothing; with \submitfalse each note is printed in the author's color,
% prefixed with the author's name.
\newif\ifsubmit
\submitfalse
% \submittrue
\ifsubmit
\newcommand{\dnote}[1]{}
\newcommand{\gnote}[1]{}
\newcommand{\vnote}[1]{}
\newcommand{\bnote}[1]{}
\newcommand{\lnote}[1]{}
\else
% xcolor loaded without options does not predefine "darkgreen"; define it only
% if nothing else (e.g., the class) has, so that \lnote does not fail on use.
\providecolor{darkgreen}{rgb}{0,0.39,0}
\newcommand{\dnote}[1]{\textcolor{blue}{Dilip: #1}}
\newcommand{\gnote}[1]{\textcolor{green}{Gianluca: #1}}
\newcommand{\vnote}[1]{\textcolor{purple}{Veniamin: #1}}
\newcommand{\bnote}[1]{\textcolor{orange}{Benedikt: #1}}
\newcommand{\lnote}[1]{\textcolor{darkgreen}{Lisa: #1}}
\fi

% Typesets its argument in red (grouped \color switch).
\newcommand{\gianluca}[1]{{\color{red}#1}}

% Display names for the models referred to in the text.
\newcommand*{\llamab}{Llama-3.1-8b}
% Name with "\alpha = #1" attached as a math subscript.
\newcommand{\llamaf}[1]{Llama-8b-HM-distill$_{\alpha = {#1}}$}

\newcommand*{\llamai}{Llama-8b-Instruct}
\newcommand*{\llamad}{R1-Llama-8b}
\newcommand*{\gptmini}{gpt-4o-mini}
\newcommand*{\gpt}{gpt-4o}

\title{Hindsight Merging: Diverse Data Generation with Language Models}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
% Add authors
\author[1]{\href{mailto:<veniamin@princeton.edu>?Subject=Hindsight Merging}{Veniamin Veselovsky\thanks{Equal contribution}}{}}
\author[1]{Benedikt Stroebl\textsuperscript{*}}
\author[1]{Gianluca Bencomo\textsuperscript{*}}
\author[1]{\\Dilip Arumugam} 
\author[3]{Lisa Schut}
\author[1]{Arvind Narayanan}
\author[1,2]{Thomas L. Griffiths}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    Princeton University
}
\affil[2]{%
    Department of Psychology\\
    Princeton University
}
\affil[3]{%
    OATML, Department of Computer Science\\
    University of Oxford
  }
  
  \begin{document}
\maketitle

\begin{abstract}
Pre-training a language model equips it with a broad understanding of the world, while fine-tuning refines it into a helpful assistant. 
However, fine-tuning does not exclusively enhance task-specific behaviors but also suppresses some of the beneficial variability from pre-training. This reduction in diversity is partly due to the optimization process, which theoretically decreases model entropy in exchange for task performance. To counteract this, we introduce \emph{hindsight merging}, a technique that combines a fine-tuned model with a previous training checkpoint using linear interpolation to restore entropy and improve performance. Hindsight-merged models retain strong instruction-following capabilities and alignment while displaying increased diversity present in the base model. 
Additionally, this results in improved inference scaling, achieving a consistent 20-50\% increase in pass@$10$ relative to the instruction tuned model across a coding benchmark and series of models. 
Our findings suggest that hindsight merging is an effective strategy for producing diverse generations that follow instructions. 
\end{abstract}


\section{Introduction}\label{sec:intro}
Humans solve a wide variety of problems by reusing previously learned knowledge and applying diverse patterns of thinking \citep{griffiths2019doing}. This ability to adapt and explore multiple cognitive pathways allows human reasoning to converge to correct solutions over time \citep{collins2013cognitive,tomov2020discovery,solway2014optimal,maisto2015divide,correa2023humans}. 
The adaptive reuse of available resources is not just a hallmark of human intelligence but also a key ingredient in the design of reliable artificial intelligence (AI) systems. In many AI applications, diversity plays a crucial role. Repeated sampling from language models relies on a wide variety of generated responses to enhance performance~\citep{brown2024large}. Compound AI systems~\citep{compound-ai-blog} improve their scalability and inference capabilities when diverse data inputs and models are integrated. However, while diversity enhances both human reasoning and AI training, the challenge of generating synthetic datasets with sufficient richness and variation remains an unresolved problem.

The primary challenge can be understood through the lens of optimization. Both humans and language models are known to directly fit their training data~\citep{hasson2020direct}, but the breadth of human experience leads to more diversity in our ability to reason. Training large language models (LLMs) on vast and diverse internet corpora produces a base model that generates a wide variety of outputs. However, we remove much of this diversity when we fine-tune due to the use of smaller datasets and the objective of aligned behavior~\citep{murthy2024one}. This creates a paradox: the optimization that improves task performance undermines the model's ability to generate the broad range of outcomes necessary for rich and diverse synthetic datasets. Simply put, the best model for solving a task is not necessarily the best model for generating a dataset.

\begin{figure*}[t]
    \centering
    \hfill
    \includegraphics[width=0.9\linewidth]{visuals/figure_1.pdf}
    \caption{Overview of our main findings.}
    \label{fig:fig1}
\end{figure*}


In this work, we explore this trade-off between optimizing on task-specific datasets and producing diverse synthetic datasets. We introduce a theoretically-motivated approach to increasing diversity, \emph{hindsight merging}, that merges an instruction-tuned model with prior training checkpoints to better accommodate the trade-off required for diverse dataset generation (see Figure~\ref{fig:fig1}). Through hindsight merging, we demonstrate that the resulting data is more diverse, maintains instruction-following abilities, and has improved pass@$k$ performance compared to the constituent models before merging. 
%We conclude with an example of how hindsight merging can be used to distill reasoning models. By distilling Llama-3.1-Instruct, we show that data generated by our merged models leads to a 52\% relative gain in performance compared to distilling on reasoning traces from \llamad{}. 


\section{The Entropy Spectrum}\label{sec:entropy}
\begin{figure*}[ht]
    \centering
    \includegraphics[width=\textwidth]{visuals/entropy.png}
    \caption{
    Analysis of theoretical volume changes for SGD based on logit rank (1-10) and softmax entropy in next-token predictions (top$_k$ = 600). Using 10 random arXiv abstracts from January 2025, we compute next-token predictive distributions and compute the Jacobian of the SGD step when the correct token corresponds to each logit.}
    \label{fig:entropy}
\end{figure*}

While fine-tuning language models is crucial for achieving high performance on downstream tasks, it requires specializing the response distribution away from the broad, diverse data that was well-approximated during pre-training. Suppose pre-training involved fitting language model responses to match a distribution $P$. Later, fine-tuning the pre-trained language model demands matching a new response distribution $Q$. Let $\bH(\cdot)$ denote the entropy of a distribution~\citep{shannon1948mathematical,cover2012elements}. It is to be expected that $\bH(Q) < \bH(P)$; indeed, given that current best practices often take $P$ as an Internet-scale distribution over prompt-response pairs, it may be more realistic to consider $\bH(Q) \ll \bH(P)$. During fine-tuning, a language model is being optimized away from accurately approximating $P$ to fit $Q$, which forces a decrease in entropy to accommodate the new response distribution. Such a drop in entropy naturally begets a commensurate drop in overall response diversity.

This section attempts to make this intuitive idea mathematically precise in two concrete ways. First, we focus on supervised fine-tuning and adopt a local perspective by studying the change in entropy per optimization step. Second, we adopt a reinforcement-learning perspective to implicitly consider how entropy changes after multiple iterations of policy optimization. Taken together, these theoretical findings corroborate the intuitive notion that fine-tuning decreases entropy and, in doing so, reduces the diversity of language model responses. We conclude with an information-theoretic motivation for a simple remedy to this vanished diversity: mixing response distributions between the current fine-tuned model and the pre-trained base model.

\subsection{Supervised Fine-Tuning}

Consider a neural network $q_\theta(x_{t+1} | x_{1:t}) = \text{softmax}(f(x_{1:t}; \theta))$, parameterized by $\theta \in \mathbb{R}^D$, that models a predictive distribution over next-token generations. We denote $q_\theta(x_{t+1} | x_{1:t})$ as $q_\theta(x)$ for brevity. Let us define parameter updates during fine-tuning as a transition operator $T: \mathbb{R}^D \to \mathbb{R}^D$, which induces the following mapping $M$ on the output space: 
\begin{align}
    M : q_\theta^k(x) \mapsto q_\theta^{k+1}(x)
\end{align}
where $q_\theta^{k+1}(x) = \text{softmax}(f(x_{1:t};T(\theta_k)))$ after applying the transition $\theta_{k+1} = T(\theta_k)$. For all subsequent calculations, please consult Appendix \ref{sec:appendix_entropy_jacobian} for detailed derivations. The total entropy of the predictive distribution after $K$ update steps from some base model $q_\theta^0(x)$ is given by:
\begin{align}
    \label{eq:entropy_total}
    \bH(q_\theta^K) = \bH(q_\theta^0) + \sum_{k=0}^{K-1} \mathbb{E}_{x \sim q_\theta^k} \left[ \log \left| \frac{\partial M}{\partial q}\left(q_\theta^k(x)\right) \right| \right],
\end{align}
where $\left| \partial M / \partial q \right|$ denotes the Jacobian determinant of $M$ at each update step. For stochastic gradient descent (SGD), the transition operator $T$ is defined as
\begin{align}
    \theta_{k+1} \leftarrow \theta_k - \alpha\nabla_\theta\mathcal{L}(\theta_k), \label{eq:sgd}
\end{align}
where $\alpha$ is the learning rate. Although $M$ does not admit a closed form solution, we can approximate it using a first-order Taylor expansion for small $\alpha$:
\begin{align}
    \label{eq:M}
    M(q) \approx q - \alpha \left[\text{diag}(q) - qq^\top\right] \frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta).
\end{align}
Taking the Jacobian yields:
\begin{align}
    \label{eq:jacobian}
    \bH(q_\theta^{k+1}) - \bH(q_\theta^{k}) \approx \mathbb{E}_{x \sim q_\theta^k}\left[ \log \left| I - \alpha H \right| \right],
\end{align}
where $H = \left[E - G\right]\frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta)$ for $E_{ijk} = \delta_{ij} \cdot \delta_{jk}$ and $G_{ijk} = \delta_{ik}q_j + \delta_{jk}q_i$. This approximation denotes how much entropy changes per update step using SGD.

During supervised fine-tuning (SFT), the goal is to align the model's next-token predictions with a target data-generating process, $p^\star(x_{t+1}|x_{1:t})$, which is typically smaller and less diverse than the original pre-training data. In the most common case, we perform SFT with the cross-entropy loss. The gradient term from Equation \ref{eq:M} can be approximated by the expression
\begin{align}
\frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta) \approx (q - y),
\end{align}
where $q$ denotes model output probabilities and $y$ represents the one-hot encoded ground-truth labels for the input $x$. Note that this approximation captures the classical gradient for cross-entropy, but under the assumption of a small $\alpha$ and dominance of logit-level loss terms in the gradient flow. While simplifying, this approximation allows us to analyze how entropy over next-token predictions evolves at each step of the optimization process using Equation~\ref{eq:jacobian}.

In Figure \ref{fig:entropy}, we show approximate changes to model entropy, indicated by $\left| \partial M / \partial q \right|$. To build an intuition, when $\left| \partial M / \partial q \right| > 1$, the volume of the predictive distribution $q_\theta(x_{t+1}|x_{1:t})$ is expanding, implying an increase in entropy over model generations given the token history $x_{1:t}$. When $\left| \partial M / \partial q \right| < 1$, volume is shrinking, implying a decrease in entropy. 

Optimizing the cross-entropy loss affects model behavior differently depending on the rank of the ground-truth logits. For rank-1 predictions, all models contract in volume, thereby systematically reducing the entropy of conditional distributions for next-token predictions as they begin to fit the data perfectly. In this case, entropy decrease is a symptom of overfitting. For rank-2 through rank-10 predictions, lower-entropy models show volume expansion whereas higher entropy models continue to contract. Regardless of rank, high-entropy models consistently exhibit volume contraction, with less aggressive contractions as the prediction rank increases. 

\subsection{Reinforcement Learning with Human Feedback}

Aside from the supervised approach examined in the preceding section, an alternative route to fine-tuning a language model uses reinforcement learning~\citep{stiennon2020learning,ouyang2022training}. Compared to the local analysis in the previous sub-section, which characterizes how entropy changes per optimization step, this section presents a more global picture. As the reinforcement learning with human feedback (RLHF) pipeline carries various adornments that complicate analysis, this section restricts its focus to a simpler Markov decision process (MDP) wherein learning an optimal policy is equivalent to learning the per-token distribution of some underlying fine-tuning response distribution. Under mild assumptions, we show that fine-tuning a language model towards a lower-entropy dataset via policy-gradient updates~\citep{sutton1999policy} decreases the entropy in the language model's responses relative to the pretrained model. We encourage readers to consult Appendix~\ref{sec:app_rl} for all technical details.

Let $\mc{V}$ be a finite vocabulary of tokens such that the set of possible token sequences is $\mc{L} = \bigcup\limits_{n=1}^\infty \mc{V}^n$. Let $\mu \in \Delta(\mc{L})$ be the distribution over prompts and let $p^\star: \mc{L} \ra \Delta(\mc{L})$ be the ground-truth response distribution given any prompt. 
Next, we specify a MDP in which any LLM is a policy and the reward function is constructed such that learning the optimal policy amounts to obtaining a LLM that matches $p^\star$.

Consider the infinite-horizon, discounted MDP~\citep{bellman1957markovian,Puterman94} $\mc{M} = \langle \mc{S}, \mc{A}, \mc{R}, \mc{T}, \mu, \gamma \rangle$. Here $\mc{S} = \mc{L}$ represents any token sequence from $\mc{V}$. The action space $\mc{A} = \mc{V} \cup \{\texttt{STOP}\}$ contains all valid tokens a LLM may emit as well as an explicit \texttt{STOP} token to denote response completion. Logically, the MDP follows deterministic transition dynamics $\mc{T}: \mc{S} \times \mc{A} \ra \mc{S}$ which appends the selected token to the current state: $\mc{T}(s, a) = \langle s, a \rangle$.\footnote{For brevity, we omit an absorbing, zero-reward terminal state that an agent transitions to upon choosing the \texttt{STOP} action.} The initial state distribution $\mu \in \Delta(\mc{S})$ is precisely the distribution over prompts from above. The discount factor $\gamma \in [0,1)$ conveys the effective time horizon for optimizing rewards. So far, we have specified a controlled Markov process (that is, a MDP without a reward function) such that any policy $\pi: \mc{S} \ra \Delta(\mc{A})$ represents a LLM that examines the prompt along with any partially-generated response thus far and emits a distribution over next tokens. 

To capture the objective of fine-tuning a LLM towards a ground-truth response distribution $p^\star$, we consider a policy-dependent reward function defined as $\mc{R}(s,a) = \log\left(\frac{p^\star(a \mid s)}{\pi(a \mid s)}\right)$. Recall that the performance of any policy $\pi$ with respect to a prompt $s \in \mc{S}$ is given by its associated value function $V^\pi(s) = \bE\left[\sum\limits_{t=0}^\infty \gamma^t \mc{R}(s_t, a_t) \mid s_0 = s\right]$. With a slight abuse of notation, we account for randomness in the initial state through $V^\pi(\mu) \triangleq \bE_{s_0 \sim \mu}\left[ V^\pi(s_0)\right]$. Recall that any policy induces a corresponding discounted stationary state visitation distribution $d^\pi_\mu(s) = (1-\gamma)\sum\limits_{t=0}^\infty \gamma^t \bP^\pi(s_t = s),$ where $\bP^\pi(s_t = \cdot) \in \Delta(\mc{S})$ is the distribution over states visited by policy $\pi$ at timestep $t$. Intuitively $d^\pi_\mu$ encodes which states policy $\pi$ will occupy using $\gamma$ to account for near-term versus future visitation. In the context of LLMs, $d^\pi_\mu$ encodes a distribution over prompts and partial/complete responses generated by a particular LLM $\pi$.

We define the optimal policy $\pi^\star$ of $\mc{M}$ as achieving supremal value with associated value function $V^\star(\mu) = \sup\limits_{\pi} V^\pi(\mu)$. For the particular choice of policy-dependent reward function, we see that an optimal policy $\pi^\star$ minimizes the KL-divergence between its own per-step token distribution and that of the ground-truth distribution $p^\star$: $V^\star(\mu) = -\inf\limits_{\pi} \frac{1}{(1-\gamma)} \bE_{s \sim d^\pi_\mu}\left[\kl{\pi(\cdot \mid s)}{p^\star(\cdot \mid s)}\right].$

\begin{theorem}[Informal]
    Let $\pi^0$ be an initial, pre-trained base model and $\pi^K$ be the fine-tuned LLM after $K \in \bN$ iterations of policy-gradient updates. Then, we have $$\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^K_s}\right] \lesssim \bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^0_s}\right].$$
    % $$\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^K_s}\right] \leq \bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^0_s}\right] + \frac{\eta^2\beta W^2 K}{2}.$$
    \label{thm:finetune_kl_decr}
\end{theorem}
% \vspace{-15pt}

As the optimal policy of the MDP $\mc{M}$ is $p^\star$, Theorem~\ref{thm:finetune_kl_decr} affirms that making policy-gradient updates~\citep{sutton1999policy} brings the fine-tuned LLM $\pi^K$ closer (in KL-divergence) to the lower-entropy response distribution $p^\star$ than the higher-entropy pre-trained model $\pi^0$.

\subsection{Toward Recovering Diversity}

Just as the reduction of entropy formalized in the previous two sub-sections is an intuitive consequence of language model fine-tuning, we use this final sub-section to motivate an equally intuitive solution: mixing model parameters. Prior work establishes a link between the blending of model parameters and a commensurate blending of the associated model response distributions~\citep{kangaslahti2024continuous}. One might naturally hope to obtain the ``best of both worlds'' by mixing weights of a pre-trained language model fit to a high-entropy response distribution and those of a fine-tuned model closely approximating a low-entropy response distribution. We may formalize this intuition with a first proposition that considers an obvious linear interpolation between response distributions:
\begin{proposition}
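Consider two arbitrary probability distributions $P$ and $Q$ such that $\bH(P) \geq \bH(Q)$. For any $\alpha \in [0,1]$, define $M_\alpha = \alpha \cdot P + (1-\alpha) \cdot Q$. Then, $$\bH(Q) \leq \bH(M_\alpha) \leq \bH(P).$$
% NOTE(review): the upper bound appears not to hold for arbitrary P and Q (e.g., two distinct point masses have zero entropy each, but their equal mixture has entropy log 2); confirm the intended assumptions against the appendix proof.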
\end{proposition}
A more general information-theoretic analysis allows us to obtain looser bounds on the entropy of a response sampled from a mixture distribution that goes beyond just linear interpolation:
\begin{proposition}
Consider two arbitrary probability distributions $P, Q \in \Delta(\mc{X})$ with $X_1 \sim P$ and $X_2 \sim Q$. Let $Z \in \Delta(\{1,2\})$ be a random index following an arbitrary distribution. Then, $X_Z$ is a random variable denoting a sample from the mixture distribution between $P$ and $Q$ induced by $Z$. Moreover, $$2 \cdot \min\limits_{i \in \{1,2\}} \bH(X_i) \leq \bH(X_Z) \leq 2 \cdot \max\limits_{i \in \{1,2\}} \bH(X_i) + 1.$$
\end{proposition}

Proofs may be found in Appendix \ref{sec:appendix_interpolate}. Together, these two propositions highlight one promising pathway to recovering the response diversity lost due to standard fine-tuning practices; namely, by blending the response distributions of the current fine-tuned language model and the pre-trained base model. In the next section, we present our hindsight merging approach that uses interpolation between respective model weights to achieve this diverse mixture of response distributions.



\section{Diverse Generation}

In Section \ref{sec:entropy}, we showed that fine-tuning on low diversity datasets leads to low diversity data generations in fine-tuned models. 
We propose \textit{hindsight merging} ---  interpolating between the weights of the base model, which has a high diversity, and the fine-tuned model --- to improve diversity in the generations.

Mixing weights leverages the fact that, during fine-tuning, the model is likely in the neural tangent kernel (NTK) regime~\citep{fort2020deep, wortsman2022robust, wortsman2022model, ilharco2022editing}. When the model is in the NTK regime, the functional updates are approximately linear. Therefore, by interpolating between the weights, we approximately roll-back the model along its optimization trajectory to previous checkpoints that capture the desirable results of fine-tuning but at higher levels of entropy.


\subsection{Encouraging Diversity}
\xhdr{Hindsight merging} For model merging, we focus our analysis on a couple of classes and sizes of models: Llama-3.1-8b, Llama-2-7b, Llama-2-13b, Llama-3.1-70b. For each of these models we combine the base and instruct models using MergeKit~\citep{goddard-etal-2024-arcees} and use linear interpolation (see Appendix~\ref{app:merging} for further information). For the Llama-2 series of models, we combine the base model with the Vicuna instruct version~\citep{vicuna2023}. 

To merge the models, we define a parameter $\alpha\in [0,1]$ which measures the merging coefficient. When $\alpha=0$, the merged model equals the base model; when $\alpha=1$, it is entirely the instruct model. 
For experiments, we restrict $\alpha \in \{0, 0.7, 0.9, 1\}$. We denote $\alpha=0$ as the ``pretrained'' model, and $\alpha=1$ as the ``instruct'' model. For the pretrained model, we convert prompts to completion prompts, whereas for the merged models we use a chat template. 


\subsection{Evaluation}
\xhdr{Data} We take 378 tasks from MBPP+, a large-scale benchmark dataset for code generation in Python~\citep{liu2024evaluatinglanguagemodelsefficient}. MBPP+ is an extension of the original MBPP benchmark with additional unit tests to improve test coverage and avoid false positives.  


\xhdr{Diversity} In many applications of synthetic data, semantic diversity outweighs syntactic diversity---diverse approaches are more valuable than diverse use of language for the same approach. To account for this, we rely on a BERTScore-style~\citep{zhang2019bertscore} similarity metric. For each of the rewritten generations, we embed the traces using OpenAI's \texttt{text-embedding-3-small} with 150 dimensions. We measure the \emph{local} similarity as the average cosine similarity across generations for the same question, and the \emph{global} similarity as the average cosine similarity across generations for different questions. 

\xhdr{Performance} For each of the programming tasks in MBPP+, we test a model's performance on the dataset using the provided unit tests (functions that verify the code correctness). Here we report two scores, pass@$1$ and pass@$k$~\citep{kulal2019spoc}. Pass@$1$ is the average fraction of generations that are correct, irrespective of whether the model produces multiple correct generations for one question. On the other hand, pass@$k$ measures the probability that at least one of $k$ generations for a problem passes the unit tests. Explicitly, we estimate pass@$k$ using the following formula from \citet{chen2021evaluating}:
\begin{align*}
    \text{pass}@k := \mathbb{E}_{\text{Problems}} \left[ 1 - \frac{\binom{n-c}{k}}{\binom{n}{k}} \right],
\end{align*}
where $n$ is the number of generations sampled per problem and $c$ is the number of those generations that pass the unit tests.

\begin{figure*}[t]
    \begin{subfigure}{.95\textwidth}
        \centering
        \includegraphics[width=\linewidth]{visuals/avg_question_similarity_2x2.pdf}
        % \caption{Local setting cosine similarity.}
        % \label{fig:div:subfig2}
    \end{subfigure}
    \caption{Average cosine similarity across embeddings of generated text in the global (cross-question) setting.}
    \label{fig:diversity_global}
\end{figure*}

\begin{figure*}[t]
    \begin{subfigure}{.95\textwidth}
        \centering
        \includegraphics[width=\linewidth]{visuals/avg_same_question_similarity_2x2.pdf}
        % \caption{Local setting cosine similarity.}
        % \label{fig:div:subfig2}
    \end{subfigure}
    \caption{Average cosine similarity across embeddings of generated text in the local (same-question) setting.}
    \label{fig:diversity_local}
\end{figure*}

\xhdr{Instruction following} Pre-training a language model provides it with an understanding of the world; during the fine-tuning stage, however, the model becomes an assistant that learns how to follow instructions.
Thus, merging the two models may lead to reduced performance on the instruction-following tasks the model was trained on. One such task is refusal: when prompted with a potentially problematic request, an instruction-tuned language model is explicitly trained to refuse that request. To evaluate the hindsight-merged models, we measure how performance on SORRY-Bench---a benchmark for safety refusals---changes~\citep{xie2024sorrybench}. To evaluate SORRY-Bench generations, we use GPT-4.1-mini as a judge. 

% \xhdr{Fine-tuning a reasoning model} To validate the downstream usefulness of our method, we use reasoning model distillation, a task known to benefit from diverse data traces~\citep{muennighoff2025s1}. We
% fine-tune \llamai{} on the data generated with our set of merged models to produce \llamaf{\cdot }. Following the method of various recent works, we filter out reasoning traces that lead to an incorrect solution \citep{sky_t1_2025, bespoke_stratos}. For questions with more than two correction generations, we randomly select two. We then do supervised fine-tuning (SFT) on the remaining data using \texttt{LLama-Factory} \citep{zheng2024llamafactory}. We evaluate the resulting reasoning models on the test set of easy tasks of the TACO benchmark.




\section{Results}

In our experiments, we show that hindsight merging (1) induces diversity in the generations, (2) does not harm model performance in other aspects, such as instruction following, and (3) improves model pass@$k$ performance.

\subsection{The best of both worlds}
\xhdr{Diversity} To begin, we explore how varying the $\alpha$ parameter affects the diversity of the model generations. In Figure~\ref{fig:diversity_global}, we illustrate the global similarity metrics across the different models. Within the global context (cross-question similarity), reducing $\alpha$ makes the model more diverse on average. Across the four models we studied, the fully instruct model has the highest average BERTScore-style similarity across its generated responses. 

In Figure~\ref{fig:diversity_local}, we show the intra-question similarity for the different models. Within the Vicuna models, we find that hindsight merging actually results in reduced diversity for generations of the same question compared with the fully instruct model. On the other hand, for the Llama-3.1-8B and Llama-3.1-70B models, we find that the fully instruct model consistently provides the most homogeneous answers. The difference between the Vicuna and Llama models is largely explained by the inability of the Vicuna models to solve some of the coding problems, resulting in highly unstable behavior. For some of the questions, the model merely repeats the original prompt, while at other times it generates highly varying solutions. This behavior is further backed up by the generally poor performance shown below. 

% We note that reducing $\alpha$ makes the model more diverse. 
% These results indicate that increasing the instruct model fraction in the merge results in generations that are more templated and less variable, while lower values of $\alpha$ preserve the base model’s greater variability in responses. 








% In the Appendix we include a figure 

\xhdr{Failure-to-Refuse}
Figure \ref{fig:sorry-bench-results} presents a heat-map of failure-to-refuse rates across the 44 SORRY-Bench content clusters. Each column corresponds to a model variant; the number in parentheses beside the name is its mean failure rate (i.e., the fraction of hazardous prompts it answered instead of refusing). Darker shades mark categories where the model answered when it should have refused, so higher values mean weaker safety behavior.

For the Llama-3.1-8B family, the pretrained checkpoint answers almost half of restricted prompts (0.48). Blending in instruct weights sharply reduces this: at $\alpha=0.7$ the rate falls to 0.32, at $\alpha=0.9$ to 0.30, and the fully-instruct model settles at 0.28. Thus even a modest interpolation captures most of the instruction-tuned caution while preserving base-model fluency.
On the other hand, for the Llama-3.1-70B family scale alone does not buy safety. The 70B pretrained model is the least compliant in the study (0.75). $\alpha=.70$ cuts the failure rate nearly in half (0.42), and full instruction tuning brings it down to 0.45, still well above the 8B instruct score. 
Finally, both Vicuna models show the same monotonic trend. For the 7B model, the pretrained checkpoint fails to refuse around 47\% of the time (0.47); this is reduced to 0.44 at $\alpha=0.7$ and further to 0.40 at $\alpha=0.9$, compared to 0.37 in the instruct setting. The 13B model demonstrates a similar pattern: $0.47 \to 0.42 \to 0.31 \to 0.31$. 

% Beyond column averages, the grid reveals stubborn hot spots: detailed cyber-crime tutorials (#14 System Intrusion, #15 Malware), extremist or violent content (#27 Extremism & Terror), and certain sexual or discriminatory requests (#30 Non-Explicit Adult Content, #31 Discrimination). Even the best-behaved models still answer 30–50 % of these prompts, highlighting priority areas for further alignment.

% Hindsight merging consistently lowers the failure-to-refuse rate, but the magnitude of the safety gain depends on both model family and size. Strategic interpolation around often delivers most of the safety benefits at a fraction of the compute cost of full fine-tuning. 

% \xhdr{Instruction Following} The instruct model has capabilities that we want the base model to possess, like instruction following. When a user asks the model to generate code, ideally the model generates a code block. In this section, we explore to what extent the merged model maintains the instruction following capabilities of the instruction model using SORRY-Bench. 
% Each model’s average refusal score is indicated in parentheses following the model name. The base models ($\alpha=0$) exhibit relatively low refusal rates—for example, Llama-3.1-8B (pretrained) has an average score of 0.26—reflecting minimal safety intervention and a tendency to answer even risky or restricted prompts. At the other extreme, the fully instruct-tuned models ($\alpha=1$) are much more conservative, with average scores reaching as high as 0.80 (Llama-3.1-8B instruct), indicating that these models frequently refuse to answer potentially sensitive content. As we interpolate between base and instruct with increasing $\alpha$, we observe a steady rise in refusal rates: for instance, Llama-3.1-8B ($\alpha=0.7$) achieves a score of 0.49, while at $\alpha=0.9$ it further increases to 0.64. This trend demonstrates that increasing the weight on the instruct model substantially raises the model’s overall caution, inheriting the alignment and safety behaviors of the instruct-tuned parent. Notably, this pattern is consistent across both Llama and Gemma model families, although the effect size varies slightly by architecture.
% It's relevant to note that the Gemma-3-4b (pretrained) failed to answer many of the generations by generating either repeated emojis or unrelated answers.



\subsection{Coding Performance}
% Next, we turn to evaluating how well the models solve the MBPP+ programming tasks. In Figure~\ref{fig:capabilities}, we show a comparison between the pass@$1$ and pass@$16$. 
% We observe that as $\alpha$ decreases (more of the base model merged in) the pass@$1$ decreases. In other words, the models become worse at generating the correct answer when prompted once. However, the pass@$16$ tells a different story. The merged models achieve a higher pass@$10$ than the $\alpha=1$ model. In the $\alpha=0.7$ setting, we have a pass@$10$ of 0.696, whereas the full instruct model has a pass@$10$ of 0.654. This suggests that the inference scaling laws of the two models may be different. In other words, repeated sampling from a diverse model leads to better pass@$k$ performance than repeated sampling from the instruct model. In Figure~\ref{fig:capabilities} we illustrate the difference in scaling patterns and show that after 6 generations, the merged model passes the fully instruct model. 
% That being said, $\alpha=0.7$ seems to be a sweet spot, as including even more of the base model ($\alpha=0.5$) results in a drop at the pass@$10$ to 0.586. 

We evaluate how well merged models perform on MBPP+ by measuring pass@$k$ for $k \in \{1, \dots, 10\}$. Figure~\ref{fig:diversity} shows inference scaling results across four model families. 
We observe a consistent trend across all models: hindsight merging seems to improve performance and achieve the best of both worlds, instruct and pretrained models. For all four model families, the highest pass@10 results are attained by hindsight merged models.

These gains seem to be more pronounced for weaker models like Vicuna-7B and Vicuna-13B. Vicuna-7B's pass@$10$ jumps from 0.296 in the instruct model to 0.491 with $\alpha=0.7$, while also improving pass@$1$ from 0.085 to 0.255. Similarly, Vicuna-13B improves from 0.41 to 0.56 on pass@$10$ when merging with $\alpha=0.7$. These results show that merging can recover strong performance while retaining instruction-following capabilities.

Interestingly, for the strongest model—Llama-3.1-70B—the gains are more modest, suggesting that the performance-diversity trade-off is already well-balanced in larger models. Nevertheless, the merged model with $\alpha=0.7$ achieves the highest overall performance for both pass@$1$ and pass@$10$.

In summary, the evidence indicates that hindsight merging reliably improves performance under repeated sampling (pass@$k$), with the strongest effects seen in smaller models. This confirms that inference scaling benefits from balancing instruction-following with higher generation diversity.

\begin{figure*}[t]
    \begin{subfigure}{.95\textwidth}
        \centering
        \includegraphics[width=\linewidth]{visuals/pass_at_k_plots.pdf}
        % \caption{Local setting cosine similarity.}
        % \label{fig:div:subfig2}
    \end{subfigure}
    \caption{Inference scaling across different $k$ between 1 and 10.}
    \label{fig:diversity}
\end{figure*}




% \subsection{Exploration of failure modes}


% We conduct a more qualitative analysis to see where the models fail. In Table~\ref{tab:error-analysis} we break down failure modes into two categories following the work of \citet{li2023taco}: (1) failure to include a code snippet and (2) failure for written code to pass unit tests. We additionally include a column with the fraction of generations with correct code. We first observe that in the base model case, $\alpha=0$, 54.9\% of the time the model fails to include a code snippet, and the remaining 33.9\% result in incorrect code. In the other extreme, the fully instruct model, the model includes code snippets most of the time (89.6\%), but fails due to incorrect generations (84.4\% of failures). As we can see, increasing the fraction of base model included results in inheriting the lack of instruction following: 83.5\% of the generations contain code in the case of $\alpha=0.7$ compared with 69.6\% for $\alpha=0.5$. 



% \begin{table}[]
%     \centering
%     \small
%     \renewcommand{\arraystretch}{1.2}
%     \begin{tabular}{lrrr}
%         \toprule
%         \textbf{Model} & \textbf{Incorrect (\%)} & \textbf{No code (\%)} & \textbf{Correct (\%)} \\
%         \midrule
%         $\alpha=1$ & 56.8\% & 10.5\% & 32.7\% \\
%         $\alpha=0.7$ & 58.4\% & 16.6\% & 25.0\% \\
%         $\alpha=0.5$ & 53.7\% & 30.0\% & 16.3\% \\
%         $\alpha=0$ & 33.9\% & 55.7\% & 10.5\% \\
%         \bottomrule
%     \end{tabular}
%     \caption{Error analysis. Incorrect means that the code component was present, but it gave the wrong answer. No code means that the generation had no code. Correct means percentage of generations that solved the task.}
%     \label{tab:error-analysis}
% \end{table}

% \subsection{Training a Reasoning Model}
% The hindsight-merged models produce diverse and correct reasoning traces, making them strong candidates for training a dedicated reasoning model. To evaluate the effectiveness of this approach, we fine-tune \llamai{} on datasets generated with different model merges, creating new series reasoning models \llamaf{}.

% We follow a standard distillation pipeline: first, we filter out reasoning traces leading to incorrect solutions, ensuring high-quality supervision. Then, we apply supervised fine-tuning (SFT) using Llama-Factory~\citep{zheng2024llamafactory}. Finally, we evaluate the resulting model on the TACO test set, focusing on medium-difficulty programming tasks.

% The results in \cref{tab:finetuning_results} show that \llamaf{0.7} trained on data curated from a model merged with a SLERP weight of $\alpha = 0.7$ on the instruct model achieves the highest single-sample accuracy, surpassing the model trained on the instruct model’s generations. This confirms that increased diversity in reasoning traces leads to better generalization.

% \begin{table}[h]
%     \centering
%     \small
%     \begin{tabular}{lcc}
%         \toprule
%         \textbf{Model} & \textbf{Pass@1} & \textbf{Pass@10} \\
%         \midrule
%         \llamaf{1.0} & 5.2\% & 21.5\% \\
%         \llamaf{0.7} & \textbf{7.0\%} & 29.0\% \\
%         \llamaf{0.5} & 6.7\% &\textbf{32.0\%} \\
%         \bottomrule
%     \end{tabular}
%     \caption{Pass@$1$ and Pass@$10$ results for fine-tuned reasoning models using different hindsight merging coefficients $\alpha$ on TACO-easy. The best-performing models both use data curated from merged models at $\alpha=0.7, 0.5$, suggesting that hindsight merging improves diversity and generalization.}
%     \label{tab:finetuning_results}
%     \vspace{-0.5cm}
% \end{table}

% Additionally, we observe improved inference scaling properties in the distilled models. Figure~\ref{fig:finetuning-scaling} presents pass@$k$ curves, highlighting that models trained on hindsight-merged data generate more diverse reasoning traces, yielding steeper inference scaling curves compared to models trained solely on instruct-generated data. Specifically, the scaling curve for \llamaf{0.5} is the steepest and achieves the highest pass@$k$ for a $k$ beyond four. This suggests that diverse training data improves robustness and compositional reasoning, allowing the model to explore a broader solution space.


% Together, these findings support the core hypothesis: hindsight merging enhances synthetic data diversity, which in turn leads to more effective reasoning model distillation. Our results are in line with prior work~\citep{muennighoff2025s1} that highlights the importance of diversity in reasoning trace data for model distillation. We show that hindsight merging can be a simple, yet effective technique for improving reasoning models.


%  \begin{figure}[t!]
%     \centering
%     \centering
%     \includegraphics[width=.8\linewidth]{visuals/finetuning_pass_at_k.pdf}
%     \caption{Performance on coding tasks with repeat sampling of the generations.}
%     \label{fig:finetuning-scaling}
%         \vspace{-0.5cm}
% \end{figure}


\section{Related work}
\xhdr{Model merging}
Model merging refers to combining the weights of different models to create a new model that ideally retains the strengths of its components. This approach has been found to produce models that perform well across multiple tasks originally handled by the individual models~\citep{pmlr-v162-wortsman22a}. Notably, model merging has been shown to be more effective than data mixing for integrating knowledge across models~\citep{Aakanksha2024MixDO}, and various strategies have been explored to improve its effectiveness~\citep{akiba2025evolutionary,yadav2024matters}. Additionally, \citet{wortsman2022robust} showed that model merging improves robustness to data shifts.

A common method for model merging is linear interpolation, where the weights of two or more models are combined using a weighted sum~\citep{pmlr-v162-wortsman22a}. 
Depending on the training regime, linear interpolation is approximately equal to ensembling models~\citep{wortsman2022robust, wortsman2022model}. 
% Related to this, task vectors have been proposed as a method for model adaptation: by fine-tuning a language model on a specific task and then adding the fine-tuned model back to the original model, task-specific capabilities can be transferred without full retraining~\citep{ilharco2022editing}.
Related to this idea, task vectors have been proposed as a method for model adaptation: a task can be approximated by computing the difference between the weights of a fine-tuned language model and those of the base model. 
This difference represents the task, and can be combined with other task vectors to improve model performance ~\citep{ilharco2022editing}.
Related to this paper, task vectors have been used to convert a base language model into a chat model~\citep{huang2024chat}. This builds on recent work that shows the representations of the base and instruct models are aligned~\citep{sae_finetuning,BaseLLMsRefuseToo,lindsey2024sparse,minder2025controllable}. 



\xhdr{Model post-training and reasoning}
Model post-training consists of everything that is done after the pretraining and is critical to making the model a capable assistant~\citep{bai2022training} and aligning them with human values~\citep{hendrycks2023aligningaisharedhuman}. The release of o1 was a harbinger for post-training reasoning into language models~\citep{jaech2024openai}, leading to a series of highly effective models, e.g., DeepSeek R1~\citep{guo2025deepseek}, S1~\citep{muennighoff2025s1}. 

Reasoning work builds on the observation that additional compute to solve tasks leads to dramatic improvements in performance. This additional reasoning can be achieved through prompting~\citep{prystawski2024think,NEURIPS2022_9d560961}, increasing computational depth~\citep{goyal2024think,pfau2024lets}, and explicitly training reasoning into language models~\citep{de2024rational, luo2024improvemathematicalreasoninglanguage, zelikman2022star}.
To train a reasoning model, work has shown that the data traces require high diversity, quality, and a range of difficulties~\citep{muennighoff2025s1}. 

\xhdr{Synthetic data curation and diversity}
Synthetic data curation is used for a wide range of applications from reasoning model distillation~\citep{guo2025deepseek} to self-play in language models~\citep{kumar2024training}. 
One active area of research is designing diverse synthetic data generation pipelines~\citep{samvelyan2024rainbow,veselovsky2023generating,ge2024scalingsyntheticdatacreation,zelikman2022star,yu2024large, chen2024diversity}; one approach explicitly asks the LLM to generate diverse hypotheses before solving a task~\citep{wang2024hypothesis,frohling2024personas,zhang2024improving}. One application of synthetic data has been training reasoning models~\citep{bespoke_stratos}, where it has been shown that weaker models provide better synthetic data than stronger models~\citep{bansal2024smaller} given that an accurate verifier is available to score responses~\citep{stroebl2024inference}.

Synthetic data is usually generated using RLHF models, which have been shown to reduce the diversity of the generated data~\citep{murthy2024one,achiam2023gpt,casper2023open,fdivergence,perez2022red}. 
Other work has shown that while they exhibit reduced diversity, it may not be problematic since it filters noisy and unhelpful generations~\citep{lake2024distributional}, a claim we further explore in this paper. One specific example where more diverse outputs are associated with better performance is inference scaling through best-of-n, where multiple candidate generations are sampled, and the highest-scoring output—often selected via a verifier or heuristic function—is chosen \citep{brown2024large, wang2024planningnaturallanguageimproves}.

\section{Discussion}
In this paper, we focus on reconciling a tension in generating data with language models: maintaining the output diversity of the base model without losing the strong instruction following abilities of the fine-tuned models. 
We show that fine-tuning via either supervised learning or reinforcement learning reduces output diversity by lowering the entropy during optimization. To counter this, we propose \emph{hindsight merging} for getting the best of both worlds by merging instruct models with their past checkpoints. Our experiments show the value of this method by studying both the generated data and its downstream applications. We illustrate that merging leads to more diversity, maintains instruction following, and exhibits better inference scaling behavior. 

\xhdr{Extensions} One natural first avenue for extending the methods outlined in this paper is scaling up model parameters and data. Even in our GPU-constrained settings, we saw inference scaling laws that demonstrate optimistic findings, and as we scale model size, number of samples generated, and number of questions answered, we expect similar improvements to hold. A second avenue is a more rigorous comparison with other diversity-increasing methods. Classically, entropy has been added to language models through temperature scaling. 
A more rigorous comparison of temperature scaling methods may provide interesting insights into how different diversity techniques result in different downstream implications. 
%One can also imagine playing with more interesting temperature scaling approaches, where once a generation appears similar to a previous generation, the temperature is dynamically scaled. 
Additionally, our work focuses on LERP for merging, but other methods could create different generation behaviors. While merging is usually done on the model level, logit-level mixing~\citep{huang2024divide,zhang2024cogenesis} may offer alternative distributional approaches. 


\xhdr{Limitations} While hindsight merging improves diversity and inference scaling, our approach has several limitations. First, verifier reliability: diverse generations produce responses that could increase the risk of misleading external verifiers often used to enable inference scaling \citep{stroebl2024inference}. Second, domain specificity: our experiments focus on code generation, and it remains unclear whether similar gains would extend to other tasks such as mathematics. Third, alternative methods: concurrent work suggests that generating synthetic data using base models combined with iterative rewriting can also enhance performance \citep{zhu2025bare}. This indicates that diverse, synthetic data generation may be achievable even with $\alpha = 0$ (that is, without explicit hindsight merging). However, in our experiments, we found that base models often struggle to generate high-quality, coherent, and sufficiently-long reasoning traces, limiting the effectiveness of this approach. 
%Lastly, latent reasoning models: recent work suggests that reasoning need not be explicit but can occur at the latent level~\citep{geiping2025scalingtesttimecomputelatent}, potentially reducing the necessity for explicit reasoning trace generation. Other latent reasoning methods still rely on chain-of-thought data~\citep{hao2024traininglargelanguagemodels}, however, and it remains to be seen which paradigm passes the test of time.

% last paragraph smth optimistic 
As the use of language model outputs grows, it will become increasingly important to generate diverse data. We believe that the careful interplay of richly-diverse base models and instruction-following, fine-tuned models may open up a wealth of opportunity for generating diverse and high-quality data. 


\xhdr{Reproducibility} We make the code and data used in this paper available here: \url{https://github.com/benediktstroebl/hindsight-merging}.

% References
\bibliography{main}

\newpage

\onecolumn

\title{Appendix}
\maketitle

\appendix

\section{Tracking Entropy Changes Over Optimization Steps}
\label{sec:appendix_entropy_jacobian}
In this section, we aim to provide a more rigorous treatment of Section~\ref{sec:entropy}. We will proceed pedagogically and first derive Equation~\ref{eq:entropy_total}, which forms the base of our analysis.

Suppose that we are performing $K$ transformations on some predictive distribution $q_\theta^k(x_{t+1}|x_{1:t})$ for $x \in \mathcal{X}$ denoted by $q_\theta^k(x)$ for brevity. Let
\begin{align*}
    M : q_\theta^k(x) \mapsto q_\theta^{k+1}(x) = \text{softmax}(f(x;T(\theta_k)))
\end{align*}
denote an invertible and differentiable transformation on the space of predictive distributions. Assume that all integrals exist and that the conditions for the change‐of‐variables theorem are met. Let $q_\theta^{k+1}(x) = M(q_\theta^k(x))$ denote an application of this transformation.

By the change-of-variables formula, we have
\begin{align}
    q_\theta^{k+1}(x) = q_\theta^k(x) \left|\frac{\partial M}{\partial q}\left(q_\theta^k(x)\right)\right|^{-1},
\end{align}
where $\left|\frac{\partial M}{\partial q}\left(q_\theta^k(x)\right)\right|$ denotes the determinant of the Jacobian matrix for the transformation $M$. Taking the natural logarithm of both sides gives:
\begin{align}
    \log q_\theta^{k+1}(x) = \log q_\theta^k(x) - \log \left|\frac{\partial M}{\partial q}\left(q_\theta^k(x)\right)\right|.
\end{align}
Now, let us take the expectation of both sides with respect to $q_\theta^{k+1}(x)$ and apply the definition for differential entropy:
\begin{align}
    H[q_\theta^{k+1}] = -\mathbb{E}_{q_\theta^{k+1}}\left[\log q_\theta^k(x)\right] + \mathbb{E}_{q_\theta^{k+1}}\left[\log \left|\frac{\partial M}{\partial q}\left(q_\theta^k(x)\right)\right|\right].
\end{align}
Since the change-of-variables formula ensures $p_y(y) dy = p_x(x) dx$, we can re-write the expectations on the right-hand side in terms of $q_\theta^{k}$ and obtain
\begin{align}
    H[q_\theta^{k+1}] = H[q_\theta^{k}] + \mathbb{E}_{q_\theta^{k}}\left[\log \left|\frac{\partial M}{\partial q}\left(q_\theta^k(x)\right)\right|\right],
\end{align}
which, after summing for $K$ iterations starting at $q_\theta^0(x)$, produces Equation~\ref{eq:entropy_total}:
\begin{align*}
    H[q_\theta^K] = H[q_\theta^0] + \sum_{k=0}^{K-1} \mathbb{E}_{x \sim q_\theta^k} \left[ \log \left| \frac{\partial M}{\partial q}\left(q_\theta^k(x)\right) \right| \right].
\end{align*}


% Suppose we have some arbitrary transition operator $M : (0, 1) \to (0, 1)$ that transforms some probability distribution $p_x(x)$. The change-of-variables formula, which defines 

\subsection{Deriving $M(q)$ under Gradient Descent}
\label{sec:mq}
Let $z = f(x; \theta)$ and $q(x) = \text{softmax}(z) = \sigma(z)$. Applying a single gradient descent step:
\begin{align} z' = f(x; \theta - \alpha \nabla_\theta \mathcal{L}(\theta)) \end{align}
Using a first-order Taylor expansion around $\theta$, we obtain:
\begin{align} z' \approx f(x; \theta) + \frac{\partial f(x; \theta)}{\partial \theta} (\theta' - \theta) \end{align}
From the gradient update rule $\theta' - \theta = -\alpha \nabla_\theta \mathcal{L}(\theta)$, this simplifies to:
\begin{align} z' \approx z - \alpha \frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta) \end{align}
Defining $M(q)$ as the transformed distribution after the update:
\begin{align} M(q) = q' = \sigma(z') \approx \sigma\left(z - \alpha \frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta)\right) \end{align}
To linearize around $q$, we expand:
\begin{align} q' \approx q + \frac{\partial \sigma(z)}{\partial z} (z' - z) \end{align}
With $\sigma$ as the softmax function, this leads to:
\begin{align} \boxed{M(q) \approx q - \alpha \left[\text{diag}(q) - qq^\top\right] \frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta),} \end{align}
where $q$ denotes the softmax probabilities produced by the network $f(x; \theta)$ given data $x$.
\subsection{Jacobian of $M(q)$}
The Jacobian $\frac{\partial}{\partial q} M(q)$ allows us to track volume changes, per update step, over $q_\theta(x) = \text{softmax}(f(x;\theta))$. Consider the Jacobian for the approximation from Appendix \ref{sec:mq}:
\begin{align} 
    \frac{\partial}{\partial q} M(q) \approx \frac{\partial}{\partial q}\left[q - \alpha \left[\text{diag}(q) - qq^\top\right] \frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta)\right]. 
\end{align}

Since $\frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta)$ is parametrized by $\theta$, we treat it as a constant and only consider the term $q - \alpha \left[\text{diag}(q) - qq^\top\right]$. It is simple to show that the Jacobian of this term with respect to $q$ is:
\begin{align}
    I - \alpha\left[E - G\right],
\end{align}
where $E_{ijk} = \delta_{ij} \cdot \delta_{jk}$ and $G_{ijk} = \delta_{ik}q_j + \delta_{jk}q_i$. To approximate the $\frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta)$ term, we assume that (1) the learning rate $\alpha$ is small and (2) $\frac{\partial f(x; \theta)}{\partial \theta}$ does not significantly vary with $q$ near the current parameter setting $\theta$. 

For the cross-entropy loss, the gradient with respect to the logits is given by:
\begin{align}
    \frac{\partial \mathcal{L}(\theta)}{\partial f} = q - y
\end{align}
for softmax probabilities $q$ and ground-truth one-hot encoded vectors $y$. As a result of the assumptions above, we choose the following approximation:
\begin{align}
    \frac{\partial f(x;\theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta) \approx q - y.
\end{align}
Combining these assumptions the Jacobian of the update mapping $M(q)$ can be approximated by
\begin{align}
    \boxed{\frac{\partial}{\partial q} M(q) \approx I - \alpha\left[E - G\right](q - y),}
\end{align}
where $E_{ijk} = \delta_{ij} \cdot \delta_{jk}$ and $G_{ijk} = \delta_{ik}q_j + \delta_{jk}q_i$.


\section{Fine-Tuning via Reinforcement Learning Decreases Entropy}
\label{sec:app_rl}

\subsection{Problem Formulation}

Let $\mc{V}$ be a finite vocabulary of tokens such that the set of possible token sequences is $\mc{L} = \bigcup\limits_{n=1}^\infty \mc{V}^n$. A fine-tuning dataset is generated by sampling a prompt and generating a corresponding response from some fixed ground-truth distribution. Let $\mu \in \Delta(\mc{L})$ be the distribution over prompts and let $p^\star: \mc{L} \ra \Delta(\mc{L})$ be the ground-truth response distribution. One might expect $p^\star$ to take on a natural factorized form such that, for any initial prompt $s \sim \mu$ and length-$T$ response $w = (w_1,w_2,\ldots,w_T)$, $p^\star(w \mid s) = \prod\limits_{t=1}^T p^\star(w_t \mid w_1, \ldots, w_{t-1}, s)$. We may write down a particular Markov decision process (MDP) in which any LLM is a policy of the MDP and the reward function is constructed such that learning the optimal MDP policy amounts to obtaining a LLM that matches $p^\star$.

Consider the infinite-horizon, discounted MDP $\mc{M} = \langle \mc{S}, \mc{A}, \mc{R}, \mc{T}, \mu, \gamma \rangle$. Here $\mc{S} = \mc{L}$ represents language (a sequence of tokens from $\mc{V})$ consisting of a prompt and some partial or complete response. The action space $\mc{A} = \mc{V} \cup \{\texttt{STOP}\}$ contains all valid tokens a LLM may emit as well as an explicit \texttt{STOP} token to denote the completion of a response. Logically, the MDP follows deterministic transition dynamics $\mc{T}: \mc{S} \times \mc{A} \ra \mc{S}$ which appends the selected token to the current state: $\mc{T}(s, a) = \langle s, a \rangle$. For simplicity, we obviate the explicit incorporation of a designated absorbing, terminal state $s_\perp$ that an agent will transition to immediately upon selecting the \texttt{STOP} action. $\mu \in \Delta(\mc{S})$ is an initial state distribution which aligns with the distribution over prompts above; thus, any initial state already contains the prompt and subsequent token selections are folded into the next state transitions. As usual, $\gamma \in [0,1)$ is the standard discount factor for communicating a preference between near-term and long-term reward. So far, we have specified a controlled Markov process (that is, a MDP without a reward function) such that any policy $\pi: \mc{S} \ra \Delta(\mc{A})$ represents a LLM that examines the prompt along with any partially-generated response thus far and emits a distribution over next tokens. 

To capture the objective of fine-tuning a LLM towards a ground-truth response distribution $p^\star$, we consider a policy-dependent reward function defined as $\mc{R}(s,a) = \log\left(\frac{p^\star(a \mid s)}{\pi(a \mid s)}\right)$. Recall that the performance of any policy $\pi$ with respect to a prompt $s \in \mc{S}$ is given by its associated value function $V^\pi(s) = \bE\left[\sum\limits_{t=0}^\infty \gamma^t \mc{R}(s_t, a_t) \mid s_0 = s\right]$. With a slight abuse of notation, we account for randomness in the initial state through $V^\pi(\mu) \triangleq \bE_{s_0 \sim \mu}\left[ V^\pi(s_0)\right]$. Recall that any policy induces a corresponding discounted stationary state visitation distribution $d^\pi_\mu(s) = (1-\gamma)\sum\limits_{t=0}^\infty \gamma^t \bP^\pi(s_t = s),$ where $\bP^\pi(s_t = \cdot) \in \Delta(\mc{S})$ is the distribution over states visited by policy $\pi$ at timestep $t$. Intuitively $d^\pi_\mu$ encodes which states policy $\pi$ will occupy using $\gamma$ to account for near-term vs future visitation. In the context of LLMs, $d^\pi_\mu$ encodes a distribution over prompts and partial/complete responses generated by a particular LLM $\pi$. A well-known fact is that $$V^\pi(\mu) = \bE\left[\sum\limits_{t=0}^\infty \gamma^t \mc{R}(s_t, a_t) \right] = \frac{1}{(1-\gamma)} \bE_{s \sim d^\pi_\mu}\left[\bE_{a \sim \pi(\cdot \mid s)}\left[\mc{R}(s,a)\right]\right].$$

We define the optimal policy $\pi^\star$ of $\mc{M}$ as achieving supremal value with associated value function $V^\star(\mu) = \sup\limits_{\pi \in \Pi} V^\pi(\mu)$, where $\Pi \triangleq \{\mc{S} \ra \Delta(\mc{A})\}$ denotes the class of all stationary, stochastic policies. For the particular choice of policy-dependent reward function, we see that an optimal policy $\pi^\star$ minimizes the KL-divergence between its own per-step token distribution and that of the ground-truth distribution $p^\star$: 
\begin{align*}
    V^\star(\mu) &= \sup\limits_{\pi \in \Pi} V^\pi(\mu) \\
    &= \sup\limits_{\pi \in \Pi} \frac{1}{(1-\gamma)} \bE_{s \sim d^\pi_\mu}\left[\bE_{a \sim \pi(\cdot \mid s)}\left[\mc{R}(s,a)\right]\right] \\
    &= \sup\limits_{\pi \in \Pi} \frac{1}{(1-\gamma)} \bE_{s \sim d^\pi_\mu}\left[\bE_{a \sim \pi(\cdot \mid s)}\left[\log\left(\frac{p^\star(a \mid s)}{\pi(a \mid s)}\right)\right]\right] \\
    &= -\inf\limits_{\pi \in \Pi} \frac{1}{(1-\gamma)} \bE_{s \sim d^\pi_\mu}\left[\kl{\pi(\cdot \mid s)}{p^\star(\cdot \mid s)}\right].
\end{align*}

Denote the visitation distribution of the optimal policy as $d^\star_\mu \triangleq d^{\pi^\star}_\mu$. As the KL-divergence is non-negative and achieves its minimum value when two distributions are equal, it follows that $\bE_{s \sim d^{\star}_\mu}\left[\kl{\pi^\star(\cdot \mid s)}{p^\star(\cdot \mid s)}\right] = 0$.

\subsection{Analysis}

LLM fine-tuning via reinforcement learning typically proceeds via policy search where any LLM can be seen as a parameterized policy $\pi_\theta: \mc{S} \ra \Delta(\mc{A})$ of the above MDP with parameters $\theta \in \Theta \subset \bR^d$. Let $\Pi_\Theta \triangleq \{\pi_\theta \mid \theta \in \Theta\} \subset \Pi$ denote the parameterized policy class. Our analysis proceeds using a smoothness assumption on $\Pi_\Theta$. 

Recall that a function $f: \bR^d \ra \bR$ is $\beta$-smooth if $$||\nabla f(x) - \nabla f(x')||_2 \leq \beta ||x-x'||_2 \qquad \forall x,x' \in \bR^d.$$ A consequence of this, either by Taylor's Theorem or Lemma 3.4 of \citet{bubeck2015convex}, is $$|f(x') - f(x) - \nabla f(x) \cdot (x'-x)| \leq \frac{\beta}{2} ||x'-x||_2^2 \qquad \forall x,x' \in \bR^d.$$

\begin{assumption}
    For all $\pi_\theta \in \Pi_\Theta$, the mapping $\theta \mapsto \log\left(\pi_\theta(a \mid s)\right)$ is $\beta$-smooth, $\forall (s,a) \in \mc{S} \times \mc{A}$.
    \label{assume:smooth}
\end{assumption}

Consider an iteration of fine-tuning $k$ with current policy parameters $\theta^{(k)}$ where we perform the following abstract policy gradient update $$\theta^{(k+1)} = \theta^{(k)} + \eta \omega^{(k)},$$ where $\eta \in \bR_{\geq 0}$ is a learning rate and $\omega^{(k)}$ is some vector for updating policy parameters (we will specify a concrete update momentarily). For brevity, we use the shorthand $\pi^k \triangleq \pi_{\theta^{(k)}}.$ Observe that Assumption \ref{assume:smooth} yields the following lemma

\begin{lemma}
    Under Assumption \ref{assume:smooth}, for any state-action pair $(s,a) \in \mc{S} \times \mc{A}$, $$\log\left(\frac{\pi^{k+1}(a \mid s)}{\pi^{k}(a \mid s)}\right) \geq \eta \nabla_\theta \log\left(\pi^k(a \mid s)\right) \cdot \omega^{(k)} - \eta^2 \frac{\beta}{2}||\omega^{(k)}||_2^2.$$
    \label{lemma:log_ratio}
\end{lemma}
\begin{proof}
    Notice that for a $\beta$-smooth function $f:\bR^d \ra \bR$, $$|f(x') - f(x) - \nabla f(x) \cdot (x'-x)| \leq \frac{\beta}{2} ||x'-x||_2^2 \implies f(x') - f(x) \geq \nabla f(x) \cdot (x'-x) - \frac{\beta}{2} ||x'-x||_2^2.$$ Applying this to our $\beta$-smooth policies (by Assumption \ref{assume:smooth}), we have 
    \begin{align*}
        \log\left(\frac{\pi^{k+1}(a \mid s)}{\pi^{k}(a \mid s)}\right) &= \log\left(\pi^{k+1}(a \mid s)\right) - \log\left(\pi^{k}(a \mid s)\right) \\
        &\geq \nabla_\theta \log\left(\pi^k(a \mid s)\right) \cdot \left(\theta^{(k+1)} - \theta^{(k)}\right) - \frac{\beta}{2}||\theta^{(k+1)} - \theta^{(k)}||_2^2 \\
        &= \eta \nabla_\theta \log\left(\pi^k(a \mid s)\right) \cdot \omega^{(k)} - \eta^2 \frac{\beta}{2}||\omega^{(k)}||_2^2.
    \end{align*}
\end{proof}

At this point, we specify a precise choice of policy-gradient update for $\omega^{(k)}$. For brevity, we write the value function induced by policy $\pi^k$ as $V^k \triangleq V^{\pi_{\theta^{(k)}}}$. Additionally, we define the action-value function as $$Q^k(s,a) \triangleq Q^{\pi_{\theta^{(k)}}}(s,a) = \bE\left[\sum\limits_{t=0}^\infty \gamma^t \mc{R}(s_t,a_t) \mid s_0 = s, a_0 = a\right] = \mc{R}(s,a) + \gamma V^k(\mc{T}(s,a)).$$ Consequently, the advantage function~\citep{baird1993advantage,sutton1998introduction} is defined as $A^k(s,a) \triangleq Q^k(s,a) - V^k(s).$ While the standard choice in the literature is Proximal Policy Optimization (PPO)~\citep{schulman2017proximal}, we study a simpler, special case of PPO more commonly known as advantage actor-critic~\citep{mnih2016asynchronous} (equivalent to running PPO for exactly one epoch per minibatch of on-policy data). We define the policy-gradient update at iteration $k$ as $$\omega^{(k)} = \frac{A^k(s,a)}{||\nabla_\theta \log\left(\pi^k(a \mid s)\right)||_2^2} \cdot \nabla_\theta \log\left(\pi^k(a \mid s)\right).$$ We assume that all policy-gradient updates have bounded norm.

\begin{assumption}
    For all iterations $k$, $||\omega^{(k)}||_2 \leq W$, for some $W \in \bR_{\geq 0}$.
    \label{assume:bounded_norm}
\end{assumption}
We may then obtain the following lemma:
\begin{lemma}
    At any iteration $k$, under Assumptions \ref{assume:smooth} and \ref{assume:bounded_norm}, $$\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^k_s} - \kl{\pi^\star_s}{\pi^{k+1}_s}\right] \geq (1-\gamma)\eta \bE_{s_0 \sim \mu}\left[V^\star(s_0) - V^k(s_0)\right] - \frac{\eta^2 \beta W^2}{2}.$$ 
    \label{lemma:exp_kl_diff_bound}
\end{lemma}
\begin{proof}
    \begin{align*}
        \bE_{s \sim d^\star_{\mu}}&\left[\kl{\pi^\star_s}{\pi^k_s} - \kl{\pi^\star_s}{\pi^{k+1}_s}\right] = \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[\log\left(\frac{\pi^\star(a \mid s)}{\pi^k(a \mid s)}\right)\right] - \bE_{a \sim \pi^\star(\cdot \mid s)}\left[\log\left(\frac{\pi^\star(a \mid s)}{\pi^{k+1}(a \mid s)}\right)\right]\right] \\
        &= \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[\log\left(\frac{\pi^{k+1}(a \mid s)}{\pi^k(a \mid s)}\right)\right]\right] \\
        &\geq \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[\eta \nabla_\theta \log(\pi^k(a \mid s)) \cdot \omega^{(k)} - \eta^2 \frac{\beta}{2} ||\omega^{(k)}||_2^2\right]\right] \\
        &= \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[\eta \nabla_\theta \log(\pi^k(a \mid s)) \cdot \frac{A^k(s,a)}{||\nabla_\theta \log(\pi^k(a \mid s))||_2^2} \cdot \nabla_\theta \log(\pi^k(a \mid s)) - \eta^2 \frac{\beta}{2} ||\omega^{(k)}||_2^2\right]\right] \\
        &= \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[\eta \cdot \frac{A^k(s,a) \cdot ||\nabla_\theta \log(\pi^k(a \mid s))||_2^2}{||\nabla_\theta \log(\pi^k(a \mid s))||_2^2} - \eta^2 \frac{\beta}{2} ||\omega^{(k)}||_2^2\right]\right] \\
        &= \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[\eta \cdot A^k(s,a) - \eta^2 \frac{\beta}{2} ||\omega^{(k)}||_2^2\right]\right] \\
        &\geq \eta \cdot \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[A^k(s,a)\right]\right]  -  \frac{\eta^2 \beta W^2}{2} \\
        &= (1-\gamma)\eta \bE_{s_0 \sim \mu}\left[V^\star(s_0) - V^k(s_0)\right] - \frac{\eta^2 \beta W^2}{2},
    \end{align*}
    where the first inequality follows from Assumption \ref{assume:smooth} and Lemma \ref{lemma:log_ratio} above, the second inequality follows from Assumption \ref{assume:bounded_norm}, and the final equation follows from the performance-difference lemma~\citep{kakade2002approximately}.
\end{proof}

Using the above lemma, we may follow similar steps as \citet{agarwal2021theory} to obtain a result that relates a current policy after $K$ iterations of policy-gradient updates to the initial policy $\pi^0$ using the KL-divergence with the optimal policy as a benchmark or ``metric'' for comparison.

\begin{theorem}
    For a total number of iterations $K \in \bN$, under Assumptions \ref{assume:smooth} and \ref{assume:bounded_norm}, we have $$\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^K_s}\right] \leq \bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^0_s}\right] + \frac{\eta^2\beta W^2 K}{2}.$$
    \label{thm:finetune_kl_decr}
\end{theorem}
\begin{proof}
    To start, first observe that 
    \begin{align*}
        \frac{1}{K} \sum\limits_{k=0}^{K-1} \bE_{s_0 \sim \mu}\left[V^\star(s_0) - V^k(s_0)\right] &= \frac{1}{K\eta(1-\gamma)} \sum\limits_{k=0}^{K-1} \eta (1-\gamma) \bE_{s_0 \sim \mu}\left[V^\star(s_0) - V^k(s_0)\right] \\
        &\leq \frac{1}{K\eta(1-\gamma)}  \sum\limits_{k=0}^{K-1} \left(\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^k_s} - \kl{\pi^\star_s}{\pi^{k+1}_s}\right] + \frac{\eta^2 \beta W^2}{2}\right) \\
        &= \frac{1}{K\eta(1-\gamma)}  \sum\limits_{k=0}^{K-1} \bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^k_s} - \kl{\pi^\star_s}{\pi^{k+1}_s}\right] + \frac{\eta \beta W^2}{2(1-\gamma)} \\
        &= \frac{1}{K\eta(1-\gamma)}  \left(\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^0_s} - \kl{\pi^\star_s}{\pi^{K}_s}\right]\right) + \frac{\eta \beta W^2}{2(1-\gamma)},
    \end{align*}
    where the inequality follows from Lemma \ref{lemma:exp_kl_diff_bound}. Observe that, by definition of the optimal policy, $\frac{1}{K} \sum\limits_{k=0}^{K-1} \bE_{s_0 \sim \mu}\left[V^\star(s_0) - V^k(s_0)\right] \geq 0$. So, we have $$0 \leq \frac{1}{K\eta(1-\gamma)}  \left(\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^0_s} - \kl{\pi^\star_s}{\pi^{K}_s}\right]\right) + \frac{\eta \beta W^2}{2(1-\gamma)}.$$ Multiplying through by $K \eta (1-\gamma)$ and rearranging terms, we see that $$\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^{K}_s}\right] \leq \bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^0_s}\right] + \frac{\eta^2\beta W^2 K}{2},$$ as desired.
\end{proof}

In the context of LLM fine-tuning, recall that the optimal policy $\pi^\star$ for the MDP $\mc{M}$ defined above is the ground-truth distribution $p^\star$ for the fine-tuning dataset. Moreover, recall that fine-tuning begins with an LLM/policy $\pi^0$ initialized with parameters obtained via supervised language modeling on some broader data distribution (for example, the Internet) with entropy (presumably) much larger than $p^\star$. For a sufficiently small learning rate $\eta \ll 1$, Theorem \ref{thm:finetune_kl_decr} tells us that fine-tuning with RL to obtain an LLM that more closely approximates a lower-entropy distribution $p^\star$ must necessarily bring the model farther away from the initial policy $\pi^0$ that closely matches a higher-entropy pre-training distribution.

% Let $\mc{V}$ be a finite vocabulary of tokens and let $L \in \bN$ be a maximum length (for instance, the size of a context window). We may then define a language of possible token sequences as $\mc{L} = \bigcup\limits_{\ell = 1}^L \mc{V}^\ell$. Consider the finite-horizon, episodic MDP $\mc{M} = \langle \mc{L}, \mc{V}, \mc{R}, \mc{T}, \mu, L \rangle$ where a state is any (possibly partial) language utterance $s \in \mc{L}$, an action is a single token $a \in \mc{V}$, a reward function $\mc{R}: \mc{L} \times \mc{V} \ra \bR_{\geq 0}$ maps language and next-token pairs to non-negative real values, a deterministic transition function $\mc{T}: \mc{L} \times \mc{V} \ra \mc{L}$ produces updated language, $\mu \in \Delta(\mc{L})$ is an initial state distribution, and $L$ is the horizon or maximum episode duration. The transition function simply appends the most recent token to the current language utterance: $\mc{T}(\ell' \mid \ell, a) = \indic(\ell' = \langle \ell, a \rangle).$ The initial state distribution $\mu$ represents a distribution over prompts. Observe that any stationary, stochastic policy $\pi: \mc{L} \ra \Delta(\mc{V})$ of MDP $\mc{M}$ is a language model. Practically, we may think of any realistic LLM with $d \in \bN$ parameters as a parameterized policy $\pi_\theta$ with $\theta \in \Theta \subset \bR^d$. The pretraining and fine-tuning of any LLM is equivalent to performing policy search over the restricted policy class $\Pi_\theta \subset \Pi$, where $\Pi_\theta \triangleq \{\pi_\theta \mid \theta \in \Theta\}$ and $\Pi \triangleq \{\mc{L} \ra \Delta(\mc{V})\}$.

% \dnote{TODO: Relative entropy policy search}

% Let $\pi_\beta \in \Pi_\theta$ denote a base model and let $\pi^\star_{\mathrm{fine}} \in \Pi$ denote the ground-truth response distribution for the fine-tuning dataset. Fine-tuning is equivalent to policy search with the reward function \dnote{Need to account for off-policy data from $\pi^\star_{\mathrm{fine}}$.}$$\mc{R}^{\pi_\theta}(\ell,a) = \log(\pi^\star_{\mathrm{fine}}(a \mid \ell)) - \lambda \log\left(\frac{\pi_\theta(a \mid \ell)}{\pi_\beta(a \mid \ell)}\right).$$

\section{Interpolating Between Two Distributions}
\label{sec:appendix_interpolate}
In this section, we discuss the resulting effects on entropy when we interpolate between two probability distributions assuming two different flavors of interpolation.

\subsection{Linear Interpolation}
Let $p_1(x_{t+1}\mid x_{1:t})$ denote a large, diverse corpus of internet text that is used to train a sufficiently parameterized base model such that $q_\theta^0(x_{t+1}\mid x_{1:t}) = p_1(x_{t+1}\mid x_{1:t})$. Now, suppose that we fine-tune this base model $q_\theta^0$ such that we mix the softmax probabilities over next-token predictions linearly with a fine-tuning dataset $p_2(x_{t+1}\mid x_{1:t})$ to produce some $q_\theta^\star(x_{t+1}|x_{1:t})$ such that
\begin{align}
    q_\theta^\star(x_{t+1}|x_{1:t}) = \alpha p_1(x_{t+1}\mid x_{1:t}) + (1 - \alpha) p_2(x_{t+1}\mid x_{1:t}),
\end{align}
for $\alpha \in [0,1]$. Since Shannon's entropy $\bH(\cdot)$ is concave over the space of probability distributions, it follows that
\begin{align}
    \min\{\bH(p_1), \bH(p_2) \} \leq \bH(\alpha p_1(x_{t+1}\mid x_{1:t}) + (1 - \alpha) p_2(x_{t+1}\mid x_{1:t})) \leq \max\{\bH(p_1), \bH(p_2) \},
\end{align}
and hence
\begin{align}
    \bH(p_2) \leq \bH(q_\theta^\star) \leq \bH(p_1)
\end{align}
provided that $\bH(p_1) \geq \bH(p_2)$.
% When fine-tuning LLMs to perform specific tasks, we are interested in optimizing
% \begin{align}
%     \min \; D_{\mathrm{KL}}\bigl(p_2 \,\|\, q_\theta\bigr) \;+\; \beta \, D_{\mathrm{KL}}\bigl(q_\theta \,\|\, q_0\bigr),    
% \end{align}
% where, in a sufficiently expressible $q_0$, we can assume
% \[
% q_0\bigl(x_{t+1}\mid x_{1:t}\bigr) \;=\; p_1\bigl(x_{t+1}\mid x_{1:t}\bigr),
% \]
% for some $p_1$ that is a large, diverse corpus of internet text (base model).

% Recent literature has suggested that linear interpolation between $p_1$ and $p_2$ produces sensible models (``linear weight mixing''), implying that softmax probabilities over next‐token predictions likely follow a linear path.

% Consider, by slight abuse, that this mixing between $p_1$ and $p_2$ is indeed linear, such that
% \begin{align}
%     q_\theta\bigl(x_{t+1} \mid x_{1:t}\bigr)
% \;=\;
% \alpha\,p_1\bigl(x_{t+1}\mid x_{1:t}\bigr)
% \;+\;
% (1-\alpha)\,p_2\bigl(x_{t+1}\mid x_{1:t}\bigr),
% \end{align}
% for $\alpha \in [0,1]$. Assume that $H(p_1) > H(p_2)$. By the concavity of Shannon's entropy, we can apply Jensen's inequality:
% \begin{align}
%     H(q)
% \;\ge\;
% \alpha\,H\bigl(p_1\bigr)
% \;+\;
% (1-\alpha)\,H\bigl(p_2\bigr).
% \end{align}
% Since $H(p_1) > H(p_2)$, we have
% \begin{align*}
% H(q) \;\ge\; H\bigl(p_2\bigr),
% \end{align*}
% where equality is implied if and only if $\alpha = 0$. Since $p_1 \neq p_2$ and entropy is strictly concave, the maximum entropy on the open line segment between $p_1$ and $p_2$ must occur at one of the endpoints. Again, since $H(p_1) > H(p_2)$:
% \begin{align*}
% H\bigl(q_\theta\bigr) \;\le\; H\bigl(p_1\bigr).
% \end{align*}
% Thus, when perform linear mixing between base models (pre-trained on a large, diverse corpus of internet text) and fine-tuning datasets (specific, human-aligned preferences), we have
% \begin{align*}
%     H\bigl(p_2\bigr) \;\le\; H\bigl(q_\alpha\bigr) \;\le\; H\bigl(p_1\bigr)
% \end{align*}
% which means fine-tuning is an entropy destroying process for linearly interpolating weights.

% - diverse data helps with generalization. 

\subsection{Beyond Linear Interpolation}

Consider two given probability distributions $P,Q \in \Delta(\mathcal{X})$ such that $X_1 \sim P$ and $X_2 \sim Q$. Let $Z \in \{1,2\}$ be a random index following an arbitrary distribution. Then, $$X_Z = \begin{cases} X_1 & Z = 1 \\ X_2 & Z = 2 \end{cases}$$ is a random variable denoting a sample from the mixture distribution between $P$ and $Q$ induced by $Z$. For example, consider $\bP(Z = 1) = \alpha$ and $\bP(Z = 2) = 1 - \alpha$ for $\alpha \in [0,1]$. Note that $X_1$ and $X_2$ are independent ($X_1 \perp X_2$). 

By the chain rule of mutual information, we have  $$\bI(X_Z; X_1, X_2, Z) = \bI(X_Z; Z) + \bI(X_Z; X_1 \mid Z) + \bI(X_Z; X_2 \mid Z, X_1).$$ Since $X_1 \perp X_2$, $\bI(X_Z; X_2 \mid Z, X_1) = \bI(X_Z; X_2 \mid Z)$. Recall that conditional mutual information first integrates out randomness in the conditioning random variable (in this case, $Z$). So, when $Z=1$,  $\bI(X_Z; X_1 \mid Z = 1) = \bI(X_1; X_1 \mid Z = 1) = \bH(X_1)$. Alternatively, when $Z = 2$, $\bI(X_Z; X_1 \mid Z = 2) = \bI(X_2; X_1 \mid Z = 2) = 0$. The same logic holds \textit{mutatis mutandis} for the second conditional mutual information term $\bI(X_Z; X_2 \mid Z)$. So, the above expression simplifies as 
\begin{align*}
    \bI(X_Z; X_1, X_2, Z) &= \bI(X_Z; Z) + \bI(X_Z; X_1 \mid Z) + \bI(X_Z; X_2 \mid Z) \\
    &= \bI(X_Z; Z) + \bP(Z = 1)\bH(X_1) + \bP(Z = 2) \bH(X_2) \\
    &= \bI(X_Z;Z) + \bH(X_Z \mid Z) \\
    &= \bH(X_Z) - \bH(X_Z \mid Z)  + \bH(X_Z \mid Z) \\
    &= \bH(X_Z).
\end{align*}

The above just formalizes the obvious conclusion that knowing $(X_1, X_2, Z)$ is sufficient for knowing everything about $X_Z$. Taking an alternative decomposition via the chain rule of mutual information, we have 
\begin{align*}
    \bI(X_Z; X_1, X_2, Z) &= \bI(X_Z; X_1) + \ubr{\bI(X_Z; X_2 \mid X_1)}_{= \bI(X_Z; X_2)} + \ubr{\bI(X_Z; Z \mid X_1, X_2)}_{\leq \bH(Z)} \\
    &\leq \bI(X_Z; X_1) + \bI(X_Z; X_2) + \bH(Z) \\
    &= \bH(X_1) - \bH(X_1 \mid X_Z) + \bH(X_2) - \bH(X_2 \mid X_Z) + \ubr{\bH(Z)}_{\leq \log(2) = 1}\\
    &\leq \bH(X_1) + \bH(X_2) + 1\\
    &\leq 2 \cdot \max\limits_{i \in \{1,2\}} \bH(X_i) + 1.
\end{align*}
Applying the identity above to this inequality yields $\bH(X_Z) \leq 2 \cdot \max\limits_{i \in \{1,2\}} \bH(X_i) + 1.$

Meanwhile, we also have 
\begin{align*}
    \bI(X_Z; X_1, X_2, Z) &= \bI(X_Z; X_1) + \ubr{\bI(X_Z; X_2 \mid X_1)}_{= \bI(X_Z; X_2)} + \ubr{\bI(X_Z; Z \mid X_1, X_2)}_{\geq 0} \\
    &\geq \bI(X_Z; X_1) + \bI(X_Z; X_2) \\
    &= \bH(X_1) - \bH(X_1 \mid X_Z) + \bH(X_2) - \bH(X_2 \mid X_Z).
\end{align*}
Since $X_1 \perp X_2$, either value of $Z$ results in $\bH(X_1 \mid X_Z) = \bH(X_2 \mid X_Z) = 0$. Thus,
\begin{align*}
    \bI(X_Z; X_1, X_2, Z) &\geq \bH(X_1) - \bH(X_1 \mid X_Z) + \bH(X_2) - \bH(X_2 \mid X_Z) \\
    &= \bH(X_1) + \bH(X_2) \\
    &\geq 2 \cdot \min\limits_{i \in \{1,2\}} \bH(X_i).
\end{align*}

In summary, $$2 \cdot \min\limits_{i \in \{1,2\}} \bH(X_i) \leq \bH(X_Z) \leq 2 \cdot \max\limits_{i \in \{1,2\}} \bH(X_i) + 1.$$

\newpage
\section{Instruction Following Results}\label{app:refusal}

% \begin{sidewaysfigure}
%     \centering
%     \includegraphics[height=0.65\textwidth, keepaspectratio]{visuals/benchmark-results.png}
%     \caption{SORRY-Bench results across different model merges. $\alpha=1$ is the no-mixing instruct model, and $\alpha=0$ is the base model.}
%     \label{fig:sorry-bench-results}
% \end{sidewaysfigure}

\begin{figure}[ht]
    \centering
    \rotatebox{90}{
        \includegraphics[height=0.55\textwidth, keepaspectratio]{visuals/benchmark-results.png}
    }
    \caption{SORRY-Bench results across different model merges}
    \label{fig:sorry-bench-results}
\end{figure}


\newpage
\section{Data Curation Details}\label{app:data-curation-details}

We curated the dataset using the MBPP+ benchmark~\citep{liu2024evaluatinglanguagemodelsefficient}. We used the full benchmark dataset of 378 questions. Compared to the benchmark used in the original paper, the official implementation excluded 21 tasks from the dataset that were prone to errors. The detailed parameter settings for data curation are summarized in Table~\ref{tab:data_curation}.

\begin{table}[h]
\centering
\caption{Data Curation Parameters}
\label{tab:data_curation}
\begin{tabular}{ll}
\toprule
\textbf{Parameter} & \textbf{Value} \\
\midrule
Dataset & MBPP+ \\
Subset & 378 questions as contained in the official benchmark harness implementation\footnotemark \\
Number of Samples per Question & 11 \\
Sampling Temperature & 0.7 \\
\bottomrule
\end{tabular}
\end{table}
\footnotetext{See: \texttt{\url{https://github.com/evalplus/evalplus}}}

\section{Evaluation on SORRY-Bench}\label{app:evaluation-details}

Generations were done using the default SORRY-Bench settings\footnote{See: \texttt{\url{https://github.com/sorry-bench/sorry-bench}}}. 

% \subsection{Evaluation of distilled reasoning models on TACO}

% We evaluate the distilled reasoning models trained on our various sets of curated data by generating multiple completions for each test case and measuring their correctness. The evaluation consists of the following main steps:

% \begin{enumerate}
%     \item \textbf{Data:} We use the test set from the TACO benchmark
%     \item \textbf{Generation:} We generate solutions using the fine-tuned reasoning models.
%     \item \textbf{Scoring:} We execute generated solutions against provided unit tests.
% \end{enumerate}

% Table~\ref{tab:evaluation_params} provides the parameters used to generate solutions for evaluation.

% \begin{table}[h]
% \centering
% \caption{Evaluation Parameters for Solution Generation}
% \label{tab:evaluation_params}
% \begin{tabular}{ll}
% \toprule
% \textbf{Parameter} & \textbf{Value} \\
% \midrule
% Test Dataset & TACO (BAAI/TACO) \\
% Difficulty Level & Easy \\
% Number of Samples per Query & 10 \\
% Sampling Temperature & 0.7 \\
% Top-P Sampling & 0.7 \\
% Maximum Tokens per Generation & 8192 \\
% \bottomrule
% \end{tabular}
% \end{table}

% \section{Fine-tuning Details}\label{app:finetuning-details}

% Fine-tuning was performed using SFT with LoRA for efficiency. The key fine-tuning parameters are detailed in Table~\ref{tab:finetuning}. We relied on the LLaMA-Factory library \citep{zheng2024llamafactory}.

% \begin{table}[h]
% \centering
% \caption{Fine-tuning Parameters}
% \label{tab:finetuning}
% \begin{tabular}{ll}
% \toprule
% \textbf{Parameter} & \textbf{Value} \\
% \midrule
% Model & \texttt{meta-llama/Llama-3.1-8B-Instruct} \\
% Fine-tuning Method & Supervised Fine-Tuning (SFT) with LoRA \\
% LoRA Target Layers & All \\
% LoRA Rank & 256 \\
% Preferred $\beta$ & 0.1 \\
% Max Sequence Length & 16,384 tokens \\
% Number of Training Epochs & 2.0 \\
% Batch Size & 1 \\
% Gradient Accumulation Steps & 12 \\
% Learning Rate & $1.0 \times 10^{-5}$ \\
% Warmup Ratio & 0.1 \\
% Precision Used & BF16 \\
% \bottomrule
% \end{tabular}
% \end{table}


\section{Model Merging}\label{app:merging}
\xhdr{LERP} LinEar inteRPolation is a technique used to interpolate weights between two vectors. This technique is a classic approach for merging different neural-network-based models~\citep{wortsman2022model,izmailov2019averagingweightsleadswider}. It consists of defining a model mixing coefficient $\alpha$ and then taking the weighted average across the weights. Explicitly, if we have two sets of weights $\mathbf{v_1}$ and $\mathbf{v_2}$, then we define the merged weights as $\mathbf{v_m} = \alpha \cdot \mathbf{v_1} + (1-\alpha)\mathbf{v_2}$. We use the open-source implementation that is part of the \texttt{mergekit}\footnote{See: \texttt{\url{https://github.com/arcee-ai/mergekit}}} project.

% \xhdr{SLERP} Spherical LinEar inteRPolation (SLERP) is a spherical extension of LERP that consists of averaging model weights on the unit sphere. It provides a smooth transition between vectors. Given two unit vectors \( \mathbf{v}_0 \) and \( \mathbf{v}_1 \), SLERP finds an interpolated vector \( \mathbf{v}(t) \) that smoothly transitions between \( \mathbf{v}_0 \) and \( \mathbf{v}_1 \) as \( t \) varies from 0 to 1.

% The SLERP formula is derived from the spherical representation of the vectors. Given two unit vectors \( \mathbf{v}_0 \) and \( \mathbf{v}_1 \), we compute the dot product:

% \begin{equation}
% \cos \theta_0 = \mathbf{v}_0 \cdot \mathbf{v}_1
% \end{equation}

% where $\theta_0$ is the angle between the two vectors. If the vectors are nearly parallel (\( \cos \theta_0 \approx 1 \)), linear interpolation (LERP) is used instead.

% To perform SLERP, the interpolated vector \( \mathbf{v}(t) \) at interpolation factor \( t \) is given by:

% \begin{equation}
% \mathbf{v}(t) = \frac{\sin((1-t) \theta_0)}{\sin \theta_0} \mathbf{v}_0 + \frac{\sin(t \theta_0)}{\sin \theta_0} \mathbf{v}_1
% \end{equation}

% where $\sin((1-t) \theta_0 / \sin \theta_0$ and \( \sin(t \theta_0) / \sin \theta_0 \) are the interpolation weights.


% If \( \mathbf{v}_0 \) and \( \mathbf{v}_1 \) are almost parallel (i.e., \( |\mathbf{v}_0 \cdot \mathbf{v}_1| > \text{threshold} \)), numerical instability can occur. In such cases, we approximate SLERP using linear interpolation:

% \begin{equation}
% \mathbf{v}(t) = (1-t) \mathbf{v}_0 + t \mathbf{v}_1
% \end{equation}

% which provides a sufficiently accurate result when the angle between the vectors is small.

\end{document}
