\documentclass{uai2025} % for initial submission
\usepackage{amsthm, amssymb}
%\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\usepackage{rotating} 
\usepackage{xcolor}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools,bbm,amsmath} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{cleveref}
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\newcommand{\xhdr}[1]{\vspace{1.7mm}\noindent\textbf{#1.}}

\usepackage{subcaption}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newcommand{\mc}[1]{\mathcal{#1}}
\newcommand{\ra}{\rightarrow}
\newcommand{\bP}{\mathbb{P}}
\newcommand{\bI}{\mathbb{I}}
\newcommand{\bE}{\mathbb{E}}
\newcommand{\bH}{\mathbb{H}}
\newcommand{\bR}{\mathbb{R}}
\newcommand{\bN}{\mathbb{N}}
\newcommand{\bV}{\mathbb{V}}
\newcommand{\kl}[2]{D_{\mathrm{KL}}(#1 \mid\mid #2)}
\newcommand{\js}[2]{D_{\mathrm{JS}}(#1 \mid\mid #2)}
\newcommand{\ubr}[1]{\underbrace{#1}}
\newcommand{\indic}{\mathbbm{1}}

\newtheorem{assumption}{Assumption}
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}

\newif\ifsubmit
\submitfalse
% \submittrue
\ifsubmit
\newcommand{\dnote}[1]{}
\newcommand{\gnote}[1]{}
\newcommand{\vnote}[1]{}
\newcommand{\bnote}[1]{}
\newcommand{\lnote}[1]{}
\else
\newcommand{\dnote}[1]{\textcolor{blue}{Dilip: #1}}
\newcommand{\gnote}[1]{\textcolor{green}{Gianluca: #1}}
\newcommand{\vnote}[1]{\textcolor{purple}{Veniamin: #1}}
\newcommand{\bnote}[1]{\textcolor{orange}{Benedikt: #1}}
\newcommand{\lnote}[1]{\textcolor{green!50!black}{Lisa: #1}}
\fi

\newcommand{\gianluca}[1]{{\color{red}#1}}

\newcommand{\llamab}{Llama-3.1-8b}
\newcommand{\llamaf}[1]{Llama-8b-HM-distill$_{\alpha = {#1}}$}

\newcommand{\llamai}{Llama-8b-Instruct}
\newcommand{\llamad}{R1-Llama-8b}
\newcommand{\gptmini}{gpt-4o-mini}
\newcommand{\gpt}{gpt-4o}

\title{Hindsight Merging: Diverse Data Generation with Language Models}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2025 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Harry~Q.~Bovik}
\author[1,2]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[1]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cranberry University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Second Affiliation\\
    Address\\
    …
}
\affil[3]{%
    Another Affiliation\\
    Address\\
    …
  }
  
  \begin{document}
\maketitle

\begin{abstract}
% Language models are pre-trained to capture broad, internet-level statistics and then fine-tuned into helpful and harmless assistants.
Pre-training a language model equips it with a broad understanding of the world, while fine-tuning refines it into a helpful assistant. 
However, fine-tuning does not exclusively enhance task-specific behaviors but also suppresses some of the beneficial variability from pre-training. This reduction in diversity is partly due to the optimization process, which theoretically decreases model entropy in exchange for task performance. To counteract this, we introduce \emph{hindsight merging}, a technique that combines a fine-tuned model with a previous training checkpoint to restore entropy and improve performance. Hindsight-merged models retain strong instruction-following capabilities and alignment while displaying increased reasoning diversity. This results in improved inference scaling, achieving a 4.2\% increase in pass@$10$ compared to the original R1 distilled Llama-8b model. Moreover, we demonstrate that distilling smaller reasoning models from hindsight-merged models leads to a 52\% relative increase in performance over the baseline R1 distilled model. 
Our findings suggest that hindsight merging is an effective strategy for generating diverse reasoning traces and enhancing reasoning performance.


% Finetuning language models is one of they key approaches for teaching language models how to follow instructions. While beneficial, this process comes at the cost of lost diversity. 

% - Human reasoning diverse. aligned LLMs not so much. 

% - when language models are fine-tuned they learn behaviors necessary for use, but lose the diversity of the pretraining dataset. i.e., some of the reduction in diversity not bad. 

% - Optimization destroys entropy (theoretical). 
% Rolling back closer to the base model increase entropy (theoretical).

% - we call this process of rolling back the model as hindsight merging. 

% - hindsight merged models exhibit the best of both their counterparts: (1) more diversity, (2) good instruction following. 
% combined, this makes the merged models exhibit better inference scaling laws than the original distilled llama model resulting in a 4.2\% increase in the pass@$10$. 

% % 
% % - Generating synthetic reasoning traces from this improves performance and diversity in reasoning models (experimental). 

% - we next show a practical use case of hindsight merged models, by distilling a smaller reasoning model. This process of distillation leads to a 31\% increase in performance compared to the instruct model.  

% - taken together, our findings suggest that hindsight merging is a good technique for collecting diverse synthetic reasoning data. 
\end{abstract}


\section{Introduction}\label{sec:intro}
Humans solve a wide variety of problems by reusing previously learned knowledge and applying diverse patterns of thinking \citep{griffiths2019doing}. This ability to adapt and explore multiple cognitive pathways allows human reasoning to converge to correct solutions over time \citep{collins2013cognitive,tomov2020discovery,solway2014optimal,maisto2015divide,correa2023humans}. 
The adaptive reuse of available resources is not just a hallmark of human intelligence but also a key ingredient in the design of reliable artificial intelligence (AI) systems. In many AI applications, diversity plays a crucial role. Repeated sampling from language models relies on a wide variety of generated responses to enhance performance~\citep{brown2024large}. Compound AI systems~\citep{compound-ai-blog} improve their scalability and inference capabilities when diverse data inputs and models are integrated. However, while diversity enhances both human reasoning and AI training, the challenge of generating synthetic datasets with sufficient richness and variation remains an unresolved problem.

The primary challenge can be understood through the lens of optimization. Both humans and language models are known to directly fit their training data~\citep{hasson2020direct}, but the breadth of human experience leads to more diversity in our ability to reason. Pre-training large language models (LLMs) on vast and diverse internet corpora produces base models that generate a wide variety of outputs. However, we remove much of this diversity when we fine-tune due to the use of smaller datasets and the objective of aligned behavior~\citep{murthy2024one}. This creates a paradox: the optimization that improves task performance undermines the model's ability to generate the broad range of outcomes necessary for rich and diverse synthetic datasets. Simply put, the best model for solving a task is not necessarily the best model for generating a dataset.

\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{visuals/figure_1.pdf}
    \caption{Overview of our main findings.}
    \label{fig:fig1}
\end{figure}


% Despite the importance of diverse sampling for many applications, most modern synthetic data pipelines generate samples from AI systems that have been instruction tuned. These models excel at following directives but often suffer from mode collapse, where their generations become overly deterministic and lack variation [CITE]. This tension between instruction-following and diversity presents a fundamental challenge: how can we create models that generate diverse outputs while still adhering to instructions?

% - Humans are inherently diverse --- our language [CITE], thought processes [CITE], and problem-solving approaches [CITE]. [include some stuff about how humans benefit from diversity in problem solving] 

In this work, we explore this trade-off between optimizing on task-specific datasets and producing diverse synthetic datasets. We introduce a theoretically-motivated approach to increasing diversity, \emph{hindsight merging}, that merges an instruction-tuned model with prior training checkpoints to better accommodate the trade-off required for diverse dataset generation (see Figure~\ref{fig:fig1}). Through hindsight merging, we demonstrate that the resulting data is more diverse, maintains instruction following abilities, and has improved pass@$k$ performance compared to the pre-merged models. We conclude with an example of how hindsight merging can be used to distill reasoning models. By distilling Llama-3.1-Instruct, we show that data generated by our merged models leads to a 52\% relative gain in performance compared to distilling on reasoning traces from \llamad{}. 
%In this work, we explore this trade-off and introduce \emph{hindsight merging}, an approach that combines model merging with prior checkpoints of an instruction-tuned model. Through this technique, we recover the diversity of the base model while retaining its ability to follow instructions. Our analysis shows [summary of key results]


% - from a mechanistic interpretability point of view, these findings add to recent work showing that instruction tuned models representations can be composed with base models [CITE]. 

% - Hindsight-merged models produce more diverse data while retaining instruction following capability.

% - the models get better pass@$k$ than the instruct model, but a worse pass@$1$. 


% - Finally, we distill reasoning models from our collected datasets, and find that performance on downstream coding tasks improves with our more diverse data. 

% - [describe why reasoning models]

% Once we want to generate many generations for the same request, we require a new approach for 

% - reasoning in models is important. an extension that has long been shown to work in games. 

% - in reasoning, the quality of the underlying data is key. 

% - the original approach was to use human annotated data. 

% - now, increasingly, we are turning to synthetic data to create reasoning traces. 

% - however, synthetic data curation methods don't produce enough diversity [CITE: llm reasoners, best of n, inference scaling]

% - semantic vs. syntactic diversity 

% - accordingly, techniques like divpo have been designed to filter the data. 

% - gap: how to get diverse thoughts in the first place? this paper proposes a new technique for diversity-inducing sampling methods. 

% - we propose and evaluate three techniques for inducing diversity in the synthetic data: dynamic temperature scaling, sampling from the base-model, context cutoff. 


% - Diversity is key in many applications of machine learning models. 



% Synthetic data has emerged as a valuable resource across a broad spectrum of machine learning applications [CITE].


% The generation of fully synthetic data traces has unlocked numerous possibilities in the field, enabling advances in model distillation [CITE], reasoning capabilities [CITE], and novel training paradigms such as self-play [CITE].


% Despite these advantages, current synthetic data generation techniques face significant limitations, particularly in terms of output diversity. This lack of diversity poses a fundamental challenge to the effectiveness of synthetic data-based approaches.


% - Moreover, instruction-tuned language models are predominantly used to generate synthetic data due to their ability to produce higher-quality data traces. However, this approach comes with a significant drawback: these models tend to generate even less diverse outputs than their base counterparts [CITE].

% - In this paper, we introduce ``hindsight merging,'' a novel approach to synthetic data generation. Our method leverages a key observation: base language models (prior to instruction tuning) exhibit higher entropy compared to their instruction-tuned counterparts [CITE]. By strategically merging these two model types, we combine their complementary strengths --- the high-quality outputs characteristic of instruction-tuned models with the enhanced diversity inherent in base model generations.

% The focus of most sampling methods has been disproportionately on creating good single-use generations, e.g., 

% For these generations, sample diversity is often not a problem. 


% gianluca notes:
% - diverse outputs with verifiably correct responses is a key indication of generalization.



% contributions: 

% \begin{enumerate}
%     \item Provide a theoretical explanation for why sampling diversity is reduced 
%     \item Evaluate several sampling approaches for generating diverse reasoning traces
% \end{enumerate}

\section{The Entropy Spectrum}\label{sec:entropy}
\begin{figure*}[ht]
    \centering
    \includegraphics[width=\textwidth]{visuals/entropy.png}
    \caption{
    Analysis of theoretical volume changes for SGD based on logit rank (1-10) and softmax entropy in next-token predictions (top$_k$ = 600). Using 10 random Arxiv abstracts from January 2025, we compute next-token predictive distributions and compute the Jacobian of the SGD step when the correct token corresponds to each logit.}
    \label{fig:entropy}
\end{figure*}

While fine-tuning language models is crucial for achieving high performance on downstream tasks, it requires specializing the response distribution away from the broad, diverse data that was well-approximated during pre-training. Suppose pre-training involved fitting language model responses to match a distribution $P$. Later, fine-tuning the pre-trained language model demands matching a new response distribution $Q$. Letting $\bH(\cdot)$ denote the entropy of a distribution~\citep{shannon1948mathematical,cover2012elements}, it is to be expected that $\bH(Q) < \bH(P)$; indeed, given that current best practices often take $P$ as an Internet-scale distribution over prompt-response pairs, it may be more realistic to consider $\bH(Q) \ll \bH(P)$. During fine-tuning, a language model is being optimized away from accurately approximating $P$ to fit $Q$, which forces a decrease in entropy to accommodate the new response distribution. Such a drop in entropy naturally begets a commensurate drop in overall response diversity.

This section attempts to make this intuitive idea mathematically precise in two concrete ways. First, we focus on supervised fine-tuning and adopt a local perspective by studying the change in entropy per optimization step. Second, we adopt a reinforcement-learning perspective to implicitly consider how entropy changes after multiple iterations of policy optimization. Taken together, these theoretical findings corroborate the intuitive notion that fine-tuning decreases entropy and, in doing so, reduces the diversity of language model responses. We conclude with an information-theoretic motivation for a simple remedy to this vanished diversity: mixing response distributions between the current fine-tuned model and the pre-trained base model.

\subsection{Supervised Fine-Tuning}

Consider a neural network $q_\theta(x_{t+1} | x_{1:t}) = \text{softmax}(f(x_{1:t}; \theta))$, parameterized by $\theta \in \mathbb{R}^D$, that models a predictive distribution over next-token generations. We denote $q_\theta(x_{t+1} | x_{1:t})$ as $q_\theta(x)$ for brevity. Let us define parameter updates during fine-tuning as a transition operator $T: \mathbb{R}^D \to \mathbb{R}^D$, which induces the following mapping $M$ on the output space: 
\begin{align}
    M : q_\theta^k(x) \mapsto q_\theta^{k+1}(x)
\end{align}
where $q_\theta^{k+1}(x) = \text{softmax}(f(x_{1:t};T(\theta_k)))$ after applying the transition $\theta_{k+1} = T(\theta_k)$. The total entropy of the predictive distribution after $K$ update steps from some base model $q_\theta^0(x)$ is given by:
\begin{align}
    \label{eq:entropy_total}
    \bH(q_\theta^K) = \bH(q_\theta^0) + \sum_{k=0}^{K-1} \mathbb{E}_{x \sim q_\theta^k} \left[ \log \left| \frac{\partial M}{\partial q}\left(q_\theta^k(x)\right) \right| \right],
\end{align}
where $\left| \partial M / \partial q \right|$ denotes the Jacobian determinant of $M$ at each update step (see Appendix \ref{sec:appendix_entropy_jacobian} for details). For stochastic gradient descent (SGD), the transition operator $T$ is defined as
\begin{align}
    \theta_{k+1} \leftarrow \theta_k - \alpha\nabla_\theta\mathcal{L}(\theta_k), \label{eq:sgd}
\end{align}
where $\alpha$ is the learning rate. Although $M$ does not admit a closed form solution, we can approximate it using a first-order Taylor expansion for small $\alpha$ (see Appendix \ref{sec:appendix_entropy_jacobian} for details):
\begin{align}
    \label{eq:M}
    M(q) \approx q - \alpha \left[\text{diag}(q) - qq^\top\right] \frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta).
\end{align}
Taking the Jacobian yields (see Appendix \ref{sec:appendix_entropy_jacobian} for derivation):
\begin{align}
    \label{eq:jacobian}
    \bH(q_\theta^{k+1}) - \bH(q_\theta^{k}) \approx \mathbb{E}_{x \sim q_\theta^k}\left[ \log \left| I - \alpha H \right| \right],
\end{align}
where $H = \left[E - G\right]\frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta)$ for $E_{ijk} = \delta_{ij} \cdot \delta_{jk}$ and $G_{ijk} = \delta_{ik}q_j + \delta_{jk}q_i$. This approximation denotes how much entropy changes per update step using SGD.

During supervised fine-tuning (SFT), the goal is to align the model's next-token predictions with a target data-generating process, $p^*(x_{t+1}|x_{1:t})$, which is typically smaller and less diverse than the original pre-training data. In the most common case, we perform SFT with the cross-entropy loss. The gradient term from Equation \ref{eq:M} can be approximated by the expression
\begin{align}
\frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta) \approx (q - y),
\end{align}
where $q$ denotes model output probabilities and $y$ represents the one-hot encoded ground-truth labels for the input $x$. Note that this approximation captures the classical gradient for cross-entropy, but only under the assumption of a small $\alpha$ and the dominance of logit-level loss terms in the gradient flow. While simplifying, this approximation allows us to analyze how entropy over next-token predictions evolves at each step of the optimization process using Equation~\ref{eq:jacobian}.

In Figure \ref{fig:entropy}, we show approximate changes to model entropy, indicated by $\left| \partial M / \partial q \right|$. To build an intuition, when $\left| \partial M / \partial q \right| > 1$, the volume of the predictive distribution $q_\theta(x_{t+1}|x_{1:t})$ is expanding, implying an increase in entropy over model generations given the token history $x_{1:t}$. When $\left| \partial M / \partial q \right| < 1$, volume is shrinking, implying a decrease in entropy. 

Optimizing the cross-entropy loss affects model behavior differently depending on the rank of the ground-truth logits. For rank-1 predictions, all models contract in volume, thereby systematically reducing the entropy of conditional distributions for next-token predictions as they begin to fit the data perfectly. In this case, entropy decrease is a symptom of overfitting. For rank-2 through rank-10 predictions, lower-entropy models show volume expansion whereas higher entropy models continue to contract. Regardless of rank, high-entropy models consistently exhibit volume contraction, with less aggressive contractions as the prediction rank increases. This exhibits the inherently entropy-destroying process of optimization when trained on long trajectories with overparameterized models that can consistently make rank-1 predictions relative to ground-truth data.

\subsection{Reinforcement Learning with Human Feedback}

Aside from the supervised approach examined in the preceding section, an alternative route to fine-tuning a language model uses reinforcement learning~\citep{stiennon2020learning,ouyang2022training}. Compared to the local analysis in the previous sub-section, which characterizes how entropy changes per optimization step, this section presents a more global picture. As the reinforcement learning with human feedback (RLHF) pipeline carries various adornments that complicate analysis, this section restricts its focus to a simpler Markov decision process (MDP) wherein learning an optimal policy is equivalent to learning the per-token distribution of some underlying fine-tuning response distribution. Under mild assumptions, we establish a theoretical result that highlights how policy-gradient updates that fine-tune towards a lower-entropy response distribution implicitly decrease the entropy in language model responses relative to an initial pre-trained base model. We encourage readers to consult Appendix~\ref{sec:app_rl} for all technical details.

Let $\mc{V}$ be a finite vocabulary of tokens such that the set of possible token sequences is $\mc{L} = \bigcup\limits_{n=1}^\infty \mc{V}^n$. Let $\mu \in \Delta(\mc{L})$ be the distribution over prompts and let $p^\star: \mc{L} \ra \Delta(\mc{L})$ be the ground-truth response distribution given any prompt. 
% One might expect $p^\star$ to take on a natural factorized form such that, for any initial prompt $s \sim \mu$ and length-$T$ response $w = (w_1,w_2,\ldots,w_T)$, $p^\star(w \mid s) = \prod\limits_{t=1}^T p^\star(w_t \mid w_1, \ldots, w_{t-1}, s)$. 
Next, we specify an MDP in which any LLM is a policy and the reward function is constructed such that learning the optimal policy amounts to obtaining an LLM that matches $p^\star$.

Consider the infinite-horizon, discounted MDP~\citep{bellman1957markovian,Puterman94} $\mc{M} = \langle \mc{S}, \mc{A}, \mc{R}, \mc{T}, \mu, \gamma \rangle$. Here, the state space $\mc{S} = \mc{L}$ consists of all token sequences over $\mc{V}$. The action space $\mc{A} = \mc{V} \cup \{\texttt{STOP}\}$ contains all valid tokens an LLM may emit as well as an explicit \texttt{STOP} token to denote response completion. Logically, the MDP follows deterministic transition dynamics $\mc{T}: \mc{S} \times \mc{A} \ra \mc{S}$ which append the selected token to the current state: $\mc{T}(s, a) = \langle s, a \rangle$.\footnote{For brevity, we omit an absorbing, zero-reward terminal state that an agent transitions to upon choosing the \texttt{STOP} action.} The initial state distribution $\mu \in \Delta(\mc{S})$ is precisely the distribution over prompts from above. The discount factor $\gamma \in [0,1)$ conveys the effective time horizon for optimizing rewards. So far, we have specified a controlled Markov process (that is, an MDP without a reward function) such that any policy $\pi: \mc{S} \ra \Delta(\mc{A})$ represents an LLM that examines the prompt along with any partially generated response thus far and emits a distribution over next tokens.

To capture the objective of fine-tuning an LLM towards a ground-truth response distribution $p^\star$, we consider a policy-dependent reward function defined as $\mc{R}(s,a) = \log\left(\frac{p^\star(a \mid s)}{\pi(a \mid s)}\right)$. Recall that the performance of any policy $\pi$ with respect to a prompt $s \in \mc{S}$ is given by its associated value function $V^\pi(s) = \bE\left[\sum\limits_{t=0}^\infty \gamma^t \mc{R}(s_t, a_t) \mid s_0 = s\right]$. With a slight abuse of notation, we account for randomness in the initial state through $V^\pi(\mu) \triangleq \bE_{s_0 \sim \mu}\left[ V^\pi(s_0)\right]$. Likewise, any policy induces a corresponding discounted stationary state visitation distribution $d^\pi_\mu(s) = (1-\gamma)\sum\limits_{t=0}^\infty \gamma^t \bP^\pi(s_t = s)$, where $\bP^\pi(s_t = \cdot) \in \Delta(\mc{S})$ is the distribution over states visited by policy $\pi$ at timestep $t$. Intuitively, $d^\pi_\mu$ encodes which states policy $\pi$ will occupy, using $\gamma$ to account for near-term versus future visitation. In the context of LLMs, $d^\pi_\mu$ encodes a distribution over prompts and partial/complete responses generated by a particular LLM $\pi$.

We define the optimal policy $\pi^\star$ of $\mc{M}$ as achieving supremal value with associated value function $V^\star(\mu) = \sup\limits_{\pi} V^\pi(\mu)$. For the particular choice of policy-dependent reward function, we see that an optimal policy $\pi^\star$ minimizes the KL-divergence between its own per-step token distribution and that of the ground-truth distribution $p^\star$: $V^\star(\mu) = -\inf\limits_{\pi} \frac{1}{(1-\gamma)} \bE_{s \sim d^\pi_\mu}\left[\kl{\pi(\cdot \mid s)}{p^\star(\cdot \mid s)}\right].$

\begin{theorem}[Informal]
    Let $\pi^0$ be an initial, pre-trained base model and $\pi^K$ be the fine-tuned LLM after $K \in \bN$ iterations of policy-gradient updates. Then, we have \[\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^K_s}\right] \lesssim \bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^0_s}\right].\]
    % $$\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^K_s}\right] \leq \bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^0_s}\right] + \frac{\eta^2\beta W^2 K}{2}.$$
    \label{thm:finetune_kl_decr}
\end{theorem}
\vspace{-15pt}

As the optimal policy of the MDP $\mc{M}$ is $p^\star$, Theorem~\ref{thm:finetune_kl_decr} affirms that making policy-gradient updates~\citep{sutton1999policy} brings the fine-tuned LLM $\pi^K$ closer (in KL-divergence) to the lower-entropy response distribution $p^\star$ than the higher-entropy pre-trained model $\pi^0$ is.

\subsection{Toward Recovering Diversity}

% \gianluca{\subsection{General Conditions}

% I think this section is optional but it may be nice to show talk about how if we have two distributions and we interpolate between them, entropy will lie between the two distributions. I have a simple proof in the appendix for linear interpolation between two distributions. Dilip has a better one for geodesic mixes that seems to be more general. 

% I think a section like this might be inuitive and motivating. It motivates the idea that we need to train from high-entropy distributions. It will probably cover our asses a bit if the reviewers do not understand what we are trying to say with the optimization math.
% }

Just as the reduction of entropy formalized in the previous two sub-sections is an intuitive consequence of language model fine-tuning, we use this final sub-section to motivate an equally-intuitive solution: mixing model parameters. Prior work establishes a link between the blending of model parameters and a commensurate blending of the associated model response distributions~\citep{kangaslahti2024continuous}. One might naturally hope to obtain the ``best of both worlds'' by mixing weights of a pre-trained language model fit to a high-entropy response distribution and those of a fine-tuned model closely approximating a low-entropy response distribution. We may formalize this intuition with a first proposition that considers an obvious linear interpolation between response distributions:
\begin{proposition}
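Consider two arbitrary probability distributions $P$ and $Q$ such that $\bH(P) \geq \bH(Q)$. For any $\alpha \in [0,1]$, define $M_\alpha = \alpha \cdot P + (1-\alpha) \cdot Q$. Then, \[\bH(Q) \leq \bH(M_\alpha) \leq \bH(P).\]
% NOTE(review): the lower bound follows from concavity of entropy, but the upper bound appears to fail in general. Example: P uniform on {1,2}, Q uniform on {3,4}, alpha = 1/2 gives H(M_alpha) = 2 bits > H(P) = 1 bit. Concavity arguments only give H(M_alpha) <= alpha H(P) + (1-alpha) H(Q) + h(alpha). Please verify against the appendix proof and add any needed assumption.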
\end{proposition}
A more general information-theoretic analysis allows us to obtain looser bounds on the entropy of a response sampled from a mixture distribution, going beyond just linear interpolation:
\begin{proposition}
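Consider two arbitrary probability distributions $P, Q \in \Delta(\mc{X})$ with $X_1 \sim P$ and $X_2 \sim Q$. Let $Z \in \{1,2\}$ be a random index following an arbitrary distribution. Then, $X_Z$ is a random variable denoting a sample from the mixture distribution between $P$ and $Q$ induced by $Z$. Moreover, \[2 \cdot \min\limits_{i \in \{1,2\}} \bH(X_i) \leq \bH(X_Z) \leq 2 \cdot \max\limits_{i \in \{1,2\}} \bH(X_i) + 1.\]
% NOTE(review): the factor 2 in the lower bound looks incorrect. If P = Q then H(X_Z) = H(P) < 2 H(P) whenever H(P) > 0. Assuming Z is independent of (X_1, X_2), conditioning on Z gives min_i H(X_i) <= H(X_Z) <= max_i H(X_i) + 1. Please verify against the appendix proof.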
\end{proposition}

Proofs may be found in Appendix \ref{sec:appendix_interpolate}. Altogether, these two propositions highlight one promising pathway to recovering the response diversity lost due to standard fine-tuning practices; namely, by blending the response distributions of the current fine-tuned language model and the pre-trained base model. In the next section, we present our hindsight merging approach that uses interpolation between respective model weights to achieve this diverse mixture of response distributions.



% This result is both interpretable and motivating. As models converge to smaller fine-tuning datasets that inherently have less diversity, models will lose significant entropy as they match top-1 tokens. This results in a entropy spectrum over the optimization trajectory. Regularization and different RL policies try to mitigate this, also temperature scaling. However, this is a systemic result as we optimize and match our input distribution. [Tie in the Dilip info theory result]

% This motivates our method in the sense that merging with the higher entropy base model will produce higher entropy intermediate models. This is an easy and natural way for producing more diverse reasoning traces, as we do in our case. This problem, under different losses and also in the RL setting, is ripe for further investigation. 

%This process can be idealized as minimizing a composite objective involving two Kullback-Leibler (KL) divergences:
%\begin{align} 
%\min_\theta D_{KL}\bigl(p \ || \ q_\theta\bigr) + \beta D_{KL}\bigl(q_\theta \ || \ q_0\bigr), \end{align}
%where $p$ denotes the empirical distribution of a specialized dataset, $q_\theta$ represents the model's output distribution parameterized by $\theta$, and $q_0$ is some pre-trained base model that we regularize towards to prevent overfitting and promote diversity in $q_\theta$.  



\section{Diverse Generation}

In Section \ref{sec:entropy}, we showed that fine-tuning on low diversity datasets leads to low diversity data generations in fine-tuned models. 
We propose \emph{hindsight merging}---interpolating between the weights of the base model, which has high diversity, and the fine-tuned model---to improve diversity in the generations.

Mixing weights leverages the fact that during fine-tuning, the model is likely in the neural tangent kernel (NTK) regime~\citep{fort2020deep, wortsman2022robust, wortsman2022model, ilharco2022editing}. When the model is in the NTK regime, the functional updates are approximately linear. Therefore, by interpolating between the weights, we approximately roll-back the model along its optimization trajectory to previous checkpoints that capture the desirable results of fine-tuning but at higher levels of entropy. 

%\cite{ilharco2022patchingopenvocabularymodelsinterpolating} found that interpolating between the weights preserves performance on tasks unrelated to the fine-tuning task.

% when the model is in the ntk regime, interpolating weights is directly related to ensembling is described in \cite{wortsman2022robust,\cite{wortsman2022model}
% \gianluca{Section 2 has a very basic takeaway that we try to prove rigorously by approximating entropy dynamics during neural net train under different conditions: fine-tuning on datasets that are low diversity leads to models that are low diversity which leads to bad synthetic generations! If we open up with this, it necessitates the need to \emph{roll the model back} and sample from there.}

% {It'd be nice to say that we can interpolate between the two models \cite{wortsman2022robust, ilharco2022patchingopenvocabularymodelsinterpolating} because we're likely in the NTK regime (appendix A in \cite{ilharco2022editing} The NTK regime means that the training dynamics are roughly linear. This means that what we're doing either (1) is an ensembling of the two models, and/or (2) roughly provides us with the weights of an earlier checkpoint.}

% weight averaging is effective in improving robustness to distributon shifts in computer vision \cite{wortsman2022robust}

% [WRITE THE GENERATION SETTINGS TEMPERATURE AND ALL]


\subsection{Encouraging Diversity}
\xhdr{Hindsight merging} For model merging, we focus our analysis on \llamab{} and DeepSeek-R1-Distill-Llama-8B. We refer to the DeepSeek-R1-Distill-Llama-8B as \llamad{} for readability. To combine the base and instruct models we use MergeKit~\citep{goddard-etal-2024-arcees} and SLERP merging, a spherical extension of linear interpolation~\citep{pmlr-v162-wortsman22a} (see Appendix~\ref{app:merging} for further information). 

To merge the models, we define a parameter $\alpha\in [0,1]$ which measures the merging coefficient. When $\alpha=0$, the merged model equals \llamab{}; when $\alpha=1$, it is entirely \llamad{}. 
For experiments, we restrict $\alpha \in \{0, 0.5, 0.7, 1\}$. 


% \xhdr{Model switching} Similar to the model weight merging, we consider another form of model merging on the token level. In this context, we first let the instruct model generate $n$ tokens, and then the base model generates $m$ tokens. This switching happens until the max token length is reached. In our context, we consider $n=300$ and $m=50$. We generate all of our data using vllm, a Python library for efficient model generations~\citep{kwon2023efficient}. 

\subsection{Data Curation Setup}
We take the first one thousand medium difficulty questions from TACO, a large-scale benchmark dataset for code generation~\citep{li2023taco}. We use \gptmini{} to rewrite the reasoning traces. Since the synthetic traces often exhibit strange idiosyncrasies when generating from a small model (e.g., repetitions, forgetting to include correct tokens), the rewriter acts as a reformatter. This approach is often used when generating reasoning data~\citep{sky_t1_2025,bespoke_stratos,zhu2025bare}. 




\subsection{Evaluation}
\xhdr{Diversity} In many applications of synthetic data, semantic diversity outweighs syntactic diversity --- diverse approaches are more valuable than diverse use of language for the same approach. To account for this, we rely on a BERTScore-style~\citep{zhang2019bertscore} similarity metric. For each of the rewritten generations, we embed the reasoning using OpenAI's \texttt{text-embedding-3-small} with 150 dimensions. We then analyze the diversity of these embeddings across two scenarios. First, we measure the global similarity by taking all samples generated by a model and computing the average pairwise cosine similarity, denoted $\text{sim}_{\text{global}}$. Second, we measure the local similarity, i.e., the average pairwise cosine similarity across generations from the same question, denoted $\text{sim}_{\text{local}}$.

\xhdr{Performance} For each of the programming tasks in TACO, we test the model reasoning trace on the dataset-provided unit tests (code to verify function correctness). Here we report two scores, the pass@$1$ and pass@$k$~\citep{kulal2019spoc}. Pass@$1$ is the average fraction of generations that are correct, irrespective of whether the model gets multiple correct generations for one question. On the other hand, pass@$k$ measures the probability that, if a model has $k$ tries to solve a problem, at least one of the generations passes. Explicitly, we estimate pass@$k$ using the following formula from \citet{chen2021evaluating}: 
\begin{align*}
    \text{pass}@k := \mathbb{E}_{\text{Problems}} \left[ 1 - \frac{\binom{n-c}{k}}{\binom{n}{k}} \right],
\end{align*}
where $n \geq k$ is the number of generations sampled per problem and $c$ is the number of those generations that pass the unit tests.
\begin{figure}[t]
    \centering
    \begin{subfigure}{0.33\textwidth}
        \centering
        \includegraphics[width=\linewidth]{visuals/avg_question_similarity.pdf}
        % \caption{Global setting cosine similarity.}
        % \label{fig:div:subfig1}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \centering
        \includegraphics[width=\linewidth]{visuals/avg_same_question_similarity.pdf}
        % \caption{Local setting cosine similarity.}
        % \label{fig:div:subfig2}
    \end{subfigure}
    \caption{Average cosine similarity across embeddings of generated text in the global and local settings.}
    \label{fig:diversity}
\end{figure}

\xhdr{Instruction following} Pre-training a language model provides it with an understanding of the world; however, it is during the fine-tuning stage that the model actually becomes an assistant, learning how to follow instructions.
Thus, merging the two models may lead to reduced performance in the instruction following tasks the model was trained on. One such task is refusal, where when prompted with a potentially problematic request, an instruction-tuned language model is explicitly trained to refuse that request. To evaluate the hindsight-merged models, we see how performance on SORRY-Bench --- a benchmark for safety refusals --- changes~\citep{xie2024sorrybench}. To evaluate SORRY-Bench generations, we use \gpt{} as a judge. 

\xhdr{Fine-tuning a reasoning model} To validate the downstream usefulness of our method, we use reasoning model distillation, a task known to benefit from diverse data traces~\citep{muennighoff2025s1}. We
fine-tune \llamai{} on the data generated with our set of merged models to produce \llamaf{}. Following the method of various recent works, we filter out reasoning traces leading to an incorrect solution \citep{sky_t1_2025, bespoke_stratos}. For questions with more than two correct generations, we randomly select two. We then do supervised fine-tuning (SFT) on the remaining data using \texttt{LLama-Factory} \citep{zheng2024llamafactory}. We evaluate the resulting reasoning models on the test set of easy tasks of the TACO benchmark.




\section{Results}

In our experiments, we show that (1) hindsight merging induces diversity in the generations, (2) this diversity improves model performance for reasoning, and (3) merging does not harm model performance in other aspects, such as instruction following. 

\subsection{The best of both worlds}
\xhdr{Diversity} To begin, we explore how varying the $\alpha$ parameter affects diversity of reasoning traces. In Figure~\ref{fig:diversity}, we illustrate the global and local similarity metrics. We note that reducing $\alpha$ makes the model more diverse. In the global setting, $\text{sim}_\text{global}^{\alpha=1} = 0.53$, $\text{sim}_\text{global}^{\alpha=0.7} = 0.48$, 
$\text{sim}_\text{global}^{\alpha=0.5} = 0.46$. We observe a similar pattern in the local setting, where the similarity drops from 0.89 to 0.86. Moreover, we see that tails are much larger for the merged models than the instruct model (i.e., the model is likely to generate very different answers). 
% In the Appendix we include a figure 

\xhdr{Instruction Following} The instruct model has capabilities that we want the base model to possess, like instruction following. When a user asks the model to generate code, ideally the model actually generates a code block. In this section, we explore to what extent the merged model maintains the instruction following capabilities of the instruct model using SORRY-Bench. The base model fails to refuse 61\% of the time, whereas the full instruct model fails 14\% of the time. In the 50\% merge split, the model has a 13\% reduction in refusal rejection (23\% normalized between the base and instruct). In the case where we have $\alpha=0.70$ we observe only a 4\% increase in failing to refuse a request. See Figure~\ref{fig:sorry-bench-results} in the Appendix for a breakdown of refusal results. 


\subsection{Model Capabilities}
Next, we turn to evaluating how well the models solve the TACO programming tasks. In Figure~\ref{fig:capabilities} we show a comparison between the pass@$1$ and pass@$10$. 
We observe that as $\alpha$ decreases (more of the base model merged in) the pass@$1$ decreases. In other words, the models become worse at generating the correct answer when prompted once. However, the pass@$10$ tells a different story. The merged models achieve a higher pass@$10$ than the $\alpha=1$ model. In the $\alpha=0.7$ setting, we have a pass@$10$ of 0.696, whereas the full instruct model has a pass@$10$ of 0.654. This suggests that the inference scaling laws of the two models may be different. In other words, repeat sampling from a diverse model leads to better pass@$k$ performance than repeat sampling from the instruct model. In Figure~\ref{fig:capabilities} we illustrate the difference in scaling patterns and show that after 6 generations, the merged model passes the fully instruct model. 
That being said, $\alpha=0.7$ seems to be a sweet spot, as including even more of the base model ($\alpha=0.5$) results in a drop in the pass@$10$ to 0.586. 


We conduct a more qualitative analysis to see where the models fail. In Table~\ref{tab:error-analysis} we break down failure modes into two categories following the work of \citet{li2023taco}: (1) failure to include a code snippet and (2) failure for written code to pass unit tests. We additionally include a column with the fraction of generations with correct code. We first observe that in the base model case, $\alpha=0$, 55.7\% of the time the model fails to include a code snippet, and a further 33.9\% of generations result in incorrect code. In the other extreme, the fully instruct model, the model includes code snippets most of the time (89.5\%), but fails due to incorrect generations (84.4\% of failures). As we can see, increasing the fraction of base model included results in inheriting the lack of instruction following: 83.4\% of the generations contain code in the case of $\alpha=0.7$ compared with 70.0\% for $\alpha=0.5$. 

% - 0.50 inherits too much from the base model.  


% In addition, we show the scaling of the pass@$k$ across the different alpha values. 



\begin{table}[]
    \centering
    \small
    \renewcommand{\arraystretch}{1.2}
    \begin{tabular}{lrrr}
        \toprule
        \textbf{Model} & \textbf{Incorrect (\%)} & \textbf{No code (\%)} & \textbf{Correct (\%)} \\
        \midrule
        $\alpha=1$ & 56.8\% & 10.5\% & 32.7\% \\
        $\alpha=0.7$ & 58.4\% & 16.6\% & 25.0\% \\
        $\alpha=0.5$ & 53.7\% & 30.0\% & 16.3\% \\
        $\alpha=0$ & 33.9\% & 55.7\% & 10.5\% \\
        \bottomrule
    \end{tabular}
    \caption{Error analysis. Incorrect means that the code component was present, but it gave the wrong answer. No code means that the generation had no code. Correct means percentage of generations that solved the task.}
    \label{tab:error-analysis}
\end{table}

% \begin{table}[]
%     \centering
%     \small
%     \renewcommand{\arraystretch}{1.2}
%     \begin{tabular}{lrr}
%         \toprule
%         \textbf{Model} & \textbf{Incorrect} & \textbf{No code} \\
%         \midrule
%         $\alpha =1$ & 0.844 & 0.156 \\
%         $\alpha =0.7$ & 0.780 & 0.220 \\
%         $\alpha =0.5$ & 0.643 & 0.357 \\
%         $\alpha =0$ & 0.381 & 0.619 \\
%         \bottomrule
%     \end{tabular}
%     \caption{Error analysis. Incorrect means that the code component was present, but it gave the wrong answer. No code means that the generation had no code.}
%     \label{tab:error-analysis}
% \end{table}

\begin{figure}[t!]
    \centering
    \begin{subfigure}{0.8\linewidth}
        \centering
        \includegraphics[width=\linewidth]{visuals/correctness_plot.pdf}
    \end{subfigure}
    % \vspace{0.5cm} % Adjust spacing between figures if needed
    \begin{subfigure}{0.8\linewidth}
        \centering
        \includegraphics[width=\linewidth]{visuals/pass_at_k.pdf}
    \end{subfigure}
    
    \caption{(Top) Comparison of pass@$1$ and pass@$10$ for various alpha settings. (Bottom) Inference scaling across different $k$ between 1 and 10.}
    \label{fig:capabilities}
\end{figure}




\subsection{Training a Reasoning Model}
The hindsight-merged models produce diverse and correct reasoning traces, making them strong candidates for training a dedicated reasoning model. To evaluate the effectiveness of this approach, we fine-tune \llamai{} on datasets generated with different model merges, creating a new series of reasoning models, \llamaf{}.

We follow a standard distillation pipeline: first, we filter out reasoning traces leading to incorrect solutions, ensuring high-quality supervision. Then, we apply supervised fine-tuning (SFT) using Llama-Factory~\citep{zheng2024llamafactory}. Finally, we evaluate the resulting model on the TACO test set, focusing on easy programming tasks.

The results in \cref{tab:finetuning_results} show that \llamaf{0.7} trained on data curated from a model merged with a SLERP weight of $\alpha = 0.7$ on the instruct model achieves the highest single-sample accuracy, surpassing the model trained on the instruct model’s generations. This confirms that increased diversity in reasoning traces leads to better generalization.

\begin{table}[h]
    \centering
    \small
    \begin{tabular}{lcc}
        \toprule
        \textbf{Model} & \textbf{Pass@1} & \textbf{Pass@10} \\
        \midrule
        \llamaf{1.0} & 5.2\% & 21.5\% \\
        \llamaf{0.7} & \textbf{7.0\%} & 29.0\% \\
        \llamaf{0.5} & 6.7\% &\textbf{32.0\%} \\
        \bottomrule
    \end{tabular}
    \caption{Pass@$1$ and Pass@$10$ results for fine-tuned reasoning models using different hindsight merging coefficients $\alpha$ on TACO-easy. The best-performing models both use data curated from merged models at $\alpha=0.7, 0.5$, suggesting that hindsight merging improves diversity and generalization.}
    \label{tab:finetuning_results}
\end{table}

Additionally, we observe improved inference scaling properties in the distilled models. Figure~\ref{fig:finetuning-scaling} presents pass@$k$ curves, highlighting that models trained on hindsight-merged data generate more diverse reasoning traces, yielding steeper inference scaling curves compared to models trained solely on instruct-generated data. Specifically, the scaling curve for \llamaf{0.5} is the steepest and achieves the highest pass@$k$ for a $k$ beyond four. This suggests that diverse training data improves robustness and compositional reasoning, allowing the model to explore a broader solution space.


Together, these findings support the core hypothesis: hindsight merging enhances synthetic data diversity, which in turn leads to more effective reasoning model distillation. Our results are in line with prior work~\citep{muennighoff2025s1} that highlights the importance of diversity in reasoning trace data for model distillation. We show that hindsight merging can be a simple, yet effective technique for improving reasoning models.

% In Figure \ref{fig:diversity} we illustrate 
 % In Figure \ref{fig:diversity} we illustrate 


 \begin{figure}[t!]
    \centering
    \centering
    \includegraphics[width=.8\linewidth]{visuals/finetuning_pass_at_k.pdf}
    \caption{Performance on coding tasks with repeat sampling of the generations.}
    \label{fig:finetuning-scaling}
\end{figure}


\section{Related work}
% Our work builds on three areas of research in machine learning: model merging, reasoning language models, and diverse synthetic data generation. In this section, we briefly cover the relevant literature from each of these areas. 

\xhdr{Model merging}
Model merging refers to combining the weights of different models to create a new model that ideally retains the strengths of its components. This approach has been found to produce models that perform well across multiple tasks originally handled by the individual models~\citep{pmlr-v162-wortsman22a}. Notably, model merging has been shown to be more effective than data mixing for integrating knowledge across models~\citep{Aakanksha2024MixDO}, and various strategies have been explored to improve its effectiveness~\citep{akiba2025evolutionary,yadav2024matters}. Additionally, \citet{wortsman2022robust} showed that model merging improves robustness to data shifts.

A common method for model merging is linear interpolation, where the weights of two or more models are combined using a weighted sum~\citep{pmlr-v162-wortsman22a}. 
Depending on the training regime, linear interpolation is approximately equal to ensembling models~\citep{wortsman2022robust, wortsman2022model}. 
% Related to this, task vectors have been proposed as a method for model adaptation: by fine-tuning a language model on a specific task and then adding the fine-tuned model back to the original model, task-specific capabilities can be transferred without full retraining~\citep{ilharco2022editing}.
Related to this idea, task vectors have been proposed as a method for model adaptation: a task can be approximated by computing the difference in the weights of a fine-tuned language model and the base model. 
This difference represents the task, and can be combined with other task vectors to improve model performance~\citep{ilharco2022editing}.
Related to this paper, task vectors have been used to convert a base language model into a chat model~\citep{huang2024chat}. This builds on recent work that shows the representations of the base and instruct models are aligned~\citep{sae_finetuning,BaseLLMsRefuseToo,lindsey2024sparse,minder2025controllable}. 
% These results suggest that finetuning enhances existing capabilities instead of learning new abilities~\cite{bau-finetuning}. 

%\cite{ilharco2022patchingopenvocabularymodelsinterpolating} found that interpolating between the weights preserves performance on tasks unrelated to the fine-tuning task.

% when the model is in the ntk regime, interpolating weights is directly related to ensembling is described in \cite{wortsman2022robust,\cite{wortsman2022model}
% \gianluca{Section 2 has a very basic takeaway that we try to prove rigorously by approximating entropy dynamics during neural net train under different conditions: fine-tuning on datasets that are low diversity leads to models that are low diversity which leads to bad synthetic generations! If we open up with this, it necessitates the need to \emph{roll the model back} and sample from there.}

% {It'd be nice to say that we can interpolate between the two models \cite{wortsman2022robust, ilharco2022patchingopenvocabularymodelsinterpolating} because we're likely in the NTK regime (appendix A in \cite{ilharco2022editing} The NTK regime means that the training dynamics are roughly linear. This means that what we're doing either (1) is an ensembling of the two models, and/or (2) roughly provides us with the weights of an earlier checkpoint.}

% weight averaging is effective in improving robustness to distributon shifts in computer vision \cite{wortsman2022robust}

% A common approach for model merging consists of linear interpolation between models~\citep{pmlr-v162-wortsman22a}. 

% - relatedly, task vectors have been proposed by finetuning a language model on a specific task and then adding that finetuned model back to the original model~\cite{ilharco2022editing}



% - representations for the base model are similar to the instruction model

% - in the simple case of entity tracking, finetuning has been shown to enhance existing capabilities in the models


% helps language models trained on individual tasks to be combined. 

% - How different parameters affect merging~\cite{yadav2024matters}.

% Show that model mixing leads to improved performance to individual models in vision tasks~\cite{pmlr-v162-wortsman22a}.



% Model merging for low-resource context



\xhdr{Model post-training and reasoning}
Model post-training consists of everything that is done after the pretraining and is critical to making the model a capable assistant~\citep{bai2022training} and aligning them with human values~\citep{hendrycks2023aligningaisharedhuman}. The release of o1 was a harbinger for post-training reasoning into language models~\citep{jaech2024openai}, leading to a series of highly effective models, e.g., DeepSeek R1~\citep{guo2025deepseek}, S1~\citep{muennighoff2025s1}. 

% More recently, post-training has focused on giving language models the ability to conduct search and reason [CITE]. 

Reasoning work builds on the observation that additional compute to solve tasks leads to dramatic improvements in performance. This additional reasoning can be achieved through prompting~\citep{prystawski2024think,NEURIPS2022_9d560961}, increasing computational depth~\citep{goyal2024think,pfau2024lets}, and explicitly training reasoning into language models~\citep{de2024rational, luo2024improvemathematicalreasoninglanguage, zelikman2022star}.
To train a reasoning model, work has shown that the data traces require high diversity, quality, and a range of difficulties~\citep{muennighoff2025s1}. 

% - often exhibit underthinking~\cite{wang2025thoughts}



\xhdr{Synthetic data curation and diversity}
Synthetic data curation is used for a wide range of applications from reasoning model distillation~\citep{guo2025deepseek} to self-play in language models~\citep{kumar2024training}. 
One active area of research is designing diverse synthetic data generation pipelines~\citep{samvelyan2024rainbow,veselovsky2023generating,ge2024scalingsyntheticdatacreation,zelikman2022star,yu2024large, chen2024diversity}; one approach explicitly asks the LLM to generate diverse hypotheses before solving a task~\citep{wang2024hypothesis,frohling2024personas,zhang2024improving}. One application of synthetic data has been training reasoning models~\citep{bespoke_stratos}, where it has been shown that weaker models provide better synthetic data than stronger models~\citep{bansal2024smaller} given that an accurate verifier is available to score responses~\citep{stroebl2024inference}.

Synthetic data is usually generated using RLHF models, which have been shown to reduce the diversity of the generated data~\citep{murthy2024one,achiam2023gpt,casper2023open,fdivergence,perez2022red}. 
Other work has shown that while they exhibit reduced diversity, it may not be problematic since it filters noisy and unhelpful generations~\citep{lake2024distributional}, a claim we further explore in this paper. One specific example where more diverse outputs are associated with better performance is inference scaling through best-of-n, where multiple candidate generations are sampled, and the highest-scoring output—often selected via a verifier or heuristic function—is chosen \citep{brown2024large, wang2024planningnaturallanguageimproves}.

\section{Discussion}
In this paper we focus on reconciling a tension in generating data with language models: maintaining the diversity of the base model, without losing the strong instruction following abilities of the fine-tuned models. 
We show that fine-tuning and reinforcement learning reduce output diversity by lowering the entropy during optimization. To counter this, we propose \emph{hindsight merging} for getting the best of both worlds by merging instruct models with their past checkpoints. Our experiments show the value of this method by studying both the generated data and its downstream applications. We illustrate that merging leads to more diversity, maintains instruction following, and exhibits better inference scaling behavior. 
Finally, we show that models trained on merged model traces benefit from the diverse data and act as better downstream reasoners. A model trained on the traces of an R1 distilled Llama-8b underperforms a model trained on the traces of its hindsight merged version by 10.5 percentage points in pass@$10$ (a relative increase of 49\%). 


% This approach contrasts with most existing settings for generating synthetic traces which focus on designing prompts from the instruct models due to their strong instruction following abilities. 
% Yet, these models are known to exhibit mode collapse that limits the diversity of the generated data. 
% In this paper, we explore this tension and propose a simple method to get the best of both world: a diverse model that still follows instructions. 
% To do this we propose hindsight merging --- model merging with prior checkpoints of the instruction tuned model. Through a series of experiments, we illustrate that this method results in both more diverse generations alongside good instruction following. 

% Using the reasoning traces generated by the hindsight merged models, we convert Llama-3.1-8b into a reasoning model and illustrate that 

% provide a theoretical explanation  inherent tension between instruction following and diversity. We illustrate that by hindsight merging, we are able to recover the diversity of the base model, while maintaining the instruction following the tuned model, necessary for synthetic data curation. 

% As uses of synthetic generations grow from repeat sampling depends on diverse data generations across samples [CITE]. Synthetic human simulation requires sampling from diverse human opinions~\cite{sorensenposition}. AI agents benefit from diverse inference scaling [CITE]. And language models benefit from diverse reasoning traces when becoming a reasoning model~\cite{muennighoff2025s1}. This last question, 


% - our method is also in theory transformers-agnostic. 

\xhdr{Extensions} There is a lot of room for building on the methods outlined in this paper. First, scale. Even in our GPU-constrained settings we saw inference scaling laws that demonstrate optimistic findings, and as we scale model size, number of samples generated, and number of questions answered, we expect similar improvements to hold. Second, a more rigorous comparison with other diversity-increasing methods. Classically, entropy has been added to language models through temperature scaling. 
A more rigorous comparison of temperature scaling methods may provide interesting insights into how different diversity techniques result in different downstream implications. 
%One can also imagine playing with more interesting temperature scaling approaches, where once a generation appears similar to a previous generation, the temperature is dynamically scaled. 
Additionally, our paper focuses on SLERP for merging, but other methods could create different generation behaviors. While merging is usually done on the model level, logit-level mixing~\citep{huang2024divide,zhang2024cogenesis} may offer alternative distributional approaches. 

% In this paper, we showed the scaling of merged models up to a pass@$k$ with $k=10$. It would be meaningful to check how these patterns scale as $k$ increases. 

% - headless chicken 

% - dynamic temperature scaling 

% - smarter merging strategies (task vector after model merging to recover some of the instruction following capabilities; adapt prompt for generation depending on base model weight during merging)

% - logit level mixing. 

% - better scaling laws for reasoning models trained on more diverse data

\xhdr{Limitations} While hindsight merging improves diversity and inference scaling, our approach has several limitations. First, verifier reliability: diverse generations increase the risk of producing responses that mislead the external verifiers often used to enable inference scaling \citep{stroebl2024inference}. Second, domain specificity: our experiments focus on code generation, and it remains unclear whether similar gains would extend to other tasks such as mathematics. Third, alternative methods: concurrent work suggests that generating synthetic data using base models combined with iterative rewriting can also enhance performance \citep{zhu2025bare}. This indicates that diverse synthetic data generation may be achievable even with $\alpha = 0$, i.e., without explicit hindsight merging. However, in our experiments, we found that base models often struggle to generate high-quality, coherent, and sufficiently long reasoning traces, limiting the effectiveness of this approach. Lastly, latent reasoning models: recent work suggests that reasoning need not be explicit but can occur at the latent level~\citep{geiping2025scalingtesttimecomputelatent}, potentially reducing the necessity for explicit reasoning trace generation. Other latent reasoning methods still rely on CoT data~\citep{hao2024traininglargelanguagemodels}, however, and it remains to be seen which paradigm passes the test of time.

% last paragraph smth optimistic 
As the use of language model outputs grows, it will become increasingly important to generate diverse data. We believe that the careful interplay of richly diverse base models and instruction following fine-tuned models may open up a wealth of opportunity for generating diverse and high-quality data. 


\xhdr{Reproducibility} We make the code and data used in this paper available here: 

% References
\bibliography{main}

\newpage

\onecolumn

\title{Appendix}
\maketitle

\appendix

\section{Tracking Entropy Changes Over Optimization Steps}
\label{sec:appendix_entropy_jacobian}
In this section, we aim to provide a more rigorous treatment of Section~\ref{sec:entropy}. We will proceed pedagogically and first derive Equation~\ref{eq:entropy_total}, which forms the base of our analysis.

Suppose that we are performing $K$ transformations on some predictive distribution $q_\theta^k(x_{t+1}|x_{1:t})$ for $x \in \mathcal{X}$ denoted by $q_\theta^k(x)$ for brevity. Let
\begin{align*}
    M : q_\theta^k(x) \mapsto q_\theta^{k+1}(x) = \text{softmax}(f(x;T(\theta_k)))
\end{align*}
denote an invertible and differentiable transformation on the space of predictive distributions. Assume that all integrals exist and that the conditions for the change‐of‐variables theorem are met. Let $q_\theta^{k+1}(x) = M(q_\theta^k(x))$ denote an application of this transformation.

By the change-of-variables formula, we have
\begin{align}
    q_\theta^{k+1}(x) = q_\theta^k(x) \left|\frac{\partial M}{\partial q}\left(q_\theta^k(x)\right)\right|^{-1},
\end{align}
where $\left|\frac{\partial M}{\partial q}\left(q_\theta^k(x)\right)\right|$ denotes the determinant of the Jacobian matrix for the transformation $M$. Taking the natural logarithm of both sides gives:
\begin{align}
    \log q_\theta^{k+1}(x) = \log q_\theta^k(x) - \log \left|\frac{\partial M}{\partial q}\left(q_\theta^k(x)\right)\right|.
\end{align}
Now, let us take the expectation of both sides with respect to $q_\theta^{k+1}(x)$ and apply the definition for differential entropy:
\begin{align}
    H[q_\theta^{k+1}] = -\mathbb{E}_{q_\theta^{k+1}}\left[\log q_\theta^k(x)\right] + \mathbb{E}_{q_\theta^{k+1}}\left[\log \left|\frac{\partial M}{\partial q}\left(q_\theta^k(x)\right)\right|\right].
\end{align}
Since the change-of-variables formula ensures $p_y(y) dy = p_x(x) dx$, we can rewrite the expectations on the right-hand side in terms of $q_\theta^{k}$ and obtain
\begin{align}
    H[q_\theta^{k+1}] = H[q_\theta^{k}] + \mathbb{E}_{q_\theta^{k}}\left[\log \left|\frac{\partial M}{\partial q}\left(q_\theta^k(x)\right)\right|\right],
\end{align}
which, after summing for $K$ iterations starting at $q_\theta^0(x)$, produces Equation~\ref{eq:entropy_total}:
\begin{align*}
    H[q_\theta^K] = H[q_\theta^0] + \sum_{k=0}^{K-1} \mathbb{E}_{x \sim q_\theta^k} \left[ \log \left| \frac{\partial M}{\partial q}\left(q_\theta^k(x)\right) \right| \right].
\end{align*}


% Suppose we have some arbitrary transition operator $M : (0, 1) \to (0, 1)$ that transforms some probability distribution $p_x(x)$. The change-of-variables formula, which defines 

\subsection{Deriving $M(q)$ under Gradient Descent}
\label{sec:mq}
Let $z = f(x; \theta)$ and $q(x) = \text{softmax}(z) = \sigma(z)$. Applying a single gradient descent step:
\begin{align} z' = f(x; \theta - \alpha \nabla_\theta \mathcal{L}(\theta)) \end{align}
Using a first-order Taylor expansion around $\theta$, we obtain:
\begin{align} z' \approx f(x; \theta) + \frac{\partial f(x; \theta)}{\partial \theta} (\theta' - \theta) \end{align}
From the gradient update rule $\theta' - \theta = -\alpha \nabla_\theta \mathcal{L}(\theta)$, this simplifies to:
\begin{align} z' = z - \alpha \frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta) \end{align}
Defining $M(q)$ as the transformed distribution after the update:
\begin{align} M(q) = q' = \sigma(z') = \sigma\left(z - \alpha \frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta)\right) \end{align}
To linearize around $q$, we expand:
\begin{align} q' \approx q + \frac{\partial \sigma(z)}{\partial z} (z' - z) \end{align}
With $\sigma$ as the softmax function, this leads to:
\begin{align} \boxed{M(q) \approx q - \alpha \left[\text{diag}(q) - qq^\top\right] \frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta),} \end{align}
where $q$ denotes the softmax probabilities produced by the network $f(x; \theta)$ given data $x$.
\subsection{Jacobian of $M(q)$}
The Jacobian $\frac{\partial}{\partial q} M(q)$ allows us to track volume changes, per update step, over $q_\theta(x) = \text{softmax}(f(x;\theta))$. Consider the Jacobian for the approximation from Appendix \ref{sec:mq}:
\begin{align} 
    \frac{\partial}{\partial q} M(q) \approx \frac{\partial}{\partial q}\left[q - \alpha \left[\text{diag}(q) - qq^\top\right] \frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta)\right]. 
\end{align}

Since $\frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta)$ is parametrized by $\theta$, we treat it as a constant and only consider the term $q - \alpha \left[\text{diag}(q) - qq^\top\right]$. It is simple to show that the Jacobian of this term with respect to $q$ is:
\begin{align}
    I - \alpha\left[E - G\right],
\end{align}
where $E_{ijk} = \delta_{ij} \cdot \delta_{jk}$ and $G_{ijk} = \delta_{ik}q_j + \delta_{jk}q_i$. To approximate the $\frac{\partial f(x; \theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta)$ term, we assume that (1) the learning rate $\alpha$ is small and (2) $\frac{\partial f(x; \theta)}{\partial \theta}$ does not significantly vary with $q$ near the current parameter setting $\theta$. 

For the cross-entropy loss, the gradient with respect to the logits is given by:
\begin{align}
    \frac{\partial \mathcal{L}(\theta)}{\partial f} = q - y,
\end{align}
for softmax probabilities $q$ and ground-truth one-hot encoded vectors $y$. As a result of the assumptions above, we choose the following approximation:
\begin{align}
    \frac{\partial f(x;\theta)}{\partial \theta} \nabla_\theta \mathcal{L}(\theta) \approx q - y.
\end{align}
Combining these assumptions, the Jacobian of the update mapping $M(q)$ can be approximated by
\begin{align}
    \boxed{\frac{\partial}{\partial q} M(q) \approx I - \alpha\left[E_{kk} - G\right](q - y),}
\end{align}
where $E_{ijk} = \delta_{ij} \cdot \delta_{jk}$ and $G_{ijk} = \delta_{ik}q_j + \delta_{jk}q_i$.


\section{Fine-Tuning via Reinforcement Learning Decreases Entropy}
\label{sec:app_rl}

\subsection{Problem Formulation}

Let $\mc{V}$ be a finite vocabulary of tokens such that the set of possible token sequences is $\mc{L} = \bigcup\limits_{n=1}^\infty \mc{V}^n$. A fine-tuning dataset is generated by sampling a prompt and generating a corresponding response from some fixed ground-truth distribution. Let $\mu \in \Delta(\mc{L})$ be the distribution over prompts and let $p^\star: \mc{L} \ra \Delta(\mc{L})$ be the ground-truth response distribution. One might expect $p^\star$ to take on a natural factorized form such that, for any initial prompt $s \sim \mu$ and length-$T$ response $w = (w_1,w_2,\ldots,w_T)$, $p^\star(w \mid s) = \prod\limits_{t=1}^T p^\star(w_t \mid w_1, \ldots, w_{t-1}, s)$. We may write down a particular Markov decision process (MDP) in which any LLM is a policy of the MDP and the reward function is constructed such that learning the optimal MDP policy amounts to obtaining a LLM that matches $p^\star$.

Consider the infinite-horizon, discounted MDP $\mc{M} = \langle \mc{S}, \mc{A}, \mc{R}, \mc{T}, \mu, \gamma \rangle$. Here $\mc{S} = \mc{L}$ represents language (a sequence of tokens from $\mc{V})$ consisting of a prompt and some partial or complete response. The action space $\mc{A} = \mc{V} \cup \{\texttt{STOP}\}$ contains all valid tokens a LLM may emit as well as an explicit \texttt{STOP} token to denote the completion of a response. Logically, the MDP follows deterministic transition dynamics $\mc{T}: \mc{S} \times \mc{A} \ra \mc{S}$ which appends the selected token to the current state: $\mc{T}(s, a) = \langle s, a \rangle$. For simplicity, we obviate the explicit incorporation of a designated absorbing, terminal state $s_\perp$ that an agent will transition to immediately upon selecting the \texttt{STOP} action. $\mu \in \Delta(\mc{S})$ is an initial state distribution which aligns with the distribution over prompts above; thus, any initial state already contains the prompt and subsequent token selections are folded into the next state transitions. As usual, $\gamma \in [0,1)$ is the standard discount factor for communicating a preference between near-term and long-term reward. So far, we have specified a controlled Markov process (that is, a MDP without a reward function) such that any policy $\pi: \mc{S} \ra \Delta(\mc{A})$ represents a LLM that examines the prompt along with any partially-generated response thus far and emits a distribution over next tokens. 

To capture the objective of fine-tuning a LLM towards a ground-truth response distribution $p^\star$, we consider a policy-dependent reward function defined as $\mc{R}(s,a) = \log\left(\frac{p^\star(a \mid s)}{\pi(a \mid s)}\right)$. Recall that the performance of any policy $\pi$ with respect to a prompt $s \in \mc{S}$ is given by its associated value function $V^\pi(s) = \bE\left[\sum\limits_{t=0}^\infty \gamma^t \mc{R}(s_t, a_t) \mid s_0 = s\right]$. With a slight abuse of notation, we account for randomness in the initial state through $V^\pi(\mu) \triangleq \bE_{s_0 \sim \mu}\left[ V^\pi(s_0)\right]$. Recall that any policy induces a corresponding discounted stationary state visitation distribution $d^\pi_\mu(s) = (1-\gamma)\sum\limits_{t=0}^\infty \gamma^t \bP^\pi(s_t = s),$ where $\bP^\pi(s_t = \cdot) \in \Delta(\mc{S})$ is the distribution over states visited by policy $\pi$ at timestep $t$. Intuitively $d^\pi_\mu$ encodes which states policy $\pi$ will occupy using $\gamma$ to account for near-term vs future visitation. In the context of LLMs, $d^\pi_\mu$ encodes a distribution over prompts and partial/complete responses generated by a particular LLM $\pi$. A well-known fact is that $$V^\pi(\mu) = \bE\left[\sum\limits_{t=0}^\infty \gamma^t \mc{R}(s_t, a_t) \right] = \frac{1}{(1-\gamma)} \bE_{s \sim d^\pi_\mu}\left[\bE_{a \sim \pi(\cdot \mid s)}\left[\mc{R}(s,a)\right]\right].$$

We define the optimal policy $\pi^\star$ of $\mc{M}$ as achieving supremal value with associated value function $V^\star(\mu) = \sup\limits_{\pi \in \Pi} V^\pi(\mu)$, where $\Pi \triangleq \{\mc{S} \ra \Delta(\mc{A})\}$ denotes the class of all stationary, stochastic policies. For the particular choice of policy-dependent reward function, we see that an optimal policy $\pi^\star$ minimizes the KL-divergence between its own per-step token distribution and that of the ground-truth distribution $p^\star$: 
\begin{align*}
    V^\star(\mu) &= \sup\limits_{\pi \in \Pi} V^\pi(\mu) \\
    &= \sup\limits_{\pi \in \Pi} \frac{1}{(1-\gamma)} \bE_{s \sim d^\pi_\mu}\left[\bE_{a \sim \pi(\cdot \mid s)}\left[\mc{R}(s,a)\right]\right] \\
    &= \sup\limits_{\pi \in \Pi} \frac{1}{(1-\gamma)} \bE_{s \sim d^\pi_\mu}\left[\bE_{a \sim \pi(\cdot \mid s)}\left[\log\left(\frac{p^\star(a \mid s)}{\pi(a \mid s)}\right)\right]\right] \\
    &= -\inf\limits_{\pi \in \Pi} \frac{1}{(1-\gamma)} \bE_{s \sim d^\pi_\mu}\left[\kl{\pi(\cdot \mid s)}{p^\star(\cdot \mid s)}\right].
\end{align*}

Denote the visitation distribution of the optimal policy as $d^\star_\mu \triangleq d^{\pi^\star}_\mu$. As the KL-divergence is non-negative and achieves its minimum value when two distributions are equal, it follows that $\bE_{s \sim d^\star_\mu}\left[\kl{\pi^\star(\cdot \mid s)}{p^\star(\cdot \mid s)}\right] = 0$.

\subsection{Analysis}

LLM fine-tuning via reinforcement learning typically proceeds via policy search where any LLM can be seen as a parameterized policy $\pi_\theta: \mc{S} \ra \Delta(\mc{A})$ of the above MDP with parameters $\theta \in \Theta \subset \bR^d$. Let $\Pi_\Theta \triangleq \{\pi_\theta \mid \theta \in \Theta\} \subset \Pi$ denote the parameterized policy class. Our analysis proceeds using a smoothness assumption on $\Pi_\Theta$. 

Recall that a function $f: \bR^d \ra \bR$ is $\beta$-smooth if $$||\nabla f(x) - \nabla f(x')||_2 \leq \beta ||x-x'||_2 \qquad \forall x,x' \in \bR^d.$$ A consequence of this, either by Taylor's Theorem or Lemma 3.4 of \citet{bubeck2015convex}, is $$|f(x') - f(x) - \nabla f(x) \cdot (x'-x)| \leq \frac{\beta}{2} ||x'-x||_2^2 \qquad \forall x,x' \in \bR^d.$$

\begin{assumption}
    For all $\pi_\theta \in \Pi_\Theta$, the mapping $\theta \mapsto \log\left(\pi_\theta(a \mid s)\right)$ is $\beta$-smooth, $\forall (s,a) \in \mc{S} \times \mc{A}$.
    \label{assume:smooth}
\end{assumption}

Consider an iteration of fine-tuning $k$ with current policy parameters $\theta^{(k)}$ where we perform the following abstract policy gradient update $$\theta^{(k+1)} = \theta^{(k)} + \eta \omega^{(k)},$$ where $\eta \in \bR_{\geq 0}$ is a learning rate and $\omega^{(k)}$ is some vector for updating policy parameters (we will specify a concrete update momentarily). For brevity, we use the shorthand $\pi^k \triangleq \pi_{\theta^{(k)}}.$ Observe that Assumption \ref{assume:smooth} yields the following lemma.

\begin{lemma}
    Under Assumption \ref{assume:smooth}, for any state-action pair $(s,a) \in \mc{S} \times \mc{A}$, $$\log\left(\frac{\pi^{k+1}(a \mid s)}{\pi^{k}(a \mid s)}\right) \geq \eta \nabla_\theta \log\left(\pi^k(a \mid s)\right) \cdot \omega^{(k)} - \eta^2 \frac{\beta}{2}||\omega^{(k)}||_2^2.$$
    \label{lemma:log_ratio}
\end{lemma}
\begin{proof}
    Notice that for a $\beta$-smooth function $f:\bR^d \ra \bR$, $$|f(x') - f(x) - \nabla f(x) \cdot (x'-x)| \leq \frac{\beta}{2} ||x'-x||_2^2 \implies f(x') - f(x) \geq \nabla f(x) \cdot (x'-x) - \frac{\beta}{2} ||x'-x||_2^2.$$ Applying this to our $\beta$-smooth policies (by Assumption \ref{assume:smooth}), we have 
    \begin{align*}
        \log\left(\frac{\pi^{k+1}(a \mid s)}{\pi^{k}(a \mid s)}\right) &= \log\left(\pi^{k+1}(a \mid s)\right) - \log\left(\pi^{k}(a \mid s)\right) \\
        &\geq \nabla_\theta \log\left(\pi^k(a \mid s)\right) \cdot \left(\theta^{(k+1)} - \theta^{(k)}\right) - \frac{\beta}{2}||\theta^{(k+1)} - \theta^{(k)}||_2^2 \\
        &= \eta \nabla_\theta \log\left(\pi^k(a \mid s)\right) \cdot \omega^{(k)} - \eta^2 \frac{\beta}{2}||\omega^{(k)}||_2^2.
    \end{align*}
\end{proof}

At this point, we specify a precise choice of policy-gradient update for $\omega^{(k)}$. For brevity, we write the value function induced by policy $\pi^k$ as $V^k \triangleq V^{\pi_{\theta^{(k)}}}$. Additionally, we define the action-value function as $$Q^k(s,a) \triangleq Q^{\pi_{\theta^{(k)}}}(s,a) = \bE\left[\sum\limits_{t=0}^\infty \gamma^t \mc{R}(s_t,a_t) \mid s_0 = s, a_0 = a\right] = \mc{R}(s,a) + \gamma V^k(\mc{T}(s,a)).$$ Consequently, the advantage function~\citep{baird1993advantage,sutton1998introduction} is defined as $A^k(s,a) \triangleq Q^k(s,a) - V^k(s).$ While the standard choice in the literature is Proximal Policy Optimization (PPO)~\citep{schulman2017proximal}, we study a simpler, special case of PPO more commonly known as advantage actor-critic~\citep{mnih2016asynchronous} (equivalent to running PPO for exactly one epoch per minibatch of on-policy data). We define the policy-gradient update at iteration $k$ as $$\omega^{(k)} = \frac{A^k(s,a)}{||\nabla_\theta \log\left(\pi^k(a \mid s)\right)||_2^2} \cdot \nabla_\theta \log\left(\pi^k(a \mid s)\right).$$ We assume that all policy-gradient updates have bounded norm.

\begin{assumption}
    For all iterations $k$, $||\omega^{(k)}||_2 \leq W$, for some $W \in \bR_{\geq 0}$.
    \label{assume:bounded_norm}
\end{assumption}
We may then obtain the following lemma:
\begin{lemma}
    At any iteration $k$, under Assumptions \ref{assume:smooth} and \ref{assume:bounded_norm}, $$\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^k_s} - \kl{\pi^\star_s}{\pi^{k+1}_s}\right] \geq (1-\gamma)\eta \bE_{s_0 \sim \mu}\left[V^\star(s_0) - V^k(s_0)\right] - \frac{\eta^2 \beta W^2}{2}.$$ 
    \label{lemma:exp_kl_diff_bound}
\end{lemma}
\begin{proof}
    \begin{align*}
        \bE_{s \sim d^\star_{\mu}}&\left[\kl{\pi^\star_s}{\pi^k_s} - \kl{\pi^\star_s}{\pi^{k+1}_s}\right] = \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[\log\left(\frac{\pi^\star(a \mid s)}{\pi^k(a \mid s)}\right)\right] - \bE_{a \sim \pi^\star(\cdot \mid s)}\left[\log\left(\frac{\pi^\star(a \mid s)}{\pi^{k+1}(a \mid s)}\right)\right]\right] \\
        &= \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[\log\left(\frac{\pi^{k+1}(a \mid s)}{\pi^k(a \mid s)}\right)\right]\right] \\
        &\geq \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[\eta \nabla_\theta \log(\pi^k(a \mid s)) \cdot \omega^{(k)} - \eta^2 \frac{\beta}{2} ||\omega^{(k)}||_2^2\right]\right] \\
        &= \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[\eta \nabla_\theta \log(\pi^k(a \mid s)) \cdot \frac{A^k(s,a)}{||\nabla_\theta \log(\pi^k(a \mid s))||_2^2} \cdot \nabla_\theta \log(\pi^k(a \mid s)) - \eta^2 \frac{\beta}{2} ||\omega^{(k)}||_2^2\right]\right] \\
        &= \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[\eta \cdot \frac{A^k(s,a) \cdot ||\nabla_\theta \log(\pi^k(a \mid s))||_2^2}{||\nabla_\theta \log(\pi^k(a \mid s))||_2^2} - \eta^2 \frac{\beta}{2} ||\omega^{(k)}||_2^2\right]\right] \\
        &= \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[\eta \cdot A^k(s,a) - \eta^2 \frac{\beta}{2} ||\omega^{(k)}||_2^2\right]\right] \\
        &\geq \eta \cdot \bE_{s \sim d^\star_{\mu}}\left[\bE_{a \sim \pi^\star(\cdot \mid s)}\left[A^k(s,a)\right]\right]  -  \frac{\eta^2 \beta W^2}{2} \\
        &= (1-\gamma)\eta \bE_{s_0 \sim \mu}\left[V^\star(s_0) - V^k(s_0)\right] - \frac{\eta^2 \beta W^2}{2},
    \end{align*}
    where the first inequality follows from Assumption \ref{assume:smooth} and Lemma \ref{lemma:log_ratio} above, the second inequality follows from Assumption \ref{assume:bounded_norm}, and the final equation follows from the performance-difference lemma~\citep{kakade2002approximately}.
\end{proof}

Using the above lemma, we may follow similar steps as \citet{agarwal2021theory} to obtain a result that relates a current policy after $K$ iterations of policy-gradient updates to the initial policy $\pi^0$ using the KL-divergence with the optimal policy as a benchmark or ``metric'' for comparison.

\begin{theorem}
    For a total number of iterations $K \in \bN$, under Assumptions \ref{assume:smooth} and \ref{assume:bounded_norm}, we have $$\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^K_s}\right] \leq \bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^0_s}\right] + \frac{\eta^2\beta W^2 K}{2}.$$
    \label{thm:finetune_kl_decr}
\end{theorem}
\begin{proof}
    To start, first observe that 
    \begin{align*}
        \frac{1}{K} \sum\limits_{k=0}^{K-1} \bE_{s_0 \sim \mu}\left[V^\star(s_0) - V^k(s_0)\right] &= \frac{1}{K\eta(1-\gamma)} \sum\limits_{k=0}^{K-1} \eta (1-\gamma) \bE_{s_0 \sim \mu}\left[V^\star(s_0) - V^k(s_0)\right] \\
        &\leq \frac{1}{K\eta(1-\gamma)}  \sum\limits_{k=0}^{K-1} \left(\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^k_s} - \kl{\pi^\star_s}{\pi^{k+1}_s}\right] + \frac{\eta^2 \beta W^2}{2}\right) \\
        &= \frac{1}{K\eta(1-\gamma)}  \sum\limits_{k=0}^{K-1} \bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^k_s} - \kl{\pi^\star_s}{\pi^{k+1}_s}\right] + \frac{\eta \beta W^2}{2(1-\gamma)} \\
        &= \frac{1}{K\eta(1-\gamma)}  \left(\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^0_s} - \kl{\pi^\star_s}{\pi^{K}_s}\right]\right) + \frac{\eta \beta W^2}{2(1-\gamma)},
    \end{align*}
    where the inequality follows from Lemma \ref{lemma:exp_kl_diff_bound}. Observe that, by definition of the optimal policy, $\frac{1}{K} \sum\limits_{k=0}^{K-1} \bE_{s_0 \sim \mu}\left[V^\star(s_0) - V^k(s_0)\right] \geq 0$. So, we have $$0 \leq \frac{1}{K\eta(1-\gamma)}  \left(\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^0_s} - \kl{\pi^\star_s}{\pi^{K}_s}\right]\right) + \frac{\eta \beta W^2}{2(1-\gamma)}.$$ Multiplying through by $K \eta (1-\gamma)$ and rearranging terms, we see that $$\bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^{K}_s}\right] \leq \bE_{s \sim d^\star_{\mu}}\left[\kl{\pi^\star_s}{\pi^0_s}\right] + \frac{\eta^2\beta W^2 K}{2},$$ as desired.
\end{proof}

In the context of LLM fine-tuning, recall that the optimal policy $\pi^\star$ for the MDP $\mc{M}$ defined above is the ground-truth distribution $p^\star$ for the fine-tuning dataset. Moreover, recall that fine-tuning begins with a LLM/policy $\pi^0$ initialized with parameters obtained via supervised language modeling on some broader data distribution (for example, the Internet) with entropy (presumably) much larger than $p^\star$. For a sufficiently small learning rate $\eta \ll 1$, Theorem \ref{thm:finetune_kl_decr} tells us fine-tuning with RL to obtain a LLM that more closely approximates a lower-entropy distribution $p^\star$ must necessarily bring the model farther away from the initial policy $\pi^0$ that closely matches a higher-entropy pre-training distribution.

% Let $\mc{V}$ be a finite vocabulary of tokens and let $L \in \bN$ be a maximum length (for instance, the size of a context window). We may then define a language of possible token sequences as $\mc{L} = \bigcup\limits_{\ell = 1}^L \mc{V}^\ell$. Consider the finite-horizon, episodic MDP $\mc{M} = \langle \mc{L}, \mc{V}, \mc{R}, \mc{T}, \mu, L \rangle$ where a state is any (possibly partial) language utterance $s \in \mc{L}$, an action is a single token $a \in \mc{V}$, a reward function $\mc{R}: \mc{L} \times \mc{V} \ra \bR_{\geq 0}$ maps language and next-token pairs to non-negative real values, a deterministic transition function $\mc{T}: \mc{L} \times \mc{V} \ra \mc{L}$ produces updated language, $\mu \in \Delta(\mc{L})$ is an initial state distribution, and $L$ is the horizon or maximum episode duration. The transition function simply appends the most recent token to the current language utterance: $\mc{T}(\ell' \mid \ell, a) = \indic(\ell' = \langle \ell, a \rangle).$ The initial state distribution $\mu$ represents a distribution over prompts. Observe that any stationary, stochastic policy $\pi: \mc{L} \ra \Delta(\mc{V})$ of MDP $\mc{M}$ is a language model. Practically, we may think of any realistic LLM with $d \in \bN$ parameters as a parameterized policy $\pi_\theta$ with $\theta \in \Theta \subset \bR^d$. The pretraining and fine-tuning of any LLM is equivalent to performing policy search over the restricted policy class $\Pi_\theta \subset \Pi$, where $\Pi_\theta \triangleq \{\pi_\theta \mid \theta \in \Theta\}$ and $\Pi \triangleq \{\mc{L} \ra \Delta(\mc{V})\}$.

% \dnote{TODO: Relative entropy policy search}

% Let $\pi_\beta \in \Pi_\theta$ denote a base model and let $\pi^\star_{\mathrm{fine}} \in \Pi$ denote the ground-truth response distribution for the fine-tuning dataset. Fine-tuning is equivalent to policy search with the reward function \dnote{Need to account for off-policy data from $\pi^\star_{\mathrm{fine}}$.}$$\mc{R}^{\pi_\theta}(\ell,a) = \log(\pi^\star_{\mathrm{fine}}(a \mid \ell)) - \lambda \log\left(\frac{\pi_\theta(a \mid \ell)}{\pi_\beta(a \mid \ell)}\right).$$

\section{Interpolating Between Two Distributions}
\label{sec:appendix_interpolate}
In this section, we discuss the resulting effects on entropy when we interpolate between two probability distributions assuming two different flavors of interpolation.

\subsection{Linear Interpolation}
Let $p_1(x_{t+1}\mid x_{1:t})$ denote a large, diverse corpus of internet text that is used to train a sufficiently parameterized base model such that $q_\theta^0(x_{t+1}\mid x_{1:t}) = p_1(x_{t+1}\mid x_{1:t})$. Now, suppose that we fine-tune this base model $q_\theta^0$ such that we mix the softmax probabilities over next-token predictions linearly with a fine-tuning dataset $p_2(x_{t+1}\mid x_{1:t})$ to produce some $q_\theta^*(x_{t+1}|x_{1:t})$ such that
\begin{align}
    q_\theta^*(x_{t+1}|x_{1:t}) = \alpha p_1(x_{t+1}\mid x_{1:t}) + (1 - \alpha) p_2(x_{t+1}\mid x_{1:t}),
\end{align}
for $\alpha \in [0,1]$. Since Shannon's entropy $\bH(\cdot)$ is concave over the space of probability distributions, it immediately follows that for
\begin{align}
    \min\{\bH(p_1), \bH(p_2) \} \leq \bH(\alpha p_1(x_{t+1}\mid x_{1:t}) + (1 - \alpha) p_2(x_{t+1}\mid x_{1:t})) \leq \max\{\bH(p_1), \bH(p_2) \}
\end{align}
we have
\begin{align}
    \bH(p_2) \leq \bH(q_\theta^*) \leq \bH(p_1)
\end{align}
provided that $\bH(p_1) \geq \bH(p_2)$.
% When fine-tuning LLMs to perform specific tasks, we are interested in optimizing
% \begin{align}
%     \min \; D_{\mathrm{KL}}\bigl(p_2 \,\|\, q_\theta\bigr) \;+\; \beta \, D_{\mathrm{KL}}\bigl(q_\theta \,\|\, q_0\bigr),    
% \end{align}
% where, in a sufficiently expressible $q_0$, we can assume
% \[
% q_0\bigl(x_{t+1}\mid x_{1:t}\bigr) \;=\; p_1\bigl(x_{t+1}\mid x_{1:t}\bigr),
% \]
% for some $p_1$ that is a large, diverse corpus of internet text (base model).

% Recent literature has suggested that linear interpolation between $p_1$ and $p_2$ produces sensible models (``linear weight mixing''), implying that softmax probabilities over next‐token predictions likely follow a linear path.

% Consider, by slight abuse, that this mixing between $p_1$ and $p_2$ is indeed linear, such that
% \begin{align}
%     q_\theta\bigl(x_{t+1} \mid x_{1:t}\bigr)
% \;=\;
% \alpha\,p_1\bigl(x_{t+1}\mid x_{1:t}\bigr)
% \;+\;
% (1-\alpha)\,p_2\bigl(x_{t+1}\mid x_{1:t}\bigr),
% \end{align}
% for $\alpha \in [0,1]$. Assume that $H(p_1) > H(p_2)$. By the concavity of Shannon's entropy, we can apply Jensen's inequality:
% \begin{align}
%     H(q)
% \;\ge\;
% \alpha\,H\bigl(p_1\bigr)
% \;+\;
% (1-\alpha)\,H\bigl(p_2\bigr).
% \end{align}
% Since $H(p_1) > H(p_2)$, we have
% \begin{align*}
% H(q) \;\ge\; H\bigl(p_2\bigr),
% \end{align*}
% where equality is implied if and only if $\alpha = 0$. Since $p_1 \neq p_2$ and entropy is strictly concave, the maximum entropy on the open line segment between $p_1$ and $p_2$ must occur at one of the endpoints. Again, since $H(p_1) > H(p_2)$:
% \begin{align*}
% H\bigl(q_\theta\bigr) \;\le\; H\bigl(p_1\bigr).
% \end{align*}
% Thus, when perform linear mixing between base models (pre-trained on a large, diverse corpus of internet text) and fine-tuning datasets (specific, human-aligned preferences), we have
% \begin{align*}
%     H\bigl(p_2\bigr) \;\le\; H\bigl(q_\alpha\bigr) \;\le\; H\bigl(p_1\bigr)
% \end{align*}
% which means fine-tuning is an entropy destroying process for linearly interpolating weights.

% - diverse data helps with generalization. 

\subsection{Beyond Linear Interpolation}

Consider two given probability distributions $P,Q \in \Delta(\mathcal{X})$ such that $X_1 \sim P$ and $X_2 \sim Q$. Let $Z \in \Delta(\{1,2\})$ be a random index following an arbitrary distribution. Then, $$X_Z = \begin{cases} X_1 & Z = 1 \\ X_2 & Z = 2 \end{cases}$$ is a random variable denoting a sample from the mixture distribution between $P$ and $Q$ induced by $Z$. For example, consider $Z \sim \text{Bernoulli}(\alpha)$ for $\alpha \in [0,1]$. Note that $X_1$ and $X_2$ are independent ($X_1 \perp X_2$). 

By the chain rule of mutual information, we have  $$\bI(X_Z; X_1, X_2, Z) = \bI(X_Z; Z) + \bI(X_Z; X_1 \mid Z) + \bI(X_Z; X_2 \mid Z, X_1).$$ Since $X_1 \perp X_2$, $\bI(X_Z; X_2 \mid Z, X_1) = \bI(X_Z; X_2 \mid Z)$. Recall that conditional mutual information first integrates out randomness in the conditioning random variable (in this case, $Z$). So, when $Z=1$,  $\bI(X_Z; X_1 \mid Z = 1) = \bI(X_1; X_1 \mid Z = 1) = \bH(X_1)$. Alternatively, when $Z = 2$, $\bI(X_Z; X_1 \mid Z = 2) = \bI(X_2; X_1 \mid Z = 2) = 0$. The same logic holds \textit{mutatis mutandis} for the second conditional mutual information term $\bI(X_Z; X_2 \mid Z)$. So, the above expression simplifies as 
\begin{align*}
    \bI(X_Z; X_1, X_2, Z) &= \bI(X_Z; Z) + \bI(X_Z; X_1 \mid Z) + \bI(X_Z; X_2 \mid Z) \\
    &= \bI(X_Z; Z) + \bP(Z = 1)\bH(X_1) + \bP(Z = 2) \bH(X_2) \\
    &= \bI(X_Z;Z) + \bH(X_Z \mid Z) \\
    &= \bH(X_Z) - \bH(X_Z \mid Z)  + \bH(X_Z \mid Z) \\
    &= \bH(X_Z).
\end{align*}

The above just formalizes the obvious conclusion that knowing $(X_1, X_2, Z)$ is sufficient for knowing everything about $X_Z$. Taking an alternative decomposition via the chain rule of mutual information, we have 
\begin{align*}
    \bI(X_Z; X_1, X_2, Z) &= \bI(X_Z; X_1) + \ubr{\bI(X_Z; X_2 \mid X_1)}_{= \bI(X_Z; X_2)} + \ubr{\bI(X_Z; Z \mid X_1, X_2)}_{\leq \bH(Z)} \\
    &\leq \bI(X_Z; X_1) + \bI(X_Z; X_2) + \bH(Z) \\
    &= \bH(X_1) - \bH(X_1 \mid X_Z) + \bH(X_2) - \bH(X_2 \mid X_Z) + \ubr{\bH(Z)}_{\leq \log(2) = 1}\\
    &\leq \bH(X_1) + \bH(X_2) + 1\\
    &\leq 2 \cdot \max\limits_{i \in \{1,2\}} \bH(X_i) + 1.
\end{align*}
Applying the identity above to this inequality yields $\bH(X_Z) \leq 2 \cdot \max\limits_{i \in \{1,2\}} \bH(X_i) + 1.$

Meanwhile, we also have 
\begin{align*}
    \bI(X_Z; X_1, X_2, Z) &= \bI(X_Z; X_1) + \ubr{\bI(X_Z; X_2 \mid X_1)}_{= \bI(X_Z; X_2)} + \ubr{\bI(X_Z; Z \mid X_1, X_2)}_{\geq 0} \\
    &\geq \bI(X_Z; X_1) + \bI(X_Z; X_2) \\
    &= \bH(X_1) - \bH(X_1 \mid X_Z) + \bH(X_2) - \bH(X_2 \mid X_Z).
\end{align*}
Since $X_1 \perp X_2$, either value of $Z$ results in  $\bH(X_1 \mid X_Z) = \bH(X_2 \mid X_Z) = 0$. Thus,
\begin{align*}
    \bI(X_Z; X_1, X_2, Z) &\geq \bH(X_1) - \bH(X_1 \mid X_Z) + \bH(X_2) - \bH(X_2 \mid X_Z) \\
    &= \bH(X_1) + \bH(X_2) \\
    &\geq 2 \cdot \min\limits_{i \in \{1,2\}} \bH(X_i).
\end{align*}

In summary, $$2 \cdot \min\limits_{i \in \{1,2\}} \bH(X_i) \leq \bH(X_Z) \leq 2 \cdot \max\limits_{i \in \{1,2\}} \bH(X_i) + 1.$$

\section{Instruction Following Results}\label{app:refusal}

\begin{sidewaysfigure}
    \centering
    \includegraphics[height=0.28\textwidth, keepaspectratio]{visuals/benchmark-results.png}
    \caption{SORRY-Bench results across different model merges. $\alpha=1$ is the no-mixing instruct model, and $\alpha=0$ is the base model.}
    \label{fig:sorry-bench-results}
\end{sidewaysfigure}




\section{Data Curation Details}\label{app:data-curation-details}

We curated the dataset using the TACO benchmark~\citep{li2023taco}. We used the first 1000 questions from the train set of medium-difficulty questions. The detailed parameter settings for data curation are summarized in Table~\ref{tab:data_curation}.

\begin{table}[h]
\centering
\caption{Data Curation Parameters}
\label{tab:data_curation}
\begin{tabular}{ll}
\toprule
\textbf{Parameter} & \textbf{Value} \\
\midrule
Dataset & TACO (BAAI/TACO) \\
Subset & First 1000 medium-difficulty questions \\
Number of Samples per Question & 10 \\
Base Model & Llama-3.1-8B \\
Instruct Model & Mixed models (SLERP $\alpha$ = \{0, 0.5, 0.7, 1.0\}) \\
Sampling Temperature (Base) & 0.8 \\
Sampling Temperature (Instruct) & 0.7 \\
Top-P (Base) & 0.95 \\
Top-P (Instruct) & 0.7 \\
Maximum Tokens & 8192 \\
\bottomrule
\end{tabular}
\end{table}

\section{Evaluation Details}\label{app:evaluation-details}

\subsection{Evaluation on SORRY-Bench}
Generations were done using the default SORRY-Bench settings\footnote{See: \texttt{\url{https://github.com/sorry-bench/sorry-bench}}}. 

\subsection{Evaluation of distilled reasoning models on TACO}

We evaluate the distilled reasoning models trained on our various sets of curated data by generating multiple completions for each test case and measuring their correctness. The evaluation consists of the following main steps:

\begin{enumerate}
    \item \textbf{Data:} We use the test set from the TACO benchmark.
    \item \textbf{Generation:} We generate solutions using the fine-tuned reasoning models.
    \item \textbf{Scoring:} We execute generated solutions against provided unit tests.
\end{enumerate}

Table~\ref{tab:evaluation_params} provides the parameters used to generate solutions for evaluation.

\begin{table}[h]
\centering
\caption{Evaluation Parameters for Solution Generation}
\label{tab:evaluation_params}
\begin{tabular}{ll}
\toprule
\textbf{Parameter} & \textbf{Value} \\
\midrule
Test Dataset & TACO (BAAI/TACO) \\
Difficulty Level & Easy \\
Number of Samples per Query & 10 \\
Sampling Temperature & 0.7 \\
Top-P Sampling & 0.7 \\
Maximum Tokens per Generation & 8192 \\
\bottomrule
\end{tabular}
\end{table}

\section{Fine-tuning Details}\label{app:finetuning-details}

Fine-tuning was performed using SFT with LoRA for efficiency. The key fine-tuning parameters are detailed in Table~\ref{tab:finetuning}. We relied on the LLaMA-Factory library \citep{zheng2024llamafactory}.

\begin{table}[h]
\centering
\caption{Fine-tuning Parameters}
\label{tab:finetuning}
\begin{tabular}{ll}
\toprule
\textbf{Parameter} & \textbf{Value} \\
\midrule
Model & \texttt{meta-llama/Llama-3.1-8B-Instruct} \\
Fine-tuning Method & Supervised Fine-Tuning (SFT) with LoRA \\
LoRA Target Layers & All \\
LoRA Rank & 256 \\
Preferred $\beta$ & 0.1 \\
Max Sequence Length & 16,384 tokens \\
Number of Training Epochs & 2.0 \\
Batch Size & 1 \\
Gradient Accumulation Steps & 12 \\
Learning Rate & $1.0 \times 10^{-5}$ \\
Warmup Ratio & 0.1 \\
Precision Used & BF16 \\
\bottomrule
\end{tabular}
\end{table}


\section{Model Merging}\label{app:merging}
\xhdr{SLERP} Spherical LinEar inteRPolation (SLERP) is a method for interpolating between two vectors on the unit sphere. It provides a smooth transition between vectors. Given two unit vectors \( \mathbf{v}_0 \) and \( \mathbf{v}_1 \), SLERP finds an interpolated vector \( \mathbf{v}(t) \) that smoothly transitions between \( \mathbf{v}_0 \) and \( \mathbf{v}_1 \) as \( t \) varies from 0 to 1.

The SLERP formula is derived from the spherical representation of the vectors. Given two unit vectors \( \mathbf{v}_0 \) and \( \mathbf{v}_1 \), we compute the dot product:

\begin{equation}
\cos \theta_0 = \mathbf{v}_0 \cdot \mathbf{v}_1
\end{equation}

where $\theta_0$ is the angle between the two vectors. If the vectors are nearly parallel (\( \cos \theta_0 \approx 1 \)), linear interpolation (LERP) is used instead.

To perform SLERP, the interpolated vector \( \mathbf{v}(t) \) at interpolation factor \( t \) is given by:

\begin{equation}
\mathbf{v}(t) = \frac{\sin((1-t) \theta_0)}{\sin \theta_0} \mathbf{v}_0 + \frac{\sin(t \theta_0)}{\sin \theta_0} \mathbf{v}_1
\end{equation}

where \( \sin((1-t) \theta_0) / \sin \theta_0 \) and \( \sin(t \theta_0) / \sin \theta_0 \) are the interpolation weights.


If \( \mathbf{v}_0 \) and \( \mathbf{v}_1 \) are almost parallel (i.e., \( |\mathbf{v}_0 \cdot \mathbf{v}_1| > \text{threshold} \)), then \( \sin \theta_0 \approx 0 \) in the denominators of the interpolation weights and numerical instability can occur. In such cases, we approximate SLERP using linear interpolation:

\begin{equation}
\mathbf{v}(t) = (1-t) \mathbf{v}_0 + t \mathbf{v}_1
\end{equation}

which provides a sufficiently accurate result when the angle between the vectors is small.

% \section{Conclusion}

% \section{Ablations}
% \subsection{Confounder analysis}
% - Length of the generations? 

% \subsection{Model Merging Techniques}
% - LERP 

% \subsection{Questionless Base Model}
% - In the setting where the base model does not have access to 

% - Base model has no access to the question. 

% \subsection{Effect of formatting}
% - what is the effect of using GPT-4o-mini formatting on the model performance 

%\subsection{General Case}
%Most training regimes for neural networks do not follow linear interpolation and, only approximately so when it does happen. A more general argument can be made by considering Gianluca's Thoughts.



% \section{Temperature scaling and LLM pre-training as global prior}
% \begin{align}
%     q_\theta\bigl(x_{t+1} \mid x_{1:t}\bigr) \;=\; q_\theta\bigl(x_{t+1} \mid z\bigr) 
%     \;=\; \frac{\exp\!\bigl(z_i / T\bigr)}{\sum_{j}\,\exp\!\bigl(z_j / T\bigr)}.
% \end{align}
% Consider the limit:
% \begin{align}
%     \lim_{T \to \infty} \frac{\exp\!\bigl(z_i / T\bigr)}{\sum_{j}\,\exp\!\bigl(z_j / T\bigr)}
%     \;=\; \frac{1}{K},
% \end{align}
% where \(K\) is the size of the support (the number of possible outcomes).

% Formalized as Bayesian inference, the likelihood scaling is expressed as:
% \begin{align}
%     p\bigl(x_{t+1} \mid x_{1:t}\bigr)
%     \;=\; q_\theta\bigl(x_{t+1} \mid x_{1:t}\bigr)^{1/T},
% \end{align}
% where \(\alpha\) (often a function of \(1/T\)) controls how sharply or uniformly the probabilities are distributed.


\end{document}
