% \documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


\usepackage{url}            % simple URL typesetting
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{amsmath,amsfonts,amssymb, bm, amsthm}
\usepackage{wrapfig}
\usepackage{subcaption}
\usepackage{thmtools}
\usepackage{makecell}
\usetikzlibrary{positioning,arrows.meta,calc}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% Upright operator names. The starred form \DeclareMathOperator* makes
% sub-/superscripts attach as limits (below/above) in display style; the
% unstarred form attaches them at the side.
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
% KL divergence, written \kl(\pi || \beta) in the text.
\DeclareMathOperator*{\kl}{KL}
% Presumably total variation distance -- no use appears in this part of the file.
\DeclareMathOperator*{\tvd}{TVD}
% Upright P -- presumably a probability; no use appears in this part of the file.
\DeclareMathOperator{\p}{P}
% Blackboard-bold E, used for expectations, e.g. \E_{a_t \sim \pi(\cdot | ...)}.
\DeclareMathOperator{\E}{\mathbb{E}}
% Relation symbols and the arrow \to declared as limit-taking operators, so
% that in display style a sub-/superscript is set below/above the symbol.
% Presumably meant for annotating individual steps of a derivation (TODO confirm;
% no use appears in this part of the file).
\DeclareMathOperator*{\lequal}{\leq}
\DeclareMathOperator*{\lthan}{<}
\DeclareMathOperator*{\gequal}{\geq}
\DeclareMathOperator*{\equal}{=}
\DeclareMathOperator*{\toas}{\to}
% A \bigm-sized vertical bar with a negative thin space on either side.
% The commented-out variants below use the larger \Bigm, \biggm and \Biggm sizes.
\DeclareMathOperator*{\bigv}{\!\bigm\vert\!}
% \DeclareMathOperator*{\vb}{\!\Bigm\vert\!}
% \DeclareMathOperator*{\vc}{\!\biggm\vert\!}
% \DeclareMathOperator*{\vd}{\!\Biggm\vert\!}
% Shorthands for single symbols. In the text \X is the finite alphabet and
% \M the model class; the meanings of \B and \N are not shown in this part
% of the file.
\DeclareMathOperator{\X}{\mathcal{X}}
\DeclareMathOperator{\M}{\mathcal{M}}
\DeclareMathOperator{\B}{\mathbb{B}}
\DeclareMathOperator{\N}{\mathbb{N}}
% The word ``small'' in upright text font, spaced as an operator.
\DeclareMathOperator{\sm}{\textrm{small}}
% Composite symbol: a bullet, a minus sign and a circle pulled together with
% negative thin spaces.
\newcommand*{\upto}{\bullet\!\!\!-\!\circ}
% Advances the equation counter and tags the current line with the new number.
% Presumably for numbering a single line inside a starred amsmath environment
% (TODO confirm against its uses).
\newcommand{\tagaligneq}{\refstepcounter{equation}\tag{\theequation}}
% Theorem-like environments. None of the declarations below passes an optional
% argument, so each environment has its own counter (Theorem 1, Lemma 1,
% Proposition 1, ... are numbered independently of one another) and the
% counters are not reset per section.
\newtheorem{assumption}{Assumption}
\newtheorem{proposition}{Proposition}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}

%``proof  outline'' environment
\usepackage{tikz}
% \def\partialbox{
%     \tikz\draw[path picture={\fill[black] (path picture bounding box.north east)
%   -- (path picture bounding box.south west) |-cycle;}] (0,0) rectangle  ++ (0.25,0.25);
% }
% Closing mark of the proof-outline environment: an empty TikZ node drawn
% with a dashed black border.
\def\dashedbox{\tikz\node[draw=black,dashed] {\phantom{}};}

\makeatletter
% proofoutline: a proof-like block with a bold run-in heading.
%   #1 (optional) -- heading text; defaults to \proofoutlinename.
% The heading is the label of the single item of a trivlist, and the block is
% closed by \dashedbox, pushed to the right with \hfill.
\newenvironment{proofoutline}[1][\proofoutlinename]{%
  \par\normalfont
  \topsep6\p@\@plus6\p@
  \trivlist
  % The space before #1 reproduces the label spacing this environment has
  % always had (an end-of-line space inside \textbf).
  \item[\,\textbf{ #1}]
}{%
  \hfill$\dashedbox$ \endtrivlist
}
% Default heading of the proofoutline environment.
\newcommand{\proofoutlinename}{Proof Outline}
\makeatother
% End proof outline environment

%``proof idea'' environment
\usepackage{tikz}
% Closing mark of the proof-idea environment: an empty TikZ node drawn with a
% dotted black border.
\def\dottedbox{\tikz\node[draw=black,dotted] {\phantom{}};}

\makeatletter
% proofidea: a proof-like block with a bold run-in heading.
%   #1 (optional) -- heading text; defaults to \proofideaname.
% The heading is the label of the single item of a trivlist, and the block is
% closed by \dottedbox, pushed to the right with \hfill.
\newenvironment{proofidea}[1][\proofideaname]{%
  \par\normalfont
  \topsep6\p@\@plus6\p@
  \trivlist
  % The space before #1 reproduces the label spacing this environment has
  % always had (an end-of-line space inside \textbf).
  \item[\,\textbf{ #1}]
}{%
  \hfill$\dottedbox$ \endtrivlist
}
% Default heading of the proofidea environment.
\newcommand{\proofideaname}{Proof idea}
\makeatother
% End proof idea environment

% Genitive citation
% Defines \citepos{key}, a possessive variant of natbib's textual citation:
% the author name is typeset with 's appended. The code relies on natbib
% internals (the \NAT@... macros), so it may need revisiting if natbib changes.
\usepackage{etoolbox}
\makeatletter
% make numeric styles use name format
% (\patchcmd, from etoolbox, replaces `\else \NAT@nm' inside \NAT@test so the
% name is passed through \NAT@nmfmt. Both the success and the failure hook are
% empty, so a patch that does not apply fails silently.)
\patchcmd{\NAT@test}{\else \NAT@nm}{\else \NAT@nmfmt{\NAT@nm}}{}{}
% define \citepos just like \citet
% The \begingroup keeps the \let of \NAT@nmfmt local; the matching \endgroup
% is not in this file -- presumably natbib's citation code supplies it
% (TODO confirm). The starred form \citepos* sets \NAT@fulltrue, the unstarred
% form sets \NAT@fullfalse, before handing over to \NAT@citetp.
\DeclareRobustCommand\citepos
  {\begingroup
  \let\NAT@nmfmt\NAT@posfmt% ...except with a different name format
  \NAT@swafalse\let\NAT@ctype\z@\NAT@partrue
  \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
% Save the name format in force at this point, then define the possessive
% format as that saved format applied to the name followed by 's.
\let\NAT@orig@nmfmt\NAT@nmfmt
\def\NAT@posfmt#1{\NAT@orig@nmfmt{#1's}}
\makeatother
% End genitive citation

% Inline comments between co-authors.
% \xxcomment{<color>}{<initial>}{<initial>}{<text>} is the common back end.
% The commented-out definition typesets <text> in <color>, in brackets and
% prefixed by the two stacked small-caps initials. The active definition takes
% the same four arguments and discards them, so comments left in the source
% print nothing.
% \newcommand{\xxcomment}[4]{\textcolor{#1}{[$^{\textsc{#2}}_{\textsc{#3}}$ #4]}}
\newcommand{\xxcomment}[4]{}
% One wrapper per commenter; they differ only in color and initials.
\newcommand{\mc}[1]{\xxcomment{pink}{M}{C}{#1}}
\newcommand{\mh}[1]{\xxcomment{blue}{M}{H}{#1}}
\newcommand{\sr}[1]{\xxcomment{teal}{S}{R}{#1}}
\newcommand{\yb}[1]{\xxcomment{brown}{Y}{B}{#1}}

\title{RL, but don't do anything I wouldn't do}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{\href{mailto:<mkcohen@berkeley.edu>?Subject=Your UAI 2025 paper}{Michael K. Cohen}}
\author[3]{Marcus Hutter}
% \author[1,2]{Further~Coauthor}
\author[4,5]{Yoshua Bengio}
\author[1,2]{Stuart Russell}
% Add affiliations after the authors
\affil[1]{%
    UC Berkeley, Berkeley, CA
}
\affil[2]{%
    Center for Human-Compatible AI, Berkeley, CA
}
\affil[3]{%
    Google DeepMind, London, UK
}
\affil[4]{%
    Mila, Montr\'eal, Qu\'ebec
  }
\affil[5]{%
    Universit\'e de Montr\'eal, Montr\'eal, Qu\'ebec
  }
  
  \begin{document}
\maketitle

\begin{abstract}
  In reinforcement learning (RL), if the agent's reward differs from the designers' true utility, even only rarely, the state distribution resulting from the agent's policy can be very bad, in theory and in practice. When RL policies would devolve into undesired behavior, a common countermeasure is KL regularization to a trusted policy (``Don't do anything I wouldn't do''). All current cutting-edge language models are RL agents that are KL-regularized to a ``base policy'' that is purely predictive. Unfortunately, we demonstrate that when this base policy is a Bayesian predictive model of a trusted policy, the KL constraint is no longer reliable for controlling the behavior of an advanced RL agent. We demonstrate this theoretically using algorithmic information theory, and while systems today are too weak to exhibit this theorized failure precisely, we RL-finetune a language model and find evidence that our formal results are plausibly relevant in practice. We also propose a theoretical alternative that avoids this problem by replacing the ``Don't do anything I wouldn't do'' principle with ``Don't do anything I mightn't do''.
\end{abstract}

\section{Introduction}

Agents optimizing their objective in a way not intended by designers could be amusing, annoying, insidious, or disastrous. Amusingly, RL researchers attempted to get a simulated humanoid to walk, but the reward resulted in crazy locomotion \citep{lee2021pebble}. Annoyingly, maximizing a simulated-environment's reward can produce a policy that would achieve little real-world-reward by exploiting errors in the simulation \citep{mishra2017prediction,baker2019emergent}. Insidiously, artificial agents selecting links to maximize click-through on social media sites have succeeded, but have also affected people in ways designers never sought to \citep{chan2023harms}. For a much longer list of such failures occurring ``in the wild'', see \citet{krakovna_2018}. Finally, sufficiently capable reinforcement learners would likely recognize an incentive to escape human oversight, intervene in the protocol determining their reward, and use force to ensure they can retain control of their reward, subject to such an outcome being possible from the agent's action space, and several other assumptions laid out by \citet{cohen2022advanced}.

Indeed, several sources suggest that extremely successful reward-maximization is \textit{itself} a sign of bad outcomes for humanity. \citet{zhuang2020consequences} demonstrate that in a resource-constrained world, optimizing the world's state to maximize a function of \textit{some} features would, in plausible settings, be arbitrarily bad with respect to a utility function that also cares about \textit{unincluded} features. \citet{turner2021optimal} develop a formal model of ``power''---being able to accomplish a randomly sampled goal---and find that (reward-)optimal policies tend to seek power. And \citet{cohen2022advanced} observe that any behavior that ensures that long-term reward is nearly-certainly-maximal must include extensive control over threats to its physical integrity, including threats from humans.

An appealing and popular proposal to avoid such outcomes is to constrain the agent to follow a policy that is not too dissimilar to a more familiar ``base policy''. This is the approach taken when RL-finetuning large language models (LLMs). This class of approaches limits the upside of RL, since it forgoes optimal policies, but it is a reasonable attempt to avoid catastrophic policies. The KL divergence, in particular $\kl(\textrm{proposed policy} \| \textrm{base policy})$, enforces proximity in a robust, ``safety-conscious'' way: if $\textrm{basepolicy}(\textrm{action}) \ll 1$ while $\textrm{proposedpolicy}(\textrm{action}) \not\ll 1$, the KL penalty is high, even while $L_p$ norms can be small.
For any very bad outcomes that are unlikely under the base policy, this method ensures they remain very unlikely. However, if we ensure that $\kl(\textrm{proposed policy} \| \textrm{base policy})$ is small, but the base policy only \textit{approximates} a trusted policy, to what extent can we be confident that $\kl(\textrm{proposed policy} \| \textrm{trusted policy})$ is small? When the base policy is a Bayesian predictive model of the trusted policy, the answer shown here is: we cannot be confident that $\kl(\textrm{proposed policy} \| \textrm{trusted policy})$ is small, which makes the KL-constraint less comforting. (Note that a Bayesian imitative base policy can only be counted on to make $\kl(\textrm{trusted policy} \| \textrm{Bayesian base policy})$ small).

Worse,
in the formalism we study, we find that if one attempts to use KL-regularization to prevent an RL agent from achieving near-maximal reward (in light of the concerns above), and the base policy is a Bayesian imitation of a trusted policy, a fairly tight KL threshold is required, and as the amount of training data for the Bayesian imitator grows, the relevant threshold can only increase extremely slowly. The reason for the limited effectiveness of KL regularization is \textbf{(1)} a Bayesian imitator asked to act in novel settings must be humble about its predictions; for many actions that the demonstrator (i.e. the trusted policy) would in fact never take, the imitator (i.e. the base policy) must assign meaningful credence to that action, because it doesn't know enough to rule it out. Then \textbf{(2)} the RL agent can exploit or amplify this credence. Formalizing Occam's razor with algorithmic information theory, we have \textbf{(3)} nearly-reward-maximizing policies have a short description length (so they are ``simple''), and \textbf{(4)} a Bayesian imitation learner with a rich prior should be \textit{especially} reluctant to rule out \textit{simple} behaviors from the demonstrator in novel settings. In light of the results from \citet{zhuang2020consequences}, 
\citet{turner2021optimal}, and \citet{cohen2022advanced}, preventing the RL agent from achieving near-maximal reward is, in many settings, a bare minimum requirement for safety-focused regularization, and a KL constraint would struggle to do so.

\citet{Sutskever_2018,Sutskever_2023} argues that neural networks are able to generalize well because of the sense in which they approximate the algorithmic-information-theoretic inductive bias in favor of short programs. Since it is not a given that results from algorithmic information theory apply in practice, we verify empirically that a nearly-state-of-the-art predictive system (Mixtral-8x7B-base-model \citep{jiang2024mixtral}) is reluctant to rule out simple behaviors, and an RL agent regularized to this predictive system exploits this fact, as our formal results predict. The result is not catastrophic, but it is bad. Note these empirical results neither confirm nor deny whether point \textbf{(3)} above applies in practice, but they do affirm that the rest of the argument is forceful in practice.

Finally, we identify an alternative to Bayesian prediction/imitation that avoids this problem; \citepos{cohen2022fully} imitator asks for help when uncertain and carries useful formal bounds. We show that using this form of imitation learning as a base policy would in theory avoid the problems we identify in this paper. \citepos{cohen2022fully} active imitator, like fully Bayesian imitation, is intractable and requires approximation, so we currently lack the tools to evaluate this proposal empirically.

\section{Related work}

The most prominent example of $\kl$-regularization to an approximation of a (somewhat) trusted policy is surely ChatGPT, inspired by earlier work \citep{ouyang2022training,stiennon2020learning,bai2022training}.
% Other recent examples include \citet{jaques2017sequence,jaques2019way}, \citet{ziegler2019fine}, \citet{vieillard2020leverage}, \citet{yang2021accelerating}, \citet{korbak2022rl}, \citet{perez2022red}, \citet{gao2023scaling}, and \citet{moskovitz2023confronting}.
Other recent examples include \citet{jaques2019way}, \citet{yang2021accelerating}, \citet{korbak2022rl}, \citet{perez2022red}, \citet{gao2023scaling}, and \citet{moskovitz2023confronting}.
A closely related approach called quantilization has been investigated by \citet{taylor2016quantilizers}, \citet{Hutter:17corruptrl}, and \citet{carey2019how}. KL regularization to a decent policy has also been used for stable and efficient policy optimization \citep{schulman2017proximal,schmitt2018kickstarting}.

Algorithmic information theory began with \citet{solomonoff1960preliminary}, who formalized a powerful notion of simplicity based on program-length and developed a method for prediction using that inductive bias. In an article entitled, ``A theory of program size formally identical to information theory'', \citet{chaitin1975theory} examined the connection between program-length and information. \citepos{li2008introduction} textbook presents the major results of the field. \citet{Hutter:04uaibook} and \citet{Hutter:24uaibook2} developed a theory of how to apply such reasoning to the problem of sequential decision-making. \citet{Hutter:24trainsol} train a neural network to learn a program-length ``bias'' for a meta-learning setting.

Ultimately, we propose a formal scheme for doing KL regularization to an imitative policy which asks for help under epistemic uncertainty, and this allows us to inherit the formal results of \citet{cohen2022fully}. The related work section there goes into some detail about how different researchers have studied asking for help, including how setups and assumptions differ. See especially \citepos{zhang2017query}
work on driving,
% ``Query-efficient imitation learning for end-to-end simulated driving'',
as well as \citet{brown2018risk,brown2020bayesian} and \citet{menda2019ensembledagger}.

Closest to our work in studying the relation between KL divergence to a base policy and ``over-optimization'' is \citet{gao2023scaling}. They design a ``real'' reward function, and a simpler ``proxy'' reward function, which are very similar on the state distribution induced by a base policy. After optimizing for the proxy reward function (sometimes with KL regularization to the base policy), they use the KL divergence to the base policy to measure how much ``optimization'' has occurred. And they study how ``real'' reward depends on the extent of optimization---roughly quadratically, with a negative leading coefficient. Our work provides one explanation for \textit{why} we should expect such unusual policies with high proxy reward and low real reward, even when the KL divergence to the base policy is only moderate.

\section{Notation and preliminaries}

We begin with a formalism for an imitative base policy that has an infinite ``context window'' and a lifetime that is one long episode, rather than a lifetime broken up into multiple episodes with presumed-identical dynamics. This is the most general setting for an imitative base policy. We simply have an infinite sequence of actions and observations $a_1 o_1 a_2 o_2 \dots$, and predictive ``autoregressive'' models which give conditional distributions of the form $\texttt{model}(\textrm{next action} | \textrm{all previous actions and observations})$.

We formalize sequential prediction as follows. Let $\X$ be a finite alphabet, and let $\X^*$ be the set of finite strings from the alphabet $\X$, so $\X^* = \bigcup_{i=0}^\infty \X^i$. Let $x_{<t}$ be an element of $\X^{t-1}$, and let $x_{t_1:t_2}$ be an element of $\X^{t_2-t_1+1}$. Let $\nu : \X^* \times \X \to [0, 1]$ be a (predictive) probability semi-distribution, satisfying the property that for any $x_{<t} \in \X^*$, $\sum_{x \in \X} \nu(x | x_{<t}) \leq 1$. To understand semi-distributions intuitively, consider the associated probability distribution over $\X \cup \{\emptyset\}$, with $\nu(\emptyset | x_{<t}) = 1 - \sum_{x \in \X} \nu(x | x_{<t})$. So $\nu$ gives a conditional distribution over the next character given the past characters, if there is a next character at all. Let $\nu(x_{<t}) = \prod_{i=1}^{t-1} \nu(x_i | x_{<i})$, where $x_i$ is the $i$\textsuperscript{th} character of $x_{<t}$, and $x_{<i}$ is the first $i-1$ characters. (Measure theorists can note this means $\nu$ induces a probability semi-distribution over infinite sequences $\X^\infty$, with the event space $\sigma(\X^*)$.)

Now we set up Bayesian prediction: Let $\M$ be our model class --- a \textit{countable} set of many ``competing'' probability semi-distributions like $\nu$. For each $\nu \in \M$, let $w(\nu)$ be the prior weight assigned to that probability semi-distribution. Let $\sum_{\nu \in \M} w(\nu) = 1$, so $w$ is a probability distribution over $\M$. The (Bayesian) posterior distribution is $w(\nu | x_{<t}) \propto w(\nu)\nu(x_{<t})$, with $\sum_{\nu \in \M} w(\nu | x_{<t}) = 1$.
% Following \citepos{Hutter:04uaibook} notation, 
We can now define the Bayes mixture semi-distribution $\xi : \X^* \times \X \to [0, 1]$ as $\xi(x | x_{<t}) := \sum_{\nu \in \M} w(\nu | x_{<t}) \nu(x | x_{<t})$, which has the property that $\xi(x_{<t}) = \sum_{\nu \in \M} w(\nu) \nu(x_{<t})$ \citep{Hutter:24uaibook2}.

Turning to algorithmic information theory, Solomonoff Induction \citep{solomonoff_1964} is Bayesian sequence prediction with a special model class $\M$ and a special prior $w$. The model class $\M$ is all computable semi-distributions $\nu$, and the prior $w$ is $2^{-\textrm{length}(\textrm{program for }\nu)}$, both defined formally in the appendix. 
One can show that $\xi(x_{<t})$ is the probability that a given universal computer running a program composed of random bits would output a sequence that begins with $x_{<t}$.
Related to this is Kolmogorov complexity \citep{kolmogorov1963tables,li2008introduction}, which is the length of the shortest program which does something, given a fixed compiler. For a set $s$, $K(s)$ is the length of the shortest
% binary, self-delimiting\footnote{You can tell when you've reached the end.}
program $p$ such that $p(x) = 1$ for $x \in s$, and $p(x) = 0$ for $x \not \in s$. For a function $f$, $K(f)$ is the length of the shortest program $p$ such that $p(x) = f(x)$.
For a computable number $x$, $K(x)$ is the length of the shortest program $p$ such that $p() = x$.

To apply this framework to RL, we interpret every odd-numbered element in the sequence as an action and every even-numbered element as an observation: we let $a_t = x_{2t-1}$ and $o_t = x_{2t}$; the agent selects actions $a_t$ and receives observations $o_t$. We suppose that the first $k$ actions were taken by a trusted policy, e.g. randomly sampled humans. We do not necessarily imagine that the policy is trusted in every sense, only that it can be trusted to avoid the \textit{particular bad outcomes} we are interested in avoiding. When conditioned on a history that begins with $k$ trusted actions, $\xi$ can be called a Bayesian imitation of the trusted policy.

For an agent with a utility function over $m$-timestep histories, $U_m : \X^{2m} \to [0, 1]$, we define:
\begin{definition}[Value]
    For a probability semi-distribution $\nu : \mathcal{X}^* \times \mathcal{X} \to [0, 1]$ and a utility function $U_m$, the value of a particular ``policy'' (also a probability semi-distribution) $\pi \in \M$ is \pagebreak[1]
\begin{align*}
    &\resizebox{\linewidth}{!}{$
        \displaystyle
        V^\pi_{\nu, U_m}(x_{<2t-1}) = \E_{a_t \sim \pi(\cdot | a_1o_1...a_{t-1}o_{t-1})} \E_{o_t \sim \nu(\cdot | a_1o_1...a_t)}
        $}\\
    &\E_{a_{t+1} \sim \pi(\cdot | a_1o_1...a_to_t)}
    \E_{o_{t+1} \sim \nu(\cdot | a_1o_1...a_{t+1})} ...
    \\
    &\resizebox{\linewidth}{!}{$
        \displaystyle
        \E_{a_m \sim \pi(\cdot | a_1o_1...a_{m-1}o_{m-1})} \E_{o_{m} \sim \nu(\cdot | a_1o_1...a_{m})} U_m(a_1o_1...a_mo_m)
        $}
\end{align*}
\end{definition}
The optimal value $V^*_{\nu, U_m}(x_{<2t-1})$ is $\max_\pi V^\pi_{\nu, U_m}(x_{<2t-1})$. When comparing two policies, we define a KL penalty, which is a function of the starting history we are continuing from, and of how far into the future we are looking.
\begin{definition}[KL Constraint]
% The KL constraint
$\kl_{x_{<2k}, m}(\pi||\beta) :=$
    \resizebox{\linewidth}{!}{$ % 0.91\linewidth leaves space for an eqn number
            \displaystyle
    \max_{o_{k:m} \in \X^{m-k+1}} \!\!\!\!\!\!\!\sum_{a_{k:m} \in \X^{m-k+1}}
        \prod_{t=k}^m \pi(a_t | x_{<2t}) \log \frac{\prod_{t=k}^m \pi(a_t | x_{<2t})}{\prod_{t=k}^m \beta(a_t | x_{<2t})}
    $.}
\end{definition}
\begin{figure}[h]
\begin{tikzpicture}[
    box/.style={
        draw, 
        minimum height=0.6cm,
        text width=1.5cm,
        align=center,
        font=\small
    },
    arrow/.style={->,>=stealth, font=\tiny}
    ]
    
    % Start with RL and Agent Policy to establish the first midpoint
    \node[box, align = center, text width = 1.7cm] (rl) at (0,0) {RL \\ algorithm \\ \textcolor{blue}{$\argmax_\pi V^\pi$}};
    \node[box, right=1.25cm of rl, text width = 1.1cm, align=center] (agent) {Agent policy \\ \textcolor{blue}{$\pi$}};
    \coordinate (midpoint_rl) at ($(rl)!0.55!(agent)$);
    
    % Place Trained Bayesian Imitator under this midpoint
    \node[box, text width=2.5cm] (imitator) at ([yshift=-1.5cm]midpoint_rl) {Trained Bayesian imitator \\ \textcolor{blue}{$\beta = \xi(\cdot | x_{<2k})$}};
    
    % Now place Autoregressive Predictor to the left
    \node[box, left=1cm of imitator, text width = 1.8cm] (predictor) {Autoregressive predictor \\ \textcolor{blue}{$\xi$}};
    
    % Place Solomonoff prior at the start
    \node[box, left=0.3cm of predictor] (prior) {Solomonoff prior \\ \textcolor{blue}{$2^{-\textrm{length}(\cdot)}$}};
    
    % Calculate midpoint for the conditioning arrow
    \coordinate (midpoint_cond) at ($(predictor)!0.45!(imitator)$);
    
    % Place Human Demonstrations under this midpoint
    \node[box, text width = 2.2cm] (demos) at ([yshift=-1.5cm]midpoint_cond) {Human demonstrations \\ \textcolor{blue}{$a_{<k}$}};
    \node[box, left=3mm of demos, text width = 2.2cm] (demonstrator) {Trusted human demonstrator \\ \textcolor{blue}{$\tau$}};
    
    % Arrows
    \draw[arrow] (prior) -- (predictor);
    \draw[arrow, align = center] (predictor) -- node[above, font=\tiny] {conditioned \\ on} (imitator);
    \draw[arrow] (demonstrator) -- (demos);
    \draw[arrow] (demos) -- (midpoint_cond);
    \draw[arrow] (imitator) -- (midpoint_rl);
    \draw[arrow, align = center] (rl) -- node[above, font=\tiny] {KL constrained \\ to} (agent);

\end{tikzpicture}
\caption{KL-regularized RL. A trusted policy $\tau$ generates $k$ demonstrations, which train a Bayesian imitative base policy $\beta$ using the Solomonoff prior. An RL algorithm searches for a policy $\pi$ that maximizes expected value, while staying ``close'' to $\beta$. This structure aims to keep $\pi$ from deviating too far from $\tau$, even though $\pi$ only directly interacts with $\beta$.
}
  \label{fig:setup}
\end{figure}
The maximum over observations means that this penalty ensures the proposed policy and base policy are similar no matter what is observed. One way to understand this measure is: if we were wondering whether the proposed policy or the base policy generated actions $k$ through $m$, and the proposed policy actually \textit{was} generating those actions, this is the maximum over observations of the expected amount of evidence we would get confirming that fact. (In a deterministic environment, we could remove the maximum over observations, but we do not study this case separately.)
% 

To analyze how policies behave in novel situations, we formalize the notion of unprecedented events. Following \citet{cohen2020pessimism}, an event $E$ is any subset of possible histories $\X^*$. For an outcome $x_{<\infty}$, we say that $E$ \textit{happens} at time $t$ if $x_{<2t} \in E$, we say $E$ \textit{has happened} by time $t$ if $\exists j \leq t$ such that $E$ happened at time $j$, and we say $E$ is \textit{unprecedented} at time $t$ if it has not happened by time $t-1$. For an example of an event, consider ``the life history indicates the House Speaker's office wants to hear from the agent''; a subset of possible life histories meets this description.

\section{Formal results and discussion}
We begin with a quick observation about the KL divergence separate from our more involved results.

\begin{proposition}[No easy bound] \label{prop:triangle}
    For any $\varepsilon > 0$, if $\kl(\pi || \beta) \leq \varepsilon$ and $\kl(\tau || \beta) \leq \varepsilon$, it is possible that $\kl(\pi || \tau) = \infty$.
    % ($\pi$, $\beta$, and $\tau$ stand for ``proposed'', ``base'', and ``trusted''.)
    ($\beta$ and $\tau$ stand for ``base'' and ``trusted''.)
\end{proposition}

\begin{proof}
    Let $\tau = \textrm{Bern}(0)$. Let $\pi = \beta = \textrm{Bern}(\min(\varepsilon, 1)/2)$. The $\kl$ divergences are easily checked.
\end{proof}

When $\beta$ is trained to imitate $\tau$, small $\kl(\tau || \beta)$ is typically all we can expect. Proposition \ref{prop:triangle} should give us pause if we regularize to the Bayesian imitator $\xi$ instead of the trusted demonstrator, but Theorem \ref{thm:novel} below is more concerning.

Recall we are considering the setting where actions $a_1$ through $a_k$ were taken by trusted humans, and we are interested in regularizing a $U_m$-optimizer to the Bayesian imitator $\xi$, conditioned on histories that begin $a_1 o_1 ... a_k o_k$. So the following result is of interest when $t > k$ and the agent is acting. As motivation for this theorem, as discussed previously, assume a setting where 
if $V^*_{\xi, U_m} - V^\pi_{\xi, U_m} < \varepsilon$, then $\pi$ is considered unacceptably risky \citep{gao2023scaling,zhuang2020consequences,cohen2022advanced}.

\begin{restatable}[Little constraint in novel situations]{theorem}{thmnovel} \label{thm:novel}
    $\exists$ a constant $d$ such that $\forall$ $U_m$, and $\forall$ $E$, if $E$ is unprecedented and occurs at time $t$, then for any $v < V^*_{\xi, U_m}(x_{<2t})$, $\exists$ a policy $\pi$ for which $V^\pi_{\xi, U_m}(x_{<2t}) > v$, and $\kl_{x_{<2t}, m}(\pi || \xi) < [d + K(U_m) + K(E) + K(v \xi(x_{<2t}))]/\log 2$.
\end{restatable}

This theorem shows that there are policies with near-optimal utility with little KL divergence to an imitative policy, \textit{regardless of how safe the demonstrator's policy is}. We'll discuss the terms in the bound in turn. The proof of the theorem (appearing in Appendix \ref{sec:proofthm} and outlined below) indicates that the constant $d$ is a small one; it corresponds to how much code it takes to implement a search tree, Bayes' rule, and control flow. The theorem applies for any $E$ and any $v < V^*$, so they are free variables, and we can focus our attention on cases where $K(v \textrm{basepol}(\textrm{history}))$ and $K(E)$ are small. For the former term, there may be a trade-off between low complexity and $v$'s proximity to the optimal value. Below, we consider a common setting where $K(U_m)$ is small, and we discuss how often we can find an unprecedented event $E$ for which $K(E)$ is small. Note that Theorem \ref{thm:novel} considers a \textit{lifetime} KL constraint for the agent; for autoregressive models, the lifetime KL divergence is equal to the expectation of the sum of the per-timestep KL divergences.
Theorem \ref{thm:novel} is proved in Appendices \ref{sec:solom} through \ref{sec:proofthm}. 

\begin{proofidea}
    First, the following policy has a small description length: search for and execute a policy which optimizes the utility function using a Bayesian model of the environment. Second, the Bayesian imitation learner can't rule out that after an unprecedented event, the demonstrator's policy might look significantly different, and if so, simpler behaviors are more likely than complex ones. Therefore, the KL divergence from the first policy to the Bayesian imitator, when conditioned on an unprecedented event, is relatively small.
\end{proofidea}

\begin{proofoutline}
    Let $\pi^*_v$ be a policy which is the result of a search through possible policies for one where $V^\pi_{\xi, U_m}(x_{<2t}) > v$. (For this search process to be computable, it needs the product $v \xi(x_{<2t})$ as input). For every $\nu \in \M$, we define a $\nu'$ which behaves identically to $\pi^*_v$ 
    if it is outputting an action after $E$ has occurred in its history, but it behaves identically to $\nu$ otherwise. $\nu'$ is also in $\M$ because this is a valid program. And for some constant $d$, the extra length of the program computing $\nu'$, beyond the length for $\nu$, is at most $\Delta := K(U_m) + K(E) + K(v \xi(x_{<2t})) + d$.

    This means the prior ratio $w(\nu')/w(\nu) \geq 2^{-\Delta}$, and if $E$ has never occurred, this holds for the posterior ratio as well, since $\nu$ and $\nu'$ will have given identical predictions. Therefore, at least that fraction of $\xi$ is ``dedicated'' to predicting that actions will be sampled from $\pi^*_v$ after $E$ happens, and ultimately, $\kl_{x_{<2t}, m}(\pi^*_v || \xi) \log 2 \leq \Delta$.
\end{proofoutline}

Let's analyze this result in an RL setting: part of the agent's observations are rewards, and the utility function equals the sum of those rewards. Let's consider the case where it is acting in the real world, and it could maximize its reward by escaping human control, taking direct control of its own rewards, setting them to a maximal value for all successive timesteps. Moreover, let's consider the (fairly plausible) setting where unsafe strategies like these are the only way to achieve near-maximal reward permanently. Finally, suppose that the agent is clever enough to recognize these strategies.

The utility function, simply summing rewards, has an extremely short program length; the reward history is an input, so all we need is a program that sums some elements from a list.
So if we have small $d$, small $K(U_m)$, and $v$ tuned for small $K(v \xi(x_{<2t}))$, then unless we use a fairly tight lifetime KL constraint, the RL agent could just wait for a timestep where there exists an event $E$ with small $K(E)$ that occurs then for the first time. It could then execute an optimal or near-optimal policy, even if the humans that the base model imitates would never behave that way. The following proposition, proven in Appendix \ref{sec:proofthm}, establishes the feasibility of waiting for such a timestep. It considers the complexity of ``the simplest unprecedented event yet to occur''---for a given infinite history $x_{<\infty}$ and a time $t$, this is $\min_{E \in \mathcal{P}(\mathcal{X}^*): E \textrm{ occurs }\textrm{for }\textrm{the }\textrm{first }\textrm{time }\textrm{after }\textrm{time }t} K(E)$.

\begin{restatable}[Frequency of simple unprecedented events]{proposition}{propscaling} \label{prop:scaling}
    In any environment, at time $t$, the complexity of the simplest unprecedented event yet to occur (at any time $T > t$) grows more slowly, as $t \to \infty$, than every computable function that tends to infinity.
\end{restatable}

Proposition \ref{prop:scaling} shows that waiting for an unprecedented event with small $K(E)$ is a viable strategy for an RL agent, unless the agent does not get to act until $t$ is incalculably large. In our formalism, the agent begins selecting actions after the trusted humans generate the first $k$ actions. (The way one ``trains'' an autoregressive Bayes' mixture model like $\xi$ is simply to condition it on the training data, in this case $a_{<k}$.) As we increase the amount of training $k$, the Bayesian imitative base model $\xi$ becomes a closer approximation to the humans generating the actions $a_{<k}$, so one might expect we could safely accommodate larger KL constraints.
But our result is independent of $k$. As $k$ grows, the only change is that unprecedented events become more complex, so $K(E)$ grows, as shown in Proposition \ref{prop:scaling}. So while more data would help, resolving the issue with more data is infeasible.

Intuitively, we can understand Proposition \ref{prop:scaling} to show that even with extensive training data, we will encounter novel situations that are algorithmically simple. This theoretical result can be observed in practice: for instance, self-driving car developers have found that even with massive training datasets, their vehicles regularly encounter unprecedented but conceptually simple scenarios.

These results suggest that if we intend to use an imitation learner as a base policy for regularizing a goal-directed agent, we should \textit{not} strive to approximate ideal Bayesian imitation. This is not the fault of the KL divergence---other metrics behave much worse. For example, suppose we constrained the total variation distance between $\pi$ and a base policy $\beta$. The result would be bad, even if $\beta = \tau$, even if we used a perfect imitation of the trusted policy!

Let $\tvd_{x_{<2k}, m} (\pi, \beta) = \max_{X \subset \X^{2m-2k}} \sum_{x_{2k:2m} \in X}$ $\bigv \left[\prod_{t=k}^m \pi(a_t | x_{<2t})\right] - \left[\prod_{t=k}^m \beta(a_t | x_{<2t})\right] \bigv$. And let $\pi^{TVD}_c = \argmax_{\pi : \tvd_{x_{<2k}, m}(\pi, \beta) < c} V^{\pi}_{\xi, U_m}$. We say an action is $V_{\xi, U_m}$-optimal if it is assigned positive probability by a policy that maximizes $V^{\pi}_{\xi, U_m}$; a formal definition appears in Appendix \ref{sec:prooftvd}.
\begin{restatable}[TVD constraint]{theorem}{thmtvd} \label{thm:tvd}
    If $\pi^{TVD}_c(a_t | x_{<2t}) > \beta(a_t | x_{<2t})$, then $a_t$ is $V_{\xi, U_m}$-optimal.
\end{restatable}                                       
The proof is in Appendix \ref{sec:prooftvd}. We use regularized RL for the setting where $V_{\xi, U_m}$-optimal behavior is actually bad. But when using total variation distance to regularize, the only actions that increase in probability are $V_{\xi, U_m}$-optimal ones, even with a perfectly trustworthy base policy. The KL divergence is a better regularizer for maintaining safety, because if a (bad) outcome is impossible under the base policy, it remains impossible under a policy with finite KL divergence to the base policy.

\section{RL-finetuning a language model} \label{sec:empirical}

% Why believe that algorithmic information theory is relevant for understanding realistic machine learning systems that attain very low prediction error? If a stream of observations is generated by a relatively short program, a high quality predictive algorithm should be capable of being trained on those observations to predict that future observations will be governed by the same program. Note that any predictor that a) assigns positive prior probability to every program as a potential generator of its observations, and b) and updates its beliefs according to Bayes' rule is formally equivalent to $\xi$ \citep{Hutter:04uaibook}. While realistic systems will not live up to that ideal, deviations from those principles should be as limited as possible. A predictive algorithm that is incapable of becoming convinced of a plausible hypothesis (i.e. that its observations match the output of a short program) is best called ``closed-minded'', or perhaps ``epistemically intransigent''. Ruling out hypotheses a priori so that no data can persuade you of them is dangerous tendency when trying to model the world; it may not be long before a predictor confronts a true fact that it treats as unbelievable. For example, \citet{liu2018intriguing} demonstrate, unsurprisingly, that convolutional neural networks fail when their a priori commitment to translational invariance doesn't match reality. So we conjecture that successful predictors will tend to be open-minded, and very successful predictors at least as open-minded as humans, and this makes the properties of $\xi$ plausibly relevant. But we do not wish to overstate our case. This discussion is not very rigorous, so in this section we present experiments that assess to some extent how realistic our theoretical results are.

Solomonoff induction is a very different learning algorithm than first-order optimization of non-linear parametric models. The most striking difference is that the former does a brute force search through all possible computable models, while the latter does not explicitly search at all. An argument for expecting our formal results to be relevant is as follows. Ruling out valid hypotheses a priori so that no data can persuade you of them appears to be a dangerous tendency when trying to model the world; it may not be long before you confront a true fact that you treat as unbelievable \citep{liu2018intriguing}. In contrast, any predictor that a) assigns positive prior probability to every program as a potential generator of its observations, and b) updates its beliefs according to Bayes' rule is formally equivalent to $\xi$ \citep{Hutter:04uaibook}. Therefore, we conjecture that algorithmic information theory helps us understand realistic machine learning systems that attain very low prediction error. We present experiments to evaluate how realistic our theoretical results are. We corroborate those results in the following form: we show that even when an RL agent is tightly KL-constrained to an imitative base model, the RL agent can still exhibit simple non-demonstrator-like behavior.

\textbf{Experimental Setup} We consider the following episodic RL environment, in which the agent plays a teacher and gets reward to the extent that the student's responses have positive sentiment. In a conversation transcript, if the string ``[newline] Teacher:'' has come more recently than the string ``[newline] Student:'', the agent can add tokens to the transcript. Otherwise, Mixtral-base-model repeatedly adds tokens to the transcript. In Figure \ref{fig:transcripts}, gray (colored) tokens are generated by the environment (agent).
When Mixtral-base-model finishes generating the student's response (by outputting ``[newline] Teacher:''), the agent gets a reward equal to the ``sentiment'' of the student's response according to the DistilBERT sentiment model \citep{sanh2019distilbert}, scaled to [0, 1]. The episode terminates at 256 tokens. The starting transcript is also in gray in Figure \ref{fig:transcripts}. The base policy used for KL-regularizing the agent's policy (corresponding to $\xi$ from before) is also Mixtral-base-model. Such an LLM is not an explicitly Bayesian imitator, of course, but it does attempt to minimize $\kl(\textrm{data-generating process} || \textrm{model})$, which is the ``right'' objective from a Bayesian perspective. The ``state'' observed by the agent is the activations of the last three hidden layers of Mixtral-base-model with the transcript-so-far as input, along with the fraction of the episode remaining. The agent has no discount factor.

This allows us to evaluate whether KL regularization can produce good results from an imperfect reward function that is plausibly correlated with good outcomes under the state distribution induced by the base policy, but like many reward functions, not something we truly want maximized.

\begin{figure*}[h]
    \centering
    % \includegraphics[width=\linewidth]{kl_leg_.png}
    % \includegraphics[width=\linewidth]{kl_10.png}
    % \includegraphics[width=\linewidth]{kl_20.png}
    \begin{minipage}[b]{0.49\linewidth}
    % \centering
    \includegraphics[width=\linewidth]{kl_leg_.png}
    
    \phantom{.}
    
    \includegraphics[width=\linewidth]{kl_10.png}
    \end{minipage}
    \includegraphics[width=0.49\linewidth]{kl_20.png}
    \caption{Transcripts. Total KL budget $\kl_{\textrm{whole episode}}(\textrm{agent}||\textrm{Mixtral-base-model})$ is 10 nats (left) or 20 nats (right), with color representing per-token KL cost. Starting transcript and student responses are in gray. The agent playing the teacher pays an ``upfront'' KL cost to latch onto the simple pattern of mutual silence, which exploits the reward model without much further KL penalty. The three largest per-token KL-divergences are shown in footnotes. ``[\textbackslash n]'' is for visualizing the KL costs of newline tokens. Transcripts were not selected for maximal ``representativeness''; they were the first we looked at, although we might have picked different ones if they were especially unusual. (It is hard to display the unusual characters that appear after the end token ``</s>'', but the episode does continue to a total of 256 tokens).}
    \label{fig:transcripts}
\end{figure*}

Like cutting-edge RL-finetuned language models \citep{ouyang2022training,stiennon2020learning,jaques2019way}, our agent is trained with proximal policy optimization (PPO) with KL regularization of the form $\kl($proposed $||$ base$)$. That work adds a constant KL penalty per token, but we had difficulty tuning this constant---in our attempts, when the agent discovers a sufficiently high-reward strategy, the fixed KL penalty becomes swamped and ignored, and if the KL penalty is increased to a level where it can stop that, the agent never gets off the ground. So we opted for an implementation of a KL constraint that is more robust than industry practice: we design a policy architecture that ensures that the KL divergence to the base policy is less than or equal to a scalar which is \textit{input} to the network (we construct a new differentiable PyTorch operation for this, available at \url{https://github.com/mkc1000/kl-fixed-mixture/}). This allows us to provide the agent with a fixed KL ``budget'' for the episode. We increase this budget gradually during training to its ultimate value. We ran three budget-20 experiments. We ran four budget-10 experiments, because in one of the experiments, the agent didn't learn to get nearly as much reward as in the other experiments; we discarded that agent as insufficiently optimized. See Appendix \ref{sec:training} for more details of the training process and architecture, which includes running 64 copies of the agent-environment loop in parallel on two A100-SXM4-80GBs. Code is available at \url{https://github.com/mkc1000/kl_reg_paper}.

\textbf{Experimental Results} \textit{Both Theorem \ref{thm:novel} and the experiments here demonstrate that} $\kl($simple, optimal, not-human-like-at-all policy $||$ predictive model of human demonstrator$)$ \textit{can be quite small}. By ``simple,'' we mean easy to represent. The result of the training process is clear just from looking at transcripts in Figure \ref{fig:transcripts}.
The color of each token represents the per-timestep $\kl($RL policy $||$ base policy$)$ for that action.
With a total KL budget of 20 nats, it can spend enough of its KL budget up front to latch onto the simple but initially unlikely policy of simply saying nothing at all. (An empty reply from the student has neutral sentiment and a reward of 0.5). The policy constructed in the proof of Theorem \ref{thm:novel} also incurs an upfront KL cost for ``switching'' to simple behavior, whereafter the KL cost incurred is minimal. Additionally, the learned budget-20 policy switches from double-spacing to single-spacing to fit more rewards in, again incurring basically a one-time KL cost.
With a total KL budget of 10 nats, the RL agent cannot afford to switch to single-spacing, and it cannot force the policy to ensure empty responses, but it still spends almost all its KL budget switching to that regime, with moderate success.
We also observe this effect in Figure \ref{fig:klemptyplots}.

\begin{figure}[h]
    \includegraphics[width=0.475\linewidth]{kl_quantiles_10_tall.png}
    \hfill
    \includegraphics[width=0.485\linewidth]{kl_quantiles_20_tall.png}
    \caption{How much KL-budget is spent on empty responses. The 25\textsuperscript{th}, 50\textsuperscript{th}, and 75\textsuperscript{th} percentiles are shown in blue, orange, and green. A large fraction of the total cost is incurred in the first few responses. y-axis is square-root-scaled.}
    \label{fig:klemptyplots}
\end{figure}

\begin{figure}[h]
    \includegraphics[width=0.496\linewidth]
    {empty_frac_hist_tall.png}
    \hfill
    \includegraphics[width=0.48\linewidth]{empty_frac_percentiles_tall.png}
    \caption{In a random episode, what fraction of teacher responses are empty? Left: histogram, with budget-10 above and budget-20 below; right: percentiles of the distribution.
    Observe that the red and blue curves have the same average per-token KL divergence.
    }
    \label{fig:hist}
\end{figure}

Let's review the relation between the theory and the empirical findings so far. The idea for the proof of Theorem \ref{thm:novel} is that \textbf{(1)} a Bayesian imitator must assign meaningful credence to actions the demonstrator would in fact never take, because it doesn't know enough to rule them out; \textbf{(2)} the RL agent can exploit or amplify this credence as the basis for its policy; \textbf{(3)} nearly-reward-maximizing policies have a short description length (so they are ``simple''); and \textbf{(4)} a Bayesian imitator should be especially reluctant to rule out simple behaviors from the demonstrator, especially in novel settings.
The simple behavior we observe from the RL-finetuned language models---preferring empty responses---is likely reward-optimal,
but it is not simple \textit{by virtue of} its optimality for this
sentiment-based
reward function. So we have not empirically verified \textbf{(3)}. But we have verified that the rest of the argument can be exhibited in practice: observe how the RL agent redirects the imitative base policy to a simple policy, which is the critical reason Theorem \ref{thm:novel} holds. The small KL cost required to \textit{remain} silent, visible in Figure \ref{fig:klemptyplots}, affirms how successful the redirection is. The experiments are also consistent with the motivation of our formal results: very-high-reward policies are often bad and worth avoiding; in our experiments, the very-high-reward policy treats the student with callous-seeming silence.

Stepping back, note that $e^{10} \approx 22026$. It does not seem plausible to us that even 1/22,000 ``conversations collected for training purposes'' would have a teacher repeatedly saying nothing in response to statements like, ``I didn't want to bother you.'' So we should guess that $\kl(\textrm{agent}||\textrm{data-generating process}) > 10$ even while $\kl(\textrm{agent}||\textrm{base model}) \leq 10$. We offer an explanation for this: non-demonstrator-like behaviors are easily exhibited by an imitator as long as those behaviors are simple. And while such simple behaviors are fairly unlikely to appear when sampling directly from the imitator, an RL agent can benefit from seeking them out.

Additionally, we show that increasing the length of the chat, keeping the total KL budget constant (thereby decreasing the per-token KL-divergence) makes the divergence from the base policy \textit{more} dramatic, if it changes at all. Hopefully our presentation makes this seem like an obvious point---more of the transcript occurs after the switch to the simple behavior---but consider an argument for the opposite that might have sounded plausible. ``The learned policy will look more different from the base policy to the extent there is a higher \textit{per-token} KL divergence; a longer chat would increase the number of noticeable differences, but not their frequency.'' But Figure \ref{fig:hist} shows that in longer episodes, empty responses are about equally frequent in the budget-10 case, and more frequent in the budget-20 case, not just more numerous. This is another indication that RL agents can use a KL budget to permanently derail a standard base model. Practitioners finetuning language models should think in terms of total KL-divergence instead of per-token KL-divergence.

So even a fairly tight KL constraint is not enough to stop RL-finetuning from making the teacher's behavior worse and much simpler. When GPT3.5-turbo judged pairs of transcripts generated by the base model, the budget-10 agent, and the budget-20 agent, the less optimized agent was usually judged ``better'' and ``more complex/unpredictable''. Numerical results establishing this appear in Appendix \ref{sec:table}. The simple and bad behavior that we observe is no doubt caused by the faultiness of the reward function, but our main empirical contribution is that such behavior is not stopped by the KL constraint.

\section{Pessimistic Bayesian base policy that asks for help}

\citet{cohen2022fully} developed a theoretical variant of Bayesian imitation that is ``pessimistic'', and using that as a base policy instead of a Bayesian imitator avoids the problem presented in Theorem \ref{thm:novel}. \citepos{cohen2022fully} (intractable) imitator is defined as follows, with $\M$, $\nu$, and $w$ as defined above.
First we define the set of semi-distributions with a posterior weight at least $\alpha$ times the sum of the posterior weights of semi-distributions that are at least as likely as it. And then we define the imitator.
\begin{definition}[Top set]
    Of all $\nu \in \M$, let $\nu^n_{x_{<t}}$ be the one with the $n$\textsuperscript{th} largest posterior weight $w(\nu | x_{<t})$, breaking ties arbitrarily. And for $\alpha \in (0, 1]$, let
\begin{equation*}
    \resizebox{\linewidth}{!}{$ % 0.91\linewidth leaves space for an eqn number
            \displaystyle
    \M^\alpha_{x_{<t}} := \{\nu^n_{x_{<t}} \in \M : w(\nu^n_{x_{<t}} | x_{<t}) \geq \alpha \sum_{m \leq n} w(\nu^m_{x_{<t}} | x_{<t})\}
    $}
\end{equation*}
\end{definition}
\begin{definition}[Pessimistic Bayesian imitator]
    % \begin{equation*}
    %     \nu_\alpha(x | x_{<t}) := \min_{\nu' \in \M^\alpha_{x_{<t}}} \nu'(x | x_{<t})
    % \end{equation*}
    The imitator $\nu_\alpha$ is defined: $\nu_\alpha(x | x_{<t}) := \min_{\nu' \in \M^\alpha_{x_{<t}}} \nu'(x | x_{<t})$.
\end{definition}
Note that $\nu_\alpha$ is in general a probability \textit{semi}-distribution even if all $\nu$ are true probability distributions, since the $\nu_\alpha$ probabilities will sum to less than 1 if there is any disagreement among the $\nu \in \M^\alpha_{x_{<t}}$. \citet{cohen2022fully} study this distribution in the context of active imitation learning, and they examine the setting where the imitator asks for help with the remaining $\nu_\alpha$-probability.

Assume the data $x_{<k}$ is sampled from a true probability distribution $\tau$, and $\tau \in \M$. $\tau$ samples actions from the true demonstrator distribution. Then we have

\begin{theorem}[\citet{cohen2022fully} Theorem 2]
    For all $\delta > 0$, if $\alpha < \delta w(\tau)$, then with prob. at least $1-\delta$, $\forall t \ \tau \in \M^\alpha_{x_{<t}}$.
\end{theorem}

Assuming the high probability event that $\forall t \ \tau \in \M^\alpha_{x_{<t}}$,

\begin{theorem}[Tight KL constraint with approximate imitator]
    For any budget $b$,
    \begin{equation*}
        \{\pi : \kl_{x_{<2t}, m}(\pi || \nu_\alpha) \leq b\} \subseteq \{\pi : \kl_{x_{<2t}, m}(\pi || \tau) \leq b\}
    \end{equation*}
\end{theorem}
\begin{proof}
    $\nu_\alpha(x | x_{<t}) = \min_{\nu' \in \M^\alpha_{x_{<t}}} \nu'(x | x_{<t}) \leq \tau(x | x_{<t})$,
    % because $\tau \in \M^\alpha_{x_{<t}}$,
    so $\kl(\pi || \nu_{\alpha}) \geq \kl(\pi || \tau)$.
\end{proof}
Therefore, for sufficiently small $\alpha$, $\kl$-regularization using the pessimistic Bayesian imitator guarantees regularization at least as strong as if using the trusted policy itself (the demonstrator) for regularization.
Note, in particular, that if $\tau \in \M^\alpha_{x_{<t}}$, and $\tau(x | x_{<t}) = 0$, then $\nu_\alpha(x | x_{<t}) = 0$, so any policy with finite $\kl$-divergence from $\nu_\alpha$ will also assign zero probability to $x$.

The downside is that there may be no policy with small $\kl$ divergence to the semi-distribution $\nu_\alpha$. In an extreme case, $\nu_\alpha$ could assign zero probability to every outcome, and so any policy would have infinite $\kl$ divergence from it. Therefore, just as \citepos{cohen2022fully} imitation learner does not pick an action in some circumstances, we should allow an optimizer that is $\kl$-regularized to a pessimistic Bayesian imitator to refuse to pick an action if need be, making the optimizer a probability semi-distribution, rather than a true probability distribution. We can define the behavior of $U_m$ on unfinished sequences (resulting from no action choice somewhere along the line) however we like; if $U_m = 0$ for any such interrupted sequences, that would of course encourage the optimizer to pick an action whenever possible, subject to its $\kl$ constraint. Ideally, if human demonstrators are on hand, the optimizer should ask for help whenever it doesn't pick its own action. The ongoing potential need for human oversight may be a significant drawback, but \citet{cohen2022fully} give an encouraging result about the rate at which the ask-for-help probability goes to 0: the sum over infinite time of the cube of the ask-for-help probability is finite \citep[Thm 1]{cohen2022fully}. \citepos{cohen2022fully} agent is certainly not the only one that asks for help under uncertainty, but it is the only one that has been shown to satisfy $\nu_\alpha(x | x_{<t}) \leq \tau(x | x_{<t})$ with high probability---the critical result we use.

We contend that this is the way that $\kl$ regularization should be done, if we are forced to learn a mere approximation of a trusted policy that we would ideally regularize to. Regularizing to a full Bayesian posterior distribution is less robust, because the optimizer can seize on esoteric possibilities that a fully Bayesian imitator is not confident enough to categorically exclude. Roughly, KL regularization to a Bayesian imitator implements the principle, ``Don't do anything [that you know] I would never do'', whereas KL regularization to a pessimistic Bayesian imitator implements the principle, ``Don't do anything I might never do''.

\section{Conclusion and limitations}

A key limitation with our positive results is that we cannot provide empirical findings, because it is an open question how to tractably approximate a pessimistic Bayesian imitative base model.
There are high-quality, off-the-shelf cross-entropy-minimizing imitators like Mixtral, but for tractable pessimistic Bayesian imitation, some new ideas may be needed. There certainly are not any state-of-the-art language models trained in a way that reflects this idea. We hope this work provides motivation for a major industry effort to produce one.
% We hope this work provides motivation for a major industry effort to do so.
Using an ensemble of models to approximate $\M^\alpha_{x_{<t}}$ may be a step in the right direction, but it appears challenging to ensure that the ensemble covers all the relevant modes of the posterior.

A second key limitation with our positive result is that any KL-regularization to avoid radically inhuman behavior could limit the potential of superhuman intelligence. This paper has no roadmap to A+ performance, only to non-catastrophic, decently-superhuman performance.
And a final key limitation is that our agent sometimes has to ask for help instead of acting.

The main limitation of our negative results is that they regard an unrealistic algorithm---Solomonoff Induction. However, Solomonoff induction is simply a formalism for careful and open-minded probabilistic reasoning; if something goes wrong in that setting, we should be wary of the same in increasingly careful and open-minded machine learning systems. Our empirical results do not directly validate the theory, since both the base model and the RL-finetuning process are too weak, but we validate core components of the theory: KL-regularized RL-finetuning will tend to amplify simple behaviors from an imitative base model rather than demonstrator-like behaviors. This helps explain the overoptimization phenomenon quantified by \citet{gao2023scaling}.

Excitingly, we offer theoretical results that provide a direction to a solution: if \citepos{cohen2022fully} pessimistic online imitation learner could be faithfully approximated, and if the demonstrator(s) never attempt to do $X$, then KL regularization to such a policy could solve the problem of how to prevent superhuman planning agents from doing $X$.

% \section*{Reproducibility statement}

% The code to produce the experimental results is provided in the supplementary material, and the code is described in Section \ref{sec:empirical} and in greater detail in Appendix \ref{sec:training}. Complete proofs not in the main body of the paper are provided in the appendix.

\bibliography{cohen}

\newpage
\appendix
\section{Solomonoff Induction} \label{sec:solom}

Solomonoff Induction \citep{solomonoff_1964} is Bayesian sequence prediction with a special model class $\M$ and a special prior $w$.\footnote{Solomonoff Induction has been defined in multiple ways which all share the key properties \citep{Hutter:04uaibook}. Our precise construction of Solomonoff Induction may be novel, but we believe this construction makes its properties most clear.} Let $P$ be the set of all programs which output an element of $\X$ and which accept two inputs: a finite string $\in \X^*$ and an infinite binary string $\in \{0, 1\}^\infty$. (Note that a program will not necessarily read every bit from the infinite binary string.) For each program $p \in P$, we define a semi-measure $\nu = f(p)$ as follows: let $\nu(x | x_{<t})$ be the probability that the program $p$ outputs $x$ when it receives $x_{<t}$ as an input, along with an infinite binary string where each bit is sampled from a Bernoulli$(1/2)$ distribution. Note that $\nu$ may not be a probability distribution, if there are some inputs on which $p$ does not halt, but it will always be a probability semi-distribution. So let $\M = \{f(p) : p \in P\}$. Since $P$ is countable, so is $\M$. A notable feature of Solomonoff Induction is that $\M$ is equal to the set of all probability semi-distributions that are ``lower semi-computable''; this means that for all $x_{<t} \in \X^*$ and all $x \in \X$, there exists a program $p$, such that $\lim_{i \to \infty} p(i, x_{<t}, x) = \nu(x | x_{<t})$ and $p(i+1, x_{<t}, x) \geq p(i, x_{<t}, x)$. Replacing the $\geq$ with a $\leq$ gives the definition of upper semi-computable.
 
\begin{proposition}[Lower Semi-computability]
$\M$ is the set of all lower semi-computable semi-distributions over $\X$ given $x_{<t} \in \X^*$.
\end{proposition}
\begin{proof}
    First, we show that all $\nu \in \M$ are lower semi-computable. Let $p$ be the program that generates $\nu$. We define the behavior of program $p'$ on inputs $i$, $x_{<t}$, and $x$. On input $i$, let program $p'$ execute the following computations in sequence for all bit strings of length $i$: it simulates program $p$ with the input $x_{<t}$ and with the bit string of length $i$ in question, except if program $p$ would read more than $i$ bits from the random bit string, it halts instead, and if it would run for more than $i$ computation steps, it halts instead. For each of those $2^i$ computations, program $p'$ checks whether $x$ was output, keeps count of how many times it was, divides by $2^i$, and outputs this number. It is elementary to show that $\lim_{i \to \infty} p'(i, x_{<t}, x) = \nu(x | x_{<t})$ and that $p'(i+1, x_{<t}, x) \geq p'(i, x_{<t}, x)$.

    Next, we show that all lower semi-computable semi-distributions appear in $\M$. Let $p'$ be the program which is witness to the semi-distribution $\nu$'s lower semi-computability. On input $x_{<t}$, let program $p$ proceed as follows. Starting with $i = 1$, program $p$ executes $p'(i, x_{<t}, x)$ for all $x \in \X$, sequentially. This produces a semi-distribution over $\X$. Then, using random bits from its input bit string, it samples from that semi-distribution, and halts if it successfully samples. Now, the following repeats forever. If no sample was selected (because the semi-distribution summed to $y < 1$), the program increments $i$, and it executes $p'(i, x_{<t}, x)$ for all $x \in \X$, sequentially. Then for each $x$, it computes $(p'(i, x_{<t}, x) - p'(i-1, x_{<t}, x))/(1-y)$, which is a semi-distribution. Using random bits from its input bit string, it samples from that semi-distribution, and halts if it successfully samples. [End of loop]. Again, it is elementary to show that $p$ samples from the semi-distribution defined by $p'$, and since this program has the right input/output behavior, it appears in $P$.
\end{proof}

Now we specify the prior weight function $w$. Consider a universal binary programming language $\mathcal{L}$, which is a ``prefix-free'' subset of $\{0, 1\}^*$. Prefix-free means that you can tell when a program has ended: if the bits composing $x \in \mathcal{L}$ match the initial bits of $y \in \{0, 1\}^*$, then $y \notin \mathcal{L}$. Such a language is still capable of encoding countably many different programs. For convenience, we also require that for any infinite binary string, $\mathcal{L}$ contains an element which is a prefix of that string, making $\mathcal{L}$ ``complete''. We define a prior probability distribution over program strings $\mathcal{L}$, which results in the same prior probability distribution over programs, which results in the same prior probability distribution over semi-computable semi-distributions $\M$. For $s \in \mathcal{L}$, this prior probability $w(s) = 2^{-\ell(s)}$, where $\ell$ is the length of the string. Because $\mathcal{L}$ is prefix-free and complete, $\sum_{s \in \mathcal{L}} w(s) = 1$ \citep{kraft1949device,de2011luckiness}. This completes the definition of Solomonoff Induction; it is sequence prediction using the Bayes mixture semi-distribution $\xi$, with the above definitions of $\M$ and $w$.

\begin{proposition}[Any-time Computability of $\xi$] \label{prop:anytimexi}
    $\xi(x | x_{<t})$ is any-time computable: there exists a program which, accepting an argument $i$, computes $\hat{\xi}_i(x | x_{<t})$, having the property that $\lim_{i \to \infty} \hat{\xi}_i(x | x_{<t}) = \xi(x | x_{<t})$. Moreover, $(\hat{\xi}_i)_{i \in \mathbb{N}}$ can be constructed so that each one is a probability semi-distribution.
\end{proposition}

% \begin{proof}
%     Recall $\xi(x | x_{<t}) = \sum_{\nu \in \M} w(\nu | x_{<t}) \nu(x | x_{<t}) = \frac{\sum_{\nu \in \M} w(\nu) \nu(x_{<t}) \nu(x | x_{<t})}{\sum_{\nu \in \M} w(\nu) \nu(x_{<t})}$. Let $\hat{\xi}_i$ be defined as follows.
% \end{proof}


\begin{proof}
    $\xi(x | x_{<t}) = \sum_{\nu \in \M} w(\nu | x_{<t}) \nu(x | x_{<t}) = \frac{\sum_{\nu \in \M} w(\nu) \nu(x_{<t}) \nu(x | x_{<t})}{\sum_{\nu \in \M} w(\nu) \nu(x_{<t})}$. All $\nu(x | x_{<t})$ and $\nu(x_{<t})$ are both lower semi-computable, so using a sequence of computable estimators for each term gives a sequence of computable estimators that approaches the true value. (Note that the estimates are not monotonically increasing because there are lower semi-computable terms in the denominator, so $\xi$ is not lower semi-computable itself).

    For fixed estimates of $\nu(x | x_{<t})$ and $\nu(x_{<t})$, we have a linear combination over various $\nu$'s of $\nu(x | x_{<t})$, with the coefficients summing to one. And because each $\nu(x | x_{<t})$ is lower semi-computable, the estimate will be less than the true value. Therefore, since $\nu(x | x_{<t})$ is a probability semi-distribution, the estimate will be as well, so $\xi$ can be approximated by a sequence of probability semi-distributions.
\end{proof}

\section{Optimizer Regularization}

We now define optimizers, and what it means for an optimizer to be regularized to a probability semi-distribution. First, we show that the value of a policy is lower semi-computable. Then we show that such optimizers exist.

\begin{proposition}[Lower semi-computable value] \label{prop:lscvalue}
    If the policy and environment $\pi$ and $\nu$ are lower semi-computable probability semi-distributions, $V^\pi_{\nu, U_m}$ is lower semi-computable.
\end{proposition}

\begin{proof}
    We begin by defining dovetailing tree search (DTS), for evaluating the outputs of a tree of different computations, or more precisely, computations which, when given a finite binary string as input, have three possible outcomes: halt, do not halt, or require an additional bit. DTS gives an any-time algorithm that produces a list of the halting binary strings with their corresponding outputs, and every such binary string and output will eventually be added to this list.

    DTS maintains a queue of pairs (computation state, binary string), starting with just (the initial computation state, the empty binary string). It cycles through the queue, executing one computation step per computation state, and if the computation ever requires an additional bit, it adds a copy of (computation state, binary string) to the queue, and adds a 0 to the end of one string, and a 1 to the end of the other. If any computation reaches a halt state, it is removed from the queue, and the associated binary string and the associated output is added to the list of outputs.

    Collectively, $\nu$ and $\pi$ define a lower semi-computable semi-distribution, where $\nu$ is used for the even characters, and $\pi$ is used for the odd ones. Call this probability semi-distribution $\rho$, and recall the construction of the lower semi-computable semi-distributions defined in $\M$. To have one of the programs in $\M$ sample a long sequence of characters, every time the program would output a character, add that character to the input, and continue on that input. With such a program for sampling sequences from $\rho$ by reading random bits from an input bit string, we can compute $V^\pi_{\nu, U_m}$ by running DTS on the bit string. Each time DTS outputs a bit string for which $\rho$ outputs a sequence in $\X^{2m}$, we add to the estimate of the value the probability of that bit string ($=2^{-\ell(\textrm{bit string})}$) times the utility of the sequence in $\X^{2m}$. This approaches the true value as DTS runs for longer, and the value never decreases because $U_m$ is non-negative.
\end{proof}

An optimizer is an any-time program for computing actions (perhaps stochastically) whose value approaches the optimal value, as it runs for longer. The optimal value takes the following form:
\begin{align} \label{eqn:optvalue}
    V^*_{\nu, U_m}&(x_{<2t-1}) = \max_{a_t \in \X} \E_{o_t \sim \nu(\cdot | a_1o_1...a_t)} \\
    &\max_{a_{t+1} \in \X} \E_{o_{t+1} \sim \nu(\cdot | a_1o_1...a_{t+1})} ... \\
    &\max_{a_{m} \in \X} \E_{o_{m} \sim \nu(\cdot | a_1o_1...a_{m})} U_m(a_1o_1...a_mo_m)
\end{align}

\begin{definition}[Optimizer]
    For an environment $\nu$, a utility function $U_m$, and a computation quantity $c$, an optimizer is a computable policy $\pi_{c, \nu, U_m}$ for which $\lim_{c \to \infty} V^{\pi_{c, \nu, U_m}}_{\nu, U_m} = V^*_{\nu, U_m}$.
\end{definition}

\begin{proposition}[Optimizers exist] \label{prop:opt}
    For any lower semi-computable semi-distribution $\nu$ (the environment), any $m$, and any computable utility function $U_m$, there exists an optimizer.
\end{proposition}

\begin{proof}
    We can construct the optimizer using the algorithm presented in the proof of Proposition \ref{prop:lscvalue}, with $\pi$ being the uniform random policy. The optimizer can then estimate Equation \ref{eqn:optvalue} using the outputs of DTS for lower bounds on the probabilities underlying the expectations. The optimizer then keeps track of the actions that are responsible for achieving the maxima in Equation \ref{eqn:optvalue}, and whenever ``time is up'' and it has to produce an output, it outputs the action which maximizes the first $\max$ in Equation \ref{eqn:optvalue}.

    As the optimizer runs for longer, the lower-bounds on the expectations approach the truth, and the value of the action selected approaches the optimal value (even if the actual choice of action oscillates infinitely often).
\end{proof}

For the setting where odd characters are actions, originating from a different process than the even characters, observations, we redefine $\xi$ as follows \citep{Hutter:23selfaixi}. We have two prior distributions over $\nu \in \M$, $w_a$ and $w_o$, and these are both identical to the prior distribution defined before. But the posteriors are different: $w_a(\nu | x_{<t}) :\propto w_a(\nu)\prod_{k \in \{1, 3, 5, ... \} \cap [t-1]} \nu(x_k | x_{<k})$ and $w_o(\nu | x_{<t}) :\propto w_o(\nu)\prod_{k \in \{2, 4, 6, ... \} \cap [t-1]} \nu(x_k | x_{<k})$. And for odd (or even) $t$, $\xi(x | x_{<t}) = \sum_{\nu \in \M} {w_a \atop \textrm{or \ } w_o}(\nu | x_{<t}) \nu(x | x_{<t})$.

This is equivalent to a change in programming language underlying the original definition of $\xi$, and since this language was unspecified, our previous results apply. The programming language now expects a program to be composed of two component programs concatenated together, and the compiler of the program executes the first component program if the input has odd length, and it executes the second component program if the input has even length. We omit a proof that this (re)formulation of $\xi$ is equivalent to what we describe above.

\begin{proposition}[$\xi$-optimizer exists] \label{prop:xiopt}
    For any $m$ and any computable utility function $U_m$, there exists a $\xi$-optimizer.
\end{proposition}

\begin{proof}
    This does not follow immediately from the previous result because $\xi(o_t | a_{\leq t}o_{<t})$ is not, in general, lower semi-computable. $w_o(\nu | a_{\leq t}o_{<t})$ is the quotient of two lower semi-computable values: $\prod_{k < t} \nu(o_k | a_{\leq k}o_{<k})$ is the numerator, and the denominator is the sum over all $\nu$ of such terms.

    However, an unnormalized value function has the same optimum as the value function itself. Let $\xi^{\sm}(o_t | a_{\leq t}o_{<t}) = \sum_{\nu \in \M} w_o(\nu) \left[\prod_{k < t} \nu(o_k | a_{\leq k}o_{<k}) \right] \nu(o_t | a_{\leq t}o_{<t})$. The sum of these ``probabilities'' will typically not come close to 1, but they are proportional to those of $\xi$, so $V^\pi_{\xi, U_m}(x_{<t}) > V^{\pi'}_{\xi, U_m}(x_{<t})$ if and only if $V^\pi_{\xi^{\sm}, U_m}(x_{<t}) > V^{\pi'}_{\xi^{\sm}, U_m}(x_{<t})$. Finally, observe that $\xi^{\sm}$ is lower semi-computable because it is a product of lower semi-computable terms, so by Proposition \ref{prop:opt}, a $\xi^{\sm}$-optimizer exists, which is also a $\xi$-optimizer.
\end{proof}

% Optimizer exists for $\xi$ as well, because value from empty string is lower semi-computable. For optimizer conditioned on non-empty string, just work with un-normalized value from that branch of the decision tree.
% Define it as \max_{pi \in \M^{\setminus c}} V^\pi_{\nu, U_m}?

Now we define a $\kl$-regularized optimizer. First, let $\pi(a_{k:m} | x_{<2k}o_{k:m}) := \prod_{t = k}^m \pi(a_t | x_{<2k}a_k o_k ... a_{t-1}o_{t-1})$. (So note that $a_t$ is not in fact conditioned on $o_{t+1}$.)
% First, let ${\pi \atop \nu}$ denote a semi-distribution over $\X^\infty$, where the odd elements are sampled from the semi-distribution $\pi$ and the even elements are sampled from the semi-distribution $\nu$.
% Second, recalling that $\nu(x_{<t}) = \prod_{k \leq t} \nu(x_k | x_{<k})$, let $\kl_{}$
\begin{definition}[$\kl$-regularized optimizer]
    For any lower semi-computable semi-distributions $\nu$ and $\rho$, a horizon $m$, a utility function $U_m$, a starting string $x_{<2k}$, and a tolerance $\delta$, a $\kl$-regularized optimizer is an any-time program $\pi^\delta_c$ for computing actions (perhaps stochastically) for which the following holds. First, letting
    \begin{multline}
        \kl_{x_{<2k}, m}(\pi^\delta_c||\rho) := \max_{o_{k:m} \in \X^{m-k+1}} \sum_{a_{k:m} \in \X^{m-k+1}} \\
        \pi^\delta_c(a_{k:m} | x_{<2k}o_{k:m})
        \log \frac{\pi^\delta_c(a_{k:m} | x_{<2k}o_{k:m})}{\rho(a_{k:m} | x_{<2k}o_{k:m})},
    \end{multline}
    $\kl_{x_{<2k}, m}(\pi^\delta_c||\rho) < \delta$,
    and second, $V^{\pi^\delta_c}_\nu$ approaches the optimal value subject to that constraint, as $c \to \infty$.
\end{definition}

% Why is the $\kl$ divergence a robust measurement of how much one policy differs from another, and why the $\kl$ \textit{from} the base policy \textit{to} the proposed policy (i.e. with the base policy on the right)? If we're worried about bad things happening, but we trust that the base policy would not let them happen, then the following property of the $\kl$ divergence is important: if $\rho$ assigns zero probability to any event, while $\pi$ assigns positive probability to it, then $\kl(\pi || \rho) = \infty$. This applies equally to events that resolve over multiple timesteps. $\kl$ regularization is also robust to the possibility that we are only confident $\rho$ assigns very small probability to very bad events, if not zero. If $\rho$ assigns a tiny probability to a very bad event, a $\kl$ bound constrains $\pi$ to assign a probability that is within a multiplicative constant of $\rho$'s probability. If $\pi$ is regularized to $\rho$ with, say, $L_2$ distance, then $\pi$ can easily assign appreciable probability to events with 0 probability according to $\rho$, and there may have been a very good reason that $\rho$ assigned 0 probability to it.

\begin{proposition}[$\kl$-regularized optimizers exist] \label{prop:klexist}
    For any lower semi-computable semi-distributions $\nu$ and $\rho$, any $m$, any computable utility function $U_m$, any starting string $x_{<2k}$, and any tolerance $\delta \geq 0$, there exists a $\kl$-regularized optimizer.
\end{proposition}

\begin{proof}
    First, we show that for any computable probability distribution $\pi$, and any lower semi-computable semi-distribution $\rho$, $\kl_{x_{<2k}, m}(\pi || \rho)$ is upper semi-computable, and therefore the set of probability distributions $\pi$ which have bounded $\kl$ divergence from $\rho$ is computably enumerable.
    % FROM PRIOR DEFINITION OF KL-REGULARIZED -- First, note that we need to check $\kl(\rho(\cdot | x_{<t}) || \pi(\cdot | x_{<t}))$ on finitely many $x_{<t}$, because $\sum_{t \leq 2m} |\X^{t}|$ is finite. So if we can show that $\kl(\rho || \pi)$ is upper semi-computable regardless of the input it is conditioned on, then it follows that the sum over the $\kl$-divergences when conditioned on all prefixes of that input is also upper semi-computable, since those computations can all be dovetailed.

    Omitting the $x_{<2k}$ and the $o_{k:m}$ that all distributions are conditioned on, note that $\kl(\pi || \rho)$, which equals $\sum_{z \in \X^{m-k+1}} \pi(z) \log \frac{\pi(z)}{\rho(z)}$, is monotonically decreasing in $\rho(z)$ for any $z$. Since $\pi(z)$ is computable and $\rho(z)$ is lower semi-computable, $\pi(z) \log \frac{\pi(z)}{\rho(z)}$ is upper semi-computable.

    By dovetailing (repeatedly switching between ongoing computations, executing one step at a time) the computation over all possible $\pi$ (countably many), we can admit any semi-distribution $\pi$ to a list of viable candidates whenever the estimate of the $\kl$-divergence from $\rho$ falls below $\delta$. Since the $\kl$ estimates never increase, once a semi-distribution $\pi$ is added to the list, it need never be removed. And every viable policy will eventually be added to the list because the $\kl$ estimates approach the truth in the limit of infinite computation, and $[0, \delta)$ is open on the right.

    Dovetailing over all semi-distributions $\pi$ on the list of viable candidates (and adding in the new ones as they get added to the list), we simultaneously update estimates of the value of each one in the given environment $\nu$, recalling that $V^\pi_{\nu, U_m}$ is lower semi-computable (Proposition \ref{prop:lscvalue}). When the computation budget of the any-time optimizer is reached, it samples an action from its estimate of the semi-distribution $\pi$ which is (so far) estimated to be of highest value. (It will need to have a running estimate of the semi-distribution $\pi$ in order to estimate its value).
\end{proof}

\section{Regularizing to an Approximate Solomonoff Inductor}

Let $\xi$ be the Solomonoff Bayes mixture probability semi-distribution defined in Section \ref{sec:solom}. $\xi$ is not computable, but we can do KL regularization to an approximation of $\xi$. Let $\hat{\xi}_i$ be a semi-distribution and a computable estimate of $\xi$, with $\lim_{i \to \infty} \hat{\xi}_i = \xi$. (The existence of this is established by Proposition \ref{prop:anytimexi}). $\hat{\xi}_i$ can be used as the base predictive model (taking the place of $\rho$ in the definition of $\kl$-regularized optimizers). We fix $U_m$ to an arbitrary utility function for the remainder of this work, and drop it from the notation. For a given $\delta$ and a given $i$, let $\pi^\delta_{i, c}$ be the $\kl$-regularized optimizer using $\hat{\xi}_i$ for the $\kl$ constraint, and using $\xi$ to optimize with respect to (taking the place of $\nu$ from the definition). Let this policy approach the optimal value, subject to the constraint, as $c \to \infty$; the existence of $\pi^\delta_{i, c}$ is established by Proposition \ref{prop:klexist}. When this policy is conditioned on $x_{<2t}$ for $t \geq k$, and with $a_{k:t}$ sampled from $\pi^\delta_{i, c}$ itself, we can think of $\pi^\delta_{i, c}$ as an optimizer that is regularized to an approximate Bayesian estimate of a \textit{human policy}, given the origin of $x_{<2k}$.

% The motivation for this regularization is the possibility (or even probability) that $U_m$ is overly simplistic. As \citet{zhuang2020consequences} show, in a resource-constrained world, optimization of one utility function often implies very low values according to richer utility functions. So we assume a setting where we have a true, but unknown, set of ``bad'' outcomes $B$, and if $V^*_{\xi, U_m} - V^\pi_{\xi, U_m} < \varepsilon$, then $\E[B]$ is unacceptably high.

\section{Behavior in unprecedented circumstances} \label{sec:proofthm}

The following theorem establishes that as $c$ and $i$ go to infinity, the constraint on $\pi^\delta_{i, c}$ becomes quite weak in the presence of unprecedented events.

% \begin{theorem}[Little constraint in novel situations] \label{thm:novel}
%     There exists a constant $d$ such that for all $U_m$, and for all $E$, if $E$ is unprecedented and occurs at time $t$, for any $v < V^*_{\xi, U_m}(x_{<2t})$, there exists a policy $\pi$ for which $V^\pi_{\xi, U_m}(x_{<2t}) > v$, and $\kl_{x_{<2t}, m}(\pi || \xi) < d + K(U_m) + K(E) + K(v \xi(x_{<2t}))$.
% \end{theorem}

\thmnovel*

\begin{proof}
    Let $\pi^*_c$ denote an unconstrained optimizer of $U_m$ in the environment $\xi$, which approaches optimality as $c \to \infty$, whose existence is shown by Proposition \ref{prop:xiopt}. As in the proof of Proposition \ref{prop:xiopt}, let $\xi^{\sm}$ be the un-normalized version of $\xi$, which is lower semi-computable: $\xi^{\sm}(o_t | a_{\leq t}o_{<t}) = \sum_{\nu \in \M} w_o(\nu) \left[\prod_{k < t} \nu(o_k | a_{\leq k}o_{<k}) \right] \nu(o_t | a_{\leq t}o_{<t})$. And note that the value according to $\xi$  versus $\xi^{\sm}$ is connected by the normalizing constant: $\xi(x_{<2t}) V^\pi_{\xi, U_m}(x_{<2t}) = V^\pi_{\xi^{\sm}, U_m}(x_{<2t})$. Now, we let $\pi^*_u = \pi^*_c$ where $c$ is set to be the minimal value for which $V^{\pi^*_c}_{\xi^{\sm}, U_m}(x_{<2t})$ exceeds $u$. If $u \geq V^{*}_{\xi^{\sm}, U_m}(x_{<2t})$, then $\pi^*_u$ will not halt, but otherwise, because the value is lower semi-computable, we can increase $c$ until the value reaches at least $u$. Letting $v = u / \xi(x_{<2t})$, observe that $V^{\pi^*_u}_{\xi, U_m}(x_{<2t})$ exceeds $v$, as long as $v < V^{*}_{\xi, U_m}(x_{<2t})$, although it may not be possible to compute $v$ in finite time. So $\pi^*_u$ satisfies the first of the properties promised in the theorem.
    
    We now show that it satisfies the second as well. Recall that $\kl_{x_{<2t}, m}(\pi || \xi)$ only requires evaluating $\xi$ on its predictions for actions, and this takes the form $\xi(a_k | a_{<k} o_{<k}) = \sum_{\nu \in \M} w_a(\nu | a_{<k} o_{<k}) \nu(a_k | a_{<k} o_{<k})$. And it is straightforward to show
    % (which we do in Lemma TODO in Appendix TODO)
    an analogous property for $\xi$'s predictions on longer strings: $\xi(a_{t:m} | a_{<t} o_{<m}) = \sum_{\nu \in \M} w_a(\nu | a_{<t} o_{<t}) \nu(a_{t:m} | a_{<t} o_{<m})$. So we now examine the posterior weights of various models after being conditioned on $a_{<t} o_{<t} \in E$.

    Recall that each $\nu \in \M$ is computed by a corresponding program $s \in \mathcal{L}$. Given the event $E$, the utility function $U_m$, and a target value $u$, we construct, for each $s \in \mathcal{L}$, an $s'_u$ as follows: if, in the input to $s'_u$, $E$ has not happened, execute the program $s$; otherwise compute $\pi^*_u$. Keeping account of the control flow in $s'_u$, we can see there exists a constant $d$ such that $\forall s \ \forall E \ \forall U_m$ and $\forall u$, $s'_u$ has length less than $\ell(s) + K(E) + K(U_m) + K(u) + d$.
    
    Letting $\nu'_u$ be the probability semi-distribution computed by $s'_u$, consider the ratio of prior weights between $\nu$ and $\nu'_u$. Because $w(\nu) = 2^{-\ell(s)}$ for the corresponding program $s$, it follows from the bound on the difference in length between $s$ and $s'_u$ that $w(\nu'_u) / w(\nu) > 2^{-d} 2^{-K(E) - K(U_m) - K(u)}$. The posterior ratio $w(\nu'_u | x_{<2t}) / w(\nu | x_{<2t})$ is the same as the prior ratio, if $E$ happens for the first time at time $t$, because they will have assigned exactly the same probabilities to all characters in $x_{<2t}$. Because the sum over $\nu \in \M$ of the posterior weights must be 1, the sum $\sum_{\nu \in \M} w(\nu'_u | x_{<2t}) > 2^{-d} 2^{-K(E) - K(U_m) - K(u)}$.

    Note by construction that for all $\nu \in \M$, $\nu'_u(a_{t:m} | a_{<t} o_{<m}) = \pi^*_u(a_{t:m} | a_{<t} o_{<m})$. Because all $\nu'_u$ belong to $\M$ for all $\nu \in \M$,
    \begin{align*}
        &\xi(a_{t:m} | a_{<t} o_{<m}) 
        \\
        = &\sum_{\nu \in \M} w_a(\nu | a_{<t} o_{<t}) \nu(a_{t:m} | a_{<t} o_{<m})
        \\
        > &\sum_{\nu \in \M} w_a(\nu'_u | a_{<t} o_{<t}) \nu'_u(a_{t:m} | a_{<t} o_{<m})
        \\
        = &\left[\sum_{\nu \in \M} w_a(\nu'_u | a_{<t} o_{<t})\right] \pi^*_u(a_{t:m} | a_{<t} o_{<m})
        \\
        > &2^{-d - K(E) - K(U_m) - K(u)} \pi^*_u(a_{t:m} | a_{<t} o_{<m})
        \tagaligneq
    \end{align*}

    Finally,
    \begin{align*}
        &\kl_{x_{<2t}, m}(\pi^*_u || \xi)
        \\
        = &\!\!\!\!\max_{o_{t:m} \in \X^{m-t+1}} \sum_{a_{t:m}} \pi^*_u(a_{t:m} | a_{<t} o_{<m}) \log \frac{\pi^*_u(a_{t:m} | a_{<t} o_{<m})}{\xi(a_{t:m} | a_{<t} o_{<m})}
        \\
        < &\sum_{a_{t:m}} \pi^*_u(a_{t:m} | a_{<t} o_{<m}) \log 2^{d + K(E) + K(U_m) + K(u)}
        \\
        = &[d + K(E) + K(U_m) + K(u)]/\log 2
        \tagaligneq
    \end{align*}
    and $u = v \xi(x_{<2t})$. Therefore, $\pi^*_u$ satisfies the theorem.
 \end{proof}

What does Theorem \ref{thm:novel} mean for the optimizer constrained by $\kl_{x_{<2k}, m}(\pi || \hat{\xi}_i)$ for large $i$? If the optimization of $U_m$ does not require urgent action, then one valid strategy for a policy $\pi$ is to wait for an unprecedented event, imitating the base policy $\hat{\xi}_i$ until then, and then start optimizing. The telescoping property of the $\kl$ divergence clarifies the validity of this approach. That is, for $t > k$, $\kl_{x_{<2k}, m}(\pi || \rho) = \kl_{x_{<2k}, t}(\pi || \rho) + \E_{x_{2k:2(t-1)} \sim \pi} \kl_{x_{<2t}, m}(\pi || \rho)$ \citep{Hutter:04uaibook}. So starting with a policy with low $\kl$ divergence from the base policy preserves a ``budget'' for high $\kl$ divergence to be ``spent'' later by switching to a policy with greater divergence from the base policy.

\iffalse
% [Scratch work on a possibly doomed Free Reign Theorem]:

% \begin{theorem}[Free reign] \label{thm:freereign}
%     There exists a constant $d$ such that for all $E$, for all $x_{<2t} \in E$ where $E$ is unprecedented at time $t$, and for all $\varepsilon \geq 0$, if $\delta > d + K(E) + K^-_\varepsilon(V^*_{\xi, U_m}(x_{<2t}))$, then $\lim_{c \to \infty} V^{\pi^\delta_c}_{\xi, U_m}(x_{<2t}) \geq V^*_{\xi, U_m}(x_{<2t}) - \varepsilon$.
% \end{theorem}

\begin{theorem}[Free reign if patient] \label{thm:freereign}
    There exists a constant $d$ such that for all $U_m$, for all $E$, for all $x_{<2t} \in E$ where $E$ is unprecedented at time $t$, and for all $v < V^*_{\xi, U_m}(x_{<2t})$, if $\delta > d + K(U_m) + K(E) + K(v \xi(x_{<2t}))$, then $\lim_{i \to \infty} \lim_{c \to \infty} V^{\pi^\delta_{i, c}}_{\xi, U_m}(x_{<2t}) \geq v$.
\end{theorem}

% Look into \xi_p. Calculates posterior weights at time 2t as if \xi(x_{<2t}) \geq p.

% \begin{lemma}[Monotone Complexity] \label{lem:mon}
%     Recalling that $\xi(x_{<t}) = \prod_{k \leq t} \xi(x_k | x_{<k})$, for constants $c_1$ and $c_2$. 
%     \begin{equation*}
%         c_1 \xi(x_{<t}) \leq 2^{-Km(x_{<t})} \leq c_2 \xi(x_{<t})
%     \end{equation*}
% \end{lemma}
% \begin{proof}
%     TODO.
% \end{proof}

\begin{proof} %[Proof of Theorem \ref{thm:freereign}]
    Let $\pi^*_c$ denote an unconstrained optimizer of $U_m$ in the environment $\xi$, which approaches optimality as $c \to \infty$, whose existence is shown by Proposition \ref{prop:xiopt}. As in the proof of Proposition \ref{prop:xiopt}, let $\xi^{\sm}$ be the un-normalized version of $\xi$, which is lower semi-computable: $\xi^{\sm}(o_t | a_{\leq t}o_{<t}) = \sum_{\nu \in \M} w_o(\nu) \left[\prod_{k < t} \nu(o_k | a_{\leq k}o_{<k}) \right] \nu(o_t | a_{\leq t}o_{<t})$. And note that the value according to $\xi$  versus $\xi^{\sm}$ is connected by the normalizing constant: $\xi(x_{<2t}) V^\pi_{\xi, U_m}(x_{<2t}) = V^\pi_{\xi^{\sm}, U_m}(x_{<2t})$. Now, we let $\pi^*_u = \pi^*_c$ where $c$ is set to be the minimal value for which $V^{\pi^*_c}_{\xi^{\sm}, U_m}(x_{<2t})$ exceeds $u$. If $u \geq V^{*}_{\xi^{\sm}, U_m}(x_{<2t})$, $\pi^*_u$ will not halt, but otherwise, because the value is lower semi-computable, we can increase $c$ until the value reaches $u$. Letting $v = u / \xi(x_{<2t})$, observe that $V^{\pi^*_u}_{\xi, U_m}(x_{<2t})$ exceeds $v$, as long as $v < V^{*}_{\xi, U_m}(x_{<2t})$, although the value of $v$ may not be lower semi-computable.
    
    Recall that $\hat{\xi}_i$ is used for the $\kl$-constraint for the optimizer. We define a policy, calculate the limit of its $\kl$-divergence from $\hat{\xi}_i$, and thereby show that if $\delta$ is larger than that, the optimizer will reach at least as high a value.
    
    Let $\overline{\xi}$ be the probability distribution that is the normalized version of $\xi$. This is limit computable, but not lower semi-computable like $\xi$ is \citep{Hutter:04uaibook}. So let $\overline{\xi}_c$ be a function computable for which $\lim_{c \to \infty} \overline{\xi}_c = \overline{\xi}$. Let $\pi_{c, v}$ (using a different $c$) be the policy which is equal to $\overline{\xi}^c$ if $E$ has not happened, and is equal to $\pi^*_v$ if $E$ has happened.

    We will be bounding the limiting $\kl$-divergence $\lim_{c \to \infty} \kl_{x_{<2k}, m}(\pi_{c, v} || \xi)$, but first, we must examine the composition of $\xi$. Recall that $\xi = \sum_{\nu \in \M} w(\nu) \nu$, and each $\nu \in \M$ is computed by a corresponding program $s \in \mathcal{L}$. Given the event $E$, the utility function $U_m$, and a target value $v$, we construct, for each $s \in \mathcal{L}$, an $s'_v$ as follows: on input $x_{<2t}$, if $E$ has happened, compute $\pi^*_v(\cdot | x_{<2t})$; if $E$ has not happened, sample from $\xi(\cdot | x_{<2t})$; and on input $x_{<2t+1}$, sample from $\xi(\cdot | x_{<2t})$. Keeping account of the control flow in $s'_v$, we can see there exists a constant $d$ such that $\forall s \ \forall E \ \forall U_m \ \forall v$, $s'_v$ has length less than $\ell(s) + K(E) + K(U_m) + K(v) + d$.

    Letting $\nu'_v$ be the probability semi-distribution computed by $s'_v$, observe that by construction $V^{\nu'_v}_\xi(x_{<t}) \geq v$, as long as a) $E$ has just happened for the first time, and b) $V^*_{\xi, U_m}(x_{<t}) > v$ (to ensure that $s'_v$ halts).

    Now, consider the ratio of prior weights between $\nu$ and $\nu'_v$. Because $w(\nu) = 2^{-\ell(s)}$ for the corresponding program $s$, it follows from the bound on the difference in length between $s$ and $s'_v$ that $w(\nu'_v) / w(\nu) > 2^{-d} 2^{-K(E) - K(U_m) - K(v)}$. The posterior ratio $w(\nu'_v | x_{<t}) / w(\nu | x_{<t})$ is the same as the prior ratio, if $E$ happens for the first time at time $t$, because they will have assigned exactly the same probabilities to all characters in $x_{<t}$.

    .
    
    .

    .

    -scraps-

    .
    

    % Because the sum over $\nu \in \M$ of the posterior weights must be 1, the sum $\sum_{\nu \in \M} w(\nu'_v | x_{<t}) > 2^{-d} 2^{-K(E) - K(U_m) - K(v)}$. % I think we need to change posterior w to be a semi-distribution to make \xi lower semi-computable?

    % Recalling that $\xi \in \M$ (Proposition \ref{prop:TODO}), consider the probability semi-distribution $\xi'_v$, also in $\M$.
    
    We can now evaluate $\kl_{x_{<2k}, m}(\pi^*_v || \xi)$, letting $t$ be the first time $E$ happens, if at all, or else $t = m+1$. Fixing an arbitrary $o_{k:m}$,

    \begin{align*}
        \kl_{x_{<2k}, m}(\pi^*_v || \xi) &= \sum_{a_{k:m} \in \X^{m-k+1}} \pi^*_v(a_{k:m} | x_{<2k}o_{k:m}) \log \frac{\pi^*_v(a_{k:m} | x_{<2k}o_{k:m})}{\xi(a_{k:m} | x_{<2k}o_{k:m})}
        \\
        % &= \sum_{x_{k:m} \in \X^{m-k+1}} \widetilde{\xi}(x_{k:m} | x_{<k}) \log \frac{\widetilde{\xi}(x_{k:m} | x_{<k})}{{\xi'_v \atop \widetilde{\xi}}(x_{k:m} | x_{<k})} \frac{{\xi \atop \widetilde{\xi}}(x_{k:m} | x_{<k})}{{\xi \atop \widetilde{\xi}}(x_{k:m} | x_{<k})}
        % \\
        % &= \kl_{x_{<k}, m}(\widetilde{\xi} || \xi) +  \sum_{x_{k:m} \in \X^{m-k+1}} \widetilde{\xi}(x_{k:m} | x_{<k}) \log \frac{{\xi \atop \widetilde{\xi}}(x_{k:m} | x_{<k})}{{\xi'_v \atop \widetilde{\xi}}(x_{k:m} | x_{<k})}
        % \\
        % &\equal^{(a)} \kl_{x_{<k}, m}(\widetilde{\xi} || \xi) + \sum_{x_{k:m} \in \X^{m-k+1}} \widetilde{\xi}(x_{k:m} | x_{<k}) \log \frac{{\xi \atop \widetilde{\xi}}(x_{k:t-1} | x_{<k}) {\xi \atop \widetilde{\xi}}(x_{t:m} | x_{<t})}{{\xi \atop \widetilde{\xi}}(x_{k:t-1} | x_{<k}) {\pi^*_v \atop \widetilde{\xi}}(x_{t:m} | x_{<t})}
        % \\
        % &= \kl_{x_{<k}, m}(\widetilde{\xi} || \xi) + \sum_{x_{k:m} \in \X^{m-k+1}} \widetilde{\xi}(x_{k:m} | x_{<k}) \log \frac{ {\xi \atop \widetilde{\xi}}(x_{t:m} | x_{<t})}{{\pi^*_v \atop \widetilde{\xi}}(x_{t:m} | x_{<t})}
        \tagaligneq
    \end{align*}


     % we bound the $\kl$-divergence from $\nu'_v$ to $\widetilde{\xi}$. First

    
    
    
    % Letting $s'$ be $s'_v$ with $v$ being a minimizer of $K(y)$ for $y \in [V^*_{\xi, U_m}(x_{<2t}) - \varepsilon, V^*_{\xi, U_m}(x_{<2t})]$, we have $V^{\nu'}_\xi(x_{<t}) \geq V^*_{\xi, U_m}(x_{<2t}) - \varepsilon$, and $\ell(s') < \ell(s) + K(E) + K(U_m) + K^-_\varepsilon(V^*_{\xi, U_m}(x_{<2t})) + d$.
    
    
    
    % Outline: When unprecedented $E$ happens, $\xi(\cdot | x_{<t}) > c 2^{-K(E)} \xi(\cdot)$. $\xi / U_m\textrm{-optimizer} > c 2^{-K(U_m) + K(m)}$.

    % First, observe that $Km(x_{<t} z) < Km(x_{<t}) + K(E) + Km(z) + c$, because a program for producing (but not necessarily stopping at) $x_{<t} z$ can be composed as follows: first run a program for producing (but not necessarily stopping at) $x_{<t}$, stop whenever the output belongs to $E$, which will by assumption occur for the first time when the output equals $x_{<t}$, and then switch to a program for producing (but not necessarily stopping at) $z$. Since the code to direct that logic is of bounded length $c$, we have $Km(x_{<t} z) < Km(x_{<t}) + K(E) + Km(z) + c$.

    % Rearranging this inequality gives $2^{-Km(x_{<t} z)} / 2^{-Km(x_{<t})} > c' 2^{-K(E)} 2^{-Km(z)}$. Because $\xi(x_{<t})$ is within a multiplicative constant of $2^{-Km(x_{<t})}$, as shown in Lemma \ref{lem:mon}, this implies:
    % \begin{equation}
    %     \xi(z | x_{<t}) = \xi(x_{<t}z) / \xi(x_{<t}) > c'' 2^{-K(E)} 2^{-Km(z)}
    % \end{equation}

    % Now, consider the case where $z$ is 
    
    % Aiming for Km(x_{<t} z) < Km(x_{<t}) + K(E) + Km(z) + c
\end{proof}

\fi

\propscaling*

\begin{proof}
    Consider the very simple event $E_T = \X^T$; it occurs (and is of course unprecedented) at time $T$. $K(E_T)$ is within a constant of $K(T)$. So we are interested in the rate of growth of $\min_{T \geq t} K(T)$ as $t$ increases. \citepos{zvonkin1970complexity} Theorem 1.4 (d) states that this function is eventually less than every computable function that tends to infinity.
\end{proof}


\section{Total variation distance} \label{sec:prooftvd}

\begin{definition}[$V_{\xi, U_m}$-optimal]
    An action $a_t$ is $V_{\xi, U_m}$-optimal after a history $x_{<2t}$ if $\E_{o_t \sim \xi(\cdot | x_{<2t}a_t)} V^*_{\xi, U_m}(x_{<2t}a_to_t) = V^*_{\xi, U_m}(x_{<2t})$.
\end{definition}

\thmtvd*

\begin{proof}
    Letting $\pi(x_{2t:2m} | x_{<2t}) := \prod_{t'=t}^m \pi(a_{t'} | x_{<2t'})$, if $\pi^{TVD}_c(a_t | x_{<2t}) > \beta(a_t | x_{<2t})$, then there exists an $x_{2t+1:2m}$ such that $\pi^{TVD}_c(a_t x_{2t+1:2m} | x_{<2t}) > \beta(a_t x_{2t+1:2m} | x_{<2t})$.
    Suppose $a_t$ is not $V_{\xi, U_m}$-optimal. Then there exists an $a'_t$ such that $Q(x_{<2t}a'_t) > Q(x_{<2t}a_t)$. Let $x'_{2t+1:2m}$ be a sequence where all actions are $V_{\xi, U_m}$-optimal, and all observations have positive probability.
    
    Let $\pi'_\varepsilon(\overline{x}_{2t:2m} | x_{<2t})$ equal $\pi^{TVD}_c(\overline{x}_{2t:2m} | x_{<2t})$ for all $\overline{x}_{2t:2m}$, except $\pi'_\varepsilon(a_tx_{2t+1:2m}|x_{<2t}) = \pi^{TVD}_c(a_tx_{2t+1:2m}|x_{<2t}) - \varepsilon$, and $\pi'_\varepsilon(a'_tx'_{2t+1:2m}|x_{<2t}) = \pi^{TVD}_c(a'_tx'_{2t+1:2m}|x_{<2t}) + \varepsilon$. The conditional probabilities $\pi'_\varepsilon(a_{t'}|x_{<2t'})$ can easily be defined to achieve the properties in the previous sentence.

    For small enough $\varepsilon > 0$, this policy exists (no probabilities are outside $[0, 1]$) because $\pi^{TVD}_c(a_t | x_{<2t}) > \beta(a_t | x_{<2t}) \geq 0$ and therefore, $\pi^{TVD}_c(a'_t | x_{<2t}) < 1$. And for small enough $\varepsilon > 0$, $\tvd_{x_{<2k}, m} (\pi'_\varepsilon, \beta) \leq \tvd_{x_{<2k}, m} (\pi^{TVD}_c, \beta)$, because decreasing the probability on $a_tx_{2t+1:2m}$ will reduce the total variation distance by $\varepsilon$, for $\varepsilon \leq \pi^{TVD}_c(a_t x_{2t+1:2m} | x_{<2t}) - \beta(a_t x_{2t+1:2m} | x_{<2t})$ (which is positive), while increasing the probability on $a'_tx'_{2t+1:2m}$ will not increase the total variation distance by more than $\varepsilon$.

    Finally, since $Q(x_{<2t}a'_t) > Q(x_{<2t}a_t)$, $V^{\pi'_\varepsilon}_{\xi, U_m}(x_{<2t}) > V^{\pi^{TVD}_c}_{\xi, U_m}(x_{<2t})$. This contradicts that $\pi^{TVD}_c = \argmax_{\pi : \tvd_{x_{<2k}, m}(\pi, \beta) < c} V^{\pi}_{\xi, U_m}$ since a policy with no more total variation distance has greater value.
\end{proof}


\section{Detailed experimental setup} \label{sec:training}

The details of the experimental setup are as follows. Code is available at \url{https://github.com/mkc1000/kl_reg_paper}.

\subsection{Environment}
The state of the environment, as mentioned in the main text, is the activations of the last three hidden layers of Mixtral-base-model with the transcript-so-far as input, along with the fraction of the episode remaining. This gives a state vector of dimension 12289. Using the Mistral tokenizer, the action space has 32000 elements. The environment uses a temperature of 0.05 for generating the student's responses and a temperature of 1 for the base policy for the agent/teacher.

\subsection{Network Architecture}
The critic network is a fully connected network with two hidden layers of size 128 with tanh activations. The actor network consists of just one parameterized layer, which is fully connected, of size $(|$state space$|, |$action space$| + 1)$. The extra output is for controlling the KL divergence to the base policy. We compute the target KL divergence as $\operatorname{sigmoid}(\textrm{activation})$ times the KL budget remaining to the agent for the episode. So the activation controls what fraction of the remaining KL budget for the episode to use on the very next token. At initialization, this fraction comes to 1/16. The KL budget remaining starts as the total episode KL budget (of course), and is decreased by $\log(\textrm{policy}(\textrm{action}) / \textrm{basepolicy}(\textrm{action}))$ with each action. The other outputs are interpreted as logits and are added to the base policy logits. Calling this resulting distribution $a$, and the base policy distribution $b$, we find an $\alpha \in [0, 1]$ such that $\kl(\alpha a + (1-\alpha) b \,\|\, b)$ equals the target KL, if possible. If we cannot achieve a sufficiently high KL divergence, we set $\alpha=1$. The output policy is $\alpha a + (1-\alpha) b$. We add any squared error $(\textrm{target KL} - \textrm{achieved KL})^2$ to the loss function to encourage the network to output logits that allow further control by the neuron controlling the KL target.

In the forward pass, our custom PyTorch operation performs a binary search to calculate $\alpha$ in the interval $[0, 1]$. The backward pass uses implicit differentiation, assuming we have found exactly the right $\alpha$---there is no need to differentiate backward through the binary search, which would be unstable. Code for this PyTorch object can be found at \url{https://github.com/mkc1000/kl-fixed-mixture/}.

\subsection{PPO}
We use the following hyperparameters for PPO. We do not use a generalized advantage estimate.

\begin{table}[h]
    \centering
    \begin{tabular}{cc}
        Training timesteps & 6 million \\
        Update frequency & 1 / 64 episodes\\
        Training epochs / update & 8 \\
        Training batch size & $2^{13}$\\
        Epsilon clip & 0.1 \\
        Entropy coefficient & 1e-4 \\
        Max gradient norm & 0.1 \\
        Actor learning rate & 2e-5 \\
        Critic learning rate & 1e-4 \\
    \end{tabular}
    % \caption{PPO Hyperparameters}
    \label{tab:hyper}
\end{table}

A higher entropy coefficient is unnecessary given the KL constraint to the base policy. Over the first 3 million timesteps of training, we slowly increase the per-episode KL budget from 0 to its final value. We increase this on a linear schedule each time we update the network.

When we re-train for a longer episode length (256 tokens to 512 tokens), we train for 3 million steps, plenty to reach apparent convergence.

\subsection{Parallelism}
We use threading to run 64 agent-environment-loops in ``parallel''. When we would need to send a transcript of length $l$ to be processed by the Mixtral model, we wait until all 64 agent-environment-loops need to send a transcript of length $l$, and then they are batched and evaluated together in parallel on the GPU. The result might be needed by either the agent or the environment, and we use the Python \texttt{asyncio} library to manage this. Doing just that step in parallel is enough for substantial speedup.

\subsection{Resource Usage}
We ran our experiments on two A100-SXM4-80GBs. Training for 9 million timesteps took approximately 90 hours. Our seven training runs (one of which was stopped after 6 million timesteps) took about 25 days, all told. (We ran the experiments two or three at a time.) The full research project required much more compute, since finding good hyperparameters for PPO is never straightforward, especially when we were attempting to achieve a desired per-episode KL divergence using only a fixed per-token KL cost; recall that we eventually switched to a policy architecture that allowed direct control of the per-episode KL divergence.

\section{AI evaluation of transcripts} \label{sec:table}

Table~\ref{tab:comparison} demonstrates that the less optimized agent is usually judged ``better'' and ``more complex/unpredictable'' by GPT-3.5-turbo.

\begin{table}[h]
\caption{Automated comparison of teacher behavior generated by base model, trained KL budget 10 policies, and trained KL budget 20 policies. The percentages refer to the fraction of the time that that agent ``won'' according to the comparator, with a 95\% confidence interval.}
\label{tab:comparison}
\begin{center}
% \begin{tabular}{llll}
% &\multicolumn{1}{c}{\bf 20 (cf. base)}  &\multicolumn{1}{c}{\bf 10 (cf. base)} &\multicolumn{1}{c}{\bf 20 (cf. 10)}
% \\ \hline \\
% ``Better''         &11.3$\pm$3.6\% &15.3$\pm$4.1\% &17.7$\pm$4.3\% \\
% \makecell{``More complex/\\unpredictable''}             &4.0$\pm$2.2\% &29.0$\pm$5.1\% &14.3$\pm$4.0\% \\
% \end{tabular}
\begin{tabular}{lll}
&\multicolumn{1}{c}{\textbf{``Better''}}  &\multicolumn{1}{c}{\makecell{\textbf{``More complex/}\\\textbf{unpredictable''}}}
\\ \hline \\
\textbf{20 (cf. base)} & 11.3$\pm$3.6\% & 4.0$\pm$2.2\%
\\
\textbf{10 (cf. base)} & 15.3$\pm$4.1\% & 29.0$\pm$5.1\%
\\
\textbf{20 (cf. 10)} & 17.7$\pm$4.3\% & 14.3$\pm$4.0\%
\end{tabular}
\end{center}
\end{table}

\end{document}
