\newif\ifshowdeletions
% \showdeletionstrue

% \documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{nicefrac}       
\usepackage{xcolor}         
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amsmath,amsfonts,amssymb,amsthm}
\usepackage{bbm}
\usepackage{cleveref}
\usepackage{setspace}
\usepackage{subcaption}
\usepackage{multirow}
\usepackage{xspace}
\usepackage{enumitem}
\usepackage{newtxmath}
\renewcommand{\UrlBreaks}{\do\-\do\.\do\@\do\\\do\/\do\!\do\_\do\|\do\;\do\>\do\]\do\)\do\,\do\?\do\'\do+\do\=\do\#}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\usepackage[createShortEnv,conf={no link to proof, text link},commandRef=autoref]{proof-at-the-end}


% Theorem-like environments (amsthm). Every environment below, including
% `remark`, is declared under the `plain` style. `theorem` owns the counter
% and is numbered within sections; the others reuse that counter via
% [theorem], so all statements share a single numbering sequence.
% NOTE(review): the `propositionE`/`lemmaE`/`proofE` environments used in the
% body are not declared in this file -- presumably generated by
% proof-at-the-end's `createShortEnv` option from the names below; confirm
% before renaming any of them.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}

\input{math_commands.tex}
% \infdivx{P}{Q}: the parenthesised pair "(P || Q)". The double bar is sized
% together with the outer delimiters through \delimsize, so the starred and
% [\big]-style variants provided by mathtools scale it consistently.
\DeclarePairedDelimiterX{\infdivx}[2]{(}{)}{%
  #1\;\delimsize\|\;#2%
}
% \infdiv{P}{Q}: the \KL symbol (defined outside this file) immediately
% followed by the delimited pair above.
\newcommand*{\infdiv}{\KL\infdivx}

% \definecolor{Red}{rgb}{0.768, 0.054, 0.054}
% \definecolor{Blue}{rgb}{0.152, 0.294, 0.925}
% \definecolor{Green}{rgb}{0,0.4,0.7}
% \hypersetup{
%     colorlinks=true,
%     citecolor=teal,
%     linkcolor=Red,
%     urlcolor=Green,
% }
% \rot{<text>}: rotate <text> by 90 degrees (e.g. for narrow table headers).
% NOTE(review): \rotatebox comes from graphicx, which is not loaded explicitly
% in this file -- presumably pulled in by tikz or the class.
\newcommand*\rot{\rotatebox{90}}

% Text abbreviations.
% \iid ends with \xspace, so a following space is restored automatically.
% NOTE(review): the \vphantom{i} after the final period is presumably there to
% reset the space factor so the next space is not stretched like an
% end-of-sentence space; confirm before simplifying.
\newcommand{\iid}{i.i.d.\vphantom{i}\xspace}
% \eg and \ie carry no trailing punctuation or space handling; the body text
% follows them with a comma.
\newcommand{\eg}{\emph{e.g.}}
\newcommand{\ie}{\emph{i.e.}}

% Coloured, signed inline notes from individual authors.
\newcommand{\kolya}[1]{\textcolor{red}{#1 -Kolya}}
\newcommand{\younesse}[1]{\textcolor{blue}{#1 -Younesse}}

% "maximize" as a math operator; the starred form takes limits-style
% subscripts (set underneath in display math).
\DeclareMathOperator*{\maximize}{maximize}

% \edit{<old>}{<new>}: revision markup. The \ifshowdeletions switch is
% declared with \newif at the top of the file and is tested once, here, at
% definition time -- toggling it later in the document has no effect on \edit.
%   flag true : <old> in red followed by <new> in blue
%   flag false: only <new>, in blue (the first argument is discarded)
\ifshowdeletions
\newcommand{\edit}[2]{\textcolor{red}{#1}\textcolor{blue}{#2}}
\else
 \newcommand{\edit}[2]{\textcolor{blue}{#2}}
\fi

% Cross-reference names, heading format and caption size.
% \makeatletter is required because \@gobble (and \z@ in the disabled
% \titlespacing lines) contain `@`.
\makeatletter
% Sections and subsections are referenced by the section sign. \@gobble
% discards the token that follows it -- presumably the separator cleveref
% inserts between the name and the number, so the reference prints as a
% section sign directly followed by the number (TODO confirm against
% cleveref's reference format).
\crefname{section}{\S\@gobble}{\S\@gobble}
\crefname{subsection}{\S\@gobble}{\S\@gobble}
% Abbreviated singular/plural names for propositions and figures.
\crefname{proposition}{Prop.}{Props.}
\crefname{figure}{Fig.}{Figs.}

% Ragged-right, bold, upper-cased section and subsection headings.
% NOTE(review): \titleformat* is titlesec syntax and titlesec is not loaded in
% this file -- presumably provided by the document class.
\titleformat*{\section}{\raggedright\large\bfseries\MakeUppercase}
\titleformat*{\subsection}{\raggedright\bfseries\MakeUppercase}
% Disabled spacing experiments (heading spacing and display-math skips),
% kept for reference.
% \titlespacing{\section}{\z@}{*2.75}{*1.375}
% \titlespacing{\subsection}{\z@}{*2.75}{*1.375}
% \titlespacing{\subsubsection}{\z@}{*2.75}{*1.375}
% \titlespacing{\paragraph}{\z@}{*1.375}{1em}
% \titlespacing{\section}{\z@}{*1}{*1.375}
% \titlespacing{\subsection}{\z@}{*1}{*1.375}
% \titlespacing{\subsubsection}{\z@}{*1}{*1.375}
% \titlespacing{\paragraph}{\z@}{*0.4}{1em}
% % \setlength{\parskip}{.4\baselineskip}
% \newcommand{\zerodisplayskips}{%
%   \setlength{\abovedisplayskip}{1mm}%
%   \setlength{\belowdisplayskip}{1mm}%
%   \setlength{\abovedisplayshortskip}{1mm}%
%   \setlength{\belowdisplayshortskip}{1mm}}
% \appto{\normalsize}{\zerodisplayskips}
% \appto{\small}{\zerodisplayskips}
% \appto{\footnotesize}{\zerodisplayskips}
% Typeset all float captions at \small size.
\captionsetup{size=small}
\makeatother

\title{Can a Bayesian Oracle Prevent Harm from an Agent?}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2,3,4,*]{Yoshua~Bengio}
% Contact link for this author. The mailto target is the bare address: angle
% brackets are not legal unencoded URI characters, so they are not used here.
% \href takes exactly two arguments (target, link text).
\author[5,*]{\href{mailto:mkcohen@berkeley.edu?Subject=Bayesian Oracle 2025 paper}{Michael~K.~Cohen}}
\author[6,4,*]{Nikolay~Malkin}
\author[1,2,7]{Matt~MacDermott}
\author[1,2,8,$\dagger$]{Damiano~Fornasiere}
\author[1,2,9,$\dagger$]{Pietro~Greiner}
\author[10,$\dagger$]{Younesse~Kaddar}
% Add affiliations after the authors
\affil[1]{%
    Mila\\
    Montr\'eal, Canada
}
\affil[2]{%
    LawZero\\
    Montr\'eal, Canada
}
\affil[3]{%
    Universit\'e de Montr\'eal\\
    Montr\'eal, Canada
  }
\affil[4]{
    CIFAR, Learning in Machines and Brains, Canada
}
\affil[5]{%
    UC Berkeley\\
    Berkeley, USA
  }
\affil[6]{%
    University of Edinburgh\\
    Edinburgh, UK
  }
\affil[7]{%
    Imperial College London\\
    London, UK
  }
\affil[8]{%
    Universitat de Barcelona\\
    Barcelona, Spain
  }
\affil[9]{%
    Universit\`a degli studi di Padova\\
    Padua, Italy
  }
\affil[10]{%
    University of Oxford\\
    Oxford, UK
  }
\affil[*,$\dagger$]{%
    First three authors and last three authors listed alphabetically
  }

% Let the title block take its natural height instead of a fixed 2.375in
% NOTE(review): the hook only *opens* a \vbox (\bgroup); nothing in this file
% closes it, so the matching \egroup is presumably supplied by the class's
% title code -- confirm against the class before changing this.
\makeatletter
\renewcommand{\maketitlehooka}{\vbox\bgroup}
\makeatother
  
\begin{document}
\maketitle

\begin{abstract}
Is there a way to design powerful AI systems based on machine learning methods that would satisfy probabilistic safety guarantees? With the long-term goal of obtaining a probabilistic guarantee that would apply in every context, we consider estimating a context-dependent bound on the probability of violating a given safety specification. Such a risk evaluation would need to be performed at run-time to provide a guardrail against dangerous actions of an AI. Noting that different plausible hypotheses about the world could produce very different outcomes, and because we do not know which one is right, we derive bounds on the safety violation probability predicted under the true but unknown hypothesis. Such bounds could be used to reject potentially dangerous actions. Our main results involve searching for cautious but plausible hypotheses, obtained by a maximization that involves Bayesian posteriors over hypotheses. We consider two forms of this result, in the \iid case and in the non-\iid case, and conclude with open problems towards turning such theoretical results into practical AI guardrails.
\end{abstract}

\section{Introduction}

Ensuring that an AI system will not misbehave is a challenging open problem~\citep{bengio2024international}, particularly in the current context of rapid growth in AI capabilities. Governance measures and evaluation-based strategies have been proposed to mitigate the risk of harm from highly capable AI systems, but do not provide any form of safety guarantee when no undesired behavior is detected. In contrast, the \emph{safe-by-design} paradigm involves AI systems with quantitative safety guarantees, and therefore could represent a stronger form of protection \citep{dalrymple2024towards}. However, how to design such systems remains an open problem too.

\looseness=-1
Since testing an AI system for violations of a safety specification in every possible context, \eg, every (query, output) pair, is impossible, we consider a rejection sampling approach that declines a candidate output or action if it has too high a probability of violating an unknown safety constraint; we refer to such violations as ``harm''. The approach we outline does not require the developer to write down a safety constraint. The only requirement is the construction of a data generating process that produces information about harm, which we leave to future work. We also note that maintaining a Bayesian belief distribution about how to interpret a human-specified safety specification would protect the AI from committing to an incorrect interpretation. Here we instead focus on a question inspired by risk-management practice~\citep{mcneil2015quantitative}: even though the true probability of harm following from some proposed action is unknown, because the true data-generating process is unknown, can we bound that risk using quantities that can be estimated by machine learning methods given the observed data?

To illustrate this question, consider a committee of ``wise'' humans whose theories about the world are all equally compatible with the available data, knowing that an unknown member of the committee has the correct theory. Each committee member can make a prediction about the probability of future harm that would result from following some action in some context. Marginalizing this harm probability over the committee members amounts to making them vote with equal weights. If the majority is aligned with the correct member's prediction, then all is good, \ie, if the correct theory predicts harm, then the committee will predict harm and can choose to avoid the harmful action. But what if the correct member is in the minority regarding their harm prediction? To get a {\em guarantee} that the true harm probability is below a given threshold, we could simply consider the committee member whose theory predicts the highest harm probability, and we would be sure that their harm probability prediction upper bounds the true harm probability. In practice, committee members are not equally ``wise'', so we can correct this calculation based on how plausible the theory harbored by each committee member is. In a Bayesian framework, the plausibility of a theory corresponds to its posterior over all theories given the observed data, which is proportional to the data likelihood given the theory multiplied by the prior probability of that theory.

\looseness=-1
In this paper, we show how results about posterior consistency can provide probabilistic risk bounds. All the results have the form of inequalities, where the true probability of harm is upper bounded by a quantity that can in principle be estimated, given enough computational resources to approximate Bayesian posteriors over theories given the data. In addition, these are not hard bounds but only hold with some probability, and there is generally a trade-off between that probability and the tightness of the bound. We study two scenarios in the corresponding sections: the \iid data setting in \Cref{sec:iid} and the non-\iid data setting in \Cref{sec:non-iid}, followed by an experiment in \Cref{sec:experiments}. In all cases, a key intermediate result is a bound relating the Bayesian posterior on the unknown true theory and the probability of other theories (with propositions labeled {\bf True theory dominance}). The idea is that because the true theory generated the data, its posterior tends to increase as more data is acquired, and in the \iid case it asymptotically dominates other theories. From such a relationship, the harm risk bound can be derived with very little algebra (yielding propositions labeled {\bf Harm probability bound}). These bounds depend on the unknown true theory having nonzero prior weight.

\looseness=-1
We conclude the paper with a discussion of open problems that should be considered in turning such bounds into a safe-by-design AI system, taking into account the challenge of representing the notion of harm and the imperfect estimation of the required conditional probabilities.

% \paragraph*{Related Work} The concept of blocking actions based on probabilistic criteria is similar to probabilistic shielding in MDPs~\citep{jansen2020safe}. While our cheating guardrail in \Cref{sec:experiments} implements this concept, our main contribution is providing bounds that do not require knowledge of the true model, extending beyond what \citet{carr2023safe} achieved for partially observable MDPs. Recent work has proposed various frameworks for measuring harm. \citet{beckers2023quantifying} develop quantitative definitions addressing individual and societal contexts, while \citet{richens2022counterfactual} show the necessity of counterfactual reasoning due to distributional shifts. Our approach, agnostic about the precise nature of harm, allows for various definitions of harm, while providing conservative probability bounds for safety-critical contexts where tail risks must be strictly controlled. Finally, while both works use Bayesian approaches, \citet{osband2017posterior} study how to translate concentration bounds from pure prediction settings into MDP settings where exploration is necessary. Our work addresses an orthogonal question: providing safety guarantees without relying on potentially dangerous exploration to gain information about harm probability. This makes our approach suitable for safety-critical applications, where exploration itself could be harmful.

\paragraph{Related Work.} The concept of blocking actions based on probabilistic criteria resembles probabilistic shielding in Markov Decision Processes (MDPs)~\citep{jansen2020safe}, but our bounds do not require knowledge of the true model, extending beyond \citet{carr2023safe}'s work on partially observable MDPs. While \citet{beckers2023quantifying} and \citet{richens2022counterfactual} propose specific frameworks for quantifying harm, our approach remains agnostic, by only requiring harmful outcomes to be representable as binary events $H=1$, allowing various harm definitions, while providing conservative probability bounds for safety-critical contexts. \citet{osband2017posterior} study translating concentration bounds from a pure predictive setting to an MDP setting with exploration, whereas we address an orthogonal question: providing safety guarantees without relying on potentially harmful exploration to gain information.

\section{Safe-by-design AI?}

\looseness=-1
Before an AI is built and deployed, it is important that the developers have high assurances that the AI will behave well.
\citet{dalrymple2024towards} propose an approach to ``guaranteed safe AI'' designs with built-in high-assurance quantitative safety guarantees, although these guarantees can sometimes be probabilistic and only asymptotic. It remains an open question whether and how that research program can be realized. The authors take existing examples of quantitative guarantees in safety-critical systems and motivate why such a framework should be adopted if we ever build AI systems that match or exceed human cognitive abilities and could potentially act in dangerous ways. Their program is motivated by current known limitations of state-of-the-art AI systems based on deep learning, including the challenge of engineering AI systems that robustly act as intended~\citep{cohen2022advanced,krakovna_specification_2020, reward_misspecification, pang_reward_2023, subset_features, skalse_defining_2022, skalse2022invariance, karwowski2023goodharts, skalse2024starc}.

The approach proposed by \citet{dalrymple2024towards} has the following components: a {\em world model} (which can be a distribution about hypotheses explaining the data), a {\em safety specification} (what are considered unacceptable states of the world), and a {\em verifier} (a computable procedure that checks whether a policy or action violates the safety specification).

Here, we study a system that infers a probabilistic world model, or {\em theory}, $\tau$ and updates its estimate of $\tau$ via machine learning, using the stream of observed data $D$. The observations $D$ are assumed to come from a data-generating process given by a ground-truth world model $\tau^*$, which lies in the system's space of possible theories. We do not assume that the agent observes any sort of Markovian state. The inference of the theory $\tau$ is Bayesian, meaning that the system maintains an estimate $q$ of the true posterior $\p(-\mid D)$ over theories: $q(\tau\mid D)\approx\p(\tau\mid D)$, where $\p(\tau\mid D)$ is proportional to the product of the prior probability $\p(\tau)$ with the likelihood of the observations under the theory, $\p(D\mid\tau)$. In the simplest case, $q$ is a point estimate, optimally placing its mass on the mode of the posterior. Assuming an observation $x$ and a theory $\tau$ are independent given $D$, inference of the latent theory $\tau$ allows the system to approximate conditional probabilities $\p(y\mid x,D)\approx\E_{\tau\sim q(\tau\mid D)}[\p_{\tau}(y\mid x,D)]$ over any random variables $X,Y$ known to the world model.

The safety specification is given in the form of a binary random variable $H$ (called ``harm'' below) whose probability given the other variables depends on the theory $\tau$. We are interested in predicting the probability of harm under the true theory $\tau^*$. Because $\tau^*$ is unknown, we propose to estimate upper bounds on this probability using the estimated posteriors. These upper bounds can be used as thresholds for a {\em verifier} that checks whether the risk of harm falls below some acceptable level.

Following~\citet{dalrymple2024towards}, we assume that the notion of harm has been specified, possibly in natural language, and that the ambiguities about its interpretation are encoded within the Bayesian posterior $\p(\tau \mid D)$. This paper focuses on the verifier under different assumptions of \iid or non-\iid data.

\paragraph{What do the observations and context represent?} 
We give a possible interpretation of the objects introduced in the preceding discussion in the simple case of an agent acting in a fully observed environment (Markov Decision Process, or MDP), where the theory is a transition model and the occurrence of harm at a state $s$ is conditionally independent of all other variables given $s$.
\begin{itemize}[left=0pt,nosep]
    \item Observations $Z$ are transitions $z = (s,a,s',r)$, where $s$ is a state, $a$ is an action, $s'$ is the next state, and $r$ is the reward received.
    \item Theories $\tau$ encode the state visitation and transition probabilities\footnote{To be more precise, we can obtain transition probabilities from $\tau$ by conditioning on $(s,a)$ to get $\p_\tau(s', r \mid s,a)$, but only for state-action pairs with non-zero probability under $\tau$.}, as well as the behavior policy from which observations are collected.
    % in particular, conditioning on $(s,a)$ gives transition probabilities $\p_\tau(s'\mid s,a)$.
    \item The dataset $D$ is a sequence of observed transitions.
    \begin{itemize}[left=0pt,nosep]
        \item In the non-\iid setting, $D$ could consist, for example, of the observations from a finite rollout in the order in which they occurred. 
        \item In the \iid setting, $D$ would need to be a sequence of independent samples from a fixed state-action-reward visit distribution. This could be achieved, for example, by rolling out a behavior policy multiple times and randomly sampling transitions from the resulting trajectories. 
    \end{itemize}
    In the common special case of a contextual bandit MDP under a fixed policy, the two coincide.
    \item The context $X$ is a pair $x = (s,a)$, where $s$ is a state and $a$ is an action being considered at state $s$.
    \item The harm probability $\p(H=1\mid X=(s,a),\tau,D)$ can be any function of the theory $\tau$, the context $x=(s,a)$, and the data $D$. For example, this probability could be derived from a fixed specification of what it means for a state $s'$ to be harmful, $\p_{\rm harm}(H=1\mid s')$. Then, the harm probability could be computed as $\p(H=1\mid X=(s,a),\tau,D)=\sum_{s', r}\p_\tau(s', r\mid s,a)\p_{\rm harm}(H=1\mid s')$.
\end{itemize}
We note that the interpretation of harm probability in the example above includes the case where the occurrence of harm is an observed variable $s'_{\rm harm}$ that is part of the state $s'$: in that case, we set $\p_{\rm harm}(H=1\mid s')=1$ if $s'$ is harmful (\ie, $s'_{\rm harm}=1$), and $\p_{\rm harm}(H=1\mid s')=0$ otherwise. Then the harm probability is just the probability, under $\tau$, of reaching a harmful state, and observations of harm in $D$ affect the Bayesian posterior over theories.

This interpretation also includes the case where the harm probability is a function of the state $s'$, but (non)occurrence of harm is not observed in $D$. For example, a language model encoding world knowledge and human preferences or constraints, or an iterative reasoning procedure that uses those constraints, could generate some specification of harm $\p_{\rm harm}(H=1\mid s')$, perhaps unreliably.

Finally, a setting that separates the predicted next state $s'$ from the harm variable $H$ in this way gives a framework for studying how an agent might tamper with harm guardrails. If the state $s'$ decomposes as $s'=(s'_{\rm harm},s'_{\rm rest})$, and $\p_{\rm harm}$ is deterministic as a function of $s'_{\rm harm}$, except for some difficult-to-reach values of $s'_{\rm rest}$, then the agent can try to reach those values of $s'_{\rm rest}$, so that harm is `recorded' as not having occurred, even though it has. (See the end of \Cref{sec:non-iid}.)

\section{I.I.D. data}
\label{sec:iid}

Following the notation introduced in the previous section, here, we consider the easier-to-analyze case where the observed examples $D=(z_1, z_2, \ldots, z_n)$ are sampled \iid from the unknown distribution $\tau^*$. Assuming that the prior assigns nonzero mass to $\tau^{\ast}$, and all theories are distinct distributions, it can be shown that the posterior $\p(\tau \mid D)$ converges to a point mass at $\tau^*$. We show that for sufficiently large $n$, we can bound the probability under $\tau^*$ of a harm event $H=1$ given conditions $x$ (\eg, a context and an action) by considering the probability of $H=1$ given $x$ and $D$, under a plausible but ``cautious'' theory $\tilde{\tau}$ that maximizes $\p(\tilde{\tau} \mid D) \p(H=1 \mid x, \tilde{\tau}, D)$.

\paragraph{Setting.} Fix a complete separable metric space $\gZ$, called the \emph{observation space}, let $\gF$ be its Borel $\sigma$-algebra, and fix a $\sigma$-finite measure $\mu$ on $\gF$. A \emph{theory} is a probability distribution on the measurable space $(\gZ,\gF)$ that is absolutely continuous w.r.t.\ $\mu$. If $\tau$ is a theory, we denote by $\p_\tau(\cdot)$ the Radon-Nikodym derivative $\frac{d\tau}{d\mu}:\gZ\to\R_{\geq0}$, which is uniquely defined up to $\mu$-a.e.\ equality. 

% Any theory $\tau$ also canonically determines a $\gZ$-valued random variable by the identity map on the probability space $(\gZ,\gF,\tau)$.

One can keep in mind two cases:
\begin{enumerate}[label=(\arabic*),nosep,left=0pt]
    \item $\gZ$ is a finite or countable set and $\mu$ is the counting measure. Theories $\tau$ are equivalent to probability mass functions $\p_\tau:\gZ\to\R_{\geq0}$.
    \item $\gZ=\R^d$ and $\mu$ is the Lebesgue measure. Theories are equivalent to their probability density functions $\p_\tau:\gZ\to\R_{\geq0}$ up to a.e.\ equality.
\end{enumerate}

Consider a countable (possibly finite) set of theories $\gM$ containing a \emph{ground truth} theory $\tau^*$ and fix a choice of a (measurable) density function $\p_\tau$ for each $\tau\in\gM$.

\paragraph{Definition of posterior as a random variable.} If $\p$ is a prior distribution\footnote{To be precise, $\gM$ is endowed with the counting measure and we flexibly interchange distributions and mass functions on $\gM$.} on $\gM$ and $z\in\gZ$, we define the posterior to be the distribution with mass function
\begin{equation}\label{eq:discrete_posterior}
    \p(\tau\mid z)=\frac{\p(\tau)\p_\tau(z)}{\sum_{\tau'\in\gM}\p(\tau')\p_{\tau'}(z)}\propto\p(\tau)\p_\tau(z),
\end{equation}
assuming the denominator converges and the sum is nonzero. Otherwise, the posterior is considered to be undefined. As written, the posterior depends on the choice of density functions $\p_\tau$, but any two $\p_\tau$ that are $\mu$-a.e.\ equal yield the same posterior for $\mu$-a.e.\ $z$.

\looseness=-1
For $z_1,z_2\in\gZ$, we write $\p(\cdot\mid z_1,z_2)$ for the posterior given observation $z_2$ and prior $\p(\cdot\mid z_1)$, and similarly for a longer sequence of observations. It can be checked that $\p(\cdot\mid z_1,\dots,z_t)$ is invariant to the order of $z_1,\dots,z_t$ and that it is defined in one order if and only if it is defined in all orders. This allows us to unambiguously write $\p(\cdot\mid D)$ where $D$ is a finite multiset of observations, and we have
\begin{equation}\label{eq:discrete_posterior_multiset}
    \p(\tau\mid D)\propto\p(\tau)\prod_{z\in D}\p_\tau(z).
\end{equation}
Let $\tau^*\in\gM$ be the ground truth theory and $\p(\cdot)$ a prior over $\gM$. Consider a sequence of \iid $\gZ$-valued random variables $Z_1,Z_2,\dots$ (whose realizations are the \emph{observations}), where each $Z_i$ follows the distribution $\tau^*$. For any $t\in\N$, the posterior $\p(\cdot\mid Z_{1:t})$ is then a random variable taking values in the space of probability mass functions on $\gM$.\footnote{To be precise, if the $Z_i$'s are measurable functions from a sample space $\Omega$ to $\gZ$ and $\langle Z_1, \ldots, Z_t \rangle$ is their pairing, the random variable $\p(\cdot\mid Z_{1:t})\colon \Omega \xrightarrow{\langle Z_1, \ldots, Z_t \rangle} \gZ^t \xrightarrow{\p(\cdot\mid -)} \ProbMonad(\gM)$ has codomain the space $\ProbMonad(\gM)$ of functions $\gM\to\R_{\geq0}$ summing to 1. The function $\p(\cdot\mid -)$ mapping a sequence of observations to the posterior probability mass function is measurable, due to each $\p_\tau(z)$ being measurable in $z$ and elementary facts.}

\paragraph{Bayesian posterior consistency.} 
\looseness=-1
We state, in our setting, a result about the concentration of the posterior at the ground truth theory $\tau^*$ as the number of observations increases.


\begin{propositionE}[\bf True theory dominance][end,restate]
    \label{prop1iid}
    Under the above conditions and supposing that $\p(\tau^*)>0$, the posterior $\p(\cdot\mid Z_{1:t})$ is almost surely defined for all $t$, and the following almost surely hold:
    \begin{enumerate}[label=(\alph*),nosep,left=0pt]
        \item \label{prop1iid:posterior_convergence} \(\p(\cdot\mid Z_{1:t})\xrightarrow{t\to\infty}\delta_{\tau^*}\) as measures, where $\delta_{\tau^*}$ is the Dirac measure, which assigns mass $1$ to the theory $\tau^*$ and $0$ elsewhere; equivalently, \(\lim_{t\to\infty}\p(\tau\mid Z_{1:t})=\mathbbm{1}[\tau=\tau^*]\).
        \item \label{prop1iid:argmax_posterior} There exists \(N\in\N\) such that \(\argmax_{\tau\in\gM}\p(\tau \mid Z_{1:t})=\tau^*\) for all \(t\geq N\).
    \end{enumerate}
\end{propositionE}
\begin{proofE}
    This is an application of Doob's posterior consistency theorem (\citet{doob1949}; see also \citet{miller2018detailed} for a modern summary). This result, which follows from the theory of martingales, assumes that $\tau^*$ is sampled from the prior distribution $\p(\tau)$ and the observations $Z_i$ are defined as above. Doob's theorem states that if for every $S\in\gF$, the map $\tau\mapsto\p_\tau(S)$ is measurable, then the posteriors $\p(\cdot\mid Z_{1:t})$ are almost surely defined and \ref{prop1iid:posterior_convergence} holds $\p$-almost surely with respect to the choice of $\tau^*$.

    In our case, because $\gM$ is countable, the measurability condition is satisfied, showing that \ref{prop1iid:posterior_convergence} holds for $\p$-almost every $\tau^*\in\gM$. In particular, if $\p(\tau^*)>0$, then \ref{prop1iid:posterior_convergence} holds.

    Finally, by \ref{prop1iid:posterior_convergence}, we have that for any $\varepsilon>0$, there exists $N$ such that for every $t \geq N$, $\p(\tau^*\mid Z_{1:t}) > 1-\varepsilon$, or, equivalently, $\sum_{\tau\neq\tau^*} \p(\tau\mid Z_{1:t}) < \varepsilon$, and therefore $\p(\tau\mid Z_{1:t}) < \varepsilon$ for all $\tau\neq\tau^*$. In particular, taking $\varepsilon=1/2$, we get that for sufficiently large $t$, $\p(\tau^*\mid Z_{1:t}) > \p(\tau\mid Z_{1:t})$ for every $\tau\neq\tau^*$, which shows \ref{prop1iid:argmax_posterior}.
\end{proofE}
(All proofs can be found in \cref{sec:proofs}.) Note that this result assumes that all theories in $\gM$ are distinct \emph{as probability measures} (so no two of the $\p_\tau$ are $\mu$-a.e.\ equal). 

\paragraph{On necessity of conditions.} The \iid assumption in \Cref{prop1iid} is necessary; see \Cref{rmk:necessity} for an example where $\limsup_{t\to\infty}\p(\tau^*\mid Z_{1:t})$ is not almost surely equal to 1.



However, in practice, \Cref{prop1iid} can be adapted to more general scenarios, by substituting the subset $\gT \subseteq \gM$ of theories with minimum relative entropy to $\tau^*$ for $\tau^*$ (when $\tau^*$ is not in $\gM$). Then, we can replace convergence to $\delta_{\tau^*}$ with $\p(\gT \mid Z_{1:t}) \to 1$ in \ref{prop1iid:posterior_convergence}, and replace $\tau^*$ with $\gT$ in \ref{prop1iid:argmax_posterior}.


\paragraph{On generalizations to uncountable sets of theories.} The proof of \Cref{prop1iid} critically uses that the set of theories $\gM$ is countable when passing from almost sure convergence under $\tau^*$ sampled from the prior to almost sure convergence for any particular $\tau^*$ with positive prior mass. This argument fails for uncountable $\gM$; indeed, characterization of the $\tau^*$ for which the posterior converges to $\delta_{\tau^*}$ is a delicate problem \citep{freedman1,freedman2,diaconis-freedman}. Concentration of the posterior in neighborhoods of $\tau^*$ under some topology on $\gM$ has been studied by \citet{schwartz1965bayes,barron1999consistency,miller2021asymptotic}, among others. For \emph{parametric} families of theories with parameter $\theta\in\R^d$, under smoothness and nondegeneracy assumptions, the Bernstein-von Mises theorem guarantees convergence of the posterior $\p(\theta\mid Z_{1:t})$ to the true parameter $\theta^*$ at a rate that is asymptotically  Gaussian with inverse covariance $I(\theta^*)t$, where $I(\cdot)$ denotes the Fisher information matrix.

\paragraph{On convergence rates.} While we do not handle the \emph{rate} of convergence in \Cref{prop1iid}, guarantees can be obtained under specific assumptions on the prior and the set of theories. 
For example, for any $\tau\in\gM$, the quantity $D_\tau^t:=\log\frac{\p(\tau^*\mid Z_{1:t})}{\p(\tau\mid Z_{1:t})}$ is a process with $D_\tau^0=\log\frac{\p(\tau^*)}{\p(\tau)}$ and \iid increments, with 
\begin{align*}\label{eq:logratio_increments}
    \E[D_\tau^{t+1}-D_\tau^t]&=\infdiv{\tau^*}{\tau} \\
    \E\left[(D_\tau^{t+1}-D_\tau^t)^2\right]&=\E_{Z\sim\tau^*}\left[\left(\log\frac{\p_{\tau^*}(Z)}{\p_\tau(Z)}\right)^2\right].
    \tagaligneq
\end{align*}
Under the assumption that the variances are finite and uniformly bounded in $\tau$, the central limit theorem would give posterior convergence rate guarantees.

\looseness=-1
However, note that \Cref{prop1iid} is a law-of-large-numbers-like result that holds even if the variances in (\ref{eq:logratio_increments}) are not finite and uniformly bounded.

\paragraph{Harm probability bounds.}

So far, we have considered a collection $\gM$ of distributions over an observation space. Now, we show bounds when each theory computes probabilities over some additional variables. The following lemma extends \Cref{prop1iid}~\ref{prop1iid:argmax_posterior} to estimates of real-valued functions of the theories and observations.

\begin{lemmaE}[][end,restate]\label{function_of_theory_iid}
    Under the conditions of \Cref{prop1iid}, let $f:\gM\times\bigcup_{t=0}^\infty\gZ^t\to\R_{\geq0}$ be a bounded measurable function. Then there exists $N \in \N$ such that for all $t\geq N$ and any $\tilde\tau\in\argmax_\tau[\p(\tau\mid Z_{1:t})f(\tau,Z_{1:t})]$, it holds that $f(\tau^*,Z_{1:t})\leq f\left(\tilde\tau,Z_{1:t}\right)$.
\end{lemmaE}
\begin{proofE}
    First, note that the argmax exists by boundedness of $f$ and $\p(\cdot\mid Z_{1:t})$. By \Cref{prop1iid}~\ref{prop1iid:argmax_posterior}, there exists $N\in\N$ such that for all $t\geq N$ and $\tau\neq\tau^*$, $\p(\tau^*\mid Z_{1:t})>\p(\tau\mid Z_{1:t}) \geq 0$. Let $t\geq N$ and $\tilde\tau\in\argmax_\tau[\p(\tau\mid Z_{1:t})f(\tau,Z_{1:t})]$. Then
    $$\p(\tau^*\mid Z_{1:t})f(\tilde\tau,Z_{1:t}) \geq \p(\tilde\tau\mid Z_{1:t})f(\tilde\tau,Z_{1:t})\geq\p(\tau^*\mid Z_{1:t})f(\tau^*,Z_{1:t}).$$
    When $\tilde\tau\neq\tau^*$, the result follows since $\p(\tau^*\mid Z_{1:t})>0$. The case $\tilde\tau = \tau^*$ is trivial.
\end{proofE}

A particular case of interest is when each theory is associated with estimates of probabilities of harm ($H=1$) given a context $x$ and past observations $Z_{1:t}$. That is, $\gM$ gives rise to a collection of conditional probability mass functions over the possible harm outcomes, denoted $\p(\cdot\mid x,\tau,Z_{1:t})$, for every $x$ lying in some space of possible contexts. In this setting, we have the following corollary:

\begin{propositionE}[\bf Harm probability bound][end,restate]\label{prop:harm_iid}
    Under the same conditions as \Cref{prop1iid}, there exists $N \in \N$ such that for all $t\geq N$ and $\tilde{\tau} \in \arg\max_\tau \p(\tau \mid Z_{1:t}) \p(H=1 \mid x,\tau,Z_{1:t})$, it holds that  
    \begin{equation}
        \p(H=1\mid x,\tau^*,Z_{1:t}) \leq \p(H=1 \mid x,\tilde{\tau},Z_{1:t}).
    \end{equation}
\end{propositionE}
\begin{proofE}
    Apply \Cref{function_of_theory_iid} to the function $f(\tau,Z_{1:t})=\p(H=1\mid x,\tau,Z_{1:t})$.
\end{proofE}

Intuitively, this means that once we have seen enough \iid data, we can act as though the `most cautious yet still plausible' model upper-bounds reality: if that model judges an action safe enough, the true world will be at least as safe.

\section{Non-I.I.D. data}
\label{sec:non-iid}

\newlength{\mainlinewidth}
\setlength{\mainlinewidth}{3.25in}

In this section, we remove the assumption made in \Cref{sec:iid} that the $Z_i$'s are \iid given a theory $\tau^*$.

\paragraph{Setting.} As before, let $(\gZ,\gF,\mu)$ be a $\sigma$-finite Borel measure space of observations. For the results below to hold, we must also assume that $(\gZ,\gF)$ is a Radon space (\eg, any countable set or manifold), so as to satisfy the conditions of the disintegration theorem.

Let $(\gZ^\infty,\gF^\infty,\mu^\infty)$ be the space of infinite sequences of observations, $\gZ^\infty=\{(z_1,z_2,\dots):z_i\in\gZ\}$, with the associated product $\sigma$-algebra and $\sigma$-finite measure. This object is the projective limit of the measure spaces $(\gZ^t,\gF^{\otimes t},\mu^{\otimes t})$, where $\gZ^t=\{(z_1,\dots,z_t):z_i\in\gZ\}$ and the projection $\gZ^{t+1}\to\gZ^t$ `forgets' the observation $z_{t+1}$. A \emph{theory} $\tau$ is a probability distribution on $(\gZ^\infty,\gF^\infty)$ that is absolutely continuous w.r.t.\ $\mu^\infty$. For $A\in\gF^{\otimes t}$, we write $\tau_{1:t}(A)$ for the measure of the cylindrical set, $\tau(A\times\gZ\times\gZ\times\dots)$, so $\tau_{1:t}$ is a measure on $(\gZ^t,\gF^{\otimes t})$. Because $\gF^\infty$ is generated by cylindrical sets, the absolute continuity condition on $\tau$ is equivalent to absolute continuity of $\tau_{1:t}$ w.r.t.\ $\mu^{\otimes t}$ for all $t$.\footnote{This is, in turn, equivalent to absolute continuity of conditional distributions, \ie, for every measurable subset $A\subseteq\gZ^t$ such that $\tau_{1:t}(A)>0$, $\frac{1}{\tau_{1:t}(A)}\,\restr{\tau_{1:t+1}}{{A\times\gZ}}\ll \restr{\mu^{\otimes(t+1)}}{{A\times\gZ}}$, where $A\times\gZ\subseteq\gZ^t\times\gZ\cong\gZ^{t+1}$.}
This condition allows us to define measurable probability density functions $\p_\tau:\gZ^t\to\R_{\geq0}$ as Radon--Nikodym derivatives, so that
\[\forall A\in\gF^{\otimes t},\quad\tau_{1:t}(A)=\int_{z_{1:t} \in A}\p_\tau(z_{1:t})\,d\mu^{\otimes t},\]
\looseness=-1
and measurable conditional probability densities $\p_\tau(z_{t+1}\mid z_{1:t}):=\frac{\p_\tau(z_{1:t},z_{t+1})}{\p_\tau(z_{1:t})}$ when $\p_\tau(z_{1:t}) > 0$. The disintegration theorem for product measures implies that these conditionals and marginals over finitely many observations can be manipulated algebraically using the usual rules of probability for $\mu^\infty$-a.e.\ collection of values, \eg, one has the autoregressive decomposition $\p_\tau(z_{1:t})=\prod_{i=1}^{t}\p_\tau(z_i\mid z_{1:i-1})$, with the conditional $\p_\tau(z_1\mid z_{1:0})$ understood to be the marginal $\p_\tau(z_1)$.

A theory is canonically associated with a $\gZ^\infty$-valued random variable $Z_{1:\infty}$. We denote its components by $Z_1,Z_2,\dots$ and the collection of the first $t$ observations by $Z_{1:t}$.

\paragraph{Definition of posterior as a random variable.} Let $\gM=(\tau_i)_{i\in I}$ be a collection of theories indexed by a countable set $I$\footnote{Unlike in \Cref{sec:iid}, we no longer require theories to be distinct.} and let $\p$ be a prior distribution on $I$. We define the posterior over indices to be
\begin{equation}\label{eq:noniid_posterior}
    \p(i\mid z_{1:t}):=\frac{\p(i)\p_{\tau_i}(z_{1:t})}{\sum_{j\in I}\p(j)\p_{\tau_j}(z_{1:t})},
\end{equation}
assuming the denominator converges to a positive value.

Consider a ground truth index $i^*\in I$ and abbreviate $\tau^*:=\tau_{i^*}$. Let $Z_{1:\infty}$ be the random variable taking values in $\gZ^\infty$ corresponding to $\tau^*$. Similarly to the \iid case, the posterior $\p(\cdot\mid Z_{1:t})$ is a random variable taking values in the space of probability mass functions on $I$. 

For all results below, we assume that $\p(i^*)>0$.

\paragraph{Bayesian posterior convergence.}

Previous work (\eg, \citealp{cohen2022fully}) has shown that if $Z_{1:\infty} \sim \tau^*$, then the limit inferior of $\p(i^* \mid Z_{1:t})$ is almost surely positive. More generally, with probability at least $1-\delta$, the posterior on the truth will not asymptotically go below $\delta$ times the prior on the truth. We repeat that result here in our notation.

\begin{lemmaE}[Martingale][end,restate] \label{lem:mart}
    The process $W_t:=\p(i^* \mid Z_{1:t})^{-1}$ is a supermartingale, \ie, it \textit{does not} increase over time in expectation.
\end{lemmaE}
\begin{proofE}
    We have
    \begin{align*}
    \label{ineq:supermartingale}
        \E_{\tau^*} [W_{t+1} \mid Z_{1:t}=z_{1:t}] &\equal \int_{\scriptsize \big\{z_{t+1} \in \Z \mathbin{:} \p_{\tau^*}(z_{t+1} \mid z_{1:t}) > 0\big\}} \p(i^* \mid z_{1:t+1})^{-1}\p_{\tau^*}(z_{t+1}  \mid  z_{1:t})\,d\mu
        \\
        &\equal^{(a)} \int_{\scriptsize \big\{z_{t+1} \in \Z \mathbin{:} \p_{\tau^*}(z_{t+1} \mid z_{1:t}) > 0\big\}} \frac{\sum_{j\in I} \p(j  \mid  z_{1:t}) \p_{\tau_j} (z_{t+1}  \mid  z_{1:t})}{\p(i^*  \mid  z_{1:t}) \p_{\tau^*} (z_{t+1}  \mid  z_{1:t})}  \p_{\tau^*}(z_{t+1}  \mid  z_{1:t})\,d\mu
        \\
        &\lequal^{(b)} \int_\gZ \frac{\sum_{j \in I} \p(j  \mid  z_{1:t}) \p_{\tau_j} (z_{t+1}  \mid  z_{1:t})}{\p(i^*  \mid  z_{1:t})}\,d\mu
        \\
        &= w_t \sum_{j \in I} \p(j  \mid  z_{1:t}) \int_\gZ \p_{\tau_j} (z_{t+1}  \mid  z_{1:t})\,d\mu
        \\
        &\equal^{(c)} w_t
        \tagaligneq
    \end{align*}
    where $(a)$ is by the definition (\ref{eq:noniid_posterior}), $(b)$ follows from cancellation and positivity of the integrand, $w_t := \p(i^* \mid Z_{1:t}=z_{1:t})^{-1}$ is the realization of $W_t$, and $(c)$ follows because both the posterior and the conditional probability measure integrate to 1.
    % Therefore, it comes that
    % \begin{align*}
    %     \E_{\tau^*} [W_{t+1} \mid W_t = w_t] &\equal \int_{z_1,\dots,z_t} \underbrace{\E_{\tau^*}\big[W_{t+1} \mid W_t = \p(i^* \mid Z_{1:t}=z_{1:t})^{-1}, Z_{1:t} = z_{1:t} \big]}_{= \; \E_{\tau^*}[W_{t+1} \,\mid\, Z_{1:t} = z_{1:t}]} \p_\tau(z_1,\dots,z_t \mid W_t = w_t)\,d\mu^{\otimes t} \\
    %     &\lequal \int_{z_1,\dots,z_t} w_t \p_\tau(z_1,\dots,z_t \mid W_t = w_t)\,d\mu^{\otimes t} \\
    %     &\lequal w_t \\
    % \end{align*}

        This holds for any $z_{1:t}$, so we can remove the conditional:
        
        \begin{align*}
        \E_{\tau^*}[W_{t+1} \mid w_t] &\equal \int_{z_1,\dots,z_t} \E_{\tau^*}[W_{t+1} | z_{1:t}, w_t] \p_{\tau^*}(z_1,\dots,z_t \mid W_t = w_t)\,d\mu^{\otimes t}\\ 
        &\equal^{(a)} \int_{z_{1:t}} \E_{\tau^*}[W_{t+1} | z_{1:t}]\,\p_{\tau^*}(z_{1:t} \mid w_t) \, d\mu^{\otimes t}\\
        &\lequal^{(b)} \int_{z_{1:t}} w_t \, \p_{\tau^*}(z_{1:t} \mid w_t)\,d\mu^{\otimes t} \\
        &\equal w_t
        \end{align*}
        
        where $(a)$ follows because $w_t$ is a function of $z_{1:t}$ and $(b)$ is Inequality~(\ref{ineq:supermartingale}).
\end{proofE}

\begin{propositionE}[Posterior on truth][end,restate] \label{prop:posttruth}
    For all $\delta>0$, with probability at least $1-\delta$, $\inf_t \p(i^* \mid Z_{1:t}) \geq \delta \p(i^*)$; that is, $\tau^*\big(\big\{z_{1:\infty}:\inf_t\p(i^*\mid z_{1:t})<\delta\p(i^*)\big\}\big)\leq\delta$, or equivalently:
    \begin{equation}\label{eqn:proptruth}
        \tau^*\big(\sup_{t \geq 0} W_t \geq (\delta\p(i^*))^{-1}\big) \leq \delta
    \end{equation}
\end{propositionE}
\begin{proofE}
    By Ville's inequality \citep{ville1939} for the supermartingale $W_t := \p(i^* \mid Z_{1:t})^{-1}$, for any $\lambda > 0$:
    $$\tau^*\Big(\sup_{t \geq 0} W_t \geq \lambda\Big) \leq \frac{\E[W_0]}{\lambda} = \frac{1}{\lambda\p(i^*)}$$
    Setting $\lambda = (\delta\p(i^*))^{-1}$, we get
    \begin{equation*}
        \tau^*\big(\sup_{t \geq 0} W_t \geq (\delta\p(i^*))^{-1}\big) \leq \delta
    \end{equation*}
    and given that
    $$\big\{z_{1:\infty} : \sup_{t \geq 0} w_t := \sup_{t \geq 0} \p(i^* \mid z_{1:t})^{-1} > (\delta\p(i^*))^{-1}\big\} = \big\{z_{1:\infty} : \inf_{t \geq 0} \p(i^*\mid z_{1:t}) < \delta\p(i^*)\big\},$$ the result follows.
\end{proofE}
In the language of financial markets, if $W_t$ was the price of a stock at time $t$, you could never make money in expectation by holding it. Suppose that you ``bought shares'' at time 0, paying $W_0$, and waited for their value to increase by a factor of $\delta^{-1}$. If (\ref{eqn:proptruth}) did not hold and the probability of such an increase occurring was greater than $\delta$, then you could make an expected profit by ``$\delta^{-1}$-tupling'' your money with probability greater than $\delta$. 

The bound in \Cref{prop:posttruth} is ``tight''; see \Cref{rmk:tight_asymptotic_posterior_bound}.

\paragraph{Harm probability bounds.} We now state analogues of \Cref{prop:harm_iid} in the non-\iid setting. As above, let $H_t$ be a binary random variable that may depend on $Z_{1:t}$, $\tau$, and a context variable $x_t$.

\begin{propositionE}[Weak harm probability bound][end,restate] \label{prop:harm1}
    For any $\delta > 0$, with probability at least $1-\delta$, the following holds for all $t \in \mathbb{N}$ and all $x_t$:
    \begin{equation*}
    \resizebox{\mainlinewidth}{!}{$
            \displaystyle
            \p (H_t = 1 \mid Z_{1:t}, \tau^*,x_t) \leq \sup_{i \in I} \frac{\p(i \mid Z_{1:t}) \p(H_t = 1 \mid Z_{1:t},\tau_i, x_t)}{ \delta \p(i^*)}.
        $}
    \end{equation*}
\end{propositionE}
\begin{proofE}
    Replacing the supremum over $i\in I$ on the r.h.s.\ by the single term with $i=i^*$ can never increase the r.h.s., since $i^* \in I$. Then, after canceling and rearranging the terms, the proposition is readily implied by \Cref{prop:posttruth}.
\end{proofE}

\looseness=-1
Next, we show how the bound in \Cref{prop:harm1} can be strengthened by restricting to theories that have sufficiently high posterior mass relative to theories that are ``better'' than them.

Let $i^1_{Z_{1:t}},i^2_{Z_{1:t}},i^3_{Z_{1:t}},\dots$ be an enumeration of $I$ in order of decreasing posterior weight $\p(i\mid Z_{1:t})$, breaking ties arbitrarily, for example, following some fixed enumeration of $I$ (\ie, we have $\p(i^n_{Z_{1:t}}\mid Z_{1:t})\geq\p(i^{n+1}_{Z_{1:t}}\mid Z_{1:t})$ for all $n$). 
Each ${i^n_{Z_{1:t}}}$ is an $I$-valued random variable (\ie, the index of a theory in $\gM$). For any $0<\alpha\leq1$, we also define the $\gP(I)$-valued random variable
\begin{equation}\label{eq:def_m_alpha}
    \resizebox{0.91\mainlinewidth}{!}{$
            \displaystyle
    \gI^\alpha_{Z_{1:t}} := \bigg\{i^n_{Z_{1:t}} \in I : \p(i^n_{Z_{1:t}} \mid Z_{1:t}) \geq \alpha \sum_{m \leq n} \p(i^m_{Z_{1:t}} \mid Z_{1:t})\bigg\},
    $}
\end{equation}
which is the set of indices whose posterior mass is at least a fraction $\alpha$ of the total posterior mass of the indices ranked at or above them (\ie, those at least as likely under the posterior, the index itself included). If $\alpha=1$, this set is the singleton $\{i^1_{Z_{1:t}}\}$. For any $0<\alpha<1$, it is nonempty, because it contains $i^1_{Z_{1:t}}$, and finite, since $|\gI^\alpha_{Z_{1:t}}|\geq N$ implies (easily by induction) that
$\sum_{i\in\gI^\alpha_{Z_{1:t}}}\p(i\mid Z_{1:t})
    \geq
    \left(\frac{1}{1-\alpha}\right)^{N-1}\p(i^1_{Z_{1:t}}\mid Z_{1:t}).$

The following is a variant of \citet[Thm 2]{cohen2022fully}.
\begin{propositionE}[\bf True theory dominance][end,restate]
\label{lem:inset}
    If $\alpha < \delta \p(i^*)$, then with probability at least $1-\delta$, for all $t \in \mathbb{N}$, $i^* \in \gI^\alpha_{Z_{1:t}}$.
\end{propositionE}
\begin{proofE}
    For any $t\geq1$, by \Cref{prop:posttruth},
    \begin{align*}
        \delta
        &\geq
        \tau^*\Big(\big\{z_{1:\infty}:\inf_{t'}\p(i^*\mid z_{1:t'})<\delta\p(i^*)\big\}\Big)\\
        &\geq
        \tau^*(\{z_{1:\infty}:\p(i^*\mid z_{1:t})<\delta\p(i^*)\})\\
        &\geq
        \tau^*(\{z_{1:\infty}:\p(i^*\mid z_{1:t})<\alpha\}).
    \end{align*}
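    So $\tau^*(\{z_{1:\infty}:\p(i^*\mid z_{1:t}) \geq \alpha\}) \geq 1 - \delta$ for each fixed $t$. Moreover, the event in the first line of the display does not depend on $t$, so on its complement, which has probability at least $1-\delta$, we have $\p(i^*\mid z_{1:t})\geq\delta\p(i^*)>\alpha$ for all $t$ simultaneously. The result follows by the fact that $\gI^\alpha_{Z_{1:t}} \supseteq \{i\in I : \p(i \mid Z_{1:t}) \geq \alpha\}$, since the sum in (\ref{eq:def_m_alpha}) never exceeds $1$.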
\end{proofE}

\begin{propositionE}[\bf Harm probability bound][end,restate] \label{prop:harm2}
    If $\alpha < \delta \p(i^*)$, then with probability at least $1-\delta$, for all $t\in\N$ and all $x_t$,
    \begin{equation}
        \label{eq:harm2}
        \resizebox{0.91\mainlinewidth}{!}{
        $
            \displaystyle
          \p (H_t = 1 \mid Z_{1:t}, \tau^*, x_t) \leq
          \max_{i \in \gI^\alpha_{Z_{1:t}}} \p (H_t = 1 \mid Z_{1:t}, \tau_i, x_t) 
          $
          }
    \end{equation}
\end{propositionE}
\begin{proofE}
    This follows directly from \Cref{lem:inset}.
\end{proofE}

Intuitively, this means that even when the data distribution shifts over time, it is enough to check a small `credible but cautious' set of high-posterior theories: if all of them say the action is safe below the threshold, the real environment is very unlikely to be worse. Because the conclusion of \Cref{prop:harm2} is much stronger than that of \Cref{prop:harm1}, it would be much safer (or more useful, depending on the value of $\alpha$) to use $\argmax_{i \in \gI^\alpha_{Z_{1:t}}} \p(H_t = 1 \mid Z_{1:t}, \tau_i,x_t)$ as a `paranoid' theory rather than $\argmax_{i\in I} \p(i \mid Z_{1:t}) \p (H_t = 1 \mid Z_{1:t}, \tau_i, x_t)$. The factor of $(\delta\p(i^*))^{-1}$ in \Cref{prop:harm1} could render the upper bound on harm probability much larger than the trivial upper bound of 1. However, we note that approximating $\gI^\alpha_{Z_{1:t}}$ -- such as by amortization or by Monte Carlo methods -- is much more difficult than approximating the posterior alone.

\paragraph{On the harm-recording mechanism.}

Suppose that $\tau_{i^*}=\tau^*$ is a data-generating process meeting the description ``$H_t = 1$ when harm has occurred'', while $\tau_{i^\dagger}=\tau^\dagger$ is a data-generating process meeting the description ``$H_t = 1$ when harm is recorded as having occurred'' and agreeing with $\tau^*$ in its observational predictions otherwise. If, and only if, the recording process is functioning correctly, $\tau^* = \tau^\dagger$. For as long as the recording process is functioning correctly, $\p(i^* \mid Z_{1:t}) / \p(i^\dagger \mid Z_{1:t}) = \p(i^*) / \p(i^\dagger)$. If the recording process ever fails at time $t$, then $Z_t \sim \p_{\tau^\dagger}$, not $\p_{\tau^*}$, since $Z_t$ is the result of this recording process; therefore, $\p(i^* \mid Z_{1:t}) / \p(i^\dagger \mid Z_{1:t})$ would \textit{decrease} in expectation, perhaps dramatically. We should not expect $\p(i^*)$ to naturally win out over $\p(i^\dagger)$, even if there are no mistakes when recording how harmful certain situations are. However, the following holds with probability approaching $1$ as $\alpha\to0$: for all $t$, if the recording process has not failed by time $t$, $\gI^\alpha_{Z_{1:t}}$ contains both $i^*$ and $i^\dagger$. If $\tau^*$ considers tampering with the recording process to be a `harmful' outcome, then an AI system could attempt to avoid a first instance of tampering at time $t$, for all $t$.

\section{Experiments}\label{sec:experiments}


\begin{figure*}[t]
    \centering
    \includegraphics[width=\linewidth]{images/deaths_and_rewards_vs_alpha_error.pdf}
    \caption{Mean episode deaths and reward for different guardrails in the exploding bandit setting.}
    \label{fig:reward_vs_deaths}
\end{figure*}


\paragraph{Exploding bandit setting.}
\looseness=-1
We evaluate\footnote{Code available at \url{https://github.com/saifh-github/conservative-bayesian-public}} the performance of safety guardrails based on \Cref{prop:harm_iid} and \Cref{prop:harm2} in a bandit MDP with $10$ arms (actions).
%\footnote{Code available at \url{https://github.com/saifh-github/conservative-bayesian-public}.} 
Each arm $a\in\{1,\dots,10\}$ is represented by a feature vector $f_a\in\{0,1\}^{d}$ (we take $d=10$, but $d$ is not necessarily equal to the number of arms), which is sampled uniformly at random at the start of each episode and known to the agent. The reward distribution of each arm is fixed for the duration of each episode and assumed to be of the following form: the reward received after taking action $a$ follows a unit-variance normal distribution, $r(a)\sim\mathcal N (f_a\cdot v^*, 1)$, where $v^*\in\{0,1\}^d$ is some vector sampled uniformly at random at the start of each episode and unknown to the agent. Taking any action and observing the reward gives evidence about the identity of $v^*$ and thus about the reward distributions of the other actions. The agent maintains a belief over the vector used to compute the reward, beginning with a uniform prior over $\{0,1\}^{d}$ and updating its posterior with each observation of an action-reward pair. 

\looseness=-1
We assume that the agent samples its actions from a Boltzmann policy (with temperature 2) using the expected reward of each action under its posterior given the data seen so far, meaning that a reward vector $v\in \{0,1\}^d$ determines a distribution over sequences of action-reward pairs. Thus each $v\in\{0,1\}^{d}$ can be naturally associated with a theory $\tau_v$\footnote{\looseness=-1 The mapping $v\mapsto\tau_v$ may not be injective. Different vectors may represent the same collection of reward distributions and therefore the same distribution over sequences of action-reward pairs.}, and thus $I:=\{0,1\}^d$ is an indexing set for a collection of theories $\gM=(\tau_v)_{v\in I}$.
% this mapping gives rise to a multiset of theories $\gM$. 
Inference of $v$ 
% under a uniform prior over $\{0,1\}^{d}$ 
with evidence collected on-policy is equivalent to inference of $\tau_v$ 
% under a uniform prior over $\gM$ 
given data generated by a true theory $\tau^*:=\tau_{v^*}$. 
Since the policy changes across timesteps, so does the distribution of action-reward pairs, so we are in the non-\iid setting.

\looseness=-1
The bandit comes with a notion of harm: if the reward received at a given timestep exceeds some threshold $E$, the bandit explodes\footnote{This emulates the important and problematic scenario where the user's utility to be maximized, \eg, profits, conflicts with safety.}, terminating the episode. In other words, we define harm as $H_t:=\mathbbm{1}[R_t>E]$, where $R_t$ is the random variable representing the reward received when taking action $a_t$. $E$ is set to a Monte Carlo approximation of the expected highest mean reward of any action (\ie, $\mathbb{E}\left[\max_a(f_a\cdot v^*)\right]$). The maximum episode length is $25$ timesteps.

\paragraph{Safety guardrails.}

A \emph{guardrail} is an algorithm that, given a possible action and context (\eg, current state and history), determines whether taking the action in the context is admissible. A guardrail can be used to mask the policy to forbid certain actions, such as those whose estimated harm exceeds some threshold $C$. We compare several guardrails (formally defined below): those constructed from \Cref{prop:harm_iid} and \Cref{prop:harm2}, one that marginalizes across the posterior over $\tau$ to get the posterior predictive harm probability, and one that `cheats' by using the probability of harm under the true theory $\tau^*$. Recall that $Z_{1:t}$ consists of the observations (\ie, actions taken and rewards received) at previous timesteps.
\begin{itemize}[nosep,left=0pt]
    \item \textbf{\Cref{prop:harm_iid} guardrail:} rejects an action $a_{t+1}$ if there exists $ \tilde{v} \in \argmax_v \p(v \mid Z_{1:t}) \p(H_{t+1}=1 \mid \tau_v,Z_{1:t},a_{t+1})$ with $\p(H_{t+1}=1 \!\mid\! {\tau_{\tilde v}},Z_{1:t},a_{t+1})>C$ (note that the assumptions of \iid observations and distinct theories do not hold).
    \item \textbf{\Cref{prop:harm2} guardrail:} rejects an action $a_{t+1}$ if $\max_{v \in \gI^\alpha_{Z_{1:t}}} \p (H_{t+1} = 1 \mid Z_{1:t}, \tau_v, a_{t+1})>C.$
    \item \textbf{Posterior predictive guardrail:} rejects an action $a_{t+1}$ if $\p(H_{t+1}=1\mid Z_{1:t},a_{t+1})>C$.
    \item \textbf{Cheating guardrail:} rejects an action $a_{t+1}$ if $\p(H_{t+1}=1\mid Z_{1:t}, \tau^*, a_{t+1})>C$ (note that this guardrail assumes knowledge of the true theory $\tau^*$).
\end{itemize}
The guardrail is run at every sampling step, and actions that the guardrail rejects are forbidden to be sampled by the agent. If all actions are rejected, the episode terminates.


\begin{figure}[t]
    \centering
    \begin{subfigure}[b]{0.49\textwidth}
        \centering
        \includegraphics[width=\textwidth]{images/overestimation_error.pdf}\\[-0.5em]
        \caption{The frequency with which the inequality in \Cref{prop:harm2} is satisfied.}
        \label{fig:overestimation}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.49\textwidth}
        \centering
        \includegraphics[width=\textwidth]{images/harm_estimates.pdf}\\[-0.5em]
        \caption{The distribution of the right-hand side of (\ref{eq:harm2}), for an action with a true harm probability of $0.5$. }
        \label{fig:harm_estimates}
    \end{subfigure}
    \caption{Overestimate frequency and harm estimate distribution for the \Cref{prop:harm2} guardrail for varying $\alpha$.}
    \label{fig:tightness}
\end{figure}

\paragraph{Results.}
\looseness=-1
\Cref{fig:reward_vs_deaths} shows mean episode rewards and episode deaths under each guardrail across $10000$ episodes, for different values of the rejection threshold $C$. The cheating guardrail achieves near-zero deaths for sufficiently small $C$, but for $C=0.1$ its death probability is high.\footnote{If every action taken had a harm probability of $0.1$, the probability of death across an episode would be $1-\big((1-0.1)^{25}\big)\approx 0.93$.} The posterior predictive guardrail achieves zero deaths for small $C$, while for larger $C$ it dies frequently, generally receiving lower reward compared to the cheating guardrail. The behavior of the \Cref{prop:harm2} guardrail depends strongly on $\alpha$. When $\alpha$ is close to $1$, actions are rarely rejected, leading to frequent deaths. Up to a point, this riskier behavior allows the agent to get more reward, but for $C=0.1$ and high $\alpha$ the trend starts to reverse, as early deaths become frequent enough to preclude the opportunity. At the other extreme, when $\alpha$ is close to $0$, the candidate set of theory indices $\gI^\alpha_{Z_{1:t}}$ is larger and the guardrail is extremely conservative. It rejects almost all actions, resulting in low deaths and low reward. This is the case even for larger $C$, since the estimated probability used to filter actions tends to overestimate an action's harm probability under the true theory. For middling values of $\alpha$, the \Cref{prop:harm2} guardrail performs more similarly to the posterior predictive guardrail, sometimes with lower reward and higher deaths, and sometimes the opposite. The \Cref{prop:harm_iid} guardrail, which makes the incorrect assumptions of \iid data and distinct theories, is similarly conservative to the \Cref{prop:harm2} guardrail with low $\alpha$.

\paragraph{Tightness of bounds.}\looseness=-1
\Cref{fig:tightness} shows how often and how tightly the inequality in \Cref{prop:harm2} is satisfied. For an agent following a uniform policy across $10000$ bandit episodes without action rejection or death, \Cref{fig:overestimation} shows the frequency with which $\max_{v \in \gI^\alpha_{Z_{1:t}}}\p (H_{t+1} = 1 \mid Z_{1:t}, \tau_v, a_{t+1})$ overestimates the true harm probability. \Cref{prop:harm2} gives a lower bound of $1-\frac{\alpha}{\p (v^*)}$ (which may be negative) on the overestimation frequency, but the frequency significantly exceeds the bound for larger $\alpha$. \Cref{fig:harm_estimates} shows the distribution of harm estimates for actions with a ground truth harm probability of $0.5$. For large $\alpha$ the harm of these dangerous actions is usually \emph{underestimated} -- so the high overestimation rate (\Cref{fig:overestimation}) comes from less dangerous actions.


\section{Conclusion and open problems}

The approach to safety verification proposed here is based on context-dependent run-time verification because the set of possible inputs for a machine learning system is generally astronomical, whereas the safety of the answer to a specific question is more likely to be tractable. It focuses on the risk of wrongly interpreting the data, including the safety specification itself (called ``harm'' above) and exploits the fact that, as more evidence is gathered (necessary with \iid data) and when different theories predict different observations, the true interpretation rises towards the maximal value of the Bayesian posterior. The bound is tighter with the \iid data, but the \iid assumption is also unrealistic, and for safety-critical decisions, we would prefer to err on the side of prudence and fewer assumptions. However, it provides a template to think about variants of this idea in future work. Several challenges remain for turning such bounds into an operational run-time safeguard:
\begin{enumerate}[left=0pt,label=(\arabic*)]
\item \textbf{Upper-bounding overcautiousness.} Can we ensure that we do not underestimate the probability of harm but do not massively overestimate it? Some simple theories consistent with the dataset (even an arbitrarily large one) might deem non-harmful actions harmful. Can we bound how much this harm-avoidance hampers the agent? A plausible approach would be to make use of a mentor for the agent that demonstrates non-harmful behavior \citep{cohen2020pessimism}.
\item \textbf{Tractability of posterior estimation.} How can we efficiently estimate the required Bayesian posteriors? For computational tractability, a plausible answer would rely on amortized inference, which turns the difficult estimation of these posteriors into the task of training a neural net probabilistic estimator which will be fast at run-time. Recent work on amortized Bayesian inference for symbolic models, such as causal structures~\citep{deleu2022bayesian,deleu2023joint}, and for intractable posteriors in language models~\citep{guo2021efficient,hu2024amortizing,venkatraman2024amortizing,song2024latent,yu2024flow} -- which are useful when prior knowledge is encoded in a pretrained foundation model -- suggests that this is feasible. Advances in efficient and adaptive Monte Carlo methods, \eg, for language models \citep{phan2023training,zhao2024probabilistic,lew2023sequential}, can also be useful for this purpose, and MCMC approaches can complement and aid amortization \citep{hu2023gfnem,kim2023local,sendera2024improved,kim2024ant}.
\item \textbf{Efficient search for a cautious theory.} How can we efficiently identify a cautious but plausible theory that upper-bounds the risk, since this requires an optimization at run-time? Again, a plausible answer is to rely on amortized probabilistic inference, \eg, by sampling theories with a low or annealed temperature, a technique that has been used for decades in the nonconvex optimization literature and more recently combined with amortized sequential sampling methods \citep[][\emph{inter alia}]{zhang2023robust,zhang2023let,zhou2024phylogfn}.
\item \textbf{Partial specification of theories.} Another issue is that a full explanatory theory $\tau$ for the evidence $D$ will generally be too large an object to make the above bound calculations and optimization sufficiently efficient. An alternative would be to derive bounds that only require partial explanations, similarly to how humans tell cautious `stories' about what could go wrong in uncertain circumstances when thinking about taking a potentially dangerous decision~\citep{bengio2024towards}. Work along these lines could allow us to avoid the requirement that we have a prior that puts positive weight on a single model that correctly models the entire world.
\item \textbf{Harm specification in natural language.} How can natural language safety specifications, as in Constitutional AI \citep{bai2022constitutional}, be turned into a random variable over which conditional probabilities can be estimated? Manually labeling numerous examples matching the desired constraint is impractical. Instead, \citet{bengio2024towards} propose learning to interpret natural language statements as formal random variables over which probabilistic causal reasoning can occur. The ambiguity of natural language can then be captured by a distribution over formal interpretations.
\item \textbf{Effect of approximation error.} Finally, because we plan to use machine learning to estimate the bound, we have to consider the fact that a learned predictor will be imperfect. How do we conservatively correct for the potentially dangerous errors made by this predictor? This may be crucial if we use a neural network to estimate probabilities, since they are subject to lack of robustness and adversarial attacks~\citep{goodfellow2014explaining,wei2024jailbroken}, \eg, how do we ensure that the agent proposing the actions has not learned to find loopholes in the risk estimator?
\end{enumerate}

\bibliography{reference}
% Acknowledgements: thank Oliver Richardson
\newpage
\appendix
\onecolumn

\section{Proofs}
\label{sec:proofs}

\printProofs

\section{Accompanying examples}

\begin{remark}[Necessity of conditions, \iid case]
\label{rmk:necessity}
    The assumption that the data-generating process $\tau^*$ lies in $\gM$ and has positive prior mass is also necessary for convergence of the posterior. To illustrate this, we give a simple example in which the theories are Bernoulli distributions and the posterior does not converge to any distribution over $\gM$.
    
    Take $\gZ=\{-1,1\}$ and $\gM=\{\tau_p,\tau_{1/2},\tau_{1-p}\}$ for some $\frac12<p<1$, where $\p_{\tau_c}(1)=c$. Assume a prior with $\p(\tau_p)=\p(\tau_{1-p})=\frac12$ and take the true data-generating process $\tau^*$ to be $\tau_{1/2}$, which has prior mass 0. The log-ratio of posterior masses is then an unbiased random walk: 
    \[
        \log\frac{\p(\tau_p\mid Z_{1:t})}{\p(\tau_{1-p}\mid Z_{1:t})}
        =
        \log\frac{\p_{\tau_p}(Z_{1:t})}{\p_{\tau_{1-p}}(Z_{1:t})}
        =
        \left(\log\frac{p}{1-p}\right)\sum_{i=1}^tZ_i.
    \]
    This quantity almost surely takes on arbitrarily large positive and negative values infinitely many times. In fact, by the law of the iterated logarithm, for any $\epsilon>0$ there are almost surely infinitely many $t$ such that
    \[\log\frac{\p(\tau_p\mid Z_{1:t})}{\p(\tau_{1-p}\mid Z_{1:t})}\geq(1-\epsilon)\left(\log\frac{p}{1-p}\right)\sqrt{2t\log\log t}\]
    and the same holds for $\log\frac{\p(\tau_{1-p}\mid Z_{1:t})}{\p(\tau_{p}\mid Z_{1:t})}$, by symmetry. In particular, $\lim_{t\to\infty}\p(\tau\mid Z_{1:t})$ almost surely does not exist for any $\tau\neq\tau^*$, and the $\liminf$ and $\limsup$ are almost surely 0 and 1, respectively.
\end{remark}

\begin{remark}[Tightness, non-\iid case]
\label{rmk:tight_asymptotic_posterior_bound}
    \Cref{prop:posttruth} is ``tight'' in the following sense: for all $\delta, \varepsilon > 0$, there exist $\M$, $\p$, and $\tau_{i^*} \in \M$, such that with probability at least $\delta$, $\limsup_t \p(i^* \mid Z_{1:t}) \leq (\delta + \varepsilon) \p(i^*)$.
    
    We construct such an example. Consider the following setting: $\M = \{\tau^*, \tau'\}$ (indexed by $I=\{i^*,i'\}$ as $\tau_{i^*}=\tau^*,\tau_{i'}=\tau'$), $\Z = \{0, 1\}$, and the theories are defined by
    \begin{align*}
        &\p_{\tau^*}(1) = \delta,\quad
        \p_{\tau'}(1) = 1, \\
        &\p_{\tau_i}(1 \mid z_{1:t}) = \frac12  \;\forall i\in I,t\geq1,z_{1:t}\in\gZ^t.
    \end{align*}
    One has $\p(i^* \mid Z_1=1) = \frac{\delta \p(i^*)}{\delta \p(i^*) + \p(i')} < \delta \frac{\p(i^*)}{1 - \p(i^*)}.$
    Since $\tau^*$ and $\tau'$ give the same conditional probabilities of $Z_t$ given $Z_{1:t-1}$ for $t>1$, one has $\p(i^*\mid Z_{1:t})=\p(i^*\mid Z_1)$. So, for all $t\geq1$, $\p(i^* \mid Z_1=1,Z_{2:t}=z_{2:t}) < \delta (1 - \p(i^*))^{-1}\p(i^*)$,
     and hence
     \begin{multline}
         \tau^*\Big(\big\{z_{1:\infty}:\limsup_t\p(i^*\mid z_{1:t})<\delta (1 - \p(i^*))^{-1}\p(i^*)\big\}\Big)\geq \\ \tau^*\big(\big\{z_{1:\infty}:z_1=1\big\}\big)=\p_{\tau^*}(1)=\delta.
     \end{multline}
    So by choosing $\p(i^*) < 1 - 1/(1+\frac\varepsilon\delta)$, so that $\delta(1-\p(i^*))^{-1}<\delta+\varepsilon$, we get the desired property.
\end{remark}

\end{document}
