\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{oesterheld_748}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\title{Incentivizing honest performative predictions with proper scoring rules (Supplementary material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<oesterheld@cmu.edu>?Subject=Incentivizing honest performative predictions with proper scoring rules}{Caspar~Oesterheld}{}\thanks{Equal contribution}}
\author[2]{\href{mailto:<jtreutlein@berkeley.edu>?Subject=Incentivizing honest performative predictions with proper scoring rules}{Johannes~Treutlein}{}\footnote[1]{}}
\author[3]{Emery~Cooper}
\author[4]{Rubi~Hudson}
% Add affiliations after the authors
\affil[1]{%
Carnegie Mellon University
}
\affil[2]{%
University of California, Berkeley
}
  \affil[3]{%
Center on Long-Term Risk
  }
\affil[4]{%
University of Toronto
  }

\usepackage{xparse}

  \usepackage{amsthm}
\usepackage{amssymb}

\usepackage{cleveref}
\usepackage{thm-restate}
\usepackage{mathtools}
\usepackage{bbm}

\usepackage{csquotes}
\usepackage{nicefrac}

\usepackage{comment}

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\newcommand{\thisthmnumber}{}
\theoremstyle{plain}
\newtheorem{lemma}{Lemma\thisthmnumber}
\newtheorem{theorem}{Theorem\thisthmnumber}
\newtheorem{proposition}{Proposition\thisthmnumber}
\newtheorem{corollary}{Corollary\thisthmnumber}

\usepackage{float}

\theoremstyle{definition}
\newtheorem{definition}{Definition\thisthmnumber}
\newtheorem{example}{Example\thisthmnumber}

\NewDocumentEnvironment{statement}{mo}
 {%
  \IfValueT{#2}{\renewcommand{\thisthmnumber}{ #2}}\begin{#1}%
 }
 {\end{#1}}


\newcommand{\ddp}{\frac{d}{dp}}
\newcommand{\ddx}{\frac{d}{dx}}
\newcommand{\partialp}{\frac{\partial}{\partial p}}
\newcommand{\Rbar}{\overline{\mathbb{R}}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\Prob}{\mathbb{P}}
\newcommand{\Score}{S}
\newcommand{\Id}{I}
\newcommand{\p}{{\bm{p}}}
\newcommand{\q}{\bm{q}}
\newcommand{\Pvar}{\bm{P}}
\newcommand{\Y}{Y}
\newcommand{\y}{\bm{y}}
\newcommand{\Regret}{\mathrm{Regret}}
\newcommand{\x}{\bm{x}}

%for tangent vectors
\newcommand{\Tv}{\bm{v}}
\newcommand{\Tw}{\bm{w}}

\newcommand{\op}{\mathrm{op}}

\newcommand{\D}{\mathcal{D}}
\newcommand{\N}{\mathcal{N}}
\newcommand{\Pset}{\Delta(\N)}
\newcommand{\TPset}{\mathcal{T}}
\newcommand{\interior}[1]{\mathrm{int}(#1)}

\usepackage{xcolor}

\usepackage{ifthen}
\newboolean{commentsactivated}
\setboolean{commentsactivated}{false}
\newcommand{\vc}[1]{\ifthenelse{\boolean{commentsactivated}}{{\color{blue} {\em VC: #1 }}}{}}
\newcommand{\co}[1]{\ifthenelse{\boolean{commentsactivated}}{{\color{red} {\em CO: #1 }}}{}}
\newcommand{\jt}[1]{\ifthenelse{\boolean{commentsactivated}}{{\color{olive} {\em JT: #1 }}}{}}
\newcommand{\ec}[1]{\ifthenelse{\boolean{commentsactivated}}{{\color{teal} {\em EC: #1 }}}{}}
\newcommand{\RH}[1]
{\ifthenelse{\boolean{commentsactivated}}{{\color{violet} {\em RH: #1 }}}{}}


% Em stuff
\usepackage{bm}
\newcommand{\norm}[1]{\left\Vert#1\right\Vert}
\renewcommand{\vec}[1]{\bm{#1}} % How vectors display
\usepackage{tikz}
\usepackage{tkz-euclide}
\usetikzlibrary{decorations.markings, calc}
\newcommand{\defeq}{\vcentcolon=}
\newcommand{\eqdef}{=\vcentcolon}

\usepackage{subcaption}

\setcounter{example}{4}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle


\appendix











\section{Proofs}
\label{appendix:proofs}
%\subsection{Preliminaries}

\begin{comment}
\subsubsection{Differentiable scoring functions}

, which is the inner product space
\begin{align*}
\TPset&:=\{\x\in\mathbb{R}^n\mid \sum_ix_i=0\}\\
&=\{\alpha(\p-\q)\mid \alpha\in\mathbb{R},\p,\q\in\Pset\}\end{align*}
with the Euclidean scalar product \(\langle \Tv,\Tw\rangle = \Tv^\top \Tw\) and the Euclidean norm \(\Vert \Tv \Vert = \sqrt{\Tv^\top \Tv}\).
For any \(\Tv\in\TPset\), \(\partial_\Tv G(\p)=g(\p)^\top\Tv\) then gives the directional derivative of \(G\) at point \(\p\) in the direction \(\Tv\). Note that we have to be somewhat careful since \(G\) is only defined on the simplex \(\Pset\), so that the partial derivatives \(\frac{\partial G(\p)}{\partial p_i}\) may not be defined. (If $G(\p)$ is naturally extended to $\mathbb{R}^n$, one could use the regular partial derivatives of $G(\p)$. But we don't want to do this.)
Instead, only directional derivatives with directions in \(\TPset\) are defined. This means that there is a degree of freedom in choosing a coordinate representation of \(\nabla G\) in \(\mathbb{R}^n\). In the following, we assume that \(\sum_i g_i(\p)=0\), or equivalently, that \(g(\p)^\top\x=0\) for any \(\x\in\mathbb{R}^n\) orthogonal to \(\TPset\).\co{(This works for mere subgradients as well. If $g$ is a subgradient of $G$ on the simplex, then so is $\tilde g (p)=(g_i(p)-\nicefrac{1}{n}\sum_jg_j(p))_i$. So generally we can assume if we want that the entries of $g$ sum to $0$.) Jonny says: Actually this doesn't quite work because the subgradient may be infinite at the edges.}
\jt{note that we need to define this where we mention it the first time.}
Note that this is also the choice that minimizes \(\Vert g(\p)\Vert\) and makes sure that \(\Vert g(\p)\Vert = \Vert g(\p)^\top|_{\TPset} \Vert_{\mathrm{op}}\), where $g(\p)^\top|_{\TPset}$ denotes the function $\TPset\rightarrow\mathbb R\colon \mathbf v \mapsto g(\p)^\top \mathbf v$. %\co{My interpretation of the right-hand side is that you interpret $g(\p)^\top$ as a function and restrict it to $\mathcal T$. Correct?}, \jt{correct}
This is due to the Cauchy--Schwarz inequality %\co{That's Cauchy-Schwarz, I assume?} \jt{correct}
and Pythagorean's theorem, since \(\Vert g(p)+\alpha \mathbf{1}\Vert^2=\Vert g(p)\Vert^2 + |\alpha|^2 \Vert \mathbf{1}\Vert^2\) for any \(\alpha\in\mathbb{R}\) when \(g(p)\in\TPset\). Moreover, by Cauchy--Schwarz, we have \(\Vert g(\p)^\top \vec{v}\Vert \leq \Vert g(\p)\Vert\Vert \vec{v}\Vert\) for any \(\vec{v}\in\TPset\) and if \(g(\p)\in\TPset\) then \(\Vert g(\p)^\top|_{\TPset}\Vert_{\mathrm{op}}\geq\Vert g(\p)^\top g(\p)\Vert/\Vert g(\p)\Vert = \Vert g(\p)\Vert\)
%\co{Our results hold even if $g$ is not in the tangent space. But the bound given by Theorem 3 is tightest if $g$ is in the tangent space. Should we discuss this? Where? Basically the operator norm of $g(p)$ is equal to the L2 norm if $g$ is in tangent space.}
%\jt{correct. where to discuss, I'm not sure.}\ec{To me, this doesn't seem worth saying.}

\hline

If \(G\) is differentiable at some point \(\p\),  then the subgradient \(g(\p)\) is just the gradient of \(G\), \(g(\p)=\nabla  G(\p)\). Here, \(\nabla G(\p)\) is an element of the \emph{tangent space} of \(\Pset\), which is the inner product space
\begin{align}\TPset&:=\{\x\in\mathbb{R}^n\mid \sum_ix_i=0\}\\
&=\{\alpha(\p-\q)\mid \alpha\in\mathbb{R},\p,\q\in\Pset\}\end{align}
with the euclidean scalar product \(\langle \Tv,\Tw\rangle = \Tv^\top \Tw\).
For any \(\Tv\in\TPset\), \(\partial_\Tv G(\p)=g(\p)^\top\Tv\) then gives the directional derivative of \(G\) at point \(\p\) in the direction \(\Tv\). Note that we have to be somewhat careful since, if \(G\) is only defined on the simplex \(\Pset\), the partial derivatives \(\frac{\partial G(\p)}{\partial p_i}\) may not be defined.
Instead, only directional derivatives with directions in \(\TPset\) are defined. This means that there is a degree of freedom in choosing a coordinate representation of \(\nabla G\) in \(\mathbb{R}^n\). In the following, we assume that \(\sum_i g_i(\p)=0\), or equivalently, that \(g(p)^\top\x=0\) for any \(\x\in\mathbb{R}^n\) orthogonal to \(\TPset\).\co{(This works for mere subgradients as well. If $g$ is a subgradient of $G$ on the simplex, then so is $\tilde g (p)=(g_i(p)-\nicefrac{1}{n}\sum_jg_j(p))_i$. So generally we can assume if we want that the entries of $g$ sum to $0$.) Jonny says: Actually this doesn't quite work because the subgradient may be infinite at the edges.}
\co{Is it worth pointing out that these are all $L2$ norms?}
\jt{note that we need to define this where we mention it the first time.}
Note that this is also the choice that minimizes \(\Vert g(p)\Vert\) and makes sure that \(\Vert g(p)\Vert = \Vert g(\p)^\top|_{\TPset} \Vert_{\mathrm{op}}\), where $g(\p)^\top|_{\TPset}$ denotes the function $\TPset \rightarrow \mathbb R\colon \mathbf v\mapsto g(\p)^\top \mathbf v$. %\co{My interpretation of the right-hand side is that you interpret $g(\p)^\top$ as a function and restrict it to $\mathcal T$. Correct?}\jt{correct}.
This is due to the Cauchy--Schwarz-inequality and Pythagorean's theorem, since \(\Vert g(p)+\alpha \mathbf{1}\Vert=\Vert g(p)\Vert + |\alpha| \Vert \mathbf{1}\Vert\) for any \(\alpha\in\mathbb{R}\) when \(g(p)\in\TPset\). Moreover, by CS-inequality, we have \(\Vert g(p)^\top v\Vert \leq \Vert g(p)\Vert\Vert v\Vert\) for any \(v\in\TPset\) and if \(g(p)\in\TPset\) then \(\Vert g(p)^\top|_{\TPset}\Vert_{\mathrm{op}}\geq\Vert g(p)^\top g(p)\Vert/\Vert g(p)\Vert = \Vert g(p)\Vert\)
\co{Our results hold even if $g$ is not in the tangent space. But the bound given by Theorem 3 is tightest if $g$ is in the tangent space. Should we discuss this? Where? Basically the operator norm of $g(p)$ is equal to the L2 norm if $g$ is in tangent space.}
\jt{correct. where to discuss, I'm not sure.}

As in the case of \(g\), note that the matrix representation of \(Dg(\p)\) in \(\mathbb{R}^{n,n}\) is not unique. (If \(G\) is defined on all of $\mathbb{R}^n$, as opposed to just on \(\Pset\), then in principle this defines \(Dg(\p)\) for all \(\mathbb{R}^{n,n}\). But we don't want to use this.)

Generally it doesn't matter much which representation of $Dg(\p)$ we use. Importantly, for all $v\in\TPset$, $Dg(p)v$ will always be unique and (because we assume that $g(\p)$ is in the tangent space) in $\mathcal{T}$. We have to take care, though, that when taking the operator norm or eigenvalues of $Dg(p)$, we consider $Dg(p)$ as restricted to $\mathcal{T}$. Other eigenvalues of $Dg(p)$ are sensitive to the representation.

(Alternatively, we could normalize $Dg(\p)$ to satisfy $Dg(\p) \mathbf{1}=0$. In that case, we'd have $\Vert Dg(\p) \Vert_{\mathrm{op}}=\Vert Dg(\p)|_{\TPset} \Vert_{\mathrm{op}}$.)

%\jt{Have to make sure that \(Dg(\p)v\in\TPset\) for all \(v\in\TPset\). Can also just assume that \(Dg(\p)\)'s rows sum to \(0\)}
%
%
%\jt{still need to think about everything more}

Given \(G,g\) as in the Gneiting and Raftery representation, we write \(Dg(\p)\in\mathbb{R}^{n,n}\) for the Jacobian matrix of \(g\), if it exists (i.e., this is the Hessian of \(G\)). As in the case of \(g\), note that the matrix representaiton of \(Dg(\p)\) in \(\mathbb{R}^{n,n}\) is not unique if \(G\) is only defined on \(\Pset\).

Note that since \(g(\p)\in \TPset\) by assumption, also \(Dg(\p)\vec{v}\in \TPset\) for any \(\vec{v}\in \TPset\). It is underspecified what happens to vectors \(\vec{v}\in\mathbb{R}^n\) in general when multiplying with a matrix representation of \(Dg(\p)\), since \(Dg(\p)\) is only specified for vectors in the tangent space. As with \(g(\p)\), we assume those vectors are just sent to \(0\).

Lastly, 
note that sometimes we might want to differentiate a function \(\tilde{G}\colon \mathbb{R}^n\rightarrow\mathbb{R}\), i.e., one that is defined for all of \(\mathbb{R}^n\). In that case, we have \(g=\Pi_{\TPset}\tilde{g}\) and \(Dg=\Pi_{\TPset}D\tilde{g}\Pi_{\TPset}\), where \(\Pi_{\TPset}\) is the euclidean projection onto \(\TPset\). Here, equivalently, we can define \(G(\p)=\tilde{G}(\frac{1}{\sum_{i}p_i}\p)\) to make sure \(G\) is constant as we move away from the probability simplex. In practice, it suffices also to make sure rows and columns of \(Dg(\p)\) sum to \(0\).

%We use \(\mathbf{1}\) to denote the vector \((1,\dotsc,1)^\top\in\mathbb{R}^n\) and \(\Id\) to denote the identity matrix. \co{Removed because it is already explained in the main text.}

Note that \(Dg(\p)\) can be seen as a linear automorphism on \(\TPset\). However, since it can also be seen as a matrix in \(\mathbb{R}^{n,n}\), we write \(Dg(\p)|_{\TPset}\) when we specifically consider the function on \(\TPset\), instead of the matrix in \(\mathbb{R}^{n,n}\). We know that \(Dg(\p)|_{\mathbb{R}^n}\) as defined above always has an Eigenvalue \(0\) and it is singular, since it sends \(Dg(\p)\mathbf{1}=0\). However, \(Dg(\p)|_{\TPset}\) may be positive definite and nonsingular.
\end{comment}

\subsection{Preliminaries}
We begin by proving a lemma characterizing the gradient \(\nabla_\p(S(\p,f(\p)))\), which we will use throughout.
\begin{lemma}\label{lemma:derivative-S}
    Assume \(G,g,f\) are differentiable. Then
    \[\nabla_\p(S(\p,f(\p)))=  Dg(\p)^\top (f(\p)-\p) +  Df(\p)^\top g(\p).\]
    If \(S\) is strictly proper and \(\p\in\interior{\Pset}\) an optimal report, then
    \[(\p-f(\p))^\top Dg(\p)= g(\p)^\top Df(\p).\]
\end{lemma}
\begin{proof}
    We have
    \begin{align}\nabla_\p(S(\p,f(\p)))
    &= \nabla_\p\left(G(\p) + g(\p)^\top (f(\p)-\p)\right)
    \\&=g(\p) + Dg(\p)^\top(f(\p)-\p) + Df(\p)^\top g(\p) - \Id g(\p)
    \\&=Dg(\p)^\top (f(\p)-\p) + Df(\p)^\top g(\p).\end{align}
Next, if \(\p\) is an optimal report and an interior point, it must be \(\nabla_\p(S(\p,f(\p)))^\top \vec{v}=0\) for any \(\vec{v}\in \TPset \). Since \(\nabla_\p(S(\p,f(\p)))\in\TPset\), it follows that \(\nabla_\p(S(\p,f(\p)))=0\). Hence, using the above, it follows that
\begin{align}
    &\phantom{\Rightarrow} 0=\nabla_\p(S(\p,f(\p)))=Dg(\p)^\top (f(\p)-\p) + Df(\p)^\top g(\p)
    \\
    &\Rightarrow Dg(\p)^\top (\p-f(\p)) = Df(\p)^\top g(\p).
\end{align}
\end{proof}

\subsection{Proof of Proposition~\ref{prop:non-fixed-point-optimal}}


\begin{restatable}{proposition}{propone}
Let 
\(S\)
 be any strictly proper scoring rule. For any interior fixed point \(\p^*\in \interior{\Pset}\)  there exists a function 
\(f\)
 with Lipschitz constant 
\(L_f<1\)
 and a unique fixed point at 
\(\p^*\), such that there exists 
\(\p'\neq \p^*\) with 
\(\Score(\p', f(\p'))>\Score(\p^*,f(\p^*))\). That is, the unique fixed point of 
\(f\) is not performatively optimal.
\end{restatable}

\begin{proof}

\begin{figure}
\centering
\includegraphics[width=0.5\textwidth]{graph_comp_combined.pdf}
\caption{Illustration of the setup for our proof. We plot \(f_0\) in black and \(f_\alpha\) for \(\alpha=0.15\) in red, projected onto a single dimension.}
\label{fig:illustration-fp-proof}
\end{figure}


To begin, let \(\p^*\in \interior{\Pset}\) be arbitrary and define
\(f_{\alpha}(\p):=(1-\alpha)\p + \alpha \p^*\)
 for 
\(\alpha\in [0,1]\) and \(\p\in\Pset\). Note that since \(\Pset\) is convex, \(f_\alpha(\p)\in\Pset\). Let \(G\) be as in the Gneiting and Raftery characterization of \(S\) (\Cref{theorem:gneiting-raftery}).

To provide an intuition of how our proof will work, consider a binary prediction setting with \(f\) as given in \Cref{fig:illustration-fp-proof}. For any \(\alpha>0\), \(f_\alpha\) has a unique fixed point at \(\p^*\), while \(f_0\) is the identity function, so all points are fixed points of \(f_0\). By strict convexity of \(G\), there exists a point \(\p'\) which receives a strictly higher score than \(\p^*\) if it is a fixed point, so \(\Score(\p',f_0(\p'))>\Score(\p^*,f_0(\p^*))\). \(\p'\) is not a fixed point of \(f_\alpha\) for \(\alpha>0\). However, we will show that \(\Score(\p',f_\alpha(\p'))\) must be continuous in \(\alpha\), which means that we can choose a small enough \(\alpha>0\) to make sure that \(\p'\) remains preferable over \(\p^*\), i.e., \(\Score(\p',f_\alpha(\p'))>\Score(\p^*,f_\alpha(\p^*))\), despite it not being a fixed point.

To formalize the proof, begin by noting that
\[\Vert f_{\alpha}(\p)-f_{\alpha}(\p')\Vert=\Vert (1-\alpha )(\p-\p')\Vert=(1-\alpha)\Vert \p-\p'\Vert\]
for any \(\p,\p'\in \Pset\), so \(f_\alpha\) has Lipschitz constant 
\(L_f:=1-\alpha\), which satisfies \(L_f<1\) whenever \(\alpha>0\), and as mentioned, in that case \(\p^*\) is the unique fixed point of \(f_\alpha\). %Moreover, as stated above, for \(\alpha=0\), \(f_\alpha(p)=p\), i.e., this is the identity function for which every point is a fixed point, and for \(\alpha>0\), \(f_\alpha\) has a unique fixed point

Now consider the case 
\(\alpha=0\). As mentioned, every point is a fixed point of \(f_0\). Then by strict convexity of \(G,\) since \(\p^*\) is an interior point, there exists another interior point \(\p'\in \interior{\Pset}\) and \(\epsilon>0\) such that \(G(\p')\geq G(\p^*)+ \epsilon.\)
It follows that
\begin{equation}\label{eq:1}\Score(\p',f_{0}(\p'))=\Score(\p',\p')\geq\Score(\p^*,\p^*)+\epsilon=\Score(\p^*,f_{0}(\p^*))+\epsilon.\end{equation}
So for 
\(\alpha=0\), the model prefers to predict 
\(\p'\)
 over 
\(\p^*\)
 and gets at least 
\(\epsilon\)
 additional expected score. Lastly, note that since \(\p'\) is an interior point as well, it follows that \(G(\p')<\infty\).

Now we show that the model still prefers to predict 
\(\p'\), even for some small 
\(\alpha>0\). %which is enough to prevent \(p'\) from being a fixed point.
To that end, note that
\[\Score(\p',f_{\alpha}(\p'))=\E_{y\sim f_{\alpha}(\p')}[S(\p',y)]\]
is linear in \(f_{\alpha}(\p')\), and \(f_{\alpha}(\p')\)
 is affine-linear in 
\(\alpha\)
by construction. This means that 
\(\Score(\p',f_{\alpha}(\p'))\)
 is continuous in 
\(\alpha\). So there must exist some small 
\(\alpha>0\)
 such that
\begin{align}\Score(\p',f_{\alpha}(\p'))&\geq\Score(\p',f_{0}(\p'))-\frac{\epsilon}{2}=\Score(\p',\p')-\frac{\epsilon}{2}
\\&\underset{\text{(\ref{eq:1})}}{\geq} \Score(\p^*, \p^*)+\frac{\epsilon}{2}> \Score(\p^*,\p^*)
\\&=\Score(\p^*,f_{\alpha}(\p^*)).\end{align}
Choosing 
\(\alpha\)
 in this way, we can define 
\(f:=f_{\alpha}\), and have thus provided a function that satisfies the statement that we wanted to prove.
\end{proof}



\subsection{Proof of Theorem~\ref{prop:fixed-points-optimal-reports-are-rare}}
\label{appendix-proof-of-theorem-8}

We begin with two lemmas. In the following, we always assume a strictly proper scoring rule \(S\) and accompanying functions \(G,g\) as in the Gneiting and Raftery characterization (\Cref{theorem:gneiting-raftery}). Moreover, we let \(\Pi_{n-1}\colon\mathbb{R}^n\rightarrow\mathbb{R}^{n-1}\) be the projection onto \(\mathbb{R}^{n-1}\), defined via \(\Pi_{n-1} \x = (x_i)_{1\leq i\leq n-1}\) for \(\x\in\mathbb{R}^n\). We will not go into issues of measurability in our proofs.

First, we show that if \(\p^*\in \interior{\Pset}\) is a fixed point of \(f\) and a performatively optimal prediction, then either \(g(\p^*)=0\) or \(Df(\p^*)|_{\TPset}\), i.e., the map
\[Df(\p^*)\colon \TPset\rightarrow\TPset, \vec{v}\mapsto Df(\p^*)\vec{v},\] is singular.
\begin{lemma}\label{lemma:criticalpointatfixedpoint}
    Let \(G,g\), and \(f\) be differentiable. Let $\p\in \interior{\Pset}$ be a fixed point of $f$ and a performatively optimal prediction. Then $Df(\p)|_\TPset$ is singular or $g(\p)=0$.
\end{lemma}

\begin{proof}
Note that \(f(\p)\in \Pset\) for all \(\p\in\Pset\), so \(\partial_{\vec{v}} f(\p)=Df(\p)\vec{v}\in \TPset\) for all \(\vec{v}\in \TPset\). Hence, \(Df(\p)\) defines a linear endomorphism \(Df(\p)|_{\TPset}\) of \(\TPset\) (which need not be invertible).

It follows from \Cref{lemma:derivative-S} that
$Dg(\p)^\top(\p-f(\p)) =Df(\p)^\top g(\p)$. Since $f(\p)-\p=0$, it must be $Df(\p)^\top g(\p)=0$, so either \(g(\p)=0\), or \(Df(\p)^\top\) (and thus also \(Df(\p)\)) is singular when restricted to \(\TPset\).
\end{proof}

Next, we show that the fixed points of \(f\) are almost surely not at points \(\p\) such that \(g(\p)=0\), under our assumptions on the distribution over \(f\).

\begin{lemma}\label{lemma:gneq0}
    Let $\mathcal{F}\defeq\{F(\p)\}_{\p\in \interior{\Pset}}$ be a stochastic process with values in \(\Pset\) and assume that for each \(\p\in \interior{\Pset}\), the random vector \(\Pi_{n-1} F(\p)\) has a density \(h_{\Pi_{n-1} F(\p)}\).
    Then almost surely if \(F(\p)=\p\) for some \(\p\in \interior{\Pset}\) then \(g(\p)\neq 0\). That is,
    \[\mathbb{P}(\exists \p\colon F(\p)=\p\land g(\p)=0)=0.\]
\end{lemma}

\begin{proof}
    First note that if $S$ is strictly proper, then $G$ is strictly convex and so there exists at most one \(\p\in\Pset\) with \(g(\p)=0\). If there is no such point, then we are done. Otherwise, let that point be $\p^*$. Since we assume that \(\Pi_{n-1} F(\p^*)\) has a density function \(h_{\Pi_{n-1} F(\p^*)}\), it follows that
    \[\mathbb{P}(F(\p^*)=\p^*)= \mathbb{P}(\Pi_{n-1} F(\p^*)=\Pi_{n-1} \p^*)=\int_{\{\Pi_{n-1}\p^*\}} h_{\Pi_{n-1} F(\p^*)}(\x)d\x=0.\]
\end{proof}

Lastly, we require a result about random fields. %Essentially, this says that if we take a random vector field \(F\colon W\rightsquigarrow \mathbb{R}^{d}\) from an open set \(W\subseteq\mathbb{R}^{d'}\) through a \(d\)-dimensional space, and if the density of the value \(F(p)\) is bounded across all points \(p\in W\), then for any particular point in \(\mathbb{R}^{d}\), the probability that the path crosses this point is \(0\).
The following is adapted from Proposition~6.11 in \citet{azais2009level}.
\begin{proposition}[\cite{azais2009level}, Proposition~6.11]\label{prop:random-field}
    Let \(\mathcal{Y}=\{Y(\x)\}_{\x\in W}\) be a random field with values in \(\mathbb{R}^{d}\) and \(W\) an open subset of \(\mathbb{R}^{d'}\). Let \(\vec{u}\in\mathbb{R}^{d}\) and \(I\subseteq W\). Assume that
    \begin{itemize}
\item the sample paths \(\x\rightsquigarrow Y(\x)\) are continuously differentiable
\item for each \(\x\in W\), \(Y(\x)\) has a density \(h_{Y(\x)}\) and there exists a constant \(C\) such that \(h_{Y(\x)}(\y)\leq C\) for all \(\x\in I\) and \(\y\in\mathbb{R}^{d}\).
\item The Hausdorff dimension of \(I\) is strictly smaller than \(d\).
\end{itemize}
Then, almost surely, there is no point \(\x\in I\) such that \(Y(\x)=\vec{u}\).
\end{proposition}

Now we can turn to the proof of the main result.

\setcounter{theorem}{1}

\begin{restatable}[]{theorem}{fprare}
Let \(\Score\) be a twice differentiable strictly proper scoring rule. Let \(\mathcal{F}:=\{F(\p)\}_{\p\in\interior{\Pset}}\) be a stochastic field with values in \(\Pset\) and let \(Y(\p,\Tv):=(\Pi_{n-1} F(\p), \Pi_{n-1}\partial_{\Tv}F(\p))\) for \(\p\in \interior{\Pset}\) and \(\Tv\in \TPset\cap S^{n-1}\). Assume that
    \begin{itemize}[nolistsep]
    \item the sample paths \(\p\rightsquigarrow F(\p)\) are twice continuously differentiable
    \item for each \(\p\in \interior{\Pset}\) and \(\Tv\in \TPset\cap S^{n-1}\), the random vector \(Y(\p,\Tv)\) has a joint density \(h_{Y(\p,\Tv)}\) and there exists a constant \(C\) such that $h_{Y(\p,\Tv)}\leq C$ for all \(\p\in\Pset,\Tv\in S^{n-1}\cap \TPset\). %\ec{Currently this is a lot to parse in one sentence.}
    \end{itemize}
    Then, almost surely, there is no point \(\p\in \interior{\Pset}\) such that \(\p\in\argmax_{\p'}\Score(\p',F(\p'))\) and \(F(\p)=\p\).
\end{restatable}

\begin{proof}

We want to show that almost surely there does not exist \(\p\in \interior{\Pset}\) such that \(F(\p)=\p\) and \(\p\) is performatively optimal. I.e., we want to show that
\[\mathbb{P}(\exists \p\colon F(\p)=\p\land \p\in \argmax \Score(\p,F(\p)))=0.\]

First, let \(\p^*\in \interior{\Pset}\) be a fixed point of \(F\) and a performatively optimal report. By \Cref{lemma:criticalpointatfixedpoint}, either \(g(\p^*)=0\) or \(DF(\p^*)|_{\TPset}\) is singular. Moreover, by assumption, \((\Pi_{n-1} F(\p),\Pi_{n-1} DF(\p)\vec{v})\) has a density function for any \(\vec{v}\in \TPset\cap S^{n-1}\), and thus also \(\Pi_{n-1}F(\p)\) has one. Hence, by \Cref{lemma:gneq0}, it follows that if \(F(\p)=\p\) for some \(\p\in \interior{\Pset}\), then almost surely \(g(\p)\neq 0\). 

Second, we need to show that also almost surely \(DF(\p)|_{\TPset}\) is invertible at any fixed point of \(F\). To that end, define the random field \(\mathcal{Y}:=\{Y(\p,\vec{v})\}_{(\p,\vec{v}) \in W}\) where \(W:=\interior{\Pset}\times \TPset\) and
\[Y(\p,\vec{v}):=(\Pi_{n-1} F(\p)-\Pi_{n-1} \p,\Pi_{n-1}DF(\p)\vec{v}),\]
with values in \(\mathbb{R}^{n-1}\times\mathbb{R}^{n-1}\).%First, note that
%for any \(x,y\in\mathbb{R}^n\), where . Hence, \(Y\) has bounded density.
%has derivative \(F'(p)-\Id\)

Note that since \(F\) is in \(\mathcal{C}^2\), \(DF\) is continuously differentiable, and thus also \(Y\). Moreover
\[h_{Y(\p,\vec{v})}(\x,\y)=h_{\Pi_{n-1} F(\p),\Pi_{n-1} DF(\p)\vec{v}}(\x+\Pi_{n-1}\p,\y)\leq C\] for \(\x,\y\in\mathbb{R}^{n-1}\) by assumption. Finally, define \(\vec{u}:=(0,0)\in\mathbb{R}^{n-1}\times\mathbb{R}^{n-1}\) and \(I:=\interior{\Pset}\times (\TPset \cap S^{n-1})\), where \(\TPset \cap S^{n-1}=\{\vec{v}\in \TPset\mid \Vert \vec{v}\Vert=1\}\). Note that the Hausdorff dimension of \(I\) is \(n-1+n-2=2n-3\), while \(Y\)'s values are \((2n-2)\)-dimensional.

This shows all conditions of \Cref{prop:random-field}, so we can apply it to \(Y\) to conclude that almost surely there exists no \((\p,\vec{v})\in I\) such that \(Y(\p,\vec{v})=(0,0)\). This means that almost surely there exists no point \(\p\in \interior{\Pset}\) such that \(F(\p)=\p\) and such that \(DF(\p)|_{\TPset}\) is singular, since if such a point existed, then also there would be a vector \(\vec{v}\in \TPset \cap S^{n-1}\) such that \(DF(\p)\vec{v}=0\) and thus \(\Pi_{n-1}DF(\p)\vec{v}=0\), implying that
\[Y(\p,\vec{v})=(\Pi_{n-1}F(\p)-\Pi_{n-1}\p,\Pi_{n-1}DF(\p)\vec{v})=0.\]

Summarizing our argument, it follows that
\[\mathbb{P}(\exists \p\colon {F(\p)=\p}\land {\p\in \argmax \Score(\p,F(\p))})
\]
\[\leq \mathbb{P}(\exists \p \colon {F(\p)=\p}\land {g(\p)=0})+\mathbb{P}(\exists \p \colon {F(\p)=\p}\land {DF(\p)|_{\TPset}\text{ is singular}})=0.\]
This concludes the proof.
\end{proof}

We conclude by providing an example of a stochastic process that satisfies our conditions, for the binary prediction case.
\begin{example}
\label{ex:gaussian-process}
Consider a Gaussian process \(\{X(p)\}_{p\in (0,1)}\) with values in \(\mathbb{R}\), with infinitely differentiable kernel and mean functions. We can make it into a process \(F(\p)\) by defining \(F(\p)= (X(p_1), 1-X(p_1))\) for \(\p\in \Delta([2])\). Note that the paths of \(F\) are infinitely differentiable and the values of \(\Pi_1 F(\p)=X(p_1)\) and its directional derivatives \(\Pi_1 DF(\p)\vec{v}=X'(p_1)v_1\) are jointly Gaussian and thus have a bounded density %\href{http://gaussianprocess.org/gpml/chapters/RW.pdf}{Rasmussen and Williams, 2006}
\cite[see][Ch.~9.4]{Rasmussen2006}. To deal with the restriction that \(X(p)\in [0,1]\) for \(p\in (0,1)\), we could condition on the event \(E:=\{\forall p\in(0,1)\colon  X(p)\in [0,1]\}\), for instance. Then paths are still twice differentiable, and we claim that \(h_{X(p)|E}\), defined as the density of \(X(p)\) conditional on \(E\), is still bounded. To see that, note that if \(\mathbb{P}(E)>0\), then we are done, since then 
\[h_{X(p)|E}(x)=\frac{\mathbb{P}(E\mid X(p)=x)}{\mathbb{P}(E)}h_{X(p)}(x)\leq\frac{h_{X(p)}(x)}{\mathbb{P}(E)}.\] We leave it as an exercise to the reader to prove that \(\mathbb{P}(E)>0\).
\end{example}


%\section{Proofs for Section~\ref{bounds-on-deviation}}
%\label{appendix:proofs-for-sec-5}



%For the proof we need the following linear algebra lemma.

%\begin{lemma}\label{lemma:norm-lowest-eigenvalue}
%    Let $A$ be \jt{symmetric} positive semidefinite. Note that then A's eigenvalues are all nonnegative. and let $\lambda$ be $A$'s smallest eigenvalue. Then for all $v$, we have that $v^TAv\geq \lambda\Vertv\Vert_2^2$.
%\end{lemma}

%\begin{proof}
%    All eigenvalues nonnegative is Theorem 6, $1\Rightarrow 2$.    %
%
%    \co{I think this result already exists, but I can't find a good reference for it. E.g., \url{https://math.stackexchange.com/a/2619139} gives it (requiring that A is symmetric), but says it follows from \url{https://en.wikipedia.org/wiki/Min-max_theorem}, but this seems to not be all that obvious. Anyway, below is Em's proof:}

 %   Because TODO we get that $A=B\Lambda B^T$, where $\Lambda$ is a diagonal matrix with $\Lambda_{ii}=\lambda_i>0$, where $\lambda_i$ is the $i$-th eigenvalue of $A$, and $B^T=B^{-1}$.
 %   Thus,
 %   \begin{eqnarray*}
 %       v^\top Av &=& v^\top B\Lambda B^\top  v\\
%        &=& (B^\top v)^\top \Lambda (B^\top v)\\
 %       &=& \sum_i (B^\top v)_i^2 \lambda_i\\
 %       &\geq & \lambda \sum_i (B^\top v)_i^2\\
 %       &=& \lambda \Vert B^\top v\Vert _2^2\\
 %       &=& \lambda v^\top BB^\top v\\
 %       &=& \lambda v^\top v\\
 %       &=& \lambda \Vert v\Vert _2^2.
 %   \end{eqnarray*}
%\end{proof}

\subsection{Proof of Theorem~\ref{theorem:Caspar-approx-fix-point}}

\begin{restatable}{theorem}{inaccuracybound}
    Let \(S\) be a strictly proper scoring rule, and let \(G,g\) as in the Gneiting and Raftery characterization (\Cref{theorem:gneiting-raftery}). %Let \(\Vert\cdot\Vert\) denote the euclidean norm. 
    Let \(\p\in\Pset\) and assume \(f,G,g\) are differentiable at \(\p\). Assume \(Dg(\p)|_{\TPset}\succeq\gamma_{\p}\) for some \(\gamma_\p>0\).
    Then whenever $\p$ is a performatively optimal report,
    \begin{equation*}
    \Vert \p - f(\p) \Vert \leq\frac{ \Vert  Df(\p)\Vert_{\mathrm{op}}\Vert g(\p)\Vert}{\gamma_{\p}}.\end{equation*}
    In particular, if $f$ has Lipschitz constant $L_f$, \(G\) has Lipschitz constant \(L_G\), and \(G\) is \(\gamma\)-strongly convex, then we have $\Vert \p - f(\p) \Vert \leq \frac{L_f L_G}{\gamma}$.
\end{restatable}


Recall that we assume that $g(\p)$ is normalized to be orthogonal to $\mathbf{1}$. Note that this is also the choice that minimizes \(\Vert g(\p)\Vert\) and makes sure that \(\Vert g(\p)\Vert = \Vert g(\p)^\top|_{\TPset} \Vert_{\mathrm{op}}\), where $g(\p)^\top|_{\TPset}$ denotes the function $\TPset\rightarrow\mathbb R\colon \mathbf v \mapsto g(\p)^\top \mathbf v$. %\co{My interpretation of the right-hand side is that you interpret $g(\p)^\top$ as a function and restrict it to $\mathcal T$. Correct?}, \jt{correct}
This is due to the Cauchy--Schwarz inequality %\co{That's Cauchy-Schwarz, I assume?} \jt{correct}
and the Pythagorean theorem, since \(\Vert g(\p)+\alpha \mathbf{1}\Vert^2=\Vert g(\p)\Vert^2 + |\alpha|^2 \Vert \mathbf{1}\Vert^2\) for any \(\alpha\in\mathbb{R}\) when \(g(\p)\in\TPset\). Moreover, by Cauchy--Schwarz, we have \(\Vert g(\p)^\top \vec{v}\Vert \leq \Vert g(\p)\Vert\Vert \vec{v}\Vert\) for any \(\vec{v}\in\TPset\) and if \(g(\p)\in\TPset\) then \(\Vert g(\p)^\top|_{\TPset}\Vert_{\mathrm{op}}\geq\Vert g(\p)^\top g(\p)\Vert/\Vert g(\p)\Vert = \Vert g(\p)\Vert\).

\begin{proof}
Assume \(\p\) is a performatively optimal report and that \(Dg(\p)|_{\TPset}\succeq\gamma_\p\).
Note that this is equivalent to all eigenvalues of the function \(Dg(\p)|_{\TPset}\) being at least \(\gamma_\p\), assuming \(Dg(\p)|_{\TPset}\) is symmetric. Moreover, \(Dg(\p)\) must be symmetric if \(G\) is twice differentiable (note that continuous differentiability is not needed since we assume differentiability in general, not just existence of the coordinate partial derivatives). This can be used to calculate our bound in practice.

Consider \(\nabla_\p(S(\p,f(\p)))^\top (f(\p)-\p),\) the directional derivative of \(\varphi\colon \p\mapsto S(\p,f(\p))\) in the direction $(f(\p)-\p)$. Note that this derivative must be at most zero: The line from $\p$ to $f(\p)$ lies entirely within the probability simplex, and so if the derivative were positive, $S(\p,f(\p))$ could be increased by moving in the direction of $f(\p)$ from $\p$. By \Cref{lemma:derivative-S}, we know that
\[\nabla_\p(\Score(\p,f(\p)))=Dg(\p)^\top (f(\p)-\p) +  Df(\p)^\top g(\p).\]
It follows that
\begin{align}&0\geq \nabla_\p(S(\p,f(\p)))^\top (f(\p)-\p)=
(f(\p)-\p)^\top Dg(\p) (f(\p)-\p)+g(\p)^\top Df(\p) (f(\p)-\p)\\
\Rightarrow&
-g(\p)^\top Df(\p) (f(\p)-\p)
\geq (f(\p)-\p)^\top  (Dg(\p)) (f(\p)-\p).
\end{align}
Using that \(Dg(\p)|_{\TPset}\succeq \gamma_\p\) and thus \((f(\p)-\p)^\top  (Dg(\p)) (f(\p)-\p) \geq \gamma_\p\Vert f(\p)-\p\Vert^2\)
%\co{Also, I don't get how you get this conclusion. In the theorem it says that \(Dg(p)|_{\TPset}\succ \gamma\) means that \(v^\top (Dg(\p)-\Id)v\geq \gamma\) for all \(v\in\TPset\). But then that would mean \((f(p)-p)^\top  (Dg(p)) (f(p)-p) -\Vert f(\p)-\p\Vert^2 \geq \gamma\), which seems to be something else?} \jt{sorry, the theorem defined it wrong. it should be \(v^\top (Dg(\p)-\gamma\Id)v\geq 0\)}
, it follows that
\begin{eqnarray*}
    && \gamma_\p\Vert f(\p)-\p\Vert^2\\
    &\leq & (f(\p)-\p)^\top  Dg(\p) (f(\p)-\p) \\
    &\leq& - g (\p)^\top Df(\p)(f(\p)-\p)\\
    &\leq& \vert g(\p)^\top Df(\p)(f(\p)-\p)\vert\\
    &\underset{\text{Cauchy-Schwarz}}{\leq}& \Vert g(\p)\Vert \Vert Df(\p)(f(\p)-\p)\Vert\\
    &\leq& \Vert g(\p)\Vert \Vert Df(\p)\Vert_{\mathrm{op}}\Vert f(\p)-\p\Vert
\end{eqnarray*}
%\co{I guess one downside of using underset/overset is that if one doesn't know which one is used, one doesn't know what eq/ineq it is.}
Dividing by $\gamma_\p\Vert f(\p)-\p\Vert $, we get that $\Vert f(\p)-\p\Vert\leq \Vert Df(\p)\Vert_{\mathrm{op}} \Vert g(\p)\Vert/\gamma_\p$.

For the ``in particular'' part, note that if \(f\) is Lipschitz continuous with constant \(L_f\), then \(\Vert Df(\p)\Vert_{\op}\leq L_f\) for all \(\p\). Moreover, if \(G\) is Lipschitz continuous with constant \(L_G\), we have
\[L_G\geq \Vert DG(\p)\Vert_{\mathrm{op}}=\Vert g(\p)^\top \Vert_{\op}=\Vert g(\p)\Vert\]
for all \(\p\in\Pset\). Here, in the last step, we have used that for the Euclidean norm
\[\Vert g(\p)^\top\Vert_{\op}=\max_{\vec{v}\in \TPset}\frac{g(\p)^\top \vec{v}}{\Vert\vec{v}\Vert}=\frac{g(\p)^\top g(\p)}{\Vert g(\p)\Vert}=\Vert g(\p)\Vert. \]
Lastly, \(G\) being \(\gamma\)-strongly convex implies that \(D^2G(\p)\succeq \gamma\) for all \(\p\in\Pset\), and thus also \(Dg(\p)=D^2G(\p)^\top\succeq \gamma\).

Putting everything together, we get
\[    \Vert f(\p)-\p\Vert
\leq
\frac{\Vert g(\p)\Vert \Vert Df(\p)\Vert_{\mathrm{op}}}{\gamma_\p}
\leq 
\frac{L_GL_f}{\gamma}\]
for all performatively optimal reports \(\p\).
%We get the second inequality as follows:\co{I assume the following is outdated?}
%    \begin{eqnarray*}
%\Vert p-f(p)\Vert  &\underset{\text{\Cref{theorem:Caspar-approx-fix-point}}}{\leq}&  \Vert  g'(p)^{-1}\Vert _{\mathrm{op}}\Vert f'(p) g(p)\Vert _2 \\
%&\leq & \Vert g'(p)^{-1}\Vert_{\mathrm{op}} \Vertf'(p)\Vert_{\mathrm{op}} \Vertg(p)\Vert_2\\
%&\leq & L \Vert g'(p)^{-1}\Vert_{\mathrm{op}} \Vertg(p)\Vert_2.
%\end{eqnarray*}
\end{proof}

\subsection{Proof of Theorem~\ref{thm:distance-to-fp}}

\begin{restatable}{theorem}{bounddisttofp}
    Same assumptions as \Cref{theorem:Caspar-approx-fix-point}. Assume further that $f$ has Lipschitz constant $L_f<1$. Let $\p^*$ be the unique fixed point of $f$. Then for the performatively optimal report $\p$,
    \begin{equation*}
        \Vert \p-\p^*\Vert \leq \frac{\Vert g(\p)\Vert\Vert Df(\p) \Vert_{\mathrm{op}}}{(1-L_f)\gamma_\p} \leq \frac{L_fL_G}{(1-L_f)\gamma_\p}.
    \end{equation*}
\end{restatable}


\begin{proof}
For \textit{any} \(\p\in \Delta(\mathcal{N})\), we have
\begin{eqnarray*}
\Vert \p-\p^*\Vert
&\underset{\text{triangle ineq.}}{\leq} & \Vert \p -f(\p)\Vert + \Vert f(\p)-\p^*\Vert \\
&\underset{\p^*\text{ fixed point}}{=}& \Vert \p -f(\p)\Vert + \Vert f(\p)-f(\p^*)\Vert\\
&\leq& \Vert\p-f(\p)\Vert + L_f\Vert \p-\p^*\Vert 
\end{eqnarray*}
Solving for $\Vert \p-\p^*\Vert $ yields
\begin{equation*}
\Vert \p-\p^*\Vert\leq \frac{\Vert \p-f(\p)\Vert}{1-L_f}.
\end{equation*}
%\jt{have to update this too}
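Hence, if \(\p\in \Pset\) is an optimal prediction, it follows by \Cref{theorem:Caspar-approx-fix-point} that
\begin{eqnarray*}
\Vert \p-\p^*\Vert&\leq& \frac{\Vert \p-f(\p)\Vert}{1-L_f}\\
&\leq& \frac{\Vert Df(\p)\Vert_{\mathrm{op}} \Vert g(\p)\Vert}{\gamma_{\p}(1-L_f)}\\
&\leq& \frac{L_f \Vert g(\p)\Vert}{\gamma_{\p}(1-L_f)}\\
&\leq& \frac{L_f L_G}{\gamma_{\p}(1-L_f)},
\end{eqnarray*}
where the last step uses that \(\Vert g(\p)\Vert\leq L_G\) whenever \(G\) has Lipschitz constant \(L_G\) (as shown in the proof of \Cref{theorem:Caspar-approx-fix-point}). This concludes the proof.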
\end{proof}

\subsection{There is no non-trivial bound on the distance to the fixed point as \(L_f\rightarrow 1\)}
%Why Theorem~\ref{thm:distance-to-fp} needs $L_f<1$}
\label{appendix:Lf1-no-non-trivial-bound}

We here show why Theorem~\ref{thm:distance-to-fp} requires that we have some bound $L_f<1$ on the function $f$. Specifically, we show that if $f$ can have Lipschitz constants arbitrarily close to $1$, then even in the two-outcome case, only the trivial bound on the difference to the fixed point holds. (The trivial bound is $\lVert \p^*-\p \rVert \leq \sqrt{2}$, because any two points in $\Delta(\{1,2\})$ are at most $\lVert (0,1) - (1,0) \rVert=\sqrt{2}$ apart.) We prove that this holds even for the binary case.

\begin{proposition}
\label{prop:Lf1-no-non-trivial-bound}
    Consider the case of two outcomes, i.e., let $\mathcal{N}=\{1,2 \}$. Let $S$ be any strictly proper scoring rule. Then there exist functions $f$ with Lipschitz constants smaller than $1$ such that $\lVert \p^*-\p \rVert$ is arbitrarily close to $\sqrt{2}$, where $\p^*$ is the fixed point of $f$ and $\p$ is the optimal prediction for $S,f$.
\end{proposition}

\begin{figure}
    \centering
    \includegraphics[width = 0.5\linewidth]{illustration-proof-disttoFP-high-if-Lf-is-1.pdf}
    \caption{The blue line is the function used in the proof of \Cref{prop:Lf1-no-non-trivial-bound}. The orange line is the identity function.}
    \label{fig:funct-proof-of-prop:Lf1-no-non-trivial-bound}
\end{figure}

We here give some intuition for why the result holds. Recall that, roughly speaking, scoring rules generally induce a preference for extreme honest predictions over non-extreme honest predictions (see \Cref{preferences-between-fps}). In particular, in the binary case any scoring rule must either incentivize near-0 honest predictions or near-1 honest predictions (or both) over honest relatively close-to-uniform predictions. Consider the case where $S$ incentivizes predictions close to $0$ over more uniform predictions and take the function $f$ in \Cref{fig:funct-proof-of-prop:Lf1-no-non-trivial-bound}. The unique fixed point is at $0.8$. But if a prediction close to $0$ is made, the prediction is \textit{approximately} honest while more extreme than $0.8$. It turns out that a slight dishonesty (discrepancy between the report $\p$ and the true distribution $f(\p)$) can be outweighed by the fact that the prediction is more extreme. Predicting near $0$ may therefore be a better report than a prediction of $0.8$.

Note that in this example, the distance of the optimal report to the fixed point ($\lVert \p - \p^* \rVert$) and the inaccuracy of the optimal report ($\lVert \p - f(\p) \rVert$) come apart: The optimal report might be far from the fixed point but still very accurate.

\begin{proof}
For notational convenience, we consider functions $f:[0,1]\rightarrow [0,1]$ on a single probability and similarly scoring rules $S:[0,1]\times [0,1]\rightarrow \mathbb{R}$.

Let $\zeta$ be a small positive real number and let $\delta$ be s.t.\ $0<\delta<\zeta/2$. By the strict convexity of the function $x\mapsto S(x,x)$, one of the following must be the case:
\begin{enumerate}[nolistsep]
    \item $S(\delta,\delta)>\max_{x\in [2\delta,1-2\delta]} S(x,x)$; or
    \item $S(1-\delta,1-\delta)>S(x,x)$ for all $x$ between $2\delta$ and $1-2\delta$.
\end{enumerate}

Consider the first case: Now for small positive $\epsilon$ consider the function $f_\epsilon$ that starts at some small positive value and increases linearly at rate $1-\epsilon$ from $0$ to $1-\zeta$ and is fixed at value $1-\zeta$ from $1-\zeta$ to $1$. \Cref{fig:funct-proof-of-prop:Lf1-no-non-trivial-bound} illustrates this function for $\zeta = 0.2, \epsilon = 0.03, \delta = 0.1$. Formally, $f_\epsilon(p)=1-\zeta$ for $p\geq 1-\zeta$ and otherwise $f_\epsilon(p)=1-\zeta - (1-\zeta - p)(1-\epsilon)$. Note that $f_\epsilon$'s fixed point is $1-\zeta$ and $f_\epsilon$'s Lipschitz constant is $1-\epsilon$.
We will show that for small enough $\epsilon$ the optimal report for $f_\epsilon$ and $S$ is then close to $0$ and thus almost $1$ away from the fixed point, which means the distance is close to $\sqrt{2}$ in the simplex. %First note that the optimal report for $f_\epsilon$ is clearly in $[0,1-\zeta]$ regardless of which strictly proper scoring rule we use. -- don't need this anymore
Note that by continuity of $f_\epsilon$ and linearity (and thus continuity) of $S(p,q)$ in $q$, we have that
$S(\delta,f_\epsilon(\delta)) \rightarrow S(\delta,\delta)$ as $\epsilon\rightarrow 0$. Thus, for small enough $\epsilon$, we have for all $x$ between $2\delta$ and $1-2\delta$ that $S(\delta,f_\epsilon(\delta))>S(x,x)$. It follows that the optimal report $p$ cannot have $f_\epsilon(p)\in [2\delta,1-2\delta]\supset [\zeta,1-\zeta]$, because then we would have that $S(\delta,f_\epsilon(\delta))>S(f_\epsilon(p),f_\epsilon(p))\geq S(p,f_\epsilon(p))$, i.e., $\delta$ would be a better report. By construction of $f_\epsilon$, this means that the optimal report cannot be in $[2\delta,1]$. %With the earlier observation about the location of the optimal report it follows that the optimal report for $f_\epsilon$ is in $[0,2\delta]$.
Thus, the distance of the optimal report to the fixed point is at least $1-\zeta-2\delta$. By choosing $\delta$ and $\zeta$ to be small, we can make this arbitrarily close to $1$.

The second case can be considered analogously, by considering a function $f$ that is constant at value $\zeta$ from $0$ to $\zeta$ and then increases linearly at rate $1-\epsilon$.
%
%Notice also that the optimal report for $f_a$ is in $[\zeta,1]$ and the optimal report for $f_b$ is in $[0,1-\zeta]$.
%
%Now, by continuity of $f_a,f_b$ and linearity (and thus continuity) of $S(p,q)$ in $q$, we have that
%$S(\delta,f_a(\delta)) \rightarrow S(\delta,\delta)$ as $\epsilon\rightarrow 0$. Similarly $S(1-\delta,f_b(1-\delta)) \rightarrow S(1-\delta,1-\delta)$ as $\epsilon\rightarrow 0$. Therefore, we have that 
%
%It follows that either for the first function $S(1-\delta,1-\delta)>S(x,x)$ for any $x$ in $[0,1-2\delta]$ or analogously for the second function.
%
%Same argument as above via epsilon -> 0.
%
%In either case we have an error of ~1.
\end{proof}

\subsection{Proof of Theorem~\ref{theorem:two-outcomes-arbitrarily-good-bounds}}

\begin{restatable}{theorem}{exponentialthm}
Consider the case of two outcomes, i.e., let $\mathcal{N}=\{1,2 \}$. Let $L_f\in \mathbb{R}$ and $\epsilon>0$. Then there exists a scoring rule $S$ s.t.\ under any $f$ with Lipschitz constant $L_f$, any optimal report $\p$ satisfies $\Vert \p-f(\p)\Vert \leq \epsilon$. If $L_f<1$, then there also exists a scoring rule that additionally ensures that under any $f$ with Lipschitz constant $L_f$, any optimal report satisfies $\Vert \p-\p^*\Vert \leq \epsilon$, where $\p^*$ is the (unique) fixed point of $f$.
\end{restatable}


\begin{proof}
Consider the exponential scoring rule defined by $G(\p)=\frac{2}{K}e^{Kp_1}$ and $g(\p)=(e^{Kp_1},-e^{Kp_1})^\top$ s.t.\ $Dg(\p)=\begin{pmatrix} Ke^{Kp_1} & -Ke^{Kp_1}\\ 0& 0\end{pmatrix}$ and $\Vert g(\p)\Vert =\sqrt{2}e^{Kp_1}$. The only eigenvalue of $Dg(\p)\vert_{\TPset}$ is $Ke^{Kp_1}$. Thus, $Dg(\p)\vert_{\TPset}\succeq Ke^{Kp_1}$. Therefore, by \Cref{theorem:Caspar-approx-fix-point}, the optimal report $\p$ satisfies $\Vert \p -f(\p)\Vert \leq \sqrt{2}L_f/K$. Thus, by choosing $K\geq\sqrt{2}L_f/\epsilon$ (with $K>0$), we obtain the desired bound. If $L_f<1$, then by \Cref{thm:distance-to-fp} we further have that $\Vert \p -\p^*\Vert  \leq \frac{L_f}{(1-L_f)}\frac{\sqrt{2}}{K}$, so that we can achieve the desired bound by choosing $K\geq\sqrt{2}L_f/((1-L_f)\epsilon)$ (with $K>0$).
\end{proof}

\subsection{Proof of Theorem~\ref{thm:need-exponential-new}}

%\co{Explain that in this section we use the notation $S(p,q)$, where $p,q\in [0,1]$. We can then write these as $S(p,q)=g((p,1-p)^\top)(1,-1)^\top (q-p) +G(p,1-p)$. We will define $g(p)=g((p,1-p)^\top)(1,-1)^\top$.}
%\ec{I think you don't need the $\top$ inside $g$ (it doesn't make it clearer, and is just ugly), but we do need one outside of $g$. Though probably it would also be cleaner to just write the (1,-1) as a row vector on the LHS.}

Throughout this section we use the following simplifying notation. Let $S$ be a proper scoring rule for the two-outcome case with $G,g$ as per \Cref{theorem:gneiting-raftery}. Then for a single probability $p\in [0,1]$ we define $G(p)=G(p,1-p)$ and $g(p)=(1,-1)g(p,1-p)$, and for $p,q\in[0,1]$ we define $S(p,q)=S((p,1-p),(q,1-q))$. Then we have that $S(p,q)=g(p)(q-p)+G(p)$, where $g$ as a function on $[0,1]$ is a subgradient of $G$ as a function on $[0,1]$. Conversely, note that any function of this form induces a proper scoring rule on $\Delta(\{1,2\})$.

First, we prove a result that reduces the claim about $S$ to a claim about $g$.

\begin{lemma}\label{lemma:bounding-S-ratios-bounding-g-ratios}
    Let $[p_1,p_2]$ be any interval and $S$ be a proper scoring rule defined via $g$ as usual. Then
    \begin{equation*}
        \frac{\sup_{p\in [p_1,p_2]} S(p,p)-S(p+x,p)}{\inf_{p\in [p_1,p_2]} S(p,p)-S(p+x,p)} \geq \frac{1}{4} \frac{\sup_{p\in [p_1,p_2]} g(p+x)-g(p)}{\inf_{p\in [p_1,p_2]} g(p+x)-g(p)}.
    \end{equation*}
\end{lemma}

\begin{proof}
For our proof, we will use the following bounds:
    \begin{eqnarray*}
        S(p,p)-S(p+x,p) &=& x g(p+x) - \int_p^{p+x} g(t)dt\\
        &\geq & xg(p+x)- xg(p+x)/2 - xg(p+x/2)/2\\
        &=& x(g(p+x)-g(p+x/2))/2\\
        S(p,p) - S(p+x,p) &=& xg(p+x)-\int_p^{p+x} g(t)dt\\
        &\leq & xg(p+x) - xg(p)\\
        &=& x(g(p+x) - g(p)).
    \end{eqnarray*}
Using these bounds, we can prove the lemma as follows:
    \begin{eqnarray*}
        \frac{\sup_p S(p,p)-S(p+x,p)}{\inf_p S(p,p)-S(p+x,p)}
        &\geq& \frac{\sup_p x(g(p+x)-g(p+x/2))/2}{\inf_p x(g(p+x) - g(p))}\\
        &=& \frac{1}{2}\frac{\sup_p g(p+x)-g(p+x/2)}{\inf_p g(p+x) - g(p)}\\
        &\geq & \frac{1}{4}\frac{\sup_p g(p+x)-g(p)}{\inf_p g(p+x) - g(p)}.
    \end{eqnarray*}
\end{proof}


\begin{lemma}\label{compounding_hops}
    Let $y\geq 0$, $h > 0$.
    Let $g\geq 0$ be strictly increasing on $[a,b]$ s.t.\ $g(x+h)-g(x)\geq y g(x)$ for all $x\in [a,b]$. Then
    \begin{equation*}
        \frac{\sup_{x\in [a,b]} g(x+h)-g(x)}{\inf_{x\in [a,b]} g(x+h)-g(x)} \geq y(1+y)^{\lfloor (b-a)/h \rfloor-1}.
    \end{equation*}
\end{lemma}

\begin{proof}
    Let $N = \lfloor \frac{b-a}{h} \rfloor$.
    
    Note that $g(x+h) \geq (1+y)g(x)$ for $x \in [a,b]$. Thus, iterating, we get that $g(a+Nh)\geq (1+y)^{N-1}g(a+h)$.

    As a consequence, since $a+Nh \leq b$, we have:
    \begin{align*}
        g(a+(N+1)h)-g(a+Nh) &\geq y g(a+Nh)\\
        &\geq y(1+y)^{N-1}g(a+h)\\
        &\geq y(1+y)^{N-1}(g(a+h)-g(a))
    \end{align*}
    
    And so:
    \begin{align*}
         \frac{\sup_{x\in [a,b]} g(x+h)-g(x)}{\inf_{x\in [a,b]} g(x+h)-g(x)} \geq \frac{g(a+Nh + h)-g(a+Nh)}{g(a+h)-g(a)}\geq y(1+y)^{N-1}
    \end{align*}
\end{proof}

\begin{lemma}\label{exp_hops_somewhere}
Let $S$ defined via $g$ as usual be a proper scoring rule. %Assume $g>0$.
Let $\epsilon>0,L>0$. Assume that $S$ has the following property: For every $f$ with Lipschitz constant $L$, we have $|p^*-f(p^*)|\leq \epsilon$ for the optimal report(s) $p^*$.
Let $\delta = \frac{\epsilon}{L+1}$.
Then for every $2\delta$ interval contained in $[0,1-3\epsilon+2\delta]$%\co{I think it would be better to define the preceding in terms of $\delta$.}\ec{Should I just round $1- \epsilon(3L+1)/(L+1)$ down to $1-3\epsilon$? Another way of writing it is also $1-3\epsilon+2\delta$, but not sure that's any better...}\co{I like $1-3\epsilon+2\delta$. I guess I like simplifying things. One question is whether dropping the $+2\delta$ here is any better than dropping the $+2\delta$ later when we give the final result. Em says: Yeah, actually the final result becomes more complicated if we drop the the $+2\delta$ here, because the $+2\delta$ here cancel out with $-2\delta$ later.}
, there is a $p$ in that interval such that ($p+2\delta\leq 1$ and)
\begin{equation*}
    g(p+2\delta)-g(p) \geq \frac{2L}{L+3}g(p).
\end{equation*}
%\ec{(Could trade tightness of bound for slightly simpler numbers here)}\co{Probably better to optimize later.}
\end{lemma}

(Note that this result doesn't assume $g>0$. However, note that the consequent of the lemma is vacuous if $g(p)\leq 0$ (since $g$ is monotone increasing).)

\begin{proof}
We shall show, equivalently, that for every interval of width $2\delta$ in $[2\delta, 1-3\epsilon+4\delta]$%\ec{should we add a note here that the top of this interval can be above 1 but that that's okay?}
, there is some $p$ (in $[0,1]$) contained in the interval such that:
\begin{equation}\label{g_deriv_bound}
g(p)-g(p-2\delta)\geq\frac{2L}{L+3}g(p-2\delta)
\end{equation}


Given an interval of width $2\delta$ in $[2\delta,1-3\epsilon+4\delta]$
%=[2\delta, 1-\epsilon\frac{3L-1}{L+1}]$
, write the interval as $[p_0-\delta,p_0+\delta]$, where $p_0 \in [3\delta, 1-3L\delta] = [3\delta, 1 - 3\epsilon + 3\delta] \subseteq [0,1]$.

Then, we construct $f$ as follows.

Let
\begin{align}
k_2 &\defeq p_0\left(1+\frac{1}{L}\right)\\
k_1 &\defeq k_2 - \frac{1}{L} = p_0 -\frac{1}{L}(1-p_0).\label{eq:def-k1}
\end{align}
Then consider 
\begin{equation*}
f(p)\defeq
\left\{\begin{array}{cl}
        1 & \text{if } p\leq k_1\\
        0 & \text{if } p\geq k_2\\
        \frac{k_2-p}{k_2-k_1} = L(k_2-p) & \text{if } k_1\leq p\leq k_2
        \end{array}\right.
\end{equation*}
for $p \in [0,1]$.

For $k_1=0.3,k_2=0.4,L=10$, this function looks as follows.
\begin{center}
\includegraphics[width=0.6\linewidth]{BoundLemmaVisualization.png}
\end{center}
Note that $f$ then has Lipschitz constant $L$.

Moreover, note that $f$ has a unique fixed point, which occurs at $p_0 \in (k_1,k_2)$, since
\begin{equation*}
    f(p_0)=L(k_2-p_0)=L\left(p_0 + \frac{1}{L}p_0 - p_0\right) = p_0.
\end{equation*}
We can see that for any proper scoring rule, the optimal report under $f$ is in $[k_1,k_2]\cap [0,1]$.

Next we will show that for $p\in [k_1,k_2] \cap [0,1]$ to satisfy the bound $|f(p)-p|\leq \epsilon$, we must have that $p\in [p_0-\delta,p_0+\delta]$. To show this, observe that, if $p \in [k_1,k_2] \cap [0,1]$:
\begin{eqnarray*}
    f(p)-p &=& f(p)-f(p_0) - (p-p_0) +f(p_0)-p_0\\
    &=& -L(p-p_0) - (p-p_0) + f(p_0)-p_0\\
    &=& -(L+1)(p-p_0) + f(p_0)-p_0\\
    &\underset{p_0\text{ fixed point}}{=}& -(L+1)(p-p_0),
\end{eqnarray*}
so that, for $k_1\leq p \leq k_2$, we have $|f(p)-p|\leq\epsilon$ if and only if $|p-p_0|\leq\delta$. Thus, by assumption, we must have that the optimal report $p^*$ satisfies $p^* \in [p_0-\delta,p_0+\delta].$ We will show the claim of the lemma by showing that $p=p^*$ satisfies equation (\ref{g_deriv_bound}).

First, we will check that $p^*-2\delta$ is in $[0,1]$, and is still on the steep section of the graph, i.e., is in $[k_1,k_2]$.

We have
\begin{equation*}
    p^* - 2\delta \geq p_0 - 3\delta \underset{\text{\Cref{eq:def-k1}}}{=} k_1 + \frac{1}{L}(1-p_0) - 3\delta \underset{p_0\leq 1-3L\delta}{\geq} k_1.
\end{equation*}
%\co{I added a reference to the middle inequality, because I was at first confused about what line this was using. I assumed the idea was:
%\begin{equation*}
%    p_0\underset{p_0\text{ fixed point}}{=}f(p_0) = L(k_2 - p_0)=L(k_1+\frac{1}{L}-p_0) = Lk_1 + 1 - Lp_0,
%\end{equation*}
%which seems like it gets you a different term. I also added a reference to the final inequality to make it easier to see where this comes from.
%}
Also we chose $p_0$ to satisfy $p_0 - 3\delta \geq 0$, so that overall $p^*-2\delta \geq \max(k_1,0)$. Also, we must have that $\max(k_1,0) \leq p^* \leq \min (k_2,1)$.

Now we just use the optimality of $p^*$ and the definition of $f$ to get the result. $S(p^*,f(p^*)) \geq S(p^*-2\delta, f(p^*-2\delta))$, i.e.:
\begin{align*}
    G(p^*)+g(p^*)(f(p^*)-p^*) &\geq G(p^*-2\delta)+g(p^*-2\delta)(f(p^*-2\delta)-p^*+2\delta) \\
    &= G(p^*-2\delta)+g(p^*-2\delta)(f(p^*)+2L\delta-p^* +2\delta)
\end{align*}
Now, by the fact that $g$ is a subgradient of $G$, we have that $G(p^*)-G(p^*-2\delta)\leq 2\delta g(p^*)$. %\co{I think this is exactly the subgradient property at $p^*$.}
Thus, rearranging:
\begin{align*}
    2\delta g(p^*) +g(p^*)(f(p^*)-p^*) \geq g(p^*-2\delta)(f(p^*)+2L\delta - p^* + 2\delta)
\end{align*}
and so
\begin{align*}
    (f(p^*)-p^* + 2\delta)(g(p^*)-g(p^*-2\delta)) \geq 2\delta L g(p^*-2\delta).
\end{align*}
Thus, since $|f(p^*)-p^*|\leq\epsilon$
\begin{align*}
    g(p^*)-g(p^*-2\delta) &\geq g(p^*-2\delta)\frac{2L\delta}{\epsilon+2\delta}\\
    &= \frac{2L}{L+3}g(p^*-2\delta).
\end{align*}
%\co{I've now finished reading the above and am convinced that it is correct. But yeah, relative to how intuitive the proof idea is, I still feel like the proof itself is a little inscrutable. But let's leave it as is for now.}
%
%\ec{Do you prefer this version? It's longer, but I think maybe more intuitive:}\co{I haven't looked at the following in detail, but let's just stick with the above for now.}
%We have:
%\begin{eqnarray*}
%    &. &S(p^*,f(p^*))-S(p^*-2\delta, f(p^*-2\delta))\\
%    &= &g(p^*-2\delta)(f(p^*)-p^*-f(p^*-2\delta)+(p^*-2\delta))\\
%    &&+ (g(p^*)-g(p^*-2\delta))(f(p^*)-p^*)\\
%    &&+G(p^*)-G(p^*-2\delta)\\
%    &\leq& g(p^*-2\delta)(2L\delta+2\delta)\\
%    && + (g(p^*)-g(p^*-2\delta))(f(p^*)-p^*)\\
%    && +2\delta g(p*)
%\end{eqnarray*}
%
%Thus,
%\begin{equation*}
%    (g(p^*)-g(p^*-2\delta))(f(p^*)-p^* + 2\delta)\geq  2\delta g(p^*-2\delta)L
%\end{equation*}
%
%Giving us
%\begin{equation*}
%    (g(p^*)-g(p^*-2\delta)) \geq \frac{2L\delta}{\epsilon+2\delta}g(p^*-2\delta) = \frac{2L}{L+3}g(p^*-2\delta)
%\end{equation*}
\end{proof}
\begin{lemma}\label{exp_hops_everywhere}
    Let $g$ be any monotonically increasing nonnegative function with the property that on each interval of length $h$ fully contained in $[a,b]$ there is $p$ in that interval such that $g(p+h)-g(p)\geq yg(p)$. Then for all $p\in [a,b-h]$, $g(p+2h)-g(p)\geq yg(p)$, provided $g$ is defined on $[a,b+h]$.
    %\ec{Currently we have $b-2h$ in the statement and $b-h$ in the proof... I assume this is because we are going to want $p+2h\leq b$? But we don't talk about that, or where $g$ is defined, here. ETA: I think with the $1-4\epsilon$ version we can actually just use $b-h$ now.}
\end{lemma}
\begin{proof}
    For any $p \in [a,b-h]$, we have that the interval $[p,p+h]$ must contain some $p^*$ with $g(p^*+h)-g(p^*) \geq yg(p^*)$. Then, since $p\leq p^*$, and $p^*+h \leq p+2h$, we have, by monotonicity
    \begin{equation*}
        g(p+2h)-g(p) \geq g(p^*+h)-g(p^*)\geq yg(p^*) \geq y g(p).
    \end{equation*}
\end{proof}

\begin{comment}
\begin{corollary}\label{cor:scoring-rules-with-bounds-must-be-exponential}
Suppose $S$ is a proper scoring rule defined via $g>0$, that $\epsilon, L > 0$, and that whenever $f$ is $L$-Lipschitz, the optimal report $p^*$ satisfies $|f(p^*) - p^*| < \epsilon$. Let $\delta = \frac{\epsilon}{L+1}$. Then we have:
\begin{equation*}
        \frac{\sup_{x\in [2\delta,1-3\epsilon]} g(x+4\delta)-g(x)}{\inf_{x\in [2\delta,1-3\epsilon]} g(x+4\delta)-g(x)} \geq \frac{2L}{L+3}\left(3\frac{L+1}{L+3}\right)^{\lfloor (1-3\epsilon)/(4\delta) -\frac{3}{2} \rfloor}.
\end{equation*}
\end{corollary}
\end{comment}

\begin{comment}
\begin{proof}
Note that combining Lemma \ref{exp_hops_somewhere} with Lemma \ref{exp_hops_everywhere}, we get the conditions of Lemma \ref{compounding_hops} with $a= 2\delta$,\co{Why not $a=0$? Em says: Probably $a=0$ works.} $b= 1- \epsilon\frac{3L+1}{L+1}-2\delta = 1- 3\epsilon$, $h = 4\delta$,\co{(This is a little confusing because more than one of the lemmas has an $h$.)} $y = \frac{2L}{L+3}$, so that $(b-a)/h = (1-3\epsilon)/(4\delta)-1/2$.
\end{proof}
\end{comment}

\begin{lemma}\label{lemma:symmetry-for-bounds}
    Let $S(p,q)=g(p)(q-p)+G(p)$ be a (strictly) proper scoring rule, and $f: [0,1]\rightarrow[0,1]$ be $L_f$-Lipschitz. Let
    \begin{align*}
        \tilde g(p) &\defeq - g(1-p)\\
        \tilde G(p) &\defeq G(1-p)\\
        \tilde S(p,q) &\defeq \tilde g(p)(q-p)+\tilde G(p)\\
        \tilde f(p) &\defeq 1 - f(1-p)
    \end{align*}
    Then $\tilde S$ is a (strictly) proper scoring rule, $\tilde f$ is $L_f$-Lipschitz and
    \begin{equation*}
        \tilde S(1-p,\tilde f(1-p)) = S(p,f(p)).
    \end{equation*}
\end{lemma}
\begin{proof}
First, we show that $\tilde S$ is a (strictly) proper scoring rule, by verifying that it conforms to the \citeauthor{gneiting2007strictly} characterization. From the form of $\tilde G$, we can see that $\tilde G$ is (strictly) convex iff $G$ is. It remains only to check that $\tilde g$ is a subderivative of $\tilde G$. Indeed, since $g$ is a subderivative of $G$, we have
\begin{equation*}
    \tilde G(p)- \tilde G(q) = G(1-p)-G(1-q) \geq g(1-q)(1-p-1+q) = \tilde g (q) (p-q).
\end{equation*}
as required. By inspection, $\tilde f$ is $L_f$-Lipschitz.

Finally,
\begin{align*}
    \tilde S(1-p,\tilde f(1-p)) &= \tilde g(1-p)(\tilde f(1-p)-(1-p))+\tilde G(1-p)\\
    &= (-g(p))(1-f(p) - (1-p))+G(p)\\
    &= g(p)(f(p) - p) +G(p)\\
    &= S(p,f(p)).
\end{align*}
\end{proof}

\begin{lemma}\label{lemma:g-ratios}
Suppose $S$ is a proper scoring rule defined via $g$ s.t.\ for some $\epsilon, L_f > 0$ we have that whenever $f$ is $L_f$-Lipschitz, the optimal report $p^*$ satisfies $|f(p^*) - p^*| < \epsilon$. Let $\delta = \frac{\epsilon}{L_f+1}$. Further consider $p_l,p_h$ s.t.\ $3\epsilon-3\delta  \leq p_l \leq p_h \leq 1-3\epsilon-\delta$. Then we have:
\begin{equation*}
        \frac{\sup_{x\in [p_l,p_h]} |g(x+4\delta)-g(x)|}{\inf_{x\in [p_l,p_h]} |g(x+4\delta)-g(x)|} \geq \frac{2L_f}{L_f+3}\left(3\frac{L_f+1}{L_f+3}\right)^{(L_f+1)(p_h-p_l)/(8\epsilon) -5/2}.
\end{equation*}
\end{lemma}

%\co{Note: the only difference is that the $1$ in the exponent becomes a $\nicefrac{1}{2}$. The $\nicefrac{3}{2}$ to $1$ difference is either an error in the original result or an error in my result.}

\begin{proof}
We will show, equivalently, that for $3\epsilon -\delta \leq p_l \leq p_h \leq 1 - 3\epsilon + \delta$:

\begin{equation}\label{alt_form}
        \frac{\sup_{x\in [p_l,p_h]} |g(x+2\delta)-g(x-2\delta)|}{\inf_{x\in [p_l,p_h]} |g(x+2\delta)-g(x-2\delta)|} \geq \frac{2L_f}{L_f+3}\left(3\frac{L_f+1}{L_f+3}\right)^{(L_f+1)(p_h-p_l)/(8\epsilon) -5/2}.
\end{equation}

Consider first the case where $g((p_l+p_h)/2)<0$. Then consider $\tilde g$ as specified by \Cref{lemma:symmetry-for-bounds}. Note that from \Cref{lemma:symmetry-for-bounds} it follows that $\tilde S,\tilde g$ satisfy the claim of the lemma in the form of equation (\ref{alt_form}) for $[p_l',p_h']\defeq[1-p_h,1-p_l]$ if and only if $S,g$ satisfy the claim of the lemma for $[p_l,p_h]$:

\begin{align*}
        \frac{\sup_{x\in [p_l,p_h]} |g(x+2\delta)-g(x-2\delta)|}{\inf_{x\in [p_l,p_h]} |g(x+2\delta)-g(x-2\delta)|}
        &= \frac{\sup_{x\in [p_l,p_h]} |\tilde g(1-x+2\delta)-\tilde g(1-x-2\delta)|}{\inf_{x\in [p_l,p_h]} |\tilde g(1-x+2\delta)-\tilde g(1-x-2\delta)|}\\
        &= \frac{\sup_{y\in [1-p_h,1-p_l]} |\tilde g(y+2\delta)-\tilde g(y-2\delta)|}{\inf_{y\in [1-p_h,1-p_l]} |\tilde g(y+2\delta)-\tilde g(y-2\delta)|}.
\end{align*}

Note further that $\tilde g((p_l'+p_h')/2)>0$. Thus, for our proof we can assume WLOG $g((p_l+p_h)/2)\geq 0$ and thus by monotonicity $g(x)\geq 0$ for $x\geq(p_l+p_h)/2$.

Note that $p_h\leq 1-3\epsilon +\delta \leq1-3\epsilon+2\delta $. So, by \Cref{exp_hops_somewhere}, we have that in every $2\delta$ interval contained in $[(p_l+p_h)/2,p_h]$ there is a $p$ such that
\begin{equation*}
    g(p+2\delta)-g(p) \geq \frac{2L_f}{L_f+3}g(p).
\end{equation*}
Hence, by \Cref{exp_hops_everywhere}, we have that for all $p\in [(p_l+p_h)/2,p_h-2\delta]$,
\begin{equation*}
    g(p+4\delta)-g(p) \geq \frac{2L_f}{L_f+3}g(p)
\end{equation*}
since $p_h+2\delta \leq 1-3\epsilon + \delta + 2\delta \leq 1$.
Thus, by \Cref{compounding_hops},
\begin{align*}
&\phantom{ = } \frac{\sup_{x\in [p_l,p_h]} |g(x+2\delta)-g(x-2\delta)|}{\inf_{x\in [p_l,p_h]} |g(x+2\delta)-g(x-2\delta)|}\\
&=
\frac{\sup_{x\in [p_l-2\delta,p_h-2\delta]} |g(x+4\delta)-g(x)|}{\inf_{x\in [p_l-2\delta,p_h-2\delta]} |g(x+4\delta)-g(x)|}\\
        &\geq\frac{\sup_{x\in [(p_l+p_h)/2,p_h-2\delta]} g(x+4\delta)-g(x)}{\inf_{x\in [(p_l+p_h)/2,p_h-2\delta]} g(x+4\delta)-g(x)} \geq \frac{2L_f}{L_f+3}\left(1+\frac{2L_f}{L_f+3}\right)^{\lfloor (p_h-p_l)/(8\delta) - 1/2 \rfloor-1}.
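\end{align*}
Since $1+\frac{2L_f}{L_f+3}=3\frac{L_f+1}{L_f+3}\geq 1$, $\lfloor z-1/2\rfloor-1\geq z-5/2$ for all real $z$, and $(p_h-p_l)/(8\delta)=(L_f+1)(p_h-p_l)/(8\epsilon)$, the right-hand side is at least the bound claimed in equation (\ref{alt_form}).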
\end{proof}

\begin{comment}
\begin{proof}
Consider first the case where $g(\nicefrac{1}{2})<0$. Then consider $\tilde g$ as per \Cref{lemma:symmetry-for-bounds}. Note that from \Cref{lemma:symmetry-for-bounds} it follows that $\tilde S,\tilde g$ satisfy the claim of the Theorem if and only if $S,g$ satisfy the claim of the Theorem. Note further that $\tilde g(\nicefrac{1}{2})>0$. Thus, for our proof we can assume WLOG $g(\nicefrac{1}{2})\geq 0$ and thus by monotonicity $g(x)>0$ for $x>\nicefrac{1}{2}$.

%The rest of the proof works just like \Cref{cor:scoring-rules-with-bounds-must-be-exponential}.
By \Cref{exp_hops_somewhere}, we have that in every $2\delta$ interval contained in $[\nicefrac{1}{2},1-3\epsilon+2\delta]$ there is a $p$ such that
\begin{equation*}
    g(p+2\delta)-g(p) \geq \frac{2L}{L+3}g(p).
\end{equation*}
Hence, by \Cref{exp_hops_everywhere}, we have that for all $p\in [\nicefrac{1}{2},1-3\epsilon-2\delta]$,
\begin{equation*}
    g(p+4\delta)-g(p) \geq \frac{2L}{L+3}g(p).
\end{equation*}
Thus, by \Cref{compounding_hops},
\begin{eqnarray*}
        && \frac{\sup_{x\in [\nicefrac{1}{2},1-3\epsilon-2\delta]} g(x+4\delta)-g(x)}{\inf_{x\in [\nicefrac{1}{2},1-3\epsilon-2\delta]} g(x+4\delta)-g(x)}\\
        &\geq& \frac{2L}{L+3}\left(1+\frac{2L}{L+3}\right)^{\lfloor (\nicefrac{1}{2}-3\epsilon-2\delta)/(4\delta) \rfloor-1}\\
        &=& \frac{2L}{L+3}\left(3\frac{L+1}{L+3}\right)^{\lfloor (\nicefrac{1}{2}-3\epsilon)/(4\delta) -\nicefrac{3}{2}\rfloor}.
    \end{eqnarray*}
\end{proof}
\end{comment}

%\co{TODO: Should do some sanity checks. E.g., if $\epsilon=1$, the bound should be trivial.}

\begin{comment}
\co{As I mentioned before, we really need to give a fairly simple result. I think the important thing is that we have something exponential in $L/\epsilon$. Currently, this is very hard to get out of the above term.}

\co{Following is outdated.}
%Are we sure that one can't get a much simpler result via my original choice of $\delta=\epsilon$ or whatever it was?}

First let's bound the exponent:
\begin{eqnarray*}
&& \lfloor (\nicefrac{1}{2}-3\epsilon)/(4\delta) -\nicefrac{3}{2} \rfloor\\
&\geq& (\nicefrac{1}{2}-3\epsilon)/(4\delta) -\nicefrac{5}{2} \\
&=& (\nicefrac{1}{2}-3\epsilon)/\left(4\frac{\epsilon}{L+1} \right) -\nicefrac{5}{2} \\
&\geq& (\nicefrac{1}{8}-\epsilon)(L+1)/\epsilon -\nicefrac{5}{2}
\end{eqnarray*}
If, say, $\epsilon<1/16$, then this becomes $\geq (L+1)/(16\epsilon) -\nicefrac{5}{2}$, which I guess is as simple as we can expect it to be.

I don't know how to bound the other stuff if we can't assume anything about $L$. If we assume, e.g., $L\geq 1$, then we get $2L/(L+3)\geq 1/2$ and $(3(L+1)/(L+3))\geq 3/2$. So by making some assumptions about $L,\epsilon$, we can get
\begin{equation*}
    \geq \nicefrac{1}{2}(3/2)^{(L+1)/(16\epsilon) -\nicefrac{5}{2}} \geq (3/2)^{(L+1)/(16\epsilon) -\nicefrac{9}{2}}.
\end{equation*}
\end{comment}

\begin{restatable}{theorem}{needexponential}
Suppose $S$ is a proper scoring rule s.t.\ for some $\epsilon, L_f > 0$ we have that whenever $f$ is $L_f$-Lipschitz, the optimal report $\p$ satisfies $\Vert f(\p) - \p\Vert < \epsilon$. Let $3\epsilon \leq p_l \leq p_h \leq 1-4\epsilon$ and $\delta=\epsilon / (L_f+1)$. Then the ratio of the supremum and infimum over $p_1\in [p_l,p_h]$ of $S((p_1+4\delta,1-p_1-4\delta),(p_1,1-p_1)) - S((p_1,1-p_1),(p_1,1-p_1))$ is at least
\begin{equation*}
    \frac{L_f}{2L_f+6}\left(3\frac{L_f+1}{L_f+3}\right)^{(L_f+1)(p_h-p_l)/(8\epsilon) -5/2}.
\end{equation*}
In particular, for fixed positive $L_f$, this term is exponential in $1/\epsilon$ and for fixed positive $\epsilon$ it is exponential in $L_f$.
\end{restatable}

\begin{proof}
    Follows from \Cref{lemma:bounding-S-ratios-bounding-g-ratios,lemma:g-ratios}.
\end{proof}

\subsection{Proof of Theorem\ \ref{thm:no_higher_dim_bound}}

%\ec{In the below, interpret $g$ as the projection of $g$ onto the plane of the probability simplex in case that it sticks out of the plane.}\co{This is now taken care of by being able to assume that the entries of $g$ sum to 0, i.e., that $g$ is in the tangent space. Minor complication: need to deal with the case where one of the entries can be $-\infty$.}
\begin{comment}
We first need a lemma that shows that we can find a long isoline:
\begin{lemma}\label{lemma:long_isoline}
    Let $\Delta \subset \mathbb{R}^3$ be an equilateral triangle within a plane of $\mathbb{R}^3$, with sides of length $t$\ec{Does triangle usually include the space inside the triangle, or only the outline?}. Let $G:\Delta\rightarrow \mathbb{R}$ be strictly convex. Consider the isolines of $G$ within $\Delta$: that is, the curves on which $G$ is constant. (Note that by strict convexity, these are well-defined.) Then there exists a continuous section of some such isoline of length at least $t\frac{\sqrt{3}}{3}$. %\ec{I think probably we can get something more like $\sqrt{3}/2$ if we want it to be tight. I haven't tried very hard to get the longest length possible. Probably something something continuity increasing isolines either bisects eventually then done or [stuff dealing with the other case] would work.}\co{I think getting this up to $\sqrt{3}/2$ has approximately no value. (Assuming that it has a small constant impact on the bounds that we ultimately obtain.)}
\end{lemma}

\begin{proof}
    Consider the isoline through the center of the triangle $\Delta$. We have two cases: either it reaches the edge of the triangle, or it remains entirely within the triangle.
    
    In the first case, we have that since the minimum distance from the sides to the center of $\Delta$ is $t\sqrt{3}/6$, the isoline must have a contiguous length within $\Delta$ of at least $t\sqrt{3}/3$.

    In the second case, look at the isoline with $G$ minimal that touches the edge(s) of $\Delta$. This isoline encloses the center of $\Delta$, and in particular must enclose a line from the center of $\Delta$ to the edge of $\Delta$. Since this line has length at least $t\sqrt{3}/6$, the isoline must have a closed curve within $\Delta$ of length at least $t\sqrt{3}/3$.
\end{proof}
%\co{This proof seems correct and I appreciate its simplicity. :-P}

Now we'll find a segment of that isoline which doesn't turn too much but is still reasonably long:

\begin{lemma}\label{lemma:small_angle_segment}
    Let $\Delta \subset \mathbb{R}^3$ be an equilateral triangle within a plane of $\mathbb{R}^3$, with sides of length $t$. Let $G:\Delta\rightarrow \mathbb{R}$ be strictly convex. Suppose $\theta \in (0,\pi)$.

    Let 
    \begin{equation}
        l = t\frac{\sqrt{3}}{3}\frac{\frac{\theta}{2\pi}}{1 + \frac{\theta}{2\pi}}
    \end{equation}
    Then there exists a connected section of an isoline of $G$ in $\Delta$, of length at least $l$, such that for all $\p,\p'$ on this section, the angle between $g(\p)$ and $g(\p')$ is at most $\theta$. Equivalently, the associated subtangents to the isoline have angle varying by at most $\theta$.
\end{lemma}

\begin{proof}
By Lemma \ref{lemma:long_isoline}, $\Delta$ contains an isoline with a continuous segment of length $l_0 \geq t\sqrt{3}/3$.

If $l_0/l \geq 1$ is not an integer, remove a section of length at most $l$ so that the remaining section is of length evenly divisible by $l$. The length of the remaining section has length at least $l_0-l = l_0(1+\frac{\theta}{2\pi})^{-1} = l\frac{2\pi}{\theta}$.  Call this remaining section $\gamma$, and note that we may now assume WLOG that $\gamma$ is not closed: i.e., it has two distinct endpoints.

Note that the isolines of $G$ in $\Delta$ form (part of) the boundaries of convex sets, due to the strict convexity of $G$. In particular, it then makes sense to talk about the supporting lines of the isolines. Note that $g(\p)$ is always perpendicular to one such supporting line through $\p$, and by convexity the angle of $g(\p)$ is then monotonic along the isoline (measuring clockwise relative to the angle at the most anticlockwise endpoint, say).

Thus, the total angle that $\gamma$ turns through is at most $2\pi$, since it forms part of the boundary of a convex set. More precisely: Moving clockwise along $\gamma$, $g(\p)/\norm{g(\p)}$ traces clockwise around the edge of the unit circle (in the plane), and never turns further than its start point.

Then, divide $\gamma$ into sub-segments of length $l$. Since the average angle each of these turn through is at most $2\pi l/(l_0-l) = \theta$, we may choose a a sub-segment that turns through angle at most $\theta$. I.e., we may choose a sub-segment where the angle $g(\p)$ always lies in some interval $[0,\theta]$, measured relative to $g(\p)$ at the start of the segment. This is equivalent to the statement in the lemma, so we are done.
\end{proof}
\end{comment}

We first need a lemma showing that we can find a section of an isoline of sufficient length that does not turn too much:
\begin{lemma}\label{lemma:isoline_section}
Let $\Pset$ be the probability simplex in $\mathbb{R}^3$ (an equilateral triangle with side length $\sqrt{2}$ lying in a plane embedded in $\mathbb{R}^3$). Let $G:\Pset\rightarrow \mathbb{R}$ be strictly convex, with subgradient $g$ (with entries summing to 0). Let $r=\frac{\sqrt{6}}{12}$, and $0<l<r$.  Then we can find a section of an isoline of $G$, $\gamma$, with the following properties:
\begin{enumerate}
    \item $\gamma$ has length $l$.
    \item Let $\p$ be one endpoint of $\gamma$, and $\vec{n}$ be the unit vector parallel to $g(\p)$, i.e. $\vec n = g(\p)/\norm{g(\p)}$. Then, $\p-2r\vec{n} \in \Pset$, and $G(\p-2r\vec{n}) \leq G(\p)$.
    \item For all $\p' \in \gamma$, the angle $\theta$ between $g(\p)$ and $g(\p')$ satisfies $\theta \leq \frac{2l}{r}$.
    \item Each point on $\gamma$ has distance at least $r - l$ from the boundary of the simplex.
\end{enumerate}
Moreover, let $\gamma$ be an isoline section with the above properties. Let $\p$ and $\vec{n}$ be defined as above, and $\vec{t}$ a unit vector perpendicular to $\vec{n}$ (and to $\vec{1}$). Let $\q$ be the other endpoint of $\gamma$. Then $|\vec{t}^\top(\q-\p)| \geq l\left(1-\frac{2l}{r}\right)$. I.e. the length of $\gamma$ in the direction orthogonal to $\vec{n}$ is at least $l\left(1-\frac{2l}{r}\right)$.
\end{lemma}
\begin{proof}
    Note that each isoline $\{\x\in\Pset:G(\x)=y\}$ forms part of the boundary of the set $\{\x\in\Pset:G(\x)\leq y\}$, which by strict convexity of $G$ is a convex set. Call this the \textit{enclosed set} of the isoline. Then, at each point $\p$ on an isoline, there is at least one supporting line to the isoline, i.e., a straight line that touches the isoline but does not contain any of the interior points of the enclosed set. Moreover, $g(\p)$ is always perpendicular to a supporting line to the isoline through $\p$, and points out of the enclosed set. 
    
    We will proceed by finding an isoline tangent to and enclosing a circle at the center of $\Pset$, and then arguing that a section of this isoline has the desired properties.

    First, consider the circle of radius $r$ at the center of $\Pset$. Let $\p$ be a point on the boundary of this circle at which $G$ is maximal, and consider the isoline of $G$ through $\p$.

    \begin{center}
    \begin{tikzpicture}
        \coordinate (A) at (0,0);
        \coordinate (B) at (6,0);
        \coordinate (F) at (3,3.5);
        \coordinate (G) at (1,0);
        \tkzDefTriangle[equilateral](A,B);
        \tkzGetPoint{C};
        \coordinate (H) at ($0.6*(B)+0.4*(C)$);
        \tkzDefTriangleCenter[centroid](A,B,C);
        \tkzGetPoint{O};
        \coordinate (D) at ($(O)+6*(0,0.144)$);
        \tkzDrawPolygons(A,B,C);
        \tkzDrawPoints(O);
        \tkzLabelPoint[above right](O){$O$};
        \tkzDrawCircle(O,D);
        \tkzDefPointBy[rotation= center O angle 180](D);
        \tkzGetPoint{E};
        \draw [dashed] (O) -- (E) node [midway, left] {$r$};
        \tkzDefPointBy[rotation= center O angle 15](D);
        \tkzGetPoint{p};
        \tkzDrawPoints(p);
        \tkzLabelPoint(p){$\p$};
        \tkzDefPointBy[rotation= center O angle 15](F);
        \tkzGetPoint{n};
        \draw [->] (p)--(n)  node [midway, above right] {$g(\p)$};
        \draw (G) to [out = 80, in = 195] (p) to [out = 15, in = 150] (H);
    \end{tikzpicture}
    \end{center}

    Observe that, by construction, the enclosed set of this isoline contains the circle. Moreover, the isoline is tangent to the circle at $\p$. Note further that the tangent line to the circle at $\p$ must be the unique supporting line to the isoline through $\p$, so that $g(\p)$ is perpendicular to this tangent. As a consequence, we know that if $\vec{n} = \frac{g(\p)}{\norm{g(\p)}}$, then $\p - 2r\vec{n}$ is the point on the opposite side of the circle. Thus, we must have $G(\p-2r\vec{n}) \leq G(\p)$, as required.

    Now, we have two cases: either the isoline stays within the interior of the simplex, or it reaches the boundary of the simplex.
    In the former case, the total length of the isoline is at least the circumference of the circle, i.e., $2\pi r > l$. Thus, we may choose a section (with two distinct endpoints) of the isoline, $\gamma$, with length $l$ and endpoint $\p$.

    Now, note that the distance from the center of the simplex to the (nearest point on the) boundary is $\frac{\sqrt{6}}{6} = 2r$. Hence, the minimum distance from the circle to the boundary of the simplex is at least $2r - r = r > l$. Thus, in the latter case, the isoline must have a connected section starting at $\p$ of length $l$. Call this section $\gamma$.

    In either case, $\gamma$ has distance at least $r-l$ from the boundary of the simplex.

    Now, we will bound the change in the angle of supporting lines, moving along $\gamma$. WLOG assume $\p$ is the anticlockwise-most point of $\gamma$.


    \begin{center}
    \begin{tikzpicture}
        \coordinate (O) at (0,0);
        \coordinate (P) at (0,7);
        \coordinate (P+) at (1,7);
        \coordinate (P-) at (-1,7);
        \coordinate (A) at (5,6);
        \coordinate (T0) at (P);
        \coordinate (T1) at ($(P)+(6,0)$);
        \tkzDrawArc[R, color = black](O,7)(0,90);
        \tkzDrawPoints(P, O,A);
        \tkzLabelPoint(O){$O$};
        \tkzLabelPoint[left](P){$\p$};
        \draw (P) to [out = 0, in = 150]  node [midway, below right] {$\gamma$} (A) node [above right] {$\q$};
        \draw [dashed] (T0) -- (T1);
        \draw [dashed] (O) -- (P) node [midway, left] {$r$};
        \tkzMarkRightAngle(T1,P,O);
        \tkzDefPointBy[rotation= center O angle -75](P);
        \tkzGetPoint{C};
        \tkzDrawPoints(C);
        \tkzLabelPoint(C){$C$};
        \tkzDefPointBy[rotation= center O angle -75](P+);
        \tkzGetPoint{C+};
        \tkzDefPointBy[rotation= center O angle -75](P-);
        \tkzGetPoint{C-};
        \tkzInterLL(C-,C+)(P,P+);
        \tkzGetPoint{B};
        \tkzDrawLines[add = 0.3 and 0.1](C,B);
        \tkzDrawPoints(B);
        \tkzLabelPoint(B){$B$};
        \draw [dashed] (O) -- (C)  node [midway, below right] {$r$};
        \tkzMarkAngle(C,O,P);
        \tkzLabelAngle[above right](C,O,P){$\theta$};
        \tkzMarkRightAngle(O,C,B);
        \tkzDefMidPoint(P,B);
        \tkzGetPoint{l1};
        \tkzLabelPoint[above](l1){$l$};
        \tkzDefMidPoint(C,B);
        \tkzGetPoint{l2};
        \tkzLabelPoint[right](l2){$l$};
        \draw [->] (P)--($(P)+(0,1)$) node [midway, right] {$g(\p)$};
        
    \end{tikzpicture}
    \end{center}

    
    Let the center of the circle be $O$. Consider the tangent to the circle at the point $C$, where $C$ is such that $OC$ is at an angle of $\theta$ from the line from $O$ to $\p$. Let the intersection of the tangents through $C$ and $\p$ be $B$. Let $\theta$ be such that the line from $\p$ to $B$ has length $l$. Note that $\theta<\pi/2$, since $l<r$. Let $\q$ be the other (clockwise-most) endpoint of $\gamma$.

    Note then that since the isoline must lie below the line $\p B$, and the length of $\gamma$ is $l$, $\gamma$ never crosses the line $BC$.

    Meanwhile, consider a point $\p'$ on $\gamma$. Note that since the enclosed set of the isoline contains the circle, no supporting line to the isoline through $\p'$ contains interior points of the circle. The angle of such a line must then lie between the angles of the tangents at $\p$ and $C$ (i.e., it is at most as steep as $BC$ in the diagram), and so makes an angle of at most $\theta$ with $\p B$. Since $g$ points out of the enclosed set of the isoline, orthogonal to its supporting lines, the difference in angle between $g(\p')$ and $g(\p)$ is then at most $\theta$.

    Therefore, we have that:
    \begin{equation*}
        \frac{\theta}{2} \leq \tan\left(\frac{\theta}{2}\right) = \frac{l}{r}
    \end{equation*}
    since $\tan x \geq x$ for $x \in [0,\pi/2)$. Thus, $\theta \leq \frac{2l}{r}$, as required. We have now established that $\gamma$ has the stated properties.

    We need then only check the final part of the statement, i.e., that $\gamma$ with these properties has sufficient length in the direction orthogonal to $\vec{n}$. Let $\vec{t}$ be as in the statement of the lemma, i.e., parallel to the supporting line through $\p$. We have by convexity that $\gamma$ lies within the triangle defined by supporting lines to the isoline at $\p$ and $\q$ and the straight line from $\p$ to $\q$. Hence, since $\theta<\pi/2$, $\gamma$ lies entirely within the triangle defined by the supporting line at $\p$ parallel to $\vec{t}$, the line through $\q$ parallel to $\vec{n}$, and the line from $\p$ to $\q$, as depicted in the diagram below.
        \begin{center}
    \begin{tikzpicture}
        \coordinate (P) at (0,1);
        \coordinate (Q) at (5,0);
        \coordinate (R) at (5,1);
        \tkzDrawPoints(P,Q);
        \tkzLabelPoint[left](P){$\p$};
        \draw (P) to [out = 0, in = 150]  node [midway, below right] {$\gamma$} (Q) node [above right] {$\q$};
        \draw [dashed] (P) -- (R)--(Q)--(P);
        \tkzMarkRightAngle(P,R,Q);

        
        \node (compass) at (-2,0) {};
        \draw [->](compass.north)--(-2,1) node [midway, left] {$\vec{n}$};
        \draw [->](compass.east)--(-1,0) node [midway, below] {$\vec{t}$};
        
    \end{tikzpicture}
    \end{center}
    Moreover, the maximum possible length of a convex path, within this triangle, from $\p$ to $\q$ is just the combined length of the two shorter sides, i.e. $|\vec{n}^\top(\p-\q)| + |\vec{t}^\top(\p-\q)|$. Thus, $\left|\vec{t}^\top(\p-\q)\right| \geq l - \left|\vec{n}^\top(\p-\q)\right|$. Then, note that the straight line from $\p$ to $\q$ makes angle  at most $\theta$ with the line parallel to $\vec{t}$, since its angle must lie between the angle of supporting lines at $\p$ and $\q$.
    
    Thus,
    \begin{equation*}
        |\vec{n}^\top(\p-\q)| \leq \sin\theta \norm{\p-\q} \leq l \sin\theta \leq l\theta.
    \end{equation*}

    Hence, we have
    \begin{equation*}
        |\vec{t}^\top(\p-\q)| \geq l - l\theta \geq l\left(1-\frac{2l}{r}\right)
    \end{equation*}
    and we are done.
\end{proof}
\begin{comment}
We also need a small lemma that if an isoline doesn't turn very much, and is reasonably long, then it's reasonably wide (from end to end):

\begin{lemma}\label{lemma:isoline_width}
    Let $\gamma$ be a convex curve of length at least $l$ on which supporting lines have angles varying by at most $\theta$ (for some choice of supporting line at each point on $\gamma$). Then the distance between the endpoints of $\gamma$ is at least $l\sqrt{\frac{1+\cos\theta}{2}}$.
\end{lemma}
\begin{proof}
Let the endpoints of $\gamma$ be $A$ and $B$, and $l_1$ and $l_2$ be their respective distances to the intersection of the supporting lines through $A$. Write $\theta'$ for the difference in angle between these supporting lines, so that $\theta'\leq\theta$. Let $c$ be the distance between $A$ and $B$.
\begin{center}
\begin{tikzpicture}
    \coordinate (A)  at (0,0);
    \coordinate (B) at (10,0);
    \coordinate (C) at (4,1);
    \draw [dashed] (A) node [left] {$A$}  -- (C) node [midway,above right] {$l_1$}-- (B) node [right] {$B$} node [midway,above left] {$l_2$};
    \draw [dashed] (A)--(B) node [midway, above] {$c$};
    \draw (A)  .. controls (3,0.75) and (5.5,0.75)  .. (B);
    \tkzMarkAngle[size = 0.3](A,C,B);
    \tkzLabelAngle[pos = -0.2](A,C,B){$\pi-\theta'$};
\end{tikzpicture}
\end{center}

Note that we must have $l_1+l_2 \geq l$. Then, we have that, by the law of cosines, and provided $\theta <\pi/2$:
\begin{align*}
    c^2 &= l_1^2 + l_2^2 - 2l_1l_2\cos(\pi-\theta')\\
    &= l_1^2 + l_2^2 + 2l_1l_2\cos(\theta')\\
    &= (l_1+l_2)^2 - 2l_1l_2(1-\cos\theta')\\
    &\geq l^2 - 2l_1(l-l_1)(1-\cos\theta')\\
    &\geq l^2 - \frac{1}{2}l^2(1-\cos\theta')\\
    &= \frac{l^2}{2}(1 + \cos\theta')\\
    & \geq \frac{l^2}{2}(1 + \cos\theta)
\end{align*}
The result then follows immediately.
\end{proof}
\end{comment}
Now, our main result: We can't get arbitrarily good bounds for fixed Lipschitz constant, and the bound one can at best get scales linearly with $L_f$ in the limit $L_f \rightarrow 0$.

%\begin{thm}\label{thm:no_higher_dim_bound}
%For any Lipschitz constant $L$, for $\epsilon>0$ sufficiently small, there is no proper scoring rule $S$ on $\mathbb{R}^3$ that achieves the following property: Whenever $f$ is $L$-Lipschitz, there is some $p^* \in \argmax_p S(p,f(p))$ with $\norm{f(p^*)-p^*} \leq \epsilon$.
%
%In particular, there exists $\epsilon(L)$ which is $O(L)$ in the limit $L \rightarrow 0$, for which the above property cannot be achieved with $\epsilon = \epsilon(L)$. Thus, the achievable bound is at best $O(L)$ as $L \rightarrow 0$.
%\end{thm}

\begin{restatable}{theorem}{impossibility}
For any Lipschitz constant $L_f$, for $\epsilon>0$ sufficiently small, there is no proper scoring rule $S$ for the three-outcome case that achieves the following property: Whenever $f$ is $L_f$-Lipschitz, there is some performatively optimal report $\p$ with $\Vert f(\p)-\p\Vert  \leq \epsilon$. In particular, there exists some function $\epsilon(L_f)$ with $\epsilon(L_f) \sim c L_f$ as $L_f \rightarrow 0$ for some fixed constant $c > 0$, s.t.\ the above property cannot be achieved with $\epsilon = \epsilon(L_f)$.  Thus, the best achievable bound is in $\Omega(L_f)$ as $L_f \rightarrow 0$, i.e., it scales at least linearly with $L_f$ in the limit.
\end{restatable}


\begin{proof}
Let $g$ and $G$ be as in the Gneiting and Raftery characterization of $S$. Let $\lambda = \min(L_f,2)$.

\jt{is it possible to illustrate this somehow?}

We will proceed as follows:
\begin{itemize}
    \item Find an isoline of $G$ on which the angle of $g(\p)$ doesn't change much. On this isoline, we are then able to move along the isoline without $g(\p)$ changing much in the direction of movement, and hence without $g(\p)^\top  \p$ changing much.
    \item Construct a $\lambda$-Lipschitz (and hence $L_f$-Lipschitz) function $f$ with fixed point $\p_0$ such that as we move sideways along the isoline, $f(\p)$ moves upwards, incentivising us to misrepresent in the direction of the isoline.
    \item We will then show that for a point $\q$, with $\norm{f(\q)-\q} \geq \epsilon$, reporting $\q$ gives higher score than any point $\p$ for which $\norm{f(\p)-\p} <\epsilon$ (for $\epsilon$ which we will choose).
\end{itemize}

Let $\theta = \arctan(\lambda/4)\leq\lambda/4\leq 1/2$. Note that then $\theta \sim L_f/4$ as $L_f \rightarrow 0$.

Let $r = \frac{\sqrt{6}}{12}$. Then, let $\gamma$ be an isoline satisfying the properties of Lemma \ref{lemma:isoline_section} with $l = \frac{\theta r}{2}$. Let the end points of $\gamma$ be $\p_1$ and $\q$ (chosen such that $\q$ is the same end as $\q$ in the statement of the Lemma), $\vec{n}\defeq \frac{g(\p_1)}{\norm{g(\p_1)}}$, and $\vec{t}$ a unit vector orthogonal to both $\vec{n}$ and $\vec{1}$. Then we have, in particular:
\begin{enumerate}[label = (P\arabic*)]
    \item $\p_1-2r\vec{n} \in \Pset$ and $G(\p_1-2r\vec{n}) \leq G(\p_1)$.\label{p:can_drop}
    \item For all $\p \in \gamma$, the angle between $g(\p)$ and $g(\p_1)$ (equivalently, $\vec{n}$) is at most $\frac{2l}{r} = \theta$.\label{p:small_angle}
    \item Each point on $\gamma$ has distance at least $r-l \geq 3l$ from the boundary of the simplex.\label{p:margin}
    \item $|\vec{t}^\top(\q-\p_1)| \geq l(1-\frac{2l}{r}) = l(1-\theta)$.\label{p:width}
\end{enumerate}

Let $\epsilon = \frac{1}{2}|\vec{t}^\top(\q-\p_1)|$. We have, by \ref{p:width}, $\epsilon \geq l(1-\theta)/2\geq l/4 > 0$. Note that $l(1-\theta)/2 \leq \epsilon \leq l/2$, and $l\theta = o(L_f)$, and so $\epsilon \sim \frac{1}{2}l = \frac{1}{4}\theta r \sim \frac{1}{16}L_f r=\frac{\sqrt{6}}{192}L_f$ as $L_f \rightarrow 0$.

Let $\p_0 = \p_1 - \epsilon\lambda\vec{n}$. Note that since $\epsilon\lambda \leq 2\epsilon \leq l$, we have by \ref{p:margin} that $\p_0$ is within the simplex.

\begin{center}
\begin{tikzpicture}[
    tangent/.style={
        decoration={
            markings,% switch on markings
            mark=
                at position #1
                with
                {
                    \coordinate (tangent point-\pgfkeysvalueof{/pgf/decoration/mark info/sequence number}) at (0pt,0pt);
                    \coordinate (tangent unit vector-\pgfkeysvalueof{/pgf/decoration/mark info/sequence number}) at (1,0pt);
                    \coordinate (tangent orthogonal unit vector-\pgfkeysvalueof{/pgf/decoration/mark info/sequence number}) at (0pt,1);
                }
        },
        postaction=decorate
    },
    use tangent/.style={
        shift=(tangent point-#1),
        x=(tangent unit vector-#1),
        y=(tangent orthogonal unit vector-#1)
    },
    use tangent/.default=1
]
    \coordinate (B) at (5,0.5); %right endpoint
    \coordinate (p1) at (0,1); %p_0+\lambda\epsilon n
    \coordinate (p0) at (0,0); %p_0

    %drawing gamma and line AB:
    \draw [name path = gamma, tangent = 1, tangent = 0.6] (p1) node [above left] {$\p_1$} to[out = 0, in = 165] (B); %curve \gamma
    %\draw [use tangent=2] (0,0) node [above]{$\gamma$}; %labelling gamma

    %defining q and point below q:
    \coordinate [use tangent] (q); %q
    \draw (q) node [right] {$\q$};
    \coordinate (q-) at (q |- p0); %p_0 + 2*epsilon t
    \draw (q-) node [below] {$\p_0+2\epsilon \vec{t}$};

    %showing relations between points:
    \draw [dashed] (q-) -- (q);
    \draw [dashed] (p1) -- (p0) node [below] {$\p_0$};
    \draw [dashed] (p0)--(q-) node [midway, below] {2$\epsilon$};

    %put circles at all the points:
    \tkzDrawPoints(p1,p0,q,q-);

    %showing the directions n and t:
    \node (compass) at (8,1) {};
    \draw [->](compass.north)--(8,2) node [midway, left] {$\vec{n}$};
    \draw [->](compass.east)--(9,1) node [midway, below] {$\vec{t}$};

    %drawing normals to show g:
    \draw [use tangent, ->] (0,0.2) -- (0,1.3) node [midway, right]{$g(\q)$};
    \draw [->] ($(p1)+(0,0.2)$) -- ($(p1)+(0,1.5)$) node [midway, right]{$g(\p_1)$};
\end{tikzpicture}
\end{center}

By construction, there is a supporting line to $\gamma$, parallel to $\vec{t}$, through $\p_1$. Thus, $(\q-\p_1)^\top \vec{n} \leq 0$.

Now, let $f(\p) = \p_0 + \lambda\min(|\vec{t}^\top (\p-\p_0)|,2\epsilon)\vec{n}$. Note that the image of $f$ is the line segment $[\p_0,\p_0 + 2\epsilon\lambda\vec{n}]$, which has maximum distance $\lambda\epsilon \leq l$ from $\gamma$, and hence by \ref{p:margin} is entirely within the probability simplex. Also, $f$ has Lipschitz constant $\lambda \leq L_f$.

Then, if $\norm{f(\p)-\p}\leq\epsilon$, we must have
\begin{equation*}
    \epsilon \geq |\vec{t}^\top (f(\p)-\p)| = |\vec{t}^\top (\p_0-\p)|
\end{equation*}
and hence $f(\p)$ must in fact lie in the line segment $[\p_0,\p_0 + \epsilon\lambda\vec{n}] = [\p_0,\p_1]$. Moreover, $\norm{f(\q)-\q} \geq 2\epsilon > \epsilon$.

Meanwhile, we have by convexity and \ref{p:can_drop} that for $\p \in [\p_1-2r\vec{n}, \p_1]$, $G(\p) \leq \max(G(\p_1-2r\vec{n}),G(\p_1)) = G(\p_1)$. Hence, since $\lambda\epsilon < 2r$, the maximum of $G$ on $[\p_0,\p_1]$ is $G(\p_1)$.

Therefore, whenever $\norm{f(\p)-\p}\leq\epsilon$:

\begin{equation*}
    S(\p,f(\p))\leq S(f(\p),f(\p)) = G(f(\p)) \leq G(\p_1)
\end{equation*}
that is, the maximum achievable score is at most the score of honestly reporting $\p_1$.

We will now show that the score of reporting $\q$ is greater than $G(\p_1)$.

First, we have that
\begin{align*}
    S(\q,f(\q))&= g(\q)^\top (f(\q)-\q) + G(\q) &\text{(Gn\&Raf)}\\
    &= g(\q)^\top (\p_0+2\epsilon\lambda \vec{n} - \q) + G(\p_1) &\text{(Def. of } f, \q)
\end{align*}

It remains to show that the first summand is positive. We have that
\begin{align*}
    & g(\q)^\top (\p_0+2\epsilon\lambda\vec{n} -\q)\\
    &= (2\epsilon\lambda+(\p_0-\q)^\top \vec{n})g(\q)^\top \vec{n} + ((\p_0-\q)^\top \vec{t})g(\q)^\top \vec{t}\\
    &\geq  \epsilon\lambda g(\q)^\top \vec{n} - 2\epsilon |g(\q)^\top \vec{t}| &\text{(Def. of }\q,\p_0)\\
    &\geq \norm{g(\q)}\epsilon(\lambda\cos\theta - 2\sin\theta) &\text{(By \ref{p:small_angle})}\\
    &=  2\norm{g(\q)}\epsilon \cos(\theta)(\lambda/2-\tan\theta)\\
    &\geq \norm{g(\q)}\epsilon\cos(\theta) \lambda/2 > 0 &\text{(Choice of }\theta)
\end{align*}
Hence $S(\q,f(\q)) > G(\p_1) \geq S(\p,f(\p))$ for every $\p$ with $\norm{f(\p)-\p}\leq\epsilon$, so no such $\p$ is performatively optimal.

\end{proof}

\section{Preferences between different fixed points}
\label{preferences-between-fps}


\begin{proposition}
\label{prop:preferences-between-fps}
    Let $F=\{\p\colon f(\p)=\p\}$ be a set of fixed points of $f$. Let $\p\in F$ such that $\p$ is a convex combination of elements of $F-\{\p\}$. (In other words, $\p$ is in the interior of the convex hull of $F$.) Then if $S$ is strictly proper, there exists a $\p^*\in F$ s.t.\ $S(\p^*,f(\p^*))> S(\p,f(\p))$. Thus, $\argmax_{\p\in F} S(\p,f(\p))$ is a subset of the extreme points of $F$.
\end{proposition}

This follows directly from the convexity of the expected score under honest reporting as per \Cref{theorem:gneiting-raftery}, but for completeness we provide a detailed proof.

\begin{proof}
Let $\p = \sum_{i=1}^k c_i\p_i$ for $c_i\in [0,1]$ with $\sum_{i=1}^k c_i=1$ and $\p_i\in F-\{\p\}$. Then
\begin{eqnarray*}
S(\p,f(\p)) &=& g(\p)^\top(f(\p)-\p)+G(\p)\\
& \underset{\text{$\p$ fixed point}}{=} & G(\p)\\
&= & G\left(\sum_{i=1}^k c_i\p_i\right)\\
&\underset{\text{$G$ strictly convex}}{<} & \sum_{i=1}^k c_i G(\p_i)\\
&\underset{\text{$\p_i$ fixed point}}{=}& \sum_{i=1}^k c_i (g(\p_i)^\top(f(\p_i)-\p_i) +G(\p_i))\\
&=& \sum_{i=1}^k c_i S(\p_i,f(\p_i)).
\end{eqnarray*}
Now for the average of the $S(\p_i,f(\p_i))$ to be greater than $S(\p,f(\p))$, at least one of the $S(\p_i,f(\p_i))$ must be greater than $S(\p,f(\p))$.
\end{proof}

\co{LOW PRIORITY: Think about the symmetric case again. Maybe one can prove something interesting for the many-outcome case after all. But given our results the two-outcome case is most important.}

%One issue with oracle AIs arises when there are multiple possible fixed points, since the choice of a fixed point gives the AI an axis over which it can optimize the world. This problem would be alleviated if an AI were to simply choose among fixed points randomly, for instance. Unfortunately, we can show that under a strictly proper scoring rule, an AI is incentivized to choose extreme fixed points, i.e., predictions that make one of the outcomes most likely. %This means that the AI is instrumentally incentivized to try to bring about this outcome, and it may make the world as predictable as possible in doing so. %That means that the AI is instrumentally incentivized to manipulate the world to bring about one of the outcomes, and it may try to make the world as predictable as possible in doing so.%, a goal that may end distastrously for humanity if achieved.\footnote{Note that, as mentioned above, optimizing towards randomness may not be any better.}

%Specifically, we prove that for any fixed point, either all fixed points with lower probabilities or all fixed points with higher probabilities will result in a higher score. Our results do not unambiguously show that lower entropy fixed points are always preferred, since scoring rules may not be symmetric: they may favor one of the outcomes over the other. We show that if \(S\) is symmetric, i.e., \(\Score(p,p)=\Score(1-p,1-p)\) for all \(p\), then this means that lower entropy fixed points, corresponding to more confident predictions, are always preferred. Moreover, we show that there exist (asymmetric) strictly proper scoring rules that incentivize making either of the outcomes more likely. %This means that the fixed point leading to the highest score will always be the most extreme fixed point in one direction.


%\begin{proposition}
\co{LOW PRIORITY: Prove the following.

Let $F$ be a set of fixed points of $f$ and let $\p^*$ be an extreme point of $F$, i.e., an element of $F$ that is not the convex combination of other elements of $F$. Then there exists a strictly proper scoring rule $S$ such that $\argmax_{\p\in F} S(\p,f(\p))=\p^*$.}

\co{There are alternative versions of the above, but I think the above is the simplest. If $G$ was allowed to be weakly convex, then $G$ could just be $v^\top p$ s.t.\ $G$ could be any utility function. But since $G$ must be strictly convex, this can only be approximately true. Which is still interesting, but not as nice to state.}

\begin{comment}
\color{blue}

\begin{proposition} \label{prop:extreme-fp}Let \(S\) be any strictly proper scoring rule. Then ...%Then there is a point \(\hat{p}\in [0,1]\) such that \(\Score(p',p')>\Score(p,p)\) whenever \(|\hat{p}-p'|>|\hat{p}-p|\). If \(S\) is symmetric, lower-entropy fixed points are preferred.
\end{proposition}

\begin{proof} 
Follows from Gneiting and Raftery
%If \(p \in \{0,1\}\) then the proposition is vacuously true. We focus on the case where \(p \in \left(0,1\right)\).
%Since \(S\) is a strictly proper scoring rule, it is
%\[\Score\left(p',p'\right) > \Score\left(p,p'\right) = p'S\left(p,1\right) + (1 - p')S\left(p,0\right).\]

%Next, if \(S\left(p,1\right) \geq S\left(p,0\right)\), let \(p' \geq p\) arbitrary, and if \(S\left(p,1\right) \leq S\left(p,0\right)\), let \(p' \leq p\). Then we have
%\[p'S\left(p,1\right) + (1 - p')S\left(p,0\right) \geq pS\left(p,1\right) + (1 - p)S\left(p,0\right) = \Score\left(p,p\right).\]
%Combining both equations, we get
%\[\Score\left(p',p'\right) > p'S\left(p,1\right) + (1 - p')S\left(p,0\right)\geq\Score\left(p,p\right),\]
%which concludes the first part of the proof.

%For the ``in particular'' part, note that for any points \(p\) and \(p'\), it is trivial to find a function \(f\) such that both points are fixed points. For instance, all points are fixed points of the identity function \(f(x)=x\). The statement then follows from the first part of the result, since for \(p\in (0,1)\), we must have \(\Score(p',p')>\Score(p,p)\) for either \(p'=0\) or \(p'=1\).
\end{proof}


%\begin{corollary}
%     Assume that \(S\) is symmetric and let \(p\in [0,1]\) arbitrary. Then we have \(\Score(p',p')>\Score(p,p)\) for all \(p'\) such that \(|p'-1/2|>|p-1/2|\). That is, lower entropy fixed points are always preferred.
% \end{corollary}

%\begin{proof}
    %Let \(p\) arbitrary and assume \(p\geq1/2\) (the case \(p<1/2\) follows  analogously). Let \(p'\) such that \(|p'-1/2|>|p-1/2|\), which implies \(p'>p\) since \(p\geq 1/2\). By Proposition~\ref{prop:extreme-fp}, we have \(\Score(p',p')>\Score(p,p)\) either for all \(p'>p\) or for all \(p'<p\). In the first case, we are done. In the second case, note that \(p\geq1/2 \geq 1-p\geq 1-p'\). By symmetry of \(S\), it follows that
    %\[\Score(p',p')=\Score(1-p',1-p')>\Score(p,p).\]
    %This shows the second case and concludes the proof.
%\end{proof}

\jt{generalize}
\begin{proposition}\label{prop:arbitrary-outcomes}
    For each outcome \(y\in \{0,1\}\), there exists a strictly proper scoring rule \(S\) that incentivizes optimizing for \(y\); i.e., such that \(\Score(p',p')>\Score(p,p)\) for all \(p,p'\) with \(|p'-y|<|p-y|\).
\end{proposition}

See proof in related work

%\begin{proof}To begin, let \(y=1\) and define \(G(p):=p^2\). Since this function is strictly convex, it defines a strictly proper scoring rule \(S\) by the Gneiting and Raftery characterization (\cref{theorem:gneiting-raftery}). Then, for any \(p'>p\), we have
%\[\Score(p',p')=G(p')={p'}^2>p^2=\Score(p,p).\] This shows that \(S\) incentivizes choosing fixed points that make outcome \(1\) more likely.

%Next, for the case \(y=0\), we can choose \(G(p):=(1-p)^2\). The proof then follows analogously to the case \(y=1\).
%\end{proof}

\end{comment}

\section{Additional experimental results}
\label{appendix:experimental-results}

\subsection{Two outcomes}

%\(\sup_p|g(p)/g'(p)|\approx 0.22\) for the log scoring rule.

%\begin{figure}
%
%\includegraphics[width=0.5\columnwidth]{experiment_illustration.png}
%
%\caption{Illustration of our experimental setup, with a given function \(f\) parameterized by a fixed point \(p^*\) and slope \(\alpha\), and the optimal prediction \(\hat{p}\), for the log scoring rule.}
%\end{figure}

\begin{figure}[H]
\centering
\includegraphics[width=0.7\columnwidth]{max-l2-distance-log.pdf}
\caption{Maximal inaccuracy and maximal distance to fixed point (FP) of optimal predictions, depending on the slope of \(f\), according to our simulation and our theoretical bound.}
\label{fig:max-distance-log-score-new}
\end{figure}

\begin{figure}[H]
\centering
\includegraphics[width=0.49
\textwidth]{density-plot-log-l2-inaccuracy.pdf}
\caption{Heatmap of the L2 inaccuracy  of optimal predictions, depending on fixed point position and slope of \(f\), for the logarithmic scoring rule.}
\label{fig:density-plot-log-l2-inaccuracy}
\end{figure}

\begin{figure}[H]
\centering
\includegraphics[width=0.49
\textwidth]{density-plot-log-l2-disttofp.pdf}
\caption{Heatmap of the L2 distance to the fixed point of optimal predictions, depending on fixed point position and slope of \(f\), for the logarithmic scoring rule.}
\label{fig:density-plot-log-l2-disttofp}
\end{figure}

\Cref{fig:max-distance-log-score-new,fig:density-plot-log-l2-disttofp,fig:density-plot-log-l2-inaccuracy} give the same graphs that we give for the Brier scoring rule in the main text.
\jt{We have to give the theoretical bound here! (since we don't give in main text)}

Here the bounds for the log scoring rule are obtained as follows. First, note that for the log scoring rule we have that $g(\p)=(\log p_i - \nicefrac{1}{2} (\log p_1+\log p_2 ))_i$. So, $\Vert g(\p)\Vert =\frac{| \log (p_1)-\log (p_2)| }{\sqrt{2}}$ and $Dg(\p)=\begin{pmatrix}
\frac{1}{2p_1} & -\frac{1}{2p_2}\\
-\frac{1}{2p_1} &   \frac{1}{2p_2}\\
\end{pmatrix}.$ The eigenvalue of this on the tangent space is $1/(2p_1p_2)$. Thus, since $Dg$ is symmetric, $Dg(\p)\succeq 1/(2p_1p_2)$. By \Cref{theorem:Caspar-approx-fix-point},
$\Vert \p-f(\p) \Vert \leq L_f \Vert g(\p) \Vert 2p_1p_2= \sqrt{2} L_f p_1p_2 | \log (p_1)-\log (p_2)|$. Numerically this bound seems to be maximized at $p_1=0.824$ so that we get a bound $\Vert \p-f(\p) \Vert \leq 0.317 L_f$. Similarly, by \Cref{thm:distance-to-fp}, $\Vert \p - \p^* \Vert \leq 0.317 L_f / (1-L_f)$.

For the logarithmic scoring rule, we also give the same plot for the absolute distance between the logits or log odds of the two probabilities (logit distance), see \Cref{fig:density-plot-log-score-odds}. It is defined as \(d(\p,\p'):=|\sigma^{-1}(\p)-\sigma^{-1}(\p')|\), where
\(\sigma^{-1}(\p):=\log \frac{p_1}{p_2}\) is the logit of \(\p\) (or the inverse sigmoid transform). If probabilities are close to \(0\) or \(1\), then L2 distance will always evaluate to very small distances. In contrast, the logit distance depends on order of magnitude differences between probabilities, which may be the more useful quantity.

\begin{figure}[H]
\centering
\includegraphics[width=0.5\textwidth]{density-plot-log-score-odds.png}
\caption{Heatmap of logit distance inaccuracy of optimal predictions for the log scoring rule.}
\label{fig:density-plot-log-score-odds}
\end{figure}

We can see that inaccuracy remains high in logit space for fixed points close to \(0\) and \(1\). We don't plot logit distances for the quadratic score, since for that score, optimal predictions often take values close to or equal to \(\{(0,1),(1,0)\}\) (even if neither \(f(\p)\) nor \(\p^*\) lie in \(\{(0,1),(1,0)\}\)), so the corresponding distances become very large or infinite. The fact that logit distances are bounded for the log score is an advantage of that scoring rule.

\subsection{Many outcomes}

\subsubsection{Inaccuracy and distance to fixed point are strongly correlated}

Throughout this paper we consider two measures of how wrong a prediction is: the inaccuracy, i.e., the distance of the performatively optimal report $\p$ to $f(\p)$, and the distance of the performatively optimal report to the fixed point. Our experiments show that these measures are closely but not perfectly correlated; see \Cref{fig:scatterplot-distfptounif-l2-inaccuracy-brier-it2}. The correlation is $0.958$.

\begin{figure}[H]
    \centering
    \includegraphics[width=0.6\linewidth]{scatterplot-disttofp-vs-inaccuracy-brier-it2.pdf}
    \caption{Scatter plot showing the L2 inaccuracy of the performatively optimal report against the L2 distance of the performatively optimal report to the fixed point report.}
    \label{fig:scatterplot-distfptounif-l2-inaccuracy-brier-it2}
\end{figure}

\subsubsection{The effect of fixed point location}
\label{appendix:experiments-many-outcomes-effect-of-fp-loc}

\Cref{fig:scatterplot-distfptounif-l2-disttofp-brier} scatter-plots the distance to fixed points against the distance of the fixed points from the uniform distribution. The blue line is the best linear fit, which is $0.0274 + 0.751 x$. Similarly, \Cref{fig:scatterplot-distfptounif-l2-inaccuracy-brier} scatter-plots the inaccuracy of the performatively optimal report against the distance of the fixed point report to the uniform distribution. The blue line is again given by the best linear fit, which is $0.0231 + 0.468 x$.

The overall effect of the distance of $\p^*$ from uniform actually seems larger than the effect of the operator norm, as indicated by the correlation coefficients in \Cref{table:two-by-two-correlation}.

\begin{figure}[H]
    \centering
    \includegraphics[width=0.6\linewidth]{scatterplot-distfptounif-l2-disttofp-brier-it2.pdf}
    \caption{Scatter plot showing the L2 distance of the performatively optimal report to the fixed point report against distance of the fixed point to the uniform distribution in our experiments. The blue line is found by linear regression on the points.}
    \label{fig:scatterplot-distfptounif-l2-disttofp-brier}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.6\linewidth]{scatterplot-distfptounif-l2-inaccuracy-brier-it2.pdf}
    \caption{Scatter plot showing the L2 inaccuracy of the performatively optimal report against the distance of the fixed point of $f$ to the uniform distribution in our experiments. The blue line is found by linear regression on the points.}
    \label{fig:scatterplot-distfptounif-l2-inaccuracy-brier}
\end{figure}

\begin{table}[H]
    \centering
    \begin{tabular}{ccc}
    \toprule
         & $\Vert \p - \p^*\Vert$ & $\Vert \p - f(\p) \Vert$ \\
        \midrule
        $\Vert f_A\Vert_{\mathrm{op}}$ & 0.294 & 0.311 \\
        $\Vert \p^*-\frac{1}{n}\mathbf{1}\Vert$ & 0.331 & 0.411 \\
        \bottomrule
    \end{tabular}
    \caption{Each entry shows the empirical correlation between the quantities determined by the row and column.}
    \label{table:two-by-two-correlation}
\end{table}

\section{Fixed points via alternative notions of optimality}
\label{appendix:alternative-notions-rationality}

In this section, we will review alternatives to performative optimality under which fixed points are incentivized. We will elaborate on the settings introduced in \Cref{stop-gradients} and provide formal statements and proofs.

To motivate the following, consider an expert AI that chooses its prediction to match its world model, but without explicitly considering the effect of its prediction. For instance, such cognition could arise in an AI trained via a purely supervised objective on historical data. This AI may not learn to take into account effects of its predictions on the outcome of the prediction. If it nevertheless has a world model that generalizes correctly to performative predictions, this could put the AI in a game in which it is trying to make a prediction to match its world model, while the world model updates its beliefs conditional on the AI's prediction. The only equilibria of this game would be fixed points.

Alternatively, fixed points could also result from different training schemes that explicitly optimize an AI's prediction to track empirical outcomes, without also incentivizing influencing the outcomes themselves, such as repeated risk minimization or repeated gradient descent \citep{perdomo2020performative}. %The corresponding training objective is effectively optimizing \(S(\p,f(\p))\), but with a stop-gradient \citep{foerster2018dice,demski2019partial} in front of the \(f(\p)\) term.

Such expert AIs would likely be safer than ones optimizing for performative optimality. First, they report their true beliefs, which gives us better information to base decisions on. This also enables approaches in which we ensure that there is only one safe fixed point. Second, they do not explicitly optimize the choice of fixed point for a goal such as decreasing entropy. Instead, which fixed point is chosen will be contingent on initialization and specifics of the fixed point finding procedure.

%If the oracle is choosing predictions in this way, this has two advantages. First, predictions are honest. All else equal, it appears as though it is preferable if predictions accurately reflect beliefs. Second, the chosen fixed points have not been chosen specifically according to some objective. Rather, the choice of fixed point is contingent on the specific method and hyperparameters such as intialization. While it would be better if fixed points were chosen according to aligned goals, we believe 



%Oracles with stop-gradients optimize the world to find fixed points, which could lead to bad outcomes. Thus, the safest oracles would be ones that make predictions only about aspects of the world they cannot influence.

%However, among oracles that can influence the world, ones with a stop-gradient are preferable for two reasons. First, oracles report their true beliefs, which allow decisions conditional on the oracle's predictions to be made with better information. Second, the agent does not optimize over which fixed point to choose, which is safer for the standard reasons of not wanting to optimize for an unaligned goal. Which fixed point is chosen will be contingent on initialization and specifics of the fixed point finding procedure.


%\color{black}
\subsection{Performative stability and game theory}
\label{appendix:performative-stability-and-game-theory}

We begin by defining performative stability and relating it to an equilibrium in a two-player game. This represents the core idea behind all of the following settings.
A prediction \(\p^*\) is called \emph{performatively stable} \citep{perdomo2020performative} if
\begin{equation}\label{eq:stability-appendix}\p^*\in \argmax_{\p}\Score(\p,f(\p^*)).\end{equation}
First, it is clear that in our case, this is equivalent to \(\p^*\) being a fixed point.
\begin{proposition}\label{performative-stable-fixed-point}
    Assume \(S\) is strictly proper. Then a prediction \(\p^*\) is a fixed point if and only if it is performatively stable.
\end{proposition}
\begin{proof}
``\(\Rightarrow\)''. Assume \(f(\p^*)=\p^*\). Then \(S(\p^*,f(\p^*))=S(\p^*,\p^*)\geq S(\p,\p^*)=S(\p,f(\p^*))\) for any \(\p\) since \(S\) is proper. Hence,
    \(\p^*\in \argmax_{\p}\Score(\p,f(\p^*)).\)
    
``\(\Leftarrow\)''. Assume \(\p^*\in \argmax_{\p}\Score(\p,f(\p^*))\). Then since \(S\) is strictly proper, it must be \(\p^*=f(\p^*)\).
\end{proof}

Next, the above objective is equivalent to the definition of a Nash equilibrium in the following game.

%\jt{give this game a name?}
%\RH{Oracle Decomposition Game?}

\begin{definition}[Oracle game]
Consider a two-player continuous game in which the first player controls \(\p\in \Pset\) and the second player controls \(\q\in\Pset\), with payoff functions \(U_1(\p,\q):=\Score(\p,\q)\) and \(U_2(\p,\q):=\Score(\q,f(\p))\) for the two players, respectively.
\end{definition}

If \(\p^*,\q^*\) is a Nash equilibrium of the oracle game, we have \(\p^*\in\argmax_\p \Score(\p,\q^*)\) and \(\q^*=\argmax_{\q} S(\q,f(\p^*))\). Substituting the optimal value \(\q^*=f(\p^*)\) for the second player gives us exactly the above definition of performative stability in \Cref{eq:stability-appendix}. Conversely, if a prediction \(\p^*\) is performatively stable, then setting \(\q^*:=f(\p^*)\) yields a Nash equilibrium.

\begin{proposition}\label{prop:fp-are-ne}
Assume \(S\) is a proper scoring rule. Then \(\p\in \Pset\), \(\q:=f(\p)\) is a Nash equilibrium of the oracle game if and only if \(\p\) is performatively stable. If \(S\) is strictly proper, then by \Cref{performative-stable-fixed-point}, this is equivalent to \(\p\) being a fixed point.
\end{proposition}


The oracle game could arise in an agent that uses a causal decision theory \citep{sep-decision-causal} to maximize its score and that believes that \(\Score\) is influenced causally by \(\p\), but only acausally by \(f(\p)\). In that case, the only \emph{ratifiable} \citep[][Ch.~1.7]{jeffrey1990logic,bell2021reinforcement} decision is a Nash equilibrium of the above game. Similarly, the deliberational causal epistemic decision theory discussed by \citet{greaves2013epistemic} would output Nash equilibria of this game (whereas performative optimality would correspond to an agent using evidential epistemic decision theory in this case).

Note that it is important that both players act simultaneously. \citet{perdomo2020performative} introduce a Stackelberg version of the oracle game that produces performatively optimal instead of performatively stable reports. Consider a game in which player \(1\) acts first and chooses \(\p\), after which player \(2\) responds with a prediction \(\q\). Then player \(2\) responds \(\q=f(\p)\) to player \(1\)'s action, and player \(1\)'s optimization problem becomes
\[\p^*=\argmax_{\p}S(\p,\argmax_{\q}\Score(\q,f(\p)))=\argmax_\p \Score(\p,f(\p)).\]


\subsection{Repeated risk minimization and repeated gradient descent}
\label{rrm-and-rgd}
Above, we have defined performative stability and a related game which yield fixed points, but we have not defined methods for solving these problems. In the performative prediction context, \citet{perdomo2020performative} introduce \emph{repeated risk minimization} and \emph{repeated gradient descent}, both methods that converge to performatively stable points. In this section, we review both schemes and show how repeated gradient descent can be seen as gradient descent on a \emph{stop-gradient} \citep{foerster2018dice,demski2019partial} objective.

We assume direct access to \(\q\), instead of having only access to samples distributed according to \(\q\). In the next section, we discuss online learning when we only have access to samples. One way to understand this distinction is that the former corresponds to the internal cognition of an agent with a belief \(\q=f(\p)\) optimizing a prediction \(\p\). The latter instead corresponds to a machine learning training setup for an oracle AI, where \(\q\) is the ground truth environment distribution instead of the oracle's belief. Of course, there is no strict divide between the two. Any optimization algorithm could be used either by the agent itself or to train the agent.


%\[\p_{t+1}\defeq\p_t + \alpha \E_{y\sim f(\p_t)}[\nabla_p\Score(\p_t,y)]=
%\p_t + \alpha \nabla_p\Score(\p,\bot f(\p))
%\p_t + \alpha\left. \frac{\partial \Score(\p,\q)}%{\partial p}\right|_{\q=f(\p)}.\]
%where \(\bot\) is the stop-gradient operator that sets the derivative with respect to its argument to zero \cite{foerster2018dice}.
%They show that in their setting, this converges to performatively stable points. This is repeated for stochastic optimization (TODO other related work). We repeat a similar analysis for our setting in the appendix.%We can describe this as taking a \emph{stop-gradient} in front of \(f(\p)\), \[\p_{t+1}:=\p_t+\alpha\nabla_p\Score(\p,\bot f(\p))=\p_t+\alpha

First, \emph{repeated risk minimization} is a procedure by which we start with a prediction \(\p_0\) and then iteratively update the prediction as \(\p_{t+1}=\argmax_{\p}S(\p,f(\p_{t}))\). This is also the same as alternating best response learning in the oracle game, where player~\(1\) iteratively updates their prediction, responding to predictions \(\q_{t}=f(\p_{t})\) from player \(2\). If \(S\) is strictly proper, \(\p_{t+1}=f(\p_{t})\), and this results in \emph{fixed point iteration} for \(f\). Fixed point iteration converges globally to a fixed point if \(f\) has Lipschitz constant \(L_f<1\). It also converges locally to a fixed point \(\p^*\) if \(f\) is continuously differentiable at \(\p^*\) and \(\rho(Df(\p^*))<1\), where \(\rho(Df(\p^*))\) is the spectral radius of the Jacobian matrix \(Df(\p^*)\).

Second, assume that \(\Score\) is differentiable. Then \emph{repeated gradient ascent} updates points via
\[\p_{t+1}:=\Pi_\Delta(\p_{t}+\alpha \E_{y\sim f(\p_{t})}[\nabla_\p S(\p_{t},y)]),\]
where \(\Pi_\Delta\) is the Euclidean projection onto the probability simplex \(\Pset\), and \(\alpha>0\) is the learning rate.

Using the definition of \(\Score(\p,\q)\), we have
\[\E_{y\sim f(\p_t)}[\nabla_\p\Score(\p_t,y)]
=\nabla_\p(\E_{y\sim\q}[\Score(\p_t,y)])|_{\q=f(\p_t)}
=\nabla_\p(\Score(\p_t,\q))|_{\q=f(\p_t)}.\]
We can express this as
\[\nabla_\p(\Score(\p_t,\bot f(\p_t))):=\nabla_\p(\Score(\p_t,\q))|_{\q=f(\p_t)},\]
where \(\bot\) is the \emph{stop-gradient operator}, which evaluates to the identity function but sets gradients to zero, \(\nabla_x( \bot x)=0\) \citep{foerster2018dice,demski2019partial}.\footnote{This is not a mathematical function (there is no function that is equal to the identity but has gradient zero everywhere), but rather a notational convention in reference to the \texttt{stop\_gradient} or \texttt{detach} functions from the tensorflow or pytorch python libraries. Interestingly, one can perform valid derivations using the stop-gradient operator (e.g., using the chain rule). We leave it to future work to explore the mathematics behind stop-gradients further.} In the following, we call \(S(\p, \bot f(\p))\) the \emph{stop-gradient objective}.

Importantly, it matters that the gradient in repeated gradient ascent lies inside instead of outside the expectation:
\[\E_{y\sim f(\p_t)}[\nabla_\p S(\p_t,y)]=\nabla_\p(S(\p_t,\bot f(\p_t)))\neq \nabla_\p(S(\p_t,f(\p_t)))=\nabla_\p (\E_{y\sim f(\p_{t})}[ S(\p_{t},y)]).\]
Unlike repeated gradient ascent, the latter implements gradient ascent on \(S(\p,f(\p))\) and thus leads to performatively optimal reports.

\citet{perdomo2020performative} show that, given their assumptions, repeated gradient descent globally converges to stable fixed points. They also provide convergence rates. We will show an analogous result relating repeated gradient ascent to fixed points in our setting, though we won't analyze global convergence or rates of convergence.

To begin, we show that repeated gradient ascent is equivalent to Naive Learning \citep{letcherstable} in the oracle game, assuming that player~\(2\) always plays \(\q=f(\p)\).
\begin{proposition}\label{prop:game-stop-gradient}
    Assume player \(1\) is performing gradient ascent on its objective with learning rate \(\alpha\), under the assumption that player \(2\) always plays \(\q=f(\p)\). Then player \(1\)'s update is
    \[\p_{t+1}=\Pi_\Delta(\p_t+\alpha\nabla_\p(\mathbf{S}(\p_t,\bot f(\p_t)))).\]
\end{proposition}

\begin{proof}The proof follows immediately from the definitions. Player \(1\)'s update is, by assumption,
    \[\p_{t+1}=\Pi_\Delta(\p_t+\alpha\nabla_\p(U_1(\p_t,\q)))=\Pi_\Delta(\p_t+\alpha\nabla_\p(\Score(\p_t,\q)))\]
    where \(\q\) is player \(2\)'s action. Assuming player \(2\) plays \(\q=f(\p_t)\), we get
\[\p_{t+1}=\Pi_\Delta(\p_t+\alpha\nabla_\p(\Score(\p_t,\q)))=
\Pi_\Delta(\p_t+\alpha\nabla_\p(\Score(\p_t,\bot f(\p_t)))).\]
\end{proof}

Next, we show that fixed points are critical points of the stop-gradient objective.

\begin{proposition}\label{stop-gradient-critical-points}
Assume \(S\) is proper and let \(G,g\) as in the Gneiting and Raftery characterization of \(S\) (\Cref{theorem:gneiting-raftery}) be differentiable. Then for any \(\p\in \Pset\), we have
\[\nabla_\p(S(\p,\bot f(\p)))= Dg(\p)^\top (f(\p)-\p).\]
In particular, if \(\p\) is a fixed point, it follows that \(\nabla_\p(\Score(\p,\bot f(\p)))=0\). The reverse is true if \(Dg(\p)|_{\TPset}\succ 0\).
\end{proposition}
\begin{proof}
    \begin{multline}
        \nabla_\p (\Score(\p,\bot f(\p)))
        =\nabla_\p(\Score(\p,\q))|_{\q=f(\p)}
        =\nabla_\p(G(\p)+g(\p)^\top (\q-\p))|_{\q=f(\p)}
        \\
        =(g(\p)+Dg(\p)^\top (\q-\p)-g(\p))|_{\q=f(\p)}
        =Dg(\p)^\top (f(\p)-\p).
    \end{multline}
    If \(\p\) is a fixed point, it follows that \(\nabla_\p (\Score(\p,\bot f(\p)))=0\). Moreover, if \(Dg(\p)|_{\TPset}\succ 0\), then if \(f(\p)-\p\neq 0,\) 
    \[  \nabla_\p (\Score(\p,\bot f(\p)))^\top (f(\p)-\p)
    =(f(\p)-\p)^\top Dg(\p)(f(\p)-\p)>0\]
    and thus \( \nabla_\p (\Score(\p,\bot f(\p)))\neq 0\).
\end{proof}

%Finally, note that \citet{perdomo2020performative} show that, repeated gradient ascent converges to performatively stable points and thus fixed points. We believe a similar result can be applied to our setting, but do not show this here.


Finally, we show that in our setting, repeated gradient ascent locally converges to fixed points \(\p^*\), assuming that \(\Vert Df(\p^*)\Vert_{\op}\) is sufficiently small. This is a local version of convergence results from \cite{perdomo2020performative}, adapted to our setting.


\begin{proposition}\label{prop:convergence-stop-gradient}
   Let \(S\) be a strictly proper scoring rule. Let \(\p^*\in\interior{\Pset}\) be a fixed point of \(f\) such that \(G\) is three times differentiable at \(\p^*\), i.e., \(D^2g(\p^*)=D^2\nabla G(\p^*)\) exists. Assume \(\beta\succeq Dg(\p^*)|_{\TPset}\succeq \gamma>0\), that \(f\) is differentiable at \(\p^*\), and \(\Vert Df(\p^*)\Vert_{\op}<\frac{\gamma}{\beta}\). Then, for small enough \(\alpha>0\), an agent taking updates \(\p_{t+1}=\Pi_{\Delta}(\p_t+\alpha \nabla_\p(\mathbf{S}(\p_t,\bot f(\p_t))))\) will locally converge to \(\p^*\).
\end{proposition}

For the proof, we use the following generalization of Ostrowski's theorem, adapted from \citet{Kitchen1966}.

\begin{theorem}[\cite{Kitchen1966}]\label{thm:kitchen}
Let \(\varphi\colon D\subseteq V\rightarrow W\) where \(V,W\) are Banach spaces. Assume
\begin{itemize}
\item \(\varphi\) has a fixed point \(\x^*\in \mathrm{int}(D)\)
\item \(\varphi\) is differentiable at \(\x^*\)
\item \(\rho(D\varphi(\x^*))<1\).
\end{itemize}
Then there exists an open set \(U\subseteq D\) with \(\x^*\in U\) such that, letting \(\x_0\in U\) and \(\x_t\defeq\varphi(\x_{t-1})\) for \(t\in\mathbb{N},\) we have \(\x_t\in U\) for all \(t\) and \(\lim_{t\rightarrow\infty}\x_t= \x^*.\)
\end{theorem}

\begin{proof}[Proof of \Cref{prop:convergence-stop-gradient}]
The Banach space we consider will be \(\TPset\). Note that, since \(\p^*\in \interior{\Pset},\) there exists an open set \(\D\subseteq\TPset\) (with respect to the standard topology on \(\TPset\)) with \(0\in \D\) such that \(\vec{v}+\p^*\in \Pset\) for all \(\vec{v}\in \D\). Our iteration function then is \[\varphi\colon \D\subseteq\TPset\rightarrow \TPset, \vec{v}\mapsto \vec{v}+\alpha\nabla_{\vec{v}}(\Score(\vec{v}+\p^*,\bot f(\vec{v}+\p^*))).\]


Note that \(\varphi\) has a fixed point at \(0\). Our goal is now to show that there exists \(\alpha>0\) and an open set \(U\subseteq\D\) such that iterates of \(\varphi\) starting in \(U\) stay in \(U\) and converge to \(0\).

To that end, note that, using \Cref{stop-gradient-critical-points}, we have \(\nabla_{\p}(\Score(\p,\bot f(\p)))=Dg(\p)^\top (f(\p)-\p)\) and thus
\begin{align}D\varphi(0)
&=D(\vec{v}\mapsto\vec{v}+\alpha\nabla_{\vec{v}}\mathbf{S}(\vec{v}+\p^*,\bot f(\vec{v}+\p^*)))(0)\\
&
=
\Id + \alpha D(\vec{v}\mapsto Dg(\vec{v}+\p^*)^\top(f(\vec{v}+\p^*)-\vec{v}-\p^*))(0)\\
&=\Id + \alpha D^2g(\p^*)[f(\p^*)-\p^*]
+ \alpha Dg(\p^*)^\top(Df(\p^*)-\Id).
\end{align}
Here, \(D^2g(\p^*)\) is a third-degree tensor, and \(D^2g(\p^*)[f(\p^*)-\p^*]\) is a linear map. Since \(f(\p^*)=\p^*\), it follows that
\(D\varphi(0)=\Id+\alpha Dg(\p^*)^\top (Df(\p^*)-\Id)\). In particular, \(\varphi\) is differentiable at \(0\).

Now let \(\vec{v}\) be an arbitrary eigenvector of \(D\varphi(0)\), with eigenvalue \(\lambda\) and w.l.o.g.\ assume \(\Vert\vec{v}\Vert=1\). Note that \(\vec{v}^\top Dg(\p^*)\vec{v}\geq \gamma \Vert \vec{v}\Vert^2 = \gamma\) and \(\vec{v}^\top Dg(\p^*)\vec{v}\leq \beta\Vert \vec{v}\Vert^2= \beta\) by assumption.
Letting \(\alpha:=\frac{1}{ \beta}\), it follows that \(\alpha \vec{v}^\top Dg(\p^*)\vec{v}\leq 1\) and thus
\[|1-\alpha \vec{v}^\top Dg(\p^*)^\top\vec{v}|
=|1-\alpha \vec{v}^\top Dg(\p^*)\vec{v}|
=1-\alpha \vec{v}^\top Dg(\p^*)\vec{v}
\leq 1-\alpha \gamma.
\]
% Hence, letting \(\alpha:=\frac{1}{2 \Vert Dg(\p^*)\Vert_{\op} }\), it follows that 
%\[\gamma\leq \vec{v}^\top Dg(\p^*)\vec{v}\leq \]
Moreover, since \(Dg(\p^*)\) is the Hessian of \(G\) and thus symmetric since \(G\) is twice differentiable, we have \(\Vert Dg(\p^*)\Vert_{\mathrm{op}}\leq \beta\).
Using this, as well as our assumption \(\Vert Df(\p^*)\Vert_{\op}<\frac{\gamma}{\beta}\), we get
\begin{multline}
    |\lambda|
    =\vert \lambda \vec{v}^\top\vec{v}\vert
    =\vert \vec{v}^\top D\varphi(0)\vec{v}\vert
    =\vert\vec{v}^\top (\Id+\alpha Dg(\p^*)^\top(Df(\p^*)-\Id))\vec{v}\vert
    \\
    =\vert\vec{v}^\top \vec{v}-\alpha \vec{v}^\top Dg(\p^*)^\top\vec{v}+\alpha \vec{v}^\top Dg(\p^*)^\top Df(\p^*)\vec{v}\vert
    \\
    \leq
    \vert 1-\alpha \vec{v}^\top Dg(\p^*)^\top\vec{v}\vert + \alpha \vert\vec{v}^\top Dg(\p^*)^\top Df(\p^*)\vec{v}\vert
    \\
    \underset{\text{Cauchy-Schwarz}}{\leq}
     1-\alpha \gamma+ \alpha \Vert Dg(\p^*) \vec{v}\Vert \Vert Df(\p^*)\vec{v}\Vert
     \\
    \leq
     1-\alpha \gamma + \alpha \Vert Dg(\p^*)\Vert_{\op} \Vert \vec{v}\Vert \Vert Df(\p^*) \Vert_{\op}\Vert\vec{v}\Vert
    \\ =
     1-\alpha \gamma + \alpha \Vert Dg(\p^*)\Vert_{\op}\Vert Df(\p^*)\Vert_{\op}
     < 1-\alpha \gamma + \alpha \gamma
     =1.
\end{multline}

%Now let \(\vec{v}_1,\dotsc,\vec{v}_{n-1}\in\TPset\) be an orthonormal eigenbasis of \(Dg(\p^*)\) with eigenvalues \(0<\gamma_1\leq  \dotsb\leq \gamma_{n-1}\) (noting that \(Dg(\p^*)\) is symmetric).
%Note that for any \(i,j\), we have
%\begin{equation}
%D\varphi(0)\vec{v}_i
%=(\Id+\alpha (Df(\p^*)-\Id)Dg(\p^*))\vec{v}_i
%=(1-\alpha\gamma_i)\vec{v}_i+\alpha \gamma_iDf(\p^*)\vec{v}_i
%\Vert
%\\\leq
%|1-\alpha\gamma_i|\Vert\vec{v}_i\Vert+\alpha \gamma_i\Vert Df(\p^*)\vec{v}_i\Vert
%\leq |1-\alpha\gamma_i| + \alpha\gamma_i \Vert Df(\p^*)\Vert_{\op}
%\end{equation}
%Letting \(\alpha:=\min_{i=1,\dotsc,n-1}\frac{1}{2\gamma_i}\) and noting \(\Vert Df(\p^*)\Vert_{\op}<1\), it follows for any \(i=1,\dotsc,n-1\) that
%\[
%\Vert D\varphi(0)\vec{v}_i\Vert
%\leq |1-\alpha\gamma_i| + \alpha\gamma_i\Vert Df(\p^*)\Vert_{\op}
%< |1-\alpha\gamma_i| + \alpha\gamma_i
%=1-\alpha\gamma_i + \alpha\gamma_i
%=1.
%\]
%This shows that  i.e., the singular  \(\rho(D\varphi(0))<1\). 

%Now let \(\vec{v}\) be any eigenvector of \(D\varphi(0)\) with eigenvalue \(\lambda\) and with \(\Vert \vec{v}\Vert=1\). We want to show that then \(|\lambda|<1\), given a small enough choice of \(\alpha\).
%Begin by letting \(\mu_1,\dotsc,\mu_{n-1}\) such that \(\vec{v}=\sum_{i=1}^{n-1}\mu_i\vec{v}_i\) and note that \(\sum_{i=1}^{n-1}\mu_i^2=\Vert\vec{v}\Vert=1\) by Parseval's identity.
%Now we have
%%\begin{multline}
  %% = \Vert\lambda \vec{v}\Vert
    %=\Vert D\varphi(0)\vec{v}\Vert
    %=\Vert \sum_{i=1}^{n-1}\mu_iD\varphi(0)\vec{v}_i\Vert
    %=\Vert \sum_{i=1}^{n-1}(1-\alpha\gamma_i)\mu_i\vec{v}_i+\alpha %\gamma_iDf(\p^*)\mu_i\vec{v}_i\Vert
   % =
    %\leq \sum_{i=1}^{n-1}|\mu_i|^2\Vert D\varphi(0)\vec{v}_i\Vert
    %<\sum_i |\mu_i|
%\end{multline}
This shows that \(\rho(D\varphi(0))<1\). Hence, by \Cref{thm:kitchen}, we can conclude that there exists an open set \(U\subseteq \D\) such that for arbitrary \(\vec{v}_0\in U\), \(\vec{v}_t:=\varphi(\vec{v}_{t-1})\in U\) for all \(t\geq 1\), and \(\lim_{t\rightarrow\infty}\vec{v}_t=0\). In particular, note that since \(\vec{v}_t\in U\) for all \(t\), \(\vec{v}_t+\p^*\in\Pset\) and
\[\p^*+\vec{v}_{t+1}=\p^* +\vec{v}_t+\alpha\nabla_{\vec{v}}(\Score(\vec{v}_t+\p^*,\bot f(\vec{v}_t+\p^*)))
=
\Pi_\Delta(\p^*+\vec{v}_t+\alpha\nabla_{\vec{v}}(\Score(\vec{v}_t+\p^*,\bot f(\vec{v}_t+\p^*))))
\]
for all \(t\). Hence, setting \(\p_t:=\p^*+\vec{v}_t\), it follows
\(\p_{t+1}=\Pi_{\Delta}(\p_t+\alpha \nabla_\p(\mathbf{S}(\p_t,\bot f(\p_t))))\)
for all \(t\) and \[\lim_{t\rightarrow\infty}\p_t
=\p^* + \lim_{t\rightarrow\infty}\vec{v}_t
=\p^*.\]
This concludes the proof.
\end{proof}

\subsection{Online learning}
\label{appendix:online-learning}
Now consider a machine learning setup in which we train an oracle with stochastic gradient ascent on environment samples. We assume that at time \(t\), a model makes a prediction \(\Pvar_t\) and receives a score \(S(\Pvar_t,\Y_t)\), where \(\Y_t\sim f(\Pvar_t)\). The model is then updated using gradient ascent on \(\Score (\Pvar_t,\Y_t)\). That is, for some learning rate schedule \((\alpha_t)_t\), we have
\[\Pvar_{t+1}=\Pi_\Delta(\Pvar_t+\alpha_t\nabla_\p\Score(\Pvar_t,\Y_t)),\]
where $\Pi_\Delta$ is the Euclidean projection onto $\Delta(\mathcal N)$ as before.

We discuss this as a theoretical model for oracles trained using machine learning, to show how training setups may incentivize predicting fixed points. There are many issues with the setting beyond giving accurate predictions; for instance, learning may fail to converge at all, and even if the training process sets the right incentives on training examples, the learned model may be optimizing a different objective when generalizing to new predictions \citep{hubinger2019risks}.

To see that this setting leads to fixed points, note that we have \[\E_{\Y_t\sim f(\Pvar_t)}[\nabla_\p\Score(\Pvar_t,\Y_t)]=\nabla_\p \E_{\Y_t\sim \bot f(\Pvar_t)}[\Score(\Pvar_t,\Y_t)]
=\nabla_\p( \Score(\Pvar_t,\bot f(\Pvar_t))).\]
That is, the expectation of this gradient, conditional on \(\Pvar_t,\) is exactly the repeated gradient from the previous section. Hence, given the right assumptions, this converges to fixed points instead of performative optima. We do not show this here, but an analogous result in performative prediction was proved by \citet{mendler2020stochastic}.


There are several variations of this setup that essentially set the same incentives. For instance, one could also draw entire batches of outcomes \(\Y_{t,1:B}\) and then perform updates based on the batch gradient \(\nabla_\p\sum_{b=1}^BS(\Pvar_t,\Y_{t,b}).\) This is a Monte Carlo estimate of the repeated gradient and hence also converges to performatively stable points and thus fixed points \citep{perdomo2020performative}. One could also mix the two algorithms and, e.g., perform gradient ascent on an average of past losses, yielding a version of the backwards-facing oracle discussed in \citet{armstrong2018standard}.




\begin{comment}
Assume \(\Pvar_{t+1}= \Pvar_t+\alpha_t\nabla_pS(\Pvar_t,Y_t)\) converges in \(\ell^2\). THen 
Assume that . Then \(\p\) is a  fixed point. To see this, note that
\[\E[\Vert \Pvar_t - \Pvar_{t+1}\Vert^2]
=\E[\Vert\alpha_t\nabla_p S(\Pvar_t,Y_t)\Vert^2]
\]



\RH{We should clarify if we proposing this as the start of an actual training process, the way Armstrong proposes backward-facing oracles, or just as a mathematical result. Either way we should add some of the issues that might arise.}
\jt{Which issues do you mean? Woudl linking to the oracle post be sufficient? Immediate issues I see:\\
- learning fails to converge at all (added below)\\
- inner alignment failure when doing training (added below)\\
- issues with hidden incentives (I discuss them below also)
- optimization occurs for finding fixed points, which is still weird (added this as a general point)}

\jt{here, I need to mention the whole performative prediction issue. basically this section should be more about explaining that model, instead of my own results (?)}

Now we consider an online learning setup in which at each time step \(t\), an agent makes a prediction \(P_t\), we sample an outcome \(Y_t\) using a Bernoulli distribution with parameter \(f(P_{t})\), and the agent receives loss \(L_t(P_t):=-S(P_t,Y_t)\).
We assume the agent's prediction \(P_{t+1}\) is trained to minimize the loss it would have gotten historically, given all past environment outcomes (and \(P_1\) is initialized arbitrarily). That is, we minimize the loss
\[\mathcal{L}_T(p):=\frac{1}{T}\sum_{t=1}^TL_{t}(p)
=\frac{1}{T}\sum_{t=1}^T -S(p,Y_{t}).
\]
%\ec{minor: I think we either need to start the $(P_t)$ at $t=1$ or change the sum here to start at 0 and have $1/(t+1)$, otherwise $P_1$ isn't well-defined.}
%\jt{I changed it to P_1 being initialized randomly}

%\jt{write that the issue where this jumps out or something like that doesn't play a role in the following}

This is a version of the backwards-facing oracle discussed in \href{https://www.alignmentforum.org/posts/hJaJw6LK39zpyCKW6/standard-ml-oracles-vs-counterfactual-ones}{Armstrong (2018)}.
We consider this setting here as a theoretical model for oracles trained using machine learning to show that such training setups may incentivize predicting fixed points. There are many issues with the setting beyond giving accurate predictions \jt{link to oracle post}; for instance, learning may fail to converge at all, and even if the training process sets the right incentives on training examples, the learned model may be \href{https://www.alignmentforum.org/s/r9tYkB2a8Fp4DN8yB}{optimizing a different objective} when generalizing to new predictions.

We consider training via gradient descent and let \(P_{t+1}=P_{t}-\alpha\nabla\mathcal{L}_{t}(P_t)\) for some learning rate \(\alpha.\) We assume in the following that learning converges to a point \(p^*\in (0,1)\) and that \(\alpha\) is chosen small enough to avoid stepping outside of the interval \((0,1)\). In practice, this is unlikely to be an issue; e.g., one could train log-odds instead of probabilities (which would not change the substance of our results).


We show that, if learning converges, the gradient with respect to the stop-gradient objective is zero at \(p^*\), and thus \(p^*\) is a fixed point by Proposition~\ref{prop:stop-gradient}. This is because we are not differentiating through \(f(P_t)\), as we are taking the gradient with respect to losses \(-S(p,Y_t)\), where \(Y_t\) is the fixed outcome of a random variable; in particular, we are \emph{not} differentiating an expectation \(\mathbb{E}[S(P_t,Y_t)]\). As a result, we get a stop-gradient formulation rather than an oracle that optimizes \(\mathbf{S}(p,f(p))\) directly.


%\ec{What do we do if this would be outside of (0,1)? The obvious thing would be to project to the nearest point in [0,1], but this seems problematic for our next update since the gradient of the loss need not exist on the boundary, right?}


\begin{proposition}\label{prop:ML-oracle}
    Let \((P_t)_{t\in\mathbb{N}}\) be predictions, where
    \(P_{t+1}=P_{t}-\alpha\nabla\mathcal{L}_t(P_t)\)
    and assume that \(P_t\) converges to some prediction \(p^*\in (0,1)\) almost surely. Assume that \(f\) is continuous and that \(\partial_1 S(p,y)\) exists and is continuous for any \(p\in (0,1)\) and \(y\in \{0,1\}\). Then \(\nabla_p(\mathbf{S}(p^*, \bot f(p^*)))=0\).
\end{proposition}


\begin{proof}
To begin, note that since \(p^*\in(0,1)\) we can choose a closed interval \(I\subseteq (0,1)\) such that \(p^*\in \mathrm{int}(I)\). By assumption, \(\partial_1 S(p,y)\) is continuous and thus bounded for \(p\in I,y\in \{0,1\}\).
For each $t$, let $Q_t$ be the projection of $P_t$ onto $I$.
Finally, let $\mathcal{L}(p) = -\mathbf{S}(p,f(p^*))$.

We will show the following:
\begin{enumerate}
    \item[(1)] $\mathbb{E}\left[\nabla \mathcal{L}_t(Q_t)\right] \rightarrow 0$ as $t \rightarrow \infty$
    \item[(2)] for all $p \in (0,1)$, $\mathbb{E} \left[ \nabla \mathcal{L}_t(p) \right] \rightarrow \nabla \mathcal{L}(p)$ as $t \rightarrow \infty$
    \item[(3)] $\mathbb{E} \left[ \nabla \mathcal{L}_t(Q_t) \right] \rightarrow \nabla \mathcal{L}(p^*)$ as $t \rightarrow \infty$
\end{enumerate}
from which it follows that $\nabla \mathcal{L}(p^*) = 0$.

\textbf{(1) $\mathbb{E}\left[\nabla \mathcal{L}_t(Q_t)\right] \rightarrow 0$ as $t \rightarrow \infty$}.

Note that since $\nabla \mathcal{L}_t(P_t) = \frac{P_{t+1}-P_t}{\alpha}$, and $P_t \xrightarrow{\text{a.s.}} p^*$ as $t \rightarrow \infty$, $\nabla \mathcal{L}_t(P_t) \xrightarrow{\text{a.s.}} 0$ as $t \rightarrow \infty$. Moreover, we have that almost surely $P_t \in I$ for $t$ sufficiently large, so that almost surely $P_t = Q_t$ and $\nabla \mathcal{L}_t(P_t) = \nabla \mathcal{L}_t(Q_t)$ for $t$ sufficiently large. Thus, almost surely,
\[\lim_{t\rightarrow \infty} \nabla \mathcal{L}_t(Q_t) = \lim_{t\rightarrow \infty} \nabla \mathcal{L}_t(P_t) = 0.\]

Finally, since $\nabla \mathcal{L}_t(Q_t)$ is bounded, we have by the dominated convergence theorem that $\nabla \mathcal{L}_t(Q_t) \xrightarrow{L^1} 0$ and as a consequence:
\[\lim_{t\rightarrow \infty}\mathbb{E}\left[\nabla \mathcal{L}_t(Q_t)\right] = 0 \]

\textbf{(2) for all $p \in (0,1)$, $\mathbb{E} \left[ \nabla \mathcal{L}_t(p) \right] \rightarrow \nabla \mathcal{L}(p)$ as $t \rightarrow \infty$.}

We have that
\begin{align*}
    \mathbb{E} \left[ \nabla \mathcal{L}_t(p) \right]&= -\frac{1}{t}\sum_{j=1}^t \mathbb{E}\left[\partial_1 S(p,Y_t)\right]
    = -\frac{1}{t}\sum_{j=1}^t \mathbb{E}\left[\mathbb{E}\left[\partial_1 S(p,Y_t)\middle\vert P_t\right]\right]\\
    &= -\frac{1}{t}\sum_{j=1}^t \mathbb{E}\left[(1-f(P_t))\partial_1 S(p,0) + f(P_t) \partial_1 S(p,1) \right]\\
    &= -\left(1 - \frac{\sum_t \mathbb{E}f(P_t)}{t}\right) \partial_1 S(p,0) - \frac{\sum_t \mathbb{E}f(P_t)}{t} \partial_1 S(p,1)
\end{align*}

Since $f$ is continuous, we have $f(P_t) \xrightarrow{\text{a.s.}} f(p^*)$. Then, by compactness, we have that $f$ is bounded on $[0,1]$. Finally, by the dominated convergence theorem, we may conclude $\mathbb{E} f(P_t) \rightarrow f(p^*)$ as $t \rightarrow \infty$. As a consequence, $ \frac{1}{t}\sum_{j\leq t} \mathbb{E} f(P_j) \rightarrow f(p^*)$ as $t \rightarrow \infty$.

Thus,
\[\lim_{t\rightarrow \infty}\mathbb{E} \left[ \nabla \mathcal{L}_t(p)\right] =
 -(1-f(p^*))\partial_1 S(p,0) - f(p^*)\partial_1 S(p,1) = -\partial_1\mathbf{S}(p,f(p^*)) = \nabla  \mathcal{L}(p)\]

\textbf{(3) $\mathbb{E} \left[ \nabla \mathcal{L}_t(Q_t) \right] \rightarrow \nabla \mathcal{L}(p^*)$ as $t \rightarrow \infty$.}

Note that
\[|\nabla \mathcal{L}_t(Q_t) - \nabla \mathcal{L}_t(p^*)|
\leq \max_{y}\left\vert\partial_1 S(Q_t,y) - \partial_1 S(p^*,y)\right\vert
\xrightarrow{\text{a.s.}} 0\]
as $t \rightarrow \infty$, since $\partial_1 S(p,1)$ and $\partial_1 S(p,0)$ are both continuous functions of $p$ on $I$.

Finally, by the dominated convergence theorem, and our second result:
\[\lim_{t\rightarrow \infty}\mathbb{E} \left[ \nabla \mathcal{L}_t(Q_t)\right] = \lim_{t\rightarrow \infty}\mathbb{E} \left[ \nabla \mathcal{L}_t(p^*)\right] = \nabla \mathcal{L}(p^*)\]
And we are done.
\end{proof}

\begin{corollary}
    Let \(S\) be a strictly proper scoring rule and let \(G,g\) as in the Gneiting and Raftery characterization of \(S\) (see Theorem~\ref{thm:gneiting}). Assume \(G\) is twice differentiable at any \(p\in (0,1)\), \(f\) is continuous, and \(P_t\) as defined above converges to some prediction \(p^*\in (0,1)\). Then \(p^*\) is a fixed point of \(f\).
\end{corollary}
\begin{proof}
Note that, using the derivation from Proposition~\ref{prop:stop-gradient}, it is \(\partial_1 S(p,y)=\partial_1\mathbf{S}(p,y)=g'(p)(y-p).\) Thus, \(\partial_1S(p,y)\) is continuous for \(y\in \{0,1\},p\in (0,1)\), since \(G''=g'\) is continuous by assumption. Hence, by Proposition~\ref{prop:ML-oracle}, it follows that \(\nabla_p(\mathbf{S}(p^*, \bot f(p^*)))=0.\)
By Proposition~\ref{prop:stop-gradient}, it then follows that \(p^*\) is a fixed point of \(f\).
\end{proof}

\end{comment}

Note that finding fixed points depends on the fact that we differentiate \(S(\Pvar_t,\Y_t)\) instead of the expectation
\(\E_{\Y_t\sim f(\Pvar_t)}[\Score(\Pvar_t,\Y_t)]=\Score(\Pvar_t,f(\Pvar_t))\). If we used policy gradients to differentiate \(\Score(\Pvar_t,f(\Pvar_t))\), for instance, we would again optimize for performative optimality. Similarly, we could learn a Q-function representing scores for each prediction, and update the function based on randomly sampled predictions \(\p\). Then the Q-function would converge to estimates of \(\Score(\p,f(\p))\), and the highest Q-value prediction would be a performative optimum. There are also some more recent results in performative prediction that explicitly try to estimate the gradient \(\nabla_\p(\Score(\p,f(\p)))\) and thus find performatively optimal instead of stable points \citep{izzo2021learn}.

Stop-gradients could also be circumvented in a hidden way \citep{krueger2020hidden}. For instance, consider a hyperparameter search to meta-learn a learning algorithm, where the evaluation criterion is the accumulated score during an episode. Then this search would prefer algorithms that optimize \(\Score(\p,f(\p))\) directly, without a stop-gradient.

Lastly, repeated gradient descent is related to \emph{decoupled approval} in RL \citep{uesato2020avoiding}. The decoupled approval policy gradient samples actions and approval queries independently and can thus differentiate with a stop-gradient in front of the approval signal. In our setting, we can differentiate through \(S(\Pvar_t,\Y_t)\) directly, so it is not necessary to calculate this gradient with a decoupled policy gradient. Decoupled gradients could be used to implement the stop-gradient objective if scores were discrete or otherwise not differentiable.

\subsection{No-regret learning}
\label{appendix:no-regret}
In this section, we consider no-regret learning and show that algorithms have sublinear regret if and only if their prediction error is sublinear. Regret takes environment outcomes as given and asks which predictions would have been optimal in hindsight. It thus corresponds to an alternative notion of optimality with a ``stop-gradient'' in front of environment probabilities.

As in the previous section, we assume that at time \(t\in\mathbb{N},\) the agent (i.e., the oracle AI) makes a prediction \(\Pvar_t\) and receives a score \(S(\Pvar_t,\Y_t)\), where \(\Y_t\sim f(\Pvar_t)\). The agent's cumulative score at step \(T\) is defined as \(\sum_{t=1}^T\Score(\Pvar_t,\Y_t)\). In no-regret learning, we compare performance against \emph{experts}, which choose sequences of probabilities \((\Pvar'_t)_{t},\) \(\Pvar'_t\in \Pset\). We assume that an expert's prediction \(\Pvar_t'\) is independent of \(\Y_t\) conditional on \(\Pvar_t\). I.e., an expert knows the predictions \(\Pvar_t\) and thus probabilities \(f(\Pvar_t)\), but it does not know the outcome of \(\Y_t\). Let \(\mathcal{P}\) be the set of all such experts.


The regret of the agent is the difference between the cumulative score received by the best expert in expectation and the cumulative score received by the agent. To define it formally, let
\[\Pvar^*_t\in \argmax_{(\Pvar'_t)_t\in\mathcal{P}}\mathbb{E}[S(\Pvar'_t,\Y_t)\mid \Pvar_t]\] for \(t\in\mathbb{N}\). \(\Pvar^*_t\) is a random variable that maximizes the expectation of \(S(\Pvar^*_t,\Y_t)\) before \(\Y_t\) is drawn, but conditional on \(\Pvar_t\).
%\RH{Is this the expecation before $Y_t$ is drawn, but conditional on the selection of $P_t$? Think so, but should clarify the timing.}\jt{ This is just the expectation period, i.e., over all randomness, including in the \(P_t\). But maybe we can exclude \(P_t\). Gonna change.}. 
\begin{definition}[Regret]
The regret of agent $(\Pvar_t)_{t}$ at time \(T\) is
\begin{equation*}
\mathrm{Regret}(T) \coloneqq \sum_{t=1}^T S(\Pvar_t^*,\Y_t) - S(\Pvar_t,\Y_t).
\end{equation*}
The agent is said to have \emph{sublinear regret} or \emph{no-regret} if
\[\limsup_{T\rightarrow\infty}\frac{1}{T}\mathrm{Regret}(T)\leq 0.\]
\end{definition}

First, note that we define regret relative to the best expert in expectation instead of the best expert in hindsight. The latter would always be the one that made confident predictions and accidentally got all predictions exactly right. We are interested in algorithms with sublinear regret, and for that purpose it would be too much to ask the agent to perform well compared to the best expert in hindsight. Moreover, for scoring rules that are symmetric between the outcomes, this expert would have a constant score \(C\). This would imply that $\text{Regret}(T) = \sum_{t=1}^T C - S(\Pvar_t,\Y_t)$ and reduce the problem to minimizing the negative score, which would lead to performatively optimal predictions. 

Second, we evaluate the performance of the expert with respect to the environment outcomes \(\Y_t\) generated by the agent \((\Pvar_t)_t\), instead of evaluating the expert according to outcomes \(\tilde{\Y}_t\sim f(\Pvar^*_t)\) generated using the expert's own predictions. This means that, to receive sublinear regret, the agent only has to make accurate predictions—it does not have to find a performatively optimal prediction. This is different from the no-regret learning setup discussed in \citet{pmlr-v162-jagadeesan22a}, where regret is defined with respect to \(S(\Pvar^*_t,f(\Pvar^*_t))\). In that setting, only agents converging to performatively optimal predictions have sublinear regret.

We begin by showing that the best expert in expectation actually exists, and that \(\Pvar^*_t=f(\Pvar_t)\). 
\begin{proposition}\label{prop:f-of-p-optimal}
Let \(S\) be a proper scoring rule and \((\Pvar_t')_t\in\mathcal{P}\) an expert. Then for any \(t\in\mathbb{N}\), we have
\[\mathbb{E}[S(\Pvar_t',\Y_t)]=\mathbb{E}[\Score (\Pvar_t',f(\Pvar_t))].\]
 Moreover, we have \((\Pvar^*_t)_t=(f(\Pvar_t))_t\) and thus 
\[\mathrm{Regret}(T)
=\sum_{t=1}^TS(f(\Pvar_t),\Y_t)-S(\Pvar_t,\Y_t).\]
\end{proposition}
\begin{proof}
Let \(t\in\mathbb{N}\) and let \((\Pvar_t')_t\in\mathcal{P}\) be any expert. Conditional on \(\Pvar_t\), \(\Y_t\sim f(\Pvar_t)\) and \(\Y_t\) is independent of \(\Pvar_t'\) by assumption. Hence,
\[
\mathbb{E}\left[S(\Pvar_t',\Y_t) \right]
= 
\mathbb{E}\left[ \mathbb{E}[S(\Pvar_t',\Y_t)\mid \Pvar_t,\Pvar_t'] \right]
= 
\mathbb{E}\left[\Score (\Pvar_t',f(\Pvar_t)) \right].
\]

Next, since \(S\) is proper, 
\[
\mathbb{E}\left[ \Score (\Pvar_t',f(\Pvar_t)) \right]
\leq 
\mathbb{E}\left[ \Score(f(\Pvar_t),f(\Pvar_t))\right].
\]
It follows that
\[\max_{(\Pvar'_t)_t\in\mathcal{P}}
\mathbb{E}\left[ S(\Pvar_t',\Y_t) \right]
=
\max_{(\Pvar'_t)_t\in\mathcal{P}}
\mathbb{E}\left[ \Score(\Pvar_t',f(\Pvar_t)) \right]
\leq
\mathbb{E}\left[ \Score(f(\Pvar_t),f(\Pvar_t))\right]
=
\mathbb{E}\left[ S(f(\Pvar_t), \Y_t)\right].\]
Moreover, \((f(\Pvar_t))_t\in\mathcal{P}\), as \(f(\Pvar_t)\) is constant given \(\Pvar_t\) and thus independent of \(\Y_t\).

It follows that, for any \(t\in\mathbb{N},\) \(f(\Pvar_t)\in\argmax_{(\Pvar_t')_t\in \mathcal{P}}\E[S(\Pvar'_t,\Y_t)]\), so that we can choose \(\Pvar^*_t=f(\Pvar_t)\), and thus 
\[\Regret(T)=\sum_{t=1}^TS(f(\Pvar_t),\Y_t)-S(\Pvar_t,\Y_t).\]
\end{proof}

\subsubsection{Characterization of regret in the limit}

If \(S\) is unbounded (such as the log scoring rule), then the agent's scores can become arbitrarily low, and the limit of \(\frac{1}{T}\mathrm{Regret}(T)\) may be undefined. To simplify our analysis, we will thus assume that there is a bound on the variance of the received score \(S(\Pvar'_t,\Y_t)\) and on the expected score \(\Score(\Pvar'_t,f(\Pvar_t))\) of both the agent, \(\Pvar'_t=\Pvar_t\), and the best expert, \(\Pvar'_t=\Pvar^*_t\). In the case of the log scoring rule, this would be satisfied, for instance, if the agent's predictions are bounded away from the boundary of the probability simplex.

Our next proposition shows that, given these assumptions, \(\lim_{T\rightarrow\infty}\frac{1}{T}\mathrm{Regret}(T)\) exists and is nonnegative, and having sublinear regret is equivalent to 
\(\lim_{T\rightarrow\infty}\frac{1}{T}\mathrm{Regret}(T)=0.\)

\begin{proposition}\label{prop:slln}
Let \(S\) be a proper scoring rule. Assume that \(\sup_t |\Score(\Pvar'_t,f(\Pvar_t))|<\infty\) and that \(\sup_t \mathrm{Var}(S(\Pvar'_t,\Y_t))<\infty\) for \(\Pvar'_t\in\{\Pvar_t,f(\Pvar_t)\}\). Then almost surely
\[\lim_{T\rightarrow\infty}\frac{1}{T}\mathrm{Regret}(T)=\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\Score(f(\Pvar_t),f(\Pvar_t)) - \Score(\Pvar_t,f(\Pvar_t))\geq 0.\]
In particular, almost surely both limits exist and are finite, and the agent has sublinear regret if and only if
\[
\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\Score(f(\Pvar_t),f(\Pvar_t)) - \Score(\Pvar_t,f(\Pvar_t))=0.\]
\end{proposition}
%\jt{comment somewhere on infinite regret, not bounded, etc.}

%Here, the regret of an algorithm is the quantity
%\[\text{Regret}(T):=\max_p\sum_{t'=1}^t L_{t'}(p)-\sum_{t'=1}^t L_{t'}(p_t),\]
%i.e., the difference between the loss achieved by the optimal prediction in hindsight (leaving the environment sampled using \(f(p_t)\) unchanged) and the algorithm's achieved loss. In this case, a prediction \(p\) has no regret, i.e., \(\lim_{T\rightarrow\infty}\frac{\text{Regret}(T)}{T}=0\) if it is a fixed point, and if a fixed point exists, it is the only prediction with no regret.

\begin{proof}
We will use a version of the strong law of large numbers for uncorrelated random variables with bounded variance, adapted from \citet[Theorem 2]{Neely2021}.

\begin{theorem}[\cite{Neely2021}, Theorem 2]\label{thm:slln}
    Let \(\{X_t\}_{t\in\mathbb{N}_0}\) be a sequence of pairwise uncorrelated random variables with mean \(0\) and bounded variances. I.e., assume that
    \begin{enumerate}
        \item \(\mathbb{E}[X_t]=0\) for all \(t\in\mathbb{N}_0\)
        \item There exists \(c>0\) such that \(\mathrm{Var}(X_t)\leq c\) for all \(t\in\mathbb{N}_0\)
        \item \(\mathrm{Cov}(X_t,X_{t'})=0\) for all \(t\neq t'\in\mathbb{N}_0\).
    \end{enumerate}
    Then almost surely
    \[\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^TX_t=0.\]
\end{theorem}
We will apply this law to random variables
\(X_t:=S(\Pvar'_t,\Y_t)-\Score(\Pvar'_t,f(\Pvar_t))\), where \(\Pvar'_t\) is either \(\Pvar_t\) or \(f(\Pvar_t)\).

First, by \Cref{prop:f-of-p-optimal}, \(\mathbb{E}[X_t]=\mathbb{E}[S(\Pvar'_t,\Y_t)-\Score(\Pvar'_t,f(\Pvar_t))]=0\). Second, by assumption, \[\sup_t\mathrm{Var}(S(\Pvar'_t,\Y_t))<\infty.\] Hence, also 
\[\sup_t\mathrm{Var}(\Score(\Pvar'_t,f(\Pvar_t)))
=
\sup_t\mathrm{Var}(\mathbb{E}[S(\Pvar'_t,\Y_t)\mid \Pvar_t])
\leq\sup_t\mathrm{Var}(S(\Pvar'_t,\Y_t))
<\infty.
\]
It follows that also \(\sup_t \mathrm{Var}(X_t)<\infty\).

Third, we know that \(\Y_t\) is independent of \(\Pvar_{t'}\) and \(\Y_{t'}\) for \(t>t'\), conditional on \(\Pvar_t\). Moreover, \(\Pvar_t'\) is constant given \(\Pvar_t\). Hence, given \(\Pvar_t\), also \(X_t=S(\Pvar'_t,\Y_t)-\Score(\Pvar'_t,f(\Pvar_t))\) is independent of \(X_{t'}\). Moreover,
\[
\mathbb{E}[X_t\mid \Pvar_t]
=\mathbb{E}[S(\Pvar'_t,Y_t)-\Score(\Pvar'_t,f(\Pvar_t))\mid \Pvar_t]=
\Score(\Pvar'_t,f(\Pvar_t))-\Score(\Pvar'_t,f(\Pvar_t))=0.
\]
It follows for \(t>t'\) that
\[\mathrm{Cov}(X_t,X_{t'})
=\mathbb{E}[X_t X_{t'}]
=\mathbb{E}[\mathbb{E}[X_t X_{t'}\mid \Pvar_t]]
=\mathbb{E}[\mathbb{E}[X_t \mid \Pvar_t]
\mathbb{E}[X_{t'}\mid \Pvar_t]]=0.\]

This shows all conditions of the theorem and thus
\[\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^TX_t=0\]
almost surely.

Now we turn to the limit of \(\frac{1}{T}\sum_{t=1}^T\Score(\Pvar_t',f(\Pvar_t))\). By assumption, \(\sup_t|\Score(\Pvar'_t,f(\Pvar_t))|<\infty\), so this limit exists and is finite. Thus, almost surely
\[
\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\Score(\Pvar'_t,f(\Pvar_t))
=
\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^TS(\Pvar'_t,Y_t)-X_t
\]
\[
=\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^TS(\Pvar'_t,Y_t)
-\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^TX_t
=\lim_{T\rightarrow\infty}
\frac{1}{T}\sum_{t=1}^TS(\Pvar'_t,Y_t).
\]

Using \Cref{prop:f-of-p-optimal}, it follows that almost surely
\[\lim_{T\rightarrow\infty}\frac{1}{T}\mathrm{Regret}(T)
=\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^TS(f(\Pvar_t),Y_t)-S(\Pvar_t,Y_t)
=\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^TS(f(\Pvar_t),Y_t)-\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^TS(\Pvar_t,Y_t)
\]
\[
=\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\Score(f(\Pvar_t),f(\Pvar_t))-\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\Score(\Pvar_t,f(\Pvar_t))
=\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\Score(f(\Pvar_t),f(\Pvar_t))-\Score(\Pvar_t,f(\Pvar_t)).
\]

Turning to the ``in particular'' part, note that this limit is finite by the above, and it is nonnegative since \(S\) is assumed to be proper. Moreover, it follows that almost surely
\[
\limsup_{T\rightarrow\infty}\frac{1}{T}\Regret(T)
=\lim_{T\rightarrow\infty}\frac{1}{T}\Regret(T)\geq 0.
\]
Thus, almost surely \(\limsup_{T\rightarrow\infty}\frac{1}{T}\Regret(T)\leq 0\) if and only if \(\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\Score(f(\Pvar_t),f(\Pvar_t))-\Score(\Pvar_t,f(\Pvar_t))=0.\)
This concludes the proof.
\end{proof}


\subsubsection{Sublinear regret \(\Leftrightarrow\) sublinear prediction error}
Now we turn to the main result of this section. We show that given our assumptions, agents have sublinear regret if and only if their prediction error is sublinear. Note that here, we do \emph{not} require the \(\Pvar_t\) to converge; they could also oscillate between different fixed points.

\begin{theorem}\label{prop:no-regret-fp}
    Let $(\Pvar_t)_t$ be the sequence of the agent's predictions and \(S\) a strictly proper scoring rule.
    Assume that \(\sup_t \mathrm{Var}(S(\Pvar'_t,\Y_t))<\infty\) for \(\Pvar'_t\in\{\Pvar_t,f(\Pvar_t)\}\), and assume that there exists a closed set \(\mathcal{C}\subseteq\Pset\) such that \(\Pvar_t\in\mathcal{C}\) for all \(t\) and \(\Score(\p,f(\p))\), \(\Score(f(\p),f(\p)),\) and \(f(\p)\) are continuous in \(\p\) at any \(\p\in \mathcal{C}\). Then almost surely the agent has sublinear regret if and only if \(\sum_{t=1}^T \Vert f(\Pvar_t)-\Pvar_t\Vert \) is sublinear, i.e., if $\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T \Vert f(\Pvar_t)-\Pvar_t\Vert=0$.
\end{theorem}



%Note that the converse doesn't hold if $s$ isn't bounded, because the expert may occasionally make very bad predictions like falsely predicting increasingly extreme probabilities against a log scoring rule. 

%Note that if at some point the agent predicts a probability of $1$ or $0$, and then the outcome is $0$ or $1$, respectively, and, for example, the log-scoring rule is used, then the regret immediately becomes (and forever stays) infinite. So we need to rule that case out.


To show the result, we begin by proving an analytic lemma.

\begin{lemma}\label{lem:lemma-analysis}
Let \(\varphi,\psi\colon\mathbb{N}\rightarrow[0,\infty)\) and assume there exists a constant \(C>0\) such that for all \(t\in\mathbb{N},\) we have \(\psi(t)\leq C.\) Assume that for any \(\epsilon>0\), there exists \(\delta>0\) such that if \(\psi(t)>\epsilon\) for any \(t\in\mathbb{N}\), then \(\varphi(t)>\delta\). Then
\[\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\varphi(t)=0
\Rightarrow
\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\psi(t)=0.\]
\end{lemma}
\begin{proof}
We prove the contrapositive. That is, we assume that there exists some constant \(c>0\) such that there are infinitely many \(T\in\mathbb{N}\) such that \(\frac{1}{T}\sum_{t=1}^T\psi(t)>c\). Let \(\mathcal{T}\) be the set of such \(T\). We show that then there exists a constant \(c'>0\) such that for infinitely many \(T\), \(\frac{1}{T}\sum_{t=1}^T\varphi(t)>c'\).

Let \(T\in\mathcal{T}\). Since by assumption
\(\frac{1}{T}\sum_{t=1}^T\psi(t)>c,\) it follows that
\(\sum_{t=1}^T\frac{\psi(t)}{c}>T.\)
Let \(C':=\max\{C,1/c\}+1\) and \(\epsilon:=\frac{c}{2C'}\). Since \(\psi(t)\leq C<C'\) for all \(t\), it must be \(\psi(t)>\epsilon\) for more than an \(\epsilon\) fraction of the times \(t\leq T\). Otherwise, since \(C'>1\), it would be
\[
\sum_{t=1}^T\frac{\psi(t)}{c}
\leq T\left(\epsilon\frac{C'}{c} + \left(1-\epsilon\right) \frac{\epsilon}{c}\right)
\leq T\left(\frac{1}{2} + \frac{1}{2C'}\right)<T,
\]
which is a contradiction.

By assumption, there exists a \(\delta>0\) such that whenever \(\psi(t)>\epsilon\), also \(\varphi(t)>\delta\). In particular, this applies to more than an \(\epsilon\) fraction of \(t\leq T\). Note that \(\epsilon\) and \(\delta\) do not depend on \(T\). Hence, it follows that for any \(T\in\mathcal{T}\),
\[
\sum_{t=1}^T\varphi(t)
\geq
\delta \epsilon T.
\]
This shows that there are infinitely many \(T\) such that \(\frac{1}{T}\sum_{t=1}^T\varphi(t)>\delta \epsilon\) and thus concludes the proof.
\end{proof}


%Next we show 
%\(\frac{1}{T}\sum_{t=1}^T\Score(f(P_t),f(P_t))-\Score(P_t,f(P_t))\rightarrow 0\)
%if and only if
%\(\frac{1}{T}\sum_{t=1}^T S(f(P_t), Y_t)-S(P_t,Y_t)\rightarrow 0\).

%We know:
%\(\frac{1}{T}\sum_{t=1}^T S(f(P_t), Y_t)-\Score(f(P_t),f(P_t))\).
%goes to zero, and also
%\(\frac{1}{T}\sum_{t=1}^T\Score(P_t,f(P_t))-S(P_t,Y_t)\)
%goes to zero.
%Hence, 
%\[\lim \frac{1}{T}\sum_{t=1}^T\Score(f(P_t),f(P_t))-\Score(P_t,f(P_t))
%=
%\lim \frac{1}{T}\sum_{t=1}^T\Score(f(P_t),f(P_t))-\Score(P_t,f(P_t))+ \frac{1}{T}\sum_{t=1}^T S(f(P_t), Y_t)-\Score(f(P_t),f(P_t))
%=
%\]

%\jt{just need to add \(\Pvar\), otherwise it is good}
\begin{proof}[Proof of \Cref{prop:no-regret-fp}]
%Let $(\Pvar_t)_t$ be the sequence of the agent's predictions. Assume $S$ is strictly proper,   assume that \(\sup_t \mathrm{Var}(S(P'_t,Y_t))<\infty\) for \(P'_t\in\{P_t,f(P_t)\},\) and assume that there exists a compact set \(\mathcal{C}\subseteq\Pset\) such that \(P_t\in\mathcal{C}\) for all \(t\), and \(\Score(p,f(p)),\) \(\Score(f(p),f(p)),\) and \(f(p)\) are continuous in \(p\) at any \(p\in \mathcal{C}\).

To begin, note that since \(\mathcal{C}\subseteq\Pset\) is closed and \(\Pset\) compact, also \(\mathcal{C}\) is compact. Hence, continuity of \(\Score(\p,f(\p))\) and \(\Score(f(\p),f(\p))\) implies that both are also bounded on \(\mathcal{C}\) and thus \(\sup_t|\Score(\Pvar'_t,f(\Pvar_t))|<\infty\) for \(\Pvar'_t\in \{\Pvar_t,f(\Pvar_t)\}\). Hence, by our assumptions, the conditions for \Cref{prop:slln} are satisfied.

``\(\Rightarrow\)''.
Assume \(\mathrm{Regret}(T)\) is sublinear. We want to show that then $\sum_{t=1}^T \Vert f(\Pvar_t)-\Pvar_t\Vert $ is sublinear. To do this, we will apply \Cref{lem:lemma-analysis}.

To begin, define \(\varphi(t):=\Score(f(\Pvar_t),f(\Pvar_t))-\Score(\Pvar_t,f(\Pvar_t))\) and note that \(\varphi(t)\geq 0\) since \(S\) is proper. By \Cref{prop:slln}, it follows that if \(\Regret(T)\) is sublinear, also \(\sum_{t=1}^T\varphi(t)\) is sublinear almost surely. For brevity, we omit the ``almost surely'' qualification in the following. 

Next, define \(\psi(t):=\Vert f(\Pvar_t)-\Pvar_t\Vert \), and note that \(0\leq \psi(t)\leq n\). Next, let \(\epsilon>0\) arbitrary. To apply \Cref{lem:lemma-analysis} to \(\varphi\) and \(\psi\), it remains to show that there exists \(\delta>0\) such that whenever \(\psi(t)\geq\epsilon\), then \(\varphi(t)\geq \delta\).

To that end, let 
\[\delta:=\min_{\{\p\in\mathcal{C}\mid \Vert\p-f(\p)\Vert\geq\epsilon\}}\Score(f(\p),f(\p))-\Score(\p,f(\p)).\]
Since \(\Vert \cdot\Vert\) is continuous and \(f\) is continuous at any \(\p\in\mathcal{C}\), the set \(\{\p\in\mathcal{C}\mid \Vert \p-f(\p)\Vert\geq \epsilon\}\) is closed and thus compact. Moreover, \(\Score(f(\p),f(\p))\) and \(\Score(\p,f(\p))\) are continuous by assumption, and thus the minimum is attained at some point \(\hat{\p}\in\mathcal{C}\). But since \(S\) is strictly proper, it follows \(\delta=\Score(f(\hat{\p}),f(\hat{\p}))-\Score(\hat{\p},f(\hat{\p}))>0.\)
Hence, since \(\Pvar_t\in \mathcal{C}\) for any \(t\in\mathbb{N},\) whenever \(\psi(t)\geq \epsilon,\) it follows that \[\varphi(t)=\Score(f(\Pvar_t),f(\Pvar_t))-\Score(\Pvar_t,f(\Pvar_t))\geq \delta.\]

This shows all conditions for \Cref{lem:lemma-analysis}. Hence, we conclude that \(\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\Vert f(\Pvar_t)-\Pvar_t\Vert =0\).


``\(\Leftarrow\)''.
Let \(\varphi(t):=\Vert f(\Pvar_t)-\Pvar_t\Vert \) and \(\psi(t):=\Score(f(\Pvar_t),f(\Pvar_t))-\Score(\Pvar_t,f(\Pvar_t))\). We assume that \(\sum_{t=1}^T\varphi(t)\) is sublinear in \(T\) and want to show that then \(\mathrm{Regret}(T)\) is sublinear as well. To do so, we will show that \(\sum_{t=1}^T\psi(t)\) is sublinear using our lemma, and then the required statement follows again from \Cref{prop:slln}.

Now we have to show the conditions of the lemma. First, as before, \(\varphi(t)\geq 0\) and \(\psi(t)\geq 0.\) Second, as noted in the beginning, we have \(\sup_{t}\psi(t)<\infty\) by our assumption that \(\Score(f(\p),f(\p))\) and \(\Score(\p,f(\p))\) are continuous on \(\mathcal{C}.\) Now let \(\epsilon>0\) be arbitrary. Assume that \(\psi(t)=\Score(f(\Pvar_t),f(\Pvar_t))-\Score(\Pvar_t,f(\Pvar_t))>\epsilon\) for some \(t\in\mathbb{N}.\)

Consider the set \(\mathcal{C}':=\{\p\in\mathcal{C}\mid \Score(f(\p),f(\p))-\Score(\p,f(\p))\geq\epsilon\}\). Since \(\Score(f(\p),f(\p))\) and \(\Score(\p,f(\p))\) are continuous on \(\mathcal{C}\) by assumption, this set is compact. Moreover, the function \(\p\in\mathcal{C}\mapsto \Vert \p-f(\p)\Vert\) is continuous since \(f\) is continuous on \(\mathcal{C}\) by assumption. Hence, the minimum
\(\delta:=\min_{\p\in \mathcal{C}'}\Vert \p-f(\p)\Vert\)
is attained at some point \(\hat{\p}\in\mathcal{C}'.\)

Now, if \(\delta=0,\) we would have \(\hat{\p}=f(\hat{\p})\) and thus
\[\Score(f(\hat{\p}),f(\hat{\p}))-\Score(\hat{\p},f(\hat{\p}))=
\Score(\hat{\p},\hat{\p})-\Score(\hat{\p},\hat{\p})
=0<\epsilon,\] which is a contradiction. Hence, \(\delta>0.\) Since \(\Pvar_t\in\mathcal{C},\) it follows from \(\Score(f(\Pvar_t),f(\Pvar_t))-\Score(\Pvar_t,f(\Pvar_t))\geq\epsilon,\) for \(t\in\mathbb{N}\) that \(\Pvar_t\in\mathcal{C}'\) and thus \(\Vert\Pvar_t-f(\Pvar_t)\Vert\geq \delta.\)
This shows the third condition for the lemma. We can thus conclude that \(\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\psi(t)=0.\) Using \Cref{prop:slln}, this concludes the proof.
\end{proof}


\subsubsection{Convergence to fixed points}

The next result shows that if the agent's predictions converge to some distribution $\p$, then $\p$ must be a fixed point. %Otherwise, the agent incurs linear regret to the expert who constantly predicts $p$. Conversely, if $p$ is a fixed point, then converging to $p$ is sufficient to achieve sublinear regret.

\begin{corollary}
    \label{cor:no-regret-converge}
   In addition to the assumptions from \Cref{prop:no-regret-fp}, assume that \(\Pvar_t\) converges almost surely to a limit \(\lim_{t\rightarrow\infty} \Pvar_t=\p^*\). Then almost surely \(\p^*\) is a fixed point if and only if the agent has sublinear regret.
\end{corollary}

\begin{proof}
By \Cref{prop:no-regret-fp}, almost surely the agent has sublinear regret if and only if
\[\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T \Vert f(\Pvar_t)-\Pvar_t\Vert =0.\]
It remains to show that, given that the \(\Pvar_t\) converge, the latter is equivalent to convergence to a fixed point.

Since \(\mathcal{C}\) is compact and \(\Pvar_t\in\mathcal{C}\) for all \(t\in\mathbb{N},\) also \(\p^*\in\mathcal{C}.\) Hence, \(f\) is continuous at \(\p^*,\) so \[\Vert f(\p^*)-\p^*\Vert 
=\left\Vert f\left(\lim_{t\rightarrow\infty}\Pvar_t\right)-\lim_{t\rightarrow\infty}\Pvar_t\right\Vert
=\lim_{t\rightarrow\infty}\Vert f(\Pvar_t)-\Pvar_t\Vert .\]
Since this sequence converges, its limit is equal to the limit of its  %\href{https://proofwiki.org/wiki/Cesàro_Mean}{
Cesàro means,
\[\lim_{t\rightarrow\infty}\Vert f(\Pvar_t)-\Pvar_t\Vert
=\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\Vert f(\Pvar_t)-\Pvar_t\Vert.\]
Hence,
\[
\Vert f(\p^*)-\p^*\Vert
=\lim_{t\rightarrow\infty}\Vert f(\Pvar_t)-\Pvar_t\Vert
=\lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\Vert f(\Pvar_t)-\Pvar_t\Vert
.\]
It follows that, if \(\lim_{t\rightarrow\infty}\Pvar_t=\p^*,\) then \[\Vert f(\p^*)-\p^*\Vert =0\Leftrightarrow \lim_{T\rightarrow\infty}\frac{1}{T}\sum_{t=1}^T\Vert f(\Pvar_t)-\Pvar_t\Vert=0.\]

This shows that, almost surely, \(\p^*\) is a fixed point, if and only if \(\sum_{t=1}^T\Vert f(\Pvar_t)-\Pvar_t\Vert \) is sublinear.
\end{proof}



\subsection{Prediction markets}

\label{appendix:prediction-markets}
Lastly, we consider prediction markets. We assume a simplified model of a prediction market, in which traders submit a single prediction and get scored using a proper scoring rule. The prediction that is output by the market and that influences the outcome is just a weighted average of the individual traders' predictions. In this situation, if a trader has a small weight and can thus barely influence the market prediction, the trader's score will mostly be determined by the accuracy of the report, rather than the influence of the report on the market. Thus, if all traders are small relative to the market, the equilibrium prediction will be close to a fixed point.

A similar result was shown by \citet{hardt2022performative} in the performative prediction context. They define a firm's performative power as the degree to which the firm can influence the overall outcome with their prediction. \citeauthor{hardt2022performative} show that in an equilibrium, the distance between a player's (performatively optimal) equilibrium strategy and their strategy when optimizing loss against the fixed equilibrium distribution (here, this means predicting the market probability) is bounded by the power of the trader. We give an analogous result for our formal setting and assumptions.

To formalize the setting, assume that there are $N$ players. We associate with each player $n\in [N]$ a number $w_n\in[0,1]$ s.t.\ $\sum_nw_n=1$, representing, intuitively, what fraction of the overall capital in the market is provided by player $n$. In the game, all players simultaneously submit a probability distribution $\p_n$. Then the event \(\Y\) is sampled according to the distribution $\q=f(\sum_nw_n\p_n)$. Finally, each player is scored in proportion to
$S(\p_n,\Y)$ for some strictly proper scoring rule $S$. Typical market scoring rules would consider terms like $S(\p_n,\Y)-S(\p_{n-1},\Y)$, but subtracting $S(\p_{n-1},\Y)$ (or multiplying by constants) does not matter for the game. We assume that players maximize their expected score, \(\E[S(\p_n,\Y)]=\Score(\p_n,f(\sum_mw_m\p_m))\).

%\jt{need to do ref management here}
For discussions of market scoring rules, see %\href{https://link.springer.com/article/10.1023/A:1022058209073}{Hanson 2003}
\citet{Hanson2003}
and %\href{https://www.algo.cs.uni-frankfurt.de/lehre/agt/material/Algorithmic_Game_Theory.pdf#page=672}{Sami and Pennock 2007}
\citet{Pennock2007}.
Prior work has connected these market scoring rules to more realistic prediction markets that trade Arrow--Debreu securities, such as PredictIt %(\href{https://link.springer.com/article/10.1023/A:1022058209073}{Hanson 2003}; \href{http://www.ubplj.org/index.php/jpm/article/view/417/448}{Hanson 2007}; \href{https://www.algo.cs.uni-frankfurt.de/lehre/agt/material/Algorithmic_Game_Theory.pdf#page=672}{Pennock and Sami 2007}, Ch.\ 4; \href{https://arxiv.org/abs/1206.5252}{Chen and Pennock 2007}; \href{https://dl.acm.org/doi/pdf/10.1145/1566374.1566412}{Agrawal et al.\ 2009}; \href{https://dl.acm.org/doi/pdf/10.1145/1807342.1807372}{Chen and Vaughan 2010}).
%Note: these references are all copied from the decision scoring rules paper, without looking at them again.
[e.g., \citealp{Hanson2003}; %\citealp{Hanson2007};
\citealp{Pennock2007}, Section 4; \citealp{Chen2007}; \citealp{Agrawal2009}%; \citealp{Chen2010}].
].

We assume that $f$ is common knowledge. Moreover, in the following we only consider pure strategy equilibria, and we do not investigate the existence of equilibria. %[TODO: does this game always have pure strategy equilibria?]

\begin{theorem}\label{thm:market}
    Let \(S\) be a proper scoring rule and let \(G,g\) be as in the Gneiting and Raftery characterization of \(S\). Let $(\p_n)_n$ be a pure strategy Nash equilibrium of the aforedefined game and let \(\hat{\p}:=\sum_n w_n\p_n\) be the market prediction. Assume \(f\) is differentiable at \(\hat{\p}\). For any player \(n\), if \(G,g\) are differentiable at \(\p_n\) and \(Dg(\p_n)\succ \gamma_{\p_n},\) it follows that
    \begin{equation*}
        \left\Vert f\left(\hat{\p}\right)-\p_n\right\Vert\leq \frac{w_n\Vert Df\left(\hat{\p}\right)\Vert_{\op}\Vert g(\p_n)\Vert}{\gamma_{\p_n}}.
    \end{equation*}
\end{theorem}

In particular, this theorem shows that players $n$ with very low $w_n$ (little capital/influence on $\q$) will accurately predict \(\q=f(\hat{\p})\). Note, however, that $\hat{\p}$ is not necessarily a fixed point or close to a fixed point. If there are also players $n$ with very high $w_n$, then their prediction and the overall market prediction may be wrong. (So interestingly the overall market probability $\hat{\p}=\sum_nw_n\p_n$ is worse than the prediction of individuals. One might take this to suggest that anyone interested in $\q$ should look at the latter type of predictions. Of course, if this is what everyone does, it is not so clear anymore that the model $\q=f(\sum_nw_n\p_n)$ is accurate.)

\begin{proof}The proof is analogous to that of \Cref{theorem:Caspar-approx-fix-point}.
Let \((\p_n)_n\) be a pure strategy Nash equilibrium and \(\hat{\p}:=\sum_mw_m\p_m\). Each player must play a best response to the other players' strategies, so $\p_n$ must be a global maximum of the function $\varphi\colon\p_n\mapsto S(\p_n,f(\sum_mw_m\p_m))$.
Hence, it must be \(\nabla\varphi(\p_n)^\top(f(\hat{\p})-\p_n)\leq 0\), i.e., the directional derivative of \(\varphi\) in the direction \(f(\hat{\p})-\p_n\) must be at most zero. Otherwise, player \(n\) could improve their score by changing their prediction marginally towards \(f(\hat{\p})\).


Computing the gradient, we have
\begin{eqnarray*}
    \nabla_{\p_n}\left(S\left(\p_n,f\left(\sum_m w_m\p_m\right)\right)\right)
    &=& \nabla_{\p_n}\left(G(\p_n)+g(\p_n)^\top\left(f\left(\sum_mw_m\p_m\right)-\p_n\right)\right)\\
    &=& g(\p_n)+Dg(\p_n)^\top\left(f\left(\sum_mw_m\p_m\right)-\p_n\right)+ w_n Df(\hat{\p})^\top g(\p_n) - \Id g(\p_n)
    \\
    &=& Dg(\p_n)^\top(f(\hat{\p})-\p_n)+w_nDf(\hat{\p})^\top g(\p_n).
\end{eqnarray*}
It follows
\begin{align}&0\geq \nabla\varphi(\p_n)^\top (f(\hat{\p})-\p_n)=
(f(\hat{\p})-\p_n)^\top Dg(\p_n) (f(\hat{\p})-\p_n)+w_ng(\p_n)^\top Df(\hat{\p}) (f(\hat{\p})-\p_n)\\
\Rightarrow&
-w_ng(\p_n)^\top Df(\hat{\p}) (f(\hat{\p})-\p_n)
\geq (f(\hat{\p})-\p_n)^\top (Dg(\p_n)) (f(\hat{\p})-\p_n).
\end{align}
Using that \(Dg(\p_n)|_{\TPset}\succ \gamma_{\p_n}\) and thus \((f(\hat{\p})-\p_n)^\top (Dg(\p_n)) (f(\hat{\p})-\p_n) \geq \gamma_{\p_n}\Vert f(\hat{\p})-\p_n\Vert^2\), it follows that
\begin{eqnarray*}
    && \gamma_{\p_n}\Vert f(\hat{\p})-\p_n\Vert^2\\
    &\leq & (f(\hat{\p})-\p_n)^\top Dg(\p_n) (f(\hat{\p})-\p_n) \\
    &\leq& - w_ng (\p_n)^\top Df(\hat{\p})(f(\hat{\p})-\p_n)\\
    &\leq& w_n\vert g(\p_n)^\top Df(\hat{\p})(f(\hat{\p})-\p_n)\vert\\
    &\underset{\text{Cauchy-Schwarz}}{\leq}&w_n \Vert g(\p_n)\Vert \Vert Df(\hat{\p})(f(\hat{\p})-\p_n)\Vert\\
    &\leq& w_n\Vert g(\p_n)\Vert \Vert Df(\hat{\p})\Vert_{\mathrm{op}}\Vert f(\hat{\p})-\p_n\Vert.
\end{eqnarray*}
The result follows by dividing by $\gamma_{\p_n}\Vert f(\hat{\p})-\p_n\Vert $ (if $\Vert f(\hat{\p})-\p_n\Vert =0$, the bound holds trivially).

\end{proof}

\begin{corollary}
In addition to the assumptions from \Cref{thm:market}, assume that $f$ is Lipschitz-continuous and \(C:=\sup_{\p\in\Pset}\frac{\Vert g(\p)\Vert }{\gamma_\p}<\infty.\) Let \((\p_n)_n\) be a Nash equilibrium and let $\epsilon>0$ be arbitrary. Then there exists a $\delta>0$ such that if $w_n<\delta$ for all $n$, then all of $\p_n$ and $f(\p_n)$, for all $n$, as well as $\sum_mw_m\p_m$ and $f(\sum_mw_m\p_m)$ are within $\epsilon$ of each other.
\end{corollary}
\begin{proof}
Let \(\epsilon>0\) be arbitrary. Let \(L_f\) be the Lipschitz constant of \(f\) and note that then \(\Vert Df(\p)\Vert_{\op}\leq L_f\) for all \(\p\in\Pset.\) 
By \Cref{thm:market}, it follows for \(\hat{\p}:=\sum_mw_m\p_m\) and any player \(n\) that
\[\Vert f(\hat{\p})-\p_n\Vert \leq w_nL_fC.\]
Now let \(\lambda:=\min(\{1,\frac{1}{L_f}\})\) and \(\delta:=\frac{\epsilon\lambda}{4CL_f}\), and assume \(w_n<\delta\) for all \(n\in[N]\). Then it follows
\[\Vert f(\hat{\p})-\p_n\Vert \leq \delta L_fC\leq \frac{\lambda}{4}\epsilon.\]
Moreover, since \(\hat{\p}\) is a convex combination of probabilities \(\p_n\), it follows that
\[\Vert f(\hat{\p})-\hat{\p}\Vert \leq \max_{n}\Vert f(\hat{\p})-\p_n\Vert \leq \frac{\lambda}{4}\epsilon.\]
Thus, by the triangle inequality, we have \(\Vert \p_n-\hat{\p}\Vert \leq \frac{2\lambda}{4}\epsilon\), and since \(f\) is Lipschitz-continuous,
\[\Vert f(\hat{\p})-f(\p_n)\Vert \leq L_f\Vert \hat{\p}-\p_n\Vert \leq L_f\frac{2\lambda}{4}\epsilon \leq \frac{1}{2}\epsilon\]
for any \(n\in[N]\).

This shows that all of \(\p_n,\hat{\p},f(\p_n)\) are within \(\epsilon/2\) of \(f(\hat{\p})\) and thus by the triangle inequality within \(\epsilon\) of each other.
\end{proof}


It would be interesting to extend these results. For example, it is unclear what happens when players make predictions \textit{repeatedly}. (To keep things simple, one should probably still imagine that all players know $f$ and that the environment probability is determined by $f$ applied to the majority forecast. If the traders have private information, prediction markets become harder to analyze. For some discussions, see \cite{ostrovsky2009information}, \cite{chen2016informational}.)




\bibliography{refs}

\end{document}
