% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{./uai_camera_ready/uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
\usepackage{xr-hyper} 

%% Self-defined macros
\usepackage{preamble}
\input{newMacros}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \addFileDependency{<file>}: announce <file> (name including its extension)
% as a build dependency of this document. It writes "(<file>)" to the
% terminal/log with \typeout, registers the name through \@addtofilelist and,
% when the file cannot be found, additionally prints "No file <file>.".
% The definition is wrapped in \makeatletter/\makeatother because
% \@addtofilelist has an @ in its name.
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
\typeout{(#1)}% latexmk will find this if $recorder=0
% however, in that case, it will ignore #1 if it is a .aux or 
% .pdf file etc and it exists! If it doesn't exist, it will appear 
% in the list of dependents regardless)
%
% Write the following if you want it to appear in \listfiles 
% --- although not really necessary and latexmk doesn't use this
%
\@addtofilelist{#1}
%
% latexmk will find this message if #1 doesn't exist (yet)
\IfFileExists{#1}{}{\typeout{No file #1.}}
}\makeatother

% \myexternaldocument{<name>}: call \externaldocument{<name>} (presumably the
% xr-hyper command, since xr-hyper is loaded above) and declare both
% <name>.tex and <name>.aux as file dependencies via \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}
% Register the external document named `dummy`.
\myexternaldocument{dummy}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\bibliographystyle{abbrvnat}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)


\title{Risk-limiting Financial Audits via Weighted Sampling without Replacement \\ (Supplementary Material)}
% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<shubhan2@andrew.cmu.edu>?Subject=Your UAI 2023 paper}{Shubhanshu Shekhar}{}}
\author[1]{Ziyu Xu}
\author[2, 3]{Zachary Lipton}
\author[3]{Pierre Liang}
\author[1, 2]{Aaditya Ramdas}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics and Data Science\\
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Machine Learning Department\\
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[3]{%
    Tepper School of Business\\
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}


 
\begin{document}
\onecolumn
\maketitle
 
\appendix
% \input{tex/appendix}
\section*{Organization}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We discuss some additional background material in~\Cref{appendix:additional-background}, and then formulate the proofs omitted from the body of the paper for \Cref{theorem:oracle-strategy} and \Cref{prop:control-variates-1} in \Cref{sec:Proofs}. 
% We discuss a practical auditing consideration, where one may wish to estimate the remaining misstated fraction as opposed to the total misstated fraction, $m^*$, in \Cref{sec:alt-defs}.
In \Cref{sec:hoeffding-empirical-bernstein}, we provide Hoeffding and empirical-Bernstein style CSs that are an alternative to the betting CSs we provide in the paper. In \Cref{sec:HoefEBComparison}, we then use the Hoeffding CS constructed in the aforementioned section as an example of the fact that previous CSs for unweighted mean estimation with uniform sampling from \citet{waudby2020confidence,waudby2020estimating} can be recovered by our method for a specific choice of $(\lambda_t)$. Empirical results from simulations are shown for these CSs in \Cref{sec:HoefEBExperiments}, in which we compare the Hoeffding and empirical-Bernstein CSs to the betting CS discussed in the main body of the paper. Finally, we end by presenting the results of applying our methods on a real-world housing dataset in~\Cref{appendix:housing-data}. 

The code for reproducing the results of this paper is available here: \url{https://github.com/sshekhar17/WeightedWoRConfSeq}. 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Additional Background} 
\label{appendix:additional-background} 

    \subsection{Related Work on Confidence Sequences~(CS)}
    \label{appendix:related-work-CS}
        Confidence sequences~(CSs) are a fundamental tool in sequential analysis, and were introduced into this literature by Robbins and coauthors in a series of papers starting with~\citep{darling1967confidence}. Some other important early works in this area include~\citep{lai1976confidence} and~\citep{jennison1989interim}.
        % 
        More recently, there has been a resurgence of interest in confidence sequences, particularly motivated by its applications in anytime valid inference. In anytime valid inference, data samples are received sequentially, and the goal is to derive statistical guarantees that are valid even if one stops sampling and performs inference at a data-dependent time \citep{johari2015always, johari2017peeking, howard2021time, howard2022sequential}. Thus, confidence sequences can also be employed in the multi-armed bandit setting (where one can sample adaptively from multiple streams of data) to enable best arm identification \citep{jamieson2014best}. 
        % 
        Most of the papers mentioned above rely on making certain moment assumptions on the data-generating distributions. An important line of recent work aims to relax these assumptions, by  constructing confidence sequences for heavy-tailed data or contaminated data~\citep{wang2022catoni, bhatt2022catoni, mineiro2022lower, wang2023huber}. 
    
    \subsection{Betting-based CS construction}
    \label{appendix:betting-cs-construction}
        The betting-based approach for constructing confidence sequences builds upon the idea of \emph{testing-by-betting}, popularized recently by~\citet{shafer_testing_betting_2021}. This principle states that we can refute a claim~(equivalently, a null hypothesis $H_0$) about the probability distribution generating some data stream, if we can increase our wealth by repeatedly betting on the observations with the restriction that the betting payoffs are \emph{fair} under $H_0$. The restriction of fair payoff implies that the bettor is not expected to make large gains under $H_0$, irrespective of the betting strategy employed~(formally, the wealth process is a non-negative supermartingale). Consequently, if the bettor ends up making a large profit by betting on the observations, this can be considered as evidence against $H_0$, with the relative growth in wealth providing a precise measure of the strength of evidence. 

        To use the above principle for constructing CSs, we simultaneously play a continuum of betting games, indexed by $m \in [0,1]$, each with an initial wealth of $\$1$. For every $m \in [0,1]$, we bet against the claim $H_{0,m}$ that the true misstated fraction $m^*$ is equal to $m$. We design the payoff functions of this betting game, such that the resulting wealth process, $\{W_t(m): t \geq 1\}$, is a non-negative martingale if $H_{0,m}$ were true, but grows at an exponential rate otherwise. Due to this property, the process at  $m^*$, denoted by $\{W_t(m^*): t \geq 1\}$,  is actually a \emph{test martingale}; that is, a nonnegative martingale with an initial value $1$. Hence, Ville's inequality~(recalled below in~\Cref{fact:ville}) implies that with probability at least $1-\alpha$, the process $(W_t(m^*))$ never exceeds the value $1/\alpha$. This fact suggests a natural definition of a CS for $m^*$, consisting of sets $C_t = \{m: W_t(m)< 1/\alpha\}$, since these sets contain $m^*$ for all $t \geq 1$, with probability at least $1-\alpha$.
        To conclude, the betting-based approach breaks the task of constructing confidence sequences into two smaller tasks: 
        \begin{enumerate}
            \item Choosing a sequence of payoff functions that are fair under $H_{0,m}$: we achieve this by using the idea of importance weighting. 
            \item Developing a betting strategy that ensures fast growth of the wealth process for all $m \neq m^*$: we use the \kelly strategy for this in our construction. 
        \end{enumerate}
        An additional design choice that is unique to the problem studied in this paper, is that of the \emph{sampling strategy} to select the transaction indices. We discuss several strategies in~\Cref{sec:sampling-strategies}, whose performance is determined by the availability and accuracy of side-information. 

        We end this discussion by recalling a statement of Ville's inequality~\citep{ville1939etude}. 
        \begin{fact}[Ville's Inequality]
            \label{fact:ville} 
            Suppose $\{M_t: t \geq 0\}$ denotes a nonnegative supermartingale adapted to a filtration $\{\mc{F}_t: t \geq 0\}$. Then, for any $\alpha >0$, we have 
            \begin{align}
                \mathbb{P}\lp \exists t \geq 0: M_t \geq 1/\alpha \rp \leq \alpha\, \mathbb{E}[M_0].  
            \end{align}
        \end{fact}

    \subsection{Working with minibatches}
    \label{appendix:minibatch}
        In this paper, we have developed our methodology under the assumption that the transactions are sampled and sent to the human auditor, one at a time. In practical scenarios, it may be preferable for the human auditor to evaluate a minibatch of transactions at a time, rather than querying the transactions one-by-one. This generalization can be easily handled by updating the wealth process with the averaged payoff (over the minibatch in each round). More specifically, we can proceed as follows, for any $m \in [0,1]$, and for $t=1, 2, \ldots$: 
        \begin{itemize}
            \item Calculate the next sampling distribution, $q_t$. 
            \item Sample the next batch of transactions, $\mc{B}_t \defined \{I_t^{(1)}, \ldots, I_t^{(B)}\}$, where $I_t^{(j)}$ is drawn according to $q_t$ restricted to $\mc{N}_t \setminus \{I_t^{(1)}, \ldots, I_t^{(j-1)}\}$. Recall that we now have $\mc{N}_t = [N] \setminus \lp \cup_{s=1}^{t-1} \mc{B}_s \rp$. 
            \item Obtain the true $f$ values, $\{f(I_t^{(i)}): 1 \leq i \leq B\}$, from the oracle (i.e., the human auditor). 
            \item Update the wealth process: 
            \begin{align}
                W_t(m) = W_{t-1}(m) \times \lp 1 + \frac{1}{B} \sum_{i=1}^B \lambda_t^{(i)} \lp Z_t^{(i)} - \mu_t^{(i)}(m) \rp \rp, \quad \text{for } m \in [0,1].  
            \end{align}
            Here $\lambda_t^{(i)}$ denotes the bet based on $\cup_{s=1}^{t-1} \mc{B}_s \cup \{X_t^{(1)}, \ldots, X_t^{(i-1)} \}$, and $Z_t^{(i)}-\mu_t^{(i)}$ denotes the analogous payoff function, as defined in~\Cref{subsec:betting-CS-no-side-info}. 
            \item Update the CS, as $C_t = \{m \in [0,1]: W_t(m) < 1/\alpha\}$. 
        \end{itemize}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Proofs}
\label{sec:Proofs}
\subsection{Proof of Proposition~\ref{theorem:oracle-strategy}}
\label{sec:oracle-proof}
    Recall that $\expect_{I_n \sim q}[Z_n] = \mu_n(m^*)$ for any sampling distribution $q \in \Delta^{\mc{N}_n}$. Now, we note the following equivalencies
    \begin{align}
        &\expect_{I_n \sim q}[B_n(\lambda, m)]\\ 
        &= \expect_{I_n \sim q}[\lambda(Z_n - \mu_n(m)) - \lambda^2(Z_n -\mu_n(m))^2]\\
        &=\expect_{I_n \sim q}[\lambda(Z_n - \mu_n(m))] - \expect_{I_n \sim q}[\lambda^2(Z_n -\mu_n(m))^2]\\
        &=\lambda(m^* - m) - \lambda^2\expect_{I_n \sim q}[(Z_n- \mu_n(m^*) -\mu_n(m) + \mu_n(m^*))^2]\\
        &=\lambda(m^* - m) - \lambda^2(m^* - m)^2 - 2\lambda^2\expect_{I_n \sim q}[Z_n- \mu_n(m^*)](m^*-m) - \lambda^2\expect_{I_n \sim q}[(Z_n- \mu_n(m^*))^2]\\
        &= \lambda(m^* - m) - \lambda^2(m^* - m)^2 - \lambda^2\mathbb{V}_{I_n \sim q}[Z_n].
    \end{align}
    
    The above equivalencies show that $q_n^* = \argmin_{q \in \Delta^{\mc{N}_{n}}}\ \mathbb{V}_{I_n \sim q}[Z_n]$ by definition of $q_n^*$ in \eqref{eq:max-bound}, and since $\lambda, m^*, m$ are fixed in the optimization problem. Consequently, the minimizer of $\mathbb{V}_{I_n \sim q}[Z_n]$ is when the distribution of $Z_n$ has support on only a single value and the variance is 0. This is achieved when $q_n(i) \propto \pi(i)f(i)$ for each $i \in \mc{N}_n$. Hence, we have shown our desired result.

    \subsection{Proof of Proposition~\ref{prop:control-variates-1}}\label{proof:control-variates-1}
        For any $t \geq 1$, introduce the random variable $D_t = t \times \lp \Mtilde_t - A_t\rp = \sum_{i=1}^{t} \beta_i U_i$.  By construction of the term $U_i$, we know that for any $t \geq 1$, we have
        \begin{align}
            \mathbb{E}[D_t|\mc{F}_{t-1}] = \sum_{i=1}^{t-1} \beta_i U_i + \beta_t \mathbb{E}[U_t|\mc{F}_{t-1}] = D_{t-1}.
        \end{align}
        Thus, $\{D_t: t \geq 1\}$ is a martingale process. Furthermore, since both $\beta_t$ and $U_t$ lie in the set $[-1,1]$, the martingale process $\{D_t: t \geq 1\}$ has bounded differences.  Hence, by using the time-uniform deviation inequality for martingales with bounded differences~\citep[Eq.~(11)]{howard2021time}, we have
        \begin{align}
            \mathbb{P} \lp \exists t \leq n: |D_t| > 1.7 \sqrt{t \lp  \log \log(2t) + 0.72\log(10.4/\delta) \rp } \rp < \delta.
        \end{align}
        Since $|\Mtilde_t - A_t| = |D_t|/t$, the result follows.



% \section{Alternative Definitions of RLFA}
% \label{sec:alt-defs}

% \paragraph{Testing based $\boldsymbol{\rlfa}$.}  An alternative notion of RLFAs, that mirrors the corresponding definition of risk-limiting audits~\citep{stark_conservative_statistical_2008a} more closely, can be obtained by framing it  as the task of testing whether the overall misstated fraction, $m^*$, is ``small''.  This can be interpreted as auditing the claim that all the reported monetary values are accurate. 
% Formally, we assume that the announced assertion is that $m^* \leq \varepsilon$, and we want to design a procedure that (i) with probability at least $(1-\delta)$, rejects this assertion if it is false, and (ii) confirms this assertion with probability $1$ if it is true. Our general, CS-based strategy that we developed in this paper can be easily adapted to this problem --- we simply define the stopping time $\tau(\varepsilon, \delta) \coloneqq \min\{t \in [N]:  \mc{C}_t \subseteq (\varepsilon, 1] \cup \{N\}$ and reject if $C_{\tau(\varepsilon, \delta)} \subseteq (\varepsilon, 1]$. Properties (i) and (ii) follow immediately from the definition of a CS, and the consistency of a CS, i.e., $|\mc{C}_t| \rightarrow 0$ as $t \rightarrow N$. 


% \paragraph{Auditing the remaining misstated fraction.} Another quantity of interest is the \textit{remaining} misstated fraction, i.e., what is the remaining misstated fraction assuming we correct the transaction values for the transactions we have audited? In many practical auditing scenarios, the company does correct their finances in accordance with their records for the audited transactions, i.e., $f(I_i) = 0$ for each $i \in [t]$ after querying the $t$th transaction. This is equivalent to estimating $\mu_t(m^*)$, a time varying quantity, instead of the static quantity of $m^*$. Since $\mu_t(m^*)$ is simply a shift of $m^*$ by a quantity that is known to the auditor, all estimates of $m^*$ we produce in this work can easily be transformed into estimates of the remaining misstated fraction as well (by subtracting $\sum_{i \in [t]} \pi(I_i)f(I_i)$ from both boundaries of our CSs). Thus, we can estimate the remaining misstated fraction as efficiently as we estimate $m^*$.

    
\section{Hoeffding and empirical-Bernstein Confidence sequences}\label{sec:hoeffding-empirical-bernstein}

    In this section, we present a different approach for constructing confidence sequences, that are based on nonnegative supermartingales (NSMs), instead of nonnegative martingales used by the betting based CS (\Cref{subsec:betting-CS-no-side-info}). While these CSs are typically looser than the betting CS defined in \eqref{eq:conf-seq-def-1}, they are computationally inexpensive and can be derived analytically. We will introduce two such CSs, the Hoeffding CS and empirical-Bernstein CS, and each will have boundaries that take on an explicit form. Thus, only constant time is needed to compute the boundaries for each new sample. In contrast, the betting CS computes its boundaries through a root finding procedure which requires $O(t)$ computations to derive updated boundaries after receiving the $t$th sample. We provide simulations comparing the Hoeffding and empirical-Bernstein CSs with the betting CS in \Cref{sec:HoefEBExperiments}.
    Before defining our CSs, we  first introduce the following quantities:
    \begin{align}
        \widehat{m}_t \coloneqq \frac{\pi(I_t)}{q_t(I_t)}f(I_t) + \sum\limits_{i = 1}^{t - 1}\pi(I_i) f(I_i), \quad
        \widehat{\mu}_t \coloneqq  \frac{\sum\limits_{i = 1}^t \widehat{m}_i}{t}, \qquad \widehat{\mu}_t(\lambda_1^t) \coloneqq  \frac{\sum\limits_{i = 1}^t \lambda_i\widehat{m}_i}{\sum\limits_{i = 1}^t \lambda_i}.
    \end{align} Note that \(\widehat{\mu}_t(\lambda_1^t)\) where \(\lambda_1 = \ldots= \lambda_t = 1\) is equivalent to \(\widehat{\mu}_t\).

    \subsection{Hoeffding CS}\label{subsec:hoeffding-cs}
        To define the Hoeffding CS, we first define the nonnegative supermartingale (NSM) associated with it.
        As noted in prior work \citep{howard2021time,waudby2020estimating}, let the following be a CGF-like function for Hoeffding:
        \begin{align}
            \psi_{\rmH}^c(\lambda) \coloneqq \frac{\lambda^2c^2}{8},
        \end{align} for any fixed \(c > 0\). Now we define the Hoeffding NSM as follows:
        \begin{align}
            M_t^{\rmH}(m) &\coloneqq \exp\left(\sum\limits_{i= 1}^t \lambda_i(Z_i - \mu_i(m)) - \psi_{\rmH}^{c_i}(\lambda_i)\right) =\exp\left(\sum\limits_{i= 1}^t \lambda_i(\widehat{m}_i - m) - \psi_{\rmH}^{c_i}(\lambda_i)\right),
        \end{align} where  \(c_t \geq \max_{i \in \mathcal{U}_t} \pi(i) / q_t(i)\), and both \((\lambda_t)\) and \((c_t)\) are predictable w.r.t.\ \((\filtration_t)\).

        \begin{proposition}
        \label{prop:hoeffding-nsm}
            \((M_t^{\rmH}(m^*))_{t \in [N]}\) is an NSM.
        \end{proposition}
        \begin{proof}
            First, note that \(\expect[Z_t \mid \filtration_{t - 1}]=\mu_t(m)\) since we are assuming the null $H_{0,m}$ is true. Second, note that \(Z_t \in [0, \max_{i \in \mathcal{U}_t} \pi(i) / q_t(i)]\) is bounded.
            Thus, the desired statement follows directly from the MGF bound on bounded random variables i.e.\ if \(X \in [\ell, u]\) is a random variable, then \(\expect[\exp(\lambda(X - \expect[X]))] \leq \exp(\lambda^2(u - \ell)^2 / 8)\) for any \(\lambda \in \reals\).
        \end{proof}

        Consequently, we can derive the following Hoeffding CS:
        \begin{align}
            C^{\rmH}_t \coloneqq \left(\widehat{\mu}_t(\lambda_1^t) \pm \frac{\log(2 / \alpha) + \sum\limits_{i = 1}^t \psi_{\rmH}^{c_i}(\lambda_i)}{\sum\limits_{i = 1}^t \lambda_i}\right) \cap [0, 1],
            \label{eqn:hoeffding-cs}
        \end{align} where  \(c_t \geq \max_{i \in \mathcal{U}_t} \pi(i) / q_t(i)\), and both \((\lambda_t)\) and \((c_t)\) are predictable w.r.t.\ \((\filtration_t)\).

    \subsection{Empirical-Bernstein CS}

        Define the following CGF-like function for empirical-Bernstein:
        \begin{align}
            \psi_{\E}^c(\lambda) \coloneqq \frac{-\log(1 - c\lambda) - c\lambda}{c^2},
        \end{align} for any \(c > 0\). Now we define the following empirical-Bernstein NSM:
        \begin{align}
            M_t^{\EB}(m) &\coloneqq \exp\left(\sum\limits_{i= 1}^t \lambda_i(Z_i - \mu_i(m)) - (Z_i - \widehat{\mu}_{i - 1})^2\psi_{\E}^{c_i}(\lambda_i)\right)\\
            &=\exp\left(\sum\limits_{i= 1}^t \lambda_i(\widehat{m}_i - m) - (Z_i - \widehat{\mu}_{i - 1})^2\psi_{\E}^{c_i}(\lambda_i)\right),
        \end{align} where \(\lambda_t \in [0, 1 / c_t)\), \(c_t \geq \widehat{\mu}_{t - 1}\), and both \((\lambda_t)\) and \((c_t)\) are predictable w.r.t.\ \((\filtration_t)\).


         \paragraph{Constructing the upper CS through mirroring.} \(M_t^{\EB}\) can be used to construct a CS that lower bounds \(m^*\), but naively constructing an analogous NSM (i.e., by negating $ Z_t - \mu_t(m)$ into $\mu_t(m) - Z_t$) results in a loose construction for the upper CS, since $c_t$ would need to lower bound $\widehat{\mu}_{t - 1} - Z_t$. $Z_t$ (and hence $c_t$) can be quite large depending on the sampling probabilities $q_t$. Thus, we use the fact that $m^* \in [0, 1]$ and hence $1 - m^* \in [0, 1]$ to construct a ``mirroring'' lower CS for $1 - m^* = \sum_{i = 1}^N \pi(i)(1 - f(i))$. The lower CS for $1 - m^*$ is based upon the following NSM:
        \begin{align}
            {M'}_t^{\EB}(m) &\coloneqq \exp\left(\sum\limits_{i= 1}^t \lambda_i\left(\widetilde{Z}_i - \left(1 - m - \sum\limits_{j = 1}^{i - 1}\pi(I_j)(1 - f(I_j))\right)\right) - (\widetilde{Z}_i - \widetilde{\mu}_{i - 1})^2\psi_{\E}^{c_i}(\lambda_i)\right)\\ &=\exp\left(\sum\limits_{i= 1}^t \lambda_i(\widetilde{m}_i - (1 - m)) - (\widetilde{Z}_i - \widetilde{\mu}_{i - 1})^2\psi_{\E}^{c_i}(\lambda_i)\right).
        \end{align} Here, define $\widetilde{Z}_t \coloneqq \frac{\pi(I_t)}{q_t(I_t)}(1 - f(I_t))$, and let $\widetilde{m}_t$, and  $\widetilde{\mu}_t$ be counterparts of $\widehat{m}_t$ and $\widehat{\mu}_t$ where $f(i)$ is replaced with $1 - f(i)$ in the respective definitions for each \(i \in [N]\).
        \begin{proposition}
        \label{prop:EBNSM}
        \((M_t^{\EB}(m^*))_{t \in [N]}\) and \(({M_t'}^{\EB}(m^*))_{t \in [N]}\) are both NSMs.
        \end{proposition}
    To prove \Cref{prop:EBNSM}, we introduce the following key lemma from \citet{fan_exponential_inequalities_2015}.
\begin{lemma}[{\citet[Lemma 4.1]{fan_exponential_inequalities_2015}}]
    \label{lemma:Fan}
    Let \(\xi\) be a number bounded from below i.e.\ satisfies \(\xi \geq -c\), where \(c \in \reals^+\) is a fixed constant. Let the following be true: \(\lambda \in [0, 1/c)\). Then,
    \begin{align}
    1 + \lambda \xi \geq \exp\left(\lambda \xi + \xi^2\,\frac{\log(1 - c\lambda) + c\lambda}{c^2}\right).
    \end{align}
    \end{lemma}
    \begin{proof}
    The proof revolves around the following function \(h\):
    \begin{align}
        h(x) \coloneqq \frac{\log(1 + x) - x}{x^2 / 2}, \qquad x > -1.
    \end{align}
    Note that \(h\) is increasing in its domain. Then, \(\lambda \xi \geq -c\lambda > -1\) by definition of \(\lambda\) and \(\xi\). Thus,
    \begin{align}
        h(\lambda \xi) &\geq h(-c\lambda) \Leftrightarrow \frac{\log(1 + \lambda \xi) - \lambda \xi}{\xi^2}  \geq \frac{\log(1 - c\lambda) + c\lambda}{c^2}.
    \end{align} The desired statement follows from expanding this inequality and rearranging terms.
    \end{proof}

    \begin{proof}[Proof of \Cref{prop:EBNSM}] We will only show the proof that $(M^{\EB}_t(m^*))$ is an NSM, since the proof that $({M'}_t^{\EB}(m^*))$ is an NSM will follow a similar derivation.
    Let \(Y_t = Z_t - \mu_t(m^*)\) and \(\delta_t = \widehat{\mu}_{t - 1} - \mu_t(m^*)\).
    Note that \(Y_t - \delta_t = Z_t - \widehat{\mu}_{t - 1}\). To prove our desired statement, it suffices to show the following is true:
    \begin{align}
    \expect\left[\exp(\lambda_t Y_t - (Y_t - \delta_t)^2\psi_{\E}^{\widehat{\mu}_{t - 1}}) \mid \filtration_{t - 1}\right] \leq 1.
    \label{eqn:EBIncrement}
    \end{align}

    We will now show that \eqref{eqn:EBIncrement} is indeed true:

    \begin{align}
    \expect\left[\exp(\lambda_t Y_t - (Y_t - \delta_t)^2\psi_{\E}^{\widehat{\mu}_{t - 1}}) \mid \filtration_{t - 1}\right] &= \expect\left[\exp(\lambda_t (Y_t - \delta_t) - (Y_t - \delta_t)^2\psi_{\E}^{\widehat{\mu}_{t - 1}}) \mid \filtration_{t - 1}\right]\exp(\lambda_t \delta_t)\\
    &\leq\expect\left[1 + \lambda_t (Y_t - \delta_t)  \mid \filtration_{t - 1}\right]\exp(\lambda_t \delta_t)\\
    &= \expect\left[1 - \lambda_t \delta_t  \mid \filtration_{t - 1}\right]\exp(\lambda_t \delta_t) \leq 1.
    \end{align}
    % use words

    % binary = \(n_1 - n_0 / (n_1 + n_0)\) optimal kelly bet lambda for bernoullis for testing 1/2. What is optimal kelly bet even in bernoulli-like situation for weighted sampling.
    The 1st inequality is by application of \Cref{lemma:Fan} with \(\xi_t = Y_t - \delta_t\) and \(c = \widehat{\mu}_{t - 1}\) (\(Z_t \geq 0\), so \(\xi_t \geq -\widehat{\mu}_{t - 1}\)). The 2nd equality is the result of \(\expect[Y_t \mid \filtration_{t - 1}] = (m^* - m^*_t) - (m^* - m^*_t) = 0\). The last inequality is by \(1 - x \leq \exp(-x)\) for all \(x \in \reals\).
    Thus, we have proved \eqref{eqn:EBIncrement} and our desired statement as a result.
    \end{proof}
    As a result, we can construct the following empirical-Bernstein CS:
    \begin{align}
        C_t^{\EB} \coloneqq &\left(\widehat{\mu}_t({\lambda'}_1^t) - \frac{\log(2 / \alpha) + \sum\limits_{i = 1}^t(Z_i - \widehat{\mu}_{i - 1})^2\psi_{\E}^{c_i}(\lambda_i')}{\sum\limits_{i= 1}^t \lambda_i'}, 1 - \widetilde{\mu}_t({\lambda}_1^t) + \frac{\log(2 / \alpha) + \sum\limits_{i = 1}^t(\widetilde{Z}_i - \widetilde{\mu}_{i - 1})^2\psi_{\E}^{c_i}(\lambda_i)}{\sum\limits_{i = 1}^t \lambda_i} \right)
        \cap [0, 1].
    \end{align} The mirroring trick of constructing a lower CS for $1 - m^*$ was originally employed to construct confidence bounds for off-policy evaluation in contextual bandits~\citep{thomas_highconfidence_offpolicy_2015,waudby-smith_anytimevalid_offpolicy_2022}. To the best of our knowledge, this is the first use of the mirroring trick simply for mean estimation.

        For both the Hoeffding and empirical-Bernstein CSs, we show in \Cref{sec:HoefEBComparison} that we are able to recover the unweighted, uniform sampling versions introduced by \citet{waudby2020confidence} as a special case, i.e., when $q_t$ is the uniform distribution over the remaining items and $\pi$ is uniform over all items. Thus, our formulations of the Hoeffding and empirical-Bernstein CSs generalize the CSs for sampling without replacement in \cite{waudby2020confidence} to weighted sampling and estimation.

\section{Connections with Waudby-Smith and Ramdas \cite{waudby2020confidence,waudby2020estimating}}
\label{sec:HoefEBComparison}
CSs for estimation of the unweighted mean through uniform sampling without replacement are shown in \citet{waudby2020confidence} for Hoeffding and empirical-Bernstein style CSs, and \citet{waudby2020estimating} for betting style CSs. In this section, we show these results are a special case of our results that also account for non-uniform sampling strategies and weighted means. For simplicity, we will show the Hoeffding case, and the results for the other CSs follow a similar argument. Following the notation of \citet{waudby2020confidence}, let \((X(i))_{i \in [N]}\) be a finite population of values in $[0, 1]$ (without loss of generality to arbitrary bounds on the $X(i)$). Let \(X_1, \dots, X_N\) be random variables that are the result of sampling uniformly without replacement from this population. \citet{waudby2020confidence} construct the following NSM for $\mu \coloneqq \tfrac{1}{N}\sum\limits_{i = 1}^N X(i)$:

        \begin{align}
            M_t^{\WSR}(m) \coloneqq \exp\left(\sum\limits_{i = 1}^t\lambda_i\left(X_i- m + \frac{1}{N-i+1}\sum\limits_{j = 1}^{i - 1}(X_j - m)\right) - \psi_H(\lambda_i)\right),
        \end{align} and derive the CS
        \begin{align}
            C_t^{\WSR} \coloneqq \left(\widehat{\mu}_t^{\WSR}(\lambda_1^t) \pm \frac{\log(2 / \alpha) + \sum\limits_{i = 1}^t\psi_H(\lambda_i)}{\sum\limits_{i = 1}^t \lambda_i\left(1 + \frac{i - 1}{N - i + 1}\right)}\right). 
        \end{align}
        The center of this CS is defined as
        \begin{align}
            \widehat{\mu}_t^{\WSR}(\lambda_1^t) \coloneqq \frac{\sum\limits_{i = 1}^t\lambda_i\left(X_i + \frac{1}{N-i + 1}\sum\limits_{j = 1}^{i - 1}X_j\right)}{\sum\limits_{i = 1}^t \lambda_i(1 + \frac{i - 1}{N - i + 1})} = \frac{\sum\limits_{i = 1}^t\lambda_i\left(X_i + \frac{1}{N-i + 1}\sum\limits_{j = 1}^{i - 1}X_j\right)}{\sum\limits_{i = 1}^t \lambda_i \cdot \frac{N}{N - i + 1}}.
        \end{align}

        In the weighted setting, \(\pi(i) = 1 / N\) and \(f(i) = X(i)\) for each \(i \in [N]\) implies that we are estimating the uniformly weighted average $m^* = \mu$. For each \(t \in [N]\) and \(i \in \mc{N}_t\), set \(q_t(i)  = 1 / (N - t + 1)\) to be the uniform distribution over the remaining items. This gets us the following estimate of the mean from our Hoeffding CS:
        \begin{align}
            \widehat{\mu}_t({\lambda_1'}^t) = \frac{\sum\limits_{i = 1}^t \lambda_i'\left(\frac{N - i + 1}{N}X_i + \frac{1}{N}\sum\limits_{j = 1}^{i - 1}X_j\right)}{\sum\limits_{i = 1}^t \lambda_i'}.
        \end{align}

        By setting \(\lambda'_i = \lambda_i N / (N - i + 1)\), for each \(i \in [t]\), we get that $\widehat{\mu}_t({\lambda'}_1^t) = \widehat{\mu}^{\WSR}_t(\lambda_1^t)$. To see that $C_t^{\rmH} = C_t^{\WSR}$, we set $c_t = (N - t + 1) / N$ for each $t \in [N]$. Note that this is the minimum possible value that $c_t$ can take, since $\pi(i) = 1 / N$ for each $i \in [N]$, and $q_t(i) = 1/ (N - t + 1)$ for each $i \in \mc{N}_t$. As a result, we are able to recover the Hoeffding CS from \cite{waudby2020confidence} as a special case of our Hoeffding CS.
\begin{comment}
        More generally, our estimator is as follows:
        \begin{align}
            \widehat{\mu}_t(\lambda^1_t) = \frac{\sum\limits_{i = 1}^t \lambda_i\left(\frac{\pi(I_i)}{q_t(I_i)}f(I_i) + \sum\limits_{j = 1}^{i - 1}\pi(I_j)f(I_j)\right)}{\sum\limits_{i = 1}^t \lambda_i}.
        \end{align}
\end{comment}


\section{Experiments Comparing Different CS Constructions}
\label{sec:HoefEBExperiments}
    In \Cref{fig:HoefEBCS}, we compare the width of the Hoeffding, empirical-Bernstein, and betting CSs under the \propM sampling strategy, i.e., under a weighted sampling strategy. We follow the same setup as Experiment 1 in \Cref{sec:experiments}. We see that the empirical-Bernstein CS is tighter in cases where $\Nlarge$ is larger. In these cases, the support size, $c_t$, is large for $\Nlarge$ transactions, which makes the Hoeffding CS looser as a result. On the other hand, empirical-Bernstein is able to take advantage of the low variance from a large number of transactions having similar misstated fractions $f(I_t)$. However, when $\Nlarge$ is small, most transactions will have a small support size, $c_t$, and the Hoeffding CS will be tighter than empirical-Bernstein as a result. The betting CS is tighter than both Hoeffding and empirical-Bernstein CSs in all our simulated setups. This trend is reflected in \Cref{fig:HoefEBHist} where we plot the histogram of the first time the CS reaches $\varepsilon=0.2$ width, i.e., the empirical-Bernstein CS reaches $\varepsilon$ width faster than the Hoeffding CS when $\Nlarge$ is larger, and the betting CS is faster than both Hoeffding and empirical-Bernstein CSs.
    \begin{figure}[htb!]
            \def\figwidth{0.23\textwidth}
            \def\figheight{0.17\textwidth} % Feel free to change
        \hspace*{-2pt}
        \input{Figures/hoef_eb/exp1_prop_2_cs.tex}
        \input{Figures/hoef_eb/exp1_prop_8_cs.tex}
        \input{Figures/hoef_eb/exp1_inv_2_cs.tex}
        \input{Figures/hoef_eb/exp1_inv_8_cs.tex}
        \caption{Plots showing the variation of the width of the betting, Hoeffding, and empirical-Bernstein CSs using \propM sampling strategies in different data regimes with $N=200$; all CSs here are also intersected with the logical CS of~\Cref{subsec:logical-CS}. We can see that empirical-Bernstein is tighter than Hoeffding in cases where the proportion of transactions with large weights is high (i.e., $\Nlarge / N$ is large), and vice versa. Across the board, the betting CS is tighter than both Hoeffding and empirical-Bernstein CSs.}
        \label{fig:HoefEBCS}
    \end{figure}
    \begin{figure}[htb!]
            \def\figwidth{0.23\textwidth}
            \def\figheight{0.17\textwidth} % Feel free to change
        % \hspace*{-1em}
        \input{Figures/hoef_eb/exp1_prop_2_hist.tex}
        \input{Figures/hoef_eb/exp1_prop_8_hist.tex}
        \input{Figures/hoef_eb/exp1_inv_2_hist.tex}
        \input{Figures/hoef_eb/exp1_inv_8_hist.tex}
        \caption{Histograms of the first time for each CS to reach a width of $\varepsilon=0.2$ with $N=200$. We choose a larger $\varepsilon$ than in Experiment 1 for purposes of demonstrating the difference between the CSs, as all CSs converge to the logical CS and have nearly identical stopping time distributions for small values of $\varepsilon$. We can see that the empirical-Bernstein CS reaches $\varepsilon$ width earlier than the Hoeffding CS when $\Nlarge$ is large and the reverse is true when $\Nlarge$ is small, and the betting CS reaches $\varepsilon$ uniformly the fastest.}
        \label{fig:HoefEBHist}
    \end{figure}

\section{Experiments with Housing Sales Data}
\label{appendix:housing-data} 

    We now apply our auditing scheme to the transactions in the `\href{https://www.kaggle.com/datasets/harlfoxem/housesalesprediction}{House Sales in King County}' dataset from Kaggle. The dataset consists of $21{,}616$ datapoints, each consisting of $21$ features describing the house (such as the number of bedrooms, the number of bathrooms, square footage, floors, condition, etc.), and one target variable \texttt{price}.  We treat the \texttt{price} values as the `reported monetary values' for our framework.
    
    \paragraph{Creating the ground truth.}  To adapt the dataset to our problem, we first need to generate the `ground truth', that is, the true $f$-values. To do this, we proceed in the following steps: 
    \begin{itemize}
        \item We first select $10\%$ of the dataset, and assign them some arbitrary $f$ values in the range $(0,0.7)$.
        
        \item Using this `labelled' dataset, we train a random forest regressor with $200$ trees, and the mean-absolute-error criterion.
        
        \item Finally, this trained regressor is then used to generate the ground truth for the remaining $90\%$ of the dataset.
    \end{itemize}
     The reason for using this approach for generating the ground truth is that we want it to be dependent on the additional features associated with each transaction.
    
    \paragraph{Generating the side-information.} Having obtained the $M$ and $f$-values, we obtain the side-information~(i.e., $S$-values) by using $10\%$ of the remaining labelled data to train another predictor for generating side-information on the rest of the data. In our experiments, we either used a single decision-tree regressor, or a random forest with a small number of trees~(fewer than $50$). Informally, we expect that increasing the capacity of the regressor should lead to increased correlation between the ground truth and the side-information. 
    % 
    \begin{figure}[htb!]
            \def\figwidth{0.45\textwidth}
            \def\figheight{0.45\textwidth} % Feel free to change
        \hspace*{-2pt}
        \input{Figures/Stopping_Times_Housing_DT}
        \input{Figures/Stopping_Times_Housing}
        \caption{Histograms of the first time for each CS to reach a width of $\varepsilon=0.05$ with $N=250$. As expected, the \propM strategy~(both with and without control variates) is significantly more sample-efficient than the uniform baseline, and furthermore, the improvement by using control variates increases with increasing informativeness~(i.e., $\rho$) of the side-information.}
        \label{fig:housing}
    \end{figure}
    %
    \paragraph{Experimental Results.} In~\Cref{fig:housing}, we consider two instances of this problem: (i) side-information generated by a decision-tree regressor, and (ii) side-information generated by a random-forest regressor, consisting of $10$ trees. In the former case, the correlation between the side-information and the ground truth is approximately $0.71$, while in the latter it is around $0.82$. As shown in the plots, the \propM based strategies (both with and without control variates) significantly outperform the uniform baseline strategy. Furthermore, the improvement by incorporating control variates increases with increasing correlation.


\bibliography{ref}
\end{document}
