\documentclass[accepted]{uai2023} 

\usepackage[american]{babel}


\usepackage{natbib} \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} \usepackage{booktabs} \usepackage{tikz} 



\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subcaption}
\usepackage{booktabs} \usepackage{multirow}


\usepackage[algo2e,linesnumbered,lined,boxed,commentsnumbered,noend]{algorithm2e}
% \mycommfont{<text>}: footnote-sized typewriter font switch followed by <text>.
\newcommand\mycommfont[1]{\footnotesize\ttfamily#1}
% Tell algorithm2e to typeset comments with \mycommfont.
\SetCommentSty{mycommfont}
\setlength{\algomargin}{15pt} 

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}


\usepackage{xr}
% Helpers for cross-document references through the xr package.
% \addFileDependency{<file>}: writes "(<file>)" with \typeout, appends <file> to
% LaTeX's file list via \@addtofilelist, and writes "No file <file>." when the
% file does not exist. (Presumably this lets build tools track the external
% document as a dependency -- confirm against the build setup.)
\makeatletter
\newcommand*{\addFileDependency}[1]{\typeout{(#1)}\@addtofilelist{#1}
\IfFileExists{#1}{}{\typeout{No file #1.}}
}\makeatother

% \myexternaldocument{<name>}: calls \externaldocument{<name>} and registers
% both <name>.tex and <name>.aux with \addFileDependency.
\newcommand*{\myexternaldocument}[1]{\externaldocument{#1}\addFileDependency{#1.tex}\addFileDependency{#1.aux}}

% Import labels from the external document "morrill_257".
\myexternaldocument{morrill_257}


\usepackage[capitalize,noabbrev]{cleveref}

% Theorem-like environments. `theorem` is numbered within sections; every other
% environment below shares the `theorem` counter.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% Definition-style environments.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% Remark-style environments.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

\usepackage{nicefrac}
% Named colours (RGB, 0-255 per channel).
\definecolor{offWhite}{RGB}{240,240,240}
\definecolor{grey}{RGB}{180,180,180}
\definecolor{lightGrey}{RGB}{220,220,220}
\definecolor{darkgreen}{RGB}{0,125,0}
\definecolor{lime}{RGB}{255,200,0}

% "amii" colour palette.
\definecolor{amiiBlue}{RGB}{16,72,118}
\definecolor{amiiPink}{RGB}{241,97,119}
\definecolor{amiiYellow}{RGB}{248,209,109}
\definecolor{amiiPurple}{RGB}{123,105,145}

% yes/no colours as cyan/red tints; the "new" variants are less diluted with white.
\colorlet{yes}{cyan!50!white}
\colorlet{newYes}{cyan!75!white}
\colorlet{no}{red!50!white}
\colorlet{newNo}{red!75!white}
% \eqTableEntry{<colour>}{<text>}: table cell with background <colour> and black <text>.
% NOTE(review): \cellcolor is presumably provided by colortbl (or xcolor's `table`
% option), which is not loaded in the visible preamble -- confirm it is available.
\newcommand{\eqTableEntry}[2]{
  \cellcolor{#1}{\textcolor{black}{#2}}
}

\usepackage{amsmath, amssymb, amsfonts, amsthm}
% Fallback theorem-like environments: each \newtheorem below only runs when the
% corresponding environment command is still undefined at this point.
% Environments created here get their own, independent counters.
\makeatletter
\ifdefined\theorem
\else
  \newtheorem{theorem}{Theorem}
\fi
\ifdefined\lemma
\else
  \newtheorem{lemma}{Lemma}
\fi
\ifdefined\corollary
\else
  \newtheorem{corollary}{Corollary}
\fi
\ifdefined\definition
\else
  \newtheorem{definition}{Definition}
\fi
\ifdefined\proposition
\else
  \newtheorem{proposition}{Proposition}
\fi
\makeatother

% \safeIncCounter{<name>}: steps the counter <name>, first creating it with
% \newcounter when \c@<name> is not yet defined.
\makeatletter
\newcommand\safeIncCounter[1]{\@ifundefined{c@#1}{\newcounter{#1}\stepcounter{#1}}{\stepcounter{#1}}}
\makeatother

% Standalone counter named `resetCounter`.
\newcounter{resetCounter}

\usepackage{xargs}
 \usepackage{mathtools}
% Paired delimiters (mathtools). Each command takes one argument; the starred
% form (e.g. \abs*{x}) auto-sizes the delimiters with \left/\right.
\DeclarePairedDelimiter{\floor}{\lfloor}{\rfloor}
\DeclarePairedDelimiter{\ceil}{\lceil}{\rceil}
\DeclarePairedDelimiter{\abs}{\lvert}{\rvert}
\DeclarePairedDelimiter{\norm}{\lVert}{\rVert}
% Parentheses, square brackets, angle brackets, and curly braces.
\DeclarePairedDelimiter{\subex}{(}{)}
\DeclarePairedDelimiter{\subblock}{[}{]}
\DeclarePairedDelimiter{\tuple}{\langle}{\rangle}
\DeclarePairedDelimiter{\set}{\{}{\}}
% \relu and \ramp are identical: [x]_+ .
\DeclarePairedDelimiter{\relu}{[}{]_+}
\DeclarePairedDelimiter{\ramp}{[}{]_+}

% Double square brackets from stmaryrd.
\usepackage{stmaryrd}
\DeclarePairedDelimiter{\stopGrad}{\llbracket}{\rrbracket}

\usepackage{bbold}
\usepackage{bm}

% Number sets, simplex symbol, and a wide-tilde accent alias.
\newcommand{\reals}{\mathbb{R}}
\newcommand{\naturals}{\mathbb{N}}
\newcommand{\natSeq}[1]{\naturals_{#1}}
\newcommand{\Simplex}{\triangle}
\newcommand{\simplex}{\Simplex}
\newcommand{\like}{\widetilde}

% Bold math (\bs), script-size math, expectation/probability symbols,
% bold text for defined terms, and the \doteq relation.
\newcommand{\bs}[1]{\bm{#1}}
\newcommand{\smallMath}[1]{{\scriptstyle #1}}
\newcommand{\expectation}{\mathbb{E}}
\newcommand{\E}{\expectation}
\newcommand{\probability}{\mathbb{P}}
\newcommand{\Prob}{\probability}
\newcommand{\defword}[1]{\textbf{\boldmath{#1}}}
\newcommand{\as}{\doteq}

% Bold 1, bold 0, and bold e.
\newcommand{\ones}{\bs{1}}
\newcommand{\zeros}{\bs{0}}
\newcommand{\unitVector}{\bs{e}}

% Asymptotic-notation operators.
\newcommand{\bigO}{\operatorname{\mathcal{O}}}
\newcommand{\smallo}{\operatorname{o}}

% Binary max/min written as \vee / \wedge.
\newcommand{\bmax}{\vee}
\newcommand{\bmin}{\wedge}

% Inner product <a, b>, indicator 1{...}, and vertical-bar separators with
% thin (\given) or thick (\Given, \where) surrounding space.
\newcommand{\ip}[2]{\langle #1, \, #2 \rangle}
\newcommand{\ind}[1]{\mathbb{1}\set*{#1}}
\newcommand{\given}{\,|\,}
\newcommand{\Given}{\;|\;}
\newcommand{\where}{\;|\;}

% Named math operators. The starred form places sub/superscripts as limits
% (below/above the operator) in display style.
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
% \e is the upright "e" used as the base of exponentials (e.g. \e^{-T_2}).
% It must NOT take limits: with the starred declaration an exponent would be
% stacked above the "e" in display math instead of being set as a superscript.
\DeclareMathOperator{\e}{e}
\DeclareMathOperator*{\unif}{Unif}
% \Unif is a plain alias of \unif (avoids nesting one operator inside another).
\newcommand{\Unif}{\unif}
\DeclareMathOperator*{\proj}{proj}
% Calligraphic B symbol.
\newcommand{\divergence}{\mathcal{B}}

% Symbols for strategies, utilities, and players. Several names are aliases of
% one another (e.g. \strat -> \strategy -> \policy, with \policy defined further
% down in this preamble).
\newcommand{\PureStratSet}{\mathcal{X}}
\newcommand{\PureStrategySet}{\PureStratSet}
\newcommand{\pureStrat}{x}
\newcommand{\PureStrat}{X}
\newcommand{\utility}{\upsilon}
\newcommand{\StrategySet}{\Pi}
\newcommand{\strategy}{\policy}
\newcommand{\strat}{\strategy}
\newcommand{\profile}{\strategy}
\newcommand{\stratProfile}{\profile}
\newcommand{\gap}{\varepsilon}
\newcommand{\Players}{\mathcal{P}}
\newcommand{\PlayerSet}{\Players}

% Symbols for actions, regret, loss/gradient bounds, step sizes, and
% small-caps labels (\EX, \IN, \SWAP and their aliases).
\newcommand{\Actions}{\mathcal{A}}
\newcommand{\actions}{\Actions}
\newcommand{\RewardSet}{\mathcal{R}}
\newcommand{\regret}{\rho}
\newcommand{\Regret}{R}
\newcommand{\altGap}{\epsilon}
\newcommand{\supCfv}{U}
\newcommand{\maxCfv}{\supCfv}
\newcommand{\maxLoss}{L}
\newcommand{\grad}{\nabla}
\newcommand{\stepSize}{\alpha}
\newcommand{\regularizationWeight}{\eta}
\newcommand{\smProj}{\Pi_{\text{sm}}}
\newcommand{\EX}{\textsc{ex}}
\newcommand{\EXT}{\EX}
\newcommand{\IN}{\textsc{in}}
\newcommand{\INT}{\IN}
\newcommand{\SWAP}{\textsc{sw}}
\newcommand{\ACTION}{\SWAP}

% Symbols for information sets, histories, chance, and counterfactual values.
% \chancePolicy and \termValue rely on \policy and \reward, which are defined
% further down in this preamble.
\newcommand{\infoSet}{I}
\newcommand{\InfoSets}{\mathcal{I}}
\newcommand{\reachProb}{P}
\newcommand{\chance}{c}
\newcommand{\chancePolicy}{\policy_{\chance}}
\newcommand{\history}{h}
\newcommand{\Histories}{\mathcal{H}}
\newcommand{\TerminalHistories}{\mathcal{Z}}
\newcommand{\terminalHistory}{z}
\newcommand{\playerChoice}{p}
\usepackage{amssymb}
\newcommand{\emptyHistory}{\varnothing}
\newcommand{\intermediateHistory}{f}
\newcommand{\termValue}{\reward}
\newcommand{\cfIv}{v}
\newcommand{\cfAv}{a}
\newcommand{\cfv}{\cfIv}
\newcommand{\cfq}{\cfv}

% Symbols for features and function approximation.
\newcommand{\featureFn}{\varphi}
\newcommand{\featureExp}{\featureFn}
\usepackage{upgreek}
\newcommand{\FeatureSpace}{\reals^d}
\newcommand{\InputSpace}{\mathcal{X}}
\newcommand{\fnApprox}{y}
\newcommand{\hash}{\zeta}


% Symbols for online decision processes: decision set, decision, regularizer,
% and partial/immediate strategies (both pairs map to \sigma / \Sigma).
\newcommand{\DecisionSet}{\Theta}
\newcommand{\odpDecision}{\theta}
\newcommand{\regularizer}{\psi}
\newcommand{\partialPolicy}{\sigma}
\newcommand{\PartialPolicySet}{\Sigma}
\newcommand{\immStrat}{\sigma}
\newcommand{\immStratSet}{\Sigma}

% Deviation symbols and small-caps labels.
\newcommand{\DevSet}{\Phi}
\newcommand{\dev}{\phi}
\newcommand{\BLIND}{\textsc{blind}}
\newcommand{\INFORMED}{\textsc{informed}}
\newcommand{\PS}{\textsc{ps}}
\newcommand{\CAUSAL}{\textsc{caus}}
\newcommand{\COUNTERFACTUAL}{\textsc{cf}}
\newcommand{\CF}{\COUNTERFACTUAL}
\newcommand{\STRAT}{\textsc{strat}}
\newcommand{\DEV}{\textsc{dev}}
\newcommand{\BHV}{\SWAP}
\newcommand{\IMM}{\textsc{imm}}
\newcommand{\FULL}{\textsc{full}}
\newcommand{\TARGET}{\odot}
\newcommand{\TRIGGER}{\text{!}}
\newcommand{\FOLLOW}{\textsc{follow}}

% Single-letter symbols: L and \alpha.
\newcommand{\rmOperator}{L}
\newcommand{\maxActivation}{\alpha}

% Miscellaneous symbols and labels; \est is a wide-hat accent alias and the
% \mathbb{...} entries use the blackboard alphabet active at point of use.
\newcommand{\historySampler}{\zeta}
\newcommand{\powerset}{\wp}
\newcommand{\LINK}{\textsc{lnk}}
\newcommand{\FILTER}{\textsc{fltr}}
\newcommand{\SUCC}{\textsc{suc}}
\newcommand{\ROOT}{\textsc{root}}
\newcommand{\est}{\widehat}
\newcommand{\PRED}{\textsc{pre}}
\newcommand{\IDEN}{1}
\newcommand{\DevSeqSet}{\DevSet}
\newcommand{\parent}{\mathbb{p}}
\newcommand{\devSeq}{\dev}
\newcommand{\parentAction}{\mathbb{a}}
\newcommand{\infoSetOf}{\mathbb{I}}


% Symbols for policies, rewards, states, and the k-of-N measure.
% \policy and \reward are also used by alias macros defined earlier in this preamble.
\newcommand{\RewardFunctionDist}{\mathcal{R}}
\newcommand{\policy}{\pi}
\newcommand{\reward}{r}
\newcommand{\PolicySet}{\Pi}
\newcommand{\StateSet}{\mathcal{S}}
\newcommand{\advantage}{\rho}
\newcommand{\Mdp}{\mathcal{G}}
\newcommand{\initialStateDist}{d_0}
\newcommand{\kOfN}{k\text{-of-}N}
\newcommand{\kOfNMeasure}{\mu_{\kOfN}}
\newcommand{\stateValueFn}{q}

% Reward-function and return symbols.
\newcommand{\RewardFn}{\reward}
\newcommand{\RandomRewardFn}{R}
\newcommand{\RandomReturn}{G}

% Small-caps labels.
\newcommand{\MINIMAX}{\textsc{minimax}}
\newcommand{\MAXIMIN}{\textsc{mxmn}}

% Calligraphic L, used for sets of loss functions in the document body.
\newcommand{\LossSet}{\mathcal{L}}

% Named operators (starred: sub/superscripts become limits in display style).
\DeclareMathOperator*{\SortFn}{Sort}
\DeclareMathOperator*{\SortBy}{SortBy}
\DeclareMathOperator*{\numCols}{cols}
\DeclareMathOperator*{\numRows}{rows}
\DeclareMathOperator*{\Ucb}{UCB1}
\DeclareMathOperator*{\SuccessiveRejects}{SR}
\DeclareMathOperator*{\WorstKOfNLossesFn}{\LossSet_{\kOfN}}
% \percentile expands to \eta (the same glyph as \regularizationWeight).
\def\percentile{\eta}
\DeclareMathOperator*{\WorstFactileLossesFn}{\LossSet_{\percentile}}
\DeclareMathOperator*{\supportFn}{supp}

% Test-case symbols. \testTuple is the pair <\tau, \hat{\sigma}_{\tau}>.
\newcommand{\mTestCasesToSelect}{m}
\newcommand{\jointTestCaseDistributionPolicyUncertainty}{\Psi}
\newcommand{\testCaseSet}{\mathcal{T}}
\newcommand{\testCaseGroupDecisionLabel}{\testCaseSet}
\newcommand{\testCase}{c}
\newcommand{\testCaseGroup}{\tau}
\newcommand{\testCaseDistribution}{\sigma}
\newcommand{\testTuple}{\tuple{\testCaseGroup, \hat{\testCaseDistribution}_{\testCaseGroup}}}

% Small-caps labels, a difference symbol, and the gradient-bound symbol G.
\newcommand{\tuningLabel}{\textsc{tnp}}
\newcommand{\deploymentLabel}{\textsc{cdp}}
\newcommand{\simLabel}{\textsc{sim}}
\newcommand{\seqLabel}{\textsc{seq}}
\newcommand{\differenceFn}{\mathit{\Delta}}
\newcommand{\maxGrad}{G}
% Text/number constants and further symbols, defined with \def (no redefinition check).
\def\cvarPercentile{1\%}
\def\probMeasure{\mu}
\def\IntegrableFnSet{\mathcal{Y}}
\def\integrableFn{y}
\def\numHoldoutReplicas{100}
 
\usepackage{xspace}

% Text abbreviation macros. Each is delimited by a trailing slash, e.g. `\eg/`.
\makeatletter
% \onedot prints a period unless the next token is already a period, then
% hands over to \xspace to decide whether a following space is needed.
\DeclareRobustCommand\onedot{\futurelet\@let@token\@onedot}
\def\@onedot{\ifx\@let@token.\else.\null\fi\xspace}

\def\eg/{\emph{e.g}\onedot} \def\Eg/{\emph{E.g}\onedot}
\def\ie/{\emph{i.e}\onedot} \def\Ie/{\emph{I.e}\onedot}
% "cf." abbreviates the single word "confer", so it takes no internal period.
\def\cf/{\emph{cf}\onedot} \def\Cf/{\emph{Cf}\onedot}
\def\vs/{\emph{vs}\onedot} \def\Vs/{\emph{Vs}\onedot}
\def\etc/{\emph{etc}\onedot}
\def\wrt/{with respect to} \def\dof/{d.o.f\onedot}
\def\etal/{\emph{et al}\onedot}
\def\viceversa/{\emph{vice-versa}}
\def\ow/{\emph{o.w}\onedot}
\def\whp/{w.h.p\onedot}
\def\apriori/{\emph{a priori}} \def\Apriori/{\emph{A priori}}
\def\ala/{\`{a} la}

% Accented words and algorithm names with a superscript plus.
\def\naive/{na\"{\i}ve} \def\Naive/{Na\"{\i}ve}
\def\rmPlus/{regret matching\textsuperscript{+}}
\def\rrmPlus/{RRM\textsuperscript{+}}
\def\rcfrPlus/{RCFR\textsuperscript{+}}
\def\cfrPlus/{CFR\textsuperscript{+}}
% Guarded by \@ifdefinable so an existing definition of \Politex is not overwritten silently.
\@ifdefinable{\Politex/}{\def\Politex/{\textsc{Politex}}}

\def\NashConv/{\textsc{NashConv}}
% Overlined variant; switches to math mode itself, so use it in text mode.
\def\NashConvAUC/{$\overline{\textsc{NashConv}}$}

% Small-caps labels.
\def\heads/{\textsc{heads}}
\def\tails/{\textsc{tails}}
\def\even/{\textsc{even}}
\def\odd/{\textsc{odd}}

\makeatother

% \rulesep: removes preceding glue, then inserts a vertical rule with a space on each side.
\newcommand{\rulesep}{\unskip\ \vrule\ }
% \textbxf{<text>}: sets <text> in font series `b`.
\newcommand{\textbxf}[1]{{\fontseries{b}\selectfont #1}}

% Slash-delimited text macros; \GranTurismo7/ must be written with the literal "7/".
\def\rampName/{ramp}
\def\RampName/{Ramp}
\def\GranTurismo7/{Gran Turismo\textsuperscript{\texttrademark} 7}
 \expandafter\newif\csname ifGin@setpagesize\endcsname
% \todonote[<todo options>]{<tag>}{<colour>}{<text>}: steps the counter
% <tag>NoteCounter (creating it on first use) and emits a \todo note labelled
% with the upper-cased <tag> and the counter value as a subscript, bordered in
% <colour>. The optional argument defaults to `inline`.
% NOTE(review): \todo is presumably provided by the todonotes package, which is
% not loaded in the visible preamble -- confirm before using these macros.
\newcommand{\todonote}[4][inline]{\safeIncCounter{#2NoteCounter}
  \todo[color=offWhite,bordercolor=#3,linecolor=#3,#1]{\textbf{\uppercase{#2}$_{\arabic{#2NoteCounter}}$:}~#4}}

% \dmnote[<todo options>]{<text>}: \todonote with tag "DM" and colour amiiPink.
% Its optional argument defaults to empty, which overrides \todonote's `inline` default.
\newcommand{\dmnote}[2][]{\todonote[#1]{DM}{amiiPink}{#2}}
\def\dustin{\dmnote}

% \replaced{<prefix>}{<arrow>}{<text>}: emits an inline \todo note showing
% <arrow>, "Replaced (<n>)" and <text>, where <n> is the current value of the
% counter Replaced<prefix>NoteCounter.
% The counter name is written as `Replaced\counterPrefix NoteCounter` (the space
% only terminates the control word); writing `\counterPrefix{}` would put literal
% braces into the counter name, so it would never match the counter created by
% \safeIncCounter in \replacedStart.
\newcommand{\replaced}[3]{\def\counterPrefix{#1}
  \def\arrowMarker{#2}
  \def\replacedText{#3}
  \todo[color=offWhite,bordercolor=red,inline]{$\bs{\arrowMarker}$ \textbf{Replaced (\arabic{Replaced\counterPrefix NoteCounter})} \replacedText }}

% \replacedStart{<prefix>}{<text>}: steps Replaced<prefix>NoteCounter (creating
% it on first use) and emits the opening (down-arrow) marker with <text>.
% The text is stored in \replacedStartText rather than \text, so amsmath's
% \text is not overwritten for the rest of the enclosing group.
\newcommand{\replacedStart}[2]{\def\user{#1}
  \def\replacedStartText{#2}
  \safeIncCounter{Replaced#1NoteCounter}\replaced{\user}{\downarrow}{\replacedStartText}}
% \replacedEnd{<prefix>}: emits the closing (up-arrow) marker with no text.
\newcommand{\replacedEnd}[1]{\def\user{#1}
  \replaced{\user}{\uparrow}{}}

% \issue{<text>}{<arrow>}{<id>}: black inline \todo note with white text showing
% <arrow>, "Issue #<id>", the current value of counter Issue<id>NoteCounter as
% the part number, and <text>.
\newcommand{\issue}[3]{\todo[color=black,inline]{\textcolor{white}{$\bs{#2}$ \textbf{Issue \##3} (Part \arabic{Issue#3NoteCounter}) #1}}}

% \issueChangeStart[<text>]{<id>}: steps Issue<id>NoteCounter (creating it on
% first use) and emits the down-arrow marker.
% \issueChangeEnd[<text>]{<id>}: emits the up-arrow marker without stepping the
% counter, so it relies on the counter already existing.
\newcommand{\issueChangeStart}[2][]{\safeIncCounter{Issue#2NoteCounter}\issue{#1}{\downarrow}{#2}}
\newcommand{\issueChangeEnd}[2][]{\issue{#1}{\uparrow}{#2}}

% \copied{<arrow>}{<label>}: inline \todo note with a green border showing
% <arrow>, "Copied (<n>)" and <label>, where <n> is the value of CopiedNoteCounter.
\newcommand{\copied}[2]{\def\arrowMarker{#1}
  \def\copiedLabel{#2}
  \todo[color=offWhite,bordercolor=green,inline]{$\bs{\arrowMarker}$ \textbf{Copied (\arabic{CopiedNoteCounter})} \copiedLabel }}

% \copiedStart{<label>}: steps CopiedNoteCounter (creating it on first use) and
% emits the down-arrow marker with <label>.
% \copiedEnd: emits the up-arrow marker with an empty label.
\newcommand{\copiedStart}[1]{\def\copiedStartLabel{#1}
  \safeIncCounter{CopiedNoteCounter}
  \copied{\downarrow}{\copiedStartLabel}}
\newcommand{\copiedEnd}{\copied{\uparrow}{}}

% \placeholderText{<text>}: white <text> on a black box.
% The trailing % suppresses the end-of-line space that would otherwise be
% typeset after the box on every use.
\newcommand{\placeholderText}[1]{\colorbox{black}{\textcolor{white}{#1}}%
}
 
\title{Composing Efficient, Robust Tests for Policy Selection (Supplementary Material)}


\author[1]{\href{mailto:<dustin.morrill@sony.com>?Subject=Your UAI 2023 paper}{Dustin Morrill}}
\author[1]{Thomas J. Walsh}
\author[1]{Daniel Hernandez}
\author[1]{Peter R. Wurman}
\author[1,2]{Peter Stone}
\affil[1]{Sony AI\\
    New York, NY, USA
}
\affil[2]{Department of Computer Science\\
    The University of Texas at Austin\\
    Austin, TX USA
}

\begin{document}
\onecolumn
\maketitle



\section{Glossary}

\begin{description}
    \item[Policy.] A policy to solve a control problem or play a game, potentially generated by an RL algorithm.
    \item[Deployment policy.] A policy used in production, \eg/, deployed to end users, used in a competition, or integrated into a technology demonstration.
    \item[Deployment candidate.] A policy in consideration for deployment.
    \item[Test.] The aggregate result of test cases applied to a policy.
    \item[Test case.] An atomic unit of a test that reveals a particular skill or emulates a specific deployment scenario. RPOSST selects a small number of test cases and a distribution over them so that we can avoid executing all conceivable test cases on every deployment candidate every time we want to deploy a policy.
    \item[Test case result.] The numerical result of evaluating a policy on a test case.
        This number should be a good estimate of the policy's expected performance in the test case scenario, but it may be noisy if the test case is stochastic, \eg/, the average test case result observed from Monte Carlo rollouts.
    \item[Test score.] The final score produced by a test, \ie/, the average test case result across test cases, perhaps weighted by the relative importance of each test case.
    \item[Tuning policy.] A policy used at the start of the RPOSST procedure to gather information about test cases. Each tuning policy is evaluated on each test case to construct the test case result matrix that forms the basis of RPOSST's loss function.
\end{description}

\section{Theory Background}

We make use of six basic results, which are restated here for completeness.
\begin{proposition}[Azuma-Hoeffding inequality]
  \label{prop:azumaHoeffdingInequality}
  For constants $\tuple{c^t \in \reals}_{t = 1}^T$, martingale difference sequence $\tuple{Y^t \in \reals}_{t = 1}^T$ where $\abs{Y^t} \le c^t$ for each $t$, and $\tau \ge 0$,
  \[
    \Prob\subblock*{\abs*{\sum_{t = 1}^T Y^t} \ge \tau}
      \le 2 \exp\subex*{\frac{-\tau^2}{2 \sum_{t = 1}^T (c^t)^2}}.
  \]
\end{proposition}
For proof, see that of Theorem 3.14 by \citet{azumaHoeffdingInequality}.
\begin{proposition}[Regret matching$^+$ regret bound]
    \label{prop:rmp-regret-bound}
    Consider an online decision process with $\mTestCasesToSelect$ actions and the set of bounded, linear loss functions,
    $\LossSet = [0, \maxLoss]^{\mTestCasesToSelect}$.
    Regret matching$^+$ accumulates pseudoregrets
    $q^{1:t} = \ramp{q^{1:t - 1} + \regret^t}$, $q^{1:0} = \zeros$,
    where
    $\regret^t = (\ell^t)^{\top} \testCaseDistribution^t - \ell^t$
    is the instantaneous regret on round $t$ under loss function $\ell^t \in \LossSet$, and
    $\testCaseDistribution^t = q^{1:t - 1} / (\ones^{\top} q^{1:t - 1})$
    if
    $\ones^{\top} q^{1:t - 1} > 0$
    or
    $\testCaseDistribution^t = \frac{1}{\mTestCasesToSelect} \ones$ otherwise,
    is regret matching$^+$'s action distribution on round $t$.
    After $T$ rounds, regret matching$^+$'s cumulative regret is bounded as
    $\sum_{t = 1}^T \regret^t
        \le \maxLoss \sqrt{T \mTestCasesToSelect}$.
\end{proposition}
For proof, see \citet{solvingHulhe}.
\begin{proposition}[The linearization trick]
    \label{prop:linearization-trick}
    Consider an online decision process with convex decision set $\DecisionSet \subseteq \reals^{\mTestCasesToSelect}$ and a set of bounded, convex loss functions
    $\LossSet \subseteq \set*{\ell \where \ell: \DecisionSet \to [0, \maxLoss]}$,
    where each loss function $\ell \in \LossSet$ has subgradients with bounded maximum magnitude, \ie/,
    $\norm{\grad \ell(\odpDecision)}_{\infty} \le \maxGrad$,
    for all
    $\odpDecision \in \DecisionSet$.
    The instantaneous regret under loss function $\ell \in \LossSet$ is upper bounded by the instantaneous regret under the loss function subgradient $\grad \ell(\odpDecision)$ given decision $\odpDecision \in \DecisionSet$, \ie/,
    \[
        \ell(\odpDecision) - \ell(\odpDecision')
            \le
                \subex*{\grad \ell(\odpDecision)}^{\top} \odpDecision - \subex*{\grad \ell(\odpDecision)}^{\top} \odpDecision'.
    \]
\end{proposition}
\begin{proof}
    From the convexity of $\ell$, its first-order Taylor expansion lower bounds $\ell$, \ie/,
    $\ell(\odpDecision')
        \ge
            \ell(\odpDecision)
            + \subex*{\grad \ell(\odpDecision)}^{\top}
            \subex*{ \odpDecision' - \odpDecision}$,
    for all $\odpDecision, \odpDecision' \in \DecisionSet$.
    Therefore,
    \begin{align*}
        \ell(\odpDecision) - \ell(\odpDecision')
            &\le
                \ell(\odpDecision)
                - \subex*{
                    \ell(\odpDecision)
                    + \subex*{\grad \ell(\odpDecision)}^{\top} \subex*{ \odpDecision' - \odpDecision }
                }\\
            &=
                \subex*{\grad \ell(\odpDecision)}^{\top} \odpDecision
                - \subex*{\grad \ell(\odpDecision)}^{\top} \odpDecision',
    \end{align*}
    as required.
\end{proof}
\begin{proposition}[Lemma 2 of \citet{ED,exploitabilityDescentArxiv}]
    \label{prop:best-optimality}
    Assume that on each round $t$ of an online decision process with decision set $\DecisionSet \subseteq \reals^{\mTestCasesToSelect}$ and bounded loss functions from
    $\LossSet \subseteq \set*{\ell \where \ell: \DecisionSet \to [0, \maxLoss]}$,
    the loss function $\ell^t$ maximizes the loss of $\odpDecision^t \in \DecisionSet$ chosen by the decision-maker, \ie/,
    $\ell^t
        \in \argmax_{\ell \in \LossSet}
            \ell(\odpDecision^t)$.
    On the round $t^*$ where the minimum loss was observed,
    $t^* \in \argmin_{t \in \set{1, \ldots, T}}
        \ell^t(\odpDecision^t)
    $,
    the decision $\odpDecision^{t^*}$ has a maximum loss that is no more than $\frac{1}{T} \regret^{1:T}(\odpDecision)$ larger than that of any alternative decision $\odpDecision \in \DecisionSet$, \ie/,
    $\ell^{t^*}(\odpDecision^{t^*}) - \ell^{\odpDecision}(\odpDecision) \le \frac{1}{T} \regret^{1:T}(\odpDecision)$,
    where $\ell^{\odpDecision} \in \LossSet$ is a loss function that maximizes the loss on $\odpDecision$.
\end{proposition}
\begin{proof}
    Since the loss function on each round is chosen to maximize loss, the average regret for not choosing $\odpDecision \in \DecisionSet$ is lower bounded as
    \begin{align*}
        \dfrac{1}{T} \regret^{1:T}(\odpDecision)
            &\ge
                \dfrac{1}{T} \min_{t \in \set{1, \ldots, T}} T \ell^t(\odpDecision^t) - \dfrac{1}{T} \sum_{t = 1}^T \ell^t(\odpDecision)\\
            &\ge
                \ell^{t^*}(\odpDecision^{t^*}) - \ell^{\odpDecision}(\odpDecision),
    \end{align*}
    as required.
\end{proof}
\begin{proposition}[Theorem 4 of \citet{cfrbr}]
    \label{prop:sample-optimality}
    Assume that on each round $t$ of an online decision process with decision set $\DecisionSet \subseteq \reals^{\mTestCasesToSelect}$ and bounded (possibly random) loss functions from
    $\LossSet \subseteq \set*{\ell \where \ell: \DecisionSet \to [0, \maxLoss]}$,
    the loss function $\ell^t$ maximizes the loss of $\odpDecision^t \in \DecisionSet$ chosen by the decision-maker, \ie/,
    $\ell^t
        \in \argmax_{\ell \in \LossSet}
            \ell(\odpDecision^t)$.
    The loss function that the decision-maker observes on each round $t$ may be a random loss function $\hat{\ell}^t$ where $\E\subblock*{\hat{\ell}^t} = \ell^t$.
    On round
    $T' \sim \Unif(\set{1, \ldots, T})$
    after $T$ rounds of the online decision process, the decision $\odpDecision^{T'}$ has a maximum loss that is no more than
    $\frac{1}{qT} \regret^{1:T}(\odpDecision)$
    larger than that of any alternative decision $\odpDecision \in \DecisionSet$ with probability $1 - q$, $q \in (0, 1]$, \ie/,
    $\ell^{T'}(\odpDecision^{T'}) - \ell^{\odpDecision}(\odpDecision) \le \frac{1}{qT} \regret^{1:T}(\odpDecision)$
    holds with probability $1 - q$,
    where $\ell^{\odpDecision} \in \LossSet$ is a loss function that maximizes the loss on $\odpDecision$
    and the cumulative regret $\regret^{1:T}$ is with respect to the expected loss functions, $\tuple{\ell^t}_{t = 1}^T$.
\end{proposition}
See \citet{cfrbr} for proof.
\begin{proposition}[Successive Rejects error probability]
    \label{prop:sr-error-prob}
    Consider a best action identification task with $\mTestCasesToSelect$ actions from set $\Actions$.
    Each time an action $a \in \Actions$ is selected, a random sample of that action's loss, $\ell(a) \in [-0.5, 0.5]$, under a fixed but random loss function $\ell$, is observed.
    The goal is to identify an action $a^* \in \Actions^* \subset \Actions$ with the lowest expected loss, $\E\subblock*{\ell(a^*)}$, after $T$ samples.
    The probability that the action returned by the Successive Rejects algorithm is in $\Actions^*$ is at least
    \[
        1 - \dfrac{\mTestCasesToSelect\subex{ \mTestCasesToSelect - 1}}{2}
            \exp\subex*{ - \dfrac{T - \mTestCasesToSelect}{\overline{\log}(\mTestCasesToSelect)H_2} },
    \]
    where
    $\overline{\log}(\mTestCasesToSelect) = \frac{1}{2} + \sum_{i = 2}^{\mTestCasesToSelect} \frac{1}{i}$,
    $H_2 = \max_{i \in \set{1, \ldots, \abs{\Actions \setminus \Actions^*}}}
        \frac{i}{\subex*{\E\subblock*{\ell(a_{(i)})} - \E\subblock*{\ell(a^*)}}^2}$,
    and $a_{(i)}$ is the action that achieves the $i^{\text{th}}$ smallest loss (with ties broken arbitrarily) among the suboptimal actions.
\end{proposition}
See \citet{audibert2010BestArmIdentification} for proof.


\section{Sequential-Move Model Theory}

\begin{lemma}
  \label{lem:regret_matching_k_of_n_hp_regret_bound}
  Consider a $k$-of-$N$ game with $\mTestCasesToSelect$ actions and the set of bounded, convex loss functions
  $\LossSet = \set*{\ell \where \ell: \simplex^{\mTestCasesToSelect} \to [0, \maxLoss]}$,
  where each loss function $\ell \in \LossSet$ has subgradients with bounded maximum magnitude, \ie/,
  $\norm{\grad \ell(\testCaseDistribution)}_{\infty} \le \maxGrad$,
  for all
  $\testCaseDistribution \in \simplex^{\mTestCasesToSelect}$.
  Let the $k$-worst loss functions from $N$ of those sampled from the given uncertainty distribution $\jointTestCaseDistributionPolicyUncertainty$ on round $t$ be
  $\tuple{ \ell^t_{(i)} \in \LossSet }_{i = 1}^k$.
  The randomly sampled $k$-of-$N$ loss function on round $t$ is then the average
  $\bar{\ell}^t
      = \frac{1}{k} \sum_{i = 1}^k \ell^t_{(i)}$.
  After $T$ rounds, regret matching$^+$ on the random loss gradients $\grad \bar{\ell}^t(\testCaseDistribution^t)$ has no more than
  $2 \maxGrad \sqrt{T \mTestCasesToSelect} + 2 \maxLoss \sqrt{2 T \log{\nicefrac{1}{p}}}$
  cumulative regret on the expected $k$-of-$N$ losses, $\tuple*{ \E\subblock{ \bar{\ell}^t } }_{t = 1}^T$, with probability $1 - p$, $p > 0$.
\end{lemma}
\begin{proof}
  Since regret matching$^+$ observes and learns directly from $\grad \bar{\ell}^t$, its regret for not always choosing $\testCaseDistribution \in \simplex^{\mTestCasesToSelect}$, under the sampled loss functions, is deterministically upper bounded as
  \[
      \Regret^{1:T}
          = \sum_{t = 1}^T
              \underbrace{
                  \bar{\ell}^t(\testCaseDistribution^t)
                  - \bar{\ell}^t(\testCaseDistribution)
              }_{\as \Regret^t}
          \le 2 \maxGrad \sqrt{T \mTestCasesToSelect},
  \]
  where $\tuple{\testCaseDistribution^t \in \simplex^{\mTestCasesToSelect}}_{t = 1}^T$ are the decisions made by regret matching$^+$.
  This bound comes from regret matching$^+$'s regret bound on linear losses (\cref{prop:rmp-regret-bound}) and the linearization trick (\cref{prop:linearization-trick}), which states that the regret on loss gradients upper bounds that of the loss itself, \ie/,
  $\Regret^{1:T}
      \le
          \sum_{t = 1}^T
              \subex*{\grad \bar{\ell}^t(\testCaseDistribution^t)}^{\top} \testCaseDistribution^t
              - \subex*{\grad \bar{\ell}^t(\testCaseDistribution^t)}^{\top} \testCaseDistribution$.

  The rest of the proof largely follows the proof of \citet{farina2020stochasticRegretMin}'s Proposition 1.
  The sequence of differences,
  $\tuple*{
      \E\subblock*{\Regret^t} - \Regret^t \le 2\maxLoss
  }_{t = 1}^T,$
  is a bounded martingale difference sequence.

  The probability that the expected cumulative regret,
  $\E[\Regret^{1:T}]$,
  is bounded by the cumulative sampled regret plus slack
  $\lambda \ge 0$ is lower bounded according to the one-sided form of the Azuma-Hoeffding inequality (\cref{prop:azumaHoeffdingInequality}) as
  \begin{align}
  &\Prob\subblock*{\E[\Regret^{1:T}] \le \Regret^{1:T} + \lambda}\\
  &=
      \Prob\subblock*{\sum_{t = 1}^T \E[\Regret^t] - \Regret^t \le \lambda}\\
  &\ge
      1 - \Prob\subblock*{
      \sum_{t = 1}^T
          \E[\Regret^t] - \Regret^t
      \ge \lambda}\\
  &\ge
      1 - \exp\subex*{-\dfrac{2\lambda^2}{4T \subex*{2\maxLoss}^2}}.
  \end{align}
  Setting $\lambda = 2\maxLoss \sqrt{2T\log(\nicefrac{1}{p})}$ ensures that
  \[
  \E[\Regret^{1:T}]
      \le \Regret^{1:T} + 2\maxLoss \sqrt{2 T \log{\nicefrac{1}{p}}}
  \]
  with probability $1 - p$.
  Since $\Regret^{1:T} \le 2 \maxGrad \sqrt{T \mTestCasesToSelect}$,
  \[
  \E[\Regret^{1:T}]
      \le 2 \maxGrad \sqrt{T \mTestCasesToSelect} + 2\maxLoss \sqrt{2 T \log{\nicefrac{1}{p}}}
  \]
  with probability $1 - p$, as required.
\end{proof} 
\begin{theorem}
    \label{thm:rposst-seq}
    After $T' \sim \Unif(\set{1, \ldots, T_1})$, $T_1 > 0$, rounds of its optimization game, \cref{alg:rposst-seq} selects an $\mTestCasesToSelect$-tuple of test cases, $\testCaseGroup^*$, and weights
$\hat{\testCaseDistribution}^{T'}_{\testCaseGroup^*} \in \simplex^{\mTestCasesToSelect}$
that, with probability $(1 - p)(1 - q)(1 - \alpha)$, $p, q, \alpha > 0$, are $\frac{\gap}{q}$-optimal for \cref{eq:rposst-seq-objective}, where
$\gap = \bigO\subex*{ \sqrt{\frac{1}{T_1} \mTestCasesToSelect} + \sqrt{\frac{1}{T_1} \log\subex*{ \nicefrac{1}{p} }} }$
and
$\alpha = \bigO\subex*{\e^{-T_2}}$. \end{theorem}
\begin{proof}
    \def\kOfNLossFn{L_{\probMeasure_{\kOfN}, \jointTestCaseDistributionPolicyUncertainty}}
    Recall that the $k$-of-$N$ loss $\bar{\ell}^t$ that RPOSST$_{\seqLabel}$ updates from on each round $t = 1, \ldots, T_1$ is a Monte Carlo estimate of the $k$-of-$N$ percentile loss,
    \begin{align}
        \kOfNLossFn(\hat{\testCaseDistribution}^t_{\testCaseGroup})
            = \inf_{\integrableFn \in \IntegrableFnSet}
        \hspace{-1em}
        \underset{
            \substack{
                \percentile \in [0, 1]\\
                \Prob_{\policy_j, \testCaseDistribution} \subblock*{
					\ell(\hat{\testCaseDistribution}^t_{\testCaseGroup}; \policy_j, \testCaseDistribution) \le \integrableFn(\eta)
                } \ge \percentile
            }
        }{\int}
            \hspace{-3em}
            \integrableFn(\eta)
            \probMeasure_{\kOfN}(d\eta)
            = \E_{\policy_j, \testCaseDistribution}\subblock{ \bar{\ell}^t },
        \label{eq:loss-given-test-case-group}
    \end{align}
    where $(\policy_j, \testCaseDistribution) \sim \jointTestCaseDistributionPolicyUncertainty$.
    The sequence of test case weights,
    $\tuple{ \testCaseDistribution_{\testCaseGroup}^t }_{t = 1}^{T_1}$,
    for each $\mTestCasesToSelect$-tuple of test cases $\testCaseGroup \subset \testCaseSet$ is therefore random.
    All of the following probabilities and expectations are with respect to these random variables.

    \Cref{lem:regret_matching_k_of_n_hp_regret_bound} guarantees that RPOSST$_{\seqLabel}$, in generating the test case weight sequence $\tuple{ \testCaseDistribution_{\testCaseGroup}^t }_{t = 1}^{T_1}$ has no more than
    $C = 2 \maxGrad \sqrt{T_1 \mTestCasesToSelect} + 2\maxLoss \sqrt{2 T_1 \log{\nicefrac{1}{p}}}$
    cumulative regret on the $k$-of-$N$ percentile losses,
    \[\regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}}
        = \sum_{t = 1}^{T_1}
            \kOfNLossFn(\hat{\testCaseDistribution}^t_{\testCaseGroup})
            - \kOfNLossFn(\testCaseDistribution_{\testCaseGroup}),
    \]
    for not always selecting test case weights $\testCaseDistribution_{\testCaseGroup}$,
    with probability $1 - p$.
    That is, $1 - p = \Prob\subblock*{ \regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}} \le C}$.

    \Cref{prop:sample-optimality} guarantees that, on round $T' \sim \Unif(\set{1, \ldots, T_1})$, the weights for each $\mTestCasesToSelect$-tuple are
    $\frac{1}{qT_1} \regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}}$
    close to optimal for \cref{eq:loss-given-test-case-group}, with probability $1 - q$.
    That is,
    $1 - q
        = \Prob\subblock*{
            \kOfNLossFn(\hat{\testCaseDistribution}^{T'}_{\testCaseGroup})
            - \kOfNLossFn(\testCaseDistribution_{\testCaseGroup})
                \le \frac{\regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}}}{qT_1}
        }$,
    and this holds regardless of the value of $\regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}}$, \ie/,
    $\Prob\subblock*{
        \kOfNLossFn(\hat{\testCaseDistribution}^{T'}_{\testCaseGroup})
        - \kOfNLossFn(\testCaseDistribution_{\testCaseGroup})
            \le \frac{\regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}}}{qT_1}
    } = \Prob\subblock*{
        \kOfNLossFn(\hat{\testCaseDistribution}^{T'}_{\testCaseGroup})
        - \kOfNLossFn(\testCaseDistribution_{\testCaseGroup})
            \le \frac{\regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}}}{qT_1}
        \given
        \regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}} \le C'
    }$
    for all $C' \in \reals$.

    Combining these two results, we see that the probability that $\hat{\testCaseDistribution}^{T'}_{\testCaseGroup}$ has at most $\frac{C}{qT_1}$ excess $k$-of-$N$ percentile loss is
    \begin{align}
        \Prob\subblock*{
            \kOfNLossFn(\hat{\testCaseDistribution}^{T'}_{\testCaseGroup})
            - \kOfNLossFn(\testCaseDistribution_{\testCaseGroup})
                \le \frac{C}{qT_1}
        }
            &= \Prob\subblock*{
                \kOfNLossFn(\hat{\testCaseDistribution}^{T'}_{\testCaseGroup})
                - \kOfNLossFn(\testCaseDistribution_{\testCaseGroup})
                    \le \frac{\regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}}}{qT_1},
                \regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}} \le C
            }\\
            &= \Prob\subblock*{
                \kOfNLossFn(\hat{\testCaseDistribution}^{T'}_{\testCaseGroup})
                - \kOfNLossFn(\testCaseDistribution_{\testCaseGroup})
                    \le \frac{\regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}}}{qT_1}
                \given
                \regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}} \le C
            } \Prob\subblock*{
                \regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}} \le C
            }\\
            &= \Prob\subblock*{
                \kOfNLossFn(\hat{\testCaseDistribution}^{T'}_{\testCaseGroup})
                - \kOfNLossFn(\testCaseDistribution_{\testCaseGroup})
                    \le \frac{\regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}}}{qT_1}
            } \Prob\subblock*{
                \regret^{1:T_1}_{\testCaseDistribution_{\testCaseGroup}} \le C
            }\\
            &= (1 - p)(1 - q).
    \end{align}

    The last remaining step is to complete the outer minimization in \cref{eq:rposst-seq-objective} to select a single $\mTestCasesToSelect$-tuple of test cases.
    Since the $k$-of-$N$ loss observed on each round is random, we cannot compute a simple argmin using the test case weights on round $T'$, and are instead faced with a best arm identification problem.
    For this, we run the Successive Rejects algorithm, which we know from \cref{prop:sr-error-prob} identifies a minimum loss $\mTestCasesToSelect$-tuple of test cases with probability at least $1 - \alpha$, where
    \[
        \alpha = \dfrac{\mTestCasesToSelect\subex{ \mTestCasesToSelect - 1}}{2}
            \exp\subex*{ - \dfrac{T_2 - \mTestCasesToSelect}{\overline{\log}(\mTestCasesToSelect)H_2} }.
    \]
    The probability of selecting the best $\mTestCasesToSelect$-tuple using the test case weights on round $T'$ is independent of whether or not the regret bound $C$ was actually achieved or if the test case weights on $T'$ are actually nearly optimal for any given $\mTestCasesToSelect$-tuple, the probability of which we previously characterized as $(1 - p)(1 - q)$.
    Therefore, the probability of achieving $\frac{C}{qT_1}$-optimality given each $\mTestCasesToSelect$-tuple and selecting the best $\mTestCasesToSelect$-tuple is the product $(1 - p)(1 - q)(1 - \alpha)$, as required.
\end{proof}

The $\sqrt{\abs{\testCaseSet}}$ dependence in \cref{thm:rposst-seq} could be improved to $\sqrt{\log\subex*{\abs{\testCaseSet}}}$ if regret matching$^+$ (within or without CFR, respectively) was replaced with an algorithm like Hedge~\citep{hedge}, but this tends to lead to worse performance in practice (see, \eg/, \citet{solvingHulhe,burch2017time}).

In the deterministic CVaR RPOSST case, we get the following corollary.
\begin{corollary}
    \label{cor:deterministic-rposst-seq}
    Assume that
$\jointTestCaseDistributionPolicyUncertainty \in \simplex^d$
for some finite $d \ge 1$.
After $T$ rounds of the CVaR($\percentile$) RPOSST$_{\seqLabel}$ optimization game, where the protagonist chooses $\mTestCasesToSelect$-size tests according to regret matching$^+$ against a best response antagonist,
$\testCaseGroup^*$ and $\testCaseDistribution^{t^*}_{\testCaseGroup^*}$
are $\gap$-optimal for \cref{eq:rposst-seq-objective} under the $\percentile$-fractile CVaR robustness measure, where
$\gap = \bigO\subex*{ \sqrt{\frac{1}{T} \mTestCasesToSelect}}$. \end{corollary}
\begin{proof}
    \Cref{prop:rmp-regret-bound} and \cref{prop:best-optimality} ensure that there is a round $t^*_{\testCaseGroup} \le T$ where
    $\testCaseDistribution_{\testCaseGroup}^{t^*_{\testCaseGroup}}$
    is
    $2 \maxGrad \sqrt{\mTestCasesToSelect \dfrac{1}{T}}$-optimal on the deterministic $k$-of-$N$ losses.
    Since the $k$-of-$N$ loss function observed on each round is deterministic, we can perform a simple minimization across $\set{1, \ldots, T}$ and the $\mTestCasesToSelect$-tuple of test cases to find the minimizers $t^*$ and $\testCaseGroup^*$, leading to the stated optimality guarantee.
\end{proof}

\section{Deterministic CVaR$(\percentile)$ RPOSST$_{\seqLabel}$ Pseudocode}
\begin{algorithm2e}[tb]
    \caption{Deterministic CVaR$(\percentile)$ RPOSST$_{\seqLabel}$ with regret matching$^+$}\label{alg:cvar-rposst-seq}
    \DontPrintSemicolon
    \textbf{\textit{Inputs:}} $\langle \percentile, T, \mTestCasesToSelect, \jointTestCaseDistributionPolicyUncertainty, \testCaseGroup^0, \ell \rangle$
    \vspace{0.2em} \hrule \vspace{0.2em}

    $q^{1:0}_{\testCaseGroup} \gets \zeros \in \reals^{\mTestCasesToSelect + \abs{\testCaseGroup^0}}$ \textbf{for} $\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}$

    $t^* \gets 1$

    $\bar{\cfv}^{t^*} \gets -\infty$

    \For{$t \gets 1, \ldots, T$}{
        \For{$\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}$}{
            $z^t \gets \ones^{\top} q^{1:t - 1}_{\testCaseGroup}$

            $\hat{\testCaseDistribution}^t_{\testCaseGroup} \gets q^{1:t - 1}_{\testCaseGroup} / z^t$ \textbf{if} $z^t > 0$ \textbf{else} $\ones / \mTestCasesToSelect$

            \tcp{Fill in zeros so that $\hat{\testCaseDistribution}^t_{\testCaseGroup} \in \simplex^{\abs{\testCaseSet}}$.}
            $\hat{\testCaseDistribution}^t_{\testCaseGroup}(x) \gets 0$ \textbf{for} $x \in \testCaseSet \setminus (\testCaseGroup \cup \testCaseGroup^0)$

$\subblock*{ \ell_{\testCaseGroup, (i)} }_{i = 1}^d \gets
              \WorstFactileLossesFn\subex*{\hat{\testCaseDistribution}^t_{\testCaseGroup}, \percentile, \jointTestCaseDistributionPolicyUncertainty, \ell}$

            $\cfv^t_{\testCaseGroup} \gets
              -\sum_{i = 1}^d \dfrac{\partial \ell_{\testCaseGroup, (i)}}{\partial \hat{\testCaseDistribution}^t_{\testCaseGroup}}$

            \tcp{Update regret matching$^+$.}
            $\bar{\cfv}_{\testCaseGroup}^t \gets (\hat{\testCaseDistribution}^t_{\testCaseGroup})^{\top} \cfv^t_{\testCaseGroup}$

            $\regret^t_{\testCaseGroup} \gets \cfv^t_{\testCaseGroup} - \bar{\cfv}_{\testCaseGroup}^t$

            $q^{1:t}_{\testCaseGroup} \gets \ramp{q^{1:t - 1}_{\testCaseGroup} + \regret^t_{\testCaseGroup}}$

            \tcp{Update the best round and the best test.}
            \If{$\bar{\cfv}_{\testCaseGroup}^t > \bar{\cfv}^{t^*}$}{
              $t^* \gets t$

              $\testCaseGroup^* \gets \testCaseGroup$

              $\bar{\cfv}^{t^*} \gets \bar{\cfv}_{\testCaseGroup}^t$
            }
        }
    }

    \Return $\testCaseGroup^*, \hat{\testCaseDistribution}^{t^*}_{\testCaseGroup^*}$

    \vspace{0.5em} \hrule \vspace{0.2em}
    \setcounter{AlgoLine}{0}
    \SetKwProg{Subroutine}{Procedure}{}{}
    \Subroutine{$\WorstFactileLossesFn$ \quad \textbf{Inputs:} $\langle \hat{\testCaseDistribution}, \percentile, \jointTestCaseDistributionPolicyUncertainty, \ell \rangle$}{
      \vspace{0.2em} \hrule \vspace{0.2em}

      \tcp{The support of $\jointTestCaseDistributionPolicyUncertainty$, $\supportFn(\jointTestCaseDistributionPolicyUncertainty)$, is assumed to be finite, with $d = \abs{\supportFn(\jointTestCaseDistributionPolicyUncertainty)}$ elements.}

      \For{$\policy_{j_i}, \testCaseDistribution_i \in \supportFn(\jointTestCaseDistributionPolicyUncertainty)$}{
        \tcp{Evaluate $\hat{\testCaseDistribution}$.}
        $\ell_i \gets \ell(\hat{\testCaseDistribution}; \policy_{j_i},\testCaseDistribution_i)$
      }

      $\SortFn\subex*{
        \set*{ i \where \ell_i }_{i = 1}^d
      }$

      \tcp{Assign weights to each loss function.}
      \tcp{Iterate over $\jointTestCaseDistributionPolicyUncertainty$'s support sorted according to descending loss value from the previous step.}
      $\beta \gets 0$

      \For{$\policy_{j_{(i)}}, \testCaseDistribution_{(i)} \in \supportFn(\jointTestCaseDistributionPolicyUncertainty)$}{
        $\alpha_{(i)} \gets \min \set*{
          \jointTestCaseDistributionPolicyUncertainty\subex*{
            \tuple*{\policy_{j_{(i)}}, \testCaseDistribution_{(i)}}
          },
          \percentile - \beta
        }$

        $\beta \gets \beta + \alpha_{(i)}$
      }

      \Return $\subblock*{ \frac{\alpha_{(i)}}{\percentile} \ell_{(i)} }_{i = 1}^d$
    }
  \end{algorithm2e}
 Pseudocode for CVaR($\percentile$) RPOSST$_{\seqLabel}$ is presented in \cref{alg:cvar-rposst-seq}.

\section{Simultaneous-Move Model}\label{sec:sim-move-model}
We present a more in-depth description of the simultaneous-move antagonist model underlying RPOSST$_{\simLabel}$, as introduced in Section~\ref{section:rposst}. This description is complemented by pseudocode in \cref{alg:rposst-sim}.

In this model, the antagonist does not observe which $\mTestCasesToSelect$-tuple of test cases, $\testCaseGroup$, is sampled from the protagonist's $\hat{\testCaseDistribution}^t_{\testCaseGroupDecisionLabel} \in \simplex^{\abs{\testCaseSet}^{\mTestCasesToSelect}}$ distribution, making the antagonist role more difficult.
The simultaneous move model corresponds to the policy testing use case where a new $\mTestCasesToSelect$-tuple of test cases is sampled independently for each test that is performed.
Effectively, the protagonist and antagonist choose $\testCaseGroup$ and
$\tuple*{\tuple*{
    \policy_{j_{(i)}},
    \testCaseDistribution_{(i)}
}}_{i = 1}^k$
respectively in a simultaneous fashion. In this model, the antagonist must choose a single list of tuples
$\tuple*{\tuple*{
    \policy_{j_{(i)}},
    \testCaseDistribution_{(i)}
}}_{i = 1}^k$
that will lead to a large loss across all of the $\mTestCasesToSelect$-tuples of test cases that the protagonist might choose, thereby preventing the antagonist from exploiting the lacking aspects of each individual $\mTestCasesToSelect$-tuple.

The protagonist in the simultaneous move model must carefully choose $\hat{\testCaseDistribution}^t_{\testCaseGroupDecisionLabel}$ and each $\mTestCasesToSelect$-tuple distribution,
$\subblock*{\hat{\testCaseDistribution}^t_{\testCaseGroup}}_{\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}}$,
to thwart the antagonist.
We organize the protagonist's actions into two sequential decisions: first choosing the $\mTestCasesToSelect$-tuple $\testCaseGroup$ and then choosing $\hat{\testCaseDistribution}^t_{\testCaseGroup}$ given $\testCaseGroup$.
We then use CFR$^+$ to refine both $\hat{\testCaseDistribution}^t_{\testCaseGroupDecisionLabel}$ and each $\hat{\testCaseDistribution}^t_{\testCaseGroup}$ after each round.

\begin{algorithm2e}
  \DontPrintSemicolon
  \textbf{\textit{Inputs:}} $\tuple*{k, N, T, \mTestCasesToSelect, \jointTestCaseDistributionPolicyUncertainty, \testCaseGroup^0, \ell}$
  \vspace{0.2em} \hrule \vspace{0.2em}

  \tcp{Initialize pseudoregrets.}
  $q^{1:0}_{\testCaseGroupDecisionLabel} \gets \zeros \in \reals^{\abs{\testCaseSet^{\mTestCasesToSelect}}}$

  $q^{1:0}_{\testCaseGroup} \gets \zeros \in \reals^{\mTestCasesToSelect + \abs{\testCaseGroup^0}}$ \textbf{for} $\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}$

  \tcp{Initialize average distributions.}
  $\hat{\testCaseDistribution}^{1:0}_{\testCaseGroupDecisionLabel} \gets
      \zeros \in \reals^{\abs{\testCaseSet^{\mTestCasesToSelect}}}$

  $\hat{\testCaseDistribution}^{1:0}_{\testCaseGroup} \gets
      \zeros \in \reals^{\mTestCasesToSelect + \abs{\testCaseGroup^0}}$ \textbf{for} $\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}$

  \For{$t \gets 1, \ldots, T$}{

\tcp{Sample antagonist actions.}
      $\policy_{j_i}, \testCaseDistribution_i \sim \jointTestCaseDistributionPolicyUncertainty$ \textbf{for} $i = 1, \ldots, N$

      \tcp{Generate test case distributions.}
      $z^t_{\testCaseGroupDecisionLabel} \gets \ones^{\top} q^{1:t - 1}_{\testCaseGroupDecisionLabel}$

      $\hat{\testCaseDistribution}^t_{\testCaseGroupDecisionLabel} \gets q^{1:t - 1}_{\testCaseGroupDecisionLabel} / z^t_{\testCaseGroupDecisionLabel}$ \textbf{if} $z^t_{\testCaseGroupDecisionLabel} > 0$ \textbf{else} $\ones / \abs{\testCaseSet^{\mTestCasesToSelect}}$

      \For{$\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}$}{
        $z^t_{\testCaseGroup} \gets \ones^{\top} q^{1:t - 1}_{\testCaseGroup}$

        $\hat{\testCaseDistribution}^t_{\testCaseGroup} \gets q^{1:t - 1}_{\testCaseGroup} / z^t_{\testCaseGroup}$ \textbf{if} $z^t_{\testCaseGroup} > 0$ \textbf{else} $\ones / \mTestCasesToSelect$

        \tcp{Fill in zeros so that $\hat{\testCaseDistribution}^t_{\testCaseGroup} \in \simplex^{\abs{\testCaseSet}}$.}
        $\hat{\testCaseDistribution}^t_{\testCaseGroup}(x) \gets 0$ \textbf{for} $x \in \testCaseSet \setminus (\testCaseGroup \cup \testCaseGroup^0)$
      }

      \tcp{Evaluate the CFR$^+$ distributions.}
      $\ell_i \gets
          (\hat{\testCaseDistribution}^t_{\testCaseGroupDecisionLabel})^{\top}
          \subblock*{
              \ell(\hat{\testCaseDistribution}^t_{\testCaseGroup}; \policy_{j_i}, \testCaseDistribution_i)
          }_{\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}}
      $ \textbf{for} $i = 1, \ldots, N$

      \tcp{Sort to identify the worst $k$.}
      $\SortBy\subex*{
          \subblock*{ \tuple*{\testCaseDistribution_i, \policy_{j_i}} }_{i = 1}^N,
          \subblock*{ \ell_i }_{i = 1}^N
      }$

      \tcp{Update CFR$^+$.}
      \For{$\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}$}{
          $\ell_{\testCaseGroup, (i)} \gets \ell(\hat{\testCaseDistribution}^t_{\testCaseGroup}; \policy_{j_{(i)}}, \testCaseDistribution_{(i)})$
              \textbf{for} $i = 1, \ldots, k$

          $\cfv^t_{\testCaseGroup} \gets \frac{-1}{k} \sum_{i = 1}^k \frac{\partial \ell_{\testCaseGroup, (i)}}{\partial \hat{\testCaseDistribution}^t_{\testCaseGroup}}$

          $\regret^t_{\testCaseGroup} \gets \cfv^t_{\testCaseGroup} - (\hat{\testCaseDistribution}^t_{\testCaseGroup})^{\top} \cfv^t_{\testCaseGroup}$

          $q^{1:t}_{\testCaseGroup} \gets \ramp{q^{1:t - 1}_{\testCaseGroup} + \regret^t_{\testCaseGroup}}$
      }
      $\cfv^t_{\testCaseGroupDecisionLabel} \gets
          \frac{-1}{k} \sum_{i = 1}^k
              \subblock*{
                  \ell_{\testCaseGroup, (i)}
              }_{\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}}$

      $\regret^t_{\testCaseGroupDecisionLabel} \gets \cfv^t_{\testCaseGroupDecisionLabel} - (\hat{\testCaseDistribution}^t_{\testCaseGroupDecisionLabel})^{\top} \cfv^t_{\testCaseGroupDecisionLabel}$

      $q^{1:t}_{\testCaseGroupDecisionLabel} \gets \ramp{q^{1:t - 1}_{\testCaseGroupDecisionLabel} + \regret^t_{\testCaseGroupDecisionLabel}}$

      \tcp{Update average distributions.}
      $\hat{\testCaseDistribution}^{1:t}_{\testCaseGroupDecisionLabel} \gets
          \hat{\testCaseDistribution}^{1:t - 1}_{\testCaseGroupDecisionLabel}
          + t \hat{\testCaseDistribution}^t_{\testCaseGroupDecisionLabel}$

      $\hat{\testCaseDistribution}^{1:t}_{\testCaseGroup} \gets
          \hat{\testCaseDistribution}^{1:t - 1}_{\testCaseGroup}
          + t \hat{\testCaseDistribution}^t_{\testCaseGroupDecisionLabel}(\testCaseGroup) \hat{\testCaseDistribution}^t_{\testCaseGroup}
      $ \textbf{for} $\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}$
  }
  \Return
      $\tuple*{
          \dfrac{
              \hat{\testCaseDistribution}^{1:T}_{\testCaseGroupDecisionLabel}
          }{
              \ones^{\top} \hat{\testCaseDistribution}^{1:T}_{\testCaseGroupDecisionLabel}
          },
          \subblock*{
              \dfrac{
                  \hat{\testCaseDistribution}^{1:T}_{\testCaseGroup}
              }{
                  \ones^{\top} \hat{\testCaseDistribution}^{1:T}_{\testCaseGroup}
              }
          }_{\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}}
      }$
  \caption{RPOSST$_{\simLabel}$ (simultaneous model; CFR$^+$)}
  \label{alg:rposst-sim}
\end{algorithm2e}
 
Instantiating the percentile performance loss of \cref{eq:percentile-performance-loss} for the simultaneous move model, the RPOSST objective is,
\begin{align}
    \min_{
        \substack{
						\hat{\testCaseDistribution}_{\testCaseSet} \in \simplex^{\abs{\testCaseSet}^{\mTestCasesToSelect}}\\
            \subblock*{
								\hat{\testCaseDistribution}_{\testCaseGroup} \in \simplex^{\mTestCasesToSelect}
            }_{\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}}
        }
    }
    \inf_{\integrableFn \in \IntegrableFnSet}
        \hspace{-1.5em}
        \underset{
            \substack{
                \percentile \in [0, 1]\\
                \Prob \subblock*{
                    \E_{
												\testCaseGroup \sim \hat{\testCaseDistribution}_{\testCaseSet}
                    }\subblock*{
												\ell(\hat{\testCaseDistribution}_{\testCaseGroup}; \testCaseDistribution, \policy_j)
                    } \le \integrableFn(\eta)
                } \ge \percentile
            }
        }{\int}
            \hspace{-4.5em}
            \integrableFn(\eta)
            \probMeasure_{\kOfN}(d\eta),
    \label{eq:rposst-sim-objective}
\end{align}
where
$\testCaseDistribution, \policy_j \sim \jointTestCaseDistributionPolicyUncertainty$.

After (linearly) averaging the protagonist's choices of $\hat{\testCaseDistribution}^t_{\testCaseGroupDecisionLabel}$ and $\subblock*{\hat{\testCaseDistribution}^t_{\testCaseGroup}}_{\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}}$ across each round, \cref{alg:rposst-sim} returns the average distributions $\bar{\hat{\testCaseDistribution}}^T_{\testCaseGroupDecisionLabel}$
and
$\subblock*{\bar{\hat{\testCaseDistribution}}^T_{\testCaseGroup}}_{\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}}$.



The simultaneous-move model can be made deterministic using a CVaR measure in the same way as the sequential-move model.
If we fix the ratio $\nicefrac{k}{N}$ and allow $N \to \infty$, the $k$-of-$N$ robustness measure converges toward the CVaR measure at the $\nicefrac{k}{N}$ fractile.
Furthermore, if the distribution characterizing our uncertainty, $\jointTestCaseDistributionPolicyUncertainty$,
is over a discrete set of manageable size, then we can run RPOSST on CVaR robustness measures.
In RPOSST$_{\simLabel}$, the lowest loss test case distributions across all rounds can also be tracked instead of averaging all of the distributions.
%
 

\section{Experimental Details}\label{sec:experimental-details}
\begin{table*}[t]
	\caption{Approximate amount of time required to run $T = 500$ rounds of CVaR($\cvarPercentile$) RPOSST$_{\seqLabel}$ in each domain. Runtimes are similar across both variants in each domain and across holdout policy set sizes.}
	\label{tab:experiment_runtimes}
	\centering
    \begin{tabular}[c]{l|l}
        \hline
        domain & runtime / seed\\
        \hline
        \hline
        Racing Arrows & $\sim 2$ minutes\\
        \hline
        ACPC & $\sim 10$ seconds\\
        \hline
        GT & $\sim 90$ seconds\\
        \hline
    \end{tabular}
\end{table*}

In this section we provide further details on some of the experimental setups used in \cref{section:experiments}.

All CVaR($\cvarPercentile$) RPOSST$_{\seqLabel}$ procedures were run on a 16 core AMD$^{\text{\textregistered}}$ Ryzen 7 5800h CPU with 30.7 GiB of memory.
See \cref{tab:experiment_runtimes} for the time required to run CVaR($\cvarPercentile$) RPOSST$_{\seqLabel}$ on each domain.


\subsection{Racing Arrows}
Racing Arrows is a two-player, zero-sum, one-shot, continuous action game that replicates simple aspects of a passing scenario in a race featuring a ``leader'' player and a faster ``follower'' player.
The goal of the follower is to pass the leader while the goal of the leader
is to block the follower.

Both players privately choose an angle in the half-circle between $0$ and $\pi$ for their arrow.
The speed of each player is represented as the length of their arrow. The leader and follower are assigned a speed
according to their roles, where the leader's speed of $0.8$ is slightly slower than
the follower's speed of $1$ to give the follower a chance to pass.
The distance a player travels is the height of their arrow, \ie/,
$\text{speed} \cdot \sin(\text{angle})$.

The follower is blocked and the leader wins if the difference between the two arrows' angles is below $\nicefrac{\pi}{10}$, that is, the leader is close enough to block the follower.
If the follower is not blocked, then the player who traveled the farthest wins.
Players receive $+1$ for a win, $0$ for a loss, or $0.5$ if they travel exactly the same distance (these payoffs sum to the constant $+1$, which is isomorphic to true zero-sum payoffs).


\subsection{Annual Computer Poker Competition}

The Annual Computer Poker Competition (ACPC) was run to test autonomous poker playing agents from 2006 to 2017.
The logs of play are freely available online.\footnote{\url{http://www.computerpokercompetition.org/downloads/competitions}}
Typically, these competitions are Texas hold'em variants: two-player limit, two-player no-limit, and 3-player limit, where ``limit'' and ``no-limit'' indicate whether players are only allowed to bet in fixed increments or if they can bet any number of chips from their current stack, respectively.
Chip stacks reset to their initial sizes after every hand (Doyle's game) so that players can be evaluated on their average one-hand performance across deck shufflings and seat positions.

To reduce variance, hands are played in duplicate, which means that the same deck order is played out multiple times so that each player has a turn playing with the same hands.
For example, if Alice in seat 1 is dealt the ace and king of spades and Bob in seat 2 is dealt the 2 and 7 of hearts in one hand, then Alice and Bob will also play the same hand in opposite positions, where Bob is dealt the ace and king of spades in seat 1, and Alice is dealt the 2 and 7 of hearts.
Alice's duplicate score is then the number of chips she wins over what Bob won in the same position, averaged across both positions.

Our experiments use duplicate score data, \ie/, a test case result here is a duplicate score between two agents, from the 2012 two-player limit and the 2017 two-player no-limit events.


\subsection{\GranTurismo7/}\label{sec:gt7}

\begin{figure}[!h]
	\centering
	\begin{subfigure}{0.55\linewidth}
			\includegraphics[width=\columnwidth, keepaspectratio]{figures/trial_mountain_screenshot.png}
			\caption{}
			\label{fig:trial-mountain}
	\end{subfigure}
	\hfill
	\begin{subfigure}{0.42\linewidth}
			\includegraphics[width=\linewidth, clip, trim=4em 20em 4em 10em, keepaspectratio]{figures/combined_winrate_matrix.pdf}
			\caption{}
			\label{figure:gt_winrate_matrix}
	\end{subfigure}
 \caption{Figure~\ref{fig:trial-mountain} shows a screenshot of two RL agents racing at the Trial Mountain racetrack. The layout of the track can be seen in the top right. Figure~\ref{figure:gt_winrate_matrix} shows the result matrix for the zero-sum experiment. Blue / red colors indicate positive / negative winrates from the point of view of the column player. Agents 0-2 correspond to the different built-in AIs, with the remaining agents being the trained RL agents sorted according to skill. Diagonal values denote an agent playing against itself, which we artificially set to 50\%.}
\end{figure}








Our \GranTurismo7/ experiments were conducted using the \GranTurismo7/ racing simulator. Previous versions of the \GranTurismo7/ franchise have been used to exhibit reinforcement learning results~\citep{fuchs2021super,song2021autonomous} including outracing top human drivers~\citep{wurman2022outracing}. Note that our focus was not on agent training but rather the problem of selecting the best policy for a deployment, so for training we used the same training parameters reported by \citet{wurman2022outracing}, except for changes to training scenarios to match the track and car combination chosen for this experiment, training only for one-on-one competition, and utilizing a version of self-play to simplify the training process.

The experiment was conducted at the Trial Mountain racetrack (see \cref{fig:trial-mountain}) with the RL policy (and any RL-trained opponent policies) driving a Chevrolet Corvette C7 Stingray '14 using Sport Hard tires. The track and car were chosen because the long straightaways and sharp turns at Trial Mountain led to competitive racing among various RL policies as there are many different areas of the track where passes can occur and the long straightaways allow the agent to use the slipstream of the other car to stay in touch with the car in front.

From a single one-on-one training run we evaluated checkpoints from epochs 5, 200, and then every 75 epochs between epoch 1000 and 4000 for a total of 43 checkpoints. We also evaluated 3 built-in AI agents using cars and tires that made them competitive with the RL agents. Overall we evaluated 46 policies, each of which was considered as a candidate deployment policy or an opponent in a test case.

To create the result matrix shown in \cref{figure:gt_winrate_matrix}, each race was run 20 times with a side-by-side standstill start with the candidate and opponent policies swapping sides half the time to enforce symmetry. An agent would obtain 1 or 0 for winning or losing the race respectively. The diagonal denoting a race between an agent against itself was filled in with $0.5$ entries. As a second experiment on \GranTurismo7/ for a non-zero sum game, using the sportsmanship rule mentioned in \cref{section:experiments} we recomputed the result matrix from \cref{figure:gt_winrate_matrix} so as to penalize trajectories where any car collisions had happened, giving both agents a payoff of $-1$. We remove the entries in the result matrix related to built-in AIs as they are highly collision averse and therefore the sportsmanship constraints would not change their test results, reducing the test case pool size to 43.

\subsection{Supplemental Experimental Results}\label{sec:supplemental-experimental-results}

\begin{figure*}[t]
    \begin{subfigure}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{figures/gt_matrix_frequency_selections_holdout_20/minimax_k_of_n_policies.pdf}
        \caption{\kOfN}
        \label{fig:frequency_matrix_k_of_n}
    \end{subfigure}\hfill \begin{subfigure}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{figures/gt_matrix_frequency_selections_holdout_20/minimax_uniform_policies.pdf}
        \caption{Minimax uniform}
    \end{subfigure}\hfill \begin{subfigure}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{figures/gt_matrix_frequency_selections_holdout_20/iterative_minimax_policies.pdf}
        \caption{Iterative minimax}
    \end{subfigure}\vspace{0.5em}
    \begin{subfigure}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{figures/gt_matrix_frequency_selections_holdout_20/minimax_uniform_assume_uniform_td_policies.pdf}
        \caption{Minimax(TNP) uniform}
    \end{subfigure}\hfill \begin{subfigure}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{figures/gt_matrix_frequency_selections_holdout_20/minimax_uniform_assume_uniform_both_policies.pdf}
        \caption{Miniaverage uniform}
    \end{subfigure}\hfill \begin{subfigure}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{figures/gt_matrix_frequency_selections_holdout_20/minimax_uniform_assume_uniform_policy_policies.pdf}
        \caption{Minimax(TTD) uniform}
    \end{subfigure}\caption{Triangular matrices denoting frequencies of test case pairs chosen by all 6 algorithmic ablations over 100 runs of the winrate GT experiment on a holdout of size 20\%. This visualization is possible because only 2 test cases were chosen as output. The upper triangular matrix from \cref{fig:frequency_matrix_k_of_n} denotes average probability mass given to test case $i$. All other algorithms are limited to uniformly mixing over test cases so the upper triangular matrix is omitted for clarity.}
    \label{fig:frequency_matrices}
\end{figure*}
\begin{table*}[t]
    \caption{Top two test case pairs and corresponding selection frequencies chosen by each algorithm over the 100 seeds in the large GT experiment.}
    \label{tab:top_picks_holdout_20}
    \centering
    \begin{tabular}[c]{|l|cc|}
        \hline
        Algorithm & Pairs & Frequency \\
        \hline
\multirow{2}*{RPOSST$_{\seqLabel}$} & (41, 16) & 90 \\
                                            & (41, 19) & 3 \\
        \hline
        \multirow{2}*{Minimax uniform} & (41, 16) & 40 \\
                                        & (34, 9) & 37 \\
        \hline
\multirow{2}*{Iterative minimax} & (39, 32) & 87 \\
                                        & (39, 31) & 3 \\
        \hline
\multirow{2}*{Minimax(TNP) uniform} & (40, 16) & 92 \\
                                            & (36, 16) & 4 \\
        \hline
\multirow{2}*{Miniaverage uniform} & (37, 12) & 5 \\
                                            & (40, 1) & 4 \\
        \hline
\multirow{2}*{Minimax(TTD) uniform} & (43, 6) & 4 \\
                                            & (28, 1) & 4 \\
        \hline
    \end{tabular}
\end{table*}

\Cref{fig:test-score-error} in \cref{section:experiments} analyses the quantitative performance of RPOSST and its algorithmic ablations with respect to measuring test scores on a holdout set of unseen candidate deployment policies. We complement that analysis with a qualitative study of behaviors exhibited by the algorithms using the large GT experiment with a holdout of size 20\% as a representative example. We are interested in examining (1) how deterministic each algorithm's output is with respect to the selection of test case pairs and (2) whether different algorithms choose the same test cases.

The lower triangular matrices from \cref{fig:frequency_matrices} show the frequency at which test case pairs were chosen over the 100 seeds. The top 2 most selected test case pairs for each algorithm are presented in \cref{tab:top_picks_holdout_20}. We observe that RPOSST, Iterative minimax, and Minimax(TNP) uniform are very deterministic algorithms, favouring the selection of the same test case pair in 90\%, 87\%, and 92\% of the seeds, respectively. We deem this a desirable property, as variance in evaluation scenarios is undesirable because it can hamper interpretability and reproducibility. In contrast, Minimax uniform exhibits a bimodal choice. The remaining algorithms feature a very high variance in their choice of test case pairs, with their most chosen test case pair being selected at most 5\% of the time, spreading selection widely.

From \cref{tab:top_picks_holdout_20}, test case 16 is heavily favoured by half of the algorithms (RPOSST$_{\seqLabel}$, Minimax uniform and Minimax(TNP) uniform), followed to a lesser extent by test case 41. This indicates that all these algorithms find useful structure in such pairs of agents.

In \cref{fig:racing-arrows-f-50,fig:racing-arrows-l-50,fig:gt-large-matrix,fig:gt-non-zero-sum,fig:acpc2012,fig:acpc2017}, we show the performance of RPOSST$_{\seqLabel}$ and baselines in each domain across test sizes ($\mTestCasesToSelect \in \set{1, 2, 3}$) and holdout proportions ($20\%$, $40\%$, and $60\%$).
\Cref{fig:racing-arrows-l-500} shows the results for the 500 policy Racing Arrows experiment where the leader policies are treated as test cases.

\begin{figure*}[t]
\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_f_50/col_payoffs.size1.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_f_50/col_payoffs.size1.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_f_50/col_payoffs.size1.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\vspace{0.5em}

\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_f_50/col_payoffs.size2.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_f_50/col_payoffs.size2.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_f_50/col_payoffs.size2.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\vspace{0.5em}

\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_f_50/col_payoffs.size3.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_f_50/col_payoffs.size3.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_f_50/col_payoffs.size3.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\caption{Expected test score error (absolute difference) across holdout-policy--target-distribution pairs on Racing Arrows where test cases are 50 follower policies. Each row uses a different setting for the test size ($m = 1$ top, $m = 2$ middle, and $m = 3$ bottom) and each column uses a different holdout proportion ($20\%$ held out in the left column, $40\%$ middle, and $60\%$ right). $\numHoldoutReplicas$ sets of holdout policies were sampled. Holdout-policy--target-distribution pairs are sorted according to test score error. Each RPOSST$_{\seqLabel}$ instance was run for $500$ rounds ($T = 500$). Errorbars represent $95\%$ t-distribution confidence intervals.}
    \label{fig:racing-arrows-f-50}
\end{figure*}

\begin{figure*}[t]
\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_l_50/col_payoffs.size1.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_l_50/col_payoffs.size1.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_l_50/col_payoffs.size1.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\vspace{0.5em}

\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_l_50/col_payoffs.size2.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_l_50/col_payoffs.size2.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_l_50/col_payoffs.size2.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\vspace{0.5em}

\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_l_50/col_payoffs.size3.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_l_50/col_payoffs.size3.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_l_50/col_payoffs.size3.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\caption{Expected test score error (absolute difference) across holdout-policy--target-distribution pairs on Racing Arrows where test cases are 50 leader policies. Each row uses a different setting for the test size ($m = 1$ top, $m = 2$ middle, and $m = 3$ bottom) and each column uses a different holdout proportion ($20\%$ held out in the left column, $40\%$ middle, and $60\%$ right). $\numHoldoutReplicas$ sets of holdout policies were sampled. Holdout-policy--target-distribution pairs are sorted according to test score error. Each RPOSST$_{\seqLabel}$ instance was run for $500$ rounds ($T = 500$). Error bars represent $95\%$ $t$-distribution confidence intervals.}
    \label{fig:racing-arrows-l-50}
\end{figure*}

\begin{figure*}[t]
\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2012_2pl/duplicate_mean_matrix_big_blinds.size1.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2012_2pl/duplicate_mean_matrix_big_blinds.size1.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2012_2pl/duplicate_mean_matrix_big_blinds.size1.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\vspace{0.5em}

\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2012_2pl/duplicate_mean_matrix_big_blinds.size2.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2012_2pl/duplicate_mean_matrix_big_blinds.size2.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2012_2pl/duplicate_mean_matrix_big_blinds.size2.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\vspace{0.5em}

\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2012_2pl/duplicate_mean_matrix_big_blinds.size3.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2012_2pl/duplicate_mean_matrix_big_blinds.size3.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2012_2pl/duplicate_mean_matrix_big_blinds.size3.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\caption{Expected test score error (absolute difference) across holdout-policy--target-distribution pairs on the ACPC 2012 data. Each row uses a different setting for the test size ($m = 1$ top, $m = 2$ middle, and $m = 3$ bottom) and each column uses a different holdout proportion ($20\%$ held out in the left column, $40\%$ middle, and $60\%$ right). $\numHoldoutReplicas$ sets of holdout policies were sampled. Holdout-policy--target-distribution pairs are sorted according to test score error. Each RPOSST$_{\seqLabel}$ instance was run for $500$ rounds ($T = 500$). Error bars represent $95\%$ $t$-distribution confidence intervals.}
    \label{fig:acpc2012}
\end{figure*}

\begin{figure*}[t]
\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2017_2pn/duplicate_mean_matrix_big_blinds.size1.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2017_2pn/duplicate_mean_matrix_big_blinds.size1.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2017_2pn/duplicate_mean_matrix_big_blinds.size1.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\vspace{0.5em}

\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2017_2pn/duplicate_mean_matrix_big_blinds.size2.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2017_2pn/duplicate_mean_matrix_big_blinds.size2.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2017_2pn/duplicate_mean_matrix_big_blinds.size2.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\vspace{0.5em}

\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2017_2pn/duplicate_mean_matrix_big_blinds.size3.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2017_2pn/duplicate_mean_matrix_big_blinds.size3.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2017_2pn/duplicate_mean_matrix_big_blinds.size3.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\caption{Expected test score error (absolute difference) across holdout-policy--target-distribution pairs on the ACPC 2017 data. Each row uses a different setting for the test size ($m = 1$ top, $m = 2$ middle, and $m = 3$ bottom) and each column uses a different holdout proportion ($20\%$ held out in the left column, $40\%$ middle, and $60\%$ right). $\numHoldoutReplicas$ sets of holdout policies were sampled. Holdout-policy--target-distribution pairs are sorted according to test score error. Each RPOSST$_{\seqLabel}$ instance was run for $500$ rounds ($T = 500$). Error bars represent $95\%$ $t$-distribution confidence intervals.}
    \label{fig:acpc2017}
\end{figure*}

\begin{figure*}[t]
\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_large_matrix/gt_winrate_matrix.size1.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_large_matrix/gt_winrate_matrix.size1.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_large_matrix/gt_winrate_matrix.size1.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\vspace{0.5em}

\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_large_matrix/gt_winrate_matrix.size2.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_large_matrix/gt_winrate_matrix.size2.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_large_matrix/gt_winrate_matrix.size2.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\vspace{0.5em}

\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_large_matrix/gt_winrate_matrix.size3.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_large_matrix/gt_winrate_matrix.size3.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_large_matrix/gt_winrate_matrix.size3.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\caption{Expected test score error (absolute difference) across holdout-policy--target-distribution pairs in the winrate GT domain. Each row uses a different setting for the test size ($m = 1$ top, $m = 2$ middle, and $m = 3$ bottom) and each column uses a different holdout proportion ($20\%$ held out in the left column, $40\%$ middle, and $60\%$ right). $\numHoldoutReplicas$ sets of holdout policies were sampled. Holdout-policy--target-distribution pairs are sorted according to test score error. Each RPOSST$_{\seqLabel}$ instance was run for $500$ rounds ($T = 500$). Error bars represent $95\%$ $t$-distribution confidence intervals.}
    \label{fig:gt-large-matrix}
\end{figure*}

\begin{figure*}[t]
\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_non_zero_sum/gt_winrate_matrix.size1.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_non_zero_sum/gt_winrate_matrix.size1.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_non_zero_sum/gt_winrate_matrix.size1.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\vspace{0.5em}

\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_non_zero_sum/gt_winrate_matrix.size2.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_non_zero_sum/gt_winrate_matrix.size2.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_non_zero_sum/gt_winrate_matrix.size2.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\vspace{0.5em}

\begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_non_zero_sum/gt_winrate_matrix.size3.holdout20.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_non_zero_sum/gt_winrate_matrix.size3.holdout40.rposst.appendix.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_non_zero_sum/gt_winrate_matrix.size3.holdout60.rposst.appendix.error}.pdf}
    \end{minipage}\caption{Expected test score error (absolute difference) across holdout-policy--target-distribution pairs in the GT domain where $-1$ is given for a collision. Each row uses a different setting for the test size ($m = 1$ top, $m = 2$ middle, and $m = 3$ bottom) and each column uses a different holdout proportion ($20\%$ held out in the left column, $40\%$ middle, and $60\%$ right). $\numHoldoutReplicas$ sets of holdout policies were sampled. Holdout-policy--target-distribution pairs are sorted according to test score error. Each RPOSST$_{\seqLabel}$ instance was run for $500$ rounds ($T = 500$). Error bars represent $95\%$ $t$-distribution confidence intervals.}
    \label{fig:gt-non-zero-sum}
\end{figure*}

\begin{figure*}[t]
    \begin{subfigure}[t]{0.32\linewidth}
      \includegraphics[width=\linewidth]{{figures/racing_arrows_l_500/col_payoffs.size1.holdout96.rposst.error}.pdf}
      \caption{$\mTestCasesToSelect = 1$}
  \end{subfigure}\hfill \begin{subfigure}[t]{0.32\linewidth}
      \includegraphics[width=\linewidth]{{figures/racing_arrows_l_500/col_payoffs.size2.holdout96.rposst.error}.pdf}
      \caption{$\mTestCasesToSelect = 2$}
  \end{subfigure}\hfill \begin{subfigure}[t]{0.32\linewidth}
      \includegraphics[width=\linewidth]{{figures/racing_arrows_l_500/col_payoffs.size3.holdout96.rposst.error}.pdf}
      \caption{$\mTestCasesToSelect = 3$}
  \end{subfigure}
    \caption{Expected test score error (absolute difference) across holdout-policy--target-distribution pairs on Racing Arrows where test cases are leader policies. Here, 500 Racing Arrows policies were sampled for both the follower and leader role and then $96\%$ of policies of both roles were held out before running RPOSST and each baseline. Each column uses a different setting for the test size ($m = 1$ left, $m = 2$ middle, and $m = 3$ right). $\numHoldoutReplicas$ sets of holdout policies were sampled. Holdout-policy--target-distribution pairs are sorted according to test score error. Each RPOSST$_{\seqLabel}$ instance was run for $500$ rounds ($T = 500$). Error bars represent $95\%$ $t$-distribution confidence intervals.}
    \label{fig:racing-arrows-l-500}
\end{figure*}

We note that as $\mTestCasesToSelect$ increases, the error on the holdout set typically decreases, particularly for RPOSST, since larger tests have the capacity to be strictly more accurate. A qualitative analysis of these results suggests that there are few substantial differences between RPOSST tests of different sizes or with different, reasonably sized, holdout sets. Furthermore, the performance ordering of the tested algorithms remains the same as in the results presented in the main paper.
 \bibliography{morrill_257}
\end{document}
