\documentclass[accepted]{uai2023} 

\usepackage[american]{babel}


\usepackage{natbib} \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} \usepackage{booktabs} \usepackage{tikz} 



\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subcaption}
\usepackage{booktabs} \usepackage{multirow}


\usepackage[algo2e,linesnumbered,lined,boxed,commentsnumbered,noend]{algorithm2e}
% Font used for comments inside algorithm2e algorithms: footnote size,
% typewriter family. Registered as the comment style on the next line.
\newcommand\mycommfont[1]{\footnotesize\ttfamily#1}
\SetCommentSty{mycommfont}
% Left margin of algorithm2e algorithm bodies.
\setlength{\algomargin}{15pt} 

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

\usepackage[capitalize,noabbrev]{cleveref}

% Theorem-like environments (amsthm).
% ``theorem'' is numbered within sections; every other environment declared
% here shares the theorem counter via the [theorem] optional argument.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% Definition-style environments.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% Remark-style environments.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}






\usepackage{nicefrac}
% Named colours given as 0-255 RGB triples.
% offWhite is the background colour of the \todo-based note macros further down.
\definecolor{offWhite}{RGB}{240,240,240}
\definecolor{grey}{RGB}{180,180,180}
\definecolor{lightGrey}{RGB}{220,220,220}
\definecolor{darkgreen}{RGB}{0,125,0}
\definecolor{lime}{RGB}{255,200,0}

% ``amii''-prefixed palette; amiiPink is the border/line colour used by \dmnote.
\definecolor{amiiBlue}{RGB}{16,72,118}
\definecolor{amiiPink}{RGB}{241,97,119}
\definecolor{amiiYellow}{RGB}{248,209,109}
\definecolor{amiiPurple}{RGB}{123,105,145}

% Cyan (yes) and red (no) highlight colours mixed with white; the ``new''
% variants use 75% of the base colour instead of 50%.
\colorlet{yes}{cyan!50!white}
\colorlet{newYes}{cyan!75!white}
\colorlet{no}{red!50!white}
\colorlet{newNo}{red!75!white}
% \eqTableEntry{<colour>}{<text>}: table cell with background <colour> and
% black <text>.
% NOTE(review): \cellcolor is provided by colortbl, which is not loaded in the
% visible preamble -- confirm it is available before using this macro.
\newcommand{\eqTableEntry}[2]{
  \cellcolor{#1}{\textcolor{black}{#2}}
}

\usepackage{amsmath, amssymb, amsfonts, amsthm}
% Guarded fallback declarations: each environment is created (with its own,
% unshared counter) only when the corresponding command is not yet defined.
% theorem, lemma, corollary, definition and proposition are all declared
% earlier in this preamble, so as the file stands every \else branch is skipped.
\makeatletter
\ifdefined\theorem
\else
  \newtheorem{theorem}{Theorem}
\fi
\ifdefined\lemma
\else
  \newtheorem{lemma}{Lemma}
\fi
\ifdefined\corollary
\else
  \newtheorem{corollary}{Corollary}
\fi
\ifdefined\definition
\else
  \newtheorem{definition}{Definition}
\fi
\ifdefined\proposition
\else
  \newtheorem{proposition}{Proposition}
\fi
\makeatother

% \safeIncCounter{<name>}: step the LaTeX counter <name>, creating it first
% when \c@<name> is still undefined. Used by the note macros below so that
% per-author / per-issue counters come into existence on first use.
\makeatletter
\newcommand\safeIncCounter[1]{\@ifundefined{c@#1}{\newcounter{#1}\stepcounter{#1}}{\stepcounter{#1}}}
\makeatother

% Standalone counter; not referenced anywhere in the visible part of the file.
\newcounter{resetCounter}

\usepackage{xargs}
 \usepackage{mathtools}
% Paired delimiters (mathtools). Each command takes one argument; the starred
% form, e.g. \subblock*{...}, is the variant used for tall content in the body.
\DeclarePairedDelimiter{\floor}{\lfloor}{\rfloor}
\DeclarePairedDelimiter{\ceil}{\lceil}{\rceil}
\DeclarePairedDelimiter{\abs}{\lvert}{\rvert}
\DeclarePairedDelimiter{\norm}{\lVert}{\rVert}
% \subex = parentheses, \subblock = square brackets, \tuple = angle brackets,
% \set = curly braces.
\DeclarePairedDelimiter{\subex}{(}{)}
\DeclarePairedDelimiter{\subblock}{[}{]}
\DeclarePairedDelimiter{\tuple}{\langle}{\rangle}
\DeclarePairedDelimiter{\set}{\{}{\}}
% \relu and \ramp are identical: square brackets with a ``+'' subscript on the
% closing bracket, i.e. [x]_+.
\DeclarePairedDelimiter{\relu}{[}{]_+}
\DeclarePairedDelimiter{\ramp}{[}{]_+}

% stmaryrd supplies \llbracket / \rrbracket for the double-bracket delimiter.
\usepackage{stmaryrd}
\DeclarePairedDelimiter{\stopGrad}{\llbracket}{\rrbracket}

\usepackage{bbold}
\usepackage{bm}

% ---- General math notation ----
% Number sets and the simplex symbol (\simplex is an alias of \Simplex).
\newcommand{\reals}{\mathbb{R}}
\newcommand{\naturals}{\mathbb{N}}
\newcommand{\natSeq}[1]{\naturals_{#1}}
\newcommand{\Simplex}{\triangle}
\newcommand{\simplex}{\Simplex}
% \like is a plain alias for the wide-tilde accent.
\newcommand{\like}{\widetilde}

% Bold math (\bs, via bm), script-size math, expectation and probability
% symbols with their short aliases \E and \Prob.
\newcommand{\bs}[1]{\bm{#1}}
\newcommand{\smallMath}[1]{{\scriptstyle #1}}
\newcommand{\expectation}{\mathbb{E}}
\newcommand{\E}{\expectation}
\newcommand{\probability}{\mathbb{P}}
\newcommand{\Prob}{\probability}
% Bold text whose embedded math is bold as well.
\newcommand{\defword}[1]{\textbf{\boldmath{#1}}}
% \as typesets \doteq.
\newcommand{\as}{\doteq}

% Bold 1, bold 0 and bold e.
\newcommand{\ones}{\bs{1}}
\newcommand{\zeros}{\bs{0}}
\newcommand{\unitVector}{\bs{e}}

% Asymptotic notation: calligraphic O and upright o, typeset as operators.
\newcommand{\bigO}{\operatorname{\mathcal{O}}}
\newcommand{\smallo}{\operatorname{o}}

% \bmax / \bmin typeset \vee / \wedge.
\newcommand{\bmax}{\vee}
\newcommand{\bmin}{\wedge}

% \ip{a}{b}: angle-bracketed pair; \ind{cond}: blackboard 1 followed by an
% auto-sized brace set around cond.
\newcommand{\ip}[2]{\langle #1, \, #2 \rangle}
\newcommand{\ind}[1]{\mathbb{1}\set*{#1}}
% Vertical-bar separators: \given uses thin spaces, \Given and \where (which
% are identical) use thick spaces.
\newcommand{\given}{\,|\,}
\newcommand{\Given}{\;|\;}
\newcommand{\where}{\;|\;}

% Upright operators declared with the starred form of \DeclareMathOperator.
% \Unif simply wraps \unif.
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\e}{e}
\DeclareMathOperator*{\unif}{Unif}
\DeclareMathOperator*{\Unif}{\unif}
\DeclareMathOperator*{\proj}{proj}
\newcommand{\divergence}{\mathcal{B}}

% ---- Strategy / player notation ----
% Several names are aliases of one another (\PureStrategySet -> \PureStratSet,
% \PlayerSet -> \Players). \strategy, \strat, \profile and \stratProfile all
% resolve to \policy, which is defined further down in the preamble; this is
% fine because the expansion only happens at use time.
\newcommand{\PureStratSet}{\mathcal{X}}
\newcommand{\PureStrategySet}{\PureStratSet}
\newcommand{\pureStrat}{x}
\newcommand{\PureStrat}{X}
\newcommand{\utility}{\upsilon}
\newcommand{\StrategySet}{\Pi}
\newcommand{\strategy}{\policy}
\newcommand{\strat}{\strategy}
\newcommand{\profile}{\strategy}
\newcommand{\stratProfile}{\profile}
\newcommand{\gap}{\varepsilon}
\newcommand{\Players}{\mathcal{P}}
\newcommand{\PlayerSet}{\Players}

% ---- Action / regret / optimisation notation ----
% Symbol sharing to be aware of within this file: \regret and \advantage are
% both \rho; \stepSize and \maxActivation are both \alpha;
% \regularizationWeight and \percentile are both \eta; \maxLoss and
% \rmOperator are both L; \Regret and \RandomRewardFn are both R.
\newcommand{\Actions}{\mathcal{A}}
\newcommand{\actions}{\Actions}
\newcommand{\RewardSet}{\mathcal{R}}
\newcommand{\regret}{\rho}
\newcommand{\Regret}{R}
\newcommand{\altGap}{\epsilon}
\newcommand{\supCfv}{U}
\newcommand{\maxCfv}{\supCfv}
\newcommand{\maxLoss}{L}
\newcommand{\grad}{\nabla}
\newcommand{\stepSize}{\alpha}
\newcommand{\regularizationWeight}{\eta}
\newcommand{\smProj}{\Pi_{\text{sm}}}
% Small-caps text labels; \EXT, \INT and \ACTION are aliases of \EX, \IN and
% \SWAP respectively.
\newcommand{\EX}{\textsc{ex}}
\newcommand{\EXT}{\EX}
\newcommand{\IN}{\textsc{in}}
\newcommand{\INT}{\IN}
\newcommand{\SWAP}{\textsc{sw}}
\newcommand{\ACTION}{\SWAP}

% ---- History / information-set notation ----
% \chancePolicy and \termValue rely on \policy and \reward, which are defined
% further down in the preamble (resolved at use time).
\newcommand{\infoSet}{I}
\newcommand{\InfoSets}{\mathcal{I}}
\newcommand{\reachProb}{P}
\newcommand{\chance}{c}
\newcommand{\chancePolicy}{\policy_{\chance}}
\newcommand{\history}{h}
\newcommand{\Histories}{\mathcal{H}}
\newcommand{\TerminalHistories}{\mathcal{Z}}
\newcommand{\terminalHistory}{z}
\newcommand{\playerChoice}{p}
% amssymb (already loaded above) provides \varnothing.
\usepackage{amssymb}
\newcommand{\emptyHistory}{\varnothing}
\newcommand{\intermediateHistory}{f}
\newcommand{\termValue}{\reward}
% \cfv and \cfq both resolve to \cfIv (v); \cfAv is the letter a.
\newcommand{\cfIv}{v}
\newcommand{\cfAv}{a}
\newcommand{\cfv}{\cfIv}
\newcommand{\cfq}{\cfv}

% ---- Feature / function-approximation notation ----
% \InputSpace shares \mathcal{X} with \PureStratSet; \hash shares \zeta with
% \historySampler; \fnApprox shares y with \integrableFn.
\newcommand{\featureFn}{\varphi}
\newcommand{\featureExp}{\featureFn}
\usepackage{upgreek}
\newcommand{\FeatureSpace}{\reals^d}
\newcommand{\InputSpace}{\mathcal{X}}
\newcommand{\fnApprox}{y}
\newcommand{\hash}{\zeta}


% ---- Decision-set / partial-policy notation ----
% \partialPolicy and \immStrat are both \sigma (as is \testCaseDistribution
% further down); \PartialPolicySet and \immStratSet are both \Sigma.
\newcommand{\DecisionSet}{\Theta}
\newcommand{\odpDecision}{\theta}
\newcommand{\regularizer}{\psi}
\newcommand{\partialPolicy}{\sigma}
\newcommand{\PartialPolicySet}{\Sigma}
\newcommand{\immStrat}{\sigma}
\newcommand{\immStratSet}{\Sigma}

% ---- Deviation notation and small-caps labels ----
% \CF is an alias of \COUNTERFACTUAL; \BHV is an alias of \SWAP.
\newcommand{\DevSet}{\Phi}
\newcommand{\dev}{\phi}
\newcommand{\BLIND}{\textsc{blind}}
\newcommand{\INFORMED}{\textsc{informed}}
\newcommand{\PS}{\textsc{ps}}
\newcommand{\CAUSAL}{\textsc{caus}}
\newcommand{\COUNTERFACTUAL}{\textsc{cf}}
\newcommand{\CF}{\COUNTERFACTUAL}
\newcommand{\STRAT}{\textsc{strat}}
\newcommand{\DEV}{\textsc{dev}}
\newcommand{\BHV}{\SWAP}
\newcommand{\IMM}{\textsc{imm}}
\newcommand{\FULL}{\textsc{full}}
\newcommand{\TARGET}{\odot}
\newcommand{\TRIGGER}{\text{!}}
\newcommand{\FOLLOW}{\textsc{follow}}

\newcommand{\rmOperator}{L}
\newcommand{\maxActivation}{\alpha}

% ---- Miscellaneous symbols and labels ----
% \est is a plain alias for the wide-hat accent. \DevSeqSet and \devSeq are
% aliases of \DevSet and \dev. \parent, \parentAction and \infoSetOf use
% \mathbb on lowercase/uppercase letters (bbold is loaded above).
\newcommand{\historySampler}{\zeta}
\newcommand{\powerset}{\wp}
\newcommand{\LINK}{\textsc{lnk}}
\newcommand{\FILTER}{\textsc{fltr}}
\newcommand{\SUCC}{\textsc{suc}}
\newcommand{\ROOT}{\textsc{root}}
\newcommand{\est}{\widehat}
\newcommand{\PRED}{\textsc{pre}}
\newcommand{\IDEN}{1}
\newcommand{\DevSeqSet}{\DevSet}
\newcommand{\parent}{\mathbb{p}}
\newcommand{\devSeq}{\dev}
\newcommand{\parentAction}{\mathbb{a}}
\newcommand{\infoSetOf}{\mathbb{I}}


% ---- Policy / reward notation ----
% \policy and \reward are the targets of several aliases defined earlier
% (\strategy, \chancePolicy, \termValue). \PolicySet shares \Pi with
% \StrategySet; \RewardFunctionDist shares \mathcal{R} with \RewardSet.
\newcommand{\RewardFunctionDist}{\mathcal{R}}
\newcommand{\policy}{\pi}
\newcommand{\reward}{r}
\newcommand{\PolicySet}{\Pi}
\newcommand{\StateSet}{\mathcal{S}}
\newcommand{\advantage}{\rho}
\newcommand{\Mdp}{\mathcal{G}}
\newcommand{\initialStateDist}{d_0}
% ``k-of-N'' with the hyphenated middle part set as text, and the measure
% \mu subscripted with it.
\newcommand{\kOfN}{k\text{-of-}N}
\newcommand{\kOfNMeasure}{\mu_{\kOfN}}
\newcommand{\stateValueFn}{q}

\newcommand{\RewardFn}{\reward}
\newcommand{\RandomRewardFn}{R}
\newcommand{\RandomReturn}{G}

\newcommand{\MINIMAX}{\textsc{minimax}}
\newcommand{\MAXIMIN}{\textsc{mxmn}}

\newcommand{\LossSet}{\mathcal{L}}

% Operators declared with the starred form of \DeclareMathOperator.
% \WorstKOfNLossesFn and \WorstFactileLossesFn typeset \LossSet with a
% k-of-N or \percentile subscript; \percentile itself is defined with \def
% between the two declarations.
\DeclareMathOperator*{\SortFn}{Sort}
\DeclareMathOperator*{\SortBy}{SortBy}
\DeclareMathOperator*{\numCols}{cols}
\DeclareMathOperator*{\numRows}{rows}
\DeclareMathOperator*{\Ucb}{UCB1}
\DeclareMathOperator*{\SuccessiveRejects}{SR}
\DeclareMathOperator*{\WorstKOfNLossesFn}{\LossSet_{\kOfN}}
\def\percentile{\eta}
\DeclareMathOperator*{\WorstFactileLossesFn}{\LossSet_{\percentile}}
\DeclareMathOperator*{\supportFn}{supp}

% ---- Test-construction notation used throughout the paper body ----
% \testCaseGroupDecisionLabel is an alias of \testCaseSet. \testCase shares
% the letter c with \chance. \testTuple typesets the angle-bracketed pair of
% \testCaseGroup and the hatted \testCaseDistribution subscripted with it.
\newcommand{\mTestCasesToSelect}{m}
\newcommand{\jointTestCaseDistributionPolicyUncertainty}{\Psi}
\newcommand{\testCaseSet}{\mathcal{T}}
\newcommand{\testCaseGroupDecisionLabel}{\testCaseSet}
\newcommand{\testCase}{c}
\newcommand{\testCaseGroup}{\tau}
\newcommand{\testCaseDistribution}{\sigma}
\newcommand{\testTuple}{\tuple{\testCaseGroup, \hat{\testCaseDistribution}_{\testCaseGroup}}}

% Small-caps subscript labels (tnp, cdp, sim, seq).
\newcommand{\tuningLabel}{\textsc{tnp}}
\newcommand{\deploymentLabel}{\textsc{cdp}}
\newcommand{\simLabel}{\textsc{sim}}
\newcommand{\seqLabel}{\textsc{seq}}
\newcommand{\differenceFn}{\mathit{\Delta}}
% \maxGrad shares the letter G with \RandomReturn.
\newcommand{\maxGrad}{G}
% Constants and symbols defined with \def (no redefinition check).
% \cvarPercentile and \numHoldoutReplicas expand to literal values (1\% and 100).
\def\cvarPercentile{1\%}
\def\probMeasure{\mu}
\def\IntegrableFnSet{\mathcal{Y}}
\def\integrableFn{y}
\def\numHoldoutReplicas{100}
 
\usepackage{xspace}

% ---- Abbreviation macros ----
% \onedot peeks at the next token: if it is already a period nothing is added,
% otherwise ``.\null'' is inserted; either way \xspace follows.
\makeatletter
\DeclareRobustCommand\onedot{\futurelet\@let@token\@onedot}
\def\@onedot{\ifx\@let@token.\else.\null\fi\xspace}

% Every macro below is defined with a ``/'' in its parameter text, so it must
% be written with a trailing slash at the call site (\eg/, \ie/, ...).
\def\eg/{\emph{e.g}\onedot} \def\Eg/{\emph{E.g}\onedot}
\def\ie/{\emph{i.e}\onedot} \def\Ie/{\emph{I.e}\onedot}
\def\cf/{\emph{c.f}\onedot} \def\Cf/{\emph{C.f}\onedot}
\def\vs/{\emph{vs}\onedot} \def\Vs/{\emph{Vs}\onedot}
\def\etc/{\emph{etc}\onedot}
\def\wrt/{with respect to} \def\dof/{d.o.f\onedot}
\def\etal/{\emph{et al}\onedot}
\def\viceversa/{\emph{vice-versa}}
\def\ow/{\emph{o.w}\onedot}
\def\whp/{w.h.p\onedot}
\def\apriori/{\emph{a priori}} \def\Apriori/{\emph{A priori}}
\def\ala/{\`{a} la}

% Accented words and algorithm names with a superscript plus.
\def\naive/{na\"{\i}ve} \def\Naive/{Na\"{\i}ve}
\def\rmPlus/{regret matching\textsuperscript{+}}
\def\rrmPlus/{RRM\textsuperscript{+}}
\def\rcfrPlus/{RCFR\textsuperscript{+}}
\def\cfrPlus/{CFR\textsuperscript{+}}
% \Politex/ is defined through \@ifdefinable rather than a bare \def.
\@ifdefinable{\Politex/}{\def\Politex/{\textsc{Politex}}}

% \NashConvAUC/ enters math mode itself to place the overline.
\def\NashConv/{\textsc{NashConv}}
\def\NashConvAUC/{$\overline{\textsc{NashConv}}$}

% Small-caps outcome / parity labels.
\def\heads/{\textsc{heads}}
\def\tails/{\textsc{tails}}
\def\even/{\textsc{even}}
\def\odd/{\textsc{odd}}

\makeatother

% \rulesep: removes preceding glue, then a vertical rule with a space on
% either side.
\newcommand{\rulesep}{\unskip\ \vrule\ }
% \textbxf{<text>}: <text> in font series ``b''.
\newcommand{\textbxf}[1]{{\fontseries{b}\selectfont #1}}

% Slash-delimited name macros, called as \rampName/ and \RampName/.
\def\rampName/{ramp}
\def\RampName/{Ramp}
% The parameter text is ``7/'', so this must be written \GranTurismo7/.
\def\GranTurismo7/{Gran Turismo\textsuperscript{\texttrademark} 7}
% Declares the conditional \ifGin@setpagesize via \newif (built with \csname
% because of the ``@'' in the name).
% NOTE(review): presumably a graphics-driver compatibility workaround -- confirm.
 \expandafter\newif\csname ifGin@setpagesize\endcsname
% \todonote[<todo options>]{<tag>}{<colour>}{<text>}
% Steps the counter <tag>NoteCounter (created on first use by \safeIncCounter)
% and emits a \todo note on an offWhite background with <colour> as border and
% line colour. The note starts with the upper-cased <tag> and its running
% number as a subscript. The optional argument defaults to ``inline''.
% NOTE(review): \todo is not defined in the visible preamble (no todonotes
% load is visible) -- confirm it is available before using these macros.
\newcommand{\todonote}[4][inline]{\safeIncCounter{#2NoteCounter}
  \todo[color=offWhite,bordercolor=#3,linecolor=#3,#1]{\textbf{\uppercase{#2}$_{\arabic{#2NoteCounter}}$:}~#4}}

% \dmnote[<todo options>]{<text>}: \todonote with tag DM and colour amiiPink.
% Note that an empty optional argument is passed through, so the ``inline''
% default of \todonote is not applied unless given explicitly.
\newcommand{\dmnote}[2][]{\todonote[#1]{DM}{amiiPink}{#2}}
% \dustin is an alias of \dmnote.
\def\dustin{\dmnote}

% ---- ``Replaced'' change markers (rendered as inline \todo notes) ----
%
% \replaced{<counter prefix>}{<arrow>}{<text>}
%   Prints an inline note with a bold <arrow>, the label
%   ``Replaced (<n>)'' where <n> is the current value of the counter
%   Replaced<counter prefix>NoteCounter, and then <text>.
%   The arguments are used directly instead of being copied into scratch
%   macros: the former ``\counterPrefix{}'' inside \arabic put literal braces
%   into the counter name, so it never matched the counter stepped by
%   \replacedStart.
\newcommand{\replaced}[3]{%
  \todo[color=offWhite,bordercolor=red,inline]{$\bs{#2}$ \textbf{Replaced (\arabic{Replaced#1NoteCounter})} #3}}

% \replacedStart{<user>}{<text>}
%   Opens a replaced region: steps (creating if needed) the counter
%   Replaced<user>NoteCounter and prints a downward-arrow note with <text>.
%   The arguments are forwarded as-is; the previous version stored them with
%   \def\user and \def\text, which overwrote amsmath's \text for the rest of
%   the enclosing scope and broke every later \text{...} in math.
\newcommand{\replacedStart}[2]{%
  \safeIncCounter{Replaced#1NoteCounter}\replaced{#1}{\downarrow}{#2}}
% \replacedEnd{<user>}
%   Closes a replaced region with an upward-arrow note carrying the same
%   number; expects a preceding \replacedStart{<user>}{...} so that the
%   counter exists.
\newcommand{\replacedEnd}[1]{%
  \replaced{#1}{\uparrow}{}}

% \issue{<text>}{<arrow>}{<issue id>}
%   Inline \todo note on a black background with white text: a bold <arrow>,
%   ``Issue #<issue id>'', the current value of Issue<issue id>NoteCounter as
%   the part number, and then <text>.
\newcommand{\issue}[3]{\todo[color=black,inline]{\textcolor{white}{$\bs{#2}$ \textbf{Issue \##3} (Part \arabic{Issue#3NoteCounter}) #1}}}

% \issueChangeStart[<text>]{<issue id>}: steps (creating if needed) the
% per-issue counter, then prints a downward-arrow \issue note.
\newcommand{\issueChangeStart}[2][]{\safeIncCounter{Issue#2NoteCounter}\issue{#1}{\downarrow}{#2}}
% \issueChangeEnd[<text>]{<issue id>}: prints the matching upward-arrow note.
% It does not touch the counter, so it relies on an earlier \issueChangeStart
% for the same id having created it.
\newcommand{\issueChangeEnd}[2][]{\issue{#1}{\uparrow}{#2}}

% \copied{<arrow>}{<label>}
%   Inline \todo note with a green border: a bold <arrow>, ``Copied (<n>)''
%   where <n> is the current value of CopiedNoteCounter, and then <label>.
%   The arguments are stored in \arrowMarker and \copiedLabel first.
\newcommand{\copied}[2]{\def\arrowMarker{#1}
  \def\copiedLabel{#2}
  \todo[color=offWhite,bordercolor=green,inline]{$\bs{\arrowMarker}$ \textbf{Copied (\arabic{CopiedNoteCounter})} \copiedLabel }}

% \copiedStart{<label>}: steps (creating if needed) CopiedNoteCounter and
% prints a downward-arrow \copied note with <label>.
\newcommand{\copiedStart}[1]{\def\copiedStartLabel{#1}
  \safeIncCounter{CopiedNoteCounter}
  \copied{\downarrow}{\copiedStartLabel}}
% \copiedEnd: prints the matching upward-arrow note with an empty label.
\newcommand{\copiedEnd}{\copied{\uparrow}{}}

% \placeholderText{<text>}: <text> in white on a black box.
% The trailing % keeps the line break before the closing brace from becoming
% a space token in the macro body, which would add unwanted space after
% every use (e.g. before following punctuation).
\newcommand{\placeholderText}[1]{\colorbox{black}{\textcolor{white}{#1}}%
}
 
\usepackage{xr}
% \addFileDependency{<file>}: writes ``(<file>)'' to the log/terminal,
% registers <file> with \@addtofilelist, and prints ``No file <file>.'' when
% the file does not exist.
\makeatletter
\newcommand*{\addFileDependency}[1]{\typeout{(#1)}\@addtofilelist{#1}
\IfFileExists{#1}{}{\typeout{No file #1.}}
}\makeatother

% \myexternaldocument{<base>}: \externaldocument{<base>} (xr) plus file
% dependencies on <base>.tex and <base>.aux. Not used for the load below,
% which calls \externaldocument directly.
\newcommand*{\myexternaldocument}[1]{\externaldocument{#1}\addFileDependency{#1.tex}\addFileDependency{#1.aux}}
% Import the supplement's labels with the prefix ``A-'' (referenced in the
% body as, e.g., A-prop:best-optimality).
\externaldocument[A-]{morrill_257-supp}



\title{Composing Efficient, Robust Tests for Policy Selection}


\author[1]{\href{mailto:<dustin.morrill@sony.com>?Subject=Your UAI 2023 paper}{Dustin Morrill}}
\author[1]{Thomas J. Walsh}
\author[1]{Daniel Hernandez}
\author[1]{Peter R. Wurman}
\author[1,2]{Peter Stone}
\affil[1]{Sony AI\\
    New York, NY, USA
}
\affil[2]{Department of Computer Science\\
    The University of Texas at Austin\\
    Austin, TX, USA
}

\begin{document}
\maketitle


\begin{abstract}
Modern reinforcement learning systems produce many high-quality policies throughout the learning process. However, to choose which policy to actually deploy in the real world, they must be tested under an intractable number of environmental conditions. We introduce RPOSST, an algorithm to select a small set of test cases from a larger pool based on a relatively small number of sample evaluations. RPOSST treats the test case selection problem as a two-player game and optimizes a solution with provable $k$-of-$N$ robustness, bounding the error relative to a test that used all the test cases in the pool.
Empirical results demonstrate that RPOSST finds a small set of test cases that identify high-quality policies in a toy one-shot game, poker datasets, and a high-fidelity racing simulator.
\end{abstract}

\section{Introduction}\label{section:introduction}

\begin{figure*}[h]
	\centering
	\begin{subfigure}[t]{0.16\linewidth}
\includegraphics[width=\linewidth, clip, trim=0em -9em 0em 0cm, keepaspectratio]{./figures/rposst_test_result_compilation.pdf}
     \label{fig:test_result_compilation}
	\end{subfigure}
	\begin{subfigure}[t]{0.26\linewidth}
\includegraphics[width=\linewidth, clip, trim=0em 0em 0.5em 0cm, keepaspectratio]{figures/flowchart/result_matrix_flowchart.pdf}
     \label{fig:payoff_matrix_generation}
	\end{subfigure}
	\begin{subfigure}[t]{0.26\linewidth}
     \includegraphics[width=\linewidth, clip, trim=0em 2.5em 11.5em 0cm, keepaspectratio]{figures/flowchart/rposst_flowchart.pdf}
     \label{fig:algorithm_description}
	\end{subfigure}
	\begin{subfigure}[t]{0.26\linewidth}
	   \includegraphics[width=\linewidth, clip, trim=2.0em 0em 1em 0cm, keepaspectratio]{figures/flowchart/usage_flowchart.pdf}
	   \label{fig:algorithm_output}
	\end{subfigure}
		\caption{Policy testing with RPOSST.
        From left to right, the result matrix $A$ is constructed from rollouts, \ie/, $A_{i,j}$ is the average rollout outcome for tuning policy $j$ ($\PolicySet_{\tuningLabel}$ is the set of tuning policies) on test case $i$.
        RPOSST analyzes $A$, taking into account uncertainty distribution $\jointTestCaseDistributionPolicyUncertainty$ and a (possibly empty) initial set of test cases that must be used, $\testCaseGroup^0$. RPOSST outputs an efficient robust test $\tuple{\testCaseGroup^*, \hat{\testCaseDistribution}_{\testCaseGroup^*}^*}$, here using only $\mTestCasesToSelect = 3$ test cases
        (if $\testCaseSet$ is too large to select all $3$ test cases at once, $\testCaseGroup^*$ can be fed back into RPOSST as $\testCaseGroup^0$).
        New candidate deployment policies, $\PolicySet_{\deploymentLabel}$, are tested against each test case in $\testCaseGroup^*$ and each result is weighted according to $\hat{\testCaseDistribution}^*_{\testCaseGroup^*}$, producing a test score for each candidate deployment policy.}
		\label{fig:algorithm_flowchart}
\end{figure*}

Reinforcement learning (RL)~\citep{sutton2018reinforcement} policies have made a number of stunning breakthroughs in multiplayer games~\citep{silver2016mastering,moravvcik2017deepstack,brown2018superhuman,vinyals2019grandmaster,brown2019superhuman,wurman2022outracing,meta2022diplomacy,perolat2022mastering}.
However, the process of choosing an RL policy for production usage, either in an exhibition or deployment for end users, is challenging.
Practitioners often generate many policies that perform well during training but which require thorough vetting on alternative conditions or opponents.
Ideally, we would construct a test case for every conceivable deployment scenario, evaluate each policy on each test case, and rank each policy according to a weighted average of test case results.
However, such a procedure is typically infeasible because of the sheer numbers of policies and deployment scenarios, especially if test cases are lengthy or involve people.
In this work, we present a method for selecting a small number of test cases from a larger pool that minimizes the reduction in test quality.


Practitioners from other fields, \eg/, educational testing~\citep{vanderlinden2005linear}, will recognize this problem as \emph{test construction}---selecting a small yet robust set of test cases, based on limited data, to evaluate many candidates.
This set of test cases should contain enough information to indicate performance over the whole test case pool.
For instance, if a policy can defeat a skilled opponent, we can infer that it can defeat an unskilled opponent.
However, complicated domains contain complex intransitive relationships between policies, necessitating test case diversity.
In addition, there is considerable uncertainty over what policies may be produced in the future and what test cases are the most important to game designers.
This uncertainty needs to be considered because once test cases are chosen, the future policies to assess may be the most difficult ones for the test to evaluate accurately.
Therefore, a robust solution is required.

We introduce a framework, \emph{robust population optimization for a small set of test cases} (\emph{RPOSST}), to compose an efficient robust test of a fixed size.
RPOSST tunes its test to approximate the test scores of adversarially selected policies and test case averaging weights, given test case results on a small set of policies.
We present two RPOSST algorithms representing different use cases, focusing on RPOSST$_{\seqLabel}$, which is better suited to current RL deployment pipelines.
We provide robustness guarantees for RPOSST$_{\seqLabel}$ and CVaR RPOSST$_{\seqLabel}$ (a convenient special case) for $k$-of-$N$ robustness measures~\citep{chen2012tractable}.
These guarantees provide confidence that RPOSST test scores for future deployment candidates are reliable.





Our contributions include the RPOSST framework, including two algorithm versions, robustness guarantees, and empirical validation in domains widely ranging in complexity.
Empirical results are presented for a toy one-shot game simulating race car passing, computer poker competition datasets, and the high-fidelity racing simulator, \GranTurismo7/.
They show that RPOSST can dramatically reduce (compared to the full set) the number of test cases needed to identify good deployment policies.
 
\section{Problem Definition}
The goal of \emph{policy testing} is to evaluate the strengths and weaknesses of a large set of \emph{candidate deployment policies}, $\PolicySet_{\deploymentLabel} \subseteq \PolicySet$, in order to choose one for deployment.
A \emph{policy} $\policy \in \PolicySet$ in this setting can be any mapping from environment observations to a distribution over actions (\eg/, Markov policies;~\citet{sutton2018reinforcement}).
A policy is evaluated on a \emph{test} consisting of various test cases chosen from a pool, $\testCaseSet$.
Each \emph{test case} simulates an important aspect of the deployment environment, for example, different parameter settings like weather conditions or different opponent policies in a competitive game.
For straightforward comparisons between policies, we summarize a policy $\policy$'s test results with a scalar \emph{test score}, computed as the weighted average of $\policy$'s test case results according to \emph{test case weights}, $\testCaseDistribution \in \simplex^{\abs{\testCaseSet}}$.

If $\testCaseSet$ is small, then right before deployment we could simply test each policy, rank the policies in $\PolicySet_{\deploymentLabel}$ according to the test scores, and deploy the best one.
However, if policies will encounter a wide range of conditions during deployment, \eg/, hundreds or thousands of different players for a policy deployed to a popular video game, then $\testCaseSet$ ostensibly needs to be large in order to adequately reflect such diversity.
The linear scaling in $\abs{\testCaseSet}$ presents not just a computational burden, but also costs in sample complexity (if the test cases are lengthy) or even in person-time if human quality assurance testers might be needed for test cases.

This work addresses the problem of composing an efficient \emph{test}, $\testTuple$, by selecting a small number of test cases $\testCaseGroup \subset \testCaseSet$ and test case weights $\hat{\testCaseDistribution} \in \simplex^{\abs{\testCaseGroup}}$ to approximate a full test, $\tuple{\testCaseSet, \testCaseDistribution \in \simplex^{\abs{\testCaseSet}}}$.
Complicating this task are two sources of uncertainty to which the efficient test must be robust.
First, $\testTuple$ ought to be used on new candidate deployment policies, so $\PolicySet_{\deploymentLabel}$ is unknown before $\testTuple$ is chosen.
Second, the desired \emph{target distribution}, $\testCaseDistribution$, defining the full test to approximate may drift after $\testTuple$ is chosen.


We assume access to a small set of representative \emph{tuning policies} $\PolicySet_{\tuningLabel} \subset \PolicySet$ for immediate testing (\cref{section:rposst} discusses practical considerations in the composition of $\PolicySet_{\tuningLabel}$).
Additionally, our algorithm takes as input a joint distribution $\jointTestCaseDistributionPolicyUncertainty$ over $\PolicySet_{\tuningLabel}$ and $\simplex^{\abs{\testCaseSet}}$ to represent the combined uncertainty about which policies the output test will be applied to and which target distribution to approximate.
See \cref{fig:algorithm_flowchart} for an illustration of the test composition pipeline.

As a concrete example of the terms above and the need for robustness in the face of uncertainty, consider a car-racing agent developed for a one-on-one racing game. The first source of uncertainty is over the future policies we may want to test. Consider the case where, at test construction time, we have policies from two training runs---one that produces aggressive (collision-prone) policies, and another that produces more polite policies, but we are uncertain about which type will be best suited for the game. In this case, we want the selected test cases to provide good evaluations on policies from either set, and thus require $\jointTestCaseDistributionPolicyUncertainty$ to reflect this uncertainty. Policies from both sets should be included in $\PolicySet_{\tuningLabel}$ and our algorithm needs to be robust to policies within $\PolicySet_{\tuningLabel}$.

The second source of uncertainty is over which test cases are most important. Imagine that we have some test cases that specifically target and penalize off-track infractions. In the future, game designers could request fewer infractions or allow for more risky racing lines. To hedge against both of these possibilities we can add two target distributions to $\jointTestCaseDistributionPolicyUncertainty$, one where off-track test cases have higher weights than the other test cases and another where they have lower weights. The job of an algorithm (such as RPOSST) is then to ensure its tests are accurate according to both target distributions.
  
\section{Background}
\label{section:background}
In order to compose an efficient and robust test, we utilize established game-theoretic frameworks for modeling robustness and learning optimal decisions (specifically, regret minimization).
The following subsections present background material on these two topics.


\subsection{Robustness}





The idea of \emph{robustness} is to prepare for an unfavorable portion of possible outcomes sampled from an uncertainty distribution.
In our formulation of policy testing the uncertainty distribution covers the future policies in $\PolicySet_{\deploymentLabel}$ and the target distribution.
A \emph{percentile robustness measure}~\citep{charnes1959chanceConstrainedProgramming}, $\probMeasure$, is a formal representation of a robustness criterion as a probability distribution over percentiles.
For example, if $\probMeasure$ has all of its weight on 0.01, then an $\mTestCasesToSelect$-size test with weights $\hat{\testCaseDistribution}_{\testCaseGroup}$ that is robust according to $\probMeasure$ minimizes test score error on $\hat{\testCaseDistribution}_{\testCaseGroup}$'s worst 1\% of policy--target-distribution pairs sampled from $\jointTestCaseDistributionPolicyUncertainty$.






The \emph{$k$-of-$N$ robustness measures}~\citep{chen2012tractable} are percentile robustness measures defined by parameters $k, N \in \naturals$, $1 \le k \le N$, that permit \emph{tractable} optimization procedures.
This parameterization reflects the mechanics of how an efficient test $\testTuple$ is evaluated on such a measure: $N$ policy--target-distribution pairs are sampled from  $\jointTestCaseDistributionPolicyUncertainty$ and $\hat{\testCaseDistribution}_{\testCaseGroup}$'s performance is averaged over the $k$ worst pairs for $\hat{\testCaseDistribution}_{\testCaseGroup}$.
Every $k$-of-$N$ robustness measure is a non-increasing function, \ie/, more weight is placed on smaller percentiles, and the fraction $\nicefrac{k}{N}$ represents the percentile (technically the fractile) around which the measure decreases.

In our test construction setting, the choice of $k$ and $N$ reflects the designer's tolerance for test scores that are bad because of ``unlucky'' outcomes from $\jointTestCaseDistributionPolicyUncertainty$ (that is, test scores with large error on policy--target-distribution pairs sampled from $\jointTestCaseDistributionPolicyUncertainty$, even if they are sampled infrequently).
Optimizing for performance under small percentiles (\eg/, setting $k = 1, \;N = 100$) yields tests with a small maximum test score error across $\PolicySet_{\tuningLabel}$.
Then, even if each candidate deployment policy resembles the tuning policy that has the largest test score error, the optimized test will yield small test score errors.
In contrast, optimizing for the uniform measure ($k = N$) optimizes for mean performance across $\PolicySet_{\tuningLabel}$, essentially assuming $\PolicySet_{\deploymentLabel} = \PolicySet_{\tuningLabel}$, which can lead to large test score error on the actual candidate deployment policies.

As $N \to \infty$, the $k$-of-$N$ robustness measure approaches the \emph{conditional value at risk} (\emph{CVaR}) robustness measure at the $\nicefrac{k}{N}$ fractile~\citep{chen2012tractable}, which evenly weights all of the fractiles $\le \nicefrac{k}{N}$ and puts a weight of zero on all larger fractiles.
Formally, the robustness optimization objective is to minimize the \emph{percentile performance loss}:
\begin{align}
    L_{\probMeasure, \jointTestCaseDistributionPolicyUncertainty}(\hat{\testCaseDistribution}_{\testCaseGroup})
        =
            \inf_{\integrableFn \in \IntegrableFnSet}
                \hspace{-3.5em}
                \underset{
                    \percentile \in [0, 1], \,
                    \Prob \subblock*{
                        \ell(\hat{\testCaseDistribution}_{\testCaseGroup}; \policy, \testCaseDistribution) \le \integrableFn(\eta)
                    } \ge \percentile
                }{\int}
                    \hspace{-3.5em}
                    \integrableFn(\eta)
                    \probMeasure(d\eta),
    \label{eq:percentile-performance-loss}
\end{align}
under a loss function
$\ell : \simplex^{\abs{\testCaseSet}} \times \PolicySet_{\tuningLabel} \times \simplex^{\abs{\testCaseSet}} \to \reals$
where we overload $\ell$ for incomplete test case weight vectors by filling in zeros for missing elements,
$\tuple{\policy, \testCaseDistribution} \sim \jointTestCaseDistributionPolicyUncertainty$,
and $\IntegrableFnSet$ is the class of real-valued, bounded, $\probMeasure$-integrable functions on $[0, 1]$.
An \emph{efficient} ($\mTestCasesToSelect$-size) $\probMeasure$-\emph{robust test} is a minimizer of
$L_{\probMeasure, \jointTestCaseDistributionPolicyUncertainty}$
across all $\hat{\testCaseDistribution}_{\testCaseGroup}$ where $\abs{\testCaseGroup} = \mTestCasesToSelect$.






The optimization of the percentile performance loss under a $k$-of-$N$ robustness measure, $\kOfNMeasure$, can be modeled as a zero-sum imperfect information game~\citep{chen2012tractable}.
Here, a protagonist player constructs efficient tests and an antagonist chooses a tuning policy to test and a target distribution.
For their payoffs, the antagonist receives the test score error of the protagonist's test given the antagonist's tuning policy and target distribution while the protagonist receives the negation.
The $k$ and $N$ parameters determine which target distributions and tuning policies the antagonist can choose from and how many pairs must be averaged across.
At the start of the game, $N$ target-distribution--tuning-policy pairs are sampled.
From these $N$ pairs, the antagonist must select $k$ of them.
Finally, one of these $k$ pairs is sampled, both players receive their payoffs, and the game ends.
A \emph{minimax} test for the protagonist, \ie/, one that minimizes the protagonist's maximum loss in this game, is a $\kOfNMeasure$-robust test.




\subsection{Regret}
While the game above models the optimization process, it does not instruct the protagonist on \emph{how} to choose test cases to win. A no-regret \emph{online decision process} (\emph{ODP}) algorithm can find approximate minimax decisions by repeatedly playing out the game and improving over time from payoff feedback.
Formally, on each round $t$ of the game, an ODP algorithm chooses an efficient test
$\tuple{\testCaseGroup^t, \hat{\testCaseDistribution}_{\testCaseGroup^t}^t}$
and receives the \emph{payoff function}
$\cfv^t = -\grad_{\hat{\testCaseDistribution}^t_{\testCaseGroup^t}} \ell(\hat{\testCaseDistribution}^t_{\testCaseGroup^t}; \policy^t, \testCaseDistribution^t)$
as feedback given $\tuple{\policy^t, \testCaseDistribution^t}$ chosen by the antagonist.
If the antagonist always plays a best response to the ODP algorithm, that is, the tuning-policy--target-distribution pair that maximizes the loss of
$\hat{\testCaseDistribution}^t_{\testCaseGroup^t}$
on each round $t \in \set{1, \ldots, T}$, $T \ge 1$, then the \emph{no-regret} property ensures that at least one of the tests in the sequence $\tuple{
    \tuple{\testCaseGroup^t, \hat{\testCaseDistribution}_{\testCaseGroup^t}^t}
}_{t = 1}^T$ is at most $\bigO(\nicefrac{\maxGrad}{\sqrt{T}})$ away from the minimax value, where $\maxGrad > 0$ is the maximum magnitude of the loss gradient (see \citet{ED,exploitabilityDescentArxiv} and Appendix \cref{A-prop:best-optimality} for more details).

\emph{Regret matching$^+$}~\citep{cfrPlus,solvingHulhe} is a no-regret algorithm for simplex decision sets, \eg/, the $\mTestCasesToSelect$-dimensional test case weight space $\simplex^{\mTestCasesToSelect}$,
that selects
$\hat{\testCaseDistribution}^t_{\testCaseGroup^t}
   = \frac{q^{1:t - 1}}{\ones^{\top} q^{1:t - 1}}$
using \emph{pseudoregrets}
$q^{1:t} = \ramp{q^{1:t - 1} + \regret^t}$, $q^{1:0} = \zeros$,
where $\regret^t = \cfv^t - \subex{\cfv^t}^{\top} \hat{\testCaseDistribution}^t_{\testCaseGroup^t}$ is the \emph{instantaneous regret} vector ($\hat{\testCaseDistribution}^t_{\testCaseGroup^t} = \frac{1}{\mTestCasesToSelect}\ones$ if none of the pseudoregrets are positive).


 



\section{RPOSST}\label{section:rposst}
Our approach, \emph{robust population optimization for a small set of test cases} (\emph{RPOSST})
begins by evaluating each tuning policy $\policy \in \PolicySet_{\tuningLabel}$ on each test case $\testCase \in \testCaseSet$, yielding a
$\abs{\testCaseSet} \times \abs{\PolicySet_{\tuningLabel}}$
result matrix $A$ of test case results.
As an optimization approach, RPOSST aims to minimize prediction errors, as measured by a convex function
$\differenceFn: \reals \times \reals \to \reals$,
\eg/, the absolute difference
$\differenceFn(\hat{x}, x) = \abs{\hat{x} - x}$.
RPOSST robustly optimizes for a small set of test cases and a weighting over them according to how well it reproduces test scores admitted by $A$ as measured by a loss function
\[
    \ell:
        \hat{\testCaseDistribution}; \policy_j, \testCaseDistribution
        \mapsto
            \differenceFn(
                \underbrace{
                    \E_{i \sim \hat{\testCaseDistribution}}\subblock*{
                        A_{i, j}
                    }
                }_{\hat{\testCaseDistribution}\text{'s test score for } \policy_j.},
                \underbrace{
                    \E_{i \sim \testCaseDistribution}\subblock*{
                        A_{i, j}
                    }
                }_{\testCaseDistribution\text{'s test score for } \policy_j.}
            ),
\]
on test case distribution
$\hat{\testCaseDistribution} \in \simplex^{\abs{\testCaseSet}}$
compared to
$\testCaseDistribution \in \simplex^{\abs{\testCaseSet}}$
with respect to test results from the $j$\textsuperscript{th} tuning policy $\policy_j$.
Since $\hat{\testCaseDistribution}$ is being used to produce test scores that approximate those under $\testCaseDistribution$, we call $\testCaseDistribution$ a \emph{target distribution} in this context.
Our goal is to select a small number of test cases, so we constrain RPOSST to output weights
$\hat{\testCaseDistribution}_{\testCaseGroup} \in \simplex^{\mTestCasesToSelect}$
for groups of test cases $\testCaseGroup \subset \testCaseSet$ of size $\mTestCasesToSelect$.


Though $\testCaseSet$ is large, the cost of computing $A$ is balanced by the savings of using fewer test cases for future policies.
RPOSST is robust to any distribution over $\PolicySet_{\tuningLabel}$, so as long as this set covers the space of $\PolicySet_{\deploymentLabel}$ (\ie/, all $\pi \in \PolicySet_{\deploymentLabel}$ are convex mixtures of $\PolicySet_{\tuningLabel}$), this robustness imparts a minimum test accuracy guarantee even on deployment candidates.
Intuitively, this means the quality of RPOSST's tests will tend to improve with more diverse tuning policies.
Accordingly, it should be beneficial for a tuning policy to represent an extreme point in a reasonable region of policy space, or at least for it to be generated with a method similar to that which will generate deployment candidates (\eg/, sampled from checkpoints of RL training runs).
That way, the tuning policies include a diverse collection of skilled and unskilled policies with random variations, while retaining architectural and algorithmic similarities to future deployment candidates.

Following the earlier discussion of $k$-of-$N$ robustness, we frame the optimization in RPOSST as a zero-sum game.
By adversarially choosing policies to test, the antagonist forces RPOSST to compose tests that are better at accurately testing the more difficult-to-assess policies in the tuning set, providing a degree of robustness to the distribution of future deployment candidates.
Similarly, by adversarially choosing the target distribution, the antagonist also forces RPOSST to be robust along this dimension.
The steps of each round $t = 1, \ldots, T$ of our optimization game are as follows.
\begin{enumerate}
    \item The protagonist must choose an $\mTestCasesToSelect$-tuple of test cases $\testCaseGroup^t \subset \testCaseSet$ and weights
        $\hat{\testCaseDistribution}^t_{\testCaseGroup^t} \in \simplex^{\mTestCasesToSelect}$.
    \item $N$ policies to test and target distributions,
        $\tuple*{ \tuple*{ \policy_{j_i}, \testCaseDistribution_i} }_{i = 1}^N$,
        are sampled from uncertainty distribution
        $\jointTestCaseDistributionPolicyUncertainty$.
    \item The antagonist chooses the $k$ worst policies and target distributions, \ie/, those that maximize
        $\ell\subex*{
            \hat{\testCaseDistribution}^t_{\testCaseGroup^t};
            \policy_{j_i},
            \testCaseDistribution_i
        }.$
    \item One of the $k$ worst configurations is sampled uniformly, leading to the end of the round, at which point the protagonist receives the payoff
        $\cfv^t_{\testCaseGroup^t, (i)} = -\ell\subex*{
            \hat{\testCaseDistribution}^t_{\testCaseGroup^t};
            \policy_{j_{(i)}},
            \testCaseDistribution_{(i)}
        },$
        where the subscript $(i)$ denotes the $i$\textsuperscript{th} element of a sorted list in descending order (the $i$\textsuperscript{th} worst for the protagonist).
\end{enumerate}
The protagonist is allowed to update their strategy at the end of each round based on the expected payoff, $\E_{i \sim \Unif\subex*{\set{1, \ldots, k}}}\subblock*{ \cfv^t_{\testCaseGroup, (i)} }$, for each $\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}$ they could have chosen.
The more rounds of the game that are run (the larger $T$ is), the closer RPOSST gets to returning a minimax strategy, and consequently, a robust optimal selection of test cases and weights.
Thus, in application, $T$ can be set as large as is convenient under computational and time constraints.
\Cref{thm:rposst-seq} gives a precise rate for RPOSST's improvement, with high probability, as a function of $T$.
Although the protagonist must consider an exponential (in $\mTestCasesToSelect$) number of test case combinations, the premise of RPOSST is that we want a small set of test cases, so $\mTestCasesToSelect$ will be small.
To decrease computational requirements, RPOSST can be run in a loop to select test cases iteratively until $\mTestCasesToSelect$ have been selected, at a potential cost to test accuracy compared to optimizing for the entire $\mTestCasesToSelect$-tuple at once.


\subsection{Antagonist Information Models}

We consider two RPOSST algorithm variants that utilize different models of the information that the antagonist in our optimization game has before they make their choice.
These models correspond to two policy testing use cases.
The first, ``simultaneous move'' model is less pessimistic, but has impractical aspects, which are addressed by the subsequent ``sequential move'' model.




\textbf{Simultaneous move.}
The simultaneous move model is a na\"ive application of the original $k$-of-$N$ game by \citet{chen2012tractable}.
In this model, the antagonist does not observe which $\mTestCasesToSelect$-tuple of test cases, $\testCaseGroup^t$, is selected by the protagonist on each round $t$.
Instead, it is randomized with a distribution $\hat{\testCaseDistribution}^t_{\testCaseGroupDecisionLabel} \in \simplex^{\abs{\testCaseSet}^{\mTestCasesToSelect}}$.
This model corresponds to the policy testing use case where a new $\mTestCasesToSelect$-tuple of test cases is sampled independently for each test that is performed.
Every test only evaluates $\mTestCasesToSelect$ cases, as desired from a computational efficiency perspective; however, the particular test cases used in each test could be different, making results incomparable across tests.
See Appendix \cref{A-sec:sim-move-model} for additional details.



\begin{algorithm2e}[tb]
  \caption{RPOSST$_{\seqLabel}$ with regret matching$^+$ and Successive Rejects}\label{alg:rposst-seq}
  \DontPrintSemicolon
  \textbf{\textit{Inputs:}} $\langle k, N, T_1, \mTestCasesToSelect, \jointTestCaseDistributionPolicyUncertainty, \testCaseGroup^0, \ell, T_2 \rangle$
\vspace{0.2em} \hrule \vspace{0.2em}

  $q^{1:0}_{\testCaseGroup} \gets \zeros \in \reals^{\mTestCasesToSelect + \abs{\testCaseGroup^0}}$ \textbf{for} $\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}$

  $T' \sim \Unif(\set{1, \ldots, T_1})$

  \For{$t \gets 1, \ldots, T'$}{
      \For{$\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}$}{
          $z^t \gets \ones^{\top} q^{1:t - 1}_{\testCaseGroup}$

          $\hat{\testCaseDistribution}^t_{\testCaseGroup} \gets q^{1:t - 1}_{\testCaseGroup} / z^t$ \textbf{if} $z^t > 0$ \textbf{else} $\ones / \mTestCasesToSelect$

          \tcp{Add zeros to ensure $\hat{\testCaseDistribution}^t_{\testCaseGroup} \in \simplex^{\abs{\testCaseSet}}$.}
          $\hat{\testCaseDistribution}^t_{\testCaseGroup}(x) \gets 0$ \textbf{for} $x \in \testCaseSet \setminus (\testCaseGroup \cup \testCaseGroup^0)$

          $\subblock{ \ell_{\testCaseGroup, (i)} }_{i = 1}^k \gets
            \WorstKOfNLossesFn\subex*{\hat{\testCaseDistribution}^t_{\testCaseGroup}, \tuple{k, N}, \jointTestCaseDistributionPolicyUncertainty, \ell}$

          $\cfv^t_{\testCaseGroup} \gets
            \dfrac{-1}{k} \sum_{i = 1}^k \dfrac{\partial \ell_{\testCaseGroup, (i)}, \policy_{j_{(i)}}}{\partial \hat{\testCaseDistribution}^t_{\testCaseGroup}}$

          \tcp{Update regret matching$^+$.}
          $\regret^t_{\testCaseGroup} \gets \cfv^t_{\testCaseGroup} - (\hat{\testCaseDistribution}^t_{\testCaseGroup})^{\top} \cfv^t_{\testCaseGroup}$

          $q^{1:t}_{\testCaseGroup} \gets \ramp{q^{1:t - 1}_{\testCaseGroup} + \regret^t_{\testCaseGroup}}$
      }
  }
  $\testCaseGroup^* \gets
    \SuccessiveRejects\subex*{
      \testCaseGroup
        \mapsto
          \dfrac{1}{2 k \maxLoss} \ones^{\top}
            \WorstKOfNLossesFn\subex*{
              \hat{\testCaseDistribution}^{T'}_{\testCaseGroup},
              \tuple{k, N},
              \jointTestCaseDistributionPolicyUncertainty,
              \ell
            },
      T_2
    }$

  \Return $\testCaseGroup^*, \hat{\testCaseDistribution}^{T'}_{\testCaseGroup^*}$

  \vspace{0.5em} \hrule \vspace{0.2em}
  \setcounter{AlgoLine}{0}
  \SetKwProg{Subroutine}{Procedure}{}{}
  \Subroutine{$\WorstKOfNLossesFn$ \quad \textbf{Inputs:} $\langle \hat{\testCaseDistribution}, k, N, \jointTestCaseDistributionPolicyUncertainty, \ell \rangle$}{

\vspace{0.2em} \hrule \vspace{0.2em}

    \For{$i \gets 1, \ldots, N$}{
      \tcp{Sample antagonist actions.}
      $\policy_{j_i}, \testCaseDistribution_i \sim \jointTestCaseDistributionPolicyUncertainty$

      \tcp{Evaluate $\hat{\testCaseDistribution}$.}
      $\ell_i \gets \ell(\hat{\testCaseDistribution}; \policy_{j_i},\testCaseDistribution_i)$
    }

    \tcp{Sort to identify the worst $k$.}
    $\SortFn\subex*{
      \subblock*{ \ell_i }_{i = 1}^N
    }$

    \Return $\subblock{ \ell_{(i)} }_{i = 1}^k$
  }
\end{algorithm2e}
 
\textbf{Sequential move.}
In the sequential move model, the antagonist observes $\testCaseGroup^t$ before acting.
The antagonist is thus able to tailor their choice of
$\tuple*{\tuple*{
    \policy_{j_{(i)}},
    \testCaseDistribution_{(i)}
}}_{i = 1}^k$
to whichever $\testCaseGroup^t$ is selected, and randomizing over the $\mTestCasesToSelect$-tuple of test cases has no benefit to the protagonist.
Since the antagonist observes $\testCaseGroup^t$, the protagonist must update all the weights that they would apply to each test case tuple $\testCaseGroup$ as if $\testCaseGroup^t = \testCaseGroup$.
Thus, the selection of $\testCaseGroup^t$ does not impact the protagonist's updates and we need not explicitly select an $\mTestCasesToSelect$-tuple until the very end of the algorithm, after $T' \sim \Unif\subex{\set{1, \ldots, T_1}}$ rounds.\footnote{RPOSST is run for $T'$ rather than $T_1$ rounds because we cannot guarantee a decrease in worst-case loss after every round. See the proof of \cref{thm:rposst-seq} for more details.}

Since the set of $N$ losses observed on each round are generally random, we cannot reuse them to identify which $\mTestCasesToSelect$-tuple leads to the lowest loss using the test case weights computed after running for $T'$ rounds, $\tuple{\hat{\testCaseDistribution}^{T'}_{\testCaseGroup}}_{\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}}$.
In addition, we cannot access expected $k$-of-$N$ losses directly; we must estimate them by sampling from $\jointTestCaseDistributionPolicyUncertainty$.
Therefore, the selection of a single $\testCaseGroup$ is a ``best arm identification'' problem, where $\testCaseSet^{\mTestCasesToSelect}$ is the set of arms.
The Successive Rejects (SR)~\citep{audibert2010BestArmIdentification} algorithm is an exploration-only bandit algorithm that can be used to solve this problem with a worst-case guarantee on the probability that it identifies the best arm.
The more SR iterations that are run, the more likely it is to select the best arm.
\Cref{alg:rposst-seq} shows how to implement RPOSST for the sequential move model using regret matching$^+$ for tuning the test case weights and SR for the final selection of an $\mTestCasesToSelect$-tuple.

In specific applications, an example of which we will see in \cref{sec:deterministic-rposst} and our experiments, we can construct our optimization game so that it is deterministic, and consequently, we can replace SR with a simple argmax.

The RPOSST$_{\seqLabel}$ objective is the percentile performance loss
\begin{align}
    \min_{
        \substack{
            \testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}\\
			\hat{\testCaseDistribution}_{\testCaseGroup} \in \simplex^{\mTestCasesToSelect}
        }
    }
    \inf_{\integrableFn \in \IntegrableFnSet}
        \hspace{-1em}
        \underset{
            \substack{
                \percentile \in [0, 1]\\
                \Prob \subblock*{
                    \ell(\hat{\testCaseDistribution}_{\testCaseGroup}; \policy_j, \testCaseDistribution) \le \integrableFn(\eta)
                } \ge \percentile
            }
        }{\int}
            \hspace{-3em}
            \integrableFn(\eta)
            \probMeasure_{\kOfN}(d\eta),
    \label{eq:rposst-seq-objective}
\end{align}
where
$\tuple{\policy_j, \testCaseDistribution} \sim \jointTestCaseDistributionPolicyUncertainty$.

The sequential move model represents the policy testing use case where we select and fix $\mTestCasesToSelect$ test cases and test case weights for all future test policies.
Test scores are easily reproducible and comparable across test applications since the test cases never change.


\begin{theorem}
    \label{thm:rposst-seq}
    After $T' \sim \Unif(\set{1, \ldots, T_1})$, $T_1 > 0$, rounds of its optimization game, \cref{alg:rposst-seq} selects an $\mTestCasesToSelect$-tuple of test cases, $\testCaseGroup^*$, and weights
$\hat{\testCaseDistribution}^{T'}_{\testCaseGroup^*} \in \simplex^{\mTestCasesToSelect}$
that, with probability $(1 - p)(1 - q)(1 - \alpha)$, $p, q, \alpha > 0$, are $\frac{\gap}{q}$-optimal for \cref{eq:rposst-seq-objective}, where
$\gap = \bigO\subex*{ \sqrt{\frac{1}{T_1} \mTestCasesToSelect} + \sqrt{\frac{1}{T_1} \log\subex*{ \nicefrac{1}{p} }} }$
and
$\alpha = \bigO\subex*{\e^{-T_2}}$. \end{theorem}
\textbf{All proofs deferred to the Appendix.}
In the extreme case where $\PolicySet_{\tuningLabel}$ covers $\PolicySet$, this optimality result (in terms of an upper bounded percentile loss integral) extends to all deployment candidates $\PolicySet_{\deploymentLabel}$.


\subsection{Deterministic CVaR RPOSST}\label{sec:deterministic-rposst}

While in general, an RPOSST algorithm has a randomized procedure and a non-deterministic optimality guarantee, we can actually select hyperparameters so that RPOSST is deterministic, making the procedure simpler and more reliable.
If we fix the ratio $\nicefrac{k}{N}$ and allow $N \to \infty$, the $k$-of-$N$ robustness measure converges toward the CVaR measure at the $\nicefrac{k}{N}$ fractile.
A $k$-of-$N$ algorithm where $N \to \infty$ cannot be implemented with the usual sampling procedure, but it can be implemented if the distribution characterizing our uncertainty, $\jointTestCaseDistributionPolicyUncertainty$, has finite support.

Sampling $\jointTestCaseDistributionPolicyUncertainty$ infinitely would result in sampling all tuning-policy--target-distribution pairs in its support exactly in proportion to their probabilities.
Rather than selecting $k$ tuning-policy--target-distribution pairs, the antagonist must select pairs until their cumulative probability sums to $\nicefrac{k}{N}$.
Effectively, the antagonist assigns weights
\[\alpha_{(i)} = \min \set*{
    \jointTestCaseDistributionPolicyUncertainty\subex*{
    \tuple*{\policy_{j_{(i)}}, \testCaseDistribution_{(i)}}
    },
    \nicefrac{k}{N} - \sum_{h = 1}^{i - 1} \alpha_{(h)}
}\]
to each tuning-policy--target-distribution pair in $\jointTestCaseDistributionPolicyUncertainty$'s support, where the ordering between pairs is determined by the loss each induces for the protagonist.
Finally, these tuning-policy--target-distribution pairs are sampled according to the normalized weights $\frac{\alpha_{(i)} N}{k}$.

The robustness guarantees become deterministic because the entire RPOSST algorithm, denoted as CVaR($\percentile$) RPOSST for the $\percentile = \nicefrac{k}{N}$ fractile, can be run using exact expectations (excluding randomness in $A$, which is taken as given in RPOSST).
Determinism in RPOSST$_{\seqLabel}$ allows us to directly check the exact expected loss of each test case distribution on each round, letting us track the lowest loss test case distribution across all rounds.
This tracking, in turn, allows us to avoid both sampling $T'$ and running the $\SuccessiveRejects$ algorithm to do the final selection.
Instead, we can simply return the lowest loss test case distribution across all $T$ rounds.

If there are $d$ tuning-policy--target-distribution pairs in $\jointTestCaseDistributionPolicyUncertainty$'s support, then the expected CVaR($\percentile$) loss of the protagonist on round $t$ is
$L^t = \min_{\testCaseGroup \in \testCaseSet^{\mTestCasesToSelect}}
    \sum_{i = 1}^d
        \frac{\alpha_{(i)}}{\percentile}
            \ell(\hat{\testCaseDistribution}^t_{\testCaseGroup}; \policy_{j_{(i)}}, \testCaseDistribution_{(i)})$.
The round with the lowest expected loss is
$t^* = \argmin_{t \in \set{1, \ldots, T}} L^t$,
and this definition allows us to state the following corollary.
\begin{corollary}
    \label{cor:deterministic-rposst-seq}
    Assume that
$\jointTestCaseDistributionPolicyUncertainty \in \simplex^d$
for some finite $d \ge 1$.
After $T$ rounds of the CVaR($\percentile$) RPOSST$_{\seqLabel}$ optimization game, where the protagonist chooses $\mTestCasesToSelect$-size tests according to regret matching$^+$ against a best response antagonist,
$\testCaseGroup^*$ and $\hat{\testCaseDistribution}^{t^*}_{\testCaseGroup^*}$
are $\gap$-optimal for \cref{eq:rposst-seq-objective} under the $\percentile$-fractile CVaR robustness measure, where
$\gap = \bigO\subex*{ \sqrt{\frac{1}{T} \mTestCasesToSelect}}$. \end{corollary}
Pseudocode for CVaR($\percentile$) RPOSST$_{\seqLabel}$ is presented in Appendix \cref{A-alg:cvar-rposst-seq}.

\def\targetDistributionLabelText{TTD}
\def\tuningLabelText{TNP}

In addition, we can construct a series of ablations of CVaR RPOSST$_{\seqLabel}$ to act as baselines for experiments, and to make a connection to the test-construction literature.

CVaR RPOSST$_{\seqLabel}$ generalizes an intuitive algorithm: find the $\mTestCasesToSelect$-tuple of test cases that minimizes the maximum error assuming a uniform distribution over the tuple. This \emph{minimax uniform} algorithm is implemented by executing only the initialization and selection steps of CVaR($0$) RPOSST$_{\seqLabel}$ ($T = 0$).
Further simplifying,
\emph{minimax(\targetDistributionLabelText) uniform} performs the antagonist maximization only over target distributions and assumes a uniform distribution over tuning policies.
\emph{Minimax(\tuningLabelText) uniform} performs the antagonist maximization only over tuning policies and assumes a uniform target distribution.
\emph{Miniaverage uniform} assumes both a uniform distribution over tuning policies and a uniform target distribution.

Additionally, we could select test cases one at a time to minimize the maximum error, echoing greedy algorithms from the test-construction literature (Chapter~4 of \citet{vanderlinden2005linear}).
This \emph{iterative minimax} algorithm is almost the same as running the initialization and return steps of CVaR($0$) RPOSST$_{\seqLabel}$ to select a single test case in a loop.
The sole difference is that iterative minimax could select the same test case multiple times within its loop to adjust the test case weighting away from uniform.


\section{Experiments}\label{section:experiments}

\begin{figure*}[t]
    \begin{minipage}[t]{0.325\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_f_50/col_payoffs.size2.holdout20.rposst.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.325\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_l_50/col_payoffs.size2.holdout20.rposst.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.325\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2012_2pl/duplicate_mean_matrix_big_blinds.size2.holdout20.rposst.error}.pdf}
    \end{minipage}\vspace{0.5em}
    \begin{minipage}[t]{0.325\linewidth}
        \includegraphics[width=\linewidth]{{figures/acpc2017_2pn/duplicate_mean_matrix_big_blinds.size2.holdout20.rposst.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.325\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_large_matrix/gt_winrate_matrix.size2.holdout20.rposst.error}.pdf}
    \end{minipage}\hfill \begin{minipage}[t]{0.325\linewidth}
        \includegraphics[width=\linewidth]{{figures/gt_non_zero_sum/gt_winrate_matrix.size2.holdout20.rposst.error}.pdf}
    \end{minipage}\caption{Expected test score error (absolute difference) across holdout-policy--target-distribution pairs on (top left and middle) Racing Arrows, (top right) the 2012 two-player, limit competition of the ACPC, (bottom left) the 2017 two-player, no-limit competition of the ACPC, (bottom middle and right) \GranTurismo7/ races, between CVaR($\cvarPercentile$) RPOSST$_{\seqLabel}$ and baseline tests on $\numHoldoutReplicas$ randomly sampled sets of holdout policies ($20\%$ of the full set of policies; $80\%$ used as tuning policies). Holdout-policy--target-distribution pairs are sorted according to test score error. Each RPOSST$_{\seqLabel}$ instance was run for $500$ rounds ($T = 500$). Errorbars represent $95\%$ t-distribution confidence intervals.}
    \label{fig:test-score-error}
\end{figure*}
\begin{figure*}[t]
    \begin{subfigure}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_f_500/col_payoffs.size1.holdout96.rposst.error}.pdf}
        \caption{$\mTestCasesToSelect = 1$}
    \end{subfigure}\hfill \begin{subfigure}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_f_500/col_payoffs.size2.holdout96.rposst.error}.pdf}
        \caption{$\mTestCasesToSelect = 2$}
    \end{subfigure}\hfill \begin{subfigure}[t]{0.32\linewidth}
        \includegraphics[width=\linewidth]{{figures/racing_arrows_f_500/col_payoffs.size3.holdout96.rposst.error}.pdf}
        \caption{$\mTestCasesToSelect = 3$}
    \end{subfigure}
    \caption{Expected test score error across holdout-policy--target-distribution pairs on Racing Arrows where test cases are follower policies. Here, 500 Racing Arrows policies were sampled for both the follower and leader role and then $96\%$ of policies of both roles were held out before running RPOSST and each baseline. Each column uses a different test size $\mTestCasesToSelect$. $\numHoldoutReplicas$ sets of holdout policies were sampled and each RPOSST$_{\seqLabel}$ instance was run for $500$ rounds ($T = 500$).}
    \label{fig:racing-arrows-f-500}
\end{figure*}

We explore CVaR RPOSST$_{\seqLabel}$'s performance in three two-player games spanning the range of complexity from a toy one-shot game to a high-fidelity racing simulator, in comparison with minimax and miniaverage baselines.
We show that robustness does tend to decrease test score errors on holdout policies and that RPOSST specifically either outperforms or performs about as well as each baseline in each domain.


\subsection{Experimental Setup}

In each domain, we start with data from playing out every pairing of $n > 0$ policies, yielding a matrix of scores for the column policy.
Each policy along the rows of this matrix is then treated as a test case, making the score at row $i$ and column $j$ the result of evaluating policy $j$ on test case $i$.

To emulate unknown deployment candidate policies to be tested, we hold out $h > 0$ columns of this matrix and call the policy associated with a holdout column a \emph{holdout policy}.
The remaining columns represent the test case results for the set of tuning policies.
The resulting $n \times (n - h)$ matrix is shifted and rescaled so that all entries are between zero and one, and then it is set as the test result matrix $A$ that our methods take as input. Note, although $h$ test cases are generated by holdout policies, as test cases they cannot provide any special information about what tests would be effective on the holdout policies.
To simulate scenarios where the set of tuning policies covers the set of future candidate deployment policies to varying degrees, we run experiments with three different values of $h$: $0.2n$, $0.4n$, and $0.6n$.
$\numHoldoutReplicas$ different holdout sets are randomly sampled for each value of $h$ and in each domain.

Given results for $n$ test cases, the goal is to produce a distribution over $\mTestCasesToSelect < n$ test cases that provides accurate test results on the set of holdout policies, according to a set of target distributions.
For our experiments, we use $\mTestCasesToSelect \in \set{1, 2, 3}$ and the set of target distributions generated from the softmax function applied to the negative average test case result under four different scales, specifically,
$\exp\subex*{ \frac{-\beta}{n} A \ones }
    / \ones^{\top} \exp\subex*{ \frac{-\beta}{n} A \ones }$
for $\beta \in \set{0, 1, 2, 4}$,
so that the distributions put varying degrees of emphasis on test cases that are more difficult on average across the tuning policies.
We set the RPOSST uncertainty distribution, $\jointTestCaseDistributionPolicyUncertainty$, to be uniform over each tuning-policy--target-distribution pair.
We set the CVaR percentile to $\cvarPercentile$ so that it is nearly optimizing for the worst case, but is slightly less pessimistic, to add an additional distinguishing factor to RPOSST compared to the minimax and miniaverage baselines.
We use the absolute difference loss for both optimization and evaluation.


\subsection{Domains}\label{sec:domains}

We test RPOSST on the following three domains of varying complexity.
Each domain has two variants arising from asymmetry, multiple datasets, or alternative scoring rules.
Appendix \cref{A-sec:experimental-details} provides further details on each domain.

\textbf{Racing Arrows.} Racing Arrows is a two-player, zero-sum, one-shot, continuous action game invented for our experiments to replicate aspects of a passing scenario in a race featuring a ``leader'' player and faster ``follower'' player. The follower tries to pass the leader while the latter tries to block.
Scores are recorded as $0$ or $+1$ for a loss or win, respectively, for the column player, which is either the leader or the follower, depending on the configuration.
We run RPOSST on both configurations.
For our experiments, we sample 50 or 500 different leader and follower policies evenly spread through the valid policy space, angles in $\subblock{0, \pi}$, by taking 50 or 500 evenly spaced angles between $\subblock{0.05\pi, (1 - 0.05)\pi}$ and then shifting them independently with uniform samples in $\subblock{-0.05 \pi, 0.05 \pi}$.

\textbf{Annual Computer Poker Competition.} We take two open datasets from the Annual Computer Poker Competition (ACPC) \citep{bard2013annual} containing pairwise match data for poker agents submitted to the 2017 two-player, no-limit competition and the 2012 two-player, limit competition.
These competitions contain different agent populations since they are separated by five years and are in different game formats (limit and no-limit).
The 2017 competition consists of 15 agents and the 2012 competition consists of 12 agents.
Scores are recorded as chip differentials of duplicate matches (two sets of hands where players play with the same set of shuffled decks in both seats).

\textbf{Gran Turismo\textsuperscript{\texttrademark} one-on-one races.} \GranTurismo7/ (GT)\footnote{\url{https://www.gran-turismo.com/us/}} is a high fidelity racing simulator on the PlayStation\textsuperscript{\texttrademark} platform.
Previous versions of GT served as benchmarks for training RL policies \citep{fuchs2021super,song2021autonomous} including policies that outraced the best human competitors~\citep{wurman2022outracing} in four-on-four racing.
We consider a simpler one-on-one racing scenario (see Appendix \cref{A-sec:gt7} for details).
We carry out two experiments, one where test case results are average winrates, and another where policies receive $0$ for a loss, $+1$ for a win, and $-1$ if there was a collision, making the game non-zero-sum.
The test case pool is comprised of $43$ trained RL policies and $3$ built-in ``AI'' policies.


\subsection{Results and Analysis}\label{sec:results_and_analysis}

The results of running CVaR($\cvarPercentile$) RPOSST$_{\seqLabel}$ on each domain, with $\mTestCasesToSelect = 2$ and $20\%$ of policies marked as holdout policies, are shown in \cref{fig:test-score-error}.
The same set of figures with $\mTestCasesToSelect = 1$ and $\mTestCasesToSelect = 3$, as well as 40\% and 60\% holdout policies, are qualitatively similar, except that the differences between the algorithms are typically smaller, and are provided in Appendix \cref{A-sec:supplemental-experimental-results}.

\begin{figure*}
	\begin{center}
		\includegraphics[width=\linewidth, keepaspectratio]{figures/gt_large_16_41_rows_no_aggregated_test_result.pdf}
	\end{center}
	\caption{The GT test results of candidate deployment policies against the test case pair most favoured by RPOSST. Blue and red indicate positive and negative winrates, respectively, for the candidate deployment policy.}\label{fig:pair_chosen_by_rposst}
\end{figure*}

Looking across each domain and variant, we can see that RPOSST$_{\seqLabel}$ performs nearly as well as or better than all of the minimax and miniaverage baselines, particularly in terms of maximum error across holdout-policy--target-distribution pairs.
Interestingly, RPOSST$_{\seqLabel}$ has noticeably lower error in ACPC 2017 and GT (winrate) on the four most difficult holdout-policy--target-distribution pairs to accurately evaluate.
The improvement over the next best method is substantial in ACPC 2017 because RPOSST is the only method with an unlimited ability to optimize with a non-uniform test case weighting.\footnote{Iterative minimax can change its test case distribution away from uniform, but only indirectly by selecting a test case it already selected on a previous iteration before it fills its test-case quota.}
On the other variant in each domain, RPOSST$_{\seqLabel}$ is within the group of the lowest error methods.
In the two Racing Arrows domains, RPOSST$_{\seqLabel}$ and minimax uniform substantially outperform the other methods, at least on the most difficult holdout-policy--target-distribution pairs.
This result shows that robustness is indeed beneficial here, but the uniform distribution over the selected two opponents happens to be quite effective.
The GT variant where $-1$ is assigned to a collision appears to be more difficult than the winrate variant, as all the methods cluster together in this variant at higher errors than in the winrate variant.

These results illustrate the utility of incorporating robustness generally, as all of the robust methods tend to outperform miniaverage uniform.
Minimax uniform and iterative minimax are the only baselines that minimize their maximum error over both tuning policy and target distribution uncertainty, and they are usually the next best methods after RPOSST$_{\seqLabel}$.
Minimax(TNP) uniform typically outperforms minimax(TTD) uniform, showing that it is more important to be robust to the tuning policy than to the target distribution in these domains.
When the target distributions are the same in the optimization and holdout evaluation phases, robustness should directly improve the minimum performance across holdout realizations.
Since no effort was made to enforce any relationship between the tuning and holdout policies, this result suggests that robustness to the tuning policy can yield large error reductions when the tuning policies, $\PolicySet_{\tuningLabel}$, are even somewhat similar to the holdout policies.

As an example of RPOSST's capabilities, consider the pairs of opponent policies chosen as test cases in GT (winrate) over 100 experiment seeds (Appendix \cref{A-tab:top_picks_holdout_20}).
RPOSST$_{\seqLabel}$ is both more accurate (\cref{fig:test-score-error}) and very consistent, choosing the same pair 90\% of the time.
\Cref{fig:pair_chosen_by_rposst} illustrates the portion of the result matrix for just the two test cases most frequently chosen by RPOSST$_{\seqLabel}$ (test races against opponents 16 and 41).  The race against policy 41 (bottom row) is chosen because that policy wins/loses about half the time, providing a 50/50 information split.
Policy 16 is a weaker policy in many ways (more blue in the top row), but it serves to differentiate the worst policies (darker red squares on the left side of the matrix) from the rest of the policies, and to highlight the strongest policies.
Specifically, the best performing policies almost always win against policy 16, which provides a strong complementary signal to the noisier but more competitive policy 41 test case.
Overall, the two test cases indicate policies 1, 29, and 43 (darkest blue columns) are the strongest for deployment.
Policy 1 is a built-in AI in an overpowered car, but 29 and 43 are very strong RL policies.  Looking at the overall winrate matrix (Appendix \cref{A-figure:gt_winrate_matrix}), we see that the same conclusion (the three darkest blue columns overall) would have been reached using all 46 test cases.
Compressing from 46 test cases to two represents a massive saving in test time for future policies, and shows RPOSST$_{\seqLabel}$ can construct small tests to select deployment policies in a real and complex video game.

The results in \cref{fig:racing-arrows-f-500} repeat the previous analysis in Racing Arrows but with ten times the number of policies.
Only the results where follower policies are treated as test cases are shown, but the corresponding results where leader policies are test cases appear similar and are shown in Appendix \cref{A-sec:supplemental-experimental-results}.
$96\%$ of policies are held out, including those used as test cases, so there are only 20 test cases and tuning policies for RPOSST and the other algorithms to utilize.
This experiment emulates a scenario where an efficient test is constructed once with a relatively small number of tuning policies and then reused for many future deployment candidates.
As in the previous experiments, RPOSST is almost always one of the best methods.


\section{Related Work}


The bulk of the work on policy selection in RL focuses on selecting opponents for \emph{training} with self-play algorithms~\citep{hernandez2021comparison}. In that case, diversity is key for training additional policies to cooperate~\citep{rahman2022towards} or compete~\citep{liu2021towards,mcaleer2022anytime} with pre-existing policies.  However, the selection of policies as training opponents is often guided by aggregate performance metrics across entire populations~\citep{li2019generalized,lanctot2017unified,omidshafiei2019alpha,balduzzi2018re} and thus does not reduce the number of opponent pairings (test cases) required for assessments.

On the testing side, researchers in complex domains develop procedures for testing skill competency using hand-calibrated~\citep{wurman2022outracing} or randomly generated tests with complex percentile-scoring functions~\citep{team2021open}.
Our work seeks to automate and target test construction in such scenarios.
Complementary work~\citep{rowland2019multiagent} treats the computation of a result matrix as a multi-armed bandit problem, each entry represented by one arm.
While this method can greatly reduce sampling costs in the presence of low-variance outcomes,
it does not generalize to policies outside its input population, with the testing of a new policy requiring adding extra arms to be estimated from scratch. However, this method could be used in tandem with RPOSST to reduce the samples required to compute $A$.

Learning-to-rank methods~\citep{oosterhuis2021robust,bruch2021alternative,hu2018reinforcement} aim to find a function that ranks a set of items (\eg/, documents) based on their relevance to a given query, with the hope of generalizing to future queries. Indeed, \citet{akiyama2016learning} use learning to rank to evaluate action sequences. However, predicting unseen policy performances under this model requires the tuning policies to be the queries, which would produce a ranking of the test cases themselves. The scores from such tests would therefore be incomparable across policies, violating one of our main objectives.

Test construction in educational modeling~\citep{vanderlinden2005linear} starts from an
item bank and a statistical model (\eg/, Item Response Theory~\citep{embretson2000item}) predicting the probability of answering each item correctly given a student's (unobserved) skill level.  That model yields an \emph{information matrix}, and automatic test construction methods, including linear optimization or greedy heuristics, can then build a finite-sized test.
By contrast, we do not assume a model of the response variance or a univariate skill measurement, so a closed-form calculation of information is often infeasible.  However,  we do empirically compare our optimization approach to the greedy heuristic.














 

\section{Conclusion and Future Work}
RPOSST is, to the best of our knowledge, the first algorithm to directly address test construction for reinforcement learning policies.  By leveraging the $k$-of-$N$ framework, RPOSST provides bounds on the approximation error of the resulting test despite uncertainty over the exact policies that will be evaluated and the desired test case weighting in the future.  Thus, RPOSST provides a much-needed tool for policy selection in real-world deployment scenarios.  An interesting direction for future work is generating the test cases themselves~\citep{marris2021multi,pugh2016quality}, which is challenging on its own~\citep{balduzzi2019open}.






\begin{acknowledgements}
    Thanks to Francesco Riccio for reviewing this work. Thanks to the whole Sony AI team for experiment infrastructure.
\end{acknowledgements}

\bibliography{morrill_257}



\end{document}
