
% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    % Use the plainnat bibliography style.
    \bibliographystyle{plainnat}
    % Typeset the bibliography heading as an unnumbered subsubsection titled "References".
    \renewcommand{\bibsection}{\subsubsection*{References}}
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{tikz} % nice language for creating drawings and diagrams


%%%%%%%%%%%%%%%%%%%%% defs %%%%%%%%%%%%%%%%%%%%%

\RequirePackage{latexsym}
\RequirePackage{amsmath}
\RequirePackage{amsthm}
\RequirePackage{amssymb}
\RequirePackage{bm}
\RequirePackage{stmaryrd}
\RequirePackage{environ}
\RequirePackage{graphics}
\usepackage{dsfont}


% Review annotations.  The counter \Comments is the switch: the annotations
% below are typeset only while it equals 1.
\usepackage{color}
\definecolor{red}{rgb}{1,0,0}
\newcount\Comments
\Comments=1 %
% \kibitz{<color>}{<text>}: print <text> in <color> when \Comments is 1,
% otherwise print nothing.
\newcommand{\kibitz}[2]{%
  \ifnum\Comments=1 %
    \textcolor{#1}{#2}%
  \fi
}
% \owner{<name>}: red marker naming the current owner.
\newcommand{\owner}[1]{\kibitz{red}{[CURRENT OWNER: #1]}}

% \assign[<who>]{<task>}: red TODO marker; <who> defaults to 1.
\newcommand{\assign}[2][1]{\kibitz{red}{[TODO(#1):#2]}}


% Text shortcuts for "black-box"; \xspace handles the space that follows.
\RequirePackage{xspace}
\newcommand\Blackbox{Black-box\xspace} % capitalised form
\newcommand\blackbox{black-box\xspace} % lowercase form


% Fallback math alphabets, defined only when the providing package has not
% been loaded at this point.
\makeatletter
% Without dsfont, \mathds{X} falls back to \mathbb{X}.
\@ifpackageloaded{dsfont}{}{\newcommand{\mathds}[1]{\mathbb{#1}}}
% Without mathrsfs, \mathscr{X} falls back to \mathcal{X}.
\@ifpackageloaded{mathrsfs}{}{\newcommand{\mathscr}[1]{\mathcal{#1}}}
\makeatother

%%%%%%%% Environment %%%%%%%%%%%%%

% \begin{resize}[height]{width} ... \end{resize}
% Passes the environment body to \resizebox{width}{height}{...}.
% The optional height defaults to "!".
\NewEnviron{resize}[2][!]{%
  \resizebox{#2}{#1}{\BODY}%
}
% \begin{rescale}[height_ratio]{width_ratio} ... \end{rescale}
% Scales the environment body with \scalebox.  When the optional
% height_ratio is omitted (or empty) the body is scaled uniformly by
% width_ratio.  Previously an omitted option was forwarded as an empty
% vertical factor, \scalebox{width_ratio}[]{...}, which \scalebox cannot use.
\NewEnviron{rescale}[2][]{%
  \if\relax\detokenize{#1}\relax
    \scalebox{#2}{\BODY}% no height ratio given: uniform scaling
  \else
    \scalebox{#2}[#1]{\BODY}% independent horizontal/vertical factors
  \fi
}

% \newreptheorem{<env>}{<Title>}: declare an environment rep<env> for
% restating an existing result.  \begin{rep<env>}{<label>} opens an
% unnumbered theorem-like environment (internally named rep@<env>) whose
% heading is "<Title> \ref*{<label>}", i.e. it carries the number of the
% original statement.
% NOTE(review): \ref* is the starred form of \ref, presumably supplied by
% hyperref via the document class -- confirm that hyperref is loaded.
\makeatletter
\newcommand{\newreptheorem}[2]{
  \newtheorem*{rep@#1}{\rep@title} 
  \newenvironment{rep#1}[1]{\def\rep@title{#2 \ref*{##1}}\begin{rep@#1}}{\end{rep@#1}}
}
\makeatother
% Instantiations: environments reptheorem and repclaim.
\newreptheorem{theorem}{Theorem}
\newreptheorem{claim}{Claim}

\usepackage{thm-restate}

%%%%%%%% Stock standard definitions %%%%%%%%%%%%%%%

% Lowercase letters: bold versions first, tilde/hat/bar variants follow.
% \ab ... \zb: bold lowercase letters, e.g. \ab -> {\bm{a}}.
% The letters c, n and s are named \cbb, \nbb and \sbb.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{{\bm{#2}}}}
\mkdef\ab{a} \mkdef\bb{b} \mkdef\cbb{c} \mkdef\db{d} \mkdef\eb{e} \mkdef\fb{f}
\mkdef\gb{g} \mkdef\hb{h} \mkdef\ib{i} \mkdef\jb{j} \mkdef\kb{k} \mkdef\lb{l}
\mkdef\mb{m} \mkdef\nbb{n} \mkdef\ob{o} \mkdef\pb{p} \mkdef\qb{q} \mkdef\rb{r}
\mkdef\sbb{s} \mkdef\tb{t} \mkdef\ub{u} \mkdef\vb{v} \mkdef\wb{w} \mkdef\xb{x}
\mkdef\yb{y} \mkdef\zb{z}
\let\mkdef\undefined

% \ba ... \bz: prefix-style aliases for the bold lowercase letters,
% e.g. \ba -> \ab.  The letters f and m each have two aliases
% (\bbf/\bff and \bbm/\bmm); there is no alias for b.
% NOTE(review): \by expands to y^n, not to \yb like its neighbours --
% confirm that this exception is intentional.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{#2}}
\mkdef\ba\ab \mkdef\bc\cbb \mkdef\bd\db \mkdef\be\eb \mkdef\bbf\fb \mkdef\bff\fb
\mkdef\bg\gb \mkdef\bh\hb \mkdef\bi\ib \mkdef\bj\jb \mkdef\bk\kb \mkdef\bl\lb
\mkdef\bbm\mb \mkdef\bmm\mb \mkdef\bn\nbb \mkdef\bo\ob \mkdef\bp\pb \mkdef\bq\qb
\mkdef\br\rb \mkdef\bs\sbb \mkdef\bt\tb \mkdef\bu\ub \mkdef\bv\vb \mkdef\bw\wb
\mkdef\bx\xb \mkdef\by{y^n} \mkdef\bz\zb
\let\mkdef\undefined

% Tilde versions.
% \atil ... \ztil: lowercase letters under a tilde, e.g. \atil -> \tilde{a}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\tilde{#2}}}
\mkdef\atil{a} \mkdef\btil{b} \mkdef\ctil{c} \mkdef\dtil{d} \mkdef\etil{e} \mkdef\ftil{f}
\mkdef\gtil{g} \mkdef\htil{h} \mkdef\itil{i} \mkdef\jtil{j} \mkdef\ktil{k} \mkdef\ltil{l}
\mkdef\mtil{m} \mkdef\ntil{n} \mkdef\otil{o} \mkdef\ptil{p} \mkdef\qtil{q} \mkdef\rtil{r}
\mkdef\stil{s} \mkdef\ttil{t} \mkdef\util{u} \mkdef\vtil{v} \mkdef\wtil{w} \mkdef\xtil{x}
\mkdef\ytil{y} \mkdef\ztil{z}
\let\mkdef\undefined

% \at ... \zt: short names for the tilde letters, e.g. \at -> \tilde{a}.
% The letters b, h, i, p, t and v use the longer names \bti, \hti/\htt,
% \iti/\itt, \pti/\ptt, \tti/\ttt and \vti/\vtt.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\tilde{#2}}}
\mkdef\at{a} \mkdef\bti{b}
%\newcommand{\btt}{\tilde{b}}
\mkdef\ct{c} \mkdef\dt{d} \mkdef\et{e} \mkdef\ft{f} \mkdef\gt{g}
\mkdef\hti{h} \mkdef\htt{h} \mkdef\iti{i} \mkdef\itt{i} \mkdef\jt{j} \mkdef\kt{k}
\mkdef\lt{l} \mkdef\mt{m} \mkdef\nt{n} \mkdef\ot{o} \mkdef\pti{p} \mkdef\ptt{p}
\mkdef\qt{q} \mkdef\rt{r} \mkdef\st{s} \mkdef\tti{t} \mkdef\ttt{t} \mkdef\ut{u}
\mkdef\vti{v} \mkdef\vtt{v} \mkdef\wt{w} \mkdef\xt{x} \mkdef\yt{y} \mkdef\zt{z}
\let\mkdef\undefined

% \abtil ... \zbtil: bold lowercase letters under a tilde,
% e.g. \abtil -> \tilde{\ab}.  c, n and s use \cbb, \nbb and \sbb.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\tilde{#2}}}
\mkdef\abtil\ab \mkdef\bbtil\bb \mkdef\cbtil\cbb \mkdef\dbtil\db \mkdef\ebtil\eb
\mkdef\fbtil\fb \mkdef\gbtil\gb \mkdef\hbtil\hb \mkdef\ibtil\ib \mkdef\jbtil\jb
\mkdef\kbtil\kb \mkdef\lbtil\lb \mkdef\mbtil\mb \mkdef\nbtil\nbb \mkdef\obtil\ob
\mkdef\pbtil\pb \mkdef\qbtil\qb \mkdef\rbtil\rb \mkdef\sbtil\sbb \mkdef\tbtil\tb
\mkdef\ubtil\ub \mkdef\vbtil\vb \mkdef\wbtil\wb \mkdef\xbtil\xb \mkdef\ybtil\yb
\mkdef\zbtil\zb
\let\mkdef\undefined

% \batil ... \bztil: prefix-style names for the bold tilde letters,
% e.g. \batil -> \tilde{\ab}.  There is no entry for the letter b.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\tilde{#2}}}
\mkdef\batil\ab \mkdef\bctil\cbb \mkdef\bdtil\db \mkdef\betil\eb \mkdef\bftil\fb
\mkdef\bgtil\gb \mkdef\bhtil\hb \mkdef\bitil\ib \mkdef\bjtil\jb \mkdef\bktil\kb
\mkdef\bltil\lb \mkdef\bmtil\mb \mkdef\bntil\nbb \mkdef\botil\ob \mkdef\bptil\pb
\mkdef\bqtil\qb \mkdef\brtil\rb \mkdef\bstil\sbb \mkdef\bttil\tb \mkdef\butil\ub
\mkdef\bvtil\vb \mkdef\bwtil\wb \mkdef\bxtil\xb \mkdef\bytil\yb \mkdef\bztil\zb
\let\mkdef\undefined

% \bat ... \bzt: shortest names for the bold tilde letters,
% e.g. \bat -> \tilde{\ab}.  The letters o and t each have two names
% (\boti/\boot and \btti/\btt).
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\tilde{#2}}}
\mkdef\bat\ab \mkdef\bbt\bb \mkdef\bct\cbb \mkdef\bdt\db \mkdef\bet\eb \mkdef\bft\fb
\mkdef\bgt\gb \mkdef\bht\hb \mkdef\bit\ib \mkdef\bjt\jb \mkdef\bkt\kb \mkdef\blt\lb
\mkdef\bmt\mb \mkdef\bnt\nbb \mkdef\boti\ob \mkdef\boot\ob \mkdef\bpt\pb \mkdef\bqt\qb
\mkdef\brt\rb \mkdef\bst\sbb \mkdef\btti\tb \mkdef\btt\tb \mkdef\but\ub \mkdef\bvt\vb
\mkdef\bwt\wb \mkdef\bxt\xb \mkdef\byt\yb \mkdef\bzt\zb
\let\mkdef\undefined

% Hat versions.
% \ahat ... \zhat: lowercase letters under a hat, e.g. \ahat -> \hat{a};
% \pihat -> \hat{\pi} is defined alongside them.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\hat{#2}}}
\mkdef\ahat{a} \mkdef\bhat{b} \mkdef\chat{c} \mkdef\dhat{d} \mkdef\ehat{e} \mkdef\fhat{f}
\mkdef\ghat{g} \mkdef\hhat{h} \mkdef\ihat{i} \mkdef\jhat{j} \mkdef\khat{k} \mkdef\lhat{l}
\mkdef\mhat{m} \mkdef\nhat{n} \mkdef\ohat{o} \mkdef\phat{p} \mkdef\qhat{q} \mkdef\rhat{r}
\mkdef\shat{s} \mkdef\that{t} \mkdef\uhat{u} \mkdef\vhat{v} \mkdef\what{w} \mkdef\xhat{x}
\mkdef\yhat{y} \mkdef\zhat{z}
\mkdef\pihat{\pi}
\let\mkdef\undefined

% \abhat ... \zbhat: bold lowercase letters under a hat,
% e.g. \abhat -> \hat{\ab}.
% Fix: \cbhat, \nbhat and \sbhat used to wrap \cb, \nb and \sb.  The bold
% letters c, n and s in this file are \cbb, \nbb and \sbb (as the matching
% tilde macros \cbtil, \nbtil, \sbtil already use), and \sb is the LaTeX
% kernel's subscript command, so \sbhat could not typeset a hatted bold s.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\hat{#2}}}
\mkdef\abhat\ab \mkdef\bbhat\bb \mkdef\cbhat\cbb \mkdef\dbhat\db \mkdef\ebhat\eb
\mkdef\fbhat\fb \mkdef\gbhat\gb \mkdef\hbhat\hb \mkdef\ibhat\ib \mkdef\jbhat\jb
\mkdef\kbhat\kb \mkdef\lbhat\lb \mkdef\mbhat\mb \mkdef\nbhat\nbb \mkdef\obhat\ob
\mkdef\pbhat\pb \mkdef\qbhat\qb \mkdef\rbhat\rb \mkdef\sbhat\sbb \mkdef\tbhat\tb
\mkdef\ubhat\ub \mkdef\vbhat\vb \mkdef\wbhat\wb \mkdef\xbhat\xb \mkdef\ybhat\yb
\mkdef\zbhat\zb
\let\mkdef\undefined

% \bahat ... \bzhat: prefix-style names for the bold hat letters,
% e.g. \bahat -> \hat{\ab}.  There is no entry for the letter b, and
% \bchat wraps the alias \bc rather than \cbb directly.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\hat{#2}}}
\mkdef\bahat\ab \mkdef\bchat\bc \mkdef\bdhat\db \mkdef\behat\eb \mkdef\bfhat\fb
\mkdef\bghat\gb \mkdef\bhhat\hb \mkdef\bihat\ib \mkdef\bjhat\jb \mkdef\bkhat\kb
\mkdef\blhat\lb \mkdef\bmhat\mb \mkdef\bnhat\nbb \mkdef\bohat\ob \mkdef\bphat\pb
\mkdef\bqhat\qb \mkdef\brhat\rb \mkdef\bshat\sbb \mkdef\bthat\tb \mkdef\buhat\ub
\mkdef\bvhat\vb \mkdef\bwhat\wb \mkdef\bxhat\xb \mkdef\byhat\yb \mkdef\bzhat\zb
\let\mkdef\undefined

% Bar versions.
% \abar ... \zbar: lowercase letters under a bar, e.g. \abar -> \bar{a}.
% The letters h and o are named \hbr and \obr.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\bar{#2}}}
\mkdef\abar{a} \mkdef\bbar{b} \mkdef\cbar{c} \mkdef\dbar{d} \mkdef\ebar{e} \mkdef\fbar{f}
\mkdef\gbar{g} \mkdef\hbr{h} \mkdef\ibar{i} \mkdef\jbar{j} \mkdef\kbar{k} \mkdef\lbar{l}
\mkdef\mbar{m} \mkdef\nbar{n} \mkdef\obr{o} \mkdef\pbar{p} \mkdef\qbar{q} \mkdef\rbar{r}
\mkdef\sbar{s} \mkdef\tbar{t} \mkdef\ubar{u} \mkdef\vbar{v} \mkdef\wbar{w} \mkdef\xbar{x}
\mkdef\ybar{y} \mkdef\zbar{z}
\let\mkdef\undefined

% \abbar ... \zbbar: bold lowercase letters under a bar,
% e.g. \abbar -> \bar{\ab}.
% Fix: \cbbar used to wrap \cb; the bold c in this file is \cbb (as the
% matching tilde macro \cbtil already uses), so \cbbar now wraps \cbb,
% in line with \nbbar -> \bar{\nbb} and \sbbar -> \bar{\sbb}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\bar{#2}}}
\mkdef\abbar\ab \mkdef\bbbar\bb \mkdef\cbbar\cbb \mkdef\dbbar\db \mkdef\ebbar\eb
\mkdef\fbbar\fb \mkdef\gbbar\gb \mkdef\hbbar\hb \mkdef\ibbar\ib \mkdef\jbbar\jb
\mkdef\kbbar\kb \mkdef\lbbar\lb \mkdef\mbbar\mb \mkdef\nbbar\nbb \mkdef\obbar\ob
\mkdef\pbbar\pb \mkdef\qbbar\qb \mkdef\rbbar\rb \mkdef\sbbar\sbb \mkdef\tbbar\tb
\mkdef\ubbar\ub \mkdef\vbbar\vb \mkdef\wbbar\wb \mkdef\xbbar\xb \mkdef\ybbar\yb
\mkdef\zbbar\zb
\let\mkdef\undefined

% \babar ... \bzbar: prefix-style names for the bold bar letters,
% e.g. \babar -> \bar{\ab}.  There is no entry for the letter b, and
% \bcbar wraps the alias \bc rather than \cbb directly.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\bar{#2}}}
\mkdef\babar\ab \mkdef\bcbar\bc \mkdef\bdbar\db \mkdef\bebar\eb \mkdef\bfbar\fb
\mkdef\bgbar\gb \mkdef\bhbar\hb \mkdef\bibar\ib \mkdef\bjbar\jb \mkdef\bkbar\kb
\mkdef\blbar\lb \mkdef\bmbar\mb \mkdef\bnbar\nbb \mkdef\bobar\ob \mkdef\bpbar\pb
\mkdef\bqbar\qb \mkdef\brbar\rb \mkdef\bsbar\sbb \mkdef\btbar\tb \mkdef\bubar\ub
\mkdef\bvbar\vb \mkdef\bwbar\wb \mkdef\bxbar\xb \mkdef\bybar\yb \mkdef\bzbar\zb
\let\mkdef\undefined

% Uppercase letters: plain, bold and calligraphic, with tilde/bar/hat.
% \Atil ... \Ztil: capital letters under a tilde, e.g. \Atil -> \tilde{A}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\tilde{#2}}}
\mkdef\Atil{A} \mkdef\Btil{B} \mkdef\Ctil{C} \mkdef\Dtil{D} \mkdef\Etil{E} \mkdef\Ftil{F}
\mkdef\Gtil{G} \mkdef\Htil{H} \mkdef\Itil{I} \mkdef\Jtil{J} \mkdef\Ktil{K} \mkdef\Ltil{L}
\mkdef\Mtil{M} \mkdef\Ntil{N} \mkdef\Otil{O} \mkdef\Ptil{P} \mkdef\Qtil{Q} \mkdef\Rtil{R}
\mkdef\Stil{S} \mkdef\Ttil{T} \mkdef\Util{U} \mkdef\Vtil{V} \mkdef\Wtil{W} \mkdef\Xtil{X}
\mkdef\Ytil{Y} \mkdef\Ztil{Z}
\let\mkdef\undefined

% \At ... \Zt: short names for the capital tilde letters, e.g. \At -> \tilde{A}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\tilde{#2}}}
\mkdef\At{A} \mkdef\Bt{B} \mkdef\Ct{C} \mkdef\Dt{D} \mkdef\Et{E} \mkdef\Ft{F}
\mkdef\Gt{G} \mkdef\Ht{H} \mkdef\It{I} \mkdef\Jt{J} \mkdef\Kt{K} \mkdef\Lt{L}
\mkdef\Mt{M} \mkdef\Nt{N} \mkdef\Ot{O} \mkdef\Pt{P} \mkdef\Qt{Q} \mkdef\Rt{R}
\mkdef\St{S} \mkdef\Tt{T} \mkdef\Ut{U} \mkdef\Vt{V} \mkdef\Wt{W} \mkdef\Xt{X}
\mkdef\Yt{Y} \mkdef\Zt{Z}
\let\mkdef\undefined

% \Atilde ... \Ztilde: capital letters under a wide tilde,
% e.g. \Atilde -> \widetilde{A}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\widetilde{#2}}}
\mkdef\Atilde{A} \mkdef\Btilde{B} \mkdef\Ctilde{C} \mkdef\Dtilde{D} \mkdef\Etilde{E}
\mkdef\Ftilde{F} \mkdef\Gtilde{G} \mkdef\Htilde{H} \mkdef\Itilde{I} \mkdef\Jtilde{J}
\mkdef\Ktilde{K} \mkdef\Ltilde{L} \mkdef\Mtilde{M} \mkdef\Ntilde{N} \mkdef\Otilde{O}
\mkdef\Ptilde{P} \mkdef\Qtilde{Q} \mkdef\Rtilde{R} \mkdef\Stilde{S} \mkdef\Ttilde{T}
\mkdef\Utilde{U} \mkdef\Vtilde{V} \mkdef\Wtilde{W} \mkdef\Xtilde{X} \mkdef\Ytilde{Y}
\mkdef\Ztilde{Z}
\let\mkdef\undefined

% \Abar ... \Zbar: capital letters under a bar, e.g. \Abar -> \bar{A}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\bar{#2}}}
\mkdef\Abar{A} \mkdef\Bbar{B} \mkdef\Cbar{C} \mkdef\Dbar{D} \mkdef\Ebar{E} \mkdef\Fbar{F}
\mkdef\Gbar{G} \mkdef\Hbar{H} \mkdef\Ibar{I} \mkdef\Jbar{J} \mkdef\Kbar{K} \mkdef\Lbar{L}
\mkdef\Mbar{M} \mkdef\Nbar{N} \mkdef\Obar{O} \mkdef\Pbar{P} \mkdef\Qbar{Q} \mkdef\Rbar{R}
\mkdef\Sbar{S} \mkdef\Tbar{T} \mkdef\Ubar{U} \mkdef\Vbar{V} \mkdef\Wbar{W} \mkdef\Xbar{X}
\mkdef\Ybar{Y} \mkdef\Zbar{Z}
\let\mkdef\undefined

% \Ahat ... \Zhat: capital letters under a hat, e.g. \Ahat -> \hat{A}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\hat{#2}}}
\mkdef\Ahat{A} \mkdef\Bhat{B} \mkdef\Chat{C} \mkdef\Dhat{D} \mkdef\Ehat{E} \mkdef\Fhat{F}
\mkdef\Ghat{G} \mkdef\Hhat{H} \mkdef\Ihat{I} \mkdef\Jhat{J} \mkdef\Khat{K} \mkdef\Lhat{L}
\mkdef\Mhat{M} \mkdef\Nhat{N} \mkdef\Ohat{O} \mkdef\Phat{P} \mkdef\Qhat{Q} \mkdef\Rhat{R}
\mkdef\Shat{S} \mkdef\That{T} \mkdef\Uhat{U} \mkdef\Vhat{V} \mkdef\What{W} \mkdef\Xhat{X}
\mkdef\Yhat{Y} \mkdef\Zhat{Z}
\let\mkdef\undefined

% \Ab ... \Zb: bold capital letters, e.g. \Ab -> \bm{{A}}.
% The letter S is named \Sbb.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\bm{{#2}}}}
\mkdef\Ab{A} \mkdef\Bb{B} \mkdef\Cb{C} \mkdef\Db{D} \mkdef\Eb{E} \mkdef\Fb{F}
\mkdef\Gb{G} \mkdef\Hb{H} \mkdef\Ib{I} \mkdef\Jb{J} \mkdef\Kb{K} \mkdef\Lb{L}
\mkdef\Mb{M} \mkdef\Nb{N} \mkdef\Ob{O} \mkdef\Pb{P} \mkdef\Qb{Q} \mkdef\Rb{R}
\mkdef\Sbb{S} \mkdef\Tb{T} \mkdef\Ub{U} \mkdef\Vb{V} \mkdef\Wb{W} \mkdef\Xb{X}
\mkdef\Yb{Y} \mkdef\Zb{Z}
\let\mkdef\undefined

% \bA ... \bZ: prefix-style names for the bold capitals, e.g. \bA -> \bm{{A}}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\bm{{#2}}}}
\mkdef\bA{A} \mkdef\bB{B} \mkdef\bC{C} \mkdef\bD{D} \mkdef\bE{E} \mkdef\bF{F}
\mkdef\bG{G} \mkdef\bH{H} \mkdef\bI{I} \mkdef\bJ{J} \mkdef\bK{K} \mkdef\bL{L}
\mkdef\bM{M} \mkdef\bN{N} \mkdef\bO{O} \mkdef\bP{P} \mkdef\bQ{Q} \mkdef\bR{R}
\mkdef\bS{S} \mkdef\bT{T} \mkdef\bU{U} \mkdef\bV{V} \mkdef\bW{W} \mkdef\bX{X}
\mkdef\bY{Y} \mkdef\bZ{Z}
\let\mkdef\undefined

% \Abtil ... \Zbtil: bold capitals under a tilde, e.g. \Abtil -> \tilde{\Ab}.
% S uses the bold macro \Sbb.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\tilde{#2}}}
\mkdef\Abtil\Ab \mkdef\Bbtil\Bb \mkdef\Cbtil\Cb \mkdef\Dbtil\Db \mkdef\Ebtil\Eb
\mkdef\Fbtil\Fb \mkdef\Gbtil\Gb \mkdef\Hbtil\Hb \mkdef\Ibtil\Ib \mkdef\Jbtil\Jb
\mkdef\Kbtil\Kb \mkdef\Lbtil\Lb \mkdef\Mbtil\Mb \mkdef\Nbtil\Nb \mkdef\Obtil\Ob
\mkdef\Pbtil\Pb \mkdef\Qbtil\Qb \mkdef\Rbtil\Rb \mkdef\Sbtil\Sbb \mkdef\Tbtil\Tb
\mkdef\Ubtil\Ub \mkdef\Vbtil\Vb \mkdef\Wbtil\Wb \mkdef\Xbtil\Xb \mkdef\Ybtil\Yb
\mkdef\Zbtil\Zb
\let\mkdef\undefined

% \bAt ... \bZt: shortest names for the bold tilde capitals,
% e.g. \bAt -> \tilde{\Ab}.  S uses the bold macro \Sbb.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\tilde{#2}}}
\mkdef\bAt\Ab \mkdef\bBt\Bb \mkdef\bCt\Cb \mkdef\bDt\Db \mkdef\bEt\Eb \mkdef\bFt\Fb
\mkdef\bGt\Gb \mkdef\bHt\Hb \mkdef\bIt\Ib \mkdef\bJt\Jb \mkdef\bKt\Kb \mkdef\bLt\Lb
\mkdef\bMt\Mb \mkdef\bNt\Nb \mkdef\bOt\Ob \mkdef\bPt\Pb \mkdef\bQt\Qb \mkdef\bRt\Rb
\mkdef\bSt\Sbb \mkdef\bTt\Tb \mkdef\bUt\Ub \mkdef\bVt\Vb \mkdef\bWt\Wb \mkdef\bXt\Xb
\mkdef\bYt\Yb \mkdef\bZt\Zb
\let\mkdef\undefined

% \bAtil ... \bZtil: prefix-style names for the bold tilde capitals,
% e.g. \bAtil -> \tilde{\Ab}.  S uses the bold macro \Sbb.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\tilde{#2}}}
\mkdef\bAtil\Ab \mkdef\bBtil\Bb \mkdef\bCtil\Cb \mkdef\bDtil\Db \mkdef\bEtil\Eb
\mkdef\bFtil\Fb \mkdef\bGtil\Gb \mkdef\bHtil\Hb \mkdef\bItil\Ib \mkdef\bJtil\Jb
\mkdef\bKtil\Kb \mkdef\bLtil\Lb \mkdef\bMtil\Mb \mkdef\bNtil\Nb \mkdef\bOtil\Ob
\mkdef\bPtil\Pb \mkdef\bQtil\Qb \mkdef\bRtil\Rb \mkdef\bStil\Sbb \mkdef\bTtil\Tb
\mkdef\bUtil\Ub \mkdef\bVtil\Vb \mkdef\bWtil\Wb \mkdef\bXtil\Xb \mkdef\bYtil\Yb
\mkdef\bZtil\Zb
\let\mkdef\undefined

% \bAtilde ... \bZtilde: bold capitals under a wide tilde,
% e.g. \bAtilde -> \widetilde{\Ab}.  S uses the bold macro \Sbb.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\widetilde{#2}}}
\mkdef\bAtilde\Ab \mkdef\bBtilde\Bb \mkdef\bCtilde\Cb \mkdef\bDtilde\Db \mkdef\bEtilde\Eb
\mkdef\bFtilde\Fb \mkdef\bGtilde\Gb \mkdef\bHtilde\Hb \mkdef\bItilde\Ib \mkdef\bJtilde\Jb
\mkdef\bKtilde\Kb \mkdef\bLtilde\Lb \mkdef\bMtilde\Mb \mkdef\bNtilde\Nb \mkdef\bOtilde\Ob
\mkdef\bPtilde\Pb \mkdef\bQtilde\Qb \mkdef\bRtilde\Rb \mkdef\bStilde\Sbb \mkdef\bTtilde\Tb
\mkdef\bUtilde\Ub \mkdef\bVtilde\Vb \mkdef\bWtilde\Wb \mkdef\bXtilde\Xb \mkdef\bYtilde\Yb
\mkdef\bZtilde\Zb
\let\mkdef\undefined

% \Abbar ... \Zbbar: bold capitals under a bar, e.g. \Abbar -> \bar{\Ab}.
% S uses the bold macro \Sbb.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\bar{#2}}}
\mkdef\Abbar\Ab \mkdef\Bbbar\Bb \mkdef\Cbbar\Cb \mkdef\Dbbar\Db \mkdef\Ebbar\Eb
\mkdef\Fbbar\Fb \mkdef\Gbbar\Gb \mkdef\Hbbar\Hb \mkdef\Ibbar\Ib \mkdef\Jbbar\Jb
\mkdef\Kbbar\Kb \mkdef\Lbbar\Lb \mkdef\Mbbar\Mb \mkdef\Nbbar\Nb \mkdef\Obbar\Ob
\mkdef\Pbbar\Pb \mkdef\Qbbar\Qb \mkdef\Rbbar\Rb \mkdef\Sbbar\Sbb \mkdef\Tbbar\Tb
\mkdef\Ubbar\Ub \mkdef\Vbbar\Vb \mkdef\Wbbar\Wb \mkdef\Xbbar\Xb \mkdef\Ybbar\Yb
\mkdef\Zbbar\Zb
\let\mkdef\undefined

% \bAbar ... \bZbar: prefix-style names for the bold bar capitals,
% e.g. \bAbar -> \bar{\Ab}.  S uses the bold macro \Sbb.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\bar{#2}}}
\mkdef\bAbar\Ab \mkdef\bBbar\Bb \mkdef\bCbar\Cb \mkdef\bDbar\Db \mkdef\bEbar\Eb
\mkdef\bFbar\Fb \mkdef\bGbar\Gb \mkdef\bHbar\Hb \mkdef\bIbar\Ib \mkdef\bJbar\Jb
\mkdef\bKbar\Kb \mkdef\bLbar\Lb \mkdef\bMbar\Mb \mkdef\bNbar\Nb \mkdef\bObar\Ob
\mkdef\bPbar\Pb \mkdef\bQbar\Qb \mkdef\bRbar\Rb \mkdef\bSbar\Sbb \mkdef\bTbar\Tb
\mkdef\bUbar\Ub \mkdef\bVbar\Vb \mkdef\bWbar\Wb \mkdef\bXbar\Xb \mkdef\bYbar\Yb
\mkdef\bZbar\Zb
\let\mkdef\undefined

% \Abhat ... \Zbhat: bold capitals under a hat, e.g. \Abhat -> \hat{\Ab}.
% S uses the bold macro \Sbb.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\hat{#2}}}
\mkdef\Abhat\Ab \mkdef\Bbhat\Bb \mkdef\Cbhat\Cb \mkdef\Dbhat\Db \mkdef\Ebhat\Eb
\mkdef\Fbhat\Fb \mkdef\Gbhat\Gb \mkdef\Hbhat\Hb \mkdef\Ibhat\Ib \mkdef\Jbhat\Jb
\mkdef\Kbhat\Kb \mkdef\Lbhat\Lb \mkdef\Mbhat\Mb \mkdef\Nbhat\Nb \mkdef\Obhat\Ob
\mkdef\Pbhat\Pb \mkdef\Qbhat\Qb \mkdef\Rbhat\Rb \mkdef\Sbhat\Sbb \mkdef\Tbhat\Tb
\mkdef\Ubhat\Ub \mkdef\Vbhat\Vb \mkdef\Wbhat\Wb \mkdef\Xbhat\Xb \mkdef\Ybhat\Yb
\mkdef\Zbhat\Zb
\let\mkdef\undefined

% \bAhat ... \bZhat: prefix-style names for the bold hat capitals,
% e.g. \bAhat -> \hat{\Ab}.  S uses the bold macro \Sbb.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\hat{#2}}}
\mkdef\bAhat\Ab \mkdef\bBhat\Bb \mkdef\bChat\Cb \mkdef\bDhat\Db \mkdef\bEhat\Eb
\mkdef\bFhat\Fb \mkdef\bGhat\Gb \mkdef\bHhat\Hb \mkdef\bIhat\Ib \mkdef\bJhat\Jb
\mkdef\bKhat\Kb \mkdef\bLhat\Lb \mkdef\bMhat\Mb \mkdef\bNhat\Nb \mkdef\bOhat\Ob
\mkdef\bPhat\Pb \mkdef\bQhat\Qb \mkdef\bRhat\Rb \mkdef\bShat\Sbb \mkdef\bThat\Tb
\mkdef\bUhat\Ub \mkdef\bVhat\Vb \mkdef\bWhat\Wb \mkdef\bXhat\Xb \mkdef\bYhat\Yb
\mkdef\bZhat\Zb
\let\mkdef\undefined

% mathcal version
% \Acal ... \Zcal: calligraphic capitals, e.g. \Acal -> \mathcal{A}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\mathcal{#2}}}
\mkdef\Acal{A} \mkdef\Bcal{B} \mkdef\Ccal{C} \mkdef\Dcal{D} \mkdef\Ecal{E} \mkdef\Fcal{F}
\mkdef\Gcal{G} \mkdef\Hcal{H} \mkdef\Ical{I} \mkdef\Jcal{J} \mkdef\Kcal{K} \mkdef\Lcal{L}
\mkdef\Mcal{M} \mkdef\Ncal{N} \mkdef\Ocal{O} \mkdef\Pcal{P} \mkdef\Qcal{Q} \mkdef\Rcal{R}
\mkdef\Ucal{U} \mkdef\Vcal{V} \mkdef\Wcal{W} \mkdef\Xcal{X} \mkdef\Ycal{Y} \mkdef\Zcal{Z}
\let\mkdef\undefined
% \Scal and \Tcal carry an extra pair of braces around the \mathcal call.
\newcommand{\Scal}{{\mathcal{S}}}
\newcommand{\Tcal}{{\mathcal{T}}}

% \cA ... \cZ: prefix-style names for the calligraphic capitals,
% e.g. \cA -> \mathcal{A}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\mathcal{#2}}}
\mkdef\cA{A} \mkdef\cB{B} \mkdef\cC{C} \mkdef\cD{D} \mkdef\cE{E} \mkdef\cF{F}
\mkdef\cG{G} \mkdef\cH{H} \mkdef\cI{I} \mkdef\cJ{J} \mkdef\cK{K} \mkdef\cL{L}
\mkdef\cM{M} \mkdef\cN{N} \mkdef\cO{O} \mkdef\cP{P} \mkdef\cQ{Q} \mkdef\cR{R}
\mkdef\cS{S} \mkdef\cT{T} \mkdef\cU{U} \mkdef\cV{V} \mkdef\cW{W} \mkdef\cX{X}
\mkdef\cY{Y} \mkdef\cZ{Z}
\let\mkdef\undefined

% \Acaltil ... \Zcaltil: calligraphic capitals under a tilde,
% e.g. \Acaltil -> \tilde{\cA}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\tilde{#2}}}
\mkdef\Acaltil\cA \mkdef\Bcaltil\cB \mkdef\Ccaltil\cC \mkdef\Dcaltil\cD \mkdef\Ecaltil\cE
\mkdef\Fcaltil\cF \mkdef\Gcaltil\cG \mkdef\Hcaltil\cH \mkdef\Icaltil\cI \mkdef\Jcaltil\cJ
\mkdef\Kcaltil\cK \mkdef\Lcaltil\cL \mkdef\Mcaltil\cM \mkdef\Ncaltil\cN \mkdef\Ocaltil\cO
\mkdef\Pcaltil\cP \mkdef\Qcaltil\cQ \mkdef\Rcaltil\cR \mkdef\Scaltil\cS \mkdef\Tcaltil\cT
\mkdef\Ucaltil\cU \mkdef\Vcaltil\cV \mkdef\Wcaltil\cW \mkdef\Xcaltil\cX \mkdef\Ycaltil\cY
\mkdef\Zcaltil\cZ
\let\mkdef\undefined

% \cAtil ... \cZtil: prefix-style names for the calligraphic tilde capitals,
% e.g. \cAtil -> \tilde{\cA}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\tilde{#2}}}
\mkdef\cAtil\cA \mkdef\cBtil\cB \mkdef\cCtil\cC \mkdef\cDtil\cD \mkdef\cEtil\cE
\mkdef\cFtil\cF \mkdef\cGtil\cG \mkdef\cHtil\cH \mkdef\cItil\cI \mkdef\cJtil\cJ
\mkdef\cKtil\cK \mkdef\cLtil\cL \mkdef\cMtil\cM \mkdef\cNtil\cN \mkdef\cOtil\cO
\mkdef\cPtil\cP \mkdef\cQtil\cQ \mkdef\cRtil\cR \mkdef\cStil\cS \mkdef\cTtil\cT
\mkdef\cUtil\cU \mkdef\cVtil\cV \mkdef\cWtil\cW \mkdef\cXtil\cX \mkdef\cYtil\cY
\mkdef\cZtil\cZ
\let\mkdef\undefined

% \cAt ... \cZt: shortest names for the calligraphic tilde capitals,
% e.g. \cAt -> \tilde{\cA}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\tilde{#2}}}
\mkdef\cAt\cA \mkdef\cBt\cB \mkdef\cCt\cC \mkdef\cDt\cD \mkdef\cEt\cE \mkdef\cFt\cF
\mkdef\cGt\cG \mkdef\cHt\cH \mkdef\cIt\cI \mkdef\cJt\cJ \mkdef\cKt\cK \mkdef\cLt\cL
\mkdef\cMt\cM \mkdef\cNt\cN \mkdef\cOt\cO \mkdef\cPt\cP \mkdef\cQt\cQ \mkdef\cRt\cR
\mkdef\cSt\cS \mkdef\cTt\cT \mkdef\cUt\cU \mkdef\cVt\cV \mkdef\cWt\cW \mkdef\cXt\cX
\mkdef\cYt\cY \mkdef\cZt\cZ
\let\mkdef\undefined

% \cAtilde ... \cZtilde: calligraphic capitals under a wide tilde,
% e.g. \cAtilde -> \widetilde{\cA}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\widetilde{#2}}}
\mkdef\cAtilde\cA \mkdef\cBtilde\cB \mkdef\cCtilde\cC \mkdef\cDtilde\cD \mkdef\cEtilde\cE
\mkdef\cFtilde\cF \mkdef\cGtilde\cG \mkdef\cHtilde\cH \mkdef\cItilde\cI \mkdef\cJtilde\cJ
\mkdef\cKtilde\cK \mkdef\cLtilde\cL \mkdef\cMtilde\cM \mkdef\cNtilde\cN \mkdef\cOtilde\cO
\mkdef\cPtilde\cP \mkdef\cQtilde\cQ \mkdef\cRtilde\cR \mkdef\cStilde\cS \mkdef\cTtilde\cT
\mkdef\cUtilde\cU \mkdef\cVtilde\cV \mkdef\cWtilde\cW \mkdef\cXtilde\cX \mkdef\cYtilde\cY
\mkdef\cZtilde\cZ
\let\mkdef\undefined

% \Acalhat ... \Zcalhat: calligraphic capitals under a hat,
% e.g. \Acalhat -> \hat{\cA}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\hat{#2}}}
\mkdef\Acalhat\cA \mkdef\Bcalhat\cB \mkdef\Ccalhat\cC \mkdef\Dcalhat\cD \mkdef\Ecalhat\cE
\mkdef\Fcalhat\cF \mkdef\Gcalhat\cG \mkdef\Hcalhat\cH \mkdef\Icalhat\cI \mkdef\Jcalhat\cJ
\mkdef\Kcalhat\cK \mkdef\Lcalhat\cL \mkdef\Mcalhat\cM \mkdef\Ncalhat\cN \mkdef\Ocalhat\cO
\mkdef\Pcalhat\cP \mkdef\Qcalhat\cQ \mkdef\Rcalhat\cR \mkdef\Scalhat\cS \mkdef\Tcalhat\cT
\mkdef\Ucalhat\cU \mkdef\Vcalhat\cV \mkdef\Wcalhat\cW \mkdef\Xcalhat\cX \mkdef\Ycalhat\cY
\mkdef\Zcalhat\cZ
\let\mkdef\undefined

% \cAhat ... \cZhat: prefix-style names for the calligraphic hat capitals,
% e.g. \cAhat -> \hat{\cA}.
% \mkdef is a temporary helper, removed again at the end of the table.
\newcommand*{\mkdef}[2]{\newcommand{#1}{\hat{#2}}}
\mkdef\cAhat\cA \mkdef\cBhat\cB \mkdef\cChat\cC \mkdef\cDhat\cD \mkdef\cEhat\cE
\mkdef\cFhat\cF \mkdef\cGhat\cG \mkdef\cHhat\cH \mkdef\cIhat\cI \mkdef\cJhat\cJ
\mkdef\cKhat\cK \mkdef\cLhat\cL \mkdef\cMhat\cM \mkdef\cNhat\cN \mkdef\cOhat\cO
\mkdef\cPhat\cP \mkdef\cQhat\cQ \mkdef\cRhat\cR \mkdef\cShat\cS \mkdef\cThat\cT
\mkdef\cUhat\cU \mkdef\cVhat\cV \mkdef\cWhat\cW \mkdef\cXhat\cX \mkdef\cYhat\cY
\mkdef\cZhat\cZ
\let\mkdef\undefined

% Barred shortcuts, long-name form: \Xcalbar expands to \bar{\cX}.
% Same expansions as the \cXbar family; \cA ... \cZ are defined elsewhere.
\newcommand*{\Acalbar}{\bar{\cA}}
\newcommand*{\Bcalbar}{\bar{\cB}}
\newcommand*{\Ccalbar}{\bar{\cC}}
\newcommand*{\Dcalbar}{\bar{\cD}}
\newcommand*{\Ecalbar}{\bar{\cE}}
\newcommand*{\Fcalbar}{\bar{\cF}}
\newcommand*{\Gcalbar}{\bar{\cG}}
\newcommand*{\Hcalbar}{\bar{\cH}}
\newcommand*{\Icalbar}{\bar{\cI}}
\newcommand*{\Jcalbar}{\bar{\cJ}}
\newcommand*{\Kcalbar}{\bar{\cK}}
\newcommand*{\Lcalbar}{\bar{\cL}}
\newcommand*{\Mcalbar}{\bar{\cM}}
\newcommand*{\Ncalbar}{\bar{\cN}}
\newcommand*{\Ocalbar}{\bar{\cO}}
\newcommand*{\Pcalbar}{\bar{\cP}}
\newcommand*{\Qcalbar}{\bar{\cQ}}
\newcommand*{\Rcalbar}{\bar{\cR}}
\newcommand*{\Scalbar}{\bar{\cS}}
\newcommand*{\Tcalbar}{\bar{\cT}}
\newcommand*{\Ucalbar}{\bar{\cU}}
\newcommand*{\Vcalbar}{\bar{\cV}}
\newcommand*{\Wcalbar}{\bar{\cW}}
\newcommand*{\Xcalbar}{\bar{\cX}}
\newcommand*{\Ycalbar}{\bar{\cY}}
\newcommand*{\Zcalbar}{\bar{\cZ}}

% Barred shortcuts, short-name form: \cXbar expands to \bar{\cX}.
\newcommand*{\cAbar}{\bar{\cA}}
\newcommand*{\cBbar}{\bar{\cB}}
\newcommand*{\cCbar}{\bar{\cC}}
\newcommand*{\cDbar}{\bar{\cD}}
\newcommand*{\cEbar}{\bar{\cE}}
\newcommand*{\cFbar}{\bar{\cF}}
\newcommand*{\cGbar}{\bar{\cG}}
\newcommand*{\cHbar}{\bar{\cH}}
\newcommand*{\cIbar}{\bar{\cI}}
\newcommand*{\cJbar}{\bar{\cJ}}
\newcommand*{\cKbar}{\bar{\cK}}
\newcommand*{\cLbar}{\bar{\cL}}
\newcommand*{\cMbar}{\bar{\cM}}
\newcommand*{\cNbar}{\bar{\cN}}
\newcommand*{\cObar}{\bar{\cO}}
\newcommand*{\cPbar}{\bar{\cP}}
\newcommand*{\cQbar}{\bar{\cQ}}
\newcommand*{\cRbar}{\bar{\cR}}
\newcommand*{\cSbar}{\bar{\cS}}
\newcommand*{\cTbar}{\bar{\cT}}
\newcommand*{\cUbar}{\bar{\cU}}
\newcommand*{\cVbar}{\bar{\cV}}
\newcommand*{\cWbar}{\bar{\cW}}
\newcommand*{\cXbar}{\bar{\cX}}
\newcommand*{\cYbar}{\bar{\cY}}
\newcommand*{\cZbar}{\bar{\cZ}}

% Script capitals: \sX expands to \mathscr{X}.
% \mathscr must be provided by a package loaded elsewhere in the file.
\newcommand*{\sA}{\mathscr{A}}
\newcommand*{\sB}{\mathscr{B}}
\newcommand*{\sC}{\mathscr{C}}
\newcommand*{\sD}{\mathscr{D}}
\newcommand*{\sE}{\mathscr{E}}
\newcommand*{\sF}{\mathscr{F}}
\newcommand*{\sG}{\mathscr{G}}
\newcommand*{\sH}{\mathscr{H}}
\newcommand*{\sI}{\mathscr{I}}
\newcommand*{\sJ}{\mathscr{J}}
\newcommand*{\sK}{\mathscr{K}}
\newcommand*{\sL}{\mathscr{L}}
\newcommand*{\sM}{\mathscr{M}}
\newcommand*{\sN}{\mathscr{N}}
\newcommand*{\sO}{\mathscr{O}}
\newcommand*{\sP}{\mathscr{P}}
\newcommand*{\sQ}{\mathscr{Q}}
\newcommand*{\sR}{\mathscr{R}}
\newcommand*{\sS}{\mathscr{S}}
\newcommand*{\sT}{\mathscr{T}}
\newcommand*{\sU}{\mathscr{U}}
\newcommand*{\sV}{\mathscr{V}}
\newcommand*{\sW}{\mathscr{W}}
\newcommand*{\sX}{\mathscr{X}}
\newcommand*{\sY}{\mathscr{Y}}
\newcommand*{\sZ}{\mathscr{Z}}

% Barred script capitals: \sXbar expands to \bar{\sX}.
\newcommand*{\sAbar}{\bar{\sA}}
\newcommand*{\sBbar}{\bar{\sB}}
\newcommand*{\sCbar}{\bar{\sC}}
\newcommand*{\sDbar}{\bar{\sD}}
\newcommand*{\sEbar}{\bar{\sE}}
\newcommand*{\sFbar}{\bar{\sF}}
\newcommand*{\sGbar}{\bar{\sG}}
\newcommand*{\sHbar}{\bar{\sH}}
\newcommand*{\sIbar}{\bar{\sI}}
\newcommand*{\sJbar}{\bar{\sJ}}
\newcommand*{\sKbar}{\bar{\sK}}
\newcommand*{\sLbar}{\bar{\sL}}
\newcommand*{\sMbar}{\bar{\sM}}
\newcommand*{\sNbar}{\bar{\sN}}
\newcommand*{\sObar}{\bar{\sO}}
\newcommand*{\sPbar}{\bar{\sP}}
\newcommand*{\sQbar}{\bar{\sQ}}
\newcommand*{\sRbar}{\bar{\sR}}
\newcommand*{\sSbar}{\bar{\sS}}
\newcommand*{\sTbar}{\bar{\sT}}
\newcommand*{\sUbar}{\bar{\sU}}
\newcommand*{\sVbar}{\bar{\sV}}
\newcommand*{\sWbar}{\bar{\sW}}
\newcommand*{\sXbar}{\bar{\sX}}
\newcommand*{\sYbar}{\bar{\sY}}
\newcommand*{\sZbar}{\bar{\sZ}}

% Hatted script capitals: \sXhat expands to \hat{\sX}.
\newcommand*{\sAhat}{\hat{\sA}}
\newcommand*{\sBhat}{\hat{\sB}}
\newcommand*{\sChat}{\hat{\sC}}
\newcommand*{\sDhat}{\hat{\sD}}
\newcommand*{\sEhat}{\hat{\sE}}
\newcommand*{\sFhat}{\hat{\sF}}
\newcommand*{\sGhat}{\hat{\sG}}
\newcommand*{\sHhat}{\hat{\sH}}
\newcommand*{\sIhat}{\hat{\sI}}
\newcommand*{\sJhat}{\hat{\sJ}}
\newcommand*{\sKhat}{\hat{\sK}}
\newcommand*{\sLhat}{\hat{\sL}}
\newcommand*{\sMhat}{\hat{\sM}}
\newcommand*{\sNhat}{\hat{\sN}}
\newcommand*{\sOhat}{\hat{\sO}}
\newcommand*{\sPhat}{\hat{\sP}}
\newcommand*{\sQhat}{\hat{\sQ}}
\newcommand*{\sRhat}{\hat{\sR}}
\newcommand*{\sShat}{\hat{\sS}}
\newcommand*{\sThat}{\hat{\sT}}
\newcommand*{\sUhat}{\hat{\sU}}
\newcommand*{\sVhat}{\hat{\sV}}
\newcommand*{\sWhat}{\hat{\sW}}
\newcommand*{\sXhat}{\hat{\sX}}
\newcommand*{\sYhat}{\hat{\sY}}
\newcommand*{\sZhat}{\hat{\sZ}}

% Wide-tilde script capitals: \sXtilde expands to \widetilde{\sX}.
% (The \sXt family uses the narrow \tilde accent instead.)
\newcommand*{\sAtilde}{\widetilde{\sA}}
\newcommand*{\sBtilde}{\widetilde{\sB}}
\newcommand*{\sCtilde}{\widetilde{\sC}}
\newcommand*{\sDtilde}{\widetilde{\sD}}
\newcommand*{\sEtilde}{\widetilde{\sE}}
\newcommand*{\sFtilde}{\widetilde{\sF}}
\newcommand*{\sGtilde}{\widetilde{\sG}}
\newcommand*{\sHtilde}{\widetilde{\sH}}
\newcommand*{\sItilde}{\widetilde{\sI}}
\newcommand*{\sJtilde}{\widetilde{\sJ}}
\newcommand*{\sKtilde}{\widetilde{\sK}}
\newcommand*{\sLtilde}{\widetilde{\sL}}
\newcommand*{\sMtilde}{\widetilde{\sM}}
\newcommand*{\sNtilde}{\widetilde{\sN}}
\newcommand*{\sOtilde}{\widetilde{\sO}}
\newcommand*{\sPtilde}{\widetilde{\sP}}
\newcommand*{\sQtilde}{\widetilde{\sQ}}
\newcommand*{\sRtilde}{\widetilde{\sR}}
\newcommand*{\sStilde}{\widetilde{\sS}}
\newcommand*{\sTtilde}{\widetilde{\sT}}
\newcommand*{\sUtilde}{\widetilde{\sU}}
\newcommand*{\sVtilde}{\widetilde{\sV}}
\newcommand*{\sWtilde}{\widetilde{\sW}}
\newcommand*{\sXtilde}{\widetilde{\sX}}
\newcommand*{\sYtilde}{\widetilde{\sY}}
\newcommand*{\sZtilde}{\widetilde{\sZ}}

% Narrow-tilde script capitals: \sXt expands to \tilde{\sX}.
\newcommand*{\sAt}{\tilde{\sA}}
\newcommand*{\sBt}{\tilde{\sB}}
\newcommand*{\sCt}{\tilde{\sC}}
\newcommand*{\sDt}{\tilde{\sD}}
\newcommand*{\sEt}{\tilde{\sE}}
\newcommand*{\sFt}{\tilde{\sF}}
\newcommand*{\sGt}{\tilde{\sG}}
\newcommand*{\sHt}{\tilde{\sH}}
\newcommand*{\sIt}{\tilde{\sI}}
\newcommand*{\sJt}{\tilde{\sJ}}
\newcommand*{\sKt}{\tilde{\sK}}
\newcommand*{\sLt}{\tilde{\sL}}
\newcommand*{\sMt}{\tilde{\sM}}
\newcommand*{\sNt}{\tilde{\sN}}
\newcommand*{\sOt}{\tilde{\sO}}
\newcommand*{\sPt}{\tilde{\sP}}
\newcommand*{\sQt}{\tilde{\sQ}}
\newcommand*{\sRt}{\tilde{\sR}}
\newcommand*{\sSt}{\tilde{\sS}}
\newcommand*{\sTt}{\tilde{\sT}}
\newcommand*{\sUt}{\tilde{\sU}}
\newcommand*{\sVt}{\tilde{\sV}}
\newcommand*{\sWt}{\tilde{\sW}}
\newcommand*{\sXt}{\tilde{\sX}}
\newcommand*{\sYt}{\tilde{\sY}}
\newcommand*{\sZt}{\tilde{\sZ}}

% Double-stroke capitals: \dX expands to \mathds{X}.
\newcommand*{\dA}{\mathds{A}}
\newcommand*{\dB}{\mathds{B}}
\newcommand*{\dC}{\mathds{C}}
\newcommand*{\dD}{\mathds{D}}
\newcommand*{\dE}{\mathds{E}}
\newcommand*{\dF}{\mathds{F}}
\newcommand*{\dG}{\mathds{G}}
\newcommand*{\dH}{\mathds{H}}
\newcommand*{\dI}{\mathds{I}}
\newcommand*{\dJ}{\mathds{J}}
\newcommand*{\dK}{\mathds{K}}
\newcommand*{\dL}{\mathds{L}}
\newcommand*{\dM}{\mathds{M}}
\newcommand*{\dN}{\mathds{N}}
\newcommand*{\dO}{\mathds{O}}
\newcommand*{\dP}{\mathds{P}}
\newcommand*{\dQ}{\mathds{Q}}
\newcommand*{\dR}{\mathds{R}}
\newcommand*{\dS}{\mathds{S}}
\newcommand*{\dT}{\mathds{T}}
\newcommand*{\dU}{\mathds{U}}
\newcommand*{\dV}{\mathds{V}}
\newcommand*{\dW}{\mathds{W}}
\newcommand*{\dX}{\mathds{X}}
\newcommand*{\dY}{\mathds{Y}}
\newcommand*{\dZ}{\mathds{Z}}

% Barred double-stroke capitals: \dXbar expands to \bar{\dX}.
\newcommand*{\dAbar}{\bar{\dA}}
\newcommand*{\dBbar}{\bar{\dB}}
\newcommand*{\dCbar}{\bar{\dC}}
\newcommand*{\dDbar}{\bar{\dD}}
\newcommand*{\dEbar}{\bar{\dE}}
\newcommand*{\dFbar}{\bar{\dF}}
\newcommand*{\dGbar}{\bar{\dG}}
\newcommand*{\dHbar}{\bar{\dH}}
\newcommand*{\dIbar}{\bar{\dI}}
\newcommand*{\dJbar}{\bar{\dJ}}
\newcommand*{\dKbar}{\bar{\dK}}
\newcommand*{\dLbar}{\bar{\dL}}
\newcommand*{\dMbar}{\bar{\dM}}
\newcommand*{\dNbar}{\bar{\dN}}
\newcommand*{\dObar}{\bar{\dO}}
\newcommand*{\dPbar}{\bar{\dP}}
\newcommand*{\dQbar}{\bar{\dQ}}
\newcommand*{\dRbar}{\bar{\dR}}
\newcommand*{\dSbar}{\bar{\dS}}
\newcommand*{\dTbar}{\bar{\dT}}
\newcommand*{\dUbar}{\bar{\dU}}
\newcommand*{\dVbar}{\bar{\dV}}
\newcommand*{\dWbar}{\bar{\dW}}
\newcommand*{\dXbar}{\bar{\dX}}
\newcommand*{\dYbar}{\bar{\dY}}
\newcommand*{\dZbar}{\bar{\dZ}}

% Hatted double-stroke capitals: \dXhat expands to \hat{\dX}.
\newcommand*{\dAhat}{\hat{\dA}}
\newcommand*{\dBhat}{\hat{\dB}}
\newcommand*{\dChat}{\hat{\dC}}
\newcommand*{\dDhat}{\hat{\dD}}
\newcommand*{\dEhat}{\hat{\dE}}
\newcommand*{\dFhat}{\hat{\dF}}
\newcommand*{\dGhat}{\hat{\dG}}
\newcommand*{\dHhat}{\hat{\dH}}
\newcommand*{\dIhat}{\hat{\dI}}
\newcommand*{\dJhat}{\hat{\dJ}}
\newcommand*{\dKhat}{\hat{\dK}}
\newcommand*{\dLhat}{\hat{\dL}}
\newcommand*{\dMhat}{\hat{\dM}}
\newcommand*{\dNhat}{\hat{\dN}}
\newcommand*{\dOhat}{\hat{\dO}}
\newcommand*{\dPhat}{\hat{\dP}}
\newcommand*{\dQhat}{\hat{\dQ}}
\newcommand*{\dRhat}{\hat{\dR}}
\newcommand*{\dShat}{\hat{\dS}}
\newcommand*{\dThat}{\hat{\dT}}
\newcommand*{\dUhat}{\hat{\dU}}
\newcommand*{\dVhat}{\hat{\dV}}
\newcommand*{\dWhat}{\hat{\dW}}
\newcommand*{\dXhat}{\hat{\dX}}
\newcommand*{\dYhat}{\hat{\dY}}
\newcommand*{\dZhat}{\hat{\dZ}}

% Wide-tilde double-stroke capitals: \dXtilde expands to \widetilde{\dX}.
% (The \dXt family uses the narrow \tilde accent instead.)
\newcommand*{\dAtilde}{\widetilde{\dA}}
\newcommand*{\dBtilde}{\widetilde{\dB}}
\newcommand*{\dCtilde}{\widetilde{\dC}}
\newcommand*{\dDtilde}{\widetilde{\dD}}
\newcommand*{\dEtilde}{\widetilde{\dE}}
\newcommand*{\dFtilde}{\widetilde{\dF}}
\newcommand*{\dGtilde}{\widetilde{\dG}}
\newcommand*{\dHtilde}{\widetilde{\dH}}
\newcommand*{\dItilde}{\widetilde{\dI}}
\newcommand*{\dJtilde}{\widetilde{\dJ}}
\newcommand*{\dKtilde}{\widetilde{\dK}}
\newcommand*{\dLtilde}{\widetilde{\dL}}
\newcommand*{\dMtilde}{\widetilde{\dM}}
\newcommand*{\dNtilde}{\widetilde{\dN}}
\newcommand*{\dOtilde}{\widetilde{\dO}}
\newcommand*{\dPtilde}{\widetilde{\dP}}
\newcommand*{\dQtilde}{\widetilde{\dQ}}
\newcommand*{\dRtilde}{\widetilde{\dR}}
\newcommand*{\dStilde}{\widetilde{\dS}}
\newcommand*{\dTtilde}{\widetilde{\dT}}
\newcommand*{\dUtilde}{\widetilde{\dU}}
\newcommand*{\dVtilde}{\widetilde{\dV}}
\newcommand*{\dWtilde}{\widetilde{\dW}}
\newcommand*{\dXtilde}{\widetilde{\dX}}
\newcommand*{\dYtilde}{\widetilde{\dY}}
\newcommand*{\dZtilde}{\widetilde{\dZ}}

% Narrow-tilde double-stroke capitals: \dXt expands to \tilde{\dX}.
\newcommand*{\dAt}{\tilde{\dA}}
\newcommand*{\dBt}{\tilde{\dB}}
\newcommand*{\dCt}{\tilde{\dC}}
\newcommand*{\dDt}{\tilde{\dD}}
\newcommand*{\dEt}{\tilde{\dE}}
\newcommand*{\dFt}{\tilde{\dF}}
\newcommand*{\dGt}{\tilde{\dG}}
\newcommand*{\dHt}{\tilde{\dH}}
\newcommand*{\dIt}{\tilde{\dI}}
\newcommand*{\dJt}{\tilde{\dJ}}
\newcommand*{\dKt}{\tilde{\dK}}
\newcommand*{\dLt}{\tilde{\dL}}
\newcommand*{\dMt}{\tilde{\dM}}
\newcommand*{\dNt}{\tilde{\dN}}
\newcommand*{\dOt}{\tilde{\dO}}
\newcommand*{\dPt}{\tilde{\dP}}
\newcommand*{\dQt}{\tilde{\dQ}}
\newcommand*{\dRt}{\tilde{\dR}}
\newcommand*{\dSt}{\tilde{\dS}}
\newcommand*{\dTt}{\tilde{\dT}}
\newcommand*{\dUt}{\tilde{\dU}}
\newcommand*{\dVt}{\tilde{\dV}}
\newcommand*{\dWt}{\tilde{\dW}}
\newcommand*{\dXt}{\tilde{\dX}}
\newcommand*{\dYt}{\tilde{\dY}}
\newcommand*{\dZt}{\tilde{\dZ}}

% Replace the standard arrow accent \vec by a bold rendering of its argument:
% the argument is passed through \boldsymbol and the result is set inside
% \mathbf. Every \Xvec, \vX and bold-Greek shortcut in this file that is
% written in terms of \vec goes through this definition.
% NOTE(review): the nesting presumably exists so that Latin letters come out
% upright bold while Greek letters still get emboldened -- confirm visually.
\renewcommand{\vec}[1]{\mathbf{\boldsymbol{#1}}}

% Lower-case vectors, long-name form: \xvec expands to \vec{x}.
\newcommand*{\avec}{\vec{a}}
\newcommand*{\bvec}{\vec{b}}
\newcommand*{\cvec}{\vec{c}}
\newcommand*{\dvec}{\vec{d}}
\newcommand*{\evec}{\vec{e}}
\newcommand*{\fvec}{\vec{f}}
\newcommand*{\gvec}{\vec{g}}
\newcommand*{\hvec}{\vec{h}}
\newcommand*{\ivec}{\vec{i}}
\newcommand*{\jvec}{\vec{j}}
\newcommand*{\kvec}{\vec{k}}
\newcommand*{\lvec}{\vec{l}}
\newcommand*{\mvec}{\vec{m}}
\newcommand*{\nvec}{\vec{n}}
\newcommand*{\ovec}{\vec{o}}
\newcommand*{\pvec}{\vec{p}}
\newcommand*{\qvec}{\vec{q}}
\newcommand*{\rvec}{\vec{r}}
\newcommand*{\svec}{\vec{s}}
\newcommand*{\tvec}{\vec{t}}
\newcommand*{\uvec}{\vec{u}}
\newcommand*{\vvec}{\vec{v}}
\newcommand*{\wvec}{\vec{w}}
\newcommand*{\xvec}{\vec{x}}
\newcommand*{\yvec}{\vec{y}}
\newcommand*{\zvec}{\vec{z}}

% Lower-case vectors, short-name form: \vx expands to \vec{x}.
% The entry for b is named \vecb; this group defines no \vb.
\newcommand*{\va}{\vec{a}}
\newcommand*{\vecb}{\vec{b}}
\newcommand*{\vc}{\vec{c}}
\newcommand*{\vd}{\vec{d}}
\newcommand*{\ve}{\vec{e}}
\newcommand*{\vf}{\vec{f}}
\newcommand*{\vg}{\vec{g}}
\newcommand*{\vh}{\vec{h}}
\newcommand*{\vi}{\vec{i}}
\newcommand*{\vj}{\vec{j}}
\newcommand*{\vk}{\vec{k}}
\newcommand*{\vl}{\vec{l}}
\newcommand*{\vm}{\vec{m}}
\newcommand*{\vn}{\vec{n}}
\newcommand*{\vo}{\vec{o}}
\newcommand*{\vp}{\vec{p}}
\newcommand*{\vq}{\vec{q}}
\newcommand*{\vr}{\vec{r}}
\newcommand*{\vs}{\vec{s}}
\newcommand*{\vt}{\vec{t}}
\newcommand*{\vu}{\vec{u}}
\newcommand*{\vv}{\vec{v}}
\newcommand*{\vw}{\vec{w}}
\newcommand*{\vx}{\vec{x}}
\newcommand*{\vy}{\vec{y}}
\newcommand*{\vz}{\vec{z}}

% Upper-case vectors: \Xvec expands to \vec{X}.
\newcommand*{\Avec}{\vec{A}}
\newcommand*{\Bvec}{\vec{B}}
\newcommand*{\Cvec}{\vec{C}}
\newcommand*{\Dvec}{\vec{D}}
\newcommand*{\Evec}{\vec{E}}
\newcommand*{\Fvec}{\vec{F}}
\newcommand*{\Gvec}{\vec{G}}
\newcommand*{\Hvec}{\vec{H}}
\newcommand*{\Ivec}{\vec{I}}
\newcommand*{\Jvec}{\vec{J}}
\newcommand*{\Kvec}{\vec{K}}
\newcommand*{\Lvec}{\vec{L}}
\newcommand*{\Mvec}{\vec{M}}
\newcommand*{\Nvec}{\vec{N}}
\newcommand*{\Ovec}{\vec{O}}
\newcommand*{\Pvec}{\vec{P}}
\newcommand*{\Qvec}{\vec{Q}}
\newcommand*{\Rvec}{\vec{R}}
\newcommand*{\Svec}{\vec{S}}
\newcommand*{\Tvec}{\vec{T}}
\newcommand*{\Uvec}{\vec{U}}
\newcommand*{\Vvec}{\vec{V}}
\newcommand*{\Wvec}{\vec{W}}
\newcommand*{\Xvec}{\vec{X}}
\newcommand*{\Yvec}{\vec{Y}}
\newcommand*{\Zvec}{\vec{Z}}

% Matrix aliases: \Xmat expands to \Xb (the \Xb macros are defined elsewhere
% in this file). Only A-I and V-Z are provided here.
\newcommand*{\Amat}{\Ab}
\newcommand*{\Bmat}{\Bb}
\newcommand*{\Cmat}{\Cb}
\newcommand*{\Dmat}{\Db}
\newcommand*{\Emat}{\Eb}
\newcommand*{\Fmat}{\Fb}
\newcommand*{\Gmat}{\Gb}
\newcommand*{\Hmat}{\Hb}
\newcommand*{\Imat}{\Ib}

\newcommand*{\Vmat}{\Vb}
\newcommand*{\Wmat}{\Wb}
\newcommand*{\Xmat}{\Xb}
\newcommand*{\Ymat}{\Yb}
\newcommand*{\Zmat}{\Zb}
% Upright "Var".
\newcommand*{\VV}{\mathrm{Var}}


% Accented bold vectors: a bar, tilde or hat placed over \vec{...}.
\newcommand*{\yvecbar}{\bar{\vec{y}}}
\newcommand*{\wvecbar}{\bar{\vec{w}}}
\newcommand*{\xvecbar}{\bar{\vec{x}}}
\newcommand*{\yvectil}{\tilde{\vec{y}}}
\newcommand*{\yvechat}{\hat{\vec{y}}}


%%%%%%%% Theorems and Friends %%%%%%%%%%%%%%%

%% Some style files might actually define these variables.
%% So don't mess with them if they are already defined

% End-of-proof markers and proof helpers. Each definition is guarded by
% \ifx\<name>\undefined, so anything already provided by the document class
% or a package (e.g. the proof environment from amsthm) is left untouched.

% \BlackBox: filled 1.5ex square used as the end-of-proof mark.
\ifx\BlackBox\undefined
\newcommand{\BlackBox}{\rule{1.5ex}{1.5ex}}  % end of proof
\fi

% \QED: small filled square, then a paragraph break and \medskip.
\ifx\QED\undefined
\def\QED{~\rule[-1pt]{5pt}{5pt}\par\medskip}
\fi

% proof: fallback environment; starts with a bold "Proof" and ends with a
% right-flushed \BlackBox. \normalfont\bfseries is what the obsolete \bf
% switch expands to in the standard classes, so the output is unchanged
% there while no longer depending on the class providing \bf.
\ifx\proof\undefined
\newenvironment{proof}{\par\noindent{\normalfont\bfseries Proof\ }}{\hfill\BlackBox\\[2mm]}
\fi

% \Proof: inline bold "Proof." followed by a non-breaking space.
\ifx\Proof\undefined
\newcommand{\Proof}{\noindent{\normalfont\bfseries Proof.}~}
\fi

% Theorem-like environments. An environment is declared only when no command
% of that name exists yet, so declarations made by the class or by a package
% take precedence. Every environment gets its own counter, except axiom,
% which is numbered on the theorem counter. (To share the theorem counter
% for another environment, declare it as \newtheorem{<env>}[theorem]{<Title>}.)
\ifx\theorem\undefined     \newtheorem{theorem}{Theorem}\fi
\ifx\example\undefined     \newtheorem{example}{Example}\fi
\ifx\property\undefined    \newtheorem{property}{Property}\fi
\ifx\lemma\undefined       \newtheorem{lemma}{Lemma}\fi
\ifx\proposition\undefined \newtheorem{proposition}{Proposition}\fi
\ifx\remark\undefined      \newtheorem{remark}{Remark}\fi
\ifx\corollary\undefined   \newtheorem{corollary}{Corollary}\fi
\ifx\definition\undefined  \newtheorem{definition}{Definition}\fi
\ifx\conjecture\undefined  \newtheorem{conjecture}{Conjecture}\fi
\ifx\axiom\undefined       \newtheorem{axiom}[theorem]{Axiom}\fi
\ifx\claim\undefined       \newtheorem{claim}{Claim}\fi
\ifx\assumption\undefined  \newtheorem{assumption}{Assumption}\fi

% \renewcommand{\qedsymbol}{$\blacksquare$}


%%%%%%%% Widely accepted Sets and Symbols %%%%%%%%%%%%%%%

% Double-letter shortcuts for blackboard-style symbols. Most use \mathds
% (dsfont); \NN, \RR and \dSS use \mathbb instead. The A and S entries are
% named \dAA and \dSS, and this list has no V entry.
\newcommand*{\done}{\mathds{1}} % indicator written as a double-stroke 1
\newcommand*{\dAA}{\mathds{A}}
\newcommand*{\BB}{\mathds{B}}
\newcommand*{\CC}{\mathds{C}} % complex numbers
\newcommand*{\DD}{\mathds{D}}
\newcommand*{\EE}{\mathds{E}} % expectation
\newcommand*{\FF}{\mathds{F}}
\newcommand*{\GG}{\mathds{G}}
\newcommand*{\HH}{\mathds{H}}
\newcommand*{\II}{\mathds{I}} % indicator written as a double-stroke I
\newcommand*{\JJ}{\mathds{J}}
\newcommand*{\KK}{\mathds{K}} % arbitrary field
\newcommand*{\LL}{\mathds{L}}
\newcommand*{\MM}{\mathds{M}} % median
\newcommand*{\NN}{\mathbb{N}} % natural numbers
\newcommand*{\OO}{\mathds{O}}
\newcommand*{\PP}{\mathds{P}} % probability
\newcommand*{\QQ}{\mathds{Q}} % rationals
\newcommand*{\RR}{\mathbb{R}} % real numbers
\newcommand*{\dSS}{\mathbb{S}}
\newcommand*{\RRbar}{\overline{\RR}} % \RR with an overline
\newcommand*{\TT}{\mathds{T}}
\newcommand*{\UU}{\mathds{U}}
\newcommand*{\WW}{\mathds{W}}
\newcommand*{\XX}{\mathds{X}}
\newcommand*{\YY}{\mathds{Y}}
\newcommand*{\ZZ}{\mathds{Z}} % integers

%%%%%%%% Mathematical Operations %%%%%%%%%%%%%%%

% Optimisation operators. The starred \operatorname* form places
% sub/superscripts as limits (underneath/above) in display style.
\newcommand*\mini{\operatorname*{minimize}}
\newcommand*\maxi{\operatorname*{maximize}}
\newcommand*\argmin{\operatorname*{argmin}}
\newcommand*\argmax{\operatorname*{argmax}}
\newcommand*\arginf{\operatorname*{arginf}}
\newcommand*\argsup{\operatorname*{argsup}}

% Patch \left and \right for the whole document.
% The primitives are saved as \originalleft / \originalright. The new \left
% emits an empty \mathopen atom and then opens a \mathclose group (\bgroup)
% that contains the real \left; the new \right uses \aftergroup to close that
% group (\egroup) right after the \left...\right construct ends.
% Every bracket macro in this file that uses \left...\right (\rbr, \sbr,
% \cbr, \norm, ...) goes through these definitions.
% NOTE(review): this looks like the usual workaround for the extra space TeX
% inserts around \left...\right -- confirm that is the intent.
\let\originalleft\left
\let\originalright\right
\renewcommand{\left}{\mathopen{}\mathclose\bgroup\originalleft}
\renewcommand{\right}{\aftergroup\egroup\originalright}

% Operators with an optional argument, e.g. \diag => diag ; \diag[A] => diag(A).
% Each macro stores its optional argument in the scratch macro \tst and
% compares it with \empty: when empty only the operator name is printed,
% otherwise the name is followed by the argument in auto-sized delimiters.
% Delimiters: (...) by default, \ssbr{...} for \expect and \prob, [...] for \var.
% \sgn and \sign are synonyms, as are \tr and \trace.
\newcommand{\co}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{co}\else\operatorname{co}\left(#1\right)\fi}
\newcommand{\core}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{core}\else\operatorname{core}\left(#1\right)\fi}
\newcommand{\diag}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{diag}\else\operatorname{diag}\left(#1\right)\fi}
\newcommand{\dom}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{dom}\else\operatorname{dom}\left(#1\right)\fi}
\newcommand{\expect}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{\mathds{E}}\else\operatorname{\mathds{E}}\ssbr{#1}\fi}
%\newcommand{\ind}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{\mathds{I}}\else\operatorname{\mathds{I}}\left[#1\right]\fi}
% \lap: bold "Lap", set with \mathbf rather than the obsolete \bf switch
% (same font in math mode).
\newcommand{\lap}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{\mathbf{Lap}}\else\operatorname{\mathbf{Lap}}\left(#1\right)\fi}
\newcommand{\prob}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{\mathds{P}}\else\operatorname{\mathds{P}}\ssbr{#1}\fi}
\newcommand{\proj}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{\mathcal{P}}\else\operatorname{\mathcal{P}}\left(#1\right)\fi}
\newcommand{\prox}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{prox}\else\operatorname{prox}\left(#1\right)\fi}
\newcommand{\rank}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{rank}\else\operatorname{rank}\left(#1\right)\fi}
\newcommand{\sgn}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{sign}\else\operatorname{sign}\left(#1\right)\fi}
\newcommand{\sign}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{sign}\else\operatorname{sign}\left(#1\right)\fi}
\newcommand{\tr}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{tr}\else\operatorname{tr}\left(#1\right)\fi}
\newcommand{\trace}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{tr}\else\operatorname{tr}\left(#1\right)\fi}
\newcommand{\traj}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{Traj}\else\operatorname{Traj}\left(#1\right)\fi}
\newcommand{\var}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{\mathds{V}}\else\operatorname{\mathds{V}}\left[#1\right]\fi}
\newcommand{\vect}[1][]{\def\tst{#1}\ifx\tst\empty\operatorname{vec}\else\operatorname{vec}\left(#1\right)\fi}

% Upright names set with \mathrm (no operator spacing).
\newcommand*{\cov}{\mathrm{Cov}}
\newcommand*{\conv}{\mathrm{conv}}
\newcommand*{\const}{\mathrm{constant}}
% Named operators set with \operatorname.
\newcommand*{\ri}{\operatorname{ri}}
\newcommand*{\cl}{\operatorname{cl}}
\newcommand*{\intr}{\operatorname{int}}
%\newcommand{\bd}{\operatorname{bd}}
\newcommand*{\emp}{\operatorname{emp}}
\newcommand*{\nnz}{\operatorname{nnz}}
% Right-flushed filled square; opens its own inline math, so use in text mode.
\newcommand*{\eproof}{$\null\hfill\blacksquare$}
% \O is redefined to \cO. This replaces LaTeX's default \O, so the text
% letter it normally produces is no longer available under that name.
\renewcommand*{\O}{{\cO}}
% \softO{x}: \widetilde{\cO} followed by x in auto-sized parentheses.
\newcommand\softO[1]{{\widetilde{\cO}}\rbr{#1}}

% Capitalised variants that take a mandatory argument. Most simply forward
% to the lower-case optional-argument macro, e.g. \Diag{A} => \diag[A].
% \Expect, \Proj and \Prox additionally take an optional first argument that
% is typeset underneath (\Expect) or as a subscript (\Proj, \Prox).
\newcommand{\Co}[1]{\co[#1]}
\newcommand{\Core}[1]{\core[#1]}
\newcommand{\Diag}[1]{\diag[#1]}
\newcommand{\Dom}[1]{\dom[#1]}
\newcommand{\Expect}[2][]{\underset{#1}{\EE}\ssbr{{#2}}}
% \Ind forwards to \ind when that macro exists. The definition of \ind next
% to the other optional-argument operators is commented out, so when \ind is
% undefined \Ind falls back to what that definition would print for a
% non-empty argument instead of raising "Undefined control sequence".
\newcommand{\Ind}[1]{%
  \ifx\ind\undefined
    \operatorname{\mathds{I}}\left[#1\right]%
  \else
    \ind[#1]%
  \fi}
\newcommand{\Lap}[1]{\lap[#1]}
\newcommand{\Prob}[1]{\prob[#1]}
\newcommand{\Proj}[2][]{\mathcal{P}_{#1}\sbr{#2}}
\newcommand{\Prox}[2][]{\operatorname{prox}_{#1}\rbr{#2}}
\newcommand{\Rank}[1]{\rank[#1]}
\newcommand{\Sgn}[1]{\sgn[#1]}
\newcommand{\Sign}[1]{\sign[#1]}
\newcommand{\Tr}[1]{\trace[#1]}
\newcommand{\Trace}[1]{\trace[#1]}
\newcommand{\Traj}[1]{\traj[#1]}
\newcommand{\Var}[1]{\var[#1]}
\newcommand{\Vect}[1]{\vect[#1]}

%%%%%%%% Utility functions %%%%%%%%%%%%%%%

% \eq{label}: parenthesised \ref.
\newcommand\eq[1]{(\ref{#1})}
% \mymatrix{colspec}{rows}: array wrapped in auto-sized square brackets.
\newcommand\mymatrix[2]{\left[\begin{array}{#1} #2 \end{array}\right]}
% \mychoose{n}{k}: n stacked over k in auto-sized parentheses.
\newcommand\mychoose[2]{\left(\begin{array}{c} #1 \\ #2 \end{array}\right)}
% \mydet{A}: \det followed by A in square brackets.
\newcommand\mydet[1]{\det\left[ #1 \right]}
% \myspan{S}: upright "span" followed by S in braces.
\newcommand\myspan[1]{\mathrm{span}\cbr{#1}}
% \smallfrac{a}{b}: fraction forced to text style.
\newcommand\smallfrac[2]{{\textstyle \frac{#1}{#2}}}
% \pwrt{x}, \ppwrt{x}: first and second partial-derivative operators in x.
\newcommand\pwrt[1]{\frac{\partial}{\partial #1}}
\newcommand\ppwrt[1]{\frac{\partial^2}{(\partial #1)^2}}
% Curly ordering relations.
\newcommand*{\aleq}{\preccurlyeq}
\newcommand*{\ageq}{\succcurlyeq}

%%%%%%%% Short Forms %%%%%%%%%%%%%%%

% Emphasised abbreviations. \cf and \wrt end with an explicit inter-word
% space (\ ); \ea, \eg, \ie and \iid do not, so spacing after those is up
% to the call site.
\newcommand*{\ea}{\emph{et al.}}
\newcommand*{\eg}{\emph{e.g.}}
\newcommand*{\ie}{\emph{i.e.}}
\newcommand*{\iid}{\emph{iid}}
\newcommand*{\cf}{\emph{cf.}\ }
\newcommand*{\wrt}{\emph{w.r.t.}\ }

%%%%%%%% Brackets %%%%%%%%%%%%%%%

% Auto-sized delimiters: each macro wraps its argument in \left...\right.
\newcommand{\rbr}[1]{\left({#1}\right)}                 % ( ... )
\newcommand{\sbr}[1]{\left[{#1}\right]}                 % [ ... ]
\newcommand{\cbr}[1]{\left\{{#1}\right\}}               % { ... }
\newcommand{\nbr}[1]{\left\|{#1}\right\|}               % || ... ||, same as \norm
\newcommand{\abr}[1]{\left\langle{#1}\right\rangle}     % < ... >
\newcommand{\abs}[1]{\left|{#1}\right|}                 % | ... |
\newcommand{\floor}[1]{\left\lfloor {#1} \right\rfloor}
\newcommand{\ceil}[1]{\left\lceil {#1} \right\rceil}
\newcommand{\inner}[2]{\left\langle {#1},{#2} \right\rangle} % <a,b>
\newcommand{\norm}[1]{\left\|{#1}\right\|}
% Subscripted norms. The subscript is always braced: an unbraced
% `_\text{F}' only works because of how \text happens to expand, and the
% braced form matches \trnorm.
\newcommand{\frob}[1]{\norm{#1}_{\text{F}}}
\newcommand{\fro}[1]{\norm{#1}_{\text{F}}}
\newcommand{\onenorm}[1]{\norm{#1}_1}
\newcommand{\twonorm}[1]{\norm{#1}_2}
\newcommand{\infnorm}[1]{\norm{#1}_{\infty}}
\newcommand{\trnorm}[1]{\norm{#1}_{\text{tr}}}
% Triple bars, tightened with negative thin spaces.
\newcommand{\ccc}[1]{\left|\!\left|\!\left|{#1}\right|\!\right|\!\right|}
% Double square brackets (\llbracket/\rrbracket, stmaryrd); three synonyms.
\newcommand{\bsd}[1]{\left\llbracket{#1}\right\rrbracket}
\newcommand{\ssbr}[1]{\left\llbracket{#1}\right\rrbracket}
\newcommand{\sembrack}[1]{\left\llbracket{#1}\right\rrbracket}

%%%%%%%%%%%%%%%  Mathematical Constants  %%%%%%%%%%%%%%%

% Bold numerals and small constants. \one/\bone and \zero/\bzero are synonyms.
\newcommand{\one}{{\bm{1}}}  % bold 1
\newcommand{\zero}{{\bm{0}}} % bold 0
\newcommand{\bone}{{\bm{1}}}  % bold 1
\newcommand{\bzero}{{\bm{0}}} % bold 0
% WARNING: the next line does not create a macro called "\b0". It redefines
% \b with the parameter text "0", i.e. \b must from now on be followed
% immediately by the character 0 and then expands to a bold 0. Any other use
% of \b (LaTeX's default bar-under accent, e.g. \b{o}) will stop with
% "Use of \b doesn't match its definition".
\def\b0{\bm{0}} % bold 0, written \b0
\newcommand{\half}{\frac{1}{2}}
\newcommand{\sqrttwo}{\sqrt{2}}
\newcommand{\invsqrttwo}{\frac{1}{\sqrt{2}}}
% Upright d; \ensuremath lets \rmd be used outside math mode as well.
\newcommand{\rmd}{\ensuremath{\mathrm{d}}}
% \diff[n]: upright d with optional superscript n, typeset as an empty
% \mathop followed by a negative thin space and the d.
\newcommand*{\diff}[1][]{\mathop{}\!{\mathrm{d}^{#1}}}

%%%%%%%% Greek Symbols %%%%%%%%%%%%%%%

% Bold sigma: lower case via \bm, upper case via \mathbf.
\newcommand*{\sigmab}{\bm{\sigma}}
\newcommand*{\Sigmab}{\mathbf{\Sigma}}

% Bold lower-case Greek, v-prefix: \v<name> expands to \vec{\<name>}.
% Abbreviated aliases exist for some letters (\val and \AL for alpha, \vga,
% \vde, \veps, \vze, \vth, \vsig). No omicron entry.
\newcommand*{\val}{\vec{\alpha}}
\newcommand*{\valpha}{\vec{\alpha}}
\newcommand*{\AL}{\vec{\alpha}}
\newcommand*{\vbeta}{\vec{\beta}}
\newcommand*{\vga}{\vec{\gamma}}
\newcommand*{\vgamma}{\vec{\gamma}}
\newcommand*{\vde}{\vec{\delta}}
\newcommand*{\vdelta}{\vec{\delta}}
\newcommand*{\veps}{\vec{\epsilon}}
\newcommand*{\vepsilon}{\vec{\epsilon}}
\newcommand*{\vze}{\vec{\zeta}}
\newcommand*{\vzeta}{\vec{\zeta}}
\newcommand*{\veta}{\vec{\eta}}
\newcommand*{\vth}{\vec{\theta}}
\newcommand*{\vtheta}{\vec{\theta}}
\newcommand*{\viota}{\vec{\iota}}
\newcommand*{\vkappa}{\vec{\kappa}}
\newcommand*{\vlambda}{\vec{\lambda}}
\newcommand*{\vmu}{\vec{\mu}}
\newcommand*{\vnu}{\vec{\nu}}
\newcommand*{\vxi}{\vec{\xi}}
\newcommand*{\vpi}{\vec{\pi}}
\newcommand*{\vrho}{\vec{\rho}}
\newcommand*{\vsig}{\vec{\sigma}}
\newcommand*{\vsigma}{\vec{\sigma}}
\newcommand*{\vtau}{\vec{\tau}}
\newcommand*{\vupsilon}{\vec{\upsilon}}
\newcommand*{\vphi}{\vec{\phi}}
\newcommand*{\vchi}{\vec{\chi}}
\newcommand*{\vpsi}{\vec{\psi}}
\newcommand*{\vomega}{\vec{\omega}}

% Bold lower-case Greek, b-prefix: same expansions as the v-prefix family.
% The eta entry is named \boldeta; this group defines no \beta-style name
% for it.
\newcommand*{\bal}{\vec{\alpha}}
\newcommand*{\balpha}{\vec{\alpha}}
\newcommand*{\bbeta}{\vec{\beta}}
\newcommand*{\bga}{\vec{\gamma}}
\newcommand*{\bgamma}{\vec{\gamma}}
\newcommand*{\bde}{\vec{\delta}}
\newcommand*{\bdelta}{\vec{\delta}}
\newcommand*{\beps}{\vec{\epsilon}}
\newcommand*{\bepsilon}{\vec{\epsilon}}
\newcommand*{\bze}{\vec{\zeta}}
\newcommand*{\bzeta}{\vec{\zeta}}
\newcommand*{\boldeta}{\vec{\eta}}
\newcommand*{\bth}{\vec{\theta}}
\newcommand*{\btheta}{\vec{\theta}}
\newcommand*{\biota}{\vec{\iota}}
\newcommand*{\bkappa}{\vec{\kappa}}
\newcommand*{\blambda}{\vec{\lambda}}
\newcommand*{\bmu}{\vec{\mu}}
\newcommand*{\bnu}{\vec{\nu}}
\newcommand*{\bxi}{\vec{\xi}}
\newcommand*{\bpi}{\vec{\pi}}
\newcommand*{\brho}{\vec{\rho}}
\newcommand*{\bsig}{\vec{\sigma}}
\newcommand*{\bsigma}{\vec{\sigma}}
\newcommand*{\btau}{\vec{\tau}}
\newcommand*{\bupsilon}{\vec{\upsilon}}
\newcommand*{\bphi}{\vec{\phi}}
\newcommand*{\bchi}{\vec{\chi}}
\newcommand*{\bpsi}{\vec{\psi}}
\newcommand*{\bomega}{\vec{\omega}}

% Barred bold Greek: \b<name>bar expands to \bar{\v<name>}.
\newcommand*{\ALbar}{\bar{\val}}
\newcommand*{\balbar}{\bar{\val}}
\newcommand*{\balphabar}{\bar{\val}}
\newcommand*{\bbetabar}{\bar{\vbeta}}
\newcommand*{\bgabar}{\bar{\vgamma}}
\newcommand*{\bgammabar}{\bar{\vgamma}}
\newcommand*{\bdebar}{\bar{\vdelta}}
\newcommand*{\bdeltabar}{\bar{\vdelta}}
\newcommand*{\bepsbar}{\bar{\vepsilon}}
\newcommand*{\bepsilonbar}{\bar{\vepsilon}}
\newcommand*{\bzebar}{\bar{\vzeta}}
\newcommand*{\bzetabar}{\bar{\vzeta}}
\newcommand*{\boldetabar}{\bar{\veta}}
\newcommand*{\bthbar}{\bar{\vtheta}}
\newcommand*{\bthetabar}{\bar{\vtheta}}
\newcommand*{\biotabar}{\bar{\viota}}
\newcommand*{\bkappabar}{\bar{\vkappa}}
\newcommand*{\blambdabar}{\bar{\vlambda}}
\newcommand*{\bmubar}{\bar{\vmu}}
\newcommand*{\bnubar}{\bar{\vnu}}
\newcommand*{\bxibar}{\bar{\vxi}}
\newcommand*{\bpibar}{\bar{\vpi}}
\newcommand*{\brhobar}{\bar{\vrho}}
\newcommand*{\bsigbar}{\bar{\vsigma}}
\newcommand*{\bsigmabar}{\bar{\vsigma}}
\newcommand*{\btaubar}{\bar{\vtau}}
\newcommand*{\bupsilonbar}{\bar{\vupsilon}}
\newcommand*{\bphibar}{\bar{\vphi}}
\newcommand*{\bchibar}{\bar{\vchi}}
\newcommand*{\bpsibar}{\bar{\vpsi}}
\newcommand*{\bomegabar}{\bar{\vomega}}

% Hatted bold Greek: \b<name>hat expands to \hat{\v<name>}.
\newcommand*{\ALhat}{\hat{\val}}
\newcommand*{\balhat}{\hat{\val}}
\newcommand*{\balphahat}{\hat{\val}}
\newcommand*{\bbetahat}{\hat{\vbeta}}
\newcommand*{\bgahat}{\hat{\vgamma}}
\newcommand*{\bgammahat}{\hat{\vgamma}}
\newcommand*{\bdehat}{\hat{\vdelta}}
\newcommand*{\bdeltahat}{\hat{\vdelta}}
\newcommand*{\bepshat}{\hat{\vepsilon}}
\newcommand*{\bepsilonhat}{\hat{\vepsilon}}
\newcommand*{\bzehat}{\hat{\vzeta}}
\newcommand*{\bzetahat}{\hat{\vzeta}}
\newcommand*{\boldetahat}{\hat{\veta}}
\newcommand*{\bthhat}{\hat{\vtheta}}
\newcommand*{\bthetahat}{\hat{\vtheta}}
\newcommand*{\biotahat}{\hat{\viota}}
\newcommand*{\bkappahat}{\hat{\vkappa}}
\newcommand*{\blambdahat}{\hat{\vlambda}}
\newcommand*{\bmuhat}{\hat{\vmu}}
\newcommand*{\bnuhat}{\hat{\vnu}}
\newcommand*{\bxihat}{\hat{\vxi}}
\newcommand*{\bpihat}{\hat{\vpi}}
\newcommand*{\brhohat}{\hat{\vrho}}
\newcommand*{\bsighat}{\hat{\vsigma}}
\newcommand*{\bsigmahat}{\hat{\vsigma}}
\newcommand*{\btauhat}{\hat{\vtau}}
\newcommand*{\bupsilonhat}{\hat{\vupsilon}}
\newcommand*{\bphihat}{\hat{\vphi}}
\newcommand*{\bchihat}{\hat{\vchi}}
\newcommand*{\bpsihat}{\hat{\vpsi}}
\newcommand*{\bomegahat}{\hat{\vomega}}

% Narrow-tilde bold Greek: \b<name>til expands to \tilde{\v<name>}.
\newcommand*{\ALtil}{\tilde{\val}}
\newcommand*{\baltil}{\tilde{\val}}
\newcommand*{\balphatil}{\tilde{\val}}
\newcommand*{\bbetatil}{\tilde{\vbeta}}
\newcommand*{\bgatil}{\tilde{\vgamma}}
\newcommand*{\bgammatil}{\tilde{\vgamma}}
\newcommand*{\bdetil}{\tilde{\vdelta}}
\newcommand*{\bdeltatil}{\tilde{\vdelta}}
\newcommand*{\bepstil}{\tilde{\vepsilon}}
\newcommand*{\bepsilontil}{\tilde{\vepsilon}}
\newcommand*{\bzetil}{\tilde{\vzeta}}
\newcommand*{\bzetatil}{\tilde{\vzeta}}
\newcommand*{\boldetatil}{\tilde{\veta}}
\newcommand*{\bthtil}{\tilde{\vtheta}}
\newcommand*{\bthetatil}{\tilde{\vtheta}}
\newcommand*{\biotatil}{\tilde{\viota}}
\newcommand*{\bkappatil}{\tilde{\vkappa}}
\newcommand*{\blambdatil}{\tilde{\vlambda}}
\newcommand*{\bmutil}{\tilde{\vmu}}
\newcommand*{\bnutil}{\tilde{\vnu}}
\newcommand*{\bxitil}{\tilde{\vxi}}
\newcommand*{\bpitil}{\tilde{\vpi}}
\newcommand*{\brhotil}{\tilde{\vrho}}
\newcommand*{\bsigtil}{\tilde{\vsigma}}
\newcommand*{\bsigmatil}{\tilde{\vsigma}}
\newcommand*{\btautil}{\tilde{\vtau}}
\newcommand*{\bupsilontil}{\tilde{\vupsilon}}
\newcommand*{\bphitil}{\tilde{\vphi}}
\newcommand*{\bchitil}{\tilde{\vchi}}
\newcommand*{\bpsitil}{\tilde{\vpsi}}
\newcommand*{\bomegatil}{\tilde{\vomega}}

% Wide-tilde bold Greek: \b<name>tilde expands to \widetilde{\v<name>}.
\newcommand*{\ALtilde}{\widetilde{\val}}
\newcommand*{\baltilde}{\widetilde{\val}}
\newcommand*{\balphatilde}{\widetilde{\val}}
\newcommand*{\bbetatilde}{\widetilde{\vbeta}}
\newcommand*{\bgatilde}{\widetilde{\vgamma}}
\newcommand*{\bgammatilde}{\widetilde{\vgamma}}
\newcommand*{\bdetilde}{\widetilde{\vdelta}}
\newcommand*{\bdeltatilde}{\widetilde{\vdelta}}
\newcommand*{\bepstilde}{\widetilde{\vepsilon}}
\newcommand*{\bepsilontilde}{\widetilde{\vepsilon}}
\newcommand*{\bzetilde}{\widetilde{\vzeta}}
\newcommand*{\bzetatilde}{\widetilde{\vzeta}}
\newcommand*{\boldetatilde}{\widetilde{\veta}}
\newcommand*{\bthtilde}{\widetilde{\vtheta}}
\newcommand*{\bthetatilde}{\widetilde{\vtheta}}
\newcommand*{\biotatilde}{\widetilde{\viota}}
\newcommand*{\bkappatilde}{\widetilde{\vkappa}}
\newcommand*{\blambdatilde}{\widetilde{\vlambda}}
\newcommand*{\bmutilde}{\widetilde{\vmu}}
\newcommand*{\bnutilde}{\widetilde{\vnu}}
\newcommand*{\bxitilde}{\widetilde{\vxi}}
\newcommand*{\bpitilde}{\widetilde{\vpi}}
\newcommand*{\brhotilde}{\widetilde{\vrho}}
\newcommand*{\bsigtilde}{\widetilde{\vsigma}}
\newcommand*{\bsigmatilde}{\widetilde{\vsigma}}
\newcommand*{\btautilde}{\widetilde{\vtau}}
\newcommand*{\bupsilontilde}{\widetilde{\vupsilon}}
\newcommand*{\bphitilde}{\widetilde{\vphi}}
\newcommand*{\bchitilde}{\widetilde{\vchi}}
\newcommand*{\bpsitilde}{\widetilde{\vpsi}}
\newcommand*{\bomegatilde}{\widetilde{\vomega}}

% Narrow-tilde bold Greek, shortest suffix: \b<name>t expands to
% \tilde{\v<name>}. Every entry in this group is a tilde; \balphat used to
% expand to \hat{\val} (the hatted form is \balphahat) and now matches the
% rest of the group.
\newcommand{\ALt}{\tilde{\val}}
\newcommand{\balt}{\tilde{\val}}
\newcommand{\balphat}{\tilde{\val}}
\newcommand{\bbetat}{\tilde{\vbeta}}
\newcommand{\bgat}{\tilde{\vgamma}}
\newcommand{\bgammat}{\tilde{\vgamma}}
\newcommand{\bdet}{\tilde{\vdelta}}
\newcommand{\bdeltat}{\tilde{\vdelta}}
\newcommand{\bepst}{\tilde{\vepsilon}}
\newcommand{\bepsilont}{\tilde{\vepsilon}}
\newcommand{\bzet}{\tilde{\vzeta}}
\newcommand{\bzetat}{\tilde{\vzeta}}
\newcommand{\boldetat}{\tilde{\veta}}
\newcommand{\btht}{\tilde{\vtheta}}
\newcommand{\bthetat}{\tilde{\vtheta}}
\newcommand{\biotat}{\tilde{\viota}}
\newcommand{\bkappat}{\tilde{\vkappa}}
\newcommand{\blambdat}{\tilde{\vlambda}}
\newcommand{\bmut}{\tilde{\vmu}}
\newcommand{\bnut}{\tilde{\vnu}}
\newcommand{\bxit}{\tilde{\vxi}}
\newcommand{\bpit}{\tilde{\vpi}}
\newcommand{\brhot}{\tilde{\vrho}}
\newcommand{\bsigt}{\tilde{\vsigma}}
\newcommand{\bsigmat}{\tilde{\vsigma}}
\newcommand{\btaut}{\tilde{\vtau}}
\newcommand{\bupsilont}{\tilde{\vupsilon}}
\newcommand{\bphit}{\tilde{\vphi}}
\newcommand{\bchit}{\tilde{\vchi}}
\newcommand{\bpsit}{\tilde{\vpsi}}
\newcommand{\bomegat}{\tilde{\vomega}}

% Barred scalar Greek: \<name>bar expands to \bar{\<name>}.
% \albar and \alphabar are written with \alpha directly, like \alphatil and
% \alphat, instead of the abbreviation \al, so they do not depend on an
% \al macro being defined somewhere else.
\newcommand{\albar}{\bar{\alpha}}
\newcommand{\alphabar}{\bar{\alpha}}
\newcommand{\betabar}{\bar{\beta}}
\newcommand{\gabar}{\bar{\gamma}}
\newcommand{\gammabar}{\bar{\gamma}}
\newcommand{\debar}{\bar{\delta}}
\newcommand{\deltabar}{\bar{\delta}}
\newcommand{\epsbar}{\bar{\epsilon}}
\newcommand{\epsilonbar}{\bar{\epsilon}}
\newcommand{\zebar}{\bar{\zeta}}
\newcommand{\zetabar}{\bar{\zeta}}
\newcommand{\etabar}{\bar{\eta}}
\newcommand{\thbar}{\bar{\theta}}
\newcommand{\thetabar}{\bar{\theta}}
\newcommand{\iotabar}{\bar{\iota}}
\newcommand{\kappabar}{\bar{\kappa}}
\newcommand{\lambdabar}{\bar{\lambda}}
\newcommand{\mubar}{\bar{\mu}}
\newcommand{\nubar}{\bar{\nu}}
\newcommand{\xibar}{\bar{\xi}}
\newcommand{\pibar}{\bar{\pi}}
\newcommand{\rhobar}{\bar{\rho}}
\newcommand{\sigbar}{\bar{\sigma}}
\newcommand{\sigmabar}{\bar{\sigma}}
\newcommand{\taubar}{\bar{\tau}}
\newcommand{\upsilonbar}{\bar{\upsilon}}
\newcommand{\phibar}{\bar{\phi}}
\newcommand{\chibar}{\bar{\chi}}
\newcommand{\psibar}{\bar{\psi}}
\newcommand{\omegabar}{\bar{\omega}}

% Hatted scalar Greek: \<name>hat expands to \hat{\<name>}.
% \alhat and \alphahat are written with \alpha directly, like \alphatil and
% \alphat, instead of the abbreviation \al, so they do not depend on an
% \al macro being defined somewhere else.
% Note: unlike the bar and tilde groups, this group has no pi entry (\pihat).
\newcommand{\alhat}{\hat{\alpha}}
\newcommand{\alphahat}{\hat{\alpha}}
\newcommand{\betahat}{\hat{\beta}}
\newcommand{\gahat}{\hat{\gamma}}
\newcommand{\gammahat}{\hat{\gamma}}
\newcommand{\dehat}{\hat{\delta}}
\newcommand{\deltahat}{\hat{\delta}}
\newcommand{\epshat}{\hat{\epsilon}}
\newcommand{\epsilonhat}{\hat{\epsilon}}
\newcommand{\zehat}{\hat{\zeta}}
\newcommand{\zetahat}{\hat{\zeta}}
\newcommand{\etahat}{\hat{\eta}}
\newcommand{\thhat}{\hat{\theta}}
\newcommand{\thetahat}{\hat{\theta}}
\newcommand{\iotahat}{\hat{\iota}}
\newcommand{\kappahat}{\hat{\kappa}}
\newcommand{\lambdahat}{\hat{\lambda}}
\newcommand{\muhat}{\hat{\mu}}
\newcommand{\nuhat}{\hat{\nu}}
\newcommand{\xihat}{\hat{\xi}}
\newcommand{\rhohat}{\hat{\rho}}
\newcommand{\sighat}{\hat{\sigma}}
\newcommand{\sigmahat}{\hat{\sigma}}
\newcommand{\tauhat}{\hat{\tau}}
\newcommand{\upsilonhat}{\hat{\upsilon}}
\newcommand{\phihat}{\hat{\phi}}
\newcommand{\chihat}{\hat{\chi}}
\newcommand{\psihat}{\hat{\psi}}
\newcommand{\omegahat}{\hat{\omega}}

% Narrow-tilde scalar Greek: \<name>til expands to \tilde{\<name>}.
% Abbreviated aliases in this group: \gatil, \epstil, \zetil, \sigtil.
\newcommand*{\alphatil}{\tilde{\alpha}}
\newcommand*{\betatil}{\tilde{\beta}}
\newcommand*{\gatil}{\tilde{\gamma}}
\newcommand*{\gammatil}{\tilde{\gamma}}
\newcommand*{\deltatil}{\tilde{\delta}}
\newcommand*{\epstil}{\tilde{\epsilon}}
\newcommand*{\epsilontil}{\tilde{\epsilon}}
\newcommand*{\zetil}{\tilde{\zeta}}
\newcommand*{\zetatil}{\tilde{\zeta}}
\newcommand*{\etatil}{\tilde{\eta}}
\newcommand*{\thetatil}{\tilde{\theta}}
\newcommand*{\iotatil}{\tilde{\iota}}
\newcommand*{\kappatil}{\tilde{\kappa}}
\newcommand*{\lambdatil}{\tilde{\lambda}}
\newcommand*{\mutil}{\tilde{\mu}}
\newcommand*{\nutil}{\tilde{\nu}}
\newcommand*{\xitil}{\tilde{\xi}}
\newcommand*{\pitil}{\tilde{\pi}}
\newcommand*{\rhotil}{\tilde{\rho}}
\newcommand*{\sigtil}{\tilde{\sigma}}
\newcommand*{\sigmatil}{\tilde{\sigma}}
\newcommand*{\tautil}{\tilde{\tau}}
\newcommand*{\upsilontil}{\tilde{\upsilon}}
\newcommand*{\phitil}{\tilde{\phi}}
\newcommand*{\chitil}{\tilde{\chi}}
\newcommand*{\psitil}{\tilde{\psi}}
\newcommand*{\omegatil}{\tilde{\omega}}

% Shorter "...t" names for the same tilde-accented lower-case Greek letters.
\newcommand*{\alphat}{\tilde{\alpha}}
\newcommand*{\betat}{\tilde{\beta}}
\newcommand*{\gammat}{\tilde{\gamma}}
\newcommand*{\deltat}{\tilde{\delta}}
\newcommand*{\epsilont}{\tilde{\epsilon}}
\newcommand*{\epst}{\tilde{\epsilon}}
\newcommand*{\zetat}{\tilde{\zeta}}
\newcommand*{\etat}{\tilde{\eta}}
\newcommand*{\thetat}{\tilde{\theta}}
\newcommand*{\iotat}{\tilde{\iota}}
\newcommand*{\kappat}{\tilde{\kappa}}
\newcommand*{\lambdat}{\tilde{\lambda}}
\newcommand*{\mut}{\tilde{\mu}}
\newcommand*{\nut}{\tilde{\nu}}
\newcommand*{\xit}{\tilde{\xi}}
\newcommand*{\pit}{\tilde{\pi}}
\newcommand*{\rhot}{\tilde{\rho}}
\newcommand*{\sigt}{\tilde{\sigma}}
\newcommand*{\sigmat}{\tilde{\sigma}}
\newcommand*{\taut}{\tilde{\tau}}
\newcommand*{\upsilont}{\tilde{\upsilon}}
\newcommand*{\phit}{\tilde{\phi}}
\newcommand*{\chit}{\tilde{\chi}}
\newcommand*{\psit}{\tilde{\psi}}
\newcommand*{\omegat}{\tilde{\omega}}

% Upper Case Greek Symbols
% Barred upper-case Greek letters.
\newcommand*{\Gammabar}{\bar{\Gamma}}
\newcommand*{\Deltabar}{\bar{\Delta}}
\newcommand*{\Thetabar}{\bar{\Theta}}
\newcommand*{\Lambdabar}{\bar{\Lambda}}
\newcommand*{\Xibar}{\bar{\Xi}}
\newcommand*{\Pibar}{\bar{\Pi}}
\newcommand*{\Sigmabar}{\bar{\Sigma}}
\newcommand*{\Upsilonbar}{\bar{\Upsilon}}
\newcommand*{\Phibar}{\bar{\Phi}}
\newcommand*{\Psibar}{\bar{\Psi}}
\newcommand*{\Omegabar}{\bar{\Omega}}

% Hatted upper-case Greek letters.
\newcommand*{\Gammahat}{\hat{\Gamma}}
\newcommand*{\Deltahat}{\hat{\Delta}}
\newcommand*{\Thetahat}{\hat{\Theta}}
\newcommand*{\Lambdahat}{\hat{\Lambda}}
\newcommand*{\Xihat}{\hat{\Xi}}
\newcommand*{\Pihat}{\hat{\Pi}}
\newcommand*{\Sigmahat}{\hat{\Sigma}}
\newcommand*{\Upsilonhat}{\hat{\Upsilon}}
\newcommand*{\Phihat}{\hat{\Phi}}
\newcommand*{\Psihat}{\hat{\Psi}}
\newcommand*{\Omegahat}{\hat{\Omega}}

% Upper-case Greek letters under a narrow \tilde ("...til" naming).
\newcommand*{\Gammatil}{\tilde{\Gamma}}
\newcommand*{\Deltatil}{\tilde{\Delta}}
\newcommand*{\Thetatil}{\tilde{\Theta}}
\newcommand*{\Lambdatil}{\tilde{\Lambda}}
\newcommand*{\Xitil}{\tilde{\Xi}}
\newcommand*{\Pitil}{\tilde{\Pi}}
\newcommand*{\Sigmatil}{\tilde{\Sigma}}
\newcommand*{\Upsilontil}{\tilde{\Upsilon}}
\newcommand*{\Phitil}{\tilde{\Phi}}
\newcommand*{\Psitil}{\tilde{\Psi}}
\newcommand*{\Omegatil}{\tilde{\Omega}}

% Upper-case Greek letters under a wide tilde (\widetilde, "...tilde" naming).
\newcommand*{\Gammatilde}{\widetilde{\Gamma}}
\newcommand*{\Deltatilde}{\widetilde{\Delta}}
\newcommand*{\Thetatilde}{\widetilde{\Theta}}
\newcommand*{\Lambdatilde}{\widetilde{\Lambda}}
\newcommand*{\Xitilde}{\widetilde{\Xi}}
\newcommand*{\Pitilde}{\widetilde{\Pi}}
\newcommand*{\Sigmatilde}{\widetilde{\Sigma}}
\newcommand*{\Upsilontilde}{\widetilde{\Upsilon}}
\newcommand*{\Phitilde}{\widetilde{\Phi}}
\newcommand*{\Psitilde}{\widetilde{\Psi}}
\newcommand*{\Omegatilde}{\widetilde{\Omega}}

% Shorter "...t" names for the narrow-tilde upper-case Greek letters.
\newcommand*{\Gammat}{\tilde{\Gamma}}
\newcommand*{\Deltat}{\tilde{\Delta}}
\newcommand*{\Thetat}{\tilde{\Theta}}
\newcommand*{\Lambdat}{\tilde{\Lambda}}
\newcommand*{\Xit}{\tilde{\Xi}}
\newcommand*{\Pit}{\tilde{\Pi}}
\newcommand*{\Sigmat}{\tilde{\Sigma}}
\newcommand*{\Upsilont}{\tilde{\Upsilon}}
\newcommand*{\Phit}{\tilde{\Phi}}
\newcommand*{\Psit}{\tilde{\Psi}}
\newcommand*{\Omegat}{\tilde{\Omega}}

% others
% Accented forms of the shorthands \val, \vth and \vbeta
% (those base macros are not defined in this part of the file).
\newcommand*{\valbar}{\bar{\val}}
\newcommand*{\valhat}{\hat{\val}}
\newcommand*{\valtil}{\tilde{\val}}
\newcommand*{\vthtil}{\tilde{\vth}}
\newcommand*{\vthhat}{\hat{\vth}}
\newcommand*{\vthbar}{\bar{\vth}}
\newcommand*{\vbetatil}{\tilde{\vbeta}}
\newcommand*{\vbetahat}{\hat{\vbeta}}

% "...vec" names. The \alphavec family reuses \val (defined elsewhere);
% all other letters are set with an arrow via \vec.
\newcommand*{\alphavec}{\val}
\newcommand*{\alphavecbar}{\bar{\val}}
\newcommand*{\alphavechat}{\hat{\val}}
\newcommand*{\alphavectil}{\tilde{\val}}
\newcommand*{\betavec}{\vec{\beta}}
\newcommand*{\gammavec}{\vec{\gamma}}
\newcommand*{\deltavec}{\vec{\delta}}
\newcommand*{\etavec}{\vec{\eta}}
\newcommand*{\phivec}{\vec{\phi}}
\newcommand*{\psivec}{\vec{\psi}}
\newcommand*{\thetavec}{\vec{\theta}}
\newcommand*{\muvec}{\vec{\mu}}
\newcommand*{\xivec}{\vec{\xi}}
\newcommand*{\chivec}{\vec{\chi}}
\newcommand*{\lambdavec}{\vec{\lambda}}

% Bold Greek letters ("...b" suffix) via \boldsymbol; \elltil (tilde over
% \ell) sits in this group as well.
\newcommand*{\alphab}{\boldsymbol{\alpha}}
\newcommand*{\phib}{\boldsymbol{\phi}}
\newcommand*{\epsilonb}{\boldsymbol{\epsilon}}
\newcommand*{\betab}{\boldsymbol{\beta}}
\newcommand*{\gammab}{\boldsymbol{\gamma}}
\newcommand*{\thetab}{\boldsymbol{\theta}}
\newcommand*{\mub}{\boldsymbol{\mu}}
\newcommand*{\xib}{\boldsymbol{\xi}}
\newcommand*{\Deltab}{\boldsymbol{\Delta}}
\newcommand*{\Pib}{\boldsymbol{\Pi}}
\newcommand*{\etab}{\boldsymbol{\eta}}
\newcommand*{\taub}{\boldsymbol{\tau}}
\newcommand*{\lambdab}{\boldsymbol{\lambda}}
\newcommand*{\elltil}{\tilde{\ell}}
\newcommand*{\rhob}{\boldsymbol{\rho}}

% Extra accent shorthands for \delta.
\newcommand*{\delhat}{\hat{\delta}}
\newcommand*{\delbar}{\bar{\delta}}

% Tilde placed over the arrow-vector \lambda.
\newcommand*{\lambdavectil}{\tilde{\vec{\lambda}}}


%%%% Symbols specific to this project %%%%%

% Upright name for the Poisson distribution.
\newcommand{\Poisson}{\mathrm{Poisson}}

% One-token font shortcuts taking a single argument:
%   \*X -> \mathbf{X},  \_X -> \mathcal{X},  \-X -> \mathbb{X},  \=X -> \pmb{X}.
% NOTE(review): \*, \_, \- and \= are already LaTeX control symbols
% (discretionary multiplication sign, text underscore, discretionary hyphen,
% macron accent / tabbing stop). These \def lines replace them without any
% warning from the point of definition onward, so e.g. \_ no longer prints an
% underscore. \ball and \sphere below depend on \-; confirm that nothing else
% (bibliography entries, URLs, hyphenation hints) needs the original meanings.
\def\*#1{\mathbf{#1}}
\def\_#1{\mathcal{#1}}
\def\-#1{\mathbb{#1}}
\def\=#1{\pmb{#1}}

% Symbol for the "best" interval returned by the elimination routine.
\newcommand*{\Ibest}{I_{\text{best}}}
% Grid-spacing symbol, set in small caps.
\newcommand*{\gap}{\textsc{gap}}

% Indicator of the event #1; relies on \bone, which is not defined in this
% part of the file.
\newcommand{\ind}[1]{\bone\left\{ #1 \right\}}

% Expectation and probability symbols. \def is kept deliberately: it
% overwrites any earlier definition instead of raising an error.
\def\EE{{\mathbb{E}}}
\def\PP{{\mathbb{P}}}

% Expected / empirical "R" symbols.
\newcommand*{\RegExp}{\mathsf{R}}
\newcommand*{\RegEmp}{\widehat{\mathsf{R}}}
% Bold (vector) estimators and the optimum.
\newcommand*{\thetals}{\hat{\pmb{\theta}}_{\mathrm{ls}}}
\newcommand*{\thetawls}{\hat{\pmb{\theta}}_{\mathrm{wls}}}
\newcommand*{\thetamle}{\hat{\pmb{\theta}}_{\mathrm{mle}}}
\newcommand*{\thetaopt}{\pmb{\theta}^{*}}
% Scalar ("one") counterparts.
\newcommand*{\thetamleone}{\hat{\theta}_{\mathrm{mle}}}
\newcommand*{\thetaoptone}{\theta^{*}}
\newcommand*{\thetalsone}{\hat{\theta}_{\mathrm{ls}}}
\newcommand*{\thetabet}{\hat{\pmb{\theta}}(\pmb{\beta})}
% Blackboard-bold B and S, written out directly (same expansion as \-B, \-S).
\newcommand*{\ball}{\mathbb{B}}
\newcommand*{\sphere}{\mathbb{S}}
\newcommand*{\subg}{\mathrm{subG}}
% Index with a "min" label. The label is set upright with \mathrm so it reads
% as a word (not the product m*i*n) and matches \lmin / \lmax below.
\newcommand{\im}{i_{\mathrm{min}}}
% Upright "eig" operator name.
\newcommand{\eig}{\mathrm{eig}}
% \lambda with upright max / min labels.
\newcommand{\lmax}{\lambda_{\mathrm{max}}}
\newcommand{\lmin}{\lambda_{\mathrm{min}}}
% Gradient with respect to the bold parameter vector \thetab.
\newcommand*{\dtheta}{\nabla_{\thetab}}
\newcommand*{\stir}{\mathrm{Stirling}}
\newcommand*{\eptilde}{\tilde{\pmb{\epsilon}}}
% Inline reviewer note in red, prefixed "RS:" (left unstarred so the note may
% span paragraphs).
\newcommand{\rs}[1]{\textcolor{red}{[RS: #1]}}
% Single-letter aliases: \tmax prints w, \xmax prints R.
\newcommand*{\tmax}{w}
\newcommand*{\xmax}{R}
% Small-caps "Rel" symbol and its hatted version.
% \mathsc is not provided by the LaTeX kernel or by any package loaded in the
% visible part of this file, so using these macros would stop with
% "Undefined control sequence". \textsc works in text and (with amsmath) in
% math mode, and matches how \gap is defined.
\newcommand{\Rel}{\textsc{Rel}}
\newcommand{\Relhat}{\widehat{\textsc{Rel}}}
% Blackboard-bold expectation and real-number symbols.
\newcommand*{\E}{\mathbb{E}}
\newcommand*{\reals}{\mathbb{R}}
% Wide-hatted y and bold e.
\newcommand*{\pred}{\widehat{y}}
\newcommand*{\e}{\boldsymbol{e}}
% \newcommand{\mse}{\mathrm{mse}}
% \mse currently expands to \cE (defined outside this part of the file).
\newcommand*{\mse}{\cE}
\newcommand*{\kl}{D_{\mathrm{KL}}}
% Step the equation counter and tag the current line with the new number.
\newcommand*{\numberthis}{\addtocounter{equation}{1}\tag{\theequation}}
\newcommand*{\thetamletilde}{\tilde{\pmb{\theta}}_{\mathrm{mle}}}
% Typewriter method labels of the form "<prefix> + <#1>".
\newcommand{\erm}[1]{\texttt{ERM} + \texttt{{#1}}}
\newcommand{\mixmle}[1]{\texttt{MLE(ZNBG)} + \texttt{{#1}}}
\newcommand{\nbmle}[1]{\texttt{MLE(NB)} + \texttt{{#1}}}

% Inline author note in blue, prefixed "RS:".
\newcommand{\RS}[1]{{\color{blue}RS: {#1}}}
%%%%%%%%%%%%%%%%%%%%%%%%%%discard pages%%%%%%%%%%%%%%%%%%%%%

\usepackage{pdfpages}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\usepackage{algorithmic}
\usepackage[ruled,vlined]{algorithm2e}

\usepackage{parskip}
\usepackage{wrapfig}
\usepackage{subfig}


% Paper title. (The note about \icmltitle and a short running title that used
% to sit here came from a different template; only \title is set below.)
\title{Blackbox optimization of unimodal functions}

% Authors; the optional argument is the index of the matching \affil entry.
\author[1]{Ashok Cutkosky}
\author[2]{Abhimanyu Das}
\author[2]{Weihao Kong}
\author[3]{Chansoo Lee}
\author[2]{\href{mailto:<senrajat@google.com>?Subject=Your UAI 2023 paper}{Rajat Sen}{}}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% % Add affiliations after the authors
\affil[1]{%
    Boston University
}
\affil[2]{%
Google Research, Mountain View
}
\affil[3]{%
Google, Pittsburgh
}

\begin{document}
\maketitle


% It is OKAY to include author information, even for blind
% submissions: the style file will automatically remove it for you
% unless you've provided the [accepted] option to the icml2023
% package.

% List of affiliations: The first argument should be a (short)
% identifier you will use later to specify author affiliations
% Academic affiliations should list Department, University, City, Region, Country
% Industry affiliations should list Company, City, Region, Country

% You can specify symbols, otherwise they are numbered in order.
% Ideally, you should not use this facility. Affiliations will be numbered
% in order of appearance and this is the preferred way.
% \icmlsetsymbol{equal}{*}

% \begin{icmlauthorlist}
% \icmlauthor{Firstname1 Lastname1}{equal,yyy}
% \icmlauthor{Firstname2 Lastname2}{equal,yyy,comp}
% \icmlauthor{Firstname3 Lastname3}{comp}
% \icmlauthor{Firstname4 Lastname4}{sch}
% \icmlauthor{Firstname5 Lastname5}{yyy}
% \icmlauthor{Firstname6 Lastname6}{sch,yyy,comp}
% \icmlauthor{Firstname7 Lastname7}{comp}
% %\icmlauthor{}{sch}
% \icmlauthor{Firstname8 Lastname8}{sch}
% \icmlauthor{Firstname8 Lastname8}{yyy,comp}
% %\icmlauthor{}{sch}
% %\icmlauthor{}{sch}
% \end{icmlauthorlist}

% \icmlaffiliation{yyy}{Department of XXX, University of YYY, Location, Country}
% \icmlaffiliation{comp}{Company Name, Location, Country}
% \icmlaffiliation{sch}{School of ZZZ, Institute of WWW, Location, Country}

% \icmlcorrespondingauthor{Firstname1 Lastname1}{first1.last1@xxx.edu}
% \icmlcorrespondingauthor{Firstname2 Lastname2}{first2.last2@www.uk}

% % You may provide any keywords that you
% % find helpful for describing your paper; these are used to populate
% % the "keywords" metadata in the PDF but will not be shown in the document
% \icmlkeywords{Machine Learning, ICML}

% \vskip 0.3in


% this must go after the closing bracket ] following \twocolumn[ ...

% This command actually creates the footnote in the first column
% listing the affiliations and the copyright notice.
% The command takes one argument, which is text to display at the start of the footnote.
% The \icmlEqualContribution command is standard text for equal contribution.
% Remove it (just {}) if you do not need this facility.

%\printAffiliationsAndNotice{}  % leave blank if no need to mention equal contribution
% \printAffiliationsAndNotice{\icmlEqualContribution} % otherwise use the standard text.


\begin{abstract}
  We provide an intuitive new algorithm for \blackbox stochastic optimization of unimodal functions, a function class that we observe empirically can capture hyperparameter-tuning loss surfaces. Our method's convergence guarantee automatically adapts to Lipschitz constants and other problem difficulty parameters, recovering and extending prior results. We complement our theoretical development with experimental validation on hyperparameter tuning tasks.
\end{abstract}



\section{\Blackbox Optimization}\label{sec:intro}

This paper considers the problem of \emph{\blackbox stochastic optimization}. Specifically, we are interested in solving a maximization problem of the form:
\begin{align*}
    \max_{x}F(x) = \EE_{z}[f(x,z)]
\end{align*}
Here, $F$ is the objective function that takes the form $F(x) =\EE_{z}[f(x,z)]$, for some function $f$ and some random variable $z$.
% Unfortunately, we do not know the value of this expectation, or the shape of the function $f$ or the distribution of $Z$. 
We have access to a \blackbox evaluation oracle (sometimes called a \emph{stochastic zeroth-order oracle}), which returns a random sample $f(x,z)$  given any $x$. This is the only information we have about $f$; we do not know the function definition or its gradients.
% must to use only this capacity to find a $\hat x$ that maximizes $F$.

The ability to solve this problem forms a fundamental primitive in more complex systems. For example, it can be used to optimize design of new materials \citep{terayama2021black}, to improve user interfaces, or search for hyperparameters in machine learning \citep{golovin2017google, feurer2019hyperparameter, hazan2018hyperparameter}. Although the techniques we present in this work are very general, we focus on the task of optimizing hyperparameters as an important motivating application.

\Blackbox optimization has been studied from a variety of perspectives, which is reflected in the variety of names that refer to essentially similar concepts. Depending on the author, it may be referred to as ``zero-order'' or ``derivative-free'' optimization \citep{rios2013derivative, duchi2015optimal, jamieson2012query}, ``bandit'' optimization \citep{agarwal2011stochastic, agrawal1995continuum, kleinberg2008multi, auer2007improved, shamir2013complexity, jun2017scalable}, sequential experimental design \citep{chernoff1959sequential}, or Bayesian optimization \citep{shahriari2015taking, srinivas2009gaussian, snoek2012practical}. While the details of the methods used to solve the problem are diverse, the overall idea behind all methods is necessarily similar: first, assume some sort of ``structure'' on either the distribution of $z$ or the shape of $f$ (or both) that constrains the number of possibilities for $F$---popular examples include convexity, smoothness, or a Bayesian prior. Then, use new evaluations $f(x,z)$ to further constrain the possibilities until one can reliably identify a maximizing point.

An algorithm's usefulness can thus be measured on two axes: the degree that whatever structural assumptions it makes are in fact reflected in practice, and the number of samples required by the algorithm to optimize $F$ subject to the assumptions. The goal is to find assumptions expressive enough to capture real problems while still admitting efficient optimization. In this paper, we focus on the assumption of \emph{unimodality}. That is (in the 1D case), the function $F$ has only one local maximum. This assumption is motivated via empirical observation of hyperparameter-tuning problems. For example, in Figure~\ref{fig:cifar100LR} we plot the influence of the learning rate hyperparameter on the accuracy of an AlexNet type architecture trained on the CIFAR100 image classification task~\citep{krizhevsky2009learning}. It is evident that this relationship is at least approximately unimodal, and we describe further evidence for unimodality in Appendix~\ref{sec:unimodalevidence}. While our algorithms simply assume unimodality and so may behave poorly when this condition does not hold, we observe empirically in Section~\ref{sec:experiments} that in practice our approach works well.
\begin{figure}%{r}{0.5\textwidth}
    \includegraphics[width=0.45\textwidth]{cifar100_ll.png}
    \caption{Accuracy of CNN on the CIFAR100 validation set as we vary the learning rate from 1e-6 to 1e-1 on a uniform grid in the log scale. We repeat the experiment 6 times for each of the 1000 learning rate values and show the mean accuracy here, with confidence intervals.}
    \label{fig:cifar100LR}
\end{figure}

Although unimodality is much less well-studied in the literature than its popular and more stringent cousin convexity, we are not the first to consider unimodality. For example, \cite{yu2011unimodal,combes2014unimodal, combes2020unimodal} all provide algorithms for $F$ that satisfy particular further assumptions in addition to unimodality. We develop a new algorithm that provides theoretical improvements over these works in terms of robustness to these further assumptions: our method simultaneously enjoys provably good performance for these prior function classes as well as other natural classes.

We then conduct an empirical evaluation of our method on a benchmark hyperparameter tuning task involving tuning hyperparameters across a number of different sub-tasks \citep{arango2021hpob}. We compare our algorithm to the simple random search baseline as well as the more advanced Gaussian process upper-confidence bound algorithm, which is a standard approach to hyperparameter tuning. Our method not only obtains slightly improved average performance across the different sub-tasks, it also obtains significantly better performance on tasks with fewer than 6 hyperparameters to tune, forming a very promising initial study of our approach.





In summary, we make the following contributions:
\begin{itemize}
    \item In Section~\ref{sec:algorithm}, we propose a novel algorithm for black-box optimization of 1D unimodal functions, with theoretical analysis on a variety of function classes. We simultaneously match the convergence rate of \citet{kleinberg2008multi} for Lipschitz (Theorem~\ref{thm:1dlipschitz}) and smooth functions (Theorem~\ref{thm:1dsmooth}), and that of   \citet{combes2020unimodal} for functions that meet certain growth conditions (Theorem~\ref{thm:1dlipschitzandlower}).
    % when $F$ satisfies certain growth conditions (Section~\ref{sec:algorithm}).
    % In particular, for $F$ satisfying certain growth conditions, after $B$ oracle evaluations, we output a point $\hat x$ that satisfies $\EE[F(x_\star)-F(\hat x)]\le \tilde O(1/\sqrt{B})$. , while simultaneously matching known rates (e.g. \citep{kleinberg2008multi}) on more general Lipschitz or smooth $F$. 
    Our proposed algorithm does \emph{not} need prior knowledge of problem parameters such as Lipschitz or smoothness constants to achieve these results.
    \item In Section~\ref{sec:coordinate}, we extend our 1D algorithm to higher dimensions. This extension provably converges to a local maximum.
    \item In Section~\ref{sec:experiments}, we empirically show that our multi-dimensional algorithm is generally competitive with  the state-of-the-art Bayesian optimization methods on hyperparameter tuning tasks. For modest dimension counts (i.e. $d\approx 5$), our method significantly outperforms Bayesian optimization methods.
\end{itemize}








\section{Definitions and Setting}\label{sec:definitions}

In this paper we consider exclusively loss functions of the form $f:[0,1]^d\times \cZ \to [0,1]$, where $\cZ$ is some arbitrary set. Given a $\cZ$-valued random variable $Z$, we let $F:[0,1]^d\to [0,1]$ be given by $F(x) =\EE[f(x,Z)]$. Our algorithms have access to a stochastic value oracle: given a point $x$, we may generate a new i.i.d. sample $z$ and compute $f(x,z)$. We call such a process a ``query'' or a ``sample'' interchangeably. Our goal is to find a point $\hat x$ such that $F(\hat x)$ is as large as possible using at most $B$ samples for some given budget $B$.

A 1-D function is unimodal if it has a single local maximum.
We extend this notion to arbitrary dimensions by imposing the definition on each coordinate axis.
\begin{definition}[Unimodality]\label{def:unimodal}
A function $F:[0,1]^d\to [0,1]$ is unimodal if for all $(x_1,\dots,x_d)\in [0,1]^d$, and all $i\in\{1,\dots,d\}$, there exists an $x_i^\star\in [0,1]$ such that for all $a,b\in[0,1]$ with either $x_i^\star \ge a \ge b$ or $x_i^\star \le a \le b$, we have
\begin{align*}
    F(x_1,\dots,x_{i-1},a,x_{i+1},\dots,x_d)\ge F(x_1,\dots,x_{i-1},b,x_{i+1},\dots,x_d).
\end{align*}
\end{definition}

We also define a notion of \emph{local optimality} that applies to non-differentiable functions.
\begin{definition}\label{def:stationary}
A point $x=(x_1,\dots,x_d)\in [0,1]^d$ is a $\tau$-approximate local optimum point of $F:[0,1]^d\to [0,1]$ if for all $i\in\{1,\dots, d\}$, for all $x_i^\star\in [0,1]$,
\begin{align*}
    F(x_1,\dots,x_{i-1},x_i^\star,x_{i+1},\dots,x_d)\le F(x)+\tau.
\end{align*}
\end{definition}
In other words, an approximate local optimum point is a point at which only minimal progress can be made by changing only one coordinate. 



\section{Algorithm and Analysis for 1-D functions}\label{sec:algorithm}
We first build an algorithm for the special case of 1-D unimodal functions. We extend it in Section~\ref{sec:coordinate} to the high dimensional case by employing coordinate descent. For this section, let $F:[0,1]\to[0,1]$ and $x_\star=\argmax_{x\in[0,1]} F(x)$.

\subsection{Previous Work}

The \emph{golden section search} method \citep{kiefer1953sequential} is a minimax optimal algorithm for deterministic 1-D unimodal functions. The key observation is the following:

\emph{If $F(a)\ge F(b)$ for some $a,b\in[0,1]$, then $a\ge b$ implies $x_\star \ge b$, and $a\le b$ implies $x_\star \le b$.}

This suggests a natural strategy: maintain a \emph{candidate interval} $[l, r]$ such that $x_\star\in[l,r]$ and iteratively narrow it down. \cite{combes2020unimodal, yu2011unimodal} generalized the golden section search to stochastic settings. They first commit to a small number of possible values $y_1,\dots,y_K \in [l, r]$, such that each $y_i$ is far from the boundary of the current candidate interval $[l,r]$. Then, they repeatedly sample $f(y_i,z_t)$ until they can estimate $F(y_1),\dots, F(y_K)$ with high accuracy and identify $y_i$ and $y_j$ where $F(y_i)\ge F(y_j)$.
% This straightforward approach leads to theoretically optimal algorithms under some assumptions.

% \citep{combes2020unimodal}.
%  by observing random samples $f(x, z)$ until we identify $a,b\in[l,r]$ such that $F(a)\ge F(b)$. Prior methods that use this approach \citep{combes2020unimodal, yu2011unimodal} 

However, this approach is intuitively wasteful because it puts too many eggs in few baskets. For example, it fails to make progress when $F(y_i)$ is \emph{equal} for all $i$, and all the repeat samples of $f(y_i, z_t)$ are wasted. To address this issue, \cite{combes2020unimodal} imposes additional structural conditions on $F$ that prevent the derivative from being too close to zero. \textbf{Our approach does not require such conditions, but it automatically enjoys a faster convergence rate when the conditions are satisfied.}

\begin{algorithm}
\caption{One Elimination Round of Unimodal Optimization}
\label{alg:oneround}
\begin{algorithmic}[1]
\REQUIRE Confidence parameter $\delta$, interval $\Delta=[l,r]$, threshold $\tau\ge 0$, spacing $\gap$ such that $\frac{r-l}{\gap}$ is an integer, confidence scaling constant $h>0$ to be set by Lemma~\ref{lem:azuma}.
\STATE Define the points $l=x^{(1)}<x^{(2)}<\ldots<x^{(N)}=r$ for $N =1+ \frac{r-l}{\gap}$ equally spaced in the interval $\Delta$.
\STATE\label{alg:oneround:onesample} Compute $f(x^{(k)},z^{(k)})$ for i.i.d. $z^{(1)},\dots,z^{(N)}$.
\STATE Given any $i,j\in\{1,\dots,N\}$, define $m_{ij} = \sum_{k=i}^{j} f(x^{(k)}, z^{(k)})/ (j - i + 1)$, $s_{ij} = h\sqrt{\log (2N/ \delta)}/ {\sqrt{j - i+1}}$.
\STATE Define upper confidence bound $U(i, j) = m_{ij} + s_{ij}$ for each $i,j\in\{1,\dots,N\}$
\STATE Define lower confidence bound $L(i, j) = m_{ij} -  s_{ij}$ for each $i,j\in\{1,\dots,N\}$

\STATE\label{alg:oneround:leftelim} Let $S_l=\max \{x^{(i)}\ |\exists i\le j\le k\le l\ s.t.\ U(i,j)<L(k,l)-\tau\}\cup \{x^{(1)}\}$.
\STATE\label{alg:oneround:rightelim} Let $S_r=\min \{x^{(l)}\ |\exists i\le j\le k\le l\ s.t.\ U(k,l) < L(i,j)- \tau\}\cup\{x^{(N)}\}$.
\STATE Let $\Ibest = \argmax_{[x^{(i)},x^{(j)}]} L(i,j)$
\STATE Return new interval $[S_l, S_r] \subset \Delta$ and the best interval $\Ibest$.
\end{algorithmic}
\end{algorithm}

Our approach is inspired by prior work on continuum armed-bandits \citep{auer2007improved} that tries to identify an interval containing $x_\star$. We extend their technique to unimodality so that the intervals can be used for elimination.
% Furthermore, from a practical standpoint, the optimization trajectory that consists of a  repeated samples at a small number of points is not informative for human users.

\subsection{Algorithm}
Our algorithm samples a large number of points only \emph{once} (Line~\ref{alg:oneround:onesample} of Algorithm~\ref{alg:oneround}).
As a result, our algorithm cannot build good enough estimates of $F$ at a single point to identify \emph{a pair of points} $(a,b)$ with $F(a)\ge F(b)$. Instead, \textbf{our algorithm identifies a pair of intervals} $I_a=[a_1,a_2]$ and $I_b=[b_1,b_2]$ for which there is some (unknown) $a\in I_a$, $b\in I_b$ satisfying $F(a)\ge F(b)$. Thus, we can either eliminate all points less than $b_1$ if $b_2 < a_1$, or all points greater than $b_2$ if $a_2 < b_1$ (Lines~\ref{alg:oneround:leftelim}--\ref{alg:oneround:rightelim}).

Note that since $b \in [b_1, b_2]$, we do not eliminate as much as we would be able to if we had found the exact point $b$. That is, our algorithm operates with a potentially larger candidate interval than the existing algorithms at first, but casts a wider net for finding sub-intervals to eliminate. Intuitively speaking, our algorithm explores the geometry of the function $F$ more broadly.

% repeatedly sampling from a fixed set of points is constraining. Indeed, this approach causes the algorithm to fail in the case that $F(y_i)$ is \emph{equal} for all $i$. This is only ruled out by imposing additional structural assumptions on $F$ that intuitively prevent the derivative from being too close to zero. Moreover, this approach seems intuitively wasteful: rather than resampling a small number of points many times, we would like to sample any given point only \emph{once}, and use additional samples to further explore the geometry of the function $F$.
% This allows us to shrink the candidate interval $[l,r]$ by eliminating the region on the opposite side of $b$ from $a$. 

In order to find these intervals, we start by generating a grid of uniformly spaced points $x^{(1)},x^{(2)},\dots,x^{(N)}$ with $|x^{(i)}-x^{(i+1)}|=\gap$ for some grid-spacing parameter $\gap$. Then, we evaluate $f(x^{(i)},z^{(i)})$ for each $i\in\{1,\dots, N\}$ for independent samples $z^{(1)},\dots,z^{(N)}$. Now, for each interval $I=[x^{(i)},x^{(j)}]$, we can form the intersection $S_I=I\cap \{x^{(1)},\dots,x^{(N)}\}$ and estimate the average $F_I=\frac{1}{|S_I|}\sum_{x^{(k)}\in S_I}F(x^{(k)})$ by $\frac{1}{|S_I|}\sum_{x^{(k)}\in S_I}f(x^{(k)},z^{(k)})$. Moreover, for each $I$ we can also form \emph{confidence intervals} of width $O(\sqrt{\log(N^2/\delta)}/\sqrt{|S_I|})$ valid with probability $1-\delta/N^2$. Since there are $O(N^2)$ possible sets $S_I$, the confidence intervals are simultaneously valid with probability $1-\delta$. Finally, if any two intervals $I_a$ and $I_b$ are such that the confidence interval for $I_a$ lies entirely above that for $I_b$, then there must be $a\in I_a$ and $b\in I_b$ with $F(a)\ge F(b)$.

We formally specify this algorithm in Algorithm~\ref{alg:oneround} and provide analysis in Lemma~\ref{thm:oneround}. The algorithm includes a ``threshold'' parameter $\tau$, which is used for extensions to multiple dimensions in Section~\ref{sec:coordinate}. For the remainder of this section, we will assume $\tau=0$.

% Setting $\tau$ to be a small non-zero value will be useful when extending to multiple dimensions for technical reasons, but for 1-dimensional problems it is best to consider $\tau=0$, and we encourage the reader to do so for the remainder of this section.

\begin{restatable}{lemma}{thmoneround}\label{thm:oneround}
There is a universal constant $h$ that can be provided to Algorithm~\ref{alg:oneround} such that with probability at least $1-\delta$, if $x_\star\in\Delta$ and $[S_l, S_r]$ is the output of Algorithm~\ref{alg:oneround} with any $\gap$ and $\tau$, then $x_\star\in[S_l, S_r]$, and $\frac{1}{j-i+1}\sum_{k=i}^j F(x^{(k)})\in[L(i,j),U(i,j)]$ for all $i,j$.
\end{restatable}
% \begin{proof}
% By Lemma~\ref{lem:azuma}, we have that $\frac{1}{j-i+1}\sum_{k=i}^jF(x^{(k)})\in [L(i,j),U(i,j)]$ for all $i,j$ with probability $1-\delta$. The rest of the proof conditions on this this high probability even. 

% Now, we will show that $x_\star \ge S_l$. The argument that $x_\star \le S_r$ is completely symmetric. Thus, this will establish $x_\star \in[S_l, S_r]$.

% First, if $S_l=x^{(1)}=u$, then $x_\star \ge S_l$ by assumption since $x_\star \in \Delta$. Otherwise, there must exist $i\le j\le k\le l$ such that $S_l=x^{(i)}$ and $U(i,j)< L(k,l) - \tau$. Now, since 
% \begin{align*}
%     \frac{1}{j-i+1}\sum_{k=i}^jF(x^{(k)})&\le U(i,j)\le L(k.l)\\
%     \frac{1}{l-k+1}\sum_{v=i}^jF(x^{(v)})&\le L(k,l)
% \end{align*}
% we have that there must exist some $b\in \{x^{(i)},\dots,x^{(j)}\}$ and $a\in\{x^{(k)},\dots,x^{(l)}\}$ such that $a\ge b$ and $F(a)\ge F(b)$. Thus, since $F$ is unimodal we must have $x_\star \ge b\ge x^{(i)}=S_l$ as desired.
% \end{proof}

Now, our full 1-D algorithm combines this interval shrinking routine with a doubling argument: we repeatedly call Algorithm~\ref{alg:oneround} with $\Delta$ set to the previously returned $[S_l,S_r]$ and $\gap$ set to the previous $\gap$ divided by two. Intuitively, if Algorithm~\ref{alg:oneround} does \emph{not} significantly shrink the interval $\Delta$, then this approach re-runs the algorithm with double the number of samples. Otherwise, if $\Delta$ shrinks by a constant factor, we are making sure to still take a reasonable number of samples in this new smaller interval. 

The final ingredient in our algorithm is how to select the final output $\hat x$. One plausible approach would be to pick a random element of the interval $\Delta$. However, we opt for a more refined method: we pick a random element of the interval that has the largest lower confidence bound. This enables us to make claims about the quality of our final output even in the case that the algorithm never eliminates any intervals at all (i.e. $\Delta=[0,1]$ always).
The formal description is provided in Algorithm~\ref{alg:1d}.


\begin{algorithm}
\caption{1D Unimodal Optimization}
\label{alg:1d}
\begin{algorithmic}[1]
\REQUIRE Confidence parameter $\delta$, threshold $\tau\ge 0$, budget $B$.
\STATE Initialize $t=1$, $\Delta_1=[0,1]$, $\gap_1=0.5$, $N=N_1=3$.
\WHILE{$N\le B$}
\STATE Call Algorithm~\ref{alg:oneround} with input $6\delta/(\pi^2 t^2), \Delta_t, \tau, \gap_t$ to obtain outputs $[S_l,S_r]$, $\Ibest$.
\STATE Let $\Delta_{t+1}=[S_l,S_r]$.
\STATE Set $\gap_{t+1}=\gap_t/2$.
\STATE Set $N_{t+1}=1+\frac{S_r-S_l}{\gap_{t+1}}$ //Budget to be consumed by next iteration.
\STATE Set $N=N+N_{t+1}$. //Total budget consumed at end of next iteration.
\STATE Set $t=t+1$.
\ENDWHILE
\STATE Return a random element of $\Ibest$.
\end{algorithmic}
\end{algorithm}

\subsection{Convergence Analysis}

In order to prove convergence rates, we will need to make some additional assumptions about the shape of $F$ beyond just unimodality. To see this, consider the function $F(0.7)=1$ and $F(x)=0$ for $x\ne 0.7$. Such an $F$ is certainly unimodal, and yet clearly no algorithm can obtain any non-trivial bound on $F(x_\star)-F(\hat x)$. As our first assumption, we consider Lipschitz functions.
\begin{definition}[Lipschitz]\label{assum:lipschitz}
$F$ is $L$-Lipschitz (i.e., $|F(x)-F(y)|\le L|x-y|$ for all $x,y\in[0,1]$).
\end{definition}
\begin{restatable}{theorem}{thmonedlipschitz}\label{thm:1dlipschitz}
Suppose $F$ is $L$-Lipschitz. Let $\hat x$ be the output of Algorithm~\ref{alg:1d} with total sample budget $B$ and input failure probability $\delta$. Then, for any $\delta<1/2$, there is a constant $C$ and an event $E$ that occurs with probability at least $1-\delta$ such that:
\begin{align*}
    \EE[F(\hat x)|E]&\ge F(x_\star) - 3(LC^2\gap_T \log(B/\delta))^{1/3}\\
    &\ge F(x_\star)-\frac{3(12 L C^2 \log(B/\delta))^{1/3}}{B^{1/3}}
\end{align*}
where $C$ is an absolute constant.
\end{restatable}
This result shows that Algorithm~\ref{alg:1d} converges at an $O(1/B^{1/3})$ rate, or equivalently that $O(1/\epsilon^{3})$ samples suffice to find an $\epsilon$-suboptimal point for a Lipschitz objective $F$, matching classical rates for this setting~\citep{kleinberg2008multi}. The proof proceeds by observing that since Algorithm~\ref{alg:1d} never discards $x_\star$, eventually it will explore a fine enough grid of points that there will be an interval whose lower-confidence-bound is within $O(1/B^{1/3})$ of $F(x_\star)$, so that picking a random $\hat x$ from the interval with largest lower-confidence-bound is guaranteed to be $O(1/B^{1/3})$ suboptimal.


Next, we consider the case of \emph{smooth} rather than Lipschitz $F$. For smooth $F$, we can improve upon the Lipschitz analysis. The key idea is that the average value of $F$ over an interval of width $W$ is within $O(W^2)$ of the value of $F$ at the midpoint of the interval. In contrast, if we assume only Lipschitz $F$, the average value may be $O(W)$ away from any given point in the interval. This improvement translates into a suboptimality of $O(1/B^{2/5})$ rather than $O(1/B^{1/3})$, as described formally in Theorem~\ref{thm:1dsmooth}.

\begin{definition}\label{assum:smooth}
A function $F$ is $\beta$-smooth if $F(x)$ is differentiable and $F'(x)$ is $\beta$-Lipschitz.
\end{definition}
\begin{restatable}{theorem}{thmonedsmooth}\label{thm:1dsmooth}
Suppose $F$ is $\beta$-smooth and that $F'(x_\star)=0$ (i.e. $x_\star$ is not on the boundary). Let $\hat x$ be the output of Algorithm~\ref{alg:1d} with total sample budget $B$ and input failure probability $\delta$. Then, there is a constant $C$ such that for any $\delta<1/2$ there is an event $E$ that occurs with  probability at least $1-\delta$ such that:
\begin{align*}
    \EE[F(\hat x)|E]\ge F(x_\star) - \frac{5(\beta C^4 \log(B/\delta)^2)^{1/5}}{B^{2/5}}
\end{align*}
\end{restatable}

Theorem~\ref{thm:1dlipschitz} and \ref{thm:1dsmooth} establish baseline convergence rates for Lipschitz or smooth $F$. They have the desirable property that the algorithm need not use any knowledge of the Lipschitz or smoothness parameters $L$ and $\beta$, so that the bounds hold with the tightest possible (unknown) values. On the other hand, the result is somewhat naive: we did not use any of the ``elimination'' power of the algorithm, and in fact the same result would hold if we simply applied one single round of Algorithm~\ref{alg:1d} and did not even require unimodality. 

The power of our algorithm is that it automatically enjoys a better convergence rate if the function meets certain conditions, without any changes to the input or prior knowledge. The first such condition we explore is the following: 
\begin{assumption}[$(\gamma, M)$-Lipschitz lower bound]\label{assum:lipschitzlowerbound}
There is some $\gamma$ and $M$ such that for all $x,y$ with $|x-x_\star|\ge \gamma$ and $|y-x_\star|\ge \gamma$, and $\sign(x-x_\star)=\sign(y-x_\star)$, we have $|F(x)-F(y)|\ge M|x-y|$ (i.e. if $F$ is differentiable, then $|F'(x)|\ge M$ whenever $|x-x_\star|\ge \gamma$).
\end{assumption}
Intuitively, these functions have a ``difficult'' interval of size $2\gamma$ around the optimum $x_\star$. The lower bound on the slope of $F$ outside this region allows our algorithm to rapidly eliminate subintervals and converge to a small interval near the optimum. We can improve upon Theorem~\ref{thm:1dlipschitz}, and establish that the asymptotic error is $\tilde O((\gamma/B)^{1/3}+1/\sqrt{B})$. Notice that the algorithm at no point requires knowledge of the parameter $\gamma$.

\begin{restatable}{theorem}{thmonedlipschitzlower}\label{thm:1dlipschitzandlower}
Suppose $F$ is $L$-Lipschitz and satisfies Assumption~\ref{assum:lipschitzlowerbound}. Let $\hat x$ be the output of Algorithm~\ref{alg:1d} with total sample budget $B>78$ and input failure probability $\delta$. Then, there is a constant $C$ such that for any $\delta<1/2$, there is an event $E$ that occurs with  probability at least $1-\delta$ such that:
\begin{align*}
    \EE[F(\hat x)|E]&\ge F(x_\star)  - 3(L\gap_T C^2 \log(B/\delta))^{1/3}\\
    &\ge F(x_\star) -\tilde O\left( \frac{L\gamma^{1/3}}{B^{1/3}} + \frac{1}{\sqrt{B}M^{1/3}}\right)
    % -3\max\left(\frac{(LC^2\log(B/\delta))^{1/3}}{2^{B/351}},\ \frac{(54LC^2 \log(B/\delta)\left(\gamma+\frac{\tau}{M}\right))^{1/3}}{B^{1/3}},\ \frac{ C^{1/3}\sqrt{\log(B/\delta)}27^{1/3}2^{5/3}}{\sqrt{B} M^{1/3}\sqrt{2^{2/3}-1}}\right)
\end{align*}
where $C$ is an absolute constant.
\end{restatable}

Finally, we consider the class of functions satisfying Assumption~\ref{assum:combes}, which is a strict superset of the function class studied by \cite{combes2020unimodal} (denoted $\mathcal{U}_{[0,1]}$ in their paper). We show that our Algorithm~\ref{alg:1d} recovers the same $\tilde O(1/\sqrt{B})$ convergence rate in this setting, again without requiring any knowledge of problem parameters.

\begin{assumption}\label{assum:combes}
There exists $A_1,A_2$ and $z$ such that for all $x$, $A_2|x-x_\star|^z\ge F(x_\star) - F(x)\ge A_1|x-x_\star|^z$. Such functions need be neither Lipschitz nor smooth.
\end{assumption}

\begin{restatable}{theorem}{thmcombes}\label{thm:combes}
Suppose $F$ satisfies Assumption~\ref{assum:combes}. Then, there is a constant $C$ such that for all $\delta<1/2$, for any $B$ satisfying $B\ge \frac{24 A_2^{1/z}}{C^{1/z} \log(B/\delta)^{1/2z}}$, with probability at least $1-\delta$ Algorithm~\ref{alg:1d} with $\tau=0$ guarantees:
\begin{align*}
    F(\hat x) &\ge F(x_\star) \\
    &-\max\left(12^z\frac{C^{\frac{2z}{2z+1}} A_2^{\frac{2z+2}{2z+1}} \log(B/\delta)^{\frac{z}{2z+1}}}{A_1 2^{\frac{zB}{24z+12}}}, \right.\\
    &\qquad\qquad \left. 
    \frac{12^z  2^{\frac{z}{2z+1}}\sqrt{72}}{\sqrt{2^{\frac{2z}{2z+1}}-1}}\frac{C A_2^{1+\frac{1}{2z}}\sqrt{\log(B/\delta)}}{A_1^{1+\frac{1}{2z}}\sqrt{B}}\right)\\
    &\ge F(x_\star) - \tilde O(1/\sqrt{B})
\end{align*}
\end{restatable}

It should be noted that the dependencies of Theorem~\ref{thm:combes} on the parameter $z$ are slightly worse than in \cite{combes2020unimodal}. However, this is mitigated by the fact that our function class is actually somewhat larger (\cite{combes2020unimodal} consider a stronger version of Assumption~\ref{assum:combes} that controls local behavior of $F$ even far from $x_\star$). Further, due to the greater range of exploration of our algorithm, we are able to handle more general Lipschitz or smooth losses as described previously.

We would also like to note that our algorithm would be better than an algorithm which provides a third of the budget to CAB1~\citep{kleinberg2004nearly}, a third to~\citep{combes2020unimodal} and picks the best between the two returned answers by testing on the remaining trials. In order to see this, let us look at the following informal function class.

\begin{assumption}[Informal]
\label{assum:informal}
Consider the class of unimodal functions $F$ that satisfy Assumption~\ref{assum:combes} only on a sub-interval $R \subseteq [0, 1]$ containing the optimum; outside of this sub-interval we make no further assumptions other than unimodality.
\end{assumption} 

Under Assumption~\ref{assum:informal}, our algorithm will eliminate everything outside of $R$ in $\mathrm{poly}(1/|R|)$ time, using essentially a discretization style analysis. However, once this elimination occurs, the algorithm will automatically switch to the faster $O(1/\sqrt{B})$ rate. Note that for any $|R|>0$, CAB1 will not achieve a $O(1/\sqrt{B})$ asymptotic rate (due to its discretization it cannot hope for better than $O(1/B^{1/3})$), while for $|R| < 1/4$, the algorithm in~\citep{combes2020unimodal} may not converge. Thus, this is a class of functions for which we outperform the minimum of the two baselines. Moreover, this class of functions that are ``well-behaved near the optimum'' is reasonably natural: indeed our Figure~\ref{fig:cifar100LR} seems to exhibit such behavior.

\section{Multidimensional Problems via Coordinate Ascent}\label{sec:coordinate}

In this section, we describe a simple approach to generalize our unimodal optimization algorithm to higher dimensions. While in the previous section the threshold parameter $\tau$ was essentially a nuisance factor and could be safely set to zero, here we will need the threshold to overcome a technical difficulty: we would like to ensure that any time a point $x$ is eliminated by Algorithm~\ref{alg:1d}, it is possible to identify a $x^+$ for which $F(x_\star)-F(x^+)$ is significantly smaller than $F(x_\star)-F(x)$. We conjecture that $\tau$ is not actually required for this task, but this is left as an open question.

We consider functions $F:[0,1]^d\to[0,1]$ such that for every point $x\in[0,1]^d$, the $d$ 1-D functions given by restricting $F$ to each coordinate axis through $x$ are all unimodal. This assumption suggests a natural iterative strategy: we will initialize our algorithm at some $w_1\in[0,1]^d$. Then, we run a copy of our 1-D algorithm on each coordinate independently, choosing samples along each coordinate in a round-robin fashion. If one of the copies is able to successfully eliminate the original point $w_1$, then we let the output of this copy be the next iterate $w_2$ and repeat the process.

The analysis of this method proceeds by showing that (1) $F(w_{t+1})\ge F(w_t)+\tau$ for all $t$, and that (2) the number of samples required to identify $w_{t+1}$ is not too large, so long as an appropriate $w_{t+1}$ exists. Combined, this means that after at most roughly $1/\tau$ iterations of this procedure, we will converge to a $\tau$-approximate local optimum (Definition~\ref{def:stationary}).
\begin{algorithm}
\caption{Unimodal Coordinate Ascent}
\label{algo:coord}
\begin{algorithmic}[1]
\STATE $t=1$
\STATE Choose an arbitrary $w^t=(w^t_{1},\dots,w^t_{d})\in[0,1]^d$
\WHILE{$N\le B-d$}
\STATE Given a starting point $w^t\in[0,1]^d$, initialize $d$ copies of Algorithm~\ref{alg:1d} with $\delta=6/dt^2\pi^2$, budget set to $\infty$ and threshold $\tau$ where the $i$th copy considers the function $F_i:[0,1]\to [0,1]$ $F_i(x) = F(w^t_{1},\dots,w^t_{i-1},x,w^t_{i+1},\dots,w^t_{d})$.\\ Let $\Delta^t_{i}$ be the active interval associated with the $i$th copy.\\
Let $I^t_{\text{best}, i}$ be the current value of $\Ibest$ maintained by the $i$th copy.
\WHILE{$w^t_{i}\in \Delta^t_{i}$ for all $i$}
\IF{$N> B-d$}
\STATE Return $w^t$.
\ENDIF
\STATE Let each copy sample one additional point.
\STATE $N=N+d$.
\ENDWHILE
\STATE Suppose the $j$th copy eliminated $w^t_{j}$ ($w^t_{j}\notin \Delta^t_{j}$).
\STATE Let $w^{t+1}_{j}$ be a randomly selected point in $I^t_{\text{best}, j}$.
\STATE Set $w^{t+1}=(w^t_{1},\dots,w^t_{j-1},w^{t+1}_{j},w^t_{j+1},\dots)$.
\STATE $t\gets t+1$.
\ENDWHILE
\STATE Return $w^t$.
\end{algorithmic}
\end{algorithm}

We provide an analysis of this algorithm for the case of Lipschitz $F$ in Theorem~\ref{thm:coord}. It follows from an alternative analysis of Algorithm~\ref{alg:1d}, which tells us that Algorithm~\ref{alg:1d} will quickly eliminate any non-local minimum point. For details, please see Lemma~\ref{thm:progress} in the appendix. Informally, this result tells us that with a budget of $B$ samples, we will find an $\tilde O(d^{1/4}/B^{1/4})$-approximate local minimum for a $d$-dimensional Lipschitz unimodal function. 

\begin{restatable}{theorem}{thmcoord}\label{thm:coord}
There is a constant $C$ such that, given a budget of $B$ and failure probability $\delta<1/2$, if we set $\tau= \max\left(\frac{2d\log(B/\delta)}{B},\ \frac{37\cdot (dC^2L \log(B/\delta)^2)^{1/4}}{B^{1/4}}\right)$, then with probability at least $1-2\delta$, Algorithm~\ref{algo:coord} returns a point $\hat w$ that is a $3\tau$-approximate local minimum. That is, for each coordinate $i$, for any $w^\star_i$ that differs from $\hat w$ only in the $i$th coordinate, we have $F(w^\star_i)\le F(\hat w)+3\tau$.
\end{restatable}




% Lemma~\ref{thm:progress} itself is a fairly simple refinement of our bounds for Lipschitz $F$. 
% One could instead consider functions that (along coordinate axes) satisfy other restrictions such as smoothness, or Assumptions~\ref{assum:lipschitzlowerbound} or \ref{assum:combes} to derive analogous high dimensional convergence bounds, but we opt for this simplest assumption.



\section{Experiments}\label{sec:experiments}
\begin{figure*}
  \centering
  \subfloat[Regret across all tasks]{\label{fig:whole}\includegraphics[scale=0.5]{all_bench.png}}\hfill
  \subfloat[Cumulative runtime]{\label{fig:runtime}\includegraphics[scale=0.5]{runtime.png}}\\
  \subfloat[Regret vs Dimension]{\label{fig:dim}\includegraphics[scale=0.5]{all_dims.png}}\hfill
  \subfloat[Regret on lower dimensional tasks]{\label{fig:lowdim}\includegraphics[scale=0.5]{less6_regret_33.png}}
 \caption{In (a), we plot the normalized regret as a function of number of trials (observations) averaged over 86 benchmark tasks. Each task is repeated 5 times and we plot the corresponding standard errors. It can be seen that our algorithm (Unimodal Ascent) outperforms the baselines over the majority of the x-axis. In (b), we compare the run-times of our algorithm and BoTorch GP-UCB as a function of number of trials. In (c), we plot the average regret of our algorithm and BoTorch-GP as a function of problem dimension. It can be seen that we outperform the GP on all dimensions except 8 and 18. In (d), we plot the normalized regret as a function of number of trials for problems with dimensions $\leq6$.}
 \label{fig:expapp}
\end{figure*}

We tested our algorithm on machine learning hyperparameter optimization (HPO) problems. We used the continuous variant of HPO-B benchmark by \cite[Sec~5.4]{arango2021hpob}. The blackbox functions being optimized are the (approximated) validation accuracy of models for different hyper-parameter settings. The models vary across several tasks like training SVMs, GLMNets etc.\ on various datasets. The approximation is done through XGBoost surrogate functions. We used the ``HPO-B-v3 Meta-test split'' as detailed in \cite{arango2021hpob}. It consists of 86 tasks in 16 distinct search spaces. We followed the exact test protocols as defined in their open source package.

{\bf Benchmark algorithms.} The state of the art for HPO often involves using Gaussian Process (GP) regressors to derive surrogate functions and optimize them. We used the GP-UCB implementation included in the HPO-B open source package. It uses the GP implementation in BoTorch~\citep{balandat2019botorch} with UCB coefficient of 0.1 (this method was shown to have strong performance on the benchmarks~\citep{arango2021hpob}). We also use random search over the domain as another baseline. In order to be fair to all the algorithms, we initialize each of them at a random point for any of the benchmark tasks. All algorithms are run five times for every task.

% (chansoo): I think comparing it to Vizier makes sense, whether we include it in the plot or not. 

% GP-based algorithms involve a lot of hyperparameters, which can greatly change the performance. In order to verify that the HPO-B package's  GP-UCB implementation is competitive, we also tested Google Cloud's Vertex AI Vizier, which is another variation of GP-UCB \cite{golovin2017google}. Although the details of Vizier GP-UCB are not publicly available, we found that Vizier and BoTorch achieve comparable convergence rates. We also tried the out-of-box GP-UCB implementation of emukit library, which was vastly outperformed by the other GP-UCB implementations. We included this result in the appendix. 


% \begin{figure*}%[htpb!]
%   \centering
%   \subfloat[Regret vs Dimension]{\label{fig:dim}\includegraphics[scale=0.5]{figs/all_dims.png}}\hfill
%   \subfloat[Regret on lower dimensional tasks]{\label{fig:lowdim}\includegraphics[scale=0.5]{figs/less6_regret_33.png}}
%  \caption{ In (a), we plot the average regret of our algorithm and BoTorch-GP as a function of problem dimension. It can be seen that we outperform the GP on all dimensions except 8 and 18. In (b), we plot the normalized regret as a function of number of trials for problems with dimensions less than equal to 6.}
%  \label{fig:expapp}
% \end{figure*}

\begin{algorithm}
\caption{Modified Unimodal Coordinate Ascent}
\label{algo:mod}
\begin{algorithmic}[1]
\STATE Set $N, t=10$. Choose 10 random points in $[0,1]^d$ and set $w^t$ to be the one with the best observed value.
\WHILE{$N\le B-d$}
\STATE Given a starting point $w^t\in[0,1]^d$, initialize $d$ copies of Algorithm~\ref{alg:1d_v2} with $\delta=6/dt^2\pi^2$, budget set to $\infty$ where the $i$th copy considers the function $F_i:[0,1]\to [0,1]$ $F_i(x) = F(w^t_{1},\dots,w^t_{i-1},x,w^t_{i+1},\dots,w^t_{d})$.\\ Let $\Delta^t_{i}$ be the active interval associated with the $i$th copy.\\
Let $I^t_{\text{best}, i}$ be the current value of $\Ibest$ maintained by the $i$th copy.
\WHILE{$w^t_{i}\in \Delta^t_{i}$ for all $i$}
\IF{$N> B-d$}
\STATE Return $w^t$.
\ENDIF
\STATE Let $i$ be the dimension sampled from distribution proportional to $\{\exp(s_1), \cdots, \exp(s_d)\}.$ Here $s_i$ is the observed standard deviation of the values for dimension $i$ so far.
\STATE Run one more epoch of Algorithm~\ref{alg:1d_v2} for dimension $i$.
\STATE $N=N+ \text{$<$\#new points sampled$>$}$.
\ENDWHILE
\STATE Suppose the $j$-th copy is one such copy that eliminated $w^t_{j}$ ($w^t_{j}\notin \Delta^t_{j}$) and $|\Delta^t_{j}|$ is smallest among all eliminating dimensions.
\STATE Let $w^{t+1}_{j}$ be the best point selected in $I^t_{\text{best}, j}$, in terms of observed value.
\STATE Set $w^{t+1}=(w^t_{1},\dots,w^t_{j-1},w^{t+1}_{j},w^t_{j+1},\dots)$ and begin a new round (reuse points if available).
\ENDWHILE
\STATE Return $w^t$.
\end{algorithmic}
\end{algorithm}

{\bf Our implementation.} We make some practical modifications to Algorithm~\ref{algo:coord} for the real world benchmarks, both to improve runtime and also to acknowledge that the true black-box functions might not exactly adhere to our assumptions. First, instead of comparing every possible pair of disjoint intervals $(I_1,I_2)$, we compare only pairs where $|I_1|=|I_2|$ and both are powers of 2. This significantly reduces the number of pairs to compare, and an inspection of our proof techniques shows that it will only harm constants in the bounds. Further, we never sample the same point twice, but instead \emph{re-use} the previous sample every time it is requested in Algorithm~\ref{alg:oneround}. This again only harms constants in the bounds. Next, in all tasks, we start our algorithm with $10$ random points in the domain, and then choose the point with the best observed value as the starting point for the unimodal coordinate ascent. In practice this leads to a good initialization of our algorithm. Note that we count those ten points as trials expended by our algorithm so that it is a fair comparison with the benchmarks. In Algorithm~\ref{algo:coord}, if at any point there are multiple coordinates that can eliminate the current point (line 12), we choose the coordinate along which the maximum area has been eliminated. Finally, instead of letting each coordinate sample a point in line 9, we choose the coordinate that can sample the next point from a distribution that up-weights coordinates that have shown more variation in past trials. We also set the threshold parameter $\tau$ to zero. In the interest of space, we provide the pseudo-code of this modified algorithm as Algorithm~\ref{algo:mod} (more details in Appendix~\ref{sec:moreexp}).

{\bf Regret on the full benchmark.} In Figure~\ref{fig:whole}, we plot the \textit{normalized regret} as a function of number of trials/ observations. We adopt the definition of normalized regret from~\citep{arango2021hpob} i.e given the  points $\{x_1, x_2, \cdots, x_t\}$ chosen by the algorithm till time $t$, the normalized regret is 
\[r(t) = \frac{F(x_*) - \max_{s \in [t]}F(x_s)}{F(x_*) - F(x_{min})}\] where $x_{min}$ is the point with the lowest objective value. It can be observed that our simple unimodal ascent algorithm outperforms a state of the art GP averaged over all the 86 real world benchmark HPO tasks. This is quite remarkable since the benchmark tasks might not strictly adhere to our assumptions and the GP has been carefully tuned by~\citet{arango2021hpob} for good performance on the HPO-B tasks. We also provide a comparison w.r.t a version of the Zooming Algorithm~\citep{kleinberg2008multi} in Appendix~\ref{sec:zooming}.

{\bf Runtime.} Figure~\ref{fig:runtime} shows the cumulative runtime of our algorithm against the BoTorch GP as a function of number of observations. The logarithmic scale in the y-axis shows that our algorithm is orders of magnitude faster than the GP, owing to its simplicity. All our experiments were performed on Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz machines with NVIDIA Tesla P100 GPU.

Indeed, it is well-known that generating the $N$-th suggestion from the GP takes $O(A(DN^2 + N^{3}) + B(DN+N^2))$ time, where $D$ is search space dimension, $A$ is the number of kernel hyper-parameter optimization (also known as \emph{ARD}) iterations, and $B$ is the number of acquisition function optimization iterations~\citep{garnett_bayesoptbook_2022}.

In comparison, generating the $N$-th suggestion from Algorithm~\ref{algo:mod} takes $O(N^2)$ time, up to logarithmic factors. This can be a significant advantage when tuning a lightweight model or optimizing with a large budget on the number of evaluations.



{\bf Dimension of search-space.} Now we dive deeper into the behavior of the algorithms in search-spaces of varying dimensions. The HPO-B benchmarks contain search spaces of dimensions ranging from 2 to 18. We expect our algorithm to work significantly better than the GP for lower dimensional problems in the presence of approximately unimodal structure. This phenomenon can be observed in Figure~\ref{fig:dim} where we plot the average normalized regret achieved at the end of 100 trials, broken down by search-space dimension. Our gains are much more significant for lower dimensions. The BoTorch GP is only better than us for problems with dimension 18 and comparable to us on problems with dimension 8. 

In Figure~\ref{fig:lowdim}, we plot regret as a function of number of trials, for tasks with dimension less than or equal to 6 (more than 38$\%$ of the search-spaces). This  shows that we are significantly better than the GP for these tasks. In conclusion our algorithm is simpler and faster than the state of the art BoTorch GP, while being more performant on average on the HPO-B benchmark. 

% Do we want to mention the fact that we have fewer hyperparameters to tune and they are more intuitive?



\section{Conclusion}\label{sec:conclusion}

We describe a new algorithm for black-box optimization of unimodal functions. Our algorithm is based upon the intuitive idea that one can estimate the objective value at any given point by sampling several nearby points, rather than sampling the same point many times. This allows us to uniformly cover the input space and gain more information about the shape of the function while still employing a simple elimination test to remove suboptimal regions of the input quickly. We demonstrate theoretically that this method matches prior work, and empirically we are able to outperform more advanced methods based upon Gaussian processes. We conjecture that this capability arises from the unimodality assumption providing an informative prior about the overall shape of the objective (even if it is not strictly true in all practical settings).

Our work suggests at least two natural avenues for future investigation. Perhaps the most pressing is improving the method of extension to high dimensional problems. We adopt a relatively simple coordinate ascent strategy, while one might hope for a more refined algorithm that views all dimensions at once. Nevertheless, even this naive approach performs surprisingly well in our experimental study. Next, although our theoretical development matches prior work in the 1D setting, there are classes of functions missing from our analysis, notably the \emph{convex} $F$, which are of course also unimodal. Thus, it would be valuable to improve the analysis or algorithm to achieve the optimal $O(1/\sqrt{B})$ convergence rates for convex functions.
%Finally, Our current experimental evaluation is limited to low to moderate dimensional tasks which are typical in hyper-parameter tuning. In these cases we achieve strong results, and so we hope that the trend would persist even in higher dimensional problems.

\bibliography{refs}


\newpage

\onecolumn
\appendix

\section{Proofs for Section~\ref{sec:algorithm}}

\subsection{Proof of Theorem~\ref{thm:oneround}}
In this section we provide the missing proofs for the analysis of the 1-D unimodal optimization algorithm. Results that appear in the main text are restated in this section for reference.

We begin with a technical observation that allows us to show Theorem~\ref{thm:oneround}:
\begin{restatable}{lemma}{lemazuma}\label{lem:azuma}
Using the notation of Algorithm~\ref{alg:oneround}, there exists a universal constant $h$ such that with probability at least $1-\delta$, for all $i,j\in \{1,\dots, N\}$,
\begin{align*}
    \frac{1}{j-i+1}\sum_{k=i}^j F(x^{(k)}) &\le \frac{1}{j-i+1}\sum_{k=i}^j f(x^{(k)},z^{(k)}) + h\sqrt{\log (N/ \delta)}/ {\sqrt{j - i+1}}\\
    \frac{1}{j-i+1}\sum_{k=i}^j F(x^{(k)}) &\ge \frac{1}{j-i+1}\sum_{k=i}^j f(x^{(k)},z^{(k)}) - h\sqrt{\log (N/ \delta)}/ {\sqrt{j - i+1}}
\end{align*}
\end{restatable}
\begin{proof}
Notice that the sequence $f(x^{(k)},z^{(k)})-F(x^{(k)})$ forms a bounded martingale difference. The Lemma then follows directly from Azuma-Hoeffding inequality followed by union bound over the $\frac{N(N-1)}{2}$ possible choices for $i,j$.
\end{proof}

With this, we can prove:
\thmoneround*
\begin{proof}
By Lemma~\ref{lem:azuma}, we have that $\frac{1}{j-i+1}\sum_{k=i}^jF(x^{(k)})\in [L(i,j),U(i,j)]$ for all $i,j$ with probability $1-\delta$. The rest of the proof conditions on this high-probability event.

Now, we will show that $x_\star \ge S_l$. The argument that $x_\star \le S_r$ is completely symmetric. Thus, this will establish $x_\star \in[S_l, S_r]$.

First, if $S_l=x^{(1)}=u$, then $x_\star \ge S_l$ by assumption since $x_\star \in \Delta$. Otherwise, there must exist $i\le j\le k\le l$ such that $S_l=x^{(i)}$ and $U(i,j)< L(k,l) - \tau$. Now, since 
\begin{align*}
    \frac{1}{j-i+1}\sum_{v=i}^jF(x^{(v)})&\le U(i,j)< L(k,l)\\
    \frac{1}{l-k+1}\sum_{v=k}^lF(x^{(v)})&\ge L(k,l)
\end{align*}
we have that there must exist some $b\in \{x^{(i)},\dots,x^{(j)}\}$ and $a\in\{x^{(k)},\dots,x^{(l)}\}$ such that $a\ge b$ and $F(a)\ge F(b)$. Thus, since $F$ is unimodal we must have $x_\star \ge b\ge x^{(i)}=S_l$ as desired.
\end{proof}


Next, we prove Lemma~\ref{lem:onedconsistent}:
% This Lemma shows that Algorithm~\ref{alg:1d} does not behave too ``badly'': it consumes the budget and never discards the optimal point $x_\star$. Next, we will show concrete convergence guarantees. 
\begin{restatable}{lemma}{lemonedconsistent}\label{lem:onedconsistent}
For all $t$, Algorithm~\ref{alg:1d} guarantees:
\begin{enumerate}
    \item Let $x^{(1,t)},\dots,x^{(N_t, t)}$ be the input points sampled by the $t$th call to Algorithm~\ref{alg:oneround} and let $L^t(i,j)$ and $U^t(i,j)$ be the $L(i,j)$ and $U(i,j)$ values computed using the samples. Then with probability at least $1-\delta$ we have $x_\star \in \Delta_t$ for all $t$ and $\frac{1}{j-i+1}\sum_{k=i}^j F(x^{(k,t)})\in[L^t(i,j),U^t(i,j)]$ for all $i,j,t$.
    \item There is a constant $C$ such that, for any $\delta<1/2$, the confidence widths $s_{ij}$ computed in each round of Algorithm~\ref{alg:1d} satisfy $s_{ij}\le C\sqrt{\log(B/\delta)}/\sqrt{j-i+1}$ for all $i,j$ and all rounds.
    \item $|\Delta_t|/\epsilon_t\in \NN$ for all $t$ (with probability 1).
    \item At least $B/3$ budget is consumed (with probability 1).
    \item The value of $\gap_t$ after consuming $N$ samples is at most $4/N$ (with probability 1). In particular, the final value of $\gap_t$ is at most $12/B$.
\end{enumerate} 
\end{restatable}

\begin{proof}
\begin{enumerate}

\item 
The first statement follows from Theorem~\ref{thm:oneround} combined with union bound and the fact that $\sum_{i=1}^\infty 1/i^2 = \pi^2/6$.

\item 
For the second statement, in the $t$th round, the confidence width is
\begin{align*}
    s_{ij} &= \frac{h\sqrt{\log(N_t\pi^2/3\delta)}}{\sqrt{j-i+1}}\\
    &\le \frac{h\sqrt{\log(B\pi^2/3\delta)}}{\sqrt{j-i+1}}\\
    &\le \frac{h\sqrt{ \log(\pi^2/3)+\log(B/\delta)}}{\sqrt{j-i+1}}\\
    &\le \frac{h\sqrt{ \log(2\pi^2/6)/\log(2)}\sqrt{\log(2)+\log(B/\delta)}}{\sqrt{j-i+1}}\\
    &\le \frac{h\sqrt{ 2\log(2\pi^2/6)/\log(2)}\sqrt{\log(B/\delta)}}{\sqrt{j-i+1}}
\end{align*}
which shows the claim.

\item
Observe that since $\Delta_{t+1}=[x^{(i)},x^{(j)}]$ for some $x^{(i)}$ and $x^{(j)}$ produced by the $t$th call to Algorithm~\ref{alg:oneround}, we have that $|\Delta_{t+1}|=k\epsilon_t$ for some integer $k$. Now, since $\epsilon_t=2^{-t}$ for all $t$, we clearly have $|\Delta_{t+1}|/\epsilon_{t+1}=2k\in \NN$.

\item 
Now, to show that Algorithm~\ref{alg:1d} consumes at least $B/3$ budget, we claim that the $(t+1)$st call to Algorithm~\ref{alg:oneround} uses at most twice as many samples as the $t$th call: $N_{t+1}\le 2N_t$. This claim suffices to show the desired result: the algorithm runs until $N=\sum_{i=1}^{t+1} N_i \ge B$ and consumes $\sum_{i=1}^t N_i$ budget. If $\sum_{i=1}^t N_i < B/3$, then $N_t< B/3$. However, since $N_{t+1}\le 2N_t$, this implies $N_{t+1}< 2B/3$ so that $\sum_{i=1}^{t+1} N_i<B$, which is a contradiction.

So, it remains to establish the claim. Notice that $N_t =1+ \frac{|\Delta_t|}{\gap_t} = 1+|\Delta_t|2^t$ for all $t$. Further, $|\Delta_{t+1}|\le |\Delta_t|$ for all $t$. Therefore:
\begin{align*}
    N_{t+1}&=1+|\Delta_{t+1}|2^{t+1}\\
    &\le 1+|\Delta_t|2^{t+1}\\
    &= 2(1 + |\Delta_t|2^t)-1\\
    &\le 2N_t
\end{align*}
as desired.

\item
Finally, we show that when $N$ samples are consumed, the smallest $\gap_t$ value is at most $4/N$. Since $\gap_t=2^{-t}$, this means that we need to show $2^t\ge N/4$. To see this, observe that $N_t \le 1+|\Delta_t|2^t\le 1+2^t$ for all $t$. Therefore $N\le \sum_{i=1}^t N_i \le t + 2^{t+1}\le 2^{t+2}$ so that indeed $N/4 \le 2^t$.
\end{enumerate}
\end{proof}


% Next, we provide the missing  proofs of Theorem~\ref{thm:1dlipschitz} and Theorem~\ref{thm:1dsmooth}.

\subsection{Proof of Theorem~\ref{thm:1dlipschitz}}
\thmonedlipschitz*

\begin{proof}
By Lemma~\ref{lem:onedconsistent}, we have that $x_\star\in \Delta_t$ for all $t$ with probability at least $1-\delta$. Then, let $E$ be the event that $x_\star\in \Delta_t$ for all $t$ so that $P(E)\ge 1-\delta$ as required. Let $C$ be the same absolute constant as in Lemma~\ref{lem:onedconsistent}.

Let $T$ be the number of calls to Algorithm~\ref{alg:oneround} made in Algorithm~\ref{alg:1d}. Then, Lemma~\ref{lem:onedconsistent} also implies that $\gap_T \le 12/B$. Let $x^{(1)},\dots,x^{(N_T)}$ be the points sampled by Algorithm~\ref{alg:oneround} in the $T$th and final round of Algorithm~\ref{alg:1d}. Suppose $i\le j$ are such that $x^{(i)}\le x_\star\le x^{(j)}$. Define $l=j-i+1$. Note that such an interval exists for all $l\le N_T$. Then we have the following chain of inequalities:

% and $j-i=B$ $|x^{(i)}-x^{(j)}|\le something$. 

\begin{align*}
    L(i,j)&\ge\frac{1}{j-i+1} \sum_{k=i}^j f(x^{(k)},z^{(k)}) - \frac{C\sqrt{\log(B/\delta)}}{\sqrt{l}}\\
    &\ge \frac{1}{j-i+1} \sum_{k=i}^j F(x^{(k)}) - \frac{2C\sqrt{\log(B/\delta)}}{\sqrt{l}}\\
    &\ge F(x_\star) - L|x^{(i)}-x^{(j)}| - \frac{2C\sqrt{\log(B/\delta)}}{\sqrt{l}}\\
    &\ge F(x_\star) - L\gap_T (l-1)- \frac{2C\sqrt{\log(B/\delta)}}{\sqrt{l}}
\end{align*}
Now, suppose that there is no such $i,j$ satisfying $l=\lceil C^{2/3}\log(B/\delta)^{1/3}/(L\gap_T)^{2/3}\rceil$. Then we must have $N_T< \lceil C^{2/3}\log(B/\delta)^{1/3}/(L\gap_T)^{2/3}\rceil$, which implies $N_T-1<C^{2/3}\log(B/\delta)^{1/3}/(L\gap_T)^{2/3}$. Therefore, \emph{every} point $\hat x$ in the interval $\Delta$ satisfies:
\begin{align*}
    F(\hat x) &\ge F( x_\star)-  L\gap_T (N_T-1)\\
    &\ge F(x_\star) - (L\gap_T C^2 \log(B/\delta))^{1/3}
\end{align*}
Then, since Lemma~\ref{lem:onedconsistent} tells us $\gap_T\le \frac{12}{B}$, this yields $F(\hat x) \ge F(x_\star) - \frac{(12 L C^2 \log(B/\delta))^{1/3}}{B^{1/3}}$ which would establish the desired result.

On the other hand, suppose that such an $i,j$ exists. Then for such an $i,j$ we have:
\begin{align*}
    L(i,j) &\ge F(x_\star) - L\gap_T (l-1)- \frac{2C\sqrt{\log(B/\delta)}}{\sqrt{l}}\\
    &\ge F(x_\star) - 3(L\gap_T C^2 \log(B/\delta))^{1/3}
\end{align*}

Finally, since we return a random selection from the interval $[x^{(a)},x^{(b)}]$ with largest value of $L(a,b)$, we have:
\begin{align*}
    \E[F(\hat x)] &=\frac{1}{b-a+1}\sum_{k=a}^b F(x^{(k)})\\
    &\ge L(a,b)\\
    &\ge L(i,j)\\
    &\ge F(x_\star) - 3(L\gap_T C^2 \log(B/\delta))^{1/3}
\end{align*}
Now, apply Lemma~\ref{lem:onedconsistent} to see $\gap_T\le 12/B$ to establish the final result.

\end{proof}


\subsection{Proof of Theorem~\ref{thm:1dsmooth}}
\thmonedsmooth*

\begin{proof}
This proof follows nearly identical reasoning to that of Theorem~\ref{thm:1dlipschitz}. The only significant change is that instead of bounding the average value of $F$ over an interval containing $x_\star$ by the length of the interval, we leverage smoothness to bound it by the length of the interval squared, which will allow for a tighter tradeoff at the end of the proof.

Following the proof of Theorem~\ref{thm:1dlipschitz}, by Lemma~\ref{lem:onedconsistent}, we have that $x_\star\in \Delta_t$ for all $t$ with probability at least $1-\delta$. Then, we again let $E$ be the event that $x_\star\in \Delta_t$ for all $t$ so that $P(E)\ge 1-\delta$ as required.

Let $T$ be the number of calls to Algorithm~\ref{alg:oneround} made in Algorithm~\ref{alg:1d}. Then, Lemma~\ref{lem:onedconsistent} also implies that $\gap_T \le 12/B$. Let $x^{(1)},\dots,x^{(N_T)}$ be the points sampled by Algorithm~\ref{alg:oneround} in the $T$th and final round of Algorithm~\ref{alg:1d}. Suppose $i\le j$ are such that $x^{(i)}\le x^\star\le x^{(j)}$. Define $l=j-i+1$. Note that such an interval exists for all $l\in[2, N_T]$. 

Next, we make use of a standard identity for smooth losses (e.g. see \cite{bubeck2015convex}). For all $x\in [0,1]$, since $F'(x_\star)=0$, we have:
\begin{align*}
    F(x)\ge F(x_\star) - \frac{\beta}{2}|x-x_\star|^2
\end{align*}
From this, we see that:
% and $j-i=B$ $|x^{(i)}-x^{(j)}|\le something$. 

\begin{align*}
    L(i,j)&\ge \frac{1}{j-i+1} \sum_{k=i}^j f(x^{(k)},z^{(k)}) - \frac{C\sqrt{\log(B/\delta)}}{\sqrt{l}}\\
    &\ge \frac{1}{j-i+1} \sum_{k=i}^j F(x^{(k)}) - \frac{2C\sqrt{\log(B/\delta)}}{\sqrt{l}}\\
    &\ge F(x_\star) - \frac{\beta}{2}|x^{(i)}-x^{(j)}|^2 - \frac{2C\sqrt{\log(B/\delta)}}{\sqrt{l}}\\
    &\ge F(x_\star) - \frac{\beta\gap_T^2 (l-1)^2}{2}- \frac{2C\sqrt{\log(B/\delta)}}{\sqrt{l}}
\end{align*}

Now, suppose that there is no such $i,j$ satisfying $l=\lceil C^{2/5}\log(B/\delta)^{1/5}/(\beta\gap_T^2)^{2/5}\rceil$. Then we must have $N_T< \lceil C^{2/5}\log(B/\delta)^{1/5}/(\beta\gap_T^2)^{2/5}\rceil$ so that $N_T-1<C^{2/5}\log(B/\delta)^{1/5}/(\beta\gap_T^2)^{2/5}$. Therefore, \emph{every} point $\hat x$ in the interval $\Delta_T$ satisfies:
\begin{align*}
    F(\hat x) &\ge F( x_\star)-  \frac{\beta\gap_T^2(N_T-1)^2}{2}\\
    &\ge F(x_\star) - \frac{(\beta C^4 \log(B/\delta)^2\gap_T^2)^{1/5}}{2} 
\end{align*}
Next, since Lemma~\ref{lem:onedconsistent} tells us $\gap_T\le \frac{12}{B}$, this yields $F(\hat x) \ge F(x_\star) - \frac{(144\beta C^4 \log(B/\delta)^2)^{1/5}}{2B^{2/5}}\ge F(x_\star) - \frac{2(\beta C^4 \log(B/\delta)^2)^{1/5}}{B^{2/5}} $ which would establish the desired result.

On the other hand, suppose that such an $i,j$ exists. Then for such an $i,j$ we have:
\begin{align*}
    L(i,j) &\ge F(x_\star) - \frac{\beta\gap_T^2 (l-1)^2}{2}- \frac{2C\sqrt{\log(B/\delta)}}{\sqrt{l}}\\
    &\ge F(x_\star) - \frac{3(144\beta C^4 \log(B/\delta)^2)^{1/5}}{2B^{2/5}}\\
    &\ge F(x_\star) - \frac{5(\beta C^4 \log(B/\delta)^2)^{1/5}}{B^{2/5}}
\end{align*}

Finally, since we return a random selection from the interval $[x^{(a)},x^{(b)}]$ with largest value of $L(a,b)$, we have:
\begin{align*}
    \E[F(\hat x)] &=\frac{1}{b-a+1}\sum_{k=a}^b F(x^{(k)})\\
    &\ge L(a,b)\\
    &\ge L(i,j)\\
    &\ge F(x_\star) - \frac{5(\beta C^4 \log(B/\delta)^2)^{1/5}}{B^{2/5}}
\end{align*}
which again establishes the desired final result.

\end{proof}


\subsection{Proof of Lemma~\ref{thm:lipschitzlower}}
Lemma~\ref{thm:lipschitzlower} shows that indeed Algorithm~\ref{alg:1d} quickly eliminates the easy region.

\begin{restatable}{lemma}{thmlipschitzlower}\label{thm:lipschitzlower}
Suppose $F$ satisfies Assumption~\ref{assum:lipschitzlowerbound}. Let $\Delta_t=[l_t,r_t]$ be the $t$th interval produced by Algorithm~\ref{alg:1d}. Then with probability at least $1-\delta$, for all $t>1$ we have:
\begin{align*}
    l_t &\ge x_\star - 6\cdot2^{-t} - 3\left[\max\left(\gamma,\ \frac{2^{5/3} C^{2/3} \log(B/\delta)^{1/3}2^{-t/3}}{M^{2/3}}\right)+\frac{\tau}{M}\right]\\
    r_t &\le  x_\star + 6\cdot2^{-t} + 3\left[\max\left(\gamma,\ \frac{2^{5/3} C^{2/3} \log(B/\delta)^{1/3}2^{-t/3}}{M^{2/3}}\right)+\frac{\tau}{M}\right]
\end{align*}
\end{restatable}
\begin{proof}
As before, we consider the event $E$ that $x_\star\in \Delta_t$ for all $t$, and in each call to Algorithm~\ref{alg:oneround}, for all $i,j$, $\frac{1}{j-i+1}\sum_{k=i}^j F(x^{(k)})\in [L(i,j),U(i,j)]$. By Lemma~\ref{lem:onedconsistent}, $P(E)\ge 1-\delta$. The entire argument is conditioned on this event $E$. We will prove the statement for $r_t$. The argument for $l_t$ is completely symmetric. For $t=1$ the statement is trivially true since $6\cdot 2^{-1}\ge 1$.

Fix some value of $t$. Suppose for purposes of induction that $r_k \le x_\star + 6\cdot 2^{-k} + 3 \left[\max\left(\gamma,\ \frac{2^{5/3} C^{2/3} \log(B/\delta)^{1/3}2^{-k/3}}{M^{2/3}}\right)+\frac{\tau}{M}\right]$ for all $k\le t$.  We will show $r_{t+1} \le  x_\star +6 \cdot 2^{-(t+1)} + 3\left[ \max\left(\gamma,\ \frac{ 2^{5/3}C^{2/3}\log(B/\delta)^{1/3} 2^{-(t+1)/3}}{M^{2/3}}\right)+\frac{\tau}{M}\right]$. 
% Since $t$ was arbitrary, this shows the desired result. 
First, suppose that already we have $r_t \le  x_\star +6\cdot 2^{-(t+1)} +3\left[\max\left(\gamma,\ \frac{ 2^{5/3} C^{2/3}\log(B/\delta)^{1/3} 2^{-(t+1)/3}}{M^{2/3}}\right)+\frac{\tau}{M}\right]$. Then there is nothing left to prove since $r_{t+1}\le r_t$. Thus, we may suppose $r_t >  x_\star +6\cdot 2^{-(t+1)}+ 3\left[\max\left(\gamma,\ \frac{ 2^{5/3} C^{2/3}\log(B/\delta)^{1/3} 2^{-(t+1)/3}}{M^{2/3}}\right)+\frac{\tau}{M}\right]=x_\star +3\cdot 2^{-t}+ 3\left[\max\left(\gamma,\ \frac{ 2^{4/3} C^{2/3}\log(B/\delta)^{1/3} 2^{-t/3}}{M^{2/3}}\right)+\frac{\tau}{M}\right]$.

Next, define $k = \left\lceil \max\left(\frac{\gamma}{\gap_t},\ \frac{2^{4/3}C^{2/3} \log(B/\delta)^{1/3}}{M^{2/3} \gap_t^{2/3}}\right) +\frac{\tau}{M \gap_t}\right\rceil$. Notice that
\begin{align*}
    k\gap_t &\ge \max\left(\gamma,\ \frac{2^{4/3}C^{2/3} \log(B/\delta)^{1/3}\gap_t^{1/3}}{M^{2/3} }\right)  +\frac{\tau}{M}\\
    3k\gap_t &\le  3\max\left(\gamma,\ \frac{ 2^{4/3}C^{2/3} \log(B/\delta)^{1/3}\gap_t^{1/3}}{M^{2/3} }\right)+\frac{3\tau}{M}+3\gap_t
\end{align*}
Therefore, since $\gap_t=2^{-t}$ and $r_t>x_\star + 3\cdot 2^{-t} +3\left[\max\left(\gamma,\ \frac{2^{4/3} C^{2/3}\log(B/\delta)^{1/3} 2^{-t/3}}{M^{2/3}}\right)+\frac{\tau}{M}\right]$, we have $r_t \ge x_\star + 3k\gap_t$. 

Let $x^{(1)},\dots,x^{(N_t)}$ be the points queried during the $t$th call to Algorithm~\ref{alg:oneround}. Then, since $r_t> x_\star +3k\gap_t$, there exists $i$ such that $i+3k\le N_t$ and $x^{(i)}\le x_\star\le x^{(i+1)}$ and $x^{(i+k)}\le x_\star+k\gap_t$. 

Then, letting $U(i,j)$ and $L(i,j)$ be the bounds defined in the $t$th call to Algorithm~\ref{alg:oneround}, we have:
\begin{align*}
    L(i+1,i+k) &\ge F(x_\star + k\gap_t)-2\frac{C\sqrt{\log(B/\delta)}}{\sqrt{k}}\\
    &\ge F(x_\star + k\gap_t) - 2^{1/3}C^{2/3}M^{1/3}\log(B/\delta)^{1/3}\gap_t^{1/3}
\end{align*}

Next, observe that $x_\star+k\gap_t\ge x_\star+\gamma+\frac{\tau}{M}$. Therefore, by Assumption~\ref{assum:lipschitzlowerbound}, we have:
\begin{align*}
    F(x_\star+2k\gap_t)&\le F(x_\star+k\gap_t) - k\gap_t M\\
    &\le F(x_\star+k\gap_t) -  M\max\left(\gamma,\ \frac{2^{4/3}C^{2/3} \log(B/\delta)^{1/3}\gap_t^{1/3}}{M^{2/3} }\right)  - \tau\\
    &\le F(x_\star+k\gap_t) -2^{4/3}C^{2/3}M^{1/3}\log(B/\delta)^{1/3} \gap_t^{1/3} - \tau
\end{align*}

Finally, we have:
\begin{align*}
    U(i+1+2k, i+3k) &\le F(x_\star+2k\gap_t) + 2\frac{C\sqrt{\log(B/\delta)}}{\sqrt{k}}\\
    &\le F(x_\star+k\gap_t) - 2^{4/3}C^{2/3}M^{1/3}\log(B/\delta)^{1/3} \gap_t^{1/3}  - \tau + 2\frac{C\sqrt{\log(B/\delta)}}{\sqrt{k}}\\
    &\le F(x_\star+k\gap_t) -2^{4/3}C^{2/3}M^{1/3}\log(B/\delta)^{1/3}\gap_t^{1/3}+2^{1/3}C^{2/3}M^{1/3}\log(B/\delta)^{1/3}\gap_t^{1/3}-\tau\\
    &\le F(x_\star+k\gap_t) -2^{1/3}C^{2/3}M^{1/3}\log(B/\delta)^{1/3}\gap_t^{1/3}-\tau
\end{align*}
Thus, we have $U(i+1+2k,i+3k)<L(i+1,i+k)-\tau$ so that 
\begin{align*}
    r_{t+1}&\le x_\star +3k\gap_t\\
    &\le x_\star +3\gap_t + 3\left[\max\left(\gamma,\ \frac{ 2^{4/3}C^{2/3} \log(B/\delta)^{1/3}\gap_t^{1/3}}{M^{2/3}}\right)+\frac{\tau}{M}\right]\\
    &\le x_\star +6\gap_{t+1} + 3\left[\max\left(\gamma,\ \frac{2^{5/3} C^{2/3} \log(B/\delta)^{1/3}\gap_{t+1}^{1/3}}{M^{2/3}}\right)+\frac{\tau}{M}\right]
\end{align*}

\end{proof}


\subsection{Proof  of Theorem~\ref{thm:1dlipschitzandlower}}

Here we prove Theorem~\ref{thm:1dlipschitzandlower}. The full version of this result is stated with all constants included below.
\begin{restatable}{theorem}{thmonedlipschitzlower}
Suppose $F$ is $L$-Lipschitz and satisfies Assumption~\ref{assum:lipschitzlowerbound}. Let $\hat x$ be the output of Algorithm~\ref{alg:1d} with total sample budget $B>78$ and input failure probability $\delta$. Then, for any $\delta<1/2$, there is an event $E$ that occurs with  probability at least $1-\delta$ such that:
\begin{align*}
    \EE[F(\hat x)|E]&\ge F(x_\star)  - 3(L\gap_T C^2 \log(B/\delta))^{1/3}\\
    &\ge F(x_\star) -3\max\left(\frac{(LC^2\log(B/\delta))^{1/3}}{2^{B/351}},\ \frac{(54LC^2 \log(B/\delta)\left(\gamma+\frac{\tau}{M}\right))^{1/3}}{B^{1/3}},\ \frac{ C^{1/3}\sqrt{\log(B/\delta)}27^{1/3}2^{5/3}}{\sqrt{B} M^{1/3}\sqrt{2^{2/3}-1}}\right)
\end{align*}
where $C$ is an absolute constant.
\end{restatable}
% \thmonedlipschitzlower*

\begin{proof}
As previously, we define $E$ to be the event that the statements in Lemma~\ref{lem:onedconsistent} hold and let $C$ be the absolute constant implied by the same Lemma.

Now, we improve upon Theorem~\ref{thm:1dlipschitz} by using Lemma~\ref{thm:lipschitzlower} to show that $\gap_T$ can become smaller than $O(1/B)$, decreasing to roughly $O(\gamma/B)$.

To see this, notice that at each round we have $N_t = 1+|\Delta_t|2^t$ and by Lemma~\ref{thm:lipschitzlower}, we have:
\begin{align*}
    |\Delta_t|&\le 12\cdot2^{-t} + 6\left[\max\left(\gamma,\ \frac{2^{5/3} C^{2/3} \log(B/\delta)^{1/3}2^{-t/3}}{M^{2/3}}\right)+\frac{\tau}{M}\right]\\
    &\le 12\cdot2^{-t} + 6\gamma +6\frac{\tau}{M}+ 6\frac{2^{5/3} C^{2/3} \log(B/\delta)^{1/3}2^{-t/3}}{M^{2/3}}
\end{align*}
Further, by Lemma~\ref{lem:onedconsistent}, $\sum_{t=1}^T N_t\ge B/3$ if Algorithm~\ref{alg:1d} runs for $T$ rounds. Therefore,
\begin{align*}
    B/3&\le 13 T+6\left(\gamma+\frac{\tau}{M}\right)2^{T+1} +\sum_{t=1}^T 6\frac{2^{5/3} C^{2/3} \log(B/\delta)^{1/3}2^{2t/3}}{M^{2/3}} \\
    &\le 13 T+ 6\left(\gamma+\frac{\tau}{M}\right) 2^{T+1} + 6\frac{2^{7/3} C^{2/3} \log(B/\delta)^{1/3}2^{2T/3}}{M^{2/3}(2^{2/3}-1)}
\end{align*}
Now, from this we see that any value of $T$ that satisfies $B/9\ge 13 T$, $ B/9\ge 6\left(\gamma+\frac{\tau}{M}\right) 2^{T+1} $ and $B/9\ge 6\frac{2^{7/3} C^{2/3} \log(B/\delta)^{1/3}2^{2T/3}}{M^{2/3}(2^{2/3}-1)}$ must be a lower-bound for the true value. Therefore,
\begin{align*}
    \frac{1}{\gap_T}=2^T \ge \min\left(2^{B/117},\ \frac{B}{54\left(\gamma+\frac{\tau}{M}\right)},\ \frac{B^{3/2}M(2^{2/3}-1)^{3/2}}{27^{3/2}\cdot 2^{5}C \sqrt{\log(B/\delta)}}\right)
\end{align*}

The result now follows from Theorem~\ref{thm:1dlipschitz}.
\end{proof}

\section{Proof of Theorem~\ref{thm:combes}}

\thmcombes*
\begin{proof}
As before, we consider the event $E$ that $x_\star\in \Delta_t$ for all $t$, and in each call to Algorithm~\ref{alg:oneround}, for all $i,j$, $\frac{1}{j-i+1}\sum_{k=i}^j F(x^{(k)})\in [L(i,j),U(i,j)]$. By Lemma~\ref{lem:onedconsistent}, $P(E)\ge 1-\delta$. The entire argument is again conditioned on this event $E$.

In the $i$th round, we make at most $1+2^i$ queries. Therefore, since we must consume at least $B/3$ budget (by Lemma~\ref{lem:stopping}), the total number of rounds $t$ must satisfy $t +2^{t+1}\ge B/3$ so that $2^{t+2}\ge B/3$. In particular, since $B\ge \frac{24 A_2^{1/z}}{C^{1/z} \log(B/\delta)^{1/2z}}$, we have
\begin{align*}
    t \ge \log_2\left(\frac{2 A_2^{1/z}}{C^{1/z} \log(B/\delta)^{1/2z}}\right)
\end{align*}
Now, let $\Delta_t = [l_t,r_t]$. Our bound on $t$ implies:
\begin{align*}
    \frac{C^{\frac{2}{2z+1}} \log(B/\delta)^{\frac{1}{2z+1}}}{A_2^{\frac{2}{2z+1}} \gap_t^{\frac{2z}{2z+1}}}\ge 2
\end{align*}
For all such $t$, we will show:
\begin{align*}
    r_t &\le x_\star+6\frac{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}\gap_{t}^{\frac{1}{2z+1}}}{A_1^{\frac{1}{z}}}\\
    l_t &\ge x_\star -6\frac{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}\gap_{t}^{\frac{1}{2z+1}}}{A_1^{\frac{1}{z}}}
\end{align*}

We will write the proof for $r_t$, and for notational convenience we will establish the bound for $r_{t+1}$ rather than $r_t$. The argument for $l_t$ is symmetric.

Suppose $r_t\le  x_\star + 6\frac{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}\gap_{t+1}^{\frac{1}{2z+1}}}{A_1^{\frac{1}{z}}}$. Then clearly $r_{t+1}\le r_t\le x_\star + 6\frac{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}\gap_{t+1}^{\frac{1}{2z+1}}}{A_1^{\frac{1}{z}}}$ and so we are done. So, let us suppose $r_t>  x_\star + 6\frac{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}\gap_{t+1}^{\frac{1}{2z+1}}}{A_1^{\frac{1}{z}}}\ge x_\star + 3\frac{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}\gap_{t}^{\frac{1}{2z+1}}}{A_1^{\frac{1}{z}}}$. Now, define the constants:
\begin{align*}
    y&=\frac{C^{\frac{2}{2z+1}} \log(B/\delta)^{\frac{1}{2z+1}}}{A_2^{\frac{2}{2z+1}} \gap_t^{\frac{2z}{2z+1}}}\\
    k &= \left\lceil y-1\right\rceil\\
    d&=\left\lceil \left(\frac{A_2}{A_1}\right)^{1/z}y+1\right\rceil=\left\lceil \frac{A_2^{\frac{1}{2z^2+z}}C^{\frac{2}{2z+1}} \log(B/\delta)^{\frac{1}{2z+1}}}{A_1^{\frac{1}{z}} \gap_t^{\frac{2z}{2z+1}}}+1\right\rceil
\end{align*}
Notice that $k\ge 1$ by our assumption on $t$ and that $k\le d \le 2\left(\frac{A_2}{A_1}\right)^{1/z}y$.

Let $x^{(1)},\dots,x^{(N_t)}$ be the points queried in round $t$. Then, since $r_t> x_\star + 3\frac{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}\gap_{t}^{\frac{1}{2z+1}}}{A_1^{\frac{1}{z}}}=x_\star +3\gap_t\left(\frac{A_2}{A_1}\right)^{1/z}y$, there exists $i$ such that $i+k+d\le N_t$ and $x^{(i)}\le x_\star\le x^{(i+1)}\le\dots \le x^{(i+k)}$. Therefore, we have:
\begin{align*}
    L(i,i+k) &\ge F(x_\star) - A_2(k\gap_t)^z - 2\frac{C\sqrt{\log(B/\delta)}}{\sqrt{k+1}}\\
    &\ge F(x_\star) - A_2(y\gap_t)^z - 2\frac{C\sqrt{\log(B/\delta)}}{\sqrt{y}}\\
    &= F(x_\star) - 3A_2^{\frac{1}{2z+1}}C^{\frac{2z}{2z+1}}\log(B/\delta)^{\frac{z}{2z+1}}\gap_t^{\frac{z}{2z+1}}
\end{align*}
Similarly, we have 
\begin{align*}
    U(i+d,i+k+d)&\le F(x_\star) - A_1(d\gap_t)^z + 2\frac{C\sqrt{\log(B/\delta)}}{\sqrt{k+1}}\\
    &\ge F(x_\star) - A_1((A_2/A_1)^{1/z}y\gap_t)^z - 2\frac{C\sqrt{\log(B/\delta)}}{\sqrt{y}}\\
    &= F(x_\star) - 3A_2^{\frac{1}{2z+1}}C^{\frac{2z}{2z+1}}\log(B/\delta)^{\frac{z}{2z+1}}\gap_t^{\frac{z}{2z+1}}
\end{align*}
Thus, we have that $U(i+d,i+k+d)\le L(i,i+k)$ so that in the next round we have 
\begin{align*}
    r_{t+1}&\le x^{(i+k+d)}\\
    &\le  x_\star + \gap_t(k+d)\\
    &\le x_\star + 3\gap_t\left( \frac{A_2}{A_1}\right)^{1/z}y\\
    &\le x_\star + 3\frac{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}\gap_t^{\frac{1}{2z+1}}}{A_1^{\frac{1}{z}}}\\
    &\le x_\star + 6\frac{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}\gap_{t+1}^{\frac{1}{2z+1}}}{A_1^{\frac{1}{z}}}
\end{align*}

Thus, so long as $t \ge\log_2\left(\frac{2 A_2^{1/z}}{C^{1/z} \log(B/\delta)^{1/2z}}\right)$, we have that
\begin{align*}
    |\Delta_t|&\le 12\frac{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}\gap_t^{\frac{1}{2z+1}}}{A_1^{\frac{1}{z}}}
\end{align*}
Further, since $\gap_t=2^{-t}$, we have $t\ge \log_2(B/12)$ so that the condition on $T$ is implied by $B\ge \frac{24 A_2^{1/z}}{C^{1/z} \log(B/\delta)^{1/2z}}$.

Now, for any $\hat x\in \Delta_T$, we have:
\begin{align*}
    F(\hat x)&\ge F(x_\star) - A_2|\Delta_T|^z\\
    &\ge F(x_\star)-12^z\frac{C^{\frac{2z}{2z+1}} A_2^{\frac{2z+2}{2z+1}} \log(B/\delta)^{\frac{z}{2z+1}}\gap_T^{\frac{z}{2z+1}}}{A_1}
\end{align*}

So, it remains to compute $\gap_T$ for the final round $T$. To this end, notice that in the rounds with $t<\log_2\left(\frac{2 A_2^{1/z}}{C^{1/z} \log(B/\delta)^{1/2z}}\right)$, we consume $N_1+\dots+N_t\le \frac{4 A_2^{1/z}}{C^{1/z} \log(B/\delta)^{1/2z}}$ samples. In subsequent rounds, we have 
\begin{align*}
    N_t &= 1+|\Delta_t|2^t \\
    &\le 1+12\frac{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}2^{\frac{2zt}{2z+1}}}{A_1^{\frac{1}{z}}}
\end{align*}
Now, we have 
\begin{align*}
    \sum_{t=1}^T2^{\frac{2zt}{2z+1}}&\le \frac{2^{\frac{2z}{2z+1}}}{2^{\frac{2z}{2z+1}}-1}2^{\frac{2zT}{2z+1}}
\end{align*}
so that in $T$ rounds our total number of samples can be bounded:
\begin{align*}
    \sum_{t=1}^TN_t&\le \frac{4 A_2^{1/z}}{C^{1/z} \log(B/\delta)^{1/2z}}+T+12\frac{2^{\frac{2z}{2z+1}}}{2^{\frac{2z}{2z+1}}-1}\frac{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}2^{\frac{2zT}{2z+1}}}{A_1^{\frac{1}{z}}}
\end{align*}
Now, since we have $\sum_{t=1}^T N_t \ge B/3$ by Lemma~\ref{lem:onedconsistent}, this means that any $T$ such that
\begin{align*}
    \frac{4 A_2^{1/z}}{C^{1/z} \log(B/\delta)^{1/2z}}+T&\le B/6\\
    12\frac{2^{\frac{2z}{2z+1}}}{2^{\frac{2z}{2z+1}}-1}\frac{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}2^{\frac{2zT}{2z+1}}}{A_1^{\frac{1}{z}}}&\le B/6
\end{align*}
is a lower bound for the true number of rounds.
From this, we obtain that for all $B\ge \frac{48 A_2^{1/z}}{C^{1/z} \log(B/\delta)^{1/2z}}$,
\begin{align*}
    2^{\frac{zT}{2z+1}} &\ge \min\left(2^{\frac{zB}{24z+12}},\ \sqrt{\frac{2^{\frac{2z}{2z+1}}-1}{72\cdot 2^{\frac{2z}{2z+1}}}\frac{BA_1^{\frac{1}{z}}}{C^{\frac{2}{2z+1}} A_2^{\frac{1}{2z^2+z}} \log(B/\delta)^{\frac{1}{2z+1}}}}\right)
\end{align*}
Thus, putting all together we have:
\begin{align*}
    F(\hat x)&\ge F(x_\star) - 12^z\frac{C^{\frac{2z}{2z+1}} A_2^{\frac{2z+2}{2z+1}} \log(B/\delta)^{\frac{z}{2z+1}}\gap_T^{\frac{z}{2z+1}}}{A_1}\\
    &\ge F(x_\star)-\max\left(12^z\frac{C^{\frac{2z}{2z+1}} A_2^{\frac{2z+2}{2z+1}} \log(B/\delta)^{\frac{z}{2z+1}}}{A_1 2^{\frac{zB}{24z+12}}},\ 
    \frac{12^z  2^{\frac{z}{2z+1}}\sqrt{72}}{\sqrt{2^{\frac{2z}{2z+1}}-1}}\frac{C A_2^{1+\frac{1}{2z}}\sqrt{\log(B/\delta)}}{A_1^{1+\frac{1}{2z}}\sqrt{B}}\right)
\end{align*}
\end{proof}

\section{Proofs for Section~\ref{sec:coordinate}}

The first step is to show that the 1-D algorithm will be able to eliminate the point $w^t$ (i.e. have $w^t_j\notin \Delta^t_j$ for some $j$) in a reasonable number of samples, provided that $w^t$ is not a $3\tau$-approximate local minimum point (see Definition~\ref{def:stationary}). 

To this end, we provide the following additional result for the 1-D Algorithm~\ref{alg:1d}:

\begin{restatable}{lemma}{thmprogress}\label{thm:progress}
Suppose $F:[0,1]\to[0,1]$ is $L$-Lipschitz. Let $x$ be a point such that $F(x)< F(x_\star)-3\tau$. Then there is an event $E$ that occurs with probability at least $1-\delta$ such that conditioned on $E$, after at most $O\left(\frac{L h^2\log(N/\delta)}{(F(x^\star)-F(x))^3}\right)$ samples, Algorithm~\ref{alg:1d} eliminates $x$. Moreover, regardless of when $x$ is eliminated, if $\hat x$ is a randomly selected point in $\Ibest$,
\begin{align*}
    \E[F(\hat x)|E]&\ge F(x)+\tau
\end{align*}
\end{restatable}
\begin{proof}
As usual, we condition on the probability $1-\delta$ event $E$ that all confidence intervals created by Algorithm~\ref{alg:1d} are valid, as described by Lemma~\ref{lem:onedconsistent}.

Without loss of generality, suppose $x\le x^\star$ (the proof is completely symmetric if $x\ge x^\star$). Let $w=x^\star - \frac{F(x^\star)-F(x)}{4L}$ and $y=x+ \frac{F(x^\star)-F(x)}{4L}$. Observe that by $L$-lipschitzness and unimodality, $F(x^\star)\ge F(w)\ge F(x^\star)-\frac{1}{4}(F(x^\star)-F(x))$ and $F(x)\le F(y)\le F(x) + \frac{1}{4}(F(x^\star)-F(x))$ so that $F(w)\ge F(y) + \frac{1}{2}(F(x^\star)-F(x))$. 

Now, by Lemma~\ref{lem:onedconsistent}, after taking $N=\lceil \frac{20Lk}{F(x^\star)-F(x)}\rceil$ samples for any $k\in \NN$, we must have $\epsilon < \frac{F(x^\star)-F(x)}{5Lk}$. Thus, there is an interval $[x^{(a)},x^{(b)}]$ with $b-a+1=k$ contained in $(x,y)$ and an $[x^{(c)},x^{(d)}]$ with $d-c+1=k$ contained in $(w, x^\star)$. 

Now, the upper bound $U(a,b)$ for the interval $[a,b]$ satisfies (where $C$ is the universal constant from Lemma~\ref{lem:onedconsistent}):
\begin{align*}
    U(a,b) &\le \frac{1}{k}\sum_{i=a}^b f(x^{(i)},z^{(i)})  +  C\frac{\sqrt{\log(N/\delta)}}{\sqrt{k}}\\
    &\le \frac{1}{k}\sum_{i=a}^b F(x^{(i)})  + 2C\frac{\sqrt{\log(N/\delta)}}{\sqrt{k}}\\
    &\le F(x^{(b)}) + 2C\frac{\sqrt{\log(N/\delta)}}{\sqrt{k}}\\
    &\le F(y) + 2C\frac{\sqrt{\log(N/\delta)}}{\sqrt{k}}\\
    &< F(x) + \frac{1}{4}(F(x^\star)-F(x))+ 2C\frac{\sqrt{\log(N/\delta)}}{\sqrt{k}}
\end{align*}
Similarly, the lower bound $L(c,d)$ for the interval $[c,d]$ satisfies:
\begin{align*}
    L(c,d)&=\frac{1}{k}\sum_{i=c}^d f(x^{(i)},z^{(i)})  - C\frac{\sqrt{\log(N/\delta)}}{\sqrt{k}}\\
    &\ge  \frac{1}{k}\sum_{i=c}^d F(x^{(i)})  - 2C\frac{\sqrt{\log(N/\delta)}}{\sqrt{k}}\\
    &\ge F(x^{(c)}) - 2C\frac{\sqrt{\log(N/\delta)}}{\sqrt{k}}\\
    &\ge F(w) - 2C\frac{\sqrt{\log(N/\delta)}}{\sqrt{k}}\\
    &> F(x^\star) - \frac{1}{4}(F(x^\star)-F(x)) - 2C\frac{\sqrt{\log(N/\delta)}}{\sqrt{k}}
\end{align*}

Now, set
\begin{align*}
    k \ge \frac{576 C^2 \log(N/\delta)}{(F(x^\star)-F(x))^2 }
\end{align*}
to obtain
\begin{align*}
    L(c,d)> F(x^\star) - \frac{1}{3}(F(x^\star)-F(x))\\
    U(a,b) <F(x) + \frac{1}{3}(F(x^\star)-F(x))
\end{align*}
Now, since $\tau\le  \frac{F(x^\star)-F(x)}{3}$, we will eliminate the region to the left of $a$, which includes $x$. Putting all together, we have used $N\le \lceil \frac{20Lk}{F(x^\star)-F(x)}\rceil =\lceil \frac{11520\cdot C^2L \log(N/\delta)}{(F(x^\star)-F(x))^3}\rceil$ samples.


Now, we prove the second part of the claim. On the round that $x$ is eliminated, there must exist $a\le b\le c\le d$ with $x< x^{(a)}$ and $U(a,b)< L(c,d)-\tau$.

Also, we must have that $F(x^{(b)})\ge F(x)$, for otherwise this implies $x^\star \le x^{(b)}$ by unimodality so that $F(x)\ge F(x^{(c)})\ge F(x^{(d)})$ again by unimodality and so it cannot hold that $F(x^{(c)})=\max_{i\in \{c,d\}}F(x^{(i)})\ge L(c,d)\ge U(a,b)\ge \min_{i\in \{a,b\}}F(x^{(i)})= F(x^{(b)})$. Therefore, we have:
\begin{align*}
    F(x) \le \frac{1}{b-a+1}\sum_{i=a}^b F(x^{(i)})\le U(a,b)\le L(c,d)-\tau
\end{align*}

Now, let $\Ibest=[e,f]$. We have $L(e,f)\ge L(c,d)$ by definition. Therefore:
\begin{align*}
    \frac{1}{f-e+1}\sum_{i=e}^f F(x^{(i)})&\ge L(e,f)\\
    &\ge L(c,d)\\
    &\ge U(a,b)+\tau\\
    &\ge F(x) + \tau
\end{align*}
\end{proof}


\thmcoord*


\begin{proof}

For each of the $d$ copies of Algorithm~\ref{alg:1d} instantiated in the $t$th iteration of Algorithm~\ref{algo:coord}, by Lemma~\ref{lem:onedconsistent}, with probability at least $1-\frac{6\delta}{\pi^2 t^2 d}$, all confidence bounds created internally by Algorithm~\ref{alg:1d} and the calls it makes to Algorithm~\ref{alg:oneround} are valid. Thus, by union bound all such bounds are valid for all copies of Algorithm~\ref{alg:1d} with probability at least $1-\delta$. Let $E$ be this event, so that $P(E)\ge 1-\delta$.

Now, suppose that in some iteration $t$, $w^t$ is \emph{not} a $3\tau$-stationary point. For each $i\in\{1,\dots,d\}$, let $w^t_{\star,i}$ be the point that maximizes $F$ while differing from $w^t$ only in the $i$th coordinate. Then, since $w^t$ is not a $3\tau$-stationary point, there is some $i$ such that $F(w^t_{\star,i})-3\tau\ge F(w^t)$.

Thus, by Lemma~\ref{thm:progress}, after at most $\lceil \frac{11520\cdot C^2L \log(B/\delta)}{(F(w^t_{\star,i})-F(w^t))^3}\rceil\le \lceil \frac{311040\cdot C^2L \log(N/\delta)}{\tau^3}\rceil$ samples, the $i$th copy of Algorithm~\ref{alg:1d} will eliminate $w^t$, and select a point $w^{t+1}$ such that $\EE[F(w^{t+1})]\ge F(w^t)+\tau$. Since there are $d$ total copies, this consumes $O\left(\frac{d\log(B/\delta)}{\tau^3}\right)$ total samples.

Now, at an intuitive level, since $\EE[F(w^t)]$ can increase by $\tau$ at most $1/\tau$ times, this process repeats at most $1/\tau$ times, after which we must have found a $3\tau$-critical point. 

To make this formal, let $T$ be the index (if it exists) at which $w^t$ is a $3\tau$ stationary point. Define $X_t= F(w^t)$ for $t\le T$ and $X_t = F(w^T)$ for $t>T$. Then, we can apply Lemma~\ref{lem:stopping} to see that $P\left[T\ge \frac{3\log(1/\delta)}{\tau}\right]<\delta$. Therefore, with probability at least $1-2\delta$, after 
\begin{align*}
    \frac{3d\log(1/\delta)}{\tau}\lceil \frac{311040\cdot C^2L \log(N/\delta)}{\tau^3}\rceil&\le \frac{3d\log(1/\delta)}{\tau} + \frac{933120\cdot dC^2L \log(N/\delta)^2}{\tau^4}\\
    &\le \frac{B}{2} + \frac{B}{2}=B
\end{align*}
samples, we find a $3\tau$ stationary point, as desired.

\end{proof}

\subsection{A technical stopping time lemma}
% \lemstopping*

\begin{restatable}{lemma}{lemstopping}\label{lem:stopping}
Let $0=X_0,X_1,X_2,\dots$ be a sub-martingale in $[0,1]$. Let $T$ be the stopping time $T=\min\{t \mid X_t\ge 1-3\tau\}$. Suppose  that for $t\ge T$, $X_{t+1}=X_t$ and for $t<T$, $\E[X_{t+1}]\ge X_t+\tau$. Then for any $\delta>0$, with probability at least $1-\delta$, $T\le \frac{3\log(1/\delta)}{\tau}$.
\end{restatable}
\begin{proof}
We consider an alternative non-negative sub-martingale $Y_1,Y_2,\dots$ defined by $Y_t=X_t$ for $t\le T$ and $Y_{t+1}=Y_t+\tau$ for $t\ge T$. Let $d_t=Y_{t}-Y_{t-1}$ be the associated martingale difference sequence. Clearly we have $\E[d_t]\ge \tau$ for all $t$. Then, we have:
\begin{align*}
    1\ge Y_T&\ge \sum_{t=1}^\infty d_t \mathds{1}[t\le T]\\
    &=\sum_{t=1}^\infty  d_t(1-\mathds{1}[t>T])\\
    &=\sum_{t=1}^\infty d_t - \sum_{i=1}^{t-1} d_t\mathds{1}[i=T]\\
    &=\sum_{t=1}^\infty d_t - \tau \sum_{i=1}^{t-1}\mathds{1}[i=T]\\
    &=\EE\left[\sum_{t=1}^\infty d_t - \tau \sum_{i=1}^{t-1}\mathds{1}[i=T]\right]\\
    &=\sum_{t=1}^\infty \EE[d_t] - \tau P[T<t]\\
    &\ge \sum_{t=1}^\infty \tau P[T\ge t]\\
    &= \tau \EE[T]
\end{align*}
Therefore, we have $\EE[T]\le \frac{1}{\tau}$. Thus, by Markov inequality, $P[T>3/\tau]\le 1/3\le 1/e$.

Now, let $L_i$ be the event that $T>\frac{3i}{\tau}$. Let $T_i= T-\frac{3(i-1)}{\tau}$ for $i\ge 1$. Observe that the above argument also shows that $\EE[T_i|L_{i-1}]\le \frac{1}{\tau}$, so that $P[T_i>3/\tau|L_{i-1}]\le \frac{1}{e}$. Further, notice that
\begin{align*}
    P[L_i]&= P[T_i>3/\tau |L_{i-1}]P[L_{i-1}]\\
    &\le \frac{1}{e^i}
\end{align*}
The result now follows by seeing that $P\left[T>\frac{3\log(1/\delta)}{\tau}\right]=P\left[L_{\log(1/\delta)}\right]\le \delta$.


\end{proof}

\section{More Experimental Details}
\label{sec:moreexp}

% \begin{algorithm}
% \caption{Modified Unimodal Coordinate Ascent}
% \label{algo:mod}
% \begin{algorithmic}[1]
% \STATE Set $N, t=10$. Choose 10 random points $[0,1]^d$ and set arbitrary $w^t$ as the one with the best observed value.
% \WHILE{$N\le B-d$}
% \STATE Given a starting point $w^t\in[0,1]^d$, initialize $d$ copies of Algorithm~\ref{alg:1d_v2} with $\delta=6/dt^2\pi^2$, budget set to $\infty$ where the $i$th copy considers the function $F_i:[0,1]\to [0,1]$ $F_i(x) = F(w^t_{1},\dots,w^t_{i-1},x,w^t_{i+1},\dots,w^t_{d})$.\\ Let $\Delta^t_{ti}$ be the active interval associated with the $i$th copy.\\
% Let $I^t_{\text{best}, i}$ be the current value of $\Ibest$ maintained by the $i$th copy.
% \WHILE{$w^t_{i}\in \Delta^t_{i}$ for all $i$}
% \IF{$N\le B-d$}
% \STATE Return $w^t$.
% \ENDIF
% \STATE Let $i$ be the dimension sampled from distribution proportional to $\{\exp(s_1), \cdots, \exp(s_d)\}.$ Here $s_i$ is the observed standard deviation of the values for dimension $i$ so far.
% \STATE Run one more epoch of Algorithm~\ref{alg:1d_v2} for dimension $i$.
% \STATE $N=N+ \text{$<$\#new points sampled$>$}$.
% \ENDWHILE
% \STATE Suppose the $j$-th copy is one such copy that eliminated $w^t_{j}$ ($w^t_{j}\notin \Delta^t_{j}$) and $\Delta_{t, j}$ is smallest among all eliminating dimensions.
% \STATE Let $w^{t+1}_{j}$ be the best point selected in $I^t_{\text{best}, j}$, in terms of observed value.
% \STATE Set $w^{t+1}=(w^t_{1},\dots,w^t_{j-1},w^{t+1}_{j},w^t_{j+1},\dots)$ and begin a new round (reuse points if available).
% \ENDWHILE
% \STATE Return $w^t$.
% \end{algorithmic}
% \end{algorithm}

\begin{algorithm}[b]
\caption{One Elimination Round of Unimodal Optimization - Power of 2 Intervals}
\label{alg:oneround_v2}
\begin{algorithmic}[1]
\REQUIRE Confidence parameter $\delta$, interval $\Delta=[l,r]$, confidence scaling constant $h>0$ to be set by Lemma~\ref{lem:azuma}.
\STATE Define the points $l=x^{(1)}<x^{(2)}<\ldots<x^{(N)}=r$ for $N =1+ \frac{r-l}{\gap}$ equally spaced in the interval $\Delta$.
\STATE Compute $f(x^{(k)},z^{(k)})$ for i.i.d. $z^{(1)},\dots,z^{(N)}$.
\STATE Given any $i,j\in\{1,\dots,N\}$, define $m_{ij} = \sum_{k=i}^{j} f(x^{(k)}, z^{(k)})/ (j - i + 1)$, $s_{ij} = h\sqrt{\log (2N/ \delta)}/ {\sqrt{j - i+1}}$.
\STATE Define upper confidence bound $U(i, j) = m_{ij} + s_{ij}$ for each $i,j\in\{1,\dots,N\}$
\STATE Define lower confidence bound $L(i, j) = m_{ij} -  s_{ij}$ for each $i,j\in\{1,\dots,N\}$
\FOR{$s$ in $\{2, 3, \dots, \lfloor\log_2 N\rfloor\}$}
\STATE Let $S^{(s)}_l=\max \{x^{(i)}\ |\exists i\le j\le k\le l\ s.t.\ U(i,j)<L(k,l) \text{ and } j-i +1 = 2^s, l-k +1 = 2^s \}\cup \{x^{(1)}\}$.
\STATE Let $S^{(s)}_r=\min \{x^{(l)}\ |\exists i\le j\le k\le l\ s.t.\ U(k,l) < L(i,j) \text{ and } j-i +1 = 2^s, l-k +1 = 2^s \}\cup\{x^{(N)}\}$.
\ENDFOR
\STATE Let $\Ibest$ be the best LCB for intervals considered in the above for loop.
\STATE Return New interval $[\max_s S^{(s)}_l, \min_sS^{(s)}_r] \subset \Delta$ and the best interval $\Ibest$.
\end{algorithmic}
\end{algorithm}

\begin{algorithm}
\caption{Modified 1D Unimodal Optimization}
\label{alg:1d_v2}
\begin{algorithmic}[1]
\REQUIRE Confidence parameter $\delta$, budget $B$.
\STATE Initialize $t=1$, $\Delta_1=[0,1]$, $\gap_1=0.5$, $N=N_1=3$.
\WHILE{$N\le B$}
\STATE Call Algorithm~\ref{alg:oneround_v2} with input $6\delta/\pi^2 t^2, \Delta_t, \gap_t$ to obtain outputs $[S_l,S_r]$, $\Ibest$.
\STATE Let $\Delta_{t+1}=[S_l,S_r]$.
\STATE Set $\gap_{t+1}=\gap_t/2$.
\STATE Set $N_{t+1}=1+\frac{S_r-S_l}{\gap_{t+1}}$ //Budget to be consumed by next iteration.
\STATE Set $N=N+N_{t+1}$. //Total budget consumed at end of next iteration.
\STATE Set $t=t+1$.
\ENDWHILE
\STATE Return a random element of $\Ibest$.
\end{algorithmic}
\end{algorithm}

% \begin{algorithm}
% \caption{Modified Unimodal Coordinate Ascent}
% \label{algo:mod}
% \begin{algorithmic}[1]
% \STATE Set $N, t=10$. Choose 10 random points $[0,1]^d$ and set arbitrary $w^t$ as the one with the best observed value.
% \WHILE{$N\le B-d$}
% \STATE Given a starting point $w^t\in[0,1]^d$, initialize $d$ copies of Algorithm~\ref{alg:1d_v2} with $\delta=6/dt^2\pi^2$, budget set to $\infty$ where the $i$th copy considers the function $F_i:[0,1]\to [0,1]$ $F_i(x) = F(w^t_{1},\dots,w^t_{i-1},x,w^t_{i+1},\dots,w^t_{d})$.\\ Let $\Delta^t_{ti}$ be the active interval associated with the $i$th copy.\\
% Let $I^t_{\text{best}, i}$ be the current value of $\Ibest$ maintained by the $i$th copy.
% \WHILE{$w^t_{i}\in \Delta^t_{i}$ for all $i$}
% \IF{$N\le B-d$}
% \STATE Return $w^t$.
% \ENDIF
% \STATE Let $i$ be the dimension sampled from distribution proportional to $\{\exp(s_1), \cdots, \exp(s_d)\}.$ Here $s_i$ is the observed standard deviation of the values for dimension $i$ so far.
% \STATE Run one more epoch of Algorithm~\ref{alg:1d_v2} for dimension $i$.
% \STATE $N=N+ \text{$<$\#new points sampled$>$}$.
% \ENDWHILE
% \STATE Suppose the $j$-th copy is one such copy that eliminated $w^t_{j}$ ($w^t_{j}\notin \Delta^t_{j}$) and $\Delta_{t, j}$ is smallest among all eliminating dimensions.
% \STATE Let $w^{t+1}_{j}$ be the best point selected in $I^t_{\text{best}, j}$, in terms of observed value.
% \STATE Set $w^{t+1}=(w^t_{1},\dots,w^t_{j-1},w^{t+1}_{j},w^t_{j+1},\dots)$ and begin a new round (reuse points if available).
% \ENDWHILE
% \STATE Return $w^t$.
% \end{algorithmic}
% \end{algorithm}

Algorithm~\ref{algo:mod} contains some small but useful modifications of Algorithm~\ref{algo:coord}. We first need to introduce a modified version of Algorithm~\ref{alg:oneround}, given in Algorithm~\ref{alg:oneround_v2}, that has $\tau$ set to zero and only compares intervals of the same size with each other, where the size is a power of 2. This reduces the running time of each round to $O(N)$, where $N$ is the number of points sampled. It can also be shown that this would only affect lower order terms in our final bounds. Algorithm~\ref{alg:1d_v2} is a version of Algorithm~\ref{alg:1d} that uses Algorithm~\ref{alg:oneround_v2} as the sub-routine.

Apart from using the modified Algorithm~\ref{alg:1d_v2} in the inner loop per dimension, we make some other practical changes. For the first ten slots we sample a random point. The best out of these 10 points (in terms of observed value) is set as the starting point $w^t$. Next, out of many possible dimensions that can eliminate the current point in line 12, we select the one that has the smallest surviving interval. Lastly, in the loop at line 4 (when none of the dimensions have managed to eliminate the current point), we choose a dimension from a distribution that gives higher weights to dimensions with a higher standard deviation of observed values so far. The sampled dimension is the one that gets to sample the next epoch of points for Algorithm~\ref{alg:1d_v2}.

\begin{figure}
  \begin{center}
    \includegraphics[scale=0.28]{demo_all_params.png}
  \end{center}
\caption{We illustrate approximate unimodality per coordinate dimension for the task of tuning a CNN on CIFAR-10. Each row corresponds to varying one parameter (coordinate) while all other parameters are held fixed. Each column corresponds to a randomly chosen point in the search space from which one dimension is varied at a time, while keeping all others fixed. The x-axis denotes the hyper-parameter that is varied. We plot the standard error over three runs per point, shown as the shaded area.} \label{fig:demo}
\end{figure}


\subsection{Comparison with Zooming Algorithm}
\label{sec:zooming}
In order to show the advantage of using unimodal structure in real hyper-parameter tuning tasks, we implement a version of the zooming algorithm~\citep{kleinberg2008multi} under our framework and run it on the HPO-B benchmark. In particular, in this version we do not eliminate regions from the left and right (as we would like to not use unimodal structure in the zooming baseline). Instead, we eliminate the current best point along a dimension if any interval containing the current best point has a UCB less than the highest LCB of any interval. The rest of the algorithm is identical to ours with all the improvements mentioned in Section~\ref{sec:experiments}. Note that this algorithm can adapt to an unknown Lipschitz constant, unlike the original algorithm in~\citep{kleinberg2008multi}, and therefore is arguably a stronger baseline. The results are shown in Fig.~\ref{fig:zooming}. It can be seen that \texttt{Zooming Ascent} is worse than GP-UCB while our algorithm that uses unimodal structure is better.

\begin{figure}
  \begin{center}
    \includegraphics[scale=0.50]{with_zooming.png}
  \end{center}
\caption{Comparison with respect to a version of the zooming algorithm~\citep{kleinberg2008multi}} \label{fig:zooming}
\end{figure}


\section{Empirical evidence for unimodality}\label{sec:unimodalevidence}

In order to find evidence for approximate unimodality across coordinate dimensions we run a grid search of five hyper-parameters on the CIFAR-10 dataset. The hyper-parameters are tuned for the AlexNet architecture~\citep{krizhevsky2009learning} where the number of convolutional filters can be changed for the first two layers. The other three hyper-parameters we were allowed to vary were batch size, learning rate and dropout level of the last linear layer. We had a grid of 5 points per dimension and each point was sampled 3 times to estimate the noise in experiments with the same hyper-parameter. Thus the experiment involved training $5^5 \times 3$ models and getting the test accuracy for them.

Figure~\ref{fig:demo} shows the resulting plots. Each row corresponds to varying one parameter (coordinate) while all other parameters are held fixed. Each column corresponds to a randomly chosen point in the search space from which one dimension is varied at a time, while keeping all others fixed. The x-axis denotes the hyper-parameter that is varied. We can see that the black-box function is approximately unimodal per coordinate dimension. We believe that our algorithm works well on benchmarks because such approximate unimodal structure might be present in many real world hyper-parameter tuning tasks.


\end{document}
