% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023}
% after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

\usepackage[usenames,dvipsnames]{xcolor}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%%%%% PREAMBLE %%%%%

% --- Notation shorthands -------------------------------------------------
% Calligraphic letters (\mathcal).
\newcommand{\X}{\mathcal{X}}
\newcommand{\Y}{\mathcal{Y}}
\newcommand{\Z}{\mathcal{Z}}
\newcommand{\cO}{\mathcal{O}}

% Bold 1, arrow shorthand, blackboard-bold letters, further calligraphic
% letters and bold vectors.
% \> and \P are overridden with \renewcommand because both names are already
% defined; in this file \> is used as the arrow in signatures such as
% f: \X \> \R^m.
% NOTE: \blambda expands to a plain (non-bold) \lambda.
\newcommand{\1}{\mathbf{1}}
\renewcommand{\>}{\rightarrow}
\newcommand{\E}{\mathbb{E}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\N}{\mathbb{N}}
\renewcommand{\P}{\mathbb{P}}
\newcommand{\cL}{\mathcal{L}}
\newcommand{\cW}{\mathcal{W}}
\newcommand{\cT}{\mathcal{T}}
\newcommand{\cF}{\mathcal{F}}
\newcommand{\bc}{\mathbf{c}}
\newcommand{\boldf}{\mathbf{f}}
\newcommand{\bp}{\mathbf{p}}
\newcommand{\blambda}{{\lambda}}

\newcommand{\bV}{\mathbb{V}}
\newcommand{\cM}{\mathcal{M}}

% argmax/argmin operators taking the optimization variable/domain as #1.
% \Argmax / \Argmin set #1 underneath the operator (via \underset);
% \argmax / \argmin attach #1 as an ordinary subscript.
\newcommand{\Argmax}[1]{\underset{#1}{\operatorname{argmax}}}
\newcommand{\Argmin}[1]{\underset{#1}{\operatorname{argmin}}}
\newcommand{\argmax}[1]{{\operatorname{argmax}}_{#1}}
\newcommand{\argmin}[1]{{\operatorname{argmin}}_{#1}}


% Upright roman text labels for metrics (ERR, WER, BER) and for the
% validation sample; e.g. \val is used as a superscript in n^\val and S^\val.
\newcommand{\err}{\textrm{\textup{ERR}}}
\newcommand{\wer}{\textrm{\textup{WER}}}
\newcommand{\ber}{\textrm{\textup{BER}}}
\newcommand{\val}{\textrm{\textup{val}}}


% Upright roman text labels, used in this file mainly as superscripts on
% objectives and losses (e.g. L^\rob, L^\bal, L^\tdf, \cL^\mar, \ell^\xent)
% and as the operator name \softmax.
% NOTE: \wc typesets "wor", and the "-d" variants typesets e.g. "rob-d".
\newcommand{\xent}{\textrm{\textup{xent}}}
\newcommand{\dis}{\textrm{\textup{dis}}}
\newcommand{\bal}{\textrm{\textup{bal}}}
\newcommand{\wc}{\textrm{\textup{wor}}}
\newcommand{\rob}{\textrm{\textup{rob}}}
\newcommand{\robd}{\textrm{\textup{rob-d}}}
\newcommand{\bald}{\textrm{\textup{bal-d}}}
\newcommand{\zo}{\textrm{\textup{0-1}}}
\newcommand{\std}{\textrm{\textup{std}}}
\newcommand{\stdd}{\textrm{\textup{std-d}}}
\newcommand{\tdf}{\textrm{\textup{tdf}}}
\newcommand{\tdfd}{\textrm{\textup{tdf-d}}}
\newcommand{\mar}{\textrm{\textup{mar}}}
\newcommand{\oh}{\textrm{\textup{oh}}}
\newcommand{\softmax}{\textrm{\textup{softmax}}}

% \AddLabel{<label>}: steps the equation counter, prints the new equation
% number in parentheses at the current position, and attaches <label> to it.
\newcommand\AddLabel[1]{\refstepcounter{equation}(\theequation)\label{#1}}

% To control spacing in itemized lists
\usepackage{enumitem}

% Algorithm command
\usepackage{algorithm}
\usepackage{algorithmic} 

% Theorems
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% Theorem-like environments (amsthm).
% All numbered environments pass [theorem] and therefore share the single
% `theorem' counter; the starred environments (theorem*, proposition*) are
% declared with \newtheorem* and are unnumbered (used for restated results).
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem*{theorem*}{Theorem}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem*{proposition*}{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% Definitions and assumptions use the `definition' style.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% Remarks use the `remark' style.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
% \usepackage[disable,textsize=tiny]{todonotes}
% \usepackage[textsize=tiny]{todonotes}
 
% Added by Serena: for tables
\usepackage{multirow}
\setlength{\tabcolsep}{3pt}
\usepackage{colortbl}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors

\usepackage{capt-of}
% \usepackage{floatrow}
% % Table float box with bottom caption, box width adjusted to content
% \newfloatcommand{capbtabbox}{table}[][\FBwidth]

\usepackage{boldline}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% bibentry package for referring to main bib references
\usepackage{bibentry}

% xr package for referring to main document
\usepackage{xr}

% Helpers for importing labels from another document with the xr package.
\makeatletter
% \addFileDependency{<file name with extension>}
%   Writes "(<file>)" to the terminal/log with \typeout, registers the file
%   through the kernel's \@addtofilelist, and writes "No file <file>." when
%   the file cannot be found.
%   Every body line ends in % so that the line ends do not turn into space
%   tokens if the macro is ever expanded outside the preamble.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
%   Calls \externaldocument{<basename>} and then records <basename>.tex and
%   <basename>.aux via \addFileDependency.
%   NOTE(review): presumably the log entries let build tools such as latexmk
%   pick up the dependency on the main document -- confirm with the build setup.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

\myexternaldocument{wang_601}

\title{Robust Distillation for Worst-class Performance: \\ On the Interplay Between Teacher and Student Objectives \\ (Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors

\author[1,2]{\href{mailto:<serenawang@google.com>?Subject=Your UAI 2023 paper}{Serena~Wang}{}}
\author[1]{\href{mailto:<hnarasimhan@google.com>?Subject=Your UAI 2023 paper}{Harikrishna~Narasimhan}{}}
\author[1]{\href{mailto:<yichenzhou@google.com>?Subject=Your UAI 2023 paper}{Yichen~Zhou}{}}
\author[3]{\href{mailto:<sarahooker@cohere.com>?Subject=Your UAI 2023 paper}{Sara~Hooker}{}}
\author[1]{\href{mailto:<mlukasik@google.com>?Subject=Your UAI 2023 paper}{Michal~Lukasik}{}}
\author[1]{\href{mailto:<adityakmenon@google.com>?Subject=Your UAI 2023 paper}{Aditya~Krishna~Menon}{}}
% Add affiliations after the authors
\affil[1]{%
    Google Research\\
    Mountain View, California and New York, New York, USA\\
}
\affil[2]{%
    University of California, Berkeley\\
    Berkeley, California, USA\\
}
\affil[3]{%
    Cohere For AI\\
    Palo Alto, California, USA
}
  
  \begin{document}
\onecolumn
\maketitle

\appendix

\section{Proofs}
\label{app:proofs}

\subsection{Proof of Theorem \ref{thm:bayes}}
\label{app:proof-bayes}
(i) The first result follows from the fact that the cross-entropy loss is a proper composite loss \citep{williamson2016composite} with the softmax function as the  associated (inverse) link function.

(ii) For a proof of the second result, please see \citet{menon2020long}. 


(iii) Below, we provide a proof for the third result. 

The minimization of the robust objective in \eqref{eq:robust} over $f$ can be re-written as
a min-max optimization problem:
\begin{align}
\min_{f: \X \> \R^m}\,L^\rob(f) = \min_{f: \X \> \R^m}\,\max_{\lambda \in \Delta_m} 
\underbrace{
\sum_{y=1}^m \frac{\lambda_y}{\pi_y}\E\left[ \eta_y(X)\,\ell(y, f(X)) \right]}_{\omega(\lambda, f)}.
\label{eq:min-max}
\end{align}
% which can be seen as a min-max game between a $\lambda$-player and a $f$-player.

The min-max objective $\omega(\lambda, f)$ is clearly linear in $\lambda$ (for fixed $f$) and 
with $\ell$ chosen to be the cross-entropy loss, is convex in $f$ (for fixed $\lambda$), i.e.,
$\omega(\lambda, \kappa f_1 + (1-\kappa) f_2) \leq \kappa\omega(\lambda,  f_1) + (1-\kappa) \omega(\lambda, f_2), \,\forall f_1, f_2:\X\>\R^m, \kappa\in [0,1]$. Furthermore, $\Delta_m$ is a convex compact set, while the domain of $f$ is convex. It follows 
from Sion's minimax theorem \citep{sion1958general} that:
\begin{align}
\min_{f: \X \> \R^m}\max_{\lambda \in \Delta_m}\,\omega(\lambda, f)
&=\, \max_{\lambda \in \Delta_m}\min_{f: \X \> \R^m}\,\omega(\lambda, f).
\label{eq:min-max-swap}
\end{align}

Let $(\lambda^*, f^*)$ be such that:
\[
\lambda^* \in \Argmax{\lambda \in \Delta_m}\min_{f: \X \> \R^m}\,\omega(\lambda, f);~~~~~
f^* \in \Argmin{f: \X \> \R^m}\max_{\lambda \in \Delta_m}\,\omega(\lambda, f).
\]
Such a $\lambda^*$ exists for the following reason:
for any fixed $\lambda \in \Delta_m$, owing to the use of the cross-entropy loss, a minimizer over $f$
always exists for $\omega(\lambda, f)$, and is given by $f_y(x) = \log\left(\frac{\lambda_y}{\pi_y}\eta_y(x)\right) + C,$ for some $C\in \R$; therefore $\min_{f: \X \> \R^m}\, \omega(\lambda, f)$ is bounded above for any $\lambda$, and $\Delta_m$ being a compact set gives us that there exists a maximizer $\lambda^*$ over this set. Similarly, such an $f^*$ exists for the following reason: the objective $\max_{\lambda \in \Delta_m}\,\omega(\lambda, f)$ takes a bounded value when $f=\eta$, and any minimizer of $\max_{\lambda \in \Delta_m}\,\omega(\lambda, f)$ yields a value below that; because $\omega(\lambda, f) \geq 0$ and is convex in $f$, the minimizer $f^*$ exists.
% This follows from the fact that the cross-entropy loss is a proper composite loss \citep{williamson2016composite} with the softmax function as the 
% associated (inverse) link function.

We then have from \eqref{eq:min-max-swap}:
\begin{align*}
\omega(\lambda^*, f^*) &\leq\, \max_{\lambda \in \Delta_m}\, \omega(\lambda, f^*)
\\
&=\, \min_{f: \X \> \R^m}\max_{\lambda \in \Delta_m}\,\omega(\lambda, f)
\,=\, \max_{\lambda \in \Delta_m}\min_{f: \X \> \R^m}\,\omega(\lambda, f)\\
&=\,\min_{f: \X \> \R^m}\,\omega(\lambda^*, f)
\,\leq\, \omega(\lambda^*, f^*),
\end{align*}
which tells us that $(\lambda^*, f^*)$ is a saddle-point for
\eqref{eq:min-max}, i.e.,
\begin{align*}
\omega(\lambda^*, f^*) &= \max_{\lambda \in \Delta_m}\, \omega(\lambda, f^*) \,=\, \min_{f: \X \> \R^m}\, \omega(\lambda^*, f).
\label{eq:saddle-point}
\end{align*}
Consequently, we have:
\begin{align*}
L^\rob(f^*) &=\, 
\max_{\lambda \in \Delta_m}\, \omega(\lambda, f^*) 
% \,=\,
% \min_{f: \X \> \R^m}\, \omega(\lambda^*, f)\\
% &\leq\, \max_{\lambda \in \Delta_m}\min_{f: \X \> \R^m}\, \omega(\lambda, f)
\,=\, \min_{f: \X \> \R^m}\max_{\lambda \in \Delta_m}\, \omega(\lambda, f) %\\
\,=\, \min_{f: \X \> \R^m}\,L^\rob(f).
\end{align*}
% where the last equality follows from \eqref{eq:min-max}.
% where we can swap the `min' and `max' because strong duality holds, and the last equality uses \eqref{eq:min-max}. 
We thus have that $f^*$ is a minimizer of $L^\rob(f)$.
%
%
%Because $\omega$ is strictly convex in $f$, we also have from \eqref{eq:saddle-point}
%that $f^*$ is the unique minimizer of $\omega(\lambda^*, f)$ over $f$, i.e.\todo{[Hari] Strict convexity not true, but statement still holds}
Furthermore, because $f^*$ is also a minimizer of $\omega(\lambda^*, f)$ over $f$, i.e.,\
\[
f^* \in \Argmin{f: \X \> \R^m} \sum_{y=1}^m \frac{\lambda_y^*}{\pi_y}\E\left[ \eta_y(X)\,\ell(y, f(X)) \right],
\]
%Using the fact that $\ell$ is the softmax cross-entropy loss, which is a proper composite loss \citep{williamson2016composite} with the softmax function as the 
%associated (inverse) link function, 
it follows that:
\[
\softmax_y(f^*(x)) \,\propto\, \frac{\lambda^*_y}{\pi_y}\eta_y(x).
\]

(iv) For the fourth result, we expand the traded-off objective,
and re-write it as:
\begin{align*}
    L^\tdf(f)
         &= (1 - \alpha) L^\bal(f) + \alpha L^\rob(f)\\
         &= 
             (1-\alpha)\frac{1}{m}\sum_{y=1}^m \frac{1}{\pi_y}\E\left[ \eta_y(X)\,\ell(y, f(X)) \right]
             \,+\,
             \alpha \max_{\lambda \in \Delta_m}\,\sum_{y=1}^m \frac{\lambda_y}{\pi_y}\E\left[ \eta_y(X)\,\ell(y, f(X)) \right]\\
        &=  \max_{\lambda \in \Delta_m}
        \underbrace{
            \sum_{y=1}^m \left((1-\alpha)\frac{1}{m} + \alpha\lambda_y\right)\frac{1}{\pi_y}\E\left[ \eta_y(X)\,\ell(y, f(X)) \right]}_{\omega(\lambda, f)}.
    \label{eq:trade-off}
\end{align*}
For a fixed $\lambda$, $\omega(\lambda, f)$ is convex in $f$ (as the loss $\ell$ is the cross-entropy loss),
and for a fixed $f$, $\omega(\lambda, f)$ is linear in $\lambda$. Following the same steps as the proof of (iii), we have that there exists $(\lambda^*, f^*)$ such that
\[
L^\tdf(f^*) \,=\, 
\max_{\lambda \in \Delta_m}\, \omega(\lambda, f^*) \,=\,
\min_{f: \X \> \R^m}\,L^\tdf(f),
\]
and
\[
f^* \in \Argmin{f: \X \> \R^m}\,
            \sum_{y=1}^m \left((1-\alpha)\frac{1}{m} + \alpha\lambda^*_y\right)\frac{1}{\pi_y}\E\left[ \eta_y(X)\,\ell(y, f(X)) \right],
\]
which, owing to the properties of the cross-entropy loss, then gives us the desired form for $f^*$.

\subsection{Proof of Theorem \ref{thm:good-teacher}}
\label{app:proof-good-teacher}
\begin{proof}
Expanding the left-hand side, we have:
\begin{align*}
|\hat{L}^\robd(f) - L^\rob(f)|
&\leq |\hat{L}^\robd(f) - {L}^\robd(f) + {L}^\robd(f) - L^\rob(f)|\\
&\leq |\hat{L}^\robd(f) - {L}^\robd(f)| + |{L}^\robd(f) - L^\rob(f)|\\
&= 
|\hat{L}^\robd(f) - {L}^\robd(f)| +  \left|
                        \max_{y \in [m]} \frac{ \E_{x}\left[ p^t_y(x)\,\ell(y, f(x)) \right] }{ \E_x\left[ p^t_y(x) \right] } -
                        \max_{y \in [m]} \frac{ \E_{x}\left[ \eta_y(x)\,\ell(y, f(x)) \right] }{ \pi_y } \right|\\
&\leq 
|\hat{L}^\robd(f) - {L}^\robd(f)| + \max_{y \in [m]} \left|
                        \frac{ \E_{x}\left[ p^t_y(x)\,\ell(y, f(x)) \right] }{ \E_x\left[ p^t_y(x) \right] } -
                        \frac{ \E_{x}\left[ \eta_y(x)\,\ell(y, f(x)) \right] }{ \pi_y } \right|\\
&\leq
|\hat{L}^\robd(f) - {L}^\robd(f)| +
    \max_{y \in [m]} \E_x\left[
        \left| \frac{p^t_y(x)}{\E_x\left[ p^t_y(x) \right]} \,-\, \frac{\eta_y(x)}{\pi_y} \right|\ell(y, f(x))\right]\\
&\leq
|\hat{L}^\robd(f) - {L}^\robd(f)| +
    B\max_{y \in [m]} \E_x\left[
        \left| \frac{p^t_y(x)}{\E_x\left[ p^t_y(x) \right]} \,-\, \frac{\eta_y(x)}{\pi_y} \right|\right],
\end{align*}
where the second-last step uses Jensen's inequality and the fact that $\ell(y, f(x)) \geq 0$, 
and the last step uses the fact that $\ell(y, f(x)) \leq B$.

Further expanding the first term,
\begin{align*}
|\hat{L}^\robd(f) - L^\rob(f)|
&\leq
    \left| 
        \max_{y \in [m]} \phi_y(f)
        \,-\,
        \max_{y \in [m]} \hat{\phi}_y(f)
    \right|
        +
    B\max_{y \in [m]} \E_x\left[
        \left| \frac{p^t_y(x)}{\E_x\left[ p^t_y(x) \right]} \,-\, \frac{\eta_y(x)}{\pi_y} \right|\right]\\
&\leq
    \max_{y \in [m]}\left| 
         \phi_y(f)
        \,-\,
         \hat{\phi}_y(f)
    \right|
        +
    B\max_{y \in [m]} \E_x\left[
        \left| \frac{p^t_y(x)}{\E_x\left[ p^t_y(x) \right]} \,-\, \frac{\eta_y(x)}{\pi_y} \right|\right],
\end{align*}
as desired.
\end{proof}


\subsection{Calibration of Margin-based Loss}
\label{app:calibration-mar}
To show that the minimizer of the margin-based objective in \eqref{eq:margin-la} also minimizes
the balanced objective in \eqref{eq:balanced-distilled-empirical}, we state the following general result:
\begin{lemma}
\label{lem:helper-dro-1}
    % Suppose $p^t \in \cF$ and $\cF$ is closed under linear transformations. 
    % Let 
    % $$\hat{f} \,\in\, \displaystyle\Argmin{f \in \cF}\, \frac{1}{n}\sum_{i=1}^n \cL^\mar\left(p^t(x_i), f(x_i); \frac{\lambda}{\hat{\pi}^t} \right),$$
    % for some $\lambda \in \Delta_m$. Then:
    % \[
    %     \hat{f}_y(x') = \log\left(\frac{\lambda_y}{\hat{\pi}^t_y} p_y^t(x')\right) + C, ~~\forall (x', y') \in S,
    % \]
    % for some constant $C \in \R$. Moreover, for any example weighting $w \in \R^m_+$, $\hat{f}$ is also a minimizer for
    % the weighted objective:
    % \[
    %     \hat{f} \,\in\, \displaystyle\Argmin{f \in \cF}\,\frac{1}{n}\sum_{i=1}^n w_i\sum_{y=1}^m \frac{\lambda_y}{\hat{\pi}^t_y}  p_y^t(x_i)\, \ell\left( y , {f}(x_i) \right).
    % \]
    Suppose $p^t \in \cF$ and $\cF$ is closed under linear transformations. 
    Let 
    \begin{align}
        \hat{f} &\in\, \displaystyle\Argmin{f \in \cF}\, \frac{1}{n}\sum_{i=1}^n \cL^\mar\left(p^t(x_i), f(x_i); \bc \right)
        \label{eq:margin-empirical}
    \end{align}
    for some cost vector $\bc \in \R_+^m$. Then:
    \[
            \hat{f}_y(x_i) = \log\left(c_y p_y^t(x_i)\right) + C_i, ~~\forall i \in [n],
    \]
for some example-specific constants $C_i \in \R, \forall i \in [n]$. Furthermore,
    for any assignment of example weights $w \in \R^n_+$,
    $\hat{f}$ is also a minimizer of
    the weighted objective:
    \begin{align}
        \hat{f} \,\in\, \displaystyle\Argmin{f \in \cF}\,\frac{1}{n}\sum_{i=1}^n w_i\sum_{y=1}^m c_y\, p_y^t(x_i)\, \ell\left( y , {f}(x_i) \right).
        \label{eq:weighted-loss-empirical}
    \end{align}
\end{lemma}


\begin{proof}%[Proof of Lemma \ref{lem:helper-dro-1}]
Following \citet{menon2020long} (e.g.\ proof of Theorem 1), we have that
for class probabilities $\bp \in \Delta_m$ and costs $\bc \in \R^m_+$, 
the margin-based loss in \eqref{eq:margin-la} 
\begin{align*}
\cL^\mar\left(\bp, \boldf; \bc \right) 
&=
\frac{1}{m}\sum_{y \in [m]} p_y \log\bigg(1 + \sum_{j \ne y}\exp\left(\log(c_y / c_{j}) \,-\, (f_y - f_j) \right) \bigg)
\end{align*}
is minimized by:
\[
        f^*_y = \log\left(c_y p_y\right) + C,
\]
for any $C \in \R$. To see why this is true, note that the above loss can be equivalently written as:
\begin{align*}
\cL^\mar\left(\bp, \boldf; \bc \right) 
&=
-\frac{1}{m}\sum_{y \in [m]} p_y \log\bigg(\frac{ \exp\left(f_y - \log(c_y) \right) }{ \sum_{j=1}^m \exp\left(f_j - \log(c_j) \right) } \bigg).
\end{align*}
This is the same as the softmax cross-entropy loss with adjustments made to the logits, the minimizer for which is of the form:
\[
        f^*_y - \log(c_y) = \log\left(p_y\right) + C~~~~\text{or}~~~~f^*_y = \log\left(c_y p_y\right) + C.
\]


It follows that any minimizer $\hat{f}$ of the
average margin-based loss in \eqref{eq:margin-empirical} over sample $S$,
would do so point-wise, and therefore
\begin{align*}
        \hat{f}_y(x_i) = \log\left(c_y p_y^t(x_i)\right) + C_i, ~~\forall i \in [n],
\end{align*}
for some example-specific constants $C_i \in \R, \forall i \in [n]$.

To prove the second part, we  note that for the minimizer $\hat{f}$ to also minimize 
the weighted objective:
    \[
        \frac{1}{n}\sum_{i=1}^n w_i\sum_{y=1}^m c_y\, p_y^t(x_i)\, \ell\left( y , {f}(x_i) \right),
    \]
it would also have to do so point-wise for each $i \in [n]$, and so as long as the weights $w_i$ are non-negative,
it suffices that
\[
    \hat{f}(x_i) \in \Argmin{\boldf \in \R^m}\, \sum_{y=1}^m c_y\, p_y^t(x_i)\, \ell\left( y , \boldf \right).
\]
This is indeed the case when $\ell$ is the softmax cross-entropy loss, where
the point-wise minimizer for each $i \in [n]$ would be of the form $\softmax_y(f(x_i)) \propto c_y p^t_y(x_i)$,
which is satisfied by $\hat{f}$.
\end{proof}
A similar result also holds in the population limit, when \eqref{eq:margin-empirical}
and \eqref{eq:weighted-loss-empirical} are computed in expectation, and the per-example weighting in \eqref{eq:weighted-loss-empirical} is
replaced by an arbitrary weighting function $w(x) \in \R_+$. Any scorer of the following form
would then minimize both objectives:
\[
\hat{f}_y(x) = \log\left(c_y p_y^t(x)\right) + C(x), ~~\forall x \in \X,
\]
where $C(x)$ is some  example-specific constant.


\subsection{Proof of Proposition \ref{prop:student-form}}
\label{app:student-form}
\begin{proposition*}[Restated]
Suppose $p^t \in \cF$ and $\cF$ is closed under linear transformations. Then the final scoring function $\bar{f}^s(x) = \frac{1}{K} \sum_{k=1}^K f^{k}(x)$ output by Algorithm \ref{algo:dro} is of the form:
    \[
        \softmax_j(\bar{f}^s(x)) \propto \bar{\lambda}_j p_j^t(x),~~~~\forall j \in [m],~ \forall (x, y) \in S,
    \]
where $\bar{\lambda}_y = \left(\prod_{k=1}^K \lambda_y^k / \pi^t_y\right)^{1/K}$. 
\end{proposition*}
\begin{proof}
The proof follows from Lemma \ref{lem:helper-dro-1} with the costs $\bc$ set to $\lambda^k / \pi^t$ for each iteration $k$. 
The lemma tells us that each $f^k$ is of the form:
\[
        f^k_y(x') = \log\left(\frac{\lambda^k_y}{\pi^t_y} p_y^t(x')\right) + C(x'), ~~\forall (x', y') \in S,
    \]
for some example-specific constant $C(x') \in \R$. Consequently, we have that:
    \[
        \bar{f}_y^s(x') = \log(\bar{\lambda}_y p_y^t(x')) + \bar{C}(x'), ~~\forall (x', y') \in S,
    \]
    where $\bar{\lambda}_y = \left(\prod_{k=1}^K \lambda_y^k / \pi^t_y\right)^{1/K}$ and $\bar{C}(x') \in \R$. Applying a
    softmax to $\bar{f}^s$ results in the desired form.
\end{proof}

\subsection{Proof of Theorem \ref{thm:dro}}
\label{app:convergence-dro}

\begin{theorem*}[Restated]
Suppose $p^t \in \cF$ and $\cF$ is closed under linear transformations.  Suppose
$\ell$ is the softmax cross-entropy loss $\ell^\xent$,
$\ell(y, z) \leq B$
and $\max_{y \in [m]}\frac{1}{\pi^t_y} \leq Z$, for some $B, Z > 0$.
Furthermore, suppose for any $\delta \in (0,1)$, the following bound holds on the
estimation error in Theorem \ref{thm:good-teacher}:
with probability at least $1 - \delta$ (over draw of $S \sim D^n$),
for all $f \in \cF$,
\[
\max_{y \in [m]} \big|\phi_y(f) - \hat{\phi}_y(f)\big| \leq \Delta(n, \delta),
\]
for some $\Delta(n, \delta) \in \R_+$ that
is increasing in $1/\delta$, and goes to 0 as $n \> \infty$. Fix $\delta \in (0,1)$.
Then when the step size $\gamma = \frac{1}{2BZ}\sqrt{\frac{\log(m)}{{K}}}$
and $n^\val \geq 8Z\log(2m/\delta)$, with
probability at least $1-\delta$ (over draw of $S \sim D^n$ and $S^\val \sim D^{n^\val}$)
\begin{align*}
L^\rob(\bar{f}^s) &\leq\,
\min_{f \in \cF}L^\rob(f)
        \,+\,
    \underbrace{2B\max_{y \in [m]} \E_x\left[
        \left| \frac{p^t_y(x)}{\pi^t_y} \,-\, \frac{\eta_y(x)}{\pi_y} \right|\right]}_{\text{Approximation error}}\\
        &\hspace{4cm}
        \,+\, \underbrace{2\Delta(n^\val, \delta/2) \,+\,  2\Delta(n, \delta/2)}_{\text{Estimation error}}
        \,+\,
        \underbrace{4BZ\sqrt{\frac{\log(m)}{{K}}}}_{\text{EG convergence}}.
\end{align*}
\end{theorem*}

Before proceeding to the proof, we will find it useful to define:
\begin{align*}
\displaystyle
\hat{\phi}^\val_y(f^s) &= \frac{1}{\hat{\pi}^{t,\val}_y}\frac{1}{n^\val}\sum_{(x', y') \in S^{\val}}  p_y^t(x')\,\ell\left( y , f^s(x') \right).
\end{align*}
We then state a useful lemma.

% \begin{lemma}
%     Suppose for any $\delta \in (0,1)$, the following bound holds on the
% estimation error in Theorem \ref{thm:good-teacher}
% with probability at least $1 - \delta$ (over draw of $S \sim D^n$):
% \[
% |\hat{L}^\robd(f) - L^\robd(f)| \leq \Delta(n, \delta),
% \]
% for some $\Delta(n, \delta) \in \R_+$ that
% is decreasing in sample size $n$ and increasing in $1/\delta$. Then
% with probability at least $1 - \delta$ (over draw of $S \sim D^n$), 
% for any $\lambda \in \Delta_m$
% \[
% \left|\frac{1}{n}\sum_{i=1}^n\sum_{y=1}^m \frac{\lambda_y}{\hat{\pi}^t_y}  p_y^t(x_i)\, \ell\left( y , {f}(x_i) \right)
%     \,-\, \E_{x}\left[\sum_{y=1}^m \frac{\lambda_y}{\hat{\pi}^t_y}  p_y^t(x_i)\, \ell\left( y , {f}(x_i) \right)\right]
%     \right| \leq \Delta(n, \delta)
% \]
% \end{lemma}
% \begin{proof}
% We first note that:
% \begin{align*}
% \frac{1}{n}\sum_{i=1}^n\sum_{y=1}^m \frac{\lambda_y}{\hat{\pi}^t_y}  p_y^t(x_i)\, \ell\left( y , {f}(x_i) \right)
%     \,-\, \E_{x}\left[\sum_{y=1}^m \frac{\lambda_y}{\hat{\pi}^t_y}  p_y^t(x_i)\, \ell\left( y , {f}(x_i) \right)\right]
%     &\leq&
%     \max_{\lambda' \in \Delta_m}\frac{1}{n}\sum_{i=1}^n\sum_{y=1}^m \frac{\lambda_y}{\hat{\pi}^t_y}  p_y^t(x_i)\, \ell\left( y , {f}(x_i) \right)
%     \,-\,  \max_{\lambda' \in \Delta_m}\E_{x}\left[\sum_{y=1}^m \frac{\lambda_y}{\hat{\pi}^t_y}  p_y^t(x_i)\, \ell\left( y , {f}(x_i) \right)\right]
% \end{align*}
% \end{proof}


\allowdisplaybreaks

\begin{lemma}
\label{lem:helper-dro-2}
Suppose the conditions in Theorem \ref{thm:dro} hold.
Then with probability at least $1 - \delta$ (over draw of $S \sim D^{n}$ and $S^\val \sim D^{n^\val}$), at each iteration $k$,
% \[
% \E_{x}\left[\sum_{y=1}^m \frac{\lambda^{k+1}_y}{{\pi}^t_y}  p_y^t(x)\, \ell\left( y , {f}^{k+1}(x) \right)\right]
% \,-\,
% \min_{f \in \cF}\,\E_{x}\left[\sum_{y=1}^m \frac{\lambda^{k+1}_y}{{\pi}^t_y}  p_y^t(x)\, \ell\left( y , {f}(x) \right)\right] 
% \leq \Delta(n, \delta);
% \]
% \[
% \left|\frac{1}{n^\val}\sum_{(x',y') \in S^\val}\sum_{y=1}^m \frac{\lambda^{k+1}_y}{\hat{\pi}^{t,\val}_y}  p_y^t(x'_i)\, \ell\left( y , {f}^{k+1}(x'_i) \right) \,-\, 
% \E_{x}\left[\sum_{y=1}^m \frac{\lambda^{k+1}_y}{{\pi}^t_y}  p_y^t(x)\, \ell\left( y , {f}^{k+1}(x) \right)\right] \right| ~\leq~ \Delta(n^\val, \delta).
% \]
\[
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(f^{k+1})
\,-\,
\min_{f \in \cF}\,\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(f)
\leq 2\Delta(n, \delta);
\]
and for any $\lambda \in \Delta_m$:
\[
\left|\sum_{y=1}^m \lambda_y \hat{\phi}^\val_y(f^{k+1}) \,-\, 
\sum_{y=1}^m \lambda_y  \phi_y(f^{k+1}) \right| ~\leq~ \Delta(n^\val, \delta).
\]
\end{lemma}
\begin{proof}
We first note that by applying Lemma \ref{lem:helper-dro-1}  with $w_i = 1,\forall i$, we have that $f^{k+1}$ is the minimizer
of $\sum_{y=1}^m \lambda^{k+1}_y  \hat{\phi}_y(f)$ over all $f\in \cF$, and therefore:
\begin{equation}
    \sum_{y=1}^m \lambda^{k+1}_y  \hat{\phi}_y(f^{k+1}) \,\leq\, \sum_{y=1}^m \lambda^{k+1}_y  \hat{\phi}_y(f),~\forall f\in \cF.
    \label{eq:f-k+1-minimizer}
\end{equation}

Further, for a fixed iteration $k$, let us denote $\tilde{f} \in \Argmin{f \in \cF}\,  \sum_{y=1}^m\lambda^{k+1}_y  \phi_y({f})$. 
Then for the first part, we have:
\begin{align*}
\lefteqn{
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(f^{k+1})
\,-\,
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(\tilde{f})
}\\
&\leq
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(f^{k+1})
\,-\,
\sum_{y=1}^m \lambda^{k+1}_y \hat{\phi}_y(f^{k+1})
\,+\, \sum_{y=1}^m \lambda^{k+1}_y \hat{\phi}_y(f^{k+1})
\,-\,
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(\tilde{f})
\\
&\leq 
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(f^{k+1})
\,-\,
\sum_{y=1}^m \lambda^{k+1}_y \hat{\phi}_y(f^{k+1})
\,+\, \sum_{y=1}^m \lambda^{k+1}_y \hat{\phi}_y(\tilde{f})
\,-\,
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(\tilde{f})
\\
&\leq
    2\sup_{f \in \cF}\left|
    \sum_{y=1}^m \lambda^{k+1}_y  \hat{\phi}_y(f)
        \,-\,
    \sum_{y=1}^m \lambda^{k+1}_y  \phi_y(f)
    \right|\\
    &\leq
         2\sup_{f \in \cF}\max_{\lambda \in \Delta_m}\, 
        \left|\sum_{y=1}^m \lambda_y  \hat{\phi}_y(f)
        \,-\,
    \sum_{y=1}^m \lambda_y  \phi_y(f)
            \right|\\
    &\leq
        2 \sup_{f \in \cF}\max_{\lambda \in \Delta_m}\, 
        \sum_{y=1}^m {\lambda_y}
        \left|\hat{\phi}_y(f) \,-\, \phi_y(f) \right|
    \\
    &=  2\sup_{f \in \cF}\max_{y \in [m]} \big|\hat{\phi}_y(f) - {\phi}_y(f)\big|,
\end{align*}
where for the second inequality, we use \eqref{eq:f-k+1-minimizer}.
Applying the generalization bound assumed in Theorem \ref{thm:dro},
we have with probability at least $1 - \delta$ (over draw of $S \sim D^{n}$), for all iterations $k \in [K]$,
\[
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(f^{k+1})
\,-\,
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(\tilde{f})
    \,\leq\, 2\Delta(n, \delta).
\]

For the second part, note that for any $\lambda \in \Delta_m$,
\begin{align*}
    \left|\sum_{y=1}^m \lambda_y \hat{\phi}^\val_y(f^{k+1}) \,-\, 
\sum_{y=1}^m \lambda_y  \phi_y(f^{k+1}) \right| &\leq \sum_{y=1}^m \lambda_y\left| \hat{\phi}^\val_y(f^{k+1}) \,-\, 
\phi_y(f^{k+1})\right|\\
&\leq \max_{y\in[m]}\,\left| \hat{\phi}^\val_y(f^{k+1}) \,-\, 
\phi_y(f^{k+1})\right|\\
&\leq \sup_{f\in \cF}\max_{y\in[m]}\,\left| \hat{\phi}^\val_y(f) \,-\, 
\phi_y(f)\right|.
\end{align*}
An application of the generalization bound assumed in Theorem \ref{thm:dro} to empirical estimates from the validation sample
completes the proof.
\end{proof}


We are now ready to prove Theorem \ref{thm:dro}.

\begin{proof}[Proof of Theorem \ref{thm:dro}]
Note that because $\min_{y \in [m]}\pi^{t}_y \geq \frac{1}{Z}$ and $n^\val \geq 8Z\log(2m/\delta)$,
we have by a direct application of Chernoff's bound (along with a union bound over all $m$ classes) that with
probability at least $1-\delta/2$:
\[
\hat{\pi}^{t,\val}_y \geq \frac{1}{2Z}, ~~\forall y \in [m],
\]
and consequently,
$\hat{\phi}_y^\val(f) \leq 2BZ, \forall f \in \cF$. The boundedness of $\hat{\phi}_y^\val$ will then allow us to apply standard convergence guarantees for exponentiated gradient ascent \citep{shalev2011online}. For $\gamma = \frac{1}{2BZ}\sqrt{\frac{\log(m)}{{K}}}$,
the updates on $\lambda$ will give us  with
probability at least $1-\delta/2$:
\begin{equation}
\max_{\lambda \in \Delta_m}\,\frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y \hat{\phi}_y^\val(f^k)
\,\leq\, 
\frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y^k \hat{\phi}_y^\val(f^k)
\,+\,
4BZ\sqrt{\frac{\log(m)}{{K}}}
    \label{eq:phi-upper-bound}
\end{equation}

Applying the second part of Lemma \ref{lem:helper-dro-2} to each iteration $k$, we have
with probability at least $1-\delta$:
\[
\max_{\lambda \in \Delta_m}\,\frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y {\phi}_y(f^k)
\,\leq\, 
\frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y^k {\phi}_y(f^k)
\,+\,
4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta(n^\val, \delta/2),
\]
and applying the first part of Lemma \ref{lem:helper-dro-2} to the RHS, we have with the same probability:
% \todo{Change $\delta/2$ to $\delta/2$}
\begin{align*}
\lefteqn{
\max_{\lambda \in \Delta_m}\,\frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y {\phi}_y(f^k)}\\
&\leq
\frac{1}{K}\sum_{k=1}^K\min_{f \in \cF}\sum_{y=1}^m \lambda_y^k {\phi}_y(f)
\,+\,
4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta(n^\val, \delta/2) \,+\,  2\Delta(n, \delta/2)\\
&\leq
\min_{f \in \cF}\,\frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y^k {\phi}_y(f)
\,+\,
4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta(n^\val, \delta/2) \,+\,  2\Delta(n, \delta/2).
\end{align*}
Note that we have taken a union bound over the high probability statement in \eqref{eq:phi-upper-bound} and that in Lemma \ref{lem:helper-dro-2}. 
Using the convexity of $\phi(\cdot)$ in $f(x)$ and Jensen's inequality, 
we have that $\sum_{y=1}^m \lambda_y {\phi}_y(\bar{f}^s) \leq \frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y {\phi}_y(f^k)$.
We use this to further lower bound the LHS 
in terms of the averaged scoring function $\bar{f}^s(x) = \frac{1}{K}\sum_{k=1}^K f^k(x)$:
\begin{align}
\lefteqn{\max_{\lambda \in \Delta_m}\,\sum_{y=1}^m \lambda_y {\phi}_y(\bar{f}^s)}
\nonumber
\\
    &\leq
        \min_{f \in \cF}\,\frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y^k {\phi}_y(f)
        \,+\,
        4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta(n^\val, \delta/2) \,+\,  2\Delta(n, \delta/2)
        \nonumber\\
    &=
        \min_{f \in \cF}\,\sum_{y=1}^m \tilde{\lambda}_y {\phi}_y(f)
        \,+\,
        4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta(n^\val, \delta/2) \,+\,  2\Delta(n, \delta/2)
        \nonumber\\
    &\leq
        \max_{\lambda \in \Delta_m}\min_{f \in \cF}\,\sum_{y=1}^m {\lambda}_y {\phi}_y(f)
        \,+\,
        4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta(n^\val, \delta/2) \,+\,  2\Delta(n, \delta/2)
        \nonumber\\
    &=
        \min_{f \in \cF}\max_{\lambda \in \Delta_m}\,\sum_{y=1}^m {\lambda}_y {\phi}_y(f)
        \,+\,
        4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta(n^\val, \delta/2) \,+\,  2\Delta(n, \delta/2)
        \nonumber\\
    &=
        \min_{f \in \cF}\max_{y \in [m]}\, {\phi}_y(f)
        \,+\,
        4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta(n^\val, \delta/2) \,+\,  2\Delta(n, \delta/2),
        \label{eq:dro-final}
\end{align}
where in the second step $\tilde{\lambda}_y = \frac{1}{K}\sum_{k=1}^K \lambda^k_y$;
in the fourth step, we swap the `min' and `max' using  Sion's minimax theorem \citep{sion1958general}. 
We further have from \eqref{eq:dro-final},
\begin{align*}
\max_{y \in [m]}\,{\phi}_y(\bar{f}^s)
    &\leq
        \min_{f \in \cF}\max_{y \in [m]}\, {\phi}_y(f)
        \,+\,
        4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta(n^\val, \delta/2) \,+\,  2\Delta(n, \delta/2).
        % \label{eq:dro-final}
\end{align*}
In other words,
\[
    L^\robd(\bar{f}^s)
    \leq
        \min_{f \in \cF}L^\robd(f)
        \,+\,
        4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta(n^\val, \delta/2) \,+\,  2\Delta(n, \delta/2).
\]

To complete the proof, we need to turn this into a guarantee on the original robust objective $L^\rob$ in \eqref{eq:robust}:
\begin{align*}
    \lefteqn{L^\rob(\bar{f}^s)}\\
    &\leq
        \min_{f \in \cF}L^\rob(f)
        \,+\, 2\max_{f \in \cF}\,\left|L^\rob(f) - L^\robd(f)\right|
        \,+\,
        4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta(n^\val, \delta/2) \,+\,  2\Delta(n, \delta/2)\\
    &\leq
    \min_{f \in \cF}L^\rob(f)
        \,+\,
    2B\max_{y \in [m]} \E_x\left[
        \left| \frac{p^t_y(x)}{\pi^t_y} \,-\, \frac{\eta_y(x)}{\pi_y} \right|\right]
        \,+\,
        4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta(n^\val, \delta/2) \,+\,  2\Delta(n, \delta/2),
\end{align*}
where we have used the bound on the approximation error in the proof of Theorem \ref{thm:good-teacher}. %Taking
% a union bound over the two high probability statements 
This completes the proof.
\end{proof}

\section{Student Estimation Error}
\label{app:student-gen-bound}

We now provide a bound on the estimation error in Theorem \ref{thm:dro} using a generalization bound from \citet{menon2021statistical}.
\begin{lemma}
\label{lem:student-gen-bound}
Let $\cF \subseteq \R^\X$ be a given class of scoring functions.
Let $\mathcal{V} \subseteq \R^\X$ denote the class of loss functions $v(x, y) = \ell(y, f(x))$ induced
by scorers $f \in \cF$. Let $\cM_n = \mathcal{N}_\infty(\frac{1}{n}, \mathcal{V}, 2n)$ denote the
uniform $L_\infty$ covering number for $\mathcal{V}$. Fix $\delta \in (0,1)$. 
Suppose $\ell(y, z) \leq B$,
$\pi^t_y \geq \frac{1}{Z}, \forall y \in [m]$, and the
number of samples $n \geq 8Z\log(4m/\delta)$. 
Then with probability $\geq 1 - \delta$
over draw of $S \sim D^n$, for any $f \in \cF$ and $y \in [m]$:
\[
\left| \phi_y(f) - \hat{\phi}_y(f) \right| 
\,\leq\,
CZ\left( \sqrt{\bV_{n,y}(f) \frac{\log(m\cM_n / \delta)}{n}} \,+\, \frac{\log(m\cM_n / \delta)}{n} 
\,+\,
B\sqrt{ \frac{\log(m/\delta)}{n} }\right),
\]
where $\bV_{n,y}(f)$ denotes the empirical variance of the loss values $\{p^t_y(x_i)\cdot\ell(y, f(x_i))\}_{i=1}^n$ for class $y$, and
$C > 0$ is a distribution-independent constant.
\end{lemma}
Notice the dependence on the \emph{variance} that the teacher's predictions induce on the loss. This suggests that the lower the variance in the teacher's predictions, the better the student's generalization. Similar to \citet{menon2021statistical}, one can further show that when the teacher closely approximates the Bayes-probabilities $\eta(x)$, the distilled loss $p^t_y(x_i)\cdot\ell(y, f(x_i))$ has a lower empirical variance than the loss $\ell(y_i, f(x_i))$ computed from one-hot labels.


\begin{proof}[Proof of Lemma \ref{lem:student-gen-bound}]
We begin by defining the following intermediate term:
\[
\tilde{\phi}_y(f) = 
\frac{1}{\pi^t_y}\frac{1}{n}\sum_{i=1}^n  p_y^t(x_i)\,\ell\left( y , f(x_i) \right).
\]
Then for any $y \in [m]$,
\begin{align}
    \left| \phi_y(f) - \hat{\phi}_y(f) \right|
    &\leq 
    \left| \phi_y(f) - \tilde{\phi}_y(f) \right|
    +
    \left| \tilde{\phi}_y(f) - \hat{\phi}_y(f) \right|.
    \label{eq:genbound-inter}
\end{align}
We next bound each of the terms in \eqref{eq:genbound-inter}, starting with the first term:
\begin{align*}
\left| \phi_y(f) - \tilde{\phi}_y(f) \right|
    &= 
        \frac{1}{\pi^t_y}
        \left|
        \E_x\left[ p_y^t(x)\, \ell\left( y , f(x) \right)\right] \,-\, \frac{1}{n}\sum_{i=1}^n  p_y^t(x_i)\,\ell\left( y , f(x_i) \right)
        \right|
        \\
    &\leq
        Z
        \left|
        \E_x\left[ p_y^t(x)\, \ell\left( y , f(x) \right)\right] \,-\, \frac{1}{n}\sum_{i=1}^n  p_y^t(x_i)\,\ell\left( y , f(x_i) \right)
        \right|,
\end{align*}
where we use the fact that $\pi^t_y \geq \frac{1}{Z}, \forall y$. 
Applying the generalization bound from \citet[Proposition 2]{menon2021statistical}, along with a union bound over all $m$ classes, we have with probability at least $1-\delta/2$ over the draw of $S \sim D^n$, for all $y \in [m]$:
\begin{align}
    \left| \phi_y(f) - \tilde{\phi}_y(f) \right|
    &\leq
    C'Z\left( \sqrt{\bV_{n,y}(f) \frac{\log(m\cM_n / \delta)}{n}} \,+\, \frac{\log(m\cM_n / \delta)}{n} \right),
    \label{eq:genbound-term1}
\end{align}
for a distribution-independent constant $C' > 0$.

We next bound the second term in \eqref{eq:genbound-inter}:
\begin{align*}
\left| \tilde{\phi}_y(f) - \hat{\phi}_y(f) \right|
    &= 
        \left|
        \frac{1}{\pi^t_y}
        \,-\,
        \frac{1}{\hat{\pi}^t_y}
        \right|
        \frac{1}{n}\sum_{i=1}^n  p_y^t(x_i)\cdot\ell\left( y , f(x_i) \right)
        \\
    &\leq
        B\left|
        \frac{1}{\pi^t_y}
        \,-\,
        \frac{1}{\hat{\pi}^t_y}
        \right|
        \\
    &=
    \frac{B}{\pi^t_y\hat{\pi}^t_y}
    \left|
        {\pi}^t_y - \hat{\pi}^t_y
    \right|,
\end{align*}
where in the second step we use the fact that $\ell(y, f(x)) \leq B$ and $p_y^t(x) \leq 1$. 


Further note that because $\min_{y \in [m]}\pi^t_y \geq \frac{1}{Z}$ and $n \geq 8Z\log(4m/\delta)$,
we have by a direct application of Chernoff's bound (and a union bound over $m$ classes) that with
probability at least $1-\delta/4$:
\begin{equation}
  \hat{\pi}^t_y \geq \frac{1}{2Z}, \forall y \in [m].
  \label{eq:pi-hat-bound}
\end{equation}

Therefore for any $y \in [m]$:
\begin{align*}
\left| \tilde{\phi}_y(f) - \hat{\phi}_y(f) \right|
    &\leq
    2BZ^2
    \left|
        {\pi}^t_y - \hat{\pi}^t_y
    \right|.
\end{align*}
Conditioned on the above statement, a simple application of Hoeffding's inequality and a union bound over all $y \in [m]$ gives us that with probability at least $1-\delta/4$ over the draw of $S \sim D^n$, for all $y \in [m]$:
\begin{align}
\left| \tilde{\phi}_y(f) - \hat{\phi}_y(f) \right|
    &\leq
    2BZ^2\left(\frac{1}{Z} \sqrt{ 
            \frac{\log(8m/\delta)}{2n} }\right)
    ~= 
    2BZ \sqrt{ \frac{\log(8m/\delta)}{2n} }.
    \label{eq:genbound-term2}
\end{align}

A union bound over the high probability statements in (\ref{eq:genbound-term1}--\ref{eq:genbound-term2}) completes the proof. To see this, note that, for any $\epsilon > 0$ and $y \in [m]$,
\begin{align*}
    \lefteqn{\P\left( \left| {\phi}_y(f) - \hat{\phi}_y(f) \right| \geq \epsilon \right)}
    \\
    &\leq 
      \P\left(
        \left(\left| {\phi}_y(f) - \tilde{\phi}_y(f) \right| \geq \epsilon\right)
        \vee
      \left(\left| \tilde{\phi}_y(f) - \hat{\phi}_y(f) \right| \geq \epsilon\right)
      \right)\\
    &\leq 
      \P\left(\left| {\phi}_y(f) - \tilde{\phi}_y(f) \right| \geq \epsilon\right)
      \,+\,
     \P\left(\left| \tilde{\phi}_y(f) - \hat{\phi}_y(f) \right| \geq \epsilon\right)\\
     &\leq 
      \P\left(\left| {\phi}_y(f) - \tilde{\phi}_y(f) \right| \geq \epsilon\right)
      \,+\,
     \P\left(\hat{\pi}^t_y < \frac{1}{2Z}\right) \cdot
     \P\left(\left| \tilde{\phi}_y(f) - \hat{\phi}_y(f) \right| \geq \epsilon ~\bigg|~ \hat{\pi}^t_y < \frac{1}{2Z}\right)
     \\
     &
     \hspace{5cm}\,+\,
     \P\left(\hat{\pi}^t_y \geq \frac{1}{2Z}\right) \cdot
     \P\left(\left| \tilde{\phi}_y(f) - \hat{\phi}_y(f) \right| \geq \epsilon ~\bigg|~ \hat{\pi}^t_y \geq \frac{1}{2Z}\right)
     \\
    &\leq 
      \P\left(\left| {\phi}_y(f) - \tilde{\phi}_y(f) \right| \geq \epsilon\right)
      \,+\,
     \P\left(\hat{\pi}^t_y < \frac{1}{2Z}\right)
     \,+\,
     \P\left(\left| \tilde{\phi}_y(f) - \hat{\phi}_y(f) \right| \geq \epsilon ~\bigg|~ \hat{\pi}^t_y \geq \frac{1}{2Z}\right),
\end{align*}
which implies that a union bound over (\ref{eq:genbound-term1}--\ref{eq:genbound-term2}) would give us the desired result in Lemma \ref{lem:student-gen-bound}.
\end{proof}

\section{DRO with One-hot Validation Labels}
\label{app:one-hot-vali}
\begin{figure}
\begin{algorithm}[H]
\caption{Distilled Margin-based DRO with One-hot Validation Labels}% for Robust Student}
\label{algo:dro-val}
\begin{algorithmic}
\STATE \textbf{Inputs:} Teacher $p^t$, Student hypothesis class $\cF$, Training set $S$, Validation set $S^\val$, Step-size $\gamma \in \R_+$,
Number of iterations $K$, Loss $\ell$
\STATE \textbf{Initialize:} Student $f^0 \in \cF$, Multipliers $\blambda^0 \in \Delta_m$
\STATE \textbf{For}~{$k = 0 $ to $K-1$}
\STATE ~~~$\tilde{\lambda}^{k+1}_j \,=\, \lambda^k_j\exp\big( \gamma \hat{R}_j \big), \forall j \in [m]$
\STATE \hspace{2cm}\text{where} $\hat{R}_j =$ $\displaystyle\frac{1}{n^\val}\frac{1}{\hat{\pi}^{\val}_j}\sum_{(x, y) \in S^\val} \1(y = j)\,\ell( y , f^k(x) )$
and $\hat{\pi}^{\val}_j = \displaystyle\frac{1}{n^\val}\sum_{(x, y) \in S^\val} \1(y = j)$
% \STATE ~~~~~~~~~$\begin{cases} 
%             \displaystyle\frac{1}{|S^\val|}\frac{1}{\hat{\pi}^t_j}\sum_{(x, y) \in S^\val} p_j^t(x_i)\, \ell( j , f^k(x) ) & \text{:A}\\
%             \displaystyle\frac{1}{|S^\val|}\frac{1}{\hat{\pi}_j}\sum_{(x, y) \in S^\val} \1(y=j)\,\ell( j , f^k(x) ) & \text{:B}
%         \end{cases}$
%\frac{1}{n^\val\hat{\pi}^t_y}\sum_{ p_y^t(x_i)\, \ell\left( y , f^k(x_i) \right)
\STATE ~~~$\lambda^{k+1}_y \,=\, \frac{\tilde{\lambda}^{k+1}_y}{\sum_{j=1}^m \tilde{\lambda}^{k+1}_j}, \forall y$
% \STATE ~~~$\lambda^{k+1}_y \,=\, \frac{\lambda^{k+1}_y}{\sum_{j=1}^m \lambda^{k+1}_j}, \forall y$
\STATE ~~~$f^{k+1} \,\in\, \displaystyle\Argmin{f \in \cF}\, \frac{1}{n}\sum_{i=1}^n \cL^\mar\left(p^t(x_i), f(x_i); \frac{\lambda^{k+1}}{\hat{\pi}^t} \right)$
~~// Replaced with a few steps of SGD
\STATE \textbf{End For}
\STATE \textbf{Output:} $\bar{f}^{s}: x \mapsto \frac{1}{K}\sum_{k =1}^K f^k(x)$
\end{algorithmic}
\end{algorithm}
\end{figure}

The updates on $\lambda$
in Algorithm \ref{algo:dro} use a validation set labeled by the teacher. 
One could instead perform these updates with a curated validation set containing the original one-hot labels. Each of these choices presents different merits. The use of a teacher-labeled validation set is useful
  in many real world scenarios where labeled data is hard to obtain, 
%\citep{Khan2021,dixon2017,veale_2017}, 
 while unlabeled data abounds.
In contrast, the use of one-hot validation labels, although more expensive to obtain,  may
make the student more immune to errors in the teacher's predictions, 
as the coefficients $\lambda$ are now based on an unbiased estimate of 
the student's performance on each class.

Algorithm \ref{algo:dro-val} contains a version of the margin-based DRO described in Section \ref{sec:algorithms}, where instead of teacher labels the original one-hot labels are used in the validation set. 




Before providing a convergence guarantee for this algorithm, we will find it useful
to define the following one-hot metrics:
\begin{align*}
\phi^\oh_y(f^s) &= \frac{1}{\pi_y}\E_x\left[ \eta_y(x)\, \ell\left( y , f^s(x) \right)\right]\\
% \tilde{\phi}^{\oh}_y(f^s) &= \frac{1}{{\pi}_y}\frac{1}{n}\sum_{i=1}^n \eta_y(x_i)\, \ell\left( y , f^s(x_i) \right)\\
\hat{\phi}^{\oh,\val}_y(f^s) &= \frac{1}{\hat{\pi}_y}\frac{1}{n^\val}\sum_{(x', y') \in S^\val}\,\1(y' = y)\, \ell\left( y' , f^s(x') \right).
\end{align*}

\begin{theorem}
\label{thm:dro-oh-vali}
Suppose $p^t \in \cF$ and $\cF$ is closed under linear transformations. Then the final scoring function $\bar{f}^s(x) = \frac{1}{K} \sum_{k=1}^K f^{k}(x)$ output by Algorithm \ref{algo:dro-val} is of the form:
    \[
    \softmax_y(\bar{f}^s(x')) \propto \bar{\lambda}_y p_y^t(x'),~~~~\forall (x', y') \in S,
    \]
where $\bar{\lambda}_y = \left(\prod_{k=1}^K \lambda_y^k / \pi^t_y\right)^{1/K}$. 
Furthermore, 
suppose
$\ell$ is the softmax cross-entropy loss $\ell^\xent$,
$\ell(y, z) \leq B$, for some $B > 0$, and 
% both $\max_{y \in [m]}\frac{1}{\pi^t_y} \leq Z$  
% and 
$\max_{y \in [m]}\frac{1}{\pi_y} \leq Z$, for some $Z > 0$.
Suppose for any $\delta \in (0,1)$, the following holds:
with probability at least $1 - \delta$ (over draw of $S \sim D^n$),
for all $f \in \cF$,
\[
\max_{y \in [m]} \big|\phi^\oh_y(f) - \hat{\phi}^{\oh}_y(f)\big| \leq \Delta^\oh(n, \delta);
~~~~~~~~
\max_{y \in [m]} \big|\phi_y(f) - \hat{\phi}_y(f)\big| \leq {\Delta}(n, \delta),
\]
for some $\Delta^\oh(n, \delta), {\Delta}(n, \delta) \in \R_+$ that
are increasing in $1/\delta$, and go to 0 as $n \> \infty$. 
Fix $\delta \in (0,1)$.
Then when the step size $\gamma = \frac{1}{2BZ}\sqrt{\frac{\log(m)}{{K}}}$
and $n^\val \geq 8Z\log(2m/\delta)$, with
probability at least $1-\delta$ (over draw of $S \sim D^n$ and $S^\val \sim D^{n^\val}$), for any $\tau \in \R_+$,
\begin{align*}
L^\rob(\bar{f}^s) &\leq\,
\min_{f \in \cF}L^\rob(f)
        \,+\,
    \underbrace{2B\max_{y \in [m]} \E_x\left[
        \left| \tau\cdot\frac{p^t_y(x)}{\pi^t_y} \,-\, \frac{\eta_y(x)}{\pi_y} \right|\right]}_{\text{Approximation error}}\\
        &\hspace{4cm}
        \,+\, \underbrace{2\Delta^\oh(n^\val, \delta/2) \,+\,  2\tau\cdot\Delta(n, \delta/2)}_{\text{Estimation error}}
        \,+\,
        \underbrace{4BZ\sqrt{\frac{\log(m)}{{K}}}}_{\text{EG convergence}}.
\end{align*}
\end{theorem}

Comparing this to the bound in Theorem~\ref{thm:dro}, we can see that there
is an additional scaling factor $\tau$, which multiplies the teacher probabilities $p^t_y(x)$ in the approximation error and also appears in the estimation error. When we set $\tau = 1$, the bound looks very similar to that in Theorem~\ref{thm:dro}, except that the estimation error term $\Delta^\oh$ now involves one-hot labels. Therefore, the estimation error may converge more slowly with the sample size, as it no longer benefits from the lower variance that the teacher predictions may offer (see Appendix~\ref{app:student-gen-bound} for details).

The $\tau$-scaling in the
approximation error also means that the teacher is no longer required to exactly match the (normalized) class probabilities $\eta(x)$. In fact, one can set $\tau$ to a value for which the approximation error is the lowest, and in general to a value that minimizes the upper bound in Theorem \ref{thm:dro-oh-vali}, potentially providing us with a tighter convergence rate than Theorem \ref{thm:dro}.

% weaker dependence on the teacher's predictions $p^t(x)$, 
% thanks to the presence of the $\tau$-weighting. Note that when $\tau =1$,
% the approximation error term matches that in Theorem \ref{thm:dro}; but for other values of $\tau \in \R_+$, it can potentially yield lower values. This tells us that the teacher need not exactly mimic the conditional-class probabilities $\eta(x)$, but it suffices that it approximates some positive scaling of $\eta(x)$. 

% However, the estimation error term $\Delta^\oh$ now involves one-hot labels, and 
% does not benefit from the lower variance that the teacher predictions  may offer (see Appendix \ref{app:student-gen-bound} for further details). Furthermore, the scaling $\tau$ also features in the approximation error term; so the choice of $\tau$ in the bound can be seen as controlling a trade-off between achieving low approximation error and low estimation error.

The proof of Theorem \ref{thm:dro-oh-vali} is similar to that of Theorem \ref{thm:dro}, but requires a modified version of Lemma \ref{lem:helper-dro-2}: % in our proof:
\begin{lemma}
\label{lem:helper-dro-3}
Suppose the conditions in Theorem \ref{thm:dro-oh-vali} hold.
% Suppose $\ell(y, z) \leq B$, for some $B > 0$,
% and $\max_{y \in [m]}\frac{1}{\pi_y} \leq Z'$.
With probability $\geq 1 - \delta$ (over draw of $S \sim D^{n}$ and $S^\val \sim D^{n^\val}$), 
at each iteration $k$ and for any $\tau \in \R_+$,
\begin{align*}
{
\sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(f^{k+1})
\,-\,
\min_{f \in \cF}\,\sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(f)}
&\leq 
2\tau\cdot{\Delta}(n, \delta) \,+\,
2B\max_{y \in [m]} \E_x\left[
        \left| \tau\frac{p^t_y(x)}{\pi^t_y} \,-\, \frac{\eta_y(x)}{\pi_y} \right|\right].
% 2B\min_{\tau \in \cT}\max_{y \in [m]} \E_x\left[
%         \left| \tau\frac{p^t_y(x)}{\pi^t_y} \,-\, \frac{\eta_y(x)}{\pi_y} \right|\right] \,+\,
        % 2\tilde{\Delta}(n, \delta/3) \,+\,
        % \,+\,\cO\left(BZ^2\sqrt{ \frac{\log(Mm/\delta)}{n} } \right)
        % \,+\, 2BZ^2\epsilon
\end{align*}
% $
% \cT =\big\{0, \frac{Z}{M-1}, \frac{2Z}{M-1}, \ldots, Z\big\}. 
% $ 
Furthermore, with the same probability, for any $\lambda \in \Delta_m$:
\[
\left|\sum_{y=1}^m \lambda_y \hat{\phi}^{\oh,\val}_y(f^{k+1}) \,-\, 
\sum_{y=1}^m \lambda_y  \phi^\oh_y(f^{k+1}) \right| ~\leq~ \Delta^\oh(n^\val, \delta).
\]
\end{lemma}
\begin{proof}
We first note from Lemma \ref{lem:helper-dro-1} that
because $f^{k+1} \,\in\, \displaystyle\Argmin{f \in \cF}\, \frac{1}{n}\sum_{i=1}^n \cL^\mar\Big(p^t(x_i), f(x_i); \frac{\lambda^{k+1}}{\hat{\pi}^t} \Big)$, we have for the example-weighting
 $w_i = \tau, \forall i$:
% \begin{equation}
%     \tau\frac{1}{n}\sum_{i=1}^n \sum_{y=1}^m\frac{\lambda^{k+1}_y}{\hat{\pi}^t_y} p^t_y(x_i) \ell\left( y , f^{k+1}(x_i) \right) \,\leq\, \tau\frac{1}{n}\sum_{i=1}^n \sum_{y=1}^m\frac{\lambda^{k+1}_y}{\hat{\pi}^t_y} p^t_y(x_i) \ell\left( y , f(x_i) \right),~\forall f\in \cF.
%     \label{eq:f-k+1-minimizer-2}
% \end{equation}
\begin{equation}
    \tau\sum_{y=1}^m \lambda^{k+1}_y  \hat{\phi}_y(f^{k+1}) \,\leq\, \tau\sum_{y=1}^m \lambda^{k+1}_y  \hat{\phi}_y(f),~\forall f\in \cF.
    \label{eq:f-k+1-minimizer-2}
\end{equation}

For a fixed iteration $k$, let us denote $\tilde{f} \in \Argmin{f \in \cF}\,  \sum_{y=1}^m\lambda^{k+1}_y  \phi^\oh_y({f})$. Then for the first part,
we have for any $\tau \in \R_+$:
\begin{align*}
\lefteqn{
\sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(f^{k+1})
\,-\,
\sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(\tilde{f})
\hspace{10cm}}\\
%
&\leq
\tau\left(
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(f^{k+1})
\,-\,
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(\tilde{f})
\right)
\,+\,
\sum_{y=1}^m \lambda^{k+1}_y  \left|\phi^\oh_y(f^{k+1}) 
- \tau\phi_y(f^{k+1})\right| 
\\
&\hspace{9cm}
\,+\,
\sum_{y=1}^m \lambda^{k+1}_y  \left|\phi^\oh_y(\tilde{f}) 
- \tau\phi_y(\tilde{f})\right| 
\\
%
&\leq
\tau\left(
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(f^{k+1})
\,-\,
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(\tilde{f})
\right)
\,+\,
2\max_{f \in \cF}\,
\sum_{y=1}^m \lambda^{k+1}_y  \left|\phi^\oh_y(f) 
- \tau\phi_y(f)\right| 
\\
%
&\leq
\tau\left(
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(f^{k+1})
\,-\,
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(\tilde{f})
\right)
\,+\,
2\max_{f \in \cF}\max_{\lambda \in \Delta_m}\,
\sum_{y=1}^m \lambda_y \left|\phi^\oh_y(f) 
- \tau\phi_y(f)\right| 
\\
%
&\leq
\tau\left(
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(f^{k+1})
\,-\,
\sum_{y=1}^m \lambda^{k+1}_y  \phi_y(\tilde{f})
\right)
\,+\,
2\max_{f \in \cF}\,
\max_{y\in[m]} \left|\phi^\oh_y(f) 
- \tau\phi_y(f)\right|
\\
&\leq
2\tau\sup_{f \in \cF}\max_{y \in [m]} \big|\hat{\phi}_y(f) - {\phi}_y(f)\big|
\,+\,
2\max_{f \in \cF}\,
\max_{y\in[m]} \left|\phi^\oh_y(f) 
- \tau\phi_y(f)\right|. 
% \\
% &\leq
% \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(f^{k+1})
%     \,-\, 
%     \tau\frac{1}{n}\sum_{i=1}^n \sum_{y=1}^m\frac{\lambda^{k+1}_y}{\hat{\pi}^t_y}p^t_y(x_i) \ell\left( y , f^{k+1}(x_i) \right)
%     \hspace{5cm}\\
% &\hspace{2cm}
%     \,+\,
%     \tau\frac{1}{n}\sum_{i=1}^n \sum_{y=1}^m\frac{\lambda^{k+1}_y}{\hat{\pi}^t_y}p^t_y(x_i) \ell\left( y , f^{k+1}(x_i) \right)
%     \,-\,
%     \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(\tilde{f})
% \\
% &\leq
% \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(f^{k+1})
%     \,-\, 
% \tau\frac{1}{n}\sum_{i=1}^n \sum_{y=1}^m\frac{\lambda^{k+1}_y}{\hat{\pi}^t_y}p^t_y(x_i) \ell\left( y , f^{k+1}(x_i) \right)\\
% &\hspace{2cm}
%     \,+\,
%     \tau\frac{1}{n}\sum_{i=1}^n \sum_{y=1}^m\frac{\lambda^{k+1}_y}{\hat{\pi}^t_y}p^t_y(x_i) \ell\left( y , \tilde{f}(x_i) \right)
%     \,-\,
%     \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(\tilde{f})
% \\
% &\leq
%     2\sup_{f \in \cF}\left|
%         \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(f)
%         \,-\, 
%         \tau\frac{1}{n}\sum_{i=1}^n \sum_{y=1}^m\frac{\lambda^{k+1}_y}{\hat{\pi}^t_y}p^t_y(x_i) \ell\left( y , f(x_i) \right)
%     \right|\\
%     &\leq
%          2\sup_{f \in \cF}\max_{\lambda \in \Delta_m}\, 
%         \left|
%             \sum_{y=1}^m \lambda_y  \phi^\oh_y(f)
%             \,-\, 
%             \tau\frac{1}{n}\sum_{i=1}^n \sum_{y=1}^m\frac{\lambda_y}{\hat{\pi}^t_y}p^t_y(x_i) \ell\left( y , f(x_i) \right)
%         \right|\\
%     &\leq
%          2\sup_{f \in \cF}\max_{\lambda \in \Delta_m}\, 
%         \sum_{y=1}^m \lambda_y\left|
%             \phi^\oh_y(f)
%             \,-\, 
%             \tau\frac{1}{n}\sum_{i=1}^n \frac{1}{\hat{\pi}^t_y}p^t_y(x_i) \ell\left( y , f(x_i) \right)
%         \right|\\
%     &\leq
%          2\sup_{f \in \cF}
%             \max_{y \in [m]}\left|
%                 \phi^\oh_y(f)
%                 \,-\, 
%                 \tau\frac{1}{n}\sum_{i=1}^n\frac{1}{\hat{\pi}^t_y}p^t_y(x_i) \ell\left( y , f(x_i) \right)
%         \right|\\
%     &= 2\sup_{f \in \cF}
%         \max_{y \in [m]}\left|
%             \phi^\oh_y(f)
%             \,-\, 
%             \tilde{\phi}^\oh_y(f)
%             \,+\,
%             \tilde{\phi}^\oh_y(f)
%             \,-\,
%             \tau\frac{1}{n}\sum_{i=1}^n\frac{1}{\hat{\pi}^t_y}p^t_y(x_i) \ell\left( y , f(x_i) \right)
%         \right|\\
%     &\leq 2\sup_{f \in \cF}
%         \max_{y \in [m]}
%             \left\{
%             \left|
%             \phi^\oh_y(f)
%             \,-\, 
%             \tilde{\phi}^\oh_y(f)
%             \right|
%             \,+\,
%             \left|
%             \tilde{\phi}^\oh_y(f)
%             \,-\,
%             \tau\frac{1}{n}\sum_{i=1}^n\frac{1}{\hat{\pi}^t_y}p^t_y(x_i) \ell\left( y , f(x_i) \right)
%         \right|\right\}\\
%     &\leq 2\sup_{f \in \cF}
%         \max_{y \in [m]}\
%             \left|
%             \phi^\oh_y(f)
%             \,-\, 
%             \tilde{\phi}^\oh_y(f)
%             \right|
%             \,+\,
%             2\sup_{f \in \cF}
%             \max_{y \in [m]}
%                 \left|
%                 \tilde{\phi}^\oh_y(f)
%                 \,-\,
%                 \tau\frac{1}{n}\sum_{i=1}^n\frac{1}{\hat{\pi}^t_y}p^t_y(x_i) \ell\left( y , f(x_i) \right)
%             \right|,
\end{align*}
where the last inequality re-traces the steps in Lemma \ref{lem:helper-dro-2}. Further applying the generalization bound assumed in Theorem \ref{thm:dro},
we have with probability at least $1 - \delta$ (over draw of $S \sim D^{n}$), for all iterations $k \in [K]$ and any $\tau \in \R_+$,
\begin{equation}
    \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(f^{k+1})
    \,-\,
    \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(\tilde{f})
        \,\leq\, 2\tau\Delta(n, \delta) + 
        2\max_{f \in \cF}\,
    \max_{y\in[m]} \left|\phi^\oh_y(f) 
    - \tau\phi_y(f)\right|.
    \label{eq:dro-vali-inter}
\end{equation}

All that remains is to bound the second term in \eqref{eq:dro-vali-inter}. For any $f \in \cF$ and $y \in [m]$,
\begin{align*}
    \left|\phi^\oh_y(f) 
    - \tau\phi_y(f)\right|
    &\leq
    \left|
    \frac{1}{\pi_y}\E_x\left[ \eta_y(x)\, \ell\left( y , f(x) \right)\right]
    \,-\,
    \frac{\tau}{\pi^t_y}\E_x\left[ p^t_y(x)\, \ell\left( y , f(x) \right)\right]
    \right|\\
    &\leq
    \E_x\left[
    \left|
    \frac{1}{\pi_y} \eta_y(x)\, \ell\left( y , f(x) \right)
    \,-\,
    \frac{\tau}{\pi^t_y} p^t_y(x)\, \ell\left( y , f(x) \right)
    \right|\right]
    \\
    &=
    \E_x\left[
    \left|
    \frac{1}{\pi_y} \eta_y(x)
    \,-\,
    \frac{\tau}{\pi^t_y} p^t_y(x)
    \right|\ell\left( y , f(x) \right)\right]
    \\
    &\leq
    B\E_x\left[
    \left|
    \frac{\eta_y(x)}{\pi_y} 
    \,-\,
    \tau\frac{p^t_y(x)}{\pi^t_y} 
    \right|\right],
\end{align*}
where we use Jensen's inequality in the second step,
the fact that $\ell(y, z)$ is non-negative in the third step, 
and the fact that $\ell(y, z) \leq B$ in the last step. Substituting this upper bound back into \eqref{eq:dro-vali-inter} completes the proof of the first part.

The second part follows from a direct application of the bound on the per-class estimation error $\max_{y \in [m]} \big|\phi^\oh_y(f) - \hat{\phi}^{\oh,\val}_y(f)\big|$.
\end{proof}
%
%where the second inequality follows from \eqref{eq:f-k+1-minimizer-2}.
%
% Next, note that because $\min_{y \in [m]}\pi^{t}_y \geq \frac{1}{Z}$ and $n \geq 8Z\log(4m/\delta)$,
% we have by a direct application of Chernoff's bound 
% and taking a union bound over all $m$ classes
% that with
% probability at least $1-\delta/4$:
% $$
% \hat{\pi}^{t}_y \geq \frac{1}{2}\pi^t_y, \forall y \in [m].
% $$
% Further, using the generalization bound assumed in Theorem \ref{thm:dro-oh-vali}, %
% % coupled with a union bound,
% we have
% with probability at least $1-\delta/3$ (over draw of $S \sim D^n$), for any $k \in [K]$:
% \begin{align*}
% \lefteqn{
% \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(f^{k+1})
% \,-\,
% \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(\tilde{f})
% }\\
%     &\leq
%         2\tilde{\Delta}(n, \delta/3) 
%         \,+\,
%             2\sup_{f \in \cF}
%             \max_{y \in [m]}
%                 \left|
%                 \tilde{\phi}^\oh_y(f)
%                 \,-\,
%                 \tau\frac{1}{n}\sum_{i=1}^n\frac{1}{\hat{\pi}^t_y}p^t_y(x_i) \ell\left( y , f(x_i) \right)\right|
%         \\
%     &=
%         2\tilde{\Delta}(n, \delta/3) 
%         \,+\,
%             2\sup_{f \in \cF}
%             \max_{y \in [m]}
%                 \left|
%                 \frac{1}{n}\sum_{i=1}^n \frac{1}{{\pi}_y}\eta_y(x_i) \ell\left( y , f(x_i) \right)
%                 \,-\,
%                 \tau\frac{1}{n}\sum_{i=1}^n \frac{1}{\hat{\pi}^t_y}p^t_y(x_i) \ell\left( y , f(x_i) \right)\right|
%         \\
%     &\leq
%         2\tilde{\Delta}(n, \delta/3)
%         \,+\,
%             2\sup_{f \in \cF}
%             \max_{y \in [m]}
%                 \frac{1}{n}\sum_{i=1}^n\left|
%                  \frac{1}{{\pi}_y}\eta_y(x_i) 
%                 \,-\,
%                  \tau \frac{1}{\hat{\pi}^t_y}p^t_y(x_i)\right| \ell\left( y , f(x_i) \right)
%         \\
%     &\leq
%     2\tilde{\Delta}(n, \delta/3) \,+\, 
%         2B\max_{y \in [m]} \frac{1}{n}\sum_{i=1}^n
%         \left| \tau\frac{p^t_y(x_i)}{\hat{\pi}^t_y} \,-\, \frac{\eta_y(x_i)}{\pi_y} \right|,
% \end{align*}
% where we have used the fact that $\ell(y, z) \leq B$.

% Next, note that because $\min_{y \in [m]}\pi^{t}_y \geq \frac{1}{Z}$ and $n = \cO\left(\frac{Z\log(m/\delta)}{\epsilon^2}\right)$,
% we have by a direct application of Chernoff's bound 
% and taking a union bound over all $m$ classes
% that with
% probability at least $1-\delta/3$, for each $y \in [m]$:
% $$
% (1-\epsilon)\pi^t_y \leq \hat{\pi}^{t}_y \leq (1+{\epsilon})\pi^t_y,
% $$
% which gives us that
% $$
% \left|\frac{1}{\hat{\pi}^{t}_y} - \frac{1}{\pi^t_y}\right| \leq \max\left\{\frac{\epsilon}{(1-\epsilon)\pi^t_y}, \frac{\epsilon}{(1+\epsilon)\pi^t_y}\right\} \leq 2Z\epsilon
% ~~~~\text{and}~~~~
%  \left| \tau\frac{p^t_y(x_i)}{\hat{\pi}^t_y} \,-\, \tau\frac{p^t_y(x_i)}{{\pi}^t_y} \right|
%  \leq 2\tau Z\epsilon \leq Z^2\epsilon.
% $$
% Plugging this back into the above inequality, and taking a
% union bound over both high probability statements,
% we have
% with probability at least $1-2\delta/3$ (over draw of $S \sim D^n$), for any $k \in [K]$:
% \begin{align*}
% {
% \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(f^{k+1})
% \,-\,
% \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(\tilde{f})
% }
% &\leq
%     2\tilde{\Delta}(n, \delta/3) \,+\, 
%         2B\max_{y \in [m]} \frac{1}{n}\sum_{i=1}^n
%         \left| \tau\frac{p^t_y(x_i)}{\hat{\pi}^t_y} \,-\, \frac{\eta_y(x_i)}{\pi_y} \right| \,+\, 2BZ^2\epsilon.
% \end{align*}
%
% Furthermore, because the above statement holds for any example weighting $\tau \in \cT$, we have with the same probability:
% \begin{align}
% {\sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(f^{k+1})
% \,-\,
% \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(\tilde{f})}
% &\leq
%     2\tilde{\Delta}(n, \delta/3) \,+\, 
%         2B
%         \min_{\tau \in \cT}\max_{y \in [m]} \underbrace{\frac{1}{n}\sum_{i=1}^n
%         \left| \tau\frac{p^t_y(x_i)}{\hat{\pi}^t_y} \,-\, \frac{\eta_y(x_i)}{\pi_y} \right|}_{\Omega(x, y, \tau)} \,+\, 2BZ^2\epsilon.
%         \label{eq:oh-vali-helper}
% \end{align}
% % where we have absorbed the multiplication by 2 into the weights $w_i$.

% We next apply Hoeffding's inequality to the second term. Noting that the inner term
% $
% \Omega(x, y, \tau) \,\leq\, \max\{\frac{\eta_y(x)}{\pi_y}, \tau\frac{p^t_y(x)}{\pi^t_y}\} \leq Z^2,
% $
% we have for each $y \in [m]$ and $\tau \in \cT$, with probability at least $1-\delta/3Mm$ (over draw of $S \sim D^n$).
% \[
%     \frac{1}{n}\sum_{i=1}^n
%         \Omega(x_i, y, \tau)
%         \,\leq\,
%         \E_x\left[\Omega(x, y, \tau)\right] 
%         \,+\, \cO\left(BZ^2\sqrt{ \frac{\log(Mm/\delta)}{n} }\right)
% \]
% Taking a union bound over all $y \in [m]$ and $\tau \in \cT$, we have with probability at least $1-\delta/3$ (over draw of $S \sim D^n$),
% for each $y \in [m]$ and $\tau \in \cT$:
% \[
%  \frac{1}{n}\sum_{i=1}^n
%         \Omega(x_i, y, \tau)
%         \,\leq\,
%         \E_x\left[\Omega(x, y, \tau)\right] 
%         \,+\, \cO\left(BZ^2\sqrt{ \frac{\log(Mm/\delta)}{n} }\right)
% \]
% Plugging this back into \eqref{eq:oh-vali-helper} (along with a union bound over both the high probability statements),
% we have we have with probability at least $1-\delta$ (over draw of $S \sim D^n$),
% for $\tau \in \cT$:
% \[
% \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(f^{k+1})
% \,-\,
% \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(\tilde{f})
% \,\leq\, 
% 2\Delta^\oh(n, \delta/3) \,+\, 2B\max_{y \in [m]} \E_x\left[
%         \Omega(x, y, \tau)\right] 
%         \,+\, \cO\left(BZ^2\sqrt{ \frac{\log(Mm/\delta)}{n} }\right) \,+\, 2BZ^2\epsilon.
% \]
% Consequently, with the same probability,
% \begin{align*}
% \lefteqn{\sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(f^{k+1})
% \,-\,
% \sum_{y=1}^m \lambda^{k+1}_y  \phi^\oh_y(\tilde{f})}\\
% &\leq\, 
% 2\tilde{\Delta}(n, \delta/3) \,+\, 2B\min_{\tau \in \cT}\max_{y \in [m]} \E_x\left[
%         \Omega(x, y, \tau)\right] 
%         \,+\, \cO\left(BZ^2\sqrt{ \frac{\log(Mm/\delta)}{n} }\right) \,+\, 2BZ^2\epsilon,
% \end{align*}
% as desired.
%
% The second part follows from a direct application of the bound on the per-class estimation error $\max_{y \in [m]} \big|\phi^\oh_y(f) - \hat{\phi}^{\oh,\val}_y(f)\big|$.


\begin{proof}[Proof of Theorem \ref{thm:dro-oh-vali}]
The proof traces the same steps as 
Proposition \ref{prop:student-form} and
Theorem \ref{thm:dro}, except that 
it applies Lemma \ref{lem:helper-dro-3} instead of Lemma \ref{lem:helper-dro-2}.

Note that because $\min_{y \in [m]}\pi_y \geq \frac{1}{Z}$ and $n^\val \geq 8Z\log(2m/\delta)$,
we have by a direct application of Chernoff's bound (along with a union bound over all $m$ classes) that with
probability at least $1-\delta/2$:
\[
\hat{\pi}^{\oh,\val}_y \geq \frac{1}{2Z}, \forall y \in [m],
\]
and consequently,
$\hat{\phi}_y^{\oh,\val}(f) \leq 2BZ, \forall f \in \cF$. The boundedness of $\hat{\phi}_y^{\oh,\val}$ will then allow us to apply standard convergence guarantees for exponentiated gradient ascent \citep{shalev2011online}. For $\gamma = \frac{1}{2BZ}\sqrt{\frac{\log(m)}{{K}}}$,
the updates on $\lambda$ will give us:
\[
\max_{\lambda \in \Delta_m}\,\frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y \hat{\phi}_y^{\oh,\val}(f^k)
\,\leq\, 
\frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y^k \hat{\phi}_y^{\oh,\val}(f^k)
\,+\,
4BZ\sqrt{\frac{\log(m)}{{K}}}
\]

Applying the second part of Lemma \ref{lem:helper-dro-3} to each iteration $k$, we have
with probability at least $1-\delta$:
\[
\max_{\lambda \in \Delta_m}\,\frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y {\phi}^\oh_y(f^k)
\,\leq\, 
\frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y^k {\phi}^\oh_y(f^k)
\,+\,
4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta^\oh(n^\val, \delta/2),
\]
and applying the first part of Lemma \ref{lem:helper-dro-3} to the RHS, we have with the same probability, for any $\tau \in \R_+$:
% \todo{Change $\delta/2$ to $\delta/2$}
\begin{align*}
{
\max_{\lambda \in \Delta_m}\,\frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y {\phi}^\oh_y(f^k)}
&\leq
\frac{1}{K}\sum_{k=1}^K\min_{f \in \cF}\sum_{y=1}^m \lambda_y^k {\phi}^\oh_y(f)
\,+\,
4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta^\oh(n^\val, \delta/2) 
\\
&
\hspace{1cm}
\,+\,  2\tau\Delta(n, \delta/2)
\,+\, 2B\max_{y \in [m]} \E_x\left[
        \left| \tau\frac{p^t_y(x)}{\pi^t_y} \,-\, \frac{\eta_y(x)}{\pi_y} \right|\right]
\\
&\leq
\min_{f \in \cF}\,\frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y^k {\phi}^\oh_y(f)
\,+\,
4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta^\oh(n^\val, \delta/2)
\\
&
\hspace{1cm}
\,+\,  2\tau\Delta(n, \delta/2)
\,+\, 2B\max_{y \in [m]} \E_x\left[
        \left| \tau\frac{p^t_y(x)}{\pi^t_y} \,-\, \frac{\eta_y(x)}{\pi_y} \right|\right].
\end{align*}
Using the convexity of $\phi(\cdot)$ in $f(x)$ and Jensen's inequality, 
we have that $\sum_{y=1}^m \lambda_y {\phi}_y(\bar{f}^s) \leq \frac{1}{K}\sum_{k=1}^K\sum_{y=1}^m \lambda_y {\phi}_y(f^k)$.
We use this to further lower bound the LHS 
in terms of the averaged scoring function $\bar{f}^s(x) = \frac{1}{K}\sum_{k=1}^K f^k(x)$, and re-trace the steps in Theorem \ref{thm:dro} to get:
\begin{align*}
{\max_{y \in [m]}\, {\phi}^\oh_y(\bar{f}^s)}
    &\leq
        \min_{f \in \cF}\max_{y \in [m]}\, {\phi}^\oh_y(f)
        \,+\,
        4BZ\sqrt{\frac{\log(m)}{{K}}} \,+\, 2\Delta^\oh(n^\val, \delta/2)
        \nonumber
        \\
    &\hspace{1cm}
        \,+\,  2\tau\Delta(n, \delta/2)
\,+\, 2B\max_{y \in [m]} \E_x\left[
        \left| \tau\frac{p^t_y(x)}{\pi^t_y} \,-\, \frac{\eta_y(x)}{\pi_y} \right|\right].
\end{align*}
Noting that $L^\rob(f) = \max_{y \in [m]}\, {\phi}^\oh_y(f)$ completes the proof.
\end{proof}




\section{DRO for Traded-off Objective}
\label{app:dro-general-algo}
\begin{figure}
\begin{algorithm}[H]
\caption{Distilled Margin-based DRO for Traded-off Objective}% for Robust Student}
\label{algo:dro-general}
\begin{algorithmic}
\STATE \textbf{Inputs:} Teacher $p^t$, Student hypothesis class $\cF$, Training set $S$, Validation set $S^\val$, Step-size $\gamma \in \R_+$,
Number of iterations $K$, Loss $\ell$, Trade-off parameter $\alpha$
\STATE \textbf{Initialize:} Student $f^0 \in \cF$, Multipliers $\blambda^0 \in \Delta_m$
\STATE \textbf{For}~{$k = 0 $ to $K-1$}
\STATE ~~~$\tilde{\lambda}^{k+1}_j \,=\, \lambda^k_j\exp\big( \gamma \alpha \hat{R}_j \big), \forall j \in [m]$
~\text{where} $\hat{R}_j =$ $\displaystyle\frac{1}{n^\val}\frac{1}{\hat{\pi}^{t,\val}_j}\sum_{(x, y) \in S^\val} p_j^t(x)\, \ell( j , f^k(x) )$
% \STATE ~~~~~~~~~$\begin{cases} 
%             \displaystyle\frac{1}{|S^\val|}\frac{1}{\hat{\pi}^t_j}\sum_{(x, y) \in S^\val} p_j^t(x_i)\, \ell( j , f^k(x) ) & \text{:A}\\
%             \displaystyle\frac{1}{|S^\val|}\frac{1}{\hat{\pi}_j}\sum_{(x, y) \in S^\val} \1(y=j)\,\ell( j , f^k(x) ) & \text{:B}
%         \end{cases}$
%\frac{1}{n^\val\hat{\pi}^t_y}\sum_{ p_y^t(x_i)\, \ell\left( y , f^k(x_i) \right)
\STATE ~~~$\lambda^{k+1}_y \,=\, \frac{\tilde{\lambda}^{k+1}_y}{\sum_{j=1}^m \tilde{\lambda}^{k+1}_j}, \forall y$
\STATE ~~~
    $\beta^{k+1}_y =\,(1 - \alpha)\frac{1}{m} \,+\, \alpha\lambda^{k+1}_y$
% \STATE ~~~$\lambda^{k+1}_y \,=\, \frac{\lambda^{k+1}_y}{\sum_{j=1}^m \lambda^{k+1}_j}, \forall y$
\STATE ~~~$f^{k+1} \,\in\, \displaystyle\Argmin{f \in \cF}\, \frac{1}{n}\sum_{i=1}^n \cL^\mar\left(p^t(x_i), f(x_i); \frac{\beta^{k+1}}{\hat{\pi}^t} \right)$
~~// Replaced with a few steps of SGD
\STATE \textbf{End For}
\STATE \textbf{Output:} $\bar{f}^{s}: x \mapsto \frac{1}{K}\sum_{k =1}^K f^k(x)$
\end{algorithmic}
\end{algorithm}
\end{figure}

We present a variant of the margin-based DRO algorithm described in Section \ref{sec:algorithms}
that seeks to minimize a trade-off between the balanced and robust student objectives:
$$\displaystyle \hat{L}^\tdfd(f^s) = (1-\alpha)\hat{L}^\bald(f^s) + \alpha\hat{L}^\robd(f^s),$$
for some $\alpha \in [0,1]$. 

Expanding this, we have:
\begin{align*}
    L^\tdfd(f)
         &= 
             (1-\alpha)\frac{1}{m}\sum_{y=1}^m \frac{1}{\hat{\pi}^t_y}\frac{1}{n}\sum_{i=1}^n p^t_y(x_i)\,\ell(y, f(x_i))
             \,+\,
             \alpha \max_{y \in [m]}\,\frac{1}{\hat{\pi}^t_y}\frac{1}{n}\sum_{i=1}^n p^t_y(x_i)\,\ell(y, f(x_i))\\
        &= 
             (1-\alpha)\frac{1}{m}\sum_{y=1}^m \frac{1}{\hat{\pi}^t_y}\frac{1}{n}\sum_{i=1}^n p^t_y(x_i)\,\ell(y, f(x_i))
             \,+\,
             \alpha \max_{\lambda \in \Delta_m}\,\sum_{y=1}^m \frac{\lambda_y}{\hat{\pi}^t_y}\frac{1}{n}\sum_{i=1}^n p^t_y(x_i)\,\ell(y, f(x_i))\\
        &= 
             \max_{\lambda \in \Delta_m}\sum_{y=1}^m\left((1-\alpha)\frac{1}{m} + \alpha\lambda_y\right) \frac{1}{\hat{\pi}^t_y}\frac{1}{n}\sum_{i=1}^n p^t_y(x_i)\,\ell(y, f(x_i)).
        % &=  \max_{\lambda \in \Delta_m}
        % \underbrace{
        %     \sum_{y=1}^m \left((1-\alpha)\frac{1}{m} + \alpha\lambda_y\right)\frac{1}{\pi_y}\E\left[ \eta_y(X)\,\ell(y, f(X)) \right]}_{\omega(\lambda, f)}.
\end{align*}
The minimization of $ L^\tdfd(f)$ over $f$ can then be cast as a min-max problem:
\begin{align*}
    \min_{f:\X\>\R^m}\,L^\tdfd(f)
         &= 
             \min_{f:\X\>\R^m}\,\max_{\lambda \in \Delta_m}\sum_{y=1}^m\left((1-\alpha)\frac{1}{m} + \alpha\lambda_y\right) \frac{1}{\hat{\pi}^t_y}\frac{1}{n}\sum_{i=1}^n p^t_y(x_i)\,\ell(y, f(x_i)).
        % &=  \max_{\lambda \in \Delta_m}
        % \underbrace{
        %     \sum_{y=1}^m \left((1-\alpha)\frac{1}{m} + \alpha\lambda_y\right)\frac{1}{\pi_y}\E\left[ \eta_y(X)\,\ell(y, f(X)) \right]}_{\omega(\lambda, f)}.
\end{align*}
Retracing the steps in the derivation of Algorithm \ref{algo:dro} in Section \ref{sec:algorithms}, we
have the following updates on $\lambda$ and $f$ to solve the above min-max problem:
\begin{align*}
    \tilde{\lambda}^{k+1}_y &= \lambda^k_y\exp\bigg( \gamma\alpha\frac{1}{n\hat{\pi}^t_y}\sum_{i=1}^n  p_y^t(x_i)\, \ell\left( y , f^k(x_i) \right) \bigg), \forall y\\
    \lambda^{k+1}_y &=\, \frac{\tilde{\lambda}^{k+1}_y}{\sum_{j=1}^m \tilde{\lambda}^{k+1}_j}, \forall y\\
    \beta^{k+1}_y &=\,(1 - \alpha)\frac{1}{m} \,+\, \alpha\lambda^{k+1}_y\\
    f^{k+1} &\in\, \Argmin{f \in \cF} \sum_{y \in [m]}\frac{\beta^{k+1}_y}{n\hat{\pi}^t_y}\sum_{i=1}^n  p_y^t(x_i)\, \ell\left( y , f(x_i) \right),
\end{align*}
for step-size parameter $\gamma > 0$. 
To better handle training of over-parameterized students, we will perform the updates on $\lambda$ using a held-out validation set,
and employ a margin-based surrogate for performing the minimization over $f$.  This procedure is outlined in Algorithm \ref{algo:dro-general}.

\subsection{Connection to post-hoc adjustment}
%\label{sec:post-hoc-adjustment}
The form of the student %in the self-distillation setup 
in Proposition \ref{prop:student-form} raises an interesting question. Instead of training an explicit student model, 
why not directly construct a new scoring model by making post-hoc adjustments
to the teacher's predictions? Specifically, one could optimize over functions of the form $f^s_y(x) = \log(\gamma_y p^t_y(x)),$ where the teacher $p^t$ is fixed, and pick 
the coefficients $\gamma \in \R^m$ so that the resulting scoring function yields the best worst-class accuracy on a held-out dataset.  
This simple \emph{post-hoc adjustment} strategy 
may not be feasible if the goal is to distill to a student that is considerably smaller than the teacher. Often, this is the case in settings where distillation is used as a compression technique.
% feasible if the teacher model is too complex to deploy in practice, and one desires a student with smaller complexity. 
Yet, this post-hoc method %may still be useful in providing a rough estimate of the worst-case performance that a student can hope to 
% achieve when using a particular teacher, and will 
serves as a good baseline to compare with.

\section{Additional experiment details}\label{app:experiment_details}
% \todo{Mention details about why assumptions are satisfied}
This section contains further experiment details about the datasets, hyperparameters, and baselines. 


\subsection{Additional details about datasets}\label{app:datasets}

\subsubsection{Building long tailed datasets}
The long-tailed datasets were created from the original datasets following \citet{cui2019class} by downsampling examples with an exponential decay in the per-class sizes. As done by \citet{narasimhan2021training}, we set
the imbalance ratio $\frac{\max_i P(y=i)}{\min_i P(y=i)}$ to 100 for CIFAR-10 and CIFAR-100, and to 83 for TinyImageNet
(the slightly smaller ratio is to ensure that the smallest class is of a reasonable size).
We use the long-tail version of ImageNet generated by \citet{liu2017sphereface}.

\subsubsection{Dataset splits}
The original test samples for  CIFAR-10, CIFAR-10-LT, CIFAR-100, CIFAR-100-LT, TinyImageNet (200 classes), TinyImageNet-LT (200 classes), and ImageNet (1000 classes)
are all balanced. Following \citet{narasimhan2021training}, we randomly split them in half and use half the samples as a validation set, and the other half as a test set. For the CIFAR and TinyImageNet datasets, this amounts to using a validation set of size 5000. For the ImageNet dataset, we sample a subset of 5000 examples from the validation set each time we update the Lagrange multipliers in Algorithm \ref{algo:dro}.

%For the ImageNet datasets, we also split the original balanced test samples into two sets: 5000 samples as a validation set, and another 24600 samples as a test set. 


In keeping with prior work \citep{menon2020long, narasimhan2021training, lukasik2021teachers}, we use the same validation and test sets for the long-tailed training sets as we do for the original versions. For the long tailed training sets, this simulates a scenario where the training data follows a long tailed distribution due to practical data collection limitations, but the test distribution of interest still comes from the original data distribution. In plots, the ``balanced accuracy'' that we report for the long-tail datasets (e.g., CIFAR-10-LT) is actually the standard accuracy calculated over the balanced test set, which is shared with the original balanced dataset (e.g., CIFAR-10).

Both teacher and student were always trained on the same training set. 

The CIFAR
datasets had images of size 32 $\times$ 32, while the TinyImageNet and ImageNet datasets
had images of size 224 $\times$ 224.

These datasets do not contain personally identifiable information or offensive content. The CIFAR-10 and CIFAR-100 datasets are licensed under the MIT License. The terms of access for ImageNet are given at \url{https://www.image-net.org/download.php}.

% \subsection{Dataset augmentation}
% The CIFAR
% datasets had images of size 32 $\times$ 32, while the TinyImageNet and ImageNet datasets had images of size 224 $\times$ 224. 
% For the CIFAR datasets, we employed the same data augmentation strategy used by \citet{menon2020long}, with four pixels
% padded to each side of an image, a random 32 $\times$ 32 patch of the image cropped, and the image flipped
% horizontally with probability 0.5.

\subsection{Additional details about training and hyperparameters}
\label{app:setup-details}
% \subsubsection{Code}
% \label{app:code}
% We have made our code available as a part of the supplementary material.

\subsubsection{Training details and hyperparameters}

\paragraph{Temperature hyperparameters.} We apply temperature scaling to the teacher scores on both the training set and validation set when training the student, i.e., compute $p^t(x) = \softmax(f^t(x) / \gamma)$, and  vary the temperature parameter $\gamma$  over a range of $\{1, 3, 5\}$. When training with teacher labels on the validation set (Algorithm \ref{algo:dro}), we vary the temperature parameters independently for the training set and the validation set. That is, we apply $p^t(x) = \softmax(f^t(x) / \gamma_{\text{train}})$ over the training set and $p^t(x) = \softmax(f^t(x) / \gamma_{\text{val}})$ over the validation set. When teacher labels are applied to the validation set, we additionally include a temperature of 0.1 on the teacher's validation set labels to approximate a hard thresholding of the teacher probabilities. Thus, the final hyperparameter search spaces are $\gamma_{\text{train}} \in \{1, 3, 5\}$, and $\gamma_{\text{val}} \in \{0.1, 1, 3, 5\}$.

Unless otherwise specified, in all tables, the temperature hyperparameters were chosen to achieve the best worst-class accuracy on the validation set. In all scatter plots such as Figure \ref{fig:alphas_cifar10}, for each $\alpha^t, \alpha^s$ combination, temperature hyperparameters were selected to achieve the best worst-class accuracy on the validation set. 

\paragraph{Learning rate hyperparameters.} All models were trained using SGD with momentum of 0.9 \citep{lukasik2021teachers, narasimhan2021training}. 
%Details about the number of batch sizes, number of epochs and initial step sizes are provided in Table \ref{tab:datasets}. 
% All models were trained for 450 epochs for CIFAR-10, CIFAR10-LT, CIFAR-100, and CIFAR-100-LT, and for 200 epochs for TinyImageNet. 

The learning rate schedules were chosen to mimic the settings in prior work \citep{narasimhan2021training, lukasik2021teachers}.
 %
% learning rates followed a cosine schedule for all long tailed CIFAR datasets \citep{loshchilov2016sgdr}.
% For TinyImageNet, we use an anneal schedule.
For CIFAR-10 and CIFAR-100 datasets, we ran the optimizer for 450 epochs, linearly warming up the learning rate till the 15th epoch, and then applied a step-size decay of 0.1 after the 200th, 300th and 400th epochs, as done by \citet{lukasik2021teachers}.
For the long-tail versions of these datasets, we trained for 256 epochs,
linearly warming up the learning rate till the 15th epoch, and then applied a step-size decay of 0.1 after the 96th, 192nd and 224th epochs, as done by \citet{narasimhan2021training}. Similarly, for the TinyImageNet datasets, we train for 200 epochs,  linearly warming up the learning rate till the 5th epoch, and then applying a decay of 0.1 after the 75th and 135th epochs, as done by \citet{narasimhan2021training}. For ImageNet, we train for 90 epochs, linearly warming up the learning rate till the 5th epoch, then applying a decay of 0.1 after the 30th, 60th and 80th epochs, as done by \citet{lukasik2021teachers}. %For ImageNet-LT, linearly warm up the learning rate till the 5th epoch, then apply a decay of 0.1 after the 5th, 30th, 60th and 80th epochs. 
We used a batch size of 128 for the CIFAR-10 and the long-tailed TinyImageNet datasets \citep{narasimhan2021training}, a batch size of 512 for the balanced ImageNet dataset, a batch size of 2048 for the balanced TinyImageNet dataset, and a batch size of 1024 for other datasets \citep{lukasik2021teachers}.

We apply an $L_2$ weight decay of $10^{-4}$ in all our SGD updates \citep{lukasik2021teachers}. This amounts to applying an \emph{$L_2$ regularization} on the model parameters, and has the effect of keeping the model parameters (and as a result the loss function) bounded. 

When training with the margin-based robust objective (see Algorithm \ref{algo:dro}), separate step sizes were applied for training the main model function $f$ and for updating the multipliers $\lambda$. %Exact batch sizes and learning rates for each dataset are given in Table \ref{tab:datasets}, and were selected to achieve convergence on the training set.
We set the step size for the $\lambda$ updates to 0.1 in all experiments.

% For the ImageNet experiments, we vary the temperature applied to the teacher score distributions on the training set over a range of $\{1, 3, 5\}$, and only include evaluations with one-hot labels on the validation set.

% \begin{table}[!ht]
% \caption{Hyperparameters per dataset}
% \label{tab:datasets}
% \vskip 0.15in
% \begin{center}
% \begin{small}
% \begin{tabular}{lcccccc}
% \toprule
% Dataset & \#num-classes & minibatch size & epochs & base step size ($f$) & step size ($\lambda$) \\
% \midrule
% CIFAR-10  & 10 & 128 & 450 & 0.1 & 0.1 \\
% CIFAR-10-LT & 10 & 128 & 450 & 0.1 & 0.1 \\
% CIFAR-100 & 100  & 1024 & 450 & 0.4 & 0.1 \\
% CIFAR-100-LT & 100 & 256 & 256 & 0.1 & 0.1 \\
% TinyImageNet & 200 &  512 & 200 & 0.1 & 0.1 \\
% TinyImageNet-LT & 200 & 512 & 200 & 0.1  & 0.1 \\
% ImageNet & 1000 & 1024 & 45& 0.8 & 0.1  \\
% ImageNet-LT & 1000 & 1024 & 90 & 0.4 &  0.1 \\
% \bottomrule
% \end{tabular}
% \end{small}
% \end{center}
% \vskip -0.1in
% \end{table}


\paragraph{Hardware.} Model training was done using TPUv2. % on Google Cloud. 

\subsubsection{Repeats}\label{app:repeats}
For all comparative baselines without distillation (Group DRO, Post shift, and all teachers alone), we provide average results over $m$ retrained models ($m=5$ for ImageNet / TinyImageNet, or $m=10$ for CIFAR datasets). For students on all CIFAR* datasets, unless otherwise specified, we train the teacher once and run the student training 10 times using the same arbitrarily chosen fixed teacher. We compute the mean and standard error of metrics over these $m=10$ runs. For the resource-heavy TinyImageNet and ImageNet students, we reduce the number of repeats to $m=5$. This methodology captures variation in the student retrainings while holding the teacher fixed. To capture the end-to-end variation in both teacher and student training, we include Appendix \ref{app:different_teachers} and Table \ref{tab:teacher_var} which contains a rerun of the CIFAR experiments in Table \ref{tab:combos_self_full} using a distinct teacher for each student retraining. The overall best teacher/student objective combinations did not change for most datasets, with the only exception coming from a difference in the use of validation set labels.
% For the resource-heavy ImageNet experiments, we
% report results for one trial.
% also reduce the number of repeats down to 5 using the same fixed teacher. While we do not re-train the teachers for them, we anticipate the magnitude of teachers' variability similar to the students' as suggested by other experiments.

\subsection{Additional details about algorithms and baselines}\label{app:baselines}

\subsubsection{Practical improvements to Algorithms \ref{algo:dro}--\ref{algo:dro-general}}
\label{app:post-hoc-details}
 Algorithms \ref{algo:dro}--\ref{algo:dro-general} currently return a scorer that averages over
 all $K$ iterates $\bar{f}^s(x) = \frac{1}{K}\sum_{k=1}^K f^k(x)$. While this averaging was required for our theoretical robustness guarantees to hold,
 in our experiments, we find it sufficient to simply return the last model $f^{K}$.
 Another practical improvement that we make  to these algorithms following \citet{cotter2019optimization}, is to employ the 0-1 loss while 
 performing updates on $\lambda$, i.e., set $\ell = \ell^\zo$ in the $\lambda$-update step. We are able to do this because
 the convergence of the exponentiated gradient updates on $\lambda$ does not depend on $\ell$ being differentiable. 
 This modification allows $\lambda$s to better reflect the model's per-class performance on the validation sample. 
 
 \subsubsection{Discussion on post-shifting baseline}
 We implement the post-shifting method in \citet{narasimhan2021training} (Algorithm 3 in their paper),
 which provides for an efficient way to construct a scoring function of the form $f^s_y(x) = \log(\gamma_y p^t_y(x)),$ for a fixed teacher $p^t$, where
the coefficients $\gamma \in \R^m$ are chosen to maximize the worst-class accuracy on the validation dataset. 
Interestingly, in our experiments, we find this approach
to do exceedingly well on the validation sample, but this does not always translate to good worst-class test performance. 
In contrast, some of the teacher-student combinations
that we experiment with were seen to over-fit less to the validation sample, and as a result were
able to generalize better to the test set. This could perhaps indicate that the
teacher labels we use in these combinations benefit the student in a way that improves its generalization. 
The variance reduction effect that \citet{menon2021statistical} postulate may be one possible explanation 
for why we see this behavior.

% \subsection{Additional results and comparisons}\label{app:tables}
% This section presents additional experimental results and comparisons to baselines.

\section{Additional experimental results}\label{app:experiment_results}
This section contains additional experimental results.

% \begin{itemize}
%     \item Appendices \ref{app:tables} through \ref{app:trade-off_plots} contain additional experimental comparisons with the AdaMargin and AdaAlpha baselines \cite{lukasik2021teachers} and group DRO \cite{Sagawa2020Distributionally}, and additional experimental results on CIFAR, TinyImageNet and ImageNet, along with additional trade-off plots.
% \end{itemize}

\subsection{Extended tables for objective combinations}
We include extended tables comparing worst-class performance for different combinations of teacher and student objectives. The mean and standard errors are reported over repeat trainings as described in Appendix \ref{app:repeats}. 

Table \ref{tab:combos_self_full} is an extended version of Table \ref{tab:combos_shortened} that includes standard errors for both worst-$k$ accuracy and average accuracy. 

Table \ref{tab:combos_32} includes similar comparisons when the student is compressed -- that is, the student's architecture is smaller than the teacher's architecture. 

% \input{tables/table_combos_updated}
%%%%%%% UPDATED RESULTS 2/17/2023 (for cifar100 and tinyimagenet) %%%%%%%%
\begin{table*}[!ht]
\caption{Worst-class accuracy comparison of self-distilled teacher/student combos on test. The ``none'' row indicates the performance of the teacher alone. Worst-class accuracy is shown above (or worst-10 accuracy for TinyImageNet-LT), and average accuracy is shown in parentheses below. The combination with the best worst-class accuracy is in \textbf{bold}. 
%Mean and standard error are reported over repeat trainings (10 repeats for CIFAR*, 5 repeats for TinyImageNet). 
We include results for the robust student using either a teacher labeled validation set (``teacher val''), or true one-hot class labels in the validation set (``one-hot val''), as outlined in Appendix \ref{app:one-hot-vali}. 
Perhaps counterintuitively, the teacher with the best worst-class accuracy alone (the ``none'' row) did not always produce the student with the highest worst-class accuracy.
}
\label{tab:combos_self_full}
\begin{center}
\begin{tabular}{p{0.1cm}cV{2.5}c|cV{2.5}c|cV{2.5}c|cV{2.5}}
\toprule
& & \multicolumn{2}{cV{2.5}}{\textbf{CIFAR-10} Teacher Obj.} & \multicolumn{2}{cV{2.5}}{\textbf{CIFAR-100} Teacher Obj.} & \multicolumn{2}{cV{2.5}}{\textbf{TinyImageNet} Teacher Obj.} \\
& & $L^{\std}$ & $L^{\rob}$ & $L^{\std}$ & $L^{\rob}$ & $L^{\std}$ & $L^{\rob}$ \\
\midrule
\multirow{10}{*}{\rotatebox{90}{Student Obj.}} 
& none & $86.48\pm 0.32$  & $90.09 \pm 0.22$ &  $42.22 \pm 0.90$  & $43.42 \pm 1.03$  & $8.42 \pm 1.88$  & $11.87\pm 1.74$ \\
& & \scriptsize{($93.74 \pm 0.05$)} & \scriptsize{($92.67 \pm 0.09$)} & \scriptsize{($72.42 \pm 0.16$)} & \scriptsize{($68.81 \pm 0.11$)} & \scriptsize{($56.79 \pm 0.33$)} & \scriptsize{($48.40 \pm 0.15$)} \\
\cline{2-8}
& $L^{\stdd}$ & $87.66 \pm 0.40$  & $90.12 \pm 0.23$  &$43.81 \pm 0.58$  & \cellcolor{blue!15}$\mathbf{48.20}\pm 1.15$ & $6.32 \pm 2.31$  & $10.53\pm 1.49$ \\
& & \scriptsize{($94.34 \pm 0.07$)} & \scriptsize{($94.07 \pm 0.07$)} & \scriptsize{($74.61 \pm 0.15$)} & \cellcolor{blue!15}\scriptsize{($73.23 \pm 0.07$)} & \scriptsize{($57.83 \pm 0.13$)} & \scriptsize{($55.36 \pm 0.16$)}  \\
\cline{2-8}
& $L^{\robd}$  & \cellcolor{blue!15} $\mathbf{90.94}\pm 0.16$ & $85.14\pm 0.47$ & $39.18 \pm 1.58$ & $30.42 \pm 1.30$  & $9.98 \pm 1.87$ & $16.58 \pm 1.23$  \\
&\scriptsize{(teacher val)} & \cellcolor{blue!15}\scriptsize{($92.54 \pm 0.05$)} & \scriptsize{($89.58 \pm 0.11$)} & \scriptsize{($63.49 \pm 0.29$)} & \scriptsize{($55.77 \pm 0.39$)} & \scriptsize{($49.84 \pm 0.21$)} & \scriptsize{($46.11 \pm 0.37$)}  \\
\cline{2-8}
& $L^{\robd}$ & $89.37\pm 0.17$ & $87.32\pm 0.21$ & $44.61 \pm 1.55$ & $42.68 \pm 0.74$ & $16.27 \pm 0.43$ & \cellcolor{blue!15}$\mathbf{17.36} \pm 1.32$ \\
&\scriptsize{(one-hot val)} & \scriptsize{($91.63 \pm 0.06$)} & \scriptsize{($91.16 \pm 0.10$)} & \scriptsize{($69.02 \pm 0.30$)} & \scriptsize{($62.03 \pm 0.24$)} & \scriptsize{($48.06 \pm 0.24$)} & \cellcolor{blue!15}\scriptsize{($43.92 \pm 0.30$)}  \\
\bottomrule
\end{tabular}

\begin{tabular}{p{0.1cm}cV{2.5}c|c|cV{2.5}c|c|cV{2.5}}
\toprule
& & \multicolumn{3}{cV{2.5}}{\textbf{CIFAR-10-LT} Teacher Obj.} & \multicolumn{3}{cV{2.5}}{\textbf{CIFAR-100-LT} Teacher Obj.} \\
% & & \multicolumn{3}{c||}{Teacher Obj.} & \multicolumn{3}{c||}{Teacher Obj.} \\
& & $L^{\std}$ & $L^{\bal}$ & $L^{\rob}$ & $L^{\std}$ & $L^{\bal}$ & $L^{\rob}$ \\
\midrule
\multirow{14}{*}{\rotatebox{90}{Student Obj.}} 
& None & $57.26 \pm 0.55$ & $68.52 \pm 0.52$ & $74.8\pm 0.30$ &   $0.00 \pm 0.00$ & $3.75 \pm 0.62$ & $10.33 \pm 0.82$ \\
&& \scriptsize{($76.27 \pm 0.20$)} & \scriptsize{($79.85 \pm 0.20$)} & \scriptsize{($80.29 \pm 0.12$)}  & \scriptsize{($43.33 \pm 0.16$)} & \scriptsize{($47.55 \pm 0.17$)} & \scriptsize{($44.27 \pm 0.13$)}\\
\cline{2-8}
& $L^{\stdd}$ & $36.67  \pm 0.28$ & $66.96  \pm 0.43$ & $71.15  \pm 0.24$ & $0.00 \pm 0.00$ & $2.39 \pm 0.24$ & $7.32  \pm 0.47$ \\
&&\scriptsize{($69.5 \pm 0.13$)} & \scriptsize{($79.25 \pm 0.10$)} & \scriptsize{($80.95 \pm 0.11$)}  &\scriptsize{($43.86 \pm 0.14$)} & \scriptsize{($48.95 \pm 0.15$)} & \scriptsize{($47.93 \pm 0.11$)}\\
\cline{2-8}
& $L^{\bald}$ & $71.23 \pm 0.44$ & $70.52 \pm 0.20$ & $72.96 \pm 0.53$ & $4.39\pm 0.65$ & $7.08 \pm 0.80$ & $7.19 \pm 0.79$ \\
&& \scriptsize{($80.5 \pm 0.12$)} & \scriptsize{($81.12 \pm 0.08$)} & \scriptsize{($80.71 \pm 0.07$)} & \scriptsize{($50.4 \pm 0.11$)} & \scriptsize{($50.1 \pm 0.09$)} & \scriptsize{($47.51\pm 0.20$)}\\
\cline{2-8}
& $L^{\robd}$  & $63.85 \pm 0.21$  & \cellcolor{blue!15}$\mathbf{75.56} \pm 0.19$ & $69.21 \pm 0.45$  & $9.05 \pm 0.71$  & $12.52 \pm 0.98$ & $10.32 \pm 0.76$ \\
& \scriptsize{(teacher val)} & \scriptsize{($76.81 \pm 0.08$)} & \cellcolor{blue!15}\scriptsize{($80.81\pm 0.08$)} & \scriptsize{($76.72 \pm 0.19$)} & \scriptsize{($33.75 \pm 0.10$)} & \scriptsize{($34.05 \pm 0.09$)} & \scriptsize{($36.83 \pm 0.15$)}\\
\cline{2-8}
& $L^{\robd}$  & $73.59 \pm 0.25$ & $75.43 \pm 0.38$ & $74.7 \pm 0.19$ & $12.28 \pm 0.46$ & $11.94 \pm 0.80$ & \cellcolor{blue!15}$\mathbf{13.18} \pm 0.61$ \\
& \scriptsize{(one-hot val)} & \scriptsize{($77.92 \pm 0.05$)} & \scriptsize{($79.02 \pm 0.07$)} & \scriptsize{($77.99 \pm 0.10$)} & \scriptsize{($30.79 \pm 0.18$)} & \scriptsize{($29.8 \pm 0.20$)} & \cellcolor{blue!15}\scriptsize{($31.88 \pm 0.20$)}\\
\bottomrule
\end{tabular} \\

\begin{tabular}{p{0.1cm}cV{2.5}c|c|cV{2.5}}
\toprule
& & \multicolumn{3}{cV{2.5}}{\textbf{TinyImageNet-LT} Teacher Obj.} \\
& & $L^{\std}$ & $L^{\bal}$ & $L^{\rob}$ \\
\midrule
\multirow{10}{*}{\rotatebox{90}{Student Obj.}} 
& None &  $0.00 \pm 0.00$  & $2.11\pm 0.37$ & $4.92\pm 0.66$ \\
& &  \scriptsize{($33.15 \pm 0.17$)} & \scriptsize{($35.96 \pm 0.12$)} & \scriptsize{($27.23 \pm 0.15$)} \\ \cline{2-5} 
& $L^{\stdd}$ &  $0.00 \pm 0.00$  & $0.00\pm 0.00$ & $1.87\pm 0.23$ \\
& &  \scriptsize{($26.05 \pm 0.18$)} & \scriptsize{($27.21 \pm 0.15$)} & \scriptsize{($25.34 \pm 0.13$)} \\ \cline{2-5} 
& $L^{\bald}$ &  $0.20 \pm 0.18$  & $2.82\pm 0.14$ & $4.77\pm 0.41$ \\
& &  \scriptsize{($30.43 \pm 0.06$)} & \scriptsize{($39.41 \pm 0.15$)} & \scriptsize{($38.41 \pm 0.15$)} \\ \cline{2-5} 
& $L^{\robd}$ & $0.00 \pm 0.00$ & $4.93 \pm 0.38$ & $3.32 \pm 0.43$  \\ 
& \scriptsize{(teacher val)} & \scriptsize{($22.66 \pm 0.08$)} & \scriptsize{($35.43 \pm 0.18$)} & \scriptsize{($25.11 \pm 0.17$)} \\ \cline{2-5} 
& $L^{\robd}$ & $1.55 \pm 0.37$ & $6.11 \pm 0.39$ & \cellcolor{blue!15}$\mathbf{6.19} \pm 0.25$  \\
& \scriptsize{(one-hot val)} & \scriptsize{($21.59 \pm 0.19$)} & \scriptsize{($28.24 \pm 0.17$)} & \cellcolor{blue!15}\scriptsize{($25.30 \pm 0.18$)} \\
\bottomrule
\end{tabular}
\end{center}
\vskip -0.2in
\end{table*}

% \input{tables/large_table_compressed}
\begin{table*}[!ht]
\caption{Comparison of ResNet-56$\to$ResNet-32 distilled teacher/student combos on test on CIFAR datasets. Worst-class accuracy shown above, and average accuracy shown in parentheses below. The combination with the best worst-class accuracy is bolded. Mean and standard error are reported over 10 repeats. We include results for the robust student using either a teacher labeled validation set (``teacher val''), or true one-hot class labels in the validation set (``one-hot val''), as outlined in Section \ref{sec:algorithms}.}
\label{tab:combos_32}
% \vskip 0.15in
\begin{center}

\begin{tabular}{p{0.1cm}p{0.1cm}cV{2.5}c|cV{2.5}c|cV{2.5}}
\toprule
& & & \multicolumn{2}{cV{2.5}}{\textbf{CIFAR-10} Teacher Obj.} & \multicolumn{2}{cV{2.5}}{\textbf{CIFAR-100} Teacher Obj.} \\
& & & $L^{\std}$ & $L^{\rob}$ & $L^{\std}$ & $L^{\rob}$ \\
\midrule
\multirow{6}{*}{\rotatebox{90}{Student Obj.}} &\multirow{6}{*}{\rotatebox{90}{(ResNet-32)}}
& $L^{\stdd}$ & $86.4 \pm 0.27$  & $89.56 \pm 0.20$  & $41.82\pm1.12$  & \cellcolor{blue!15}$\mathbf{45.7}\pm1.13$ \\
&& & \tiny{($93.73 \pm 0.05$)} & \tiny{($93.38 \pm 0.05$)} & \tiny{($73.19\pm0.10$)} & \cellcolor{blue!15}\tiny{($71.42\pm0.22$)}  \\
\cline{3-7}
&& $L^{\robd}$  & \cellcolor{blue!15} $\mathbf{89.61} \pm 0.27$ & $83.8 \pm 0.95$ & $38.94\pm2.61$  & $19.15\pm0.00$ \\
&&\tiny{(teacher val)} & \cellcolor{blue!15}\tiny{($92.20 \pm 0.08$)} & \tiny{($88.71 \pm 0.24$)} & \tiny{($62.28\pm0.40$)} & \tiny{($52.9\pm0.00$)} \\
\cline{3-7}
&& $L^{\robd}$ & $87.92 \pm 0.23$ & $86.57 \pm 0.24$ & $33.19\pm1.29$  & $41.23\pm0.84$ \\
&&\tiny{(one-hot val)} & \tiny{($90.89 \pm 0.12$)} & \tiny{($90.54 \pm 0.11$)} & \tiny{($57.43\pm0.29$)} & \tiny{($61.14\pm0.24$)} \\
\bottomrule
\end{tabular}

\begin{tabular}{p{0.1cm}p{0.1cm}cV{2.5}c|c|cV{2.5}c|c|cV{2.5}}
\toprule
&& & \multicolumn{3}{cV{2.5}}{\textbf{CIFAR-10-LT} Teacher Obj.} & \multicolumn{3}{cV{2.5}}{\textbf{CIFAR-100-LT} Teacher Obj.} \\
&& & $L^{\std}$ & $L^{\bal}$ & $L^{\rob}$ & $L^{\std}$ & $L^{\bal}$ & $L^{\rob}$ \\
\midrule
\multirow{6}{*}{\rotatebox{90}{Student Obj.}} &\multirow{6}{*}{\rotatebox{90}{(ResNet-32)}} 
& $L^{\stdd}$ & $57.23 \pm 0.53$ & $66.80 \pm 0.25$ & $72.36\pm0.39$ & $0.00 \pm 0.00$ & $1.38 \pm 0.39$ & $7.99\pm0.48$ \\
&&&\tiny{($75.76 \pm 0.12$)} & \tiny{($78.99 \pm 0.06$)} & \tiny{($80.74\pm0.09$)}  &\tiny{($44.33 \pm 0.11$)} & \tiny{($47.28\pm0.13$)} & \tiny{($47.34\pm0.08$)}\\
\cline{3-9}
&& $L^{\bald}$ & $71.37 \pm 0.50$ & $71.00 \pm 0.45$ & $72.17 \pm 0.40$ & $3.57 \pm 0.58$ & $4.28 \pm 0.45$ & $5.58 \pm 0.53$ \\
&&& \tiny{($81.13 \pm 0.12$)} & \tiny{($81.12 \pm 0.15$)} & \tiny{($79.91 \pm 0.08$)} & \tiny{($49.21 \pm 0.10$)} & \tiny{($46.56 \pm 0.13$)} & \tiny{($48.58\pm 0.09$)}\\
\cline{3-9}
&& $L^{\robd}$  & $64.1 \pm 0.36$  & $73.51\pm 0.33$ & $69.90 \pm 0.42$  & $10.24 \pm 0.71$  & \cellcolor{blue!15}$\mathbf{13.41} \pm 0.72$ & $11.27 \pm 0.61$ \\
&& \tiny{(teacher val)} & \tiny{($76.34 \pm 0.12$)} & \tiny{($80.10\pm 0.10$)} & \tiny{($76.37 \pm 0.14$)} & \tiny{($33.55 \pm 0.16$)} & \cellcolor{blue!15}\tiny{($33.37 \pm 0.17$)} & \tiny{($36.14 \pm 0.19$)}\\
\cline{3-9}
&& $L^{\robd}$  & $72.65 \pm 0.27$ & $74.39 \pm 0.34$ & \cellcolor{blue!15}$\mathbf{74.45} \pm 0.26$ & $10.93 \pm 0.65$ & $12.2 \pm 0.65$ & $12.93\pm 0.62$ \\
&& \tiny{(one-hot val)} & \tiny{($77.69 \pm 0.11$)} & \tiny{($78.68 \pm 0.16$)} & \cellcolor{blue!15}\tiny{($77.97 \pm 0.10$)} & \tiny{($29.48 \pm 0.22$)} & \tiny{($30.27 \pm 0.18$)} & \tiny{($31.83 \pm 0.17$)}\\
\bottomrule
\end{tabular}

\end{center}
\vskip -0.12in
\end{table*}

\subsubsection{Robust distillation with a one-hot-labeled validation set}
Tables \ref{tab:combos_self_full} and \ref{tab:combos_32} also include results when the robust student is trained using a validation set with one-hot labels, as described in Appendix \ref{app:one-hot-vali}. We report the accuracies for this robust student for different teachers trained with the standard, balanced, and robust objectives in the last rows of Tables \ref{tab:combos_self_full} and \ref{tab:combos_32} ($L^{\robd}$ (one-hot val)). We compare these to the robust student trained using teacher labels on the validation set ($L^{\robd}$ (teacher val)), which requires less labeled data.

Perhaps surprisingly, it did not always benefit the robust student to utilize the true one-hot labels in the validation set. Instead, training the robust student with teacher labels on the validation set was often sufficient to achieve the best or close to the best worst-class performance. This is promising from a data efficiency standpoint, since it can be expensive to build up a labeled dataset for validation, especially if the training data is long-tailed.

\subsection{Additional plots for all trade-off parameter combinations}
Figure \ref{fig:alphas_all} shows accuracies for all $\alpha^t, \alpha^s$ combinations, the equivalent of Figure \ref{fig:alphas_cifar10} but for all datasets.

% \input{figures/alphas_all}
\begin{figure}[t]
    \centering
    \begin{tabular}{cc}
        CIFAR-10 & CIFAR-10-LT \\
        \includegraphics[width=0.4\columnwidth]{plot_students_only_cf10_arch56_teacher} & \includegraphics[width=0.4\columnwidth]{plot_students_only_cf10lt_arch56_teacher} \\
        CIFAR-100 & CIFAR-100-LT \\
        \includegraphics[width=0.4\columnwidth]{plot_students_only_cf100_arch56_teacher} & \includegraphics[width=0.4\columnwidth]{plot_students_only_cf100lt_arch56_teacher} \\
        TinyImageNet & TinyImageNet-LT \\
        \includegraphics[width=0.4\columnwidth]{plot_students_only_tin_arch56_teacher} & \includegraphics[width=0.4\columnwidth]{plot_students_only_tinlt_arch56_teacher} \\
    \end{tabular}
    
    \caption{All $\alpha^t, \alpha^s$ combinations for all datasets on test. The black line traces out the Pareto frontier. Average accuracy is roughly determined by $\alpha^s$. The labeled point corresponds to the ``best'' combination selected in Table \ref{tab:baselines} based on validation criteria, but other domain-specific trade-off criteria could yield any of these other points.}
    \label{fig:alphas_all}
\end{figure}

\subsection{Comparison to baselines of all Pareto efficient trade-off parameters}

To supplement the comparison to baselines in Table \ref{tab:baselines}, Figures \ref{fig:trade-offs_balanced} and \ref{fig:trade-offs_lt} show all Pareto efficient $\alpha^t$ and $\alpha^s$ combinations on test. Whereas only a single $\alpha^t, \alpha^s$ combination was selected on the validation set and reported in Table \ref{tab:baselines}, Figures \ref{fig:trade-offs_balanced} and \ref{fig:trade-offs_lt} show that there were many more combinations of $\alpha^t, \alpha^s$ that could have Pareto dominated all baselines. 

Figures \ref{fig:trade-offs_balanced} and \ref{fig:trade-offs_lt} also give more insight into which values of $\alpha^t$ work best for different values of $\alpha^s$. Whereas Figure \ref{fig:alphas_cifar10} shows that $\alpha^s$ is highly correlated with average accuracy, the same is not true for $\alpha^t$. Worst-class accuracy generally increases with $\alpha^s$, but the teachers that achieve the Pareto efficient points all have $\alpha^t < 1$. This reveals counter-intuitively that the teacher's worst-class accuracy is not a direct predictor of the robustness of a subsequent student. This couples with our theoretical understanding in Section \ref{sec:theory}, which showed that the ability of a teacher to train robust students is determined by the calibration of scores within each class.

\textit{Trading off average vs. worst-class accuracy.}
Figures \ref{fig:trade-offs_balanced} and \ref{fig:trade-offs_lt} show that when we allow for more nuanced $L^{\tdf}$ objective combinations, the resulting models may have higher average accuracy and worst-class accuracy than standard distillation. Interestingly, the models with the most ``even'' trade-offs between average accuracy and worst-class accuracy tend to have low $\alpha^t$ (around 0.25) and low $\alpha^s$ (also around 0.25). Higher values of $\alpha^t$ tended to lead to more extreme points on the trade-off curve, either with higher average accuracy at the expense of worst-class accuracy, or vice versa. Overall, the robust $L^{\tdf}$ combinations also Pareto dominated most of the baselines that all used the standard teacher. Together, these results highlight the fact that in robust distillation, the teacher's training objective is important and should be tailored to the desired final accuracy/robustness trade-off (perhaps using a held-out validation sample with some domain-specific criteria in practice). Figure \ref{fig:trade-offs_balanced_compressed} confirms that these results also hold up in a compression setting, where the compressed models can actually even beat their larger teachers.

% Strikingly, Figure \ref{fig:trade-offs_cf-lt_32_onehot} shows that ResNet-32 students distilled with robust trade-offs can be more Pareto efficient than even the larger ResNet-56 teacher models. Thus, distillation with combinations of robust losses not only helps worst-case accuracy, but also achieves better trade-offs with balanced accuracy. Similar trends prevail across our experiment setups, including self distillation and the original non-long-tailed datasets (see Appendix \ref{app:experiment_details}).

% In this section, we supplement the results in Figure \ref{fig:trade-offs_main} by reporting similar plots for the following additional combinations of datasets and model architectures:

% \begin{itemize}
%     \item Figure \ref{fig:trade-offs_balanced} shows results for additional class-balanced datasets under self-distillation.
%     \item Figure \ref{fig:trade-offs_lt} shows results for additional long-tailed datasets under self-distillation. 
%     \item Figure \ref{fig:trade-offs_balanced_compressed} shows results for additional class-balanced datasets under distillation to a compressed student.
% \end{itemize}

% \input{figures/trade-offs_balanced}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% All balanced datasets, self distillation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{figure*}
\centering
\begin{minipage}[tb]{0.43\linewidth}
\centering
\setlength{\tabcolsep}{0pt}
\begin{tabular}{c}
    CIFAR-10 ResNet56 $\to$ ResNet56 \\
    \includegraphics[trim={0 0.2cm 0 0.1cm},clip,width=\linewidth]{plot_cf10_arch56_onehot_eff}\\
\end{tabular}\\
\end{minipage}\hfill
\begin{minipage}[tb]{0.5\linewidth}
\centering
\small{\begin{tabular}{llcc}
\multicolumn{4}{c}{\textbf{Pareto efficient robust distillation results (test)}} \\
$\alpha^t$ & $\alpha^s$ & Average acc.  & Worst-class acc. \\
\hline 
0.75 & 0.25 & $93.81 \pm 0.07$ & $90.68 \pm 0.20$ \\ 
 0.50 & 0.25 & $93.82 \pm 0.09$ & $90.54 \pm 0.22$ \\ 
 0.25 & 0.25 & $93.87 \pm 0.08$ & $90.50 \pm 0.18$ \\ 
 1.00 & 0.00 & $94.07 \pm 0.07$ & $90.12 \pm 0.23$ \\ 
 0.75 & 0.00 &  \cellcolor{blue!15}$94.25 \pm 0.05$ &  \cellcolor{blue!15}$90.00 \pm 0.17$ \\ 
 0.25 & 0.00 & $94.34 \pm 0.06$ & $89.10 \pm 0.31$ \\ 
\end{tabular}
\begin{tabular}{lcc}
\multicolumn{3}{c}{\textbf{Baseline results (test)}} \\
Baseline & Average acc.  & Worst-class acc. \\
\hline 
Standard distillation & $94.34 \pm 0.07$ & $87.66 \pm 0.40$ \\ 
Post shift \text{[NM'21]} & $92.16 \pm 0.18$ & $88.60 \pm 0.35$ \\ 
Robust student \text{[NM'21]} & $92.72 \pm 0.05$ & $89.90 \pm 0.21$ \\ 
AdaMargin \text{[LBMK'21]} & $93.69 \pm 0.06$ & $88.42 \pm 0.36$ \\ 
AdaAlpha \text{[LBMK'21]} & $94.31 \pm 0.01$ & $88.33 \pm 0.14$ \\ 
Group DRO \text{[SKHL'20]} & $92.34 \pm 0.07$ & $89.32 \pm 0.21$ \\ 
\end{tabular}} \\
\end{minipage} \\

\begin{minipage}[tb]{0.47\linewidth}
\centering
\setlength{\tabcolsep}{0pt}
\begin{tabular}{c}
    CIFAR-100 ResNet56 $\to$ ResNet56 \\
    \includegraphics[trim={0 0.2cm 0 0.1cm},clip,width=\linewidth]{plot_cf100_arch56_onehot_eff}\\
\end{tabular}\\
\end{minipage}\hfill
\begin{minipage}[tb]{0.5\linewidth}
\centering
\small{\begin{tabular}{llcc}
\multicolumn{4}{c}{\textbf{Pareto efficient robust distillation results (test)}} \\
$\alpha^t$ & $\alpha^s$ & Average acc.  & Worst-class acc. \\
\hline 
1.00 & 0.25 & $70.45 \pm 0.16$ & $48.99 \pm 0.72$ \\ 
 1.00 & 0.00 & $73.23 \pm 0.07$ & $48.20 \pm 1.15$ \\ 
 0.25 & 0.00 & \cellcolor{blue!15}$74.57 \pm 0.12$ & \cellcolor{blue!15}$46.99 \pm 1.09$ \\ 
 0.25 & 0.00 & $74.59 \pm 0.09$ & $44.37 \pm 0.58$ \\ 
 0.00 & 0.00 & $74.61 \pm 0.15$ & $43.81 \pm 0.58$ \\ 
\end{tabular}
\begin{tabular}{lcc}
\multicolumn{3}{c}{\textbf{Baseline results (test)}} \\
Baseline & Average acc.  & Worst-class acc. \\
\hline 
Standard distillation & $74.61 \pm 0.15$ & $43.81 \pm 0.58$ \\ 
Post shift \text{[NM'21]} & $61.22 \pm 0.36$ & $38.19 \pm 0.40$ \\ 
Robust student \text{[NM'21]} & $68.45 \pm 0.13$ & $43.62 \pm 1.27$ \\ 
AdaMargin \text{[LBMK'21]} & $73.58 \pm 0.11$ & $43.91 \pm 1.11$ \\ 
AdaAlpha \text{[LBMK'21]} & $74.15 \pm 0.08$ & $45.46 \pm 0.67$ \\ 
Group DRO \text{[SKHL'20]} & $65.18 \pm 0.08$ & $43.89 \pm 1.12$ \\ 
\end{tabular}} \\
\end{minipage}

\begin{minipage}[tb]{0.43\linewidth}
\centering
\setlength{\tabcolsep}{0pt}
\begin{tabular}{c}
    TinyImageNet ResNet18 $\to$ ResNet18 \\
    \includegraphics[trim={0 0.2cm 0 0.1cm},clip,width=\linewidth]{plot_tin_arch56_onehot_eff}\\
\end{tabular}\\
\end{minipage}\hfill
\begin{minipage}[tb]{0.5\linewidth}
\centering
\small{\begin{tabular}{llcc}
\multicolumn{4}{c}{\textbf{Pareto efficient robust distillation results (test)}} \\
$\alpha^t$ & $\alpha^s$ & Average acc.  & Worst-class acc. \\
\hline 
 0.50 & 0.75 & $51.88 \pm 0.18$ & $19.29 \pm 1.27$ \\ 
 0.75 & 0.50 & $53.60 \pm 0.31$ & $18.98 \pm 0.86$ \\ 
 0.25 & 0.25 & $56.99 \pm 0.14$ & $18.83 \pm 0.85$ \\ 
 0.00 & 0.25 & $57.26 \pm 0.15$ & $14.44 \pm 0.91$ \\ 
 0.75 & 0.00 & $57.35 \pm 0.17$ & $9.47 \pm 1.76$ \\ 
 0.50 & 0.00 & \cellcolor{blue!15}$57.74 \pm 0.20$ & \cellcolor{blue!15}$8.22 \pm 1.09$ \\ 
%  0.00 & 0.00 & $57.83 \pm 0.13$ & $6.32 \pm 2.31$ \\ 
\end{tabular}
\begin{tabular}{lcc}
\multicolumn{3}{c}{\textbf{Baseline results (test)}} \\
Baseline & Average acc.  & Worst-class acc. \\
\hline 
Standard distillation & $57.83 \pm 0.13$ & $6.32 \pm 2.31$ \\ 
Post shift \text{[NM'21]} & $43.02 \pm 0.79$ & $14.39 \pm 1.13$ \\ 
Robust student \text{[NM'21]} & $48.06 \pm 0.24$ & $16.27 \pm 0.43$ \\ 
AdaMargin \text{[LBMK'21]} & $52.45 \pm 0.08$ & $15.41 \pm 0.71$ \\ 
AdaAlpha \text{[LBMK'21]} & $57.22 \pm 0.08$ & $7.62 \pm 2.17$ \\ 
Group DRO \text{[SKHL'20]} & $48.78 \pm 0.21$ & $11.38 \pm 1.79$ \\ 
\end{tabular}} \\
% \captionof{table}{Numerical results}\label{tab: table-label}
\end{minipage} \\

\caption{Trade-offs in worst-class test accuracy vs. average test accuracy for CIFAR-10 and CIFAR-100 distilling from ResNet-56 to ResNet-56, and TinyImageNet distilling from ResNet-18 to ResNet-18. All baseline results that require a teacher use the ``standard teacher'' (trained using $L^\std$), as done in the original papers. For methods run multiple times with multiple hyperparameters (e.g. temperatures), all Pareto efficient results are shown in the plot, but the tables show only the baseline results with the best worst-class accuracy (on the validation set). The \colorbox{blue!15}{highlighted row} indicates the model with the highest worst-class accuracy that also achieves at least as high average accuracy as \textit{standard distillation}.}
\label{fig:trade-offs_balanced}
\end{figure*}


% \input{figures/trade-offs_lt}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% All LT datasets, self distillation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{figure*}
\centering
\begin{minipage}[tb]{0.4\linewidth}
\centering
\setlength{\tabcolsep}{0pt}
\begin{tabular}{c}
    CIFAR-10-LT ResNet56 $\to$ ResNet56 \\
    \includegraphics[trim={0 0.2cm 0 0.1cm},clip,width=\linewidth]{plot_cf10lt_arch56_onehot_eff}\\
\end{tabular}\\
\end{minipage}\hfill
\begin{minipage}[tb]{0.5\linewidth}
\centering
\small{\begin{tabular}{llcc}
\multicolumn{4}{c}{\textbf{Pareto efficient robust distillation results (test)}} \\
$\alpha^t$ & $\alpha^s$ & Average acc.  & Worst-class acc. \\
\hline 
 0.75 & 0.75 & \cellcolor{blue!15}$80.86 \pm 0.09$ & \cellcolor{blue!15}$75.58 \pm 0.17$ \\ 
 0.75 & 0.50 & $81.12 \pm 0.11$ & $75.52 \pm 0.22$ \\ 
 0.00 & 0.75 & $81.40 \pm 0.10$ & $75.15 \pm 0.38$ \\ 
 0.00 & 0.50 & $81.82 \pm 0.11$ & $75.13 \pm 0.24$ \\ 
 0.00 & 0.25 & $81.89 \pm 0.08$ & $73.09 \pm 0.32$ \\ 
 0.00 & 0.00 & $81.94 \pm 0.16$ & $70.61 \pm 0.39$ \\
\end{tabular}
\begin{tabular}{lcc}
\multicolumn{3}{c}{\textbf{Baseline results (test)}} \\
Baseline & Average acc.  & Worst-class acc. \\
\hline 
Standard distillation & $77.39 \pm 0.10$ & $60.12 \pm 0.56$ \\ 
Post shift \text{[NM'21]} & $78.28 \pm 0.05$ & $74.33 \pm 0.09$ \\ 
Robust student \text{[NM'21]} & $80.05 \pm 0.13$ & $74.91 \pm 0.24$ \\ 
AdaMargin \text{[LBMK'21]} & $72.69 \pm 0.24$ & $47.52 \pm 0.95$ \\ 
AdaAlpha \text{[LBMK'21]} & $70.83 \pm 0.28$ & $43.64 \pm 1.09$ \\ 
Group DRO \text{[SKHL'20]} & $74.39 \pm 0.17$ & $59.93 \pm 0.59$ \\
\end{tabular}} \\
% \captionof{table}{Numerical results}\label{tab: table-label}
\end{minipage} \\

\begin{minipage}[tb]{0.4\linewidth}
\centering
\setlength{\tabcolsep}{0pt}
\begin{tabular}{c}
    CIFAR-100-LT ResNet56 $\to$ ResNet56 \\
    \includegraphics[trim={0 0.2cm 0 0.1cm},clip,width=\linewidth]{plot_cf100lt_arch56_onehot_eff}\\
\end{tabular}\\
\end{minipage}\hfill
\begin{minipage}[tb]{0.5\linewidth}
\centering
\small{
\begin{tabular}{llcc}
\multicolumn{4}{c}{\textbf{Pareto efficient robust distillation results (test)}} \\
$\alpha^t$ & $\alpha^s$ & Average acc.  & Worst-class acc. \\
\hline 
0.75 & 0.50 & $41.91 \pm 0.15$ & $16.08 \pm 0.52$ \\ 
 0.00 & 0.50 & $43.82 \pm 0.14$ & $16.06 \pm 0.89$ \\ 
 0.25 & 0.25 & \cellcolor{blue!15}$48.01 \pm 0.09$ & \cellcolor{blue!15}$15.52 \pm 0.41$ \\ 
 0.25 & 0.25 & $48.20 \pm 0.11$ & $15.26 \pm 0.73$ \\ 
 0.50 & 0.00 & $50.41 \pm 0.11$ & $7.49 \pm 0.72$ \\ 
 0.75 & 0.00 & $50.57 \pm 0.18$ & $5.55 \pm 0.54$ \\
\end{tabular}
\begin{tabular}{lcc}
\multicolumn{3}{c}{\textbf{Baseline results (test)}} \\
Baseline & Average acc.  & Worst-class acc. \\
\hline 
Standard distillation & $46.01 \pm 0.16$ & $0.00 \pm 0.00$ \\ 
Post shift \text{[NM'21]} & $29.88 \pm 0.61$ & $10.01 \pm 0.72$ \\ 
Robust student \text{[NM'21]} & $30.79 \pm 0.18$ & $12.28 \pm 0.46$ \\ 
AdaMargin \text{[LBMK'21]} & $31.26 \pm 0.21$ & $0.00 \pm 0.00$ \\ 
AdaAlpha \text{[LBMK'21]} & $42.52 \pm 0.08$ & $0.00 \pm 0.00$ \\ 
Balanced student \text{[MJRJVK'21]} & $50.40 \pm 0.12$ & $4.39 \pm 0.66$ \\ 
Group DRO \text{[SKHL'20]} & $40.47 \pm 0.17$ & $0.19 \pm 0.17$ \\ 
\end{tabular}} \\
% \captionof{table}{Numerical results}\label{tab: table-label}
\end{minipage}\\

\begin{minipage}[tb]{0.42\linewidth}
\centering
\setlength{\tabcolsep}{0pt}
\begin{tabular}{c}
    TinyImageNet-LT ResNet18 $\to$ ResNet18 \\
    \includegraphics[trim={0 0.2cm 0 0.1cm},clip,width=\linewidth]{plot_tinlt_arch56_onehot_eff}\\
\end{tabular}\\
\end{minipage}\hfill
\begin{minipage}[tb]{0.5\linewidth}
\centering
\small{\begin{tabular}{llcc}
\multicolumn{4}{c}{\textbf{Pareto efficient robust distillation results (test)}} \\
$\alpha^t$ & $\alpha^s$ & Average acc.  & Worst-10 acc. \\
\hline 
 1.00 & 0.25 & \cellcolor{blue!15}$36.28 \pm 0.17$ & \cellcolor{blue!15}$7.98 \pm 0.21$ \\ 
 0.75 & 0.25 & $37.62 \pm 0.15$ & $6.25 \pm 0.12$ \\ 
 0.00 & 0.25 & $38.44 \pm 0.13$ & $5.90 \pm 0.45$ \\ 
 0.50 & 0.00 & $39.29 \pm 0.09$ & $4.17 \pm 0.34$ \\ 
 0.25 & 0.00 & $39.57 \pm 0.06$ & $3.68 \pm 0.30$ \\ 
\end{tabular}
\begin{tabular}{lcc}
\multicolumn{3}{c}{\textbf{Baseline results (test)}} \\
Baseline & Average acc.  & Worst-10 acc. \\
\hline 
Standard distillation & $26.05 \pm 0.18$ & $0.00 \pm 0.00$ \\ 
Post shift \text{[NM'21]} & $21.32 \pm 0.49$ & $2.58 \pm 0.42$ \\ 
Robust student \text{[NM'21]} & $21.59 \pm 0.19$ & $1.55 \pm 0.37$ \\ 
AdaMargin \text{[LBMK'21]} & $4.41 \pm 0.09$ & $0.00 \pm 0.00$ \\ 
AdaAlpha \text{[LBMK'21]} & $27.95 \pm 0.14$ & $0.00 \pm 0.00$ \\ 
Balanced student \text{[MJRJVK'21]} & $30.43 \pm 0.06$ & $0.20 \pm 0.18$ \\ 
Group DRO \text{[SKHL'20]} & $27.78 \pm 0.13$ & $0.00 \pm 0.00$ \\ 
\end{tabular}} \\
% \captionof{table}{Numerical results}\label{tab: table-label}
\end{minipage}
\caption{Trade-offs in worst-class test accuracy vs. average test accuracy for CIFAR-10-LT, CIFAR-100-LT, and TinyImageNet-LT under self-distillation. All baseline results that require a teacher use the ``standard teacher'' (trained using $L^\std$), as done in the original papers. For methods run multiple times with multiple hyperparameters (e.g. temperatures), all Pareto efficient results are shown in the plot, but the tables show only the baseline results with the best worst-class accuracy (on the validation set). The \colorbox{blue!15}{highlighted row} indicates the model with the highest worst-class (or worst-10) accuracy that also achieves at least as high average accuracy as \textit{standard distillation} (within error margins). Note that for the LT datasets, $L^{\tdf}$ mixes between $L^{\bal}$ and $L^{\rob}$.}
\label{fig:trade-offs_lt}
\end{figure*}

% \input{figures/trade-offs_balanced_compressed}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% All balanced datasets, distillation to a compressed student
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{figure*}
\centering
\begin{minipage}[tb]{0.5\linewidth}
\centering
\setlength{\tabcolsep}{0pt}
\begin{tabular}{c}
    CIFAR-10 ResNet56 $\to$ ResNet32 \\
    \includegraphics[trim={0 0.2cm 0 0.1cm},clip,width=\linewidth]{plot_cf10_arch32_onehot_eff}\\
\end{tabular}\\
\end{minipage}\hfill
\begin{minipage}[tb]{0.5\linewidth}
\centering
\small{\begin{tabular}{llcc}
\multicolumn{4}{c}{\textbf{Pareto efficient robust distillation results (test)}} \\
$\alpha^t$ & $\alpha^s$ & Average acc.  & Worst-class acc. \\
\hline 
0.00 & 0.25 & $93.08 \pm 0.07$ & $89.85 \pm 0.22$ \\ 
 1.00 & 0.00 & $93.38 \pm 0.05$ & $89.56 \pm 0.20$ \\ 
 0.75 & 0.00 & $93.58 \pm 0.09$ & $88.91 \pm 0.25$ \\ 
 1.00 & 0.00 & $93.59 \pm 0.06$ & $88.88 \pm 0.36$ \\ 
 0.75 & 0.00 & \cellcolor{blue!15}$93.61 \pm 0.05$ & \cellcolor{blue!15}$88.44 \pm 0.33$ \\ 
 0.25 & 0.00 & $93.74 \pm 0.07$ & $88.41 \pm 0.32$ \\ 
\end{tabular}
\begin{tabular}{lcc}
\multicolumn{3}{c}{\textbf{Baseline results (test)}} \\
Baseline & Average acc.  & Worst-class acc. \\
\hline 
Standard distillation & $93.71 \pm 0.05$ & $86.98 \pm 0.36$ \\ 
Robust student \text{[NM'21]} & $91.57 \pm 0.08$ & $88.57 \pm 0.18$ \\ 
AdaMargin \text{[LBMK'21]} & $92.09 \pm 0.09$ & $83.57 \pm 0.64$ \\ 
AdaAlpha \text{[LBMK'21]} & $93.52 \pm 0.11$ & $85.41 \pm 0.45$ \\ 
\end{tabular}} \\
\end{minipage} \\

\begin{minipage}[tb]{0.5\linewidth}
\centering
\setlength{\tabcolsep}{0pt}
\begin{tabular}{c}
    CIFAR-100 ResNet56 $\to$ ResNet32 \\
    \includegraphics[trim={0 0.2cm 0 0.1cm},clip,width=\linewidth]{plot_cf100_arch32_onehot_eff}\\
\end{tabular}\\
\end{minipage}\hfill
\begin{minipage}[tb]{0.5\linewidth}
\centering
\small{\begin{tabular}{llcc}
\multicolumn{4}{c}{\textbf{Pareto efficient robust distillation results (test)}} \\
$\alpha^t$ & $\alpha^s$ & Average acc.  & Worst-class acc. \\
\hline 
 0.75 & 0.25 & $70.42 \pm 0.14$ & $48.57 \pm 0.55$ \\ 
 0.75 & 0.00 & $72.84 \pm 0.22$ & $45.74 \pm 1.57$ \\ 
 0.75 & 0.00 & \cellcolor{blue!15}$72.97 \pm 0.18$ & \cellcolor{blue!15}$43.73 \pm 1.72$ \\ 
\end{tabular}
\begin{tabular}{lcc}
\multicolumn{3}{c}{\textbf{Baseline results (test)}} \\
Baseline & Average acc.  & Worst-class acc. \\
\hline 
Standard distillation & $73.19 \pm 0.10$ & $41.82 \pm 1.12$ \\ 
Robust student \text{[NM'21]} & $65.17 \pm 0.11$ & $40.87 \pm 0.89$ \\ 
AdaMargin \text{[LBMK'21]} & $71.92 \pm 0.17$ & $42.22 \pm 1.65$ \\ 
AdaAlpha \text{[LBMK'21]} & $72.93 \pm 0.09$ & $41.50 \pm 1.14$ \\ 
\end{tabular}} \\
\end{minipage}
\caption{Trade-offs in worst-class test accuracy vs. average test accuracy for CIFAR-10 and CIFAR-100 distilling from ResNet-56 to ResNet-32. All baseline results that require a teacher use the ``standard teacher'' (trained using $L^\std$), as done in the original papers. For methods run multiple times with multiple hyperparameters (e.g. temperatures), all Pareto efficient results are shown in the plot, but the tables show only the baseline results with the best worst-class accuracy (on the validation set). The \colorbox{blue!15}{highlighted row} indicates the model with the highest worst-class accuracy that also achieves at least as high average accuracy as \textit{standard distillation} (within error margins).}
\label{fig:trade-offs_balanced_compressed}
\end{figure*}



\subsection{Different teachers on repeat trainings}\label{app:different_teachers}

Distillation experimental results in the main paper use the same teacher for all repeat trainings of the student. This captures the variance in the student training process while omitting the variance in the teacher training process. To capture the variance in the full training pipeline, we ran an additional set of experiments where students were trained on different retrained teachers, rather than on the same teacher. We report results on all CIFAR datasets in Table \ref{tab:teacher_var}. The best teacher/student combinations are identical for all datasets except for CIFAR-10-LT, for which the best teacher/student combinations from Table \ref{tab:teacher_var} and Table \ref{tab:combos_self_full} were both a robust student trained with a balanced teacher, and only differed in whether the validation set contained teacher labels or one-hot labels ($L^{\bal}$/$L^{\robd}$ (one-hot val) in Table \ref{tab:teacher_var} vs. $L^{\bal}$/$L^{\robd}$ (teacher val) in Table \ref{tab:combos_self_full}). Note that the first and second rows of Table \ref{tab:combos_self_full} are already averaged over $m$ retrained teachers ($m=5$ for TinyImageNet, or $m=10$ for CIFAR datasets), and those same $m$ teachers are used in the repeat trainings in Table \ref{tab:teacher_var}.

% \input{tables/table_combos_different_teachers}
\begin{table}[!ht]
\caption{Comparison using different teachers for student retrainings for self-distilled teacher/student combos on test. For each student/teacher objective pair, we train $m=10$ students total on each of $m=10$ distinct retrained teachers. For comparability, the same set of $m$ teachers is used for each student. This differs from Table \ref{tab:combos_self_full} in that in Table \ref{tab:combos_self_full}, the students are retrained on each repeat using the same teacher (arbitrarily selected). Otherwise, setups are the same as in Table \ref{tab:combos_self_full}.
}
\label{tab:teacher_var}
\begin{center}
\begin{small}
\hspace{-2pt}
\begin{tabular}{p{0.1cm}cV{2.5}c|cV{2.5}c|cV{2.5}}
\toprule
& & \multicolumn{2}{cV{2.5}}{\textbf{CIFAR-10} Teacher Obj.} & \multicolumn{2}{cV{2.5}}{\textbf{CIFAR-100} Teacher Obj.} \\
& & $L^{\std}$ & $L^{\rob}$ & $L^{\std}$ & $L^{\rob}$ \\
\midrule
\multirow{6}{*}{\rotatebox{90}{Student Obj.}}
& $L^{\stdd}$ & $87.09 \pm 0.51$  & $89.68 \pm 0.20$   & $44.21\pm0.57$  & \cellcolor{blue!15}$\mathbf{47.79}\pm0.82$ \\
& & \tiny{($93.78 \pm 0.22$)} & \tiny{($93.74 \pm 0.07$)} & \tiny{($74.6\pm0.11$)} & \cellcolor{blue!15}\tiny{($73.48\pm0.11$)}  \\
\cline{2-6}
& $L^{\robd}$  & \cellcolor{blue!15} $\textbf{90.62} \pm 0.19$ & $87.12 \pm 0.38$ & $39.7\pm1.32$  & $31.09\pm1.21$ \\
&\tiny{(teacher val)} & \cellcolor{blue!15}\tiny{($92.58 \pm 0.08$)} & \tiny{($90.46 \pm 0.08$)} & \tiny{($64.28\pm0.41$)} & \tiny{($55.39\pm0.28$)} \\
\cline{2-6}
& $L^{\robd}$ & $88.15 \pm 0.66$ & $86.44 \pm 0.52$ & $39.44\pm0.94$  & $39.65\pm0.59$ \\
&\tiny{(one-hot val)} & \tiny{($91.03 \pm 0.47$)} & \tiny{($90.16 \pm 0.42$)} & \tiny{($61.23\pm0.36$)} & \tiny{($60.89\pm0.29$)} \\
\bottomrule
\end{tabular}

\begin{tabular}{p{0.1cm}cV{2.5}c|c|cV{2.5}c|c|cV{2.5}}
\toprule
& & \multicolumn{3}{cV{2.5}}{\textbf{CIFAR-10-LT} Teacher Obj.} & \multicolumn{3}{cV{2.5}}{\textbf{CIFAR-100-LT} Teacher Obj.} \\
& & $L^{\std}$ & $L^{\bal}$ & $L^{\rob}$ & $L^{\std}$ & $L^{\bal}$ & $L^{\rob}$ \\
\midrule
\multirow{6}{*}{\rotatebox{90}{Student Obj.}}
& $L^{\stdd}$ & $60.12 \pm 0.56$ & $66.13 \pm 0.47$ & $69.75\pm0.52$ & $0.00 \pm 0.00$ & $1.41 \pm 0.41$ & $9.17\pm0.74$ \\
&&\tiny{($77.39 \pm 0.10$)} & \tiny{($79.16 \pm 0.20$)} & \tiny{($80.73\pm0.08$)}  &\tiny{($45.84 \pm 0.13$)} & \tiny{($49.67\pm 0.20$)} & \tiny{($48.55\pm0.14$)}\\
\cline{2-8}
& $L^{\bald}$ & $72.41 \pm 0.52$ & $71.49 \pm 0.30$ & $71.70 \pm 0.33$ & $5.83 \pm 0.54$ & $5.94 \pm 0.50$ & $8.37 \pm 0.72$ \\
&& \tiny{($81.97 \pm 0.11$)} & \tiny{($81.20 \pm 0.15$)} & \tiny{($80.29 \pm 0.11$)} & \tiny{($50.58 \pm 0.15$)} & \tiny{($50.85 \pm 0.14$)} & \tiny{($48.16\pm 0.20$)}\\
\cline{2-8}
& $L^{\robd}$  & $62.77 \pm 0.58$  & $73.09\pm 0.34$ & $68.04 \pm 0.47$  & $10.53 \pm 0.76$  & $12.04 \pm 0.89$ & $9.66 \pm 1.15$ \\
& \tiny{(teacher val)} & \tiny{($77.18 \pm 0.15$)} & \tiny{($80.03\pm 0.22$)} & \tiny{($75.36 \pm 0.25$)} & \tiny{($33.69 \pm 0.14$)} & \tiny{($34.08 \pm 0.12$)} & \tiny{($37.10 \pm 0.15$)}\\
\cline{2-8}
& $L^{\robd}$  & \cellcolor{blue!15}$\mathbf{75.10} \pm 0.36$ & \cellcolor{blue!15}$\mathbf{75.10} \pm 0.50$ & $74.16 \pm 0.34$ & $10.74 \pm 0.44$ & $11.95 \pm 0.69$ & \cellcolor{blue!15} $\mathbf{12.87}\pm 0.81$ \\
& \tiny{(one-hot val)} & \cellcolor{blue!15}\tiny{($79.27 \pm 0.13$)} & \cellcolor{blue!15}\tiny{($79.07 \pm 0.20$)} & \tiny{($78.11 \pm 0.14$)} & \tiny{($30.36 \pm 0.39$)} & \tiny{($31.00 \pm 0.16$)} & \cellcolor{blue!15}\tiny{($31.62 \pm 0.34$)}\\
\bottomrule
\end{tabular}
\end{small}
\end{center}
\end{table}

\subsection{AdaAlpha and AdaMargin comparisons with different teachers}

We include and discuss additional comparisons to the AdaMargin and AdaAlpha methods \citep{lukasik2021teachers}, which each define additional ways to modify the student training algorithm (see Section \ref{sec:expts}). In Table \ref{tab:baselines}, we show results with each of these methods using the standard teacher, as done in the original paper. However, in this section we extend these results by also applying AdaMargin and AdaAlpha with different teachers trained with the robust and balanced objectives. Table \ref{tab:ada*} compares the results of AdaMargin and AdaAlpha for these different teachers under the same self distillation setup as Table \ref{tab:combos_self_full}.

Overall, the use of a robust teacher leads to marked improvements for students trained by AdaMargin and AdaAlpha.
For the balanced datasets, AdaMargin was competitive with the robust and standard students: on CIFAR-100 and TinyImageNet, AdaMargin combined with the robust teacher and the standard teacher (respectively) achieved worst-class accuracies that are statistically comparable to the best worst-class accuracies in Table \ref{tab:combos_self_full}. However, on the long-tailed datasets, AdaAlpha and AdaMargin did not achieve worst-class accuracies as high as other teacher/student combinations. This suggests that the AdaMargin method can work well on balanced datasets in combination with a robust teacher, but other combinations of standard/balanced/robust objectives are valuable for long-tailed datasets.

Relative to each other, AdaMargin usually achieved higher worst-class accuracy than AdaAlpha, whereas AdaAlpha often achieved higher average accuracy. 


% \input{tables/table_combos_ada}
\begin{table*}[!ht]
\caption{Results for AdaAlpha and AdaMargin baselines for different teachers under self-distillation. For all CIFAR datasets, self-distillation is done from ResNet56 $\to$ ResNet56. For TinyImageNet, self-distillation is done from ResNet18 $\to$ ResNet18. Worst-class accuracy shown above (or worst-10 accuracy for TinyImageNet-LT), and average accuracy is shown in parentheses below. The temperature hyperparameter was tuned to maximize worst-class accuracy on the held-out validation set. Mean and standard error are reported over 5 repeats for all datasets.}
\label{tab:ada*}
% \vskip 0.15in
\begin{center}
\begin{small}
\begin{tabular}{p{0.1cm}cV{2.5}c|cV{2.5}c|cV{2.5}c|cV{2.5}}
\toprule
& & \multicolumn{2}{cV{2.5}}{\textbf{CIFAR-10} Teacher Obj.} & \multicolumn{2}{cV{2.5}}{\textbf{CIFAR-100}  Teacher Obj.} & \multicolumn{2}{cV{2.5}}{\textbf{TinyImageNet} Teacher Obj.} \\
% & & \multicolumn{2}{cV{2.5}}{Teacher Obj.} & \multicolumn{2}{cV{2.5}}{Teacher Obj.} & \multicolumn{2}{cV{2.5}}{Teacher Obj.}\\
& & $L^{\std}$ & $L^{\rob}$ & $L^{\std}$ & $L^{\rob}$ & $L^{\std}$ & $L^{\rob}$ \\
\midrule
& Ada & $88.33\pm0.14$  & $89.96\pm0.44$ & $43.50\pm 0.62$  & $45.59\pm 0.82$ & $11.11 \pm 1.29$ & $16.58\pm1.67$ \\
& Alpha & \tiny{($94.31\pm0.01$)} & \tiny{($93.97\pm0.07$)} & \tiny{($73.96\pm 0.09$)} & \tiny{($71.42\pm 0.14$)} & \tiny{($61.13\pm0.09$)} & \tiny{($56.84\pm0.15$)} \\
\cline{2-8}
& Ada & $87.36$ \tiny{$\pm0.06$}  & $90.37$\tiny{$\pm0.26$} & $43.91$ \tiny{$\pm 1.11$}  &  $47.78$ \tiny{$\pm 0.96$} & $18.17$ \tiny{$\pm 3.89$}  & $17.84$ \tiny{$\pm 1.77$}\\
& Margin & \tiny{($94.25\pm0.02$)} & \tiny{($94.02\pm0.12$)} & \tiny{($73.58\pm 0.11$)} & \tiny{($70.92\pm 0.09$)} & \tiny{($61.3 \pm 0.28$)} & \tiny{($55.77 \pm 0.32$)}\\
\bottomrule
\end{tabular}

\begin{tabular}{p{0.1cm}cV{2.5}c|c|cV{2.5}c|c|cV{2.5}}
\toprule
& & \multicolumn{3}{cV{2.5}}{\textbf{CIFAR-10-LT} Teacher Obj.} & \multicolumn{3}{cV{2.5}}{\textbf{CIFAR-100-LT} Teacher Obj.} \\
& & $L^{\std}$ & $L^{\bal}$ & $L^{\rob}$ & $L^{\std}$ & $L^{\bal}$ & $L^{\rob}$ \\
\midrule
& Ada & $41.90\pm0.44$ & $66.23\pm0.39$ & $71.17\pm0.32$  & $0.00 \pm 0.00$ & $1.46 \pm 0.61$ & $9.15 \pm 0.54$ \\
& Alpha & \tiny{($71.67\pm0.08$)} & \tiny{($77.87\pm0.16$)} & \tiny{($79.66\pm0.13$)}   & \tiny{($42.52 \pm 0.08$)} & \tiny{($45.44 \pm 0.14$)} & \tiny{($45.64 \pm 0.11$)}\\
\cline{2-8}
& Ada & $47.52 $ \tiny{$\pm0.95$} & $66.74 $ \tiny{$\pm0.35$} & $70.33 $ \tiny{$\pm0.50$} & $0.00  $ \tiny{$\pm 0.00$} & $0.00  $ \tiny{$\pm 0.00$} & $12.46  $ \tiny{$\pm 0.36$} \\
& Margin & \tiny{($72.69\pm0.24$)} & \tiny{($78.20\pm0.09$)} & \tiny{($78.87\pm0.12$)}  & \tiny{($31.26 \pm 0.21$)} & \tiny{($34.06 \pm 0.12$)} & \tiny{($42.90 \pm 0.07$)}\\
\bottomrule
\end{tabular}\\

\begin{tabular}{p{0.1cm}cV{2.5}c|c|cV{2.5}}
\toprule
& & \multicolumn{3}{cV{2.5}}{\textbf{TinyImageNet-LT} Teacher Obj.} \\
& & $L^{\std}$ & $L^{\bal}$ & $L^{\rob}$ \\
\midrule
& Ada &  $0.00 \pm 0.00$ & $0.00 \pm 0.00$ &  $0.00 \pm 0.00$ \\
& Alpha & \tiny{($28.14 \pm 0.12$)} & \tiny{($0.50 \pm 0.00$)} & \tiny{($0.50 \pm 0.00$)} \\
\cline{2-5}
& Ada &  $0.00 \pm 0.00$ & $0.00 \pm 0.00$ &  $0.41 \pm 0.17$ \\
& Margin & \tiny{($9.18 \pm 0.09$)} & \tiny{($7.92 \pm 0.10$)} & \tiny{($23.08 \pm 0.15$)} \\
\bottomrule
\end{tabular}
\end{small}
\end{center}
% \vskip -0.12in
\end{table*}

\subsection{Group DRO comparison}

\citet{Sagawa2020Distributionally} propose a group DRO algorithm to improve long tail performance without distillation. In this section we present additional experimental comparisons to Algorithm 1 from \citet{Sagawa2020Distributionally}. This differs from our robust optimization methodology in Section \ref{sec:algorithms} in two key ways: \textit{(i)} we apply the margin-based surrogates of \citet{menon2020long}, and \textit{(ii)} we use a validation set to update the Lagrange multipliers $\lambda$ in Algorithm \ref{algo:dro-general}. Table \ref{tab:groupdro} shows results from running group DRO directly as specified in Algorithm 1 in \citet{Sagawa2020Distributionally}, as well as a variant where we use the validation set to update Lagrange multipliers in group DRO (labeled as ``with vali'' in Table \ref{tab:groupdro}). Table \ref{tab:groupdro} shows that this latter variant ``with vali'' performs better than the original version without a validation set; thus, for the results in Figures \ref{fig:trade-offs_balanced} and \ref{fig:trade-offs_lt}, we report these better results marked in Table \ref{tab:groupdro} as ``with vali.'' Overall, this comparison shows that $L^{\rob}$ is comparable to group DRO, and that robust distillation protocols can outperform group DRO alone. 

% \input{tables/table_groupdro}
\begin{table}[!ht]
\caption{Results from comparison to group DRO (Algorithm 1 in \citet{Sagawa2020Distributionally}) without distillation. ``No vali'' uses the training set to update group Lagrange multipliers, as done originally by \citet{Sagawa2020Distributionally}. ``With vali'' uses the validation set to compute group Lagrange multipliers as done in all other experiments in our paper. Worst-class accuracy is shown above, and balanced accuracy is shown in parentheses below. Mean and standard error are shown over 5 repeats.}
\label{tab:groupdro}
% \vskip 0.15in
\begin{center}
\begin{small}
\begin{tabular}{V{2.5}c|cV{2.5}c|cV{2.5}c|cV{2.5}}
\toprule
\multicolumn{2}{V{2.5}cV{2.5}}{\textbf{CIFAR-10} group DRO} & \multicolumn{2}{cV{2.5}}{\textbf{CIFAR-100} group DRO} & \multicolumn{2}{cV{2.5}}{\textbf{TinyImageNet} group DRO} \\
% & & \multicolumn{2}{cV{2.5}}{Teacher Obj.} & \multicolumn{2}{cV{2.5}}{Teacher Obj.} & \multicolumn{2}{cV{2.5}}{Teacher Obj.}\\
 No vali & With vali & No vali & With vali & No vali & With vali \\
\midrule
 $86.65$ \tiny{$\pm 0.49$}  & $89.32 $ \tiny{$\pm 0.21$} &  $40.35 $ \tiny{$\pm 1.18$}  & $43.89 $ \tiny{$\pm 1.12$}  & $0.00 $ \tiny{$\pm 0.00$}  & $9.17 $ \tiny{$\pm 1.55$}\\
 \tiny{($93.61 \pm 0.09$)} & \tiny{($92.34 \pm 0.07$)} & \tiny{($70.25 \pm 0.17$)} & \tiny{($65.18 \pm 0.08$)} & \tiny{($6.55 \pm 0.41$)} & \tiny{($47.67 \pm 0.22$)}\\
\bottomrule
\end{tabular}

\begin{tabular}{V{2.5}c|cV{2.5}c|cV{2.5}c|cV{2.5}}
\toprule
\multicolumn{2}{V{2.5}cV{2.5}}{\textbf{CIFAR-10-LT} group DRO} & \multicolumn{2}{cV{2.5}}{\textbf{CIFAR-100-LT} group DRO} & \multicolumn{2}{cV{2.5}}{\textbf{TinyImageNet-LT} group DRO} \\
No vali & With vali & No vali & With vali & No vali & With vali \\
\midrule
$51.59 $ \tiny{$\pm 2.49$} & $59.93 $ \tiny{$\pm 0.59$} &   $0.00 $ \tiny{$\pm 0.00$} & $0.19 $ \tiny{$\pm 0.17$}  &   $0.00 $ \tiny{$\pm 0.00$} & $0.00 $ \tiny{$\pm 0.00$}\\
\tiny{($71.94 \pm 0.75$)} & \tiny{($74.39 \pm 0.17$)}  & \tiny{($39.81 \pm 0.23$)} & \tiny{($40.47 \pm 0.17$)}& \tiny{($9.79 \pm 0.40$)} & \tiny{($22.49 \pm 0.10$)}\\
\bottomrule
\end{tabular}

\end{small}
\end{center}
% \vskip -0.2in
\end{table}

\subsection{Additional ImageNet comparisons}\label{app:imgnet}
Here we present additional results when training ResNet-18 teachers and students on ImageNet. Table \ref{tab:combos_imagenet} includes measures of worst-1 accuracy, worst-10 accuracy, worst-50 accuracy, and worst-100 accuracy.

% \input{tables/table_combos_imagenet}
\begin{table}[!ht]
\caption{ImageNet comparison of ResNet-18 teacher/student combos on test. Average worst-1/10/50/100 accuracy shown above, standard accuracy shown in parentheses below. The combination with the best worst-class accuracy is bolded. Mean and standard error are reported over up to 5 repeats.}
\label{tab:combos_imagenet}
% \vskip 0.15in
\begin{center}
\begin{small}
\begin{tabular}{p{0.1cm}cV{2.5}c|cV{2.5}c|cV{2.5}c|cV{2.5}c|cV{2.5}}
& & \multicolumn{2}{cV{2.5}}{Worst-1 Accuracy} & \multicolumn{2}{cV{2.5}}{Worst-10 Accuracy} & \multicolumn{2}{cV{2.5}}{Worst-50 Accuracy} & \multicolumn{2}{cV{2.5}}{Worst-100 Accuracy} \\
\toprule
& & \multicolumn{2}{cV{2.5}}{\textbf{ImageNet} Teacher Obj.} & \multicolumn{2}{cV{2.5}}{\textbf{ImageNet} Teacher Obj.} & \multicolumn{2}{cV{2.5}}{\textbf{ImageNet} Teacher Obj.} & \multicolumn{2}{cV{2.5}}{\textbf{ImageNet} Teacher Obj.} \\
&  & $L^{\std}$ & $L^{\rob}$ & $L^{\std}$ & $L^{\rob}$ & $L^{\std}$ & $L^{\rob}$ & $L^{\std}$ & $L^{\rob}$  \\
\midrule
\multirow{10}{*}{\rotatebox{90}{Student Obj.}} 
& none & $0.00$ & $10.71$ & $11.54$ & $17.13$ & $24.01$ & $25.08$ & $30.35$ & $29.96$ \\
&& \tiny{($67.29$)} & \tiny{($63.10$)} & \tiny{($67.29$)} & \tiny{($63.10$)} & \tiny{($67.29$)} & \tiny{($63.10$)} & \tiny{($67.29$)} & \tiny{($63.10$)} \\
\cline{2-10}
& Post &  $8.70$ & $3.57$ & $16.15$ & $11.64$ & $21.85$ & $18.86$ & $25.58$ & $23.17$ \\
& shift & \tiny{($48.62$)} & \tiny{($48.83$)} &  \tiny{($48.62$)} & \tiny{($48.83$)} & \tiny{($48.62$)} & \tiny{($48.83$)} & \tiny{($48.62$)} & \tiny{($48.83$)} \\
% \cline{2-8}
% & Ada &   && & & & \\
% & Alpha & & & & & & \\
% \cline{2-8}
% & Ada &  &  & & & & \\
% & Margin & &  & & & & \\
\cline{2-10}
& $L^{\stdd}$ & $3.20 \pm 1.33$ &\cellcolor{blue!15} $3.79 \pm 0.11$ & $10.07 \pm 0.27$ &\cellcolor{blue!15} $10.22 \pm 0.33$ & $20.30 \pm 0.39$ &\cellcolor{blue!15} $22.61 \pm 0.32$ & $26.45 \pm 0.25$ &\cellcolor{blue!15} $29.01 \pm 0.21$ \\
&& \tiny{($65.46 \pm 0.05$)} &\cellcolor{blue!15} \tiny{($64.54 \pm 0.01$)} & \tiny{($65.46 \pm 0.05$)} &\cellcolor{blue!15} \tiny{($64.54 \pm 0.01$)} & \tiny{($65.46 \pm 0.05$)} &\cellcolor{blue!15} \tiny{($64.54 \pm 0.01$)} & \tiny{($65.46 \pm 0.05$)} &\cellcolor{blue!15} \tiny{($64.54 \pm 0.01$)}\\
\cline{2-10}
& $L^{\robd}$  & $0.00 \pm 0.00$  & $0.00 \pm 0.00$ &$1.18 \pm 0.02$  &$1.47 \pm 0.04$ & $13.02 \pm 0.16$ & $6.85 \pm 0.13$ &  $21.00 \pm 0.19$&  $11.26 \pm 0.16$\\
& \tiny{(teacher val)} &\tiny{($59.60 \pm 0.10$)} & \tiny{($51.01 \pm 0.12$)} &\tiny{($59.60 \pm 0.10$)} & \tiny{($51.01 \pm 0.12$)}& \tiny{($59.60 \pm 0.10$)} & \tiny{($51.01 \pm 0.12$)} & \tiny{($59.60 \pm 0.10$)} & \tiny{($51.01 \pm 0.12$)}\\
\cline{2-10}
& $L^{\robd}$  &$0.00 \pm 0.00$ & $0.00 \pm 0.00$ &  $8.32 \pm 1.04$ & $5.99 \pm 0.00$ & $18.77 \pm 0.03$ &  $16.82 \pm 0.00$ & $23.95 \pm 0.31$ &  $22.16 \pm 0.00$ \\
& \tiny{(one-hot val)} & \tiny{($59.65 \pm 0.01$)} & \tiny{($55.34 \pm 0.00$)} & \tiny{($59.65 \pm 0.01$)} & \tiny{($55.34 \pm 0.00$)} & \tiny{($59.65 \pm 0.01$)} & \tiny{($55.34 \pm 0.00$)} & \tiny{($59.65 \pm 0.01$)} & \tiny{($55.34 \pm 0.00$)}\\
\bottomrule
\end{tabular}
\end{small}

\end{center}
% \vskip -0.1in
\end{table}

% for referring to main bib references
\bibliography{main}

\end{document}
