% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
% \externaldocument{zhu_155}



% Optional math commands from https://github.com/goodfeli/dlbook_notation.
% \input{math_commands.tex}
%%%%% NEW MATH DEFINITIONS %%%%%

\usepackage{amsmath,amsfonts,bm}

% Mark sections of captions for referring to divisions of figures
\newcommand{\figleft}{{\em (Left)}}
\newcommand{\figcenter}{{\em (Center)}}
\newcommand{\figright}{{\em (Right)}}
\newcommand{\figtop}{{\em (Top)}}
\newcommand{\figbottom}{{\em (Bottom)}}
\newcommand{\captiona}{{\em (a)}}
\newcommand{\captionb}{{\em (b)}}
\newcommand{\captionc}{{\em (c)}}
\newcommand{\captiond}{{\em (d)}}

% Highlight a newly defined term
\newcommand{\newterm}[1]{{\bf #1}}


% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
\def\twofigref#1#2{figures \ref{#1} and \ref{#2}}
\def\quadfigref#1#2#3#4{figures \ref{#1}, \ref{#2}, \ref{#3} and \ref{#4}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Reference to two sections.
\def\twosecrefs#1#2{sections \ref{#1} and \ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections \ref{#1}, \ref{#2} and \ref{#3}}
% Reference to an equation, lower-case.
\def\eqref#1{equation~\ref{#1}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters; the tie keeps "chapters" and the first
% number separated by a non-breaking space.
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
\def\twoalgref#1#2{algorithms \ref{#1} and \ref{#2}}
\def\Twoalgref#1#2{Algorithms \ref{#1} and \ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
\def\twopartref#1#2{parts \ref{#1} and \ref{#2}}

\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
\def\1{\bm{1}}
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{\mathcal{D_{\mathrm{valid}}}}
\newcommand{\test}{\mathcal{D_{\mathrm{test}}}}

\def\eps{{\epsilon}}


% Random variables
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
% rm is already a command, just don't name any random variables m
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% Random vectors
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% Random vector i (bold upright i), named \rvi to match the letter it typesets.
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% Elements of random vectors
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% Random matrices
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% Elements of random matrices
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% Vectors
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

% Elements of vectors
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% Matrix
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

% Tensor
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% Graph
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% Sets
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% Entries of a matrix
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% entries of a tensor
% Same font as tensor, without \bm wrapper
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% The true underlying data generating distribution
\newcommand{\pdata}{p_{\rm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\rm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\rm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\rm{model}}}
\newcommand{\Pmodel}{P_{\rm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\rm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\rm{encoder}}}
\newcommand{\pdecode}{p_{\rm{decoder}}}
\newcommand{\precons}{p_{\rm{reconstruct}}}

\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution

\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
\let\ab\allowbreak



\usepackage{hyperref}
\usepackage{url}
\usepackage{times}

% if you need to pass options to natbib, use, e.g.:
\PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2022
\usepackage{natbib}
% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{multirow}
\usepackage{booktabs} % for professional tables
% \input{math_symbol}
\def \S {\mathbf{S}}
\def \A {\mathcal{A}}
\def \X {\mathcal{X}}
\def \Ab {\bar{\A}}
\def \R {\mathbb{R}}
\def \Kt {\widetilde{K}}
\def \k {\mathbf{k}}
\def \w {\mathbf{w}}
\def \v {\mathbf{v}}
\def \t {\mathbf{t}}
\def \x {\mathbf{x}}
\def \m {\mathbf{m}}
\def \n {\mathbf{n}}
\def \Se {\mathcal{S}}
%\def \E {\mathrm{E}}
\def \E {\mathbb{E}}
\def \Rh {\widehat{R}}
\def \x {\mathbf{x}}
\def \u {\mathbf{u}}
\def \v {\mathbf{v}}
\def \p {\mathbf{p}}
\def \a {\mathbf{a}}
\def \diag {\mbox{diag}}
\def \b {\mathbf{b}}
\def \e {\mathbf{e}}
\def \ba {\boldsymbol{\alpha}}
\def \c {\mathbf{c}}
\def \tr {\mbox{tr}}
\def \d {\mathbf{d}}
\def \z {\mathbf{z}}
\def \s {\mathbf{s}}
\def \bh {\widehat{b}}
\def \y {\mathbf{y}}
\def \u {\mathbf{u}}
%\def \L {\mathcal{L}}
\def \H {\mathcal{H}}
\def \g {\mathbf{g}}
\def \E {\mathbb{E}}
\def \F {\mathcal{F}}
\def \I {\mathbb{I}}
\def \P {\mathcal{P}}
\def \Q {\mathcal{Q}}
\def \xh {\widehat{\x}}
\def \wh {\widehat{\w}}
\def \ah {\widehat{\alpha}}
\def \Rc {\mathcal R}

\def \Bh {\widehat B}
\def \Ah {\widehat A}
\def \Uh {\widehat U}
\def \Ut {\widetilde U}
\def \B {\mathbf B}
\def \C {\mathbf C}
\def \U {\mathbf U}
\def \Kh {\widehat K}
\def \fh {\widehat f}
\def \yh {\widehat y}
\def \Xh {\widehat{X}}
\def \Fh {\widehat{F}}




\def \y {\mathbf{y}}
\def \E {\mathbb{E}}
\def \I {\mathbb{I}}
\def \x {\mathbf{x}}
\def \g {\mathbf{g}}
\def \D {\mathcal{D}}
\def \z {\mathbf{z}}
\def \u {\mathbf{u}}
\def \H {\mathcal{H}}
\def \Pc {\mathcal{P}}
\def \w {\mathbf{w}}
\def \r {\mathbf{r}}
\def \R {\mathbb{R}}
\def \S {\mathcal{S}}
\def \regret {\mbox{regret}}
\def \Uh {\widehat{U}}
\def \Q {\mathcal{Q}}
\def \W {\mathcal{W}}
\def \N {\mathcal{N}}
\def \A {\mathcal{A}}
\def \q {\mathbf{q}}
\def \v {\mathbf{v}}
\def \M {\mathcal{M}}
\def \c {\mathbf{c}}
\def \ph {\widehat{p}}
\def \d {\mathbf{d}}
\def \p {\mathbf{p}}
\def \q {\mathbf{q}}
\def \db {\bar{\d}}
\def \dbb {\bar{d}}
\def \I {\mathcal{I}}
\def \xt {\widetilde{\x}}
\def \f {\mathbf{f}}
\def \a {\mathbf{a}}
\def \b {\mathbf{b}}
\def \ft {\widetilde{\f}}
\def \bt {\widetilde{\b}}
\def \h {\mathbf{h}}
\def \B {\mathbf{B}}
\def \bts {\widetilde{b}}
\def \fts {\widetilde{f}}
\def \Gh {\widehat{G}}
\def \G {\mathcal {G}}
\def \bh {\widehat{b}}
\def \fh {\widehat{f}}
\def \wh {\widehat{\w}}
\def \vb {\bar{v}}
\def \zt {\widetilde{\z}}
\def \zts {\widetilde{z}}
\def \s {\mathbf{s}}
\def \gh {\widehat{\g}}
\def \vh {\widehat{\v}}
\def \Sh {\widehat{S}}
\def \rhoh {\widehat{\rho}}
\def \hh {\widehat{\h}}
\def \C {\mathcal{C}}
\def \V {\mathcal{L}}
\def \t {\mathbf{t}}
\def \xh {\widehat{\x}}
\def \Ut {\widetilde{U}}
\def \wt {\widetilde{\w}}
\def \Th {\widehat{T}}
\def \Ot {\tilde{\mathcal{O}}}
\def \X {\mathcal{X}}
\def \Y {\mathcal{Y}}
\def \nb {\widehat{\nabla}}
\def \K {\mathcal{K}}
\def \P {\mathbb{P}}
\def \T {\mathcal{T}}
\def \F {\mathcal{F}}
\def \ft{\widetilde{f}}
\def \xt {\widetilde{x}}
\def \Rt {\mathcal{R}}
\def \Rb {\bar{\Rt}}
\def \wb {\bar{\w}}



\usepackage{wrapfig}
\newtheorem{prop}{Proposition}
\usepackage{hyperref}
% Attempt to make %hyperref and algorithmic work together better:
 \newcommand{\theHalgorithm}{\arabic{algorithm}}
\usepackage{algorithm}
\usepackage{algorithmicx,algpseudocode}





%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{AUC Maximization in Imbalanced Lifelong Learning\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Xiangyu Zhu}{}}
\author[1,2]{Xiangyu Zhu}
\author[2]{Jie Hao}
\author[3]{Yunhui Guo}
\author[2]{Mingrui Liu}
% Add affiliations after the authors
\affil[1]{%
    Meituan, China
}
\affil[2]{%
        Department of Computer Science, George Mason University, USA
}
\affil[3]{%
    Department of Computer Science, University of Texas at Dallas, USA
  }


  \begin{document}
  
\onecolumn %% Remove this line if a two-column layout is desired for the supplement

\maketitle

\appendix
\section{DIANA Outperforms Other Methods with Class Balanced Sampling}
\label{sec: other_methods}

\begin{table*}[h]
	\centering
	\caption{The results of memory-based methods with Class Balanced Reservoir Sampling on Split CIFAR100, Split CUB200, and Split AWA2. }
    \setlength\tabcolsep{2.5pt} 
	\scalebox{0.75}{
    	\begin{tabular}{l|ccc|ccc|ccc}
		\toprule
		\multicolumn{1}{c|}{\multirow{2}[2]{*}{\textbf{Method}}} & \multicolumn{3}{c|}{\textbf{Split-CIFAR}} & \multicolumn{3}{c}{\textbf{Split-CUB}} & \multicolumn{3}{|c}{\textbf{Split-AWA2}} \\
		& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%} & \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}
		& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}\\
    \midrule
    A-GEM  & 58.3 $\pm$ 3.5 & 60.3 $\pm$ 0.3 & 19.5 $\pm$ 3.6 & 50.8 $\pm$ 4.0 & 50.2 $\pm$ 0.3 & 17.6 $\pm$ 2.5 & 55.4 $\pm$ 1.8 & 49.9 $\pm$ 4.0 & 30.6 $\pm$ 2.6 \\
    GDumb & 70.6 $\pm$ 1.4 & 63.1 $\pm$ 0.9 & 6.3 $\pm$ 0.5 & 66.8 $\pm$ 5.5 & \textbf{55.3 $\pm$ 5.2} & 4.3 $\pm$ 2.7 & 98.4 $\pm$ 0.8 & 92.5 $\pm$ 2.6 & 0.2 $\pm$ 0.6 \\
    DER   & 64.9 $\pm$ 3.2 & 58.3 $\pm$ 1.4 & 13.4 $\pm$ 3.3 & 65.0 $\pm$ 5.8 & 51.7 $\pm$ 2.0 & 10.3 $\pm$ 3.2 & 86.1 $\pm$ 14.1 & 71.6 $\pm$ 8.0 & 10.6 $\pm$ 11.4 \\
    MEGA  & 66.1 $\pm$ 5.7 & 59.4 $\pm$ 2.6 & 3.2 $\pm$ 1.5 & 50.8 $\pm$ 1.1 & 50.0 $\pm$ 0.4 & 13.0 $\pm$ 2.1 & 57.1 $\pm$ 6.3 & 50.5 $\pm$ 5.9 & 17.3 $\pm$ 1.4 \\
	RM & \textbf{77.4 $\pm$ 1.9} & \textbf{69.8 $\pm$ 1.7} & 4.3 $\pm$ 1.9 & 58.8 $\pm$ 2.8 & 54.4 $\pm$ 1.1 & 11.3 $\pm$ 1.2 & 80.5 $\pm$ 6.0 & 71.1 $\pm$ 8.8 & 5.6 $\pm$ 2.0 \\
    \midrule
    DIANA & 76.7 $\pm$ 1.0 & 69.3 $\pm$ 1.7 & \textbf{3.1 $\pm$ 0.7} &  \textbf{73.4 $\pm$ 2.1} & 51.0 $\pm$ 0.5 &  \textbf{0.2 $\pm$ 0.3} & \textbf{99.5 $\pm$ 0.2} &  \textbf{87.4 $\pm$ 2.9} & \textbf{0.1 $\pm$ 0.1} \\
	%DIANA-RM & 0.766 $\pm$ 0.005 & 0.685 $\pm$ 0.011 & 0.034 $\pm$ 0.008 & \textbf{0.763 $\pm$ 0.031} & 0.545 $\pm$ 0.011 & \textbf{0.001 $\pm$ 0.008} & 0.994 $\pm$ 0.002 & \textbf{0.941 $\pm$ 0.002} & 0.039 $\pm$ 0.006 \\
    \bottomrule
    \end{tabular}}%
  \label{tab:cbrs_stand}%
\end{table*}%

\begin{table*}[h]
  \centering
  \caption{The results of memory-based methods with Class Balanced Reservoir Sampling on medical images ISIC2019 and satellite images EuroSat.}
	\scalebox{0.75}{
    \begin{tabular}{l|ccc|ccc}
    \toprule
    \multicolumn{1}{c|}{\multirow{2}[2]{*}{\textbf{Method}}} & \multicolumn{3}{c|}{\textbf{ISIC2019}} & \multicolumn{3}{c}{\textbf{EuroSat}} \\
          	& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%} & \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}\\
    \midrule
    A-GEM  & 66.5 $\pm$ 10.3 & \textbf{90.6 $\pm$ 0.1} & 12.8 $\pm$ 11.9 & 79.4 $\pm$ 4.4 & 55.6 $\pm$ 4.0 & 14.4 $\pm$ 5.0 \\
    GDumb & 57.2 $\pm$ 9.0 & 71.1 $\pm$ 15.9 & 22.9 $\pm$ 9.1 & 81.6 $\pm$ 8.4 & 73.4 $\pm$ 8.9 & 2.4 $\pm$ 6.6 \\
    DER   & 61.8 $\pm$ 13.7 & 84.5 $\pm$ 12.6 & 17.7 $\pm$ 17.1 & 81.2 $\pm$ 4.5 & 65.3 $\pm$ 6.2 & 7.4 $\pm$ 4.2 \\
    MEGA  & 54.9 $\pm$ 9.6 & 62.3 $\pm$ 15.9 & 18.8 $\pm$ 10.7 & 77.8 $\pm$ 7.3 & 69.7 $\pm$ 10.3 & 2.0 $\pm$ 3.6 \\
    RM & 72.8 $\pm$ 5.0 & 71.5 $\pm$ 3.7 & 10.3 $\pm$ 2.5 & 86.3 $\pm$ 2.6 & 79.3 $\pm$ 1.5 & 0.7 $\pm$ 4.2 \\
        \midrule
    DIANA &  \textbf{77.7 $\pm$ 4.2} & 78.2 $\pm$ 4.7 &  \textbf{1.9 $\pm$ 3.2} & \textbf{90.9 $\pm$ 1.7} & \textbf{83.8 $\pm$ 2.7} & \textbf{-1.0 $\pm$ 1.8} \\
    %DIANA-RM & \textbf{0.800 $\pm$ 0.005} & 0.773 $\pm$ 0.037 & \textbf{0.002 $\pm$ 0.025} & 0.908 $\pm$ 0.018 & 0.825 $\pm$ 0.030 & 0.014 $\pm$ 0.013 \\ 
    \bottomrule
    \end{tabular}}%
  \label{tab:cbrs_real}%
\end{table*}%
For imbalanced lifelong learning, class-balanced sampling is an efficient way to alleviate the harm of an imbalanced data stream in some cases. Class-Balancing Reservoir Sampling (CBRS) is an enhanced Reservoir Sampling strategy for memory-based methods under imbalanced lifelong learning settings. It maintains a class-balanced memory by replacing instances of the major class. Thus, in experience replay, training batches from episodic memory are collected in a balanced form. However, since memory-based methods require data from both the current task and episodic memory, they still need to handle an imbalanced data stream from the current task even though the memory is balanced. Moreover, the distributions of balanced and imbalanced data streams differ significantly, which may lead to severe gradient interference between these two streams. 
%Thus, it's worth investigating the imbalanced problem with class-balanced sampling.

We examine memory-based approaches with CBRS instead of vanilla Reservoir Sampling; the learning rate and batch size are consistent with Section~5.1. %\ref{sec:exp-setups}. 
We implement CBRS according to the pseudo-code described by \citet{chrysakis2020online}. While the memory is not full, all incoming samples are stored in memory. After the memory is full, each class is categorized as either a full class or a not-full class. Once a class has been the largest class in memory, it is marked as a full class and is not allowed to grow further in memory. Conversely, stored instances of full classes are replaced by incoming instances of not-full classes.

Tables~\ref{tab:cbrs_stand} and~\ref{tab:cbrs_real} present the results with CBRS. The evolution of the average AUC during the lifelong learning process is shown in Figure~\ref{fig:evolution_cbrs}. GEM is not included because it is not compatible with CBRS: it uses a Ring Buffer instead of Reservoir Sampling, and each task has its own memory space.
% For example, DER cannot access task identifiers in its memory, so it is not able to identify the largest class.   
Compared with vanilla Reservoir Sampling in Tables~\ref{tab:result_of_std_data} and~\ref{tab:result_of_prac_data}, using CBRS significantly improves the performance of most algorithms, except A-GEM. On Split-CIFAR, the AUC scores of A-GEM, GDumb, DER, MEGA, and DIANA increase by 1.5\%, 5.1\%, 2.5\%, 3.0\%, and 8.3\%, respectively.
We also compare DIANA with another class-balanced method, Rainbow Memory (RM), which uses a different class-balanced sampling strategy. Our proposed DIANA achieves a higher AUC than RM on all datasets except Split-CIFAR.
%On Split-CUB, they increase -0.8\%, 5.4\%, 0.7\%, and 3.0\% respectively. 

We give a detailed analysis of each method below.

\begin{itemize}
    \item A-GEM obtains no benefit from CBRS. Since the current data stream is still imbalanced, we believe it is not suitable for A-GEM to rectify the current gradient based on the reference gradient, which is computed on a balanced replay buffer. 
    
    \item GDumb has stable and significant increments on all 5 datasets, because it's only trained on balanced memory data, and doesn't suffer from the gradient interference problem. 
    
    \item DER increases on Split-CIFAR, Split-AWA2, and EuroSat, while dropping performance on ISIC2019 and Split-CUB.
    
    \item MEGA gets boosted on Split-CIFAR and Split-CUB, but drops on the other three datasets. 
    
    \item DIANA has much better AUC than other methods, as shown in Table~\ref{tab:cbrs_stand} and Table~\ref{tab:cbrs_real}. There are two interesting observations. First, on the ISIC2019 dataset, DIANA has worse accuracy than A-GEM ($78.2\%$ versus $90.6\%$) but a better AUC value ($77.7\%$ versus $66.5\%$). The reason is that the ISIC2019 dataset is very imbalanced (see Section~5.1, Datasets). Second, on the EuroSat dataset, DIANA has a negative forgetting measure, which means that DIANA with CBRS can help learn previous tasks when learning new tasks.
\end{itemize}



\textbf{In summary, DIANA achieves higher AUC than other memory-based approaches when applying Class-Balancing Reservoir Sampling.} DIANA has consistent improvements on all 5 datasets. We conjecture that the two-model framework alleviates the gradient interference between the imbalanced data stream and the balanced replay buffer, as discussed in Section~4.2.%\ref{sec:grad_interf}.

% A-GEM does not even benefit from CBRS, because when the gradients from episodic memory and current task conflict more severely. Gdumb has a because it only trained on episodic memory data and would not be affected by imbalanced data from the current task.

% Class balance sampling


\begin{figure*}[!t]
\begin{center}
	\subfigure[Split-CIFAR]{\includegraphics[width=0.45\linewidth]{figures/evoplot_cbrs_cifar.pdf}} 
	\subfigure[Split-CUB]{\includegraphics[width=0.45\linewidth]{figures/evoplot_cbrs_cub.pdf}}   
	\subfigure[Split-AWA2]{\includegraphics[width=0.45\linewidth]{figures/evoplot_cbrs_awa.pdf}}    
	\subfigure[ISIC2019]{\includegraphics[width=0.45\linewidth]{figures/evoplot_cbrs_isic.pdf}}     
	\subfigure[EuroSat]{\includegraphics[width=0.45\linewidth]{figures/evoplot_cbrs_eurosat.pdf}}
	\includegraphics[width=0.6\linewidth]{figures/cbrs_legend.pdf}
\end{center}
\vspace*{-0.1in}
\caption{ Evolution of average AUC of memory-based methods with Class Balanced Reservoir Sampling. }
\label{fig:evolution_cbrs}
\vspace{-.1in}
\end{figure*}

\section{Detailed Results}


% You can have as much text here as you want. The main body must be at most $8$ pages long.
% For the final version, one more page can be added.
% If you want, you can use an appendix like this one, even using the one-column format.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We show the detailed results of all the methods on imbalanced benchmarks and practical data in Figure~\ref{fig:result}, Table~\ref{tab:result_of_std_data} and Table~\ref{tab:result_of_prac_data}.

\begin{figure*}[t]
\begin{center}
	\includegraphics[width=1.0\linewidth]{figures/result.pdf}    \\
	\includegraphics[width=0.9\linewidth]{figures/legend.pdf}
\end{center}
 \caption{Performance of DIANA and other baselines on Split-CIFAR, Split-CUB, Split-AWA2, ISIC2019, and EuroSat. Each result is averaged over 5 runs with different random seeds. The standard deviation (std) is represented by the black line.}
\label{fig:result}
\vspace{-.1in}
\end{figure*}


\begin{table*}%[htbp]
	\centering
	\caption{The results of average AUC, average ACC, and average Forgetting (FGT) of different
methods on Split CIFAR100, Split CUB200, and Split AWA2. }
    \setlength\tabcolsep{2.5pt} 
	\scalebox{0.75}{
	\begin{tabular}{l|ccc|ccc|ccc}
		\toprule
		\multicolumn{1}{c|}{\multirow{2}[2]{*}{\textbf{Method}}} & \multicolumn{3}{c|}{\textbf{Split-CIFAR}} & \multicolumn{3}{c}{\textbf{Split-CUB}} & \multicolumn{3}{|c}{\textbf{Split-AWA2}} \\
		& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%} & \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}
		& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}\\
		\midrule
		SINGLE & 58.9 $\pm$ 3.0 & 60.1 $\pm$ 0.2 & 15.9 $\pm$ 3.3 & 54.2 $\pm$ 2.5 & 50.1 $\pm$ 0.5 & 14.9 $\pm$ 2.7 & 55.0 $\pm$ 5.3 & 47.6 $\pm$ 0.5 & 33.8 $\pm$ 6.4 \\
		EWC   & 64.4 $\pm$ 2.7 & 60.3 $\pm$ 0.6 & 10.0 $\pm$ 2.2 & 51.5 $\pm$ 0.9 & 50.1 $\pm$ 0.4 & 16.6 $\pm$ 0.6 & 56.2 $\pm$ 5.9 & 47.0 $\pm$ 1.3 & 32.2 $\pm$ 5.6 \\
		MAS   & 57.4 $\pm$ 0.9 & 60.0 $\pm$ 0.3 & 17.1 $\pm$ 1.2 & 51.7 $\pm$ 1.0 & 50.1 $\pm$ 0.4 & 15.5 $\pm$ 1.9 & 57.4 $\pm$ 5.2 & 47.4 $\pm$ 1.3 & 30.6 $\pm$ 5.4 \\
		GEM   & 65.2 $\pm$ 1.6 & 61.5 $\pm$ 0.1 & 11.9 $\pm$ 1.1 & 55.4 $\pm$ 1.0 & 50.4 $\pm$ 0.6 & 15.5 $\pm$ 0.7 & 69.6 $\pm$ 5.5 & 54.3 $\pm$ 4.4 & 21.7 $\pm$ 5.4 \\
		A-GEM  & 56.8 $\pm$ 4.6 & 60.2 $\pm$ 0.3 & 19.9 $\pm$ 5.0 & 51.6 $\pm$ 2.0 & 50.1 $\pm$ 0.5 & 17.5 $\pm$ 1.1 & 56.4 $\pm$ 3.2 & 47.7 $\pm$ 0.4 & 31.5 $\pm$ 6.2 \\
		GDumb & 65.5 $\pm$ 2.7 & 61.4 $\pm$ 2.0 & 6.0 $\pm$ 3.4 & 61.4 $\pm$ 1.1 & 50.5 $\pm$ 0.8 & 6.0 $\pm$ 0.8 & 85.2 $\pm$ 4.3 & 72.2 $\pm$ 1.5 & \textbf{1.9 $\pm$ 0.9} \\
		DER  & 62.6 $\pm$ 3.2 & 59.6 $\pm$ 1.2 & 16.9 $\pm$ 3.1 & 65.7 $\pm$ 4.4 &  \textbf{53.5 $\pm$ 2.1} & 
		11.2 $\pm$ 3.8 & 80.4 $\pm$ 3.3 & 63.1 $\pm$ 6.4 & 15.3 $\pm$ 1.7 \\
		MEGA  & 63.1 $\pm$ 3.0 & 61.5 $\pm$ 1.3 & \textbf{5.5 $\pm$ 2.0} & 50.1 $\pm$ 3.0 & 50.6 $\pm$ 0.3 & 16.1 $\pm$ 3.9 & 65.3 $\pm$ 5.3 & 50.8 $\pm$ 5.4 & 20.5 $\pm$ 5.0 \\
% 		RM & \textbf{0.774 $\pm$ 0.019} & \textbf{0.698 $\pm$ 0.017} & 0.043 $\pm$ 0.019 & 0.588 $\pm$ 0.028 & 0.544 $\pm$ 0.011 & 0.113 $\pm$ 0.012 & 0.805 $\pm$ 0.060 & 0.711 $\pm$ 0.088 & 0.056 $\pm$ 0.020 \\
		\midrule
		DIANA & \textbf{68.4 $\pm$ 2.0} & \textbf{62.3 $\pm$ 0.9} & 6.6 $\pm$ 1.2 & \textbf{70.4 $\pm$ 0.7} & 52.3 $\pm$ 0.3 & \textbf{2.4 $\pm$ 0.7} & \textbf{98.2 $\pm$ 1.1} & \textbf{78.7 $\pm$ 3.4} & 2.0 $\pm$ 0.3\\
		\bottomrule
% 		DIANA-RM & 0.766 $\pm$ 0.005 & 0.685 $\pm$ 0.011 & \textbf{0.034 $\pm$ 0.008} & \textbf{0.763 $\pm$ 0.031} & \textbf{0.545 $\pm$ 0.011} & \textbf{0.001 $\pm$ 0.008} & \textbf{0.994 $\pm$ 0.002} & \textbf{0.941 $\pm$ 0.002} & 0.039 $\pm$ 0.006 \\
	\end{tabular}}%
	\label{tab:result_of_std_data}%



  \centering
  \caption{Average AUC, average ACC, and average Forgetting (FGT) of different
methods on the medical image dataset ISIC2019 and the satellite image dataset EuroSat.}
	\scalebox{0.75}{
    \begin{tabular}{l|ccc|ccc}
    \toprule
    \multicolumn{1}{c|}{\multirow{2}[2]{*}{\textbf{Method}}} & \multicolumn{3}{c|}{\textbf{ISIC2019}} & \multicolumn{3}{c}{\textbf{EuroSat}} \\
          	& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%} & \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}\\
    \midrule
    SINGLE & 61.7 $\pm$ 7.7 & 86.7 $\pm$ 5.0 & 17.7 $\pm$ 5.3 & 69.3 $\pm$ 6.0 & 56.0 $\pm$ 1.5 & 20.0 $\pm$ 6.5 \\
    EWC   & 59.2 $\pm$ 8.1 & 86.7 $\pm$ 5.0 & 19.4 $\pm$ 6.1 & 76.3 $\pm$ 6.3 & 55.3 $\pm$ 2.7 & 10.1 $\pm$ 7.2 \\
    MAS   & 62.5 $\pm$ 6.5 & 86.7 $\pm$ 5.0 & 17.5 $\pm$ 6.0 & 74.6 $\pm$ 4.4 & 56.4 $\pm$ 4.0 & 13.1 $\pm$ 6.4 \\
    GEM   & 71.3 $\pm$ 6.5 & 84.3 $\pm$ 4.6 & 7.2 $\pm$ 2.0 & 80.5 $\pm$ 3.2 & 61.0 $\pm$ 5.4 & 8.3 $\pm$ 4.1 \\
    A-GEM  & 66.3 $\pm$ 6.5 & 86.7 $\pm$ 4.8 & 13.9 $\pm$ 5.2 & 74.0 $\pm$ 6.5 & 58.0 $\pm$ 3.4 & 13.5 $\pm$ 6.3 \\
    GDumb & 72.2 $\pm$ 4.2 & \textbf{90.4 $\pm$ 0.5} &  \textbf{4.4 $\pm$ 2.0} & 78.3 $\pm$ 5.5 & 64.9 $\pm$ 4.6 & 4.8 $\pm$ 1.9 \\
    DER  & 69.2 $\pm$ 3.9 & 84.1 $\pm$ 4.5 & 9.5 $\pm$ 5.4 & 73.9 $\pm$ 2.6 & 58.6 $\pm$ 2.0 & 17.5 $\pm$ 3.9 \\
    MEGA  & 71.1 $\pm$ 5.3 & 85.8 $\pm$ 4.7 & 5.9 $\pm$ 2.4 & 79.8 $\pm$ 3.1 & 55.8 $\pm$ 4.3 & 5.3 $\pm$ 2.4 \\
    % RM & 0.728 $\pm$ 0.050 & 0.715 $\pm$ 0.037 & 0.103 $\pm$ 0.025 & 0.863 $\pm$ 0.026 & 0.793 $\pm$ 0.015 & \textbf{0.007 $\pm$ 0.042} \\
     \midrule
    DIANA & \textbf{73.6 $\pm$ 4.8} & 85.2 $\pm$ 3.7 & 5.5 $\pm$ 2.5 &  \textbf{86.7 $\pm$ 1.5} &  \textbf{66.0 $\pm$ 1.8} &  \textbf{3.8 $\pm$ 1.7} \\
    % DIANA-RM & \textbf{0.800 $\pm$ 0.005} & 0.773 $\pm$ 0.037 & \textbf{0.002 $\pm$ 0.025} & \textbf{0.908 $\pm$ 0.018} & \textbf{0.825 $\pm$ 0.030} & 0.014 $\pm$ 0.013 \\    
    \bottomrule
    \end{tabular}}%
  \label{tab:result_of_prac_data}%
\end{table*}%

% \begin{table}

% \end{table}%

\begin{figure*}[htbp]
\begin{center}
	\subfigure[EuroSat]{\includegraphics[width=0.8\linewidth]{figures/evoplot_eurosat.pdf}}
	\includegraphics[width=0.7\linewidth]{figures/legend.pdf}
\end{center}
\vspace*{-0.1in}
\caption{ Evolution of average AUC on EuroSat.}
\label{fig:evolution-eurosat}
%\vspace{-.1in}
\end{figure*}


\section{Imbalanced Ratio}
In Figure~\ref{fig:imratio}, we vary the imbalanced ratio (imratio) to evaluate the robustness of different methods. We consider three imratio settings: $\{0.01, 0.05, 0.1\}$. As expected, as the imratio increases, all the algorithms generally achieve better performance. Across all three settings, DIANA achieves the highest performance, except when $\mathit{imratio}=0.01$ on ISIC2019. It is worth mentioning that when the imratio is $0.01$, i.e., only $1\%$ of the samples are positive, the performance of all the methods, including DIANA, drops drastically. A possible reason is that, at this ratio, the positive samples are so rare that it is difficult for any of the methods to learn efficiently.

\begin{figure*}[t]
\begin{center}
	\subfigure[Split-CIFAR]{\includegraphics[width=0.32\linewidth]{figures/imratio_cifar.pdf}}
	\subfigure[Split-CUB]{\includegraphics[width=0.32\linewidth]{figures/imratio_cub.pdf}}  
	\subfigure[ISIC2019]{\includegraphics[width=0.32\linewidth]{figures/imratio_isic.pdf}} 
	\includegraphics[width=0.7\linewidth]{figures/legend.pdf}
\end{center}
\vspace*{-0.2in}
\caption{Average AUC on data with different imbalanced ratios.}
\vspace*{-0.1in}
\label{fig:imratio}
\vspace{-.1in}
\end{figure*}
%\vspace{-.3in}


\section{Balanced Versus Imbalanced}
In this section, we study how imbalance affects lifelong learning. Geometrically, if the direction of gradient descent changes a lot, the optimization is probably unstable and hard to carry out. Thus, we can use the deviation of the gradient direction to indicate whether a task is getting harder or not. We first store all gradients in a task and average them to obtain an anchor gradient. We then calculate the angle between the anchor and each mini-batch gradient. By drawing histograms of the angle deviation, we can compare the distribution of the gradient directions before and after the task is made imbalanced. The histograms are plotted in Figure~\ref{fig:dist-of-angle-on-current-tasks}.
After the task is made imbalanced, the histograms have a wider range and a larger standard deviation (std), which means that the directions of the gradients vary more dramatically.

 
\begin{figure*}[!h]
\begin{center}
	\subfigure[Split-CIFAR]{\includegraphics[width=0.30\linewidth]{figures/g_direction_cifar.pdf}} \quad
	\subfigure[Split-CUB]{\includegraphics[width=0.30\linewidth]{figures/g_direction_cub.pdf}}  \quad
	\subfigure[ISIC2019]{\includegraphics[width=0.30\linewidth]{figures/g_direction_isic.pdf}}   
  
\end{center}
\caption{The distributions of the directions of the gradients on the current tasks. (a) balanced std: 1.59, imbalanced std: 3.50; (b) balanced std: 2.58, imbalanced std: 3.85; (c) balanced std: 2.24, imbalanced std: 5.52.}
\label{fig:dist-of-angle-on-current-tasks}
%\vspace{-.1in}
\end{figure*}




\begin{figure*}[!h]
\begin{center}
	\subfigure[Balanced Split-CUB]{\includegraphics[width=0.24\linewidth]{figures/angle_dis_cub_ba.pdf}}
	\subfigure[Imbalanced Split-CUB]{\includegraphics[width=0.24\linewidth]{figures/angle_dis_cub_imba.pdf}}   
	\subfigure[Balanced Split-CIFAR]{\includegraphics[width=0.255\linewidth]{figures/angle_dis_cifar100_ba.pdf}}   
	\subfigure[Imbalanced Split-CIFAR]{\includegraphics[width=0.24\linewidth]{figures/angle_dis_cifar100_imba.pdf}}  


\end{center}
\caption{The distributions of the angles between the current gradient and the reference gradient, where the proportion of angles greater than 90 degrees is: (a) $1.05\%$, (b) $3.16\%$, (c) $0.13\%$, (d) $9.68\%$.}
\label{fig:dist-of-angle}
%\vspace{-.1in}
\end{figure*}

\bibliography{zhu_155}

\end{document}
