% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b>, then <sep>, then <a>.
% The optional <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% Template placeholder: \title is issued again with the real paper title
% right after \begin{document}, before \maketitle.
\title{Instructions for Authors: Title in Title Case}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Xiangyu Zhu}{}}
% Authors; the bracketed numbers pick the affiliations declared below.
\author[1,2]{Xiangyu Zhu}
\author[2]{Jie Hao}
\author[3]{Yunhui Guo}
\author[2]{Mingrui Liu}
% Affiliations, keyed by the numbers used in the \author entries.
\affil[1]{%
  Meituan, China
}
\affil[2]{%
  Department of Computer Science, George Mason University, USA
}
\affil[3]{%
  Department of Computer Science, University of Texas at Dallas, USA
}


  
%%%%%%%%%% package
% Optional math commands from https://github.com/goodfeli/dlbook_notation.
% \input{math_commands.tex}
%%%%% NEW MATH DEFINITIONS %%%%%

\usepackage{amsmath,amsfonts,bm}

% Caption markers for pointing at the parts of a multi-panel figure.
% Each expands to an emphasized, parenthesized tag such as "(Left)" or "(a)".
% \emph is used instead of the declaration form {\em ...} so that the
% italic correction is inserted automatically.
\newcommand{\figleft}{\emph{(Left)}}
\newcommand{\figcenter}{\emph{(Center)}}
\newcommand{\figright}{\emph{(Right)}}
\newcommand{\figtop}{\emph{(Top)}}
\newcommand{\figbottom}{\emph{(Bottom)}}
\newcommand{\captiona}{\emph{(a)}}
\newcommand{\captionb}{\emph{(b)}}
\newcommand{\captionc}{\emph{(c)}}
\newcommand{\captiond}{\emph{(d)}}

% Highlight a newly defined term: typesets the argument in bold.
% \textbf replaces the obsolete two-letter font switch \bf.
\newcommand{\newterm}[1]{\textbf{#1}}


% Cross-reference helpers. Each macro prints the name of the referenced
% object followed by \ref of the given label(s). A tie (~) before every
% \ref keeps the name and its number on the same line.
% The helpers are written with \def, which silently replaces existing
% commands: in particular \eqref below overrides the amsmath version and
% prints "equation N" without parentheses.

% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
% Reference to two figures.
\def\twofigref#1#2{figures~\ref{#1} and~\ref{#2}}
% Reference to four figures.
\def\quadfigref#1#2#3#4{figures~\ref{#1}, \ref{#2}, \ref{#3} and~\ref{#4}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Reference to two sections.
\def\twosecrefs#1#2{sections~\ref{#1} and~\ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections~\ref{#1}, \ref{#2} and~\ref{#3}}
% Reference to an equation, lower-case.
\def\eqref#1{equation~\ref{#1}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters
% (the original had no space between "chapters" and the first number).
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
% Reference to two algorithms, lower-case and upper case.
\def\twoalgref#1#2{algorithms~\ref{#1} and~\ref{#2}}
\def\Twoalgref#1#2{Algorithms~\ref{#1} and~\ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
% Reference to two parts.
\def\twopartref#1#2{parts~\ref{#1} and~\ref{#2}}

% Ceiling and floor brackets around the argument.
\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
% Bold digit one.
\def\1{\bm{1}}
% Data sets: calligraphic D, with an upright label as subscript for the
% validation and test sets. The subscript is attached outside \mathcal so
% that only the letter D is in the calligraphic argument.
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{\mathcal{D}_{\mathrm{valid}}}
\newcommand{\test}{\mathcal{D}_{\mathrm{test}}}

% Short name for \epsilon.
\def\eps{{\epsilon}}


% Random variables: \r<letter> typesets the lower-case letter with
% \textnormal; \reta does the same for eta. The letter m is skipped
% (see the note between \rl and \rn).
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
% rm is already a command, just don't name any random variables m
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% Random vectors: \rv<letter> is the lower-case letter in \mathbf,
% one macro per letter a--z, plus epsilon and theta.
% NOTE(review): \mathbf usually leaves lower-case Greek unchanged, so
% \rvepsilon and \rvtheta may not come out bold -- confirm the rendering.
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% This entry used to be named \rvu by mistake; it was then overwritten by
% the real \rvu further down, which left \rvi undefined.
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% Elements of random vectors: \erv<letter> typesets the lower-case letter
% with \textnormal, one macro per letter a--z.
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% Random matrices: \rm<LETTER> is the upper-case letter in \mathbf,
% one macro per letter A--Z.
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% Elements of random matrices: \erm<LETTER> typesets the upper-case letter
% with \textnormal, one macro per letter A--Z.
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% Vectors: \v<letter> is the lower-case letter wrapped in \bm; also the
% all-zeros and all-ones vectors and bold mu / theta.
% Caution: \vb and \vh are redefined further down in this preamble
% (as \bar{v} and \widehat{\v}), so the definitions given here for those
% two names are not the ones in force in the document body.
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

% Elements of vectors: \ev<name> expands to the plain (unstyled) Greek or
% Latin letter inside a brace group.
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% Matrix: \m<LETTER> is the upper-case letter wrapped in \bm, one macro per
% letter A--Z, followed by bold beta, Phi, Lambda and Sigma.
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

% Tensor: \t<LETTER> is the upper-case letter set with \tens.
% \mathsfit is a math alphabet using the default sans-serif family
% (\sfdefault): medium series, slanted shape in the normal math version,
% and bold-extended series, upright shape in the bold math version.
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
% \tens{<X>}: the argument in \mathsfit, wrapped in \bm.
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% Graph: \g<LETTER> is the upper-case letter in \mathcal, one macro per
% letter A--Z.
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% Sets: \s<LETTER> is the upper-case letter in \mathbb. There is
% deliberately no \sE (see the note between \sD and \sF).
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% Entries of a matrix: \em<NAME> expands to the plain (unstyled) upper-case
% letter inside a brace group; Lambda and Sigma are included.
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% Entries of a tensor: \et<NAME> is the upper-case letter (or Lambda) set
% with \etens.
% Same font as tensor, without \bm wrapper
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% Named distributions. The labels in the subscripts are set upright with
% \mathrm{...}; this replaces the obsolete two-letter font switch \rm.

% The true underlying data generating distribution
\newcommand{\pdata}{p_{\mathrm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution

% Common symbols. \E and \R are defined again, with the same bodies, in the
% single-letter shortcut list later in this preamble.
\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.

% Operators whose subscripts are placed underneath in display style
% (starred form).
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
% Short alias for \allowbreak.
\let\ab\allowbreak

\usepackage{hyperref}
\usepackage{url}
\usepackage{times}

% if you need to pass options to natbib, use, e.g.:
% NOTE(review): natbib is already loaded near the top of this preamble, so
% these options presumably arrive too late to change the citation style.
% Confirm, and move this line above the first \usepackage{natbib} only if
% numeric, compressed citations are actually wanted.
\PassOptionsToPackage{numbers, compress}{natbib}
% (must come before the package is loaded; this comment was copied from
% another venue's template, which named its own style file here)
\usepackage{natbib}
% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{multirow}
\usepackage{booktabs} % for professional tables
%\input{math_symbol}
% ---------------------------------------------------------------------------
% Paper-specific shortcut macros.
%
% This list used to define many names two or three times, sometimes with
% different bodies. Only the last \def of a name is in force once the
% preamble has been read, so each name is now defined exactly once, with
% the body that was previously last. The names whose earlier, shadowed
% bodies differed from the one kept are:
%   \S  (was \mathbf{S}),    \C  (was \mathbf C),     \I  (was \mathbb{I}),
%   \P  (was \mathcal{P}),   \xt (was \widetilde{\x}), \ft (was \widetilde{\f}).
% \vb and \vh replace the bold-vector macros of the same name from the
% notation list earlier in the preamble; \E and \R repeat the earlier
% definitions unchanged.
%
% NOTE(review): \def is used on purpose because several of these names are
% existing LaTeX text commands (the accents \b \c \d \H \k \r \t \u \v, and
% \a, \S, \P). Text that relies on those commands -- for example accented
% author names in the bibliography -- will presumably no longer typeset as
% intended; verify against the .bib file.
% ---------------------------------------------------------------------------

% Bold lower-case letters
\def \a {\mathbf{a}}
\def \b {\mathbf{b}}
\def \c {\mathbf{c}}
\def \d {\mathbf{d}}
\def \e {\mathbf{e}}
\def \f {\mathbf{f}}
\def \g {\mathbf{g}}
\def \h {\mathbf{h}}
\def \k {\mathbf{k}}
\def \m {\mathbf{m}}
\def \n {\mathbf{n}}
\def \p {\mathbf{p}}
\def \q {\mathbf{q}}
\def \r {\mathbf{r}}
\def \s {\mathbf{s}}
\def \t {\mathbf{t}}
\def \u {\mathbf{u}}
\def \v {\mathbf{v}}
\def \w {\mathbf{w}}
\def \x {\mathbf{x}}
\def \y {\mathbf{y}}
\def \z {\mathbf{z}}
\def \ba {\boldsymbol{\alpha}}

% Bold upper-case letters
\def \B {\mathbf{B}}
\def \U {\mathbf U}

% Calligraphic letters
\def \A {\mathcal{A}}
\def \C {\mathcal{C}}
\def \D {\mathcal{D}}
\def \F {\mathcal{F}}
\def \G {\mathcal {G}}
\def \H {\mathcal{H}}
\def \I {\mathcal{I}}
\def \K {\mathcal{K}}
%\def \L {\mathcal{L}}
\def \M {\mathcal{M}}
\def \N {\mathcal{N}}
\def \Q {\mathcal{Q}}
\def \S {\mathcal{S}}
\def \T {\mathcal{T}}
\def \W {\mathcal{W}}
\def \X {\mathcal{X}}
\def \Y {\mathcal{Y}}
% Calligraphic letters under a name that differs from the letter itself
\def \V {\mathcal{L}}
\def \Pc {\mathcal{P}}
\def \Rc {\mathcal R}
\def \Rt {\mathcal{R}}
\def \Se {\mathcal{S}}

% Blackboard-bold letters
%\def \E {\mathrm{E}}
\def \E {\mathbb{E}}
\def \P {\mathbb{P}}
\def \R {\mathbb{R}}

% Hatted symbols (\widehat)
\def \ah {\widehat{\alpha}}
\def \Ah {\widehat A}
\def \bh {\widehat{b}}
\def \Bh {\widehat B}
\def \fh {\widehat{f}}
\def \Fh {\widehat{F}}
\def \gh {\widehat{\g}}
\def \Gh {\widehat{G}}
\def \hh {\widehat{\h}}
\def \Kh {\widehat K}
\def \nb {\widehat{\nabla}}
\def \ph {\widehat{p}}
\def \Rh {\widehat{R}}
\def \rhoh {\widehat{\rho}}
\def \Sh {\widehat{S}}
\def \Th {\widehat{T}}
\def \Uh {\widehat{U}}
\def \vh {\widehat{\v}}
\def \wh {\widehat{\w}}
\def \xh {\widehat{\x}}
\def \Xh {\widehat{X}}
\def \yh {\widehat y}

% Symbols with a tilde
\def \bt {\widetilde{\b}}
\def \bts {\widetilde{b}}
\def \ft{\widetilde{f}}
\def \fts {\widetilde{f}}
\def \Kt {\widetilde{K}}
\def \Ot {\tilde{\mathcal{O}}}
\def \Ut {\widetilde{U}}
\def \wt {\widetilde{\w}}
\def \xt {\widetilde{x}}
\def \zt {\widetilde{\z}}
\def \zts {\widetilde{z}}

% Barred symbols
\def \Ab {\bar{\A}}
\def \db {\bar{\d}}
\def \dbb {\bar{d}}
\def \Rb {\bar{\Rt}}
\def \vb {\bar{v}}
\def \wb {\bar{\w}}

% Upright words used as operators
\def \diag {\mbox{diag}}
\def \tr {\mbox{tr}}
\def \regret {\mbox{regret}}
\usepackage{wrapfig}
% Proposition environment "prop", headed "Proposition".
\newtheorem{prop}{Proposition}
\usepackage{hyperref}
% Attempt to make hyperref and algorithmic work together better:
 \newcommand{\theHalgorithm}{\arabic{algorithm}}
\usepackage{algorithm}
\usepackage{algorithmicx,algpseudocode}




\begin{document}
\title{AUC Maximization in Imbalanced Lifelong Learning}
\maketitle
%%\vspace*{-0.2in}

%%%%%%%%% ABSTRACT
\begin{abstract}
% %\vspace*{-0.2in}

%Existing continual learning methods mostly focus on improving classification accuracy for balanced datasets. However, imbalanced datasets, such as medical or fine-grained image datasets, are ubiquitous in machine learning. Even if we apply these continual learning methods to imbalanced datasets, it is still hard to evaluate model performance since classification accuracy is not a suitable metric for imbalanced datasets. Unfortunately, metrics such as recall, F1-score, and Area under the ROC Curve, which is more common for comparing model performance on imbalanced datasets, are ignored by the current continual learning methods. 

Imbalanced data, such as medical or fine-grained image datasets, is ubiquitous in machine learning. Existing continual learning methods employ various techniques, such as balanced sampling, to improve classification accuracy in this setting. However, classification accuracy is not a suitable metric for imbalanced data, and hence these methods may not obtain a good classifier as measured by other metrics (e.g., Area Under the ROC Curve). In this paper, we propose a solution to enable efficient imbalanced continual learning by designing an algorithm to effectively maximize one widely used metric in an imbalanced data setting: Area Under the ROC Curve (AUC). We find that simply replacing accuracy with AUC causes a \textit{gradient interference problem} due to the imbalanced data distribution. To address this issue, we propose a new algorithm, namely DIANA, which performs a novel synthesis of model \underline{D}ecoupl\underline{I}ng \underline{AN}d \underline{A}lignment. In particular, the algorithm updates two models simultaneously: one focuses on learning the current knowledge while the other concentrates on reviewing previously learned knowledge, and the two models gradually align during training. Experimental results show that DIANA achieves state-of-the-art performance on all the imbalanced datasets compared with several competitive baselines. Code is available at \url{https://github.com/MingruiLiu-ML-Lab/Lifelong-AUC}.
%We further consider standard balanced benchmarks used in lifelong learning to show the effectiveness of DIANA as a general lifelong learning method.


\end{abstract}
% Imbalanced data is ubiquitous in machine learning, such as medical or fine-grained image datasets. The existing continual learning methods employ balanced sampling techniques to improve classification accuracy in this setting. However, classification accuracy is not a suitable metric for imbalanced data, and hence these methods may not obtain a good classifier under other metrics (e.g., recall, F1-score, Area under the ROC Curve), which are more relevant in imbalanced data settings. In this paper, we propose a new approach to empower continual learning with imbalanced data: designing an algorithm to directly maximize one widely used metric in an imbalanced data setting: Area Under the ROC Curve (AUC). Current research on AUC optimization focuses on learning on a single task and may suffer from the issue of ``catastrophic forgetting". To address this issue, we propose a method to optimize AUC continuously such that the model can retain the performance on all the tasks after training, called Lifelong AUC (DIANA). DIANA is built upon memory-based lifelong learning. The imbalanced data stream poses severe challenges for commonly-used gradient surgery approaches (e.g., GEM, A-GEM). We first identify the \textit{gradient interference problem} in imbalanced lifelong learning. Then we address this issue by updating two models simultaneously: one focuses on learning the current knowledge while the other concentrates on reviewing previously-learned knowledge. The two models gradually align during training. We conduct extensive experiments on datasets across various imbalanced domains, ranging from natural images to medical and satellite images. The results show that the proposed DIANA achieves state-of-the-art performance on all the imbalanced datasets compared with several competitive baselines. We further consider standard balanced benchmarks used in lifelong learning to show the effectiveness of DIANA as a general lifelong learning method.

% \begin{figure}[t]
% \centering 
% \includegraphics[width=0.999\linewidth]{figures/overview_1.png} 
% %vspace{-.4cm}
% \caption{Overview of the Model Decoupling and Alignment framework. In the previous memory-based methods, data from the replay buffer and current task are fed into a single model. The very different distributions might produce conflict gradients that interfere with each other, which is common for imbalanced datasets.
% To solve it, our method decouples data into two models. Model A is designed to deal with data from the current task, while model B focuses on data from the replay buffer collected from past tasks. The parameters of the two models are gradually aligned during training.
% }
% \label{fig:overview} %vspace{-.5cm}
% \end{figure} 

% %vspace*{-0.1in}
%\vspace*{-0.15in}
\section{Introduction}
%vspace*{-0.1in}
Models can achieve superior performance on a single task \citep{goodfellow2016deep}. However, they often lack the capability to mimic the continual learning ability of the human brain. Specifically, model performance drops drastically on old tasks when the model is trained on a new task, a phenomenon referred to as ``catastrophic forgetting''~\citep{french1999catastrophic, kemker2018measuring, nguyen2019toward}.
 Current research on lifelong learning \citep{kirkpatrick2017overcoming,lopez2017gradient,chaudhry2018efficient} mostly focuses on balanced datasets, while ignoring the more challenging imbalanced classification problem. 
 % However, natural data are inherently imbalanced: a few classes contain significantly more samples than other classes.
 This impedes applications of lifelong learning to online advertisement \citep{hu2022continual}, satellite imagery \citep{tasar2019incremental}, or medical image classification~\citep{irvin2019chexpert}. Moreover, classification accuracy is not a suitable measure of model performance in these domains because of the data imbalance. One of the popular metrics for measuring the performance of classifiers on imbalanced tasks is the Area Under the ROC Curve (AUC)~\citep{hanley1982meaning,hanley1983method}.
%This is also true when the data arrive continuously which leads to the lifelong learning problem. The Area Under the Curve (AUC)~\cite{hanley1982meaning,hanley1983method} is a default metric to evaluate the performance of machine learning models in these application domains and is better suited for imbalanced classification~\cite{irvin2019chexpert}. 
For example, if the ratio of positive to negative examples is $99:1$, then a naive classifier that classifies every example as positive has $99\%$ accuracy, but it is definitely not a good classifier. In this case, AUC is much more informative and we should directly optimize AUC rather than accuracy.
However, current lifelong learning methods are limited to maximizing classification accuracy \citep{kirkpatrick2017overcoming,lopez2017gradient,chaudhry2018efficient}, which makes them unsuitable for imbalanced lifelong learning. Although one may tackle the imbalance problem by class-balanced sampling~\citep{chrysakis2020online, de2021continual, kim2020imbalanced} to form balanced training batches, these approaches still may not be able to directly optimize metrics such as AUC.

%In addition \textcolor{red}{directly maximizing suitable metrics is more direct and orthogonal to balanced sampling.}
%it may not be a good mechanism to maximize AUC. 


%it's not applicable for online lifelong learning, since it acquires imbalanced data stream at each iteration.
% Although one can avoid the imbalanced problem by class balanced sampling to form balanced training batches offline, it's not applicable for online lifelong learning, since it acquires imbalanced data stream at each iteration.



%The increasing size and complexity of data sets create challenges as well as opportunities for machine learning models. Typically, current models can achieve superior performance on a single task \cite{goodfellow2016deep}. However, they often lack the capability to mimic the continual learning ability of the human brain. Specifically, the model performance drops drastically on old tasks when being trained on a new task, also referred to as ``catastrophic forgetting" \cite{french1999catastrophic, kemker2018measuring, nguyen2019toward}. This is particularly undesirable for applications in online advertisement \cite{hu2022continual} or satellite imagery \cite{tasar2019incremental}. \emph{Data imbalance} in these applications is another factor that can limit the continual learning ability of the models \cite{chrysakis2020online}. 

%The Area Under the Curve (AUC)~\cite{hanley1982meaning,hanley1983method} is a common metric to evaluate the performance of machine learning models in these application domains. However, the current lifelong learning methods are limited to maximizing the classification accuracy \cite{kirkpatrick2017overcoming,lopez2017gradient,chaudhry2018efficient,guo2020improved}. Given the wide applicability of AUC, it is thus of natural interest to extend current methods for optimizing AUC continually.

Memory-based lifelong learning methods \citep{lopez2017gradient,chaudhry2018efficient,aljundi2019gradient} achieve competitive performance across commonly used lifelong learning benchmarks. In memory-based lifelong learning methods, a replay buffer is used to store a subset of examples from old tasks for rehearsal. The gradient computed on the replay buffer \citep{lopez2017gradient} is used as a reference to alter the direction of the gradient computed on the current task.

Proceeding from the memory-based lifelong learning methods for maximizing classification accuracy, one may think of applying the existing methods and replacing the metric of classification accuracy with AUC. One possible approach to directly maximize AUC for imbalanced lifelong learning is to employ the minimax reformulation of AUC as in the literature on online AUC maximization~\citep{ying2016stochastic,liu2020stochastic}. This minimax reformulation introduces a data-dependent decision threshold of the model to decouple the pairwise formulation of the AUC objective, which facilitates model updates in an online fashion. However, we find that maximizing AUC with memory-based lifelong learning introduces an issue called \emph{gradient interference}. In particular, when the data stream is imbalanced, the gradients computed on the current task can interfere severely with the gradients computed on the replay buffer.

%However, compared with cross-entropy loss which can be calculated based on one individual example, AUC depends on a pair of examples with different labels~\cite{hanley1982meaning}. In the lifelong learning setting, data are arriving in an online fashion and one cannot compute the gradient of AUC based on a single data point. This indicates that typical memory-based lifelong learning methods are not directly applicable for imbalanced lifelong learning with AUC maximization. 

%Another issue is that when the data is imbalanced, the gradients computed on the current task can interfere with the gradients computed on the replay buffer, severely affecting the training of the model.

In this paper, we propose a novel algorithm, called DIANA, to address the \textit{gradient interference problem} when replacing accuracy with AUC in the imbalanced lifelong learning setting. We first formulate the objective as a composite optimization problem as in~\citet{lopez2017gradient,chaudhry2018efficient,guo2020improved}. Similar to existing memory-based lifelong learning methods, we aim to maximize AUC on both the current task and the replay buffer to prevent catastrophic forgetting. Notably, DIANA is designed with two novel techniques to address the \textit{gradient interference problem}: model decoupling and alignment.
% , given in Figure~\ref{fig:overview}
In particular, DIANA decouples the learning of previous tasks and the current task into two models: one focuses on learning the current task while the other reviews previously learned knowledge. The two models gradually align during training due to an alignment penalty. Since each model computes its own gradients, we can reduce interference between the learning of the current task and the reviewing of old tasks. As we will show, the introduction of the additional model greatly alleviates the gradient interference problem when continually maximizing AUC on an imbalanced data stream, while still being computationally efficient.


% The model decoupling technique enables that each model computes its own gradients separately, and the alignment penalty gradually
%In this paper, we propose a novel algorithm, called DIANA, to address two issues of the current lifelong learning methods: {\bf 1)} they are designed specifically for maximizing classification accuracy. {\bf 2)} they suffer from the \textit{gradient interference problem} when replacing accuracy with AUC in the imbalanced setting. To address the first issue, we aim to directly optimize AUC in a continuous manner. In particular, we employ the minimax reformulation of AUC as in the literature of online AUC maximization~\cite{ying2016stochastic,liu2020stochastic} for imbalanced lifelong learning. This minimax reformulation introduces a data-dependent decision threshold of the model to decouple the pairwise formulation of the AUC objective which facilitates model update in an online fashion.   To address the second issue, we aim to maximize the AUC on both the current task and the replay buffer to prevent catastrophic forgetting. We first formulate the objective as a composite optimization problem as in~\cite{lopez2017gradient,chaudhry2018efficient,guo2020improved}. %DIANA decouples previous tasks and current task into two models: one focuses on learning the current task while the other reviews previously-learned knowledge. The two models gradually align during training with an equality constraint. As we will show, the introduction of the additional model greatly alleviates the gradient interference problem for maximizing AUC with an imbalanced data stream continually. 


% However, simply applying the reformulation in~\cite{ying2016stochastic,liu2020stochastic} is insufficient: the lack of mechanisms for alleviating catastrophic forgetting is detrimental to lifelong learning.

%and identify a critical \textit{gradient interference problem}: when the data is imbalanced, the gradients computed on the current task can interfere with the gradients computed on the replay buffer, severely affecting the training of the model. To mitigate the gradient interference on the current task and the replay buffer caused by imbalanced data,

%However, optimizing the composite objective in lifelong AUC maximization is difficult since the objective function is a summation of two functions of explicit min-max structure without closed form, and hence it is infeasible to calculate the gradient with respect to each of them. This is in stark contrast to memory-based lifelong learning methods for optimizing cross-entropy which can directly compute the gradients with respect to the composite objective function. In DIANA, our solution is to introduce auxiliary variables with equality constraints such that it is mathematically equivalent to the original formulation. With the additional auxiliary variables, we avoid computing the gradient of the original composite objective directly: instead, we compute gradients on the current task and replay buffer w.r.t. the auxiliary variables separately. 
 
Our contributions can be summarized as follows:
 
 \begin{itemize}
 %vspace*{-0.1in}
   %  \item We address the limitations of the existing algorithmic framework of memory-based lifelong learning by learning imbalanced data continuously with AUC  maximization, \textcolor{red}{which is orthogonal to class balanced sampling.}
     
     \item We advance imbalanced lifelong learning through an approach that is completely orthogonal to the traditional balanced sampling techniques, which enables the lifelong learning algorithm to directly maximize an important metric (AUC). We also identify the \textit{gradient interference problem} that arises in the imbalanced setting when existing memory-based lifelong learning methods are simply applied to maximize AUC. 
     
     \item We design a new algorithm for maximizing AUC in imbalanced lifelong learning, DIANA, which decouples conflicting gradients into two models with an alignment penalty. We show that DIANA can alleviate the \textit{gradient interference problem}. 
     
     \item We verify the efficacy of DIANA on imbalanced lifelong learning benchmarks across natural images, medical images, and satellite images. We show that DIANA outperforms several state-of-the-art lifelong learning algorithms by a large margin (e.g., 6.5\% AUC score on average over five benchmark datasets), including the approaches which purely use balanced sampling.
     % we show that the DIANA method coupled with the balanced sampling techniques can outperform these approaches by an average 6.6\% AUC score on five imbalanced datasets.
     
    
     \item We further expand the scope of our algorithm by considering maximizing AUC on balanced multi-class classification problems, which are standard benchmarks in lifelong learning literature. 
     %We show that DIANA improves  9.6\% AUC score on three standard benchmarks on average.
 \end{itemize}
%, to update the model to maximize the objective once a single sample. arrives

% 6.1, 5.6, 1.1, 11.2, 9.3
% Method

% Contribution

% Area Under the ROC Curve (AUC) optimization is very important for applications such as medical images, online advertisements, etc. However, there is no practical algorithm which is devoted to AUC maximization in the continual learning setting. The reason is that the current efforts on AUC optimization simply use standard online and stochastic optimization techniques and do not consider explicitly how to alleviate the catastrophic forgetting issue in continual learning. Our main contribution is to provide the first algorithm for maximizing AUC explicitly in the continual learning setting.
% %vspace*{-0.2in}
%\vspace*{-0.15in}

\section{Related Work}
% %vspace*{-0.1in}
\subsection{Lifelong Learning}
Lifelong learning is an important topic in machine learning \citep{thrun1995lifelong} and has been extensively studied in recent years \citep{parisi2019continual, mai2022online, borsos2020coresets, Ahrens_2021}. Current methods tackle lifelong learning from multiple perspectives. 
%The methods can be categorized based on different methodologies.

\noindent\textbf{Regularization-based approaches:} Regularization-based approaches aim at preserving important weights for old tasks. Representative works include EWC \citep{kirkpatrick2017overcoming} which adopted Fisher information matrix, PI \citep{zenke2017continual} which introduced \textit{intelligent synapses}, RWALK \citep{chaudhry2018riemannian} which utilized a KL-divergence based regularization for preserving knowledge of old tasks, and MAS \citep{aljundi2018memory} in which the importance measure for each parameter was computed based on how sensitive the predicted output function is to a change in this parameter. 

% \noindent\textbf{Transfer learning-based methods:}
% Transfer learning-based methods aim at leveraging \emph{knowledge} from old tasks for learning new tasks. Representative works include PROG-NN \cite{rusu2016progressive} which adds a new ``column'' with lateral connections to previous hidden layers for each new task and OWN \cite{zeng2019continual} which enables networks to continually learn different mapping rules in a context-dependent way. 

% In \cite{lee2019overcoming}, the authors proposed a method to leverage unlabeled data in the wild to avoid catastrophic forgetting using knowledge distillation.

\noindent\textbf{Memory-based approaches:} Episodic memory based lifelong learning methods~\citep{hayes2021replay, verwimp2021rehearsal, jin2021gradient} leverage a small episodic memory for storing examples from old tasks. In GEM \citep{lopez2017gradient}, A-GEM \citep{chaudhry2018efficient}, MEGA \citep{guo2020improved} and OGD \citep{farajtabar2019orthogonal}, the direction of the current gradient is modified to overcome forgetting in lifelong learning. In MER \citep{riemer2018learning}, meta-learning is employed as a subroutine for mitigating catastrophic forgetting. In iCARL \citep{rebuffi2017icarl}, \textit{class exemplars} are stored for each class and used for classification in class incremental lifelong learning. In experience replay (ER) based methods \citep{chaudhry2019tiny,aljundi2019gradient}, the model is trained continuously with batch gradient descent by sampling examples from the current task and the episodic memory. CTN~\citep{pham2020contextual} exploits memory to store task features. 


\noindent\textbf{Imbalanced lifelong learning:} Existing imbalanced lifelong learning methods mainly focus on maintaining a balanced memory. CBRS~\citep{chrysakis2020online} tackles imbalanced lifelong learning by using a Class-Balanced memory population strategy. Similar to CBRS, PRS~\citep{kim2020imbalanced}, CoPE~\citep{de2021continual} and Rainbow Memory~\citep{bang2021rainbow} introduce different balanced sampling strategies. \citet{zhou2022forgetting} discusses imbalanced lifelong learning in reinforcement learning.
ROSE~\citep{CanoK22} designs an online ensemble classifier to handle imbalanced data streams.
% %vspace*{-0.1in}
\subsection{AUC Optimization}
Online AUC maximization aims to design algorithms to overcome the difficulty of sampling pairwise data due to the definition of AUC. \citet{zhao2011online} addressed this problem by maintaining a buffer that stores representative examples, from which positive-negative pairs are constructed to calculate the gradient. \citet{gao2013one} maintained the mean and the covariance matrix for the streaming data and performed a gradient-based update. \citet{ying2016stochastic} introduced the saddle point reformulation of AUC maximization with the squared loss and developed an algorithm that can update the model upon receiving a single data point to maximize AUC. There are several further extensions for solving the saddle point formulation under different scenarios, including algorithms with fast rates under a function growth condition~\citep{liu2018fast}, deep learning~\citep{liu2020stochastic}, proximal gradient methods~\citep{lei2021stochastic}, and variance reduction~\citep{dan2021variance}. However, none of them is directly applicable in the continual learning setting, since they
do not take catastrophic forgetting into account.
%either have regret guarantees in non-i.i.d. data setting or optimization guarantees under i.i.d. data setting. 
%These guarantees cannot imply that the model can remember old knowledge while learning new knowledge. The model may suffer from catastrophic forgetting which is undesirable for real-world applications.
% %vspace*{-0.1in}
%\vspace*{-0.15in}

\section{Preliminaries}
% %vspace*{-0.1in}
\noindent\textbf{Lifelong Learning.} We closely follow the lifelong learning settings in \citet{lopez2017gradient,chaudhry2018efficient,guo2020improved}. Specifically, we consider \textit{task-incremental lifelong learning}, in which the tasks arrive sequentially. Suppose we have a total of $T$ tasks: $\{D_1, \dots, D_T\}$. For each task $D_i$, we have a set of training examples $\{\x_j, y_j\}_{j=1}^{K}$. In this paper, we consider imbalanced classification, i.e., each class can have a different number of samples. The given model $f_{\w}$
is trained continuously on the tasks over a \textit{single} pass of the samples. After training, the model is evaluated on the test datasets to assess its performance. The goal of task-incremental lifelong learning is to achieve high performance across all tasks. The crux of task-incremental lifelong learning is catastrophic forgetting: the model tends to forget previously acquired knowledge while being trained on a new task.

In memory-based lifelong learning methods \citep{lopez2017gradient,chaudhry2018efficient,guo2020improved}, a replay buffer is used to store a subset of examples from old tasks. The central idea of \citet{lopez2017gradient,chaudhry2018efficient,guo2020improved} is to utilize the replay buffer for computing gradients which serve as the reference for modifying the direction of the gradient computed on the current task. 

\noindent\textbf{Online AUC Optimization.} AUC is defined as the probability that the score of a positive sample is larger than that of a negative sample. Denote by $\x\in\R^d$ and $y\in\{+1, -1\}$ the feature and label respectively, and by $\z=(\x,y)$ the feature-label pair. We assume that $\z$ is sampled from an unknown distribution $\P$. Define $p=\text{Pr}(y=1)=
\E_{y}\left[\mathbb{I}_{[y=1]}\right]$ as the probability of a random sample being positive, where $\mathbb{I}_{[\cdot]}$ is the indicator function.
AUC for a general scoring function $h:\R^d\rightarrow\R$ is defined as
\begin{equation}\label{def:auc}
\text{AUC}(h)=\text{Pr}\left(h(\w;\x)\geq h(\w;\x')\middle\vert y=1,y'=-1\right),
\end{equation}
where $\w$ is the model parameter, $\z=(\x,y)$ and $\z'=(\x',y')$ are drawn independently from $\P$, and $h(\w;\x)$ is the scoring function parameterized by $\w$. Following~\citet{gao2013one,ying2016stochastic,liu2020stochastic}, we use the squared loss as a surrogate to replace the indicator function and end up with the following optimization problem:
% \begin{align}~\label{opt:obj}
% \min_{\w\in \R^d}P(\w):= \E_{\z,\z'}\left[(1-h( \w; \x)+h(\w; \x'))^2 \middle\vert y=1,y'=-1\right]
% \end{align}
%  Hence, the AUC maximization problem in the i.i.d. data setting can be written as the following:
\begin{equation}
\begin{aligned}~\label{opt:obj}
\min_{\w\in\R^d} \E_{\z,\z'}\left[(1-h( \w; \x)+h(\w; \x'))^2 \middle\vert y=1,y'=-1\right].
\end{aligned}
\end{equation}

The above formulation depends on pairwise data with both positive and negative labels, so it is hard to optimize in the online learning setting. It was shown in~\citet{ying2016stochastic} that the AUC maximization problem can be formulated as a minimax saddle point problem, and the stochastic gradient descent ascent algorithm can be employed to solve this saddle point problem. The saddle point reformulation is described in Proposition~\ref{thm:spp}. 

\begin{prop}~\citep{ying2016stochastic}~\label{thm:spp}
	The optimization problem~(\ref{opt:obj}) is equivalent to
	\begin{align}~\label{opt:spp}
	\min_{\w\in\R^d,(a,b)\in\R^2}\max_{\alpha\in\R}f\left(\w,a,b,\alpha\right):=\E_{\z}\left[F\left(\w,a,b,\alpha;\z\right)\right],
	\end{align}
	where $\z=(\x,y)\sim\P$, and 
	\begin{equation}
	\label{eq:reformulation}
	\begin{aligned}
&F(\w,a,b,\alpha;\z)=(1-p)\left(h(\w;\x)-a\right)^2\mathbb{I}_{[y=1]}\\
&+p(h(\w;\x)-b)^2\mathbb{I}_{[y=-1]}-p(1-p)\alpha^2\\
	&+2\left(1+\alpha\right)\left(p h(\w;\x)\mathbb{I}_{[y=-1]}-(1-p)h(\w;\x)\mathbb{I}_{[y=1]}\right).
	\end{aligned}
	\end{equation}
\end{prop} 

Intuitively speaking, Proposition~\ref{thm:spp} allows us to update the model parameter $\w$ and the decision threshold $\alpha$ simultaneously to effectively improve AUC each time a new individual sample is received. This mechanism avoids the requirement of updating the model with pairwise data, which is typically infeasible in online learning. It is worth mentioning that DIANA is also built upon the saddle point reformulation. The main difference between our work and previous works is that our work focuses on developing memory-based lifelong learning to maximize AUC in the imbalanced continual learning setting (to alleviate catastrophic forgetting).

\begin{figure*}[!t]
\begin{center}
	\includegraphics[width=0.8\linewidth]{figures/illustrate_2.png}
\end{center}
%vspace*{-0.2in}
\caption{
Illustration of DIANA.
The loss function is based on Equation~(\ref{formulation:final}). $\mathit{loss}_W$ aims to maximize the AUC of model A on the current task, while $\mathit{loss}_V$ aims to optimize model B on the replay buffer. The alignment of the two models is implemented by $\mathit{loss}_{WV}$.}
\label{fig:illusrate}
%vspace*{-0.2in}
\end{figure*}


% %vspace*{-0.1in}
%\vspace*{-0.15in}

\section{DIANA}
% %vspace*{-0.1in}
\label{sec:alogrithm}
In this section, we introduce a novel algorithmic framework for optimizing AUC in the lifelong learning setting with an imbalanced data stream. The proposed 
algorithmic framework is built upon the memory-based lifelong learning methods which leverage a replay buffer for rehearsal. The essence of the algorithmic framework is to maximize the AUC score both on the current task and replay buffer as a composite optimization problem. We circumvent the requirement of a pair of samples for computing AUC score based on the literature of online AUC optimization~\citep{ying2016stochastic,liu2018fast}. By decoupling conflicting gradients into two models and aligning the two models gradually, we effectively overcome the gradient interference problem caused by imbalanced data.


\subsection{Imbalanced Lifelong Learning by Maximizing AUC}
In lifelong learning, the model $f_{\w}$ is trained sequentially over $T$ tasks. On each task $t$, the samples arrive in a batch-wise fashion. Let $\w_k^t$ denote the model parameter on the $k$-th minibatch of the $t$-th task. Define $\z_t$ and $\hat{\z}_t$ as random variables following the distribution of the $t$-th task's data and the replay buffer's data at the $t$-th task respectively. To balance the current task and the replay buffer, similar to \citet{guo2020improved}, we define $\lambda_1(\cdot),\lambda_2(\cdot):\mathbb{R}^d\to \mathbb{R}_+$ as real-valued functions which depend on the state of the model. On the $k$-th minibatch of the $t$-th task, we aim to solve the following optimization problem:
\begin{equation}
\label{auc:mega}
\begin{aligned}
    &\max_{\w}\quad \lambda_1(\w_k^t)\cdot\text{AUC}_{t}(\w)+\lambda_2(\w_k^t)\cdot\text{AUC}_{\text{ref}}(\w):= \\
    &\lambda_1(\w_k^t)\cdot\E_{\z_+,\z_-}\left[\text{AUC}_{t}(\w;\z_{+},\z_{-})\right] \\
    &+\lambda_2(\w_k^t)\cdot\E_{\hat{\z}_+,\hat{\z}_-}\left[\text{AUC}_{\text{ref}}(\w;\hat{\z}_{+},\hat{\z}_{-})\right],
\end{aligned}
\end{equation}
where $\w$ is the model parameter, $\text{AUC}_t(\w)$ denotes the population AUC at the $t$-th task, $\text{AUC}_{\text{ref}}(\w)$ denotes the population AUC of the replay buffer, $\z_{+}$ ($\z_{-}$) and $\hat{\z}_{+}$ ($\hat{\z}_{-}$) denote random samples with positive (negative) labels on the current task and the replay buffer respectively, and $\lambda_1(\w_k^t)$ and $\lambda_2(\w_k^t)$ characterize the scaling factors of the two AUC values on the current task and the replay buffer respectively on the $k$-th minibatch at the $t$-th task. The choice of the scaling factors determines the degree of prioritizing the current task or the replay buffer. Inspired by~\citet{guo2020improved}, we choose $\lambda_1= \text{AUC}_{\text{ref}}(\w) / \text{AUC}_t(\w) $ and $\lambda_2=1$ based on the model performance. If the AUC score on the replay buffer is worse than that on the current task, then $\lambda_1<1=\lambda_2$ and hence our algorithm puts more weight on the replay buffer. In our experiments, we observe that the weight $\lambda_1$ varies from 0.1 to 10 most of the time. 

However, by the pairwise formulation of AUC, to solve the problem~(\ref{auc:mega}), one needs to sample a pair of positive and negative examples ($\z_+$ and $\z_{-}$, $\hat{\z}_+$ and $\hat{\z}_-$) at every iteration, which is not feasible for lifelong learning. A natural idea to address this issue is to employ the minimax reformulation (Proposition~\ref{thm:spp}) of AUC~\citep{ying2016stochastic,liu2020stochastic}, which ends up with the following problem:
% \begin{equation}
% \label{eq:1}
%     \min_{\w}\alpha_1(\w_k^t)\ell_t(\w)+\alpha_2(\w_k^t)\ell_{\text{ref}}(\w):=\alpha_1(\w_k^t)\E_{\xi}\ell_{t}(\w;\xi)+\alpha_2(\w_k^t)\E_{\zeta}\ell_{\text{ref}}(\w;\zeta),
% \end{equation}
% where $$\ell_t(\w;\xi)=\min_{(a,b)\in\R^2}\max_{\alpha\in\R}F(\w,a,b,\alpha;\xi),$$ and $$\ell_{\text{ref}}(\w;\zeta)=\min_{(a,b)\in\R^2}\max_{\alpha\in\R}F(\w,a,b,\alpha;\zeta)$$.
% We can rewrite (\ref{eq:1}) as the following optimization problem:
% \begin{equation}
%     \min_{\w_1,\w_2}\E_{\xi,\zeta}\left[\alpha_1(\w_k^t)\ell_{t}(\w_1;\xi)+\alpha_2(\w_k^t)\ell_{\text{ref}}(\w_2;\zeta)\right]
% \end{equation}
\begin{equation}
\label{eq:1}
\begin{aligned}
    &\min_{\w}[\lambda_1(\w_k^t)\min_{(a,b)\in\R^2}\max_{\alpha\in\R}\E_{\z_t}F(\w,a,b,\alpha;\z_t)\\
    &+ \lambda_2(\w_k^t)\min_{(a,b)\in\R^2}\max_{\alpha\in\R}\E_{\hat{\z}_t}F(\w,a,b,\alpha;\hat{\z}_t)],
    \end{aligned}
\end{equation}
where $F$ is defined in Equation~(\ref{eq:reformulation}). This is equivalent to the following formulation,
%vspace*{-0.1in}
\begin{equation}
\label{eq:22}
\begin{aligned}
    &\min_{\w,a_1,a_2,b_1,b_2}\max_{\alpha_1,\alpha_2}[\lambda_1(\w_k^t)\E_{\z_t}F(\w,a_1,b_1,\alpha_1;\z_t)\\ &+\lambda_2(\w_k^t)\E_{\hat{\z}_t}F(\w,a_2,b_2,\alpha_2;\hat{\z}_t)].
    \end{aligned}
\end{equation}
We can solve the problem (\ref{eq:22}) by stochastic gradient descent on variables $\w,a_1,a_2,b_1,b_2$ and stochastic gradient ascent on variables $\alpha_1,\alpha_2$. The stochastic gradient w.r.t. $\w$ is $\lambda_1(\w_k^t)\nabla_{\w} F(\w,a_1,b_1,\alpha_1;\z_t)+\lambda_2(\w_k^t)\nabla_{\w} F(\w,a_2,b_2,\alpha_2;\hat{\z}_t)$, which consists of the gradient on the current task and the gradient on the replay buffer; we refer to them as the current gradient and the reference gradient respectively. 

\subsection{Gradient Interference Problem}
\label{sec:grad_interf}
%The current lifelong learning methods only consider balanced datasets. However,
Imbalanced datasets are ubiquitous \citep{ramyachitra2014imbalanced} but are largely overlooked by current research efforts on lifelong learning. We find that the imbalanced nature of the data stream poses severe challenges for optimization. Specifically, the gradients on the current task and the gradients on the replay buffer may \emph{interfere} with each other during training due to a mismatch of the data distribution of the current task and the replay buffer. This resembles the observations made in GEM~\citep{lopez2017gradient} when the gradient is calculated based on the standard loss function: the angle between the gradient on the current task and the gradient on the replay buffer is used to evaluate whether the gradient update would harm previous tasks. When the angle is acute, the gradient update is unlikely to increase the loss on previous tasks, and vice versa. GEM proposes gradient projection to correct gradients that form obtuse angles and achieves a remarkable improvement in performance. Similarly, we use the angle between the gradients as the metric of gradient interference.

In the following, we empirically demonstrate this phenomenon on a real-world dataset. We consider two settings: imbalanced data distribution and balanced data distribution. We construct imbalanced data from the medical dataset ISIC2019 \citep{DBLP:journals/corr/GutmanCCHMMH16} which consists of 25331 images and 8 classes. These classes are divided into 4 disjoint tasks, and each task contains two classes,
representing positive and negative samples, respectively. We reduce the number of positive samples to be $5\%$ of the negative ones. The setting of balanced data is to keep the number of samples of different classes equal.

We train the models based on the formulation of Equation~(\ref{eq:22}) on balanced and imbalanced data respectively. We compute the current gradient and reference gradient in each mini-batch. The angle distribution is shown in Figure~\ref{fig:gradient_angle_isic}.
In the balanced case, almost all of the angles between the current gradient and the reference gradient are acute. In the imbalanced case, there appear to be more obtuse angles. This means that the gradients computed on the current task interfere with the gradients computed on the replay buffer, severely affecting the training of the model.  

 To investigate this further, we analyzed the gradient angles and present more results in Appendix Figures 5 and 6. They illustrate that the imbalanced case has a larger variance and show that after making the dataset imbalanced, the proportion of obtuse angles increased from 1.05\% to 3.16\% on Split-CUB200 and from 0.13\% to 9.68\% on Split-CIFAR100. Note that on Split-CIFAR100, the proportion of obtuse angles increases by around 75 times.


\begin{figure}[!t]
\begin{center}
	 	\subfigure[Balanced ISIC2019]{\includegraphics[ height=0.4\linewidth]{figures/angle_dis_isic_ba.pdf}}  
 		\subfigure[Imbalanced ISIC2019]{\includegraphics[ height=0.4\linewidth]{figures/angle_dis_isic_imba.pdf}}  
\end{center}
%vspace{-.2in}
\caption{ The distributions of the angles between the current gradient and reference gradient.   }
%\vspace*{-0.3in}
%vspace{-.2in}
\label{fig:gradient_angle_isic}
\end{figure}

% %vspace*{-0.1in}
\subsection{Model Decoupling and Alignment}
To address the gradient interference problem, we introduce our algorithm DIANA. 
The high-level idea is to use a relaxation of Equation~(\ref{eq:22}) such that we can still learn useful information even if the gradients at the current task and the replay buffer are conflicting. 
In particular, we first note the equivalent formulation of~(\ref{eq:22}):
\begin{equation}
\label{eq:23}
\begin{aligned}
    &\min_{\w,\v,a_1,a_2,b_1,b_2}\max_{\alpha_1,\alpha_2}[\lambda_1(\w_k^t)\E_{\z_t}F(\w,a_1,b_1,\alpha_1;\z_t) \\
    &+\lambda_2(\v_k^t)\E_{\hat{\z}_t}F(\v,a_2,b_2,\alpha_2;\hat{\z}_t)], \quad \text{s.t.} \quad \w=\v.
    \end{aligned}
\end{equation}

%-----------------------------------------------------
\begin{algorithm}[!t]
\small
\caption{Lifelong AUC Maximization with Model Decoupling and Alignment (DIANA)}
\label{alg:L_AUC}
\begin{algorithmic}[1]
\State $\text{Buffer} \gets \{\}$
\For{$t \gets 1$ to $T$}
    \For{$k \gets 1$ to $|D^{tr}_t|$}
        \If{$\text{Buffer}\neq \{\}$}
            \State $ \zeta_k^t \gets \textsc{sample}(\text{Buffer}) $
            \State Compute $\lambda_1(\w_k^t)$ and $\lambda_2(\v_k^t)$ 
        \Else
            \State Set $\lambda_1(\w_k^t) = 1$ and $\lambda_2(\v_k^t) = 0$.
        \EndIf
        \State Update $\w, \v,a_1,a_2,b_1,b_2$ by one step of stochastic gradient descent w.r.t. the objective function defined in~(\ref{formulation:final}).
        \State  Update $\alpha_1,\alpha_2$ by one step of stochastic gradient ascent w.r.t. the objective function defined in~(\ref{formulation:final}).
        \State $\text{Buffer} \gets \text{Buffer} \cup \{\xi_k^t\}$
        \State Remove samples if the $\text{Buffer}$ is full.
    \EndFor
\EndFor
\end{algorithmic}
\end{algorithm}
%vspace*{-0.1in}
%---------------------------------------------------
% \begin{equation}
% \label{formulation:1}
%     \begin{aligned}
%     %   &\min_{\w,\v,a_1,a_2,b_1,b_2}\max_{\alpha_1,\alpha_2}\left[\lambda_1(\w_k^t)\E_{\z_t}F(\w,a_1,b_1,\alpha_1;\z_t) \\ &+\lambda_2(\v_k^t)\E_{\hat{\z}_t}F(\v,a_2,b_2,\alpha_2;\hat{\z}_t)\right]+\beta\cdot \text{dist}(\w,\v),
%     loss_{W} = \lambda_1(\w_k^t)\E_{\z_t}F(\w,a_1,b_1,\alpha_1;\z_t)
%     \end{aligned}
% \end{equation}


Since using the same model for the data distributions of different tasks would lead to the gradient interference problem, we propose a model decoupling and alignment technique to address the issue. In particular, we propose to solve the following problem~(\ref{formulation:final}), which relaxes the equality constraint in~(\ref{eq:23}),
\begin{equation}
\label{formulation:final}
    \begin{aligned}
       &\min_{\w,\v,a_1,a_2,b_1,b_2}\max_{\alpha_1,\alpha_2}[\lambda_1(\w_k^t)\E_{\z_t}F(\w,a_1,b_1,\alpha_1;\z_t) \\ &+\lambda_2(\v_k^t)\E_{\hat{\z}_t}F(\v,a_2,b_2,\alpha_2;\hat{\z}_t)]+\beta\cdot \text{dist}(\w,\v),
    \end{aligned}
\end{equation}
where $\text{dist}(\cdot,\cdot)$ denotes a distance function between two models, and $\beta>0$ is a penalty parameter. Typical choices of $\text{dist}$ include the squared loss, the distillation loss~\citep{hinton2015distilling}, etc. The term $\beta\cdot\text{dist}(\w,\v)$ is referred to as the alignment penalty. 

In view of Equation~(\ref{formulation:final}), we know that we are decoupling one model as in~(\ref{eq:23}) into two models $\w$ and $\v$ under the coordination of an alignment penalty, which has the following two benefits. First, the formulation is a standard minimax optimization, where stochastic gradient descent ascent suffices to solve it efficiently. Second, it can partially alleviate the gradient interference problem since conflicting gradients are decoupled. For example, in DIANA, the gradient w.r.t. $\w$ is $\lambda_1(\w_k^t)\nabla_{\w} F(\w,a_1,b_1,\alpha_1;\z_t)+\beta\nabla_{\w}\text{dist}(\w,\v)$. 
%Even in the extreme case such that $\nabla_{\w} F(\w,a_1,b_1,\alpha_1;\z_t)$ is of opposite direction of $\nabla_{\v} F (\v,a_2,b_2,\alpha_2;\hat{\z}_t)$, the gradient interference does not exist so the model can still learn new information. 
%This particular structure of gradient indicates that 
The gradient w.r.t. $\w$ consists of two terms: the first term represents the current gradient, which depends on the current data, while the second term is the gradient of the distance function between the two models and is independent of the data. Essentially, we reduce the impact of some noisy gradients on the model update. By pulling the current model close to the model on the replay buffer \emph{in each step}, the current model can essentially \emph{learn} from the model on the replay buffer to retain performance on old tasks. 

For implementation, since we cannot solve~(\ref{formulation:final}) exactly, we use one step of stochastic gradient descent ascent as an approximate solution. This is consistent with the literature of memory-based lifelong learning~\citep{chaudhry2018efficient,guo2020improved}. Please refer to Algorithm~\ref{alg:L_AUC} and Figure~\ref{fig:illusrate} for details.

%The detailed algorithm can be found in Algorithm~\ref{alg:L_AUC} 



%  The problem~(\ref{eq:1}) is difficult to solve directly using a gradient-based method, since the objective function is a summation of two functions of explicit min-max structure without closed form, and hence it is difficult to calculate the gradient of the composite function with respect to model parameter $\w$. To address this issue, we first rewrite~(\ref{eq:1}) to the following problem:
% % \begin{equation}
% %     \begin{aligned}
% %       &\min_{\w,\v} \left[\lambda_1(\w_k^t)\min_{(a_1,b_1)\in\R^2}\max_{\alpha_1\in\R}\E_{\z_t}F(\w,a_1,b_1,\alpha_1;\z_t)\right.\\&\left.+\lambda_2(\w_k^t)\min_{(a_2,b_2)\in\R^2}\max_{\alpha_2\in\R}\E_{\hat{\z}_t}F(\v,a_2,b_2,\alpha_2;\hat{\z}_t)\right]\\
% %       &\text{s.t.,} \quad\quad \w=\v, a_1=a_2, b_1=b_2, \alpha_1=\alpha_2.
% %     \end{aligned}
% % \end{equation}
% \begin{equation}
% \label{eq:1}
%     \begin{aligned}
%       &\min_{\w,\v} \left[\lambda_1(\w_k^t)\min_{(a,b)\in\R^2}\max_{\alpha\in\R}\E_{\z_t}F(\w,a,b,\alpha;\z_t)\right.\\&\left.+\lambda_2(\w_k^t)\min_{(a,b)\in\R^2}\max_{\alpha\in\R}\E_{\hat{\z}_t}F(\v,a,b,\alpha;\hat{\z}_t)\right]\\
%       &\text{s.t.,} \quad\quad \w=\v%, a_1=a_2, b_1=b_2, \alpha_1=\alpha_2.
%     \end{aligned}
% \end{equation}
% Solving (\ref{eq:1})  Then we can merge two minimax into one:
% \begin{equation}
% \label{eq:2}
%     \begin{aligned}
%       &\min_{\w,\v,a_1,a_2,b_1,b_2}\max_{\alpha_1,\alpha_2\in\R}\left[ \lambda_1(\w_k^t)\E_{\z_t}F(\w,a_1,b_1,\alpha_1;\z_t)\right.\\&\left.
%       \quad\quad+\lambda_2(\w_k^t)\E_{\hat{\z}_t}F(\v,a_2,b_2,\alpha_2;\hat{\z}_t)\right],\\
%       &\text{s.t.} \quad\quad \w=\v,% a_1=a_2, b_1=b_2, \alpha_1=\alpha_2.
%     \end{aligned}
% \end{equation}
% According to Lagrangian duality theory, there exists $\beta>0$ such that it is equivalent to
% \begin{equation}
% \label{formulation:final}
%     \begin{aligned}
%       &\min_{\w,\v,a_1,a_2,b_1,b_2}\max_{\alpha_1,\alpha_2\in\R}\left[ \lambda_1(\w_k^t)\E_{\z_t}F(\w,a_1,b_1,\alpha_1;\z_t)\right.\\&\left.
%       \quad\quad+\lambda_2(\w_k^t)\E_{\z_t}F(\v,a_2,b_2,\alpha_2;\hat{\z}_t)\right.\\&\left.
%       +\beta \cdot\text{dist}((\w,a_1,b_1,\alpha_1),(\v,a_2,b_2,\alpha_2))\right],\\
%       %&\text{s.t.} \quad\quad \w=\v, a_1=a_2, b_1=b_2, \alpha_1=\alpha_2.
%     \end{aligned}
% \end{equation}
% where $\text{dist}(\cdot)$ denotes a distance function between two vectors. The benefit of formulation~(\ref{formulation:final}) is that the formulation is a standard minimax optimization, where stochastic gradient descent ascent suffices to solve it in polynomial time.

% \begin{equation}
% \label{eq:1}
% \begin{aligned}
%   &\min_{\w,\v}\left[\alpha_1(\w_k^t)\min_{(a,b)\in\R^2}\max_{\alpha\in\R}\E_{\xi_t}F(\w,a,b,\alpha;\xi_t)+\alpha_2(\w_k^t)\min_{(a,b)\in\R^2}\max_{\alpha\in\R}\E_{\zeta_t}F(\v,a,b,\alpha;\zeta_t)\right],\\
%   & \quad\quad \text{s.t.} \quad\quad \w=\v
%   \end{aligned}
% \end{equation}
%It is equivalent to
% \begin{equation}
% \begin{aligned}
%     \min_{\w,\v,a,b}\max_{\alpha\in\R,\lambda\in\R}\left[\alpha_1(\w_k^t)\E_{\xi_t}F(\w,a,b,\alpha;\xi_t)+\alpha_2(\w_k^t)\E_{\zeta_t}F(\v,a,b,\alpha;\zeta_t)+\frac{\lambda}{2}\|\w-\v\|_2^2\right]
%     \end{aligned}
% \end{equation}
% We cannot solve this problem exactly, but we can approximately solve it by stochastic gradient descent ascent, i.e.,
% \begin{equation}
%     \begin{aligned}
%     \begin{cases}
% &    \w_{k+1}^t\leftarrow \w_k^t-\eta\left[\alpha_1(\w_k^t)\nabla F_{\w}(\w_k^t,a_k^t,b_k^t,\alpha_k^t;\xi_k^t)+\lambda(\w_k^t-\v_k^t)\right]\\
% &    \v_{k+1}^t\leftarrow \v_k^t-\eta\left[\alpha_2(\w_k^t)\nabla F_{\v}(\v_k^t,a_k^t,b_k^t,\alpha_k^t;\zeta_k^t)+\lambda(\v_k^t-\w_k^t)\right]\\
% &a_{k+1}^t\leftarrow a_k^t-\eta\left[\alpha_1(\w_k^t)\nabla F_{a}(\w_k^t,a_k^t,b_k^t,\alpha_k^t;\xi_k^t)+\alpha_2(\w_k^t)\nabla F_a(\v_k^t,a_k^t,b_k^t,\alpha_k^t;\zeta_k^t)\right]\\
% &b_{k+1}^t\leftarrow a_k^t-\eta\left[\alpha_1(\w_k^t)\nabla F_{b}(\w_k^t,a_k^t,b_k^t,\alpha_k^t;\xi_k^t)+\alpha_2(\w_k^t)\nabla F_b(\v_k^t,a_k^t,b_k^t,\alpha_k^t;\zeta_k^t)\right]\\
% &     \alpha_{k+1}^t\leftarrow \alpha_k^t+\eta\left[\alpha_1(\w_k^t)\nabla_{\alpha} F(\v_k^t,a_k^t,b_k^t,\alpha_k^t;\xi_k^t)+\alpha_2(\w_k^t)\nabla_{\alpha} F(\v_k^t,a_k^t,b_k^t,\alpha_k^t,\zeta_k^t)\right]\\
% & \lambda_{k+1}^t\leftarrow \lambda_k^t+\frac{\eta\|\w_k^t-\v_k^t\|_2^2}{2}\\
%     \end{cases}
%     \end{aligned}
% \end{equation}

% where $$\ell_t(\w;\xi)=\min_{(a,b)\in\R^2}\max_{\alpha\in\R}F(\w,a,b,\alpha;\xi),$$ and $$\ell_{\text{ref}}(\w;\zeta)=\min_{(a,b)\in\R^2}\max_{\alpha\in\R}F(\w,a,b,\alpha;\zeta)$$.
% We can rewrite (\ref{eq:1}) as the following optimization problem:
% \begin{equation}
%     \min_{\w_1,\w_2}\E_{\xi,\zeta}\left[\alpha_1(\w_k^t)\ell_{t}(\w_1;\xi)+\alpha_2(\w_k^t)\ell_{\text{ref}}(\w_2;\zeta)\right]
% \end{equation}


% %vspace*{-0.15in}
%\vspace*{-0.15in}

\section{Experiments}

\begin{figure*}[!t]
\begin{center}
	\subfigure[Split-CIFAR]{\includegraphics[width=0.45\linewidth,height=0.13\linewidth]{figures/evoplot_cifar.pdf}} 
	\subfigure[Split-CUB]{\includegraphics[width=0.45\linewidth,height=0.13\linewidth]{figures/evoplot_cub.pdf}}   
	\subfigure[Split-AWA2]{\includegraphics[width=0.45\linewidth,height=0.13\linewidth]{figures/evoplot_awa.pdf}}    
	\subfigure[ISIC2019]{\includegraphics[width=0.45\linewidth,height=0.13\linewidth]{figures/evoplot_isic.pdf}}     
	%\subfigure[EuroSat]{\includegraphics[width=0.32\linewidth]{figures/evoplot_eurosat.pdf}}
	\includegraphics[width=0.8\linewidth]{figures/legend.pdf}
\end{center}
%vspace*{-0.4in}
\caption{ Evolution of average AUC during the lifelong learning process. For the results on EuroSat, please see Appendix. }
\label{fig:evolution}
%vspace*{-0.2in}
\end{figure*}


% %vspace*{-0.1in}
\subsection{Experimental Setups}
\label{sec:exp-setups}

\noindent \textbf{Datasets.} We perform experiments on popular lifelong learning benchmarks, Split-CIFAR \citep{zenke2017continual}, Split-CUB and Split-AWA2 \citep{chaudhry2018efficient}. Moreover, to further explore the application of lifelong AUC Maximization in industry, a medical dataset ISIC2019 \citep{DBLP:journals/corr/GutmanCCHMMH16} and a satellite dataset EuroSat \citep{helber2019eurosat} are also introduced as new benchmarks. The medical dataset ISIC2019 consists of 25331 medical images and 8 different diagnostic categories. 
%The goal for ISIC2019 is to classify dermoscopic images among 8 different diagnostic categories. : Melanoma, Melanocytic nevus, Basal cell carcinoma, Actinic keratosis, Benign keratosis Dermatofibroma, Vascular lesion, Squamous cell carcinoma, and none of them.
The satellite dataset EuroSat covers 13 spectral bands and consists of 10 classes with a total of 27,000 labeled and geo-referenced images.
% including cities, rivers, and forests.
We believe it is important to measure performance in real industrial scenarios, especially for the medical scenario, which is naturally imbalanced and prefers AUC as the criterion rather than accuracy. A false positive causes severe consequences, such as the misdiagnosis of cancers. Split-CIFAR and Split-CUB consist of 20 tasks, while Split-AWA2 has 25 tasks, ISIC2019 has 4 tasks and EuroSat has 5 tasks. For ISIC2019, every two classes constitute a task (4 tasks in total since ISIC2019 has 8 classes). Similarly, for EuroSat, every two classes constitute a task (5 tasks in total since EuroSat has 10 classes).

\noindent \textbf{Make Imbalanced.} Following \citet{yuan2021robust}, we make the training set imbalanced with a pre-defined imbalanced ratio (\textbf{imratio}) and leave the validation set and test set unchanged. According to the chosen imbalanced ratio, positive samples are randomly discarded until the ratio of positive samples to all samples equals the imbalanced ratio. 
%Except for multi-class balanced classification and the imbalanced ratio ablation in Appendix, 
We set imratio=0.05 in all experiments, which means that only 5\% of the data are positive. 
In addition, an ablation study of different imbalanced ratios is provided in the Appendix, which conducts experiments with imbalanced ratios from 0.01 to 0.1.
If a task has multiple classes, half of the classes are regarded as negative and the rest are regarded as positive. For example, in Split-CIFAR, the negative class is defined as classes $\{0+i\cdot n,\dots,3+i\cdot n\}_{i=0}^{T-1}$ in the original CIFAR100 dataset, where $T$ is the total number of tasks and $n$ is the number of classes in each task of Split-CIFAR. The rest of the classes are all defined as positive. To clarify, Split-CIFAR is separated into 20 disjoint tasks, each containing 5 classes; classes 0--3 in each task are regarded as negative, and class 4 is positive.

\noindent \textbf{Metrics.} Since our purpose is to maximize the AUC score, AUC ($AUC_T$) is used as the primary metric. $AUC_T$ represents the averaged AUC value when finishing training the $T$-th task, where $T$ is the total number of tasks. To be consistent with previous works \citep{kamp2018efficient}, Accuracy ($ACC_T$) and Forgetting ($FGT_T$) are also reported. $ACC_T$ is the average accuracy tested on all tasks after finishing training the $T$-th task. $FGT_T$ measures the drop of AUC on past tasks after training on the $T$-th task. It is defined as $FGT_T = \frac{1}{T-1} \sum_{j=1}^{T-1} {(\max_{l \in \{1,\cdots,T-1\}} AUC_{l, j} - AUC_{T, j})}$, where $AUC_{l, j}$ is the AUC score tested on the $j$-th task after training on the $l$-th task.
%\textcolor{red}{add a formula for definitions}


\noindent \textbf{Implementation Details.} For Split-CIFAR, ISIC2019 and EuroSat, we use a reduced ResNet18~\citep{chaudhry2018efficient} to handle small input resolution. For Split-CUB and Split-AWA2, we use a standard ResNet18 model pre-trained on ImageNet. We set the batch size to 64 for Split-CIFAR, Split-CUB, and Split-AWA2, and to 128 for ISIC2019 and EuroSat. The learning rate is 0.1 across different datasets and methods. The memory size for each task is fixed to 64 for Split-CIFAR, Split-CUB, and Split-AWA2, and to 128 for ISIC2019 and EuroSat.


\begin{table*}[t]
	\centering
	\caption{Comparison of ONE-MODEL and TWO-MODEL (DIANA) }
		 	%vspace*{-0.1in}
	\scalebox{0.85}{
	\begin{tabular}{llllll}
		\toprule
		\multicolumn{1}{c}{\multirow{2}[5]{*}{\textbf{Method}}} & \textbf{Split-CIFAR} & \textbf{Split-CUB} & \textbf{Split-AWA2} & \textbf{ISIC2019} & \textbf{EuroSat} \\
		& \multicolumn{1}{c}{\small{\textit{AUC} (↑)}} & \multicolumn{1}{c}{\small{\textit{AUC} (↑)}} & \multicolumn{1}{c}{\small{\textit{AUC} (↑)}} & \multicolumn{1}{c}{\small{\textit{AUC} (↑)}} & \multicolumn{1}{c}{\small{\textit{AUC} (↑)}}
		\\
		\midrule
		ONE-MODEL & 58.7 $\pm$ 1.7 & \textbf{73.5 $\pm$ 2.3} & 79.1 $\pm$ 2.7 &71.3 $\pm$ 9.2 & 72.4 $\pm$ 6.5 \\
		TWO-MODEL & \textbf{68.4 $\pm$ 2.0} & 70.4 $\pm$ 0.7 & \textbf{98.2 $\pm$ 1.1}&  \textbf{73.6 $\pm$ 4.8} & \textbf{86.7 $\pm$ 1.5} \\
		\bottomrule
	\end{tabular}}%
	\label{tab:one_vs_two}%
%	%vspace*{-0.2in}
\end{table*}%

\begin{table*}[t]

	\centering
	\caption{Comparison of capacity on Split-CIFAR100 }
%	 	%vspace*{-0.1in}
	\scalebox{0.85}{
	\begin{tabular}{llllll}
		\toprule
		\textbf{Method} & \textbf{architecture} & \textbf{AUC(↑) \%} & \textbf{memory} & \textbf{GFLOPS} & \textbf{params}
		\\
		\midrule
		% ONE-MODEL &ResNet18 & 0.587 & 1280 & 0.06 & \textbf{1.12M} \\
		ONE-MODEL &ResNet34 & 56.2  & 1280 & 0.11 & \textbf{2.50M} \\
		TWO-MODEL &ResNet18(x2) &\textbf{68.4} & 1280 & \textbf{0.12}&  2.24M \\
		\bottomrule
	\end{tabular}}%
	\label{tab:twice-capcity}%
 	%vspace*{-0.2in}

\end{table*}%


\noindent \textbf{Baselines.} We consider EWC \citep{kirkpatrick2017overcoming}, MAS \citep{aljundi2018memory}, GEM \citep{lopez2017gradient}, A-GEM \citep{chaudhry2018efficient}, MEGA \citep{guo2020improved}, DER \citep{buzzega2020dark} and GDumb \citep{prabhu2020greedy} as baselines. EWC and MAS are regularization-based approaches. GEM, A-GEM, MEGA, DER, and GDumb are built upon episodic memory. We also consider a simple baseline that uses stochastic gradient descent to train these tasks sequentially without any memory or regularization. It is marked as SINGLE in our experiments. All baselines are task-incremental and only require one pass, so class-incremental~\citep{shim2021online, mai2021supervised} and multiple-pass methods~\citep{ebrahimi2020adversarial, mallya2018packnet} are not considered.
To clarify, SINGLE, EWC, and MAS are only trained on current task. GEM, A-GEM, MEGA, DER are trained on the combination of current task and replay buffer. GDumb is only trained on replay buffer.

\noindent \textbf{Sampling.} For a fair comparison, we use the same memory size and regular reservoir sampling~\citep{chaudhry2019tiny} for all the memory-based methods, including GEM, A-GEM, DER, MEGA, and DIANA. Since the benchmark datasets are imbalanced, to be fair, GDumb is also implemented with reservoir sampling \citep{chaudhry2019tiny} instead of class-balanced sampling. Data from the current task and replay buffer are both imbalanced.
The sampling size is the same as the batch size used on the current task, similar to existing continual learning methods~\citep{chaudhry2018efficient}, namely 64 on Split-CIFAR100, Split-CUB200, and Split-AWA2. To clarify, when running on Split-CIFAR100, the replay buffer feeds 64 samples and the current data stream feeds 64 samples. 
We also compare our method with Rainbow-memory (RM)~\citep{bang2021rainbow} and CBRS~\citep{chrysakis2020online}, which use class-balanced sampling.
Thus, the train batches from the replay buffer are collected in a balanced form, while data from the current task stay imbalanced.
The results are presented in Appendix A. We show that our method and class-balanced sampling can be combined to further improve performance.

% MARK2022
%Here we give some brief introductions to reservoir sampling. We use $memory\_size$ to denote the capacity of memory and $n$ to denote the number of samples observed so far. In reservoir sampling, a new arriving sample is selected with probability $\frac{memory\_size}{n}$.


%vspace*{-0.15in}
\subsection{Results}


% %vspace*{-0.1in}
%\noindent \textbf{Results.} 
\label{sec:main-results}
In Figure \ref{fig:evolution}, we present the evolution of average AUC during the lifelong learning process. We can observe that DIANA dominates the baselines in most cases. An interesting observation is that DIANA begins to gain advantages over baselines as the training proceeds. One plausible reason is that it is harder to learn useful features for maximizing the AUC objective in the initial training process. However, as shown in Figure \ref{fig:evolution}, by optimizing the right objective as in the proposed DIANA, we can achieve a high average AUC score in the long run. This observation indicates that DIANA has the potential to handle large amounts of tasks.


We report results of $AUC_T$, $ACC_T$, and $FGT_T$ for all algorithms on the imbalanced benchmarks. Details can be found in Appendix: Figure 2, Table 3 and Table 4. %Figure \ref{fig:result}, Table \ref{tab:result_of_std_data} and Table \ref{tab:result_of_prac_data}. 
In terms of AUC score, our DIANA outperforms other baselines by a large margin. In particular, compared with the best baseline, DIANA improves by $2.9\%$ on Split-CIFAR, $9.0\%$ on Split-CUB, $13.0\%$ on Split-AWA2, $1.4\%$ on ISIC2019, and $6.2\%$ on EuroSat. This shows that it is more effective to directly optimize AUC as in the proposed DIANA. It can also be observed that DIANA achieves the highest Accuracy $ACC_T$ on all the datasets except ISIC2019.
This further shows the discriminativeness of the learned features. 

%In addition, our algorithm is more stable than baselines. In particular, the standard deviation of DIANA is $0.020$, while the standard deviations for A-GEM, GDumb, and MEGA are $0.046$, $0.027$, and $0.030$ respectively. 

%it also reveals a dramatic deviation on imbalanced scenarios. Our algorithms get smaller deviations in multiple runs, AUC score changes  $\pm$ 0.020 on Split CIFAR, while A-GEM, GDumb, and MEGA are $ $\pm$ 0.046$, $ $\pm$ 0.027$, $ $\pm$ 0.030$ respectively.


One interesting observation is that GDumb generally achieves a higher AUC than other baselines on the imbalanced benchmarks. However, it is worth noting that GDumb is trained \emph{offline} on the replay buffer with \emph{multiple} passes over the data. GDumb avoids the gradient interference problem by only leveraging the gradients on the replay buffer. 
% However, in real-world lifelong learning applications, the model needs to quickly adapt to the data with a \emph{single} pass. 
This impedes the practical applicability of GDumb. 
% In contrast, DIANA is trained \emph{online} with a \emph{single} over the samples which can be applied in real-time applications. 


\subsection{Ablation}


\begin{table*}[!t]
  \centering
  \caption{The multi-class results of average AUC, average ACC, and average Forgetting (FGT) of different methods on Split CIFAR100, Split CUB200, and Split AWA2.}
  	%vspace*{-0.1in}
    \setlength\tabcolsep{2.5pt} 
   \scalebox{0.78}{
    \begin{tabular}{l|ccc|ccc|ccc}
	\toprule
	\multicolumn{1}{c|}{\multirow{2}[2]{*}{\textbf{Method}}} & \multicolumn{3}{c|}{\textbf{Split-CIFAR}} & \multicolumn{3}{c}{\textbf{Split-CUB}} & \multicolumn{3}{|c}{\textbf{Split-AWA2}} \\
	& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%} & \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}
	& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}\\
	\midrule
    SINGLE & 75.8 $\pm$ 2.0 & 42.8 $\pm$ 3.4 & 12.7 $\pm$ 2.1 & 91.9 $\pm$ 2.4 & 48.1 $\pm$ 6.3 & 3.2 $\pm$ 2.3 & 51.0 $\pm$ 2.1 & 58.0 $\pm$ 0.7 & 28.4 $\pm$ 2.9 \\
    EWC   & 76.4 $\pm$ 1.8 & 43.9 $\pm$ 2.4 & 11.9 $\pm$ 1.4 & 91.5 $\pm$ 2.6 & 47.2 $\pm$ 6.3 & 3.8 $\pm$ 2.5 & 51.7 $\pm$ 1.3 & 57.9 $\pm$ 1.6 & 28.7 $\pm$ 3.8 \\
    MAS   & 78.2 $\pm$ 1.7 & 44.5 $\pm$ 3.3 & 8.1 $\pm$ 1.9 & 92.7 $\pm$ 2.1 & 49.9 $\pm$ 6.2 & 2.6 $\pm$ 2.1 & 50.1 $\pm$ 0.1 & 53.6 $\pm$ 6.2 & 28.5 $\pm$ 3.9 \\
    A-GEM  & 81.6 $\pm$ 0.9 & 54.5 $\pm$ 1.7 & 7.2 $\pm$ 0.9 & 92.8 $\pm$ 0.8 & 54.4 $\pm$ 7.5 & 1.1 $\pm$ 1.1 & 51.1 $\pm$ 3.2 & 59.6 $\pm$ 0.8 & 27.3 $\pm$ 1.6 \\
    GDumb & 75.9 $\pm$ 0.9 & 49.4 $\pm$ 1.4 & 2.6 $\pm$ 0.4 & 92.8 $\pm$ 1.8 & \textbf{65.5 $\pm$ 2.3} & 0.3 $\pm$ 0.2 & 54.4 $\pm$ 3.4 & 60.8 $\pm$ 2.7 & 14.1 $\pm$ 6.1 \\
    DER   & 80.3 $\pm$ 1.2 & 40.7 $\pm$ 2.5 & 8.9 $\pm$ 1.2 & 92.6 $\pm$ 2.8 & 59.5 $\pm$ 9.0 & 1.5 $\pm$ 0.7 & 71.3 $\pm$ 16.0 & 63.1 $\pm$ 3.4 & 15.3 $\pm$ 9.2 \\
    MEGA  & 62.2 $\pm$ 2.7 & 32.3 $\pm$ 2.4 & 10.8 $\pm$ 2.2 & 89.6 $\pm$ 4.0 & 50.6 $\pm$ 7.3 & 4.3 $\pm$ 2.5 & 51.1 $\pm$ 1.0 & 59.6 $\pm$ 5.6 & 12.8 $\pm$ 2.9 \\
%	RM  & 0.816 $\pm$ 0.011 & 0.505 $\pm$ 0.014	&0.065 $\pm$ 0.009  & 0.593 $\pm$ 0.049	& 0.123 $\pm$ 0.017	& 0.022 $\pm$ 0.012	& 0.562 $\pm$ 0.029	& 0.510 $\pm$ 0.029	& 0.128 $\pm$ 0.021\\
	\midrule
    DIANA & \textbf{89.5 $\pm$ 0.3} & \textbf{65.5 $\pm$ 0.9} & \textbf{1.2 $\pm$ 1.2} & \textbf{93.4 $\pm$ 0.6} & 64.3 $\pm$ 1.8 & \textbf{0.2 $\pm$ 0.1} & \textbf{92.6 $\pm$ 1.4} & \textbf{82.5 $\pm$ 2.3} & \textbf{0.3 $\pm$ 0.2} \\
    %DIANA-RM &0.669 $\pm$ 0.006	&0.275 $\pm$ 0004	&0.137 $\pm$ 009	&0.525 $\pm$ 0.011	&0.11 $\pm$ 0.008	&0.026 $\pm$ 0.005	&0.727 $\pm$ 0.005	&0.632 $\pm$ 0.026	0.020 $\pm$ 0.011
    \bottomrule
    \end{tabular}}%
  \label{tab:multi-class}%
%	%vspace*{-0.2in}
\end{table*}%

%\subsubsection{One model vs. two models.} 
\noindent \textbf{One model vs. two models.} 
To verify the gradient interference problem discussed in Section \ref{sec:grad_interf}, we conduct ablation on ONE-MODEL and TWO-MODEL. ONE-MODEL denotes the method of solving Equation~(\ref{eq:22}) 
%by one-step of stochastic gradient descent ascent,
%whose gradient is calculated based on both current task data and replay buffer with a single model. 
which is a naive combination of AUC maximization and memory-based lifelong learning. It would suffer from the gradient interference problem due to conflicting gradients computed on the current task and the replay buffer. TWO-MODEL denotes the method of solving Equation~(\ref{formulation:final}) 
%by one-step stochastic gradient descent ascent 
(a.k.a., DIANA). We denote the model $\w$ and $\v$ in Equation~(\ref{formulation:final}) as the current model and reference model respectively. The gradient w.r.t. current (reference) model is the summation over two parts: the gradient calculated based on current (reference) task data, and the gradient of the distance function between two models.

As shown in Table~\ref{tab:one_vs_two}, TWO-MODEL outperforms ONE-MODEL in all benchmarks except for Split-CUB. In particular, TWO-MODEL is better than ONE-MODEL by $+9.7\%$, $+19.1\%$, $+2.3\%$, and $+14.3\%$ on Split-CIFAR, Split-AWA2, ISIC2019, and EuroSat respectively. 


We analyze why TWO-MODEL is better than ONE-MODEL on Split-CIFAR but a bit worse on Split-CUB. We probe this problem by observing the angle between the current gradient and reference gradient in ONE-MODEL. Concretely, we follow GEM and A-GEM by first storing the gradients computed on the current mini-batch and the episodic memory, then calculating the angle between them. We repeat this in each iteration and show the distribution of the angles on two standard datasets (balanced and imbalanced) with histograms in Figure 6 %~\ref{fig:dist-of-angle} 
in the Appendix. In particular, we experimentally find that $9.68\%$ of the angles are obtuse in imbalanced Split-CIFAR, while only $3.16\%$ are obtuse in imbalanced Split-CUB. Fewer obtuse angles indicate less gradient interference, so ONE-MODEL is better on Split-CUB. One possible reason for the Split-CUB dataset to have fewer obtuse angles is that the images in the Split-CUB dataset are fine-grained and different tasks have high similarity, so the gradients between tasks are more likely to be similar.



We have the following conjecture: when the data on different tasks has very different distributions, TWO-MODEL is preferred over ONE-MODEL and vice versa.

Furthermore, to eliminate the concern of unfair comparison, we compare the capacity in terms of episodic memory size, computation (GFLOPS, i.e., one billion ($10^9$) floating-point operations per second), and the parameters. We enlarged the one-model approach's model architecture to match the two-model capacity. Table \ref{tab:twice-capcity} presents the comparison under same capacity. With nearly the same capacity, TWO-MODEL still outperforms ONE-MODEL. In conclusion, it's the decoupled and aligned mechanism itself that benefits imbalanced lifelong learning, not the extra model parameters. %On the contrary, a larger number of parameters may have difficulties when training on small datasets with limited iterations. 


% \subsubsection{Multi-class Balanced Classification.} 
\noindent \textbf{Multi-class Balanced Classification.}  To further verify the effectiveness of our algorithm on general lifelong learning, we conduct experiments on standard multi-class classification benchmarks without making datasets imbalanced. 
Similar to most of the existing AUC maximization literature, our algorithm focuses on binary classification. In order to apply DIANA for general lifelong learning problems, we extend our method to accommodate multi-class AUC maximization following prior works~\citep{liu2020stochastic,yang2021learning}. If there are $c$ classes, we have $c$ output scores from the network, one score for each class. If a sample belongs to the $i$-th class, the corresponding score at the $i$-th position is treated as positive and the remaining scores as negative, and a binary AUC loss is calculated based on these scores. By iterating over the $c$ classes, the multi-class AUC loss is accumulated. 
%The process is similar to the way that adapting binary cross-entropy loss to multi-class classification. 
To obtain AUC metric in multi-class form, we calculate AUC score for each class pair (i.e., one versus all) and then perform the average.


Table \ref{tab:multi-class} presents results on standard Split-CIFAR, Split-CUB200 and Split-AWA2. 
Because most of the baselines have reported the accuracy and tuned hyper-parameters on Split-CIFAR, we just follow their settings. 
%A uniform memory size at 1280 is selected since the majority of the baseline methods choose a close memory size\cite{lopez2017gradient, guo2020improved, chaudhry2018efficient}. 
As to Split-CUB200 and Split-AWA2, the setups follow Section~\ref{sec:exp-setups}. DIANA outperforms other baselines in terms of AUC, Accuracy, and forgetting. Particularly, on Split-CUB200, DIANA obtains a slightly lower accuracy than GDumb but the highest on the AUC metric.
% mutli-class auc metrics
The results show that our method is not only suitable for imbalanced scenarios but also works well under general lifelong learning settings such as multi-class classification on balanced datasets.

%\subsubsection{Time and space complexity}
\noindent \textbf{Time and space complexity.} 
% We explore the time and space complexity of DIANA algorithm.
All the experiments are performed on a single GTX-1080Ti GPU. The running time is reported in Table~\ref{tab:time}. Our TWO-MODEL-based approach slightly increases the computational cost but significantly improves performance. As for space complexity, DIANA uses the same memory size as replay methods like A-GEM.
Model decoupling with an alignment penalty can be computationally more expensive than training one model because two models must be maintained. However, this approach can often yield better results, and can be faster, than using a single model with a complex function. According to Table~\ref{tab:time}, although our approach costs 34.5s, it is still faster than EWC, MAS, and GEM, which cost 78.6s, 77.3s, and 109s respectively.
% Table generated by Excel2LaTeX from sheet 'Sheet1'

\begin{table}[ht]
  \centering
  \caption{Time and space complexity on Split-CIFAR}
%    %vspace*{-0.1in}
    \begin{tabular}{l|cc}
    \toprule
    \textbf{Method} & \textbf{Training time (s)} &\textbf{memory}\\
    \midrule
    Single & 8.8 &0 \\
    EWC   & 78.6 &0\\
    MAS   & 77.3 &0\\
    GEM   & 109 &1280 \\ 
    A-GEM  & 17.1 &1280 \\
    GDumb & 8.8 &1280\\
    DER   & 20.0 &1280 \\
    MEGA  & 15.7 &1280\\
    DIANA & 34.5 &1280\\
    \bottomrule
    \end{tabular}%
  \label{tab:time}%
%  	%vspace*{-0.1in}
\end{table}%

\noindent \textbf{Hyperparameters}. $\lambda$ and $\beta$ are two hyperparameters used in the alignment penalty. $\lambda$ is adaptively changed according to our algorithm, so practitioners do not need to tune $\lambda$. We now provide some results for varying $\beta$, because $\beta$ is a tuned hyperparameter that controls the tradeoff between maximizing AUC and aligning the two models; the results are presented in Table~\ref{tab:hypter-paramters}. We find that the best tradeoff is choosing $\beta$ to be $0.1$ in various benchmarks, so we suggest practitioners use $\beta=0.1$ for their own tasks. 

\begin{table}[H]
  \centering
  \caption{Varying hyperparameter $\beta$}
    \begin{tabular}{l|ccc}
    \toprule
    \textbf{$\beta$} & \textbf{CIFAR100} &\textbf{CUB200} &\textbf{AWA2}\\
    \midrule
    0.1 & 68.4 & 70.4 &98.2 \\
    0 & 66.09 &72.03 &95.8 \\
    0.01 & 65.8 & 69.6 &93.7 \\
    1.0 & 63.6 & 51.11 & 54.1 \\
    \bottomrule
    \end{tabular}%
  \label{tab:hypter-paramters}%
\end{table}%

\noindent \textbf{Combine AUC maximization}. To further validate the advantage of AUC maximization in the imbalanced setting, we combine AUC maximization with previous methods, like EWC and DER. Table~\ref{tab:existing-auc} shows that when replacing the cross-entropy loss with AUC maximization, both EWC and DER have significant improvement. However, our proposed method achieves superior performance compared to these existing approaches, owing to the incorporation of an additional model decoupling mechanism. Our algorithm DIANA is slightly worse than DER-AUC on split-CIFAR100, but much better than all other baselines on datasets split-CUB200 and split-AWA2.

\begin{table}[H]
  \centering
  \caption{Combining AUC maximization with existing methods}
    \begin{tabular}{l|ccc}
    \toprule
    \textbf{method} & \textbf{CIFAR100} &\textbf{CUB200} &\textbf{AWA2}\\
    \midrule
    EWC &64.4 &51.5 &56.2 \\
    EWC-auc &62.4&69.7&97.1 \\
    DER &62.6 &65.7 &80.4 \\
    DER-auc &70.1&62.1&91.5\\
    DIANA &68.4&70.4&98.2\\
    \bottomrule
    \end{tabular}%
  \label{tab:existing-auc}%
\end{table}%




%\subsubsection{More ablations in Appendix.} 
%\subsection{More ablations}
\noindent \textbf{More ablations in Appendix.} 
Due to limited space, more ablations are studied in Appendix, including results with balanced sampling in Appendix A, imbalanced ratio in Appendix C, balanced vs. imbalanced in Appendix D.

% \textcolor{red}{I do not understand this paragraph: Besides, We find most of the time GDumb ranks second, only inferior to DIANA. GDumb is trained on tiny episodic memory or called reference data. As a result, the interference between the current gradient and reference gradient does not exist, leading to a throughout descent. However, tiny episodic memory data can not guarantee a promising industrial application. Instead, it impedes the growth of training data and leads to over-fitting. 
% In contrast, DIANA benefits from both current task data and memory data, without suffering severe interference directly. As discussed in Section \ref{sec:alogrithm}, current and reference gradients are disentangled to two models, represented by $F(\w,a_1,b_1,\alpha_1;\z_t)$ and $F(\v,a_2,b_2,\alpha_2;\hat{\z}_t)$. The model in charge of the reference data stream is similar to GDumb, free from gradient interference. The other model looks like the naive training strategy SINGLE. Instead of direct supervision, the alignment of the two models provides a soft label of current task data, which alleviates the interference. To clarify, it's not like knowledge distillation, a teacher model supervises a student model by soft labels, and there's no priority between the two models. Both two models are gradually aligned with each other.}

%\subsection{Ablation}
%%vspace{-.2in}

%\vspace*{-0.15in}
\section{Conclusion}
% %vspace*{-0.1in}
We study AUC optimization under imbalanced continual learning settings.
We propose a novel algorithm DIANA based on the minimax reformulation of the AUC objective. 
We systematically study the gradient interference problem on imbalanced data. 
% both theoretically and empirically. 
We demonstrate that this problem can be alleviated by employing two models with model decoupling and alignment.
% : one for the current task and the other for past tasks, and they gradually align during the training.
We extend our algorithm to multi-class AUC maximization in general balanced lifelong learning.
Compared to existing approaches, the proposed algorithm achieves a higher AUC score as well as less forgetting. One limitation is that our approach uses two models and slightly increases the computational cost.

% \section*{Acknowledgements}
% We would like to thank the anonymous reviewers for their helpful comments. Jie Hao and Mingrui Liu are supported by a grant at George Mason University. The work of Xiangyu Zhu was done when virtually visiting Mingrui Liu's group at Department of Computer Science at George Mason University.
%We would like to thank the anonymous reviewers for their helpful comments. 
\noindent\textbf{Acknowledgments.} 
J. Hao and M. Liu are supported by a grant at George Mason University (GMU). The work of X. Zhu was done while virtually visiting M. Liu's group at the Department of Computer Science at GMU.
%In future works, we would like to further investigate the tasks where AUC score is important. Besides, we are also interested in tasks with minor similarities, which might lead to more gradient interference.


% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% % APPENDIX
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \newpage
% \appendix

% \section{DIANA Outperforms Other Methods with Class Balanced Sampling}
% \label{sec: other_methods}

% \begin{table*}[h]
% 	\centering
% 	\caption{The results of memory-based methods with Class Balanced Reservoir Sampling on Split CIFAR100, Split CUB200, and Split AWA2. }
%     \setlength\tabcolsep{2.5pt} 
% 	\scalebox{0.75}{
%     	\begin{tabular}{l|ccc|ccc|ccc}
% 		\toprule
% 		\multicolumn{1}{c|}{\multirow{2}[2]{*}{\textbf{Method}}} & \multicolumn{3}{c|}{\textbf{Split-CIFAR}} & \multicolumn{3}{c}{\textbf{Split-CUB}} & \multicolumn{3}{|c}{\textbf{Split-AWA2}} \\
% 		& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%} & \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}
% 		& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}\\
%     \midrule
%     A-GEM  & 58.3 $\pm$ 3.5 & 60.3 $\pm$ 0.3 & 19.5 $\pm$ 3.6 & 50.8 $\pm$ 4.0 & 50.2 $\pm$ 0.3 & 17.6 $\pm$ 2.5 & 55.4 $\pm$ 1.8 & 49.9 $\pm$ 4.0 & 30.6 $\pm$ 2.6 \\
%     Gdumb & 70.6 $\pm$ 1.4 & 63.1 $\pm$ 0.9 & 6.3 $\pm$ 0.5 & 66.8 $\pm$ 5.5 & \textbf{55.3 $\pm$ 5.2} & 4.3 $\pm$ 2.7 & 98.4 $\pm$ 0.8 & 92.5 $\pm$ 2.6 & 0.2 $\pm$ 0.6 \\
%     DER   & 64.9 $\pm$ 3.2 & 58.3 $\pm$ 1.4 & 13.4 $\pm$ 3.3 & 65.0 $\pm$ 5.8 & 51.7 $\pm$ 2.0 & 10.3 $\pm$ 3.2 & 86.1 $\pm$ 14.1 & 71.6 $\pm$ 8.0 & 10.6 $\pm$ 11.4 \\
%     MEGA  & 66.1 $\pm$ 5.7 & 59.4 $\pm$ 2.6 & 3.2 $\pm$ 1.5 & 50.8 $\pm$ 1.1 & 50.0 $\pm$ 0.4 & 13.0 $\pm$ 2.1 & 57.1 $\pm$ 6.3 & 50.5 $\pm$ 5.9 & 17.3 $\pm$ 1.4 \\
% 	RM & \textbf{77.4 $\pm$ 1.9} & \textbf{69.8 $\pm$ 1.7} & 4.3 $\pm$ 1.9 & 58.8 $\pm$ 2.8 & 54.4 $\pm$ 1.1 & 11.3 $\pm$ 1.2 & 80.5 $\pm$ 6.0 & 71.1 $\pm$ 8.8 & 5.6 $\pm$ 2.0 \\
%     \midrule
%     DIANA & 76.7 $\pm$ 1.0 & 69.3 $\pm$ 1.7 & \textbf{3.1 $\pm$ 0.7} &  \textbf{73.4 $\pm$ 2.1} & 51.0 $\pm$ 0.5 &  \textbf{0.2 $\pm$ 0.3} & \textbf{99.5 $\pm$ 0.2} &  \textbf{87.4 $\pm$ 2.9} & \textbf{0.1 $\pm$ 0.1} \\
% 	%DIANA-RM & 0.766 $\pm$ 0.005 & 0.685 $\pm$ 0.011 & 0.034 $\pm$ 0.008 & \textbf{0.763 $\pm$ 0.031} & 0.545 $\pm$ 0.011 & \textbf{0.001 $\pm$ 0.008} & 0.994 $\pm$ 0.002 & \textbf{0.941 $\pm$ 0.002} & 0.039 $\pm$ 0.006 \\
%     \bottomrule
%     \end{tabular}}%
%   \label{tab:cbrs_stand}%
% \end{table*}%

% \begin{table*}[h]
%   \centering
%   \caption{The results of memory-based methods with Class Balanced Reservoir Sampling on medical images ISIC2019 and satellite images EuroSat.}
% 	\scalebox{0.75}{
%     \begin{tabular}{l|ccc|ccc}
%     \toprule
%     \multicolumn{1}{c|}{\multirow{2}[2]{*}{\textbf{Method}}} & \multicolumn{3}{c|}{\textbf{ISIC2019}} & \multicolumn{3}{c}{\textbf{EuroSat}} \\
%           	& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%} & \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}\\
%     \midrule
%     A-GEM  & 66.5 $\pm$ 10.3 & \textbf{90.6 $\pm$ 0.1} & 12.8 $\pm$ 11.9 & 79.4 $\pm$ 4.4 & 55.6 $\pm$ 4.0 & 14.4 $\pm$ 5.0 \\
%     Gdumb & 57.2 $\pm$ 9.0 & 71.1 $\pm$ 15.9 & 22.9 $\pm$ 9.1 & 81.6 $\pm$ 8.4 & 73.4 $\pm$ 8.9 & 2.4 $\pm$ 6.6 \\
%     DER   & 61.8 $\pm$ 13.7 & 84.5 $\pm$ 12.6 & 17.7 $\pm$ 17.1 & 81.2 $\pm$ 4.5 & 65.3 $\pm$ 6.2 & 7.4 $\pm$ 4.2 \\
%     MEGA  & 54.9 $\pm$ 9.6 & 62.3 $\pm$ 15.9 & 18.8 $\pm$ 10.7 & 77.8 $\pm$ 7.3 & 69.7 $\pm$ 10.3 & 2.0 $\pm$ 3.6 \\
%     RM & 72.8 $\pm$ 5.0 & 71.5 $\pm$ 3.7 & 10.3 $\pm$ 2.5 & 86.3 $\pm$ 2.6 & 79.3 $\pm$ 1.5 & 0.7 $\pm$ 4.2 \\
%         \midrule
%     DIANA &  \textbf{77.7 $\pm$ 4.2} & 78.2 $\pm$ 4.7 &  \textbf{1.9 $\pm$ 3.2} & \textbf{90.9 $\pm$ 1.7} & \textbf{83.8 $\pm$ 2.7} & \textbf{-1.0 $\pm$ 1.8} \\
%     %DIANA-RM & \textbf{0.800 $\pm$ 0.005} & 0.773 $\pm$ 0.037 & \textbf{0.002 $\pm$ 0.025} & 0.908 $\pm$ 0.018 & 0.825 $\pm$ 0.030 & 0.014 $\pm$ 0.013 \\ 
%     \bottomrule
%     \end{tabular}}%
%   \label{tab:cbrs_real}%
% \end{table*}%
% For imbalanced lifelong learning, class balanced sampling is an efficient way to alleviate the harm of imbalanced data stream in some cases. Class-Balancing Reservoir Sampling (CBRS) \cite{chrysakis2020online} is an enhanced Reservoir Sampling strategy for memory-based methods under imbalanced lifelong learning settings. It maintains a class-balanced memory by replacing instances of the major class. Thus, in experience replay, training batches from episodic memory are collected in a balanced form. However, since memory-based methods require data from both the current task and episodic memory, they still need to handle an imbalanced data stream from the current task even though the memory is balanced. Moreover, the distributions of balanced and imbalanced data streams differ significantly, which may lead to server gradient interference between these two streams. 
% %Thus, it's worth investigating the imbalanced problem with class-balanced sampling.

% We examine memory-based approaches with CBRS instead of vanilla Reservoir Sampling; the learning rate and batch size are consistent with Section~\ref{sec:exp-setups}. We implement CBRS according to the pseudo-code described in~\cite{chrysakis2020online}. When the memory is not full, all samples are stored in memory. After the memory is full, each class is categorized as either a full class or a not-full class. Once a class has been the largest class in memory, it is marked as a full class and is ignored when the memory is populated. Conversely, instances of a full class are replaced by instances of not-full classes.

% Tables~\ref{tab:cbrs_stand} and~\ref{tab:cbrs_real} present the results with CBRS. The evolution of average AUC during the lifelong learning process is shown in Figure~\ref{fig:evolution_cbrs}. GEM is not included because it is not compatible with CBRS: it uses a Ring Buffer instead of Reservoir Sampling, and each task has its own memory space.
% % For example, DER cannot access task identifiers in its memory, so it is not able to identify the largest class.   
% Compared with vanilla Reservoir Sampling in Tables~\ref{tab:result_of_std_data} and~\ref{tab:result_of_prac_data}, using CBRS significantly improves the performance of most algorithms, except A-GEM. On Split-CIFAR, A-GEM, GDumb, DER, MEGA, and DIANA improve by 1.5\%, 5.1\%, 2.5\%, 3.0\%, and 8.3\% in terms of AUC score, respectively.
% We also compare DIANA with another class-balanced method, Rainbow Memory (RM), which uses a different class-balanced sampling strategy. Our proposed DIANA outperforms RM on all datasets except Split-CIFAR.
% %On Split-CUB, they increase -0.8\%, 5.4\%, 0.7\%, and 3.0\% respectively. 

% We give a detailed analysis of each method below.

% \begin{itemize}
%     \item A-GEM obtains no benefits from CBRS. Since the current data stream is still imbalanced, our analysis suggests that it is not suitable for A-GEM to rectify the current gradient based on a reference gradient that is computed on a balanced replay buffer.
    
%     \item GDumb has stable and significant increments on all 5 datasets, because it's only trained on balanced memory data, and doesn't suffer from the gradient interference problem. 
    
%     \item DER increases on Split-CIFAR, Split-AWA2, and EuroSat, while dropping performance on ISIC2019 and Split-CUB.
    
%     \item MEGA gets boosted on Split-CIFAR and Split-CUB, but drops on the other three datasets. 
    
%     \item DIANA has much better AUC than other methods, as shown in Table~\ref{tab:cbrs_stand} and Table~\ref{tab:cbrs_real}. There are two interesting observations. First, on the ISIC2019 dataset, DIANA has worse accuracy than A-GEM ($78.2\%$ versus $90.6\%$) but a better AUC value ($77.7\%$ versus $66.5\%$). The reason is that the ISIC2019 dataset is very imbalanced (see Section~\ref{sec:exp-setups}, Datasets). Second, on the EuroSat dataset, DIANA has a negative forgetting measure, which means that DIANA with CBRS can improve performance on previous tasks while learning new tasks.
% \end{itemize}



% \textbf{In summary, DIANA achieves higher AUC than other memory-based approaches when applying Class-Balancing Reservoir Sampling.} DIANA has consistent improvements on all 5 datasets. We have a conjecture that the two-model framework alleviates the gradient interference between imbalanced data stream and balanced replay buffer, according to Section \ref{sec:grad_interf}.

% % A-GEM does not even benefit from CBRS, because when the gradients from episodic memory and current task conflict more severely. Gdumb has a because it only trained on episodic memory data and would not be affected by imbalanced data from the current task.

% % Class balance sampling


% \begin{figure*}[!t]
% \begin{center}
% 	\subfigure[Split-CIFAR]{\includegraphics[width=0.45\linewidth]{figures/evoplot_cbrs_cifar.pdf}} 
% 	\subfigure[Split-CUB]{\includegraphics[width=0.45\linewidth]{figures/evoplot_cbrs_cub.pdf}}   
% 	\subfigure[Split-AWA2]{\includegraphics[width=0.45\linewidth]{figures/evoplot_cbrs_awa.pdf}}    
% 	\subfigure[ISIC2019]{\includegraphics[width=0.45\linewidth]{figures/evoplot_cbrs_isic.pdf}}     
% 	\subfigure[EuroSat]{\includegraphics[width=0.45\linewidth]{figures/evoplot_cbrs_eurosat.pdf}}
% 	\includegraphics[width=0.6\linewidth]{figures/cbrs_legend.pdf}
% \end{center}
% %vspace*{-0.1in}
% \caption{ Evolution of average AUC of memory-based methods with Class Balanced Reservoir Sampling. }
% \label{fig:evolution_cbrs}
% %vspace{-.1in}
% \end{figure*}

% \section{Result Detail}


% % You can have as much text here as you want. The main body must be at most $8$ pages long.
% % For the final version, one more page can be added.
% % If you want, you can use an appendix like this one, even using the one-column format.
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% We show the detailed results of all the methods on imbalanced benchmarks and practical data in Figure~\ref{fig:result}, Table~\ref{tab:result_of_std_data} and Table~\ref{tab:result_of_prac_data}.

% \begin{figure*}[t]
% \begin{center}
% 	\includegraphics[width=1.0\linewidth]{figures/result.pdf}    \\
% 	\includegraphics[width=0.9\linewidth]{figures/legend.pdf}
% \end{center}
%  \caption{Performance of DIANA and other baselines on Split-CIFAR, Split-CUB, Split-AWA2, ISIC2019, and EuroSat. Each result is averaged over 5 runs with different random seeds. The standard deviation (std) is represented by the black line.}
% \label{fig:result}
% %vspace{-.1in}
% \end{figure*}


% \begin{table*}%[htbp]
% 	\centering
% 	\caption{The results of average AUC, average ACC, and average Forgetting (FGT) of different
% methods on Split CIFAR100, Split CUB200, and Split AWA2. }
%     \setlength\tabcolsep{2.5pt} 
% 	\scalebox{0.75}{
% 	\begin{tabular}{l|ccc|ccc|ccc}
% 		\toprule
% 		\multicolumn{1}{c|}{\multirow{2}[2]{*}{\textbf{Method}}} & \multicolumn{3}{c|}{\textbf{Split-CIFAR}} & \multicolumn{3}{c}{\textbf{Split-CUB}} & \multicolumn{3}{|c}{\textbf{Split-AWA2}} \\
% 		& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%} & \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}
% 		& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}\\
% 		\midrule
% 		SINGLE & 58.9 $\pm$ 3.0 & 60.1 $\pm$ 0.2 & 15.9 $\pm$ 3.3 & 54.2 $\pm$ 2.5 & 50.1 $\pm$ 0.5 & 14.9 $\pm$ 2.7 & 55.0 $\pm$ 5.3 & 47.6 $\pm$ 0.5 & 33.8 $\pm$ 6.4 \\
% 		EWC   & 64.4 $\pm$ 2.7 & 60.3 $\pm$ 0.6 & 10.0 $\pm$ 2.2 & 51.5 $\pm$ 0.9 & 50.1 $\pm$ 0.4 & 16.6 $\pm$ 0.6 & 56.2 $\pm$ 5.9 & 47.0 $\pm$ 1.3 & 32.2 $\pm$ 5.6 \\
% 		MAS   & 57.4 $\pm$ 0.9 & 60.0 $\pm$ 0.3 & 17.1 $\pm$ 1.2 & 51.7 $\pm$ 1.0 & 50.1 $\pm$ 0.4 & 15.5 $\pm$ 1.9 & 57.4 $\pm$ 5.2 & 47.4 $\pm$ 1.3 & 30.6 $\pm$ 5.4 \\
% 		GEM   & 65.2 $\pm$ 1.6 & 61.5 $\pm$ 0.1 & 11.9 $\pm$ 1.1 & 55.4 $\pm$ 1.0 & 50.4 $\pm$ 0.6 & 15.5 $\pm$ 0.7 & 69.6 $\pm$ 5.5 & 54.3 $\pm$ 4.4 & 21.7 $\pm$ 5.4 \\
% 		A-GEM  & 56.8 $\pm$ 4.6 & 60.2 $\pm$ 0.3 & 19.9 $\pm$ 5.0 & 51.6 $\pm$ 2.0 & 50.1 $\pm$ 0.5 & 17.5 $\pm$ 1.1 & 56.4 $\pm$ 3.2 & 47.7 $\pm$ 0.4 & 31.5 $\pm$ 6.2 \\
% 		GDumb & 65.5 $\pm$ 2.7 & 61.4 $\pm$ 2.0 & 6.0 $\pm$ 3.4 & 61.4 $\pm$ 1.1 & 50.5 $\pm$ 0.8 & 6.0 $\pm$ 0.8 & 85.2 $\pm$ 4.3 & 72.2 $\pm$ 1.5 & \textbf{1.9 $\pm$ 0.9} \\
% 		DER  & 62.6 $\pm$ 3.2 & 59.6 $\pm$ 1.2 & 16.9 $\pm$ 3.1 & 65.7 $\pm$ 4.4 &  \textbf{53.5 $\pm$ 2.1} & 
% 		11.2 $\pm$ 3.8 & 80.4 $\pm$ 3.3 & 63.1 $\pm$ 6.4 & 15.3 $\pm$ 1.7 \\
% 		MEGA  & 63.1 $\pm$ 3.0 & 61.5 $\pm$ 1.3 & \textbf{5.5 $\pm$ 2.0} & 50.1 $\pm$ 3.0 & 50.6 $\pm$ 0.3 & 16.1 $\pm$ 3.9 & 65.3 $\pm$ 5.3 & 50.8 $\pm$ 5.4 & 20.5 $\pm$ 5.0 \\
% % 		RM & \textbf{0.774 $\pm$ 0.019} & \textbf{0.698 $\pm$ 0.017} & 0.043 $\pm$ 0.019 & 0.588 $\pm$ 0.028 & 0.544 $\pm$ 0.011 & 0.113 $\pm$ 0.012 & 0.805 $\pm$ 0.060 & 0.711 $\pm$ 0.088 & 0.056 $\pm$ 0.020 \\
% 		\midrule
% 		DIANA & \textbf{68.4 $\pm$ 2.0} & \textbf{62.3 $\pm$ 0.9} & 6.6 $\pm$ 1.2 & \textbf{70.4 $\pm$ 0.7} & 52.3 $\pm$ 0.3 & \textbf{2.4 $\pm$ 0.7} & \textbf{98.2 $\pm$ 1.1} & \textbf{78.7 $\pm$ 3.4} & 2.0 $\pm$ 0.3\\
% 		\bottomrule
% % 		DIANA-RM & 0.766 $\pm$ 0.005 & 0.685 $\pm$ 0.011 & \textbf{0.034 $\pm$ 0.008} & \textbf{0.763 $\pm$ 0.031} & \textbf{0.545 $\pm$ 0.011} & \textbf{0.001 $\pm$ 0.008} & \textbf{0.994 $\pm$ 0.002} & \textbf{0.941 $\pm$ 0.002} & 0.039 $\pm$ 0.006 \\
% 	\end{tabular}}%
% 	\label{tab:result_of_std_data}%



%   \centering
%   \caption{The results of average AUC, average ACC, and average Forgetting (FGT) of different
% methods on medical images ISIC2019 and satellite images EuroSat.}
% 	\scalebox{0.75}{
%     \begin{tabular}{l|ccc|ccc}
%     \toprule
%     \multicolumn{1}{c|}{\multirow{2}[2]{*}{\textbf{Method}}} & \multicolumn{3}{c|}{\textbf{ISIC2019}} & \multicolumn{3}{c}{\textbf{EuroSat}} \\
%           	& \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%} & \small{\textit{AUC} (↑)\%} & \small{\textit{ACC} (↑)\%} & \small{\textit{FGT} (↓)\%}\\
%     \midrule
%     SINGLE & 61.7 $\pm$ 7.7 & 86.7 $\pm$ 5.0 & 17.7 $\pm$ 5.3 & 69.3 $\pm$ 6.0 & 56.0 $\pm$ 1.5 & 20.0 $\pm$ 6.5 \\
%     EWC   & 59.2 $\pm$ 8.1 & 86.7 $\pm$ 5.0 & 19.4 $\pm$ 6.1 & 76.3 $\pm$ 6.3 & 55.3 $\pm$ 2.7 & 10.1 $\pm$ 7.2 \\
%     MAS   & 62.5 $\pm$ 6.5 & 86.7 $\pm$ 5.0 & 17.5 $\pm$ 6.0 & 74.6 $\pm$ 4.4 & 56.4 $\pm$ 4.0 & 13.1 $\pm$ 6.4 \\
%     GEM   & 71.3 $\pm$ 6.5 & 84.3 $\pm$ 4.6 & 7.2 $\pm$ 2.0 & 80.5 $\pm$ 3.2 & 61.0 $\pm$ 5.4 & 8.3 $\pm$ 4.1 \\
%     A-GEM  & 66.3 $\pm$ 6.5 & 86.7 $\pm$ 4.8 & 13.9 $\pm$ 5.2 & 74.0 $\pm$ 6.5 & 58.0 $\pm$ 3.4 & 13.5 $\pm$ 6.3 \\
%     GDumb & 72.2 $\pm$ 4.2 & \textbf{90.4 $\pm$ 0.5} &  \textbf{4.4 $\pm$ 2.0} & 78.3 $\pm$ 5.5 & 64.9 $\pm$ 4.6 & 4.8 $\pm$ 1.9 \\
%     DER  & 69.2 $\pm$ 3.9 & 84.1 $\pm$ 4.5 & 9.5 $\pm$ 5.4 & 73.9 $\pm$ 2.6 & 58.6 $\pm$ 2.0 & 17.5 $\pm$ 3.9 \\
%     MEGA  & 71.1 $\pm$ 5.3 & 85.8 $\pm$ 4.7 & 5.9 $\pm$ 2.4 & 79.8 $\pm$ 3.1 & 55.8 $\pm$ 4.3 & 5.3 $\pm$ 2.4 \\
%     % RM & 0.728 $\pm$ 0.050 & 0.715 $\pm$ 0.037 & 0.103 $\pm$ 0.025 & 0.863 $\pm$ 0.026 & 0.793 $\pm$ 0.015 & \textbf{0.007 $\pm$ 0.042} \\
%      \midrule
%     DIANA & \textbf{73.6 $\pm$ 4.8} & 85.2 $\pm$ 3.7 & 5.5 $\pm$ 2.5 &  \textbf{86.7 $\pm$ 1.5} &  \textbf{66.0 $\pm$ 1.8} &  \textbf{3.8 $\pm$ 1.7} \\
%     % DIANA-RM & \textbf{0.800 $\pm$ 0.005} & 0.773 $\pm$ 0.037 & \textbf{0.002 $\pm$ 0.025} & \textbf{0.908 $\pm$ 0.018} & \textbf{0.825 $\pm$ 0.030} & 0.014 $\pm$ 0.013 \\    
%     \bottomrule
%     \end{tabular}}%
%   \label{tab:result_of_prac_data}%
% \end{table*}%

% % \begin{table}

% % \end{table}%

% \begin{figure*}[htbp]
% \begin{center}
% 	\subfigure[EuroSat]{\includegraphics[width=0.8\linewidth]{figures/evoplot_eurosat.pdf}}
% 	\includegraphics[width=0.7\linewidth]{figures/legend.pdf}
% \end{center}
% %vspace*{-0.1in}
% \caption{ Evolution of average AUC on EuroSat.}
% \label{fig:evolution-eurosat}
% %%vspace{-.1in}
% \end{figure*}


% \section{Imbalanced Ratio}
% In Figure~\ref{fig:imratio}, we vary the imbalanced ratio (imratio) to evaluate the robustness of different methods. We consider three imratio settings: $\{0.01, 0.05, 0.1\}$. As expected, when the imratio increases, all the algorithms generally achieve better performance. Across all three settings, DIANA achieves the highest performance except when the imratio is $0.01$ on ISIC2019. It is worth mentioning that when the imratio is $0.01$, i.e., only $1\%$ of samples are positive, the performance of all methods, including DIANA, drops drastically. A possible reason is that when the imratio is $0.01$, the positive samples are so rare that it is difficult for any method to learn efficiently.

% \begin{figure*}[t]
% \begin{center}
% 	\subfigure[Split-CIFAR]{\includegraphics[width=0.32\linewidth]{figures/imratio_cifar.pdf}}
% 	\subfigure[Split-CUB]{\includegraphics[width=0.32\linewidth]{figures/imratio_cub.pdf}}  
% 	\subfigure[ISIC2019]{\includegraphics[width=0.32\linewidth]{figures/imratio_isic.pdf}} 
% 	\includegraphics[width=0.7\linewidth]{figures/legend.pdf}
% \end{center}
% %vspace*{-0.2in}
% \caption{Average AUC on the data with different imbalanced ratio.}
% %vspace*{-0.1in}
% \label{fig:imratio}
% %vspace{-.1in}
% \end{figure*}
% %%vspace{-.3in}


% \section{Balanced Versus Imbalanced}
% In this section, we study how imbalance affects lifelong learning. Intuitively, if the direction of gradient descent changes a lot, optimization is probably unstable and difficult. Thus, we can use the deviation of the gradient direction to indicate whether a task is getting harder or not. We first store all gradients in a task and average them to obtain an anchor gradient. We then calculate the angle between the anchor and each mini-batch gradient. By drawing the histograms of the angle deviation, we can see the distribution of gradient directions before and after the data is made imbalanced. The histograms are plotted in Figure~\ref{fig:dist-of-angle-on-current-tasks}.
% After the data is made imbalanced, the histograms have a wider range and a larger std, which means that the gradient directions vary more dramatically.

 
% \begin{figure*}[!h]
% \begin{center}
% 	\subfigure[Split-CIFAR]{\includegraphics[width=0.30\linewidth]{figures/g_direction_cifar.pdf}} \quad
% 	\subfigure[Split-CUB]{\includegraphics[width=0.30\linewidth]{figures/g_direction_cub.pdf}}  \quad
% 	\subfigure[ISIC2019]{\includegraphics[width=0.30\linewidth]{figures/g_direction_isic.pdf}}   
  
% \end{center}
% \caption{The distributions of the directions of the gradients on the current tasks. (a) balanced std: 1.59, imbalanced std: 3.50; (b) balanced std: 2.58, imbalanced std: 3.85; (c) balanced std: 2.24, imbalanced std: 5.52.}
% \label{fig:dist-of-angle-on-current-tasks}
% %%vspace{-.1in}
% \end{figure*}




% \begin{figure*}[!h]
% \begin{center}
% 	\subfigure[Balanced Split-CUB]{\includegraphics[width=0.24\linewidth]{figures/angle_dis_cub_ba.pdf}}
% 	\subfigure[Imbalanced Split-CUB]{\includegraphics[width=0.24\linewidth]{figures/angle_dis_cub_imba.pdf}}   
% 	\subfigure[Balanced Split-CIFAR]{\includegraphics[width=0.255\linewidth]{figures/angle_dis_cifar100_ba.pdf}}   
% 	\subfigure[Imbalanced Split-CIFAR]{\includegraphics[width=0.24\linewidth]{figures/angle_dis_cifar100_imba.pdf}}  


% \end{center}
% \caption{The distributions of the angles between the current gradient and the reference gradient, where the proportion of angles greater than $90^\circ$ is: (a) $1.05\%$, (b) $3.16\%$, (c) $0.13\%$, (d) $9.68\%$.}
% \label{fig:dist-of-angle}
% %%vspace{-.1in}
% \end{figure*}

% References
% \bibliography{uai2023-template}
\bibliography{zhu_155}
\end{document}
