\documentclass{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{paralist}

% % added package includes
\usepackage{caption} % customize table captions
\usepackage{array} % for column width adjustments
\usepackage{rotating} % for rotating tables
\usepackage{tabularx}
\usepackage{multirow}
\usepackage{inconsolata}
\usepackage{amsfonts}
\usepackage{arydshln}

\usepackage{color} % [usenames,dvipsnames]
\newcommand{\blue}[1]{{\color{blue}{#1}}}
\newcommand{\red}[1]{{\color{red}{#1}}}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Dirichlet Generative Rehearsal: Unlocking Continual Learning}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Harry~Q.~Bovik}
\author[1,2]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[1]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cranberry University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Second Affiliation\\
    Address\\
    …
}
\affil[3]{%
    Another Affiliation\\
    Address\\
    …
  }
  
\begin{document}
\maketitle
\begin{abstract}
% Recently, data-driven task-oriented dialogue systems (ToDs) have struggled with incremental learning due to computational constraints and time-consuming issues. Continual Learning (CL) attempts to solve this by avoiding intensive pre-training, but it faces the problem of catastrophic forgetting (CF). While generative-based rehearsal CL methods have made significant strides, it still remains challenging to generate pseudo samples that can accurately reflect the underlying task-specific distribution. In this paper, we present Dirichlet Continual Learning (DCL), a novel generative-based rehearsal strategy, for CL. Unlike the traditionally used Gaussian latent variable in the Conditional Variational Autoencoder (CVAE), DCL leverages the flexibility and versatility of the Dirichlet distribution to model the latent prior variable. This enables it to efficiently capture sentence-level features of previous tasks and to effectively guide the generation of pseudo samples. In addition, we introduce Jensen-Shannon Knowledge Distillation (JSKD), a robust logit-based knowledge distillation method, to enhance knowledge transfer during pseudo sample generation. Experiments show the efficacy of our approach outperforms state-of-the-art methods in both intent detection and slot filling tasks. Code and model checkpoints are accessible at https://github/com/*****.
Catastrophic forgetting poses a significant challenge in continual learning (CL).  In the context of Natural Language Processing (NLP), generative-based rehearsal CL methods have made notable progress in avoiding expensive pre-training.  However, generating pseudo samples that accurately capture the task-specific distribution remains a daunting task.  In this paper, we introduce Dirichlet Continual Learning (DCL), a novel generative-based rehearsal strategy designed specifically for CL.  Departing from the conventional use of Gaussian latent variables in the Conditional Variational Autoencoder (CVAE), DCL exploits the flexibility and versatility of the Dirichlet distribution to model the latent prior variable.  This enables DCL to efficiently capture sentence-level features from previous tasks and effectively guide the generation of pseudo samples.  Furthermore, we propose Jensen-Shannon Knowledge Distillation (JSKD), a robust logit-based knowledge distillation method that enhances knowledge transfer during the generation of pseudo samples.  Experiments demonstrate the efficacy of our approach, which surpasses state-of-the-art methods on two NLP tasks in task-oriented dialogue systems.  Code and model checkpoints for DCL are accessible at https://github.com/*****.
\end{abstract}



\section{Introduction}

Continual learning (CL) is a significant learning paradigm that aims to emulate the human capacity for continuous learning and knowledge accumulation while ensuring that previously learned knowledge is retained and effectively transferred to facilitate the learning of new tasks~\citep{parisi2019continual}. As models are continually trained on new tasks, they often struggle to retain knowledge from previous tasks, leading to a degradation in performance, a phenomenon known as catastrophic forgetting (CF)~\citep{mccloskey1989catastrophic}. This problem becomes even more pronounced in the context of Natural Language Processing (NLP), where the complexity and diversity of language make CL particularly challenging~\citep{ke2022continual}. Because retraining the whole model from scratch is computationally expensive and time-consuming, continually learning from the latest data and tasks is critical.



% % 这篇看看是不是需要在上面引用 https://arxiv.org/pdf/2211.12701.pdf
% \begin{figure}[!t]
%     \centering
%     \includegraphics[width=60mm]{cl.pdf}
%     \caption{In this example of Continual Learning, the LM first trains on the \textit{BANKING} dataset, resulting in parameter $\theta_1$. The LM then trains on \textit{HWU}, followed by \textit{SNIPS}, and so on. The parameters are updated sequentially.}
%     \label{fig:cl}
% \end{figure}


Existing research on CL falls into three categories. (1) {\em Regularization} methods, such as EWC~\citep{kirkpatrick2017overcoming},
% \citep{zenke2017continual}, 
MAS~\citep{aljundi2018memory}, and ARPER~\citep{mi2020continual}, aim to minimize updates to the parameters that are important for previous tasks, thereby preserving performance on those tasks. However, the accumulation of regularizers may overly constrain network parameters, impeding new task learning. (2) {\em Architectural} approaches~\citep{serra2018overcoming, ke2021achieving, madotto2021continual, zhang2022continual} mainly modify the network structure to enhance the extraction of task-specific features.  Nevertheless, their task-focused design might overlook effective knowledge transfer between old and new tasks. (3) {\em Rehearsal} strategies~\citep{lopez2017gradient, sun2019lamol, rolnick2019experience, chuang2020lifelong, mi2020continual, Mi_2020_CVPR_Workshops, zhao2022prompt} replay samples from previous tasks, which are combined with the current task dataset for training. Rehearsal methods can be categorized into ``store-based rehearsal'' and ``generative-based rehearsal''. Various studies have shown rehearsal to be a promising strategy for mitigating forgetting in CL. However, store-based rehearsal may lead to inefficiencies and increased memory demands. In contrast, generative-based rehearsal is a more effective alternative that facilitates efficient memory utilization and knowledge retention across sequential learning scenarios.

 







% (2) 
%\这里写相关的方法，存在的问题。最终引出真正的问题 

%这段内容估计需要用到: 

Generative-based rehearsal methods have shown promise by mitigating the need for extensive pre-training. These methods aim to generate pseudo samples that closely capture the task-specific distribution, allowing the model to retain the knowledge of previous tasks.  However, accurately generating such samples remains a daunting task.  The key to generative replay lies in producing high-quality pseudo samples that approximate the real data distribution of prior tasks well. Higher-quality pseudo samples intuitively contribute to better preservation of learned tasks, minimizing forgetting in CL.  However, in previous studies~\citep{sun2019lamol, chuang2020lifelong, zhao2022prompt}, the pseudo samples generated for each observed task often lack diversity or fluency, or are poorly aligned with the designated task.

% Notably, noisy pseudo samples will harm the performance of CL.
% LAMOL \cite{sun2019lamol} prevents forgetting by replaying pseudo-samples of previous tasks generated by the LM itself.
% PCLL \cite{zhao2022prompt} proposed 


% lacking diversity and training robustness 

To address the persisting problems mentioned above, in this paper, we propose a novel generative-based rehearsal strategy called Dirichlet Continual Learning (DCL) specifically designed for CL in NLP.  Unlike conventional approaches that use Gaussian latent variables in Conditional Variational Autoencoders (CVAE), DCL leverages the flexibility and versatility of the Dirichlet distribution to model the latent prior variable.  This enables DCL to efficiently capture and represent sentence-level features from previous tasks, providing a strong foundation for generating high-quality pseudo samples.  Moreover, we introduce Jensen-Shannon Knowledge Distillation (JSKD), a robust logit-based knowledge distillation method that enhances knowledge transfer during the generation of pseudo samples.  By leveraging the relationships between the model logits and the ground truth labels, JSKD facilitates the transfer of task-specific knowledge from the model to the generated samples, further improving the effectiveness of the rehearsal process.

% To evaluate the efficacy of our proposed DCL, we conducted comprehensive experiments on two significant NLP tasks in task-oriented dialogue systems. The results demonstrate that our DCL framework, combined with JSKD, outperforms state-of-the-art methods, achieving superior performance in terms of both accuracy and knowledge retention.



We summarize our main contributions as follows:
\begin{compactitem}
\item We propose a generative-based rehearsal method that can mitigate forgetting by approximating the real data distribution with the Dirichlet latent variable in the CVAE framework. 
\item We develop Jensen-Shannon Knowledge Distillation (JSKD), a new logit-based knowledge distillation strategy, which enables better and more robust knowledge transfer between teacher and student models.
\item Experimental results demonstrate the significant improvement of DCL. Compared with the state-of-the-art baseline PCLL, DCL improves accuracy by 3.48\% and LCA by 4.22\% on the intent detection task, and improves accuracy by 2.89\% and LCA by 6.08\% on the slot filling task.
\end{compactitem}




\section{Related Work}
We highlight the related work in the following two parts: continual learning, and continual learning in NLP.

\subsection{Continual Learning}
Continual learning (CL) methods fall into three main categories: \emph{regularization}, \emph{architectural}, and \emph{rehearsal} strategies.

\emph{Regularization} methods reinforce previous knowledge by constraining crucial parameters and introducing regularization terms into the loss function.  Elastic Weight Consolidation (EWC)~\citep{kirkpatrick2017overcoming} is notable for identifying crucial parameters and penalizing changes to them, thereby preserving performance on previous tasks.
\citet{zenke2017continual} compute per-synapse consolidation strength online throughout the entire learning trajectory in parameter space. In contrast, EWC determines synaptic importance offline, specifically as the Fisher information at the minimum loss for a designated task.
MAS \citep{aljundi2018memory} computes the importance of neural network parameters in an unsupervised and online manner, showcasing its effectiveness in preserving critical knowledge over sequential tasks.
% Progress \& Compress \citep{schwarz2018progress} introduces a unified framework for continual learning, utilizing two neural networks—an active column and a knowledge base—to optimize task-specific parameters and facilitate feature reuse from prior tasks, enhancing positive transfer.
LwM~\citep{Dhar_2019_CVPR} utilizes an Attention Distillation Loss ($L_{\mathrm{AD}}$) to support the progressive learning of new classes, so that it effectively preserves information on base classes when incorporating new classes.




\emph{Architectural} approaches modify the network structure by incorporating task-specific parameters into the base models, effectively capturing task-specific features and minimizing catastrophic forgetting.
PathNet~\citep{fernando2017pathnet} introduces dynamic pathway evolution, allowing the model to learn task-specific paths through a shared network, fostering transfer learning and optimizing task performance.
CLAN \citep{seff2017continual} leverages adversarial training to maintain a balance between learning new tasks and retaining knowledge from previous tasks, reducing forgetting in CL.
Piggyback GAN \citep{zhai2020piggyback} efficiently tackles CF in CL by sharing filters between tasks, achieving high-quality generation for current tasks with fewer parameters, and preserving performance on previous tasks.
AdapterCL \citep{madotto2021continual} introduces a CL benchmark for ToDs with 37 domains, evaluating existing methods and proposing a simple yet effective architectural approach based on residual adapters \citep{houlsby2019parameter}.




\emph{Rehearsal} methods, which sustain performance by leveraging samples from previous tasks, can be categorized into store-based and generative-based rehearsal. Store-based rehearsal methods \citep{hayes2021replay, jin2021gradient} rely on episodic memory to store examples from previous tasks.
iCaRL~\citep{Rebuffi_2017_CVPR} is an incremental approach that employs a herding-based step for selecting representative samples to mitigate the CF problem.
Gradient Episodic Memory (GEM) \citep{lopez2017gradient} uses memory to store and replay important exemplars from previous tasks while learning new ones. 
In ER \citep{rolnick2019experience}, the model is continuously trained using batch gradient descent by sampling examples from both the current task and the episodic memory.
Generative-based rehearsal methods such as that of \citet{shin2017continual} create samples with generative adversarial networks (GANs)~\citep{goodfellow2014generative}.
ReMix~\citep{Mi_2020_CVPR_Workshops} generates pseudo samples from prior tasks with Mixup~\citep{zhang2017mixup}.



% In summary, paper1 introduces LAMOL, a lifelong language learning method that prevents catastrophic forgetting by replaying pseudo-samples of previous tasks without requiring extra memory or model capacity. Paper2 addresses the challenge of catastrophic forgetting in natural language generation for task-oriented dialogue systems through ARPER, a method that replays prioritized historical exemplars and employs adaptive regularization. Finally, paper3 proposes the Generalized Class Incremental Learning (GCIL) framework and introduces ReMix, a simple yet effective method combining Exemplar Replay and Mixup to outperform state-of-the-art methods in realistic class incremental learning scenarios on CIFAR-100.

\subsection{Continual Learning in NLP}
Following the general framework of CL, in the domain of continual learning for NLP, researchers have explored various strategies to tackle the challenges associated with evolving language tasks.
LAMOL~\citep{sun2019lamol} is a rehearsal method based on continual sequence generation. As a generative-based rehearsal approach, it does not require memory to store previous samples.
ARPER \citep{mi2020continual} presented the initial attempt to explore practical continual learning configuration for Natural Language Generation (NLG) by prioritized exemplar replay and adaptive regularization based on EWC. Specifically, ARPER prioritizes representative and diverse utterances in exemplar selection, aiming to comprehensively cover information from previous tasks.  
AdapterCL \citep{madotto2021continual} is an architectural approach that places residual adapters \citep{houlsby2019parameter} atop the transformer layer to approximate each task.
CPT \citep{zhu2022continual} ensures non-forgetting and bidirectional knowledge transfer in a parameter-efficient dialog system, utilizing techniques like prompt learning, memory replay, and query fusion.
SSLL \citep{zhao2022semi} integrates both labeled and unlabeled data for sequentially arriving tasks, with specialized modules to mitigate forgetting and harness the potential of unlabeled data. 
PCLL~\citep{zhao2022prompt} adopts a Conditional Variational Autoencoder (CVAE) to generate pseudo samples from past tasks.
ACM \citep{zhang2022continual} adopts a two-stage method to achieve efficient continual sequence generation by dynamically adding or reusing modules based on task similarity, along with pseudo rehearsal for effective knowledge transfer, outperforming existing baselines.



% \subsection{Latent Dirichlet Allocation}
%  Latent Dirichlet Allocation (LDA) \citep{blei2003latent} is a popular topic model, employing the Dirichlet distribution to model topic and word distributions in documents. It serves as the conjugate prior to the multinomial distribution. LDA-based document models for ad-hoc retrieval were proposed in \citep{wei2006lda}. An online variational bayes (VB) algorithm for LDA was developed by \citet{hoffman2010online}. \citet{foulds2013stochastic} proposed a stochastic algorithm for collapsed VB inference in LDA. The Embedded Topic Model (ETM) \citep{dieng2020topic} combines LDA and word embeddings to identify interpretable topics with large vocabularies including rare and stop words. In addition, \citet{li2020dirichlet} introduced a Dirichlet graph VAE for graph generation and clustering.


% \subsection{Reparametrize Trick}
% The Reparametrize Trick (RT) is pivotal in probabilistic machine learning, particularly in variational autoencoder (VAE). It transforms the sampling process by decoupling it from model parameters. Instead of directly sampling from the latent distribution, the RT employs a deterministic transformation using a standard Gaussian distribution. Specifically, for a standard Gaussian sample \( \epsilon \), it adjusts the values based on the distribution's parameters (\(\mu\) for the mean and \(\sigma\) for the standard deviation) using the formula:
% \[
% z = \mu + \sigma \odot \epsilon
% \]
% Here, \(\odot\) denotes element-wise multiplication. This transformation facilitates efficient gradient optimization during training, enhancing the stability and convergence of the model.

% If the cumulative distribution function (CDF) is known and invertible, inverse CDF sampling can be used to do the sampling. Otherwise, alternative methods such as rejection sampling or Markov Chain Monte Carlo (MCMC) are employed. 







\begin{figure*}[!t]
    \centering    \includegraphics[width=150mm]{DCL_overview.pdf}
    \caption{Overview of the proposed DCL model.  DCL consists of two main modules: the pseudo-rehearsal module and the LM training module.  The pipeline of DCL can be summarized as follows: (1) In training Task $N$, the pseudo-rehearsal module applies CVAE to produce pseudo samples from Task $1$ to Task ${N-1}$. (2) These pseudo samples are then combined with the data in Task $N$. (3) The combination would be used in the current task training in the LM training module.}
    \label{fig:overview1}
\end{figure*}


% \input prelimiaries.tex

\section{Preliminaries}
Consider a sequence of tasks \(T=\{T_1, \cdots, T_N\}\), where \(N\) is the number of tasks. For the $n$-th task, denoted $T_n$, the dataset is \(\mathcal{D}_n=\{(x_i, y_i)\}_{i=1}^{N_n}\), where $N_n$ denotes the number of samples in $T_n$.  Here, \(x_i\) is the $i$-th input utterance, and \(y_i\) is its label.  In the intent detection task, a typical example is \{\textit{``utterance'': ``i need you to get me a flight booked from houston to miami on united airlines'', ``intent'': ``book flight''}\}, where $x_i$ is \textit{``i need you to get me a flight booked from houston to miami on united airlines''} and $y_i$ is the intent \textit{``book flight''}.
Meanwhile, in the slot filling task, a typical example is \{\textit{``how many comedy movies starring kevin costner have come out in the year 2000'', ``GENRE: comedy; ACTOR: kevin costner''}\}.  In this example, $x_i$ is \textit{``how many comedy movies starring kevin costner have come out in the year 2000''}, while $y_i$ is \textit{``GENRE: comedy; ACTOR: kevin costner''}.






\section{Dirichlet Generative Rehearsal}
\label{sec:def}
Our Dirichlet Continual Learning (DCL) method performs Dirichlet generative rehearsal: it utilizes a CVAE to mitigate forgetting in CL.  As outlined in Fig.~\ref{fig:overview1}, DCL consists of two main modules: the pseudo-rehearsal module and the Language Model (LM) training module.  To be specific, we apply the CVAE to generate pseudo samples of previous tasks before training on the current task, and the LM then continues training on both the pseudo samples of previous tasks and the real samples of the current task.
In CVAE, both encoder and decoder employ pre-trained GPT-2 \citep{radford2019language} with distinct parameters to encode information and generate pseudo samples for tasks. 
Notably, the LM shares parameters with the decoder of CVAE. % This delicate framework leverages a unified model to solve sequential tasks and generates pseudo samples simultaneously. 
We also propose Jensen-Shannon Knowledge Distillation (JSKD), which is elaborated in Sec.~\ref{sec:3.5}.

% $\Tilde{x}_{i,n}={P}_n \oplus x_i \oplus P^*_n$ 

\subsection{CL Via Generative Rehearsal}
DCL consists of two modules: the pseudo-rehearsal module and the LM training module. The total loss of DCL, denoted \(\mathcal{L}_{total}\), is the sum of the losses of the two modules:
\begin{equation}
\label{total loss}
   \mathcal{L}_{total} = \mathcal{L}_{\rm CVAE}+ \mathcal{L}_{\rm LM}
\end{equation}


Let $\mathcal{D}_{curr}$ be the set of current training samples and $\mathcal{D}_{pseu}$ be the set of generated pseudo samples of previous tasks.  $\mathcal{D}_{\cup}=\mathcal{D}_{curr}\cup\mathcal{D}_{pseu}$ is thus the set of samples used to train the current task.  Given an input utterance $x_i$ in task $T_n$, we generate its label $y_i$. To achieve task-dependent generation, a specific prefix prompt ${P}_n$ and postfix prompt $P^*_n$ are first defined for $T_n$.  They are then concatenated with the input utterance, yielding the augmented input $\Tilde x_i={P}_n \oplus x_i \oplus P^*_n$,
where $\oplus$ denotes word concatenation.
Details of the ${P}_n$ and $P^*_n$ are described in Appendix A. 
% \ref{sec:appendixa}.
% The CVAE can be utilized to generate the pseudo sample for $T_n$ based on $\Tilde{x}_{i}$. 
In our implementation, LM is parameterized based on a pre-trained GPT-2.  The training loss of the LM is defined as: 
\begin{align}
&\mathcal{L}_{\rm LM}(\theta)\nonumber\\
&=-\sum_{(x_i,y_i)\in \mathcal{D}_{\cup}}\bigl[\log p_{\theta}(\Tilde x_i,y_i)
+ \log p_{\theta}(y_i|\Tilde x_i)\bigr].
\end{align}



\subsection{Pseudo Generation by Dirichlet Latent Variable}
Variational Autoencoder (VAE) models the distribution of high-dimensional data $x$ using lower-dimensional latent variables $z$. 
Let $z$ be a continuous variable representing the sentence-level features of input utterance $x$.  
The generative process involves an encoder $q_\phi(z | x)$ mapping $x$ to approximate the true posterior $p(z | x)$.  Latent variables are sampled from $q_\phi(z | x)$, and a decoder $p_\theta(x | z)$ reconstructs $x$. 
CVAE extends VAE by conditioning on an attribute $c$.  In our implementation, we take the task ID as $c$.  CVAE is trained to maximize the conditional log-likelihood $\log {p(x|c)}$; since this is intractable, the lower bound $\mathcal{L}(\theta,\phi;x,c)$ is optimized instead:
\begin{align}
\label{elbo}
% \log{p_\theta(x)}
% & \ge 
\mathcal{L}(\theta,\phi;x,c)
&=-\lambda\, {\rm KL}(q_\phi(z|x, c)\,\|\,p_\theta(z|c)) \nonumber\\
&+\mathbb{E}_{q_\phi(z|x, c)}[\log{p_\theta}(x|z, c)]\nonumber\\
&\le \log{p(x|c)},
\end{align}
where $\theta$ is the model parameter, $p_\theta(z|c)$ is the prior distribution of $z$, $q_\phi(z|x, c)$ approximates the intractable true posterior distribution, 
and $\lambda$ is the dynamic KL weight, gradually increasing from 0 to 1 via the annealing technique, to mitigate the KL-vanishing as proposed by \citet{bowman2016generating}. 

In DCL, instead of modeling $z\sim \mathcal{N}(\mu,\sigma^2)$ as a symmetric Gaussian in continuous space, we introduce the Dirichlet latent variable $z\sim \mathrm{Dir}(\alpha)$ to express the latent variable $z$ originating from discrete space. The Dirichlet density can be concave, convex, symmetrical, or asymmetrical, making it an appealing option for our model.
We apply rejection sampling to reparameterize the Dirichlet latent variable \citep{jankowiak2018pathwise}.

% Traditionally, $z$ is Gaussian prior which tends to KL-vanishing.  
% KL-vanishing means the CVAE model would degrade into an autoencoder model which tends to generate generic and meaningless utterances \cite{zeng2019dirichlet}. We propose to use $z\sim D_{ir}(\cdot)$ to express $z$ originating discrete space $\mathcal{X}$. 
 
% The main reason is that a symmetric Gaussian from continuous space is not flexible enough to express the latent variable $z$ originating from discrete space. In DCL, we introduce the Dirichlet distribution to approximate the latent variable since this distribution owns a more flexible mathematical structure. The Dirichlet distribution can be concave, convex, symmetrical, or asymmetrical, making it an appealing option for our model.
% We use the reject sampling to reparametrize the Dirichlet latent variable \cite{jankowiak2018pathwise}.

 

% The probability density function (PDF) of a Dirichlet distribution with parameters $\boldsymbol{\alpha}$ is given by:

% \[
% f(\mathbf{x}; \boldsymbol{\alpha}) = \frac{\Gamma\left(\sum_{i=1}^{K}\alpha_i\right)}{\prod_{i=1}^{K}\Gamma(\alpha_i)} \prod_{i=1}^{K} x_i^{\alpha_i - 1}
% \]

% where $\mathbf{x} = (x_1, x_2, \ldots, x_K)$ is a K-dimensional vector that lies in the K-dimensional simplex, i.e., $x_i \geq 0$ and $\sum_{i=1}^{K} x_i = 1$. Here, $\Gamma(\cdot)$ is the gamma function.


% However, as illustrated in \cite{shen2018improving, zeng2019dirichlet}, although the weighting scheme can be used, KL-vanishing can not be essentially tackled. The main reason is that a symmetric Gaussian from continuous space is not flexible and sufficient enough to express the latent $z$ originating from discrete space. Here, we introduce the Dirichlet distribution, which uses a more flexible structure to approximate the prior distribution of $z$. The versatile forms of the Dirichlet distribution, which can be concave, convex, symmetrical, or asymmetrical, make it an appealing choice for our model.
The CVAE loss, denoted $\mathcal{L}_{\rm CVAE}$, is the negative of the evidence lower bound (ELBO). Following Eq.~\eqref{elbo}, we have
\begin{equation}
\mathcal{L}_{\rm CVAE} = \mathcal{L}'_{\rm KL}+\mathcal{L}_{\rm Rec},
\end{equation}
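where $\mathcal{L}_{\rm Rec}=-\mathbb{E}_{q_\phi(z|x,c)}[\log p_\theta(x|z,c)]$ is the reconstruction loss, and $\mathcal{L}'_{\rm KL}$ can be expressed as follows after derivation \citep{zeng2019dirichlet}: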
\begin{align}
\label{derivation_kl}
\mathcal{L}'_{\rm KL}=&~{\rm KL}(q_\phi(z|x,c)\,\|\,p_\theta(z|c))= \nonumber\\
&\log\Gamma\Bigl(\sum_{k=1}^{K}\alpha_k\Bigr)-\sum_{k=1}^{K}\log\Gamma(\alpha_k)\nonumber\\
&-\log\Gamma\Bigl(\sum_{k=1}^{K}\beta_k\Bigr)
+\sum_{k=1}^{K}\log\Gamma(\beta_k)\nonumber\\
&+\sum_{k=1}^{K}(\alpha_k-\beta_k)\Bigl(\psi(\alpha_k)-\psi\Bigl(\sum_{j=1}^{K}\alpha_j\Bigr)\Bigr),
\end{align}
where $\alpha$ and $\beta$ represent the parameters of the Dirichlet distributions $q_\phi(z|x,c)$ and $p_\theta(z|c)$, respectively. $K$ denotes the dimension of $z$. $\Gamma$ is the gamma function and $\psi$ is the Digamma function. 

% \subsection{Learning}
% \subsubsection{Variational Learning}
% We consider the pseudo sample generation as a conditional generation process given the input utterance and the task ID. Let $z$ be a continuous variable as the summarization and feature representation of input for representing the sentence-level features.



% \subsubsection{LM Learning}
% We use an language model $\mathcal{M}$ to incrementally learn tasks and the training loss of $\mathcal{M}$ is defined as: 
% \begin{align}
% \small
% &\mathcal{L}_{\rm LM}(\theta)\nonumber\\
% &=-\sum_{(x_i,y_i)\in \mathcal{D}_{cups}}^{}\log p_{\theta}{(x_i,y_i)} 
% + \log p_{\theta}{(y_i|x_i)}.
% \end{align}
% To be specific, In our implementation, $\mathcal{M}$ is parameterized based on a pre-trained GPT2. Further, $\mathcal{M}$ shares parameters with the decoder of CVAE.



% Taking task $T_n$ as an example, the pseudo-rehearsal module first generates pseudo samples of previous tasks $T_1, \cdots, T_{n-1}$ and then we combine the generated pseudo samples with the samples for the task $T_n$ to train the model. Hence, training dataset for task $T_n$ becomes $\mathcal{D}_{cups}=\mathcal{D}_{curr}\cup\mathcal{D}_{pseu}$ where $\mathcal{D}_{curr}$ denotes current task samples, $\mathcal{D}_{pseu}$ represents generated pseudo samples, and $\mathcal{D}_{cups}$ denotes the combination of current task samples and generated pseudo samples. The training loss is defined as: 
% \begin{align}
% \small
% &\mathcal{L}_{\rm LM}(\theta)\nonumber\\
% &=-\sum_{(x_i,y_i)\in \mathcal{D}_{cups}}^{}\log p_{\theta}{(x_i,y_i)} 
% + \log p_{\theta}{(y_i|x_i)}.
% \end{align}



% Referring to the common practice \cite{zhao2022prompt, madotto2021continual, mi2020continual, zhu2022continual}, we mainly consider intent detection and slot filling tasks in NLU.







% \subsection{Overview}
% \label{sec:archi}


% In the rehearsal method of continual learning, 
% the representative and diverse exemplars contribute to the model performance, as presented in \cite{mi2020continual}. Thus, it is important to generate representative and diverse pseudo utterances. Among commonly used generative models, VAE and GAN are prominent. VAE, in particular, is known for generating meaningful and diverse utterances \cite{serban2017hierarchical}. We propose a CVAE-based method called DCL to mitigate forgetting in ToDs.

% As shown in Figure \ref{fig:overview1}, DCL has two main modules: pseudo-rehearsal and Language Model (LM) training modules. The pseudo-rehearsal module employs CVAE to generate pseudo samples from previous tasks. Then, the LM is trained using both pseudo samples of previous tasks and real samples of the current task to enable continual learning. 
% % The total training loss of DCL combines the CVAE loss for pseudo-rehearsal and the LM loss for updating LM parameters.
% % The structure of the CVAE and LM models is outlined as follows. 
% In CVAE, both encoder and decoder employ GPT-2 \cite{radford2019language} with distinct parameters to encode information and generate pseudo samples for tasks. 
% % In a Continual Learning (CL) context, the LM is expected to generate samples for task sequences, making the decoder of the CVAE also function as the LM as depicted in the LM training Module of Figure \ref{fig:overview1}. 
% Notably, the decoder of the CVAE also functions as the LM as depicted in Figure \ref{fig:overview1}. 
% This delicate framework leverages a unified model to solve sequential tasks and generates pseudo samples simultaneously. 

% In the following sections, we introduce the pseudo-rehearsal module (Sec. \ref{sec:3.3}) and LM training module (Sec. \ref{sec:3.4}) in detail. Then we present the proposed Jensen-Shannon Knowledge Distillation in (Sec. \ref{sec:3.5}) and explain the advantages of using JS divergence over KL divergence.


% % \subsection{Dirichlet-guided Pseudo-rehearsal}
% % \label{sec:3.3}
% % In the proposed DCL model, we introduce a CVAE to generate pseudo samples. 
% % As illustrated in \cite{shen2018improving, zeng2019dirichlet}, KL-vanishing problems can not be essentially tackled even using a weighting scheme. KL-vanishing means the CVAE model would degrade into an autoencoder model which tends to generate generic and meaningless utterances \cite{zeng2019dirichlet}. The main reason is that a symmetric Gaussian from continuous space is not flexible enough to express the latent variable $z$ originating from discrete space. In DCL, we introduce the Dirichlet distribution to approximate the latent variable since this distribution owns a more flexible mathematical structure. The Dirichlet distribution can be concave, convex, symmetrical, or asymmetrical, making it an appealing option for our model.
% % We use the reject sampling to reparametrize the Dirichlet latent variable \cite{jankowiak2018pathwise}.


% % This module aims to generate the pseudo samples based on the task ID and prompt \cite{lester2021power}. Given the input utterance $x_i$, we generate $y_i$ in the task $T_n$. To achieve task-dependent generation, specific prefix prompt ${P}_n$ and postfix prompt $P^*_n$ for $T_n$ are first defined. Then they are concatenated to the input utterance, yielding the augmented input $\Tilde{x}_{i,n}={P}_n \oplus x_i \oplus P^*_n$. 
% % Details of the prompt design are described in \ref{sec:appendixa}.
% % The CVAE can be utilized to generate the pseudo sample for $T_n$ based on $\Tilde{x}_{i,n}$ \cite{zhao2017learning}. 
% % % he key idea of CVAE is to reconstruct the input $x$ through the latent variable $z$, which is normally modeled through the Gaussian distribution. 

% % The CVAE is trained to maximize the log-likelihood $\log {p_(x|c)}$. The lower bound ELBO $\mathcal{L}(\theta,\phi;x,c)$ is used for tractable optimization:
% % \begin{align} 
% % \small
% % \label{elbo}
% % % \log{p_\theta(x)}
% % % & \ge 
% % \mathcal{L}(\theta,\phi;x,c)
% % &=-\lambda KL(q_\phi(z|x, c)||p_\theta(z|c)) \nonumber\\
% % &+\mathbb{E}_{q_\phi(z|c, x)}[\log{p_\theta}(x|z, c)]\nonumber\\
% % &\le \log{p(x|c)},
% % \end{align}
% % where $\theta$ is the model parameter, $p_\theta(z|c)$ is the prior distribution of $z$, $q_\phi(z|x, c)$ approximates the intractable true posterior distribution, $c$ defines the task ID, and $\lambda$ is the dynamic KL weight to mitigate the KL-vanishing as proposed by \citet{bowman2016generating}. 
% % % However, as illustrated in \cite{shen2018improving, zeng2019dirichlet}, although the weighting scheme can be used, KL-vanishing can not be essentially tackled. The main reason is that a symmetric Gaussian from continuous space is not flexible and sufficient enough to express the latent $z$ originating from discrete space. Here, we introduce the Dirichlet distribution, which uses a more flexible structure to approximate the prior distribution of $z$. The versatile forms of the Dirichlet distribution, which can be concave, convex, symmetrical, or asymmetrical, make it an appealing choice for our model.
% % The CVAE loss denoted as $\mathcal{L}_{\rm CVAE}$ is the negative of ELBO. Following \eqref{elbo}, we have
% % \begin{equation}
% % \mathcal{L}_{\rm CVAE} = \mathcal{L}'_{\rm KL}+\mathcal{L}_{\rm Rec},
% % \end{equation}
% % where $\mathcal{L}'_{\rm KL}$ can be expressed as follows after derivation \cite{zeng2019dirichlet}:
% % \begin{align}
% % \small
% % \label{derivation_kl}
% % &KL(q_\phi(z|x,c)||p(z|c))= \nonumber\\
% % &\log\Gamma(\sum_{k=1}^{K}\alpha_k)-\sum_{k=1}^{K}\log\Gamma(\alpha_k)\nonumber\\
% % &-\log\Gamma(\sum_{k=1}^{K}\beta_k)
% % +\sum_{k=1}^{K}\log\Gamma(\beta_k)\nonumber\\
% % &+\sum_{k=1}^{K}(\alpha_k-\beta_k)(\psi(\alpha_k)-\psi(\sum_{k=1}^{K}\alpha_k)),
% % \end{align}
% % where $\alpha$ and $\beta$ represent the parameters of the Dirichlet distributions $q_\phi(z|x,c)$ and $p_\theta(z|c)$, respectively. $K$ denotes the dimension of $z$, and $\psi$ is the Digamma function. 


% \subsection{LM Training Module}
% \label{sec:3.4}
% LM training module shares parameters with the decoder of CVAE. Taking task $T_n$ as an example, the pseudo-rehearsal module first generates pseudo samples of previous tasks $T_1, \cdots, T_{n-1}$ and then we combine the generated pseudo samples with the samples for the task $T_n$ to train the model. Hence, training dataset for task $T_n$ becomes $\mathcal{D}_{cups}=\mathcal{D}_{curr}\cup\mathcal{D}_{pseu}$ where $\mathcal{D}_{curr}$ denotes current task samples, $\mathcal{D}_{pseu}$ represents generated pseudo samples, and $\mathcal{D}_{cups}$ denotes the combination of current task samples and generated pseudo samples. The training loss is defined as: 
% \begin{align}
% \small
% &\mathcal{L}_{\rm LM}(\theta)\nonumber\\
% &=-\sum_{(x_i,y_i)\in \mathcal{D}_{cups}}^{}\log p_{\theta}{(x_i,y_i)} 
% + \log p_{\theta}{(y_i|x_i)}.
% \end{align}




\subsection{Jensen-Shannon Knowledge Distillation}
\label{sec:3.5}
Due to the potential drift from the real data distribution in the pseudo data, simply combining the generated pseudo samples with the training data may add noise and harm the model's performance.  
Following existing studies~\citep{chuang2020lifelong, mi2020continual, zhao2022prompt, chen2023lifelong}, we adopt knowledge distillation to shield our model from the impact of noisy pseudo data.
Traditionally, these methods follow \citet{hinton2015distilling}, who propose using KL divergence to measure the similarity between the output distributions of the student and teacher models. 
However, KL divergence is not robust since it is sensitive to outliers, especially in scenarios with noisy or sparse data. Jensen-Shannon (JS) divergence offers a more stable alternative by introducing a symmetric term. 
This makes JS divergence advantageous for assessing dissimilarity in complex and diverse data distributions, mitigating the limitations associated with KL divergence.
Thus, we propose Jensen-Shannon Knowledge Distillation (JSKD), a novel logit-based knowledge distillation method that achieves remarkable performance. 
% Compared to traditional KL divergence knowledge distillation, JS divergence has the advantage of robustness since the symmetry ensures consistent values regardless of comparison order, which eases the distribution similarity measurement.





To be specific, when the model is trained on task $n$, it acts as the student model while the model trained on task $n-1$ serves as the teacher model. As the training continues, the model trained on task $n$ would switch the role to become the teacher whereas the model trained on task $n+1$ takes over the role of the student. 
This role-switching continues iteratively, ensuring knowledge transfer from the teacher model to the student model. In this way, knowledge from previous tasks is accumulated throughout CL.
% We introduce knowledge distillation because this strategy helps prevent model weights from drifting too far when learning new tasks, thereby mitigating forgetting in CL. 


% \begin{figure}[!t]
%     \centering
%     \includegraphics[width=70mm]{kd.pdf}
%     \caption{Knowledge Distillation of DCL.}
%     \label{fig:kd}
%     \vspace{-1em}
% \end{figure}


\noindent\textbf{Knowledge Distillation}
Given a training sample $(x,y)$, we want to minimize the cross-entropy between the output distribution of the teacher and student models. The training objective is:
\begin{equation}
\small
\mathcal{L}_{\rm KD} = \alpha \cdot \mathcal{L}_{\rm KL}(S, T) \cdot\tau^2  + (1 - \alpha) \cdot \mathcal{L}_{\rm CE}(S, Y),
\end{equation} 
where $T$ and $S$ are teacher and student predictions, respectively. $\tau$ is the temperature to soften the teacher's predictions while $\mathcal{L}_{\rm CE}(S, Y)$ quantifies the cross-entropy loss between student predictions and the ground truth labels $Y$. $\mathcal{L}_{\rm KL}$ implicitly prevents the parameters of the student model from straying too far away from the ones of the teacher model. The first term denotes a soft target while the second term is a hard target. $\alpha\in[0, 1]$ balances the soft and hard target evaluations.

\noindent\textbf{JS Divergence vs. KL Divergence} For distributions $p$ and $q$, JS divergence \citep{lin1991divergence} is defined as:
\begin{align}
\small
\mathcal{L}_{\rm JS}(p \parallel q) = &\frac{1}{2} {\mathcal{L}_{\rm KL}\left(p \parallel \frac{1}{2}(p + q)\right)} \nonumber\\
&+ \frac{1}{2} {\mathcal{L}_{\rm KL}\left(q \parallel \frac{1}{2}(p + q)\right)}.
\end{align}
JS divergence offers two advantages over KL divergence: a) its symmetry ensures consistent values regardless of comparison order, making it ideal for measuring distribution similarities; b) JS divergence is bounded in $[0,1]$ (with base-2 logarithms), while KL divergence spans $[0,+\infty)$.
JS divergence symmetrically measures the similarity between two probability distributions, with values ranging from 0 (identical distributions) to 1 (no shared support), in contrast to the asymmetric and unbounded KL divergence.
The above properties make JS divergence more suitable for knowledge distillation than KL since the KL divergence will be infinite when one sample appears only in one task distribution.


\noindent\textbf{Knowledge Distillation via JS Divergence }
Motivated by the above discussions, we propose a JS divergence-based knowledge distillation (JSKD) to accurately measure the distance between teacher and student models, enhancing model robustness. The JSKD loss is defined as:
\begin{equation}
\small
\label{kd}
\mathcal{L}_{\rm KD} = \alpha \cdot \mathcal{L}_{\rm JS}(S, T) \cdot \tau^2 + (1 - \alpha) \cdot \mathcal{L}_{\rm CE}(S, Y). 
\end{equation} 
Specifically, we adopt the model trained on the preceding task as the teacher and the model trained on the current task as the student. As aforementioned, DCL optimizes $\mathcal{L}_{total}$ in equation~\eqref{total loss}. For CVAE, the training loss is
$\mathcal{L}_{\rm CVAE}=\mathcal{L}_{\rm Rec} + \mathcal{L}'_{\rm KL}$. 
Incorporating knowledge distillation, $\mathcal{L}_{\rm Rec}$ and $\mathcal{L}_{\rm LM}$ for task $T_n$ are defined as:
\begin{align}
&\mathcal{L}_{\rm Rec} = \alpha\mathcal{L}_{\rm JS}(l_c, l_c^*)\tau^2  + (1 - \alpha) \mathcal{L}_{\rm CE}(l_c, Y),\nonumber\\
&\mathcal{L}_{\rm LM} = \alpha\mathcal{L}_{\rm JS}(l_l, l_l^*) \tau^2 + (1 - \alpha) \mathcal{L}_{\rm CE}(l_l, Y),
\end{align}
where $l_c$ and $l_l$ are the output logits of the CVAE and the LM for task $T_n$, respectively, $l_c^*$ and $l_l^*$ represent the corresponding output logits for task $T_{n-1}$, and $Y$ signifies the ground truth. We emphasize that $\mathcal{L}'_{\rm KL}$ is the KL loss in equation~\eqref{derivation_kl}, which evaluates the distance between the assumed Dirichlet data distribution and the real distribution. It is different from $\mathcal{L}_{\rm KL}$, which evaluates the distance between the student and teacher models in cross-task knowledge distillation. 





\section{Experiments}
% \red{
% In the following, we conduct extensive experiments to address the following questions: (1) ; （2) ; (3) 
% }

\subsection{Datasets}
Following the setup of PCLL \citep{zhao2022prompt}, we evaluate our DCL by mimicking CL for two tasks: 
intent detection and slot filling.  We adopt the pre-processed datasets released by PCLL in the experiments.
% For the intent detection task, we use the HWU \cite{liu2019benchmarking}, BANKING \cite{casanueva2020efficient}, CLINC \cite{larson2019evaluation}, SNIPS \cite{coucke2018snips}, AITS \cite{hemphill1990atis}, and TOP\cite{gupta2018semantic} datasets. For fairness, we follow PCLL's \cite{zhao2022prompt} data preprocessing procedure. The TOP dataset is divided into three subsets: TOP-S1, TOP-S2, and TOP-S3, considered separate tasks. This partitioning increases the total number of tasks for continual training evaluation.
For intent detection, we employ HWU \citep{liu2021benchmarking}, BANKING \citep{casanueva2020efficient}, CLINC \citep{larson2019evaluation}, SNIPS \citep{coucke2018snips}, ATIS \citep{hemphill1990atis}, and TOP \citep{gupta2018semantic} datasets. 
Consistent with previous works \citep{zhao2022prompt}, we divide the TOP dataset into three separate subsets: TOP-S1, TOP-S2, and TOP-S3. Each one is treated as an individual task to expand the number of tasks for CL evaluation.
For slot filling, we utilize five datasets for which slot labels are available: SNIPS, ATIS, DSTC \citep{rastogi2020towards}, MIT-MOVIE\footnote{\url{https://groups.csail.mit.edu/sls/downloads/}\label{mit-courpus}}, and MIT-RESTAURANT\footref{mit-courpus}. Each of these datasets is treated as an individual task, resulting in the learning of five distinct tasks during CL.
As a result, following PCLL, there are a total of 8 tasks in intent detection and 5 tasks in slot filling, respectively. 
 %We adopt the pre-processed datasets released by PCLL.  
%The pre-processed datasets are publicly accessed in the PCLL code repository. We use the pre-processed datasets they published. 

For a fair comparison, these tasks are learned in six different orders, and the average performances of these orders are reported. Dataset orders are listed in Appendix B. 
% \ref{sec:appendixb}.


\subsection{Compared Methods}
To demonstrate the effectiveness of our proposed DCL, we compare it with eleven methods.
% : (1) \textbf{Fine-tune} the pre-trained language models; (2) \textbf{EWC} \citep{kirkpatrick2017overcoming} is a regularization method that mitigates CF by constraining crucial parameters while enabling unimportant ones to be adapted to the new task; (3) \textbf{MAS} \citep{aljundi2018memory} quantifies parameter importance in the network based on task memory contributions toward mitigated CF. 
We \textbf{Fine-tune} the pre-trained language model GPT-2 on the stream of tasks without any strategy to prevent CF.
Performance of Multi-task (\textbf{Multi}) learning is the upper bound. 

\noindent\textit{\textbf{Regularization:}} \textbf{EWC} \citep{kirkpatrick2017overcoming} is a regularization method that mitigates CF by constraining crucial parameters while enabling unimportant ones to be adapted to the new task. \textbf{MAS} \citep{aljundi2018memory} quantifies parameter importance in the network based on task memory contributions to mitigate CF. 
 
\noindent\textit{\textbf{Rehearsal:}} \textbf{LAMOL} \citep{sun2019lamol} is a rehearsal method that utilizes the language model as both a learner and a generator, facilitating the creation of pseudo samples for current training. Its variations, \textbf{LAMOL-g} and \textbf{LAMOL-t}, differ in whether a global token or task-specific tokens are incorporated. \textbf{L2KD} \citep{chuang2020lifelong} introduces knowledge distillation into LAMOL. \textbf{ER} \citep{rolnick2019experience} uses on-policy learning for quick adaptation to new tasks and off-policy learning with behavioral cloning to enhance performance on past tasks. We retain $1\%$ of previously encountered samples in memory, adhering to the specified configuration.
\textbf{PCLL} \citep{zhao2022prompt} is a CVAE-based generative replay method that reaches the SOTA performance in this setting. 

\noindent\textit{\textbf{Architectural:}} \textbf{HAT} \citep{serra2018overcoming} proposes a task-based hard attention mechanism that preserves information from previous tasks without affecting the learning of the current task.
\textbf{CTR} \citep{ke2021achieving} inserts a continual learning plug-in module in two locations in BERT \citep{devlin2019bert} to achieve both CF mitigation and knowledge transfer. \textbf{AdapterCL} \citep{madotto2021continual} leverages task-specific residual adapters in a frozen GPT-2 backbone to reduce parameter number and promote efficient continual learning.


% See Table~\ref{tab:accents} for an example of a table and its caption.
% \textbf{Do not override the default caption sizes.}
\subsection{Experimental Settings}
All experiments are conducted on an NVIDIA A100-80GB GPU.  Experimental settings are summarized as follows: 
% \begin{itemize}
%     \item In intent detection, the batch size is 32 with a learning rate of 5e-5 and a pseudo sample rate of 0.2. The dimension of $z$ is 128, and we use the Adam optimizer. We set the maximum context length as 256 and train it for 5 epochs. 
%     \item In the slot filling task, compared to intent detection, we set $z$'s dimension to 512, limit the maximum context length to 50, and train the model for 10 epochs.
% \end{itemize}
In intent detection, the batch size is 32 with a learning rate of 5e-5 and a pseudo sample rate of $0.2$. The dimension of $z$ is 128, and the Adam optimizer is used. We set the maximum context length as 256 and train the model for 5 epochs. $\alpha$ is $0.9$ and $\tau$ is $2.0$ in knowledge distillation. Training DCL for intent detection takes $5$ hours on one GPU. 
The settings for slot filling differ in the following respects. 
In slot filling, the dimension of $z$ is 512, the maximum context length is 50, and the model is trained for 10 epochs. $\alpha$ is $1.0$ and $\tau$ is $2.0$. Training DCL for slot filling takes $6$ hours.



\subsection{Evaluation Metrics}

\noindent\textbf{Average Score (Score):} Average score \citep{lopez2017gradient} denotes the average accuracy on all tasks after the final task has been learned, which is defined as:
    ${\rm Score} = \frac{1}{T} \sum\limits_{i=1}^{T}{R_{T, i}}$,
where $R_{i, j}$ denotes the evaluation metric on task $t_j$ after training on task $t_i$. Since intent detection and slot filling can be viewed as classification and sequence labeling tasks, we adopt the accuracy score and F1 score for intent detection and slot filling, respectively.  

\noindent\textbf{Learning Curve Area (LCA):} We also use LCA \citep{chaudhry2018efficient} which is computed as the area under a learning curve to indicate a model's performance in a sequence of tasks. LCA is defined as:
${\rm LCA} = \int_{0}^{T} P(t) dt$,
where $P(t)$ is the average model performance at step $t$ across all already-learnt tasks, and $T$ is the total number of steps. Higher LCA values indicate efficient CL.

\subsection{Main Results} 
Table~\ref{tab:overall} presents the performance of our proposed DCL model compared to the baselines. Our DCL model demonstrates a significant improvement over all baselines in both intent detection and slot filling tasks.  Notably, our DCL surpasses the state-of-the-art model, PCLL, by a wide margin, achieving superior results across all evaluation metrics. In intent detection, our DCL achieves a remarkable 3.48\% increase in accuracy score and a 4.22\% improvement in LCA.  Similarly, in slot filling, our DCL achieves a notable 2.89\% increase in F1 score and a substantial 6.08\% improvement in LCA.  These impressive enhancements are attributed to the effective utilization of Dirichlet-guided pseudo-rehearsal and JSKD techniques.  By leveraging these components, our DCL model generates a more diverse and representative set of examples, leading to further optimization of model performance. The generation of diverse and representative examples is crucial for effectively capturing task-specific information, particularly when the number of available pseudo samples is limited.

Furthermore, our DCL model achieves performance that is comparable to the upper bound in Multi-task learning (\textbf{Multi}), with only a slight lag of 2.52\% in accuracy for intent detection and 3.43\% in F1 score for slot filling. This slight difference in performance can be attributed to variations in the amount of data and the realism of the samples used for evaluation.

To further understand these trends, the learning curve of the average scores for DCL and PCLL in intent detection tasks is shown in Appendix C. According to the comparison, our model alleviates the CF issue more effectively, as evidenced by its higher accuracy than PCLL, especially after step 6000. The sharp drop in accuracy is induced by task switching. The delayed prominence of performance improvement around 6000 steps suggests that as the model encounters a more diverse set of tasks, its efficacy becomes more pronounced, highlighting the promising nature of our approach.



% The result of our model is shown in the last two rows, where ``KL'' and ``JS'' refer to KL divergence knowledge distillation and JS knowledge distillation, respectively. 
\begin{table}[!t]
\centering
\caption{Comparison results of DCL and baselines.  Scores of baselines are reported in~\citep{zhao2022prompt} and the performance of Multi is the upper bound. The best results are highlighted in bold.}
\label{tab:overall}
\resizebox{1\linewidth}{!}{
\begin{tabular}{lccccc}
\toprule
\multicolumn{1}{c}{\multirow{2}{*}{\textbf{Models}}} & \multicolumn{2}{c}{\textbf{Intent Detection (\%)}}  & \multicolumn{2}{c}{\textbf{Slot Filling (\%)}}                 \\
& \multicolumn{1}{c}{\textbf{Score $\uparrow$}} & \multicolumn{1}{c}{\textbf{LCA $\uparrow$}}  & \multicolumn{1}{c}{\textbf{Score $\uparrow$}} & \multicolumn{1}{c}{\textbf{LCA $\uparrow$}}
\\ \hline
Finetune  &14.09 & 28.76 & 15.38&19.55\\
EWC & 14.16 & 28.34 & 15.67&19.51\\
MAS & 14.15 & 28.61 & 15.59&19.37 \\
L2KD & 35.22 & 61.78 & 44.16&39.94\\
LAMOL-g & 50.30 & 60.67 & 45.12 & 38.03 \\
LAMOL-t & 51.81 & 67.97 & 44.83 & 37.58\\
ER & 78.19 & 78.19  & 44.95 & 39.32\\
HAT &73.92 & 73.03  & 61.99 & 67.33\\
CTR & 67.44 & 71.11 & 63.84 & 67.28\\
AdapterCL & 81.15 & 75.60 & 75.60 &48.47 \\
PCLL & 90.25 & 88.82 & 74.48 & 68.41\\
% PCLL(reproduced) & \textbf{90.37} & \textbf{90.30} & \textbf{75.32} & \textbf{73.75}\\
\hline
% DCL (with KL)  & 92.83 & 91.32 & 76.42 & 73.76 \\
% DCL (with JS)  & \textbf{93.73} & \textbf{93.04} & \textbf{77.37} & \textbf{74.49}\\
\textbf{DCL}  & \textbf{93.73} & \textbf{93.04} & \textbf{77.37} & \textbf{74.49}\\
\hline
Multi (Upper Bound) & 96.25 & N/A & 80.80 & N/A\\
\bottomrule
\end{tabular}
}
% \vspace{-1em}
\end{table}



% \begin{figure}[!t]
%     \centering
%     \includegraphics[width=70mm]{curve.pdf}
%     \caption{Learning curves of DCL and PCLL on intent detection task. The accuracy of DCL surpasses PCLL significantly after 6,000 steps.}
%     \label{fig:curve}
% \end{figure}









\subsection{Ablation Study}
\noindent\textbf{Gaussian vs. Dirichlet-guided Rehearsal Module.} To assess the influence of a Dirichlet-guided rehearsal module, we conduct a comparative analysis of DCL and PCLL in the intent detection and slot filling tasks. To ensure a fair comparison, our DCL model incorporates KL knowledge distillation as in PCLL.  The findings presented in Table \ref{tab:dir-gau} demonstrate that DCL, with its Dirichlet-guided rehearsal module, outperforms PCLL, which employs a Gaussian-guided module, across all evaluation metrics in two tasks.  These results suggest that the Dirichlet distribution is more effective in approximating the true data distribution, leading to better performance.

% In order to find the effect of introducing a Dirichlet-guided rehearsal module, we listed the performance of DCL and PCLL in the intent detection and slot filling tasks, both of them equipped with KL knowledge distillation. The difference between them is choosing either Dirichlet or Gaussian latent variable. Table \ref{tab:dir-gau} shows that introducing a Dirichlet-guided rehearsal module outperforms PCLL which uses a Gaussian-guided rehearsal module. It indicates the Dirichlet distribution is better to approximate the true data distribution. 




\begin{table}[!t]
  \centering
   \caption{Results of DCL with KL knowledge distillation and PCLL in two tasks.  DCL, incorporating the Dirichlet-guided rehearsal module, outperforms PCLL across all metrics evaluated. }
    \label{tab:dir-gau}
  \resizebox{0.8\linewidth}{!}{
  \begin{tabular}{lccccc}
\toprule
\multicolumn{1}{c}{\multirow{2}{*}{\textbf{Models}}} & \multicolumn{2}{c}{\textbf{Intent Detection (\%)}}  & \multicolumn{2}{c}{\textbf{Slot Filling (\%)}}                 \\
& \multicolumn{1}{c}{\textbf{Score $\uparrow$}} & \multicolumn{1}{c}{\textbf{LCA $\uparrow$}}  & \multicolumn{1}{c}{\textbf{Score $\uparrow$}} & \multicolumn{1}{c}{\textbf{LCA $\uparrow$}}
\\ \hline
   % \cline{1-3}
   PCLL & 90.25 & 88.82 & 74.48 & 68.41\\
   DCL (with KL)  & \textbf{92.83} & \textbf{91.32} & \textbf{76.42} & \textbf{73.76} \\
   \bottomrule
  \end{tabular}}
\end{table}



% \resizebox{1\linewidth}{!}{
% \begin{tabular}{lccccc}
% \toprule
% \multicolumn{1}{c}{\multirow{2}{*}{\textbf{Models}}} & \multicolumn{2}{c}{\textbf{Intent Detection (\%)}}  & \multicolumn{2}{c}{\textbf{Slot Filling (\%)}}                 \\
% & \multicolumn{1}{c}{\textbf{ACC $\uparrow$}} & \multicolumn{1}{c}{\textbf{LCA $\uparrow$}}  & \multicolumn{1}{c}{\textbf{F1 $\uparrow$}} & \multicolumn{1}{c}{\textbf{LCA $\uparrow$}}
% \\ \hline
% Finetune  &14.09 & 28.76 & 15.38&19.55\\
% EWC & 14.16 & 28.34 & 15.67&19.51\\
% MAS & 14.15 & 28.61 & 15.59&19.37 \\
% L2KD & 35.22 & 61.78 & 44.16&39.94\\
% LAMOL-g & 50.30 & 60.67 & 45.12 & 38.03 \\
% LAMOL-t & 51.81 & 67.97 & 44.83 & 37.58\\
% ER & 78.19 & 78.19  & 44.95 & 39.32\\
% HAT &73.92 & 73.03  & 61.99 & 67.33\\
% CTR & 67.44 & 71.11 & 63.84 & 67.28\\
% AdapterCL & 81.15 & 75.60 & 75.60 &48.47 \\
% PCLL & 90.25 & 88.82 & 74.48 & 68.41\\
% % PCLL(reproduced) & \textbf{90.37} & \textbf{90.30} & \textbf{75.32} & \textbf{73.75}\\
% Multi (Upper Bound) & 96.25 & N/A & 80.80 & N/A\\
% \hline
% % DCL (with KL)  & 92.83 & 91.32 & 76.42 & 73.76 \\
% % DCL (with JS)  & \textbf{93.73} & \textbf{93.04} & \textbf{77.37} & \textbf{74.49}\\
% \textbf{DCL}  & \textbf{93.73} & \textbf{93.04} & \textbf{77.37} & \textbf{74.49}\\
% \bottomrule
% \end{tabular}
% }





\noindent\textbf{KL Knowledge Distillation vs. JS Knowledge Distillation.}  We evaluate the impact of Jensen-Shannon (JS) Knowledge Distillation on the performance of DCL and compare it with DCL equipped with KL Knowledge Distillation.  Due to space limitations, we report the performance in the slot filling task under different learning orders, as summarized in Table~\ref{tab:ablation-kl}; the performance in the intent detection task follows a similar trend.  Our findings consistently demonstrate that the DCL model equipped with JS knowledge distillation outperforms the DCL model with KL knowledge distillation across all task learning orders. Notably, we observe significant improvements ranging from 0.63\% to 1.55\% in the F1 score and 0.41\% to 1.11\% in LCA.  
We performed a t-test comparing DCL with JS knowledge distillation against DCL with KL knowledge distillation, resulting in a $p$-value of 0.0013. This suggests that JS knowledge distillation outperforms KL knowledge distillation significantly.
These results provide compelling evidence that JS divergence effectively facilitates knowledge transfer within the DCL framework, underscoring its beneficial role in enhancing model performance. 
This conclusion also holds for intent detection, where results similar to those in the slot filling task were observed.
%We evaluate the impact of Jensen-Shannon (JS) Knowledge Distillation on the overall performance of DCL. Table~\ref{tab:ablation-kl} shows the performance differences in the slot filling task for various task learning orders between DCL implementations with KL and with JS knowledge distillation.  We find that the DCL equipped with JS knowledge distillation outperforms the DCL with KL knowledge distillation in all orders. Notably, $0.63\%-1.55\%$ and $0.41\%-1.11\%$ improvements are obtained in the F1 score and LCA, respectively.  This serves as evidence that JS divergence is helpful to knowledge transfer.

% The results show that the model using JS Knowledge Distillation outperforms the one using KL knowledge distillation in F1, and LCA. 


% 1.13 
% 0.63
% 0.73
% 1.03
% 1.55
% 0.64


% 0.84
% 0.41
% 0.72
% 0.41
% 0.89
% 1.11

\begin{table}[t]
\small
\centering
\caption{Slot filling results of F1 score (\%) and LCA (\%) on six orders with KL and JS knowledge distillation. F1 score and LCA of DCL using JS knowledge distillation are improved in all six orders compared with using KL knowledge distillation.}
\label{tab:ablation-kl}
\renewcommand{\arraystretch}{1.1} 
\resizebox{0.7\linewidth}{!}
{
\begin{tabular}{lccccc}
\toprule
\multicolumn{1}{c}{\multirow{2}{*}{Orders}} & \multicolumn{2}{c}{DCL (with KL)}  & \multicolumn{2}{c}{DCL (with JS)}                 \\
& \multicolumn{1}{c}{Score $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}  & \multicolumn{1}{c}{Score $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}
\\ \hline
order 0 & 80.26 & 73.82 & 81.39 & 74.66\\
order 1 & 80.31 & 74.82 & 80.94 & 75.23\\
order 2 & 74.04 & 70.01 & 74.77 & 70.73 \\
order 3 & 77.00 & 76.28 & 78.03 & 76.69\\
order 4 & 72.98 & 70.10 & 74.53 & 70.99 \\
order 5 & 73.92 & 77.50  & 74.56 & 78.61 \\
\hline
Mean & 76.42 & 73.76 & \textbf{77.37} & \textbf{74.49}\\
\bottomrule
\end{tabular}
}
% \vspace{-1em}
\end{table}




% \begin{table}[!t]
%   \centering
%   % \resizebox{1\linewidth}{!}{
%   \begin{tabular}{lccc}
%   \toprule
%    % \hline
%    Model & DCL (with KL) & DCL (with JS)   \\
%    \hline
%    \cline{1-3}
%    order 0 & $80.26$ & $81.39$ \\
%    order 1 & $80.31$ & $80.94$ \\
%    order 2 & $74.04$ & $74.77$  \\
%    order 3 & $77.00$ & $78.03$ \\
%    order 4 & $72.98$ & $74.53$ \\
%    order 5 & $73.92$ & $74.56$  \\
%    \hline
%   \end{tabular}
%     \caption{Slot filling results of F1 metrics (\%) on six orders with KL and JS knowledge distillation.}
%     \label{tab:ablation-kl}
%     \vspace{-.5em}
% \end{table}



\noindent\textbf{Number of Pseudo Samples.}
To have a better understanding of how the number of pseudo samples influences the performance of the proposed approach, various ratios of pseudo samples are employed in DCL. Table \ref{tab:ablation-psn} presents the comparison between PCLL and DCL with different ratios of pseudo samples. Specifically, we analyze PCLL with ratio of 0.2 and DCL with ratios of 0.1, 0.2, 0.4, and 0.5. Although fewer pseudo samples are added to the training, DCL with a ratio of 0.1 still outperforms PCLL with a ratio of 0.2 in terms of accuracy score and LCA. This can be ascribed to the superiority of DCL. Moreover, we find that the performance can be further improved as the number of pseudo samples increases. This is because more information is carried by more data samples, which can enhance the model's capabilities.


\renewcommand{\arraystretch}{1.1} 
\begin{table}[!t]
\small
\centering
\caption{Intent detection results of PCLL with a pseudo-sample ratio of 0.2 and DCL with pseudo-sample ratios ranging from 0.1 to 0.5.} 
\label{tab:ablation-psn}
\resizebox{0.6\linewidth}{!}
{
\begin{tabular}{lcccc}
\toprule
Model &Ratio & Score & LCA
\\ \hline
PCLL& 0.2 &90.25& 88.82\\
DCL&0.1 & 91.66 & 91.83 \\
DCL&0.2 & 93.73 &\textbf{93.04}\\
DCL&0.4 & 93.97 & 92.76 \\
DCL&0.5 & \textbf{94.23} & 92.82  \\
\bottomrule
\end{tabular}
}

% \vspace{-1em}
\end{table}


% \end{minipage}
% \hfill
% \begin{minipage}{.45\linewidth}
% \small
% \centering
% {
% \begin{tabular}{lccc}
% \toprule
% MCL & ACC & LCA
% \\ \hline
% 50 & 92.17 & 91.16 \\
% 256 & 93.73 &93.04\\
% \bottomrule
% \end{tabular}
% }
% \caption{Intent detection of DCL with input maximum context length (MCL) of 50 and 256.} 
% \label{tab:ablation-mcl}
% \vspace{-.5em}
% \end{minipage}
% \end{table}


% \noindent\textbf{Maximum Context Length.} 
% We study the effect of maximum context length on performance by comparing cases with lengths of 256 and 50 in the intent detection task, keeping other parameters constant. Table~\ref{tab:ablation-mcl} details the average performance for six task learning orders. As anticipated, longer input utterances generally contain more information, leading to improved performance.


\noindent\textbf{Evaluating Pseudo Samples Quality.} 
To compare the quality of the generated pseudo samples in different baselines with our proposed model, we utilize \textbf{Dist-n} \citep{li2015diversity} to assess pseudo samples. Dist-n measures the proportion of distinct n-grams in the generated pseudo samples. A higher Dist-n value indicates that the samples are more distinct, i.e., that the pseudo samples are more diverse, and is therefore preferred. We employ Dist-1, Dist-2, Dist-3, and Dist-4 to analyze the quality of the generated samples comprehensively.

Given the limited number of pseudo samples, the quality of our exemplars is crucial to preserve the performance of previous tasks. We aim to carefully select representative and diverse utterances instead of generic and similar ones. Table~\ref{tab:ablation-dist} summarizes the Dist-n results. Notably, DCL achieves higher distinct scores compared to other methods, indicating that DCL-generated pseudo samples exhibit larger diversity. This suggests that pseudo samples created by DCL are more similar to real samples.






\begin{table}[t]
\centering
\caption{Distinct scores for generated pseudo samples. A higher Dist-n score means higher diversity.} 
\label{tab:ablation-dist}
\renewcommand{\arraystretch}{1.1} 
\resizebox{0.8\linewidth}{!}
{
\begin{tabular}{lccccc}
\toprule
Model & Dist-1 & Dist-2 & Dist-3 & Dist-4
\\ \hline
LAMOL-g & 0.0602 & 0.2466 & 0.4489 & 0.6178\\
LAMOL-t & 0.1758 & 0.4733 & 0.6837 & 0.8090\\
PCLL & 0.2836 & 0.6566 & 0.8369 & 0.9221 \\
\textbf{DCL} & \textbf{0.3092} & \textbf{0.7019} & \textbf{0.8708} & \textbf{0.9389} \\
\hline
Real Sample & 0.4000 & 0.7972 & 0.9255 & 0.9717\\
\bottomrule
\end{tabular}
}
\end{table}




% \begin{table}[t]
% \centering
% % \renewcommand{\arraystretch}{1} 
% % \resizebox{1\linewidth}{!}
% {
% \begin{tabular}{lccccc}
% \toprule
% \multicolumn{1}{c}{\multirow{2}{*}{Orders}} & \multicolumn{2}{c}{MCL = 50}  & \multicolumn{2}{c}{MCL = 256}                 \\
% & \multicolumn{1}{c}{ACC $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}  & \multicolumn{1}{c}{ACC $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}
% \\ \hline
% order 0 & 92.24 & 91.18 & 93.52 & 92.69 \\
% order 1 & 91.73 & 88.28 & 93.64 & 90.45 \\
% order 2 & 91.52 & 93.13 & 93.74 & 95.47 \\
% order 3 & 92.12 & 91.48 & 93.83 & 93.34 \\
% order 4 & 92.22 & 89.58  & 93.79 & 92.04 \\
% order 5 & 93.21 & 93.28 & 93.85 & 94.22  \\
% \hline
% Mean &92.17 &91.16 &93.73 & 93.04\\
% \bottomrule
% \end{tabular}
% }
% \caption{Intent detection results of DCL evaluated by ACC (\%) and LCA (\%) on six orders with input maximum context length (MCL) as 50 and 256. }
% \label{tab:ablation-mcl}
% \end{table}





% \begin{table}[t]
% \small
% \centering
% {
% \begin{tabular}{lccc}
% \toprule
% MCL & ACC & LCA
% \\ \hline
% 50 & 92.17 & 91.16 \\
% 256 & 93.73 &93.04\\
% \bottomrule
% \end{tabular}
% }
% \caption{Intent detection results of DCL with input maximum context length (MCL) of 50 and 256.} 
% \label{tab:ablation-mcl}
% \vspace{-.5em}
% \end{table}


\noindent\textbf{Dimension of Latent Variable.}
The impact of the latent variable $z$'s dimension is displayed in Table~\ref{tab:ablation-dim-z}. It shows that DCL using JSKD with a small dimension of $8$ presents better performance than DCL using KL with a large dimension of $128$. The DCL model can generate high-quality pseudo samples even with smaller dimensions and less information encoded, leading to improved accuracy. This demonstrates that the Dirichlet latent is superior to the Gaussian counterpart. 
It should be noted that DCL using JSKD with a latent dimension of $8$ exhibits a slightly lower accuracy score than that with a dimension of $128$, although its LCA remains comparable. This is caused by the reduced information capacity in the scenario with a smaller $z$ dimension.

\begin{table}[t]
\small
\centering
\caption{Intent detection of DCL (with JS) and DCL (with KL) for different latent variable dimensions.}
\label{tab:ablation-dim-z}
\renewcommand{\arraystretch}{1.1} 
  \resizebox{0.6\linewidth}{!}
 {
\begin{tabular}{lccc}
\toprule
\small{Models} & \small{Score} & \small{LCA}
\\ \hline
DCL (with KL, z = 128)  & 92.83 & 91.32 \\
DCL (with JS, z = 8)  & 93.51 & \textbf{93.11} \\
DCL (with JS, z = 128)  & \textbf{93.73} & 93.04 \\
\hline
\end{tabular}}
\end{table}





\begin{table}[ht]
\small
\centering
\caption{Comparison of Generated Pseudo Samples by PCLL and DCL with the Ground Truth (Golden).}
\label{tab:casestudy}
\begin{tabularx}{\columnwidth}{
  >{\hsize=.4\hsize\raggedright\arraybackslash}X
  >{\hsize=1.8\hsize\raggedright\arraybackslash}X
  >{\hsize=.8\hsize\raggedright\arraybackslash}X
}
\toprule
\textbf{Models}  & \textbf{Input Utterance} & \textbf{Output y} \\
\midrule
\multirow{4}{*}{Golden} & {what's the fuel economy of my car.} & { mpg} \\\cdashline{2-3}
   & { What is the expiration date on my card?} & { expiration date} \\
\midrule
\multirow{4}{*}{PCLL} & {Do they have a lot of miles on this road?} & { mpg} \\\cdashline{2-3}
   & { Do you know how much my new credit card is worth?} & { expiration date} \\
\midrule
\multirow{4}{*}{DCL} & {What is the mpg of this car?} & { mpg} \\\cdashline{2-3}
   &{Can you check my expiration month?} & {expiration date} \\
\bottomrule
\end{tabularx}
\end{table}



% We further investigate the influence of the dimension of $z$ and list the performances in Table~\ref{tab:ablation-dim-z}. The results indicate that DCL using JSKD with a latent dimension of $8$ outperforms DCL using KL with a dimension of $128$, suggesting that the Dirichlet latent outperforms the Gaussian latent. Despite smaller dimensions and consequently less information encoded in the latent $z$, the model can generate superior pseudo samples, which further results in higher accuracy. We also notice that DCL using JSKD with a latent dimension of $8$ underperforms  DCL using JSKD with a latent dimension of $128$. This is understandable as a smaller $z$ dimension carries less information, leading to a performance drop. 
% \begin{table}[t]
% % \small
%   \centering
%   \renewcommand{\arraystretch}{1.3} 
%   \resizebox{1\linewidth}{!}{
%   \begin{tabular}{lcccccc}
%   \toprule
%   \multicolumn{1}{c}{\multirow{2}{*}{\textbf{Orders}}} & \multicolumn{2}{c}{DCL (with JS, z = 8)}  & \multicolumn{2}{c}{DCL (with JS, z = 128)}   & \multicolumn{2}{c}{DCL (with KL, z = 128)}              \\
% & \multicolumn{1}{c}{ACC $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}  & \multicolumn{1}{c}{ACC $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}
% & \multicolumn{1}{c}{ACC $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}
% \\ \hline
%    order 0 &  94.01  &  93.59  &  93.52 &  92.69 &  92.76&  91.27\\
%   order 1 & 93.00  & 90.39 & 93.64 & 90.45 & 92.76 & 88.49\\
%    order 2 & 93.27  & 95.34 & 93.74 & 95.47 & 92.56 & 93.36 \\
%    order 3 & 93.57 & 93.23 & 93.83 & 93.34 & 92.91 & 91.53 \\
%    order 4 & 93.21 & 91.81 & 93.79 & 92.04 & 92.68 & 89.92\\
%    order 5 & 94.02 & 94.32 & 93.85 & 94.22 & 93.33 & 93.32\\
%    \hline
%    Mean & 93.51 & 93.11 & 93.73 & 93.04 & 92.83 & 91.32 \\
%    \bottomrule
%   \end{tabular}}
%     \caption{Intent detection results of DCL (with JS) and DCL (with KL). The results encompass six orders of $z$, with dimensions of 8 and 128 for DCL (with JS), and dimension 128 for DCL (with KL).}
%     \label{tab:ablation-dim-z1}
%     \vspace{-.5em}
% \end{table}




% \begin{table*}[t]
% \small
% \centering
% \begin{tblr}{
%   cell{2}{1} = {r=2}{},
%   cell{4}{1} = {r=2}{},
%   cell{6}{1} = {r=2}{},
%   hline{1-2,4,6,8} = {-}{},
% }
% \textbf{Models}  & \textbf{Input Utterance} & \textbf{Output y} \\
% Golden & {1. what's the fuel economy of my car.} & { 1. mpg} \\
%    & { 2. What is the expiration date on my card?} & { 2. expiration date} \\
% PCLL & {1. Do they have a lot of miles on this road?} & { 1. mpg} \\
%    & { 2. Do you know how much my new credit card is worth?} & { 2. expiration date} \\
% DCL & {1. What is the mpg of this car?} & { 1. mpg} \\
%    &{2. Can you check my expiration month?} & { 2. expiration date} 
% \end{tblr}
% \caption{Comparison of Generated Pseudo Samples by PCLL and DCL against the Ground Truth (Golden).}
% \label{tab:casestudy}
% \vspace{-1em}
% \end{table*}
\noindent\textbf{Exploring Llama2 as a Backbone.} To explore the scalability of DCL in large language models (LLMs), we further conducted intent detection experiments with Llama 2-7B \citep{touvron2023llama} as a backbone and compared DCL with PCLL. Specifically, we aim to further demonstrate that DCL scales to LLMs such as Llama2. Table~\ref{tab:llama} shows the result of order 0 of intent detection. We find that DCL outperforms PCLL with an absolute improvement of $5.72\%$, reflecting DCL's efficiency and scalability. We use LoRA \citep{hu2021lora} to tune DCL and PCLL so that the model can be fine-tuned on a single GPU, which takes 80 training hours. 




\begin{table}[t]
\small
\centering
\caption{Intent detection of DCL and PCLL with Llama2 backbone.}
\label{tab:llama}
\renewcommand{\arraystretch}{1.1} 
  \resizebox{0.4\linewidth}{!}
 {
\begin{tabular}{lccc}
\toprule
\small{Models} & \small{Score} 
\\ \hline
PCLL (Llama2)  & 66.69  \\
DCL (Llama2)  & 72.41 \\
\hline
\end{tabular}}
\end{table}








\subsection{Case Study}
Table~\ref{tab:casestudy} presents a comparison between the pseudo samples generated by DCL, PCLL, and real samples (Golden) from the intent detection task. A pseudo sample includes the input utterance (middle column) and the intent (right column). It can be observed that PCLL struggles to generate the intent of specific sentences correctly. For instance, PCLL wrongly generates the intent \textit{``mpg''} (miles per gallon) for the utterance \textit{``Do they have a lot of miles on this road''}, suggesting it fails to capture the actual meaning of the utterance. 
% This sentence does not fall under the \textit{``mpg''} intent as it doesn't contain references to terms like \textit{``gas''}, \textit{``fuel''}, or \textit{``mpg''}.
In addition, for the utterance \textit{``Do you know how much my new credit card is worth?''}, PCLL also makes mistakes in intent detection where the output \textit{``expiration date''} is irrelevant to the input. 
% Sentences that should have the intent of \textit{``expiration date''} would look more like \textit{``Can   you check my expiration month?''}, and would typically include keywords such as \textit{``expire''}, \textit{``validity''}, and related terms.

\section{Conclusions}
In this paper, we propose DCL, a generative-based rehearsal method, to alleviate CF for CL in ToDs. 
A Dirichlet distribution-based CVAE is developed to exploit the flexibility of Dirichlet distribution in the process of modeling the utterance-level characteristics and thus
obtain better pseudo sample generation compared to the conventional Gaussian-based CVAE.
% which helps to generate more realistic pseudo samples for the rehearsal than the traditional Gaussian distribution-based CVAE. 
In addition, a more robust JS divergence-based knowledge distillation method is proposed to facilitate knowledge transfer between tasks. Comprehensive experiments show the superiority of the proposed method.



% References
\bibliography{uai2024-template}

\newpage

\onecolumn

\title{Supplementary Material}
\maketitle
\appendix



\section*{Appendix A}
\label{sec:appendixa}
\textbf{Prompt Example}
% \textit{``Do you know how much my new credit card is worth?''}
The task prompt used for intent detection is \textit{``For an utterance from the ID task, $x$ has the following intent''} and the task prompt used for slot filling is \textit{``In the ID task, if there are any slots and values, what are they in this sentence: $x$? Answer: ''}. For example, when training on the BANKING task, the input $x$ is \textit{``Please tell me how to link the card?''}, and the modified $x$ is \textit{``For an utterance from the BANKING task, ``Please tell me how to link the card?'' has the following intent''}. Thus, the output $y$ is its corresponding intent annotation \textit{``card linking''}.



\section*{Appendix B}
\label{sec:appendixb}
\textbf{Dataset Order} 
Table~\ref{tab:order-intent} summarizes the six dataset orders of the intent detection task, and Table~\ref{tab:order-slot} lists the six dataset orders of the slot filling task.

\begin{table*}[ht]
\small
\centering
\renewcommand{\arraystretch}{1.5} 
\begin{tabular}{lllllllll}
\hline
Order 1 & TOP\_S1 & HWU     & SNIPS   & BANKING & CLINC  & TOP\_S2 & TOP\_S3 & ATIS  \\
Order 2 & BANKING & HWU     & TOP\_S1 & TOP\_S3 & CLINC  & TOP\_S2 & SNIPS   & ATIS   \\
Order 3 & SNIPS   & ATIS    & TOP\_S2 & TOP\_S3 & CLINC  & BANKING & HWU     & TOP\_S1 \\
Order 4 & CLINC   & SNIPS   & TOP\_S3 & BANKING & TOP\_S2 & HWU     & TOP\_S1 & ATIS   \\
Order 5 & BANKING & TOP\_S2 & TOP\_S1 & ATIS    & TOP\_S3 & HWU     & CLINC   & SNIPS  \\
Order 6 & CLINC   & TOP\_S1 & TOP\_S2 & ATIS    & SNIPS  & HWU     & BANKING & TOP\_S3 \\
\bottomrule
\end{tabular}
\caption{Dataset Orders of Intent Detection Tasks}
\label{tab:order-intent}
% \vspace{-10em}
\end{table*}



\begin{table*}[ht]
\small
\centering
\renewcommand{\arraystretch}{1.5} 
\begin{tabular}{llllll}
\hline
Order 1 & MIT\_MOVIE & DSTC     & MIT\_RESTAURANT   & SNIPS & ATIS \\
Order 2 & MIT\_MOVIE & SNIPS     & DSTC & MIT\_RESTAURANT & ATIS \\
Order 3 & ATIS   & MIT\_MOVIE    & DSTC & MIT\_RESTAURANT & SNIPS\\
Order 4 & DSTC   & MIT\_RESTAURANT   & MIT\_MOVIE & ATIS & SNIPS\\
Order 5 & MIT\_MOVIE & ATIS & SNIPS & MIT\_RESTAURANT    & DSTC\\
Order 6 & SNIPS   & ATIS & MIT\_RESTAURANT & MIT\_MOVIE    & DSTC\\
\bottomrule
\end{tabular}
\caption{Dataset Orders of Slot Filling Tasks}
\label{tab:order-slot}
% \vspace{-10em}
\end{table*}


% Slot Filling Tasks:\\
% Order 1 MIT_MOVIE, DSTC, MIT_RESTAURANT, SNIPS, ATIS \\
% Order 2 MIT_MOVIE, SNIPS, DSTC, MIT_RESTAURANT, ATIS \\
% Order 3 ATIS, MIT_MOVIE, DSTC, MIT_RESTAURANT, SNIPS \\
% Order 4 DSTC, MIT_RESTAURANT, MIT_MOVIE, ATIS, SNIPS \\
% Order 5 MIT_MOVIE, ATIS, SNIPS, MIT_RESTAURANT, DSTC \\
% Order 6 SNIPS, ATIS, MIT_RESTAURANT, MIT_MOVIE, DSTC\\


\section*{Appendix C}
\label{sec:appendixc}
\textbf{Learning Curve} Figure~\ref{fig:curve} plots the learning curves of DCL and PCLL on the intent detection task. The accuracy of DCL surpasses that of PCLL significantly after 6,000 steps.


\begin{figure}[ht]
    \centering
    \includegraphics[width=90mm]{curve.pdf}
    \caption{Learning curves of DCL and PCLL on the intent detection task.}
    \label{fig:curve}
\end{figure}





\end{document}
