\documentclass{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{paralist}

% % added package includes
\usepackage{caption} % customize table captions
\usepackage{array} % for column width adjustments
\usepackage{rotating} % for rotating tables
\usepackage{tabularx}
\usepackage{multirow}
\usepackage{inconsolata}
\usepackage{amsfonts}
\usepackage{arydshln}

\usepackage{color} % [usenames,dvipsnames]
\newcommand{\blue}[1]{{\color{blue}{#1}}}
\newcommand{\red}[1]{{\color{red}{#1}}}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Dirichlet Generative Rehearsal: Unlocking Continual Learning}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Harry~Q.~Bovik}
\author[1,2]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[1]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cranberry University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Second Affiliation\\
    Address\\
    …
}
\affil[3]{%
    Another Affiliation\\
    Address\\
    …
  }
  
\begin{document}
\maketitle
\begin{abstract}
Catastrophic forgetting poses a significant challenge in continual learning (CL).  In the context of Natural Language Processing, generative-based rehearsal CL methods have made progress in avoiding expensive pre-training.  However, generating pseudo samples that accurately capture the task-specific distribution remains a daunting task.  In this paper, we introduce Dirichlet Continual Learning (DCL), a novel generative-based rehearsal strategy designed specifically for CL.  Differing from the conventional use of Gaussian latent variables in Conditional Variational Autoencoder (CVAE), our DCL employs the flexibility of the Dirichlet distribution to model the latent prior variable.  This allows DCL to effectively capture sentence-level features from previous tasks and guide the generation of pseudo samples. Additionally, we propose Jensen-Shannon Knowledge Distillation (JSKD), a robust logit-based knowledge distillation method that enhances knowledge transfer during pseudo sample generation.  Our extensive experiments show that DCL outperforms state-of-the-art methods in two typical tasks of task-oriented dialogue systems, demonstrating its efficacy. %In Intent Detection, DCL achieves a significant 3.48\% accuracy improvement and a 4.22\% improvement in the Learning Curve Area (LCA) over the best baseline. Likewise, in Slot Filling, DCL exhibits a notable 2.89\% accuracy improvement and a 6.08\% LCA improvement. Code and model checkpoints for DCL can be accessed at https://github/com/*****.


%We conduct extensive experiments and the results demonstrate the efficacy of our approach, surpassing state-of-the-art methods in two typical tasks in task-oriented dialogue systems.  DCL achieves a significant 3.48\% accuracy improvement and a 4.22\% Learning Curve Area (LCA) improvement.  Similarly, in the Slot Filling task, DCL demonstrates a notable 2.89\% accuracy improvement and a 6.08\% LCA improvement.  Code and model checkpoints for DCL are accessible at https://github/com/*****.
%Departing from the conventional use of Gaussian latent variables in Conditional Variational Autoencoder (CVAE), our DCL model employs the flexibility and versatility of the Dirichlet distribution to model the latent prior variable.  This enables DCL to efficiently capture sentence-level features from previous tasks and effectively guide the generation of pseudo samples.  Furthermore, we propose Jensen-Shannon Knowledge Distillation (JSKD), a robust logit-based knowledge distillation method that enhances knowledge transfer during the generation of pseudo samples.  Our experiments demonstrate the efficacy of our approach, surpassing state-of-the-art methods in two typical tasks in task-oriented dialogue systems.  

\end{abstract}

%dCatastrophic forgetting is a major challenge in continual learning (CL). In the field of Natural Language Processing (NLP), generative-based rehearsal CL methods have made progress in avoiding the need for expensive pre-training. However, generating accurate task-specific pseudo samples remains difficult. In this paper, we introduce Dirichlet Continual Learning (DCL), a novel generative-based rehearsal strategy designed specifically for CL. Our DCL model departs from the conventional use of Gaussian latent variables in Conditional Variational Autoencoder (CVAE) and instead utilizes the flexibility of the Dirichlet distribution to model the latent prior variable. This allows DCL to effectively capture sentence-level features from previous tasks and guide the generation of pseudo samples. Additionally, we propose Jensen-Shannon Knowledge Distillation (JSKD), a robust logit-based knowledge distillation method that enhances knowledge transfer during pseudo sample generation.

\section{Introduction}

Continual learning (CL) is a significant learning paradigm that aims to emulate the human capacity for continuous learning and knowledge accumulation, while also ensuring that previously learned knowledge is retained and effectively transferred to facilitate the learning of new tasks~\citep{parisi2019continual}.  However, in practice, models trained on new tasks often struggle to retain knowledge from previous tasks, leading to a degradation in performance~\citep{mccloskey1989catastrophic}.  This challenge is particularly pronounced in the context of Natural Language Processing (NLP), where the complexity and diversity of language pose additional difficulties for CL~\citep{ke2022continual}. Given the computational expense and time-consuming nature of retraining the entire model from scratch, continual learning with the latest data and tasks becomes crucial.


% % Check whether this paper needs to be cited above: https://arxiv.org/pdf/2211.12701.pdf
% \begin{figure}[!t]
%     \centering
%     \includegraphics[width=60mm]{cl.pdf}
%     \caption{In this example of Continual Learning, the LM first trains on the \textit{BANKING} dataset, resulting in parameter $\theta_1$. The LM then trains on \textit{HWU}, followed by \textit{SNIPS}, and so on. The parameters are updated sequentially.}
%     \label{fig:cl}
% \end{figure}  and can be categorized into three main categories

Catastrophic forgetting, the phenomenon where a model forgets previously learned tasks when learning new tasks, poses a significant challenge in Continual Learning (CL)~\citep{mcclelland1995there,DBLP:journals/nn/ParisiKPKW19}.  To address this issue, various approaches have been proposed: (1) {\em Regularization} methods aim to minimize updates to the important parameters of previous tasks, thus preserving their performance~\citep{kirkpatrick2017overcoming,zenke2017continual,aljundi2018memory}.  However, the accumulation of regularizers may overly constrain network parameters, hindering the learning of new tasks. (2) {\em Architectural} approaches modify the network structure to enhance the extraction of task-specific features~\citep{serra2018overcoming,ke2021achieving,madotto2021continual,zhang2022continual}.  However, their task-focused approach may overlook effective knowledge transfer across tasks.  (3) {\em Rehearsal} strategies involve replaying samples from previous tasks during training with the current task dataset~\citep{lopez2017gradient,sun2019lamol,rolnick2019experience,chuang2020lifelong,mi2020continual,Mi_2020_CVPR_Workshops,zhao2022prompt}. Rehearsal methods can be categorized into store-based rehearsal and generative-based rehearsal. It is worth noting that rehearsal methods have shown promise in mitigating forgetting in CL. However, store-based rehearsal may result in inefficiencies and increased memory demands, while generative-based rehearsal emerges as a more effective alternative. This approach facilitates efficient memory utilization and knowledge retention across sequential learning scenarios.



%{\em Catastrophic forgetting}, which refers to the phenomenon that a model forgets previously learned tasks when learning new tasks, poses a significant challenge in CL~\citep{mcclelland1995there,DBLP:journals/nn/ParisiKPKW19}.  To alleviate this phenomenon, various approaches have been proposed and can be divided into three categories: (1) {\em Regularization} methods: typical methods, such as Elastic Weight Consolidation (EWC)~\citep{kirkpatrick2017overcoming}, Memory Aware Synapses (MAS)~\citep{aljundi2018memory}, Adaptively Regularized Prioritized Exemplar Replay (ARPER)~\citep{mi2020continual}, have aimed to minimize updates to the important parameters of previous tasks, preserving the performance of previous tasks.  However, the accumulation of regularizers may overly constrain network parameters, impeding new task learning. (2) {\em Architectural} approaches: these methods, such as  \citep{serra2018overcoming},\citep{ke2021achieving}, \citep{madotto2021continual}, \citep{zhang2022continual}, mainly modify the network structure to enhance the extraction of task-specific features.  Nevertheless, their task-focused approach might overlook effective knowledge transfer between old and new tasks. (3) {\em Rehearsal} strategies, including \citep{lopez2017gradient}, \citep{sun2019lamol}, \citep{rolnick2019experience}, \citep{chuang2020lifelong}, \citep{mi2020continual}, \citep{Mi_2020_CVPR_Workshops} and \citep{zhao2022prompt}, involve replaying samples from previous tasks, which are combined with the current task dataset for training.  Rehearsal methods can be categorized into ``store-based rehearsal" and ``generative-based rehearsal".  It is noteworthy that rehearsal, as demonstrated by various studies, emerges as a promising strategy to mitigate forgetting in CL.  However, store-based rehearsal may lead to inefficiencies and increased memory demands.  In contrast, generative-based rehearsal emerges as a more effective alternative.  
This approach facilitates effective memory utilization and knowledge retention across sequential learning scenarios.

 







% (2) 
% Describe the related methods and their problems here, ultimately leading to the real problem.

% This passage will probably be needed:

%Generative-based rehearsal methods have shown promise by mitigating the need for extensive pre-training. These methods aim to generate pseudo samples that closely capture the task-specific distribution, allowing the model to retain the knowledge of previous tasks.  However, accurately generating such samples remains a daunting task.  The key to generative replay lies in producing high-quality pseudo-samples that approximate the real data distribution of prior tasks well. Higher-quality pseudo-samples intuitively contribute to better preservation of learned tasks, minimizing forgetting in CL.  However, in previous studies (\citep{sun2019lamol}; \citep{chuang2020lifelong}; \citep{zhao2022prompt}), the generation of pseudo-samples for each observed task often lacks diversity, fluency, or poor alignment with the designated task. 

Generative-based rehearsal methods have emerged as promising approaches for mitigating the need for extensive pre-training~\citep{DBLP:journals/nn/ParisiKPKW19,DBLP:journals/corr/abs-2211-12701}.  These methods aim to generate pseudo samples that closely mimic the task-specific distribution, enabling the model to retain knowledge from previous tasks.  However, generating such samples accurately remains a challenging task.  The success of generative replay hinges on the production of high-quality pseudo samples that effectively approximate the real data distribution of prior tasks.  Higher-quality pseudo samples inherently contribute to better preservation of learned tasks, thereby minimizing forgetting in CL.  However, the pseudo samples generated for each observed task in previous studies~\citep{sun2019lamol,chuang2020lifelong,zhao2022prompt} often lack diversity or fluency, or align poorly with the designated task.

% Notably, noisy pseudo samples will harm the performance of CL.
% LAMOL \cite{sun2019lamol} prevents forgetting by replaying pseudo-samples of previous tasks generated by the LM itself.
% PCLL \cite{zhao2022prompt} proposed 


% lacking diversity and training robustness 

To address the aforementioned challenges, we propose a novel generative-based rehearsal strategy called Dirichlet Continual Learning (DCL) specifically tailored for CL in NLP.  In contrast to conventional approaches that employ Gaussian latent variables in Conditional Variational Autoencoders (CVAE), DCL harnesses the flexibility and versatility of the Dirichlet distribution to model the latent prior variable.  This unique feature empowers DCL to effectively capture and represent sentence-level features from previous tasks, laying a solid foundation for generating high-quality pseudo samples.  Moreover, we introduce Jensen-Shannon Knowledge Distillation (JSKD), a robust logit-based knowledge distillation method that enhances knowledge transfer during the generation of pseudo samples. By accurately measuring the distance or similarity between the teacher (previous tasks) and student (current task) models, JSKD facilitates the transfer of accumulated knowledge from previous tasks to the current task.
% \blue{
% % By leveraging the relationships between the model's logits and the ground truth labels, 
% JSKD facilitates the transfer of task-specific knowledge from the model to the generated samples.} 
This innovative approach augments the effectiveness of the rehearsal process, ensuring that the generated samples retain and reflect the task-specific knowledge acquired by the model.

We summarize our main contributions as follows:
\begin{compactitem}[-]
\item We propose a novel generative-based rehearsal method, which effectively addresses the issues of forgetting and limited sample diversity by employing the Dirichlet latent variable within the framework of Conditional Variational Autoencoders (CVAE).  This approach enables us to approximate the real data distribution more accurately, thereby improving the quality of generated pseudo samples.
\item We introduce Jensen-Shannon Knowledge Distillation (JSKD), a new logit-based knowledge distillation strategy that enhances the transfer of knowledge between teacher and student models. JSKD facilitates more effective and robust knowledge transfer, leading to improved performance in the generation of pseudo samples.
\item Through extensive experiments on two typical tasks in task-oriented dialogue systems, we demonstrate the remarkable performance improvement achieved by our proposed DCL compared to state-of-the-art baselines for CL in NLP.  DCL achieves performance close to the upper bound of multi-task learning, showcasing its effectiveness.  Most notably, we outperform the state-of-the-art generative-based rehearsal method, Prompt Conditioned VAE for Lifelong Learning (PCLL), in all metrics.  Specifically, in the Intent Detection task, our DCL method achieves a significant 3.48\% accuracy improvement and a 4.22\% Learning Curve Area (LCA) improvement.  Similarly, in the Slot Filling task, DCL demonstrates a notable 2.89\% accuracy improvement and a 6.08\% LCA improvement.
\end{compactitem}




%Our experimental results demonstrate the substantial advancements achieved by our approach, DCL. In comparison to the state-of-the-art baseline method PCLL, DCL exhibits a remarkable 3.48% accuracy improvement and a 4.22% LCA (Learning Capacity Allocation) improvement in the Intent Detection task. Similarly, in the Slot Filling task, DCL demonstrates a significant 2.89% accuracy improvement and a 6.08% LCA improvement.


%Catastrophic forgetting, the phenomenon where a model forgets previously learned tasks when learning new tasks, poses a significant challenge in Continual Learning (CL)~\citep{mcclelland1995there,DBLP:journals/nn/ParisiKPKW19}.  To address this issue, various approaches have been proposed and can be categorized into three main categories: (1) {\em Regularization} methods: These methods, such as Elastic Weight Consolidation (EWC)~\citep{kirkpatrick2017overcoming}, Memory Aware Synapses (MAS)~\citep{aljundi2018memory}, and Adaptively Regularized Prioritized Exemplar Replay (ARPER)~\citep{mi2020continual}, aim to minimize updates to the important parameters of previous tasks, thus preserving their performance.  However, the accumulation of regularizers may overly constrain network parameters, hindering the learning of new tasks. (2) {Architectural} approaches: These methods, including~\citet{serra2018overcoming},~\citet{ke2021achieving},~\citet{madotto2021continual}, and~\citet{zhang2022continual}, modify the network structure to enhance the extraction of task-specific features.  However, their task-focused approach may overlook effective knowledge transfer between old and new tasks.  (3) {Rehearsal} strategies: Rehearsal methods, such as Gradient Episodic Memory (GEM)~\citep{lopez2017gradient}, Language modeling for lifelong language learning (LAMOL), Experience Replay (ER)~\citep{rolnick2019experience}, \citep{chuang2020lifelong}, Lifelong Generative Replay (LGR)~\citep{mi2020continual}, and Prompt-based Replay~\citep{Mi_2020_CVPR_Workshops,zhao2022prompt}, involve replaying samples from previous tasks during training with the current task dataset. Rehearsal methods can be categorized into "store-based rehearsal" and "generative-based rehearsal". It is worth noting that rehearsal methods have shown promise in mitigating forgetting in CL. 
However, store-based rehearsal may result in inefficiencies and increased memory demands, while generative-based rehearsal emerges as a more effective alternative. This approach facilitates efficient memory utilization and knowledge retention across sequential learning scenarios.


\section{Related Work}
We highlight the related work in the following two parts: continual learning and continual learning for NLP.

\subsection{Continual Learning}
CL can be categorized into three main strategies: 
\begin{compactitem}[-]
\item {\em Regularization} methods aim to strengthen previous knowledge by imposing constraints on important parameters and incorporating regularization terms into the loss function.  One notable approach is Elastic Weight Consolidation (EWC)~\citep{kirkpatrick2017overcoming}, which identifies crucial parameters and prevents their updates, thereby preserving performance on previous tasks.  Additionally, the Synaptic Intelligence (SI) model~\citep{zenke2017continual} dynamically computes per-synapse consolidation strength throughout the learning trajectory.  Memory Aware Synapses (MAS)~\citep{aljundi2018memory} and Adaptively Regularized Prioritized Exemplar Replay (ARPER)~\citep{mi2020continual} determine parameter importance in an online and unsupervised manner.  Learning without Memorizing (LwM)~\citep{Dhar_2019_CVPR} utilizes an Attention Distillation Loss ($L_{AD}$) to support the progressive learning of new classes, enabling it to effectively preserve information on base classes when incorporating new classes.  However, the accumulation of multiple regularizers may overly constrain network parameters, potentially hindering the learning of new tasks.

\item {\em Architectural} approaches usually modify the network structure to capture task-specific features and mitigate catastrophic forgetting.  PathNet~\citep{fernando2017pathnet} introduces dynamic pathway evolution, allowing the model to learn task-specific paths through a shared network. CL with GANs~\citep{seff2017continual} employs adversarial training to achieve a balance between learning new tasks and retaining knowledge from previous tasks.  Piggyback GAN~\citep{zhai2020piggyback} shares filters across tasks, enabling high-quality generation with fewer parameters while preserving performance on previous tasks.  However, their task-focused approach may overlook effective knowledge transfer between old and new tasks.

%\red{In task-oriented dialogues, AdapterCL \citep{madotto2021continual} presents a simple yet effective architectural approach based on residual adapters \citep{houlsby2019parameter}, providing a benchmark for evaluating continual learning methods in this domain.}

\item {\em Rehearsal} methods, utilized to sustain performance by leveraging samples from previous tasks, can be classified into two categories: store-based rehearsal and generative-based rehearsal.  Store-based rehearsal methods, such as Replay in Deep Learning~\citep{hayes2021replay} and Gradient-based Memory Editing (GMED)~\citep{jin2021gradient}, rely on episodic memory to store examples from previous tasks.  For instance, iCaRL~\citep{Rebuffi_2017_CVPR} incorporates a herding-based step to select representative samples and alleviate the catastrophic forgetting problem.  Gradient Episodic Memory (GEM)~\citep{lopez2017gradient} stores and replays important exemplars from previous tasks while learning new ones. Experience replay (ER) for CL~\citep{rolnick2019experience} continuously trains the model using batch gradient descent by sampling examples from both the current task and the episodic memory.  On the other hand, generative-based rehearsal methods, such as CL with deep generative replay~\citep{shin2017continual}, rely on generative adversarial networks (GANs)~\citep{goodfellow2014generative} to create synthetic samples.  ReMix~\citep{Mi_2020_CVPR_Workshops} generates pseudo samples by applying Mixup~\citep{zhang2017mixup} to samples from previous tasks.  However, store-based rehearsal methods can be inefficient and memory-intensive, while generative-based rehearsal methods may produce pseudo samples that lack diversity or fluency, or that align poorly with the designated task. 
\end{compactitem}



\subsection{Continual Learning in NLP}
Within the context of {\em CL in NLP}, researchers have explored various strategies to address the challenges associated with evolving language tasks: %.  They can be categorized into the following types: 
\begin{compactitem}
\item {\em Regularization}: Adaptively Regularized Prioritized Exemplar Replay (ARPER)~\citep{mi2020continual} presents the initial attempt to explore a practical continual learning configuration for Natural Language Generation (NLG) by incorporating prioritized exemplar replay and adaptive regularization based on Elastic Weight Consolidation (EWC). Specifically, ARPER prioritizes representative and diverse utterances in exemplar selection, aiming to comprehensively cover information from previous tasks.  


\item {\em Architectural}:
Adapter-based Continual Learning (AdapterCL)~\citep{madotto2021continual} is an architectural approach that places residual adapters~\citep{houlsby2019parameter} atop the transformer layer to approximate each task in a continual learning setting.  Continual Prompt Tuning (CPT)~\citep{zhu2022continual} ensures non-forgetting and bidirectional knowledge transfer in a parameter-efficient dialog system.  It utilizes techniques such as prompt learning, memory replay, and query fusion to facilitate continual learning.
Semi-Supervised Lifelong Learning (SSLL)~\citep{zhao2022semi} integrates both labeled and unlabeled data for sequentially arriving tasks in a continual learning setting. It incorporates specialized modules to mitigate forgetting and harness the potential of unlabeled data. 
Adaptive Continual Modeling (ACM)~\citep{zhang2022continual} adopts a two-stage method to achieve efficient continual sequence generation. It dynamically adds or reuses modules based on task similarity and employs pseudo rehearsal for effective knowledge transfer, outperforming existing baselines in continual learning scenarios.


\item {\em Rehearsal}:
Language Modeling for Lifelong Language Learning (LAMOL)~\citep{sun2019lamol} is a rehearsal method based on continual sequence generation. The generative-based rehearsal approach employed by LAMOL does not require memory to store previous samples. 
L2KD~\citep{chuang2020lifelong} improves LAMOL by assigning an extra teacher for each new task to perform knowledge distillation.
PCLL~\citep{zhao2022prompt} adopts a Conditional Variational Autoencoder (CVAE) to generate pseudo samples from past tasks, which are then used for continual learning.
\end{compactitem}
The remarkable performance of PCLL motivates us to explore further in this paper.

%Following the general framework of CL, in the domain of continual learning for NLP, researchers have explored various strategies to tackle the challenges associated with evolving language tasks. LAMOL \citep{sun2019lamol} is a rehearsal method based on continual sequence generation. Generative-based rehearsal approach does not require memory to store previous samples.  ARPER \citep{mi2020continual} presented the initial attempt to explore practical continual learning configuration for Natural Language Generation (NLG) by prioritized exemplar replay and adaptive regularization based on EWC. Specifically, ARPER prioritizes representative and diverse utterances in exemplar selection, aiming to comprehensively cover information from previous tasks.   AdapterCL \citep{madotto2021continual} is an architectural approach that places residual adapters \citep{houlsby2019parameter} atop the transformer layer to approximate each task. CPT \citep{zhu2022continual} ensures non-forgetting and bidirectional knowledge transfer in a parameter-efficient dialog system, utilizing techniques like prompt learning, memory replay, and query fusion. SSLL \citep{zhao2022semi} integrates both labeled and unlabeled data for sequentially arriving tasks, with specialized modules to mitigate forgetting and harness the potential of unlabeled data.  PCLL \citep{zhu2022continual} adopts a Conditional Variational Autoencoder (CVAE) to generate pseudo samples from past tasks. ACM \citep{zhang2022continual} adopts a two-stage method to achieve efficient continual sequence generation by dynamically adding or reusing modules based on task similarity, along with pseudo rehearsal for effective knowledge transfer, outperforming existing baselines.



% \subsection{Latent Dirichlet Allocation}
%  Latent Dirichlet Allocation (LDA) \citep{blei2003latent} is a popular topic model, employing the Dirichlet distribution to model topic and word distributions in documents. It serves as the conjugate prior to the multinomial distribution. LDA-based document models for ad-hoc retrieval were proposed in \citep{wei2006lda}. An online variational bayes (VB) algorithm for LDA was developed by \citet{hoffman2010online}. \citet{foulds2013stochastic} proposed a stochastic algorithm for collapsed VB inference in LDA. The Embedded Topic Model (ETM) \citep{dieng2020topic} combines LDA and word embeddings to identify interpretable topics with large vocabularies including rare and stop words. In addition, \citet{li2020dirichlet} introduced a Dirichlet graph VAE for graph generation and clustering.


% \subsection{Reparametrize Trick}
% The Reparametrize Trick (RT) is pivotal in probabilistic machine learning, particularly in variational autoencoder (VAE). It transforms the sampling process by decoupling it from model parameters. Instead of directly sampling from the latent distribution, the RT employs a deterministic transformation using a standard Gaussian distribution. Specifically, for a standard Gaussian sample \( \epsilon \), it adjusts the values based on the distribution's parameters (\(\mu\) for the mean and \(\sigma\) for the standard deviation) using the formula:
% \[
% z = \mu + \sigma \odot \epsilon
% \]
% Here, \(\odot\) denotes element-wise multiplication. This transformation facilitates efficient gradient optimization during training, enhancing the stability and convergence of the model.

% If the cumulative distribution function (CDF) is known and invertible, inverse CDF sampling can be used to do the sampling. Otherwise, alternative methods such as rejection sampling or Markov Chain Monte Carlo (MCMC) are employed. 







\begin{figure*}[!t]
    \centering    \includegraphics[width=150mm]{DCL_overview.pdf}
    \caption{Overview of our proposed DCL model.  DCL comprises two primary modules: the pseudo-rehearsal module and the LM training module.  The DCL model involves three main steps: (1) During the training of Task $N$, the pseudo-rehearsal module utilizes CVAE to generate pseudo samples from Task $1$ to Task ${N-1}$. (2) These pseudo samples are subsequently merged with the data from Task $N$. (3) The combined dataset is then employed for training the current task in the LM training module.}
    \label{fig:overview1}
\end{figure*}


% \input prelimiaries.tex where $N_n$ denotes the number of samples in Task $n$.




%give an example is \{\textit{``utterance: "i need you to get me a flight booked from houston to miami on united airlines", "intent": "book flight"''}\} where  $x_i$ is \textit{``i need you to get me a flight booked from houston to miami on united airlines''} and $y_i$ is an intention of \textit{``book flight''}.  Meanwhile, in the slot filling task, a typical example is \{\textit{``"how many comedy movies starring kevin costner have come out in the year 2000", "GENRE: comedy; ACTOR: kevin costner"''}\}.  In this example, $x_i$ is \textit{``how many comedy movies starring kevin costner have come out in the year 2000''}, while $y_i$ is \textit{``GENRE: comedy; ACTOR: kevin costner''}.





\section{Dirichlet Generative Rehearsal}
\label{sec:def}
%Our Dirichlet generative rehearsal (DCL) utilizes CVAE to mitigate forgetting in CL.  As outlined in Fig.~\ref{fig:overview1}, DCL consists of two main modules: pseudo-rehearsal and Language Model (LM) training modules.  To be specific, we apply CVAE to generate pseudo samples of previous tasks before the current task training, and an LM continues to train using both pseudo samples of previous tasks and real samples of the current task.  In CVAE, both encoder and decoder employ a pre-trained language model, e.g., GPT-2~\citep{radford2019language}, with distinct parameters to encode information and generate pseudo samples for tasks.  Notably, the LM shares parameters with the decoder of CVAE.  We also propose Jensen-Shannon Knowledge Distillation in Sec.~\ref{sec:3.5} elaborated below.
\subsection{Problem Definition}
A CL model for NLP aims to learn a stream of NLP tasks sequentially, \(T=\{T_1, \cdots, T_N\}\), where \(N\) is the number of tasks, which can be (potentially) infinite.  For task $n$, denoted by $T_n$, its data \(\mathcal{D}_n=\{(x_i, y_i)\}_{i=1}^{N_n}\) are drawn from an underlying data distribution, where $N_n$ denotes the number of samples in task $T_n$.  Here, \(x_i\) denotes an input utterance, and \(y_i\) denotes the output label.  In the intent detection task, a typical example is (``I need you to get me a flight booked from Houston to Miami on United Airlines'', ``book flight'').  Meanwhile, in the slot filling task, a typical example is (``how many comedy movies starring kevin costner have come out in the year 2000'', ``GENRE: comedy; ACTOR: kevin costner'').  Our goal is to train a model capable of excelling in all encountered tasks while minimizing the extent of forgetting.

\subsection{Overview}
Our approach, called Dirichlet generative rehearsal (DCL), leverages the Conditional Variational Autoencoder (CVAE) to address the issue of forgetting in continual learning (CL). As illustrated in Fig.~\ref{fig:overview1}, DCL consists of two main modules: the pseudo-rehearsal module and the Language Model (LM) training module. Specifically, before training the current task, we employ CVAE to generate pseudo samples from previous tasks.  Then DCL continues to train using both the pseudo data and real data in the current task.
In the CVAE framework, both the encoder and decoder utilize a pre-trained language model, such as GPT-2~\citep{radford2019language}, with distinct parameters.  Notably, the LM shares parameters with the decoder of CVAE so that we can build a unified model to solve the CL problem. 

We introduce the LM training and pseudo-rehearsal modules in Section~\ref{sec:lm} and Section~\ref{sec:pseudo}, respectively.
Additionally, in Section~\ref{sec:js}, we present Jensen-Shannon Knowledge Distillation, a new knowledge distillation method used in DCL.




% $\mathcal{\tilde{D}}_{\cup} = \mathcal{\tilde{D}}_{\text{curr}} \cup \mathcal{\tilde{D}}_{\text{pseu}}$ to represent the set of samples consisting of the concatenated utterances from $\mathcal{D}_{\text{curr}}$ and $\mathcal{D}_{\text{pseu}}$, respectively.

%  During the training of Task $N$, the pseudo-rehearsal module utilizes CVAE to generate pseudo samples from Task $1$ to Task ${N-1}$. (2) These pseudo samples are subsequently merged with the data from Task $N$. (3) The combined dataset is then employed for training the current task in the LM training module.

% Let $\mathcal{\tilde{D}}_{\cup} = \mathcal{\tilde{D}}_{\text{curr}} \cup \mathcal{\tilde{D}}_{\text{pseu}}$ to represent the set of samples consisting of the concatenated prompts from $\mathcal{D}_{\text{curr}}$ and $\mathcal{D}_{\text{pseu}}$, respectively. 
% Our goal is maximize the $\mathcal{L}_{\rm LM}$ :
% \begin{align}
% \label{eq:lm}
% \small
% &\mathcal{L}_{\rm LM}(\theta)\nonumber\\
% &=-\sum_{(x_i,y_i)\in \mathcal{\tilde{D}}_{\cup}}^{}\log p_{\theta}{( x_i,y_i)} 
% + \log p_{\theta}{(y_i|x_i)}.
% \end{align}



% \subsection{CL Via Generative Rehearsal}
\subsection{LM Training Module}
\label{sec:lm}
For a data sample $(x_i, y_i)\in\mathcal{{D}}_{n}$ in task $T_n$, given an input utterance $x_i$, the LM aims to generate the corresponding output $y_i$. 
% Let $\mathcal{D}_{\text{curr}}$ denotes the set of current training samples, and $\mathcal{D}_{\text{pseu}}$ denotes the set of generated pseudo samples from previous tasks. 
To achieve task-dependent generation, we define specific prefix prompt ${P}_n$ and postfix prompt $P^*_n$ for task $T_n$. These prompts are concatenated with the input utterance, resulting in the augmented input $g(x_i) = {P}_n \oplus x_i \oplus P^*_n$, where $\oplus$ denotes word concatenation. The details of ${P}_n$ and $P^*_n$ can be found in Appendix A.  The LM for task $T_n$ is optimized based on $g(x_i)$ which carries the task-specific information. 
The LM is optimized by minimizing the loss $\mathcal{L}_{\rm LM}$, defined as:
\begin{align}
\label{eq:lm}
\small
&\mathcal{L}_{\rm LM}(\theta)\nonumber\\
&=-\sum_{( x_i,y_i)\in \mathcal{{D}}_{n}}\bigl[\log p_{\theta}{(g(x_i),y_i)} 
+ \log p_{\theta}{(y_i|g(x_i))}\bigr].
\end{align}



% \begin{align}
% \small
% &\mathcal{L}_{\rm LM}(\theta)\nonumber=\log p_{\theta}{(\Tilde x_i,y_i)} 
% + \log p_{\theta}{(y_i|\Tilde x_i)}.
% \end{align}


% $\mathcal{L}_{\rm CVAE}$ is denoted as


% \subsection{Pseudo Generation by Dirichlet Latent Variable}
\subsection{Pseudo-rehearsal Module}
\label{sec:pseudo}
Following the setting of PCLL, we use Conditional Variational Autoencoder (CVAE) to model the distribution of high-dimensional data $x$ using lower-dimensional latent variables $z$ \citep{agarwal2023decoder}. 
Let $z$ be a continuous variable representing the sentence-level features of input utterance $x$, and we take the task
ID as $c$.  
The generative process involves an encoder $q_\phi(z | x,c)$ mapping $x$ to approximate the true posterior $p(z | x,c)$.  Latent variables are sampled from $q_\phi(z | x,c)$, and a decoder $p_\theta(x | z,c)$ reconstructs $x$. 
The CVAE is trained to maximize the log-likelihood $\log {p(x|c)}$. The evidence lower bound (ELBO) denoted as $\mathcal{L}(\theta,\phi;x,c)$ is used for tractable optimization:
\begin{align} 
\small
\label{elbo}
\mathcal{L}(\theta,\phi;x,c)
&=-\lambda\, \mathrm{KL}(q_\phi(z|x, c)\,\|\,p_\theta(z|c)) \nonumber\\
&+\mathbb{E}_{q_\phi(z|x,c)}[\log{p_\theta}(x|z, c)]\nonumber\\
&\le \log{p(x|c)},
\end{align}
where $\theta$ is the model parameter, $p_\theta(z|c)$ is the prior distribution of $z$, $q_\phi(z|x, c)$ approximates the intractable true posterior distribution, 
and $\lambda$ is the dynamic KL weight, gradually increasing from 0 to 1 via the annealing technique, to mitigate the KL-vanishing as proposed by \citet{bowman2016generating}. 

The CVAE loss, denoted as $\mathcal{L}_{\rm CVAE}$, is the negative of the ELBO. The CVAE is trained by minimizing $\mathcal{L}_{\rm CVAE}$.
Following \eqref{elbo}, we can decompose it as:
\begin{equation}
\label{eq:cvae_decouple}
\mathcal{L}_{\rm CVAE} = \lambda\mathcal{L}^G_{\rm KL}+\mathcal{L}_{\rm Rec},
\end{equation}
where $\mathcal{L}_{\rm Rec}=-\mathbb{E}_{q_\phi(z|x,c)}[\log{p_\theta}(x|z, c)]$ represents the reconstruction loss, and
$\mathcal{L}^G_{\rm KL}=\mathrm{KL}(q_\phi(z|x, c)\,\|\,p_\theta(z|c))$ is the KL divergence between the approximate posterior and the prior when the latent variable is Gaussian.


\subsubsection{Pseudo-rehearsal via Dirichlet Latent}
In DCL, rather than modeling $z$ as a symmetric multivariate Gaussian over a continuous space, i.e., $z\sim \mathcal{N}(\boldsymbol{\mu}, \boldsymbol{\Sigma})$ with a diagonal covariance matrix $\boldsymbol{\Sigma}$, we introduce the Dirichlet latent variable \(z \sim \text{Dir}(\boldsymbol{\alpha})\), where $\boldsymbol\alpha$ is the parameter vector, to express $z$ originating from a discrete space. The Dirichlet distribution can be concave, convex, symmetrical, or asymmetrical, making it an appealing option for our model.
We apply rejection sampling to reparametrize the Dirichlet latent variable \citep{jankowiak2018pathwise}.




% Traditionally, $z$ is Gaussian prior which tends to KL-vanishing.  
% KL-vanishing means the CVAE model would degrade into an autoencoder model which tends to generate generic and meaningless utterances \cite{zeng2019dirichlet}. We propose to use $z\sim D_{ir}(\cdot)$ to express $z$ originating discrete space $\mathcal{X}$. 
 
% The main reason is that a symmetric Gaussian from continuous space is not flexible enough to express the latent variable $z$ originating from discrete space. In DCL, we introduce the Dirichlet distribution to approximate the latent variable since this distribution owns a more flexible mathematical structure. The Dirichlet distribution can be concave, convex, symmetrical, or asymmetrical, making it an appealing option for our model.
% We use the reject sampling to reparametrize the Dirichlet latent variable \cite{jankowiak2018pathwise}.

 

% The probability density function (PDF) of a Dirichlet distribution with parameters $\boldsymbol{\alpha}$ is given by:

% \[
% f(\mathbf{x}; \boldsymbol{\alpha}) = \frac{\Gamma\left(\sum_{i=1}^{K}\alpha_i\right)}{\prod_{i=1}^{K}\Gamma(\alpha_i)} \prod_{i=1}^{K} x_i^{\alpha_i - 1}
% \]

% where $\mathbf{x} = (x_1, x_2, \ldots, x_K)$ is a K-dimensional vector that lies in the K-dimensional simplex, i.e., $x_i \geq 0$ and $\sum_{i=1}^{K} x_i = 1$. Here, $\Gamma(\cdot)$ is the gamma function.


% However, as illustrated in \cite{shen2018improving, zeng2019dirichlet}, although the weighting scheme can be used, KL-vanishing can not be essentially tackled. The main reason is that a symmetric Gaussian from continuous space is not flexible and sufficient enough to express the latent $z$ originating from discrete space. Here, we introduce the Dirichlet distribution, which uses a more flexible structure to approximate the prior distribution of $z$. The versatile forms of the Dirichlet distribution, which can be concave, convex, symmetrical, or asymmetrical, make it an appealing choice for our model.
% \begin{align} 
% \small
% \label{elbo}
% % \log{p_\theta(x)}
% % & \ge 
% \mathcal{L}(\theta,\phi;x,c)
% &=-\lambda \rm KL(q_\phi(z|x, c)||p_\theta(z|c)) \nonumber\\
% &+\mathbb{E}_{q_\phi(z|c, x)}[\log{p_\theta}(x|z, c)]\nonumber\\
% &\le \log{p(x|c)},
% \end{align}

% Decoupling the $\mathcal{L}_{\rm CVAE}$, 
% Eq. \eqref{elbo} can be reformulated as:
% \begin{equation}
% \label{eq:cvae_decouple}
% \mathcal{L}_{\rm CVAE} = \lambda\mathcal{L}^G_{\rm KL}-\mathcal{L}_{\rm Rec},
% \end{equation}
% where $\mathcal{L}^G_{\rm KL}$ is the KL divergence and $\mathcal{L}_{\rm Rec}$ is the reconstruction term of Gaussian latent. 
Equipped with Dirichlet latent $z$, $\mathcal{L}_{\rm CVAE}$ in DCL can be reformulated as:
\begin{equation}
\label{eq:cvae}
\mathcal{L}_{\rm CVAE} = \lambda\mathcal{L}^D_{\rm KL}+\mathcal{L}_{\rm Rec},
\end{equation}
where 
$\mathcal{L}^D_{\rm KL}$ can be expressed as follows after derivation \citep{zeng2019dirichlet}:
\begin{align}
\small
\label{derivation_kl}
\mathcal{L}^D_{\rm KL}=&~{\rm KL}(q_\phi(z|x,c)\,\|\,p_\theta(z|c))= \nonumber\\
&\log\Gamma(\sum_{k=1}^{K}\alpha_k)-\sum_{k=1}^{K}\log\Gamma(\alpha_k)\nonumber\\
&-\log\Gamma(\sum_{k=1}^{K}\beta_k)
+\sum_{k=1}^{K}\log\Gamma(\beta_k)\nonumber\\
&+\sum_{k=1}^{K}(\alpha_k-\beta_k)(\psi(\alpha_k)-\psi(\sum_{j=1}^{K}\alpha_j)),
\end{align}
where $\alpha$ and $\beta$ represent the parameters of the Dirichlet distributions $q_\phi(z|x,c)$ and $p_\theta(z|c)$, respectively. $K$ denotes the dimension of $z$. $\Gamma$ is the gamma function and $\psi$ is the Digamma function. 

% \subsection{Learning}
% \subsubsection{Variational Learning}
% We consider the pseudo sample generation as a conditional generation process given the input utterance and the task ID. Let $z$ be a continuous variable as the summarization and feature representation of input for representing the sentence-level features.



% \subsubsection{LM Learning}
% We use an language model $\mathcal{M}$ to incrementally learn tasks and the training loss of $\mathcal{M}$ is defined as: 
% \begin{align}
% \small
% &\mathcal{L}_{\rm LM}(\theta)\nonumber\\
% &=-\sum_{(x_i,y_i)\in \mathcal{D}_{cups}}^{}\log p_{\theta}{(x_i,y_i)} 
% + \log p_{\theta}{(y_i|x_i)}.
% \end{align}
% To be specific, In our implementation, $\mathcal{M}$ is parameterized based on a pre-trained GPT2. Further, $\mathcal{M}$ shares parameters with the decoder of CVAE.



% Taking task $T_n$ as an example, the pseudo-rehearsal module first generates pseudo samples of previous tasks $T_1, \cdots, T_{n-1}$ and then we combine the generated pseudo samples with the samples for the task $T_n$ to train the model. Hence, training dataset for task $T_n$ becomes $\mathcal{D}_{cups}=\mathcal{D}_{curr}\cup\mathcal{D}_{pseu}$ where $\mathcal{D}_{curr}$ denotes current task samples, $\mathcal{D}_{pseu}$ represents generated pseudo samples, and $\mathcal{D}_{cups}$ denotes the combination of current task samples and generated pseudo samples. The training loss is defined as: 
% \begin{align}
% \small
% &\mathcal{L}_{\rm LM}(\theta)\nonumber\\
% &=-\sum_{(x_i,y_i)\in \mathcal{D}_{cups}}^{}\log p_{\theta}{(x_i,y_i)} 
% + \log p_{\theta}{(y_i|x_i)}.
% \end{align}



% Referring to the common practice \cite{zhao2022prompt, madotto2021continual, mi2020continual, zhu2022continual}, we mainly consider intent detection and slot filling tasks in NLU.







% \subsection{Overview}
% \label{sec:archi}


% In the rehearsal method of continual learning, 
% the representative and diverse exemplars contribute to the model performance, as presented in \cite{mi2020continual}. Thus, it is important to generate representative and diverse pseudo utterances. Among commonly used generative models, VAE and GAN are prominent. VAE, in particular, is known for generating meaningful and diverse utterances \cite{serban2017hierarchical}. We propose a CVAE-based method called DCL to mitigate forgetting in ToDs.

% As shown in Figure \ref{fig:overview1}, DCL has two main modules: pseudo-rehearsal and Language Model (LM) training modules. The pseudo-rehearsal module employs CVAE to generate pseudo samples from previous tasks. Then, the LM is trained using both pseudo samples of previous tasks and real samples of the current task to enable continual learning. 
% % The total training loss of DCL combines the CVAE loss for pseudo-rehearsal and the LM loss for updating LM parameters.
% % The structure of the CVAE and LM models is outlined as follows. 
% In CVAE, both encoder and decoder employ GPT-2 \cite{radford2019language} with distinct parameters to encode information and generate pseudo samples for tasks. 
% % In a Continual Learning (CL) context, the LM is expected to generate samples for task sequences, making the decoder of the CVAE also function as the LM as depicted in the LM training Module of Figure \ref{fig:overview1}. 
% Notably, the decoder of the CVAE also functions as the LM as depicted in Figure \ref{fig:overview1}. 
% This delicate framework leverages a unified model to solve sequential tasks and generates pseudo samples simultaneously. 

% In the following sections, we introduce the pseudo-rehearsal module (Sec. \ref{sec:3.3}) and LM training module (Sec. \ref{sec:3.4}) in detail. Then we present the proposed Jensen-Shannon Knowledge Distillation in (Sec. \ref{sec:3.5}) and explain the advantages of using JS divergence over KL divergence.


% % \subsection{Dirichlet-guided Pseudo-rehearsal}
% % \label{sec:3.3}
% % In the proposed DCL model, we introduce a CVAE to generate pseudo samples. 
% % As illustrated in \cite{shen2018improving, zeng2019dirichlet}, KL-vanishing problems can not be essentially tackled even using a weighting scheme. KL-vanishing means the CVAE model would degrade into an autoencoder model which tends to generate generic and meaningless utterances \cite{zeng2019dirichlet}. The main reason is that a symmetric Gaussian from continuous space is not flexible enough to express the latent variable $z$ originating from discrete space. In DCL, we introduce the Dirichlet distribution to approximate the latent variable since this distribution owns a more flexible mathematical structure. The Dirichlet distribution can be concave, convex, symmetrical, or asymmetrical, making it an appealing option for our model.
% % We use the reject sampling to reparametrize the Dirichlet latent variable \cite{jankowiak2018pathwise}.


% % This module aims to generate the pseudo samples based on the task ID and prompt \cite{lester2021power}. Given the input utterance $x_i$, we generate $y_i$ in the task $T_n$. To achieve task-dependent generation, specific prefix prompt ${P}_n$ and postfix prompt $P^*_n$ for $T_n$ are first defined. Then they are concatenated to the input utterance, yielding the augmented input $\Tilde{x}_{i,n}={P}_n \oplus x_i \oplus P^*_n$. 
% % Details of the prompt design are described in \ref{sec:appendixa}.
% % The CVAE can be utilized to generate the pseudo sample for $T_n$ based on $\Tilde{x}_{i,n}$ \cite{zhao2017learning}. 
% % % he key idea of CVAE is to reconstruct the input $x$ through the latent variable $z$, which is normally modeled through the Gaussian distribution. 

% % The CVAE is trained to maximize the log-likelihood $\log {p_(x|c)}$. The lower bound ELBO $\mathcal{L}(\theta,\phi;x,c)$ is used for tractable optimization:
% % \begin{align} 
% % \small
% % \label{elbo}
% % % \log{p_\theta(x)}
% % % & \ge 
% % \mathcal{L}(\theta,\phi;x,c)
% % &=-\lambda KL(q_\phi(z|x, c)||p_\theta(z|c)) \nonumber\\
% % &+\mathbb{E}_{q_\phi(z|c, x)}[\log{p_\theta}(x|z, c)]\nonumber\\
% % &\le \log{p(x|c)},
% % \end{align}
% % where $\theta$ is the model parameter, $p_\theta(z|c)$ is the prior distribution of $z$, $q_\phi(z|x, c)$ approximates the intractable true posterior distribution, $c$ defines the task ID, and $\lambda$ is the dynamic KL weight to mitigate the KL-vanishing as proposed by \citet{bowman2016generating}. 
% % % However, as illustrated in \cite{shen2018improving, zeng2019dirichlet}, although the weighting scheme can be used, KL-vanishing can not be essentially tackled. The main reason is that a symmetric Gaussian from continuous space is not flexible and sufficient enough to express the latent $z$ originating from discrete space. Here, we introduce the Dirichlet distribution, which uses a more flexible structure to approximate the prior distribution of $z$. The versatile forms of the Dirichlet distribution, which can be concave, convex, symmetrical, or asymmetrical, make it an appealing choice for our model.
% % The CVAE loss denoted as $\mathcal{L}_{\rm CVAE}$ is the negative of ELBO. Following \eqref{elbo}, we have
% % \begin{equation}
% % \mathcal{L}_{\rm CVAE} = \mathcal{L}'_{\rm KL}+\mathcal{L}_{\rm Rec},
% % \end{equation}
% % where $\mathcal{L}'_{\rm KL}$ can be expressed as follows after derivation \cite{zeng2019dirichlet}:
% % \begin{align}
% % \small
% % \label{derivation_kl}
% % &KL(q_\phi(z|x,c)||p(z|c))= \nonumber\\
% % &\log\Gamma(\sum_{k=1}^{K}\alpha_k)-\sum_{k=1}^{K}\log\Gamma(\alpha_k)\nonumber\\
% % &-\log\Gamma(\sum_{k=1}^{K}\beta_k)
% % +\sum_{k=1}^{K}\log\Gamma(\beta_k)\nonumber\\
% % &+\sum_{k=1}^{K}(\alpha_k-\beta_k)(\psi(\alpha_k)-\psi(\sum_{k=1}^{K}\alpha_k)),
% % \end{align}
% % where $\alpha$ and $\beta$ represent the parameters of the Dirichlet distributions $q_\phi(z|x,c)$ and $p_\theta(z|c)$, respectively. $K$ denotes the dimension of $z$, and $\psi$ is the Digamma function. 


% \subsection{LM Training Module}
% \label{sec:3.4}
% LM training module shares parameters with the decoder of CVAE. Taking task $T_n$ as an example, the pseudo-rehearsal module first generates pseudo samples of previous tasks $T_1, \cdots, T_{n-1}$ and then we combine the generated pseudo samples with the samples for the task $T_n$ to train the model. Hence, training dataset for task $T_n$ becomes $\mathcal{D}_{cups}=\mathcal{D}_{curr}\cup\mathcal{D}_{pseu}$ where $\mathcal{D}_{curr}$ denotes current task samples, $\mathcal{D}_{pseu}$ represents generated pseudo samples, and $\mathcal{D}_{cups}$ denotes the combination of current task samples and generated pseudo samples. The training loss is defined as: 
% \begin{align}
% \small
% &\mathcal{L}_{\rm LM}(\theta)\nonumber\\
% &=-\sum_{(x_i,y_i)\in \mathcal{D}_{cups}}^{}\log p_{\theta}{(x_i,y_i)} 
% + \log p_{\theta}{(y_i|x_i)}.
% \end{align}




\subsection{Jensen-Shannon Knowledge Distillation}
\label{sec:js}
Due to the potential drift from the real data distribution in the pseudo data, simply combining the generated pseudo samples with the training data may add noise and harm the model's performance.  
Following existing studies~\citep{chuang2020lifelong, mi2020continual, zhao2022prompt, chen2023lifelong}, we apply knowledge distillation to the pseudo samples for
shielding our model from the impact of noisy pseudo data.

% During model distillation for a new task N, the distillation loss is applied to the pseudo samples generated for task N. The process involves treating the model obtained from previous N-1 tasks as a teacher model and distilling knowledge from this teacher model to the current model (acting as the student model).

% Compared to traditional KL divergence knowledge distillation, JS divergence has the advantage of robustness since the symmetry ensures consistent values regardless of comparison order, which eases the distribution similarity measurement.




To be specific, given a student model $f_{\theta_n}$ being trained on task $T_n$ and a teacher model $f_{\theta_{n-1}}$ trained on task $T_{n-1}$, the knowledge of $f_{\theta_{n-1}}$ is distilled into $f_{\theta_{n}}$ on the pseudo data by minimizing the knowledge distillation loss.
As training continues, the roles switch, making the $T_n$ model the teacher for $T_{n+1}$.
This iterative role-switching facilitates the accumulation of knowledge from previous tasks, thereby achieving effective CL.


% \begin{figure}[!t]
%     \centering
%     \includegraphics[width=70mm]{kd.pdf}
%     \caption{Knowledge Distillation of DCL.}
%     \label{fig:kd}
%     \vspace{-1em}
% \end{figure}


\noindent\textbf{Knowledge Distillation}
Given a training sample $(x,y)$, the goal is to minimize the cross-entropy between the probability distributions produced by the teacher and student models \citep{chuang2020lifelong}. 
The knowledge distillation loss is:
\begin{equation}
\label{eq:kl}
\small
\mathcal{L}_{\rm KD} = \alpha \cdot \mathcal{L}_{\rm KL}(S, T) \cdot\tau^2  + (1 - \alpha) \cdot \mathcal{L}_{\rm CE}(S, Y),
\end{equation} 
where $T$ and $S$ are teacher and student model output logits, respectively. $\tau$ is the temperature to soften the teacher's predictions while $\mathcal{L}_{\rm CE}(S, Y)$ quantifies the cross-entropy loss between student predictions and the ground truth labels $Y$. $\mathcal{L}_{\rm KL}$ implicitly prevents the parameters of the student model from straying too far away from the ones of the teacher model. The first term denotes a soft target while the second term is a hard target. $\alpha\in[0, 1]$ balances the soft and hard target evaluations.


\subsubsection{Knowledge Distillation via JS Divergence}
Previous methods follow \citet{hinton2015distilling} and use KL divergence to measure the similarity between the output distributions of the student and teacher models. 
However, KL divergence is not robust since it is sensitive to outliers, especially in scenarios with noisy or sparse data. Jensen-Shannon (JS) divergence offers a more stable alternative by introducing a symmetric term. 
% This makes JS divergence advantageous for assessing dissimilarity in complex and diverse data distributions, mitigating the limitations associated with KL divergence.
Thus, we propose Jensen-Shannon Knowledge Distillation (JSKD), a novel logit-based knowledge distillation method that achieves remarkable performance. 


\noindent\textbf{JS Divergence vs. KL Divergence} For distributions $p$ and $q$, JS divergence \citep{lin1991divergence} is defined as:
\begin{align}
\small
\mathcal{L}_{\rm JS}(p \parallel q) = &\frac{1}{2} {\mathcal{L}_{\rm KL}\left(p \parallel \frac{1}{2}(p + q)\right)} \nonumber\\
&+ \frac{1}{2} {\mathcal{L}_{\rm KL}\left(q \parallel \frac{1}{2}(p + q)\right)}.
\end{align}
JS divergence offers advantages over KL divergence: a) its symmetry ensures consistent values regardless of comparison order, making it ideal for measuring distribution similarities; b) JS divergence is bounded in $[0,1]$ (with the base-2 logarithm), while KL divergence spans $[0,+\infty)$.  
JS divergence thus measures the similarity between two probability distributions symmetrically, with values ranging from 0 (identical distributions) to 1 (no shared support), in contrast to the asymmetric and unbounded KL divergence. 
The above properties make JS divergence more suitable for knowledge distillation than KL divergence, since the KL divergence becomes infinite when a sample appears in only one of the two task distributions.


\noindent\textbf{Knowledge Distillation via JS Divergence }
Motivated by the above discussions, we propose a JS divergence-based knowledge distillation (JSKD) to accurately measure the distance between teacher and student models, enhancing model robustness. The JSKD loss is defined as:
\begin{equation}
\small
\label{kd}
\mathcal{L}_{\rm KD} = \alpha \cdot \mathcal{L}_{\rm JS}(S, T) \cdot \tau^2 + (1 - \alpha) \cdot \mathcal{L}_{\rm CE}(S, Y). 
\end{equation} 
Specifically, we adopt the model trained on the preceding task as the teacher and the model being trained on the current task as the student. Incorporating knowledge distillation, $\mathcal{L}_{\rm Rec}$ and $\mathcal{L}_{\rm LM}$ for task $T_n$ are defined as:
\begin{align}
&\mathcal{L}_{\rm Rec} = \alpha \mathcal{L}_{\rm JS}(l_c, l_c^*)\tau^2  + (1 - \alpha) \mathcal{L}_{\rm CE}(l_c, Y),\nonumber\\
&\mathcal{L}_{\rm LM} = \alpha \mathcal{L}_{\rm JS}(l_l, l_l^*) \tau^2 + (1 - \alpha) \mathcal{L}_{\rm CE}(l_l, Y),
\end{align}
where $l_c$ and $l_l$ are the output logits of the CVAE and the LM for task $T_n$, respectively, $l_c^*$ and $l_l^*$ represent the corresponding output logits for task $T_{n-1}$, and $Y$ signifies the ground truth. 
We emphasize that $\mathcal{L}^D_{\rm KL}$ is the KL divergence in Eq.~\eqref{derivation_kl}, which evaluates the distance between the assumed Dirichlet data distribution and the real distribution. It is different from $\mathcal{L}_{\rm KL}$, which evaluates the distance between the student and teacher models in cross-task knowledge distillation. 







\subsection{CL Via Generative Rehearsal}
Our DCL aims to train the model by optimizing the following objective: 
\begin{equation}
\label{total loss}
   \mathcal{L}_{\rm total} = \mathcal{L}_{\rm LM}+\mathcal{L}_{\rm CVAE}, 
\end{equation}
where $\mathcal{L}_{\rm LM}$ is defined by Eq.~\eqref{eq:lm} and 
$\mathcal{L}_{\rm CVAE}$ is defined by Eq.~\eqref{eq:cvae}.




\section{Experiments}
% \red{
% In the following, we conduct extensive experiments to address the following questions: (1) ; （2) ; (3)  %We adopt the pre-processed datasets released by PCLL in the experiments.
% For the intent detection task, we use the HWU \cite{liu2019benchmarking}, BANKING \cite{casanueva2020efficient}, CLINC \cite{larson2019evaluation}, SNIPS \cite{coucke2018snips}, AITS \cite{hemphill1990atis}, and TOP\cite{gupta2018semantic} datasets. For fairness, we follow PCLL's \cite{zhao2022prompt} data preprocessing procedure. The TOP dataset is divided into three subsets: TOP-S1, TOP-S2, and TOP-S3, considered separate tasks. This partitioning increases the total number of tasks for continual training evaluation.
% }

\subsection{Datasets}
Based on the setup and pre-processed datasets described in PCLL~\citep{zhao2022prompt}, we conduct experiments to simulate continual learning (CL) for two NLP tasks in task-oriented dialogue systems: intent detection and slot filling.  For intent detection, we utilize six datasets annotated with intents: HWU~\citep{liu2021benchmarking}, BANKING~\citep{casanueva2020efficient}, CLINC~\citep{larson2019evaluation}, SNIPS~\citep{coucke2018snips}, ATIS~\citep{hemphill1990atis}, and TOP~\citep{gupta2018semantic} datasets.  Notably, the TOP dataset is divided into three distinct subsets: TOP-S1, TOP-S2, and TOP-S3.  These subsets, along with the other five datasets, constitute a total of eight tasks used to evaluate CL in the intent detection experiment.  For slot filling, we adopt the SNIPS, ATIS, DSTC~\citep{rastogi2020towards}, MIT-MOVIE\,\footnote{\url{https://groups.csail.mit.edu/sls/downloads/}\label{mit-courpus}}, and MIT-RESTAURANT\,\footref{mit-courpus} datasets. These datasets yield a total of five tasks to evaluate CL in the slot filling experiment.  For a fair comparison, these tasks are learned in six different orders, and the average performances of these orders are reported.  Dataset orders are listed in Appendix B. 


%For slot filling, we utilize five datasets for which slot labels are available: SNIPS, AITS, DSTC, MIT-MOVIE, and MIT-RESTAURANT. Each of these datasets is treated as an individual task, resulting in the learning of five distinct tasks during CL. As a result, following PCLL, there are a total of 8 tasks in intent detection and 5 tasks in slot filling, respectively. 
 %We adopt the pre-processed datasets released by PCLL.  
%The pre-processed datasets are publicly accessed in the PCLL code repository. We use the pre-processed datasets they published. 

% \ref{sec:appendixb}.
% : (1) \textbf{Fine-tune} the pre-trained language models; (2) \textbf{EWC} \citep{kirkpatrick2017overcoming} is a regularization method that mitigates CF by constraining crucial parameters while enabling unimportant ones to be adapted to the new task; (3) \textbf{MAS} \citep{aljundi2018memory} quantifies parameter importance in the network based on task memory contributions toward mitigated CF. 

% polishing the paragraph and making it shorter: We compare our DCL model with eleven competitive methods: (1) \textbf{Fine-tune} directly fine-tunes GPT-2 on the task stream without preventing CF; (2-3) Two typical {\textbf{Regularization}} methods are \textbf{EWC}~\citep{kirkpatrick2017overcoming} and \textbf{MAS}~\citep{aljundi2018memory}, mitigating forgetting by penalizing changes of important parameters for learned tasks; (4-6) Three \textbf{Architectural} methods include \textbf{HAT}~\citep{serra2018overcoming} task-based hard attention during training, \textbf{CTR}~\citep{ke2021achieving} inserts continual learning plug-ins into BERT to mitigate forgetting and encourage knowledge transfer, and \textbf{AdapterCL}~\citep{madotto2021continual} builds residual adapter for each task independently.  (7-10) Four famous \textbf{Rehearsal} methods consist of two variants of \textbf{LAMOL}~\citep{sun2019lamol}, \textbf{LAMOL-g} and \textbf{LAMOL-t}, for global incorporation and task-specific tokens, respectively, \textbf{ER} \citep{rolnick2019experience} preserves previously seen real samples for replay to prevent forgetting, \textbf{L2KD} \citep{chuang2020lifelong} improves LAMOL by assigning an extra teacher for each new task to perform knowledge distillation,  and \textbf{PCLL}~\citep{zhao2022prompt}, the current SOTA method by applying prompt conditioned VAE for lifelong learning. (11) Additionally, we evaluate the model performance when all tasks are trained simultaneously in a multi-task learning setting (\textbf{Multi}), which is the upper bound of CL in NLP.

\subsection{Compared Methods}
We compare our DCL model with twelve competitive methods: (1) \textbf{Fine-tune} directly fine-tunes GPT-2 on the task stream without preventing catastrophic forgetting (CF); (2-3) Two typical \textbf{Regularization} methods are \textbf{EWC}~\citep{kirkpatrick2017overcoming} and \textbf{MAS}~\citep{aljundi2018memory}, which penalize changes of important parameters to mitigate forgetting; (4-6) Three \textbf{Architectural} methods include \textbf{HAT}~\citep{serra2018overcoming} with task-based hard attention, \textbf{CTR}~\citep{ke2021achieving} which inserts continual learning plug-ins into BERT, and \textbf{AdapterCL}~\citep{madotto2021continual} with task-specific residual adapters; (7-11) Five \textbf{Rehearsal} methods consist of two variants of \textbf{LAMOL}~\citep{sun2019lamol}, \textbf{LAMOL-g} and \textbf{LAMOL-t} for global incorporation and task-specific tokens respectively, \textbf{ER}~\citep{rolnick2019experience} which preserves previously seen real samples for replay, \textbf{L2KD}~\citep{chuang2020lifelong} which performs knowledge distillation with an extra teacher, and \textbf{PCLL}~\citep{zhao2022prompt}, the current state-of-the-art method that applies prompt conditioned VAE for lifelong learning; (12) Additionally, we evaluate the model performance when all tasks are trained simultaneously in a multi-task learning setting (\textbf{Multi}), which serves as the upper bound in the comparison.

% See Table~\ref{tab:accents} for an example of a table and its caption.
% \textbf{Do not override the default caption sizes.}
% \begin{itemize}
%     \item In intent detection, the batch size is 32 with a learning rate of 5e-5 and a pseudo sample rate of 0.2. The dimension of $z$ is 128, and we use the Adam optimizer. We set the maximum context length as 256 and train it for 5 epochs. 
%     \item In the slot filling task, compared to intent detection, we set $z$'s dimension to 512, limit the maximum context length to 50, and train the model for 10 epochs.
% \end{itemize}
\subsection{Experimental Settings}
The experiments are conducted on an NVIDIA A100-80GB GPU.  In intent detection, we use a batch size of 32, a learning rate of $5 \times 10^{-5}$, and a pseudo sample ratio of 0.2. The dimension of $z$ is set to 128, and the Adam optimizer is employed. The maximum context length is 256, and the model is trained for 5 epochs.  Knowledge distillation is performed with $\alpha = 0.9$ and $\tau = 2.0$.  In slot filling, the dimension of $z$ is increased to 512, and the maximum context length is set to 50. The model is trained for 10 epochs with the knowledge distillation parameters $\alpha = 1.0$ and $\tau = 2.0$.
Unless otherwise specified, the batch size, learning rate, and pseudo sample ratio are the same as in intent detection.  The training time for DCL on a single GPU is approximately 5 hours for intent detection and 6 hours for slot filling.


%In intent detection, the batch size is 32 with a learning rate of 5e-5 and a pseudo sample rate of $0.2$.  The dimension of $z$ is 128, and the Adam optimizer is used. We set the maximum context length as 256 and train it for 5 epochs. $\alpha$ is $0.9$ and $\tau$ is $2.0$ in knowledge distillation.  In slot filling, the dimension of $z$ is 512, the maximum context length is 50, and it is trained for 10 epochs. $\alpha$ is $1.0$ and $\tau$ is $2.0$.  Without specifying, the batch size, the learning rate and the pseudo sample rate are the same in intent detection.  

%All experiments are conducted on NVIDIA A100-80GB GPU.  DCL costs around five hours for training in one GPU in intent detection and six hours in slot filling.

\subsection{Evaluation Metrics}

\noindent\textbf{Average Score (Score)}~\citep{lopez2017gradient} denotes the average performance over all $T$ tasks after the final task has been learned: $\mathrm{Score} = \frac{1}{T} \sum\limits_{i=1}^{T}{R_{T, i}}$,
where $R_{i, j}$ denotes the evaluation metric on task $t_j$ after training on task $t_i$.  Since intent detection and slot filling can be viewed as classification and sequence labeling tasks, respectively, we adopt accuracy as the metric for intent detection and the F1 score for slot filling.

\noindent\textbf{Learning Curve Area (LCA)}~\citep{chaudhry2018efficient} is computed as the area under a learning curve to indicate a model's performance in a sequence of tasks: $\mathrm{LCA} = \int_{0}^{T} P(t) \,\mathrm{d}t$,
where $P(t)$ is the average model performance at step $t$ across all already-learnt tasks, and $T$ is the total number of steps. A higher LCA value indicates more effective CL.

\subsection{Main Results} 
Table~\ref{tab:overall} presents the performance of our proposed DCL model compared to the baselines. Our DCL model demonstrates a significant improvement over all baselines in both intent detection and slot filling tasks.  Notably, our DCL surpasses the state-of-the-art model, PCLL, by a wide margin, achieving superior results across all evaluation metrics. In intent detection, our DCL achieves a remarkable 3.48\% increase in accuracy score and a 4.22\% improvement in LCA.  Similarly, in slot filling, our DCL achieves a notable 2.89\% increase in F1 score and a substantial 6.08\% improvement in LCA.  These impressive enhancements are attributed to the effective utilization of Dirichlet-guided pseudo-rehearsal and JSKD techniques.  By leveraging these components, our DCL model generates a more diverse and representative set of examples, leading to further optimization of model performance. The generation of diverse and representative examples is crucial for effectively capturing task-specific information, particularly when the number of available pseudo samples is limited.

Furthermore, our DCL model achieves performance that is comparable to the upper bound in Multi-task learning (\textbf{Multi}), with only a slight lag of 2.52\% in accuracy for intent detection and 3.43\% in F1 score for slot filling. This slight difference in performance can be attributed to variations in the amount of data and the realism of the samples used for evaluation.

To further understand these trends, the learning curves of the average scores for DCL and PCLL in intent detection are shown in Appendix C. The comparison shows that our model alleviates the CF issue more effectively, as evidenced by its higher accuracy than PCLL, especially after step 6,000. The sharp drops in accuracy are induced by task switching. The fact that the improvement becomes prominent only after around 6,000 steps suggests that the efficacy of our approach becomes more pronounced as the model encounters a more diverse set of tasks, highlighting its promise.



% The result of our model is shown in the last two rows, where ``KL'' and ``JS'' refer to KL divergence knowledge distillation and JS knowledge distillation, respectively. 
\begin{table}[!t]
\centering
\caption{Comparison results of DCL and baselines.  Scores of the baselines are those reported by \citet{zhao2022prompt}, and the performance of Multi is the upper bound. The best results are highlighted in bold.}
\label{tab:overall}
\resizebox{1\linewidth}{!}{
\begin{tabular}{lccccc}
\toprule
\multicolumn{1}{c}{\multirow{2}{*}{\textbf{Models}}} & \multicolumn{2}{c}{\textbf{Intent Detection (\%)}}  & \multicolumn{2}{c}{\textbf{Slot Filling (\%)}}                 \\
& \multicolumn{1}{c}{\textbf{Score $\uparrow$}} & \multicolumn{1}{c}{\textbf{LCA $\uparrow$}}  & \multicolumn{1}{c}{\textbf{Score $\uparrow$}} & \multicolumn{1}{c}{\textbf{LCA $\uparrow$}}
\\ \hline
Finetune  &14.09 & 28.76 & 15.38&19.55\\\hdashline
EWC & 14.16 & 28.34 & 15.67&19.51\\
MAS & 14.15 & 28.61 & 15.59&19.37 \\\hdashline
HAT &73.92 & 73.03  & 61.99 & 67.33\\
CTR & 67.44 & 71.11 & 63.84 & 67.28\\
AdapterCL & 81.15 & 75.60 & 75.60 &48.47 \\\hdashline
L2KD & 35.22 & 61.78 & 44.16&39.94\\
LAMOL-g & 50.30 & 60.67 & 45.12 & 38.03 \\
LAMOL-t & 51.81 & 67.97 & 44.83 & 37.58\\
ER & 78.19 & 78.19  & 44.95 & 39.32\\
PCLL & 90.25 & 88.82 & 74.48 & 68.41\\
% PCLL(reproduced) & \textbf{90.37} & \textbf{90.30} & \textbf{75.32} & \textbf{73.75}\\
\hline
% DCL (with KL)  & 92.83 & 91.32 & 76.42 & 73.76 \\
% DCL (with JS)  & \textbf{93.73} & \textbf{93.04} & \textbf{77.37} & \textbf{74.49}\\
\textbf{DCL}  & \textbf{93.73} & \textbf{93.04} & \textbf{77.37} & \textbf{74.49}\\
\hline
Multi (Upper Bound) & 96.25 & N/A & 80.80 & N/A\\
\bottomrule
\end{tabular}
}
% \vspace{-1em}
\end{table}



% \begin{figure}[!t]
%     \centering
%     \includegraphics[width=70mm]{curve.pdf}
%     \caption{Learning curves of DCL and PCLL on intent detection task. The accuracy of DCL surpasses PCLL significantly after 6,000 steps.}
%     \label{fig:curve}
% \end{figure}









\subsection{Ablation Study}
\noindent\textbf{Gaussian vs.\ Dirichlet-guided Rehearsal Module.} To assess the influence of the Dirichlet-guided rehearsal module, we conduct a comparative analysis of DCL and PCLL on the intent detection and slot filling tasks. To ensure a fair comparison, our DCL model here uses the same KL knowledge distillation as PCLL.  The findings presented in Table~\ref{tab:dir-gau} demonstrate that DCL, with its Dirichlet-guided rehearsal module, outperforms PCLL, which employs a Gaussian-guided module, across all evaluation metrics on both tasks.  These results suggest that the Dirichlet distribution is more effective in approximating the true data distribution, leading to better performance.

% In order to find the effect of introducing a Dirichlet-guided rehearsal module, we listed the performance of DCL and PCLL in the intent detection and slot filling tasks, both of them equipped with KL knowledge distillation. The difference between them is choosing either Dirichlet or Gaussian latent variable. Table \ref{tab:dir-gau} shows that introducing a Dirichlet-guided rehearsal module outperforms PCLL which uses a Gaussian-guided rehearsal module. It indicates the Dirichlet distribution is better to approximate the true data distribution. 




\begin{table}[!t]
  \centering
   \caption{Results of DCL with KL knowledge distillation and PCLL in two tasks.  DCL, incorporating the Dirichlet-guided rehearsal module, outperforms PCLL across all metrics evaluated. }
    \label{tab:dir-gau}
  \resizebox{0.8\linewidth}{!}{
  \begin{tabular}{lccccc}
\toprule
\multicolumn{1}{c}{\multirow{2}{*}{\textbf{Models}}} & \multicolumn{2}{c}{\textbf{Intent Detection (\%)}}  & \multicolumn{2}{c}{\textbf{Slot Filling (\%)}}                 \\
& \multicolumn{1}{c}{\textbf{Score $\uparrow$}} & \multicolumn{1}{c}{\textbf{LCA $\uparrow$}}  & \multicolumn{1}{c}{\textbf{Score $\uparrow$}} & \multicolumn{1}{c}{\textbf{LCA $\uparrow$}}
\\ \hline
   % \cline{1-3}
   PCLL & 90.25 & 88.82 & 74.48 & 68.41\\
   DCL (with KL)  & \textbf{92.83} & \textbf{91.32} & \textbf{76.42} & \textbf{73.76} \\
   \bottomrule
  \end{tabular}}
\end{table}



% \resizebox{1\linewidth}{!}{
% \begin{tabular}{lccccc}
% \toprule
% \multicolumn{1}{c}{\multirow{2}{*}{\textbf{Models}}} & \multicolumn{2}{c}{\textbf{Intent Detection (\%)}}  & \multicolumn{2}{c}{\textbf{Slot Filling (\%)}}                 \\
% & \multicolumn{1}{c}{\textbf{ACC $\uparrow$}} & \multicolumn{1}{c}{\textbf{LCA $\uparrow$}}  & \multicolumn{1}{c}{\textbf{F1 $\uparrow$}} & \multicolumn{1}{c}{\textbf{LCA $\uparrow$}}
% \\ \hline
% Finetune  &14.09 & 28.76 & 15.38&19.55\\
% EWC & 14.16 & 28.34 & 15.67&19.51\\
% MAS & 14.15 & 28.61 & 15.59&19.37 \\
% L2KD & 35.22 & 61.78 & 44.16&39.94\\
% LAMOL-g & 50.30 & 60.67 & 45.12 & 38.03 \\
% LAMOL-t & 51.81 & 67.97 & 44.83 & 37.58\\
% ER & 78.19 & 78.19  & 44.95 & 39.32\\
% HAT &73.92 & 73.03  & 61.99 & 67.33\\
% CTR & 67.44 & 71.11 & 63.84 & 67.28\\
% AdapterCL & 81.15 & 75.60 & 75.60 &48.47 \\
% PCLL & 90.25 & 88.82 & 74.48 & 68.41\\
% % PCLL(reproduced) & \textbf{90.37} & \textbf{90.30} & \textbf{75.32} & \textbf{73.75}\\
% Multi (Upper Bound) & 96.25 & N/A & 80.80 & N/A\\
% \hline
% % DCL (with KL)  & 92.83 & 91.32 & 76.42 & 73.76 \\
% % DCL (with JS)  & \textbf{93.73} & \textbf{93.04} & \textbf{77.37} & \textbf{74.49}\\
% \textbf{DCL}  & \textbf{93.73} & \textbf{93.04} & \textbf{77.37} & \textbf{74.49}\\
% \bottomrule
% \end{tabular}
% }





\noindent\textbf{KL Knowledge Distillation vs.\ JS Knowledge Distillation.}  We evaluate the impact of Jensen-Shannon (JS) knowledge distillation on the performance of DCL and compare it with DCL equipped with KL knowledge distillation.  Due to space limitations, we report only the performance in the slot filling task under different task learning orders, as summarized in Table~\ref{tab:ablation-kl}; the performance in the intent detection task follows a similar trend.  Our findings consistently demonstrate that the DCL model equipped with JS knowledge distillation outperforms the DCL model with KL knowledge distillation across all task learning orders. Notably, we observe improvements ranging from 0.63\% to 1.55\% in the F1 score and from 0.41\% to 1.11\% in LCA.
A $t$-test comparing DCL with JS knowledge distillation against DCL with KL knowledge distillation yields a $p$-value of 0.0013, indicating that the improvement brought by JS knowledge distillation is statistically significant.
These results provide compelling evidence that JS divergence effectively facilitates knowledge transfer within the DCL framework, underscoring its beneficial role in enhancing model performance.
The same conclusion holds for intent detection, where results similar to those in the slot filling task were observed.
%We evaluate the impact of Jensen-Shannon (JS) Knowledge Distillation on the overall performance of DCL. Table~\ref{tab:ablation-kl} shows the performance differences in the slot filling task for various task learning orders between DCL implementations with KL and with JS knowledge distillation.  We find that the DCL equipped with JS knowledge distillation outperforms the DCL with KL knowledge distillation in all orders. Notably, $0.63\%-1.55\%$ and $0.41\%-1.11\%$ improvements are obtained in the F1 score and LCA, respectively.  This serves as evidence that JS divergence is helpful to knowledge transfer.

% The results show that the model using JS Knowledge Distillation outperforms the one using KL knowledge distillation in F1, and LCA. 


% 1.13 
% 0.63
% 0.73
% 1.03
% 1.55
% 0.64


% 0.84
% 0.41
% 0.72
% 0.41
% 0.89
% 1.11

\begin{table}[t]
\small
\centering
\caption{Slot filling results in F1 score (\%) and LCA (\%) on six task orders with KL and JS knowledge distillation. Both the F1 score and the LCA of DCL improve in all six orders when JS knowledge distillation is used instead of KL knowledge distillation.}
\label{tab:ablation-kl}
\renewcommand{\arraystretch}{1.1} 
\resizebox{0.7\linewidth}{!}
{
\begin{tabular}{lccccc}
\toprule
\multicolumn{1}{c}{\multirow{2}{*}{Orders}} & \multicolumn{2}{c}{DCL (with KL)}  & \multicolumn{2}{c}{DCL (with JS)}                 \\
& \multicolumn{1}{c}{Score $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}  & \multicolumn{1}{c}{Score $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}
\\ \hline
order 0 & 80.26 & 73.82 & 81.39 & 74.66\\
order 1 & 80.31 & 74.82 & 80.94 & 75.23\\
order 2 & 74.04 & 70.01 & 74.77 & 70.73 \\
order 3 & 77.00 & 76.28 & 78.03 & 76.69\\
order 4 & 72.98 & 70.10 & 74.53 & 70.99 \\
order 5 & 73.92 & 77.50  & 74.56 & 78.61 \\
\hline
Mean & 76.42 & 73.76 & \textbf{77.37} & \textbf{74.49}\\
\bottomrule
\end{tabular}
}
% \vspace{-1em}
\end{table}




% \begin{table}[!t]
%   \centering
%   % \resizebox{1\linewidth}{!}{
%   \begin{tabular}{lccc}
%   \toprule
%    % \hline
%    Model & DCL (with KL) & DCL (with JS)   \\
%    \hline
%    \cline{1-3}
%    order 0 & $80.26$ & $81.39$ \\
%    order 1 & $80.31$ & $80.94$ \\
%    order 2 & $74.04$ & $74.77$  \\
%    order 3 & $77.00$ & $78.03$ \\
%    order 4 & $72.98$ & $74.53$ \\
%    order 5 & $73.92$ & $74.56$  \\
%    \hline
%   \end{tabular}
%     \caption{Slot filling results of F1 metrics (\%) on six orders with KL and JS knowledge distillation.}
%     \label{tab:ablation-kl}
%     \vspace{-.5em}
% \end{table}



\noindent\textbf{Number of Pseudo Samples.}
To better understand how the number of pseudo samples influences the performance of the proposed approach, various ratios of pseudo samples are employed in DCL. Table~\ref{tab:ablation-psn} presents the comparison between PCLL and DCL with different ratios of pseudo samples. Specifically, we analyze PCLL with a ratio of 0.2 and DCL with ratios of 0.1, 0.2, 0.4, and 0.5. Although fewer pseudo samples are added to the training, DCL with a ratio of 0.1 still outperforms PCLL with a ratio of 0.2 in terms of both accuracy score and LCA. This can be ascribed to the superiority of DCL. Moreover, we find that the accuracy score can be further improved as the number of pseudo samples increases, from 91.66 at a ratio of 0.1 to 94.23 at a ratio of 0.5. This is because more information is carried by more data samples, which can enhance the model's capabilities. LCA, in contrast, peaks at a ratio of 0.2 and changes only marginally for larger ratios.


\renewcommand{\arraystretch}{1.1} 
\begin{table}[!t]
\small
\centering
\caption{Intent detection results of PCLL with a pseudo sample ratio of 0.2 and DCL with pseudo sample ratios ranging from 0.1 to 0.5.}
\label{tab:ablation-psn}
\resizebox{0.6\linewidth}{!}
{
\begin{tabular}{lcccc}
\toprule
Model &Ratio & Score & LCA
\\ \hline
PCLL& 0.2 &90.25& 88.82\\
DCL&0.1 & 91.66 & 91.83 \\
DCL&0.2 & 93.73 &\textbf{93.04}\\
DCL&0.4 & 93.97 & 92.76 \\
DCL&0.5 & \textbf{94.23} & 92.82  \\
\bottomrule
\end{tabular}
}

% \vspace{-1em}
\end{table}


% \end{minipage}
% \hfill
% \begin{minipage}{.45\linewidth}
% \small
% \centering
% {
% \begin{tabular}{lccc}
% \toprule
% MCL & ACC & LCA
% \\ \hline
% 50 & 92.17 & 91.16 \\
% 256 & 93.73 &93.04\\
% \bottomrule
% \end{tabular}
% }
% \caption{Intent detection of DCL with input maximum context length (MCL) of 50 and 256.} 
% \label{tab:ablation-mcl}
% \vspace{-.5em}
% \end{minipage}
% \end{table}


% \noindent\textbf{Maximum Context Length.} 
% We study the effect of maximum context length on performance by comparing cases with lengths of 256 and 50 in the intent detection task, keeping other parameters constant. Table~\ref{tab:ablation-mcl} details the average performance for six task learning orders. As anticipated, longer input utterances generally contain more information, leading to improved performance.


\noindent\textbf{Evaluating Pseudo Samples Quality.} 
To compare the quality of the pseudo samples generated by the baselines with those generated by our proposed model, we utilize \textbf{Dist-n}~\citep{li2015diversity}, which measures the proportion of distinct n-grams in the generated pseudo samples. A higher Dist-n value means that the samples are more distinct, i.e., more diverse, and is therefore preferred. We report Dist-1, Dist-2, Dist-3, and Dist-4 to assess the quality of the generated samples comprehensively.

Given the limited number of pseudo samples, the quality of our exemplars is crucial to preserve the performance of previous tasks. We aim to carefully select representative and diverse utterances instead of generic and similar ones. Table~\ref{tab:ablation-dist} summarizes the Dist-n results. Notably, DCL achieves higher distinct scores than the other methods, indicating that DCL-generated pseudo samples exhibit larger diversity. Its scores are also the closest to those of the real samples, suggesting that the pseudo samples created by DCL are more similar to real samples.






\begin{table}[t]
\centering
\caption{Distinct scores for generated pseudo samples. A higher Dist-n score means higher diversity.} 
\label{tab:ablation-dist}
\renewcommand{\arraystretch}{1.1} 
\resizebox{0.8\linewidth}{!}
{
\begin{tabular}{lccccc}
\toprule
Model & Dist-1 & Dist-2 & Dist-3 & Dist-4
\\ \hline
LAMOL-g & 0.0602 & 0.2466 & 0.4489 & 0.6178\\
LAMOL-t & 0.1758 & 0.4733 & 0.6837 & 0.8090\\
PCLL & 0.2836 & 0.6566 & 0.8369 & 0.9221 \\
\textbf{DCL} & \textbf{0.3092} & \textbf{0.7019} & \textbf{0.8708} & \textbf{0.9389} \\
\hline
Real Sample & 0.4000 & 0.7972 & 0.9255 & 0.9717\\
\bottomrule
\end{tabular}
}
\end{table}




% \begin{table}[t]
% \centering
% % \renewcommand{\arraystretch}{1} 
% % \resizebox{1\linewidth}{!}
% {
% \begin{tabular}{lccccc}
% \toprule
% \multicolumn{1}{c}{\multirow{2}{*}{Orders}} & \multicolumn{2}{c}{MCL = 50}  & \multicolumn{2}{c}{MCL = 256}                 \\
% & \multicolumn{1}{c}{ACC $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}  & \multicolumn{1}{c}{ACC $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}
% \\ \hline
% order 0 & 92.24 & 91.18 & 93.52 & 92.69 \\
% order 1 & 91.73 & 88.28 & 93.64 & 90.45 \\
% order 2 & 91.52 & 93.13 & 93.74 & 95.47 \\
% order 3 & 92.12 & 91.48 & 93.83 & 93.34 \\
% order 4 & 92.22 & 89.58  & 93.79 & 92.04 \\
% order 5 & 93.21 & 93.28 & 93.85 & 94.22  \\
% \hline
% Mean &92.17 &91.16 &93.73 & 93.04\\
% \bottomrule
% \end{tabular}
% }
% \caption{Intent detection results of DCL evaluated by ACC (\%) and LCA (\%) on six orders with input maximum context length (MCL) as 50 and 256. }
% \label{tab:ablation-mcl}
% \end{table}





% \begin{table}[t]
% \small
% \centering
% {
% \begin{tabular}{lccc}
% \toprule
% MCL & ACC & LCA
% \\ \hline
% 50 & 92.17 & 91.16 \\
% 256 & 93.73 &93.04\\
% \bottomrule
% \end{tabular}
% }
% \caption{Intent detection results of DCL with input maximum context length (MCL) of 50 and 256.} 
% \label{tab:ablation-mcl}
% \vspace{-.5em}
% \end{table}


\noindent\textbf{Dimension of Latent Variable.}
The impact of the latent variable $z$'s dimension is displayed in Table~\ref{tab:ablation-dim-z}. It shows that DCL using JSKD with a small dimension of $8$ presents better performance than DCL using KL with a large dimension of $128$. The DCL model can generate high-quality pseudo samples even with smaller dimensions and less information encoded, leading to improved accuracy. This demonstrates that the Dirichlet latent is superior to the Gaussian counterpart. 
It should be noted that DCL using JSKD with a latent dimension of $8$ obtains a slightly lower Score than that with a dimension of $128$ (93.51 vs.\ 93.73), while its LCA is comparable (93.11 vs.\ 93.04). The lower Score is caused by the shrunk information capacity in the scenario with a smaller $z$ dimension.

\begin{table}[t]
\small
\centering
\caption{Intent detection of DCL (with JS) and DCL (with KL) for different latent variable dimensions.}
\label{tab:ablation-dim-z}
\renewcommand{\arraystretch}{1.1} 
  \resizebox{0.6\linewidth}{!}
 {
\begin{tabular}{lccc}
\toprule
\small{Models} & \small{Score} & \small{LCA}
\\ \hline
DCL (with KL, $z = 128$)  & 92.83 & 91.32 \\
DCL (with JS, $z = 8$)  & 93.51 & \textbf{93.11} \\
DCL (with JS, $z = 128$)  & \textbf{93.73} & 93.04 \\
\bottomrule
\end{tabular}}
\end{table}





\begin{table}[ht]
\small
\centering
\caption{Comparison of Generated Pseudo Samples by PCLL and DCL with the Ground Truth (Golden).}
\label{tab:casestudy}
\begin{tabularx}{\columnwidth}{
  >{\hsize=.4\hsize\raggedright\arraybackslash}X
  >{\hsize=1.8\hsize\raggedright\arraybackslash}X
  >{\hsize=.8\hsize\raggedright\arraybackslash}X
}
\toprule
\textbf{Models}  & \textbf{Input Utterance} & \textbf{Output y} \\
\midrule
\multirow{4}{*}{Golden} & {what's the fuel economy of my car.} & { mpg} \\\cdashline{2-3}
   & { What is the expiration date on my card?} & { expiration date} \\
\midrule
\multirow{4}{*}{PCLL} & {Do they have a lot of miles on this road?} & { mpg} \\\cdashline{2-3}
   & { Do you know how much my new credit card is worth?} & { expiration date} \\
\midrule
\multirow{4}{*}{DCL} & {What is the mpg of this car?} & { mpg} \\\cdashline{2-3}
   &{Can you check my expiration month?} & {expiration date} \\
\bottomrule
\end{tabularx}
\end{table}



% We further investigate the influence of the dimension of $z$ and list the performances in Table~\ref{tab:ablation-dim-z}. The results indicate that DCL using JSKD with a latent dimension of $8$ outperforms DCL using KL with a dimension of $128$, suggesting that the Dirichlet latent outperforms the Gaussian latent. Despite smaller dimensions and consequently less information encoded in the latent $z$, the model can generate superior pseudo samples, which further results in higher accuracy. We also notice that DCL using JSKD with a latent dimension of $8$ underperforms  DCL using JSKD with a latent dimension of $128$. This is understandable as a smaller $z$ dimension carries less information, leading to a performance drop. 
% \begin{table}[t]
% % \small
%   \centering
%   \renewcommand{\arraystretch}{1.3} 
%   \resizebox{1\linewidth}{!}{
%   \begin{tabular}{lcccccc}
%   \toprule
%   \multicolumn{1}{c}{\multirow{2}{*}{\textbf{Orders}}} & \multicolumn{2}{c}{DCL (with JS, z = 8)}  & \multicolumn{2}{c}{DCL (with JS, z = 128)}   & \multicolumn{2}{c}{DCL (with KL, z = 128)}              \\
% & \multicolumn{1}{c}{ACC $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}  & \multicolumn{1}{c}{ACC $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}
% & \multicolumn{1}{c}{ACC $\uparrow$} & \multicolumn{1}{c}{LCA $\uparrow$}
% \\ \hline
%    order 0 &  94.01  &  93.59  &  93.52 &  92.69 &  92.76&  91.27\\
%   order 1 & 93.00  & 90.39 & 93.64 & 90.45 & 92.76 & 88.49\\
%    order 2 & 93.27  & 95.34 & 93.74 & 95.47 & 92.56 & 93.36 \\
%    order 3 & 93.57 & 93.23 & 93.83 & 93.34 & 92.91 & 91.53 \\
%    order 4 & 93.21 & 91.81 & 93.79 & 92.04 & 92.68 & 89.92\\
%    order 5 & 94.02 & 94.32 & 93.85 & 94.22 & 93.33 & 93.32\\
%    \hline
%    Mean & 93.51 & 93.11 & 93.73 & 93.04 & 92.83 & 91.32 \\
%    \bottomrule
%   \end{tabular}}
%     \caption{Intent detection results of DCL (with JS) and DCL (with KL). The results encompass six orders of $z$, with dimensions of 8 and 128 for DCL (with JS), and dimension 128 for DCL (with KL).}
%     \label{tab:ablation-dim-z1}
%     \vspace{-.5em}
% \end{table}




% \begin{table*}[t]
% \small
% \centering
% \begin{tblr}{
%   cell{2}{1} = {r=2}{},
%   cell{4}{1} = {r=2}{},
%   cell{6}{1} = {r=2}{},
%   hline{1-2,4,6,8} = {-}{},
% }
% \textbf{Models}  & \textbf{Input Utterance} & \textbf{Output y} \\
% Golden & {1. what's the fuel economy of my car.} & { 1. mpg} \\
%    & { 2. What is the expiration date on my card?} & { 2. expiration date} \\
% PCLL & {1. Do they have a lot of miles on this road?} & { 1. mpg} \\
%    & { 2. Do you know how much my new credit card is worth?} & { 2. expiration date} \\
% DCL & {1. What is the mpg of this car?} & { 1. mpg} \\
%    &{2. Can you check my expiration month?} & { 2. expiration date} 
% \end{tblr}
% \caption{Comparison of Generated Pseudo Samples by PCLL and DCL against the Ground Truth (Golden).}
% \label{tab:casestudy}
% \vspace{-1em}
% \end{table*}
\noindent\textbf{Exploring Llama 2 as a Backbone.} To explore the scalability of DCL to large language models (LLMs), we further conduct intent detection experiments with Llama 2-7B~\citep{touvron2023llama} as the backbone and compare DCL with PCLL. Table~\ref{tab:llama} shows the results on order 0 of intent detection. We find that DCL outperforms PCLL with a $5.72\%$ improvement, reflecting DCL's efficiency and scalability. We use LoRA~\citep{hu2021lora} to tune both DCL and PCLL so that the models can be fine-tuned on a single GPU, which takes 80 hours of training.




\begin{table}[t]
\small
\centering
\caption{Intent detection of DCL and PCLL with Llama2 backbone.}
\label{tab:llama}
\renewcommand{\arraystretch}{1.1} 
  \resizebox{0.4\linewidth}{!}
 {
\begin{tabular}{lccc}
\toprule
\small{Models} & \small{Score} 
\\ \hline
PCLL (Llama2)  & 66.69  \\
DCL (Llama2)  & 72.41 \\
\bottomrule
\end{tabular}}
\end{table}








\subsection{Case Study}
Table~\ref{tab:casestudy} compares the pseudo samples generated by DCL and PCLL with real samples (Golden) from the intent detection task. A pseudo sample consists of the input utterance (middle column) and the intent (right column). It can be observed that PCLL struggles to generate the intent of specific sentences correctly. For instance, PCLL wrongly generates the intent \textit{``mpg''} (miles per gallon) for the utterance \textit{``Do they have a lot of miles on this road?''}, suggesting it fails to capture the actual meaning of the utterance.
% This sentence does not fall under the \textit{``mpg''} intent as it doesn't contain references to terms like \textit{``gas''}, \textit{``fuel''}, or \textit{``mpg''}.
In addition, for the utterance \textit{``Do you know how much my new credit card is worth?''}, PCLL also makes a mistake: the output \textit{``expiration date''} is irrelevant to the input. In contrast, the utterances generated by DCL, \textit{``What is the mpg of this car?''} and \textit{``Can you check my expiration month?''}, are consistent with their intents \textit{``mpg''} and \textit{``expiration date''}.
% Sentences that should have the intent of \textit{``expiration date''} would look more like \textit{``Can   you check my expiration month?''}, and would typically include keywords such as \textit{``expire''}, \textit{``validity''}, and related terms.

\section{Conclusions}
In this paper, we propose DCL, a generative-based rehearsal method, to alleviate CF for CL in ToDs. 
A Dirichlet distribution-based CVAE is developed to exploit the flexibility of Dirichlet distribution in the process of modeling the utterance-level characteristics and thus
obtain better pseudo sample generation compared to the conventional Gaussian-based CVAE.
% which helps to generate more realistic pseudo samples for the rehearsal than the traditional Gaussian distribution-based CVAE. 
In addition, a more robust JS divergence-based knowledge distillation method is proposed to facilitate knowledge transfer between tasks. Comprehensive experiments show the superiority of the proposed method.



% References
\bibliography{uai2024-template}

\newpage

\onecolumn

\title{Supplementary Material}
\maketitle
\appendix



\section*{Appendix A}
\label{sec:appendixa}
\textbf{Prompt Example.}
The task prompt used for intent detection is \textit{``For an utterance from the ID task, $x$ has the following intent''} and the task prompt used for slot filling is \textit{``In the ID task, if there are any slots and values, what are they in this sentence: $x$? Answer: ''}. For example, when training on a \textit{BANKING} task, the input $x$ is \textit{``Please tell me how to link the card?''}, and the modified $x$ is \textit{``For an utterance from the BANKING task, `Please tell me how to link the card?' has the following intent''}. Thus, the output $y$ is its corresponding intent annotation \textit{``card linking''}.



\section*{Appendix B}
\label{sec:appendixb}
\textbf{Dataset Order} 
Table~\ref{tab:order-intent} summarizes the six dataset orders of the intent detection task, and Table~\ref{tab:order-slot} lists the six dataset orders of the slot filling task.

\begin{table*}[ht]
\small
\centering
\renewcommand{\arraystretch}{1.5} 
\begin{tabular}{lllllllll}
\toprule
Order 1 & TOP\_S1 & HWU     & SNIPS   & BANKING & CLINC  & TOP\_S2 & TOP\_S3 & ATIS  \\
Order 2 & BANKING & HWU     & TOP\_S1 & TOP\_S3 & CLINC  & TOP\_S2 & SNIPS   & ATIS   \\
Order 3 & SNIPS   & ATIS    & TOP\_S2 & TOP\_S3 & CLINC  & BANKING & HWU     & TOP\_S1 \\
Order 4 & CLINC   & SNIPS   & TOP\_S3 & BANKING & TOP\_S2 & HWU     & TOP\_S1 & ATIS   \\
Order 5 & BANKING & TOP\_S2 & TOP\_S1 & ATIS    & TOP\_S3 & HWU     & CLINC   & SNIPS  \\
Order 6 & CLINC   & TOP\_S1 & TOP\_S2 & ATIS    & SNIPS  & HWU     & BANKING & TOP\_S3 \\
\bottomrule
\end{tabular}
\caption{Dataset Orders of Intent Detection Tasks}
\label{tab:order-intent}
% \vspace{-10em}
\end{table*}



\begin{table*}[ht]
\small
\centering
\renewcommand{\arraystretch}{1.5} 
\begin{tabular}{llllll}
\toprule
Order 1 & MIT\_MOVIE & DSTC     & MIT\_RESTAURANT   & SNIPS & ATIS \\
Order 2 & MIT\_MOVIE & SNIPS     & DSTC & MIT\_RESTAURANT & ATIS \\
Order 3 & ATIS   & MIT\_MOVIE    & DSTC & MIT\_RESTAURANT & SNIPS\\
Order 4 & DSTC   & MIT\_RESTAURANT   & MIT\_MOVIE & ATIS & SNIPS\\
Order 5 & MIT\_MOVIE & ATIS & SNIPS & MIT\_RESTAURANT    & DSTC\\
Order 6 & SNIPS   & ATIS & MIT\_RESTAURANT & MIT\_MOVIE    & DSTC\\
\bottomrule
\end{tabular}
\caption{Dataset Orders of Slot Filling Tasks}
\label{tab:order-slot}
% \vspace{-10em}
\end{table*}


% Slot Filling Tasks:\\
% Order 1 MIT_MOVIE, DSTC, MIT_RESTAURANT, SNIPS, ATIS \\
% Order 2 MIT_MOVIE, SNIPS, DSTC, MIT_RESTAURANT, ATIS \\
% Order 3 ATIS, MIT_MOVIE, DSTC, MIT_RESTAURANT, SNIPS \\
% Order 4 DSTC, MIT_RESTAURANT, MIT_MOVIE, ATIS, SNIPS \\
% Order 5 MIT_MOVIE, ATIS, SNIPS, MIT_RESTAURANT, DSTC \\
% Order 6 SNIPS, ATIS, MIT_RESTAURANT, MIT_MOVIE, DSTC\\


\section*{Appendix C}
\label{sec:appendixc}
\textbf{Learning Curve on Intent Detection.} Figure~\ref{fig:curve} plots the learning curves of DCL and PCLL on the intent detection task. The accuracy of DCL surpasses that of PCLL significantly after 6,000 steps.


\begin{figure}[ht]
    \centering
    \includegraphics[width=90mm]{curve.pdf}
    \caption{Learning curves of DCL and PCLL on intent detection task.}
    \label{fig:curve}
\end{figure}





\end{document}
