% \documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts     % 
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{amsmath,amsfonts}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{array}
\usepackage{amssymb, amsmath}
\usepackage[caption=false,font=normalsize,labelfont=sf,textfont=sf]{subfig}
\usepackage{textcomp}
\usepackage{url}
\usepackage{verbatim}
\usepackage{graphicx}
\usepackage{amsthm}
\usepackage{cite}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{booktabs}
\usepackage{makecell}
\usepackage{siunitx}
\usepackage{listings}
\usepackage{dsfont}
\usepackage{amssymb}
\usepackage{newfloat}
\usepackage{listings}
\usepackage{booktabs}
\usepackage{placeins}
\usepackage{tabularx}
\usepackage{rotating}
\usepackage{wrapfig}
\usepackage{caption}
\newtheorem{theo}{Theorem}
\newtheorem{lem}{Lemma}
\newtheorem{rem}{Remark}
\newtheorem{prop}{Proposition}
\newtheorem{dfn}{Definition}
%\newtheorem{proof}{Proof}

\newcommand{\RR}{\mathbb{R}}
\newcommand{\EE}{\mathbb{E}}
\newcommand{\cY}{\mathcal{Y}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cD}{\mathcal{D}}
\newcommand{\cL}{\mathcal{L}}
\newcommand{\cA}{\mathcal{A}}

\title{Concept Forgetting via Label Annealing}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:subhodipp@iisc.ac.in?Subject=Your UAI 2025 paper}{Subhodip Panda$^{*}$}{}}
\author[2]{Ananda Theertha Suresh\thanks{Equal contribution}}
\author[3]{Atri Guha}
\author[1]{Prathosh A.P}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Department of ECE\\
    Indian Institute of Science \\
    Bangalore, India
}
\affil[2]{%
    Google Research \\
    New York, USA
}
\affil[3]{%
    Department of EE \\
    Indian Institute of Technology \\
    Patna, India
  }
  
\begin{document}

\maketitle

\begin{abstract}
The effectiveness of current machine learning models relies on their ability to grasp diverse concepts present in datasets. However, biased and noisy data can inadvertently cause these models to learn certain undesired concepts, undermining their ability to generalize and provide utility. Consequently, modifying a trained model to forget these concepts becomes imperative for their responsible deployment. We refer to this problem as \emph{concept forgetting}. Our goal is to develop techniques for forgetting specific undesired concepts from a pre-trained classification model's prediction. To achieve this goal, we present an algorithm called \underline{L}abel \underline{AN}nealing (LAN). This iterative algorithm employs a two-stage method for each iteration. In the first stage, pseudo-labels are assigned to all the samples by annealing or redistributing the original labels based on the predictions of the model in the current iteration. During the second stage, the model is fine-tuned on this pseudo-labeled dataset generated from the first stage. We illustrate the effectiveness of the proposed algorithms across various models and datasets. Our method reduces \textit{concept violation}, a metric that measures how much the model forgets specific concepts, by about 85.35\% on the MNIST dataset, 73.25\% on the CIFAR-10 dataset, and 69.46\% on the CelebA dataset while maintaining high model accuracy.
Our implementation can be found at the following link: \url{https://github.com/Subhodip123/LAN}.
\end{abstract}

\section{Introduction}
The superior performance capability of deep learning systems is primarily attributed to their ability to learn various concepts inherent in the dataset. For instance, advancements in face recognition systems~\citep{lecun1998gradient,krizhevsky2009learning,he2016deep} can be largely attributed to their ability to discern and characterize different semantic features from facial images, such as age, gender, and facial hair characteristics. Similarly, in tasks involving image and text generation~\citep{dall-e, dall-e2, stable-diffusion}, the ability to learn varied concepts present in the dataset enables the generation of realistic and diverse outputs. Consequently, the efficacy of these models relies upon the acquisition of learned concepts inherent within the dataset. Nevertheless, when the dataset is tainted with noisy samples or biased concepts~\citep{dataset-bais}, these models are susceptible to learning such undesired concepts. For example, suppose we are training a model to predict whether a person should get a bank loan or not. Such a model should not depend on the gender or race of the person. However, it is possible that the machine learning model might inadvertently use these features to make predictions, which is highly undesirable. As a result, there emerges a pressing necessity to forget the undesired biased concept from these trained models to ensure their reliable and accountable deployment. Apart from removing biases, forgetting concepts can prove beneficial in topics such as domain generalization. For example, envision a CelebA~\citep{CelebA} image classifier that heavily relies on background color as a distinguishing feature to classify different celebrities, limiting its ability to generalize effectively. Therefore, in such scenarios, rapidly forgetting only undesired concepts from a pre-trained model, without affecting the ability of the model to use other features, can improve the model's generalization capabilities.
% \textcolor{red}{is the last sentence needed here? Not sure which problem are we referring to. I would remove this.}

 

\noindent \textbf{Motivation:} To make a pre-trained model forget a concept, we start by asking the following question - \emph{what is meant by forgetting?} Our definition of forgetting is motivated by the fact that in the case of humans, if one forgets a concept, the forgotten concept doesn't affect one's decision-making. Thus \emph{concept forgetting} within the context of machine learning entails ensuring that a model's predictions become entirely independent of the targeted forgetting concept. 

However, achieving this task presents challenges, as the goal is to forget a specific undesired concept without adversely affecting the ability of the model to use other concepts. This challenge, well studied in a similar context, is also known as \emph{catastrophic forgetting}~\citep{cat-forgetting-cohen,cat-forgetting-goodfellow,cat-forgetting-kirkpatrick,making-ai-forget}, where adapting a model for new tasks (in our case, the task of \emph{concept forgetting}), can significantly degrade the model's performance on older tasks (in this case, retaining the model's test accuracy). 
Thus, to explore the extent of forgetting concepts in pre-trained models, we pose the following question:

\begin{center}
    \emph{Can we efficiently modify a pre-trained model to forget an undesirable concept while maintaining a low performance degradation?}
\end{center}

Before we proceed further, we first state the differences between \emph{concept forgetting} and \emph{machine unlearning}, the latter of which has been recently used to remove the effect of certain training examples from the model.

\begin{figure}[ht]
    \centering
    \includegraphics[width=0.47\textwidth]{images/LAN_new.drawio.png}
    \caption{\begin{small}
        \textbf{Machine Unlearning vs.\ Concept Forgetting:} In the first scenario (on the left), where the concept to forget is one such as gender, machine unlearning fails. This failure occurs because the dataset includes only males and females, making it impossible to retrain the model without the gender concept. In the second scenario (on the right), when a user requests the removal of concepts related to ``Angelina Jolie,'' unlearning methods, such as the optimal retraining approach, can be used successfully.
    \end{small}}
    \label{fig:mu-failure}
\end{figure}


\textbf{Machine Unlearning vs. Concept Forgetting:} 
\emph{Machine unlearning}~\citep{making-systems-forget,unlearningsurvey1,unlearningsurvey2} aims to remove the influence of particular subsets of training data from a model so that the unlearned model mirrors the behaviour of a retrained model that is trained from scratch without the undesired data subset. The best method to achieve this is by retraining the model from scratch without the unwanted data, although this process can be computationally expensive~\citep{making-ai-forget,rmwhatforget}. However, the goal of \emph{concept forgetting} is to make a model's predictions entirely independent of the targeted forgetting concept. The formal definition of \emph{concept forgetting} is given in Sec.~\ref{problem-formulation}. Given this, we note that \emph{machine unlearning} and \emph{concept forgetting} are two different problem scenarios (see Figure~\ref{fig:mu-failure}). For example, consider the CelebA dataset which contains images of celebrities. Suppose we would like to make the model forget the concept of gender. A \emph{machine unlearning} approach should remove the influence of all examples that have gender and produce an unlearned model that is equivalent to retraining the model on the empty dataset as all CelebA dataset images have gender. However, as we show later, \emph{concept forgetting} can be used to remove the dependence on gender concept. We do note that \emph{machine unlearning} can be potentially used to forget small concepts, which are only present in a subset of examples. For example, if we want to eliminate the concept of a particular celebrity from a classifier trained on the CelebA dataset, we can retrain the model without the images of that celebrity. However, such applications are limited, and our proposed algorithm works for removing the dependence of the model's predictions on undesired concepts.


\textbf{Our contributions:} Our contributions can be summarized as follows:

\textbullet\ We introduce the framework of \emph{concept forgetting} from pre-trained classification models. Motivated by works in fairness~\citep{Dwork-fairness,Hardt-fairness, FERMI}, we propose \emph{concept violation} metric that empirically quantifies the extent to which the model remains dependent on a concept for its predictions.

\textbullet\ We propose an algorithm called \textbf{L}abel \textbf{AN}nealing (\textbf{LAN}). \emph{LAN} employs an iterative approach, where each iteration redistributes the class labels of all data points, thus creating a pseudo-labelled dataset. This redistribution ensures that the empirical \emph{concept violation} on the newly generated pseudo-labeled dataset is essentially zero. This method draws an analogy to the term \emph{annealing} frequently employed in material science. It denotes the controlled redistribution of atoms within a solid material under specific temperature conditions to attain an equilibrium state, which inspired our method's nomenclature. This strategy not only aids in removing the dependence of the undesirable concept from the model's predictions but also is computationally efficient. This method necessitates minimal epochs, sometimes as few as a single epoch, to diminish the reliance of the model's predictions on the forgetting concept, all the while maintaining low performance degradation of the model.

\textbullet\ We demonstrate the efficacy of our algorithm through detailed evaluations on various image classification datasets such as MNIST~\citep{lecun1998gradient}, CIFAR-10~\citep{krizhevsky2009learning}, miniImageNet~\citep{mini-imagenet}, and CelebA~\citep{CelebA} using state-of-the-art image classification models such as MobileNetV2, DenseNet-121, ResNet-50. From Table~\ref{tab:multi-forgetting} and Table~\ref{tab:binary-forgetting} (Appendix~\ref{non-binary-concept-results} and \ref{binary-concept-results}), it can be seen that our method reduces (averaged over several concepts) empirical concept violation by about 85.35\% on the MNIST dataset, 73.25\% on the CIFAR-10 dataset, 17.05\% on the miniImageNet dataset, and 69.46\% (averaged over 81.34\% for binary concepts and 63.52\% for non-binary concepts forgetting) on the CelebA dataset.

\section{Related Works}

\subsection{Fairness}
Fairness in machine learning systems is an important research area aimed at ensuring that system predictions are both accurate and fair across different groups (based on their features) of data points. Earlier works~\citep{Dwork-fairness} initially proposed the notion of \textit{demographic parity} as a preliminary definition of fairness. According to this concept, a machine learning algorithm satisfies demographic parity if the predicted target is independent of sensitive attributes. However, promoting demographic parity may lead to diminished performance, particularly if the true outcome is not independent of sensitive attributes. To address this, subsequent works~\citep{Hardt-fairness} introduced a relaxed notion of fairness based on \textit{equalized odds} and \textit{equal opportunity} definition. 
%Although these methods are typically implemented as post-processing techniques, they are susceptible to adversarial attacks when the post-processing is removed. 
To achieve a fair model, recent works~\citep{fairness-kamishima, fairness-feldman,fairness-zafar,fairness-donni,mary-fairness,fairness-cho,fairness-jiang,fairness-rezaei,FERMI} have explored incorporating different fairness regularizers during the training process itself. These methods incorporate regularization-based techniques based on different statistical distances between the distribution of the model's prediction and sensitive attributes. 

% Additionally, the recent works by \citep{FERMI} proposed an Exponential R\'enyi Mutual Information (ERMI) based state-of-the-art regularization approach that is not only a stronger notion of fairness but also converges theoretically and empirically. 

\par Drawing inspiration from fairness notions, especially \emph{demographic parity}~\citep{Dwork-fairness, FERMI}, we propose that forgetting a particular concept can also be interpreted as achieving independence between the model's prediction and the undesired concept or feature we aim to forget. However, methods focusing on enforcing fairness with respect to the forgetting concept require a large number of epochs to converge and can be computationally inefficient. For instance, according to the state-of-the-art FERMI algorithm~\citep{FERMI}, achieving $\lVert \nabla \ell(\theta,x,y) \rVert \leq \epsilon$, where $\ell$ is the loss function, requires approximately $\mathcal{O}(\frac{1}{\epsilon^4})$ iterations. Empirically, the convergence of the FERMI algorithm varies depending on the dataset and application, typically ranging from as few as 50 to as many as 2000 epochs~\citep{FERMI}. Given our specific objective of forgetting only certain concepts from a model's parameters without affecting others, we aim to devise a more computationally efficient approach for concept forgetting from pre-trained models.


\subsection{Machine Unlearning for training example removal}
Machine unlearning, as described in the literature~\citep{unlearningsurvey1,unlearningsurvey2,making-systems-forget,making-ai-forget,golatkar2020eternal,golatkar2020forgetting,golatkar2021mixed,descent-to-delete,bayesianunlearning,cetified-data-removal,graves2021amnesiac,rmwhatforget}, involves intentionally erasing the impact of particular subsets of training data from a pre-trained model, addressing user privacy concerns. Here, the objective is to craft a computationally efficient method that produces an unlearned model that mirrors the behavior of the model that is trained from scratch, on the training dataset devoid of the sensitive data points. Although retraining serves as the optimal benchmark method for unlearning, this method becomes computationally impractical for large models and iterative unlearning demands~\citep{making-systems-forget,making-ai-forget,rmwhatforget}. Consequently, to address user privacy concerns, more efficient data deletion methods~\citep{making-systems-forget,making-ai-forget} were devised, leading to the emergence of machine unlearning. The machine unlearning methods are broadly categorized into two types: \emph{exact unlearning}~\citep{deltagrad} and \emph{approximate unlearning}~\citep{descent-to-delete, rmwhatforget}. 
Exact unlearning aims to completely eliminate the influence of unwanted data from the trained model, while approximate unlearning methods only partially mitigate data influence, resulting in parameter distributions closely resembling the retrained model. More sophisticated methods~\citep{cetified-data-removal,graves2021amnesiac} have suggested using influence functions, but these are computationally demanding and limited to small convex models. To extend unlearning techniques to non-convex models like deep neural networks,~\citet{golatkar2020eternal,golatkar2020forgetting} introduced a scrubbing mechanism based on the Fisher Information matrix.

\par As we noted earlier, concept forgetting and machine unlearning have fundamental differences in their objectives (Figure~\ref{fig:mu-failure} demonstrates that machine unlearning cannot be applied to a general concept-forgetting setup). \emph{Machine Unlearning} seeks to forget specific data points while emulating the behavior of a retrained model, whereas \emph{Concept Forgetting} aims for the model's predictions to become independent of the forgotten concept.



\section{Preliminaries and Background}
\subsection{Problem Formulation}\label{sec-2.1}
\label{problem-formulation}
Unless otherwise specified, we consider the problem of multi-class classification throughout the paper. Let $\mathcal{Y} \triangleq \{0, 1, 2, \ldots, k-1\}$ denote the set of $k$ labels. Let $z = (x, y)$  denote a data point where $x \in \RR^d$ is the feature and $y \in \mathcal{Y}$ is the label. A dataset $\mathcal{D} \triangleq \{z_i\}_{i=1}^{|\mathcal{D}|}$ is a set of samples sampled from the underlying data distribution $P_{xy}: \RR^d \times \cY \to [0,1]$. Let a categorical concept $\mathcal{C}: \RR^d \times \cY \to \{0, 1, 2, \ldots, m-1\}$ be a mapping from the sample to the set of all possible values the concept can take.  For example, if the concept is binary such as \textrm{beard}, it can take two values $\{0,1\}$ ($m=2$), which denotes the absence and presence of the beard, respectively. Similarly, if the concept is non-binary such as \textrm{facial hair type}, it can take multiple values $\{0,1,2,3\}$ ($m=4$) which signifies no facial hair, mustache, beard, and goatee respectively. Let $h_\theta: \RR^d \to \Delta^{|\cY|}$ denote a classifier parameterized by $\theta \in \RR^p$ where $\Delta$ is the probability simplex. This classifier takes a feature $x \in \RR^d$ and predicts a distribution over the label space. Let $\hat{h} (\theta,z)$ denote a post-processing step on the classifier (e.g. argmax) where a hard label is inferred based on the probability over the labels.

\begin{dfn}
\label{dfn-1}
% can we think of a better name for this?
    \textbf{(Concept neutral):} We call a classifier with parameter $\theta$ concept neutral with respect to a concept $\cC$, if for all output class $y \in \cY$ and all possible concept values $c \in \{0, 1, 2, \ldots, m-1\}$,
    \begin{equation}
        P_{xy}(\hat{h}(\theta, z) = y | \cC(z) = c) = P_{xy}(\hat{h}(\theta, z) = y).
    \end{equation}
\end{dfn}

%  

% Similarly, we define concept-violation as follows:
\begin{dfn}
\label{dfn-2}
    \textbf{(Concept violation):} For a classifier $\hat{h}$ with parameter $\theta$, we measure the violation of concept neutrality in terms of the total variation distance as follows: 
    \begin{align}
    V(\theta, \mathcal{C}, P) 
    &\triangleq \frac{1}{m} \sum^{m-1}_{c=0} 
    d_{\text{TV}}\bigg( P_{xy}(\hat{h}(\theta, z) = y), \notag \\ 
    &\quad P_{xy}(\hat{h}(\theta, z) = y \mid \mathcal{C}(z) = c) \bigg) \notag \\
    &= \frac{1}{2m} \sum^{m-1}_{c=0} \sum^{k-1}_{y=0} 
    \bigg| P_{xy}(\hat{h}(\theta, z) = y) \notag \\ 
    &\quad - P_{xy}(\hat{h}(\theta, z) = y \mid \mathcal{C}(z) = c) \bigg|.
    \end{align}
\end{dfn}


Note that $V(\theta,\cC, P) \in [0, 1]$ and if a model is concept neutral, then $V(\theta,\cC, P) = 0$. As the underlying data distribution $P_{xy}$ is unknown, we have only access to the dataset $\cD$ to empirically estimate concept violation $V(\theta,\cC, P)$ as follows:

\begin{align}
    \hat{V}(\theta, \mathcal{C}, \mathcal{D}) 
    &\triangleq \frac{1}{m} \sum^{m-1}_{c=0} 
    d_{\text{TV}}\bigg( \hat{P}_{\mathcal{D}}(\hat{h}(\theta, z) = y), \notag \\  
    &\quad \hat{P}_{\mathcal{D}}(\hat{h}(\theta, z) = y \mid \mathcal{C}(z) = c) \bigg) \notag \\
    &= \frac{1}{2m} \sum^{m-1}_{c=0} \sum^{k-1}_{y=0} 
    \big| \hat{P}_{\mathcal{D}}(\hat{h}(\theta, z) = y) \notag \\ 
    &\quad - \hat{P}_{\mathcal{D}}(\hat{h}(\theta, z) = y \mid \mathcal{C}(z) = c) \big|. \label{eq-3}
\end{align}


% \begin{align}
%     \hat{V}(\theta, \mathcal{C}, \mathcal{D}) 
%     &\triangleq \frac{1}{m} \sum^{m-1}_{c=0} 
%     d_{\text{TV}}\bigg( \hat{P}_{\mathcal{D}}(\hat{h}(\theta, z) = y),  \hat{P}_{\mathcal{D}}(\hat{h}(\theta, z) = y \mid \mathcal{C}(z) = c) \bigg) \notag \\
%     &= \sum^{m-1}_{c=0} \frac{|\cD_c|}{|\cD|} \sum^{k-1}_{y=0} 
%     \left| \hat{P}_{\mathcal{D}}(\hat{h}(\theta, z) = y) \right. \left. - \hat{P}_{\mathcal{D}}(\hat{h}(\theta, z) = y \mid \mathcal{C}(z) = c) \right|, \label{eq-3}
% \end{align}

where $\hat{P}_\cD(\hat{h}(\theta, z) = y) = \frac{1}{|\cD|} \underset{z \in \cD}{\sum} \mathds{1}(\hat{h}(\theta, z) = y)$, $\cD_c = \{z \in \cD: \cC(z)=c \}$, and $\hat{P}_\cD(\hat{h}(\theta, z) = y \mid \cC(z) = c) = \frac{1}{|\cD_{c}|} \underset{z \in \cD_c}{\sum} \mathds{1}(\hat{h}(\theta, z) = y)$. Now, given a pre-trained model with a parameter $\theta^*$ and a concept $\cC$, the goal of a concept forgetting algorithm is to find the forgotten parameter $\theta_\cC$ such that the algorithm has the following properties:


\label{forgetting-properties}

\textbullet\ \textbf{Minimize empirical concept violation:}  The \emph{empirical concept violation} metric $\hat{V}(\theta_\cC, \cC, \cD)$ measures how `neutral' the forgotten model is with respect to the given concept. For an ideal forgotten model, this metric will be zero, indicating that the model has forgotten the concept. Hence, minimizing concept violation is an important criterion and our goal is to ensure $\hat{V}(\theta_{\cC}, \cC, \cD) \ll \hat{V} (\theta^*, \cC, \cD)$.

\textbullet\ \textbf{Minimize accuracy loss:} Any forgotten model $\theta_\cC$ should exclusively erase the specified concept without erasing other concepts, thereby enabling the retained generalization capabilities to persist. Hence minimizing loss of the forgotten model's test accuracy is one of the important criteria. Let $\ell(\theta, z)$ denote the loss of the model with parameters $\theta$ for a sample $z=(x,y) \in \cD$. 
%The expected test loss of the original model be $\mathds{E}_{z \sim P}[\ell(\theta^*,z)]$. Then 
Hence our goal is to ensure $\left | \underset{z \sim P_{xy}}{\mathds{E}}[\ell(\theta_\cC,z)] - \underset{z \sim P_{xy}}{\mathds{E}}[\ell(\theta^*,z)]  \right |$ to be small.

\textbullet\ \textbf{Small time complexity:} In dynamic environments, rapid model adaptation and updating are vital to remove undesired or outdated information. Thus, it is desirable to have concept-forgetting algorithms that have low computational time for running. 


\section{Methodology}


\subsection{Label Annealing (LAN) Algorithm}

\begin{figure*}[!htbp]
    \centering
    \includegraphics[width=0.95\textwidth]{images/LAN.drawio.png}
    \caption{\textbf{Label Annealing (LAN) methodology} - The task involves forgetting the concept $\cC(z)=c \in \{0,1,2\}$ from a classification task with data points labeled as $j \in \{\text{Class-}0,\text{Class-}1,\text{Class-}2\}$ (blue, red, and yellow, respectively). This iterative method runs $E$ iterations, where the $e^{\text{th}}$ iteration consists of two stages: in the first stage, known as the \emph{label annealing subroutine}, the labels within each concept data subset (e.g., $\cD_0,\cD_1,\cD_2$) are redistributed based on the class predictions of the $e^{\text{th}}$ iteration's model $\theta_e$, denoted as $\hat{p}_{\theta_e}(x,j)$, resulting in the label-annealed dataset $\widetilde{\cD}$. Subsequently, in the next stage, termed \emph{parameter fine-tuning}, we minimize the loss function $\mathcal{L}_{\text{LAN}}(\theta,\widetilde{\cD})$ on $\widetilde{\cD}$ to obtain the final concept-forgotten model $\theta_\cC$.}
    \label{fig:LAN-method}
\end{figure*}  

\textbf{Main intuition:} Recall that our goal is to reduce the concept violation while minimizing accuracy degradation. To obtain the forgotten model $\theta_\cC$, we devise a method called \underline{L}abel \underline{AN}nealing (LAN). In the first stage, LAN tries to create a pseudo-labeled dataset where the empirical concept violation is zero or very low, and then, as part of the second stage, the model is fine-tuned on this pseudo-labeled dataset. The intuition is that since the model is trained on this pseudo-labeled dataset with nearly zero empirical concept violation, the resulting model should also have nearly zero empirical concept violation. Note that there are many ways to pseudo-label the dataset such that the empirical concept violation is zero. We propose a way such that when the model is trained on this pseudo-labeled dataset, the model quality does not degrade much. To this end, if the original model is confident in certain examples, we prefer not to change their labels, and if the model is not confident in certain examples, we allow their labels to be changed so that the concept violation becomes zero. The overall methodology is shown in Figure~\ref{fig:LAN-method}.


\begin{algorithm}[H]
    \caption{\textbf{Label Annealing Subroutine}}
    \label{alg1}
    \centering
    \scalebox{0.75}{%
    \begin{minipage}{\linewidth}
    \textbf{Input}: model parameter $\theta_e \in \RR^p$, dataset $\mathcal{D}$, forgetting concept $\mathcal{C}$ 
    \begin{algorithmic}[1]
    \STATE{For each class $j \in \{0,\ldots,k-1\}$, let $b_{j}$ denote the number of samples $z \in \cD$ with $\arg \max_{j'} p_{\theta_e}(x,j') = j$.}
    \FOR{$c = 0,1,\ldots,m-1$}
    \STATE{Construct $\cD_c = \{z \in \cD: \cC(z)=c\}, n_c = |\cD_c|$}
    \STATE{For each sample $z_i \in \cD_c$, compute $p_{\max}(x_i) = \max_{j} p_{\theta_e}(x_i, j)$. Sort $\cD_c$ in decreasing order of $p_{\max}(x)$.}
    \STATE{For each class $j \in \{0,\ldots,k-1\}$, let $n_{c,j}$ be the number of samples in $\cD_c$ such that $\arg \max_{j'} p_{\theta_e}(x,j') = j$.}
    \STATE{Initialize $\alpha_{c, j} = 0$ for all $j \in \{0,\ldots,k-1\}$.}
    \FOR{$i = 1,2,\ldots, |\cD_c|$}
    \STATE{$\widetilde{y_i} \gets \phi$}
    \STATE{$j^* \gets  \underset{j \in \{0,\ldots,k-1\}}{\arg \max} p_{\theta_e}(x_i, j)$}
    \IF{$\alpha_{c, j^*} < b_{j^*} \cdot n_c / |\cD|$}
    \STATE{$\widetilde{y_i} \gets  j^*$}
    \STATE{$\alpha_{c,\widetilde{y_i} } \gets \alpha_{c,\widetilde{y_i} } + 1$}
    \ENDIF
    \ENDFOR
    \FOR{$i = 1,2,\ldots, |\cD_c|$}
    \IF{$\tilde{y}_i == \phi$}
    \STATE{$j^i \gets \phi$}
    \WHILE{$\widetilde{y_i} == \phi$}
    \STATE{$j_i \gets \underset{j \in \{0,\ldots,k-1\} \setminus j^i}{\arg \max} p_{\theta_e}(x_i, j)$}
    \IF{$\alpha_{c, j_i} < b_{j_i} \cdot n_c / |\cD|$}
    \STATE{$\widetilde{y_i} \gets j_i$}
    \STATE{$\alpha_{c,\widetilde{y_i}} \gets \alpha_{c,\widetilde{y_i}} + 1$}
    \ELSE
    \STATE{$j^i \gets j^i \cup \{j_i\}$}
    \ENDIF
    \ENDWHILE
    \ENDIF
    \ENDFOR
    \ENDFOR
    \STATE{\textbf{Output}: $\widetilde{\cD} \gets \{\widetilde{z_i} = (x_i,\widetilde{y_i})\}^{|\cD|}_{i=1}$}
    \end{algorithmic}
    \end{minipage}
    }
\end{algorithm}

\begin{algorithm}[H]
    \caption{\textbf{Parameter Fine-Tuning}}
    \label{alg2}
    \centering
    \scalebox{0.75}{%
    \begin{minipage}{\linewidth}
    \textbf{Input}: pre-trained parameter $\theta^* \in \RR^p$, dataset $\mathcal{D}$, concept that needs to be forgotten $\mathcal{C}$, batch size $B$, learning rate $\eta$, number of iterations $E$, number of steps $T$.
    \begin{algorithmic}[1]
    \STATE{\textbf{Initialize: } $\theta_e \gets \theta^*$}
    \FOR{$e = 1, \ldots, E$}
    \STATE {$\widetilde{\mathcal{D}} \gets \text{LAN}(\theta_e, \mathcal{D}, \mathcal{C})$}
    \FOR{$t = 1,\ldots,T$}
    \STATE {Draw a random mini-batch of size $B$ from $\widetilde{\mathcal{D}}$, denoted as $\widetilde{\mathcal{D}}^b$}
    \STATE{$\theta_{e} \gets \theta_{e} - \eta \nabla_{\theta} \mathcal{L}(\theta_{e}, \widetilde{\mathcal{D}}^b)$}
    \ENDFOR
    \ENDFOR
    \STATE{\textbf{Output}: $\theta_E$} 
    \end{algorithmic}
    \end{minipage}
    }
\end{algorithm}

% pseudo-labels must result in a minimal change in empirical risk, i.e., $\frac{1}{|\cD|}  \sum^{|\cD|}_{i=1} \left[ \ell(\theta_e, (x_i, y_i)) - \ell(\theta_e, (x_i, \tilde{y}_i)) \right]  \approx 0$. A natural question is to ask how to create pseudo-labels that ensure zero concept violation while preserving model quality. For example, if the loss function is cross-entropy this is given by $\frac{1}{|\cD|}  \left( \log \frac{1}{p_{\theta_e}(x_i, y_i)} - \log \frac{1}{p_{\theta_e}(x_i, \tilde{y}_i)} \right).$ Thus, we would like to change labels for the samples where changing the label does not significantly change the loss. Algorithm~\ref{alg1} is a greedy algorithm that aims to minimize this change in empirical risk while ensuring that the concept neutrality condition is met. In particular, given a model $\theta_e$, the dataset $\cD$, and the user-specified concept to forget $\cC$, a particular $c$-concept violated data subset $\cD_c = \{z \in \cD: \cC(z)=c\}$ is obtained. For each sample $z_i = (x_i, y_i) \in \cD_c$, the maximum probability $p_{\text{max}}(x_i) = \max_j p_{\theta_e}(x_i,j)$ is calculated, and $\cD_c$ is sorted in decreasing order of $p_{\text{max}}(x)$. In this sorted $\cD_c$, each sample $x_i$ is assigned the most probable hard label $\tilde{y}_i$ in such a way that in each $c$-concept violated data subset $\cD_c$, class label redistribution adheres to the $\theta_e$ model's prediction distribution $\hat{p}_{\theta_e}$ on the entire dataset $\cD$ making the concept violation zero.
At the heart of this algorithm is the \emph{label annealing} subroutine, given in Algorithm~\ref{alg1}. Given a model parameter $\theta_e$, training dataset $\cD$, and particular concept $\cC$ targeted for forgetting, this subroutine at a particular iteration $e$ creates a dataset with the same features as $x_i \in \cD$ and with pseudo-labels $\tilde{y}_i$ such that the model $\theta_e$ has zero empirical concept-violation on the newly created dataset $\widetilde{\cD}$. Further, to retain the model's overall performance, this assignment of pseudo-labels must result in a minimal change in empirical risk. Thus, we would like to change labels for the samples where changing the label does not significantly change the loss. To achieve these goals, the whole dataset is divided into concept data sub-groups $\mathcal{D}_c$ for each $c \in \{0,1,\ldots,m-1\}$. Now for each $\mathcal{D}_c$, the first term in Eq.~\ref{eq-3} for a particular class label $j$ would be $\frac{b_j}{|\mathcal{D}|}$ and the second term would be $\frac{n_{cj}}{n_c}$, where $n_c=|\mathcal{D}_c|$, and $b_j$ and $n_{cj}$ are the numbers of samples of class-$j$ predicted by the current model $\theta_e$ on dataset $\mathcal{D}$ and $\mathcal{D}_c$, respectively. In other words, to make the concept violation zero in $\mathcal{D}_c$, the number of samples predicted as class-$j$ in $\mathcal{D}_c$, i.e., $n_{cj}$, must be equal to $b_j \cdot \frac{n_c}{|\mathcal{D}|}$. This is why we need to redistribute the labels of each class-$j$ in $\mathcal{D}_c$ without much affecting the model performance (empirical loss). Thus, to achieve this dual objective of redistributing the labels without much affecting the empirical loss, we calculate $p_{\text{max}}(x_i) = \max_j p_{\theta_e}(x_i,j)$ for each sample $z_i = (x_i, y_i) \in \mathcal{D}_c$, and then $\mathcal{D}_c$ is sorted in decreasing order of $p_{\text{max}}(x)$ (Ref. line 4 in Algorithm~\ref{alg1}). Now, in the second for loop (Ref. 
lines 7--13 in Algorithm~\ref{alg1}) for this sorted $\mathcal{D}_c$, each sample $x_i$ is initialized with label $\tilde{y}_i=\phi$ and iteratively assigned the most probable label $\tilde{y}_i=j^*=\arg \max_j p_{\theta_e}(x_i, j)$ as long as the number of samples in class-$j^*$ is less than $b_{j^*} \cdot \frac{n_c}{|\mathcal{D}|}$. This part of the algorithm ensures that reassigned labels don't change from the initial ones, specifically on those data points where the model is most confident. This is why the deterministic assignment of labels is done on $\mathcal{D}_c$ sorted from higher to lower prediction confidence of the model. Further, in the subsequent for loop (Ref. lines 15--28 in Algorithm~\ref{alg1}), for the data points whose labels are still unassigned, i.e., $\tilde{y}_i = \phi$, the algorithm assigns the subsequent (second, third, and so on) most probable class-$j$ label, provided that the number of samples assigned to that class-$j$ is still less than $b_{j} \cdot \frac{n_c}{|\mathcal{D}|}$. This step tries to redistribute the labels of data points where the model is not confident (low concept violation is achieved by trading off some accuracy). 
Subsequently, in the next stage of \emph{parameter fine-tuning} (Algorithm~\ref{alg2}), we fine-tune the $e^{th}$-model $\theta_e$ on the new dataset $\widetilde{\mathcal{D}}=\bigcup_{c=0}^{m-1} \widetilde{\cD}_c$ to obtain the forgotten model $\theta_{e+1}$ by minimizing the Label Annealing loss function $\mathcal{L}_{\text{LAN}}$ as follows: 
% (\theta,\widetilde{\cD})= \frac{1}{|\widetilde{\cD}|} \sum_{c=0}^{m-1} \sum_{i=1}^{|\widetilde{\cD}_c|} \ell(\theta, \widetilde{z}_i)$.
\begin{align}
\label{LAN-loss}
\mathcal{L}_{\text{LAN}}(\theta, \widetilde{\cD}) &= \frac{1}{|\widetilde{\cD}|} \sum_{c=0}^{m-1} \sum_{i=1}^{|\widetilde{\cD}_c|} \ell(\theta, \widetilde{z}_i).
\end{align}

We repeat this process for $E$ steps to get the final concept-neutral model $\theta_\cC$. The value of $E$ depends on the user's choice. However, to achieve concept forgetting with low computational complexity we experimented with $E=1$ (results of Table~\ref{tab:multi-forgetting} and Table~\ref{tab:binary-forgetting}). Further results on higher values of $E \in \{2,4\}$ are given in the ablation studies section.



\subsection{Theoretical Analysis}

In this section, we theoretically analyze the properties of Algorithm~\ref{alg1} using Lemma~\ref{lemma-1}. Theorem~\ref{theorem-1} shows that the accuracy loss incurred by the \emph{LAN} algorithm relative to the pre-trained model is small if the original model has low concept violation. Recall that $\theta^*$ denotes the input to Algorithm~\ref{alg2} and $\theta_\cC$ denotes the output of our algorithm. With this, let $b_j$ denote the number of samples of class-$j$ in $\cD$ predicted by the initial model. Thus, $n = |\cD| = \sum^{k-1}_{j=0}b_j$. Similarly, let $n_{cj}$ denote the number of samples of class-$j$ in $\cD_c$. Hence, $n_c=|\cD_c|=\sum^{k-1}_{j=0} n_{cj}$. 
Let the number of labels changed by Algorithm~\ref{alg1} (denoted by $\cA$) in the total dataset $\cD$ be $\text{cl}(\cA)$. We first prove the following lemma.
\begin{lem}
\label{lemma-1}
Let $E=1$.   For any concept $\cC$, the number of labels changed by Algorithm~\ref{alg1}  $\text{cl}(\cA) \leq
    2 n m \cdot \hat{V}(\theta^*,\cC,{\cD})$.
\end{lem}

\begin{theo}
\label{theorem-1}
Let the loss function be bounded i.e., $\forall \theta, z$ $\ell(\theta,z) \leq L$. If the fine-tuning reduces the loss on $\widetilde{{\cD}}$ i.e., $ \EE \left[\mathcal{L}_{\widetilde{\mathcal{D}}}(\theta_\cC) \right] \leq \mathcal{L}_{\widetilde{\mathcal{D}}}(\theta^*)$, then 
    \begin{equation}
        \EE \left[\mathcal{L}_\mathcal{D}(\theta_\cC) \right] \leq \mathcal{L}_\mathcal{D}(\theta^*) + 4L Em \cdot \hat{V}(\theta^*, \cC,{\cD})
    \end{equation}
    where the expectation is over the randomization in the stochastic gradients in Algorithm~\ref{alg2}.
\end{theo}

The above bound implies that if the original concept violation is small, then the performance of the new model (trained on $\widetilde{\cD}$) will not degrade significantly. In particular, if the original concept violation is zero, then the loss of the forgotten model is the same as the loss of the original model. Furthermore, while the upper bound degrades with $E$, as we show in experiments, the performance improves or remains the same with an increasing value of $E$. Due to space constraints, we provide the proof of the above lemma and theorem in Appendix~\ref{app:theory}.


% In this section, we theoretically show that the proposed \emph{LAN} algorithm (denoted by $\cA$) retains its accuracy if the forgotten model has low concept violation. Let's denote the pre-trained model $\theta^*$ and the forgotten model as $\theta_\cC$ obtained by finetuning $\theta^*$ on the label annealed dataset $\widetilde{\mathcal{D}}$ using the LAN loss(~\eqref{LAN-loss}) i.e., $\theta_\cC = \cA_{\widetilde{\mathcal{D}}}(\theta^*)$. Now, $\mathcal{L}_\cD(\theta^*)$ and $\mathcal{L}_\cD(\theta_\cC)$ denote the empirical losses of the pre-trained model and forgotten model on the initial dataset $\cD$ respectively. Let the number of labels changed by algorithm $\cA$ in the total dataset $\cD$ be $\text{cl}(\cA)$ and in concept data subset $\cD_c$ be  $\text{cl}(\cA)_c$. Now the empirical concept violation of the forgotten model $\theta_\cC$ on the label annealed dataset $\widetilde{\cD}$ for a particular concept $\cC=c$ be $\hat{V}(\theta_\cC,c,\widetilde{\cD})$ then following are proved:
% \begin{lem}
% \label{lemma-1}
%     If $\theta_\cC = \arg \min_\theta \mathcal{L}_{LAN} (\theta,\widetilde{\mathcal{D}})$ is the perfect minimizer then   $\text{cl}(\cA)_c \leq
%     2mn_c\hat{V}(\theta_\cC,c,\widetilde{\cD})$
% \end{lem}
% Due to space constraints, we relegate the proof to Appendix~\ref{proof-lemma}. Using this lemma, we prove the following result on the final loss of the forgotten model. 
% \begin{theo}
% \label{theorem-1}
%     If $\forall \theta, z$ the loss $\ell(\theta,z)$ is bounded i.e. $\ell(\theta,z) \leq B$  and $\forall z = \widetilde{z} $, $\ell(\theta,z) = \ell(\theta,\widetilde{z})$ then $\mathcal{L}_\mathcal{D}(\theta_\cC) \leq \mathcal{L}_\mathcal{D}(\theta^*) + 4Bm \sum^{m-1}_{c=0} \frac{n_c}{n} \hat{V}(\theta_\cC,c,\widetilde{\cD})$
% \end{theo}
% The proof of the above theorem is given in the appendix \ref{proof-theorem}.    To show that the performance of $\theta_\cC$ doesn't degrade much it is useful to show $\mathcal{L}_\cD(\theta^*) \approx \mathcal{L}_\cD(\theta_\cC)$. Now, from the upper bound on $\mathcal{L}_\cD(\theta_\cC)$ in Theorem \ref{theorem-1} it is inferable that if the forgotten model has very low concept valuation then the  performance of the forgotten model $\theta_\cC$ doesn't degrade much.

\section{Experiments and Results}

\subsection{Datasets and Models}
For our experiments, we consider mainly forgetting two types of concepts: \emph{binary-concept} ($m=2$) and \emph{non-binary concept} ($m>2$). We have used different image classification models such as 2-layer-MLP (hidden layer size 500), Mobinetv2~\citep{mobinetv2}, Densenet-121~\citep{densenet}, Resnet-50~\citep{he2016deep}. Further to show the applicability of our method for different classification tasks across diverse datasets, we have used MNIST~\citep{lecun1998gradient}, CIFAR-10~\citep{krizhevsky2009learning}, miniImageNet~\citep{mini-imagenet}, and CelebA~\citep{CelebA} datasets. Different concept forgetting scenarios for $E=1$ can be seen from Table~\ref{tab:multi-forgetting} and Table~\ref{tab:binary-forgetting}. Further details about the datasets and models are included in the Appendix~\ref{dataset-model}. 

\subsection{Evaluation Metrics}
To evaluate the efficacy of any concept-forgetting algorithm we propose two different metrics as defined below:

\textbullet\ \textbf{Empirical concept violation:}
This metric, denoted as $\hat{V}(\theta_\cC,\cC,\cD)$ and defined in Eq.~\ref{eq-3}, quantifies the concept neutrality of the forgotten model $\theta_\cC$. Observe that $\hat{V}(\theta_\cC,\cC,\cD) \in [0,1]$, and a smaller $\hat{V}(\theta_\cC,\cC,\cD)$ signifies that the model is more conceptually neutral regarding the forgetting concept $\cC$. In the rest of the section, we denote $\hat{V}(\theta_\cC,\cC,\cD)$ as $\hat{V}_\cC$.

\FloatBarrier
%% CelebA visual results
\begin{figure*}[!t]
\centering
% First row (3 subfigures)
\subfloat[\begin{small}(\underline{Facial hair}, Resnet-50, CelebA)\end{small}]{
    \includegraphics[width=0.32\textwidth]{revised_plots/celeba_resnet50_heavy_makeup_facial_hair.png}
}\hfill
\subfloat[\begin{small}(\underline{Facial hair}, Resnet-50, CelebA)\end{small}]{
    \includegraphics[width=0.32\textwidth]{revised_plots/celeba_resnet50_attractive_facial_hair.png}
}\hfill
\subfloat[\begin{small}(\underline{Triceratops}, Resnet-50, miniImageNet)\end{small}]{
    \includegraphics[width=0.32\textwidth]{revised_plots/IMAGENET_Resnet50_Class3.png}
}\\
% Second row (3 subfigures)
\subfloat[\begin{small}(\underline{Bugs}, Resnet-50, miniImageNet)\end{small}]{
    \includegraphics[width=0.32\textwidth]{revised_plots/IMAGENET_Resnet50_Class30.png}
}\hfill
\subfloat[\begin{small}(\underline{Frog}, Densenet-121, CIFAR-10)\end{small}]{
    \includegraphics[width=0.32\textwidth]{revised_plots/CIFAR10_DensetNet_Class6.png}
}\hfill
\subfloat[\begin{small}(\underline{Frog}, Mobinetv2, CIFAR-10)\end{small}]{
    \includegraphics[width=0.32\textwidth]{revised_plots/CIFAR10_MobiNet_Class6.png}
}\\
% Third row (2 subfigures)
\subfloat[\begin{small}(\underline{Digit-3}, MLP, MNIST)\end{small}]{
    \includegraphics[width=0.4\textwidth]{revised_plots/MNIST_MLP_Class3.png}
}
\subfloat[\begin{small}(\underline{Digit-3}, Resnet-50, MNIST)\end{small}]{
    \includegraphics[width=0.4\textwidth]{revised_plots/MNIST_RESNET50_Class3.png}
}
\caption{\textbf{Concept violation vs. accuracy trade-off:} We have plotted concept violation on the y-axis and accuracy on the x-axis. Each point represents an algorithm with a hyper-parameter, and the \emph{Fit} line for an algorithm is obtained by a linear fit of all experiments corresponding to the algorithm. The underlined concepts are forgotten in different settings as follows: Figures (a) and (b) show forgetting facial hair from the task of heavy makeup vs. not-heavy makeup and attractive vs. not-attractive classification respectively, on CelebA. Figures (c) and (d) show different concepts forgotten from pre-trained Resnet-50 on the miniImageNet dataset. Figures (e) and (f) show concept forgetting from pre-trained Densenet-121 and Mobinetv2, respectively, on the CIFAR-10 dataset. Figures (g) and (h) show digit-3 concept forgetting from pre-trained MLP and Resnet-50 models, respectively, on the MNIST dataset. It can be seen that increasing accuracy increases concept violation. Thus, for a particular achievable accuracy, \emph{LAN} achieves lower concept violation than other baseline methods.}
\label{figure-3}
\end{figure*}
\FloatBarrier

\textbullet\ \textbf{Test accuracy:} Any concept forgetting algorithm mustn't render the initial model ineffective during the forgetting process. Therefore, maintaining low performance degradation, i.e., an accuracy close to that of the initial model $\theta^*$, is desirable. This metric is denoted by $A_\cD$.

\subsection{Baselines}
To the best of our knowledge, this is the first work that introduces \emph{concept forgetting} as a property of the forgotten model to induce independence from the forgetting feature during its prediction task. Thus, for proper evaluation of our method, we adopt several baselines from fairness because these baseline methods also advocate for the independence of prediction and sensitive concept features. In particular, we have used three baseline methods: (a) FERMI~\citep{FERMI}, (b) Continuous-Fairness~\citep{mary-fairness}, and (c) Fairness-KDE~\citep{fairness-kde}. We have used the official implementations of both the FERMI and Continuous-Fairness baselines, while for Fairness-KDE an open-source implementation has been used. Further details about the baselines can be found in Appendix~\ref{baselines}.

\subsection{Results: Binary and Non-Binary Concept Forgetting}

We evaluated our approach for different classification scenarios to forget both binary concepts with $c\in\{0,1\}$ ($m=2$) 
and non-binary concepts with $c\in\{0,1,\ldots,m-1\}$ ($m >2$). For example, as illustrated in Table \ref{tab:binary-forgetting} (Appendix~\ref{binary-concept-results}), in the context of the MNIST digit classification problem, the objective is to forget a particular class digit concept e.g. class-3 data. Thus, here $c=0$ represents concepts of non-digit-3 data and $c=1$ represents concepts of digit-3 data. Similarly, in the CelebA dataset for gender concept $c=0$ represents male and $c=1$ represents female. Table~\ref{tab:multi-forgetting} (Appendix~\ref{non-binary-concept-results}) illustrates LAN method's performance for forgetting non-binary concepts. In this scenario, we forget certain features from a pre-trained classifier in the process of classifying other features. For example, while classifying samples as young vs. not-young, we aim to forget subtle feature concepts such as hair color and facial hair from the pre-trained models. As there exists a trade-off between our two metrics of interest, for proper evaluation of our method with FERMI~\citep{FERMI}, Continuous-Fairness~\citep{mary-fairness}, and Fairness-KDE~\citep{fairness-kde} baselines, concept-violation vs. accuracy trade-off plots are depicted in Figure~\ref{figure-3}. It can be seen from Figure~\ref{figure-3} that our method performs significantly better than other baseline methods in terms of achieving a better trade-off. From these plots, it can be seen that at a particular accuracy, our method achieves a lower concept violation (LAN trade-off curve lies below the other baseline methods) than other baseline methods.





\subsection{Ablation Study}

\noindent \textbf{Effect of learning rate:} Table~\ref{table-3} demonstrates the performance of the LAN method for different learning rates. It can be seen that, as the learning rate increases, the accuracy decreases, while the concept violation decreases at first but starts to increase afterward. Thus, at higher accuracy regions, concept violation decreases along with accuracy, whereas at lower accuracy regions concept violation increases with a decrease in accuracy. This suggests that an optimal point lies on the trade-off curve where concept violation is low with a slight reduction of accuracy.

\begin{table}[!htbp]
\centering
\caption{Empirical concept violation $\hat{V}_\cC$ and accuracy $A_\cD$ for different learning rates.}
\label{table-3}
\resizebox{0.47\textwidth}{!}{%
\begin{tabular}{cccc|cc}
\toprule
\textbf{Dataset} & \textbf{Models} & \textbf{Concepts} & \textbf{Learning Rates} & $\hat{V}_\cC$ & $A_\cD$ \\
\midrule
\multirow{5}{*}{\textbf{MNIST}} & \multirow{5}{*}{\makecell{\textbf{2-layer} \\ \textbf{MLP}}} & \multirow{5}{*}{\textbf{Digit-3}} 
    & 1.00e-07 & 0.476 & 0.973 \\
    &  &  & 1.00e-05 & 0.091 & 0.884 \\
    &  &  & 0.0001 & 0.055 & 0.883 \\
    &  &  & 0.001 & 0.148 & 0.876 \\
    &  &  & 0.005 & 0.257 & 0.842 \\
\midrule
\multirow{5}{*}{\textbf{CIFAR-10}} & \multirow{5}{*}{\textbf{Mobinetv2}} & \multirow{5}{*}{\textbf{Frog}} 
    & 1.00e-07 & 0.481 & 0.9253 \\
    &  &  & 1.00e-05 & 0.141 & 0.861 \\
    &  &  & 0.0001 & 0.108 & 0.856 \\
    &  &  & 0.001 & 0.170 & 0.810 \\
    &  &  & 0.005 & 0.210 & 0.6371 \\
\midrule
\multirow{5}{*}{\textbf{miniImageNet}} & \multirow{5}{*}{\textbf{Resnet-50}} & \multirow{5}{*}{\textbf{Triceratops}} 
    & 1.00e-07 & 0.506 & 0.968 \\
    &  &  & 1.00e-05 & 0.419 & 0.959 \\
    &  &  & 0.0001 & 0.4166 & 0.8564 \\
    &  &  & 0.001 & 0.453 & 0.477 \\
    &  &  & 0.005 & 0.833 & 0.034 \\
\midrule
\multirow{5}{*}{\textbf{CelebA}} & \multirow{5}{*}{\textbf{Resnet-50}} & \multirow{5}{*}{\makecell{\textbf{Facial Hair} \\ \textbf{(Attractive vs.} \\ \textbf{not-Attractive})}} 
    & 1.00e-08 & 0.234 & 0.826 \\
    &  &  & 1.00e-06 & 0.103 & 0.817 \\
    &  &  & 0.0001 & 0.076 & 0.800 \\
    &  &  & 0.001 & 0.120 & 0.802 \\
    &  &  & 0.01 & 0.320 & 0.680 \\
\bottomrule
\end{tabular}%
}
\end{table}




\noindent \textbf{Performance at higher values of E:} Further in Figure~\ref{figure-4}, we demonstrate the effectiveness of \emph{LAN} over multiple iterations $E \in \{2,4\}$. We present concept violation vs. accuracy trade-off plot to forget the facial hair concept while classifying attractive vs. not-attractive on the CelebA dataset.  As $E$ increases, at higher accuracy regions, the concept violation further decreases for the same accuracy value making the trade-off plot flatter. This suggests that increasing values of $E$ result in flatter trade-off curves, signifying better performance.

\begin{figure}[!htbp]
\centering
\includegraphics[width=0.47\textwidth]{revised_plots/Resnet50_Celeba_Attractive_Facial_Hair_epoch_study.png}
\caption{Concept violations vs. accuracy plots for \emph{LAN} method at higher values of $E$}
\label{figure-4}
\end{figure}



\section{Conclusion and Limitations}
In the pursuit of safer and more responsible machine learning, the elimination of undesired concepts from models is crucial. Our work focuses on efficiently removing these undesired concepts from pre-trained classification models, a task that is challenging due to the degradation in generalization performance, which can render the forgotten model ineffective. To address this, we propose a computationally efficient algorithm termed as \underline{L}abel \underline{AN}nealing (LAN) algorithm to create a forgotten model while preserving its ability to generalize. We define \emph{concept forgetting} as the property of a model to disregard undesired concepts during its decision-making process and introduce \emph{concept neutrality} as a necessary attribute of a forgotten model. To quantify the extent of \emph{concept neutrality} in any model, we propose a novel metric called \emph{concept violation}. Our experimental results demonstrate that our method effectively reduces \emph{concept violation} while maintaining the model's performance across multiple concept-forgetting settings, various models, and datasets. Additionally, we acknowledge that our definition and method are limited only to concept forgetting in classification models. Further research is needed to develop definitions and methods for concept forgetting that generalize to generative models as well.

% \begin{acknowledgements} % will be removed in pdf for initial submission,
% 						 % (without ‘accepted’ option in \documentclass)
%                          % so you can already fill it to test with the
%                          % ‘accepted’ class option
%     Briefly acknowledge people and organizations here.

%     \emph{All} acknowledgements go in this section.
% \end{acknowledgements}

% References
\bibliography{uai2025-template}

\newpage

\onecolumn

\title{Concept Forgetting via Label Annealing\\(Appendix)}
\maketitle


\appendix
\section{Theoretical Analysis}
\label{app:theory}

\subsection{Proof of Lemma-1}
We recall some of the notation used in the algorithm. $b_j$ denotes the number of samples of class-$j$ in $\cD$ predicted by the initial model. Thus, $n = |\cD| = \sum^{k-1}_{j=0}b_j$. Similarly, let $n_{cj}$ denote the number of samples of class-$j$ in $\cD_c$. Hence, $n_c=|\cD_c|=\sum^{k-1}_{j=0} n_{cj}$. 
Let the number of labels changed by Algorithm~\ref{alg1} (denoted by $\cA$) in the total dataset $\cD$ be $\text{cl}(\cA)$ and in concept data subset $\cD_c$ be  $\text{cl}(\cA)_c$. We now prove Lemma~\ref{lemma-1}.


\begin{proof}
For a particular concept value $\cC=c$, the label annealing subroutine (Algorithm~\ref{alg1}) changes the concept data subset to $\widetilde{\cD}_c$ by redistributing the labels of the samples in $\cD_c$. Let $T_{cj} = \min(n_{cj},b_j\frac{n_c}{|\cD|})$ and let $\alpha_{cj}$ be the number of samples for class-$j$ in $\cD_c$ assigned by the algorithm in the current run. By closely observing Algorithm~\ref{alg1}, it can be seen that in the first phase (second \emph{for} loop) the algorithm tries to retain the original labels of the data as long as $\alpha_{cj} < b_j\frac{n_c}{|\cD|}$, while in the second phase (third \emph{for} loop) the labels are assigned to the next most likely classes. Thus, the following propositions hold:
\begin{itemize}
\item    If $T_{cj} = n_{cj}$, then the number of labels changed for class-$j$ in $\cD_c$ termed as $\text{cl}(\cA)_{cj}=0$.
\item If $T_{cj} = b_j\frac{n_c}{|\cD|}$, then the number of labels changed for class-$j$ in $\cD_c$ termed as $\text{cl}(\cA)_{cj}=\left\lvert n_{cj}-b_j\frac{n_c}{|\cD|} \right \rvert$.
\end{itemize}
Hence, in the worst-case scenario, the number of labels changed for class-$j$ in $\cD_c$ is $\text{cl}(\cA)_{cj} = \bigg|n_{cj}-b_j\frac{n_c}{|\cD|}\bigg|$. Therefore,
\begin{align}
    \text{cl}(\cA)_c \leq \sum^{k-1}_{j=0} \text{cl}(\cA)_{cj}
    =  \sum^{k-1}_{j=0} \bigg|n_{cj}-b_j\frac{n_c}{|\cD|}\bigg|. \label{eq-5}
\end{align}
 Now, for a particular concept $\cC=c$ the empirical concept violation of the initial model $\theta^*$ on $\cD$ is as follows:
\begin{align}
    \hat{V}(\theta^*, \cC = c, \cD) &= \frac{1}{2}\sum^{k-1}_{j=0}\bigg|\hat{P}_{{\cD}}(\hat{h}(\theta^*, z) = j) - \hat{P}_{{\cD}}(\hat{h}(\theta^*, z) = j \mid \cC = c)\bigg| \\
    &= \frac{1}{2}\sum^{k-1}_{j=0}\bigg|\frac{b_j(\theta^*)}{|{\cD}|} - \frac{n_{cj}(\theta^*)}{n_c}\bigg| \\
     & \geq \frac{1}{2n_c} \text{cl}(\cA)_c.
\end{align}
Here the last inequality follows from~\eqref{eq-5}. Hence, $\text{cl}(\cA)_c \leq 2 n_c \hat{V}(\theta^*, \cC = c, \cD) \leq 2 n \hat{V}(\theta^*, \cC = c, \cD)$. Summing over all concepts $c$ results in the lemma.
\end{proof}

\subsection{Proof of Theorem-1}


\begin{proof}[Proof]

We will use the above lemma to prove our main result. We provide the proof for $E=1$. The proof for larger values of $E$ follows by a telescoping sum of the epochs. Let $\mathcal{L}_\cD(\theta^*)$ and $\mathcal{L}_\cD(\theta_\cC)$ denote the empirical losses of the pre-trained model and the forgotten model on the initial dataset $\cD$, respectively. Now, following the notation from the above proof of Lemma~\ref{lemma-1}, the number of labels changed in the whole dataset $\widetilde{\cD} = \bigcup^{m-1}_{c=0} \widetilde{\cD}_c$ is $\text{cl}(\cA)$. We now upper bound the empirical loss of $\theta_\cC$ on $\cD$ as follows:

% \begin{align}
%     \text{cl}(\cA) = \sum^{m-1}_{c=0} \text{cl}(\cA)_c 
%     \leq \sum^{m-1}_{c=0} 2n_c \hat{V}(\theta^*,c,{\cD})
%     \label{eq-11}
% \end{align}
% Now, 
\begin{align}
 \EE\left[   \cL_{\cD}(\theta_\cC) \right] &=  \EE\left[   \cL_{\widetilde{\cD}}(\theta_\cC) + \cL_{\cD}(\theta_\cC) - \cL_{\widetilde{\cD}}(\theta_\cC) \right]\\
    &=  \EE\left[   \cL_{\widetilde{\cD}}(\theta_\cC) + \frac{1}{n} \bigg[\sum_{z_i \in \cD}\ell(\theta_\cC, z_i) - \sum_{z_i \in \widetilde{\cD}}\ell(\theta_\cC, z_i)\bigg]  \right]   \\
    &\stackrel{(c)}{\leq}  \EE\left[   \cL_{\widetilde{\cD}}(\theta_\cC)\right] + \frac{L}{n} \text{cl}(\cA) \\
    &\stackrel{(d)}{\leq} \cL_{\widetilde{\cD}}(\theta^*) + \frac{L}{n} \text{cl}(\cA) \\
    &= \cL_{\cD}(\theta^*) + \cL_{\widetilde{\cD}}(\theta^*) - \cL_{\cD}(\theta^*) + \frac{L}{n} \text{cl}(\cA) \\
    &\stackrel{(e)}{\leq} \cL_{\cD}(\theta^*) + \frac{2L}{n} \text{cl}(\cA) \\
    &\stackrel{(f)}{\leq} \cL_{\cD}(\theta^*) + 4L m \hat{V}(\theta^*, \cC,{\cD})
\end{align}
Here $(c)$ and $(e)$ hold because, $\forall \theta$, if $z=\widetilde{z}$ then $\ell(\theta,z) = \ell(\theta,\widetilde{z})$, and because 
%Thus the loss where the labels are not changed cancels out and only losses where labels are changed matter. 
$\ell(\theta,z) \leq L$. $(d)$ holds because of the assumption. Finally applying Lemma~\ref{lemma-1}, we get $(f)$.
\end{proof}

% \subsection{Proof of Lemma~\ref{lemma-1}} \label{proof-lemma}
% Let's denote $b_j$ the number of samples of class-$j$ in $\cD$ predicted by the initial model. Thus, $n = |\cD| = \sum^{k-1}_{j=0}b_j$. Similarly, let and $n_{cj}$ for the number of samples of class-$j$ in $\cD_c$. Hence, $n_c=|\cD_c|=\sum^{k-1}_{j=0} n_{cj}$. For a particular concept value $\cC=c$ the label annealing subroutine Algorithm~\ref{alg1} changes the concept data subset to $\widetilde{\cD}_c$ by redistributing the labels of the samples in $\cD_c$. Let $T_{cj} = \min(n_{cj},b_j\frac{n_c}{|\cD|})$ and $\alpha_{cj}$ be the number of samples for class-$j$ in $\cD_c$ assigned by the algorithm in current run. By closely observing algorithm~\ref{alg1} it can be said that the first phase (second \emph{for} loop) algorithm tries to retain the original labels of the data until $\alpha_{c,j} < b_j\frac{n_c}{|\cD|}$ while in the second phase (third \emph{for} loop) the labels are assigned to other most likelihood classes. Thus the following proposition holds:
% \begin{prop}
%     If $T_{cj} = n_{cj}$, then the number of labels changed for class-$j$ in $\cD_c$ termed as $\text{cl}(\cA)_{cj}=0$.
% \end{prop}
% \begin{prop}
%     If $T_{cj} = b_j\frac{n_c}{|\cD|}$, then the number of labels changed for class-$j$ is in $\cD_c$ termed as $\text{cl}(\cA)_{cj}=\left\lvert n_{cj}-b_j\frac{n_c}{|\cD|} \right \rvert$.
% \end{prop}

% Thus in the worst-case scenario number of labels changed for class-$j$ in $\cD_c$ , $\text{cl}(\cA)_{cj} = \bigg|n_{cj}-b_j\frac{n_c}{|\cD|}\bigg|$ 

% \begin{align}
%     \text{cl}(\cA)_c \leq \sum^{k-1}_{j=0} \text{cl}(\cA)_{cj}
%     =  \sum^{k-1}_{j=0} \bigg|n_{cj}-b_j\frac{n_c}{|\cD|}\bigg| \label{eq-5}
% \end{align}

% The inequality~\ref{eq-5} holds due to the group effect of different labels. Now, for a particular concept $\cC=c$ the empirical concept violation of the forgotten model $\theta_\cC$ on $\widetilde{\cD} = \bigcup^{m-1}_{c=0} \widetilde{\cD}_c$ is as follows:
% \begin{align}
%     \hat{V}(\theta_\cC, \cC = c, \widetilde{D}) &= \frac{1}{2m}\sum^{k-1}_{j=0}\bigg|\hat{P}_{\widetilde{\cD}}(\hat{h}(\theta_\cC, z) = j) - \hat{P}_{\widetilde{\cD}}(\hat{h}(\theta_\cC, z) = j \mid \cC = c)\bigg| \\
%     &= \frac{1}{2m}\sum^{k-1}_{j=0}\bigg|\frac{b_j(\theta_\cC)}{|\widetilde{\cD}|} - \frac{n_{cj}(\theta_\cC)}{n_c}\bigg| \\
%     &\stackrel{(a)}{\approx} \frac{1}{2m}\sum^{k-1}_{j=0}\bigg|\frac{b_j(\widetilde{\cD})}{|\widetilde{\cD}|} - \frac{n_{cj}(\widetilde{D})}{n_c}\bigg| \\
%     &\stackrel{(b)}{\geq} \frac{1}{2m} \frac{\text{cl}(\cA)_c}{n_c} \\
%     &\implies \text{cl}(\cA)_c \leq
%     2mn_c \hat{V}(\theta_\cC,c,\widetilde{\cD})
% \end{align}

% In the above equation $(a)$ holds as $\theta_\cC$ is the perfect minimizer of LAN loss as per the assumption thus capturing the underlying uncertainty in $\widetilde{\cD}$. $(b)$ holds due to the inequality~\ref{eq-5}.
    

% \subsection{Proof of Theorem~\ref{theorem-1}} \label{proof-theorem}

% Let's denote $\mathcal{L}_\cD(\theta^*)$ and $\mathcal{L}_\cD(\theta_\cC)$ denote the empirical losses of the pre-trained model and forgotten model on the initial dataset $\cD$ respectively. Now following the notations from the above proof of Lemma~\ref{lemma-1} the number of labels changed in the whole dataset $\widetilde{\cD} = \bigcup^{m-1}_{c=0} \widetilde{\cD}_c$ is $\text{cl}(\cA)$.

% \begin{align}
%     \text{cl}(\cA) = \sum^{m-1}_{c=0} \text{cl}(\cA)_c 
%     \leq \sum^{m-1}_{c=0} 2mn_c \hat{V}(\theta_\cC,c,\widetilde{\cD})
%     \label{eq-11}
% \end{align}
% Now, 
% \begin{align}
%     \cL_{\cD}(\theta_\cC) &= \cL_{\widetilde{\cD}}(\theta_\cC) + \cL_{\cD}(\theta_\cC) - \cL_{\widetilde{\cD}}(\theta_\cC) \\
%     &= \cL_{\widetilde{\cD}}(\theta_\cC) + \frac{1}{n} \bigg[\sum_{z_i \in \cD}\ell(\theta_\cC, z_i) - \sum_{z_i \in \widetilde{\cD}}\ell(\theta_\cC, z_i)\bigg] \\
%     &\stackrel{(c)}{\leq} \cL_{\widetilde{\cD}}(\theta_\cC) + \frac{B}{n} \text{cl}(\cA) \\
%     &\stackrel{(d)}{\leq} \cL_{\widetilde{\cD}}(\theta^*) + \frac{B}{n} \text{cl}(\cA) \\
%     &= \cL_{\cD}(\theta^*) + \cL_{\widetilde{\cD}}(\theta^*) - \cL_{\cD}(\theta^*) + \frac{B}{n} \text{cl}(\cA) \\
%     &\stackrel{(e)}{\leq} \cL_{\cD}(\theta^*) + \frac{2B}{n} \text{cl}(\cA) \\
%     &\stackrel{(f)}{\leq} \cL_{\cD}(\theta^*) + 4Bm \sum^{m-1}_{c=0} \frac{n_c}{n}\hat{V}(\theta_\cC,c,\widetilde{\cD})
% \end{align}
% Here $(c)$ and $(e)$ hold as per the assumption that $\forall \theta$ if $z=\widetilde{z}$ then $\ell(\theta,z) = \ell(\theta,\widetilde{z})$. Thus the loss where the labels are not changed cancels out and only losses where labels are changed matter. Further apply $\ell(\theta,z) \leq B$. $(d)$ holds because $\theta^* = \arg \min_\theta\cL_{\cD}(\theta,z)$. Finally applying inequality~\ref{eq-11} we get $(f)$.

\section{Quantitative Results}
\subsection{Non-Binary Concept Forgetting}\label{non-binary-concept-results}
Table~\ref{tab:multi-forgetting} illustrates the performance of \emph{LAN} in reducing concept violation and maintaining test accuracy across various settings of concept forgetting from a pre-trained Resnet-50 model trained on the CelebA~\citep{CelebA} dataset. In this setting, the LAN algorithm reduces concept violation by about 63.52\% without significantly affecting test accuracy.
\begin{table}[!htbp]
\centering
\caption{Empirical concept violation $\hat{V}_\cC (\downarrow)$  and test accuracy $A_\cD (\uparrow)$ of the initial model and forgotten model via \emph{LAN}. For the forgotten model, $\hat{V}_\cC$ is reduced without significantly reducing $A_\cD$.}
\label{tab:multi-forgetting}
\resizebox{0.9\textwidth}{!}{%
\begin{tabular}{c|c|cc|cc}
\toprule
\multirow{2}{*}{\textbf{Tasks}} & \multirow{2}{*}{\textbf{Concepts}} & \multicolumn{2}{c|}{\textbf{Initial Model}}  & \multicolumn{2}{c}{\textbf{LAN}} \\

 &  & $\hat{V}_\cC$ & $A_\cD$ & $\hat{V}_\cC$ & $A_\cD$  \\

\midrule
\multirow{2}{*}{
    \textbf{Young vs. Not-Young}}
 & \textbf{Hair Color} &0.2  &0.898 & 0.063  & 0.8626  \\
 & \textbf{Facial Hair} &0.11  &0.897   & 0.0329  & 0.8921\\
 	
\midrule

\multirow{2}{*}{
    \textbf{Attractive vs. Not-attractive}}
 & \textbf{Hair Color} &0.195  &0.827   &0.083  & 0.7955  \\
 & \textbf{Facial Hair} &0.1716  &0.827  &0.076  &0.8088\\

\midrule

\multirow{2}{*}{
    \textbf{Heavy Makeup vs. Not-Heavy Makeup }}
 &  \textbf{Hair Color} &0.157  &0.92  &0.073  &0.881 \\
 &  \textbf{Facial Hair} &0.316  &0.919   &0.077  &0.844  \\
\bottomrule
\end{tabular}%
}
\end{table}

\subsection{Binary Concept Forgetting}\label{binary-concept-results}

Table~\ref{tab:binary-forgetting} shows the efficacy of our method for different concept-forgetting scenarios. In this case, the average reduction of concept violation is about 85.35\% on the MNIST dataset, 73.25\% on the CIFAR-10 dataset, 17.05\% on the miniImageNet dataset, and 81.34\% on the CelebA dataset, while retaining high model accuracy.

\begin{table}[!htbp]
\centering
\caption{Empirical concept violation $\hat{V}_\cC (\downarrow)$ and test accuracy $A_{D} (\uparrow)$ of the initial model and forgotten model via \emph{LAN}. For the forgotten model, $\hat{V}_\cC$ is reduced without significantly reducing $A_{D}$.}
\label{tab:binary-forgetting}
\resizebox{0.9\textwidth}{!}{%
\begin{tabular}{c|ccc|cc|cc}
\toprule
\multirow{2}{*}{\textbf{Dataset}} & \multirow{2}{*}{\textbf{Models}} & \multirow{2}{*}{\textbf{Task}} & \textbf{Concept} & \multicolumn{2}{c|}{\textbf{Initial Model}} & \multicolumn{2}{c}{\textbf{LAN}} \\

 &  &  &  & $\hat{V}_\cC$ & $A_{D}$ & $\hat{V}_\cC$ & $A_{D}$ \\

\midrule
\multirow{3}{*}{
    \textbf{CelebA}
} & \multirow{3}{*}{\textbf{Resnet-50}} & \textbf{Young or not} & \textbf{Gender} & 0.117 & 0.898 & 0.015 & 0.847 \\
 &  &\textbf{Attractive or not}  &\textbf{Gender}  & 0.2219 & 0.827 & 0.006 & 0.767 \\
 &  &\textbf{Heavy makeup or not}  &\textbf{Gender}  & 0.314 & 0.919 & 0.127 & 0.764 \\
		
\midrule

\multirow{3}{*}{
    \textbf{miniImageNet}
} & \multirow{3}{*}{\textbf{Resnet-50}} & \multirow{3}{*}{\textbf{class 0-99 classification}} & \textbf{Triceratops} & 0.4991 & 0.9791 &0.406  & 0.951  \\
 &  &  & \textbf{Bugs} & 0.4966	& 0.9791 &0.364  & 0.936 \\
 &  &  & \textbf{Fences} & 0.4948	& 0.9791 &0.466  &0.96 \\
	
\midrule

\multirow{6}{*}{
    \textbf{CIFAR-10}
} & \multirow{3}{*}{\textbf{Mobinet-v2}} & \multirow{3}{*}{\textbf{class 0-9 classification}} & \textbf{Bird} & 0.440 & 0.928 & 0.103 & 0.871 \\
 &  &  & \textbf{Frog} & 0.473 & 0.921 & 0.108 & 0.855 \\
 &  &  & \textbf{Truck} & 0.472 & 0.921 & 0.113 & 0.855 \\
	
\cmidrule{2-8}

 & \multirow{3}{*}{\textbf{Densenet-121}} & \multirow{3}{*}{\textbf{class 0-9 classification}} & \textbf{Bird} & 0.445 & 0.923 & 0.152 & 0.869 \\
 &  &  & \textbf{Frog} & 0.473 & 0.917 & 0.116 & 0.878 \\
 &  &  & \textbf{Truck} & 0.472 & 0.917 & 0.147 &0.861 \\
	
\midrule
	
\multirow{6}{*}{
    \textbf{MNIST}
} & \multirow{3}{*}{\textbf{2-layer MLP}} & \multirow{3}{*}{\textbf{Digit 0-9 classification}} & \textbf{Digit-3} & 0.479 & 0.974 & 0.055 & 0.883 \\
 &  &  & \textbf{Digit-5} & 0.491 & 0.971 & 0.104 & 0.901 \\
 &  &  & \textbf{Digit-8} & 0.470 & 0.976 & 0.081 & 0.889 \\

\cmidrule{2-8}

 & \multirow{3}{*}{\textbf{Resnet-50}} & \multirow{3}{*}{\textbf{Digit 0-9 classification}} & \textbf{Digit-3} & 0.498 & 0.990 & 0.047 & 0.893 \\
 &  &  & \textbf{Digit-5} & 0.492 & 0.992 & 0.078 & 0.905 \\
 &  &  & \textbf{Digit-8} & 0.496 & 0.991 & 0.063 & 0.897 \\

\bottomrule
\end{tabular}%
}
\end{table}



\section{Implementation Details}

\subsection{Datasets and Models} \label{dataset-model}
Here we have used four datasets as follows:
\begin{itemize}
    \item \textbf{MNIST~\citep{lecun1998gradient}:} The MNIST dataset consists of $28 \times 28$ gray-scale images representing handwritten digits from 0 to 9. The MNIST dataset contains 6,000 images per digit class, totaling 60,000 training samples, and 1,000 images per digit class, totaling 10,000 testing images.
    \item \textbf{CIFAR-10~\citep{krizhevsky2009learning}:} The CIFAR-10 dataset consists of 60,000 $32 \times 32$ color images in 10 classes: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, and truck, with 6,000 images per class. There are 50,000 training images and 10,000 test images.
    \item \textbf{CelebA~\citep{CelebA}:} The Celeb Faces Attributes Dataset (CelebA) is a large-scale facial attributes dataset comprising over 200,000 celebrity images, each annotated with 40 attributes. This dataset features significant pose variations and background clutter. CelebA offers extensive diversity, a substantial quantity of images, and rich annotations.
    \item \textbf{miniImageNet~\citep{mini-imagenet}:} Here we have used a smaller subset of the ImageNet dataset consisting of 50,000 training images and 10,000 testing images, evenly distributed across 100 classes. We use an image dimension of $224\times224$, the same as the original ImageNet data dimension.
\end{itemize} 

\textbf{Models:} Furthermore, to evaluate our method on different models, we experimented with a variety of models with different numbers of learnable parameters, namely 2-layer-MLP, Mobinet-v2~\citep{mobinetv2}, Densenet-121~\citep{densenet}, and Resnet-50~\citep{he2016deep}. The 2-layer-MLP net has two hidden layers, each of size 500. For Mobinet-v2, Densenet-121, and Resnet-50 we have taken the PyTorch default models with pre-trained weights. For all of these models, the last layer is changed to an appropriate size suitable for the classification tasks.







\subsection{Initial Training}

\subsubsection{Initial training on MNIST} \label{pre-train-mnist} Here we have used 2-layer-MLP and Resnet-50 models for the classification tasks. For optimization, we have used the Adam optimizer with a learning rate of 0.001 on mean cross-entropy loss.  All the models are trained for 5 epochs with a batch size of 64. The loss and accuracy curves can be seen in Figure~\ref{fig:MNIST-pretrained}.

\FloatBarrier 
%% MNIST initial-training loss and accuracy curves
\begin{figure}[!htbp]
    \centering
    \subfloat[MLP loss]{\label{fig:MLP-pretrained-loss}\includegraphics[width=0.24\textwidth]{plots/mnist/pretrained/mlp_loss.pdf}}~
    \subfloat[MLP accuracy]{\label{fig:MLP-pretrained-acc}\includegraphics[width=0.24\textwidth]{plots/mnist/pretrained/mlp_accs.pdf}}~
    \subfloat[Resnet-50 loss]{\label{fig:resnet-pretrained-loss}\includegraphics[width=0.24\textwidth]{plots/mnist/pretrained/resnet50_loss.pdf}}~
    \subfloat[Resnet-50 accuracy]{\label{fig:resnet-pretrained-acc}\includegraphics[width=0.24\textwidth]{plots/mnist/pretrained/resnet50_accs.pdf}}
\caption{Results of training the initial models on MNIST dataset}
\label{fig:MNIST-pretrained}
\end{figure}

\subsubsection{Initial training on CIFAR-10} \label{pre-train-cifar}
Here we have used Mobinet-v2 and Densenet-121 models for the classification tasks. For optimization, we have used the Adam optimizer with a learning rate of 0.001 on mean cross-entropy loss. Mobinet-v2 and Densenet-121 models are trained for 60 and 20 epochs respectively with a batch size of 64. We have an early-stopping of 3 epochs for all the models. The loss and accuracy curves can be seen in Figure~\ref{fig:CIFAR-10 Pre-trained}.

\FloatBarrier 
%% CIFAR-10 initial-training loss and accuracy curves
\begin{figure}[!htbp]
    \centering
    \subfloat[Mobinet-v2 loss]{\label{fig:mobinet-pretrained-loss}\includegraphics[width=0.24\textwidth]{plots/cifar-10/pretrained/mobinetv2_cifar10_loss_history.pdf}}~
    \subfloat[Mobinet-v2 accuracy]{\label{fig:mobinet-pretrained-acc}\includegraphics[width=0.24\textwidth]{plots/cifar-10/pretrained/mobinetv2_cifar10_accuracy_history.pdf}}~
    \subfloat[Densenet-121 loss]{\label{fig:densenet-pretrained-loss}\includegraphics[width=0.24\textwidth]{plots/cifar-10/pretrained/densenet-121_cifar10_loss_history.pdf}}~
    \subfloat[Densenet-121 accuracy]{\label{fig:Mobinet-pretrained-acc}\includegraphics[width=0.24\textwidth]{plots/cifar-10/pretrained/densenet-121_cifar10_accuracy_history.pdf}}
\caption{Results of training the initial models on the CIFAR-10 dataset}
\label{fig:CIFAR-10 Pre-trained}
\end{figure}

\subsubsection{Initial training on CelebA} \label{pre-train-celeba}
Here we have used the Resnet-50 model for the classification tasks. As there are 40 attributes for classification, we have trained 40 MLP heads for this. For optimization, we have used the SGD optimizer with a learning rate of 0.01, a learning rate scheduler with a decay of 0.1 every 30 steps, momentum of 0.9, and weight decay of 1e-4 on total cross-entropy loss. Here Resnet-50 is trained for 90 epochs with a batch size of 256.

\subsection{Training for Concept Forgetting}

\subsubsection{LAN training} \label{LAN-training}
Our official codebase for LAN is available at the following link: \url{https://github.com/Subhodip123/LAN}. Here we have used the label annealing methodology to finetune the pre-trained model for 1 epoch. We evaluated our method with both retraining and FERMI methodology in different forgetting settings. For the different settings of forgetting, we give the optimal hyper-parameters for our optimal results in the following tables.

% \begin{table*}[!htbp]
% \begin{minipage}{0.47\textwidth}
% \centering
% \resizebox{\textwidth}{!}{%
% \begin{tabular}{cc|c|c}
% \toprule
% \multirow{2}{*}{\textbf{Datasets}} & \multirow{2}{*}{\textbf{Models}} & \multirow{2}{*}{\textbf{Concepts}} & \multicolumn{1}{c}{\textbf{Optimal Hyperparameters}} \\

%  &  &  & \textbf{Learning Rate} \\ \cmidrule(r){1-4}
% \multirow{6}{*}{\textbf{MNIST}} & \multirow{3}{*}{2-layer MLP} & Class-3 & 0.0005 \\
%  &  & Class-5 & 0.005 \\
%  &  & Class-8 & 0.005 \\ \cmidrule(r){2-4}
%  & \multirow{3}{*}{Resnet-50} & Class-3 & 0.005 \\
%  &  & Class-5 & 0.005 \\
%  &  & Class-8 & 0.0005 \\ \cmidrule(r){1-4}
% \multirow{6}{*}{\textbf{Cifar-10}} & \multirow{3}{*}{Mobinet-v2} & Class-2 & 0.001 \\
%  &  & Class-6 & 0.001 \\
%  &  & Class-9 & 0.0001 \\ \cmidrule(r){2-4}
%  & \multirow{3}{*}{Densenet-121} & Class-2 & 0.005 \\
%  &  & Class-6 & 0.0005 \\
%  &  & Class-9 & 0.001 \\

% \bottomrule
% \end{tabular}%
% }
% \caption{Optimal Hyperparameters for LAN method on MNIST and CIFAR-10 dataset}
% \label{tab:optimal-hyperparameters-datasets}
% \end{minipage}~
% \begin{minipage}{0.47\textwidth}
%     \centering
% \resizebox{\textwidth}{!}{%
% \begin{tabular}{cc|c|c}
% \toprule
% \multirow{2}{*}{\textbf{Classification Task}} & \multirow{2}{*}{\textbf{Models}} & \multirow{2}{*}{\textbf{Concepts}} & \multicolumn{1}{c}{\textbf{Optimal Hyperparameters}} \\

%  &  &  & \textbf{Learning Rate} \\ \cmidrule(r){1-4}
% \multirow{3}{*}{\textbf{Young}} & \multirow{3}{*}{Resnet-50} & Gender & 1.00E-06 \\
%  &  & Hair Color & 1.00E-06 \\
%  &  & Facial Hair & 1.00E-07 \\ \cmidrule(r){1-4}
% \multirow{3}{*}{\textbf{Attractive or Not}} & \multirow{3}{*}{Resnet-50} & Gender & 1.00E-06 \\
%  &  & Hair Color & 1.00E-06 \\
%  &  & Facial Hair & 1.00E-06 \\ \cmidrule(r){1-4}
% \multirow{3}{*}{\textbf{Heavy Makeup}} & \multirow{3}{*}{Resnet-50} & Gender & 1.00E-06 \\
%  &  & Hair Color & 1.00E-07 \\
%  &  & Facial Hair & 1.00E-06 \\

% \bottomrule
% \end{tabular}%
% }
% \caption{Optimal Hyperparameters for LAN method on CelabA dataset}
% \label{tab:optimal-hyperparameters-tasks}
% \end{minipage}
% \end{table*}

% \newpage
\subsubsection{Baselines}\label{baselines}

\begin{itemize}
    \item \textbf{FERMI~\citep{FERMI}:} Here we have used the official implementation of FERMI which can be found in the following link: \url{https://www.dropbox.com/scl/fo/tz8aksm4ibsta9l9hzig7/AMK3ixeUQRqoY0FhWgDy5rM?rlkey=yufnfhuvhs91mvvl9kc3lbss1&e=1&dl=0}. Here we have used the FERMI loss with the usual regularized cross-entropy loss to fine-tune the pre-trained model for E=1. 
    \item \textbf{Continuous Fairness~\citep{mary-fairness}:} The official implementation can be found at: \url{https://github.com/criteo-research/continuous-fairness}. We have used the usual regularized cross-entropy loss to fine-tune the pre-trained model for E=1. 
    \item \textbf{Fairness-KDE~\citep{fairness-cho}:} As there is no official implementation for this method, we use the open-source implementation from \url{https://github.com/Gyeongjo/FairClassifier_using_KDE}. As with the other baselines, we train the pre-trained model using this regularized loss for E=1.
\end{itemize}

\end{document}
