% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%%%%%%%%%%% 3rd - parties start %%%%%%%%%%%%%%%%%%%%%%%%%%%%5
\usepackage{comment}
\usepackage{color}
\usepackage{kotex}
\usepackage{adjustbox}
\usepackage{booktabs}
\usepackage{multicol}
\usepackage{multirow}
\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{comment}

\usepackage{caption}
\usepackage{subcaption}

\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{bm}
\usepackage{hyperref}
% \hypersetup{
%     colorlinks=true,
%     linkcolor=blue,
%     filecolor=magenta,      
%     urlcolor=blue,
% }


% Draft-annotation macros: each one typesets its single argument in a fixed
% color via \textcolor (\delete -> yellow, \jy -> blue, \jh -> red).
% NOTE(review): \jy and \jh look like per-author comment colors and \delete
% like a marker for text proposed for removal -- confirm with the authors.
\newcommand\delete[1]{\textcolor{yellow}{#1}}
\newcommand\jy[1]{\textcolor{blue}{#1}}
\newcommand\jh[1]{\textcolor{red}{#1}}




% \blfootnote{<text>}: typeset <text> as a footnote without a visible mark.
% Within a local group, \thefootnote is redefined to expand to nothing before
% \footnote is called; the footnote counter is then decremented by one so that
% this footnote does not shift the numbering of the footnotes that follow.
% The trailing % signs keep line ends from inserting spurious spaces.
\newcommand\blfootnote[1]{%
  \begingroup
  \renewcommand\thefootnote{}\footnote{#1}%
  \addtocounter{footnote}{-1}%
  \endgroup
}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Cooperative Meta-Learning with Gradient Augmentation}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jjongyn@gmail.com>?Subject=Cooperative Meta-Learning with Gradient Augmentation}{Jongyun Shin}{}} 
\author[1]{\href{mailto:<gkstmdwls99@kookmin.ac.kr>?Subject=Cooperative Meta-Learning with Gradient Augmentation}{Seungjin Han}{}}
\author[1]{\href{mailto:<jangho.kim@kookmin.ac.kr>?Subject=Cooperative Meta-Learning with Gradient Augmentation}{Jangho Kim$^\ast$}{}}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Department\\
    Kookmin University\\
    Seoul, Korea
}
\affil[ ]{%
    \url{{whddbs519, gkstmdwls99, jangho.kim}@kookmin.ac.kr}
}

  
  \begin{document}
\maketitle
% This creates the footnote text
\blfootnote{$^\ast$ Corresponding Author}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Abstract %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstract}
  Model-agnostic meta-learning (MAML) is one of the most widely used gradient-based meta-learning methods, consisting of two optimization loops: an inner loop and an outer loop. MAML learns the new task from meta-initialization parameters with an inner update and finds the meta-initialization parameters in the outer loop. In general, the injection of noise into the gradient of the model for augmenting the gradient is one of the widely used regularization methods. In this work, we propose a novel cooperative meta-learning framework dubbed CML which leverages gradient-level regularization with gradient augmentation. We inject learnable noise into the gradient of the model for the model generalization. The key idea of CML is introducing the co-learner which has no inner update but the outer loop update to augment gradients for finding better meta-initialization parameters. Since the co-learner does not update in the inner loop, it can be easily deleted after meta-training. Therefore, CML infers with only the meta-learner without additional cost or performance degradation. We demonstrate that CML is easily applicable to gradient-based meta-learning methods and CML leads to increased performance in few-shot regression, few-shot image classification and few-shot node classification tasks. Our code is available at \url{https://github.com/JJongyn/CML}.
  % https://github.com/JJongyn/CML.
\end{abstract}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Introduction %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}\label{sec:intro}
Meta-learning, also known as ``learning to learn'', is a methodology to learn a new task by utilizing previous knowledge and experience \citep{vilalta2002perspective}. Model-agnostic meta-learning (MAML) \citep{finn2017model} is one of the dominant gradient-based meta-learning methods \citep{rajeswaran2019meta,rusu2018meta,gupta2020maml}. MAML consists of two optimization loops including an inner loop and an outer loop. The inner loop adapts the model with task-specific knowledge and the outer loop finds the meta-initialization parameters which can quickly adapt the new task knowledge in the inner loop, called task-adaptation. Generally, meta-learning with a few-shot setting involves both meta-training and meta-testing. In meta-training, a variety of few-shot learning tasks are provided for a meta-learner and the meta-learner should solve an unseen task with few-shot samples in meta-testing. In the process, the meta-learner learns the ability to adapt to various tasks, but it is challenged to form meta-initialization parameters with well-generalized knowledge.

Traditionally, noise injection to the model is widely used for improving the generalization performance of the model. \citet{neelakantan2015adding} find that adding noise to a network's gradient improves the network's generalization performance. Similarly, \citet{yang2020gradaug} perform gradient augmentation by pruning the model to create multiple sub-networks and using different data augmentations for each input in the sub-networks to induce diversity in the gradient, but this requires multiple inferences. These works show that injecting noise into gradients plays an important role in improving generalization performance.




\begin{figure*}[h!]
\centering
 \includegraphics[width=0.9\linewidth]{IMAGE/overall_cml2.png}
%  \includegraphics[width=1\linewidth]{IMAGE/Testfig.png}\\
  \caption{Overall process of CML and comparisons with other methods with a given task ($\mathcal{T}_i$). $\mathcal{\psi}, \mathcal{\theta}$ and $ \phi$ denote meta-initialization parameters of the feature extractor, meta-learner and co-learner. The feature extractor $\psi$ extracts the features, i.e., body layers of DNN. The meta-learner  $\mathcal{\theta}$ and co-learner $\phi$ predict outputs based on the features, i.e., classifier. $\mathcal{\psi}_{i}^{\prime}$, $\mathcal{\theta}_{i}^{\prime}$, and $\mathcal{\phi}_{i}^{\prime}$ means adapted parameters with $i$-task during an inner loop. Since CML does not adapt the co-learner to the task for generalization from gradient augmentation, after meta-training, CML can infer without additional costs. In meta-testing, CML evaluates performance after performing a task-adaptation, like standard MAML having $\mathcal{\psi}$ and $\mathcal{\theta}$. On the other hand, CML$^{\dagger}$ has parameters $\mathcal{\psi}$ and $ \phi$, where only $\mathcal{\psi}$ performs the task-adaptation and then evaluates the performance.}  \label{fig:overall_framework}
\end{figure*}



Motivated by the regularization effect of noise in gradients and the diverse gradient augmentation for the model generalization, we propose a novel cooperative meta-learning (CML) framework. It can be applied with gradient-based meta-learning to find better meta-initialization parameters through a regularization effect but has no additional cost at test time. CML has three parts which are the feature extractor, meta-learner and co-learner. The feature extractor and meta-learner parameters already exist in the original MAML and co-learner is newly introduced in this work for generating the new gradient. The co-learner is a plug-and-play module that takes the features of the feature extractor as input and generates their gradients by backpropagation. Thus, its goal is to provide a gradient for augmentation from a different perspective than the meta-learner, creating an augmented meta-gradient. We think that this is effective as a learned meaningful noise generated by the training of the co-learner rather than simply adding random noise.
% Co-learner cooperatively provides different aspects for generating a kind of diverse gradient noise to augment original meta-gradient for the generalization of meta-initialization.
To achieve our goal, we design the CML with two purposes: Firstly, the co-learner arouses a different point of view from the naive meta-learner for generalization ability and diversity of meta-gradient. Secondly, the co-learner can be easily deleted at test time without any accuracy drop, which means the co-learner affects only finding meta-initialization parameters not learning a new task.




Figure \ref{fig:overall_framework} shows the overall process of CML and comparisons with other methods such as a naive gradient-based meta-learning and a multi-branch framework with meta-learning. In meta-training, our newly introduced co-learner is only updated in the outer loop, which means the meta-learner solely adapts the new task in the inner loop. Since the co-learner is updated at the previous outer loop, not the current inner loop, the co-learner cooperatively finds the meta-initialization parameters of the shared feature extractor by gradient augmentation in the meta-gradient with a different perspective than the meta-learner. 
Hence, CML does not need to create sub-networks (e.g., by pruning) or use different data augmentations with multiple inferences to obtain diversity. Also, after meta-training, the co-learner can be easily deleted because it does not change the meta-initialization parameters in the inner loop. Therefore, CML can infer with only the feature extractor and meta-learner in meta-testing. As another variation of CML, the co-learner without task-adaptation can be used with the feature extractor, which is represented as CML$^\dagger$ in Figure~\ref{fig:overall_framework}. Our main contributions are summarized as follows:


%%%%% Contribution
\begin{itemize}
    \item We propose the cooperative meta-learning (CML) framework which finds the better meta-initialization parameters without additional cost at test time. Unlike previous regularization methods, our proposed co-learner generates diverse meta-gradient without multiple data augmentation, inference and pruning.
    \item We verify the effectiveness of CML and its applicability, where CML is applied with gradient-based meta-learning methods on various tasks such as few-shot regression, few-shot image classification and few-shot node classification tasks. % in Sec.~\ref{exp}.
    \item We show that CML's gradient augmentation induces gradient diversity
    %show the effect of gradient augmentation from CML 
    and conduct an analysis of the gradient of the co-learner and meta-learner during meta-optimization.
    %CML induces diversity on the meta-gradient through co-learner. %, as demonstrated in Sec. \ref{exp:4.4_cml_grad}.
    \item We demonstrate that the performance improvement is not solely attributed to the additional parameters of the co-learner during meta-training, but rather to the framework of CML with meta-training. %, as illustrated in Sec. \ref{exp:4.5_cml_structure}.
    
\end{itemize}








%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Related work %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Related Work}
\label{sec:related}

\subsection{Gradient-based Meta-learning}
Recently, meta-learning has been successfully applied to a diverse range of applications \citep{hospedales2021meta}. Gradient-based meta-learning optimizes a bilevel optimization problem \citep{colson2007overview} where it has a task-adaptation (inner loop) learning a new task with few-shot samples from meta-initialization parameters and meta-optimization (outer loop) finding proper meta-initialization parameters from an inner loop update. Many variants of MAML have been studied in various domains \citep{yin2019meta,obamuyide2019model,collins2022maml,lee2021meta}. BOIL \citep{oh2020boil} tackles the feature reuse problem in meta-optimization and freezes the classifier in task-adaptation. Sharp-MAML \citep{abbas2022sharp} leverages sharpness-aware minimization to solve a bilevel optimization problem. In this work, we propose a new meta-learning framework that can be applied to any gradient-based meta-learning.

\subsection{Multi-branch framework} While maintaining the exact computational graph for inference, many works to boost the performance of the model have been studied. Auxiliary training adds auxiliary classifiers connected in intermediate layers \citep{szegedy2015going,zhang2020auxiliary} and multi-task learning simultaneously learns multiple related tasks and the knowledge from multi-task can be reused by the others \citep{yang2016deep}. Unlike previous methods, multi-branch frameworks \citep{kim2021feature,Xie_2022_CVPR,liang2022camero} shared intermediate layers and split multi-branch under the same task which utilizes knowledge distillation \citep{hinton2015distilling} transferring the knowledge to enhance independent branches. \citet{zhu2018knowledge} split the model into several sub-networks and made an ensemble logit to teach individual sub-networks. \citet{song2018collaborative} introduces multiple heads from the same network to improve the generalization of the model. 

\subsection{Regularization by noise} To improve the generalization performance, various ways to impose constraints on model structure and gradients by noise have been studied. \citet{hinton2002stochastic} use a Gaussian gradient noise schedule to train the embedding model. Dropout \citep{srivastava2014dropout} randomly drops the connections during training which introduces the random noise into forward propagation. Similarly, \citet{huang2016deep} randomly disconnect the layers during training. \citet{neelakantan2015adding} show that injecting noise into gradients works well for very deep architectures. GradAug \citep{yang2020gradaug} generates meaningful noise in gradients rather than random noises by multiple data augmentation and pruning of the model.

Cooperative meta-learning leverages the advantages of both multi-branch framework and regularization by noise in the gradient-based meta-learning domain. CML only introduces a co-learner and trains it in meta-optimization to augment a meta-gradient by sharing the feature extractor. It induces a regularization effect by injecting noise into the meta-gradient without multiple forwarding or making a sub-network such as pruning.








%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Methodology %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Methodology}
\label{headings}

In this section, we give a brief explanation of the Model-Agnostic Meta-Learning (MAML) and then, we explain a proposed cooperative meta-learning (CML) framework which is an extension of MAML that uses cooperative learning with gradient augmentation to learn meta-initialization parameters of the DNN. In meta-learning, the ability to generalize to a new task is a challenging problem. To solve this problem, we introduce a co-learner that drives the augmentation at the gradient-level regularization.

%%%%%%%%%%%% method:MAML %%%%%%%%%%%%
\subsection{Model-Agnostic Meta-Learning (MAML)} In this work, we divide the DNN model used for meta-learning into two groups: the feature extractor $\psi$ which extracts the features, i.e., body layers of DNN and the meta-learner $\mathcal{\theta}$ predicting outputs based on the features, i.e., classifier. We sample a set of tasks $\{\mathcal{T}_i\}_{i=1}^{N}$ containing $N$ tasks from the task distribution $p(\mathcal{T})$. The DNN model, represented by $f_{(\psi, \theta)}$, is trained using samples from each task $\mathcal{T}_i$ under the two optimization loops. These samples $\mathcal{D}_i$ are divided into support set $\mathcal{D}^S_i$ and query set $\mathcal{D}^Q_i$ which are used in the inner loop and outer loop, respectively. MAML, which consists of two optimization loops (the inner loop and the outer loop), tries to find well-generalized meta-initialization parameters during meta-training. In the inner loop, we update task-specific parameters from meta-initialization parameters $(\psi, \theta)$ using the support set with an inner step size of $\alpha$:
\begin{equation}
\label{eq:maml_inner1}
   (\mathcal{\psi}^{\prime}_{i}, \mathcal{\theta}^{\prime}_{i}) = (\mathcal{\psi},\mathcal{\theta}) - \alpha \nabla_{(\psi, \theta)} \mathcal{L}(f_{(\mathcal{\psi}, 
   \mathcal{\theta})};\mathcal{D}_{i}^{S})
\end{equation}
In total, $M$ such updates are taken for the task-specific parameters:
\begin{equation}
\label{eq:maml_inner2}
   (\mathcal{\psi}^{\prime}_{i}, \mathcal{\theta}^{\prime}_{i}) \gets (\mathcal{\psi}^{\prime}_{i},\mathcal{\theta}^{\prime}_{i}) - \alpha \nabla_{(\mathcal{\psi}^{\prime}_{i}, 
   \mathcal{\theta}^{\prime}_{i})} \mathcal{L}(f_{(\mathcal{\psi}^{\prime}_{i}, 
   \mathcal{\theta}^{\prime}_{i})};\mathcal{D}_{i}^{S})
\end{equation}
For simplicity, we consider a single gradient step in the rest of the paper. After task-adaptation in the inner loop, we compute each task loss for the query set with task-specific parameters $(\mathcal{\psi}^{\prime}_{i}, 
   \mathcal{\theta}^{\prime}_{i})$. By summing all task losses, meta-optimization optimizes the following objective: 
\begin{equation}
\begin{split}
\min\limits_{\psi,\theta}\sum\limits_{i}^{N}\mathcal{L}(f_{(\mathcal{\psi}^{\prime}_{i}, 
   \mathcal{\theta}^{\prime}_{i})};\mathcal{D}_{i}^{Q}) 
   = \\ \sum\limits_{i}^{N}\mathcal{L}(f_{(\mathcal{\psi},\mathcal{\theta}) - \alpha \nabla_{(\psi, \theta)} \mathcal{L}(f_{(\mathcal{\psi}, 
   \mathcal{\theta})};\mathcal{D}_{i}^{S})};\mathcal{D}_{i}^{Q})
\label{eq:maml_outer}
\end{split}
\end{equation}
In the outer loop, we update meta-initialization parameters with $N$ task losses using meta-gradient by meta-optimization with an outer step size of $\beta$.
\begin{equation}
   (\mathcal{\psi}, \mathcal{\theta}) \gets (\mathcal{\psi},\mathcal{\theta}) - \beta \nabla_{(\mathcal{\psi}, 
   \mathcal{\theta})} \sum\limits_{i}^{N}\mathcal{L}(f_{(\mathcal{\psi}^{\prime}_{i}, 
   \mathcal{\theta}^{\prime}_{i})};\mathcal{D}_{i}^{Q})
\label{eq:maml_outer2}
\end{equation}
In meta-testing, we verify the trained meta-initialization parameters. The inner loop adapts to the new task with a support set that remains the same as in meta-training. However, the outer loop only computes the accuracy using a query set for each task. There is no meta-optimization process in the outer loop of meta-testing.


%%%%%%%%%%%%%%%%%%%%% algorithm %%%%%%%%%%%%%%%%%%%
\begin{algorithm}[t]
%\begin{algorithm}[t]


\caption{Cooperative Meta Learning}
\label{algo:cml}
\begin{algorithmic}[1]
\STATE \textbf{[Meta-training]} \\
\STATE \textbf{Input}: Task distribution $p(\mathcal{T})$; Meta-learner model $f^{m}$; Co-learner model $f^{c}$; Step sizes $\alpha, \beta$; Loss scaling factor $\gamma$; The number of tasks in a batch $N$ \\
\STATE \textbf{Output}: Meta-initialization parameters $\mathcal{\psi}$,  $\mathcal{\theta}$, $\mathcal{\phi}$ 


 %[1] enables line numbers
\STATE Randomly initialize parameters $\mathcal{\psi}$,  $\mathcal{\theta}$, $\mathcal{\phi}$ 
\WHILE{not converged}
\STATE Sample N tasks for batch $\mathcal{T}_{i} \sim p(\mathcal{T})$
\FORALL{$\mathcal{T}_{i}$}
\STATE Sample dataset  $\mathcal{D} = (\mathcal{D}^{S}_{i},\mathcal{D}^{Q}_{i})$ $\;$from $\mathcal{T}_{i}$  
% \STATE Calculate meta-learner Loss $\mathcal{L}^{m}_{D^{S}}$ with $(\mathcal{\psi}, \mathcal{\theta})$ \\ 
\STATE Update task-specific parameters $(\mathcal{\psi}_{i}^{\prime}, \mathcal{\theta}_{i}^{\prime})$ by Eq.(\ref{eq:cml_inner1})
\ENDFOR
\STATE Intervene co-learner $\mathcal{\phi}$ in meta-optimization step
\STATE Calculate total loss with co-learner by Eq.(\ref{eq:cml_outer1})
% and co-learner Loss $\mathcal{L}^{c}_{D^{Q}}$ with $(\mathcal{\psi}_{i}^{\prime}, \mathcal{\theta}_{i}^{\prime}, \mathcal{\phi})$:
% \STATE $\mathcal{L} = \mathcal{L}^{m}_{D^{Q}} + \gamma \mathcal{L}^{c}_{D^{Q}}$
\STATE Update meta-initialization parameters $(\mathcal{\psi}, \mathcal{\theta}, \mathcal{\phi})$ with $\beta$ by Eq.(\ref{eq:cml_outer2})
\ENDWHILE
\STATE \textbf{return} $\mathcal{\psi}, \mathcal{\theta}, \mathcal{\phi}$ 

\STATE \textbf{[Meta-testing]} \\
\STATE \textbf{Input}: Sample test dataset $\mathcal{D}_{test} = (\mathcal{D}^{S}_{test},\mathcal{D}^{Q}_{test})$ \\
\STATE Load meta-initialization parameters $\mathcal{\psi}$,  $\mathcal{\theta}$, $\mathcal{\phi}$ 
\FORALL{$\mathcal{D}_{test}$}
\IF {method is “CML”} 
\STATE Update task-specific parameters $(\mathcal{\psi}^{\prime}, \mathcal{\theta}^{\prime})$ for $\mathcal{D}^{S}_{test}$ by Eq.(\ref{eq:maml_testing2})
\STATE Evaluate the model $f^{m}_{\psi^{\prime}, \theta^{\prime}}$ with $\mathcal{D}^{Q}_{test}$
\ENDIF
\IF {method is “CML$^{\dagger}$”} 
\STATE Update task-specific parameters $\mathcal{\psi}^{\prime}$ for $\mathcal{D}^{S}_{test}$ by Eq.(\ref{eq:cml_testing2})
\STATE Evaluate the model $f^{c}_{\psi^{\prime}, \mathcal{\phi}}$ with $\mathcal{D}^{Q}_{test}$
\ENDIF
\ENDFOR
\end{algorithmic}
\end{algorithm}


%%%%%%%%%%%% method:CML  %%%%%%%%%%%%

\subsection{Cooperative Meta-Learning (CML)} \label{method:cml}
Our proposed framework includes an additional module called co-learner $\phi$ inducing gradient augmentation in the meta-optimization. The co-learner can consist of a convolution layer or a fully connected layer, depending on the task. In meta-training, CML performs task-adaptation with the feature extractor and meta-learner in the inner loop same as the original gradient-based meta-learning such as MAML. The co-learner is added to the feature extractor during meta-optimization in the outer loop. Note that the co-learner only intervenes in the outer loop to perform meta-optimization with the feature extractor and meta-learner. In other words, the co-learner does not perform task-adaptation for the current task in the inner loop, therefore, it has implicit knowledge of tasks in the previous sampled batch. As a result, it has a different representation from the naive meta-learner. In this framework, the meta-learner and co-learner always share a representation of the feature extractor. Hence, their gradients are aggregated in the feature extractor, resulting in gradient augmentation. 

\begin{figure*}[t]
\centering
\begin{subfigure}{0.32\textwidth}
    \includegraphics[width=\linewidth]{IMAGE/reg_5shot_new2.png}
    \caption{5-shot}
    \label{fig:first}
\end{subfigure}
\hfill
\begin{subfigure}{0.32\textwidth}
    \includegraphics[width=\linewidth]{IMAGE/reg_10shot_new2.png}
    \caption{10-shot}
    \label{fig:second}
\end{subfigure}
\hfill
\begin{subfigure}{0.31\textwidth}
    \includegraphics[width=\linewidth]{IMAGE/reg_20shot_new2.png}
    \caption{20-shot}
    \label{fig:third}
\end{subfigure}
\caption{Results of MAML and CML on 5-, 10-, and 20-shot simple regression tasks.}
\label{fig:reg}
\end{figure*}
Formally, we sample a support set $\mathcal{D}_{i}^{S}$ and query set $\mathcal{D}_{i}^{Q}$ from a new task $\mathcal{T}_{i}$. Also, we denote the initial parameters for the feature extractor, meta-learner and co-learner as $\mathcal{\psi}$, $\mathcal{\theta}$ and $\mathcal{\phi}$, respectively, and consider two models in our framework: $f^{m}$, which is composed of a shared feature extractor and a meta-learner, and $f^{c}$, which is composed of a shared feature extractor and a co-learner. In the inner loop, the feature extractor and the meta-learner update the parameters $\psi_{i}^{\prime}$ and $\mathcal{\theta}_{i}^{\prime}$, respectively, with $M$ updates from a batch of $\mathcal{D}_{i}^{S}$. However, the co-learner does not update the parameters $\mathcal{\phi}_{i}^{\prime}$ in the inner loop. Therefore, task-specific parameters $\mathcal{\psi}_{i}^{\prime}$, $\mathcal{\theta}_{i}^{\prime}$ and $\mathcal{\phi}_{i}^{\prime}$ are as follows:
\begin{equation}
\label{eq:cml_inner1}
   (\mathcal{\psi}_{i}^{\prime}, \mathcal{\theta}_{i}^{\prime}) \gets (\mathcal{\psi},\mathcal{\theta}) - \alpha \nabla_{(\mathcal{\psi}, 
   \mathcal{\theta})} \mathcal{L}(f_{(\mathcal{\psi}, 
   \mathcal{\theta})}^{m};\mathcal{D}_{i}^{S}), \quad 
   \mathcal{\phi}^{\prime}_{i} = \mathcal{\phi}
\end{equation} 
where $\alpha$ is an inner step size which is a fixed hyperparameter. Unlike $\psi_{i}^{\prime}$ and $\theta_{i}^{\prime}$ which are updated for the current task $\mathcal{T}_{i}$ in the inner loop, $\mathcal{\phi}_{i}^{\prime}$ keeps the updated parameters from the previously sampled tasks in the outer loop. In the outer loop, our model $f$ updates meta-initialization parameters from $\mathcal{D}_{i}^{Q}$ with task-specific parameters updated by $\mathcal{D}_{i}^{S}$. Our purpose is to converge to $\psi$, $\theta$ and $\phi$ that minimize Eq.(\ref{eq:cml_outer1}) with the loss of the meta-learner and co-learner. 
\begin{equation}
\begin{split}
\label{eq:cml_outer1}
\sum\limits_{i}^{N}\{\mathcal{L}(f_{(\psi^{\prime}_{i},\theta^{\prime}_{i}, \phi)};\mathcal{D}_{i}^{Q})\} =\\ \sum\limits_{i}^{N}\{\mathcal{L}(f_{(\mathcal{\psi}^{\prime}_{i},\mathcal{\theta}^{\prime}_{i})}^{m};\mathcal{D}_{i}^{Q})+\gamma\mathcal{L}(f_{(\mathcal{\psi}^{\prime}_{i}, 
   \mathcal{\phi})}^{c};\mathcal{D}_{i}^{Q})\}
\end{split}
\end{equation}
where $\gamma$ is the loss scaling factor. The feature extractor and meta-learner have task-specific parameters $\mathcal{\psi}_{i}^{\prime}$ and $\mathcal{\theta}_{i}^{\prime}$ with knowledge about the current task $\mathcal{T}_i$, but the co-learner has the parameters $\mathcal{\phi}$ that have been updated by meta-optimization on the previous sampled N tasks. 
\begin{equation}
(\mathcal{\psi}, \mathcal{\theta}, \phi) \gets (\mathcal{\psi},\mathcal{\theta},\phi) - \beta\nabla_{(\mathcal{\psi}, 
   \mathcal{\theta},\phi)} \sum\limits_{i}^{N}\{\mathcal{L}(f_{(\psi^{\prime}_{i},\theta^{\prime}_{i}, \phi)};\mathcal{D}_{i}^{Q})\}
\label{eq:cml_outer2}
\end{equation}
Then, we compute the meta-gradient with $N$ task losses for the query set $\mathcal{D}_{i}^{Q}$. It is created by gradient augmentation, where the gradient noise from the co-learner is added to the existing gradient. From Eq.(\ref{eq:cml_outer2}), we update the meta-initialization parameters with an outer step size of $\beta$ by meta-optimization in the outer loop. The updated $\mathcal{\psi}, \mathcal{\theta}$ and $\phi$ are initialized with meta-initialization parameters for meta-testing. 
Lastly, our framework can infer with CML and CML$^{\dagger}$ using meta-testing dataset $D_{test} = (D_{test}^{S}, D_{test}^{Q})$ in meta-testing phase, and CML performs the task-adaptation as follows:
\begin{equation}
\label{eq:maml_testing2}
   (\mathcal{\psi}^{\prime}, \mathcal{\theta}^{\prime}) \gets (\mathcal{\psi},\mathcal{\theta}) - \alpha \nabla_{(\mathcal{\psi}, 
   \mathcal{\theta})} \mathcal{L}(f_{(\mathcal{\psi}, 
   \mathcal{\theta})}^{m};\mathcal{D}^{S}_{test})
\end{equation}
The model then evaluates against $\mathcal{D}^{Q}_{test}$ by using the adapted parameters $\mathcal{\psi^{\prime}}$ and $\mathcal{\theta^{\prime}}$ like standard MAML. Therefore, it does not require any additional inference cost for the co-learner.

%% camera ready 수정
\begin{equation}
\label{eq:cml_testing2}
   \mathcal{\psi}^{\prime} \gets \mathcal{\psi} - \alpha \nabla_{\mathcal{\psi}} \mathcal{L}(f_{(\mathcal{\psi}, 
   \mathcal{\phi})}^{c};\mathcal{D}^{S}_{test})
\end{equation}
On the other hand, in Eq.(\ref{eq:cml_testing2}), CML$^{\dagger}$ performs the task-adaptation only for $\mathcal{\psi}$. Note that $\phi$ of the co-learner does not perform task-adaptation and keeps its meta-initialization parameters. Then, we evaluate the model with $\mathcal{\psi}^{\prime}$ and $\phi$ against $\mathcal{D}^{Q}_{test}$. Our CML algorithm is shown in Algorithm~\ref{algo:cml}.




%%%%%%%%%% Gradient proving %%%%%%%%
%% camera ready 수정
Next, we demonstrate that the gradient calculated from the co-learner converges theoretically when it is combined into meta-gradients.
Our meta-gradient is updated by combining the gradients from the meta-learner($\theta^{\prime}$) and co-learner($\phi$) in the feature extractor($\psi^{\prime}$). 
We represent the loss function of the base network with $\psi^{\prime}$, $\theta^{\prime}$ to be $\mathcal{L}(\psi^{\prime}, \theta^{\prime})$ after the task-adaptation.



\begin{theorem} 
\label{Theorem:loss_minize}
% Let the meta-initialization parameters of the shared feature extractor containing $N$ layers be $\theta = \{\psi^{\prime}_{1}, \cdots , \psi^{\prime}_{N}\}$.
% Consider the gradient of the base network, $G^{(\mathcal{\psi^{\prime}}, 
%    \mathcal{\theta^{\prime}})} = \{g^{(\mathcal{\psi^{\prime}}, 
%    \mathcal{\theta^{\prime}})}_{1}, \cdots , g_{N}^{(\mathcal{\psi^{\prime}}, \mathcal{\theta^{\prime}})}\}$ in the outer loop for its loss $\sum\limits_{i}^{N}\{\mathcal{L}(f_{(\mathcal{\psi}^{\prime}_{i},\mathcal{\theta}^{\prime}_{i})}^{m};\mathcal{D}_{i}^{Q})$ and the gradient of co-learner, $\tilde{G}_{{(\mathcal{\psi},\mathcal{\phi})}}= \{\tilde{g}^{1}_{(\mathcal{\psi},\mathcal{\phi})}, \cdots , \tilde{g}^{N}_{(\mathcal{\psi},\mathcal{\phi})}\}$ calculated by $\mathcal{L}(f_{(\mathcal{\psi}^{\prime}_{i}, 
%    \mathcal
Let the meta-initialization parameters of the base network consisting of $N$ feature extraction layers and the meta-learner be $\omega = \{\psi^{\prime}_{1}, \cdots , \psi^{\prime}_{N}, \theta^{\prime}\}$. Consider the gradient $G^{(\mathcal{\psi^{\prime}}, 
   \mathcal{\theta^{\prime}})} = \{g^{\mathcal{\psi^{\prime}}}_{1}, \cdots , g_{N}^{\mathcal{\psi^{\prime}}}, g^{\theta^{\prime}}\}$ of the base network computed by the meta-learner in the outer loop
   and the gradient $\bar{G}^{(\mathcal{\psi^{\prime}}, 
   \mathcal{\phi})} = \{\bar{g}^{\mathcal{\psi^{\prime}}}_{1}, \cdots , \bar{g}_{N}^{\mathcal{\psi^{\prime}}}, 0\}$ of the feature extractor computed by the co-learner. A zero value is just for matching the dimension.  
   Let $\bm{\hat{G}}^{(\psi^{\prime}, \theta^{\prime}, \phi)} = G^{(\mathcal{\psi^{\prime}}, 
   \mathcal{\theta^{\prime}})} + \bar{G}^{(\mathcal{\psi^{\prime}}, 
   \mathcal{\phi})} =\{ (g^{\mathcal{\psi^{\prime}}}_{1}+\bar{g}^{\mathcal{\psi^{\prime}}}_{1}), \cdots , (g^{\mathcal{\psi^{\prime}}}_{N}+\bar{g}^{\mathcal{\psi^{\prime}}}_{N}), g^{\theta^{\prime}}\}$ be the gradient of base network by gradient aggregation computed by the loss function $\mathcal{L}(\psi^{\prime}, \theta^{\prime};D^{Q})$.
% since 올리고 , co-learner할 때 shared 된 파라미터로 계산된$\tilde{G}^{{(\mathcal{\psi}^{\prime},\mathcal{\theta^{\prime}},\mathcal{\phi})}}= \{\tilde{g}_{1}^{(\mathcal{\psi}^{\prime},\mathcal{\theta^{\prime}},\mathcal{\phi})}, \cdots , \tilde{g}_{N}^{(\mathcal{\psi^{\prime}}, \mathcal{\theta^{\prime}}, \mathcal{\phi})}\}$ calculated by $\mathcal{L}_{i}(\psi^{\prime},\mathcal{\theta^{\prime}}, \phi;\mathcal{D}_{i}^{Q})$.
If $\langle g^{\mathcal{\psi^{\prime}}}_{j},\bar{g}^{\mathcal{\psi^{\prime}}}_{j}\rangle > 0, \ \forall j, (1 \leq j \leq N) $
is satisfied, the direction of the augmented gradient is a descent direction for finding meta-initialization parameters.
\end{theorem}




%%%%%%%% proof

\begin{proof}

By Taylor's expansion of the loss function $\mathcal{L}$ for task and the base network of $\omega$ with CML updates: 
\begin{align*}
\mathcal{L}(\omega-\alpha\bm{\hat{G}}^{(\psi^{\prime}, \theta^{\prime}, \phi)})=\mathcal{L}(\omega)-\alpha\nabla\mathcal{L}(\omega)^T\bm{\hat{G}}^{(\psi^{\prime}, \theta^{\prime}, \phi)}+\mathcal{O}(\alpha^2)
\end{align*}
With $\nabla\mathcal{L}(\omega) = {G}^{(\mathcal{\psi^{\prime}}, 
   \mathcal{\theta^{\prime}})}$ and $\lim_{\alpha \to 0} \frac{ |\mathcal{O}(\alpha^2)|}{ \alpha}=0$, there exists $\bar{\alpha} > 0$ such that
\begin{align*}
\frac{ |\mathcal{O}(\alpha^2)|}{ \alpha} < |\langle G^{(\mathcal{\psi^{\prime}}, 
   \mathcal{\theta^{\prime}})},\bm{\hat{G}}^{(\psi^{\prime}, \theta^{\prime}, \phi)}\rangle|, \;\;\; \forall \alpha \in (0,\bar{\alpha})
\end{align*}
where the bound is strictly positive because $\langle G^{(\mathcal{\psi^{\prime}}, \mathcal{\theta^{\prime}})},\bm{\hat{G}}^{(\psi^{\prime}, \theta^{\prime}, \phi)}\rangle = \lVert G^{(\mathcal{\psi^{\prime}}, \mathcal{\theta^{\prime}})}\rVert^{2} + \sum_{j=1}^{N}\langle g^{\mathcal{\psi^{\prime}}}_{j},\bar{g}^{\mathcal{\psi^{\prime}}}_{j}\rangle > 0$ (since $\langle g^{\mathcal{\psi^{\prime}}}_{j},\bar{g}^{\mathcal{\psi^{\prime}}}_{j}\rangle > 0$ for all $j$). Under this condition, $\mathcal{L}(\omega-\alpha\bm{\hat{G}}^{(\psi^{\prime}, \theta^{\prime}, \phi)})-\mathcal{L}(\omega) < 0$ for all $\alpha \in (0,\bar{\alpha})$.
Therefore, CML updates the parameters $\omega$ toward the descent direction in the outer loop.
\end{proof}





%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Experiments %%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Experiments}
\label{exp}
In this section, we apply our CML to various gradient-based meta-learning algorithms and evaluate the performance of our framework on few-shot regression, few-shot image classification and few-shot node classification in Sections~\ref{exp:reg}--\ref{exp:gnn}.
We also conduct a gradient analysis of the co-learner in our framework, as discussed in Section \ref{exp:4.4_cml_grad}. To confirm that the performance improvement comes from the gradient augmentation effect in CML, not from additional parameters or the multi-branch structure, we compare CML with CL, which has the same structure, and with naive gradient-based meta-learning in Section \ref{exp:4.5_cml_structure}. For a fair comparison, we follow the original settings of several gradient-based meta-learning algorithms and test them on well-known few-shot benchmarks. More implementation details are in the Appendix. 

%%%%%%%%%%%%%%%%%%%%% Exp: Few-shot regression %%%%%%%%%%%%%%%%%%%
\subsection{Few-shot regression} \label{exp:reg}
We evaluate the performance of CML with MAML as a baseline in $K$-shot sinusoidal regression. The amplitude and phase of the sinusoidal wave follow the ranges of [0.1,5.0] and [0,$\pi$]. Each task consists of datapoints $\mathbf{x}$, $\mathbf{y}$ of a sinusoidal wave. The input $\mathbf{x}$ is uniformly sampled in the range [$-$5.0,5.0]. The loss function for comparing predicted $\mathbf{y}$ and target $\mathbf{y}$ uses mean-squared error. The baseline consists of 2 hidden layers of size 40 with ReLU nonlinearities, 1 input layer and 1 output layer following \citet{finn2017model}. For CML, the regressor is additionally attached with 1 hidden layer of size 40 with ReLU nonlinearities and 1 output layer as a co-learner. In meta-training, we use $K \in \{5,10,20\}$ samples as training examples and train using a batch size of 4, one inner-gradient step, a fixed step size of 0.01 and our loss scaling factor $\gamma$ of 0.2. For meta-testing, we evaluate adaptation with one gradient step for $K=5$, 10, and 20 test points. Each model predicts the target sinusoidal wave through the given $K$ test points. Furthermore, the co-learner is removed, and the model is evaluated using only the feature extractor and meta-learner, as in the original model. Figure \ref{fig:reg} shows that our CML performs better than MAML for 5, 10, and 20 shots. It means that our framework adapts well to simple networks and shows better generalization performance than the original framework.







%%%%%%%%%%%%%%%%%%%%% Exp: Few-shot image classification %%%%%%%%%%%%%%%%%%%
\subsection{Few-shot image classification}
\label{exp:image}
%% camera ready 수정
We compare the performance of the proposed method on few-shot image classification with several gradient-based meta-learning algorithms including MAML \citep{finn2017model}, MAML++ \citep{antoniou2018train}, BOIL \citep{oh2020boil} and Sharp-MAML \citep{abbas2022sharp}. In this experiment, we evaluate the performance of 5-way 1/5-shot problems on MiniImagenet datasets. In CML, the co-learner uses two convolution layers and a fully connected layer. Our loss scaling factor $\gamma$ is fixed at 0.5 for all methods. We also evaluate the performance of the co-learner. 

%%%%%%%%%%%%% MAIN TABLE %%%%%%%%%%%
\begin{table}
%\small
\caption{Test accuracy of 4-conv network with the CML framework on MiniImagenet dataset. The MAML algorithms are from \citep{oh2020boil}. The Sharp-MAML is used for reproduction. The \jy{blue color} and \jh{red color} indicate the output of the meta-learner and co-learner, respectively. Our experiments are performed in 3 runs.}
\centering
\begin{adjustbox}{width=0.9\linewidth}
%\small
\begin{tabular}{     l c   c }
\toprule
\multirow{2}{*}{{Method}} & \multicolumn{2}{c}{MiniImagenet 5-way (\%)}\\
% 	 \cmidrule{3-6} \cmidrule{7-10} 
& {1-shot} & {5-shot}\\

\midrule
MAML \citep{finn2017model} & {47.44 $\pm$ 0.23} & {61.75 $\pm$ 0.42}  \\ 
MAML + CML  &\jy{49.32 $\pm$ 0.37} & \jy{65.84 $\pm$ 0.46}    \\		
MAML + CML$^\dagger$  & \jh{{50.35 $\pm$ 0.15}} &  \jh{{66.43 $\pm$ 0.43}} \\
\midrule
MAML++ \citep{antoniou2018train} & {52.15 $\pm$ 0.26} & {68.32 $\pm$ 0.44}\\
MAML++ + CML  &\jy{52.46 $\pm$ 0.05} & \jy{70.08 $\pm$ 0.61}   \\
MAML++ + CML$^\dagger$  & \jh{52.86 $\pm$ 0.17} & \jh{{70.69 $\pm$ 0.49}} \\
\midrule
BOIL \citep{oh2020boil} & {49.61 $\pm$ 0.61} & {66.45 $\pm$ 0.37} \\
BOIL + CML  &  \jy{50.04 $\pm$ 0.30} & \jy{66.91 $\pm$ 0.13}   \\
BOIL + CML$^\dagger$ &  \jh{{50.83 $\pm$ 0.25}} & \jh{{67.50 $\pm$ 0.48}} \\
\midrule
Sharp-MAML \citep{abbas2022sharp} & {49.06 $\pm$ 0.52}  & {65.63 $\pm$ 0.54} \\
Sharp-MAML + CML & \jy{49.56 $\pm$ 0.45} & \jy{66.90 $\pm$ 0.20}  \\
Sharp-MAML + CML$^\dagger$ & \jh{{49.70 $\pm$ 0.62}} & \jh{{67.06 $\pm$ 0.16}} \\
\bottomrule
\end{tabular}
\end{adjustbox}
\label{table:mini}
\end{table}
\begin{table}[t]%[hbt!]
%\small 
\centering
\caption{Test accuracy for 5-way 1/5-shot of the MAML and CML framework on the diverse datasets.}
\begin{adjustbox}{width=1\linewidth}
%\small
\begin{tabular}{     l c   c c c c c c c}
\toprule
\multirow{2}{*}{Method} & \multicolumn{2}{c}{Omniglot (\%)} & \multicolumn{2}{c}{CIFAR-FS (\%)}
& \multicolumn{2}{c}{FC100 (\%)} & \multicolumn{2}{c}{VGG Flower (\%)}\\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
& {1-shot} & {5-shot } & {1-shot} & {5-shot} & {1-shot} & {5-shot} & {1-shot} & {5-shot} \\
\midrule
MAML & {91.78} & {96.59} & {56.55} & {70.10}  & {36.07} & {48.03}  & {63.17} & {74.48}  \\ 
CML  & \textbf{93.99} & \textbf{97.15} & \textbf{57.67} & \textbf{73.87}  & \textbf{36.90} & \textbf{51.06}  & \textbf{64.31} & \textbf{77.03}  \\	
\bottomrule
\end{tabular}
\end{adjustbox}
\label{table:dif_dataset}
\end{table}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\noindent\textbf{Results}\quad Table \ref{table:mini} shows that the proposed methods outperform the original algorithms. Note that CML, which removed the co-learner during meta-testing, improves the performance of the original algorithms. It indicates that the co-learner only performs meta-optimization, which successfully leads it to converge to well-generalized meta-initialization parameters. Specifically, on MAML++ \citep{antoniou2018train}, our framework achieves 70.08\% performance without any additional inference cost in meta-testing. In CML$^{\dagger}$, we infer through the co-learner instead of the meta-learner. CML$^{\dagger}$ outperforms CML because CML$^{\dagger}$ has more parameters.  More interestingly, the co-learner shows high performance without any adaptation. This suggests that our framework has a well-trained feature extractor, and the co-learner plays an important role in achieving this. It looks similar to BOIL \citep{oh2020boil}, but whereas BOIL freezes the meta-learner for representation changes, we introduce a co-learner to take advantage of the gradient augmentation effect of Theorem \ref{Theorem:loss_minize}. In other words, the co-learner provides a gradient augmentation effect to converge the feature extractor with meta-initialization parameters that enable good generalization. The effectiveness of this approach is also demonstrated across different datasets, as shown in Table \ref{table:dif_dataset}.





\begin{table}[t]
%\small
\centering
\caption{Results on node classification with CML. The G-Meta and AMM-GNN algorithms are from \citep{tan2022transductive}. All our experiments are performed in 5 runs.}
\begin{adjustbox}{width=1.0\linewidth}
%\small
\begin{tabular}{     l c c  c c  c c}
\toprule
\multirow{2}{*}{Method} & \multicolumn{2}{c}{CiteSeer 2-way (\%)} & \multicolumn{2}{c}{Amazon 2-way (\%)} & \multicolumn{2}{c}{CoraFull 5-way (\%)}  \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7}
% 	 \cmidrule{3-6} \cmidrule{7-10} 
& {1-shot} & {5-shot} & {1-shot} & {5-shot} & {1-shot} & {5-shot}\\

\midrule

G-Meta  & {55.15} & {64.53} & {70.57} & {85.96} & {60.44} & {75.84} \\
G-Meta + CML & \textbf{{61.17}} & \textbf{{76.07}} & \textbf{{72.26}} & \textbf{{87.10}} & \textbf{{60.49}} & \textbf{{76.02}} \\
\midrule

AMM-GNN  & {54.53} & {62.93} & {74.29} & {80.10} & {58.77} & {75.61}\\
AMM-GNN + CML & \textbf{{61.13 }} & {\textbf{66.88}} & \textbf{{78.91}} & \textbf{{86.68}} & \textbf{{63.27}} & \textbf{{76.19}} \\

\bottomrule
\end{tabular}
\end{adjustbox}

\label{table:gnn}
\end{table}



\begin{figure*}[t]
\centering % 3개 일 떄0.32
\subfloat[\label{fig:gradnoise}Accuracy curves]{{\includegraphics[width=0.25\textwidth ]{IMAGE/MAML_noise_final_new.png}}}
\subfloat[\label{fig:gradsim}Gradient Similarity]{{\includegraphics[width=0.25\textwidth ]{IMAGE/gradient_conv4_new.png}}}
\subfloat[\label{fig:gradnorm}Gradient Norm]{{\includegraphics[width=0.25\textwidth ]{IMAGE/[inner]GradNorm_final_new.png} }}
\subfloat[\label{fig:cka}CKA Similarity]{{\includegraphics[width=0.25\textwidth ]{IMAGE/cka_new5_new.png}}}

\caption{\textbf{(a)} Accuracy of MAML with random noise and CML. \textbf{(b)} Gradient similarity for the meta-learner and co-learner of the 4th convolution layer. \textbf{(c)} Comparison of gradient norm for the feature extractor in MAML, CL and CML after task-adaptation in the inner loop. At this point, we ignore the effect of bias, because of its negligible impact. \textbf{(d)} CKA Similarity results of representations before and after task-adaptation in the inner loop.} 
% All our experiments are evaluated on MiniImagenet 5-way 5-shot, and the network structure and experimental settings are shown in Section \ref{exp:image}.
\end{figure*}



%%%%%%%%%%%%%%%%%%%%% Exp: Few-shot node classification %%%%%%%%%%%%%%%%%%%

\subsection{Few-shot node classification}
\label{exp:gnn}
%%%%% 앞에 network설명 부록으로 빼고 AMM, GNN 설명 간단하게 넣기?
In this experiment, we evaluate CML on few-shot node classification with graph neural networks (GNNs). Few-shot node classification aims to achieve fast adaptation to new node tasks that are unseen during training. We also define an N-way K-shot problem and consider node tasks $\mathcal{T}_{node}$ which consist of support nodes $\mathcal{D}_{node}^{\mathcal{S}}$ and query nodes  $\mathcal{D}_{node}^{\mathcal{Q}}$. For performance comparison, we use G-Meta \citep{huang2020graph} and AMM-GNN \citep{wang2020graph}, which belong to gradient-based meta-learning with GNNs, as baselines and evaluate on the CoraFull, Amazon-Computer and CiteSeer datasets \citep{sen2008collective, shchur2018pitfalls}. We perform 5-way 1/5-shot classification on CoraFull and 2-way 1/5-shot classification on Amazon-Computer and CiteSeer. Our base model follows \citet{tan2022transductive}, using GCN as an encoder of hidden size 16 and a fully connected layer as a meta-learner. We train using the Adam optimizer with a step size of 0.001. Also, we use 20 inner gradient steps with a step size of 0.05. In the CML framework, our co-learner additionally includes 1 hidden layer of size 16 and a fully connected layer as the output layer, using the loss scaling factor $\gamma$ of 0.2. As shown in Table \ref{table:gnn}, our framework outperforms the baseline methods on node classification. In this experiment, the co-learner improves the performance of the encoder and meta-learner, despite having a very simple network structure. It shows that our framework is suitable for solving the few-shot problems and is applicable to various DNN methods related to meta-learning.



%%%%%%%%%%%%%%%%%%%%% Exp: gradient analysis %%%%%%%%%%%%%%%%%%%
\subsection{Gradient augmentation analysis}
\label{exp:4.4_cml_grad}
%% camera ready 수정
In this section, we analyze the effect of gradient augmentation by a co-learner. All of our experiments are evaluated on MiniImagenet 5-way 5-shot, and the network structure and experimental settings are as in Section \ref{exp:image}.



\noindent\textbf{Is the gradient of the co-learner really meaningful?}\quad To verify that the gradient of the co-learner is applied as meaningful noise on the meta-gradient, we compare it to MAML with random noise. 
To generate random noise, we attach to MAML a co-learner that is never updated (i.e., in neither the inner nor the outer loop). By doing so, the meta-gradient of MAML is updated with a randomized gradient added to the original gradient. For a fair comparison, both models have the same initialization parameters and take the same sampled data as input. Our CML outperforms MAML with random noise and converges to well-generalized parameters much faster, as shown in Figure \ref{fig:gradnoise}. It shows that the gradient of the co-learner influences the meta-gradient with meaningful noise, not just random noise.


\noindent\textbf{The co-learner induces the diversity of the meta-gradient}\quad
\citet{yang2020gradaug} learns a well-generalized full network by inducing gradient diversity with multiple-forwarding of subnetworks. Inspired by this, we perform gradient augmentation on the meta-gradient by updating the proposed co-learner only in the outer loop, unlike the meta-learner, to induce gradient diversity. To demonstrate this, we compare it to Collaborative Learning (CL) \citep{song2018collaborative}, a multi-branch framework approach that does not freeze the co-learner, i.e., CL is like a multi-head framework that updates both the meta-learner and the co-learner. \citet{oh2020boil} shows that the convolution layer before the classifier on task-adaptation is the key to inducing representation change. Based on their findings, we compare CL and CML by computing the gradient similarity of the 4th convolution layer of the feature extractor calculated from each meta-learner and co-learner. From Figure \ref{fig:gradsim}, we observe that CML has a lower gradient similarity between the meta-learner and co-learner in the feature extractor than CL. In general, a value closer to 1 suggests that the patterns and features captured are more similar. Our CML explores more different directions during the optimization process than CL due to its co-learner. It indicates that the co-learner produces a notably more diverse gradient, attributed to the augmentation effect within the meta-gradient. We also show that the gradient similarity of CML is larger than zero, which satisfies the precondition in Theorem \ref{Theorem:loss_minize}.

\noindent\textbf{Effect of the gradient augmentation}\quad In this experiment, we investigate the impact of the augmentation on the meta-gradient. Firstly, we analyze the norm of the gradient for each convolution layer in the feature extractor after task-adaptation in the inner loop. The gradient norm is an important indicator of how much a particular layer affects learning.
% \citep{oh2020boil} finds via gradient norm that changes in high-level features are important.
Figure \ref{fig:gradnorm} shows the averaged gradient norm of each convolution layer in the feature extractor for MAML, CL and CML. We observe that CL and MAML have very small gradient norms, close to zero on all convolution layers. It indicates that they mostly maintain the existing representation with minimal changes for a new task. However, our framework has relatively larger gradient norms, which indicates that the model is dynamically adapting to new tasks, and there is an amplification of diversity on the meta-gradient from the co-learner.
We also perform a Centered Kernel Alignment (CKA) \citep{kornblith2019similarity} analysis to compare representation similarity before and after adaptation. CKA is one way to compare the similarity of representations, and a CKA value close to 1 means that the two representations are similar. Figure \ref{fig:cka} shows the CKA similarity of MAML and CML representations before and after task-adaptation. In MAML, the change in representation occurs only at the head. On the other hand, in CML the representation change occurs in the 4th convolution layer, which shows that the co-learner in CML induces representation change at a high level. Thus, our results suggest that a new meta-gradient from the co-learner induces the model to learn more task-specific features.


\begin{table}[t]
%\small
\centering
\caption{Number of parameters and test accuracy on MiniImagenet 5-way 1/5-shot. CML and CL use the MAML framework as a baseline. The ``$\star$'' and ``$\dagger$'' indicate the output of the meta-learner and co-learner, respectively. All experiments are performed in 3 runs.}
% The "$\dagger$" indicates the output of the co-learner in our framework and "$\star$" and "$_\star^\dagger$" means only updating the feature-extractor $\&$ meta-learner and feature-extractor $\&$ co-learner, respectively in meta-testing. All experiments are performed 3 runs.}
\begin{adjustbox}{width=0.9\linewidth}
%\small
\begin{tabular}{     l c c |  c c}
\toprule
\multirow{2}{*}{Method} & \multicolumn{2}{c|}{Parameters \#} & \multicolumn{2}{c}{MiniImagenet 5-way (\%)}\\
% 	 \cmidrule{3-6} \cmidrule{7-10} 
& Train & Test & {1-shot} & {5-shot}\\
\midrule
MAML  & 129K & 129K &  {47.44 $\pm$ 0.23} & {61.75 $\pm$ 0.42} \\
 More-MAML  & 232K & 232K & {48.48 $\pm$ 0.60} & {62.53 $\pm$ 0.12} \\
 CL  & 203K & 203K & {47.57 $\pm$ 0.15} & {62.36 $\pm$ 1.02} \\ % 2개 다 사용
 \midrule
 \multicolumn{5}{l}{\textbf{\textit{(1) Comparison to the meta-learner}}} \\
 CL$\star$  & 203K & 129K & {47.45 $\pm$ 0.13} & {61.60 $\pm$ 1.39} \\ % meta learner만 추론
 CML (Ours) & 203K & 129K & \textbf{{49.32 $\pm$ 0.37}} & \textbf{65.84 $\pm$ 0.46} \\
 \midrule
 \multicolumn{5}{l}{\textbf{\textit{(2) Comparison to the co-learner}}} \\
 % CL$^\dagger$  & 203K & 203K & {48.45 $\pm$ 0.40} & {62.50 $\pm$ 0.62} \\
 % CL$_\star^\dagger$  & 203K & 195K &  {48.54 $\pm$ 0.30} &  {62.74 $\pm$ 0.50} \\
 CL$^\dagger$  & 203K & 195K & {48.45 $\pm$ 0.40} & {62.50 $\pm$ 0.62} \\
 CL$^\dagger$ w/o adaptation & 203K & 195K & { 20.66 $\pm$ 0.38 } & { 20.54 $\pm$ 2.61 } \\
 CML$^\dagger$ (Ours) & 203K & 195K & \textbf{{50.35 $\pm$ 0.15}} &  \textbf{{66.43 $\pm$ 0.43}} \\
\bottomrule
\end{tabular}
\end{adjustbox}
\label{table:more_param}
\end{table}


%%%%%%%%%%%%%%%%%%%%% Exp: structure %%%%%%%%%%%%%%%%%%%
\subsection{Efficiency analysis of the CML structure}
\label{exp:4.5_cml_structure}

In this section, we conduct an experiment to justify the validity of our framework's structure. Our framework requires more parameters during meta-training due to the addition of the co-learner. Therefore, we compare the parameter sizes of CML, CL and MAML with more parameters, called More-MAML, to demonstrate that our framework does not simply improve performance by having more parameters. In this experiment, CML and CL follow the same network architecture as in Section \ref{exp:image}, while More-MAML has additional convolution layers. From Table \ref{table:more_param}, we can see that More-MAML, CL and CML have 232K, 203K and 203K parameters, respectively, during meta-training. Note that CML has fewer parameters than More-MAML and CL in meta-testing, but shows better performance on the MiniImagenet dataset. Also, in setting (1), CL$\star$ shows a performance degradation when inferring with only the meta-learner, as CML does. In setting (2), both CL$^{\dagger}$ and CML$^{\dagger}$ use the co-learner to evaluate performance. We observe that without performing adaptation, CL$^{\dagger}$ leads to a deterioration in the model's inferential capabilities. It emphasizes that adaptation is essential in the general case, and that our approach has a uniquely structured framework. Notably, although our co-learner does not perform task-adaptation during meta-testing in CML$^\dagger$, it outperforms models with a similar number of parameters while achieving the highest accuracy. In this experiment, our findings highlight that having more parameters in meta-learning does not necessarily lead to improved performance, while our framework provides an effective learning framework to address this limitation.



\begin{table}[t]
%\small
\centering
\caption{Ablation study of the loss scaling factor.}
\begin{adjustbox}{width=0.7\linewidth}
%\small
\begin{tabular}{  c  c  c}
\toprule
\multirow{2}{*}{Loss scaling factor ($\gamma$)} & \multicolumn{2}{c}{MiniImagenet 5-way (\%)} \\ % & \multicolumn{1}{c}{5-shot}\\
& \multicolumn{1}{c}{1-shot}  & \multicolumn{1}{c}{5-shot}\\
% & \multicolumn{1}{c}{\centering Accuracy (\%)}  & \multicolumn{1}{c}{\centering Accuracy (\%)}\\

\midrule
 {0.2} & {49.07} & {64.39} \\ 
 {0.5} & \textbf{49.61} & {65.53}   \\		
 {0.8} & {49.07} & {64.73} \\
 {1.0} & {49.32} & \textbf{65.84} \\
\bottomrule
\end{tabular}
\end{adjustbox}
\label{table:ablation}
\end{table}


%%%%%%%%%%%%%%%%%%%%%%%%% Ablation study %%%%%%%%%%%%%%%%%%%%%%%%%

\subsection{Ablation study}
\label{exp:4.6_ablation}
\noindent\textbf{Update scheme for loss scaling factor}\quad The proposed method has a parameter $\gamma$ that controls the influence of the co-learner on the feature extractor. To verify the effect of this influence, we conduct experiments for 5-way 1/5-shot on the MiniImagenet dataset. Table \ref{table:ablation} shows that our method has higher performance than conventional MAML regardless of $\gamma$. In particular, the 1-shot and 5-shot settings achieve the highest performance at $\gamma = 0.5$ and $\gamma = 1.0$, respectively. This result shows that our method is robust to $\gamma$ and suggests that the intervention of the co-learner is important.




%%%%%%%%%%%%%%%%%%%%%%%%% Conclusion %%%%%%%%%%%%%%%%%%%%%%%%%
\section{Conclusion and Discussion}
In this paper, we propose a novel training framework called Cooperative Meta-Learning (CML). The main idea of our framework is that the proposed co-learner in meta-training generates a gradient augmentation effect. To achieve this, we design the co-learner so that it only updates in the outer loop and can be easily deleted in meta-testing. Our experiments demonstrate that our co-learner generates meaningful gradients, which leads to diversity on the meta-gradient, and this guides the learning direction to better meta-initialization parameters. It also shows that the diversity of the meta-gradient is a key factor in its strong generalization ability in the few-shot problem.



%%%%%%%%%%%%%%%%%%%%%%%%% Acknowledgements %%%%%%%%%%%%%%%%%%%%%%%%%
\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
This work was carried out with the support of ``Cooperative Research Program for Agriculture Science and Technology Development (Project No. RS-2024-00332198)'', Rural Development Administration, Republic of Korea.
\end{acknowledgements}

% References
\bibliography{uai2024-template}

\newpage

\onecolumn



%%%%%%%%%%%%%%%%%%%%%%%%% Appendix %%%%%%%%%%%%%%%%%%%%%%%%%

\title{Cooperative Meta-Learning with Gradient Augmentation\\(Supplementary Material)}
\maketitle




\appendix
\section{Implementation details}
\subsection{Image classification}
\label{appendix:image_details}
In our experiments, BOIL \citep{oh2020boil} and MAML++ \citep{antoniou2018train} demonstrate results that are highly consistent with the original papers, so we report the results from the original papers.\footnote{\url{https://github.com/jhoon-oh/BOIL}}\textsuperscript{,}\footnote{\url{https://github.com/AntreasAntoniou/HowToTrainYourMAMLPytorch}} In addition, the MAML \citep{finn2017model} results follow the experiments of \citet{oh2020boil}. However, in the case of Sharp-MAML, the results obtained using the official code in the same experimental setup differed from those in the original paper. Therefore, we report the experimental results based on our execution following the official code.\footnote{\url{https://github.com/mominabbass/Sharp-MAML}}

\noindent \textbf{Architecture} We used the 4-conv network model, following \citet{finn2017model}. In detail, the model contains four 3 $\times$ 3 convolution layers with batch normalization, a ReLU nonlinearity and 2 $\times$ 2 max-pooling, followed by a fully connected layer. CML additionally includes two 3 $\times$ 3 convolution layers and a fully connected layer as a co-learner.

\noindent \textbf{Experimental settings} We follow the original settings for each algorithm. For task-adaptation, we use 5 inner-gradient steps for Sharp-MAML and MAML++ and 1 inner-gradient step for the rest, following \citet{oh2020boil}. In the CML framework, we train using a loss scaling factor of $\gamma = 1$. We perform 3 runs and report all our results from the model with the best validation accuracy. We used the PyTorch framework and a GeForce RTX 3090 GPU for all experiments. 

\noindent \textbf{Datasets} We evaluate our method on the following benchmark datasets. \textbf{MiniImagenet} contains 60,000 images of size 84 $\times$ 84 from 100 classes, with 600 images for each class. \textbf{Omniglot} contains 32,460 images of size 28 $\times$ 28 of handwritten characters, covering 1,623 different characters from 50 alphabets. \textbf{CIFAR-FS} is randomly sampled from CIFAR-100 and contains 600 images of size 32 $\times$ 32 for each of its 100 classes. \textbf{FC100} is also a split of CIFAR-100 and contains 600 images of size 32 $\times$ 32 for each of its 100 classes. \textbf{VGG-Flower} contains 258 images of size 32 $\times$ 32 for each class.
\begin{table}[h]
%\small
\centering
\caption{Statistics of the node classification datasets.}
\begin{adjustbox}{width=0.7\linewidth}
%\small
\begin{tabular}{    c | c c c c }
\toprule
\multirow{1}{*}{Dataset} & \multicolumn{1}{c}{Nodes \#} & \multicolumn{1}{c}{Edge \#} & \multicolumn{1}{c}{Features \#} & \multicolumn{1}{c}{Class split (train / validation / test)} \\

\midrule
CoraFull  & 19,793  & 63,421  & 8,710 & 40 / 15 / 15  \\
Amazon-Computer  & 13,752  & 245,861  & 767 & 4 / 3 / 3  \\
CiteSeer & 3,327  & 4,552  & 3,703 & 20 / 10 / 10  \\
\bottomrule
\end{tabular}
\end{adjustbox}

\label{table:datasets_node}
\end{table}
\subsection{Node classification}
We perform our experiments with the same environment from the official code \footnote{\url{https://github.com/Zhen-Tan-dmml/TLP-FSNC}} of \citep{tan2022transductive}. We experiment on the CoraFull, Amazon-Computer and CiteSeer datasets from Table \ref{table:datasets_node}, which are widely used in node classification.


%%%%%%%%%%%%%%%%%%%%% Appendix: Comparison of meta-learner and co-learner for the same capacity%%%%%%%%%%%%%%%%%%%%%%
\section{Comparison of the meta-learner and co-learner for the same parameters}


\begin{table}[h]
\centering
\caption{Results on the performance of the meta-learner and co-learner with the same parameters. Our CML framework uses MAML as a baseline with a shared feature extractor, meta-learner, and co-learner. The ``$\ast$'' indicates that the model failed to converge.}
\begin{tabular}{ c | c | c c}
\toprule
\multirow{2}{*}{Structure} & \multirow{2}{*}{Learner} & \multicolumn{2}{c}{MiniImagenet 5-way (\%)} \\
& & 1-shot & 5-shot \\
\midrule
\multirow{2}{*}{Conv(0)}
& Meta-learner & \textbf{{49.52 $\pm$ 0.41}} & \textbf{{65.82 $\pm$ 0.55}}   \\ 
& Co-learner & {49.16 $\pm$ 0.47} & {65.13 $\pm$ 0.27} \\
\midrule
\multirow{2}{*}{Conv(2)}
& Meta-learner & \textbf{{48.06 $\pm$ 0.97}} & \textbf{{66.20 $\pm$ 0.44}}   \\ 
& Co-learner & $\ast$  & {65.44 $\pm$ 0.27} \\
\bottomrule
\end{tabular}
\label{table:meta_learner_same_capacity}
\end{table}



We conduct experiments on the performance of the meta-learner and co-learner with the same capacity. In this experiment, our feature extractor has the four convolution layers described in Section~\ref{exp:image}, and the co-learner and meta-learner are evaluated with the same structure: either Conv(0), with no convolution layer and only a fully connected layer, or Conv(2), with two convolution layers and a fully connected layer. From Table \ref{table:meta_learner_same_capacity}, we observe that the meta-learner that performs task-adaptation in meta-testing achieves higher performance than the co-learner. It can be seen that our feature extractor already has good performance during meta-training, rather than the co-learner having good performance despite not performing task-adaptation. Therefore, the co-learner assists the learning of the feature extractor during meta-training and guides it to converge in a good direction. However, we find that our co-learner fails to converge on the 1-shot problem with the Conv(2) structure. This suggests that we need to empirically evaluate the optimized structure of the co-learner based on the network architecture.

%%%%%%%%%%%%%%%%%%%%% Appendix: the number of Conv-layer %%%%%%%%%%%%%%%%%%%%%%
\section{Ablation study on the number of Conv-layer in the co-learner}

\begin{table}[h]
%\small
\centering
\caption{Test accuracy (\%) by the number of convolution layers in the co-learner on MiniImagenet 5-way 1-shot. We use MAML as a baseline and follow the experimental settings in Section~\ref{exp:image}.}
\begin{adjustbox}{width=0.6\linewidth}
%\small
\begin{tabular}{    c | c c c c c}
\toprule
% \multirow{1}{*}{Dataset} & \multicolumn{1}{|c}{Nodes \#} & \multicolumn{1}{c}{Edge \#} & \multicolumn{1}{c}{Features \#} & \multicolumn{1}{c}{Class split (train / validation / test)} \\
Conv layer (\#) & Conv(0) & Conv(1) & Conv(2) & Conv(3) & Conv(4) \\
\midrule
CML & \textbf{{49.52 $\pm$ 0.41}} & {49.39 $\pm$ 0.51} & 49.32 $\pm$ 0.37 & {48.85 $\pm$ 0.28} & {49.36 $\pm$ 0.23} \\
\, CML$^{\dagger}$ & {49.16 $\pm$ 0.47} &50.21 $\pm$ 0.60 & \textbf{{50.35 $\pm$ 0.15}} & {49.82 $\pm$ 0.29} & {50.17 $\pm$ 0.28} \\
\bottomrule
\end{tabular}
\end{adjustbox}
\label{table:num_co-learner}
\end{table}



We explore the impact of the structure of the co-learner in our CML framework. Table \ref{table:num_co-learner} shows that all models outperform standard MAML. In particular, for the meta-learner output (CML), Conv(0), which has only a fully connected layer without a convolution layer, achieves the highest performance. It clearly shows that our learning framework is effective in leading convergence to well-generalized meta-initialization parameters. Also, the co-learner output (CML$^{\dagger}$) of the Conv(2) model with two convolution layers achieves the highest accuracy of 50.35\%.

%%%%%%%%%%%%%%%%%%%%% Appendix: T-sne %%%%%%%%%%%%%%%%%%%%%%
\section{CML TO A LARGER NETWORK}
We run experiments on CIFAR-FS, VGG-Flower, and FC100 with a larger network, ResNet12 \citep{oreshkin2018tadam}. The findings demonstrate that CML enhances the performance of MAML, even with larger backbone architectures. This improvement can be attributed to the enhanced representational ability of the feature extractor facilitated by the co-learner, irrespective of the backbone network size. 
\begin{table}[h]
\centering
\caption{Test accuracy of Resnet12 network with the CML framework on CIFAR-FS, VGG Flower, and FC100 dataset.}
\begin{adjustbox}{width=0.5\linewidth}
%\small
\begin{tabular}{     l c c c c c c}
\toprule
\multirow{2}{*}{Method} & \multicolumn{2}{c}{CIFAR-FS (\%)} & \multicolumn{2}{c}{VGG Flower (\%)}
& \multicolumn{2}{c}{FC100 (\%)}\\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} 
& {1-shot} & {5-shot } & {1-shot} & {5-shot} & {1-shot} & {5-shot} \\
\midrule
MAML & {61.86} & {73.32} & {63.43} & {75.42} & {36.61} & {47.48}   \\ 
CML  & \textbf{62.11} & \textbf{78.31} & \textbf{66.07} & \textbf{81.15}  & \textbf{37.56} & \textbf{51.51}  \\	
\bottomrule
\end{tabular}
\end{adjustbox}
\label{table:dif_dataset_resnet12}
\end{table}








%%%%%%%%%%%%%%%%%%%%% Appendix: T-sne %%%%%%%%%%%%%%%%%%%%%%


\section{VISUALIZATION OF CML BY T-SNE}

\begin{figure}[h]
\centering
\subfloat[\label{fig:tsne_maml}MAML]{{\includegraphics[width=0.5\columnwidth ]{IMAGE/tsne_maml.png}}}
\\
\subfloat[\label{fig:tsne_cml}CML]{{\includegraphics[width=0.5\columnwidth ]{IMAGE/tsne_cml.png} }}
\caption{t-SNE visualization of (a) MAML and (b) CML trained on MiniImagenet. We perform the adaptation with the support set and then evaluate the method with the query set.}
\label{fig:tnse}
\end{figure}


t-SNE \citep{van2008visualizing} is a typical dimension reduction technique that maps high-dimensional data into a lower-dimensional space. It allows us to assess the similarity of data points before and after adaptation. We experiment with t-SNE for MAML and for CML without the co-learner, so that both have the same parameters at inference time. In Figure~\ref{fig:tnse}, our CML shows the ability to form more consistent and distinct clusters than MAML. It can be seen that the intervention of the co-learner attached to the CML produces a gradient augmentation effect, which leads to better generalization performance.









\end{document}
