\documentclass[accepted]{uai2022} %
\usepackage{xr}
\makeatletter
% \addFileDependency{<file>}
%   Records <file> as a dependency of this document: echoes its name to the
%   log in parentheses, appends it to the \listfiles file list, and prints a
%   notice when the file cannot be found.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
%   Imports the labels of <basename> via xr's \externaldocument and registers
%   both <basename>.tex and <basename>.aux as file dependencies.
\newcommand*{\myexternaldocument}[1]{%
  \externaldocument{#1}%
  \addFileDependency{#1.tex}%
  \addFileDependency{#1.aux}%
}

\myexternaldocument{wang_312-supp}
\usepackage[american]{babel}
\graphicspath{{figures/}{../figures/}{figs/}{../figs/}}
\usepackage{fancyhdr}
\usepackage{color}
\usepackage{url}
\usepackage{multirow}
\usepackage{graphicx} %
\usepackage{times}
\usepackage{latexsym}
\usepackage{multicol}
\usepackage{multirow}

\usepackage{placeins}


\usepackage{tabularx} %
\usepackage{amssymb}%
\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{pifont}%


\usepackage{ragged2e}


\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%

\usepackage{hyperref}
\usepackage{url}
\usepackage{dirtytalk}





% Cross-reference shorthands: each prints a fixed noun, a non-breaking space,
% and the referenced number in parentheses.
% \eref uses amsmath's \eqref (as the algorithm listing already does) so the
% parentheses stay upright in italic contexts.
% NOTE(review): \cref shares its name with cleveref's main command; loading
% cleveref in this document would clash with the definition below.
\newcommand{\sref}[1]{Section~(\ref{#1})}
\newcommand{\eref}[1]{Eq.~\eqref{#1}}
\newcommand{\rref}[1]{Theorem~(\ref{#1})}
\newcommand{\iref}[1]{Inequality~(\ref{#1})}
\newcommand{\lref}[1]{Lemma~(\ref{#1})}
\newcommand{\cref}[1]{Condition~(\ref{#1})}
\newcommand{\dref}[1]{Definition~(\ref{#1})}
\newcommand{\coref}[1]{Corollary~(\ref{#1})}


% Upright-bold (\mathbf) shorthands for vectors and matrices.
% Each symbol is defined exactly once; the earlier duplicate definitions of
% \x, \z, \y, \s, \h, \p and \W were identical and have been removed.
% \def is kept (rather than \newcommand) so that, as before, these definitions
% silently replace any existing macro of the same name.
\def\e{{\mathbf e}}
\def\x{{\mathbf x}}
\def\z{{\mathbf z}}
\def\y{{\mathbf y}}
\def\X{{\mathbf X}}
\def\bZ{{\mathbf Z}}
\def\Y{{\mathbf Y}}
\def\s{{\mathbf s}}
\def\h{{\mathbf h}}
\def\p{{\mathbf p}}
\def\W{{\mathbf W}}
\def\Ib{{\mathbf I}}
\def\Cb{{\mathbf C}}




\usepackage{amsfonts,amssymb,bm}
\usepackage{pifont} %
\usepackage{graphicx,subfigure,epsfig,fancybox} %
\usepackage{float}
\usepackage{color} %
\usepackage{multirow}
\usepackage{natbib}

% \mycite{<text>}: prints (<text>) in \small.
% \small is a declaration, not a one-argument command, so it is confined to an
% explicit group here; otherwise the smaller size would leak into the text
% that follows the macro.
\newcommand{\mycite}[1]{{\small (#1)}}
% --- Editing marks ---------------------------------------------------------
% \revise and \revised take one argument but never use it: the body is an
% empty blue \textcolor, so anything wrapped in them is dropped from the output.
\newcommand{\revise}[1]{\textcolor{blue}{}}
\newcommand{\revised}[1]{\textcolor{blue}{}}
% --- Generic math notation -------------------------------------------------
% \vec is redefined from the default arrow accent to bold (\boldsymbol).
\renewcommand{\vec}[1]{\boldsymbol{#1}}
% \eq typesets its argument inside auto-sized angle brackets.
\newcommand{\eq}[1]{\left\langle #1 \right\rangle}
\newcommand{\mat}[1]{\mathbf{#1}}
% Transpose: appends a sans-serif superscript T.
\newcommand{\trans}[1]{#1^{\textsf{T}}}
\newcommand{\Count}[1]{\text{count}(#1)}
% \inv uses fixed-size parentheses, \Inv auto-sized ones.
\newcommand{\inv}[1]{(#1)^{-1}}
\newcommand{\Inv}[1]{\left(#1\right)^{-1}}
\newcommand{\set}[1]{\mathcal{#1}}
\newcommand{\diag}[1]{\text{diag}\left(#1\right)}
% --- Text annotation helpers -----------------------------------------------
\newcommand{\annotation}[1]{\textsc{#1}}
\newcommand{\annot}[1]{\annotation{#1}}
% NOTE(review): \tagword (and \fstransx below) call \example, which is not
% defined anywhere in the visible source -- confirm it exists before use.
\newcommand{\tagword}[2]{\example{#1}/\annot{#2}}
\newcommand{\phon}[1]{\textsf{#1}}
\newcommand{\expect}[1]{E[{#1}]}
% Partial derivative of #1 with respect to #2.
\newcommand{\dd}[2]{\frac{\partial #1}{\partial #2}}
% NOTE(review): this replaces the LaTeX kernel's \symbol (character-by-code
% access) with a text-mode "<#1>" in angle brackets; code that relies on the
% kernel meaning will be affected.
\renewcommand{\symbol}[1]{$\langle$#1$\rangle$}
\newcommand{\sentstart}[0]{\symbol{s}}
\newcommand{\sentend}[0]{\symbol{/s}}
\newcommand{\tuple}[1]{\langle #1 \rangle}
% Colored inline notes.
\newcommand{\cue}[1]{[\textcolor{blue}{#1}]}
\newcommand{\question}[1]{\textcolor{red}{#1}}
% Labelled arrows: #1 is set above the arrow, #2 below it.
\newcommand{\fstransx}[2]{\xrightarrow[\example{#2}]{\example{#1}}}
\newcommand{\fstrans}[2]{\xrightarrow[{#2}]{#1}}
% \gloss switches to font family phv for its argument only.
\newcommand{\gloss}[1]{{\fontfamily{phv}\selectfont #1}}
\newcommand{\newadd}[1]{\textcolor{magenta}{#1}}

% --- Symbol shorthands -----------------------------------------------------
\newcommand{\from}[0]{\leftarrow}
\newcommand{\reals}[0]{\mathbb{R}}
% \normal{<args>}: calligraphic N followed by auto-sized parentheses.
\newcommand{\normal}[1]{\mathcal{N}\left(#1\right)}
% \cpic[<width>]{<file>}: centered image; width defaults to .95\textwidth.
\newcommand{\cpic}[2][.95\textwidth]{\begin{center}\includegraphics[width=#1]{#2}\end{center}}
\newcommand{\bigo}[1]{\mathcal{O}(#1)}
\newcommand{\Pmult}[0]{P_{\text{multinomial}}}
\newcommand{\Pcat}[0]{P_{\text{categorical}}}
% --- Bold vector shorthands ------------------------------------------------
% These go through \vec, which this preamble redefines as \boldsymbol.
\newcommand{\vw}[0]{\vec{w}}
\newcommand{\vwt}[0]{\trans{\vec{w}}}
\newcommand{\vd}[0]{\vec{d}}
\newcommand{\vf}[0]{\vec{f}}
\newcommand{\vg}[0]{\vec{g}}
\newcommand{\vx}[0]{\vec{x}}
\newcommand{\vz}[0]{\vec{z}}
\newcommand{\vy}[0]{\vec{y}}
\newcommand{\vs}[0]{\vec{s}}
\newcommand{\vu}[0]{\vec{u}}
\newcommand{\vth}[0]{\vec{\theta}}
\newcommand{\vtht}[0]{\trans{\vec{\theta}}}
\newcommand{\vph}[0]{\vec{\phi}}
\newcommand{\jm}[0]{J\&M}
\newcommand{\dkl}[0]{\mathcal{D}_{KL}}
% Note: \Z is calligraphic Z, whereas \bZ (defined earlier) is bold Z.
\newcommand{\Z}[0]{\mathcal{Z}}

\newcommand{\vr}[0]{\vec{r}}
\newcommand{\vbeta}[0]{\vec{\beta}}


\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{proof}{Proof}
% --- Paper-specific notation -----------------------------------------------
% \task is the task symbol; \likeli is the per-task likelihood symbol L(T).
\newcommand{\task}{\mathcal{T}}
\newcommand{\likeli}{\mathcal{L} ({\task})}

% Learner symbols. \model, \modeli and \modelipost use \goal, which is
% defined further down; this works because macros expand at use time.
% \modeli and \modelipost carry the superscripts 0 and 1 and end in "()".
\newcommand{\modelnop}{\vf}
\newcommand{\model}{\vf_{\goal_{\task}}}
\newcommand{\modeli}{\vf_{\goal_{\task}^{0}}()}
\newcommand{\modelipost}{\vf_{\goal_{\task}^{1}}()}

% Loss symbol: calligraphic L followed by "oss".
\newcommand{\loss}{\mathcal{L}oss}


\newcommand{\point}{\mathbf{x}}
% \goal is bold theta; \goali adds the task subscript.
\newcommand{\goal}{\vec{\theta}}
% Per-task data symbols: superscript tr / te, subscript \task.
\newcommand{\datatr}{\vec{D}^{tr}_{\task}}
\newcommand{\datats}{\vec{D}^{te}_{\task}}
\newcommand{\biggoal}{\vec{\Theta}_{\task}}
\newcommand{\taskZ}{\vec{Z}_{\task}}
\newcommand{\datayitr}{\vec{Y}_{\task}^{tr}}
\newcommand{\dataxitr}{\vec{X}_{\task}^{tr}}
\newcommand{\datayite}{\vec{Y}_{\task}^{te}}
\newcommand{\dataxite}{\vec{X}_{\task}^{te}}
\newcommand{\goali}{\goal_{\task}}



\usepackage{adjustbox}
\usepackage{array}
\usepackage{booktabs}

\newcommand{\qnote}[1]{[\textcolor{blue}{Q-note: #1}]}
 \newcommand{\znote}[1]{[\textcolor{green}{Z-note: #1}]}



% Name of the proposed method, set in typewriter type.
% Both macros end in \xspace, which inserts a following space only when the
% next token is not punctuation. The xspace package is loaded here because
% \methodt already relied on it without it being loaded anywhere, and because
% the former hard-coded trailing space in \method produced "ST-MAML ." and
% "ST-MAML :" wherever the macro was followed by punctuation.
\usepackage{xspace}
\newcommand{\methodt}{\texttt{ST-MAML}\xspace}
\newcommand{\method}[0]{\texttt{ST-MAML}\xspace}



\newcommand{\func}[2]{\vec{g}_{#1}\left(#2\right)} 
\newcommand{\gfunc}[1]{\vec{s}\left(#1\right)} 
\newcommand{\btheta}{\bm{\theta}}
\newcommand{\bphi}{\bm{\phi}}


\newcommand{\here}{\textcolor{red}{\tt It ends here last time.}}

\def\etal{{\it et~al.}}



\setlength{\abovecaptionskip}{0pt plus 0pt minus 1pt}
\setlength{\belowcaptionskip}{0pt plus 0pt minus 1pt}

\usepackage{enumitem}
\setlist[itemize]{leftmargin=*}




% CompactItemize: a tight list whose items are marked with an en dash,
% using a 12pt left margin and -2pt between items.
% The label is a fixed dash, so no counter is attached: the former
% \usecounter{enumi} reset enumi to zero on entry, which would corrupt the
% numbering of an enclosing enumerate. Line ends are guarded with % so the
% definition inserts no stray spaces.
\newenvironment{CompactItemize}%
  {\begin{list}{--}{%
      \setlength{\leftmargin}{12pt}%
      \setlength{\itemsep}{-2pt}%
  }}%
  {\end{list}}





\usepackage[nolist,nohyperlinks]{acronym}

\makeatletter
% \removelatexerror: when executed, rebinds the internal \@latex@error to
% \@gobble (a macro that discards one argument), so LaTeX-level errors raised
% afterwards are no longer reported. \let is a local assignment, so the effect
% lasts only until the end of the group in which the macro is called.
% NOTE(review): presumably intended for placing a float-type environment where
% LaTeX would otherwise complain -- confirm at the call site.
\newcommand{\removelatexerror}{\let\@latex@error\@gobble}
\makeatother


\usepackage{lipsum}
\usepackage{titlesec}


\titlespacing\subsection{0pt}{5pt plus 2pt minus 1pt}{2pt plus 0pt minus 0pt}


\parskip 0pt
\setlength\abovedisplayskip{0pt}

\setlength{\textfloatsep}{3pt}
 \addtolength{\parskip}{0mm}

 \usepackage{etoolbox}
 \makeatletter
 \preto{\@tabular}{\parskip=3pt}
 \makeatother



\setlength{\belowdisplayskip}{0pt} \setlength{\belowdisplayshortskip}{0pt}
\setlength{\abovedisplayskip}{0pt} 
\setlength{\abovedisplayshortskip}{0pt}

\setlength{\abovecaptionskip}{0pt plus 0pt minus 1pt}
\setlength{\belowcaptionskip}{-5pt}


\usepackage{enumitem}
\setlist[itemize]{leftmargin=*}

\usepackage[font=small,skip=0pt]{caption}

\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{graphicx}

\usepackage{natbib} %
\bibliographystyle{abbrvnat}
\setcitestyle{authoryear,open={(},close={)}} %
    
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} %
\usepackage{booktabs} %
\usepackage{tikz} %


\newcommand{\swap}[3][-]{#3#1#2} %

\title{\method : A Stochastic-Task based Method for Task-Heterogeneous Meta-Learning}

\author[1]{\href{mailto:<zw6sg@virginia.edu>?Subject=Your UAI 2022 paper}{Zhe Wang}{}}
\author[1]{Jake Grigsby}
\author[1]{Arshdeep Sekhon}
\author[1]{Yanjun Qi}

\affil[1]{%
    Computer Science Dept.\\
    University of Virginia\\
    Charlottesville, Virginia, USA
}

  
  \begin{document}

\maketitle



\begin{abstract}
\label{abs}


Optimization-based meta-learning typically assumes tasks are sampled from a single distribution -- an assumption that oversimplifies and limits the diversity of tasks that meta-learning can model. 
Handling tasks from multiple distributions is challenging for meta-learning because it adds ambiguity to task identities. This paper proposes a novel method, \texttt{ST-MAML}, that empowers model-agnostic meta-learning (\texttt{MAML}) to learn from multiple task distributions. \method encodes tasks using a stochastic neural network module that summarizes every task with a stochastic representation. The proposed Stochastic Task (\texttt{ST}) strategy learns a distribution of solutions for an ambiguous task and allows a meta-model to self-adapt to the current task. \method also propagates the task representation to enhance input variable encodings. 
Empirically, we demonstrate that \method  outperforms the state-of-the-art on two few-shot image classification tasks, one curve regression benchmark, one image completion problem, and a real-world temperature prediction application. 




\end{abstract}

\section{Introduction}
\label{intro}



Meta-learning aims to train a model on multiple machine learning tasks so that it can adapt to a new task with only a few training samples. Optimization-based meta-learning methods like model-agnostic meta-learning (MAML) facilitate such a goal by involving the optimization process. For example, MAML trains a global initialization of model parameters that is close to the optimal parameter values of every task~\citep{finn2017model}.  Recent methods expand MAML's ``global initialization'' to a notion of ``globally shared knowledge'', including not only  initialization~\citep{finn2017model, li2017meta, rajeswaran2019meta} but also update rules~\citep{andrychowicz2016learning,  Ravi2017OptimizationAA}. Globally shared knowledge
allows these methods to produce good generalization performance on new tasks with a small number of training samples. 

\begin{figure}[t]
  \centering
  \includegraphics[width=0.99\columnwidth]{FigChallenge.pdf}
  \caption{
We are given three red dots representing the training data for a meta-test task. The dashed and solid curves are potential explanations of the data (better read in color). (a) Homogeneous setup. All meta-training tasks are sampled from linear regression family. (b) Heterogeneous setup. The meta-training tasks are sampled from three possible  function families including \textit{sinusoids, straight line, and quadratic}. It is difficult to figure out what family this meta-test task is sampled from, due to limited annotated data and three possible meta distributions.  }    	\label{fig:abs_demostration}
\end{figure}



	

Most optimization-based meta-learning algorithms assume all tasks $\task$ are independently and identically sampled from a single  distribution~\citep{andrychowicz2016learning, finn2017model, li2017meta, Ravi2017OptimizationAA, rusu2018meta}. We refer to meta-learning's task distribution as the ``meta-distribution''. Formally, these methods assume  $\task \sim P(\task)$. Real-world tasks, however, may come from multiple meta-distributions, $\task \sim \{P_1(\task), P_2(\task), \cdots, P_k(\task)\}$. For instance, when analyzing multiple writers' handwritten digits, writers from different age groups (like children versus adults) indicate different meta-distributions. This more challenging setup, which we call task heterogeneity, poses technical challenges to homogeneous strategies like MAML~\citep{vuorio2019multimodal}. 




For task heterogeneity, a naive and widely accepted meta-learning solution first learns a globally shared initialization across all meta-distributions and then tailors model parameters to the current task~\citep{vuorio2019multimodal, yao2020automated, yao2019hierarchically, lee2018gradient, oreshkin2018tadam}. The tailoring step needs to rely on the task-specific information or, ideally, the identity information of the task. 
It, therefore, requires the meta-learner to infer the potential identity of a new task from a limited number of annotated samples~\citep{finn2018probabilistic}. This requirement raises severe uncertainty issues -- a challenge known as ``task ambiguity''. Figure \ref{fig:abs_demostration} provides a concrete example of the task ambiguity that arises from limited annotated data and unknown meta distribution when facing task heterogeneity.  Surprisingly, recent optimization-based meta-learning literature pays little attention to the task ambiguity challenge ~\citep{vuorio2019multimodal, yao2020automated, yao2019hierarchically, lee2018gradient}. Besides,  the task heterogeneity amplifies the ``distribution shift'' issue \citep{zhang2021adaptive, dubey2021adaptive}. The 
difference between two tasks can significantly increase in the heterogeneous setup since tasks are from various meta-distributions.





This paper proposes a novel meta-learning method \method for the task heterogeneity challenge. Our approach extends MAML by modeling tasks as a stochastic variable that we call the ``stochastic task''. Stochastic tasks (STs) let us learn a distribution of solutions to capture the uncertainty of an ambiguous new task. At the same time, STs enable self-adaptive model initialization based on the current task. We use variational inference as a solver and the whole learning process is meta-distribution agnostic. 
We apply \method to a wide range of common meta-learning benchmarks including synthetic regression, image completion, and few-shot image classification, where \method exceeds the performance of existing work. We also build a large temperature prediction dataset that highlights the challenges of real-world meta-distributions. Our empirical results demonstrate that \method outperforms the MAML baselines by $40\%$ on this new task.



\section{Methods}
\begin{figure*}[ttt!]

\begin{tabular}{cc}
\begin{minipage}{.45\textwidth}
    
    \centering
    \includegraphics[width=\textwidth]{NewFigPGraph.pdf}
    \captionof{figure}{
        Probabilistic model overview of \method. The stochastic variable $\taskZ$ conditioned on task information $(\X_{\task}, \Y_{\task})$ is used for model's self-adaptation and input variable's re-encoding.  
    } \label{fig:pgraph}

\end{minipage} &

\scalebox{0.78}{
    \begin{minipage}{.66\textwidth}
    \begin{algorithm}[H]
    \captionof{algorithm}{\textsc{\method Meta-Training Procedure.}} \label{alg:train}
        \begin{algorithmic}[1]
        \STATE {\bfseries Input:} Meta-distribution set $\{P_1(\mathcal{T}), \cdots, P_k(\mathcal{T})\} $, Hyper-parameters $\gamma_1$ and $\gamma_2$.
    	\STATE Randomly initialize model parameter $\goal$, stochastic task module parameters $\vec{\phi}$, tailoring module parameters $\vec{w}$, input encoding parameters $\vbeta$.
    		\WHILE{not DONE}
        		\STATE Sample batches of $m$ tasks $\{\mathcal{T}\}$ from meta-distribution set.
        		\FOR{every task $\mathcal{T}$}
        		    \STATE Infer the posterior distribution of stochastic task variable $q(\taskZ|\task)$ and sample $\vz_{\task}\sim q(\taskZ|\task)$. [eq.\eqref{eq:Rtask} and eq.\eqref{eq:postz}]
		    \STATE Tailor $\goal$  with sample $\vz_{\task}$ to get task-specific initialization $\goali^0$. [eq.\eqref{eq:trans1}]
		    \STATE Revise the encoding of input variable by augmenting the raw input. [eq.\eqref{eq:trans2}]
        		    \STATE Evaluate the inner loss $\mathcal{L}_{in}(\task)$ on training set $\datatr$.  [eq.\eqref{eq:inner_loss}]
        		    \STATE Compute adapted parameter and augmented feature with gradient descent [eq.\eqref{eq:inner_update}]: \\ $\goali^1 = \goali^0 - \gamma_1 \nabla_{\goali^0}\mathcal{L}_{in}(\task)$,\ 
        		    $ \h_{\task}^{1} = \h_{\task}^{0} - \gamma_1\nabla_{\h_{\task}^0}\mathcal{L}_{in}(\task)$. 
        		\ENDFOR
        		\STATE Update $\goal, \vec{\phi}, \vec{w}, \vec{\beta}$ with $\gamma_2\dfrac{1}{m} \nabla_{[\goal, \vec{\phi}, \vec{w}, \vec{\beta}]}\sum_\task \mathcal{L}_{ELBO}(\task)$. [eq.\eqref{eq:LIB}]
        		
    		\ENDWHILE  
    \end{algorithmic}

    \end{algorithm}
    
    \end{minipage}
}

\end{tabular}

\end{figure*}


\subsection{Preliminaries on Meta Learning}

We describe a supervised learning task in meta-learning as 
\begin{align}
\task & = \{ \loss(), \model, \datatr, \datats \} \nonumber \\
& = \{ \loss(), \model, [\dataxitr, \datayitr], [\dataxite, \datayite]\},
\end{align}
Here $\loss()$, which takes as input a model $\model$ and a dataset,  describes the loss function that measures the quality of learner $\model$, whose parameter weight is $\goali$. Every task includes an annotated training set $\datatr = [\dataxitr, \datayitr]$ and a test set $\datats =[\dataxite, \datayite]$.  During meta-training, the test set $\datats$ is fully observed, but during meta-testing only its input $\dataxite$ is available. $\datatr$ and $\datats$ are sampled from $ \mathcal{\vec{X}}\times \mathcal{\vec{Y}}$, where $\mathcal{\vec{X}}$ is the input space and $\mathcal{\vec{Y}}$ is the output space.


The goal of meta learning is to train a learning machine which can perform well on $\datats$ after fine-tuning on this task's training set $\datatr$. The difficulty lies in finding a balance between underfitting to all tasks and overfitting to any particular task.
MAML \citep{finn2017model} achieves such a goal by learning a globally shared weight initialization  $\goal^*$ that is close to the optimal weight parameter of every task. We can write its training objective for getting the best initialization $\goal^*$ as:
\begin{align}
& \min_{\goal} \mathop{\mathbf{E}} \limits_{\task \sim P(\task) } [ \loss ( \vf_{\goali^{1}}, \datats) ] , \nonumber  \\
& \rm {where\quad }
 \goali^{1} = \goali^{0} - \alpha \nabla_{\goal} [ \loss  ( \vf_{\goali^{0}},  \datatr)],\nonumber \\
& \rm {and\quad }  \goali^0 = \goal.
\label{eq:maml_obj}
\end{align}
 

MAML samples a set of tasks $\{\task \}$ from the meta distribution $P(\task)$ and initializes each task's weight $\goali^{0}$ from the global knowledge $\goal$ (to be learnt), i.e., setting $\goali^{0} = \goal$. On each task, the learner performs gradient descent on its training set $\datatr$ to reach task-specific fine-tuned parameters $\goali^1$. The test set $\datats$ of task $\task$ is used for evaluating parameter $\goali^1$, and the evaluation will be used as the objective to optimize for learning the best global knowledge $\goal$.


In probabilistic language, the above objective (in \eref{eq:maml_obj}) can be equivalently framed as maximizing the likelihood:
\begin{align}
&\max_{\goal} \mathop{\prod} \limits_{\task \sim P(\task) } [ \likeli ]   =   \prod\limits_{\task\sim P(\task)}p(\datayite|\dataxite, \datatr, \goal) \label{eq:likeli} \\ &= \prod\limits_{\task\sim P(\task)}\sum_{\goali^1}p(\datayite | \dataxite, \goali^1)p(\goali^1 | \datatr, \goal),
\end{align}
where $p(\goali^1 | \datatr, \goal)$ is a Dirac distribution derived by minimizing the negative log-likelihood (NLL) on $\datatr$ with gradient descent.




\subsection{Previous Heterogeneous Meta Learning}



Task-homogeneous meta-learning assumes that there exists one meta-distribution $P(\task)$ and all tasks are identically and independently (i.i.d.) sampled from $P(\task)$. Differently, in a task-heterogeneous setup, there exist multiple meta-distributions $\task \sim \{P_1(\task), P_2(\task), \cdots, P_k(\task)\}$.

We can naively use MAML and assign all tasks with the same global initialization (though they come from different distributions). 
Figure~\ref{fig:abs_demostration}(a, b) shows that the ``task ambiguity'' issue is more critical in the task-heterogeneous setup and will hinder the generalization from MAML initialization since multiple very different task distributions exist. 


A handful of previous works learn a customized initialization that is tailored from the global initialization, in order to tackle the task heterogeneity challenge. MMAML~\citep{vuorio2019multimodal} learns a deterministic task embedding with an RNN module. %
HSML~\citep{yao2019hierarchically} manually designs a task clustering algorithm to assign tasks to different clusters, then customizes the global initialization to each cluster.  ARML~\citep{yao2020automated} models global knowledge and task-specific knowledge as graphs; the interaction between tasks is modeled by message passing. 



However, none of the recent works consider the task ambiguity issue when solving task-heterogeneous domains. Most frameworks are still based on the assumption that  only one distribution exists to explain a task's observed training set (e.g., a new task should be assigned to only one cluster in HSML). However, the source of a task can be highly uncertain based on limited annotated data. Figure \ref{fig:abs_demostration}(b) shows that there can be multiple explanations of an observed dataset in the task-heterogeneous setup and we should not expect to obtain a unique solution.













\subsection{Stochastic  $Z_{\task}$ to Encode Task}




When facing the task-heterogeneous setup, we hypothesize that a meta-learner that can encode potential tasks' patterns will help alleviate the task ambiguity issue. These patterns could describe valuable information about tasks, like the more likely shapes of curves for a regression meta-application. Moreover, we propose to enable task encoding with uncertainty estimates. This is because learning a task representation from its limited annotated data is challenging and such uncertainty measures can help inform the downstream meta-adaptation to new tasks (see Figure~\ref{fig:abs_demostration}(b)).  



This hypothesis motivates us to describe a task $\task$ with a stochastic variable $\taskZ$ and model its distribution to condition on observations. With this additional latent variable, we can rewrite the per task likelihood $\likeli$ in \eref{eq:likeli} as:  
\begin{align}
\likeli & = \sum_{\taskZ} p(\datayite|\dataxite, \datatr, \taskZ, \goal) p(\taskZ |\datatr).
\label{eq:likeliz}
\end{align}
We assume in the second term from above, $\taskZ$ only conditions on  $\datatr$. Figure~\ref{fig:pgraph} shows our design.  


In \sref{sec:update}, we show that the likelihood is intractable as defined above, and choose to maximize its evidence lower bound (a.k.a.\ ELBO) instead. Optimizing this variational objective requires the prior $p(\taskZ|\datatr)$ and the posterior $ q(\taskZ| \task)$. We model the prior $p(\taskZ|\datatr)$ as a Gaussian distribution, whose mean and variance are outputs from a multi-layer perceptron (MLP) module with input vector $\vr_\task$:
\begin{equation}
     p(\taskZ|\datatr) = \mathcal{N}(\vec{\mu}(\vr_{\task}), \vec{\sigma}(\vr_{\task})). \label{eq:zmu}
\end{equation}
Here vector $\vr_\task$ is a vector summarizing the encoding of a task $\task$. 
We propose a neural network module to learn  $\vr_\task$ from the sample observations $\datatr$.  The training observations of task $\task$ consist of unordered annotated data pairs $[(\x_{\task}^{tr}, \y_{\task}^{tr})]$. Permutation invariance is a desirable property for functions acting on sets. \citet{zaheer2017deep} showed any function acting on sets ${S}$ is permutation invariant if and only if it can be decomposed as $\rho(\sum_{\s\in S}\phi(\s))$ for suitable choice of transformations $\rho, \phi$. We follow such a design, and encode a task by encoding every pair of its observation in $\datatr$ through a neural network layer:
\begin{align}
&    \vr_{\task, j}  = \vg^{Enc}_{\vph}(\x_{\task, j}^{tr}, \y_{\task, j}^{tr}),\quad j = 1, \cdots, |\datatr|, \label{eq:encode} \\
&    \vr_\task  = \frac{1}{|\datatr|}\sum_{j=1}^{|\datatr|}\vr_{\task, j}.
    \label{eq:Rtask}
\end{align}
\eref{eq:Rtask} uses the average function as the aggregation operator to obtain the task embedding because it is able to remove the inductive bias due to different sizes of training set from $\vr_{\task}$. In \eref{eq:encode}, $\vg^{Enc}_{\vph}()$ is implemented as an MLP module with learnable parameter $\vph$.
      


We then approximate the intractable posterior distribution $q(\taskZ| \task)$ of $\taskZ$ as conditioned on the whole $\{\datatr, \datats\}$ (see Appendix~\ref{app:post_z}):
\begin{align}
     & q(\taskZ| \task) = q(\taskZ|\datatr, \datats) = \mathcal{N}(\vec{\mu}(\vr'_\task), \vec{\sigma}(\vr'_\task) ), \\  
     & \vr'_\task = \frac{1}{|\task|}\sum_{j=1}^{|\task|}\vr_{\task, j},\quad j = 1, \cdots, (|\datatr| + |\datats|),
     \label{eq:postz}
\end{align}
where $|\task| = |\datatr| + |\datats|$, and $\vec{\mu}(\cdot)$ and $\vec{\sigma}(\cdot)$ are the same MLP modules we have in \eref{eq:zmu}. 





\subsection{\method: Self Adaptation  with $Z_{\task}$}
\label{sec:self-adaptation}
We propose to revise MAML for the heterogeneous meta-learning setup using the summary task representation $\taskZ$, creating \method.
 $\taskZ$ helps tailor the global initialization $\goal$ to task-specific initialization $\goali^0$ for a task $\task$. Its basic motivation is to improve flexibility by incorporating task information into the model.
 This self-adaptation design is motivated by recent ideas that design self-adaptation conditioned on global knowledge to overcome the distribution shift issue in domain generalization/adaptation~\citep{zhang2021adaptive, dubey2021adaptive, xiao2021learning,vuorio2019multimodal}.   










There exist many potential ways to use $\taskZ$ to tailor the global initialization $\goal$ to task-specific initialization $\goali^0$. We assume our target learning machine is a composition of a base learner and a task learner: \[\model = \vf_{\goal_c} ( \vf_{\goal_b}).\] Here the base learner's parameters are $\goal_b$, and the task learner's parameters are $\goal_c$. For example, in an image classification domain the base learner would be the CNN backbone and the task learner would be the last linear layer. We can then rewrite $\goal=[\goal_b, \goal_c]$. We propose to only customize $\goal_c$ with $\taskZ$:
\begin{align}
    \goali^0= \vg^{Gate}_{\vw}(\goal, \taskZ) &=  [\goal_b, \sigma(\vw_1\vz_{\task} + \vw_0)\odot\goal_c ]
    \\
    &= [\goal_b, \sigma(\vw_{gate})\odot\goal_c ].
\label{eq:trans1}
\end{align}
Here $\vec{z}_{\task}$ is sampled from the distribution $q(\taskZ|\task)$ during meta-training and from $p(\taskZ|\datatr)$ during meta-testing. $\sigma$ is the sigmoid function,  $\odot$ represents the element-wise multiplication, and $\vw=[\vw_1, \vw_0]^T$ are learnable parameters. $\vw_{gate}$, the gate vector, applies element-wise scaling to steer the global initialization $\goal$ toward the task-specific initialization $\goali^0$.



Moreover, we design additional customized knowledge for task $\task$. The basic intuition is that the final prediction of a meta-learner depends on both model parameters and input representations. To increase the capacity of the task-specific knowledge, we propose to further propagate task representation $\taskZ$ into encoding augmented feature representations we denote as $\h_{\task}$. We concatenate $\h_{\task}$ with a sample's input representation $\x_{\task}$, and feed the combined vector $\hat{\x}_{\task}$ to our learning machine as its new input. 
\begin{equation}
    \h_{\task}^0 = \vg^{In}_{\vbeta}(\taskZ) = \vec{\beta}_1\vz_{\task} + \vec{\beta}_0,\quad \hat{\x}_{\task} = [\x_{\task}, \h_{\task}^0].
    \label{eq:trans2}
\end{equation}
As in \eref{eq:trans1}, $\vz_{\task}$ is sampled from its distribution, and $\vbeta = [\vbeta_1, \vbeta_0]$ are learnable parameters. 



Now when facing a new task $\task$, a meta-model will first generate the task-specific knowledge that includes both augmented feature $\h_{\task}$ and task-specific parameter $\goali$. We denote the combined knowledge set for task $\task$ as: 
\begin{equation}
\biggoal=[\goali, \h_{\task}]. \label{eq:biggoal}
\end{equation}
This is the meta-knowledge we need to learn in \method. We note its initial values as $\biggoal^0 =[\goali^0, \h_{\task}^0]$ and fine-tuned values as $\biggoal^1=[\goali^1, \h_{\task}^1]$.
















Aiming to learn the meta knowledge defined in \eref{eq:biggoal}, we can rewrite our objective (task likelihood) in \eref{eq:likeliz}: 
\begin{multline}
\begin{split}
\likeli  = \sum_{\biggoal^0, \biggoal^1, \taskZ} p(\datayite|\dataxite, \biggoal^1)p(\biggoal^1|\biggoal^0, \datatr) \\
p(\goali^0|\goal, \taskZ) p(\h_{\task}^0|\taskZ)p(\taskZ|\datatr).
\end{split}
\label{eq:pdist}
\end{multline}
This follows the Bayesian graph provided in Figure~\ref{fig:pgraph}.


\paragraph{Design Choices: } There exist many other possible probabilistic designs besides Figure~\ref{fig:pgraph}. For instance, we can model every variable in the figure as a stochastic distribution and build a complicated hybrid framework. However, this will lead to excessive stochasticity and increase the potential of underfitting in a limited data situation. Instead, similar to $p(\biggoal^1|\biggoal^0, \datatr)$, we choose to model both $p(\h_{\task}^0|\taskZ)$ and $p(\goali^0|\goal, \taskZ)$ as deterministic (see \eref{eq:trans1} and \eref{eq:trans2}), which allows us to employ an amortized variational inference technique~\citep{ravi2019amortized}. %




Our design is different from recent probabilistic extensions of MAML~\citep{finn2018probabilistic, yoon2018bayesian}. They conduct inference on model parameters $\goali$ (initial value $\goali^0$ or fine-tuned value $\goali^1$). Our \method shifts the burden of variational inference to the task representation $\taskZ$, whose dimension is multiple orders of magnitude smaller than the size of model parameters.  








\subsection{\method: Update Rules}
\label{sec:update}

\begin{figure*}[t]
  \centering
  \includegraphics[width=0.95\textwidth]{NewFigOpt2.pdf}
  \caption{Iterative optimization process.  \label{fig:opt} In the inner loop, Starting from task-specific parameter initialization $\goali^0$ and augmented features $\h_{\task}^0$, their fine-tuned values $\goali^K, \h_\task^K$ are inferred by performing gradient descent on the training set $\datatr$ for $K$ iterations. }
  
  
\end{figure*}





\paragraph{Variational Objective:} To optimize the intractable likelihood as defined in \eref{eq:pdist}, we choose to maximize its evidence lower bound (a.k.a.\ ELBO) instead:
 \begin{multline}
    \mathcal{L}_{ELBO}(\task) = \mathbf{E}_{\biggoal^1\sim q(\biggoal^1|\task)} \log p(\datayite|\dataxite, \biggoal^1)
    \\ - KL(q(\taskZ|\task)\,\|\,p(\taskZ|\datatr)).
\label{eq:LIB}
\end{multline}
During meta-training, we sample $m$ tasks and optimize the empirical average $\dfrac{1}{m}\sum\limits_{t=1}^m \mathcal{L}_{ELBO}(\task_t)$. 




\paragraph{Update Rules:} Much like MAML, the optimization of \method consists of two nested loops. Figure~\ref{fig:opt} shows the iterative optimization process. In the inner loop, for the $j$-th training example, we concatenate $\x_{\task, j}^{tr}$ with augmented feature $\h_{\task}^0$ to get the augmented input vector $\hat{\x}_{\task, j}^{tr}$. We
feed $\hat{\x}_{\task, j}^{tr}$ into the learning machine $\modelnop$ whose parameter is $\goali^0$ to calculate the inner loss:
\begin{equation}
    \mathcal{L}_{in}(\task) = \frac{1}{|\datatr|}\sum_{j=1}^{|\datatr|}\mathcal{L}(\vf_{\goali^0}, [\hat{\x}_{\task, j}^{tr}, \y_{\task, j}^{tr}]).
    \label{eq:inner_loss}
\end{equation}




The inner loss is then used for updating $\goali^0$ and $\h_{\task}^0$ with the step size $\gamma_1$ (as in Algorithm~\ref{alg:train}):
\begin{equation}
    \h_{\task}^{1} = \h_{\task}^{0} - \gamma_1\frac{\partial \mathcal{L}_{in}(\task)}{\partial \h_{\task}^{0}},\quad  \goali^{1} = \goali^{0} - \gamma_1\frac{\partial \mathcal{L}_{in}(\task)}{\partial \goali^{0}}.
    \label{eq:inner_update}
\end{equation}

Figure~\ref{fig:opt} shows we can optimize the inner loss for $K$ iterations to achieve a closer approximation for optimal values in \eref{eq:inner_loss}. In the outer loop, we maximize the approximated ELBO $\mathcal{L}_{ELBO}$ in \eref{eq:LIB} using a batch of $m$ tasks. The amortized variational technique allows us to conduct the sampling from $q(\biggoal^1|\task)$ by first sampling from $q(\taskZ|\task)$ and then applying a deterministic transformation using \eref{eq:trans1} and \eref{eq:trans2}.



\paragraph{Algorithm of \method: }
We describe the procedure of \method in the form of pseudo code as shown in Algorithm~\ref{alg:train}. Note, parameters of neural functions $\vec{\mu}(\cdot)$, $\vec{\sigma}(\cdot)$, $\vg^{Enc}_{\vph}()$, $\vg^{Gate}_{\vw}()$, and $\vg^{In}_{\vbeta}()$ are updated in the outer loop.  




\paragraph{Theoretical Analysis of \method: } We also provide a second interpretation of our objective from an information bottleneck perspective and prove they lead to exactly the same target. See Appendix~\ref{app:derivation} for detailed proofs. 






\subsection{Connecting to Related Work}
\label{related_work}

Optimization-based meta-learning methods facilitate the model's adaptation to new tasks through global knowledge learned by the optimization process. Meta-LSTM~\citep{Ravi2017OptimizationAA} meta-learns the update rule with an RNN meta-learner. MAML~\citep{finn2017model} trains a global initialization close to the optimal value of every task. Leveraging diverse meta-knowledge further accelerates the learning process. In Meta-SGD~\citep{li2017meta}, the meta-knowledge consists of both the initialization and the learning rate. ALFA~\citep{baik2020meta} proposes to meta-learn both the initialization and a hyperparameter update module.
Most methods assign the same global knowledge to every task, which leads to sub-optimal solutions in heterogeneous settings. Besides, they are all deterministic and can only learn one solution for a new task.



Bayesian approaches are a long-standing discipline that incorporates uncertainty in modeling. Multiple recent works extend MAML into the Bayesian framework and recast meta-learning in a probabilistic framework~\citep{finn2018probabilistic, grant2018recasting, yoon2018bayesian, ravi2019amortized, garnelo2018neural}. BMAML~\citep{yoon2018bayesian} recasts MAML into a probabilistic framework and provides a Bayesian explanation of MAML. PLATIPUS~\citep{finn2018probabilistic} builds upon amortized variational inference and injects Gaussian noise into the gradient during the meta-testing time to learn a distribution over model parameters. LLAMA~\citep{grant2018recasting} applies Laplace approximation for modeling the parameter distribution, but it requires the approximation of a high dimensional covariance matrix. These methods view model parameters (i.e., network weights and biases) as random variables and perform inference on them. This leads to significant challenges when working with complicated models and high-dimensional data.

Our work also loosely connects to ``prototype meta-learning''~\citep{triantafillou2019meta, snell2017prototypical}. These studies learn a prototype for every class we need to predict and the final prediction depends on the distances between instances and prototypes. Amortized Bayesian prototype meta-learning~\citep{sun2021amortized} assumes a distribution over class prototypes. This design requires prior knowledge about the classes of tasks and only applies to the classification homogeneous-meta setup.

Another line of related work studies neural approximators of the stochastic process family~\citep{garnelo2018neural, wang2020doubly,louizos2019functional, kim2018attentive}. They learn a prior for every task or further use a hierarchical model that learns the instance prior. However, these methods do not share knowledge across tasks. Table~\ref{table:model_comparison} compares related lines of work with ours.


 


\begin{table}[h]
\caption{A summary of datasets and tasks.}
\centering
\resizebox{\columnwidth}{!}{%
\begin{tabular}{c|c|c| p{22mm}}
\toprule
Problems & Tasks & Cardinality & $|\datatr|\to |\datats|$ \\\midrule
\multirow{3}{*}{\shortstack{Regression}} & 
 2D regression & $k=6$ & $ 10 \to 40$  \\
 &Weather prediction & $k> 9000$ &  $10 \to 100$\\
&Image completion &  $k=3$ & $40 \to 784$
\\\cmidrule{1-4}
\multirow{3}{*}{\shortstack{Classification}}& PlainMulti classification & $k=4$ &  5way 5shot\\
& CelebA binary classification   & \multirow{2}*{$k=1$} & \multirow{2}*{2way 5shot} \\
& (see Appendix~\ref{app:exp_binary}) & & \\
\bottomrule
\end{tabular}
}
\label{table:exp_summary}
\end{table}


\begin{table*}[h!]
\centering
\caption{Prediction error with 95\% confidence interval on 2D regression tasks.}
\resizebox{\linewidth}{!}{
\begin{tabular}{c|cccccccc}
\hline
Model & MAML       & MetaSGD   & BMAML      & MMAML      & HSMAML       & \method  &\method w/o aug  &  \method w/o tailor \\ \hline
MSE   & $2.29\pm 0.16$ & $2.91\pm 0.23$ & $1.65\pm 0.10$ & $0.52\pm 0.04$ & $0.44\pm 0.03$ &  $\mathbf{0.37\pm 0.04}$    & $0.44\pm 0.05$  & $0.41\pm 0.06$ \\ \hline
\end{tabular}}
\label{table:2Dregression}
\end{table*}


\begin{figure*}[h]
  \centering 
      \includegraphics[width=1.0\linewidth]{2D_various.pdf}
\caption{\label{fig:2DReg_2_5shot}Few-shot 2D regression with various numbers of training data points and noise levels. (a) $|\datatr|=2, \sigma=0.3$, (b) $|\datatr|=5, \sigma=0.3$, (c) $|\datatr|=10, \sigma=0.8$, (d) $|\datatr|=10, \sigma=0.1$. Black stars represent training data, dashed lines characterize different sampled models, and the blue curve is the true mapping. Solutions sampled from \method span a wide range and stay faithful around annotated data.}
\end{figure*}

\section{Experiments}

Our experiments are designed to answer the following: 

\medskip \textbf{Q1.} Does \method successfully meta-learn from heterogeneous tasks across a variety of applications?

\textbf{Q2.} How does \method perform when we have more or less task ambiguity?  


\textbf{Q3.} How does \method  compare to previous heterogeneous meta approaches in terms of accuracy and adaptation?

\textbf{Q4.} How does \method  perform when applied to a challenging real-world dataset?


To answer \textbf{Q1}, we select a wide range of applications in our experiments. We provide a summary of our experimental datasets and their properties in Table~\ref{table:exp_summary}.




We compare against several baselines representing four meta-learning groups: (1) Meta-learning methods designed for homogeneous tasks: MAML~\citep{finn2017model} and MetaSGD~\citep{li2017meta}. (2) Meta-learning methods designed for heterogeneous tasks, including MMAML~\citep{vuorio2019multimodal} and HSMAML~\citep{yao2019hierarchically}.
(3) Bayesian meta-learning methods: Bayesian MAML \citep{yoon2018bayesian}, which recasts MAML into the Bayesian framework. %
(4) Neural processes (NPs) methods \citep{garnelo2018conditional, garnelo2018neural}. NPs learn a distribution over solutions and are regarded as state-of-the-art methods for small scale meta-learning regression applications.











\begin{table*}[th!]
\caption{5-way 5-shot classification accuracy with 95\% confidence interval on Plain-Multi dataset.}
\begin{center}
\begin{tabular}{l|l|cccc}
\toprule
Settings & Algorithms & Data: Bird & Data: Texture & Data: Aircraft & Data: Fungi \\\midrule
\multirow{7}{*}{\shortstack{5-way\\5-shot}} & 
 MAML & $68.52\pm0.79\%$ & $44.56\pm0.68\%$ & $66.18\pm 0.71\%$ & $51.85\pm0.85\%$ \\
&MetaSGD & $67.87\pm0.74\%$ & $45.49\pm0.68\%$ & $66.84\pm0.70\%$ & $52.51\pm0.81\%$ \\\cmidrule{2-6}
& BMAML & $69.01\pm 0.74\%$ & $46.06\pm 0.69\%$ & $65.74\pm 0.67\%$ & $52.43\pm 0.84\%$ \\
& MMAML & $70.49\pm0.76\%$ & $45.89\pm0.69\%$ & $67.31\pm0.68\%$ & $53.96\pm0.82\%$ \\
&  HSMAML & $\mathbf{71.68\pm 0.73\%}$ & $\mathbf{48.08\pm 0.69\%}$ & $\mathbf{73.49\pm 0.68\%}$ & $\mathbf{56.32\pm 0.80\%}$ \\
\cmidrule{2-6}
& \method  & $\mathbf{72.49 \pm 0.53\%}$  & $46.51 \pm 0.42\%$ & $\mathbf{72.64 \pm 0.44\%}$ & $\mathbf{55.29 \pm 0.57\%}$ \\
& \method(w/o aug) & $71.49 \pm 0.55\%$  & $\mathbf{47.17 \pm 0.44\%}$ & $71.62 \pm 0.43\%$ & $54.91 \pm 0.56\%$ \\
& \method(w/o tailor) & $71.48 \pm 0.55\%$ & $46.07 \pm 0.40\%$ & $70.46 \pm 0.44\% $& $54.59 \pm 0.56\%$\\
\bottomrule
\end{tabular}
\end{center}
\label{tab:plainmulti_res}
\end{table*}













































\subsection{2D Regression: Simulated Studies}
To answer \textbf{Q2}, we generate synthetic heterogeneous regression tasks that come from  multiple functional families of curves. We  use probabilistic meta-learning models to sample and visualize multiple solutions.



\paragraph{Setup.} We follow a setup similar to that of \citet{yao2020automated} to generate 2D regression tasks. The meta-distribution set $\{P_k(\task)\}$ consists of 6 function families including \textit{sinusoids, straight line, quadratic, cubic, quadratic surface}, and \textit{ripple} functions. We perturb the output by adding Gaussian noise with standard deviation 0.3 in meta-train tasks. During meta-training, every task is uniformly randomly sampled from one of the $6$ function families, and the size of the training set $|\datatr|=10$. We adopt mean squared error (MSE) to measure prediction accuracy. A detailed description of the setup and model architecture is available in Appendix~\ref{app:exp_setup}.


\paragraph{Results, ablations, and analysis.}  We train models on around $10{,}000$ tasks and evaluate them on over $1{,}000$ newly sampled tasks. The results are summarized in Table~\ref{table:2Dregression}. We can
see clearly that \method outperforms the baselines.
To better investigate the contribution of each component, we perform ablation experiments by either removing model tailoring or input variable augmentation. Table~\ref{table:2Dregression} shows that both types of task-specific knowledge provide important contributions  and their combination gives the best performance. 


We visualize example curve fits in Figure~\ref{fig:2DReg_2_5shot} and Figure~\ref{fig:2DReg_10shot}. During meta-testing, we can decrease the size of the training set or increase the noise level such that task ambiguity becomes more of a concern. In Figure~\ref{fig:2DReg_10shot}, all sampled solutions are close to the ground-truth since tasks are less uncertain. In contrast, Figure~\ref{fig:2DReg_2_5shot} shows that as tasks become more ambiguous, the solutions sampled by \method tend to span a wider range. However, they stay faithful around the annotated training data. More analysis and visualization results can be found in Appendix~\ref{app:exp_setup}.




\subsection{Heterogeneous Few-shot Classification}

To answer \textbf{Q3}, we apply \method to two common few-shot classification benchmarks from the literature. Due to space limits, results on the CelebA data are deferred to Appendix~\ref{app:exp_binary}.

\paragraph{Setup} N-way K-shot classification is a popular setup in few-shot meta-learning~\citep{chen2019closer, ren2018meta, vinyals2016matching}. The training set of every task consists of $N$ classes with $K$ labeled data in each class. 
In the benchmark Plain-Multi dataset, each meta-task is sampled from one of four diverse datasets~\citep{yao2019hierarchically}. We follow the benchmark architecture, including a feature learner using four convolutional blocks. Our ST module feeds the input $\vec{x}$ into two convolutional blocks with 6 channels, and then concatenates the output vector with the target variable and passes the result to a two-layer MLP to model the mean and variance of $\taskZ$.





\paragraph{Results, ablations, and analysis.} After training on over $50{,}000$ total tasks, the model is evaluated on $1{,}000$ tasks for each dataset and the results are summarized in Table~\ref{tab:plainmulti_res}. The most relevant method is MMAML. It learns a deterministic task embedding with an RNN module and encodes all parameters in both base learner $\vf_{\goal_b}$ and task learner $\vf_{\goal_c}$. Our method outperforms it on every dataset. Also, the probabilistic framework enables us to achieve consistently low variance. Note that HSMAML uses the prior knowledge about the number of clusters, which plays an important role with respect to the final accuracy. Our \method does not rely upon such a prior and achieves lower variance than, and similar performance to, HSMAML. We again run two ablated versions of the proposed \method, and compare them against the full version. The combination of input augmentation and model tailoring yields the best results and is most capable of confronting task-heterogeneity.




\subsection{Real-World Temperature Prediction}

Now we answer \textbf{Q4} by applying \method to a challenging regression problem using real-world data.

\begin{table*}[h!]
\caption{10-Shot temperature prediction. Mean square losses are averaged across over 1,000 sampled test tasks.}
\centering
\begin{tabular}{c|ccccc}
\hline
Model & MAML   & MetaSGD & \method  & \method(w/o aug) & \method(w/o tailor) \\ \hline
MSE   & $141.43 \pm 9.33$ & $291.42 \pm 14.89$ &  $\mathbf{86.56 \pm 4.89}$  &   $100.27 \pm 5.87$ & $106.37 \pm 5.77 $    \\\hline
\end{tabular}
\label{table:weather_reg}
\end{table*}

\paragraph{Setup.} The NOAA Global Surface Summary of the Day (GSOD) dataset contains daily weather data from thousands of stations around the world. Each task is created by sampling data points from (station, year) pairs. Each sample consists of one date of the year along with $15$ weather features such as wind speed, station elevation, precipitation, fog, air pressure, etc., for that date. The model then learns to predict the average temperature in Fahrenheit on that day. We remove important information like the weather station number, name, latitude, and longitude. Hiding the station information in this way creates a highly heterogeneous problem where each station generates its own task distribution. The model sees $10$ days of labeled temperature data before predicting the temperature on $100$ test days. More technical details can be found in Appendix~\ref{app:exp_setup}.




\paragraph{Results and analysis.} After $100$ epochs of training on approximately $42{,}000$ unique (station, year) tasks, we evaluate the model on a test set of $1{,}000$ (station, year) pairs. The results are summarized in Table~\ref{table:weather_reg}. \method achieves approximately $40\%$ lower MSE than MAML. MetaSGD, designed for homogeneous meta-learning, achieves low accuracy because the globally learned knowledge hurts the model's generalization to unseen tasks from different distributions. This is consistent with our assumption that incorporating task-specific knowledge into the model can help solve the task-heterogeneous challenge. We also perform ablation experiments in Table~\ref{table:weather_reg}. Both tailored initialization and augmented features outperform the baselines, and they combine for further improvement. Figure~\ref{fig:temp_preds} provides a visualization of trained \method on the NOAA-GSOD temperature prediction task. %
    




\subsection{Heterogeneous Image Completion}


While we have already demonstrated \method on regression and classification tasks, we continue to answer \textbf{Q1} by expanding to image completion, which is a popular small scale meta-learning task.

\paragraph{Setup.} In our heterogeneous image completion application, the meta distribution set $\{P_k(\task)\} = \{\text{MNIST}, \text{FMNIST}, \text{KMNIST}\}$. Every task contains one image of size $28\times 28$ sampled randomly from one of the three dataset distributions. In meta-training, $40$ pixels are observed for every image, thus, $|\datatr|=40$. We use coordinates as inputs and pixel value as the target variable. Each image completion can be interpreted as a meta-learning task which generalizes the knowledge from a limited training set $|\datatr|=40$ to the entire image of size $|\datats|=784$. Architecture details can be found in Appendix~\ref{app:exp_setup}.
\begin{figure*}[t]
    \centering 
    \includegraphics[width=0.9\linewidth]{FigCompletion2.pdf}
\caption{Visualization of completed images. The first column contains the original images; the second column shows the observations, which contain $8$ annotated pixels (left) and $40$ annotated pixels (right). The unobserved pixels have been colored blue for better clarity. The remaining columns correspond to $4$ different sampled solutions (completed images) given the observations.}
\label{fig:ImgCompletion}
\end{figure*}

\begin{figure*}[h!]
    \centering
    \begin{minipage}{.28\textwidth}
        \centering
         \includegraphics[width=\linewidth]{tsne1.pdf}
        \label{fig:t-SNE-1}
    \end{minipage}%
    \begin{minipage}{0.28\textwidth}
        \centering
 \includegraphics[width=\linewidth]{tsne2.pdf}
        \label{fig:t-SNE-2}
    \end{minipage}
    \hspace{3mm}
     \begin{minipage}{.2\textwidth}
    \caption{\label{fig:t-SNE} { t-SNE plots of gate vectors for tasks randomly sampled from the meta-distributions of synthetic regression (left) and image completion (right)}. Best viewed in color.}
       \end{minipage}
\end{figure*}
\paragraph{Baselines, results and analysis.}  The described setup is a benchmark task for Neural processes~\citep{garnelo2018conditional,garnelo2018neural}. Thus, we compare our proposed \method with neural processes (NP) \citep{garnelo2018neural} and conditional neural processes (CNP) \citep{garnelo2018conditional} which is a deterministic NP. The numerical comparison is shown in Table~\ref{table:img_comp}. \method achieves higher completion precision compared with NP and CNP. We leave out the variance for all methods because the difference is insignificant.


 Image completion tasks can be highly ambiguous, because there exist multiple full image choices that could explain the pattern of a handful of observed pixels, especially for gray images. Uncertainty arises on three levels: the inter-class level, inter-distribution level, and cross-distribution level. \method can capture more potential truths by learning a distribution of possibilities rather than a unique mapping. We visualize observations and their completions in Figure~\ref{fig:ImgCompletion}. Interestingly, when we compare the two half-rows describing image completion for a button-up shirt image, the half-row with more pixels observed during meta-testing corresponds to a less ambiguous task. Therefore, its completed images are closer to the original image. This reflects one merit of \method: its set operations allow \method to learn from any size of the training set during meta-testing.

\begin{table}[h!]
     \centering
\caption{Image completion accuracy. Binary cross entropy values are averaged across 300 test tasks.   \label{table:img_comp}}
{%
\begin{tabular}{c|cccc}
\hline
Model & NP    & CNP   & \method (deter) & \method  \\ \hline
BCE   & $0.302$ & $0.358  $ & $0.272 $           & $\mathbf{0.268}$    \\ \hline
\end{tabular}
}
\end{table}


\subsection{Visualization of gate vectors $\mathbf{w}_{gate}$}


As noted in \sref{sec:self-adaptation}, the gate vector $\vw_{gate}$ (Eq.~\ref{eq:trans1}), which originates from the stochastic task variable $\taskZ$, translates global initialization $\goal$ to task-specific initialization $\goali^0$. Thus, we hypothesize that patterns of gate vectors contain information about the relationships between similar tasks. To gain insights into the tasks' gate vectors $\vw_{gate}$, we visualize sampled vectors on two applications: 2D regression and image completion. For both applications, we sample 200 tasks from each $P_k(\task)$, and
visualize their gate vectors $\vw_{gate}$ using a t-SNE plot~\citep{van2008visualizing}. The visualizations are shown in Figure~\ref{fig:t-SNE}. Gate vectors of tasks from the same distribution (shown as points of the same color) are clustered on t-SNE plots while tasks from very distinct distributions are farther apart. For instance, in Figure~\ref{fig:t-SNE} left, sinusoidal regression tasks (blue) sit far away from ripple surface tasks (brown). These observations can be seen as evidence of the task identification capability of \method. Furthermore,
tasks from similar distributions may be entangled (Figure~\ref{fig:t-SNE} left, linear, quadratic, and cubic regression tasks). The uncertain identity of similar tasks justifies the representation of task information as stochastic variables.





\section{Conclusion}
Task heterogeneity is one critical challenge in meta-learning. Most meta-learning methods assign the same initialization to every task and fail to handle task heterogeneity.  \method encodes tasks using a stochastic task module with set-based operations for permutation-invariance. The probabilistic framework allows us to learn a distribution of solutions for ambiguous tasks and recover better potential task identities. This stochastic task design allows for customizing global knowledge with a learned stochastic task distribution.   Empirically, we design extensive experiments on various applications and show that \method provides an effective way to learn from diverse and ambiguous tasks. As a next step, we plan to add domain generalization during meta-testing to enhance our work.




\FloatBarrier






\bibliography{wang_312.bib}


\end{document}