\documentclass[accepted]{uai2022} %

\usepackage[american]{babel}
\graphicspath{{figures/}{../figures/}{figs/}{../figs/}}
\usepackage{fancyhdr}
\usepackage{color}
\usepackage{url}
\usepackage{multirow}
\usepackage{graphicx} %
\usepackage{times}
\usepackage{latexsym}
\usepackage{multicol}
\usepackage{multirow}

\usepackage{placeins}


\usepackage{tabularx} %
\usepackage{amssymb}%
\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{pifont}%


\usepackage{ragged2e}


\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%

\usepackage{hyperref}
\usepackage{url}
\usepackage{dirtytalk}



% Supplementary-material numbering: sections, tables, and figures are
% printed as S1, S2, ... (the letter "S" followed by the arabic counter value).
\renewcommand{\thesection}{S\arabic{section}}
\renewcommand{\thetable}{S\arabic{table}}
\renewcommand{\thefigure}{S\arabic{figure}}



% Cross-reference helpers: each takes a label and prints the object's name
% followed by the referenced number in parentheses, e.g. \eref{eq:IB} -> "Eq. (n)".
% The tie (~) keeps the name and the number on the same line.
% NOTE(review): \cref is also the name of cleveref's main command; this
% definition would clash if cleveref were ever loaded -- confirm before adding it.
\newcommand{\sref}[1]{Section~(\ref{#1})} 
\newcommand{\eref}[1]{Eq.~(\ref{#1})} 
\newcommand{\rref}[1]{Theorem~(\ref{#1})} 
\newcommand{\iref}[1]{Inequality~(\ref{#1})} 
\newcommand{\lref}[1]{Lemma~(\ref{#1})} 
\newcommand{\cref}[1]{Condition~(\ref{#1})} 
\newcommand{\dref}[1]{Definition~(\ref{#1})} 
\newcommand{\coref}[1]{Corollary~(\ref{#1})}


% Bold (\mathbf) single-letter shorthands for use in math mode.
% Each macro is defined exactly once here; the original list repeated the
% definitions of \x, \z, \y, \s, \h, \p and \W a second time with identical
% bodies, which was redundant.
% The outer pair of braces is kept on purpose so that a macro can be used
% directly as a sub/superscript (e.g. a_\x) without extra grouping.
% \def (rather than \newcommand) is retained so that any pre-existing
% definition of these short names is silently overridden, as before.
% Lowercase letters:
\def\e{{\mathbf e}}
\def\x{{\mathbf x}}
\def\z{{\mathbf z}}
\def\y{{\mathbf y}}
\def\s{{\mathbf s}}
\def\h{{\mathbf h}}
\def\p{{\mathbf p}}
% Uppercase letters:
\def\X{{\mathbf X}}
\def\bZ{{\mathbf Z}}
\def\Y{{\mathbf Y}}
\def\W{{\mathbf W}}
\def\Ib{{\mathbf I}}
\def\Cb{{\mathbf C}}




\usepackage{amsfonts,amssymb,bm}
\usepackage{pifont} %
\usepackage{graphicx,subfigure,epsfig,fancybox} %
\usepackage{float}
\usepackage{color} %
\usepackage{multirow}
\usepackage{natbib}

% ---------------------------------------------------------------------------
% General-purpose text and math shorthands.
% ---------------------------------------------------------------------------
% \mycite{text}: wraps the argument in parentheses after switching to \small.
\newcommand{\mycite}[1]{\small{(#1)}}
% NOTE(review): \revise and \revised accept one argument but never use #1 --
% the body typesets an empty blue group, so the argument is discarded.
% Confirm this is intentional (e.g. to hide revision marks in the final version).
\newcommand{\revise}[1]{\textcolor{blue}{}}
\newcommand{\revised}[1]{\textcolor{blue}{}}
% \vec is redefined from the default arrow accent to bold (\boldsymbol).
\renewcommand{\vec}[1]{\boldsymbol{#1}}
% Angle-bracket, matrix, transpose, inverse and set notation.
\newcommand{\eq}[1]{\left\langle #1 \right\rangle}
\newcommand{\mat}[1]{\mathbf{#1}}
\newcommand{\trans}[1]{#1^{\textsf{T}}}
\newcommand{\Count}[1]{\text{count}(#1)}
\newcommand{\inv}[1]{(#1)^{-1}}
\newcommand{\Inv}[1]{\left(#1\right)^{-1}}
\newcommand{\set}[1]{\mathcal{#1}}
\newcommand{\diag}[1]{\text{diag}\left(#1\right)}
% Small-caps annotations and related text helpers.
\newcommand{\annotation}[1]{\textsc{#1}}
\newcommand{\annot}[1]{\annotation{#1}}
% NOTE(review): \tagword (and \fstransx below) call \example, which is not
% defined anywhere in the visible source; using them would fail unless
% \example is provided elsewhere -- TODO confirm.
\newcommand{\tagword}[2]{\example{#1}/\annot{#2}}
\newcommand{\phon}[1]{\textsf{#1}}
\newcommand{\expect}[1]{E[{#1}]}
% \dd{f}{x}: partial derivative of #1 with respect to #2.
\newcommand{\dd}[2]{\frac{\partial #1}{\partial #2}}
% NOTE: this overrides the LaTeX kernel command \symbol; after this point
% \symbol{x} prints its argument between angle brackets instead.
\renewcommand{\symbol}[1]{$\langle$#1$\rangle$}
\newcommand{\sentstart}[0]{\symbol{s}}
\newcommand{\sentend}[0]{\symbol{/s}}
\newcommand{\tuple}[1]{\langle #1 \rangle}
% Coloured inline markers.
\newcommand{\cue}[1]{[\textcolor{blue}{#1}]}
\newcommand{\question}[1]{\textcolor{red}{#1}}
% Labelled arrows: #1 is set above the arrow, #2 below it.
\newcommand{\fstransx}[2]{\xrightarrow[\example{#2}]{\example{#1}}}
\newcommand{\fstrans}[2]{\xrightarrow[{#2}]{#1}}
\newcommand{\gloss}[1]{{\fontfamily{phv}\selectfont #1}}
\newcommand{\newadd}[1]{\textcolor{magenta}{#1}}

% Math symbols.
\newcommand{\from}[0]{\leftarrow}
\newcommand{\reals}[0]{\mathbb{R}}
\newcommand{\normal}[1]{\mathcal{N}\left(#1\right)}
% \cpic[width]{file}: centred graphic; the optional width defaults to .95\textwidth.
\newcommand{\cpic}[2][.95\textwidth]{\begin{center}\includegraphics[width=#1]{#2}\end{center}}
\newcommand{\bigo}[1]{\mathcal{O}(#1)}
\newcommand{\Pmult}[0]{P_{\text{multinomial}}}
\newcommand{\Pcat}[0]{P_{\text{categorical}}}
% Bold-letter shorthands built on the redefined \vec (and \trans for the
% transposed variants \vwt and \vtht).
\newcommand{\vw}[0]{\vec{w}}
\newcommand{\vwt}[0]{\trans{\vec{w}}}
\newcommand{\vd}[0]{\vec{d}}
\newcommand{\vf}[0]{\vec{f}}
\newcommand{\vg}[0]{\vec{g}}
\newcommand{\vx}[0]{\vec{x}}
\newcommand{\vz}[0]{\vec{z}}
\newcommand{\vy}[0]{\vec{y}}
\newcommand{\vs}[0]{\vec{s}}
\newcommand{\vu}[0]{\vec{u}}
\newcommand{\vth}[0]{\vec{\theta}}
\newcommand{\vtht}[0]{\trans{\vec{\theta}}}
\newcommand{\vph}[0]{\vec{\phi}}
\newcommand{\jm}[0]{J\&M}
\newcommand{\dkl}[0]{\mathcal{D}_{KL}}
\newcommand{\Z}[0]{\mathcal{Z}}

\newcommand{\vr}[0]{\vec{r}}
\newcommand{\vbeta}[0]{\vec{\beta}}


% Theorem-like environments. Each is declared without a shared-counter
% argument, so theorem, lemma and proof are numbered independently.
% NOTE(review): "proof" is declared as a numbered \newtheorem environment;
% this would clash with amsthm's built-in proof environment if amsthm were loaded.
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{proof}{Proof}
% \task: calligraphic T used as the task symbol; \likeli: L(T).
\newcommand{\task}{\mathcal{T}}
\newcommand{\likeli}{\mathcal{L} ({\task})}

% Model symbols: bold f, optionally subscripted with the task-specific
% parameter symbol. These bodies use \goal, which is defined further below;
% that is fine because the macros are only expanded when used in the document.
% \modeli / \modelipost carry superscript 0 / 1 and a trailing "()".
\newcommand{\modelnop}{\vf}
\newcommand{\model}{\vf_{\goal_{\task}}}
\newcommand{\modeli}{\vf_{\goal_{\task}^{0}}()}
\newcommand{\modelipost}{\vf_{\goal_{\task}^{1}}()}

\newcommand{\loss}{\mathcal{L}oss}


% Per-task notation. Superscripts tr / te are used in the document body for
% the training / test parts of a task; all symbols are subscripted with \task.
\newcommand{\point}{\mathbf{x}}
\newcommand{\goal}{\vec{\theta}}
\newcommand{\datatr}{\vec{D}^{tr}_{\task}}
\newcommand{\datats}{\vec{D}^{te}_{\task}}
\newcommand{\biggoal}{\vec{\Theta}_{\task}}
\newcommand{\taskZ}{\vec{Z}_{\task}}
\newcommand{\datayitr}{\vec{Y}_{\task}^{tr}}
\newcommand{\dataxitr}{\vec{X}_{\task}^{tr}}
\newcommand{\datayite}{\vec{Y}_{\task}^{te}}
\newcommand{\dataxite}{\vec{X}_{\task}^{te}}
\newcommand{\goali}{\goal_{\task}}



\usepackage{adjustbox}
\usepackage{array}
\usepackage{booktabs}

% Inline author notes: bracketed, coloured, and prefixed with the note's origin.
\newcommand{\qnote}[1]{[\textcolor{blue}{Q-note: #1}]}
\newcommand{\znote}[1]{[\textcolor{green}{Z-note: #1}]}



% Method name in typewriter font.
% \methodt ends with \xspace, which is provided by the xspace package; the
% package was not loaded anywhere in the preamble, so any use of \methodt
% would stop with "Undefined control sequence". It is loaded here
% (loading it again elsewhere is harmless).
% \method instead carries a hard-coded trailing space inside its body.
\usepackage{xspace}
\newcommand{\methodt}{\texttt{ST-MAML}\xspace}
\newcommand{\method}[0]{\texttt{ST-MAML} }



% \func{sub}{arg}: bold g with subscript #1 applied to #2.
% \gfunc{arg}: bold s applied to #1.
\newcommand{\func}[2]{\vec{g}_{#1}\left(#2\right)}
\newcommand{\gfunc}[1]{\vec{s}\left(#1\right)}
% Bold Greek letters via the bm package.
\newcommand{\btheta}{\bm{\theta}}
\newcommand{\bphi}{\bm{\phi}}


% Drafting marker printed in red typewriter text.
\newcommand{\here}{\textcolor{red}{\tt It ends here last time.}}

% Italic "et al." with a non-breaking space between the two words.
\def\etal{{\it et~al.}}



\setlength{\abovecaptionskip}{0pt plus 0pt minus 1pt}
\setlength{\belowcaptionskip}{0pt plus 0pt minus 1pt}

\usepackage{enumitem}
\setlist[itemize]{leftmargin=*}




% CompactItemize: a tight bulleted list built directly on the generic "list"
% environment. Every item is labelled with "--", the left margin is 12pt and
% the space between items is reduced by setting \itemsep to -2pt.
% \usecounter{enumi} attaches the enumi counter to the list, although the
% fixed "--" label never prints it.
\newenvironment{CompactItemize}
{
  \begin{list}{--}{%
      \usecounter{enumi}
      \setlength{\leftmargin}{12pt}%
      \setlength{\itemsep}{-2pt}
      }}
  {\end{list}}





\usepackage[nolist,nohyperlinks]{acronym}

% \removelatexerror: when called, replaces the internal \@latex@error with
% \@gobble (a macro that discards one argument).
% NOTE(review): presumably a local hack to suppress a LaTeX error message
% (e.g. around a float in an unsupported position); it is not invoked in the
% visible source -- confirm it is still needed.
\makeatletter
\newcommand{\removelatexerror}{\let\@latex@error\@gobble}
\makeatother


\usepackage{lipsum}
\usepackage{titlesec}


% ---------------------------------------------------------------------------
% Space-saving layout tweaks.
% ---------------------------------------------------------------------------
% titlesec: \subsection gets 0pt left indent, 5pt (stretchable) before and
% 2pt after the heading.
\titlespacing\subsection{0pt}{5pt plus 2pt minus 1pt}{2pt plus 0pt minus 0pt}


% No extra space between paragraphs or above displayed equations.
\parskip 0pt
\setlength\abovedisplayskip{0pt}

% 3pt between top/bottom floats and the text.
\setlength{\textfloatsep}{3pt}
% Adds 0mm to \parskip, i.e. leaves it unchanged.
 \addtolength{\parskip}{0mm}

% etoolbox's \preto prepends \parskip=3pt to the internal \@tabular macro,
% so the setting is applied whenever a tabular is started.
 \usepackage{etoolbox}
 \makeatletter
 \preto{\@tabular}{\parskip=3pt}
 \makeatother



% Zero space above and below displayed equations (long and short variants).
% \abovedisplayskip was already set to 0pt above; this repeats it.
% NOTE(review): standard classes re-set the display skips inside \normalsize,
% so values assigned in the preamble may be overwritten -- confirm these
% settings actually take effect with this class.
\setlength{\belowdisplayskip}{0pt} \setlength{\belowdisplayshortskip}{0pt}
\setlength{\abovedisplayskip}{0pt} 
\setlength{\abovedisplayshortskip}{0pt}

% Caption spacing: up to 1pt shrink above, and a negative 5pt skip below
% (pulls the following material closer to the caption).
\setlength{\abovecaptionskip}{0pt plus 0pt minus 1pt}
\setlength{\belowcaptionskip}{-5pt}


\usepackage{enumitem}
\setlist[itemize]{leftmargin=*}

\usepackage[font=small,skip=0pt]{caption}





\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{xr}
% \addFileDependency{file}: writes "(file)" to the log/terminal, appends the
% file to LaTeX's internal file list via \@addtofilelist, and prints
% "No file <file>." when the file does not exist.
% NOTE(review): presumably this exists so that build tools which scan the log
% pick up the external document as a dependency -- confirm against the build setup.
\makeatletter
\newcommand*{\addFileDependency}[1]{
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

% \myexternaldocument{name}: imports the labels of another document through
% xr's \externaldocument and registers both name.tex and name.aux as
% dependencies.
\newcommand*{\myexternaldocument}[1]{
    \externaldocument{#1}
    \addFileDependency{#1.tex}
    \addFileDependency{#1.aux}
}

% Make labels defined in wang_312 available to \ref in this document.
\myexternaldocument{wang_312}

\usepackage{natbib} %
\bibliographystyle{abbrvnat}
\setcitestyle{authoryear,open={(},close={)}} %
    
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} %
\usepackage{booktabs} %
\usepackage{tikz} %


% \swap[sep]{a}{b}: prints the second mandatory argument, then the optional
% separator (default "-"), then the first mandatory argument, i.e. "b-a".
\newcommand{\swap}[3][-]{#3#1#2} %

\title{\method : A Stochastic-Task based Method for Task-Heterogeneous Meta-Learning Supplementary Material}

\author[1]{\href{mailto:<zw6sg@virginia.edu>?Subject=Your UAI 2022 paper}{Zhe Wang}{}}
\author[1]{Jake Grigsby}
\author[1]{Arshdeep Sekhon}
\author[1]{Yanjun Qi}

\affil[1]{%
    Computer Science Dept.\\
    University of Virginia\\
    Charlottesville, Virginia, USA
}

  \begin{document}

\onecolumn
\maketitle
\section{Model Comparison.}
\label{app:model_comparison}
\begin{table*}[h]
\caption{Model comparison table. HoMAMLs are MAMLs designed for task homogeneity, and HeMAMLs are for heterogeneity. NPs describe methods in the Neural Processes family. PMAMLs mean probabilistic extensions of MAML. Aug feature represents the augmented features.}
\begin{center}
\begin{tabular}{c|c|cccc}
\toprule
Category & Tasks & Knowledge Set & Tailoring & Sampling & Inference on \\\midrule
\multirow{2}{*}{\shortstack{HoMAMLs}} & 
 MAML~\cite{finn2017model} & Initialization & & &  \\
 &MetaSGD~\cite{li2017meta} & Initialization$+$lr & & & \\\cmidrule{1-6}
\multirow{2}{*}{\shortstack{HeMAMLs}}& MMAML~\cite{vuorio2019multimodal}& Initialization &\checkmark & & \\ 
&HSML~\cite{yao2019hierarchically} & Initialization &\checkmark & &  \\\cmidrule{1-6}
\multirow{2}{*}{\shortstack{NPs}} & 
 NP~\cite{garnelo2018neural} & Aug feature  & & \checkmark & Representation \\
 &CNP~\cite{garnelo2018conditional} & Aug feature & &   & \\\cmidrule{1-6}
 \multirow{3}{*}{\shortstack{PMAMLs}}& BMAML~\cite{yoon2018bayesian}& Initialization  &  &\checkmark & Parameters\\ 
&PLATIPUS~\cite{finn2018probabilistic} &Initialization & & \checkmark  & Parameters \\
&\method &Initialization$+$Aug feature & \checkmark & \checkmark & Representation\\

\bottomrule
\end{tabular}
\end{center}
\label{table:model_comparison}
\end{table*}

\section{Approximation for posterior distribution $q(Z_{\task})$.}
\label{app:post_z}
Given the training set $\datatr$ of a task $\task$, we are supposed to infer the posterior distribution of the stochastic task variable $\taskZ$ conditioned on $\datatr$ only. Specifically, the true posterior is:
\begin{equation}
p(\taskZ|\task) = \dfrac{p(\taskZ|\datatr) p(\datayite|\taskZ, \dataxite, \datatr)}{p(\task)}
\end{equation}
where the empirical distribution $p(\task)$ is only known in the form of $\{(\datatr, \datats)\}$ pairs. Thus, the true posterior distribution is intractable. Based on our design, we suppose the prior distribution $p(\taskZ|\datatr)$ is a multivariate Gaussian distribution, whose mean and variance are the output of a set operator acting on $(\dataxitr, \datayitr)$ pairs. To ensure the posterior stays close to the prior, and because the posterior is derived from $(\datatr, \datats)$, we approximate it with the output of the same set operator acting on both $(\dataxitr, \datayitr)$ and $(\dataxite, \datayite)$ pairs. 


\section{Derivation of ELBO  approximation as Variational Information Bottleneck Objective}
\label{app:derivation}


For task $\task$, our fine-tuned task-specific knowledge set $\biggoal^1$ contains two variables: model parameters $\goali^1$ and augmented features $\h_{\task}^1$. Given task inputs $\X_{\task}=[\dataxitr, \dataxite]$, we are seeking a task-specific knowledge set that is maximally informative of the test target $\datayite$, while being maximally compressive of the training target $\datayitr$. Correspondingly, we would like to maximize the conditional mutual information $I(\datayite; \biggoal^1| \X_\task)$ and minimize $I(\datayitr; \biggoal^1 | \X_\task)$. The information bottleneck objective is:
\begin{equation}
\mathcal{L}_{IB}(\task) = I(\datayite; \biggoal^1| \X_\task) - \beta I(\datayitr; \biggoal^1 | \X_\task).
\label{eq:IB}
\end{equation}


We show the following lemma:
\begin{lemma}
Given a task $\task$, maximizing the information bottleneck loss $\mathcal{L}_{IB}$ defined in \eqref{eq:IB} is equivalent to maximizing the weighted ELBO :
 \begin{align}
    \mathcal{L}_{wELBO}(\task) = \mathbf{E}_{\biggoal^1\sim q(\biggoal^1|\task)} \log p(\datayite|\biggoal^1,\dataxite)- \beta KL(q(\taskZ|\task)||p(\taskZ|\datatr)).
\label{eq:weightLIB}
\end{align}
\end{lemma}

 







\begin{proof}To lower bound IB objective defined in \eref{eq:IB}, we derive the lower bound for first term $I(\datayite; \biggoal^1 | \X_{\task})$ and upper bound for second term $I(\datayitr; \biggoal^1 | \X_{\task})$. 
Further, we assume a distribution $q(\datayite, \biggoal^1|\X_{\task})$ as a variational approximation of the true distribution $p(\datayite, \biggoal^1|\X_{\task})$. 
\begin{equation}
\begin{split}
I(\datayite, \biggoal^1|\X_{\task})
&=\int p(\X_{\task})\left[\int q(\datayite, \biggoal^1|\X_{\task}) \log\dfrac{q(\datayite, \biggoal^1|\X_{\task})}{p(\datayite) q(\biggoal^1|\X_{\task})}d\datayite d\biggoal^1\right]d\X_{\task}\\
&= \int p(\X_{\task})\left[\int q(\datayite, \biggoal^1 | \X_{\task}) \log\dfrac{q(\datayite| \biggoal^1, \X_{\task})}{p(\datayite)}d\datayite d\biggoal^1\right]d\X_{\task}
\end{split}
\label{eq:IB_term1}
\end{equation}


\begin{equation}
\begin{split}
    q(\biggoal^1|\X_{\task})&=\int q(\biggoal^1 | \datayitr, \X_{\task}) p(\datayitr ) d\datayitr \\
    &= \int q(\biggoal^1 | \datayitr , \X_{\task}) p(\datayitr ,\datayite ) d\datayitr d\datayite 
    \end{split}
\end{equation}

\begin{equation}
\begin{split}
    q(\datayite , \biggoal^1|\X_{\task})&=\int q(\biggoal^1, \datayitr ,\datayite   |  \X_{\task})  d\datayitr \\
    &=\int q(\biggoal^1 | \datayitr ,\datayite   ,  \X_{\task}) p(\datayitr ,\datayite   | \X_{\task})  d\datayitr \\
    &=\int q(\biggoal^1 | \datayitr   ,  \X_{\task}) p(\datayitr ,\datayite   | \X_{\task})  d\datayitr
   \end{split}
\end{equation}
The last part follows  from the fact that $\biggoal^1$ is independent of $\datayite $ given $[\X_{\task}, \datayitr ]$. Putting this together: 

\begin{equation}
q(\datayite |\biggoal^1, \X_{\task}) = \dfrac{\int p(\datayite ,\datayitr )q(\biggoal^1 | \datayitr , \X_{\task})d\datayitr }{\int p(\datayite ,\datayitr )q(\biggoal^1 | \datayitr , \X_{\task})d\datayitr d\datayite }
\end{equation}

However, the above conditional distribution $q(\datayite |\biggoal^1, \X_{\task})$ is intractable due to the unknown data distribution $p(\datayite ,\datayitr )$. To derive the lower bound, we introduce a variational approximation $ p_{\theta}(\datayite |\biggoal^1, \X_{\task})$ for $q(\datayite |\biggoal^1, \X_{\task})$. 

Substituting it into \eref{eq:IB_term1}, we have:

\begin{equation}
\begin{split}
I(\datayite , \biggoal^1|\X_{\task}) &= \int p(\X_{\task}) \left[\int q(\datayite , \biggoal^1|\X_{\task})\log\dfrac{p_{\theta}(\datayite |\biggoal^1, \X_{\task})q(\datayite | \biggoal^1, \X_{\task})}{p_{\theta}(\datayite |\biggoal^1, \X_{\task})p(\datayite )}d\datayite d\biggoal^1\right]d\X_{\task}\\
&\geq \int p(\X_{\task}) \left[\int q(\datayite , \biggoal^1| \X_{\task})\log\dfrac{p_{\theta}(\datayite |\biggoal^1, \X_{\task})}{p(\datayite )}d\datayite d\biggoal^1\right]d\X_{\task}\\
&= \int p(\X_{\task}) \left[\int q(\datayite , \biggoal^1|\X_{\task})\log p_{\theta}(\datayite |\biggoal^1, \X_{\task})d\datayite d\biggoal^1\right]d\X_{\task} + C\\
&= \int q(\datayite , \biggoal^1, \X_{\task})\log p_{\theta}(\datayite |\biggoal^1, \X_{\task})d\datayite d\biggoal^1d\X_{\task} + C
\end{split}
\label{eq:IB_term1_app1}
\end{equation}


In the above equation, we use  $KL(q(\datayite |\biggoal^1,\X_{\task})|| p_{\theta}(\datayite |\biggoal^1, \X_{\task})) \geq 0$ in the second step.


The second term is irrelevant to our objective so we can treat it as a constant. Note that:
\begin{equation}
 q(\datayite , \biggoal^1, \X_{\task})
 =\int q(\biggoal^1|\datayitr , \X_{\task})p(\datayitr, \datayite|\X_{\task})p(\X_{\task})d\datayitr 
\end{equation}
Thus, an unbiased estimation of the first term is:
\begin{equation}
I(\datayite , \biggoal^1|\X_{\task}) \geq  \int  q(\biggoal^1 | \datayitr, \X_{\task})\log p_{\theta}(\datayite |\biggoal^1, \X_{\task}) d\biggoal^1.
\label{eq:upper_bound}
\end{equation}


We derive the upper bound for second term:
\begin{equation}
\begin{split}
I(\datayitr , \biggoal^1|\X_{\task}) 
&=\int p(\X_{\task})\left[\int q(\datayitr , \biggoal^1|\X_{\task}) \log\dfrac{q(\datayitr , \biggoal^1|\X_{\task})}{p(\datayitr ) q(\biggoal^1|\X_{\task})} d\datayitr d\biggoal^1\right]d\X_{\task}\\
&= \int p(\X_{\task})\left[\int q(\datayitr , \biggoal^1 | \X_{\task}) \log\dfrac{q(\biggoal^1 | \datayitr , \X_{\task})}{q(\biggoal^1|\X_{\task})}d\datayitr d\biggoal^1\right]d\X_{\task}
\end{split}
\label{eq:IB_term2}
\end{equation}
The denominator $q(\biggoal^1|\X_{\task}) = \int q(\biggoal^1|\datayitr , \X_{\task})p(\datayitr )d\datayitr $ is intractable for unknown $p(\datayitr )$. We  approximate it with $p_{\theta}(\biggoal^1|\X_{\task})$. With similar derivation, the second term is upper bounded by:
\begin{equation}
 I(\datayitr , \biggoal^1|\X_{\task}) \leq  \int q(\biggoal^1|\datayitr , \X_{\task})p(\datayitr , \X_{\task})\log\dfrac{q(\biggoal^1 | \datayitr , \X_{\task} )}{p_{\theta}(\biggoal^1|\X_{\task})}d\datayitr d\biggoal^1 d\X_{\task}.
\end{equation}



Similarly, its unbiased estimation is given as:
\begin{equation}
 I(\datayitr , \biggoal^1|\X_{\task}) \leq  \int q(\biggoal^1|\datayitr , \X_{\task})\log\dfrac{q(\biggoal^1 | \datayitr , \X_{\task} )}{p_{\theta}(\biggoal^1|\X_{\task})}d\biggoal^1.
\end{equation}

Combining the two terms, we get the overall unbiased estimate of the IB loss:
\begin{equation}
    L_{IB} = \mathbf{E}_{q(\biggoal^1 | \datayitr , \X_{\task})}\log p_{\theta}(\datayite |\biggoal^1, \X_{\task})  - \beta KL(q(\biggoal^1|\datayitr , \X_{\task})||p_{\theta}(\biggoal^1|\X_{\task})).
\label{eq:LIB_unbias}
\end{equation}

To incorporate target information, we inject the target variable $\datayite $ into posterior and $\datayitr $ into prior, and get the new approximation:
\begin{equation}
    L_{IB} = \mathbf{E}_{q(\biggoal^1 | \task)}\log p_{\theta}(\datayite |\biggoal^1, \X_{\task})
    - \beta KL(q(\biggoal^1|\task)||p_{\theta}(\biggoal^1|\datayitr , \X_{\task})).
\label{eq:LIB_unbias_np}
\end{equation}


Since $\goali^0=\vg^{Gate}_{\vw}(\goal, \taskZ)$ and $\h_{\task}^0=\vg^{Gate}_{\vbeta}(\taskZ)$, where $\vg^{Gate}_{\vw}$ and $\vg^{Gate}_{\vbeta}$ are both deterministic and invertible mappings of $\taskZ$, we have $p(\goali^0|\taskZ, \goal) = \delta(\goali^0=\vg^{Gate}_{\vw}(\goal, \taskZ))$ and $p(\h_{\task}^0|\taskZ) = \delta(\h_{\task}^0=\vg^{Gate}_{\vbeta}(\taskZ))$. Moreover, $\h_{\task}^0$ and $\goali^0$ are conditionally independent given $\taskZ$. Similarly, $\h_{\task}^1$ and $\goali^1$ are deterministic functions of $\h_{\task}^0$ and $\goali^0$. Thus, the second term in \eref{eq:LIB_unbias_np} can be replaced with the divergence between the posterior and prior distribution of $\taskZ$, i.e., $KL(q(\taskZ|\task)||p(\taskZ|\datayitr , \dataxitr))$. 




We now look into the log likelihood term in \eref{eq:LIB_unbias}. Since the transitions $\taskZ \to \goali^0 \to \goali^1$ and $\taskZ \to \h_{\task}^0 \to \h_{\task}^1$ are deterministic:
\begin{equation}
\begin{split}
    \goali^1 = \goali^{0} -\nabla_{\goal} \mathcal{L}(\vf_{\goali^0}, \h_{\task}^0, \datatr),\quad  \goali^0 = \vg^{Gate}_{\vw}(\goal, \vz), \quad \vz\sim q(\taskZ|\task)\\
    \h_{\task}^1 = \h_{\task}^0 - \nabla_{\h}\mathcal{L}(\vf_{\goali^0}, \h_{\task}^0, \datatr),\quad \h_{\task}^0 = \vg^{Gate}_{\vbeta}(\vz).
    \end{split}
\label{eq:g_theta_post}
\end{equation}

According to the analysis, the approximation to be optimized is:
\begin{equation}
    L_{app} = \mathbf{E}_{\biggoal^1\sim q(\biggoal^1|\task)} \log p_{\theta}(\datayite | \goali^1, \dataxite)
    - \beta KL(q(\taskZ|\task)||p(\taskZ|\datatr)).
\label{eq:LIB_app2}
\end{equation}
\end{proof}

\section{Heterogeneous Few Shot Binary Classification Results.}
\label{app:exp_binary}
\textbf{Task design.} In classification, task ambiguity is common when annotated data are limited. Images can share many attributes, and various combinations of them can be used for final decision-making. We evaluate our method on the ambiguous classification benchmark proposed in \cite{finn2018probabilistic}. The CelebA dataset contains cropped images of celebrity faces and a list of attributes that describe their appearance.  We split these attributes into training, validation, and test sets. During meta-training, we randomly sample two training attributes and form the positive class of images that share them. The negative class is formed by sampling the same number of images containing neither attribute. During meta-testing, training set images share three attributes. We construct three test sets by choosing two of the three attributes to define the positive class. The model learns to apply two attributes for decision making, but there are three combinations of two attributes for classification. Thus the task is ambiguous. We sample models from our distribution of solutions and assign them to the three test sets based on the loss values. If all test sets are covered with at least one model, the method can effectively discover all potential decision rules. The coverage number is calculated as the average number of test sets that are covered. The coverage number for a deterministic method is $1$. As Table~\ref{table:amb_clf} shows, our method can 1) achieve better accuracy, 2) reach lower NLL, and 3) discover more decision rules compared to MAML.

 

\begin{table}[h!]
\centering
\caption{5-Shot Ambiguous Binary Classification.\label{table:amb_clf}}
\resizebox{0.45\textwidth}{!}{%
\begin{tabular}{c|ccc}
\hline
Model & Accuracy    & Coverage number & NLL  \\ \hline
MAML   & 77.924  & 1.00 &     0.454            \\
\method  & 79.698 & 1.13  &    0.439            \\\hline
\end{tabular}}
\end{table}
\section{Experiment setup for Simulation.}
\label{app:exp_setup}







\textbf{2D Regression setup.} Meta distribution $\mathcal{T}$ contains 6 function families. Input $X = [x_1,x_2]\sim U(0.0, 5.0)$. The value for $x_2$ is fixed as 1 if only $x_1$ is used. For \textit{sinusoids} families : $y = a \sin(wx_1 + b)+\epsilon$, where $a \sim U[0.1, 5.0], b \sim U[0, 2\pi], w\sim  U[0.8, 1.2]$; for \textit{line} families: $y = ax_1 + b+\epsilon$, where $a\sim  U[-3.0,3.0], b\sim U[-3.0,3.0]$;  for \textit{quadratic curves}: $y = ax_1^2 + bx_1 + c+\epsilon$, where $a\sim U[-0.2, 0.2], b\sim U[-2.0, 2.0], c\sim U[-3.0, 3.0]$; for \textit{cubic curves}: $y = ax_1^3 + bx_1^2 + cx_1 + d+\epsilon$, where $a \sim U [-0.1, 0.1], b\sim U [-0.2, 0.2], c\sim U [-2.0, 2.0], d\sim U [-3.0, 3.0]$; for \textit{quadratic surface}: $y = ax_1^2 + bx_2^2+\epsilon$, where $a\sim U[-1.0, 1.0], b\sim U[-1.0, 1.0]$;  for \textit{ripple}: $y = \sin(-a(x_1^2 + x_2^2)) + b+\epsilon$, where $a\sim U[-0.2,0.2], b\sim U[-3.0,3.0]$. 

\textbf{Model architecture for 2D regression.} We adopt the same base model as in~\cite{yao2020automated, finn2017model}; it contains 2 linear layers with 40 neurons followed by a ReLU function.  For the task representative module, we use 2 linear layers with 80 neurons.

\textbf{Visualization for 2D regression.} See Figure~\ref{fig:2DReg_10shot}.
\begin{figure*}[thb]
  \centering 
    \begin{minipage}[b]{.24\textwidth} 
      \centering 
      \includegraphics[width=\linewidth]{Fig2D_1.pdf}
    \end{minipage} 
    \begin{minipage}[b]{0.24\textwidth} 
      \centering 
      \includegraphics[width=1\linewidth]{Fig2D_2.pdf}
    \end{minipage} 
    \begin{minipage}[b]{0.24\textwidth} 
      \centering 
      \includegraphics[width=1\linewidth]{Fig2D_3.pdf}
    \end{minipage}
    \begin{minipage}[b]{0.24\textwidth} 
      \centering 
      \includegraphics[width=1\linewidth]{Fig2D_4.pdf}
    \end{minipage}
  \caption{\label{fig:2DReg_10shot}Qualitative Visualization of fitting curves. Black stars represent training set $\datatr$, 10 different samples of fitting curves are shown as colored dotted lines. The blue solid line is the true mapping.} 
\end{figure*}


\textbf{More results for 2D regression.}
During meta-training, we fixed the size of training set $|\datatr|$ as 10, the standard deviation for Gaussian noise $\sigma$ to be 0.3. During meta-testing, we can decrease the size of training set or increase the noise level such that task ambiguity can be more concerning. We visualize them in Figure~\ref{fig:2DReg_2_5shot}. The model can effectively reason over ambiguity as we vary the size of the training data or noise level. The sampled functions tend to span wider space as $|\datatr|$  decreases or the noise level increases. However, they stay faithful around those annotated training data.



\textbf{NOAA GSOD Dataset Details}. The data is available at \url{https://data.noaa.gov/dataset/dataset/global-surface-summary-of-the-day-gsod}. The dataset is large, so we reduce the size while preserving a wide range of years by using every $10$th year from $1969$ to $2019$. Each file in the unzipped dataset corresponds to one year of data at a particular station. Files that do not contain at least $40$ days of data are ignored. Task number $i$ is created in the following way:
\begin{enumerate}
    \item We sample $40$ days of data that have valid temperature entries.
    \item We drop the columns (``STATION'', ``NAME'', ``TEMP\_ATTRIBUTES'', ``DEWP'',
                ``DEWP\_ATTRIBUTES'',
                ``PRCP\_ATTRIBUTES'',
                ``SLP\_ATTRIBUTES'',
                ``STP\_ATTRIBUTES'',
                ``VISIB\_ATTRIBUTES'',
                ``WDSP\_ATTRIBUTES'',
                ``MAX'',
                ``MIN'',
                ``MAX\_ATTRIBUTES'',
                ``MIN\_ATTRIBUTES'',
                ``LATITUDE'', and
                ``LONGITUDE'').
    \item We convert the date column from (MM/DD/YYYY) to a float [0, 1] representing the time since the first day of that year.
    \item The ``FRSHTT'' column is a 6-bit binary string where each digit indicates the presence of fog, rain, snow, hail, thunder, and tornadoes respectively. We transform the ``FRSHTT'' column into 6 binary columns.
    \item The GSOD dataset reports missing values with all $9$s, e.g., $99.99$ or $999.9$. We find and replace these values with $0.0$. We also replace NaN entries with $0.0$.
    \item The units of some input variables are adjusted to bring their values down to a smaller range. Pressure variables (``SLP'' and ``STP'') are converted from millibars to bars. Elevation is changed from meters to kilometers.
    \item The ``TEMP'' variable is split from the data to become our target value.
\end{enumerate}

We use a 42k/5k/1k split to divide the files into train, val and test sets.

\textbf{Model architecture for weather prediction.}
Similar to 2D regression, the feature learner has two linear layers with 100 neurons followed by ReLU activation function. The mapping to task representation $\taskZ$ contains 3 layers with hidden dimension 40, 80, 200. The augmented dimension is set to be 20. 



\textbf{Model runtime and compute.} The model trains on one GTX 2080 card. Training times vary by experiment, ranging from a few hours to a day.


\begin{figure*}[th!]
    \centering
    \makebox[\textwidth][c]{\includegraphics[width=0.75\textwidth]{temp_preds.pdf}}
    \caption{A visualization of trained \method on the NOAA-GSOD temperature prediction task. The model is given $10$ training points (red) and predicts the remaining days of the year (orange). The true temperatures are shown in blue.  }
    \label{fig:temp_preds}
\end{figure*}






\FloatBarrier









\end{document}