%\documentclass[export]{uai2022} % for initial submission
\documentclass[export, accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
%\usepackage[export]{adjustbox}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b} prints its two mandatory arguments in reverse order with
% the optional separator (default "-") between them: \swap{a}{b} gives "b-a",
% \swap[+]{a}{b} gives "b+a".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% ====== CUSTOM =======
%%%%% Custom MATH definitions
% Paper-specific notation. The bodies use the shorthands \mb and \mc, which
% are defined further down in this preamble; that works because a macro body
% is only expanded where the macro is used, not where it is defined.

% G with a bold subscript c / r. These are aliased below as \cond (the
% precondition of a subtask) and \subrew (the subtask reward function).
\newcommand{\GC}{ G_{\mb{c}} }
\newcommand{\GR}{ G_{\mb{r}} }
% Hatted variants; \smash zeroes the box height/depth so the wide hat does
% not push lines apart.
\newcommand{\GChat}{{\smash{\widehat{G}_{\mb{c}}}}}
\newcommand{\GRhat}{{\smash{\widehat{G}_{\mb{r}}}}}
% \CSet: goal-state set of a subtask (used as \CSet^i in the text).
% \ESet: presumably the matching eligibility set -- TODO confirm at use sites.
\newcommand{\CSet}{ \mc{S}_{\text{comp}} }
\newcommand{\ESet}{ \mc{S}_{\text{elig}} }

% Training and test tasks.
\newcommand{\traintask}{\mc{M}^\text{train}}
\newcommand{\testtask}{\mc{M}^\text{test}}

% A subtask and the two aliases for its precondition and reward function.
\newcommand{\subtask}{ \Phi }
\newcommand{\cond}{ \GC }
\newcommand{\subrew}{\GR}

% Evaluation policy, adaptation policy, and prior trajectory.
\newcommand{\pieval}{ \pi^\text{eval} }
\newcommand{\piadapt}{ \pi^\text{adapt} }
\newcommand{\taup}{ \tau^\text{p} }

% Editing markers: put the word FIX / NEW in the margin next to the current line.
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}
% Short aliases for font commands. Each expands to the bare font command, so
% it still takes one argument at the point of use, e.g. \mb{x} -> \mathbf{x}.
\newcommand{\mb}{\mathbf}
\newcommand{\tb}{\textbf}
\newcommand{\mbb}{\mathbb}
\newcommand{\mc}{\mathcal}
\newcommand{\wt}{\widetilde}
\newcommand{\tr}{\textrm}

% Latin abbreviations with a "smart" final period.
%
% \onedot looks at the next token: if it is already a period, nothing is
% printed (so "\etal." does not give two periods); otherwise it prints "."
% followed by \null, so the period is not treated as the end of a sentence,
% and then \xspace restores a following inter-word space when appropriate.
% This needs the xspace package, which this preamble loads before
% \begin{document}.
%
% The helper \@onedot has an @ in its name, hence the \makeatletter guard.
% Previously the robust \onedot was immediately overwritten by a plain
% "\def\onedot{.}" and \@onedot was never defined.
\makeatletter
\DeclareRobustCommand\onedot{\futurelet\@let@token\@onedot}
\def\@onedot{\ifx\@let@token.\else.\null\fi\xspace}
\makeatother
\def\eg{\emph{e.g}\onedot} \def\Eg{\emph{E.g}\onedot}
\def\ie{\emph{i.e}\onedot} \def\Ie{\emph{I.e}\onedot}
% "cf." abbreviates the single word "confer", so it has no inner period.
\def\cf{\emph{cf}\onedot} \def\Cf{\emph{Cf}\onedot}
% NOTE(review): \vs is redefined later in this preamble as a bold vector
% symbol (\def\vs{{\bm{s}}}), so the text abbreviation defined here is not
% what \vs produces in the document body.
\def\etc{\emph{etc}\onedot} \def\vs{\emph{vs}\onedot}
\def\wrt{w.r.t\onedot} \def\dof{d.o.f\onedot}
\def\etal{\emph{et al}\onedot}
%%%%% NEW MATH DEFINITIONS %%%%%

\usepackage{amsmath,amsfonts,bm}

% Highlight a newly defined term: \newterm{subtask graph} prints it in bold.
% Uses \textbf rather than the obsolete two-letter font switch {\bf ...}.
\newcommand{\newterm}[1]{\textbf{#1}}


% Cross-reference helpers. Each macro takes one or more label names and
% prints the element name followed by the referenced number(s). A tie (~)
% keeps every number on the same line as the word in front of it.
%
% NOTE: \eqref below overrides any earlier definition of \eqref (such as the
% one provided by amsmath), so in this document \eqref{x} prints
% "equation N" without parentheses.

% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
% Reference to two / four figures.
\def\twofigref#1#2{figures~\ref{#1} and~\ref{#2}}
\def\quadfigref#1#2#3#4{figures~\ref{#1}, \ref{#2}, \ref{#3} and~\ref{#4}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Reference to two sections.
\def\twosecrefs#1#2{sections~\ref{#1} and~\ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections~\ref{#1}, \ref{#2} and~\ref{#3}}
% Reference to an equation, lower-case.
\def\eqref#1{equation~\ref{#1}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters
% (the space between "chapters" and the first number was missing before).
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
% Reference to two algorithms, lower / upper case.
\def\twoalgref#1#2{algorithms~\ref{#1} and~\ref{#2}}
\def\Twoalgref#1#2{Algorithms~\ref{#1} and~\ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
% Reference to two parts.
\def\twopartref#1#2{parts~\ref{#1} and~\ref{#2}}

% Ceiling / floor brackets around the argument.
\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
% Bold digit one (same body as \vone further down).
\def\1{\bm{1}}
% Calligraphic D, plain and with an upright "valid" / "test" subscript.
% The subscript is attached outside the \mathcal argument so that only the
% letter D is set in the calligraphic alphabet; the outer braces keep each
% symbol a single unit, so a following ^ or _ still applies to the whole.
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{{\mathcal{D}_{\mathrm{valid}}}}
\newcommand{\test}{{\mathcal{D}_{\mathrm{test}}}}

% Short name for \epsilon.
\def\eps{{\epsilon}}


% Random variables
% \r<letter> typesets the letter in the upright text font via \textnormal
% (\reta switches back to math inside \textnormal for the Greek letter).
% All macros use \def, which silently replaces any existing command of the
% same name; in particular \rq below replaces LaTeX's own \rq
% (right-quote) command.
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
% rm is already a command, just don't name any random variables m
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% Random vectors
% \rv<letter> typesets the letter in upright bold via \mathbf.
% NOTE(review): \mathbf does not embolden lowercase Greek letters with the
% standard math fonts, so \rvepsilon and \rvtheta may render like plain
% \epsilon and \theta -- confirm the output if these are used.
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% Was misnamed \rvu, which left \rvi undefined; the stray \rvu was in any
% case overwritten by the real \rvu below.
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% Elements of random vectors
% \erv<letter> typesets the letter upright via \textnormal, i.e. the same
% body as the scalar random-variable macros \ra ... \rz. Unlike that family,
% the letter m is available here (\ervm).
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% Random matrices
% \rm<LETTER> typesets the capital letter in upright bold via \mathbf.
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% Elements of random matrices
% \erm<LETTER> typesets the capital letter upright (not bold) via \textnormal.
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% Vectors
% \v<letter> typesets the symbol in bold via \bm (from the bm package),
% which also works for the digits and Greek letters below.
% NOTE(review): \vs defined here replaces the text abbreviation \vs ("vs.")
% that is defined earlier in this preamble; after this point \vs is the
% bold vector s.
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

% Elements of vectors
% \ev<name> expands to the plain (non-bold) math symbol wrapped in braces.
% The macros add no styling of their own; they only give vector entries a
% distinct name in the source.
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% Matrix
% \m<LETTER> typesets the capital letter in bold via \bm; four Greek
% variants follow the Latin alphabet.
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

% Tensor
% \mathsfit is a math alphabet built on the default sans-serif family:
% medium weight, slanted shape in the normal math version, and bold-extended
% upright in the bold math version.
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
% \tens{X} sets X in \mathsfit and wraps it in \bm; \t<LETTER> are the
% per-letter shortcuts.
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% Graph
% \g<LETTER> typesets the capital letter in the calligraphic alphabet
% (\mathcal).
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% Sets
% \s<LETTER> typesets the capital letter in blackboard bold (\mathbb).
% There is deliberately no \sE; see the note below.
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% Entries of a matrix
% \em<NAME> expands to the plain (non-bold) capital symbol wrapped in
% braces, the unstyled counterpart of the bold matrix macros \mA ... \mZ.
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% entries of a tensor
% Same font as tensor, without \bm wrapper
% \etens{X} sets X in the \mathsfit alphabet; \et<NAME> are the per-symbol
% shortcuts.
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% Named probability distributions. Each is a letter p/P (optionally with a
% hat or tilde) carrying an upright word as subscript. The subscript uses
% \mathrm{...}; the previous \rm{...} relied on the obsolete font switch
% \rm, which takes no argument and merely happened to affect the rest of
% the subscript group.

% The true underlying data generating distribution
\newcommand{\pdata}{p_{\mathrm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

% Assorted named symbols. Upright names use \mathrm; the rest are aliases
% that give a plain symbol a descriptive name in the source.
\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution

\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

% NOTE(review): "Pa" is set as two ordinary math italic letters here, not as
% an upright operator name.
\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.

% Upright operator names. The starred form places sub/superscripts
% underneath/above the operator in display style (like \max).
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
% \ab is a short alias for \allowbreak (an optional break point).
\let\ab\allowbreak


% Bold name macros, presumably one per website of the web navigation domain
% (15 are active; the commented-out ones are kept for reference). Each ends
% with \xspace so a following space in running text is preserved; this needs
% the xspace package, which is loaded further down in the preamble.
%\newcommand{\adidas}{\textbf{Adidas}\xspace}
\newcommand{\amazon}{\textbf{Amazon}\xspace}
\newcommand{\apple}{\textbf{Apple}\xspace}
\newcommand{\bestbuy}{\textbf{BestBuy}\xspace}
%\newcommand{\converse}{\textbf{Converse}\xspace}
\newcommand{\dicks}{\textbf{Dick's}\xspace}
\newcommand{\ebay}{\textbf{eBay}\xspace}
\newcommand{\expedia}{\textbf{Expedia}\xspace}
\newcommand{\ikea}{\textbf{Ikea}\xspace}
\newcommand{\lego}{\textbf{Lego}\xspace}
\newcommand{\lenox}{\textbf{Lenox}\xspace}
\newcommand{\omahasteaks}{\textbf{Omahasteaks}\xspace}
\newcommand{\swarovski}{\textbf{Swarovski}\xspace}
%\newcommand{\samsung}{\textbf{Samsung}\xspace}
%\newcommand{\sephora}{\textbf{Sephora}\xspace}
\newcommand{\thriftbooks}{\textbf{Thriftbooks}\xspace}
%\newcommand{\target}{\textbf{Target}\xspace}
\newcommand{\todaytix}{\textbf{Todaytix}\xspace}
\newcommand{\walgreens}{\textbf{Walgreens}\xspace}
\newcommand{\walmart}{\textbf{Walmart}\xspace}
%
% Monospaced identifier Click_Place_Order (no \xspace, unlike the names above).
\newcommand{\order}{\texttt{Click\_Place\_Order}}

% Domain
% Italic domain names; \xspace keeps the space that follows the macro in
% running text.
\def\wob{\textit{SymWoB}\xspace}
\def\mining{\textit{Mining}\xspace}

% Methods
% Bold method names. The definitions must not start with a space: a space
% right after the opening brace is a real space token, so "our \mtsgi" would
% get two inter-word spaces and "(\mtsgi" would print "( MTSGI".
\def\mtsgi{\textbf{MTSGI}\xspace}
\def\msgi{\textbf{MSGI}\xspace}
% \nti prints the same text as \msgi.
\newcommand{\nti}{\textbf{MSGI}\xspace}
\def\hrl{\textbf{HRL}\xspace}
\def\random{\textbf{Random}\xspace}

% Mark sections of captions for referring to divisions of figures
% Position markers and sub-caption letters are emphasized and parenthesized;
% every macro ends with \xspace so it can be followed directly by a word.
\newcommand{\figleft}{{\em (Left)}\xspace}
\newcommand{\figcenter}{{\em (Center)}\xspace}
\newcommand{\figright}{{\em (Right)}\xspace}
\newcommand{\figtop}{{\em (Top)}\xspace}
\newcommand{\figbottom}{{\em (Bottom)}\xspace}
\newcommand{\captiona}{{\em (a)}\xspace}
\newcommand{\captionb}{{\em (b)}\xspace}
\newcommand{\captionc}{{\em (c)}\xspace}
\newcommand{\captiond}{{\em (d)}\xspace}
% Bold blue letters A/B/C, used in the text to point at labelled regions of
% a figure. \color needs a colour package to be loaded.
\newcommand{\figA}{\textbf{\color{blue} A}\xspace}
\newcommand{\figB}{\textbf{\color{blue} B}\xspace}
\newcommand{\figC}{\textbf{\color{blue} C}\xspace}

\usepackage{algorithm}
\usepackage{algorithmic}
%\usepackage{booktabs}       % professional-quality tables
\usepackage{boldline}
\usepackage{xspace}
\usepackage{comment}
\usepackage{microtype}
%\usepackage[colorinlistoftodos]{todonotes}
\usepackage{cleveref}
%\usepackage[export]{adjustbox}
\usepackage{adjustbox}
% ====== CUSTOM =======


% Vertical-space knobs for squeezing the layout. Each \cut...up / \cut...down
% macro is meant to be placed before / after the corresponding element and
% inserts a (negative) vertical skip. Most are currently zero, i.e. no-ops;
% only \cutsectionup (-0.05in) and the two itemize knobs (-5pt) remove space.
\newcommand{\cutabstractup}{\vspace*{-0.0in}}
\newcommand{\cutabstractdown}{\vspace*{-0.0in}}

\newcommand{\cutsectionup}{\vspace*{-0.05in}}
\newcommand{\cutsectiondown}{\vspace*{-0.0in}}

\newcommand{\cutsubsectionup}{\vspace*{-0.0in}}
\newcommand{\cutsubsectiondown}{\vspace*{-0.0in}}

\newcommand{\cutsubsubsectionup}{\vspace*{-0in}}
\newcommand{\cutsubsubsectiondown}{\vspace*{-0in}}

\newcommand{\cutcaptionup}{\vspace*{-0.0in}}
\newcommand{\cutcaptiondown}{\vspace*{-0.0in}}

\newcommand{\cutparagraphup}{\vspace{-0pt}}
\newcommand{\cutparagraphdown}{\vspace*{-0in}}

\newcommand{\cutitemizeup}{\vspace{-5pt}}
\newcommand{\cutitemizedown}{\vspace{-5pt}}
% \tightlist removes the space between list items and paragraphs inside a
% list; \providecommand defines it only if it does not exist yet.
\providecommand{\tightlist}{\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
% ====== CUSTOM =======

\title{Fast Inference and Transfer of Compositional Task Structures\\ for Few-shot Task Generalization}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors

\author[1, 2]{\href{mailto:<srsohn@umich.edu>}{Sungryull Sohn}{}}
\author[1]{Hyunjae Woo}
\author[1]{Jongwook Choi}
\author[1]{Lyubing Qiang}
\author[3]{Izzeddin Gur}
\author[3]{Aleksandra Faust}
\author[1,2]{Honglak Lee}

% Add affiliations after the authors
\affil[1]{%
    University of Michigan
}
\affil[2]{%
    LG AI Research Center Ann Arbor
}
\affil[3]{%
    Google Research
}
  
  \begin{document}
\maketitle

\begin{abstract}
    We tackle real-world problems with complex structures beyond the pixel-based game or simulator.
    We formulate it as a few-shot reinforcement learning problem where a task is characterized by a subtask graph that defines a set of subtasks and their dependencies that are unknown to the agent.  
    Different from the previous meta-RL methods trying to directly infer the unstructured task embedding, our multi-task subtask graph inferencer (MTSGI) first infers the common high-level task structure in terms of the subtask graph from the training tasks, and then uses it as a prior to improve the task inference in testing.
    Our experiment results on 2D grid-world and complex web navigation domains show that the proposed method can learn and leverage the common underlying structure of the tasks for faster adaptation to the unseen tasks than various existing algorithms such as meta reinforcement learning, hierarchical reinforcement learning, and other heuristic agents.
\end{abstract}    


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure*}[!htp]
    \centering
    \includegraphics[draft=false,width=0.98\linewidth, valign=b]{./figure/Overview_task_v3.pdf}%
    %\vspace{-8pt}
    \caption{%
        An illustration of the train \figtop and test task \figbottom in our \wob domain. Some selected actionable web-elements (\eg, text fields and buttons) are magnified (dotted arrow and box) for readability. The agent's goal ({\color{green} green box}) is to check out the products on an unseen test website by interacting with the web elements in the correct order. %by learning the high-level structure of the tasks in training. 
        For example, in the train task, the agent should fill out all the text fields in ({\em Top}, \figA) \textbf{before} clicking the \texttt{credit\_card} button to transition ({\color{gray} gray arrow}) to the next page.
        The high-level checkout processes in different websites have many commonalities while certain details may differ. %The agent is expected to capture such common knowledge while quickly adapting to the different details to solve the unseen test tasks.
        For example, in both train and test tasks, the agent should fill out the user information ({\em Top} and {\em Bottom}, \figA) before proceeding to the next page or there exist similar elements ({\em Top} and {\em Bottom}, \figC). However, the details may differ; \eg, the train task ({\em Top}, \figA) has a single text field for full name, while the test task ({\em Bottom}, \figA) has separate text fields for the first and last name, respectively. Also, only the test website ({\em Bottom}, \figB) requires shipping information since the training website does not ship the product.
    }
    \label{fig:task_overview}
\end{figure*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\vspace*{-10pt}
\cutsectionup
\section{Introduction}\label{sec:i}
\cutsectiondown
Recently, deep reinforcement learning (RL) has shown outstanding performance in various domains such as video games~\citep{mnih2015human, vinyals2019grandmaster} and board games~\citep{silver2017mastering}.
However, most of the successes of deep RL were focused on a single-task setting where the agent is allowed to interact with the environment for hundreds of millions of time steps. 
In numerous real-world scenarios, interacting with the environment is expensive or limited, and the agent is often presented with a novel task that is not seen during its training time.
%
% There were other multi-task works but requires human knowledge
To overcome this limitation, many recent works focused on scaling the RL algorithm beyond the single-task setting.
Recent works on multi-task RL aim to build a single, contextual policy that can solve multiple related tasks and generalize to unseen tasks.
However, they require a certain form of task embedding as an extra input that often fully characterizes the given task~\citep{oh2017zero, Andreas2017Modular, yu2017deep, chaplot2018aaai}, or require a human demonstration~\citep{huang2018neural}, which are not readily available in practice.
%
% MEta-rl is more general but focused only on the simple task or a set of similar tasks
Meta RL~\citep{finn2017model,duan2016rl} focuses on a more general setting where the agent should learn about the unseen task purely via interacting with the environment without any additional information. However, such meta-RL algorithms either require a large amount of experience on the diverse set of tasks or are limited to a relatively smaller set of simple tasks with a simple task structure.

% 
% In Real-world, agent should solve complex tasks with
On the contrary, real-world problems require the agent to solve much more complex and compositional tasks without human supervision.
% Explanation why web-navigation task is compositional and complex
Consider a web-navigating RL agent given the task of checking out the products from an online store as shown in~\Cref{fig:task_overview}.
The agent can complete the task by filling out the required web elements with the correct information such as shipping or payment information, navigating between the web pages, and placing the order.
Note that the task consists of multiple \textit{subtasks} and the subtasks have complex dependencies in the form of \textit{precondition};
%For instance, the agent may proceed to the next web page by clicking the \texttt{continue} button \textit{after} all the required information has been correctly filled in, or the \texttt{credit card number} field will show up \textit{after} selecting the \texttt{credit card} as a payment method.
for instance, the agent may proceed to the payment web page (see {\em Bottom}, {\color{blue} B}) \textit{after} all the required shipping information has been correctly filled in (see {\em Bottom}, {\color{blue} A}), or the \texttt{credit\_card\_number} field will appear \textit{after} selecting the \texttt{credit\_card} as a payment method (see {\em Top, Middle} in~\Cref{fig:task_overview}).
%%%%%
Learning to perform such a task can be quite challenging if the reward is given only after yielding meaningful outcomes (\ie, sparse reward task).
This is the problem scope we focus on in this work: solving and generalizing to unseen compositional sparse-reward tasks with complex subtask dependencies without human supervision.

%
% SGI can solve this challenge but has a limitation
Recent works~\citep{sohn2019meta, NTP, huang2018neural, liu2016GTSvisual, ghazanfari2017autonomous} tackled the compositional tasks by explicitly inferring the underlying task structure in a graph form.
Specifically, the subtask graph inference (SGI) framework~\citep{sohn2019meta} uses inductive logic programming (ILP) on the agent's own experience to infer the task structure in terms of a \textit{subtask graph} and learns a contextual policy to \textit{execute} the inferred task in a few-shot RL setting.
However, it only meta-learned the adaptation policy that relates to the efficient exploration, while the task inference and execution policy learning were limited to a single task (\ie, both task inference and policy learning were done from scratch for each task), limiting its capability of handling large variance in the task structure.
We claim that the inefficient task inference may hinder applying the SGI framework to a more complex domain such as web navigation~\citep{shi2017world, liu2018reinforcement} where a task may have a large number of subtasks and complex dependencies between them.
We note that humans can navigate an unseen website by transferring the high-level process learned from previously seen websites.

%
Inspired by this, we extend the SGI framework to a \textit{multi-task subtask graph inferencer} (MTSGI) that can generalize the previously learned task structure to the unseen task for faster adaptation and stronger generalization. \Cref{fig:method_overview} outlines our method.
%overview of our method and main contributions
MTSGI estimates the prior model of the subtask graphs from the training tasks. When an unseen task is presented, MTSGI samples the prior that best matches the current task, and incorporates the sampled prior model to improve the latent subtask graph inference, which in turn improves the performance of the evaluation policy.
%
We demonstrate results in the 2D grid-world domain and the web navigation domain that simulates the interaction with 15 actual websites.
We compare our method with MSGI~\citep{sohn2019meta} that learns the task hierarchy from scratch for each task, and two other baselines including hierarchical RL and a heuristic algorithm. 
We find that MTSGI significantly outperforms all other baselines, and the learned prior model enables more efficient task inference compared to MSGI.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure*}[!htp]
    \centering
    \includegraphics[draft=false,width=0.98\linewidth, valign=b]{./figure/method_overview_v3.pdf}%
    \vspace{-6pt}
    \caption{%
        An overview of our algorithm, with an example of the agent's {\color{blue} trajectory} and the inferred {\color[rgb]{0.5,0.75,0.4}subtask graph}. In meta-training \figleft, the adaptation policy $\piadapt$ interacts with the environment and collects the trajectory $\tau$. The inductive logic programming (ILP) module takes as input the trajectory, and infers the task structure in terms of the subtask graph $G^\tau$. The trajectory and the subtask graph are stored as a prior. In meta-testing \figright, the adaptation policy incorporates the prior trajectory $\tau^\text{p}$ to efficiently explore the environment, and the ILP module infers the subtask graph $G^\tau$ from the adaptation trajectory $\tau$. Finally, the evaluation policy $\pieval$ takes as input the prior and inferred subtask graphs $(G^\text{p}, G^\tau)$ to solve the test task.
        %(Left) In meta-train, the adaptation trajectory $\tau$ is collected for each training task. The subtask graphs are inferred from the trajectories and stored as prior. (Right) In meta-testing, 
    }
    \label{fig:method_overview}
\end{figure*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\cutsectionup
\section{Preliminaries}\label{sec:pre}
\cutsectiondown

\paragraph{Few-shot Reinforcement Learning}
\label{sec:metarl}
%\cutsubsectiondown
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A \emph{task} is defined by an MDP $\mc{M}_{G}=(\mc{S, A}, \mc{P}_{G}, \mc{R}_{G})$ parameterized by a task parameter $G$ %(\ie~\emph{subtask graph} \cite{sohn2018hierarchical})
with a set of states $\mc{S}$, a set of actions $\mc{A}$, transition dynamics $\mc{P}_{G}$, and reward function $\mc{R}_{G}$.
% K-shot RL: adapt+eval
The goal of $K$-shot RL~\citep{duan2016rl,finn2017model}
is to efficiently solve a distribution of unseen test tasks $\testtask{}$ by learning and transferring the common knowledge from the training tasks $\traintask{}$.
It is assumed that the training and test tasks do not overlap (\ie, $\traintask\cap \testtask = \emptyset$) but share a certain commonality such that the knowledge learned from the training tasks may be helpful for learning the test tasks.
%For a given task $\mc{M}_G$, the agent is allowed to interact with the environment for exactly $K$ environment steps (\ie, adaptation budget), and after $K$ steps, the agent's performance is evaluated.
For each task $\mc{M}_G$, the agent is given a budget of $K$ steps for interacting with the environment.
During meta-training, the goal of the multi-task RL agent is to learn a prior (\ie, slow-learning) over the training tasks $\traintask{}$.
Then, the learned prior may be exploited during the meta-test to enable faster adaptation on unseen test tasks $\testtask$.
For each task, the agent faces two phases:
an \emph{adaptation phase} where the agent learns a task-specific behavior (\ie, fast-learning) for $K$ environment steps, which often spans over multiple episodes,
and an \emph{evaluation phase} where the adapted behavior is evaluated.
%
In the evaluation phase, the agent is not allowed to perform any form of learning, and the agent's performance on the task $\mc{M}_G$ is measured in terms of the return:
\begin{align}
    \mc{R}_{\mc{M}_{G}}(\pi_{\phi_K}) = \mathbb{E}_{\pi_{\phi_K}, \mc{M}_{G} }\left[\textstyle\sum^{H}_{t=1}{r_t}\right],\label{eq:loss}
\end{align}
where $\pi_{\phi_{K}}$ is the policy after $K$ update steps of adaptation,
$H$ is the horizon of the evaluation phase,
and $r_t$ is the reward at time $t$ in the evaluation phase.

\cutsectionup
\section{Subtask Graph Inference Problem}\label{sec:problem}
\cutsectiondown
%\cutsubsectiondown \vspace*{-1pt}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The \textit{subtask graph inference} problem~\citep{sohn2019meta} is a few-shot RL problem where a task is parameterized by a set of subtasks and their dependencies.
% Description of a single task. (don't go beyond a single task)
% Here, ignore the fact that its ``graph''. 
%A task consists of $N$ subtasks $\{\subtask^1, \ldots, \subtask^N\}$. 
%The agent's goal is to maximize the total reward cumulated over an episode, and the agent can get reward by \textit{completing} the subtasks.
Formally, a task consists of $N$ subtasks $\mathbf{\Phi}=\{\subtask^1, \ldots, \subtask^N\}$, and each subtask $\subtask^i$ is parameterized by a tuple ($\CSet{}^i, \cond^i, \subrew^i$).
% Goal-state & precond
The \textit{goal state} $\CSet{}^i\subset \mc{S}$ and \textit{precondition} $\cond^i: \mc{S} \rightarrow \{0, 1\}$ together define the condition under which a subtask is \textit{completed}: the current state should be contained in its goal states (\ie, $\mb{s}_t\in\smash{\CSet^i}$) and the precondition should be satisfied (\ie, $\cond^i(\mb{s}_t)=1$).
If the precondition is not satisfied (\ie, $\cond^i(\mb{s}_t)=0$), the subtask cannot be completed and the agent receives no reward even if the goal state is achieved.
% Reward
The \textit{subtask reward function} $\subrew^i$ defines the amount of reward given to the agent when it \textit{completes} the subtask $i$: $r_t\sim\subrew^i$.
% Alternatively, this can be also formulated as a factored MDP~\citep{boutilier1995exploiting, schuurmans2002direct}. Please refer to~\Cref{app:FMDP} for more detail.
% Goal
We note that the subtasks $\{\subtask^1, \ldots, \subtask^N\}$ are unknown to the agent. Thus, the agent should learn to infer the underlying task structure and complete the subtasks in an optimal order while satisfying the required preconditions.

\paragraph{State}
In the subtask graph inference problem, it is assumed that the state input provides the high-level status of the subtasks. 
%This enables the agent to infer the latent subtask structure only from the interaction with the environment.
Specifically, the state consists of the following: $\mb{s}_t=(\text{obs}_t, \mb{x}_t, \mb{e}_t, \text{step}_{\text{epi}, t}, \text{step}_{\text{phase}, t})$.
The $\text{obs}_t\in\{0, 1\}^{W\times H\times C}$ is a visual observation of the environment.
The completion vector $\smash{\mb{x}_t\in\{0, 1\}^N }$ indicates whether each subtask is complete. The eligibility vector $\smash{\mb{e}_t\in\{0, 1\}^N }$ indicates whether each subtask is eligible (\ie, precondition is satisfied). Following the few-shot RL setting, the agent observes two scalar-valued time features: the remaining time steps until the episode termination $\text{step}_{\text{epi}, t}\in\mathbb{R}$ and the remaining time steps until the phase termination $\text{step}_{\text{phase}, t}\in\mathbb{R}$.

% Options
\paragraph{Options} 
For each subtask $\Phi^i$, the agent can learn an option $\mc{O}^i$~\citep{sutton1999between} that reaches the goal state of the subtask.
Following~\citet{sohn2019meta}, such options are pre-learned individually by maximizing the goal-reaching reward: $r_t=\mbb{I}(\mb{s}_t\in\smash{\CSet^i})$. At time step $t$, we denote the option taken by the agent as $\mb{o}_t$ and the binary variable that indicates whether the episode is terminated as $d_t$.

\cutsectionup
\section{Method}\label{sec:m}
\cutsectiondown

We propose a novel Multi-Task Subtask Graph Inference (MTSGI) framework that can perform an efficient inference of latent task embedding (\ie, subtask graph). The overall method is outlined in~\Cref{fig:method_overview}.
Specifically, in meta-training, MTSGI models the prior in terms of (1) adaptation trajectory $\tau$ and (2) subtask graph $G$ from the agent's experience. In meta-testing, MTSGI samples (1) the prior trajectory $\tau^\text{p}$ for more efficient exploration in adaptation and (2) the prior subtask graph $G^\text{p}$ for more accurate task inference.%We will first describe how the multi-task adaptation policy enables efficient exploration, and then the multi-task subtask graph inference.
\iffalse
Specifically, in meta-training, MTSGI learns the prior from the subtask graphs of the training tasks $\traintask$ where each task is inferred from the agent's experience. In meta-testing, MTSGI samples the prior and incorporates the agent's adaptation trajectory to update the prediction of the latent subtask graph of the evaluation tasks $\testtask{}$.
\fi

\subsection{Multi-task Adaptation Policy}\label{sec:adapt-policy}
The goal of the \textit{adaptation policy} is to efficiently explore and gather information about the task.
%In both meta-train and meta-test, the agent is given a limited time to learn about the task during the adaptation phase.
%Thus, we adopt the \textit{adaptation policy} to maximally explore and gather the information about the task. 
Intuitively, if the adaptation policy completes more diverse subtasks, then it can provide more data to the task inference module (ILP), which in turn can more accurately infer the task structure.
To this end, we extend the upper confidence bound (UCB)-based adaptation policy proposed in~\citet{sohn2019meta} as follows:
\begin{align}
\piadapt(o=\mc{O}^i \mid s) \propto \exp \left(r^i+\sqrt{2}\frac{\log \left(\sum_{j} n^j\right)}{n^i}\right),\label{eq:UCB-policy}
\end{align}
where $r^i$ is the empirical mean of the reward received after executing subtask $i$ and $n^i$ is the number of times subtask $i$ has been executed within the current task. Note that the exploration parameters $\{r^i, n^i\}_{i=1}^N$ can be computed from the agent's trajectory.
In meta-train, the exploration parameters are initialized to zero when a new task is sampled. In meta-test, the exploration parameters are initialized with those of the sampled prior. Intuitively, this helps the agent visit novel states that were unseen during meta-training.
%The adaptation knowledge can be transferred between training and evaluation tasks via transferring the parameters $\{r^i, n^i\}_{i=1}^N$.
%Specifically, we store $\{r^i, n^i\}_{i=1}^N$ of each training task as a prior, and during meta-testing, we initialize $\{r^i, n^i\}_{i=1}^N$ with those of the sampled prior to incorporate the information gathered in the sampled prior task. Intuitively, the agent is encouraged to execute the subtask that was either not present in the prior task (\ie, $n^i=0$) or has been executed less often (\ie, $n^i\ll\sum_j n^j$).

% such that the agent has relatively less information about the subtask.
\iffalse
	During meta-training, we reset the parameters $\{r^i, n^i\}_{i=1}^N$ to zero at the beginning of each task, and after the adaptation is over (\ie, $t=K$) the final parameters $\{r^i, n^i\}_{i=1}^N$ were stored together with the inferred subtask graph as prior data. During meta-testing, we initialized $\{r^i, n^i\}_{i=1}^N$ with those of the sampled prior task to incorporate the information gathered in the sampled prior task. Intuitively, the agent is encouraged to execute the subtask that was either not present in the prior task (\ie, $n^i=0$) or have been executed less often (\ie, $n^i\ll\sum_j n^j$) than others such that the agent has relatively less information about the subtask.
\fi

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{algorithm}[t]
\caption{Meta-training: learning the prior}\label{alg:meta-train}
\begin{algorithmic}[1]
\REQUIRE{ Adaptation policy $\piadapt$}
%\Require{ $\piadapt$: adaptation policy}
\ENSURE{ Prior set $\mc{T}^{\text{p}}$}
\STATE $\mc{T}^{\text{p}} \gets \emptyset$
\FOR{each task $\mc{M}\in\traintask$}
    \STATE Rollout adaptation policy: \\
    $\tau=\{\mb{s}_t, \mb{o}_t,r_t, d_t\}_{t=1}^{K} \sim \piadapt$ in task $\mc{M}$
    \STATE Infer subtask graph $G^\tau = \argmax_{G} p(\tau|G)$ %\COMMENT{ILP}
    \STATE $\pieval= \text{GRProp}(G^\tau)$
    \STATE Evaluate the agent: $\tau^\text{eval}\sim \pieval$ in task $\mc{M}$
    \STATE Update prior $\mc{T}^{\text{p}}\gets \mc{T}^{\text{p}} \cup \left\{\left( G^\tau,  \tau\right)\right\}$
\ENDFOR
\end{algorithmic}
\end{algorithm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\subsection{Meta-train: Learning the Prior Subtask Graph}
Let $\tau$ be an adaptation trajectory of the agent for $K$ steps.
The goal is to infer the latent subtask graph $G$ for the given training task $\mc{M}_{G}\in\traintask$,
specified by preconditions $\GC{}$ and subtask rewards $\GR{}$.
We find the maximum-likelihood estimate (MLE) of $G = (\GC{}, \GR{})$
that maximizes the likelihood of the adaptation trajectory $\tau$: % \ie,
\begin{align}
    \widehat{G}^\text{MLE} = \argmax_{ \GC{}, \GR{} } p(\tau| \GC{}, \GR{} ).  \label{eq:objective}
\end{align}
Following~\citet{sohn2019meta}, we infer the precondition $\GC{}$ and the subtask reward $\GR{}$ as follows (See Appendix for the detailed derivation):
\begin{align}
\widehat{G}_\mb{c}^\text{MLE}&=\argmax_{\GC{}} \prod_{t=1}^{K}{ p(\mb{e}_{t}|\mb{x}_{t}, \GC{}) },\label{eq:infer-precond}\\
\widehat{G}_\mb{r}^\text{MLE}&=\argmax_{\GR{}} \prod_{t=1}^{K}{ p(r_t|\mb{e}_t,\mb{o}_t, \GR{})},\label{eq:infer-reward}
\end{align}
where $\mb{e}_t$ is the eligibility vector, $\mb{x}_t$ is the completion vector, $\mb{o}_t$ is the option taken, and $r_t$ is the reward at time step $t$.

\paragraph{Precondition inference} 
The problem in~\Cref{eq:infer-precond} is known as the inductive logic programming (ILP) problem that finds a boolean function that satisfies all the indicator functions.
Specifically, $\{\mb{x}_{t}\}^{K}_{t=1}$ forms binary vector inputs to programs,
and $\{e^{i}_{t}\}^{K}_{t=1}$ forms Boolean-valued outputs of the $i$-th program that predicts the eligibility of the $i$-th subtask.
%
We use the \textit{classification and regression tree} (CART) to infer the precondition function $f_{\GC{}}: \mb{x}\rightarrow \mb{e}$ for each subtask based on Gini impurity~\citep{breiman1984classification}. 
Intuitively, the constructed decision tree is the simplest boolean function approximation for the given input-output pairs $\{\mb{x}_t, \mb{e}_t\}$.
The decision tree is converted to a logic expression (\ie, precondition) in sum-of-product (SOP) form to build the subtask graph.

\paragraph{Subtask reward inference} 
To infer the subtask reward $\widehat{G}_\mb{r}^{\text{MLE}}$ in~\Cref{eq:infer-reward},
we model the reward for the $i$-th subtask as a Gaussian distribution: $G_\mb{r}^{i} \sim \mathcal{N}(\widehat{\mu}^{i}, \widehat{\sigma}^{i} )$.
Then, the MLE of the subtask reward is given as the empirical mean and variance of the rewards received after taking the eligible option $\mc{O}^i$ in the adaptation phase:
\begin{align}
\widehat{\mu}^{i}_{\text{MLE}}
 &   = \mathbb E \left[ r_t|\mb{o}_t=\mc{O}^i, \mb{e}_t^i=1 \right],\label{eq:infer-reward-mu}\\
\widehat{\sigma^2}^{i}_{\text{MLE}}
 &   = \mathbb E \left[ (r_t - \widehat{\mu}^{i}_{\text{MLE}})^2 |\mb{o}_t=\mc{O}^i, \mb{e}_t^i=1 \right],\label{eq:infer-reward-sigma}
\end{align}
where  $\mc{O}^i$ is the option corresponding to the $i$-th subtask.
\Cref{alg:meta-train} outlines the meta-training process.
%Specifically, we record all the reward received after executing an option, and compute the sample mean and variance to get~\Cref{eq:infer-reward-mu} and~\ref{eq:infer-reward-sigma} in adaptation phase.
%The mean and variance is computed over the observed rewards during the adaptation phase, and 
%we accumulate the reward over the option execution and store them. In test phase, we compute the empirical mean and variance from the recorded rewards and use them as the Gaussian parameters of subtask reward distribution.
%\cutparagraphdown
%For all the training tasks, we store the inferred subtask graph $\widehat{G}^\text{MLE}$ (\ie, the precondition and the subtask reward) after the adaptation. The stored subtask graphs are used as a prior model (See~\Cref{sec:meta-eval-prior-sample}) in meta-testing.
%\Cref{alg:meta-train} summarizes the overall meta-training process.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{algorithm}[t]
\caption{Meta-testing: multi-task SGI}\label{alg:meta-eval}
\begin{algorithmic}[1]
\REQUIRE{  Adaptation policy $\piadapt$, prior set $\mc{T}^{\text{p}}$}
%\ENSURE{ Test policy (GRProp) $\pieval$ }
\FOR{each task $\mc{M}\in\testtask$}
    \STATE Sample prior: $(G^{\text{p}}, \taup) \sim p(\mc{T}^{\text{p}})$
    %
%\STATE $\widehat{G}^{\text{p}} = \text{Map}_{\mathbf{\Phi}^{\text{p}}\rightarrow\mathbf{\Phi}} (G^{\text{p}})$  \COMMENT{subtask mapping}
    %\STATE Initialize the adaptation policy from $\taup$
    \STATE Rollout adaptation policy:\\
    $\tau=\{\mb{s}_t, \mb{o}_t,r_t, d_t\}_{t=1}^{K} \sim \piadapt$ in task $\mc{M}$
    \STATE Infer subtask graph $G^{\tau} = \argmax_{G} p(\tau|G)$ 
%    \STATE $\pieval\propto \pi(o|s, G^{\tau})^\alpha\pi(o|s, G^\text{p})^{(1-\alpha)}$
    \STATE $\pieval(\cdot| \tau, \taup)
\propto \text{GRProp}(\cdot| G^{\tau})^\alpha\text{GRProp}(\cdot|G^\text{p})^{(1-\alpha)}$
    \STATE Evaluate the agent: $\tau^\text{eval}\sim \pieval$ in task $\mc{M}$
\ENDFOR
    %\STATE Update $\theta \gets \theta - \eta\nabla_{\theta}\sum_{i=1}^{M}{ \mc{L}^{\text{UCB}}_{\mc{M}_{G_i}}\left( \piadapt_{\theta} \right)}$ using $\mc{L}^{\text{UCB}}_{\mc{M}}$ in Eq.(\ref{eq:UCB-loss})
%\Else
    %\STATE Update $\theta \gets \theta - \eta\nabla_{\theta}\sum_{i=1}^{M}{ \mc{L}_{\mc{M}_{G_i}}\left( \pi^{\text{GRProp}}_{\widehat{G}_i} \right)}$ using $\mc{L}_{\mc{M}}$ in Eq.(\ref{eq:loss})
    %\STATE Update $\theta \gets \theta + \eta\nabla_{\theta}\sum_{i=1}^{M}{ \mc{R}_{\mc{M}_{G_i}}\left( \pi^{\text{GRProp}}_{\widehat{G}_i} \right)}$ using $\mc{R}_{\mc{M}}$ in Eq.(\ref{eq:loss})
%\EndIf
\end{algorithmic}
\end{algorithm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\subsection{Evaluation: Graph-reward Propagation Policy}\label{sec:test-policy}
In both meta-training and meta-testing, the agent's adapted behavior is evaluated during the test phase. Following~\citet{sohn2019meta}, we adopted the graph reward propagation (GRProp) policy as an evaluation policy $\pieval$ that takes as input the inferred subtask graph $\widehat{G}$ and outputs the subtasks to execute to maximize the return.
Intuitively, the GRProp policy approximates a subtask graph to a differentiable form
such that we can compute the gradient of return with respect to the completion vector to measure how much each subtask is likely to increase the return.
%
Due to space limitations, we provide the details of the GRProp policy in the Appendix.
%
The overall meta-training process is summarized in Appendix.

\subsection{Meta-testing: Multi-task Task Inference}

\paragraph{Prior sampling} 
In meta-testing, MTSGI first chooses the prior task that most resembles the given evaluation task. Specifically, we define the pair-wise similarity between a prior task $\mathcal{M}_G^\text{prior}$ and the evaluation task $\mathcal{M}_G$ as follows:
\begin{align}
\operatorname{sim}\left(\mathcal{M}_G, \mathcal{M}_G^\text{prior}\right)=F_{\beta}\left(\mathbf{\Phi}, \mathbf{\Phi}^{\text{prior}}\right)+\kappa R\left(\tau^{\text{prior}}\right) ,
\end{align}
where $F_{\beta}$ is the F-score with weight parameter $\beta$, $\mathbf{\Phi}$ is the subtask set of $\mc{M}_{G}$, $\mathbf{\Phi}^{\text{prior}}$ is the subtask set of $\mathcal{M}_G^\text{prior}$, $R\left(\tau^{\text{prior}}\right)$ is the agent's empirical performance on the prior task $\mathcal{M}_G^\text{prior}$, and $\kappa$ is a scalar-valued weight, which we set to $\kappa=1.0$ in the experiments.
$F_{\beta}$ measures how many subtasks overlap between current and prior tasks in terms of precision and recall as follows:
\begin{align}
F_{\beta}&=\left(1+\beta^{2}\right) \cdot \frac{\text{Precision} \cdot \text{Recall}}{\left(\beta^{2} \cdot \text{Precision}\right)+\text{Recall}},\\
\text{Precision} &= \lvert \mathbf{\Phi} \cap \mathbf{\Phi}^{\text{prior}}\rvert / \lvert  \mathbf{\Phi}^{\text{prior}} \rvert,\\
\text{Recall} &=  \lvert \mathbf{\Phi} \cap \mathbf{\Phi}^{\text{prior}}\rvert / \lvert \mathbf{\Phi}\rvert.
\end{align}
We used $\beta=10$ to assign a higher weight to the current task (\ie, recall) than the prior task (\ie, precision). %The pseudo-code of the prior sampling process is summarized in~\Cref{alg:meta-eval}.

\paragraph{Multi-task subtask graph inference} 
%\subsection{meta-testing: multi-task subtask graph inference}\label{sec:meta-eval-sgi}
%Let $\tau$ be the adaptation trajectory of the current task $\mc{M}_G$, and $\mc{T}^{\text{p}}=\{\tau^{\text{p}}_1, \ldots, \tau^{\text{p}}_{|\mc{T}^{\text{p}}|}\}$ be the adaptation trajectories of the seen training tasks.
Let $\tau$ be the adaptation trajectory, and $\taup$ be the sampled prior adaptation trajectory. Then, we model our evaluation policy as follows:
\begin{align}
\pi(o|s,\tau, \taup)
%\propto \sum_G \pi(o|s,G)p(\tau|G)p(\mc{T}^{\text{p}}|G)\\
\simeq \pi(o|s, G^{\tau})^\alpha\pi(o|s, G^\text{p})^{(1-\alpha)}.\label{eq:policy-mix}
\end{align}
Due to the limited space, we include the detailed derivation of~\Cref{eq:policy-mix} in Appendix. Finally, we deploy the GRProp policy as a contextual policy:
\begin{align}
\pieval(\cdot| \tau, \taup)
= \text{GRProp}(\cdot| G^{\tau})^\alpha\text{GRProp}(\cdot|G^\text{p})^{(1-\alpha)}.\label{eq:MTSGI-policy-mix}
\end{align}
Note that~\Cref{eq:MTSGI-policy-mix} is the weighted sum of the logits of two GRProp policies induced by prior $\taup$ and current experience $\tau$.
We claim that such a form of ensemble induces positive transfer in compositional tasks.
Intuitively, ensembling GRProp is taking a union of preconditions since GRProp assigns a positive logit to task-relevant subtasks and a non-positive logit to other subtasks.
As motivated in the Introduction, related tasks often share the task-relevant preconditions; thus, taking the union of task-relevant preconditions is likely to be a positive transfer and improve the generalization.
% Maybe remove below?
%For efficient computation, we first compute and store the MLE of all the training tasks $G^\text{p}$ during meta-training. In meta-testing, we sample $G^\text{p}$ together with the prior, and merge with the current policy as in~\Cref{eq:MTSGI-policy-mix}. 
The pseudo-code of the multi-task subtask graph inference process is summarized in~\Cref{alg:meta-eval}.

\cutsectionup
\section{Related Work}\label{sec:r}
\cutsectiondown
\paragraph{Web navigating RL agent}
Previous work introduced MiniWoB \citep{shi2017world} and MiniWoB++ \citep{liu2018reinforcement} benchmarks that are manually curated sets of simulated toy environments for the web navigation problem.
They formulated the problem as acting on a page represented as a Document Object Model (DOM), a hierarchy of objects in the page.
The agent is trained with human demonstrations and online episodes in an RL loop.
\citet{jia2018domqnet} proposed a graph neural network based DOM encoder and a multi-task formulation of the problem similar to this work.
\citet{gur2018learning} introduced a manually-designed curriculum learning method and an LSTM based DOM encoder.
DOM level representations of web pages pose a significant sim-to-real gap as simulated websites are considerably smaller (100s of nodes) compared to noisy real websites (1000s of nodes).
As a result, these models are trained and evaluated on the same simulated environments and are difficult to deploy on real websites.
Our work formulates the problem as abstract web navigation on real websites where the objective is to learn a latent subtask dependency graph similar to a sitemap of websites.
We propose a multi-task training objective that generalizes from a fixed set of real websites to unseen websites without any demonstration, illustrating an agent capable of navigating real websites for the first time.
\paragraph{Meta-reinforcement learning}
To tackle the few-shot RL problem,
researchers have proposed two broad categories of meta-RL approaches: RNN- and gradient-based methods.
% RNN-based
The RNN-based meta-RL methods~\citep{duan2016rl, wang2016learning, hochreiter2001learning} encode the common knowledge of the task into the hidden states and the parameters of the RNN. 
% Gradient-based
The gradient-based meta-RL methods~\citep{finn2017model,nichol2018first,Gupta:1802.07245,Finn2018PMAML,Kim:2018:BMAML}
encode the task embedding in terms of the initial policy parameter for fast adaptation through meta gradient.
Existing meta-RL approaches, however, often require a large amount of environment interaction due to the long-horizon nature of the few-shot RL tasks.
% Our work
Our work instead explicitly infers the underlying task parameter in terms of subtask graph, which can be efficiently inferred using the inductive logic programming (ILP) method and be transferred across different, unseen tasks.
%
\iffalse
    The most similar work to ours is the PEARL~\cite{rakelly2019efficient} which also predicts the posterior over latent task embedding from prior experience. However, PEARL's task embedding is a latent vector without any inductive bias, while ours is a subtask graph, which has been shown to be highly effective for the compositional tasks. Also, we use the inductive logic programming method for task inference while PEARL uses amortized variational inference.
\fi
%
\paragraph{More Related Works}
Please refer to the Appendix for further discussions about other related works.

%----------------------------------------------------------------------------------------------------
\begin{figure*}[!htp]
    %\vspace*{-3pt}
    \centering
    \includegraphics[draft=false,width=0.19\linewidth, valign=b]{./figure/apple_performance.pdf}
    \includegraphics[draft=false,width=0.19\linewidth, valign=b]{./figure/dicks_performance.pdf}
    \includegraphics[draft=false,width=0.185\linewidth, valign=b]{./figure/expedia_performance.pdf}
    \includegraphics[draft=false,width=0.19\linewidth, valign=b]{./figure/ikea_performance.pdf}
    \includegraphics[draft=false,width=0.19\linewidth, valign=b]{./figure/walgreen_performance.pdf}\\
    \includegraphics[draft=false,width=0.98\linewidth, valign=b]{./figure/performance_xaxis_legend.pdf}
    \vspace{-8pt}
    \caption{%
        The success rate (y-axis) of the compared methods in the test phase in terms of the environment step during the adaptation phase (x-axis) on \wob{} domain. See Appendix for the results on other tasks.
    }
    \label{fig:fewshot_wob}
    \vspace{-7pt}
\end{figure*}

\section{Experiment}
\subsection{Domains}
%We evaluate our method on 2D grid-world domain and realistic symbolic web navigation domain.
%
\paragraph{Mining}
\mining~\citep{sohn2018hierarchical} is a 2D grid-world domain inspired by the Minecraft game
where the agent receives a reward by picking up raw materials in the world or crafting items with raw materials.
The subtask dependency in \mining{} domain comes from the crafting recipe implemented in the game. 
Following~\citet{sohn2018hierarchical}, we used the pre-generated training/testing task splits generated with four different random seeds. Each split set consists of 3200 training tasks and 440 testing tasks for meta-training and meta-testing, respectively. We report the performance averaged over the four task split sets.

%
\paragraph{SymWoB}
We implement a symbolic version of the checkout process on 15 real-world websites such as \amazon{}, \bestbuy{}, and \walmart{}. %in~\Cref{tab:symwob}.

% Subtasks
\textbf{Subtask and option policy.}
Each actionable web element (\eg, text field, button, drop-down list, and hyperlink) is considered as a subtask. We assume the agent has pre-learned the option policies that correctly interact with each element (\eg, click the button or fill out the text field). Thus, the agent should learn a policy over the options.

% Comp & elig
\textbf{Completion and eligibility.}
For each subtask, the completion and eligibility are determined based on the status of the corresponding web element. 
For example, the subtask of a text field is \textit{completed} if the text field is filled with the correct information, and the subtask of a \texttt{confirm\_credit\_info} button is \textit{eligible} if all the required subtasks (\ie, filling out credit card information) on the webpage are completed. Executing an option will complete the corresponding subtask \textit{only if} the subtask is eligible.

% Reward & done
\textbf{Reward function and episode termination.}
The agent may receive a non-zero reward only at the end of the episode (\ie, sparse-reward task). When the episode terminates due to the time budget, the agent may not receive any reward. Otherwise, the following  two types of subtasks terminate the episode and give a non-zero reward upon completion:
\cutitemizeup
\begin{itemize}
\setlength{\itemsep}{1pt}\setlength{\parskip}{1pt}
\item{\tb{Goal subtask}} refers to the button that completes the order (See the green boxes in~\Cref{fig:task_overview}). Completing this subtask gives the agent a +5 reward, and the episode is terminated. %In our checkout tasks, the \order{} subtask is the goal subtask.
\item{\tb{Distractor subtask}} does not contribute to solving the given task but terminates the episode with a -1 reward. It models the web elements that lead to external web pages such as \texttt{Contact\_Us} button in~\Cref{fig:task_overview}. 
\end{itemize}
\cutitemizedown

\textbf{Transition dynamics.}
% Transition
The transition dynamics follow the dynamics of the actual website. 
Each website consists of multiple web pages.
The agent may only execute the subtasks that are currently visible (\ie, on the current web page) and can navigate to the next web page only after filling out all the required fields and clicking the continue button.
The goal subtask is present in the last web page; thus, the agent must learn to navigate through the web pages to solve the task.

% More detail
For more details about each task, please refer to Appendix.
%: E.g., “First name” field in amazon and walmart are represented as the same symbol

\iffalse
\subsection{Domain Randomization}
Perturbation of graph randomly changes the number of ...
Preconditions: subtasks to be completed in order to proceed (e.g. Fill City)
Distractors: subtasks that are not preconditions and has no effect (e.g. Click Subscribe)
Failures: terminal subtasks that denote the failure of the current task (e.g. Click Feedback)
\fi

\subsection{Agents}
We compared the following algorithms in the experiment.
\cutitemizeup
\begin{itemize}
\setlength{\itemsep}{1pt}\setlength{\parskip}{1pt}
\item MTSGI (Ours): our multi-task SGI agent
\item MSGI~\citep{sohn2019meta}: SGI agent without multi-task learning
\item HRL: an Option~\citep{sutton1999between}-based proximal policy optimization (PPO)~\citep{schulman2017proximal} agent with the gated recurrent unit (GRU)
\item Random: a heuristic policy that executes an eligible subtask uniformly at random
\end{itemize}
\cutitemizedown
More details on the architectures and the hyperparameters can be found in Appendix.
%

\paragraph{Meta-training}
% Meta training
In \wob{}, for each task chosen for meta-testing, we randomly sampled $N_\text{train}$ tasks among the remaining 14 tasks and used them for meta-training.
We used $N_\text{train}=1$ in the experiment (See~\Cref{fig:ablation_nprior} for the impact of the choice of $N_\text{train}$).
For example, we meta-trained our MTSGI on \amazon{} and meta-tested on \expedia{}. 
For \mining{}, we used the train/test task split provided in the environment.
% HRL training.
The RL agents (\eg, HRL) were individually trained on each test task; the policy was initialized when a new task was sampled and trained during the adaptation phase.
% Detail
All the experiments were repeated with four random seeds, where different training tasks were sampled for different seeds.

\subsection{Result: Few-shot Generalization Performance}

\begin{figure}[!t]
    %\vspace*{-3pt}
    \centering
    \includegraphics[draft=false,width=0.49\linewidth, valign=b]{./figure/avg_performance_symwob.pdf}%
    \includegraphics[draft=false,width=0.49\linewidth, valign=b]{./figure/performance_mining.pdf}%
    \vspace{-8pt}
    \caption{%
        The performance of the compared methods in terms of the adaptation steps averaged over all the tasks in \wob{} \figleft and \mining{} \figright domains. 
    }
    \label{fig:fewshot_mining}
    \vspace{-7pt}
\end{figure}
%--------------------------------------------------------------------------
\begin{figure}[!t]
    %\vspace*{-3pt}
    \centering
    \includegraphics[draft=false,width=0.49\linewidth, valign=b]{./figure/symwob_prec_rec.pdf}
    \includegraphics[draft=false,width=0.49\linewidth, valign=b]{./figure/mining_prec_rec.pdf}
    \vspace{-8pt}
    \caption{%
        The precision and recall of the subtask graphs inferred by MTSGI and MSGI on \wob and \mining.
    }
    \label{fig:precision_recall}
    \vspace{-7pt}
\end{figure}
%---------------------------------------------------------------------------
%---------------------------------------------------------------------------
%--------------------------------------------------------------------------
\begin{figure*}[!t]
    %\vspace*{-3pt}
    \centering
    %\includegraphics[draft=false,width=0.9\linewidth, valign=b]{./figure/GT_graph_walmart.pdf}\\
    \includegraphics[draft=false,width=0.9\linewidth, valign=b]{./figure/Inferred_graph_walmart.pdf}%
    \vspace{-8pt}
    \caption{%
        %(Top) The ground-truth subtask graph of \walmart{} domain (not available to the agent) and (Bottom) The subtask graph inferred by our MTSGI after 1,000 steps of environment interaction on \walmart{} domain.
        The visualization of the subtask graph inferred by our MTSGI after 1,000 steps of environment interaction on \walmart{} domain. Compared to the ground-truth subtask graph (not available to the agent), there was no error in the nodes and only two missing edges ({\color{red} in red}). See Appendix for the progression of the inferred subtask graph with varying adaptation steps.
    }
    \label{fig:inferred_graph_wob}
    \vspace{-7pt}
\end{figure*}
%---------------------------------------------------------------------------
%------------------------------------------------------------------------------
\begin{figure}[!t]
    %\vspace*{-3pt}
    \centering
    \includegraphics[draft=false,width=0.49\linewidth, valign=b]{./figure/ablation_exploration_symwob.pdf}%
    \includegraphics[draft=false,width=0.49\linewidth, valign=b]{./figure/ablation_exploration_mining.pdf}%
    \vspace{-8pt}
    \caption{%
        Comparison of different exploration strategies for MTSGI used in adaptation phase for \wob and \mining. %The performance was averaged over the 15 websites for \wob. We report the mean (solid curve) and standard error (shadowed area) of the performance over four random seeds.
    }
    \label{fig:ablation_exploration}
    \vspace{-7pt}
\end{figure}
%----------------------------------------------------------------------------

%To compare how fast each agent can adapt to the given unseen task, we evaluated the few-shot generalization performance of our MTSGI with baselines on \mining{} and \wob{}.
\Cref{fig:fewshot_wob} and~\Cref{fig:fewshot_mining} show the few-shot generalization performance of the compared methods on \wob{} and \mining{}. 
In~\Cref{fig:fewshot_wob}, MTSGI achieves more than 75\% zero-shot success rate (\ie, success rate at x-axis=0) on all five tasks, which is significantly higher than the zero-shot performance of MSGI. 
This indicates that the prior learned from the training task significantly improves the subtask graph inference and in turn improves the multi-task evaluation policy.
Moreover, our MTSGI can learn a near-optimal policy on all the tasks after only 1,000 steps of environment interactions, demonstrating that the proposed multi-task learning scheme enables fast adaptation. 
Even though the MSGI agent is learning each task from scratch, it still outperforms the HRL and Random agents. This shows that explicitly inferring the underlying task structure and executing the predicted subtask graph
is significantly more effective than learning the policy from the reward signal (\ie, HRL) on complex compositional tasks.
Given the pre-learned options, the HRL agent can slightly improve the success rate during the adaptation via PPO update. However, training the policy only from the sparse reward requires a large number of interactions especially for the tasks with many distractors (\eg, \expedia{} and \walgreens{}).
%
\subsection{Analysis on the Inferred Subtask Graph}
% Intro + note
We compare the inferred subtask graph with the ground-truth subtask graph.
%\footnote{We note that the ground-truth subtask graph is not available to the agent in both meta-train and meta-test}.
% Qualitative
%\cutparagraphup
%\paragraph{Qualitative result.} 
\Cref{fig:inferred_graph_wob} shows the subtask graph inferred by MTSGI in \walmart{}. We can see that MTSGI can accurately infer the subtask graph; the inferred subtask graph is missing only two preconditions (shown in red) of the \texttt{Click\_Continue\_Payment} subtask.%, and for all other subtasks, their preconditions are correctly inferred. 
We note that such a small error in the subtask graph has a negligible effect as shown in~\Cref{fig:fewshot_wob}: \ie, MTSGI achieves near-optimal performance on \walmart{} after 1,000 steps of adaptation.~\Cref{fig:precision_recall} measures the precision and recall of the inferred precondition (\ie, the edge of the graph). First, both MTSGI and MSGI achieve high precision and recall after only a few hundred steps of adaptation. Also, MTSGI outperforms MSGI in the early stage of adaptation. This clearly demonstrates that MTSGI can perform more accurate task inference due to the prior learned from the training tasks.

\begin{comment}
% Quantitative
\cutparagraphup
\paragraph{Quantitative result.} 
We quantitatively evaluate the subtask graph inferred by MTSGI by measuring the precision and recall against the ground-truth subtask graph.
Intuitively, we generate all possible combinations of completion vector input and plug into both inferred and ground-truth subtask graphs, and compare the output. \sungryull{TODO: Add precision-recall graph}The~\Cref{fig:PR_mining} and~\Cref{fig:PR_toywob} show that MTSGI achieves over 95\% of precision and recall in only 1,000 steps of environment interaction on both \mining{} and \wob{} domains.
\end{comment}

%------------------------------------------------------------------------------
\begin{figure}[!t]
    %\vspace*{-3pt}
    \centering
    \includegraphics[draft=false,width=0.49\linewidth, valign=b]{./figure/ablation_npriors_symwob.pdf}%
    \includegraphics[draft=false,width=0.49\linewidth, valign=b]{./figure/ablation_npriors_mining.pdf}%
    \vspace{-8pt}
    \caption{%
       Comparison of different numbers of priors for MTSGI on \wob{} and \mining{}.
    }
    \label{fig:ablation_nprior}
    \vspace{-7pt}
\end{figure}
%----------------------------------------------------------------------------

\newcommand{\rand}{MTSGI+Random\xspace}
\newcommand{\ucb}{MTSGI+UCB\xspace}
\newcommand{\mtucb}{MTSGI+MTUCB\xspace}
\subsection{Ablation Study: Effect of Exploration Strategy}
In this section, we investigate the effect of various exploration strategies on the performance of MTSGI. We compared the following three adaptation policies:
\begin{itemize}
\setlength{\itemsep}{1pt}\setlength{\parskip}{1pt}
    \item{Random}: A policy that uniformly randomly executes any eligible subtask. % is used as an adaptation policy for both meta-training and meta-testing.
    \item{UCB}: The UCB policy defined in~\Cref{eq:UCB-policy} that aims to execute the novel subtask. The exploration parameters are initialized to zero when a new task is sampled.
    \item{MTUCB (Ours)}: Our multi-task extension of the UCB policy. When a new task is sampled, the exploration parameters are initialized with those of the sampled prior.
\end{itemize}
\Cref{fig:ablation_exploration} summarizes the results on the \wob{} and \mining{} domains. Using a more sophisticated exploration policy such as \ucb or \mtucb improved the performance of MTSGI compared to \rand, which was also observed by~\citet{sohn2019meta}. This is because better exploration helps the adaptation policy collect more data for logic induction by executing more diverse subtasks. In turn, this results in more accurate subtask graph inference and better performance.
% 
Also, \mtucb outperforms \ucb on both domains. This indicates that transferring the exploration parameters makes the agent's exploration more efficient in meta-testing. Intuitively, the transferred exploration counts inform the agent which subtasks were \textit{under-explored} during meta-training, such that the agent can focus more on exploring those in meta-testing.

\subsection{Ablation Study: Effect of the Prior Set Size}
MTSGI learns the prior from the training tasks. 
We investigated how many training tasks are required for MTSGI to learn a good prior for transfer learning.~\Cref{fig:ablation_nprior} compares the performance of MTSGI with varying numbers of training tasks: 1, 4, and 14 tasks for \wob and 10, 100, 500, and 3,200 tasks for \mining.
The training tasks are randomly subsampled from the entire training set.
The result shows that training on a larger number of tasks generally improves the performance.
%, but it saturates (\eg, $|\traintask|=1$ for \wob and $|\traintask|=500$ for \mining). 
\mining generally requires a larger number of training tasks than \wob because the agent is required to solve 440 different tasks in \mining while \wob was evaluated on 15 tasks; the agent is required to capture a wider range of the task distribution in \mining than in \wob.
Also, we note that MTSGI can still adapt much more efficiently than all other baseline methods even when only a small number of training tasks are available (\eg, one task for \wob and ten tasks for \mining).

\cutsectionup
\section{Conclusion}\label{sec:conclusion}
\cutsectiondown

We introduce a multi-task RL extension of the subtask graph inference framework that can quickly adapt to unseen tasks by modeling the prior of the subtask graph from the training tasks and transferring it to the test tasks.
%Specifically, our multi-task subtask graph inferencer (MTSGI) samples a prior by measuring the similarity between the current task and the prior task and merges the two policies constructed from the subtask graph inferred from prior and current tasks, respectively.
The empirical results demonstrate that our MTSGI achieves strong zero-shot and few-shot generalization performance on 2D grid-world and complex web navigation domains by transferring the common knowledge learned in the training tasks to the unseen ones in the form of subtask graphs.

In this work, we have assumed that the subtasks and the corresponding options are pre-learned and that the environment provides a high-level status of each subtask (\eg, whether the web element is filled in with the correct information).
%\todo{More limitation and discussion}
In future work, our approach may be extended to a more general setting
where the relevant subtask structure is fully learned from (visual) observations,
and the corresponding options are autonomously discovered.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\bibliography{sohn_527}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}
