\documentclass[accepted]{uai2024} 
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

\usepackage{amsmath,amssymb,amsfonts,amsthm}
\usepackage{hyperref}
\usepackage{booktabs}
\usepackage{textcomp}
\usepackage{gensymb}
\usepackage{graphicx}
\usepackage{epstopdf}
\usepackage{multirow}
\usepackage{tabularx}
\usepackage{caption}
\usepackage{natbib}
\usepackage{comment, bm, enumerate}
\usepackage{wrapfig}
%\usepackage[ruled,vlined]{algorithm2e}
%\usepackage{subfig}
\usepackage{caption}
\usepackage{subcaption}
\usepackage[dvipsnames]{xcolor}
\usepackage{url}
\usepackage{algpseudocode,algorithm,algorithmicx}
\usepackage{tcolorbox}
%\usepackage{todonotes}
%\usepackage{subcaption}
\usepackage{booktabs} % for professional tables
\usepackage{microtype}
\usepackage{multicol}
\usepackage{cleveref}
\usepackage{enumerate}

\usepackage{tikz}
\usetikzlibrary{calc}

% Theorem-like environments (each with its own counter).
% amsthm is loaded above and its default "plain" style already typesets the
% heading in bold, so the names are given as plain text instead of being
% wrapped in the obsolete {\bf ...} switch; this also makes all seven
% declarations consistent with `assumption'.
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{remark}{Remark}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
% \inv{X}: typeset X followed by a small raised "-1" (inverse notation).
\newcommand\inv[1]{#1\raisebox{1.15ex}{$\scriptscriptstyle-\!1$}}

% \po, \mo, \so: wrap their argument in black text.
% NOTE(review): presumably former per-author revision highlights that were
% neutralized for the final version -- confirm before removing.
\newcommand{\po}[1]{{\color{black} #1 }}
\newcommand{\mo}[1]{{\color{black} #1 }}
\newcommand{\so}[1]{{\color{black} #1 }}

% Notation shortcuts (math mode).
\newcommand{\ngo}{\mathrm{NGO}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\D}{\mathcal{D}}
% Upright differential "d".
% NOTE(review): this overrides LaTeX's text accent \d (dot-under), so \d{..}
% in running text or bibliography entries no longer produces that accent.
\renewcommand{\d}{\mathrm{d}}

% Colored inline author comments, each prefixed with the author's tag.
\newcommand{\ah}[1]{{\color{red} ali: #1 }}
\newcommand{\jie}[1]{{\color{blue} J: #1 }}
\newcommand{\xx}[1]{{\color{teal} X: #1 }}
% \samethanks[n]: emit footnote mark n (default: current value of the
% footnote counter) without creating a new footnote text.
\newcommand*\samethanks[1][\value{footnote}]{\footnotemark[#1]}

% Label the Require/Ensure lines of algorithmic environments as Input/Output.
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
% Names used by hyperref's \autoref for algorithms and definitions
% (only defined here if not already provided elsewhere).
\providecommand*\algorithmautorefname{Algorithm}
\providecommand*\definitionautorefname{Definition}

% \tikzul[<inset>]{<math>}: typeset <math> (in math mode) and draw a rule
% 1pt below it, shortened by <inset> (default 2pt) at both ends.
% Line ends inside the definition are commented out so that the macro does
% not insert spurious spaces before/after the picture when used in text.
\newcommand{\tikzul}[2][2pt]{%
    \tikz[baseline=(content.base)]{%
        \node(content)[inner sep=0pt]{$#2$};
        \draw($(content.south west)+(#1,-1pt)$)--($(content.south east)-(#1,1pt)$);
    }%
}
% \tikzol[<inset>]{<math>}: typeset <math> (in math mode) and draw a rule
% 1pt above it, shortened by <inset> (default 2pt) at both ends.
% Line ends inside the definition are commented out so that the macro does
% not insert spurious spaces before/after the picture when used in text.
\newcommand{\tikzol}[2][2pt]{%
    \tikz[baseline=(content.base)]{%
        \node(content)[inner sep=0pt]{$#2$};
        \draw($(content.north west)+(#1,1pt)$)--($(content.north east)-(#1,-1pt)$);
    }%
}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example


\title{Base Models for Parabolic Partial Differential Equations}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<xingzi.xu@duke.edu>?Subject=Your UAI 2024 paper}{Xingzi~Xu}\thanks{Equal contribution.}{}}
\author[1,2]{\href{mailto:<ali.hasan@duke.edu>?Subject=Your UAI 2024 paper}{Ali~Hasan}\textsuperscript{*}\thanks{Corresponding author.}{}}
\author[3]{\href{mailto:<dingj@umn.edu>?Subject=Your UAI 2024 paper}{Jie~Ding}{}}
\author[1]{\href{mailto:<vahid.tarokh@duke.edu>?Subject=Your UAI 2024 paper}{Vahid~Tarokh}{}}

% Add affiliations after the authors
\affil[1]{%
    Department of Electrical and Computer Engineering\\
    Duke University\\
    Durham, North Carolina, USA
}
\affil[2]{%
    Machine Learning Research\\
    Morgan Stanley
}
\affil[3]{%
    School of Statistics\\
    University of Minnesota\\
    Minneapolis, Minnesota, USA
}

\begin{document}
\maketitle

\begin{abstract}
Parabolic partial differential equations (PDEs) appear in many disciplines to model the evolution of various mathematical objects, such as probability flows, value functions in control theory, and derivative prices in finance. 
It is often necessary to compute the solutions or a function of the solutions to a parametric PDE in multiple scenarios corresponding to different parameters of this PDE.
This process often requires re-solving the PDEs from scratch, which is time-consuming.
To better employ existing simulations for the PDEs, we propose a framework for finding solutions to parabolic PDEs across different scenarios by meta-learning an underlying base distribution.
%Specifically, we exploit the stochastic representation of parabolic PDEs to estimate a base distribution that is reused across different tasks through importance sampling. 
%The stochastic representation allows for theoretical analysis of the convergence rate of the operator to the ground truth solution over a class of parameters.
We build upon this base distribution to propose a method for computing solutions to parametric PDEs under different parameter settings.
Finally, we illustrate the application of the proposed methods through extensive experiments in generative modeling, stochastic control, and finance.
The empirical results suggest that the proposed approach improves generalization to solving new PDEs. 
\end{abstract}

\section{Introduction}
%\vspace{-0.1in}
%\ah{todo: change all importance sampling/girsanov/change of measure to same terminology}
In this work, we propose and study a particular type of neural structure that can adapt rapidly to tasks associated with parabolic partial differential equations (PDEs).
Parabolic PDEs are a standard mathematical framework for describing the evolution of different processes.
Applications are evident in various fields, such as probabilistic modeling and mathematical finance~\citep{pardoux2014stochastic}.
For instance, the probability density functions (PDFs) in generative modeling with diffusion processes satisfy a parabolic PDE known as the Fokker-Planck equation. 
In continuous stochastic control problems, the optimal policy satisfies the Hamilton-Jacobi-Bellman equation.
In finance, the Black-Scholes equation describes the price of a derivative. 
Due to the numerous applications, there is a general need to describe such processes under different \emph{boundary conditions} and \emph{parameters}.
In the probabilistic modeling example, we may consider a scenario where we wish to sample from multiple related distributions while using their shared features to accelerate training. 
In this case, different diffusion processes can correspond to distinct parameters of the Fokker-Planck equation.
%In the finance example, we may be interested in computing the price of a derivative under different assumptions on the underlying asset. 

Parabolic PDEs have a particular structure that allows for efficient computation of their solution in high dimensions using Monte Carlo techniques~\citep{pardoux2014stochastic}. 
The Feynman-Kac formula formalizes this for linear parabolic PDEs through a connection between an expectation over sample paths and the solution to the corresponding PDE~\citep[Chapter 7.7]{sarkka2019applied} with the nonlinear extension following a similar argument~\citep{fahim2011probabilistic}. 
However, the formula requires sampling solution paths of a stochastic differential equation (SDE) with parameters corresponding to the PDE. 
This process can be time-consuming due to the large number of sequential samples needed, and it requires sample paths to be~\emph{resampled} for different parameters of the PDE. 
\begin{figure*}
    \centering
    \begin{subfigure}[t]{0.45\textwidth}
    \centering
    \includegraphics[width=\textwidth, trim=100pt 40pt 10pt 10pt]{figs/new_schem_fig.pdf}
    \caption{Maximizing likelihoods of target distributions $p_\text{Gaussians}, p_\text{Moons}$ using a shared base distribution $p_\text{Meta}$.}
    \label{fig:schem_gen}
    \end{subfigure}
    \begin{subfigure}[t]{0.45\textwidth}
    \centering
    \includegraphics[width=\textwidth, trim=30pt 40pt 80pt 10pt]{figs/com-sample-gbm_multi.pdf}
    \caption{Solving PDEs with different parameters (Task 1 and 2) by reusing the base solution.}
    \label{fig:schem_pde}
    \end{subfigure}
    \caption{
    Schematic of the proposed procedures for maximizing functions~(\ref{fig:schem_gen}) and solving PDEs with different parameters~(\ref{fig:schem_pde}).
    In both scenarios, we reuse a meta-learned base parameterization across different tasks. 
    (\ref{fig:schem_gen}) illustrates sampling target densities using a task-specific diffusion for each density.
    (\ref{fig:schem_pde}) illustrates two solutions to linear parabolic PDEs simulated with the same stochastic process based on the proposed method.} 
    \label{fig:schematic}
\end{figure*}
We instead build upon this idea to approximate the PDE solutions by recycling Monte Carlo samples used in computing the solutions while mitigating the instabilities inherent in the direct application of importance sampling.
This idea leads to a base model (or meta model) that can swiftly adapt and extend to different tasks (e.g., solving different PDEs or sampling distinct target distributions). 
We borrow techniques from two research disciplines: the first related to meta-learning and the second related to operator learning~\citep{Finn2017ModelAgnosticMF,Lu2019LearningNO,jin2022minimax}. %\citep{Finn2017ModelAgnosticMF,Nichol2018OnFM,Rajeswaran2019MetaLearningWI,Li2020FourierNO,Lu2019LearningNO}.%\jie{references}
Rooted in meta-learning, we want to adapt the base model to unseen tasks with lower training effort. 
Related to operator learning, we want to define an operator using the base model that maps parameters and boundary conditions to solutions.
The outline of the paper is as follows:
We first describe the importance sampling framework in Section~\ref{sec:change_param}; we then discuss the issues of directly applying importance sampling and introduce the proposed meta-learning approach in Section~\ref{sec:pde_girsanov}.
In Section~\ref{sec:max}, we describe a lower bound for applying the framework to maximization tasks, such as maximizing the likelihood in generative modeling.
%We then describe the application of the importance sampling framework in maximization tasks such as generative modeling, where we can replace solutions of the parabolic PDE with a lower bound that removes numerical issues of the naive approach;
In Section~\ref{sec:ngo}, we generalize the computation for maximization tasks and present a neural operator that allows computing solutions to PDEs at a reduced cost. 
Figure~\ref{fig:schematic} illustrates an overview of these concepts: on the left, we apply the proposed framework to sample from different target distributions corresponding to $p_\text{Moons}, p_\text{Gaussians}$ using a shared base model $p_\text{Meta}$; and, on the right, we apply the operator framework to solve two parabolic PDEs (Task 1 and 2) with different parameters by reusing the original simulated stochastic process (Base).\footnote{The GitHub repository for this paper is \url{https://github.com/XingziXu/base_parabolic.git}.}

\paragraph{Contributions} 
To summarize, our main contribution is a framework for establishing a base model for reuse across problems associated with parabolic PDEs. 
To that end, 
\begin{itemize}
    \item We describe the base model and a corresponding meta-learning framework for solving maximization problems associated with parabolic PDEs;
    \item We use the meta-learning framework and propose a neural operator to approximate the solution of parabolic PDEs under different parameter settings; 
    \item We analyze the convergence of the operator over parametric classes of functions; 
    \item We evaluate the methods in different experimental settings, including synthetic and real-world examples.
\end{itemize}
%Extensive research on solving PDEs with machine learning exists, with particular attention to solving PDEs using deep learning methods~\citep{raissi2019686,SAMANIEGO2020112790,doi:10.1137/19M1274067}. 
%Existing literature on generalizing deep learning models to different boundary conditions or parameters mostly employ meta-learning approaches to solve PDEs, which first train a neural network on multiple PDE tasks, and start from this meta-learned network parameters when given a new PDE task, or train a hypernetwork to generate new networks for new PDEs \citep{xu2022meta, PSAROS2022111121, CHEN2022110996}. 
%These methods apply to most families of PDEs but are computationally intensive since the training procedure requires repeatedly differentiating through neural networks or a computationally heavy hypernetwork. 
%In our work, we only focus on parabolic PDEs.
%In particular, to avoid issues associated with scaling to high dimensions, we exploit the stochastic representation of parabolic PDEs, which allows representing the PDE solution as an expectation of stochastic differential equation (SDE) solutions via the Feynman-Kac formula~\citep{sarkka2019applied}. 
%We simplify evaluations of this expectation to a weighted expectation of Brownian motions using the Girsanov theorem and Lamperti transformation, removing the usage of for-loops when solving SDEs and greatly accelerating the computation. Evaluating the expected value using Girsanov introduces errors growing with time. To mitigate this issue, we propose an operator learning scheme based on the Girsanov solutions in a short period, where the Girsanov solutions are precise. The proposed operator learning method has a lower error than the Girsanov-based method and is still significantly faster than numerical SDE solver-based approaches.
%Monte-Carlo type methods are mesh-free and might solve high-dimensional PDEs without suffering from the curse of dimensionalities \citep{Beck2021OnNF,Han2018SolvingHP}. 
%Rather than learning to solve the PDE with a neural network directly, we express the solution to the parabolic PDE as expectations of solutions of stochastic differential equations (SDEs) and approximate the SDEs with neural networks.

%Learning solutions to parabolic PDEs with SDEs through the Feynman-Kac method can be viewed as approximating the density at the terminal condition of the SDE. 
%It is then natural to ask: how to get close to a target distribution quickly? 
%Modeling densities with SDEs has been cast as a generative modeling problem, with numerous methods proposed, achieving state-of-the-art performance\citep{Song2021ScoreBasedGM,Huang2021AVP,Vahdat2021ScorebasedGM}. \citep{Huang2021AVP} proposed to fit the terminal density of a forward SDE to one described with a dataset, by maximizing the likelihood of the data points, which is calculated with a Feynman-Kac approximation of the Fokker-Planck equation.
%Current methodologies use a standard Gaussian distribution as the prior. 
%Here, we theoretically and empirically show that a prior closer to the target distribution in Wasserstein distance is beneficial to training and propose to switch from a standard Gaussian to a prior that is meta-learned and closer to target distributions.
%\ah{redo}

%We organize the paper as follows:
%We first introduce the mathematical tools used in sections \ref{sec:background}, \ref{sec:fkmethod}, \ref{sec:change_param}, and derive the stochastic solution of parabolic PDEs using the tools.
%We then present meta-learning for generative modeling based on solutions of the Fokker-Planck equation. Rather than applying meta-learning methods directly on generative models, we meta-learn a boundary condition close to target distributions. 
%We finally present an operator-learning scheme for parabolic PDEs. We express the solution of a parabolic PDE as expected values of terminal values of SDEs. While we illustrate the framework with linear parabolic PDEs, the proposed framework extends to nonlinear parabolic PDEs and elliptic PDEs.

%\ah{this section talks about related work but so does the previous section, the previous section should mainly be introduction}

\subsection{Related work}
%\vspace{-0.1in}
%\textcolor{red}{Notes:
%This paper is interesting: \citet{becker2022learning} they look at how to learn the random variables similar to how we are doing our work in the operator learning perspective. 
%Schrodinger bridge problems, gradient flows~\citep{alvarez2021dataset}.
%\citet{demange2022efficient} similar to what we're doing with meta learning a change of measure.
%}
%\todo{empirical comparison in the same fashion}

We will discuss the related work on meta-learning techniques for PDEs and operator-learning methods for PDEs.  
%Meta-learning is ubiquitous in machine learning applications, including solving PDEs. 
%For example, \citet{Finn2017ModelAgnosticMF} proposed MAML, which aims to optimize the model's performance on new tasks after fine-tuning for a few iterations on task specific data for classification. %MAML proposes to minimize the error on multiple tasks after a fixed number of epochs. MAML involves backpropagation through multiple tasks and updates, which is computationally intensive. \citep{Nichol2018OnFM} proposes reptile, which simplifies MAML and uses only first-order gradients for meta-learning updates. Other model-agnostic meta-learning methods include \citep{Li2017MetaSGDLT,Rajeswaran2019MetaLearningWI}.
%Besides classification, recent works considered the problem of meta-learning a generative model. 
%In particular,~\citet{Cloutre2019FIGRFI, Robb2020FewShotAO, Liang2020DAWSONAD, sridhar2022metagan} consider variants of the MAML algorithm applied to generative adversarial networks (GANs).
%applies MAML on the discriminator and generator of a GAN to meta-learn the optimal weight initialization. 
%\citet{Robb2020FewShotAO} proposed to freeze part of the parameters during the pretraining and adapt with the target dataset. 
%\citep{Liang2020DAWSONAD,Cloutre2019FIGRFI} also applied a meta-learning algorithm to GAN networks. 
%\citep{Edwards2017TowardsAN} proposed an extension of a variational autoencoder that learns to generate statistics that encapsulate a generative model for different datasets.
\citet{ PSAROS2022111121, Chen2022MetaMgNetMM, Liu2022ANM} consider applying the MAML framework to physics-informed neural networks (PINNs)~\citep{raissi2019physics}.
In these approaches, the authors learn a meta-parameterization of PDEs for efficient optimization to estimate PDEs under new parameters.
%However, PINNs may fail to scale to high-dimensional PDEs with higher order derivatives due to errors associated with automatic differentiation over many variables (e.g., see the review in~\citet{margossian2019review}), making their application to tasks in high-dimensional spaces difficult. 
\citet{huang2022meta} considered a latent variable model fine-tuned to obtain the solution at different regions of the parameter space and domain based on the PINNs framework.
\citet{Chen2022MetaMgNetMM} uses hypernetworks for multi-task learning with parameterized PDEs, which also focuses on low-dimensional settings.

%The closest work to our proposed method is \citep{Bartunov2018FewshotGM}, which aims to model a conditional generative distribution, 
%\begin{align*}
%    p(x|x,\theta)=\int p(\mathbf{z}|x,\theta)p(x|\mathbf{z},x,\theta)d\mathbf{z}
%\end{align*}
%and the prior is a Gaussian distribution with its mean and variance depending on the dataset. %Empirical results show that the method improves performance on the fly as additional data is available, outperforming existing conditional generative models. Instead of restraining to Gaussian distributions, our method meta-learns a flexible prior represented with a normalizing flow. Also note our method does not constrain to normalizing flows, as any generative model with an explicit formulation of density can model the prior.

On the second front are neural operator architectures used to solve PDEs.
Operator-learning methods map sets of parameters to solutions of PDEs by estimating the operator that describes this transformation.
The work of~\citet{Lu2019LearningNO} proposed DeepONet, which uses the result that a neural network with a single hidden layer can approximate any nonlinear continuous operator, and follow-up work of~\citet{Li2020FourierNO} considers learning a mapping in  Fourier space with improved empirical performance. 
\citet{gupta2021multiwavelet, Wang2021LearningTS, Li2022FourierNO,Zheng2022FastSO} propose additional variants on this operator framework.
However, these methods require first generating a dataset of parameter and solution pairs by solving the PDE according to standard techniques, which can be expensive.
\citet{Wang2021LearningTS} attempts to circumvent this issue by including the PINN loss in the operator learning framework. 
Additionally, \citet{Hu2023TacklingTC} considers learning PINNs for high-dimensional PDEs by decomposing a gradient of PDEs into pieces corresponding
to different dimensions.
However, various issues associated with optimization persist with the PINNs framework, e.g., in~\citet{krishnapriyan2021characterizing, wang2021understanding}, making its application difficult with certain PDEs.
Finally, several methods consider the stochastic representation used in this paper.
\citet{berner2020numerically,richter2022robust,glau2022deep,zhang2023monte} consider various forms of regression-based techniques wherein the stochastic representation is repeatedly sampled for different parameter values and regressed to a sufficiently expressive function approximator (usually a neural network).
However, this incurs significant training costs due to the required sampling at each training iteration, a cost that the methods proposed in this work explicitly circumvent.
\citet{Han2018SolvingHP} consider solving semi-linear parabolic PDEs by regressing the stochastic representation of the solution to the neural network.  
A hybrid approach that considers both PINN losses and the stochastic representation of PDEs was considered in~\citet{nusken2021interpolating} for the single parameter case.
These methods, however, only provide solutions for PDEs for one particular set of parameters rather than a family of parameters.

%\begin{align*}
%    \frac{\partial p}{\partial t}(t,x)+\frac{1}{2}\text{Tr}(\sigma\sigma^T(t,x)(\text{Hess}_x p)(t,x))+\nabla p(t,x)\cdot \mu(t,x)+f(t,x,u(t,x),\sigma^T(t,x)\nabla p(t,x))=0
%\end{align*}

To address these gaps in the literature, including the lack of scalability to high dimensions, requiring many solution pairs for training, and requiring computing new solutions for different parameters, our framework exploits the stochastic representation of parabolic PDE with an importance sampling technique to transform the learned solution to one with distinct parameters.
%\citep{PSAROS2022111121,CHENG2022106719,Liu2022ANM} applied MAML algorithms on PINNs to learn an optimal parameter initialization.
%\citep{Chen2022MetaMgNetMM} makes use of hyper networks to achieve multi-task learning on solving parameterized PDEs. 
%Additionally, \citep{Goswami2022DeepTL} proposed a transfer-learning scheme for solving PDEs with machine learning, showing results on geometry domain adaptation. \citep{Chakraborty2021TransferLB} also proposed a transfer-learning scheme for solving PDEs with neural networks.
%\vspace{-0.1in}

\section{Stochastic representations of parabolic PDEs} %\jie{lower letter except for the first letter in the first word in all (sub)section titles, according to NIPS policy}}
%\vspace{-0.1in}
We will now describe the key ingredients of the proposed method and how to apply them in the proposed framework. 
Throughout the paper, we will refer to the base model interchangeably as a meta model.

%\vspace{-0.12in}
\subsection{Parabolic PDEs}
\label{sec:background}
%\vspace{-0.12in}
Consider a domain $\mathcal{D} \subset [0, T] \times \mathbb{R}^d$ on which the solution $p(t,x) : \D \to \mathbb{R}$ of the PDE is defined. 
A linear parabolic PDE is of the form
\begin{align}
    %\frac{\partial p}{\partial t}(t,x)+\frac{1}{2}\text{Tr}(\sigma\sigma^T(t,x)(\text{Hess}_x p)(t,x))+\nabla p(t,x)\cdot \mu(t,x)+f(t,x,u(t,x),\sigma^T(t,x)\nabla p(t,x))=0
    \nonumber 
    &\frac{\partial p}{\partial t}(t,x) = \nabla p(t,x)\cdot \mu(t,x) \\
    & +\frac12\text{Tr}(\sigma\sigma^\top(t,x)(\text{Hess}_{x} p)(t,x)) -r(t,x)p(t,x),
\label{eq:parabolic}
\end{align}
%\jie{define all notations, including $\sigma\sigma^T$, $r(t,x)$, and $\text{Hess}_{x}$. use $\times$ for multiplication wherever possible to clarify}
with boundary condition $p(0,x)=p_0(x)$.
The function $\mu(t, x) : \D \to \mathbb{R}^d$ is referred to as the \emph{drift function}, and the function $\sigma(t, x) : \D \to \mathbb{R}^{d \times d}_+$, where $\sigma\sigma^\top$ is positive semi-definite for all elements in $\D$, is referred to as the \emph{volatility function}. 
The function $r(t,x) : \mathcal{D} \to \mathbb{R}$ is sometimes referred to as the growth term. 
For all functions, we require the usual conditions on $\mu, \sigma, r$ such that~\eqref{eq:parabolic} is uniformly parabolic (e.g. see~\citet[Chapter 7.1]{evans2022partial}).
Here, $(\mathrm{Hess}_x p)(t,x)$ denotes the Hessian of $p$ with respect to $x$. 
For the remainder of the text, we will focus on the interplay between the functions $\mu, \sigma,$ and $p_0$, as we are interested in how they can be easily updated to solve different tasks.

\subsection{Feynman-Kac method}
\label{sec:fkmethod}
%\vspace{-0.1in}
Solving~\eqref{eq:parabolic} in high dimensions generally requires using a Monte Carlo method to alleviate the curse of dimensionality. 
In particular, the Feynman-Kac method provides such a mechanism:
\begin{lemma}[Feynman-Kac method~\citep{sarkka2019applied}]
Let $X_t$ satisfy the following It\^o diffusion:
$
\mathrm{d}X_t = \mu(t, X_t) \mathrm{d}t + \sigma(t, X_t)\mathrm{d}W_t.
$
Then, the solution of the PDE in~\eqref{eq:parabolic} is 
\begin{equation}
\label{eq:fk}
    p(t, x) = \mathbb{E}\left[\po{p_0}(X_t) \exp\left (-\int_0^t r(s,X_s) \mathrm{d}s \right) \bigg | \; X_0 = x \right].
    %p(t, x) = \mathbb{E}\left[p_0(X_t) \exp(-\int_0^t r(s,x))ds| \; X_0 = x\right].
\end{equation}
\end{lemma}
While the Feynman-Kac method applies to various types of parabolic equations, we present it in the case of solving Equation~\eqref{eq:parabolic} with an initial condition to simplify the exposition. %\jie{undefined term}
%We present the forward equation case to simplify the presentation of our generative modeling example, where $p_0$ is a density and $r(t,x) = \nabla \mu(t, x)$. The resulting PDE corresponds to the Fokker-Planck equation for a diffusion given with parameters $\mu, \sigma$.

\subsection{Changing parameters}
\label{sec:change_param}
%\vspace{-0.1in}
%While~\eqref{eq:fk} allows efficient pointwise solutions of the corresponding PDEs,
From~\eqref{eq:fk}, changing the parameters of the PDE requires sampling new sample paths of $X_t$. %\jie{change the word insofar}
We aim to decrease this sampling burden by reusing the existing samples for different parameters. 
To do this, we consider an application of Girsanov's theorem:

\begin{definition}[Likelihood Ratio~\citep{sarkka2019applied}]
\label{def:girsanov}
Let $X_t, Y_t$ be two It\^o diffusions satisfying
\begin{align*}
    \mathrm{d}X_t &= \mu^{(1)}(t, X_t) \mathrm{d} t+ \sigma (t)\mathrm{d} W_t, \\
    \mathrm{d}Y_t &= \mu^{(2)}(t, Y_t) \mathrm{d}t + \sigma (t)\mathrm{d}W_t
\end{align*}
with laws $\mathbb{P}_{X_t}, \mathbb{P}_{Y_t}$. 
Define $\Sigma = \sigma\sigma^\top$ and $\delta\mu(s,x) = \mu^{(1)}(s,x) - \mu^{(2)}(s,x)$, and assume that Novikov's condition $\E \left[ \exp\left( \frac12 \int_0^t \delta\mu(s,x)^\top \Sigma^{-1} \delta\mu(s,x) \, \d s \right) \right] < \infty$ holds.
Then, the \emph{exponential martingale} gives the likelihood ratio of the two processes: 
\begin{align}
   \nonumber  \frac{\mathrm{d}\mathbb{P}_{X_{t}^{(1)}}}{\mathrm{d}\mathbb{P}_{Y_{t}^{(2)}}} := \exp \biggl(- \frac12& \int_0^t \delta\mu(s,x)^\top \Sigma^{-1} \delta\mu(s,x) \mathrm{d}s  \\
     + &\int_0^t \delta\mu(s,x)^{\top} \Sigma^{-1} \mathrm{d}W_s \biggr).
     \label{eq:exp}
\end{align}
\end{definition}
We will drop the superscript in $\frac{\mathrm{d}\mathbb{P}_{X_{t}^{(1)}}}{\mathrm{d}\mathbb{P}_{Y_{t}^{(2)}}}$ except for cases where specifying the drift index is necessary. Definition~\ref{def:girsanov} relates to Girsanov's theorem~\citep{sarkka2019applied}; note that $X_t$ and $Y_t$ both originate from the same filtration $\mathcal{F}_t$ generated by the Brownian motion $W_t$. 
The likelihood ratio facilitates computing the expectation of a function of $X_t$ by using generated samples of $Y_t$.
Specifically, it holds that
\begin{equation}
\mathbb{E}\left[p_0(X_t) \; |\; \mathcal{F}_t\right] = \mathbb{E}\left[p_0(Y_t)\frac{\mathrm{d}\mathbb{P}_{X_t}}{\mathrm{d}\mathbb{P}_{Y_t}} \; \bigg | \; \mathcal{F}_t \right]
\label{eq:com}
\end{equation}
meaning that to change the PDE parameters $\mu, \sigma$, we only need to compute a transformation of the existing sampled paths rather than resampling from scratch.
A standout example is the case of Brownian motion when $\mu^{(1)}(t, X_t) =0$, where we can sample $N$ Gaussian random variables with variances depending on $\sigma$ and $t$ (i.e., independent of state) and reuse the sample path for different $\mu^{(i)}$. 
Through It\^o's lemma, we can apply a function $f$ to the sampled Brownian motion to find a new sample path that has non-unit volatility\footnote{To avoid confusion with diffusion processes, we will refer to $\sigma$ as the \emph{volatility}, but note that it often denotes the diffusion coefficient.} and depends on the state: 
\begin{lemma}[It\^{o}'s lemma~\citep{sarkka2019applied}]
\label{lem:ito}
If $W_t \in \mathbb{R}^d$ is a Brownian motion on $[0,T]$, and $f(x) : \mathbb{R}^d \to \mathbb{R}$ is a twice continuously differentiable function, then for any $t\leq T$, 
\begin{align}
\label{eq:ito_lemma}
    \mathrm{d}f(W_t)=\frac{1}{2} \mathrm{tr}(\mathrm{Hess}_xf(W_t))\mathrm{d}t+\nabla_x f(W_t)^\top \mathrm{d}W_t.
\end{align}
\end{lemma}
With these tools in mind, we describe how to efficiently calculate solutions of parabolic PDEs with changing drift and volatility functions. 

%\vspace{-0.1in}
\section{Learning a base model}
%\vspace{-0.1in}
Recall that we want to develop a model with the following properties such that it applies to many tasks:
\begin{itemize}
    \item Adaptability: the model defines a representation shared across many tasks; 
    \item Extensibility: the representation can be rapidly adapted to new tasks. 
\end{itemize}
%\vspace{-0.1in}
To achieve these properties, we apply Definition~\ref{def:girsanov} and Lemma~\ref{lem:ito} to a base set of sample paths shared across all tasks.
We first describe the general approach and why naively applying Definition~\ref{def:girsanov} fails in many cases, which motivates the need for neural parameterization.
We then consider maximizing a function of parabolic PDE solutions rather than solving the PDEs.
We finally present the operator-learning framework where different tasks correspond to the solutions to a PDE under different parameter configurations.
%All methods compute solutions for different parameter settings, with the main parameter of interest being $\mu$ in~\eqref{eq:fk}. 

\subsection{Solving parabolic PDEs through importance sampling}
%\vspace{-0.1in}

\label{sec:pde_girsanov}
Consider a parabolic PDE that satisfies~\eqref{eq:parabolic} using the representation given in~\eqref{eq:fk}.
Computing the solution for~\eqref{eq:parabolic} for a series of $K$ different drifts, $\{\mu^{(i)}\}_{i=1}^K$, requires simulating a different SDE for each $\mu^{(i)}$. 
Simulating the SDEs with an Euler-Maruyama integration, where the discretization size is $h$, the number of time steps is $N_T=T/h$, and approximating the expectation with $N_E$ different realizations requires $N_E \times N_T \times K$ computations.
Importantly, to compute $X_t$, the computation \emph{must be performed sequentially}, since it relies on the previous value of $X_t$.
This sequential operation induces the main bottleneck, and we try to avoid it whenever possible.

We instead consider the SDE $\mathrm{d}Y_t = \sigma(Y_t) \mathrm{d}W_t$ associated with marginal distribution $\mathbb{P}_{Y_t}$ which we assume we can simulate as a function of Brownian motion $f(W_t)$.
Suppose we wish to solve the PDE in~\eqref{eq:parabolic} for each drift $\mu \in \{\mu^{(i)}(x)\}_{i=1}^K$ with a common volatility $\sigma = \sigma(x)$.
Using~\eqref{eq:com} combined with~\eqref{eq:fk}, we can write the solution with drift $\mu^{(i)}$ as the expectation
\begin{align}
 \nonumber  &p_{\mu^{(i)}}(T, x) := \\ \nonumber &\mathbb{E}_{\mathbb{P}_{X_t^{(i)}}}\left[p_0(X_T^{(i)})\exp\left(-\int_0^T r\left(X_s^{(i)}\right)\mathrm{d}s\right) \; \bigg | \; X_0^{(i)}=x\right  ] \\ 
   &=\mathbb{E}_{\mathbb{P}_{Y_t}} \left[p_0(Y_T)\exp\left(-\int_0^T r(Y_s)\mathrm{d}s\right)\frac{\mathrm{d}\mathbb{P}_{X_T^{(i)}}}
   {\mathrm{d}\mathbb{P}_{Y_T}} \; \bigg | \; Y_0=x\right ]
   \label{eq:fk_is}
\end{align}
where $X_t^{(i)}$ satisfies $\mathrm{d}X_t^{(i)} = \mu^{(i)}(X_t^{(i)}) \mathrm{d} t + \sigma(X_t^{(i)}) \mathrm{d}W_t$ with measure $\mathbb{P}_{X_t^{(i)}}$. 
By writing the expectation with respect to $Y_t$, we can reuse the $N_E$ simulations of $Y_t$ for each $\mu^{(i)}$, and we only need to compute the likelihood ratio $\frac{\mathrm{d}\mathbb{P}_{X_T^{(i)}}}{\mathrm{d}\mathbb{P}_{Y_T}}$ for $i=1,\ldots,K$.
This formulation is crucial since each $\frac{\mathrm{d}\mathbb{P}_{X_T^{(i)}}}{\mathrm{d}\mathbb{P}_{Y_T}}$ only requires an integral (approximated by a sum) rather than sampling an SDE. 
Unfortunately, upon inspecting~\eqref{eq:exp}, computing the likelihood ratio $\frac{\mathrm{d}\mathbb{P}_{X_T^{(i)}}}{\mathrm{d}\mathbb{P}_{Y_T}}$ requires computing the exponential of numerically approximated integrals.
This results in an error that grows exponentially in the discretization size $h$ rather than linearly in $h$, as is the case when directly computing the expectation with new sample paths from each $\mu^{(i)}$.
In the next section, we discuss ways to circumvent this issue by deriving a lower bound of the PDE solution with an error linear in $h$ and developing a neural network-based representation of the likelihood ratio. 

\subsection{Maximizing parabolic PDEs}
\label{sec:meta_gen_formulation}
%\jie{need to explain a bit more}}
%\vspace{-0.1in}
%\begin{wrapfigure}{r}{0.26\textwidth} 
%\vspace{-0pt}
%  \begin{center}
%    \includegraphics[width=0.26\textwidth, trim=35pt 45pt 15pt 105pt]{figs/1d_vis_ou.pdf}
%    \caption{Simulated solutions of the Fokker-Planck equation of an $1d$ OU-process compared with the analytic solution.}
%    \label{fig:ou_1d}
%  \end{center}
%  \vspace{-70pt}
%\end{wrapfigure}

\begin{comment}
\begin{figure}
  \begin{center}
    \includegraphics[width=0.3\textwidth, trim=30pt 30pt 10pt 0pt]{figs/1d_vis_ou.pdf}%{./Pictures/mainscreen1.png}
    \caption{Simulated solutions of the Fokker-Planck equation of an $1d$ OU-process compared with the analytic solution.}%, NGO (normalized error is 0.0024), Euler-Maruyama (normalized error is 0.015), and Girsanov (normalized error is 0.0092).}
    %The top row shows the magnitude of the loss normalized by the magnitude of the solution calculated by Euler-Maruyama, and the bottom row shows the inference times. The left column represents results on linear parabolic PDEs, and the right column represents results on semi-linear parabolic PDEs.}
    \label{fig:ou_1d}
  \end{center}
  \vspace{-0.3in}
\end{figure} 
\end{comment}
It is often the case that we are interested in the parameters that \emph{maximize} the expectation of a function of sample paths, i.e. $\max \mathbb{E}\left[ J\left(X^{(i)}_T\right) \right]$ where the maximization is over parameters of $X_T$ for different tasks associated with $K$ distinct datasets, $\{X^{(i)}\}_{i=1}^K$. 
Note that by Lemma~\ref{lem:ito}, this expectation satisfies the PDE~\eqref{eq:parabolic}.
To motivate this approach, we will consider the problem of sampling from a family of target distributions, as illustrated on the left side of Figure~\ref{fig:schematic}.
This is closely related to the works on diffusion models in~\citep{NEURIPS2021_0a9fdbb1,Song2021ScoreBasedGM},%,NEURIPS2021_67d96d45,Song2020DenoisingDI,Berner2022AnOC}, 
which use a neural network to approximate the score function of a transition density described by an SDE.
However, in those cases, a specific form of the forward SDE is used such that an analytical form of the transition density exists.
Instead, we are interested in relaxing this parametric assumption and working directly with the Fokker-Planck equation through its stochastic representation.

Continuing with $K$ target distributions $\{p^{(i)}(x)\}_{i=1}^K$ that we wish to approximate, when approximating with an It\^o diffusion, we can represent each distribution as the solution of the Fokker-Planck equation at some terminal time $T$ under $K$ different parameters.
This equation is a linear parabolic PDE which we can write in terms of an expectation according to the Feynman-Kac formula~\eqref{eq:fk} with $r = \nabla \cdot \mu^{(i)}$ (see Appendix~\ref{sec:max} for derivation) and $ \d X_t^{(i)} = \mu^{(i)}(t, X_t^{(i)}) \d t + \sigma(t) \d W_t$, assuming that $\sigma$ is independent of $X_t$ for ease of explanation.
%\begin{align}
%\label{eq:fk_fp}
%   \nonumber p^{(i)}(x, T) &= \mathbb{E}\left[  p_0(X_T^{(i)})  \exp \left ( \int_0^T \nabla \cdot \mu^{(i)} (X_s^{(i)}, s) \mathrm{d} s \right ) \mid X_0^{(i)} = x \right] \\
%    \mathrm{d}X_t^{(i)} &= \mu^{(i)}(X_t^{(i)}, t) \mathrm{d}t + \mathrm{d}W_t.
%\end{align}
Then, we want to maximize the distribution's likelihoods for $K$ different tasks at $T$.
We will use the idea described in Section~\ref{sec:pde_girsanov} to bypass the expensive Euler-Maruyama sampling procedure during each training iteration.
To do this, we use the form in~\eqref{eq:fk_is} with the parameters of the latent process $Y_t$ being the meta-learned parameters, i.e. 
%We instead modify~\eqref{eq:fk} using Girsanov's theorem such that we only need to sample Brownian motion at each iteration to obtain:
%\begin{align}
%\label{eq:fk_fp_gir}
%    p^{(i)}(T,x) &= \mathbb{E}\left[p_0(Y_T) \exp \left ( \int_0^T \nabla \cdot \mu^{(i)} (s,Y_s)  - \frac{1}{2}\mu^{(i)}^2(s,Y_s) \mathrm{d} s  + \int _0^T \mu^{(i)}(s,Y_s)\mathrm{d} W_s \right )    \mid Y_0 = x \right]
%\end{align}
$\mathrm{d}Y_t = \mu_0(t, Y_t) \mathrm{d} t + \sigma_0(t) \mathrm{d} W_t, \;\; Y_0 \sim p_0$.
As such, we have translated the problem from one requiring samples of $X_t$ to one requiring samples of $Y_t$, which are reusable across training iterations.
%To generalize this further, we can see three parameters related to all $K$ tasks in~\eqref{eq:fk_fp_gir}: $p_0, \mu_0, \sigma_0$.
We take the parameters $(p_0, \mu_0, \sigma_0)$ as the \emph{meta parameters}, which are optimized over all $K$ tasks.

The exponential term in~\eqref{eq:com} still incurs high errors for large $T$, and solving the PDE using this form is inaccurate without a prohibitively large number of time steps $N_T$.
To circumvent this, we remind ourselves that the goal is to \emph{maximize} the solution of the PDE, which corresponds to the likelihood in this case. 
Applying Jensen's inequality, we obtain an evidence lower bound (ELBO) without the exponential error:
\begin{align}
    \log p(T,x)
    &\geq \mathbb{E}_{\mathbb{P}_{X_t}}\left[\int_0^T\nabla\cdot\mu(s, X_{s})\mathrm{d}s + \log p_0(X_T)  \right]     \label{eq:elbo_direct}  \\
  \nonumber  &= \mathbb{E}_{\mathbb{P}_{Y_t}}\bigg[\int_0^T\hat{\mu}(s,Y_{s})\mathrm{d}W_s -\int_0^T\Big(\frac{1}{2}\hat{\mu}^2(s,Y_{s}) \\ &\;\; \quad - \nabla \cdot \mu(s,Y_s)\Big) \mathrm{d}s + \log p_0(Y_T) \mid Y_0 = x \bigg] 
    \label{eq:elbo_is}
\end{align}
where~\eqref{eq:elbo_is} is denoted $\mathrm{ELBO}_{\mathrm{IS}}$ (with $\mathrm{IS}$ referring to \emph{importance sampling}), \eqref{eq:elbo_direct} is denoted $\mathrm{ELBO}_\mathrm{direct}$, and $\hat{\mu} =  (\sigma_0\sigma_0^\top)^{-1}\mu$, $\hat{\mu}^2 = \mu^\top (\sigma_0\sigma_0^\top)^{-1} \mu$.
\citet{Huang2021AVP} explored a similar idea for score-based diffusion models and proved that the bound in~\eqref{eq:elbo_direct} is tight when maximizing over a sufficiently expressive class of drift parameterizations.
Circling back to the maximum likelihood estimation problem, we can apply this technique to reduce the exponential error to linear while reusing the sample paths $Y_t$ and meta-learning the parameters $(p_0, \mu_0, \sigma_0)$.
We describe an explicit example of meta-learning $p_0$ in Appendix~\ref{sec:max}.


\subsection{\texorpdfstring{$\ngo$}{NGO}: Meta-learning a continuous space of tasks}%Operator Learning for Parabolic PDEs \jie{highlight the relationship with other subsections and mention the term NGO in the title}}
%\vspace{-0.1in}
\label{sec:ngo}

\begin{figure} 
\centering
    \includegraphics[width=0.45\textwidth]{figs/1d_vis_ou.pdf}
    \caption{Simulated solutions of the Fokker-Planck equation of a $1d$ OU-process compared with the analytic solution.}
    \label{fig:ou_1d}
\end{figure}

Having described the meta-learning framework where we consider $(p_0, \mu_0, \sigma_0)$ as meta-learned parameters for the case of maximizing the solution to a parabolic PDE, we now study estimating the explicit \emph{solution} to the PDE. 
Specifically, we want a model that easily generalizes to solutions for different parameters in one shot. 
From the previous section, we can apply importance sampling to compute the solution for different $\mu^{(i)}$'s while reusing the sample paths. 
However, we noted that this incurs an exponential error for the integration in time when computing with the Euler-Maruyama method. 
The key idea of the operator learning approach is to instead learn an optimal integrator through a neural network for a parametric family of drifts. 
%Girsanov's theorem provides an alternative estimator of the expected value of functions of SDEs solutions. 
%While Girsanov's theorem applies to SDEs with drift that satisfies Novikov's condition (see appendix \ah{ add in appendix}), numerical approximation of the exponential martingale $\frac{d\mathbb{P}_{X_T}}{d\mathbb{P}_{Y_T}}$ through the Euler-Maruyama method suffers from exponentially growing errors. 
%We instead focus on a parametric family of drift functions with parameters in some compact subset of $\mathbb{R}^k$ that we try to learn the optimal numerical approximation for. 
%We conjecture that by reducing the drift function to a smaller family, it is possible to find alternative estimators that do not require taking the exponential and have linearly growing errors. 
%Specifically, we consider drift functions $\mu$ in the form of 
%\begin{equation}
 %   \label{eq:mu_fam}
%    \mu(x,t) = \sum_{n_{\mu}=1}^{N_{\mu}} c_{n_{\mu}} u_{n_{\mu}}(x,t)
%\end{equation}
%where $\{u_{n_{\mu}}\}_{n_{\mu}=1}^{N_{\mu}}$ are basis functions and $\{c_{n_{\mu}}\}_{{n_{\mu}=1}}^{N_{\mu}}$ are coefficients for the basis functions. 
We refer to this as the Neural Girsanov Operator ($\ngo$), which transforms the expected value of solutions of SDEs $Y_t$ with meta-parameters $(p_0, \mu_0, \sigma_0)$ to SDEs $X_t$ with drift $\mu^{(i)}$.
We will consider the case where $\mu_0 = 0, \sigma_0 = 1$ for ease of exposition.
%We consider the unit volatility case for ease of exposition, where $Y_t=W_t$, but note that the framework applies to the case with non-unit volatility by applying Ito's lemma, which we expand upon later. 
%We additionally consider the case $r(t,x)=0$. 
This approximation of $\mathbb{E}[p_0(X_T)\mid \mathcal{F}_T]$ is done by equating the following:
\begin{align*}
    &\mathbb{E}_{\mathbb{P}_X}[p_0(X_T) \mid \mathcal{F}_T] \approx \\ &\mathbb{E}_{\mathbb{P}_Y}\left [p_0(Y_T)\ngo\left(\{\mu(Y_{s_n}), \Delta W_{s_n}, h \}_{n=1}^{N_T} ; \theta\right ) \; \Big | \; \mathcal{F}_T\right ] 
\end{align*}
where $\ngo(\cdot ; \theta)$ is a neural network with non-negative outputs with parameters $\theta$. 
For a parametric set of drifts $\{\mu^{(i)}( \cdot ; \xi_i) \}$, the $\ngo$ learns the optimal numerical integrator over the parameter space.
We parameterize $\ngo$ using a convolutional neural network, motivated by the connection between finite difference stencils and convolutions, although the integration performed by $\ngo$ involves non-linear terms.
To optimize the parameters of $\ngo$, we consider the following optimization over the measure $\nu(\xi)$ of all parameterizations of drifts $\mu$ we are interested in solving:
\begin{align}
\nonumber \min_{\theta} \mathbb{E}_{\mu_\xi } \Bigg (  &\mathbb{E}\Big[ p_0(Y_T) \ngo \big(  \{\mu_\xi (Y_{s_n}), \Delta W_{s_n}, h \}_{n =1}^{N_T}; \\ \theta \big)  \Big] &-  \mathbb{E} \left [  p_0(Y_T) \frac{\mathrm{d} \mathbb{P}_{X_{\mu_\xi}}}{\mathrm{d} \mathbb{P}_Y} \right] \Bigg)^2.
\label{eq:loss_ngo}
\end{align}
The loss in~\eqref{eq:loss_ngo} has a few helpful properties.
First, this loss only requires sampling Brownian motion and approximating the integral in~\eqref{eq:exp}.
This property is a notable departure from the usual requirement of the exact solution and parameter values --- we do not need to solve for the explicit solution of the PDE using Euler-Maruyama but only need to sample Brownian motion and approximate an integral.
Second, although the second term incurs high numerical error, since we average over many realizations, this error does not affect the approximation of the solution.
Finally, this solution does not require a meshing of the domain and provides solutions depending on the starting points of the Brownian motion, which can be arbitrary.
This attribute is vital for evaluating solutions over complex domains.
\autoref{alg:gir} describes the full algorithm.

\begin{algorithm}
\caption{Approximating linear parabolic PDEs with $\ngo$} \label{alg:gir}
\begin{algorithmic}[1]
    \Require $N \in \mathbb{N}$, $h \in \mathbb{R}_+$, $\mu(t, x) : \mathbb{R}_+ \times \mathbb{R}^d \to \mathbb{R}^d$, evaluation coordinate $(T, X)$ 
    \State Sample $N$ Brownian motions to time $T$ starting at $X$, $\left \{W_k^{(i)} = X + \sqrt{h} \sum_{j=1}^{k} \varepsilon_j^{(i)}\right\}_{k =1\ldots T/h }^{i=1 \ldots N}$, $\varepsilon_j^{(i)} \sim \mathcal{N}(0,1)$ i.i.d.
    \For{$i \in \{1, \ldots, N\}$} \Comment{Easy to parallelize.}
        %\State Compute $\frac{\mathrm{d} \mathbb{P}_\mu^{(i)}}{\mathrm{d} \mathbb{P}_W} \approx \sum_{k=1}^{T/h} \mu(k, W_k^{(i)}) \sqrt{k} \varepsilon^{(i)} - \frac12 \sum_{k=1}^{T/h} \mu^2(k, W_k^{(i)}) h $ 
        \State Compute $\frac{\mathrm{d} \mathbb{P}_\mu^{(i)}}{\mathrm{d} \mathbb{P}_W} \approx \ngo\left[ \left\{\mu\left( W_{k }^{(i)} \right) \right\}_{k=1}^{ T/h}, \left\{\sqrt{h}\varepsilon_k^{(i)} \right\}_{k=1}^{T/h},h\right] $ \Comment{Stochastic exponential.} 
        
    \EndFor 
    \Ensure Approximation of $u(T,X)$ as $\check{u}(T,X) = \frac1N \sum_{i=1}^N p_0(W_T^{(i)}) \frac{\mathrm{d} \mathbb{P}_\mu^{(i)}}{\mathrm{d} \mathbb{P}_W}$
\end{algorithmic}
\end{algorithm}

\paragraph{Extension to semi-linear parabolic PDEs} 
We presented the method in terms of linear parabolic PDEs. 
However, extending to semi-linear parabolic PDEs is relatively straightforward by again using the stochastic representation of such PDEs. 
We can consider equations of the form:
\begin{equation}
\begin{cases}
    \frac{\partial p}{\partial t}+\frac{1}{2}\mathrm{Tr}(\sigma\sigma^\top(t,x) \mathrm{Hess}_x p(t,x))+\nabla_x p(t,x)^\top \mu(t,x) \\ \quad +\phi(x,p,\sigma^\top \nabla p)=0,\\
     p(T,x)=g(x).
\end{cases}
\label{eq:nonlin_pde}
\end{equation}
This PDE has a stochastic representation:
\begin{align*}
    \d X_t &= \mu(t,X_t) \d t+\sigma(t,X_t)\d W_t, & X_0 &= x; &  \\
     \d S_t&=-\phi(X_t,p,\sigma^\top \nabla_x p) \d t +Z_t^\top \d W_t, & S_T &=g(X_T), 
\end{align*}
and $p(t,x) = \mathbb{E}[S_t \mid X_0 = x]$~\citep{EXARCHOS2018159,doi:10.1080/07362990903546405}.
Simulating this system requires computing two Euler schemes: one for the forward component $X_t$ and the other for the backward process $S_t$.
We can easily follow the scheme for the forward component $X_t$ presented in~\autoref{alg:gir} by sampling Brownian motion and computing an expectation with the estimated exponential martingale.
If $\phi$ does not depend on $p$, we can compute the integration of $S_t$ using a basic sum without requiring sequential computations.
We provide the complete algorithm in Appendix~\ref{sec:alg_semi_linear}.% \xx{write the full algorithm}.

%\vspace{-0.12in}
\paragraph{Extension to elliptic PDEs}
Finally, we note that extending to elliptic PDEs is also relatively straightforward. 
Elliptic PDEs require computing the first hitting time of the domain boundary $\partial \mathcal{D}$ at each evaluation point. 
Then, using the first hitting times, the importance sampling follows. 
Specifically, we modify the stochastic exponential in~\eqref{eq:exp} to be:
$
\frac{\mathrm{d} \mathbb{P}_Y}{\mathrm{d} \mathbb{P}_X} := \exp\left( \int_0^{\tau} \mu(X_s) \mathrm{d}W_s - \frac12 \int_0^{\tau} \mu(X_s)^2 \mathrm{d} s \right)
$
where $\tau$ is the first hitting time of $\partial \mathcal{D}$ starting at $x$.
%$\eta(\cdot)$ is cast as a weighted stochastic integrator, tuned to be optimal for one set of basis functions. $\eta$ is designed to be general to all $\mu$ in the family of equation \ref{eq:mu_fam}. We train the NGO framework using PDEs with different drift functions from the family. To bypass the usage of Euler-Maruyama in the training, we train with PDE solutions where the terminal time $T$ is small, and can be approximated accurately with Girsanov's transformation. Specifically, sample $m$ drift functions $\{\mu_m\}_{m=1}^M$, we solve the following optimization problem:

%\begin{equation}
%\label{eqn:loss}
%\begin{aligned}
%    &\arg\min_{\theta} E_{\mu}\|E_{W_t}[p_0(Y_T)\text{NGO}(\{\mu(Y_{s_n}), \Delta B_{s_n},h_n\}_{n_T=1}^{N_T},\theta)|\mathcal{F}_T]-E_{W_t}[p_0(Y_T)\frac{d\mathbb{P}_{X_T}}{d\mathbb{P}_{Y_T}}|\mathcal{F}_T]\|\\
 %   \approx &\sum_{m=1}^M\|\frac1N \sum_{n=1}^N[p_0(Y_T^{n})\text{NGO}(\{\mu(Y_{s_n}), \Delta B_{s_n},h_n\}_{n_T=1}^{N_T},\theta)|\mathcal{F}_T]-\frac1N\sum_{n=1}^N[p_0(Y_T^n)\frac{d\mathbb{P}_{X_{T,m}^n}}{d\mathbb{P}_{Y_T^n}}|\mathcal{F}_T]\|,
%\end{aligned}
%\end{equation}



%\begin{figure}
%\centering
%\subfloat[Solution error of NGO, DeepONet~\citep{Lu2019LearningNO}, and computation with Girsanov (\autoref{alg:gir}) on linear parabolic PDEs. The y-axis is the magnitude of the loss normalized by the magnitude of solution calculated by Euler-Maruyama.]{\includegraphics[width=0.48\textwidth]{figs/10_loss_fk.pdf}\label{fig:ngo_loss}}\hskip1ex
%\subfloat[Wall-clock times of NGO, DeepONet, Girsanov, and direct Euler-Maruyama for computing solution to linear parabolic PDE.
%]{\includegraphics[width=0.48\textwidth]{figs/10_time_fk.pdf}\label{fig:ngo_time}}
%\caption{Comparision of the losses and inference times of NGO, DeepONet, Girsanov calculation, and Euler-Maruyama on linear parabolic PDEs. NGO achieves the better accuracies than Girsanov and DeepONet, and is significantly faster than Euler-Maruyama.}
%\end{figure}
%\vspace{-0.1in}
\section{Properties of the estimators}
%\vspace{-0.1in}
Since the meta-learning framework follows from the stochastic representation of this class of PDEs, theoretical analysis is particularly amenable in contrast to other black-box methods that only use neural networks. 
We discuss the error rate of the $\mathrm{ELBO}$ in~\eqref{eq:elbo_is} and the convergence rate of the $\ngo$ over a parametric space of solutions. 
%We provide detailed proof of the results in Appendix~\ref{sec:error_proof}, \ref{sec:uniform}.
%\jie{this sentence is hard to understand}

%\vspace{-0.12in}
\subsection{Error analysis}
\label{sec:error_analysis}
%\vspace{-0.12in}

We first note that the proposed algorithm induces a tradeoff between memory and execution time since we save the Brownian motions underlying the importance sampling. 
Saving the Brownian motions is a minor constraint since the Brownian motions saved are only $\mathcal{O}(N_E \times d)$ where $N_E$ is the number of Monte Carlo samples used to estimate the expectation, and $d$ is the number of dimensions. %This is a minor constraint \jie{undefined term}
Additionally, these can be distributed over multiple devices, as no communication between nodes is needed when computing the expectation.
We also analyze the approximation error of both $\mathrm{ELBO}_{\mathrm{IS}}$ and $\mathrm{ELBO}_{\mathrm{direct}}$ presented in Section~\ref{sec:meta_gen_formulation}. % \jie{introduced in xxx}. 

%\jie{present the remark as a theorem?}
\begin{comment}
\begin{proposition}[Order of Error]
\label{prop:error_order}
Approximating the $\mathrm{ELBO}_{\mathrm{IS}}$ and $\mathrm{ELBO}_{\mathrm{direct}}$ terms with Euler-Maruyama using step size $h$ will both induce an error of the same order.
\end{proposition}
The proof follows the statement provided in~\citet{BallyTalay1995}.
The full statement is in Appendix~\ref{sec:error_proof}.
\end{comment}
Although $\mathrm{ELBO}_{\mathrm{IS}}$ introduces more errors than $\mathrm{ELBO}_{\mathrm{direct}}$ by having more integration terms, they are all of at least order $\mathcal{O}(h^2)$. 
Employing a multi-level architecture based on the multi-level Monte Carlo can improve the accuracy further under a similar computational budget~\citep{giles2015multilevel}.

\subsection{Uniform convergence over drift parameters}
%\vspace{-0.1in}

A final property of the $\ngo$ concerns the convergence rate over a family of solutions to PDEs with parameter $\mu(x,t)$ and $\sigma(t)$ dependent only on $t$. 
Specifically, by using the properties of the stochastic representation, we can show that a well-learned $\ngo$-based solution $p_\theta^{\xi}(x)$ uniformly converges over the parameter space $\Xi$ to the ground truth $p^\xi(x)$ under mild conditions.
Intuitively, since the $\ngo$ learns how to compute the likelihood ratio, we can change the parameters within a compact set while maintaining high performance over this set.

\begin{proposition}[Uniform Convergence]
\label{prop:unif}
For fixed $x \in \mathcal{D}, T \in \mathbb{R}_+$, consider a space of functions $\mathcal{F} = \left \{ \d \mathbb{P}_{X_T^{(\xi)}}/\d \mathbb{P}_{Y_T} : \xi \in \Xi \right \}$ parameterized by $\xi$ from a compact set $\Xi \subset \mathbb{R}^k$ satisfying $\mathrm{Var}\left(\d \mathbb{P}_{X_T^{(\xi)}}/\d \mathbb{P}_{Y_T}\right) < \infty$ for all $\xi \in \Xi$ with $\mathbb{P}_{X_T^{(\xi)}}$ denoting the distribution of the solution $ X_T = x + \int_0^T \mu(X_t, t; \xi) \d t + \int_0^T \sigma(t) \d W_t$ and $\mathbb{P}_{Y_T}$ the distribution of $Y_T = x + \int_0^T \sigma(t) \d W_t$. 
Additionally, assume that the image of $(T, X_T) \mapsto \mu(T,X_T;\xi)$ is compact for all $X_T, \xi$.
Then, %in the limit as $h\to 0$ \jie{it has to be related to the rate of $n$. needs more rigorous statement} and for fixed $x \in \mathcal{X}$,
$
\mathbb{G}_{N_E} = \sqrt{N_E} \left(p_\theta^{\xi}(x, T) - p^\xi(x, T) \right)
$
converges in distribution to a zero-mean Gaussian process over $\xi \in \Xi$ as $N_E \to \infty$ where $N_E$ is the number of samples used to compute the expectation. 
\end{proposition}
The proof first shows that $\mathcal{F}$ is $\mathbb{P}_{Y_T}$-Donsker and then analyzes our construction of the solution $p^{\xi}_\theta$ in terms of an expectation.
The complete statement is in Appendix~\ref{sec:uniform}.

 
%\vspace{-0.1in}
\section{Experiments}
%\vspace{-0.1in}

We now examine the capabilities of the models in their respective tasks.
First, we illustrate a proof-of-concept experiment on maximizing the solution of a PDE over its parameters by estimating $K$ target distributions in a generative modeling setting.
Then, we present our main experiments on solving parabolic PDEs. 
We simulate the sample paths with the basic Euler-Maruyama solver for all experiments.
\begin{figure}[ht!]
    \begin{subfigure}{0.23\textwidth}
        \centering
\includegraphics[width=\textwidth, trim=20pt 60pt 60pt 10pt]{figs/meta_reg.pdf}%{./Pictures/mainscreen1.png}
    \caption{Sampling $2d$ (left) and $100d$ (right) Gaussians with different means starting from a standard Gaussian versus $p_\mathrm{meta}$ and no meta-drift versus $\mu_\mathrm{meta}$.}
    \label{fig:max_pde}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.23\textwidth}
    \centering
    \includegraphics[width=\textwidth, trim=20pt 40pt 10pt 50pt]{figs/2d_kde.pdf}
    \caption{Comparing $p_\mathrm{meta}$, a standard Gaussian, and the target task distributions. $p_\mathrm{meta}$ provides the best initial condition for all the target distributions.}
    \label{fig:meta_dist}
    \end{subfigure}
    \caption{Numerical results for meta-learning generative models for Gaussian distributions where different means correspond to different tasks.}
    \label{fig:meta_learning_exp}
\end{figure}
%\vspace{-0.12in}

%\begin{figure*}
%    \centering
%    \begin{subfigure}[b]{0.4\textwidth}
%        \centering
%        \includegraphics[width=\textwidth]{figs/meta_reg.pdf}
%        \caption{Bits/Dim on test distributions after training forward SDE for three epochs on 2-$d$ (left) and 100-$d$ (right) Gaussian with different means.}
%        \label{fig:max_pde}
%    \end{subfigure}
%    \hfill
%    \begin{subfigure}[b]{0.4\textwidth}
%        \includegraphics[width=\textwidth]{figs/2d_kde.pdf}%{./Pictures/mainscreen1.png}
%        \caption{Comparing $p_\mathrm{meta}$, standard Gaussian, and task distributions.}
%        \label{fig:meta_dist}
%    \end{subfigure}
%    \caption{Comparison of test Bits/Dim on different initial distribution of the SDE.}
%    \vspace{-15pt}
%    \label{fig:gen_model}
%\end{figure*}

\begin{figure*}[ht]
    \centering
    \begin{subfigure}[b]{0.32\textwidth}
        \centering
        \includegraphics[width=\textwidth, trim=20pt 10pt 10pt 40pt]{figs/10_loss_fbfk.pdf}
        \caption{$10d$ linear (left) and semi-linear (right) parabolic PDEs. }
        \label{fig:ngo_loss_time}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.32\textwidth}
        \includegraphics[width=\textwidth, trim=20pt 10pt 10pt 30pt]{figs/ablation_loss_fbfk.pdf}%{./Pictures/mainscreen1.png}
        \caption{Varying the dimensionality at $T=0.25$ for linear (left) and semi-linear (right) parabolic PDEs.}
        \label{fig:vary_d}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.32\textwidth}
\includegraphics[width=\textwidth, trim=20pt 10pt 10pt 30pt]{figs/10_loss_exact.pdf}
        \caption{Normalized errors for $10d$ HJB (top left), BSB (top right), BS (bottom left), and FP (bottom right). }
        \label{fig:ngo_loss_exact}
        
    \end{subfigure}
    \caption{Comparison of the normalized errors and inference times of $\ngo$, DeepONet (DON), Girsanov (GIR), and Euler-Maruyama (E-M) on linear and semi-linear parabolic PDEs.}
    %\vspace{-18pt}
    \label{fig:random_basis}
\end{figure*}

%\vspace{-0.12in}
\subsection{Maximizing parameters}
%\vspace{-0.12in}

To illustrate the running example on generative modeling, we consider maximizing the solution of the Fokker-Planck PDE (corresponding to the likelihood of a generative model) at a terminal time using~\eqref{eq:elbo_is}.
Consider the problem of approximating $K$ Gaussian target distributions with different means and the same covariance matrix.
We are interested in investigating the sample quality in the few-shot learning setting with and without the meta-learned parameter by training $K$ separate drift and diffusion functions $\{\mu_{i}, \sigma_{i}\}_{i=1}^K$ on $K$ different target distributions.
We represent the meta-parameter $p_\mathrm{meta}$ as a small normalizing flow, which we optimize over $\kappa < K$ training distributions similar to the $K$ target distributions.
To sample the $i^\text{th}$ target distribution parameterized by $\mu_{i}$ and $\sigma_{i}$, we first sample from the initial distribution (either a standard Gaussian or the meta-learned $p_\mathrm{meta}$) and evolve the SDE to the terminal time according to $\mu_{i}$ and $\sigma_{i}$.
Figure~\ref{fig:max_pde} shows the test distribution bits/dim for the $100$ target distributions compared to the $2$-Wasserstein distance between the initial and target distributions. 
Figure~\ref{fig:meta_dist} visualizes $p_\mathrm{meta}$, standard Gaussian, and sampled target distributions in the $2d$ case.
The results demonstrate the importance of including the meta-learned parameter in the optimization to improve generalization.

%We compare the performance of 
%Using the formulation in section \ref{sec:meta_gen_formulation}, we train a forward SDE starting from a prior $p_0$ set as a standard Gaussian and as a meta-learned prior represented with a normalizing flow. 
%As shown in Figure~\ref{fig:max_pde}, on 100-dimensional Gaussian target distributions, few-shot learning losses decrease as the Wasserstein distance between the prior and the target distribution decreases. 
%As shown in Figure~\ref{fig:comp_2d_gauss}, empirical results suggest using a meta-learned prior reduces the influence of Wasserstein distance and improves the few-shot learning performance. 

%\begin{figure}
%\centering
%\subfloat[Test Bits/Dim after training forward SDE for three epochs on 100-dimensional Gaussian with different means. The prior is a standard 100-dimensional Gaussian. We plot the test Bits/Dim against the Wasserstein distance between the standard Gaussian and every dataset. We notice a positive correlation between the Wasserstein distance and the Bits/Dim. The Pearson correlation test indicates that this has a positive correlation factor of 0.9976, with a p-value of 1.48E-10.]{\includegraphics[width=0.4\textwidth]{bpd_toy_fokker_planck_100d.png}\label{fig:wass_100_gauss}}\hskip1ex
%\subfloat[Test Bits/Dim after training forward SDE for three epochs on 2-dimensional Gaussian with different means. Using a meta-prior significantly improves the few-shot learning performance and reduces the influence of the Wasserstein distance between the prior distribution and the target distribution on training performances.]{\includegraphics[width=0.4\textwidth]{comparison_gen.png}\label{fig:comp_2d_gauss}}

%\caption{Few-shot learning training dynamics for parabolic PDE based generative modeling.}
%\label{fig:gen_opt}
%\end{figure}


\begin{comment}
\begin{wrapfigure}{r}{0.5\textwidth} 
\vspace{-45pt}
  \begin{center}
    \includegraphics[width=0.5\textwidth, trim=20pt 60pt 20pt 30pt]{figs/meta_reg.pdf}%{./Pictures/mainscreen1.png}
    \caption{Few-shot learning performance on $2d$ (left) and $100d$ (right) Gaussian with different means using forward SDE starting from $p_\mathrm{meta}$ and standard Gaussian.} %The prior is a standard 2-dimension and a 100-dimensional Gaussian. We plot the test Bits/Dim against the Wasserstein distance between the standard Gaussian and every dataset. We notice a positive correlation between the Wasserstein distance and the Bits/Dim.}
    %The top row shows the magnitude of the loss normalized by the magnitude of the solution calculated by Euler-Maruyama, and the bottom row shows the inference times. The left column represents results on linear parabolic PDEs, and the right column represents results on semi-linear parabolic PDEs.}
    \label{fig:max_pde}
  \end{center}
  \vspace{-50pt}
\end{wrapfigure} 

\begin{wrapfigure}{r}{0.4\textwidth} 
\vspace{-40pt}
  \begin{center}
    \includegraphics[width=0.4\textwidth, trim=30pt 40pt 10pt 10pt]{figs/2d_kde.pdf}
    \caption{Comparing $p_\mathrm{meta}$, standard Gaussian, and task distributions.}
    \label{fig:meta_dist}
  \end{center}
  \vspace{-10pt}
\end{wrapfigure}
\end{comment}

%\vspace{-0.12in}
\subsection{Operator learning}
%\vspace{-0.12in}
%\begin{figure}
%  \begin{center}
        %\includegraphics[width=0.4\textwidth, trim=20pt 10pt 10pt 30pt]{figs/10_loss_exact.pdf}
%        \caption{Normalized errors for $10d$ HJB (top left), BSB (top right), BS (bottom left), and FP (bottom right). }
%        \label{fig:ngo_loss_exact}
    %The top row shows the magnitude of the loss normalized by the magnitude of the solution calculated by Euler-Maruyama, and the bottom row shows the inference times. The left column represents results on linear parabolic PDEs, and the right column represents results on semi-linear parabolic PDEs.}
%    \label{fig:ngo_ablation_dim}
%  \end{center}
 % \vspace{-0.12in}
%\end{figure} 

We consider examples of the $\ngo$ on operator learning tasks by testing on a few PDEs. 
We first consider a linear and semi-linear PDE, with parametric classes of the function $\mu$ in the linear PDE~\eqref{eq:parabolic} and $\mu,\,h$ in the semi-linear PDE \eqref{eq:nonlin_pde}.
Figure~\ref{fig:ou_1d} in the Appendix shows a visualization of solutions of a $1d$ Fokker-Planck equation calculated with the analytical solution, $\ngo$, Euler-Maruyama, and directly with Girsanov.
We then consider canonical parabolic PDEs consisting of two linear equations---the Black-Scholes (BS) and the Fokker-Planck (FP) equations---and two semi-linear equations---the Hamilton-Jacobi-Bellman (HJB) and the Black-Scholes-Barenblatt (BSB) equations. 
These equations have applications in finance (BS, BSB), stochastic control (HJB), and the previously explored probabilistic modeling (FP).
We present detailed definitions of the PDEs in Appendix~\ref{sec:canonical}.
For this study, we compare against the DeepONet operator learning architecture~\citep{Lu2019LearningNO} (DON) with a similar model size, naively applying the change-of-measure (GIR), and the direct simulation with Euler-Maruyama (E-M). 
Note that the E-M method provides a strong baseline that encompasses many techniques in the related work (e.g.,~\citealp{berner2020numerically, glau2022deep, richter2022robust}), so we use this as a baseline for the existing deep learning methods based on Feynman-Kac. 
We compare computation time and accuracy between the different methods for estimating the solution under different $\mu$ and $h$ functions. 
When analytical solutions do not exist, we consider E-M with a substantial $N_E, N_T$ as the ground truth. 

%\vspace{-0.12in}
\paragraph{PDEs with defined basis} 
%\label{sec:pde_basis}
In the linear case, we consider second-order polynomials 
\begin{equation*}
    \mu^{(i)}(x) \in \left\{ \sum_{j=0}^2 c_j x^j \mid c_j \sim \mathcal{U}(0,1) \right \}.
\end{equation*}
For the semi-linear equation, we also test changing the backward drift $\phi$ in~\eqref{eq:nonlin_pde} by considering basis functions given by 
\begin{align*}
    \phi_i(t, x, s, z) \in  \{ c_1 \sum \sin(x_i) + c_2 \sum z_i^2 + c_3 \cos(t + s) \\ \mid c_i \sim \mathcal{U}(0,1)  \}.
\end{align*}
We set the parameters $r=0$ and $\sigma=1$ for these experiments. 
We randomly sample from these function classes during training and then evaluate on a different test set of functions. 
The results are illustrated in Figure~\ref{fig:random_basis} and Table~\ref{tab:ngo} for these two equations with Figure~\ref{fig:ngo_loss_time} presenting the error and computation at different terminal times and Figure~\ref{fig:vary_d} considering the error and inference time at various dimensions. 
%For the 10-d equations in Figure~\ref{fig:ngo_loss_time}, linear parabolic case, $\ngo$ model contains 53.71K parameters, DON model contains 90.51K parameters; for the semi-linear parabolic case, $\ngo$ model contains 66.02K parameters, DON model contains 90.51K parameters. %Additional ablation study on dimension and number of sample paths can be found in Appendix~\ref{sec:ablation}.
The proposed $\ngo$ has high accuracies while maintaining small computation times in all the tested regimes.

%\vspace{-0.12in}
\paragraph{Canonical parabolic PDEs}
We test the generalization capabilities of NGO and DON models trained in the previous section on four canonical parabolic PDEs previously mentioned (BS, FP, HJB, and BSB) in $10d$. 
Since an exact solution is known for these equations, we compare $\ngo$, DeepONet, and E-M to the analytical solution (presented in Appendix~\ref{sec:canonical}).
Note that for the BS and BSB equations, a change in the volatility function $\sigma$ occurs.
Results on the normalized error are in Figure~\ref{fig:ngo_loss_exact}. 
The proposed $\ngo$ achieves low errors across all four tested canonical PDEs again. 
Additional ablations are available in Appendix~\ref{sec:ablation}. 
%\xx{We say something like "NGO achieves the lowest error among the three methods", or not needed maybe?}
\begin{comment}
\begin{table}[hbt!]
\begin{tabular}{llllll}
    & \# Param. & Loss & Std. Err. & Inf. Time \\ \toprule
NGO & 15.7K  & $4.50(0.05)${\footnotesize${ \times 10^{-2}}$}    & $4.7${\footnotesize${ \times 10^{-4}}$}    & 0.027s   \\
FNO & 19.9K  & 4.44(0.16){\footnotesize${ \times 10^{-2}}$}    & 1.6{\footnotesize${ \times 10^{-3}}$}     & 0.074s   \\ \midrule
NGO & 18.8K  & 5.1(0.11){\footnotesize${ \times 10^{-2}}$}    & 1.1{\footnotesize${ \times 10^{-3}}$}    & 0.018s   \\
FNO & 2.0M   & 6.1(0.02){\footnotesize${ \times 10^{-2}}$}    & 2.4{\footnotesize${ \times 10^{-4}}$}     & 0.114s  
\end{tabular}
\caption{$\ngo$'s and Fourier Neural Operator(FNO)'s  performances on one-dimensional (first two rows) and two-dimensional (last two rows) linear parabolic equations. We calculate the normalized losses and standard errors with five independent training episodes.}
\label{tab:ngo}
\vspace{-0.12in}
\end{table}
\end{comment}
\begin{table}[hbt!]
\centering
\begin{tabular}{llll}
\toprule
    & \# Param. & Loss  & Inf. Time (s) \\ \midrule
NGO & 15.7K  & 4.50(0.05){\footnotesize${ \times 10^{-2}}$}    & 2.7{\footnotesize${ \times 10^{-2}}$}   \\
FNO & 19.9K  & 4.44(0.16){\footnotesize${ \times 10^{-2}}$}    & 7.4{\footnotesize${ \times 10^{-2}}$}   \\ \midrule
NGO & 18.8K  & 5.10(0.11){\footnotesize${ \times 10^{-2}}$}    & 1.80{\footnotesize${ \times 10^{-2}}$}  \\
FNO & 2.0M   & 6.10(0.02){\footnotesize${ \times 10^{-2}}$}   & 1.14{\footnotesize${ \times 10^{-1}}$}  \\ \bottomrule
\end{tabular}
\caption{$\ngo$ and Fourier Neural Operator (FNO) performances on one-dimensional (first two rows) and two-dimensional (last two rows) linear parabolic equations. We calculate the normalized losses and standard errors with five independent training episodes.}
\label{tab:ngo}
%\vspace{-0.12in}
\end{table}

%\vspace{-0.12in}
\section{Discussion}
%\vspace{-0.12in}
We proposed a method for solving problems related to parabolic PDEs based on their stochastic representation.
We treat the parameters of parabolic PDEs' stochastic representation as the meta-learned parameters shared across all tasks and calculate task-specific solutions with them. 
This structure allows application in optimizations under different scenarios and solving PDEs with distinct parameters through the $\ngo$. 
Empirical results indicate that $\ngo$ provides a sizable advantage in computation time and accuracy compared to baselines.
%One can extend this paper in a variety of ways. 
%In regards to the architecture, applying techniques from multi-level Monte Carlo can further reduce the computational error~\citep{giles2015multilevel}.
%Extending the work to additional families of parabolic PDEs with more complicated interactions should also be considered.
%For example, the extension to elliptic PDEs is fairly straightforward by changing the terminal integration time $T$ to the first-hitting time of the boundary.
%Finally, additional theoretical analysis regarding the norm of the exponential martingale when taking the difference between the meta-learned parameters could improve the theoretical convergence.

%\vspace{-0.12in}
\paragraph{Limitations}
Theoretically, if the target drift has a large magnitude, the variance of the stochastic exponential can be high, which may lead to numerical instabilities. 
In this case, the direct Euler-Maruyama approach may be beneficial for training the neural operator. 

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
This work was supported in part by the Office of Naval Research (ONR) under grant number N00014-21-1-2590.
AH was supported by NSF-GRFP.
\end{acknowledgements}

% References
\bibliography{uai2024-template}

\newpage

\onecolumn

\title{Base Models for Parabolic Partial Differential Equations\\(Supplementary Material)}
%\maketitle

\appendix
\section{Additional simulation results}
\begin{proposition}
    Approximating the $\mathrm{ELBO}_{\mathrm{IS}}$ and $\mathrm{ELBO}_{\mathrm{direct}}$ terms with Euler-Maruyama using step size $h$ will both induce the error $-h\int_0^T\mathbb{E}[\psi_e(s,X_s)]\,\mathrm{d}s+\mathcal{O}(h^2)$, where 
\begin{align*}
    \psi_e(t,x)&=\frac{1}{2}\sum_{i,j=1}^d \mu^i(t,x) \mu^j(t,x)\partial_{x_ix_j}p(t,x)
    +\frac{1}{2}\sum_{i,j,k=1}^d \mu^i(t,x)a_k^j(t,x)\partial_{x_ix_jx_k}p(t,x)\\
    &+\frac{1}{8}\sum_{i,j,k,l=1}^d a_j^i(t,x)a_l^k(t,x)\partial_{x_ix_jx_kx_l}p(t,x)+\frac{1}{2}\frac{\partial^2}{\partial t^2}p(t,x)\\
    &+\sum_{i=1}^d \mu^i(t,x)\frac{\partial}{\partial t}\partial_{x_i} p(t,x)+\frac{1}{2}\sum_{i,j=1}^d a_j^i(t,x)\frac{\partial}{\partial t}\partial_{x_ix_j}p(t,x)
\end{align*}
and
$a(t,x)=\sigma(t,x)\sigma^\top(t,x)$.
\end{proposition}

\begin{proof}
For this section, we do not consider the Monte Carlo error due to the finite number of samples $N_E$ and focus only on the integration error. 
We analyze the approximation error of $\text{ELBO}_{\mathrm{direct}}$ and $\text{ELBO}_{\mathrm{IS}}$ using the error bound introduced by~\citet{doi:10.1080/07362999008809220}. Consider the following SDE:
\begin{align*}
    \d X_t=\mu(X_t,t)\d t+\sigma(X_t,t)\d W_t
\end{align*}
We want to estimate the error of $\E[f(X_T)]$, where we evaluate $X_T$ through the Euler-Maruyama scheme. 
Define $h=T/N$ as the step size in the Euler-Maruyama scheme and denote $X_T^h$ as the approximated $X_T$ using step size $h$. 
Following~\citet{doi:10.1080/07362999008809220}, we define the error term $err(T,h)=\E [f(X_T)] -\E[f(X_T^h)]$.

\citet{doi:10.1080/07362999008809220} proved that 
\begin{align}
    err(T,h)=-h\int_0^T \E[ \psi_e(s,X_s)]\d s+\mathcal{O}(h^2)
\end{align}
where 
\begin{align*}
    \psi_e(t,x)&=\frac{1}{2}\sum_{i,j=1}^d \mu^i(t,x)\mu^j(t,x)\partial_{x_ix_j}u(t,x)\\
    &+\frac{1}{2}\sum_{i,j,k=1}^d \mu^i(t,x)a_k^j(t,x)\partial_{x_ix_jx_k}u(t,x)\\
    &+\frac{1}{8}\sum_{i,j,k,l=1}^d a_j^i(t,x)a_l^k(t,x)\partial_{x_ix_jx_kx_l}u(t,x)+\frac{1}{2}\frac{\partial^2}{\partial t^2}u(t,x)\\
    &+\sum_{i=1}^d \mu^i(t,x)\frac{\partial}{\partial t}\partial_{x_i} u(t,x)+\frac{1}{2}\sum_{i,j=1}^d a_j^i(t,x)\frac{\partial}{\partial t}\partial_{x_ix_j}u(t,x)
\end{align*}
and $a(t,x)=\sigma(t,x)\sigma^T(t,x)$.

We will assume that $\sigma = 1$ for ease of analysis. 
For the $\mathrm{ELBO}$, we have an additional integral we must approximate related to the divergence of the drift term.

Recall that
\begin{align*}
    &\mathrm{ELBO}_{\mathrm{direct}} = \mathbb{E}_{\mathbb{P}_{X_t}}\left [\int_0^T\nabla\cdot \mu(s,X_s
    )\d s+\log p_0(X_T)\mid X_0=x \right ]\\
    &\mathrm{ELBO}_{\mathrm{IS}} = \mathbb{E}_{\mathbb{P}_{Y_t}}\left [\int_0^T\mu(s,Y_s)\d W_s-\int_0^T\frac12\mu(s,Y_s)^T\mu(s,Y_s)-\nabla\cdot\mu(s,Y_s)\d s+\log p_0(Y_T) \mid Y_0=x\right]
\end{align*}
\begin{comment}

Assume the absolute value of the error in integration
\begin{align*}
    &\mathbb{E}[\int_0^T|\nabla\cdot\mu(s,X_s)-\nabla\cdot\mu(s,X_s^h)|ds|X_0=x]<\infty,\\
    &\mathbb{E}[\int_0^T|\frac12\mu(s,Y_s)^T\mu(s,Y_s)-\frac12\mu(s,Y_s^h)^T\mu(s,Y_s^h) -\nabla\cdot\mu(s,Y_s)+\nabla\cdot\mu(s,Y_s^h)|ds|Y_0=x]<\infty.
\end{align*}
By Fubini's theorem, we have
\begin{align*}
    \mathrm{Err}_{\mathrm{direct}} &= \mathbb{E}[\int_0^T\nabla\cdot\mu(s,X_s)-\nabla\cdot\mu(s,X_s^h)ds+\log p_0(X_T)-\log p_0(X_T^h)|X_0=x]\\
    &=\int_0^T\mathbb{E}[\nabla\cdot\mu(s,X_s)-\nabla\cdot\mu(s,X_s^h)]ds+\mathbb{E}[\log p_0(X_T)-\log p_0(X_T^h)|X_0=x]\\
    &= \int_0^T err (s,h) ds+\mathbb{E}[\log p_0(X_T)-\log p_0(X_T^h)|X_0=x]
\end{align*}
We have
\begin{align*}
    \mathrm{Err}_{\mathrm{direct}} &\leq \int_0^T err (T,h) ds+\mathbb{E}[\log p_0(X_T)-\log p_0(X_T^h)|X_0=x]\\
    &=T \times err(T,h)+\mathbb{E}[\log p_0(X_T)-\log p_0(X_T^h)|X_0=x]\\
    &=T \times err(T,h)+ err(T,h)\\
    &= (T+1) \times err(T,h)\\
    &= (T+1) \times (-h\int_0^T \E[ \psi_e(s,X_s)]\d s+\mathcal{O}(h^2))\\
    &= \tikzol[0pt]{\mathrm{Err}_{\mathrm{direct}}}.
\end{align*}

We also have 
\begin{align*}
    \mathrm{Err}_{\mathrm{direct}} &\geq \mathbb{E}[\log p_0(X_T)-\log p_0(X_T^h)|X_0=x]\\
    &=err(T,h)\\
    &= -h\int_0^T \E[ \psi_e(s,X_s)]\d s+\mathcal{O}(h^2)\\
    &=\tikzul[0pt]{\mathrm{Err}_{\mathrm{direct}}}.
\end{align*}

Both $\tikzol[0pt]{\mathrm{Err}_{\mathrm{direct}}}$ and $\tikzul[0pt]{\mathrm{Err}_{\mathrm{direct}}}$ are of order $\mathcal{O}(h^2)$, so $\mathrm{Err}_{\mathrm{direct}}$ is of order $\mathcal{O}(h^2)$.

We now analyze $\mathrm{Err}_{\mathrm{IS}}$. By Fubini's theorem, we have
\begin{align*}
    \mathrm{Err}_{\mathrm{IS}} &= \mathbb{E}[-\int_0^T\frac{\mu(s,Y_s)^T\mu(s,Y_s)-\mu(s,Y_s^h)^T\mu(s,Y_s^h)}{2}-\nabla\cdot\mu(s,Y_s)+\nabla\cdot\mu(s,Y_s^h)ds+\log p_0(Y_T)-\log p_0(Y_T^h)|Y_0=x]\\
    &=-\int_0^T\mathbb{E}[\int_0^T\frac{\mu(s,Y_s)^T\mu(s,Y_s)-\mu(s,Y_s^h)^T\mu(s,Y_s^h)}{2}-\nabla\cdot\mu(s,Y_s)+\nabla\cdot\mu(s,Y_s^h)|Y_0=x] ds \\
    &+ \mathbb{E}[\log p_0(Y_T)-\log p_0(Y_T^h)|Y_0=x]\\
    &= -\int_0^Terr(s,h)ds+ \mathbb{E}[\log p_0(Y_T)-\log p_0(Y_T^h)|Y_0=x]
\end{align*}
We have
\begin{align*}
    \mathrm{Err}_{\mathrm{IS}} &\leq \int_0^T err (T,h) ds+\mathbb{E}[\log p_0(X_T)-\log p_0(X_T^h)|X_0=x]\\
    &=T \times err(T,h)+\mathbb{E}[\log p_0(X_T)-\log p_0(X_T^h)|X_0=x]\\
    &=T \times err(T,h)+ err(T,h)\\
    &= (T+1) \times err(T,h)\\
    &= (T+1) \times (-h\int_0^T \E[ \psi_e(s,X_s)]\d s+\mathcal{O}(h^2))\\
    &= \tikzol[0pt]{\mathrm{Err}_{\mathrm{IS}}}.
\end{align*}

We also have 
\begin{align*}
    \mathrm{Err}_{\mathrm{IS}} &\geq \mathbb{E}[\log p_0(X_T)-\log p_0(X_T^h)|X_0=x]\\
    &=err(T,h)\\
    &= -h\int_0^T \E[ \psi_e(s,X_s)]\d s+\mathcal{O}(h^2)\\
    &=\tikzul[0pt]{\mathrm{Err}_{\mathrm{IS}}}.
\end{align*}

Both $\tikzol[0pt]{\mathrm{Err}_{\mathrm{IS}}}$ and $\tikzul[0pt]{\mathrm{Err}_{\mathrm{IS}}}$ are of order $\mathcal{O}(h^2)$, so $\mathrm{Err}_{\mathrm{IS}}$ is of order $\mathcal{O}(h^2)$.
\end{comment}

The error associated with the divergence is defined as
\begin{align*}
err_{\mathrm{div},t}
&= -h \int_0^t \E[ \psi_e(s, X_s)] \d s + \mathcal{O}(h^2)
\end{align*}
with a similar argument applying to the initial condition, i.e.,
\[
err_{p_0} = -h \int_0^T \E[ \psi_e(s, X_s)] \d s + \mathcal{O}(h^2).
\]

Combining these terms, we get the final error, $err_{\mathrm{ELBO}_\mathrm{direct}}$: 
\begin{align*}
err_{\mathrm{ELBO}_\mathrm{direct}} &= \int_0^T err_{\mathrm{div}, s} \d s + err_{p_0} \\
&= \sum_{i=1}^N h \times err_{\mathrm{div}, t_i} + err_{p_0} \\
&= N \times h[-h \int_0^T \E [ \psi_e (s, X_s) ] \d s + \mathcal{O}(h^2)] - h \int_0^T \E [\psi_e(s, X_s) ]\d s + \mathcal{O}(h^2) \\
%&= h (T + 1) \int_0^T \E [ \psi_e (s, X_s)] + \mathcal{O}(h^2). \\
&\approx  - h \int_0^T \E [ \psi_e (s, X_s)] \d s + \mathcal{O}(h^2).
\end{align*}


For the $\mathrm{ELBO}_\mathrm{IS}$ case, $Y_t$ can be sampled exactly since we assume it follows a Brownian motion, which removes the integration error in $err_{p_0}$. 

The errors are then introduced when integrating the terms in the stochastic exponential. 
Then we have 
\begin{align*}
\underbrace{\int_0^T \nabla \cdot \mu(s, Y_s) \d s}_{ err_{\mathrm{div}}} - \underbrace{\int_0^T \frac12 \mu(s, Y_s)^T \mu(s, Y_s) \d s + \int_0^T \mu(s, Y_s) \d W_s}_{\mathcal{O}(h)}
\end{align*}
where the second error rate comes from the Euler discretization. 
Following this argument, we get 
\begin{align}
    err_{\mathrm{ELBO}_\mathrm{IS}} = -h\int_0^T \E [\psi_e(s,X_s)] \d s+\mathcal{O}(h^2).
\end{align}

Although $\text{ELBO}_{\mathrm{IS}}$ introduces more error than $\text{ELBO}_{\mathrm{direct}}$ by having more integration terms, both share the same leading-order error term $-h\int_0^T \E [\psi_e(s,X_s)] \d s$, with remainders of order $\mathcal{O}(h^2)$.
\end{proof}

\begin{comment}

We then have $err E[\nabla\cdot\mu(X_{\tau},\tau)|X_0=x]=-h\int_0^{\tau}E\psi_e(s,X_s)ds+\mathcal{O}(h^2)$ and $err E[\log\psi(X_t)|X_0=x]=-h\int_0^{\tau}E\psi_e(s,X_s)ds+\mathcal{O}(h^2)$.


\begin{align*}
    err \text{ELBO}_{\mathrm{direct}} &=\int_0^t err(E[\nabla\cdot\mu(X_{\tau},\tau)|X_0=x])d\tau+err E[\log\psi(X_t)|X_0=x]\\
    & = \sum_{i=1}^N h \times err(E[\nabla\cdot\mu(X_{\tau_i},\tau_i)|X_0=x])+\mathcal{O}(h^2)+err E[\log\psi(X_t)|X_0=x]\\
    &=N \times h^2\int_0^{\tau}E\psi_e(s,X_s)ds+\mathcal{O}(h^3)+\mathcal{O}(h^2)+h\int_0^tE\psi_e(s,X_s)ds+\mathcal{O}(h^2)\\
    &=h\int_0^tE\psi_e(s,X_s)ds+\mathcal{O}(h^2)
\end{align*}


\begin{remark}[Approximation error for generative model]
We provide a lower bound for the approximation of $p(x)$ by applying Feynman Kac. 
We first develop the error bound when approximating without importance sampling.
We do this by writing the numerical error for approximating the integral:
\begin{align*}
    p(x)&=\E\left[\exp\left(\int_0^T\nabla\cdot\mu(X_{\tau},\tau)\d\tau \right )\psi(X_T)\mid X_0=x\right]\\
    &=\E\left[\sum_{n=0}^{\infty}\frac{1}{n!}\left(\sum_{i=1}^N\nabla\cdot\mu(X_{\tau_i},\tau_i)h+\mathcal{O}(h^2)\right)^n\psi(X_T)\mid X_0=x \right ]\\
    &=\sum_{n=0}^{\infty}\frac{h}{n!}\left(\sum_{i=1}^N \E[\nabla\cdot\mu(X_{\tau_i},\tau_i)\psi(X_T) \mid X_0=x]\right)^n+\mathcal{O}(h^2)
\end{align*}

We first analyse the error for the term $\E[\nabla\cdot\mu(X_{\tau_i},\tau_i)\psi(X_T)\mid X_0=x]$. We have
\begin{align*}
    Err_i &=\E\left[\nabla\cdot\mu(X_{\tau_i}^h,\tau_i)\psi(X_T^h)-\nabla\cdot\mu(X_{\tau_i},\tau_i)\psi(X_T)\mid X_0=x \right ]\\
    &\geq \E[\nabla\cdot\mu(X_{\tau_i},\tau_i)\psi(X_T^h)-\nabla\cdot\mu(X_{\tau_i},\tau_i)\psi(X_T)\mid X_0=x]\\    &=\E[\E[\nabla\cdot\mu(X_{\tau_i},\tau_i)\psi(X_T^h)-\nabla\cdot\mu(X_{\tau_i},\tau_i)\psi(X_T)\mid \mathcal{F}_{\tau_i}] \mid X_0=x]\\    &=\E[\nabla\cdot\mu(X_{\tau_i},\tau_i)\E[\psi(X_T^h)-\psi(X_T) \mid \mathcal{F}_{\tau_i}] \mid X_0=x]\\
    &=\E[\nabla\cdot\mu(X_{\tau_i},\tau_i)  err(T,h) \mid X_0=x]\\
    &=err(T,h) \E[\nabla\cdot\mu(X_{\tau_i},\tau_i) \mid X_0=x]
\end{align*}
Thus, we have
\begin{align*}
    Error(p(X_T=x))&=\sum_{n=0}^{\infty}\frac{(\sum_{i=1}^N h  Error(E[\nabla\cdot\mu(X_{\tau_i},\tau_i)\psi(X_T)|X_0=x]))^n}{n!}+\mathcal{O}(h^2)\\
    &= \sum_{n=0}^{\infty}\frac{(\sum_{i=1}^Nh  Err_i)^n}{n!}+\mathcal{O}(h^2)\\
    &\geq \sum_{n=0}^{\infty}\frac{(\sum_{i=1}^Nh\times err(T,h) E[\nabla\cdot\mu(X_{\tau_i},\tau_i)|X_0=x])^n}{n!}+\mathcal{O}(h^2)\\
    &=\mathcal{O}(h^2)
\end{align*}
\end{remark}
\end{comment}

\subsection{Uniform convergence}
\label{sec:uniform}

\begin{proposition}[Uniform Convergence]
For fixed $x \in \mathcal{D}$ and $T \in \mathbb{R}_+$, consider a space of functions \[\mathcal{F} = \left \{ \d \mathbb{P}_{X_T^{(\xi)}}/\d \mathbb{P}_{Y_T} : \xi \in \Xi \right \}\] parameterized by $\xi$ from a compact set $\Xi \subset \mathbb{R}^k$ satisfying $\mathrm{Var}\left(\d \mathbb{P}_{X_T^{(\xi)}}/\d \mathbb{P}_{Y_T}\right) < \infty$ for all $\xi \in \Xi$, with $\mathbb{P}_{X_T^{(\xi)}}$ denoting the distribution of the solution $ X_T = x + \int_0^T \mu(X_t, t; \xi) \d t + \int_0^T \sigma(t) \d W_t$ and $\mathbb{P}_{Y_T}$ the distribution of $Y_T = x + \int_0^T \sigma(t) \d W_t$. 
Additionally, assume that the image of $(T, X_T) \mapsto \mu(T,X_T;\xi)$ is compact for all $X_T, \xi$.
Then, %in the limit as $h\to 0$ \jie{it has to be related to the rate of $n$. needs more rigorous statement} and for fixed $x \in \mathcal{X}$,
$$
\mathbb{G}_{N_E} = \sqrt{N_E} \left(p_\theta^{\xi}(x, T) - p^\xi(x, T) \right)
$$
converges in distribution to a zero-mean Gaussian process over $\xi \in \Xi$ as $N_E \to \infty$ where $N_E$ is the number of samples used to compute the expectation. 
\end{proposition}
\begin{proof}
First, we will assume that the stochastic exponential has a finite variance for all $t, x$ within the support of the distribution. 
The finite variance allows us to use the law of large numbers to obtain pointwise convergence of the empirical expectation to the ground-truth solution.
Additionally, we assume that the operator $\ngo$ is well-learned in the sense that $\ngo(x, T, \xi) = \frac{\d \mathbb{P}_{X_T^{(\xi)}}}{\d \mathbb{P}_{Y_T}}$ can be computed exactly. 

Next, we need to show that the class of functions we are approximating is $P$-Donsker, which we will do using a covering number argument. 
Recall that the parameters of the functions are assumed to be from a compact set $\Xi \subset \mathbb{R}^k$ with \[ \mathcal{F} = \left\{f(T, \{X_t\}; \xi) := \exp\left( \int_0^T \mu(X_t; \xi) \d W_t - \frac12 \int_0^T \mu(X_t; \xi)^2 \d t \right) : \xi \in \Xi \right \} \] also compact.
The covering number of $\Xi$, a subset of the Euclidean space, is known to be bounded by $N(\varepsilon, \Xi, \| \cdot \| ) \leq C \left ( \frac1\varepsilon \right ) ^k$ for some $C > 0$.
We will use this to bound the bracketing number of $\mathcal{F}$.
The $\log$ of the covering number is then bounded by $O( k \log \frac 1 \varepsilon) < \frac{1}{\varepsilon^2}$.

Using this to bound the bracketing number, we can obtain that $\mathcal{F}$ is $P$-Donsker (cf.~\citealp[Theorem 11]{sen2018gentle}).
We can then define the empirical process $\mathbb{G}_{N_E}$, with $N_E$ denoting the number of samples used to estimate the expectation, as
\begin{align*}
    \mathbb{G}_{N_E}^{\xi} &:= \sqrt{N_E} \left (\mathbb{P}^{N_E}_{Y_T} - \mathbb{P}_{Y_T} \right ) p_0(Y_T) \ngo(\cdot; \xi)  \\
    &= \sqrt{N_E} \left ( \frac{1}{N_E} \sum_{i=1}^{N_E} p_0(Y_T^{(i)}) \ngo(Y_T^{(i)}; \xi) - \E \left [ p_0(Y_T) \ngo(Y_T; \xi )\right]  \right ).
\end{align*}
Since $\ngo$ is assumed to be well learned, it approximates the likelihood ratio exactly, so the expectation gives the ground-truth solution. 
From the finite variance assumption on $\ngo$, by the central limit theorem for any $\xi \in \Xi$, $\mathbb{G}_{N_E}^{\xi}$ converges in distribution to $\mathcal{N}(0, \sigma_\xi^2)$ with $\sigma_\xi^2 = \mathrm{Var}\left(p_0(Y_T) \ngo(Y_T; \xi)\right)$. 
Given that the finite-dimensional margins are Gaussian and $\mathcal{F}$ is $P$-Donsker, we conclude that $\mathbb{G}_{N_E}$ converges to a Gaussian process over $\Xi$. 
\end{proof}
\section{Network structure and training hyperparameters}
\subsection{Linear parabolic PDE}
\paragraph{Network structure}
We first provide network structures on the $\ngo$ and DeepONet used to calculate solutions of linear parabolic PDEs.
For both $\ngo$ and DeepONet, we use a network structure that depends on the number of dimensions of the PDEs, as shown in Tables~\ref{tab:ngo_fk},~\ref{tab:don_fk_branch}, and~\ref{tab:don_fk_trunk}.

\begin{table}[hbt!]
\centering
\begin{tabular}{lll}
\toprule
Operation Layer     & Input Channels                                    & Output Channels                                   \\
Convolutional Layer & $N_\mathrm{dim} \times 2 + 1$                     & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}$ \\ \bottomrule
\end{tabular}
\caption{Network structure of $\ngo$ in the linear parabolic case.}
\label{tab:ngo_fk}
\end{table}

\begin{table}[hbt!]
\centering
\begin{tabular}{lll}
\toprule
Operation Layer & Input Number                & Output Number               \\
Linear Layer    & $N_\mathrm{sensor}=100$         & $N_\mathrm{dim}\times 5+70$ \\
Tanh            & {\color[HTML]{9B9B9B} N/A}                         & {\color[HTML]{9B9B9B} N/A}                         \\
Linear Layer    & $N_\mathrm{dim}\times 5+70$ & $N_\mathrm{dim}\times 5+70$ \\
Tanh            & {\color[HTML]{9B9B9B} N/A}                         & {\color[HTML]{9B9B9B} N/A}                         \\
Linear Layer    & $N_\mathrm{dim}\times 5+70$ & $N_\mathrm{dim}\times 5+70$ \\
Tanh            & {\color[HTML]{9B9B9B} N/A}                         & {\color[HTML]{9B9B9B} N/A}                         \\
Linear Layer    & $N_\mathrm{dim}\times 5+70$ & $N_\mathrm{dim}\times 5+70$ \\
Tanh            & {\color[HTML]{9B9B9B} N/A}                         & {\color[HTML]{9B9B9B} N/A}                         \\
Linear Layer    & $N_\mathrm{dim}\times 5+70$ & $N_\mathrm{branch}=15$         \\\bottomrule
\end{tabular}
\caption{Network structure of the ``branch'' network of the DeepONet in both the linear and semi-linear parabolic case~\citep{Goswami2022DeepTL}.}
\label{tab:don_fk_branch}
\end{table}

\begin{table}[hbt!]
\centering
\begin{tabular}{lll}
\toprule
Operation Layer & Input Number                & Output Number               \\
Linear Layer    & $N_\mathrm{dim}+1$         & $N_\mathrm{dim}\times 5+50$ \\
Tanh            & {\color[HTML]{9B9B9B} N/A}                         & {\color[HTML]{9B9B9B} N/A}                         \\
Linear Layer    & $N_\mathrm{dim}\times 5+50$ & $N_\mathrm{dim}\times 5+50$ \\
Tanh            & {\color[HTML]{9B9B9B} N/A}                         & {\color[HTML]{9B9B9B} N/A}                         \\
Linear Layer    & $N_\mathrm{dim}\times 5+50$ & $N_\mathrm{dim}\times 5+50$ \\
Tanh            & {\color[HTML]{9B9B9B} N/A}                         & {\color[HTML]{9B9B9B} N/A}                         \\
Linear Layer    & $N_\mathrm{dim}\times 5+50$ & $N_\mathrm{dim}\times 5+50$ \\
Tanh            & {\color[HTML]{9B9B9B} N/A}                         & {\color[HTML]{9B9B9B} N/A}                         \\
Linear Layer    & $N_\mathrm{dim}\times 5+50$ & $N_\mathrm{branch}=15$         \\\bottomrule
\end{tabular}
\caption{Network structure of the ``trunk'' network of the DeepONet in both the linear and semi-linear parabolic case~\citep{Goswami2022DeepTL}.}
\label{tab:don_fk_trunk}
\end{table}

\paragraph{Training hyperparameters}
We train both $\ngo$ and DeepONet on random PDEs with a defined basis. 
Specifically, we consider second-order polynomials $\mu_i(x) \in \left\{ \sum_{j=0}^2 c_j x^j \mid c_j \sim \mathcal{U}(0,1) \right \}$, $x\in\mathbb{R}^{N_\mathrm{dim}}$. 
We set the parameters $r=0$ and $\sigma=1$ during training.  
For each epoch, we sample $6000$ initial $x$ values, $x\in[0.1, 0.6]$, and initial $t$ values, $t\in [0,0.1]$, and PDE parameters $\{c_i\}_{i=0}^2$. 
We calculate PDE solutions $p(x,t)$ through direct Girsanov calculation, $\ngo$, and DeepONet. 
We use $4000$ sample paths for the Girsanov calculation and $\ngo$. 
We then minimize the $\ell_1$ loss between the Girsanov calculation and $\ngo$ or DeepONet using the Adam optimizer, with a learning rate of $1\times 10^{-3}$. 

\subsection{Semi-linear parabolic PDE}
\paragraph{Network structure}
We now provide network structures on the $\ngo$ and DeepONet used to calculate solutions of semi-linear parabolic PDEs.
For both $\ngo$ and DeepONet, we use a network structure that depends on the number of input dimensions, as shown in Tables~\ref{tab:don_fk_branch},~\ref{tab:don_fk_trunk},~\ref{tab:ngo_expmart_fb}, and~\ref{tab:ngo_zt_fb}.

\begin{table}[hbt!]
\centering
\begin{tabular}{lll}
\toprule
Operation Layer     & Input Channels                                    & Output Channels                                   \\
Convolutional Layer & $N_\mathrm{dim}\times 2 + 1$                     & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}$ \\ \bottomrule
\end{tabular}
\caption{Network structure of $\ngo$ for estimating the exponential martingale in the semi-linear parabolic case.}
\label{tab:ngo_expmart_fb}
\end{table}

\begin{table}[hbt!]
\centering
\begin{tabular}{lll}
\toprule
Operation Layer     & Input Channels                                    & Output Channels                                   \\
Convolutional Layer & $N_\mathrm{dim} + 1$                     & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}\times 5+50$                       \\
Softplus            & {\color[HTML]{9B9B9B} N/A} & {\color[HTML]{9B9B9B} N/A} \\
Convolutional Layer & $N_\mathrm{dim}\times 5+50$                       & $N_\mathrm{dim}$ \\ \bottomrule
\end{tabular}
\caption{Network structure of $\ngo$ for estimating $Z_t$ in the semi-linear parabolic case.}
\label{tab:ngo_zt_fb}
\end{table}

\paragraph{Training hyperparameters}
We train both $\ngo$ and DeepONet to estimate random PDEs with a defined basis. 
Specifically, we consider basis functions $h_i(t, x, s, z) \in \left \{ c_1 \sum \sin(x_i) + c_2 \sum z_i^2 + c_3 \cos(t + s) \mid c_i \sim \mathcal{U}(0,1) \right \} $.
We set the parameters $r=0$ and $\sigma=1$ during training.  
For each epoch, we uniformly sample $6000$ initial $x$ values, $x\in[0.1, 0.6]$, and initial $t$ values, $t\in [0,0.1]$, and PDE parameters $\{c_i\}_{i=1}^3$. 
We calculate the PDE solution $p(x,t)$ through direct Girsanov calculation, $\ngo$, and DeepONet. 
We use $4000$ sample paths for Girsanov calculation and $\ngo$. 
We then minimize the $\ell_1$ loss between Girsanov calculation and $\ngo$ or DeepONet using the Adam optimizer, with a learning rate of $1 \times 10^{-3}$. 

\subsection{Generative modeling}

\paragraph{Network structure}
We describe the network structure for the normalizing flow used to model $p_\mathrm{meta}$ and the forward SDE.
For $p_\mathrm{meta}$, we use a real-NVP model~\citep{Dinh2014NICENI, Dinh2016DensityEU} with $32$ affine coupling layers, each having the structure as shown in Table~\ref{tab:p_meta_2} for 2-$d$ $p_\mathrm{meta}$ and in Table~\ref{tab:p_meta_100} for 100-$d$ $p_\mathrm{meta}$.

\paragraph{Training hyperparameters}
We train the normalizing flows using the ``normflows'' platform~\citep{Stimper2023normflowsAP}. 
The training datasets for the 2-d and the 100-d cases contain $600$ samples each. 
We sample $60$ points from each of the $10$ Gaussians with different means and standard variances to form the meta-dataset. 
Training is performed with the Adam optimizer using a learning rate of $5 \times 10^{-4}$ and a weight 
decay of $1\times 10^{-5}$.
We define the diffusion function of the forward SDE as a $d$-dimensional diagonal matrix, where $d$ is the dimension of the forward SDE.
We set the terminal time of the forward SDE as $T =0.1$, the number of Euler steps when training is $40$, and the number of Euler steps when testing is $50$.
We set the number of samples $N_E$ used to estimate $\mathrm{ELBO}_\mathrm{IS}$ to be $75$. 
We estimate the divergence with Hutchinson's trace estimator as used in~\citet{Grathwohl2018FFJORDFC}.
We minimize $\mathrm{ELBO}_\mathrm{IS}$ using an AdamW optimizer and a learning rate of $8 \times 10^{-4}$.

\begin{table}[hbt!]
\centering
\begin{tabular}{lll}
\toprule
Operation Layer & Input Number & Output Number \\
Linear Layer    & 1            & 64            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 64           & 64            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 64           & 64            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 64           & 64            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 64           & 2             \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}          \\ \bottomrule
\end{tabular}
\caption{Network structure of the affine coupling layer in the normalizing flow for 2-d $p_\mathrm{meta}$.}
\label{tab:p_meta_2}
\end{table}

\begin{table}[hbt!]
\centering
\begin{tabular}{lll}
\toprule
Operation Layer & Input Number & Output Number \\
Linear Layer    & 50            & 200            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 200           & 200            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 200           & 200            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 200           & 200            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 200           & 100             \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}          \\ \bottomrule
\end{tabular}
\caption{Network structure of the affine coupling layer in the normalizing flow for 100-d $p_\mathrm{meta}$.}
\label{tab:p_meta_100}
\end{table}

\begin{table}[hbt!]
\centering
\begin{tabular}{lll}
\toprule
Operation Layer & Input Number & Output Number \\
Linear Layer    & 2            & 200            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 200           & 200            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 200           & 200            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 200           & 200            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 200           & 2             \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}          \\ \bottomrule
\end{tabular}
\caption{Network structure of the drift function of the forward SDE in the 2-d case.}
\label{tab:mu_gen_2}
\end{table}

\begin{table}[hbt!]
\centering
\begin{tabular}{lll}
\toprule
Operation Layer & Input Number & Output Number \\
Linear Layer    & 100            & 200            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 200           & 200            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 200           & 200            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 200           & 200            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 200           & 100             \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}          \\ \bottomrule
\end{tabular}
\caption{Network structure of the drift function of the forward SDE in the 100-d case.}
\label{tab:mu_gen_100}
\end{table}

\begin{table}[hbt!]
\centering
\begin{tabular}{lll}
\toprule
Operation Layer & Input Number & Output Number \\
Linear Layer    & 1            & 16            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 16           & 16            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 16           & 16            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 16           & 16            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 16           & 2             \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}          \\ \bottomrule
\end{tabular}
\caption{Network structure of the diffusion function of the forward SDE in the 2-d case.}
\label{tab:sigma_gen_2}
\end{table}

\begin{table}[hbt!]
\centering
\begin{tabular}{lll}
\toprule
Operation Layer & Input Number & Output Number \\
Linear Layer    & 1            & 16            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 16           & 16            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 16           & 16            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 16           & 16            \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}           \\
Linear Layer    & 16           & 100             \\
Tanh            & {\color[HTML]{9B9B9B} N/A}          & {\color[HTML]{9B9B9B} N/A}          \\ \bottomrule
\end{tabular}
\caption{Network structure of the diffusion function of the forward SDE in the 100-d case.}
\label{tab:sigma_gen_100}
\end{table}

\section{Ablation study}
\label{sec:ablation}
We perform a series of ablation studies of the proposed $\ngo$ algorithm for the linear and semi-linear parabolic PDEs on the number of dimensions of the PDEs and on the number of sample paths used to approximate the solutions.

\paragraph{Number of dimensions}
We investigate the influence of the number of dimensions on the performance of direct Girsanov calculation (GIR), $\ngo$, and DeepONet (DON), and compare them to either the analytical solutions or solutions simulated with Euler-Maruyama (E-M) using a large number of sample paths. 

Figure~\ref{fig:dim_ablation_fk} shows the results on linear parabolic PDEs. As the number of dimensions grows, the normalized errors of $\ngo$ increase for random PDEs with a defined basis and decrease for the Fokker-Planck and Black-Scholes equations. 
The figures suggest that the number of dimensions does not significantly influence the inference time of $\ngo$.

Figure~\ref{fig:dim_ablation_fb} shows the results on semi-linear parabolic PDEs. The number of dimensions does not significantly influence the normalized error of $\ngo$ on random PDEs with a defined basis and on the Black-Scholes-Barenblatt equation. The normalized error of $\ngo$ decreases in the Hamilton-Jacobi-Bellman equation as the number of dimensions grows. 
The inference time of $\ngo$ correlates positively with the number of dimensions but is still the lowest among all methods tested.

\paragraph{Number of sample paths}
We additionally investigate how the number of sample paths used to calculate the solutions correlates with the performance of direct Girsanov calculation, $\ngo$, and DeepONet (which is uninfluenced by the number of sample paths), and compare them to either the analytical solutions or solutions simulated with Euler-Maruyama (E-M) using a large number of sample paths. 

Figure~\ref{fig:N_ablation_fk} shows the results on linear parabolic PDEs. 
As the number of sample paths grows, the normalized errors of $\ngo$ on all three PDEs decrease and then stabilize. 
This behavior is particularly obvious on random PDEs with a defined basis. 
Due to parallel rather than sequential computations, the number of sample paths does not significantly influence the inference time of $\ngo$.
The number of sample paths does not impact the inference time before reaching the GPU's memory limit.

Figure~\ref{fig:N_ablation_fb} shows the results on semi-linear parabolic PDEs. 
As the number of sample paths increases, the normalized error of $\ngo$ slightly decreases. 
The number of sample paths does not significantly influence the normalized errors of $\ngo$ on the Hamilton-Jacobi-Bellman equation and the Black-Scholes-Barenblatt equation. 
The inference time of $\ngo$ first increases with the number of sample paths and then stabilizes but is still the lowest among all methods tested.

\begin{figure*}
    \centering
    \includegraphics[width=\textwidth]{figs/dim_ablation_fk.pdf}
    \caption{Ablation study on the number of dimensions of linear parabolic PDEs evaluated at the terminal time $T=0.5$. 
    We show the normalized error (top) and inference time (bottom). 
    The \textbf{first column} shows results on random PDEs with a defined basis; the \textbf{second column} shows results on the Fokker-Planck equation of the OU process; the \textbf{third column} shows results on the Black-Scholes equation.}
\label{fig:dim_ablation_fk}
    %\vspace{-18pt}
\end{figure*}

\begin{figure*}
    \centering
    \includegraphics[width=\textwidth]{figs/dim_ablation_fb.pdf}
    \caption{Ablation study on the number of dimensions of semi-linear parabolic PDEs evaluated at terminal time $T=0.5$. 
    We show the normalized error and inference time. 
    The \textbf{first column} shows results on random PDEs with a defined basis; the \textbf{second column} shows results on the Hamilton-Jacobi-Bellman equation; the \textbf{third column} shows results on the Black-Scholes-Barenblatt equation. 
    The variance of the Girsanov calculation on the random PDEs with a defined basis increases significantly for $d > 11$.
    }
    \label{fig:dim_ablation_fb}
    %\vspace{-18pt}
\end{figure*}

\begin{figure*}
    \centering
    \includegraphics[width=\textwidth]{figs/N_ablation_fk.pdf}
    \caption{Ablation study on the number of sample paths used in simulating linear parabolic PDEs evaluated at terminal time $T=0.5$. 
    We show the normalized error (top) and inference time (bottom). 
    The \textbf{first column} shows results on random PDEs with a defined basis; the \textbf{second column} shows results on the Fokker-Planck equation of the OU process; the \textbf{third column} shows results on the Black-Scholes equation.}
    \label{fig:N_ablation_fk}
    %\vspace{-18pt}
\end{figure*}

\begin{figure*}
    \centering
    \includegraphics[width=\textwidth]{figs/N_ablation_fb.pdf}
    \caption{Ablation study on the number of sample paths used in simulating semi-linear parabolic PDEs evaluated at terminal time $T=0.5$. We show the normalized error (top) and inference time (bottom). The \textbf{first column} shows results on random PDEs with a defined basis; the \textbf{second column} shows results on the Hamilton-Jacobi-Bellman equation; the \textbf{third column} shows results on the Black-Scholes-Barenblatt equation.}
    \label{fig:N_ablation_fb}
    %\vspace{-18pt}
\end{figure*}

\section{Definitions of canonical PDEs}
\label{sec:canonical}
%Authors may wish to optionally include extra information (complete proofs, additional experiments and plots) in the appendix. All such materials should be part of the supplemental material (submitted separately) and should NOT be included in the main submission.

\subsection{Fokker-Planck equation}
We study the PDF of a $d$-dimensional time-invariant linear SDE of the form:
\[\begin{cases}
    &\d X_t = X_t \d t+\d W_t,\\
    &X_0\sim \mathcal{N}(0,I).
\end{cases}\]
$X_t$ follows a Gaussian distribution for all $t$: $X_t\sim\mathcal{N}(m(t),c(t))$, where $m(t)$ and $c(t)$ are the solutions of the associated mean and covariance ODEs:
\[\begin{cases}
    m(t)&=\exp(t-t_0)m(t_0),\\
    c(t)&=\exp(t-t_0)c(t_0)\exp(t-t_0)^T+\int_{t_0}^t\exp(t-\tau)Q\exp(t-\tau)^Td\tau,
\end{cases}\]
where $Q$ is the Brownian motion's diffusion coefficient and is assumed to be a $d$-dimensional identity matrix in our case~\citep{sarkka2019applied}. 

We assume the mean at initial time $m(t_0)=0$, and find
\[\begin{cases}
    m(t)&=0,\\
    c(t)&=(\frac{3}{2}\exp(2t)-\frac{1}{2})I.
\end{cases}\]
So we have an analytical form of $X_t$'s distribution: $X_t\sim\mathcal{N}(0,(\frac{3}{2}\exp(2t)-\frac{1}{2})I)$. 

The distribution of the SDE solution $X_t$ corresponds to the initial value problem (IVP) satisfying the Fokker-Planck equation
$$
\begin{cases}
    \frac{\partial p}{\partial t}&=-x\cdot\nabla p+\frac{1}{2}\mathrm{Tr}(\mathrm{Hess}_{x}p)-d p\\
    p(x,0)&=p_0(x)=\frac{1}{(2\pi)^{d/2}}\exp\left(-\frac{xx^T}{2}\right),
\end{cases}
$$
where $d$ is the dimension of the SDE, $\cdot$ is inner product,  $\times$ is scalar multiplication. 

Using the Feynman-Kac formula introduced in equation~\ref{eq:fk}, the solution to this IVP has a stochastic representation 
\[\begin{cases}
    p(x,t) &= \exp(-d \, t)\E[p_0(X_t)],\\
    \d X_t &= -X_t \d t + \d W_t,\\
    X_0 &= x.
\end{cases}\]

We apply $\ngo$ and DeepONet to solve this PDE according to the expectation and compare it with the ground truth PDF.

\subsection{Multi-dimensional Black-Scholes equation}
\label{sec:exact_bs}
We consider a multi-variate extension of the Black-Scholes model where multiple, correlated assets govern the price of a derivative. 
The price evolution of a European call under the Black–Scholes model is modeled by the expectation of the corresponding payoff function with respect to geometric Brownian motion: 
\begin{align}
\label{tvp:bs}
    \begin{cases}
        & \frac{\partial p}{\partial \tau}+\frac{\hat{\sigma}^2}{2}\sum_{i=1}^d s_i^2\frac{\partial^2 p}{\partial s_i^2}+r(s \cdot \nabla p -p)=0,\\
        & p(s,T)=\Phi(s).
    \end{cases}
\end{align}

We apply a change of variable $t=T-\tau$ and transform the terminal condition problem to an equivalent initial value problem (IVP):

\begin{align}
\label{ivp:bs}
    \begin{cases}
    & \frac{\partial p}{\partial t} = \frac{\hat{\sigma}^2}{2}\sum_{i=1}^d s_i^2\frac{\partial^2 p}{\partial s_i^2}+r(s \cdot \nabla p -p),\\
    & p(s,0)=\Phi(s).
\end{cases}
\end{align}

Note that this transformation is not necessary, and the Black-Scholes equation is usually solved according to a terminal condition, but to maintain consistency with the other experiments we consider an IVP.
This IVP can be solved using the Feynman-Kac method described in equation~\ref{eq:fk}, with drift $\mu(s)=r s$ and  volatility $\sigma(s)=\hat{\sigma}s$ for some prescribed volatility coefficient $\hat{\sigma}$ and payoff $\Phi$.

To get a more accurate estimate, we simplify the terminal value problem~\ref{tvp:bs} with a change of variable by transforming it to a function of Brownian motion, as described in the main text. 
We will consider the variable $x_i = f(s_i)$ where $f(s) = \ln s + (r - \frac12 \hat{\sigma}^2)\tau $.
Since $\d s_t = r s_t \d t + \hat{\sigma} s_t  \d W_t$, applying It\^o's lemma to $f(s)$ gives us the new SDE
\begin{align*} 
\d f(s_t) &= \left (r s_t \underbrace{ \left (\frac{1}{s_t}\right )}_{\frac{\partial f}{\partial s}} + \frac12 \hat{\sigma}^2 s_t^2 \underbrace{\left (\frac{-1}{s_t^2}\right)}_{\frac{\partial^2 f}{\partial s^2}} - \underbrace{\left(r - \frac12 \hat{\sigma}^2\right)}_{\frac{\partial f}{\partial t}} \right ) \d t + \hat{\sigma} s_t\left(\frac{1}{s_t}\right ) \d W_t \\
\d X_t &= \hat{\sigma} \d W_t.
\end{align*}
Now, we can consider taking expectations of $X_t$, where exact sampling is possible since it is Brownian motion.
After the transformation, the PDE now corresponds to the heat equation plus the discount factor given by $-r p$: 
\begin{align}
\label{ivp:bs_soln}
    \begin{cases}
        &\frac{\partial \Psi}{\partial t}=\frac{\hat{\sigma}^2}{2}\sum_{i=1}^d\frac{\partial^2\Psi}{\partial x_i^2} - rp,\\
        &\Psi(x,0)=p(x,0)=\Phi(\exp(x_i+(r-\frac{\hat{\sigma}^2}{2})\tau)).
    \end{cases}
\end{align}

This gives us a relationship between the PDE on the transformed variable and the original one through 
\begin{align*}
    p(s,t)=\exp(-rt)\Psi\left(\ln s-\left(r-\frac{\hat{\sigma}^2}{2}\right)(T-t),t\right).
\end{align*}
We take $\Phi(s) = \max\{ \max_{k=1\ldots d} s_k - K, 0 \}$ for the payoff function, which corresponds to the payoff for the best-asset rainbow option.
\begin{comment}
Define $x_i=\ln s_i-(r-\frac{1}{2}\hat{\sigma}^2)\tau$ to get
\[\begin{cases}
    & \frac{\partial p}{\partial \tau}+\frac{\hat{\sigma}^2}{2}\sum_{i=1}^d \frac{\partial^2 p}{\partial x_i^2}-rp=0,\\
    & p(x,T) = \Phi(\exp(x_i+(r-\frac{\hat{\sigma}^2}{2})\tau)).
\end{cases}\]
Further define $\Psi(x,\tau)=\exp(r(T-\tau))p(x,\tau)$, we have
\[\begin{cases}
    &\frac{\partial \Psi}{\partial \tau}+\frac{\hat{\sigma}^2}{2}\sum_{i=1}^d\frac{\partial^2\Psi}{\partial x_i^2}=0,\\
    &\Psi(x,T)=p(x,T)=\Phi(\exp(x_i+(r-\frac{\hat{\sigma}^2}{2})\tau)).
\end{cases}\]
Applying the change of variable of $t=T-\tau$ described above, 
\begin{align}
\label{ivp:bs_soln}
    \begin{cases}
        &\frac{\partial \Psi}{\partial t}=\frac{\hat{\sigma}^2}{2}\sum_{i=1}^N\frac{\partial^2\Psi}{\partial x_i^2},\\
        &\Psi(x,0)=p(x,0)=\Phi(\exp(x_i+(r-\frac{\hat{\sigma}^2}{2})\tau)).
    \end{cases}
\end{align}

We solve this PDE instead where 
\begin{align*}
    p(s,t)=\exp(-rt)\Psi\left(\ln s-\left(r-\frac{\hat{\sigma}^2}{2}\right)(T-t),t\right),
\end{align*}
applying the Feynman-Kac, and $\Psi$ can be approximated with a driftless system with a constant diffusion term $\hat{\sigma}$.
\end{comment}

We approximate $p(s,t)$ with $\ngo$ and DeepONet, following the IVP problem~\ref{ivp:bs}, and compare to solutions calculated with the simplified IVP system~\ref{ivp:bs_soln}. Note $\ngo$ is trained with a constant diffusion term, whereas the SDE system required here has a state-dependent diffusion. The low error of $\ngo$ on this problem showcases its generality.

\subsection{Hamilton-Jacobi-Bellman equation}
The field of optimal control often requires directly or indirectly solving a terminal value problem involving a $d$-dimensional HJB equation. We study such a problem:
\[\begin{cases}
    \frac{\partial p}{\partial t} &= -\text{Tr}(\mathrm{Hess}_xp)+\|\nabla p\|^2,\\
    p(T,x) &= g(x),
\end{cases}\]
where $g(x)=\ln\left(\frac12(1+\|x\|^2)\right)$. 

This PDE has a stochastic solution given by $p(t,x)=-\ln(\mathbb{E}[\exp(-g(x+\sqrt{2}W_{T-t}))])$~\citep{raissi2019686}.

This PDE's semi-linear nature results in a representation as a forward-backward SDE (FBSDE) of the form
\[\begin{cases}
    \d X_t &= \sqrt{2}\d W_t,\\
    X_0 &= 0,\\
    \d S_t &= \frac{\|Z_t\|^2}{2}\d t+Z_t^T \d W_t,\\
    S_T &= g(X_T),
\end{cases}\]
where $Z_t=\sqrt{2}p_x^t$. 
We apply $\ngo$ on this FBSDE system, with forward drift function $\mu(t,x)=0$, forward diffusion function $\sigma(t,x)=\sqrt{2}$, backward drift function $h(t,x,p,\sigma^T\nabla_x p)=-\frac{\|\sigma^T\nabla_x p\|^2}{2\sigma^2}$, and terminal condition $g(x)=\ln\left(\frac12(1+\|x\|^2)\right)$.
We compare the results generated with $\ngo$, DeepONet, and Girsanov with the analytical solution.


\subsection{Black-Scholes-Barenblatt equation}
The Black-Scholes-Barenblatt (BSB) equation is a semi-linear extension to the Black-Scholes equation mentioned in section~\ref{sec:exact_bs} and models uncertainty in volatility and interest rates under the Black-Scholes model~\citep{bsb2019}.
We study a terminal value problem involving the BSB equation:

\[\begin{cases}
    p_t &= -\frac12\text{Tr}[\sigma^2\text{diag}(X_t^2)D^2p]+r(p-(Dp)^Tx),\\
    p(T,x) &= \|x\|^2.
\end{cases}\]

From~\citet{raissi2019686}, this problem has an exact solution given as
\begin{align}
    p(t,x) &= \exp((r+\sigma^2)(T-t))g(x).
\end{align}


Due to the BSB equation's semi-linear nature, one can represent its solution with an FBSDE system. 
\[\begin{cases}
    \d X_t &= \sigma\text{diag}(X_t)\d W_t, t\in[0,T],\\
    X_0 &= x_0,\\
    \d S_t &= r(S_t-Z_t^TX_t)\d t+\sigma Z_t^T\text{diag}(X_t)\d W_t, t\in[0,T),\\
    S_T &= g(X_T).
\end{cases}\]
Following the FBSDE system, we then construct the $\ngo$, where the forward drift function $\mu(t,x)=0$, the forward diffusion function $\sigma(t,x)=\sigma\text{diag}(X_t)$, the backward drift function $h(t,x,p,\sigma^T\nabla_x p) = r(p-\frac{\sigma^T\nabla_x p^T}{\sigma x})$, and the terminal condition $g(x)=\|x\|^2$. We compare the explicit solution of the BSB equation with the solutions generated by $\ngo$, DeepONet, and Girsanov.
%\section{Extensions}

%\paragraph{Extension to elliptic PDEs}
%Finally, we note that the extension to elliptic PDEs is relatively straightforward as well. 
%Elliptic PDEs require computing the first hitting time of the boundary of the domain $\mathcal{D}$ at each point of evaluation. 
%Then, using the first hitting times, the importance sampling follows. 
%Specifically, we modify the stochastic exponential in~\eqref{eq:exp} to be:
%$
%\frac{\mathrm{d} \mathbb{P}_Y}{\mathrm{d} \mathbb{P}_X} := \exp\left( \int_0^{\tau} \mu(X_s) \mathrm{d}W_s - \frac12 \int_0^{\tau} \mu(X_s)^2 \mathrm{d} t \right)
%$
%where $\tau$ is the first hitting time of $\partial \mathcal{D}$ starting at $x$.

\section{Algorithm for \texorpdfstring{$\ngo$}{NGO} of semi-linear parabolic PDEs}
\label{sec:alg_semi_linear}

We present the complete algorithm of $\ngo$ on semi-linear parabolic PDEs in algorithm~\ref{alg:semi_linear}.

\begin{algorithm}[hbt!]
\caption{Approximating semi-linear PDEs with $\ngo$} \label{alg:semi_linear}
\begin{algorithmic}[1]
    \Require $N \in \mathbb{N}$, $h \in \mathbb{R}_+$, $\mu(t, x) : \mathbb{R}_+ \times \mathbb{R}^d \to \mathbb{R}^d$, terminal time $T$, initial position $X$ 
    \State Sample $N$ Brownian motions to time $T$ starting at $X$, $\left \{X + \sqrt{k h} \varepsilon^{(i)}\right\}_{k =1\ldots T/h }^{i=1 \ldots N}$, $\varepsilon \sim \mathcal{N}(0,1)$
    \For{$i \in \{1, \ldots, N\}$} 
        %\State Compute $\frac{\mathrm{d} \mathbb{P}_\mu^{(i)}}{\mathrm{d} \mathbb{P}_W} \approx \sum_{k=1}^{T/h} \mu(k, W_k^{(i)}) \sqrt{k} \varepsilon^{(i)} - \frac12 \sum_{k=1}^{T/h} \mu^2(k, W_k^{(i)}) h $ 
        \State Compute $\frac{\mathrm{d} \mathbb{P}_\mu^{(i)}}{\mathrm{d} \mathbb{P}_W} \approx \ngo^\mathrm{expmart}\left[ \left\{\mu\left( W_{k }^{(i)} \right) \right\}_{k=1}^{ T/h}, \left\{\sqrt{kh}\varepsilon^{(i)} \right\}_{k=1}^{T/h},h\right] $ 
        \State Compute $S_T^{(i)}\approx g(W_T^{(i)}) \ngo^\mathrm{expmart}_T$ and $Z_T^{(i)}=\ngo^\mathrm{grad}\left[S_T^{(i)},W_T^{(i)} \right]$ 
    \EndFor 
    \For{$k \in \{1, \ldots, T/h\},\,i\in\{1, \ldots,N\}$} 
        %\State Compute $\frac{\mathrm{d} \mathbb{P}_\mu^{(i)}}{\mathrm{d} \mathbb{P}_W} \approx \sum_{k=1}^{T/h} \mu(k, W_k^{(i)}) \sqrt{k} \varepsilon^{(i)} - \frac12 \sum_{k=1}^{T/h} \mu^2(k, W_k^{(i)}) h $ 
        \State Compute $S_{T-kh}^{(i)}=S_{T-(k-1)h}^{(i)}+h(T-kh,W_{T-kh}^{(i)},S_{T-(k-1)h}^{(i)},Z_{T-(k-1)h}^{(i)}) \times h \times \ngo^\mathrm{expmart}_{T-(k-1)h}$ 
        \State Compute $Z_{T-kh}^{(i)}=\ngo^\mathrm{grad}\left[S_{T-kh}^{(i)},W_{T-kh}^{(i)} \right]$
    \EndFor 
    
    %yi[i-1,:,:] = yi[i,:,:] + h(t_current,x_current,yi[i,:,:],z_current,coef1) * self.dt * cnn_expmart[i-1,:,:]
    
    %\State compute as $\check{u}(T,X) = \frac1N \sum_{i=1}^N p_0(W_T^{(i)}) \frac{\mathrm{d} \mathbb{P}_\mu^{(i)}}{\mathrm{d} \mathbb{P}_W}$
    \Ensure Approximation of $u(T-kh,X_{T-kh})$ as $\check{u}(T-kh,X_{T-kh})=\frac1N \sum_{i=1}^N S_{T-kh}^{(i)}$
\end{algorithmic}
\end{algorithm}





\section{Other parameters for meta-learning}

In the main text, we focused on meta-learning the prior $p_\mathrm{meta}$ for the generative modeling task. 
Here we describe how to meta-learn the other parameters associated with the PDE and provide examples of use cases. 
In terms of the solution to a PDE, this corresponds to learning an optimal initial condition that satisfies all tasks. 

\subsection{Meta-learning the base \texorpdfstring{$\mu_0$}{mu0}}
In the experiments provided in the main text, we always considered sampling from $\mathbb{P}_{X_t}$ being standard Brownian motion (that is, $\mu_0 = 0$). 
However, this need not be the case. 
Consider $k$ task-specific $\{\mu_i\}_{i=1}^k$ drifts.
We can learn an optimal $\mu_0$ that minimizes the distance between all the task-specific $\mu_i$'s.
This has the effect of $\mu_i - \mu_0 \to 0$ for all $i$, which would lead to a lower-variance importance sampling estimate.
However, this comes at the expense of requiring an Euler-Maruyama solve for each iteration of training since $\mu_0$ would need to change. 

\paragraph{Example: baseline policy}
Suppose our interest lies in solving the following maximization problem:
$$ 
\max_{\mu_0} \E_i \left\{\E_{\mu_i - \mu_0}[J(X_T)] \right \},
$$
where we compute the inner expectation over an objective function and the outer expectation over various tasks $i$ with distinct drift functions $\mu_i$.
This maximization problem could, for example, relate to the maximization of a portfolio under $k$ different market conditions.
The meta-learned parameter $\mu_0$ then describes the optimal policy in all $k$ market conditions. 
We rewrite this maximization problem using Girsanov's theorem as 
$$
\max_{\mu_0} \E_i \left\{\E_{ -\mu_0}\left[J(X_T) \exp \left ( \int_0^T \mu_i \d W_t - \frac12 \int_0^T \mu_i^2 \d t \right ) \right] \right \},
$$
leading to a similar meta-learning problem as described in the main text.

\subsection{Meta-learning the base \texorpdfstring{$\sigma_0$}{sigma0}}

In addition to optimizing for the base drift, $\mu_0$, we can also consider optimizing for a baseline $\sigma_0$. Diffusion models utilize similar concepts by learning $\sigma$ values for known SDEs. However, due to challenges with sampling state-dependent diffusion, it is more convenient to consider a function linear in $X_t$. In the following example, we will delve into this issue further.

\paragraph{Example: baseline volatility}

Suppose our interest lies in sampling from a generative model with distributions that satisfy a Fokker-Planck equation. In order to sample from all target distributions optimized for the considered set of distributions, we can estimate a baseline volatility. Score-based generative modeling applies similar concepts, typically using an affine SDE as a base model that is adapted for different target distributions.
To solve the same maximization problem we previously discussed, we can now introduce a parameter $\sigma(t)$ shared among all sample paths for different tasks. We express the problem as:
$$
\max_{\sigma_\star(t)} \E_i \left \{ \E_{\mu_0} \log p_0(Y_T) + \int \sigma^{-1}(t) \mu_i \d W_t - \frac12 \int \mu_i^T \sigma^{-1}(t) \mu_i  \d t  \mid Y_0 \sim p_i(Y) \right \}.
$$

\paragraph{Extension to state-dependent volatility}
The primary focus of this study is situations where $\mu$ represents different parameters of a partial differential equation's (PDE) solution or various tasks within a meta-learning framework. We also examine the instances where the volatility, denoted by $\sigma$, changes. We limit our consideration to those volatilities that fulfill the conditions set by the multivariate \emph{Lamperti transform}~\citep[Proposition 1]{ait2008closed}.

The Lamperti transform imposes specific constraints on the partial derivatives, which come from Itô's Lemma in~\ref{lem:ito}. This property enables converting the corresponding diffusion into a form with unit volatility. 

Assuming the existence of a function $f$ such that $\sigma = \nabla_x f( \, \cdot \,)$, the PDE can be solved for various parameters using the following approximation:

$$
\mathbb{E}_{\mathbb{P}_{X, \sigma}}[p_0(X_T) \mid \mathcal{F}_T] \approx \mathbb{E}_{\mathbb{P}_Y}\left [p_0(Y_t)\ngo\left(\{\mu(f(Y_{s_n})), \Delta f( W_{s_n}), h \}_{n=1}^{N_T} ; \theta\right ) \; \Big | \; \mathcal{F}_T\right ].
$$

Finally, the operator approximates the integral, with the additional component contributed by the trace of the Hessian, as outlined in equation~\eqref{eq:ito_lemma}.

%\paragraph{Example: Market Volatility}

%\section{Derivation of the Feynman-Kac Formula for the Fokker-Planck Equation}

\begin{comment}
\section{Additional details on semi-linear PDEs}
\label{sec:fbsde}

In the context of semi-linear partial differential equations (PDEs), we examine two stochastic processes: $X_t$ and $Y_t$. We can express $Y_T$ as $Y_T = \E[g(X_T) \mid \mathcal{F}_t]$. Leveraging the importance sampling argument, we further represent $Y_T$ as $Y_T = \E[g(W_T) \frac{\d \mathbb{P}{X_T}}{\d \mathbb{P}_{W_T}} \mid \mathcal{F}_t]$.
Moving forward, let's denote $Z_t = \nabla_x Y_t$. With this, the evolution of $Y_t$ for $t< T$ can be written in a discretized form as: $Y_t = Y_{t+1} - h(X_t, Z_t, t) h + Z_t \Delta W_t$.
The $\ngo$ approximation for $Y_T$ can be expressed as: $\E[g(W_T) \ngo({\mu(W_n), \Delta W_n, h }_{n=1}^{N_T}; \theta) \mid \mathcal{F}_t]$.
Alternatively, this can also be written as: $\E[g(X_{t+1}) - g(X_t)] \approx \E[h(X_t, Z_t, t) \ngo ]$.
\ah{continue here}
\end{comment}

\section{Maximization problems}
\label{sec:max}
We presented the main motivation of the maximization problem in terms of the Fokker-Planck equation under different types of target distributions.

\paragraph{Maximizing value functions}
Consider an example where we wish to obtain the policy that maximizes a certain utility function. 
This situation arises, for instance, in a portfolio optimization problem where we assume a particular stochastic differential equation (SDE) governs a vector of assets $S_t$, and we aim to find the policy $\pi^\star$ that maximizes the utility $J$ across $K$ different market scenarios. 
In other words, we seek the meta-parameter $\mu_\mathrm{meta} = \pi^\star$ that maximizes the utility across the various scenarios, with each scenario specified by the drift $\mu^{(i)}$.

This maximization problem is expressed as $\max_{\pi} \sum_{i=1}^{K} \E_{\mu^{(i)}}[J(S_T)]$. 
To tackle this problem, we employ the same Monte Carlo approach used in the generative modeling case. 
Assuming a linear interaction between the policy and the assets, we once again apply Jensen's inequality to maximize the expectation of the logarithm.
The evidence lower bound (ELBO) in this case is given by:
\[
\max_{\pi} \sum_{i=1}^{K} \E_{\pi} \left[ J(S_T) + \int_0^T \mu^{(i)}(S_t) \, \d W_t - \frac{1}{2} \int_0^T \left(\mu^{(i)}(S_t)\right)^2 \, \d t \right].
\]
This formulation eliminates the need to recompute sample paths, as only the sample path corresponding to $\pi$ requires computation.

\paragraph{Example: Meta-learning $p_0$ for $K$ tasks}
Continuing with our example from the main text, we assume that all tasks have some relationship to each other, and we aim to leverage these relationships to enhance the performance of the sampling task compared to individual training. 
We consider the initial distribution, $p_0$, as a meta-learned parameter, which we can represent using a parametric model, $p_\mathrm{meta}(\,\cdot\,; \theta) \equiv p_0$.

Combining this parametrization with the forward SDE, we need to solve
\begin{align*}
   \max_{p_\mathrm{meta}} &\sum_{i=1}^K  \max_{\mu_i} \text{ELBO}_{\mathrm{IS}}(\{X^{(i)}\}; \mu_i) = \\  
   \max_{p_\mathrm{meta}} & \sum_{i=1}^K  \max_{\mu_i} \mathbb{E}\left[\int_0^T\mu_i(Y_{s},s)\,\mathrm{d}W_s -\frac{1}{2}\int_0^T\mu_i^2(Y_{s},s) - \nabla \cdot \mu_i(Y_s, s) \,\mathrm{d}s + \log p_\mathrm{meta}(Y_T) \right].
\end{align*}

Maximizing this expression jointly over the drifts $\mu_i$, one for each target $p_i$ that we want to approximate, allows the information shared across the $p_i$ to be captured by $p_\mathrm{meta}$. It is important to note that the same derivation holds if we compute the expectation over full sample paths without employing importance sampling:
\begin{align*}
    \max_{p_\mathrm{meta}}\sum_{i=1}^K \max_{\mu_i} \mathrm{ELBO}_{\mathrm{direct}} (\{X^{(i)}\}; \mu_i) =  \max_{p_\mathrm{meta}}\sum_{i=1}^K \max_{\mu_i} & \mathbb{E}\left[\int_0^T\nabla\cdot\mu_i (X_{s}^{(i)},s)\,\mathrm{d}s + \log p_\mathrm{meta}\left(X_T^{(i)}\right)\right].
\end{align*}

However, in the case of $\mathrm{ELBO}_\mathrm{direct}$, the sample paths need to be recomputed at each iteration since they depend on the parameters of the drift, whereas the importance-sampling-based method only needs to resample the simpler process under $\mathbb{P}_{Y_t}$ at each iteration.
Moreover, the direct case requires sequential computation at each time step, whereas the computation in $\mathrm{ELBO}_{\mathrm{IS}}$ can be parallelized.


\end{document}
