%\documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.


\usepackage{graphicx} % Required for inserting images
\usepackage{amsfonts}
\usepackage{amsmath}
\newcommand\Tau{\mathrm{T}}
%\usepackage{appendix}
\usepackage{comment}
\usepackage{amssymb}
\usepackage{xcolor}
\usepackage{amsthm}


\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}

\usepackage{float}
\usepackage[caption = false]{subfig}
%\usepackage[final]{graphicx}

% \usepackage[british]{babel}

%% Some suggested packages, as needed:
% \bibliographystyle{plainnat} % duplicate; the style is set once below, after natbib is loaded

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{A Generalized Bayesian Approach to Distribution-on-Distribution Regression}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
%
% Add authors
\author[1]{\href{mailto:ngja@tcd.ie?Subject=Your UAI 2024 paper}{Tin Lok James Ng}{}}
% Add affiliations after the authors
\affil[1]{%
    School of Computer Science and Statistics\\
    Trinity College Dublin\\
    Dublin, Ireland
}
  
  \begin{document}
\maketitle

\begin{abstract}
In recent years, there has been growing interest in distribution-on-distribution regression, a regression problem where both covariates and responses are represented as probability distributions. Despite various methodologies proposed to address this challenge, a notable absence has been a Bayesian approach, which offers benefits by allowing for the integration of prior knowledge and providing a formal means of quantifying uncertainty. However, a major challenge in employing a Bayesian approach lies in the complexity of fully specifying the data generating process. To overcome this obstacle, we adopt a generalized Bayesian approach and investigate the contraction rates of the resulting generalized (Gibbs) posterior distributions. We propose an MCMC algorithm to sample from the generalized posterior distribution and conduct simulation studies to validate the theoretical findings. Finally, we apply the model to a data application involving mortality data.
\end{abstract}

\section{Introduction}
Performing inference on a set of probability distributions has become increasingly popular in both the statistics and machine learning communities \citep{Poczos2013, Hron2016, Petersen2019, Pegoraro2022, Chen2023}. However, the inherent non-linear structure of the space of probability distributions poses challenges when applying methods designed for Euclidean or functional data. An approach to address this challenge involves applying a suitable transformation to map the probability distributions to a space with a linear structure \citep{Kneip2001, Petersen2016}. However, this approach does not consider the geometry of the space of probability distributions, resulting in a non-isometric transformation altering distances between pairs of distributions.
\\\\
The Wasserstein metric has gained prominence as a tool for measuring the distance between probability distributions. This metric has been applied in various contexts, such as principal component analysis (PCA) of probability distributions, and K-means clustering of probability distributions \citep{Zhuang2022}. The Wasserstein metric has also found applications in distribution-on-distribution regression, a type of regression modeling where both predictors and responses are distributions. For example, \cite{Chen2023} and \cite{Zhang2022} use the tangent structure of Wasserstein space to develop linear regression models between tangent spaces. On the other hand, \cite{Ghodrati2022} proposed a regression model using a monotone optimal transport map. Recently, there has been growing research aimed at addressing the problem in higher dimensions \citep{Okano2023, Ghodrati2023}. This has proven to be a challenging task due to the lack of a closed-form solution for computing the Wasserstein distance in general cases and the curse of dimensionality.
\\\\
Despite the increasing array of methodologies addressing distribution-on-distribution regression, a Bayesian approach for this problem is notably absent. A Bayesian perspective offers advantages by providing a principled means to incorporate prior information and a formal framework for understanding and quantifying uncertainty associated with the regression operator. In this work, we address this gap by proposing a (generalized) Bayesian framework for distribution-on-distribution regression. Given the potential challenges of fully specifying the data generation process in a standard Bayesian approach, we navigate this issue by adopting the generalized Bayesian framework \citep{Bissiri2016, Syring2023}, which replaces the (negative) log-likelihood function with a loss function.
\\\\
In this work, our focus is on scenarios where both the covariate and response distributions are defined on the real line. Following the approach in \cite{Ghodrati2022}, we directly model the regression operator using a monotone transport map. We parameterize the monotone transport map through Bernstein polynomial basis functions. A natural loss function is introduced, enabling the use of a generalized Bayesian framework for inferring the monotone transport map. We investigate the contraction rates of the (generalized) posterior distribution in two settings: one where the covariate and response measures are directly observed, and another where only consistent estimates of the measures are available, obtained from random samples of each respective measure.
\\\\
The rest of the article follows this structure: In Section \ref{sec_background}, we provide an overview of Wasserstein space, Gibbs posterior distributions, and Bernstein polynomials. We then outline our modeling assumptions for distribution-on-distribution regression, discuss prior specification, examine the contraction rates of Gibbs posterior distributions, and elaborate on the MCMC sampling process in Section \ref{sec_method}. Section \ref{sec_simulation} delves into the details of the simulation studies, while Section \ref{sec_app} focuses on a data application. In Section \ref{sec_discussion}, we explore potential extensions of the current work.

\section{Background}
\label{sec_background}
\subsection{Wasserstein Space}
Let $\Omega \subset \mathbb{R}$ be a compact interval, let ${\cal W}_2(\Omega)$ be the set of probability measures on $\Omega$ with finite second moment. The 2-Wasserstein distance on ${\cal W}_2(\Omega)$ is defined as
\begin{equation}
\label{Wasserstein_dist}
    d_{{\cal W}}^2(\mu, \nu) = \inf_{\gamma \in \Gamma(\mu, \nu)} \int_{ \Omega \times \Omega} |x-y|^2 d\gamma(x,y) ,
\end{equation}
where $\mu, \nu \in {\cal W}_2(\Omega)$ and $\Gamma(\mu, \nu)$ is the set of couplings of $\mu$ and $\nu$, that is, the set of probability measures on $\Omega \times \Omega$ with marginals $\mu$ and $\nu$. The $\gamma$ that achieves the infimum on the RHS of \eqref{Wasserstein_dist} is said to be the optimal transport plan between $\mu$ and $\nu$.
\\\\
A deterministic map $T: \Omega \rightarrow \Omega$ is said to be a transport map from $\mu$ to $\nu$ if $\nu = T \# \mu$, that is, $\nu$ is the pushforward measure of $\mu$ under the map $T$, i.e. $\nu(B) = \mu(T^{-1}(B))$ for all Borel sets $B$. If, for some $T$, the joint distribution of $(X, T(X))$ with $X \sim \mu$ achieves the infimum on the RHS of \eqref{Wasserstein_dist}, then $T$ is said to be an optimal transport map.
\\\\
When $\mu$ is absolutely continuous with respect to the Lebesgue measure on $\Omega$, the optimal map is given by $T = F_{\nu}^{-1} \circ F_{\mu}$, where $F_{\mu}$ and $F_{\nu}$ are the cumulative distribution functions for $\mu$ and $\nu$, respectively \citep[Chapter~6]{Ambrosio2008}. In this case, the 2-Wasserstein distance \eqref{Wasserstein_dist} reduces to
\begin{eqnarray}
\label{Wasserstein_dist_2}
 d_{{\cal W}}^2(\mu, \nu) = \int_0^1 | F_{\mu}^{-1}(p) - F_{\nu}^{-1}(p) |^2 dp .
 \end{eqnarray}

\subsection{Gibbs Posterior Distribution}
The generalized (Gibbs) posterior distribution \citep{Bissiri2016, Syring2023} generalizes the standard posterior distribution in Bayesian inference setting by replacing the log-likelihood function with a (negative) loss function. Consider random elements $U_1, \ldots, U_n$  generated from some distribution $P$, and suppose we wish to make inference on a relevant feature of $P$ defined as some functional $\theta = \theta(P)$, taking values in $\Theta$. The Gibbs posterior framework requires specifying a loss function $\ell_{\theta}(u)$ that measures how closely $\theta$ agrees with a data point $u$. The risk function corresponding to the loss $\ell_{\theta}(u)$ is
\begin{eqnarray}
\label{eqn_expected_risk_general_def}
 R(\theta) = P \ell_{\theta}(U) .    
\end{eqnarray}
Here, $P f$ denotes the expectation of $f(U)$ with respect to $U \sim P$. 
\\\\
The risk function is unattainable since it depends on the unknown distribution $P$, and thus the inference is conducted using the empirical risk:
$$ R_n(\theta) = \frac{1}{n} \sum_{i=1}^{n} \ell_{\theta}(U_i) .$$
Given a prior distribution $\Pi_{\Theta}$ on $\Theta$, the Gibbs posterior distribution $\Pi_{\Theta}^{(n)}$ is defined as
\begin{eqnarray}
\label{gibbs_posterior}
    \Pi_{\Theta}^{(n)}(d \theta) \propto e^{-\omega n R_n(\theta) } \Pi_{\Theta}(d \theta ), \quad \theta \in \Theta, 
\end{eqnarray} 
where $\omega > 0$ is the so-called learning rate parameter specified by the data analyst. The right-hand side of \eqref{gibbs_posterior} is assumed to be integrable in $\theta$, and thus the proportionality constant is well defined. 
\\\\
Let $\theta_0$ denote the parameter that minimizes the expected risk \eqref{eqn_expected_risk_general_def}. Given a semi-metric $d$ on $\Theta$, the Gibbs posterior distribution is said to asymptotically concentrate around $\theta_0$ at rate (at least) $\epsilon_n$, with respect to $d$, if 
\begin{eqnarray}
\label{contraction_def}
 P^n \Pi_n(\{ \theta: d(\theta, \theta_0) > M_n \epsilon_n\} ) \rightarrow 0    
\end{eqnarray}
as $n \rightarrow \infty$, where $M_n \rightarrow \infty$ arbitrarily slowly or $M_n = M$ for some large constant $M$. Here $P^n f$ denotes the expectation of $f(U_1, \ldots, U_n)$ where $U_1, \ldots, U_n$ are an i.i.d. sample from $P$. The general conditions for contraction of Gibbs posterior distribution were studied in \cite{Syring2023}. 



\subsection{Bernstein Polynomials}
\label{sec_BP}
Bernstein polynomials (BP) basis functions are a popular choice for monotone and shape-constrained regression \citep{Chak2005, Curtis2011, Wilson2020}. The $k$th BP basis function of order $K$ is
$$ b_K(x,k) = {K \choose k} x^k (1-x)^{K-k},$$
for $k=0,1,\ldots,K$ and for $x \in [0,1]$. An unknown regression function $f: [0,1] \rightarrow \mathbb{R}$ can then be modelled as
\begin{eqnarray}
\label{Bernstein_basis_rep}
    f(x) = \sum_{k=0}^{K} \beta_k b_K(x,k) .
\end{eqnarray} 
A sufficient condition for $f$ to be monotonically increasing is that $\beta_k \le \beta_{k+1}$, $k=0,\ldots, K-1$. \cite{Curtis2011} uses the re-parameterization $\theta_0 = \beta_0$, and $\theta_k = \beta_k - \beta_{k-1}$ for $k=1,\ldots, K$, so that the constraint becomes $\theta_k \ge 0$ for $k=1,\ldots,K$. It can be shown that \eqref{Bernstein_basis_rep} can then be written as
\begin{eqnarray}
\label{Bernstein_basis_rep_2}
    f(x) = \theta_0 + \sum_{k=1}^{K} \theta_k G_{B(k,K-k+1)}(x),
\end{eqnarray} 
where $G_{B(k,K-k+1)}$ is the cumulative distribution function of the Beta distribution with parameters $k$ and $K-k+1$. This re-parameterization is advantageous when the target monotone function exhibits relative flatness within certain sub-intervals $(a,b) \subset [0,1]$ \citep{Curtis2011}. For finite $K$, the BP basis functions with these constraints do not span the entire class of continuous monotonic functions. However, as $K \rightarrow \infty$, any continuous monotonic function $f: [0,1] \rightarrow \mathbb{R}$ can be increasingly well approximated using BP basis functions \citep{Chang2007}.

\subsection{Other Related Works}
There are other regression problems related to the distribution-on-distribution regression problem, such as real-valued responses paired with distribution predictors \citep{Law2018}, distributional responses paired with Euclidean predictors \citep{Han2020}, 
and regression with manifold-valued data \citep{Shi2009, Lin2017}. A distinctive feature of the distribution-on-distribution regression problem is that it typically involves working with Wasserstein space and optimal transport theory.



\subsection{Additional Notations}
We use $a \lesssim b$ to denote that there exists a constant $C$ such that $a \le C b$ holds, and $a \asymp b$ to denote that $a \lesssim b$ and $b \lesssim a$. Given a measure $\mu$, the $L^p$ norm of a function $f$ is denoted by $||f||_{L^p(\mu)}$.



\section{Distribution-on-distribution regression}
\label{sec_method}
\subsection{Model Specification}
\label{sec_model}
We consider the setting that we have access to a sample of $n$ independent and identically distributed covariate-response pairs of measures $\{(\mu_i, \nu_i)\}_{i=1}^{n}$ in ${\cal W}_2(\Omega) \times {\cal W}_2(\Omega)$. We consider the case where $\Omega$ is compact and assume without loss of generality that $\Omega = [0,1]$. We let $P$ denote the joint distribution of the covariate and response measures $(\mu, \nu)$, and $P(\cdot|\mu)$ denote the conditional distribution of the response $\nu$ given the covariate $\mu$. 
\\\\
As in \cite{Ghodrati2022}, we define the regression operator $\Gamma: {\cal W}_2(\Omega) \rightarrow {\cal W}_2(\Omega)$ as the minimizer of the conditional Fr\'echet functional
\begin{eqnarray}
\label{Regression_operator}
    \Gamma(\mu) := \mbox{argmin}_{b \in {\cal W}_2(\Omega)} \int_{{\cal W}_2(\Omega)} d_{{\cal W}}^2(b, \nu) dP(\nu|\mu) ,
\end{eqnarray}
where the definition above assumes the uniqueness of the Fr\'echet mean of $P(\cdot|\mu)$. In a regression setting with covariate $x \in \mathbb{R}^d$ and scalar response $y \in \mathbb{R}$, the regression function $f: \mathbb{R}^d \rightarrow \mathbb{R}$ may be defined as $f(x) := \mbox{argmin}_{w \in \mathbb{R}} \mathbb{E}\big( |w - Y|^2|X=x \big)$ where $\mathbb{E}(\cdot|X=x)$ denotes the conditional expectation of the response given covariate $x$. We note that in the formulation \eqref{Regression_operator}, the notion of expectation is replaced by a Wasserstein-Fr\'echet mean.
\\\\
\cite{Ghodrati2022} adopts a non-parametric approach and only imposes a shape constraint on the regression operator by assuming $\Gamma(\mu) = T\# \mu$ where $T: \Omega \rightarrow \Omega$ is left unspecified and assumed to be a monotone increasing map.
\\\\
In this work, we consider a parametric approach and model the map $T$ using the BP basis functions as described in Section \ref{sec_BP}:
$$ T_{\boldsymbol{\theta}}(x) = \theta_0 + \sum_{k=1}^{K} \theta_k G_{B(k,K-k+1)}(x) ,$$
where $\boldsymbol{\theta} = (\theta_0, \ldots, \theta_K)^{T}$ are the unknown coefficients. To ensure that $T_{\boldsymbol{\theta}}$ is monotonically increasing with range contained in $[0,1]$, we require $\theta_k \ge 0$, $k=0,\ldots,K$, and $\sum_{k=0}^{K} \theta_k = 1$. 
\\\\
We let $\Theta := \{ \boldsymbol{\theta} \in \mathbb{R}^{K+1}: \theta_k \ge 0, k=0,1,\ldots,K, \sum_{k=0}^{K} \theta_k = 1 \} $ denote the parameter space. We then assume that the regression operator $\Gamma$ satisfies $\Gamma(\mu) = T_{\boldsymbol{\theta}_0}\# \mu$ for some $\boldsymbol{\theta}_0 \in \Theta$. That is, we assume that the model is well-specified where the parameter space contains the ``true'' parameter $\boldsymbol{\theta}_0$. The optimal transport map $T_{\boldsymbol{\theta}_0}$ acts to move the probability mass assigned by the covariate measure $\mu$ from a subinterval $(a,b) \subset \Omega$ to the corresponding transformed subinterval $(T_{\boldsymbol{\theta}_0}(a), T_{\boldsymbol{\theta}_0}(b))$.
\\\\
While assuming that the parameter space $\Theta$ encompasses the true parameter $\boldsymbol{\theta}_0$ may seem somewhat limiting, our proof approach hinges on this assumption. This assumption is crucial to demonstrate the existence and uniqueness of the risk minimizer of \eqref{eqn_expected_risk_general_def} and enables us to establish the contraction rates of the generalized posterior distributions. Investigating the theoretical properties of the proposed framework when the proposed model does not contain the true optimal transport map is deferred to future research endeavors.
\\\\
Given covariate-response pairs $(\mu_i, \nu_i)$, we assume the regression model takes the form
\begin{eqnarray}
\label{Full_reg_model}
   \nu_i = T_{\epsilon_i}\# (T_{\boldsymbol{\theta}_0}\# \mu_i ), \quad i=1,\ldots,n, 
\end{eqnarray}
where $T_{\epsilon_i}$ are independent and identically distributed random transport maps. $T_{\epsilon_i}$ can be interpreted as noise in the model. As in \cite{Ghodrati2022}, the specific distribution of $T_{\epsilon_i}$ is left unspecified, and is only required to be monotonically increasing and satisfy $\mathbb{E}(T_{\epsilon_i}(x)) = x$ for almost every $x \in \Omega$.
\\\\
Hence, our model structure can be understood as a semi-parametric approach, comprising a parametric component for the optimal transport map $T_{\boldsymbol{\theta}}$ and a nonparametric component for the random error maps $T_{\epsilon_i}$. In contrast to a conventional Bayesian framework that necessitates fully specifying the random error maps, in the generalized Bayesian setting, they can remain unspecified. This offers several advantages. Firstly, fully parameterizing the random error maps might prove challenging and increase the likelihood of model mis-specification. Secondly, by leaving the random error maps unspecified, we can concentrate our modeling efforts on the optimal transport map $T_{\boldsymbol{\theta}}$, resulting in more efficient posterior sampling.
\\\\
With the modeling assumptions described above, the distribution-on-distribution regression problem now becomes inferring the unknown parameter $\boldsymbol{\theta}_0 = (\theta_{0,0}, \ldots, \theta_{0,K})^{T}$ from a sample of covariate-response measures $\{\mu_i, \nu_i\}_{i=1}^{n}$. Choosing the 2-Wasserstein distance \eqref{Wasserstein_dist} as the loss function
$$ \ell_{\boldsymbol{\theta}}(\mu, \nu) = \frac{1}{2} d_{{\cal W}}^2(T_{\boldsymbol{\theta}} \# \mu, \nu),$$
the expected risk is given by
\begin{equation}
\label{expected_risk}
R(\boldsymbol{\theta}) = \frac{1}{2} \int_{{\cal W}_2(\Omega) \times {\cal W}_2(\Omega)} d_{{\cal W}}^2(T_{\boldsymbol{\theta}} \# \mu, \nu) dP(\mu, \nu) .
\end{equation}
We first state two assumptions to ensure that the true parameter  $\boldsymbol{\theta}_0$ is the unique parameter which minimizes the expected risk \eqref{expected_risk}. These assumptions are analogous to those in \cite{Ghodrati2022}.
\\\\
Let $P_M$ be the marginal distribution of the covariate measure $\mu$. 
\begin{assumption}
    \label{Assump_1}
Let $\mu$ be in the support of $P_M$, then $\mu$ is absolutely continuous with respect to the Lebesgue measure on $\Omega$.
\end{assumption}
\begin{assumption}
    \label{Assump_2}
The true regression model has the form $\nu = T_{\epsilon} \# (T_{\boldsymbol{\theta}_0}\# \mu)$ for some $\boldsymbol{\theta}_0 \in \Theta$, and the random optimal transport map $T_{\epsilon}$ satisfies $\mathbb{E}(T_{\epsilon}(x)) = x$ $\Omega$-a.e.
\end{assumption}
\begin{proposition}
Suppose that the joint distribution $P$ induced by the model \eqref{Full_reg_model} satisfies Assumptions \ref{Assump_1} and \ref{Assump_2}. Then $\boldsymbol{\theta}_0$ is the unique minimizer of the expected risk in \eqref{expected_risk}.
\end{proposition}
\begin{proof}
This result is a direct consequence of Theorem 3.3 of \cite{Ghodrati2022}.    
\end{proof}



\subsection{Prior Specification}
\label{sec_prior}
We now specify the prior on the unknown coefficients $\boldsymbol{\theta}$. Our prior structure is similar to the one adopted by \cite{Curtis2011}. We first sample $K + 1$ binary latent indicator random variables with parameter $p_{\gamma}$:
$$\gamma_0, \ldots, \gamma_K \sim \mbox{Ber}(p_{\gamma}) .$$
In particular, $p_{\gamma}$ determines the sparsity of the binary variables $\gamma_k$, $k=0,1,\ldots,K$.
We assign a beta prior on $p_{\gamma}$:
$$ p_{\gamma} \sim \mbox{Be}(a_p, b_p), \quad a_p > 0, b_p > 0.$$
Conditional on $\gamma_0, \ldots, \gamma_K$, we sample $u_k$ as
\begin{eqnarray}
\label{eqn_u_sample}
 u_k \sim \gamma_k \mbox{Unif}(0,1) + (1-\gamma_k) \delta_{\{0\}} ,\quad k=0,\ldots,K,    
\end{eqnarray}
where $\mbox{Unif}(0,1)$ is the uniform distribution on $(0,1)$ and $\delta_{\{0\}}$ is the Dirac measure on $0$. Finally, if $\sum_{k=0}^{K} \gamma_k > 0$, we set
$$ \theta_k = \frac{u_k}{\sum_{j=0}^{K} u_j }, \quad k=0,\ldots,K.$$
Otherwise, if $\sum_{k=0}^{K} \gamma_k = 0$, we set $\theta_k = 0$, $k =0,\ldots, K$. In particular, the case $\theta_k=0$ for all $k$ corresponds to the transport map $T(x) = 0$ for all $x \in [0,1]$. This is a degenerate case where the covariate measure $\mu$ is transformed to the Dirac measure $\nu = \delta_{\{0\}}$.
\\\\
\textbf{Alternative Prior Specification}\\
If prior information about the shape of the transport map is available, one may incorporate this information in the prior specification. Instead of sampling the random variables $u_k$ as a mixture of a uniform distribution and the Dirac measure $\delta_{\{0\}}$ as in \eqref{eqn_u_sample}, we sample $u_k$ as
$$ u_k \sim \gamma_k \mbox{Beta}(a_k, b_k) + (1- \gamma_k) \delta_{\{0\}}, $$
for appropriately chosen values $a_k, b_k > 0, k=0,\ldots,K$. We note that  \eqref{eqn_u_sample} is recovered by setting $a_k = 1, b_k = 1$.


\subsection{Concentration of Posterior Distribution}
Building upon the model assumptions detailed in Section \ref{sec_model} and the prior specification presented in Section \ref{sec_prior}, we study the contraction rates of the Gibbs posterior distribution in two scenarios. The first scenario involves perfect observation of both covariate and response measures, while the second scenario entails solely observing samples from the respective covariate and response measures.

\subsubsection{Perfect Observations}
We first consider the case where the measures $\{(\mu_i,  \nu_i)\}_{i=1}^{n}$ are perfectly observed. The empirical risk corresponding to the expected risk in \eqref{expected_risk} is given by
\begin{eqnarray}
    \label{empricial_risk}
 \quad R_n(\boldsymbol{\theta}) &:=& \frac{1}{n} \sum_{i=1}^{n} \ell_{\boldsymbol{\theta}}( \mu_i, \nu_i)  \nonumber \\
 &=& \frac{1}{2n} \sum_{i=1}^{n} d_{{\cal W}}^2(T_{\boldsymbol{\theta}} \# \mu_i, \nu_i) .
\end{eqnarray}
Before stating our first theoretical result, we have to introduce a distance on the space of optimal transport maps $\{ T_{\boldsymbol{\theta}}: \boldsymbol{\theta} \in \Theta\}$. As in \cite{Ghodrati2022}, we measure the distance between two optimal transport maps using the $L^2(Q)$ distance where
$Q$ is the measure defined as the linear average of $P_M$:
$$ Q(A) = \int_{{\cal W}_2(\Omega)} \mu(A) dP_M(\mu), \quad A \subset \Omega.$$
We show that the Gibbs posterior resulting from the empirical risk in \eqref{empricial_risk} and the prior distribution specified in Section \ref{sec_prior} contracts around the true optimal map $T_{\boldsymbol{\theta}_0}$ with respect to $||\cdot||_{L^2(Q)}$ at rate (at least) $ \epsilon_n = n^{-1/2}(\log n)^{1/2}$.


\begin{theorem}
\label{Posterior_contraction_rates}
Suppose Assumptions \ref{Assump_1} and \ref{Assump_2} hold. The Gibbs posterior distribution \eqref{gibbs_posterior} with empirical risk in \eqref{empricial_risk} asymptotically concentrates around the true optimal transport map $T_{\boldsymbol{\theta}_0}$ where $\boldsymbol{\theta}_0$ is the unique minimizer of $R(\boldsymbol{\theta})$ defined in \eqref{expected_risk}
with respect to $ ||\cdot||_{L^2(Q)}$ at rate (at least) $\epsilon_n = n^{-1/2} (\log n)^{1/2}$. That is,
$$ P^{n} \Pi_n ( \{ \boldsymbol{\theta}: ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)} > M \epsilon_n \}) \rightarrow 0 $$
as $n \rightarrow \infty$.
\end{theorem}
It's worth noting that the contraction result is expressed in terms of the optimal transport map $T_{\boldsymbol{\theta}}$, while the prior distribution is assigned to the parameter $\boldsymbol{\theta}$. The Gibbs posterior will concentrate around $T_{\boldsymbol{\theta}_0}$ as long as it concentrates around $\boldsymbol{\theta}_0$ since
$$ ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)} \lesssim ||\boldsymbol{\theta} - \boldsymbol{\theta}_0||_2 ,$$
where $||\cdot||_2$ is the $L^2$ norm on $\mathbb{R}^{K+1}$. \\\\
The proof of Theorem \ref{Posterior_contraction_rates} is provided in the supplementary material. In particular, the proof applies Theorem 3.2 of \cite{Syring2023}. This amounts to verifying a sub-exponential condition on the loss function and that the prior probability measure puts sufficient amount of mass around certain ``neighborhood'' of the true parameter $\boldsymbol{\theta}_0$.
\\\\
As our loss function $\ell_{\boldsymbol{\theta}}(\mu, \nu)$ is bounded with respect to both the parameter $\boldsymbol{\theta}$ and the measures $(\mu, \nu)$, we can employ the approach outlined in Section 3.4.1 of \cite{Syring2023} to verify the condition concerning the loss function. Since the parameter space is finite dimensional, the prior mass condition can be easily verified. 
\\\\
The presence of the logarithm factor in the contraction rate is due to the fact that in the finite dimensional case, it is impossible for a fixed prior to assign mass bounded away from $0$ to a shrinking neighborhood of $\boldsymbol{\theta}_0$.
\\\\
We note that the proof of the contraction result in Theorem \ref{Posterior_contraction_rates} does not rely on the properties of Bernstein basis functions. Therefore, this result can be adapted to alternative choices of basis functions, such as monotone B-spline bases \citep{Leitenstorfer2006}.






\subsubsection{Imperfect Observations}
We now consider the case where we do not observe $\{(\mu_i, \nu_i)\}_{i=1}^{n}$ but rather samples $\{ x_{ij} \}_{j=1}^{m}$ and $\{ y_{ij} \}_{j=1}^{m}$ from $\mu_i, \nu_i$, respectively, for $i=1,\ldots,n$. We let $\hat{\mu}_i^{m}$ denote the estimated covariate measure $\mu_i$ based on the sample $\{ x_{ij} \}_{j=1}^{m}$ and $\hat{\nu}_i^{m}$ denote the estimated response measure $\nu_i$ based on the sample $\{ y_{ij} \}_{j=1}^{m}$. For simplicity we consider the setting where the sample size $m$ is the same for all covariate and response measures. 
\\\\
To study the rate of convergence of the Gibbs posterior distribution, we assume that for any $(\mu,  \nu) \sim P$, we have a sequence of (deterministic) absolutely continuous measures $\hat{\mu}^m$ and a sequence of (deterministic) measures $\hat{\nu}^m$ such that
$$ d_{{\cal W}}(\hat{\mu}^m, \mu) \lesssim r_m^{-1}$$
$$ d_{{\cal W}}(\hat{\nu}^m, \nu) \lesssim r_m^{-1},$$
where $r_m^{-1}$ is the convergence rate with respect to the 2-Wasserstein distance.
\\\\
The empirical risk in this setting is given by
\begin{eqnarray}
\label{empirical_risk_2}
\tilde{R}^m_n(\boldsymbol{\theta}) &:=& \frac{1}{n} \sum_{i=1}^{n} \ell_{\boldsymbol{\theta}}(\hat{\mu}^m_i, \hat{\nu}^m_i) \nonumber \\
&=& \frac{1}{2n} \sum_{i=1}^{n} d_{{\cal W}}^2(T_{\boldsymbol{\theta}} \# \hat{\mu}^m_i, \hat{\nu}^m_i) .
\end{eqnarray}
In order for the Gibbs posterior distribution to contract around the true optimal map $T_{\boldsymbol{\theta}_0}$, we assume that $m(n)$ is a deterministic function of $n$ and that $m \rightarrow \infty$ as $n \rightarrow \infty$ and $r_m^{-1} \rightarrow 0$ as $m \rightarrow \infty$, and require that $ r_m^{-\frac{1}{2}} < \frac{1}{2} n^{-\frac{1}{2}} \log n $ for all $n$.

\begin{theorem}
\label{thm_posterir_contraction_rates_imperfect}
Suppose Assumptions \ref{Assump_1} and \ref{Assump_2} hold. Suppose $ r_m^{-\frac{1}{2}} < \frac{1}{2} n^{-\frac{1}{2}} \log n $ for all $n$. The Gibbs posterior distribution \eqref{gibbs_posterior} with empirical risk in \eqref{empirical_risk_2} asymptotically concentrates around the true optimal transport map $T_{\boldsymbol{\theta}_0}$ where $\boldsymbol{\theta}_0$ is the unique minimizer of $R(\boldsymbol{\theta})$ defined in \eqref{expected_risk} with respect to $ ||\cdot||_{L^2(Q)}$ at rate (at least) $\epsilon_n \asymp   n^{-1/2} (\log n)^{1/2} $. That is,
$$ P^{n} \Pi_n ( \{ \boldsymbol{\theta}: ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)} > M \epsilon_n \}) \rightarrow 0 $$
as $n \rightarrow \infty$.
\end{theorem}
The proof of Theorem \ref{thm_posterir_contraction_rates_imperfect} is presented in the supplementary material. Directly applying the general contraction theorems presented in \cite{Syring2023} is not feasible in the current context. Nonetheless, we can modify the proof methodology in Theorem 3.2 of \cite{Syring2023} to suit our current scenario.

\subsection{Bayesian Computation}
\label{sec_mcmc}
With prior specification described above, we sample from the Gibbs posterior distribution of $u_0, \ldots, u_K, \gamma_0, \ldots, \gamma_K$ and $p_{\gamma}$. The posterior of $\boldsymbol{\theta}$ and hence the optimal transport map $T_{\boldsymbol{\theta}}$ are then induced from the posterior of $u_0, \ldots, u_K$. We describe the sampling procedure for the scenario of perfect observation of measures, highlighting that the process for imperfect observation is entirely analogous.
\\\\
We first initialize all parameters $(\gamma_0^{(0)}, \gamma_1^{(0)}, \ldots, \gamma_K^{(0)})$, $p_{\gamma}^{(0)}$, and $(u_0^{(0)}, u_1^{(0)}, \ldots, u_K^{(0)})$.  For each iteration $t+1$, and for each $k=0,1,\ldots,K$, we jointly sample $(\gamma_k^{(t+1)}, u_k^{(t+1)})$ as follows:
\begin{eqnarray*}
\tilde{\gamma}_k^{(t+1)} = \begin{cases}
        1 - \gamma_k^{(t)} & \mbox{with probability } q_{\gamma}, \\
        \gamma_k^{(t)} & \mbox{with probability } 1 - q_{\gamma},
    \end{cases}
\end{eqnarray*}
where $q_{\gamma} \in (0,1)$. 
\\\\
Conditional on $\tilde{\gamma}_k^{(t+1)} = 0$, we set $\tilde{u}_k^{(t+1)} = 0$. Otherwise, we draw $\tilde{u}_k^{(t+1)}$ from $\mbox{Unif}(0,1)$. 
\\\\
We then compute $\boldsymbol{\tilde{\theta}}^{(t+1)}$ using 
$$ u_0^{(t+1)}, u_1^{(t+1)}, \ldots, u_{k-1}^{(t+1)}, \tilde{u}_k^{(t+1)}, u_{k+1}^{(t)}, \ldots, u_K^{(t)} ,$$
and set $(\gamma_k^{(t+1)}, u_k^{(t+1)}) = (\tilde{\gamma}_k^{(t+1)}, \tilde{u}_k^{(t+1)})$ with probability equal to
\begin{eqnarray}
\min \Bigg\{ 1, \frac{ e^{-\omega n R_n(\boldsymbol{\tilde{\theta}}^{(t+1)}) } \pi_{\gamma}(\tilde{\gamma}_k^{(t+1)}) }{ e^{-\omega n R_n(\boldsymbol{\theta}^{(t)})}  \pi_{\gamma}(\gamma_k^{(t)}) } \Bigg\} ,    
\end{eqnarray}
and $(\gamma_k^{(t+1)}, u_k^{(t+1)}) = (\gamma_k^{(t)}, u_k^{(t)})$ otherwise, where $\pi_{\gamma}(\gamma_k) = p_{\gamma}^{\gamma_k} (1-p_{\gamma})^{1-\gamma_k}$.
\\\\
The full conditional posterior distribution of $p_{\gamma}$ is of closed form and can be sampled directly:
$$ p_{\gamma}^{(t+1)} \sim \mbox{Be}\bigg(a_p + \sum_{k=0}^{K} \gamma_k^{(t+1)}, b_p + K + 1 - \sum_{k=0}^{K} \gamma_k^{(t+1)} \bigg) .$$
We need to specify the polynomial order $K$ for BP. Unlike monotone regression, where various strategies exist for determining $K$, such as setting it to the order of unique predictor values in the data, our context is more intricate. Therefore, we opt for a larger value of $K$ and rely on the fitting procedure to eliminate unnecessary basis functions. In our simulation studies and data application, we opt for $K=50$ while also examining alternative values for $K$.




\section{Simulation Studies}
\label{sec_simulation}
We conduct simulation studies to investigate the concentration of the generalized posterior distributions around the true optimal transport maps. In order to carry out these simulation studies, we need to simulate the covariate probability measures $\{ \mu_i \}_{i=1}^{n}$, the optimal transport map $T_{\boldsymbol{\theta}_0}$, and the random error maps $\{ T_{\epsilon_i} \}_{i=1}^{n}$.
\\\\
Each of the covariate measures is assumed to be a Beta distribution $\mbox{Be}(a,b)$, with the parameters $a$ and $b$ themselves generated randomly from Beta distributions, where $a \sim \mbox{Be}(a_1, b_1)$ and $b \sim \mbox{Be}(a_2, b_2)$.
\\\\
The parameters $\boldsymbol{\theta}_0$ governing the true optimal transport map $T_{\boldsymbol{\theta}_0}$ are sampled from the prior distribution outlined in Section \ref{sec_prior}. In all simulations, we fix $K=50$, while the sparsity parameter $p_{\gamma}$ varies across different simulation scenarios.
\\\\
To generate the random error maps $\{T_{\epsilon_i}\}_{i=1}^{n}$, we consider the class of random error optimal maps introduced in \cite{Panaretos2016}. The maps are defined as
$$ \Psi_0(x) = x,$$
$$ \Psi_z(x) = x - \frac{ \sin(\pi z x) }{|z| \pi }, \quad z \in \mathbb{Z} \setminus \{0\} .$$
These are strictly increasing smooth functions satisfying $\Psi_z(0) = 0$ and $\Psi_z(1) = 1$ for all $z \in \mathbb{Z}$. Random maps can be constructed by replacing the integer $z$ with a random variable $Z$ that has a distribution symmetric around 0; it is then straightforward to see that $\mathbb{E}( \Psi_Z(x) ) = x$ for all $x \in [0,1]$. 
\\\\
As in \cite{Panaretos2016}, we use the following distribution for $Z$ parameterized by $\lambda > 0$:
$$ \mathbb{P}(Z = 0) = e^{-\lambda}, $$
$$ \mathbb{P}(Z = +z) = \mathbb{P}(Z = -z) = \frac{e^{-\lambda } \lambda^z }{2 (z!) }, \quad z \in \{1, 2, \ldots\} .$$
The random error map can be constructed as follows: for $J > 1$, we simulate i.i.d. integer-valued symmetric random variables $z_j, j=1, \ldots, J$. We then simulate $J-1$ uniform random variables $v_1, \ldots, v_{J-1} \sim \mbox{Unif}(0,1)$. We let $v_{(1)}, \ldots, v_{(J-1)}$ denote the order statistics of $v_1, \ldots, v_{J-1}$.
The random error map is then given by
\begin{eqnarray*}
    T_{\epsilon}(x) &=& v_{(1)} \Psi_{z_1}(x) + \sum_{j=2}^{J-1} (v_{(j)} - v_{(j-1)}) \Psi_{z_j}(x) \\
   && + (1 - v_{(J-1)}) \Psi_{z_J}(x) .
\end{eqnarray*}
In all of our simulation scenarios, we set $J=20$ and $\lambda = 5$. The response measures $\{\nu_i\}_{i=1}^{n}$ are then generated based on the generated covariate measures $\{\mu_i\}_{i=1}^{n}$, the optimal transport map $T_{\boldsymbol{\theta}_0}$, and the random error maps $\{T_{\epsilon_i}\}_{i=1}^{n}$.
\\\\
We carry out three simulation studies, producing $n=100$ pairs of covariate-response measures for each study using the method outlined previously. Then, we perform posterior sampling on the simulated data, following the procedure outlined in Section \ref{sec_mcmc}. In each simulation study, we apply the algorithm described in Section \ref{sec_mcmc} to subsets of the data containing $n=5$, $n=20$, $n=50$, and $n=100$ pairs of covariate-response measures to explore how the behavior of the posterior distribution varies with sample size.
\\\\
The determination of the learning rate $\omega$ for the generalized posterior distribution is crucial. Several strategies for selecting $\omega$ have been proposed \citep{Grunwald2017, Holmes2017, Lyddon2019, Syring2018}, with a comparative analysis presented in \cite{Wu2023}. In our current scenario, we adopt the strategies of \cite{Syring2018} and \cite{Syring2023} to let $\omega$ be a decreasing function of the sample size $n$ and also tune $\omega$ to ensure the resulting Gibbs posterior has sufficient coverage probability. 
\\\\
The outcomes of the simulation studies are depicted in Figures \ref{plot_sim1}, \ref{plot_sim2}, and \ref{plot_sim3}. These figures showcase the estimated posterior means and posterior credible intervals of the optimal transport map, juxtaposed with the true optimal transport map. Notably, the posterior mean of the optimal transport map adeptly captures the true counterpart across all scenarios, with the true map being well-contained within the posterior credible intervals. These observations hold true across all three simulation scenarios and various sample sizes. Hence, even with a smaller sample size of $n=5$, the recovery of the true optimal transport map appears successful. This observation is not entirely unexpected, as one can interpret each covariate-response pair as containing an infinite amount of data points. 
\\\\
In the supplementary material, we will further investigate the behavior of the model when it is misspecified for the true optimal transport maps. The first scenario involves the specified models having fewer basis functions $K$ than the true optimal transport maps. The second, more challenging scenario occurs when the true optimal transport maps do not belong to the model class, meaning they cannot be expressed as linear combinations of BP basis functions.


\begin{figure}[!htb]
\subfloat{\includegraphics[width = 1.7in]{Plots/sim1_plot_sub_5.png}}
\subfloat{\includegraphics[width = 1.7in]{Plots/sim1_plot_sub_20.png}}\\
\subfloat{\includegraphics[width = 1.7in]{Plots/sim1_plot_sub_50.png}} 
\subfloat{\includegraphics[width = 1.7in]{Plots/sim1_plot_full.png}} 
\caption{Simulation 1. Top left: $n=5$ ($\omega=500$). Top right: $n=20$ ($\omega=200$). Bottom left: $n=50$ ($\omega=100$). Bottom right: $n=100$ ($\omega=50$). Black curve: true optimal transport map. Blue dashed curves: estimated posterior mean and 95\% posterior credible intervals of the optimal transport map.}
\label{plot_sim1}
\end{figure}

\begin{figure}[!htb]
\subfloat{\includegraphics[width = 1.7in]{Plots/sim2_plot_sub_5.png}}
\subfloat{\includegraphics[width = 1.7in]{Plots/sim2_plot_sub_20.png}}\\
\subfloat{\includegraphics[width = 1.7in]{Plots/sim2_plot_sub_50.png}} 
\subfloat{\includegraphics[width = 1.7in]{Plots/sim2_plot_full.png}} 
\caption{Simulation 2. Top left: $n=5$ ($\omega=500$). Top right: $n=20$ ($\omega=200$). Bottom left: $n=50$ ($\omega=100$). Bottom right: $n=100$ ($\omega=50$). Black curve: true optimal transport map. Blue dashed curves: estimated posterior mean and 95\% posterior credible intervals of the optimal transport map.}
\label{plot_sim2}
\end{figure}

\begin{figure}[!htb]
\subfloat{\includegraphics[width = 1.7in]{Plots/sim3_plot_sub_5.png}}
\subfloat{\includegraphics[width = 1.7in]{Plots/sim3_plot_sub_20.png}}\\
\subfloat{\includegraphics[width = 1.7in]{Plots/sim3_plot_sub_50.png}} 
\subfloat{\includegraphics[width = 1.7in]{Plots/sim3_plot_full.png}} 
\caption{Simulation 3. Top left: $n=5$ ($\omega=500$). Top right: $n=20$ ($\omega=200$). Bottom left: $n=50$ ($\omega=100$). Bottom right: $n=100$ ($\omega=50$). Black curve: true optimal transport map. Blue dashed curves: estimated posterior mean and 95\% posterior credible intervals of the optimal transport map.}
\label{plot_sim3}
\end{figure}



\section{Data Application - Analysis of Mortality Data}
\label{sec_app}
We examine the age-at-death distributions for $n = 37$ countries in the years 1983 and 2013, sourced from the Human Mortality Database accessible via UC Berkeley and the Max Planck Institute for Demographic Research, openly available on www.mortality.org. The provided death rates span single years of age up to 109, with an open age interval for individuals aged 110 and above. Utilizing the binsmooth R package (version 0.2.2), we fit smooth cubic splines to the binned data to estimate the cumulative distribution functions (CDFs) for each country for both 1983 and 2013. Specifically, we designate the age-at-death distribution for the $i$th country in the year 1983 as the covariate distribution, and the corresponding distribution for the same country in the year 2013 as the response distribution. This allows for comparisons with the studies by \cite{Pegoraro2022} and \cite{Chen2023}.
\\\\
Figure \ref{plot_mortality} displays the posterior mean of the optimal transport map, accompanied by 95\% credible intervals. These results are derived through the MCMC sampling approach outlined in Section \ref{sec_mcmc}, utilizing covariate-response pairs for the $n=37$ countries. Here, we set $K=50$ and $p_{\gamma}=0.2$. Notably, the estimated posterior mean optimal transport map surpasses the identity transport map pointwise, suggesting an overall enhancement in mortality rates across all age groups, with the most significant improvements observed in younger age groups. These findings align with those of \cite{Pegoraro2022}. To evaluate the model's goodness of fit, we compare the cumulative distribution functions (CDFs) at year 2013 obtained from the MCMC samples with the observed CDFs at year 2013. Figure \ref{plot_cdfs} illustrates this comparison, indicating a favorable fit of the model to the data.





\begin{figure}[!htb]\includegraphics[width = 3in]{Plots/mortality_plot.png}
\caption{Mortality dataset. Blue: posterior mean and 95\% credible intervals of the optimal transport map. Red: identity transport map.}
\label{plot_mortality}
\end{figure}

\begin{figure}[!htb]
\subfloat{\includegraphics[width = 0.65in]{Plots/AUS.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/AUT.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/BEL.png}} 
\subfloat{\includegraphics[width = 0.65in]{Plots/BGR.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/BLR.png}}\\
\subfloat{\includegraphics[width = 0.65in]{Plots/CAN.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/CHE.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/CZE.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/DEUTE.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/DNK.png}}\\
\subfloat{\includegraphics[width = 0.65in]{Plots/ESP.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/EST.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/FIN.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/FRA.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/GBR.png}}\\
\subfloat{\includegraphics[width = 0.65in]{Plots/GRC.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/HUN.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/IRL.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/ISL.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/ISR.png}}\\
\subfloat{\includegraphics[width = 0.65in]{Plots/ITA.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/JPN.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/LTU.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/LUX.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/LVA.png}}\\
\subfloat{\includegraphics[width = 0.65in]{Plots/NLD.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/NOR.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/NZL.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/POL.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/PRT.png}}\\
\subfloat{\includegraphics[width = 0.65in]{Plots/RUS.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/SVK.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/SVN.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/SWE.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/TWN.png}}\\
\subfloat{\includegraphics[width = 0.65in]{Plots/UKR.png}}
\subfloat{\includegraphics[width = 0.65in]{Plots/USA.png}}
\caption{Black: True CDFs of age-at-death distribution in year 2013 for each country. Red: Estimated posterior means and posterior credible intervals of CDFs in year 2013 for each country.}
\label{plot_cdfs}
\end{figure}



\section{Discussion}
\label{sec_discussion}
In this study, we introduced a generalized Bayesian framework for distribution-on-distribution regression. We studied the contraction rates of the Gibbs posterior distribution under two scenarios: one where both covariate and response measures are fully observed, and another where we only have access to samples from these measures. Experimental studies were conducted to investigate the contraction properties of the posterior distribution.

In our theoretical analysis, we made the assumption that the true optimal transport map can be represented as convex combinations of basis functions from BP. However, it is desirable to examine the contraction rates of the posterior distribution in cases where the true optimal transport map does not conform to this assumed form. This may necessitate a different proof strategy, as our current approach heavily relies on this assumption. Additionally, an intriguing extension would involve extending the framework to higher dimensional settings. Nonetheless, in higher dimensions, there's often a trade-off between the flexibility of optimal transport maps and computational efficiency. One potential avenue to explore is adapting the additive monotone regression approach proposed by \cite{Engebretsen2019} to our present context.



% References
\bibliography{uai2024-GBDistReg}

\newpage

\onecolumn

\title{A Generalized Bayesian Approach to Distribution-on-Distribution Regression\\(Supplementary Material)}
\maketitle





\appendix

\section{Proofs}
\subsection{Proof of Theorem 1}
\label{proof_thm_post_cont_rate_1}
We first state a lemma which is a direct implication of Lemma 3.6 of \cite{Ghodrati2022}. For $\boldsymbol{\eta} \in \mathbb{R}^{K+1}$, we define
$$ T_{\boldsymbol{\eta}}(x) := \sum_{k=0}^{K} \eta_k G_{B(k,K-1+k)}(x) .$$
Note that we have extended the definition of the map $ T_{\boldsymbol{\eta}}$ from $\Theta$ to $\mathbb{R}^{K+1}$.
\begin{lemma}
\label{lemma_taylor_expansion}
For $\epsilon > 0$, we have the following expansion of the expected risk around $\boldsymbol{\theta}_1$:
\begin{equation*}
    R(\boldsymbol{\theta}_1 + \epsilon \boldsymbol{\eta}) = R( \boldsymbol{\theta}_1 ) + \epsilon D_{\boldsymbol{\eta}} R(\boldsymbol{\theta}_1) + \frac{\epsilon^2}{2} || T_{\boldsymbol{\eta}} ||^{2}_{L^2(Q)}
\end{equation*}
where 
$$D_{\boldsymbol{\eta}}R(\boldsymbol{\theta}_1) = \int_{ {\cal W}_2(\Omega) \times {\cal W}_2(\Omega) } \int_{0}^{1} (T_{\boldsymbol{\theta}_1}(F_{\mu}^{-1}(p)) - F_{\nu}^{-1}(p) ) T_{\boldsymbol{\eta}}(F_{\mu}^{-1}(p)) dp dP(\mu, \nu) $$
is the directional derivative of $R(\boldsymbol{\theta}_1)$ in the direction of $\boldsymbol{\eta}$.
\end{lemma}


\begin{proof}[Proof of Theorem 1]
We apply Theorem 3.2 of \cite{Syring2023} to derive the stated contraction rate.
\\\\
We need to show that the loss function $\ell_{\boldsymbol{\theta}}$ satisfies the sub-exponential condition:\\\\
There exist an interval $(0, \bar{\omega})$ and a constant $C_0 > 0$ such that for all $\omega \in (0, \bar{\omega})$ and for all sufficiently small $\delta > 0$, for $\boldsymbol{\theta} \in \Theta$,
\begin{eqnarray}
\label{sub_exp_cond}
||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)} > \delta \implies P e^{-\omega (\ell_{\boldsymbol{\theta}} - \ell_{\boldsymbol{\theta}_0})} < e^{-C_0 \omega \delta^2} .
\end{eqnarray}
We also need to show that the prior $\Pi$ puts sufficient amount of mass on ``neighborhood'' $G_n$ of the true parameter $\boldsymbol{\theta}_0$:
\begin{eqnarray}
\label{prior_mass_cond_1}
    \log \Pi(G_n) \gtrsim - n \epsilon_n^2 ,
\end{eqnarray}
where $G_n$ is defined as
$$ G_n := \{ \boldsymbol{\theta} \in \Theta: u(\boldsymbol{\theta}, \boldsymbol{\theta}_0) \le \epsilon_n^{2}, v(\boldsymbol{\theta}, \boldsymbol{\theta}_0) \le \epsilon_n^2 \},  \quad n=1,2,\ldots,$$
and $u(\boldsymbol{\theta}, \boldsymbol{\theta}_0)$ and $v(\boldsymbol{\theta}, \boldsymbol{\theta}_0) $ are the mean and variance of excess risk:
$$ u(\boldsymbol{\theta}, \boldsymbol{\theta}_0) := \frac{1}{2} P( d_{{\cal W}}^2(T_{\boldsymbol{\theta}} \# \mu, \nu) - d_{{\cal W}}^2(T_{\boldsymbol{\theta}_0} \# \mu, \nu) ) = R(\boldsymbol{\theta}) - R(\boldsymbol{\theta}_0)  ,$$
and
$$ v(\boldsymbol{\theta}, \boldsymbol{\theta}_0) := P \bigg( \bigg( \frac{1}{2} d_{{\cal W}}^2(T_{\boldsymbol{\theta}} \# \mu, \nu) - \frac{1}{2}d_{{\cal W}}^2(T_{\boldsymbol{\theta}_0} \# \mu, \nu) \bigg)^2 \bigg) -   u(\boldsymbol{\theta}, \boldsymbol{\theta}_0)^2.$$
We first show that the sub-exponential condition \eqref{sub_exp_cond} is satisfied. By compactness of $\Omega$, we have that for all $\boldsymbol{\theta} \in \Theta$ and all $(\mu, \nu)$ in the support of $P$,
$$ \ell_{\boldsymbol{\theta}}(\mu, \nu) - \ell_{\boldsymbol{\theta}_0}(\mu, \nu) < C, $$
for some constant $C > 0$. Thus, by Section 3.4.1 of \cite{Syring2023}, 
$$  P e^{-\omega (\ell_{\boldsymbol{\theta}} - \ell_{\boldsymbol{\theta}_0})} \le \exp \big( - \omega u(\boldsymbol{\theta}, \boldsymbol{\theta}_0) + C \omega^3 v(\boldsymbol{\theta}, \boldsymbol{\theta}_0) \big), $$
for $\omega$ small enough.
\\\\
Now, consider $\boldsymbol{\theta}_1, \boldsymbol{\theta}_2 \in \Theta$, and let $\boldsymbol{\eta} = \boldsymbol{\theta}_2 - \boldsymbol{\theta}_1$. By Lemma \ref{lemma_taylor_expansion}, we have
\begin{equation*}
R(\boldsymbol{\theta}_1 + \epsilon \boldsymbol{\eta}) = R( \boldsymbol{\theta}_1 ) + \epsilon D_{\boldsymbol{\eta}} R(\boldsymbol{\theta}_1) + \frac{\epsilon^2}{2} || T_{\boldsymbol{\eta}} ||^{2}_{L^2(Q)}
\end{equation*}
where 
$ D_{\boldsymbol{\eta}}R(\boldsymbol{\theta}_1) $
is the directional derivative of $R(\boldsymbol{\theta}_1)$ in the direction of $\boldsymbol{\eta}$.
\\\\
For any $\boldsymbol{\theta} \in \Theta$, let $\boldsymbol{\eta} = \boldsymbol{\theta} - \boldsymbol{\theta}_0$. Applying the expansion above with $\boldsymbol{\theta}_1 = \boldsymbol{\theta}_0$ and $\epsilon = 1$, we have
$$ R(\boldsymbol{\theta}) - R(\boldsymbol{\theta}_0) = D_{\boldsymbol{\eta}} R(\boldsymbol{\theta}_0) + \frac{1}{2} || T_{\boldsymbol{\eta}} ||^{2}_{L^2(Q)} .$$
Since $\boldsymbol{\theta}_0$ is the minimizer of $R$, we have $D_{\boldsymbol{\eta}} R(\boldsymbol{\theta}_0) = 0$, and thus
\begin{eqnarray}
\label{eq_u}
u(\boldsymbol{\theta}, \boldsymbol{\theta}_0) = R(\boldsymbol{\theta}) - R(\boldsymbol{\theta}_0) = \frac{1}{2} || T_{\boldsymbol{\eta}} ||^{2}_{L^2(Q)} = \frac{1}{2} ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)}^2 .    
\end{eqnarray}
We also have that
\begin{eqnarray*}
    | d_{{\cal W}}^2(T_{\boldsymbol{\theta}_1}\#\mu, \nu) - d_{{\cal W}}^2(T_{\boldsymbol{\theta}_2}\#\mu, \nu) |  &\lesssim& | d_{{\cal W}}(T_{\boldsymbol{\theta}_1}\#\mu, \nu) - d_{{\cal W}}(T_{\boldsymbol{\theta}_2}\#\mu, \nu) |  \\
    & \le & d_{{\cal W}}(T_{\boldsymbol{\theta}_1}\#\mu, T_{\boldsymbol{\theta}_2}\#\mu) \\
    & = & ||T_{\boldsymbol{\theta}_1} - T_{\boldsymbol{\theta}_2} ||_{L^2(\mu)} \\
    & \lesssim & ||T_{\boldsymbol{\theta}_1} - T_{\boldsymbol{\theta}_2} ||_{L^2(Q)} .
\end{eqnarray*}
where the second inequality follows from the triangle inequality, and the equality follows from the fact that ${\cal W}_2(\Omega)$ is flat. Therefore, it follows that
\begin{eqnarray}
\label{ineq_v}
v(\boldsymbol{\theta}, \boldsymbol{\theta}_0) \lesssim ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0} ||_{L^2(Q)}^2 
\lesssim u(\boldsymbol{\theta}, \boldsymbol{\theta}_0) . 
\end{eqnarray} 
Combining \eqref{eq_u} and \eqref{ineq_v}, we obtain
\begin{eqnarray*}
    P e^{-\omega (\ell_{\boldsymbol{\theta}} - \ell_{\boldsymbol{\theta}_0})} &\le& \exp \big( - C_1 \omega u(\boldsymbol{\theta}, \boldsymbol{\theta}_0) \big) \\
    & = & \exp \bigg( - \frac{1}{2} C_1 \omega ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)}^2 \bigg) 
\end{eqnarray*}
for some constant $C_1 > 0$. It follows that $||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)} > \delta$ implies that 
$$  P e^{-\omega (\ell_{\boldsymbol{\theta}} - \ell_{\boldsymbol{\theta}_0})} \le \exp\bigg(- \frac{1}{2} C_1 \omega \delta^2\bigg) ,$$
and Condition \eqref{sub_exp_cond} is verified. 
\\\\
We now verify the prior mass condition \eqref{prior_mass_cond_1}. We note that our prior specification satisfies
$$ \Pi(\{ \boldsymbol{\theta}: || \boldsymbol{\theta} - \boldsymbol{\theta}_0 ||_2 \le \delta \}) \gtrsim \delta^{K+1} ,  $$
where $||\cdot||_2$ is the 2-norm on $\mathbb{R}^{K+1}$. Since $||\boldsymbol{\theta} - \boldsymbol{\theta}_0||_2 \le \delta$ implies $||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)} \lesssim \delta$, it follows that
$$ \Pi(\{ \boldsymbol{\theta}: || T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0} ||_{L^2(Q)} \le \delta \}) \gtrsim \delta^{K+1}. $$
Since $|| T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0} ||_{L^2(Q)} \le \delta$ implies $\{ u(\boldsymbol{\theta}, \boldsymbol{\theta}_0) \lesssim \delta^2, v(\boldsymbol{\theta}, \boldsymbol{\theta}_0) \lesssim \delta^2 \}$, we have
\begin{eqnarray*}
    \Pi(G_n) \gtrsim \Pi(\{ \boldsymbol{\theta}: || T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0} ||_{L^2(Q)} \le \epsilon_n \}) \gtrsim \epsilon_n^{K+1} .
\end{eqnarray*}
Therefore, with $\epsilon_n = n^{-1/2} (\log n)^{1/2}$, we have
$$ \log \Pi(G_n) \gtrsim - \log n \gtrsim -n \epsilon_n^2 .$$
Thus, the prior mass condition is satisfied, and the proof is completed. 
\end{proof}



\subsection{Proof of Theorem 2}
For each $m =1,2,\ldots,$ we define
\begin{eqnarray}
\tilde{R}^m(\boldsymbol{\theta}) := \frac{1}{2} \int_{{\cal W}_2(\Omega) \times {\cal W}_2(\Omega)} d_{{\cal W}}^2(T_{\boldsymbol{\theta}} \# \hat{\mu}^m, \hat{\nu}^m) dP(\mu, \nu) .
\end{eqnarray}
Also define the mean and variance of excess risk as
$$ u_m(\boldsymbol{\theta}, \boldsymbol{\theta}_0) := \frac{1}{2} P( d_{{\cal W}}^2(T_{\boldsymbol{\theta}} \# \hat{\mu}^m, \hat{\nu}^m) - d_{{\cal W}}^2(T_{\boldsymbol{\theta}_0} \# \hat{\mu}^m, \hat{\nu}^m) ) = \tilde{R}^m(\boldsymbol{\theta}) - \tilde{R}^m(\boldsymbol{\theta}_0)  ,$$
and
$$ v_m(\boldsymbol{\theta}, \boldsymbol{\theta}_0) := P \bigg( \bigg( \frac{1}{2} d_{{\cal W}}^2(T_{\boldsymbol{\theta}} \# \hat{\mu}^m, \hat{\nu}^m) - \frac{1}{2}d_{{\cal W}}^2(T_{\boldsymbol{\theta}_0} \# \hat{\mu}^m, \hat{\nu}^m) \bigg)^2 \bigg) -   u_m(\boldsymbol{\theta}, \boldsymbol{\theta}_0)^2.$$
We first prove the following lemma bounding $u_m(\boldsymbol{\theta}, \boldsymbol{\theta}_0)$ and $v_m(\boldsymbol{\theta}, \boldsymbol{\theta}_0)$ in terms of $ ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)}^2$ and $r_m^{-1}$ .
\begin{lemma}
\label{lemma_bound_u_v}
$$ ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)}^2 - r_m^{-1} \lesssim u_m(\boldsymbol{\theta}, \boldsymbol{\theta}_0) \lesssim ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)}^2 + r_m^{-1} ,$$
$$ v_m(\boldsymbol{\theta}, \boldsymbol{\theta}_0) \lesssim ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||^2_{L^2(Q)} + r_m^{-2}.$$
\end{lemma}
\begin{proof}
%By Lemma 6.6 of \cite{Ghodrati2022}, 
%$$ d_{{\cal W}}(\hat{\mu}^m, \mu) \lesssim r_m^{-1} \implies d_{{\cal W}}(T_{\boldsymbol{\theta}}\# \hat{\mu}^m, T_{\boldsymbol{\theta}}\#\mu) \lesssim r_m^{-1}.$$
We first have the following decomposition of $u_m(\boldsymbol{\theta}, \boldsymbol{\theta}_0)$:
\begin{eqnarray}
\label{eq_decomp}
\tilde{R}^m(\boldsymbol{\theta}) - \tilde{R}^m(\boldsymbol{\theta}_0) = \underbrace{ \tilde{R}^m(\boldsymbol{\theta}) - R(\boldsymbol{\theta}) } + \underbrace{ R(\boldsymbol{\theta}) - R(\boldsymbol{\theta}_0) } + \underbrace{ R(\boldsymbol{\theta}_0) - \tilde{R}^m(\boldsymbol{\theta}_0) }.    
\end{eqnarray} 
We bound each of the three terms on the RHS of \eqref{eq_decomp}.
\begin{eqnarray*}
\tilde{R}^m(\boldsymbol{\theta}) - R(\boldsymbol{\theta}) &=& \frac{1}{2} P \bigg( d_{{\cal W}}^2(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) - d_{{\cal W}}^2(T_{\boldsymbol{\theta}}\#\mu, \nu) \bigg) \\
&=& \frac{1}{2} P \bigg( d_{{\cal W}}^2(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) - d_{{\cal W}}^2(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \nu) + d_{{\cal W}}^2(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \nu) - d_{{\cal W}}^2(T_{\boldsymbol{\theta}}\#\mu, \nu)  \bigg) .
\end{eqnarray*}
We have that
\begin{eqnarray*}
&& P \bigg( d_{{\cal W}}^2(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) - d_{{\cal W}}^2(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \nu) \bigg) \\
&=& P \bigg( \big( d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) - d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \nu) \big) \big(  d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) + d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \nu) \big)  \bigg) \\
& \ge & P \big( - d_{{\cal W}}(\hat{\nu}^m, \nu)  \big(  d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) + d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \nu) \big)  \big) \\
& \gtrsim & P ( - d_{{\cal W}}(\hat{\nu}^m, \nu) ) \\
& \gtrsim & - r_m^{-1} ,
\end{eqnarray*}
where the first inequality follows from the reverse triangle inequality, and the last inequality follows from our assumption. We also have that
\begin{eqnarray*}
&& P \bigg( d_{{\cal W}}^2(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) - d_{{\cal W}}^2(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \nu) \bigg) \\
&=& P \bigg( \big( d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) - d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \nu) \big) \big(  d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) + d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \nu) \big)  \bigg)  \\
&\le & P \bigg(  d_{{\cal W}}( \hat{\nu}^m, \nu)  \big(  d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) + d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \nu) \big)  \bigg) \\
& \lesssim & r_m^{-1}
\end{eqnarray*}
by an application of the triangle inequality. 
\\\\
Similarly, we can show that
\begin{eqnarray*}
 - r_m^{-1} \lesssim   P \bigg( d_{{\cal W}}^2(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \nu) - d_{{\cal W}}^2(T_{\boldsymbol{\theta}}\#\mu, \nu) \bigg)  \lesssim  r_m^{-1} .
\end{eqnarray*}
It follows that the first term on the RHS of \eqref{eq_decomp} can be bounded as
$$ -r_m^{-1} \lesssim \tilde{R}^m(\boldsymbol{\theta}) - R(\boldsymbol{\theta}) \lesssim  r_m^{-1} .$$
Using the same calculation, we also have the bound for the third term on the RHS of \eqref{eq_decomp}:
$$ -r_m^{-1} \lesssim R(\boldsymbol{\theta}_0) - \tilde{R}^m(\boldsymbol{\theta}_0) \lesssim  r_m^{-1} .$$
For the second term on the RHS of \eqref{eq_decomp}, we recall from the proof of Theorem 1 that
$$ R(\boldsymbol{\theta}) - R(\boldsymbol{\theta}_0) = \frac{1}{2} ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)}^2 . $$
Thus, we obtain the following bound for $ u_m( \boldsymbol{\theta}, \boldsymbol{\theta}_0)$:
$$ ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||^2_{L^2(Q)} - r_m^{-1} \lesssim u_m( \boldsymbol{\theta}, \boldsymbol{\theta}_0)  \lesssim  ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||^2_{L^2(Q)} + r_m^{-1} .$$
Now we try to obtain the upper bound for $v_m(\boldsymbol{\theta}, \boldsymbol{\theta}_0)$. 
By triangle inequality,
\begin{eqnarray*}
& & |d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) -  d_{{\cal W}}(T_{\boldsymbol{\theta}_0}\#\hat{\mu}^m, \hat{\nu}^m) | \\
   &\le&  |d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) -  d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\mu, \nu) | + |d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\mu, \nu) -  d_{{\cal W}}(T_{\boldsymbol{\theta}_0}\#\mu, \nu) | 
   \\ && + |d_{{\cal W}}(T_{\boldsymbol{\theta}_0}\#\mu, \nu) -  d_{{\cal W}}(T_{\boldsymbol{\theta}_0}\#\hat{\mu}^m, \hat{\nu}^m) |
\end{eqnarray*}
Similar calculations as above lead to 
$$ |d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) -  d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\mu, \nu) | \lesssim r_m^{-1}$$
and
$$ |d_{{\cal W}}(T_{\boldsymbol{\theta}_0}\#\mu, \nu) -  d_{{\cal W}}(T_{\boldsymbol{\theta}_0}\#\hat{\mu}^m, \hat{\nu}^m) | \lesssim r_m^{-1} .$$
We also have that 
\begin{eqnarray*}
   |d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\mu, \nu) -  d_{{\cal W}}(T_{\boldsymbol{\theta}_0}\#\mu, \nu) | &\le&  d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\mu, T_{\boldsymbol{\theta}_0}\#\mu) \\
   &=& ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(\mu)} \\
   & \lesssim & ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)}  .
\end{eqnarray*}
Since
$$ v_m(\boldsymbol{\theta}, \boldsymbol{\theta}_0) \lesssim P( |d_{{\cal W}}(T_{\boldsymbol{\theta}}\#\hat{\mu}^m, \hat{\nu}^m) -  d_{{\cal W}}(T_{\boldsymbol{\theta}_0}\#\hat{\mu}^m, \hat{\nu}^m) |^2 ),
$$
it follows that
$$v_m(\boldsymbol{\theta}, \boldsymbol{\theta}_0) \lesssim  ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)}^2 + r_m^{-2} .$$


\end{proof}
We are now in a position to prove Theorem 2.
\begin{proof}[Proof of Theorem 2]
Let $A_n := \{\boldsymbol{\theta} \in \Theta: ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)} > M \epsilon_n \}$. The Gibbs posterior probability of $A_n$ is given by
\begin{eqnarray*}
    \Pi_n(A_n) &=& \frac{N^m_n(A_n)}{D_n^m} \\
    & = & \frac{\int_{A_n} \exp \big(- \omega n \big( \tilde{R}_n^m(\boldsymbol{\theta}) - \tilde{R}_n^m(\boldsymbol{\theta}_0) \big) \big) \Pi(d \boldsymbol{\theta}) }{\int_{\Theta} \exp\big(- \omega n \big( \tilde{R}_n^m(\boldsymbol{\theta}) - \tilde{R}_n^m(\boldsymbol{\theta}_0) \big) \big) \Pi(d \boldsymbol{\theta}) } .
\end{eqnarray*}
Note that $m(n)$ is assumed to be a deterministic function of $n$. We aim to show that $P^n \Pi_n(A_n) \rightarrow 0$ as $n \rightarrow \infty$.
We define the set
$$ G_n^m := \{ \boldsymbol{\theta} \in \Theta: u_m(\boldsymbol{\theta}, \boldsymbol{\theta}_0) \le \epsilon_n^2, v_m(\boldsymbol{\theta}, \boldsymbol{\theta}_0) \le \epsilon_n^2 \}.$$
By Lemma \ref{lemma_bound_u_v} and the assumption $\epsilon_n^{2} > r_m^{-1}$, the set $G_n^{m}$ contains the set
$$ H_n^{m} := \{ \boldsymbol{\theta} \in \Theta: ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)}^{2} \le c (\epsilon_n^2 - r_m^{-1}) \} ,$$
for some constant $c>0$. We thus have
$$ \Pi(G_n^{m}) \ge \Pi(H_n^{m}) \gtrsim \epsilon_n^{K+1}, $$
from which it follows that
$$\log \Pi(G_n^m) \gtrsim -n\epsilon_n^2 .$$
Since the excess loss 
$$ \ell_{\boldsymbol{\theta}}(\hat{\mu}^m, \hat{\nu}^m) - \ell_{\boldsymbol{\theta}_0}(\hat{\mu}^m, \hat{\nu}^m) $$
is bounded for all $\boldsymbol{\theta}$ and $(\hat{\mu}^m, \hat{\nu}^m)$, when $$||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)}^2 > \epsilon_n^2,$$
we apply Section 3.4.1 of \cite{Syring2023} to obtain
\begin{eqnarray*}
    P e^{- n \omega (\tilde{R}_n^m(\boldsymbol{\theta}) - \tilde{R}_n^m(\boldsymbol{\theta}_0))} &\le& \exp\Big( - n c_0 \omega \Big( ||T_{\boldsymbol{\theta}} - T_{\boldsymbol{\theta}_0}||_{L^2(Q)}^2 - r_{m(n)}^{-1} - r_{m(n)}^{-2} \Big) \Big) \\
& \le & \exp\big( - n c_1 \omega  (\epsilon_n^2 - r_m^{-1} ) \big) \\
& \le & \exp\big( - n c_2 \omega  \epsilon_n^2 \big)
\end{eqnarray*} 
for some constants $c_0, c_1, c_2 > 0$. 
By Fubini's Theorem, we have
$$ P^n N_n^m(A_n) = \int_{A_n} P e^{-\omega n (\tilde{R}_n^m(\boldsymbol{\theta}) - \tilde{R}_n^m(\boldsymbol{\theta}_0))} \Pi(d \boldsymbol{\theta}) \le \exp\Big( - n c_2 \omega M^2 \epsilon_n^2 \Big) . $$
Following essentially the same lines as the proof of Lemma 1 of \cite{Syring2023}, we obtain
$$ P^n \bigg( D_n^m > \frac{1}{2}\Pi(G_n^m) e^{-2\omega n \epsilon_n^2} \bigg) \rightarrow 1,$$
as $n \rightarrow \infty$.
\\\\
Letting $b_n^m = \frac{1}{2} \Pi(G_n^m) e^{-2\omega n \epsilon_n^2}$, we have 
$$ P^n(D_n^m \le b_n^m) \rightarrow 0$$
as $n \rightarrow \infty$. Since
\begin{eqnarray*}
    \Pi_n(A_n) &\le& \frac{N^m_n(A_n)}{D_n^m} 1(D_n^m > b_n^m) + 1(D_n^m \le b_n^m) \\
    & \le & (b_n^m)^{-1} N_n^m(A_n) + 1(D_n^m \le b_n^m) ,
\end{eqnarray*}
it follows that
$$ P^n \Pi_n(A_n) \rightarrow 0$$
as $n \rightarrow \infty$. The proof is completed.

\end{proof}







\section{Additional simulation studies}
In order to evaluate the robustness of our proposed model and posterior sampling strategy, we conduct additional simulation studies that investigate their behavior under two scenarios of mis-specification. The first scenario involves the specified models having fewer basis functions than the true optimal transport maps. The second, more challenging scenario occurs when the true optimal transport maps do not belong to the model class.
\\\\
In the first scenario, we replicate the simulation settings described in the main article, wherein the true optimal transport map is generated using BP basis functions with a polynomial order of $K=50$. However, in the model fitting process, we set the polynomial order to $K=20$. The outcomes of these experiments are illustrated in Figures \ref{plot_sim1_mis}, \ref{plot_sim2_mis}, and \ref{plot_sim3_mis}. Upon examination of the results, we observe that despite the mis-specification in the model fitting, the true optimal transport maps are successfully recovered in all scenarios. This suggests that our proposed model and posterior sampling strategy exhibit robustness to mis-specification, demonstrating their effectiveness in capturing underlying patterns even when the model assumptions are not entirely met.
\\\\
In the second scenario, the true optimal transport maps cannot be expressed as linear combinations of BP basis functions. We conduct two simulations, and the results are shown in Figures \ref{plot_sim4} and \ref{plot_sim5}. In the first case, the true optimal map can still be well approximated by BP basis functions, allowing the true optimal map to be well estimated. In the second case, the true optimal map is a step function with discontinuities and cannot be well approximated using BP basis functions. Consequently, and not surprisingly, the estimated maps do not capture the shape of the true map.


\begin{figure}[!htb]
\subfloat{\includegraphics[width = 3in]{Plots/sim1_plot_sub_5_mis.png}}
\subfloat{\includegraphics[width = 3in]{Plots/sim1_plot_sub_20_mis.png}}\\
\subfloat{\includegraphics[width = 3in]{Plots/sim1_plot_sub_50_mis.png}} 
\subfloat{\includegraphics[width = 3in]{Plots/sim1_plot_full_mis.png}} 
\caption{Simulation 1. Top left: $n=5$ ($\omega=500$). Top right: $n=20$ ($\omega=200$). Bottom left: $n=50$ ($\omega=100$). Bottom right: $n=100$ ($\omega=50$). Black curve: True optimal transport map. Blue dashed curves: estimated posterior mean and posterior credible intervals of optimal transport map.}
\label{plot_sim1_mis}
\end{figure}

\begin{figure}[!htb]
\subfloat{\includegraphics[width = 3in]{Plots/sim2_plot_sub_5_mis.png}}
\subfloat{\includegraphics[width = 3in]{Plots/sim2_plot_sub_20_mis.png}}\\
\subfloat{\includegraphics[width = 3in]{Plots/sim2_plot_sub_50_mis.png}} 
\subfloat{\includegraphics[width = 3in]{Plots/sim2_plot_full_mis.png}} 
\caption{Simulation 2. Top left: $n=5$ ($\omega=500$). Top right: $n=20$ ($\omega=200$). Bottom left: $n=50$ ($\omega=100$). Bottom right: $n=100$ ($\omega=50$). Black curve: True optimal transport map. Blue dashed curves: estimated posterior mean and posterior credible intervals of optimal transport map.}
\label{plot_sim2_mis}
\end{figure}

\begin{figure}[!htb]
\subfloat{\includegraphics[width = 3in]{Plots/sim3_plot_sub_5_mis.png}}
\subfloat{\includegraphics[width = 3in]{Plots/sim3_plot_sub_20_mis.png}}\\
\subfloat{\includegraphics[width = 3in]{Plots/sim3_plot_sub_50_mis.png}} 
\subfloat{\includegraphics[width = 3in]{Plots/sim3_plot_full_mis.png}} 
\caption{Simulation 3. Top left: $n=5$ ($\omega=500$). Top right: $n=20$ ($\omega=200$). Bottom left: $n=50$ ($\omega=100$). Bottom right: $n=100$ ($\omega=50$). Black curve: True optimal transport map. Blue dashed curves: estimated posterior mean and posterior credible intervals of optimal transport map.}
\label{plot_sim3_mis}
\end{figure}


\begin{figure}[!htb]
\subfloat{\includegraphics[width = 3in]{Plots/sim4_plot_sub_5.png}}
\subfloat{\includegraphics[width = 3in]{Plots/sim4_plot_sub_20.png}}\\
\subfloat{\includegraphics[width = 3in]{Plots/sim4_plot_sub_50.png}} 
\subfloat{\includegraphics[width = 3in]{Plots/sim4_plot.png}} 
\caption{Simulation 4. Top left: $n=5$ ($\omega=500$). Top right: $n=20$ ($\omega=200$). Bottom left: $n=50$ ($\omega=100$). Bottom right: $n=100$ ($\omega=50$). Black curve: True optimal transport map. Blue dashed curves: estimated posterior mean and posterior credible intervals of optimal transport map.}
\label{plot_sim4}
\end{figure}

\begin{figure}[!htb]
\subfloat{\includegraphics[width = 3in]{Plots/sim5_plot_sub_5.png}}
\subfloat{\includegraphics[width = 3in]{Plots/sim5_plot_sub_20.png}}\\
\subfloat{\includegraphics[width = 3in]{Plots/sim5_plot_sub_50.png}} 
\subfloat{\includegraphics[width = 3in]{Plots/sim5_plot.png}} 
\caption{Simulation 5. Top left: $n=5$ ($\omega=500$). Top right: $n=20$ ($\omega=200$). Bottom left: $n=50$ ($\omega=100$). Bottom right: $n=100$ ($\omega=50$). Black curve: True optimal transport map. Blue dashed curves: estimated posterior mean and posterior credible intervals of optimal transport map.}
\label{plot_sim5}
\end{figure}

%\bibliography{uai2024-GBDistReg}


\end{document}
