
%\newpage
%\onecolumn

% \subsection{Appendix for \ours: Distribution-Free Decision-Focused Learning}


% \subsubsection{Figure~\ref{fig:ablation}}
% \label{appendix:ablation}
% \begin{figure*}[!ht]
%     %\hspace{-1.4cm}
%     \centering
%     \includegraphics[width=0.95\textwidth]{figure/ablation_v2.pdf}
%     \caption{Ablation study on the wind power bidding problem.}
%     \label{fig:ablation}
% \end{figure*}

% We investigate the effectiveness of each model design of \ours via ablation studies on the wind power bidding. Fig.~\ref{fig:ablation}  shows the results and our findings can be summarized as follows:
% (1) Without the attention-based network architecture, we see a significant performance drop in Fig.~\ref{fig:ablation}(a). This is because, without the attention architecture, the network architecture may not be within the true model class and  thus suffer from high bias error in Proposition~\ref{prop:1}. (2) Our model performance can be improved with more attention points as in Fig.\ref{fig:ablation}(b). We also plot the decision regret and training time of DFL. We find that when the number of attention points is over 200, \ours can outperform DFL in terms of the decision regret while being orders of magnitude faster. (3) Our method outperforms baselines constantly with different ratios of training data as shown in Fig.~\ref{fig:ablation}(c). The superior performance is because we use attention-based network architecture to mimic the distribution-based parameterization. Compared with the two-stage model, we are decision-aware; compared with DFL methods, we do not suffer from the three bottlenecks.

% \subsection{Limitations}
% \label{s:limitations}
% In our work, we focus on the probabilistic setting where the predictive distribution of the forecasting task has high uncertainty. In this setting, both model mismatch error and sample average approximation error are significant. However, if the forecasting task is relatively straightforward, a simple Gaussian distribution might suffice. For certain objective functions, the expectation under a Gaussian distribution has a closed-form expression. In such cases, existing model-based DFL methods may still be a better choice.

% \subsection{Broader Impact}
% \label{s:impact}

% Decision-focused learning (DFL) has a wide range of real-world applications, from public health and operations research to personalized medicine. We propose a new DFL method designed for settings where the forecasting task is challenging and uncertainty is high. We anticipate that our method will positively impact the community by enabling better decision-making in highly uncertain environments. However, it is crucial to ensure that this method is not misused in harmful contexts, such as optimizing resource allocation for detrimental purposes.







% \begin{figure}[t]
%     %\hspace{-1.4cm}
%     \centering
%     \includegraphics[width=0.5\textwidth]{figure/ab2.pdf}
%     \caption{Data}
%     \label{fig:ab2}
% \end{figure}


% \begin{figure}
%     \centering
%     \includegraphics[width=0.4\textwidth]{figure/df2.drawio.pdf}
%     \caption{Relaxed constrained sampling. We can uniformly sample from the encompassing outer box of the original constrained space. }
%     \label{fig:enter-label}
% \end{figure}


\startcontents[sections]
\printcontents[sections]{l}{1}{\setcounter{tocdepth}{2}}

% \section{Limitations}
%  In our work, we focus on the probabilistic setting where the predictive distribution of the forecasting task has high uncertainty. In this setting, both model mismatch error and sample average approximation error are significant. However, if the forecasting task is relatively straightforward, a simple Gaussian distribution might suffice. For certain objective functions, the expectation under a Gaussian distribution has a closed-form expression. In such cases, existing model-based DFL methods may still be a better choice.

 
% When the number of attention points is large, scalability may become an issue at inference time. This challenge can potentially be alleviated by employing fast attention mechanisms, such as sparse attention (e.g., Longformer; \citep{beltagy2020longformer}) or low-rank approximations (e.g., Linformer; \citep{wang2020linformer}).

\section{Training Algorithm}


% Input: Objective function $f$, feasible set $\mathcal{C}$, training dataset  
% Output: Learned encoder, value embeddings, and key embeddings  

% 1. Initialize encoder, value, and key embeddings 
% For i in 1 to T:
% 2. Sample $(x, y)$ from the dataset  
% 3. Sample $a$ from the feasible set $\mathcal{C}$  
% 4. Compute $g(x, a)$ as defined in Eq. 4  
% 5. Compute the MSE loss between $f(x,y)$ and $g(x,a)$ according to Eq. 2  
% 6. Update encoder, value, and key embeddings using the computed loss  
% End For

The full training procedure of \ours is given in Algorithm~\ref{alg:training}.

\begin{algorithm}[H]
\caption{Training Procedure of \ours}
\label{alg:training}
\begin{algorithmic}[1]
\REQUIRE Objective function $f$, feasible set $\mathcal{C}$, training dataset $\mathcal{D}$
\ENSURE Learned encoder, value embeddings, and key embeddings

\STATE Initialize encoder, value embeddings, and key embeddings
\FOR{$t = 1$ to $T$}
    \STATE Sample a mini-batch $B = \{(\mathbf{x}_i, \mathbf{y}_i)\}_{i=1}^{|B|}$ from $\mathcal{D}$
    \FOR{each $(\mathbf{x}_i, \mathbf{y}_i)$ in $B$}
        \STATE Sample actions $\{\mathbf{a}_i^j\}_{j=1}^J$ from the feasible set $\mathcal{C}$
        \STATE Compute $g(\mathbf{x}_i, \mathbf{a}_i^j)$ for all $j$, as defined in Eq.~\ref{eq:attention}.
        \STATE Compute the MSE loss for the $i$-th sample:
        \[
        L_i = \frac{1}{J} \sum_{j=1}^J \left( f(\mathbf{y}_i, \mathbf{a}_i^j) - g(\mathbf{x}_i, \mathbf{a}_i^j) \right)^2
        \]
    \ENDFOR
    \STATE Update encoder, value embeddings, and key embeddings using the aggregated loss $\sum_i L_i$
\ENDFOR
\end{algorithmic}
\end{algorithm}


% \section{Figure~\ref{fig:landscape}}
% \label{sec:landscape}

% Fig.~\ref{fig:landscape} (in Appendix~\ref{sec:landscape})  visualizes the learned expected function and the ground truth expectation on a test sample for both objectives. We found that the \ours can effectively recover the landscape of the ground truth expected cost.

% \begin{figure*}[!ht]
%     %\hspace{-1.4cm}
%     \centering
%     \includegraphics[width=0.95\textwidth]{figure/landscape_new.pdf}
%     \caption{Randomly initialized landscape, \ours recovered landscape and the ground-truth landscape on the synthetic data. The landscape is conditioned on an input feature sampled from the test set.}
%     \label{fig:landscape}
%     %\vspace{-0.5em}
% \end{figure*}



% \section{Full Ablation Study}
% \label{sec:ablation}
% \begin{figure*}[!ht]
%     %\hspace{-1.4cm}
%     \centering
%     \includegraphics[width=0.9\textwidth]{figure/ablation_v2-KDD.pdf}
%     \caption{Ablation study on the impact of attention-based architecture, number of attention points, and training data size on the wind power bidding problem.}
%     \label{fig:ablation}
% \end{figure*}

% % \subsubsection{Experimental Setup}
% % We provide the details of the experimental setup and the optimization objectives in Supplementary material~\ref{s:experiment} due to the space limit.
% \begin{figure}[h]
%     %\hspace{-1.4cm}
%     \centering
%     \includegraphics[width=0.5\textwidth]{figure/ab23_v2.pdf}
%     \caption{\ours vs. DFL with different numbers of samples: the left figure shows decision regret, while the right figure displays training time.}
%     \label{fig:ab1}
% \end{figure}

% \begin{figure}[h]
%     %\hspace{-1.4cm}
%     \centering
%     \includegraphics[width=0.45\textwidth]{figure/ab23_v3-KDD.pdf}
%     \caption{Impact of learnable value embeddings and number of action samples.}
%     \label{fig:ab23}
% \end{figure}



% In this subsection, we investigate the effectiveness of each model design of \ours via ablation studies on the wind power bidding. %We aim to answer the following question: Q1: Does the attention-based network architecture plays a key role in the outstanding performance of \ours? Q2: How does the number of attention points affect the performance? Q3: How much data does \ours need?  
% % Fig.~\ref{fig:ablation} shows the results and our findings can be summarized as follows:

% \emph{Impact of attention-based architecture.} Without the attention-based network architecture, we see a significant performance drop in Fig.~\ref{fig:ablation}(a). This is because, without the attention architecture, the network architecture may not be within the true model class and  thus suffer from high bias error in Proposition~\ref{prop:1}. 

% \emph{Impact of number of attention points.} Our model performance can be improved with more attention points as in Fig.\ref{fig:ablation}(b). We also plot the decision regret and training time of DFL. We find that when the number of attention points is over 200, \ours can outperform DFL in terms of the decision regret while being orders of magnitude faster. 

% \emph{Impact of training data size.} Our method outperforms baselines constantly with different ratios of training data as shown in Fig.~\ref{fig:ablation}(c). The superior performance is because we use attention-based network architecture to mimic the distribution-based parameterization. Compared with the two-stage model, we are decision-aware; compared with DFL methods, we mitigate the three bottlenecks. 




% \emph{\ours vs DFL with different number of samples.} The number of samples used to estimate the expected objective in DFL is an important hyperparameter. To investigate its impact, we compare the decision regret and training time of \ours with DFL using different numbers of samples. We use GMM with 1000 components in the DFL forecaster as it achieves the best performance shown in Section 5.2.
% As shown in Fig.~\ref{fig:ab1}, when the number of samples for DFL exceeds 100, the performance improvement becomes very marginal (64.52 with 100 samples vs. 64.07 with 200 samples). However, the training time increases significantly (1878 seconds/epoch with 100 samples vs. 5251 seconds/epoch with 200 samples). In contrast, \ours achieves significantly better decision regret (58.41) while being orders of magnitude faster (2.17 seconds/epoch). 


% \emph{Impact of learnable value embeddings.} In \ours, the value embeddings are initialized with randomly sampled labels from the training set and then updated during the training process. An alternative is to directly use these randomly selected labels and keep the value embeddings fixed during the training process.
% We examine whether making the value embeddings learnable improves the performance. The results are shown in Fig.~\ref{fig:ab23}(a). As we can see, with learnable value embeddings, the decision regret of \ours decreases significantly compared with the fixed value embeddings.




%  \emph{Impact of number of action samples.} In \ours, we need to sample actions for each $(\mathbf{x},\mathbf{y})$ pair at each training iteration to fit the function.  In this study, we investigate the influence of the number of action samples on the performance. As shown in Fig.\ref{fig:ab23}(b), the decision regret remains stable even for a sample size of 5. Notably, as the number of action samples increases, the variance of the decision regret across different random seeds decreases, indicating improved stability in the results.


\section{Constrained Sampling}
\label{s:sampling}


In practice, it is unnecessary to fit the true objective across the entire Euclidean space. Instead, we only need to sample from the constrained space $C$. There are several strategies for this. First, we can employ Markov chain Monte Carlo (MCMC) methods to uniformly sample within $C$, such as Ball Walk~\cite{lovasz1990mixing} and the hit-and-run algorithm~\cite{belisle1993hit, lovasz1999hit}. Alternatively, we can sample from a relaxed constrained space, such as the encompassing outer box of the original constrained space. This allows us to sample each dimension of $\mathbf{a}$ independently from a uniform distribution. Figure~\ref{fig:sampling} gives an illustration.
\begin{figure}
  \centering
  \begin{center}
    \includegraphics[width=0.25\textwidth]{figure/sampling.pdf}
  \end{center}
  \caption{Relaxed constrained sampling. We can sample from the encompassing outer box of the original constrained space. }
  \label{fig:sampling}
\end{figure}

Consider the following convex constraints:
\begin{align}
\mathbf{A}\mathbf{a} = \mathbf{b}, \quad \mathbf{G}\mathbf{a} \preceq \mathbf{h}.
\end{align}

In particular, instead of directly sampling the full-dimensional decision vector $\mathbf{a}$, we first sample a subset of the variables $\{a_1, \cdots, a_d\}$, and then deduce the remaining variables by solving the equality constraints.

To sample from the relaxed constraint space, we first determine the maximum and minimum values of each variable in $\{a_1, \cdots, a_d\}$ subject to the given inequality constraints. These bounds can be easily obtained with the Python SciPy package. We then sample each variable in $\{a_1, \cdots, a_d\}$ uniformly between its extremal values. Essentially, we relax the polyhedron into a box, which simplifies the uniform sampling process.

Furthermore, many predict-then-optimize problems are resource allocation problems in which the decision variable $\mathbf{a}$ lies on a simplex; in such cases, we can directly sample from the Dirichlet distribution.



% \subsubsection{Visualization on the Synthetic Dataset}
% \label{sec:vis}
% Fig.~\ref{fig:landscape} visualizes the learned expected function and the ground truth expectation on a test sample for both objectives. We found that the \ours can effectively recover the landscape of the ground truth expected cost function.

% \begin{figure*}[t]
%     %\hspace{-1.4cm}
%     \centering
%     \includegraphics[width=0.95\textwidth]{figure/landscape_new.pdf}
%     \caption{Randomly initialized landscape, \ours recovered landscape and the ground-truth landscape on the synthetic data. The landscape is conditioned on an input feature sampled from the test set.}
%     \label{fig:landscape}
% \end{figure*}



\section{Additional Background on Conditional Mean Embedding}
\label{s:Background}

\begin{table}[t]
\centering
\begin{tabular}{c c c}
\toprule[1.5pt]
Variable & $\mathbf{x}$ & $\mathbf{y}$ \\
Domain & $\mathcal{X}$ & $\mathcal{Y}$ \\
Kernel & $\mathcal{R}_{\mathbf{x}}(\mathbf{x},\mathbf{x}')$ & $\mathcal{R}_{\mathbf{y}}(\mathbf{y},\mathbf{y}')$ \\
Feature map & $\phi(\mathbf{x})$/$\mathcal{R}_{\mathbf{x}}(\mathbf{x},\cdot)$ & $\varphi(\mathbf{y})$/$\mathcal{R}_{\mathbf{y}}(\mathbf{y},\cdot)$ \\
Feature matrix & $\Upsilon=(\phi(\mathbf{x}_1), \cdots, \phi(\mathbf{x}_S))$ & $\Phi=(\varphi(\mathbf{y}_1), \cdots, \varphi(\mathbf{y}_S))$ \\
Kernel matrix & $\mathbf{K}=\Upsilon^{\top}\Upsilon$ & $\mathbf{L}=\Phi^{\top}\Phi$ \\
RKHS & $\mathcal{G}$ & $\mathcal{F}$ \\
\bottomrule[1.5pt]
\end{tabular}
\caption{Table of Notations}
\label{table:notation:appendix}
\end{table}


We provide more details about how to compute the conditional mean embedding (CME) in this section. Table~\ref{table:notation:appendix} presents the notation related to CME.






% A reproducing kernel Hilbert space (RKHS) $\mathcal{F}$ on $\Sigma$ with a kernel is a Hilbert space of functions 

% $\Upsilon=(\mathcal{R}_x)$
% $\Phi=(\varphi)$
% $(\mathcal{R}_y(\mathbf{y}_1),\mathcal{R}_y)$ is the feature matrices


Let $\mathcal{F}$ be a reproducing kernel Hilbert space (RKHS) over the domain of $\mathbf{y}$ with kernel function $\mathcal{R}_{\mathbf{y}}(\mathbf{y},\mathbf{y}')$ and inner product $\langle \cdot, \cdot \rangle_{\mathcal{F}}$. Its inner product $\langle \cdot, \cdot\rangle_{\mathcal{F}}$ satisfies the reproducing property: $$\langle f(\cdot), \mathcal{R}_{\mathbf{y}}(\mathbf{y}, \cdot) \rangle_{\mathcal{F}}=f(\mathbf{y}),$$
meaning that we can view the evaluation of a function $f\in \mathcal{F}$ at any point $\mathbf{y}$ as an inner product and the linear evaluation operator is given by $\mathcal{R}_{\mathbf{y}}(\mathbf{y},\cdot)$, \ie~the kernel function. Alternatively, $\mathcal{R}_{\mathbf{y}}(\mathbf{y}, \cdot)$ can also be viewed as a feature map $\varphi(\mathbf{y})$ where $\mathcal{R}_{\mathbf{y}}(\mathbf{y},\mathbf{y}')=\langle \varphi(\mathbf{y}), \varphi(\mathbf{y}')\rangle_{\mathcal{F}}$.
Similarly, we can define the RKHS $\mathcal{G}$ over the domain of $\mathbf{x}$ with kernel function $\mathcal{R}_{\mathbf{x}}(\mathbf{x},\mathbf{x}')$.

For a particular $\mathbf{a}$, we denote the corresponding function with respect to $\mathbf{y}$ as $f_\mathbf{a}(\mathbf{y})$. CME projects the conditional distribution to its expected feature map $\mu_{\mathbf{y}|\mathbf{x}}\triangleq \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[\mathcal{R}_{\mathbf{y}}(\mathbf{y}, \cdot)]$ and evaluates 
the conditional expectation of any RKHS function, $f_\mathbf{a} \in \mathcal{F}$, as an inner product in $\mathcal{F}$ using the reproducing property:
\begin{align}
\mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f_{\mathbf{a}}] &=\int p(\mathbf{y}|\mathbf{x})\langle \mathcal{R}_{\mathbf{y}}(\mathbf{y},\cdot), f_{\mathbf{a}} \rangle_{\mathcal{F}}\mathrm{d}\mathbf{y} \nonumber\\&= \left\langle \int p(\mathbf{y}|\mathbf{x})\mathcal{R}_{\mathbf{y}}(\mathbf{y}, \cdot)\mathrm{d}\mathbf{y}, f_{\mathbf{a}} \right\rangle_{\mathcal{F}} \nonumber\\&=\langle \mu_{\mathbf{y}|\mathbf{x}}, f_{\mathbf{a}}\rangle_{\mathcal{F}}.
\end{align}


Assuming that for all $f_{\mathbf{a}}\in \mathcal{F}$ the conditional expectation $\mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f_{\mathbf{a}}(\mathbf{y})]$ is an element of $\mathcal{G}$, the conditional embedding can be estimated from a finite dataset $\{\mathbf{x}_s, \mathbf{y}_s\}_{s=1}^S$ as~\cite{song2013kernel, song2009hilbert}:
\begin{align}
    \hat{\mu}_{\mathbf{y}|\mathbf{x}} =  \Phi(\mathbf{K}+\lambda \mathbf{I})^{-1}\Upsilon^\top\phi(\mathbf{x}) =  \sum_{s=1}^S \beta_s(\mathbf{x}) \mathcal{R}_{\mathbf{y}}(\mathbf{y}_s,\cdot),
    \label{eq:cme_kernel}
\end{align}
where $\Phi=(\mathcal{R}_{\mathbf{y}}(\mathbf{y}_1,\cdot), \cdots, \mathcal{R}_{\mathbf{y}}(\mathbf{y}_S,\cdot))$ is the feature matrix; $\mathbf{K}=\Upsilon^\top \Upsilon$ is the Gram matrix for samples from variable $\mathbf{x}$ with $\Upsilon=(\mathcal{R}_{\mathbf{x}}(\mathbf{x}_1,\cdot), \cdots, \mathcal{R}_{\mathbf{x}}(\mathbf{x}_S,\cdot))$; $\beta_s(\mathbf{x})$ is the $s$-th entry of the weight vector $(\mathbf{K}+\lambda \mathbf{I})^{-1}\Upsilon^\top\phi(\mathbf{x})$; and
$\lambda$ is a regularization parameter that prevents overfitting. Although the assumption $\mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f_{\mathbf{a}}(\mathbf{y})]\in \mathcal{G}$ is not necessarily true for continuous domains, existing works treat the expression as an approximation~\cite{song2009hilbert}, and it works well in practice.
 

% Assume that for all $f_{\mathbf{a}}\in \mathcal{F}$, the conditional expectation $\mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f_{\mathbf{a}}(\mathbf{y})]$ is an element of the RKHS over the domain of $\mathbf{x}$, the conditional embedding can be estimated with a finite dataset $\{\mathbf{x}_s, \mathbf{y}_s\}_{s=1}^S$ as
% $\hat{\mu}_{\mathbf{y}|\mathbf{x}}=\sum_{s=1}^S\beta_s(\mathbf{x})\mathcal{R}(\mathbf{y}_s,\cdot)$,
%  where $\beta_s$ is a real-valued weight and can be computed with matrix calculation (see more details about this computation in the Appendix). Though the assumption $\mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f_{\mathbf{a}}(\mathbf{y})]\in \mathcal{G}$ is not necessarily true for continuous domains, existing works treat the expression as an approximation \cite{song2009hilbert} and works well in practice.
 
 One advantage of CME is that $\hat{\mu}_{\mathbf{y}|\mathbf{x}}$ converges to $\mu_{\mathbf{y}|\mathbf{x}}$ in the RKHS norm at an overall rate of $\mathcal{O}(S^{-\frac{1}{2}})$~\cite{song2009hilbert}, which is independent of the input dimension. This property lets CME work well in high-dimensional spaces.

As we can see from Eq.~\ref{eq:cme_kernel}, the empirical estimator of CME, ${\hat \mu}_{\mathbf{y}|\mathbf{x}}$,  applies non-uniform
weights, $\beta_s$, on observations which are, in turn, determined by the conditioning variable $\mathbf{x}$.


\section{Proof of Proposition 1}
\label{s:Prop1}

\newtheorem{prop}{Proposition}

\begin{prop}\label{prop:1}
% The optimal solution $g^*$ is the conditional mean $\mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})]$ 
The expected MSE of the optimal solution $g^*$ on the test set is: 
\begin{align}
 \text{MSE}_{\rm test}  =  \underbrace{\mathbb{E}_{\mathcal{D}'}\left [ \left(\mathbb{E}_{\mathcal{D}'}[g^*_{\mathcal{D}'}(\mathbf{x},\mathbf{a})]- \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})]   \right)^2 \right]}_{\text{Bias}}\nonumber\\ 
 + \underbrace{\mathbb{E}_{\mathcal{D}'}\left[ \left(g^*_{\mathcal{D}'}(\mathbf{x,\mathbf{a}})-  \mathbb{E}_{\mathcal{D}'}[g^*_{\mathcal{D}'}(\mathbf{x,\mathbf{a}})]  \right)^2 \right]}_{\text{Variance}}, \nonumber
\end{align} 
where $\mathcal{D}'$ denotes the training dataset  augmented with the sampled actions $\mathbf{a}$. 
\end{prop}


\begin{proof}
The training set consists of the given $\mathcal{D}=\{\mathbf{x}_i,\mathbf{y}_i\}_{i=1}^N$ augmented with the sampled actions $\mathbf{a}$. We denote the augmented dataset as $\mathcal{D}'$. We assume the fitted function $g^*(\mathbf{x},\mathbf{a})$ belongs to a fixed hypothesis class. Let $g^*_{\mathcal{D}'}(\mathbf{x},\mathbf{a})$ denote the function fitted on the dataset $\mathcal{D}'$.
The expectation of the mean squared error (MSE) for a given unseen test sample, over all possible learning sets, is:
% \begin{align}
% \mathbb{E}_{\mathcal{D}}[(\mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})]-g^*_{\mathcal{D}}(\mathbf{x,\mathbf{a}}))^2]   
% \end{align}
\begin{align}
& \hspace{1.1em} \mathbb{E}_{\mathcal{D}'}[(\mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})]-g^*_{\mathcal{D}'}(\mathbf{x},\mathbf{a}))^2]   \nonumber \\
& = \mathbb{E}_{\mathcal{D}'}[(\underbrace{\mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})] - \mathbb{E}_{\mathcal{D}'}[g^*_{\mathcal{D}'}(\mathbf{x,\mathbf{a}})]}_{a} \nonumber \\ &+ \underbrace{\mathbb{E}_{\mathcal{D}'}[g^*_{\mathcal{D}'}(\mathbf{x,\mathbf{a}})]-g^*_{\mathcal{D}'}(\mathbf{x,\mathbf{a}})}_{b} )^2 ] 
\nonumber \\ &= \mathbb{E}_{\mathcal{D}'}[(a+b)^2] \nonumber \\
&= \mathbb{E}_{\mathcal{D}'}[a^2]+\mathbb{E}_{\mathcal{D}'}[b^2] + \mathbb{E}_{\mathcal{D}'}[2ab] \nonumber
\end{align}

The first two terms represent the bias and variance errors respectively:
\[\mathbb{E}_{\mathcal{D}'}[a^2] = \left(\mathbb{E}_{\mathcal{D}'}[g^*_{\mathcal{D}'}(\mathbf{x},\mathbf{a})]- \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})]   \right)^2 =\text{Bias}^2(g^*),\]
\[\mathbb{E}_{\mathcal{D}'}[b^2] = \mathbb{E}_{\mathcal{D}'}\left[ \left(g^*_{\mathcal{D}'}(\mathbf{x},\mathbf{a})-  \mathbb{E}_{\mathcal{D}'}[g^*_{\mathcal{D}'}(\mathbf{x},\mathbf{a})]  \right)^2 \right] =\text{Variance}(g^*).\]



% \begin{align}
%     &\mathbb{E}_{\mathcal{D}}\left[2\left(g^*_{\mathcal{D}}(\mathbf{x,\mathbf{a}})- \mathbb{E}_{\mathcal{D}}[g^*_{\mathcal{D}}(\mathbf{x,\mathbf{a}})]\right)\left( \mathbb{E}_{\mathcal{D}}[g^*_{\mathcal{D}}(\mathbf{x,\mathbf{a}})] - \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})]  \right)\right] \nonumber \\
%     & = 2\mathbb{E}_{\mathcal{D}}\left[ g^*_{\mathcal{D}}(\mathbf{x,\mathbf{a}}) \mathbb{E}_{\mathcal{D}}[g^*_{\mathcal{D}}(\mathbf{x,\mathbf{a}})] - g^*_{\mathcal{D}}(\mathbf{x,\mathbf{a}}) \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})] - (\mathbb{E}_{\mathcal{D}}[g^*_{\mathcal{D}}(\mathbf{x,\mathbf{a}})])^2 + \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})](\mathbb{E}_{\mathcal{D}}[g^*_{\mathcal{D}}(\mathbf{x,\mathbf{a}})]   \right] \nonumber\\
%     & = 2 (\mathbb{E}_{\mathcal{D}}[g^*_{\mathcal{D}}(\mathbf{x,\mathbf{a}})])^2 - \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})]\mathbb{E}_{\mathcal{D}}[g^*_{\mathcal{D}}(\mathbf{x,\mathbf{a}})] -(\mathbb{E}_{\mathcal{D}}[g^*_{\mathcal{D}}(\mathbf{x,\mathbf{a}})])^2  + \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})] \mathbb{E}_{\mathcal{D}}[g^*_{\mathcal{D}}(\mathbf{x,\mathbf{a}})] \nonumber \\
%     & = 0
% \end{align}

Next, we prove the cross-term $\mathbb{E}_{\mathcal{D}'}[2ab]=0$. To simplify the notation, let ${ \overline g}$ denote $\mathbb{E}_{\mathcal{D}'}[g^*_{\mathcal{D}'}(\mathbf{x,\mathbf{a}})]$; $g$  denote $g^*_{\mathcal{D}'}(\mathbf{x,\mathbf{a}})$; ${\tilde f}$ denote $\mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})]$. Then we can obtain:
\begin{align}
&\mathbb{E}_{\mathcal{D}'}\left[2\left(g - {\overline g}\right)\left( {\overline g} - {\tilde f} \right)\right] \nonumber\\
&= 2\cdot\mathbb{E}_{\mathcal{D}'}[g \cdot {\overline g} - g \cdot {\tilde f} - {\overline g} \cdot {\overline g} + {\overline g} \cdot {\tilde f}] \nonumber \\
& = 2\cdot\mathbb{E}_{\mathcal{D}'}[g]\cdot {\overline g}- 2\cdot\mathbb{E}_{\mathcal{D}'}[g]\cdot{\tilde f} -2\cdot\mathbb{E}_{\mathcal{D}'}[{\overline g}^2] + 2\cdot{\tilde f}\cdot\mathbb{E}_{\mathcal{D}'}[{\overline g}] \nonumber \\
& = 2 \cdot {\overline g}^2 - 2\cdot {\overline g}\cdot{\tilde f} - 2\cdot{\overline g}^2 + 2\cdot{\tilde f}\cdot{\overline g} \nonumber \\
& = 0 \nonumber
\end{align}

Hence, the expectation of the MSE for a given test sample $\mathbf{x}$ is expressed as:
\begin{align}
 \text{MSE}_{\rm test} & = \mathbb{E}_{\mathcal{D}'}[(\mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})]-g^*_{\mathcal{D}'}(\mathbf{x,\mathbf{a}}))^2] \nonumber \\
  & =  \underbrace{\mathbb{E}_{\mathcal{D}'}\left [ \left(\mathbb{E}_{\mathcal{D}'}[g^*_{\mathcal{D}'}(\mathbf{x},\mathbf{a})]- \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})]   \right)^2 \right]}_{\text{Bias}} \nonumber\\ &+ \underbrace{\mathbb{E}_{\mathcal{D}'}\left[ \left(g^*_{\mathcal{D}'}(\mathbf{x,\mathbf{a}})-  \mathbb{E}_{\mathcal{D}'}[g^*_{\mathcal{D}'}(\mathbf{x,\mathbf{a}})]  \right)^2 \right]}_{\text{Variance}} 
  \label{eq:mse}
\end{align}

Since the training dataset consists of $\mathbf{x}, \mathbf{y}, \mathbf{a}$, and each $\mathbf{y}$ corresponds to a  specific $\mathbf{x}$ from $\mathcal{D}$, we can replace the expectation $\mathbb{E}_{\mathcal{D}'}[\cdot]$ in Eq.~\ref{eq:mse} with $\mathbb{E}_{\mathbf{x},\mathbf{a}}[\cdot]$ and recover Proposition~\ref{prop:1}.

\end{proof}


\section{Proof of Proposition 2}
\label{s:Prop2}


\begin{prop}\label{prop:my_proposition}
For any $\mathbf{x}$ and $\mathbf{a}$, the function $g(\mathbf{x},\mathbf{a})$ defined by the softmax attention in Eq.~\ref{eq:attention} satisfies $\mathbb{E}_{\hat{p}_{\mathcal{R}}(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a})]=g(\mathbf{x},\mathbf{a})$. Here,
$\hat{p}_{\mathcal{R}}(\mathbf{y}|\mathbf{x})$ is a  parameterization restriction of $p(\mathbf{y}|\mathbf{x})$. 
\end{prop} 


\begin{proof}
In order to ensure that $\hat{p}_{\mathcal{R}}(\mathbf{y}|\mathbf{x})$ is a valid parameterization of $p(\mathbf{y}|\mathbf{x})$, we define it as a  conditional kernel density estimator (KDE) as follows,
\begin{align}
    {\hat p}_{\mathcal{R}}(\mathbf{y}|\mathbf{x}) = \frac{ \sum_{s=1}^S\mathcal{R}_{\mathbf{x}}(\mathbf{k}_s, \mathbf{q}(\mathbf{x}))\mathcal{R}_{\mathbf{y}}(\mathbf{y}_s, \mathbf{y})}{\sum_{s=1}^S \mathcal{R}_{\mathbf{x}} (\mathbf{k}_s, \mathbf{q(\mathbf{x})})},
\end{align}

Then, we  can obtain 
\begin{align}
    \mathbb{E}_{{\hat p}_{\mathcal{R}}(\mathbf{y}|\mathbf{x})}[f(\mathbf{y}, \mathbf{a})] &= \int \frac{ \sum_{s=1}^S\mathcal{R}_{\mathbf{x}}(\mathbf{k}_s, \mathbf{q}(\mathbf{x}))\mathcal{R}_{\mathbf{y}}(\mathbf{y}_s, \mathbf{y})}{\sum_{s=1}^S \mathcal{R}_{\mathbf{x}} (\mathbf{k}_s, \mathbf{q(\mathbf{x})})} f(\mathbf{y},\mathbf{a}) \mathrm{d} \mathbf{y} \nonumber \\
    & = \frac{ \sum_{s=1}^S\mathcal{R}_{\mathbf{x}}(\mathbf{k}_s, \mathbf{q}(\mathbf{x}))\int \mathcal{R}_{\mathbf{y}}(\mathbf{y}_s, \mathbf{y}) f(\mathbf{y},\mathbf{a}) \mathrm{d} \mathbf{y}}{\sum_{s=1}^S \mathcal{R}_{\mathbf{x}} (\mathbf{k}_s, \mathbf{q}(\mathbf{x}))} \nonumber \\
    & = \frac{ \sum_{s=1}^S\mathcal{R}_{\mathbf{x}}(\mathbf{k}_s, \mathbf{q}(\mathbf{x}))\int  \mathcal{R}_{\mathbf{z}}(f(\mathbf{y}_s, \mathbf{a}), \mathbf{z})\mathbf{z}\mathrm{d}\mathbf{z}}{\sum_{s=1}^S \mathcal{R}_{\mathbf{x}} (\mathbf{k}_s, \mathbf{q}(\mathbf{x}))} \nonumber \\
   & = \frac{ \sum_{s=1}^S\mathcal{R}_{\mathbf{x}}(\mathbf{k}_s, \mathbf{q}(\mathbf{x}))f(\mathbf{y}_s, \mathbf{a})}{\sum_{s=1}^S \mathcal{R}_{\mathbf{x}} (\mathbf{k}_s, \mathbf{q}(\mathbf{x}))}
\end{align}

The second-to-last equality follows from the change of variable $\mathbf{z}=f(\mathbf{y},\mathbf{a})$, with $\mathbf{z}_s=f(\mathbf{y}_s,\mathbf{a})$. The last equality follows from the assumption that $\mathcal{R}_{\mathbf{z}}(\mathbf{z}_s, \mathbf{z})$ is symmetric about $\mathbf{z}_s$, so that its mean is $\mathbf{z}_s=f(\mathbf{y}_s,\mathbf{a})$.


When $\mathcal{R}_{\mathbf{x}}(\mathbf{k}, \mathbf{q})$ is an exponential kernel, \ie~$\mathcal{R}_{\mathbf{x}}(\mathbf{k}, \mathbf{q})=\exp\left(\frac{\mathbf{q}^\top\mathbf{k}}{\sqrt{d}}\right)$, we can obtain
\begin{align}
\mathbb{E}_{{\hat p}_{\mathcal{R}}(\mathbf{y}|\mathbf{x})}[f(\mathbf{y}, \mathbf{a})] &= \frac{ \sum_{s=1}^S\mathcal{R}_{\mathbf{x}}(\mathbf{k}_s, \mathbf{q}(\mathbf{x}))f(\mathbf{y}_s, \mathbf{a})}{\sum_{s=1}^S \mathcal{R}_{\mathbf{x}} (\mathbf{k}_s, \mathbf{q}(\mathbf{x}))} \nonumber\\
& = \frac{ \sum_{s=1}^S \exp\left(\frac{\mathbf{q}(\mathbf{x})^\top\mathbf{k}_s}{\sqrt{d}}\right) f(\mathbf{y}_s, \mathbf{a})}{\sum_{s=1}^S \exp\left(\frac{\mathbf{q}(\mathbf{x})^\top\mathbf{k}_s}{\sqrt{d}}\right)} \nonumber \\
&= \text{Softmax}\left(\left[\frac{\mathbf{q}(\mathbf{x})^\top\mathbf{k}_1}{\sqrt{d}}, \cdots, \frac{\mathbf{q}(\mathbf{x})^\top\mathbf{k}_S}{\sqrt{d}}\right]\right)^\top \nonumber \\ &[f(\mathbf{v}_1, \mathbf{a}), \cdots, f(\mathbf{v}_S, \mathbf{a})] \nonumber \\
& = g(\mathbf{x},\mathbf{a}).
\end{align}
The second-to-last equality follows from the definition of the softmax function and from replacing the notation $\mathbf{y}_s$ with $\mathbf{v}_s$, which is commonly used in the existing literature.



\end{proof}
% \begin{definition}{$(\beta;\mu,\nu,\delta)\mathbf{-valid\ density\ kernel)}$}
% We say a kernel function $\mathcal{R}(\cdot)$ is a $(\beta;\mu,\nu)$-valid density kernel, if $\mathcal{R}(\mathbf{y},\mathbf{y})=\mathcal{R}(\mathbf{y}-\mathbf{y})$ is a bounded, compactly supported kernel such that
% \begin{align*}
%     (i) & \int \mathcal{R}(z)dz=1\\
%     (ii)& \int |\mathcal{R}(z)|^rdz \leq \infty\ for\ any\ r \geq 1,\ particularly, \int \mathcal{R}(z)^2dz\leq \mu^2\ for\ some\ \mu >0.\\
%     (iii)& \int z^s\mathcal{R}(z)dz=0,\ for\ any\ s=(s_1,...,s_d)\in \mathbb{N}^d\ such\ that\ 1\leq |s| \leq \lfloor \beta \rfloor.\ In\ addition,\\ &\int \left\| z \right\|^\beta|\mathcal{R}(z)|dz \leq \nu\ for\ some\ \nu >0.
% \end{align*}
% \end{definition}
% For simplicity, we sometimes call $\mathcal{R}(\cdot)$ as a $\beta$-valid density kernel if the constants $\mu$ and $\nu$ are not specifically given. Notice that all spherically symmetric compactly supported probability density and product kernels based on compactly supported symmetric univariate densities satisfy the conditions. For instance, the kernel $\mathcal{R}(\mathbf{y})=(2\pi)^{-d/2}\exp(-\left\|\mathbf{y}\right\|^2/2)$ satisfies the conditions with $\beta=\infty$.

% Then, we  can obtain 
% \begin{align}
%     \mathbb{E}_{{\hat p}_{\mathcal{R}}(\mathbf{y}|\mathbf{x})}[f(\mathbf{y}, \mathbf{a})] &= \int \frac{ \sum_{s=1}^S\mathcal{R}_x(\mathbf{k}_s, \mathbf{q}(\mathbf{x}))\mathcal{R}_y(\mathbf{y}_s, \mathbf{y})}{\sum_{s=1}^S \mathcal{R}_x (\mathbf{k}_s, \mathbf{q(\mathbf{x})})} f(\mathbf{y},\mathbf{a}) \mathrm{d} \mathbf{y} \nonumber \\
%     & = \frac{ \sum_{s=1}^S\mathcal{R}_x(\mathbf{k}_s, \mathbf{q})\int \mathcal{R}_y(\mathbf{y}_s, \mathbf{y}) f(\mathbf{y},\mathbf{a}) \mathrm{d} \mathbf{y}}{\sum_{s=1}^S \mathcal{R}_x (\mathbf{k}_s, \mathbf{q(\mathbf{x})})}  \nonumber \\
%     &= \frac{ \sum_{s=1}^S\mathcal{R}_x(\mathbf{k}_s, \mathbf{q}(\mathbf{x}))(f(\mathbf{y}_s, \mathbf{a})+\epsilon)}{\sum_{s=1}^S \mathcal{R}_x (\mathbf{k}_s, \mathbf{q(\mathbf{x})})} 
% \end{align}

% We next prove that the $L_1$ norm of the residue term goes to zero, \ie~ $||\epsilon||_1 \rightarrow 0, $ under the condition that  ${\hat p}_{\mathcal{R}}(\mathbf{y}|\mathbf{x})\rightarrow p(\mathbf{y}|\mathbf{x})$ uniformly as $S \rightarrow \infty$. To prove this, we follow \citet{dai2016provable}'s study and assume that $\mathcal{R}_y(\mathbf{y}, \mathbf{y}')=\mathcal{R}_y(\mathbf{y}-\mathbf{y}')$ is a $(\eta; \nu)$ valid kernel \cite{dai2016provable} which satisfies the following property:

% $\int \mathbf{z}^{\mathbf{a}}\mathcal{R}_y(\mathbf{y})\mathrm{d}\mathbf{y}=0,$ for any $\mathbf{a}=(a_1,...,a_d)\in \mathbb{N}^d$ such\ that $1\leq |\mathbf{a}| \leq \lfloor \eta \rfloor.$ In addition,$ \int \left\| \mathbf{y} \right\|^\eta|\mathcal{R}_y(\mathbf{y})|\mathrm{d}\mathbf{y} \leq \nu$ for some $\nu >0$.
    
%  Notice that all spherically symmetric compactly supported probability densities and product kernels based on compactly supported symmetric univariate densities satisfy the properties.  For instance, the kernel $\mathcal{R}(\mathbf{y})=(2\pi)^{-d/2}\exp(-\left\|\mathbf{y}\right\|^2/2)$ satisfies the conditions with $\beta=\infty$. We denote $\mathcal{R}^h_y(\mathbf{y})=\frac{1}{h^d}\mathcal{R}_y(\frac{\mathbf{y}}{h})$ as the kernel with bandwidth $h$.


% \begin{definition}{($(\eta;\mathcal{L})$-H\"older density function)}
%     We say a density function $f(\cdot)$ is a $(\eta;\mathcal{L})$-H\"older density function if function $f(\cdot)$ is $\lfloor \eta \rfloor$-times continuously differentiable on its support $\Omega$ and satisfies\\
%     \hspace*{1em}(i) for any $z_0$, there exists $L(z_0)>0$ such that\\
%     $$|f(z)-f_{z_0}^{(\eta)}(z)|\leq L(z_0) \left\|z-z_0\right\|^\eta,\forall z \in \Omega $$
%     \hspace*{1.7em} where $q_{z_0}^{(\eta)}$ is the $\lfloor \eta \rfloor$-order Taylor approximation, i.e.
%     $$f_{z_0}^{(\eta)}(z):=\sum_{s=(s_1,...,s_d):|s|\leq \lfloor \eta \rfloor}\frac{(z-z_0)^s}{s!}D^sf(z_0);$$
%     \hspace*{1em}(ii) in addition, the integral $\int L(z)dz \leq \mathcal{L}$.\\

% $f\in C_{\mathcal{L}}^{\eta}(\Omega)$ means $f$ is $(\eta;\mathcal{L})$-H\"older density function.
% \end{definition}
% Then given the above setting for the kernel function and the smooth densities, we can characterize the error of the weighted kernel density estimator as follows.


% For a particular $\mathbf{a}$, we denote the corresponding function with respect to $\mathbf{y}$ as $f_{\mathbf{a}}(\mathbf{y})$. Also following \citet{dai2016provable}'s work, we assume that  $f_{\mathbf{a}}(\mathbf{y})$ is a H\"older function which satisfies: \\
%   \hspace*{1em}(i) for any $\mathbf{y}_0$, there exists $L(\mathbf{y}_0)>0$ such that\\
%     $$|f(\mathbf{y})-f_{\mathbf{a}}^{(\eta;\mathbf{y}_0)}(\mathbf{y})|\leq L(\mathbf{y}_0) \left\|\mathbf{y}-\mathbf{y}_0\right\|^\eta,\forall \mathbf{y} \in \mathcal{Y}, $$
%     \hspace*{1.7em} where $f_{\mathbf{a}}^{(\eta;\mathbf{y}_0)}$ is the $\lfloor \eta \rfloor$-order Taylor approximation at $\mathbf{y}_0$.\\
%     \hspace*{1em}(ii) in addition, the integral $\int L(\mathbf{y})\mathrm{d}\mathbf{y} \leq \mathcal{L}$.\\

% \begin{lemma}
% If $\mathcal{R}_y$ is a $(\eta;\nu)$ valid kernel and $f_{\mathbf{a}}(\mathbf{y})$ is a H\"older function, then
% \begin{align}
%   ||\epsilon||_1 = ||\int \mathcal{R}^h_y(\mathbf{y}_s, \mathbf{y})f(\mathbf{y},\mathbf{a})\mathrm{d}\mathbf{y}-f(\mathbf{y}_s,\mathbf{a})||_1 \leq \nu\mathcal{L}h^{\eta}.
% \end{align}
% \end{lemma}

% \begin{proof}
%     The proof of this lemma follows directly from Chapter 4.3 in \citet{wand1994kernel}'s book.
%     \begin{align*}
%         |\epsilon| =\ & |\mathcal{R}^h_y(\mathbf{y}_s, \mathbf{y})f_{\mathbf{a}}(\mathbf{y})\mathrm{d}\mathbf{y}-f_{\mathbf{a}}(\mathbf{y}_s)|\\
%         =\ & |\int \frac{1}{h^d}\mathcal{R}_y(\frac{\mathbf{y}-\mathbf{y}_s}{h})f_{\mathbf{a}}(\mathbf{y})\mathrm{d}\mathbf{y}-f_{\mathbf{a}}(\mathbf{y}_s)|\\
%         =\ & | \int \frac{1}{h^d}\mathcal{R}_y(\frac{\mathbf{y}}{h})[f_{\mathbf{a}}(\mathbf{y}_s+\mathbf{y})-f_{\mathbf{a}}(\mathbf{y}_s)]\mathrm{d}\mathbf{y} |\\
%         =\ & |\int \mathcal{R}_y(\mathbf{y})[f_{\mathbf{a}}(\mathbf{y}_s+h\mathbf{y})-f_{\mathbf{a}}(\mathbf{y}_s)]\mathrm{d}\mathbf{y}|\\
%         \leq \ & \lvert \int \mathcal{R}_y(\mathbf{y})[f_{\mathbf{a}}(\mathbf{y}_s+h\mathbf{y})-f_{\mathbf{a}}^{(\eta;\mathbf{y}_s)}(\mathbf{y}_s+h\mathbf{y})]\mathrm{d}\mathbf{y} \rvert 
%         \\ &+ \lvert \int \mathcal{R}_y(\mathbf{y})[f_{\mathbf{a}}^{(\eta;\mathbf{y}_s)}(\mathbf{y}_s+h\mathbf{y})-f_{\mathbf{a}}(\mathbf{y}_s)]\mathrm{d}\mathbf{y}\rvert\\
%         \leq \ & L(\mathbf{y}) \int |\mathcal{R}_y(\mathbf{y})|\left\|h\mathbf{y}\right\|^{\eta}d\mathbf{y} + \lvert \int \mathcal{R}_y(\mathbf{y})[f_{\mathbf{a}}^{(\eta;\mathbf{y}_s)}(\mathbf{y}_s+h\mathbf{y})-f_{\mathbf{a}}(\mathbf{y})]\mathrm{d}\mathbf{y}\rvert\\
%     \end{align*}
%     Note that $f_{\mathbf{a}}^{(\eta;\mathbf{y}_s)}(\mathbf{y}_s+h\mathbf{y})-f_{\mathbf{a}}(\mathbf{y})$ is a polynomial of degree at most $\lfloor \eta \rfloor$ with no constant, by the definition of $(\eta;\nu)$-valid density kernel, the second term is zero. Hence, we have $|\epsilon| \leq \nu L(\mathbf{y})h^\eta$, and therefore
%     $$||\epsilon||_1 \leq \nu h^\eta \int L(\mathbf{y})d\mathbf{y} \leq \nu \mathcal{L}h^\eta.$$
% \end{proof}

% As we can see from Lemma 1, the $L_1$ norm of the residue term is bounded by $\nu\mathcal{L}h^{\eta}$. According to \citet{RePEc:bla:stanee:v:57:y:2003:i:2:p:159-176}'s work, when ${\hat p}_{\mathcal{R}}(\mathbf{y}|\mathbf{x})\rightarrow p(\mathbf{y}|\mathbf{x})$ uniformly as $S \rightarrow \infty$, the bandwidth of the kernel should go to zero, \ie~ $h \rightarrow 0$. Hence we obtain  $||\epsilon||_1 \rightarrow 0,$ which further leads to:
% $$ \mathbb{E}_{{\hat p}_{\mathcal{R}}(\mathbf{y}|\mathbf{x})}[f(\mathbf{y}, \mathbf{a})] \rightarrow \frac{ \sum_{s=1}^S\mathcal{R}_x(\mathbf{k}_s, \mathbf{q}(\mathbf{x}))f(\mathbf{y}_s, \mathbf{a})}{\sum_{s=1}^S \mathcal{R}_x (\mathbf{k}_s, \mathbf{q(\mathbf{x})})}.$$

% When $\mathcal{R}_x({\mathbf{k}, \mathbf{q}})$ is a exponential kernel. \ie~ $\mathcal{R}_x({\mathbf{k}, \mathbf{q}})=\text{exp}(\frac{\mathbf{q}^\top\mathbf{k}}{\sqrt{d}})$, we can obtain
% \begin{align}
% \mathbb{E}_{{\hat p}_{\mathcal{R}}(\mathbf{y}|\mathbf{x})}[f(\mathbf{y}, \mathbf{a})] &= \frac{ \sum_{s=1}^S\mathcal{R}_x(\mathbf{k}_s, \mathbf{q}(\mathbf{x}))f(\mathbf{y}_s, \mathbf{a})}{\sum_{s=1}^S \mathcal{R}_x (\mathbf{k}_s, \mathbf{q}(\mathbf{x}))} \nonumber\\
% & = \frac{ \sum_{s=1}^S \exp\left(\frac{\mathbf{q}(\mathbf{x})^\top\mathbf{k}_s}{\sqrt{d}}\right) f(\mathbf{y}_s, \mathbf{a})}{\sum_{s=1}^S \exp\left(\frac{\mathbf{q}(\mathbf{x})^\top\mathbf{k}_s}{\sqrt{d}}\right)} \nonumber \\
% &= \text{Softmax}\left(\left[\frac{\mathbf{q}(\mathbf{x})^\top\mathbf{k}_1}{\sqrt{d}}, \cdots, \frac{\mathbf{q}(\mathbf{x})^\top\mathbf{k}_S}{\sqrt{d}}\right]\right)^\top[f(\mathbf{v}_1, \mathbf{a}), \cdots, f(\mathbf{v}_S, \mathbf{a})] \nonumber \\
% & = g(\mathbf{x},\mathbf{a}).
% \end{align}
% The second last equation comes from the definition of the softmax function and replacing the notation $\mathbf{y}$ with $\mathbf{v}$ which is commonly used in the existing literature.


% Therefore, under the condition that $\hat{p}_{\mathcal{R}}(\mathbf{y}|\mathbf{x}) \rightarrow p(\mathbf{y}|\mathbf{x}) $ uniformly for any $\mathbf{x}$ as $S \rightarrow \infty $, we have:
% \begin{align}
%     g(\mathbf{x}, \mathbf{a})=\mathbb{E}_{{\hat p}_{\mathcal{R}}(\mathbf{y}|\mathbf{x})}[f(\mathbf{y}, \mathbf{a})] \rightarrow \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y}, \mathbf{a})] 
%     \quad \text{as} \quad S \rightarrow \infty.
%  \end{align}

% Besides the Softmax attention,  an alternative attention mechanism is called conditional mean embedding (CME) attention \cite{zhang2022analysis1} which directly mimics the computation of CME weights $\beta_s$ in Eq.~\ref{eq:cme_kernel}:
% \begin{align}
% [\beta_1(\mathbf{x}), \cdots, \beta_S(\mathbf{x})] =    (\mathcal{R}_x(\mathbf{K}, \mathbf{K})+\lambda \mathbf{I})^{-1}\mathcal{R}_x(\mathbf{K},\mathbf{q}(\mathbf{x})),
% \end{align}
% where $\mathcal{R}_x(\mathbf{K}, \mathbf{q}(\mathbf{x})) \in \mathbb{R}^S $ with $\mathcal{R}_x(\mathbf{K}, \mathbf{q}(\mathbf{x}))[s]= \mathcal{R}_x(\mathbf{k}_s, \mathbf{q}(\mathbf{x}))$ and $\mathcal{R}_x(\mathbf{K},\mathbf{K})\in \mathbb{R}^{S\times S}$ with $\mathcal{R}_x(\mathbf{K},\mathbf{K})[s_1,s_2]=\mathcal{R}_x(\mathbf{k}_{s_1},\mathbf{k}_{s_2})$.

% With the CME attention, we obtain the corresponding function $g_{\rm CME}(\mathbf{x}, \mathbf{a})$:
% \begin{align}
%     g_{\rm CME}(\mathbf{x}, \mathbf{a}) =  \left((\mathcal{R}_x(\mathbf{K}, \mathbf{K})+\lambda \mathbf{I})^{-1}\mathcal{R}_x(\mathbf{K},\mathbf{q}(\mathbf{x})) \right)^\top [f(\mathbf{v}_1, \mathbf{a}), \cdots, f(\mathbf{v}_S, \mathbf{a})].
% \end{align}
% According to \citet{zhang2022analysis1}'s research, CME attention converges to the kernel conditional mean embedding. It holds with probability at least $1-\delta$ that 
% \begin{align}
%     || g_{\rm CME}(\mathbf{x}, \mathbf{a})-\mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[f(\mathbf{y},\mathbf{a}) ] ||_2 = \mathcal{O}\left( \sqrt{\frac{S}{\lambda}}\cdot \left(\frac{2}{\lambda}+\sqrt{\frac{\Gamma(S^{-1})\lambda}{\lambda}}\right)\log\frac{1}{\delta}+\lambda S^{-1} \right),
% \end{align}
% where $\Gamma(S^{-1}\lambda)$ is the effective dimension of the covariance operator.

% Note that the CME attention is  essentially a variant of the Softmax attention \cite{vaswani2017attention1} with a different normalization. In our work, we choose to use the Softmax attention which has achieved success in many deep learning applications.
% \emph{Existing work\cite{zhang2022analysis1} has also shown that Softmax attention has the same limit as CME attention as $S \rightarrow \infty$.}

%\end{proof}

% \begin{definition}{($(\eta;\mathcal{L})$-H\"older density function)}
%     We say a density function $f(\cdot)$ is a $(\eta;\mathcal{L})$-H\"older density function if function $f(\cdot)$ is $\lfloor \eta \rfloor$-times continuously differentiable on its support $\Omega$ and satisfies\\
%     \hspace*{1em}(i) for any $z_0$, there exists $L(z_0)>0$ such that\\
%     $$|f(z)-f_{z_0}^{(\eta)}(z)|\leq L(z_0) \left\|z-z_0\right\|^\eta,\forall z \in \Omega $$
%     \hspace*{1.7em} where $q_{z_0}^{(\eta)}$ is the $\lfloor \eta \rfloor$-order Taylor approximation, i.e.
%     $$f_{z_0}^{(\eta)}(z):=\sum_{s=(s_1,...,s_d):|s|\leq \lfloor \eta \rfloor}\frac{(z-z_0)^s}{s!}D^sf(z_0);$$
%     \hspace*{1em}(ii) in addition, the integral $\int L(z)dz \leq \mathcal{L}$.\\

% $f\in C_{\mathcal{L}}^{\eta}(\Omega)$ means $f$ is $(\eta;\mathcal{L})$-H\"older density function.
% \end{definition}
% Then given the above setting for the kernel function and the smooth densities, we can characterize the error of the weighted kernel density estimator as follows.


\section{Experimental Details}
\label{s:experiment}
\subsection{Computing Infrastructure}
\label{s:computing}
System: Ubuntu 18.04.6 LTS; Python 3.9; PyTorch
1.11. CPU: Intel(R) Xeon(R) Silver 4214 CPU @ 2.20GHz. GPU: GeForce RTX 2080 Ti.

\subsection{Synthetic Data}
\label{s:synthetic}
\noindent\textbf{Data generation process:}
We generate the synthetic dataset following a mixture of three Gaussians:
\begin{align}
    &\mathbf{x} \sim \mathcal{U}^2[-1,1], \quad \nonumber \\
    &\mathbf{y} \sim 0.3\mathcal{N}(\mathbf{A}_1\mathbf{x}, 0.1\cdot\mathbf{I}) + 0.3 \mathcal{N}(\mathbf{A}_2\mathbf{x}, 0.1\cdot\mathbf{I}) + 0.4 \mathcal{N}(\mathbf{A}_3\mathbf{x}, 0.1\cdot\mathbf{I}),
\end{align}

where the elements of $\mathbf{A}_1, \mathbf{A}_2, \mathbf{A}_3 \in \mathbb{R}^{2\times 2}$ are uniformly sampled from $\mathcal{U}[0,1].$

We generate 5000 $(\mathbf{x},\mathbf{y})$ pairs, randomly dividing them into a training set (70\%, 3500 pairs), and equal validation and testing sets (15\% each, 750 pairs).
 
\noindent\textbf{Optimization objective:} We consider both the convex and non-convex objectives.

Convex objective:
\begin{align}
   &\text{minimize}_{\mathbf{a}\in \mathbb{R}^2} \mathbb{E}_{p(\mathbf{y}|\mathbf{x})} \left[ 
   \sum_{i=1}^2 \left( 5(\mathbf{y}[i]-\mathbf{a}[i])_{+} + 20(\mathbf{a}[i]-\mathbf{y}[i])_{+} 
   + 0.5(\mathbf{y}[i]-\mathbf{a}[i])_{+}^2 + 0.2(\mathbf{a}[i]-\mathbf{y}[i])_{+}^2 \right)
   \right] \nonumber \\
   &\text{subject to}  \quad -1 \le \mathbf{a}[i] \le 1, \forall i. \nonumber
\end{align}

Non-convex objective:
\begin{align}
    &\text{minimize}_{\mathbf{a} \in \mathbb{R}^2} \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}\sum_{i=1}^2\left[10(\mathbf{y}[i]-\mathbf{a}[i])_{+}^2 + 2(\mathbf{a}[i]-\mathbf{y}[i])_{+}^2 + 4\mathbf{a}[i]^3\right] \nonumber\\
    &\text{subject to} \quad -2 \le \mathbf{a}[i] \le 2, \forall i, \nonumber
\end{align}
where $(v)_{+}$ denotes $\max\{v,0\}$.

\noindent\textbf{Solver at test time:} At test time, for a fair comparison, we use the same optimization solver for all the methods. Specifically, we use projected gradient descent and the gradient update step adopts the Adam \cite{Adam} optimizer. The learning rate is $0.01$ and we repeat $500$ iterations. We empirically found that this solver solves this optimization problem very well.

\noindent\textbf{Model Hyperparameters:}
For the two-stage model, DFL, LODL and SO-EBM, the forecaster uses a GMM with a different number of components, and we use 100 samples to estimate the expectation of the objective, as we found that more samples bring little performance gain. The forecaster uses a neural network with one hidden layer as the feature extractor, which is followed by a linear layer. This network has a hidden size of 128 and employs ReLU as the nonlinear activation function. The forecaster outputs the mean, log variance, and weight for each GMM component. During training, we sample from the GMM using the Gumbel softmax trick~\cite{jang2017categorical} to make the sampling process differentiable.
SO-EBM draws 512 samples from the proposal distribution to estimate the gradient of the model parameters. The proposal distribution is a mixture of Gaussians with 3 components where the variances are $\{0.01, 0.02, 0.05\}$.



For a fair comparison, \ours uses the same feature extractor for the encoder. The attention architecture uses 1000 attention points for both the convex and non-convex objectives. During training, \ours samples 100 actions $\mathbf{a}$ uniformly from the constrained space, \ie~ the box, for each $(\mathbf{x},\mathbf{y})$ pair at each iteration for function fitting. 

 

\noindent\textbf{Model Optimization:}
We use the Adam~\cite{Adam} algorithm for model optimization. The number of training epochs is 50. The learning rate for all the methods is $10^{-3}$. DFL, LODL and SO-EBM use the two-stage model as the pre-trained model for faster training convergence.

\subsection{Wind Power Bidding}
\label{s:wind}
\noindent\textbf{Optimization objective:} 
In this task, a wind power firm engages in both energy and reserve markets,  given the generated wind power $\mathbf{x}\in \mathbb{R}^{24}$ in the last 24 hours. The firm needs to decide the energy quantity $\mathbf{a}_E \in \mathbb{R}^{12}$ to bid and quantity $\mathbf{a}_R \in \mathbb{R}^{12}$ to reserve over the upcoming 12th-24th hours in advance, based on the forecasted wind power $\mathbf{y}\in \mathbb{R}^{12}$.
The optimization objective is to maximize the profit which is a piecewise function consisting of three segments~\cite{di2020bidding1,manasssakan20221}:
{\small
\begin{align}
&\text{maximize}_{\mathbf{a}_E \in \mathbb{R}^{12}, \mathbf{a}_R \in \mathbb{R}^{12}} \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}\sum_{i=1}^{12} P\mathbf{y}[i] - \nu \mathbf{a}_{R}[i] \nonumber\\ &+ 
\begin{cases} 
&-\Delta P_{\rm up,1}(\mathbf{a}_{E}[i]-\mathbf{a}_{R}[i]-\mathbf{y}[i]) -\Delta P_{\rm up,2}(\mathbf{a}_{E}[i]-\mathbf{a}_{R}[i]-\mathbf{y}[i])^2 \\ &- \mu \mathbf{a}_{R}[i] -F, \text{if } \mathbf{y}[i] < \mathbf{a}_{E}[i] - \mathbf{a}_{R}[i] \nonumber\\
&- \mu (\mathbf{a}_{E}[i]-\mathbf{y}[i]), \text{if}\ \mathbf{a}_{E}[i]-\mathbf{a}_{R}[i] \leq \mathbf{y}[i] \leq \mathbf{a}_{E}[i] \\
&-\Delta P_{\rm down}(\mathbf{y}[i]-\mathbf{a}_{E}[i]),\text{if } \mathbf{y}[i] > \mathbf{a}_{E}[i] 
\end{cases}
\end{align}
}
\hspace*{1em}$\text{subject to}  \quad E_{\rm min} \le \mathbf{a}_E[i] \le E_{\rm max},  R_{\rm min} \le \mathbf{a}_R[i] \le R_{\rm max}, \quad \forall i.$


$P$ is the regular price of the wind energy sold, $\mathbf{y}[i]$ is the energy generated during period $i$,  $\mathbf{a}_{E}[i]$ and $\mathbf{a}_{R}[i]$ are the bid and up reserve energy volumes for period $i$, respectively. $\nu$ corresponds to the opportunity cost when the company participates in the reserve markets, and $\mu$ is the deploy price of the reserved energy. This structure encapsulates three market participation scenarios. In the scenario where $\mathbf{y}[i] < \mathbf{a}_{E}[i] - \mathbf{a}_{R}[i]$, the company overbids, consequently deploying all reserved energy and facing a linear overbidding penalty, a quadratic overbidding penalty and a constant penalty determined by coefficients $\Delta P_{\rm up,1}$, $\Delta P_{\rm up,2}$, and $F$. If $ \mathbf{a}_{E}[i]-\mathbf{a}_{R}[i] \leq \mathbf{y}[i] \leq \mathbf{a}_{E}[i]$, the company meets its bid by deploying reserve market energy, thereby avoiding penalties. In this case, the company only needs to pay the deployment fee for the reserved energy. However, when $\mathbf{y}[i] > \mathbf{a}_{E}[i]$, the company underbids, resulting in the selling of surplus electricity at a discount and incurring losses defined by the coefficient $\Delta P_{\rm down}$. 
We set $P$ to 100, according to the average bidding price obtained from Nord Pool, a European power exchange. The parameters $\nu$ and $\mu$ are set to 20 and 110, respectively, as a general setting~\cite{di2020bidding1,manasssakan20221}. The values of $\Delta P_{\rm up,1}$, $\Delta P_{\rm up,2}$, $\Delta P_{\rm down}$ and $F$ are set to 200, 100, 20 and 10, respectively, to ensure an effective penalty. The bounds are $E_{\rm min}=0$, $R_{\rm min}= 0.15$, and $E_{\rm max}=R_{\rm max}=4$.

According to the optimality condition, the optimal $\mathbf{a}_{R}[i]$ is always equal to $R_{\rm min}$ for all $i$. Therefore, we only need to determine the decision variable $\mathbf{a}_E$. 


We use the wind power generation dataset of the German energy company TenneT from 08/23/2019 to 09/22/2020.\footnote{The dataset is available at: \url{https://www.kaggle.com/datasets/jorgesandoval/wind-power-generation?select=TransnetBW.csv}}
The split ratios of the training, validation, and test datasets are 64\%, 16\%, and 20\%, respectively.


\noindent\textbf{Solver at test time:} At test time, for a fair comparison, we use the same optimization solver for all the methods. Specifically, we use projected gradient descent and the gradient update step adopts the Adam~\cite{kingma2015adam} optimizer. The learning rate is $0.1$ and we repeat $500$ iterations. We empirically found that this solver solves this optimization problem very well.


\noindent\textbf{Model Hyperparameters:}
For the two-stage model, DFL and SO-EBM, the forecaster uses a GMM with a different number of components, and we use 100 samples to estimate the expectation of the objective, as we found that more samples bring little performance gain. The forecaster uses a two-layer long short-term memory network (LSTM) as the feature extractor, which is followed by a linear layer. The network has a hidden size of 256. It takes the historical wind power in the last 24 hours as input features and outputs the forecasted wind power for the 12th to 24th hours in the future. The forecaster outputs the mean, log variance, and weight for each GMM component. During training, we sample from the GMM using the Gumbel softmax trick~\cite{jang2016categorical} to make the sampling process differentiable.
SO-EBM draws 512 samples from the proposal distribution to estimate the gradient of the model parameters. The proposal distribution is a mixture of Gaussians with 3 components where the variances are $\{0.02, 0.05, 0.1\}$.


For a fair comparison, \ours uses the same LSTM architecture as the encoder and 500 attention points. During training, we sample 100 actions $\mathbf{a}$ uniformly  from the constrained space for each $(\mathbf{x},\mathbf{y})$ pair at each iteration.



% We use .. .\ours draws 100 action samples at each training iteration. DFL, two-stage and policy-net draws 100 samples form $p(\mathbf{y}|\mathbf{x};\theta)$ to estimate the expectation of the task loss. The split ratio of training dataset, validation dataset and test datset are 64\%, 16\%, 20\%, respectively.

\noindent\textbf{Model Optimization:}
We use the Adam~\cite{kingma2015adam} algorithm for model optimization. The number of training epochs is 200. The learning rate for all the methods is $10^{-3}$. DFL and SO-EBM use the two-stage model as the pre-trained model for faster training convergence.

\subsection{COVID-19 Vaccine Distribution}
\label{s:covid}
\noindent\textbf{Optimization objective:} In this task, we are given the origin--destination (OD) matrices $\mathbf{x}\in \mathbb{R}^{47\times 47 \times 7}$ of the last week, where $\mathbf{x}[i,j,t]$ represents the number of people moving from region $i$ to region $j$ on day $t$, and we need to decide the vaccine distribution $\mathbf{a} \in \mathbb{R}^{47}$ across the 47 regions in Japan under a budget constraint ($\mathbf{a}[i]$ is the number of vaccines distributed to region $i$).
 The optimization objective is to
minimize the total number of infected people over the ODE-driven dynamics, based on the forecasted OD matrices $\mathbf{y}\in \mathbb{R}^{47 \times 47 \times 7}$ for the next week.

We want to distribute the vaccine over each county to minimize the number of infected cases. The number of infected cases is given by a metapopulation SEIRV model \cite{li2020substantial1,pei2020differential1}, denoted by $\text{Simulator}(\cdot,\cdot)$:
\begin{align}
    \argmin_{\mathbf{a}\in \mathbb{R}^{47}} \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}[\text{Simulator}(\mathbf{y}, \mathbf{a})], \nonumber\\
    \text{Subject to} \quad \sum_i \mathbf{a}[i] \le \text{Budget}, \mathbf{a}[i] \ge 0. \nonumber
\end{align}
We use the OD matrices dataset of Japan\footnote{The dataset is available at \url{https://github.com/deepkashiwa20/ODCRN/tree/main/data}} from 04/01/2020 to 02/28/2021. The split ratios of the training, validation, and test datasets are 64\%, 16\%, and 20\%, respectively. We set the budget to $5\times10^{6}$.

\textbf{Details of the simulator:}
The SEIRV model is an epidemiological model used to predict and understand the spread of infectious diseases. It divides the population into five compartments: Susceptible (S), Exposed (E), Infectious (I), Recovered (R) and Vaccinated (V).
The model is defined by a set of differential equations that describe the transitions between these compartments. There are four hyperparameters in the SEIRV model: 
\begin{itemize}
    \item  $\beta$ - Transmission rate: Represents the average number of contacts per person per unit of time multiplied by the probability of disease transmission in a contact between a susceptible and an infectious individual.
\item $\sigma$ - Latent rate (or the inverse of the incubation period): The rate at which exposed individuals progress to the infectious state. The incubation period is the time it takes for an individual to become infectious after exposure.

 \item $\gamma$ - Recovery rate (or the inverse of the infectious period): The rate at which infectious individuals recover or die and transition to the recovered state. The infectious period is the time during which an infected individual can transmit the disease.

\item $N$ - Total population: The sum of individuals in all compartments (S, E, I, R, V).
\end{itemize}


When considering mobility flow among different regions, we need to adapt the SEIRV model to account for the movement of individuals between regions. In this case, the model becomes a spatially explicit, multi-region SEIRV model. Each region will have its own SEIRV model, and the flow of individuals between regions will affect the dynamics of the compartments. Specifically, for each region $k=1,\cdots, K$, we have:
\begin{align}
\frac{\mathrm{d}\mathbf{S}[k]}{\mathrm{d}t} &= -\bm{\beta}[k] \frac{\mathbf{S}[k] \cdot\mathbf{I}[k]}{\mathbf{N}[k]} - \frac{\mathbf{S}[k]}{\mathbf{S}[k]+\mathbf{E}[k]}\cdot\frac{\mathbf{a}[k]}{T} \nonumber\\ &+ \sum_{i \neq k} \tilde{\mathbf{y}}[i,k, t] \cdot\mathbf{S}[i] - \sum_{j \neq k} \tilde{\mathbf{y}}[k,j,t] \cdot\mathbf{S}[k], \nonumber\\
\frac{\mathrm{d}\mathbf{E}[k]}{\mathrm{d}t} &= \bm{\beta}[k] \frac{\mathbf{S}[k] \cdot\mathbf{I}[k]}{\mathbf{N}[k]} - \bm{\sigma}[k] \cdot\mathbf{E}[k] - \frac{\mathbf{E}[k]}{\mathbf{S}[k]+\mathbf{E}[k]} \cdot\frac{\mathbf{a}[k]}{T} \nonumber\\&+ \sum_{i \neq k} \tilde{\mathbf{y}}[i,k,t] \cdot\mathbf{E}[i] - \sum_{j \neq k} \tilde{\mathbf{y}}[k,j,t] \cdot\mathbf{E}[k], \nonumber\\
\frac{\mathrm{d}\mathbf{I}[k]}{\mathrm{d}t} &= \bm{\sigma}[k]\cdot \mathbf{E}[k] - \bm{\gamma}[k] \cdot\mathbf{I}[k] \nonumber\\ &+ \sum_{i \neq k} \tilde{\mathbf{y}}[i,k,t]\cdot \mathbf{I}[i] - \sum_{j \neq k} \tilde{\mathbf{y}}[k,j,t] \cdot\mathbf{I}[k], \nonumber \\
\frac{\mathrm{d}\mathbf{R}[k]}{\mathrm{d}t} &= \bm{\gamma}[k] \cdot\mathbf{I}[k] + \sum_{i \neq k} \tilde{\mathbf{y}}[i,k,t]\cdot \mathbf{R}[i] - \sum_{j \neq k} \tilde{\mathbf{y}}[k,j,t] \cdot\mathbf{R}[k], \nonumber\\
\frac{\mathrm{d}\mathbf{V}[k]}{\mathrm{d}t} &= \frac{\mathbf{a}[k]}{T} + \sum_{i \neq k} \tilde{\mathbf{y}}[i,k,t]\cdot \mathbf{V}[i] - \sum_{j \neq k} \tilde{\mathbf{y}}[k,j,t] \cdot\mathbf{V}[k],
\end{align}
where $\bm{\beta}[k]$, $\bm{\gamma}[k]$, and $\bm{\sigma}[k]$ are the hyperparameters for region $k$. These hyperparameters are fitted on the dataset using maximum likelihood estimation. $\tilde{\mathbf{y}}$ is the normalized OD matrix.

Finally, the simulator will output the total number of newly infected people across all the regions and we aim to minimize this value.


\noindent\textbf{Solver at test time:} At test time, for a fair comparison, we use the same optimization solver for all the methods. Specifically, we use mirror descent~\cite{beck2003mirror} so that the updated decision variable still satisfies the constraints. The update rule takes the following form at the $t$-th iteration:
\begin{align}
    \mathbf{a}_{t+1}[i] = \text{Budget}\cdot\frac{\mathbf{a}_t[i] \exp(-\gamma\nabla_i f(\mathbf{a}_t))}{\sum_{j=1}^n\mathbf{a}_t[j]\exp(-\gamma\nabla_j f(\mathbf{a}_t))},
\end{align}
where $\gamma$ is the learning rate. We set the learning rate as $0.01$ and repeat $500$ iterations. We empirically found that this solver solves this optimization problem very well.



\noindent\textbf{Model Hyperparameters:}
For the two-stage model, DFL, LODL and SO-EBM, the forecaster uses a GMM with a different number of components, and we use 100 samples to estimate the expectation of the objective, as we found that more samples bring little performance gain. The forecaster is a DC-RNN~\cite{li2018diffusion1}, which adopts an encoder-decoder architecture. The encoder and decoder both have two hidden layers with a hidden size of 128. The forecaster takes the OD matrices of the last week as input features and predicts the OD matrices of the next week. The forecaster outputs the mean, log variance, and weight for each GMM component. During training, we sample from the GMM using the Gumbel softmax trick~\cite{jang2017categorical} to make the sampling process differentiable. Since the decision variable lies in a simplex, we train SO-EBM with projected Langevin dynamics. Specifically, at each iteration of the Langevin dynamics, we project the decision variable onto the simplex. The number of iterations of the Langevin dynamics is 100 and the step size is 0.05.

 
 For a fair comparison, \ours employs the same encoder as the DC-RNN architecture and uses 100 attention points.  During training, \ours samples 100 actions $\mathbf{a}$ uniformly from the constrained space, \ie~ the simplex, for each $(\mathbf{x},\mathbf{y})$ pair at each iteration for function fitting. To uniformly sample from the simplex, we sample from the Dirichlet distribution where all parameters are 1.



\noindent\textbf{Model Optimization:}
We use the Adam~\cite{Adam} algorithm for model optimization. The number of training epochs is 50. The learning rate for all the methods is $10^{-4}$. DFL, LODL and SO-EBM use the two-stage model as the pre-trained model for faster training convergence. 



\subsection{Inventory Optimization}
\label{s:customer}

\noindent\textbf{Optimization objective:} In this task, a department store is tasked with predicting the sales $\mathbf{y} \in \mathbb{R}^7$ for the upcoming 7th--14th days based on the past 14 days' sales data $\mathbf{x} \in \mathbb{R}^{14}$ for a specific product, and accordingly, determining the best replenishment strategy $\mathbf{a} \in \mathbb{R}^7$ for each day. The optimization objective is a combination of an under-purchasing penalty, an over-purchasing penalty, and a squared loss between supplies and demands:
\begin{align}
   \text{minimize}_{\mathbf{a}\in \mathbb{R}^7} \mathbb{E}_{p(\mathbf{y}|\mathbf{x})}&\sum_{i=1}^7[20(\mathbf{y}[i]-\mathbf{a}[i])_{+} + 5(\mathbf{a}[i]-\mathbf{y}[i])_{+} \nonumber\\ &+ (\mathbf{a}[i]-\mathbf{y}[i])^2]\nonumber \\
   % &\text{subject to}  \quad \mathbf{G}\mathbf{a} \le \mathbf{h}\nonumber
   &\text{subject to} \quad 0 \le \mathbf{a}[i] \le 3, \forall i, \nonumber
\end{align}
where $(v)_{+}$ denotes $\max\{v,0\}$.


\noindent\textbf{Solver at test time:} At test time, for a fair comparison, we use the same optimization solver for all the methods. Specifically, we use projected gradient descent and the gradient update step adopts the Adam \cite{Adam} optimizer. The learning rate is $0.1$ and we repeat $500$ iterations. We empirically found that this solver solves this optimization problem very well.

\noindent\textbf{Model Hyperparameters:}
The forecaster of the two-stage model, DFL, LODL and SO-EBM uses a two-layer long short-term memory network (LSTM) \cite{hochreiter1997long1} as a feature extractor which is further stacked by a linear layer. The forecaster takes the historical item sales in the last 14 days as input features and outputs the forecasted item sales for the 7th to 14th days in the future. The network has a hidden size of 128. SO-EBM draws 512 samples from the proposal distribution to estimate the gradient of the model parameters. The proposal distribution is a mixture of Gaussians with 3 components where the variances are $\{0.05, 0.1, 0.2\}$.



For a fair comparison, \ours uses the same LSTM architecture as the encoder and 230 attention points. During training, the two-stage model, DFL, LODL and SO-EBM use 100 samples to estimate the expected objective as more samples provide little performance gain. 

\noindent\textbf{Model Optimization:}
We use the Adam~\cite{Adam} algorithm for model optimization. The number of training epochs is 200. The learning rate for all the methods is $10^{-3}$. DFL, LODL and SO-EBM use the two-stage model as the pre-trained model for faster training convergence.

%following are appendix_refs
