%=========================
\section{Proof of Proposition~\ref{prop:metric}}
%=========================


To demonstrate that the OTD$^3$, $d_{OT}(\cdot, \cdot; c_{PtO})$, is a valid metric, it is sufficient to verify that the ground cost function $c_{PtO}$ used in the optimal transport problem is a metric on $\X \times \Y \times \Omega$. If $c_{PtO}$ is indeed a metric, then $d_{OT}(\cdot, \cdot; c_{PtO})$ corresponds to the Wasserstein distance~\cite{villani_optimal_2008}. In Equation~\ref{eq:cpto}, $c_{PtO}$ is defined as a convex combination of $d_{\X}$ and $d_{\Y}$, which are metrics on $\X$ and $\Y$ respectively, and the decision quality disparity $l_q$. To show that $c_{PtO}$ is a metric, it suffices to show that $l_q$ satisfies the four metric properties: non-negativity, identity of indiscernibles, symmetry, and the triangle inequality. If $l_q$ does not individually satisfy these properties, we must demonstrate that the convex combination of $d_{\X}$, $d_{\Y}$, and $l_q$ satisfies these properties collectively under the assumption that $\alpha_X,\alpha_Y,\alpha_W > 0$.

First, $l_q$ is clearly non-negative because it is defined as an absolute value. It is symmetric in the convex combination of $c_{PtO}$ because it is taken as the absolute difference between two decision qualities with fixed true costs. 
\begin{align*}
    \ldq(z, z'; y', y') 
    &= \big| g(z; y') - g(z'; y') \big| \\
    &= \big| g(z'; y') - g(z; y') \big| \\
    &= \ldq(z', z; y', y')
\end{align*}
Moreover, $l_q$ satisfies triangle inequality due to the triangle inequality property of the absolute value.
\begin{align*}
    \ldq(z_1, z_3; y_1, y_3)
    &= \big| g(z_1; y_1) - g(z_3; y_3) \big| \\
    &= \big| g(z_1; y_1) - g(z_2; y_2) + g(z_2; y_2) - g(z_3; y_3) \big| \\
    &\leq \big| g(z_1; y_1) - g(z_2; y_2) \big| + \big| g(z_2; y_2) - g(z_3; y_3) \big| \\
    &= \ldq(z_1, z_2; y_1, y_2) + \ldq(z_2, z_3; y_2, y_3)
\end{align*}
Lastly, while $l_q$ might not satisfy the identity of indiscernibles in isolation (specifically, $l_q(z, z'; y, y) = 0$ does not necessarily imply $z = z'$, meaning two different decisions can lead to the same objective value), $c_{PtO}$ does satisfy this property for $\alpha_X,\alpha_Y,\alpha_W > 0$. If $(x,y,z) = (x',y',z')$, then $d_{\X}(x, x') = d_{\Y}(y, y') = 0$ and $\ldq(z, z'; y', y') = \big| g(z; y') - g(z'; y') \big| = 0$ because $z=z'$ implies $g(z; y') = g(z'; y')$, and hence $c_{PtO}((x, y, z), (x', y', z'))=0$. Conversely, if $c_{PtO}((x, y, z), (x', y', z')) = 0$, then $d_{\X}(x, x') = 0$, $d_{\Y}(y, y') = 0$, and $\ldq(z, z'; y', y') = 0$ because $\alpha_X,\alpha_Y,\alpha_W > 0$ and each term is non-negative. Since $d_{\X}$ and $d_{\Y}$ are metrics, it follows that $x = x'$ and $y = y'$; consequently $w^*(y) = w^*(y')$ and hence $z = z'$.

Therefore, $c_{PtO}$ satisfies the identity of indiscernibles. Consequently, since $l_q$ satisfies non-negativity, symmetry, and the triangle inequality, and since $c_{PtO}$ satisfies the identity of indiscernibles, $d_{OT}(\cdot, \cdot; c_{PtO})$ is indeed a valid metric with $c_{PtO}$ a valid metric on $\X \times \Y \times \Omega$.


\section{Preamble for Theorem~\ref{theo1}}

\subsection{Validity of Assumption~\ref{assump:1}}
Assumption~\ref{assump:1} imposes a specific structure on the downstream optimization problem by assuming that the decision quality function has a bounded rate of change with respect to both the predicted and true cost vectors. This is a reasonable assumption for certain downstream optimization tasks, as highlighted in the following lemmas. 

\begin{lemma}
\label{lemma1}
    If $M(\cdot)$ is a convex program with a strongly convex objective and constraints with independent derivatives (Linear Independence Constraint Qualification (LICQ)), Assumption~\ref{assump:1} holds. 
\end{lemma}
The strong convexity of the objective ensures that the gradient is Lipschitz continuous, while the LICQ guarantees that the optimal solutions depend continuously on the parameters. By the smoothness of the objective and the continuity of the optimal solutions, the difference in the decision quality function $q$ between two sets of parameters and their corresponding optimal solutions can be bounded by a linear combination of the distances between the parameters and the distances between the optimal solutions.

\begin{lemma}
\label{lemma2}
    If $M(\cdot)$ has a linear optimization objective with a strongly convex feasible region, Assumption \ref{assump:1} holds.
\end{lemma}
When $M(\cdot)$ has a linear optimization objective and a strongly convex feasible region, the decision quality function $q$ satisfies the $k_1,k_2$-Lipschitz property. The linearity of the objective ensures that changes in the parameters lead to proportional changes in the objective value, while the strong convexity of the feasible region guarantees that the optimal solutions are unique and vary smoothly with respect to the parameters. This smooth dependence, combined with the linear structure of the objective, implies that the difference in $q$ between two sets of parameters and their corresponding optimal solutions can be bounded by a linear combination of the distances between the parameters and the distances between the optimal solutions.

\subsection{Lipschitzness of the Decision Quality Disparity Function}
To establish the bound presented in Theorem \ref{theo1}, we rely on the fact that $\ldq$ is $k_1,k_2$-Lipschitz under Assumption \ref{assump:1}. The following proposition demonstrates that $\ldq$ indeed satisfies the Lipschitz condition given this assumption.

\begin{proposition}
If $g$, the objective function of the downstream optimization problem, is $k_1,k_2$-Lipschitz (Assumption \ref{assump:1}), then $\ldq$ is also $k_1,k_2$-Lipschitz.
\end{proposition}

\begin{proof}
    \begin{align}
    &\big| \ldq(z,z_1; y,y_1) - \ldq(z,z_2; y,y_2) \big| \notag \\
    &= \big| \abs{g(z;y)-g(z_1;y_1)} - \abs{g(z;y)-g(z_2;y_2)} \big| \notag \\
    &\leq \big| g(z;y) - g(z_1;y_1) - g(z;y) + g(z_2;y_2) \big| \label{eq:kl1}\\
    &= \big|g(z_2;y_2) - g(z_1;y_1) \big| \notag \\
    &= \big|g(z_2;y_2) - g(z_1;y_2) + g(z_1;y_2) - g(z_1;y_1)  \big| \notag \\
    &\leq \big|g(z_2;y_2) - g(z_1;y_2)\big| + \big|g(z_1;y_2) - g(z_1;y_1) \big| \label{eq:kl2}\\
    &\leq k_1  \norm{z_1-z_2} + k_2 \norm{y_1-y_2} \label{eq:kl3}
\end{align}
Inequality~\eqref{eq:kl1} follows from the reverse triangle inequality for the absolute value, and inequality~\eqref{eq:kl2} from the triangle inequality. Inequality~\eqref{eq:kl3} is due to the $k_1,k_2$-Lipschitzness of $g$.
\end{proof}


\section{Proof of Theorem~\ref{theo1}} \label{sec:proof_theorem}

\begin{theorem}    
    Suppose Assumption \ref{assump:1} holds. For a feature space $\X$, a label space $\Y$, and a decision set $\Omega$, let $\Z := \X \times \Y \times \Omega$. Let $\Pd_{S}$ and $\Pd_{T}$ be the source and target distributions over $\X \times \Y$ respectively. For any labeling function $f:\X \to \Y$, let $\Pd_{T}^{f}$ and $\Pd_{S}^{*}$ be distributions over $\Z$ given by $\Pd_{T}^{f} := (x,y,w^*(f(x)))_{(x,y) \sim \Pd_T}$ and $\Pd_{S}^{*} := (x,y,w^*(y))_{(x,y) \sim \Pd_S}$. For a ground cost function of the form \looseness=-1
    \begin{align*}
        c_{PtO}^{\bm{\alpha}}((x,y,z),(x',y',z')) = \alpha_X d_{\X}(x,x') + \alpha_Y d_{\Y}(y,y') + \alpha_W \ldq(z,z';y',y'), \notag
    \end{align*}
    let $\Pi^*$ be the coupling that minimizes the OT problem with ground cost $c_{PtO}^{\bm{\alpha}}$ between $\Pd_{T}^{f}$ and $\Pd_{S}^{*}$. Let $\Tilde{f}$ be a labeling function that is $\phi$-Lipschitz transferable w.r.t. $\Pi^*$. We assume $\X$ is bounded by $K$ and $\Tilde{f}$ is $l$-Lipschitz, such that $|\Tilde{f}(x_1)-\Tilde{f}(x_2)| \leq 2lK = L$. Then, for all $\lambda > 0$ and $\alpha_W \in (0,1)$ such that $(\lambda k_1 + k_2 + 1)\alpha_W = 1$, and $\alpha_X = \lambda k_1 \alpha_W$ and $\alpha_Y = k_2\alpha_W$, we have with probability at least $1-\delta$ that:
    \begin{align*}
        err(f; \dqreg, \Pd_T)  \leq \ err(\Tilde{f}; \dqreg, \Pd_S) + err(\Tilde{f}; \dqreg, \Pd_T) + k_1L\phi(\lambda)
        + (1/\alpha_{W}) d_{OT}(\Pd_{T}^{f}, \Pd_{S}^{*} \ ; c_{PtO}^{\bm{\alpha}})
    \end{align*}
\end{theorem}
\begin{proof}
\begin{align}
    er&r(f; \dqreg, \Pd_T) \notag \\
    % &= \Et \dqreg(f(x),y) \notag \\
    &= \Et \ldq(w^*(f(x)),w^*(y);y,y) \notag \\
    &\leq \Et \ldq(w^*(f(x)),w^*(\Tilde{f}(x));y,y) + \Et \ldq(w^*(\Tilde{f}(x)),w^*(y);y,y) \label{eq:bound1}\\
    &=  \Et \ldq(w^*(\Tilde{f}(x)), w^*(f(x));y,y) + \Et \ldq(w^*(\Tilde{f}(x)),w^*(y);y,y) \label{eq:bound2}\\
    &=  \Etf \ldq(w^*(\Tilde{f}(x)), z;y,y) + \Et \ldq(w^*(\Tilde{f}(x)),w^*(y);y,y) \label{eq:bound3} \\
    &=  \Etf \ldq(w^*(\Tilde{f}(x)), z;y,y) - err(\Tilde{f}; \dqreg, \Pd_S) + err(\Tilde{f}; \dqreg, \Pd_S) + err(\Tilde{f}; \dqreg, \Pd_T) \notag \\
    &=  \Etf \ldq(w^*(\Tilde{f}(x)), z;y,y) - \Etstar \ldq(w^*(\Tilde{f}(x)), z;y,y) + err(\Tilde{f}; \dqreg, \Pd_S) + err(\Tilde{f}; \dqreg, \Pd_T) \notag \\
    &\leq  \big| \Etf \ldq(w^*(\Tilde{f}(x)), z;y,y) - \Etstar \ldq(w^*(\Tilde{f}(x)), z;y,y)\big| + err(\Tilde{f}; \dqreg, \Pd_S) + err(\Tilde{f}; \dqreg, \Pd_T) \notag
\end{align}

Inequality (\ref{eq:bound1}) uses the fact that $\ldq(\ \cdot \ ;y,y)$ satisfies the triangle inequality and line (\ref{eq:bound2}) is due to the symmetry of $\ldq(\ \cdot \ ;y,y)$ for any $y \in \C$. Line (\ref{eq:bound3}) comes from the fact that $\Pd_{T}^{f} := (x,y,w^*(f(x)))_{(x,y) \sim \Pd_T}$. We continue by bounding the first term.
\begin{align}
    &\big|\Etf \ldq(w^*(\Tilde{f}(x)), z;y,y) - \Etstar \ldq(w^*(\Tilde{f}(x)), z;y,y)\big| \notag \\[0.1cm]
    &=  \abs{\int_{\Z} \ldq(w^*(\Tilde{f}(x)),z;y,y)(\Pd_{T}^{f}(X=x, Y=y, Z=z)-\Pd_{S}^{*}(X=x, Y=y, Z=z))\dt x \dt y \dt z} \notag \\
    &=  \abs{\int_{\Z} \ldq(w^*(\Tilde{f}(x)),z;y,y) \dtproof}
        \notag \\
    &\leq  \int_{\Z^2} 
        \abs{\ldq(\Tilde{z}_t,z_{t}^{f};y_t,y_t)
        -\ldq(\Tilde{z}_s,z_{s};y_s,y_s)}
        \dtproofshort
        \label{eq:bound4}\\ %tag
    &\leq   \int_{\Z^2} 
            \Big|\ldq(\Tilde{z}_t,z_{t}^{f};y_t,y_t) 
            - \ldq(\Tilde{z}_s,z_{t}^{f};y_s,y_t)\Big|
            + \Big|\ldq(\Tilde{z}_s,z_{t}^{f};y_s,y_t) 
            - \ldq(\Tilde{z}_s,z_{s};y_s,y_s)\Big| 
            \dtproofshort
            \label{eq:bound5}\\   
    &\leq   \int_{\Z^2} 
            k_1 d_\C(\Tilde{f}(x_t),\Tilde{f}(x_s)) + k_2 d_\C({y_{t},y_s})
            + \Big|\ldq(\Tilde{z}_s,z_{t}^{f};y_s,y_t) 
            - \ldq(\Tilde{z}_s,z_{s};y_s,y_s)\Big| 
            \dtproofshort
            \label{eq:bound6}\\
    &\leq   \ k_1L\phi(\lambda) + \int_{\Z^2} 
            \lambda k_1d_\mathcal{X}(x_t,x_s) 
            + k_2 d_\C({y_{t},y_s})
            + \Big|\ldq(\Tilde{z}_s,z_{t}^{f};y_s,y_t) - \ldq(\Tilde{z}_s,z_{s};y_s,y_s)\Big| 
            \dtproofshort\\
    &\leq   \ k_1L\phi(\lambda) + \int_{\Z^2} 
            \lambda k_1 d_\mathcal{X}(x_t,x_s) 
            + k_2 d_\C({y_{t},y_s}) + \ldq(z_{t}^{f},z_s;y_s,y_s) 
            \dtproofshort
            \notag
\end{align}


From line (\ref{eq:bound4}) onwards we take $\vb{w}_{s}:=(x_s,y_s,z_s)$, $\vb{w}_{t}^{f}:=(x_t,y_{t},z_{t}^{f})$ and $\Tilde{z}_s = w^*(\Tilde{f}(x_s))$, $\Tilde{z}_t = w^*(\Tilde{f}(x_t))$ for ease of notation. Given a weight $\alpha_W$, we now normalize the last term such that the ground cost function is a convex combination of $d_\X$, $d_\Y$, and $\ldq$.


\begin{align}
    & \int_{\Z^2} \lambda k_1 d_\mathcal{X}(x_t,x_s) 
    + k_2 d_\C({y_{t},y_s}) + \ldq(z_{t}^{f},z_s;y_s,y_s) 
    \dtproofshort
    \notag \\
    &= \frac{1}{\alpha_W}\int_{\Z^2} 
    \lambda k_1 \alpha_W  d_\X(x_t,x_s) 
    + k_2 \alpha_W d_\C({y_{t},y_s})
    + \alpha_W  \ldq(z_{t}^{f},z_s;y_s,y_s) 
    \dtproofshort
    \notag \\
    &= \frac{1}{\alpha_W} \ d_{OT}(\Pd^{f}_{T},\Pd^{*}_{S}; c_{PtO}^{\bm{\alpha}}) \notag
\end{align}
\end{proof}


\section{Experimental Settings Details} \label{app:experiments}
\subsection{Linear Model Top-K \cite{shah_decision-focused_2022}}
\paragraph{PtO task description.} The Linear Model Top-K setting is a learning task designed to evaluate decision-focused learning approaches in scenarios where the true relationship between features and outcomes is nonlinear, yet the model used for prediction is constrained to be linear. Specifically, the objective is to train a linear model to perform top-$K$ selection when the underlying data is generated by a cubic polynomial function. This controlled setup enables an assessment of how well decision-focused methods handle model misspecification. The predict-then-optimize (PtO) task in this setting is defined as follows:

\begin{itemize}[leftmargin=0.5cm]
    \item[] \textit{Predict:} Given the feature $x_n \sim \Pd_\X$, where $\Pd_\mathcal{X} = \text{Unif}[-1, 1]$, of a resource $n$, the prediction task consists of using a linear model to predict the corresponding utility $\hat{y}_n$, where the true utility $y_n = p(x_n)$ is a cubic polynomial in $x_n$. The predictions for $N$ resources are aggregated into a vector $\hat{\bm{y}} = [\hat{y}_1, \ldots, \hat{y}_N]$, where each element corresponds to the predicted utility of a resource.
    
    \item[] \textit{Optimize:} The optimization task involves selecting the $K$ out of $N$ resources with the highest utility. This corresponds to solving the optimization problem $M(\hat{\bm{y}}) = \max_{\bm{z} \in [0,1]^N} \{\bm{z} \cdot \sigma_x(\hat{\bm{y}})\}$ such that $||\bm{z}||_0 = K$, where $\sigma_x$ is the permutation that orders $\hat{\bm{y}}$ in ascending order of $\bm{x} = [x_1, \ldots, x_N]$.
\end{itemize}

\paragraph{Synthetic distribution shift.} We introduce synthetic distribution shifts to create a scenario for transfer learning. We modify the original feature-label distribution \( \Pd = (\text{Id}, p)_*U[-1,1] \). Specifically, for various values of \( \gamma \in [0,1.3] \), we define the feature-label distributions \( \Pd_\gamma = (\text{Id}, p_\gamma)_*U[-1,1] \) where \( p_{\gamma}(x_n) = 10(x_{n}^3 - \gamma x_n) \), using \( \Pd_{0.65} \) as the target distribution.

\paragraph{Training details.} We use the implementation from \citet{shah_decision-focused_2022}\footnote{\url{github.com/sanketkshah/LODLs}} to train models by setting \texttt{loss="DFL"}. This implementation uses an entropy-regularized Top-K loss function proposed by \citet{xie_differentiable_2020} that reframes the Top-K problem with entropy regularization as an optimal transport problem, enabling end-to-end learning. 


\subsection{Warcraft Shortest Path \cite{vlastelica_differentiation_2020}}
\paragraph{PtO task description.} This setting involves finding the minimum-cost path on $d \times d$ RGB grid maps from the Warcraft II tileset dataset, where each pixel represents terrain with an unknown traversal cost. The task is to first predict these costs from an input image and then determine the shortest path from the top-left to the bottom-right corner based on the predicted cost map. This benchmark is particularly notable because it involves image inputs, a modality not widely explored in other shortest-path learning tasks. Following \citet{pogancic_differentiation_2020}, we use $96 \times 96$ RGB images as input, with the shortest path being computed on a coarser $12 \times 12$ grid representation of the predicted costs.

\begin{itemize}[leftmargin=0.5cm]
    \item[] \textit{Predict:} Given the feature $x_n \in \mathbb{R}^{d \times d \times 3}$, predict the travel cost grid $\hat{y}_n \in \mathbb{R}^{p \times p}$.
    \item[] \textit{Optimize:} Solve a shortest-path problem over the predicted cost grid. Specifically, find the path $\bm{z}$ that minimizes the total traversal cost: $M(\hat{\bm{y}}) = \min_{\bm{z} \in [0,1]^{p \times p}} \{\bm{z} \cdot \hat{\bm{y}}\}$ subject to boundary conditions $\bm{z}_{1,1}=\bm{z}_{p,p}=1$ and connectivity constraints ensuring that $\bm{z}$ represents a valid path from the top-left to the bottom-right corner.
\end{itemize}

\paragraph{Synthetic distribution shift.} The original distribution $\mathcal{P}$, which we treat as the target distribution, is defined over $\mathbb{R}^{d \times d \times 3} \times \mathbb{R}^{p \times p}$, where $d = 96$ and $p = 12$. Here, $\mathbb{R}^{d \times d \times 3}$ represents the feature space depicting maps, while $\mathbb{R}^{p \times p}$ represents the traveling costs on these maps. We induce a target shift for $\Pd_\gamma$ by uniformly sampling the costs for different pixel classes from the same range as $\mathcal{P}$ ($[0.8, 9.2]$ for the Warcraft II tileset dataset). Figure~\ref{fig:warcraft_shift} illustrates the costs coming from two different distributions over the same feature while highlighting the different decisions (shortest paths) that these costs yield.

\paragraph{Training details.} We use the \texttt{pyepo}\footnote{\url{github.com/khalil-research/PyEPO}} implementation with the SPO+ loss function on a truncated ResNet-18 consisting of the first five layers, followed by a final convolutional layer that reduces the number of output channels to one. Finally, we use an adaptive max-pooling layer to obtain a fixed $p \times p$ spatial resolution, allowing for a structured representation of the extracted features.

\begin{figure}[h!]
    \centering
    \includegraphics[width=3.25in]{images/Warcraft_shift.png}
    \caption{\textit{Synthetic distribution shift in Warcraft Shortest Path}. The white line illustrates the decision, corresponding to the shortest path, on dataset A (center) and dataset B (right) for a sample with the same features (left map).}
    \label{fig:warcraft_shift}
\end{figure}

\subsection{Inventory Stock Problem \cite{donti_task-based_2017}}
\paragraph{PtO task description.} In this problem, a company must determine the optimal order quantity \(z\) of a product to minimize costs given a stochastic demand \(y\), which is influenced by observed features \(x\). The cost structure includes both linear and quadratic costs for the amount of product ordered, as well as different linear and quadratic costs for over-orders \([z - y]_{+}\) and under-orders \([y - z]_{+}\). The objective function is:
\begin{align}
f_{\text{stock}}(y, z) ={}& c_{0} z + \frac{1}{2} q_{0} z^{2} + c_{b} [y - z]_{+} + \frac{1}{2} q_{b} ([y - z]_{+})^{2} \notag \\&+ c_{h} [z - y]_{+} + \frac{1}{2} q_{h} ([z - y]_{+})^{2} 
\end{align}


\noindent where \([v]_{+} \equiv \max \{v, 0\}\). In our paper, we use $c_0=30, q_0=10, c_b=10, q_b=2, c_h=30, q_h=25$.
For a given probability model \(p(y| x; \theta)\), the proxy stochastic programming problem can be formulated as: $\underset{z}{\operatorname{minimize}} \quad \mathbf{E}_{y \sim p(y|x; \theta)} \left[ f_{\text{stock}}(y, z) \right]$.

To simplify the setting, we assume that the demands are discrete, taking on values \(d_{1}, \ldots, d_{k}\) with probabilities (conditional on \(x\)) \(\left(p_{\theta}\right)_{i} \equiv p\left(y = d_{i}|x; \theta\right)\). Thus, our stochastic programming problem can be succinctly expressed as a joint quadratic program:
\begin{align*}
&\underset{z \in \mathbb{R}, z_{b}, z_{h} \in \mathbb{R}^{k}}{\operatorname{minimize}} \Big\{
c_{0} z + \frac{1}{2} q_{0} z^{2} + \sum_{i = 1}^{k} \left(p_{\theta}\right)_{i} \big( c_{b} (z_{b})_{i} \tag{10}\\
& \hspace{2.2cm}+ \frac{1}{2} q_{b} (z_{b})_{i}^{2} + c_{h} (z_{h})_{i} + \frac{1}{2} q_{h} (z_{h})_{i}^{2} \big)\Big\} \\
&\text{subject to} \quad d - z \mathbf{1} \leq z_{b}, \quad z \mathbf{1} - d \leq z_{h}, \quad z, z_{h}, z_{b} \geq 0
\end{align*}

\paragraph{Synthetic distribution shift.} We generate problem instances by randomly sampling \(x \in \mathbb{R}^n\) and then generating \(p(y| x; \theta)\) according to \(p(y|x; \theta) \propto \exp((\theta^T x)^2)\). We introduce distribution shifts for both \(x\) and \(y\). Specifically, \(x\) is sampled from a Gaussian distribution where the mean is sampled from \(U[-0.5, 0.5]\), and \(\theta\) is also sampled from a Gaussian distribution.

\paragraph{Training details.} We use the implementation from \citet{donti_task-based_2017}\footnote{\url{github.com/locuslab/e2e-model-learning}}, following their Inventory Stock Problem experiments. 

\section{OTD$^3$ Implementation Details}
Our implementation of the OTD$^3$ relies on the \texttt{POT}\footnote{\url{pythonot.github.io/}} package. The computation of dataset distance involves two main steps:
\begin{enumerate}
    \item \textbf{Computing Pairwise Pointwise Distances:}
    We first compute the pairwise distances between samples in the source and target datasets. This involves calculating distances separately for features, labels, and decisions, weighted according to the selected component weights $(\alpha_X,\alpha_Y,\alpha_W)$. Feature and label distances are computed using standard metric spaces (e.g., Euclidean or cosine distance), while decision distances are computed using decision quality disparity.
    \item \textbf{Solving the Optimal Transport Problem:}
    Given the computed pairwise distances, we compute the dataset distance using Earth Mover’s Distance (EMD) via \texttt{POT}'s \texttt{emd} solver. EMD finds the exact optimal transport plan, making it well-suited for capturing true correspondences between source and target datasets without introducing regularization bias. This approach was computationally feasible in our experiments due to the relatively small dataset sizes.
\end{enumerate}
Additionally, for experiments involving hyperparameter tuning, we evaluate multiple weight combinations on a predefined grid and select the setting that maximizes correlation with regret transferability.

\section{Additional Results}
\subsection{Selecting source datasets for transfer learning}
In Section~\ref{sec:pred_transfer} we analyzed the correlation between dataset distance and transferability in PtO. The plots presented in Figure~\ref{app-fig:regression} show this correlation for the Linear Model Top-K setting and the Inventory Stock problem under two weighting profiles: one where decision-related features are excluded (left) and one where they are included (right). In both settings, incorporating decisions into the distance metric leads to improved predictability of transfer performance. This effect is more pronounced in the Linear Model Top-K task than in the Inventory Stock problem.

For these settings, we do not perform fine-tuning on the target dataset. Instead, we assess transferability in a zero-shot setting, where a model trained on the source dataset is directly applied to the target domain without further adaptation. This choice is motivated by the relative simplicity of the feature spaces involved, which enables a meaningful evaluation of dataset distances without introducing potential confounding effects from additional training steps. Accordingly, rather than plotting dataset distance against the relative drop in regret after fine-tuning, we plot it against $\mathcal{T}(S \to T) = (\text{reg}(\D_{S})-\text{reg}(\D_{T}))/\text{reg}(\D_{T})$,
where $\text{reg}(\D_{S})$ denotes the decision regret when applying the source-trained model to the target dataset, and $\text{reg}(\D_{T})$ is the regret of a model trained directly on the target.

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.45\columnwidth]{images/dist_vs_te_best_topk.png}
    % \includegraphics[width=0.3\columnwidth]{images/unif_dist_vs_te_best.png}
    \includegraphics[width=0.45\columnwidth]{images/dist_vs_te_best_inventory.png}
    \caption{\textit{Distance vs Adaptation}. OT distance for the best feature-label and feature-label-decision weighting against regret transferability.}
    \label{app-fig:regression}
\end{figure}

\noindent
\begin{minipage}[t]{0.55\textwidth}
    \vspace{0pt} 
    \centering
    \begin{minipage}[b]{0.48\textwidth}
        \centering
        \includegraphics[height=1.6in]{images/correlation_warcraft.png}
        \makebox[\linewidth]{\small (a) Warcraft Shortest Path}
        \label{fig:correlation_warcraft}
    \end{minipage}%
    \hfill
    \begin{minipage}[b]{0.48\textwidth}
        \centering
        \includegraphics[height=1.6in]{images/correlation_inventory.png}
        \makebox[\linewidth]{\small (b) Inventory Stock}
        \label{fig:correlation_inventory}
    \end{minipage}
    \captionof{figure}{Difference in labels against difference in decisions.}
\label{fig:combined_correlation}
\end{minipage}%
\hfill
\begin{minipage}[t]{0.4\textwidth}
    \vspace{0pt}
    To illustrate the relationship between label space differences $d_{\Y}(y,y')$ and decision space differences $l_q(z,z';y,y')$ in different PtO tasks, we provide the following visualizations. Figure~\ref{fig:combined_correlation}(b) shows this correlation for the Inventory Stock problem, while Figure~\ref{fig:combined_correlation}(a) presents the same analysis for the Warcraft domain.
\end{minipage}
