

\begin{figure*}[t]
    \centering
\includegraphics[width=0.95\textwidth]{figures/overview.pdf} % Adjust width as needed
    \caption{Overview of \ours. We begin by initializing the strategy set for each player. At the \(i\)-th iteration, we use the SMC sampler to obtain a set of empirical distributions \(\hat{\mathcal{T}}_{i-1}\). Next, a mixed Nash solver computes the equilibrium \(\pi^*_{i-1}\) and \(\hat{\sigma}^*_{i-1}\). We then compute each player’s best response against the opponent’s mixed strategy and update the players’ strategy sets. This procedure is repeated until convergence.}
    \label{fig:overview}
\end{figure*}





% \begin{figure*}[h]
%     \centering
% \includegraphics[width=0.95\textwidth]{figures/overview.pdf} % Adjust width as needed
%     \caption{Overview of \ours. We begin by initializing the strategy set for each player. At the \(i\)-th iteration, we use SMC sampler to obtain a set of empirical distributions \(\hat{\mathcal{T}}\). Next, a mixed Nash solver computes the equilibrium \(\pi^*\) and \(\hat{\sigma}^*\). We then compute each player’s best response against the opponent’s mixed strategy and update the players’ strategy sets. This procedure is repeated until convergence.}
%     \label{fig:overview}
% \end{figure*}

\section{Robust Optimization with Diffusion Model}

In this section, we propose \ours to solve the robust optimization problem in Eq.~\eqref{eq:DRO}. In Section~\ref{sec:mixed_over_mixed}, we introduce a mixed strategy over mixed strategies to ensure the applicability of the double oracle approach. Section~\ref{sec:double_oracle} details the overall workflow of the algorithm. In Section~\ref{sec:twisted_sampler}, we present the twisted SMC sampler used to estimate the expected utility. Finally, in Section~\ref{sec:convergence_analysis}, we provide a convergence analysis of \ours.

\subsection{Mixed strategy over mixed strategies}\label{sec:mixed_over_mixed}
Eq.~\eqref{eq:DRO} requires solving for mixed strategies in a continuous game with infinitely many strategies. A common approach for such problems is the double oracle method~\citep{adam2021double}, which iteratively expands both players’ strategy sets and computes the equilibrium of the resulting subgame. This procedure is guaranteed to converge to an equilibrium in any two-player zero-sum continuous game. However, during the double oracle process, the mixed strategy it produces is necessarily a discrete distribution, whereas $p_{\theta}(\mathbf{z}|\mathbf{c})$ is a continuous distribution. As a result, the KL divergence between these two distributions is ill-defined, making it difficult to include the KL-divergence constraint in the subgame-equilibrium computation.

To address this limitation, we note that given a fixed \(\pi(\mathbf{z})\), the inner constrained minimization problem admits a closed-form solution that can be sampled using the diffusion model. This procedure can be interpreted as computing a best-response pure strategy in the double oracle framework. Consequently, we propose viewing the original mixed strategy \(\tau(\mathbf{z})\) as a “pure” strategy and introducing a \emph{mixed strategy over mixed strategies}. This reformulation enables the application of the double oracle method while preserving the desired constraints.




% The key insight is to shift the constraints from the mixed strategy space to the pure strategy space, effectively treating the original mixed strategy as a pure strategy and introducing a \emph{mixed strategy over mixed strategies}. This reformulation allows us to apply the double oracle method while preserving the desired constraints. 




% However, applying the double oracle algorithm directly is infeasible due to the constrained mixed strategy space of the attacker. 

% To address this limitation, we introduce a novel construct: a \textit{mixed strategy over mixed strategies}.



% \begin{definition}[Mixed Strategy over mixed strategy]
%     % Let \( \mathcal{S} \) denote the space of pure strategies for a player, and \( \Delta(\mathcal{S}) \) denote the space of mixed strategies over \( \mathcal{S} \), i.e., the set of all probability distributions over \( \mathcal{S} \). 
%     Let $\Delta$ denote the space of the mixed strategy and $\tilde{p}$ is a mixed strategy in $\Delta$.
%     A \textbf{mixed strategy over mixed strategies}, \( \sigma \), is a probability distribution over \( \Delta \), denoted \( \sigma(\tilde{p}) \), where \( \tilde{p} \in \Delta \). Formally:

%  \( \sigma \in \Delta(\Delta) \), i.e., \( \sigma \) satisfies:
%    (1) \( \sigma(\tilde{\pi}) \geq 0 \) for all \( \tilde{\pi} \in \Delta(\mathcal{S}) \),
%    (2) \( \int_{\Delta(\mathcal{S})} q(\tilde{\pi}) d\tilde{\pi} = 1 \).
% \end{definition}

\begin{definition}[Mixed Strategy over Mixed Strategies]\label{def:mixed_over_mixed}
Let \(\mathcal{T} \) denote the space of mixed strategies, where each \( \tau \in \mathcal{T} \) represents a probability distribution over pure strategies. A \textit{mixed strategy over mixed strategies}, \( \sigma \), is a probability distribution over \( \mathcal{T}\), formally expressed as \( \sigma \in \Delta(\mathcal{T}) \). This implies that \( \sigma \) satisfies the following conditions: (1) \( \sigma(\tau) \geq 0 \) for all \( \tau \in \mathcal{T} \), and (2) \( \int_{\mathcal{T}} \sigma(\tau) \, d\tau = 1 \).
\end{definition}

% \lk{Maybe we should change to mixed strategy over distribution?}
% \ak{Personally, mixed strategy over distributions sounds a little bit more natural to me. Then perhaps add a remark statement that this can be in fact viewed as a mixed strategy over mixed strategies..}
We provide concrete examples in Appendix \ref{appdx:example} to help readers understand 
Definition \ref{def:mixed_over_mixed}. By introducing this concept of a mixed strategy over mixed strategies, \( \sigma \), we can reformulate our objective as follows:  
\begin{align}
\max_{\pi(\mathbf{x})\in\Delta(\mathcal{X})} \min_{\sigma(\tau)\in \Delta(\mathcal{T})}
\mathbb{E}_{\pi(\mathbf{x})} \mathbb{E}_{\sigma(\tau)} \left( \mathbb{E}_{\tau(\mathbf{z})}
\left[ u(\mathbf{x}, \mathbf{z}) \right] \right)\notag \\
\mathcal{T} \;=\; \{\, \tau(\mathbf{z}) \;|\; D_{\mathrm{KL}}(\tau(\mathbf{z}) \,\|\, p_{\theta}(\mathbf{z}\mid \mathbf{c})) \leq \rho \}, 
\label{eq:new-formulation}
\end{align}
In this reformulation, the adversary’s pure strategy is no longer a single value but instead a full distribution \(\tau(\mathbf{z})\). Consequently, the adversary’s pure strategy space becomes $\mathcal{T}$
and the corresponding mixed strategy space is the set of distributions over these distributions, \(\Delta(\mathcal{T})\). Under this framework, the defender’s utility function takes the form \(\mathbb{E}_{\tau(\mathbf{z})}\bigl[u(\mathbf{x}, \mathbf{z})\bigr]\), while the attacker’s utility becomes \(-\mathbb{E}_{\tau(\mathbf{z})}\bigl[u(\mathbf{x}, \mathbf{z})\bigr]\).

Crucially, this reformulation shifts the KL-divergence constraint from the adversary’s mixed strategy space to its pure strategy space. As we will show in Section~\ref{sec:double_oracle}, the best response for such a constrained pure strategy can be written in closed form. Hence, Eq.~\eqref{eq:new-formulation} can be solved efficiently using the double oracle algorithm.

\begin{proposition}\label{thm:mixed_over_mixed}
The reformulated objective in Eq.~\eqref{eq:new-formulation} yields the same defender mixed strategy \(\pi(\mathbf{x})\) as the original formulation in Eq.~\eqref{eq:DRO}.

Proof. See Appendix~\ref{appdx:mixed_over_mixed}.
\label{theorem:mixed}
\end{proposition}

By Proposition~\ref{theorem:mixed}, solving Eq.~\eqref{eq:new-formulation} is equivalent to solving Eq.~\eqref{eq:DRO}. Therefore, applying the double oracle algorithm to Eq.~\eqref{eq:new-formulation} recovers the optimal defender mixed strategy for the original problem (Eq.~\eqref{eq:DRO}).

Since we have reformulated the problem, we will henceforth refer to the adversary's pure strategy as \(\tau(\mathbf{z})\) and the mixed strategy as \(\sigma(\tau)\).



% \begin{algorithm}[t]
%     \caption{Double Oracle with Diffusion Models}
%     \begin{algorithmic}[1]    \STATE \textbf{Input: } Pretrained diffusion model \( p_{\theta}(\mathbf{z} | \mathbf{c}) \), utility function \( U(\mathbf{x}, \tau) \) and \(\mathsf{prob}>0\).
%     \STATE Initialize iteration counter \( i = 0 \).
%     \STATE Initialize $\mathcal{X}_0$ and $\mathcal{T}_0$ by selecting a defender strategy $\mathbf{x}_0\in \mathcal{X}$ at random and an attacker strategy $\tau_0=p_{\theta}(\mathbf{z}|\mathbf{c})$ at random.    \REPEAT
%         \STATE Increment \( i \) by 1.
%         \STATE  Use Alg.~\ref{alg:twisted-smc} to sample from all \( \tau(\mathbf{z}) \in\mathcal{T}_{i-1}=\{\tau_0, \cdots, \tau_{i-1}\}\) and obtain the resulting empirical distribution $\hat{\tau}(\mathbf{z})$. Let $\hat{\mathcal{T}}_{i-1}=\{ \hat{\tau}_0,\cdots, \hat{\tau}_{i-1}\}$ denote the set of empirical distributions for $\mathcal{T}_{i-1}$.
%         \STATE Use \textit{mixed Nash Equilibrium solver} to find an equilibrium \( (\pi_{i-1}^*, \hat{\sigma}_{i-1}^*) \) for the subgame \( (\mathcal{X}_{i-1}, \hat{\mathcal{T}}_{i-1},U) \).
%         \STATE Use \textit{defender oracle} to compute the best response:
%         \[
%         \mathbf{x}_{i} = \arg\max_{\mathbf{x}\in\mathcal{X}} U(\mathbf{x},\hat{\sigma}_{i-1}^*).
%         \]
%         \STATE Use \textit{adversary oracle} to compute the best response:
%         \[\tau_{i}(\mathbf{z}) \propto p_{\theta}(\mathbf{z} | \mathbf{c}) \exp\left(-\gamma U(\pi_{i-1}^*,\mathbf{z})\right).
%         \]
%         \STATE Update the strategy sets: $\mathcal{X}_{i} = \mathcal{X}_{i-1} \cup \{\mathbf{x}_{i}\}, \quad \mathcal{T}_{i} = \mathcal{T}_{i-1} \cup \{\tau_{i}\}.$
%         \STATE Use SMC to sample from $\tau_i$ and obtain $\hat{\tau}_i$. Then compute:
%         \[
%         \underline{v}_i = U(\pi^*_{i-1}, \hat{\tau}_{i}), \quad \bar{v}_i = U(\mathbf{x}_{i}, \hat{\sigma}^*_{i-1}).
%         \]
%     \UNTIL \( \bar{v}_i - \underline{v}_i \in (-2\epsilon, 2\epsilon) \) and \(i>\nicefrac{1}{16\mathsf{prob}}\).
%     \STATE \textbf{Output: } Final mixed strategies \( \pi^*_{i-1} \) for the defender.
%     \end{algorithmic}
%     \label{alg:double-oracle}
% \end{algorithm}





\subsection{Double Oracle Flow}\label{sec:double_oracle}

% The double oracle framework comprises three key components: the \textit{attacker oracle}, the \textit{defender oracle}, and the \textit{Mixed Nash Equilibrium solver}. At each iteration, the oracles compute the best response to the opponent’s mixed strategy, thereby expanding the strategy set. The Mixed Nash Equilibrium solver then determines the equilibrium of the resulting subgame.

The overall double oracle algorithm is outlined in Algorithm~\ref{alg:double-oracle} and illustrated in Figure~\ref{fig:overview}. We begin by initializing the adversary’s strategy as \(\tau_0 = p_{\theta}(\mathbf{z}|\mathbf{c}) \) and selecting a random initial defender strategy \(\mathbf{x}_0\) from \(\mathcal{X}\) (lines 2--3), forming the initial strategy sets \(\mathcal{T}_0\) and \(\mathcal{X}_0\). These serve as the foundation for the iterative process.
In each iteration, we first sample from each distribution in \(\mathcal{T}_{i-1} = \{\tau_0, \dots, \tau_{i-1}\}\) to obtain a set of empirical distributions \(\hat{\mathcal{T}}_{i-1} = \{\hat{\tau}_0, \dots, \hat{\tau}_{i-1}\}\) (line 6). These empirical distributions are used to estimate expected utilities, which are then input into the \textit{Mixed Nash Equilibrium solver} to compute an equilibrium \((\pi_{i-1}^*, \hat{\sigma}^*_{i-1})\) of the subgame \(\{\mathcal{X}_{i-1}, \hat{\mathcal{T}}_{i-1}, U \}\) (line 7). Next, the \textit{defender oracle} and \textit{adversary oracle} compute their respective best responses to the opponent's mixed strategy, yielding new strategies \(\mathbf{x}_i\) and \(\tau_i(\mathbf{z})\) (lines 8--9). These best response strategies are then added to the strategy sets, expanding them to \(\mathcal{X}_i\) and \(\mathcal{T}_i\) (line 10).

 % ensuring that neither player can improve their utility beyond a specified tolerance \(\epsilon\)
This iterative procedure alternates between the oracles and the solver until convergence (lines 11--13). The parameters \(\mathsf{prob}\) and tolerance $\epsilon$ are user-defined and guarantee that the algorithm converges to a \(4\epsilon\)-equilibrium with probability \(1 - \mathsf{prob}\), as detailed in Theorem~\ref{thm:convergence_inf_round}. In practice, to manage runtime, we cap the number of double oracle iterations to a fixed limit---a common strategy also employed in \citet{lanctot2017unified, pmlr-v161-xu21a}.

We now describe the three key components in detail: the adversary oracle, the defender oracle, and the mixed Nash equilibrium solver.


\textbf{Adversary Oracle}  
At the \(i\)-th iteration, given the defender's mixed strategy $\pi_{i-1}^*$, %\(\pi^*_{i-1} = \sum_{j=1}^{i-1} w^{\rm a}_j \delta_{\mathbf{x}_j}\), 
the adversary oracle computes the best response by solving:
% \begin{align}
% \tau_{i}(\mathbf{z}) 
% &= \arg\max_{\tau\in \mathcal{T}} U(\pi^{*}_{i-1}, \tau) \notag \\
% &= \arg\max_{\tau \in \mathcal{T}} 
% \mathbb{E}_{\tau(\mathbf{z})} \biggl[\sum_{j=1}^{i-1} w^{\rm a}_j\,u(\mathbf{x}_j, \mathbf{z})\biggr]
% \label{eq:nature}
% \end{align}
\begin{align}
&\tau_{i}(\mathbf{z}) 
= \arg\min_{\tau\in \mathcal{T}} U(\pi^{*}_{i-1}, \tau) \notag \\
&\mathcal{T} = \left\{ \tau(\mathbf{z}) \mid D_{\mathrm{KL}}(\tau(\mathbf{z}) \parallel p_{\theta}(\mathbf{z} \mid \mathbf{c})) \leq \rho \right\},
\label{eq:nature}
\end{align}

% where \(U(\pi_i, \tau)\) represents the expected utility under the mixed strategies \(\pi_i\) and \(\tau\).  
% \begin{align}
% \textstyle\tau_i(\mathbf{z}) \propto p_{\theta}(\mathbf{z} | \mathbf{c}) \exp\left(-\gamma \sum_{j=1}^{i-1} w^{\rm a}_j u(\mathbf{x}_j, \mathbf{z})\right),
% \label{eq:bo-poacher}
% \end{align}
\begin{proposition}
\label{propo:kl}
The optimal solution $\tau_i(\mathbf{z})$ of Eq.~\eqref{eq:nature} has a closed form:
\begin{align}
\textstyle\tau_i(\mathbf{z}) \propto p_{\theta}(\mathbf{z} | \mathbf{c}) \exp\left(-\gamma U(\pi^*_{i-1}, \mathbf{z})\right),
\label{eq:bo-poacher}
\end{align}
where \(\gamma\) is the Lagrange multiplier associated with the KL-divergence constraint.

Proof. See Appendix~\ref{appdx:propo_kl}.
\end{proposition}

% The proof of Proposition \ref{propo:kl} is deferred to Appendix \ref{appdx:propo_kl}.
As shown in Eq.~\eqref{eq:bo-poacher}, \(\tau_i(\mathbf{z})\) is an unnormalized distribution obtained by reweighting the original diffusion model distribution according to the utility function. Computing expected utilities under \(\tau_i(\mathbf{z})\) requires sampling from this high-dimensional unnormalized distribution, which is challenging in practice. To address this, we employ twisted Sequential Monte Carlo (SMC) techniques~\citep{chopin2020introduction,wu2023practical}, detailed in Section~\ref{sec:twisted_sampler}, which provide asymptotically exact utility estimates. We denote the resulting empirical distribution as \(\hat{\tau}_i(\mathbf{z})\).

%\(\hat{\sigma}^*_{i-1} = \sum_{j=1}^{i-1} w^{\rm d}_j \hat{\tau}_j(\mathbf{z})\),
\textbf{Defender Oracle} 
At the \(i\)-th iteration, given the adversary's mixed strategy \(\hat{\sigma}^*_{i-1}\), the defender oracle computes the best response by solving:
% \begin{align}\label{eq:def_oracle}
%     \mathbf{x}_{i} = \argmax_{\mathbf{x}\in\mathcal{X}} U(\mathbf{x}, \hat{\sigma}^*_{i-1}) = \argmax_{\mathbf{x}\in\mathcal{X}} \sum_{j=1}^{i-1} w^{\rm d}_j \mathbb{E}_{\hat{\tau}_j(\mathbf{z})}[u(\mathbf{x}, \mathbf{z})].\nonumber
% \end{align}
\begin{align}\label{eq:defender_oracle}
\mathbf{x}_{i} = \arg\max_{\mathbf{x}\in\mathcal{X}} U(\mathbf{x}, \hat{\sigma}^*_{i-1}).
\end{align}
Since \(\hat{\sigma}^*_{i-1}\) represents a mixed strategy over a set of empirical distributions $\hat{\mathcal{T}}_{i-1}$, we can directly compute the expected utility, reducing the problem to a standard deterministic optimization. To  handle the budget constraint in our setting, we employ mirror ascent~\citep{nemirovski2012tutorial}. 
 

\textbf{Mixed Nash Equilibrium Solver}  
At the \(i\)-th iteration, the Mixed Nash Equilibrium solver computes a mixed Nash equilibrium $(\pi^*_{i-1}, \hat{\sigma}^*_{i-1})$ over the players' current strategy sets $\mathcal{X}_{i-1}$ and $\hat{\mathcal{T}}_{i-1}$.
The equilibrium can be found using linear programming \citep{nisan2007algorithmic}, and in our work, we utilize the PuLP implementation~\citep{pulp} for this purpose. 
\begin{algorithm}[!t]
\small
\caption{Double Oracle with Diffusion Models}
\label{alg:double-oracle}
\begin{algorithmic}[1]
\Require Pretrained diffusion model $p_{\theta}(\mathbf{z} \mid \mathbf{c})$, utility function $U(\mathbf{x}, \tau)$, probability threshold $\mathsf{prob} > 0$, tolerance $\epsilon > 0$
\State Initialize $i \gets 0$
\State $\mathbf{x}_0 \gets \text{random strategy}$, \quad $\tau_0 \gets p_{\theta}(\mathbf{z}\mid \mathbf{c})$
\State $\mathcal{X}_0 \gets \{\mathbf{x}_0\}$, \quad $\mathcal{T}_0 \gets \{\tau_0\}$

\Repeat
    \State $i \gets i+1$
    \State $\hat{\mathcal{T}}_{i-1} \gets \text{Empirical distributions from }\mathcal{T}_{i-1}$ using Alg.~\ref{alg:twisted-smc}
    \State $(\pi_{i-1}^*, \hat{\sigma}_{i-1}^*) \gets \textsc{MixedNashSolver}(\mathcal{X}_{i-1},\hat{\mathcal{T}}_{i-1}, U)$
    
    \State $\mathbf{x}_i \gets \underset{\mathbf{x}\in \mathcal{X}}{\arg\max}\;U(\mathbf{x}, \hat{\sigma}_{i-1}^*)$ // Defender Oracle
    
    \State $\tau_i(\mathbf{z}) \propto p_{\theta}(\mathbf{z}|\mathbf{c}) 
            \exp\bigl(-\gamma\,U(\pi_{i-1}^*, \mathbf{z})\bigr)$ // Adversary Oracle
    
    \State$\mathcal{X}_i \gets \mathcal{X}_{i-1} \cup \{\mathbf{x}_i\}, 
            \quad \mathcal{T}_i \gets \mathcal{T}_{i-1} \cup \{\tau_i\}$
    
  \State $\hat{\tau}_i \gets \text{sample from }\tau_i$ using Alg.~\ref{alg:twisted-smc}
   \State $\underline{v}_i \gets U(\pi_{i-1}^*, \hat{\tau}_i), \quad
            \bar{v}_i \gets U(\mathbf{x}_i, \hat{\sigma}_{i-1}^*)$

\Until{$(\bar{v}_i - \underline{v}_i \in (-2\epsilon, 2\epsilon)) 
       \,\wedge\, (i > 1/(16\,\mathsf{prob}))$}

\State \textbf{Output:} Final defender strategy $\pi_{i-1}^*$
\end{algorithmic}
\end{algorithm}



% The overall double oracle algorithm is outlined in Algorithm~\ref{alg:double-oracle}. The double oracle algorithm begins by initializing strategies for both players. The nature player starts with an initial mixed strategy, denoted as \(\tilde{p}_0\), while the ranger selects a heuristic initial strategy, \(\mathbf{x}_0\). These initial strategies serve as the foundation for the iterative process that follows.

% In each iteration, the ranger first takes their turn to refine their strategy using the agent oracle. Based on the current mixed strategy of the nature player, \(\tilde{p}_i\), the ranger computes their best response, \(\mathbf{x}_{i+1}\), which maximizes their expected utility against the nature player’s strategy. This step ensures that the ranger adapts and seeks to exploit the nature player’s current choices.

% Next, the nature player responds using the nature oracle. Taking into account the ranger’s updated mixed strategy, \(q_i\), the nature player calculates a new optimal mixed strategy, \(\tilde{p}_{i+1}\). This step ensures that the nature player adapts to counteract the ranger’s strategy and maximizes its utility within its own constraints.

% Once the best response strategies for both players are computed, these strategies are added to their respective strategy sets, expanding the pool of available strategies for each player. At this point, the Mixed Nash Solver is employed to determine a mixed Nash equilibrium over the expanded strategy sets. This solver ensures that both players’ strategies are balanced, with neither player having an incentive to unilaterally deviate.

% The process then repeats, alternating between the agent oracle, nature oracle, and Mixed Nash Solver, progressively refining the strategies for both players. This iterative approach continues until convergence, at which point neither player can improve their utility by adjusting their strategies further.

% By iteratively expanding the strategy sets and ensuring equilibrium at each step, the double oracle algorithm efficiently converges to a solution that satisfies the Nash equilibrium for the continuous game.


% To estimate the expected utility \(U(q, \tilde{p})\), we use a sample average approximation (SAA). This involves drawing samples from the mixed strategies \(q\) and \(\tilde{p}\) and computing the empirical mean of the utility function:

% \[
% \hat{U}(q, \tilde{p}) = \frac{1}{N} \sum_{j=1}^N u(\mathbf{x}_j, \mathbf{z}_j),
% \]

% where \(N\) is the number of samples, \(\mathbf{x}_j \sim q\), and \(\mathbf{z}_j \sim \tilde{p}\).


% \begin{algorithm}[t]
% \footnotesize
% \caption{Twisted SMC for Diffusion Model}
% \label{alg:twisted-smc}
% \begin{algorithmic}[1]
% \Require Pretrained diffusion model, number of particles $N$, time horizon $T$, $\Phi(\mathbf{z})$ (Eq.~\ref{eq:twisting})
% \State Initialize $\mathbf{z}_n^T \sim p_{\theta}(\mathbf{z}^T)$,\; $w_n \gets \Phi(\mathbf{z}_n^T)$
% \For{$t = T, \dots, 1$}
%   \State \textbf{Resample:} \\
%     $\quad\quad\{\mathbf{z}_n^t\}_{n=1}^N \sim \mathrm{Multinomial}\bigl(\{\mathbf{z}_n^t\}_{n=1}^N;\,\{w_n^t\}_{n=1}^N\bigr)$
%   \For{$k = 1 \dots K$}
%     \State $\displaystyle 
%            \hat{s}_k \gets s_\theta(\mathbf{z}_k^t,\mathbf{c},t) \;-\; 
%              \gamma \,\nabla_{\mathbf{z}_k^t}\Bigl[\sum_j w_j^\mathrm{d}\,u\bigl(\mathbf{x}_j,\hat{\mathbf{z}}_{\theta}^0(\mathbf{z}_k^t)\bigr)\Bigr]$
%     \State $\mathbf{z}_k^{t-1} \sim 
%             \mathcal{N}\bigl(\mathbf{z}_k^t + \sigma^2 \hat{s}_k,\;\sigma^2\bigr)$
%     \State $\displaystyle
%            w_k^{t-1} \gets 
%            \frac{p_{\theta}\bigl(\mathbf{z}_k^{t-1} \mid \mathbf{z}_k^t,\mathbf{c}\bigr)\,\Phi(\mathbf{z}_k^{t-1})}{
%                  \hat{p}_{\theta}\bigl(\mathbf{z}_k^{t-1} \mid \mathbf{z}_k^t,\mathbf{c}\bigr)\,\Phi(\mathbf{z}_k^{t})}$
%   \EndFor
% \EndFor
% \State \textbf{Output:} Weighted particles $\{\mathbf{z}_k^0,\, w_k^0\}_{k=1}^K$
% \end{algorithmic}
% \end{algorithm}



\subsection{Sampling with Twisted Sequential Monte Carlo}\label{sec:twisted_sampler}

% To efficiently sample from the unnormalized distribution in Eq.~(4) while ensuring correctness, we propose Twisted Sequential Monte Carlo (Twisted SMC), an approach that refines Diffusion Posterior Sampling (DPS) by incorporating importance weighting to correct for bias.  

% A straightforward approach to sampling from the target distribution is importance sampling, where trajectories are drawn from \( p_{\theta}(\mathbf{z}|\mathbf{c}) \) and reweighted accordingly. However, this method suffers from high variance, particularly in high-dimensional settings, as the number of required samples scales exponentially with the KL divergence between the target and proposal distributions. 


% To sample from the reweighted distribution in Eq.~(4), a straightforward approach is \emph{Diffusion Posterior Sampling} (DPS)~\citep{chung2023diffusion}. DPS modifies the reverse generative process of the original diffusion model as follows:
% \[
% \hat{p}_\theta(\mathbf{z}^{t-1} \mid \mathbf{z}^{t}, \mathbf{c}) 
% = \mathcal{N}\!\bigl(\mathbf{z}^{t-1};\, \mathbf{z}^{t} + \sigma^2 \hat{s}_\theta(\mathbf{z}^{t}, \mathbf{c}, t),\, \sigma^2\bigr),
% \]
% where the adjusted score function is
% % \[
% % \textstyle \hat{s}_\theta(\mathbf{z}^{t}, \mathbf{c}, t) 
% % = s_\theta(\mathbf{z}^{t}, \mathbf{c}, t) 
% % - \gamma \nabla_{\mathbf{z}^{t}} \sum_{j=1}^i w^{\mathrm{d}}_j\, u(\mathbf{x}_j, \hat{\mathbf{z}}^0_\theta(\mathbf{z}^t)).
% % \]
% \[
% \textstyle \hat{s}_\theta(\mathbf{z}^{t}, \mathbf{c}, t) 
% = s_\theta(\mathbf{z}^{t}, \mathbf{c}, t) 
% - \gamma \nabla_{\mathbf{z}^{t}}  U(\pi^*_{i-1}, \hat{\mathbf{z}}^0_\theta(\mathbf{z}^t)).
% \]
% Here, \(\hat{\mathbf{z}}^0_\theta(\mathbf{z}^t)\) is an estimate of the original state \(\mathbf{z}^0\) obtained via Tweedie’s formula~\citep{robbins1992empirical,efron2011tweedie}:
% \begin{align}
% \textstyle
% \hat{\mathbf{z}}^0_\theta(\mathbf{z}^t) \;=\; \mathbf{z}^t \;+\; t \beta^2\, s_\theta(\mathbf{z}^t, \mathbf{c}, t).
% \end{align}
% At \(t = 0\), we define \(\hat{\mathbf{z}}^0_\theta(\mathbf{z}^0) \coloneqq \mathbf{z}^0\). The correction term in \(\hat{s}_\theta\) first reconstructs \(\mathbf{z}^0\) and then incorporates the reweighted term from Eq.~(4), ensuring that the sampling process accounts for the reweighting. 


% While DPS provides a heuristic approach, it does not guarantee exact sampling from Eq.(4)\citep{lu2023contrastive}. To address this limitation, we adopt Twisted Sequential Monte Carlo (Twisted SMC)~\citep{wu2023practical}, which leverages DPS as a proposal distribution at each step while incorporating an additional importance weighting step to ensure unbiased sampling. The complete Twisted SMC algorithm is presented in Alg.~\ref{alg:twisted-smc}.



% Specifically, Twisted SMC employs a collection of \( N \) weighted particles \( \{(w_n^t, \mathbf{z}_n^t)\}_{n=1}^N \) over \( T \) iterative steps. At each timestep, particles are propagated via the DPS transition model, while their weights are updated to correct for discrepancies between the proposal and target distributions:
% \begin{align}
% w^t_n = \frac{p_\theta(\mathbf{x}_n^t|\mathbf{x}_n^{t+1},\mathbf{c}) \Phi_t(\mathbf{x}_n^t)}{ \hat{p}_\theta(\mathbf{x}_n^t|\mathbf{x}_n^{t+1},\mathbf{c})\Phi_{t+1}(\mathbf{x}_n^{t+1})},\nonumber
% \end{align}
% where 
% % \begin{align}
% % \textstyle\Phi_t(\mathbf{z}_n^t) = \exp\left(-\gamma \sum_{j=1}^i w_j^{\rm d} u(\mathbf{x}_j, \hat{\mathbf{z}}_{\theta}^0(\mathbf{z}_n^t))\right).
% % \label{eq:twisting}
% % \end{align}
% \begin{align}
% \textstyle\Phi_t(\mathbf{z}_n^t) = \exp\left(-\gamma  U(\pi_{i-1}^*, \hat{\mathbf{z}}_{\theta}^0(\mathbf{z}_n^t)\right).
% \label{eq:twisting}
% \end{align}
% To mitigate variance and prevent particle degeneracy over long horizons, multinomial resampling can be performed at each step based on the normalized weights~\citep{douc2005comparison}.

% Finally, the empirical approximation of the target distribution is given by  
% $\hat{\tau} = \sum_{n=1}^N\frac{w_n^0}{\sum_{n'=1}^K w_{n'}^0} \delta_{\mathbf{z}_k^0}.$
 


To efficiently sample from the unnormalized distribution in Eq.~\ref{eq:bo-poacher} while ensuring correctness, we leverage Twisted Sequential Monte Carlo (Twisted SMC)~\citep{chopin2020introduction}, an adaptive importance sampling technique that improves sampling through sequential proposal and weighting. \citet{wu2023practical} applied it to sampling from a conditional distribution with diffusion model; here, we adapt it to sample from the unnormalized reweighted distribution in Eq.~\ref{eq:bo-poacher}.


Twisted SMC operates with a collection of \(N\) weighted particles \(\{(w_n^t, \mathbf{z}_n^t)\}_{n=1}^N\) that evolve iteratively over \(T\) steps. At each step \(t\), particles are propagated using an adjusted score function, similar to \citet{chung2023diffusion}:
\[
\hat{p}_\theta(\mathbf{z}^{t-1} \mid \mathbf{z}^{t}, \mathbf{c}) 
= \mathcal{N}\!\bigl(\mathbf{z}^{t-1};\, \mathbf{z}^{t} + \sigma^2 \hat{s}_\theta(\mathbf{z}^{t}, \mathbf{c}, t),\, \hat{\beta}^2\bigr),
\]
where the adjusted score function is:
\[
\textstyle \hat{s}_\theta(\mathbf{z}^{t}, \mathbf{c}, t) 
= s_\theta(\mathbf{z}^{t}, \mathbf{c}, t) 
+ \nabla_{\mathbf{z}^{t}} \log\Phi_t(\mathbf{z}^t).
\]
The twisting function \(\Phi_t\) is defined as:
\begin{align}
\textstyle\Phi_t(\mathbf{z}_n^t) = \exp\left(-\gamma  U(\pi_{i-1}^*, \hat{\mathbf{z}}_{\theta}^0(\mathbf{z}_n^t))\right).
\label{eq:twisting}
\end{align}
Here, \(\hat{\mathbf{z}}^0_\theta(\mathbf{z}^t)\) estimates the original state \(\mathbf{z}^0\) using Tweedie’s formula~\citep{robbins1992empirical,efron2011tweedie}:
\[
\textstyle
\hat{\mathbf{z}}^0_\theta(\mathbf{z}^t) = \mathbf{z}^t + t \beta^2\, s_\theta(\mathbf{z}^t, \mathbf{c}, t).
\]
At \(t = 0\), we set \(\hat{\mathbf{z}}^0_\theta(\mathbf{z}^0) \coloneqq \mathbf{z}^0\). The correction term in \(\hat{s}_\theta\) reconstructs \(\mathbf{z}^0\) and incorporates the reweighted term from Eq.~\ref{eq:bo-poacher}, ensuring proper adaptation of the sampling process.

To account for discrepancies between the proposal and target distributions, Twisted SMC assigns a weight to each particle:
\[
\textstyle w^t_n = \frac{p_\theta(\mathbf{z}_n^t|\mathbf{z}_n^{t+1},\mathbf{c}) \Phi_t(\mathbf{z}_n^t)}{ \hat{p}_\theta(\mathbf{z}_n^t|\mathbf{z}_n^{t+1},\mathbf{c})\Phi_{t+1}(\mathbf{z}_n^{t+1})}.
\]

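This reweighting step corrects for the mismatch between the proposal and the target, so that the resulting estimates are asymptotically exact (see Proposition~\ref{thm:twisted_diffusion_sampler}).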

To mitigate variance and prevent particle degeneracy over long horizons, we apply multinomial resampling at each step based on normalized weights~\citep{douc2005comparison}. The final approximation of the target distribution is:
$\hat{\tau} = \sum_{n=1}^N\frac{w_n^0}{\sum_{n'=1}^N w_{n'}^0} \delta_{\mathbf{z}_n^0}.$

A full description of Twisted SMC is provided in Algorithm~\ref{alg:twisted-smc}.

% By leveraging DPS as a proposal mechanism while incorporating correction terms via importance weighting, Twisted SMC ensures a more robust and unbiased sampling procedure, addressing the limitations of previous diffusion-based heuristics~\citep{lu2023contrastive}.



\begin{algorithm}[t]
\small
\caption{Twisted SMC for Diffusion Model}
\label{alg:twisted-smc}
\begin{algorithmic}[1]
\Require Pretrained diffusion model, number of particles $N$, time horizon $T$, $\Phi(\mathbf{z})$ (Eq.~\ref{eq:twisting})
\State Initialize $\mathbf{z}_n^T \sim p_{\theta}(\mathbf{z}^T)$,\; $w_n^T \gets \Phi(\mathbf{z}_n^T)$ for $n = 1, \dots, N$
\For{$t = T, \dots, 1$}
  \State \textbf{Resample:} \\
    $\quad\quad\{\mathbf{z}_n^t\}_{n=1}^N \sim \mathrm{Multinomial}\bigl(\{\mathbf{z}_n^t\}_{n=1}^N;\,\{w_n^t\}_{n=1}^N\bigr)$
  \For{$n = 1, \dots, N$}
    \State $\displaystyle 
           \hat{s}_n \gets s_\theta(\mathbf{z}_n^t,\mathbf{c},t) \;-\; 
             \gamma \,\nabla_{\mathbf{z}_n^t}\Bigl[U\bigl(\pi^*_{i-1},\hat{\mathbf{z}}^0_\theta(\mathbf{z}_n^t)\bigr)\Bigr]$
    \State $\mathbf{z}_n^{t-1} \sim 
            \mathcal{N}\bigl(\mathbf{z}_n^t + \sigma^2 \hat{s}_n,\;\hat{\beta}^2\bigr)$
    \State $\displaystyle
           w_n^{t-1} \gets 
           \frac{p_{\theta}\bigl(\mathbf{z}_n^{t-1} \mid \mathbf{z}_n^t,\mathbf{c}\bigr)\,\Phi(\mathbf{z}_n^{t-1})}{
                 \hat{p}_{\theta}\bigl(\mathbf{z}_n^{t-1} \mid \mathbf{z}_n^t,\mathbf{c}\bigr)\,\Phi(\mathbf{z}_n^{t})}$
  \EndFor
\EndFor
\State \textbf{Output:} Weighted particles $\{\mathbf{z}_n^0,\, w_n^0\}_{n=1}^N$
\end{algorithmic}
\end{algorithm}


\begin{proposition}
\label{thm:twisted_diffusion_sampler}
(Informal) Under regularity conditions on the score function, as the number of particles \( N \to \infty \), we have
\[
U(\mathbf{x},\hat{\tau}(\mathbf{z})) \to U(\mathbf{x},\tau(\mathbf{z})) \quad \text{almost surely},
\]
where \( \hat{\tau} \) is the empirical distribution returned by Algorithm~\ref{alg:twisted-smc}.

Proof. See Appendix~\ref{appdx:twisted_diffusion_model}.


% \textbf{Error Bound under Finite Samples:}  
% For a finite number of particles, the following bound holds for the mean squared error of the Twisted SMC sampler:
% \[
% \textstyle \mathbb{E} \left[ \left| U(\mathbf{x}, \tau(\mathbf{z})) - U(\mathbf{x}, \hat{\tau}(\mathbf{z})) \right|^2 \right] \leq \frac{C M^2}{N}.
% \]
\end{proposition}
% These properties of the sampler will be used to analyze the convergence of the double oracle algorithm.


\subsection{Convergence Analysis}\label{sec:convergence_analysis}

In this section, we analyze the convergence properties of our framework. For the theoretical analysis, we introduce two mild assumptions.
\begin{assumption}\label{assump:concavity}
    We assume that the utility function is twice differentiable and concave with respect to $\mathbf{x}$.
\end{assumption}
% \yuqi{maybe less words here? just "twice differentiable and concave in x"}
% and we assume that $\frac{\partial^2 U(x,\tau)}{\partial x^2} \leq 0 \ \forall x$, i.e., the utility function is concave in ranger effort.
Assumption~\ref{assump:concavity} implies that ranger effort has diminishing marginal returns, which is a common assumption in economic models~\citep{mankiw1998principles} and reflects the intuition that initial patrol efforts contribute more significantly to wildlife protection than additional increments in effort. Under Assumption~\ref{assump:concavity}, Eq.~\ref{eq:defender_oracle} is a convex optimization problem, and existing optimization solvers~\citep{diamond2016cvxpy} can accurately find the defender's best response.

\begin{assumption}
\label{assump:full_support}
We assume that the distribution \(p_{\theta}(\mathbf{z}\mid \mathbf{c})\) places its mass on a compact space.
\end{assumption}

In practice, the attacker’s action at each target must lie in a bounded interval, e.g.\ \([0, z_{\mathrm{max}}]\). For instance, the number of snares in any region cannot exceed a practical upper limit. Consequently, it is reasonable to treat the action space as compact, ensuring that \(p_{\theta}(\mathbf{z}\mid \mathbf{c})\) has compact support.



% In Section \ref{sec:convergence_analysis}, we analyze the convergence properties of our framework.
For each $\hat{\sigma}^*_i$, we denote by $\sigma^*_i$ the corresponding mixed strategy over the underlying true adversary strategy distributions. Formally, $\sigma^*_i(\tau_l) = \hat{\sigma}^*_i(\hat{\tau}_l)$ for all $l \in [i]$.
Without the terminating condition, Algorithm~\ref{alg:double-oracle} produces two sequences of mixed strategies: $(\pi_i^*)_{i=0}^{\infty}$ and $(\sigma_i^*)_{i=0}^{\infty}$. Proposition~\ref{thm:twisted_diffusion_sampler} implies that, if we use infinitely many samples to estimate the expected utilities, there is no estimation error, and
Theorem~\ref{cor:double_oracle_infinite} follows from the convergence proof of the original double oracle algorithm~\citep{adam2021double}.


% If we have access to infinite samples to approximate the poacher strategy distribution at each iteration, an immediate consequence of Proposition \ref{thm:twisted_diffusion_sampler} is that there is no estimation error and Theorem~\ref{cor:double_oracle_infinite} follows after the proof of the original DO algorithm.   
% An immediate consequence of Proposition \ref{thm:twisted_diffusion_sampler} is for every iteration, if an infinite number of samples are available for estimating each cell in the payoff matrix, then there is no estimation error and Corollary \ref{cor:double_oracle_infinite} follows after the proof of the original DO algorithm.   

\begin{theorem}\label{cor:double_oracle_infinite}
Without the terminating condition, under Assumptions~\ref{assump:concavity} and~\ref{assump:full_support}, if we use $N \rightarrow \infty$ samples in every iteration, every weakly convergent subsequence of the strategies produced by Alg.~\ref{alg:double-oracle} converges to an exact equilibrium in a possibly infinite number of iterations. Such a weakly convergent subsequence always exists.\footnote{We include the definition of weak convergence in Appendix~\ref{appdx:weak_convergence}.}
\end{theorem}

However, in practical scenarios where only a finite number of samples is available, the estimation of the expected utility is imprecise. Consequently, estimation errors will appear in the following steps within each iteration of our algorithm: (1) solving the subgame, (2) computing the defender oracle, and (3) evaluating the terminating condition. 

%These estimation errors must be accounted for in the convergence analysis.
% \begin{enumerate}
%     \item Applying Linear Programming to solve the subgame.
%     \item Computing the defender oracle.
%     \item Evaluating the terminating condition.
% \end{enumerate}
% For theoretical analysis, we introduce two mild assumptions. 
% \begin{assumption}[Concavity of Utility Function]\label{assump:concavity}
%     We assume that the utility function is twice differentiable with respect to $x$, and we assume that $\frac{\partial^2 U(x,\tau)}{\partial x^2} \leq 0 \ \forall x$, i.e., the utility function is concave in ranger effort.
% \end{assumption}

% Assumption \ref{assump:concavity} implies there is diminishing marginal return in ranger effort. This assumption reflects the intuition that initial patrol efforts contribute more significantly to wildlife protection than additional increments in effort \haichuan{would be great if we can cite some empirical evidence}. Under assumption \ref{assump:concavity}, Eq. \ref{eq:defender_oracle} is a convex optimization problem and gradient descent finds the defender's best response \haichuan{Do we need to cite anything?}. 
% We also introduce the following assumption on the learned distribution of the diffusion model.

% \begin{assumption}[Exponential Tail/Moment Condition for \(\boldsymbol{p_0}\)]
% \label{assump:full_support}
% We assume \(p_{\theta}(\mathbf{z}|\mathbf{c})\) satisfies exponential tail decay:} 
%     There exist constants \(\alpha > 0\) and \(C < \infty\) such that
%     \[
%       \int_{\mathcal{X}} \exp\bigl(\alpha \,\|x\|\bigr)\,p_0(\mathrm{d}x)
%       \;\le\; C
%       \quad
%       % \text{or equivalently,}\quad
%       % p_0(x) \;\lesssim\; \exp\!\bigl(-\alpha\,\|x\|\bigr)
%       % \text{ as }\|x\|\to\infty.
%     \]
% \end{assumption}



\begin{restatable}{theorem}{convergenceinfround}\label{thm:convergence_inf_round}
Suppose Assumptions~\ref{assump:concavity} and~\ref{assump:full_support} hold, and that at the $i$-th iteration we use a finite number of samples
\[
N_i = \left\lceil CM^2(i+1)^2 i^{1+\delta}/\epsilon^2 \right\rceil
\]
for each adversary distribution, where $C$ is a constant, $M$ is an upper bound on the utility function, $\epsilon$ is the approximation error, and $\delta$ is any positive number. Then the following hold:
\begin{itemize}
    \item \textbf{Item 1:} Without the terminating condition, every weakly convergent subsequence of Alg.~\ref{alg:double-oracle} converges to an $\epsilon$-equilibrium in a possibly infinite number of iterations. Such a weakly convergent subsequence always exists.
    % \ak{Perhaps we can define a ``weakly convergent subsequence of an algorithm''?}
    \item \textbf{Item 2:} With the terminating condition, Alg.~\ref{alg:double-oracle} terminates in a finite number of iterations. Also, it converges to a finitely supported $4\epsilon$-equilibrium with probability at least $1-\mathsf{prob}$.
\end{itemize}
\end{restatable}
\begin{proof}
 We provide a sketch of the proof here and defer the full details to Appendix~\ref{appdx:convergence_inf_round}. The key steps for proving Item~1 are as follows:

\begin{itemize}
    \item \textbf{Step 1:} We bound the utility estimation error for any mixed strategy pair at iteration \(i\) by the maximum estimation error over all entries in the payoff matrix.
    \item \textbf{Step 2:} We show that, under our finite sampling scheme, the event that the maximum cell-wise error exceeds \(\epsilon/4\) occurs only during the first \(i_r\) iterations, for some finite \(i_r\).
    \item \textbf{Step 3:} We treat the strategies generated in the first \(i_r\) rounds as the initial strategy set in the standard Double Oracle (DO) algorithm~\citep{adam2021double}. We then adapt the original convergence proof to account for the error introduced by finite sampling, which is now bounded by \(\epsilon/4\).
\end{itemize}

By relaxing the error bound in Item~1, we obtain convergence within a finite number of iterations. The additional approximation error in Item~2 stems from two sources: (1) enforcing finite termination, and (2) using estimated utilities of mixed strategy pairs when evaluating the stopping condition.

\end{proof}
In practice, we use a fixed number of samples across iterations, and the experiments in Section~\ref{sec:experiments} show that our framework still achieves robust performance.
% The full proof is deferred to Appendix \ref{appdx:convergence_inf_round}.



% \textbf{Item 1} in Theorem \ref{thm:convergence_inf_round} constructs a sampling scheme that guarantees convergence to an $\epsilon$-equilibrium with a possibly infinite number of iterations. \textbf{Item 2} in Theorem \ref{thm:convergence_inf_round} further demonstrates that our algorithm can converge to a $4\delta$ equilibrium with high probability within a finite number of iterations. 
% \paragraph{Proof Sketch.} 
% We outline the key steps in the proof of \textbf{Item 1} in Theorem \ref{thm:convergence_inf_round}, and defer the full proof to Appendix \ref{appdx:convergence_inf_round}. 
% \begin{enumerate}
%     \item We bound the utility estimation error for any mixed strategy pair at iteration $i$ by the largest cell estimation error in the payoff matrix.
%     \item We show under our finite sampling scheme, the event of the largest cell deviation exceeding a given $\frac{\epsilon}{4}$ will only happen in the first $i_0$ iterations for some finite $i_0$.
%     \item We view the strategies added in the first $i_0$ round as the initial set of strategies in the original DO algorithm. We modify the original proof and prove convergence by accounting for the error introduced by the finite sampling scheme, which is now bounded by $\frac{\epsilon}{4}$.
% \end{enumerate}

% \begin{restatable}{theorem}{convergencefiniteround}\label{thm: convergence_finite_round}
%     Under the assumption 1 and 2, with finite number of samples at the $i$-th iteration 
% $$N_i = \left\lceil 16cM^2(i+1)^2 i^{1+\delta}/\epsilon^2 \right\rceil, $$
% where $c$ is a constant in Proposition \ref{thm:twisted_diffusion_sampler}, $\delta$ and $\epsilon$ can be any positive number, the algorithm terminates in a finite number of iterations. Also, it converges to a finitely supported $4\epsilon$-equilibrium with probability at least $1-\mathsf{prob}$.\footnote{The error probability $\mathsf{prob}$ is reflected in the terminating condition.}
% \end{restatable}

% The additional approximation error in \textbf{Item 2} of Theorem \ref{thm:convergence_inf_round} arises from (1) enforcing finite termination and (2)using estimated utility of mixed strategy pairs for checking whether the terminating condition is satisfied. 

% The theoretical results show the general convergence properties of our framework. \haichuan{This may have already been answered} However, in empirical experiments, we typically set a maximum iteration number as the terminating condition. The experiment section shows our framework still outperforms other benchmarks using much fewer samples compared to the number of samples needed in the theoretical sampling scheme.


% \lk{TODO by Haichuan}
% \haichuan{Theorem part writing is finished. Please move assumption 1 to where we introduce the utility function and assumption $2$ to where we introduce the diffusion model}

% \begin{assumption}[Concavity of Utility Function]\label{assump:concavity}
%     We assume that the utility function is twice differentiable with respect to $a$, and we assume that $\frac{\partial^2 V(a,\tilde{\pi}(z))}{\partial a^2} \leq 0 \ \forall a$, i.e., the utility function is concave in ranger effort.
% \end{assumption}

% The concavity assumption implies that $\frac{\partial V(a,q(z))}{\partial a}$ is decreasing in $a$, which is equivalent to saying there is diminishing marginal return to ranger effort. The intuition is within the first unit of effort, the most obvious snares in the field are first removed. Hence, the same effort will lead to less snares found later. %This phenomenon is also documented in [ideally we can find some conservation literature here]. 

% \begin{assumption}[Full Support of the Diffusion Model]\label{assump:full_support}
%     The distribution learned by the diffusion model, i.e., $\pi_{\theta}(\mathbf{z}|\mathbf{a}')$ has full support.
% \end{assumption}

% \begin{assumption}[Exponential Tail/Moment Condition for \(\boldsymbol{p_0}\)]
% \label{assump:full_support}
% We assume \(p_{\theta}(\mathbf{z}|\mathbf{c})\) satisfies exponential tail decay:} 
%     There exist constants \(\alpha > 0\) and \(C < \infty\) such that
%     \[
%       \int_{\mathcal{X}} \exp\bigl(\alpha \,\|x\|\bigr)\,p_0(\mathrm{d}x)
%       \;\le\; C
%       \quad
%       \text{or equivalently,}\quad
%       p_0(x) \;\lesssim\; \exp\!\bigl(-\alpha\,\|x\|\bigr)
%       \text{ as }\|x\|\to\infty.
%     \]
% \end{assumption}
% \lk{Will add a justification}

% \begin{theorem}[Convergence of Double Oracle with Infinite Samples and Infinite Epochs]
% \label{thm:double_oracle_infinite}
% Under the assumption 1 and 2, when the number of samples $N\rightarrow \infty$ and number of iterations $T\rightarrow \infty$, the double oracle algorithm converges to an exact equilibrium.
% \end{theorem}




