\section{Proof of Proposition~\ref{propo:kl}}\label{appdx:propo_kl}

\begin{proof}
We introduce a Lagrange multiplier \(\alpha\) for the KL-divergence constraint and another multiplier \(\lambda\) for the normalization constraint. The Lagrangian is
\[
\mathcal{L}(\tau,\alpha,\lambda)\;=\;
\int \tau(\mathbf{z}) \Bigl(U(\pi_{i-1}^*, \mathbf{z})\Bigr)\,d\mathbf{z}
\;-\;\alpha \Bigl(D_{\rm KL}(\tau(\mathbf{z})||p_{\theta}(\mathbf{z}|\mathbf{c})) - \rho\Bigr)
\;+\;\lambda \Bigl(\!\int \tau(\mathbf{z})\,d\mathbf{z}-1\Bigr).
\]
By taking the functional derivative of \(\mathcal{L}\) with respect to \(\tau(\mathbf{z})\) and setting it to zero, one obtains
\[
\tau(\mathbf{z})
\;\propto\;
p_{\theta}(\mathbf{z}\mid \mathbf{c})
\exp\!\Bigl(\tfrac{1}{\alpha}\, U(\pi_{i-1}^*,\mathbf{z})\Bigr).
\]
Because the adversary minimizes \(U\) subject to the KL constraint, the multiplier \(\alpha\) is negative under the sign convention of the Lagrangian above. Defining \(\gamma = -\tfrac{1}{\alpha} > 0\) therefore gives the closed-form solution
\[
\tau_i(\mathbf{z})
\;\propto\;
p_{\theta}(\mathbf{z}\mid \mathbf{c})
\exp\!\Bigl(-\gamma U(\pi_{i-1}^*,\mathbf{z})\Bigr),
\]
which matches Eq.~\eqref{eq:bo-poacher}. This completes the proof.
\end{proof}



\renewcommand{\theproposition}{5.3}


\section{Proof of Proposition~\ref{thm:twisted_diffusion_sampler}}\label{appdx:twisted_diffusion_model}

We first provide the full statement of Proposition~\ref{thm:twisted_diffusion_sampler} as below.
\begin{proposition}[Twisted SMC] 
Suppose the following conditions hold:
\begin{enumerate}
    \item \(\Phi_T(\mathbf{z}^T)\) and the ratios \(\Phi_{t-1}(\mathbf{z}^{t-1})/\Phi_{t}(\mathbf{z}^{t})\) are positive and bounded.
    \item For \( t > 0 \), \( \log \Phi_t(\mathbf{z}^t) \) is continuous and has bounded gradients with respect to \( \mathbf{z}^t \).
    \item \( \hat{\beta}^2 > \beta^2 \).
\end{enumerate}


\textbf{Almost Sure Convergence:}  Under these assumptions, as the number of particles \( N \to \infty \), we have
\[
U(\mathbf{x},\hat{\tau}(\mathbf{z})) \to U(\mathbf{x},\tau(\mathbf{z})) \quad \text{almost surely},
\]
where \( \hat{\tau} \) is the empirical distribution returned by Algorithm~\ref{alg:twisted-smc}.

\textbf{Error Bound under Finite Samples:}\label{finite_sample_bound} 
Under these assumptions, the mean squared error of the twisted SMC sampler satisfies the bound:
\[
\mathbb{E}\left[|U(\mathbf{x}, \tau(\mathbf{z})) - U(\mathbf{x}, \hat{\tau}(\mathbf{z}))|^2 \right] \leq \frac{C'M^2}{N},
\]
where $C'$ is a constant and $M$ is the maximum value of the utility function.
\end{proposition}


\textbf{Justification of Assumptions:}  
\begin{itemize}
    \item \textbf{Assumption (1):} This holds if \( \exp(-\gamma U(\pi, \mathbf{z})) \) is positive and bounded away from zero. In our green security domain, \( U \) is always positive (as introduced in Section~X), ensuring this condition is automatically satisfied.  
    \item \textbf{Assumption (2):} This is justified by Appendix~A.5 of \citet{wu2023practical}.  
    \item \textbf{Assumption (3):} This can be ensured by selecting a sufficiently large \( \hat{\beta} \).  
\end{itemize}


\begin{proof}
Recall that \( p_{\theta}(\mathbf{z}|\mathbf{c}) \) serves as the prior, while the likelihood is given by \( \exp\left(-\gamma U(\pi, \mathbf{z})\right) \).

We first prove that the marginal distribution of the sampler is \( \tau(\mathbf{z}) \):

\begin{align}
\hat{p}(\mathbf{z}^{0:T}) &= \frac{1}{Z} \left[ p_{\theta}(\mathbf{z}^T) \prod_{t=0}^{T-1} \hat{p}_{\theta}(\mathbf{z}^t|\mathbf{z}^{t+1}) \right] 
\left[ \Phi_T(\mathbf{z}^T) \prod_{t=0}^{T-1} \frac{p_{\theta}(\mathbf{z}^t|\mathbf{z}^{t+1}) \Phi_t(\mathbf{z}^t)}{\hat{p}_{\theta}(\mathbf{z}^t|\mathbf{z}^{t+1}) \Phi_{t+1}(\mathbf{z}^{t+1})} \right] \nonumber \\
&= \frac{1}{Z} \left[ p_{\theta}(\mathbf{z}^T) \prod_{t=0}^{T-1} p_{\theta}(\mathbf{z}^t|\mathbf{z}^{t+1}) \right] 
\left[ \Phi_T(\mathbf{z}^T) \prod_{t=0}^{T-1} \frac{\hat{p}_{\theta}(\mathbf{z}^t|\mathbf{z}^{t+1}) \Phi_t(\mathbf{z}^t)}{\hat{p}_{\theta}(\mathbf{z}^t|\mathbf{z}^{t+1}) \Phi_{t+1}(\mathbf{z}^{t+1})} \right] \nonumber \\
&= \frac{1}{Z} p_{\theta}(\mathbf{z}^{0:T}) 
\left[ \prod_{t=0}^{T-1} \frac{\Phi_t(\mathbf{z}^t)}{\Phi_{t+1}(\mathbf{z}^{t+1})} \right] \Phi_T(\mathbf{z}^T) \nonumber \\
&= \frac{1}{Z} p_{\theta}(\mathbf{z}^{0:T}) \Phi_0(\mathbf{z}^0).
\end{align}

Since \( \Phi_0(\mathbf{z}^0) = \exp(-\gamma U(\pi, \mathbf{z}^0)) \), marginalizing out \( \mathbf{z}^{1:T} \) yields

\[
\hat{p}(\mathbf{z}^0) = \tau(\mathbf{z}^0) \propto p_{\theta}(\mathbf{z}^0|\mathbf{c}) \exp\left( -\gamma U(\pi, \mathbf{z}^0) \right),
\]

as desired.

Next, according to Appendix A.5 in \citet{wu2023practical}, under the given assumptions, the importance weights \( w^t \) remain bounded. Consequently, applying Propositions 11.5 and 11.3 from \citet{chopin2020introduction} establishes almost sure convergence and the error bound under finite samples.

\end{proof}

\section{Definition of weak convergence}\label{appdx:weak_convergence}
We directly cite the definition of weak convergence provided in \cite{adam2021double}, and a more detailed discussion of the convergence of probability measures can be seen in \cite{billingsley2013convergence}.

\begin{definition}
    A sequence of mixed strategies $(\pi_i)_{i=1}^{\infty}$ in $\Delta(\mathcal{X})$ weakly converges to $\pi^* \in \Delta(\mathcal{X})$ if
    $$\lim\limits_{i \to \infty}\int_{\mathcal{X}} f(x) \, d\pi_i = \int_{\mathcal{X}} f(x) \, d\pi^*$$
    for every continuous function $f\colon \mathcal{X} \to \mathbb{R}$. We use $\pi_i  \Rightarrow \pi^*$ to denote weak convergence.
\end{definition}
\section{Proof of Theorem \ref{thm:convergence_inf_round}}\label{appdx:convergence_inf_round}

% We prove the following Theorem \ref{thm:convergence_inf_round}, and Item 2 in the following theorem is exactly theorem \ref{thm:convergence_finite_round}.

% \begin{restatable}{theorem}{convergenceinfround}\label{thm:convergence_inf_round}
% Under assumptions \ref{assump:concavity} and \ref{assump:full_support}, with finite number of samples at the $i$-th iteration 
%  $$N_i = \left\lceil 16CM^2(i+1)^2 i^{1+\delta}/\epsilon^2 \right\rceil, $$
% where $C$ is a constant in Proposition \ref{thm:twisted_diffusion_sampler}, $\delta$ and $\epsilon$ can be any positive number. 
% \begin{itemize}
%     \item \textbf{Item 1:} Without terminating condition, every weakly convergent subsequence of Algorithm~\ref{alg:double-oracle} converges to an $\epsilon$-equilibrium in a possibly infinite number of iterations. Moreover, such a weakly convergent subsequence always exist. \ak{Perhaps we can define a ``weakly convergent subsequence of an algorithm''?}
%     \item \textbf{Item 2:} With the terminating condition, Alg.~\ref{alg:double-oracle} terminates in a finite number of iterations. Also, it converges to a finitely supported $4\epsilon$-equilibrium with probability at least $1-\mathsf{prob}$.\footnote{The error probability $\mathsf{prob}$ is reflected in the terminating condition of Algorithm~\ref{alg:double-oracle}.}
% \end{itemize}
% \end{restatable}

\convergenceinfround*

The constant $C$ in $N_i$ can be expressed as $16C'$, where $C'$ is the constant in \textbf{Error Bound under Finite Sample} discussed in Appendix \ref{appdx:twisted_diffusion_model}. To prove Theorem \ref{thm:convergence_inf_round}, we first prove the utility estimation error bound for the twisted diffusion sampler.
\begin{lemma}\label{lem:chebyshev}
    Under the same assumptions as Proposition \ref{thm:twisted_diffusion_sampler}, the utility estimation error of the twisted SMC sampler satisfies the bound:

\[
P(|U(\mathbf{x}, \tau(\mathbf{z}))-U(\mathbf{x},\hat{\tau}(\mathbf{z}))| \geq \epsilon) \leq \frac{C'M^2}{N\epsilon^2},
\]
where $C'$ is the constant in \textbf{Error Bound under Finite Sample} of Appendix \ref{appdx:twisted_diffusion_model} and $M$ is the maximum value of the utility function.
\end{lemma}
\begin{proof}
    Consider the random variable $X = U(\mathbf{x}, \tau(\mathbf{z}))-U(\mathbf{x},\hat{\tau}(\mathbf{z}))$. By \textbf{Error Bound under Finite Samples}, its second moment satisfies $\mathbb{E}\left[|X|^2\right] \leq \frac{C'M^2}{N}$.

    Applying Markov's inequality to $|X|^2$ (i.e., Chebyshev's inequality in its second-moment form), we have
    $$P(|U(\mathbf{x}, \tau(\mathbf{z}))-U(\mathbf{x},\hat{\tau}(\mathbf{z}))| \geq \epsilon) = P(|X|^2 \geq \epsilon^2) \leq \frac{\mathbb{E}\left[|X|^2\right]}{\epsilon^2} \leq \frac{C'M^2}{N\epsilon^2}.$$
\end{proof}


\paragraph{Notation} We introduce notations used in the proof of Theorem \ref{thm:convergence_inf_round}. Recall at the $i$-th iteration of algorithm \ref{alg:double-oracle}, we use an empirical distribution $\hat{\tau_i}$ with $N_i$ samples to approximate each adversary strategy (distribution) $\tau_i \in \mathcal{T}_i$.  Because of the finite sample approximation, the utility estimation for each pure strategy pair in the payoff matrix is imprecise. Define the estimation error in row $j$, column $k$ of payoff matrix at iteration $i$ as
$$\Delta_i^{j,k} = U(x_j, \tau_k) - U(x_j, \hat{\tau}_k).$$ 
Let $\Delta_i$ denote the absolute value of the largest utility estimation error for any cell in the payoff matrix at the $i$-th iteration of the algorithm, i.e., $\Delta_i = \max_{j,k} |\Delta_i^{j,k}|$. At step 7 of algorithm \ref{alg:double-oracle}, when we apply linear programming to solve the subgame $(\mathcal{X}_{i}, \hat{\mathcal{T}}_{i}, U)$, we obtain a mixed strategy for adversary $\hat{\sigma}^*_{i}$ defined on $\hat{\mathcal{T}}_{i}$. Recall $\sigma^*_i \in \Delta(\mathcal{T}_i)$ is the mixed strategy on the underlying true adversary distribution that shares the same weight as  $\hat{\sigma}^*_i$. Formally, $\sigma^*_i(\tau_l) = \hat{\sigma}^*_i(\hat{\tau_l}) \ \forall l \in [i]$.

\paragraph{Proof of Item 1}
\begin{proof}
We first show that for any $\pi_i \in \Delta(\mathcal{X}_{i})$ and $\sigma_i \in \Delta(\mathcal{T}_{i})$, we have $|U(\pi_i, \hat{\sigma}_i) - U(\pi_i, \sigma_i)| \leq \Delta_i$.
We write 
    \begin{equation}\label{eq:utility_bound_pf}
        |U(\pi_i, \sigma_i) - U(\pi_i, \hat{\sigma}_i)| \leq \sum_{\mathbf{x} \in \mathcal{X}_i}\sum_{\tau \in \mathcal{T}_i} \pi_i(\mathbf{x}) \cdot \sigma_i (\tau) \cdot |U(\mathbf{x}, \tau) - U(\mathbf{x}, \hat{\tau})|.
    \end{equation}
   $|U(\mathbf{x}, \tau) - U(\mathbf{x}, \hat{\tau})|$ denotes the sample estimation error for the pure strategy pair $(\mathbf{x}, \tau)$ in the payoff matrix. The maximum on the right-hand side of Equation \ref{eq:utility_bound_pf} is obtained when putting all the probability mass on the strategy pair with the largest sample estimation error, which is $\Delta_i$.

    Then we bound $\Delta_i$ for each $i$. For any cell $(j,k)$ in the matrix, we apply Lemma \ref{lem:chebyshev}:
    $$P(|\Delta_{i}^{j,k}| \geq \frac{\epsilon}{4}) \leq \frac{16C'M^2}{N_i\epsilon^2 }.$$
    Since at $i$-th iteration, there are $(i+1)^2$ cells in the payoff matrix, we apply the union bound and obtain:
     $$P(\Delta_i \geq \frac{\epsilon}{4}) \leq \frac{16C'M^2 (i+1)^2}{N_i\epsilon^2}.$$
    By setting $N_i =\left\lceil 16C'M^2(i+1)^2 i^{1+\delta}/\epsilon^2 \right\rceil$, we have 
    $$P(\Delta_i \geq \frac{\epsilon}{4}) \leq \frac{1}{i^{1+\delta}}.$$
    Here we consider the events $A_i=\{\Delta_i \geq \frac{\epsilon}{4}\}$. From the step above, we have $$P(A_i)\leq \frac{1}{i^{1+\delta}}.$$ 
    Because $\delta >0$, $\sum_{i=1}^{\infty} \frac{1}{i^{1+\delta}}$ is a convergent series. Therefore, $\sum_{i=1}^{\infty} P(A_i)<\infty$. By the Borel--Cantelli lemma, we then obtain $P(\limsup\limits_{i \to \infty}A_i)=0$, i.e., almost surely only finitely many of the events $A_i$ occur. Hence, almost surely, there exists a finite (random) index $i_r$ such that for all $i>i_r$,
$$\Delta_i < \frac{\epsilon}{4}.$$
Because of assumption \ref{assump:full_support}, the pure strategy space $\mathcal{X}$ and $\mathcal{T}$ are both compact and $U$ is continuous. Hence, $(\mathcal{X}, \mathcal{T}, U)$ is a two-player zero-sum continuous game, and here are several results that are already proven in \cite{adam2021double} for two-player zero-sum continuous games.
\begin{itemize}
\item Sequences $(\pi_i^*)_{i=1}^{\infty}$ and $(\sigma_i^*)_{i=1}^{\infty}$ have a weakly convergent subsequence, which for simplicity, will be denoted by the same indices. Therefore, $\pi_i^*\Rightarrow \pi^*$ for some $\pi^*$ and $\sigma_i^* \Rightarrow \sigma^*$ for some $\sigma^*$, where $\Rightarrow$ denotes weak convergence.
\item If $\pi_i \Rightarrow \pi$ in $\Delta(\mathcal{X})$ and $\sigma_i \Rightarrow \sigma$ in $\Delta(\mathcal{T})$, then $U(\pi_i, \sigma_i) \to U(\pi, \sigma)$. If $\pi_i \Rightarrow \pi$ in $\Delta(\mathcal{X})$ and $\tau_i \to \tau$ in $\mathcal{T}$, then $U(\pi_i, \tau_i) \to U(\pi,\tau)$.
\item For any $\pi \in \Delta(\mathcal{X})$ we have
        $$\min_{\tau \in \mathcal{T}} U(\pi,\tau) = \min_{\sigma \in \Delta(\mathcal{T})} U(\pi,\sigma)$$
\item The sizes of the initial subsets $\mathcal{X}_1$ and $\mathcal{T}_1$ can be any finite numbers.
\end{itemize}
From the proof above, almost surely $A_i$ happens only finitely many times. Let $i_r$ be the largest index for which $A_i$ happens. We then treat $(\mathcal{X}_{i_r}, \mathcal{T}_{i_r})$ as the initial set of strategies for both players.  Then our sampling scheme ensures that for any iteration $i > i_r$ and any strategy pair $(\pi_i,\sigma_i)$, we have $|U(\pi_i,\sigma_i) - U(\pi_i,\hat{\sigma}_i)| \leq \Delta_i \leq \epsilon/4$.

Consider any $\mathbf{x}$ such that $\mathbf{x} \in \mathcal{X}_{i_0}$ for some $i_0$. Take an arbitrary $i \geq i_0$, which implies $\mathbf{x} \in \mathcal{X}_i$. Since $(\pi_i^*, \hat{\sigma}_i^*)$ is an equilibrium of the subgame $(\mathcal{X}_i, \hat{\mathcal{T}}_i, U)$, we  get
    $$U(\pi_i^*, \hat{\sigma}_i^*) \geq U(\mathbf{x}, \hat{\sigma}_i^*)$$
    Since $U(\pi_i^*, \sigma_i^*)$ and $U(\pi_i^*, \hat{\sigma}_i^*)$ differ by at most $\frac{\epsilon}{4}$, and $U(\mathbf{x}, \sigma_i^*)$ and $U(\mathbf{x}, \hat{\sigma}_i^*)$ differ by at most $\frac{\epsilon}{4}$, we have
    \begin{align*}
        U(\pi_i^*, \sigma_i^*) + \frac{\epsilon}{2} \geq U(\mathbf{x}, \sigma_i^*) \to U(\mathbf{x}, \sigma^*).
    \end{align*}
    % $$U(p_i^*, q_i^*) + 2\delta \geq U(x, q_i^*) \to U(x, q^*)$$
    Since $U(\pi_i^*, \sigma_i^*) \to U(\pi^*, \sigma^*)$, we have
    \begin{align}\label{eq:for-closed-x}
        U(\pi^*, \sigma^*) + \frac{\epsilon}{2} \geq U(\mathbf{x}, \sigma^*)
    \end{align}
    for all $\mathbf{x} \in \cup \mathcal{X}_i$. Since $U$ is continuous, the previous inequality holds for all $\mathbf{x} \in cl(\cup \mathcal{X}_i)$.

     Fix now an arbitrary $\mathbf{x} \in \mathcal{X}$. Note $\mathbf{x}_{i+1}$ is the best response to $U(\cdot, \hat{\sigma}^*_i)$ (since ranger oracle uses finite sample estimation of payoff matrix), and we have
    $$U(\mathbf{x}_{i+1}, \hat{\sigma}_i^*) \geq U(\mathbf{x},  \hat{\sigma}_i^*)$$

    Because $U(\mathbf{x}_{i+1}, \sigma_i^*)$ and $U(\mathbf{x}_{i+1}, \hat{\sigma}_i^*)$ differ by at most $\epsilon/4$, and $U(\mathbf{x}, \sigma_i^*)$ and $U(\mathbf{x}, \hat{\sigma}_i^*)$ differ by at most $\epsilon/4$, we have
    \begin{align*}
        U(\mathbf{x}_{i+1}, \sigma_i^*) + \frac{\epsilon}{2} \geq U(\mathbf{x}, \sigma_i^*) \to U(\mathbf{x}, \sigma^*)
    \end{align*}
     Since $\mathbf{x}_{i+1} \in \mathcal{X}_{i+1}$ and by compactness of $\mathcal{X}$, we can select a convergent subsequence $\mathbf{x}_i \to \tilde{\mathbf{x}}$, where $\tilde{\mathbf{x}} \in cl(\cup \mathcal{X}_i)$. This allows us to use \eqref{eq:for-closed-x} to obtain 
    \begin{align*}
        U(\mathbf{x}_{i+1}, \sigma_i^*) \to U(\tilde{\mathbf{x}}, \sigma^*) \leq U(\pi^*, \sigma^*) +\frac{\epsilon}{2}.
    \end{align*}
    Therefore, for any $\mathbf{x}\in \mathcal{X}$,
    \[
    U(\mathbf{x},\sigma^*)\leq U(\pi^*,\sigma^*)+\epsilon.
    \]
    Similarly, we have for any $\tau \in \mathcal{T}$,
    \[
    U(\pi^*,\tau)\geq U(\pi^*,\sigma^*)-\frac{\epsilon}{2}.
    \]
    The two sides are not symmetrical because the best response for the poacher doesn't use the finite sample approximation of payoff matrix, thus having a smaller error.  We then conclude the proof.
\end{proof}

We then show that adding the terminating condition, for any $\epsilon>0$, algorithm \ref{alg:double-oracle} can terminate in a finite number of iterations. Also, when it stops, it converges to a $4\epsilon$-equilibrium with high probability. 
% \haichuan{I feel like this sentence doesn't add much in the sense that $\epsilon$ itself can be arbitrarily small. Maybe we can change it to "The approximation $\epsilon$ can be set arbitrarily close to $0$ and error probability $\mathsf{prob}$ can be set arbitrarily close to $1$ as input of the algorithm.}\yuqi{removed it. I forgot why I wrote this sentence...}

\paragraph{Proof of Item 2} 
\begin{proof}
    Consider now an infinite game, from Item 1 in Theorem \ref{thm:convergence_inf_round}, we know that $\bar{v}_i - \underline{v}_i\rightarrow \epsilon$. Also, our sampling scheme ensures that for any strategy pair $(\pi,\sigma)$ and iteration $i$ after some finite rounds $i_r$, we have $|U(\pi_i,\sigma_i) - U(\pi_i,\hat{\sigma_i})| \leq \Delta_i \leq \frac{\epsilon}{4}$. This indicates that with $\epsilon>0$, the terminating condition will be satisfied after a finite number of iterations. Assume that the algorithm ends at the $j$-th iteration. This implies
    $$U(\mathbf{x}_{j+1}, \hat{\sigma}^*_j)- U(\pi^*_j, \hat{\tau}_{j+1}) \in (-2\epsilon, 2\epsilon)$$
Then we have
 \begin{align*}
     U(\pi_j,\sigma_j)&\leq U(\pi_j,\hat{\sigma}_j)+ \Delta_j\\
     &\leq U(\mathbf{x}_{j+1},\hat{\sigma}_j)+\Delta_j\\
     &\leq U(\pi_j,\hat{\tau}_{j+1})+\Delta_j+2\epsilon\\
     &\leq U(\pi_j,\tau_{j+1})+2\Delta_j+2\epsilon\\
     &=\min_{\tau} U(\pi_j,\tau)+2\Delta_j+2\epsilon.
 \end{align*}

Here the first and fourth relation follows from the estimation error of $U$. The second one is because $\mathbf{x}_{j+1}$ is the best response for the ranger. The third one is from the terminating condition and the fifth one comes from the best response for the poacher.
Similarly,
 \begin{align*}
     U(\pi_j,\sigma_j)&\geq U(\pi_j,\tau_{j+1})\\
     &\geq U(\pi_j,\hat{\tau}_{j+1})-\Delta_j\\
     &\geq U(\mathbf{x}_{j+1},\hat{\sigma}_j)-\Delta_j-2\epsilon\\
     &\geq U(\mathbf{x}^*,\hat{\sigma}_j)-\Delta_j-2\epsilon,\text{ where }\mathbf{x}^*=\arg\max_{\mathbf{x}}U(\mathbf{x},\sigma_j)\\
     &\geq \max_{\mathbf{x}}U(\mathbf{x},\sigma_j)-2\Delta_j-2\epsilon.
\end{align*}
Here the second and fifth relation comes from the estimation error of $U$. The first one is from the best response of the poacher and the fourth one is from the best response of the ranger. The third one follows from the terminating condition.

For $\Delta_j$, we have 
$$P(\Delta_j \geq \epsilon)\leq \frac{1}{16j^{1+\delta}}\leq \mathsf{prob},$$ where the second relation comes from $j>\frac{1}{16\mathsf{prob}}$. Therefore, we show that $(\pi_j,\sigma_j)$ is a $4\epsilon$-equilibrium with probability at least $1-\mathsf{prob}$.
 
\end{proof}

% \newpage

% \begin{proof}
% We here use $\hat{V}_i$ to denote the sample approximation of the utility matrix at $i$-th iteration of Algorithm \ref{alg:double-oracle}. Also, let $\Delta_i$ denote the largest difference among the cell of the true payoff table and sample payoff table at the $i$-th iteration of the algorithm.

% We first show that for any $\epsilon>0$, without terminating condition, Algorithm \ref{alg:double-oracle} can generate a convergent subsequence that converge to an $\epsilon$-equilibrium with finite samples each iteration.

% \convergenceinfround*
% % \begin{theorem}\label{thm:convergence_inf_round}
% % Under the Assumptions \ref{assump:concavity} and \ref{assump:full_support}, with finite number of samples at the $i$-th iteration 
% %  $$N_i =16cM^2(i+1)^2 i^{1+\delta}/\epsilon^2,$$
% % where $c$ is a constant in Proposition \ref{thm:twisted_diffusion_sampler}, $\delta$ and $\epsilon$ can be any positive number. Without terminating condition, every weakly convergent subsequence of the algorithm converge to an $\epsilon$-equilibrium in a possibly infinite number of iterations. Moreover, such a weakly convergent subsequence always exist.
% % \end{theorem}
% % \begin{proof}
% % We divide the proof into three steps as follows.
% % \paragraph{Step 1: Show Convergence with Infinite Samples and Infinite Epochs}

% % When the number of samples $N\rightarrow \infty$, according to Proposition \ref{thm:twisted_diffusion_sampler}, we have $\Delta_i\rightarrow 0$ almost surely for all $i$. Then from Theorem 3.1 in \cite{adam2021double}, with infinite samples each iteration, every weakly convergent subsequence of Algorithm \ref{alg:double-oracle} converges to an equilibrium in a possibly infinite number of iterations. Moreover, such a weakly convergent subsequence always exist.

% % \paragraph{Step 1: Bound Sample Estimation Error.}

% We first show that for any $p_i \in \Delta X_i$ and $q_i \in \Delta Y_i$, we have $|\hat{V_i}(p_i, q_i) - V(p_i, q_i)| \leq \Delta_i$.
% We write 
%     \begin{equation}\label{eq:utility_bound_pf}
%         |\hat{U_i}(\pi_i, \sigma_i) - U(\pi_i, \sigma_i)| = \sum_{x \in \mathcal{X}_i}\sum_{\tau \in \mathcal{T}_i} \pi_i(x) \cdot \tau_i (\tau) \cdot |\hat{U_i}(x, \tau) - U(x, \tau)|  
%     \end{equation}
%    $|\hat{V_i}(x, y) - V(x, y)|$ denotes the sample estimation error for pure strategy pair $(x,y)$ in the payoff matrix. The maximum on the right-hand side of Equation \ref{eq:utility_bound_pf} is obtained when putting all the probability mass on the strategy pair with the largest sample estimation error, which is $\Delta_i$.

% Then we bound $\Delta_i$ for each $i$. For any cell $(j,k)$ in the matrix, we apply the Chebyshev bound of twisted sampling:
%     $$P(|\Delta_{i,(j,k)}| > \frac{\epsilon}{4}) \leq \frac{16cM^2}{N\epsilon^2 }.$$
%     Since at $i$-th iteration, there are $(i+1)^2$ cells in the payoff matrix, we apply the union bound and obtain:
%      $$P(\Delta_i > \frac{\epsilon}{4}) \leq \frac{16cM^2 (i+1)^2}{N_i\epsilon^2}.$$
%     By setting $N_i =16cM^2(i+1)^2 i^{1+\delta}/\epsilon^2$, we have 
%     $$P(\Delta_i > \frac{\epsilon}{4}) \leq \frac{1}{i^{1+\delta}}.$$

% There exists $i_0$ that for any $i>i_0$,
% $$P(\Delta_i>\frac{\epsilon}{4})=0.$$
% % as $i\rightarrow \infty$, we have $\Delta_i\rightarrow 0$ almost surely. Equivalently, we now show that for any $\epsilon>0$, $$P(\limsup_{i\rightarrow\infty} \Delta_i\geq \epsilon)=0.$$
% Here we consider the events $A_i=\{\Delta_i>\epsilon/4\}$. From Lemma \ref{lem:order_stats_bound}, we have $$P(A_i)\leq \frac{1}{i^{1+\delta}}.$$  Therefore, $\sum_{i=1}^{\infty} P(A_i)<\infty$. From Borel-Cantelli Lemma, we then obtain $P(\limsup\limits_{i \to \infty}A_i)=0$, which implies $A_i$ only happens for finite times.

% \yuqi{add assumption 2}
%  We then list here several results that are already proven in \cite{adam2021double}.
%     \begin{enumerate}
%         \item There exists a weakly convergent subsequence, which for simplicity, will be denoted by the same indices. Therefore, $p_i^*\Rightarrow p^*$ for some $p^*$ and $q_i^* \Rightarrow q^*$ for some $q^*$, where $\Rightarrow$ denotes weak convergence.
%         \item If $p_i \Rightarrow p$ in $\Delta_X$ and $q_i \Rightarrow q$ in $\Delta_Y$, then $U(p_i, q_i) \to U(p,q)$. If $p_i \Rightarrow p$ in $\Delta_X$ and $y_i \to y$ in $Y$, then $U(p_i, y_i) \to U(p,y)$.
%         \item For any $p \in \Delta_X$ we have
%         $$\min_{y \in Y} U(p,y) = \min_{q \in \Delta_Y} U(p,q)$$
%         \item The size of initial subset $X_1$ and $Y_1$ can be any finite number.
%     \end{enumerate}
%     % From \cite{adam2021double}, we know there exists a weakly convergent subsequence, which for simplicity, will be denoted by the same indices. Therefore, $p_i^*(w) \Rightarrow p^*$ for some $p^*$ and $q_i^*(w) \Rightarrow q^*$ for some $q^*$, where $\Rightarrow$ denotes weak convergence.
%     From the proof above, $A_i$ only happens for finite times. Assume $i_0$ is the largest number satisfying that $A_i$ happens. We then treat $(X_{i_0}, Y_{i_0})$ as the initial subset. 
%      Then our sampling scheme ensures that for any strategy pair $(p,q)$ and iteration $i$, we have $|V(p,q) - \hat{V}_i(p,q)| \leq \Delta_i \leq \epsilon/4$.

%     Consider any $x$ such that $x \in X_{i_0}$ for some $i_0$. Take an arbitrary $i \geq i_0$, which implies $x \in X_i$. Since $(p_i^*, q_i^*)$ is an equilibrium of the subgame $(X_i, Y_i, \hat{V}_i)$, we  get
%     $$\hat{V}_i(p_i^*, q_i^*) \geq \hat{V}_i(x, q_i^*)$$
%     Since $V(p_i^*, q_i^*)$ and $\hat{V}_i(p_i^*, q_i^*)$ differ by at most $\epsilon/4$, we have
%     \begin{align*}
%         V(p_i^*, q_i^*) + \frac{\epsilon}{2} \geq V(x, q_i^*) \to V(x, q^*).
%     \end{align*}
%     % $$U(p_i^*, q_i^*) + 2\delta \geq U(x, q_i^*) \to U(x, q^*)$$
%     Since $V(p_i^*, q_i^*) \to V(p^*, q^*)$, we have
%     \begin{align}\label{eq:for-closed-x}
%         V(p^*, q^*) + \frac{\epsilon}{2} \geq V(x, q^*)
%     \end{align}
%     for all $x \in \cup X_i$. Since $V$ is continuous, the previous inequality holds for all $x \in cl(\cup X_i)$.
    
%  Fix now an arbitrary $x \in X$. Note $x_{i+1}$ is best response to $\hat{V}_i$ (since ranger oracle uses finite sample estimation of payoff matrix), and we have
%     $$\hat{V}_i(x_{i+1}, q_i^*) \geq \hat{V}_i(x, q_i^*)$$

%      Because $V$ and $\hat{V}_i$ differ by at most $\epsilon/4$, we have
%     \begin{align*}
%         V(x_{i+1}, q_i^*) + \frac{\epsilon}{2} \geq V(x, q_i^*) \to V(x, q^*)
%     \end{align*}
%      Since $x_{i+1} \in X_{i+1}$ and by compactness of $X$, we can select a convergence subsequence $x_i \to \tilde{x}$, where $\tilde{x} \in cl(\cup X_i)$. This allows us to use \ref{eq:for-closed-x} to obtain 
%     \begin{align*}
%         V(x_{i+1}, q_i^*) \to V(\tilde{x}, q^*) \leq V(p^*, q^*) +\frac{\epsilon}{2}.
%     \end{align*}
%     Therefore, for any $x\in X$,
%     \[
%     V(x,q^*)\leq V(p^*,q^*)+\epsilon.
%     \]
%     Similarly, we have for any $y\in Y$,
%     \[
%     V(p^*,y)\geq V(p^*,q^*)-\frac{\epsilon}{2}.
%     \]
%     The two sides are not symmetrical because the best response for the poacher doesn't use the finite sample approximation of payoff matrix, thus having a smaller error. 
%     %Here we also have 
%     % \begin{align*}
%     %     \bar{v} - \underline{v}&= \hat{V}(x^*,q^*)-\hat{V}(p^*,y^*)\\
%     %     &\leq V(x^*,q^*)-V(p^*,y^*)+\epsilon\\
%     %     &\leq 4\epsilon.
%     % \end{align*}
%     % Here we omit the time subscript because it may goes to infinity.
%     % Similarly,
%     % \[
%     % \bar{v} - \underline{v}\geq 2\epsilon.
%     % \]
%     We then conclude the proof.
% \end{proof}


% We then show that adding the terminating condition, for any $\epsilon>0$, Algorithm \ref{alg:double-oracle} can terminate in a finite number of iterations. Also, when it stops, it converges to a $4\epsilon$-equilibrium with high probability. Actually it can converge to any $\delta$-equilibrium that $\delta>\epsilon$ with high probability after slightly modifying the terminating condition.


% \section{Missing Proof of \textbf{Item 2} in Theorem \ref{thm:convergence_inf_round}}\label{appdx:convergence_finite_round}
% % \convergenceinfiniteround*
% \convergenceinfround*
% \begin{proof}
%     Consider now an infinite game, from Theorem \ref{thm:convergence_inf_round}, we know that $\bar{v}_i - \underline{v}_i\rightarrow \epsilon$. Also, our sampling scheme ensures that for any strategy pair $(p,q)$ and iteration $i$ after some finite rounds $i_0$, we have $|V(p,q) - \hat{V}_j(p,q)| \leq \Delta_i \leq \epsilon/4$. This indicates that with $\epsilon>0$, the terminating condition will be satisfied after a finite number of iterations. Assume the algorithm terminates at $j$-th iteration. This implies
%  $$\hat{V}_j(\mathbf{a}_{j+1},q_j)-\hat{V}_j(p_j,\tilde{\pi}_{j+1})\in (-2\epsilon,2\epsilon).$$

%  Then we have
%  \begin{align*}
%      V(p_j,q_j)&\leq \hat{V}_j(p_j,q_j)+\Delta_j\\
%      &\leq \hat{V}_j(\mathbf{a}_{j+1},q_j)+\Delta_j\\
%      &\leq \hat{V}_j(p_j,\tilde{\pi}_{j+1})+\Delta_j+2\epsilon\\
%      &\leq V(p_j,\tilde{\pi}_{j+1})+2\Delta_j+2\epsilon\\
%      &=\arg\min_{y} V(p_j,y)+2\Delta_j+2\epsilon.
%  \end{align*}
%  Here the first and fourth relation follows from the estimation error of $V_j$. The second one is because $a_{j+1}$ is the best response for the ranger. The third one is from the terminating condition and the fifth one comes from the best response for the poacher.
 
%  Similarly,
%  \begin{align*}
%      V(p_j,q_j)&\geq V(p_i,\tilde{\pi}_{j+1})\\
%      &\geq \hat{V}_j(p_i,\tilde{\pi}_{j+1})-\Delta_j\\
%      &\geq \hat{V}_j(\mathbf{a}_{j+1},q_j)-\Delta_j-2\epsilon\\
%      &\geq \hat{V}_j(x^*,q_j)-\Delta_j-2\epsilon,\text{ where }x^*=\arg\max_{x}V(x,q_j)\\
%      &\geq \arg\max_{x}V(x,q_j)-2\Delta_j-2\epsilon.
% \end{align*}
% Here the second and fifth relation comes from the estimation error of $V_j$. The first one is from the best response of the poacher and the fourth one is from the best response of the ranger. The third one follows from the terminating condition.

% For $\Delta_j$, we have 
% $$P(\Delta_j>\epsilon)\leq \frac{1}{16j^{1+\delta}}\leq \mathsf{p},$$ where the second relation comes from $j>1/16\mathsf{p}$. Therefore, we show that $(p_j,q_j)$ is a $4\epsilon$-equilibrium with probability at least $1-\mathsf{p}$.
 
% \end{proof}


% % Let $\Delta_i$ denote the largest difference among the cell of the true payoff table and sample payoff table at the $i$-th iteration of the algorithm. We assume the utility is bounded between $[0, M]$.
% % \begin{lemma}\label{lem:utility_bound}
% %     For any $p_i \in \Delta X_i$ and $q_i \in \Delta Y_i$, we have $|\hat{U_i}(p_i, q_i) - U(p_i, q_i)| \leq \Delta_i$.
% % \end{lemma}
% % \begin{proof}
% %     We write 
% %     \begin{equation}\label{eq:utility_bound_pf}
% %         |\hat{U_i}(p_i, q_i) - U(p_i, q_i)| = \sum_{x}\sum_{y} p_i(x) \cdot q_i (y) \cdot |\hat{U_i}(x, y) - U(x, y)|
% %     \end{equation}
% %    $|\hat{U_i}(x, y) - U(x, y)|$ denotes the sample estimation error for pure strategy pair $(x,y)$ in the payoff matrix. The maximum on the right-hand side of \ref{eq:utility_bound_pf} is obtained when putting all the probability mass on the strategy pair with the largest sample estimation error, which is $\Delta_i$.
% % \end{proof}

% % \begin{lemma}\label{lem:order_stats_bound}
% %     Given any $\delta$, if we sample
% %     $$N_i = \frac{cM^2(i+1)^2 i \log^3i}{\delta^2} $$
% %     particles for each cell at the $i$-th iteration of the algorithm, then $\Delta_i\leq \delta/\log i$ with probability at least $1-1/i\log i$. 
% % \end{lemma}

% % \begin{proof}
% %     For any cell $(j,k)$ in the matrix, we apply the Chebyshev bound of twisted sampling:
% %     $$P(|\Delta_{i,(j,k)}| > \frac{\delta}{\log i}) \leq \frac{cM^2\log^2 i}{N \cdot \delta^2}$$.
% %     Since at $i$-th iteration, there are $(i+1)^2$ cells in the payoff matrix, we apply the union bound and obtain:
% %      $$P(\|\Delta\|_i > \frac{\delta}{\log i}) \leq \frac{cM^2 (i+1)^2\log^2 i}{N\cdot \delta^2}$$
% %     By setting $N_i =\frac{cM^2(i+1)^2 i \log^3i}{\delta^2}$, we have 
% %     $$P(\|\Delta\|_\infty > \frac{\delta}{\log i}) \leq \frac{1}{i\log i}.$$
% % \end{proof}

% % \begin{lemma}
% %     As $i\rightarrow \infty$, we have $\Delta_i\rightarrow 0$ almost surely.
% % \end{lemma}
% % \begin{proof}
% %     Equivalently, we now show that for any $\epsilon>0$, 
% %     $
% %     P(\limsup_{i\rightarrow\infty} \Delta_i\geq \epsilon)=0.
% %     $
% %     Here we consider the events $A_i=\{\Delta_i>\epsilon\}$. From Lemma \ref{lem:order_stats_bound}, we have $P(A_i)\leq \frac{1}{i\log i}$ for any $i>e^{\frac{\delta}{\epsilon}}$. Therefore, $\sum_{i=1}^{\infty} P(A_i)<\infty$. From Borel-Cantelli Lemma, we then obtain $P(\lim\sup_{i\rightarrow\infty} A_i)=0$, which concludes the proof.
% % \end{proof}
% % \begin{lemma}[Convergence of Double Oracle with Infinite Samples and Infinite Epochs]
% % \label{thm:double_oracle_infinite}
% % Under the assumption 1 and 2, when the number of samples $N\rightarrow \infty$ and number of iterations $T\rightarrow \infty$, every weakly convergent subsequence of the algorithm converge to an equilibrium. Moreover, such a weakly convergent subsequence always exist.
% % \end{lemma}

% % \section{Equations for making slides, should be deleted later}
% % \begin{align}
% % \max_{\pi} \min_{\tau}  
% % U(\pi, \tau) \notag \\
% % \text{s.t.} \quad D_{\rm KL}(\tau || p_{\theta}(\mathbf{z} | \mathbf{c})) \leq \rho, 
% % \label{eq:new}
% % \end{align}

