% main content
% introduce general iteravtive approach for the solution, no need algorithm block using enumerate
% introduce the solution for the inner maximization problem, 1. prove the dual problem gradient always positive and then lead to binary search algo. (introduce a proposition and an algorithm )
% argue that the out maximization problem is standard MLE learning on weighted data
% case study of nn-gbn on the outer minimization problem . mixmg part using weighted em, nn part using weighted loss 
% how we the iteration, each step we don't go to the optimal and 1 epoch learning of nn and 1 epoch of em . 




% logic flow
% general overview of this section, problem restate, general iterative algorithm as the learner and adversary
% inner maximization solution when kl-divergence, the exact problem formualte , mention it works for all probabilistic models allows efficient likelihood computation, rnade, spn, cusset networks, psdds, acs, etc.
% outter minimization, case study

In this section, we present our approach to learn distributionally robust probabilistic models, leveraging the DRSL framework with the KL-divergence as the distance metric between distributions.
Specifically, given a set of data observations $\mathcal{D} = \{ \bm{z}_1,  ..., \bm{z}_n \}$ that are sampled independently from a distribution $P_{\theta}( \bm{Z} )$ with unknown parameter $\theta$, our goal is to find the best parameters such that the adversarial ``risk'' in \eqref{eq:arm} is minimized. 
Formally, the problem is expressed as follows.
\begin{equation}\label{eq:ll_arm}
\begin{aligned}
& \argmax_{\theta} \inf_{\bm{r} }  \frac{1}{n} \sum_{i=1}^n r_i \cdot \log P_{\theta} (\bm{z_i}) \\
 \text{s.t.} &  \\
& \frac{1}{n} \sum_{i=1}^n r_i \log r_i \le \delta,  \quad \frac{1}{n} \sum_{i=1}^n r_i = 1, \quad r_i \ge 0  
\end{aligned}
\end{equation}
The inner optimization problem ($ \inf_{\bm{r} }$) corresponds to an \emph{adversarial} step where we find the worst weight $\bm{r}$ such that the reweighted loglikelihood is minimized; while the outer optimization problem ($  \argmax_{\theta}  $) corresponds to a \emph{learning} step where we find the best $\theta$ to maximize the weighted loglikelihood.
Taking the same approach as Generative Adversarial Networks (GANs)~\citep{goodfellow2020generative}, we tackle this optimization problem by alternating between the learning and adversarial steps as follows.
\begin{enumerate}
    \item \textbf{Init}: Initialize the parameters $\theta$ of model $P_{\theta}(\bm{Z})$.
    \item \textbf{Adversarial Step}: Fix $\theta$, update the weight vector $\bm{r}$ by solving the inner minimization problem. 
    \item \textbf{Learning Step}: Fix $\bm{r}$, update the parameter $\theta$ by solving the outer maximization problem.
    \item \textbf{Repeat}: Repeat steps 2--3 until a suitable stopping condition is met, such as reaching a stationary point or hitting the maximum number of iterations.
\end{enumerate}

The following sections provide a detailed prescription for solving the optimization problems in both the adversarial and learning steps.
Subsequently, we will demonstrate how it can be employed to train robust probabilistic models. 

\subsection{Adversarial Step}
\label{sec:adv-step}
The optimization problem we need to solve in the adversarial step is 
\begin{equation}
\label{eq:adv-problem}
\begin{aligned}
&  \inf_{\bm{r} }  \sum_{i=1}^n r_i \cdot l_i \\
 \text{s.t.} &  \\
& \frac{1}{n} \sum_{i=1}^n r_i \log r_i \le \delta,  \quad \frac{1}{n} \sum_{i=1}^n r_i = 1, \quad r_i \ge 0  ,
\end{aligned}
\end{equation}
where $l_i \equiv \log P_{\theta} (\bm{z_i})$ can be treated as a constant because the parameter $\theta$ is fixed. In addition, we drop the constant factor $\nicefrac{1}{n}$ from the objective function because it does not change the optimal weight vector $\bm{r}$.

We employ the method of Lagrange multipliers and derive the dual problem of the optimization problem in \eqref{eq:adv-problem} as 
\begin{equation}
\label{eq:adv-dual}
\sup_{\alpha \geq 0, \beta} \quad - \sum_i \alpha \cdot \exp \left ( \frac{-\beta - l_i}{\alpha} - 1 \right )  - \alpha n \delta - \beta n, 
\end{equation}
where $\alpha$ and $\beta$ are the Lagrange multipliers (the detailed derivation is given in Appendix Section~\ref{sec:dual-details}) and the value of the primal variable $r_i$ can be calculated as 
\begin{equation}
\label{eq:rab}
r_i = \exp \left ( \frac{-\beta - l_i}{\alpha} - 1 \right ).
\end{equation}
Note that the primal problem satisfies Slater's condition~\citep{slater2013lagrange}, which ensures strong duality.

We can attempt to solve \eqref{eq:adv-dual} by taking the derivatives with respect to $\alpha$ and $\beta$ and setting them equal to zero (assuming there is a non-negative solution for $\alpha$). Denoting the objective in \eqref{eq:adv-dual} as $L'(\alpha, \beta)$, we have
\begin{equation}
\label{eq:dl-dalpha}
\resizebox{0.9 \hsize}{!}{
$\frac{\partial L'}{\partial \alpha} = -n\delta + \sum_i \exp \left ( \frac{-\beta - l_i}{\alpha} - 1 \right ) \left ( \frac{-\beta - l_i}{\alpha} - 1 \right ) = 0$ 
}
\end{equation}
and
\begin{equation}
\label{eq:dl-dbeta}
\frac{\partial L'}{\partial \beta} = -n + \sum_i \exp \left ( \frac{-\beta - l_i}{\alpha} - 1 \right ) = 0.
\end{equation}
Because $r_i = \exp \left ( \frac{-\beta - l_i}{\alpha} - 1 \right )$, we can see that  \eqref{eq:dl-dalpha} is equivalent to $\sum_i r_i \log r_i = n\delta$; and \eqref{eq:dl-dbeta} is equivalent to $\sum_i r_i = n$. These two equations correspond to our original constraints, and no closed form solution for $\alpha$ and $\beta$ is available.



\begin{algorithm}[t]
\caption{Efficient Linearithmic Search Algorithm for the Adversarial Step} 
\label{alg:adv-search}
\textbf{Input}: (1) $\bm{l} = \{ l_1, \dots, l_n \}$, the loglikelihoods of all training data points (under the current model parameter $\theta$); (2) $U$, the upper search bound of the dual variable $\alpha$; (3) $\delta$, the hyper-parameter controlling the amount of distribution shift; and (4) $\epsilon$, the maximum allowed error for the variable $\alpha$. \\
\textbf{Output}: the weight vector $\bm{r}$ that solves the optimization problem \eqref{eq:adv-problem}. \\ 

% \textcolor{grey}{an example comment}  \\

$\alpha_l \gets 0$ \;
$\alpha_u \gets U$ \;
\While{$\alpha_u - \alpha_l \ge \epsilon$}{
    $\alpha \gets \nicefrac{ (\alpha_l+\alpha_u )}{2}$ \;
    $\beta \gets - \alpha \log \left(  \frac{ n } { \sum_i  \exp \left (  \nicefrac{-l_i  } {\alpha}  -1   \right )    } \right )  $ \;
    % $\bm{w} \gets \exp(-\nicefrac{\bm{l}}{\alpha})$ \textcolor{gray}{//unnormalized weight} \;
    % $\bm{r} \gets n \cdot \nicefrac{\bm{w}}{ sum(\bm{w}) }$ \;
    $\bm{r} \gets \exp \left (  \frac{-\beta - \bm{l}}{\alpha} -1  \right) $ \;
    $g \gets -n\delta + \sum_i  \log r_i^{r_i}  $ \textcolor{gray}{//numerical stability} \;
    \eIf{$g \le 0$}{  
        \textcolor{gray}{// non-positive gradient} \\
        $\alpha_u \gets \alpha$ \;
    }{
        $\alpha_l \gets \alpha$ \;
    }
}
Return the weight vector $\bm{r}$ \;
\end{algorithm}

Nevertheless, we note that the dual problem involves only two variables, $\alpha \ge 0$ and $\beta$, and the dual objective function in \eqref{eq:adv-dual} derived from the method of Lagrange multipliers is always concave. 
We further prove that the dual objective is twice differentiable and \emph{strictly} concave unless all log-likelihoods, $l_i$, are equal, which is unlikely given that $l_i$ is real-valued and there are usually many training instances (see Appendix Section~\ref{sec:strict-concav} for a detailed proof). 
This implies that numerical optimization algorithms like gradient ascent or coordinate ascent can be employed to approximate a \emph{globally} optimal solution arbitrarily well given enough iterations~\citep{tseng2001convergence}. 

We propose an efficient search-based algorithm capable of solving the problem and obtaining a solution that is arbitrarily close to the exact answer. 
Our algorithm is essentially a coordinate ascent algorithm with two key changes that guarantee linearithmic time complexity.
\begin{enumerate}
\item 
When conducting coordinate ascent along the $\beta$ direction ($\alpha$ is fixed), we can solve for $\beta$ in closed form using \eqref{eq:dl-dbeta} as 
\begin{equation}
\label{eq:beta-formula}
 \beta = - \alpha \log \left(  \frac{ n } { \sum_i  \exp \left (  \frac{-l_i  } {\alpha}  -1   \right )    } \right )   
\end{equation}

\item 
When optimizing along the $\alpha$ direction with $\beta$ fixed, we use binary search (highly efficient with guaranteed logarithmic time complexity) instead of gradient ascent because it is a \emph{one-dimensional strictly concave} maximization problem.

\end{enumerate}

With the above key observations, we can effectively binary search for the optimal $\alpha$ using the method shown in Algorithm~\ref{alg:adv-search}. Specifically, we first set $\alpha$ to the midpoint of the interval $[\alpha_l,\alpha_u]$, then compute the corresponding $\beta$ using \eqref{eq:beta-formula}, and use the sign of the derivative with respect to $\alpha$ evaluated at this point to narrow down the location of the optimal $\alpha$. Note that the initial upper bound $\alpha = U$ must yield a negative derivative, i.e., the derivative in \eqref{eq:dl-dalpha} must be negative when evaluated at the corresponding optimal $\beta$. One approach to obtaining such an upper bound is to keep multiplying $U$ by two until the gradient becomes negative.
In addition, we calculate $\log r_i^{r_i}$ as the surrogate for $r_i \log r_i$ for numerical stability.

The time complexity of Algorithm~\ref{alg:adv-search} is $O(n \log (\nicefrac{U}{\epsilon}) )$, where $n$ is the number of training instances, $U$ is the search upper bound, and $\epsilon$ is the error tolerance. The algorithm is very efficient even if $U$ is very large and a high accuracy is required. For example, when $U = 10^{20}$ (larger than the biggest 64-bit integer) and $\epsilon = 10^{-5}$, the number of bisection iterations, $\log_2 (\nicefrac{U}{\epsilon})$, is less than 84.
In practice, because the absolute loglikelihoods $|l_i|$ are usually small, the value of $U$ is small as well. Therefore, $\log_2 (\nicefrac{U}{\epsilon}) $ is typically less than 30. 






\subsection{Learning Step}
The optimization problem in the learning step is 
\begin{equation}
\label{eq:learn-fomular}
\argmax_{\theta} \sum_{i=1}^n r_i \cdot \log P_{\theta} (\bm{z_i}),    
\end{equation}
where $r_i$ is a fixed constant weight. 
In fact, the above problem is equivalent to learning a model via standard MLE in which each data instance $\bm{z_i}$ is associated with a weight $r_i$~\citep{legeleux2022gaussian}. Here, the weight can be interpreted as ``how many times we see the data instance''.\footnote{Non-integer values are allowed here; they do not break the evaluation of the joint likelihood.}
Therefore, given a data instance $\bm{z_i}$ that is observed $r_i$ times, the loglikelihood with respect to this instance can be formulated as 
\[
\log P_{\theta} (\bm{z_i}) ^ {r_i} =  r_i \log P_{\theta} (\bm{z_i}), 
\]
which corresponds to the weighted loglikelihood in \eqref{eq:learn-fomular}.

In general, the weighted MLE problem can be solved using the gradient ascent method~\footnote{We limit our focus to model parameters during the learning step, assuming the model's structure is fixed (if the model involves a structural learning component).} similar to the standard MLE case. 
In addition, for certain probabilistic models such as Multivariate Gaussians, this problem admits a closed form solution.

% \subsection{Case Study for NN-GBN Model}

% In this section, we explore how the adversarial and learning steps are conducted specifically for the NN-GBN model (see Section~\ref{sec:nn-gbn}). 

% During the adversarial step, we simply evaluate the loglikelihood of all training data points as $\bm{l} = \{l_1,...,l_n\}$ where $l_i = \log P_{\theta} (\bm{z_i})$ and then use Algorithm~\ref{alg:adv-search} to find the new weight $\bm{r}$. 
% This step is same for all  probabilistic models that admit efficient exact likelihood computation, which demonstrates the flexibility of this framework.

% For the learning step, because NN-GBN models the joint distribution as two parts, we have 
% \begin{align*}
% & \argmax_{\theta} \sum_{i=1}^n r_i  \log P_{\theta} (\bm{z_i}) \\
% & =  \argmax_{\alpha} \sum_{i=1}^n r_i  \log \text{MixMG}_{\alpha} (\bm{x_i}) \\ 
% & \quad + \argmax_{\beta} \sum_{i=1}^n r_i \log \text{GBN}_{NN_{\beta}(\bm{x_i})} (\bm{y_i}), 
% \end{align*}
% where $\theta = \{\alpha, \beta \}$. 
% As we can see, the learning step of NN-GBN breaks down into two weighted MLE learning subproblems. 
% For the first part, fitting a MixMG model on weighted data can be efficiently done using the EM algorithm where the solution for the E-step and M-step are still in closed form~\citep{legeleux2022gaussian}.  
% For the second part, we add an extra weight term into the original loss function used by \citet{dong2022conditionally} for training the neural network $NN_{\beta}$, and return the weighted negative loglikelihood as the loss value. 
% The whole network is then trained in a mini-batch fashion with the Adam optimizer in PyTorch~\citep{paszke2019pytorch}.


\subsection{Practical Concerns}
\label{sec:practical}
The adversarial step is usually significantly faster (typically around 1--5 seconds) than the learning step, which often takes minutes or even hours due to the iterative EM or gradient optimization processes over neural networks.
Therefore, executing EM or NN gradient updates until convergence in the learning step to obtain an accurate solution of \eqref{eq:learn-fomular} would be highly inefficient. 
It is also essential to recognize that running EM to convergence is likely unnecessary, given that, in early rounds, the current weight vector $\bm{r}$ is sub-optimal and will be changed in subsequent iterations.

Therefore, a practical strategy is to execute the learning step for only a few iterations, aiming to identify a good or moderate parameter configuration under the current weight settings.
After that, we promptly transition to the adversarial step to update the weight vector.
This approach enhances the efficiency of the entire adversarial learning process and enables multiple iterations between the adversarial and learning steps. 












% ------------ the following content are outdated -----

% \subsection{General formulation}
% We start from the final DRSL formulation that is summarized at the 2nd page of this paper~\footnote{https://proceedings.mlr.press/v80/hu18a.html}. The detailed formulation is as follows:
% $$
% \min_{\theta} \sup_{\bm{r} \in U_f} \frac{1}{N} \sum_{i=1}^N r_i \cdot l_i(\theta)
% $$
% where
% $$
% U_f = \left \{   \bm{r} \bigm| \frac{1}{N} \sum_i f(r_i) \le \delta, \frac{1}{N} \sum_i r_i = 1, r_i \ge 0  \right \}
% $$.

% In the above formulation, $f(x) = x \log x$, which instantiates the f-divergence as KL-divergence. The weight $r_i = \frac{q(x_i)}{p(x_i)}$. 

% This problem can be solved efficiently by iteratively doing 
% \begin{enumerate}
%     \item fix the weight $\bm{r}$, learn a best parameter $\theta$ for the classifier/regressor on weighted data (training data $(x_i,y_i) $ with weight $r_i$).
%     \item fix the parameters $\theta$, compute the current loss $l_i(\theta) = l(g_{\theta}(x_i), y_i)$ for each data point  $(x_i,y_i) $. Solve the inner maximization problem to get the new weight $\bm{r}$. This can be done easily in $O(n \log n)$ time as well.
% \end{enumerate}


% \subsection{Plug in our model}
% The learning of the whole model falls into the MLE setting as follows
% $$
% \max_{\theta} \log \mathcal{L_{\theta}}(\mathcal{D}) = \max_{\theta} \sum_{i=1}^N \log P_{\theta} (z_i)
% $$
% . Because the design of our model, it can further split into two parts.
% \begin{multline}
%  \max_{\theta} \sum_{i=1}^N \log P_{\theta} (z_i) \\ =  \max_{\alpha} \sum_{i=1}^N \log P_{\alpha} (x_i) + \max_{\beta} \sum_{i=1}^N \log P_{\beta} (y_i|x_i)   
% \end{multline}

% Let's focus on the later part and see how DRSL can be applied in our case. Specifically, we have 
% $$
%  P_{\beta} (y_i|x_i)  =  P_{\gamma}(y_i), \gamma = G_{\beta} (x_i)
% $$
% where $G$ is our parameter generation neural network (PGNN) with the parameters $\beta$. Learning the distribution $P(Y|X)$ in our case is essentially learning the parameter $\beta$ of PGNN.

% Let's define the loss function as
% $$
% l_i(\theta) = l(G_{\beta}(x_i), y_i) = l(\gamma, y_i) \equiv - \log P_{\gamma}(y_i).
% $$

% Then we have 
% $$
% \max_{\beta} \sum_{i=1}^N \log P_{\beta} (y_i|x_i) = \min_{\beta} \sum_{i=1}^N l(G_{\beta}(x_i), y_i)
% $$
% This problem is essentially a supervise learning problem with a special loss function under the framework of empirical risk minimization (ERM) framework. 
% Our goal is to learn and regressor $G$ with parameter $\beta$ such that the empirical loss is minimized. 

% Because of this, instead of using ERM framework, we can also apply the DRSL framework as well, and it will help us to learn a better PGNN model $G_{\beta}$.

% By plug in our customized loss function into the DRSL formulation, we have 
% $$
% \min_{\beta} \sup_{\bm{r} \in U_f} \sum_{i=1}^N - r_i \cdot \log P_{\gamma = G_{\beta}(x_i)}(y_i)
% $$
% where
% $$
% U_f = \left \{   \bm{r} \bigm| \frac{1}{N} \sum_i r_i \cdot \log r_i \le \delta, \frac{1}{N} \sum_i r_i = 1, r_i \ge 0  \right \}
% $$.



