%\documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025}

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{float}
\usepackage{multirow} 
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{xcolor}
\newcommand{\YL}{\color{magenta} }  \newcommand{\YLe}{\color{black} }

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% \theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% \theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


%%%==============
\newcommand{\ucal}{\mathcal{U}}
\newcommand{\lcal}{\mathcal{L}}
\newcommand{\nbb}{\mathbb{N}}
\newcommand{\bz}{\mathbf{z}}
\newcommand{\bw}{\mathbf{w}}
\newcommand{\fcal}{\mathcal{F}}
\newcommand{\ibb}{\mathbb{I}}
\newcommand{\xcal}{\mathcal{X}}
\newcommand{\wcal}{\mathcal{W}}
\newcommand{\vcal}{\mathcal{V}}
\newcommand{\rcal}{\mathcal{R}}
\newcommand{\ccal}{\mathcal{C}}
\newcommand{\hcal}{\mathcal{H}}
\newcommand{\bW}{\mathbf{W}}
\newcommand{\bU}{\mathbf{U}}
\newcommand{\bu}{\mathbf{u}}
\newcommand{\bs}{\mathbf{s}}
\newcommand{\zcal}{\mathcal{Z}}
\newcommand{\ncal}{\mathcal{N}}
\newcommand{\acal}{\mathcal{A}}
\newcommand{\tcal}{\mathcal{T}}
\newcommand{\ecal}{\mathcal{E}}
\newcommand{\dcal}{\mathcal{D}}
% \newcommand{\bb}{\tilde{b}}
\newcommand{\bb}{b_2}
\newcommand{\aaa}{b_1}
% \newcommand{\aaa}{\ebb_{\phi\sim\pbb}[\gamma_\phi]}
\newcommand{\ycal}{\mathcal{Y}}
\newcommand{\pcal}{\mathcal{P}}
\newcommand{\gcal}{\mathcal{G}}
\newcommand{\bp}{\mathbf{p}}
\newcommand{\ocal}{\mathcal{O}}
\newcommand{\ebb}{\mathbb{E}}
\newcommand{\qbb}{\mathbb{Q}}
\newcommand{\bg}{\mathbf{g}}
\newcommand{\be}{\mathbf{e}}
\newcommand{\bv}{\mathbf{v}}
\newcommand{\emp}{\mathcal{E}_{\bz}}
\newcommand{\rbb}{\mathbb{R}}
\newcommand{\pbb}{\mathbb{P}}
\newcommand{\red}{\color{red}}
% \numberwithin{equation}{section}
\usepackage{graphicx}
\newcommand{\zsj}{\color{orange}Sijia: }
\newcommand{\td}{\color{red}TODO: }

 % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Learning to Sample in Stochastic Optimization %A Generic Analysis via PAC-Bayes Sampling in \\  Algorithm Robustness and Visualization?
}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors Sijia Zhou\inst{1} \Letter \and

% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2025 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Sijia Zhou}
\author[2]{Yunwen Lei}
\author[1]{Ata Kab\'an}

% Add affiliations after the authors
\affil[1]{%
    University of Birmingham, Birmingham B15 2TT, United Kingdom 
}
\affil[2]{%
    University of Hong Kong,
  Pokfulam, Hong Kong, China
}
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  

\begin{document}
\maketitle


\begin{abstract}
We consider a PAC-Bayes analysis of stochastic optimization algorithms, and devise a new stochastic gradient descent ascent (SGDA) algorithm inspired by our bounds. Our algorithm learns a data-dependent sampling scheme along with the model parameters, which may be seen as assigning a probability to each training point. 
We demonstrate that learning the sampling scheme increases robustness against misleading training points, as our algorithm learns to avoid bad examples during training. 
We conduct experiments in both standard and adversarial learning problems on several benchmark datasets, and demonstrate various applications including interpretability upon visual inspection, and robustness to the ill effects of bad training points. We also extend our analysis to pairwise SGD to demonstrate the generalizability of our methodology. 
\end{abstract}


\section{Introduction} 
%{\td shorten. }
%%%
Stochastic optimization is a cornerstone in training deep learning models on large-scale datasets.
%Stochastic Gradient Descent (SGD) and Stochastic Gradient Descent Ascent (SGDA) are widely used in large-scale minimization and min-max optimization problems in machine learning due to their efficiency. 
These algorithms employ sampling strategies to estimate gradients and improve computational efficiency. While uniform sampling of training points is the classic approach in these optimization methods, recent studies have explored data-dependent sampling to accelerate convergence, reduce the variance of the gradient estimates, and enhance prediction accuracy 
\citep{zhao2015stochastic, allen2016even, katharopoulos2017biased, johnson2018training, wu2017sampling, han2022rethinking}. 

In real-world datasets, training examples may have varying degrees of relevance to the target: some are less typical than others, and the data may contain noise or outliers. Identifying difficult examples \citep{agarwal2022estimating} is an active area of research.

Recent works \citep{london2017pac,zhou2023toward} made a start at developing theory to explain generalization of Stochastic Gradient Descent (SGD) with non-uniform sampling, by combining algorithmic stability \citep{bousquet2002stability,bousquet2020sharper} with the PAC-Bayes framework \citep{shawe1997pac,mcallester1999some}. However, those works only considered SGD, and only a few give practical algorithms to exploit the potential of the analysis.

In this paper, we take advantage of the PAC-Bayes machinery, namely that bounds hold uniformly for all sampling distributions over the indices of training points. This allows us to learn a sampling strategy from the data to maximize accuracy by minimizing the generalization bound. 
We also anticipate increased robustness due to the model averaging built into the PAC-Bayes framework. In addition, the learned sampling distribution is readily interpretable as weights on individual training examples, providing new avenues for inspecting the training data. 

Moreover, we extend the analytic framework to Stochastic Gradient Descent Ascent (SGDA). % [and Pairwise SGD]. 
SGDA has gained attention in various areas, such as adversarial training \citep{sinha2017certifying}, generative adversarial networks (GANs) \citep{sanjabi2018convergence}, robust optimization \citep{namkoong2016stochastic}, and reinforcement learning \citep{dai2018sbeed}. In particular, it plays a key role in adversarial training, a primary defense against adversarial attacks on deep neural networks. This process is framed as a min-max optimization problem, where the goal is to optimize the model while accounting for worst-case perturbations introduced by adversaries.

%However, most adversarial trainingapproaches assume clean datasets with correctly labeled data, whereas real-world datasets often contain mislabeled or corrupted examples. Training on such noisy datasets using traditional stochastic optimization methods can distort model parameters and degrade performance.

%To address these challenges, we build on a previvous theoretical framework that integrates algorithmic stability and PAC-Bayes analysis \citep{london2017pac,zhou2023toward} and we offer robust theoretical guarantees for SGDA under data-dependent sampling. This framework accounts for noisy labels and adversarial perturbations, demonstrating improved resilience to data poisoning compared to prior PAC-Bayes approaches and robust algorithms. Our method optimizes the sampling distribution to enhance generalization, identify mislabeled or atypical examples, and strengthen adversarial defenses. We empirically validate our approach on standard and adversarial trainingsettings, to demonstrate that our methods not only improve training stability but also enhance interpretability in adversarial and noisy data environments. 


%%====
Beyond SGDA, we extend our analysis to pairwise SGD. 
Pairwise comparisons provide deeper insights into model behavior, particularly in distinguishing human and machine visual scene recognition, which cannot be fully addressed through pointwise analysis alone. By visualizing pairwise comparisons, we uncover models' behavior when faced with atypical examples, offering a deeper understanding of the optimization process. 

Our contributions are summarized as follows:


%Our main theoretical contribution establishes generalization bounds for SGD and SGDA in both smooth and non-smooth convex cases, considering data-dependent sampling distributions. Our approach balances empirical risk minimization with keeping closeness to the prior distribution, ensuring stability during optimization.

%Building on this, we propose a theory-driven algorithm that dynamically adjusts the sampling distribution based on Q-scores, which quantify individual data point information. By leveraging Q-scores, we refine training dynamics, strengthen models defenses, and improve post-processing techniques for adversarial analysis. Departing from uniform sampling, this method enhances robustness against data poisoning and adversarial perturbations.


%Experiments on linear regression and neural networks demonstrate the effectiveness of our algorithms, particularly in adversarial trainingand data poisoning scenarios. Our findings advance the understanding of stochastic optimization in robust machine learning.

\begin{itemize}
\item We prove the sub-exponential stability property of SGDA and establish PAC-Bayes generalization bounds in smooth and non-smooth cases. Contrary to classic PAC-Bayes, our methodology does not require randomization of the model parameters but exploits the stochasticity of the gradient-based optimizer instead.  
\item Our methodological framework can be used to devise new stochastic optimization algorithms by minimizing generalization bounds w.r.t.\ the sampling distribution. We demonstrate the generality of this approach, obtaining two algorithms: SGDA-Q and pairwise SGD-Q.
\item 
We conduct experiments to evaluate the proposed algorithms in both adversarial and standard training. Our results demonstrate the robustness of these algorithms on several tasks, including pairwise learning.
\end{itemize}

\section{Related Work}
Non-uniform, data-dependent sampling strategies are widely used in stochastic optimization. One example is importance sampling \citep{zhao2015stochastic}, where samples are selected with probability proportional to the gradient norm in order to reduce the variance of the gradient. This was shown to accelerate training. Various approximation methods have been devised to enhance the computational efficiency in implementing this idea \citep{johnson2018training, katharopoulos2018not}. 

Some works use loss-based sampling for faster convergence \citep{katharopoulos2017biased, london2017pac}, while others propose upper bounds on gradient norms for improved performance \citep{katharopoulos2018not}. Alternative approaches include distance-based sampling \citep{wu2017sampling}, multi-armed bandit frameworks \citep{salehi2018coordinate, liu2020adam}, and data-dependent sampling for coordinate selection \citep{allen2016even}. Despite these advances, generalization analysis for non-uniform sampling remains limited, which we address in this paper.

The classic PAC-Bayes framework can compute numerical generalization bounds where the weights follow the prior and posterior distributions \citep{perez2021tighter,dziugaite2017computing}, with tighter results achieved using a learned, data-dependent prior \citep{ambroladze2006tighter,parrado2012pac,rivasplata2018pac,dziugaite2021role,dziugaite2018data,perez2021learning}. 

In this paper, we use the idea of algorithms inspired by PAC-Bayes bounds. However, contrary to existing works, we exploit the intrinsic randomness of stochastic algorithms. The indices of examples chosen for estimating the gradient directions during training are treated as hyperparameters, which follow a uniform PAC-Bayes prior at first, and a data-dependent PAC-Bayes posterior that we learn from data by minimizing the bound.

\if 0
\section{Main Results}
Let $\mathcal{D}$ be an unknown distribution on sample space $\mathcal{Z}$.  We denote by $\wcal$, $\vcal \subseteq \rbb^d$ the parameter space, and $\Phi$ will be a hyperparameter space. In the context of stochastic optimization algorithms, the hyperparameter is the random sequence of indices of training points used to approximate the gradient throughout iterations. The PAC-Bayes framework allows us to model this stochasticity by defining two discrete distributions on the hyperparameter space: The prior denoted by $\pbb$ and the PAC-Bayes posterior denoted by $\qbb$. In the paper, we always set $\pbb$ as the uniform distribution and learn $\qbb$ from the data. 

Given a training set $S = \{z_1,\ldots, z_n\}$ drawn i.i.d. from $\mathcal{D}$, and a hyperparameter $\phi\in\Phi$, a learning algorithm $A$ returns a model parameterized by $A(S;\phi )\in\wcal$. 
For SGDA, we define $A(S; \phi) := \left(A_{\mathrm{w}}(S; \phi), A_{\mathrm{v}}(S; \phi)\right) \in \mathcal{W} \times \mathcal{V}$.%, where $\ell: \mathcal{W} \times \mathcal{V} \times \mathcal{Z} \to \mathbb{R}$.  
We have the minimax optimization problem formulated as follows. By a slight abuse of notation, we will write the loss function $\ell$ as having either two or three arguments. %\YL Furthermore, it seems that we do not require to define $\ell$ for standard problems \YLe
\begin{multline*}
R(A(S;\phi)) = \min _{\mathbf{w} \in \mathcal{W}} \max _{\mathbf{v} \in \mathcal{V}} R(A_{\mathrm{w,v}}(S; \phi)) \\ :=\mathbb{E}_{z \sim \mathbb{D}}[\ell\left(A_{\mathrm{w,v}}(S; \phi), z\right)].    
\end{multline*}
Since $\mathcal{D}$ is unknown, the empirical risk typically serves as a proxy:
\begin{multline*}
R_S(A(S;\phi)) = R_{S}(A_{\mathrm{w,v}}(S; \phi)) \\ =\frac{1}{n} \sum_{i=1}^{n} \ell\left(A_{\mathrm{w,v}}(S; \phi), z\right).\end{multline*}
% {\zsj add empirical risk and risk in pairwise cases. put neural in arxiv and cite before theorem 4.5.}

We denote the difference between the risk and the empirical risk (i.e. the generalization gap) by $G(S,\phi) := R(A(S;\phi)) - R_S(A(S;\phi))$. 

%By considering a PAC-Bayes analysis, we can transfer the results hold for simple prior (e.g. uniform) to those holding for all posterior distributions. 
%Here, for stochastic algorithms, we consider the inherent randomness of algorithms, e.g. sampling indices, to follow the distributions.  
In the PAC-Bayes framework %, we have a sample-independent prior $\pbb$ and a sample-dependent posterior $\qbb$ on the hyperparameter space $\Phi$. The 
we work with the 
expected risk and expected empirical risk w.r.t. $\qbb(S)$, which we refer to simply as $\qbb$ for brevity, defined as:
\begin{equation}
R( \qbb) = \mathop{\mathbb{E}}\limits_{\phi \sim \qbb} [R(A(S;\phi))], \quad  R_S( \qbb) = \mathop{\mathbb{E}}\limits_{\phi \sim \qbb} [R_S(A(S;\phi))].\end{equation}
For uniform SGDA and $\ell:= \max _{\mathbf{v}}\ell\left(\bw,\bv; \cdot \right)$, the stochastic gradient for updating $\textbf{w}_T(\phi)$ and $\textbf{v}_T(\phi)$ is given as %\YL this is not a stochastic algorithm as there is no example selected\YLe
\begin{align}
\begin{cases} \bv_{t+1}= \bv_{t}+\eta_t\nabla_{\bv}\ell((\bw_{t},\bv_{t});z_{i_t} ),\\ \bw_{t+1}= \bw_{t}-\eta_{t}\nabla_{\bw}\ell((\bw_{t},\bv_{t});z_{i_t} ),\end{cases} \label{update:sgda}\end{align}
where at $t$-th iteration, $z_{i_t}$ is independently drawn 
from the uniform distribution over $[n]$, for $[n]:=\{1,\dots,n\}$.

Next, we give our main theorem for the generalization bounds for SGDA, whose proof is given in Appendix \ref{prf:sgda}.
% {\zsj lemmas and definitions included in the main content?}
We first introduce several definitions \citep{zhang2021generalization}. 
\begin{definition} [Lipschitz\label{def:sgda:lip}]
Let $L \geq0$. 
For any $z$, we say $\ell: (\bw, \bv) \mapsto  \ell((\bw, \bv) ; z)$  is $L$-Lipschitz if the following inequalities hold for all $\bw \in \mathcal{W}$, $\bv \in \mathcal{V}$ and $z \in \mathcal{Z}$
\[\left\|\nabla_{\bw} \ell((\bw, \bv) ; z)\right\|_{2} \leq L \quad \text{and} \quad \left\|\nabla_{\bv} \ell((\bw, \bv) ; z)\right\|_{2} \leq L.\]   
\end{definition}

% \YL first one needs to define strong convexity\YLe

We say $\ell$ is convex if for any $\bw_1,\bw_2\in \wcal$, we have $
\ell(\bw_1) \geq \ell(\bw_2) +  \big\langle \nabla \ell(\bw_2), \bw_1 - \bw_2 \big\rangle.$
We say $\ell$ is concave if $-\ell$ is convex.

\begin{definition}[Convexity-Concavity\label{defn_strongly_convex_sgda}] We say $\ell: (\bw, \bv) \mapsto  \ell((\bw, \bv) ; z)$ is  convex-concave if for any  $\mathbf{v} \in \mathcal{V}$, the function  $\mathbf{w} \mapsto \ell(\mathbf{w}, \mathbf{v})$  is  convex and for any  $\mathbf{w} \in \mathcal{W}$, the function  $\mathbf{v} \mapsto \ell(\mathbf{w}, \mathbf{v})$  is   concave.
\end{definition}


\begin{definition} [Smoothness\label{sgda:smooth}]
$\ell:(\bw, \bv) \mapsto  \ell((\bw, \bv) ; z)$ is said to be $\alpha$-smooth, $ \alpha>0$, if for all $ \bw_1$, $\bw_2 \in \mathcal{W}$, $\bv_1$, $\bv_2 \in \mathcal{V}$ and  $z \in \mathcal{Z}$, the following holds 
\[\!\Big\|\!\Big(\!\begin{array}{c}\!
\nabla_{\bw } \ell((\!\bw_1,\! \bv_1); z)\!-\!\nabla_{\bw } \ell((\!\bw_2,\! \bv_2); z )\! \\
\nabla_{\bv } \ell((\!\bw_1,\! \bv_1); z)\!-\!\nabla_{\mathbf{v}} \ell((\!\bw_2,\! \bv_2); z)\!
\end{array}\!\Big)\!\Big\|_{2}\! \leq \!\alpha \Big\|\Big(\!\begin{array}{c}\!
\bw_1\!-\!\bw_2\! \\
\!\bv_1\!-\!\bv_2\!
\end{array}\!\Big)\!\Big\|_{2}\!.\] \end{definition}

\if 0 
\begin{lemma}[{Stability of SGDA}\label{lem:sta_sgda}]
Let $\{\bw_t,\bv_t\}, \{\bw_t',\bv_t'\}$ be the sequences produced by SGDA on $S$ and $S'$ respectively with uniform distribution $\pbb$ and fixed step sizes. Assume $\ell$ is convex-concave and $L$-Lipschitz. At the $t$-th iteration, SGDA is  
\begin{itemize}
    \item[1)] 
At the $t$-th iteration,
Assumption \ref{ass:beta-theta} holds with \[\aaa = 2\sqrt{2e}L^2\eta (\sqrt{t}+ 2t/n) \quad \text{and} \quad \bb= 4\sqrt{2e}L^2\eta (1+\sqrt{ 2t/n}).\] 
\item[2)] In addition, we assume the Assumption \ref{sgda:smooth} holds. At t-th iteration, Assumption \ref{ass:beta-theta} holds with \[\aaa = 4\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2 t\eta^2)(1+2t/n)   \text{ and }   \bb= 8\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2 t\eta^2)(1+\sqrt{2t/n}). \]\end{itemize} \end{lemma}
 

We combine the above lemma with Lemma \ref{thm:main} to obtain bounds for SGDA with a general sampling distribution.
\fi 

\begin{theorem}[{Generalization bounds for SGDA}\label{cor_sgda_smooth}]
Assume $\ell$ is $M$-bounded, convex-concave and $L$-Lipschitz. 
For any $\delta \in (0, 1)$ and uniform prior distribution $\pbb$, with probability at least $1-\delta$ over $S$, $S \sim \mathcal{D}^n$, the following bounds hold for SGDA with fixed step sizes and all posterior sampling distribution $\qbb$ on $ [n]^T$ . At $T$-th iterations, we have 
\begin{multline*} 
\ebb_{\phi\sim\qbb}\left[G(S,\phi)\right]\lesssim \Big(\! \text{\rm{KL}}(\qbb\|\pbb)+\log\frac{1}{\delta}\Big)\\ \max\Big\{L^2\eta (\sqrt{T}+T/n )\log^2 n,\frac{M}{\sqrt{n}}\Big\}.
\end{multline*}
In addition, if $\ell$ is $\alpha$-smooth, we have
\begin{multline*}  \ebb_{\phi\sim\qbb}\big[G(S,\phi)\big]\lesssim  
  \big(\text{\rm{KL}}(\qbb\|\pbb)+\log(1/\delta)\big)\\\max\Big\{ L^2\eta \exp(\alpha^2 t\eta^2)\Big( \frac{T}{n} + 1 + \sqrt{\frac{T}{n}} 
 \Big)\log^2 n,\frac{M}{\sqrt{n}}\Big\}.
\end{multline*}
\end{theorem}

Theorem \ref{cor_sgda_smooth} implies that if we choose $T=O(n^{2})$ and $\eta=O (T^{-\frac{3}{4}})$ gives nonvacuous results of the order $\widetilde{O}(\frac{1}{\sqrt{n}})$. In smooth cases, if we choose $T=O(n)$ and $\eta=O(\frac{1}{\sqrt{n}})$, this gives the bounds of the order $\widetilde{O}(\frac{1}{\sqrt{n}})$. 

% We also give generalization bounds of pairwise SGD. The proof is given in appendix \ref{}. {\td add proof?}
% \begin{theorem}[{Generalization bounds for pairwise SGD}\label{cor_sgd_pairwise}]
% Let the assumptions in Theorem \ref{cor_sgd_smooth} hold.
% For any $\delta \in (0, 1)$ and uniform prior $\pbb$, the following holds with fixed step sizes and all posterior distribution $\qbb$ 
%  \begin{multline*} \ebb_{ \qbb}\left[G(S,\phi)\right]\!\lesssim \!\Big( \!\rm{KL}(\qbb\|\pbb)\!+\!\log\frac{1}{\delta}\!\Big) \\\max\!\Big\{\!L^2\eta \Big(\sqrt{T}\!+\!\frac{T}{n}\!+ \!\sqrt{\frac{T}{n}
% }\Big)\log^2 n,\frac{M}{\sqrt{n}}\!\Big\}.
%  \end{multline*}
% \end{theorem}
For sampling distribution of indices, both $\pbb$ and $\qbb$ are discrete distributions on hyperparameter space $\Phi$, we have KL divergence $\text{\rm{KL}}(\qbb\| \pbb):=\sum_{\phi \in \Phi} \qbb \log  \frac{\qbb}{\pbb}$. By considering a PAC-Bayes analysis, we can transfer the results hold for simple prior (e.g. uniform) to those holding for all posterior distributions. 

{\zsj SGD from here }For completeness, we provide notations and results for SGD. 
The generalization error, or risk, relative to a loss function $\ell$, is defined as 
\begin{equation}
R(A(S;\phi)) = \mathbb{E}_{z \sim \mathcal{D}} [\ell(A(S;\phi),z)].
\end{equation}
For a pairwise loss, we have 
\begin{equation}
R(A(S;\phi)) = \mathbb{E}_{z,\tilde{z} \sim \mathcal{D}} [\ell(A(S;\phi),z,\tilde{z})].%\label{Risk}
\end{equation}
For pointwise SGD, the empirical risk is 
\begin{equation}
R_S(A(S;\phi)) = \frac{1}{n}\sum_{i=1}^{n} \ell(A(S;\phi),z_i).
\end{equation}  
For a pairwise loss, we have
\begin{equation}
R_S(A(S;\phi)) = \frac{1}{n(n-1)}\sum_{i,j\in[n]:i\neq j}\ell(A(S;\phi),z_i,z_j).
\end{equation} 
For uniform SGD and $\ell:= \ell\left(\bw; \cdot \right) $, we have the update rule 
\begin{align}
\mathbf{w}_{t+1} := \mathbf{w}_t - \eta_t \nabla_{\mathbf{w}} \ell(\mathbf{w}_t;z_{i_t} ).\label{wupds}
\end{align}
Recent works give generalization bounds of pointwise and pairwise SGD \citep{london2017pac,zhou2023toward}.  
\begin{theorem}[\label{cor_sgd_smooth}\citep{zhou2023toward}]
Assume $\ell$ is $M$-bounded, convex and $L$-Lipschitz.
For any $\delta \in (0, 1)$ and uniform prior $\pbb$, the following bounds hold for SGD with fixed step sizes and all posterior sampling distribution $\qbb$ 
 \begin{multline*} \ebb_{ \qbb}\left[G(S,\phi)\right]\!\lesssim \!\Big( \!\text{\rm{KL}}(\qbb\|\pbb)\!+\!\log\frac{1}{\delta}\!\Big) \\\max\!\Big\{\!L^2\eta \Big(\sqrt{T}\!+\!\frac{T}{n}\!+ \!\sqrt{\frac{T}{n}
}\Big)\log^2 n,\frac{M}{\sqrt{n}}\!\Big\}.
 \end{multline*}
\end{theorem}



Next, we consider to develop learning algorithms based on the main result for these stochastic optimization methods.

\fi


\section{Preliminaries} \label{sec:pre}
 
Let $\mathcal{D}$ be an unknown distribution on a sample space $\mathcal{Z}$.  We denote by $\wcal$, $\vcal \subseteq \rbb^d$ the parameter spaces, and by $\Phi$ a hyperparameter space. In the context of stochastic optimization algorithms, the hyperparameter is the random sequence of indices of training inputs used to approximate the gradient throughout iterations. The PAC-Bayes framework allows us to model this stochasticity by defining two discrete distributions on $\Phi$: the prior denoted by $\pbb$ and the PAC-Bayes posterior denoted by $\qbb$. In this paper, we always set the prior $\pbb$ as the uniform distribution and learn the posterior $\qbb$ from the data. 

Given a training set $S = \{z_1,\ldots, z_n\}$ drawn i.i.d. from $\mathcal{D}$, and a hyperparameter $\phi\in\Phi$, a learning algorithm $A$ returns a model parameterized by $A(S;\phi )$, mapping the training inputs to a hypothesis $h\in \mathcal{H}$. 

The generalization error, or risk, relative to a loss function $\mathcal{L}$, is defined as 
\begin{equation}
R(A(S;\phi)) = \mathbb{E}_{z \sim \mathcal{D}} [\mathcal{L}(A(S;\phi),z)].
\end{equation}
Since $\mathcal{D}$ is unknown, the empirical risk serves as a proxy:
\begin{equation}
R_S(A(S;\phi)) = \frac{1}{n}\sum_{i=1}^{n} \mathcal{L}(A(S;\phi),z_i).
\end{equation}  

% {\zsj add empirical risk and risk in pairwise cases. put neural in arxiv and cite before theorem 4.5.}

We denote the difference between the risk and the empirical risk (i.e., the generalization gap) by $G(S,\phi) := R(A(S;\phi)) - R_S(A(S;\phi))$. 
%By considering a PAC-Bayes analysis, we can transfer the results hold for simple prior (e.g. uniform) to those holding for all posterior distributions. 
%Here, for stochastic algorithms, we consider the inherent randomness of algorithms, e.g. sampling indices, to follow the distributions.  
In the PAC-Bayes framework, %, we have a sample-independent prior $\pbb$ and a sample-dependent posterior $\qbb$ on the hyperparameter space $\Phi$. The 
we work with the expected risk and expected empirical risk w.r.t. $\qbb(S)$, to which we refer as $\qbb$ for brevity, defined as:
\begin{equation*}
R( \qbb) = \mathop{\mathbb{E}}\limits_{\phi \sim \qbb} [R(A(S;\phi))], \quad  R_S( \qbb) = \mathop{\mathbb{E}}\limits_{\phi \sim \qbb} [R_S(A(S;\phi))].\end{equation*}



% Recent works give generalization bounds for pairwise SGD \citep{zhou2025randomized}. {\zsj give pairwise}
% \begin{theorem}[\label{cor_sgd_smooth}\citep{zhou2025randomized}]
% Assume $\ell$ is $M$-bounded, convex and $L$-Lipschitz.
% For any $\delta \in (0, 1)$ and uniform prior $\pbb$, the following bound holds w.p. $1-\delta$ for SGD with fixed step size, uniformly for all posterior sampling distribution $\qbb$ 
%  \begin{multline*} \ebb_{ \qbb}\left[G(S,\phi)\right]\!\lesssim \\ \!\Big( \!\text{\rm{KL}}(\qbb\|\pbb)\!+\!\log\frac{1}{\delta}\!\Big) \max\!\left\{\!L^2\eta \Big(\sqrt{T}\!+\!\frac{T}{n}\!\Big)\log^2 n,\frac{M}{\sqrt{n}}\!\right\}.
%  \end{multline*}
% \end{theorem}




\section{Main Results}

\begin{table*}[htb]
\renewcommand{\arraystretch}{1.2}
\centering
\begin{tabular}{|l|l|l|l|l|l|}
\hline
\textbf{Algorithm} & \textbf{Sampling}& \textbf{Reference} & \textbf{Assumption} & \textbf{Bound Type} & \textbf{Rate} \\
\hline
\multirow{2}{*}{SGDA} & Adaptive
    & {Theorem~\ref{cor_sgda_smooth} in this work} & L,C-C(S) & w.h.p.  & $\tilde{O}(1/\sqrt{n})$ \\ \cline{2-6} & Uniform
    & \cite{lei2021stability} & L,C-C(S) & In expectation & $O(1/\sqrt{n})$ \\
\hline
% \multirow{1}{*}{pointwise SGD} & Adaptive & Theorem 5,8 in \citep{zhou2023toward} & L,C(S) & w.h.p. & $\tilde{O}(1/\sqrt{n})$ \\
% \hline
\multirow{3}{*}{pairwise SGD} & Adaptive & {\cite{zhou2025randomized}} & L,C(S) & w.h.p. & $\tilde{O}(1/\sqrt{n})$ \\ \cline{2-6}
    & Uniform & \cite{lei2020sharper} & L,S,C   & w.h.p. & $\tilde{O}(1/\sqrt{n})$ \\ \cline{2-6}
   & Uniform &   \cite{lei2021generalizationb} & L,C & w.h.p. & $O(1/\sqrt{n})$\\
\hline
\end{tabular}
\caption{Summary of generalization rates, either in expectation or with high probability (w.h.p.), for optimization algorithms (SGDA and pairwise SGD) under various assumptions—including Lipschitz continuity (L), smoothness (S), convexity (C), and convex-concavity (C-C)—as a function of the sample size $n$.}\label{table:rate}
\end{table*}

Our first result is a generalization bound for SGDA with adaptive sampling (the proof is given in Appendix~\ref{prf:sgda}).
% {\zsj lemmas and definitions included in the main content?}
We first introduce some notation and definitions \citep{zhang2021generalization}. 

For SGDA, we have $\mathcal{L}(\bw; \cdot):= \max_{\bv}\ell\left((\bw,\bv); \cdot \right)$, where $\ell:\wcal\times\vcal\times\zcal\to\rbb_+$ and we define \[A(S; \phi) := A_{\mathrm{w,v}}(S; \phi) = (\bw,\bv)\in \mathcal{W} \times \mathcal{V}.\] %, where $\ell: \mathcal{W} \times \mathcal{V} \times \mathcal{Z} \to \mathbb{R}$.  
In SGDA, we seek the minimizer of the true risk, 
%By a slight abuse of notation, we will write the loss $\ell$ as having either two or three arguments. %\YL Furthermore, it seems that we do not require to define $\ell$ for standard problems \YLe
\begin{multline*}
\min _{\mathbf{w} \in \mathcal{W}}  R(A_{\mathrm{w,v}}(S; \phi))  = \min _{\mathbf{w} \in \mathcal{W}} \max _{\mathbf{v} \in \mathcal{V}}\mathbb{E}_{z \sim \mathcal{D}}[\ell\left(A_{\mathrm{w,v}}(S; \phi), z\right)].    
\end{multline*}
Since the true risk is unknown, we minimize the empirical risk instead, which defines the following minimax optimization problem: 
\begin{multline*}
\min _{\mathbf{w} \in \mathcal{W}} R_{S}(A_{\mathrm{w,v}}(S; \phi))  =\min _{\mathbf{w} \in \mathcal{W}} \max _{\mathbf{v} \in \mathcal{V}} \frac{1}{n} \sum_{i=1}^{n} \ell\left(A_{\mathrm{w,v}}(S; \phi), z_i\right).\end{multline*}
We say $\ell$ is convex if for any $\bw_1,\bw_2\in \wcal$, we have $
\ell(\bw_1,\cdot) \geq \ell(\bw_2,\cdot) +  \big\langle \nabla \ell(\bw_2,\cdot), \bw_1 - \bw_2 \big\rangle.$
We say that $\ell$ is concave if $-\ell$ is convex.

\begin{definition}[Convexity-Concavity\label{defn_strongly_convex_sgda}] We say $\ell: (\bw, \bv) \mapsto  \ell((\bw, \bv) ; z)$ is  convex-concave if for any  $\mathbf{v} \in \mathcal{V}$, the function  $\mathbf{w} \mapsto \ell((\mathbf{w}, \mathbf{v}),\cdot)$  is  convex and for any  $\mathbf{w} \in \mathcal{W}$, the function  $\mathbf{v} \mapsto \ell((\mathbf{w}, \mathbf{v}),\cdot)$  is   concave.
\end{definition}
\begin{definition} [Lipschitz\label{def:sgda:lip}]
Let $L \geq0$. 
For any $z$, we say $\ell: (\bw, \bv) \mapsto  \ell((\bw, \bv) ; z)$  is $L$-Lipschitz if the following inequalities hold for all $\bw \in \mathcal{W}$, $\bv \in \mathcal{V}$ and $z \in \mathcal{Z}$
\[\left\|\nabla_{\bw} \ell((\bw, \bv) ; z)\right\|_{2} \leq L \quad \text{and} \quad \left\|\nabla_{\bv} \ell((\bw, \bv) ; z)\right\|_{2} \leq L.\]   
\end{definition}

% \YL first one needs to define strong convexity\YLe

\begin{definition} [Smoothness\label{sgda:smooth}]
$\ell:(\bw, \bv) \mapsto  \ell((\bw, \bv) ; z)$ is said to be $\alpha$-smooth, $ \alpha>0$, if for all $ \bw_1$, $\bw_2 \in \mathcal{W}$, $\bv_1$, $\bv_2 \in \mathcal{V}$ and  $z \in \mathcal{Z}$, the following holds 
\begin{multline}
\Big\|\Big(\begin{array}{c}
\nabla_{\bw } \ell((\bw_1, \bv_1); z)-\nabla_{\bw } \ell((\bw_2, \bv_2); z ) \\
\nabla_{\bv } \ell((\bw_1, \bv_1); z)-\nabla_{\mathbf{v}} \ell((\bw_2, \bv_2); z)
\end{array}\Big)\Big\|_{2} \\ \leq \alpha \Big\|\Big(\begin{array}{c}
\bw_1-\bw_2 \\
\bv_1-\bv_2
\end{array}\Big)\Big\|_{2}.\end{multline}\end{definition}




Let $\qbb$ be a probability measure over $[n]^T$. SGDA with sampling scheme $\qbb$ updates $\textbf{w}_t(\phi)$ and $\textbf{v}_t(\phi)$ by %\YL this is not a stochastic algorithm as there is no example selected\YLe
\begin{align}
\begin{cases} \bv_{t+1}= \bv_{t}+\eta_t\nabla_{\bv}\ell((\bw_{t},\bv_{t});z_{i_t} ),\\ \bw_{t+1}= \bw_{t}-\eta_{t}\nabla_{\bw}\ell((\bw_{t},\bv_{t});z_{i_t} ),\end{cases} \label{update:sgda}\end{align}
where, at the $t$-th iteration, the example $z_{i_t}$ with $i_t\in [n]$ is selected by setting $i_t=\phi_t$, and $\phi=(\phi_1,\dots,\phi_T)\in [n]^T$ is drawn from $\qbb$. For sampling distributions of indices, both $\pbb$ and $\qbb$ are discrete distributions over $\Phi$, so their KL divergence is \[\text{\rm{KL}}(\qbb\| \pbb):=\sum_{\phi \in \Phi} \qbb(\phi) \log  \frac{\qbb(\phi)}{\pbb(\phi)}.\]  

We will assume throughout that ${\rm{KL}}(\qbb\|\pbb)\in \tilde{O}(1)$ when quantifying the rate of convergence of the forthcoming bounds. With the choice of $\pbb$ taken as the uniform distribution, this will be sufficient to allow us to account for a small fraction of outliers in algorithmic applications. 

%\YL the meaning of SGDA with sampling distribution $\qbb$ should be given here\YLe
\begin{theorem}[{Generalization bounds for SGDA}\label{cor_sgda_smooth}]
Assume $\ell$ is $M$-bounded, convex-concave and $L$-Lipschitz. 
For any $\delta \in (0, 1)$ and uniform prior $\pbb$, with probability at least $1-\delta$ over $S$, the following holds for SGDA with fixed $\eta$ and all posterior sampling distributions $\qbb$ on $[n]^T$, 
\begin{multline*} 
\ebb_{\phi\sim\qbb}\left[G(S,\phi)\right]\lesssim \\ \Big(\text{\rm{KL}}(\qbb\|\pbb)+\log\frac{1}{\delta}\Big) \max\left\{L^2\eta (\sqrt{T}+T/n )\log^2 n,\frac{M}{\sqrt{n}}\right\}.
\end{multline*}
In addition, if $\ell$ is $\alpha$-smooth, we have
\begin{multline*}  \ebb_{\phi\sim\qbb}\big[G(S,\phi)\big]\lesssim  
  \big(\text{\rm{KL}}(\qbb\|\pbb)+\log(1/\delta)\big)\\\max\Big\{ L^2\eta \exp(\alpha^2 T\eta^2)\Big( \frac{T}{n} + 1 + \sqrt{\frac{T}{n}} 
 \Big)\log^2 n,\frac{M}{\sqrt{n}}\Big\}.
\end{multline*}
\end{theorem}

Theorem~\ref{cor_sgda_smooth} implies that choosing $T=O(n^{2})$ and $\eta=O (T^{-\frac{3}{4}})$ gives nonvacuous bounds of the order $\widetilde{O}(\frac{1}{\sqrt{n}})$. In the smooth case, choosing $T=O(n)$ and $\eta=O(\frac{1}{\sqrt{n}})$ also gives bounds of the order $\widetilde{O}(\frac{1}{\sqrt{n}})$. 

% We also give generalization bounds of pairwise SGD. The proof is given in appendix \ref{}. {\td add proof?}
% \begin{theorem}[{Generalization bounds for pairwise SGD}\label{cor_sgd_pairwise}]
% Let the assumptions in Theorem \ref{cor_sgd_smooth} hold.
% For any $\delta \in (0, 1)$ and uniform prior $\pbb$, the following holds with fixed step sizes and all posterior distribution $\qbb$ 
%  \begin{multline*} \ebb_{ \qbb}\left[G(S,\phi)\right]\!\lesssim \!\Big( \!\rm{KL}(\qbb\|\pbb)\!+\!\log\frac{1}{\delta}\!\Big) \\\max\!\Big\{\!L^2\eta \Big(\sqrt{T}\!+\!\frac{T}{n}\!+ \!\sqrt{\frac{T}{n}
% }\Big)\log^2 n,\frac{M}{\sqrt{n}}\!\Big\}.
%  \end{multline*}
% \end{theorem}
The key benefit of our PAC-Bayes analysis is that results that hold for uniform sampling transfer to guarantees that hold for all posterior sampling distributions. 



Next, we give results for pairwise SGD. 

\textbf{Pairwise SGD:} For a pairwise loss, we define $\mathcal{L}(\bw ; \cdot):=\ell(\bw;\cdot,\cdot)$, where $\ell:\wcal\times(\zcal\times\zcal)\to\rbb_+$, and the risk is 
\begin{equation}
R(A(S;\phi)) = \mathbb{E}_{z,\tilde{z} \sim \mathcal{D}} [\ell(A(S;\phi),z,\tilde{z})].%\label{Risk}
\end{equation}
For a pairwise loss, the empirical risk is 
\begin{equation*}
R_S(A(S;\phi)) = \frac{1}{n(n-1)}\sum_{i,j\in[n]:i\neq j}\ell(A(S;\phi);z_i,z_j).
\end{equation*} 
At the $t$-th iteration of pairwise SGD, a pair of sample indices $\phi_t =(i_t, j_t)$ with $i_t, j_t \in [n]$ and $i_t \neq j_t$ is selected, where the sequence $\phi=(\phi_1,\dots,\phi_T)$ is drawn from $\qbb$ over $([n]\times[n])^T$. %This forms a sequence of index pairs $\phi=  (\phi_1,...,\phi_T)$. 
The update rule is $\bw_{t+1} = \bw_t \!-\! \eta_t \nabla \ell(\bw_t;z_{i_t},z_{j_t}).$
Recent work gives generalization bounds for pairwise SGD \citep{zhou2025randomized}. 
%We give the pairwise generalization bounds for  SGD, with the proof in Appendix \ref{app_pair_alg}.

\begin{theorem}[{Pairwise SGD}, \citep{zhou2025randomized}\label{cor_sgd_pairwise}]
Assume $\ell$ is $M$-bounded, convex and $L$-Lipschitz.
For any $\delta \in (0, 1)$ and uniform prior $\pbb$, with probability at least $1-\delta$ over $S$, the following bounds hold for pairwise SGD with fixed step sizes and all posterior sampling distributions $\qbb$ 
\begin{multline*} 
\ebb_{\phi\sim\qbb}\left[G(S,\phi)\right] \lesssim \\ \Big(\! \text{\rm{KL}}(\qbb\|\pbb)+\log\frac{1}{\delta}\Big)  \max\left\{L^2\eta (\sqrt{T}+T/n )\log^2 n,\frac{M}{\sqrt{n}}\right\}.
\end{multline*}
In addition, if $\ell$ is $\alpha$-smooth and $\eta \leq 2/{\alpha}$, we have
\begin{multline*}  \ebb_{\phi\sim\qbb}\big[G(S,\phi)\big]\lesssim  \\
  \big(\text{\rm{KL}}(\qbb\|\pbb)+\log\frac{1}{\delta}\big) \max\left\{ L^2\eta\Big( \frac{T}{n} \!+ \!1\!+\!\sqrt{\frac{T}{n}}
 \Big)\log^2 n,\frac{M}{\sqrt{n}}\right\}.
\end{multline*}
\end{theorem}



Table~\ref{table:rate} summarizes the generalization bounds for SGDA and pairwise SGD under various assumptions considered in our paper and in recent work \citep{lei2021generalizationb,lei2020sharper,lei2021stability,zhou2025randomized}, where $n$ is the sample size.

Next, we develop learning algorithms based on the generalization bounds for these stochastic optimization methods.


\subsection{Optimization of the Bounds w.r.t. Posterior \texorpdfstring{$\qbb$}{Q}} 

%We consider stochastic optimization methods, including SGDA and SGD, with a uniform sampling distribution. We will work with i.i.d. sampling of the training inputs, and a uniform prior $p$. Take pointwise SGDA as an example. At the $t$-th iteration, a sample index $\phi_t =\{i_t\}$ is uniformly randomly selected from $S$, giving a sequence of indices $\phi=(\phi_1,...,\phi_T)$ and parameter $\nu$.


Inspired by the r.h.s. of the PAC-Bayes generalization bound of Theorem \ref{cor_sgda_smooth}, in this section we devise a new SGDA-Q algorithm that learns a sampling distribution (along with the model's parameters) from the data. 

Recall that, at the $t$-th iteration, a sample index $\phi_t =\{i_t\}$ is randomly selected from $S$, giving a sequence of indices $\phi=(\phi_1,...,\phi_T)$ and parameter $\nu$. 


The PAC-Bayes posterior $\qbb$ in our bounds is a distribution on the set of trajectories. Here we denote by $q(S)$ the sampling distribution over $[n]$, which we refer to simply as $q$ for brevity, used to pick the next training point in the trajectory or training sequence.
Consequently, the following objective function resembles the form of the r.h.s. of the bounds. 
\begin{equation}  
\mathcal{L}(q) = \sum_{i=1}^{n} q(i){\mathcal{L}}\left( h; z_i\right) + \nu\cdot \text{KL}(q \| p) + \lambda \big( \sum_{i=1}^n q(i) - 1 \big),\label{obj_1}
\end{equation}
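where $h$ contains the model parameters, $p(i)=1/n$ for all $i\in [n]$ is the uniform prior, $\nu$ is a trade-off parameter weighting the KL term, and $\lambda$ is a Lagrange multiplier enforcing the normalization constraint $\sum_{i=1}^n q(i)=1$. 
We minimize this objective w.r.t. $q$; that is, we seek the $q$ that minimizes the expected empirical loss while staying close to the prior.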

Depending on the choice of $\mathcal{L}$, minimizing Eq. \eqref{obj_1} leads to our new algorithms. When $\mathcal{L}= \!\max _{\mathbf{v}}\!\ell\left((\bw,\bv); \cdot \right) $, the minimization of \eqref{obj_1} is carried out by SGDA-Q. Here, the suffix Q signifies that we are learning a sampling distribution $q$ alongside the parameter values.  

The pseudocode of the resulting algorithms, SGDA-Q and pairwise SGD-Q, is given in Algorithm~\ref{alg:example}. The derivation for pairwise SGD-Q is also given in Appendix~\ref{app:pairwise_alg} for completeness.

\begin{algorithm} % [ht]
   \caption{SGDA-Q/Pairwise SGD-Q}
   \label{alg:example}
\begin{algorithmic}[1]
   \STATE {\bfseries Inputs: }$ S$,  $\ell$, $\nu$ \\
{\bfseries Optimize :} $\bw$, $\bv$, $q$
\STATE{ $q \leftarrow$ uniform,  $t \leftarrow 1 $  }
\FOR{$k = 0,\dots $}
   \REPEAT
   \STATE {Sample $\phi_{t} \sim q_{t}$ } 
\STATE{\underline{\textbf{SGDA-Q:}}} %\YL there is a missing $z_{i_t}$\YLe
\STATE $\quad i_t \leftarrow \phi_t$
\STATE $\quad\bv_{t+1} = \bv_t + \eta \nabla_\bv \ell((\bw_{t}, \bv_t);z_{i_t})$;
\STATE $\quad\bw_{t+1} = \bw_t - \eta \nabla_\bw \ell((\bw_{t}, \bv_{t});z_{i_t})$;
% \STATE{\underline{\textbf{SGD-Q:}}} %\YL there is a missing $z_{i_t}$\YLe
% \STATE$\quad\bw_{t+1} = \bw_t - \eta \nabla_\bw \ell(\bw_{t},z_{i_t})$;
\STATE{\underline{\textbf{Pairwise SGD-Q:}}}
\STATE $\quad (i_t,j_t) \leftarrow \phi_t$
\STATE$\quad\bw_{t+1} = \bw_t - \eta \nabla_\bw \ell(\bw_{t};z_{i_t},z_{j_t})$;
\STATE {$t=t+1$}
\UNTIL{ $t>T_{iter}$}\\
\STATE{Update $q$ by Eq.  \eqref{qupd2} for SGDA-Q / Eq. \eqref{qupd3} for pairwise SGD-Q.}
\ENDFOR
\STATE {\bfseries return}{  $\bw$, $q$
}
\end{algorithmic}
\end{algorithm}

% {\zsj move appendix for pairwise algorithms here.}

The minimization yields the following alternating updates.  

First, keeping $q$ fixed, we approximate the adversarial loss by taking one stochastic gradient ascent step w.r.t. $\bv$, followed by one stochastic gradient descent step w.r.t. $\bw$ (lines 8--9 in the pseudocode). These steps represent the vanilla SGDA updates.

Then, keeping $\bw$ and $\bv$ fixed, we derive the update of $q$ as follows. Observe that all terms depend on $q$; taking the derivative w.r.t. each $q(i)$ and rearranging the stationarity equation yields
\begin{align}
q(i) \!=\!\frac{ \exp\left(-\frac{1}{\nu} {\mathcal L}(h; z_i)\right)}{\sum_{j=1}^n\exp(-\frac{1}{\nu}  {\mathcal L}(h; z_j) ) } \!\propto \!\exp\left(-\frac{1}{\nu} {\mathcal L}(h; z_i)\right).
\label{qupd2}
\end{align}


%Based on the above analysis, we propose learning algorithms that generate data-dependent posterior. 
This iterative approach updates the data-dependent posterior $q$ conditioned on the optimized parameters and the training sample. 
At the beginning, the sampling distribution is initialized with $q = p$. 

The algorithm updates the sampling distribution conditioned on the training data. During training, an index is drawn from $q$. 
According to Eq.~\eqref{qupd2}, the probability of selecting the $i$-th input, denoted as $q_t(i)$, is proportional to the exponential of the negative of its loss (scaled by $1/\nu$) from the previous epoch. %After each epoch, the probability for $i$-th sample is updated as $q_{i} \leftarrow \exp(- \nu^{-1} \ell(h_t, \cdot))$, depending on the training data and the current hypothesis. 
{This differs from AdaSamp \citep{london2017pac} -- an existing adaptive sampling algorithm inspired by PAC-Bayes bounds, where the probability is proportional to the loss.}
As a comparison, for our algorithms, data points that have a large loss are less likely to be selected, and so potential outliers or noisy examples are automatically down-weighted.


% \textcolor{red}{Difference from Adasamp to be explained in the Algo section}

%\begin{remark}
%Investigating the objective functions in Eq. \eqref{obj_1} and Eq. \eqref{obj_2}, we consider when zero empirical risk is neither achievable nor preferable, where non-uniform sampling may be more effective than uniform sampling. For instance, when dataset contains noise examples, models should avoid memorizing these corrupted samples to achieve zero training loss. A robust approach is to train on these examples with less repetition, as deep models show  sufficient capacity to learn even random noise \citep{zhang2021understanding},  probabilistically reducing their influence and not memorizing them.
%\end{remark}

%Alg.~\ref{alg:example} meets the above analysis. During the sampling distribution update, it assigns lower weights to high-loss data, to train them less frequently. This improves robustness by down-weighting the  outliers. 
%This owns to the PAC-Bayes framework. 
Learning the sampling distribution balances the minimization of the expected empirical risk -- which down-weights certain examples -- and the minimization of divergence from a uniform prior -- which weights all examples equally. Thus, the learned sampling distribution will only deviate from uniform sampling for a gain in the expected empirical risk. 
Next, we apply our algorithms to see the benefits of this trade-off.

% \vspace{-0.7cm}




% \subsection*{A note on the first term}
% A comment is in order to connect the first term of the bound (an expectation w.r.t. index trajectories of size $T$) with the weighted loss in our objective function (an expectation w.r.t. the distribution $q$ over individual indexes $i\in[n]$). 
% To see this connection, first we note that the bounds hold for any sampling distribution over trajectories, and therefore applicable to any data-dependent non-uniform sampling based stochastic optimizers. The objective function that 
% stochastic optimizer minimizes when using a nonuniform sampling from $q$ and performing unweighted gradient updates (of the form  
% Eq.~\ref{wupds} and Eq.~\ref{update:sgda}) is the \emph{weighted} loss:
% \[
% L_q(h) = \sum_{i=1}^{n} q(i) \ell(h; z_i).
% \]
% Indeed, the full gradient of $L_q(h)$ is
% $\nabla L_q(h)=\frac{1}{n}\sum_{i=1}^n q(i)\nabla \ell(h;z_{i})$,
% which coincides with the expectation w.r.t. $i_t\sim q$, of the stochastic gradient in eq. \eqref{wupds}. 

% The expression $L_q(h)$ is similar in flavour, but not the same as the original expected empirical risk $ \mathbb{E}_{\phi\sim \mathbb{Q}} \left[ R_S(A(S);\phi) \right]$.  
% However, by the above reasoning, for a fixed $q$ and a trajectory $\phi\sim q^T$, both have the same minimizer {\zsj ?} $\textbf{w}_T(\phi)$ at convergence as $T\rightarrow \infty$, and therefore for large-enough $T$ we shall approximate the expected empirical risk with the simpler expression  $L_q(h)$.

% A key advantage of $L_q(h)$ is its simpler dependence on $q$, which will facilitate a practical way to obtain $q(i)$. Since our bound is valid with all choices of $\mathbb{Q}$, the resulting algorithm still enjoys the generalization guarantees stipulated by the bounds.



% \subsection{PAC-Bayes Bound Minimization Algorithm } \label{app_alg}





\section{Empirical Evaluation %of Stochastic Algorithms
} \label{sec:exp}


In this section, we empirically evaluate our algorithms, highlighting their ability to increase robustness and interpretability, in both standard training via pairwise SGD-Q and adversarial training via SGDA-Q. Results are presented across various architectures and datasets.

The visualizations in the pairwise setting inspire an interesting question about the differences between human and machine visual scene recognition \citep{bamber1969reaction}, which cannot be solved in the pointwise setting. These results further suggest the generalizability of our algorithms.


% \textbf{Experimental Setup}




%Next, we evaluate our algorithms SGDA-Q and SGD-Q on several tasks across several architectures and datasets%: (1) identifying difficult examples on clean dataset (2) detecting out-of-distribution examples on noise dataset. 


\subsection{Experimental Results}

\begin{figure}[tb]
\subfigure[Top k smallest Q-scores]{\includegraphics[width=0.230\textwidth]{img/adversary/visual_sgda_mnist_small.pdf}}\;\;\subfigure[Top k largest Q-scores]{\includegraphics[width=0.230\textwidth]{img/adversary/visual_sgda_mnist_large.pdf}}
\caption{Examples with lowest and highest Q-scores as found by SGDA-Q in MNIST. `PT' denotes the predicted label, `GT' the ground truth, and `q' the Q-scores.}\label{top_mnist_sgda}
\end{figure}


First, we introduce the datasets and architectures used in our experiments. 
We evaluate on the MNIST and CIFAR-10 datasets.
%and CIFAR-10 {\zsj see. maybe cifar for sgda may not needed.}. 
The parameter settings follow those in  \citep{shah2020choosing,nouiehed2019solving,chen2024nrat}.  
\textbf{MNIST}: We use a four-layer neural network and train over 100 epochs using an initial learning rate of 0.001 for SGDA-Q with the decaying schedule of factor 5 after every 50 epochs. 
Our code is publicly available.\footnote{\scriptsize Code available at \url{https://github.com/git0405/UAI-Learning-to-Sample-in-Stochastic-Optimization}}



\textbf{Application 1: Estimating Example Difficulty}



Identifying challenging examples and estimating the level of difficulty of individual data points is crucial for detecting abnormal cases and samples needing further human evaluation. As discussed by \cite{agarwal2022estimating}, methods that are able to do this have potential to improve the safe use of data, as well as model interpretability. 
We verify that our algorithms can identify difficult or atypical examples, often corresponding to blurry or noisy data. 
We evaluate this on the MNIST and CUHK03 datasets for pointwise and pairwise cases, ranking training-set data by Q-scores. The results are shown in %Figure~\ref{top_mnist_pointwise}, 
Figure~\ref{top_mnist_sgda} and 
Figure~\ref{top_mnist_clean_pairwise}.




In Figure~\ref{top_mnist_sgda} (a), we list the examples with lowest estimated values of $q$ and highest estimated values of $q$ in Figure~\ref{top_mnist_sgda} (b)  for MNIST. We can see from Figure~\ref{top_mnist_sgda} and Figure~\ref{top_mnist_clean_pairwise} that high Q-score images typically have clear, uncluttered backgrounds and contain typical and well-visible objects, while low Q-score images are atypical, blurry or unclear, making object identification difficult. In pairwise tasks on similarity shown in Figure~\ref{top_mnist_clean_pairwise}, low Q-score image pairs often show objects from unconventional angles, hindering the recognition of their similarity. %Smaller Q-scores help prevent memorizing noisy examples by training with fewer iterations. 
Our algorithms effectively identify challenging or atypical examples by assigning them low Q-scores, which may be used to prompt human input or further review.




%In these toy experiments, adversarial examples are near the decision boundary or overlapping. We train a linear neural network on a synthetic dataset using SGD-Q and give dataset  The original and pruned datasets are shown in Figure~\ref{pruning}. This illustrates SGD-Q's effectiveness in identifying adversarial examples. These low Q-score examples correspond to adversarial data. Removing them results in well-separated data from different classes in Figure~\ref{pruning_2}. SGD-Q identifies these low Q-scores examples and down-weights their influence during training.

%\textcolor{red}{come back to this later:}


%%======




\begin{table*}[bt]
\centering
\begin{tabular}{llccccccccc}
\toprule
\textbf{Noise Rate}& \textbf{Alg.}&\textbf{Natural} & \multicolumn{4}{c}{PGD$^{40}$ $L_\infty$ \citep{kurakin2016adversarial}} & \multicolumn{4}{c}{FGSM $L_\infty$ \citep{goodfellow2014explaining}} \\
\cmidrule(lr){1-1} 
\cmidrule(lr){2-2} 
\cmidrule(lr){3-3} 
\cmidrule(lr){4-7} \cmidrule(lr){8-11}
 Symmetric& & & $\epsilon = 0.1$ & $\epsilon = 0.2$ & $\epsilon = 0.3$ & $\epsilon = 0.4$ & $\epsilon = 0.1$ & $\epsilon = 0.2$ & $\epsilon = 0.3$ & $\epsilon = 0.4$ \\
\midrule
\multirow{4}{*}{Sym 0.2}& SGDA-Q &  \textbf{99.26}\% & \textbf{99.05}\% &  \textbf{98.69}\% &  \textbf{98.25}\% &  \textbf{97.68}\% &  \textbf{97.71}\%&  \textbf{94.65}\% &  \textbf{91.26}\% &  \textbf{86.02}\% \\
& SGDA &  98.98\% & 98.72\% &  98.39\% &  97.95\% &  97.37\% &  97.28\%&  94.16\% &  89.27\% &  83.01\% \\ & MART &   98.84\% &  98.60\%&  98.34\% &  97.98\% &  97.57\% &  96.70\% &   93.33\% &  88.47\% &  82.44\%    \\
& TRADES &   98.71\% &   98.48\%&   98.21\% &   97.88\% &   96.37\% &   96.18\% &    90.55\% &   81.18\% &   70.85\%    \\
\midrule
\multirow{4}{*}{Sym 0.4} & SGDA-Q  &  \textbf{99.12}\% & \textbf{98.91}\% &  \textbf{98.53}\% &  \textbf{98.09}\% &  \textbf{97.47}\% & \textbf{97.51}\% &  \textbf{94.37}\% &  \textbf{90.70}\% &  \textbf{85.37}\% \\
& SGDA & 98.58\% & 98.32\% &  97.96\% &  97.48\% &  96.96\% & 96.58\% &  93.31\% &  88.76\% &  83.15\% \\ & MART &   98.21\% &  97.90\% &   97.57\% &  97.14\% &  96.67\% &  95.62\%&  91.69\% &  86.11\% &  79.05\%   \\ & TRADES &   98.35\% &   98.07\%&   97.73\% &   97.32\% &   96.77\% &   95.35\% &    89.40\% &   79.60\% &   70.16\%    \\ \midrule
 Asymmetric&   &    &   &   &    &    &   &   &    &   \\
\midrule
\multirow{4}{*}{Asym 0.2}& SGDA-Q &  \textbf{99.33}\% &  \textbf{99.06}\%&  \textbf{98.76}\% &  98.29\% &  97.72\% & \textbf{98.02}\% &  95.72\% &  93.32\% &  \textbf{90.38}\% \\
& SGDA &  99.26\% & 99.00\% &  98.66\% &  98.19\% &  97.57\% & 97.98\% &  \textbf{95.97}\% &  \textbf{93.45}\% &  90.37\% \\ & MART &   99.28 \% &  99.03\% &   98.70\% &  98.29\% &  97.65\% &  97.18\%&  92.61\% &  87.82\% &  82.36\%   \\ & TRADES &   99.24\% &   99.02\%&    98.74\% &   \textbf{98.38}\% &   \textbf{97.89}\% &   97.48\% &    94.34\% &   88.95\% &   81.18\%    \\
\midrule
\multirow{4}{*}{Asym 0.4}& SGDA-Q &  98.88\%  & 98.52\% &  98.07\% &  \textbf{97.50}\% &  96.83\% &  \textbf{97.23}\%&  \textbf{94.98}\% &  \textbf{92.69}\% &  \textbf{89.60}\%\\
& SGDA &  98.73\% & 98.39\% &  97.89\% &  97.32\% &  96.67\% & 95.28\% &  92.03\% &  88.48\% &  83.76\% \\ & MART &   \textbf{99.11}\% &  \textbf{98.79}\% &   \textbf{98.39}\% &  97.45\% &  \textbf{97.40}\% &  96.34\%&  92.55\% &  87.42\% &  80.83\%   \\ & TRADES &    98.35\% &    98.33\%&    97.87\% &    97.29\% &    96.73\% &    93.60\% &     85.37\% &     74.86\% &    63.33\%    \\
\bottomrule
\end{tabular}
\caption{Comparison of natural and adversarial accuracy under FGSM and PGD attacks with symmetric and asymmetric noise on four  algorithms. The maximum results in each column are highlighted in bold font.}\label{tab:results_sym}
\end{table*}


\textbf{Application 2: Training in the Presence of Label Noise}

\begin{figure}[htb]
\centering
%\subfigure[Raw data]{\includegraphics[width=0.5\textwidth]{img/test/raw data_52-cropped.pdf}}\\\;\;
%\subfigure[SGD-Q]{\includegraphics[height=3cm,width=0.24\textwidth]{img/test/visual_sgd_raw.pdf}}\;\;
%\subfigure[SGD-Q and Vanilla SGD]{\includegraphics[height=3.5cm,width=0.25\textwidth]{img/test/visual_sgd_test.pdf}}
%\subfigure[SGDA-Q and Vanilla SGDA]
{\includegraphics[width=0.37\textwidth]{img/test/new_visual_sgda_test.pdf}}%\;\;\subfigure[Vanilla SGDA]{\includegraphics[height=3cm,width=0.24\textwidth]{img/test/visual_sgda.pdf}}
\caption{Decision boundaries obtained using SGDA-Q vs. vanilla SGDA on a simple 2D dataset with noisy labels. Dark filling means low Q-score.}\label{2d-examples}
\end{figure}


We evaluate SGDA-Q here and pairwise SGD-Q in Section \ref{sec:pairwise}, in conditions of label noise to test their ability to identify and downweight the noisy examples and hence achieve robustness in the presence of out-of-distribution (OOD) samples that could bias estimates. %Since outliers are often the most disruptive, our algorithms improve robustness against such outliers and low-quality samples in both standard and adversarial training. 

%Our algorithms assign lower weights to such examples, adjusting the sampling distribution to focus on normal data while reducing emphasis on attacked samples. Meanwhile, the PAC-Bayes posterior remains close to the prior to weight all samples equally, with balance controlled by a divergence penalty term in the generalization bounds. 





We first generated a 2D toy example with asymmetric label noise rate of 0.1 to illustrate the working of our algorithms. We ran logistic regression trained with SGDA and SGDA-Q for a comparison; the obtained decision boundaries are shown in Figure \ref{2d-examples}. % a). Likewise, SGDA and SGDA-Q are shown in Figures \ref{2d-examples} b). 
The filling of markers reflects the Q-scores -- darker means lower Q-score. In this comparison, we see that, in vanilla SGDA the decision boundary shifts due to the label noise. However, our algorithms demonstrate robustness to such noise. This is because our methods learn to down-weight and consequently avoid training on the misleading mislabeled points. %We show more comparison with algorithms SVRG, SAGA, and gradient norm-based importance sampling (SGD-IS) under the same setting. See Figures \ref{more_2d-examples} in appendix.


% \textbf{Standard Training of SGD-Q}. 




\textbf{Adversarial Training of SGDA-Q.} 

\begin{figure}[tb]
\centering
\includegraphics[height = 6.2cm,width=0.5\textwidth]{img/adversary/adv_img.png}
\caption{Adversarial training setting, with inherent noise. }\label{fig_model_adv}
\end{figure}







Next, we evaluate the robustness of our SGDA-Q algorithm. Similarly to SGDA, this method is applicable to adversarial training. 
It aims to reduce the effect of test-time adversarial perturbations by training with a loss function that simulates adversarial examples. Indeed, it is well known that adding certain imperceptible noise can fool models into making wrong predictions \citep{goodfellow2014explaining}. Adversarial training is an effective way to defend against these adversarial attacks.

Let us denote an adversarial example by $\mathbf{x}'$, obtained by adding an adversarial perturbation to a natural example $\mathbf{x}$. 
In the case of an $\ell_\infty$ adversarial attack, an adversarial example $\mathbf{x}'$ is chosen such that $\|\mathbf{x}' - \mathbf{x}\|_\infty \leq \epsilon$. Such perturbation is often imperceptible to humans but can cause the classifier \(h\) to mispredict \citep{goodfellow2014explaining}. 
In adversarial training, the goal is to guard against the ill effect of adversarial perturbations by solving the empirical adversarial risk minimization problem
\begin{equation}
\min_{h \in \mathcal{H}} \frac{1}{n} \sum_{i=1}^n \max_{\|\mathbf{x}_i' - \mathbf{x}_i\|_\infty \leq \epsilon} \ell(h(\mathbf{x}_i'), y_i). \label{adv_emp}\end{equation}

This is also a min-max problem, where $\bv$ in our earlier general formulation is instantiated as $(\mathbf{x}'_1,\dots,  \mathbf{x}'_n)$.

For each training point, the maximum in the loss function searches for the worst-case perturbation of the input features, while the outer minimization aims to reduce this worst-case value of the loss by adjusting the model parameters. In addition to this adversarial training min-max problem classically approached by SGDA, our SGDA-Q algorithm also adjusts the sampling probabilities to minimize the expected worst-case loss. This creates a fine balance between the adversarial training creating hard examples and our updates of $q$ potentially down-weighting them. Therefore, we expect that SGDA-Q is best suited when there are outliers or mislabeled points in the data set. Indeed, recent literature \citep{chen2024nrat} reported that having both adversarial attacks and label noise is both realistic and challenging. We expect our method to identify and down-weight the noisy points while at the same time carrying out adversarial training.

We shall now examine our SGDA-Q algorithm in adversarial training in the presence of mislabeled samples to demonstrate the enhanced robustness of min-max stochastic optimization based learning. The problem setting is depicted in Figure~\ref{fig_model_adv}. 


% \begin{table*}[htb]%h]
% \centering
% \begin{tabular}{llccccccccc}
% \toprule
% \textbf{Noise Rate}& \textbf{Alg.}&\textbf{Natural} & \multicolumn{4}{c}{PGD$^{40}$ $L_\infty$ \citep{kurakin2016adversarial}} & \multicolumn{4}{c}{FGSM $L_\infty$ \citep{goodfellow2014explaining}} \\
% \cmidrule(lr){1-1} 
% \cmidrule(lr){2-2} 
% \cmidrule(lr){3-3} 
% \cmidrule(lr){4-7} \cmidrule(lr){8-11}
%  Symmetric& & & $\epsilon = 0.1$ & $\epsilon = 0.2$ & $\epsilon = 0.3$ & $\epsilon = 0.4$ & $\epsilon = 0.1$ & $\epsilon = 0.2$ & $\epsilon = 0.3$ & $\epsilon = 0.4$ \\
% \midrule
% \multirow{ 2}{*}{Sym 0.2}& SGDA-Q &  99.26\% & 99.05\% &  98.69\% &  98.25\% &  97.68\% &  97.71\%&  94.65\% &  91.26\% &  86.02\% \\
% & SGDA &  98.98\% & 98.72\% &  98.39\% &  97.95\% &  97.37\% &  97.28\%&  94.16\% &  89.27\% &  83.01\% \\ 
% \midrule
% \multirow{ 2}{*}{Sym 0.4} & SGDA-Q  &  99.12\% & 98.91\% &  98.53\% &  98.09\% &  97.47\% & 97.51\% &  94.37\% &  90.70\% &  85.37\% \\
% & SGDA & 98.58\% & 98.32\% &  97.96\% &  97.48\% &  96.96\% & 96.58\% &  93.31\% &  88.76\% &  83.15\% \\ \midrule
%  Asymmetric&   &    &   &   &    &    &   &   &    &   \\
% \midrule
% \multirow{ 2}{*}{Asym 0.2}& SGDA-Q &  99.33 \% &  99.06\%&  98.76\% &  98.29\% &  97.72\% & 98.02\% &  95.72\% &  93.32\% &  90.38\% \\
% & SGDA &  99.26\% & 99.00\% &  98.66\% &  98.19\% &  97.57\% & 97.98\% &  95.97\% &  93.45\% &  90.37\% \\ 
% \midrule
% \multirow{ 2}{*}{Asym 0.4}& SGDA-Q &  98.88\%  & 98.52\% &  98.07\% &  97.50\% &  96.83\% &  97.23\%&  94.98\% &  92.69\% &  89.60\%\\
% & SGDA &  98.73\% & 98.39\% &  97.89\% &  97.32\% &  96.67\% & 95.28\% &  92.03\% &  88.48\% &  83.76\% \\ 
% \bottomrule
% \end{tabular}
% \caption{Comparison of natural and adversarial accuracy under FGSM and PGD attacks with symmetric and asymmetric noise.}
% 
% \end{table*}





% {\zsj We first visualize the two-dimensional dataset for interpretability. Figure \ref{2d-examples} (c) and (d) show linear models trained with SGDA-Q and SGDA, respectively. The results further highlight our algorithm's robustness to label noise, as it maintains a larger decision boundary margin, whereas the decision boundary trained with SGDA shifts significantly due to noisy labels.}

% Table \ref{tab:results_sym} presents results of our algorithms compared to three algorithms: vanilla SGDA (i.e. with uniform sampling), MART \citep{wang2019improving}, and  TRADES \cite{zhang2019theoretically} on MNIST in the presence of label noises at rates of 0.2 and 0.4, following the setting in  \citep{chen2024nrat}. Test accuracy was evaluated under FGSM \citep{goodfellow2014explaining} and PGD-40 \citep{kurakin2016adversarial} attacks with results averaged over 3 independent runs. 
% According to the results, our algorithm achieves higher test accuracy in adversarial training in the presence of label noise in most cases, indicating improved robustness. 



\begin{table*}[tb]
\caption{Best and last accuracy (\%) on CIFAR-10 with inherent symmetric and asymmetric label noise with 20\% and 40\% noise rate with PGD attack.}
\label{tab:2}
\begin{center}
\begin{tabular}{llccccc ccccc}
\hline
& \multirow{2}{*}{Alg.}& \multicolumn{2}{c}{Sym0.2} & \multicolumn{2}{c}{Sym0.4} & \multicolumn{2}{c}{Asym0.2} & \multicolumn{2}{c}{Asym0.4} \\
\cmidrule(lr){3-6} \cmidrule(lr){7-10} &  & Natural & PGD-20 &   Natural & PGD-20 
& Natural & PGD-20 &   Natural & PGD-20 & \\
\hline \multirow{3}{*}{Best}&
% \multicolumn{3}{l}{Best natural and robust accuracy} \\
Ours & 81.13 & \textbf{58.40}  
& 74.94 & 52.91 &    \textbf{84.29}  & \textbf{60.78} &  \textbf{78.25} & \textbf{56.03} &  \\
&MART & 78.96 & 48.01 & 74.97 & 45.82 & 83.04 & 54.19 & 76.85 & 47.97 \\ 
&TRADES & \textbf{81.37} & 56.71 & \textbf{75.80} & \textbf{54.80} & 82.46 & 54.12 & 77.44 & 50.46 \\
% \multicolumn{3}{l}{Last natural and robust accuracy} 
\hline 
\multirow{3}{*}{Last} & Ours &  \textbf{74.94} & \textbf{40.73}  & 59.12 & \textbf{26.38} & 80.37  & 47.43 & 71.68  & \textbf{40.67} \\
&MART & 74.11 & 37.07 & 54.54 & 22.89 & \textbf{80.81} & 42.62 & \textbf{71.88} & 39.68 \\
&TRADES & 74.76 & {39.74} & \textbf{60.23} & 26.12& 77.71 & \textbf{49.31} & 70.87 & 40.62 \\
\hline
\end{tabular}\label{tab:results_sym_cifar}
\end{center}
\end{table*}



\begin{figure}[tb]
\subfigure[eps=0]{\includegraphics[width=0.25\textwidth]{img/test/all_test_mnist_pointwise_sym_0.4_pgd_eps_0.0.pdf}}\subfigure[eps=0.2]{\includegraphics[width=0.25\textwidth]{img/test/all_test_mnist_pointwise_sym_0.4_pgd_eps_0.2.pdf}}\\ 
% \subfigure[eps=0.3]{\includegraphics[width=0.25\textwidth]{img/test/all_test_mnist_pointwise_sym_0.4_pgd_eps_0.3.pdf
% % }}\subfigure[eps=0.4]{\includegraphics[width=0.25\textwidth]{img/test/all_test_mnist_pointwise_sym_0.4_pgd_eps_0.4.pdf
% % }}
% \subfigure[eps=0]{\includegraphics[width=0.25\textwidth]{img/test/4_test_mnist_pointwise_sym_0.4_pgd_eps_0.0.pdf}}\subfigure[eps=0.2]{\includegraphics[width=0.25\textwidth]{img/test/4_test_mnist_pointwise_sym_0.4_pgd_eps_0.2.pdf}}\\ 
\subfigure[eps=0.3]{\includegraphics[width=0.25\textwidth]{img/test/4_test_mnist_pointwise_sym_0.4_pgd_eps_0.3.pdf}}\subfigure[eps=0.4]{\includegraphics[width=0.25\textwidth]{img/test/4_test_mnist_pointwise_sym_0.4_pgd_eps_0.4.pdf}}
\caption{
The impact of $1/\nu$ on test accuracy under PGD attack  across different values of $\epsilon$ under a symmetric noise rate of 0.4.}\label{sym_0.4_alpha_pgd}
\end{figure}

\begin{figure}[tb]
\subfigure[eps=0]{\includegraphics[width=0.25\textwidth]{img/test/all_test_mnist_pointwise_sym_0.4_fgsm_eps_0.0.pdf}}\subfigure[eps=0.2]{\includegraphics[width=0.25\textwidth]{img/test/all_test_mnist_pointwise_sym_0.4_fgsm_eps_0.2.pdf}}\\ 
% \subfigure[eps=0.3]{\includegraphics[width=0.25\textwidth]{img/test/all_test_mnist_pointwise_sym_0.4_fgsm_eps_0.3.pdf
% % }}\subfigure[eps=0.4]{\includegraphics[width=0.25\textwidth]{img/test/all_test_mnist_pointwise_sym_0.4_fgsm_eps_0.4.pdf
% % }}
% \subfigure[eps=0]{\includegraphics[width=0.25\textwidth]{img/test/4_test_mnist_pointwise_sym_0.4_fgsm_eps_0.0.pdf}}\subfigure[eps=0.2]{\includegraphics[width=0.25\textwidth]{img/test/4_test_mnist_pointwise_sym_0.4_fgsm_eps_0.2.pdf}}\\ 
\subfigure[eps=0.3]{\includegraphics[width=0.25\textwidth]{img/test/4_test_mnist_pointwise_sym_0.4_fgsm_eps_0.3.pdf}}\subfigure[eps=0.4]{\includegraphics[width=0.25\textwidth]{img/test/4_test_mnist_pointwise_sym_0.4_fgsm_eps_0.4.pdf}}
\caption{
The impact of $1/\nu$ on test accuracy under FGSM attack  across different values of $\epsilon$ under a symmetric noise rate of 0.4.}\label{sym_0.4_alpha_fgsm}
\end{figure}


Table~\ref{tab:results_sym} presents the results of our algorithms compared to vanilla SGDA (i.e., SGDA with uniform sampling) and two other state-of-the-art algorithms, namely MART \citep{wang2019improving} and TRADES \citep{zhang2019theoretically}. We use MNIST in the presence of random symmetric and asymmetric label noise at rates of 0.2 and 0.4,
following the setting in \citep{chen2024nrat,shah2020choosing}. {For both SGDA and SGDA-Q, we use the cross-entropy loss along with the training protocol described in \citep{nouiehed2019solving}.} Test accuracy was evaluated under FGSM \citep{goodfellow2014explaining} and
PGD-40 \citep{kurakin2016adversarial} attacks with results averaged
over 3 independent runs. Based on the results in Table \ref{tab:results_sym}, our method achieves competitive accuracy performance, scoring best in most cases and at least second best in all cases tested. 



We further investigate how \(1/\nu\) affects the balance between the KL term and the expected empirical risk, influencing the posterior update and its impact on generalization. Figure~\ref{sym_0.4_alpha_pgd} and Figure~\ref{sym_0.4_alpha_fgsm} 
show the effect of varying the parameter \(1/\nu\). We report the test accuracy under FGSM and PGD attacks on clean data, across different values of $\epsilon$, in the presence of random symmetric label noise at a rate of 0.4. Results for other noise proportions are shown in Appendix~\ref{sec:appdx}. A grid search over \(1/\nu \in (0, 3]\) reveals that smaller values of \(1/\nu\) give better performance for smaller \(\epsilon\), while larger values of \(1/\nu\) achieve higher accuracy for larger \(\epsilon\). We find that \(1/\nu\) higher than 1 leads to decreased training performance. 



To evaluate the robustness of our sampling strategy, we conduct additional experiments on the CIFAR-10 dataset. We use the loss function of TRADES, and only modify their algorithm by replacing uniform sampling with our adaptive sampling. We compare with the original TRADES (uniform sampling), as well as with MART, in the presence of label noise rates of 0.2 and 0.4, following the same setting as in \citep{chen2024nrat}. We train ResNet-18 neural networks \citep{he2016deep} for 200 epochs using an initial learning rate of 0.05, which decays by a factor of 10 at the 150th and 200th epochs. 
Table~\ref{tab:results_sym_cifar} presents both adversarial and natural accuracy results in the presence of label noise on the CIFAR-10 dataset. Here, ``Best'' means the highest accuracy across all epochs, and ``Last'' means the accuracy at the last training epoch. Our sampling strategy again achieves competitive performance, with higher test accuracy under PGD-20 attacks in more than half of the cases tested.  



% Based on the above results on several tasks, our algorithms can identify atypical examples, which needs further human inspection, efficiently. It downweights the influence of OOD examples to increase the robustness and achieve great performance in standard training and adversarial training.



\subsection{Results for Pairwise SGD-Q}\label{sec:pairwise} 

In this section, we test and demonstrate our pairwise SGD-Q algorithm. We consider a problem of similarity learning. Given an input pair, the goal is to predict if they belong to the same class or not. 

\textbf{Architecture: }
We employ the Siamese architecture depicted in Figure~\ref{fig_model}, built on the work of \citep{lv2018unsupervised,zheng2017discriminatively}. It learns feature representations of the input pairs and their corresponding similarities. This framework consists of two modules to extract features from $(\mathbf{x}, \tilde{\mathbf{x}})$, both sharing the same weights. 

The outputs of these two modules are flattened into one-dimensional feature vectors ($f_1$ and $f_2$ as shown in Figure~\ref{fig_model}). The element-wise squared difference between $f_1$ and $f_2$ is fed into a fully connected layer with a softmax, outputting the probability that the input pair belongs to the same class.

\begin{figure}[h]
\centering
\includegraphics[width=0.5\textwidth]{img/siamese_1020.pdf}
\caption{ 
In the Siamese network architecture, when an input pair ($\mathbf{x}$, $\tilde{\mathbf{x}}$) is provided, two models with shared weights generate feature embeddings $f_1$ and $f_2$, which are then utilized to evaluate the similarity between the inputs.}\label{fig_model}
\end{figure}

\begin{figure}[htb]
\subfigure[Top k smallest Q-scores]{\includegraphics[width=0.24\textwidth]{img/mnist_pairwise_small_clean_visual.pdf}}\begin{tikzpicture}
        \draw[thick] (0,0) -- (0,3.8); \end{tikzpicture}\subfigure[Top k largest Q-scores] {\includegraphics[width=0.24\textwidth]{img/mnist_pairwise_large_clean_visual.pdf}} 
\subfigure[Top k smallest Q-scores]{\includegraphics[height=8cm,width=0.23\textwidth]{img/cuhk03_pairwise_small_clean_visual.pdf}}
\begin{tikzpicture}
        \draw[thick] (0,0) -- (0,7.8); \end{tikzpicture}
        \subfigure[Top k largest Q-scores]{\includegraphics[height=8cm,width=0.23\textwidth]{img/cuhk03_pairwise_large_clean_visual.pdf}} 
\caption{Pairwise: the top-$k$ training-set pairs with the lowest and highest Q-scores on the MNIST and CUHK03 datasets.}
\label{top_mnist_clean_pairwise}
\end{figure}

We tested two different base network modules with two datasets as follows. 
\textbf{MNIST}: We use a two-layer convolutional network as the CNN modules in the Siamese network and train for 100 epochs using an initial learning rate of 0.01, which is decayed by a factor of 5 every 30 epochs.  
\textbf{CUHK03 \citep{li2014deepreid}}: The CNN modules are based on the ResNet-18, pre-trained on the ImageNet dataset \citep{deng2009imagenet}. The model is trained for 65 epochs with an initial learning rate of 0.01, employing a decay schedule that decreases the learning rate by a factor of 5 every 20 epochs. The CUHK03 dataset comprises 14,097 images of 1,467 individuals.




 \begin{figure}[htb] \subfigure[MNIST -- outliers]{\includegraphics[ width=0.235\textwidth]{img/mnist_pairwise_outliers.pdf}} \subfigure[MNIST-- clean data]{\includegraphics[width=0.235\textwidth]{img/mnist_pairwise_clean.pdf}} 
 \subfigure[CUHK03 -- outliers] {\includegraphics[width=0.235\textwidth]{img/cuhk03_pairwise_outliers.pdf}} 
 \subfigure[CUHK03 -- clean data]{\includegraphics[width=0.235\textwidth]{img/cuhk03_pairwise_clean.pdf}} 
\caption{Pairwise SGD-Q: Comparison of the test accuracy on MNIST and CUHK03 with and without outliers.} \label{fig_label_flip}
\end{figure}


In Figure~\ref{top_mnist_clean_pairwise} (a) and (c), we list the example pairs with lowest estimated values of $q$ for MNIST and CUHK03. In Figure~\ref{top_mnist_clean_pairwise} (b) and (d), we list the pairs with highest estimated values of $q$ in MNIST and CUHK03. An interesting observation from these figures is that low Q-score pairs tend to be same-label pairs whereas the high Q-score pairs tend to be different-label pairs. 
%%=== to edit
% \begin{figure}[htb] \subfigure[MNIST -- outliers]{\includegraphics[width=0.235\textwidth]{img/mnist_pairwise_outliers.pdf}} \subfigure[MNIST-- clean data]{\includegraphics[width=0.235\textwidth]{img/mnist_pairwise_clean.pdf}} \subfigure[CUHK03 -- outliers] {\includegraphics[width=0.235\textwidth]{img/cuhk03_pairwise_outliers.pdf}} \subfigure[CUHK03 -- clean data]{\includegraphics[width=0.235\textwidth]{img/cuhk03_pairwise_clean.pdf}} 
%\caption{ Pairwise: Compasison of the test accuracy on MNIST and CUHK03 against with and without outliers.} \label{fig_label_flip}
%\end{figure}
An intriguing observation from these pairwise experiments is that identifying two images as representing different content appears to be an easier problem than recognizing them as representing similar content. This suggests that machines (contrary to humans) may find it easier to detect differences than similarities, in the datasets tested.

%An interesting finding that transpires from all pairwise experiments is that correctly recognizing that two images represent different content seems to be an easier problem than correctly recognizing that they represent similar content. This may suggest that for machine it is easier to tell difference than similarity.

%We can see that high Q-score images typically have clear, uncluttered backgrounds with visible objects, while low Q-score images are blurry or unclear, making object identification difficult. 
% In Re-ID tasks, low Q-score images often show objects from unconventional angles, hindering recognition. 
%Smaller Q-scores help prevent memorizing noisy examples by training with fewer iterations. Our algorithms effectively identify challenging or noisy examples, assigning them low probabilities to indicate the need for further review.


\textbf{Training with Pairwise SGD-Q in the Presence of Label Noise.} 


Next, we test our algorithms in accuracy comparison experiments.
The results shown in %AdaSamp{fig_label_flip_pointwise} and 
Figure \ref{fig_label_flip} are obtained in the presence of random label noise in a setup similar to \citep{shah2020choosing}: 10\% of the samples are randomly selected and assigned incorrect (opposite) labels. We report the accuracy measured on a clean independent test set, averaged over 5 independent repetitions. 

We compare our algorithms with three methods:
%in both pointwise (Figure\ref{fig_label_flip_pointwise}) and pairwise (Figure\ref{fig_label_flip}) cases, AdaSamp \citep{london2017pac} -- an existing adaptive sampling algorithm inspired from PAC-Bayes bounds, where the probability is proportional to the exponential of the loss.
vanilla SGD, AdaSamp \citep{london2017pac}, and MKL-SGD \citep{shah2020choosing} -- a Min-k Loss SGD that aims to improve robustness against outliers. 
MKL-SGD is an existing variant of SGD that previously demonstrated the robustness achieved by discarding high-loss examples, albeit without any generalization guarantees. Recall that AdaSamp is an adaptive sampling algorithm inspired by PAC-Bayes bounds, with sampling probability proportional to the loss.  




According to Figure~\ref{fig_label_flip}, pairwise SGD-Q demonstrates superior test accuracy under label noise compared to both MKL-SGD and AdaSamp on MNIST and CUHK03, highlighting its robustness and enhanced generalization performance.   

% Comparing with results of MKL-SGD on CIFAR-10, our algorithm is more stable while MKL-SGD displays sensitivity to the batch size. %We provide results of MKL-SGD with two batch size cases on CIFAR-10 according to their default settings. 
% This sensitivity can result in a reduced accuracy, particularly when large batches are used, potentially limiting the algorithm's effectiveness in real-world applications where batch sizes vary or computational efficiency is a concern. 



\section{Conclusions }\label{sec:conclusion}
We considered a PAC-Bayes analysis of stochastic optimization algorithms and, based on this analysis, learned an adaptive sampling scheme. We introduced new bounds-based algorithms that demonstrate strong robustness and offer insights into model behavior regarding example difficulty. 
Future research could explore the performance of these algorithms under different attacks and investigate their application with other optimization methods, such as randomized coordinate descent. It would also be interesting to follow up on our observations in pairwise learning in machines vs. humans. 
% Limitations: In label noise cases for adversarial training, our algorithms penalize large-loss data and train them with fewer repetitions. This works well with in standard training, but performs worse with large perturbations, as close-to-boundary data receive less training compared to uniform cases.




% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
The work of Sijia Zhou is funded by CSC and UoB scholarship.
The work of Yunwen Lei is partially supported by the Research Grants Council of Hong Kong [Project No. 22303723]. Ata Kab\'an acknowledges past funding from EPSRC fellowship EP/P004245/1. The experiments were conducted using the University of Birmingham's Baskerville and BlueBEAR HPC services. 
\end{acknowledgements}

% References


\bibliography{example_paper}


\onecolumn
%\title{Learning to Sample in Stochastic Optimization\\(Supplementary Material)}
%\maketitle

\appendix

\section{Appendix: Applications to Bounding the Error of Stochastic Gradient Descent Ascent}\label{app_sgda}

%%=========updata rule
We study SGDA for addressing min-max optimization problems in the convex-concave setting.
We analyze SGDA under a general sampling scheme, where the random index is drawn from an arbitrary distribution.
\begin{definition}[SGDA with general sampling]
Let $\bw_1$ and $\bv_1$ denote the initial points, and $\nabla_{\bw}\ell$ represent the gradient with respect to $\bw$. Consider a probability measure $\mathbb{P}$ over $[n]^T$ and a training dataset $S=\{z_1,\ldots,z_n\}$. A sequence $(i_1, \ldots, i_T)$ is sampled according to $\mathbb{P}$. At the $t$-th iteration, SGDA with the sampling scheme $\mathbb{P}$ updates the model as follows:
\[
\begin{cases} \bw_{t+1}= \bw_{t}-\eta_{t}\nabla_{\bw}\ell((\bw_{t},\bv_{t});z_{i_t})
,\\\bv_{t+1}= \bv_{t}+\eta_t\nabla_{\bv}\ell((\bw_{t},\bv_{t});z_{i_t}),\end{cases}
\]
where $\{\eta_t\}$ is a positive step-size sequence.
If $\pbb$ is the uniform distribution, then we call it SGDA with uniform sampling (SGDAU). \end{definition}

 
%%=============

Next we give stability bounds to develop PAC-Bayes bounds for SGDA, covering both smooth and non-smooth cases. We first introduce sub-exponential stability~\citep{zhou2023toward}.
\begin{assumption}[{Sub-exponential stability}\label{ass:beta-theta}]
Let $\pbb$ be a fixed probability distribution. We say that a stochastic algorithm is sub-exponentially $\beta_{\phi}$-stable (w.r.t. $\pbb$) if, given any fixed instance of $\phi \sim \pbb$, it is $\beta_{\phi}$-uniformly stable, and there exist $\aaa, \bb\in\rbb$ such that for any $\delta\in(0,1/n]$, the following holds with probability at least $1-\delta$
  \begin{equation}\label{beta-theta}
  \beta_\phi\leq\aaa+\bb\log(1/\delta).
  \end{equation} \end{assumption}

\subsection{Non-smooth case}

 
The following lemma shows that SGDAU applied to non-smooth problems enjoys the sub-exponential stability. The proof is given in Appendix \ref{proof:nonsmooth_sgda}.
\begin{lemma}[{Stability bound}\label{sta_sgda_non-smooth}]
Let $S$ and $S'$ be neighboring datasets.
Suppose for all $z \in \mathcal{Z}$ the loss function is convex-concave and $L$-Lipschitz. Let $\{\bw_t,\bv_t\}, \{\bw_t',\bv_t'\}$ be the sequence produced by SGDAU on $S$ and $S'$ respectively with fixed step sizes. Then SGDAU with $t$ iterations and the hyperparameter $\phi$ is $\beta_\phi$-uniformly stable with
$
\beta_\phi = 4\sqrt{ e}L^2\eta\big( \sqrt{t}+ \max_{k\in[n]}\sum_{j=1}^{t} \ibb[i_j=k]\big)
$. 
For any $\delta\in(0,1/n]$, with probability at least $1-\delta$ we have
\[
\beta_\phi \leq \aaa + 8\sqrt{ e}L^2\eta (1+\sqrt{ t/n})\log(1/\delta) .
\]
That is, Assumption \ref{ass:beta-theta} holds with $ \bb = 8\sqrt{ e}L^2\eta (1+\sqrt{ t/n})$ w.r.t. $\pbb$.

\end{lemma}



 


\subsection{Smooth case}
In the following lemma to be proved in Appendix  \ref{proof:smooth_sgda}, we give stability bounds for SGDA which satisfy Assumption \ref{ass:beta-theta}. 

\begin{lemma}[{Stability bound}\label{lem:sta_sgda}]
Let $S$ and $S'$ be neighboring datasets.
Suppose for all $z \in \mathcal{Z}$ the loss function is convex-concave, $\alpha$-smooth and $L$-Lipschitz.
Let $\{\bw_t,\bv_t\}, \{\bw_t',\bv_t'\}$ be the sequence produced by SGDA on $S$ and $S'$ respectively with fixed step sizes. 
Then at $t$ iterations, SGDA with uniform sampling and the hyperparameter $\phi$ is $\beta_\phi$-uniformly stable with  
\[
\beta_\phi =  4\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t) \max_{k\in[n]} \left(1 +  \sum_{r=1}^{t}  \mathbb{I}[i_r=k]\right).\]
 
If $\eta_t=\eta$, then for any $\delta\in(0,1/n]$, with probability at least $1-\delta$ we have
\[
\beta_\phi \leq \aaa +  8\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t)(1+\sqrt{t/n})\log(1/\delta).
\]
That is, Assumption \ref{ass:beta-theta} holds with $\bb= 8\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t)(1+\sqrt{t/n})$ w.r.t. $\pbb$.
\end{lemma}


%%%next corollary

We can derive Theorem \ref{cor_sgda_smooth} based on the above lemmas. Next, we prove the above stability bounds for SGDA. 

 

\subsection{Proofs on Applications of SGDA }\label{prf:sgda}


% {\zsj for the following, give details or just cite?}




% \begin{theorem}[{Generalization of sub-exponentially stable randomized algorithms}\label{thm:main}]
%   \end{theorem}


\begin{lemma}[Chernoff's Bound\label{Chernoff's Bound}]
Let $X_1, \ldots, X_t$ be independent random variables taking values in $\{0, 1\}$. Let $X = \sum_{k=1}^{t} X_k$ and $\mu = \mathbb{E}[X]$. Then for any $\epsilon > 0 $ with probability at least $1 - \exp(-\mu \epsilon^2 /(2+\epsilon))$ we have $X\leq(1+\epsilon)\mu$. Furthermore, for any $\delta \in (0,1)$ with probability at least $1- \delta$ we have
\[
X \leq \mu + \log(1/\delta) + \sqrt{2\mu \log(1/\delta)}.
\]
\end{lemma}



 
 
 
First, we present the proofs on the generalization bounds for SGDA with smooth and non-smooth convex loss functions. Before that, we need to prove that SGDA meets Assumption \ref{ass:beta-theta}.
 
\subsubsection{Non-smooth case}\label{proof:nonsmooth_sgda}
\begin{proof}[Proof of Lemma \ref{sta_sgda_non-smooth}]
Without loss of generality, we first assume $S$ and $S'$ differ by the last example.
According to the SGDA update rule and proof of Theorem 2(c) in \citep{lei2021stability}, for $p>0$, we get

\begin{align*}\label{sgda:smooth}
\left\|\left(\begin{array}{c}
\mathbf{w}_{t+1}-\mathbf{w}_{t+1}^{\prime} \\
\mathbf{v}_{t+1}-\mathbf{v}_{t+1}^{\prime}
\end{array}\right)\right\|_{2}^{2} 
& \leq 8 L^{2} \eta^{2}(1+p)^{\sum_{j=1}^{t} \mathbb{I}_{\left[i_{j}=n\right]}}\left(t+\sum_{k=1}^{t} \mathbb{I}_{\left[i_{k}=n\right]} / p\right).\end{align*}

% =========

We set $p=1/\sum_{j=1}^{t} \ibb[i_j=n]$ and use the inequality $(1+1/x)^x\leq e$ to derive
\[
\left\|\left(\begin{array}{c}
\mathbf{w}_{t+1}-\mathbf{w}_{t+1}^{\prime} \\
\mathbf{v}_{t+1}-\mathbf{v}_{t+1}^{\prime}
\end{array}\right)\right\|_{2}^{2} 
 \leq 8e L^{2} \eta^{2} \left(  t+ \Big(\sum_{k=1}^{t} \ibb[i_k=n]\Big)^2  \right).
\]
It then follows that
\[
\left\|\left(\begin{array}{c}
\mathbf{w}_{t+1}-\mathbf{w}_{t+1}^{\prime} \\
\mathbf{v}_{t+1}-\mathbf{v}_{t+1}^{\prime}
\end{array}\right)\right\|_{2}  
 \leq \sqrt{8e} L \eta \left(  \sqrt{t}+ \sum_{k=1}^{t} \ibb[i_k=n] \right).
\]
 

Based on Eq. \eqref{sgda:lip-1}, we further know that SGDA is $\beta_\phi$-uniformly stable with %\color{blue}
\begin{equation} \label{eqa:nonsmooth_bound}
\beta_\phi = 4\sqrt{ e}L^2\eta\Big(  \sqrt{t}+ \max_{k\in[n]}\sum_{j=1}^{t} \ibb[i_j=k]\Big).
\end{equation}
For simplicity, let $\beta_{\phi,k}= 4\sqrt{ e}L^2\eta\big(\sqrt{t}+ \sum_{j=1}^{t} \ibb[i_j=k]\big)$.
Taking the expectation of $\beta_{\phi,k}$ with respect to $\phi\sim\pbb$ gives
\[
\ebb_{\phi\sim\pbb}[\beta_{\phi,k}] = 4\sqrt{ e}L^2\eta\big(\sqrt{t}+ t/n\big).
\]
Applying Lemma \ref{Chernoff's Bound} to Eq. \eqref{eqa:nonsmooth_bound}, with probability at least $1- \delta/n$, we have %the following inequality simultaneously for all $k\in[n]$
\begin{align*}
\beta_{\phi,k} \leq 4\sqrt{ e}L^2\eta (\sqrt{t}+ t/n + \log(n/\delta) + \sqrt{2t/n\log(n/\delta)}).
\end{align*}
Therefore, with probability at least $1-\delta$, the following inequality holds simultaneously for all $k\in[n]$
\[
\beta_{\phi,k} \leq 4\sqrt{ e}L^2\eta (\sqrt{t}+ t/n + \log(n/\delta) + \sqrt{2t/n\log(n/\delta)}),
\]
which implies the following inequality with probability at least $1-\delta$
\[
\beta_\phi\leq 4\sqrt{ e}L^2\eta (\sqrt{t}+ t/n + 2\log(1/\delta) + \sqrt{4t/n\log(1/\delta)}),
\]
where we have used $\delta\in(0,1/n)$ in the above inequality.
Combining the stability bounds above, we conclude that SGDAU with the hyperparameter $\phi$ satisfies Assumption \ref{ass:beta-theta} with
\[
\aaa \geq 4\sqrt{ e}L^2\eta (\sqrt{t}+ t/n), \quad\bb= 8\sqrt{ e}L^2\eta (1+\sqrt{ t/n}).
\]

The proof is completed.
\end{proof}


\subsubsection{Smooth case}\label{proof:smooth_sgda}
%%=== proof of lemma
% \begin{proof}[Proof of Corollary \ref{cor_sgd_smooth}]%




\begin{proof}[Proof of Lemma \ref{lem:sta_sgda}]
Without loss of generality, we first assume $S$ and $S'$ differ by the last example. 
According to the SGDA update rule and proof of Theorem 2(d) in \citep{lei2021stability}, for $p>0$ and fixed step sizes, we get 

\begin{align*}
\left\|\left(\begin{array}{c}
\mathbf{w}_{t+1}-\mathbf{w}_{t+1}^{\prime} \\
\mathbf{v}_{t+1}-\mathbf{v}_{t+1}^{\prime}
\end{array}\right)\right\|_{2}^{2} 
 \leq & 8(1+1 / p) L^{2} \eta^{2} \prod_{j=1}^{t}\left(1+\alpha^{2} \eta_{j}^{2}\right) \prod_{j=1}^{t}(1+p)^{\mathbb{I}_{\left[i_{j}=n\right]}} \sum_{k=1}^{t} \mathbb{I}_{\left[i_{k}=n\right]} \\
\leq & 8(1+1 / p) L^{2} \eta^{2} \exp\left(\alpha^2 \sum_{j=1}^{t} \eta_{j}^{2}\right)  (1+p)^{ \sum_{j=1}^{t} \mathbb{I}_{ \left[i_{j}=n\right]}} \sum_{k=1}^{t} \mathbb{I}_{\left[i_{k}=n\right]}.
\end{align*}
We set $p=1/\sum_{j=1}^{t} \ibb[i_j=n]$ and use the inequality $(1+1/x)^x\leq e$ to derive
\[ \left\|\left(\begin{array}{c}
\mathbf{w}_{t+1}-\mathbf{w}_{t+1}^{\prime} \\
\mathbf{v}_{t+1}-\mathbf{v}_{t+1}^{\prime}
\end{array}\right)\right\|_{2}^{2} 
\leq 8e\left(1 +  \sum_{k=1}^{t} \mathbb{I}_{\left[i_{k}=n\right]}  \right)^2 L^{2} \eta^{2} \exp\left( \alpha^2 \sum_{j=1}^{t} \eta_{j}^{2}\right)  .
\]
Based on the Lipschitz continuity and above inequality, $\forall S\sim S'\in \mathcal{Z}^n,\forall z \in \mathcal{Z}$ we have the following, where we use the notation $(\bw,\bv)\equiv A_{\bw,\bv}(S;\phi)$ and $(\bw',\bv')\equiv A_{\bw,\bv}(S';\phi)$:
\begin{multline}
  | \ell\left(A_{\mathrm{w,v}}(S; \phi), z\right)-\ell\left(A_{\mathrm{w,v}}(S^{\prime}; \phi), z\right)|  %\leq %|\sup_{\bv' \in \vcal}(\ell\left(A_{\mathrm{w,v'}}(S; \phi), \ z\right) - )   
= |\ell((\bw,\bv);z)-\ell((\bw',\bv');z)| \\
\leq |\ell((\bw,\bv);z)- \ell((\bw',\bv);z) |+ |\ell((\bw',\bv);z)-\ell((\bw',\bv');z)| 
%  |\ell\left(A_{\mathrm{w}}(S; \phi),\mathrm{\bv'}; z\right)-   \ell\left(A_{\mathrm{w}}(S^{\prime}; \phi),\mathrm{\bv'}; z\right)| \\ + |\ell\left(A_{\mathrm{v}}(S; \phi), z\right)-\ell\left(A_{\mathrm{v}}(S^{\prime}; \phi), z\right)| \\ 
\leq L\left( \| {\mathbf{w}}- {\mathbf{w}'}\|_2  + \| {\mathbf{v}} - {\mathbf{v}}'\|_2\right) \\  \leq 4\sqrt{e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t) \max_{k\in[n]} \left(1 +  \sum_{r=1}^{t}  \mathbb{I}[i_r=k]\right). \label{sgda:lip-1}
\end{multline}

%%%==============

Based on the above inequalities, we know that SGDA is $\beta_\phi$-uniformly stable with
\begin{align*}
\beta_\phi = 4\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t) \max_{k\in[n]} \left(1 +  \sum_{r=1}^{t}  \mathbb{I}[i_r=k]\right).
\end{align*}
For simplicity, let
$\beta_{\phi,k} = 4\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t) \left( 1+\sum_{j=1}^{t}   \mathbb{I}[i_j=k] \right)$ for any $k\in[n]$.
Taking the expectation on both sides of the above identity, we derive
\begin{equation}\label{stab-sgda-1}%\label{}
\ebb_{\phi\sim\pbb}[\beta_\phi]\geq \ebb_{\phi\sim\pbb}[\beta_{\phi,k}] = 4\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t)(1+t/n),
\end{equation}
where $\mathbb{E} [ \mathbb{I}[i_j=k]] =1/n.$
Based on the above stability bounds, it remains to show that the stability parameter of SGDA meets Assumption \ref{ass:beta-theta}.
According to Lemma \ref{Chernoff's Bound} with $X_j = \mathbb{I}[i_j=k]$, we
get the following inequality with probability at least $1- \delta/n$ 
\begin{align}\label{convex:pbound_sgda}
\beta_{\phi,k} 
&\leq 4\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t)(1+t/n + \log(n/\delta) + \sqrt{2t/n\log(n/\delta)}).
\end{align}
By the union bound, with probability at least $1-\delta$, Eq. \eqref{convex:pbound_sgda} holds for all $k\in[n]$. Therefore, with probability at least $1-\delta$
\begin{align*}
\beta_\phi & \leq 4\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t)(1+t/n + \log(n/\delta) + \sqrt{2t/n\log(n/\delta)})\\ 
&\leq 4\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t)(1+t/n + 2\log(1/\delta) + 2\sqrt{ t/n\log(1/\delta)})\\
& \leq 4\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t)(1+t/n) + 8\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t)(1+\sqrt{t/n} )\log(1/\delta) \\ 
& \leq \ebb_{\phi\sim\pbb}[\beta_\phi]+ 8\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t)(1+\sqrt{t/n}) \log(1/\delta),
\end{align*}
where we have used $\delta\in(0,1/n)$ in the second inequality, and Eq. \eqref{stab-sgda-1} in the last inequality.
Therefore, Assumption \ref{ass:beta-theta} holds with $\bb= 8\sqrt{ e} L^2\eta \exp(\frac{1}{2}\alpha^2\eta^{2}t)(1+\sqrt{t/n})$.

The proof is completed. \end{proof}




%%===========

Based on the above lemmas, we are ready to state generalization bounds in Corollary \ref{cor_sgda_smooth} for SGDA in non-smooth and smooth cases. We derive the generalization bounds for SGDA with general sampling based on the stability analysis for SGDA with uniform sampling. 

\begin{proof}[Proof of Corollary \ref{cor_sgda_smooth}]%


With $A(S;\phi) =\left(A_{\mathrm{w,v}}(S; \phi)\right),$
it is then clear that SGDA with convex-concave loss functions in both smooth and non-smooth cases satisfies Assumption \ref{ass:beta-theta} based on Lemma \ref{sta_sgda_non-smooth} and Lemma \ref{lem:sta_sgda}. Therefore, the results are derived by applying the upper bound on $\beta_\phi$ to Theorem 1 in \citep{zhou2023toward}. \end{proof}


% \begin{proof}[Proof of Corollary \ref{cor_sgda_smooth}]%

% % $A(S;\phi):=\left(A_{\mathrm{w}}(S;\phi), A_{\mathrm{v}}(S;\phi)\right) \in \mathcal{W} \times \mathcal{V}$

% With  $A(S;\phi) =\left(A_{\mathrm{w}}(S;\phi), A_{\mathrm{v}}(S;\phi)\right)$, %and $L$-Lipschitzness,
% it follows from Lemma \ref{lem:sta_sgda} that SGDA with convex-concave and smooth loss functions satisfies Assumption \ref{ass:beta-theta}. Applying the upper bound on $\beta_\phi$ to Theorem \ref{thm:main}, we derive the result.\end{proof}



\if 0
\begin{figure}
% \subfigure[topk smallest]{\includegraphics[width=0.24\textwidth]{}}\subfigure[topk largest]{\includegraphics[width=0.24\textwidth]{}}
% \subfigure[topk smallest]{\includegraphics[width=0.245\textwidth]{}}\subfigure[topk largest]{\includegraphics[width=0.245\textwidth]{}}
\caption{{\td give results on clean dataset, about example difficulty.}visualization: The grid shows the top-20 pairs on dataset MNIST and CUHK03 training-set pairs respectively, with the lowest and highest sampling probabilities. 'PL' means  predicted label, 'GT' means ground truth and 'OOD' means noise data with wrong labels.   }
\label{top_mnist_clean_pairwise}
\end{figure}


\section{Appendix: Proof of Generalization Bound for Pairwise SGD}\label{app_pair_alg}

{\zsj may just cite arxiv}

We denote $\bw_1$ an initial point and a uniform distribution over $\left([n]\times[n]\right)^T$. 
At the $t$-th iteration for SGD, a pair of sample indices $\phi_t =(i_t, j_t)$ is uniformly randomly selected from the set $\{(i_t, j_t) : i_t, j_t \in [n], i_t \neq j_t\}$. This forms a sequence of index pairs $\phi=  (\phi_1,...,\phi_T)$. For step-size $\eta_t$, the model is updated  by $\bw_{t+1} = \bw_t - \eta_t \nabla \ell(\bw_t;z_{i_t},z_{j_t}).$

\begin{assumption}[Lipschitz continuity\label{defn_lipschitz}] Let $L>0$. We say $\ell$ is $L$-Lipschitz if for any $\bw_1$, $\bw_2$ $\in \wcal$, we have $
|\ell(\bw_1 )-\ell(\bw_2  )|\leq L\|\bw_1 - \bw_2\|_2.$ \end{assumption}
\begin{assumption}[Smoothness\label{defn_smooth}]
Let $\alpha\geq0$. We say a differentiable function $\ell$ is $\alpha$-smooth, if for any $\bw_1$, $\bw_2$ $\in \wcal$, $\|\nabla \ell(\bw_1   ) - \nabla \ell(\bw_2 )\|_2\leq \alpha\| \bw_1 - \bw_2\|_2,$
where $\nabla \ell$ represents the gradient of $\ell$.
\end{assumption}
\begin{assumption}[Convexity\label{defn_strongly_convex}]
We say $\ell$ is convex if the following holds $\forall \bw_1, \bw_2\in \wcal$, 
\begin{equation}
\ell(\bw_1 ) \geq \ell(\bw_2 ) +  \big\langle \nabla \ell(\bw_2 ), \bw_1 - \bw_2 \big\rangle,
\end{equation} 
where $\langle \cdot,  \cdot \rangle$ represents the inner product.
\end{assumption}
 
The following lemma shows that pairwise SGDU applied to non-smooth and smooth problems enjoys the sub-exponential stability. 

\begin{lemma}[{Sub-exponential stability of pairwise SGD}]\label{sta_sgd} 
Let $\{\bw_t\}, \{\bw_t'\}$ be two  sequences produced by SGD with uniform distribution $\pbb$ on neighboring $S$ and $S'$, respectively. Let Assumption \ref{defn_lipschitz} and Assumption~\ref{defn_strongly_convex} hold. 
\begin{itemize}
    \item[1)]At the $t$-th iteration, with fixed step sizes, % SGD is $\beta_\phi$-uniformly stable with  \[ \beta_\phi = 2\sqrt{e}L^2\eta\Big(  \sqrt{t}+ \max_{k\in[n]}\sum_{m=1}^{t}\ibb[i_m=k\mkern5mu\text{ or }\mkern5mu j_m =k]\Big),
% \quad\text{and}\quad 
% \aaa = 2\sqrt{e}L\eta\big(  \sqrt{t}+ 2t/n\big). \] where $i_m,j_m \in [n]$ and $i_m \neq j_m$. Furthermore, for any $\delta\in(0,1)$, with probability at least $1-\delta$ we have  \[ \beta_\phi \leq 2\sqrt{e}L^2\eta (\sqrt{t}+ 2t/n) +4\sqrt{e}L^2\eta\big(1+2(t/n)^{\frac{1}{2}}\big) \log(1/\delta). \]
Assumption \ref{ass:beta-theta} holds with %\textcolor{red}
\[ \aaa\!=\!2\sqrt{e}L^2\eta (\sqrt{t}+ 2t/n) \quad \text{and} \quad \bb\! =\! 4\sqrt{e}L^2\eta\big(1+2(t/n)^{\frac{1}{2}}\big) .\]
 \item[2)] In addition, if  the Assumption \ref{defn_smooth} holds and $\eta \leq 2/\alpha$, at t-th iteration, % RCD is $\beta_\phi$-uniformly stable with \[ \beta_\phi =  2L^2 \max_{k\in[n]}\sum_{m=1}^{t}\eta_m\ibb[i_m=k\mkern5mu\text{ or }\mkern5mu j_m =k], %\quad\textbf{and}\quad 
% \aaa = \frac{4L^2}{n}\sum_{k=1}^{t}\eta_k, \] where $i_m,j_m \in [n]$ and $i_m \neq j_m$, and $\eta_m \leq 2/\alpha$. If $\eta_t=\eta$, then $\forall \delta\in(0,1)$, w.p. at least $1-\delta$ we have \[ \beta_\phi \leq 4L^2\eta t/n + 4L^2\eta\big(1+2(t/n)^{\frac{1}{2}}\big)\log(1/\delta). \]
Assumption~\ref{ass:beta-theta} holds with % \textcolor{red}
 \[b_1= 4L^2\eta t/n\quad \text{and} \quad\bb=4L^2\eta\big(1+2(t/n)^{\frac{1}{2}}\big).\]
\end{itemize}
\end{lemma}

\begin{proof}[Proof of Lemma \ref{sta_sgd}, 1)]
Without loss of generality, we assume $S$ and $S'$ differ by the last example. By Eq.~(F.2) in~\citet{lei2021generalizationb}, we have %\textcolor{red}{refer to an eq?} {\zsj no eq. to cite} , we have
\begin{align*}\|\bw_{t+1} - \bw_{t+1}'\|^2_2 &\leq 4L^2\eta^2 (1+p)^{\sum_{k=1}^{t} \ibb[i_k=n\mkern5mu\text{ or }\mkern5mu j_k =n] }\Big(  t+ p^{-1} \sum_{k=1}^{t} \ibb[i_k=n\mkern5mu\text{ or }\mkern5mu j_k =n] \Big).
\end{align*}
We set $p=1/\sum_{k=1}^{t} \ibb[i_k=n\mkern5mu\text{ or }\mkern5mu j_k =n]$ and use the inequality $(1+1/x)^x\leq e$ to get
\[
\|\bw_{t+1} - \bw_{t+1}'\|^2_2
\leq 4eL^2\eta^2\Big(  t+ \Big(\sum_{k=1}^{t} \ibb[i_k=n\mkern5mu\text{ or }\mkern5mu j_k =n]\Big)^2 \Big).
\]
It then follows that
\[
\|\bw_{t+1} - \bw_{t+1}'\|_2
\leq 2\sqrt{e}L\eta\Big(  \sqrt{t}+ \sum_{k=1}^{t} \ibb[i_k=n\mkern5mu\text{ or }\mkern5mu j_k =n]\Big).
\]
According to the Lipschitz continuity, we know that SGD is $\beta_\phi$-uniformly stable with 
\begin{equation} \label{eqa:nonsmooth_bound_sgd_pairwise}
\beta_\phi = 2\sqrt{e}L^2\eta\Big(  \sqrt{t}+ \max_{k\in[n]}\sum_{m=1}^{t}\ibb[i_m=k\mkern5mu\text{ or }\mkern5mu j_m =k]\Big).
\end{equation}
To bound $\beta_\phi$ with high probability, we set $\beta_{\phi,k}=2\sqrt{e}L^2\eta\big(\sqrt{t}+ \sum_{m=1}^{t} \ibb[i_m=k\mkern5mu\text{ or }\mkern5mu j_m =k]\big)$, and note that
%its expectation is \textcolor{red}{$E[\beta_{\phi,k}]$ is not needed}
%\[
%\ebb_{\phi\sim\pbb}[\beta_{\phi,k}] \leq 2\sqrt{e}L\eta\big(  \sqrt{t}+ 2t/n\big),
%\]
%where 
$\mathbb{E} [ \mathbb{I}[i_m=k\mkern5mu\text{ or }\mkern5mu j_m =k]] \leq \mbox{Pr}\{i_m=k\} +\mbox{Pr}\{j_m=k\} = 2/n.$
Applying Lemma \ref{Chernoff's Bound} to the sum in Eq. \eqref{eqa:nonsmooth_bound_sgd_pairwise}, with probability at least $1- \delta/n$, we get
\begin{align*}
\beta_{\phi,k} \leq 2\sqrt{e}L^2\eta (\sqrt{t}+ 2t/n + \log(n/\delta) + 2\sqrt{ t/n\log(n/\delta)}).
\end{align*}
Therefore, with probability at least $1-\delta$, the following holds simultaneously for all $k\in[n]$ by the union bound on probability
\[
\beta_{\phi,k} \leq 2\sqrt{e}L^2\eta (\sqrt{t}+ 2t/n + \log(n/\delta) + 2\sqrt{ t/n\log(n/\delta)}).
\]
For $\delta\in(0,1/n)$, this implies the following inequality with probability at least $1-\delta$
\begin{align}
\beta_\phi\leq 2\sqrt{e}L^2\eta (\sqrt{t}+ 2t/n + 2\log(1/\delta) + 2\sqrt{ 2 t/n \log(1/\delta)}). \label{beta_theta}
\end{align}
Finally, from Eq. \eqref{beta_theta} we know that SGD with the uniformly distributed hyperparameter $\phi$ meets Assumption \ref{ass:beta-theta} with 
\[
\aaa = 2\sqrt{e}L^2\eta (\sqrt{t}+ 2t/n), \quad\bb=4\sqrt{e}L^2\eta (1+ \sqrt{2t/n}).
\]
The proof is completed.
\end{proof}

\begin{proof}[Proof of Lemma \ref{sta_sgd}, 2)] 
By an intermediate result in the proof of Lemma C.3 of \citet{lei2020sharper}, for all $z, \tilde{z} \in \mathcal{Z}$ and $i_k, j_k\in [n], i_k\neq j_k$, under $L$-Lipschitzness, we have 
\[
\left|\ell\left(\mathbf{w}_{t+1} ; z, \tilde{z} \right)-\ell\left(\mathbf{w}_{t+1}^{\prime}; z, \tilde{z}\right)\right| \leq L \|\mathbf{w}_{t+1}-\mathbf{w}_{t+1}^{\prime} \|_{2}  \leq 2 L^{2} \sum_{k=1}^{t} \eta_{k} \mathbb{I}\left[i_{k}=n \text { or } j_{k}=n\right] .\]
From this inequality it follows that SGD is $\beta_\phi$-uniformly stable with
\begin{align*}
\beta_\phi = 2L^2 \max_{k\in[n] }\sum_{m=1}^{t}\eta_m  \mathbb{I}[i_m=k\mkern5mu\text{ or }\mkern5mu j_m =k].
\end{align*}
Let $\beta_{\phi,k} = 2L^2 \sum_{m=1}^{t}\eta_m  \mathbb{I}[i_m=k\mkern5mu\text{ or }\mkern5mu j_m =k]$ for any $k\in[n]$.

It remains to show that the stability parameter of SGD meets  Assumption \ref{ass:beta-theta}.
Using Lemma \ref{Chernoff's Bound} with $Z_m = \mathbb{I}[i_m=k\mkern5mu\text{ or }\mkern5mu j_m =k]$ and noting that $\mathbb{E} [ \mathbb{I}[i_m=k\mkern5mu\text{ or }\mkern5mu j_m =k]] \leq  2/n$, we
get the following inequality with probability at least $1- \delta/n$ (taking $\eta_m=\eta$ for all $m$),
\begin{align}\label{convex:pbound}
\beta_{\phi,k}
&\leq 2L^2\eta (2t/n + \log(n/\delta) + 2\sqrt{  t/n\log(n/\delta)}).
\end{align}
By the union bound,  with probability at least $1-\delta$, Eq. \eqref{convex:pbound} holds for all $k\in[n]$. Therefore, with probability at least $1-\delta$, it gives
\begin{align*}
\beta_\phi & \leq 2L^2\eta (2t/n + \log(n/\delta) + 2\sqrt{ t/n\log(n/\delta)}) \leq 2L^2\eta (2t/n +  2\log(1/\delta) +\\
& 2\sqrt{2 t/n\log(1/\delta)}) \leq 4L^2\eta t/n + 4L^2\eta(1+ \sqrt{2t/n}) \log(1/\delta),
\end{align*}
where we have used $\delta\in(0,1/n)$ in the second inequality. Assumption \ref{ass:beta-theta} holds with %\textcolor{red}{check this doesn't look right - maybe we need to rewrite the assumption to have a parameter instead of $E[\beta_{\phi}]$?}
\[\aaa = 4L^2\eta t/n, \quad \bb=4L^2\eta(1+ \sqrt{2t/n}).\] 
This completes the proof.
\end{proof}

\begin{proof}[Proof of Theorem \ref{cor_sgd_pairwise} ]% 
With $A(S;\phi)= \bw_T$, %and $L$-Lipschitzness, 
it follows from Lemma \ref{sta_sgd}, 1) and 2) that SGD satisfies Assumption \ref{ass:beta-theta} for convex non-smooth and convex smooth loss functions, respectively. Plugging the corresponding upper bounds on $\beta_\phi$ into Theorem 1 of \citet{zhou2023toward} yields the result.\end{proof}



\fi

\section{Algorithm for pairwise SGD}\label{app:pairwise_alg}

Inspired by the r.h.s. of the PAC-Bayes generalization bound of Theorem \ref{cor_sgd_pairwise}, in this section we devise new pairwise SGD-Q algorithms that learn a sampling distribution (along with the model's parameters) from the data. 
The following objective function mirrors the form of the r.h.s. of the bounds for pairwise SGD. 


\begin{equation}  
\mathcal{L}(q(i,j)) = \sum_{i,j\in[n]:i\neq j} q(i,j)\ell\left( h; z_i,z_j\right) + \nu\cdot \text{KL}(q \| p) + \lambda \big( \sum_{i,j\in[n]:i\neq j} q(i,j) - 1 \big),\label{obj_2}
\end{equation}
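where $p(i,j)=\frac{1}{n(n-1)}$ for all $i,j\in[n]$ with $i\neq j$ is the uniform prior, $\nu$ weights the KL term, and $\lambda$ is the Lagrange multiplier for the normalization constraint $\sum_{i,j\in[n]:i\neq j} q(i,j) = 1$.
We want to minimize this objective w.r.t. $q$, that is, to find the $q$ that minimizes the expected empirical loss while staying close to the prior.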



All terms depend on $q$; taking the derivative w.r.t. each $q(i,j)$, setting it to zero, and rearranging the resulting stationarity equation (with $\lambda$ determined by the normalization constraint) yields the update for $q$:
\begin{align}
q(i,j) =\frac{ \exp\left(-\frac{1}{\nu} \ell(h; z_i,z_j)\right)}{\sum_{a,b\in[n]:a\neq b} \exp\left(-\frac{1}{\nu} \ell(h; z_a,z_b)\right) } \propto  \exp\big(-\frac{1}{\nu} \ell(h; z_i,z_j)\big).
\label{qupd3}
\end{align}

\section{Appendix: Additional experiments}\label{sec:appdx}
Figures~\ref{Appc1}--\ref{Appc4} show the effect of $1/\nu$ in SGDA-Q for adversarial training under varying label-noise proportions and diameters $\epsilon$ of the adversarial perturbation.

\begin{figure}[H]
\centering
\subfigure[eps=0]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.2_eps_0.0.pdf}}\;\;
\subfigure[eps=0.1]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.2_eps_0.1.pdf}}
\subfigure[eps=0.2]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.2_eps_0.2.pdf}}\;\; 
\subfigure[eps=0.3]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.2_eps_0.3.pdf}} 
\subfigure[eps=0.4]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.2_eps_0.4.pdf}}
\caption{\label{Appc1} The impact of $1/\nu$ on test accuracy under PGD attack  across different values of $\epsilon$ under symmetric noise rate 0.2. }
\end{figure}

\begin{figure}
\centering
\subfigure[eps=0]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.2_fgsm_eps_0.0.pdf}}\;\;
\subfigure[eps=0.1]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.2_fgsm_eps_0.1.pdf}}
\subfigure[eps=0.2]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.2_fgsm_eps_0.2.pdf}}\;\;
\subfigure[eps=0.3]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.2_fgsm_eps_0.3.pdf}} 
\subfigure[eps=0.4]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.2_fgsm_eps_0.4.pdf}}
\caption{\label{Appc2} The impact of $1/\nu$ on test accuracy under FGSM  attack across different values of $\epsilon$ under symmetric noise rate 0.2. }
\end{figure}


\begin{figure}
\centering
\subfigure[eps=0]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.4_pgd_eps_0.0.pdf}}\;\;
\subfigure[eps=0.1]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.4_pgd_eps_0.1.pdf}}
\subfigure[eps=0.2]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.4_pgd_eps_0.2.pdf}}\;\;
\subfigure[eps=0.3]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.4_pgd_eps_0.3.pdf}} 
\subfigure[eps=0.4]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.4_pgd_eps_0.4.pdf}}
\caption{\label{Appc3} The impact of $1/\nu$ on test accuracy under PGD attack  across different values of $\epsilon$ under symmetric noise rate 0.4. }
\end{figure}

\begin{figure}
\centering
\subfigure[eps=0]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.4_fgsm_eps_0.0.pdf}}\;\;
\subfigure[eps=0.1]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.4_fgsm_eps_0.1.pdf}}
\subfigure[eps=0.2]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.4_fgsm_eps_0.2.pdf}}\;\;
\subfigure[eps=0.3]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.4_fgsm_eps_0.3.pdf}} 
\subfigure[eps=0.4]{\includegraphics[width=0.25\textwidth]{img/test/test_mnist_pointwise_sym_0.4_fgsm_eps_0.4.pdf}}
\caption{\label{Appc4} The impact of $1/\nu$ on test accuracy under FGSM attack  across different values of $\epsilon$ under symmetric noise rate 0.4. }
\end{figure}

\end{document}
