%\documentclass{uai2022} % for initial submission
 \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
%\documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
% \usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>: the two mandatory arguments
% in reversed order around the optional separator (default: `-').
% Template example; it is not used in the visible part of this file.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% -------
% ADDED by us 
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage[super]{nth}
\usepackage{multirow}
\usepackage{booktabs} % for tables
\usepackage{makecell}
\usepackage{xcolor} % to load extra colors
\usepackage{colortbl}
\usepackage{quiver}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage[capitalize,noabbrev]{cleveref}


% Table column alignment with fixed width (array package).
% Each column type takes the column width as its single argument:
%   L{<width>}: ragged-right (left-aligned) text
%   C{<width>}: centred text
%   R{<width>}: ragged-left (right-aligned) text
% All three are built on the vertically centred `m' column.
% \let\newline\\ keeps the alignment's line break available as \newline
% before \arraybackslash restores \\ as the row terminator, and
% \hspace{0pt} allows the first word of a cell to be hyphenated.
\usepackage{array}
\newcolumntype{L}[1]{>{\raggedright\let\newline\\\arraybackslash\hspace{0pt}}m{#1}}
\newcolumntype{C}[1]{>{\centering\let\newline\\\arraybackslash\hspace{0pt}}m{#1}}
\newcolumntype{R}[1]{>{\raggedleft\let\newline\\\arraybackslash\hspace{0pt}}m{#1}}

% Math operators (typeset upright with operator spacing).
% The starred form places subscripts underneath the operator in display
% style, e.g. \argmax_{x}.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
% Expectation, typeset as a blackboard-bold E (\mathbb comes from amssymb).
\DeclareMathOperator{\E}{\mathbb{E}}

% Theorem-like environments (amsthm).
\usepackage{amsthm}
% Disabled declarations, kept for reference:
%\newtheorem{theorem}{Theorem}
%\theoremstyle{theorem}
%\newtheorem{postulate}{Postulate}
% Every environment declared below uses the `definition' style.
\theoremstyle{definition}
\newtheorem{assumption}{Assumption} % numbered
\newtheorem*{remark}{Remark} % starred form: unnumbered
\theoremstyle{definition} % redundant: this style is already active
\newtheorem{definition}{Definition} % numbered, with its own counter

% -------


\title{Efficient and Transferable Adversarial Examples \\ from Bayesian Neural Networks \\ (Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<martin.gubri@uni.lu>?Subject=Your UAI 2022 paper on transferability from BNN}{Martin~Gubri}{}}
\author[1]{Maxime~Cordy}
\author[1]{Mike~Papadakis}
\author[1]{Yves~Le~Traon}
\author[2]{Koushik~Sen}
% Add affiliations after the authors
\affil[1]{%
    University of Luxembourg\\
    Luxembourg, LU
}
\affil[2]{%
    University of California\\
    Berkeley, CA, USA
}

\begin{document}

\onecolumn
\maketitle

\appendix

%\section{Supplementary Materials}

In the supplementary materials for the paper, the following are provided:
\begin{itemize}
    \item The detailed experimental setup section;
    \item The description and experimental setup of Bayesian and Ensembling training techniques;
    \item Additional results, including:
    \begin{itemize}
        \item The natural accuracy of target Neural Networks;
        \item The proportions of vanishing gradients of cSGLD surrogate compared to DNN surrogate;
        \item The intra-architecture transfer success rates on cSGLD and Deep Ensemble of 1, 2, 5 and 15 DNNs surrogates;
        \item The inter-architecture transfer success rates of single architecture surrogates;
        \item The intra-architecture transfer success rate of six Bayesian and Ensemble training methods attacked by L$2$ I-FGSM;
        \item The intra-architecture transfer success rate combined with test-time transformations on CIFAR-10;
        \item The transfer rate of cSGLD with respect to the number of cycles and samples per cycle;
    \end{itemize}
    \item An illustration of the cSGLD cyclical learning rate schedule;
    \item A diagram of the relationships between gradient-based attacks;
    \item The algorithm applied to perform approximate Bayesian model averaging efficiently;
    \item Details on hyperparameters, including:
    \begin{itemize}
        \item The transfer success rate of iterative attacks with respect to the number of iterations;
        \item The tuning of the hyperparameter of the Skip Gradient Method technique to extend it to PreResNet110;
        \item The hyperparameters used to train and attack models.
    \end{itemize}
\end{itemize}

\begin{figure}[!htb]
\centering
\includegraphics[width=0.5\columnwidth]{figure/figure_csgld_lr_croped}
\caption{Illustration of the cSGLD cyclical learning rate schedule~(red) and the traditional decreasing learning rate schedule~(blue). Each cSGLD cycle is composed of an exploration phase~(burn-in period of MCMC algorithms --- dotted red line) and of a sampling phase~(solid red line). Figure taken from Zhang et al. (2020).} %\cite{Zhang2020CyclicalLearning}.}
\label{fig:illustration-csgld-lr}
\end{figure}


\section{Experimental Setup}
\label{sec:xp-setup-appendix}


\noindent \textbf{Datasets.} We consider ImageNet (ILSVRC2012; \citealt{ILSVRC15}), CIFAR-10 \citep{Krizhevsky2009LearningImages} and MNIST. In all cases, we train the surrogate and target models on the entire training set. For each CIFAR-10 and MNIST target model, we select all the examples from the test set that are correctly predicted by it. In the case of ImageNet, we use a random subset of 5000 correctly predicted test images.


\noindent \textbf{Architectures.} We cover a diverse set of architectures in terms of heterogeneity (similar and different families of architecture), computation cost, and release date. For ImageNet, we select five architectures with $3\times224\times224$ input size. These comprise three classical architectures: ResNet-50 \cite{He2016}\footnote{\cite{Ashukha2020PitfallsLearning} study ResNet-50 only on ImageNet. We used their shared trained models as surrogate DNNs.}, ResNeXt-50 32x4d \cite{Xie2017} and Densenet-121 \cite{Xie2017}; and two mobile architectures: MNASNet 1.0 \cite{Tan2018} and EfficientNet-B0 \cite{Tan2019}. Following the work of \cite{Ashukha2020PitfallsLearning}, we consider the following five architectures for CIFAR-10: PreResNet110 \cite{He2016IdentityNetworks}, PreResNet164, VGG16BN, VGG19BN \cite{Simonyan2015VeryRecognition}, and WideResNet28x10 \cite{Zagoruyko2016WideNetworks}. We study three architectures on MNIST: ``FC'', a fully connected neural network with two hidden layers 1200-1200; ``Small FC'', with a single fully connected hidden layer of size 512; and ``CNN'', a convolutional neural network composed of two convolutional layers with 32 filters each followed by two fully connected hidden layers 200-200.

\noindent \textbf{Target models.} The target models are deterministic DNNs. For ImageNet, we use the pre-trained models provided by PyTorch \cite{NEURIPS2019_9015} and the pre-trained EfficientNet-B0 provided by PyTorch Image Models (\textit{timm}). In the case of CIFAR-10, they are trained using the Adam optimizer for 300 epochs with a step-wise learning rate decay that divides it by 10 every 75 epochs (MNIST: 50 epochs in total, learning rate divided by 10 every 20 epochs). The benign accuracy of all target models exceeds 73\% (ImageNet), 83\% (CIFAR-10) and 98\% (MNIST); see \cref{table-benin-acc-target} below for exact values.

\begin{table}[!htb]
\centering
\caption{Top-1 natural test accuracy of target DNNs.}
\begin{tabular}{ll|L{5em}}
\toprule
\bfseries Dataset                   & \bfseries Target DNN       & \bfseries Benign Test Accuracy \\
\midrule
\multirow{5}{*}{CIFAR-10} & PreResNet110     & 93.26 \%             \\
                          & PreResNet164     & 93.03 \%             \\
                          & VGG16bn          & 83.68 \%             \\
                          & VGG19bn          & 83.62 \%             \\
                          & WideResNet28x10  & 92.13 \%             \\
\hline
\multirow{5}{*}{ImageNet} & ResNet50        & 76.15 \%              \\
                          & ResNeXt50 32x4d & 77.62 \%              \\
                          & Densenet121     & 74.65 \%              \\
                          & MNASNet 1.0      & 73.51 \%             \\
                          & EfficientNet-B0  & 77.70 \%             \\       
\hline
\multirow{3}{*}{MNIST}    & CNN             & 99.33 \%              \\
                          & FC              & 98.65 \%              \\
                          & Small FC        & 98.41 \%              \\
\bottomrule
\end{tabular}
\label{table-benin-acc-target}
\end{table}


\noindent \textbf{Surrogate models (Deep Ensemble).} For CIFAR-10 and MNIST, the DNNs used to form surrogate ensembles are trained using the same process as the target models. Therefore, the comparison between deterministic DNNs and cSGLD is fair, since one can expect the deterministic DNN surrogates to be ``close'' to the target. As for ImageNet, we retrieve an ensemble of 15 ResNet-50 models trained independently by \cite{Ashukha2020PitfallsLearning} using SGD with momentum during 130 epochs. For the RQ2 experiments, we similarly train one model for each of the 4 other ImageNet architectures.

\noindent \textbf{Surrogate models (cSGLD).} Following the work of \cite{Ashukha2020PitfallsLearning} and \cite{Zhang2020CyclicalLearning}, we train models with cSGLD on CIFAR-10 for 6 learning rate cycles (which, as our RQ4 experiments reveal, is where the transfer rate starts plateauing). cSGLD performs 5 cycles on ImageNet, and 10 on MNIST. 
The learning rate follows a cosine annealing schedule for fast convergence. Each cycle lasts 45 epochs on ImageNet, 50 on CIFAR-10 and 10 on MNIST. The last epochs of every cycle form the sampling phase: noise is added and one sample is drawn at the end of each epoch. On CIFAR-10, we obtain 5 samples per cycle (resp. 3 on ImageNet and 4 on MNIST), so 30 samples in total (resp. 15 and 20). An illustration of the cSGLD cyclical learning rate schedule is given in \cref{fig:illustration-csgld-lr}.
To train ResNet-50 models on ImageNet, we re-use the original cSGLD hyperparameters.

\noindent \textbf{Surrogate models (other training methods).} In addition to Deep Ensemble and cSGLD, and following \cite{Ashukha2020PitfallsLearning}, we consider 2 Bayesian Deep Learning techniques (SWAG and VI) and 2 Ensemble ones (SSE and FGE). We train every technique on CIFAR-10, and cSGLD and SWAG on ImageNet. We retrieve trained Deep Ensemble, SSE, FGE and VI ImageNet models from \cite{Ashukha2020PitfallsLearning}. Technique descriptions and the experimental setup of surrogates trained with SWAG, VI, FGE, or SSE are detailed below in \cref{sec:bayesian-ensemble-techniques-appendix}.

\begin{figure}[!htb]
\centering
\[\begin{tikzcd}
	{\text{MI-FGSM}} \\
	\\
	{\text{I-FGSM}} && {\text{FGSM}} \\
	\\
	{\text{PGD}}
	\arrow["{\text{Momentum} = 0}"{description}, from=1-1, to=3-1]
	\arrow["\substack{\text{Nb restarts}=1 \\ \text{No random init}}"{description}, from=5-1, to=3-1]
	\arrow["\substack{\text{Nb iters}=1 \\ \text{Step-size } \alpha=\varepsilon}"{marking}, from=3-1, to=3-3]
\end{tikzcd}\]
\caption{Relationships between gradient-based attacks.}
\label{fig:relation-attacks}
\end{figure}


\noindent \textbf{Adversarial attacks.} We applied our variant of 4 gradient-based attacks as described in the approach section. The attacker's goal is misclassification (untargeted adversarial examples). We perform both $2$-norm and $\infty$-norm bounded adversarial attacks, and report means and standard deviations computed on 3 random seeds. In accordance with values commonly used in the literature \cite{Croce2020ReliableAttacks}, the maximum perturbation norm $\varepsilon$ is set respectively to 0.5 and $\frac{4}{255}$ on CIFAR-10, and respectively to 3 and $\frac{4}{255}$ on ImageNet. On MNIST, the values are respectively 3 and 0.1. The step-size $\alpha$ is set to $\frac{\varepsilon}{10}$. We choose to perform 50 iterations so that the transfer rate plateaus for all iterative attacks (I-FGSM, MI-FGSM and PGD) on both norms and both datasets (see \cref{fig:nb-iters-imagenet,fig:nb-iters-cifar} below). PGD runs with 5 random restarts. FGSM aside, \textit{every iteration computes the gradient of 1 model per architecture}. Therefore, the attack computation cost and volatile memory are not multiplied by the size of the surrogate, except for FGSM which computes its unique gradient against all available models. cSGLD samples are attacked in random order.
The MI-FGSM decay factor is set to $0.9$.

\begin{algorithm}[tb]
   \caption{Variant of I-FGSM attack to perform approximate Bayesian Model Averaging efficiently on numerous models from several architectures}
   \label{alg:attack-variant}
\begin{algorithmic}
   \STATE {\bfseries Input:} original example $(x, y)$, $S_A$ ordered sets of model parameters $( \theta_s^1 )_{s=1}^S, \ldots, ( \theta_s^{S_A} )_{s=1}^S$ sampled from the corresponding posterior distribution $\theta_s^i \sim p(\theta_s  | \mathcal{D})$, number of iterations $n_{\text{iter}}$, perturbation $p$-norm $\varepsilon$, step-size $\alpha$
   \STATE {\bfseries Output:} adversarial example $x_\text{adv}$
   \STATE Shuffle each ordered set of model samples $( \theta_s^1 )_{s=1}^S, \ldots, ( \theta_s^{S_A} )_{s=1}^S $
   \STATE $x_\text{adv} \leftarrow x$
   \FOR{$i=1$ {\bfseries to} $n_{\text{iter}}$}
   \STATE ${x_\text{adv}} \leftarrow x_\text{adv} + \frac{\alpha}{S_A} \sum_{a=1}^{S_A} \nabla \mathcal{L}(x_\text{adv} ;\, y, \theta^a_{i \bmod S}) $
   \STATE $x_\text{adv} \leftarrow \text{project}(x_\text{adv}, B_{\varepsilon}[x])$
   \STATE $x_\text{adv} \leftarrow \text{clip}(x_\text{adv})$
   \ENDFOR
\end{algorithmic}
\end{algorithm}



\noindent \textbf{Test-time transformations.} In the dedicated section, we consider three test-time transformations that are applied during the attack and designed for transferability (see related work section): Ghost Networks \cite{Li2018LearningNetworks}, Input Diversity \cite{Xie2019} and Skip Gradient Method \cite{Wu2020SkipResNets}. We implemented the first two in PyTorch with their original hyperparameters. To extend Input Diversity to the smaller input sizes of CIFAR-10, we keep the same maximum resize ratio of $0.9$. We reuse the original implementation of the third one on ResNet50, and extend it to PreResNet110 (we set its hyperparameter via grid-search, see \cref{fig:hp-tuning-sgm} below).

\noindent \textbf{Implementation.} The source code of the experiments is publicly available on GitHub\footnote{\url{https://github.com/Framartin/transferable-bnn-adv-ex}}. Our attack is built on top of the Python ART library \cite{art2018}. cSGLD, VI, SSE, and FGE models were trained using the implementation of \cite{Ashukha2020PitfallsLearning} available on GitHub\footnote{\url{https://github.com/bayesgroup/pytorch-ensembles}}. All models were trained with PyTorch \cite{NEURIPS2019_9015}. We use EfficientNet-B0 from timm\footnote{\url{https://github.com/rwightman/pytorch-image-models}}. We train SWAG on ImageNet with the original implementation \cite{Maddox2019ALearning}. We use the following software versions: Python 3.8.8, PyTorch 1.7.1 (1.9.0 for Flops measurement), torchvision 0.8.2, Adversarial Robustness Toolbox 1.6.0, and timm 0.3.2.

\noindent \textbf{Flops.} We measure the training computational complexity in Flops using the PyTorch profiler. The computation overhead of one epoch with cSGLD compared to one with SGD/Adam is negligible. The main difference is the addition of noise to the weights during the sampling phase. On CIFAR-10, the overhead of 1 cSGLD epoch of PreResNet110 with added noise compared to one of a DNN trained with Adam (SGD) is 0.0187\% Flops (respectively 0.0146\% for ResNet50 on ImageNet).

\noindent \textbf{Infrastructure.} Experiments were run on Tesla V100-DGXS-32GB GPUs. The server has the following specifications: 256GB RDIMM DDR4, CUDA version 10.1, Linux (Ubuntu) operating system. 


%\clearpage
\section{Bayesian and Ensemble Training Techniques}
\label{sec:bayesian-ensemble-techniques-appendix}

Following the work of \cite{Ashukha2020PitfallsLearning}, we consider the following training techniques: Deep Ensemble \cite{Lakshminarayanan2016SimpleEnsembles}, cSGLD \cite{Zhang2020CyclicalLearning}, SWAG \cite{Maddox2019ALearning}, VI, SSE \cite{Huang2017SnapshotFree}, and FGE \cite{Garipov2018LossDNNs}. 
Due to computational limitations, we evaluate them on a single attack run (one random seed) of 5000 images.

\begin{figure}[ht]
    \centering
        \subfloat[ImageNet]{\includegraphics[width=0.45\textwidth]{figure/RQ_techniques_imagenet_transfer_vs_time_L2.pdf}}
        \qquad
        \subfloat[CIFAR-10]{\includegraphics[width=0.45\textwidth]{figure/RQ_techniques_transfer_vs_time_L2.pdf}}
    \caption{Intra-architecture L$2$ I-FGSM success rate with respect to the training computational complexity of six Bayesian and Ensemble methods. Every curve starts with one model, and each successive point is obtained by forming an ensemble with one more model.}
    \label{fig:ensemble-technique-L2}
\end{figure}


\noindent \textbf{Deep Ensemble.} Deep Ensemble \cite{Lakshminarayanan2016SimpleEnsembles} simply trains several DNNs independently with random initialization and random subsampling (mini-batch on shuffled data in practice). All DNNs have the same standard hyperparameters for training. For classification, predictions of individual DNNs are averaged. We train 15 PreResNet110, 4 PreResNet164, 4 VGG16bn, 4 VGG19bn, and 4 WideResNet28x10 DNNs on CIFAR-10. We retrieve 15 ResNet50 DNNs trained by \cite{Ashukha2020PitfallsLearning} on ImageNet, and train 1 DNN ourselves for each of the remaining studied architectures (ResNeXt50 32x4d, DenseNet121, MNASNet 1.0, and EfficientNet-B0).

\noindent \textbf{cSGLD.} We refer the reader to the approach section for a detailed description of cyclical Stochastic Gradient Langevin Dynamics. \cref{fig:illustration-csgld-lr} illustrates both the cyclical cosine annealing learning rate schedule and the separation of each cycle into an exploration phase (called the burn-in period of MCMC algorithm) and a sampling phase.

\noindent \textbf{SWAG.} Stochastic Weight Averaging-Gaussian (SWAG) \cite{Maddox2019ALearning} is a Bayesian Deep Learning method that fits a Gaussian onto SGD iterates to approximate the posterior distribution over weights. Its first moment is the SWA solution, and its second moment a diagonal plus low-rank covariance matrix. Both are estimated from SGD iterates with constant learning rate (0.001 on ImageNet and 0.01 on CIFAR-10). On ImageNet, SWAG performs 10 additional epochs to collect SGD iterates from one of the Deep Ensemble DNNs. On CIFAR-10, a regular pre-training phase of 160 epochs precedes 140 epochs to collect checkpoints. Once fitted, models are sampled from the Gaussian distribution. For every sample, batch normalization statistics are updated in a forward pass over the entire CIFAR-10 train set and over a random subset of 10\% on ImageNet. Apart from the fixed initial cost, the marginal computational cost to obtain a sample is very low. We sample a maximum of 50 models because iterative attacks perform 50 iterations of one model per iteration, and further samples would be discarded. Thus, the line corresponding to SWAG in \cref{fig:ensemble-technique-L2} is shorter than the ones of other methods. The rank of the estimated covariance matrix is 20. Batch-size is 128 on CIFAR-10, and 256 on ImageNet.

\noindent \textbf{VI.} Variational Inference (VI) approximates the true posterior distribution with a variational approximation, here a fully-factorized Gaussian distribution, and maximizes a corresponding lower bound. A Gaussian prior is chosen. Once trained, the variational approximation is used as the posterior. There is no additional sampling phase to perform Bayesian model averaging. Therefore, we cannot tune the number of samples and a single VI point is plotted in \cref{fig:ensemble-technique-L2}. We follow the solutions of \cite{Ashukha2020PitfallsLearning} to avoid underfitting: pre-training and annealing of $\beta$. The first moment of the Gaussian variational approximation is initially set to a DNN pre-trained similarly to Deep Ensemble (300 epochs on CIFAR-10 with initial learning rate of $10^{-4}$, and 130 epochs on ImageNet starting at $10^{-3}$). The log of its second moment is initially set to $-5$ on CIFAR-10 and $-6$ on ImageNet, and further optimized for 100 epochs (45 on ImageNet) with Adam and a learning rate of $10^{-4}$. $\beta$ is set to $10^{-5}$ on CIFAR-10 and $10^{-4}$ on ImageNet. Batch-size is 128 on CIFAR-10, and 256 on ImageNet. On MNIST, we train VI using the code and the hyperparameters of \cite{Carbone2020RobustnessAttacks}.

\noindent \textbf{SSE.} The Snapshot Ensembles technique \cite{Huang2017SnapshotFree} is the foundation of cSGLD. The learning rate is cyclical with a cosine annealing schedule. Contrary to cSGLD, SSE saves a single snapshot per cycle and does not add gradient noise. The cycles are 40 epochs long on CIFAR-10, 45 on ImageNet. The maximum learning rate is 0.2 and the batch size is 64 on CIFAR-10 (respectively 0.1 and 256 on ImageNet).

\noindent \textbf{FGE.} Fast Geometric Ensembling \cite{Garipov2018LossDNNs} is a method developed after the empirical observation of Mode Connectivity on CIFAR-10 and CIFAR-100: it's possible to find a path in the parameters space that connects two independently trained DNNs such that the models along the path have low loss and high test accuracy. In practice, it uses a cyclical triangular learning rate and collects one model during each cycle. It is quite similar to SSE, except for the learning rate schedule, the much shorter cycles (4 epochs on CIFAR-10, 2 epochs on ImageNet), and a pre-training phase. Pre-training lasts for 160 epochs on CIFAR-10. On ImageNet, FGE is initialized from one Deep Ensemble checkpoint. The learning rate varies between $5\times10^{-5}$ and $5\times10^{-3}$ on CIFAR-10 and $10^{-6}$ and $10^{-4}$ on ImageNet. Batch-size is 128 on CIFAR-10, and 256 on ImageNet.


\noindent \textbf{HMC.} Hamiltonian Monte Carlo (HMC) is considered the gold standard to train BNNs. We trained the Small FC architecture on MNIST, using the code and the hyperparameters of \cite{Carbone2020RobustnessAttacks}. Unfortunately, HMC does not scale to larger DNNs, even on MNIST.

\clearpage

\section{Vanishing gradients}
\label{sec:vanishing-grads-appendix}



\begin{table*}[!htb]
\centering
\caption{Proportion of vanished gradients of each of the 15 individual models and of the ensemble of 15 models (in \%). Gradients disappear before and after averaging in similar proportion (except in one case for VI where there is more gradient vanishing after averaging). A gradient vanishes if its L$2$ norm is lower than $10^{-8}$, the numerical tolerance of the Adversarial Robustness Toolbox library. Gradients are computed on 10~000 original test examples. Means and standard deviations over the 15 models are reported when not ensembled.}
\begin{tabular}{lll|R{7em}|R{9em}}
\toprule
\bfseries Dataset                   & \bfseries Architecture                  & \bfseries Surrogate    & \bfseries Vanished individual model gradients & \bfseries Vanished ensemble gradients (averaging) \\
\midrule
\multirow{3}{*}{ImageNet} & \multirow{3}{*}{ResNet50}     & cSGLD (ours) & 0.06 \tiny ±0.06               & 0.00        \\  
 &                           & VI           & 0.15 \tiny ±0.02  & 0.05  \\ 
 &                           & DNN          & 0.11 \tiny ±0.03  & 0.00  \\ \hline
\multirow{3}{*}{CIFAR-10} & \multirow{3}{*}{PreResNet110} & cSGLD (ours) & 3.04 \tiny ±0.72               & 2.52        \\  
 &                           & VI           & 2.79 \tiny ±0.11  & 2.08  \\ 
 &                           & DNN          & 59.15 \tiny ±0.73 & 63.96 \\ \hline
\multirow{9}{*}{MNIST}    & \multirow{2}{*}{CNN}          & cSGLD (ours) & 30.94 \tiny ±2.00              & 31.67       \\ 
 &                           & DNN          & 91.53 \tiny ±2.36 & 94.20 \\ \cline{2-5} 
 & \multirow{3}{*}{FC}       & cSGLD (ours) & 11.16 \tiny ±0.51 & 11.31 \\ 
 &                           & VI           & 84.73 \tiny ±1.95 & 91.72 \\ 
 &                           & DNN          & 90.60 \tiny ±1.71 & 92.14 \\ \cline{2-5} 
 & \multirow{4}{*}{Small FC} & cSGLD (ours) & 4.63 \tiny ±0.48  & 4.71  \\
 &                           & VI           & 60.61 \tiny ±4.61 & 82.00 \\
 &                           & HMC          & 85.61 \tiny ±0.02 & 85.62 \\
 &                           & DNN          & 77.56 \tiny ±2.84 & 79.88 \\
\bottomrule
\end{tabular}
\label{tab:zero-grads}
\end{table*}


\clearpage

\section{Intra-architecture transferability}
\label{sec:intra-arch-appendix}


% Iterative attacks on 1 model per iteration + FGSM on all models
\begin{table*}[!htb]
\centering
\caption{Intra-architecture transfer success rates of four attacks on PreResNet110 (CIFAR-10) and ResNet50 (ImageNet), in \%. Bold is best. Higher is better.}
%\resizebox{.95\columnwidth}{!}{
\begin{tabular}{lll|R{5em}R{5em}|R{5em}R{6em}}
\toprule
\bfseries Dataset & \bfseries Attack & \bfseries Surrogate &  \bfseries L2 Attack &   \bfseries  L$\infty$ Attack &  \bfseries Nb training epochs & \bfseries  Nb backward passes \\
\midrule
\multirow{20}{*}{ImageNet} & \multirow{5}{*}{I-FGSM} & cSGLD &  94.41 \tiny ±0.46 &  90.77 \tiny ±0.09 &                 225 &                  50 \\
         &        & 1 DNN &  64.95 \tiny ±0.54 &  57.79 \tiny ±0.17 &                 130 &                  50 \\
         &        & 2 DNNs &  80.39 \tiny ±0.83 &  74.25 \tiny ±0.71 &                 260 &                  50 \\
         &        & 5 DNNs &  94.53 \tiny ±0.43 &  92.81 \tiny ±0.45 &                 650 &                  50 \\
         &        & 15 DNNs &  \textbf{98.51 \tiny ±0.11} &  \textbf{98.28 \tiny ±0.16} &                1950 &                  50 \\
\cline{2-7}
         & \multirow{5}{*}{MI-FGSM} & cSGLD &  93.42 \tiny ±0.73 &  93.61 \tiny ±0.41 &                 225 &                  50 \\
         &        & 1 DNN &  61.11 \tiny ±0.35 &  63.70 \tiny ±0.21 &                 130 &                  50 \\
         &        & 2 DNNs &  77.93 \tiny ±0.44 &  79.27 \tiny ±0.76 &                 260 &                  50 \\
         &        & 5 DNNs &  94.41 \tiny ±0.47 &  95.32 \tiny ±0.25 &                 650 &                  50 \\
         &        & 15 DNNs &  \textbf{98.89 \tiny ±0.13} &  \textbf{99.19 \tiny ±0.13} &                1950 &                  50 \\
\cline{2-7}
         & \multirow{5}{*}{PGD \footnotesize{(5 restarts)}} & cSGLD &  91.81 \tiny ±0.38 &  88.76 \tiny ±0.24 &                 225 &                 250 \\
         &        & 1 DNN &  57.47 \tiny ±0.52 &  53.79 \tiny ±0.45 &                 130 &                 250 \\
         &        & 2 DNNs &  74.04 \tiny ±0.47 &  70.90 \tiny ±0.41 &                 260 &                 250 \\
         &        & 5 DNNs &  91.99 \tiny ±0.41 &  91.27 \tiny ±0.59 &                 650 &                 250 \\
         &        & 15 DNNs &  \textbf{97.83 \tiny ±0.20} &  \textbf{97.65 \tiny ±0.21} &                1950 &                 250 \\
\cline{2-7}
         & \multirow{5}{*}{FGSM} & cSGLD &  58.91 \tiny ±0.11 &  67.17 \tiny ±0.26 &                 225 &                  15 \\
         &        & 1 DNN &  37.37 \tiny ±0.19 &  44.55 \tiny ±0.72 &                 130 &                   1 \\
         &        & 2 DNNs &  46.73 \tiny ±0.34 &  53.91 \tiny ±0.60 &                 260 &                   2 \\
         &        & 5 DNNs &  58.17 \tiny ±0.18 &  65.53 \tiny ±0.10 &                 650 &                   5 \\
         &        & 15 DNNs &  \textbf{68.48 \tiny ±0.52} &  \textbf{76.57 \tiny ±0.62} &                1950 &                  15 \\
\cline{1-7}
\multirow{20}{*}{CIFAR-10} & \multirow{5}{*}{I-FGSM} & cSGLD &         \textbf{92.38 \tiny ±0.23} &         92.74 \tiny ±0.33 &                 300 &                  50 \\
         &        & 1 DNN &         43.17 \tiny ±0.97 &         77.59 \tiny ±0.01 &                 300 &                  50 \\
         &        & 2 DNNs &         52.08 \tiny ±1.03 &         84.75 \tiny ±0.20 &                 600 &                  50 \\
         &        & 5 DNNs &         58.74 \tiny ±0.98 &         94.81 \tiny ±0.17 &                1500 &                  50 \\
         &        & 15 DNNs &         62.08 \tiny ±0.92 &         \textbf{97.83 \tiny ±0.03} &                4500 &                  50 \\
\cline{2-7}
         & \multirow{5}{*}{MI-FGSM} & cSGLD &         92.29 \tiny ±0.25 &         94.20 \tiny ±0.14 &                 300 &                  50 \\
         &        & 1 DNN &         72.34 \tiny ±0.23 &         80.43 \tiny ±0.04 &                 300 &                  50 \\
         &        & 2 DNNs &         84.10 \tiny ±0.33 &         90.70 \tiny ±0.07 &                 600 &                  50 \\
         &        & 5 DNNs &         91.66 \tiny ±0.26 &         97.04 \tiny ±0.07 &                1500 &                  50 \\
         &        & 15 DNNs &         \textbf{93.87 \tiny ±0.30} &         \textbf{98.30 \tiny ±0.11} &                4500 &                  50 \\
\cline{2-7}
         & \multirow{5}{*}{PGD \footnotesize{(5 restarts)}} & cSGLD &         \textbf{91.65 \tiny ±0.33} &         92.10 \tiny ±0.25 &                 300 &                 250 \\
         &        & 1 DNN &         51.08 \tiny ±0.10 &         77.58 \tiny ±0.38 &                 300 &                 250 \\
         &        & 2 DNNs &         60.60 \tiny ±0.06 &         83.67 \tiny ±0.27 &                 600 &                 250 \\
         &        & 5 DNNs &         67.55 \tiny ±0.21 &         94.19 \tiny ±0.07 &                1500 &                 250 \\
         &        & 15 DNNs &         70.42 \tiny ±0.23 &         \textbf{97.37 \tiny ±0.06} &                4500 &                 250 \\
\cline{2-7}
         & \multirow{5}{*}{FGSM} & cSGLD &         \textbf{43.13 \tiny ±0.00} &         58.85 \tiny ±0.01 &                 300 &                  30 \\
         &        & 1 DNN &         20.92 \tiny ±0.00 &         38.89 \tiny ±0.01 &                 300 &                   1 \\
         &        & 2 DNNs &         23.75 \tiny ±0.00 &         45.83 \tiny ±0.01 &                 600 &                   2 \\
         &        & 5 DNNs &         25.60 \tiny ±0.00 &         54.62 \tiny ±0.01 &                1500 &                   5 \\
         &        & 15 DNNs &         26.71 \tiny ±0.00 &         \textbf{61.81 \tiny ±0.00} &                4500 &                  15 \\
\bottomrule
\end{tabular}
%}
\label{tab:transfer-same-arch}
\end{table*}



\begin{table*}[!htb]
\centering
\caption{Intra-architecture transfer success rates of four attacks on the FC architecture (MNIST), in \%. Bold is best. Higher is better.}
%\resizebox{.95\columnwidth}{!}{
\begin{tabular}{lll|R{5em}R{5em}|R{5em}R{6em}}
\toprule
\bfseries Dataset & \bfseries Attack & \bfseries Surrogate &  \bfseries L2 Attack &   \bfseries  L$\infty$ Attack &  \bfseries Nb training epochs & \bfseries  Nb backward passes \\
\midrule
\multirow{20}{*}{MNIST} & \multirow{5}{*}{I-FGSM} & cSGLD &  \textbf{97.65\% \tiny ±0.02} &  41.49\% \tiny ±0.02 &                  50 &                  50 \\
      &        & 1 DNN &  17.17\% \tiny ±0.00 &  34.53\% \tiny ±0.00 &                  50 &                  50 \\
      &        & 2 DNNs &  18.52\% \tiny ±0.01 &  36.44\% \tiny ±0.01 &                 100 &                  50 \\
      &        & 5 DNNs &  26.21\% \tiny ±0.10 &  43.12\% \tiny ±0.16 &                 250 &                  50 \\
      &        & 15 DNNs &  26.46\% \tiny ±0.19 &  \textbf{45.22\% \tiny ±0.27} &                 750 &                  50 \\
\cline{2-7}
      & \multirow{5}{*}{MI-FGSM} & cSGLD &  \textbf{97.62\% \tiny ±0.05} &  42.07\% \tiny ±0.09 &                  50 &                  50 \\
      &        & 1 DNN &  80.72\% \tiny ±0.00 &  34.52\% \tiny ±0.00 &                  50 &                  50 \\
      &        & 2 DNNs &  82.63\% \tiny ±0.05 &  39.83\% \tiny ±0.06 &                 100 &                  50 \\
      &        & 5 DNNs &  91.83\% \tiny ±0.12 &  44.74\% \tiny ±0.23 &                 250 &                  50 \\
      &        & 15 DNNs &  92.08\% \tiny ±0.09 &  \textbf{46.99\% \tiny ±0.37} &                 750 &                  50 \\
\cline{2-7}
      & \multirow{5}{*}{PGD (5 restarts)} & cSGLD &  \textbf{97.78\% \tiny ±0.04} &  41.64\% \tiny ±0.18 &                  50 &                 250 \\
      &        & 1 DNN &  31.99\% \tiny ±0.08 &  34.80\% \tiny ±0.07 &                  50 &                 250 \\
      &        & 2 DNNs &  33.61\% \tiny ±0.07 &  37.26\% \tiny ±0.17 &                 100 &                 250 \\
      &        & 5 DNNs &  43.27\% \tiny ±0.37 &  43.61\% \tiny ±0.29 &                 250 &                 250 \\
      &        & 15 DNNs &  44.56\% \tiny ±0.29 &  \textbf{45.50\% \tiny ±0.29} &                 750 &                 250 \\
\cline{2-7}
      & \multirow{5}{*}{FGSM} & cSGLD &  \textbf{75.09\% \tiny ±0.00} &  \textbf{34.90\% \tiny ±0.00} &                  50 &                  20 \\
      &        & 1 DNN &   8.62\% \tiny ±0.00 &  22.52\% \tiny ±0.00 &                  50 &                   1 \\
      &        & 2 DNNs &   7.42\% \tiny ±0.00 &  25.76\% \tiny ±0.00 &                 100 &                   2 \\
      &        & 5 DNNs &   7.95\% \tiny ±0.00 &  29.52\% \tiny ±0.00 &                 250 &                   5 \\
      &        & 15 DNNs &   7.52\% \tiny ±0.00 &  31.08\% \tiny ±0.00 &                 750 &                  15 \\
\bottomrule
\end{tabular}
%}
\label{tab:transfer-same-arch-mnist}
\end{table*}


\clearpage

\section{Inter-architecture transferability}
\label{sec:inter-arch-appendix}


%---- inter

\begin{table*}[!htb]
\centering
\caption{Inter-architecture transfer success rates of I-FGSM with single-architecture surrogates on ImageNet (in \%). All combinations of surrogate and target architectures are evaluated. Diagonal entries are intra-architecture. 1 DNN and cSGLD have a similar computational budget (135 epochs). Bold is best. Higher is better.}
%\resizebox{1.95\columnwidth}{!}{
\begin{tabular}{lC{6.2em}L{4.5em}rrrrr}
\toprule
\multicolumn{3}{c}{ } & \multicolumn{5}{c}{\bfseries Target Architecture} \\
\cmidrule(l{3pt}r{3pt}){4-8}
\rowcolor{white}
\bfseries Norm & \bfseries Surrogate Architecture & \bfseries Surrogate &    ResNet50 &     ResNeXt50 &   DenseNet121 &       MNASNet & EfficientNetB0 \\
\midrule
\multirow{10}{*}{L$2$} & \multirow{2}{*}{ResNet50} & cSGLD &    \textbf{84.93 \tiny ±0.59} &  \textbf{74.70 \tiny ±0.91} &  \textbf{71.32 \tiny ±0.63} &  \textbf{60.09 \tiny ±0.60} &    \textbf{39.70 \tiny ±0.29} \\
           &                                       & 1 DNN &    56.98 \tiny ±0.62 &  41.13 \tiny ±0.97 &  29.81 \tiny ±0.33 &  27.90 \tiny ±0.43 &    16.39 \tiny ±0.46 \\
\cline{2-8}
           & \multirow{2}{*}{ResNeXt50} & cSGLD &    \textbf{79.25 \tiny ±0.24} &  \textbf{77.34 \tiny ±0.39} &  \textbf{68.53 \tiny ±0.19} &  \textbf{62.16 \tiny ±0.19} &    \textbf{43.51 \tiny ±0.62} \\
           &                            & 1 DNN &    37.48 \tiny ±0.52 &  36.35 \tiny ±0.22 &  23.77 \tiny ±0.41 &  23.69 \tiny ±0.21 &    14.32 \tiny ±0.24 \\
\cline{2-8}
           & \multirow{2}{*}{DenseNet121} & cSGLD &    \textbf{63.23 \tiny ±1.16} &  \textbf{59.89 \tiny ±1.12} &  \textbf{73.28 \tiny ±0.45} &  \textbf{60.84 \tiny ±0.33} &    \textbf{40.27 \tiny ±0.44} \\
           &                            & 1 DNN &    32.61 \tiny ±0.29 &  32.06 \tiny ±0.61 &  39.18 \tiny ±0.47 &  32.01 \tiny ±0.44 &    17.72 \tiny ±0.49 \\
\cline{2-8}
           & \multirow{2}{*}{MNASNet} & cSGLD &     \textbf{7.81 \tiny ±0.19} &   \textbf{5.97 \tiny ±0.37} &   \textbf{9.81 \tiny ±0.31} &  30.41 \tiny ±1.45 &    \textbf{15.46 \tiny ±0.44} \\
           &                          & 1 DNN &     7.04 \tiny ±0.51 &   5.29 \tiny ±0.36 &   8.41 \tiny ±0.20 &  \textbf{32.65 \tiny ±0.22} &    13.13 \tiny ±0.06 \\
\cline{2-8}
           & \multirow{2}{*}{EfficientNetB0} & cSGLD &    \textbf{18.93 \tiny ±2.17} &  \textbf{14.16 \tiny ±1.69} &  \textbf{19.89 \tiny ±1.21} &  \textbf{65.97 \tiny ±3.60} &    \textbf{49.41 \tiny ±3.64} \\
           &                                  & 1 DNN &    15.15 \tiny ±0.30 &  13.33 \tiny ±0.33 &  16.12 \tiny ±0.71 &  58.73 \tiny ±0.25 &    48.85 \tiny ±0.56 \\
\cline{1-8}
\multirow{10}{*}{L$\infty$} & \multirow{2}{*}{ResNet50} & cSGLD &    \textbf{78.67 \tiny ±1.19} &  \textbf{65.21 \tiny ±1.48} &  \textbf{61.54 \tiny ±0.83} &  \textbf{51.75 \tiny ±1.39} &    \textbf{31.11 \tiny ±1.13} \\
           &                                            & 1 DNN &    48.03 \tiny ±0.94 &  32.17 \tiny ±0.43 &  23.37 \tiny ±0.34 &  22.60 \tiny ±0.40 &    12.59 \tiny ±0.21 \\
\cline{2-8}
           & \multirow{2}{*}{ResNeXt50} & cSGLD &    \textbf{71.67 \tiny ±1.00} &  \textbf{69.33 \tiny ±0.85} &  \textbf{59.18 \tiny ±1.14} &  \textbf{54.75 \tiny ±1.33} &    \textbf{35.13 \tiny ±0.71} \\
           &                            & 1 DNN &    31.19 \tiny ±0.42 &  28.68 \tiny ±0.76 &  19.12 \tiny ±0.07 &  19.53 \tiny ±0.51 &    11.20 \tiny ±0.33 \\
\cline{2-8}
           & \multirow{2}{*}{DenseNet121} & cSGLD &    \textbf{54.13 \tiny ±1.70} &  \textbf{50.66 \tiny ±1.62} &  \textbf{65.80 \tiny ±0.66} &  \textbf{53.43 \tiny ±1.30} &    \textbf{32.49 \tiny ±0.36} \\
           &                              & 1 DNN &    25.49 \tiny ±0.81 &  23.73 \tiny ±0.59 &  30.78 \tiny ±0.21 &  26.05 \tiny ±0.66 &    13.41 \tiny ±0.20 \\
\cline{2-8}
           & \multirow{2}{*}{MNASNet} & cSGLD &     \textbf{6.77 \tiny ±0.29} &   4.72 \tiny ±0.27 &   \textbf{8.26 \tiny ±0.36} &  25.27 \tiny ±1.83 &    \textbf{12.21 \tiny ±0.84} \\
           &                          & 1 DNN &     6.52 \tiny ±0.23 &   \textbf{5.06 \tiny ±0.12} &   7.83 \tiny ±0.13 &  \textbf{29.19 \tiny ±0.05} &    11.13 \tiny ±0.16 \\
\cline{2-8}
           & \multirow{2}{*}{EfficientNetB0} & cSGLD &    \textbf{17.81 \tiny ±1.58} &  \textbf{13.91 \tiny ±1.45} &  \textbf{19.71 \tiny ±1.29} &  \textbf{63.67 \tiny ±3.16} &    46.91 \tiny ±3.44 \\
           &                                  & 1 DNN &    15.83 \tiny ±0.32 &  13.51 \tiny ±0.52 &  16.78 \tiny ±0.38 &  60.14 \tiny ±0.37 &    \textbf{50.16 \tiny ±0.64} \\
\bottomrule
\end{tabular}
%}
\label{tab:transfer-inter-combinaison-imagenet}
\end{table*}


\begin{table*}[!htb]
\centering
\caption{Inter-architecture transfer success rates of I-FGSM with single-architecture surrogates on CIFAR-10 (in \%). All combinations of surrogate and target architectures are evaluated. Diagonal entries are intra-architecture. The symbol $\star$ marks cases where 1 DNN has higher transferability than cSGLD. 1 DNN and cSGLD have a similar computational budget (300 epochs). Bold is best. Higher is better.}
%\resizebox{1.95\columnwidth}{!}{
\begin{tabular}{lC{6em}L{4.5em}rrrrr}
\toprule
\multicolumn{3}{c}{ } & \multicolumn{5}{c}{\bfseries Target Architecture} \\
\cmidrule(l{3pt}r{3pt}){4-8}
\bfseries Norm & \bfseries Surrogate Architecture & \bfseries Surrogate &    PreResNet110 &  PreResNet164 &       VGG16bn &       VGG19bn &    WideResNet \\
\midrule
\multirow{15}{*}{L$2$} & \multirow{3}{*}{PreResNet110} & cSGLD &    \textbf{88.96 \tiny ±0.02} &  \textbf{88.57 \tiny ±0.00} &  26.18 \tiny ±0.02 &  24.38 \tiny ±0.00 &  \textbf{63.35 \tiny ±0.01} \\
           &                                           & 1 DNN &    34.42 \tiny ±0.00 &  34.39 \tiny ±0.01 &  12.66 \tiny ±0.01 &  12.54 \tiny ±0.00 &  26.29 \tiny ±0.00 \\
           &                                           & 4 DNNs &    50.50 \tiny ±0.00 &  50.49 \tiny ±0.00 &  \textbf{27.45 \tiny ±0.01} &  \textbf{27.30 \tiny ±0.00} &  46.10 \tiny ±0.00 \\
\cline{2-8}
           & \multirow{3}{*}{PreResNet164}  & cSGLD &    \textbf{88.28 \tiny ±0.01} &  \textbf{87.52 \tiny ±0.01} &  25.83 \tiny ±0.01 &  23.64 \tiny ±0.01 &  \textbf{62.79 \tiny ±0.01} \\
           &                                & 1 DNN &    33.89 \tiny ±0.00 &  34.36 \tiny ±0.01 &  11.93 \tiny ±0.00 &  12.07 \tiny ±0.01 &  25.95 \tiny ±0.01 \\
           &                                & 4 DNNs &    50.36 \tiny ±0.01 &  50.45 \tiny ±0.00 &  \textbf{26.79 \tiny ±0.01} &  \textbf{27.13 \tiny ±0.00} &  45.94 \tiny ±0.00 \\
\cline{2-8}
           & \multirow{3}{*}{VGG16bn}   & cSGLD &    \textbf{69.22 \tiny ±0.06} &  \textbf{69.03 \tiny ±0.03} &  43.70 \tiny ±0.04 &  38.54 \tiny ±0.02 &  \textbf{55.62 \tiny ±0.07} \\
           &                            & 1 DNN &    27.22 \tiny ±0.04 &  27.23 \tiny ±0.05 &  29.28 \tiny ±0.08 &  28.73 \tiny ±0.02 &  22.22 \tiny ±0.00 \\
           &                            & 4 DNNs &    55.14 \tiny ±0.06 &  54.96 \tiny ±0.04 &  \textbf{73.65 \tiny ±0.00} &  \textbf{71.24 \tiny ±0.04} &  44.89 \tiny ±0.09 \\
\cline{2-8}
           & \multirow{3}{*}{VGG19bn}   & cSGLD &    \textbf{69.82 \tiny ±0.05} &  \textbf{68.27 \tiny ±0.07} &  44.59 \tiny ±0.10 &  39.76 \tiny ±0.13 &  \textbf{54.40 \tiny ±0.08} \\
           &                            & 1 DNN &    18.09 \tiny ±0.10 &  18.09 \tiny ±0.06 &  $\star$44.63 \tiny ±0.03 &  $\star$46.76 \tiny ±0.03 &  14.38 \tiny ±0.03 \\
           &                            & 4 DNNs &    34.30 \tiny ±0.06 &  33.77 \tiny ±0.01 &  \textbf{66.20 \tiny ±0.03} &  \textbf{68.87 \tiny ±0.05} &  27.44 \tiny ±0.02 \\
\cline{2-8}
           & \multirow{3}{*}{WideResNet}    & cSGLD &    \textbf{82.25 \tiny ±0.03} &  \textbf{85.06 \tiny ±0.02} &  \textbf{26.34 \tiny ±0.08} &  \textbf{23.81 \tiny ±0.03} &  \textbf{69.31 \tiny ±0.07} \\
           &                                & 1 DNN &    22.14 \tiny ±0.01 &  23.00 \tiny ±0.00 &   9.43 \tiny ±0.00 &   9.54 \tiny ±0.00 &  26.85 \tiny ±0.00 \\
           &                                & 4 DNNs &    41.07 \tiny ±0.00 &  41.75 \tiny ±0.04 &  22.91 \tiny ±0.04 &  22.65 \tiny ±0.03 &  43.00 \tiny ±0.01 \\
\cline{1-8}
\multirow{15}{*}{L$\infty$} & \multirow{3}{*}{PreResNet110} & cSGLD &    88.70 \tiny ±0.00 &  88.48 \tiny ±0.01 &  26.32 \tiny ±0.00 &  24.27 \tiny ±0.01 &  62.95 \tiny ±0.01 \\
           &                                                & 1 DNN &    72.73 \tiny ±0.00 &  74.57 \tiny ±0.00 &  22.26 \tiny ±0.00 &  20.98 \tiny ±0.00 &  47.59 \tiny ±0.01 \\
           &                                                & 4 DNNs &    \textbf{91.98 \tiny ±0.00} &  \textbf{92.25 \tiny ±0.00} &  \textbf{38.24 \tiny ±0.00} &  \textbf{35.56 \tiny ±0.00} &  \textbf{72.64 \tiny ±0.01} \\
\cline{2-8}
           & \multirow{3}{*}{PreResNet164}  & cSGLD  &    87.99 \tiny ±0.01 &  87.74 \tiny ±0.00 &  26.33 \tiny ±0.00 &  23.67 \tiny ±0.01 &  61.83 \tiny ±0.02 \\
           &                                & 1 DNN  &    68.97 \tiny ±0.01 &  71.76 \tiny ±0.00 &  20.29 \tiny ±0.00 &  18.86 \tiny ±0.00 &  45.07 \tiny ±0.00 \\
           &                                & 4 DNNs &    \textbf{90.67 \tiny ±0.00} &  \textbf{92.22 \tiny ±0.00} &  \textbf{37.62 \tiny ±0.00} &  \textbf{35.23 \tiny ±0.00} &  \textbf{73.18 \tiny ±0.00} \\
\cline{2-8}
           & \multirow{3}{*}{VGG16bn}   & cSGLD &    \textbf{66.97 \tiny ±0.13} &  \textbf{67.48 \tiny ±0.11} &  42.91 \tiny ±0.05 &  37.91 \tiny ±0.02 &  \textbf{50.52 \tiny ±0.01} \\
           &                            & 1 DNN &    35.57 \tiny ±0.02 &  35.89 \tiny ±0.03 &  38.35 \tiny ±0.00 &  35.82 \tiny ±0.00 &  26.77 \tiny ±0.02 \\
           &                            & 4 DNNs &    52.59 \tiny ±0.00 &  53.12 \tiny ±0.00 &  \textbf{70.89 \tiny ±0.00} &  \textbf{68.53 \tiny ±0.00} &  41.34 \tiny ±0.00 \\
\cline{2-8}
           & \multirow{3}{*}{VGG19bn}   & cSGLD &    \textbf{67.11 \tiny ±0.00} &  \textbf{66.55 \tiny ±0.02} &  43.50 \tiny ±0.01 &  38.72 \tiny ±0.02 &  \textbf{49.69 \tiny ±0.02} \\
           &                            & 1 DNN &    20.50 \tiny ±0.02 &  20.97 \tiny ±0.00 &  $\star$45.90 \tiny ±0.02 &  $\star$48.60 \tiny ±0.02 &  16.37 \tiny ±0.01 \\
           &                            & 4 DNNs &    32.43 \tiny ±0.06 &  32.25 \tiny ±0.04 &  \textbf{63.11 \tiny ±0.07} &  \textbf{65.64 \tiny ±0.06} &  25.34 \tiny ±0.02 \\
\cline{2-8}
           & \multirow{3}{*}{WideResNet}    & cSGLD &    \textbf{81.99 \tiny ±0.01} &  \textbf{85.63 \tiny ±0.01} &  27.04 \tiny ±0.02 &  23.46 \tiny ±0.01 &  68.43 \tiny ±0.01 \\
           &                                & 1 DNN &    49.24 \tiny ±0.16 &  52.84 \tiny ±0.03 &  20.23 \tiny ±0.04 &  18.53 \tiny ±0.02 &  60.84 \tiny ±0.09 \\
           &                                & 4 DNNs &    77.45 \tiny ±0.01 &  79.55 \tiny ±0.13 &  \textbf{36.33 \tiny ±0.13} &  \textbf{33.60 \tiny ±0.22} &  \textbf{83.24 \tiny ±0.00} \\
\bottomrule
\end{tabular}
%}
\label{tab:transfer-inter-combinaison-cifar}
\end{table*}


\begin{table*}[!htb]
\centering
\caption{Inter-architecture transfer success rates of I-FGSM with single-architecture surrogates on MNIST (in \%). All combinations of surrogate and target architectures are evaluated. Diagonal entries are intra-architecture. cSGLD always has higher transferability than 1 DNN. The symbol $\star$ marks Bayesian methods (SVI or HMC) with lower transferability than 1 DNN. 1 DNN and cSGLD have a similar computational budget (50 epochs). Bold is best. Higher is better.}
\begin{tabular}{lC{6em}L{5em}rrr}
\toprule
\multicolumn{3}{c}{ } & \multicolumn{3}{c}{\bfseries Target Architecture} \\
\cmidrule(l{3pt}r{3pt}){4-6}
 \bfseries Norm & \bfseries Surrogate Architecture & \bfseries Surrogate Method    & Small FC &  FC & CNN  \\
\midrule
\multirow{18}{*}{L2} &
  \multirow{7}{*}{Small FC} &
  cSGLD &
  \textbf{99.17 \tiny ±0.01} &
  \textbf{97.15 \tiny ±0.05} &
  \textbf{46.04 \tiny ±0.15} \\ 
 &                      & HMC     & $\star$2.66 \tiny ±0.01   & $\star$2.04 \tiny ±0.01   & $\star$0.37 \tiny ±0.01           \\  
 &                      & SVI     & $\star$5.67 \tiny ±0.09   & $\star$4.04 \tiny ±0.09  & $\star$0.62 \tiny ±0.02           \\  
 &                      & 1 DNN   & 44.19 \tiny ±0.00          & 43.98 \tiny ±0.00          & 19.35 \tiny ±0.00          \\  
 &                      & 5 DNNs  & 48.01 \tiny ±0.01          & 47.78 \tiny ±0.04          & 24.76 \tiny ±0.02          \\  
 &                      & 10 DNNs & 52.36 \tiny ±0.09          & 51.97 \tiny ±0.11          & 26.52 \tiny ±0.05          \\  
 &                      & 15 DNNs & 53.13 \tiny ±0.09          & 52.84 \tiny ±0.08          & 27.05 \tiny ±0.12          \\ \cline{2-6} 
 & \multirow{6}{*}{FC}  & cSGLD   & \textbf{98.61 \tiny ±0.00} & \textbf{97.36 \tiny ±0.03} & \textbf{49.27 \tiny ±0.17} \\  
 &                      & SVI     & 17.16 \tiny ±0.17          & 15.47 \tiny ±0.17          & $\star$4.85 \tiny ±0.06           \\  
 &                      & 1 DNN   & 15.37 \tiny ±0.00          & 15.32 \tiny ±0.00          & 10.40 \tiny ±0.00          \\  
 &                      & 5 DNNs  & 23.13 \tiny ±0.06          & 23.07 \tiny ±0.08          & 16.03 \tiny ±0.06          \\  
 &                      & 10 DNNs & 24.55 \tiny ±0.14          & 24.46 \tiny ±0.13          & 16.96 \tiny ±0.21          \\  
 &                      & 15 DNNs & 23.46 \tiny ±0.13          & 23.44 \tiny ±0.12          & 16.44 \tiny ±0.21          \\ \cline{2-6} 
 & \multirow{5}{*}{CNN} & cSGLD   & \textbf{46.86 \tiny ±0.27} & \textbf{47.06 \tiny ±0.32} & \textbf{92.57 \tiny ±0.14} \\  
 &                      & 1 DNN   & 10.73 \tiny ±0.00          & 10.43 \tiny ±0.00          & 14.80 \tiny ±0.00          \\  
 &                      & 5 DNNs  & 22.20 \tiny ±0.09          & 22.22 \tiny ±0.05          & 28.69 \tiny ±0.03          \\  
 &                      & 10 DNNs & 19.18 \tiny ±0.23          & 19.27 \tiny ±0.34          & 23.84 \tiny ±0.40          \\  
 &                      & 15 DNNs & 19.71 \tiny ±0.22          & 19.83 \tiny ±0.22          & 24.33 \tiny ±0.26          \\ \hline
\multirow{18}{*}{L$\infty$} &
  \multirow{7}{*}{Small FC} &
  cSGLD &
  61.75 \tiny ±0.25 &
  37.66 \tiny ±0.25 &
  \textbf{1.25 \tiny ±0.01} \\  
 &                      & HMC     & $\star$1.24 \tiny ±0.01           & $\star$0.91 \tiny ±0.03           & $\star$0.10 \tiny ±0.01           \\  
 &                      & SVI     & $\star$1.76 \tiny ±0.02           & $\star$1.25 \tiny ±0.01           & $\star$0.16 \tiny ±0.03           \\  
 &                      & 1 DNN   & 58.77 \tiny ±0.00          & 32.15 \tiny ±0.00          & 0.95 \tiny ±0.00           \\  
 &                      & 5 DNNs  & 66.81 \tiny ±0.02          & 37.40 \tiny ±0.04          & 1.04 \tiny ±0.01           \\  
 &                      & 10 DNNs & 67.88 \tiny ±0.18          & 38.22 \tiny ±0.02          & 1.02 \tiny ±0.02           \\  
 &                      & 15 DNNs & \textbf{68.07 \tiny ±0.13} & \textbf{38.35 \tiny ±0.08} & 1.04 \tiny ±0.03           \\ \cline{2-6} 
 & \multirow{6}{*}{FC}  & cSGLD   & \textbf{60.06 \tiny ±0.01} & 41.04 \tiny ±0.02          & \textbf{1.33 \tiny ±0.01}  \\  
 &                      & SVI     & $\star$4.29 \tiny ±0.02           & $\star$3.18 \tiny ±0.05           & $\star$0.30 \tiny ±0.01           \\  
 &                      & 1 DNN   & 40.15 \tiny ±0.00          & 34.01 \tiny ±0.00          & 1.11 \tiny ±0.00           \\  
 &                      & 5 DNNs  & 51.62 \tiny ±0.05          & 42.66 \tiny ±0.17          & 1.25 \tiny ±0.02           \\  
 &                      & 10 DNNs & 54.05 \tiny ±0.52          & 44.44 \tiny ±0.15          & 1.26 \tiny ±0.02           \\  
 &                      & 15 DNNs & 55.03 \tiny ±0.45          & \textbf{44.78 \tiny ±0.27} & 1.27 \tiny ±0.01           \\ \cline{2-6} 
 & \multirow{5}{*}{CNN} & cSGLD   & 3.07 \tiny ±0.08           & 2.89 \tiny ±0.04           & 5.42 \tiny ±0.03           \\  
 &                      & 1 DNN   & 2.40 \tiny ±0.00           & 2.30 \tiny ±0.00           & 3.83 \tiny ±0.00           \\  
 &                      & 5 DNNs  & 3.50 \tiny ±0.03           & 3.09 \tiny ±0.06           & 6.05 \tiny ±0.04           \\  
 &                      & 10 DNNs & 3.79 \tiny ±0.04           & \textbf{3.39 \tiny ±0.01}  & 6.37 \tiny ±0.03           \\  
 &                      & 15 DNNs & \textbf{3.81 \tiny ±0.09}  & 3.37 \tiny ±0.04           & \textbf{6.55 \tiny ±0.05}  \\
 \bottomrule
\end{tabular}
\label{tab:transfer-inter-combinaison-mnist}
\end{table*}


\clearpage

\section{Test-time transferability techniques}
\label{sec:test-techs-appendix}


%---- test-time techs


\begin{table*}[!htb]
\centering
\caption{Transfer success rates of the (M)I-FGSM attack improved by our approach combined with test-time transformations on CIFAR-10 (in \%). Columns are targets. PreResNet110 columns are intra-architecture transferability, others are inter-architecture. Bold is best. The symbol $\star$ marks DNN-based techniques that outperform our vanilla cSGLD surrogate, and $\dagger$ marks techniques that do not improve on the corresponding vanilla surrogate. The success rate for every cSGLD-based technique is better than its counterpart with 1 DNN.}
%\resizebox{1.95\columnwidth}{!}{
\begin{tabular}{llrrrrr}
\toprule
\multicolumn{2}{c}{ } & \multicolumn{5}{c}{\bfseries Target Architecture} \\
\cmidrule(l{3pt}r{3pt}){3-7}
\bfseries Norm                      & \bfseries Surrogate                        & PreResNet110           & PreResNet164           & VGG16bn                & VGG19bn                & WideResNet             \\
\midrule
\multirow{16}{*}{L$2$}      & 1 DNN                            & 34.42 \tiny ±0.00          & 34.39 \tiny ±0.01          & 12.67 \tiny ±0.00          & 12.54 \tiny ±0.00          & 26.29 \tiny ±0.01          \\
                            & \quad+ Input Diversity           & 59.63 \tiny ±0.80          & 59.79 \tiny ±0.75          & 24.37 \tiny ±0.16          & 23.25 \tiny ±0.12          & 46.09 \tiny ±0.47          \\
                            & \quad+ Skip Gradient Method      & 57.00 \tiny ±0.00          & 57.66 \tiny ±0.04          & 20.87 \tiny ±0.03          & 20.10 \tiny ±0.09          & 41.80 \tiny ±0.04          \\
                            & \quad+ Ghost Networks            & 79.22 \tiny ±0.30          & 80.38 \tiny ±0.16          & $\star$32.03 \tiny ±0.25   & $\star$28.63 \tiny ±0.17   & 56.65 \tiny ±0.24          \\
                            & \quad+ Momentum                  & 67.12 \tiny ±0.07          & 67.80 \tiny ±0.00          & 20.49 \tiny ±0.02          & 19.15 \tiny ±0.01          & 44.11 \tiny ±0.04          \\
                            & \quad\quad+ Input Diversity      & 81.44 \tiny ±0.32          & 82.69 \tiny ±0.29          & 27.64 \tiny ±0.03          & 25.82 \tiny ±0.42          & 57.29 \tiny ±0.12          \\
                            & \quad\quad+ Skip Gradient Method & 73.52 \tiny ±0.00          & 75.23 \tiny ±0.01          & 24.52 \tiny ±0.00          & 22.76 \tiny ±0.00          & 49.73 \tiny ±0.00          \\
                            & \quad\quad+ Ghost Networks       & 77.44 \tiny ±0.28          & 79.13 \tiny ±0.12          & $\star$28.98 \tiny ±0.57   & 25.74 \tiny ±0.18          & 54.06 \tiny ±0.04          \\
                            \cline{2-7}
                            & cSGLD                            & 90.67 \tiny ±0.39          & 89.74 \tiny ±0.31          & 28.05 \tiny ±0.33          & 26.12 \tiny ±0.14          & 67.27 \tiny ±0.89          \\
                            & \quad+ Input Diversity           & 92.45 \tiny ±0.14          & 91.80 \tiny ±0.14          & 33.69 \tiny ±0.28          & 31.35 \tiny ±0.28          & 72.41 \tiny ±0.76          \\
                            & \quad+ Skip Gradient Method      & 92.46 \tiny ±0.17          & 92.10 \tiny ±0.28          & 31.96 \tiny ±0.53          & 29.84 \tiny ±0.34          & 71.04 \tiny ±1.23          \\
                            & \quad+ Ghost Networks            & \textbf{92.73 \tiny ±0.21} & \textbf{92.20 \tiny ±0.07} & \textbf{36.17 \tiny ±0.39} & \textbf{33.08 \tiny ±0.32} & \textbf{74.77 \tiny ±0.10} \\
                            & \quad+ Momentum                  & $\dagger$90.35 \tiny ±0.37 & 89.77 \tiny ±0.28          & $\dagger$26.89 \tiny ±0.37 & $\dagger$25.02 \tiny ±0.29 & $\dagger$65.98 \tiny ±0.52 \\
                            & \quad\quad+ Input Diversity      & 92.31 \tiny ±0.33          & 91.58 \tiny ±0.23          & 31.92 \tiny ±0.49          & 29.72 \tiny ±0.46          & 70.94 \tiny ±0.31          \\
                            & \quad\quad+ Skip Gradient Method & 92.33 \tiny ±0.34          & 91.94 \tiny ±0.41          & 31.95 \tiny ±0.29          & 29.85 \tiny ±0.28          & 70.96 \tiny ±0.65          \\
                            & \quad\quad+ Ghost Networks       & 92.42 \tiny ±0.16          & 91.93 \tiny ±0.25          & 33.02 \tiny ±0.60          & 29.77 \tiny ±0.14          & 72.28 \tiny ±0.53          \\
\hline
\multirow{16}{*}{L$\infty$} & 1 DNN                            & 72.73 \tiny ±0.00          & 74.58 \tiny ±0.01          & 22.26 \tiny ±0.00          & 20.98 \tiny ±0.00          & 47.59 \tiny ±0.01          \\
                            & \quad+ Input Diversity           & 81.29 \tiny ±0.18          & 82.77 \tiny ±0.12          & 28.10 \tiny ±0.22          & 26.17 \tiny ±0.25          & 57.04 \tiny ±0.10          \\
                            & \quad+ Skip Gradient Method      & 77.92 \tiny ±0.00          & 79.50 \tiny ±0.01          & 27.43 \tiny ±0.00          & 25.31 \tiny ±0.01          & 53.39 \tiny ±0.00          \\
                            & \quad+ Ghost Networks            & 74.92 \tiny ±0.08          & 77.23 \tiny ±0.26          & $\star$29.61 \tiny ±0.19   & 26.31 \tiny ±0.30          & 52.93 \tiny ±0.05          \\
                            & \quad+ Momentum                  & 76.12 \tiny ±0.01          & 78.05 \tiny ±0.00          & 23.77 \tiny ±0.02          & 22.33 \tiny ±0.01          & 50.49 \tiny ±0.01          \\
                            & \quad\quad+ Input Diversity      & 84.66 \tiny ±0.19          & 86.38 \tiny ±0.12          & $\star$31.47 \tiny ±0.05   & $\star$28.89 \tiny ±0.31   & 61.60 \tiny ±0.16          \\
                            & \quad\quad+ Skip Gradient Method & 79.72 \tiny ±0.02          & 80.80 \tiny ±0.02          & 28.75 \tiny ±0.01          & 26.12 \tiny ±0.00          & 55.74 \tiny ±0.00          \\
                            & \quad\quad+ Ghost Networks       & 80.34 \tiny ±0.34          & 82.59 \tiny ±0.42          & $\star$34.17 \tiny ±0.48   & $\star$29.37 \tiny ±0.18   & 60.62 \tiny ±0.40          \\
                            \cline{2-7}
                            & cSGLD                            & 90.98 \tiny ±0.40          & 90.26 \tiny ±0.35          & 29.26 \tiny ±0.53          & 26.97 \tiny ±0.43          & 67.18 \tiny ±1.03          \\
                            & \quad+ Input Diversity           & 92.46 \tiny ±0.14          & 91.62 \tiny ±0.16          & 33.81 \tiny ±0.25          & 30.84 \tiny ±0.34          & 71.15 \tiny ±0.92          \\
                            & \quad+ Skip Gradient Method      & 93.38 \tiny ±0.50          & 92.84 \tiny ±0.25          & 35.68 \tiny ±0.61          & 32.43 \tiny ±0.52          & 73.55 \tiny ±1.08          \\
                            & \quad+ Ghost Networks            & 91.66 \tiny ±0.40          & 91.32 \tiny ±0.19          & 34.77 \tiny ±0.09          & 31.01 \tiny ±0.27          & 71.60 \tiny ±0.40          \\
                            & \quad+ Momentum                  & 92.84 \tiny ±0.18          & 92.18 \tiny ±0.28          & 32.03 \tiny ±0.49          & 28.53 \tiny ±0.38          & 71.56 \tiny ±0.25          \\
                            & \quad\quad+ Input Diversity      & 94.05 \tiny ±0.31          & 93.53 \tiny ±0.21          & 37.31 \tiny ±0.38          & 33.23 \tiny ±0.23          & 75.40 \tiny ±0.25          \\
                            & \quad\quad+ Skip Gradient Method & \textbf{94.64 \tiny ±0.26} & \textbf{94.29 \tiny ±0.31} & \textbf{38.08 \tiny ±0.27} & \textbf{34.28 \tiny ±0.17} & \textbf{76.62 \tiny ±0.50} \\
                            & \quad\quad+ Ghost Networks       & 93.76 \tiny ±0.14          & 93.75 \tiny ±0.13          & 38.01 \tiny ±0.44          & 33.15 \tiny ±0.36          & 76.23 \tiny ±0.29         \\
\bottomrule
\end{tabular}
%}
\label{tab:transfer-test-time-techs-cifar}
\end{table*}



\clearpage

\section{Attack and training hyperparameters}
\label{sec:hp-appendix}


\begin{figure*}[!htb]
\centering
\includegraphics[width=0.95\textwidth]{figure/nb_iters_all_attacks_ImageNet}
\caption{Transfer success rates on ImageNet of three iterative gradient-based attacks on the same architecture (ResNet-50) with respect to the number of iterations.}
\label{fig:nb-iters-imagenet}
\end{figure*}


\begin{figure*}[!htb]
\centering
\includegraphics[width=0.95\textwidth]{figure/nb_iters_all_attacks_CIFAR}
\caption{Transfer success rates on CIFAR-10 of three iterative gradient-based attacks on the same architecture (PreResNet110) with respect to the number of iterations.}
\label{fig:nb-iters-cifar}
\end{figure*}


\begin{figure}[!htb]
\centering
\includegraphics[width=0.45\columnwidth]{figure/IFSGM_transfer_vs_nb_samples_per_cycle}
\caption{Intra-architecture transfer success rate of I-FGSM with respect to the number of cSGLD samples per cycle. We train one PreResNet110 cSGLD on CIFAR-10 for every number of samples per cycle, from 1 to 10. Each additional sample per cycle increases the training cost by 1 epoch per cycle (starting at 48 epochs per cycle). A fixed number of 5 cSGLD cycles is used.}
\label{fig-nb-samples-per-cycle}
\end{figure}

\begin{figure}[!htb]
\centering
\includegraphics[width=0.45\columnwidth]{figure/IFSGM_transfer_vs_nb_cycles}
\caption{Intra-architecture transfer success rate of I-FGSM with respect to the number of cSGLD cycles on CIFAR-10 (PreResNet110).}
\label{fig:nb-cycles}
\end{figure}



\begin{figure*}[!htb]
\centering
\includegraphics[width=0.6\textwidth]{figure/HP_SGM_tuning_val.pdf}
\caption{Transfer success rates of the test-time transferability technique Skip Gradient Method for values of its hyperparameter $\gamma$ between $0$ and $1$ in steps of $0.1$. The surrogate is a PreResNet110 DNN trained on CIFAR-10 and evaluated on 1~independently trained DNN for every target architecture. The solid line represents the intra-architecture transferability, and the dotted lines the inter-architecture transferability. Adversarial examples are crafted from a validation set randomly sampled from the training set. The value $\gamma = 0.7$ is used in the rest of the paper for PreResNet110.}
\label{fig:hp-tuning-sgm}
\end{figure*}

% Train HPs
\begin{table*}[ht]
\centering
\caption{Hyperparameters used to train cSGLD or Deep Ensemble. The $\star$ symbols refer to the inter-architecture and test-time techniques sections, and $\star\star$ to the Bayesian and Ensemble training methods section. We do not include target DNNs on ImageNet, since they are pretrained models from PyTorch and timm.}
\begin{tabular}{lL{6.5em}|L{5em}L{5em}L{5em}|L{6.5em}L{6em}}
\toprule
\multicolumn{2}{c}{ } &  \multicolumn{3}{|c|}{\bfseries CIFAR-10}                              & \multicolumn{2}{c}{\bfseries ImageNet}           \\
\cmidrule(l{3pt}r{3pt}){3-7}
\bfseries Method      &   \bfseries Hyperparameter                    & \bfseries cSGLD                        & \bfseries DNN Surrogate & \bfseries DNN Target & \bfseries cSGLD              & \bfseries DNN Surrogate     \\
\midrule
\multirow{7}{*}{All}   & Number epochs             & 50 per cycle                 & 300           & 300        & 45 per cycle       & 130 \newline (135 for $\star$) \\
\cline{2-7}
                       & Initial learning rate & 0.5                          & 0.01          & 0.01       & 0.1                & 0.1               \\
\cline{2-7}
 &
  Learning rate schedule &
  Cosine Annealing &
  Step size decay \newline ($\times 0.1$ each 75 epochs) &
  Step size decay \newline ($\times 0.1$ each 75 epochs) &
  Cosine Annealing &
  Step size decay \newline ($\times 0.1$ each 30 epochs) \\
\cline{2-7}
                       & Optimizer             & cSGLD                        & Adam          & Adam       & cSGLD              & SGD               \\
\cline{2-7}
                       & Momentum              & 0                            & 0.9           & 0.9        & 0.9                & 0.9               \\
\cline{2-7}
                       & Weight decay          & 5e-4 \newline (3e-4 for PreResNet)  & 1e-4      & 1e-4   & 1e-4           & 1e-4          \\
\cline{2-7}
 &
  Batch-size &  64 &   128 &  128 &  256 for ResNet50, \newline 64 for others &  256 for ResNet50, \newline 64 for others \\
\hline
\multirow{4}{*}{cSGLD} & Sampling interval     & 1 sample per epoch           &        -       &    -        & 1 sample per epoch &            -       \\
\cline{2-7}
                       & Number of cycles      & 6\newline (18 for $\star\star$)                           &        -       &     -       & 5\newline (3 for $\star$, 6 for $\star\star$)      &        -           \\
\cline{2-7}
                       & Number of samples per cycle  & 5                            &       -        &      -      & 3                  &       -            \\
\cline{2-7}
                       & Number of epochs with noise  & 5                            &      -         &     -       & 3                  &      -            \\
\bottomrule
\end{tabular}
\label{tab:hps-train}
\end{table*}


% Attack HPs
\begin{table*}[!ht]
\centering
\caption{Hyperparameters of attacks and test-time transferability techniques.}
\begin{tabular}{L{7em}L{10em}|C{6em}C{6em}C{5em}}
\toprule
\bfseries Attack / Technique           & \bfseries Hyperparameter             & \bfseries ImageNet  & \bfseries CIFAR-10  & \bfseries MNIST \\
\midrule
\multirow{2}{*}{All attacks} & Perturbation $2$-norm $\varepsilon$      & 3  & 0.5   & 3    \\
                             & Perturbation $\infty$-norm $\varepsilon$ & $\frac{4}{255}$ & $\frac{4}{255}$ & 0.1 \\
\hline
\multirow{2}{*}{Iterative Attacks} & Step-size $\alpha$   & $\frac{\varepsilon}{10}$  & $\frac{\varepsilon}{10}$ & $\frac{\varepsilon}{10}$ \\
                             & Number of iterations       & 50       & 50       & 50       \\
\hline
MI-FGSM                    & Momentum term              & 0.9      & 0.9      & 0.9      \\
\hline
PGD                          & Number of random restarts  & 5        & 5        & 5        \\
\hline
Ghost Network                      & Skip connection erosion random range & {[}${1-0.22}$, ${1+0.22}${]} & {[}${1-0.22}$, ${1+0.22}${]} &  -  \\
\hline
\multirow{2}{*}{Input Diversity}   & Minimum resize ratio                 & 90\,\%               & 90\,\%       &  -        \\
                             & Probability transformation & 0.5      & 0.5   &   -   \\
\hline
Skip Gradient Method         & Residual Gradient Decay $\gamma$           & 0.2 (ResNet50)   & 0.7 (PreResNet110)     &  -  \\  
\bottomrule
\end{tabular}
\label{tab:hps-attack}
\end{table*}

% cite w/o bibliography in the end (full refs already in the main paper)
\newsavebox\mytempbib
\savebox\mytempbib{\parbox{\textwidth}{\bibliography{references}}}


\end{document}