\subsection{Derivation of the proximal operators}
{\normalsize
\subsubsection{Laplace Prior with Unknown Mean $\theta$}\label{ap:proximal_op_laplace_mean}

We recall that, for the Laplace prior, $g_2(\theta, x) = \sum_{i=1}^{d_x}|x_i-\theta|$, so that the proximal operator is given by
\begin{equation*}
    \prox_{g_2}^{\lambda}(\theta,x)=\argmin_{(u_0,u)} h(u_0, u)= \argmin_{(u_0,u)} \{g_2(u_0, u) + \Vert (u_0, u)-(\theta, x)\Vert^2/{(2\lambda)}\}.
\end{equation*}
The first order optimality condition is given by
\begin{equation*}
    0\in\partial g_2(u_0, u) + \nabla\big(\Vert(u_0, u)-(\theta, x)\Vert^2/(2\lambda)\big).
\end{equation*}

We recall that $\phi\in\real^d$ is a subgradient of the $\ell^1$-norm at $x\in\real^d$ if and only if $\phi_i =  \text{sign}(x_i)$ if $x_i\neq 0$ and $\vert \phi_i\vert \leq 1$ otherwise \citep{proximal_algorithms_neal}.

Let us define the set $D=\{i\in\{1,\dots, d_x\}| u_i-u_0 = 0\}$. Then, the first order optimality condition becomes 
\begin{align*}
        0&\in 
       \left\lbrace- \sum_{i\in D} t_i  -\sum_{i\notin D}\text{sign}(u_{i}-u_0) + (u_0-\theta)/\lambda\ |\ \vert t_i\vert \leq 1\right\rbrace\\
        \ &\begin{cases}
        0\in \left\lbrace t_i + \frac{u_i-x_i}{\lambda}\ |\ \vert t_i\vert \leq 1\right\rbrace \quad\quad \text{if $i\in D$}\\
        0 = \text{sign}(u_{i}-u_0)+ (u_i-x_i)/\lambda \quad\text{if $i\not\in D$}
    \end{cases}.
\end{align*}

Reordering terms, we get
\begin{align}
    u_0&\in\left\lbrace 
       \theta + \lambda\bigg(\sum_{i\in D} t_i  +\sum_{i\notin D}\text{sign}(u_{i}-u_0)\bigg)\ |\ \vert t_i\vert \leq 1\right\rbrace,\label{eq:iterative_regression_11}
\\
    \ & \begin{cases}
       u_i\in \left\lbrace x_i - \lambda t_i\ |\  \vert t_i\vert \leq 1\right\rbrace  \quad\quad \text{if $i\in D$},\\
        u_i = x_i - \lambda\; \text{sign}(u_{i}-u_0)\quad\text{if $i\not\in D$}.
    \end{cases}\label{eq:iterative_regression_22}
\end{align}
Assuming that $D = \emptyset$, the previous system of equations can be solved iteratively using a fixed point algorithm. 

Alternatively, for a lower computational cost we can obtain an approximate solution by setting $u_0=\theta$ in (\ref{eq:iterative_regression_22})
\begin{equation*}
     \begin{cases}
       u_i\in \left\lbrace x_i - \lambda t_i\ | \ \vert t_i\vert \leq 1\right\rbrace  \quad\quad \text{if $i\in D$},\\
        u_i = x_i - \lambda\; \text{sign}(u_{i}-\theta)\quad\text{if $i\not\in D$}.
    \end{cases}
\end{equation*}
which is solved applying the soft-thresholding operator
\begin{equation*}
    u_i = \theta + [x_i -\theta-\lambda\;\text{sign}(x_i-\theta)]\mathds{1}\{\vert x_i-\theta\vert \geq\lambda \}.
\end{equation*}
Using these $u_i$'s, taking $u_0=\theta$ in the right-hand side of (\ref{eq:iterative_regression_11}) and assuming $D = \emptyset$, we obtain
\begin{equation*}
    u_0 = \theta + \lambda\sum_{i=1}^{d_x} \text{sign}(u_i-\theta).
\end{equation*}

\subsubsection{Laplace Prior with Unknown Scale $e^{2\theta}$}\label{ap:proximal_approx_bnn}
For the Bayesian neural network experiment, we consider a Laplace prior with zero mean and unknown scale parameterised by $e^{2\theta}$ (which ensures that the scale is positive). In this case, we have $g_2(\theta, x) = d_x\theta + \sum_i |x_i|e^{-2\theta}$.
Its proximal operator is given by
\begin{equation*}
    \prox_{g_2}^{\lambda}(\theta, x) =\argmin_{(u_0, u)} h(u_0, u), \quad h(u_0, u) = u_0d_x+ \sum_i |u_i| e^{-2u_0} + \Vert (u_0, u)-(\theta, x)\Vert^2/{(2\lambda)}.
\end{equation*}
The optimality condition is given by
\begin{equation*}
    0\in \partial\big(u_0d_x+ \sum_i |u_i| e^{-2u_0}\big)+\nabla\big(\Vert(u_0, u)-(\theta, x)\Vert^2/(2\lambda)\big),
\end{equation*}
which provides the following system of equations
\begin{align*}
    0 &= d_x-2e^{-2u_0}\sum_{i=1}^{d_x} |u_i| + \frac{1}{\lambda}(u_0-\theta),\\
    \ &\begin{cases}
        0\in \left\lbrace e^{-2u_0}t_i + (u_i-x_i)/\lambda\ |\ \vert t_i\vert \leq 1\right\rbrace\quad\quad \text{if $u_i=0$},\\
        0 = e^{-2u_0}\;\text{sign}(u_{i}) + (u_i-x_i)/\lambda \quad \text{if $u_i\neq 0$}.
    \end{cases} 
\end{align*}
Reordering terms, we get
\begin{align}
    u_0 &= \theta -\lambda d_x +2\lambda e^{-2u_0}\sum_i\vert u_i \vert\label{eq:grad_theta}\\
    \ & \begin{cases}
u_i\in \{x_i - \lambda e^{-2u_0}t_i \ | \ \vert t_i\vert \leq 1\}\quad \quad \text{if $u_i=0$},\\
        u_i=x_i - \lambda e^{-2u_0}\;\text{sign}(u_i)\quad \;\text{if $u_i\neq 0$}. 
    \end{cases}\label{eq:grad_w}
\end{align}
This system of equations can be solved using an iterative solver; however, this would incur a high computational cost. Therefore, we opt for the following approximation of (\ref{eq:grad_w}), where we set $u_0=\theta$,
\begin{equation}
    \begin{cases}
u_i\in \{x_i - \lambda e^{-2\theta}t_i \ | \ \vert t_i\vert \leq 1\}\quad \quad \text{if $u_i=0$},\\
        u_i=x_i - \lambda e^{-2\theta}\;\text{sign}(u_i)\quad \;\text{if $u_i\neq 0$}. 
    \end{cases}\label{eq:grad_w_approx}
\end{equation}
The solution of \eqref{eq:grad_w_approx} is 
\begin{equation*}
    u_i \approx [x_i-\lambda e^{-2\theta}\;\text{sign}(x_i)]\mathds{1}\{\vert x_i\vert \geq\lambda e^{-2\theta}\}. 
\end{equation*}
Using these $u_i$'s together with the Lambert $W$ function, the solution of \eqref{eq:grad_theta} is given by
\begin{equation*}
    u_0 \approx \theta -\lambda d_x + \frac{1}{2} W\left(4\lambda e^{-2(\theta-\lambda d_x)}\sum_i\vert u_i \vert\right).
\end{equation*}

\subsubsection{Uniform Prior}\label{ap:proximal_op_uniform_mean}

We recall that, for the uniform prior, we have
\begin{equation*}
    g_2(\theta, x) = d_x\log(2\theta) + \sum_{i=1}^{d_x} \imath_{[-\theta,\theta]}(x_i),
\end{equation*}
where $\imath_{\mathcal{K}}$ is the convex indicator of $\mathcal{K}$ defined by $\imath_{\mathcal{K}}(x)=0$ if $x\in\mathcal{K}$ and $\imath_{\mathcal{K}}(x)=\infty$ otherwise. 
In this case, the proximal operator satisfies
\begin{align*}
    \prox_{g_2}^{\lambda}(\theta,x)&= \argmin_{(u_0,u)} \{g_2(u_0, u) + \Vert (u_0, u)-(\theta, x)\Vert^2/{(2\lambda)}\} \\
    &= \argmin_{\substack{(u_0,u)\\|u_i|\leq u_0}} \{d_x\log(2u_0) + \Vert (u_0, u)-(\theta, x)\Vert^2/{(2\lambda)}\}.
\end{align*}
We can obtain an approximate solution by deriving the first order conditions for $u_i$ with $i=0, 1, \dots, d_x$ and combining them with the constraint $|u_i|\leq u_0$:
\begin{align*}
u_0 &= \begin{cases}
    \frac{\theta + \sqrt{\theta^2-4\lambda d_x}}{2} \;\;\text{if}\;\;\theta^2\geq4\lambda d_x,\\
    \max_i |x_i|\;\; \text{otherwise},
\end{cases}\\
u_i &= \text{sign}(x_i)\cdot\min\{|x_i|, |u_0|\}. 
\end{align*}

\subsubsection{Approximation for PIPULA and PPGD}\label{ap:proximal_operator_whole}
In PIPULA and PPGD, we need to compute the proximal operator of $U = g_1 + g_2$ which is usually not available in closed form. Since $\gamma$ is normally set to a small enough value, we follow \cite{pereyra2016proximal} and approximate the proximity map of $U$ as
\begin{align*}
    \prox_{U}^{\gamma}(v) &= \argmin_{v'} \{g_1(v') + g_2(v') + \Vert v'- v \Vert^2/(2\gamma)\}\\
    &\approx\argmin_{v'} \{g_1(v) + (v'-v)^{\intercal}\nabla g_1(v) + g_2(v') + \Vert v'- v \Vert^2/(2\gamma)\}\\
    &=\argmin_{v'} \{g_2(v') + \Vert v'- v + \gamma\nabla g_1(v)  \Vert^2/(2\gamma)\}\\
    &= \prox_{g_2}^{\gamma}(v - \gamma\nabla g_1(v)),
\end{align*}
where $v = (\theta, x), v' = (\theta',x')$.}
\subsection{Bayesian logistic regression}\label{ap:experiments_logistic}
In the case of the Laplace prior, the negative log joint likelihood is given by 
\begin{equation*}
  -\log p_{\theta}(x,y) = \underbrace{\sum_{i=1}^{d_x}|x_i-\theta|}_{g_2(\theta, x)} + \underbrace{d_x\log2 -\log p(y|x)}_{g_1(\theta, x)};
\end{equation*}
and for the uniform prior, we obtain
\begin{equation*}
  -\log p_{\theta}(x,y) = \underbrace{d_x\log(2\theta) + \sum_{i=1}^{d_x} \imath_{[-\theta,\theta]}(x_i)}_{g_2(\theta, x)} \underbrace{- \log p(y|x)}_{g_1(\theta, x)},
\end{equation*}
where $g_1$ is differentiable and $g_2$ is lower semi-continuous, and $\imath_{\mathcal{K}}$ is the convex indicator of $\mathcal{K}$ defined by $\imath_{\mathcal{K}}(x)=0$ if $x\in\mathcal{K}$ and $\imath_{\mathcal{K}}(x)=\infty$ otherwise.

In both cases we have that 
\begin{align*}
    g_1(\theta, x) = -\sum_{j=1}^{d_y}\left(y_j\log(s(v_j^Tx))+(1-y_j)\log(s(-v_j^Tx))\right)+C
\end{align*}
where $C$ is a constant. As shown in \citet[Section 6.1.1]{akyildiz2023interacting}, the function $g_1$ is gradient Lipschitz and strictly convex but not strongly convex.
The function $g_2$ satisfies \textbf{A\ref{assumption_1}} for both the Laplace and the uniform prior, as observed in \cite{pereyra2016proximal}. In the case of the Laplace prior, $g_2$ also satisfies \textbf{A\ref{assumption_1prime}}, while the uniform prior does not lead to a Lipschitz $g_2$. Since $g_1$ does not depend on $\theta$, \textbf{A\ref{assumption_4}} holds for the Laplace prior. 

{\normalsize
\paragraph{Dataset.} We create a synthetic dataset by first fixing the value of $\theta$ and sampling the latent variable $x\in \mathbb{R}^{50}$ from the corresponding prior. We then sample the 900 observations from a Bernoulli distribution with parameter $s(v_j^T x)$, where $s$ is the logistic function and the entries of the covariates $v_j$ are drawn from a uniform distribution $\mathcal{U}(-1,1)$. 
The true value of $\theta$ is set to $\bar{\theta}_\star = -4$ for the Laplace prior and $\bar{\theta}_\star = 1.5$ for the uniform one.



\paragraph{Implementation details.}
The $x$-gradients of $g_1$ can be computed analytically. 
To choose the optimal values of $\gamma$ and $\lambda$ for the different implementations, we perform a grid search in the range $[5\times10^{-4}, 0.5]$. The selected optimal values are displayed in Table \ref{table-logistic-hyperparameters}. 
We note that, for PIPGLA, the optimal values of $\lambda$ and $\gamma$ turn out to satisfy $\lambda = \gamma$.

\begin{table}[h!]
  \caption{{\normalsize Optimal hyperparameters for Bayesian logistic regression example. Recall that for PPGD and PIPULA we only have the $\gamma$ parameter since we set $\lambda=\gamma$.}}
  \label{table-logistic-hyperparameters}
  \centering
  \begin{tabular}{llllll}
    \toprule
    Algorithm     & Approx./Iterative  & \multicolumn{2}{c}{$\gamma$} & \multicolumn{2}{c}{$\lambda$} \\
    \cmidrule(r){3-4}
    \cmidrule(r){5-6}
         &   &  Laplace & Unif & Laplace & Unif \\
    \midrule
    \multirow{2}{*}{PPGD} & Approx  & $0.1$  &   $0.03$ & $-$& $-$\\
    & Iterative  & $0.06$ & $-$ &$-$ & $-$ \\
     \midrule
    \multirow{2}{*}{PIPULA} & Approx  & $0.06$  &   $0.03$ & $-$&$-$\\
    & Iterative  & $0.06$ & $-$ & $-$ & $-$\\
     \midrule
   \multirow{2}{*}{MYPGD} & Approx  & \multirow{1}{*}{$0.05$}  &   \multirow{1}{*}{$0.001$}& \multirow{1}{*}{$0.25$} & \multirow{1}{*}{$0.01$}  \\
       & Iterative  &   $0.05$&   $-$& $0.005$ & $-$  \\
        \midrule
    \multirow{2}{*}{MYIPLA} & Approx  & $0.05$& $0.001$& $0.35$& $0.01$\\
       & Iterative  &   $0.05$&   $-$& $0.005$ & $-$  \\
 \midrule
        \multirow{2}{*}{PIPGLA} & Approx  & $0.01$ & $0.02$ & $0.01$ & $0.02$\\
     & Iterative  &   $0.01$&   $-$& $0.01$ & $-$  \\
    \bottomrule
  \end{tabular}
\end{table}

\begin{table}[t!]
\caption{Bayesian logistic regression for Laplace and uniform priors. Normalised MSE (NMSE) for $\theta$ for different algorithms when run 500 times using 50 particles, 5000 steps and different starting points. Computation times and NMSEs are averaged over the 500 replicates. The second column indicates whether the proximal map is calculated approximately or iteratively, using 40 steps in each iteration. For the uniform prior case we have not implemented the iterative method.}
\label{table-logistic-comparison-extended}
\centering
\begin{tabular}{llllll}
  \toprule
  Algorithm     & Approx./Iterative  & \multicolumn{2}{c}{NMSE (\%)} & \multicolumn{2}{c}{Times (s)} \\
  \cmidrule(r){3-4}
  \cmidrule(r){5-6}
       &   &  Laplace & Unif & Laplace & Unif \\
  \midrule
  \multirow{2}{*}{PPGD} & Approx  & $14.70\pm 4.42$  &   $3.63\pm 4.93$ & $102.6\pm 5.1$& $107.9\pm 5.5$\\
  & Iterative  & $19.04\pm 1.34$ & $-$ &$122.3\pm 5.1$ & $-$ \\
   \midrule
  \multirow{2}{*}{PIPULA} & Approx  & $12.18\pm1.62$  &   $4.71\pm 6.02$ & $98.8\pm 5.7$ & $101.0\pm4.0$ \\
  & Iterative  & $19.22\pm1.28$ & $-$ & $126.2\pm 3.8$ & $-$\\
   \midrule
 \multirow{2}{*}{MYPGD} & Approx  & $6.09\pm 0.34$  &   $\mathbf{0.60\pm 0.23}$& $91.9\pm4.8$ &$109.3\pm4.6$  \\
 & Iterative  & $4.44\pm1.40$  &   $-$& $129.7\pm 15.8$ & $-$  \\
  \midrule
  \multirow{2}{*}{MYIPLA} & Approx  & $4.42\pm 1.32$  & $15.26\pm 4.44$ & ${89.9\pm4.2}$& ${97.0\pm4.2}$\\
 & Iterative  &  $4.67\pm1.60$ &   $-$& $120.5\pm10.1$ & $-$  \\
 \midrule
  \multirow{2}{*}{PIPGLA} & Approx  & $2.30\pm0.58$  & $6.83\pm 3.97$ & $116.5\pm5.5$& $103.1\pm8.0$\\
   & Iterative  & $\mathbf{2.02\pm0.54}$  & $-$ & $122.9\pm6.9$& $-$\\
\midrule
  \multirow{1}{*}{IPLA} & --  & $7.76\pm3.39$  & $20.12\pm 2.88$ & $\mathbf{81.1\pm3.0}$& $\mathbf{82.9\pm4.9}$\\
  \bottomrule
\end{tabular}
\end{table}

\paragraph{Results.} 
Table \ref{table-logistic-comparison-extended} extends the results in Table \ref{table-logistic-comparison} by also including the results for PPGD, PIPULA and IPLA (as a benchmark).
Figure~\ref{fig:results_logistic_regression2} shows the $\theta$-iterates obtained with MYIPLA and PIPGLA starting from 7 different initial values $\theta_0$ and using the approximate solver for $\prox_{g_2}^\lambda$ with $g_2(\theta, x)=\sum_{i=1}^{d_x}|x_i-\theta|$ and an iterative procedure using 40 iterations in each step.
We observe that the iterative solver results in a slightly slower convergence to stationarity, but overall the two sets of algorithms converge to the same true value of $\theta$.
We also observe that the convergence to stationarity for PIPGLA is much slower compared to MYIPLA.
However, if we increase the value of $\gamma$ in the hope of faster convergence, the iterates either do not converge to the true value or the standard deviation is significantly larger.
For all algorithms considered, approximate solvers are faster than iterative solvers, with reductions in computation time ranging from about 5\% for PIPGLA to about 30\% for MYPGD (see Table \ref{table-logistic-comparison-extended}).

We also compare the results for the uniform prior. In this case, we only use the approximate proximity map (Figure \ref{fig:logistic_uniform}), as the iterative approach is not numerically stable.

\begin{figure}
  \centering
    \begin{subfigure}[b]{0.32\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/aa_moreau_yosida_ipla_approx.pdf} 
        \caption{MYIPLA, approximated $\prox_{g_2}^{\lambda}$.}
        \label{fig:logistic_myula}
    \end{subfigure}
    \begin{subfigure}[b]{0.32\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/aa_moreau_yosida_pipula_iterative.pdf} 
        \caption{MYIPLA, iterative solver for $\prox_{g_2}^{\lambda}$.}
        \label{fig:logistic_myula_iterative}
    \end{subfigure}
    \vfill
    \begin{subfigure}[b]{0.32\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/prox_pipgla_approximate_regression_laplace_same_lambda_gamma.pdf} 
        \caption{PIPGLA, approximated $\prox_{g_2}^{\lambda}$.}
        \label{fig:logistic_pipgla_same}
    \end{subfigure}
    \begin{subfigure}[b]{0.32\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/prox_pipgla_iterative_regression_laplace_same_lambda_gamma.pdf} 
        \caption{PIPGLA, iterative solver for $\prox_{g_2}^{\lambda}$.}
        \label{fig:logistic_pipgla_same_iterative}
    \end{subfigure}
    \hfill
  \caption{{\normalsize Bayesian logistic regression with isotropic Laplace priors on the regression weights $\prod_i \text{Laplace}(x_i|\theta,1)$, with true $\theta=-4$. Each plot shows the $\theta$-iterates for 7 different starting points.}}  
  \label{fig:results_logistic_regression2}
\end{figure}



\begin{figure}
  \centering
      \begin{subfigure}[b]{0.32\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/aa_my_pgd_approx_unif.pdf}
        \caption{MYPGD.}        \label{fig:logistic_uniform_pgd}
    \end{subfigure}
    \hfill
        \begin{subfigure}[b]{0.32\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/MYIPLA_UNIFORM_REPORT.pdf}
        \caption{MYIPLA.}     \label{fig:logistic_uniform_ipla}
    \end{subfigure}
    \hfill
        \hfill
        \begin{subfigure}[b]{0.32\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/PIPGLA_approx_UNIF_report.pdf}
        \caption{PIPGLA.}   \label{fig:logistic_uniform_PIPGLA}
    \end{subfigure}
    \hfill
  \caption{{\normalsize Bayesian logistic regression with isotropic uniform priors on the regression weights $\prod_i \mathcal{U}(x_i|-\theta,\theta)$, with true $\theta=1.5$. The plot displays the $\theta$-iterates for 7 randomly chosen starting points.}} 
  \label{fig:logistic_uniform}
\end{figure}

Since all the algorithms considered aim at estimating the parameter $\theta$ by sampling from a distribution which concentrates around $\bar{\theta}_\star$, we compare the estimators of $\bar{\theta}_\star$ obtained by using only the last iterate $\theta_{K+1}^N$ and averaging over a number of iterates.
We compare the normalised MSE (NMSE) for $\theta$ for the estimator obtained by averaging 
the $\theta$-iterates after discarding a burn-in of 1500 samples (column named \textit{avg}) against using the last $\theta$ of the chain (column \textit{last}).
The results are in agreement, with the NMSE for the averaged estimator having lower variance in most settings (Table~\ref{tab:blr_avg}).

\begin{table}
\caption{{\normalsize Bayesian logistic regression for Laplace and uniform priors. Normalised MSE (NMSE) for the last iterate of $\theta$ (\textit{last}) and the posterior mean after discarding a burn-in of 1500 samples (\textit{avg}). Each different algorithm is run 500 times for different starting points using 50 particles and 5000 steps. NMSEs are averaged over the 500 replicates. The second column indicates whether the proximal map is calculated approximately or iteratively, using 40 steps in each iteration. For the uniform prior case, we did not implement the iterative method due to numerical instabilities.}}
\label{table-logistic-comparison-avg}
\centering
\begin{tabular}{llllll}
  \toprule
  Algorithm     & Approx/  & \multicolumn{2}{c}{Laplace} & \multicolumn{2}{c}{Uniform} \\
  \cmidrule(r){3-4}
  \cmidrule(r){5-6}
       &  Iterative &  {\small NMSE last(\%)} & {\small NMSE avg(\%)} & {\small NMSE last(\%)} & {\small NMSE avg(\%)}\\
  \midrule
  \multirow{2}{*}{PPGD} & Approx  & $14.70\pm 4.42$  &   $16.73\pm 0.83$ & $3.63\pm 4.93$& $\mathbf{0.11\pm 0.04}$\\
  & Iterative  & $19.04\pm 1.34$ & $18.66\pm0.60 $ &$-$ & $-$ \\
   \midrule
  \multirow{2}{*}{PIPULA} & Approx  & $12.18\pm1.62$  &   $12.34\pm0.82$ & $4.71\pm 6.02$ & $0.12\pm 0.01$ \\
  & Iterative  & $19.22\pm1.28$ & $18.63\pm 0.79$ & $-$ & $-$\\
   \midrule
 \multirow{2}{*}{MYPGD} & Approx  & $6.09\pm 0.34$  &   $4.94\pm 0.51$& $\mathbf{0.60\pm 0.23}$ &$0.60\pm 0.02$  \\
 & Iterative  & $4.44\pm1.40$  &   $4.33\pm 0.59$& $-$ & $-$  \\
  \midrule
  \multirow{2}{*}{MYIPLA} & Approx  & $4.42\pm 1.32$  & $4.31\pm 0.67$ & $15.26\pm 4.44$& $16.01\pm 2.01$\\
 & Iterative  &  $4.67\pm1.60$ &   $4.45\pm 0.42$& $-$ & $-$  \\
 \midrule
  \multirow{2}{*}{PIPGLA} & Approx  & $2.30\pm0.58$  & $2.45\pm 0.94$ & $6.83\pm 3.97$& $4.22\pm 0.07$\\
   & Iterative  & $\mathbf{2.02\pm0.54}$  & $\mathbf{2.03\pm0.88}$ & $-$& $-$\\
  \bottomrule
\end{tabular}
\label{tab:blr_avg}
\end{table}}







\subsection{Bayesian neural network}\label{ap:experiments_bnn}
\subsubsection{Sparsity Inducing Prior: MNIST}\label{ap:experiments_bnn_sparsity_inducing_prior}
Our setting is equivalent to assuming that the datapoints' labels $l$ are conditionally independent given the features $f$ and network weights $x= (w,v)$, and therefore have the following probability density
\begin{equation*}
    p(l|f,x)\propto \exp\bigg(\sum_{j=1}^{40} v_{lj} \tanh\Big(\sum_{i=1}^{784}w_{ji}f_i\Big)\bigg).
\end{equation*}
We assign priors $p_{\alpha}(w)=\prod_i \text{Laplace}(w_i|0, e^{2\alpha})$ and $p_{\beta}(v)=\prod_i \text{Laplace}(v_i|0, e^{2\beta})$ to the input and output layer's weights, respectively, and learn $\theta = (\alpha, \beta)$ from the data. The model's  density is given by
\begin{equation*}
    p_{\theta}(x,\mathcal{Y}_{\text{train}}) =\prod_i \text{Laplace}(w_i|0, e^{2\alpha}) \prod_j \text{Laplace}(v_j|0, e^{2\beta})\prod_{(f,l)\in\mathcal{Y}_{\text{train}}}p(l|f,x),
\end{equation*}
where $x$ denotes the weight matrices, i.e. $x = (w, v)$.
We note that the negative log density can be decomposed as
\begin{equation*}
    -\log p_{\theta}(x,\mathcal{Y}_{\text{train}}) = \underbrace{d_w\alpha + \sum_i |w_i|e^{-2\alpha} + d_v\beta+ \sum_j |v_j|e^{-2\beta}}_{g_2(\theta, x)} \underbrace{- \sum_{(f,l)\in\mathcal{Y}_{\text{train}}}\log p(l|f,x)}_{g_1(\theta, x)},
\end{equation*}
where $d_w$ and $d_v$ denote the dimensions of the weights $w$ and $v$, respectively, $g_1$ is differentiable and does not depend on $\theta$ and $g_2$ is proper, convex and lower semi-continuous.
We have derived an approximation to the proximity map of $g_2$ in Section~\ref{ap:proximal_approx_bnn}.
{\normalsize
\paragraph{Dataset.} We use the MNIST dataset. Features are normalised so that each pixel has mean zero and unit standard deviation across the dataset. We split the dataset into 80/20 training and test sets.

\paragraph{Proximal operator of $g_2$.} 

As $g_2$ can be expressed as $g_2(\theta, x) = g_2(\alpha, w) + g_2(\beta, v)$, we can compute the proximal operators of the two terms separately. It is sufficient to calculate the proximal operator for $g_2(\alpha, w)$ since that of $g_2(\beta, v)$ is obtained analogously. 
To do so, we have that
\begin{equation*}
    \prox_{g_2}^{\lambda}(\alpha, w) =\argmin_{(u_0, u)} h(u_0, u), \quad h(u_0, u) = u_0d_w+ \sum_i |u_i| e^{-2u_0} + \Vert (u_0, u)-(\alpha, w)\Vert^2/{(2\lambda)},
\end{equation*}
whose approximate solution is calculated in Section \ref{ap:proximal_approx_bnn}.

\paragraph{Implementation details.}
For the $x$-gradients of $g_1$, we use JAX's grad function (implementing a version of autograd). Plugging the expressions above in the corresponding equations, 
we can implement the proposed algorithms. 
However, due to the high dimensionality of the latent variables, we stabilise the algorithm using the heuristics discussed in Section 2 of \citet{pmlr-v206-kuntz23a}. 
This simply entails dividing the gradients and proximal mapping terms of the updates of $\alpha$ and $\beta$ by $d_w$ and $d_v$. 
We then set $\gamma = 0.05$ and $\lambda = 0.5$ (after performing a grid search) which ensures that the algorithms are not close to losing stability. 
In addition, the weights of the network are initialised according to the assumed prior. 
This is done by setting each weight to $\pm\; a\log u$ where $u\sim \mathcal{U}(0, 1)$, the sign is chosen uniformly at random and $a>0$ is interpreted as the average initial size of the weights. 
\citet{10.1162/neco.1995.7.1.117} suggests setting $a=1/\sqrt{2m}$ for $w$ and $a=1.6/\sqrt{2m}$ for $v$, where $m$ is the fan-in of the destination unit.

\paragraph{Predictive performance metrics.} To allow comparison, we use the same performance metrics as in \citet{pmlr-v206-kuntz23a}. We include their presentation of these metrics for completeness.

Given a new feature vector $\hat{f}$, the posterior predictive distribution for a label $\hat{l}$ associated with the marginal likelihood maximiser $\Bar{\theta}_{\star}$ is given by
\begin{equation*}
    p_{\Bar{\theta}_{\star}}(\hat{l}|\hat{f}, \mathcal{Y}_{\text{train}}) = \int p(\hat{l}|\hat{f},x) p_{\Bar{\theta}_{\star}}(x|\mathcal{Y}_{\text{train}}) \md x.
\end{equation*}
As $p_{\Bar{\theta}_{\star}}(x|\mathcal{Y}_\text{train})$ is unknown, we approximate it with the empirical distribution of the final particle cloud $q = N^{-1}\sum_{i=1}^N \delta_{X_K^i}$, leading to
\begin{equation*}
     p_{\Bar{\theta}_{\star}}(\hat{l}|\hat{f}, \mathcal{Y}_{\text{train}}) \approx \int p(\hat{l}|\hat{f},x) q(\md x) = \frac{1}{N}\sum_{i=1}^N p(\hat{l}|\hat{f},X_K^i) =: g(\hat{l}|\hat{f}).
\end{equation*}
The metrics considered to evaluate the approximation of the predictive power are the average classification error over the test set $\mathcal{Y}_{\text{test}}$, i.e. 
\begin{equation*}
    \text{Error}:=\frac{1}{\vert\mathcal{Y}_{\text{test}}\vert}\sum_{(f,l)\in \mathcal{Y}_{\text{test}}} \mathds{1}\{l\neq\hat{l}(f)\},\quad \text{where} \;\; \hat{l}(f):= \argmax_{\,\hat{l}} \;g(\hat{l}|f),
\end{equation*}
and the log pointwise predictive density (LPPD, \citet{RefWorks:RefID:76-vehtari2017practical})
\begin{equation*}
     \text{LPPD}:= \frac{1}{\vert\mathcal{Y}_{\text{test}}\vert} \sum_{(f,l)\in \mathcal{Y}_{\text{test}}} \log(g(l|f)).
\end{equation*}
Under the assumption that data is drawn independently from $p(l, f)$, we have the following approximation for large test data sets,
\begin{align*}
    \text{LPPD}&\approx \int \log(g(l|f)) p(\md l, \md f) = \int \bigg[\int \log\Big(\frac{g(l|f)}{p(l|f)}\Big) p(\md l| f)\bigg] p(\md f) + \int \log(p(l|f)) p(\md l, \md f) \\
    &= -\int \text{KL}(p(\cdot|f)\Vert g(\cdot|f)) p(\md f) + \int \log(p(l|f)) p(\md l, \md f).
\end{align*}
This means that the larger the LPPD is, the smaller the mean KL divergence between our classifier $g(l|f)$ and the optimal classifier $p(l|f)$.

\paragraph{Results.} First, it is important to discuss whether the Laplace prior is more appropriate in this setting than the Normal one.
\citet{jaynes_prior} provides two reasons why the Laplace prior is particularly suitable for Bayesian neural network models. Firstly, for any feedforward network there is a functionally equivalent network in which the weight of a non-direct connection has the same size but opposite sign, therefore consistency demands that the prior for a given weight $w$ is a function of $\vert w\vert$ alone. Secondly, if it is assumed that all that is known about $\vert w\vert$ is its scale, and that the scale of a positive quantity is determined by its mean rather than some higher order moment, then the maximum entropy distribution for a positive quantity constrained to a given mean is the exponential distribution. It would follow that the signed weight $w$ has a Laplace density \citep{10.1162/neco.1995.7.1.117}.

We have examined the sparsity-inducing nature of the Laplace prior versus a normal one in Figure \ref{fig:histogram_weights} and Table \ref{table-bnn-comparison}.
As mentioned in the main text, the sparse representation of our experiment also has the advantage of producing models that are smaller in terms of memory usage when small weights are zeroed out.
To investigate this, we set to zero all weights below a certain threshold and analyse the performance of the compressed weight matrices. 
We consider two cases, averaging the particles of the final cloud $X_{500}^1, \dots, X_{500}^{100}$, applying the threshold and then calculating the performance, and secondly, setting to zero small values of each particle of the cloud and averaging the performance of each particle. 
We compare the results for the Bayesian neural networks with Laplace and Normal priors (Table \ref{table-bnn-sparsity}). 
It is important to note that when applying the same threshold to both cases, the Laplace prior leads to a very compressed weight matrix compared to the Normal prior, i.e. there is a significant difference in the percentage of weights set to zero. We observe that when setting the same proportion of weights to zero in both layers, the performance of the BNN with Laplace priors is better in terms of the log pointwise predictive density than that of the BNN with Normal priors, especially when averaging the final cloud of particles before computing the performance.




\begin{table}
\caption{Bayesian neural network. Performance of BNN with Laplace (implemented using MYIPLA) and Normal priors (implementation with PGD) when setting weights from the final particle cloud below a certain threshold to zero. The second column refers to whether the particles are averaged before ($\checkmark$) or after (\scalebox{0.75}{$\ballotx$}) calculating the performance.}
\label{table-bnn-sparsity}
\centering
\begin{tabular}{lcllllll}
  \toprule
  Prior   & Average over  & \multicolumn{2}{c}{$\%$ of zero weights} & \multicolumn{2}{c}{Thresholds} & Error ($\%$)     &  LPPD \\
  \cmidrule(r){3-4}
  \cmidrule(r){5-6}
     & particles? & Layer 1& Layer 2 & Layer 1& Layer 2 &      &   \\
  \midrule
  \multirow{2}{*}{Laplace} & \checkmark & $74$ & $48$  & $0.2$ & $0.2$ &$\mathbf{7.0}$& $\mathbf{-0.23}$\\
  &\scalebox{0.75}{$\ballotx$}  & $56$ & $35$ &   $1$ & $1$& $\mathbf{1.5}$ & $\mathbf{-0.07}$\\
  \midrule  
  \multirow{4}{*}{Normal} & \checkmark  & $74$ & $48$  &   $0.5$ & $1.1$& $15$& $-0.74$\\
      &   \checkmark &$16$ & $15$ &  $0.2$ & $0.2$ & $16$ &$-0.78$\\
  &   \scalebox{0.75}{$\ballotx$} &$56$ & $35$ &  $7$ & $4$ & $2.0$ &$-0.11$\\
  &   \scalebox{0.75}{$\ballotx$} &$8.6$ & $7.1$ &  $1$ & $1$ & $1.5$ &$-0.10$\\    
  \bottomrule
\end{tabular}
\end{table}
Figure \ref{fig:weight_analysis} shows how the performance metrics evolve when weights below a certain threshold are set to zero, when particles are averaged before (\ref{fig:weight_analysis_average_over_particles}) or after (\ref{fig:weight_analysis_not_averaged}) computing the performance for MYIPLA.


Once we have set the weights of the matrix below a certain threshold to zero, it is necessary to explore the dead units. These are hidden units all of whose input or output weights are zero \citep{10.1162/neco.1995.7.1.117}. In both cases, the unit is redundant and can be eliminated to obtain a functionally equivalent network architecture; we call the resulting effective weight matrix $w_{\text{pruned}}$.
The occupancy ratio of a weight matrix $w$ \citep{MARINO2023152} is defined as $\psi=\text{size}(w_{\text{pruned}})/\text{size}(w)$, where $\text{size}$ denotes the memory size. The inverse of $\psi$ is the compression ratio.
We compute the occupancy ratio of the weight matrix for both the hidden and output layer for different values of the pruning threshold. We do this both for each particle of the final cloud (averaging the resulting ratios) and for the averaged final particle cloud; the results are shown in Figure~\ref{fig:memory_analysis}.
\vspace{30pt}

\begin{figure}[h]
  \centering
    \begin{subfigure}[b]{0.25\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/performance_vs_threshold_laplace.pdf}
        \caption{}
        \label{fig:weight_analysis_average_over_particles}
    \end{subfigure}
    \hspace{30pt}
        \begin{subfigure}[b]{0.25\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/performance_vs_threshold_laplace_not_average_w.pdf} 
        \caption{}
        \label{fig:weight_analysis_not_averaged}
    \end{subfigure}
    \hfill
  \caption{{\normalsize Evolution of the performance metrics when weights below a certain threshold are set to zero, when particles are averaged before (a) or after (b) computing the performance.}}  
  \label{fig:weight_analysis}
\end{figure}

\begin{figure}[h!]
  \centering
    \begin{subfigure}[b]{0.25\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/memory_occupancy_mean.pdf}
        \caption{}
        \label{fig:memory_analysis_average_over_particles}
    \end{subfigure}
    \hspace{30pt}
        \begin{subfigure}[b]{0.25\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/memory_occupancy_particles.pdf} 
        \caption{}
        \label{fig:memory_analysis_not_averaged}
    \end{subfigure}
    \hfill
  \caption{{\normalsize Occupancy ratio for the weights matrices of the hidden and output layers as a function of the pruning threshold, when particles are averaged before (a) or after (b) computing the occupancy ratio.}}  
  \label{fig:memory_analysis}
\end{figure}}

\subsubsection{Sparsity Inducing Prior: CIFAR10}\label{ap:experiments_bnn_addtional_dataset}
We further evaluate our methods on  a classification task using a more complex dataset: CIFAR10.
As with the MNIST dataset, to reduce the cost of computing the gradients on a big dataset, we subsample 5000 data points with labels \emph{plane}, \emph{car}, \emph{ship} and \emph{truck}. 

Given that the data consists of colour images, we employ a convolutional neural network (CNN) architecture. Specifically, we use a combination of convolutional layers, max pooling layers, and linear layers with non-linear activation functions. For simplicity, we apply a sparsity-inducing prior only to the linear layers, and not to the convolutional ones. The sparsity-inducing prior for each layer with weight matrix $w$ is given by $p_{\alpha}(w)=\prod_i \text{Laplace}(w_i|0, e^{2\alpha})$ where $\alpha$ is learned from the data. The network structure and layer dimensions are as follows.

\begin{itemize}
    \item  Convolutional layer (Deterministic): Conv2d(3, 6, 5)
\item Max pooling layer (Deterministic): MaxPool2d(2, 2)
\item Convolutional layer (Deterministic): Conv2d(6, 16, 5)
\item Linear layer with sparsity inducing prior + SELU activation function: Linear(16 $\times$ 5 $\times$ 5, 512)
\item Linear layer with sparsity inducing prior + SELU activation function: Linear(512, 256)
\item Linear layer with sparsity inducing prior + SELU activation function: Linear(256, 128)
\item Linear layer with sparsity inducing prior: Linear(128, 4)
\end{itemize}

Table \ref{tab:classification_results_cifar10} presents quantitative results for the variance of the weights and error metrics. The last column provides a measure of the sparsity-inducing effect of the Laplace prior on the linear layers.

\begin{table}[h]
\caption{Bayesian neural network on CIFAR10 dataset. Test errors and log pointwise predictive density (LPPD) achieved using the final particle cloud with $N = 50$. Computation times and standard deviation of the empirical distribution of the weight matrix $w$ for linear layers are also provided.}
\centering
\begin{tabular}{lcccc}
\toprule
{Algorithm} & {Error (\%)} & {LPPD ($\times 10^{-1}$)} & {Time (s)} & {Std. $w$} \\
\midrule
{MYPGD}   & $5.27\pm0.95$      & $-4.41\pm0.38$      & $201$     & $3.10 $      \\
{MYIPLA}  & $\mathbf{5.23\pm1.31}$ & $-5.05\pm0.45$      & $199$     & $3.22$       \\
{PIPGLA}  & $5.39\pm1.02$      & $\mathbf{-4.32\pm0.37}$ & $295$     & $\mathbf{2.85}$ \\
{PGD}     & $6.01\pm1.15$      & $-5.73\pm0.40$      & $\mathbf{178}$ & $11.51$      \\
{SOUL}    & $9.11\pm2.03$      & $-7.68\pm1.56$     & $433$     & $15.68$     \\
{IPLA}    & $5.40\pm1.33$      & $-5.90\pm0.75$      & $181$     & $15.73$     \\
\bottomrule
\end{tabular}
\label{tab:classification_results_cifar10}
\end{table}


\subsubsection{Non-Differentiable Activation Functions}\label{ap:experiments_bnn_activation_function}
During gradient checking in neural network training, a potential source of inaccuracy arises from the presence of non-differentiable points in the objective function \citep{kumar2024gddoesntmakecut}. These non-smooth points often result from the use of activation functions such as the Rectified Linear Unit (ReLU), defined as $\max(0, x)$, as well as from the hinge loss in support vector machines, maxout neurons, among others. To give a concrete example, consider the ReLU activation function and $x<0$ but very close to $0$. The analytic gradient evaluated at $x$ is equal to $0$. However, the numerical gradient can be non-zero when using a finite difference approximation in case $x+h>0$. 

Our method provides a principled way of dealing with these non-differentiable points. 
To illustrate this, we present a simple example similar to the one in the previous section. Here, we consider a Bayesian neural network with a Normal prior distribution on the weights to classify MNIST digits 1 and 7, instead of 4 and 9. 
Additionally, we use a linear approximation of $\tanh$ as the activation function to mitigate the dying neuron problem associated with ReLU \citep{dying_neuron_2020}, while noting that it still remains non-differentiable. This linear approximation is defined as
\begin{equation*}
    h(x) = \begin{cases}
    -1 \quad  \text{if} \; x<-1,\\
     x \quad \ \ \;\text{if} \; x\in[-1, 1],\\
    1 \quad \ \;\;\text{if} \; x>1.
    \end{cases}
\end{equation*}
Furthermore, we can compute the proximal mapping of $h$ which is given by
\begin{equation}\label{eq:proximity_map_leaky_relu}
    \prox_{h}^\lambda(x) = \begin{cases}
        x &\text{if} \; x<-1,\\
        -1  &\text{if} \; x\in[-1, -1+\lambda],\\
        x-\lambda   & \text{for} \; x\in[-1+\lambda,1-\lambda],\\
        1 & \text{for} \; x\in[1-\lambda, 1],\\
        x&\text{if} \; x> 1,
    \end{cases}
\end{equation}
where we have applied the first order optimality condition and used the subgradient of the function at $x=-1$ and $1$ which is given by the sets $[0, 1]$ and $[-1, 0]$, respectively.
Therefore, in this setting we have the following likelihood
\begin{equation*}
    p(l|f,x)\propto \exp\bigg(\sum_{j=1}^{40} v_{lj} h\Big(\sum_{i=1}^{784}w_{ji}f_i\Big)\bigg).
\end{equation*}
We assign priors $p_{\alpha}(w)=\prod_i \mathcal{N}(w_i|0, e^{2\alpha})$ and $p_{\beta}(v)=\prod_i \mathcal{N}(v_i|0, e^{2\beta})$ to the input and output layers' weights, respectively, and learn $\theta = (\alpha, \beta)$ from the data. Hence, the model's density is given by
\begin{equation*}
    p_{\theta}(x,\mathcal{Y}_{\text{train}}) =\prod_i \mathcal{N}(w_i|0, e^{2\alpha}) \prod_j \mathcal{N}(v_j|0, e^{2\beta})\prod_{(f,l)\in\mathcal{Y}_{\text{train}}}p(l|f,x),
\end{equation*}
where $x$ denotes the weight matrices, i.e. $x = (w, v)$.
We note that the negative log density can be decomposed as
\begin{equation*}
    -\log p_{\theta}(x,\mathcal{Y}_{\text{train}}) = \underbrace{d_w\alpha + \frac{1}{2}\sum_i |w_i|^2e^{-2\alpha} + d_v\beta+ \frac{1}{2}\sum_j |v_j|^2e^{-2\beta}}_{g_1(\theta, x)} \underbrace{- \sum_{(f,l)\in\mathcal{Y}_{\text{train}}}\log p(l|f,x)}_{g_2(\theta, x)},
\end{equation*}
where $d_w$ and $d_v$ denote the dimensions of the weights $w$ and $v$, respectively, $g_1$ is differentiable and depends on $\theta$ and $x$, while $g_2$ is proper, convex and lower semi-continuous and only depends on $x$, that is, $g_2(\theta, x) = g_2(x)$. 
As a result, the non-differentiability affects only the latent variables $x$. In this case, we can compute the proximity map of $g_2$ by using the expression for the proximity map of the activation function, provided in \eqref{eq:proximity_map_leaky_relu}.

We follow the same implementation details as outlined in the previous section and use the same performance metrics: average classification error over a test set and log pointwise predictive density.
The results for the proposed proximal algorithms are provided in Table \ref{table-bnn-comparison-nondiff-activation} together with the computation times for $N=50$ and $500$ iterations. In addition, plots of the evolution of the different performance metrics for different numbers of particles are shown in Figure \ref{fig:bnn_non_diff_activation_performance}.
We observe that the standard deviation of the LPPD across runs decreases as the number of particles increases. Moreover, PIPGLA exhibits a lower standard deviation compared to the other methods.

\begin{table}[t]
  \caption{Bayesian neural network with non-differentiable activation function. Test errors and log pointwise predictive density (LPPD) achieved using the final particle cloud with $N = 50$ and $500$ iterations.}
  \label{table-bnn-comparison-nondiff-activation}
  \centering
  \begin{tabular}{llll}
    \toprule
    Algorithm     & Error $\left(\%\right)$     &  LPPD $\left(\times 10^{-2}\right)$ & Times (s)\\
    \midrule
    MYPGD & $0.75\pm 0.68$  & $\mathbf{-3.36\pm1.18}$  &   $\mathbf{40}$\\
    MYIPLA & $\mathbf{0.70\pm 0.50}$  & $-4.28\pm 2.86$ & $\mathbf{40}$\\
        PIPGLA & $0.90\pm 0.49$  & $-3.76\pm0.96$  &   $68$\\
    \bottomrule
  \end{tabular}
\end{table}
\normalsize

\begin{figure}[t]
    \centering

    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/evolution_test_error_N_5.pdf}
        \caption{Test error. $N=5$}
        \label{fig:subfig_11_error_label}
    \end{subfigure}
        \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/evolution_test_error_N_50.pdf}
        \caption{Test error. $N=50$}
        \label{fig:subfig_33_error_label}
    \end{subfigure}
        \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/evolution_test_error_N_100.pdf}
        \caption{Test error. $N=100$}
        \label{fig:subfig_33_error_label_100}
    \end{subfigure}
    \vfill
    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/evolution_lppd_N_5.pdf}
        \caption{LPPD. $N=5$}
        \label{fig:subfig_22_lppd}
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/evolution_lppd_N_50.pdf}
        \caption{LPPD. $N=50$}
        \label{fig:subfig_44_lppd}
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/evolution_lppd_N_100.pdf}
        \caption{LPPD. $N=100$}
        \label{fig:subfig_44_lppd_100}
    \end{subfigure}

    \caption{Evolution of the classification error on a test set (top) and the log pointwise predictive density (LPPD) (bottom) over iterations in the BNN experiment with non-differentiable activation function, using $250$ iterations. Values averaged over 100 runs.}
    \label{fig:bnn_non_diff_activation_performance}
\end{figure}




\subsection{Image Deblurring}\label{app:image_deblurring}
We consider the problem of recovering a high-quality image from a blurred and noisy observation $y = Hx +\varepsilon$, where $H$ is a blurring operator that blurs a pixel $x_{i,j}$ uniformly with its closest neighbours (10 $\times$ 10 patch), and $\varepsilon\sim\mathcal{N}(0, \sigma^2 I)$. 
The log prior is proportional to the total variation defined as $ TV(x)=\Vert \nabla_d x\Vert_{1}$, where $\Vert \cdot\Vert_{1}$ is the $\ell_1$ norm and $\nabla_d$ is the two-dimensional discrete gradient operator, which is non-differentiable. 
The proportionality parameter, $e^\theta$, which controls the strength of this log prior, typically requires manual tuning.
Instead of fixing this parameter manually, we estimate its optimal value using our proposed algorithms. Note that we exponentiate $\theta$ to ensure its positivity.

The posterior distribution for the model takes the form
\begin{equation*}
    p_\theta(x|y)\propto\exp\left(-\Vert y- Hx\Vert^2/(2\sigma^2)-e^\theta TV(x) + \log C(\theta)\right),
\end{equation*}
where $C(\theta)$ is proportional to the normalising constant of the prior distribution. 
To compute $C(\theta)$, we start by considering the case when $\theta = 0$. In this case, the total variation prior is given by
\begin{equation*}
    p(x)= C \exp(-TV(x)),
\end{equation*}
where $C$ is constant. For $\theta\neq 0$, the prior $p_\theta(x)$ can be expressed using the pushforward measure as
\begin{equation*}
    p_\theta(x) = T_{e^\theta}\#p(x) = e^{d_x\theta}p(e^\theta x),
\end{equation*}
where $T_{e^\theta}\#$ denotes the pushforward operator and $d_x$ is the dimension of $x$. Due to the positive homogeneity of the total variation norm, i.e.\ $TV(e^\theta x) = e^\theta TV(x)$, it follows that 
\begin{equation*}
    p_\theta(x)=C e^{d_x\theta} \exp\left(-e^\theta TV(x)\right).
\end{equation*}
Thus, we obtain that $C(\theta) = e^{d_x\theta}$.
For the experiments, we employ the algorithms proposed by \citet{douglas_56} and \citet{chambolle2004algorithm} to efficiently compute the proximal operator of the total variation norm.
Due to the difficulty of computing the joint proximal operator over the parameter $\theta$ and latent variables $x$, we consider hybrid versions of the algorithms, which use standard gradient-based updates for the parameters and proximal updates for the particles. 
That is, the updates for the hybrid MYIPLA algorithm are given by
\begin{align*}
    \theta_{n+1}^N &= \theta_{n}^N  - \frac{\gamma}{d_xN}\sum_{i=1}^N e^{\theta_n^N} TV( X_n^{i, N})+\gamma+\sqrt{\frac{2\gamma}{N}}\xi_{n+1}^{0, N},\\
    X_{n+1}^{i, N} &= \Big(1-\frac{\gamma}{\lambda}\Big)X_{n}^{i, N} -\gamma \frac{H^\intercal(HX_n^{i, N} -y)}{\sigma^2} + \frac{\gamma}{\lambda}\prox_{e^{\theta_n^N} TV}^{\lambda}(X_n^{i, N})+\sqrt{2\gamma}\;\xi_{n+1}^{i, N}.
\end{align*}
Note that as in the Bayesian neural network example, we apply the heuristic of dividing the gradient term in the $\theta$ updates by $d_x$ for numerical stability.
For PIPGLA, the update for the parameter $\theta$ remains the same, while the updates for the particles are of the form
\begin{align*}
    X_{n+1/2}^{i, N} &= X_{n}^{i, N} -\gamma \frac{H^\intercal(HX_n^{i, N} -y)}{\sigma^2} +\sqrt{2\gamma}\;\xi_{n+1}^{i, N},\\
    X_{n+1}^{i, N} &= \prox_{e^{\theta_{n+1}^ {N}}TV}^{\lambda} \big(X_{n+1/2}^{i, N}\big).
\end{align*}
Analogous forms are defined for the proximal PGD algorithms.


\paragraph{Dataset.} We use black and white images with pixel values ranging from 0 to 255. The dimensions of the acoustic guitar image are $d_x = n_1\times n_2 = 584\times 238$, while the dimensions of the boat image (a standard benchmark in the image reconstruction literature) are $d_x = 512\times512$.

\paragraph{Implementation details.} 
We implement the proximal operator of the total variation using the \textit{proxTV} Python package \citep{2011_Barbero11_icml, 2018_barbero_jmlr}. Specifically, we employ the Douglas-Rachford method introduced by \citet{douglas_56} and the Chambolle-Pock method \citep{chambolle2004algorithm}. The Douglas-Rachford method is significantly faster than the Chambolle-Pock method.
It is important to note that increasing the precision of the Moreau-Yosida envelope significantly slows down the computation of the proximal operator when using these numerical schemes.
We set $\gamma = 0.01$, and $\lambda = 0.4$ for MYPGD and MYIPLA and $\lambda = 0.001$ for PIPGLA (after performing a grid search) which ensures that the algorithms are not close to losing stability. 
In addition, the pixels of the initial particles are drawn from a normal distribution with mean $\mu = 50$ and scale parameter $10$, while the initial parameter estimate $\theta_0$ is sampled from a uniform distribution over $[-15, 10]$.

\paragraph{Performance metrics.} 
To evaluate the performance of our algorithms in image reconstruction, we evaluate the mean squared error (MSE) and the structural similarity index (SSIM) between the particle cloud and the ground-truth image. The SSIM quantifies image quality by comparing luminance, contrast, and structural details.



\begin{figure}[t]
    \centering

    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/original_reshaped.jpg}
        \caption{Original}
        \label{fig:subfig_11}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/blurred.jpg}
        \caption{Blurred}
        \label{fig:subfig_22}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/mypgd_reconstructed_2.jpg}
        \caption{MYPGD}
        \label{fig:subfig_33}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/myipla_reconstructed_2.jpg}
        \caption{MYIPLA}
        \label{fig:subfig_44}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/pipgla_reconstructed_4.jpg}
        \caption{PIPGLA}
        \label{fig:subfig_55}
    \end{subfigure}

    \caption{Image deblurring experiment. All the algorithms use $N=10$ particles and are run for 3000 iterations with a burn-in of 100 iterations.}
    \label{fig:image_deconvolution_methods}
\end{figure}


\begin{figure}[h!]
    \centering

    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/parameter_estimates_dr_5_axis.pdf}
        \caption{$\theta$ estimates}
        \label{fig:subfig_11_estimates}
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/evolution_mse_2.pdf}
        \caption{MSE}
        \label{fig:subfig_22_nmse}
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/evolution_ssim_dr_5.pdf}
        \caption{Structural similarity index (SSIM)}
        \label{fig:subfig_22_ssim}
    \end{subfigure}

    \caption{Evolution of different quantities over iterations in the image deblurring experiment with $N=10$ particles for the acoustic guitar image. The plots are shown after discarding a burn-in period of 100 iterations and the initial parameter is $\theta_0 = -10.5$.}
    \label{fig:image_deconvolution_methods_analysis}
\end{figure}

\begin{figure}[t]
    \centering

    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/original_boat.jpg}
        \caption{Original}
        \label{fig:subfig_11_boat}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/blurred_boat.jpg}
        \caption{Blurred}
        \label{fig:subfig_22_boat}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/mypgd_reconstructed_boat.jpg}
        \caption{MYPGD}
        \label{fig:subfig_33_boat}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/myipla_reconstructed_boat.jpg}
        \caption{MYIPLA}
        \label{fig:subfig_44_boat}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/pipgla_reconstructed_boat.jpg}
        \caption{PIPGLA}
        \label{fig:subfig_55_boat}
    \end{subfigure}

    \caption{Image deblurring experiment. All the algorithms use $N=10$ particles and are run for 3000 iterations with a burn-in of 100 iterations.}
    \label{fig:image_deconvolution_methods_boat}
\end{figure}

\begin{figure}[h!]
    \centering

    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/parameter_estimates_dr_boat.pdf}
        \caption{$\theta$ estimates}
        \label{fig:subfig_11_estimates_boat}
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/evolution_mse_dr_boat.pdf}
        \caption{MSE}
        \label{fig:subfig_22_nmse_boat}
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/evolution_ssim_boat.pdf}
        \caption{Structural similarity index (SSIM)}
        \label{fig:subfig_22_ssim_boat}
    \end{subfigure}

    \caption{Evolution of different quantities over iterations in the image deblurring experiment with $N=10$ particles for the boat image. The plots are shown after discarding a burn-in period of 100 iterations and the initial parameter is $\theta_0 = -8.1$.}
    \label{fig:image_deconvolution_methods_analysis_boat}
\end{figure}

\paragraph{Results.}
Figures \ref{fig:image_deconvolution_methods} and \ref{fig:image_deconvolution_methods_boat}  display the original and blurred images alongside the reconstructed images obtained using our different proximal algorithms. The methods are run for 3000 iterations (with a burn-in of 100 iterations) and $N=10$ particles, employing the Douglas-Rachford method to numerically evaluate the proximal operator of the total variation norm. Figures \ref{fig:image_deconvolution_methods_analysis} and \ref{fig:image_deconvolution_methods_analysis_boat} illustrate the evolution of the parameter estimates $\theta$, the mean squared error and the SSIM, after discarding a burn-in period of 100 iterations.
The high MSE for PIPGLA (Figures \ref{fig:subfig_22_nmse} and \ref{fig:subfig_22_nmse_boat}) arises from the difference in the shades of grey between the reconstructed and the original images, remaining large regardless of the choice of the proximal parameter $\lambda$.
Besides, the optimal value for the strength of the total variation prior achieved by the algorithms, $e^\theta\approx 0.35$ (for both test images), is close to the value set manually in similar works for image reconstruction (e.g.\ \citealp{pereyra2016proximal, durmus_proximal,goldman2022gradient}).


\subsection{Nuclear-norm models for low rank matrix estimation}\label{app:low_rank_matrix}
In this section, we demonstrate another application of our methods: the problem of matrix completion. Matrix completion \citep{sparse_matrix_noise, sparse_matrix} focuses on recovering an intact low-rank matrix from incomplete data. Its applications range from
wireless communications \citep{mc_wireless} and traffic sensing \citep{Mardani2014EstimatingTA} to integrated radar and recommender systems \citep{GOGNA20155789}. 
The low-rank prior knowledge is incorporated in the model using the nuclear-norm of the matrix \citep{fazel_2002}. 
However, similar to the image deblurring example, the strength of this prior is a hyperparameter that must be set manually. Instead, we estimate the optimal value of this parameter, thereby extending the applicability of proximal methods that typically perform MLE, rather than MMLE, as in our algorithms.

We conduct a graphical posterior predictive check of the widely used nuclear norm model for low-rank matrices, similar to the example in \citet{pereyra2016proximal}, but in the context of matrix completion rather than matrix denoising. Let $x$ be an unknown low-rank matrix of size $n_1 \times n_2$. 
Consider a mask $M_\Omega$, where $\Omega$ is a set of indices from a matrix of size $n_1 \times n_2$. 
When the mask is applied to the matrix $x$, i.e., $M_\Omega x$, only the entries of the matrix corresponding to indices in  $\Omega$ are observed.
Furthermore, after the masking operation, we do not have direct access to the observed entries but instead observe a noisy version of them, where the observational noise has mean zero and covariance $\sigma^2 I$. Thus, our observations are given by $y = M_\Omega x + \sigma\varepsilon$, with $\varepsilon\sim\mathcal{N}(0, I)$. 
It is important to highlight that we also estimate the scale parameter $\sigma$ (parametrised as $\sigma = e^{\theta_2}$) with our algorithms, rather than requiring it to be fixed manually.

Our objective is to recover $x$ from $y$ under the prior knowledge that $x$ has low rank, that is, most of its singular values are zero. A convenient model for this type of problem is the nuclear
norm prior, which is a sparsity-inducing prior, given by
\begin{equation*}
    p_\theta(x) = C(\theta_1) e^{-e^{\theta_1}\Vert x\Vert_{\text{tr}}},
\end{equation*}
where $\Vert \cdot\Vert_{\text{tr}}$ is the trace (or nuclear) norm, which is a convex envelope of the rank function \citep{JMLR:v9:bach08a_trace_norm}, and is defined as
\begin{equation*}
  \Vert x\Vert_{\text{tr}} = \sum_{i=1}^r\sigma_i(x),
\end{equation*}
where $r=\text{rank}(x)$ and $\sigma_1(x)\geq \dots\geq \sigma_r(x)\geq 0$ are the singular values of $x$.
Besides, the constant $C(\theta_1)$ can be computed using the pushforward argument, as in Section \ref{app:image_deblurring}, leveraging the positive homogeneity of the trace norm. Specifically, it is given by  $C(\theta_1) = C e^{d_x\theta_1}$, where $d_x$ denotes the dimension of the original matrix $x$ and $C$ is a constant.
The posterior distribution of our model can be written as
\begin{equation*}
    p_\theta(x|y)\propto \frac{e^{d_x\theta_1}}{e^{d_y\theta_2}}\exp\left({-\frac{\Vert M_\Omega x-y \Vert^2}{2e^{2\theta_2}}-e^{\theta_1}\Vert x\Vert_{\text{tr}}}\right).
\end{equation*}
Therefore, the negative log density can be decomposed as
\begin{equation*}
  U(\theta, x) =  \underbrace{-d_x\theta_1+d_y\theta_2 +\frac{\Vert M_\Omega x -y\Vert^ 2}{2 e^{2\theta_2}}}_{g_1(\theta, x)} +\underbrace{e^{\theta_1}\Vert x\Vert_{\text{tr}}}_{g_2(\theta, x)},
\end{equation*}
where $d_y$ denotes the number of observed entries.
Note that we exponentiate the parameters $\theta_1$ and $\theta_2$ to ensure their positivity.

\paragraph{Dataset.} We use the \textit{checkerboard} image of size $188\times 188$ and rank 2. We add Gaussian observational noise with variance $\sigma^2 = 0.1$ and mask $30\%$ of the pixels in the image.


\paragraph{Proximal operator of $g_2$.} Recall that $g_2(\theta, x)$ is of the form
\begin{equation*}
    g_2(\theta, x)= e^{\theta_1}\Vert x\Vert_{\text{tr}}.
\end{equation*}
To compute the proximal map, we first observe that if $\theta_1$ is known, then by \citet[Theorem 2.1]{svd_thresholding}, it follows that
\begin{equation*}
    \prox_{g_2}^{\lambda}(x) = \argmin_{z}\; \{e^{\theta_1} \Vert z\Vert_{\text{tr}} + \frac{1}{2\lambda} \Vert x-z\Vert_{F}^2 \}= S_{e^{\theta_1}\lambda}(x) := U\Sigma_{e^{\theta_1}\lambda} V^{T},
\end{equation*}
where $U\Sigma V^T$ is a singular value decomposition, and $\Sigma_{\beta}$ is diagonal with entries $(\Sigma_{\beta})_{ii} = \max \{\Sigma_{ii}-\beta, 0\}$. Based on this, we calculate
\begin{equation*}
    \prox_{g_2}^{\lambda}(\theta, x) = \argmin_{(\alpha, z)}\; \{e^{\alpha} \Vert z\Vert_{\text{tr}} + \frac{1}{2\lambda} \big(\Vert\theta_1-\alpha\Vert^2 + \Vert x-z\Vert_{F}^2 \big)\},
\end{equation*}
where $\Vert\cdot\Vert_F$ denotes the Frobenius norm.
The minimisers $(\alpha, z)$ satisfy the following system of equations
\begin{align}
        &\alpha = \theta_1 -\lambda e^{\alpha}\Vert S_{e^{\alpha}\lambda}(x)\Vert_{\text{tr}} \Longrightarrow (\theta_1 -\alpha)e^{\theta_1 - \alpha} = \lambda e^{\theta_1}\Vert S_{e^{\alpha}\lambda}(x)\Vert_{\text{tr}},\label{eq:alpha_matrix}\\
    &z = S_{\lambda e^{\alpha}}(x).\label{eq:x_matrix}
\end{align}
Solving this system is complicated due to the dependence between $\alpha$ and $z$ and using an iterative solver can be computationally burdensome. Therefore, we have decided to approximate (\ref{eq:alpha_matrix}) by
\begin{equation*}
    (\theta_1 -\alpha)e^{\theta_1-\alpha} \approx \lambda e^{\theta_1}\Vert S_{e^{\theta_1}\lambda}(x)\Vert_{\text{tr}} \Longrightarrow \alpha \approx \theta_1 - W(\lambda e^{\theta_1}\Vert S_{e^{\theta_1}\lambda}(x)\Vert_{\text{tr}}),
\end{equation*}
where $W$ is the Lambert $W$ function.
Substituting this value of $\alpha$ into (\ref{eq:x_matrix}), we obtain 
\begin{equation*}
    z \approx S_{e^{\alpha}\lambda}(x).
\end{equation*}

\paragraph{Implementation.} To stabilise the implementation of the algorithms, we divide the gradient and proximal mapping terms in the updates of $\theta_1$ and $\theta_2$ by the dimension of the matrix $x$, $d_x$, and the number of observed entries in $y$, $d_y$, respectively. We then set $\gamma= 0.01$, and $\lambda=0.25$ for MYPGD and MYIPLA and $\lambda = 0.01$ for PIPGLA. 
The pixels of the initial particles are drawn from a normal distribution with mean $\mu = 50$ and scale parameter $10$, while the initial values of the parameters $\theta_1$ and $\theta_2$ are drawn from uniform distributions over $[-15, 5]$ and $[-10, 10]$, respectively.

\paragraph{Performance metrics.} 
To assess the performance of our algorithms for low-rank matrix completion, we analyse the normalised mean squared error (NMSE) for both the entire matrix and the missing entries.


\paragraph{Results.} Figure \ref{fig:matrix_completion_experiment} displays the original and observed matrices alongside the reconstructed matrices obtained using our different proximal algorithms. The methods are run for 3000 iterations (with a burn-in of 100 iterations) and $N=10$ particles. The NMSEs for the entire matrix and the missing entries for the final particle cloud are displayed in Table \ref{table-matrix-completion-experiment}, together with the computation times.
 

\begin{figure}[t]
    \centering

    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/checkerboard.jpg}
        \caption{Original}
        \label{fig:subfig_11_matrix}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/observed_matrix_completion.jpg}
        \caption{Observed}
        \label{fig:subfig_22_matrix}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/mypgd_matrix_completion.jpg}
        \caption{MYPGD}
        \label{fig:subfig_33_matrix}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/myipla_matrix_completion.jpg}
        \caption{MYIPLA}
        \label{fig:subfig_44_matrix}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.18\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/pipgla_matrix_completion.jpg}
        \caption{PIPGLA}
        \label{fig:subfig_55_matrix}
    \end{subfigure}

    \caption{Low-rank matrix completion. All the algorithms use $N=10$ particles and are run for 3000 iterations. The blue pixels in (b) represent the mask.}
    \label{fig:matrix_completion_experiment}
\end{figure}

\begin{table}[t]
  \caption{Low-rank matrix completion. Normalised mean squared errors (NMSE) for the entire matrix and the missing entries achieved using the final particle cloud with $N = 10$ and $3000$ iterations.}
  \label{table-matrix-completion-experiment}
  \centering
  \begin{tabular}{llll}
    \toprule
    Algorithm     & NMSE entire $\left(\%\right)$     &   NMSE missing $\left(\%\right)$ & Times (min)\\
    \midrule
    MYPGD & $1.21\pm 0.49$  & $1.67\pm0.52$  &   $\mathbf{4.1}$\\
    MYIPLA & $\mathbf{1.13\pm 0.48}$  & $\mathbf{1.53\pm 0.44}$ & $4.7$\\
        PIPGLA & $2.02\pm 0.29$  & $2.11\pm 0.31$  &   $5.5$\\
    \bottomrule
  \end{tabular}
\end{table}
\normalsize

\subsection{Ablation Study}
In this section, we analyse how the choice of the regularisation parameter $\lambda$ in the Moreau–Yosida approximation affects the performance and stability of the algorithm.

Choosing an appropriate value for $\lambda$ is a challenging task as this parameter controls both the level of regularisation and the closeness to the target, and is closely tied to the step size parameter $\gamma$.
\citet{durmus_proximal} provide some empirical guidance on the choice of $\gamma$ and $\lambda$ for sampling tasks. 
\citet{crucinio2023optimal} show that $\lambda \leq \gamma$ generally leads to better results in the case of gradient-Lipschitz potentials, while one should choose $\lambda \geq \gamma$ for light-tailed distributions. Adaptive strategies to choose $\lambda$ have been considered in the optimisation literature (see \citet{oikonomidis24a} and references therein) but equivalent results for sampling have not been obtained yet.


We conduct additional experiments to analyse the impact of the regularisation parameter $\lambda$ in the Bayesian logistic regression task with Laplace prior. In Figure \ref{fig:regularisation_parameter_analysis}, we report the performance (measured by NMSE) of MYIPLA, MYPGD and PIPGLA algorithms using approximate proximity maps, evaluated over a fine grid of $\lambda$ values.
The step size parameters used are those listed in Table~\ref{table-logistic-hyperparameters}: $\gamma = 0.05$ for MYIPLA and MYPGD, and $\gamma = 0.01$ for PIPGLA.
Each configuration is run with 100 different random seeds to compute confidence intervals.  
We observe that our algorithms exhibit stable performance across a broad range of $\lambda$ values.


\begin{figure}[h!]
    \centering

    \begin{subfigure}[b]{0.6\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/mypgd_lambda.pdf}
        \caption{MYPGD, $\gamma = 0.05$}
        \label{fig:subfig_11_lambda_mypgd}
    \end{subfigure}
    \begin{subfigure}[b]{0.6\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/myipla_lambda.pdf}
        \caption{MYIPLA,  $\gamma = 0.05$}
        \label{fig:subfig_22_lambda_myipla}
    \end{subfigure}
    \begin{subfigure}[b]{0.6\textwidth}
        \centering
        \includegraphics[width=\textwidth]{plots/pipgla_lambda.pdf}
        \caption{PIPGLA,  $\gamma = 0.01$}
        \label{fig:subfig_22_lambda_pipgla}
    \end{subfigure}

    \caption{Normalised MSE (\%) for different values of the regularisation parameter $\lambda$ and a fixed step size $\gamma$. Each configuration is run with 100 random seeds for 50 particles and 5000 steps. The proximal map for all algorithms is computed approximately.}
    \label{fig:regularisation_parameter_analysis}
\end{figure}