%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% package added by myself
\usepackage{algorithm}
\usepackage{algpseudocode}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{ma_85}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

%\title{Soldering Defect Detection for New Components in Printed Circuit Boards with Unknown Awareness \\(Supplementary Material)}

\title{DeepGD3: Unknown-Aware Deep Generative/Discriminative Hybrid Defect Detector for PCB Soldering Inspection \\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:machingwen@nycu.edu.tw?Subject=DeepGD3}{Ching-Wen~Ma}{}}
\author[1]{Yanwei~Liu}
% Add affiliations after the authors
\affil[1]{%
	College of Artificial Intelligence\\
	National Yang Ming Chiao Tung University\\
	Tainan, Taiwan
}
  
\begin{document}
  
\onecolumn %% Remove this if a two-column layout is desired for the supplement
\maketitle

\appendix
\section{Visual Explanation of the Prediction Converter}
$\tau_0$ controls the variance of the Gaussians. If it is small, the Gaussian looks sharper; if it is large, the Gaussian looks wider. Figure~\ref{fig:tau0} illustrates this concept. Bayesian optimization then optimizes the shape of the Gaussians, making them suitable for thresholding.
\begin{figure}[ht]
	\centering
	\includegraphics[width=0.4\columnwidth]{figures/tau0_.pdf}
	\caption{The effect of $\tau_{0}$ on the shape of probability density function. Black curve: the baseline shape of the probability density function. Red curve: smaller $\tau_{0}$ makes the shape of the probability density function concentrated. Blue curve: larger $\tau_{0}$ makes the shape of the probability density function wider.}
	\label{fig:tau0}
\end{figure}

\begin{figure}[!tbp]
	\centering
	\includegraphics[width=0.8\columnwidth]{figures/tau1_.pdf}
	\caption {The effect of $\tau_1$ and $h^{1}_{j,n}$. The larger the $\tau_1$ is, or similarly, the smaller the $h^{1}_{j,n}$ is, the more test samples will be classified as good samples.}
	\label{fig:tau1}
\end{figure}

\begin{figure}[!tbp]
	\centering
	\includegraphics[width=0.8\columnwidth]{figures/tau2_.pdf}
	\caption{The effect of $\tau_2$ and $h^{2}_{j,n}$. The larger the $\tau_2$ is, or similarly, the smaller the $h^{2}_{j,n}$ is, the fewer test samples will be classified as unknown samples.}
	\label{fig:tau2}
\end{figure}


$\tau_1$ and $\tau_2$ adjust the thresholds $h^{1}_{j,n}$ and $h^{2}_{j,n}$ by adjusting the variance of the corresponding Gaussians. A larger $\tau_1$ makes $h^1_{j,n}$ smaller, resulting in more samples being classified as good samples. A larger $\tau_2$ makes $h^2_{j,n}$ smaller, resulting in fewer samples being classified as unknown samples.
Figure~\ref{fig:tau1} and Figure~\ref{fig:tau2} explain this concept. 
Bayesian optimization then optimizes $\tau_1$ and $\tau_2$ to maximize the harmonic score $H$ defined in Equation~\ref{eq: harmonicScore} of the main paper.
%\label{gmm_threshold}

\section{Additional Experimental Details}
\label{sec:experiment_setup}
We fine-tuned the MobileNetV3 large model pre-trained on ImageNet~\citep{deng2009imagenet} using PyTorch and trained the defect and component classifiers.

The default hyperparameters were as follows. The number of training epochs was 30, and the initial learning rate was 0.05. We used linear warm-up for the first 5 epochs and then decayed the learning rate with a cosine annealing scheduler. Stochastic gradient descent (SGD) was used as the optimizer, with the momentum and weight decay set to 0.9 and 0.0001, respectively. The input images were $224 \times 224$ color images.

In addition, early stopping was applied to prevent overfitting.
We applied random horizontal flip, random vertical flip, color jitter, and random grayscale to the data loader of the training set. We did not apply any image augmentation to the validation and test set. A single NVIDIA Tesla V100 GPU was used for all experiments.

\section{Additional Simulation Results for Ablation Study}
We investigated the effects of (a) not including the good new component samples in the training data and (b) not using the combiner.

\textbf{Comparisons between not including and including good new component samples in the training data.}
Table~\ref{table:additional_good_new_component_all} shows the effects of including and not including good new component samples in the training data for all test samples, i.e., both old and new component test samples. Table~\ref{table:additional_good_new_component_OLD} and Table~\ref{table:additional_good_new_component_NEW} show the performance for the old and new component test samples, respectively.
The $(\cdot)^{-}$ symbol indicates training without including good new component samples. 

Observing all three tables, we notice that Expert 1 shows a lower overkill rate when good new component samples are included in the training data, suggesting that the good new component samples help improve performance. Expert 2's leakage rates are greatly reduced when good new component samples are available during training. For the Hybrid Expert, if there are no good new component samples during the training period, the shared fully connected network $f_{\theta2}$ may not be able to form proper clusters for these new components. As a result, the GMM model will not be able to correctly classify the new component samples, resulting in a higher unknown rate. However, both the overkill rates and the leakage rates still remain low. This implies that even if good new component samples are not available during training, the Hybrid Expert remains trustworthy.

\begin{table}[ht]
	\centering
	\caption{Comparisons between not including $(\cdot)^{-}$ and including $(\cdot)$ good new component samples as additional training data. The test performances for all test samples, i.e., both old and new component test samples.}
	\label{table:additional_good_new_component_all}
	\scalebox{0.8}{\begin{tabular}{cccc}
			\toprule[1.1pt]
			& Overkill rate (\%)    & Leakage rate (\%) & Unknown rate (\%) \\
			\midrule[1.1pt]
			Expert 1$^{-}$ & 1.773 $\pm$ 1.459  & 0.327 $\pm$ 0.463 & - \\
			\midrule
			Expert 1 & 0.015 $\pm$ 0.008  & 1.827 $\pm$ 3.063 & - \\
			\midrule
			Expert 2$^{-}$ & 3.650 $\pm$ 2.902 & 11.419 $\pm$ 19.321 & 0.000 $\pm$ 0.000 \\
			\midrule
			Expert 2 & 1.954 $\pm$ 0.724 & 1.942 $\pm$ 1.337 & 0.000 $\pm$ 0.000 \\
			\midrule
			Hybrid Expert$^{-}$ & 0.189 $\pm$ 0.266 & 0.074 $\pm$ 0.114 & 26.089 $\pm$ 40.843\\
			\midrule
			Hybrid Expert & 0.108 $\pm$ 0.033 & \textbf{0.063 $\pm$ 0.075} & 3.706 $\pm$ 2.270 \\
			\bottomrule[1.1pt]
	\end{tabular}}
\end{table}

\begin{table}[ht]
	\centering
	\caption{Comparisons between not including $(\cdot)^{-}$ and including $(\cdot)$ good new component samples as additional training data. The test performances for old component test samples.}
	\label{table:additional_good_new_component_OLD}
	\scalebox{0.8}{\begin{tabular}{cccc}
			\toprule[1.1pt]
			& Overkill rate (\%)    & Leakage rate (\%) & Unknown rate (\%) \\
			\midrule[1.1pt]
			Expert 1$^{-}$ & 0.097 $\pm$ 0.146  & 0.028 $\pm$ 0.018 & - \\
			\midrule
			Expert 1 & 0.017 $\pm$ 0.007  & 0.021 $\pm$ 0.011 & - \\
			\midrule
			Expert 2$^{-}$ & 3.293 $\pm$ 5.306 & 9.268 $\pm$ 15.373 & 0.000 $\pm$ 0.000 \\
			\midrule
			Expert 2 & 1.282 $\pm$ 0.192 & 2.257 $\pm$ 1.495 & 0.000 $\pm$ 0.000 \\
			\midrule
			Hybrid Expert$^{-}$ & 0.007 $\pm$ 0.010 & 0.042 $\pm$ 0.055 & 18.695 $\pm$ 31.605\\
			\midrule
			Hybrid Expert & 0.129 $\pm$ 0.110 & \textbf{0.019 $\pm$ 0.013} & 3.529 $\pm$ 3.002 \\
			\bottomrule[1.1pt]
	\end{tabular}}
\end{table}

\begin{table}[ht]
	\centering
	\caption{Comparisons between not including $(\cdot)^{-}$ and including $(\cdot)$ good new component samples as additional training data. The test performances for new component test samples.}
	\label{table:additional_good_new_component_NEW}
	\scalebox{0.8}{\begin{tabular}{cccc}
			\toprule[1.1pt]
			& Overkill rate (\%)    & Leakage rate (\%) & Unknown rate (\%) \\
			\midrule[1.1pt]
			Expert 1$^{-}$ & 4.591 $\pm$ 3.687  & 0.611 $\pm$ 0.824 & - \\
			\midrule
			Expert 1 & 0.010 $\pm$ 0.010  & 3.380 $\pm$ 5.540 & - \\
			\midrule
			Expert 2$^{-}$ & 3.368 $\pm$ 3.112 & 17.666 $\pm$ 30.599 & 0.000 $\pm$ 0.000 \\
			\midrule
			Expert 2 & 3.739 $\pm$ 2.459 & 0.989 $\pm$ 1.713 & 0.000 $\pm$ 0.000 \\
			\midrule
			Hybrid Expert$^{-}$ & 0.713 $\pm$ 1.033 & 0.096 $\pm$ 0.167 & 35.055 $\pm$ 46.395\\
			\midrule
			Hybrid Expert & 0.126 $\pm$ 0.062 & \textbf{0.090 $\pm$ 0.156} & 3.324 $\pm$ 1.553 \\
			\bottomrule[1.1pt]
	\end{tabular}}
\end{table}

\textbf{Comparisons between not using and using the prediction combiner.}
Table~\ref{table:component_inside_HBE} shows the performance of $\hat{y}^1_{def}$, $\hat{y}^2_{def}$, and $\hat{y}_{def}$. The results show that $\hat{y}^1_{def}$ produces a high leakage rate with a high standard deviation, and $\hat{y}^2_{def}$ achieves 0.691\% $\pm$ 0.974\% in terms of leakage rate, but also produces a high overkill rate. The prediction combiner combines $\hat{y}^1_{def}$ and $\hat{y}^2_{def}$,
achieving an overkill rate of 0.108\% $\pm$ 0.033\% and a leakage rate of 0.063\% $\pm$ 0.075\%, with an unknown rate of 3.706\% $\pm$ 2.270\%.

\begin{table}[ht]
	\centering
	\caption{Performance of $\hat{y}^1_{def}$, $\hat{y}^2_{def}$, and $\hat{y}_{def}$ for all test samples, i.e., both old and new component test samples.}
	\label{table:component_inside_HBE}
	\scalebox{0.8}{\begin{tabular}{cccc}
			\toprule[1.1pt]
			& Overkill rate (\%)    & Leakage rate (\%) & Unknown rate (\%) \\
			\midrule[1.1pt]
			$\hat{y}^1_{def}$  & 0.390 $\pm$ 0.320 & 0.681 $\pm$ 1.114 & - \\
			\midrule
			$\hat{y}^2_{def}$  & 2.287 $\pm$ 1.424 & 0.691 $\pm$ 0.974 & 0.000 $\pm$ 0.000 \\
			\midrule
			$\hat{y}_{def}$ & \textbf{0.108 $\pm$ 0.033} & \textbf{0.063 $\pm$ 0.075} & 3.706 $\pm$ 2.270 \\
			\bottomrule[1.1pt]
	\end{tabular}}
\end{table}

Table~\ref{table:OLD_test_set_HBE} and Table~\ref{table:NEW_test_set_HBE} show the performance of $\hat{y}^1_{def}$, $\hat{y}^2_{def}$, and $\hat{y}_{def}$ for the old component test set and new component test set, respectively. For the old component setting, $\hat{y}_{def}$ yields the best overkill and leakage rate performance. For the new component setting, $\hat{y}_{def}$ achieves 0.090\% $\pm$ 0.156\% in terms of leakage rate, indicating excellent performance for detecting new component defects.

\begin{table}[ht]
	\centering
	\caption{Performance of $\hat{y}^1_{def}$,  $\hat{y}^2_{def}$, and $\hat{y}_{def}$ for old component test samples.}
	\label{table:OLD_test_set_HBE}
	\scalebox{0.8}{\begin{tabular}{cccc}
			\toprule[1.1pt]
			& Overkill rate (\%)    & Leakage rate (\%) & Unknown rate (\%) \\
			\midrule[1.1pt]
			$\hat{y}^1_{def}$ & 0.671 $\pm$ 0.794 & 0.034 $\pm$ 0.032 & - \\
			\midrule
			$\hat{y}^2_{def}$ & 2.227 $\pm$ 1.768 & 0.891 $\pm$ 1.343 & 0.000 $\pm$ 0.000 \\
			\midrule
			$\hat{y}_{def}$ & \textbf{0.129 $\pm$ 0.110} & \textbf{0.019 $\pm$ 0.013} & 3.529 $\pm$ 3.002 \\
			\bottomrule[1.1pt]
	\end{tabular}}
\end{table}

\begin{table}[ht]
	\centering
	\caption{Performance of $\hat{y}^1_{def}$, $\hat{y}^2_{def}$, and $\hat{y}_{def}$ for new component test samples.}
	\label{table:NEW_test_set_HBE}
	\scalebox{0.8}{\begin{tabular}{cccc}
			\toprule[1.1pt]
			& Overkill rate (\%)    & Leakage rate (\%) & Unknown rate (\%) \\
			\midrule[1.1pt]
			$\hat{y}^1_{def}$ & 0.150 $\pm$ 0.073 & 1.202 $\pm$ 2.064 & - \\
			\midrule
			$\hat{y}^2_{def}$ & 2.315 $\pm$ 0.592 & 0.090 $\pm$ 0.156 & 0.000 $\pm$ 0.000 \\
			\midrule
			$\hat{y}_{def}$ & \textbf{0.126 $\pm$ 0.062} & \textbf{0.090 $\pm$ 0.156} & 3.324 $\pm$ 1.553 \\
			\bottomrule[1.1pt]
	\end{tabular}}
\end{table}

\section{Algorithms}
\label{sec:alg}

\textbf{Training procedure:}
The detailed training procedure of Figure~\ref{fig:HybridExpert_training} is described in Algorithm~\ref{alg:training}.

\begin{algorithm}[ht]
	\caption{Learning algorithm of the proposed method}
	\label{alg:training}
	\begin{algorithmic}[1]
		
		\State \textbf{Input:} A mini batch of defect training samples ($x_{def}, y_{def}$), and component training samples ($x_{com}, y_{com}$) $\in$ $D_{train}$, A mini batch of validation samples ($x_{val}, y'_{def}, y'_{com}$) $\in$ $D_{val}$, Max number of training epochs $E$
		
		\State \textbf{Output:} Feature extractor $f_{\theta1}$, Encoder network $f_{\theta2}$, Defect classifier $\psi$, Projection network $\phi$, Gaussian mixture model $\gamma_{j}$ where $j\in \{1, 2, \ldots, 23\}$.
		\State Initialize $f_{\theta1}$, $f_{\theta2}$, $\psi$, $\phi$, model selection loss $\ell_{\omega}$ = $\infty$\;
		\For{$e\leftarrow 1$ to $E$}
		\State $\ell_{def}$ $\leftarrow$ CrossEntropy(${\psi}(f_{\theta2}(f_{\theta1}(x_{def})))$, ${y}_{def}$) $//$ Train the upper branch, i.e. $f_{\theta1}$, $f_{\theta2}$, and Classifier $\psi$.
		\State Update $\psi, f_{\theta2}, f_{\theta1}$ using $\ell_{def}$\;
		
		\State $\ell_{com}$ $\leftarrow$ MultiSimilarity($\phi(f_{\theta2}(f_{\theta1}(x_{com})))$, $y_{com}$) $//$ Train the lower branch, i.e. $f_{\theta1}$, $f_{\theta2}$, and Projector $\phi$.
		\State Update $\phi, f_{\theta2}, f_{\theta1}$ using $\ell_{com}$\;
		
		\State $\ell_{def'}$ $\leftarrow$ CrossEntropy($\psi(f_{\theta2}(f_{\theta1}(x_{val}))), y'_{def}$) $//$ Evaluate the upper branch
		
		
		\State $\ell_{com'}$ $\leftarrow$ MultiSimilarity($\phi(f_{\theta2}(f_{\theta1}(x_{val}))), y'_{com}$) $//$ Evaluate the lower branch
		
		\State $\ell_{eval}$ $\leftarrow$ $\ell_{def'}$ + $\ell_{com'}$ $//$ Procedure for model selection
		\If{$\ell_{\omega}$ $>$ $\ell_{eval}$} 
		\State $\ell_{\omega}$ $\leftarrow$ $\ell_{eval}$\;
		\State saveModel($f_{\theta1}$, $f_{\theta2}$, $\psi$, $\phi$)\;
		\EndIf
		\EndFor
		
		\For{$j\leftarrow 1$ to 23}
		\State $\gamma_{j}$ $\leftarrow$ $\gamma_{j}$($f_{\theta2}(f_{\theta1}(x_{com, j})), y_{com}$) $//$ Gaussian mixture model fitting on $f_{\theta2}(f_{\theta1}(x_{com, j}))$ guided by $y_{com}$.
		\EndFor
	\end{algorithmic}
\end{algorithm}

\textbf{Inference procedure:}
The Hybrid Expert inference procedure is described in Algorithm~\ref{alg:inference}.

\begin{algorithm}[ht]
	\caption{Inference Procedure of the Proposed Method}
	\label{alg:inference}
	\begin{algorithmic}[1]
		
		\State \textbf{Input:} A mini batch of test samples $x_{test}$ $\in$ $D_{test}$
		
		\State \textbf{Output:} Defect prediction $\hat{y}_{def}$
		
		\State Require Feature extractor $f_{\theta1}$, Shared fully connected network $f_{\theta2}$, Defect classifier $\psi$, Gaussian mixture model $\gamma_{j}$ where $j\in \{1, 2, \ldots, 23\}$, and prediction converter $\Lambda$;
		
		\State $\hat{y}^{1}_{def}$ $\leftarrow$ $\psi(f_{\theta2}(f_{\theta1}(x_{test})))$ $//$ Defect classification.
		
		\For{$j\leftarrow 1$ to 23}
		\State $P(\cdot|y_{com}=j)$ $\leftarrow$ $\gamma_{j}(f_{\theta2}(f_{\theta1}(x_{test})))$ $//$ Probabilistic component type prediction.
		\EndFor
		\State $\hat{y}^{2}_{def}$ $\leftarrow$ $\Lambda(\hat{y}^{j = 1, 2, \ldots, 23} )$ $//$ Prediction conversion.
		
		\If{$\hat{y}^{1}_{def}$ == $\hat{y}^{2}_{def}$}
		\State $\hat{y}_{def}$ $\leftarrow$ $\hat{y}^{1}_{def}$, $\hat{y}^{2}_{def}$ $//$ If the predictions match, $\hat{y}_{def}$ will be the same as $\hat{y}^{1}_{def}$ and $\hat{y}^{2}_{def}$.
		\Else
		\State {$\hat{y}_{def}$ $\leftarrow$ unknown} $//$ If the predictions do not match, $\hat{y}_{def}$ is unknown.
		\EndIf
		
		\State return $\hat{y}_{def}$
	\end{algorithmic}
\end{algorithm}



\bibliography{reference}

\end{document}
