% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage[utf8]{inputenc}
\usepackage{tikz}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{bbm}
\usepackage{bm}
\usepackage{booktabs}
\usepackage{algorithm}
\usepackage{listings}
\usepackage{multirow}
\usepackage{caption}
\usepackage{subcaption}
\captionsetup[subfigure]{justification=justified,singlelinecheck=false}
\usepackage{comment}
\usepackage[noend]{algpseudocode}
% Theorem-like environments (amsthm is loaded above); each environment is
% declared without a shared counter, so theorems and propositions are
% numbered independently of each other.
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{Proposition}
\usetikzlibrary{shapes,arrows,calc,positioning}
\usetikzlibrary{graphs}
\usepackage{bm}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets the two mandatory arguments in reverse
% order, joined by the optional separator (default: "-"), i.e., <b><sep><a>.
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% some macros
% Bold math symbols (typeset with \bm from the bm package).
\newcommand{\bb}{\bm{b}}
\newcommand{\be}{\bm{e}}
\newcommand{\bp}{\bm{p}}
\newcommand{\bw}{\bm{w}}
\newcommand{\bx}{\bm{x}}
\newcommand{\bz}{\bm{z}}
\newcommand{\bA}{\bm{A}}
\newcommand{\bM}{\bm{M}}
% Hatted set symbols: \sety is used in the appendix for an element of
% \powerset, \setv for a collection of hierarchy nodes (a subset of \calVT).
\newcommand{\sety}{\hat{Y}}
\newcommand{\setv}{\hat{V}}
% Calligraphic letters.
\newcommand{\calC}{\mathcal{C}}
\newcommand{\calM}{\mathcal{M}}
\newcommand{\calQ}{\mathcal{Q}}
\newcommand{\calR}{\mathcal{R}} % DELETE
\newcommand{\calT}{\mathcal{T}}
\newcommand{\calX}{\mathcal{X}}
\newcommand{\calY}{\mathcal{Y}}
% Power set of the class space \calY.
\newcommand{\powerset}{\mathcal{P}(\mathcal{Y})}
% Starred variant of \sety (not used in this file).
\newcommand{\bopset}{\sety^{*}}
% Symbols subscripted with the hierarchy \calT. In the appendix, \RT(.) is
% the representation complexity of a set (minimum cardinality over
% \calST(.)), and \calRT^{(i)} is the i-th representation complexity class.
\newcommand{\calET}{\mathcal{E}_{\mathcal{T}}}
\newcommand{\calGT}{\mathcal{G}_{\mathcal{T}}}
\newcommand{\calVT}{\mathcal{V}_{\mathcal{T}}}
\newcommand{\RT}{R_{\calT}}
\newcommand{\calRT}{\mathcal{R}_{\calT}}
\newcommand{\calST}{\mathcal{S}_{\calT}}
% arg max operator; the starred declaration places subscripts underneath
% the operator in display style (like \max).
\DeclareMathOperator*{\argmax}{\arg\max}

\title{Set-Valued Prediction in Hierarchical Classification with Constrained Representation Complexity (Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<thomasf.mortier@ugent.be>?Subject=UAI 2022 paper}{Thomas~Mortier}{}}
\author[2]{Eyke~H\"ullermeier}
\author[3,4]{Krzysztof~Dembczy\'nski}
\author[1]{Willem~Waegeman}
% Add affiliations after the authors
\affil[1]{%
    Dept. of Data Analysis and Mathematical Modelling\\
    Ghent University\\
    Coupure links 653, Ghent, Belgium
}
\affil[2]{%
    Institute of Informatics\\
    LMU Munich\\
    Akademiestr. 7, Munich, Germany
}
\affil[3]{%
    Institute of Computing Science\\
    Pozna\'n University of Technology\\
    Piotrowo 2, Pozna\'n, Poland
}
\affil[4]{%
    Yahoo! Research\\
    770 Broadway, New York, USA
}
  
  \begin{document}
\maketitle

\appendix

\section{Proof of Theorem~1}\label{sec:app:proofth}
We first prove an intermediate result. 
\begin{proposition}
\label{prop:rc:disjoint}
For any class space $\calY$ and valid hierarchy $\calT$ we have that:
\begin{equation}
	\forall i,j \in [K-1]: \calRT^{(i)}\neq \calRT^{(j)} \implies \calRT^{(i)}\cap\calRT^{(j)} = \emptyset\,.
\end{equation}
\end{proposition}
\begin{proof}
Let $i,j\in[K-1]$ with $\calRT^{(i)}\neq \calRT^{(j)}$ and assume, for the sake of contradiction, that $\calRT^{(i)}\cap\calRT^{(j)} \neq \emptyset$. For any $\hat{Y}\in\calRT^{(i)}\cap\calRT^{(j)}$, we know that:
\begin{align*}
\RT(\hat{Y})=i &\Leftrightarrow \min_{\setv\in \calST(\hat{Y})} |\setv| = i\,,\\
\RT(\hat{Y})=j &\Leftrightarrow \min_{\setv\in \calST(\hat{Y})} |\setv| = j\,,
\end{align*}
which is only possible when $i=j$. This contradicts the assumption that $\calRT^{(i)}\neq \calRT^{(j)}$.
\end{proof}
In order to prove Theorem~1, we need to show that the following conditions are met:
\begin{enumerate}
\item $\forall i,j \in [K-1]: \calRT^{(i)}\neq \calRT^{(j)} \implies \calRT^{(i)}\cap\calRT^{(j)} = \emptyset$
\item $\bigcup_{i\in [K-1]}\calRT^{(i)}=\powerset$
\end{enumerate}
The first condition is met due to Proposition~\ref{prop:rc:disjoint}. To show that the second condition is met, we need to prove that $\bigl(\sety \in \bigcup_{i\in [K-1]}\calRT^{(i)} \implies \sety\in \powerset\bigr) \land \bigl(\sety\in \powerset \implies \sety \in \bigcup_{i\in [K-1]}\calRT^{(i)}\bigr)$. We start by proving the first part, which follows trivially from the definition of a representation complexity class, as each set that belongs to a given representation complexity class must be an element of $\powerset$. To prove the second part, it suffices to show that $\forall\,\sety\in \powerset: \calST(\sety)\neq\emptyset$, or in other words, that for each element $\sety$ in $\powerset$ there exists at least one $\setv\subset \calVT$ such that:
\begin{equation*}
\bigcup_{v_{i}\in \setv} v_{i}=\sety\,,\quad\bigcap_{v_{i}\in\setv} v_{i}=\emptyset\,.
\end{equation*} 
Note that each element $\sety\in \powerset$ can be represented by a single node in the hierarchy, by a union of leaf nodes in the hierarchy $\calT$:
\begin{equation*}
\sety = \bigcup_{c_{i}\in\sety} \{c_{i}\}\,,
\end{equation*}
or by a union of internal and/or leaf nodes. From this, it follows that $\calST(\sety)\neq\emptyset$ and $\RT(\sety)=\min_{\setv\in \calST(\sety)} |\setv|=i$, where $i$ is bounded from below by one and from above by $|\sety|$. Therefore, it follows that $\forall\, \sety\in \powerset\,,\exists\,i\in[K-1]\,:\sety\in\calRT^{(i)}$, which proves the second condition and thereby completes the proof.

\section{Experimental setup}\label{sec:app:expsetup}

We use a MobileNetV2 convolutional neural network~\citep{Sandler18mobilenetv2}, pretrained on ImageNet~\citep{Deng09ImageNet}, to obtain hidden representations for all image datasets. For the bacteria dataset, tf-idf representations are obtained by extracting 3-, 4- and 5-grams from the 16S rRNA sequences that were provided in the dataset~\citep{Fiannaca18Bacteria}. For the proteins dataset, tf-idf representations are obtained by considering 3-grams only. Furthermore, to comply with the literature, the tf-idf representations are concatenated with functional domain encodings, which contain distinct functional and evolutionary information about the protein sequence~\citep{Li18DEEPre}. Next, the obtained feature representations for the biological datasets are passed through a single-layer neural network with 1000 output neurons and a ReLU activation function. We minimize the categorical cross-entropy loss by means of stochastic gradient descent with momentum, where the learning rate and momentum are set to $10^{-5}$ and 0.99, respectively. For the models without hierarchical factorization, we set the number of epochs to 2 and 20 for the Caltech and other datasets, respectively. For the models with hierarchical factorization, we use 4 and 30 epochs, respectively. We train all models end-to-end on a GPU, using the PyTorch library~\citep{Paszke17pytorch} on infrastructure with the following specifications:
\begin{itemize}
    \item \textbf{CPU:} i7-6800K 3.4 GHz (3.8 GHz Turbo Boost) – 6 cores / 12 threads,
    \item \textbf{GPU:} 2x Nvidia GTX 1080 Ti 11GB + 1x Nvidia Tesla K40c 11GB,
    \item \textbf{RAM:} 64GB DDR4-2666.
\end{itemize}
Finally, we implemented the RTS and TOP-$k$ algorithms in C++ by using the PyTorch C++ API~\citep{Paszke17pytorch}.

\bibliography{mortier_453}

\end{document}
