\documentclass[accepted]{uai2022} %
%
%
%
%
%
%
%
%

%
%
%
%

%
\usepackage[round]{natbib}
% Use "References" as the bibliography title text.
\renewcommand{\bibname}{References}
% Typeset the bibliography heading as an unnumbered \subsubsection that
% prints \bibname (natbib calls \bibsection to produce the heading).
\renewcommand{\bibsection}{\subsubsection*{\bibname}}

%
%

%
%

\usepackage{amsmath,amsfonts,bm}

%
% Emphasised position / panel tags for use inside captions,
% e.g. "\figleft{} Training curve. \figright{} Test curve."
% \emph is used instead of the {\em ...} switch so that the italic
% correction after the closing parenthesis is inserted automatically.
\newcommand{\figleft}{\emph{(Left)}}
\newcommand{\figcenter}{\emph{(Center)}}
\newcommand{\figright}{\emph{(Right)}}
\newcommand{\figtop}{\emph{(Top)}}
\newcommand{\figbottom}{\emph{(Bottom)}}
\newcommand{\captiona}{\emph{(a)}}
\newcommand{\captionb}{\emph{(b)}}
\newcommand{\captionc}{\emph{(c)}}
\newcommand{\captiond}{\emph{(d)}}

% Typeset a newly introduced term in bold.
% \textbf replaces the obsolete LaTeX 2.09 switch {\bf ...}, which some
% document classes no longer define.
\newcommand{\newterm}[1]{\textbf{#1}}


%
% Cross-reference helpers.  Each macro prints the kind of object followed
% by the number(s) produced by \ref.  Lower-case names are for use inside
% a sentence, capitalised names for the start of a sentence.  A tie (~)
% keeps the word and the number it introduces on the same line.

% Figures.
\def\figref#1{figure~\ref{#1}}
\def\Figref#1{Figure~\ref{#1}}
\def\twofigref#1#2{figures~\ref{#1} and~\ref{#2}}
\def\quadfigref#1#2#3#4{figures~\ref{#1}, \ref{#2}, \ref{#3} and~\ref{#4}}

% Sections.
\def\secref#1{section~\ref{#1}}
\def\Secref#1{Section~\ref{#1}}
\def\twosecrefs#1#2{sections~\ref{#1} and~\ref{#2}}
\def\secrefs#1#2#3{sections~\ref{#1}, \ref{#2} and~\ref{#3}}

% Equations.
% NOTE: this \def replaces amsmath's \eqref (which prints "(n)") with a
% version that prints "equation n"; \plaineqref prints the bare number.
\def\eqref#1{equation~\ref{#1}}
\def\Eqref#1{Equation~\ref{#1}}
\def\plaineqref#1{\ref{#1}}

% Chapters.
\def\chapref#1{chapter~\ref{#1}}
\def\Chapref#1{Chapter~\ref{#1}}
% Range of chapters, "chapters m--n".  The tie after "chapters" was
% missing, which glued the word to the first number ("chapters1--3").
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}

% Algorithms.
\def\algref#1{algorithm~\ref{#1}}
\def\Algref#1{Algorithm~\ref{#1}}
\def\twoalgref#1#2{algorithms~\ref{#1} and~\ref{#2}}
\def\Twoalgref#1#2{Algorithms~\ref{#1} and~\ref{#2}}

% Parts.
\def\partref#1{part~\ref{#1}}
\def\Partref#1{Part~\ref{#1}}
\def\twopartref#1#2{parts~\ref{#1} and~\ref{#2}}

% Ceiling and floor brackets around the argument (fixed-size delimiters).
\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
% Bold digit one, typeset with \bm.
\def\1{\bm{1}}
% Calligraphic D, plain and with an upright "valid" / "test" subscript.
% The subscript is attached outside the \mathcal argument, so only the
% letter D is passed to the calligraphic alphabet; the extra pair of
% braces keeps the whole symbol a single group, as before.
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{{\mathcal{D}_{\mathrm{valid}}}}
\newcommand{\test}{{\mathcal{D}_{\mathrm{test}}}}

% Short alias for \epsilon.
\def\eps{{\epsilon}}


%
% \r<x> series: each macro expands to the lower-case letter <x> typeset
% with \textnormal (upright text font); \reta does the same for $\eta$.
% NOTE(review): the "r" prefix presumably denotes a scalar random
% variable -- confirm against the paper's notation.
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
% No entry for the letter m in this series (presumably because \rm is
% already a font command -- confirm).
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

%
% \rv<x> series: each macro expands to <x> wrapped in \mathbf (upright
% bold math alphabet).
% NOTE(review): the "rv" prefix presumably denotes a random vector --
% confirm against the paper's notation.
% NOTE: with the default math fonts \mathbf does not change lower-case
% Greek letters, so \rvepsilon and \rvtheta print as ordinary symbols.
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% Was "\def\rvu{{\mathbf{i}}}": that definition was overwritten by the
% real \rvu below and left \rvi undefined.
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

%
% \erv<x> series: each macro expands to the lower-case letter <x> typeset
% with \textnormal (upright text font), for all 26 letters.
% NOTE(review): the "erv" prefix presumably denotes an element of a
% random vector -- confirm against the paper's notation.
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

%
% \rm<X> series: each macro expands to the upper-case letter <X> wrapped
% in \mathbf (upright bold math alphabet), for A through Z.
% NOTE(review): the "rm" prefix presumably denotes a random matrix --
% confirm against the paper's notation.
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

%
% \erm<X> series: each macro expands to the upper-case letter <X> typeset
% with \textnormal (upright text font), for A through Z.
% NOTE(review): the "erm" prefix presumably denotes an element of a
% random matrix -- confirm against the paper's notation.
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

%
% \v<x> series: each macro expands to its symbol wrapped in \bm (bold
% math from the bm package): the digits 0 and 1, \mu, \theta, and the
% lower-case letters a through z.
% NOTE(review): the "v" prefix presumably denotes a vector -- confirm
% against the paper's notation.
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

%
% \ev<x> series: each macro expands to the plain (non-bold) math symbol
% <x> inside a brace group: a selection of Greek letters followed by the
% lower-case letters a through z.
% NOTE(review): the "ev" prefix presumably denotes an element of a
% vector -- confirm against the paper's notation.
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

%
% \m<X> series: each macro expands to its symbol wrapped in \bm (bold
% math from the bm package): the upper-case letters A through Z, then
% \beta, \Phi, \Lambda and \Sigma.
% NOTE(review): the "m" prefix presumably denotes a matrix -- confirm
% against the paper's notation.
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

%
% Declare the math alphabet \mathsfit: default encoding, default
% sans-serif family, medium series, slanted shape.
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
% In the "bold" math version, \mathsfit uses bold-extended upright
% sans-serif instead.
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
% \tens{X}: the argument in \mathsfit, wrapped in \bm.
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
% \t<X> series: \tens applied to each upper-case letter A through Z.
% NOTE(review): the "t" prefix presumably denotes a tensor -- confirm
% against the paper's notation.
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


%
% \g<X> series: each macro expands to the upper-case letter <X> in the
% calligraphic alphabet (\mathcal), for A through Z.
% NOTE(review): the "g" prefix presumably denotes a graph -- confirm
% against the paper's notation.
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

%
% \s<X> series: each macro expands to the upper-case letter <X> in the
% blackboard-bold alphabet (\mathbb).
% NOTE(review): the "s" prefix presumably denotes a set -- confirm
% against the paper's notation.
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% No \sE is defined in this series; use \E (defined further down in the
% preamble) for a blackboard-bold E.
%
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

%
% \em<X> series: each macro expands to the plain (non-bold) math symbol
% <X> inside a brace group: \Lambda, the upper-case letters A through Z,
% and \Sigma.
% NOTE(review): the "em" prefix presumably denotes an entry of a
% matrix -- confirm against the paper's notation.
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

%
%
% \etens{X}: the argument in the \mathsfit math alphabet, without the
% \bm wrapper that \tens adds.
\newcommand{\etens}[1]{\mathsfit{#1}}
% \et<X> series: \etens applied to \Lambda and to each upper-case letter
% A through Z.
% NOTE(review): the "et" prefix presumably denotes an entry of a
% tensor -- confirm against the paper's notation.
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

%
% Probability symbols with an upright word as subscript.
% \mathrm{...} replaces the former "\rm{...}": \rm is an obsolete font
% switch that takes no argument (it only worked because the subscript
% braces happened to limit its scope) and is not defined by every class.

% p with subscript "data".
\newcommand{\pdata}{p_{\mathrm{data}}}
% Hatted p / P with subscript "data".
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
% p, P and tilde-p with subscript "model".
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
% p with subscripts "encoder", "decoder" and "reconstruct".
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

% Upright word "Laplace" for use in math mode.
\newcommand{\laplace}{\mathrm{Laplace}} % upright label, not a math operator

% Single-symbol shorthands: blackboard-bold E and R, calligraphic L,
% tilde-p, and Greek letters given descriptive names.
\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
% Upright function / statistic names built with \mathrm.  These are plain
% symbols, so no operator spacing is inserted around them.
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
%
% Norm names written as the letter L with a superscript.
%
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} % the two letters "Pa", set as ordinary math italics

% Operators declared with the starred form, so limits are set underneath
% in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

% Ordinary (unstarred) named operators.
\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
% \ab is a short alias for \allowbreak.
\let\ab\allowbreak
 \usepackage{graphicx}

\usepackage[utf8]{inputenc} %
\usepackage[T1]{fontenc}    %
\usepackage{hyperref}       %
\usepackage{url}            %
\usepackage{booktabs}       %
\usepackage{amsfonts}       %
\usepackage{nicefrac}       %
\usepackage{microtype}      %
\usepackage{xcolor}         %

\usepackage{xr}

\usepackage{amsmath} 
\usepackage{amssymb}




% Import the labels of the external document "hosoya_14" (xr package) so
% that \ref can resolve labels that are not defined in this file.
\externaldocument{hosoya_14}


%
%
%
%
%
%
%


% Editing markers: put the word FIX or NEW in the margin next to the
% current line.
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}



%
% Define \expandableinput{<file>}, which calls the control sequence named
% "@@input" on the full name of <file> as returned by \file_full_name:n.
% A hook at the start of every tabular environment then makes \input an
% alias of \expandableinput for the duration of that environment.
% NOTE(review): presumably this lets table rows be read with \input
% inside tabular without the usual errors -- confirm; no \input appears
% inside a tabular in this file.
\ExplSyntaxOn
\cs_new:Npn \expandableinput #1
  { \use:c { @@input } { \file_full_name:n {#1} } }
\AddToHook{env/tabular/begin}
  { \cs_set_eq:NN \input \expandableinput }
\ExplSyntaxOff

% Title block.  The author name is a mailto: hyperlink; the optional
% argument [1] on \author and \affil ties the author to affiliation 1.
\title{CIGMO: Categorical invariant representations in a deep generative framework (Supplementary material)}
\author[1]{\href{mailto:<hosoya@atr.jp>?Subject=Your UAI 2022 paper}{Haruo~Hosoya}{}}
\affil[1]{%
    Brain Labs.\\
    ATR International\\
    Kyoto, Japan
    }

\begin{document}
\maketitle

% Method name in small caps.
\newcommand{\CIGMO}{\textsc{CIGMO}}

% Calligraphic N and L under descriptive names.
\newcommand{\gaussd}{\mathcal{N}}
\newcommand{\lowerb}{\mathcal{L}}

% Upright "diag" wrapped in \mathop so it is spaced as an operator.
\newcommand{\diag}{\mathop{\mathrm{diag}}}
% \todo{...} discards its argument, so such notes never reach the output.
\newcommand{\todo}[1]{}

% Display names of the methods.
\newcommand{\methIIC}{IIC}
\newcommand{\methIICoc}{IIC (overclust.)}
\newcommand{\methVAE}{VAE + $k$-means}
\newcommand{\methMVAE}{Mixture of VAEs}
\newcommand{\methGVAE}{GVAE + $k$-means}
\newcommand{\methMGVAE}{{\bf\CIGMO}}
\newcommand{\methMLVAE}{MLVAE + $k$-means}

% Override three of the names above with versions without the
% "+ $k$-means" suffix; these are the definitions in effect from here on.
\renewcommand{\methVAE}{VAE}
\renewcommand{\methGVAE}{GVAE}
\renewcommand{\methMLVAE}{MLVAE}




%
\section{Dataset details}
\label{sec:dataset}

%

\paragraph{ShapeNet}

We used an object image dataset derived from a core subset of the (public-domain) ShapeNet database of 3D object models \citep{Chang2015} used in the SHREC2016 challenge\footnote{\url{https://shapenet.cs.stanford.edu/shrec16/}}.  The subset contained 55 object classes and did not include material data.  We selected 10 out of the pre-defined classes: car, chair, table, airplane, lamp, boat, box, display, truck, and vase.  Our selection criterion was to choose classes with a large number of object identities while avoiding visually similar classes, e.g., chair, sofa, and bench.  We rendered each object in 30 views consisting of 15 azimuths (equally dividing $360^\circ$) and 2 elevations ($0^\circ$ and $22.5^\circ$ downward) in a single lighting condition; the images were gray-scale and had size $64\times 64$ pixels.  All rendering was done with the Blender software\footnote{\url{https://www.blender.org}}.  We divided the data into training and test sets following the split given in the original database.

\paragraph{MVC Cloth}

We used a subset of the MVC Cloth image dataset \citep{Liu2016a}\footnote{\url{https://github.com/MVC-Datasets/MVC}}, which contains a number of photos of clothes worn by fashion models; the same cloth type is shown from multiple viewing angles.  The dataset provides no class labels, but provides 264 binary attribute labels that are related to cloth kinds (Dresses, Denim, TShirts, etc.), cloth styles (Short, Sleeveless, LongSleeves, etc.), cloth materials (Cotton, Nylon, Black, White, etc.), and prices (hundred1U, fiftyU, etc.).  We rescaled the images to $64\times 64$ pixels and split the data into training and test sets (of size $\sim$112K and $\sim$28K, respectively) so that the cloth types do not overlap between them.

%

\section{Platform details}

We used 3 computers with the following specifications: (1) 56 core CPUs (256G memory) with 4 V100 GPUs (16G memory each), (2) 56 core CPUs (256G memory) with 4 V100 GPUs (32G memory each), and (3) 96 core CPUs (256G memory) with 4 A100 GPUs (40G memory each).  All code was implemented in Python (3.7.4) / PyTorch (1.7.1).

\section{Architecture details}
\label{sec:arch}

In a {\CIGMO} model, the categorizer deep net $u$ consisted of three convolutional layers with $32$, $64$, and $128$ filters, respectively (kernel $5\times 5$; stride $2$; padding $2$), followed by two fully connected layers with $500$ intermediate units and $C$ output units, respectively.  These layers were interleaved with Batch Normalization and ReLU nonlinearity, except that the last layer ended with Softmax.  The shape and view encoder deep nets had a similar architecture, except that the last layer was linear for encoding the mean ($g$ and $h_c$) or ended with Softplus for encoding the variance ($r$ and $s_c$).   The decoder deep nets $f_c$ had two fully connected layers ($103$ input units and $500$ intermediate units) followed by three transposed convolutional layers with $128$, $64$, and $32$ filters, respectively (kernel $6\times 6$; stride $2$; padding $2$).  These layers were again interleaved with Batch Normalization and ReLU nonlinearity, but the last layer ended with Sigmoid.
%
To save memory, the shape encoders shared the first four layers for all categories and for both mean and variance.  %
%
The decoders shared all but the first layer for all categories.

%


\section{Additional results for ShapeNet}
%
\label{sec:design}
%

%
%
%
%
%
%
%


In Sections~\ref{sec:learning} and~\ref{sec:variants}, we raised several design alternatives in the model construction.  The first choice regards how to combine instance-specific categorical probability distributions and has three options: averaging (default), normalized product, and logit averaging.   The second choice regards whether views are dependent on category or not (default).  Table~\ref{tbl:design} summarizes performance results in invariant clustering and one-shot identification tasks, changing the options from the default, for ShapeNet.  Overall, the default design tended to give slightly better performance than the other options (though the differences were mostly statistically insignificant in invariant clustering).  In particular, we could not see any advantage of using category-dependent view representations despite potentially different meanings of views.
%
In addition, Tables~\ref{tbl:shape-view-alt} and~\ref{tbl:swapping-alt} compare the design options in terms of degree of shape-view disentanglement and swapping errors, respectively.  The results again show that the default design tended to give slightly better performance than the other options (though the differences were often statistically insignificant).
As an additional remark, we found the product option to be numerically rather unstable.

%
%

%
%
%
%

% Row labels for the design-option tables below.  Each macro contains an
% alignment tab (&) and therefore fills the first TWO columns of a
% tabular row: (1) the way category distributions are combined
% (AV / PD / LA) and (2) whether views depend on the category (Y / N).
% \methMGVAE is temporarily redefined here; it is set back to the bold
% method name after the tables.
\renewcommand{\methMGVAE}{AV & N}
\newcommand{\methMGVAEap}{PD & N}
\newcommand{\methMGVAEal}{LA & N}
\newcommand{\methMGVAEmv}{AV & Y}

% Design-option comparison on ShapeNet: invariant clustering and one-shot
% identification accuracy.
% \centering replaces the center environment (which adds extra vertical
% space inside a float); it is issued after \caption so the caption
% layout is untouched.  \label now directly follows \caption.
\begin{table*}[t]
\caption{Comparison of design options in {\CIGMO} in terms of (1) how to combine category distributions: averaging (AV; default), product (PD), and logit-averaging (LA) and (2) whether views depend on the category or not (default).  The comparisons are made with invariant clustering accuracy (left half; \%) and one-shot identification accuracy (right half; \%) for ShapeNet. }
\label{tbl:design}
\fontsize{8.5pt}{10pt}\selectfont
\centering
\begin{tabular}{cccccccc}
\toprule%
&& \multicolumn{3}{c}{invariant clustering (\%)} & \multicolumn{3}{c}{one-shot identification (\%)} \\
(1) & (2) & 3 classes & 5 classes & 10 classes & 3 classes & 5 classes & 10 classes \\
\midrule%
% Each row-label macro expands to the first two cells, "(1) & (2)".
\methMGVAE & \textbf{94.83} $\pm$ 6.06 & \textbf{89.36} $\pm$ 4.53 & \textrm{68.53} $\pm$ 4.24 & \textbf{27.33} $\pm$ 0.55 & \textbf{24.51} $\pm$ 0.68 & \textbf{21.79} $\pm$ 0.71 \\
\methMGVAEap & \textrm{90.44} $\pm$ 9.11 & \textrm{80.96} $\pm$ 9.65 & \textrm{68.30} $\pm$ 3.64 & \textrm{26.64} $\pm$ 0.57$^{*}$ & \textrm{23.55} $\pm$ 0.56$^{*}$ & \textrm{20.63} $\pm$ 0.80$^{*}$ \\
\methMGVAEal & \textrm{88.16} $\pm$ 15.89 & \textrm{89.16} $\pm$ 3.08 & \textbf{69.55} $\pm$ 3.26 & \textrm{26.52} $\pm$ 0.84$^{*}$ & \textrm{23.57} $\pm$ 0.34$^{*}$ & \textrm{20.58} $\pm$ 0.91$^{*}$ \\
\methMGVAEmv & \textrm{92.32} $\pm$ 8.39 & \textrm{82.60} $\pm$ 6.53$^{*}$ & \textrm{65.03} $\pm$ 6.62 & \textrm{26.54} $\pm$ 0.61$^{*}$ & \textrm{24.16} $\pm$ 0.58 & \textrm{21.24} $\pm$ 1.00 \\
\bottomrule
\end{tabular}
\end{table*}


% Design-option comparison on ShapeNet: shape-view disentanglement.
% \centering replaces the center environment (which adds extra vertical
% space inside a float); it is issued after \caption so the caption
% layout is untouched.  \label now directly follows \caption.
\begin{table*}[t]
\caption{Comparison of different design options in terms of degree of shape-view disentanglement for ShapeNet, measured as neural network classification accuracy (\%) for object identity from the shape (left half; higher is better) or view variable (right half; lower is better).   }
\label{tbl:shape-view-alt}
\small
\centering
\begin{tabular}{llcccccc}
\toprule%
&& \multicolumn{3}{c}{shape $\rightarrow$ id} & \multicolumn{3}{c}{view $\rightarrow$ id}\\
& & 3 cats. & 5 cats. & 10 cats. & 3 cats. & 5 cats. & 10 cats.\\
\midrule%
% Each row-label macro expands to the first two cells of its row.
\methMGVAE & \textbf{57.62} $\pm$ 0.93 & \textbf{50.91} $\pm$ 0.97 & \textbf{46.28} $\pm$ 0.87 & \textbf{0.26} $\pm$ 0.04 & \textrm{0.65} $\pm$ 0.05 & \textrm{0.67} $\pm$ 0.07 \\
\methMGVAEap & \textrm{57.18} $\pm$ 1.30 & \textrm{49.13} $\pm$ 1.06$^{*}$ & \textrm{44.17} $\pm$ 1.13$^{*}$ & \textrm{0.27} $\pm$ 0.03 & \textbf{0.63} $\pm$ 0.08 & \textrm{0.69} $\pm$ 0.04 \\
\methMGVAEal & \textrm{56.31} $\pm$ 1.90 & \textrm{49.34} $\pm$ 0.62$^{*}$ & \textrm{43.35} $\pm$ 1.42$^{*}$ & \textrm{0.29} $\pm$ 0.05 & \textrm{0.69} $\pm$ 0.10 & \textrm{0.73} $\pm$ 0.07 \\
\methMGVAEmv & \textrm{55.30} $\pm$ 0.78$^{*}$ & \textrm{48.57} $\pm$ 0.82$^{*}$ & \textrm{44.28} $\pm$ 1.21$^{*}$ & \textrm{0.28} $\pm$ 0.05 & \textrm{0.63} $\pm$ 0.14 & \textbf{0.63} $\pm$ 0.11 \\
\bottomrule
\end{tabular}
\end{table*}

% Design-option comparison on ShapeNet: swapping error.
% \centering replaces the center environment (which adds extra vertical
% space inside a float); it is issued after \caption so the caption
% layout is untouched.  \label now directly follows \caption.
\begin{table*}[t]
\caption{Comparison of different design options in terms of swapping error for ShapeNet.    }
\label{tbl:swapping-alt}
\small
\centering
\begin{tabular}{ccccc}
\toprule%
&& 3 cats. & 5 cats. & 10 cats. \\
\midrule%
% Each row-label macro expands to the first two cells of its row.
\methMGVAE & \textbf{0.220} $\pm$ 0.025 & \textbf{0.300} $\pm$ 0.025 & \textrm{0.340} $\pm$ 0.035 \\
\methMGVAEap & \textrm{0.245} $\pm$ 0.031 & \textrm{0.333} $\pm$ 0.037 & \textrm{0.361} $\pm$ 0.025 \\
\methMGVAEal & \textrm{0.236} $\pm$ 0.031 & \textrm{0.301} $\pm$ 0.019 & \textbf{0.340} $\pm$ 0.025 \\
\methMGVAEmv & \textrm{0.236} $\pm$ 0.047 & \textrm{0.318} $\pm$ 0.038 & \textrm{0.364} $\pm$ 0.060 \\
\bottomrule
\end{tabular}
\end{table*}


%
%
%
%
%
%
%

%
%
%

%
%
%
%
%
% Set \methMGVAE back to the bold method name now that the tables using
% the two-cell "AV & N" row label are done.
\renewcommand{\methMGVAE}{{\bf\CIGMO}}
%



%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
 
% Reference list: entries come from library.bib, formatted with the
% plainnat style.
\bibliography{library}
\bibliographystyle{plainnat}

\end{document}
