%
\documentclass[accepted]{uai2022} %
                                    %
                                    %
                                    %
%
%
                                         %
%
                                          %
%
%
%

%
\usepackage[american]{babel}
%

%
\usepackage{natbib} %
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} %
%
\usepackage{booktabs} %
\usepackage{tikz} %

%
%
%
%
%

%
%

%
%

\usepackage{amsmath,amsfonts,bm}

%
\newcommand{\figleft}{{\em (Left)}}
\newcommand{\figcenter}{{\em (Center)}}
\newcommand{\figright}{{\em (Right)}}
\newcommand{\figtop}{{\em (Top)}}
\newcommand{\figbottom}{{\em (Bottom)}}
\newcommand{\captiona}{{\em (a)}}
\newcommand{\captionb}{{\em (b)}}
\newcommand{\captionc}{{\em (c)}}
\newcommand{\captiond}{{\em (d)}}

%
\newcommand{\newterm}[1]{{\bf #1}}


%
\def\figref#1{figure~\ref{#1}}
%
\def\Figref#1{Figure~\ref{#1}}
\def\twofigref#1#2{figures \ref{#1} and \ref{#2}}
\def\quadfigref#1#2#3#4{figures \ref{#1}, \ref{#2}, \ref{#3} and \ref{#4}}
%
\def\secref#1{section~\ref{#1}}
%
\def\Secref#1{Section~\ref{#1}}
%
\def\twosecrefs#1#2{sections \ref{#1} and \ref{#2}}
%
\def\secrefs#1#2#3{sections \ref{#1}, \ref{#2} and \ref{#3}}
% Overrides the \eqref provided by amsmath (already loaded above): this version
% prints "equation~N" with no parentheses around the number.
\def\eqref#1{equation~\ref{#1}}
%
\def\Eqref#1{Equation~\ref{#1}}
%
\def\plaineqref#1{\ref{#1}}
%
\def\chapref#1{chapter~\ref{#1}}
%
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters, printed as "chapters~M--N".
% The tie separates the word from the first number and prevents a line break there.
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
%
\def\algref#1{algorithm~\ref{#1}}
%
\def\Algref#1{Algorithm~\ref{#1}}
\def\twoalgref#1#2{algorithms \ref{#1} and \ref{#2}}
\def\Twoalgref#1#2{Algorithms \ref{#1} and \ref{#2}}
%
\def\partref#1{part~\ref{#1}}
%
\def\Partref#1{Part~\ref{#1}}
\def\twopartref#1#2{parts \ref{#1} and \ref{#2}}

\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
\def\1{\bm{1}}
% Dataset symbols: a calligraphic D, with the split name as an upright subscript.
% The subscript is attached outside \mathcal so only the D is set in the calligraphic alphabet.
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{\mathcal{D}_{\mathrm{valid}}}
\newcommand{\test}{\mathcal{D}_{\mathrm{test}}}

\def\eps{{\epsilon}}


%
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
%
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% Bold upright (\mathbf) lowercase letters, one macro per letter a--z plus two Greek names.
% The "rv" prefix presumably stands for "random vector" -- confirm against the notation used in the paper.
% NOTE(review): \mathbf normally leaves lowercase Greek unchanged, so \rvepsilon and
% \rvtheta likely print as non-bold symbols -- confirm whether \bm was intended.
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% Bold "i" belongs to \rvi; it was previously defined as \rvu and then overwritten by the real \rvu below.
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

%
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

%
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

%
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

%
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

%
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

%
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

%
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


%
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

%
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
%
%
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

%
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

%
%
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% Shorthands for named probability distributions: a base symbol (p, P, with an
% optional hat or tilde) and an upright text label as subscript.
% Labels use \mathrm{...}; the old font switch \rm takes no argument and is obsolete.
\newcommand{\pdata}{p_{\mathrm{data}}}
% Hatted variants of the "data" distribution
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
% "model" distributions
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
% Encoder / decoder / reconstruction distributions
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

\newcommand{\laplace}{\mathrm{Laplace}} %

\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
%
%
%
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} %

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
\let\ab\allowbreak
 \usepackage{graphicx}

\usepackage[utf8]{inputenc} %
\usepackage[T1]{fontenc}    %
\usepackage{hyperref}       %
\usepackage{url}            %
%
\usepackage{amsfonts}       %
\usepackage{nicefrac}       %
\usepackage{microtype}      %
\usepackage{xcolor}         %

\usepackage{amsmath} 
\usepackage{amssymb}
\usepackage{amsthm} 

\newtheorem{theorem}{Theorem}

\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

\newcommand{\CIGMO}{\textsc{CIGMO}}

\newcommand{\gaussd}{\mathcal{N}}
\newcommand{\lowerb}{\mathcal{L}}

\newcommand{\diag}{\mathop{\mathrm{diag}}}
%
\newcommand{\todo}[1]{}

% Make \input expandable inside tabular environments.
% \expandableinput resolves its argument with \file_full_name:n and hands the
% result to the control sequence named "@@input" (built with \use:c).  The hook
% below aliases \input to it at the start of every tabular.
% NOTE(review): presumably needed so that table rows read via \input may begin
% with rules or \multicolumn -- confirm against the tables that rely on it.
\ExplSyntaxOn
\cs_new:Npn \expandableinput #1
  { \use:c { @@input } { \file_full_name:n {#1} } }
\AddToHook{env/tabular/begin}
  { \cs_set_eq:NN \input \expandableinput }
\ExplSyntaxOff

\title{CIGMO: Categorical invariant representations in a deep generative framework}

%
%
%
%
%
%
%
%
\author[1]{\href{mailto:<hosoya@atr.jp>?Subject=Your UAI 2022 paper}{Haruo~Hosoya}{}}
%
%
%
%
%
%
%
\affil[1]{%
    Brain Labs.\\
    ATR International\\
    Kyoto, Japan
    }
%
%
%
%
%
%
%
%
%
%
  
  \begin{document}
\maketitle

\begin{abstract}
Data of general object images have two most common structures: (1) each object of a given shape can be rendered in multiple different views, and (2) shapes of objects can be categorized in such a way that the diversity of shapes is much larger across categories than within a category.  Existing deep generative models can typically capture either structure, but not both.  In this work, we introduce a novel deep generative model, called CIGMO, that can learn to represent category, shape, and view factors from image data.  The model is comprised of multiple modules of shape representations that are each specialized to a particular category and disentangled from view representation, and can be learned using a group-based weakly supervised learning method.  By empirical investigation, we show that our model can effectively discover categories of object shapes despite large view variation and quantitatively outperform various previous methods including the state-of-the-art invariant clustering algorithm.  Further, we show that our approach using category-specialization can enhance the learned shape representation to better perform down-stream tasks such as one-shot object identification as well as shape-view disentanglement.

%

%

\end{abstract}

%

\section{Introduction}
\label{sec:intro}





In everyday life, we see objects in a great variety.  Categories of objects are numerous and their shape variations are tremendously rich; different views make an object look totally different (Figure~\ref{fig:object}).  Recent neuroscientific studies document how the primate visual system represents such complex objects in a characteristic, modular architecture, which is comprised of multiple cortical regions that each encode invariant features specialized to a particular object category, such as faces \citep{Freiwald2010}, body parts \citep{Kumar2017}, and other general categories \citep{Srihasam2014,Bao2020}.  Our work here takes inspiration from these biological findings for developing a novel learning model with ``categorical invariant representation.''

%

\begin{figure}[t]
% \centering rather than the center environment: no extra vertical space inside the float.
\centering
\includegraphics[width=6.5cm]{figures/example_object.pdf}
\caption{Examples of general object images.  These include two categories (car and chair) each with three shape variations.  The object of each shape is rendered in three different views. }
\label{fig:object}
\end{figure}

\begin{figure*}[t]
% \centering rather than the center environment: no extra vertical space inside the float.
\centering
\includegraphics[width=15cm]{figures/example_clustering.pdf}
\caption{%
Examples of invariant clustering from (A) {\CIGMO}, (B) GVAE \citep{Hosoya2019} with $k$-means, and (C) IIC \citep{Ji2019}, for ShapeNet, in the case of 3 categories.  Random 24 test images belonging to each estimated category are shown in a box.  Note that the categories almost perfectly correspond to the chair, table, car image classes in (A).  Such correspondence is much less clear in (B) and (C); in particular, cars are mixed with many other objects (category 3 in (B) and category 1 in (C)).  
}
\label{fig:clustering}
\end{figure*}

To be more specific, consider two most common domain structures of general object images.  First, 
each object has a specific shape and can be rendered in multiple different views (e.g., 3D orientation) independently of shape.  Second, the shapes of objects can be categorized in such a way that the diversity of shapes is much larger across categories than within a category.  For example, Figure~\ref{fig:object} illustrates various object images of car and chair categories, in which the shape of a particular car type is relatively similar to the shape of another car type but is substantially different from any chair type.  Existing deep generative models typically capture either structure.  The first structure on view variation is often handled by disentangling deep models, i.e., representation learning of mutually invariant factors of variation in the input \citep[etc.]{Bengio2013,Higgins2016,Chen2016,Bouchacourt2018,Hosoya2019,Mathieu2016}, while the second structure on categorization can be handled by deep clustering methods \citep{Jiang2017,Ji2019}.  However, models that capture both structures in a single generative framework have rarely been studied.

In this work, to exploit both domain structures, we develop a probabilistic deep generative model that learns to encode three latent factors, namely, (1) category, (2) shape, and (3) view, from a dataset of general object images.  However, this goal is generally difficult in a completely unsupervised setting, thus requiring some kind of ``inductive bias'' to be imposed \citep{Locatello2020b}.  To this end, we start with recently emerging, group-based weakly supervised learning \citep{Mathieu2016,Bouchacourt2018,Chen2018,Hosoya2019}, which can learn separate representations of shape and view from object images using no explicit labels, but only grouping information that links together different views of the same object.  We extend this approach by introducing multiple modules of shape representations and then devising mechanisms to specialize each shape representation to a particular object category while disentangling it from view representation.  We call the resulting model {\CIGMO} (Categorical Invariant Generative MOdel).





%
%

We have empirically investigated representational advantages of our model.  First, we found that our model allows for effectively solving the invariant clustering problem, that is, it can discover categories of object shapes in an unsupervised manner despite significant variation of object views.  Our model can quantitatively outperform various previous methods including the state-of-the-art invariant clustering method \citep{Ji2019} as well as combinations of existing disentangling and clustering methods; Figure~\ref{fig:clustering} shows illustrative examples.  Second, we found that our approach using category-specialization can enhance the learned shape representation to better perform multiple tasks, including one-shot identification (object recognition given one example per shape) as well as shape-view disentanglement in multiple criteria.  Thus, we propose {\CIGMO} as a novel learning approach to represent object images in a general manner using category, shape, and view latent variables, which can provide more precise information for down-stream tasks than typical approaches to represent only part of those variables. 
%
The source code written in \texttt{pytorch} is available at \url{https://github.com/HaruoHosoya/cigmo}.


%
 
%

%
%
%
%
%

%




\section{Related work}
\label{sec:related}

%

A number of studies exist for disentangling deep generative models.  A particularly relevant technique is the group-based disentangling approach that can learn to separate content (shape) and view variables from grouped data \citep{Mathieu2016,Bouchacourt2018,Chen2018,Hosoya2019,pmlr-v119-locatello20a}.  However, these studies most often use image datasets of a single object category, paying no attention to the large cross-category diversity of object shapes mentioned in the introduction.  Our model can be seen as a generalization of this approach with categorization, where our specific contribution is how to handle grouped data in the presence of multiple categories (Section~\ref{sec:learning}); such an issue would not arise in a non-group-based setting \citep[e.g.,][]{Kingma2014a}. 

Although our focus here is weak supervision, we should mention that there are various approaches to use explicit labels for enhancing disentangling performance, such as semi-supervised learning \citep{Kingma2014a,Siddharth2017} or adversarial learning to promote disentanglement \citep{Lample2017,Mathieu2016}.  Also, as group-based learning was originally inspired by the temporal coherence principle \citep{Foldiak1991} (i.e., the object identity is often stable over time), some weakly supervised disentangling approaches have explicitly used it \citep{Yang2015}.  
%

Some studies have proposed unsupervised disentangling algorithms that impose statistical constraints on the latent representation such as information maximization \citep{Higgins2016,Chen2016,Dupont2018}.  However, we emphasize that the problem that they try to solve is very different from ours.  That is, their models can learn arbitrarily many continuous variables that are each single-dimensional, whereas our model (as well as all other disentangling models cited so far) can learn exactly two continuous variables that are each multi-dimensional (shape and view).

Our study is also related to recent deep clustering methods.  In particular, a recent approach proposes group-based learning for invariant clustering, which maximizes mutual information between categorical posterior distributions for paired images \citep{Ji2019}.  This method has exhibited remarkable performance on natural images under various view variation; Section~\ref{sec:exper} gives an empirical comparison with our method. 
Although earlier work combines variational autoencoders (VAE) \citep{Kingma2014} with a conventional clustering method (e.g., Gaussian mixture), such an approach seems to be limited in capturing large object view variation \citep{Jiang2017}.   In any case, these methods are specialized to clustering and throw away all information other than the category.

The group-based learning is loosely related to contrastive learning \citep{Jaiswal2020a}, which is a self-supervised learning approach to make representations of ``positive'' data pairs near (e.g., objects of the same shape) and those of ``negative pairs'' distant (e.g., objects of different shapes).  Such negative data (not used in our study) could help drastically improve performance, though it incurs substantial technical complication (large batch size, memory bank, etc.).  

\todo{Other weakly related work.  computer vision models... }

\section{CIGMO: Categorical Invariant Generative Model}
\label{sec:cigmo}

\begin{figure}[t]
% \centering rather than the center environment: no extra vertical space inside the float.
\centering
\includegraphics[width=8cm]{figures/example_model.pdf}
\caption{ (A) The graphical model.  Each instance $x_k$ in a data group is generated from a category $c$, a shape $z$, and a view $y_k$.  Round boxes are discrete variables; circles are continuous variables; shaded are visible variables.
(B) The inference flow.  Each hidden variable is inferred from the set of incoming variables. 
}
\label{fig:model}
\end{figure}

\subsection{Model}
\label{sec:model}

In our model construction, similarly to the previous group-based disentangling approaches \citep{Mathieu2016,Bouchacourt2018,Hosoya2019,Chen2018,pmlr-v119-locatello20a}, we assume a dataset that groups together multiple object images of the same shape but possibly in different views.  Here, shape refers to the property of an object that is invariant in view, where view is defined depending on the dataset.  For example, 3-dimensional viewpoints are considered as views in the object images in Figure~\ref{fig:object} (columns).  Given such a dataset, we can extract the shape as a group-common factor and the view as an instance-specific factor.  
%
In this study here, we generalize this idea with category-modular representation.  That is, we assume multiple object categories each of which includes a distinct and potentially infinite set of shapes.  Since the variety of shapes can be category-specific (e.g., shapes of chairs vary in a different way from those of cars), we endow each category with its own specialized representation (i.e., separate encoders and decoders). 

%
%


Formally, we assume a grouped dataset 
%
$
	\sD=\{ (\vx_1^{(n)},\ldots,\vx_K^{(n)}) \mid \vx_k^{(n)}\in \sR^D, n=1,\ldots,N\}
$,	
%
where each data point is a group (tuple) of $K$ data instances (e.g., images); we assume that the groups are i.i.d.  For a data group $(\vx_1,\ldots,\vx_K)$, we consider three types of hidden variables:  category $c\in\{1,\ldots,C\}$, shape $\vz\in\sR^M$, and views $\vy_1,\ldots,\vy_K\in\sR^L$ (omitting the superscript $(n)$), where the category and shape are common for the group while the views are specific to each instance.  We consider the following generative model (Figure~\ref{fig:model}(A)):
\begin{align*}
	p(c) &= \pi_c  \\
	p(\vz) &= \gaussd_M(0,\mI) \\
	p(\vy_k) &= \gaussd_L(0,\mI) \\
	p(\vx_k|\vy_k,\vz,c) &= \gaussd_D( f_c(\vy_k,\vz), \mI)
\end{align*}
for $c=1,\ldots,C$ and $k=1,\ldots,K$.  Here, $\pi_c$ is a category prior with $\sum_{c=1}^C \pi_c=1$ and $f_c$ is a decoder deep net defined for category $c$ corresponding to the category-specific module of shape representation.  In the generative process, the category $c$ is first drawn from the categorical distribution $(\pi_1,\ldots,\pi_C)$, while the shape $\vz$ and views $\vy_k$ are drawn from standard Gaussian priors.   Then, each data instance $\vx_k$ is generated by the decoder $f_c$ for the selected category $c$, which is applied to the group-common shape $\vz$ and the instance-specific view $\vy_k$ (added with Gaussian noise of unit variance).  In other words, different data instances for a group are generated from the same shape and different views.  



\subsection{Learning}
\label{sec:learning}

Having defined a generative model as above, we expect category, shape, and view representations to arise as latent variables after fitting the model to a grouped dataset.  To derive a concrete algorithm, we use variational autoencoders \citep{Kingma2014} as a basic methodology, where we develop a specific architecture to solve our particular problem.  

As the most important step, we specify inference models to encode approximate posterior distributions (Figure~\ref{fig:model}(B)).  %
First, we estimate the posterior probability for category $c$ as follows:
\begin{align}
	q(c|\vx_1,\ldots,\vx_K) &= \frac{1}{K}\sum_{k=1}^K u^{(c)}(\vx_k) \label{eq:cat-inf}
\end{align}
Here, $u$ is a categorizer deep net that computes, for an individual instance $\vx_k$, a probability distribution over the categories ($\sum_{c=1}^C u^{(c)}(\vx_k)=1$) \citep{Kingma2014a}.  Since we have $K$ such instances, we take the average over the instance-specific distributions to obtain the group-common distribution.  The averaging operation is justified as estimation of the expected probability of each category within the given group.  
%

Next, we infer the posterior for each instance-specific view $\vy_k$ from the input $\vx_k$ as follows:
\begin{align}
	q(\vy_k|\vx_k) &= \gaussd_L\left( g(\vx_k), \diag(r(\vx_k))\right) \label{eq:gvae-inf1} 
\end{align}
where $g$ and $r$ are encoder deep nets to specify the mean and variance, respectively.  
Here, we use the view representation that does not depend on the category, assuming a ``universal'' view space.
Then, we estimate the posterior for group-common shape $\vz$ from inputs $\vx_1,\ldots,\vx_K$ as follows: 
\begin{align}
	&q(\vz|\vx_1,\ldots,\vx_K,c) = \nonumber \\ 
	&\quad
	\gaussd_M\left( 
		\frac{1}{K} \sum_{k=1}^K h_c(\vx_k), 
		\frac{1}{K} \sum_{k=1}^K \diag(s_c(\vx_k))
	\right) \label{eq:gvae-inf2} 
\end{align}
This time, shape representation does depend on the category, unlike views.  Thus, the encoder deep nets $h_c$ and $s_c$ are defined for each category $c$, which compute the mean and variance, respectively, for each individual shape for $\vx_k$.  We then obtain the group-common shape $\vz$ as the average over all the individual shapes \citep{Hosoya2019}.  %
%
%
%

For training, we define the following variational lower bound of the marginal log likelihood for a data point:
%
%
$
	\lowerb(\phi; \vec{\vx}) = \lowerb_\mathrm{recon} + \lowerb_\mathrm{KL}
$
%
with
\begin{align*}
	\lowerb_\mathrm{recon} 
%
	&= \E_{q(\vec{\vy},\vz,c|\vec{\vx})}\left[\sum_{k=1}^K \log p(\vx_k|\vy_k,\vz,c)\right] \\
	\lowerb_\mathrm{KL}
%
	&= - \KL( q(\vec{\vy},\vz,c | \vec{\vx}) \Vert p(\vec{\vy},\vz,c) ) \end{align*}
where $\vec{\vx}$ stands for $(\vx_1,\ldots,\vx_K)$, etc., and $\phi$ is the set of all weight parameters in the categorizer, encoder, and decoder deep nets.  We compute the reconstruction term $\lowerb_\mathrm{recon}$ as follows:
\begin{align*}
	\lowerb_\mathrm{recon} 
%
%
%
	&= \sum_{c=1}^C q(c|\vec{\vx}) \E_{q(\vec{\vy},\vz|\vec{\vx},c)}
	\left[\sum_{k=1}^K \log p(\vx_k|\vy_k,\vz,c)\right] \\
	&\approx \sum_{c=1}^C q(c|\vec{\vx}) \sum_{k=1}^K \log p(\vx_k|\vy_k,\vz,c)
\end{align*}
where we approximate the expectation using one sample $\vz \sim q(\vz|\vec{\vx},c)$  %
and $\vy_k \sim q(\vy_k|\vx_k)$ for each $k$, but directly use the probability value $q(c|\vec{\vx})$ for $c$.  The latter is crucial for making the loss function differentiable.
The KL term $\lowerb_\mathrm{KL}$ is computed as follows:
\begin{align}
	\lowerb_\mathrm{KL}
%
%
%
%
%
%
	=& - \KL( q(c|\vec{\vx}) \Vert p(c) ) \nonumber \\
	  & - \sum_{k=1}^K \KL( q(\vy_k | \vx_k) \Vert p(\vy_k) ) \nonumber \\
	  & - \sum_{c=1}^C q(c|\vec{\vx}) 
	     \KL( q(\vz | \vec{\vx},c) \Vert p(\vz) ) \nonumber %
\end{align}
Finally, our training procedure is to maximize the lower bound for the entire dataset with respect to the weight parameters: 
$\hat{\phi}=\argmax_\phi \frac{1}{N}\sum_{n=1}^{N}\lowerb(\phi; \vx_1^{(n)},\ldots,\vx_K^{(n)})$.  

Figure~\ref{fig:algo} depicts the outline of our learning algorithm.  We emphasize here that, even though the architecture has rather intertwined interaction between grouping and categorization, our model can be formalized elegantly in a probabilistic generative framework, can be trained in an end-to-end manner only using a grouped dataset and no explicit label for category and so on (nor any pre-trained model), and can perform well for multiple down-stream tasks (Section~\ref{sec:exper}).


\begin{figure}[t]
% \centering rather than the center environment: no extra vertical space inside the float.
\centering
\includegraphics[width=8cm]{figures/example_algo.pdf}
\caption{ A diagrammatic outline of {\CIGMO} learning algorithm.  %
The entire workflow consists of three kinds of networks corresponding to views (left-most), category (right-most), and shapes in $C$ modules.  Given an input group of instances $x_k$ (bottom), an instance-specific view $y_k$ is computed by encoders $g$ and $r$.  In each module $c$, a group-common shape $z$ is computed by encoders $h_c$ and $s_c$ followed by averaging.  Then, new data instances generated by decoder $f_c$ from each shape and view are compared with the original data for obtaining the reconstruction error (loss).  This process is repeated for all modules.  In parallel, the posterior probability for category $c$ is computed by the categorizer $u$ followed by averaging and multiplied with the reconstruction error for the corresponding module.   Note that other probabilistic mechanisms (e.g., priors) are omitted here for brevity.
}
\label{fig:algo}
\end{figure}

%

\subsection{Model variants}
\label{sec:variants}

The above construction is only one approach to categorical invariant generative models.  There can indeed be a number of variants.  First, instead of combining categorical distributions by averaging as in \eqref{eq:cat-inf}, we could take the (normalized) product:
$q(c|\vx_1,\ldots,\vx_K) \propto \prod_{k=1}^K u^{(c)}(\vx_k)$
(which can be interpreted as evidence accumulation); or we could take the softmax of the averaged logits:
$q(c|\vx_1,\ldots,\vx_K) \propto \exp\left(\frac{1}{K}\sum_{k=1}^K \tilde{u}^{(c)}(\vx_k)\right)$
where $\tilde{u}^{(c)}$ is the logit of $u^{(c)}$ (which estimates expectation at the level of neural network outputs).  Second, instead of the ``universal'' view space as in \eqref{eq:gvae-inf1}, we could consider a category-dependent view representation:
$q(\vy_k|\vx_k,c) = \gaussd_L\left( g_c(\vx_k), \diag(r_c(\vx_k))\right)$
using encoders $g_c$ and $r_c$ defined for each category $c$.  This is sensible since, e.g., frontal views for chairs could arguably mean something different from frontal views for tables.  For these two variants, Supplementary Materials provide an empirical comparison, showing that our original approach in Section~\ref{sec:learning} gives slightly better performance than the variants.

In addition, we can think of a relatively different approach that takes an existing group-based disentangling model \citep{Bouchacourt2018,Hosoya2019,pmlr-v119-locatello20a}, but adds a Gaussian mixture prior on the shape variable.
Formally, assume
\begin{align*}
	p(c) &= \pi_c  \\
	p(\vz'|c) &= \gaussd_M(\vb_c,\mA_c) \\
	p(\vy_k) &= \gaussd_L(0,\mI) \\
	p(\vx_k|\vy_k,\vz') &= \gaussd_D( f(\vy_k,\vz'), \mI)
\end{align*}
where $\vb_c$ and $\mA_c$ are a mean vector and covariance matrix, respectively, corresponding to category $c$; $f$ is a deep net decoder (not depending on the category).  However, the following can be proved.
\begin{theorem}\label{th:gvae-gm}
  Group-based disentangling models with Gaussian mixture prior are a special case of {\CIGMO}.
\end{theorem}
\begin{proof}
The result immediately follows by letting $\vz=\mA_c^{-\frac{1}{2}} (\vz' - \vb_c)$ and 
$f_c(\vy_k,\vz) = f(\vy_k, \mA_c^{\frac{1}{2}} \vz + \vb_c)$.
\end{proof}
This corresponds to the specific architecture in {\CIGMO} that shares the part of decoder networks after the first fully connected layer.  In fact, we precisely use such architecture in the experiments (Supplementary Materials).  %


%



\newcommand{\casetype}{\# of categories}
\newcommand{\chancelevel}{chance level}
\newcommand{\methIIC}{IIC}
\newcommand{\methIICoc}{IIC (overclust.)}
\newcommand{\methVAE}{VAE + $k$-means}
\newcommand{\methMVAE}{Mixture of VAEs}
\newcommand{\methGVAE}{GVAE + $k$-means}
\newcommand{\methMGVAE}{{\bf\CIGMO}}
\newcommand{\methMLVAE}{MLVAE + $k$-means}


\section{Experiments}
\label{sec:exper}

We have applied {\CIGMO} as described in Section~\ref{sec:cigmo} to two image datasets: ShapeNet (general objects) and MVC Cloth (cloth images).  Below, we outline the experimental set-up and show the results.

\subsection{ShapeNet}
\label{sec:shapenet}


\begin{table*}[t]
\caption{Invariant clustering accuracy (\%) for ShapeNet.  The mean and SD over 10 model instances are shown; the best method with the highest mean score is shown in the bold font; the $*$-mark indicates statistical significance relative to the best method (t-test; $p<0.05$); ditto for the remaining tables.  %
}
\fontsize{9pt}{10.5pt}\selectfont
\begin{center}
\begin{tabular}{l@{\;\;}cccc}
\toprule%
Methods & 3 classes & 5 classes & 10 classes \\
\midrule%
%
\chancelevel & \textrm{33.33}  & \textrm{20.00}  & \textrm{10.00} \\
\methIIC & \textrm{85.25} $\pm$ 13.74 & \textrm{81.10} $\pm$ 7.33$^{*}$ & \textrm{60.84} $\pm$ 1.45$^{*}$ \\
\methIICoc & \textrm{79.86} $\pm$ 13.78$^{*}$ & \textrm{81.87} $\pm$ 4.57$^{*}$ & \textrm{59.73} $\pm$ 1.49$^{*}$ \\
\methVAE & \textrm{66.41} $\pm$ 5.69$^{*}$ & \textrm{50.83} $\pm$ 3.85$^{*}$ & \textrm{37.07} $\pm$ 1.00$^{*}$ \\
\methMVAE & \textrm{82.35} $\pm$ 5.66$^{*}$ & \textrm{65.73} $\pm$ 6.24$^{*}$ & \textrm{40.86} $\pm$ 3.58$^{*}$ \\
\methMLVAE & \textrm{82.04} $\pm$ 7.78$^{*}$ & \textrm{70.68} $\pm$ 5.04$^{*}$ & \textrm{54.47} $\pm$ 1.92$^{*}$ \\
\methGVAE & \textrm{73.20} $\pm$ 10.93$^{*}$ & \textrm{69.42} $\pm$ 3.47$^{*}$ & \textrm{52.55} $\pm$ 2.74$^{*}$ \\
\methMGVAE & \textbf{94.83} $\pm$ 6.06 & \textbf{89.36} $\pm$ 4.53 & \textbf{68.53} $\pm$ 4.24 \\

 %
%
\bottomrule 
%
\end{tabular}
\label{tbl:clustering}
\end{center}
\end{table*}

\renewcommand{\methVAE}{VAE}
\renewcommand{\methGVAE}{GVAE}
\renewcommand{\methMLVAE}{MLVAE}

\begin{table*}[t]
\caption{One-shot identification accuracy (\%) for ShapeNet.     }
\fontsize{9pt}{10.5pt}\selectfont
\begin{center}
\begin{tabular}{l@{\;\;}cccc}
\toprule%
Methods & 3 classes (3705 objs.) & 5 classes (4977 objs.) & 10 classes (6210 objs.)\\
\midrule%
%
\chancelevel & \textrm{0.03}  & \textrm{0.02}  & \textrm{0.02} \\
\methVAE & \textrm{2.17} $\pm$ 0.03$^{*}$ & \textrm{3.49} $\pm$ 0.04$^{*}$ & \textrm{3.43} $\pm$ 0.02$^{*}$ \\
\methMVAE & \textrm{2.31} $\pm$ 0.04$^{*}$ & \textrm{3.71} $\pm$ 0.06$^{*}$ & \textrm{3.55} $\pm$ 0.06$^{*}$ \\
\methMLVAE & \textrm{24.00} $\pm$ 0.43$^{*}$ & \textrm{20.30} $\pm$ 0.26$^{*}$ & \textrm{17.93} $\pm$ 0.29$^{*}$ \\
\methGVAE & \textrm{24.51} $\pm$ 0.44$^{*}$ & \textrm{20.30} $\pm$ 0.24$^{*}$ & \textrm{17.91} $\pm$ 0.20$^{*}$ \\
\methMGVAE & \textbf{27.33} $\pm$ 0.55 & \textbf{24.51} $\pm$ 0.68 & \textbf{21.79} $\pm$ 0.71 \\

 \bottomrule %
\end{tabular}
\label{tbl:oneshot}
\end{center}
\end{table*}


For the first set of experiments, we derived a dataset of multi-viewed object images from 3D models in the ShapeNet database \citep{Chang2015}.  The dataset consisted of 10 pre-defined object classes: car, chair, table, airplane, lamp, boat, box, display, truck, and vase.  Within each class, we had a large number of objects with specific shapes, which we distinguished by object identities.
We rendered each object in 30 views (3-dimensional viewpoints) in a single lighting condition.  We split the training and test sets, which consisted of 21888 and 6210 object identities, respectively.  (See Supplementary Materials for more details.)  %
%
We also created subset versions with 3 or 5 object classes.  %
For training data, we formed groups of images of the same object in 3 random views ($K=3$), though our approach works for any choice of group size of $2$ or larger (Section~\ref{sec:groupsize}).  We used object identity labels (not class labels) for grouping.  Importantly, however, after this step, we never used any label during training.  %


To train a {\CIGMO} model, we used the following setting.  We set the number of categories in the model either to the number of classes in the data ($C=3,5,$ or $10$), or to a larger number, depending on the task.   %
We set the shape dimension $M=100$ and the view dimension $L=3$ (where a very low view dimension was used to avoid the view variable taking over all the input information and thereby the shape variable becoming degenerate).  For the categorizer, encoder, and decoder deep nets, we adopted commonly used architectures of convolutional or deconvolutional neural networks.  Since the model had so many deep nets, a large part of the networks was shared to save the memory space.  For simplicity, we fixed the category prior $\pi_c=1/C$. %
Supplementary Materials give more details on the architecture.  
%
For optimization, we used Adam \citep{Kingma2015} with mini-batches of size $100$ and ran 20 epochs.  %

We evaluated the trained models using test data, which were ungrouped and contained objects of the same classes as training data but of different identities.  Below, we describe the evaluation in three parts: (1) invariant clustering task, (2) one-shot identification task, and (3) category-wise shape-view disentangling.  Supplementary Materials give comparison of the model variants discussed in Section~\ref{sec:variants}.  


\subsubsection{Invariant clustering}

\begin{table*}[t]
\caption{Degree of shape-view disentanglement for ShapeNet by three measures.  Left: swapping error (lower is better); middle: neural network classification accuracy (\%) for object identity from the shape variable (higher is better); right: the same accuracy from the view variable (lower is better).  All values are weighted averages over categories.  }
\fontsize{9pt}{10.5pt}\selectfont
\begin{center}
\begin{tabular}{lcccccc}
\toprule%
& \multicolumn{2}{c}{swapping error} & \multicolumn{2}{c}{shape $\rightarrow$ id (\%)} & \multicolumn{2}{c}{view $\rightarrow$ id (\%)}  \\
Methods & 5 classes & 10 classes & 5 classes & 10 classes & 5 classes & 10 classes \\
\midrule%
%
\methMLVAE & \textrm{0.497} $\pm$ 0.005$^{*}$ & \textrm{0.554} $\pm$ 0.010$^{*}$ & \textrm{47.30} $\pm$ 0.78$^{*}$ & \textrm{42.96} $\pm$ 0.74$^{*}$ & \textrm{0.60} $\pm$ 0.05 & \textrm{0.52} $\pm$ 0.08 \\
\methGVAE & \textrm{0.491} $\pm$ 0.004$^{*}$ & \textrm{0.549} $\pm$ 0.005$^{*}$ & \textrm{48.34} $\pm$ 0.55$^{*}$ & \textrm{43.87} $\pm$ 0.30$^{*}$ & \textbf{0.56} $\pm$ 0.05 & \textbf{0.48} $\pm$ 0.04 \\
\methMGVAE & \textbf{0.300} $\pm$ 0.025 & \textbf{0.340} $\pm$ 0.035 & \textbf{50.91} $\pm$ 0.97 & \textbf{46.28} $\pm$ 0.87 & \textrm{0.65} $\pm$ 0.05$^{*}$ & \textrm{0.67} $\pm$ 0.07$^{*}$ \\

 \bottomrule %
\end{tabular}
\label{tbl:shape-view}
\end{center}
\end{table*}


The goal of this task is to perform clustering of input images regardless of the view.  Since our learning method has already estimated  the latent category variable without using labels, the rest is to simply infer the most probable category from a given test image, $\hat{c}=\argmax_c{q(c|\vx)}$.  

For comparison, we considered the following baseline models.  First, we included two group-based disentangling methods, namely,
Group-based VAE (GVAE) \citep{Hosoya2019} and Multi-Level VAE (MLVAE) \citep{Bouchacourt2018}.  These can learn to separately infer shape and view from object images; we performed $k$-means clustering on the learned shape variable.  In addition, we incorporated Invariant Information Clustering (IIC) \citep{Ji2019}, the state-of-the-art, group-based method specialized to invariant clustering.  IIC came in two versions: with and without regularization using 5-times overclustering.  For all the baseline methods above, we gave exactly the same grouped dataset for training.  In addition to these, we examined, as an ablation study, two non-group-based methods: mixture of VAEs and vanilla VAE with $k$-means.

Figure~\ref{fig:clustering}(A) shows an example result from a {\CIGMO} model with 3 categories applied to the 3-class dataset, where each box shows random test images belonging to each estimated category.  This demonstrates a very precise clustering of objects achieved by our model, which is quite remarkable given the large view variation and no category label used during training.   Figures~\ref{fig:clustering}(B) and (C) show analogous examples from GVAE with $k$-means as well as IIC, clearly indicating lower performance compared to {\CIGMO}.  %


%


For quantitative comparison, we measured the performance of invariant clustering by two standard criteria.  The first is classification accuracy.  For this, since we needed to compute the best category-to-class mapping by the Hungarian algorithm \citep{Munkres1957} and since this step requires the number of categories in the model to equal the number of classes in the data, we used only such models corresponding to each dataset with 3, 5, or 10 classes.  Table~\ref{tbl:clustering} summarizes the results (for 10 model instances for each method).  Generally, {\CIGMO} outperformed the other methods in all cases by a large margin (mostly with statistical significance).  More specifically, first, {\CIGMO} performed better than GVAE or MLVAE with $k$-means, which shows that category information can be more clearly represented with multiple category-specific shape representations, compared to a general shape representation with a conventional clustering method.  Second, {\CIGMO} outperformed IIC, which shows that general modeling of object images with category, shape, and view latent variables can do better than directly solving the specific task.  This was the case both with and without overclustering; in fact, we could not find a consistent improvement by overclustering in IIC, contrary to the claim by \citet{Ji2019}.  Third, {\CIGMO} showed significantly higher scores than the mixture of VAEs, which confirms the large impact of group-based learning.  Taken together, these results demonstrate the efficacy of the categorical invariant representation in the invariant clustering task.

The second criterion is the Adjusted Rand Index (ARI), which provides a similarity measure between sample-to-category and sample-to-class assignments \citep{Hubert1985}.  Since this criterion allows any number of categories in the model, we assessed the change of scores for different numbers of categories.  Figure~\ref{fig:ari} shows the results for the 3-class and 10-class datasets.  Generally, the performance of all methods decreased as the number of categories increased.  However, {\CIGMO} reasonably retained the score even for larger numbers of categories, while the other methods exhibited a sharp drop.  This is because {\CIGMO} tends to use only as many categories as necessary and leave the remaining categories degenerate (i.e., no input belongs to them), whereas the other methods try to find as many clusters as possible and thus always use all categories.  This property of {\CIGMO} would be useful in a realistic setting where the number of true categories is unknown {\it a priori}.

%

\begin{figure}[t]
\begin{center}
\includegraphics[width=4cm]{figs/more_categs_ari_c3.pdf}
\includegraphics[width=4cm]{figs/more_categs_ari_c10.pdf}
\caption{Adjusted Rand Index for ShapeNet while increasing the number of categories in the model (line: mean, shade: SD).  %
}
\label{fig:ari}
\end{center}
\end{figure}

\subsubsection{One-shot identification}

The goal of this task is to perform object recognition given only one example per identity.  More precisely, we randomly pick up a subset of the test images consisting of exactly one image for each object identity and then identify the objects of the remaining test images.  Since the test objects are disjoint from the training objects, we deal with only unseen objects for the model.  This task can measure the strength of view-invariance of shape representation: if shape code is perfectly invariant in view, then all images of the same object should be mapped to an identical point in the shape space.  Note that our purpose here is not to infer the class but the object identity, unlike invariant clustering.  

Thus, we compared overall accuracy of one-shot identification for {\CIGMO} and other models.  For this, we performed a nearest-neighbor method according to Euclidean distance in the shape space.  Here, the shape space was defined depending on the method.  For GVAE or MLVAE, the shape variable $\vz=h(\vx)$ directly defined the shape space.  For {\CIGMO}, since the shape representation depended on the category, we first defined $\vz^c=h_c(\vx)$ for $c=\hat{c}$ (where $\hat{c}$ is inferred from the input) and $\vz^c=0$ otherwise, and then concatenated them together: $[\vz^1,\ldots,\vz^C]$.  This gave us category-dependent shape vectors that could be directly compared.  For VAE or mixture of VAEs, we used the entire latent variable in place of shape variable.  For simplicity, we again used models with the same number of categories as the image classes.

Table~\ref{tbl:oneshot} summarizes the results.  Overall, {\CIGMO} performed the best among the compared methods in all cases (with statistical significance).  In particular, it performed significantly better than GVAE and MLVAE, which indicates that shapes can be represented more precisely with category specialization than without.  In addition, our model outperformed, by far, mixture of VAEs, showing the successful disentanglement of shape from view.  These results, again, indicate the advantage of the categorical and invariant representations in the one-shot identification task.  (Note also that the scores were remarkably high even for up to 6210-way classification by one shot.)  

\subsubsection{Category-wise shape-view disentanglement}
\label{sec:feature}


{\CIGMO} provides category-specific shape representations that are disentangled from views.  We quantified the degree of disentanglement in two ways.  First, we generated a number of ``swapping'' images by the decoder from the view of one image and the shape of another, and calculated the (normalized) mean squared error between the generated images and the ground truths (swapping error).  Second, we measured how much information the shape or view variable contained on object identity, for which we trained two-layer neural networks on either variable for classification (identity inference) \citep{Mathieu2016,Bouchacourt2018}.  A better disentangled representation would give a higher accuracy from the shape variable and a lower accuracy from the view variable.  For each analysis, we conducted the measurement for each category using the belonging test images and took the average weighted by the number of those images.  Table~\ref{tbl:shape-view} summarizes the results.  Overall, {\CIGMO} tended to give better results than the baselines, except for object information in the view variable (though the compared accuracies were all negligibly low). 

%
%
%

\subsubsection{Group size variation}
\label{sec:groupsize}


\begin{figure}[t]
\begin{center}
\includegraphics[width=4cm]{figs/vary_group_size_ba_acc.pdf}
\includegraphics[width=4cm]{figs/vary_group_size_oneshot.pdf}
\includegraphics[width=4cm]{figs/vary_group_size_swapping.pdf}
\includegraphics[width=4cm]{figs/vary_group_size_shape_id.pdf}
\caption{Comparison of performance in invariant clustering, one-shot identification, swapping, and shape-to-identity when the group size is varied. %
}
\label{fig:groupsize}
\end{center}
\end{figure}



In Section~\ref{sec:shapenet}, we have argued that our model works as long as the group size is two or larger.  This is because the shape should be in principle the same no matter how many instances are in each group.  However, a larger group size could more easily stabilize the model training, thus potentially changing the result quantitatively.  Figure~\ref{fig:groupsize} shows the change of performance while the group size was varied.  The result indeed shows that increasing the group size tends to slightly improve the performance (except for the saturation in invariant clustering for fewer classes).


\begin{figure*}[t]
\begin{center}
\includegraphics[width=14cm]{figures/example_mvc.pdf}
\caption{Results from a {\CIGMO} model trained for MVC Cloth dataset.  (A) Invariant clustering.  Random images belonging to each estimated category are shown in a box.  (B) Swapping.  For each category, an image is generated from the view of one image in the top row and the shape of another image in the left column.
}
\label{fig:mvc}
\end{center}
\end{figure*}


\subsection{MVC Cloth}
\label{sec:mvc}

For the second set of experiments, we used a dataset of multi-viewed clothing images based on the MVC Cloth dataset \citep{Liu2016a}.  The dataset contains a number of photos of cloths worn by fashion models and taken from multiple viewpoints (Supplementary Materials).  Unlike ShapeNet, this dataset provides no class label.  Therefore our focus here is to demonstrate what kind of unknown but interesting categories can be discovered by our model.  We split the dataset into training and test sets consisting of $\sim$112K and $\sim$28K images, respectively, with disjoint cloth types; the training images were grouped by cloth type (group size 3).
For model training, we arbitrarily set the number of categories to $7$ in a {\CIGMO} model.  Other training conditions were the same as before.

Figure~\ref{fig:mvc}(A) illustrates an invariant clustering result, where only 4 categories were shown as the other 3 categories were degenerate.  By inspection, category~1 represents whole-body cloths like suits and dresses; category~2 represents tops like sweaters, jackets, blouses, and shirts typically with long sleeves; category~3 represents bottoms like trousers and jeans; category~4 represents tops like shirts typically with short sleeves.  
%
To characterize the categories more systematically, we used the attribute information provided by the dataset (264 boolean attributes for each image; see Supplementary Materials).  For each estimated category and each attribute label, we calculated the F1-score to measure their relevance \citep{Fawcett2006}.  Table~\ref{tbl:mvc-attrs} lists the 10 most relevant attributes for each category; we can see that the result matches well with the visual impression from Figure~\ref{fig:mvc}(A).



Figure~\ref{fig:mvc}(B) shows a swapping result from the same {\CIGMO} model, where each matrix corresponds to a category and shows the images generated from the shape of one image (left column) and the view of another (top row).  We can see that the shape and view representations are well disentangled in each category, as the generated images are clearly aligned for the shape in rows and for the view in columns.\footnote{Note that our goal here is not to generate sharp images.  Generally, it is well known that, compared to VAE-based methods, GAN-based methods tend to generate sharper but often more corrupted images especially when the number of training images is not so large, e.g., \citep[Figs. 7 and 12]{Chen2018}.}

\begin{table}[h]
\caption{Top 10 most relevant attributes to each category in the model in Figure~\ref{fig:mvc}.  Each attribute name is shown with the F1-score in parentheses.
}
\fontsize{9pt}{10.5pt}\selectfont
\begin{center}
\begin{tabular}{c@{\;\;}p{6.5cm}}
\toprule%
Catg. & Attributes (F1-scores) \\
\midrule%
%
1 & Short (0.45); Sleeveless (0.33); hundred2O (0.32); Polyester (0.32); AlineDresses (0.31); SheathDresses (0.31); Black (0.29); hundred2U (0.28); KneeLength (0.24); Nylon (0.20); \\
2 & hundred2U (0.65); LongSleeves (0.63); hundred1U (0.57); Polyester (0.55); Pullover (0.47); Cotton (0.47); TShirts (0.39); Black (0.38); fiftyU (0.31); Nylon (0.28); \\
3 & Denim (0.49); Cotton (0.38); hundred2U (0.34); hundred1U (0.33); Polyester (0.27); Black (0.27); PullOn (0.27); Blue (0.27); ContrastStitching (0.26); Leggings (0.26); \\
4 & White (0.57); ShortSleeves (0.27); Pullover (0.25); TShirts (0.24); fiftyU (0.19); hundred1U (0.19); Crew (0.19); ButtonUpShirts (0.17); hundred2U (0.17); Cotton (0.16); \\
 %
%
%
%
%
%
\bottomrule 
%
\end{tabular}
\label{tbl:mvc-attrs}
\end{center}
\end{table}


%
%


%

%


\section{Conclusion}
\label{sec:concl}

In this paper, taking inspiration from the primate higher vision, we have proposed {\CIGMO} as a deep generative model that has category-modular shape representations in disentanglement with views, which can learn to represent category, shape, and view latent variables with group-based weak supervision.  We have shown empirical representational advantages that allow our model to outperform previous methods in multiple down-stream tasks such as invariant clustering and one-shot identification.  
One drawback is scalability: the per-step time complexity is proportional to the number of categories, because the reparametrization trick is incompatible with category variables (Section~\ref{sec:learning}).  The Gumbel-Softmax technique is well known for this type of problem \citep{Jang2017}, but it did not work in our case for an unknown reason, which is not so surprising since it is only a heuristic.

Future investigation may include improvements in overall task performance as well as image generation quality, possibly with the aid of adversarial learning \citep{Goodfellow2014,Mathieu2016,Chen2018}, and application to more realistic settings.
%
Lastly, back to our original inspiration, we are keen to pursue the biological relationship of {\CIGMO} to the primate higher visual cortex, continuing our previous investigations \citep{Hosoya2017,Raman2020}.

%
%


%


\bibliography{library}
%

\begin{acknowledgements} %
                         %
                         %
This work has been supported by Commissioned Research of NICT (1940201) and Grants-in-Aid for Scientific Research (19H04999 and 21K19812). 
 \end{acknowledgements}


%
%
%
%
%
%
%
%
%
%


%
%
%
%
%
%
%
%
%
%

\end{document}
