\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}

%% Some suggested packages, as needed:
\usepackage[round]{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
  
%%%%% NEW MATH DEFINITIONS %%%%%

\usepackage{amsmath,amsfonts,bm}

% Markers for referring to the parts of a figure from within its caption.
% \emph replaces the {\em ...} declaration form so that the italic correction
% after the closing parenthesis is inserted automatically.
\newcommand{\figleft}{\emph{(Left)}}
\newcommand{\figcenter}{\emph{(Center)}}
\newcommand{\figright}{\emph{(Right)}}
\newcommand{\figtop}{\emph{(Top)}}
\newcommand{\figbottom}{\emph{(Bottom)}}
\newcommand{\captiona}{\emph{(a)}}
\newcommand{\captionb}{\emph{(b)}}
\newcommand{\captionc}{\emph{(c)}}
\newcommand{\captiond}{\emph{(d)}}

% Highlight a newly defined term by setting its argument in bold.
% \textbf replaces the obsolete LaTeX 2.09 {\bf ...} switch.
\newcommand{\newterm}[1]{\textbf{#1}}


% Cross-reference helpers. Each macro prints a fixed word followed by \ref;
% a tie (~) keeps the word and the number on the same line.

% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
% Reference to two figures.
\def\twofigref#1#2{figures~\ref{#1} and~\ref{#2}}
% Reference to four figures.
\def\quadfigref#1#2#3#4{figures~\ref{#1}, \ref{#2}, \ref{#3} and~\ref{#4}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Reference to two sections.
\def\twosecrefs#1#2{sections~\ref{#1} and~\ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections~\ref{#1}, \ref{#2} and~\ref{#3}}
% Reference to an equation, lower-case.
% NOTE(review): amsmath, loaded earlier in this preamble, also provides
% \eqref; this \def overrides it at this point -- confirm this is intended.
\def\eqref#1{equation~\ref{#1}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
% Reference to two algorithms, lower-case and upper case.
\def\twoalgref#1#2{algorithms~\ref{#1} and~\ref{#2}}
\def\Twoalgref#1#2{Algorithms~\ref{#1} and~\ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
% Reference to two parts.
\def\twopartref#1#2{parts~\ref{#1} and~\ref{#2}}

% Ceiling and floor brackets around the argument.
\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
% Bold numeral 1.
\def\1{\bm{1}}
% Data set symbols. The subscript is attached outside \mathcal so that only
% the letter D is passed to the calligraphic alphabet.
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{\mathcal{D}_{\mathrm{valid}}}
\newcommand{\test}{\mathcal{D}_{\mathrm{test}}}

% Short name for \epsilon.
\def\eps{{\epsilon}}


% Random variables
% One macro per lower-case letter (\ra ... \rz, without m); each typesets its
% letter via \textnormal. The extra outer braces make every macro expand to a
% single group.
% NOTE(review): \rq is also a standard TeX/LaTeX name (right quote); the \def
% below replaces it -- confirm nothing relies on the original meaning.
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
% rm is already a command, just don't name any random variables m
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% Random vectors
% One macro per lower-case letter (\rva ... \rvz), each set with \mathbf.
% NOTE(review): \mathbf usually does not embolden lower-case Greek letters,
% so \rvepsilon and \rvtheta may print in normal weight -- confirm the output.
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% Previously this line defined \rvu a first time, leaving \rvi undefined.
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% Elements of random vectors
% One macro per lower-case letter (\erva ... \ervz); each typesets its letter
% via \textnormal, i.e. with the same body as the random-variable macros.
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% Random matrices
% One macro per capital letter (\rmA ... \rmZ), each set with \mathbf.
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% Elements of random matrices
% One macro per capital letter (\ermA ... \ermZ); each typesets its letter via
% \textnormal.
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% Vectors
% Bold symbols produced with \bm: the constants 0 and 1, one macro per
% lower-case letter, and a few Greek letters.
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
% The bold v is named \vvv rather than \vv.
% NOTE(review): presumably to avoid a clash with \vv from the esvect package,
% which this preamble loads later -- confirm.
\def\vvv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}
\def\vkappa{{\bm{\kappa}}}
\def\vlambda{{\bm{\lambda}}}
\def\vgamma{{\bm{\gamma}}}
% \vmu is defined once, near the top of this list; a second, identical
% definition that used to follow here was redundant and has been removed.

% Elements of vectors
% Each macro expands to the plain (non-bold) symbol wrapped in a group:
% a selection of Greek letters first, then one macro per lower-case letter.
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% Matrix
% Bold symbols produced with \bm: one macro per capital letter, followed by a
% few Greek letters.
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}
\def\mGamma{{\bm{\Gamma}}}
% \Kappa is not a standard LaTeX command (capital kappa looks like a Latin K),
% so \mKappa would fail with "Undefined control sequence" when used. Provide an
% upright K as a fallback; \providecommand leaves any existing definition
% from a loaded package untouched.
\providecommand{\Kappa}{\mathrm{K}}
\def\mKappa{{\bm{\Kappa}}}

% Tensor
% \mathsfit is a math alphabet using the default sans-serif family in medium
% series and slanted shape; in the bold math version it uses the bx series
% with upright shape.
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
% \tens{X} sets its argument in \mathsfit and wraps the result in \bm.
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
% One macro per capital letter (\tA ... \tZ), each applying \tens.
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% Graph
% One macro per capital letter (\gA ... \gZ), each set with \mathcal.
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% Sets
% One macro per capital letter (\sA ... \sZ, deliberately without \sE), each
% set with \mathbb.
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% Entries of a matrix
% Each macro expands to the plain (non-bold) capital letter wrapped in a
% group; \emLambda and \emSigma do the same for the two Greek capitals.
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% entries of a tensor
% Same font as tensor, without \bm wrapper
% \etens{X} sets its argument in the \mathsfit alphabet; the macros below
% apply it to \Lambda and to each capital letter (\etA ... \etZ).
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% Distribution symbols. The subscript labels use \mathrm{...} in place of the
% obsolete LaTeX 2.09 font switch \rm; the printed result is an upright label
% in both cases.

% The true underlying data generating distribution
\newcommand{\pdata}{p_{\mathrm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

% Named symbols and operators. Each macro below is shorthand for a fixed
% piece of math notation.
\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution

% Blackboard-bold E and R, calligraphic L.
\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
% Aliases that map a descriptive name to a Greek letter.
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.

% Starred form: in display style, limits are placed below the operator name.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
% \ab is a short alias for \allowbreak.
\let\ab\allowbreak


\usepackage[dvipsnames]{xcolor}         % colors
\usepackage{hyperref}
% Render hyperlinks as colored text; one color per link type.
\hypersetup{%
  colorlinks  = true,
  linkcolor   = ForestGreen,
  anchorcolor = blue,
  citecolor   = Blue,
  filecolor   = cyan,
  menucolor   = ForestGreen,
  runcolor    = cyan,
  urlcolor    = ForestGreen,
}

\usepackage{url}
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{amssymb}
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{multicol, multirow}
%% Default packages I prefer to use for all my paper projects.

%\usepackage[round]{natbib} % has a nice set of citation styles and commands
\usepackage{amsmath, mathtools, amssymb}
\usepackage[capitalise]{cleveref}

\usepackage{subcaption, caption}
\usepackage{bm}
\usepackage{adjustbox}
\usepackage{tikz}
\usepackage{wrapfig}

\usetikzlibrary{calc,matrix,positioning,patterns,shapes.geometric}
%\DeclareMathOperator{\Var}{Var}
% Additional upright operator names (unstarred form, so sub- and superscripts
% are set beside the name). \spur prints "tr" and \Covv prints "Cov".
\DeclareMathOperator{\spann}{span}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\spur}{tr}
\DeclareMathOperator{\lnorm}{lnorm}
\DeclareMathOperator{\Det}{Det}
\DeclareMathOperator{\Covv}{Cov}
% Theorem-like environments, each numbered within the current section.
\newtheorem{definition}{Definition}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{condition}{Condition}[section]
\newtheorem{example}{Example}[section]
\newtheorem{theorem}{Theorem}[section]
% The environment keeps its existing name `corrolary' so that current uses
% still compile; only the printed heading is spelled correctly.
\newtheorem{corrolary}{Corollary}[section]

% cleveref setup. \crefname sets the names printed by \cref and \Crefname
% those printed by \Cref; the arguments are {type}{singular}{plural}.
\crefname{condition}{condition}{conditions}
\Crefname{condition}{Condition}{Conditions}
\crefname{example}{example}{examples}
\Crefname{example}{Example}{Examples}
\Crefname{section}{Section}{Sections} % Use small cref to print Sec., Fig. Tab.
\crefname{section}{Sec.}{Secs.} % Use small cref to print Sec., Fig. Tab.
\crefname{figure}{Fig.}{Figs.}
\Crefname{figure}{Figure}{Figures}
\Crefname{table}{Table}{Tables}
\crefname{table}{Tab.}{Tabs.}
\Crefname{equation}{Equation}{Equations}
\crefname{equation}{Eqn.}{Eqns.}
\Crefname{algocf}{Algorithm}{Algorithms}
\crefname{algocf}{Alg.}{Algs.}
\Crefname{theorem}{Theorem}{Theorems}
\crefname{theorem}{Thm.}{Thms.}
% Math notation
% Symbols of the form J with a function name as subscript (presumably
% Jacobians -- confirm against their use in the text). The first five are
% bold via \bm; the tilde variants and \jacphi use a plain J.
\newcommand{\jacg}{\bm{J}_{g}}
\newcommand{\jacf}{\bm{J}_{f}}
\newcommand{\jacgstar}{\bm{J}_{g^*}}
\newcommand{\jacfstar}{\bm{J}_{f^*}}
% f' is written with the prime shorthand; the former f^{'} nested the prime
% inside an extra superscript, which raises and shrinks it twice.
\newcommand{\jacfprime}{\bm{J}_{f'}}
\newcommand{\jacphi}{J_{\bm{\phi}}}
\newcommand{\jacgt}{\tilde{J}_{g}}
\newcommand{\jacft}{\tilde{J}_{f}}
\newcommand{\jacphit}{\tilde{J}_{\phi}}
% \jacfti{i}: tilde J with subscript f and the argument as a parenthesized
% superscript.
\newcommand{\jacfti}[1]{\tilde{J}^{(#1)}_{f}}
% \trof{X}: the \spur operator applied to X inside auto-sized parentheses.
\newcommand{\trof}[1]{\spur\left( #1 \right)}

% End-of-proof mark: a \square pushed to the right end of the line.
\newcommand*{\QED}{\null\nobreak\hfill\ensuremath{\square}}
%\usepackage{graphicx}
\usepackage{tikz}
\usepackage{esvect}

\usepackage[ruled]{algorithm2e}
% Equal contribution footnote

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \added{text} prints its argument unchanged, with no visible markup.
\newcommand{\added}[1]{#1}

%\title{Disentangling Visual Embeddings with Minimal Distributional Assumptions}
\title{When are Post-hoc Conceptual Explanations Identifiable?}
% Suggestions:
% Rediscovering Latent Concepts in Learned Embedding Spaces
% Post-hoc, Provable, Disentangling/Factorizing
% From Correlated Factors Presence of Correlations
% DONT: Linear, Assumption...

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2,$\dagger$]{\href{mailto:tobias.leemann@uni-tuebingen.de}{\textcolor{black}{Tobias Leemann}}{}}
\author[1,$\dagger$]{\href{mailto:michael.kirchhof@uni-tuebingen.de}{\textcolor{black}{Michael Kirchhof}}{}}
\author[1,2]{Yao Rong}
\author[2]{Enkelejda Kasneci}
\author[2]{Gjergji Kasneci}
% Add affiliations after the authors
\affil[1]{%
    University of Tübingen\\
    Tübingen, Germany
}
\affil[2]{%
    Technical University of Munich\\
    Munich, Germany
}
\affil[$\dagger$]{%
    equal contribution
}

\begin{document}
\maketitle

\begin{abstract}
Interest in understanding and factorizing learned embedding spaces through conceptual explanations is steadily growing. When no human concept labels are available, concept discovery methods search trained embedding spaces for interpretable concepts like \textit{object shape} or \textit{color} that can provide post-hoc explanations for decisions. 
Unlike previous work, we argue that concept discovery should be \emph{identifiable}, meaning that a number of known concepts can be provably recovered to guarantee reliability of the explanations.
As a starting point, we explicitly make the connection between concept discovery and classical methods like Principal Component Analysis and Independent Component Analysis by showing that they can recover independent concepts under non-Gaussian distributions. For dependent concepts, we propose two novel approaches that exploit functional compositionality properties of image-generating processes.
Our provably identifiable concept discovery methods substantially outperform competitors on a battery of experiments including hundreds of trained models and dependent concepts, where they exhibit up to 29\,\% better alignment with the ground truth. Our results highlight the strict conditions under which reliable concept discovery without human labels can be guaranteed and provide a formal foundation for the domain. 
Our code is available \href{https://github.com/tleemann/identifiable_concepts}{online}.
\end{abstract}

% Content File for ICLR / ArXiv V2
\section{Introduction}\label{sec:intro}
Modern computer vision systems represent images in embedding spaces. These are either constructed implicitly in higher-level layers of large models or explicitly through generative models such as Variational Autoencoders \citep{kingma2013auto} or recent Diffusion Models \citep{song2019generative, ho2020denoising}.
To unveil why an image is considered similar to a certain class, interest in understanding these embeddings is increasing. Conceptual explanations \citep{crabbe2022concept, muttenthaler2022vice, akula2020cocox, kazhdan2020now, yeh2019completeness, Kim2018interpretabilityTCAV} are a popular explainable AI (XAI) technique for this purpose. They scrutinize a given encoder by decomposing its embedding space into interpretable concepts post-hoc, i.e., after training. 
Subsequently, these concepts form the basis of popular post-hoc explanations such as TCAV \citep{Kim2018interpretabilityTCAV} or allow high-level interventions \citep{koh2020concept}. \cref{fig:conceptexplanation} outlines a real-world example. A misclassification made by a pretrained model shipped with the \texttt{pytorch} library \citep{paszke2017automatic} is to be explained. In the given example, the conceptual explanation allows identification of a spurious correlation that the model has picked up: Most jack-o-lanterns are found in combination with dark backgrounds, which causes it to mistake the traffic light at night for a jack-o-lantern.

\definecolor{officered}{RGB}{192,0,0}
\begin{figure*}[t]
\begin{subfigure}[b]{0.20\textwidth}
    \scalebox{0.70}{
    \begin{tikzpicture}
    \node[inner sep=0pt] (lantern) at (0,0)
    {\includegraphics[width=4.5cm]{figures/LanternFigureA.pdf}};
    \node at (0.5, 1.1) {\parbox{5cm}{\centering prediction: \textbf{jack-o-lantern}\\true class: \textbf{traffic light}}};
    \end{tikzpicture}}
    \caption{\textbf{Misclassification:} A\newline model makes an incorrect prediction. A user is interested in understanding why this incident happened.\newline~}
\end{subfigure}\hspace{2mm}
\begin{subfigure}[b]{0.25\textwidth}
    \scalebox{0.70}{
    \begin{tikzpicture}
    \node[inner sep=0pt] (lantern) at (0,0)
    {\includegraphics[width=6cm]{figures/LanternFigureB.pdf}};
    \node at (0.0, 1.2) {\parbox{6cm}{\centering concept contributions for prediction\\\textbf{jack-o-lantern}}};
    \node at (-0.65, 0.48) {darkness};
    \node at (-0.6, -0.35) {fire, red};
    \node at (-0.7, -1.1) {pumpkins};
    \node[rotate=90] at (-2.4, -0.3) {\textcolor{officered}{dependencies}};
    \end{tikzpicture}}
    \caption{\textbf{Conceptual Explanation:} %\newline
    Concept contributions are computed that explain the prediction. In this example, the concept ``darkness'' is relevant for the outcome.}
\end{subfigure}\hspace{2mm}
\begin{subfigure}[b]{0.23\textwidth}
    \centering
    \scalebox{0.70}{
    \begin{tikzpicture}
    \node[inner sep=0pt] (lantern) at (0,0)
    {\includegraphics[width=4cm]{figures/LanternFigureC.pdf}};
    \node at (-0.1, 1.05) {\parbox{5cm}{\centering examples of\\\textbf{jack-o-lantern}}};
    \end{tikzpicture}}
    \caption{\textbf{Inspection:} A closer inspection of samples from the predicted class reveals that most images in this class have a dark background; a spurious correlation picked up by the model.}
\end{subfigure}
\hspace{2mm}
\begin{subfigure}[b]{0.25\textwidth}
    \scalebox{0.70}{
    \begin{tikzpicture}
    \node[inner sep=0pt] (lantern) at (0,0)
    {\includegraphics[width=6cm]{figures/LanternFigureD2.pdf}};
    \node at (0.0, 1.2) {\parbox{6cm}{\centering concept contributions for prediction\\\textbf{jack-o-lantern}}};
    \node at (-0.65, 0.48) {darkness};
    \node at (-0.6, -0.35) {fire, red};
    \node at (-0.7, -1.1) {pumpkins};
    \node[rotate=90] at (-2.4, -0.3) {\textcolor{officered}{dependencies}};
    \end{tikzpicture}}
    \caption{\textbf{Entangled Conceptual Explanation:} It is essential to correctly split up the contribution of individual concepts to allow for valid inferences.\newline\label{fig:motivationd}}
\end{subfigure}
\caption{Schematic use case of conceptual explanations: A misclassification of an image classifier is explained. The example is based on a real explanation for a ResNet50 model. Details and the original explanation are provided in App. C.8.\label{fig:conceptexplanation}}
\end{figure*}

Constructing such explanations is non-trivial.
The key ingredient to all conceptual explanation techniques is a set of interpretable concepts, which is notoriously hard to specify \citep{leemann2022coherence}. It is frequently defined through human annotations \citep{crabbe2022concept, koh2020concept, Kim2018interpretabilityTCAV} on individual samples of the dataset that can be prohibitively expensive \citep{kazhdan2021disentanglement}. Furthermore, it is usually unknown which concepts will be leveraged by a machine learning model without a model at hand. Therefore, we consider fully unsupervised concept discovery \citep{ghorbani2019towards, yeh2019completeness}, where the concepts are automatically discovered in the data. %.
Concepts are frequently modeled as directions in a given embedding space \citep{ghorbani2019towards,Kim2018interpretabilityTCAV, yeh2019completeness}, which have to be discovered without supervision. These embedding spaces can be highly distorted, making it hard to correctly separate the influences of individual concepts. However, this is essential to make the right inferences in practice (see \cref{fig:motivationd}). \added{This intuition is supported by prior work on generative models \citep{ross2021evaluating}, which has shown that user understanding is strongly linked to the representations' respective disentanglement.}


While many methods have been empirically shown to work well, a rigorous theoretical analysis of the conditions under which concept discovery is possible is still lacking in previous works. We propose to consider concept discovery methods that are \textit{identifiable}. This means when a known number of \textit{ground truth components} generated the data, the concept discovery method provably yields concepts that correspond to the individual ground truth components and can correctly represent an input in the concept space. This is a crucial requirement: If a method is incapable of recovering even known components, there is no indication of its reliability in practice. In this work, we are the first to investigate identifiability results in the context of post-hoc concept discovery.

First, we find that identifiability results from Principal Component Analysis (PCA) and Independent Component Analysis (ICA) literature \citep{jolliffe2002principal, comon1994independent, hyvarinen2001independent} can be transferred to the conceptual explanation setup. We establish that they cover the case of independent ground truth components with non-Gaussian distributions. This is insufficient for two reasons: (1) In practice, concepts such as height and weight \citep{trauble2021disentangled} or wing and head colors of birds often follow complex dependency patterns. (2) Popular generative models \citep{kingma2013auto, song2019generative} frequently work with an embedding space with a Gaussian distribution.

As a second contribution, we seek to fill this void by providing an identifiable concept discovery approach that can handle dependent and Gaussian ground truth components. We can show that this is possible by taking the nature of the image-generating process into consideration. Specifically, we propose utilizing \emph{visual compositionality properties}. These are based on the observation that tiny changes in the components frequently affect input images in orthogonal or even disjoint ways. These properties of image-generating processes also leave a ``trace'' in the encoders learned from a set of data samples. This finding allows us to construct two novel post-hoc concept discovery methods based on the \emph{disjoint} or \emph{independent mechanisms} criterion. We prove strong identifiability guarantees for recovering components, even if they are dependent. \added{Our results highlight the strict and nuanced conditions under which identifiable concept discovery is possible.}

In summary, our work advances current literature in multiple ways: 
\textbf{(1)} We present first identifiability results for post-hoc conceptual explanations. We find that results from ICA can be transferred under the assumption of independent ground truth components.
\textbf{(2)} For the more intricate setting of dependent components, we propose the \textit{disjoint mechanism analysis (DMA)} criterion and the less constrained \emph{independent mechanism analysis (IMA)} criterion. We prove that they recover even dependent original components up to permutation and scale.
\textbf{(3)} We construct DMA and IMA-based concept discovery algorithms for encoder embedding spaces with the same theoretical identifiability guarantees. 
\textbf{(4)} We test them (i) on embeddings of several autoencoder models learned from correlated data, (ii) with multiple and strong correlations, (iii) on discriminative encoders, and (iv) on the real-world CUB-200-2011 dataset \citep{Wah2011}. Our approaches maintain superior performance amidst increasingly severe challenges.

\section{Related Work}
\label{sec:relatedwork}
Works on the analysis and interpretation of embedding spaces touch a variety of subfields of machine learning.

\textbf{Concept discovery for explainable AI.} Conceptual explanations \citep{koh2020concept, Kim2018interpretabilityTCAV, ghorbani2019towards, yeh2019completeness, akula2020cocox, chen2020concept} have gained popularity within the XAI community. They aim to explain a trained machine learning model post-hoc in terms of human-friendly, high-level concept directions \citep{Kim2018interpretabilityTCAV}. These concepts are found via supervised \citep{koh2020concept, kim2018disentangling, kazhdan2020now} or unsupervised approaches \citep{yeh2019completeness, akula2020cocox, ren2022learning}, such as clustering of embeddings \citep{ghorbani2019towards}. However, their results are not always meaningful \citep{leemann2022coherence, yeh2019completeness}. Therefore, we suggest approaches with identifiability guarantees. We provide initial identifiability results and a novel approach, which can be used for unsupervised concept discovery under correlated components. 

\textbf{Independent Component Analysis (ICA).} Independent Component Analysis \citep{comon1994independent, hyvarinen1999, hyvarinen2001independent} or blind source separation (BSS) considers a generative process $\vg(\vz)$ as a mixture to undo and relies on traces that the distribution of the generating components $\vz$ leaves in the mixture.
%tackles a similar problem but relies on a non-gaussianity of components to separate the hidden sources in a mixture. 
In this work, we show that an identifiability result from ICA can be transferred to the conceptual explanation setup, but recovery is only possible under independent underlying components of which all but one are non-Gaussian. This result is not applicable to naturally correlated processes, which is why we design a novel method for this case.

\textbf{Disentanglement Learning.} Concurrently, literature on disentanglement learning is concerned with finding a data-generating mechanism $\vg(\vz)$ and a latent representation $\vz$ for a dataset, such that each of the original components (also known as factors of variation) is mapped to one (controllable) unit direction in $\vz$ \citep{bengio2013representation}. An alternative definition relies on group theory \citep{higgins2017beta} where certain group operations (symmetries) should be reflected in the learned representation \citep{painter2020LinearGroupDisentanglement, yang2021towards}. Most works in the domain enhance VAEs \citep{kingma2013auto} with additional loss terms \citep{higgins2017beta, burgess2018understanding, kim2018disentangling, chen2018isolating}.
Despite recent progress, it is not always possible to construct disentangled embedding spaces from scratch: \citet{locatello2019challenging} have shown that the problem is inherently unidentifiable without additional assumptions. A more recent work by \citet{trauble2021disentangled} shows that even if just two components of a dataset are correlated, current disentanglement learning methods fail. In this work, we focus on post-hoc explanations of embedding spaces of given models, which are usually entangled.

\textbf{Identifiability results.} \added{Identifiability questions have been raised in domains such as Natural Language Processing \citep{carrington2019invariance} or in disentanglement learning, which is most related to this work.} It has been previously shown that unsupervised disentanglement, without further conditions, is impossible \citep{hyvarinen1999, locatello2019challenging, moran2022identifiable}. Hence, recent works aim to understand the conditions sufficient for identifiability. One strain of work relies on additional supervision, i.e., access to an additional observed variable \citep{hyvarinen2019nonlinear, khemakhem2020variational} or to tuples of observations that differ in only a limited number of components \citep{locatello2020weakly}. 
\citet{gresele2021independent} and \citet{zheng2022on} proved identifiable disentanglement under independently distributed components and introduced a functional condition on the data generator.
We also consider functional properties, but our setting is different as (1) we have access to a trained encoder only and (2) not even partial annotations or relations are available.

\section{Analysis}
\label{sec:theory}
In this section, we formalize post-hoc concept discovery to provide an identifiability perspective. We find that Independent Component Analysis (ICA) and Principal Component Analysis (PCA) only guarantee identifiability when the ground-truth components are stochastically independent. We then study the intricate case of dependent components and propose using \emph{disjoint} and \emph{independent mechanisms analysis} (DMA / IMA) along with identifiability results. All proofs are provided in App. B.


\begin{figure*}[t]
    \centering
    \ifdefined\arxiv
    \scalebox{0.75}{\input{figures/setup_figure}}
    \else
    \scalebox{0.75}{\input{figures/setup_figure}}
    \fi
    \caption{Overview of the concept discovery setup. We consider a process where data samples $\bm{x}$ are generated from possibly correlated ground truth components $\bm{z}$, e.g., a wingspan or beak length of a bird, by an unknown process $\vg$ (left). The high-dimensional data is mapped to the embedding space of a given model $\vf$ (center). A suitable post-hoc concept discovery yields concept vectors $\vm_i$ that correspond to the original components (right).}
    \label{fig:encdecsetup}
\end{figure*}

\subsection{Problem Formalization} \label{sec:form}

In post-hoc concept discovery, we are given a trained encoder $\vf:\mathcal{X} \rightarrow \mathcal{E}$ with embeddings $\ve = \vf(\vx) \in \mathcal{E} \subset \mathbb{R}^K$ of each image $\vx \in \mathcal{X}$. We do not impose any restriction on how $\vf$ was obtained; it can be the feature extractor part of a large classification model, or a feature representation learned through autoencoding, contrastive learning \citep{chen2020simple}, or related techniques. Interpretability literature seeks to understand the embedding space by factorizing it into concepts. Based on the observations that directions in the embedding space often correspond to meaningful features \citep{szegedy2013intriguing, bau2017network, alain2016understanding, bisazza2018lazy}, these concepts are frequently defined as direction vectors $\bm{m}_i$ \citep{Kim2018interpretabilityTCAV, ghorbani2019towards, yeh2019completeness}. \added{These are commonly referred to as concept activation vectors (CAVs).} Hence, the combined output of a concept discovery algorithm is a matrix $\mM =[\bm{m}_1, \ldots, \bm{m}_K]^\top\in \mathbb{R}^{K \times K}$ where each row contains a concept direction.

We seek a theoretical guarantee on when these discovered concept directions align with ground truth components that generated the data. To this end, we formalize the data-generating process as shown in \cref{fig:encdecsetup}: There are $K$ ground-truth components with scores $z_k, k=1\ldots K$, summarized $\vz \in \mathcal{Z} \subset \mathbb{R}^K$, that define an image. The term \textit{components} always refers to the ground truth as opposed to the \textit{concepts}, which denote the discovered directions. A data-generating process $\vg:\mathcal{Z} \rightarrow \mathcal{X}$ generates images $\vx = \vg(\vz) \in \mathcal{X} \subset \mathbb{R}^L$, $L \gg K$. A powerful algorithm should be able to recover the original components. That is, there should be a one-to-one mapping between entries of $\mM \ve$ and the entries in $\bm{z}$, up to the arbitrary scale and order of the entries. We say that a concept discovery algorithm \emph{identifies} the true components if it is guaranteed to output directions $\mM$ that satisfy $\mM \ve = \mM \vf(\vg(\vz)) = \mP\mS \vz$ $\forall \vz \in \mathcal{Z}$, where $\mP \in \mathbb{R}^{K \times K}$ is a permutation matrix that has one $1$ per row and column and is $0$ otherwise, and $\mS \in \mathbb{R}^{K \times K}$ is an invertible diagonal scaling matrix. 

To make the problem solvable in the first place, concept directions must exist in the embedding space of the given encoder, requiring $ \bm{e} = \mD \vz$, where $\mD \in \mathbb{R}^{K \times K}$ is of full rank. Depending on the scope of the conceptual explanation desired, it can be sufficient for the components to exist in a local region of the embedding space if the concept discovery algorithm is only applied in a region around a certain point of interest. This only changes the meaning of $\mathcal{E}, \mathcal{X},$ and $\mathcal{Z}$ but is formally equivalent.

\subsection{Identifiability via Independence}
Initially, we turn towards classical component analysis methods. We find that their identifiability results \added{use non-correlation or even stronger stochastic independence assumptions of the ground truth components.} 

Principal Component Analysis (PCA) \citep{jolliffe2002principal} uses eigenvector decompositions to find orthogonal directions $\mM$ that result in uncorrelated components $\mM \ve$. This means that PCA is only capable of identifying the original components if the ground truth components $\vz$ were uncorrelated and exist as orthogonal directions in our embedding space. In our setup and notation, this leads to the following result:
 
\begin{theorem}[PCA identifiability]\label{thm:ident_pca}
     Let $z_k, k=1, \dotsc, K,$ be uncorrelated random variables with non-zero and unequal variances. Let $\bm{e} = \bm{D}\bm{z}$, where $\bm{D} \in \mathbb{R}^{K \times K}$ is an orthonormal matrix. If an orthonormal post-hoc transformation $\bm{M}\in \mathbb{R}^{K \times K}$  results in mutually uncorrelated components $(z'_1, \dotsc, z'_K) = \bm{z}' = \bm{M}\bm{e}$, then $\bm{M}\ve = \bm{P}\bm{S} \vz$, where $\mP \in \mathbb{R}^{K \times K}$ is a permutation and $\bm{S} \in \mathbb{R}^{K \times K}$ is a diagonal matrix where $|s_{ii}|=1$ for $i \in 1,\ldots K$.\footnote{To simplify notation, $\mP$ and $\mS$ mean \textit{any} permutation and scale matrices. They do not have to be equal between the theorems.} 
\end{theorem}

All proofs in this work are deferred to App.~B. It is arguably a strong condition that the ground truth directions are encoded orthogonally in the embedding space. Independent Component Analysis (ICA) overcomes this limitation and allows for arbitrary directions. \added{However, the classic result by \cite{comon1994independent} even demands stochastically independent components. Transferred to our setup and notation, the result can be stated as follows.}
\begin{theorem}[ICA identifiability]\label{thm:ident_ica}
    Let $z_k, k=1, \dotsc, K,$ be independent random variables with non-zero variances where at most one component is Gaussian. Let $\bm{e} = \bm{D}\bm{z}$, where $\bm{D} \in \mathbb{R}^{K \times K}$ has full rank.  
    If a post-hoc transformation $\bm{M} \in \mathbb{R}^{K\times K}$ results in mutually independent components $(z'_1, \dotsc, z'_K) = \bm{z}' = \bm{M}\bm{e}$, then $\bm{M}\ve = \bm{P}\bm{S} \vz$, where $\mP \in \mathbb{R}^{K \times K}$ is a perm. and $\bm{S} \in \mathbb{R}^{K \times K}$ is a diag. matrix.
\end{theorem}

This result shows that stochastic independence of the ground truth components leaves a strong trace in the embeddings that can be leveraged. Algorithms like \texttt{fastICA} \citep{hyvarinen1997fast} can find the concept directions $\mM$ by searching for independence \citep{comon1994independent}. We conclude that ICA is suited for post-hoc concept discovery under independent components.

In summary, we have transferred two results from the component analysis literature to the setup of post-hoc conceptual explanations. However, these results do not allow recovering components that are correlated or follow a Gaussian distribution. This limits their applicability in practice where concepts often appear pairwise (e.g., darkness and jack-o-lanterns, cf. \cref{fig:conceptexplanation}). We will bridge this gap in the remainder of this paper by introducing two new identifiable discovery methods based on functional properties of the generation process that we term \textit{disjoint} and \textit{independent} mechanisms. A summary of identifiability results is provided in \cref{tab:summary}.
%setup reduces to a linear de-mixing operation and results from linear ICA 
\begin{table}[t]
\adjustbox{width=\columnwidth}{
\setlength{\tabcolsep}{2pt}
\begin{tabular}{cccl}
    \toprule
    Dependency & Marginal Dist. & Transform & Criterion\\
    \midrule
    uncorr. & uneq. variances & orthogonal & non-correlation (PCA) \\
    independent & non-Gaussian & invertible & independence (ICA) \\
    arbitrary & arbitrary & invertible &  disj. mechanisms (DMA) \\
    arbitrary & arbitrary & invertible &  indep. mechanisms (IMA) \\
    \bottomrule
\end{tabular}
}
\caption{PCA and ICA provably identify concepts via their distributions. DMA and IMA utilize functional properties.}
\label{tab:summary}
\end{table}

\subsection{Identifiability via Disjoint Mechanisms}
\label{sec:transfer}
Instead of placing independence assumptions on $\vz$, we propose a concept discovery algorithm that makes use of natural properties of the generative process $\vg$. 
In particular, generative processes in vision are often compositional \citep{ommer2007learning}: Different groups of pixels in an image, like a bird's wings, legs, and head, are each controlled by different components. Effects of tiny changes in components are visible in the Jacobian $\mJ_\vg$, where each column points to the pixels affected. Thus, a compositional process will follow the \textit{disjoint mechanisms} principle. 

\begin{definition}[Disjoint mechanism analysis (DMA)] \label{def:dma}
$\vg$ is said to generate $\vx$ from its components $\vz$ via disjoint mechanisms if the Jacobian $\mJ_\vg(\vz) \in \mathbb{R}^{L\times K}$ exists and is a block matrix $\forall \vz \in \mathcal{Z}$. That is, the columns of $\mJ_\vg(\vz)$ are non-zero at disjoint rows, i.e. $| \mJ_\vg(\vz) |^\top | \mJ_\vg(\vz) | = \mS(\vz)$, where $\mS \in \mathbb{R}^{K \times K}$ is a diagonal matrix that may be different for each $\vz$ and $| \cdot |$ takes the element-wise absolute value.
\end{definition}
Note that this definition does not globally constrain the location of affected pixels. The components may still alter different but disjoint pixels for each image.
In real concept discovery, we do not have access to the generative process $\vg$ but can only access the encoder $\vf$. However, an encoder corresponding to $\vg$ will not be arbitrary and its Jacobian $\jacf \in \mathbb{R}^{K\times L}$ will have a distinct form in practice: First, to maintain the component information the composition $\vf \circ \vg$ will be of the form $\vf(\vg(\vz))=\bm{D}\bm{z}$, with a yet unknown matrix $\bm{D} \in \mathbb{R}^{K\times K}$. Furthermore, we expect encoders to be rather lazy, meaning they only perform the changes to invert the data generation process but are almost invariant to input deviations not due to changes in the components. \added{This is in line with the classic interpretability literature, where gradients of models were observed to noisily highlight the relevant input features \citep{baehrens2010explain, simonyan2013deep} and form the basis of popular attribution methods such as Integrated Gradients \citep{Sundararajan2017}.} Technically, the changes effected by the components form the linear $\text{span}(\mJ_\vg(\vz))$, whereas entirely external changes are given in its orthogonal complement $\text{span}(\mJ_\vg(\vz))^{\perp}$. Thus, for $\mathbf{v} \in \text{span}(\mJ_\vg(\vz))^\perp \subset \mathbb{R}^L$ the encoder should not react to these changes and \added{the corresponding gradients of the encoder for these changes should be zero, i.e., $\mJ_\vf(\vg(\vz))\mathbf{v} = \mathbf{0} \Leftrightarrow \mathbf{v} \in \text{ker}(\mJ_\vf(\vg(\vz)))$.}
%nts of $\vf$ can still rather arbitrary because they  the set $\left\{\vx~\middle|~\exists \vz \in \mathcal{Z}: \vx=\vg(\vz) \right\}$ and its graidents can be arbitrary. will therefore define a faithful encoder as follows:
\begin{definition}[Faithful encoder]
    $\vf$ is a faithful encoder for the generative process $\vg$ if the ground truth components remain recoverable, i.e., $\vf(\vg(\vz))=\bm{D}\bm{z}$, for some $\bm{D} \in \mathbb{R}^{K \times K}$ with full rank. Furthermore, $\bm{f}$ is lazy and invariant to changes in $\bm{x}$ which cannot be explained by the ground truth components, requiring $\mJ_{\vf}(\vg(\vz))$ and $\mJ_\vg(\vz)$ to exist and $ \text{\normalfont span}(\mJ_\vg(\vz))^\perp \subseteq \text{\normalfont ker}(\mJ_\vf(\vg(\vz))),~ \forall \vz \in \mathcal{Z}$.
    %locally defined requires that $\mJ_{\vf}(\vg(\vz))$ and $\mJ_\vg(\vz)$ exist and $\text{span}(\mJ_\vf(\vz)) \cap \text{ker}(\mJ_\vg(\vz)) = \emptyset$ $\forall \vz \in \mathcal{Z}$.
\end{definition}

\added{Having defined what realistic encoders look like through the notion of faithful encoders,} we find that there is a distinct property which can be leveraged to discover the directions in $\mM$ among faithful encoders: \added{It is sufficient to find an encoder $\mM\vf$ whose Jacobian $\mM\mJ_{\vf}$ will have disjoint rows. Intuitively, this means searching for components whose gradients affect disjoint image regions.}
%One technical corner case is that the given encoder $\vf$ could be ill-fitted to the generative process and react to image parts that are unrelated to the original components.  This would render any analysis of its gradients impossible since they could point to pure noise regions. To make concept discovery possible we thus assume that the encoder we are given is well-fitted:

\begin{theorem}[Identifiability under DMA] \label{eq:ident_full}
Let $\vg$ have disjoint mechanisms and $\vf$ be a faithful encoder to $\vg$. %Let $\bm{e} = \bm{D}\bm{z}$. 
If a post-hoc transformation $\bm{M} \in \mathbb{R}^{K\times K}$ of full rank results in disjoint rows in the Jacobian $\mM \mJ_{\vf}(\vg(\vz))$, i.e., $\lvert\mM \mJ_{\vf}(\vg(\vz))\rvert\lvert\mM \mJ_{\vf}(\vg(\vz))\rvert^\top$ is invertible and diagonal for some $\vz \in \mathcal{Z}$, then $\bm{M}\ve = \bm{P}\bm{S} \vz$ where $\mP \in \mathbb{R}^{K \times K}$ is a permutation and $\bm{S} \in \mathbb{R}^{K \times K}$ is a scaling matrix.
\end{theorem}

This theorem does not impose any restrictions on the distribution of $\vz$, making it applicable to realistic concept discovery scenarios through leveraging the nature of the generative process. The proof of this theorem in App.~B.5 also yields an analytical solution. We will use it to verify conditions in a controlled experiment in \cref{sec:synthdata}. We have thus identified the \textit{DMA criterion} that \added{is sufficient} to discover the component directions when the rows of $\mM \mJ_\vf$ point to disjoint image regions. We can formulate this as a loss function and optimize for $\mM$ via off-the-shelf gradient descent: 
\begin{align}\label{eqn:objective}
    \mathcal{L}(\mM) = \mathbb{E}_\vx\lVert \text{arn}\left[\mM \jacf(\vx)\right]\text{arn}\left[\mM \jacf(\vx)\right]^\top - \mI \rVert_F^2.
\end{align}
The expectation is taken over a collection of real data samples $\vx=\vg(\vz)$. The $\text{arn}$-operator (\underline{a}bsolute values, \underline{r}ow \underline{n}ormalization) takes the element-wise absolute value and subsequently normalizes the rows. This does not constrain the norms of the Jacobian's rows but only enforces disjointness. 


\subsection{Concept Discovery via Independent Mechanisms} \label{sec:theoryOrtho}
We can perform an analogous derivation for a class of generating processes that is more general. Grounded by causal principles instead of compositionality, the independent mechanisms property has been argued to define a class of natural generators \citep{gresele2021independent}.

\begin{definition}[Independent mechanism analysis (IMA)]
\label{def:ima}
$\vg$ is said to generate $\vx$ from its components $\vz$ via independent mechanisms if the Jacobian $\mJ_\vg(\vz)$ of $\vg$ exists and its columns (one per component) are orthogonal $\forall \vz \in \mathcal{Z}$, i.e., $\mJ_\vg^\top(\vz) \mJ_\vg(\vz) = \mS(\vz)$, where $\mS \in \mathbb{R}^{K \times K}$ is a diagonal matrix that may differ for each $\vz$ \citep{gresele2021independent}.
\end{definition}

\citet{gresele2021independent} and \citet{zheng2022on} used this characteristic to find disentangled data generators, but we can again transfer characteristics via faithful encoders: This time we find that searching for an $\mM\mJ_{\vf}$ with \textit{orthogonal} (instead of disjoint) rows permits post-hoc discovery of concepts. We refer to this property of $\mM\mJ_{\vf}$ as the \textit{IMA criterion}. 

However, as the class of admissible processes has been enlarged, the IMA criterion alone is not strong enough to ensure identifiability in the most general case. Identifiability is restored under an additional technical condition on the component magnitudes, which we refer to as \emph{non-equal magnitude ratios} (NEMR). Intuitively, the magnitudes of the component gradients have to change non-uniformly between at least two points \added{for the conditions to be sufficient}. \added{If there were two factors that always attribute to input pixels in the same way (imagine the sky being partitioned into two components termed ``left sky'' and ``right sky''), they cannot be told apart anymore since there can be other mixtures which would result in orthogonality (they could equally be ``lower sky'' and ``upper sky'').}

\begin{theorem}[Identifiability under IMA] \label{thm:ident_ima}
Let $\vg$ adhere to IMA. Let $\vf$ be a faithful encoder to $\vg$. 
%Suppose there exists an MDC $\vf^* = \mM \vf$, where $\mM \in \mathbb{R}^{K \times K}$ has full rank. 
Suppose we have obtained an $\vf'= \mM \vf$ with a full-rank $\mM \in \mathbb{R}^{K \times K}$ and orthogonal rows in its Jacobian  $\mM \mJ_{\vf}(\vg(\vz)) \coloneqq \jacfprime(\vg(\vz))$, i.e.,  $\jacfprime(\bm{g}(\bm{z}))\jacfprime(\bm{g}(\bm{z}))^\top = \bm{\Sigma}(\bm{z})$ where $\bm{\Sigma}(\bm{z})$ is diagonal and full-rank at two points $\vz \in \{\vz_a, \vz_b\}$. If additionally $\bm{\Sigma}(\bm{z}_a) \bm{\Sigma}(\bm{z}_b)^{-1}$
has unequal entries in its diagonal (NEMR condition), then $\bm{M}\ve = \bm{P}\bm{S} \vz$, where $\mP \in \mathbb{R}^{K \times K}$ is a permutation and $\bm{S} \in \mathbb{R}^{K \times K}$ is a scaling matrix.%. $f'$ is an MDC and has orthogonal rows in its Jacobian for all $z \in \mathcal{Z}$  (orthogonal attributions).
\end{theorem}

The constructive proof in App.~B.6 can also be condensed into an analytical solution. Alternatively, one can again construct a suitable optimization objective for the IMA criterion, i.e., orthogonal Jacobians. This is achieved by removing the absolute value operation from the arn-operator in \cref{eqn:objective}, so that it solely performs a row-wise normalization. In summary, we have established the novel DMA and IMA criteria that allow concept discovery under dependent components.

\section{Experiments}
In the following, we perform a battery of experiments of increasing complexity to compare the practical capabilities of approaches for identifiable concept discovery. We start by verifying the theoretical identifiability conditions (\cref{sec:synthdata}), then perform evaluation under increasing multi-component correlations for embedding spaces of generative and discriminative models (\cref{sec:exp_comp} to \ref{sec:exp_discr}), and finally use a large-scale, discriminatively-trained ResNet50 encoder (\cref{sec:exp_cub}). 

We borrow the DCI metric \citep{eastwood2018framework} from disentanglement learning 
with scores in $[0,1]$ to measure whether each discovered component predicts precisely one ground-truth component and vice versa. Following \citet{locatello2020weakly}, 
we report additional metrics with similar results in App.~D, along with results on additional datasets and ablations. For reproducibility, each experiment is repeated on five seeds and code is made available upon acceptance. In total, we train and analyze over 300 embedding spaces, requiring about 124 Nvidia RTX2080Ti GPU days. More implementation details are in App.~C.
% Estimate: 
% Section 5.2: 180 Disentanglement Models: (4*Corr+4*Model+5*Seeds+1Factors + (1*Corr+2*Factors*4Models*5Seeds) + (1*Corr+4*model+5*seeds+3*factors)
% Section 5.3: 100 models (5*corrstrength+1*model+5*seeds) + (15*pairs + 1*model + 5*seeds)
% Section 5.4: 30 models: 6Corr*1Factor*5Seeds
% Section 5.5: 5*models
% Each was trained and disentangled with at least 2 combinations of params (~9.5 hours hours per train + eval).

\newcommand{\figthreew}{1.0}
\begin{figure}[t]
    \begin{subfigure}[b]{\figthreew\linewidth}
    \begin{minipage}[t]{0.5\textwidth}
    \centering \small Traversals along each component
    \includegraphics[width=0.9\textwidth, trim=0 0.56cm 0 0, clip]{figures/gt_traversal.pdf}
    \end{minipage}
    \begin{minipage}[t]{0.3\textwidth}
    \small \,\,\,\,\,\,\,\,Gradients
    \includegraphics[width=\textwidth]{figures/gradients_bars.pdf}
    \end{minipage}
    \begin{minipage}[t]{0.16\textwidth}
    ~\newline
    \scalebox{0.7}{
    \begin{tabular}{c}
    DCI Scores\\
    \toprule
    IMA\\
    0.24 $\pm$ 0.10 \\
    \midrule
    DMA \\
    \textbf{1.00} $\pm$ 0.00 \\
    \bottomrule
    \end{tabular}
    }
    \end{minipage}
    \caption{\texttt{FourBars}:\,DMA datasets\,can\,be\,solved\,by\,the\,DMA\,criterion.\label{fig:fourbars}\newline~} 
    \end{subfigure}
   \begin{subfigure}[b]{\figthreew\linewidth}
    \begin{minipage}[t]{0.5\textwidth}
    \centering
    \small Traversals along each component
    \includegraphics[width=0.9\textwidth, trim=0 0.56cm 0 0, clip]{figures/gt_traversal_color_bar.pdf}
    \end{minipage}
    \hfill
    \begin{minipage}[t]{0.3\textwidth}
    \small \,\,\,\,\,\,\,\,Gradients
    \includegraphics[width=\textwidth]{figures/gradients_bars_color.pdf}
    \end{minipage}
    \begin{minipage}[t]{0.18\textwidth}
    ~\newline
    \scalebox{0.7}{
    \begin{tabular}{c}
    DCI Scores\\
    \toprule
    IMA \\
    \textbf{1.00} $\pm$ 0.00 \\
    \midrule
    DMA\\
    0.26 $\pm$ 0.05 \\
    \bottomrule
    \end{tabular}
    }
    \end{minipage}
    \caption{\texttt{ColorBar}: IMA datasets can be solved by the IMA criterion.\label{fig:colorbar}}
    \end{subfigure}
    \caption{Experiments on two synthetic datasets: We confirm our analytical results and show that DMA (a) and IMA (b) cover visual concepts such as colors and translations.}
    \label{fig:syntheticdata}
\end{figure}

\subsection{Confirming Identifiability}\label{sec:synthdata}
We first confirm our identifiability guarantees with the analytical solutions. To this end, we implement two realistic synthetic datasets with differentiable generators. This allows computing the closed form of $\mJ_\vg$ and deliberately fulfilling or violating the DMA, IMA, and NEMR conditions. 

\texttt{FourBars} consists of gray-scale images of four components: Three bars change their colors (black to white) and one bar moves vertically, showing that the image regions affected by each component may change in each image. The plot of $\mJ_\vg$ in \cref{fig:fourbars} shows that each component maps to a disjoint image region. This fulfills DMA and thus also IMA. However, all factors have the same gradient magnitudes, making it impossible to find two points with NEMR. According to our theory, we expect DMA optimization to work and IMA to fail \added{as NEMR is essential to the proof of \cref{thm:ident_ima}}. The second dataset, \texttt{ColorBar}, contains a single bar that undergoes realistic changes in color, width, and its vertical position, see \cref{fig:colorbar}. It conforms to IMA and NEMR but not DMA. Our proofs indicate that IMA should work, and DMA should fail. Completing the problem formalization in \cref{sec:form}, we compute analytical faithful encoders $\vf$ for these datasets distorted by a random matrix $\mD$. The solutions behave as expected: On \texttt{FourBars} only the DMA criterion delivers perfectly recovered components (DCI=1) whereas on \texttt{ColorBar} only IMA succeeds.

\subsection{Correlated Components} \label{sec:exp_comp}
We now move to the common Shapes3D \citep{3dshapes18} dataset. It shows geometric bodies that vary in their colors, shape, orientation, size, and background totaling six components. Compared to the previous section we train real encoders. We start our analysis where disentanglement learning is no longer possible: When components are correlated.
Following \citet{trauble2021disentangled}, the dataset is resampled such that two components $z_i, z_j \in [0, 1]$ follow $z_i - z_j \sim \mathcal{N}(0, s^2)$. Lower $s$ results in a stronger correlation where only few pairs of component values co-occur frequently. We choose a moderate correlation of $s=0.4$ here and three pairs $z_i, z_j$ that are nominal/nominal, nominal/ordinal, and ordinal/ordinal variables. 
We train four state-of-the-art disentanglement learning VAEs (BetaVAE \citep{higgins2017beta}, FactorVAE \citep{kim2018disentangling}, BetaTCVAE \citep{chen2018isolating}, DipVAE \citep{kumar2018variational}) from a recent study \citep{locatello2019challenging} and apply ICA, PCA, and our DMA and IMA discovery methods on their embedding spaces to post-hoc recover the original components. For DMA and IMA, we use the optimization-based algorithms (Eqn.~\ref{eqn:objective}) since they find approximate solutions through aggregation of many noisy sample gradients. %($K{\in}\left\{6,7\right\}$). 
%They are challenged with different pairs of correlated components, e.g., nominal/nominal, nominal/ordinal, ordinal/ordinal, and those encoded in the same image areas. 

\newcommand{\res}[2]{$#1 \pm \small#2$}
\newcommand{\posimp}[1]{\textcolor{ForestGreen}{#1}}
\newcommand{\bposimp}[1]{\textbf{\textcolor{ForestGreen}{#1}}}
\newcommand{\negimp}[1]{\textcolor{BrickRed}{#1}}
\newcommand{\bres}[2]{$\bm{#1} \pm \bm{#2}$}
\newcommand{\ures}[2]{\underline{$#1 \pm #2$}}
\newcommand{\wrapb}[2]{\begin{tabular}[c]{@{}c@{}}#1 \\#2 \end{tabular}}

\begin{table}[t]
%. The models were trained on two datasets where different pairs of components were correlated. The best result per model is marked in bold.}
\centering
\adjustbox{width=\figthreew\columnwidth}{
\setlength{\tabcolsep}{2pt}
\begin{tabular}{r*{6}{c}}
\toprule
\wrapb{Correlated}{components} & \multicolumn{2}{c}{\wrapb{floor \&}{background}}  &\multicolumn{2}{c}{\wrapb{orientation \&}{background}} &  \multicolumn{2}{c}{\wrapb{orientation \&}{size}}\\
\cmidrule{1-1}\cmidrule(lr){2-3}\cmidrule(lr){4-5} \cmidrule(lr){6-7}
\textbf{BetaVAE} &\res{0.497}{0.03}& & \res{0.581}{0.04}& & \res{0.491}{0.05}& \\
+PCA&\res{0.263}{0.03}&\negimp{-47\%}& \res{0.310}{0.02}&\negimp{-47\%}& \res{0.324}{0.04}&\negimp{-34\%}\\
+ICA&\res{0.574}{0.04}&\posimp{+16\%}& \res{0.540}{0.08}&\negimp{-7\%}& \res{0.577}{0.04}&\posimp{+17\%}\\
+Ours (IMA)&\res{0.617}{0.02}&\posimp{+24\%}& \res{0.602}{0.05}&\posimp{+3\%}& \res{0.579}{0.03}&\posimp{+18\%}\\
+Ours (DMA)&\bres{0.641}{0.03}&\bposimp{+29\%}& \bres{0.624}{0.06}&\bposimp{+7\%}& \bres{0.627}{0.03}&\bposimp{+28\%}\\
\cmidrule{1-1}\cmidrule(lr){2-3}\cmidrule(lr){4-5} \cmidrule(lr){6-7}
\textbf{FactorVAE} &\res{0.507}{0.11}& & \res{0.502}{0.08}& & \bres{0.712}{0.01}& \\
+PCA&\res{0.358}{0.07}&\negimp{-29\%}& \res{0.474}{0.05}&\negimp{-5\%}& \res{0.556}{0.03}&\negimp{-22\%}\\
+ICA&\res{0.294}{0.07}&\negimp{-42\%}& \res{0.263}{0.05}&\negimp{-48\%}& \res{0.340}{0.03}&\negimp{-52\%}\\
+Ours (IMA)&\res{0.551}{0.04}&\posimp{+9\%}& \res{0.498}{0.03}&\negimp{-1\%}& \res{0.595}{0.05}&\negimp{-16\%}\\
+Ours (DMA)&\bres{0.584}{0.05}&\bposimp{+15\%}& \bres{0.510}{0.05}&\bposimp{+2\%}& \res{0.556}{0.04}&\negimp{-22\%}\\
\cmidrule{1-1}\cmidrule(lr){2-3}\cmidrule(lr){4-5} \cmidrule(lr){6-7}
\textbf{BetaTCVAE} &\res{0.619}{0.01}& & \res{0.613}{0.04}& & \res{0.659}{0.01}& \\
+PCA&\res{0.400}{0.03}&\negimp{-35\%}& \res{0.421}{0.07}&\negimp{-31\%}& \res{0.450}{0.07}&\negimp{-32\%}\\
+ICA&\res{0.540}{0.02}&\negimp{-13\%}& \res{0.497}{0.04}&\negimp{-19\%}& \res{0.627}{0.02}&\negimp{-5\%}\\
+Ours (IMA)&\res{0.623}{0.02}&\posimp{+1\%}& \res{0.652}{0.03}&\posimp{+6\%}& \res{0.638}{0.04}&\negimp{-3\%}\\
+Ours (DMA)&\bres{0.666}{0.01}&\bposimp{+8\%}& \bres{0.664}{0.02}&\bposimp{+8\%}& \bres{0.748}{0.03}&\bposimp{+14\%}\\
\cmidrule{1-1}\cmidrule(lr){2-3}\cmidrule(lr){4-5} \cmidrule(lr){6-7}
\textbf{DipVAE} &\res{0.631}{0.02}& & \res{0.652}{0.02}& & \res{0.548}{0.04}& \\
+PCA&\res{0.158}{0.01}&\negimp{-75\%}& \res{0.160}{0.02}&\negimp{-75\%}& \res{0.170}{0.02}&\negimp{-69\%}\\
+ICA&\res{0.630}{0.02}&\negimp{-0\%}& \res{0.651}{0.02}&\negimp{-0\%}& \res{0.542}{0.03}&\negimp{-1\%}\\
+Ours (IMA)&\res{0.644}{0.02}&\posimp{+2\%}& \res{0.624}{0.01}&\negimp{-4\%}& \res{0.558}{0.05}&\posimp{+2\%}\\
+Ours (DMA)&\bres{0.684}{0.01}&\bposimp{+8\%}& \bres{0.679}{0.01}&\bposimp{+4\%}& \bres{0.601}{0.05}&\bposimp{+10\%}\\
\bottomrule
\end{tabular}
}
\ifdefined\arxiv
\fi

\caption{DMA recovers the components best in 11 out of 12 cases across different models and correlated components of Shapes3D. Mean $\pm$ std. err. of DCI across all components.\label{tab:posthocdisentangle}}
\end{table}

\definecolor{mygrey}{HTML}{797979}
\definecolor{mygreen}{HTML}{69cd64}
\definecolor{myorange}{HTML}{ef8649}
\definecolor{myblue}{HTML}{4878d0}
\definecolor{myred}{HTML}{ae0031}
\newcommand{\includetraversalsmall}[1]{\includegraphics[width=0.9\textwidth]{#1}}
\begin{figure}[t]
\centering
\begin{subfigure}[b]{0.48\linewidth}
\centering
\includetraversalsmall{figures/traversal_unit.pdf}
\caption{Autoencoder (DipVAE) \label{fig:travunit}}
\end{subfigure}
\begin{subfigure}[b]{0.48\linewidth}
\centering
\includegraphics[width=0.92\textwidth]{figures/traversal_ours.pdf}
\caption{Autoencoder + DMA\,(ours) \label{fig:travours}}
\end{subfigure}
\caption{DMA discovers directions $\vm$ that control individual concepts (wall \& floor color) of Shapes3D although they are confused in the original embedding space ($e_1$, $e_2, \ldots$).}
\label{fig:travsersal}
\end{figure}

\begin{figure}
\begin{subfigure}[b]{0.54\linewidth}
\centering
\includegraphics[height=2.35cm, trim=0 0.55cm 0 0, clip]{figures/corstrength.pdf}
\caption{Correlation strength} \label{fig:robustness_strength}
\end{subfigure}
\begin{subfigure}[b]{0.44\linewidth}
\centering
\includegraphics[trim={1.265cm 0.55cm 0 0},clip,height=2.35cm]{figures/ncor.pdf}
\caption{Number pairwise corr.} \label{fig:robustness_num}
\end{subfigure}
\caption{\textcolor{myred}{DMA} and \textcolor{myblue}{IMA} recover the components even under strong and multiple correlations between them. \textcolor{myorange}{ICA} and \textcolor{mygreen}{PCA} fail to return better components than the \textcolor{mygrey}{unit axes}.} 
\end{figure}


\cref{tab:posthocdisentangle} shows the resulting DCI scores. In line with \citet{trauble2021disentangled}, we find that the disentanglement learning VAEs fail to recover the correlated components on their own due to their violated stochastic independence assumption (\cref{fig:travunit}).
In eleven of the twelve model/correlation pairs, DMA or IMA identify better concepts than the VAE unit axes and the PCA/ICA components with improvements of up to 29\,\%. This experiment shows that their concept discovery works regardless of (1) the model type and (2) the type of components correlated. On average, DMA delivers better results than IMA ($+0.047$), despite the generative process of Shapes3D only being roughly IMA or DMA-compliant. \added{We therefore hypothesize that the DMA criterion might be more robustly optimizable in practice.} \cref{fig:travours} visualizes the performance achieved via DMA when traversing the embedding space. It also shows that small DCI differences can mean a significant improvement. This is because (1) the metric is computed across all six components and the strong baselines already identify many concepts and (2) a perfect score of 1.0 is usually not possible due to non-linearly encoded components. We investigate other correlation strengths with similar findings in App.~D.3.
%On Shapes3D, DA outperforms the baselines on all but one setup, impressively highlighting that it works regardless of the disentanglement method used and components correlated. DA also outperforms OA on all but one occasion on Shapes3D. 

%We conduct further experiments on the challenging MPI3D-real \citep{gondal2019transfer} dataset in \Cref{sec:app_mpi3d}. Here, the VAEs struggle to reconstruct the fine-detailed images, let alone disentangle the components. However, DA works well with the DipVAE model, and OA still leads to improvement for the BetaVAE model, while PCA and ICA consistently fail to beat either the VAE unit axes or OA/DA. Unlike on Shapes3D, we observed no substantial performance gap between OA and DA on MPI3D-real.

\begin{table}[t]
 %Best results per correlation are marked in bold.}
     \centering
     \adjustbox{width=\figthreew\linewidth}{
     \begin{tabular}{r*{5}{c}}
     \toprule
Method & $s=0.1$ & $s=0.15$ & $s=0.2$ & $s=\infty$\\
\midrule
unit dirs.&\res{0.238}{0.01} & \res{0.244}{0.01} & \res{0.247}{0.01} & \res{0.286}{0.02}\\
PCA&\res{0.238}{0.01} & \res{0.376}{0.03} & \res{0.373}{0.03} & \res{0.343}{0.03}\\
ICA&\res{0.409}{0.02} & \res{0.309}{0.02} & \res{0.311}{0.01} & \bres{0.652}{0.00}\\
(Ours) IMA &\res{0.295}{0.01} & \res{0.302}{0.01} & \res{0.333}{0.04} & \res{0.266}{0.12}\\
(Ours) DMA &\bres{0.435}{0.01} & \bres{0.411}{0.03} & \bres{0.392}{0.02} & \res{0.369}{0.05} \\
%\midrule
%supervised & 0.820 & 0.898 & 0.892 & 0.920 \\
\bottomrule
 \end{tabular}}
\caption{Without correlations ($s = \infty$), ICA is able to recover the components of a classification model. Under correlations, DMA works best. Mean $\pm$ std. err. of DCI. \label{tab:posthocdisc}}
\end{table}

\subsection{Gaussianity and Multiple Correlations} \label{sec:exp_robust}
In this section, we increase the distributional challenges to analyze whether our approaches are as distribution-agnostic as intended. We sample the components of Shapes3D from a (rotationally symmetric) Gaussian. Additionally, we introduce correlations between multiple components into its covariance matrix. Details on how covariance matrices are constructed are given in App.~C.3.

First, we study a single pair of correlated components (floor and background color) with increasing correlation strength $\rho$. %, where a higher $\rho$ is more restrictive of the seen combinations. 
\cref{fig:robustness_strength} shows that the BetaVAE handles low correlations well but starts deteriorating from a strength of $\rho > 0.5$, along with ICA. The DCI of our methods stays, on average, a constant $+0.145$ above the BetaVAE's for $\rho \leq 0.85$. After this, it returns to the underlying BetaVAE's DCI, possibly because the two components collapsed in the BetaVAE's embedding space.
For \cref{fig:robustness_num}, we gradually add more moderately correlated ($\rho \approx 0.7$) pairs to the Gaussian's covariance matrix until eventually all components are correlated. Again, our models show a constant benefit over the underlying BetaVAE's DCI curve. This experiment highlights that both DMA and IMA perform well with (1) strong and (2) multiple correlations and (3) Gaussian components. 


\begin{figure*}[t]
    \centering
    %\includegraphics[width=\linewidth]{figures/cub_attr_
    \input{figures/cub_new}
    \caption{Components discovered by DMA on CUB correlate with interpretable ground truth attributes. Images are ordered by their concept scores $(\mM\ve)_i$, and the numbers show their ground truth annotated attribute score.}
    \label{fig:cub_attr}
\end{figure*}
\subsection{Discriminative Embedding Spaces}
\label{sec:exp_discr}

We highlight that our approach is also applicable to classification models that were trained in a purely discriminative manner, e.g., the feature space of a CNN model. 
To investigate this setting, we set up an 8-class classification problem on the Shapes3D dataset, where the combination of the four binarized components object color, wall color (blue/red vs. yellow/green), shape (cylinder vs. cube) and orientation (left vs. right) determines the class as visualized in App.~C.4. 
To make the setting even more realistic, we artificially add labeling noise close to the decision boundary, correlations as in \cref{sec:exp_comp}, and a small L2-regularizer on the embeddings to keep them in a reasonable range. We train a discriminative CNN with a $K{=}6$-dimensional embedding space.

The discriminative loss leads to a clustered distribution in the embedding space. ICA expectedly works very well in this highly non-Gaussian distribution when no significant correlations are present, which is in line with the result in \cref{thm:ident_ica}. However, the tables turn as we increasingly correlate the floor and background color: Starting at $s=0.2$, DMA outperforms ICA and the other methods, as can be seen in \cref{tab:posthocdisc}. While IMA leads to better concepts than the unit directions, it does not reach the level of DMA. \added{We note that both ICA and our methods improve again for very strong correlations, where the setup approaches the case of three independent components (the other two components being treated as one), which is again easier.} Overall, this demonstrates that our methods are applicable to purely discriminative embedding spaces and are more robust to high levels of correlations than ICA.


\subsection{Real-world Concept Discovery} \label{sec:exp_cub}
Last, we go beyond the traditional benchmarks and perform realistic concept discovery: We analyze the embedding space of a ResNet50 classifier \citep{He2016} trained on the CUB-200-2011 \citep{Wah2011} dataset consisting of high-resolution images of birds. This amplifies the challenges of the previous sections, i.e., a discriminative space, non-linear component dependencies of varying strengths across multiple components, and a large 512-dimensional embedding space. One restriction of this experiment is that CUB has no data-generating components to compare against, so we cannot report DCI scores. However, we qualitatively show that DMA can deliver interpretable concepts by matching them to annotated attributes of CUB.

We apply DMA and IMA to discover $K{=}30$ concepts of which the first two \added{DMA} concepts are shown exemplarily in \cref{fig:cub_attr}. The images with the highest positive scores on the first component (on the right) consistently show white birds. The other end of the component comprises birds whose primary color is black. This gives a high Spearman rank correlation with the CUB attribute ``primary color: white''. The second concept is similarly interpretable. To quantify this across all $K$ components, we provide an initial quantitative evaluation based on the Spearman rank correlation between components and attributes in App.~D.7. It indicates that ICA and PCA have problems providing such components and the components identified by DMA usually correspond more closely to the attributes. \added{The concepts provided by our method also compare favorably to those identified by ACE \citep{ghorbani2019towards} and ConceptSHAP \citep{yeh2019completeness}.} While the construction of further quantitative evaluation schemes goes beyond the scope of this work, these promising results highlight that DMA also works for high-dimensional, real-world datasets. 

\section{Discussion}
 % and on datasets ranging from toy datasets up to the large-scale CUB-200-2011.

\added{We conclude by discussing the limitations of this work and related approaches and provide constructive guidance on which approach to choose in practice.}

\textbf{Limitations.} \added{In order to overcome distributional assumptions, our approach requires other forms of constraints. Most notably, we suppose that the generative processes comply with the functional properties of Disjoint or Independent Mechanisms. While they are intuitive and our empirical results suggest that they are a useful approximation of real-world images, we acknowledge that these requirements are not strictly fulfilled in most practical scenarios and the quality of the results depends on the extent to which these constraints are violated. We investigate the robustness of our methods to violations of the assumptions in App.~D.5. Compared to classical methods such as PCA or ICA, the gradient-based optimization requires additional resources. However, the runtime strongly depends on hyperparameters such as the number of optimization steps. We also show that improved results can still be obtained with a time budget comparable to that of PCA and ICA in App.~D.5.}

\textbf{Choosing the right approach for concept discovery.} \added{Overall, our results show that unsupervised conceptual explanations with guarantees are only possible under specific sets of working assumptions. In this paragraph, we would like to briefly summarize them and give constructive suggestions on which approaches are best used when.
\begin{itemize}
    \item PCA works with uncorrelated components that are orthogonally encoded. We believe that the assumption of an orthogonal encoding is rather unlikely in practice, even if non-correlation was possible.
    \item ICA works well under independently distributed components but fails under dependent components. We suggest using this method when there is evidence that the ground truth components are independent.
    \item DMA does not require independence, but instead requires a disjoint mechanisms process and a faithful encoder to this process. This assumption is particularly suitable for image-generating processes.
    \item IMA does not require independence either, but requires a faithful encoder to an independent mechanisms process. The class of independent mechanism processes is larger and may also cover non-image processes \citep{gresele2021independent}. However, it requires the additional NEMR condition. We further empirically observed that the objective derived from IMA is harder to optimize for with SGD optimizers.
    \item Other approaches like ConceptSHAP \citep{yeh2019completeness} and ACE \citep{ghorbani2019towards} also come with certain restrictions: ACE requires a model that is scale and shift invariant, while ConceptSHAP is specifically designed for computer vision models with spatial feature maps such as ResNet \citep{He2016}. Further, these approaches come without formal guarantees.
\end{itemize}
}

\section{Conclusion}
\textbf{Summary.} We proposed identifiability as a minimal requirement for concept discovery algorithms. Furthermore, we suggested the two functional paradigms of disjoint and independent mechanisms and proved that they can recover known components in visual embedding spaces. Extensive experiments confirmed that they offer substantial improvements on various generative and discriminative models and remain unaffected by distributional challenges.

\textbf{Outlook.} We believe our work to be a valuable step towards a rigorous formalization of concept discovery. However, the considered setup can be generalized in the future, for instance to components that are not linearly encoded. This would permit even stronger guarantees. While we have taken a technical perspective here, future work is required to investigate the effect of improved concepts on upstream explanations.

\begin{acknowledgements} % will be removed in pdf for 
The authors thank Frederik Träuble, Luigi Gresele, and Julius von Kügelgen for insightful discussions during early development of this project. This work was funded by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under Germany’s Excellence Strategy – EXC number 2064/1 – Project number 390727645. We thank the International Max Planck Research School for Intelligent Systems (IMPRS-IS) for supporting Michael Kirchhof.
\end{acknowledgements}

% References
\bibliography{references}

\end{document}

