\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.



%% Choose your variant of English; be consistent
%%%%% NEW MATH DEFINITIONS %%%%%

\usepackage{amsmath,amsfonts,bm}

% Mark sections of captions for referring to divisions of figures
\newcommand{\figleft}{{\em (Left)}}
\newcommand{\figcenter}{{\em (Center)}}
\newcommand{\figright}{{\em (Right)}}
\newcommand{\figtop}{{\em (Top)}}
\newcommand{\figbottom}{{\em (Bottom)}}
\newcommand{\captiona}{{\em (a)}}
\newcommand{\captionb}{{\em (b)}}
\newcommand{\captionc}{{\em (c)}}
\newcommand{\captiond}{{\em (d)}}

% Highlight a newly defined term
\newcommand{\newterm}[1]{{\bf #1}}


% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
\def\twofigref#1#2{figures \ref{#1} and \ref{#2}}
\def\quadfigref#1#2#3#4{figures \ref{#1}, \ref{#2}, \ref{#3} and \ref{#4}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Reference to two sections.
\def\twosecrefs#1#2{sections \ref{#1} and \ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections \ref{#1}, \ref{#2} and \ref{#3}}
% Reference to an equation, lower-case.
\def\eqref#1{equation~\ref{#1}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters. The tie (~) separates the word from the
% first number and keeps them on the same line; without it the output reads
% "chapters1--3".
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
\def\twoalgref#1#2{algorithms \ref{#1} and \ref{#2}}
\def\Twoalgref#1#2{Algorithms \ref{#1} and \ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
\def\twopartref#1#2{parts \ref{#1} and \ref{#2}}

\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
\def\1{\bm{1}}
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{\mathcal{D_{\mathrm{valid}}}}
\newcommand{\test}{\mathcal{D_{\mathrm{test}}}}

\def\eps{{\epsilon}}


% Random variables
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
% rm is already a command, just don't name any random variables m
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% Random vectors
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% Random vector i. This slot previously defined \rvu (with body i), which left
% \rvi undefined; \rvu itself is defined for the letter u further down the list.
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% Elements of random vectors
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% Random matrices
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% Elements of random matrices
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% Vectors
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vvv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}
\def\vkappa{{\bm{\kappa}}}
\def\vlambda{{\bm{\lambda}}}
\def\vgamma{{\bm{\gamma}}}
\def\vmu{{\bm{\mu}}}

% Elements of vectors
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% Matrix
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}
\def\mGamma{{\bm{\Gamma}}}
% Matrix "capital kappa". LaTeX provides no \Kappa command (the glyph coincides
% with the Latin K), so the previous body \bm{\Kappa} raised "Undefined control
% sequence" on use. An upright bold K matches the upright shape of the other
% capital Greek matrix symbols above.
\def\mKappa{{\mathbf{K}}}

% Tensor
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% Graph
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% Sets
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% Entries of a matrix
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% entries of a tensor
% Same font as tensor, without \bm wrapper
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% The true underlying data generating distribution
\newcommand{\pdata}{p_{\rm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\rm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\rm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\rm{model}}}
\newcommand{\Pmodel}{P_{\rm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\rm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\rm{encoder}}}
\newcommand{\pdecode}{p_{\rm{decoder}}}
\newcommand{\precons}{p_{\rm{reconstruct}}}

\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution

\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
\let\ab\allowbreak

\usepackage[dvipsnames]{xcolor}         % colors
\usepackage{hyperref}
\hypersetup{
colorlinks = true,
linkcolor = ForestGreen,
anchorcolor = blue,
citecolor = Blue,
filecolor = cyan,
menucolor = ForestGreen,
runcolor = cyan,
urlcolor = ForestGreen}

\usepackage{url}
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{amssymb}
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography

\usepackage{multicol, multirow}
%% Default packages I prefer to use for all my paper projects.

%\usepackage[round]{natbib} % has a nice set of citation styles and commands
\usepackage{amsmath, mathtools, amssymb}
\usepackage[capitalise]{cleveref}

\usepackage{subcaption, caption}
\usepackage{bm}
\usepackage{adjustbox}
\usepackage{tikz}
\usepackage{wrapfig}

\usetikzlibrary{calc,matrix,positioning,patterns,shapes.geometric}
%\DeclareMathOperator{\Var}{Var}
\DeclareMathOperator{\spann}{span}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\spur}{tr}
\DeclareMathOperator{\lnorm}{lnorm}
\DeclareMathOperator{\Det}{Det}
\DeclareMathOperator{\Covv}{Cov}
% cleveref setup: theorem-like environments and the names used when they are
% referenced.
\newtheorem{definition}{Definition}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{condition}{Condition}[section]
\newtheorem{example}{Example}[section]
\newtheorem{theorem}{Theorem}[section]
% The environment keeps its historical name `corrolary' so that existing
% \begin{corrolary} ... \end{corrolary} uses still compile; only the printed
% title is spelled correctly.
\newtheorem{corrolary}{Corollary}[section]
% \crefname{<type>}{<singular>}{<plural>} sets the names printed by \cref,
% \Crefname those printed by \Cref (e.g., at the start of a sentence).
\crefname{condition}{condition}{conditions}
\Crefname{condition}{Condition}{Conditions}
\crefname{example}{example}{examples}
\Crefname{example}{Example}{Examples}
\Crefname{section}{Section}{Sections} % Use small cref to print Sec., Fig. Tab.
\crefname{section}{Sec.}{Secs.} % Use small cref to print Sec., Fig. Tab.
\crefname{figure}{Fig.}{Figs.}
\Crefname{figure}{Figure}{Figures}
\Crefname{table}{Table}{Tables}
\crefname{table}{Tab.}{Tabs.}
\Crefname{equation}{Equation}{Equations}
\crefname{equation}{Eqn.}{Eqns.}
% `algocf' is the counter name used by the algorithm2e package.
\Crefname{algocf}{Algorithm}{Algorithms}
\crefname{algocf}{Alg.}{Algs.}
\Crefname{theorem}{Theorem}{Theorems}
\crefname{theorem}{Thm.}{Thms.}
% Math notation
\newcommand{\jacg}[0]{\bm{J}_{g}}
\newcommand{\jacf}[0]{\bm{J}_{f}}
\newcommand{\jacgstar}[0]{\bm{J}_{g^*}}
\newcommand{\jacfstar}[0]{\bm{J}_{f^*}}
\newcommand{\jacfprime}[0]{\bm{J}_{f^{'}}}
\newcommand{\jacphi}[0]{J_{\bm{\phi}}}
\newcommand{\jacgt}[0]{\tilde{J}_{g}}
\newcommand{\jacft}[0]{\tilde{J}_{f}}
\newcommand{\jacphit}[0]{\tilde{J}_{\phi}}
\newcommand{\jacfti}[1]{\tilde{J}^{(#1)}_{f}}
\newcommand{\trof}[1]{\spur\left( #1 \right)}

\newcommand*{\QED}{\null\nobreak\hfill\ensuremath{\square}}
%\usepackage{graphicx}
\usepackage{tikz}
\usepackage{esvect}
\usepackage[ruled]{algorithm2e}
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 

%%% HELPER CODE FOR DEALING WITH EXTERNAL REFERENCES in overleaf
\usepackage{xr}
\makeatletter
% \addFileDependency{<file>}
%   Prints "(<file>)" to the terminal/log, appends <file> to LaTeX's internal
%   file list via \@addtofilelist, and prints "No file <file>." if the file
%   cannot be found. Presumably the "(<file>)" line is what lets the build tool
%   notice the dependency and re-run when it changes -- TODO confirm for the
%   build system in use.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

% \myexternaldocument{<basename>}
%   Imports the labels of <basename>.aux through xr's \externaldocument. The
%   optional argument [main] is a prefix: a label `foo' of the external
%   document is referenced here as `mainfoo'. Both <basename>.tex and
%   <basename>.aux are then registered with \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument[main]{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
%%% END HELPER CODE

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \added{<text>}: prints <text> unchanged (presumably a leftover revision
% marker -- confirm before removing).
\newcommand{\added}[1]{#1}
% \swap[<sep>]{<a>}{<b>}: prints <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% \res{<mean>}{<spread>}: typesets "<mean> ± <spread>" as inline math.
% NOTE(review): \small is a text-mode size switch; LaTeX issues a font warning
% when it is used inside math mode -- confirm the intended effect.
\newcommand{\res}[2]{$#1 \pm \small#2$}
% \posimp / \bposimp: colour the argument ForestGreen (plain / bold).
% Relies on xcolor being loaded with the dvipsnames option.
\newcommand{\posimp}[1]{\textcolor{ForestGreen}{#1}}
\newcommand{\bposimp}[1]{\textbf{\textcolor{ForestGreen}{#1}}}
% \negimp: colours the argument BrickRed.
\newcommand{\negimp}[1]{\textcolor{BrickRed}{#1}}
% \bres: like \res, but with both numbers set in bold math (\bm).
\newcommand{\bres}[2]{$\bm{#1} \pm \bm{#2}$}
% \ures: "<mean> ± <spread>" in inline math, underlined.
\newcommand{\ures}[2]{\underline{$#1 \pm #2$}}
% \wrapb{<top>}{<bottom>}: stacks two entries in a single centred tabular
% column without column padding, e.g., for two-line table cells.
\newcommand{\wrapb}[2]{\begin{tabular}[c]{@{}c@{}}#1 \\#2 \end{tabular}}

\title{When are Post-hoc Conceptual Explanations Identifiable? (Supplementary material)}

\author[1,2,$\dagger$]{\href{mailto:tobias.leemann@uni-tuebingen.de}{\textcolor{black}{Tobias Leemann}}{}}
\author[1,$\dagger$]{\href{mailto:michael.kirchhof@uni-tuebingen.de}{\textcolor{black}{Michael Kirchhof}}{}}
\author[1,2]{Yao Rong}
\author[2]{Enkelejda Kasneci}
\author[2]{Gjergji Kasneci}
% Add affiliations after the authors
\affil[1]{%
    University of Tübingen\\
    Tübingen, Germany
}
\affil[2]{%
    Technical University of Munich\\
    Munich, Germany
}
\affil[$\dagger$]{%
    equal contribution
}


\myexternaldocument{leemann_396}
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\appendix

\setcounter{page}{11}
\setcounter{table}{3}
\setcounter{figure}{6}
\setcounter{equation}{1}

\section{Additional Related Work}
\paragraph{Orthogonality constraints and disentanglement for generative models.}  In the context of generative adversarial networks (GANs) \citep{goodfellow2014generative}, the problem of analyzing and discovering interpretable directions has been studied recently by \citet{voynov2020unsupervised}. \citet{ren2022learning} propose a contrastive approach to discover interpretable directions using pretrained generative models. \citet{wei2021orthogonal} have proposed an orthogonality regularization of the Jacobian, which resulted in more interpretable generative abilities. \citet{ramesh2018spectral} constrain the right-singular vectors of a generator Jacobian to be unit directions, which corresponds to column-wise orthogonal generator Jacobians.  We go beyond these works by providing rigorous results on identifiability and by extending the scope to encoder-only models.

\section{Proofs} \label{sec:app_proofs}

\subsection{Rotations Destroy Orthogonality Lemma}
We start by first proving an auxiliary lemma. We show that orthogonality of Jacobians, i.e., $\bm{J}_f\bm{J}_f^\top=\bm{S}$ with a diagonal matrix $\bm{S}$ will be destroyed in the general case when a rotation $\bm{R}$ is applied, such that $\bm{J}_{Rf}\bm{J}_{Rf}^\top = \bm{R}\bm{J}_f\bm{J}_f^\top\bm{R}^\top = \bm{R}\bm{S}\bm{R}^\top$ is not a diagonal matrix anymore.

\begin{lemma}[Rotations destroy orthogonality patterns.]
Let $\bm{S} \in \mathbb{R}^{K\times K}$ be a diagonal matrix, $\bm{S}=\diag{\left(\bm{s}\right)}$ with diagonal entries $\bm{s} > 0$ and $s_i \neq s_j, \forall i\neq j$, i.e., all diagonal entries of $\bm{S}$ are different and positive. Let $\bm{R} \in \mathbb{R}^{K\times K}$ be any rotation matrix with $\bm{R}^\top\bm{R}=\bm{I}$. If $\bm{R}\bm{S}\bm{R}^\top$ is a diagonal matrix, $\bm{R}$ must be a signed permutation matrix (a permutation matrix where entries can be $\pm1$). \label{lem:helperorthogonality}
\end{lemma}

\textbf{Proof.} %Suppose there is an $\bm{S}$ with $\bm{s} \geq 0$ and $ \forall i\neq j: s_i \neq s_j$ and a rotation matrix $\bm{R}$ with $\bm{R}\bm{S}\bm{R}^\top = \diag\left(\lambda_1, \ldots \lambda_K \right)$.
With $\bm{R}\bm{S}\bm{R}^\top = \diag\left(\lambda_1, \ldots, \lambda_K \right)$, we have for each unit vector $\bm{e}^{(i)}$, $i = 1, \ldots, K$, that
\begin{align}
    \bm{R}\bm{S}\bm{R}^\top \bm{e}^{(i)} = \lambda_i \bm{e}^{(i)}\,\,.
\end{align}
We can represent $\bm{R}$ by its rows, $\bm{R}=\left[\bm{r}_1,\ldots, \bm{r}_K\right]^\top$ where each $\bm{r}_i\in \mathbb{R}^K$. In this notation, $\bm{R}^\top \bm{e}^{(i)} = \bm{r}_i$, i.e., multiplication of the transpose with a unit vector will select the row $\bm{r}_i$. This results in \begin{align}
    \bm{R}\bm{S}\bm{r}_i = \lambda_i \bm{e}^{(i)}.
\end{align}
Because $\bm{R}$ is orthogonal, i.e., $\bm{R}^\top\bm{R}=\bm{I}$, we can left-multiply the equation by $\bm{R}^\top$ to cancel $\bm{R}$ on the left-hand side. Using $\bm{R}^\top \bm{e}^{(i)} = \bm{r}_i$ again, we arrive at
\begin{align}
    \bm{S}\bm{r}_i = \lambda_i \bm{r}_i.
\end{align}
This implies that all $\bm{r}_i$ are eigenvectors of the matrix $\bm{S}$ with the eigenvalues $\lambda_i$. By the initial assumption, $\bm{S}$ is a diagonal matrix with all-different entries $s_i$. The eigenvectors of such a matrix are only scaled unit vectors $\bm{e}^{(j)}$. Thus, each $\bm{r}_i$ will be a scaled unit-vector. The constraint of $\bm{R}$ being an orthogonal matrix enforces the $\bm{r}_i$ to be mutually different unit vectors with length $1$. Therefore, $\bm{R}$ necessarily has the form of a signed permutation. \hfill $\square$

Note that the converse is also true. If $\bm{R}$ is a signed permutation matrix, $\bm{R}\bm{S}\bm{R}^\top$ will be diagonal.

\subsection{PCA Ensures Identifiability (Theorem 3.1)}
\label{sec:proof_indep}

\begin{theorem}[PCA identifiability,~Theorem 3.1]
     Let $z_k, k=1, \dotsc, K,$ be uncorrelated random variables with non-zero and unequal variances. Let $\bm{e} = \bm{D}\bm{z}$, where $\bm{D} \in \mathbb{R}^{K \times K}$ is an orthonormal matrix. If an orthonormal post-hoc transformation $\bm{M}\in \mathbb{R}^{K \times K}$  results in mutually uncorrelated components $(z'_1, \dotsc, z'_K) = \bm{z}' = \bm{M}\bm{e}$, then $\bm{M}\ve = \bm{P}\bm{S} \vz$, where $\mP \in \mathbb{R}^{K \times K}$ is a permutation and $\bm{S} \in \mathbb{R}^{K \times K}$ is a diagonal matrix where $|s_{ii}|=1$ for $i = 1,\ldots, K$.
\end{theorem}

\textbf{Proof.} Since both $\mM$ and $\mD$ are orthogonal, $\bm{M}\bm{D}= \bm{Q}$ is also orthogonal. 
Our post-hoc transformation resulted in uncorrelated components, i.e., $\Covv(\bm{Q}\vz) = \bm{Q}\Covv(\vz)\bm{Q}^\top = \bm{\Gamma}$, where $\bm{\Gamma}$ is some diagonal matrix. Thus, $\bm{Q}\Covv(\vz)\bm{Q}^\top$ is diagonal, too.  We also know that our original components are uncorrelated with unequal variances, i.e., $\Covv(\vz) = \diag(\bm{s})$ with $\bm{s} > 0$ and $s_i \neq s_j, \forall i\neq j$. Our helper \Cref{lem:helperorthogonality} then implies that $\bm{Q}$ must be a signed permutation. Thus, $\vz' := \bm{M}\ve = \mM \mD \vz = \mQ \vz =: \bm{P}\bm{S} \vz$, where $\mP \in \mathbb{R}^{K \times K}$ is a permutation and $\bm{S} \in \mathbb{R}^{K \times K}$ is a diagonal matrix where $|s_{ii}|=1$ for $i = 1,\ldots, K$. $\hfill \square$

\subsection{ICA Ensures Identifiability (Theorem 3.2)}

\begin{theorem}[ICA identifiability,~Theorem 3.2]
    Let $z_k, k=1, \dotsc, K,$ be independent random variables with non-zero variances where at most one component is Gaussian. Let $\bm{e} = \bm{D}\bm{z}$, where $\bm{D} \in \mathbb{R}^{K \times K}$ has full rank.  
    If a post-hoc transformation $\bm{M} \in \mathbb{R}^{K\times K}$ results in mutually independent components $(z'_1, \dotsc, z'_K) = \bm{z}' = \bm{M}\bm{e}$, then $\bm{M}\ve = \bm{P}\bm{S} \vz$, where $\mP \in \mathbb{R}^{K \times K}$ is a permutation and $\bm{S} \in \mathbb{R}^{K \times K}$ is a scaling matrix.
\end{theorem}

\textbf{Proof.} (1) We know that $\vz' = \mM\mD\vz =: \bm{C}' \vz$. Let us start with an additional assumption that both $\vz'$ and $\vz$ have unit variances. Then, by \citet[App.~A.1]{comon1994independent}, $\bm{C}'$ must be orthonormal. 

%This results is concerned with the set of transformations $\vz^\prime = \mM\mD \vz$ where $\bm{z}^\prime$ preserves independence from  $\bm{z}$. Define $\mC^\prime=\mM\mD$. We start with the case of $\Covv(\vz')=\mI$ and $\Covv(\vz)=\mI$. 
%\citet[App. A .1]{comon1994independent} also showed that each transformation $\bm{C}^\prime$ that preserves non-correlation of $\mC^\prime\vz$ must be orthonormal, i.e., $\mC^{\prime\top}\mC^\prime=\bm{I}$.

Let us recall the following result:
\begin{theorem}[Theorem 11 from \citet{comon1994independent}]
Let $\bm{x}$ be a vector with independent
components, of which at most one is Gaussian, and
whose densities are not reduced to a point-like mass.
Let $\bm{C}$ be an orthogonal $K \times K$ matrix and $\bm{z}$ the
vector $\bm{z} = \bm{C}\bm{x}$. Then the following three properties are equivalent:
\begin{enumerate}
    \item The components $z_i$ are pairwise independent.
    \item The components $z_i$ are mutually independent.
    \item $\bm{C} = \bm{S}\bm{P}$ where $\bm{S}$ is diagonal, $\bm{P}$ is a permutation.
\end{enumerate}
\end{theorem}

Since $\vz$ fulfills the conditions of this theorem and $\bm{z}'$ has mutually independent entries, we know that $\bm{C}' = \bm{S}\bm{P}$.

(2) We now allow arbitrary variances, i.e., $\Covv(\vz')=\bm{\Lambda}$ and $\Covv(\vz)=\bm{\Gamma}$ where both covariance matrices are positive diagonal matrices. $\vz^\prime = \mM\mD \vz = \bm{C}^\prime \vz = \bm{\Lambda}^{1/2} \bm{\Lambda}^{-1/2} \bm{C}^\prime \bm{\Gamma}^{1/2} \bm{\Gamma}^{-1/2} \vz =: \bm{\Lambda}^{1/2} \bm{C}^{\prime\prime}  \bm{\Gamma}^{-1/2} \vz$. This is equivalent to $(\bm{\Lambda}^{-1/2} \bm{z}') = \bm{C}^{\prime\prime}  (\bm{\Gamma}^{-1/2} \vz)$. These rescaled random vectors both have unit variances, so (1) implies that $\bm{C}^{\prime\prime} = \bm{S'}\bm{P'}$. We can plug this back into the previous equation and see that $\vz^\prime = \bm{\Lambda}^{1/2} \bm{C}^{\prime\prime}  \bm{\Gamma}^{-1/2} \vz = \bm{\Lambda}^{1/2} \bm{S'}\bm{P'}  \bm{\Gamma}^{-1/2} \vz =: \bm{P'} \bm{S''} \vz$. Thus, $\vz' = \bm{M} \ve = \mM \mD \vz= \bm{P'}\bm{S''} \vz$, where $\mP' \in \mathbb{R}^{K \times K}$ is a permutation and $\bm{S''} \in \mathbb{R}^{K \times K}$ is a scaling matrix. $\hfill \square$


\subsection{Transfer Lemma} \label{sec:prooftransfer}
DMA and IMA are based on structures in the Jacobian of the generative process. To be able to use them in the encoder and ultimately discover concepts, we first show that if an encoder mirrors the behavior of the generative process, up to a rotation and scale, its Jacobians must also mirror the Jacobians of the generative process.

\begin{lemma}[Transfer lemma] 
\label{lem:transferlemma}
Let $\vf$ be a faithful encoder for the generative process $\vg$ and further $\vf \circ \vg (\vz) = \mP \mS \vz$ $\forall \vz \in \mathcal{Z}$ where $\mP \in \mathbb{R}^{K \times K}$ is a permutation and $\mS \in \mathbb{R}^{K \times K}$ is a diagonal matrix. Then $\mJ_{\vf}(\vg(\vz)) = \mP' \mS' \mJ_\vg(\vz)^\top$  where $\mP' \in \mathbb{R}^{K \times K}$ is a permutation and $\mS' \in \mathbb{R}^{K \times K}$ is a diagonal matrix. 
\end{lemma}

\textbf{Proof.} Let $\vz \in \mathcal{Z}$ be arbitrary. $(\vf \circ \vg)(\vz) = \mP\mS\vz$ implies $\mJ_{\vf}(\vg(\vz)) \mJ_\vg(\vz) = \mP\mS$. Since $\vf$ is faithful to $\vg$,  $\mS$ has full rank, i.e., $\mS = \text{diag}(\alpha_1, \dotsc, \alpha_K)$ with $\alpha_k \in \mathbb{R}_{\neq 0}, k = 1, \dotsc, K$.

Now, let us write $\mJ_{\vf}(\vg(\vz)) = [\vvv_1, \dotsc, \vvv_K]^\top$ with $\vvv_i \in \mathbb{R}^{L}$. Similarly, we can write $\mJ_\vg(\vz) = [\vw_1, \dotsc, \vw_K]$ with $\vw_i \in \mathbb{R}^L, i = 1, \dotsc, K$.

Let us focus on an individual row of $\mJ_{\vf}$, i.e., let $k \in \{1, \dotsc, K\}$ be a fixed index of a row. Since $\mJ_{\vf}(\vg(\vz)) \mJ_\vg(\vz) = \mP \mS$ and $\mP$ is a permutation matrix with exactly one $1$ per row, there is precisely one column index $k'$ such that the entry in the $k$-th row and $k'$-th column of $\mP\mS$ is non-zero.
This setup allows drawing certain conclusions about the vector $\vvv_k$. Let $j = 1, \dotsc, K$ denote an arbitrary column of $\mP\mS$. Then,

(i) if $j = k'$, then $\vvv_k^\top \vw_{k'} = \alpha_{k'} \neq 0$. In consequence, $\vvv_k \neq 0$, $\vw_{k'} \neq 0$ and so we can decompose $\vvv_k = \va_k + \vb_k$, where $\va_k \in \text{span}(\{\vw_{k'}\}) \setminus \{0\}$ and $\vb_k \in \text{span}(\{\vw_{k'}\})^\bot$, where $^\bot$ denotes the orthogonal complement. Because $\text{span}(\{\vw_{k'}\}) = \left\{ \mu \vw_{k'}\middle| \mu \in \mathbb{R} \right\}$, we know that $\va_k=\frac{\alpha_{k'}}{\|\vw_{k'}\|^2_2}\vw_{k'}$.

(ii) if $j \neq k'$, then $\vvv_k^\top \vw_{j} = 0$. With (i), it follows that $\vb_k \in \text{span}\left(\{\vw_1, \dotsc, \vw_K\}\right)^\bot = \text{span}(\mJ_\vg(\vz))^\perp$. 
% = \text{ker}(\mJ^\top_\vf(\vz))$, because $\vf$ is a faithful encoder to $\vg$.

Since $\vf$ is faithful to $\vg$, we know that for each $\mathbf{c} \in \text{span}(\mJ_\vg(\vz))^\perp$, $\jacf(\vg(\vz))\mathbf{c} = \mathbf{0}$ and therefore $\jacf(\vg(\vz))\vb_k =\mathbf{0}$. This demands that the $k$-th component of the product is also $0$, i.e., $\vvv_k^\top\vb_k = (\va_k+\vb_k)^\top\vb_k=\va_k^\top\vb_k+ \vb_k^\top\vb_k=0$. By design, $\va_k$ and $\vb_k$ are orthogonal, so $\va_k^\top\vb_k = 0$ and thus $\vb_k^\top\vb_k = \|\vb_k\|_2^2 = 0$, from which it immediately follows that
$\vb_k = \vzero$.
Hence, $\vvv_k = \va_k + \vzero=\frac{\alpha_{k'}}{\|\vw_{k'}\|^2_2}\vw_{k'} + \vzero$ for our selected row $k$. Globally, this means $\mJ_{\vf}(\vg(\vz)) = \mP'\mS' \mJ_\vg(\vz)^\top$, with some scaling matrix $\mS'$ and permutation matrix $\mP'$. \hfill $\square$

\subsection{Disjoint Mechanisms Ensure Identifiability (Theorem 3.3)}
\label{sec:proofident}
\begin{theorem}[Identifiability under DMA,~Theorem 3.3] %\label{eq:ident_full}
Let $\vg$ have disjoint mechanisms and $\vf$ be a faithful encoder to $\vg$. %Let $\bm{e} = \bm{D}\bm{z}$. 
If a full-rank post-hoc transformation $\bm{M} \in \mathbb{R}^{K\times K}$ results in disjoint rows in the Jacobian $\mM \mJ_{\vf}(\vg(\vz))$ for some $\vz \in \mathcal{Z}$, then $\bm{M}\ve = \bm{P}\bm{S} \vz$, where $\mP \in \mathbb{R}^{K \times K}$ is a permutation and $\bm{S} \in \mathbb{R}^{K \times K}$ is a scaling matrix.
\end{theorem}

\textbf{Proof.} We know that $\vf \circ \vg = \mD$ and $\mD$ has full rank. Since $\bm{M}$ also has full rank, there exists a non-singular matrix $\bm{E}'$ such that $\bm{M} = \bm{E}' \bm{D}^{-1}$. We can rewrite $\bm{E}' = \bm{S} \bm{E}$, where $\bm{E}$ has normalized rows and $\bm{S}$ is a diagonal matrix. 

Since $\bm{D}^{-1} \vf \circ \vg = \bm{I}$ and $\vg$ has disjoint mechanisms (DMA), we can apply the transfer lemma (\Cref{lem:transferlemma}). It implies that $\bm{D}^{-1} \jacf(\vg(\vz))$ has disjoint rows. 

Suppose now for contradiction that $\bm{E}$ was not a permutation matrix. This means that without loss of generality the first row must contain at least two columns whose entries are not equal to zero. Since $\bm{E}$ has full rank, there must be a second row with a non-zero entry in at least one of these columns. Since $\bm{D}^{-1} \mJ_\vf(\vg(\vz_a))$ has disjoint rows, $\bm{S} \bm{E} \bm{D}^{-1} \mJ_\vf(\vg(\vz_a)) = \bm{M} \mJ_\vf(\vg(\vz_a))$ can no longer have disjoint rows. This contradicts the assumption. Hence, $\bm{E}$ must be a permutation matrix $\bm{P}$. This gives $\bm{z}' = \mM \ve = \bm{P} \bm{S} \bm{D}^{-1} \bm{D} \vz = \bm{P} \bm{S} \vz$. \hfill $\square$

\subsection{Independent Mechanisms ensure Identifiability (Theorem 3.4)}
\begin{theorem}[Identifiability under IMA,~Theorem 3.4] %\label{thm:ident_ima}
Let $\vg$ adhere to IMA. Let $\vf$ be a faithful encoder to $\vg$. 
%Suppose there exists an MDC $\vf^* = \mM \vf$, where $\mM \in \mathbb{R}^{K \times K}$ has full rank. 
Suppose we have obtained an $\vf'= \mM \vf$ with a full-rank $\mM \in \mathbb{R}^{K \times K}$ and orthogonal rows in its Jacobian  $\jacfprime(\vg(\vz))$, i.e.,  $\jacfprime(\bm{g}(\bm{z}))\jacfprime(\bm{g}(\bm{z}))^\top = \bm{\Sigma}(\bm{z})$ where $\bm{\Sigma}(\bm{z})$ is diagonal. If additionally, for two points $\vz_a, \vz_b \in \mathcal{Z}$,
the ratios $\gamma_i \coloneqq \frac{\Sigma_{ii}(\vz_b)}{\Sigma_{ii}(\vz_a)}$ satisfy $\gamma_i \neq \gamma_j$ for all $i,j = 1, \dots, K$ with $i\neq j$ (NEMR condition), then $\bm{M}\ve = \bm{P}\bm{S} \vz$, where $\mP \in \mathbb{R}^{K \times K}$ is a permutation and $\bm{S} \in \mathbb{R}^{K \times K}$ is a scaling matrix.%. $f'$ is an MDC and has orthogonal rows in its Jacobian for all $z \in \mathcal{Z}$  (orthogonal attributions).
\end{theorem}

\textbf{Proof.} We know that $\vf \circ \vg = \mD$ and $\mD$ has full rank. Since $\bm{M}$ also has full rank, there exists a non-singular matrix $\bm{E}$ such that $\bm{M} = \bm{E} \bm{D}^{-1}$. We will now show that the solution set of $\bm{E}$ can be constrained to be a permutation and scaling operation in three steps.

(1) $\mJ_{\vf^\prime}$ is orthogonal, i.e., $\bm{\Sigma}(\vz_a) = (\bm{M}\jacf(\vg(\vz_a))) (\bm{M}\jacf(\vg(\vz_a)))^\top = ( \bm{E} \bm{D}^{-1} \jacf(g(z_a))) (\bm{E} \bm{D}^{-1} \jacf(g(z_a)))^\top = \bm{E} (\bm{D}^{-1} \jacf(g(z_a))) (\bm{D}^{-1} \jacf(g(z_a)))^\top \bm{E}^\top$. Since $\bm{D}^{-1} \vf \circ \vg = \bm{I}$ and $\vg$ adheres to IMA, we can apply the transfer lemma (\Cref{lem:transferlemma}) and know that $\bm{D}^{-1} \jacf(g(z_a))$ must have orthogonal rows, i.e., $(\bm{D}^{-1} \jacf(g(z_a))) (\bm{D}^{-1} \jacf(g(z_a)))^\top = \bm{\Gamma}_a$, where $\bm{\Gamma}_a$ is some diagonal matrix with full rank. Substituting this back into the previous term, $\bm{\Sigma}(\vz_a) = \bm{E}  \bm{\Gamma}_a  \bm{E}^\top$.
The same holds for $\vz_b$, i.e., $\bm{\Sigma}(\vz_b) = \bm{E}  \bm{\Gamma}_b  \bm{E}^\top$.

%In other words, $\bm{E}$ is restricted to be from the set of solutions %the ones that keep %\begin{align}
%\bm{E}  \bm{\Sigma}_a  \bm{E}^\top = \bm{\Lambda} = %\diag (\bm{\lambda})
%\label{eqn:classEproblem}
%\end{align}
%for some $\bm{\lambda} \geq 0$. For any non-singular $\bm{E}$ and $\bm{\Sigma}_a ^*>0$ (positive def.), $\bm{\Lambda} = \bm{E} \bm{\Sigma}_a ^* \bm{E}^\top = \bm{E} \bm{\Sigma}_a^{*1/2}\bm{\Sigma}_a^{*1/2} \bm{E}^\top$ and the matrix is thus positive definite (this follows from the SVD). Thus, requiring $\bm{\lambda}>0$ is not a restriction. We define the set of solutions to this equation as 
%$\mathcal{E}_{\bm{\Sigma}_a } := \left\{\bm{E} \in \mathbb{R}^{K \times K} \middle| ~\mE \text{ non-singular with } \bm{E}  \bm{\Sigma}_a  \bm{E}^\top = \diag (\bm{\kappa}), \vkappa \in \mathbb{R}^K > 0 \right\}$. 

(2) We've seen in (1) that both $\bm{\Sigma}(\vz_a)$ and $\bm{\Gamma}_a$ are the results of quadratic forms. Hence, their diagonal entries are all non-negative, and strictly positive because the matrices have full rank. Thus we can define $\bm{Q} := \bm{\Sigma}(\vz_a)^{-1/2} \bm{E} \bm{\Gamma}_a^{1/2}$. Due to (1), $\bm{Q} \bm{Q}^\top = \bm{\Sigma}(\vz_a)^{-1/2} \bm{E} \bm{\Gamma}_a \bm{E}^\top \bm{\Sigma}(\vz_a)^{-1/2} = \bm{I}$, i.e., $\bm{Q}$ is orthogonal. Solving the definition for $\bm{E}$ gives $\bm{E} = \bm{\Sigma}(\vz_a)^{1/2} \bm{Q}\bm{\Gamma}_a^{-1/2}$. In other words, $\bm{E}$ must be a (twice) scaled orthogonal matrix. 

(3) From (1) we get that
\begin{align}
    \bm{\Sigma}(\vz_a) \bm{\Sigma}(\vz_b)^{-1} &= \bm{E} \bm{\Gamma}_a \bm{E}^\top (\bm{E} \bm{\Gamma}_b \bm{E}^\top)^{-1} \\
    \bm{\Sigma}(\vz_a) \bm{\Sigma}(\vz_b)^{-1} &= \bm{E} \bm{\Gamma}_a \bm{\Gamma}_b^{-1} \bm{E}^{-1} \\
    \bm{E}^{-1} \bm{\Sigma}(\vz_a) \bm{\Sigma}(\vz_b)^{-1} \bm{E} &= \bm{\Gamma}_a \bm{\Gamma}_b^{-1} 
\end{align}
Now we can insert the result from (2), using $\bm{E}^{-1} = \bm{\Gamma}_a^{1/2} \bm{Q}^\top \bm{\Sigma}(\vz_a)^{-1/2}$:
\begin{align}
     \bm{\Gamma}_a^{1/2} \bm{Q}^\top \bm{\Sigma}(\vz_a)^{-1/2} \bm{\Sigma}(\vz_a) \bm{\Sigma}(\vz_b)^{-1} \bm{\Sigma}(\vz_a)^{1/2} \bm{Q} \bm{\Gamma}_a^{-1/2} &= \bm{\Gamma}_a \bm{\Gamma}_b^{-1} \\
     \bm{Q}^\top \bm{\Sigma}(\vz_a)^{-1/2} \bm{\Sigma}(\vz_a) \bm{\Sigma}(\vz_b)^{-1} \bm{\Sigma}(\vz_a)^{1/2} \bm{Q}  &= \bm{\Gamma}_a^{-1/2} \bm{\Gamma}_a \bm{\Gamma}_b^{-1} \bm{\Gamma}_a^{1/2} \\ 
    \bm{Q}^\top \bm{\Sigma}(\vz_a) \bm{\Sigma}(\vz_b)^{-1} \bm{Q}  &= \bm{\Gamma}_a \bm{\Gamma}_b^{-1}
\end{align}
Due to the NEMR condition, $\bm{\Sigma}(\vz_a) \bm{\Sigma}(\vz_b)^{-1}$ is a diagonal matrix with unequal positive entries. We can thus apply \Cref{lem:helperorthogonality} which implies that $\bm{Q} = \bm{P} \bm{S}$ where $\bm{P}$ is a permutation and $\bm{S}$ a diagonal matrix. Inserting this back into (2) gives $\bm{E} = \bm{\Sigma}(\vz_a)^{1/2} \bm{Q}\bm{\Gamma}_a^{-1/2} = \bm{\Sigma}(\vz_a)^{1/2} \bm{P} \bm{S} \bm{\Gamma}_a^{-1/2} = \bm{P} \bm{S}'$, where $\bm{S}'$ is a diagonal matrix. Hence, $\bm{z}' = \mM \ve = \bm{P} \bm{S}' \bm{D}^{-1} \bm{D} \vz = \bm{P} \bm{S}' \vz$.
\hfill$\square$

In the next section, we discuss how the proofs can be turned into analytical solutions to discover the ground truth components.
\subsection{Analytical Solutions to Concept Discovery}
\label{sec:app_analyticalsolutions}
\subsubsection{Disjoint Mechanisms}
Under a perfect DMA process $\vg$ and a noiseless faithful encoder $\vf$ to $\vg$, we can compute an analytical solution for $\mM$ that will result in an encoder $\vf'=\mM\vf$ that is compliant with the \textit{DMA criterion}, i.e., disjoint rows in its Jacobian. Suppose we are provided with a gradient matrix of $\vf$, $\jacf(\vx_a) \in \mathbb{R}^{K \times L}$. We propose the following steps:
\begin{enumerate}
    \item Select a submatrix  $\mJ_{reg} \in \mathbb{R}^{K \times K}$ of $K$ linearly independent columns in $\jacf(\bm{x}_a)$, such that  $\text{det}(\mJ_{reg}) \neq 0$.
    \item Compute and return $\mM = \mJ_{reg}^{-1}$
    \item This will result in $\vf^\prime= \mM\vf$ having disjoint rows in its Jacobian.
\end{enumerate}
\textbf{Proof}. $\jacf(\vx_a)$ must be of the form $\jacf(\vx_a) = \mH^{-1} \jacfstar(\bm{x}_a)$ for such an $\mM$ to exist, where $\jacfstar$ is the Jacobian of an encoder $\vf^{*}$ with disjoint rows and $\mH$ has full rank. $\mJ_{reg}$ can be written as $\mJ_{reg}=\mH^{-1} \mJ_{f^*, reg}$, where $\mJ_{f^*, reg}$ is a square submatrix of $\jacfstar$ with the same selected columns.
The submatrix $\mJ_{f^*, reg}$ will also be of full rank because it can be written as $\mH\mJ_{reg}$, the product of two full-rank matrices. Because of the DMA principle, $\mJ_{f^*, reg}$ again needs to be of the form $\mP \mS$ with one component active in each column. Furthermore, $\mM=\mJ_{reg}^{-1}=(\mH^{-1} \mP \mS)^{-1} = \mS^{-1}\mP^{-1}\mH$. As the inverses of scaling and permutation matrices have the same respective form again, $\mM\mH^{-1}=\mS^\prime\mP^\prime$. Therefore, $\vf^\prime = \mS^\prime\mP^\prime\vf^{*}$, maintaining its disjoint Jacobians.  

\subsubsection{Independent Mechanisms}
Suppose we are given matrices $\bm{\Sigma}(\vz_a)=\jacf(\bm{x}_a)\jacf(\bm{x}_a)^\top=\bm{D}\bm{\Gamma}_a \bm{D}^\top$ and $\bm{\Sigma}(\vz_b)=\jacf(\bm{x}_b)\jacf(\bm{x}_b)^\top$.
We then apply the following steps
\begin{enumerate}
    \item $\bm{U} = \text{inverse}(\text{cholesky}(\bm{\Sigma}(\vz_a)))$
    \item $\bm{V} = \text{eigenvectors}(\bm{U}\bm{\Sigma}(\vz_b)\bm{U}^\top)$
    \item return $\bm{H}=\bm{V}^\top\bm{U}$
\end{enumerate}
The first step implies that $\bm{U}^{-1}\bm{U}^{-\top}=\bm{\Sigma}(\vz_a)$ and that $\bm{U}\bm{\Sigma}(\vz_a) \bm{U}^\top=\bm{I}$. We have thus identified the matrix $\bm{E}$ from step (2) of the identifiability proof, which has the form $\bm{U}=\bm{\Lambda}^{1/2}\bm{Q}\bm{\Gamma}_a^{-1/2}\bm{M}$. In step two we compute $\bm{U}\bm{\Sigma}(\vz_b)\bm{U}^\top = \bm{\Lambda}^{1/2}\bm{Q}\bm{\Gamma}_a^{-1/2}\bm{\Gamma}_b\bm{\Gamma}_a^{-1/2}\bm{Q}^\top\bm{\Lambda}^{1/2} = \bm{V}\bm{R}\bm{V}^\top$, where $\bm{R}$ holds the eigenvalues. Accordingly, by left and right multiplying with $\bm{V}$, we observe that 
$(\bm{V}^\top\bm{U})\bm{\Sigma}(\vz_b)(\bm{V}^\top\bm{U})^\top = \bm{R}$, i.e., $(\bm{V}^\top\bm{U})$ solves the orthogonality problem for $\bm{\Sigma}(\vz_b)$. We can easily verify that $\bm{H}=\bm{V}^\top\bm{U}$ is also a solution for $\bm{\Sigma}(\vz_a)$ by computing $\bm{V}^\top\bm{U}\bm{\Sigma}(\vz_a) \bm{U}^\top\bm{V}=\bm{I}$. By the identifiability result, $\bm{H}=\bm{V}^\top\bm{U}=\bm{\Lambda}\bm{P}\bm{M}$, a scaled and permuted version of $\bm{D}^{-1}$, if the additional gradient ratio condition is fulfilled with $\vx_a$ and $\vx_b$.

\newcommand\mycommfont[1]{\footnotesize\ttfamily\textcolor{blue}{#1}}

\SetCommentSty{mycommfont}
\subsection{Algorithms}
\begin{algorithm}[t]
\DontPrintSemicolon
\caption{DMA concept discovery with SGD.\label{alg:da}}
\textbf{Input:} encoder $\vf$, images $\{\vx_n\}_{n=1, \dotsc, N}$\;
Jacobians $\gets$ Gradient($\vf$, $\{\vx_n\}_{n=1, \dotsc, N}$).detach()\;
$M \gets$ $K$-dim identity matrix\;
\For{$L$ \text{epochs}, $\mJ_\vf(\vx) \in$ \text{Jacobians}}{
    $\mU$ $\gets | \mM \mJ_\vf(\vx) |$\tcp*{No absolute value operation here for IMA}
    $\mU$ $\gets$ row-normalize $\mU$\;
    loss $\gets || \mU\,\mU^\top - \mI_K ||_F$\;
    loss.backwards() \tcp*{Optimize $M$}
}
\Return{$M$}
\end{algorithm}

\begin{algorithm}[t]
\DontPrintSemicolon
\caption{DMA concept discovery with SGD (determinant loss).\label{alg:dadet}}
\textbf{Input:} encoder $\vf$, images $\{\vx_n\}_{n=1, \dotsc, N}$\;
Jacobians $\gets$ Gradient($\vf$, $\{\vx_n\}_{n=1, \dotsc, N}$).detach()\;
$M \gets$ $K$-dim identity matrix\;
\For{$L$ \text{epochs}, $\mJ_\vf(\vx) \in$ \text{Jacobians}}{
    $\mU$ $\gets | \mM \mJ_\vf(\vx) |$\tcp*{No absolute value operation here for IMA}
    $\mV$ $\gets \mU\mU^\top$\;
    loss $\gets \log\left(\prod_i V_{ii}\right) - \log\text{det}\left(\mV\right)$\;
    loss.backwards() \tcp*{Optimize $M$}
}
\Return{$M$}
\end{algorithm}

% \begin{algorithm}[t]
% \DontPrintSemicolon
% \caption{Concept discovery via Clustering of analytical solutions.\label{alg:cluster}}
% \textbf{Input:} encoder $\vf$, images $\{\vx_n\}_{n=1, \dotsc, N}$, DMA/IMA flag\;
% Jacobians $\gets$ Gradient($\vf$, $\{\vx_n\}_{n=1, \dotsc, N}$).detach()\;
% \For{$i \in 1 \ldots N$, $\mJ_\vf(\vx_i) \in$ \text{Jacobians}}{
%     \uIf{DMA}{
%     $\mM_i \gets \text{computeAnalyticalDMA}(\mJ_\vf(\vx_i))$\;
%     }
%     \uIf{IMA}{
%     $\mM_i \gets \text{computeAnalyticalIMA}(\mJ_\vf(\vx_i), \mJ_\vf(\vx_{i+1}))$ \tcp*{IMA needs two Jacobians for $\mM$}
%     }
%     $\mM_i \gets \text{row-normalize}~\mM_i$\;
% }
% $\mC \in \mathbb{R}^{2K \times K}\gets \text{sphericalCluster}(\left\{\mM_i[j,:]~|~i \in 1\ldots N,~j \in 1, \ldots, K \right\}, \text{n\_clusters}=2K$) \tcp*{cluster rows in the $\mM_i$, return 2K cluster centers in rows of $\mC$}
% $\mM \in \mathbb{R}^{K\times K} \gets \text{mergePoles}(\mC)$ \tcp*{merge rows of opposite signs}
% \Return{$\mM$}
% \end{algorithm}

We present the SGD optimization for DMA in \Cref{alg:da}. Note that the algorithm for IMA optimization via SGD can be obtained by just omitting the absolute value operation in the line indicated by the comment. \added{For the smaller toy datasets, we experiment with a version of the algorithm that uses the determinant (see \Cref{alg:dadet}), similar to the objective put forward by \cite{gresele2021independent}}.
As the determinant operation is hard to backpropagate through and might be unstable, we recommend \Cref{alg:da} for real-world applications and observed no significant performance differences on the datasets studied in this work.

%We further derive a clustering strategy that allows aggregating many analytical solutions, that are subject to noise. It is provided in \Cref{alg:cluster} and directly based on our identifiability theorems. They state that a scaled and permuted version of the distortion $\mD$ will be identified. We can compute a batch of analytical solution for data samples (one Jacobian required for DMA, two for IMA). However, when facing noise, there is a need to aggregate and average different solutions, which can only be done if the same scaling and permutations are used. The scaling can be undone easily by applying a row-wise normalization. However, we still face permutations and possibly opposite signs in the remaining rows of the matrices. Therefore, we apply an implementation of the K-Means algorithm on a unit sphere.\footnote{We use the implementation from \url{https://github.com/jasonlaska/spherecluster}} By this approach we identify $2K$ cluster centers, corresponding to the directions with either positive or negative signs. We merge the directions with opposite signs by a greedy strategy: We always pick the most diametrical pair of vectors (lowest $\cos$-distance) to be merged and only return one of them as a row in the final matrix $\mM$. The two merged direction are removed from the list of cluster centers and the next directions to be merged will be selected until only $K$ directions are in the rows of the final matrix $\mM$ that is returned.


\subsection{Extending gradients to general attributions}
\label{sec:app_gradtoattrib}
We make an initial attempt to generalize our method, considering gradients as a simple form of attribution method. Intuitively, $\mJ_\vf = \nabla_{\vx} (\vf(\vx))$ contains input gradients (termed grad in the remainder) which can be thought of as a simple form of attribution for each component \citep{simonyan2013deep, shah2021doinput}. % that correspond to the importance of certain features for the final prediction. It has been shown that input gradients highlight important features, in particular for regularized models. 
Thus, on a more general level, our proposed approach optimizes for the disjointness of attributions. % \hl{motivated by visual compositionality: The image comprises several parts that are each controlled by one original component, and so their attributions, that show the most-influcenced regions, should be independent.}
Thus, we may use other forms of \textit{homogeneous attributions} in place of $\mJ_\vf$. These are local attribution methods $A_\vf: \mathbb{R}^L \rightarrow \mathbb{R}^{K \times L}$ for the encoder $\vf$ with $A_{\mM\vf}(\vx) = \mM A_\vf(\vx)$ that map an instance $\vx$ to a matrix of attributions for each latent dimension. Besides the above input gradients, this class contains other popular methods such as integrated gradients (IG) \citep{Sundararajan2017} and smoothed gradients (SG) \citep{Smilkov2017} (because these methods are linear in $\vf$). Thus, we can formulate a generalized \textit{disjoint attributions objective}:
\begin{align}
    \min_{\mM} ~& \sum_{n=1}^{N} \left|\left| \left| \overline{\mM A_f(x)} \right|\,\left|\overline{\mM A_f(x)}\right|^\top - I_K\right|\right|^2_F.
    \label{eqn:orthogonaloptattr}
\end{align}
We indicate the row-normalization operation by the overbar, and denote by $\lvert\cdot\rvert$ the element-wise absolute values operation. Without the absolute value operation this results in the \emph{independent attributions objective}.

\section{Experimental Details}
We report the most important implementation details for our experiments in this section. Please refer to the actual implementation available online\footnote{\url{https://github.com/tleemann/identifiable_concepts}} for full information. 
\label{sec:app_expdetails}
\subsection{Synthetic datasets}
We show random samples from both datasets in \Cref{fig:randomsamplestoy}. We provide an additional plot of the SGD behavior on the synthetic datasets in \Cref{fig:sgdcurves}. It shows that SGD exhibits a convergence behavior as predicted by our theory and comparable to the analytical solutions (shown in the main paper).


\begin{figure}[tb]
    \begin{subfigure}[b]{0.35\linewidth}\centering
    \includegraphics[width=\textwidth]{figures/gt_random.pdf}
    \caption{Random samples in the \texttt{FourBars} dataset.\newline~}
    \end{subfigure}
    \begin{subfigure}[b]{0.35\linewidth}
    \centering
    \includegraphics[width=\textwidth]{figures/random_bars.pdf}
    \caption{Random samples in the \texttt{ColorBar} dataset.\newline~}
    \end{subfigure}
    \begin{subfigure}[b]{0.24\linewidth}\centering
    \includegraphics[width=\textwidth]{figures/sgd_curves_bars.pdf}
    \caption{Disentangling gradients of synthetic datasets with SGD.\label{fig:sgdcurves}}
    \end{subfigure}
    \caption{Random samples drawn from the synthetic datasets (a,b). On the \texttt{FourBars} dataset, IMA fails to iterate towards a disentangled solution, because the non-equal magnitudes condition is violated. However, IMA converges on the \texttt{ColorBar} dataset, although at a slower rate (c)}
    \label{fig:randomsamplestoy}
\end{figure}

\subsection{Architectures} \label{sec:app_architectures} % TOBIAS
For the disentanglement models, we use the implementations provided by the open source library \texttt{disentanglement-pytorch}\footnote{\url{https://github.com/amir-abdi/disentanglement-pytorch}}. For the evaluation measures, we use the implementation of \texttt{disentanglement\_lib}\footnote{\url{https://github.com/google-research/disentanglement_lib}} with their respective default parameters. We use a simple encoder and decoder architecture, that consists of five and six feed-forward convolutional layers respectively and relies on the ReLU activation function.


\subsection{Correlated sampling} \label{sec:app_corrsampling}% MICHAEL
\begin{figure}[tb]
    \centering
    \includegraphics{figures/dens_trauble_0.2.pdf}
    \includegraphics{figures/dens_trauble_0.4.pdf}
    \includegraphics{figures/dens_trauble_1.pdf}
    \includegraphics{figures/dens_normal_0.pdf}
    \includegraphics{figures/dens_normal_0.5.pdf}
    \includegraphics{figures/dens_normal_0.85.pdf}
    \caption{Exemplary correlated densities of the components floor color and object scale under the correlated sampling setup of \citet{trauble2021disentangled} (a -- c) and with our Gaussian sampling (d -- f). The correlation strength is indicated on top. Purple denotes a low and yellow a high density.}
    \label{fig:app_sample}
\end{figure}
In this paper, we use two methods to introduce correlations between the ground truth components. Both methods rely on proportional resampling: We first draw a batch that has multiple times the final batch size (we use factors from 3-6 depending on the non-uniformity of the distribution), then compute the (non-normalized) probability of each sample under a given distribution over the component values, and then resample a final batch (with replacement) proportional to these probabilities. 

The two methods differ in the probability distribution assigned to the component values. The first setting (used in \cref{mainsec:exp_comp}) uses the approach of \citet{trauble2021disentangled}: As visualized in \cref{fig:app_sample}(a) to (c), we pick two components $z_1$ and $z_2$, create the grid of possible values, and then lay a diagonal line over this grid. Along this line, we set a normal distribution with a standard deviation $s$. A higher $s$ means that the distribution gives a higher probability to more component combinations of the grid, whereas a smaller $s$ is more restrictive. Mathematically, it is defined by \citet{trauble2021disentangled} as:
\begin{align}
    p(z_1, z_2) \propto \exp\left(-\frac{(z_1 - \alpha z_2)^2}{2s^2}\right),
\end{align}
where $\alpha = z_1^{\text{max}} / z_2^{\text{max}}$ brings the components to a same scale and $s$ is similarly normalized to the maximum values that $z_1$ and $z_2$ can take. The remaining components $z_i, i > 2,$ are marginalized out of this distribution and thus continue to be sampled uniformly at random.

This setting is limited to one pair of components and also introduces a non-Gaussian distribution over all components. To tackle these limitations and thus to make the distributional challenge harder, we use a different probability distribution in \cref{mainsec:exp_robust}. Here, we lay a normal distribution over \emph{all} components, i.e., $z \sim \mathcal{N}(\mu, \Sigma)$, where $\mu$ is centered in the middle of the possible values, i.e., $\mu = \frac{z^{\text{max}} - z^{\text{min}}}{2}$. $\Sigma$ is similarly normalized, since we decompose it into $\Sigma = \text{diag}(\sigma^2) \Gamma$. The vector $\sigma \in \mathbb{R}_{>0}^K$ gives standard deviations for each component via $\sigma^2 = \left(\frac{\mu + 0.5}{2}\right)^2$ such that the distribution stretches across the grid of possible values. Note that the $+0.5$ is because the values are assumed to be zero-indexed. $\Gamma$ is a correlation matrix with $1$ on its diagonal. In the first experiment in \cref{mainsec:exp_robust}, we correlate only one pair of variables and set their corresponding off-diagonal entries in $\Gamma$ to $\rho$. \cref{fig:app_sample} (d) to (f) show the corresponding marginal distributions of these components. In the second experiment, we fill $\Gamma$ with several correlations in the following order:
\begin{align}\begin{matrix}
 z_1 \\
 z_2 \\
 z_3 \\
 z_4 \\
 z_5 \\
 z_6 \\
\end{matrix}
    \begin{pmatrix}
     & 1 & 4 & 12 & 14 & 9 \\
     & & 11 & 5 & 10 & 6 \\
     & & & 3 & 8 & 15 \\
     & & & & 13 & 7 \\
     & & & & & 2\\
     & & & & & \\
    \end{pmatrix}
\end{align}
where the component order of the rows and columns is $z_1=$ \texttt{floor\_color}, $z_2=$ \texttt{background\_color}, $z_3=$ \texttt{object\_color}, $z_4=$ \texttt{object\_scale}, $z_5=$ \texttt{object\_shape}, $z_6=$ \texttt{orientation}. Here, it is important to ascertain that the covariance matrix stays positive definite. Thus, we start with $\rho = 0.7$, check if the lowest eigenvalue of $\Sigma$ is at least $0.2$, and if not, reduce $\rho$ by a factor of $0.9$ until the eigenvalue fulfills this property. While technically it would be enough to have the smallest eigenvalue anywhere above $0$, we found that $0.2$ helps in numerical stability, for instance when inverting the covariance matrix to compute the multivariate normal distribution density. 

%Plots Träuble + Normal,
% Resampling
% rho, s
%Wie matrix gefüllt (Reihenfolge + PD problem)

\subsection{Discriminative setup} % TOBIAS 
\label{sec:app_discrsetup} 
The decision tree that is used to generate the class distribution is shown in  \Cref{fig:decisiontree}. It relies on 4 (binarized) components. We trained a simple CNN classifier for this problem using the cross-entropy loss. In addition to the classification loss terms, we add a regularizer $\|\vz\|_2^2$, which constrains the latent codes to not grow arbitrarily large, during training. To create a realistic setup, we subsample the dataset to follow a normal distribution as shown in \cref{fig:app_sample}d. We also add label noise near the decision boundary: For objects which have an orientation that is nearly centered, we follow each branch (left/right) with a probability of 50\,\%. With increasing left-orientedness, the probability of following the left branch increases to almost 100\,\% in form of a sigmoid function over the actual orientation. We follow the same procedure for the remaining features. We train the classifier for 10k iterations at a batch size of 24 and verify that it reaches an accuracy close to the best-possible one taking the mislabeled samples into account. We add correlations by increasing the chance of the factors \emph{obj. color} and \emph{floor color} taking the same binary value. We use our disjoint attributions approach to find a matrix $H\in \mathbb{R}^{4\times 6}$ that should map the 6-dimensional latent space of the model to the four binary concepts that are used in the classification task. For the unit directions, we take the first four unit directions of the latent space, for PCA and ICA, we take the most prominent four components discovered for the evaluation with the four annotated ground truth concepts.

% welcher classifier architecture, loss

\begin{figure}[tb]
    \centering
    \includegraphics[width=\textwidth]{figures/tree.pdf}
    \caption{The decision tree setup that we use for the discriminative classification problem. Each image is assigned one out of eight class labels $y$ according to the following decision tree.}
    \label{fig:decisiontree}
\end{figure}

\subsection{Evaluation scores} % Tobias und Michael
Several scores to quantify disentanglement have been proposed in the literature and often emphasize a different aspect of disentanglement \citep{sepliarskaia2019evaluating}. Among the most common scores is the Disentanglement-Completeness-Informativeness score (DCI) by \citet{eastwood2018framework}.  In their work, they propose a metric to measure Disentanglement, that relies on training predictors $\hat{z}_j = f_j(e)$  to predict each individual ground truth component $z_j$ from the learned latent representation $e$. Furthermore, they compute normalized importance weights $P_{ik}$  that quantify how important learned component $e_i$ is for predicting the ground truth component $z_k$. The disentanglement metric computes a row-wise entropy over the $P$-matrix, which assigns a score of 1, if the learned component $e_i$ is useful for predicting only a single factor and a score of 0, if it is equally useful for predicting all factors. Other commonly used metrics include the Mutual Information Gap (MIG) \citep{chen2018isolating}, Separated Attribute Predictability (SAP) \citep{kumar2018variational} and the FactorVAE metric \citep{kim2018disentangling}. However, it is unclear which of these metrics (or if any) also provide useful results in the correlated setting \citep{trauble2021disentangled}. Therefore, to compute reliable evaluations, we train the model (and the post-processing methods such as PCA, ICA, IMA, DMA) on the correlated dataset, but compute the metrics on samples from the full, \emph{uncorrelated} datasets to avoid distortion in our scores. Träuble et al. noted that the DCI scores were able to discover entanglement between 2 variables \cite[Figure 11, Appendix]{trauble2021disentangled}, whereas most other metrics failed even in this case. 
Therefore, we mainly rely on this score for our experiments but also report results corresponding to \cref{mainsec:exp_comp} for the other scores that show a similar picture in this appendix (\cref{sec:app_furthermetricresults}).
% Review paper zu scores, warum nehmen DCI 

\subsection{CUB experiments} \label{sec:app_cubdetails} % Yao und Michael
CUB-200-2011 is a fine-grained dataset containing a total of 11,788 images of 200 bird species (5994 for training and 5794 for testing). We trained a ResNet-50 with two fully-connected (fc) layers (the second fc layer served as a bottleneck layer and took 2048-dim feature vectors as input and output 512-dim ones) on CUB for 100 epochs using a SGD optimizer with an initial learning rate of 0.001. The input images were center cropped to $224 \times 224$ pixels. Trained on a standard cross-entropy loss, the ResNet achieved a classification accuracy of on average 77.47\% on five random seeds, indicating proper training. After training the classifier, we applied our proposed method to discover components in the embedding space.

CUB provides no ground-truth components since it is a real-world dataset. It does, however, contain 312 attributes semantically describing the bird classes, e.g., wing color or beak shape. These attributes have no guarantee to be complete, but they offer $312$ interpretable components. % that are actually embedded almost linearly in the embedding space (i.e., a supervised linear regressor is able to predict them well). 
This allows for an attempt to quantify whether our discovered components are interpretable and meaningful by comparing whether they match some of these interpretable ones.

Formally, we are given a set of image feature embeddings $\{\ve_n\}_{n = 1, \dotsc, N}$, $\ve_n \in \mathbb{R}^L$ and a  matrix $\mH = (\vh_1, \dotsc, \vh_K) \in \mathbb{R}^{L\times K}$ that contains the directions of discovered components ($L=512$, $K=30$). A score $s_n^k$ of the $n$-th image for the $k$-th discovered component can be calculated by projecting the feature embeddings on that component direction, i.e., $s_n^k = \langle  \ve_n,  \vh_k  \rangle$. One pitfall is that $s_n^k$ can be negative, indicating, e.g., a non-black bird for the component ``primary color: black'', but this opposite attribute is usually encoded in a separate attribute in CUB, e.g., ``primary color: white''. Thus, we separate the negative and positive values into two components (where we set values of the opposite sign to 0), resulting in $2\cdot K$ positive scores for each image. 

To compare these component scores with the attributes, we make use of the numerical attribute values provided in CUB. First, we average the $2 \cdot K$ component values of all images of a class, to be comparable with the class-wise attributes provided by CUB. This gives us a numerical $2 \cdot K$ dimensional component description and a $312$ dimensional attribute description per class. Now, we match the discovered components to the attributes. We compare each discovered component to each attribute via Spearman's rank correlation coefficient and consider the attribute with the highest score to match the component. These are the matches used in \cref{mainsec:exp_cub}. We further use the (average) Spearman's rank correlation across all components to their best-matching attributes to quantify how well the components match to interpretable attributes in \cref{sec:app_cub_eval}.

\subsection{Hyperparameters for the disentanglement models} \label{sec:app_hyperparameters} 
We orient our hyperparameter ranges by the works of \citet{trauble2021disentangled, locatello2019challenging}. The exact ranges are provided in \cref{tab:hyperparameterranges}. We find the best hyperparameters in the ranges for each correlation strength/dataset/model triple separately. Then we train five models from independent seeds to run our experiments.
We use the Adam optimizer for all models with a learning rate of $10^{-4}$, batch size of 64 and train for 300k iterations (equiv. to 40 epochs on Shapes3D). 

For the optimization of the post-hoc disentanglement problem, we use slightly different hyperparameters. We use the RMSProp optimizer with learning rate of $10^{-3}$ and a batch size of 48.

\begin{table}[tb]
    \centering
    \begin{tabular}{cc}
    \toprule
        Model & Ranges \\
    \midrule
        BetaVAE & $\beta \in \{1,2,4,6,8,16\}$ \\
        FactorVAE & $\gamma \in \{5, 8, 10, 20, 30, 40, 50, 100\}$\\
        BetaTCVAE & $\beta \in \{1,2,4,6,8,10\}$ \\
        DIPVAEI & $\lambda_{od} \in \{1,2,5,10, 20, 50\}$ \\
    \bottomrule
    \end{tabular}
    \caption{The hyperparameter ranges considered in this work.}
    \label{tab:hyperparameterranges}
\end{table}
% Grids, ranges, best model choice per correlation. Optimizer, number of epochs, ...



\begin{table}[tb]
%. The models were trained on two datasets where different pairs of components were correlated. The best result per model is marked in bold.}
\centering
\adjustbox{width=0.5\textwidth}{
\begin{tabular}{r*{3}{c}}
\toprule
 Dataset & \multicolumn{3}{c}{MPI3D-real}\\
\cmidrule{1-1} \cmidrule(lr){2-4}
 \wrapb{Correlated}{components} &  \wrapb{background \&}{object color} &  \wrapb{background \&}{robot arm dof-1} &  \wrapb{robot arm dof-1 \&}{robot arm dof-2}\\
\cmidrule{1-1} \cmidrule(lr){2-4} 
\textbf{BetaVAE} &
\res{0.340}{0.027} & \res{0.277}{0.026} & \res{0.300}{0.046} \\
+PCA &
\res{0.116}{0.008} & \res{0.174}{0.021} & \res{0.154}{0.015}\\
+ICA &
\res{0.237}{0.042} & \res{0.205}{0.023} & \res{0.180}{0.021} \\
+Ours (IMA) & 
\bres{0.355}{0.033} & \bres{0.349}{0.015} & \bres{0.337}{0.038} \\
+Ours (DMA) &
\res{0.334}{0.025} & \res{0.317}{0.028} & \res{0.278}{0.030} \\
\cmidrule{1-1} \cmidrule(lr){2-4} 

\textbf{FactorVAE} &
\bres{0.205}{0.022} & \bres{0.239}{0.017} & \res{0.171}{0.005}\\
+PCA&
\res{0.179}{0.010} & \res{0.234}{0.012} & \res{0.171}{0.006}\\
+ICA&
\res{0.066}{0.009} & \res{0.090}{0.006} & \res{0.073}{0.011}\\
+Ours (IMA)&
\res{0.201}{0.019} & \res{0.226}{0.010} & \bres{0.191}{0.011}\\
+Ours (DMA)&
\res{0.184}{0.013} & \res{0.218}{0.016} & \res{0.180}{0.013}\\
\cmidrule{1-1} \cmidrule(lr){2-4} 

\textbf{BetaTCVAE} &
\bres{0.383}{0.022} & \bres{0.359}{0.026} & \bres{0.309}{0.036}\\
+PCA&
\res{0.356}{0.022} & \res{0.328}{0.017} & \res{0.295}{0.038}\\
+ICA& 
\res{0.245}{0.041} & \res{0.260}{0.024} & \res{0.170}{0.045}\\
+Ours (IMA)&
\res{0.323}{0.025} & \res{0.316}{0.029} & \res{0.271}{0.033} \\
+Ours (DMA)&
\res{0.327}{0.027} & \res{0.325}{0.025} & \res{0.272}{0.033} \\
\cmidrule{1-1} \cmidrule(lr){2-4}

\textbf{DipVAE} &
\res{0.235}{0.019} & \res{0.181}{0.049} & \res{0.232}{0.040}\\
+PCA&
\res{0.090}{0.005} & \res{0.088}{0.028} & \res{0.091}{0.011}\\
+ICA&
\res{0.234}{0.019} & \res{0.180}{0.048} & \res{0.232}{0.041}\\
+Ours (IMA)&
\res{0.230}{0.022} & \res{0.182}{0.048} & \res{0.230}{0.042} \\
+Ours (DMA)&
\bres{0.249}{0.026} & \bres{0.188}{0.049} & \bres{0.253}{0.051}\\
 \bottomrule
  \end{tabular}
}
\caption{MPI-3D dataset: Mean $\pm$ std. err. of the DCI scores (across all components of the dataset) of several models and post-hoc methods applied to their embeddings. Columns show which pair of components was correlated during training.\label{tab:app_posthocdisentangle}}
\end{table}
\subsection{Details on the introductory example}
The introductory example is inspired by a real explanation generated for a misclassification of the ResNet50 model pretrained on the ImageNet \citep{russakovsky2015imagenet} dataset delivered with the popular \texttt{pytorch} \citep{paszke2017automatic} package. Using the approach devised by \citet{leemann2022coherence}, we use the individual neurons of the classifier's last layer as concepts and describe them by words. We obtain the conceptual explanation shown in \Cref{fig:localexplwecann}. We simplify the explanation for the motivational figure and give the concepts relatable names. However, the gist of the example stays the same.
\begin{figure*}[t]
\centering
\includegraphics[width=0.9\textwidth]{figures/LocalExplTraffic.pdf}
\caption{Original local conceptual explanation of the misclassification. We find that the most activating concept ``candles, burning, flame\dots'' activates for very dark images. This concept is also highly activated for the traffic light example. We cleaned up the description of the concepts for the motivational figure.}
\label{fig:localexplwecann}
\end{figure*}


\section{Additional results}
\subsection{Reconstruction quality} \label{sec:app_reconstruction} % Michael 

As a check, we investigate the reconstruction quality of the disentanglement models. For Shapes3D, the reconstruction quality is very high, but we observe more serious reconstruction errors on the MPI3D-real dataset (see \cref{sec:app_mpi3d}). %
Figures~\ref{fig:app_reconcub} and~\ref{fig:app_reconmpi} show the original images on the left and the reconstructions of a randomly chosen BetaVAE on the right. On Shapes3D, the BetaVAE is able to reconstruct the image from its embedding representation. On MPI3D-real, it is able to reconstruct the big image parts shared across many pictures (ground, background stripe and background), but becomes blurry in the smaller and more nuanced robot arm and object shapes. This indicates that the information on these components might not be stored in the embedding space and is thus hardly disentanglable. A longer training (800k instead of 300k iterations) did not resolve the issue. The issue might arise, following \citet{gondal2019transfer}, because the input images were scaled down to $64 \times 64$ pixels, making the detailed objects hard to perceive, and because the same architecture as in the Shapes3D experiments was used, which might not be expressive enough.

\begin{figure}[tb]
    \centering
    \includegraphics[scale=0.35]{figures/recon_shapes.jpg}
    \caption{Random example images (left) and their reconstructions (right) of a BetaVAE on Shapes3D.}
    \label{fig:app_reconcub}
\end{figure}

\begin{figure}[tb]
    \centering
    \includegraphics[scale=0.35]{figures/recon_mpi3d.jpg}
    \caption{Random example images (left) and their reconstructions (right) of a BetaVAE on MPI3D-real.}
    \label{fig:app_reconmpi}
\end{figure}

\subsection{Results for the MPI-3D dataset}
\label{sec:app_mpi3d}
In addition to Shapes3D, we use the challenging MPI3D-real dataset \citep{gondal2019transfer}, which consists of realistic images of a moving robot arm. It is far more challenging, as the component is only present in a small portion of the images, and the data consists of real photographs. We report the results on this dataset in \Cref{tab:app_posthocdisentangle}. We saw low disentanglement scores of both the base and post-hoc models on MPI3D-real compared to the performance on Shapes3D. This indicates that the embedding spaces of the VAEs were not trained well. In fact, this is supported by the reconstruction quality considerations on both Shapes3D and MPI3D-real (\cref{sec:app_reconstruction}). Because our approaches are based on the given embeddings, they also struggle when these embeddings do not faithfully reflect the sample.

\subsection{Correlation strengths and attribution methods in first experiment} % Tobias
\label{sec:app_morerectify}
In this section we provide additional ablations for the rectification experiment in \cref{mainsec:exp_comp}. We investigate the impact of the choice of attribution method (\cref{sec:app_gradtoattrib}) and the correlation strength $s$. The values (DCI scores) are shown in \cref{tab:app_further_cor}. As expected, our approach offers the highest gains over the baseline when the correlation is higher. Starting at $s=0.4$, our runs start to reliably outperform the baselines. Regarding the attributions, there is no clear picture, but Grad and SG seem to yield good results more stably across runs. DMA usually outperforms IMA, which supports our theoretical results on identifiability.

\subsection{Further disentanglement metrics} % Tobias and Michael
\label{sec:app_furthermetricresults}
Tables~\ref{tab:app_posthocdisentangle_mig}--\ref{tab:app_posthocdisentangle_sap} show the results of the experiment in \cref{mainsec:exp_comp} measured in the alternative metrics MIG, FactorVAE and SAP score. %We use the SG-Variant of our orthogonal attribution loss for the optimization in all plots.
For MIG, we see similar results as for DCI in \Cref{maintab:posthocdisentangle} and in \Cref{tab:app_posthocdisentangle}. The results in FactorVAE and SAP score are slightly inferior but our approach still improves over the baseline in many setups. We also compute the disentanglement only on the two correlated components for the first pair of factors in \Cref{tab:app_pairwisedci}. This emphasizes the improvement introduced by our IMA and DMA approaches.

\subsection{Runtimes and further ablation studies}
\added{\textbf{Runtime.} Runtime can be an important concern for algorithms in explainable AI, for instance when they are to be deployed on embedded devices. We therefore report the runtimes required to obtain the results shown in \Cref{maintab:posthocdisentangle} here:}
\begin{center}
\begin{tabular}{ccccc}
\toprule
Algorithm & PCA	& ICA & Ours-DMA & Ours-IMA \\
\midrule
Runtime (sec) &  316 $\pm$ 38 & 320 $\pm$ 44 & 1140 $\pm$ 97	& 1017 $\pm$ 121 \\
\bottomrule
\end{tabular}
\end{center}

\added{For our SGD-based optimization, we note that the user can choose how many optimization steps are executed. In the present work, we chose 20000 steps to make sure that the optimization has converged. Using these settings, the runtime of our algorithms is approximately 3 times as high as that of the baseline. We think that this is not prohibitively more expensive. However, convergence of the optimization is usually achieved much quicker.}

\added{\textbf{Effect of fewer SGD iterations.} To ablate the behavior of our approach with a smaller runtime budget, we rerun all the approaches in \Cref{maintab:posthocdisentangle} using only 8000 iterations, making the runtime approximately equal across methods. We report the DCI scores as in the original table in \Cref{tab:app_lessiterations} and see that our DMA approach still outperforms all the baselines in 10 of 12 settings. Thus, even when runtime is an important concern in the evaluation, our approach can still yield competitive results.}

\added{\textbf{Robustness with respect to noise.} While IMA covers a more general class of functions, we empirically observed superior performance for DMA in most experiments. We therefore hypothesize that the performance difference stems from the behavior of IMA and DMA under noisy gradients and from the approximate optimizers that we use. We conduct an ablation study to obtain further evidence for these hypotheses. We modify the \texttt{FourBars} dataset to fulfill NEMR by adding varying magnitudes of the component gradients in the rows of $\jacf(\vg(\vx))$. This dataset is now solvable by both IMA and DMA. We then add noise to the analytical gradients. We perform a fixed number of 500 SGD steps of \Cref{alg:dadet} and otherwise use the same optimizer parameters as in the main paper. We obtain the DCI curves across different noise levels shown in \Cref{fig:app_robustness}. Without noise, both algorithms find disentangled solutions with DCI scores $>0.9$ (practically perfect disentanglement when evaluated on traversals). When we add noise, the disentanglement scores decrease as the working assumptions now only hold approximately. At a noise level of 0.1, the actual gradients shown in \Cref{mainfig:fourbars} are already hard to see with the naked eye. At each point there is a small but consistent gap between the performance of IMA and DMA, indicating that the DMA objective often finds better solutions with the standard SGD optimizer pipeline. This matches our empirical findings in the real-data experiments.
}

%We add the clustering approach given in \Cref{alg:cluster} to the set of competitors.  The results are given in \Cref{tab:app_sgd_analytical}. While it is able to beat the baselines for 8/12 setups, it does not reach the level of the DMA method with SGD optimization. 


\begin{figure}[tb]
\begin{minipage}[b]{\textwidth}
\begin{minipage}[t]{0.6\textwidth}
\scalebox{0.75}{
\begin{tabular}{r*{6}{c}}
\toprule
\wrapb{Correlated}{components} & \multicolumn{2}{c}{\wrapb{floor \&}{background}}  &\multicolumn{2}{c}{\wrapb{orientation \&}{background}} &  \multicolumn{2}{c}{\wrapb{orientation \&}{size}}\\
\cmidrule{1-1}\cmidrule(lr){2-3}\cmidrule(lr){4-5} \cmidrule(lr){6-7}
\textbf{BetaVAE} &\res{0.497}{0.03}& & \res{0.581}{0.04}& & \res{0.491}{0.05}& \\
+PCA&\res{0.263}{0.03}&\negimp{-47\%}& \res{0.310}{0.02}&\negimp{-47\%}& \res{0.324}{0.04}&\negimp{-34\%}\\
+ICA&\bres{0.574}{0.04}&\posimp{+16\%}& \res{0.540}{0.08}&\negimp{-7\%}& \res{0.577}{0.04}&\posimp{+17\%}\\
+Ours (IMA)&\res{0.533}{0.11}&\posimp{+7\%}& \res{0.594}{0.04}&\posimp{+2\%}& \res{0.576}{0.03}&\posimp{+17\%}\\
+Ours (DMA)&\res{0.472}{0.14}&\negimp{-5\%}& \bres{0.633}{0.05}&\posimp{+9\%}& \bres{0.617}{0.03}&\posimp{+26\%}\\
\cmidrule{1-1}\cmidrule(lr){2-3}\cmidrule(lr){4-5} \cmidrule(lr){6-7}
\textbf{FactorVAE} &\res{0.507}{0.11}& & \res{0.502}{0.08}& & \bres{0.712}{0.01}& \\
+PCA&\res{0.358}{0.07}&\negimp{-29\%}& \res{0.474}{0.05}&\negimp{-5\%}& \res{0.556}{0.03}&\negimp{-22\%}\\
+ICA&\res{0.294}{0.07}&\negimp{-42\%}& \res{0.263}{0.05}&\negimp{-48\%}& \res{0.340}{0.03}&\negimp{-52\%}\\
+Ours (IMA)&\res{0.539}{0.04}&\posimp{+6\%}& \res{0.498}{0.03}&\negimp{-1\%}& \res{0.568}{0.06}&\negimp{-20\%}\\
+Ours (DMA)&\bres{0.567}{0.07}&\posimp{+12\%}& \bres{0.531}{0.04}&\posimp{+6\%}& \res{0.571}{0.02}&\negimp{-20\%}\\
\cmidrule{1-1}\cmidrule(lr){2-3}\cmidrule(lr){4-5} \cmidrule(lr){6-7}
\textbf{BetaTCVAE} &\res{0.619}{0.01}& & \res{0.613}{0.04}& & \res{0.659}{0.01}& \\
+PCA&\res{0.400}{0.03}&\negimp{-35\%}& \res{0.421}{0.07}&\negimp{-31\%}& \res{0.450}{0.07}&\negimp{-32\%}\\
+ICA&\res{0.540}{0.02}&\negimp{-13\%}& \res{0.497}{0.04}&\negimp{-19\%}& \res{0.627}{0.02}&\negimp{-5\%}\\
+Ours (IMA)&\res{0.635}{0.04}&\posimp{+3\%}& \res{0.648}{0.03}&\posimp{+6\%}& \res{0.682}{0.02}&\posimp{+4\%}\\
+Ours (DMA)&\bres{0.644}{0.01}&\posimp{+4\%}& \bres{0.659}{0.02}&\posimp{+8\%}& \bres{0.724}{0.02}&\posimp{+10\%}\\
\cmidrule{1-1}\cmidrule(lr){2-3}\cmidrule(lr){4-5} \cmidrule(lr){6-7}
\textbf{DipVAE} &\res{0.631}{0.02}& & \res{0.652}{0.02}& & \res{0.548}{0.04}& \\
+PCA&\res{0.158}{0.01}&\negimp{-75\%}& \res{0.160}{0.02}&\negimp{-75\%}& \res{0.170}{0.02}&\negimp{-69\%}\\
+ICA&\res{0.630}{0.02}&\negimp{-0\%}& \res{0.651}{0.02}&\negimp{-0\%}& \res{0.542}{0.03}&\negimp{-1\%}\\
+Ours (IMA)&\res{0.640}{0.01}&\posimp{+1\%}& \res{0.621}{0.02}&\negimp{-5\%}& \res{0.545}{0.05}&\negimp{-1\%}\\
+Ours (DMA)&\bres{0.683}{0.01}&\posimp{+8\%}& \bres{0.676}{0.01}&\posimp{+4\%}& \bres{0.591}{0.06}&\posimp{+8\%}\\
 \bottomrule
 \end{tabular}}

\captionof{table}{Using 8000 instead of 20000 SGD iterations: Mean $\pm$ std. err. of the DCI scores of post-hoc methods applied to the embedding spaces of four disentanglement architectures with different pairs of correlated variables. Our DMA method still yields competitive results even with fewer SGD steps.\label{tab:app_lessiterations}}%. The models were trained on two datasets where different pairs of components were correlated. The best result per model is marked in bold.}
\end{minipage}
\hfill
\begin{minipage}[t]{0.37\textwidth}
\centering
\scalebox{0.8}{
\begin{tabular}{r*{1}{c}}
\toprule
 Dataset & \multicolumn{1}{c}{Shapes3D} \\
\cmidrule{1-1} \cmidrule(lr){2-2}
 \wrapb{Correlated}{factors} & \wrapb{floor vs.}{background} \\
\cmidrule{1-1} \cmidrule(lr){2-2} 
\textbf{BetaVAE} &\res{0.579}{0.089} \\
+PCA&\res{0.291}{0.033} \\
+ICA&\res{0.435}{0.076} \\
+IMA-SGD&\ures{0.738}{0.072} \\
+DMA-SGD&\bres{0.868}{0.025} \\
\cmidrule{1-1} \cmidrule(lr){2-2}
\textbf{FactorVAE} &\res{0.684}{0.163} \\
+PCA&\res{0.526}{0.136} \\
+ICA&\res{0.363}{0.097} \\
+IMA-SGD&\ures{0.779}{0.063} \\
+DMA-SGD&\bres{0.847}{0.072} \\
\cmidrule{1-1} \cmidrule(lr){2-2}
\textbf{BetaTCVAE} &\res{0.589}{0.005} \\
+PCA&\res{0.388}{0.046} \\
+ICA&\res{0.609}{0.065} \\
+IMA-SGD&\bres{0.876}{0.027} \\
+DMA-SGD&\ures{0.754}{0.127} \\
\cmidrule{1-1} \cmidrule(lr){2-2}
\textbf{DipVAE} &\res{0.615}{0.114} \\
+PCA&\res{0.429}{0.169} \\
+ICA&\res{0.585}{0.024} \\
+IMA-SGD&\bres{0.798}{0.099} \\
+DMA-SGD&\ures{0.782}{0.009} \\
 \bottomrule
  \end{tabular}}
\captionof{table}{Mean $\pm$ std. err. of the DCI scores of four post-hoc methods applied to the embedding spaces of four disentanglement models on Shapes3D with correlated \emph{floor} and \emph{background} color. The DCI is computed across \textbf{the two correlated components} of the dataset.\label{tab:app_pairwisedci}}%. The models were trained on two datasets where different pairs of components were correlated. The best result per model is marked in bold.}
\end{minipage}
\end{minipage}
\end{figure}
\begin{figure}[tb]
    \centering
    \includegraphics[width=0.5\textwidth]{figures/ablation_noise.pdf}
    \caption{Robustness of optimization to noisy gradients. We use a variant of the \texttt{FourBars} dataset that can be identified both by IMA and DMA (the NEMR condition holds) and add noise of increasing magnitude to the analytical gradients. While the disentanglement scores (DCI) decrease for both methods, we observe that the performance of IMA under noise is slightly worse than that of DMA. This may be one factor contributing to the weaker overall performance of IMA as compared to DMA.\label{fig:app_robustness}}
   
\end{figure}
% \wres{<mean>}{<err>}: two-line table cell, built as a nested one-column
% tabular, with the mean on the first line and "+/- err" in \small below it.
% \small is a declaration, so it is scoped with braces rather than written
% as \small{...}, which would not limit it to the braced text.
\newcommand{\wres}[2]{\begin{tabular}[c]{@{}c@{}}#1 \\ {\small $\pm$#2} \end{tabular}}
% \bwres{<mean>}{<err>}: same layout as \wres with both numbers in bold,
% for highlighted entries.
\newcommand{\bwres}[2]{\begin{tabular}[c]{@{}c@{}}\textbf{#1} \\ {\small $\pm$\textbf{#2}} \end{tabular}}

\begin{table}[tb]
\adjustbox{width=\textwidth}{
\begin{tabular}{r*{12}{c}}
\toprule
 Model & \multicolumn{3}{c}{BetaVAE} & \multicolumn{3}{c}{FactorVAE} & \multicolumn{3}{c}{BetaTCVAE} & \multicolumn{3}{c}{DIPVAEI}\\

 Correlation & $s=0.2$ & $s=0.4$  & $s=\infty$  & $s=0.2$ & $s=0.4$  & $s=\infty$ & $s=0.2$ & $s=0.4$  & $s=\infty$& $s=0.2$ & $s=0.4$  & $s=\infty$\\
 \cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7} \cmidrule(lr){8-10} \cmidrule(lr){11-13}
unit dirs. & \wres{0.666}{0.030} & \wres{0.497}{0.028} & \wres{0.650}{0.049} & \wres{0.441}{0.065} & \wres{0.507}{0.105} & \wres{0.651}{0.087} & \wres{0.580}{0.022} & \wres{0.619}{0.008} & \wres{0.504}{0.056} & \wres{0.686}{0.072} & \wres{0.631}{0.018} & \wres{0.868}{0.052}\\
PCA & \wres{0.287}{0.010} & \wres{0.263}{0.028} & \wres{0.357}{0.024} & \wres{0.312}{0.048} & \wres{0.358}{0.075} & \wres{0.484}{0.064} & \wres{0.341}{0.018} & \wres{0.400}{0.030} & \wres{0.396}{0.061} & \wres{0.266}{0.029} & \wres{0.158}{0.013} & \wres{0.215}{0.037}\\
ICA & \wres{0.394}{0.099} & \wres{0.574}{0.040} & \wres{0.674}{0.012} & \wres{0.193}{0.052} & \wres{0.294}{0.070} & \wres{0.390}{0.109} & \wres{0.516}{0.019} & \wres{0.540}{0.023} & \bwres{0.642}{0.007} & \wres{0.672}{0.073} & \wres{0.630}{0.018} & \bwres{0.870}{0.049}\\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7} \cmidrule(lr){8-10} \cmidrule(lr){11-13}
Grad (IMA) & \wres{0.638}{0.067} & \wres{0.617}{0.018} & \wres{0.556}{0.109} & \wres{0.478}{0.046} & \wres{0.551}{0.040} & \bwres{0.666}{0.041} & \wres{0.548}{0.035} & \wres{0.623}{0.021} & \wres{0.551}{0.038} & \wres{0.705}{0.062} & \wres{0.644}{0.019} & \wres{0.794}{0.043}\\
IG (IMA) & \wres{0.702}{0.035} & \wres{0.460}{0.128} & \wres{0.578}{0.117} & \wres{0.470}{0.035} & \wres{0.511}{0.042} & \wres{0.581}{0.066} & \wres{0.619}{0.024} & \wres{0.533}{0.006} & \wres{0.612}{0.024} & \wres{0.650}{0.072} & \wres{0.605}{0.006} & \wres{0.701}{0.045}\\
SG (IMA) & \wres{0.677}{0.037} & \wres{0.438}{0.127} & \wres{0.609}{0.131} & \wres{0.475}{0.042} & \wres{0.561}{0.040} & \wres{0.644}{0.055} & \wres{0.533}{0.028} & \wres{0.620}{0.021} & \wres{0.559}{0.040} & \wres{0.698}{0.060} & \wres{0.642}{0.017} & \wres{0.785}{0.046}\\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7} \cmidrule(lr){8-10} \cmidrule(lr){11-13}
Grad (DMA) & \wres{0.645}{0.067} & \bwres{0.641}{0.031} & \bwres{0.690}{0.062} & \wres{0.547}{0.056} & \wres{0.584}{0.047} & \wres{0.385}{0.169} & \bwres{0.629}{0.033} & \wres{0.666}{0.010} & \wres{0.598}{0.057} & \bwres{0.717}{0.059} & \bwres{0.684}{0.009} & \wres{0.857}{0.037}\\
IG (DMA) & \wres{0.645}{0.076} & \wres{0.530}{0.106} & \wres{0.548}{0.114} & \bwres{0.573}{0.046} & \bwres{0.615}{0.045} & \wres{0.631}{0.128} & \wres{0.607}{0.028} & \wres{0.624}{0.021} & \wres{0.584}{0.039} & \wres{0.703}{0.073} & \wres{0.659}{0.008} & \wres{0.771}{0.029}\\
SG (DMA) & \bwres{0.711}{0.040} & \wres{0.593}{0.094} & \wres{0.633}{0.062} & \wres{0.506}{0.057} & \wres{0.600}{0.027} & \wres{0.644}{0.066} & \wres{0.628}{0.033} & \bwres{0.670}{0.014} & \wres{0.595}{0.059} & \wres{0.716}{0.059} & \wres{0.682}{0.010} & \wres{0.851}{0.036}\\


\bottomrule
\end{tabular}
}
\vspace{0.5em}
\caption{Mean $\pm$ std. err. of the DCI score of the experiments in \cref{mainsec:exp_comp} for the first correlated component pair (\emph{floor} vs.\ \emph{background} color) in Shapes3D, as an ablation study with further correlation strengths and attribution methods (see \Cref{sec:app_gradtoattrib}). We observe only small differences between attribution methods, with plain Grad and SG performing best in the DMA setting.}
\label{tab:app_further_cor}
\end{table}
\begin{table}[tb]
\adjustbox{width=\textwidth}{
\begin{tabular}{r*{6}{c}}
\toprule
 Dataset & \multicolumn{3}{c}{Shapes3D} & \multicolumn{3}{c}{MPI3D-real}\\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}
 \wrapb{Correlated}{factors} & \wrapb{floor vs.}{background} & \wrapb{orientation vs.}{background} &  \wrapb{orientation vs.}{size} &  \wrapb{background vs.}{object color} &  \wrapb{background vs.}{robot arm dof-1} &  \wrapb{robot arm dof-1 vs.}{robot arm dof-2}\\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}
\textbf{BetaVAE} &\res{0.309}{0.031}& \res{0.426}{0.043}& \res{0.335}{0.059}&
\res{0.232}{0.022} & \res{0.185}{0.031} & \bres{0.196}{0.034}\\
+PCA&\res{0.111}{0.031}& \res{0.101}{0.009}& \res{0.092}{0.031}& 
\res{0.095}{0.010} & \res{0.105}{0.023} & \res{0.123}{0.033}\\
+ICA&\res{0.360}{0.040}& \res{0.324}{0.054}& \res{0.277}{0.036}& 
\res{0.155}{0.025} & \res{0.163}{0.014} & \res{0.071}{0.014}\\
+Ours (IMA)&\res{0.511}{0.029}& \res{0.437}{0.044}& \res{0.502}{0.030}& 
\bres{0.239}{0.021} & \bres{0.229}{0.022} & \res{0.187}{0.039}\\
+Ours (DMA)&\bres{0.594}{0.023}& \bres{0.485}{0.057}& \bres{0.545}{0.034}& 
\res{0.193}{0.036} & \res{0.092}{0.038} & \res{0.080}{0.015} \\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}
\textbf{FactorVAE} &\res{0.297}{0.084}& \res{0.319}{0.076}& \bres{0.423}{0.018}& 
\res{0.079}{0.001} & \res{0.103}{0.020} & \res{0.080}{0.010} \\
+PCA&\res{0.202}{0.057}& \res{0.135}{0.028}& \res{0.235}{0.036}& 
\bres{0.111}{0.006} & \bres{0.122}{0.011} & \bres{0.107}{0.009} \\
+ICA&\res{0.199}{0.061}& \res{0.106}{0.025}& \res{0.078}{0.021}&
\res{0.018}{0.008} & \res{0.061}{0.015} & \res{0.069}{0.015}\\
+Ours (IMA)&\bres{0.337}{0.033}& \bres{0.322}{0.056}& \res{0.288}{0.092}& 
\res{0.070}{0.014} & \res{0.086}{0.018} & \res{0.039}{0.014}\\
+Ours (DMA)&\res{0.276}{0.036}& \res{0.217}{0.064}& \res{0.213}{0.036}& 
\res{0.046}{0.021} & \res{0.045}{0.016} & \res{0.048}{0.015}\\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}
\textbf{BetaTCVAE} &\res{0.333}{0.008}& \res{0.400}{0.046}& \res{0.402}{0.017}& 
\bres{0.279}{0.025} & \bres{0.223}{0.030} & \res{0.201}{0.039}\\
+PCA&\res{0.249}{0.033}& \res{0.145}{0.039}& \res{0.184}{0.062}& 
\res{0.265}{0.019} & \res{0.203}{0.028} & \bres{0.213}{0.035}\\
+ICA&\res{0.390}{0.031}& \res{0.276}{0.043}& \res{0.346}{0.072}& 
\res{0.199}{0.040} & \res{0.158}{0.038} & \res{0.170}{0.033}\\
+Ours (IMA)&\res{0.484}{0.025}& \res{0.490}{0.033}& \res{0.526}{0.036}& 
\res{0.092}{0.029} & \res{0.071}{0.029} & \res{0.041}{0.014}\\
+Ours (DMA)&\bres{0.525}{0.014}& \bres{0.540}{0.021}& \bres{0.620}{0.024}& 
\res{0.120}{0.037} & \res{0.122}{0.044} & \res{0.075}{0.028}\\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}
\textbf{DipVAE} &\res{0.493}{0.032}& \res{0.481}{0.020}& \res{0.433}{0.044}& 
\res{0.138}{0.020} & \res{0.099}{0.040} & \bres{0.143}{0.045}\\
+PCA&\res{0.063}{0.006}& \res{0.086}{0.027}& \res{0.108}{0.014}& 
\res{0.054}{0.016} & \res{0.042}{0.011} & \res{0.064}{0.010}\\
+ICA&\res{0.495}{0.032}& \res{0.438}{0.053}& \res{0.224}{0.026}& 
\res{0.138}{0.023} & \res{0.096}{0.040} & \res{0.139}{0.047}\\
+Ours (IMA)&\res{0.512}{0.042}& \res{0.425}{0.036}& \res{0.465}{0.049}& 
\bres{0.146}{0.019} & \bres{0.105}{0.033} & \res{0.136}{0.049}\\
+Ours (DMA)&\bres{0.591}{0.028}& \bres{0.546}{0.017}& \bres{0.497}{0.060}& 
\res{0.133}{0.029} & \res{0.094}{0.036} & \res{0.125}{0.045}\\
 \bottomrule
  \end{tabular}
}
\vspace{0.5em}
\caption{Mean $\pm$ std. err. of the Mutual-Information Gap (MIG) scores of four post-hoc methods applied to the embedding spaces of four disentanglement models on two datasets with different pairs of correlated variables. The MIG is computed across all components of the dataset.}%. The models were trained on two datasets where different pairs of components were correlated. The best result per model is marked in bold.}
\label{tab:app_posthocdisentangle_mig}
\end{table}

\begin{table}[tb]
\adjustbox{width=\textwidth}{
\begin{tabular}{r*{6}{c}}
\toprule
 Dataset & \multicolumn{3}{c}{Shapes3D} & \multicolumn{3}{c}{MPI3D-real}\\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}
 \wrapb{Correlated}{factors} & \wrapb{floor vs.}{background} & \wrapb{orientation vs.}{background} &  \wrapb{orientation vs.}{size} &  \wrapb{background vs.}{object color} &  \wrapb{background vs.}{robot arm dof-1} &  \wrapb{robot arm dof-1 vs.}{robot arm dof-2}\\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}
\textbf{BetaVAE} &\bres{0.834}{0.022}& \bres{0.839}{0.053}& \res{0.828}{0.011} & \res{0.557}{0.032} & \res{0.490}{0.044} & \res{0.412}{0.022} \\
+PCA&\res{0.722}{0.060}& \res{0.689}{0.047}& \res{0.716}{0.035}& \res{0.393}{0.037} & \res{0.452}{0.031} & \res{0.398}{0.031}\\
+ICA&\res{0.797}{0.036}& \res{0.775}{0.083}& \res{0.794}{0.022}&
\res{0.385}{0.100} & \res{0.262}{0.061} & \res{0.251}{0.031} \\
+Ours (IMA)&\res{0.767}{0.108}& \res{0.808}{0.060}& \bres{0.832}{0.022}& \res{0.565}{0.022} & \res{0.504}{0.036} & \res{0.443}{0.027} \\
+Ours (DMA)&\res{0.813}{0.087}& \res{0.829}{0.068}& \res{0.826}{0.029}& 
\bres{0.567}{0.024} & \bres{0.525}{0.042} & \bres{0.444}{0.027} \\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}

\textbf{FactorVAE} &\res{0.636}{0.045}& \res{0.622}{0.064}& \res{0.595}{0.050}&
\bres{0.354}{0.016} & \bres{0.389}{0.015} & \res{0.342}{0.006}\\
+PCA&\res{0.627}{0.071}& \bres{0.680}{0.027}& \bres{0.652}{0.024}&
\res{0.330}{0.018} & \res{0.388}{0.022} & \bres{0.353}{0.016}\\
+ICA&\res{0.619}{0.059}& \res{0.446}{0.146}& \res{0.200}{0.148}&
\res{0.277}{0.013} & \res{0.242}{0.082} & \res{0.304}{0.017}\\
+Ours (IMA)&\bres{0.663}{0.022}& \res{0.661}{0.028}& \res{0.644}{0.051} &
\res{0.347}{0.007} & \res{0.386}{0.020} & \res{0.337}{0.013}\\
+Ours (DMA)&\res{0.646}{0.026}& \res{0.637}{0.023}& \res{0.619}{0.026}& 
\res{0.330}{0.015} & \res{0.375}{0.016} & \res{0.335}{0.013}\\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}

\textbf{BetaTCVAE} &\res{0.676}{0.012}& \res{0.814}{0.052}& \res{0.877}{0.015}& 
\res{0.445}{0.044} & \res{0.379}{0.021} & \res{0.346}{0.020}\\
+PCA&\res{0.761}{0.035}& \res{0.738}{0.063}& \res{0.794}{0.037}&
\bres{0.505}{0.040} & \bres{0.425}{0.012} & \res{0.389}{0.008}\\
+ICA&\res{0.834}{0.004}& \res{0.761}{0.051}& \res{0.806}{0.051}&
\res{0.149}{0.099} & \res{0.168}{0.053} & \res{0.057}{0.035} \\
+Ours (IMA)&\res{0.837}{0.004}& \res{0.849}{0.015} & \bres{0.879}{0.013} & 
\res{0.463}{0.048} & \res{0.401}{0.018} & \res{0.399}{0.019} \\
+Ours (DMA) &\bres{0.842}{0.000} & \bres{0.854}{0.017} & \res{0.878}{0.013} &
\res{0.460}{0.046} & \res{0.399}{0.018} & \bres{0.399}{0.014} \\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}

\textbf{DipVAE} &\bres{0.826}{0.006}& \res{0.839}{0.006}& \res{0.785}{0.033}& 
\bres{0.517}{0.046} & \bres{0.473}{0.046} & \res{0.430}{0.013}\\
+PCA&\res{0.671}{0.019}& \res{0.603}{0.064}& \res{0.653}{0.039}& 
\res{0.431}{0.028} & \res{0.373}{0.027} & \res{0.344}{0.021}\\
+ICA&\res{0.826}{0.006}& \res{0.831}{0.007}& \res{0.749}{0.027}&
\res{0.434}{0.042} & \res{0.423}{0.027} & \res{0.424}{0.012}\\
+Ours (IMA)&\res{0.824}{0.007}& \res{0.812}{0.018}& \res{0.785}{0.029}&
\res{0.503}{0.044} & \res{0.471}{0.035} & \res{0.436}{0.021} \\
+Ours (DMA)&\res{0.822}{0.006}& \bres{0.850}{0.012}& \bres{0.809}{0.045}& 
\res{0.505}{0.040} & \res{0.459}{0.040} & \bres{0.448}{0.026}\\
 \bottomrule
  \end{tabular}
}
\vspace{0.5em}
\caption{Mean $\pm$ std. err. of the FactorVAE scores of four post-hoc methods applied to the embedding spaces of four disentanglement models on two datasets with different pairs of correlated variables. The FactorVAE score is computed across all components of the dataset.}%. The models were trained on two datasets where different pairs of components were correlated. The best result per model is marked in bold.}
\label{tab:app_posthocdisentangle_factorvae}
\end{table}

\begin{table}[tb]
\adjustbox{width=\textwidth}{
\begin{tabular}{r*{6}{c}}
\toprule
 Dataset & \multicolumn{3}{c}{Shapes3D} & \multicolumn{3}{c}{MPI3D-real}\\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}
 \wrapb{Correlated}{factors} & \wrapb{floor vs.}{background} & \wrapb{orientation vs.}{background} &  \wrapb{orientation vs.}{size} &  \wrapb{background vs.}{object color} &  \wrapb{background vs.}{robot arm dof-1} &  \wrapb{robot arm dof-1 vs.}{robot arm dof-2}\\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}
\textbf{BetaVAE} &\res{0.086}{0.003}& \res{0.119}{0.004}& \res{0.100}{0.005}&
\res{0.127}{0.014} & \res{0.098}{0.015} & \bres{0.092}{0.025} \\
+PCA&\res{0.047}{0.005}& \res{0.062}{0.006}& \res{0.066}{0.006}& 
\res{0.027}{0.005} & \res{0.055}{0.008} & \res{0.037}{0.006}\\
+ICA&\res{0.007}{0.001}& \res{0.013}{0.001}& \res{0.019}{0.004}&
\res{0.017}{0.006} & \res{0.007}{0.002} & \res{0.004}{0.001} \\
+Ours (IMA)&\bres{0.099}{0.026}& \res{0.114}{0.008}& \res{0.112}{0.007}&
\bres{0.131}{0.011} & \bres{0.113}{0.005} & \res{0.082}{0.024} \\
+Ours (DMA)&\res{0.094}{0.020}& \bres{0.127}{0.012}& \bres{0.114}{0.013}& 
\res{0.107}{0.025} & \res{0.059}{0.024} & \res{0.037}{0.013} \\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}

\textbf{FactorVAE} &\res{0.072}{0.006}& \res{0.059}{0.006}& \bres{0.064}{0.001}&
\res{0.059}{0.004} & \res{0.066}{0.008} & \res{0.054}{0.003}\\
+PCA&\res{0.060}{0.006}& \bres{0.066}{0.004}& \res{0.057}{0.004}&
\bres{0.065}{0.008} & \bres{0.076}{0.004} & \bres{0.071}{0.003}\\
+ICA&\res{0.013}{0.002}& \res{0.008}{0.001}& \res{0.006}{0.002}& 
\res{0.002}{0.000} & \res{0.002}{0.001} & \res{0.001}{0.000}\\
+Ours (IMA)&\bres{0.077}{0.012}& \res{0.052}{0.005}& \res{0.054}{0.017}&
\res{0.054}{0.006} & \res{0.059}{0.006} & \res{0.036}{0.015}\\
+Ours (DMA)&\res{0.071}{0.014}& \res{0.053}{0.012}& \res{0.040}{0.010}& 
\res{0.041}{0.017} & \res{0.043}{0.015} & \res{0.044}{0.013}\\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}

\textbf{BetaTCVAE} &\res{0.052}{0.002}& \res{0.107}{0.013}& \res{0.096}{0.016}& 
\bres{0.151}{0.017} & \bres{0.133}{0.007} & \bres{0.117}{0.011}\\
+PCA&\res{0.073}{0.004}& \res{0.075}{0.011}& \res{0.107}{0.015}&
\res{0.148}{0.018} & \res{0.125}{0.009} & \res{0.109}{0.007}\\
+ICA&\res{0.015}{0.000}& \res{0.010}{0.001}& \res{0.011}{0.002}& 
\res{0.011}{0.004} & \res{0.005}{0.002} & \res{0.004}{0.002}\\
+Ours (IMA)&\res{0.105}{0.003}& \res{0.119}{0.012}& \bres{0.130}{0.023}&
\res{0.055}{0.017} & \res{0.059}{0.016} & \res{0.056}{0.003} \\
+Ours (DMA)&\bres{0.108}{0.005}& \bres{0.127}{0.013}& \res{0.109}{0.017}& 
\res{0.071}{0.020} & \res{0.072}{0.010} & \res{0.051}{0.015} \\
\cmidrule{1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-7}

\textbf{DipVAE} &\res{0.083}{0.004}& \res{0.084}{0.003}& \res{0.070}{0.002}&
\res{0.056}{0.011} & \res{0.039}{0.013} & \res{0.057}{0.016}\\
+PCA&\res{0.027}{0.003}& \res{0.034}{0.006}& \res{0.043}{0.004}& 
\res{0.023}{0.004} & \res{0.030}{0.008} & \res{0.022}{0.005}\\
+ICA&\res{0.006}{0.001}& \res{0.003}{0.002}& \res{0.030}{0.002}&
\res{0.011}{0.005} & \res{0.005}{0.003} & \res{0.005}{0.002}\\
+Ours (IMA)&\res{0.089}{0.012}& \res{0.082}{0.005}& \res{0.077}{0.002}&
\bres{0.060}{0.008} & \bres{0.047}{0.010} & \bres{0.061}{0.016} \\
+Ours (DMA)&\bres{0.114}{0.003}& \bres{0.105}{0.008}& \bres{0.084}{0.007}&
\res{0.051}{0.008} & \res{0.043}{0.012} & \res{0.054}{0.016}\\
 \bottomrule
  \end{tabular}
}
\vspace{0.5em}
\caption{Mean $\pm$ std. err. of the SAP scores of four post-hoc methods applied to the embedding spaces of four disentanglement models on two datasets with different pairs of correlated variables. The SAP score is computed across all components of the dataset.} %. The models were trained on two datasets where different pairs of components were correlated. The best result per model is marked in bold.}
\label{tab:app_posthocdisentangle_sap}
\end{table}

\subsection{Qualitative results on Shapes3D}
In this section, we want to show another traversal plot like the one in \cref{mainfig:travsersal} and more thoroughly analyze its latent space. We chose another architecture (BetaTCVAE) and $s=0.2$ with the usual correlated factors \emph{floor color} and \emph{background color}. Out of the 5 independent runs, we selected the one with the highest DCI score (of the base model) for the analysis.
% Helper macros for the qualitative figures in this subsection.
% Each \include... macro wraps \includegraphics with a fixed crop
% (graphicx trim order: left bottom right top) and a fixed width.
%
% Traversal plots; note that both variants are currently defined identically.
\newcommand{\includetraversalsupp}[1]{\includegraphics[clip, trim=0.37cm 2.24cm 0.3cm 1.7cm, width=0.75\textwidth]{#1}}
\newcommand{\includetraversalsuppB}[1]{\includegraphics[clip, trim=0.37cm 2.24cm 0.3cm 1.7cm, width=0.75\textwidth]{#1}}
% Matrix plots; the B variant crops less at the bottom (0.2cm instead of 1.6cm)
% and is wider (0.75 instead of 0.65 of \textwidth).
\newcommand{\includematrixplot}[1]{\includegraphics[clip, trim=0.1cm 1.6cm 0.3cm 0.4cm, width=0.65\textwidth]{#1}}
\newcommand{\includematrixplotB}[1]{\includegraphics[clip, trim=0.1cm 0.2cm 0.3cm 0.4cm, width=0.75\textwidth]{#1}}
% \sfigw: width used for the subfigures below; narrower when \arxiv is defined.
\ifdefined\arxiv
\newcommand{\sfigw}[0]{0.4\textwidth}
\else
\newcommand{\sfigw}[0]{0.48\textwidth}
\fi
\begin{figure}[tb]
   \begin{subfigure}[b]{\sfigw}
    \centering
    \includetraversalsuppB{figures/traversal_linearreg.pdf}
    \caption{Optimal linear directions (traversal plot)\label{fig:app_travlin}}
    \end{subfigure}
    \begin{subfigure}[b]{\sfigw}
    \centering
    \includematrixplotB{figures/orthogonality_matrix.pdf}
    \caption{Cosines of the linear directions\label{fig:app_orthmatrix}}
    \end{subfigure}
    \caption{Empirical results for linear entanglement. For the model shown in \cref{mainfig:travsersal} (trained on correlated data), we observe almost perfect linear entanglement, i.e., that $f \circ g = D$: (a) There exist linear directions $d_1$ to $d_6$ in $f$'s embedding space that encode the individual components. (b) However, these directions are not necessarily orthogonal; they can be entangled, as testified by the non-zero cosines between them. See \cref{fig:app_traversal} for additional results.}
\end{figure}
\paragraph{Linear entanglement matrix.} To study which factors are encoded in which latent dimension, we compute a matrix of linear entanglement. By our linear entanglement hypothesis, $\vz' = \mD\vz$, where the matrix $\mD=\lbrack \vd_1, \ldots, \vd_K\rbrack \in \mathbb{R}^{K\times K}$ contains the directions $\vd_i \in \mathbb{R}^K$ in which the ground truth concepts are encoded. Changing the component $i$ (entry $\vz_i$) by one unit will change the resulting embedding by $\vd_i$. To find these $\vd_i$, we take the factors at the origin of the traversal plot and alter only a single component $i$. We then encode the image corresponding to that change, and measure the change in embeddings to find the linear direction $\vd_i$ that the corresponding component is encoded in (to be precise, we sample several changes and take the eigenvector with the largest eigenvalue of the covariance matrix of the embedding changes). Thus, we can estimate the matrix $\mD$. An example is shown in \cref{fig:app_travlin} and provides evidence that linear entanglement is possible when training autoencoder models from correlated data.

To estimate which factors are changing when a unit direction of the (plain or post-processed) embedding space is followed (a change in $\vz_i^{'}$), we can invert the equation to $\vz=\mD^{-1}\vz'$. The columns in $\mD^{-1}$ correspond to the change in ground truth components that going one unit in the latent space coordinate $i$ will entail. We refer to this matrix $\mD^{-1}$, which shows which ground truth components are altered by moving along one latent dimension, as the \emph{linear entanglement matrix}.

\Cref{fig:app_traversal} shows the traversals along with the corresponding linear entanglement matrices, which correspond well to the changes observed. For the plain method, the components that were correlated are deeply entangled (top row). However, our method (DMA, SG, bottom row) is able to separate them well, which is testified both by the traversal and the linear entanglement matrix.
% trim order: left bottom right top 


\begin{figure}[tb]
   \begin{subfigure}[b]{\sfigw}
    \centering
    \includetraversalsupp{figures/traversal_unit_beta.pdf}
    \caption{BetaTCVAE \label{fig:travunittc}}
    \end{subfigure}
    \begin{subfigure}[b]{\sfigw}
    \centering
    \includematrixplot{figures/encoding_matrix_unit.pdf}
    \caption{Corresponding linear entanglement matrix\label{fig:matrixunittc}}
    \end{subfigure}
   \begin{subfigure}[b]{\sfigw}
    \centering
    \includetraversalsupp{figures/traversal_ica_beta.pdf}
    \caption{BetaTCVAE + ICA \label{fig:travunittc2}}
    \end{subfigure}
    \begin{subfigure}[b]{\sfigw}
    \centering
    \includematrixplot{figures/encoding_matrix_ica.pdf}
    \caption{Corresponding linear entanglement matrix\label{fig:matrixunittc2}}
    \end{subfigure}
    \begin{subfigure}[b]{\sfigw}
    \centering
    \includetraversalsupp{figures/traversal_ours_beta.pdf}
    \caption{BetaTCVAE + Ours (DMA, SG) \label{fig:travunittc3}}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{\sfigw}
    \centering
    \includematrixplot{figures/encoding_matrix_ours.pdf}
    \caption{Corresponding linear entanglement matrix\label{fig:matrixunittc3}}
    \end{subfigure}
    \caption{Traversal plots from another model (BetaTCVAE) trained on the correlated dataset. As for all traversal plots in this paper, we manually permuted the dimensions to match across plots. In addition, we compute a matrix of linear entanglement that shows which ground truth factors are changed when moving in a certain direction (brightness corresponds to the magnitude of change). While none of the post-hoc methods manages to disentangle shape and size (most likely due to their non-linear encoding), our model resolves the linearly entangled factors \emph{floor hue} and \emph{wall hue} fairly well, which can also be seen from the entanglement matrix. \label{fig:app_traversal}}
\end{figure}

\subsection{Further results on CUB} % Yao und Michael
\label{sec:app_cub_eval}
For a quantitative evaluation, we match the discovered concepts on CUB with the annotated ground truth attributes. We report results for the quantitative comparison on CUB introduced in \cref{sec:app_cubdetails}, comparing our methods with PCA, ICA, and a baseline of randomly sampled directions. Furthermore, we implement ConceptSHAP \citep{yeh2019completeness} and ACE \citep{ghorbani2019towards} and use them to discover concepts on CUB (using their default settings otherwise).
The results of this metric are presented in \Cref{tab:cub_quantitative}.

ICA failed to discover meaningful components, while PCA was only capable of discovering very few high-variance ones in the beginning, but began to fail for $K>10$. This is possibly because in PCA, the directions are required to be orthogonal. Surprisingly, both PCA and ICA were not much better than the random baseline.  Regarding ConceptSHAP and ACE, we find that ACE often focused on the background concepts and ConceptSHAP discovered concepts that are usually more focused on the birds but hard to localize in a fine-grained manner. Our method consistently discovered components and surpassed all three baselines.
In particular, our method (DMA) led to good performance.
This leads us to the hypothesis that for high-dimensional data, the disjointness principle is required to identify solutions.
\Cref{fig:cub_correlation} illustrates the correlation between the ground-truth attribute representation (scores) and predicted representation by using our model (using plain gradients) for the top discovered component. The two components are clearly correlated, but more in a block-sense: Classes with low scores on the attribute received low scores on the discovered component. The same holds for high scores, but within these, we observe stronger noise, which explains why the Spearman correlation values were imperfect. This can be due to a certain degree of arbitrariness in the ground-truth attribute values of each class. Here, \cref{fig:cub_attr2}, just like \cref{mainfig:cub_attr} in the main paper, shows qualitative examples, including the ground-truth values which appear to fluctuate. We emphasize that this analysis should be viewed as an initial take on quantifying the quality of interpretable components, but that a refined benchmark is material for future work.

\begin{table}[tb]
\centering
\resizebox{0.8\linewidth}{!}{
\begin{tabular}{rccccccc}
\toprule
Num. components & $K=1$ & $K=10$ & $K=20$ & $K=30$ \\
\midrule
PCA & \textbf{0.789}  $\pm$ 0.024  &  0.602 $\pm$ 0.007 & 0.497 $\pm$  0.005 &  0.440 $\pm$ 0.006 \\ 
ICA &  0.515 $\pm$ 0.028  & 0.442 $\pm$ 0.005 & 0.412 $\pm$ 0.006  &  0.390 $\pm$ 0.007  \\ 
\midrule
ACE \citep{ghorbani2019towards} & 0.623 $\pm$ 0.012 & 0.579 $\pm$ 0.010 & 0.550 $\pm$ 0.008 & 0.527 $\pm$ 0.007 \\
ConceptSHAP \citep{yeh2019completeness} & 0.655 $\pm$ 0.014 & 0.596 $\pm$ 0.006 & 0.568 $\pm$ 0.008 & 0.545 $\pm$ 0.006 \\
\midrule
Ours-IMA,Grad  & 0.657 $\pm$  0.025 & 0.601 $\pm$ 0.009  & 0.564 $\pm$ 0.009   & 0.535 $\pm$ 0.008 \\
Ours-DMA,Grad      & 0.701  $\pm$  0.045 & \textbf{0.626} $\pm$ 0.029  & \textbf{0.585} $\pm$ 0.028   & \textbf{0.559} $\pm$ 0.011 \\
\bottomrule
\end{tabular}
}
\vspace{0.5em}
\caption{Quantitative comparison of the components discovered by our methods, PCA, ICA, ACE, and ConceptSHAP. Mean correlation scores of the top-$K$ discovered components ($K$ given in the column header) are shown as mean $\pm$ std.\ over five runs.}
\label{tab:cub_quantitative}
\end{table}


\begin{figure}[tb]
    \centering
    \includegraphics[width=.5\linewidth]{figures/correlation_plot.png}
    \caption{Correlation between ground-truth attribute scores and our predicted scores for the best matched component. Each dot represents a class.}
    \label{fig:cub_correlation}
\end{figure}

\begin{figure}[tb]
    \centering
    %\includegraphics[width=\linewidth]{figures/cub_attr_
    \input{figures/cub_appendix_new}
\caption{Examples of discovered components on CUB. The corresponding ground-truth attribute is shown under images and the ground-truth value of each image is depicted above the image. ``$+$/$-$'' indicate the positive/negative direction along the discovered concept.}
    \label{fig:cub_attr2}
\end{figure}

\clearpage
\newpage
\bibliography{references}

\end{document}