\documentclass[accepted]{uai2023}
% Declares the boolean \ifinappendix (initially false) together with its
% setters \inappendixtrue and \inappendixfalse.
\newif\ifinappendix

\usepackage[sort&compress, elide]{natbib}
% Make a plain \cite behave like natbib's parenthetical \citep.
\renewcommand{\cite}{\citep}
%%%%% NEW MATH DEFINITIONS %%%%%

\usepackage{amsthm,amsmath,amssymb,amsfonts,bm,bbm,stmaryrd}

% Markers for pointing at a sub-part of a figure from within its caption.
% \emph is used rather than the {\em ...} declaration so that the italic
% correction after the closing parenthesis is inserted automatically.
\newcommand{\figleft}{\emph{(Left)}}
\newcommand{\figcenter}{\emph{(Center)}}
\newcommand{\figright}{\emph{(Right)}}
\newcommand{\figtop}{\emph{(Top)}}
\newcommand{\figbottom}{\emph{(Bottom)}}
\newcommand{\captiona}{\emph{(a)}}
\newcommand{\captionb}{\emph{(b)}}
\newcommand{\captionc}{\emph{(c)}}
\newcommand{\captiond}{\emph{(d)}}

% Highlight a newly defined term.
% \textbf replaces the obsolete LaTeX 2.09 font switch {\bf ...}.
\newcommand{\newterm}[1]{\textbf{#1}}


% Cross-reference helpers. Each macro prints the kind of object followed by
% its number. A tie (~) before every \ref keeps the number from being
% separated from the preceding word at a line break. Lower-case variants are
% meant for mid-sentence use, capitalised ones for the start of a sentence.

% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
% Reference to two figures, lower-case.
\def\twofigref#1#2{figures~\ref{#1} and~\ref{#2}}
% Reference to four figures, lower-case.
\def\quadfigref#1#2#3#4{figures~\ref{#1}, \ref{#2}, \ref{#3} and~\ref{#4}}
% Table reference, lower-case.
\def\tabref#1{table~\ref{#1}}
% Table reference, capital. For start of sentence
\def\Tabref#1{Table~\ref{#1}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Definition reference, capital.
\def\Defref#1{Definition~\ref{#1}}
% Definition reference, lower-case.
\def\defref#1{definition~\ref{#1}}
% Reference to two sections, lower-case.
\def\twosecrefs#1#2{sections~\ref{#1} and~\ref{#2}}
% Reference to two sections, capital.
\def\twoSecrefs#1#2{Sections~\ref{#1} and~\ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections~\ref{#1}, \ref{#2} and~\ref{#3}}
% Reference to an equation, lower-case.
% NOTE: amsmath (loaded above) already provides \eqref, which prints the
% number in parentheses; this \def deliberately replaces that meaning, so
% \eqref{x} prints "equation N" in this document.
\def\eqref#1{equation~\ref{#1}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% Reference to two equations, upper case
\def\twoEqref#1#2{Equations~\ref{#1} and~\ref{#2}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters.
% The tie after "chapters" was missing, which glued the word to the number.
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
% Reference to two algorithms, lower-case and upper case.
\def\twoalgref#1#2{algorithms~\ref{#1} and~\ref{#2}}
\def\Twoalgref#1#2{Algorithms~\ref{#1} and~\ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
% Reference to two parts, lower case
\def\twopartref#1#2{parts~\ref{#1} and~\ref{#2}}

% Ceiling and floor brackets around the argument.
\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
% Bold digit one.
\def\1{\bm{1}}
% Data sets: a calligraphic D, with an upright subscript naming the split.
\newcommand{\train}{\mathcal{D}}
% The subscript is placed outside the \mathcal argument so that only the D is
% passed to the calligraphic alphabet. The outer braces keep the whole symbol
% a single atom, so sub/superscripts applied by callers attach as before.
\newcommand{\valid}{{\mathcal{D}_{\mathrm{valid}}}}
\newcommand{\test}{{\mathcal{D}_{\mathrm{test}}}}

% Short alias for \epsilon.
\def\eps{{\epsilon}}


% Random variables: upright text-font letters (\textnormal), named
% \r<letter>. \reta is the Greek eta set the same way.
% NOTE(review): \rq below replaces the LaTeX kernel's \rq (right-quote)
% macro -- confirm nothing in the document relies on the original meaning.
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
% rm is already a command, just don't name any random variables m
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% Random vectors: upright bold letters (\mathbf), named \rv<letter>.
% NOTE(review): with the default math fonts \mathbf does not embolden
% lower-case Greek, so \rvepsilon and \rvtheta may print in normal weight --
% confirm whether \bm was intended.
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% This entry was previously named \rvu, which left \rvi undefined and was
% immediately overwritten by the real \rvu further down.
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% Elements of random vectors: upright text-font letters (\textnormal),
% named \erv<letter>. The bodies are the same as the \r<letter> macros.
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% Random matrices: upright bold capitals (\mathbf), named \rm<Letter>.
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% Elements of random matrices: upright text-font capitals (\textnormal),
% named \erm<Letter>.
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% Vectors: bold math symbols via \bm, named \v<letter>. Also covers the
% all-zeros / all-ones vectors and bold \mu and \theta.
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

% Elements of vectors: the plain (non-bold) math letter, named \ev<name>.
% Each macro expands to the bare symbol wrapped in a brace group.
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% Matrices: bold math capitals via \bm, named \m<Letter>, plus bold
% \beta, \Phi, \Lambda and \Sigma.
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

% Tensors.
% \mathsfit is a math alphabet taken from the default sans-serif family:
% medium series, slanted shape in the normal math version, and
% bold-extended series, upright shape in the bold math version.
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
% \tens{X} sets its argument in \mathsfit wrapped in \bm; \t<Letter> are the
% per-letter shortcuts.
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% Graphs: calligraphic capitals (\mathcal), named \g<Letter>.
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% Sets: blackboard-bold capitals (\mathbb), named \s<Letter>.
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% Entries of a matrix: the plain (non-bold) math capital, named \em<Letter>,
% plus plain \Lambda and \Sigma.
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% Entries of a tensor.
% Same font as tensor, without \bm wrapper. \et<Letter> are the per-letter
% shortcuts; \etLambda is the same for \Lambda.
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% Distribution symbols. The subscripts are descriptive labels, so they are
% set upright with \mathrm (replacing the obsolete \rm font switch).

% The true underlying data generating distribution
\newcommand{\pdata}{p_{\mathrm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

% Name of the Laplace distribution, set upright.
\newcommand*{\laplace}{\mathrm{Laplace}}

% Frequently used symbols: expectation, loss, reals, empirical density,
% learning rate, regularisation weight.
\newcommand*{\E}{\mathbb{E}}
\newcommand*{\Ls}{\mathcal{L}}
\newcommand*{\R}{\mathbb{R}}
\newcommand*{\emp}{\tilde{p}}
\newcommand*{\lr}{\alpha}
\newcommand*{\reg}{\lambda}
% Named functions and statistics.
\newcommand*{\rect}{\mathrm{rectifier}}
\newcommand*{\softmax}{\mathrm{softmax}}
\newcommand*{\sigmoid}{\sigma}
\newcommand*{\softplus}{\zeta}
\newcommand*{\KL}{D_{\mathrm{KL}}}
\newcommand*{\Var}{\mathrm{Var}}
\newcommand*{\standarderror}{\mathrm{SE}}
\newcommand*{\Cov}{\mathrm{Cov}}
% Norm names.
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand*{\normlzero}{L^0}
\newcommand*{\normlone}{L^1}
\newcommand*{\normltwo}{L^2}
\newcommand*{\normlp}{L^p}
\newcommand*{\normmax}{L^\infty}
% Integer range written with double square brackets (stmaryrd).
% \newcommand{\range}[2]{\{#1,...,#2\}}
\newcommand{\range}[2]{\llbracket#1..#2\rrbracket}

\newcommand*{\lone}{\ell_1}
\newcommand*{\ltwo}{\ell_2}
% Indicator symbol: blackboard-bold digit one from bbm.
\newcommand*{\one}{\mathbbm{1}}
% Independence relation: two \perp symbols overlaid with a 2mu offset;
% \mathpalette passes the current math style to the helper \independenT.
\newcommand*{\independent}{\protect\mathpalette{\protect\independenT}{\perp}}
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
\newcommand*{\jointP}{\mathbb{P}}
% Theorem-like environments.
% Theorems are numbered within sections; corollaries are numbered within
% their theorem; lemmas, propositions and definitions share the theorem
% counter.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{corollary}{Corollary}[theorem]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}

% Paper-specific symbols.
% s with an upright "inv" / "var" superscript, and the indexed, starred forms.
\newcommand{\sindep}{s^{\mathrm{inv}}}
\newcommand{\sdep}{s^{\mathrm{var}}}
\newcommand{\sdepStar}{s_i^{\mathrm{var}^*}}
\newcommand{\sindepStar}{s_i^{\mathrm{inv}^*}}
% Noise variable, upper- and lower-case symbol.
\newcommand{\noisevarcap}{E}
\newcommand{\noisevarlow}{\epsilon}
% z subscripted by \psi_{#1} and by \psi^*_{#1}.
\newcommand{\zpsi}[1]{z_{\psi_{#1}}}
\newcommand{\zpsistar}[1]{z_{\psi^*_{#1}}}

% Parent sets pa(.), optionally subscripted by the graph.
% "pa" is set with \mathrm rather than \text: \text follows the surrounding
% text font, so it would turn italic inside theorem-like environments.
\newcommand{\pa}[1]{\mathrm{pa}(#1)} % See usage in notation.tex. Chosen to match Daphne's book.
\newcommand{\paG}[1]{\mathrm{pa}_G(#1)}
\newcommand{\paGG}[1]{\mathrm{pa}_{G'}(#1)}
\newcommand{\pacG}[1]{\mathrm{pa}_{\mathcal{G}}(#1)}

% Operators declared with the starred form, which places sub/superscripts
% as limits in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

% Ordinary upright operator names.
\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
% \ab is a short alias for \allowbreak.
\let\ab\allowbreak

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{booktabs} % for professional tables
\usepackage{paralist} % compactitem
\usepackage{enumitem}
\usepackage{multirow}
\usepackage{afterpage}
\usepackage{nicefrac}
\usepackage{tabularray}
\usepackage{graphbox}
\usepackage{wrapfig}
\usepackage{amssymb}% http://ctan.org/pkg/amssymb
\usepackage{pifont}% http://ctan.org/pkg/pifont
% Check mark and cross mark, taken from pifont's dingbat glyphs 51 and 55.
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%

\usepackage{tikz}
\usetikzlibrary{bayesnet}
\usetikzlibrary{arrows.meta}
\usetikzlibrary{arrows.meta,calc,decorations.markings,math,arrows.meta}
\usetikzlibrary{matrix}
\usepackage{pgfplots}
\pgfplotsset{compat=1.17}

\usepackage{hyperref}
\usepackage{xcolor, colortbl}
\usepackage[capitalize,noabbrev]{cleveref}
\usepackage{arydshln}

% Counter representation used by hyperref for anchors of the algorithm
% counter: the plain arabic algorithm number.
\newcommand{\theHalgorithm}{\arabic{algorithm}}
% Method name, and its expansion with the acronym letters in bold.
\newcommand{\OurApproach}{BISCUIT}
\newcommand{\OurApproachFull}{\textbf{B}inary \textbf{I}nteraction\textbf{s} for \textbf{C}a\textbf{u}sal \textbf{I}den\textbf{t}ifiability}
% Name of the adapted iVAE baseline.
\newcommand{\iVAEAdapt}{iVAE$^{*}$}
\newcommand{\robotstate}{regime variable}  % To be discussed what we really call it
% Abbreviations; the trailing comma is part of the macro. Call them as
% \eg{} / \ie{} so that the following space is not swallowed.
\newcommand{\eg}{\emph{e.g.},}
\newcommand{\ie}{\emph{i.e.},}
\newcommand{\highlight}[1]{\textbf{#1}}
\newcommand{\obs}{h}
% MLP symbols. The *math variants are for use inside math mode; the plain
% variants wrap them in $...$ for running text, so each symbol is written
% only once. "MLP" is set with \mathrm rather than \text: \text follows the
% surrounding text font, so it would turn italic inside theorem-like
% environments.
\newcommand{\MLPImath}{\mathrm{MLP}^{\hat{I}_i}_{\omega}}
\newcommand{\MLPI}{$\MLPImath$}
\newcommand{\MLPzimath}{\mathrm{MLP}^{z_i}_{\omega}}
\newcommand{\MLPzi}{$\MLPzimath$}
\newcommand{\intvdiv}{\Delta}
% Bold run-in heading with 3mm of space before and after it.
\newcommand{\pseudoparagraph}[1]{\vspace{3mm}\textbf{#1}\hspace{3mm}}
\usepackage{comment}
\usetikzlibrary{backgrounds}
\usetikzlibrary{calc}

% Colour palette, given as HTML hex values.
% In the causal-graph figure: tempcolor marks temporal causal relations,
% intvcolor the interactions, confcolor the regimes, obscolor the
% observations, and extraintvcolor the dashed edges from the previous time
% step into the interaction variables.
% instantcolor and extraintvcolor share the same hex value.
\definecolor{instantcolor}{HTML}{117733}
\definecolor{tempcolor}{HTML}{004488}
\definecolor{intvcolor}{HTML}{c83c04}
\definecolor{confcolor}{HTML}{886600}
\definecolor{obscolor}{HTML}{7a1f5c}
\definecolor{extraintvcolor}{HTML}{117733}
% Colours named gaussianplot*; presumably for a Gaussian plot elsewhere in
% the paper -- their use is not visible in this part of the file.
% \definecolor{gaussianplot_I1}{}{}
\definecolor{gaussianplotIone}{HTML}{BB5566}
\definecolor{gaussianplotItwo}{HTML}{004488}
\definecolor{gaussianplotIonetwo}{HTML}{BB9911}
\definecolor{gaussianplotObs}{HTML}{808080}
\definecolor{dark-green}{HTML}{00ba16}

% Paper title, built from \OurApproach so the method name is written once.
\newcommand{\ourtitle}{\OurApproach{}: Causal Representation Learning from Binary Interactions}

\usepackage{titletoc}
% Bold run-in heading followed by a 2mm gap.
\newcommand{\smallparagraph}[1]{\textbf{#1}\hspace{2mm}}
% Paragraph- and page-breaking tweaks (TeX primitives): \linepenalty is the
% cost charged per line when breaking a paragraph, so a large value favours
% paragraphs with fewer lines; \brokenpenalty is the page-break penalty after
% a hyphenated line, here set to zero.
\linepenalty=1000
\brokenpenalty=0

\begin{document}

\title{\ourtitle}

\author[1]{\href{mailto:p.lippe@uva.nl?Subject=UAI2023-BISCUIT}{Phillip Lippe}{}}
\author[2,3]{Sara Magliacane}
\author[2]{Sindy L\"owe}
\author[1]{Yuki M. Asano}
\author[4]{Taco Cohen}
\author[1]{Efstratios Gavves}

\affil[1]{%
    QUVA Lab, University of Amsterdam
}
\affil[2]{%
    AMLab, University of Amsterdam
}
\affil[3]{%
    MIT-IBM Watson AI Lab
}
\affil[4]{
    Qualcomm AI Research\thanks{
    Qualcomm AI Research is an initiative of Qualcomm Technologies, Inc.
    }\\
    Amsterdam, Netherlands
}

\maketitle

\begin{abstract}
Identifying the causal variables of an environment and how to intervene on them is of core value in applications such as robotics and embodied AI.
While an agent can commonly interact with the environment and may implicitly perturb the behavior of some of these causal variables, often the targets it affects remain unknown.
In this paper, we show that causal variables can still be identified for many common setups, e.g., additive Gaussian noise models, if the agent's interactions with a causal variable can be described by an unknown binary variable. This happens when each causal variable has two different mechanisms, e.g., an observational and an interventional one.
Using this identifiability result, we propose \OurApproach{}, a method for simultaneously learning causal variables and their corresponding binary interaction variables.
On three robotic-inspired datasets, \OurApproach{} accurately identifies causal variables and can even be scaled to complex, realistic environments for embodied AI.
\end{abstract}

\section{Introduction}
\label{sec:introduction}

\begin{figure}[t!]
    \centering
    \begin{tikzpicture}
        \node[inner sep=0pt, outer sep=0pt] (img) at (0,0)
        {\includegraphics[width=\linewidth]{figures/biscuit_figure_1_v2.pdf}};
        \node[inner sep=0pt] (Xtn1) at (-0.455\linewidth, 0.13\linewidth) {\scriptsize $X^{t-1}$};
        \node[inner sep=0pt] (Xt) at (-0.455\linewidth, -0.14\linewidth) {\scriptsize $X^{t}$};
        \node[inner sep=0pt] (Rt) at (-0.455\linewidth, 0.0\linewidth) {\scriptsize $R^{t}$};
    \end{tikzpicture}
    \caption{
    \OurApproach{} identifies causal variables from images $X^{t-1}$ and $X^{t}$, by learning to encode an observable \robotstate{} $R^t$, \eg{} an action, as binary variables. 
    Conditioning each latent on one of these binary variables identifies causal variables in environments like iTHOR \cite{kolve2017ai}.
    }
    \label{fig:introduction_figure_1}
\end{figure}

\begin{figure*}[t!]
    \centering
    \resizebox{0.9\textwidth}{!}{
        \begin{tikzpicture}
            \node (Ct) at (2.75,0) [draw,rounded corners=15pt,minimum width=7.5cm,minimum height=2cm,fill=white] {};
    	\node (Ct1) at (11.75,0) [draw,rounded corners=15pt,minimum width=7.5cm,minimum height=2cm,fill=white] {};
    	
    	\node[latent, minimum size=.9cm] (C1t) {$C^{t-1}_1$} ; %
    	\node[latent, minimum size=.9cm, right=of C1t] (C2t) {$C^{t-1}_2$} ; %
    	\node[const, right=of C2t] (dotdott) {$\cdots$} ; %
    	\node[latent, minimum size=.9cm, right=of C2t, xshift=1.5cm] (CKt) {$C^{t-1}_K$} ; %
    	\node[obs, above=of Ct, minimum size=.9cm, yshift=-.3cm] (Xt) {$X^{t-1}$} ; %
    	
    	\node[latent, xshift=9cm, minimum size=.9cm] (C1t1) {$C^{t}_1$} ; %
    	\node[latent, right=of C1t, xshift=9cm, minimum size=.9cm] (C2t1) {$C^{t}_2$} ; %
    	\node[const, right=of C2t1] (dotdott1) {$\cdots$} ; %
    	\node[latent, right=of C2t, xshift=10.5cm, minimum size=.9cm] (CKt1) {$C^{t}_K$} ; %
    	\node[obs, above=of Ct1, minimum size=.9cm, yshift=-.3cm] (Xt1) {$X^{t}$} ; %
    	
    	\node[latent, below=of C1t1, yshift=-.1cm, minimum size=.9cm] (I1t1) {$I^{t}_1$} ; %
    	\node[latent, below=of C2t1, yshift=-.1cm, minimum size=.9cm] (I2t1) {$I^{t}_2$} ; %
    	\node[const, right=of I2t1] (dotdotIt1) {$\cdots$} ; %
    	\node[latent, below=of CKt1, yshift=-.1cm, minimum size=.9cm] (IKt1) {$I^{t}_K$} ; %
    	\node[obs, below=of I2t1, xshift=1.2cm, minimum size=.9cm, yshift=.3cm] (Rt1) {$R^{t}$} ; %
    	
    	\node[latent, below=of C1t, yshift=-.1cm, minimum size=.9cm] (I1t) {$I^{t-1}_1$} ; %
    	\node[latent, below=of C2t, yshift=-.1cm, minimum size=.9cm] (I2t) {$I^{t-1}_2$} ; %
    	\node[const, right=of I2t] (dotdotIt) {$\cdots$} ; %
    	\node[latent, below=of CKt, yshift=-.1cm, minimum size=.9cm] (IKt) {$I^{t-1}_K$} ; %
    	\node[obs, below=of I2t, xshift=1.2cm, minimum size=.9cm, yshift=.3cm] (Rt) {$R^{t-1}$} ; %
    	
    	\node[const, left=of Ct] (dotdotCt) {$\cdots$\hspace{2mm}\phantom{}} ;
    	\node[const, left=of Rt] (dotdotRt) {$\cdots$\hspace{2mm}\phantom{}} ;
    	\node[const, right=of Ct1] (dotdotCt1) {\hspace{2mm}$\cdots$} ;
    	\node[const, right=of Rt1] (dotdotRt1) {\hspace{2mm}$\cdots$} ;
    	
    	\node[align=center,tempcolor] at (7.25,1.35) {Temporal causal\\relations};
    	\node[align=center,intvcolor,fill=white] at (7.3,-1.3) {Interactions};
    	\node[align=center,obscolor] at (4.5,2.2) {Observations};
    	\node[align=center,confcolor] at (5.2,-3.1) {Regimes};
    	\node[align=center,black] at (0.25,1.3) {Causal variables};
    	
    	\edge[obscolor]{Ct}{Xt} ;
    	% \edge[obscolor]{Eot}{Xt} ;
    	\edge[obscolor]{Ct1}{Xt1} ;
    	% \edge[obscolor]{Eot1}{Xt1} ;
    	% \edge[obscolor,dashed]{Rt}{Xt} ;
    	% \edge[obscolor,dashed]{Rt1}{Xt1} ;
     
            %% For the theory, we will consider $R^{t}$ to be *not* in the observation
            \begin{scope}[on background layer]
    	    % \draw[->,obscolor] (Rt) to [out=90,in=310] (Xt);
    	    % \draw[->,obscolor] (Rt1) to [out=90,in=310] (Xt1);
    	    % \ifextraintv
                \draw[->,dashed,extraintvcolor] (Ct) to [out=355, in=150] (I1t1);
                \draw[->,dashed,extraintvcolor] (Ct) to [out=355, in=150] (I2t1);
                \draw[->,dashed,extraintvcolor] (Ct) to [out=355, in=150] (IKt1);
                \draw[->,dashed,tempcolor] (Ct) to [out=342, in=180] (Rt1);
        	% \fi
    	\end{scope}
    	
    	\edge[tempcolor,dashed]{Rt}{Rt1} ;
    	\edge[tempcolor]{Ct}{Ct1} ;
    	\edge[tempcolor]{dotdotCt}{Ct} ;
    	\edge[tempcolor]{dotdotRt}{Rt} ;
    	\edge[tempcolor]{Ct1}{dotdotCt1} ;
    	\edge[tempcolor]{Rt1}{dotdotRt1} ;
    	
    	\edge[intvcolor]{I1t1}{C1t1} ;
    	\edge[intvcolor]{I2t1}{C2t1} ;
    	\edge[intvcolor]{IKt1}{CKt1} ;
    	
    	\edge[confcolor]{Rt1}{I1t1} ;
    	\edge[confcolor]{Rt1}{I2t1} ;
    	\edge[confcolor]{Rt1}{IKt1} ;
    	
    	\edge[intvcolor]{I1t}{C1t} ;
    	\edge[intvcolor]{I2t}{C2t} ;
    	\edge[intvcolor]{IKt}{CKt} ;
    	
    	\edge[confcolor]{Rt}{I1t} ;
    	\edge[confcolor]{Rt}{I2t} ;
    	\edge[confcolor]{Rt}{IKt} ;
    
            \draw[->,tempcolor,opacity=0.4] (C1t) to [out=16, in=164] (C1t1);
            \draw[->,tempcolor,opacity=0.4] (C2t) to [out=16, in=164] (C2t1);
            \draw[->,tempcolor,opacity=0.4] (CKt) to [out=16, in=164] (CKt1);
            \draw[->,tempcolor,opacity=0.4] (CKt) to [out=10, in=170] (C1t1);
            \draw[->,tempcolor,opacity=0.4] (CKt) to [out=20, in=160] (C2t1);
        \end{tikzpicture}
    }
    \caption{A representation of our assumptions. Observed variables are shown in gray ($X^{\tau}$ and $R^{\tau}$) and latent variables in white. Optional causal edges are shown as dashed lines. A latent causal variable $C^{t}_i$ has as parents a subset of the causal factors at the \textcolor{tempcolor}{previous time step} $C^{t-1}=\{C_1^{t-1}, \dots, C_{K}^{t-1}\}$, and its latent \textcolor{intvcolor}{binary interaction variable} $I^{t}_i$. The interaction variables are determined by an observed \textcolor{confcolor}{\robotstate{}} $R^{t}$ and potentially by the variables from the \textcolor{extraintvcolor}{previous time step} $C^{t-1}$ (\eg{} in a collision). The \robotstate{} can be a dynamical process over time as well, for example, by depending on the previous time step. The \textcolor{obscolor}{observation} $X^{\tau}$ is a high-dimensional entangled representation of all causal variables $C^{\tau}$ at time step $\tau$.}
    \label{fig:main_causal_graph_full}
\end{figure*}

Learning a low-dimensional representation of an environment is a crucial step in many applications, \eg{} robotics \cite{lesort2018state}, embodied AI \cite{kolve2017ai} and reinforcement learning \cite{hafner2021mastering, traeuble2022the}.
A promising direction for learning robust and actionable representations is \textit{causal representation learning} \cite{schoelkopf2021towards}, which aims to identify the underlying causal variables and their relations in a given environment from high-dimensional observations, \eg{} images.
However, learning causal variables from high-dimensional observations is a considerable challenge and may not always be possible, since multiple underlying causal systems could generate the same data distribution \cite{hyvarinen1999nonlinear}.
%up to permutation and component-wise transformation 
To overcome this, several works make use of additional information, \eg{} by using counterfactual observations \cite{brehmer2022weakly, ahuja2022weakly, locatello2020weakly} or observed intervention targets \cite{lippe2022citris, lippe2022icitris}.
Alternatively, one can restrict the distributions of causal variables, \eg{} by considering environments with non-stationary noise \cite{yao2021learning, yao2022temporally, khemakhem2020variational} or sparse causal relations \cite{lachapelle2021disentanglement, lachapelle2022partial}.

In this paper, instead, we focus on interactive environments, where an agent can perform actions which may have an effect on the underlying causal variables.
We will assume that these interactions between the agent and the causal variables can be described by \emph{binary} variables, \ie{} that with the agent's actions, we can switch between two mechanisms, or distributions, of a causal variable, similarly to performing soft interventions.
Despite being binary, these interactions include a wide range of common scenarios, such as a robot pressing a button, opening/closing a door, or even colliding with a moving object and altering its course.

In this setup, we prove that causal variables are identifiable if the agent interacts with each causal variable in a distinct pattern, \ie{} does not always interact with any two causal variables at the same time.
We show that for $K$ variables, we can in many cases fulfill this by having as few as $\lfloor \log_2 K \rfloor +2$ actions with sufficiently diverse effects, allowing identifiability even for a limited number of actions.
The binary nature of the interactions permits the identification of a wider class of causal models than previous work in a similar setup, including the common, challenging additive Gaussian noise model \cite{hyvarinen1999nonlinear}.

Based on these theoretical results, we propose \OurApproach{} (\OurApproachFull{}).
\OurApproach{} is a variational autoencoder \cite{kingma2014auto} which learns the causal variables and the agent's binary interactions with them in an unsupervised manner (see \cref{fig:introduction_figure_1}).
In experiments on robotic-inspired datasets, \OurApproach{} identifies the causal variables and outperforms previous methods.
Furthermore, we apply \OurApproach{} to the realistic 3D embodied AI environment iTHOR \cite{kolve2017ai}, and show that \OurApproach{} is able to generate realistic renderings of unseen causal states in a controlled manner.
This highlights the potential of causal representation learning in the challenging task of embodied AI.
In summary, our contributions are:
\begin{compactitem}
    \item We show that under mild assumptions, binary interactions with unknown targets identify the causal variables from high-dimensional observations over time.
    \item We propose \OurApproach{}, a causal representation learning framework that learns the causal variables and their binary interactions simultaneously.
    \item We empirically show that \OurApproach{} identifies both the causal variables and the interaction targets on three robotic-inspired causal representation learning benchmarks, and allows for controllable generations.
\end{compactitem}

\section{Preliminaries}
\label{sec:preliminaries}

In this paper, we consider a causal model $\mathcal{M}$ as visualized in \cref{fig:main_causal_graph_full}.
The model $\mathcal{M}$ consists of $K$ latent causal variables $C_1,\dots,C_K$ which interact with each other over time, like in a dynamic Bayesian Network (DBN) \cite{DBN, Murphy_DBN}.
In other words, at each time step $t$, we instantiate the causal variables as $C^t=\{C_1^t,\dots,C_K^t\}\in\mathcal{C}$, where $\mathcal{C}\subseteq\mathbb{R}^K$ is the domain.
In terms of the causal graph, each variable $C^t_i$ may be caused by a subset of variables in the previous time step $\{C_1^{t-1},\dots,C_K^{t-1}\}$.
For simplicity, we restrict the temporal causal graph to only model dependencies on the previous time step. 
Yet, as we show in Appendix B.3, our results in this paper can be trivially extended to longer dependencies, \eg{} $(C^{t-2},C^{t-1})\to C^t$, since $C^{t-1}$ is only used for ensuring conditional independence.
As in DBNs, we consider the graph structure to be time-invariant.

Besides the intra-variable dynamics, we assume that the causal system is affected by a \robotstate{} $R^t$ with arbitrary domain $\mathcal{R}$, which can be continuous or discrete and of arbitrary dimensionality.
This \robotstate{} can model any known external causes on the system, which, for instance, could be a robotic arm interacting with an environment.
For the causal graph, we assume that the effect of the \robotstate{} $R^t$ on a causal variable $C^t_i$ can be described by a latent \emph{binary interaction} variable $I^t_i\in\{0,1\}$.
This can be interpreted as each causal variable having two mechanisms/distributions, \eg{} an observational and an interventional mechanism, which has similarly been assumed in previous work \cite{brehmer2022weakly, lippe2022citris, lippe2022icitris}. 
Thereby, the role of the interaction variable $I^t_i$ is to select the mechanism, \ie{} observational or interventional, at time step $t$.
For example, a collision between an agent and an object is an interaction that switches the dynamics of the object from its natural course to a perturbed one.
In this paper, we consider the interaction variable $I^t_i$ to be an unknown function of the \robotstate{} and the previous causal variables, \ie{} $I^t_i=f_i(R^t,C^{t-1})$.
The dependency on the previous time step allows us to model interactions that only occur in certain states of the system, \eg{} a collision between an agent (modeled by $R^t$) and an object with position $C_i^{t-1}$ will only happen for certain positions of the agent and the object.

We consider the causal graph of \cref{fig:main_causal_graph_full} to be causally sufficient, \ie{} we assume there are no other unobserved confounders except the ones we have described in the previous paragraphs and represented in the figure, and that the causal variables within the same time step are independent of each other, conditioned on the previous time step and their interaction variables.
We summarize the dynamics as $p(C^t|C^{t-1},R^t)=\prod_{i=1}^K p(C^t_i|C^{t-1},I^t_i)$.
Although $C^t_i$ only depends on a subset of $C^{t-1}$, w.l.o.g. we model it as depending on all causal variables from the previous time step.

In causal representation learning, the task is to identify causal variables from an entangled, potentially higher-dimensional representation, \eg{} an image.
We consider an injective observation function $g$, mapping the causal variables $C^t$ to an observation $X^t=g(C^t)$.
Following \citet{klindt2021towards,yao2021learning}, we assume $g$ to be defined everywhere for $C^t$ and differentiable almost everywhere.
In our setting, once we identify the causal variables, the causal graph can be trivially learned by testing for conditional independence, since the causal graph is limited to edges following the temporal dimension, \ie{} from $C^{t-1}$ to $C^t$.
We provide further details on the graph discovery and an example of learned causal variables in Appendix B.4. 

\section{Identifying Causal Variables}
\label{sec:theory}

Our goal in this paper is to identify the causal variables $C_1,...,C_K$ of a causal system from sequences of observations $(X^t,R^t)$.
We first define the identifiability class that we consider.
We then provide an intuition on how binary interactions enable identifiability, before presenting our two identifiability results.
The practical algorithm based on these results, \OurApproach{}, is presented in \cref{sec:method}.

\subsection{Identifiability Class and Definitions}

Intuitively, we seek to estimate an observation function $\hat{g}$, which maps a latent space $\hat{\mathcal{C}}$ to observations $X$, and models each true causal variable $C_i$ in a different dimension of the latent space $\hat{\mathcal{C}}$. 
This observation function should be equivalent to the true observation function $g$, up to permuting and transforming the variables individually, \eg{} through scaling. 
Several previous works \cite{yao2021learning, yao2022temporally, lachapelle2021disentanglement, khemakhem2020variational} have considered equivalent identifiability classes, which we define as:

\begin{definition}
    \label{def:identifiability}
    Consider a model $\mathcal{M}=\langle g,f,\omega,\mathcal{C} \rangle$ with an injective function $g(C)=X$ with $C\in\mathcal{C}$ and a latent distribution $p_{\omega}(C^{t}|C^{t-1},R^{t})$, parameterized by $\omega$ and defined as:
    \begin{equation*}
        p_{\omega}(C^{t}|C^{t-1},R^{t})=\prod_{i=1}^{K} p_{\omega,i}\big(C^t_i|C^{t-1},f_i(R^t,C^{t-1})\big),
    \end{equation*}
    where $f_i:\mathcal{R}\times\mathcal{C}\to\{0,1\}$ outputs a binary variable for the variable $C^t_i$. 
    We call $\mathcal{M}$ identifiable iff for any other model $\mathcal{\widetilde{M}}=\langle \tilde{g},\tilde{f},\tilde{\omega},\tilde{\mathcal{C}} \rangle$ with the same observational distribution $p(X^t|X^{t-1},R^t)$, $g$ and $\tilde{g}$ are equivalent up to a component-wise invertible transformation $T$ and a permutation $\pi$:
    \begin{equation*}
        p_{\mathcal{M}}(X^t|X^{t-1}\!,R^t) = p_{\mathcal{\widetilde{M}}}(X^t|X^{t-1}\!,R^t) \Rightarrow g = \tilde{g} \circ T \circ \pi
    \end{equation*}
\end{definition}

To achieve this identifiability, we rely on the interaction variables $I^t_i$ being binary and having \textit{distinct interaction patterns}, a weaker form of faithfulness on the interaction variables.
Intuitively, we do not allow any two causal variables to have identical interaction variables $I^t_i,I^t_j$ across the whole dataset, \ie{} to always be interacted with at the same time.
Similarly, if all $I^t_i$ are always zero ($\forall t,i\colon I^t_i=0$), then we fall back into the well-known unidentifiable setting of non-linear ICA \cite{hyvarinen1999nonlinear}.
Since interaction variables can also be functions of the previous state, we additionally assume that, for all possible previous states, no interaction variable is a deterministic function of any other interaction variable. 
Thus, we assume that all causal variables have \textit{distinct interaction patterns}, which we formally define as:

\begin{definition}
    \label{def:theory_distinct_intvs}
    A causal variable $C_i$ in $\mathcal{M}=\langle g,f,\omega,\mathcal{C}\rangle$ has a \textbf{distinct interaction pattern} if for all values of $C^{t-1}$, its interaction variable $I^t_i=f_i(R^t,C^{t-1})$ is not a deterministic function $b\!:\!\{0,1\}\!\to\!\{0,1\}$ of any other $I^t_j$:
    \begin{equation*}
        \label{eq:theory_assum_distinct_intvs}
        \forall C^{t-1}\!,\forall j\!\neq\!i, \nexists b, \forall R^t\colon f_i(R^t,C^{t-1})=b(f_j(R^t,C^{t-1})).
    \end{equation*}
\end{definition}

This assumption generalizes the intervention setup of \citet{lippe2022intervention}, which has a similar condition on its binary intervention variables, but assumed them to be independent of the previous time step.
This implies that we can create a distinct interaction pattern for each of the $K$ causal variables by having as few as $\lfloor \log_2 K\rfloor + 2$ different values for $R^t$, if the interaction variables are independent of $C^{t-1}$.
In contrast, other methods in similar setups that also exploit an external, temporally independent, observed variable \citep{yao2022temporally, yao2021learning, khemakhem2020variational} require the number of regimes to scale linearly with the number of causal variables.
If the interaction variables depend on $C^{t-1}$, the lower bound on the number of different values for $R^t$ depends on the causal model $\mathcal{M}$, more specifically its interaction functions $f_i$. Concretely, the lower bound for a causal model $\mathcal{M}$ is the size of the smallest set of values of $R^t$ that ensures distinct interaction patterns for all $C^{t-1}$ in $\mathcal{M}$. In the worst case, each $C^{t-1}$ may require different values of $R^t$ to fulfill the condition of \cref{def:theory_distinct_intvs}, such that $R^t$ would need to be of the same domain as $C^{t-1}$ (for instance being continuous). At the same time, for models $\mathcal{M}$ in which the condition of \cref{def:theory_distinct_intvs} can be fulfilled by the same values of $R^t$ for all $C^{t-1}$, we again recover the lower bound of $\lfloor \log_2 K\rfloor + 2$ different values of $R^t$.

\subsection{Intuition: Additive Gaussian Noise}
\label{sec:theory_intuition}

\begin{figure}[t!]
    \centering
    \begin{tabular}{cc}
        \begin{tikzpicture}
            \begin{axis}[
              width=0.6\columnwidth,
              height=0.6\columnwidth,
              axis lines=middle,
              axis line style={draw=none},
              tick style={draw=none},
              xmin=-0.99,xmax=1.49,ymin=-0.99,ymax=1.49,
              xtick distance=0.5,
              ytick distance=0.5,
              yticklabels={,,},
              xticklabels={,,},
              title={True variables},
              grid=major,
              grid style={thin,densely dotted,black!20}
              ]
              \draw[black,-Stealth] (-1.0,0.0) -- (1.5,0.0);
              \draw[black,-Stealth] (0.0,-1.0) -- (0.0,1.5);
              
              \coordinate (Obs) at (axis cs: 0.0,0.0);
              \coordinate (I1) at (axis cs: 1.0,0.0);
              \coordinate (I2) at (axis cs: 0.0,1.0);
              \coordinate (I1I2) at (axis cs: 1.0,1.0);
              \draw[thick,color=gaussianplotIone,->](Obs)-- node[yshift=-3mm,xshift=0mm]{$I_1$}(I1);
              \draw[thick,color=gaussianplotItwo,->](Obs)--  node[yshift=0mm,xshift=-3mm]{$I_2$}(I2);
              \draw[color=gaussianplotIonetwo,dashed,opacity=0.5](I1)--(I1I2);
              \draw[color=gaussianplotIonetwo,dashed,opacity=0.5](I2)--(I1I2);
              \draw[thick,color=gaussianplotIonetwo,->](Obs)-- node[yshift=-2mm,xshift=2mm] {$I_{12}$}(I1I2);
              
              \draw[line width=0.7mm,opacity=0.8,gaussianplotItwo] (axis cs: -0.07,1.0) -- (axis cs: 0.0,1.0);
              \draw[line width=0.7mm,opacity=0.8,gaussianplotIonetwo] (axis cs: 0.0,1.0) -- (axis cs: 0.07,1.0);
              \draw[line width=0.7mm,opacity=0.8,gaussianplotObs] (axis cs: 0.07,0.0) -- (axis cs: -0.07,0.0);
            
              \draw[line width=0.7mm,opacity=0.8,gaussianplotIone] (axis cs: 1.0,-0.07) -- (axis cs: 1.0,0.0);
              \draw[line width=0.7mm,opacity=0.8,gaussianplotIonetwo] (axis cs: 1.0,0.0) -- (axis cs: 1.0,0.07);
              \draw[line width=0.7mm,opacity=0.8,gaussianplotObs] (axis cs: 0.0,0.07) -- (axis cs: 0.0,-0.07);
            \end{axis}
            \node (C2) at (1.7,3.2) {$C_2$};
            \node (C1) at (3.2,1.7) {$C_1$};
        \end{tikzpicture}
        & 
        \begin{tikzpicture}
            \begin{axis}[
              anchor=origin,
              width=0.6\columnwidth,
              height=0.6\columnwidth,
              axis lines=middle,
              axis line style={draw=none},
              tick style={draw=none},
              xmin=-1.0,xmax=1.5,ymin=-1.0,ymax=1.5,
              xtick distance=0.5,
              ytick distance=0.5,
              yticklabels={,,},
              xticklabels={,,},
              title={Rotated variables},
              grid=major,
              grid style={draw=none}
              ]
              \foreach \x in {-3,-2,-1,0,1,2,3,4}
                {   
                  \edef\temp{\noexpand
                  \draw[thin,densely dotted,black!20] (axis cs: \x*0.5*0.707+10,\x*0.5*0.707-10) -- (axis cs: \x*0.5*0.707-10,\x*0.5*0.707+10);
                  }
                  \temp
                  \edef\temp{\noexpand
                  \draw[thin,densely dotted,black!20] (axis cs: -\x*0.5*0.707+10,\x*0.5*0.707+10) -- (axis cs: -\x*0.5*0.707-10,\x*0.5*0.707-10);
                  }
                  \temp
                }
              
              \draw[black,-Stealth] (-1.0,1.0) -- (1.0,-1.0);
              \draw[black,-Stealth] (-1.0,-1.0) -- (1.5,1.5);
              
              \coordinate (Obs) at (axis cs: 0.0,0.0);
              \coordinate (I1) at (axis cs: 1.0,0.0);
              \coordinate (I2) at (axis cs: 0.0,1.0);
              \coordinate (I1I2) at (axis cs: 1.0,1.0);
              \draw[thick,color=gaussianplotIone,->](Obs)-- node[yshift=-3mm,xshift=0mm]{$I_1$}(I1);
              \draw[thick,color=gaussianplotItwo,->](Obs)--  node[yshift=0mm,xshift=-3mm]{$I_2$}(I2);
              \draw[color=gaussianplotItwo,dashed,opacity=0.5](I2)--(axis cs: 0.5,0.5);
              \draw[color=gaussianplotItwo,dashed,opacity=0.5](I2)--(axis cs: -0.5,0.5);
              \draw[color=gaussianplotIone,dashed,opacity=0.5](I1)--(axis cs: 0.5,0.5);
              \draw[color=gaussianplotIone,dashed,opacity=0.5](I1)--(axis cs: 0.5,-0.5);
              \draw[thick,color=gaussianplotIonetwo,->](Obs)-- node[yshift=2mm,xshift=7mm] {$I_{12}$}(I1I2);
    
              \draw[line width=0.7mm,opacity=0.8,gaussianplotIone] (axis cs: 0.5+0.05,-0.5+0.05) -- (axis cs: 0.5-0.05,-0.5-0.05);
              \draw[line width=0.7mm,opacity=0.8,gaussianplotItwo] (axis cs: -0.5+0.05,0.5+0.05) -- (axis cs: -0.5-0.05,0.5-0.05);
              \draw[line width=0.7mm,opacity=0.8,gaussianplotObs] (axis cs: 0.05,0.05) -- (axis cs: -0.05,-0.05);
              
              \draw[line width=0.7mm,opacity=0.8,gaussianplotIone] (axis cs: 0.5+0.05,0.5-0.05) -- (axis cs: 0.5,0.5);
              \draw[line width=0.7mm,opacity=0.8,gaussianplotItwo] (axis cs: 0.5,0.5) -- (axis cs: 0.5-0.05,0.5+0.05);
              \draw[line width=0.7mm,opacity=0.8,gaussianplotObs] (axis cs: -0.05,0.05) -- (axis cs: 0.05,-0.05);
              \draw[line width=0.7mm,opacity=0.8,gaussianplotIonetwo] (axis cs: 1.0+0.05,1.0-0.05) -- (axis cs: 1.0-0.05,1.0+0.05);
            \end{axis}
            \node (Chat2) at (2.1,1.7) {$\hat{C}_2$};
            \node (Chat1) at (1.5,-1.0) {$\hat{C}_1$};
        \end{tikzpicture}
    \end{tabular}
    \caption{
    Binary interactions identify the additive Gaussian noise model in \cref{eq:theory_additive_gaussian_model}. 
    The plots show the change of the mean for each variable for a fixed $C^{t-1}$ under interactions affecting only one variable (\textcolor{gaussianplotIone}{$I_1=1$} or \textcolor{gaussianplotItwo}{$I_2=1$}), and under joint interactions \textcolor{gaussianplotIonetwo}{$I_{12}$} (\textcolor{gaussianplotIonetwo}{$I_{1}=I_{2}=1$}). 
    Ticks on axes show mean values for each variable, where the mean for $(I_1=0, I_2=0)$ lies at the origin. The colors of the ticks match the interaction color. 
    \textbf{Left}: true causal variables $C_1$ and $C_2$. 
    Each variable has two possible means after any of the possible interactions. 
    The effect of the interactions can be described by a binary variable per axis. 
    \textbf{Right}: a rotated representation. 
    Both $\hat{C}_1$ and $\hat{C}_2$ have three possible means, which cannot be described by a binary variable per axis anymore.
    }
    \label{fig:main_gaussian_example}
\end{figure}

We first provide some intuition on how binary interactions, \ie{} knowing that each variable has exactly two potential mechanisms, enable identifiability, even when we do not know which variables are interacted with at each time step.
We take as an example an additive Gaussian noise model with two variables $C_1,C_2$, each described by the equation:
\begin{equation}
    \label{eq:theory_additive_gaussian_model}
    C^{t}_i = \mu_i(C^{t-1}, I^{t}_i) + \epsilon_i, \hspace{3mm} \epsilon_i \sim \mathcal{N}(0, \sigma^2),
\end{equation}
where $\epsilon_i$ is additive noise with variance $\sigma^2$, and $\mu_i$ a function for the mean with $\mu_i(C^{t-1}, I^{t}_i=0)\neq \mu_i(C^{t-1}, I^{t}_i=1)$.
Due to the rotational invariance of Gaussians, the true causal variables $C_1,C_2$ and their rotated counterparts $\hat{C}_1, \hat{C}_2$ model the same distribution with the same factorization: $\prod_{i=1}^2 p_i(C^{t}_i|C^{t-1},R^{t}) = \prod_{i=1}^2 \hat{p}_i(\hat{C}^{t}_i|\hat{C}^{t-1},R^{t})$.
This property makes the model unidentifiable in many cases \cite{yao2022temporally, khemakhem2020variational, hyvaerinen2019nonlinear, lachapelle2021disentanglement}. 
However, when the effect of the \robotstate{} on a causal variable $C_i$ can be described by a \textit{binary} variable, \ie{} $I_i\in\{0,1\}$, the two representations become distinguishable.
In \cref{fig:main_gaussian_example}, we visualize the two representations by showing the means of the different variables under interactions; we detail this in Appendix B.6 and provide the intuition here. % as ticks on their respective axes, where the color of the tick represents the interaction it models.
For the original representation $C_1,C_2$, each variable's mean takes on only two different values for any $R^t$. For example, for \robotstate{}s where $I_1=0$, the variable $C_1$ takes a mean that is in the center of the coordinate system. Similarly, when $I_1=1$, the variable $C_1$ will take a mean that is represented as a pink (for $I_1=1, I_2=0$) or yellow tick (for $I_1=1, I_2=1$).  
In contrast, for the rotated variables, both $\hat{C}_1$ and $\hat{C}_2$ have three different means depending on the interactions, making it impossible to model them with individual binary variables.
Intuitively, the only alternative representations to $C_1,C_2$ which can be described by binary variables are permutations and/or element-wise transformations, effectively identifying the causal variables according to our identifiability class.

\subsection{Identifiability Result}
\label{sec:theory_identifiability_result}

When extending this intuition to more than two variables, we find that systems may become unidentifiable when the two distributions of each causal variable, \ie{} interacted and not interacted, always differ in the same manner.
Formally, we denote the log-likelihood difference between the two distributions of a causal variable $C^t_i$ as $\intvdiv(C^{t}_i|C^{t-1}) :=\log p(C^{t}_i|C^{t-1},I^t_i=1) - \log p(C^{t}_i|C^{t-1},I^t_i=0)$.
If this difference or its derivative w.r.t. $C^t_i$ is constant for all values of $C^t_i$, the effect of the interactions could be potentially modeled in fewer than $K$ dimensions, giving rise to models that do not identify the causal model $\mathcal{M}$.

To prevent this, we consider two possible setups for ensuring sufficient variability of $\intvdiv(C^{t}_i|C^{t-1})$: \emph{dynamics variability}, and \emph{time variability}.
We present our identifiability result below and provide the proofs in Appendix B.

\begin{theorem}
    \label{theo:main_theorem_no_gaus}
    \label{theo:main_theorem_time_var}
    \label{theo:main_theorem}
    % In the setup of \cref{sec:preliminaries},
    An estimated model $\mathcal{\widehat{M}}=\langle \hat{g},\hat{f},\hat{\omega},\hat{\mathcal{C}} \rangle$ identifies the true causal model $\mathcal{M}=\langle g,f,\omega,\mathcal{C} \rangle$ if:
    \begin{compactenum}
        \item (\textbf{Observations}) $\mathcal{\widehat{M}}$ and $\mathcal{M}$ model the same likelihood:
        $$
            % (g^*, \omega^*) \in \arg\max_{(g,\omega)} \E_{\mathcal{S}}\left[\log p_{g,\omega}(X^t|X^{t-1},R^t)\right];
            p_{\widehat{\mathcal{M}}}(X^t|X^{t-1},R^t)=p_{\mathcal{M}}(X^t|X^{t-1},R^t);
        $$
        \item (\textbf{Distinct Interaction Patterns}) Each variable $C_i$ in $\mathcal{M}$ has a distinct interaction pattern (Definition \ref{def:theory_distinct_intvs});
    \end{compactenum}
    and one of the following two conditions holds for $\mathcal{M}$:
    \begin{compactenum}
        \item[A.] (\textbf{Dynamics Variability}) Each variable's log-likelihood difference is twice differentiable and its second derivative is not always zero:
        $$
            \forall C^t_i ,\exists C^{t-1} \colon \frac{\partial^2 \intvdiv(C^{t}_i|C^{t-1})}{\partial (C^t_i)^2} \neq 0;
        $$
        \item[B.] (\textbf{Time Variability}) For any $C^{t}\in\mathcal{C}$, there exist $K+1$ different values of $C^{t-1}$ denoted with $c^1,...,c^{K+1}\in\mathcal{C}$, for which the vectors $v_1,...,v_{K}\in\R^{K+1}$ with
        \ifinappendix
        $$
            v_{i} = \begin{bmatrix}
                \frac{\partial \intvdiv\left(C^{t}_i|C^{t-1}=c^1\right)}{\partial C^{t}_i} & 
                \frac{\partial \intvdiv\left(C^{t}_i|C^{t-1}=c^2\right)}{\partial C^{t}_i} & 
                \cdots &
                \frac{\partial \intvdiv\left(C^{t}_i|C^{t-1}=c^{K+1}\right)}{\partial C^{t}_i} \\ 
            \end{bmatrix}^T \in \mathbb{R}^{K+1}
        $$
        \else
        $$
            v_{i} = \begin{bmatrix}
                \frac{\partial \intvdiv\left(C^{t}_i|C^{t-1}=c^1\right)}{\partial C^{t}_i} & 
                % \frac{\partial \intvdiv\left(C^{t}_i|C^{t-1}=c^2\right)}{\partial C^{t}_i} & 
                \cdots &
                \frac{\partial \intvdiv\left(C^{t}_i|C^{t-1}=c^{K+1}\right)}{\partial C^{t}_i} \\ 
            \end{bmatrix}^T
        $$
        % $$
        %     v^{i}_j=\frac{\partial \intvdiv(C^{t}_i|C^{t-1}=c^j)}{\partial C^{t}_i}
        % $$
        \fi
        are linearly independent.
    \end{compactenum}
\end{theorem}

Intuitively, \cref{theo:main_theorem} states that we can identify a causal model $\mathcal{M}$ by maximum likelihood optimization, if we have distinct interaction patterns (Definition~\ref{def:theory_distinct_intvs}) and $\intvdiv(C^{t}_i|C^{t-1})$ varies sufficiently, either in dynamics or in time.
\emph{Dynamics variability} can be achieved by the difference $\intvdiv(C^{t}_i|C^{t-1})$ being non-linear for all causal variables.
This assumption is common in previous ICA-based works \cite{yao2021learning, yao2022temporally, hyvaerinen2019nonlinear} and, for instance, allows for Gaussian distributions with variable standard deviations.
While allowing for a variety of distributions, it excludes additive Gaussian noise models.
We can include this challenging setup by considering the \emph{time variability} assumption, which states that the effect of the interaction depends on the previous time step, and must do so differently for each variable.
As an example, consider a dynamical system with several moving objects, where an interaction is a collision with a robotic arm.
The time variability condition is commonly fulfilled by the fact that the trajectory of each object depends on its own velocity and position.

In comparison to previous work, our identifiability results cover a larger class of causal models by exploiting the binary nature of the interaction variables. We provide a detailed comparison in Appendix B.5. 
In short, closest to our setup, \citet{khemakhem2020variational} and \citet{yao2021learning} require a stronger form of both our dynamics and time variability assumptions, excluding common models like additive Gaussian noise models. 
\citet{lachapelle2021disentanglement} requires that no two causal variables share the same parents, limiting the allowed temporal graph structures. 
Meanwhile, our identifiability results allow for arbitrary temporal causal graphs.
Further, the two conditions of \cref{theo:main_theorem} complement each other well by covering different underlying distributions for the same general setup.
Thus, in the next section, we can develop one joint learning algorithm for identifying the causal variables based on both conditions in \cref{theo:main_theorem}.

\section{\OurApproach{}}
\label{sec:method}

Using the results of \cref{sec:theory}, we propose \OurApproach{} (\OurApproachFull{}), a neural-network based approach to identify causal variables and their interaction variables.
In short, \OurApproach{} is a variational autoencoder (VAE) \cite{kingma2014auto}, which aims at modeling each of the causal variables $C_1,...,C_K$ in a separate latent dimension by enforcing the latent structure of \cref{fig:main_causal_graph_full}.
We first give an overview of \OurApproach{} and then detail the design choices for the model prior.

\begin{figure}[t!]
    \centering
    \resizebox{\columnwidth}{!}{
        \begin{tikzpicture}
            \node (zt) at (0,0) {$z^{t-1}$};
            \node (Rt1) at (0,-1.4) {$R^{t}$};
            \node (MLPI) at (2.75,-1.25) [draw,minimum width=1.5cm,minimum height=1cm,fill=green!10,rounded corners=5pt] {\MLPI{}};
            \node (Ihat) at (4.8,-1.25) {$\hat{I}^{t}_i$};
            \node (MLPz) at (5.7,0.0) [draw,minimum width=1.5cm,minimum height=1cm,fill=orange!10,rounded corners=5pt] {\MLPzi{}};
            \node (zt1) at (7.9,0) [align=center] {$\mu^{t}_i,\sigma^{t}_i$};
        
            \coordinate (MLPIinter) at ($(MLPI.west)!0.5!(Rt1.east)$);
            \coordinate (MLPIztconnect) at ($(MLPIinter)!0.2!(zt.east)$);
            \draw[->] (zt) -| (MLPIztconnect) -- (MLPI.west |- MLPIztconnect);
            \draw[->] (Rt1) -- (MLPI.west |- Rt1);
            \draw[->] (MLPI) -- (Ihat);
            \draw[->] (zt) -- (MLPz);
            \draw[->] (Ihat) -| (MLPz.south |- Ihat.east) -- (MLPz);
            \draw[->] (MLPz) -- (zt1);
        \end{tikzpicture}
    }
    \caption{
    The prior structure of \OurApproach{}. Based on the previous latents $z^{t-1}$ and the observed \robotstate{} $R^{t}$, the \MLPI{} predicts the interaction variable $\hat{I}_i^{t}$. Then, \MLPzi{} outputs the distribution for the next time step $p(z^{t}_i|z^{t-1},\hat{I}_i^{t})$, which can be parameterized by, \eg{} a mean $\mu^{t}_i$ and std $\sigma^{t}_i$. 
    }
    \label{fig:model_prior}
\end{figure}

\subsection{Overview}
\label{sec:method_overview}

\OurApproach{} consists of three main elements: the encoder $q_{\phi}$, the decoder $p_{\theta}$, and the prior $p_{\omega}$.
The decoder and encoder implement the observation function $g$ and its inverse $g^{-1}$ (Definition~\ref{def:identifiability}), respectively, and act as a map between observations $x^t$ and a lower-dimensional latent space $z^t \in\mathbb{R}^{M}$, in which we learn the causal variables $C_1^t,...,C_K^t$.
The goal of the model is to learn each causal variable $C_i^t$ in a different latent dimension, \eg{} $z^t_j$, effectively separating and hence identifying the causal variables according to \cref{def:identifiability}.
Thus, we need the latent space to have at least $K$ dimensions.
In practice, since the number of causal variables is not known a priori, we commonly overestimate the latent dimensionality, \ie{} $M\gg K$.
Still, we expect the model to only use $K$ dimensions actively, with the redundant dimensions not containing any information after training.

On this latent space, the prior $p_{\omega}$ learns a distribution that follows the structure in Definition~\ref{def:identifiability}, modeling the dynamics in the latent space.
As an objective, we maximize the data likelihood of observation triplets $\{X^t,X^{t-1},R^t\}$ from the true causal model, as stated in \cref{theo:main_theorem}.
The loss function for \OurApproach{} is:
\begin{equation}
    \label{eq:method_vae_objective}
    \begin{split}
        &\mathcal{L}^{t} = -\E_{q_{\phi}\left(z^{t}|x^{t}\right)}\left[\log p_{\theta}(x^{t}|z^{t})\right] + \\ 
        &\E_{q_{\phi}\left(z^{t-1}|x^{t-1}\right)}\left[\text{KL}\left(q_{\phi}(z^{t}|x^{t})||p_{\omega}(z^{t}|z^{t-1},R^{t})\right)\right]
    \end{split}
\end{equation}
with learnable parameter sets $\phi$ (encoder), $\theta$ (decoder), and $\omega$ (prior), and KL being the Kullback-Leibler divergence.

For visually complex datasets, the VAE commonly has to perform a trade-off between reconstruction quality and prior modeling, which may cause poorer identification of the causal variables.
To circumvent this, we follow \citet{lippe2022citris} and separate reconstruction from prior modeling by training an autoencoder and a normalizing flow \cite{rezende2015variational} in separate stages.
In this setup, an autoencoder is first trained to map the observations $x^t$ into a lower-dimensional space.
Afterward, we learn a normalizing flow on the autoencoder's representations to transform them into the desired causal representation, using the same prior structure as in the VAE.
In experiments, we refer to this approach as \OurApproach{}-NF, and the previously described VAE-based approach as \OurApproach{}-VAE.

\subsection{Model Prior}
\label{sec:method_prior}
Our prior follows the distribution structure of Definition~\ref{def:identifiability}, which has two elements per latent variable: a function to model the binary interaction variable, and a conditional distribution.
We integrate this into \OurApproach{}'s prior by learning both via multi-layer perceptrons (MLPs):
\begin{equation}
    p_{\omega}(z^{t}|z^{t-1}\!\!,R^{t})\!=\!\prod_{i=1}^{M} p_{\omega,i}\big(z_i^{t}|z^{t-1}\!\!,\MLPImath{}(R^{t}, z^{t-1})\big).\!
\end{equation}
Here, \MLPI{} is an MLP that maps the \robotstate{} $R^t$ and the latents of the previous time step $z^{t-1}$ to a binary output $\hat{I}^{t}_i$, as shown in \cref{fig:model_prior}.
This MLP aims to learn the interaction variable for the latent variable $z_i$, simply by optimizing \cref{eq:method_vae_objective}.
The variable $\hat{I}^t_i$ is then used as input for predicting the distribution over $z^{t}_i$.
For simplicity, we model $p_{\omega,i}$ as a Gaussian distribution, which is parameterized by one MLP per variable, \MLPzi{}, predicting the mean and standard deviation.
To allow for more complex distributions, $p_{\omega,i}$ can alternatively be modeled by a conditional normalizing flow \cite{winkler2019learning}.

In early experiments, we found that enforcing $\hat{I}^{t}_i$ to be a binary variable and backpropagating through it with the straight-through estimator \cite{bengio2013estimating} leads to suboptimal performances.
Instead, we model $\hat{I}^{t}_i$ as a continuous variable during training by using a temperature-scaled $\tanh{}$ as the output activation function of \MLPI{}.
By gradually decreasing the temperature, we bring the activation function closer to a discrete step function towards the end of training.

\section{Related Work}
\label{sec:related_work}

\smallparagraph{Causal Discovery from Unknown Targets}
Learning (equivalence classes of) causal graphs from observational and interventional data, even with unknown intervention targets, is a common setting in causal discovery \cite{jaber2020causal, brouillard2020differentiable, squires2020permutation, mooij2020joint}. 
In recent work, this is even extended to the case in which we have unknown mixtures of interventional data \cite{kumar2021disentangling,faria2022differentiable,mian2023information}, which for example can happen if the \robotstate{} is not observed.
In our paper, we assume that we observe the \robotstate{} and then reconstruct the latent interaction variables, which resemble the observed context variables by \citet{mooij2020joint}. 
Moreover, our work is on a different task, causal representation learning, in which we try to learn the causal variables from high-dimensional data.

\smallparagraph{Causal Representation Learning}
A common basis for causal representation learning is Independent Component Analysis (ICA) \cite{comon1994independent, hyvaerinen2001independent}, which aims to identify independent latent variables from observations.
Due to the non-identifiability for the general case of non-linear observation functions \cite{hyvarinen1999nonlinear}, additional auxiliary variables are often considered in this setting \cite{hyvaerinen2019nonlinear, hyvarinen2016unsupervised}.
Ideas from ICA have been integrated into neural networks \cite{khemakhem2020variational, khemakhem2020ice, reizinger2022embrace} and applied to causality \cite{shimizu2006linear,gresele2021independent, monti2019causal} for identifying causal variables. 

Recently, several works in causal representation learning have exploited distribution shifts or interventions to identify causal variables.
Using counterfactual observations, \citet{brehmer2022weakly, ahuja2022weakly, locatello2020weakly} learn causal variables from pairs of images, between which only a subset of variables has changed via interventions with unknown targets.
For temporal processes, \citet{lachapelle2021disentanglement,lachapelle2022partial} can model interventions with unknown targets via \emph{actions}, which are equivalent to the \robotstate{} in our setting, but require that each causal variable has a strictly unique parent set.
On the other hand, \citet{yao2021learning, yao2022temporally} consider observations from $m$ different regimes $u_1,...,u_m$, where, in our setting, the regime indicator $u$ is a discrete version of $R^t$. 
However, they require at least $2K+1$ different regimes compared to $\lfloor \log_2 K\rfloor + 2$ settings for ours, and have stronger conditions on the distribution changes over regimes (\eg{} no additive Gaussian noise models).
In temporal settings where the intervention targets are known, CITRIS \cite{lippe2022citris,lippe2022icitris} identifies scalar and multidimensional causal variables from high-dimensional images.
Nonetheless, observing the intervention targets requires additional supervision, which may not always be available.
To the best of our knowledge, we are the first to use unknown binary interactions to identify the causal variables from high-dimensional observations. 

\section{Experiments}
\label{sec:experiments}

To illustrate the effectiveness of \OurApproach{}, we evaluate it on a synthetic toy benchmark and two environments generated by 3D robotic simulators.
We publish our code at \url{https://github.com/phlippe/BISCUIT}, and detail the data generation and hyperparameters in Appendix~C.

\subsection{Synthetic Toy Benchmark}

To evaluate \OurApproach{} on various graph structures, we extend the Voronoi benchmark \cite{lippe2022citris} by replacing observed intervention targets with unobserved binary interactions.
In this dataset, each causal variable follows an additive Gaussian noise model, where the mean is modeled by a randomly initialized MLP.
To determine the parent set, we randomly sample the causal graph with an edge likelihood of $0.4$.
Rather than being observed directly, the causal variables are first entangled by applying a two-layer randomly initialized normalizing flow, and the outputs are then visualized as colors in a Voronoi diagram of size $32\times 32$ (see \cref{fig:experiments_environments_voronoi}).
We extend the original benchmark by including a robotic arm that moves over the Voronoi diagram and interacts by touching individual color regions/tiles.
Each tile corresponds to one causal variable, allowing for both single- and multi-target interactions.
The models need to deduce these interactions from a \robotstate{} $R^t\in[0,1]^2$ which is the 2D location of the robotic arm on the image.
When the robotic arm interacts with a variable, its mean is set to zero, which resembles a stochastic perfect intervention.

\smallparagraph{Evaluation}
We generate five Voronoi systems with six causal variables, and five systems with nine variables.
We compare \OurApproach{} to iVAE \cite{khemakhem2020variational}, LEAP \cite{yao2021learning}, and Disentanglement via Mechanism Sparsity (DMS) \cite{lachapelle2021disentanglement}, since all use a \robotstate{}. We do not compare with CITRIS \citep{lippe2022citris,lippe2022icitris}, because it requires known intervention targets.
We follow \citet{lippe2022icitris} in evaluating the models on a held-out test set where all causal variables are independently sampled.
We calculate the coefficient of determination \cite{wright1921correlation}, also called the $R^2$ score, between each causal variable $C_i$ and each learned latent variable $z_j$, denoted by $R^2_{ij}$.
If a model identifies the causal variables according to Definition~\ref{def:identifiability}, then for each causal variable $C_i$, there exists one latent variable $z_j$ for which $R^2_{ij}=1$, while it is zero for all others.
Since the alignment of the learned latent variables to causal variables is not known, we report $R^2$ scores for the permutation $\pi$ that maximizes the diagonal of the $R^2$ matrix, \ie{} $R^2\text{-diag}=\nicefrac{1}{K}\sum_{i=1}^{K} R^{2}_{i,\pi(i)}$ (where 1 is optimal).
To account for spurious modeled correlation, we also report the maximum correlation besides this alignment: $R^2\text{-sep}=\nicefrac{1}{K}\sum_{i=1}^{K}\max_{j\neq\pi(i)} R^{2}_{ij}$ (optimal 0).

\begin{figure}[t!]
    \centering
    \begin{subfigure}[b]{0.32\columnwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/experiments/voronoi/example_figure.png}
        \caption{Voronoi}
        \label{fig:experiments_environments_voronoi}
    \end{subfigure}
    \begin{subfigure}[b]{0.32\columnwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/experiments/causalworld/example_figure.png}
        \caption{CausalWorld}
        \label{fig:experiments_environments_causalworld}
    \end{subfigure}
    \begin{subfigure}[b]{0.32\columnwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/experiments/ithor/example_figure.png}
        \caption{iTHOR}
        \label{fig:experiments_environments_ithor}
    \end{subfigure}
    \caption{Example figures of our three environments with increasing complexity: Voronoi \cite{lippe2022icitris}, CausalWorld \cite{ahmed2020causalworld}, and iTHOR \cite{kolve2017ai}.}
    \label{fig:experiments_environments}
\end{figure}

\begin{figure}[t!]
    \centering
    \begin{tabular}{cc}
        \multicolumn{2}{c}{\includegraphics[width=0.8\columnwidth]{figures/experiments/voronoi/robotic_voronoi_labels.pdf}} \\[-2mm]
        \includegraphics[width=0.45\columnwidth]{figures/experiments/voronoi/robotic_voronoi_robarm.pdf} & 
        \includegraphics[width=0.45\columnwidth]{figures/experiments/voronoi/robotic_voronoi_minimal.pdf} \\
        (a) Random Interactions & (b) Minimal Interactions \\ 
    \end{tabular}
    \caption{Results on the Voronoi benchmark averaged over 10 seeds. Solid bars show the mean $R^2$-diag score (higher is better), and striped bars the $R^2$-sep scores (lower is better, non-visible bars indicate close-to-zero values). \OurApproach{} accurately identifies the causal variables across settings.}
    \label{fig:experiments_voronoi_bar_plot}
\end{figure}

\smallparagraph{Results}
The results in \cref{fig:experiments_voronoi_bar_plot}a show that \OurApproach{} identifies the causal variables with high accuracy for both graphs with six and nine variables.
In comparison, all baselines struggle to identify the causal variables, often falling back to modeling the colors as latent variables instead. 
While the assumptions of iVAE and LEAP do not hold for additive Gaussian noise models, the assumptions of DMS, including the graph sparsity, mostly hold.
Still, \OurApproach{} is the only method to consistently identify the true variables, illustrating its stable optimization and robustness.

\smallparagraph{Minimal Number of Regimes}
To verify that \OurApproach{} only requires $\lfloor \log_2 K \rfloor + 2$ different regimes (\cref{theo:main_theorem}), we repeat the previous experiments while reducing the interaction maps to a minimum.
This results in four sets of interactions for six variables, and five for nine variables.
\cref{fig:experiments_voronoi_bar_plot}b shows that \OurApproach{} still correctly identifies causal variables in this setting, supporting our theoretical results.

\smallparagraph{Learned Intervention Targets} 
After training, we can use the interaction variables $\hat{I}_1,\dots,\hat{I}_M$ learned by \OurApproach{} to identify the regions in which the robotic arm interacts with a causal variable.
Based on our theoretical results, we expect that some of the learned variables are identical to the true interaction variables $I_1,\dots,I_K$ up to permutations and sign-flips.
In all settings, we find that the learned binary variables match the true interaction variables with an average F1 score of $98\%$ for the same permutation of variables as in the $R^2$ evaluation. This shows that \OurApproach{} identified the true interaction variables.
Thus, in practice, one could use a few samples with labeled interaction variables to identify the learned permutation of the model.


\subsection{CausalWorld}
\label{sec:experiments_causalworld}

CausalWorld \cite{ahmed2020causalworld} is a robotic manipulation environment with a tri-finger robot, which can interact with objects in an enclosed space by touch (see \cref{fig:experiments_environments_causalworld}).
The environment also allows for interventions on various environment parameters, including the colors or friction parameters of individual elements.
We experiment on this environment by recording the robot's interactions with a cube.
Besides the cube position, rotation and velocity, the causal variables are the colors of the three fingertips, as well as the floor, stage and cube friction, which we visualize by the colors of the respective objects.
All colors and friction parameters follow an additive Gaussian noise model. 
When a robot finger touches the cube, we perform a stochastic perfect intervention on its color.
Similarly, an interaction with the friction parameters corresponds to touching these objects with all three fingers.
The \robotstate{} $R^t$ is modeled by the angles of the three motors per robot finger from the current and previous time step, providing velocity information. 

This environment provides two new challenges.
Firstly, not all interactions are necessarily binary.
In particular, the collisions between the robot and the cube have different effects depending on the velocity and direction of the fingers of the robot, which are not part of the state of the causal variables at the previous time step.
Additionally, the robotic system is present in the observation/image, while our theoretical results assume that $R^t$ is not a direct cause of $X^t$.
We adapt \OurApproach{}-NF and the baselines to this case by adding $R^t$ as additional information to the decoder, effectively removing the need to model $R^t$ in the latent space.
% For a fair comparison, we apply the same approach to the baselines.

\begin{table}[t!]
    \centering
    \caption{$R^2$ scores (diag $\uparrow$ / sep $\downarrow$) for the identification of the causal variables on CausalWorld and iTHOR.}
    \resizebox{\columnwidth}{!}{
    \begin{tabular}{lcc}
        \toprule
        \textbf{Models} & \textbf{CausalWorld} & \textbf{iTHOR} \\
        \midrule
        iVAE \cite{khemakhem2020variational} & 0.28 / 0.00 & 0.48 / 0.35 \\
        LEAP \cite{yao2021learning} & 0.30 / 0.00 & 0.63 / 0.45 \\
        DMS \cite{lachapelle2021disentanglement} & 0.32 / 0.00 & 0.61 / 0.40\\
        \OurApproach{}-NF (Ours) & \textbf{0.97} / 0.01 & \textbf{0.96} / \textbf{0.15}\\
        \bottomrule
    \end{tabular}
    }
    \label{tab:experiments_causalworld_ithor}
\end{table}

On this task, \OurApproach{} identifies the causal variables well, as seen in \cref{tab:experiments_causalworld_ithor}.
Because the cube position, velocity and rotation share the same interactions, in the evaluation we consider them as a multidimensional variable.
Although the true model cannot be fully described by binary interaction variables, \OurApproach{} still models the binary information of whether a collision happens or not for the cube, since it is the most important part of the dynamics.
We verify this in Appendix~C.2.3 by measuring the F1 score between the predicted interaction variables and ground truth interactions/collisions.
\OurApproach{} achieves an F1 score of 50\% for all cube-arm interactions, which indicates a high similarity between the learned
interaction and the ground truth collisions considering that collisions only happen in approximately 5\%
of the frames.
The mismatches are mostly due to the learned interactions being more conservative, \ie{} sometimes already taking the value 1 one frame too early.
Meanwhile, none of the baselines are able to reconstruct the image sufficiently, missing the robotic arms and the cube as shown in Appendix~C.2.3.
While this might improve with significant tuning effort, \OurApproach{}-NF is not sensitive to the difficulty of the reconstruction due to its separate autoencoder training stage.

\subsection{iTHOR - Embodied AI}
\label{sec:experiments_ithor}

To illustrate the potential of causal representation learning in embodied AI, we apply \OurApproach{} to the iTHOR environment \cite{kolve2017ai}.
In this environment, an embodied AI agent can perform actions on various objects in a 3D indoor scene such as a kitchen.
These agent-object interactions can often be described by a binary variable, \eg{} pickup/put down an object, open/close a door, turn on/off an object, etc., which makes it an ideal setup for \OurApproach{}.

Our goal in this environment is to identify the causal variables, \ie{} the objects and their states, from sequences of interactions.
We perform this task on the kitchen environment shown in \cref{fig:experiments_environments_ithor}.
This environment contains two movable objects, \ie{} a plate and an egg, and seven static objects, \eg{} a microwave and a stove.
Overall, we have 18 causal variables, which include both continuous, \eg{} the location of the plate, and binary variables, \eg{} whether the microwave is on or off.
Causal variables influence each other by state changes, \eg{} the egg gets cooked when it is in the pan and the stove is turned on.
Further, the set of possible actions that can be performed depends on the previous time step, \eg{} only one object can be picked up at a time.
For training, we generate a dataset where we randomly pick a valid action at each time step.
We model the \robotstate{} $R^t$ as a two-dimensional pixel coordinate, which is the position of a pixel showing the interacted object in the image ($R^t\in[0,1]^2$).
This simulates iTHOR's web demo \cite{kolve2017ai}, where a user interacts with objects by clicking on them.

We train \OurApproach{}-NF and our baselines on this dataset, and compare the latent representation to the ground truth causal variables in terms of the $R^2$ score in \cref{tab:experiments_causalworld_ithor}.
Although the baselines reconstruct the image mostly well, the causal variables are highly entangled in their representations.
In contrast, \OurApproach{} identifies and separates most of the causal variables optimally, except for the two movable objects (egg/plate).
This is likely due to the high inherent correlation of the two objects, since their positions cannot overlap and only one of them can be picked up at a time.

\begin{figure}[t!]
    \centering
    \footnotesize
    \setlength{\tabcolsep}{1pt}
    \begin{tabular}{ccc}
        \includegraphics[width=0.3\linewidth]{figures/experiments/ithor/interaction_maps/ithor_hard_vq_matches_5_orig_img.png} & 
        \includegraphics[width=0.3\linewidth]{figures/experiments/ithor/interaction_maps/ithor_hard_vq_matches_5.png} &
        \includegraphics[width=0.3\linewidth]{figures/experiments/ithor/interaction_maps/ithor_hard_vq_matches_5_augmented.png} \\
        Input Image & Learned Interactions & Combined Image \\
    \end{tabular}
    \caption{Visualizing the learned interaction variables of \OurApproach{} for an example input image (left). We show the locations, \ie{} values of $R^t\in[0,1]^2$, for which each interaction variable is greater than zero/active as different colors. For readability, only nine interaction variables are shown. The right image is an overlay of both. \OurApproach{} accurately learns the interactions and adapts them to the input image.
    }
    \label{fig:experiments_ithor_interaction_maps}
\end{figure}

Besides evaluating the causal representation, we also visualize the learned interaction variables of \OurApproach{} in \cref{fig:experiments_ithor_interaction_maps}. 
Here, each color represents the region in which \OurApproach{} identified an interaction with a different causal variable.
\cref{fig:experiments_ithor_interaction_maps} shows that \OurApproach{} has identified the correct interaction region for each object.
Moreover, it allows for context-dependent interactions, as the location of the plate influences the region of its corresponding interaction variable.

\begin{figure}[t!]
    \centering
    \footnotesize
    \setlength{\tabcolsep}{1pt}
    \begin{tabular}{ccc}
        \includegraphics[width=0.3\linewidth]{figures/experiments/ithor/triplets/ithor_triplet_090.png} & 
        \includegraphics[width=0.3\linewidth]{figures/experiments/ithor/triplets/ithor_triplet_099.png} &
        \includegraphics[width=0.3\linewidth]{figures/experiments/ithor/triplets/rec_frame_11+23_090_099.png} \\
        Input Image 1 & Input Image 2 & Generated Output \\
    \end{tabular}
    \caption{
    Performing interventions in the latent space. First, the two input images are encoded into latent space. Then, we replace the latents of the front-left stove and microwave in the first image by the corresponding latents of the second image. Decoding these new latents creates an unseen scenario where the egg is uncooked, but the stove is turned on. This shows the modularity of \OurApproach{}'s representations.
    }
    \label{fig:experiments_ithor_triplet}
\end{figure}

Finally, we can use the learned causal representation to perform interventions and create novel combinations of causal variables.
For this, we encode two images into the learned latent space of \OurApproach{}, and combine the latent representations of the causal variables to have a novel image decoded.
For example, in \cref{fig:experiments_ithor_triplet}, we replace the latents representing the front-left stove and the microwave state in the first image by the corresponding latents of the second image.
\OurApproach{} not only integrates these changes without influencing any of the other causal variables, but generates a completely novel state: even though in the iTHOR environment, the egg is instantaneously cooked when the stove turns on, \OurApproach{} correctly combines the state of the egg being raw with the stove burning.
This shows the capabilities of \OurApproach{} to model unseen causal interventions. 

\section{Conclusion}
\label{sec:conclusion}

We prove that under mild assumptions, causal variables become identifiable from high-dimensional observations, when their interactions with an external system can be described by unknown binary variables.
As a practical algorithm, we propose \OurApproach{}, which learns the causal variables and their interaction variables.
In experiments across three robotic-inspired datasets, \OurApproach{} outperforms previous methods in identifying the causal variables from images.

While in experiments, \OurApproach{} shows strong identification even for complex interactions, the presented theory is currently limited to binary interaction variables.
Although the first step may be to generalize the theory to interaction variables with more than two states, extensions to unknown domains or sparse, continuous interaction variables are other interesting future directions.
Instead of assuming distinct interaction patterns, future work can extend these results to partial identifiability, similar to \citet{lippe2022citris, lachapelle2022partial}.
Finally, our results open up the opportunity for empirical studies showing the benefits of causal representations for complex real-world tasks like embodied AI. 

\begin{contributions}
    P.~Lippe conceived the idea, developed the theoretical results, implemented the models and experiments, and wrote the paper.
    S.~Magliacane, S.~L\"owe, Y.~M.~Asano, T.~Cohen, and E.~Gavves advised during the project and helped in writing the paper.
\end{contributions}

\begin{acknowledgements}
    We thank SURF for the support in using the National Supercomputer Snellius.
    This work is financially supported by Qualcomm Technologies Inc., the University of Amsterdam and the allowance Top consortia for Knowledge and Innovation (TKIs) from the Netherlands Ministry of Economic Affairs and Climate Policy.
\end{acknowledgements}

\bibliography{references}
\bibliographystyle{uai2023}

\end{document}