%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{subcaption}
\usepackage{multirow}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% Theorem-like environments; each has its own counter, reset at every section.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{corollary}{Corollary}[section]
\usepackage{tikz}
% \circled{<text>}: <text> inside a drawn circle (1pt inner padding), with the
% picture's baseline placed on the baseline of the enclosed text.
\newcommand*{\circled}[1]{%
  \tikz[baseline=(circ.base)]{
    \node[shape=circle,draw,inner sep=1pt] (circ) {#1};}%
}
%%%%% NEW MATH DEFINITIONS %%%%%

\usepackage{amsmath,amsfonts,bm}

% Mark sections of captions for referring to divisions of figures
% Each macro below expands to a fixed, emphasised tag such as "(Left)" or "(a)".
\newcommand{\figleft}{{\em (Left)}}
\newcommand{\figcenter}{{\em (Center)}}
\newcommand{\figright}{{\em (Right)}}
\newcommand{\figtop}{{\em (Top)}}
\newcommand{\figbottom}{{\em (Bottom)}}
\newcommand{\captiona}{{\em (a)}}
\newcommand{\captionb}{{\em (b)}}
\newcommand{\captionc}{{\em (c)}}
\newcommand{\captiond}{{\em (d)}}

% Highlight a newly defined term: \newterm{<term>} sets <term> in bold.
\newcommand{\newterm}[1]{{\bf #1}}


% Figure reference, lower-case.
% Cross-reference helpers: each macro takes label name(s) and prints the
% matching word followed by the \ref number(s). A tie (~) keeps a word and
% the number after it on the same line.
% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
\def\twofigref#1#2{figures~\ref{#1} and~\ref{#2}}
\def\quadfigref#1#2#3#4{figures~\ref{#1}, \ref{#2}, \ref{#3} and~\ref{#4}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Reference to two sections.
\def\twosecrefs#1#2{sections~\ref{#1} and~\ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections~\ref{#1}, \ref{#2} and~\ref{#3}}
% Reference to an equation, lower-case.
% NOTE: this \def replaces the \eqref provided by amsmath (loaded above via
% mathtools), so \eqref prints "equation N" instead of "(N)".
\def\eqref#1{equation~\ref{#1}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
\def\twoalgref#1#2{algorithms~\ref{#1} and~\ref{#2}}
\def\Twoalgref#1#2{Algorithms~\ref{#1} and~\ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
\def\twopartref#1#2{parts~\ref{#1} and~\ref{#2}}

% \ceil{x} and \floor{x} wrap x in ceiling / floor brackets.
\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
% \1 is a bold digit one.
\def\1{\bm{1}}
% Calligraphic D, plain and with an upright "valid" / "test" subscript.
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{\mathcal{D_{\mathrm{valid}}}}
\newcommand{\test}{\mathcal{D_{\mathrm{test}}}}

% Braced epsilon shorthand.
\def\eps{{\epsilon}}


% Random variables
% \reta is eta set via \textnormal; \ra ... \rz (generated below) are the
% corresponding letters in \textnormal, each defined as {\textnormal{<letter>}}.
% rm is already a command, so m is left out of the list -- just don't name
% any random variables m.
\def\reta{{\textnormal{$\eta$}}}
\makeatletter
\@tfor\@tempa:=abcdefghijklnopqrstuvwxyz\do{%
  \expandafter\edef\csname r\@tempa\endcsname{{\noexpand\textnormal{\@tempa}}}%
}
\makeatother

% Random vectors
% \rvepsilon and \rvtheta wrap the Greek letter in \mathbf; \rva ... \rvz are
% generated from the alphabet as {\mathbf{<letter>}}.
% Generating the table guarantees one definition per letter: the hand-written
% list defined the "i" entry under the name \rvu, which the real \rvu then
% overwrote, so \rvi was never defined.
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\makeatletter
\@tfor\@tempa:=abcdefghijklmnopqrstuvwxyz\do{%
  \expandafter\edef\csname rv\@tempa\endcsname{{\noexpand\mathbf{\@tempa}}}%
}
\makeatother

% Elements of random vectors
% \erva ... \ervz: each is defined as {\textnormal{<letter>}}.
\makeatletter
\@tfor\@tempa:=abcdefghijklmnopqrstuvwxyz\do{%
  \expandafter\edef\csname erv\@tempa\endcsname{{\noexpand\textnormal{\@tempa}}}%
}
\makeatother

% Random matrices
% \rmA ... \rmZ: each is defined as {\mathbf{<LETTER>}}.
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRSTUVWXYZ\do{%
  \expandafter\edef\csname rm\@tempa\endcsname{{\noexpand\mathbf{\@tempa}}}%
}
\makeatother

% Elements of random matrices
% \ermA ... \ermZ: each is defined as {\textnormal{<LETTER>}}.
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRSTUVWXYZ\do{%
  \expandafter\edef\csname erm\@tempa\endcsname{{\noexpand\textnormal{\@tempa}}}%
}
\makeatother

% Vectors
% Special cases first (\bm applied to 0, 1, mu and theta), then \va ... \vz,
% each defined as {\bm{<letter>}}.
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\makeatletter
\@tfor\@tempa:=abcdefghijklmnopqrstuvwxyz\do{%
  \expandafter\edef\csname v\@tempa\endcsname{{\noexpand\bm{\@tempa}}}%
}
\makeatother

% Elements of vectors
% Each macro is just the braced symbol itself.
% Greek entries are listed explicitly; \eva ... \evz are generated as {<letter>}.
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\makeatletter
\@tfor\@tempa:=abcdefghijklmnopqrstuvwxyz\do{%
  \expandafter\edef\csname ev\@tempa\endcsname{{\@tempa}}%
}
\makeatother

% Matrix
% \mA ... \mZ are generated as {\bm{<LETTER>}}; the \bm Greek matrices follow.
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRSTUVWXYZ\do{%
  \expandafter\edef\csname m\@tempa\endcsname{{\noexpand\bm{\@tempa}}}%
}
\makeatother
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

% Angle-bracket shorthands.
% NOTE: \ra is also defined earlier in this file as the random variable
% {\textnormal{a}}; this \def replaces it, so from here on \ra means \rangle.
\def\la{\langle}
\def\ra{\rangle}

% Tensor
% \mathsfit is a sans-serif math alphabet: medium slanted in the normal math
% version, bold-extended upright in the bold version. \tens{X} applies \bm on
% top of it.
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
% \tA ... \tZ: each is defined as {\tens{<LETTER>}}.
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRSTUVWXYZ\do{%
  \expandafter\edef\csname t\@tempa\endcsname{{\noexpand\tens{\@tempa}}}%
}
\makeatother


% Graph
% \gA ... \gZ: each is defined as {\mathcal{<LETTER>}}.
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRSTUVWXYZ\do{%
  \expandafter\edef\csname g\@tempa\endcsname{{\noexpand\mathcal{\@tempa}}}%
}
\makeatother

% Sets
% \sA ... \sZ: each is defined as {\mathbb{<LETTER>}}.
% Don't use a set called E, because this would be the same as our symbol
% for expectation -- so E is left out of the list and \sE is not defined.
\makeatletter
\@tfor\@tempa:=ABCDFGHIJKLMNOPQRSTUVWXYZ\do{%
  \expandafter\edef\csname s\@tempa\endcsname{{\noexpand\mathbb{\@tempa}}}%
}
\makeatother

% Entries of a matrix
% Each macro is just the braced symbol: \emA ... \emZ are generated as
% {<LETTER>}, with Lambda and Sigma listed explicitly.
\def\emLambda{{\Lambda}}
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRSTUVWXYZ\do{%
  \expandafter\edef\csname em\@tempa\endcsname{{\@tempa}}%
}
\makeatother
\def\emSigma{{\Sigma}}

% entries of a tensor
% Same font as tensor, without \bm wrapper
% \etens{X} applies \mathsfit only (no \bm wrapper).
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
% \etA ... \etZ: each is defined as {\etens{<LETTER>}}.
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRSTUVWXYZ\do{%
  \expandafter\edef\csname et\@tempa\endcsname{{\noexpand\etens{\@tempa}}}%
}
\makeatother

% The true underlying data generating distribution
% Distribution symbols: a p / P base with an upright word as subscript.
% The subscripts use \mathrm{...}; the old two-letter font switch \rm is
% obsolete and is not provided by every document class.
% The true underlying data generating distribution
\newcommand{\pdata}{p_{\mathrm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution

% Argument-free shorthands: each name expands to the fixed symbol shown.
% NOTE: \R is defined again later in this file with \def (same expansion).
\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.

% Starred form: sub/superscripts are set as limits in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
% \ab is an alias for \allowbreak (it is NOT a bold "a").
\let\ab\allowbreak

% Graphics are additionally searched for in figures/.
\graphicspath{{figures/}}
% \eat{<text>} discards its argument.
\newcommand{\eat}[1]{}

% \Cy relies on \Ccal, which is declared further down in this file; that is
% fine because the body is only expanded when \Cy is used.
\newcommand{\Cy}{\Ccal_y}
% NOTE: the two \renewcommand lines replace the standard text symbols \l and
% \S: \l becomes a bold \ell and \S becomes an inline image of height 1em.
\renewcommand{\l}{\boldsymbol{\ell}}
\renewcommand{\S}
{\!\!\,\includegraphics[height=1em]{figs/arcsinh}\!\!}
% Hat / tilde variants of G, a, y, z; names ending in "b" wrap the accented
% symbol in \mathbf.
\def\Gt{\widetilde{G}}
\def\ah{\widehat{a}}
\def\ahb{\mathbf{\ah}}
\def\at{\widetilde{a}}
\def\atb{\mathbf{\at}}
\def\yh{\widehat{y}}
\def\yt{\widetilde{y}}
\def\ytb{\mathbf{\yt}}
\def\yhb{\mathbf{\yh}}
\def\zh{\widehat{z}}
\def\zt{\widetilde{z}}
\def\ztb{\mathbf{\zt}}
\def\zhb{\mathbf{\zh}}
\def\g{\mathbf{g}}
\def\bah{\mathbf{\widehat{a}}}
\RequirePackage{latexsym}
\RequirePackage{amsmath}
%\RequirePackage{amsthm}
\RequirePackage{amssymb}
\RequirePackage{bm}

% Colour helpers: \<Name>{<text>} typesets <text> in that colour.
% \Magenta keeps the colour change inside a group, so the surrounding colour
% is restored afterwards instead of being forced to black. \color is used
% rather than \textcolor so that <text> may still contain paragraph breaks.
\newcommand{\Magenta}[1]{{\leavevmode\color{magenta}#1}}
\newcommand{\Blue}[1]{\textcolor{blue}{#1}}
%\newcommand{\Green}[1]{\color{OliveGreen}{#1}}
% NOTE(review): the colour "mygreen" is not defined in the visible part of
% this file -- confirm it is defined before \Green is used.
\newcommand{\Green}[1]{\textcolor{mygreen}{#1}}
\newcommand{\Red}[1]{\textcolor{red}{#1}}
\newcommand{\Cyan}[1]{\textcolor{cyan}{#1}}
\newcommand{\Orange}[1]{\textcolor{orange}{#1}}
\newcommand{\Brown}[1]{\textcolor{brown}{#1}}
%\newcommand{\Ref}[1]{\hfill\Green{[#1]}}
\DeclareMathOperator{\arginf}{arg\,inf}
%\newcommand{\alert}[1]{\Red{\bf {#1}}}

\newcommand{\rmd}{\mathrm{d}} % use italics only for variable

%\newcommand{\Tr}{^\mathsf{T}} % sf looks nicer than rm


%%% MW %%%%

% \mnote{<text>}: large bold "*" in the text, <text> in magenta in the margin.
\newcommand{\mnote}[1]{{\bf\large *}\marginpar{\Magenta{#1}}}
% \dnote{<text>}: large bold "+" in the text, <text> in \tiny in the margin.
\newcommand{\dnote}[1]{{\bf\large +}\marginpar{\tiny#1}}
% \note{<text>}: bold magenta inline note. \providecommand makes sure \note
% exists, so the \renewcommand works whether or not it was defined before.
\providecommand{\note}{}
\renewcommand{\note}[1]{{\bf \Magenta{#1}}}

% Single-letter shortcuts for bold symbols.
% NOTE: \a and \t are existing LaTeX commands (tabbing accents / tie accent);
% these \def lines replace them. \R repeats the definition given earlier in
% this file.
\def\x{\mathbf{x}}
\def\a{\mathbf{a}}
\def\y{\mathbf{y}}
\def\z{\mathbf{z}}
\def\w{\mathbf{w}}
\def\t{\boldsymbol{\theta}}
\def\R{\mathbb{R}}

%%%%%%%% Stock standard definitions %%%%%%%%%%%%%%%

%\DeclareMathOperator{\ab}{\mathbf{a}}
% Bold lowercase letters declared as math operators.
% Regular names are \<x>b = \mathbf{<x>}; a, r and v have no entry here, and
% c, n, s use a doubled b (\cbb, \nbb, \sbb).
\makeatletter
\@tfor\@tempa:=bdefghijklmopqtuwxyz\do{%
  \edef\@tempb{\noexpand\DeclareMathOperator
    {\expandafter\noexpand\csname\@tempa b\endcsname}%
    {\noexpand\mathbf{\@tempa}}}%
  \@tempb
}
\makeatother
\DeclareMathOperator{\cbb}{\mathbf{c}}
\DeclareMathOperator{\nbb}{\mathbf{n}}
\DeclareMathOperator{\sbb}{\mathbf{s}}

\DeclareMathOperator{\pr}{\mathrm{p}}

\DeclareMathOperator{\deltab}{\boldsymbol{\delta}}

% Tilde-accented bold lowercase letters: \<x>tilde is \tilde over bold <x>.
% a, r and v spell out \mathbf{...} directly, because in this file \ab is
% \allowbreak and \rb / \vb are other symbols (\textnormal{b} and \bm{b}).
% c, n and s use the operators \cbb, \nbb and \sbb; there is no \cb.
\DeclareMathOperator{\atilde}{\tilde{\mathbf{a}}}
\DeclareMathOperator{\btilde}{\tilde{\bb}}
\DeclareMathOperator{\ctilde}{\tilde{\cbb}}
\DeclareMathOperator{\dtilde}{\tilde{\db}}
\DeclareMathOperator{\etilde}{\tilde{\eb}}
\DeclareMathOperator{\ftilde}{\tilde{\fb}}
\DeclareMathOperator{\gtilde}{\tilde{\gb}}
\DeclareMathOperator{\htilde}{\tilde{\hb}}
\DeclareMathOperator{\itilde}{\tilde{\ib}}
\DeclareMathOperator{\jtilde}{\tilde{\jb}}
\DeclareMathOperator{\ktilde}{\tilde{\kb}}
\DeclareMathOperator{\ltilde}{\tilde{\lb}}
\DeclareMathOperator{\mtilde}{\tilde{\mb}}
\DeclareMathOperator{\ntilde}{\tilde{\nbb}}
\DeclareMathOperator{\otilde}{\tilde{\ob}}
%
\DeclareMathOperator{\qtilde}{\tilde{\qb}}
\DeclareMathOperator{\rtilde}{\tilde{\mathbf{r}}}
\DeclareMathOperator{\stilde}{\tilde{\sbb}}
\DeclareMathOperator{\ttilde}{\tilde{\tb}}
\DeclareMathOperator{\utilde}{\tilde{\ub}}
\DeclareMathOperator{\vtilde}{\tilde{\mathbf{v}}}
\DeclareMathOperator{\wtilde}{\tilde{\wb}}
\DeclareMathOperator{\xtilde}{\tilde{\xb}}
\DeclareMathOperator{\ytilde}{\tilde{\yb}}
\DeclareMathOperator{\ztilde}{\tilde{\zb}}

% Bar-accented bold lowercase letters: \<x>bar is \bar over bold <x>
% (the h entry is named \hbbar).
% a, r and v spell out \mathbf{...} directly, because in this file \ab is
% \allowbreak and \rb / \vb are other symbols (\textnormal{b} and \bm{b}).
% c, n and s use the operators \cbb, \nbb and \sbb; there is no \cb.
\DeclareMathOperator{\abar}{\bar{\mathbf{a}}}
\DeclareMathOperator{\bbar}{\bar{\bb}}
\DeclareMathOperator{\cbar}{\bar{\cbb}}
\DeclareMathOperator{\dbar}{\bar{\db}}
\DeclareMathOperator{\ebar}{\bar{\eb}}
\DeclareMathOperator{\fbar}{\bar{\fb}}
\DeclareMathOperator{\gbar}{\bar{\gb}}
\DeclareMathOperator{\hbbar}{\bar{\hb}}
\DeclareMathOperator{\ibar}{\bar{\ib}}
\DeclareMathOperator{\jbar}{\bar{\jb}}
\DeclareMathOperator{\kbar}{\bar{\kb}}
\DeclareMathOperator{\lbar}{\bar{\lb}}
\DeclareMathOperator{\mbar}{\bar{\mb}}
\DeclareMathOperator{\nbar}{\bar{\nbb}}
\DeclareMathOperator{\obar}{\bar{\ob}}
\DeclareMathOperator{\pbar}{\bar{\pb}}
\DeclareMathOperator{\qbar}{\bar{\qb}}
\DeclareMathOperator{\rbar}{\bar{\mathbf{r}}}
\DeclareMathOperator{\sbar}{\bar{\sbb}}
\DeclareMathOperator{\tbar}{\bar{\tb}}
\DeclareMathOperator{\ubar}{\bar{\ub}}
\DeclareMathOperator{\vbar}{\bar{\mathbf{v}}}
\DeclareMathOperator{\wbar}{\bar{\wb}}
\DeclareMathOperator{\xbar}{\bar{\xb}}
\DeclareMathOperator{\ybar}{\bar{\yb}}
\DeclareMathOperator{\zbar}{\bar{\zb}}

% Bold uppercase letters declared as math operators: \<X>b = \mathbf{<X>}.
% The S entry is named \Sbb and is declared separately.
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRTUVWXYZ\do{%
  \edef\@tempb{\noexpand\DeclareMathOperator
    {\expandafter\noexpand\csname\@tempa b\endcsname}%
    {\noexpand\mathbf{\@tempa}}}%
  \@tempb
}
\makeatother
\DeclareMathOperator{\Sbb}{\mathbf{S}}

% \Abar ... \Ybar: \bar over the plain letter (the list runs A through Y).
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRSTUVWXY\do{%
  \edef\@tempb{\noexpand\DeclareMathOperator
    {\expandafter\noexpand\csname\@tempa bar\endcsname}%
    {\noexpand\bar{\@tempa}}}%
  \@tempb
}
\makeatother

% \Abbar ... \Zbbar: \bar over the bold-letter operator \<X>b.
% S is declared separately because its bold operator is \Sbb; \Sb does not
% exist in this file.
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRTUVWXYZ\do{%
  \edef\@tempb{\noexpand\DeclareMathOperator
    {\expandafter\noexpand\csname\@tempa bbar\endcsname}%
    {\noexpand\bar{\expandafter\noexpand\csname\@tempa b\endcsname}}}%
  \@tempb
}
\makeatother
\DeclareMathOperator{\Sbbar}{\bar{\Sbb}}

% \Ahat ... \Zhat: \widehat over the plain letter.
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRSTUVWXYZ\do{%
  \edef\@tempb{\noexpand\DeclareMathOperator
    {\expandafter\noexpand\csname\@tempa hat\endcsname}%
    {\noexpand\widehat{\@tempa}}}%
  \@tempb
}
\makeatother

% \Abhat ... \Zbhat: \widehat over the bold-letter operator \<X>b.
% S is declared separately because its bold operator is \Sbb; \Sb does not
% exist in this file.
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRTUVWXYZ\do{%
  \edef\@tempb{\noexpand\DeclareMathOperator
    {\expandafter\noexpand\csname\@tempa bhat\endcsname}%
    {\noexpand\widehat{\expandafter\noexpand\csname\@tempa b\endcsname}}}%
  \@tempb
}
\makeatother
\DeclareMathOperator{\Sbhat}{\widehat{\Sbb}}

% \Acal ... \Zcal: \mathcal letters declared as math operators.
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRSTUVWXYZ\do{%
  \edef\@tempb{\noexpand\DeclareMathOperator
    {\expandafter\noexpand\csname\@tempa cal\endcsname}%
    {\noexpand\mathcal{\@tempa}}}%
  \@tempb
}
\makeatother

% \Atilde ... \Ztilde: \widetilde over the plain letter.
\makeatletter
\@tfor\@tempa:=ABCDEFGHIJKLMNOPQRSTUVWXYZ\do{%
  \edef\@tempb{\noexpand\DeclareMathOperator
    {\expandafter\noexpand\csname\@tempa tilde\endcsname}%
    {\noexpand\widetilde{\@tempa}}}%
  \@tempb
}
\makeatother


%%%%%%%% Widely accepted definitions %%%%%%%%%%%%%%%

% Doubled-letter names for blackboard-bold symbols, declared as operators.
\DeclareMathOperator{\CC}{\mathbb{C}} % Complex numbers
\DeclareMathOperator{\EE}{\mathbb{E}} % Expectation
\DeclareMathOperator{\KK}{\mathbb{K}} % Arbitrary field
\DeclareMathOperator{\MM}{\mathbb{M}} % Median
\DeclareMathOperator{\NN}{\mathbb{N}} % Natural numbers
\DeclareMathOperator{\PP}{\mathbb{P}} % Probability
\DeclareMathOperator{\QQ}{\mathbb{Q}} % Rationals
\DeclareMathOperator{\RR}{\mathbb{R}} % Real numbers
\DeclareMathOperator{\ZZ}{\mathbb{Z}} % Integers

% Bold digits 1 and 0.
\DeclareMathOperator{\one}{\mathbf{1}}  % Identity
\DeclareMathOperator{\zero}{\mathbf{0}} % Zero

% Upright named operators. Starred declarations take their sub/superscripts
% as limits in display style; the unstarred ones keep them to the side.
\DeclareMathOperator*{\mini}{\mathop{\mathrm{minimize}}}
\DeclareMathOperator*{\maxi}{\mathop{\mathrm{maximize}}}

\DeclareMathOperator*{\argsup}{\mathop{\mathrm{argsup}}}
\DeclareMathOperator*{\arcsinh}{\mathop{\mathrm{arcsinh}}}
\DeclareMathOperator*{\limit}{\mathop{\mathrm{limit}}}
% \sgn prints "sign", like \sign declared earlier in this file.
\DeclareMathOperator{\sgn}{\mathop{\mathrm{sign}}}
% Lower-case "tr"; \Tr (declared earlier) prints "Tr".
\DeclareMathOperator{\tr}{\mathop{\mathrm{tr}}}
\DeclareMathOperator{\rank}{\mathop{\mathrm{rank}}}
\DeclareMathOperator{\traj}{\mathop{\mathrm{Traj}}}
\DeclareMathOperator{\diag}{diag}

%%%%%%%% Bold Greek Letters %%%%%%%%%%%%%%%
% \<name>b is the bold version of the Greek letter; all use \bm except
% \Sigmab, which uses \mathbf.
\newcommand{\sigmab}{\bm{\sigma}}
\newcommand{\Sigmab}{\mathbf{\Sigma}}
\newcommand{\Thetab}{{\bm{\Theta}}}
\newcommand{\thetab}{{\bm{\theta}}}
\newcommand{\xib}{{\bm{\xi}}}
\newcommand{\Xib}{{\bm{\Xi}}}
\newcommand{\zetab}{{\bm{\zeta}}}
\newcommand{\alphab}{{\bm{\alpha}}}
\newcommand{\taub}{{\bm{\tau}}}
\newcommand{\etab}{{\bm{\eta}}}


% Alpha with a wide hat, declared as an operator.
\DeclareMathOperator{\alphahat}{\widehat{\alpha}}
%%%%%%%% Mess around with LaTeX %%%%%%%%%%%%%%%

%% Some style files might actually define these variables.
%% So don't mess with them if they are already defined

% Each definition below is made only when the name is not defined yet.

% End-of-proof mark: a filled 1.5ex square.
\ifdefined\BlackBox\else
  \newcommand{\BlackBox}{\rule{1.5ex}{1.5ex}}  % end of proof
\fi

% Minimal proof environment: bold "Proof" run-in heading, \BlackBox flushed
% right at the end.
\ifdefined\proof\else
  \newenvironment{proof}{\par\noindent{\bf Proof\ }}{\hfill\BlackBox\\[2mm]}
\fi

% Environments with their own counter.
\ifdefined\theorem\else
  \newtheorem{theorem}{Theorem}[section]
\fi
\ifdefined\example\else
  \newtheorem{example}{Example}
\fi
\ifdefined\lemma\else
  \newtheorem{lemma}{Lemma}[section]
\fi

% Declared unconditionally.
\newtheorem{assumption}{Assumption}[section]

\ifdefined\corollary\else
  \newtheorem{corollary}{Corollary}[section]
\fi

% Environments sharing the theorem counter.
\ifdefined\definition\else
  \newtheorem{definition}[theorem]{Definition}
\fi

\ifdefined\proposition\else
  \newtheorem{proposition}[theorem]{Proposition}
\fi

\ifdefined\remark\else
  \newtheorem{remark}[theorem]{Remark}
\fi

\ifdefined\conjecture\else
  \newtheorem{conjecture}[theorem]{Conjecture}
\fi

\ifdefined\factoid\else
  \newtheorem{factoid}[theorem]{Fact}
\fi

\ifdefined\axiom\else
  \newtheorem{axiom}[theorem]{Axiom}
\fi

%%%%%%%% Utility functions %%%%%%%%%%%%%%%

% \eq{<label>}: the reference number in parentheses.
\newcommand{\eq}[1]{(\ref{#1})}
% \mymatrix{<column spec>}{<rows>}: an array in square brackets.
\newcommand{\mymatrix}[2]{\left[\begin{array}{#1} #2 \end{array}\right]}
% \mychoose{<top>}{<bottom>}: two centred rows in round brackets.
\newcommand{\mychoose}[2]{\left(\begin{array}{c} #1 \\ #2 \end{array}\right)}
\newcommand{\mydet}[1]{\det\left[ #1 \right]}
% \sembrack{x}: double square brackets built from [ and ] with negative space.
\newcommand{\sembrack}[1]{[\![#1]\!]}

%brackets
% Auto-sized delimiters around the argument(s): angle, round, square, curly,
% double bar and single bar.
\newcommand{\inner}[2]{\left\langle #1,#2 \right\rangle}
\newcommand{\rbr}[1]{\left(#1\right)}
\newcommand{\sbr}[1]{\left[#1\right]}
\newcommand{\cbr}[1]{\left\{#1\right\}}
\newcommand{\nbr}[1]{\left\|#1\right\|}
\newcommand{\abr}[1]{\left|#1\right|}

% Emphasised "et al. " (the trailing space is part of the macro).
\newcommand{\ea}{\emph{et al. }}

% Bold bracketed inline comments tagged with the author's name.
\newcommand{\vishy}[1]{{\bf[#1--VISHY]}}
\newcommand{\tim}[1]{{\bf[#1 --TIM]}}

%Logic
%\newcommand{\implies}{\Rightarrow}
%\newcommand{\iff}{\Leftrightarrow}
\newcommand{\goesto}{\rightarrow}
% " such that " as upright text; \ensuremath makes it usable both inside and
% outside math mode. (\ensuremathmode is not a LaTeX command.)
\newcommand{\suchthat}{\ensuremath{\text{ such that }}}
% Short alias for \suchthat.
\newcommand{\st}{\suchthat}

%%%%%%%% Specific symbols for this project %%%%%%%%%%%%%%%

% upright names for matrices --don't use for single-letter function names!
\newcommand{\A}{\mathbf{A}}
\newcommand{\As}{\mathbf{A}^{\!\!*}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\Bs}{\mathbf{B}^{*}}

% p and variants
% (all built on \p: primes, stars, subscripts 0 and i, bars and hats)
\newcommand{\p}{\mathbf{p}}
\newcommand{\pp}{\p^{\prime}}
\newcommand{\ps}{\p^*}
\newcommand{\po}{\p_0}
\newcommand{\pos}{\p_{0}^{*}}
\newcommand{\poi}{[\p_{0}]_i}
\newcommand{\pii}{\p_{i}}
\newcommand{\pip}{\p_{i}^{\prime}}
\newcommand{\pis}{\p_{i}^{*}}
\newcommand{\pBar}{\bar{\p}}
\newcommand{\pohat}{\widehat{\p}_0}
\newcommand{\phat}{\widehat{\p}}
\newcommand{\psBar}{\bar{\p}^{*}}
\newcommand{\pbb}{\p_{\bb}}

% mu and variants
% NOTE(review): \ms, \mo, \mh, \mBar, \msBar and \mBBar expand to \m, which is
% not defined in the visible part of this file -- confirm it is defined
% elsewhere (the bold mu defined here is called \mub).
\newcommand{\mub}{{\bm{\mu}}}
\newcommand{\ms}{\m^{*}}
\newcommand{\mo}{\m_0}
\newcommand{\mh}{\widehat{\m}}
\newcommand{\mBar}{\bar{\m}}
\newcommand{\msBar}{\bar{\m}^{*}}

% Bold mu_B
\newcommand{\mBBar}{\bar{\m}_{\B}}
% mu_1
\newcommand{\mone}{\mu_{\mathbf{1}}}
\newcommand{\mones}{\mu_{\mathbf{1}}^{*}}
\newcommand{\monesBar}{\bar{\mu}^{*}_\mathbf{1}}

% q and variants
\newcommand{\q}{\mathbf{q}}
\newcommand{\qs}{\q^{*}}
\newcommand{\qii}{\q_{i}}

% b and variants
\newcommand{\bone}{\mathbf{b_1}}
%\newcommand{\bone}{\ones}

% c and variants
\newcommand{\cs}{\cbb^{*}}

% Functions
% (starred versions of F, G, H and f_i)
\newcommand{\Fs}{F^{*}}
\newcommand{\Gs}{G^{*}}
\newcommand{\Hs}{H^{*}}
\newcommand{\fsi}{f_{i}^{*}}

% Dual Spaces
\DeclareMathOperator{\Xcals}{\Xcal^{*}}
\DeclareMathOperator{\Mcals}{\Mcal^{*}}

% set and domain concepts
% Named operators (amsmath \DeclareMathOperator, non-starred: sub/superscripts
% are set to the side, not as limits).
\DeclareMathOperator{\intr}{\mathrm{int}}   % Interior
\DeclareMathOperator{\bd}{\mathrm{bdry}}      % Boundary (see RocWet98)
\DeclareMathOperator{\dom}{\mathrm{dom}}    % Effective Domain
\DeclareMathOperator{\epi}{\mathrm{epi}}    % Epigraph
\DeclareMathOperator{\ran}{\mathrm{range}}  % Range of an operator

% \by: multiplication sign, e.g. for writing dimensions "m \by n"
\newcommand{\by}{\times}

% Differential operators. The two-argument forms take the function (#1) and the
% evaluation point (#2) and wrap the point in auto-sized parentheses.

% \grad{F}{\m}: gradient of F at \m
\newcommand{\grad}[2]{%
  \nabla #1\left(#2\right)%
}
% \hess{F}{\m}: Hessian of F at \m
\newcommand{\hess}[2]{%
  \nabla^2 #1\left(#2\right)%
}
% \grd: the bare nabla symbol
\newcommand{\grd}{\nabla}
% \breg{F}: Bregman divergence symbol, a Delta placed in front of #1
\newcommand{\breg}[1]{\Delta #1}

% \subgrad{F}{\m}: subgradient of F at \m
\newcommand{\subgrad}[2]{%
  \partial #1\left(#2\right)%
}
% \subdiff: the bare subdifferential symbol
\newcommand{\subdiff}{\partial}

% Vector of all ones
% NOTE(review): \eb is not defined in this part of the file (the only visible
% definition, \eta_B, is commented out) -- presumably defined earlier; confirm.
\newcommand{\ones}{\eb}

% Sum from i equal 1 to n
\newcommand{\sumin}{\sum_{i=1}^{n}}

% Deformed logarithm wrt \phi
\newcommand{\lphi}{\log_{\phi}}

% function k_{\phi}
\newcommand{\kphi}{k_{\phi}}

% reparam for dual log
% (\phib is currently a poor-man's-bold phi; the bullet-superscript variant is
% kept commented out. \psib still uses the bullet superscript.)
%\newcommand{\phib}{\phi^{\bullet}}
\newcommand{\phib}{\pmb{\phi}}
\newcommand{\psib}{\psi^{\bullet}}

% Deformed exponential wrt \phi
\newcommand{\ephi}{\exp_{\phi}}

% One over deformed exponential wrt \phi
% (typeset as exp with a bullet superscript and subscript phi)
\newcommand{\ephib}{\exp^{\bullet}_{\phi}}

% Gradient (wrt x) of the deformed exponential wrt \phi
\newcommand{\ephip}{\nabla_{x} \exp_{\phi}}

% Gradient (wrt x) of the t-exponential
\newcommand{\ephit}{\nabla_{x} \exp_{t}}

% Deformed logarithm wrt \psi
\newcommand{\lpsi}{\log_{\psi}}

% One over deformed logarithm wrt \psi (and the same notation for \phi)
\newcommand{\lpsib}{\log^{\bullet}_{\psi}}
\newcommand{\lphib}{\log^{\bullet}_{\phi}}

%composition needed for conjugate--formerly combo
% (\lpsi composed with \ephib)
\newcommand{\lcirce}{\lpsi \circ \ephib}

% S and related
% Upper-case S and lower-case s, each indexed by phi, with star/prime variants.
\newcommand{\Sp}{S_{\phi}}
\newcommand{\Sps}{S^{*}_{\phi}}
\newcommand{\smp}{s_{\phi}}
\newcommand{\smpp}{s^{\prime}_{\phi}}
\newcommand{\smps}{s^{*}_{\phi}}

% T
% NOTE(review): \T is overwritten further down in this file by \def\T{\hat{T}},
% so the starred form defined here is not what \T expands to in the body.
\newcommand{\T}{T^{*}}
% Tilde/hat-decorated symbols.
% NOTE(review): these are declared with \DeclareMathOperator although they look
% like variables rather than operator names; as operators they are set in the
% operator font with operator spacing. Confirm this is intended before relying
% on them.
\DeclareMathOperator{\tp}{\tilde{p}}
\DeclareMathOperator{\tq}{\tilde{q}}
\DeclareMathOperator{\tg}{\tilde{g}}
\DeclareMathOperator{\tmu}{\tilde{\mu}}
\DeclareMathOperator{\tthe}{\tilde{\theta}}
\DeclareMathOperator{\ttheb}{{\bf \tilde{\theta}}}
\DeclareMathOperator{\tmc}{\widehat{\Mcal}}
\DeclareMathOperator{\tv}{\tilde{v}}


% R with a text subscript "emp"
\DeclareMathOperator{\Remp}{R_{\text{emp}}}

% operator name "tsallis"
\DeclareMathOperator{\tsallis}{\text{tsallis}}

%\newcommand{\inner}[2]{\left\langle #1,#2 \right\rangle}
%\newcommand{\rbr}[1]{\left(#1\right)}
%\newcommand{\cbr}[1]{\left\{#1\right\}}
%\newcommand{\nbr}[1]{\left\|#1\right\|}
%\newcommand{\abr}[1]{\left|#1\right|}
% \ah: a with a wide hat; \ahb: the same wrapped in \mathbf.
\def\ah{\widehat{a}}
\def\ahb{\mathbf{\ah}}
%\newcommand{\sbr}[1]{\left[#1\right]}
%\DeclareMathOperator*{\arginf}{\mathop{\mathrm{arginf}}}
% \lnm: bold "ln" as a limits-style operator (starred declaration).
\DeclareMathOperator*{\lnm}{\mathop{\mathbf{ln}}}
% Re-declare \Theta and \theta as math characters of class 7 (variable family)
% taken from family 1, slots "02 and "12.
% NOTE(review): with class 7 the glyph follows the current math alphabet, so
% inside \mathbf / \mathrm these slots may select a different character --
% verify the output wherever \theta or \Theta appear inside a math alphabet.
\mathchardef\Theta="7102 \mathchardef\theta="7112
% \I is renewed further down in this file to \vect{I}.
\newcommand{\I}{{\mathcal{I}}}
% Transpose symbol with the surrounding space tightened.
\newcommand{\tops}{{\!\top\!}}
%\newcommand{\R}{\hbox{\bf R}}
%\newcommand{\z}{{\mathbf{z}}}
\newcommand{\vxi}{{\boldsymbol{\xi}}}
\newcommand{\valpha}{{\boldsymbol{\alpha}}}
% \m is renewed further down in this file to \vect{\mu}.
\newcommand{\m}{{\mathbf{m}}}
% \tc{color}{text}: shorthand for \textcolor.
\newcommand{\tc}[2]{\textcolor{#1}{#2}}
% Replaces the existing \k command with a plain letter k.
\renewcommand{\k}{k}
% \br: switch the current color to red (a declaration, like \color itself).
% \providecommand only creates a placeholder when \br is still free, so the
% \renewcommand that follows is valid whether or not \br existed before.
\providecommand{\br}{}
\renewcommand{\br}{\color{red}}
%\newcommand{\bb}{\color{blue}}
% Color switches (declarations): red and blue.
\newcommand{\brh}{\color{red}}
\newcommand{\bbh}{\color{blue}}
% Assorted single symbols.
\newcommand{\N}{\hbox{\bf N}}
\newcommand{\Nl}{N}
\newcommand{\Proj}{P} 
\newcommand{\Hc}{{H}}             % base hypothesis class
% Bold vectors/matrices. These use \def, which defines the name without
% checking whether it already exists.
% (\x, \y and \w are renewed further down in this file to \vect{...}.)
\def\x{{\mathbf{x}}} 
\def\X{{\mathbf{X}}} 
\def\Xr{{\rm\sf X}} 
\def\y{{\mathbf{y}}} 
\def\w{{\mathbf{w}}} 
\def\W{{\mathbf{W}}} 
\def\RR{{\mathbf{R}}} 
\def\NN{{\mathbf{N}}} 
\newcommand{\M}{{\boldsymbol{\mu}}}
\newcommand{\ev}{\hbox{\bf v}}
\newcommand{\EV}{\hbox{\bf V}}
% Calligraphic classes/sets (written with the old \cal switch).
% NOTE(review): \H and, below, \P are standard LaTeX text commands; these \def's
% replace them for the rest of the document -- confirm nothing needs the originals.
\def\H{{\cal{H}}}
\def\F{{\cal{F}}}
\def\Fc{{S}}
% \Sample expands to \Z, which is defined further down in this file.
\def\Sample{{\Z}}
\def\vc{{d}}
\def\D{{\cal{D}}} 
% Hatted letters.
% NOTE(review): \A, \B and \T were defined earlier in this file with different
% meanings (\mathbf{A}, \mathbf{B}, T^{*}); \def overrides them silently, so the
% hatted forms are what the document body sees.
\def\hatD{\hat{D}} 
\def\A{\hat{A}} 
\def\B{\hat{B}} 
\def\T{\hat{T}} 
\def\G{\mathcal{G}} 
\def\P{\hat{P}} 
\newcommand{\al}{\alpha}
%\newcommand{\Fs}{{\mathcal{F}}}   % feature space
% Bold Greek vectors.
\newcommand{\AL}{{\boldsymbol \alpha}}
\newcommand{\BE}{{\boldsymbol \beta}}
\newcommand{\XI}{{\boldsymbol \xi}}
%\newcommand{\sgn}{\mbox{sgn}}
% Bold 1 and hatted f.
\def\one{{\mathbf{1}}}
\def\hatf{\hat{f}}


%\definecolor{Red}{rgb}{0.75,0,0}
%\definecolor{Blue}{rgb}{0,0,0.75}
%\definecolor{Green}{rgb}{0,0.75,0}
%\definecolor{Black}{rgb}{0,0,0}
%\definecolor{Orange}{rgb}{0.9,0.5,0}

%\newcommand{\Blue}[1]{\textcolor{blue}{#1}}
%\newcommand{\Green}[1]{\color{OliveGreen}{#1}}
%\newcommand{\Green}[1]{\textcolor{mygreen}{#1}}
%\newcommand{\Red}[1]{\textcolor{red}{#1}}
%\newcommand{\Orange}[1]{\textcolor{orange}{#1}}

%\newcommand{\argmin}{\mathop{\rm argmin}}
%\newcommand{\tr}{\mathop{\rm tr}}
%\newcommand{\argmax}{\mathop{\rm argmax}}
% \Ind: upright "I" set as a math operator.
\newcommand{\Ind}{\mathop{\rm I}}
% \d: bold d (renewed again further down in this file to \Delta).
\renewcommand{\d}{{\bf d}}
% \l: replaced by the symbol \Nl.
\renewcommand{\l}{\Nl}
%\newcommand{\Hcal}{\mathcal{H}}
% \Z: bold Z. The inner group confines the \bf switch to the letter itself;
% without it, everything following \Z in the same group would also turn bold.
\newcommand{\Z}{{\bf Z}}

% Margin symbols, one half, and c with a tilde.
\newcommand{\mg}{{\rho}}
\newcommand{\MG}{{\varrho}}
\newcommand{\half}{\frac{1}{2}}
\newcommand{\ct}{{\tilde{c}}}

%\newtheorem{example}{Example} 
%\newtheorem{remark}[example]{Remark}
%\newtheorem{theorem}[example]{Theorem} 
%\newtheorem{lemma}[example]{Lemma} 
%\newtheorem{proposition}[example]{Proposition} 
%\newtheorem{corollary}[example]{Corollary}
%\newtheorem{definition}[example]{Definition}

%%% Local Variables: 
%%% mode: latex
%%% TeX-master: "tit"
%%% End: 

% \bigdot{x}: x with a bullet stacked above it (via \stackrel).
\newcommand{\bigdot}[1]{\stackrel{\bullet}{#1}} 
% Color switches (declarations).
% NOTE(review): the color name "hblue" is not defined in this part of the file
% -- it must be defined elsewhere (e.g. with \definecolor) before \textc is used.
\newcommand{\textc}{\color{hblue}}
\newcommand{\fgc}{\color{red}}
\newcommand{\othc}{\color{green}}
\newcommand{\bgc}{\color{black}}
% \titc: bold followed by the \fgc color switch.
\newcommand{\titc}{\bf\fgc}

%\renewcommand{{\tem}}{{\item[{\fgc $\bullet$}]}}
%\newcommand{\Ref}[1]{{\hfill\othc [#1]}}

% Fixed vertical skips: \s = 2cm, \hs = 1cm.
\newcommand{\s}{\vspace*{2cm}}
\newcommand{\hs}{\vspace*{1cm}}
% \bl: a fixed horizontal blank of 0.8cm.
% \providecommand only creates a placeholder when \bl is still free, so the
% \renewcommand that follows is valid whether or not \bl existed before.
\providecommand{\bl}{}
\renewcommand{\bl}{\hspace*{0.8cm}}
% \ss is repurposed as a 4cm vertical skip.
% NOTE(review): this replaces LaTeX's standard \ss text command for the rest of
% the document -- confirm that is acceptable.
\renewcommand{\ss}{\vspace*{4cm}}
% Line breaks followed by 2mm / 4mm / 8mm of extra vertical space.
\newcommand{\fhs}{\\[2mm]}
\newcommand{\fs}{\\[4mm]}
\newcommand{\fss}{\\[8mm]}
% \bra{len}: a right brace whose height is set by an otherwise empty
% one-column array containing a single row break of length #1.
\newcommand{\bra}[1]
{\left. \begin{array}{c} \\[#1] \end{array} \right\} }

%\newcommand{\Lbf}{\LARGE\bf}
% \lbf: large bold font switch.
\newcommand{\lbf}{\large\bf}


%\newcommand{\grad}{\nabla}
% \diff: the letter d followed by a thick math space.
\newcommand{\diff}{d\;}
%\newcommand{\qed}{$\;\;\;\Box$}
% \equref{label}: reference number wrapped in parentheses.
\newcommand{\equref}[1]{(\ref{#1})}
% \sfrac{a}{b}: fraction set inside an \mbox in inline math.
\newcommand{\sfrac}[2]{\mbox{$\frac{#1}{#2}$}}
%\newcommand{\half}{\sfrac{1}{2}}
%\newcommand{\R}{{\bf R}}
% \eqdef: equals sign with "def" stacked on top.
\newcommand{\eqdef}{\stackrel{\rm def}{=}}
%\newcommand{\argmin}{\mbox{argmin}}
%\renewcommand{\min}{\mbox{min}}
%\renewcommand{\max}{\mbox{max}}


% Calligraphic W.
\newcommand{\Wc}{{\mathcal{W}}}


%\newcommand{\vect}[1]{{\bf #1}}
% \vect{x}: bold math version of #1, produced by switching to \boldmath inside
% an \mbox.
% NOTE(review): because the content sits in an \mbox it is typeset at text size,
% so it presumably does not shrink inside sub/superscripts; \bm (used elsewhere
% in this file) would -- consider switching after checking the output.
\newcommand{\vect}[1]{{\mbox{\boldmath $#1$}}}
% \squiggle{x}: #1 under a wide tilde.
\newcommand{\squiggle}[1]{{\widetilde{#1}}}
%\newcommand{\bm}[1]{{\mbox{$\boldmath#1 $}}}

% Calligraphic X (written with the old \cal switch).
\newcommand{\dat}{{\cal{X}}}
% \d is renewed a second time here; from this point on it expands to \Delta
% (the earlier bold-d definition in this file no longer applies).
\renewcommand{\d}{\Delta}

%\newcommand{\vb}{V_B}
\newcommand{\uo}{U_1}
\newcommand{\utp}{U_{t+1}}

%\newcommand{\zero}{\vect{0}}
%\renewcommand{\one}{\vect{1}}


% Bold vectors built on \vect. \x and \y are renewed here, replacing the
% \mathbf-based \def's given earlier in this file; \u replaces an existing command.
\newcommand{\f}{\vect{f}}
\renewcommand{\x}{\vect{x}}
\renewcommand{\u}{\vect{u}}
%\newcommand{\bb}{\vect{b}}
\renewcommand{\y}{\vect{y}}
% Decorated / time-indexed versions of \x.
\newcommand{\xs}{\tilde{\x}}
\newcommand{\xo}{\x_1}
\newcommand{\xt}{\x_t}
\newcommand{\xq}{\x_q}
\newcommand{\xtp}{\x_{t+1}}
\newcommand{\xtm}{\x_{t-1}}

%\newcommand{\yt}{y_t}
\newcommand{\yq}{y_q}
%\newcommand{\yh}{\hat{y}}
% NOTE(review): \yht relies on \yh, whose only visible definition (the line
% above) is commented out -- \yht will fail when used unless \yh is defined elsewhere.
\newcommand{\yht}{\yh_t}

% mu with a wide tilde.
\newcommand{\mus}{\squiggle{\mu}}
% \m is renewed here to a bold mu, replacing the earlier \mathbf{m} definition
% in this file; all macros built on \m pick up this meaning at use time.
\renewcommand{\m}{\vect{\mu}}
%\newcommand{\mb}{\vect{\mu}_B}
\newcommand{\mt}{\m_t}
%\newcommand{\mr}{\m^*}
%\newcommand{\mo}{\m_1}
\newcommand{\mtp}{\m_{t+1}}
\newcommand{\mtm}{\m_{t-1}}
\newcommand{\mTp}{\m_{T+1}}
%\newcommand{\ms}{\squiggle{\m}}

%\renewcommand{\o}{{\omega}}
%\newcommand{\ot}{\o_t}
%\newcommand{\otp}{\o_{t+1}}
%\newcommand{\os}{\squiggle{\o}}
%\renewcommand{\O}{\bf{\Omega}}
%\renewcommand{\O}{\bf{g(\TH)}}
%\newcommand{\M}{\bf{M}}


% theta and variants.
% \th is renewed to a bold theta (this replaces the existing \th command);
% the macros below add subscripts, a star, or a wide tilde.
\renewcommand{\th}{\vect{\theta}}
\newcommand{\tht}{\th_t}
\newcommand{\tho}{\th_1}
\newcommand{\thb}{\th_B}
\newcommand{\thtp}{\th_{t+1}}
\newcommand{\thTp}{\th_{T+1}}
\newcommand{\ths}{\squiggle{\th}}
\newcommand{\thetas}{\squiggle{\theta}}
\newcommand{\thr}{\th^*}

% More bold vectors built on \vect; \w is renewed here, replacing the
% \mathbf-based \def given earlier in this file.
\newcommand{\ta}{\vect{\tau}}
\renewcommand{\w}{\vect{w}}
\newcommand{\uu}{\vect{u}}
\newcommand{\dv}{\vect{d}}

% w and variants.
\newcommand{\ws}{\squiggle{\w}}
\newcommand{\Ws}{\widetilde{\W}}
\newcommand{\wo}{\w_1}
\newcommand{\wt}{\w_t}
%\newcommand{\wb}{\w_B}

\newcommand{\wtp}{\w_{t+1}}
% NOTE(review): \vv is not defined in this part of the file -- presumably
% defined earlier; confirm before using \vtp.
\newcommand{\vtp}{\vv_{t+1}}
\newcommand{\ttheta}{\mbox {{\boldmath $\theta$}}}
\newcommand{\zz}{\vect{z}}

% \TH is renewed to a bold capital Theta (this replaces the existing \TH command).
\renewcommand{\TH}{\vect{\Theta}}

%\newcommand{\at}{{\alpha_t}}
% alpha / beta variants.
\newcommand{\At}{\vect{\alpha}_t}
\newcommand{\aq}{\alpha_q}
% NOTE(review): \atq uses \at and \atmq uses \a; neither has an active
% definition in this part of the file before this point (\at is only renewed
% further down, in terms of \a). Presumably both are defined earlier -- confirm.
\newcommand{\atq}{\at^q}
\newcommand{\atmq}{\a_{t-1}^q}
\newcommand{\btq}{\beta_t^q}

% eta and variants: \e is eta; the macros below add time subscripts,
% and the ...i versions append the exponent -1.
\newcommand{\e}{\eta}
%\newcommand{\eb}{\eta_B}
\newcommand{\ebi}{\e_B^{-1}}
\newcommand{\et}{\e_t}
\newcommand{\eT}{\e_T}
\newcommand{\eo}{\e_1}
\newcommand{\eoo}{\e_2}
\newcommand{\etp}{\e_{t+1}}
\newcommand{\eTp}{\e_{T+1}}
\newcommand{\etpp}{\e_{t+2}}
\newcommand{\eTpp}{\e_{T+2}}
\newcommand{\etm}{\e_{t-1}}
% Inverses. These expand to e.g. \e_t^{-1}; for the multi-token subscripts the
% exponent attaches after the already-subscripted symbol.
\newcommand{\eti}{\et^{-1}}
\newcommand{\eoi}{\eo^{-1}}
\newcommand{\eooi}{\eoo^{-1}}
\newcommand{\etpi}{\etp^{-1}}
\newcommand{\eTpi}{\eTp^{-1}}
\newcommand{\etppi}{\etpp^{-1}}
\newcommand{\eTppi}{\eTpp^{-1}}
\newcommand{\etmi}{\etm^{-1}}
% eta with a wide tilde.
\newcommand{\es}{\squiggle{\e}}

% \I is renewed to a bold I, replacing the calligraphic definition given
% earlier in this file.
\renewcommand{\I}{\vect{I}}


% E and variants: subscripted and inverse forms of \E.
% NOTE(review): \E is not defined in this part of the file -- presumably
% defined earlier; confirm its meaning before using these.
\newcommand{\Ei}{\E^{-1}}
%\newcommand{\Eb}{\E_B}
\newcommand{\Et}{\E_t}
\newcommand{\ET}{\E_T}
\newcommand{\Eo}{\E_1}
\newcommand{\Etp}{\E_{t+1}}
\newcommand{\Eti}{\Et^{-1}}
\newcommand{\Eoi}{\Eo^{-1}}
\newcommand{\Etpi}{\Etp^{-1}}
\newcommand{\Ebi}{\E_B^{-1}}
\newcommand{\Es}{\squiggle{\E}}

% NOTE(review): \renewcommand requires \at to exist already, and \a is used in
% both bodies; neither has an active definition visible in this part of the
% file -- presumably both are defined earlier; confirm.
\renewcommand{\at}{\a_t}
\newcommand{\atp}{\a_{t+1}}

% Prediction symbol: a with a hat, and its time-indexed version.
\newcommand{\pred}{{\hat{a}}}
\newcommand{\predt}{{\pred_t}}

\newcommand{\Q}{\Phi}
%\newcommand{\q}{\phi}

% Bold z and r. \r replaces an existing command.
% NOTE(review): \z has no active definition visible in this part of the file
% before this \renewcommand -- presumably defined earlier; confirm.
\renewcommand{\z}{\vect{z}}
\newcommand{\zs}{\tilde{\z}}
\renewcommand{\r}{\vect{r}}

% Loss symbols: per-step L_t and cumulative L_{1..t} etc.
% \lq replaces an existing command.
\newcommand{\lt}{{L_t}}
\newcommand{\lot}{{L_{1..t}}}
\newcommand{\loT}{{L_{1..T}}}
\newcommand{\lotm}{{L_{1..t-1}}}
\renewcommand{\lq}{{L_q}}

% Number footnotes with symbols instead of digits.
\renewcommand{\thefootnote}{\fnsymbol{footnote}}


\title{RDM-DC: Poisoning Resilient Dataset Condensation \\ with Robust Distribution Matching}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<th.zheng@mail.utoronto.ca>?Subject=Your UAI 2023 paper}{Tianhang Zheng}{}}
\author[1]{\href{mailto:<bli@ece.toronto.edu>?Subject=Your UAI 2023 paper}{Baochun Li}{}}
% Add affiliations after the authors
\affil[1]{%
    Department of Electrical and Computer Engineering\\
    University of Toronto
}
  
\begin{document}
\maketitle

\begin{abstract}
  Dataset condensation aims to condense the original training dataset into a small synthetic dataset for data-efficient learning. The recently proposed dataset condensation techniques allow the model trainers with limited resources to learn acceptable deep learning models on a small amount of synthetic data.
  However, in an adversarial environment, given the original dataset as a poisoned dataset, dataset condensation may encode the poisoning information into the condensed synthetic dataset. 
  To explore the vulnerability of dataset condensation to data poisoning, we revisit the state-of-the-art targeted data poisoning method and customize a targeted data poisoning algorithm for dataset condensation. By executing the two poisoning methods, we demonstrate that, when the synthetic dataset is condensed from a poisoned dataset, the models trained on the synthetic dataset may predict the targeted sample as the attack-targeted label. To defend against data poisoning, we introduce the concept of poisoned deviation to quantify the poisoning effect. We further propose a poisoning-resilient dataset condensation algorithm with a calibration method to reduce poisoned deviation. Extensive evaluations demonstrate that our proposed algorithm can protect the synthetic dataset from data poisoning with minor performance drop.
\end{abstract}

\section{Introduction}
In the past decade, deep learning has significantly contributed to many ongoing advances in computer vision and natural language processing. Big data is essential to the success of deep learning, but it also creates an obstacle that hinders clients with limited resources from applying deep learning. To address this issue, the community proposed dataset condensation---a research topic studying how to condense the large training dataset into a small synthetic dataset and simultaneously maintain the utility of the synthetic data for model training.

Recent research on this topic has produced several effective dataset condensation techniques \citep{zhao2021DM, zhao2021dataset, zhao2021dataset2, cazenavette2022dataset, liudataset, cuidc}. For instance, \cite{zhao2021dataset} proposed to match model gradients on the synthetic data and the original data, for the purpose of maintaining the performance of the model trained by gradient descent on the synthetic data. \cite{zhao2021DM} proposed to match the representations of synthetic and original data. \cite{cazenavette2022dataset} matched the training trajectories on real data and the training trajectories on synthetic data to learn high-utility synthetic data. \cite{liudataset} factorized a dataset into data hallucination networks and bases and generated synthetic samples via arbitrary combinations between networks and bases.

Despite the remarkable progress, there has been little investigation on the vulnerability of dataset condensation to attacks in real-world adversarial environments. For instance, in practice, some training data may come from untrusted sources, which gives an adversary opportunities to insert poisoned data into the training dataset. Previous literature \citep{wallace2021concealed, geiping2020witches, zheng2021first, schuster2021you} has shown that a small subset of poisoned data can mislead the trained models to output adversary-defined (attack-targeted) predictions on targeted samples in computer vision and natural language processing tasks. Therefore, if the original dataset contains poisoned data from untrusted sources, a natural question to ask is whether dataset condensation will encode the poisoned information into the condensed synthetic dataset.


In this paper, we investigate the most scalable and efficient method among \citep{zhao2021DM, zhao2021dataset, zhao2021dataset2, cazenavette2022dataset, liudataset}, which is the distribution matching (DM) method \citep{zhao2021DM}.\footnote{We leave the investigations on other dataset condensation methods for future research.} We conduct the first study on the vulnerability of distribution matching based dataset condensation against the state-of-the-art targeted data poisoning method, {\em i.e.,} gradient matching attack \citep{geiping2020witches}. Inspired by distribution matching \citep{zhao2021DM}, we further propose a targeted data poisoning attack to evaluate dataset condensation, which crafts the poisoned perturbation by matching the representations of poisoned data and the targeted sample.

To quantify the effect of data poisoning on dataset condensation, we introduce the concept of \textbf{poisoned deviation}, which can be viewed as the major cause of the poisoning vulnerability. 
We show that, for the DM method, the poisoned deviation in the average of the original data representations scales in $O(\eps\sqrt{d_{\bm{r}}})$, where $d_{\bm{r}}$ refers to the representation dimension. The poisoned deviation could convey poisoned information to the synthetic data representations during the representation matching process. As a result, the models trained on the synthetic data may output adversary-defined predictions on the targeted data sample.

To defend against data poisoning, we propose a poisoning resilient dataset condensation algorithm with a calibration method to reduce the poisoned deviation. We prove that, when the poisoned deviation is large with potentially significant impact on the representations, the calibration method can reduce the magnitude of the poisoned deviation from $O(\eps\sqrt{d_{\bm{r}}})$ to $\Theta(\eps^2\sqrt{d_{\bm{r}}})$ and thus alleviate the effect of the poisoned deviation. Since our method improves the robustness of distribution matching to poisoned deviation, we call our defense method RDM-DC (\underline{R}obust \underline{D}istribution \underline{M}atching based \underline{D}ataset \underline{C}ondensation).

We conduct extensive experiments with two targeted data poisoning methods, including the state-of-the-art targeted data poisoning method and our proposed data poisoning method. We show that, even if the poisoning rate is small, distribution matching based dataset condensation is still vulnerable to targeted data poisoning. We further demonstrate that the distribution matching method is more vulnerable to our proposed distribution matching based attack than to the gradient matching based attack. This result indicates that our proposed attack is a more suitable attack benchmark to evaluate the distribution matching based dataset condensation method. We evaluate our proposed defense against those two data poisoning attacks with multiple random seeds and find that our defense is able to reduce the attack success rate to $0\%$ with mild utility loss.

\section{Background and Related Work}

\subsection{Definitions and Notations}
In this paper, we denote a neural network by $\bm{f}_{\bm{\theta}}(\cdot)$ with parameters $\bm{\theta}$.
We denote a data sample by $\bm{x}$ and its label by $y$. 
Let $\gT$ and $\gS$ represent the original dataset and the synthetic dataset, respectively. For dataset condensation, we have $|\gS| \ll |\gT|$, which means the size of the synthetic dataset is expected to be much smaller than the size of the original dataset.
We denote the targeted data sample and the adversary-defined prediction by $\bm{x}^t$ and $y^{adv}$, respectively. We denote the poisoned samples by $\{\bm{x}_p\}_{p=1}^P$ and the perturbation added to those poisoned samples by $\{\bm{\delta}_p\}_{p=1}^P$. We refer to the poisoned dataset as $\gT'$. Formally, $\gT'$ can be expressed as $\gT' = (\gT \setminus \{\bm{x}_p, y^{adv}\}_{p=1}^P) \cup \{\bm{x}_p + \bm{\delta}_p, y^{adv}\}_{p=1}^P$.

\subsection{Dataset Condensation}
Dataset condensation is a recently emerged technique for condensing the original training datasets into small synthetic datasets and simultaneously maintaining the data utility for training models to the greatest extent. Given the problem objective, \cite{wang2018dataset} formulated the dataset condensation problem as
\begin{align}
	\argmin_{\gS}\mathbb{E}_{(\bm{x}, y)\sim \gT}\ell(&\bm{f}_{\bm{\theta}(\mathcal{S})}(\bm{x}), y),\\~\mbox{where}~ \bm{\theta}(\mathcal{S}) = \argmin_{\bm{\theta}}&\mathbb{E}_{(\bm{s}, y)\sim \mathcal{S}}\ell(\bm{f}_{\bm{\theta}}(\bm{s}), y). \nonumber
\end{align}
\cite{wang2018dataset} proposed to solve the above bi-level problem via meta-learning \citep{finn2017model}. The proposed solution consists of an inner optimization step to update $\bm{\theta}$ and an outer optimization step to update $\gS$. A drawback of \cite{wang2018dataset}'s method is the heavy computational cost to involve second-order information in the optimization process. To address this drawback, \cite{nguyen2020dataset} proposed an algorithm called kernel inducing points (KIP).

\cite{zhao2021dataset} proposed to match the model gradients on the real and synthetic data for dataset condensation. Since \cite{zhao2021dataset}'s method includes gradients in the objective, it implicitly uses second-order information in the optimization process. \cite{zhao2021dataset2} further proposed differentiable Siamese augmentation $\gA_{\bm{w}}(\cdot)$ and found that applying $\gA_{\bm{w}}(\cdot)$ with random parameters $\bm{w}$ to both the original and synthetic data samples can improve the performance of \cite{zhao2021dataset}'s method.

To avoid using implicit second-order information, \cite{zhao2021DM} proposed to match the representations of the original and synthetic data for dataset condensation. \cite{zhao2021DM} used the following formula as the matching loss, {\em i.e.,}
\begin{align}\label{eq:dm_loss}
	\mathbb{E}_{\bm{\theta}\sim P_{\bm{\theta}}} \|\frac{1}{|\gT|}\sum_{i=1}^{|\gT|}\bm{\Phi}_{\bm{\theta}}(\gA_{\bm{w}}(\bm{x}_i)) - \frac{1}{|\gS|}\sum_{i=1}^{|\gS|}\bm{\Phi}_{\bm{\theta}}(\gA_{\bm{w}}(\bm{s}_i))\|_2^2,
\end{align}
where $\bm{\Phi}_{\bm{\theta}}(\cdot)$ refers to the feature extractor (not pretrained), and $\bm{\theta}$ is sampled from a random parameter distribution $P_{\bm{\theta}}$.

\cite{cazenavette2022dataset} proposed to learn the synthetic data by expert training trajectories, which refer to the model parameters during the training process. Specifically, \cite{cazenavette2022dataset}'s method first trains models on the original dataset and collects the training trajectories. Afterwards, it trains models on the synthetic dataset and matches the training trajectories with those collected from the models trained on the original data. Finally, it backpropagates the matching loss to optimize the synthetic data. This method achieves better performance than the method of \cite{zhao2021DM} but requires much more computational cost.

\cite{liudataset} proposed to factorize a dataset into data hallucination networks and bases and feed the data bases into the hallucination networks to generate synthetic data. This dataset factorization approach can use limited storage to represent more synthetic data compared to the previous methods. 
We note that, although \citep{cazenavette2022dataset, liudataset} achieve better testing accuracy than \citep{zhao2021DM}, \citep{zhao2021DM} is much more efficient and scalable than \citep{cazenavette2022dataset, liudataset}. This is because \citep{zhao2021DM} does not need to train any expert models and does not need any second-order derivative information to optimize the synthetic data. In this paper, we mainly focus on designing a defense for this efficient and scalable dataset condensation technique, which could be attacked by the gradient matching based poisoning attack and our proposed method introduced in Section~\ref{sec:poison_attack}.


\subsection{Targeted Data Poisoning}
In this paper, we mainly focus on targeted data poisoning. We note that we have also evaluated untargeted data poisoning attacks against distribution matching based dataset condensation, but we find that untargeted attacks could not degrade the performance with a small poisoning rate like $1\%$. We conjecture that this is because the poisoned deviation is not enough to have a significant overall impact with a small poisoning rate (See discussion in Section~\ref{subsec:pddm}).

The goal of targeted data poisoning is to mislead the model trained on poisoned data into outputting the adversary-defined label on certain targeted data. Given this goal, the adversary could formulate the following objective to craft the perturbation on the poisoned data:
\begin{align}
	\argmin_{\bm{\delta}_p}\ell(\bm{x}^t, y^{adv}, &\bm{\theta}_p),\\~\mbox{where}~ \bm{\theta}_p = \argmin_{\bm{\theta}}\mathbb{E}_{(\bm{x}, y)\sim \gT'}\ell(\bm{f}_{\bm{\theta}}&(\bm{x}), y); \|\bm{\delta}_p\|_2 \leq \eps. \nonumber
\end{align}
\cite{huang2020metapoison} proposed to solve the above objective by meta learning, with a step to update the model parameters $\bm{\theta}_p$ and another step to update the perturbation $\bm{\delta}_p$. A drawback of the meta learning method is that the second step implicitly uses second-order information, leading to heavy additional computational cost. To reduce the computational cost, \cite{zheng2021first} derived a general targeted data poisoning method that only uses first-order information to update the perturbation. However, \cite{zheng2021first}'s method needs to add color perturbation and watermarks to the poisoned images. \cite{geiping2020witches} proposed to craft the perturbation by matching the model gradients on the targeted data sample and poisoned data, which achieves the state-of-the-art attack performance. We will detail this gradient matching approach in the next section.


We note that some previous literature claims that if the perturbation is large, it could be easily detected by human beings. However, in practice, there may not be sufficient human labor to inspect every data sample in a dataset. Supposing that the dataset inspector randomly checks $50$ samples and the poisoning rate is $1\%$, then the probability that the inspector could \textbf{not} find any poisoned sample is approximately $60\%$, which is still very high. Therefore, if the poisoning rate is only $1\%$, the setting of large perturbation is valid.




\section{Targeted Data Poisoning against Dataset Condensation}\label{sec:poison_attack}
\subsection{Gradient Matching}
Gradient matching is the state-of-the-art targeted data poisoning method. The basic idea of gradient matching is to match the model gradients on the poisoned data and the targeted data sample. If the gradients are approximately matched, updating the model to decrease the loss on poisoned data will be similar to updating the model to decrease the loss on the targeted sample.
The objective of gradient matching based data poisoning can be expressed as
\begin{align}\label{eq:grad_match}
	1 - \frac{\la \nabla_{\bm{\theta}}\ell(\bm{x}^t, y^{adv}, \bm{\theta}), \sum_{p=1}^{P}\nabla_{\bm{\theta}}\ell(\bm{x}_p + \bm{\delta}_p , y^{adv}, \bm{\theta})\ra}{\|\nabla_{\bm{\theta}}\ell(\bm{x}^t, y^{adv}, \bm{\theta})\|\cdot \|\sum_{p=1}^{P}\nabla_{\bm{\theta}}\ell(\bm{x}_p + \bm{\delta}_p , y^{adv}, \bm{\theta})\|},
\end{align}
where $\{\bm{x}_p + \bm{\delta}_p\}_{p=1}^P$ are the poisoned data samples. The true label of those poisoned samples is the adversary-defined label $y^{adv}$.
If the attack objective (\ref{eq:grad_match}) is minimized to a small value, then the average of the model gradients on the poisoning data, {\em i.e.,} $\frac{1}{P}\sum_{p=1}^{P}\nabla_{\bm{\theta}}\ell(\bm{x}_p + \bm{\delta}_p , y^{adv}, \bm{\theta})$, will have the same direction as the model gradient on the targeted data sample and adversary-defined label, {\em i.e.,} $\nabla_{\bm{\theta}}\ell(\bm{x}^t, y^{adv}, \bm{\theta})$.

To boost the attack performance, the gradient matching attack pre-trains multiple models for different epochs on the clean training data and optimizes the perturbation with the attack objective (\ref{eq:grad_match}) on those pre-trained models. Besides, the attack also runs the perturbation-crafting process for multiple rounds with different initial values and outputs the perturbation with the best attack performance.
For completeness, we provide the algorithm of gradient matching based data poisoning from \citep{geiping2020witches} in Algorithm~\ref{alg:grad_match}. \cite{geiping2020witches}'s attack is mainly designed for the standard classification model training process, not for distribution matching based dataset condensation. Therefore, in this paper, we design a more suitable targeted data poisoning attack for distribution matching based dataset condensation, as introduced in the next subsection.
\begin{algorithm}
	\caption{Gradient Matching based Data Poisoning}
	\label{alg:grad_match}
	\begin{algorithmic}
		\REQUIRE Pretrained clean networks $\bm{f}_{\bm{\theta}}(\cdot)$; targeted sample $\bm{x}^t$ with targeted label $y^{adv}$; number of poisoned samples $P\ll N$ ($N$ is the total number of training samples); perturbation bound $\eps$; number of restarts $R$, number of iterations $T$.
		\\
		
		\STATE Randomly select $P$ training images with true label $y^{adv}$, denoted by $\{\bm{x}_p, y^{adv}\}_{p=1}^P$.
		\FOR{$r=1$ to $R$}
		\STATE Randomly initialize the perturbation $\bm{\delta}^r_p$ for $\bm{x}_p$.
		\FOR{$t=1$ to $T$}
		\STATE Apply differentiable data augmentation to $\bm{x}_p +\bm{\delta}^r_p$.
		\STATE Compute the objective by (\ref{eq:grad_match}).
		\STATE Update $\bm{\delta}^r_p$ by a step of signed Adam/MSGD with the gradient of (\ref{eq:grad_match}) w.r.t. $\bm{\delta}^r_p$.
		\ENDFOR
		\ENDFOR
		\STATE Choose the $\bm{\delta}^r_p$ with the minimal value of (\ref{eq:grad_match}) over the $R$ restarts as $\bm{\delta}^*_p$.
		\STATE Replace $\{\bm{x}_p, y^{adv}\}_{p=1}^P$ with $\{\bm{x}_p + \bm{\delta}^*_p, y^{adv}\}_{p=1}^P$.
	\end{algorithmic}
\end{algorithm}

\subsection{Distribution Matching Poisoning}
Our proposed targeted data poisoning method is called distribution matching poisoning (DM poisoning).
The core idea of DM poisoning is to match the representations of the targeted data sample and the representations of the poisoned data with the adversary-defined label. Based on this idea, we define the objective of DM poisoning as
\begin{align}\label{eq:dm_poison}
	\mathbb{E}_{\bm{\theta}\sim P_{\bm{\theta}}} \|\bm{\Phi}_{\bm{\theta}}(\gA_{\bm{w}}(\bm{x}^t)) - \frac{1}{P}\sum_{p=1}^{P}\bm{\Phi}_{\bm{\theta}}(\gA_{\bm{w}}(\bm{x}_p + \bm{\delta}_p))\|_2^2,
\end{align}
where $\{\bm{x}_p\}_{p=1}^P$ also refer to the poisoned samples with label $y^{adv}$. After optimizing the perturbation with the above objective, the similarity between the representation of the targeted sample and the representations of the poisoned samples will increase. Since a small proportion of the condensed synthetic data with label $y^{adv}$ is supposed to share similar representations with the poisoned data with label $y^{adv}$ after executing \citep{zhao2021DM}, the representations of those condensed synthetic data samples are also similar to the representation of the targeted data sample. Therefore, DM poisoning is able to encode the poisoning information regarding the targeted sample into the synthetic data.

On top of the above objective, we propose an algorithm (Algorithm~\ref{alg:dm_poison}) to craft poisoned data. 
\textbf{In our DM poisoning attack, we do not need to pretrain multiple models, and thus our attack runs faster than the gradient matching attack.} In each iteration of DM poisoning, we first randomly sample the parameters $\bm{\theta}$ for the feature extractors $\bm{\Phi}_{\bm{\theta}}$. After that, we sample the augmentation parameters $\bm{w}$ and compute the representations by feeding the targeted sample and the poisoned data into the augmentation function followed by the feature extractors. Finally, we compute the objective (\ref{eq:dm_poison}) with the representations and update the perturbation by a signed Adam optimizer, which is equivalent to a signed momentum SGD optimizer. To boost the attack performance, we also run the algorithm for multiple rounds and output the poisoning perturbation with the best attack performance.
\begin{algorithm}
	\caption{DM Poisoning}
	\label{alg:dm_poison}
	\begin{algorithmic}
		\REQUIRE Targeted sample $\bm{x}^t$ with targeted label $y^{adv}$; number of poisoned samples $P\ll N$; perturbation bound $\eps$; number of restarts $R$, number of iterations $T$; feature extractors $\bm{\Phi}_{\bm{\theta}}$; parameter distribution $P_{\bm{\theta}}$; data augmentation $\mathcal{A}_{\bm{w}}(\cdot)$.
		\\
		\STATE Randomly select $P$ training images with true label $y^{adv}$, denoted by $\{\bm{x}_p, y^{adv}\}_{p=1}^P$.
		\FOR{$r=1$ to $R$}
		\STATE Randomly initialize the perturbation $\bm{\delta}^r_p$ for $\bm{x}_p$.
		\FOR{$t=1$ to $T$}
		\STATE Randomly sample $\bm{\theta}$ from $P_{\bm{\theta}}$.
		\STATE Randomly sample the augmentation parameters $\bm{w}$.
		\STATE Compute Representations: $\bm{r}(\bm{x}_p + \bm{\delta}^r_p) = \bm{\Phi}_{\bm{\theta}}(\mathcal{A}_{\bm{w}}(\bm{x}_p + \bm{\delta}^r_p))$ and $\bm{r}(\bm{x}^t) = \bm{\Phi}_{\bm{\theta}}(\mathcal{A}_{\bm{w}}(\bm{x}^t))$.
		\STATE Update $\bm{\delta}^r_p$ by a step of signed Adam/MSGD with the gradient of (\ref{eq:dm_poison}) w.r.t. $\bm{\delta}^r_p$.
		\ENDFOR
		\ENDFOR
		\STATE Choose the $\bm{\delta}^r_p$ with the minimal value of (\ref{eq:dm_poison}) over the $R$ restarts as $\bm{\delta}^*_p$.
		\STATE Replace $\{\bm{x}_p, y^{adv}\}_{p=1}^P$ with $\{\bm{x}_p + \bm{\delta}^*_p, y^{adv}\}_{p=1}^P$.
	\end{algorithmic}
\end{algorithm}
DM poisoning is similar to feature collision attacks \citep{shafahi2018poison, zhu2019transferable}, but DM poisoning explores far more random feature spaces compared to \citep{shafahi2018poison, zhu2019transferable}.

\section{Poisoning-Resilient Dataset Condensation}\label{sec:prdc}

\subsection{Poisoned Deviation}
To delve into data poisoning in dataset condensation, we propose the concept of poisoned deviation as follows:
\begin{definition}[poisoned deviation]\label{def:poi_dev}
	Given a variable or metric $\bm{z} = \bm{z}_{\gD} + \bm{z}_{\gB}$, if $\bm{z}_{\gD}$ is computed on the original data, and $\bm{z}_{\gB}$ is computed on the poisoned data, then we define $\bm{z}_{\gB}$ as  the poisoned deviation of $\bm{z}$. Poisoned deviation can be viewed as the main cause of the abnormal model behavior desired by the poisoning attack.
\end{definition}

Here we provide a practical example to help readers understand the above definition: A model trainer learns its deep learning model on a poisoned dataset by stochastic gradient descent. In each learning step, the model trainer samples a minibatch from the poisoned dataset. We denote the minibatch by $\{\bm{x}_i, y_i\}^{L_1}_{i=1}\cup\{\bm{x}'_j, y'_j\}^{L_2}_{j=1}$, where $\{\bm{x}_i, y_i\}^{L_1}_{i=1}$ refers to original data, and $\{\bm{x}'_j, y'_j\}^{L_2}_{j=1}$ refers to poisoned data. 
The model gradient w.r.t.\ the model parameters $\bm{\theta}$ on the minibatch is $\bm{g} = \frac{1}{L_1 + L_2}(\sum_{i=1}^{L_1}\nabla_{\bm{\theta}}\ell(\bm{x}_i, y_i) + \sum_{j=1}^{L_2}\nabla_{\bm{\theta}}\ell(\bm{x}'_j, y'_j))$. According to Definition~\ref{def:poi_dev}, $\frac{1}{L_1 + L_2}\sum_{j=1}^{L_2}\nabla_{\bm{\theta}}\ell(\bm{x}'_j, y'_j)$ is the poisoned deviation of $\bm{g}$, which causes the abnormal model behavior desired by the poisoning attack.

\subsection{Poisoned Deviation in Distribution Matching}\label{subsec:pddm}
In this paper, we mainly focus on distribution matching, which is the most scalable dataset condensation method among \citep{zhao2021dataset, liudataset, zhao2021DM, cazenavette2022dataset}. We leave investigation on the other dataset condensation methods for future research.
Distribution matching learns the synthetic data by matching the mean of the subsampled original data representations and the mean of the subsampled synthetic data representations, as shown in Eq.~\ref{eq:dm_loss}.

However, since the original dataset contains poisoned data samples,  $\frac{1}{|\gT|}\sum_{i=1}^{|\gT|}\bm{\Phi}_{\bm{\theta}}(\gA_{\bm{w}}(\bm{x}_i))$ is polluted by poisoned deviation.

In the following, we show that the poisoned deviation scales in $O(\eps\sqrt{d_{\bm{r}}})$.
We denote the poisoned dataset by $\gT'$. Suppose the fraction of poisoned data samples (the poisoning rate) in $\gT'$ is $\eps$, which means that there exist $\eps|\gT'|$ poisoned samples. Then we can express $\gT'$ as $\gT' = \gO \cup B$, where $\gO \subset \gT$ and $B$ is the poisoned subset, so that $|B| = \eps |\gT|$ and $|\gO| =(1-\eps) |\gT|$.
The poisoned deviation of the mean estimation could be expressed as

\begin{align}
	\left\|\frac{1}{|\gT|}\sum_{i=1}^{|B|}\bm{\Phi}_{\bm{\theta}}(\gA_{\bm{w}}(\bm{x}'_i)) \right\|_2
	= O\left(\frac{|B|}{|\gT|}\sqrt{d_{\bm{r}}}\right) = O(\eps\sqrt{d_{\bm{r}}}),
\end{align}
where $d_{\bm{r}}$ is the representation dimension.
Since the scale of $\|\frac{1}{|\gT|}\sum_{i=1}^{|\gT|}\bm{\Phi}_{\bm{\theta}}(\gA_{\bm{w}}(\bm{x}_i))\|_2$ is $O(\sqrt{d_{\bm{r}}})$, the poisoned deviation cannot have a significant overall impact when $\eps$ is small. However, this poisoned deviation is enough to cause the models trained on DM-generated synthetic data to make the adversary-defined prediction $y^{adv}$ on a certain sample $\bm{x}_t$.

\subsection{Poisoning-Resilient Dataset Condensation}
We propose an algorithm for poisoning resilient dataset condensation by reducing the poisoned deviation with a mean calibration method. We prove that, by executing the mean calibration method, we could reduce the bound on the poisoned deviation by an order of magnitude. The algorithm is given in Algorithm~\ref{alg:defense}, which is similar to the algorithm of distribution matching. The main difference between Algorithm~\ref{alg:defense} and the distribution matching algorithm is that we calibrate the mean of the original data representations to reduce the poisoned deviation.
\begin{algorithm}
	\caption{RDM-DC: Robust Distribution Matching for Dataset Condensation}
	\label{alg:defense}
	\begin{algorithmic}
		\REQUIRE Original Dataset $\gT=\gT_1 \cup \gT_2 \cup \dots \cup \gT_C$; the number of classes $C$; the number of data samples per class $N_c$; the number of synthetic samples per class $M$; feature extractors $\bm{\Phi}_{\bm{\theta}}$; parameter distribution $P_{\bm{\theta}}$; data augmentation $\mathcal{A}_{\bm{w}_c}(\cdot)$.
		\STATE Initialize  $\gS = \{\{\bm{s}_j^c\}_{j=1}^{M}\}_{c=1}^C$ with random noise from $\gN(\bm{0}, \bm{I}_d)$
		\FOR{each iteration}
		\STATE Sample $\bm{\theta}$ from $P_{\bm{\theta}}$ and initialize the loss as $\ell=0$
		\FOR{each class $c$}
		\STATE Sample the augmentation parameters $\bm{w}_c$.
		\STATE Sample a minibatch $B^{\gT}_c$ from $\gT_c$
		\STATE Compute Representations: $\bm{r}(\bm{x}_i^c) = \bm{\Phi}_{\bm{\theta}}(\mathcal{A}_{\bm{w}_c}(\bm{x}_i^c))$ for the minibatch $B^{\gT}_c$ and $\bm{r}(\bm{s}_j^c) = \bm{\Phi}_{\bm{\theta}}(\mathcal{A}_{\bm{w}_c}(\bm{s}_j^c))$ for $S_c=\{\bm{s}_j^c\}_{j=1}^{M}$.
		\STATE Mean calibration: $\hat{\bm{\mu}} = \mbox{Calibrate}(\{\bm{r}(\bm{x}_i^c)\}_{i=1}^{|B^{\gT}_c|})$
		\STATE Compute Loss: $\ell = \ell + \|\frac{1}{M}\sum_{j=1}^{M}\bm{r}(\bm{s}_j^c) - \hat{\bm{\mu}}\|_2^2$.
		\ENDFOR
		\STATE $\gS = \gS - \eta \nabla \ell$
		\ENDFOR
		\STATE Output the synthetic dataset $\gS = \{\{\bm{s}_j^c\}_{j=1}^{M}\}_{c=1}^C$
	\end{algorithmic}
\end{algorithm}


The mean calibration method is illustrated in Algorithm~\ref{alg:mean_calibration}, which is inspired by \citep{tran2018spectral}. Given a batch of original data representations, we first estimate the mean and covariance matrix. After that, we compute the top eigenvector of the covariance matrix. Since directly computing the top eigenvector incurs a heavy computational cost, we approximate it using the power method \citep{van1996matrix} with a few iterations. For completeness, we provide the power method in Algorithm~\ref{alg:power_method}.

\begin{algorithm}
	\caption{Mean Calibration}
	\label{alg:mean_calibration}
	\begin{algorithmic}
		\REQUIRE Representations $\{\bm{r}(\bm{x}_i)\}_{i=1}^{N}$
		\STATE 1. Estimate the mean and covariance matrix of the representations by $\overline{\bm{r}} = \frac{1}{N}\sum_{i=1}^{N}\bm{r}(\bm{x}_i)$ and $\bm{\Sigma}_{\bm{r}} = \frac{1}{N-1}\sum_{i=1}^{N}(\bm{r}(\bm{x}_i) - \overline{\bm{r}})(\bm{r}(\bm{x}_i) - \overline{\bm{r}})^T$
		\STATE 2. Compute the top eigenvector of $\bm{\Sigma}_{\bm{r}}$, denoted by $\bm{v}_r$.
		\STATE 3. Assign $|\la \bm{r}(\bm{x}_i) - \overline{\bm{r}}, \bm{v}_r\ra|$ as the score of $\bm{r}(\bm{x}_i)$.
		\STATE 4. Filter out $[3\eps N]$ samples with the largest scores (remaining data samples are denoted by $\{\bm{r}(\bm{x}_i)\}_{i=1}^{N-[3\eps N]}$).
		
		\STATE 5. Recompute the mean  $\overline{\bm{r}} = \frac{1}{N-[3\eps N]}\sum_{i=1}^{N-[3\eps N]}\bm{r}(\bm{x}_i)$
	\end{algorithmic}
\end{algorithm}

\begin{algorithm}
	\caption{Power Method}
	\label{alg:power_method}
	\begin{algorithmic}
		\REQUIRE Covariance matrix $\bm{\Sigma}_{\bm{r}}$; number of iterations $I$
		\STATE Randomly sample a vector $\bm{v} \sim \mathcal{N}(\bm{0}, \bm{I})$.
		\FOR{each iteration}
		\STATE $\bm{v} = \bm{\Sigma}_{\bm{r}} \bm{v}$; $\bm{v} = \bm{v}/\|\bm{v}\|_2$.
		\ENDFOR
		\STATE Output $\bm{v}$.
	\end{algorithmic}
\end{algorithm}

According to the lemma below (inspired by \citep{tran2018spectral}), the mean calibration method is able to reduce the poisoned deviation by an order of magnitude. In the following, we denote the original representation distribution by $\gD$ and the poisoned representation distribution by $\gB$, and their means by $\bm{\mu}_{\gD}$ and $\bm{\mu}_{\gB}$, respectively. We denote the poisoning rate by $\eps$, and we denote the mean and the covariance matrix of $(1-\eps)\gD + \eps\gB$ by $\bm{\mu}_{\gP}$ and $\bm{\Sigma}_{\gP}$, respectively. Given this notation, we can present the lemma.
%Given those notations, we can present the proposition and its proof.

\begin{lemma}\label{prop:main}
	Assume that $\gD$ and $\gB$ have bounded covariance matrices $\bm{\Sigma}_{\gD}, \bm{\Sigma}_{\gB}\leq \sigma^2\bm{I}$, and that their means have an apparent difference, {\em i.e.,} $\|\bm{\mu}_{\gD} - \bm{\mu}_{\gB}\|_2^2 \geq \frac{\alpha\sigma^2}{\eps}$ where $\alpha > \frac{2665}{576}$. Let $\bm{v}$ denote the top eigenvector of $\bm{\Sigma}_{\gP}$. If we drop all the representations that satisfy $|\la\bm{r} - \bm{\mu}_{\gP}, \bm{v}\ra| \geq t$ for a certain $t$, then we can reduce the scale of the poisoned deviation from $O(\eps\sqrt{d_{\bm{r}}})$ to $\Theta(\eps^2\sqrt{d_{\bm{r}}})$.
\end{lemma}

Lemma~\ref{prop:main} is similar but not identical to Lemma 3.1 in \citep{tran2018spectral}. We provide the proof of Lemma~\ref{prop:main} and the related lemmas in the supplementary material.

We remark that, if the assumption in Lemma~\ref{prop:main} does \textbf{not} hold, {\em i.e.,} the difference between the mean of original representations and the mean of the poisoned representations is not significant, the poisoned deviation will be small. In that case, the poisoning attack will not succeed on the distribution matching based dataset condensation method. This is because, if $\|\bm{\mu}_{\gD} - \bm{\mu}_{\gB}\|_2^2 < \frac{\alpha\sigma^2}{\eps}$, the poisoned deviation is expected to be $\|\bm{\mu}_{\gP} - \bm{\mu}_{\gD}\|_2 = \eps\|\bm{\mu}_{\gD} - \bm{\mu}_{\gB}\|_2 < \sqrt{\alpha\eps}\sigma\sim O(\sqrt{\eps})$. In practice, we also find that, if the perturbation budget of the poisoned data is small, which indicates that the difference between $\bm{\mu}_{\gD}$ and $\bm{\mu}_{\gB}$ is small, then distribution matching does not show vulnerability to the targeted data poisoning attacks.

We also note that, in practice, it is intractable to compute $t$ since $\bm{\mu}_{\gD}$ and $\bm{\mu}_{\gB}$ are unknown. However, we know that the proportion of the dropped representations is expected to be approximately $2\eps$, where $\gB$ accounts for approximately $\eps$ of the dropped data, and the other $\eps$ of the dropped data comes from $\gD$. To ensure that we drop most of the poisoned representations, we set the dropping rate as $3\eps$, which means that we drop $[3\eps N]$ representations out of a total of $N$ representations in each iteration, as shown in Algorithm~\ref{alg:mean_calibration}. Note that if the poisoned data only corresponds to one class, then we need to use the proportion of the poisoned data to all the data from that class as $\eps$ to calculate the dropping ratio.

To generalize our theoretical analysis to other dataset condensation methods such as gradient matching based dataset condensation~\citep{zhao2021dataset, zhao2021dataset2}, we could replace the representations in the theoretical analysis in this section with model gradients.

\begin{figure}[h]
	\centering
	\includegraphics[width=0.42\textwidth]{application.png}
	\caption{An application of poisoning-resilient dataset condensation. A trusted entity is responsible for collecting the data and condensing the data into synthetic data.}
	\label{fig:application}
\end{figure}
\subsection{Application of RDM-DC}\label{subsec:rdm-dc}
In this subsection, we discuss the potential applications of our RDM-DC algorithm.
One application of the algorithm is to assist the establishment of a trustworthy data supply chain for deep learning. We sketch the supply chain in Fig.~\ref{fig:application}, where a trusted entity like the government, an agency, or a hospital is responsible for collecting data from the users. After data collection, the trusted entity executes our RDM-DC algorithm on the collected data to generate synthetic datasets and share the synthetic data with the third party to train deep learning models. This supply chain inherits the benefits of dataset condensation and simultaneously addresses the threat of targeted data poisoning.


Another application of the algorithm is to store the core information from a large amount of data, with tolerance to a small subset of bad samples, into a small storage space. Data explosion is a common problem faced by many institutions and users since their limited resources cannot store the astronomical amount of digital information generated by this world. Instead of deleting the historical data, the institutions and users could condense the historical data into a small synthetic dataset with the RDM-DC algorithm so that they can keep the unpoisoned core information from the historical data with limited resources.


\subsection{Comparison with Empirical Robust Aggregation Methods}\label{sec:empirical}
We compare the mean calibration method with several robust aggregation methods, including Trimmed Mean, Truncated Mean, and Median \citep{yin2018byzantine, portnoy2020towards}. The advantage of those empirical methods is that they are very efficient and thus do not add much additional cost to the dataset condensation process. However, the drawback of those methods is that they do not provide any theoretical guarantees, and the poisoned deviation of the mean may still be large after applying those empirical robust aggregation methods. For instance, the maximum poisoned deviation of the truncated mean still scales in $O(\eps\sqrt{d_{\bm{r}}})$. In this paper, we compare our proposed method with those empirical robust aggregation methods. For completeness, we introduce the empirical robust aggregation methods below.
\paragraph{Truncated Mean}
Given the representations $\{\bm{r}(\bm{x}_i)\}_{i=1}^{N}$, we first compute the mean by $\overline{\bm{r}} = \frac{1}{N}\sum_{i=1}^{N}\bm{r}(\bm{x}_i)$. We then compute the distance between each representation $\bm{r}(\bm{x}_i)$ and the mean by $d_i = \|\bm{r}(\bm{x}_i) - \overline{\bm{r}}\|_2$. Finally, we drop the $\bm{r}(\bm{x}_i)$ with the top-$k$ $d_i$ and average the remaining representations to obtain the truncated mean.
\paragraph{Trimmed Mean} Given the representations $\{\bm{r}(\bm{x}_i)\}_{i=1}^{N}$, we refer to the $j$-th dimension of $\bm{r}(\bm{x}_i)$ as $\bm{r}_j(\bm{x}_i)$. For each dimension $j$, we drop off the $k/2$ largest elements and the $k/2$ smallest elements in $\{\bm{r}_j(\bm{x}_i)\}_{i=1}^{N}$ and aggregate the remaining elements to compute the mean for dimension $j$, {\em i.e.,} $\overline{\bm{r}}_j$.

\paragraph{Median} Given the representations $\{\bm{r}(\bm{x}_i)\}_{i=1}^{N}$, we also refer to the $j$-th dimension of $\bm{r}(\bm{x}_i)$ as $\bm{r}_j(\bm{x}_i)$. For each dimension $j$, we use the median of $\{\bm{r}_j(\bm{x}_i)\}_{i=1}^{N}$ as $\overline{\bm{r}}_j$.



\begin{table*}
	\begin{center}
			\begin{tabular}{cccc}
				% \toprule
				\hline
				$\eps$ & Attack  & Test Acc & ASR \\
				\hline
				\multirow{2}{*}{$64/255$} & Grad Match &$59.59\%\pm0.38\%$  & $20.00\%\pm40.00\%$ \\
				& DM Poison &$59.25\%\pm0.33\%$  & $60.00\%\pm48.99\%$ \\
				\multirow{2}{*}{$128/255$} & Grad Match &$59.37\%\pm0.36\%$  & $60.00\%\pm48.99\%$ \\
				& DM Poison &$59.23\%\pm0.40\%$  & $100.00\%\pm0.00\%$ \\
				\hline
		\end{tabular}
		\caption{Comparing the attack results of gradient matching based data poisoning (Grad Match) and distribution matching based data poisoning (DM poisoning).}
		\label{tab:attack_results}
	\end{center}
\end{table*}

\section{Experiments}
\subsection{Experimental Setup}
\paragraph{Dataset and Poisoning Budget}
We follow the previous literature on targeted data poisoning \citep{geiping2020witches, zheng2021first, yang2022not, huang2020metapoison} to mainly conduct evaluations on CIFAR10 \citep{krizhevsky2009learning}. We also conduct experiments on TinyImageNet~\citep{le2015tiny}. We note that, although we follow the previous works on dataset condensation to mainly evaluate image data, our theoretical analysis is generalizable to different data formats.

We set the poisoning rate as $1\%$, which means the proportion of poisoned data in the training dataset is only $1\%$. With this attack setting, even if a dataset inspector randomly inspects 50 samples, the probability that the inspector cannot find a poisoned data sample is approximately $(1-0.01)^{50} \approx 0.605$. \textit{In other words, even if we set a large perturbation size for those poisoned data samples, it is still very likely that the inspector could {\em not} detect the attack without investing much human labor into the inspection.} Therefore, in our experiments, we set the perturbation size as $64/255$ by default, and we also conduct experiments with other perturbation sizes.
Note that in most experiments, this perturbation size is not enough to change any poisoned data sample into the targeted sample since, in most cases, even the smallest $\ell_\infty$ distance between the targeted sample and a poisoned data sample is larger than $64/255$.
\begin{figure}[h]
	\begin{center}
		\includegraphics[width=0.44\textwidth]{target_images.png}
	\end{center}
	\caption{The targeted images for random seeds $0$--$4$. The number before $\rightarrow$ is the original label, and the number after $\rightarrow$ is the targeted label (adversary-defined label).}
	\label{fig:target_imgs}
\end{figure}
\begin{table*}
	\begin{center}
		\begin{tabular}{ccc}
			% \toprule
			\hline
			Attack $\rightarrow$  & \multicolumn{2}{c}{DM Poison}\\
			\cmidrule(lr){2-3}
			Method $\downarrow$& Test Acc & ASR\\
			\hline
			DM &$59.25\%\pm0.33\%$ & $60.00\%\pm48.99\%$\\
			DM + Median & $44.08\%\pm0.57\%$ & $60.00\%\pm48.99\%$\\
			DM + Trim & $51.14\%\pm0.73\%$  & $60.00\%\pm48.99\%$\\
			DM + Truncated & $57.04\%\pm0.45\%$  & $44.00\%\pm49.64\%$ \\
			RDM-DC &$57.65\%\pm0.44\%$ & $0.00\%\pm0.00\%$ \\
			\hline
		\end{tabular}
		\caption{Evaluate the dataset condensation methods and defenses against targeted poisoning attacks with $\eps=64/255$ and poisoning rate $1\%$.}
		\label{tab:main_results}
	\end{center}
\end{table*}

\paragraph{Attack Settings}
In general, we evaluate the dataset condensation methods against two attack baselines, {\em i.e.,} gradient matching and our proposed DM poisoning. For the gradient matching attack, we pretrain 16 models to craft the perturbation. For both the gradient matching attack and the DM poisoning attack, we set the attack step size for the signed Adam optimizer as $1/255$ to ensure that the perturbation can be accurately discretized.
We randomly select five targeted samples with five random seeds ($0$--$4$) and craft the poisoning data for those targeted samples, respectively. We show the targeted samples and corresponding original and adversary-defined labels in Fig.~\ref{fig:target_imgs}.
We employ attack success rate to evaluate the attack performance. For an experiment, if the trained model recognizes the targeted sample as the adversary-defined label, the attack success rate is $100\%$. Otherwise, the attack success rate is $0\%$. We compute the mean and standard deviation of the attack success rate over $5\times5$ ($5$ random seeds to generate attack datasets and $5$ runs of the dataset condensation method for each seed) experiments for evaluating each dataset condensation method.

\paragraph{Dataset Condensation and Defense Settings}
In this paper, we mainly consider distribution matching based dataset condensation. By default, we set the number of synthetic samples per class as $50$. We set the batch size for sampling the original data as $256$. We use an SGD optimizer with momentum $0.5$ and learning rate $1.0$ to update the synthetic data. We follow \citep{zhao2021DM} to set the number of iterations as $20000$. We use Gaussian noise instead of real images to initialize the synthetic data. If real images that contain poisoned samples are used for initialization, all the methods will be very vulnerable to poisoning attacks. For the empirical robust aggregation methods, we set $k$ to $[3\eps N]$, where $\eps$ is the poisoning rate and $N$ is the batch size. We follow \citep{zhao2021DM, zhao2021dataset, zhao2021dataset2, cazenavette2022dataset, liudataset} to train deep learning models on the synthetic dataset and employ the testing accuracy of the models on the original testing dataset to evaluate model performance. To evaluate the defensive performance, we employ the attack success rate (ASR) as the evaluation metric. As mentioned above, for each targeted sample (random seed), we run five experiments with different random seeds and compute the mean and standard deviation of the attack success rate to evaluate the defensive performance.





\subsection{Attack Performance}
We first compare the attack performance of the gradient matching based data poisoning and our proposed DM poisoning on distribution matching based dataset condensation. We provide the attack results in Table~\ref{tab:attack_results}, which shows that our DM poisoning attack is more effective than gradient matching based data poisoning here. We conjecture that this is because gradient matching based data poisoning is mainly designed for the classification task, and it attempts to match the gradients of cross-entropy loss computed on the targeted sample and poisoned samples. That means the representations of the poisoned samples are not necessarily aligned with the representations of the targeted sample in certain feature spaces. Therefore, feature matching may not be able to encode the poisoning information that can flip the prediction of the targeted sample into the synthetic data. In contrast, DM poisoning directly matches the representations of the poisoned data and the target data sample in broad feature spaces so that feature matching can encode the desired poisoning information into the synthetic data.

We also note that, as we increase the perturbation size to $128/255$, DM poisoning achieves $100\%$ success rate against distribution matching based dataset condensation in all the experiments. This result indicates that the $O(\eps\sqrt{d_{\bm{r}}})$ poisoned deviation is enough to encode adversary-defined information about the targeted sample into the synthetic data.

\begin{table}[h]
	\begin{center}
		\begin{tabular}{ccc}
			% \toprule
			\hline
			Attack $\rightarrow$  & \multicolumn{2}{c}{Grad Match}\\
			\cmidrule(lr){2-3}
			Method $\downarrow$& Test Acc & ASR\\
			\hline
			DM &$59.59\%\pm0.38\%$  & $20.00\%\pm40.00\%$ \\
			RDM-DC & $57.78\%\pm0.41\%$ & $0.00\%\pm0.00\%$\\
			\hline
		\end{tabular}
		\caption{Evaluate the dataset condensation defense against the gradient matching based attack.}
		\label{tab:defend_gm_poison}
	\end{center}
\end{table}

\begin{table}
	\begin{center}
			\begin{tabular}{ccc}
				% \toprule
				\hline
				Attack $\rightarrow$  & \multicolumn{2}{c}{DM Poison (TinyImageNet)}\\
				\cmidrule(lr){2-3}
				Method $\downarrow$& Test Acc & ASR\\
				\hline
				DM & $18.49\%\pm0.23\%$ & $100.00\%\pm0.00\%$\\
				RDM-DC & $17.68\%\pm0.34\%$ & $0.00\%\pm0.00\%$\\
				\hline
		\end{tabular}
		\caption{Evaluate our attack and defense on TinyImageNet.}
		\label{tab:imagenet}
	\end{center}
\end{table}

\begin{table}
	\begin{center}
			\begin{tabular}{ccc}
				% \toprule
				\hline
				Attack $\rightarrow$  & \multicolumn{2}{c}{Direct Attack}\\
				\cmidrule(lr){2-3}
				Method $\downarrow$& Test Acc & ASR\\
				\hline
				DM & $59.29\%\pm0.38\%$ & $100.00\%\pm0.00\%$\\
				RDM-DC & $57.79\%\pm0.45\%$ & $0.00\%\pm0.00\%$\\
				\hline
		\end{tabular}
		\caption{Evaluate the dataset condensation methods and defenses against the direct attack.}
		\label{tab:direct_attack}
	\end{center}
\end{table}

\subsection{Defensive Performance}
We evaluate our proposed RDM-DC algorithm and compare it with the empirical robust aggregation methods mentioned in Section~\ref{sec:empirical}. As indicated by the results in Table~\ref{tab:main_results}, Median and Trimmed Mean significantly hurt the utility of synthetic data. We conjecture that this is because the Median and Trimmed Mean methods disregard too much information in the representations as they could not maintain the complete information of any representation in the aggregation process. Truncated Mean is similar to the mean calibration method in the sense that Truncated Mean also disregards some representations and aggregates the remaining representations. The main difference between Truncated Mean and the mean calibration method is that Truncated Mean disregards the representations with large distances to the center (spherical distances), while the mean calibration method disregards the representations with large deviations along the principal vector. With Lemma~\ref{prop:main}, we show that the mean calibration method indeed can reduce the poisoned deviation, while Truncated Mean does not have this guarantee. Therefore, it is not surprising that RDM-DC has better defensive performance than DM + Truncated Mean---DM poisoning can achieve good attack performance against DM + Truncated Mean for some random seeds.

Besides, we evaluate RDM-DC against the gradient matching based attack and report the results in Table~\ref{tab:defend_gm_poison}, which shows that RDM-DC successfully defends against the gradient matching based attack. We also evaluate our attack and defense on TinyImageNet and report the results in Table~\ref{tab:imagenet}. Table~\ref{tab:imagenet} shows that DM poisoning with $\eps=64/255$ can achieve $100\%$ success rate on TinyImageNet, and RDM-DC successfully defends against DM poisoning.

To further demonstrate the outstanding defensive performance of RDM-DC, we evaluate it against a very strong attack, where we directly use the targeted sample with the adversary-defined label as the poisoned data. We name this strong attack the ``direct attack''. As shown in Table~\ref{tab:direct_attack}, this strong direct attack can achieve $100\%$ ASR against DM \citep{zhao2021DM}, but RDM-DC is able to reduce the ASR to $0\%$, indicating the strong defensive ability of RDM-DC.


\section{Conclusions}
In this paper, we study the vulnerability of dataset condensation to targeted data poisoning. We evaluate the existing dataset condensation approaches against the state-of-the-art targeted data poisoning attack, {\em i.e.,} gradient matching, and our proposed data poisoning attack, {\em i.e.,} DM poisoning. We demonstrate that only $1\%$ poisoned data can mislead dataset condensation to encode poisoning information into the condensed synthetic dataset. As a result, the models trained on the synthetic dataset will output adversary-defined prediction for the targeted data sample. To quantify the effect of poisoned data, we propose the concept of poisoned deviation and show that the poisoned deviation in distribution matching based dataset condensation scales in $O(\eps\sqrt{d_{\bm{r}}})$. We further propose a poisoning-resilient dataset condensation algorithm with a calibration method to reduce the poisoned deviation to $O(\eps^2\sqrt{d_{\bm{r}}})$. Extensive evaluations demonstrate the effectiveness of our proposed poisoning-resilient algorithm against targeted data poisoning.

% References
\bibliography{zheng_31}
\end{document}
