
%\documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

\usepackage{multirow}
\usepackage{arydshln}
\newcommand{\blue}{\textcolor{blue}}
\setlength{\textfloatsep}{8pt plus 1.0pt minus 2.0pt} 

\usepackage{bm}
\usepackage{subcaption}

\usepackage{amsthm}
\newtheorem{cor}{Corollary}
\newtheorem{prop}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}[theorem]
\theoremstyle{remark}
\newtheorem*{remark}{Remark}
\newtheorem*{claim}{Claim}
\theoremstyle{definition}
\newtheorem{definition}{Definition}

\newenvironment{propa}[1]{\par\noindent{\scshape Proposition #1\ }\em}{\em \\}
\newenvironment{cora}[1]{\par\noindent{\scshape Corollary #1\ }\em}{\em \\}
\newenvironment{lema}[1]{\par\noindent{\scshape Lemma #1\ }\em}{\em \\}
\newenvironment{theo}[1]{\par\noindent{\scshape Theorem #1\ }\em}{\em \\}


%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools, amsfonts, amssymb} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{tikz-qtree}
\usetikzlibrary{trees}
\usetikzlibrary{automata,positioning}

\allowdisplaybreaks


\usepackage{xr}
\usepackage{xr-hyper} 
\usepackage{hyperref}
\externaldocument{nabi_424}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\def\ci{\perp\!\!\!\perp}
\newcommand{\red}{\textcolor{red}}
\newcommand{\E}{\mathbb{E}}
\DeclareMathOperator{\pa}{pa} 

%\title{Goodness of Fit Tests for Subclasses of Graphical Models of Missing Data}
\title{Semiparametric Causal Sufficient Dimension Reduction of \\ Multidimensional Treatments (Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<razieh.nabi@emory.edu>?Subject=Your UAI 2022 paper}{Razieh~Nabi}{}}
\author[2]{Todd~McNutt}
\author[3]{Ilya~Shpitser}
% Add affiliations after the authors
\affil[1]{%
	Department of Biostatistics and Bioinformatics\\
	Emory University\\
	Atlanta, Georgia, USA
}
\affil[2]{%
	School of Medicine\\
	Johns Hopkins University\\
	Baltimore, Maryland, USA
}
\affil[3]{%
	Department of Computer Science\\
	Johns Hopkins University\\
	Baltimore, Maryland, USA
}

\begin{document}

\appendix
\onecolumn  
	\maketitle

% NOTE: necessary when ptmx or no mathfont class option is given
\providecommand{\upGamma}{\Gamma}
\providecommand{\uppi}{\pi} 

%\section{Math font exposition}
%How math looks in equations is important:
%\begin{equation*}
%  F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
%\end{equation*}
%However, one should not ignore how well math mixes with text:
%The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
%It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.



%{\Large \bf APPENDIX}

%\vspace{0.25cm}
For clearer presentation of materials and equations in this supplement, we switch to a single-column format. Appendix~\ref{app:semiparam} contains a brief overview of inference in semiparametric models. Appendix~\ref{app:discussions} contains additional discussion on how to ensure the nuisance models are trained in a congenial manner. Appendix~\ref{app:exp} contains additional results with simulated data and a real data application. Appendix~\ref{app:proofs} contains all the proofs. 

%######################################################

\section{Brief Overview of Semiparametric Estimation}
\label{app:semiparam}

Let $Z_1, \ldots, Z_n,$ be iid samples from a general class of probability densities $p(Z; \theta)$ parameterized by $\theta^T = (\beta^T, \eta^T),$ where $\beta \in \mathbb{R}^q$ denotes the set of target parameters, and $\eta$ denotes a possibly infinite dimensional set of nuisance parameters.  
%This type of model is termed semiparametric, since it has both a parametric and a nonparametric component.  
The goal of statistical inference in semiparametric models is to find ``the best'' estimator of $\beta$ in the model, denoted by $\widehat{\beta}.$ We will consider \emph{regular asymptotically linear (RAL)} estimators, which are estimators of the form
\begin{eqnarray}
	\sqrt{n}(\hat{\beta} - \beta) = \frac{1}{\sqrt{n}} \sum_{i = 1}^{n} \phi(Z_i) + o_p(1), \nonumber 
\end{eqnarray} %
where $\phi \in \mathbb{R}^q$ has mean zero and finite variance, $o_p(1)$ denotes a term that converges to zero in probability, and $\phi(Z_i)$ is the \emph{influence function (IF)} of the $i$th observation for the parameter vector $\beta.$ RAL estimators are consistent and asymptotically normal, with the asymptotic variance of the estimator given by its IF,
\begin{eqnarray}
	\sqrt{n}(\hat{\beta} - \beta) \xrightarrow[]{\mathcal{D}} \mathcal{N}\big(0, \E[\phi\phi^T]\big). \nonumber 
\end{eqnarray}

There is a bijective correspondence between RAL estimators and IFs.  IFs provide a geometric view of the behavior of RAL estimators. Consider a Hilbert space $\mathcal{H}$ of all mean-zero $q$-dimensional functions, equipped with an inner product, defined between two arbitrary elements of the Hilbert space $h_1$ and $h_2$ as $\mathbb{E}[h_1^Th_2].$ The \emph{nuisance tangent space} $\Lambda$ in the semiparametric model is defined to be the mean square closure of elements of the nuisance tangent spaces $\Lambda_{\beta} = \{B^{q\times r} S_\eta(Z; \theta) \}$ of every \emph{parametric submodel}. A parametric submodel is defined as a subset of densities in the semiparametric model parameterized by $\theta^T_{\beta} = (\beta^T, \eta_{\beta}^T),$ where $\eta_{\beta}^T \in {\mathbb R}^r,$ such that the subset contains the density $p(Z; \theta_0)$ in the semiparametric model evaluated at the true parameter values $\theta_0.$ The space $\Lambda$ is important because it is known that all influence functions lie in the orthogonal complement $\Lambda^{\perp}$ of $\Lambda$ with respect to $\mathcal{H}.$ For this reason, recovering $\Lambda^{\perp}$ is often the first step for constructing RAL estimators in semiparametric models.  Out of all IFs in $\Lambda^{\perp}$ there exists a unique one which lies in the tangent space, and which yields the most efficient RAL estimator by recovering the \emph{semiparametric efficiency bound}; see \cite{tsiatis07missing} for details.

%######################################################

\section{Additional Discussions}
\label{app:discussions}

{\bf``Inverted'' Structural Nested Mean Model.} 
In order to deal with the issue of congeniality, we may opt to specify $\E_q[ Y \mid g(A;\beta)]$ and $\widetilde{f}(A,{ C},\beta) = \E_q[Y \mid A,{ C}] - \E_q[Y \mid g(A;\beta)],$ which yield a variationally independent specification of $\E_q[ Y \mid g(A; \beta)]$ and
$\E_q[ Y \mid A,{ C}] = \E_q[ Y \mid g(A; \beta)] + \widetilde{f}(A,{ C},\beta).$ Consequently, the four variationally independent models we need to specify are as follows: $\ell(g(A;\beta)),$ $\nu(g(A;\beta)),$ $p(A | { C}),$ and $\widetilde{f}(A,{ C},\beta).$
The last term in \eqref{eqn:u-star} can be evaluated empirically without additional modeling.  Thus, we need to specify the additional nuisance model $\widetilde{f}$. We propose to fit $\widetilde{f}$ by borrowing ideas from the theory of structural nested mean models (SNMMs) \citep{structural14stijn, robins99marginal}. Unlike MSMs, which are regression models for causal relationships, SNMMs directly model the so-called ``blip effects,'' namely counterfactual differences between the response to a particular treatment, and a response to a reference treatment, given a particular observed trajectory. For a single treatment, this difference simplifies to
$\gamma(A,{ C}; \psi) = \E[ Y(A) \mid A, { C}] - \E[ Y(0) \mid A, { C} ].$ Let $U_{sn}(\psi) \coloneqq Y - \gamma(A, { C}; \psi).$  Consequently, $\E[ U_{sn}(\psi) \mid A, { C}] = \E[Y(0) \mid A, { C}] = \E[Y(0) \mid { C}] = \E[ U_{sn}(\psi) \mid { C} ]$ (by conditional ignorability). This leads to the following estimating equation, which yields consistent estimation of the parameters $\psi$: \[\mathbb{P}_n\Big[ \{ d(A,{ C}) - \E[ d(A,{ C}) \mid { C} ] \} \times \{ U_{sn}(\psi) - \E[ U_{sn}(\psi) \mid { C} ] \} \Big] = 0,\] where $d(A,{ C})$ is a function of $A$ and $ C$ with the same cardinality as $\psi$ \citep{structural14stijn}. Assuming $\widetilde{f}$ is parameterized by $\psi,$  we now show that estimating $\psi$ can be viewed as an estimation problem for a kind of ``inverted SNMM.''
\begin{lemma}
	\label{lem:f-tilde}
	Let $U_{dim}(\psi) = Y - \widetilde{f}(A, { C}, \beta; \psi)$, and fix any $d(A,{ C}).$ If either $\E[ d(A,{ C}) \mid g(A; \beta)]$ or $\E[ U_{dim}(\psi) \mid g(A;\beta)]$ is correctly specified, the following estimating equations yield a consistent estimator of $\psi,$
	%
	%	{\small
	\begin{align*}
		&\mathbb{P}_n\Big[ \big\{ d(A,{ C}) - \E[ d(A,{ C}) \mid g(A; \beta) ] \big\} 
		%			&\hspace{1cm} 
		\times \big\{ U_{dim}(\psi) - \E [ U_{dim}(\psi) \mid g(A; \beta) ] \big\} \Big] = 0. 
	\end{align*} 
	%	}
\end{lemma}
For the purposes of robustness, specifying both $\E[\widetilde{f} \mid g(A;\beta)]$ and $\E[U_{dim}(\psi) \mid g(A;\beta)]$ correctly is part of the correct specification of $\E[U(\beta) \mid A, { C}]$, given the type of estimation strategy we use. 

The implementation provided in Section~\ref{sec:estimation} can be modified to take advantage of modeling congenial models. Right before step (c), we need to estimate $ \widehat{\widetilde{f}^{(j)}}\big(A, { C}, \beta^{(j)}; \widehat{\psi}\big)$ using Lemma~\ref{lem:f-tilde}, and modify step (c) by letting $\E[ U^q(\beta^{(j)}) \mid A, { C} ] =  \widehat{\widetilde{f}^{(j)}} \times  \big\{\alpha(A) - \widehat{\nu}(g(A;\beta^{(j)})) \big\}$. A downside of estimating congenial models  is that the overall procedure becomes quite computationally intensive. 


%######################################################

\section{Additional Experiments}
\label{app:exp}

In light of an anonymous reviewer's suggestion, we provide the steps for numerical computation of the first derivative. Computing the second derivative follows similarly.  

\begin{itemize}
	\item Step 1.  Define a function that takes as input a vector-valued $\beta$ and computes $\zeta(\beta)$ as defined in Step 4 of the implementation discussion.
	
	\item Step 2. Let $K$ be the length of $\beta$, that is, (\# of rows) $\times$ (\# of columns).
	
	\item Step 3. Let \emph{deriv} be the vector that collects the derivatives of $\zeta(\beta)$ with respect to $\beta[i, j]$. Thus \emph{deriv} is a vector of size $K$.
	
	\item Step 4. For $k \in 1:K$ do:
	
	\begin{itemize}
		\item (i) Let the $k^\text{th}$ element in the matrix $\beta$ be perturbed by $\delta$, as  follows
		\begin{itemize}
			\item (a) Let $e$ be a matrix of all zeros with the same number of rows and columns as $\beta$
			
			\item (b) $e[k] = 1$
			
			\item (c) $\beta^+_k = \beta + \delta \times e$
			
			\item (d) $\beta^-_k = \beta - \delta \times e$
		\end{itemize}
		
		\item (ii) Compute $\zeta(\beta^+_k)$ and $\zeta(\beta^-_k)$
		
		\item (iii) \emph{deriv}$[k] = \{\sum \zeta^2(\beta^+_k) - \sum \zeta^2(\beta^-_k)\}/\{2 \times \delta\}$
	\end{itemize}
	
\end{itemize}


\subsection{Simulations}

\noindent\textbf{Simulation 1.} 
The boxplots in Fig.~\ref{fig:boxplots2}(a) illustrate the performance of different estimation strategies for $\beta$ for both Case 1 and Case 2, when the sample size is set to $200$ and $p = 12$. 

\begin{figure}[t]
	\begin{minipage}[b]{.32\textwidth}
		%	\centering
		\includegraphics[scale=0.13]{pics/boxplots_p12.png} 
		\subcaption{}\label{label-a}
	\end{minipage}  %\hfill
	\begin{minipage}[b]{.32\textwidth}
		%				\centering
		\includegraphics[scale=0.13]{pics/boxplots_case2.png}
		\subcaption{}\label{label-b}
	\end{minipage} 
	\begin{minipage}[b]{.32\textwidth}
		%	\centering
		\includegraphics[scale=0.13]{pics/dr_boxplots_p6.png} 
		\subcaption{}\label{label-c}
	\end{minipage}  %\hfill
	\caption{(a) Boxplots of Frobenius norms between true and estimated parameters in simulations ($p=12$); (b) illustration of the effect of sample size on the Frobenius norms between true and estimated parameters using data generated from Case $2$ with $p = 6$; (c) boxplots illustrating the robustness property of our AIPW-style estimator.} 
	\label{fig:boxplots2}
\end{figure}

\noindent\textbf{Simulation 2.} 
The relative frequencies of the selected dimension are reported in the table below, which shows that the bootstrap procedure reliably recovers the true structural dimension, namely $2$.

\begin{center}
	\scalebox{1.}{
		\begin{tabular}{| c | c | c | c | c | c |}
			\hline 
			Model $(p = 6)$ & $\hat{d} = 1$ & $\hat{d} = 2$ & $\hat{d} = 3$ & $\hat{d} = 4$ & $\hat{d} = 5$  \\ \hline 
			Case $1$ & $0\%$ & $98\%$ & $2\%$ & $0\%$ & $0\%$ \\ \hline
			Case $2$ & $0\%$ & $90\%$ & $10\%$ & $0\%$ & $0\%$ \\ 
			\hline  
		\end{tabular}
	}
\end{center}

\noindent\textbf{Simulation 3.} 
In the third set of simulations, we demonstrated the effect of sample size on \textit{IPW} and \textit{AIPW} estimators of $\beta$ in the causal SDR model. Results are shown in Fig.~\ref{fig:boxplots2}(b).  While both estimators are consistent under our model specification, AIPW exhibits favorable convergence rates compared to IPW, as expected.

\noindent\textbf{Simulation 4. } 
In light of an anonymous reviewer's suggestion, we ran a simulation to illustrate the robustness property of our AIPW-style estimator. 
%Instead, we provide summary statistics of the plots (that would resemble the boxplots in Fig.1), which can be used to infer the robustness behavior. 
For this illustration, we focused on Case 2 with $p = 6$ and fixed the sample size at $200$. We computed the Frobenius norms between true and estimated parameters under four different scenarios: (1) \emph{miss-no} where all models are correctly specified, (2) \emph{miss-A} where only the propensity model is misspecified, (3) \emph{miss-Y} where only the outcome model is misspecified, and (4) \emph{miss-AY} where both models are misspecified. For misspecification of the outcome model, we excluded the dependency of $Y$ on $C_3$ and $C_4$. For misspecification of the propensity model, we changed the mean of the multivariate normal distribution by altering the dependency of $A$ on the baseline factors. We performed $100$ iterations and the boxplots are provided in Fig.~\ref{fig:boxplots2}(c).  


We assure the reviewer that demonstrations of the robustness property will be included in the camera-ready revision.

\subsection{Real Data}

Assume treatment is collected using $p$ equally spaced percentages of volume. In other words, treatment is assumed to be a vector in $\mathbb{R}^p$ where the $i$-{th} element corresponds to the radiation dose on $p$ percentages of the parotid glands. The effect of radiation on weight loss is illustrated in Fig.~\ref{fig:RT_high} by allowing $p$ to be $10$ and $20,$ and reducing the size of treatment to one dimension. We use IPW estimators to calculate the effects. Both plots agree with our stated conclusion in the main body of the manuscript, i.e., radiation has a negative effect on weight loss. 

\begin{figure}[h]
	\begin{minipage}[b]{.48\textwidth}
		\centering 
		\includegraphics[scale=0.395]{pics/RT_ipw_d1_p10.png}
	\end{minipage}  \hfill
	\begin{minipage}[b]{.48\textwidth}
		\centering 
		\includegraphics[scale=0.4]{pics/RT_ipw_d1_p20.png}
	\end{minipage}  \hfill
	\caption{Heatmaps to illustrate the causal effect of radiation on weight loss. Treatment is collected using (left) $10$ and (right) $20$ equally spaced percentages of volume in parotid glands.}
	\label{fig:RT_high}
\end{figure}


%######################################################
%\clearpage
\section{Proofs}
\label{app:proofs}

{\bf Lemma \ref{lem:ipw-consistent}}. 
\begin{proof}
	Choosing $\phi(A,{ C}) = 0$ in Theorem~\ref{thm:orth} yields \eqref{eqn:sdr_ipw}.  All elements of the orthocomplement of the nuisance tangent space are mean zero under the true distribution (we give an argument for elements of $\widetilde{\Lambda}^{\perp}_{\eta}$ in Proposition~\ref{pro:mean-zero}).  Since $\widetilde{U}(\beta)$ exhibits double robustness, i.e., it remains consistent if either $\ell(g(A;\beta))$ or $\nu(g(A;\beta))$ is correctly specified \citep{MA12SDR},  the correct specification of $p(A \mid { C})$ yields our conclusion. 
\end{proof}

%######################################################

\begin{prop}
	For all $\widetilde{U}(\beta^*) \in \widetilde{\Lambda}^{\perp}_{\eta},$ $\E[\widetilde{U}(\beta^*)] = 0$.
	\label{pro:mean-zero}
\end{prop}
%
\begin{proof}
	The second and third terms of $\widetilde{U}(\beta^*)$ are mean zero by construction. The first term, under truth with the property that
	$\E_q[Y \mid A] = \E_q[Y \mid g(A; \beta)],$ is
	\begin{small}
		\begin{align*}
			\E\Big[ \frac{p^*(A)}{p(A \mid { C})} &\times \widetilde{U}(\beta)  \Big]
			= \int \widetilde{U}(\beta) \times p(Y \mid A, { C}) \times p^*(A) \times p({ C}) \ d\mu_{Y, A, { C}} \\ 
			&= \int \big\{ Y - \ell(g(a; \beta)) \big\} \times \big\{ \alpha(A) -  \nu(g(a; \beta))  \big\}  \times  q(Y,A,{ C}) \ d\mu_{Y, A, { C}} \\
			&= \E_q\Big[  \big\{ Y - \ell(g(a; \beta)) \big\} \times \big\{ \alpha(A) -  \nu(g(a; \beta))  \big\}  \Big] \\
			&= \E_q\Big[ \big\{ \alpha(A) -  \nu(g(a; \beta))  \big\}  \times \E_q\big[ \big\{ Y - \ell(g(a; \beta)) \big\}  \mid A = a \big] \Big] \\
			&= \E_q\Big[ \big\{ \alpha(A) -  \nu(g(a; \beta))  \big\} \times \big\{ \E_q[Y \mid A = a] - \ell(g(a; \beta)) \big\} \Big]\\
			&= 0,
		\end{align*}
	\end{small}%
	since $ \ell(g(a; \beta)) \coloneqq \E_q[Y \mid A = a]$. Note that even if $\ell(g(a; \beta))$ is misspecified, the expectation will still be zero if $\nu(g(a; \beta)) $ is correctly specified, as can be shown by iterated expectations. 
	
\end{proof}


%######################################################

{\bf Theorem \ref{thm:orth}}.
	\begin{proof}
		This is a direct consequence of Theorems 3.1 and 3.2 in \cite{robins99marginal}, and results in Appendix 3 of \cite{MA12SDR}.
	\end{proof}
	
	
	%######################################################
	
	{\bf Lemma \ref{lem:orth-form}}. 
	\begin{proof}
		Plugging in the optimal $\phi(A,{ C})$ yields $\widetilde{U}(\beta^*)$ to be
		\begin{small}
			\begin{align*}
				\notag
				& \frac{p^*(A)}{p(A \mid { C})} \times \widetilde{U}(\beta) - 
				\E\left[\frac{p^*(A)}{p(A \mid { C})} \ \widetilde{U}(\beta) \middle| A, { C} \right] +
				\E\left[\E\left[\frac{p^*(A)}{p(A \mid { C})} \ \widetilde{U}(\beta)  \middle| A, { C} \right] \middle| { C} \right].
			\end{align*}
		\end{small}
		The conclusion follows, since
		%{\small
		\begin{align*}
			\E&\left[\E\left[ \frac{p^*(A)}{p(A \mid { C})} \ \widetilde{U}(\beta)\middle| A, { C} \right] \middle| { C} \right]  \\
			&\hspace{2cm} = \E\left[ \frac{p^*(A)}{p(A \mid { C})}  \E\left[\ \widetilde{U}(\beta)\middle| A, { C} \right] \middle| { C} \right] \hspace{1.2cm} \\
			&\hspace{2cm} = \int \frac{p^*(A)}{p(A \mid { C})} \ \E[\widetilde{U}(\beta)  \mid A, { C} ] \ p(Y, A \mid { C}) \ d\mu_{Y, A}\\
			&\hspace{2cm} = \int \E[\widetilde{U}(\beta)  \mid A, { C} ] \ p(Y \mid A, { C}) \ p^*(A) \ d\mu_{Y, A} \\
			&\hspace{2cm} = \int \E[\widetilde{U}(\beta)  \mid A, { C} ] \ q(Y, A \mid { C}) \ d\mu_{Y, A} \\
			&\hspace{2cm} = \E_q\Big[\E\big[\widetilde{U}(\beta) \mid A, { C} \big] \Big| { C} \Big].
		\end{align*}
		%}
	\end{proof}
	
	%######################################################
	
	{\bf Lemma \ref{lem:dr}}. 
	\begin{proof}
		Assume either $\ell(g(A; \beta))$ or $\nu(g(A; \beta)),$ and $p(A \mid { C})$ are correctly specified. Consequently, the second and third terms in the expression of $\widetilde{U}(\beta^*)$ are both mean zero, even under an incorrect specification of $\E [ \widetilde{U}(\beta) \mid A, { C}].$ Following the same argument as in Proposition~\ref{pro:mean-zero}, the first term is mean zero if either $\ell(g(A; \beta))$ or $\nu(g(A; \beta))$ is correctly specified.
		
		Assume either $\ell(g(A; \beta))$ or $\nu(g(A; \beta)),$ and $\E [ \widetilde{U}(\beta) \mid A, { C}]$ are correctly specified.
		Consequently, the first two terms in the expression of $\widetilde{U}(\beta^*)$ are both mean zero, even under an incorrect specification of $p(A \mid { C}).$  For the last term, we have:
		\begin{small}
			\begin{align*}
				&\E\bigg[ \E_q\Big[ \E\big[\widetilde{U}(\beta) \ \big| \ A, { C} \big] \ \Big| \ { C} \Big] \bigg] \\
				&\hspace{2cm} = \E\bigg[  \E_q\Big[  \ \int \widetilde{U}(\beta) \times p(Y \mid A, { C}) \ d\mu_{Y} \ \Big| \ { C} \Big] \bigg]  \hspace{1cm} \\
				&\hspace{2cm} = \E\bigg[ \int  \Big( \int \widetilde{U}(\beta) \times p(Y \mid A, { C}) \ d\mu_{Y} \Big) \times p^*(A) \ d\mu_{A} \bigg]  \\ 
				&\hspace{2cm} = \int \left( \int  \int \widetilde{U}(\beta) \times p(Y \mid A, { C}) \times p^*(A) \ d\mu_{Y} \ d\mu_{A} \right) \times p({ C}) \ d\mu_{ C}  \\
				&\hspace{2cm} = \int \widetilde{U}(\beta) \times p(Y \mid A, { C}) \times p^*(A) \times p({ C}) \ d\mu_{Y, A, { C}}  \\ 
				&\hspace{2cm} = \int \widetilde{U}(\beta) \times q(Y, A, { C}) \ d\mu_{Y, A, { C}}  \\
				&\hspace{2cm} =  \E_q\big[ \widetilde{U}(\beta) \big].
			\end{align*}
		\end{small}
		We conclude the proof by noting that $\E_q\big[\widetilde{U}(\beta)\big]$ is zero if either $\ell(g(A; \beta))$ or $\nu(g(A; \beta))$ is correctly specified. Note that the normalized version of $\widetilde{U}(\beta^*),$ that is $\E[\frac{\partial \widetilde{U}(\beta^*)}{\partial \beta}]^{-1} \times \widetilde{U}(\beta^*),$ is an influence function that lives in the orthogonal complement of the nuisance tangent space, $\widetilde{\Lambda}^{\perp}_{\eta}.$ Therefore,  the estimator obtained by solving $\E[\widetilde{U}(\beta^*)] = 0$ is RAL and is consistent and asymptotically normal with mean zero and variance equal to the variance of the influence function \citep{van2000asymptotic, tsiatis07missing}. 
	\end{proof}
	
	%######################################################
	
	{\bf Theorem \ref{theom:convergence}}.
		\begin{proof} 
			Let $\beta \in \mathbb{R}^q$ and let $\eta$ be infinite dimensional. We prove this theorem for the parametric submodel in the semiparametric model of $\{p(Z; \beta, \eta) \}$. With a slight abuse of notation, we denote $\eta \in \mathbb{R}^{r}$ to be the nuisance parameters within the parametric submodel. 
			The Taylor series expansion of $\widetilde{U}(Z; \widehat{\beta}(\widehat{\eta}), \widehat{\eta})$ around $\beta_0$ is
			{\small
				\begin{align}
					0 
					= &\frac{1}{\sqrt{n}} \sum_{i=1}^n \widetilde{U}(z_i; \widehat{\beta}(\widehat{\eta}), \widehat{\eta})    \label{eq:main}  \\
					= &  \underset{(a)}{\underbrace{\frac{1}{\sqrt{n}} \sum_{i=1}^n \widetilde{U}(z_i; \beta_0, \widehat{\eta}) }} +  
					\underset{(b)}{\underbrace{  \frac{\partial}{\partial \beta} \left\{ \frac{1}{n}  \sum_{i=1}^{n} \widetilde{U}(z_i; \beta_0, \widehat{\eta}) \right\} } }  \sqrt{n}(\widehat{\beta} - \beta_0) + o_p(1)  \nonumber 
				\end{align}
			}
			{\small
				\begin{align*}
					(a) 
					&= \frac{1}{\sqrt{n}} \sum_{i=1}^n \widetilde{U}(z_i; \beta_0, \eta_0)  \\
					&\hspace{0.25cm} + \frac{1}{n} \sum_{i=1}^n \left(\frac{\partial \widetilde{U}(z_i; \beta_0, \eta_0)}{\partial \eta}\right)_{q \times r} \times \sqrt{n}(\widehat{\eta} - \eta_0) \\
					&\hspace{0.25cm} + \frac{1}{2}  \underset{ 1\times 1\times r}{\underbrace{n^{1/4}(\widehat{\eta} - \eta_0)'}} \  
					\underset{r\times q\times r \text{ (tensor)}}{\underbrace{ \left(   \frac{1}{n} \sum_{i=1}^n \frac{\partial^2 \widetilde{U}(z_i; \beta_0, \eta_0)}{\partial^2 \eta }  \right)}} \  
					\underset{r \times 1\times 1}{\underbrace{n^{1/4}(\widehat{\eta} - \eta_0)}} +  o_p(1)
					\\
					(b) 
					&= \underset{(b_1)}{\underbrace{\frac{1}{n} \sum_{i=1}^{n} \left( \frac{\partial \widetilde{U}(z_i;\beta_0, \eta_0)}{\partial \beta} \right)_{q\times q}}} +   
					\underset{(b_2)}{ \frac{\partial }{\partial \beta} \bigg\{ \underbrace{\frac{1}{n} \sum_{i=1}^{n} \left( \frac{\partial \widetilde{U}(z_i; \beta_0, \eta_0)}{\partial \eta} \right)_{q\times r}} } \times \left( \widehat{\eta} - \eta_0 \right)_{r\times 1} \bigg\}
				\end{align*}
			}
			{\small
				\begin{align*}
					&(b_1): \frac{1}{n} \sum_{i=1}^{n} \left( \frac{\partial \widetilde{U}}{\partial \beta} \right)_{q\times q} \longrightarrow  \E_{\theta_0}\left[ \frac{\partial \widetilde{U}}{\partial \beta}  \right]_{q\times q} = - \E_{\theta_0} \left[ \widetilde{U}(Z; \theta_0) S'_\beta (Z; \theta_0)  \right] 
					\\
					\\
					&(b_2): \frac{1}{n} \sum_{i=1}^{n} \left( \frac{\partial \widetilde{U}}{\partial \eta} \right)_{q\times r} \longrightarrow  \E_{\theta_0}\left[ \frac{\partial \widetilde{U}}{\partial \eta}  \right]_{q\times r} = - \E_{\theta_0} \left[ \widetilde{U}(Z; \theta_0) S'_\eta (Z; \theta_0)  \right] = \pmb 0_{q\times r} 
				\end{align*}
			}
			
			Since $n^{1/4} (\widehat{\eta} - \eta_0)$ and $\frac{1}{n} \sum_{i=1}^n \left(\frac{\partial \widetilde{U}(z_i; \beta_0, \eta_0)}{\partial \eta}\right)_{q \times r}$ both converge in probability to zero, we have
			\begin{align*}
				\frac{1}{\sqrt{n}} \sum_{i=1}^n \widetilde{U}(z_i; \beta_0, \widehat{\eta})  
				=  &\frac{1}{\sqrt{n}} \sum_{i=1}^n \widetilde{U}(z_i; \beta_0, \eta_0) + o_p(1).
			\end{align*}%
			Therefore, from Equation~\eqref{eq:main},
			{\small
				\begin{align*}
					\sqrt{n}(\widehat{\beta} - \beta_0) = \frac{1}{\sqrt{n}} \sum_{i=1}^n \left\{ - \E^{-1}_{\theta_0} \left[\frac{\partial \widetilde{U}(z_i; \beta_0, \eta_0)}{\partial \beta}\right] \widetilde{U}(z_i; \beta_0, \eta_0) \right\}  +  o_p(1) 
				\end{align*}
			}%
			This concludes the proof. This procedure carries over to the case where the nuisance parameter is infinite dimensional; see \cite{tsiatis07missing}.
		\end{proof}
		
		%######################################################
		
		{\bf Lemma \ref{lem:f-tilde}}.
			\begin{proof}
				Define $U_{dim}(\psi) = Y - \widetilde{f}(A,{ C},\beta; \psi).$ Therefore,  
				\[
				\E[ U_{dim}(\psi) \mid A, { C} ] = \ell(g(A; \beta)) = \E[ U_{dim}(\psi) \mid g(A; \beta)].
				\] 
				This is a situation precisely isomorphic to single treatment SNMMs above, except with the roles of $A$ and ${ C}$ reversed (hence this is an ``inverted SNMM''). Our conclusion will then follow by results in \citep{robins00marginal, structural14stijn}.  We provide a more detailed proof as follows. 
				We have that $\widetilde{f}(A,{ C},\beta; \psi) = \mathbb{E}[Y \mid A = a, { C} = { C}] - \ell(g(a; \beta)).$ Therefore, 
				\begin{align*}
					\mathbb{E}[Y \mid A = a, { C} = { C}] = \ell(g(a; \beta)) + \widetilde{f}(a,{ C},\beta; \psi),
				\end{align*}%
				which we can rewrite as follows, 
				\begin{align*}
					Y = \ell(g(a; \beta)) + \widetilde{f}(a,{ C},\beta; \psi) + \epsilon, \ 
					\text{ s.t. } \ \mathbb{E}[\epsilon \mid { C}, a] = 0. 
				\end{align*}
				
				Observed data are instances of the form ${\bf Z} = ({ C}, A, Y)$. The goal is to find semiparametric estimators for $\psi$ in the semiparametric model
				$\mathcal{P} = \{ p({\bf z}; \psi, \eta(\cdot)), {\bf z} = ({ C}, a, y)\}$ and the truth is $p_0({\bf z}) = p({\bf z}; \psi_0, \eta_0(\cdot))$. 
				The observed data likelihood can be written as follows,
				\begin{align*}
					p(c, a, y) 
					&= p({ C}, a) \times  p(y \mid a, { C}) \coloneqq p({ C}, a)\times p(\epsilon \mid a, { C})  = \eta_1({ C}, a) \times \eta_2(\epsilon, a, { C}) \\
					&= \eta_1({ C}, a) \times \eta_2\Big(y - \ell(g(a; \beta)) - \widetilde{f}(a,{ C},\beta; \psi), a, { C} \Big),
				\end{align*}
				where $\epsilon = Y - \ell(g(a; \beta)) - \widetilde{f}(a,{ C},\beta; \psi),$ $\eta_1({ C}, a)$ denotes the nuisance model for $p({ C}, a),$ and $\eta_2(\epsilon, a, { C})$ denotes the nuisance model for $p(\epsilon \mid a, { C}),$ which is any density such that $\E[\epsilon \mid a, { C}] = 0.$ $\psi$ is the parameter of interest and the nuisance parameters are $\{\eta_1, \eta_2, \ell(g(a; \beta))\}$. 
				
				\noindent The nuisance tangent space of this semiparametric model, $\Lambda,$ is defined as the mean-square closure of parametric submodel nuisance tangent spaces:  
				\begin{align*}
					\mathcal{P}_{\psi, \zeta} 
					&= \big\{ p(z; \psi, \psi_{\zeta}) = p(c, a; \zeta_1) \times p(\epsilon \mid a, { C}; \zeta_2) \big\} \\
					&= \Big\{ p({ C}, a; \zeta_1) \times p\big(y - \ell(g(a; \beta)) - \widetilde{f}(a,{ C},\beta; \psi) \mid a, { C}; \zeta_2\big)\Big\},  
				\end{align*}
				where $\zeta_1$ and $\zeta_2$ are $r_1$- and $r_2$-dimensional vectors, respectively. Thus, the nuisance parameters in the parametric submodel are finite dimensional, $\zeta = \{\zeta_1, \zeta_2, \ell(g(a; \beta))\}.$
				\begin{align*}
					\Lambda_\zeta &= \{B\times S_\zeta, \forall B\},  \\
					S_\zeta &= \frac{\partial{\{\text{log likelihood of the submodel evaluated at truth}\}}}{\partial{\zeta}} \\
					&= \Bigg\{ 
					\Big( \frac{\partial \log p(z; \psi, \zeta)}{\partial \zeta_1} \Big), 
					\Big( \frac{\partial \log p(z; \psi, \zeta)}{\partial \zeta_2} \Big), 
					\Big( \frac{\partial \log p(z; \psi, \zeta)}{\partial \ell(g(a; \beta))} \Big) 
					\Bigg\}\Bigg|_{\psi_0, \zeta_0} \\
					&= \Big\{ S_{\zeta_1}(z; \psi_0, \zeta_0),  S_{\zeta_2}(z; \psi_0, \zeta_0), S_{\ell(g(a; \beta))}(z; \psi_0, \zeta_0) \Big\}.  
				\end{align*}
				
				\noindent Hence, $\Lambda_\zeta  = \Lambda_{\zeta_1} + \Lambda_{\zeta_2} + \Lambda_{\ell(g(a; \beta))} $. 
				$S_{\zeta_1}$ should satisfy the density conditions. In addition, $ S_{\zeta_2}$ should satisfy the condition that $\E[\epsilon \mid a, { C}] = 0.$ We derive each of these subspaces using theorems in \citep{tsiatis07missing} as a guideline. 
				\begin{align*}
					& \text{(Theorem 4.6)}  & \Lambda_{\zeta_1} &= \{ f({ C}, a); \E[f] = 0 \} \\
					& \text{(Theorem 4.7)}  & \Lambda_{\zeta_2} &= \{f(\epsilon, a, { C}); \E[f \mid a, { C}] = 0, \E[\epsilon f \mid a, { C}] = 0\} \\
					& \text{(Lemma 4.3)}  & \Lambda_{\zeta_1}^{\perp} &= \{g(\epsilon, a, { C}); \E[g \mid a, { C}] = 0\} \\
					& \text{(Theorem 4.8)}  & (\Lambda_{\zeta_1} + \Lambda_{\zeta_2})^{\perp} &= \{g({ C}, a)\epsilon\} \\
					& \text{(Equation \ref{eq:Smu}) }  &  \Lambda_{\ell(g(a; \beta))} &= \{\frac{\psi_{2\epsilon}'(\epsilon, { C}, a)}{\psi_2(\epsilon, { C}, a)} f(g(a; \beta)) \}
				\end{align*}
				%
				In order to derive $\Lambda_{\ell(g(a; \beta))},$ we write down the corresponding score function as follows. 
				\begin{align}
					&S_{\ell(g(a; \beta))} 
					= \frac{\partial \log p(z; \psi, \zeta)}{\partial \ell(g(a; \beta))} \Big|_{\psi_0, \zeta_0}  \nonumber \\
					&\hspace{0.5cm} = \frac{\partial \log \Big(\psi_1({ C}, a; \zeta_{10}) \times \psi_2\big(y - \ell(g(a; \beta)) - \gamma({ C}, a; \psi), { C}, a; \zeta_{20}\big) \Big)}{\partial \ell(g(a; \beta))} \nonumber \\
					&\hspace{0.5cm}= \frac{\partial \log \psi_2\big(y - \ell(g(a; \beta)) - \gamma({ C}, a; \psi), { C}, a; \zeta_{20}\big)}{\partial \ell(g(a; \beta))}  \nonumber \\
					&\hspace{0.5cm}= \frac{\partial \log \psi_2\big(\epsilon, { C}, a; \zeta_{20}\big)}{\partial \epsilon}\times \frac{\partial \epsilon}{\partial \ell(g(a; \beta))}  \quad (\epsilon \text{ is a function of }\ell(g(a; \beta))) \nonumber \\
					&\hspace{0.5cm}= \frac{\psi_{2\epsilon}'(\epsilon, { C}, a)}{\psi_2(\epsilon, { C}, a)} f(g(a; \beta)).
					\label{eq:Smu}
				\end{align}
				%
				In order to derive $\Lambda_\zeta^\perp,$ we proceed as follows. Since $\Lambda_\zeta = \Lambda_{\zeta_1} + \Lambda_{\zeta_2} + \Lambda_{\ell(g(a; \beta))}$ and $\Lambda_{\zeta_1} + \Lambda_{\zeta_2} \subset \Lambda_\zeta$, then $\Lambda_\zeta^\perp \subset (\Lambda_{\zeta_1} + \Lambda_{\zeta_2})^\perp = \{g(c, a)\epsilon\}$. Similarly, $\Lambda_\zeta^\perp \subset \Lambda_{\ell(g(a; \beta))}^\perp,$ therefore $\Lambda_\zeta^\perp = \{(\Lambda_{\zeta_1} + \Lambda_{\zeta_2})^\perp \cap  \Lambda_{\ell(g(a; \beta))}^\perp \}.$
				
				\noindent  Pick an arbitrary element in  $(\Lambda_{\zeta_1} + \Lambda_{\zeta_2})^\perp,$ and denote it by $d({ C}, a)\epsilon.$ For $d({ C}, a)\epsilon$ to be an element in $\Lambda_\zeta^\perp,$ it needs to be orthogonal to every element in $\Lambda_{\ell(g(a; \beta))}.$ Pick an arbitrary element in $\Lambda_{\ell(g(a; \beta))}$ and denote it by $\frac{\psi_{2\epsilon}'}{\psi_2} h(g(a; \beta)).$ We have,
				\begin{align*}
					\forall h(g(a; \beta)) \quad 0 
					&= \Big\langle d({ C}, a)\epsilon, \ \frac{\psi_{2\epsilon}'}{\psi_2} h(g(a; \beta)) \Big\rangle \\
					&= \E\big[d({ C}, a)\epsilon \frac{\psi_{2\epsilon}'}{\psi_2} h(g(a; \beta))\big] \\
					&= \E\big[d({ C}, a) h(g(a; \beta))\big].
				\end{align*}
				Consequently, $\forall h(g(a; \beta))$:
				\begin{align*}
					0 
					&= \E\big[d({ C}, a) \times h(g(a; \beta))\big] \\
					&= \E\Big[ \E\big[d({ C}, a) \times h(g(a; \beta)) \ \big| \ g(a; \beta)\big]  \Big] \\
					&= \E\Big[ h(g(a; \beta))  \times \E\big[d({ C}, a) \ \big| \ g(a; \beta)\big]   \Big] \\
					&= \E\big[h(g(a; \beta))\big] \times \E\big[d({ C}, a) \ \big| \ g(a; \beta)\big]. 
				\end{align*}
				%
				Therefore, $\E[\ d({ C}, a) \mid g(a; \beta) \ ] = 0$ and 
				\begin{align*}
					\Lambda^\perp_\zeta 
					&=\Big\{ \big( d({ C}, a) - \E[d({ C}, a) \mid g(a; \beta)] \big) \times \epsilon \Big\} \\
					&= \Big\{ \big( d({ C}, a) - \E[d({ C}, a) \mid g(a; \beta)] \big) \times \big(Y - \gamma({ C}, a; \psi) - \ell(g(a; \beta)) \big) \Big\} \\
					&= \Big\{ \big( d({ C}, a) - \E[d({ C}, a) \mid g(a; \beta)] \big) \times \big(U(\psi) - \E[U(\psi) \mid { C}, a]\big)  \Big\}. 
				\end{align*}
				%
				Note that $\E[U(\psi) \mid { C}, a] = \E_q[Y \mid g(a; \beta)] = \E[U(\psi) \mid g(a; \beta)]$. Hence, 
				\begin{align*}
					\Lambda^\perp_\zeta =  \Big\{ \big\{ d({ C}, a) - \E[d({ C}, a) \mid g(a; \beta)] \big\} \times \big\{ U(\psi) - \E[U(\psi) \mid g(a; \beta)] \big\} \Big\}.  
				\end{align*}
			\end{proof}
		
\clearpage 

\bibliography{nabi_424}
		
\end{document}