% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{bibentry}
\usepackage{tikz} % nice language for creating drawings and diagrams
% \makeatletter
% \newcommand*{\addFileDependency}[1]{% argument=file name and extension
% \typeout{(#1)}% latexmk will find this if $recorder=0
% % however, in that case, it will ignore #1 if it is a .aux or 
% % .pdf file etc and it exists! If it doesn't exist, it will appear 
% % in the list of dependents regardless)
% %
% % Write the following if you want it to appear in \listfiles 
% % --- although not really necessary and latexmk doesn't use this
% %
% \@addtofilelist{#1}
% %
% % latexmk will find this message if #1 doesn't exist (yet)
% \IfFileExists{#1}{}{\typeout{No file #1.}}
% }\makeatother

% \newcommand*{\myexternaldocument}[1]{%
% \externaldocument{#1}%
% \addFileDependency{#1.tex}%
% \addFileDependency{#1.aux}%
% }

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
% \usepackage{xr-hyper} 
% \externaldocument{uai2023-template}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)
%% Self-defined macros
% \newcommand{\swap}[3][-]{#3#1#2} % just an example
%}
% START -- additional user-added packages
% Additional user-added packages
\usepackage{xcolor}
\definecolor{linkblue}{rgb}{0.0, 0.3, 0.6}
\definecolor{commentgreen}{rgb}{0.0, 0.4, 0.1}
\definecolor{dartmouthgreen}{rgb}{0.05, 0.5, 0.06}
\definecolor{frenchblue}{rgb}{0.0, 0.45, 0.73}
\definecolor{mediumred-violet}{rgb}{0.73, 0.2, 0.52}
\definecolor{darkorange}{rgb}{0.80, 0.439, 0}
\definecolor{orange(ryb)}{rgb}{0.98, 0.6, 0.01}
\definecolor{darkorchid}{rgb}{0.6, 0.2, 0.8}
\definecolor{independence}{RGB}{187,212,113}
\definecolor{independence-text}{RGB}{145,176,53} % A little darker
\definecolor{weakdependence}{RGB}{99,164,108}
\definecolor{weakdependence-text}{RGB}{68,117,75} % A little darker
\definecolor{moderatedependence}{RGB}{35,51,41}
\definecolor{moderatedependence-text}{RGB}{52,76,61} % A little lighter

% Add user-defined packages
\usepackage{hyperref}
\hypersetup{
  colorlinks = true,
  linkcolor=linkblue,   % color of internal links
  citecolor=linkblue,   % color of links to bibliography
  urlcolor=linkblue,    % color of external links
  pagebackref=true,
  implicit=false,
  bookmarks=true,
  bookmarksopen=true,
  pdfdisplaydoctitle=true
}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amssymb}
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{assumption}{Assumption}
\usepackage{xcolor}
\usepackage{float}
\usepackage{adjustbox}
\usepackage{bbm}
\usepackage{cancel}
\usepackage{soul}
\usepackage[titlenumbered,ruled]{algorithm2e}

\newcommand\mycommfont[1]{\footnotesize\ttfamily\textcolor{commentgreen}{#1}}
\SetCommentSty{mycommfont}

\newcommand{\prob}{\Pr}
\long\def\comment#1{}

\newcommand{\coopermj}[1]{\textcolor{blue}{\textbf{coopermj}: \textit{#1}}}
\newcommand{\aligharari}[1]{\textcolor{purple}{\textbf{aligharari}: \textit{#1}}}
\newcommand{\rgk}[1]{\textcolor{red}{\textbf{rahul suggests:}: \textit{#1}}}

% END -- additional user-added packages

\title{Copula-Based Deep Survival Models for Dependent Censoring\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{Ali Hossein Gharari Foomani$^{*,}$}
\author[3,5]{Michael Cooper$^{*,}$}
\author[1,2]{Russell Greiner}
\author[3,4,5]{Rahul G. Krishnan}
% Add affiliations after the authors
\affil[1]{%
Department of Computing Science, University of Alberta
}
\affil[2]{%
Alberta Machine Intelligence Institute
}
\affil[3]{%
Department of Computer Science, University of Toronto
}
\affil[4]{%
Department of Laboratory Medicine and Pathobiology, University of Toronto
}
\affil[5]{%
Vector Institute
}
\comment{
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Harry~Q.~Bovik}
\author[1,2]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[1]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cranberry University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Second Affiliation\\
    Address\\
    …
}
\affil[3]{%
    Another Affiliation\\
    Address\\
    …
  }
  }
  
\begin{document}

\setcounter{equation}{11}

% \include{uai2023-template}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\raggedbottom
\maketitle
\tableofcontents
\vfill



\appendix


\section{Table of Notation}

\begin{table}[H]
    \setlength{\tabcolsep}{12pt}
    \begin{adjustbox}{center}
    \begin{tabular}{ll}
    $\textbf{1}^N$ & $N$-vector filled with 1's.\\
    $\mathbbm{1}[\cdot]$ & Indicator function.\\
    $\mathcal{L}(\cdot)$ & Likelihood function.\\
    $\ell(\cdot)$ & Log-likelihood function.\\
 $X \in \mathcal{X}$ & Covariates of one instance (as elements of the covariate space, $\mathcal{X}$).\\
 $T_E \in \mathbb{R}_+$ & Event time.\\
 $T_C \in \mathbb{R}_+$ & Censorship time.\\
 $T_{\text{obs}} \in \mathbb{R}_+$ & Time of last observation; the minimum of $T_E, T_C$.\\
 $T \in \mathbb{R}_+$ & Either event or censoring time; used in contexts where a quantity may refer to either.\\
 $\delta \in \{0,1\}$ & Event indicator. Equal to 1 if the observed time is the event time; 0 otherwise.\\
 $\mathcal{D} \subset \mathcal{X} \times \mathbb{R}_+ \times \{0,1\}$ & Survival dataset of the form $\{(X^{(i)}, T^{(i)}_\text{obs}, \delta^{(i)})\}_{i=1}^N$.\\
 $S_T \in \mathcal{S}$ & Survival function, $S : \mathbb{R}_+ \rightarrow [0,1]$, and space of survival functions, $\mathcal{S}$.\\
 $f_{T}$ & Probability density function over time, representing $\prob(T=t)$.\\
 $F_{T}$ & Cumulative distribution function over time, representing $\prob(T \leq t)$.\\
 $C$ & A copula. If written as $C_\theta$, this denotes a copula parameterized by the dependence parameter $\theta$.\\
 $u_1, u_2$ & Inputs to a copula function. It is assumed that these are uniformly distributed.   
    \end{tabular}
    \end{adjustbox}
    \label{tab:notation}
\end{table}


\section{Copula Formulae and Algorithms}

\subsection{Table of Preliminaries}

\begin{table}[H]
    \small
    \begin{adjustbox}{center}
    \begin{tabular}{|c||c|c|c|}
         \hline
         Copula & $C_\theta(u_1, u_2)$ & $\Theta$ & \parbox{0.5cm} {\begin{align*} \frac{\partial}{\partial u_1} C_\theta(u_1, u_2) \end{align*}} \\
         \hline
         \hline
         Independence Copula & $u_1u_2$ & N/A & \parbox{3cm} {\begin{align*} u_2 \end{align*}}\\
         \hline
         Clayton Copula & $\left(\max \left(u_1^{-\theta} + u_2^{-\theta} - 1, 0\right)\right)^{-1/\theta}$ & $[-1, \infty)\backslash\{0\}$ & \parbox{3cm} {\begin{align*} 
         \begin{cases}
            \left(u_1^{-\theta} + u_2^{-\theta}-1\right)^{\frac{-\theta-1}{\theta}}u_1^{-\theta-1} \quad &u_1^{-\theta} + u_2^{-\theta} > 1\\
            0 &\text{otherwise}
         \end{cases}
         \end{align*}}\\
         \hline 
         Frank Copula & \parbox{3cm}{\begin{align*}
    \frac{-1}{\theta} \log \left(1+\frac{(\exp(-\theta u_1)-1)(\exp(-\theta u_2)-1)}{\exp(-\theta)-1}\right)
        \end{align*}} & $\mathbb{R}\backslash\{0\}$ & \parbox{3cm} {\begin{align*} 
        \frac{\exp(-\theta u_1)(\exp(-\theta u_2)-1)}{(\exp(-\theta)-1)+(\exp(-\theta u_1)-1)(\exp(-\theta u_2)-1)}
         \end{align*}}\\
         \hline
    \end{tabular}
    \end{adjustbox}
    \caption{A table of formulas representing different classes of bivariate copulas used in our experiments. This table provides $C_\theta(u_1, u_2)$, the formula for the cumulative distribution function of the copula;
    $\Theta$, the parameter space from which valid $\theta$ may be drawn; and $\frac{\partial}{\partial u_1} C_\theta(u_1, u_2)$, the partial derivative of the copula with respect to its first argument. Due to the symmetric nature of these copulas, one can readily find $\frac{\partial}{\partial u_2} C_\theta(u_1, u_2)$ from $\frac{\partial}{\partial u_1} C_\theta(u_1, u_2)$ by simply interchanging $u_1$ and $u_2$ (hence, we only provide $\frac{\partial}{\partial u_1} C_\theta(u_1, u_2)$).}
    \label{tab:my_label}
\end{table}

\subsection{Sampling from a Copula} % D.2
\label{app:sample-copula}

Algorithm 2 requires that we draw samples from the Clayton and Frank copulas. To do so, we implement the copula sampling scheme from the Python \href{https://www.statsmodels.org/stable/index.html}{\texttt{statsmodels}} package \citep{seabold2010statsmodels}.

\subsection{Quantile Density Visualizations}
\label{ref:sec_quantiledensity}

\begin{figure}[H]
    \centering
    \includegraphics[width=\textwidth]{figures/quantile_density.png}
    \caption{Plots of the densities for the Clayton (top row) and Frank (bottom row) copulas, under different degrees of dependence. These plots are functions of each of the copula's margins, $u$ and $v$. In practice, $u$ and $v$ are quantiles of the event and censoring distributions. Observe that, as the dependence increases, the difference in density between the on-diagonal points (points where $u \approx v$) and the off-diagonal points increases. Note also that, while the Clayton copula concentrates density around low quantiles (points where $u \approx v \approx 0$) as dependence increases, the Frank copula concentrates density more uniformly around the on-diagonal.}
    \label{fig:quantile-dependence}
\end{figure}


\subsection{Intuition for Copula Selection}

In Section 7, we discussed three different cases that can be used to build intuition around the forms of dependence induced by various copulas. In Figure \ref{fig:three_cases}, we visualize these cases, and relate them to the quantile density plots in Appendix \ref{ref:sec_quantiledensity}. The point of this section is to build intuition regarding the \textit{a priori} selection of a copula, so we will necessarily make a few simplifications. For example, although the three cases we discuss are not exhaustive -- it is possible that the event and censoring survival curves cross (\textit{e.g.} if the event and censoring distributions have different baseline hazards) -- they present clean intuition relating the choice of copula to the structure of the joint density it produces.

\begin{figure}[H]
    \centering
    \includegraphics[width=0.9\textwidth]{figures/SLkW7Lw-2.png}
    \caption{Three survival functions highlighting the three cases we presented in Section 7 of the main body. \textbf{Left}: the case where the conditional survival and censoring functions are the same. \textbf{Center}: the censoring survival function decays faster than the event survival function. \textbf{Right}: the event survival function decays faster than the censoring survival function. }
    \label{fig:three_cases}
\end{figure}

The key intuition for selecting a copula from domain knowledge can be drawn from Sklar's Theorem (Survival), which states that a joint distribution over event and censoring times can be modelled as two independent event and censoring distributions the quantiles of which are linked by a copula. When the event and censoring distributions are the same (left), the event quantile of a given time is the same as the censoring quantile for that same time. Thus, an increased dependence between event and censoring quantiles is directly reflected in a positive dependence between event and censoring times. When the censoring survival curve decays more quickly than the event survival curve, the event quantile of a given event time is higher than the censoring quantile for that same time. Therefore, increasing the dependence between event and censoring quantiles increases the likelihood that the censoring time precedes the event time. By symmetry, the opposite is true when the event survival curve decays more quickly than the censoring survival curve. An increase in dependence between quantiles in this setting increases the likelihood that the event time precedes the censoring time under the model.

% \coopermj{TODO -- this isn't the right algorithm, instead, it's a template. Replace with the right algorithm.}

% \RestyleAlgo{ruled}
% \SetKwComment{Comment}{$\ $\# }{ }
% \SetKwComment{Commentt}{\# }{ }
% {
% \begin{algorithm}[h]
% \label{alg:datagenerating}
% \KwIn{

% }
% \KwResult{

% }
% \hrulefill\\
% $\mathcal{D} = \emptyset$\;
% \For{$i = 1,\, ...\,,\, N$}{
%     $u_1^{(i)}, u_2^{(i)} \sim C_{\theta^*}$\;
%     $T_E^{(i)} \gets \left(\frac{-\log(u_1)}{g_{\psi_E^*}(X^{(i)})}\right)^{\frac{1}{\gamma^*_E}}\rho^*_E$\;
%     $T_C^{(i)} \gets \left(\frac{-\log(u_2)}{g_{\psi_C^*}(X^{(i)})}\right)^{\frac{1}{\gamma^*_C}}\rho^*_C$\;
%     $\mathcal{D} \gets \mathcal{D} \cup \left\{\left(X^{(i)}, \min\left(T_E^{(i)}, T_C^{(i)}\right), \mathbbm{1}\left[T_E^{(i)} < T_C^{(i)}\right]\right)\right\}$\;
% }
% \Return{$\mathcal{D}$}
% \caption{Sampling from a Copula}
% \end{algorithm}
% }


% \subsection{Sampling from a Joint Distribution Defined by a Copula}


% \subsection{Derivative of the Cumulative Density of a Copula}

% \subsection{Stable Approximation of Copula Density Functions}


\section{Derivations}

\subsection{The Right-Censored Likelihood}

As a starting point for the subsequent derivations, we discuss the intuition behind the general likelihood for right-censored survival data, and present its formulation in Equation~\ref{eq:generallikelihood}.

Recall that a survival dataset $\mathcal{D}$ consists of $N$ i.i.d. samples of the form 
$\{(X^{(i)}, T^{(i)}_\text{obs}, \delta^{(i)})\}_{i=1}^N \ \subset\ \mathcal{X} \times \mathbb{R}_+ \times \{0,1\}$.
The likelihood expressed in Equation \ref{eq:generallikelihood} uses the $\delta^{(i)}$ terms in the exponent as a conditional binary filter: raising a term to the power of $\delta^{(i)}$ ensures it is non-degenerate only when the patient experiences an event; raising a term to the power of $1-\delta^{(i)}$ ensures it is non-degenerate only when the patient is censored.

Let $f_{T_E, T_C | X}$ represent the joint density function of the event and censoring times, conditional on the patients' covariates. There are two mutually exclusive, collectively exhaustive cases into which we can decompose the right-censored likelihood for a given patient $i$:
\begin{enumerate}
\item\textbf{Case 1} ($\delta^{(i)} = 1$): If $\delta^{(i)} = 1$, the likelihood term should express that $T_E^{(i)} = T^{(i)}_\text{obs}$, and $T_C^{(i)} > T^{(i)}_\text{obs}$. This corresponds to the observation that the patient experienced the event at time $T^{(i)}_\text{obs}$, and was not censored prior to experiencing the event. The probability of this event under our density function is $\int_{T^{(i)}_\text{obs}}^\infty f_{T_E, T_C | X}(T^{(i)}_\text{obs}, t_c | X^{(i)})dt_c$.

\item\textbf{Case 2} ($\delta^{(i)} = 0$): If $\delta^{(i)} = 0$, the likelihood term should express that $T_C^{(i)} = T^{(i)}_\text{obs}$, and $T_E^{(i)} > T^{(i)}_\text{obs}$. This corresponds to the observation that the patient is censored at time $T^{(i)}_\text{obs}$, and did not experience an event prior to being censored. The probability of this event under our density function is $\int_{T^{(i)}_\text{obs}}^\infty f_{T_E, T_C | X}(t_e, T^{(i)}_\text{obs} | X^{(i)})dt_e$.
\end{enumerate}

Combining these two cases, and applying the assumption that our data is i.i.d., yields the general likelihood function for right-censored data.

\begin{equation}
\mathcal{L}(\mathcal{D}) = \prod_{i=1}^N \color{frenchblue}{\underbrace{\color{black}\left[\int_{T^{(i)}_\text{obs}}^\infty f_{T_E, T_C | X}(T^{(i)}_{\text{obs}},\, t_c\, |\, X^{(i)})\,dt_c\right]\color{frenchblue}}_{\Pr\left(T_E = T^{(i)}_{\text{obs}},\, T_C > T^{(i)}_{\text{obs}}\, |\, X^{(i)}\right)}}^{\color{black}\delta^{(i)}}\color{mediumred-violet}{\underbrace{\color{black}\left[\int_{T^{(i)}_{\text{obs}}}^\infty f_{T_E, T_C | X}(t_e,\, T^{(i)}_{\text{obs}}\, |\, X^{(i)})\,dt_e\right]}_{\color{mediumred-violet} \Pr\left(T_C = T^{(i)}_{\text{obs}},\, T_E > T^{(i)}_{\text{obs}}\, |\, X^{(i)}\,\right)}}^{\color{black}1-\delta^{(i)}}\color{black}
\label{eq:generallikelihood}
\end{equation}

\subsection{The Right-Censored Log-Likelihood Under Conditional Independence}
Under the assumption that $T_E \perp T_C | X$, we can factorize the conditional density distributions in Equation \ref{eq:generallikelihood}. $f_{T_E,T_C|X}$ factorizes into $f_{T_E|X}f_{T_C|X}$.
\begin{align}
\mathcal{L}(\mathcal{D}) &= \prod_{i=1}^N \left[f_{T_E|X}(T^{(i)}_\text{obs} | X^{(i)})\int_{T^{(i)}_\text{obs}}^\infty f_{T_C | X}(t_c | X^{(i)})dt_c\right]^{\delta^{(i)}} \left[f_{T_C|X}(T^{(i)}_\text{obs} | X^{(i)})\int_{T^{(i)}_\text{obs}}^\infty f_{T_E | X}(t_e | X^{(i)})dt_e\right]^{1-\delta^{(i)}}\\
&= \prod_{i=1}^N \left[f_{T_E|X}(T^{(i)}_\text{obs} | X^{(i)})\left(1-F_{T_C|X}(T^{(i)}_\text{obs}|X^{(i)})\right)\right]^{\delta^{(i)}} \left[f_{T_C|X}(T^{(i)}_\text{obs} | X^{(i)})\left(1-F_{T_E|X}(T^{(i)}_\text{obs}|X^{(i)})\right)\right]^{1-\delta^{(i)}}\\
&= \prod_{i=1}^N \left[f_{T_E|X}(T^{(i)}_\text{obs} | X^{(i)})S_{T_C|X}(T^{(i)}_\text{obs}|X^{(i)})\right]^{\delta^{(i)}} \left[f_{T_C|X}(T^{(i)}_\text{obs} | X^{(i)})S_{T_E|X}(T^{(i)}_\text{obs}|X^{(i)})\right]^{1-\delta^{(i)}}\\
\therefore \quad \ell(\mathcal{D}) &= \sum_{i=1}^N \delta^{(i)}\log\left[f_{T_E|X}(T^{(i)}_\text{obs} | X^{(i)})\right] + \delta^{(i)} \log \left[S_{T_C|X}(T^{(i)}_\text{obs}|X^{(i)})\right] + (1-\delta^{(i)}) \log
\left[f_{T_C|X}(T^{(i)}_\text{obs} | X^{(i)})\right] + \nonumber\\
& \qquad\quad(1-\delta^{(i)})\log\left[S_{T_E|X}(T^{(i)}_\text{obs}|X^{(i)})\right]
\label{eq:surv_indep}
\end{align}

\subsection{The Right-Censored Log-Likelihood Under Dependence Defined by a Copula}

\subsubsection{Proof of Lemma 1}

\begin{lemma}[Conditional Survival Function Under Sklar's Theorem (Survival)]
\label{lemma:copula-conditional}
If $S_{T_E, T_C | X}(t_e, t_c | x) = \left.C(u_1, u_2)\middle|_{\substack{{u_1=S_{T_E|X}(t_e|x)}\\ {u_2=S_{T_C|X}(t_c|x)}}}\right.$, then, 
\begin{align}
    \int_{t_c}^\infty f_{T_C | T_E, X}(t_c | t_e, x)\,dt_c &= \frac{\partial}{\partial u_1} \left.C(u_1, u_2)\middle|_{\substack{{u_1=S_{T_E|X}(t_e|x)}\\ {u_2=S_{T_C|X}(t_c|x)}}}\right.
\end{align}
\end{lemma}
\begin{proof}
\begin{align}
    \int_{t_c}^\infty f_{T_C | T_E, X}(t_c | t_e, x)\,dt_c &= \frac{\int_{t_c}^\infty f_{T_C, T_E | X}(t_c, t_e | x) dt_c}{f_{T_E|X}(t_e|x)} && \text{(Def'n of Cond. Prob.)}\\
    &= \frac{\frac{-\partial}{\partial T_E} \int_{t_e}^\infty \int_{t_c}^\infty f_{T_C, T_E | X}(t_c, t_e | x) dt_c dt_e}{f_{T_E|X}(t_e|x)}\\
    &= \frac{\frac{-\partial}{\partial T_E} S_{T_C, T_E | X}(t_c, t_e | x)}{f_{T_E|X}(t_e|x)} &&\text{(Def'n of Survival Function)}\\
    &= \frac{\frac{-\partial}{\partial T_E} \left(C(u_1, u_2)\middle|_{\substack{{u_1=S_{T_E|X}(t_e|x)}\\ {u_2=S_{T_C|X}(t_c|x)}}}\right)}{f_{T_E|X}(t_e|x)} &&\text{(Sklar's Theorem)}\\
    &= \frac{\frac{-\partial}{\partial u_1} \left(C(u_1, u_2)\middle|_{\substack{{u_1=S_{T_E|X}(t_e|x)}\\ {u_2=S_{T_C|X}(t_c|x)}}}\right)\frac{\partial}{\partial T_E}S_{T_E|X}(t_e|x)}{f_{T_E|X}(t_e|x)} &&\text{(Chain Rule)}\\
    &= \frac{-\partial}{\partial u_1} \left(C(u_1, u_2)\middle|_{\substack{{u_1=S_{T_E|X}(t_e|x)}\\ {u_2=S_{T_C|X}(t_c|x)}}}\right)\cancelto{-1}{\frac{-f_{T_E|X}(t_e|x)}{f_{T_E|X}(t_e|x)}}\\
    &= \frac{\partial}{\partial u_1} \left(C(u_1, u_2)\middle|_{\substack{{u_1=S_{T_E|X}(t_e|x)}\\ {u_2=S_{T_C|X}(t_c|x)}}}\right)
\end{align}
\end{proof}

\textit{Corollary.} We can symmetrically apply this lemma to the converse case, $f_{T_E|T_C,X}$, to obtain:
\begin{equation}
    \int_{t_e}^\infty f_{T_E | T_C, X}(t_e | t_c, x)\,dt_e = \frac{\partial}{\partial u_2} \left(C(u_1, u_2)\middle|_{\substack{{u_1=S_{T_E|X}(t_e|x)}\\ {u_2=S_{T_C|X}(t_c|x)}}}\right)
\end{equation}

\subsubsection{Derivation of the Right-Censored Log Likelihood Under a Copula}

Having now proven Lemma \ref{lemma:copula-conditional}, we apply it to derive a likelihood function for survival prediction under dependent censoring. We use Equation \ref{eq:generallikelihood} as the starting point for our derivation.



\begin{align}
\mathcal{L}(\mathcal{D}) = \prod_{i=1}^N & \left[\int_{T^{(i)}_\text{obs}}^\infty f_{T_E, T_C | X}(T^{(i)}_\text{obs}, t_c | X^{(i)})\,dt_c\right]^{\delta^{(i)}} \left[\int_{T^{(i)}_\text{obs}}^\infty f_{T_E, T_C | X}(t_e, T^{(i)}_\text{obs} | X^{(i)})\,dt_e\right]^{1-\delta^{(i)}}\\
= \prod_{i=1}^N &\left[f_{T_E|X}(T^{(i)}_\text{obs}| X^{(i)})\int_{T^{(i)}_\text{obs}}^\infty f_{T_C | T_E, X}(t_c | T^{(i)}_\text{obs}, X^{(i)})\,dt_c\right]^{\delta^{(i)}}\times &&\text{(Chain Rule)}\\
&\left[f_{T_C|X}(T^{(i)}_\text{obs}| X^{(i)})\int_{T^{(i)}_\text{obs}}^\infty f_{T_E | T_C, X}(t_e | T^{(i)}_\text{obs}, X^{(i)})\,dt_e\right]^{1-\delta^{(i)}}\nonumber\\
= \prod_{i=1}^N &\left[f_{T_E|X}(T^{(i)}_\text{obs}| X^{(i)})\frac{\partial}{\partial u_1}\left(C(u_1, u_2)\middle\vert_{\substack{u_1 = S_{T_E|X}(T^{(i)}_\text{obs}|X^{(i)})\\u_2 = S_{T_C|X}(T^{(i)}_\text{obs}|X^{(i)})} }\right)\right]^{\delta^{(i)}}\times&&\text{(Lemma \ref{lemma:copula-conditional})}\\
&\left[f_{T_C|X}(T^{(i)}_\text{obs}| X^{(i)})\frac{\partial}{\partial u_2}\left(C(u_1, u_2)\middle\vert_{\substack{u_1 = S_{T_E|X}(T^{(i)}_\text{obs}|X^{(i)})\\u_2 = S_{T_C|X}(T^{(i)}_\text{obs}|X^{(i)})} }\right)\right]^{1-\delta^{(i)}}\nonumber\\
\therefore \quad \ell(\mathcal{D}) = \sum_{i=1}^N &\delta^{(i)}\log \left[f_{T_E|X}\left(T^{(i)}_\text{obs}| X^{(i)}\right)\right] + \delta^{(i)} \log \left[\frac{\partial}{\partial u_1}C(u_1, u_2)\middle\vert_{\substack{u_1 = S_{T_E|X}(T^{(i)}_\text{obs}|X^{(i)})\\u_2 = S_{T_C|X}(T^{(i)}_\text{obs}|X^{(i)})} }\right] +\\
&(1-\delta^{(i)})\log \left[f_{T_C|X}\left(T^{(i)}_\text{obs}| X^{(i)}\right)\right] + \nonumber\\
&(1-\delta^{(i)}) \log \left[\frac{\partial}{\partial u_2}C(u_1, u_2)\middle\vert_{\substack{u_1 = S_{T_E|X}(T^{(i)}_\text{obs}|X^{(i)})\\u_2 = S_{T_C|X}(T^{(i)}_\text{obs}|X^{(i)})} }\right]\nonumber
\end{align}

% \begin{lemma}[\cite{nelsen2007introduction}]
% For a $S_{T_1, ..., T_m}(t_1, ..., t_m) = C(S_{T_1}(t_1), ..., S_{T_m}(t_m))$
% \end{lemma}

% \subsection{The Right-Censored Log-Likelihood Under Dependence with Administrative Censoring}
% \coopermj{Discuss the administrative censoring parameter.}


\subsection{The Weibull CoxPH Model}
Recall that the Weibull CoxPH model is defined in terms of its hazard, as follows.
\begin{equation}
\label{eq:model-hazard}
h_{T|X}(\,t|X\,)\ =\ \left(\frac{\nu}{\rho}\right)\left(\frac{t}{\rho}\right)^{\nu-1} \exp\left(\,g_\psi(X)\,\right)
\end{equation}

Our method, however, relies on the ability to extract additional quantities -- the density ($\hat{f}_{T|X}$) and survival functions ($\hat{S}_{T|X}$) -- from the model, as these are essential to computing our likelihood function. In this section, we derive the closed-form expressions for these two quantities that are present in the main body of our work.


\subsubsection{The Survival Function}

The survival function under our model can be derived via its cumulative hazard.

\begin{definition}[Cumulative Hazard]
The \textit{cumulative hazard}
\begin{equation}
\hat{H}_{T|X}(t|X)\ \triangleq\ \int_{0}^t \hat{h}_{T|X}(u|X)du
\end{equation}
represents the integral of the hazard function over all time prior to a specified time, $t$.
\end{definition}

The cumulative hazard of the Weibull CoxPH can be expressed in closed form as follows:
\begin{align}
\label{eq:model-cum-hazard}
\hat{H}_{T|X}(\,t|X\,)\ &=\ \int_{0}^{t} \left(\frac{\nu}{\rho}\right)\left(\frac{u}{\rho}\right)^{\nu-1} \exp\left(\,g_\psi(X)\,\right)\,du \\
&= \ \left(\frac{t}{\rho}\right)^{\nu} \exp\left(\,g_\psi(X)\,\right)
\label{eq:cumhazard_closedform}
\end{align}
One alternative formulation of the survival function expresses $S_{T|X}$ in terms of the cumulative hazard function, as follows.
\begin{align}
\label{eq:model-survival}
S_{T|X}(t|X)\ \triangleq\ \exp(-H_{T|X}(t|X))
\end{align}
We can apply this identity to Equation \ref{eq:cumhazard_closedform} to obtain the following expression for $\hat{S}_{T|X}$ under the Weibull CoxPH model:
\begin{align}
\hat{S}_{T|X}(t|X)= \exp\left(-\left(\frac{t}{\rho}\right)^{\nu} \exp\left(\,g_\psi(X)\,\right)\right)
\end{align}
\subsubsection{The Density Function}\label{sec:density}
From Equation 3, we know that the density of an event can be calculated as follows.
\begin{equation}
\label{eq:model-density}
f_{T|X}(t|X)\ =\ S_{T|X}(t|X) h_{T|X}(t|X)
\end{equation}

\subsection{A Stable Implementation}\label{sec:stable-implementation}
In order to optimize a Weibull model in a stable way, we used another representation of the Weibull distribution. This new representation is derived by applying a log transformation to the cumulative hazard function of the Weibull distribution.
\begin{equation}
    \begin{aligned}
         H_{T|X}(\,t|X\,) &= \exp(\log(H_{T|X}(t|X)))\\
    &=\exp\left(\log\left(\left(\frac{t}{\rho}\right)^{\nu} \exp(g_\psi(X))\right)\right)\\
    &=\exp(\nu\log(t) - \nu\log(\rho) + g_\psi(X))
    \end{aligned}
\end{equation}

Setting $\sigma = \frac{1}{\nu}$, $\mu = \log(\rho)$, and $f(x) = -\frac{g_\psi(X)}{\nu}$ gives us a cumulative hazard function of the following (log-linear) form.
    
\begin{equation}
\label{eq:model-cum-hazard_stable}
H_{T|X}(\,t|X\,)\ =\ \exp\left(\frac{\log(t) - \mu - f(x)}{\sigma}\right)
\end{equation}

\subsubsection{Hazard function}
Given the formula for the cumulative hazard function, we can derive the hazard function in the new format by taking the derivative of the cumulative hazard with respect to $t$.
\begin{equation}
    h_{T|X}(\,t|X\,)=\ \frac{\partial H_{T|X}(\,t|X\,)}{\partial t} =\  \frac{H_{T|X}(\,t|X\,)}{t\sigma}
\end{equation}

% \section{Proofs}

% \subsection{Proof of Theorem}

% \textit{Unbiased estimate of the fully-observed case.}

% \begin{definition}[Fully-Observed Dataset]
%     For a given survival dataset $\mathcal{D} = \{X^{(i)}, T^{(i)}, \delta^{(i)}\}_{i=1}^N$, where $T^{(i)}$ is the minimum of latent variables $T_E^{(i)}, T_C^{(i)}$, its \textit{fully-observed counterpart} $\mathcal{D}_{\textit{do}(\delta = 1)}$ is,
%     \begin{equation}
%         \mathcal{D}_{\textit{do}(\delta = 1)} = \{X^{(i)}, T_E^{(i)}, 1\}_{i=1}^N
%     \end{equation}
% \end{definition}

% The fully-observed dataset represents the version of $\mathcal{D}$ that would have taken place had none of the instances in $\mathcal{D}$ been right-censored. 

% \textbf{Remark.} $\mathcal{D}_{\textit{do}(\delta = 1)}$ is a counterfactual object, and as such, is typically not provided by real-world (observational) survival data in which right-censoring is present. Despite this, it is a useful object of study in proving results regarding the bias of survival metrics under different forms of censoring.

% \begin{definition}[Survival Metric]
%     Given a survival model $\mathcal{M}$, and a survival dataset $\mathcal{D} = \{X^{(i)}, T^{(i)}, \delta^{(i)}\}_{i=1}^N$, a \textit{survival metric} is any function of the form $\mathcal{C}: \mathcal{M} \times \mathcal{X} \times \mathbb{R}_+ \times \{0,1\} \rightarrow \mathbb{R}$.
% \end{definition}

% \begin{definition}[Unbiased Metric]
%     A survival metric $\mathcal{C}$ is said to be \textit{unbiased over the fully-observed} $\mathcal{D}$ if, for any model $\mathcal{M}$,
%     \begin{equation}
%         \mathbb{E}[\mathcal{C}(\mathcal{M}, \mathcal{D})] = \mathbb{E}[\mathcal{C}(\mathcal{M}, \mathcal{D}_{\textit{do}(\delta = 1)})]
%     \end{equation}
% \end{definition}



% \begin{theorem}[Unbiased Under CCAR]
%     Under Censoring-Completely-At-Random, a survival metric $\mathcal{C}$ is unbiased.
% \end{theorem}
% \begin{proof}
%     Censoring-Completely-At-Random implies that $T_E \perp T_C$. 
% \end{proof}

% \begin{theorem}[Adjustments for CAR]
%     Under Censoring-Completely-At-Random, 
% \end{theorem}



% \begin{theorem}[Impossibility of IPSW Under CNAR]
    
% \end{theorem}

% \subsection{Improper Scoring Rules}

% In this section, we provide a series of results that comment on the propriety of different scoring rules under dependent censoring. These will serve to justify our use of the Survival-$\ell_1$ (in the case of synthetic data) and the $R^2$ metric (in the case of semi-synthetic data) over conventional metrics like the time-dependent concordance index.

% The proof in this section bears similarity to those of \cite{rindt2022survival}, though crucially, where \cite{rindt2022survival} only considers conditionally independent censoring, we extend these results to the case where conditional independence does not hold. To our knowledge, no prior work has yet generalized these results to the case of dependent censoring.

% \begin{definition}[Scoring Rule]
% Given a probability distribution $S \in \mathcal{S}$, and a collection of observations $y \in \mathcal{Y}$, a scoring rule $\mathcal{C}(S, y)$ provides some means of quantifying how well the distribution fits the observed samples. Formally, a scoring rule is defined as a function of the following form:
% \begin{equation}
%     \mathcal{C}(S, y) : \mathcal{S} \times \mathcal{Y} \rightarrow \mathbb{R}_+
% \end{equation}
% The value returned by the scoring rule for a given tuple $(S, y)$ indicates how well $S$ fits $y$ under the criteria of the rule: a large value indicates a good fit, while a small value indicates a poor fit.

% As an example, the time-dependent concordance index (TDCI) is a scoring rule. $\mathcal{C}_{\text{TDCI}}$ accepts as input a distribution, $\Pr(T_E|X)$, and a collection of times of event and times of censorship. As output, the TDCI produces a real-valued output that is larger when $\Pr(T_E|X)$ is a better predictor of the rank-ordering of event times observed in the samples, and smaller when $\Pr(T_E|X)$ is a worse predictor of this ranking.

% In survival analysis, we typically seek to estimate the survival function, $S_{T_E|X}$, while our observations consist of $(T, E)$-tuples representing times-of-event/censorship, and censorship indicators.

% \end{definition}

% \begin{definition}[Proper Scoring Rule] A scoring rule is called proper if the maximal possible score is achieved, in expectation, by the true underlying distribution $S$ from which the samples $y$ are drawn. Formally, a rule $\mathcal{C}$ is proper scoring rule if it satisfies the following criterion:
% \begin{equation}
%     \sup_{\hat{S} \in \mathcal{S}} 
%     \mathbb{E}_{y \sim S} \left[\mathcal{C}(\hat{S}, y)\right] = \mathbb{E}_{y \sim S} \left[\mathcal{C}(S, y)\right]
% \end{equation}
% \end{definition}

% This is a desirable property, as a non-proper scoring rule may provide misleading information about the extent to which the model approximates the data-generating distribution. Especially in our case, where we wish to obtain a means of quantifying bias in survival curves, we require a score that is minimized when our model most closely approximates the data-generating survival distribution.

% % \subsubsection{Time-Independent Concordance is Not Proper Under Dependent Censoring}

% \subsubsection{Time-Dependent Concordance is Not Proper Under Dependent Censoring}

% \begin{theorem}[]
% Let $\mathcal{C}_{\text{TD}}: \mathcal{S} \times \mathbb{R}^n \rightarrow \mathbb{R}_{[0, 1]}$ represent the time-dependent concordance index. Let $S$ be the survival function associated with the data-generating process. Then, we show that there exists some $\hat{S} \neq S$ for which:
% \begin{equation}
% \mathbb{E}_{y \sim S} \left[\mathcal{C}_{\text{TD}}(S, y)\right] < \mathbb{E}_{y \sim S} \left[\mathcal{C}_{\text{TD}}(\hat{S}, y)\right]
% \end{equation}
% \label{thm:tdci_improper}
% \end{theorem}
% \begin{proof}
% By counterexample. Consider the following data-generating process, conditioned on a user-specified indicator, $\mathbb{I}_{\text{dep}}$, which is set to 1 if there is a dependence between event and censoring times, and 0 otherwise.
% \begin{align}
%     K &\sim \text{Categorical}_{k_1, \dots, k_5 = 0.2}(\{0, 2, 4, 6, 8\})\\
%     U &\sim \mathcal{U}_{[0, 1]}\\
%     X &\sim \text{Bernoulli}(k = 0.5)\\
%     T_E &\sim \begin{cases} K + U \qquad\,\,\,\, X = 0\\ K + U + 1 \quad X = 1\end{cases}\\
%     T_C &\sim \begin{cases} \text{Exponential}(\lambda = 0.2) \quad \mathbb{I}_{\text{dep}} = 0\\ \mathcal{N}\left(\mu=T_E, \sigma = 0.3\right) \quad \mathbb{I}_{\text{dep}} = 1\end{cases}
% \end{align}

% Define $S = \Pr\left(T_E|X\right)$ to be the empirical survival function over draws from this data-generating process. This process represents two modifications to the process presented in Appendix A.1 of \cite{rindt2022survival}: first, changing the mean of the censoring distribution when $\mathbb{I}_{\text{dep}} = 0$, and second, adding the piecewise censorship function that induces dependence conditional on the binary variable $\mathbb{I}_\text{dep}$. By centering the censorship time around the event time in the case where $\mathbb{I}_\text{dep} = 1$, it is clear that we induce a direct dependency between the event and censoring times: this dependence can be visualized in the second row, third column of Figure \ref{fig:c_index_proof}.

% Now consider the following fake data-generating process.
% \begin{align}
%     K &\sim \text{Categorical}_{k_1, \dots, k_5 = 0.2}(\{0, 2, 4, 6, 8\})\\
%     U &\sim \mathcal{U}_{[0, 0.2]}\\
%     X &\sim \text{Bernoulli}(k = 0.5)\\
%     T_E &\sim \begin{cases} K + U + 2 \quad X = 0\\ K + U + 1 \quad X = 1\end{cases}\\
%     T_C &\sim \begin{cases} \text{Exponential}(\lambda = 0.2) \quad \mathbb{I}_{\text{dep}} = 0\\ \mathcal{N}\left(\mu=T_E, \sigma = 0.3\right) \quad \mathbb{I}_{\text{dep}} = 1\end{cases}
% \end{align}

% Define $\hat{S} = \Pr\left(T_E|X\right)$ to be the empirical survival function over 50,000 draws from this data-generating process. We will show that $\hat{S}$ obtains a higher time-dependent concordance index than $S$ on samples drawn from the true data-generating process.

% \begin{figure}[h]
% \includegraphics[width=\textwidth]{figures/c_index_proof.png}
% \caption{A figure showing the event and censoring survival functions, corresponding frequency histograms for events/censored occurrences, and associated time-dependent concordance index values associated with the real and fake data-generating processes. The left two columns are largely a reproduction of Figure 1 in \citet{rindt2022survival}, while the right two columns showcase that the same result holds under dependent censoring.}
% \label{fig:c_index_proof}
% \end{figure}

% The first row of Figure \ref{fig:c_index_proof} shows the survival curves associated with the true and fake data generating processes under independent censoring (left two columns) and dependent censoring (right two columns). The second row of the figure shows a frequency histogram associated with different outcomes experienced under each data-generating process: in the left two plots of this row, we can see that censoring has no visible relationship with event time, while in the right two plots, there is a clear clustering of censoring instances around times of event. The third row shows the TDCI of samples drawn from the true data-generating process, evaluated using the survival curves in the first row. Note that in both cases of independent and dependent censoring, the TDCI of the fake survival functions are substantially higher than those of the true survival functions, proving the theorem.
% \end{proof}

% % \subsubsection{The Brier Score is Not Proper Under Dependent Censoring}

% % Here, we provide a discussion of the proof given in \cite{rindt2022survival} explaining why the Brier Score is not a proper scoring rule under dependent censoring.

% % Criterion for the Brier Score to be proper: $T_C \perp X$. Requires censoring completely at random.

% \subsection{Proper Scoring Rules}

% \subsubsection{The Survival-$\ell_1$ is Proper Under Dependent Censoring}

% $\mathcal{C}_{\text{Survival-}\ell_1} : \mathcal{S} \times \mathcal{S} \rightarrow \mathbb{R}_{[0, 1]}$ represent our Survival-$\ell_1$ metric. Strictly speaking, the $\mathcal{C}_{\text{Survival-}\ell_1}$ is a score that operates on two survival distributions, rather than on a distribution and a collection of samples, so the standard definition of a proper scoring rule does not directly apply. Instead, we can define the notion of a \textit{distributionally proper scoring rule}, which extends the definition of a proper scoring rule to functionals like the $\mathcal{C}_{\text{Survival-}\ell_1}$.

% \begin{definition}[Distributionally Proper Scoring Rule]
%     Let $\mathcal{S}$ represent some space of distributions, which is assumed to contain the distribution $S$. A scoring rule $\mathcal{C}: \mathcal{S} \times \mathcal{S} \rightarrow \mathbb{R}_+$ is called proper if it satisfies the following criterion:
%     \begin{equation}
%         \sup_{\hat{S} \in \mathcal{S}} \mathcal{C}(\hat{S}, S) = \mathcal{C}(S, S)
%     \end{equation}
% \end{definition}

% Because, unlike $C_{\text{TDCI}}$, a \textit{lower} $\mathcal{C}_{\text{Survival-}\ell_1}$ indicates a better fit of the data-generating distribution, we instead prove the converse: that the true data-generating distribution always yields the \textit{lowest} possible $\mathcal{C}_{\text{Survival-}\ell_1}$ score of all distributions in the space $\mathcal{S}$.

% \begin{theorem}
% The $\mathcal{C}_{\text{Survival-}\ell_1}$ is a distributionally proper scoring rule.
% \end{theorem}
% \begin{proof}
%     Consider a single patient, $i$. If $S(t|X^{(i)}) = \hat{S}(t|X^{(i)})$, then the $\ell_1$ distance between the curves at any point in time is 0, giving us a total $\mathcal{C}_{\text{Survival-}\ell_1}$ score of 0. Additionally, as $\mathcal{C}_{\text{Survival-}\ell_1}$ is comprised of the (weighted) area between two curves, it must be nonnegative. Therefore, when $S(t|X^{(i)}) = \hat{S}(t|X^{(i)})$, the score achieves its lower bound, which makes it a distributionally proper scoring rule.
% \end{proof}

\section{Algorithms}


\subsection{Computing the Survival-$\ell_1$}

Here, we expand on the computation of the Survival-$\ell_1$ metric from the main paper by providing an algorithm for the explicit computation of the inner term of the Survival-$\ell_1$ metric, as well as the value $T_{\text{max}}$ for the given pair of survival curves, $S, \hat{S}$:

\begin{align}
\mathcal{C}_{\textit{Survival-}\ell_1}(S, \hat{S})\quad =\quad \sum_{i=1}^N &\frac{1}{N \times T_{\text{max}}^{(i)}} \underbrace{\int_{0}^{\infty} \left|S_{T\,|\,X}(t\,|\,X^{(i)}) - \hat{S}_{T\,|\,X}(t\,|\,X^{(i)})\right| dt}_{\text{Inner Term}}\nonumber
\end{align}

Although the integral in the $\mathcal{C}_{\textit{Survival-}\ell_1}$ is over an infinite domain, in this approximation, we consider only the simplified case wherein the upper bound of integration is $T_\text{max}$.

\RestyleAlgo{ruled}
\SetKwComment{Comment}{$\ $\# }{ }
\SetKwComment{Commentt}{\# }{ }
\setcounter{algocf}{2}
{
\begin{algorithm}[H]
\KwIn{
\begin{enumerate}
    \item $S_1, S_2$: Survival curves to compare under the Survival-$\ell_1$ metric. Here, we assume $S_1$ is the ground-truth survival curve, and $S_2$ is the estimated curve.
    \item $Q_{\lVert\cdot\rVert}$: Normalizing quantile.
    \item $N_\text{steps}$: Number of discretization steps.
\end{enumerate}
}
\KwResult{
\begin{enumerate}
    \item $\Delta_{\text{total}}$: a discretized approximation of the integral $\int_{0}^{T_{\text{max}}} \left|S_{1}(t\,|\,X^{(i)}) - {S}_{2}(t\,|\,X^{(i)})\right| dt$.
    \item $T_\text{max}$: This is used as a normalization weight when computing the full expression for the Survival-$\ell_1$ metric.
\end{enumerate}
}
\hrulefill\\
$T_\text{max} \gets {S}^{-1}_{1}\left(Q_{\lVert\cdot\rVert}\right)$\;
$\Delta_\text{total} \gets 0$\\
\For{$i = 1,\, ...\,,\, N_\text{steps}$}{
    $\Delta_{i;S_1,S_2} \gets \frac{T_{\text{max}}}{N_{\text{steps}}} \times \ell_1\left[{S_1\left(\frac{i \times T_{\max}}{N_{\text{steps}}}\right)}, {S_2\left(\frac{i \times T_{\max}}{N_{\text{steps}}}\right)}\right]$\;
    $\Delta_{\text{total}} \gets \Delta_{\text{total}} + \Delta_{i;S_1,S_2}$\;
}
\Return{$\Delta_{\text{total}}, T_{\text{max}}$}
\caption{Discrete Approximation of the Inner Term of the Survival-$\ell_1$}
\label{alg:optimization}
\end{algorithm}
}
\vspace*{\fill}

\subsection{Creating a Semi-Synthetic Dataset with Dependent Censoring}
We convert a regression dataset to a survival dataset with dependent censoring using the following algorithm.


\RestyleAlgo{ruled}
\SetKwComment{Comment}{$\ $\# }{ }
\SetKwComment{Commentt}{\# }{ }
\SetKwInOut{Dependencies}{Dependencies}
{
\begin{algorithm}[H]
\label{alg:datagenerating}
\KwIn{
\begin{enumerate}
    \item $\mathcal{D}_{\text{reg}} = \left\{X^{(i)}, Y^{(i)}\right\}_{i=1}^N \subseteq \mathcal{X} \times \mathbb{R}_+$. Regression dataset consisting of covariates and labels.
    \item $C_\theta: [0,1] \times [0,1] \rightarrow [0,1]$. A bivariate, uniparametric copula.
\end{enumerate}
}
\KwResult{
\begin{enumerate}
    \item $\mathcal{D}_{C, \theta} \subseteq \mathcal{X} \times \mathbb{R}_+ \times \{0,1\}$. Artificially censored version of $\mathcal{D}_{\text{reg}}$ in which the joint distribution between $Y$ and $T_C$ is governed by the application of Sklar's Theorem to the copula $C_\theta$.
\end{enumerate}
}

\hrulefill\\

\Comment{Learn a Weibull CoxPH model based on the outcomes of the train set without any censoring}
${W}_E \gets \texttt{Weibull-Linear}(Y, X, \textbf{1}^N)$\;
${W}_C \gets {W}_E$\;

${{W}_C}.\nu \gets {{W}_C}.\nu / 0.6$ \Comment{Decreases the variance of the censoring distribution}
$T_{C} \gets \textbf{0}^N$\;
$\mathcal{D}_{C, \theta} \gets \emptyset$\;
\For{$i = 1,\, ...\,,\, N$}{
    $u_1^{(i)} \gets \hat{S}_{W_E}(Y^{(i)}); $\Comment{Obtain event quantile}
    $u_2^{(i)} \sim C_\theta(\cdot \,\mid\, u_1^{(i)}); $\Comment{Sample censoring quantile conditionally from the copula}
    $T_C^{(i)} \gets \hat{S}_{W_C}^{-1}(u_2^{(i)})$; \Comment{Obtain censoring time via inv. censoring survival function}
    $\mathcal{D}_{C, \theta} \gets \mathcal{D}_{C, \theta}\, \cup\, \{(X^{(i)}, \min\left(Y^{(i)}, T_C^{(i)}\right), \mathbbm{1}[Y^{(i)} \leq T_C^{(i)}])\}$\;
}
\Return{$\mathcal{D}_{C, \theta}$}\;
\caption{Semi-Synthetic Dataset Construction with Dependent Censoring}
\label{alg:semi-synthetic}
\end{algorithm}
}



% \subsection{Creating a Semi-Synthetic Dataset with Nonparametric Marginals Under an Assumed Copula}


% \RestyleAlgo{ruled}
% \SetKwComment{Comment}{$\ $\# }{ }
% \SetKwComment{Commentt}{\# }{ }
% \SetKwInOut{Dependencies}{Dependencies}
% {
% \begin{algorithm}[H]
% \label{alg:datagenerating}
% \KwIn{
% \begin{enumerate}
%     \item $\mathcal{D}_{\text{reg}} = \left\{X^{(i)}, Y^{(i)}\right\}_{i=1}^N \subseteq \mathcal{X} \times \mathbb{R}_+$. Regression dataset consisting of covariates and labels.
%     \item $C_\theta: [0,1] \times [0,1] \rightarrow [0,1]$. A bivariate, uniparametric copula.
%     \item $f_C: \mathcal{X} \rightarrow \mathbb{R}_+$. A censoring function that computes a time of censorship for each sample.
% \end{enumerate}
% }
% \KwResult{
% \begin{enumerate}
%     \item $\mathcal{D}_{C, \theta} \subseteq \mathcal{X} \times \mathbb{R}_+ \{0,1\}$. Artificially censored version of $D_\text{reg}$ in which the joint distribution between $Y$ and $T_C$ is governed by the application of Sklar's Theorem to the copula $C_\theta$ and nonparametric (Kaplan-Meier) marginal distributions.
% \end{enumerate}
% }
% \Dependencies{
% \begin{enumerate}
%     \item \texttt{Kaplan-Meier} \citep{kaplan1958nonparametric}: $\mathbb{R}_+^N \times \{0,1\}^N \rightarrow \mathcal{S}$. Nonparametric estimator that returns an invertible, callable survival function, $\hat{S}: \mathbb{R_+} \rightarrow [0, 1]$.
% \end{enumerate}

% }
% \hrulefill\\

% \Comment{Construct nonparametric estimates of the marginal event and censoring distributions}
% $\hat{S}_E \gets \texttt{Kaplan-Meier}(Y, \textbf{1}^N)$\;

% $T_{(C, \perp)} \gets \textbf{0}^N$\;
% \For{$i = 1,\, ...\,,\, N$}{
%     $T_{(C, \perp)}^{(i)} \gets f_C\left(X^{(i)}\right)$\;
% }
% $\hat{S}_{(C, \perp)} \gets \texttt{Kaplan-Meier}(T_{(C, \perp)}, \textbf{1}^N)$\;
% \Comment{Join the marginal event and censoring distributions with the copula}
% $T_{C} \gets \textbf{0}^N$\;
% $\mathcal{D}_{C, \theta} = \emptyset$\;
% \For{$i = 1,\, ...\,,\, N$}{
%     $u_1^{(i)} \gets \hat{S}_E\left(Y^{(i)}\right); $\Comment{Obtain event quantile}
%     $u_2^{(i)} \sim C_\theta\left(\cdot \,\middle\mid\, u_1^{(i)}\right); $\Comment{Sample censoring quantile conditionally from the copula}
%     $T_C^{(i)} \gets \hat{S}_{(C, \perp)}^{-1}\left(u_2^{(i)}\right)$; \Comment{Obtain censoring time via inv. censoring survival function}
%     $\mathcal{D}_{C, \theta} \gets \mathcal{D}_{C, \theta}\, \cup\, \left\{\left(X^{(i)}, \min\left(Y^{(i)}, T_C^{(i)}\right), \mathbbm{1}\left[Y^{(i)} \leq T_C^{(i)}\right]\right)\right\}$\;
% }
% \Return{$\mathcal{D}_{C, \theta}$}\;
% \caption{Semi-Synthetic Dataset Construction with Dependent Censoring \coopermj{Hyperlink to implementation notebook once uploaded}}
% \end{algorithm}
% }


% \coopermj{Inducing dependency with nonparametric marginals.}
\vspace*{\fill}

\section{Additional Experimental Details}
\subsection{Evaluation Metric Bias Under Dependence}
For this experiment, we sampled 10,000 data points according to Algorithm \ref{alg:datagenerating} with $X \in \mathbb{R}^{N \times 10} \sim \mathcal{U}_{[0,1]}$, $\nu_E^* = 4, \rho_E^* = 17, \psi_E^*(X) = X_{1}^{2}+X_{2}^{2}$, $\nu_C^* = 3, \rho_C^* = 16, \psi_C^*(X) = \sum_{i=1}^{3}\beta_{C_{i}}X_{i}^{2}$, where $ \beta_C \in [0,1]^{10} \sim \mathcal{U}_{[0,1]}$.

\subsection{Implementation Details}
We halted the learning algorithms if the validation loss failed to improve for 3000 consecutive epochs. The \texttt{Linear-Risk} experiments were conducted without any form of regularization, whereas the \texttt{Nonlinear-Risk} experiments employed $\ell_2$ regularization with a coefficient of $\lambda=0.001$. For all experiments, the learning rate remained constant at $0.001$.
\section{Datasets and Processing}

% \subsection{Synthetic Datasets}

\subsection{Steel Industry Energy Consumption (\texttt{STEEL}) Dataset}

The \texttt{STEEL} dataset \citep{ve2021efficient, sathishkumar2020energy, sathishkumar2020industry} is a regression dataset from the UCI Machine Learning Repository \citep{asuncion2007uci}, comprising 35,040 observations of the power consumption of plants run by DAEWOO Steel Co. Ltd in Gwangyang, South Korea. The data includes 9 covariates (including day of the week, type of load (light/medium/heavy), CO$_2$ measurements in PPM, and leading/lagging reactive power measurements), and one outcome variable (the industry energy consumption, measured in kWh). For our semi-synthetic experiment, we used $70\%$ of the data as the train set, $15\%$ as the validation set, and $15\%$ as the test set.

\subsection{Airfoil Self-Noise (\texttt{AIRFOIL}) Dataset}
The \texttt{AIRFOIL} dataset \citep{Dua:2019} is another regression dataset from the UCI Machine Learning Repository \citep{asuncion2007uci}. It comprises 1,503 observations obtained from aerodynamic and acoustic tests of two- and three-dimensional airfoil blade sections conducted in an anechoic wind tunnel. The data includes 6 covariates (including frequency, angle of attack, chord length, free-stream velocity, suction side displacement thickness) and one outcome variable (scaled sound pressure level). For our semi-synthetic experiment, we used $70\%$ of the data as the train set, $15\%$ as the validation set, and $15\%$ as the test set.


\section{Additional Results}
For the experiments in this section, we used a Clayton copula to censor the dataset as described in Algorithm~\ref{alg:semi-synthetic}.
\subsection{Semi-Synthetic Survival Regression on the \texttt{STEEL} Dataset}

Below, we present the results of our survival regression on the test set of the \texttt{STEEL} dataset. 


\begin{table}[H]
    \small
    \begin{adjustbox}{center}
    \begin{tabular}{|c|c|c|c|c|}
    \hline
    & $\tau = 0.2$ & $\tau = 0.4$ & $\tau = 0.6$ & $\tau = 0.8$\\
    \hline
    Weibull CoxPH (No Censoring) & 0.513 & 0.513 & 0.513 & 0.513\\
    \hline
    Weibull CoxPH (Independence Assuming) & 0.333 & 0.309 & 0.324 & 0.341\\
    Weibull CoxPH (Dependent, \textbf{ours}) & 0.371 & 0.442 & 0.512 & 0.508\\
    \hline
    \end{tabular}
    \end{adjustbox}
    \caption{A table of $R^2$ values given by performing survival regression on the \texttt{STEEL} dataset under various degrees of dependence induced by Algorithm \ref{alg:semi-synthetic}. A higher $R^2$ indicates a better performing algorithm. The top row represents the performance of a Weibull CoxPH model trained on the regression data without censoring; this should indicate an upper bound on the performance of any survival model under censoring. We find that the performance of our approach, though below the theoretical upper bound, lies substantially above that of the independence-assuming approach.}
    \label{tab:steel-results}
\end{table}
\subsection{Semi-Synthetic Survival Regression on the \texttt{AIRFOIL} Dataset}
Below, we present the results of our survival regression on the test set of the \texttt{AIRFOIL} dataset. 
\begin{table}[H]
    \small
    \begin{adjustbox}{center}
    \begin{tabular}{|c|c|c|c|c|}
    \hline
    &$\tau = 0.2$& $\tau = 0.4$& $\tau = 0.6$& $\tau = 0.8$\\
    \hline
      Weibull CoxPH (No Censoring)&$0.572$&$0.572$&$0.572$& 0.572\\
    \hline
    Weibull CoxPH (Independence Assuming)&$0.583$&$0.549$&$0.465$&$0.330$\\
    
    Weibull CoxPH (Dependent, \textbf{ours})&$0.580$&$0.564$&$0.507$&$0.484$\\
    \hline
\end{tabular}
    \end{adjustbox}
    \caption{A table of $R^2$ values given by performing survival regression on the \texttt{AIRFOIL} dataset under various degrees of dependence induced by Algorithm~\ref{alg:semi-synthetic}. The top row represents the performance of a Weibull CoxPH model trained on the regression data without censoring; this should indicate an upper bound on the performance of any survival model under censoring. While the performance of both methods degrades as dependence increases, we find that our method attains higher values of $R^2$ than the independence-assuming model under greater degrees of dependence.}
    \label{tab:airfoil-results}
\end{table}
\vspace*{\fill}
% \section{Additional Experiments and Results}

% \subsection{Semi-Synthetic Survival Regression on the \texttt{STEEL} Dataset}

% \textbf{Modified National Institute of Standards and Technology} \texttt{MNIST}: The \texttt{MNIST} dataset is a multi-class classification dataset comprising 70,000 black-and-white images (each measuring $28\times28$ pixels) of handwritten integer digits ranging between 0 and 9. Each image is annotated with its corresponding digit. In this work, we treat the \texttt{MNIST} dataset as a regression dataset by treating each annotation as an outcome label in continuous time on which to perform survival regression.

% \section{Optimization and Convergence}

% \coopermj{Discussion of sensitivity to starting points in synthetic experiments. Demonstrate one set of experiments in which the starting point influences the location to which the model converges.}

% \section{Ethical Considerations}

% \section{Additional simulation results}
% Table~\ref{tab:supp-data} lists additional simulation results; see also \citet{einstein} for a comparison. 

% \begin{table}[!h]
%     \centering
%     \caption{An Interesting Table.} \label{tab:supp-data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \section{Math font exposition}
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.

%\vspace*{-20000in}
\nobibliography{uai2023-template} % Hack to get rid of the bibliography -- it worked!


\end{document}
