\documentclass[accepted]{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{enumitem}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\usepackage{amsmath}  
  {
      %\theoremstyle{plain}
      \newtheorem{assump}{Assumption}
  }
{
      %\theoremstyle{plain}
      \newtheorem{lem}{Lemma}
  }
      {
      %\theoremstyle{plain}
      \newtheorem{defn}{Definition}
  }
        {
      %\theoremstyle{plain}
      \newtheorem{cor}{Corollary}
  }
    {
      %\theoremstyle{plain}
      \newtheorem{prop}{Proposition}
  }

\newtheorem{remark}{Remark}

\usepackage{caption}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage[ruled]{algorithm2e}
\usepackage{amssymb}

%table packages
\usepackage{longtable}
\usepackage{booktabs}
\usepackage{longtable}
\usepackage{array}
\usepackage{multirow}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{colortbl}
\usepackage{pdflscape}
\usepackage{tabu}
\usepackage[normalem]{ulem}
\usepackage{makecell}
\usepackage{xcolor}
\usepackage{threeparttable}
\usepackage{amssymb}
\usepackage{comment}
\usepackage{appendix}
\usepackage{bm}


\newcommand{\bX}{ {\mathbf{X}} }
\newcommand{\bx}{ {\mathbf{x}} }
\newcommand{\bmu}{ {\boldsymbol{\mu}} }
\newcommand{\out}{\mathrm{out}}
\newcommand{\inn}{\mathrm{in}}

%We use indirect inference and auxiliary information to improve hidden population size estimation from samples generated using Respondent Driven Sampling.

\newcommand{\av}[1]{{\textcolor{red}{#1}}}

\title{Hidden Population Estimation with Indirect Inference and Auxiliary Information}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<justin.weltz@duke.edu>?Subject=Your UAI 2024 paper}{Justin Weltz}{}}
\author[1,3]{Eric Laber}
\author[1,2]{Alexander Volfovsky}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistical Science\\
    Duke University\\
    Durham, North Carolina, USA
}
\affil[2]{%
    Department of Computer Science\\
    Duke University\\
    Durham, North Carolina, USA
}
\affil[3]{%
    Department of Biostatistics and Bioinformatics\\
    Duke University\\
    Durham, North Carolina, USA
}

  
  \begin{document}
\maketitle

\begin{abstract}
Many populations defined by illegal or stigmatized behavior are difficult to sample using conventional survey methodology. Respondent Driven Sampling (RDS) is a participant referral process frequently employed in this context to collect information. This sampling methodology can be modeled as a stochastic process that explores the graph of a social network, generating a partially observed subgraph between study participants. The methods currently used to impute the missing edges in this subgraph exhibit biased downstream estimation. We leverage auxiliary participant information and concepts from indirect inference to ameliorate these issues and improve estimation of the hidden population size. These advances result in smaller bias and higher precision in the estimation of the study participant arrival rate, the sample subgraph, and the population size. Lastly, we use our method to estimate the number of People Who Inject Drugs (PWID) in the Kohtla-Jarve region of Estonia.
\end{abstract}


\section{Introduction}

Valid statistical inference tasks require understanding the data sampling mechanism \citep{heckathorn1997respondent}. Often this means identifying a sampling frame, e.g., an enumeration of units in the population of interest, and sampling from it with a known rule. However, many populations lack a conventional sampling frame because they are characterized by behaviors that are illegal \citep{frost2006respondent, johnston2010respondent} or stigmatized \citep{hladik2012hiv, kerr2018hiv}. 
These ``hidden'' populations include intravenous drug users \citep{crawford2016graphical}, undocumented immigrants \citep{johnston2010respondent}, and other vulnerable groups. 
% \av{This ``hidden'' status makes understanding the sizes of these populations, as well as the potential impact of interventions on these populations, of primary importance.}
% \av{To study these populations, a participant referral process called Respondent Driven Sampling \citep[RDS]{heckathorn1997respondent} has been developed. While it maintains the privacy of individuals in the population, it }

Respondent Driven Sampling (RDS) is a participant referral process frequently employed by researchers when a sampling frame is unavailable because it preserves the privacy and safety of at-risk populations \citep{heckathorn1997respondent}. 
For example, RDS was used to study HIV incidence and prevalence among people who inject drugs (PWID) in St.~Petersburg, Russia \citep{crawford2016graphical, crawford2018hidden, heimer2010estimation}. Here, RDS is leveraged because it is easier to engender trust if study participants recruit their own social contacts. 

RDS has been similarly employed to study other hidden populations at risk of HIV and other infectious diseases \citep{remera2024hiv, mapingure2024presence, alinaghi2024estimating, barry2024high}. 
Beyond epidemiological studies, RDS is used for sampling hard-to-reach populations such as migrant workers \citep{tyldum2014applying}, street children \citep{johnston2010respondent}, the unhoused population \citep{bernard2018ties}, and ethnic minorities \citep{mullo2020respondent}.
%
% For example, RDS was used to collect data on people who inject drugs (PWID) in St. Petersburg, Russia \citep{crawford2016graphical, crawford2018hidden, heimer2010estimation}. Studying PWID in this setting is epidemiologically important because HIV incidence and prevalence among this population is high. 
% However, PWID in St. Petersburg cannot be sampled through conventional methods because this population will not readily self-identify (drug possession in Russia carries severe legal penalties). 
% Consequently, RDS is leveraged because it is easier to engender trust if study participants recruit their own social contacts. 
% RDS has been similarly employed in numerous settings to study hidden populations at risk of HIV and other infectious diseases \citep{remera2024hiv, mapingure2024presence, alinaghi2024estimating, barry2024high}.
%
% Additionally, \cite{tyldum2014applying} have published a comprehensive book surveying RDS studies of migrant workers. They argue that RDS is a natural sampling mechanism for collecting information on these hard-to-reach populations. Reliable data are necessary in this context as ``more and more people cross national and international borders, and labor markets become increasingly reliant on migrant labor" \citep{tyldum2014applying}. RDS has also been used for hard-to-reach populations such as street children \citep{johnston2010respondent}, the unhoused population \citep{bernard2018ties}, and ethnic minorities \citep{mullo2020respondent}.
%Lisa Johnston also uses RDS to sample street children, ``defined as children aged 10–17 years who engage in economic activity on the street," in Tirana, Albania \citep{johnston2010respondent}. 
%Learning about this population is important because they are often subjected to violence, sexual abuse, and poor health.
%
Lastly, RDS is essential when the relevant subpopulation is highly stigmatized \citep{stahlman2016respondent, arayasirikul2015qualitative, magno2022perception,shahmanesh2009suicidal}. 
% For example, \cite{shahmanesh2009suicidal} studies suicidal behavior among female sex workers in Goa, India. 
% This research is imperative for effective public health policy because sex workers in this setting are subject to high levels of HIV prevalence and violence.

RDS begins with a small convenience sample of individuals, who are interviewed and asked to recruit other members of the target population with a limited number of incentivized coupons provided by the researchers. When individuals redeem their coupons, they receive an incentive, are enrolled in the study, and are asked to recruit as well. 
%This sampling mechanism continues in a recursive manner, expanding across the social network, until the study ends. 
Both access and trust are achieved by incentivizing members of the hidden population to recruit along social connections, thereby verifying the safety of participation. Additionally, anonymity is preserved since only the researchers and a participant's recruiter %(who presumably knew before) 
know an individual's membership status.\footnote{Various additional layers of protection are possible, such as ``coupons'' being digital and recruiter information remaining anonymous to recruits.} 

This method cannot rely on the prophylactic effects of simple random sampling during estimation because it operates along social connections. We will show how improving estimation of the underlying social network between participants in an RDS, and leveraging commonly collected auxiliary information about participants, can lead to more accurate population size estimates.

%Although this sampling mechanism provides access to the hidden population of interest while accommodating privacy concerns, it creates unique inferential challenges \citep{heckathorn1997respondent}. 
The current literature has mainly focused on estimating prevalence of health-related characteristics in the hidden population, e.g., HIV \citep{montealegre2013effectiveness} and syphilis \citep{frost2006respondent}. In order to conduct inference under this sampling design, researchers create simple approximate models for RDS recruitment, often treating the implicit social network as a nuisance parameter \citep{gile2011improved, volz2008probability}. 
%The foundational literature models coupon transfers as a graph random walk with \citep{volz2008probability} and without \citep{gile2011improved} replacement. 
In recent years, focus has shifted to uncovering more about this underlying graph \citep{crawford2018identification, verdery2017new} for its use in downstream estimation. Population size is one such downstream target.
%Crawford (2016) reverses conventional priorities; focusing on estimation of the subgraph connecting an RDS sample \citep{crawford2016graphical} as opposed to population characteristics. 

Estimating population sizes is often imperative for assessing the scope of public health crises \citep{crawford2018hidden, wu2017using}. %and its corresponding intervention
There are various approaches to estimating the overall size of a hidden population that do not account for the sampling mechanism, such as RDS, adequately and hence may perform poorly.
% Estimating the social network between RDS study participants allows for efficient population size estimation \citep{crawford2018hidden} since simpler methods are complicated by the atypical sampling mechanism. 
Simple capture-recapture methods require random sampling and so ignore the mechanism altogether \citep{white1982capture}, and multiplier methods \citep{fearon2017sample} depend on every survey participant accurately reporting the hidden population membership status of their acquaintances, which is unrealistic in many sensitive contexts. Successive Sampling has been used to estimate population size from RDS samples \citep{johnston2010respondent, gamble2023estimating}; however,
this method 
%only accounts for the ordered degrees (number of connections an individual has) of survey participants and 
does not incorporate all the network information available. 
%or the nuances of the recruitment chain. 
The key problem with these approaches is that they effectively ignore the underlying graph structure in the population.
%It is clear that the issue with these approaches is that they effectively ignore the graph structure of the original data.
\citet{crawford2018hidden} address this by estimating the unobserved edges in a subgraph of the population to develop a model for the
% Crawford et al. (2018) builds upon a subgraph estimation procedure to model 
hidden population size. 
% However, they encounter issues when 
Estimating missing graph information requires working with a model over a complex combinatorial space, and we illustrate that the proposed maximum likelihood and Bayesian estimators are necessarily biased or sensitive to the specification of the prior \citep{crawford2018hidden}. 

We make the following three contributions:
% \setlist{nolistsep}
\begin{enumerate}[noitemsep]
    \item Debias existing estimators of the underlying social network in an RDS sample via indirect inference. We provide empirical validation of the theoretical performance suggested by our Proposition 1.
    \item Develop a two-stage procedure that incorporates commonly collected auxiliary information into the estimation of the social network, providing improvements to hidden population size estimates.
    \item When the underlying social network depends on group structure, we derive improved population size estimators. We provide empirical evidence for the robustness of the proposed approaches as compared to state-of-the-art methods.
\end{enumerate}
% \setlist{nolistsep}
% We make two improvements to existing estimators. 
% First, we apply indirect inference, a strategy that helps debias canonical estimators via simulation, and then we incorporate auxiliary information collected during RDS into the estimation process. %to improve estimation of the hidden population size.
The rest of the paper is organized as follows:
Section~\ref{sec:rdsmodel} provides background information on the structure of the RDS stochastic process model and its likelihood. In Section~\ref{sec:indinf}, we extend the original indirect inference estimator (IIE) of \citet{jiang2004indirect} to the case of RDS and show that this estimator is less biased than the MLE asymptotically. Section~\ref{sec:size} reviews the existing population size estimation approaches, extends them to the case of more general underlying graph structures, and develops a method for incorporating auxiliary information (such as group membership) into the population size estimation procedure. Sections~\ref{sec:sims} and~\ref{sec:appl} demonstrate, through simulation studies and a case study, respectively, the impact of indirect inference estimation and auxiliary information on population size estimation.
\input{rds_tikz}

\section{RDS Model Setup and Issues}
\label{sec:rdsmodel}
Throughout, we consider a setting where our population is represented by a graph $G=(V,E)$, where $V$ is the set of $|V| = N$ nodes in the graph and $E$ is the set of pairwise connections, or edges, between individuals. Respondent Driven Sampling (RDS) starts with a set of seeds (node 1 in Figure~\ref{fig:GRGS}), and then proceeds by recruiting other participants (the middle and right panels of Figure~\ref{fig:GRGS}) over the edges of the original graph $G$. This process continues until a stopping rule is reached (e.g., a predetermined number of recruited individuals or a budget constraint is met). At the end of this process, a researcher is in possession of a recruitment subgraph $G^R = (V^R,E^R)\subset G$ on $|V^R| = n\leq N$ individuals.  The labels of the nodes in $V^R$ denote the order in which they arrived at the study (and so participant $i$ was interviewed before participant $j$ if $i<j$). Importantly, this is \textit{not} the vertex-induced subgraph of $G$ that would have been observed by projecting the original graph $G$ onto the vertices $V^R$. We will call this induced subgraph $G^S = (V^S,E^S)$ and note that, while $V^S = V^R$, we only know that $E^R\subseteq E^S$. \textit{If we had access to $G^S$ then estimating the size of the graph $G$ would be a simple task.}
% The nodes in an RDS sample are embedded in a graph $G=(V,E)$, where $V$ is the complete set of hidden population members and $E$ is the set of edges connecting them. We follow the notation conventions introduced in Crawford (2016) to represent the information available at the end of an RDS study. 

% In a sample of size $n$, $G^R = (V^R, E^R) \subset G$, is the recruitment graph that records the ordered edges between recruiters and recruitees, and $G^S = (V^S, E^S)$ is the complete subgraph the contains all the edges between members of the sample. If node $i$ recruits node $j$, then $i,j \in V^R$ and $\{i,j\} \in E^R$. $V^S = V^R$ and $E^R \subseteq E^S$, but $E^S$ may contain unobserved edges as well. 
There are two reasons that edges in $G^S$ are missing in $G^R$. First, recruiters may run out of coupons before they recruit all of their neighbors (e.g., participant 13 in Figure~\ref{fig:GRGS}). Second, if participant $i$ recruits participant $k$ before participant $j$ does, a connection $ \left \{ j,k \right \} \in E^S$ will not be observed because an individual cannot participate in the study multiple times (e.g., participant 6 is recruited by participant 2 before participant 3 can recruit them in Figure~\ref{fig:GRGS}). %\textcolor{red}{We need additional information to impute the missing edges and estimate $G^S$.}

While $G^S$ cannot be observed directly, it can be estimated from data collected during RDS.\footnote{We use the notation of \citet{crawford2018hidden} when possible for referential convenience.} Typical RDS studies ask participants how many hidden population members they know. For participant $i$, this is their degree in the larger graph $G$, $d_i = | \left \{ \{i,j\} \in E: i \in V^R, j \in V, i \neq j \right \}|$. The vector of observed degrees, $\textbf{d} = \left (d_1, d_2, \ldots, d_n \right )$,  is ordered by arrival to the study. Additionally, we define a vector $\textbf{w}$ such that $w_i$ is the time between the arrival of participant $i-1$ and participant $i$. This makes the full data observed at the end of an RDS study $\mathbf{Y} = (G^R, \mathbf{d}, \textbf{w})$, $\mathbf{Y} \in \mathcal{Y}$. 
%Intuitively, the following RDS model boils down to the idea that if node $i$ has had unused coupons for a long period of time without recruiting potential study participant $j$,  then most likely edge $\{i,j\} \notin E^S$. 
%\begin{defn}[Active Nodes]
%A node is active at a given time point $t$ if it was recruited before $t$ and has connections to %unrecruited vertices and unused coupons.
%\end{defn} 

% At each arrival time, we label an edge as ``susceptible" if it is between a recruiter with unused coupons and an unrecruited member of the hidden population.
%\begin{defn}[Pendant Edge]
%An edge $\{ i, j \}$ between nodes $i \in G^R$ and $j \notin G^R$.
%\end{defn}
Our RDS arrival process model is described by wait times attached to edges in $G$ between recruiters with unused coupons and unrecruited members of the hidden population, termed ``susceptible edges'' \citep{crawford2018hidden}.
When the wait time associated with edge $\{i,j\}$ expires, participant $i$ recruits participant $j$ (as long as $j$ has not been previously recruited); $d_j$ and $w_j$ are then recorded and $\{i,j\}$ is added to $G^R$. We assume that edge times are independent and identically distributed according to an exponential distribution %with common parameter $\lambda \in \mathbb{R}^+$
%in order to facilitate mathematical simplicity
\citep{crawford2016graphical}. %This results in a classic arrival process in which, conditional on the current event, the history of the process is independent of the future.
% 
\begin{assump}[Exponential Wait Times]
\label{ass:exptimes}
% \textbf{Assumptions 4 and 6 from \citet{crawford2018hidden}.} 
Upon entering the study, a participant immediately becomes active, and their susceptible edges are assigned a wait time that is drawn independently from an exponential distribution with common parameter $\lambda \in \mathbb{R}^+$. (This combines assumptions 4 and 6 in \citet{crawford2018hidden}.)
\end{assump}

\begin{remark}
    Assumption~\ref{ass:exptimes} is common when studying arrival data. It implies Markovian dynamics and leads to a closed-form likelihood for the RDS process (Equation~\eqref{eq:crawlike}). It is possible to relax this assumption, e.g., by considering dependence in arrival times. This will lead to changes in the likelihood, but does not preclude the analytic approach we propose. %--- i.e., we can perform estimation under any wait time distribution by expressing the data likelihood correctly. This would involve computing the likelihood by evaluating the distribution of the minimum of the susceptible edge times at each recruitment event.
\end{remark}

Let $A^S \in \left \{ 0, 1 \right \}^{n \times n }$ be the adjacency matrix associated with graph $G^S$, where $A^S_{i,j} = 1$ if $\{i,j\} \in E^S$ and $0$ if not; let $u_i$ be the number of connections study participant $i$ has to unrecruited hidden population members, $u_i = |\left \{ \{i,j\} \in E: j \notin V^R \right \} |$, and $\mathbf{u} = (u_1,u_2, \ldots, u_n)$; and let $M$ be the seed set. 
Additionally, let $\mathrm{lt}: \mathbb{R}^{n\times n} \to \mathbb{R}^{n\times n}$ be the lower-triangular function, i.e., for any $A \in \mathbb{R}^{n\times n}$, we have $\lbrace \mathrm{lt}(A)\rbrace_{i,j} = A_{i,j}1(i \ge j )$.
%to $B\cdot A$, where $B$ is a matrix such that $B_{i,j} = 1$ if $i\leq j$ and $0$ if not, and ``$\cdot$" is element-wise multiplication. 
The joint likelihood for parameters $A^S$ and $\lambda$ is
\begin{equation}\label{eq:crawlike}
    \mathcal{L}_n(\mathbf{Y} | A^S, \lambda) = \left ( \prod_{j\notin M} \lambda s_j \right ) \exp \left ( -\lambda \textbf{s}^\top \textbf{w} \right ),
\end{equation}
where $\textbf{s} = \mathrm{lt}(A^SC)^\top 1 + C^\top \textbf{u}$ is the susceptible edge vector,
and $C \in \mathbb{R}^{n\times n}$ is the coupon matrix %that summarizes $G^R$ to record recruiters with unused coupons %(potentially active)
%before each study participant is recruited.
%\footnote{Here $C$ is the $n\times n$ coupon matrix whose 
in which $C_{ij}=1$ if participant $i$ has at least one coupon before the $j^{th}$ participant is recruited, and zero otherwise (Definition 4 from \citet{crawford2018hidden}). 
%The rows of $C$ are ordered by the subjects' recruitment times.
% \begin{defn}[The Coupon Matrix]
%  Let
% \end{defn}
The $i^{th}$ entry of the susceptible edge vector, $s_i \in \textbf{s}$, is the number of edges between recruiters with coupons and unrecruited members of the hidden population just before the $i^{th}$ study participant is recruited.

Both $G^R$ and $\textbf{d}$ function as graphical constraints ensuring that the estimated adjacency matrix is \textit{compatible} with the observed data.  
\begin{defn}[Compatibility]
An estimated subgraph $\widehat{G}^S = (\widehat{V}^S, \widehat{E}^S)$ represented by the estimated adjacency matrix $\widehat{A}^S_{n}$ is compatible with the observed data, $\mathbf{Y}$, if the following three conditions hold: 1. $V^R = \widehat{V}^S$; 2.  $E^R \subseteq \widehat{E}^S$; 3. The degree of each $i \in \widehat{V}^S$ does not exceed $d_i$. (This is Definition 5 from \citet{crawford2018hidden}.)
\end{defn}

%We note that $A^S$ is \textcolor{red}{partially identifiable} since it only enters the likelihood through the susceptible edge vector, $\textbf{s}$. 
Let $\mathcal{A}$ be the space of compatible subgraphs. Then the maximum likelihood estimator (MLE) corresponding to Equation~\eqref{eq:crawlike} is
\begin{equation}\label{eq:MLE}
    \left \{ \widehat{A}^S_{n}, \widehat{\lambda}_n \right \} = \arg \max_{A^S \in \mathcal{A}, \lambda \in \mathbb{R}^+} \mathcal{L}_n(\mathbf{Y} | A^S, \lambda).
\end{equation}
% Since each $\widehat{G}_S$ is associated with an $\widehat{A}^S_{n}$, this compatibility criterion informs the estimation of $A^S$.

\begin{figure}
\centering

\includegraphics[height = 5cm]{Figures/Bias_Evidence_Plot_0.pdf}
% \includegraphics[width =0.85\linewidth,height = 0.7\linewidth]{Figures/Bias_Evidence_Plot_0.pdf}

\caption{This figure depicts the bias of $\widehat{\lambda}_n$ and $\left | \widehat{E}^S_{n} \right |$. We can see that the biases of $\widehat{\lambda}_n$ and of the estimated edge set size are positively correlated and increase as the sample proportion decreases.}
\label{fig:Bias}
\end{figure}

\subsection{Issues with Maximum Likelihood Estimation}
\label{sec:BiasMLE}
Beyond computational difficulties associated with maximizing functions over graph space, the MLE in Equation~\eqref{eq:MLE} can exhibit severe bias even for moderately large sample sizes. We start by noting that \textit{if} $A^S$ were known, Equation~\eqref{eq:crawlike} reduces to the likelihood of exponentially distributed data. It is well known that the MLE for the rate parameter of an exponential, $\lambda$, has a bias that diminishes as the sample size, $n$, increases: $| \mathbb{E}(\widehat{\lambda}_n) - \lambda | = \lambda/(n-1)$. However, in RDS, $A^S$ is not known, \textit{and} the magnitude of the bias is related to the rate of increase of both $n$ and $N$ (the unobserved population size). Specifically, when $A^S$ is unknown, Equation~\eqref{eq:crawlike} has $n+1$ unknown parameters that are meant to be estimated based on $n$ observations and the graphical constraints imposed by $G^R$ and $\textbf{d}$ --- while the parameters remain identifiable due to these constraints, 
% it does not mean that high quality estimation is possible. 
high quality estimation may not be possible.
This is especially true for RDS, as the constraints are often loose in this context ($n\ll N$ and so $n/N\nrightarrow 1$).

In Figure~\ref{fig:Bias}, we plot the observed absolute biases in $|\widehat{E}^S_{n}|$ and $\widehat{\lambda}_n$ following an RDS simulated according to the generative model in Equation~\eqref{eq:crawlike} with $\lambda =1$, a single seed participant, five coupons per participant, and $n = 100$. The population graph, $G$, is simulated from an Erd\H{o}s--R\'enyi model with edge probability $p$ set to keep the expected degree at $10$
(details about the Erd\H{o}s--R\'enyi model are provided in Section~\ref{sec:size}). 
On the x-axis, we vary the \textit{total} population size, $N$.
We see that as $n/N$ decreases and the constraints loosen, the bias increases. The intuition behind this is as follows. For a given $\lambda$ and $i \in \{1,2,\ldots, n\}$, the MLE of $s_i$ \textit{without} graphical constraints is $1/(\lambda w_i)$, which has expectation $\mathbb{E}\left \{1/(\lambda w_i) \right \} = \infty$.  This suggests that if $n/N \nrightarrow 1$ as $n \to \infty$ and $N \to \infty$, %and $P(A_{ij} =1) \nrightarrow 0$, 
then the MLE of  $s_i \lambda$ will have positive bias. RDS is used in settings where $n \ll N$ (and so the constraints on $\textbf{s}$ are minimal), so an alternative to the MLE is needed for high-quality inference.
\textit{We aim to resolve these biases using an alternative estimator motivated by concepts from indirect inference.}
%to ameliorate this issue. }

%\textcolor{red}{As $n \to \infty$ and $N \to \infty$, if $n/N \nrightarrow 1$, then these network constraints are not tight. RDS is often conducted in environments where $n << N$, so the constraints on $\mathbf{s}$ are minimal.}

%We observe the performance of $\widehat{A}^S_{n}$ and $\widehat{\lambda}_n$ by simulating an RDS study according to the generative model in Equation~\eqref{eq:crawlike}. 
%Setting $\lambda = 1$, %giving each study participant $5$ coupons and starting with a single seed participant,
%we take a sample of $n=100$ of population sizes, $N$, ranging from $500$ to $2500$. Figure~\ref{fig:Bias} indicates that both $\widehat{A}^S_{n}$ and $\widehat{\lambda}_n$ are biased under a variety of settings. This suggests that if $n/N \nrightarrow 1$ as $n \to \infty$ and $N \to \infty$, %and $P(A_{ij} =1) \nrightarrow 0$, 
%then the MLE of  $s_i \lambda$ will have positive bias. Since RDS is usually used in exactly these settings (where $n << N$ and so the constraints on $\mathbf{s}$ are minimal), an alternative to the MLE is needed for high quality inference.
%\textit{We aim to resolve these biases using an alternative estimator motivated by concepts from indirect inference to ameliorate this issue. }

\section{Indirect Inference Estimator}
\label{sec:indinf}

We define the indirect inference estimator, derive its theoretical properties, and demonstrate its improvement in estimating RDS model parameters empirically.

\subsection{Indirect Inference}

%\textcolor{red}{Need two sentences about what indirect inference is in general. Needs to say something like ``Indirect inference is a simulation based approach that allows us to reduce the bias in estimating parameters by matching various statistics.}

 
% If we could express $\widehat{\lambda}_n$ and $\widehat{A}^S_{n}$ in closed form, quantifying and correcting for the bias we observe might be possible. Unfortunately, the complicated constraints of the combinatorial graph space make this untenable. Instead, we use an indirect inference approach to search for parameters that mimic the bias observed. 

%Estimators $\widetilde{G^S}, \widetilde{\lambda}_n$ such that $E_{\mathbf{Z} \sim P_{\widetilde{G^S}, \widetilde{\lambda}_n}}[\lambda(\mathbf{Z})]$  is close to the observed data MLE, $\widehat{\lambda}_n = \lambda(\mathbf{Y})$ 
% We propose the indirect inference Estimator (IIE) as an estimator with less bias than the MLE. 

%Indirect inference is a simulation based approach that allows us to reduce the bias in estimating model parameters \citep{gourieroux1993indirect}. 
%It relies on simulating data to find the expectation of a chosen statistic (that summarizes the data, though is not necessarily sufficient)
%on the choice of a statistic that summarizes the data (though is not necessarily sufficient) and requires the evaluation of the expected value of that statistic for 
%under various model parameters. The parameters that minimize the distance between this expectation and the value of the statistic under the observed data are called the indirect inference estimators.
The indirect inference estimator (IIE) relies on specifying a calibration statistic.  The choice of this statistic is not unique, but often there is a natural option in a given problem domain \citep{jiang2004indirect}; we use the MLE for $\lambda$ as our calibration statistic.  The IIE is constructed by finding parameter settings under which the expected value of the calibration statistic matches its observed value.
%The indirect inference estimator (IIE) chooses the value of the parameter for which the expected value of the statistic matches most closely to the observed value of the statistic.
%
% Broadly, an indirect inference estimator (IIE) requires a choice of a statistic (not necessarily sufficient) for summarizing the data and aims to find the model parameters such that the observed value of the statistic . %We take $\widehat{\lambda}_n$ to be our statistic, and find the $\lambda \in \mathbb{R}^+$ that produces MLEs of $\lambda$ that match $\widehat{\lambda}_n$ on average. %Intuitively, our Indirect Inference Estimator is the parameter that produces the biased MLE on average

To formalize the IIE in our setting, we require a few definitions.
Let  $\lambda^\dagger: \mathcal{Y} \to \mathbb{R}$ and $A_\dagger^S: \mathcal{Y} \to \left \{ 0, 1 \right \}^{n \times n }$ be functions that map the data, $\mathbf{Y}$, to the solutions of Equation~\eqref{eq:MLE}.
% MLE estimators for the true generative parameters $\lambda$  and $A^S$ respectively. 
Additionally, define $A_{\lambda}^S: \mathcal{Y}\times \mathbb{R}^+ \rightarrow \lbrace 0, 1\rbrace^{n \times n }$ so that for observed data, $\mathbf{Y}$, and value $\lambda' > 0$, $A_{\lambda}^S(\mathbf{Y}, \lambda')$ is the solution to Equation~\eqref{eq:MLE} holding $\lambda$ fixed at $\lambda'$.
%Additionally, define $A^S_\lambda: \mathcal{Y} \times \mathbb{R} \to \left \{ 0, 1 \right \}^{n \times n }$ as a function that maps the data to the solution of the same minimization problem while holding $\lambda \in \mathbb{R}^+$ fixed. 
% $\mathbf{Y}=(G^R, \mathbf{d}, \textbf{w})$ is the data we observe at the end of the RDS study, making $\widehat{\lambda}_n = \lambda^\dagger(\mathbf{Y})$ and $\widehat{A}^S_{n} = A^\dagger_S(\mathbf{Y})$. 
%Our Indirect Inference estimation relies on simulating data to find the expectation of the MLE of $\lambda$ under different parameters.

% Indirect inference estimation relies on simulating data to find the expectation of a chosen statistic under candidate parameters --- the parameters that minimize the distance between this expectation and the value of the statistic under the observed data are called the indirect inference estimators. 
%We choose the MLE of $\lambda$ as the calibration statistic and 
We propose the following estimation procedure for our model parameters. %The IIE is the pair $(\tilde\lambda_n, \widetilde{A}^S_{n} = A^S_\lambda(\mathbf{Y}, \widetilde{\lambda}_n))$ for which 
%changed the notation back form A^S^{\tilde\lambda_n} to make it consistent with future notation- I think this superscript method gets uglier later
Let $\widetilde{\lambda}_n$ solve
\begin{equation}
    \label{eq:IIE}
    \mathbb{E}_{\mathbf{Z} \sim P_{A_\lambda^S(\mathbf{Y}, \widetilde{\lambda}_n), \widetilde{\lambda}_n}}\left \{\lambda^\dagger(\mathbf{Z}) \right \}  = \lambda^\dagger(\mathbf{Y}),
\end{equation}
and let $\widetilde{A}^S_{n} = A^S_\lambda(\mathbf{Y},\widetilde{\lambda}_n)$. The IIE is then the pair $(\widetilde{\lambda}_n, \widetilde{A}^S_{n})$.
The expectation in Equation~\eqref{eq:IIE} is taken over simulated data $\mathbf{Z} = (G^R, \mathbf{d}, \textbf{w}^*) \in \mathcal{Y}$, where $\textbf{w}^* \sim P_{A^S, \lambda}$ and $P_{A^S, \lambda}$ is the generative model described in Equation~\eqref{eq:crawlike}. The procedure for calculating the IIE is summarized in Algorithm~\ref{alg:altest}. This algorithm requires $K\times J$ evaluations of the MLE. 
Because these evaluations are embarrassingly parallel,
the wall-clock time needed to compute the IIE is comparable to that of the MLE when sufficient parallel computing resources are available.

% We define $\mathbf{Z} = (G^R, \mathbf{d}, \textbf{w}^*) \in \mathcal{Y}$  where $\textbf{w}^* \sim P_{A^S, \lambda}$ to be the simulated data under $\lambda, A^S$. 
% %To avoid searching over a combinatorial 
% For $\lambda \in \mathbb{R}^+$, we first find the MLE of $A^S$ given $\lambda$, $A^\lambdA^S(\mathbf{Y}, \widetilde{\lambda}_n)$, and then use $A^\lambdA^S(\mathbf{Y}, \lambda)$ and $\lambda$ to simulate data $\mathbf{Z}$.
% The IIE for $A^S$ and $\lambda$ are $A^\lambdA^S(\mathbf{Y}, \widetilde{\lambda}_n)$ and $\widetilde{\lambda}_n$ respectively such that
% %\begin{equation*}
% %   \mathcal{R}(G^S^*, \lambda^*) = \left \{ \widetilde{\lambda}_n :  E_{\mathbf{Z} \sim P_{A^\lambdA^S(Y, \widetilde{\lambda}_n), %\widetilde{\lambda}_n}}\left \{\lambda^\dagger(\mathbf{Z}) \right \}  = \lambda^\dagger(Y) \right \}.
% %\end{equation*}
% \begin{equation}
%     \label{eq:IIE}
%     \mathbb{E}_{\mathbf{Z} \sim P_{A^\lambdA^S(\mathbf{Y}, \widetilde{\lambda}_n), \widetilde{\lambda}_n}}\left \{\lambda^\dagger(\mathbf{Z}) \right \}  = \lambda^\dagger(\mathbf{Y}).
% \end{equation}
% %The estimator of Equation~\eqref{eq:IIE} is actually a hybrid between Indirect Inference and Maximum Likelihood Estimation. 
% %For $\lambda \in \mathbb{R}^+$, we first find the MLE of $A^S$ given $\lambda$, $A^\lambdA^S(\mathbf{Y}, \widetilde{\lambda}_n)$, and then use $A^\lambdA^S(\mathbf{Y}, \lambda)$ and $\lambda$ to simulate data. 
% The expectation of the MLE for $\lambda$ under parameters $\widetilde{\lambda}_n$ and  $A^\lambdA^S(\mathbf{Y}, \widetilde{\lambda}_n)$ is $\widehat{\lambda}$. 

To understand \textit{why} an IIE can reduce bias, we first discuss the IIE for exponentially distributed data, which we observed in Section~\ref{sec:BiasMLE} are closely related to the data generated by RDS. The important benefit of this setting is that we are able to derive the analytic form of the IIE. 

% We first provide intuition on how the IIE can improve on available estimators using exponential data, which we observed in Section~\ref{sec:BiasMLE} are intimately related to the data generated by RDS.
% \paragraph{Intuition.} We can examine the relationship described by Equation~\ref{eq:linearbias} in an important simplification of the RDS stochastic process likelihood, the exponential model. 
Suppose $\mathbf{X} = (X_1,\ldots, X_n)$ comprises $n$ independent draws from an exponential distribution with rate $\lambda \in \mathbb{R}^+$. The likelihood of $\mathbf{X}$ is
\begin{equation*}
    \mathcal{L}(\lambda \mid \mathbf{X}) = \prod_{i=1}^n \lambda \exp(-\lambda X_i)  = \lambda^n \exp\Bigl(-\lambda \sum_{i=1}^n X_i\Bigr).
\end{equation*}
The MLE is $\widehat{\lambda}_n = n/\left(\sum_{i=1}^n X_i \right )$, which follows an Inverse-Gamma distribution with shape and scale parameters $(n,n\lambda)$. For $n > 1$, its mean is $n\lambda/(n-1)$, so the
absolute bias of the MLE is $\left| \mathbb{E} (\widehat{\lambda}_n -  \lambda) \right| = \lambda/(n-1)$, which is linear in $\lambda$. Choosing the MLE as the calibration statistic in the IIE procedure, we solve $n\widetilde{\lambda}_n/(n-1) = \widehat{\lambda}_n$ and obtain the IIE $\widetilde{\lambda}_n = (n-1)/\left(\sum_{i=1}^n X_i\right)$, which is unbiased.

%\textbf{THIS IS A NICE BONUS: DOUBLE CHECK}
Moreover, for $n > 2$, the mean squared error (MSE) of $\widetilde{\lambda}_n$ is smaller than that of $\widehat{\lambda}_n$:
% the estimators,
\begin{align*}
    &\mathrm{MSE}(\widehat{\lambda}_n) = 
    \frac{\lambda^2(n^2+n-2)}{(n-1)^2(n-2)},\\
    &\mathrm{MSE}(\widetilde{\lambda}_n) = \frac{\lambda^2}{n-2}, \\
    &\mathrm{MSE}(\widehat{\lambda}_n) - \mathrm{MSE}(\widetilde{\lambda}_n)  = \frac{3\lambda^2}{(n-1)(n-2)}> 0.
\end{align*}
% Not only is $\widetilde{\lambda}_n$ unbiased, it is also more accurate. 

In general, the IIE (that uses the MLE as a calibration statistic) is unbiased for a parameter if the bias of the MLE is linear in the parameter.
The exponential likelihood example above suggests that this is possible in our setting. We formally compare the asymptotic behavior of the IIE bias to the MLE bias in the next subsection.
% First, we examine simple examples in which the IIE is unbiased. For constants, $a,b \in \mathbb{R}$ and $\lambda^* \in \mathbb{R}^+$, if
% \begin{equation}
%     \label{eq:linearbias}
%     \mathbb{E}_{\mathbf{Z} \sim P_{A^\lambdA^S(\mathbf{Y}, \lambda^*), \lambda^*}}\left \{\lambda^\dagger(\mathbf{Z}) \right \} = a\lambda^* + b,
% \end{equation}
% then we know that $\widetilde{\lambda}_n = (\widehat{\lambda}_n -b)/a$ and
% \begin{equation*}
%         \left | \mathbb{E} \left ( \widetilde{\lambda}_n - \lambda \right ) \right |  = 0. 
% \end{equation*}
% Consequently, if the bias is linear (or approximately linear) in $\lambda$, then the IIE has smaller bias than the MLE. \textcolor{red}{WORK HERE}

\begin{algorithm}
\SetAlgoLined
\DontPrintSemicolon
\SetAlgoLined
\textbf{Goal:} Find the estimator, 
\begin{equation*}
     \widetilde{\lambda}_n \in \arg\min_{\lambda} \left | \mathbb{E}_{\mathbf{Z} \sim P_{A_\lambda^S(\mathbf{Y}, \lambda), \lambda}} \left \{ \lambda^\dagger(\mathbf{Z}) \right \} - \lambda^\dagger(\mathbf{Y}) \right |
\end{equation*}
Generate a grid of $\lambda^k$ values, $k \in \{1,2,\ldots,K\}$, centered at $\widehat{\lambda}_n$ \;
\For{$k \in \{1,2,\ldots,K\}$}{
    \For{$j \in \{1,2,\ldots,J\}$}{
        Find $\widehat{A}^S_{n,k} = A_\lambda^S(\mathbf{Y}, \lambda^k)$\;
        %or, in other words, the subgraph $G^S$ that maximizes the observed data likelihood conditional on a known $\lambda$ value equal to $\lambda^k$
        Simulate wait time vector $\mathbf{w}^{k,j}$ from the model defined by parameters $\widehat{A}^S_{n,k}, \lambda^{k}$\;
        Find $\widehat{\lambda}_n^{k,j}$ by solving Equation~\eqref{eq:MLE} with generated data $\mathbf{Z}^{k,j} = (G^R, \mathbf{d}, \mathbf{w}^{k,j})$\;
    }
    Save set $ \left \{\lambda^k, \widehat{A}^S_{n,k}, \widehat{\lambda}_n^k= \left (\sum_{j=1}^{J} \widehat{\lambda}_n^{k,j} \right )/J \right \}$\;
    %(where $G^S^k$ is just one of the ${G^S}^{k,j}$ - since they all represent possible local maximums);
}
Calculate $k^* = \arg\min_{k \in \{1,2,\ldots,K\}} \left |\widehat{\lambda}_n^k - \lambda^\dagger(\mathbf{Y}) \right | $\;
%(where $\widehat{\lambda}_n$ is from step 1 of our procedure);
Output estimators $\left \{ \widetilde{\lambda}_n, \widetilde{A}^S_{n} \right \} = \left \{ \lambda^{k^*}, \widehat{A}^S_{n,k^*} \right \}$\;
\caption{The Indirect Inference Estimator}
\label{alg:altest}
\end{algorithm}

\subsection{Asymptotics}
%To characterize the asymptotic behavior of the IIE, we require an expansion of its distribution function. We note that under general conditions, the MLE has an asymptotic representation called an Edgeworth expansion \citep{hall2013bootstrap}. Since $\widehat{\lambda}_n$ is the statistic of interest in our IIE procedure, we can characterize the asymptotic behaviour of the IIE by relating the Edgeworth expansion of $\lambda^\dagger(\mathbf{Z})$, where $\mathbf{Z}$ is generated under the IIE, to the Edgeworth expansion of $\widehat{\lambda}_n$. We require a specific form for the Edgeworth expansion of $\widehat{\lambda}_n$:
To characterize the asymptotic behavior of the IIE, we assume that the MLE admits an Edgeworth expansion \citep{hall2013bootstrap}.  
%There are more general conditions under which the IIE has smaller bias than the MLE asymptotically. 
% First, we assume that $\widehat{\lambda}_n$ admits an Edgeworth expansion.
\begin{assump}
    \label{assump:edge}
    As $n \to \infty$,
    \begin{equation*}
        \widehat{\lambda}_n = \lambda + \frac{A(V, \lambda)}{\sqrt{n}} + \frac{B(V, \lambda)}{n} + \frac{C(V, \lambda)}{n^{3/2}} + o_p(n^{-3/2}),
    \end{equation*}
    where $V$ has a distribution that does not depend on $\lambda$, and $A(V, \lambda)$, $B(V, \lambda)$, and $C(V, \lambda)$ are random vectors that only depend on $\lambda$ and $V$.
\end{assump}
Such an expansion holds for the MLE under general conditions; see Section~2.4 of \citet{hall2013bootstrap} for details.
%for conditions under which this expansion is valid. 
Under this expansion, it can be seen that the bias of the MLE, $\widehat{\lambda}_n$, is of order $n^{-1/2}$.
%We can use the Edgeworth expansion of $\widehat{\lambda}_n$ and $\widetilde{\lambda}_n$ to prove that $\widetilde{\lambda}_n$ is less biased than $\widehat{\lambda}_n$ asymptotically. 
\begin{prop}
    \label{prop:IIE}
    Given Assumption~\ref{assump:edge}, as $n\to \infty$,
    \begin{equation*}
        \mathbb{E}\left (\widetilde{\lambda}_n \right ) = \lambda + \frac{\mathbb{E}\left \{ C^*(V, \lambda) \right \}}{n^{3/2}} + o(n^{-3/2}),
    \end{equation*}
    where $V$ is a random variable with a distribution that does not depend on $\lambda$, and $C^*(V, \lambda)$ is a random vector that only depends on $\lambda$ and $V$.
\end{prop}
Proposition~\ref{prop:IIE} follows from Assumption~\ref{assump:edge} and Corollary~2.1 in \citet{gourieroux200013}. It shows that the IIE does not have bias terms of orders $n^{-1/2}, n^{-1}$, while the MLE does. 
%Lastly, we define $\widetilde{A}^S_{n} = A^\lambdA^S(\mathbf{Y}, \widetilde{\lambda}_n)$ and test the performance of $\widetilde{\lambda}_n$ and $\widetilde{A}^S_{n}$ in the next subsection.


%We assume that the third derivative of the likelihood is well behaved.
%\begin{assump}
%    \label{as:lip}
%    \textcolor{red}{tentatively:} Lipchitz constraint on the second derivative and restrict $\widetilde{\lambda}_n$ to be in a range of $\widehat{\lambda}_n$.
%\end{assump}
%This assumption essentially means that the bias of MLE under the true parameters is close to the bias of the MLE taking the IIE to be the true parameters on average. Specifically, the difference between these biases must be less than the bias of the MLE under the true parameters. This is not an unusual requirement in the parametric bootstrap context, which assumes that the distribution of the estimator under the truth is well approximated by the distribution of the estimator taking the truth to be the estimates. While it is not possible to verify Assumption~\ref{as:bias}
%This assumption allows us to prove that the bias of $\widehat{\lambda}_n$ is at least as large as the bias of $\widetilde{\lambda}_n$ (asymptotically?).
%\begin{lem}
%    \label{lem:bias}
%    Under Assumption~\ref{as:lip}, the bias of $\widetilde{\lambda}_n$
    %$\mathcal{R}(G^S^*, \lambda^*)$
%    is less than the bias of $\widehat{\lambda}_n$,
%    \begin{equation*}
%        \left | \mathbb{E} \left \{ \widetilde{\lambda}_n - \lambda^* \right \} \right | \leq \left | \mathbb{E} \left \{ \widehat{\lambda}_n - \lambda^* \right \} \right |. 
%    \end{equation*}
%\end{lem}
%The proof of this lemma is in Appendix~ref{app:indinf}.
%Additionally, we define $\widetilde{A}^S_{n} = A^\lambdA^S(\mathbf{Y}, \widehat{\lambda}_n)$. Although we cannot verify Assumption~\ref{as:lip} for the generative model specified by Equation~\eqref{eq:crawlike}, we verify this assumption for the exponential model -- an important simplification of the RDS arrival time process -- in Appendix~\ref{app:indinf}. \textcolor{red}{There is a commented out subgraph accuracy assumption here, not sure if it's worth mentioning.}
%Improving the estimation of $\lambda$ may be important independently, but we are primarily concerned with the performance of estimators of functions of the subgraph, $A^S$. Consequently, we define $\widetilde{A}^S_{n} = A^\lambdA^S(\mathbf{Y}, \widehat{\lambda}_n)$ and introduce Assumption~\ref{as:graph}.

%\begin{assump}
%\label{as:graph}
%    For an estimator $\lambda^e$ of $\lambda^*$, if
%    \begin{equation*}
%        \left | \mathbb{E} \left \{ \lambda^e - \lambda^* \right \} \right | \leq \left | \mathbb{E} \left \{ %\widehat{\lambda}_n - \lambda^* \right \} \right |,
%    \end{equation*}
%    then for summary function $h:\left \{ 0, 1 \right \}^{n \times n } \to \mathbb{R}$ and loss function $k:\mathbb{R} \to \mathbb{R}$,
%    \begin{align*}
%       &\mathbb{E} \left ( k \left [ h \left \{ A^\lambdA^S(\mathbf{Y}, \lambda^e) \right \} - h(A^*_S) \right ] \right ) \leq  \\
%       &\mathbb{E} \left ( k \left [ h \left \{ A^\lambdA^S(\mathbf{Y}, \widehat{\lambda}_n) \right \} - h(A^*_S) %\right ] \right )
%    \end{align*}
%\end{assump}
%This assumption simply states that estimating a function of the subgraph, $h(A^S^*)$, given a less biased estimate of $\lambda^*$ results in smaller average error (where error is quantified by loss function $k$).
\subsection{Empirical Performance: Study Participant Arrival Rate and Subgraph Accuracy Improvements}
\label{sec:subgraphimprov}

%Although we cannot verify Assumption~\ref{as:lip} and \ref{as:graph} directly, 
In this section, we empirically evaluate the finite-sample behavior of the proposed IIE for the two model parameters in the likelihood of Equation~\eqref{eq:crawlike}.
We simulate RDS trajectories of size 100 over various graph sizes, with an average wait time of $1/\lambda = 1$ and each recruit having 5 coupons. The hidden population graph, $G$, is simulated from an Erdos-Renyi model with edge probability $p$ (details of this model choice are provided in Section~\ref{sec:size}). 
% \begin{defn}[Erdos-Renyi Model]
% \label{def:ER}
%     An Erdos-Renyi model with parameters $N$ and $p$ follows the following likelihood,
%     \begin{equation*}
%         \mathcal{L}(N,p | \ G) = p^{|E|}(1-p)^{\binom{N}{2} - |E|}.
%     \end{equation*}
%     In other words, $N = |V|$, and if $i$ and $j$ drawn uniformly from $V$, $p = \mathbb{P}(E_{i,j} \in E)$.
% \end{defn}
In our simulations, we vary $N \in \{1000, 5000,10000\}$ and $Np \in \{5,10,15\}$.
% We compare the IIE to the MLE over a variety of simulation settings. Setting $\lambda = 1$, giving 5 coupons to each study participant and starting with a single seed, each simulation samples 100 participants using the generative model specified in Equation~\eqref{eq:crawlike} for a range of population sizes (implying different sample proportions) and average degrees. 
Algorithm~\ref{alg:altest} is used to construct the IIE, which we compare to the MLE.

Table~\ref{tab:graphtpr} demonstrates that the IIE for the sample subgraph, $\widetilde{A}^S_{n}$, has a higher true positive rate than the MLE in all simulation settings. Importantly, Table~\ref{tab:graphtnr} in Appendix~\ref{app:addsim} shows that these improvements do not come at the expense of the true negative rate.

The rate parameter $\lambda$ is of independent interest for assessing coupon uptake speed and the time necessary for recruiting a target sample size. Table~\ref{tab:lambda} in Appendix~\ref{app:addsim} indicates that over a range of population sizes and graph densities, the IIE, $\widetilde{\lambda}_n$, outperforms the MLE in terms of MSE. 
%This advantage is larger when the sample proportion is lower.
%While Lemma~\ref{lem:bias} references the bias of the IIE, these simulations suggest that this estimator is also consistently more accurate than the MLE.
% We also compare the performance of $\widetilde{A}^S_{n}$ and $\widehat{A}^S_{n}$. The subgraph adjacency matrix between study participants is imperative when optimizing and measuring the effects of interventions on the sample. 
% %For example, when measuring treatment outcomes, $A^S$ could be used to help identify network interference.
% It will also be necessary for estimating the hidden population size, $N = |V|$.
% Table~\ref{tab:graphtpr} demonstrates that the IIE, $\widetilde{A}^S_{n}$, has a higher true positive rate than the MLE in all settings. Table~\ref{tab:graphtnr} in Appendix~\ref{app:addsim} shows that these improvements do not come at the expense of the true negative rate.

\begin{remark}
    \label{rem:graphchar}
    Consistent with Figure~\ref{fig:Bias} and the intuition developed in Section~\ref{sec:BiasMLE}, the advantages of $\widetilde{\lambda}_n$ and $\widetilde{A}^S_{n}$ over $\widehat{\lambda}_n$ and $\widehat{A}^S_{n}$, respectively, are generally slightly greater in settings with high average degree and low sample proportion.
\end{remark}
%implying that the overall accuracy of the IIE is greater than the MLE.

%\textcolor{red}{Think about what should be a figure - over population and over density}

\begin{table}

\caption{Graph True Positive Rate (\%)}
\label{tab:graphtpr}
\centering
%\begin{threeparttable}
\begin{tabular}[t]{rrrrrr}
\toprule
\multicolumn{2}{c}{ } & \multicolumn{2}{c}{MLE} & \multicolumn{2}{c}{IIE} \\
\cmidrule(l{3pt}r{3pt}){3-4} \cmidrule(l{3pt}r{3pt}){5-6}
Pop. & Deg. & Average & Std. & Average & Std.\\
\midrule
\cellcolor{gray!6}{1000} & \cellcolor{gray!6}{5} & \cellcolor{gray!6}{56.66} & \cellcolor{gray!6}{0.85} & \cellcolor{gray!6}{67.61} & \cellcolor{gray!6}{1.47}\\
1000 & 10 & 36.52 & 0.82 & 50.48 & 2.00\\
\cellcolor{gray!6}{1000} & \cellcolor{gray!6}{15} & \cellcolor{gray!6}{29.49} & \cellcolor{gray!6}{0.79} & \cellcolor{gray!6}{47.71} & \cellcolor{gray!6}{2.38}\\
5000 & 5 & 58.76 & 0.96 & 69.93 & 1.58\\
\cellcolor{gray!6}{5000} & \cellcolor{gray!6}{10} & \cellcolor{gray!6}{37.00} & \cellcolor{gray!6}{0.91} & \cellcolor{gray!6}{51.73} & \cellcolor{gray!6}{1.92}\\
5000 & 15 & 30.57 & 1.08 & 49.48 & 2.48\\
\cellcolor{gray!6}{10000} & \cellcolor{gray!6}{5} & \cellcolor{gray!6}{59.25} & \cellcolor{gray!6}{0.93} & \cellcolor{gray!6}{72.18} & \cellcolor{gray!6}{1.52}\\
10000 & 10 & 37.52 & 0.93 & 54.15 & 2.08\\
\cellcolor{gray!6}{10000} & \cellcolor{gray!6}{15} & \cellcolor{gray!6}{30.50} & \cellcolor{gray!6}{0.84} & \cellcolor{gray!6}{51.30} & \cellcolor{gray!6}{2.22}\\
\bottomrule
\end{tabular}
\begin{tablenotes}
    \item These are the true positive rates of the estimated subgraphs over a series of population sizes (Pop.) and average degrees (Deg.). The standard deviations reported quantify the Monte Carlo error associated with these estimates based on 100 simulations.
\end{tablenotes}
%\end{threeparttable}
\end{table}


\section{Hidden Population Size Estimation}
\label{sec:size}
One of the primary goals of sampling hidden populations is to estimate their total size, $N$. Imagine that the population graph, $G = (V,E)$, is a sample from an Erdos-Renyi graph model with parameters $N$ and $p$ 
% (Definition~\ref{def:ER}).
(that is, there are $N$ individuals in the graph and the probability of a connection between any two of them is $p$). 
While this is a very simple model, it has demonstrated practical utility when estimating hidden population size, forming the basis for methods such as the snowball sampling estimator \citep{frank1994estimating} and the network scale-up estimator \citep{killworth1998estimation}. Under an Erdos-Renyi model, the degree of each individual in $G$ is distributed as $d_i \sim \mathrm{Binomial}(N-1, p)$.
If we had access to a simple random sample of individuals, then we could directly estimate $N$ based on this likelihood.

As discussed earlier, RDS does not yield a simple random sample from the population (e.g., an individual's probability of being sampled depends on their degree \citep{heckathorn1997respondent, gile2011improved}). Conditional on the (unobserved) $A^S$ and the Erdos-Renyi assumption, it is possible to write down the distribution for the number of edges individual $i \in \{1,2,\ldots,n\}$ shares with unsampled members of the hidden population at the time of individual $i$'s recruitment. Let $d_i^u = d_i - \sum_{j=1}^{i-1} \mathbb{I}\left (\{i,j\} \in E^S \right )$,
and note that, unlike the $d_i$, these quantities are mutually independent (although not identically distributed), with $d_i^u \sim \mathrm{Binomial}(N-i, p)$.


Sections 4.1 and 4.2 present population size estimators based on an Erdos-Renyi graph assumption. This is for simplicity of exposition since there are only two parameters in this model, $N$ and $p$. However, the proposed  approach can easily be applied to other graph models by deriving the corresponding distribution of $d_i^u$. 
For example, %following \citet{crawford2018hidden} and \citet{gile2018methods},
consider a population graph that is distributed according to a stochastic block model (SBM) with two groups, $V_A \subseteq V$ and $V_B = V \setminus V_A$.
%with sizes $N_A = |V_A|$ and $N_B = |V_B|$.
Let the probability of an edge between members of the same group be $p_{\mathrm{in}}$, and the probability of a connection between members of different groups be $p_{\mathrm{out}}$. %For constant $c \in [0,1]$, we set $p_{\mathrm{out}} = cp_{\mathrm{in}}$. %We do not account for this graph structure in our estimation procedure and
%To center our prior for overall edge prevalence at a realistic value,
%Defining $A\in \{0,1\}^{N \times N}$ to be the adjacency matrix representing graph $G$, 
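Assume that we observe the group membership of each study participant, and define $i_A, i_B$ as the number of individuals in groups $A$ and $B$ respectively that are recruited before participant $i+1$. Let $N_A = |V_A|$ and $N_B = |V_B|$ denote the group sizes, and let $N_i =\mathbb{I}(i \in V_A)(N_A - i_A) + \left \{ 1-\mathbb{I}(i \in V_A) \right \} (N_B - i_B)$ denote the number of members of participant $i$'s own group that remain unsampled once participant $i$ has been recruited. Then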
% \begin{align*}
%     \mathbb{P}(d_i^u = d)  = & \sum_{j =0}^{d} \binom{N_i}{ j}p_{\mathrm{in}}^j (1-p_{\mathrm{in}})^{N_i - j} \times \\
%     &\hspace{0.5cm}\binom{(N - i)- N_i }{ d - j} \times \\
%     &\hspace{0.5cm}p_{\mathrm{out}}^{d - j} (1-p_{\mathrm{out}})^{ (N - i)- N_i -(d- j)},
% \end{align*}
\begin{align*}
    \mathbb{P}(d_i^u = d)  =&\sum_{j =0}^{d} \binom{N_i}{ j}
    \binom{(N - i)- N_i }{ d - j}
    p_{\mathrm{in}}^j (1-p_{\mathrm{in}})^{N_i - j}\\ 
    &\hspace{0.5cm} \times 
    p_{\mathrm{out}}^{d - j} (1-p_{\mathrm{out}})^{ (N - i)- N_i -(d- j)},
\end{align*}
where $N_A \geq N_B$ and $p_{\mathrm{in}} \geq  p_{\mathrm{out}}$. We can use this distribution 
% in place of Equation~\ref{eq:binUni} 
to estimate $N$ in the SBM setting. Appendix~\ref{app:stochblock} provides details of those derivations.

% , then we could calculate the edges individual $i \in \{1,2,\ldots,n\}$ shares with unsampled members of the hidden population at the time of individual $i$'s recruitment. These connections, which we label $d_i^u \in \mathbf{d}^u = (d_1^u, d_2^u, \ldots, d_n^u)$,  do not have an effect on individual $i$'s probability of recruitment \citep{crawford2018hidden} in the RDS stochastic process model of Equation~\eqref{eq:crawlike}.

% We assess how the improvements in estimation of $A^S$ and $\lambda$ in Section~\ref{sec:subgraphimprov} affect estimation of the hidden population size, $N$. Following \citet{crawford2018hidden}, we construct a model for $N$ based on the assumption that $G$ is Erdos-Renyi with parameters $N,p$ \citep{erdHos1960evolution}, $G\sim \mathrm{ER}(N,p)$. The Erdos-Renyi model 
% %has been debated \citep{robins2001network}, it 
% has demonstrated practical utility when estimating hidden population size, forming the basis for methods such as the snowball sampling estimator \citep{frank1994estimating} and the network scale-up estimator \citep{killworth1998estimation}.
% %(we also test sensitivity to this homogeneity assumption in simulation). 
% %The empirical success of this graph model implies that the relationship between degree and population size may hold approximately regardless of nuances that violate the simple binomial model.
% %overly simplistic Erdos-Renyi setup. 
% %\begin{assump}[Erdos-Renyi Graph]
% %$G$, the hidden population network is Erdos-Renyi, $G\sim \mathcal{G}(N,p)$.
% %\end{assump}
% If the RDS sample were independently and identically drawn from $V$, we could leverage the graph model to estimate $N$ without complication since the following would be true,
% %the degree of each participant would be distributed as
% \begin{equation}
%     \label{eq:binUni}
%     d_i \sim \mathrm{Binomial}(N-1, p).
% \end{equation}
% RDS fails to sample uniformly at random in a variety of ways because it recruits along the graph of a social network (e.g., an individual's probability of being sampled depends on their degree \citep{heckathorn1997respondent, gile2011improved}). If $A^S$ is known, we can calculate the edges individual $i \in \{1,2,\ldots,n\}$ shares with unsampled members of the hidden population at the time of individual $i$'s recruitment. These connections, which we label $d_i^u \in \mathbf{d}^u = (d_1^u, d_2^u, \ldots, d_n^u)$,  do not have an effect on individual $i$'s probability of recruitment \citep{crawford2018hidden} in the RDS stochastic process model of Equation~\eqref{eq:crawlike}. Formally,
% %(or, in other words, the probability of recruitment is solely dependent on the connections an unobserved node has to active recruiters). 
% %In fact, any model that satisfies the following assumption can leverage the theory developed in this section.
% %\begin{assump}[Recruitment Probability]
% %A vertex's probability of recruitment to the sample is independent from connections it has to currently %unrecruited  nodes.
% %\end{assump}
% %Consequently, 
% %although a joint probability model for $G^S$ and $N$ is not easily accessible, 
% %we can model population size through the pendant edges. 
% %If $A^S$ is known, we can record the quantity $d_i^u$ for $i \in \{1,2\ldots,n\}$, the number of edges node $i$ has to unobserved nodes when it is recruited, 
% \begin{equation}
%     \label{eq:diu}
%     d_i^u = d_i - \sum_{j=1}^{i-1} 1\{\{i,j\} \in E^S \}.
% \end{equation}
% Unlike $d_i$, this quantity is independently and identically distributed from a Binomial distribution,
% \begin{equation*}
%         d_i^u \sim \mathrm{Binomial}(N-i, p).
% \end{equation*}
% Therefore, the likelihood for $N$ and $p$ given $A^S$ is
% \begin{equation}
%     \label{eq:poplike}
%     \mathcal{L}(N,p | A^S) = \prod_{i=1}^n {N-i \choose d_i^u } p^{d_i^u}(1-p)^{N-i-d_i^u}.
% \end{equation}
\subsection{Revising Current Approaches}
\label{sec:revcurapp}
Based on this population size model, \citet{crawford2018hidden} propose an approximate Bayesian MCMC sampling scheme with strong priors on $p$ and $A^S$ to conduct inference on $N$. 
They find that informative priors on $p$ are necessary for ensuring finite first and second moments of the posterior distribution for $N$. %and conclude that the prior on $p$ 
%in the Erdos-Renyi model 
%must be very informative for valid inference. 
For example, the most diffuse prior on $p$ 
% they use 
in their simulations has a variance of about %$10^{-6}$.
$5\!\!\times\!\!10^{-6}$. %and the concentration of this distribution only increases from there. 
Moreover, they 
% require 
use
an
%but does not explain 
informative prior on the graph space $\pi(A^S)\! \propto\! \exp(-\gamma |E^S|)$, where $\gamma\! = -\log\left \{ p/(1-p)\right \}$
%($\gamma$ is defined without the negative in the JASA paper, but I found it with the negative in the archive version- which makes a lot more sense given what we know about the bias in the problem). 
% In the simulations they conduct, $p$ is on the order of $10^{-3}$ to $10^{-6}$ so $\gamma$ 
ranges from about $5$ to $9$, %(or $6$ to $12$ if $ln$),
imposing heavy penalties on graphs with large edge sets.
%This counteracts the bias described in section $3$, but
% This has no theoretical justification and
% %Additionally, their simulations indicate that the prior is actually over-compensating for the natural bias in the estimation problem as it favors graphs with smaller edge sets than the truth. This 
These priors inflate the posterior mean of $N$, resulting in significant upward bias.

%Choosing sufficiently non-informative priors would yield an improper posterior \citep{kahn1987cautionary}, but given the nature of the populations we aim to study, it is unlikely that such strong priors are scientifically meaningful. 

Prior selection is non-trivial in our problem. Choosing a non-informative prior risks an improper posterior \citep{kahn1987cautionary}, but, given the nature of the populations we aim to study, it is unlikely that strong informative priors are scientifically justifiable.
% 
% We consider situations in which very little is known about the hidden population and specifying multiple informative priors is not possible. 
% 
Moreover, full posterior inference for $N$ is not possible due to computational constraints, requiring multiple approximations \citep{hunter2006inference,crawford2018hidden}.
We avoid these issues by reformulating the problem as regularized estimation, which incorporates information on edge prevalence, $p$, via a regularization term. Let $\mathcal{L}(N,p \mid A^S) = \prod_{i=1}^n \binom{N-i}{d_i^u} p^{d_i^u}(1-p)^{N-i-d_i^u}$ denote the likelihood implied by the Binomial model for $d_i^u$ given $A^S$. Given the regularization function $R(\check{p}) = \log \mathrm{Beta}(\check{p};a,b)$ for $a,b \in \mathbb{R}^+$, we define the regularized estimates of $N,p$ conditional on $\widehat{A}^S_{n}$ and $\widetilde{A}^S_{n}$ as
\begin{align*}
    & \left \{ \widehat{p}, \widehat{N} \right \} =  \arg\max_{\check{p}, \check{N}} \ \log \mathcal{L}(\check{N},\check{p} | \widehat{A}^S_{n}) + R(\check{p}), \\
    &\left \{ \widetilde{p}, \widetilde{N} \right \} =  \arg\max_{\check{p}, \check{N}} \ \log \mathcal{L}(\check{N},\check{p} | \widetilde{A}^S_{n}) + R(\check{p}).
\end{align*}
% We compare $\widetilde{N}$ and $\widehat{N}$ in Table~\ref{tab:MADS} in Appendix~\ref{app:addsim} and Table ~\ref{tab:MADW} in Section~\ref{sec:sims}.%, 
%but first we illustrate how auxiliary information can be leveraged in these estimation processes.

%Consequently, in order to avoid 
%the unrealistic requirement of 
%highly informative priors for accurate posterior inference, we will focus on the MAP for population size $N$. We demonstrate the utility of the IIE in Table~\ref , which compares the MSE of $\widehat{N} = \arg \max_N \mathcal{L}(N | \widehat{A}^S_{n})$ to $\widetilde{N} = \arg \max_N \mathcal{L}(N | \widetilde{A}^S_{n})$ and finds that $\widetilde{N}$ is a large improvement over the MLE. 
%\{ Intuition for $\widetilde{N}$'s better performance can be motivated by... (maybe include graphic comparing degree distributions of $\widetilde{G^S}$ and $\widehat{G^S}$) \} \{not positive if we need to include this \}.
%\begin{figure}[h]
%\centering
%\includegraphics[width = 0.8\linewidth,height = 12cm]{Figures/Prop_MSE_Plot.png}
%\caption{This figure compares the MSE of the IIE and the MLE proportional to the MSE of the population size estimate with full graph knowledge ($GMSE$). The results are depicted for a range of hidden population settings ($N$ is the total population size and $Np$ is the average degree of individuals) and are averaged over simulations. We can see that the IIE is performing favorably in a range of prior settings, but the relative gains over the MLE increase as we assume more knowledge about $p$ (the prevalence of edges in the hidden population network).}
%\label{fig:PropMSEReswoAdd}
%\end{figure}

%\begin{figure}[h]
%\centering

%\includegraphics[width = 0.8\linewidth,height = 12cm]{Figures/II_MSE_Plot.png}

%\caption{This figure depicts the MSE of the IIE proportional to the MLE MSE for a range of hidden population settings ($N$ is the total population size and $Np$ is the average degree of individuals) averaged over simulations. We can see that the IIE is performing favorably in a range of prior settings, but the relative gains over the MLE increase as we assume more knowledge about $p$ (the prevalence of edges in the hidden population network).}
%\label{fig:PropMSEReswoAdd}
%\end{figure}


%\begin{figure}[h]
%\centering

%\includegraphics[width = 0.8\linewidth,height = 12cm]{Figures/Accuracy_Plot_2.png}

%\caption{This figure compares the true positive rate, $TPR$, of the indirect inference and MLE graph estimates. We can see that the IIE is an improvement for a variety of hidden population settings ($N$ is the total population size).}
%\label{fig:AccReswoAdd}
%\end{figure}
\subsection{Improving Estimation Using Auxiliary Information}
\label{sec:addinfo}

%Although our alternative estimator is more accurate than current estimators of $A^S$ and $N$, there is still room for improvement.
The RDS data collection process commonly includes a large survey that can be used to improve population size estimation \citep[e.g.][]{frost2006respondent,wu2017using}. In particular, it is common to track how information accumulates over the RDS process, and this measurement necessarily carries information about the underlying network.
% % We incorporate auxiliary information collected during RDS in the estimation of population size. 
% %The general intuition for this idea rests on improving our estimate of $A^S$ %(and therefore of $N$)
% %by observing a probabilistic function of the subgraph. 
% %However, RDS limits the types of information we can observe since repeat contact of the study participants may be difficult and/or expensive.
% %Consequently, it is necessary to devise a method for
% This involves casting RDS participant information as network dependent outcomes.
%as the sample is being conducted in sequence.
% We will motivate this idea with the follo example.
For example, an RDS interview may begin with a quiz about local free resources, important public health issues, or beneficial health practices (e.g., for People Who Inject Drugs this might include drug therapy options or needle exchange sites). The interview ends with the interviewer revealing the answers to the quiz so that each study participant leaves the study with the same amount of information. The performance of a study participant on this quiz is a graph-dependent outcome, $\mathbf{Q}$. Below we propose a model for $\mathbf{Q}$ that, when combined with the IIE approach of Section~\ref{sec:indinf}, substantially improves 
% provides substantial improvements over 
the population size estimates of the previous section.
%
% The theme should be useful to an arbitrary hidden population member, e.g., for People Who Inject Drugs (PWID) this might include drug therapy options or needle exchange sites. The performance of a study participant on this quiz is the graph dependent outcome, $\mathbf{Q}$. The interview ends with the interviewer revealing the answers to the quiz so that each study participant leaves the study with the same amount of information.

\begin{remark}
    Other graph-dependent outcomes are certainly possible: measurements may depend on participant interactions with their friends or require participants to quantify some characteristic of their referral chain. These different types of $\mathbf{Q}$ would simply require different models from the ones we study below, but could otherwise be easily incorporated into the analysis.
\end{remark}

% We posit a plausible model for a generic graph dependent outcome, $\textbf{Q}$. 
Define monotonically increasing functions $f:\mathbb{R} \to \mathbb{R}$ and $g:\mathbb{R}\to \mathbb{R}$, $\mathbf{1}_n = (1,1,\ldots,1) \in \mathbb{R}^n$, and zero-mean distribution $F$. If we assume that there is communication over the network, then the performance of an interviewee on the quiz should be proportional to their connections to previously recruited study participants,
\begin{equation}
    \label{eq:addinfo}
    q_i = f\left \{\alpha + \gamma g \left ( m_i \right ) \right \}+ \epsilon_i, \ \epsilon_i \overset{\mathrm{i.i.d.}}{\sim} F,
\end{equation}
where $\mathbf{m} = \left \{ A^S \cdot \mathrm{lt}(\mathbf{1}_n\mathbf{1}_n^\top)\right \}\mathbf{1}_n$. The $i$th entry of $\mathbf{m}$ is the number of neighbors of study participant $i$ who were recruited before participant $i$.
In Equation~\eqref{eq:addinfo}, $\alpha$ represents an average hidden population member's knowledge of the quiz subject without outside intervention and $\gamma$ is the intensity of communication flow. 
%Additionally, the number of previously recruited participants connected to study participant $i$ (when they are recruited), $m_i$, is
%$d-d^u$ or $\sum_{j=1}^{i-1} 1\{\{i,j\} \in E^S \}$, 
%based on $A^S$ when calculating $d_i^u$ for population size estimation (refer to Equations~\eqref{eq:diu} and \eqref{eq:poplike}). 
Adding information about $\mathbf{Q} = (q_1,q_2,\dots,q_n)$ to our analysis will improve estimation of $\mathbf{m}$, which will improve estimators of $A^S$ and $N$.

We now augment our IIE procedure with the auxiliary information contained in $\mathbf{Q}$. 
We expand $\mathbf{Y}$ to include the regression information, $\mathbf{Y}^r = (\mathbf{Q}, G^R, d, \textbf{w}) \in \mathcal{Y}^r$. 
%Defining $\beta = (\mu, \gamma, \sigma^2)$, the likelihood for $\mathbf{Y}^r$ is
Define $\lambda^\dagger_r : \mathcal{Y}^r \to \mathbb{R}$ as the function that maps the data, $\mathbf{Y}^r$,
%(which now includes the likelihood for $\mathbf{Q}$)
to the MLE for $\lambda$. Additionally, define $A^{S,r}_\lambda: \mathcal{Y}^r \times \mathbb{R} \to \left \{ 0, 1 \right \}^{n \times n }$ 
%as a map from the data to the MLE estimator of $A^S$  holding $\lambda \in \mathbb{R}^+$ fixed.
so that for value $\lambda' > 0$, $A^{S,r}_\lambda(\mathbf{Y}^r, \lambda')$ is the MLE estimator of $A^S$ holding $\lambda$ fixed at $\lambda'$.
%The IIE is now the pair$(\widetilde{\lambda}_n^r, \widetilde{A}^{S,r}_{n} =  A^{S,r}_\lambda(\mathbf{Y}^r, \widetilde{\lambda}_n^r))$, which solve
Let $\widetilde{\lambda}_n^r$ solve
\begin{equation}
    \label{eq:IIEInfo}
    \mathbb{E}_{\mathbf{Z}^r \sim P_{A^{S,r}_\lambda(\mathbf{Y}^r, \widetilde{\lambda}_n^r), \widetilde{\lambda}_n^r}} \left \{ \lambda_r^\dagger(\mathbf{Z}^r) \right \}  = \lambda_r^\dagger(\mathbf{Y}^r),
\end{equation}
and let $\widetilde{A}^{S,r}_{n} = A^{S,r}_\lambda(\mathbf{Y}^r, \widetilde{\lambda}_n^r)$. The IIE is then the pair $(\widetilde{\lambda}_n^r, \widetilde{A}^{S,r}_{n})$.
The expectation in Equation~\eqref{eq:IIEInfo} is over simulated data $\mathbf{Z}^r = (\mathbf{Q}, G^R, \mathbf{d}, \textbf{w}^*) \in \mathcal{Y}^r$, where $\textbf{w}^* \sim P_{A^S, \lambda}$ and $P_{A^S, \lambda}$ is the generative model described in Equation~\eqref{eq:crawlike}.
Algorithm~\ref{alg:indestaux} in Appendix~\ref{app:indinf} builds on Algorithm~\ref{alg:altest} and provides the complete description for this computation. The regularized estimator of population size conditional on the IIE with auxiliary information is
\begin{align*}
    &\left \{ \widetilde{p}^r, \widetilde{N}^r \right \} =  \arg\max_{\check{p}, \check{N}} \ \log \mathcal{L}(\check{N},\check{p} | \widetilde{A}^{S,r}_{n}) + R(\check{p}).
\end{align*}
%We observe in Table~\ref{tab:2} that the indirect estimator with regression information, which is constructed using Algorithm~\ref{alg:indestaux}, significantly outperforms all previous estimation techniques and approaches the MSE of the population size estimator with perfect knowledge of $A^S$.

\section{Population Size Estimation Simulations}
\label{sec:sims}
In this section, we empirically evaluate the IIEs of hidden population size with and without auxiliary information. The first simulation study compares our estimators to state-of-the-art competitors for a variety of population sizes and graph densities. 
The second and third simulations showcase the robustness of our estimators to different graph models.
% $\widehat{N}$, $\widetilde{N}$, and $\widetilde{N}^r$.
\paragraph{Simulation 1.}
For each simulation, we draw a hidden population graph from an Erd\H{o}s--R\'enyi model, $G \sim \mathrm{ER}(N,p)$, varying $N \in \{1000, 5000, 10000\}$ and $Np \in \{5,10,15\}$. We then simulate an RDS study of size $n=100$ over this graph, starting from $3$ random seeds. This RDS follows the generative model specified in Equation~\eqref{eq:crawlike} with $\lambda = 1$ and 5 coupons. Letting $\mathbf{m} = \left \{ A^S \cdot \mathrm{lt}(\mathbf{1}_n\mathbf{1}_n^\top)\right \}\mathbf{1}_n$, we observe a vector of study participant attributes, $\mathbf{Q} = (q_1,q_2,\dots,q_n)$, drawn according to
\begin{equation*}
    q_i = \alpha + \gamma m_i + \epsilon_i, \quad \epsilon_i \overset{\mathrm{i.i.d.}}{\sim} \mathcal{N}(0, \sigma^2),
\end{equation*}
which is within the class of models outlined in Equation~\eqref{eq:addinfo}. We set $\alpha = 0$, $\gamma = 1$, and $\sigma^2 = 1$ %To calculate the regularized MLE estimate of $N$, 
and experiment with regularization information on $p$ to
explore the utility of social network edge density information when estimating population size. This procedure is repeated 200 times for each simulation setting. %\textit{Importantly, our procedure does not require an informative prior on $p$ or $A^S$ to function.}

We compare our estimators to the MLE derived in \citet{crawford2018hidden} as well as to several estimators proposed in \citet{handcock2014estimating} that use the successive sampling (SS) method. Under a uniform prior, the SS estimators, which are posterior summaries, require the researcher to specify the maximum that the population size can attain, $N_{\max}$. For a given $N$, we use values $N_{\max} \in \{3N,5N,8N\}$. 

%Table~\ref{tab:MADS} in Appendix~\ref{app:addsim} and Table~\ref{tab:MADW} 
Figure~\ref{fig:estimators} reports the results across all nine simulation setups. First, we note that the IIEs with and without auxiliary information have lower maximum absolute deviation (MAD) than the MLE over a range of hidden population graph sizes and densities.
The weak regularization information setting is defined by $R(\check{p}) = \log \mathrm{Beta}(\check{p};a, b)$, where $\mathrm{Beta}(\check{p};a, b)$ is centered at $p$ with $a = 0.1$. Consistent with Remark~\ref{rem:graphchar}, the improvements of the IIE without auxiliary information over the MLE are greater in high average degree settings. The improvements of the IIE with auxiliary information over the IIE without auxiliary information follow the same pattern. 
When comparing to the SS approach, we note that the estimators based on this procedure are very sensitive to the prior specification. 
In fact, Figure~\ref{fig:estimators} shows that the MAD for the SS Mean estimator (the posterior mean) where $N_{\max} = kN$ for $k \in \{3,5,8\}$ is almost exactly $|(k-2)N-n|/2$, which is the absolute difference between the prior mean and $N$. 

%Figure~\ref{fig:estimators} indicates that the IIEs with and without auxiliary information have lower MAD than the MLE over a range of hidden population graph sizes and densities.
% The weak regularization information setting is defined by $R(p) = \log \mathrm{Beta}(p;a, b)$, where $\mathrm{Beta}(p;a, b)$ is centered at $p$ with $a = 0.1$. Consistent with Remark~\ref{rem:graphchar}, the improvements of the IIE with auxiliary information over the MLE are greater in high average degree settings. The improvements of the IIE with auxiliary information over the IIE without auxiliary information follow the same pattern. 
In Appendix~\ref{app:strongreg}, we explore the role of regularization in our estimator. Figure~\ref{fig:strongestimators} in Appendix~\ref{app:strongreg} shows that in the strong regularization setting, where $R(\check{p}) = \log \mathrm{Beta}(\check{p};a, b)$ and $a = 10$, the improvements of $\widetilde{N}$ and $\widetilde{N}^r$ over $\widehat{N}$ are greater in larger populations.
%Additionally, 
%Table~\ref{tab:MADW} in Appendix~\ref{app:addsim} shows that these advantages are also present under a weak prior for $p$.


\begin{figure*}
    \centering
    \includegraphics[width=\textwidth]{Figures/graph_complete_sims.pdf}
    \caption{This figure compares the performance of successive sampling estimators, $\widehat{N}, \widetilde{N}$, and $\widetilde{N}^r$ under weak regularization information over a series of population sizes, $N$, and average degrees, $Np$, with 90\% Monte Carlo confidence intervals.}
    \label{fig:estimators}
\end{figure*}

\paragraph{Simulation 2: Stochastic Block Model.}
We assess the sensitivity of our population size estimate results to the Erd\H{o}s--R\'enyi model assumption. Following \citet{crawford2018hidden} and \citet{gile2018methods}, we divide the hidden population into two groups, $V_A \subseteq V$ and $V_B = V \setminus V_A$.
%with sizes $N_A = |V_A|$ and $N_B = |V_B|$.
The probability of an edge between members of the same group is $p_{\mathrm{in}}$, and the probability of a connection between members of different groups is $p_{\mathrm{out}}$. For constant $c \in [0,1]$, we set $p_{\mathrm{out}} = cp_{\mathrm{in}}$. %We do not account for this graph structure in our estimation procedure and
%To center our prior for overall edge prevalence at a realistic value,
%Defining $A\in \{0,1\}^{N \times N}$ to be the adjacency matrix representing graph $G$, 
Lastly, we let $p^* =\mathbb{P}\left (\{i,j\} \in E \right )$, where nodes $i$ and $j$ are drawn uniformly at random from $V$. %(we derive an expression for $p^*$ in terms of $p_{\mathrm{in}}$ in Appendix~\ref{app:stochblock}). 
%$\{i,j\} \in E$, in terms of $p_{\mathrm{in}}$,
%\begin{align*}
%\label{eq:stochblock}
%\mathbb{E}\left (A_{ij}\right ) &=  \frac{c(N_a + N_B)(N_A + N_B - 1)p_{\mathrm{in}}}{2N_AN_B + c\left \{N_A(N_A-1) + N_B(N_B-1)\right \}}.
%\end{align*}
%This expression is derived in Appendix~\ref{app:stochblock}.

In this simulation, we set $N=5000$, $p^*=0.002$ (implying an average degree of $10$), $c=0.3$ and $N_A/N = 0.6$. 
We report estimates assuming an underlying SBM 
with strong regularization around $p_{\mathrm{in}}$ and $p_{\mathrm{out}}$ (details in Appendix~\ref{app:stochblock}) and estimates under misspecification of the network model as an Erd\H{o}s--R\'enyi model. Figure~\ref{fig:block} in Appendix~\ref{app:stochblock} indicates that the estimator error when the network model is correctly specified follows the same pattern as Figure~\ref{fig:estimators}. Figure~\ref{fig:block} also shows that ignoring the block structure results in significantly worse estimator performance.
% We %explore how the misspecified estimator (which assumes the Erdos-Renyi model) performs in various SBM settings to
% conduct a more extensive analysis of estimator error when ignoring the underlying block structure in 
Additional analysis under network model misspecification is in
Appendix~\ref{app:stochblock}. %The results of this experiment are described in Table~\ref{tab:stochblockstr} in Appendix~\ref{app:stochblock}. 
Encouragingly, both the correctly and incorrectly specified IIEs %(with and without auxiliary information) 
perform better than their MLE counterparts.
%We also explore how the misspecified estimator (which assumes the Erdos-Renyi model) performs in various SBM settings. This experiment assesses the performance of our estimators if we are unaware of the underlying block structure.
%We also assess the performance of our estimators if we are unaware of the underlying block structure. This experiment uses the Erdos-Renyi model-based estimators in various SBM settings.
%For these simulations, we again set $N=5000$ and $p^*=0.002$ and assume (very strong) incorrect regularization information. As expected, Table~\ref{tab:stochblockstr} in Appendix~\ref{app:stochblock} indicates that the MAD of $\widehat{N}$, $\widetilde{N}$, and $\widetilde{N}^r$ is higher when the blocks are evenly split and the difference between $p_{\mathrm{in}}$ and $p_{\mathrm{out}}$ is large.
%For example, when $c = 0.3$ and the groups are evenly split, the estimators demonstrate a $150\%-300\%$ increase in MAD over the estimators in the Erdos-Renyi model setting, while when $c = 0.9$ and $N_A/N = 0.75$, the increase is only $19\%-34\%$. 
\paragraph{Simulation 3: Latent Space Model.}
We further assess the sensitivity of our estimation techniques by generating network data from the more general 
% to model assumptions by generating the hidden population graph from
% %a %2-d 
% a 
latent space inner product model. We allow edge probabilities to range from $1.5\times 10^{-3}$ to $2.7\times 10^{-3}$ with an expected degree of 10. In this context, we use estimators that assume the Erd\H{o}s--R\'enyi model. Figure~\ref{fig:latent} in Appendix~\ref{app:latentspace} indicates that these estimators, while being incorrectly specified, still yield substantively similar results to Figure~\ref{fig:estimators}.

\section{How many people inject drugs in  Kohtla-Jarve, Estonia?} %the Kohtla-Jarve region of
\label{sec:appl}

According to the European Drug Report 2023, from 2015 to 2021 Estonia had the highest per capita prevalence of People Who Inject Drugs (PWID) in Europe. 
%This small country in Eastern Europe with a population of only 1.34 million also has one of the highest per capita HIV incidences in the European Union \citep{uuskula2020fentanyl}. 
There is also evidence of high HIV prevalence \citep{degenhardt2017global} and drug overdose death rates (related to the introduction of Fentanyl) among PWID in Estonia during this time period \citep{uuskula2020fentanyl}. 
To lower the prevalence of HIV among PWID in Estonia, syringe exchange programs were launched in 1997 \citep{wu2017using}.
Estimating the size of the PWID population sheds light on the magnitude of this public health crisis and the necessary scope of potential policy solutions. 
% Specifically, syringe exchange programs were launched in 1997 to lower the prevalence of HIV among PWID in Estonia. Without estimates of the PWID population size, it is difficult to confirm that this program's current resources are sufficient \citep{wu2017using}.

\citet{wu2017using} use data from an RDS sample conducted in 2012 to estimate the number of PWID in the Kohtla-Jarve region of Estonia. They compare a series of models including the standard multiplier method \citep{fearon2017sample}, successive sampling \citep{johnston2010respondent}, and a network-based approach 
%(Equation~\eqref{eq:crawlike}) our method is based on 
\citep{crawford2018hidden}. This RDS sample began with 6 seeds and includes 600 participants from the Kohtla-Jarve region. The data on each member of the study includes their arrival time, degree, recruiter identity, and allotted coupons. We use the IIE approach of Section~\ref{sec:size}, estimating the population size to be $\widetilde{N} = 795$ and the average wait time to be $1/\widetilde{\lambda}_n = 1/0.23$.
%without requiring the extensive prior specification inherent to other methods \citep{crawford2018hidden, handcock2014estimating}. 
This is within the intervals implied by previous estimates \citep{wu2017using}.

These data further include an indicator of whether the participant is using antiretroviral therapy (ART) for HIV. We use this covariate and the RDS sample to construct a data-realistic simulation study to showcase how a hypothetical network-based covariate could assist in estimating population size. A simple change to the study could have asked each person to share their ART status with their social connections in the PWID population (to hopefully increase screening for HIV and uptake of ART). The auxiliary information to be collected from each RDS participant is then a measurement of how many people have shared their ART status with them since the beginning of the study. Letting $\mathbf{x}_{ART} \in \{0,1\}^n$ be the indicator of ART status, the responses to this question, $\mathbf{Q} = (q_1, q_2, \ldots, q_n )$, 
%as the number of previously recruited study participants they are connected observed augmented with iid noise,
could follow a Poisson model similar to the one described in Section~\ref{sec:sims}, 
\begin{equation*}
    q_i \sim \mathrm{Poisson} \left (\left [\left \{ A^S \cdot \mathrm{lt}(\mathbf{1}_n \mathbf{1}_n^\top) \right \} \mathbf{x}_{ART} \right ]_i \right ).
\end{equation*}
The simulation proceeds as follows: we first select an $A^S$ that is compatible with the observed RDS. Treating this $A^S$ as ground truth, we set $N = 1105$ (this is the most likely population size that could have generated that $A^S$). Finally, we set   
% For this simulation we select a subgraph $A^S$ that is compatible with the observed RDS and set 
$\lambda = \widetilde{\lambda}_n = 0.23$ as was estimated without auxiliary information. 
% Treating these selected simulation parameters as ground truth, implies that the population size is $N = 1105$.
% \textcolor{red}{We set $N = 1105$ since this is the most likely population size under these parameters}. 
We incorporate the auxiliary information in $\mathbf{Q}$ to improve our estimation of $N$ as outlined in Section~\ref{sec:addinfo}. Table~\ref{tab:application} compares $\widetilde{N}^r$ and $\widehat{N}$, and we see that when such auxiliary information is available, leveraging it improves population size estimation (by approximately 20\%).

\begin{table}
\caption{Population Estimation MAD}
\label{tab:application}
\centering
% \begin{threeparttable}
\begin{tabular}[t]{rrr}
\toprule
Algorithm & MAD & Std.\\
\midrule
\cellcolor{gray!6}{MLE} & \cellcolor{gray!6}{219.1} & \cellcolor{gray!6}{9.3}\\
IIE w/ Info & 181.3 & 6.8\\
\bottomrule
\end{tabular}
\begin{tablenotes}
     \item This table displays the MAD of population size estimates for the case study of Section~\ref{sec:appl}.
\end{tablenotes}
% \end{threeparttable}
\end{table}

\section{Conclusion}

RDS provides access to populations often excluded from scientific discourse. Although this sampling process presents a variety of inferential problems, it also contains valuable information on the social network connecting study participants. This paper expands on the existing literature with two new mechanisms for improving estimation of the study participant arrival rate, complete subgraph, and population size. The first accounts for the bias of the MLE using concepts from indirect inference, and the second proposes a mechanism for including auxiliary information.
Both methods combine to achieve cutting-edge performance.

Although modeling arrival time data as independent and exponential is natural, loosening this assumption would allow for more realistic dependencies in the RDS model. %such as recruiter preferences. 
For example, future work could consider wait times that depend on recruiter covariates (accounting for recruiter preferences).
Additionally, the inferential advantage of including auxiliary information in the estimation procedure depends on the quality of this data. Future research could focus on optimizing auxiliary information collection for inferential targets such as population size and degree distribution. 
%This sampling design problem is complex because it involves collecting (often implicit) social connection information while maintaining privacy.
Lastly, accurate recovery of the sample subgraph is essential for tasks beyond population size estimation, such as running randomized experiments and measuring the efficacy of interventions on the RDS sample.

% useful for many inferential tasks other than population size estimation. For example, researchers may be interested in estimating the efficacy of an intervention over a hidden population by administering randomized treatment over an RDS sample. To accurately measure the treatment effect, they would need to incorporate network interference %along the underlying network between sample participants 
% --- necessitating the estimation of the sample subgraph.

\section{Acknowledgements}
The authors would like to thank the reviewers for numerous helpful comments. In conducting this research, Alexander Volfovsky was partially supported by the National Science Foundation (NSF) Faculty Early Career Development Award DMS-2046880 and NSF award DMS-2230074. Justin Weltz was partially supported by NSF award DMS-2046880 as well. Eric Laber acknowledges support from the NSF award DMS-CDS\&E-MSS-2346292 and the National Institutes of Health award R01DA056407.


%(design and analysis of experiments for complex social processes)
%(equilibrium, network formation, and infectious-disease spread)

% References
\bibliography{bibliography}

\newpage

\onecolumn

\title{Supplementary Material}
\maketitle

\appendix
\section{Additional Simulation Results for Section~\ref{sec:subgraphimprov} }
\label{app:addsim}

This section contains simulation results that are referenced in Section~\ref{sec:subgraphimprov} of the main text. It continues the empirical evaluation of the IIE for the two model parameters in Equation~\eqref{eq:crawlike}: $A^S$, the subgraph between study participants, and $\lambda$, the study participant arrival rate. Table~\ref{tab:graphtpr} in Section~\ref{sec:subgraphimprov} of the main text shows that $\widetilde{A}^S_n$, the IIE of $A^S$, has a higher true positive rate than $\widehat{A}^S_n$, the MLE of $A^S$, across all simulation settings.

We first evaluate the error rates of  $\widetilde{A}^S_n$ and $\widehat{A}^S_n$ in more detail. Table~\ref{tab:graphtnr} reports the true negative rates (TNR) of $\widehat{A}^S_n$ and $\widetilde{A}^S_n$ over a range of graph densities and population sizes. It shows that there is no discernible difference between the TNR of the IIE and MLE in these settings. Therefore, the higher true positive rates of $\widetilde{A}^S_n$ depicted in Table~\ref{tab:graphtpr} do not come at the expense of overall accuracy.

We also compare the performance of the IIE and the MLE for $\lambda$ in terms of MSE.
Table~\ref{tab:lambda} shows that the IIE, $\widetilde{\lambda}_n$, is considerably more accurate than the MLE, $\widehat{\lambda}_n$, over a range of graph sizes and densities. 
We observe that $\widetilde{\lambda}_n$ has an MSE that is less than 50\% of the MSE of $\widehat{\lambda}_n$ across all settings. Additionally, the difference in MSE between $\widehat{\lambda}_n$ and $\widetilde{\lambda}_n$ is slightly higher with larger population sizes, which correspond to lower sample proportions (since the sample size is held fixed at $n=100$), and higher average degrees.

\begin{table}[h]

\caption{True Negative Rates of $\widehat{A}^S_n$ and $\widetilde{A}^S_n$ (\%) }
\label{tab:graphtnr}
\centering
%\begin{threeparttable}
\begin{tabular}[t]{rrrrrr}
\toprule
\multicolumn{2}{c}{ } & \multicolumn{2}{c}{MLE} & \multicolumn{2}{c}{IIE} \\
\cmidrule(l{3pt}r{3pt}){3-4} \cmidrule(l{3pt}r{3pt}){5-6}
Pop. & Deg. & Average & Std. & Average & Std.\\
\midrule
\cellcolor{gray!6}{1000} & \cellcolor{gray!6}{5} & \cellcolor{gray!6}{99.61} & \cellcolor{gray!6}{0.01} & \cellcolor{gray!6}{99.61} & \cellcolor{gray!6}{0.01}\\
1000 & 10 & 99.08 & 0.01 & 99.08 & 0.01\\
\cellcolor{gray!6}{1000} & \cellcolor{gray!6}{15} & \cellcolor{gray!6}{98.61} & \cellcolor{gray!6}{0.02} & \cellcolor{gray!6}{98.61} & \cellcolor{gray!6}{0.02}\\
5000 & 5 & 99.92 & 0.00 & 99.92 & 0.00\\
\cellcolor{gray!6}{5000} & \cellcolor{gray!6}{10} & \cellcolor{gray!6}{99.83} & \cellcolor{gray!6}{0.01} & \cellcolor{gray!6}{99.82} & \cellcolor{gray!6}{0.01}\\
5000 & 15 & 99.72 & 0.01 & 99.72 & 0.01\\
\cellcolor{gray!6}{10000} & \cellcolor{gray!6}{5} & \cellcolor{gray!6}{99.96} & \cellcolor{gray!6}{0.00} & \cellcolor{gray!6}{99.96} & \cellcolor{gray!6}{0.00}\\
10000 & 10 & 99.90 & 0.00 & 99.90 & 0.00\\
\cellcolor{gray!6}{10000} & \cellcolor{gray!6}{15} & \cellcolor{gray!6}{99.87} & \cellcolor{gray!6}{0.01} & \cellcolor{gray!6}{99.87} & \cellcolor{gray!6}{0.00}\\
\bottomrule
\end{tabular}
\begin{tablenotes}
    \footnotesize \item These are the true negative rates of $\widehat{A}^S_n$ and $\widetilde{A}^S_n$ for a series of population sizes (Pop.) and average degrees (Deg.). The standard deviations reported quantify the Monte Carlo error associated with these estimates based on 100 simulations.
\end{tablenotes}
%\end{threeparttable}
\end{table}

\begin{table}[ht]

\caption{MSE of $\widehat{\lambda}_n$ and $\widetilde{\lambda}_n$}
\label{tab:lambda}
\centering
%\begin{threeparttable}
\begin{tabular}[t]{rrrrrr}
\toprule
\multicolumn{2}{c}{ } & \multicolumn{2}{c}{IIE} & \multicolumn{2}{c}{MLE} \\
\cmidrule(l{3pt}r{3pt}){3-4} \cmidrule(l{3pt}r{3pt}){5-6}
Pop. & Deg. & Mean & Sd & Mean & Sd\\
\midrule
\cellcolor{gray!6}{1000} & \cellcolor{gray!6}{5} & \cellcolor{gray!6}{0.09} & \cellcolor{gray!6}{0.02} & \cellcolor{gray!6}{0.21} & \cellcolor{gray!6}{0.02}\\
1000 & 10 & 0.11 & 0.02 & 0.28 & 0.03\\
\cellcolor{gray!6}{1000} & \cellcolor{gray!6}{15} & \cellcolor{gray!6}{0.09} & \cellcolor{gray!6}{0.02} & \cellcolor{gray!6}{0.24} & \cellcolor{gray!6}{0.03}\\
5000 & 5 & 0.11 & 0.02 & 0.25 & 0.02\\
\cellcolor{gray!6}{5000} & \cellcolor{gray!6}{10} & \cellcolor{gray!6}{0.13} & \cellcolor{gray!6}{0.03} & \cellcolor{gray!6}{0.36} & \cellcolor{gray!6}{0.04}\\
5000 & 15 & 0.10 & 0.02 & 0.27 & 0.03\\
\cellcolor{gray!6}{10000} & \cellcolor{gray!6}{5} & \cellcolor{gray!6}{0.10} & \cellcolor{gray!6}{0.02} & \cellcolor{gray!6}{0.28} & \cellcolor{gray!6}{0.03}\\
10000 & 10 & 0.12 & 0.03 & 0.32 & 0.04\\
\cellcolor{gray!6}{10000} & \cellcolor{gray!6}{15} & \cellcolor{gray!6}{0.09} & \cellcolor{gray!6}{0.02} & \cellcolor{gray!6}{0.29} & \cellcolor{gray!6}{0.03}\\
\bottomrule
\end{tabular}
\begin{tablenotes}
    \small \item These are the MSEs of the $\lambda$ estimators for a series of population sizes (Pop.) and average degrees (Deg.). The standard deviations reported quantify the Monte Carlo error associated with these estimates over $100$ simulations.
\end{tablenotes}
%\end{threeparttable}
\end{table}

% \clearpage
\section{Stochastic Block Model Analysis}
\label{app:stochblock}

This section provides the details of Simulation 2 from Section~\ref{sec:sims} in the main text. 
%In this simulation, we test the robustness of our population size estimators to misspecification in the graph model. 
In this experiment, we test the performance of our population size estimators in a more complex graph model setting.
The Erdos-Renyi model we employ assumes that edges between members of the population form with the same probability, $p$. 
However, individuals may be more likely to form connections with one group of people than another. Consider the following generative model for the population graph, $G = (V,E)$. The hidden population is divided into two groups, $V_A \subseteq V$ and $V_B = V \setminus V_A$ with sizes $N_A = |V_A|$ and $N_B = |V_B|$. The probability of an edge between members of the same group is $p_{\mathrm{in}}$, and the probability of a connection between members of different groups is $p_{\mathrm{out}}$. For constant $c \in [0,1]$, we set $p_{\mathrm{out}} = cp_{\mathrm{in}}$ so that $p_{\mathrm{in}} \geq p_{\mathrm{out}}$. %We do not account for this graph structure in our estimation procedure and
%To center our prior for overall edge prevalence at a realistic value,
This is an example of a stochastic block model (SBM), which is used throughout network analysis \citep{holland1983stochastic, lee2019review, khabbazian2017novel}.
We let $p^* =\mathbb{P}\left (\{i,j\} \in E \right )$, where nodes $i$ and $j$ are drawn uniformly at random from $V$. Defining $E_{\mathrm{out}}$ and $E_{\mathrm{in}}$ as the sets of node pairs between and within groups respectively, we derive an expression for $p^*$ in terms of $p_{\mathrm{in}}$, $c$, $N_A$, and $N_B$,
\begin{align*}
p^* &= \mathbb{P}(\{i,j\} \in E) \\
&= \mathbb{P}(\{i,j\} \in E_{\mathrm{out}}) \cdot p_{\mathrm{out}} + \mathbb{P}(\{i,j\} \in E_{\mathrm{in}}) \cdot p_{\mathrm{in}} \\
& = \frac{2N_AN_B}{(N_A + N_B)(N_A + N_B - 1)}p_{\mathrm{out}} \\
& \quad  + \quad \frac{N_A(N_A-1) + N_B(N_B-1)}{(N_A + N_B)(N_A + N_B - 1)} p_{\mathrm{in}}.
\end{align*}
Because $p_{\mathrm{out}} = cp_{\mathrm{in}}$,
\begin{equation*}
    p^* = \frac{2cN_AN_B + N_A(N_A-1) + N_B(N_B-1) }{(N_A + N_B)(N_A + N_B - 1)}p_{\mathrm{in}}.
\end{equation*}
We use this expression to set the overall edge prevalence in the simulations below, making $N=5000$ and $Np^* = 10$.

We assess the performance of the population size estimators in the SBM setting. Assume that we observe the group membership of each study participant, and define $i_A, i_B$ as the number of individuals in groups $A$ and $B$ respectively that are recruited before participant $i+1$. %is recruited 
Labelling $N_i =\mathbb{I}(i \in V_A)(N_A - i_A) + \left \{ 1-\mathbb{I}(i \in V_A) \right \} (N_B - i_B)$, 
\begin{align}
    \label{eq:block}
    \mathbb{P}(d_i^u = d)  = & \sum_{j =0}^{d} \binom{N_i}{ j}p_{\mathrm{in}}^j (1-p_{\mathrm{in}})^{N_i - j}\binom{(N - i)- N_i }{ d - j} p_{\mathrm{out}}^{d - j} (1-p_{\mathrm{out}})^{ (N - i)- N_i -(d- j)}.
\end{align}
We define the following estimators based on Equation~\ref{eq:block},
\begin{align}
    \label{eq:stoch_est}
    \begin{split}
    & \left \{ \widehat{p}_{\inn}, \widehat{p}_{\out},  \widehat{N} \right \} =  \arg\max_{\check{p}_{\inn}, \check{p}_{\out}, \check{N}} \ \log \mathcal{L}(\check{N},\check{p}_\inn, \check{p}_\out | \widehat{A}^S_{n}) + R_\out(\check{p}_\out) + R_\inn(\check{p}_\inn), \\
    &\left \{ \widetilde{p}_{\inn}, \widetilde{p}_{\out}, \widetilde{N} \right \} =  \arg\max_{\check{p}_{\inn}, \check{p}_{\out}, \check{N}} \ \log \mathcal{L}(\check{N},\check{p}_\inn, \check{p}_\out | \widetilde{A}^S_{n}) + R_\out(\check{p}_\out) + R_\inn(\check{p}_\inn), \\
    &\left \{ \widetilde{p}_{\inn}^r, \widetilde{p}_{\out}^r, \widetilde{N}^r \right \} =  \arg\max_{\check{p}_{\inn}, \check{p}_{\out}, \check{N}} \ \log \mathcal{L}(\check{N},\check{p}_\inn, \check{p}_\out | \widetilde{A}^{S,r}_{n}) + R_\out(\check{p}_\out) + R_\inn(\check{p}_\inn),
    \end{split}
\end{align}
where $R_{\mathrm{out}}(\check{p}_\out) = \log \mathrm{Beta}(\check{p}_{\mathrm{out}};a_{\mathrm{out}}, b_{\mathrm{out}})$ and $R_{\mathrm{in}}(\check{p}_\inn) = \log \mathrm{Beta}(\check{p}_\inn;a_{\mathrm{in}}, b_{\mathrm{in}})$.

In our simulation, $c = 0.3$, $N_A/N = 0.6$, and auxiliary information is as specified in Simulation 1 of Section~\ref{sec:sims}. We assess the performance of correctly and incorrectly specified estimators, recognizing that the correctly specified estimators require observing the group label of study participants (but not of their unobserved neighbors). The correctly specified estimators are listed in Equation~\ref{eq:stoch_est}, and the incorrectly specified estimators assume the Erdos-Renyi model. For every correctly specified estimator, $e^{R_\out(\check{p}_\out)} = \mathrm{Beta}(\check{p}_{\mathrm{out}};a_{\mathrm{out}}, b_{\mathrm{out}})$ and $e^{R_\inn(\check{p}_\inn)} = \mathrm{Beta}(\check{p}_\inn;a_{\mathrm{in}}, b_{\mathrm{in}})$ are centered at ${p}_{\mathrm{out}}$ and ${p}_{\mathrm{in}}$ respectively with $a_{\mathrm{out}} = a_{\mathrm{in}} = 10$. 
For the misspecified estimators, we center $e^{R(\check{p})} = \mathrm{Beta}(\check{p};a,b)$ at $p_{\inn}$ and set $a = 10$.
Figure~\ref{fig:block} indicates that the estimators based on the correctly specified likelihood perform an order of magnitude better than the estimators assuming the Erdos-Renyi model. Additionally, the relationships between $\widehat{N}$, $\widetilde{N}$, and $\widetilde{N}^r$ mirror the results in the Erdos-Renyi setting (Figure~\ref{fig:estimators}). Lastly, both the correctly and incorrectly specified network-based estimators outperform the successive sampling estimator with $N_{\max}/N = 3$.

\begin{figure}[h]
    \centering
    \includegraphics[width=0.8\textwidth]{Figures/stoch_block_true.pdf}
    \caption{This figure compares the performance of a successive sampling estimator (mean of the posterior distribution), $\widehat{N}, \widetilde{N}$, and $\widetilde{N}^r$ in the SBM setting. The estimators preceded by ``Misp.'' incorrectly assume the Erdos-Renyi model. The figure includes 90\% Monte Carlo confidence intervals for each estimator.}
    \label{fig:block}
\end{figure}

We assess the sensitivity of incorrectly specified population size estimators to the SBM setting more extensively in Table~\ref{tab:stochblockstr} (auxiliary information is also as specified in Simulation 1 of Section~\ref{sec:sims}).
To illustrate the effect of misspecification, we vary $N_A/N$ and $c$. As $N_A/N \to 1$ (or $0$) or $c \to 1$, the Erdos-Renyi model becomes a better approximation of the truth. As $N_A/N \to 0.5$ and $c \to 0$, there is more heterogeneity in the graph edge probabilities, and the approximation becomes worse. We can see this pattern in Table~\ref{tab:stochblockstr}. The first line of the table, $N_A/N = 1$ and $c=1$, shows the MAD of our population size estimators under the Erdos-Renyi model for comparison. When $c=0.3$ and $N_A/N=0.5$, the error of the estimators is highest, and when $c=0.9$ and $N_A/N=0.75$, it is lowest; i.e., when $c = 0.3$ and the groups are evenly split, the estimators demonstrate a $169\%$--$340\%$ increase in MAD over the estimators in the Erdos-Renyi model setting, while when $c = 0.9$ and $N_A/N = 0.75$, the increase is only $19\%$--$34\%$.  The rest of Table~\ref{tab:stochblockstr} illustrates a continuous spectrum between these two extremes. Lastly, as mentioned in Section~\ref{sec:sims} of the main text, the incorrectly specified IIEs still perform better than their MLE counterparts.

\begin{table}[h]

\caption{Population Size Estimation MAD with Incorrect Strong Regularization Information in the SBM Setting}
\label{tab:stochblockstr}
\centering
%\begin{threeparttable}
\begin{tabular}[t]{rrrrr}
\toprule
$N_A/N$ & $c$ & MLE & IIE & IIE w/ Info\\
\midrule
\cellcolor{gray!6}{1.0} & \cellcolor{gray!6}{1.0} & \cellcolor{gray!6}{861.6} & \cellcolor{gray!6}{560.9} & \cellcolor{gray!6}{459.4}\\
0.50 & 0.3 & 2318.7 & 2071.8 & 2021.4\\
\cellcolor{gray!6}{0.50} & \cellcolor{gray!6}{0.6} & \cellcolor{gray!6}{1711.1} & \cellcolor{gray!6}{1425.1} & \cellcolor{gray!6}{1344.8}\\
0.50 & 0.9 & 1116.1 & 755.6 & 638.7\\
\cellcolor{gray!6}{0.75} & \cellcolor{gray!6}{0.3} & \cellcolor{gray!6}{1784.1} & \cellcolor{gray!6}{1519.8} & \cellcolor{gray!6}{1443.4}\\
0.75 & 0.6 & 1502.3 & 1178.5 & 1079.9\\
\cellcolor{gray!6}{0.75} & \cellcolor{gray!6}{0.9} & \cellcolor{gray!6}{1039.5} & \cellcolor{gray!6}{672.6} & \cellcolor{gray!6}{614.2}\\
\bottomrule
\end{tabular}
\begin{tablenotes}
    \item This table displays the Mean Absolute Deviation (MAD) of the population estimators over a series of $N_A/N$ and $c$ values.
     We use strong regularization information centered at $p_{\mathrm{in}}$ ($a=10$ in the standard $\log \mathrm{Beta}(\check{p};a,b)$ regularizer term) to mimic ignorance of the two-block structure. These results are averaged over simulations with Monte Carlo standard deviations below 25.
\end{tablenotes}
%\end{threeparttable}
\end{table}

\section{Latent Space Sensitivity Results}
\label{app:latentspace}

In this section, we present a sensitivity analysis referenced in Section~\ref{sec:sims}. Instead of drawing the hidden population graph from an Erdos-Renyi distribution, we generate it from a latent space inner product model in two dimensions. In this context, each member of the hidden population, $i \in V$, has an unobserved ``position,'' $x_i \in \mathbb{R}^2$, in latent space. The probability of an edge between individuals $i,j \in V$ is dictated by the inner product between $x_i$ and $x_j$,
\begin{equation*}
    \mathbb{P}\left ( \{i,j\} \in E \right ) = \frac{e^{\phi_0 + \phi_1x_i^\top x_j}}{1+e^{\phi_0 + \phi_1x_i^\top x_j}},
\end{equation*}
where $\phi_0, \phi_1 \in \mathbb{R}$. In this simulation setting, we set the population size equal to $5000$. For each $i \in V$, we draw $x_i$ independently,
\begin{equation*}
    x_i \sim \mathrm{Normal} \left \{ (0,0)^\top, \begin{pmatrix}
        0.01 & 0 \\
        0 & 0.01
    \end{pmatrix} \right \}, 
\end{equation*}
and set $\phi_0 = -6.21$ and $\phi_1 = 1$. This results in an expected overall degree of about 10. Under these parameters, the edge probabilities are approximately $1.5\times 10^{-3}$ to $2.7\times 10^{-3}$. The regularization term is set to $\log \mathrm{Beta}(\check{p};a,b)$, where $\mathrm{Beta}(\check{p};a,b)$ is centered at an approximation of the overall edge density with $a=0.1$ (weak regularization). Figure~\ref{fig:latent} indicates that the maximum likelihood estimator, $\widehat{N}$, based on the Erdos-Renyi assumption still has a lower mean absolute deviation (MAD) than the successive sampling estimator with $N_{\max}/N = 5$. Additionally, the indirect inference estimator, $\widetilde{N}$, and the indirect inference estimator with auxiliary information, $\widetilde{N}^r$ (using the same information as Simulation 1 of Section~\ref{sec:sims}), outperform $\widehat{N}$. This provides evidence that our estimation methods are still advantageous in misspecified settings. 
\begin{figure}[h]
  \begin{center}
    \includegraphics[width=0.4\textwidth]{Figures/ls_graph_size.pdf}
  \end{center}
  \caption{This figure compares the performance of a successive sampling estimator (mean of the posterior distribution), $\widehat{N}, \widetilde{N}$, and $\widetilde{N}^r$ when the Erdos-Renyi assumption is violated by drawing the population graph from a latent space model. It includes 90\% Monte Carlo confidence intervals for each estimator.}
  \label{fig:latent}
\end{figure}

% \clearpage
\section{Simulation Results under Strong Regularization for Section~\ref{sec:sims}}
\label{app:strongreg}

\begin{figure*}[h]
    \centering
    \begin{subfigure}{\textwidth}
    \centering
    \includegraphics[width=\textwidth]{Figures/graph_complete_sims_strong_reg.pdf}
    \caption{$R(\check{p}) = \log \mathrm{Beta}(\check{p};a,b)$ where $a = 1$}
    \label{subfiga1}
    \end{subfigure}
    \begin{subfigure}{\textwidth}
    \centering
    \includegraphics[width=\textwidth]{Figures/graph_complete_sims_stronger_reg.pdf}
    \caption{$R(\check{p}) = \log \mathrm{Beta}(\check{p};a,b)$ where $a = 10$}
    \label{subfiga10}
    \end{subfigure}
    \caption{This figure compares the performance of $\widehat{N}, \widetilde{N}$, and $\widetilde{N}^r$ under strong regularization information over a series of population sizes, $N$, and average degrees, $Np$, with 90\% Monte Carlo confidence intervals.}
    \label{fig:strongestimators}
\end{figure*}


In this section, we present the results of a simulation under strong (correctly specified) regularization. As described in Section~\ref{sec:revcurapp} of the main text, we use a regularized MLE approach to estimate population size to avoid specifying informative priors that are difficult to justify scientifically. 
The regularization function, $R(\check{p}) = \log \mathrm{Beta}(\check{p};a,b)$, incorporates information  on edge prevalence, $p$ --- where $\mathrm{Beta}(\check{p};a,b)$ is a Beta distribution that is centered at $p$ with a variance that is inversely proportional to $a$. Here, we use the same setup as Simulation 1 but vary the hyperparameters in the regularizer.

In Figure~\ref{fig:estimators} of Section~\ref{sec:sims} in the main text, we compare the MAD of $\widehat{N}$ (MLE), $\widetilde{N}$ (IIE), and $\widetilde{N}^r$ (IIE with auxiliary information) with $a = 0.1$, and a series of Successive Sampling (SS) estimators. We observe that $\widetilde{N}^r$ improves on $\widetilde{N}$, and both are more accurate than $\widehat{N}$. Additionally, the performances of the SS estimators are highly dependent on their prior. In Figures~\ref{subfiga1} and \ref{subfiga10}, we show the log(MAD) of $\widehat{N}, \widetilde{N}$, and $\widetilde{N}^r$ with $a = 1$ and $a=10$ respectively.
The relationships between estimators $\widehat{N}, \widetilde{N}$ and $\widetilde{N}^r$ mirror Figure~\ref{fig:estimators}. Encouragingly, the MAD of our population size estimators decreases significantly as $a$ increases, and, with strong regularization information, $\widehat{N}, \widetilde{N}$ and $\widetilde{N}^r$ are consistently more accurate than the SS estimators.


% \clearpage
\section{IIE and Successive Sampling Algorithm Details}
\label{app:indinf}

In this section, we present the details of Algorithms~\ref{alg:altest} and \ref{alg:indestaux} (Algorithm~\ref{alg:altest} is described in Section~\ref{sec:indinf} of the main text).

Both algorithms construct the IIE by finding the parameters under which the expected value of a calibration statistic is equal to the observed value, where the calibration statistic is set equal to the MLE of $\lambda$. In the simulations of Section~\ref{sec:sims} in the main text, we use $K=9$ grid values centered at $\widehat{\lambda}_n$,  the MLE for the observed data. Specifically, we set $\lambda^k = \widehat{\lambda}_n - (k-5)\times0.1$ for $k \in \{1,2,\ldots, 9\}$. The sets of candidate parameters are $\left \{ \lambda^k, A^{S}_\lambda(\mathbf{Y}, \lambda^k) \right \}_{k=1}^9$ and $\left \{ \lambda^k, A^{S,r}_\lambda(\mathbf{Y}^r, \lambda^k) \right \}_{k=1}^9$ for Algorithms~\ref{alg:altest} and \ref{alg:indestaux} respectively. For each candidate parameter, we approximate the expected value (setting $J = 25$) of the MLE of $\lambda$, labeling this quantity $\widehat{\lambda}_n^k$. The IIE is the set of parameters under which $\widehat{\lambda}_n^k$ is closest to $\widehat{\lambda}_n$.

As described in Section~\ref{sec:addinfo} of the main text,
Algorithm~\ref{alg:indestaux} augments Algorithm~\ref{alg:altest} with auxiliary information. 
We note that this implies the MLE is taken with respect to different likelihoods in Algorithms~\ref{alg:altest} and \ref{alg:indestaux}.
%, depending on whether auxiliary information is available.
Defining $\beta \in \mathbb{R}^p$ for $p \in \mathbb{N}$ as the parameter that indexes the distribution of $\mathbf{Q}$, the MLE referenced in Algorithm~\ref{alg:indestaux} is

\begin{align*}
    &\left \{ \widehat{A}^S_{n}, \widehat{\lambda}_n, \widehat{\beta}_n \right \} = \arg \max_{A^S \in \mathcal{A}, \lambda \in \mathbb{R}^+, \beta \in \mathbb{R}^p} \mathcal{L}_n(\mathbf{Y}, \mathbf{Q} | A^S, \lambda, \beta).
\end{align*}

Algorithms~\ref{alg:altest} and \ref{alg:indestaux} take about 24 hours to run with a sample size of $100$. They are implemented in the code included in the Supplementary Material.

\begin{algorithm}
\SetAlgoLined
We want to find the estimator,
\begin{equation*}
    \widetilde{\lambda}_n^r \in \arg\min_{\lambda \in \mathbb{R}^+} \left |\mathbb{E}_{\mathbf{Z} \sim P_{A^{S,r}_\lambda(\mathbf{Y}^r, \lambda), \lambda}} \left \{ \lambda^\dagger(\mathbf{Z}^r) \right \}  - \lambda^\dagger(\mathbf{Y}^r) \right |;
\end{equation*}
\noindent Generate a grid of $\lambda^k$ values, $k \in \{1,2,\ldots,K\}$ \;
\For{$k$ in $\{1,2,\ldots,K\}$}{
    \For{$j$ in $\{1,2,\ldots,J\}$}{
        Find $\left ( A^S_{k,j}, \beta^{k,j} \right ) = \arg\max_{A^S, \beta} \mathcal{L}(A^S, \beta | \lambda^k, \mathbf{Y}^r)$, where $\beta$ indexes the distribution of $\mathbf{Q}$\;
        %, or, in other words, the subgraph $G^S$ and regression vector $\beta$ that maximizes the observed data likelihood conditional (which now includes regression information) on a known $\lambda$ value equal to $\lambda^k$\;
        Simulate wait time vector $w^{k,j}$ from the model defined by parameters $A^S_{k,j}, \lambda^{k}$\;
        Find $\widehat{\lambda}_n^{k,j}, \widehat{A}^{S,k,j}_{n}$ by maximizing the likelihood given the generated data $\mathbf{Z}_{k,j}^r = (w^{k,j}, G^R, \mathbf{d}, \mathbf{Q})$\;
    }
    Save vector $(\lambda^k, A^S_k, \beta^k, \widehat{\lambda}_n^k=  \frac{\sum_{j=1}^{J} \widehat{\lambda}_n^{k,j}}{J})$\;
    %(where $G^S^k, \beta^k$ is just one of the ${G^S}^{k,j}, \widehat{\beta}^{k,j}$ - since they all represent possible local maximums);
}
Calculate $k^* = \arg\min_k \left |\widehat{\lambda}_n^k - \lambda^\dagger_r(\mathbf{Y}^r) \right | $\;
%(where $\widehat{\lambda}_n$ is from step 1 of our procedure);
Our estimator is then 
\begin{equation*}
    \left ( \widetilde{\lambda}_n^r, \widetilde{A}^{S,r}_{n}, \widetilde{\beta}^r \right )= \left ( \lambda^{k^*}, A^S_{k^*},\beta^{k^*} \right )
\end{equation*}
\caption{The Indirect Inference Estimator with Auxiliary Information}
\label{alg:indestaux}
\end{algorithm}

Lastly, we use the SSPSE package \citep{sspsepackage} under a ``flat'' prior setting to construct the SS estimators analyzed in Figures~\ref{fig:estimators} and \ref{fig:strongestimators}.




\end{document}
