\documentclass[
  journal=proceedings,
  manuscript=article-type,
  year=2024
]{PMET_proc}

%TC:incbib

\usepackage{amsmath}   % For aligned equations
\usepackage{amssymb}   % For additional math symbols
\usepackage{mathtools} % For improved mathematical formatting
\usepackage{bm}        % For bold symbols in equations
\usepackage[nopatch]{microtype}
\usepackage{booktabs}
\usepackage[           % enable hyperlinks but make them invisible
    colorlinks = true,
    urlcolor = blue,   % except external URLs
    linkcolor = black, 
    citecolor = black
]{hyperref}

\title{Exploratory subgroup detection of psychological networks: Assessing the impact of ordinal and skewed data}
\author{T. Kvetnaya}
\affiliation{Department of Psychology, Goethe University Frankfurt, 60478 Frankfurt am Main, Germany}
\email[T. Kvetnaya]{kvetnaya@psych.uni-frankfurt.de}

\author{K. J. Nehler}
\affiliation{Department of Psychology, Goethe University Frankfurt, 60478 Frankfurt am Main, Germany}
% \alsoaffiliation{Joint first authors}

\author{M. Schultze}
\affiliation{Department of Psychology, Goethe University Frankfurt, 60478 Frankfurt am Main, Germany}

\addbibresource{clusternetworks.bib}

\keywords{model-based clustering, ordinal data, network analysis, subgroup detection, Gaussian graphical models} %% First letter not capped

\begin{document}

\begin{abstract}
Exploratory subgroup identification can be a valuable tool for psychological network science, e.g., to identify patient subgroups with distinct symptom constellations in mental disorders. Gaussian mixture modeling (GMM) – a popular method for investigating heterogeneity in multivariate data – offers a promising avenue to achieve this. GMM approaches allow participants to be clustered into subgroups based on their subgroup-specific network structures, rather than symptom profiles or sumscores. Recent advancements have extended graphical GMM approaches to explicitly consider the structure of associations among variables within each cluster \parencite[e.g., ][]{fop2019}. By introducing a graph structure search step into the expectation–maximization (EM) algorithm, this approach allows for optimizing not only parameters but also graph edge sets. However, it assumes continuous, normally distributed data, whereas real-world psychological data are often ordinal and/or skewed in nature. In this study, we seek to explore how effectively the structural EM algorithm is able to recover underlying subgroups in data under conditions frequently encountered in psychological data. To this end, we generate cross-sectional data stemming from 3 subgroups with different degrees of network sparsity, echoing findings from previous network analyses of psychological disorders. By varying the cluster proportions, the number of ordinal answer categories, and variable skewness in the simulated datasets, we evaluate the performance of graphical GMM in terms of clustering and structure recovery. Classification goodness, as well as recovery of the true cluster proportions, edge sets, and weight estimates are used as performance indicators. 
\end{abstract}


\section{Introduction}
An intriguing question in network psychometrics is whether networks of psychological variables can be used to detect subgroups in an exploratory, data-driven manner.  
Model-based clustering, or Gaussian mixture modeling (GMM), is a popular method to uncover subgroups, which assumes that observations arise from a mixture of  distributions, each representing one cluster or subgroup \autocite{fraley1998}. 
It is common in fields handling high-dimensional data like bioinformatics \autocite{gao2016, mcnicholas2010}, though its use in psychology remains relatively scarce \parencite[e.g., ][]{paul2019}.
For continuous, normally distributed data, GMMs reliably identify the correct number of clusters \autocite{scrucca2016, keribin2000}.  
However, this performance is only guaranteed if the data-generating mechanism is correctly specified, with a common specification error occurring when ordinal data are treated as continuous \autocite{haslbeck2023}.
The present simulation study explores the impact of ordinal and skewed data on a clustering method for mixtures of Gaussian graphical models \parencite[\emph{mixggm}, ][]{fop2019} -- which is of particular interest for psychological applications.

\subsection{Sample heterogeneity in network psychometrics} \label{heterogeneity}

Sample heterogeneity refers to the possibility that observed data arise from several latent subgroups \autocite{hoekstra2022, fried2015a}. 
For instance, \textcite{fried2015a} demonstrated marked heterogeneity using questionnaire data from patients with depression, in which they identified 1,030 unique symptom profiles.
Indeed, a number of psychological hypotheses suggest the existence of subpopulations characterized by different network structures.
For example, network \emph{connectivity} or \emph{density} has been proposed to vary between groups with and without psychological disorders, with more severe symptomatology being associated with more densely connected symptom networks \autocite{vanborkulo2015, cramer2016}.
While such hypotheses have been predominantly investigated by comparing a priori-defined groups, exploratory, data-driven approaches to identify possible subgroups based on within-cluster associations could be particularly valuable.

Untangling sample heterogeneity based on empirical network structure has proven challenging, as existing GMM frameworks rarely incorporate within-cluster associations as the guiding property for clustering \autocite{fop2019}.
Some either assume local independence, i.e. diagonal within-cluster matrices \parencite[e.g. \emph{pgmm}, ][]{mcnicholas2010} or are computationally expensive and risk overparameterization \parencite[e.g. \emph{mclust}, ][]{scrucca2016,haslbeck2023}.
While methods have been proposed to model within-cluster associations for continuous data \autocite{gao2016, ren2022}, preliminary work showed a variety of convergence and estimation issues \autocite{kvetnaya2023}. 

\subsection{Ordinal and skewed data in psychology} \label{ordinal_data}

A further challenge is that psychological data are frequently ordinal \autocite{liddell2018} and/or not normally distributed in the population (e.g., suicidality scores), necessitating either robustness checks or dedicated modeling approaches.
For example, many established diagnostic instruments \parencite[e.g., ][]{zigmond1983, beck1996} use ordinal scales with four response categories, and Likert-type scales with 5 to 7 points are common \autocite{liddell2018}.  
This raises the longstanding question: When is it appropriate to treat ordinal data as continuous? 
Previous simulation research found that network estimation performance improved when the number of ordinal categories exceeds five ($c > 5$) and sample size $N$ is sufficiently large ($N > 3{,}000$), even under moderate to strong skewness \autocite{johal2021}. 
Similarly, \textcite{haslbeck2023} found that high clustering accuracy with \emph{mclust} \autocite{scrucca2016} can be achieved for $c > 5$ categories when $N \geq 1{,}000$ and clusters are well-separated.
However, especially with freely estimated covariance matrices, accuracy decreased as the number of parameters grew with a rising number of variables $p$, making model recovery difficult without a very large $N$ ($\geq 10{,}000$).

While existing approaches like \emph{clustMD} \autocite{mcparland2016} can accommodate ordinal data, they too assume local independence, as does latent class analysis \parencite[\emph{LCA}, ][]{visser2022}.
Another promising method for ordinal graphical models \autocite{lee2022} was not readily available at the time of this study.
We therefore proceed with the method by \textcite{fop2019}, which explicitly
considers within-cluster associations by optimizing for sparse precision matrices, thus avoiding overparameterization problems. 
In the presence of within-cluster associations, this method outperformed \emph{mclust} \autocite{fop2019}.
Therefore, the key question is whether existing findings about treating ordinal data as continuous apply to this method, which we will now introduce.

\subsection{Model-based clustering for sparse precision matrices \autocite{fop2019}} \label{fop}

The present method assumes that observations $\mathbf{x}_i$ are generated by a mixture of $K$ distributions.
The probability density function for each $\mathbf{x}_i$ is given by:

\begin{equation}
f(\mathbf{x}_i \mid \boldsymbol{\Theta}, \mathbb{G})~=~\sum_{k=1}^K \tau_k \, 
\phi\left(\mathbf{x}_i \mid \boldsymbol{\mu}_k, \boldsymbol{\Omega}_k, \mathcal{G}_k\right)
\end{equation}

where the parameter vector $\boldsymbol{\Theta}$ comprises cluster proportions $\tau_k$, means $\boldsymbol{\mu}_k$, and precision matrices $\boldsymbol{\Omega}_k$, with $\phi$ as the multivariate Gaussian density.  
$\mathbb{G}$ denotes the collection of graph structures $\mathcal{G}_k$, encoded by adjacency matrices $\mathbf{A}_k$.

Typically, for a fixed number of clusters $K$, the classical EM algorithm \autocite{dempster1977} alternates between the \emph{expectation} (E) and the \emph{maximization} (M) steps. 
Crucially, \textcite{fop2019} supplement the M step with a \emph{structure search} (S) step, which determines the optimal $\mathcal{G}_k$. 

In the E step, the conditional expectation of the log-likelihood $\ell$ is computed given the observed data and $\boldsymbol{\Theta}$ from the previous iteration. 
In the M step, the algorithm maximizes the regularized log-likelihood $\ell_R$ with respect to $\boldsymbol{\Theta}$ and $\mathbb{G}$, adding a penalty term $Q$ to introduce sparsity in $\mathbf{A}_k$:

\begin{equation}
\ell_R~=~\sum_{i=1}^{N} \log \left\{ \sum_{k=1}^{K} \tau_k \, 
\phi \left( \mathbf{x}_i \mid  \boldsymbol{\mu}_k, \boldsymbol{\Omega}_k, \mathcal{G}_k \right) 
\right\}
+ \sum_{k=1}^{K} \log p(\boldsymbol{\Omega}_k)
- \sum_{k=1}^{K} Q(\mathbf{A}_k).
\end{equation}

Here, the term $\log p(\boldsymbol{\Omega}_k)$ introduces Bayesian regularization to prevent singularity in $\boldsymbol{\Omega}_k$.
In this work, we set the penalty $Q$ to correspond to the Bayesian Information Criterion (BIC):

\begin{equation}
Q_{BIC}(\mathbf{A}_k)~=~\frac{1}{2} E_k \log N,
\end{equation}

where $E_k$ denotes the number of edges in the candidate graph $\mathcal{G}_k$. 
In the S step, to efficiently explore the space of possible graph configurations, either a stepwise search or a probabilistic genetic algorithm can be used. 
Once an optimal $\mathbb{G}$ is identified, the method estimates a positive-definite $\boldsymbol{\Omega}_k$ with fixed zero entries given $\mathcal{G}_k$.  
BIC is then used to select the model with the optimal number of clusters $K_{opt}$.
Finally, individual observations are assigned to clusters.

\subsection{Research question}

While \emph{mixggm} offers a promising way to tackle the first challenge and identify underlying subgroups in psychological data based on their cluster-specific associations, the second challenge of handling ordinality and skew remains unresolved.
Our central question therefore is how tolerant \emph{mixggm} is to violations of distributional assumptions.  
We conducted a simulation to evaluate the method's performance under different degrees of assumption violations.
The outcome measures we focus on here are the recovery rate of the correct number of clusters $K$, the classification quality, and the recovery of edge structures within the clusters.


\section{Method} \label{method}

All analyses were conducted in R \parencite[Version 4.3.3, ][]{rcoreteam2024}.  
Model-based clustering was implemented using the \emph{mixggm} package \parencite[Version 1.0, ][]{fop2019}.  
All code used in the simulation study is available on OSF.io: \url{https://osf.io/6n9tq}.  

\subsection{Population networks}

First, we generated cross-sectional partial correlation matrices for $K~=~3$ clusters and $p~=~10$ variables each to serve as true population networks for data generation in the simulation study.  
To reflect hypotheses about differential network density in psychological research \autocite{cramer2016}, we varied the proportion of non-zero edges across population networks (90\%, 50\%, and 10\%, respectively).
To define edge weights for each population network, we first randomly sampled values from an exponential distribution, with smaller values occurring more frequently. 
This choice was informed by characteristics reported in recent network analysis studies of psychological disorders \autocite{wysocki2021}.
The sampled values were randomly assigned to the off-diagonal entries of each $10 \times 10$ matrix according to the predefined density level.
Diagonal entries were iteratively increased by a small constant until positive definiteness was achieved, while preserving the original association structure.
After this adjustment, the final partial correlation values ranged from 0.04 to 0.56.
All three population matrices are illustrated in Figure~\ref{pop_mats}.

\begin{figure}[t]
    \centering
    \includegraphics[width=0.95\linewidth]{pop_mats.png}
    \caption{Partial correlation matrices representing the true population networks used to generate data in each simulation replication. Network density and edge weight placement vary by matrix, representing distinct correlational structures for each subgroup. Numbers on the main diagonal refer to variable indices.}
    \label{pop_mats}
\end{figure}

\subsection{Data generation}

Multivariate normal data were generated based on the specified population matrices in each simulation replication using the \emph{MASS} package \parencite[Version 7.3-60.0.1, ][]{venables2002}, preserving the correlational structure within each cluster.  
Cluster proportions $\tau$ were manipulated to be either balanced ($1:1:1$) or unbalanced ($1:2:3$).  
In the balanced condition, each cluster comprised 33.3\% of the sample. In the unbalanced condition, the clusters contained $n_1 = 16.7\%$, $n_2 = 33.3\%$, and $n_3 = 50\%$ of the total sample, respectively.
Importantly, in this condition, the smallest cluster ($n_1$) had the highest network density, while the largest cluster ($n_3$) had the lowest. 
This design reflects hypotheses such as the connectivity hypothesis of mental disorders, which suggests that a smaller subset of the population may exhibit higher symptom connectivity \autocite{cramer2016}.

Ordinal datasets were obtained by thresholding the generated continuous data following \textcite{rhemtulla2012}. 
We applied symmetric, moderately, and heavily asymmetric thresholds to discretize data into 4, 5, or 7 ordinal categories with varied skews. 
Symmetric thresholds were evenly spaced around 0, covering $-2.5$ to $2.5$ standard deviations (e.g., in four categories: $-1.25$, $0$, $1.25$). 
For moderate asymmetry, the peak shifted left of center, while for heavy asymmetry, the lowest category contained the most observations, with progressively fewer observations in higher categories.

\subsection{Simulation settings}

In total, the resulting simulation conditions were as follows:  
\begin{itemize}
    \item Number of ordinal categories $c$: 4, 5, 7, continuous  
    \item Skewness of ordinal data: symmetric, moderate, heavy  
    \item Cluster proportions $\tau$: balanced ($1:1:1$), unbalanced ($1:2:3$)  
    \item Sample size $N$: 600, 1200, 2400, 4800, 9600  
\end{itemize}  

This resulted in 100 unique conditions. 
Conditions were fully crossed, with the exception that no condition involved continuous, skewed data due to the selected ordinalization method.  
For each condition, 150 datasets were simulated, on which clustering with \emph{mixggm} was performed for $K \in \{1, \dots, 5\}$.  
A forward stepwise search algorithm was chosen for the S step.

\subsection{Evaluation criteria}

We evaluated the performance of the model using the following metrics:  
\begin{itemize}
    \item Clustering performance: Probability of correctly identifying the number of clusters $K$  
    \item Classification performance: Adjusted Rand Index \parencite[ARI, ][]{hubert1985}, measuring the similarity between two data clustering assignments
    \item Sensitivity: True positive rate of edge recovery  
    \item Specificity: True negative rate of edge recovery 
\end{itemize}  

Sensitivity and specificity were calculated only for replications in which estimated clusters could be matched to population clusters, which was achieved by minimizing the standardized root mean square residual (SRMR) between estimated and true covariance matrices. 

\begin{figure}[t]
    \centering
    \includegraphics[width=0.95\linewidth]{Fig1.png}
    \caption{Probability of detecting the correct number of clusters $K$ (a) and classification quality (b) measured by ARI, as a function of skewness (symmetric, moderate, heavy), number of ordinal categories (4 to continuous), $N$, and balanced (solid line) vs. unbalanced (dashed line) cluster-size proportions in the population. Results for continuous data are presented only for the symmetric condition, as no skewed continuous data was generated in this study. Error bars indicate SE.}
    \label{K_ARI_plot}
\end{figure}

\begin{figure}[t]
    \centering
    \includegraphics[width=0.95\linewidth]{Fig2.png}
    \caption{Sensitivity (a) and specificity (b) as a function of skewness (symmetric, moderate, heavy), number of categories (4 to continuous), $N$, and balanced (solid line) vs. unbalanced (dashed line) cluster-size proportions in the population. Results for continuous data are presented only for the symmetric condition, as no skewed continuous data was generated in this study. 
    Error bars indicate SE.}
    \label{sens_spec_plot}
\end{figure}

\filbreak

\section{Results} \label{results}

\subsection{Clustering performance}

Figure~\ref{K_ARI_plot}, Panel a, summarizes the proportion of simulations $\text{P}(K = 3)$ in which the correct number of $K$ was identified.
Notably, performance was superior in the unbalanced condition for almost all data settings. 
For symmetric, continuous data, $\text{P}(K = 3)$ increased steadily with sample size. For unbalanced clusters, it approached 1 with increasing $N$, while $\text{P}(K = 3)$ stabilized at around 0.75 in the balanced case.
However, this pattern did not hold for ordinal conditions, where performance decreased with $c$ and dropped to 0 in the least favorable, skewed scenarios.
This might be due to the method's tendency to overestimate the number of clusters when $K_{opt}$ is chosen incorrectly. For example, mean $K_{opt}$ was 3.10 ($SD$~=~0.45) clusters for continuous data, but rose up to $M$~=~4.71 ($SD$~=~0.51) clusters for $c~=~4$. 
Larger sample sizes did not mitigate this effect.
Only for non-skewed data with $c~=~7$ categories did the method markedly outperform the other ordinal conditions in the unbalanced condition, but even moderate skew led to a drop in performance, such that $\text{P}(K = 3)$ did not exceed 0.23 in any ordinal, skewed condition. 



\subsection{Classification performance}

As illustrated by Figure~\ref{K_ARI_plot}, Panel b, ARI consistently exceeded 0.80 for continuous, symmetric data (except for $N~=~600$) and showed slight improvement with increasing $N$, indicating good classification performance. 
By contrast, it remained below 0.50 for ordinal conditions, with the exception of $c~=~7$ symmetric data in the unbalanced cluster-size condition.


\subsection{Cluster network recovery}

Sensitivity grew with higher $N$ in all conditions (Figure~\ref{sens_spec_plot}, Panel a). It was highest for continuous data, and was negatively impacted by the extent of skew and ordinality.
By contrast, specificity (Panel b) mostly decreased with rising sample size except in the continuous, symmetric condition, which remained above 0.88 throughout. 
However, these findings warrant cautious interpretation, as both metrics could only be calculated for replications in which estimated and true clusters could be matched: While for continuous data, 95.5\% of replicates could be matched, this number dropped to 45.3\% for $c~=~7$, 41.6\% for $c~=~4$, and only 7.4\% for $c~=~5$. 

\section{Discussion}

The results indicate that both clustering performance and edge recovery are substantially affected by the presence of ordinality and skewness in the data.
While \emph{mixggm} performed well for continuous, symmetric data -- particularly when underlying networks were sparse -- its effectiveness diminished markedly in all ordinal and skewed conditions.
This performance decrease could not be compensated for by a larger $N$, as seen for ARI, and for the probability of detecting the correct $K$, an increasing $N$ was actually detrimental.
Edge recovery was impaired compared to continuous data, especially considering that these metrics could only be calculated for the small subset of replications in which estimated and true clusters could be matched.
Consistently, unbalanced cluster-size conditions yielded better performance than balanced conditions, aligning with \emph{mixggm}'s optimization for sparse structures: When sparse subgroups represent a larger
share of the sample, the method benefits.
This indicates that classification performance is constrained not by power alone, but also by the characteristics of the data.

Taken together, these findings suggest that prior results from the network estimation \parencite{johal2021} and model-based clustering literature \parencite{haslbeck2023} do not generalize to the approach evaluated here.
A possible explanation is the compounded complexity of combining clustering and network recovery within a single framework. However, alternative reasons due to choices made in this study and limitations may also contribute.

\subsection{Limitations and Future Directions}

Firstly, we fixed key parameters including population structure, $K$, $p$, and penalty parameter $Q$, making it difficult to determine whether reduced performance stems from modeling deficiencies, or unsuitable properties of the population data structure.

The selection of appropriate data generation methods represents another critical decision point. 
Following \textcite{rhemtulla2012}, we simulated multivariate normal data first and subsequently introduced skew to ordinal data through asymmetric thresholds. 
Due to this, a continuous skewed condition was absent in our study, posing a significant limitation. 
While prior research indicates it may be impossible to distinguish between a normally distributed latent variable with asymmetric thresholds and a skewed latent distribution with symmetric thresholds \autocite{gronneberg2019}, alternative ordinalizing approaches might yield different results.

Alternative methods designed specifically for ordinal data may hold promise.
For example, while this method was not publicly accessible during this study, \textcite{lee2022} proposed an approach for clustering ordinal graphical models without assuming local independence, leveraging probit models to represent ordinal variables as discretizations of latent continuous variables.
Tree-based methods such as SEM Trees and Network Trees \parencite{grassi2023, jones2020}, which recursively partition data based on directed or undirected edges, offer another avenue for settings where the structure of subgroups is complex or hierarchically organized. 

\section{Conclusion}

In summary, this study demonstrates that violations of normality and sparsity assumptions substantially impact performance of \emph{mixggm}.
While we replicated its viability for network-based exploratory subgroup detection when assumptions hold, this study established that findings on treating ordinal data as continuous from previous literature do not directly carry over to this joint modeling approach.
Careful consideration of data characteristics therefore remains essential when applying model-based clustering in psychological research.

\printbibliography

\end{document}