\documentclass{midl} % MIDL class
\usepackage{color,soul}
\usepackage{multirow}
\usepackage{booktabs}

% The following packages will be automatically loaded by the MIDL class:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e, ...
% so you normally do not need to load them explicitly.

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2026}


\title[Pose-based analysis of pediatric movement disorders]{Quantitative Pose-Based Analysis of Movement Disorders in Pediatric NGLY1 and SLC13A5 Patients}

\midlauthor{
\Name{Chengliang Dai\nametag{$^{1,2}$}} \Email{chengliang.dai@ucb.com}\\
\addr $^{1}$ UCB, Slough, United Kingdom\\
\addr $^{2}$ Imperial College London, London, United Kingdom\AND
\Name{Phil Scordis\nametag{$^{1}$}}\\
\Name{Prathyusha Teeyagura\nametag{$^{3}$}}\\
\Name{Rayann M.~Solidum\nametag{$^{3}$}}\\
\addr $^{3}$ Stanford University, Stanford, CA, United States\AND
\Name{Jeff Broderick\nametag{$^{4}$}}\\
\Name{Julia Broderick\nametag{$^{4}$}}\\
\Name{Jane Broderick\nametag{$^{4}$}}\\
\addr $^{4}$ Beneufit, Inc., Kentfield, CA, United States\AND
\Name{Brenda E.~Porter\nametag{$^{3}$}}
}

\begin{document}

\maketitle

\begin{abstract}
Movement disorders have long relied on subjective clinical observation for diagnosis and monitoring. By contrast, computer vision tools such as OpenPose can turn video recordings into precise, time-resolved measurements of a patient’s posture and movement. In this work, we apply a fully markerless, pose-based pipeline to classify abnormal movements in children with NGLY1 or SLC13A5 mutations. Our primary focus is on simple, physician-informed pose features that can be interpreted in clinical terms and used with conventional classifiers (Random Forest, SVM, etc.) on a very small dataset. We show that these handcrafted features capture clinically meaningful differences between movement-disorder phenotypes and can achieve useful classification performance. In addition, we include an exploratory comparison with a transformer model that is pre-trained on large-scale action-recognition data and then fine-tuned on our pose data. This experiment illustrates the potential performance ceiling of deep learning with extensive pretraining, but we emphasize that such models are less transparent and more data-hungry than the traditional approaches that form the core contribution of this study.
\end{abstract}

\begin{keywords}
Movement disorders, pose-based analysis, NGLY1, SLC13A5, pediatrics
\end{keywords}

\section{Introduction}
Movement disorders encompass a range of neurological conditions that impair motor control and lead to symptoms such as tremors, ataxia, and involuntary movements. Clinical evaluation has traditionally relied on expert visual assessment in the clinic. Although such assessments are grounded in deep clinical expertise, they remain subjective, resource-intensive, and difficult to reproduce or scale. For ultra-rare pediatric conditions such as NGLY1 deficiency and SLC13A5 disorder, these limitations are particularly problematic because patients are geographically dispersed and often cannot attend frequent in-person evaluations.

Recent advances in computer vision and artificial intelligence (AI) have opened the possibility of extracting quantitative measurements from ordinary video recordings. For instance, markerless pose-estimation algorithms infer joint locations frame by frame, enabling the computation of spatiotemporal kinematic descriptors without specialized motion-capture equipment. In this paper, we investigate whether pose-based features can be used to quantitatively characterize movement disorders in pediatric patients with NGLY1 deficiency or SLC13A5 disorder. Because our cohort is small and our clinical collaborators value interpretability, our main emphasis is on classical machine-learning models applied to carefully designed, physician-informed pose features. Specifically, we aim to (i) define a set of intuitive, angle-based features that correlate with clinician severity ratings and distinguish between broad categories of movement disorder, and (ii) evaluate a panel of conventional classifiers on these features, highlighting trade-offs between performance and interpretability.

An important design choice is that physician ratings are based on the original clinical videos, whereas the models are trained only on pose data extracted from those videos, since raw videos of pediatric patients are highly sensitive and cannot be readily shared across sites. As a secondary, exploratory analysis, we also adapt a transformer-based architecture that is pre-trained on a large public action-recognition dataset and fine-tuned on our clinical data. This experiment mainly serves to demonstrate what additional performance may be achievable with extensive pretraining.

\section{Background and Related Work}

\subsection{Pose-based motion analysis in medicine}
Markerless pose estimation and mesh recovery have transformed quantitative movement assessment by enabling extraction of 2D/3D joint coordinates from video. Representative frameworks include OpenPose \citep{cao2019openpose}, ViTPose \citep{xu2022vitpose}, HRNet \citep{sun2019deep}, VIBE \citep{kocabas2020vibe}, and HMR \citep{kanazawa2018end}, along with subsequent work building on these approaches. The resulting trajectories support computation of spatiotemporal kinematic descriptors (e.g., joint angles, angular velocities, and movement variability) without markers or wearable sensors.

For clinical use, the reliability of video-derived kinematics is critical. Multiple studies have therefore evaluated markerless pose pipelines against reference systems (e.g., instrumented gait analysis or marker-based motion capture) and reported encouraging agreement under standardized protocols. For example, OpenPose-based or OpenPose-derived pipelines have been assessed in adult gait/kinematic settings \citep{washabaugh2022comparing,stenum2021applications} and in pediatric contexts including toddlers with and without neurodevelopmental disabilities \citep{anderson2025validation}. These validation efforts support the feasibility of using de-identified keypoint trajectories as quantitative inputs for downstream clinical modeling.

Building on these pose/mesh backbones, video-based kinematic analysis has been explored across neurological and developmental conditions. Beyond video, motion modeling has also been used in fetal MRI to estimate fetal motion and reduce motion artifacts \citep{xu2019fetal,zhang2020enhanced}. The pose trajectories obtained from such pipelines enable downstream learning-based clinical assessment.

\subsection{AI-driven assessment of movement disorders}
Given pose trajectories, prior work has developed ML/DL models to detect and quantify movement disorders. In Parkinson's disease, pose-based features combined with Graph Neural Networks have been used to classify tremor severity and assess bradykinesia by highlighting subtle movements that may be difficult to quantify by eye \citep{Zhang2022,Quan2024}. In gait analysis, pose-based metrics derived from joint trajectories, including step length, symmetry indices, and joint-angle entropy, have been used to identify gait abnormalities indicative of ataxia and other disorders \citep{Tang2022}. For infant and developmental disorders, early detection systems for cerebral palsy have analyzed spontaneous infant movements using pose-estimation models, showing that deviations from typical movement patterns can be detected from ordinary videos \citep{Luo2022,Khan2018}. Together, these studies show that pose-based analysis can yield clinically meaningful markers across a range of neurological conditions.

\subsection{Transformers for motion modeling}
A separate line of work has explored transformer architectures for modeling human motion. Pose Transformers (PoTr) \citep{MartinezGonzalez2021} introduced a non-autoregressive approach to motion prediction, thereby avoiding error accumulation in autoregressive models. Subsequent frameworks such as SPOTR \citep{NargundSra2023} and STPOTR \citep{mahdavian2023stpotr} disentangle spatial and temporal features, improving joint-trajectory prediction on large-scale motion datasets. Although these methods were developed primarily for general action recognition and motion synthesis, their ability to capture complex temporal dependencies suggests that, with sufficient data and careful pretraining, they could be applied to clinical video to assess diseases such as Parkinson’s disease \citep{endo2022gaitforemer}, for example by tracking disease progression or predicting changes in motor function. However, such models are typically less transparent than classical approaches and require larger datasets than are usually available in ultra-rare diseases.

\subsection{Relevance to NGLY1 and SLC13A5 disorders}
NGLY1 deficiency and SLC13A5 disorder are ultra-rare genetic syndromes characterized by poorly described motor delays and diverse movement-disorder phenotypes. Patients may exhibit both hyperkinetic movements (e.g., ataxia, chorea, myoclonus) and hypokinetic features (e.g., dystonia, bradykinesia), and these patterns can change with age. For such conditions, quantitative pose analysis offers three main advantages. First, it provides objective measurement: for example, tremor frequency and amplitude can be quantified from joint trajectories \citep{Futrell2024}, and similar principles can be extended to other movement patterns. Second, it enables longitudinal monitoring of therapy effects by tracking changes in movement features over time. Third, because these conditions are rare and patients are widely distributed geographically, pose-based analysis can support remote assessments from home-recorded videos, reducing the need for frequent in-person visits.

Within this context, our primary goal is to understand whether a small set of intuitive, physically meaningful pose features can already capture clinically relevant differences between movement-disorder phenotypes in NGLY1 and SLC13A5. We therefore concentrate on traditional classification models, which can be inspected through feature importance and related tools, and treat deep transformer models as exploratory benchmarks.

\begin{figure}[t]
  \centering
  \includegraphics[width=1\linewidth]{figures/extracted_pose.png}
  \caption{(a) OpenPose keypoint labels; (b) keypoints extracted from a representative frame; (c--e) keypoints extracted from video recordings from pediatric patients. The red lines highlight the mean neck angle.}
  \label{fig:pose_example}
\end{figure}

\section{Methods}
Our pipeline has three stages. First, we extract 2D skeleton data (Figure~\ref{fig:pose_example}). Second, we derive a set of angular-movement and joint-angle features from selected body parts; these features are inspired by clinical rating scales such as the International Cooperative Ataxia Rating Scale (ICARS) \citep{Trouillas1997}, the Unified Dystonia Rating Scale (UDRS) \citep{Comella2003}, the Burke--Fahn--Marsden Dystonia Rating Scale (BFMDRS) \citep{kuiper2016burke}, and the Unified Myoclonus Rating Scale (UMRS) \citep{Frucht2002}. Third, we apply a suite of classical classifiers to predict broad movement-disorder categories. In a separate, exploratory branch, we adapt a pre-trained pose transformer to the same task.

\subsection{Pose features}
The pose data are generated by the TRACER platform \citep{beneufit_tracer}, which uses OpenPose as its core 2D pose-estimation backbone. TRACER also includes additional proprietary improvements for robustness and quality control (e.g., internal filtering and processing to better handle clinical videos). Pose quality was reviewed by neurologists. The labels of the keypoints are given in Figure~\ref{fig:pose_example}(a). Pose-based features derived from these keypoints are used to quantify movement and relate it to the severity of the movement disorder. In contrast to the clinicians, who scored the disorders directly from the original videos, all our models operate only on 2D skeleton sequences. This design reflects privacy and data-governance constraints, as pose sequences provide a de-identified representation that is more amenable to research use and potential future data sharing.

Body parts were selected based on the quality and consistency of the extracted keypoints. For the present analysis, we focus on head, upper limb, and lower limb segments. Table~\ref{tab:keypoints} summarizes the keypoint pairs and triplets used to define angular-movement and joint-angle features.

\begin{table}[t]
\centering
\begin{tabular}{llll}
\toprule
Angular movement & Keypoint labels & Joint angle & Keypoint labels \\
\midrule
Head        & (0,1)            & Neck     & (0,1,2), (0,1,5) \\
Upper limbs & (2,3), (5,6)     & Shoulder & (1,2,3), (1,5,6) \\
Lower limbs & (9,10), (12,13)  & Elbow    & (2,3,4), (5,6,7) \\
\bottomrule
\end{tabular}
\caption{OpenPose keypoint labels of selected body parts.}
\label{tab:keypoints}
\end{table}

\subsubsection{Angular-movement features}
We compute segment-wise angular displacement using a fixed temporal lag of $L$ frames (here $L=10$). Let
\[
\mathbf{v}_t = \bigl[x_t^{(a)} - x_t^{(b)},\; y_t^{(a)} - y_t^{(b)}\bigr]^\top
\]
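be the 2D vector of a body part defined by keypoints $a$ and $b$ at frame $t$ (Table~\ref{tab:keypoints}). For each temporal segment $i$ (a window of $L$ frames) with start frame $t_i=iL$, we compute the angular displacement between the vectors at the two segment endpoints, $\mathbf{v}_{t_i}$ and $\mathbf{v}_{t_i+L}$, using a dot-product formulation: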
\[
\Delta\theta_i
= \arccos\!\left(
\frac{\mathbf{v}_{t_i} \cdot \mathbf{v}_{t_i+L}}
{\|\mathbf{v}_{t_i}\|\,\|\mathbf{v}_{t_i+L}\|}
\right).
\]

Given $N$ segments, the mean and variance of angular displacement are
\[
\Delta\theta_{\text{avg}} = \frac{1}{N} \sum_{i=0}^{N-1} \Delta\theta_i,
\qquad
\sigma^2 = \frac{1}{N} \sum_{i=0}^{N-1} \left(\Delta\theta_i - \Delta\theta_{\text{avg}}\right)^2.
\]

\subsubsection{Mean joint-angle features}
For joint-angle features, we compute the angle at a joint defined by three keypoints $p_t^{(1)}, p_t^{(2)}, p_t^{(3)}$ at each frame $t$. For instance, the shoulder angle uses neck, shoulder, and elbow keypoints. The angle is defined as
\[
\theta_t = \arccos\Bigl(\frac{(p_t^{(1)} - p_t^{(2)}) \cdot (p_t^{(3)} - p_t^{(2)})}{\|p_t^{(1)} - p_t^{(2)}\|\,\|p_t^{(3)} - p_t^{(2)}\|}\Bigr),
\]
and the mean joint angle over $T$ frames is
\[
\theta_{\text{mean}} = \frac{1}{T} \sum_{t=0}^{T-1} \theta_t.
\]

\subsection{Clinical labels and classification task}
Pose data were extracted from videos of pediatric patients presenting with at least one movement disorder among ataxia, chorea, myoclonus, dystonia, hypokinesia, or bradykinesia. For each video, two to three physicians independently scored the presence and severity of these disorders while blinded to each other’s assessments. Each movement disorder was rated on a five-point scale: Absent~(0), Minimal~(1), Mild~(2), Moderate~(3), and Severe~(4).

All clinical ratings were performed on the original video recordings. However, for model development we did not use raw pixel data: instead, each recording was processed with OpenPose/TRACER to obtain 2D joint coordinates, and all features and classifiers were derived exclusively from these pose sequences.

For SLC13A5 patients, only ataxia, chorea, dystonia, and myoclonus were observed during physician assessment. For NGLY1 patients, we computed two composite scores: one combining ataxia, chorea, and myoclonus, and another combining dystonia, hypokinesia, and bradykinesia. Although these movement disorders have distinct clinical presentations, their symptoms can overlap, particularly when detailed clinical context is limited, making accurate differentiation challenging for both clinicians and machine-learning models. The limited size of our dataset further complicates robust model development.

To mitigate these challenges, we simplified the classification task into four categories defined by the predominant movement and coordination features. \textbf{Normometric} videos were those with absent or minimal movement disorder (all scores 0--1). \textbf{Hypometric} videos showed dystonia, hypokinesia, or bradykinesia with a score of at least 2. \textbf{Hypermetric} videos showed ataxia, chorea, or myoclonus with a score of at least 2. \textbf{Mixed-metric} videos exhibited both at least one hypometric feature (dystonia, hypokinesia, or bradykinesia) and at least one hypermetric feature (ataxia, chorea, or myoclonus). We then trained several machine-learning classifiers, including Random Forest (RF), Multilayer Perceptron (MLP), Support Vector Machine (SVM), XGBoost (XGB), Logistic Regression (LR), and K-Nearest Neighbors (KNN), to predict these categories from the handcrafted pose features.

\subsection{Transformer-based enhancement}
In addition to the classical models, we trained a transformer-based model (Figure~\ref{fig:transformer}) to explore what performance could be achieved when leveraging large-scale pretraining. This analysis was not designed as an alternative clinical tool, but rather as an exploratory benchmark.

The input to the transformer is a sequence of $t$ skeletons $\mathbf{X}_{1:t}$ extracted by OpenPose. During pre-training on a public action-recognition dataset, the network jointly learns to predict the next $M = T - t$ skeletons $\mathbf{X}_{t+1:T}$ and to predict the action class of each input sequence. In the subsequent fine-tuning stage on our clinical dataset, the model is optimized solely to categorize the movement disorder, while the motion-prediction branch is frozen to reduce overfitting.

Our architecture, adapted from \citet{MartinezGonzalez2021} and \citet{endo2022gaitforemer}, consists of several interconnected modules. A Graph Neural Network (GNN) encoder $\phi$ maps each input skeleton in $\mathbf{X}_{1:t}$ to a fixed-dimensional embedding. Positional embeddings are then added to these skeleton embeddings before they pass through $L$ multi-head self-attention layers in the transformer encoder (here $L$ denotes the number of transformer layers, not the temporal lag used for the handcrafted features), yielding a latent representation $\mathbf{z}_{1:t}$. A linear classification head processes $\mathbf{z}_{1:t}$ to produce either action class logits (during pre-training) or movement-disorder logits (during fine-tuning).

Simultaneously, the transformer decoder takes as input the encoder outputs $\mathbf{z}_{1:t}$ along with a query sequence $\mathbf{q}_{1:M}$ initialized with the last observed skeleton $\mathbf{X}_t$. We choose $\mathbf{X}_t$ because it is the most recent known state available at inference time and provides a strong, stable anchor that improves continuity at the boundary between observed and predicted motion \citep{MartinezGonzalez2021}. After $L$ layers of self-attention, the decoder outputs a sequence of embeddings, which the decoding network $\psi$ transforms into reconstructed future skeletons $\hat{\mathbf{X}}_{t+1:T}$.

The overall loss during pre-training is the sum of a classification term and a motion reconstruction term:
\[
L_{\text{total}} = L_{\text{cls}} + L_{\text{motion}}.
\]
Here, $L_{\text{cls}}$ denotes the cross-entropy loss for action classification. The motion loss $L_{\text{motion}}$ is computed by averaging layerwise $\ell_1$ reconstruction errors across all decoder layers. If $\hat{\mathbf{y}}_m^l$ is the predicted $N$-dimensional pose vector at time step $m$ in decoder layer $l$, and $\mathbf{y}_m^*$ is the ground truth, then each layer’s loss is given by
\[
L_l = \frac{1}{M \cdot N} \sum_{m=t+1}^{T} \bigl\lVert \hat{\mathbf{y}}_m^l - \mathbf{y}_m^* \bigr\rVert_1,
\]
and $L_{\text{motion}} = \frac{1}{L} \sum_{l=1}^{L} L_l$.
In the fine-tuning stage, only the classification head is updated on the clinical labels, and the transformer results are reported primarily as a point of comparison with the more interpretable classical models.

\begin{figure}[t]
  \centering
  \includegraphics[width=1\linewidth]{figures/transformer.png}
  \caption{Transformer-based framework for predicting motion and movement disorder.}
  \label{fig:transformer}
\end{figure}

\section{Experimental Setup}

\subsection{Dataset}
Our dataset comprises 95 video recordings of 26 pediatric patients with NGLY1 deficiency or SLC13A5 disorder, captured in standing or sitting positions. Repeated videos from the same child are separated by 4 to 24 months and can exhibit noticeable phenotype changes with development and disease progression. Thirteen recordings were excluded because the subjects were too young (less than 2 years old) to stand or sit independently at the time of recording. The raw videos were used only for clinical review and for extracting 2D skeletons.

The mean age of the subjects in the analyzed recordings is 9.86~$\pm$~4.50 years. The mean number of frames per recording is 1259.08~$\pm$~131.52. We trim each recording under neurologist supervision to remove non-informative lead-in/lead-out segments and retain a fixed-length clip of 1000 frames. After trimming, we apply uniform fixed-stride sampling by keeping every 10th frame, producing a 100-frame pose sequence per recording. This 100-frame representation is used consistently for both handcrafted feature computation (with $L=10$) and transformer fine-tuning.

After filtering, the dataset contains 7 normometric samples, 11 hypometric samples, 40 hypermetric samples, and 25 mixed-metric samples. Pose data are normalized using the neck (keypoint label 1) as the root: all joint coordinates are translated so that the root is fixed at $(0,0)$ while the relative distances between joints remain unchanged. To pre-train the transformer, we also used 5,688 videos (standing, sitting, staggering gait, etc.) from the NTU RGB+D 120 dataset \citep{liu2019ntu}.

\subsection{Model training and evaluation}
The prepared data were partitioned into an 80\% training set and a 20\% test set using stratified sampling at the video level to preserve the class distribution. For the classical machine-learning models (RF, MLP, SVM, XGB, LR, and KNN), a feature selection process was conducted on the training set. First, an ensemble selection method ranked the features using the Chi-squared test, mutual information, ANOVA F-test, and recursive feature elimination. Subsequently, the final feature set was determined by a voting system, where features ranked in the top 12 (highlighted in Figure~\ref{fig:corr}) by at least three of the four methods were selected for modeling. All features were standardized using z-scoring, with the mean and standard deviation computed on the training set.

To address class imbalance, class weighting was applied. Training was conducted in two phases. First, all models were evaluated using 5-fold stratified cross-validation on the training set to establish baseline performance based on the weighted F1-score. Next, the top three performing models underwent hyperparameter tuning via grid search, using the same 5-fold stratified cross-validation strategy. Finally, all models, including the optimized versions, were assessed on the held-out test set.

The transformer-based model consists of 4 encoder layers and 4 decoder layers, with a feed-forward network (FFN) dimension of 2048. It was pre-trained on the NTU RGB+D 120 dataset for 100 epochs with an initial learning rate of $1\times10^{-4}$, then fine-tuned on our training set. Due to limited patient data, we froze the motion-prediction branch during fine-tuning and trained only the classification head for 50 epochs at the same learning rate. Training was performed on an NVIDIA L40S GPU.

\section{Results and Discussion}
We first examined the correlation between clinician assessment scores and handcrafted features (Figure~\ref{fig:corr}). Although the correlation coefficients are modest, pose-based features consistently show positive associations with clinical severity ratings, supporting their effectiveness as quantitative proxies for clinician-observed movement abnormalities.

\begin{figure}[t]
  \centering
  \includegraphics[width=1\linewidth]{figures/corr.png}
  \caption{Correlation between clinical severity scores and pose-based features. Features selected for training conventional classifiers are highlighted in red.}
  \label{fig:corr}
\end{figure}

For the classification task, we assessed model performance using accuracy, weighted precision, recall, and F1-score. Balanced metrics and per-class metrics are given in Table~\ref{tab:ml_results} and Table~\ref{tab:ml_results_class}. The RF classifier achieved the highest accuracy among the classical models at 65\%, while other models such as XGB and LR achieved slightly lower but comparable performance (Table~\ref{tab:ml_results}). RF also provides greater interpretability compared with MLP, SVM, and other classifiers. The top 6 features used by the RF model include the mean and variance of left upper-limb angular movement, the mean and variance of neck angle, and the mean of head angular movement. Feature importance values of the top 6 features are shown in Figure~\ref{fig:topfeatures}. A differential analysis of the top 6 features using the Mann--Whitney U test was conducted on the full dataset across movement phenotypes and the healthy-reference cohort, correcting for multiple comparisons using the False Discovery Rate procedure, and the results are shown in Figure~\ref{fig:violin}. The upper limbs and neck/head are often considered particularly important by physicians when assessing patients \citep{schmitz2006scale,kuiper2016burke}. These features help the model distinguish the hypermetric phenotype from others based on the statistical analysis (Figure~\ref{fig:violin}), which is consistent with the strong RF performance for identifying hypermetric videos (Table~\ref{tab:ml_results_class}). Although the healthy-reference cohort from the NTU dataset is generally older than the disease cohort and therefore cannot serve as a matched control group, the differential analysis suggests that the handcrafted features can also help distinguish movement phenotypes in the disease cohort from those in the healthy cohort.

Given the limited patient data, these results are encouraging, especially because RF and related models can provide feature-importance estimates that help clinicians understand which aspects of movement are most discriminative across categories.

The fine-tuned transformer-based model achieved the same accuracy as RF but obtained higher recall and F1-score due to improved performance in predicting the mixed-metric class. In particular, the transformer achieved a recall of 83\% for the mixed-metric class, compared with 50\% for RF (Table~\ref{tab:ml_results_class}), highlighting its ability to identify more complex manifestations of movement disorders. As shown in the confusion matrices in Figure~\ref{fig:confusion}, the RF model commonly misclassifies mixed-metric samples as normometric, suggesting a limitation of the conventional classifiers under our feature set. The transformer results illustrate the potential benefit of large-scale pretraining, but this comes at the cost of reduced interpretability and greater complexity, and depends critically on access to extensive non-clinical training data.

\begin{table}[t]
\centering
\setlength{\tabcolsep}{0.75em}
\begin{tabular}{lcccc}
\toprule
Model & Precision & Recall & F1-score & Accuracy \\
\midrule
RF          & \textbf{0.72} & 0.57 & 0.62 & \textbf{0.65} \\
MLP         & 0.60 & 0.50 & 0.55 & 0.47 \\
SVM         & 0.65 & 0.55 & 0.60 & 0.53 \\
XGB         & 0.70 & 0.55 & 0.62 & 0.59 \\
LR          & 0.51 & 0.59 & 0.50 & 0.59 \\
KNN         & 0.60 & 0.50 & 0.55 & 0.47 \\
Transformer & 0.71 & \textbf{0.60} & \textbf{0.64} & \textbf{0.65} \\
\bottomrule
\end{tabular}
\caption{Classification results. The transformer is pre-trained on NTU RGB+D 120 and fine-tuned on patient pose data; classical models are trained only on handcrafted pose features from the clinical dataset.}
\label{tab:ml_results}
\end{table}

\begin{table}[t]
\centering
\small
\setlength{\tabcolsep}{4pt}
\begin{tabular}{lcccccccccccc}
\toprule
\multirow{2}{*}{Model} & \multicolumn{4}{c}{Precision} & \multicolumn{4}{c}{Recall} & \multicolumn{4}{c}{F1-score} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9} \cmidrule(lr){10-13}
 & C1 & C2 & C3 & C4 & C1 & C2 & C3 & C4 & C1 & C2 & C3 & C4 \\
\midrule
RF          & 0.00 & 1.00 & \textbf{0.88} & 1.00 & 0.00 & 1.00 & \textbf{0.78} & 0.50 & 0.00 & 1.00 & \textbf{0.82} & 0.67 \\
MLP         & 1.00 & 0.00 & 0.43 & 1.00 & 1.00 & 0.00 & 0.33 & 0.67 & 1.00 & 0.00 & 0.37 & 0.80 \\
SVM         & 0.00 & 1.00 & 0.67 & 1.00 & 0.00 & 1.00 & 0.44 & 0.67 & 0.00 & 1.00 & 0.53 & 0.80 \\
XGB         & 0.00 & 1.00 & 0.83 & 1.00 & 0.00 & 1.00 & 0.56 & 0.67 & 0.00 & 1.00 & 0.67 & 0.80 \\
LR          & 1.00 & 0.00 & 0.56 & 0.50 & 1.00 & 0.00 & 0.56 & 0.67 & 1.00 & 0.00 & 0.56 & 0.57 \\
KNN         & 1.00 & 0.00 & 0.43 & 1.00 & 1.00 & 0.00 & 0.33 & 0.67 & 1.00 & 0.00 & 0.37 & 0.80 \\
Transformer & 1.00 & 0.00 & 0.83 & 1.00 & 1.00 & 0.00 & 0.56 & \textbf{0.83} & 1.00 & 0.00 & 0.67 & \textbf{0.91} \\
\bottomrule
\end{tabular}
\caption{Per-class performance metrics. C1: Normometric; C2: Hypometric; C3: Hypermetric; C4: Mixed-metric.}
\label{tab:ml_results_class}
\end{table}

\begin{figure}[t]
  \centering
  \includegraphics[width=0.8\linewidth]{figures/top_features.png}
  \caption{Feature importance values of the top 6 features used by RF.}
  \label{fig:topfeatures}
\end{figure}

\begin{figure}[tbp]
  \centering
  \includegraphics[width=1\linewidth]{figures/violin.png}
  \caption{Differential analysis of the top 6 features used by RF for classification.}
  \label{fig:violin}
\end{figure}

\begin{figure}[tbp]
  \centering
  \includegraphics[width=0.8\linewidth]{figures/confusion.png}
  \caption{Confusion matrices for RF and transformer results. The RF model commonly misclassifies mixed-metric samples as normometric.}
  \label{fig:confusion}
\end{figure}

\section{Conclusion}
We presented a quantitative, AI-driven framework for assessing movement disorders in pediatric patients with NGLY1 deficiency and SLC13A5 disorder, focusing on interpretable pose-based features and traditional machine-learning models suitable for small datasets. Our framework deliberately decouples clinical assessment (performed on raw videos) from model training (performed on de-identified pose data), which aligns with privacy constraints in pediatric ultra-rare disease cohorts and increases the feasibility of future cross-center data sharing. Because the downstream framework is pose-estimator-agnostic, it can also ingest keypoints from newer pose-estimation systems in the future.

Despite limitations such as the inability to test more recent pose-estimation methods, the lack of an ablation study for the transformer design, limited data for robust patient-level splitting, and possible residual variability in home-recorded videos, our results show that simple, physician-informed features can differentiate broad movement-disorder categories and correlate with clinician severity ratings. An exploratory transformer experiment demonstrates that comparable or higher predictive performance may be possible when leveraging large-scale pretraining, but such models are less transparent and more difficult to deploy in routine clinical practice.

Future work will focus on refining our feature set, incorporating larger and more diverse clinical cohorts, and explicitly integrating clinician feedback on interpretability and usability. In the longer term, we envisage combining classical models for day-to-day use with more complex deep learning models as research tools to explore subtle patterns in larger, multi-center datasets. Our ultimate goal is to deliver a reliable, objective assessment tool that supports clinicians in monitoring these rare pediatric movement disorders and in assessing treatment outcomes while respecting the practical constraints of ultra-rare disease research.

\section{Acknowledgment}
This research was funded by UCB. Chengliang Dai and Phil Scordis are employees of UCB and may hold shares and/or stock options in UCB. We thank the Tess Research Foundation and Grace Sciences for sponsoring the biomarker discovery trials NCT04681781, NCT06144957, and NCT03834987, during which the videos were collected. We also thank Kasper Claes for his early work in designing and implementing the project’s initial architecture.

\bibliography{midl26_053}
\end{document}
